diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py
index 82661c6c66..5bee0c9be0 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/__init__.py
@@ -35,6 +35,9 @@
     new_output = { 'madevent_simd' : output.SIMD_ProcessExporter,
                    'madevent_gpu' : output.GPU_ProcessExporter,
                    'standalone_cudacpp' : output.PLUGIN_ProcessExporter,
+                   # the following ones are used for the second exporter class
+                   # (not really needed so far, but useful if we need
+                   #  specialization in the future)
                    'standalone_simd' :  output.SIMD_ProcessExporter,
                    'standalone_cuda' :  output.GPU_ProcessExporter,
                   }
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/counters.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/counters.cc
similarity index 100%
rename from epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/counters.cc
rename to epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/counters.cc
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/fbridge_common.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge_common.inc
similarity index 100%
rename from epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/fbridge_common.inc
rename to epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge_common.inc
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/ompnumthreads.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/ompnumthreads.cc
similarity index 100%
rename from epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/ompnumthreads.cc
rename to epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/ompnumthreads.cc
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py
index f2ed8897b3..a75a9dce64 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py
@@ -110,6 +110,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU):
                                       s+'gpu/perf.py', s+'gpu/profile.sh',
                                       s+'CMake/SubProcesses/CMakeLists.txt'],
                      'test': [s+'gpu/cudacpp_test.mk']}
+
     to_link_in_P = ['nvtx.h', 'timer.h', 'timermap.h',
                     'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h',
                     'MemoryAccessHelpers.h', 'MemoryAccessVectors.h',
@@ -196,13 +197,15 @@ def generate_subprocess_directory(self, subproc_group, fortran_model, me=None):
         misc.sprint('  type(subproc_group)=%s'%type(subproc_group)) # e.g. madgraph.core.helas_objects.HelasMatrixElement
         misc.sprint('  type(fortran_model)=%s'%type(fortran_model)) # e.g. madgraph.iolibs.helas_call_writers.GPUFOHelasCallWriter
         misc.sprint('  type(me)=%s me=%s'%(type(me) if me is not None else None, me)) # e.g. int
-        return super().generate_subprocess_directory(subproc_group, fortran_model, me)
-
+        misc.sprint("need to link", self.to_link_in_P)
+        out = super().generate_subprocess_directory(subproc_group, fortran_model, me)
+        return out
     # AV (default from OM's tutorial) - add a debug printout
     def convert_model(self, model, wanted_lorentz=[], wanted_coupling=[]):
         misc.sprint('Entering PLUGIN_ProcessExporter.convert_model (create the model)')
         return super().convert_model(model, wanted_lorentz, wanted_coupling)
 
+
     # AV (default from OM's tutorial) - add a debug printout
     def finalize(self, matrix_element, cmdhistory, MG5options, outputflag):
         """Typically creating jpeg/HTML output/ compilation/...
@@ -281,7 +284,22 @@ def add_madevent_plugin_fct(self):
 
 #------------------------------------------------------------------------------------
 
-class SIMD_ProcessExporter(PLUGIN_ProcessExporter):
+class PLUGIN_ProcessExporter_MadEvent(PLUGIN_ProcessExporter):
+    """ a class to include all tweaks related to madevent and not related to standalone.
+        in practice this class is never used directly, only through the SIMD or GPU related subclasses"""
+
+    s = PLUGINDIR + '/madgraph/iolibs/template_files/'
+    # add template files / links that are only needed in madevent mode and not in standalone
+    from_template = dict(PLUGIN_ProcessExporter.from_template)
+    from_template['SubProcesses'] = from_template['SubProcesses'] + [s+'gpu/fbridge_common.inc',
+                                      s+'gpu/counters.cc',
+                                      s+'gpu/ompnumthreads.cc']
+     
+    to_link_in_P = PLUGIN_ProcessExporter.to_link_in_P + ['fbridge_common.inc', 'counters.cc','ompnumthreads.cc'] 
+
+#------------------------------------------------------------------------------------
+
+class SIMD_ProcessExporter(PLUGIN_ProcessExporter_MadEvent):
     def change_output_args(args, cmd):
         """ """
         cmd._export_format = "madevent"
@@ -293,7 +311,7 @@ def change_output_args(args, cmd):
 
 #------------------------------------------------------------------------------------
 
-class GPU_ProcessExporter(PLUGIN_ProcessExporter):
+class GPU_ProcessExporter(PLUGIN_ProcessExporter_MadEvent):
     def change_output_args(args, cmd):
         """ """
         cmd._export_format = "madevent"
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/patchMad.sh b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/patchMad.sh
index 7edafba599..8739cff3ea 100755
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/patchMad.sh
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/patchMad.sh
@@ -46,7 +46,7 @@ if [ "${patchlevel}" == "0" ]; then exit $status; fi
 # Patch the default Fortran code to provide the integration with the cudacpp plugin
 # (1) Process-independent patches
 touch ${dir}/Events/.keep # this file should already be present (mg5amcnlo copies it from Template/LO/Events/.keep) 
-\cp -pr ${scrdir}/MG5aMC_patches/${dir_patches}/fbridge_common.inc ${dir}/SubProcesses # new file
+#\cp -pr ${scrdir}/MG5aMC_patches/${dir_patches}/fbridge_common.inc ${dir}/SubProcesses # new file
 if [ "${patchlevel}" == "2" ]; then
   cd ${dir}
   echo "DEBUG: cd ${PWD}; patch -p4 -i ${scrdir}/MG5aMC_patches/${dir_patches}/patch.common"
@@ -57,13 +57,9 @@ if [ "${patchlevel}" == "2" ]; then
 fi
 for p1dir in ${dir}/SubProcesses/P*; do
   cd $p1dir
-  ln -sf ../fbridge_common.inc . # new file
-  cp -pr ${scrdir}/MG5aMC_patches/${dir_patches}/counters.cc . # new file
-  cp -pr ${scrdir}/MG5aMC_patches/${dir_patches}/ompnumthreads.cc . # new file
-  ###cp -pr ${scrdir}/MG5aMC_patches/${dir_patches}/counters.cc ${dir}/SubProcesses/ # new file (SH)
-  ###cp -pr ${scrdir}/MG5aMC_patches/${dir_patches}/ompnumthreads.cc ${dir}/SubProcesses/ # new file (SH)
-  ###ln -sf ../counters.cc . # new file (SH)
-  ###ln -sf ../ompnumthreads.cc . # new file (SH)
+  #ln -sf ../fbridge_common.inc . # new file
+  #cp -pr ${scrdir}/MG5aMC_patches/${dir_patches}/counters.cc . # new file
+  #cp -pr ${scrdir}/MG5aMC_patches/${dir_patches}/ompnumthreads.cc . # new file
   if [ "${patchlevel}" == "2" ]; then
     echo "DEBUG: cd ${PWD}; patch -p6 -i ${scrdir}/MG5aMC_patches/${dir_patches}/patch.P1"
     if ! patch -p6 -i ${scrdir}/MG5aMC_patches/${dir_patches}/patch.P1; then status=1; fi      
diff --git a/epochX/cudacpp/CODEGEN/generateAndCompare.sh b/epochX/cudacpp/CODEGEN/generateAndCompare.sh
index 72b6687dd0..c9e85c1b91 100755
--- a/epochX/cudacpp/CODEGEN/generateAndCompare.sh
+++ b/epochX/cudacpp/CODEGEN/generateAndCompare.sh
@@ -232,9 +232,9 @@ function codeGenAndDiff()
     elif [ "${OUTBCK}" == "madonly" ]; then # $SCRBCK=cudacpp and $OUTBCK=madonly
       echo "output madevent ${outproc} ${helrecopt} --vector_size=${vecsize}" >> ${outproc}.mg
     elif [ "${OUTBCK}" == "mad" ]; then # $SCRBCK=cudacpp and $OUTBCK=mad
-      echo "output madevent ${outproc} ${helrecopt} --vector_size=${vecsize} --me_exporter=standalone_cudacpp" >> ${outproc}.mg
+      echo "output madevent_simd ${outproc} ${helrecopt} --vector_size=${vecsize} " >> ${outproc}.mg
     elif [ "${OUTBCK}" == "madcpp" ]; then # $SCRBCK=cudacpp and $OUTBCK=madcpp
-      echo "output madevent ${outproc} ${helrecopt} --vector_size=32 --me_exporter=standalone_cpp" >> ${outproc}.mg
+      echo "output madevent_simd ${outproc} ${helrecopt} --vector_size=32" >> ${outproc}.mg
     elif [ "${OUTBCK}" == "madgpu" ]; then # $SCRBCK=cudacpp and $OUTBCK=madgpu
       echo "output madevent ${outproc} ${helrecopt} --vector_size=32 --me_exporter=standalone_gpu" >> ${outproc}.mg
     else # $SCRBCK=cudacpp and $OUTBCK=cudacpp, cpp or gpu
diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
index 122537896c..c0d823893a 100644
--- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
+++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt
@@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg
+import /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -62,7 +62,7 @@ generate e+ e- > mu+ mu-
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.00548553466796875 [0m
+[1;32mDEBUG: model prefixing  takes 0.005708217620849609 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -156,27 +156,28 @@ INFO: Trying process: e+ e- > mu+ mu- WEIGHTED<=4 @1
 INFO: Process has 2 diagrams 
 1 processes with 2 diagrams generated in 0.004 s
 Total: 1 processes with 2 diagrams
-output madevent ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --vector_size=32
 Load PLUGIN.CUDACPP_OUTPUT
 [1;34mPlugin PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.5.3_lo_vect. 
 It has been validated for the last time with version: 3.5.2[0m
+[1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1mAddition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
-[1;32mDEBUG:  cformat = [0m standalone_cudacpp [1;30m[export_cpp.py at line 3071][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 161][0m [0m
+[1;32mDEBUG:  cformat = [0m standalone_simd [1;30m[export_cpp.py at line 3071][0m [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 162][0m [0m
 INFO: initialize a new directory: CODEGEN_mad_ee_mumu 
 INFO: remove old information in CODEGEN_mad_ee_mumu 
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 166][0m [0m
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu [0m
-INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu 
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards [0m
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 167][0m [0m
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu [0m
+INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu 
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards [0m
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses [0m
 INFO: Organizing processes into subprocess groups 
 INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 
 INFO: Processing color information for process: e+ e- > mu+ mu- @1 
 INFO: Creating files in directory P1_epem_mupmum 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f4ab3cdeb80> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7ffb4864fbb0> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -193,19 +194,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 
 INFO: Finding symmetric diagrams for subprocess group epem_mupmum 
 Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s
-Wrote files for 8 helas calls in 0.098 s
+Wrote files for 8 helas calls in 0.101 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates FFV2 routines[0m
 ALOHA: aloha creates FFV4 routines[0m
-ALOHA: aloha creates 3 routines in  0.198 s
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 203][0m [0m
+ALOHA: aloha creates 3 routines in  0.207 s
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 205][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates FFV2 routines[0m
 ALOHA: aloha creates FFV4 routines[0m
 ALOHA: aloha creates FFV2_4 routines[0m
-ALOHA: aloha creates 7 routines in  0.260 s
+ALOHA: aloha creates 7 routines in  0.275 s
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV2
@@ -214,28 +215,27 @@ ALOHA: aloha creates 7 routines in  0.260 s
 <class 'aloha.create_aloha.AbstractRoutine'> FFV4
 <class 'aloha.create_aloha.AbstractRoutine'> FFV2_4
 <class 'aloha.create_aloha.AbstractRoutine'> FFV2_4
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h
-INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./HelAmps_sm.h
+INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. 
 super_write_set_parameters_onlyfixMajorana (hardcoded=False)
 super_write_set_parameters_onlyfixMajorana (hardcoded=True)
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory 
-INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. 
+INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/src/. 
 The option zerowidth_tchannel is modified [True] but will not be written in the configuration files.
 If you want to make this value the default for future session, you can run 'save options --all'
-save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt
+save configuration file to /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt
 INFO: Use Fortran compiler gfortran 
 INFO: Use c++ compiler g++ 
 INFO: Generate web pages 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
 patching file Source/genps.inc
 patching file Source/makefile
 patching file SubProcesses/makefile
 patching file bin/internal/gen_ximprove.py
-Hunk #1 succeeded at 391 (offset 6 lines).
 patching file bin/internal/madevent_interface.py
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses/P1_epem_mupmum; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/SubProcesses/P1_epem_mupmum; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 Hunk #1 succeeded at 496 (offset 12 lines).
 patching file driver.f
@@ -243,16 +243,16 @@ patching file matrix1.f
 Hunk #3 succeeded at 230 (offset 9 lines).
 Hunk #4 succeeded at 267 (offset 18 lines).
 Hunk #5 succeeded at 312 (offset 18 lines).
-[1;32mDEBUG:  p.returncode = [0m 0 [1;30m[output.py at line 238][0m [0m
-Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done.
+[1;32mDEBUG:  p.returncode = [0m 0 [1;30m[output.py at line 241][0m [0m
+Output to directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu done.
 Type "launch" to generate events from this process, or see
-/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README
+/data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/README
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m1.925s
-user	0m1.621s
-sys	0m0.231s
+real	0m1.910s
+user	0m1.675s
+sys	0m0.221s
 Code generation completed in 2 seconds
 ************************************************************
 *                                                          *
@@ -274,9 +274,9 @@ Code generation completed in 2 seconds
 *               Type 'help' for in-line help.              *
 *                                                          *
 ************************************************************
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt  
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
@@ -304,9 +304,9 @@ launch in debug mode
 *               Type 'help' for in-line help.              *
 *                                                          *
 ************************************************************
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt  
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt
index cdeedc7863..ce678812fe 100644
--- a/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt
+++ b/epochX/cudacpp/ee_mumu.mad/Cards/me5_configuration.txt
@@ -234,7 +234,7 @@
 # pineappl = pineappl
 
 
-#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo 
+#mg5_path = /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo 
 
 # MG5 MAIN DIRECTORY
-#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo
+#mg5_path = /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo
diff --git a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat
index b49ff5e24a..9b246807bc 100644
--- a/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/ee_mumu.mad/Cards/proc_card_mg5.dat
@@ -45,5 +45,5 @@ define l+ = e+ mu+
 define l- = e- mu-
 define vl = ve vm vt
 define vl~ = ve~ vm~ vt~
-output madevent ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=False --\
-vector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd ../TMPOUT/CODEGEN_mad_ee_mumu --hel_recycling=Fal\
+se --vector_size=32
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/counters.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/counters.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/ompnumthreads.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/ompnumthreads.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/counters.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/counters.cc
new file mode 100644
index 0000000000..3bbdec9387
--- /dev/null
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/counters.cc
@@ -0,0 +1,98 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+
+#include "timer.h"
+#define TIMERTYPE std::chrono::high_resolution_clock
+
+#include <cassert>
+#include <cstdio>
+
+// NB1: The C functions counters_xxx_ in this file are called by Fortran code
+// Hence the trailing "_": 'call counters_end()' links to counters_end_
+// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
+
+// NB2: This file also contains C++ code and is built using g++
+// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
+// See https://www.geeksforgeeks.org/extern-c-in-c
+
+extern "C"
+{
+  // Now: fortran=-1, cudacpp=0
+  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
+  constexpr unsigned int nimplC = 2;
+  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
+  const char* iimplC2TXT( int iimplC )
+  {
+    const int iimplF = iimplC - 1;
+    switch( iimplF )
+    {
+      case -1: return "Fortran"; break;
+      case +0: return "CudaCpp"; break;
+      default: assert( false ); break;
+    }
+  }
+
+  static mgOnGpu::Timer<TIMERTYPE> program_timer;
+  static float program_totaltime = 0;
+  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
+  static float smatrix1_totaltime = 0;
+  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
+  static float smatrix1multi_totaltime[nimplC] = { 0 };
+  static int smatrix1_counter = 0;
+  static int smatrix1multi_counter[nimplC] = { 0 };
+
+  void counters_initialise_()
+  {
+    program_timer.Start();
+    return;
+  }
+
+  void counters_smatrix1_start_()
+  {
+    smatrix1_counter++;
+    smatrix1_timer.Start();
+    return;
+  }
+
+  void counters_smatrix1_stop_()
+  {
+    smatrix1_totaltime += smatrix1_timer.GetDuration();
+    return;
+  }
+
+  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
+  {
+    const unsigned int iimplC = iimplF2C( *iimplF );
+    smatrix1multi_counter[iimplC] += *pnevt;
+    smatrix1multi_timer[iimplC].Start();
+    return;
+  }
+
+  void counters_smatrix1multi_stop_( const int* iimplF )
+  {
+    const unsigned int iimplC = iimplF2C( *iimplF );
+    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
+    return;
+  }
+
+  void counters_finalise_()
+  {
+    program_totaltime += program_timer.GetDuration();
+    // Write to stdout
+    float overhead_totaltime = program_totaltime;
+    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
+    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
+    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
+    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
+      if( smatrix1multi_counter[iimplC] > 0 )
+        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
+                iimplC2TXT( iimplC ),
+                iimplC + 1,
+                smatrix1multi_totaltime[iimplC],
+                smatrix1multi_counter[iimplC],
+                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+    return;
+  }
+}
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/ompnumthreads.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/ompnumthreads.cc
new file mode 100644
index 0000000000..1d004923b9
--- /dev/null
+++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/ompnumthreads.cc
@@ -0,0 +1,25 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+
+#include <ompnumthreads.h>
+
+// NB1: The C function ompnumthreads_not_set_means_one_thread_ is called by Fortran code
+// Hence the trailing "_": 'call xxx()' links to xxx_
+// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
+
+// NB2: This file also contains C++ code and is built using g++
+// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
+// See https://www.geeksforgeeks.org/extern-c-in-c
+
+#ifdef _OPENMP
+extern "C"
+{
+  void ompnumthreads_not_set_means_one_thread_() // thin wrapper; only compiled when _OPENMP is defined
+  {
+    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
+    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
+  }
+}
+#endif
diff --git a/epochX/cudacpp/ee_mumu.mad/mg5.in b/epochX/cudacpp/ee_mumu.mad/mg5.in
index 4e83015b40..ce5c0456e0 100644
--- a/epochX/cudacpp/ee_mumu.mad/mg5.in
+++ b/epochX/cudacpp/ee_mumu.mad/mg5.in
@@ -1,4 +1,4 @@
 set stdout_level DEBUG
 set zerowidth_tchannel F
 generate e+ e- > mu+ mu-
-output madevent ee_mumu.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd ee_mumu.mad --hel_recycling=False --vector_size=32 
diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
index 0ca2931a2d..71b04c8320 100644
--- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
+++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt
@@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg
+import /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -62,7 +62,7 @@ generate e+ e- > mu+ mu-
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.005585908889770508 [0m
+[1;32mDEBUG: model prefixing  takes 0.005678653717041016 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -162,28 +162,29 @@ Load PLUGIN.CUDACPP_OUTPUT
 It has been validated for the last time with version: 3.5.2[0m
 [1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1;32mDEBUG:  cformat = [0m plugin [1;30m[export_cpp.py at line 3071][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 161][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 166][0m [0m
-INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu 
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 162][0m [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 167][0m [0m
+INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu 
 INFO: Organizing processes into subprocess groups 
 INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 
 INFO: Processing color information for process: e+ e- > mu+ mu- @1 
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [1;30m[output.py at line 195][0m [0m
-[1;32mDEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [1;30m[output.py at line 196][0m [0m
-[1;32mDEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [1;30m[output.py at line 197][0m [0m
-[1;32mDEBUG:    type(me)=<class 'int'> me=0 [1;30m[output.py at line 198][0m [0m
-INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum 
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc
-INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [1;30m[output.py at line 196][0m [0m
+[1;32mDEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [1;30m[output.py at line 197][0m [0m
+[1;32mDEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [1;30m[output.py at line 198][0m [0m
+[1;32mDEBUG:    type(me)=<class 'int'> me=0 [1;30m[output.py at line 199][0m [0m
+[1;32mDEBUG:  "need to link", self.to_link_in_P = [0m need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [1;30m[output.py at line 200][0m [0m
+INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/./CPPProcess.cc
+INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/. 
 Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 203][0m [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 205][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates FFV2 routines[0m
 ALOHA: aloha creates FFV4 routines[0m
 ALOHA: aloha creates FFV2_4 routines[0m
-ALOHA: aloha creates 4 routines in  0.267 s
+ALOHA: aloha creates 4 routines in  0.272 s
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV2
@@ -192,17 +193,17 @@ ALOHA: aloha creates 4 routines in  0.267 s
 <class 'aloha.create_aloha.AbstractRoutine'> FFV4
 <class 'aloha.create_aloha.AbstractRoutine'> FFV2_4
 <class 'aloha.create_aloha.AbstractRoutine'> FFV2_4
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h
-INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./HelAmps_sm.h
+INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
 super_write_set_parameters_onlyfixMajorana (hardcoded=False)
 super_write_set_parameters_onlyfixMajorana (hardcoded=True)
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory 
-INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
+INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. 
 quit
 
-real	0m0.705s
-user	0m0.594s
-sys	0m0.053s
-Code generation completed in 1 seconds
+real	0m0.668s
+user	0m0.613s
+sys	0m0.048s
+Code generation completed in 0 seconds
diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt
index 465f0fdf8e..1fc03e0c34 100644
--- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt
+++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt
@@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg
+import /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -62,7 +62,7 @@ generate g g > t t~
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.005699872970581055 [0m
+[1;32mDEBUG: model prefixing  takes 0.005725383758544922 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -157,27 +157,28 @@ INFO: Trying process: g g > t t~ WEIGHTED<=2 @1
 INFO: Process has 3 diagrams 
 1 processes with 3 diagrams generated in 0.008 s
 Total: 1 processes with 3 diagrams
-output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --vector_size=32
 Load PLUGIN.CUDACPP_OUTPUT
 [1;34mPlugin PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.5.3_lo_vect. 
 It has been validated for the last time with version: 3.5.2[0m
+[1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1mAddition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
-[1;32mDEBUG:  cformat = [0m standalone_cudacpp [1;30m[export_cpp.py at line 3071][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 161][0m [0m
+[1;32mDEBUG:  cformat = [0m standalone_simd [1;30m[export_cpp.py at line 3071][0m [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 162][0m [0m
 INFO: initialize a new directory: CODEGEN_mad_gg_tt 
 INFO: remove old information in CODEGEN_mad_gg_tt 
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 166][0m [0m
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt [0m
-INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt 
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards [0m
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 167][0m [0m
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt [0m
+INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt 
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards [0m
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses [0m
 INFO: Organizing processes into subprocess groups 
 INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 
 INFO: Processing color information for process: g g > t t~ @1 
 INFO: Creating files in directory P1_gg_ttx 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fe96c21e700> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f9dd03b2550> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -193,56 +194,55 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 
 INFO: Finding symmetric diagrams for subprocess group gg_ttx 
 Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s
-Wrote files for 10 helas calls in 0.099 s
+Wrote files for 10 helas calls in 0.103 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0[0m
 ALOHA: aloha creates FFV1 routines[0m
-ALOHA: aloha creates 2 routines in  0.146 s
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 203][0m [0m
+ALOHA: aloha creates 2 routines in  0.147 s
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 205][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0[0m
 ALOHA: aloha creates FFV1 routines[0m
-ALOHA: aloha creates 4 routines in  0.132 s
+ALOHA: aloha creates 4 routines in  0.135 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h
-INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./HelAmps_sm.h
+INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. 
 super_write_set_parameters_onlyfixMajorana (hardcoded=False)
 super_write_set_parameters_onlyfixMajorana (hardcoded=True)
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory 
-INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. 
+INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/src/. 
 The option zerowidth_tchannel is modified [True] but will not be written in the configuration files.
 If you want to make this value the default for future session, you can run 'save options --all'
-save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt
+save configuration file to /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt
 INFO: Use Fortran compiler gfortran 
 INFO: Use c++ compiler g++ 
 INFO: Generate web pages 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
 patching file Source/genps.inc
 patching file Source/makefile
 patching file SubProcesses/makefile
 patching file bin/internal/gen_ximprove.py
-Hunk #1 succeeded at 391 (offset 6 lines).
 patching file bin/internal/madevent_interface.py
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 patching file driver.f
 patching file matrix1.f
-[1;32mDEBUG:  p.returncode = [0m 0 [1;30m[output.py at line 238][0m [0m
-Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done.
+[1;32mDEBUG:  p.returncode = [0m 0 [1;30m[output.py at line 241][0m [0m
+Output to directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt done.
 Type "launch" to generate events from this process, or see
-/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README
+/data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/README
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m1.712s
-user	0m1.461s
-sys	0m0.230s
-Code generation completed in 1 seconds
+real	0m1.715s
+user	0m1.502s
+sys	0m0.214s
+Code generation completed in 2 seconds
 ************************************************************
 *                                                          *
 *                      W E L C O M E to                    *
@@ -263,9 +263,9 @@ Code generation completed in 1 seconds
 *               Type 'help' for in-line help.              *
 *                                                          *
 ************************************************************
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt  
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
@@ -293,9 +293,9 @@ launch in debug mode
 *               Type 'help' for in-line help.              *
 *                                                          *
 ************************************************************
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt  
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
diff --git a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt
index cdeedc7863..ce678812fe 100644
--- a/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt
+++ b/epochX/cudacpp/gg_tt.mad/Cards/me5_configuration.txt
@@ -234,7 +234,7 @@
 # pineappl = pineappl
 
 
-#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo 
+#mg5_path = /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo 
 
 # MG5 MAIN DIRECTORY
-#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo
+#mg5_path = /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo
diff --git a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat
index 9973b6a3db..90d6b27048 100644
--- a/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/gg_tt.mad/Cards/proc_card_mg5.dat
@@ -45,5 +45,5 @@ define l+ = e+ mu+
 define l- = e- mu-
 define vl = ve vm vt
 define vl~ = ve~ vm~ vt~
-output madevent ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False --ve\
-ctor_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt --hel_recycling=False\
+ --vector_size=32
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/counters.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/counters.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/ompnumthreads.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/ompnumthreads.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/counters.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/counters.cc
new file mode 100644
index 0000000000..3bbdec9387
--- /dev/null
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/counters.cc
@@ -0,0 +1,98 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+
+#include "timer.h"
+#define TIMERTYPE std::chrono::high_resolution_clock
+
+#include <cassert>
+#include <cstdio>
+
+// NB1: The C functions counters_xxx_ in this file are called by Fortran code
+// Hence the trailing "_": 'call counters_finalise()' links to counters_finalise_
+// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
+
+// NB2: This file also contains C++ code and is built using g++
+// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
+// See https://www.geeksforgeeks.org/extern-c-in-c
+
+extern "C"
+{
+  // Implementation ids received from Fortran (iimplF). Now: fortran=-1, cudacpp=0
+  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
+  constexpr unsigned int nimplC = 2; // number of slots in the per-implementation arrays below
+  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; } // shift by one so that iimplF=-1 maps to slot 0
+  const char* iimplC2TXT( int iimplC ) // printable name of the implementation stored in slot iimplC
+  {
+    const int iimplF = iimplC - 1; // back to the Fortran-side numbering (fortran=-1, cudacpp=0)
+    switch( iimplF )
+    {
+      case -1: return "Fortran";
+      case +0: return "CudaCpp";
+      default: assert( false ); return "Unknown"; // assert is a no-op under NDEBUG: still return a valid string
+    }
+  }
+
+  static mgOnGpu::Timer<TIMERTYPE> program_timer;               // started by counters_initialise_
+  static float program_totaltime = 0;                           // accumulated and printed by counters_finalise_
+  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;              // started by counters_smatrix1_start_
+  static float smatrix1_totaltime = 0;                          // accumulated by counters_smatrix1_stop_ (not printed in this file)
+  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC]; // one timer per implementation slot
+  static float smatrix1multi_totaltime[nimplC] = { 0 };         // accumulated time per implementation slot
+  static int smatrix1_counter = 0;                              // number of counters_smatrix1_start_ calls (not printed in this file)
+  static int smatrix1multi_counter[nimplC] = { 0 };             // number of events per implementation slot
+
+  // Fortran entry point: start the timer behind the PROGRAM TOTAL figure
+  void counters_initialise_()
+  {
+    program_timer.Start();
+  }
+
+  // Fortran entry point: count one more smatrix1 call, then start its timer
+  void counters_smatrix1_start_()
+  {
+    ++smatrix1_counter;
+    smatrix1_timer.Start();
+  }
+
+  // Fortran entry point: add the duration reported by the smatrix1 timer to its running total
+  void counters_smatrix1_stop_()
+  {
+    smatrix1_totaltime += smatrix1_timer.GetDuration();
+  }
+
+  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt ) // Fortran entry point (arguments by reference)
+  {
+    const unsigned int iimplC = iimplF2C( *iimplF );
+    assert( iimplC < nimplC ); // an unknown implementation id would index past the end of the arrays
+    smatrix1multi_counter[iimplC] += *pnevt; // add this call's *pnevt events to the slot's total
+    smatrix1multi_timer[iimplC].Start();
+  }
+
+  void counters_smatrix1multi_stop_( const int* iimplF ) // Fortran entry point (argument by reference)
+  {
+    const unsigned int iimplC = iimplF2C( *iimplF );
+    assert( iimplC < nimplC ); // an unknown implementation id would index past the end of the arrays
+    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
+  }
+
+  // Fortran entry point: add the program timer's duration to the total and print the [COUNTERS] summary to stdout
+  void counters_finalise_()
+  {
+    program_totaltime += program_timer.GetDuration();
+    // Overhead is the program time minus the time accumulated in every smatrix1multi slot
+    float overhead_totaltime = program_totaltime;
+    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
+    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
+    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
+    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
+      if( smatrix1multi_counter[iimplC] > 0 ) // only report slots that counted at least one event
+        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
+                iimplC2TXT( iimplC ),
+                static_cast<int>( iimplC ) + 1, // '%1d' expects a signed int, iimplC is unsigned
+                smatrix1multi_totaltime[iimplC],
+                smatrix1multi_counter[iimplC],
+                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+  }
+}
diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/ompnumthreads.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/ompnumthreads.cc
new file mode 100644
index 0000000000..1d004923b9
--- /dev/null
+++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/ompnumthreads.cc
@@ -0,0 +1,25 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+
+#include <ompnumthreads.h>
+
+// NB1: The C function ompnumthreads_not_set_means_one_thread_ is called by Fortran code
+// Hence the trailing "_": 'call xxx()' links to xxx_
+// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
+
+// NB2: This file also contains C++ code and is built using g++
+// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
+// See https://www.geeksforgeeks.org/extern-c-in-c
+
+#ifdef _OPENMP // the Fortran-callable wrapper below is only compiled when _OPENMP is defined
+extern "C"
+{
+  void ompnumthreads_not_set_means_one_thread_() // Fortran: 'call ompnumthreads_not_set_means_one_thread()'
+  {
+    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
+    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
+  }
+}
+#endif
diff --git a/epochX/cudacpp/gg_tt.mad/mg5.in b/epochX/cudacpp/gg_tt.mad/mg5.in
index b4b356fc51..95b259f47e 100644
--- a/epochX/cudacpp/gg_tt.mad/mg5.in
+++ b/epochX/cudacpp/gg_tt.mad/mg5.in
@@ -1,4 +1,4 @@
 set stdout_level DEBUG
 set zerowidth_tchannel F
 generate g g > t t~
-output madevent gg_tt.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd gg_tt.mad --hel_recycling=False --vector_size=32 
diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt
index 04ee1fae0a..96a207eb00 100644
--- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt
+++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt
@@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg
+import /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -62,7 +62,7 @@ generate g g > t t~
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.005332469940185547 [0m
+[1;32mDEBUG: model prefixing  takes 0.005444765090942383 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=2: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ WEIGHTED<=2 @1  
 INFO: Process has 3 diagrams 
-1 processes with 3 diagrams generated in 0.008 s
+1 processes with 3 diagrams generated in 0.009 s
 Total: 1 processes with 3 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_tt
 Load PLUGIN.CUDACPP_OUTPUT
@@ -163,41 +163,42 @@ Load PLUGIN.CUDACPP_OUTPUT
 It has been validated for the last time with version: 3.5.2[0m
 [1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1;32mDEBUG:  cformat = [0m plugin [1;30m[export_cpp.py at line 3071][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 161][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 166][0m [0m
-INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt 
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 162][0m [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 167][0m [0m
+INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt 
 INFO: Organizing processes into subprocess groups 
 INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 
 INFO: Processing color information for process: g g > t t~ @1 
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [1;30m[output.py at line 195][0m [0m
-[1;32mDEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [1;30m[output.py at line 196][0m [0m
-[1;32mDEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [1;30m[output.py at line 197][0m [0m
-[1;32mDEBUG:    type(me)=<class 'int'> me=0 [1;30m[output.py at line 198][0m [0m
-INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx 
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc
-INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. 
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [1;30m[output.py at line 196][0m [0m
+[1;32mDEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [1;30m[output.py at line 197][0m [0m
+[1;32mDEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [1;30m[output.py at line 198][0m [0m
+[1;32mDEBUG:    type(me)=<class 'int'> me=0 [1;30m[output.py at line 199][0m [0m
+[1;32mDEBUG:  "need to link", self.to_link_in_P = [0m need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [1;30m[output.py at line 200][0m [0m
+INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/./CPPProcess.cc
+INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/SubProcesses/P1_Sigma_sm_gg_ttx/. 
 Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 203][0m [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 205][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 set of routines with options: P0[0m
 ALOHA: aloha creates FFV1 routines[0m
-ALOHA: aloha creates 2 routines in  0.143 s
+ALOHA: aloha creates 2 routines in  0.146 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h
-INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./HelAmps_sm.h
+INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
 super_write_set_parameters_onlyfixMajorana (hardcoded=False)
 super_write_set_parameters_onlyfixMajorana (hardcoded=True)
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory 
-INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
+INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. 
 quit
 
-real	0m0.640s
-user	0m0.463s
-sys	0m0.061s
-Code generation completed in 0 seconds
+real	0m0.539s
+user	0m0.476s
+sys	0m0.058s
+Code generation completed in 1 seconds
diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt
index 86aeb0137d..99081f6854 100644
--- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt
+++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt
@@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg
+import /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -62,7 +62,7 @@ generate g g > t t~
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.005301952362060547 [0m
+[1;32mDEBUG: model prefixing  takes 0.005807399749755859 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -163,23 +163,24 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2  
 INFO: Process has 16 diagrams 
-1 processes with 16 diagrams generated in 0.019 s
+1 processes with 16 diagrams generated in 0.020 s
 Total: 2 processes with 19 diagrams
-output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32
 Load PLUGIN.CUDACPP_OUTPUT
 [1;34mPlugin PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.5.3_lo_vect. 
 It has been validated for the last time with version: 3.5.2[0m
+[1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1mAddition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
-[1;32mDEBUG:  cformat = [0m standalone_cudacpp [1;30m[export_cpp.py at line 3071][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 161][0m [0m
+[1;32mDEBUG:  cformat = [0m standalone_simd [1;30m[export_cpp.py at line 3071][0m [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 162][0m [0m
 INFO: initialize a new directory: CODEGEN_mad_gg_tt01g 
 INFO: remove old information in CODEGEN_mad_gg_tt01g 
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 166][0m [0m
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g [0m
-INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g 
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards [0m
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 167][0m [0m
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g [0m
+INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g 
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards [0m
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses [0m
 INFO: Organizing processes into subprocess groups 
 INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @2 
 INFO: Processing color information for process: g g > t t~ g @2 
@@ -187,7 +188,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1
 INFO: Processing color information for process: g g > t t~ @1 
 INFO: Creating files in directory P2_gg_ttxg 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f529e7651c0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f044cb7e250> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -204,7 +205,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2
 INFO: Finding symmetric diagrams for subprocess group gg_ttxg 
 INFO: Creating files in directory P1_gg_ttx 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f529e6f3fd0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f044cb7bbe0> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -219,23 +220,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 [1;32mDEBUG:  vector, subproc_group,self.opt['vector_size'] = [0m 32 True 32 [1;30m[export_v4.py at line 1871][0m [0m
 INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 
 INFO: Finding symmetric diagrams for subprocess group gg_ttx 
-Generated helas calls for 2 subprocesses (19 diagrams) in 0.042 s
-Wrote files for 46 helas calls in 0.250 s
+Generated helas calls for 2 subprocesses (19 diagrams) in 0.043 s
+Wrote files for 46 helas calls in 0.247 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV3 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV4 set of routines with options: P0[0m
-ALOHA: aloha creates 5 routines in  0.323 s
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 203][0m [0m
+ALOHA: aloha creates 5 routines in  0.334 s
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 205][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV3 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV4 set of routines with options: P0[0m
-ALOHA: aloha creates 10 routines in  0.307 s
+ALOHA: aloha creates 10 routines in  0.320 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -245,32 +246,31 @@ ALOHA: aloha creates 10 routines in  0.307 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV3
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV4
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h
-INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./HelAmps_sm.h
+INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. 
 super_write_set_parameters_onlyfixMajorana (hardcoded=False)
 super_write_set_parameters_onlyfixMajorana (hardcoded=True)
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory 
-INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. 
+INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/src/. 
 The option zerowidth_tchannel is modified [True] but will not be written in the configuration files.
 If you want to make this value the default for future session, you can run 'save options --all'
-save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt
+save configuration file to /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt
 INFO: Use Fortran compiler gfortran 
 INFO: Use c++ compiler g++ 
 INFO: Generate web pages 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
 patching file Source/genps.inc
 patching file Source/makefile
 patching file SubProcesses/makefile
 patching file bin/internal/gen_ximprove.py
-Hunk #1 succeeded at 391 (offset 6 lines).
 patching file bin/internal/madevent_interface.py
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses/P1_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 patching file driver.f
 patching file matrix1.f
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses/P2_gg_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/SubProcesses/P2_gg_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 patching file driver.f
 patching file matrix1.f
@@ -278,16 +278,16 @@ Hunk #2 succeeded at 159 (offset 16 lines).
 Hunk #3 succeeded at 237 (offset 16 lines).
 Hunk #4 succeeded at 265 (offset 16 lines).
 Hunk #5 succeeded at 310 (offset 16 lines).
-[1;32mDEBUG:  p.returncode = [0m 0 [1;30m[output.py at line 238][0m [0m
-Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done.
+[1;32mDEBUG:  p.returncode = [0m 0 [1;30m[output.py at line 241][0m [0m
+Output to directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g done.
 Type "launch" to generate events from this process, or see
-/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README
+/data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/README
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m2.306s
-user	0m2.029s
-sys	0m0.243s
+real	0m2.337s
+user	0m2.082s
+sys	0m0.248s
 Code generation completed in 2 seconds
 ************************************************************
 *                                                          *
@@ -309,9 +309,9 @@ Code generation completed in 2 seconds
 *               Type 'help' for in-line help.              *
 *                                                          *
 ************************************************************
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt  
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
@@ -339,9 +339,9 @@ launch in debug mode
 *               Type 'help' for in-line help.              *
 *                                                          *
 ************************************************************
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt  
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt
index cdeedc7863..ce678812fe 100644
--- a/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt
+++ b/epochX/cudacpp/gg_tt01g.mad/Cards/me5_configuration.txt
@@ -234,7 +234,7 @@
 # pineappl = pineappl
 
 
-#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo 
+#mg5_path = /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo 
 
 # MG5 MAIN DIRECTORY
-#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo
+#mg5_path = /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo
diff --git a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat
index 2fd6fdb2bf..1b2fc5f0b6 100644
--- a/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/gg_tt01g.mad/Cards/proc_card_mg5.dat
@@ -46,5 +46,5 @@ define l- = e- mu-
 define vl = ve vm vt
 define vl~ = ve~ vm~ vt~
 add process g g > t t~ g
-output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False -\
--vector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=Fa\
+lse --vector_size=32
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/counters.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/counters.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/ompnumthreads.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/ompnumthreads.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/counters.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/counters.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/ompnumthreads.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/ompnumthreads.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/counters.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/counters.cc
new file mode 100644
index 0000000000..3bbdec9387
--- /dev/null
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/counters.cc
@@ -0,0 +1,98 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+
+#include "timer.h"
+#define TIMERTYPE std::chrono::high_resolution_clock
+
+#include <cassert>
+#include <cstdio>
+
+// NB1: The C functions counters_xxx_ in this file are called by Fortran code
+// Hence the trailing "_": 'call counters_end()' links to counters_end_
+// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
+
+// NB2: This file also contains C++ code and is built using g++
+// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
+// See https://www.geeksforgeeks.org/extern-c-in-c
+
+extern "C"
+{
+  // Now: fortran=-1, cudacpp=0
+  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
+  constexpr unsigned int nimplC = 2;
+  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
+  const char* iimplC2TXT( int iimplC )
+  {
+    const int iimplF = iimplC - 1;
+    switch( iimplF )
+    {
+      case -1: return "Fortran"; break;
+      case +0: return "CudaCpp"; break;
+      default: assert( false ); break;
+    }
+  }
+
+  static mgOnGpu::Timer<TIMERTYPE> program_timer;
+  static float program_totaltime = 0;
+  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
+  static float smatrix1_totaltime = 0;
+  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
+  static float smatrix1multi_totaltime[nimplC] = { 0 };
+  static int smatrix1_counter = 0;
+  static int smatrix1multi_counter[nimplC] = { 0 };
+
+  void counters_initialise_()
+  {
+    program_timer.Start();
+    return;
+  }
+
+  void counters_smatrix1_start_()
+  {
+    smatrix1_counter++;
+    smatrix1_timer.Start();
+    return;
+  }
+
+  void counters_smatrix1_stop_()
+  {
+    smatrix1_totaltime += smatrix1_timer.GetDuration();
+    return;
+  }
+
+  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
+  {
+    const unsigned int iimplC = iimplF2C( *iimplF );
+    smatrix1multi_counter[iimplC] += *pnevt;
+    smatrix1multi_timer[iimplC].Start();
+    return;
+  }
+
+  void counters_smatrix1multi_stop_( const int* iimplF )
+  {
+    const unsigned int iimplC = iimplF2C( *iimplF );
+    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
+    return;
+  }
+
+  void counters_finalise_()
+  {
+    program_totaltime += program_timer.GetDuration();
+    // Write to stdout
+    float overhead_totaltime = program_totaltime;
+    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
+    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
+    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
+    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
+      if( smatrix1multi_counter[iimplC] > 0 )
+        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
+                iimplC2TXT( iimplC ),
+                iimplC + 1,
+                smatrix1multi_totaltime[iimplC],
+                smatrix1multi_counter[iimplC],
+                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+    return;
+  }
+}
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/ompnumthreads.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/ompnumthreads.cc
new file mode 100644
index 0000000000..1d004923b9
--- /dev/null
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/ompnumthreads.cc
@@ -0,0 +1,25 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+
+#include <ompnumthreads.h>
+
+// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
+// Hence the trailing "_": 'call xxx()' links to xxx_
+// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
+
+// NB2: This file also contains C++ code and is built using g++
+// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
+// See https://www.geeksforgeeks.org/extern-c-in-c
+
+#ifdef _OPENMP
+extern "C"
+{
+  void ompnumthreads_not_set_means_one_thread_()
+  {
+    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
+    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
+  }
+}
+#endif
diff --git a/epochX/cudacpp/gg_tt01g.mad/mg5.in b/epochX/cudacpp/gg_tt01g.mad/mg5.in
index 95984fcf10..f297253b90 100644
--- a/epochX/cudacpp/gg_tt01g.mad/mg5.in
+++ b/epochX/cudacpp/gg_tt01g.mad/mg5.in
@@ -2,4 +2,4 @@ set stdout_level DEBUG
 set zerowidth_tchannel F
 generate g g > t t~
 add process g g > t t~ g
-output madevent gg_tt01g.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd gg_tt01g.mad --hel_recycling=False --vector_size=32 
diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt
index 1d520f7648..fa7fec111b 100644
--- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt
+++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt
@@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg
+import /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -62,7 +62,7 @@ generate g g > t t~ g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.0055522918701171875 [0m
+[1;32mDEBUG: model prefixing  takes 0.005398988723754883 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -155,29 +155,30 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1  
 INFO: Process has 16 diagrams 
-1 processes with 16 diagrams generated in 0.021 s
+1 processes with 16 diagrams generated in 0.022 s
 Total: 1 processes with 16 diagrams
-output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32
 Load PLUGIN.CUDACPP_OUTPUT
 [1;34mPlugin PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.5.3_lo_vect. 
 It has been validated for the last time with version: 3.5.2[0m
+[1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1mAddition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
-[1;32mDEBUG:  cformat = [0m standalone_cudacpp [1;30m[export_cpp.py at line 3071][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 161][0m [0m
+[1;32mDEBUG:  cformat = [0m standalone_simd [1;30m[export_cpp.py at line 3071][0m [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 162][0m [0m
 INFO: initialize a new directory: CODEGEN_mad_gg_ttg 
 INFO: remove old information in CODEGEN_mad_gg_ttg 
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 166][0m [0m
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg [0m
-INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg 
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards [0m
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 167][0m [0m
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg [0m
+INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg 
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards [0m
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses [0m
 INFO: Organizing processes into subprocess groups 
 INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 
 INFO: Processing color information for process: g g > t t~ g @1 
 INFO: Creating files in directory P1_gg_ttxg 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f1f32f57700> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fe966ec4550> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -192,23 +193,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 [1;32mDEBUG:  vector, subproc_group,self.opt['vector_size'] = [0m 32 True 32 [1;30m[export_v4.py at line 1871][0m [0m
 INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 
 INFO: Finding symmetric diagrams for subprocess group gg_ttxg 
-Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s
-Wrote files for 36 helas calls in 0.148 s
+Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s
+Wrote files for 36 helas calls in 0.152 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV3 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV4 set of routines with options: P0[0m
-ALOHA: aloha creates 5 routines in  0.327 s
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 203][0m [0m
+ALOHA: aloha creates 5 routines in  0.338 s
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 205][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV3 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV4 set of routines with options: P0[0m
-ALOHA: aloha creates 10 routines in  0.315 s
+ALOHA: aloha creates 10 routines in  0.332 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -218,28 +219,27 @@ ALOHA: aloha creates 10 routines in  0.315 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV3
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV4
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h
-INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./HelAmps_sm.h
+INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. 
 super_write_set_parameters_onlyfixMajorana (hardcoded=False)
 super_write_set_parameters_onlyfixMajorana (hardcoded=True)
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory 
-INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. 
+INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/src/. 
 The option zerowidth_tchannel is modified [True] but will not be written in the configuration files.
 If you want to make this value the default for future session, you can run 'save options --all'
-save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt
+save configuration file to /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt
 INFO: Use Fortran compiler gfortran 
 INFO: Use c++ compiler g++ 
 INFO: Generate web pages 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
 patching file Source/genps.inc
 patching file Source/makefile
 patching file SubProcesses/makefile
 patching file bin/internal/gen_ximprove.py
-Hunk #1 succeeded at 391 (offset 6 lines).
 patching file bin/internal/madevent_interface.py
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses/P1_gg_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/SubProcesses/P1_gg_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 patching file driver.f
 patching file matrix1.f
@@ -247,16 +247,16 @@ Hunk #2 succeeded at 159 (offset 16 lines).
 Hunk #3 succeeded at 237 (offset 16 lines).
 Hunk #4 succeeded at 265 (offset 16 lines).
 Hunk #5 succeeded at 310 (offset 16 lines).
-[1;32mDEBUG:  p.returncode = [0m 0 [1;30m[output.py at line 238][0m [0m
-Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done.
+[1;32mDEBUG:  p.returncode = [0m 0 [1;30m[output.py at line 241][0m [0m
+Output to directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg done.
 Type "launch" to generate events from this process, or see
-/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README
+/data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/README
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m2.204s
-user	0m1.932s
-sys	0m0.252s
+real	0m2.239s
+user	0m2.005s
+sys	0m0.234s
 Code generation completed in 2 seconds
 ************************************************************
 *                                                          *
@@ -278,9 +278,9 @@ Code generation completed in 2 seconds
 *               Type 'help' for in-line help.              *
 *                                                          *
 ************************************************************
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt  
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
@@ -308,9 +308,9 @@ launch in debug mode
 *               Type 'help' for in-line help.              *
 *                                                          *
 ************************************************************
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt  
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt
index cdeedc7863..ce678812fe 100644
--- a/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt
+++ b/epochX/cudacpp/gg_ttg.mad/Cards/me5_configuration.txt
@@ -234,7 +234,7 @@
 # pineappl = pineappl
 
 
-#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo 
+#mg5_path = /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo 
 
 # MG5 MAIN DIRECTORY
-#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo
+#mg5_path = /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo
diff --git a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat
index bc26df64f3..72d7a0efd4 100644
--- a/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/gg_ttg.mad/Cards/proc_card_mg5.dat
@@ -45,5 +45,5 @@ define l+ = e+ mu+
 define l- = e- mu-
 define vl = ve vm vt
 define vl~ = ve~ vm~ vt~
-output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --v\
-ector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=Fals\
+e --vector_size=32
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/counters.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/counters.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/ompnumthreads.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/ompnumthreads.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/counters.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/counters.cc
new file mode 100644
index 0000000000..3bbdec9387
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/counters.cc
@@ -0,0 +1,98 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+
+#include "timer.h"
+#define TIMERTYPE std::chrono::high_resolution_clock
+
+#include <cassert>
+#include <cstdio>
+
+// NB1: The C functions counters_xxx_ in this file are called by Fortran code
+// Hence the trailing "_": 'call counters_end()' links to counters_end_
+// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
+
+// NB2: This file also contains C++ code and is built using g++
+// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
+// See https://www.geeksforgeeks.org/extern-c-in-c
+
+extern "C"
+{
+  // Now: fortran=-1, cudacpp=0
+  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
+  constexpr unsigned int nimplC = 2;
+  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
+  const char* iimplC2TXT( int iimplC )
+  {
+    const int iimplF = iimplC - 1;
+    switch( iimplF )
+    {
+      case -1: return "Fortran"; break;
+      case +0: return "CudaCpp"; break;
+      default: assert( false ); break;
+    }
+  }
+
+  static mgOnGpu::Timer<TIMERTYPE> program_timer;
+  static float program_totaltime = 0;
+  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
+  static float smatrix1_totaltime = 0;
+  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
+  static float smatrix1multi_totaltime[nimplC] = { 0 };
+  static int smatrix1_counter = 0;
+  static int smatrix1multi_counter[nimplC] = { 0 };
+
+  void counters_initialise_()
+  {
+    program_timer.Start();
+    return;
+  }
+
+  void counters_smatrix1_start_()
+  {
+    smatrix1_counter++;
+    smatrix1_timer.Start();
+    return;
+  }
+
+  void counters_smatrix1_stop_()
+  {
+    smatrix1_totaltime += smatrix1_timer.GetDuration();
+    return;
+  }
+
+  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
+  {
+    const unsigned int iimplC = iimplF2C( *iimplF );
+    smatrix1multi_counter[iimplC] += *pnevt;
+    smatrix1multi_timer[iimplC].Start();
+    return;
+  }
+
+  void counters_smatrix1multi_stop_( const int* iimplF )
+  {
+    const unsigned int iimplC = iimplF2C( *iimplF );
+    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
+    return;
+  }
+
+  void counters_finalise_()
+  {
+    program_totaltime += program_timer.GetDuration();
+    // Write to stdout
+    float overhead_totaltime = program_totaltime;
+    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
+    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
+    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
+    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
+      if( smatrix1multi_counter[iimplC] > 0 )
+        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
+                iimplC2TXT( iimplC ),
+                iimplC + 1,
+                smatrix1multi_totaltime[iimplC],
+                smatrix1multi_counter[iimplC],
+                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+    return;
+  }
+}
diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/ompnumthreads.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/ompnumthreads.cc
new file mode 100644
index 0000000000..1d004923b9
--- /dev/null
+++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/ompnumthreads.cc
@@ -0,0 +1,25 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+
+#include <ompnumthreads.h>
+
+// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
+// Hence the trailing "_": 'call xxx()' links to xxx_
+// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
+
+// NB2: This file also contains C++ code and is built using g++
+// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
+// See https://www.geeksforgeeks.org/extern-c-in-c
+
+#ifdef _OPENMP
+extern "C"
+{
+  void ompnumthreads_not_set_means_one_thread_()
+  {
+    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
+    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
+  }
+}
+#endif
diff --git a/epochX/cudacpp/gg_ttg.mad/mg5.in b/epochX/cudacpp/gg_ttg.mad/mg5.in
index e37d10d865..f4b43bcd8d 100644
--- a/epochX/cudacpp/gg_ttg.mad/mg5.in
+++ b/epochX/cudacpp/gg_ttg.mad/mg5.in
@@ -1,4 +1,4 @@
 set stdout_level DEBUG
 set zerowidth_tchannel F
 generate g g > t t~ g
-output madevent gg_ttg.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd gg_ttg.mad --hel_recycling=False --vector_size=32 
diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt
index 26d60e142d..ecf3e1d46a 100644
--- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt
+++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt
@@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg
+import /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -62,7 +62,7 @@ generate g g > t t~ g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.0054738521575927734 [0m
+[1;32mDEBUG: model prefixing  takes 0.005639791488647461 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1  
 INFO: Process has 16 diagrams 
-1 processes with 16 diagrams generated in 0.021 s
+1 processes with 16 diagrams generated in 0.022 s
 Total: 1 processes with 16 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttg
 Load PLUGIN.CUDACPP_OUTPUT
@@ -163,29 +163,30 @@ Load PLUGIN.CUDACPP_OUTPUT
 It has been validated for the last time with version: 3.5.2[0m
 [1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1;32mDEBUG:  cformat = [0m plugin [1;30m[export_cpp.py at line 3071][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 161][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 166][0m [0m
-INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg 
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 162][0m [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 167][0m [0m
+INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg 
 INFO: Organizing processes into subprocess groups 
 INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 
 INFO: Processing color information for process: g g > t t~ g @1 
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [1;30m[output.py at line 195][0m [0m
-[1;32mDEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [1;30m[output.py at line 196][0m [0m
-[1;32mDEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [1;30m[output.py at line 197][0m [0m
-[1;32mDEBUG:    type(me)=<class 'int'> me=0 [1;30m[output.py at line 198][0m [0m
-INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg 
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc
-INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. 
-Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 203][0m [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [1;30m[output.py at line 196][0m [0m
+[1;32mDEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [1;30m[output.py at line 197][0m [0m
+[1;32mDEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [1;30m[output.py at line 198][0m [0m
+[1;32mDEBUG:    type(me)=<class 'int'> me=0 [1;30m[output.py at line 199][0m [0m
+[1;32mDEBUG:  "need to link", self.to_link_in_P = [0m need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [1;30m[output.py at line 200][0m [0m
+INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc
+INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. 
+Generated helas calls for 1 subprocesses (16 diagrams) in 0.038 s
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 205][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV3 set of routines with options: P0[0m
 ALOHA: aloha creates VVVV4 set of routines with options: P0[0m
-ALOHA: aloha creates 5 routines in  0.323 s
+ALOHA: aloha creates 5 routines in  0.332 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -195,17 +196,17 @@ ALOHA: aloha creates 5 routines in  0.323 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV3
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV4
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h
-INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./HelAmps_sm.h
+INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
 super_write_set_parameters_onlyfixMajorana (hardcoded=False)
 super_write_set_parameters_onlyfixMajorana (hardcoded=True)
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory 
-INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
+INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
 quit
 
-real	0m0.778s
-user	0m0.718s
-sys	0m0.051s
-Code generation completed in 1 seconds
+real	0m0.790s
+user	0m0.736s
+sys	0m0.050s
+Code generation completed in 0 seconds
diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt
index 8b11d9e97b..11131eaf14 100644
--- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt
+++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt
@@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg
+import /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -62,7 +62,7 @@ generate g g > t t~ g g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.005313873291015625 [0m
+[1;32mDEBUG: model prefixing  takes 0.005640506744384766 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -155,29 +155,30 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1  
 INFO: Process has 123 diagrams 
-1 processes with 123 diagrams generated in 0.157 s
+1 processes with 123 diagrams generated in 0.160 s
 Total: 1 processes with 123 diagrams
-output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32
 Load PLUGIN.CUDACPP_OUTPUT
 [1;34mPlugin PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.5.3_lo_vect. 
 It has been validated for the last time with version: 3.5.2[0m
+[1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1mAddition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
-[1;32mDEBUG:  cformat = [0m standalone_cudacpp [1;30m[export_cpp.py at line 3071][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 161][0m [0m
+[1;32mDEBUG:  cformat = [0m standalone_simd [1;30m[export_cpp.py at line 3071][0m [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 162][0m [0m
 INFO: initialize a new directory: CODEGEN_mad_gg_ttgg 
 INFO: remove old information in CODEGEN_mad_gg_ttgg 
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 166][0m [0m
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg [0m
-INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg 
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards [0m
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 167][0m [0m
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg [0m
+INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg 
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards [0m
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses [0m
 INFO: Organizing processes into subprocess groups 
 INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 
 INFO: Processing color information for process: g g > t t~ g g @1 
 INFO: Creating files in directory P1_gg_ttxgg 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f5fd0ee0fa0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f0146762fd0> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -192,23 +193,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 [1;32mDEBUG:  vector, subproc_group,self.opt['vector_size'] = [0m 32 True 32 [1;30m[export_v4.py at line 1871][0m [0m
 INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 
 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg 
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.423 s
-Wrote files for 222 helas calls in 0.700 s
+Generated helas calls for 1 subprocesses (123 diagrams) in 0.432 s
+Wrote files for 222 helas calls in 0.704 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV3 routines[0m
 ALOHA: aloha creates VVVV4 routines[0m
-ALOHA: aloha creates 5 routines in  0.327 s
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 203][0m [0m
+ALOHA: aloha creates 5 routines in  0.337 s
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 205][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV3 routines[0m
 ALOHA: aloha creates VVVV4 routines[0m
-ALOHA: aloha creates 10 routines in  0.310 s
+ALOHA: aloha creates 10 routines in  0.322 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -221,28 +222,27 @@ ALOHA: aloha creates 10 routines in  0.310 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV3
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV4
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV4
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h
-INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./HelAmps_sm.h
+INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. 
 super_write_set_parameters_onlyfixMajorana (hardcoded=False)
 super_write_set_parameters_onlyfixMajorana (hardcoded=True)
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory 
-INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. 
+INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/src/. 
 The option zerowidth_tchannel is modified [True] but will not be written in the configuration files.
 If you want to make this value the default for future session, you can run 'save options --all'
-save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt
+save configuration file to /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt
 INFO: Use Fortran compiler gfortran 
 INFO: Use c++ compiler g++ 
 INFO: Generate web pages 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
 patching file Source/genps.inc
 patching file Source/makefile
 patching file SubProcesses/makefile
 patching file bin/internal/gen_ximprove.py
-Hunk #1 succeeded at 391 (offset 6 lines).
 patching file bin/internal/madevent_interface.py
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses/P1_gg_ttxgg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/SubProcesses/P1_gg_ttxgg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 patching file driver.f
 patching file matrix1.f
@@ -250,16 +250,16 @@ Hunk #2 succeeded at 191 (offset 48 lines).
 Hunk #3 succeeded at 269 (offset 48 lines).
 Hunk #4 succeeded at 297 (offset 48 lines).
 Hunk #5 succeeded at 342 (offset 48 lines).
-[1;32mDEBUG:  p.returncode = [0m 0 [1;30m[output.py at line 238][0m [0m
-Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done.
+[1;32mDEBUG:  p.returncode = [0m 0 [1;30m[output.py at line 241][0m [0m
+Output to directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg done.
 Type "launch" to generate events from this process, or see
-/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README
+/data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/README
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m3.327s
-user	0m3.009s
-sys	0m0.283s
+real	0m3.333s
+user	0m3.080s
+sys	0m0.236s
 Code generation completed in 3 seconds
 ************************************************************
 *                                                          *
@@ -281,9 +281,9 @@ Code generation completed in 3 seconds
 *               Type 'help' for in-line help.              *
 *                                                          *
 ************************************************************
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt  
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
@@ -311,9 +311,9 @@ launch in debug mode
 *               Type 'help' for in-line help.              *
 *                                                          *
 ************************************************************
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt  
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt
index cdeedc7863..ce678812fe 100644
--- a/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt
+++ b/epochX/cudacpp/gg_ttgg.mad/Cards/me5_configuration.txt
@@ -234,7 +234,7 @@
 # pineappl = pineappl
 
 
-#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo 
+#mg5_path = /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo 
 
 # MG5 MAIN DIRECTORY
-#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo
+#mg5_path = /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo
diff --git a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat
index b9a294d7a3..f4efb79920 100644
--- a/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/gg_ttgg.mad/Cards/proc_card_mg5.dat
@@ -45,5 +45,5 @@ define l+ = e+ mu+
 define l- = e- mu-
 define vl = ve vm vt
 define vl~ = ve~ vm~ vt~
-output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --\
-vector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=Fal\
+se --vector_size=32
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/counters.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/counters.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/ompnumthreads.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/ompnumthreads.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/counters.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/counters.cc
new file mode 100644
index 0000000000..3bbdec9387
--- /dev/null
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/counters.cc
@@ -0,0 +1,98 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+
+#include "timer.h"
+#define TIMERTYPE std::chrono::high_resolution_clock
+
+#include <cassert>
+#include <cstdio>
+
+// NB1: The C functions counters_xxx_ in this file are called by Fortran code
+// Hence the trailing "_": 'call counters_end()' links to counters_end_
+// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
+
+// NB2: This file also contains C++ code and is built using g++
+// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
+// See https://www.geeksforgeeks.org/extern-c-in-c
+
+extern "C"
+{
+  // Now: fortran=-1, cudacpp=0
+  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
+  constexpr unsigned int nimplC = 2;
+  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
+  const char* iimplC2TXT( int iimplC )
+  {
+    const int iimplF = iimplC - 1;
+    switch( iimplF )
+    {
+      case -1: return "Fortran"; break;
+      case +0: return "CudaCpp"; break;
+      default: assert( false ); break;
+    }
+  }
+
+  static mgOnGpu::Timer<TIMERTYPE> program_timer;
+  static float program_totaltime = 0;
+  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
+  static float smatrix1_totaltime = 0;
+  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
+  static float smatrix1multi_totaltime[nimplC] = { 0 };
+  static int smatrix1_counter = 0;
+  static int smatrix1multi_counter[nimplC] = { 0 };
+
+  void counters_initialise_()
+  {
+    program_timer.Start();
+    return;
+  }
+
+  void counters_smatrix1_start_()
+  {
+    smatrix1_counter++;
+    smatrix1_timer.Start();
+    return;
+  }
+
+  void counters_smatrix1_stop_()
+  {
+    smatrix1_totaltime += smatrix1_timer.GetDuration();
+    return;
+  }
+
+  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
+  {
+    const unsigned int iimplC = iimplF2C( *iimplF );
+    smatrix1multi_counter[iimplC] += *pnevt;
+    smatrix1multi_timer[iimplC].Start();
+    return;
+  }
+
+  void counters_smatrix1multi_stop_( const int* iimplF )
+  {
+    const unsigned int iimplC = iimplF2C( *iimplF );
+    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
+    return;
+  }
+
+  void counters_finalise_()
+  {
+    program_totaltime += program_timer.GetDuration();
+    // Write to stdout
+    float overhead_totaltime = program_totaltime;
+    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
+    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
+    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
+    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
+      if( smatrix1multi_counter[iimplC] > 0 )
+        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
+                iimplC2TXT( iimplC ),
+                iimplC + 1,
+                smatrix1multi_totaltime[iimplC],
+                smatrix1multi_counter[iimplC],
+                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+    return;
+  }
+}
diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/ompnumthreads.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/ompnumthreads.cc
new file mode 100644
index 0000000000..1d004923b9
--- /dev/null
+++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/ompnumthreads.cc
@@ -0,0 +1,25 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+
+#include <ompnumthreads.h>
+
+// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
+// Hence the trailing "_": 'call xxx()' links to xxx_
+// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
+
+// NB2: This file also contains C++ code and is built using g++
+// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
+// See https://www.geeksforgeeks.org/extern-c-in-c
+
+#ifdef _OPENMP
+extern "C"
+{
+  void ompnumthreads_not_set_means_one_thread_()
+  {
+    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
+    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
+  }
+}
+#endif
diff --git a/epochX/cudacpp/gg_ttgg.mad/mg5.in b/epochX/cudacpp/gg_ttgg.mad/mg5.in
index 53784bf161..05b3fbcbac 100644
--- a/epochX/cudacpp/gg_ttgg.mad/mg5.in
+++ b/epochX/cudacpp/gg_ttgg.mad/mg5.in
@@ -1,4 +1,4 @@
 set stdout_level DEBUG
 set zerowidth_tchannel F
 generate g g > t t~ g g
-output madevent gg_ttgg.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd gg_ttgg.mad --hel_recycling=False --vector_size=32 
diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
index 8a72b5a0f4..38a3c3a518 100644
--- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
+++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt
@@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg
+import /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -62,7 +62,7 @@ generate g g > t t~ g g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.0057239532470703125 [0m
+[1;32mDEBUG: model prefixing  takes 0.00548243522644043 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1  
 INFO: Process has 123 diagrams 
-1 processes with 123 diagrams generated in 0.157 s
+1 processes with 123 diagrams generated in 0.162 s
 Total: 1 processes with 123 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg
 Load PLUGIN.CUDACPP_OUTPUT
@@ -163,29 +163,30 @@ Load PLUGIN.CUDACPP_OUTPUT
 It has been validated for the last time with version: 3.5.2[0m
 [1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1;32mDEBUG:  cformat = [0m plugin [1;30m[export_cpp.py at line 3071][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 161][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 166][0m [0m
-INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg 
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 162][0m [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 167][0m [0m
+INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg 
 INFO: Organizing processes into subprocess groups 
 INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 
 INFO: Processing color information for process: g g > t t~ g g @1 
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [1;30m[output.py at line 195][0m [0m
-[1;32mDEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [1;30m[output.py at line 196][0m [0m
-[1;32mDEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [1;30m[output.py at line 197][0m [0m
-[1;32mDEBUG:    type(me)=<class 'int'> me=0 [1;30m[output.py at line 198][0m [0m
-INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg 
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc
-INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.421 s
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 203][0m [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [1;30m[output.py at line 196][0m [0m
+[1;32mDEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [1;30m[output.py at line 197][0m [0m
+[1;32mDEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [1;30m[output.py at line 198][0m [0m
+[1;32mDEBUG:    type(me)=<class 'int'> me=0 [1;30m[output.py at line 199][0m [0m
+[1;32mDEBUG:  "need to link", self.to_link_in_P = [0m need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [1;30m[output.py at line 200][0m [0m
+INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc
+INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
+Generated helas calls for 1 subprocesses (123 diagrams) in 0.435 s
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 205][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV3 routines[0m
 ALOHA: aloha creates VVVV4 routines[0m
-ALOHA: aloha creates 5 routines in  0.320 s
+ALOHA: aloha creates 5 routines in  0.326 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -198,17 +199,17 @@ ALOHA: aloha creates 5 routines in  0.320 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV3
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV4
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV4
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h
-INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./HelAmps_sm.h
+INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
 super_write_set_parameters_onlyfixMajorana (hardcoded=False)
 super_write_set_parameters_onlyfixMajorana (hardcoded=True)
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory 
-INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
+INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. 
 quit
 
-real	0m1.486s
-user	0m1.373s
-sys	0m0.056s
+real	0m1.468s
+user	0m1.398s
+sys	0m0.062s
 Code generation completed in 1 seconds
diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
index df59413576..e8b21a0952 100644
--- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
+++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt
@@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg
+import /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -62,7 +62,7 @@ generate g g > t t~ g g g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.0055010318756103516 [0m
+[1;32mDEBUG: model prefixing  takes 0.005578756332397461 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -155,23 +155,24 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1  
 INFO: Process has 1240 diagrams 
-1 processes with 1240 diagrams generated in 1.859 s
+1 processes with 1240 diagrams generated in 1.919 s
 Total: 1 processes with 1240 diagrams
-output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32
 Load PLUGIN.CUDACPP_OUTPUT
 [1;34mPlugin PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.5.3_lo_vect. 
 It has been validated for the last time with version: 3.5.2[0m
+[1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1mAddition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
-[1;32mDEBUG:  cformat = [0m standalone_cudacpp [1;30m[export_cpp.py at line 3071][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 161][0m [0m
+[1;32mDEBUG:  cformat = [0m standalone_simd [1;30m[export_cpp.py at line 3071][0m [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 162][0m [0m
 INFO: initialize a new directory: CODEGEN_mad_gg_ttggg 
 INFO: remove old information in CODEGEN_mad_gg_ttggg 
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 166][0m [0m
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg [0m
-INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg 
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards [0m
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 167][0m [0m
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg [0m
+INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg 
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards [0m
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses [0m
 INFO: Organizing processes into subprocess groups 
 INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 
 INFO: Processing color information for process: g g > t t~ g g g @1 
@@ -179,7 +180,7 @@ INFO: Creating files in directory P1_gg_ttxggg
 INFO: Computing Color-Flow optimization [15120 term] 
 INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7ff305d425e0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f17eacb05e0> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -194,23 +195,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 [1;32mDEBUG:  vector, subproc_group,self.opt['vector_size'] = [0m 32 True 32 [1;30m[export_v4.py at line 1871][0m [0m
 INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 
 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg 
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.577 s
-Wrote files for 2281 helas calls in 18.096 s
+Generated helas calls for 1 subprocesses (1240 diagrams) in 6.694 s
+Wrote files for 2281 helas calls in 18.830 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV3 routines[0m
 ALOHA: aloha creates VVVV4 routines[0m
-ALOHA: aloha creates 5 routines in  0.313 s
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 203][0m [0m
+ALOHA: aloha creates 5 routines in  0.326 s
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 205][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV3 routines[0m
 ALOHA: aloha creates VVVV4 routines[0m
-ALOHA: aloha creates 10 routines in  0.305 s
+ALOHA: aloha creates 10 routines in  0.320 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -223,28 +224,27 @@ ALOHA: aloha creates 10 routines in  0.305 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV3
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV4
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV4
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h
-INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./HelAmps_sm.h
+INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. 
 super_write_set_parameters_onlyfixMajorana (hardcoded=False)
 super_write_set_parameters_onlyfixMajorana (hardcoded=True)
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory 
-INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. 
+INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/src/. 
 The option zerowidth_tchannel is modified [True] but will not be written in the configuration files.
 If you want to make this value the default for future session, you can run 'save options --all'
-save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt
+save configuration file to /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt
 INFO: Use Fortran compiler gfortran 
 INFO: Use c++ compiler g++ 
 INFO: Generate web pages 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
 patching file Source/genps.inc
 patching file Source/makefile
 patching file SubProcesses/makefile
 patching file bin/internal/gen_ximprove.py
-Hunk #1 succeeded at 391 (offset 6 lines).
 patching file bin/internal/madevent_interface.py
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses/P1_gg_ttxggg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/SubProcesses/P1_gg_ttxggg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 patching file driver.f
 patching file matrix1.f
@@ -252,17 +252,17 @@ Hunk #2 succeeded at 255 (offset 112 lines).
 Hunk #3 succeeded at 333 (offset 112 lines).
 Hunk #4 succeeded at 361 (offset 112 lines).
 Hunk #5 succeeded at 406 (offset 112 lines).
-[1;32mDEBUG:  p.returncode = [0m 0 [1;30m[output.py at line 238][0m [0m
-Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done.
+[1;32mDEBUG:  p.returncode = [0m 0 [1;30m[output.py at line 241][0m [0m
+Output to directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg done.
 Type "launch" to generate events from this process, or see
-/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README
+/data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/README
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m28.769s
-user	0m28.239s
-sys	0m0.414s
-Code generation completed in 29 seconds
+real	0m30.669s
+user	0m29.221s
+sys	0m0.403s
+Code generation completed in 31 seconds
 ************************************************************
 *                                                          *
 *                      W E L C O M E to                    *
@@ -283,9 +283,9 @@ Code generation completed in 29 seconds
 *               Type 'help' for in-line help.              *
 *                                                          *
 ************************************************************
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt  
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
@@ -313,9 +313,9 @@ launch in debug mode
 *               Type 'help' for in-line help.              *
 *                                                          *
 ************************************************************
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt  
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt
index cdeedc7863..ce678812fe 100644
--- a/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt
+++ b/epochX/cudacpp/gg_ttggg.mad/Cards/me5_configuration.txt
@@ -234,7 +234,7 @@
 # pineappl = pineappl
 
 
-#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo 
+#mg5_path = /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo 
 
 # MG5 MAIN DIRECTORY
-#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo
+#mg5_path = /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo
diff --git a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat
index 2e2a09aef7..c16335faca 100644
--- a/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/gg_ttggg.mad/Cards/proc_card_mg5.dat
@@ -45,5 +45,5 @@ define l+ = e+ mu+
 define l- = e- mu-
 define vl = ve vm vt
 define vl~ = ve~ vm~ vt~
-output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False -\
--vector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=Fa\
+lse --vector_size=32
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/counters.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/counters.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/ompnumthreads.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/ompnumthreads.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/counters.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/counters.cc
new file mode 100644
index 0000000000..3bbdec9387
--- /dev/null
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/counters.cc
@@ -0,0 +1,98 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+
+#include "timer.h"
+#define TIMERTYPE std::chrono::high_resolution_clock
+
+#include <cassert>
+#include <cstdio>
+
+// NB1: The C functions counters_xxx_ in this file are called by Fortran code
+// Hence the trailing "_": 'call counters_end()' links to counters_end_
+// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
+
+// NB2: This file also contains C++ code and is built using g++
+// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
+// See https://www.geeksforgeeks.org/extern-c-in-c
+
+extern "C"
+{
+  // Now: fortran=-1, cudacpp=0
+  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
+  constexpr unsigned int nimplC = 2;
+  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
+  const char* iimplC2TXT( int iimplC )
+  {
+    const int iimplF = iimplC - 1;
+    switch( iimplF )
+    {
+      case -1: return "Fortran"; break;
+      case +0: return "CudaCpp"; break;
+      default: assert( false ); break;
+    }
+  }
+
+  static mgOnGpu::Timer<TIMERTYPE> program_timer;
+  static float program_totaltime = 0;
+  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
+  static float smatrix1_totaltime = 0;
+  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
+  static float smatrix1multi_totaltime[nimplC] = { 0 };
+  static int smatrix1_counter = 0;
+  static int smatrix1multi_counter[nimplC] = { 0 };
+
+  void counters_initialise_()
+  {
+    program_timer.Start();
+    return;
+  }
+
+  void counters_smatrix1_start_()
+  {
+    smatrix1_counter++;
+    smatrix1_timer.Start();
+    return;
+  }
+
+  void counters_smatrix1_stop_()
+  {
+    smatrix1_totaltime += smatrix1_timer.GetDuration();
+    return;
+  }
+
+  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
+  {
+    const unsigned int iimplC = iimplF2C( *iimplF );
+    smatrix1multi_counter[iimplC] += *pnevt;
+    smatrix1multi_timer[iimplC].Start();
+    return;
+  }
+
+  void counters_smatrix1multi_stop_( const int* iimplF )
+  {
+    const unsigned int iimplC = iimplF2C( *iimplF );
+    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
+    return;
+  }
+
+  void counters_finalise_()
+  {
+    program_totaltime += program_timer.GetDuration();
+    // Write to stdout
+    float overhead_totaltime = program_totaltime;
+    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
+    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
+    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
+    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
+      if( smatrix1multi_counter[iimplC] > 0 )
+        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
+                iimplC2TXT( iimplC ),
+                iimplC + 1,
+                smatrix1multi_totaltime[iimplC],
+                smatrix1multi_counter[iimplC],
+                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+    return;
+  }
+}
diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/ompnumthreads.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/ompnumthreads.cc
new file mode 100644
index 0000000000..1d004923b9
--- /dev/null
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/ompnumthreads.cc
@@ -0,0 +1,25 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+
+#include <ompnumthreads.h>
+
+// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
+// Hence the trailing "_": 'call xxx()' links to xxx_
+// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
+
+// NB2: This file also contains C++ code and is built using g++
+// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
+// See https://www.geeksforgeeks.org/extern-c-in-c
+
+#ifdef _OPENMP
+extern "C"
+{
+  void ompnumthreads_not_set_means_one_thread_()
+  {
+    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
+    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
+  }
+}
+#endif
diff --git a/epochX/cudacpp/gg_ttggg.mad/mg5.in b/epochX/cudacpp/gg_ttggg.mad/mg5.in
index f92d17d219..4865da91cd 100644
--- a/epochX/cudacpp/gg_ttggg.mad/mg5.in
+++ b/epochX/cudacpp/gg_ttggg.mad/mg5.in
@@ -1,4 +1,4 @@
 set stdout_level DEBUG
 set zerowidth_tchannel F
 generate g g > t t~ g g g
-output madevent gg_ttggg.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd gg_ttggg.mad --hel_recycling=False --vector_size=32 
diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
index faec804f1b..6d60b544b0 100644
--- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
+++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt
@@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg
+import /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -62,7 +62,7 @@ generate g g > t t~ g g g
 No model currently active, so we import the Standard Model
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.005358695983886719 [0m
+[1;32mDEBUG: model prefixing  takes 0.005746364593505859 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step.
 INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED 
 INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1  
 INFO: Process has 1240 diagrams 
-1 processes with 1240 diagrams generated in 1.872 s
+1 processes with 1240 diagrams generated in 1.934 s
 Total: 1 processes with 1240 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg
 Load PLUGIN.CUDACPP_OUTPUT
@@ -163,29 +163,30 @@ Load PLUGIN.CUDACPP_OUTPUT
 It has been validated for the last time with version: 3.5.2[0m
 [1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1;32mDEBUG:  cformat = [0m plugin [1;30m[export_cpp.py at line 3071][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 161][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 166][0m [0m
-INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg 
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 162][0m [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 167][0m [0m
+INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg 
 INFO: Organizing processes into subprocess groups 
 INFO: Generating Helas calls for process: g g > t t~ g g g WEIGHTED<=5 @1 
 INFO: Processing color information for process: g g > t t~ g g g @1 
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [1;30m[output.py at line 195][0m [0m
-[1;32mDEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [1;30m[output.py at line 196][0m [0m
-[1;32mDEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [1;30m[output.py at line 197][0m [0m
-[1;32mDEBUG:    type(me)=<class 'int'> me=0 [1;30m[output.py at line 198][0m [0m
-INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg 
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc
-INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.517 s
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 203][0m [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [1;30m[output.py at line 196][0m [0m
+[1;32mDEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [1;30m[output.py at line 197][0m [0m
+[1;32mDEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [1;30m[output.py at line 198][0m [0m
+[1;32mDEBUG:    type(me)=<class 'int'> me=0 [1;30m[output.py at line 199][0m [0m
+[1;32mDEBUG:  "need to link", self.to_link_in_P = [0m need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [1;30m[output.py at line 200][0m [0m
+INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc
+INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
+Generated helas calls for 1 subprocesses (1240 diagrams) in 6.709 s
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 205][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV3 routines[0m
 ALOHA: aloha creates VVVV4 routines[0m
-ALOHA: aloha creates 5 routines in  0.346 s
+ALOHA: aloha creates 5 routines in  0.354 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -198,17 +199,17 @@ ALOHA: aloha creates 5 routines in  0.346 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV3
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV4
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV4
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h
-INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./HelAmps_sm.h
+INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
 super_write_set_parameters_onlyfixMajorana (hardcoded=False)
 super_write_set_parameters_onlyfixMajorana (hardcoded=True)
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory 
-INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
+INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. 
 quit
 
-real	0m12.953s
-user	0m12.719s
-sys	0m0.135s
+real	0m13.375s
+user	0m13.203s
+sys	0m0.119s
 Code generation completed in 13 seconds
diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt
index e72f8836f6..7015773962 100644
--- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt
+++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt
@@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg
+import /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -61,7 +61,7 @@ set zerowidth_tchannel F
 define q = u c d s u~ c~ d~ s~
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.005477190017700195 [0m
+[1;32mDEBUG: model prefixing  takes 0.005769014358520508 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -170,23 +170,24 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams.
 INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. 
 INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. 
 INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.077 s
+8 processes with 40 diagrams generated in 0.079 s
 Total: 8 processes with 40 diagrams
-output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32
 Load PLUGIN.CUDACPP_OUTPUT
 [1;34mPlugin PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.5.3_lo_vect. 
 It has been validated for the last time with version: 3.5.2[0m
+[1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1mAddition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
-[1;32mDEBUG:  cformat = [0m standalone_cudacpp [1;30m[export_cpp.py at line 3071][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 161][0m [0m
+[1;32mDEBUG:  cformat = [0m standalone_simd [1;30m[export_cpp.py at line 3071][0m [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 162][0m [0m
 INFO: initialize a new directory: CODEGEN_mad_gq_ttq 
 INFO: remove old information in CODEGEN_mad_gq_ttq 
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 166][0m [0m
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq [0m
-INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq 
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards [0m
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 167][0m [0m
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq [0m
+INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq 
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards [0m
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses [0m
 INFO: Organizing processes into subprocess groups 
 INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 
 INFO: Processing color information for process: g u > t t~ u @1 
@@ -200,7 +201,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~
 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 
 INFO: Creating files in directory P1_gu_ttxu 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7faa05dee760> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f42a1553c40> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -217,7 +218,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1
 INFO: Finding symmetric diagrams for subprocess group gu_ttxu 
 INFO: Creating files in directory P1_gux_ttxux 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7faa05c99c10> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f42a1401c40> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -233,43 +234,42 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 
 INFO: Finding symmetric diagrams for subprocess group gux_ttxux 
 Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s
-Wrote files for 32 helas calls in 0.217 s
+Wrote files for 32 helas calls in 0.223 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVV1 routines[0m
-ALOHA: aloha creates 2 routines in  0.143 s
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 203][0m [0m
+ALOHA: aloha creates 2 routines in  0.147 s
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 205][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVV1 routines[0m
-ALOHA: aloha creates 4 routines in  0.130 s
+ALOHA: aloha creates 4 routines in  0.135 s
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h
-INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./HelAmps_sm.h
+INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. 
 super_write_set_parameters_onlyfixMajorana (hardcoded=False)
 super_write_set_parameters_onlyfixMajorana (hardcoded=True)
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory 
-INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. 
+INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/src/. 
 The option zerowidth_tchannel is modified [True] but will not be written in the configuration files.
 If you want to make this value the default for future session, you can run 'save options --all'
-save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt
+save configuration file to /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt
 INFO: Use Fortran compiler gfortran 
 INFO: Use c++ compiler g++ 
 INFO: Generate web pages 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
 patching file Source/genps.inc
 patching file Source/makefile
 patching file SubProcesses/makefile
 patching file bin/internal/gen_ximprove.py
-Hunk #1 succeeded at 391 (offset 6 lines).
 patching file bin/internal/madevent_interface.py
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses/P1_gu_ttxu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses/P1_gu_ttxu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 Hunk #1 succeeded at 528 (offset 44 lines).
 patching file driver.f
@@ -279,7 +279,7 @@ Hunk #2 succeeded at 162 (offset 19 lines).
 Hunk #3 succeeded at 247 (offset 26 lines).
 Hunk #4 succeeded at 281 (offset 32 lines).
 Hunk #5 succeeded at 326 (offset 32 lines).
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses/P1_gux_ttxux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/SubProcesses/P1_gux_ttxux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 Hunk #1 succeeded at 528 (offset 44 lines).
 patching file driver.f
@@ -289,17 +289,17 @@ Hunk #2 succeeded at 162 (offset 19 lines).
 Hunk #3 succeeded at 247 (offset 26 lines).
 Hunk #4 succeeded at 281 (offset 32 lines).
 Hunk #5 succeeded at 326 (offset 32 lines).
-[1;32mDEBUG:  p.returncode = [0m 0 [1;30m[output.py at line 238][0m [0m
-Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done.
+[1;32mDEBUG:  p.returncode = [0m 0 [1;30m[output.py at line 241][0m [0m
+Output to directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq done.
 Type "launch" to generate events from this process, or see
-/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README
+/data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/README
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m3.073s
-user	0m1.692s
-sys	0m0.237s
-Code generation completed in 3 seconds
+real	0m1.956s
+user	0m1.706s
+sys	0m0.245s
+Code generation completed in 2 seconds
 ************************************************************
 *                                                          *
 *                      W E L C O M E to                    *
@@ -320,9 +320,9 @@ Code generation completed in 3 seconds
 *               Type 'help' for in-line help.              *
 *                                                          *
 ************************************************************
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt  
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
@@ -350,9 +350,9 @@ launch in debug mode
 *               Type 'help' for in-line help.              *
 *                                                          *
 ************************************************************
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt  
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt
index cdeedc7863..ce678812fe 100644
--- a/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt
+++ b/epochX/cudacpp/gq_ttq.mad/Cards/me5_configuration.txt
@@ -234,7 +234,7 @@
 # pineappl = pineappl
 
 
-#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo 
+#mg5_path = /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo 
 
 # MG5 MAIN DIRECTORY
-#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo
+#mg5_path = /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo
diff --git a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat
index 5ab65926b2..deab56cf41 100644
--- a/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/gq_ttq.mad/Cards/proc_card_mg5.dat
@@ -47,5 +47,5 @@ define vl = ve vm vt
 define vl~ = ve~ vm~ vt~
 define q = u c d s u~ c~ d~ s~
 generate g q > t t~ q
-output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --v\
-ector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=Fals\
+e --vector_size=32
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/counters.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/counters.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/ompnumthreads.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/ompnumthreads.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/counters.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/counters.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/ompnumthreads.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/ompnumthreads.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/counters.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/counters.cc
new file mode 100644
index 0000000000..3bbdec9387
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/counters.cc
@@ -0,0 +1,98 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+
+#include "timer.h"
+#define TIMERTYPE std::chrono::high_resolution_clock
+
+#include <cassert>
+#include <cstdio>
+
+// NB1: The C functions counters_xxx_ in this file are called by Fortran code
+// Hence the trailing "_": 'call counters_end()' links to counters_end_
+// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
+
+// NB2: This file also contains C++ code and is built using g++
+// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
+// See https://www.geeksforgeeks.org/extern-c-in-c
+
+extern "C"
+{
+  // Now: fortran=-1, cudacpp=0
+  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
+  constexpr unsigned int nimplC = 2;
+  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
+  const char* iimplC2TXT( int iimplC )
+  {
+    const int iimplF = iimplC - 1;
+    switch( iimplF )
+    {
+      case -1: return "Fortran"; break;
+      case +0: return "CudaCpp"; break;
+      default: assert( false ); break;
+    }
+  }
+
+  static mgOnGpu::Timer<TIMERTYPE> program_timer;
+  static float program_totaltime = 0;
+  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
+  static float smatrix1_totaltime = 0;
+  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
+  static float smatrix1multi_totaltime[nimplC] = { 0 };
+  static int smatrix1_counter = 0;
+  static int smatrix1multi_counter[nimplC] = { 0 };
+
+  void counters_initialise_()
+  {
+    program_timer.Start();
+    return;
+  }
+
+  void counters_smatrix1_start_()
+  {
+    smatrix1_counter++;
+    smatrix1_timer.Start();
+    return;
+  }
+
+  void counters_smatrix1_stop_()
+  {
+    smatrix1_totaltime += smatrix1_timer.GetDuration();
+    return;
+  }
+
+  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
+  {
+    const unsigned int iimplC = iimplF2C( *iimplF );
+    smatrix1multi_counter[iimplC] += *pnevt;
+    smatrix1multi_timer[iimplC].Start();
+    return;
+  }
+
+  void counters_smatrix1multi_stop_( const int* iimplF )
+  {
+    const unsigned int iimplC = iimplF2C( *iimplF );
+    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
+    return;
+  }
+
+  void counters_finalise_()
+  {
+    program_totaltime += program_timer.GetDuration();
+    // Write to stdout
+    float overhead_totaltime = program_totaltime;
+    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
+    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
+    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
+    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
+      if( smatrix1multi_counter[iimplC] > 0 )
+        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
+                iimplC2TXT( iimplC ),
+                iimplC + 1,
+                smatrix1multi_totaltime[iimplC],
+                smatrix1multi_counter[iimplC],
+                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
+    return;
+  }
+}
diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/ompnumthreads.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/ompnumthreads.cc
new file mode 100644
index 0000000000..1d004923b9
--- /dev/null
+++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/ompnumthreads.cc
@@ -0,0 +1,25 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+
+#include <ompnumthreads.h>
+
+// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
+// Hence the trailing "_": 'call xxx()' links to xxx_
+// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
+
+// NB2: This file also contains C++ code and is built using g++
+// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
+// See https://www.geeksforgeeks.org/extern-c-in-c
+
+#ifdef _OPENMP
+extern "C"
+{
+  void ompnumthreads_not_set_means_one_thread_()
+  {
+    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
+    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
+  }
+}
+#endif
diff --git a/epochX/cudacpp/gq_ttq.mad/mg5.in b/epochX/cudacpp/gq_ttq.mad/mg5.in
index 2273ae9cfd..f02829a969 100644
--- a/epochX/cudacpp/gq_ttq.mad/mg5.in
+++ b/epochX/cudacpp/gq_ttq.mad/mg5.in
@@ -2,4 +2,4 @@ set stdout_level DEBUG
 set zerowidth_tchannel F
 define q = u c d s u~ c~ d~ s~
 generate g q > t t~ q
-output madevent gq_ttq.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd gq_ttq.mad --hel_recycling=False --vector_size=32 
diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
index 3957e3a7d6..8ed0b3a1c7 100644
--- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
+++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt
@@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg
+import /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -61,7 +61,7 @@ set zerowidth_tchannel F
 define q = u c d s u~ c~ d~ s~
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.00551295280456543 [0m
+[1;32mDEBUG: model prefixing  takes 0.005808115005493164 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams.
 INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. 
 INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. 
 INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. 
-8 processes with 40 diagrams generated in 0.077 s
+8 processes with 40 diagrams generated in 0.084 s
 Total: 8 processes with 40 diagrams
 output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq
 Load PLUGIN.CUDACPP_OUTPUT
@@ -178,9 +178,9 @@ Load PLUGIN.CUDACPP_OUTPUT
 It has been validated for the last time with version: 3.5.2[0m
 [1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1;32mDEBUG:  cformat = [0m plugin [1;30m[export_cpp.py at line 3071][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 161][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 166][0m [0m
-INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq 
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 162][0m [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 167][0m [0m
+INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq 
 INFO: Organizing processes into subprocess groups 
 INFO: Generating Helas calls for process: g u > t t~ u WEIGHTED<=3 @1 
 INFO: Processing color information for process: g u > t t~ u @1 
@@ -192,44 +192,46 @@ INFO: Processing color information for process: g u~ > t t~ u~ @1
 INFO: Combined process g c~ > t t~ c~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 
 INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 
 INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [1;30m[output.py at line 195][0m [0m
-[1;32mDEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [1;30m[output.py at line 196][0m [0m
-[1;32mDEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [1;30m[output.py at line 197][0m [0m
-[1;32mDEBUG:    type(me)=<class 'int'> me=0 [1;30m[output.py at line 198][0m [0m
-INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu 
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc
-INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [1;30m[output.py at line 195][0m [0m
-[1;32mDEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [1;30m[output.py at line 196][0m [0m
-[1;32mDEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [1;30m[output.py at line 197][0m [0m
-[1;32mDEBUG:    type(me)=<class 'int'> me=1 [1;30m[output.py at line 198][0m [0m
-INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux 
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc
-INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. 
-Generated helas calls for 2 subprocesses (10 diagrams) in 0.030 s
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 203][0m [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [1;30m[output.py at line 196][0m [0m
+[1;32mDEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [1;30m[output.py at line 197][0m [0m
+[1;32mDEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [1;30m[output.py at line 198][0m [0m
+[1;32mDEBUG:    type(me)=<class 'int'> me=0 [1;30m[output.py at line 199][0m [0m
+[1;32mDEBUG:  "need to link", self.to_link_in_P = [0m need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [1;30m[output.py at line 200][0m [0m
+INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/./CPPProcess.cc
+INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gu_ttxu/. 
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [1;30m[output.py at line 196][0m [0m
+[1;32mDEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [1;30m[output.py at line 197][0m [0m
+[1;32mDEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [1;30m[output.py at line 198][0m [0m
+[1;32mDEBUG:    type(me)=<class 'int'> me=1 [1;30m[output.py at line 199][0m [0m
+[1;32mDEBUG:  "need to link", self.to_link_in_P = [0m need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [1;30m[output.py at line 200][0m [0m
+INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/./CPPProcess.cc
+INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/SubProcesses/P1_Sigma_sm_gux_ttxux/. 
+Generated helas calls for 2 subprocesses (10 diagrams) in 0.032 s
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 205][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVV1 routines[0m
-ALOHA: aloha creates 2 routines in  0.143 s
+ALOHA: aloha creates 2 routines in  0.203 s
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h
-INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./HelAmps_sm.h
+INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
 super_write_set_parameters_onlyfixMajorana (hardcoded=False)
 super_write_set_parameters_onlyfixMajorana (hardcoded=True)
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory 
-INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
+INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
 quit
 
-real	0m1.009s
-user	0m0.580s
-sys	0m0.067s
+real	0m1.010s
+user	0m0.617s
+sys	0m0.055s
 Code generation completed in 1 seconds
diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt
index 9d97c918db..ef394b2a87 100644
--- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt
+++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt
@@ -53,15 +53,21 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h.mg
+import /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
 set zerowidth_tchannel F
 set auto_convert_model T
 save options auto_convert_model
-save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt
+save configuration file to /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo/input/mg5_configuration.txt
 import model heft
+INFO: reload from .py file 
+INFO: load particles 
+INFO: load vertices 
+[1;34mWARNING: coupling GC_13=-(complex(0,1)*GH) has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model. [0m
+[1;34mWARNING: coupling GC_16=(complex(0,1)*Gphi)/8. has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model. [0m
+[1;32mDEBUG: model prefixing  takes 0.005836963653564453 [0m
 INFO: Restrict model heft with file models/heft/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: s u w+ at order: QED=1 [0m
@@ -137,37 +143,38 @@ Load PLUGIN.CUDACPP_OUTPUT
 It has been validated for the last time with version: 3.5.2[0m
 [1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1;32mDEBUG:  cformat = [0m plugin [1;30m[export_cpp.py at line 3071][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 161][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 166][0m [0m
-INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h 
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 162][0m [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 167][0m [0m
+INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h 
 INFO: Organizing processes into subprocess groups 
 INFO: Generating Helas calls for process: g g > h HIG<=1 HIW<=1 WEIGHTED<=2 @1 
 INFO: Processing color information for process: g g > h HIG<=1 HIW<=1 @1 
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [1;30m[output.py at line 195][0m [0m
-[1;32mDEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [1;30m[output.py at line 196][0m [0m
-[1;32mDEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [1;30m[output.py at line 197][0m [0m
-[1;32mDEBUG:    type(me)=<class 'int'> me=0 [1;30m[output.py at line 198][0m [0m
-INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h 
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.cc
-INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/. 
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.generate_subprocess_directory (create the directory) [1;30m[output.py at line 196][0m [0m
+[1;32mDEBUG:    type(subproc_group)=<class 'madgraph.core.helas_objects.HelasMatrixElement'> [1;30m[output.py at line 197][0m [0m
+[1;32mDEBUG:    type(fortran_model)=<class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_GPUFOHelasCallWriter'> [1;30m[output.py at line 198][0m [0m
+[1;32mDEBUG:    type(me)=<class 'int'> me=0 [1;30m[output.py at line 199][0m [0m
+[1;32mDEBUG:  "need to link", self.to_link_in_P = [0m need to link ['nvtx.h', 'timer.h', 'timermap.h', 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', 'MemoryAccessAmplitudes.h', 'MemoryAccessWavefunctions.h', 'MemoryAccessGs.h', 'MemoryAccessCouplingsFixed.h', 'MemoryAccessNumerators.h', 'MemoryAccessDenominators.h', 'EventStatistics.h', 'CommonRandomNumbers.h', 'CrossSectionKernels.cc', 'CrossSectionKernels.h', 'MatrixElementKernels.cc', 'MatrixElementKernels.h', 'RamboSamplingKernels.cc', 'RamboSamplingKernels.h', 'RandomNumberKernels.h', 'CommonRandomNumberKernel.cc', 'CurandRandomNumberKernel.cc', 'HiprandRandomNumberKernel.cc', 'Bridge.h', 'BridgeKernels.cc', 'BridgeKernels.h', 'fbridge.cc', 'fbridge.inc', 'fsampler.cc', 'fsampler.inc', 'MadgraphTest.h', 'runTest.cc', 'testmisc.cc', 'testxxx_cc_ref.txt', 'cudacpp.mk', 'testxxx.cc', 'MemoryBuffers.h', 'MemoryAccessCouplings.h', 'perf.py', 'profile.sh'] [1;30m[output.py at line 200][0m [0m
+INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/./CPPProcess.cc
+INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/SubProcesses/P1_Sigma_heft_gg_h/. 
 Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 203][0m [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 205][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVS3 routines[0m
-ALOHA: aloha creates 1 routines in  0.061 s
+ALOHA: aloha creates 1 routines in  0.063 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVS3
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./HelAmps_heft.h
-INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./HelAmps_heft.h
+INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. 
 super_write_set_parameters_onlyfixMajorana (hardcoded=False)
 super_write_set_parameters_onlyfixMajorana (hardcoded=True)
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./Parameters_heft.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./Parameters_heft.cc
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./Parameters_heft.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./Parameters_heft.cc
 INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory 
-INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. 
+INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. 
 quit
 
-real	0m0.423s
-user	0m0.361s
-sys	0m0.053s
+real	0m0.451s
+user	0m0.390s
+sys	0m0.052s
 Code generation completed in 0 seconds
diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
index 4b983ad8d3..fa869aa432 100644
--- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
+++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt
@@ -53,7 +53,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
-import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg
+import /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg
 The import format was not given, so we guess it as command
 set stdout_level DEBUG
 set output information to level: 10
@@ -61,7 +61,7 @@ set zerowidth_tchannel F
 define j = p
 INFO: load particles 
 INFO: load vertices 
-[1;32mDEBUG: model prefixing  takes 0.005466461181640625 [0m
+[1;32mDEBUG: model prefixing  takes 0.00567317008972168 [0m
 INFO: Restrict model sm with file models/sm/restrict_default.dat . 
 [1;32mDEBUG: Simplifying conditional expressions [0m
 [1;32mDEBUG: remove interactions: u s w+ at order: QED=1 [0m
@@ -172,7 +172,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~
 INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ 
 INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ 
 INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ 
-5 processes with 7 diagrams generated in 0.029 s
+5 processes with 7 diagrams generated in 0.030 s
 Total: 5 processes with 7 diagrams
 add process p p > t t~ j @1
 INFO: Checking for minimal orders which gives processes. 
@@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~
 INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g 
 INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ 
 INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g 
-13 processes with 76 diagrams generated in 0.135 s
+13 processes with 76 diagrams generated in 0.141 s
 Total: 18 processes with 83 diagrams
 add process p p > t t~ j j @2
 INFO: Checking for minimal orders which gives processes. 
@@ -378,23 +378,24 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~
 INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ 
 INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ 
 INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. 
-65 processes with 1119 diagrams generated in 1.826 s
+65 processes with 1119 diagrams generated in 1.880 s
 Total: 83 processes with 1202 diagrams
-output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32
 Load PLUGIN.CUDACPP_OUTPUT
 [1;34mPlugin PLUGIN.CUDACPP_OUTPUT has marked as NOT being validated with this version: 3.5.3_lo_vect. 
 It has been validated for the last time with version: 3.5.2[0m
+[1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1mAddition matrix-element will be done with PLUGIN: CUDACPP_OUTPUT[0m
 [1mOutput will be done with PLUGIN: CUDACPP_OUTPUT[0m
-[1;32mDEBUG:  cformat = [0m standalone_cudacpp [1;30m[export_cpp.py at line 3071][0m [0m
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 161][0m [0m
+[1;32mDEBUG:  cformat = [0m standalone_simd [1;30m[export_cpp.py at line 3071][0m [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.__init__ (initialise the exporter) [1;30m[output.py at line 162][0m [0m
 INFO: initialize a new directory: CODEGEN_mad_pp_tt012j 
 INFO: remove old information in CODEGEN_mad_pp_tt012j 
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 166][0m [0m
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j [0m
-INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j 
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards [0m
-[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses [0m
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.copy_template (initialise the directory) [1;30m[output.py at line 167][0m [0m
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j [0m
+INFO: Creating subdirectories in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j 
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards [0m
+[1;34mWARNING: File exists /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses [0m
 INFO: Organizing processes into subprocess groups 
 INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @2 
 INFO: Processing color information for process: g g > t t~ g g @2 
@@ -499,7 +500,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED
 INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 
 INFO: Creating files in directory P2_gg_ttxgg 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd9ab2f9c40> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f36e05708e0> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -516,7 +517,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg 
 INFO: Creating files in directory P2_gg_ttxuux 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd9aba0e100> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f36e0c7e130> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -533,7 +534,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux 
 INFO: Creating files in directory P2_gu_ttxgu 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd9aba0e100> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f36e06f6e80> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -550,7 +551,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu 
 INFO: Creating files in directory P2_gux_ttxgux 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd9aba0e130> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f36e0c87130> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -567,7 +568,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux 
 INFO: Creating files in directory P2_uux_ttxgg 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd9aba0e100> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f36e0c87130> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -584,7 +585,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg 
 INFO: Creating files in directory P1_gg_ttxg 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd9ac2f50a0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f36e0c26d00> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -601,7 +602,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1
 INFO: Finding symmetric diagrams for subprocess group gg_ttxg 
 INFO: Creating files in directory P2_uu_ttxuu 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd9ac2f50a0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f36e054f940> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -618,7 +619,7 @@ INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu 
 INFO: Creating files in directory P2_uux_ttxuux 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd9ac2f50a0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f36e054f940> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -635,7 +636,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux 
 INFO: Creating files in directory P2_uxux_ttxuxux 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd9ac2f50a0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f36e10acf40> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -652,7 +653,7 @@ INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux 
 INFO: Creating files in directory P2_uc_ttxuc 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd9ab9fc490> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f36e0c87130> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -669,7 +670,7 @@ INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc 
 INFO: Creating files in directory P2_uux_ttxccx 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd9ab9fc490> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f36e054ba00> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -686,7 +687,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx 
 INFO: Creating files in directory P2_ucx_ttxucx 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd9ab9fc490> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f36e10acf40> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -703,7 +704,7 @@ INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx 
 INFO: Creating files in directory P2_uxcx_ttxuxcx 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd9ab64bf40> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f36e0c87130> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -720,7 +721,7 @@ INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2
 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx 
 INFO: Creating files in directory P1_gu_ttxu 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd9ab9fc490> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f36e0c87130> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -737,7 +738,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1
 INFO: Finding symmetric diagrams for subprocess group gu_ttxu 
 INFO: Creating files in directory P1_gux_ttxux 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd9ab64b1c0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f36e0866070> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -754,7 +755,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1
 INFO: Finding symmetric diagrams for subprocess group gux_ttxux 
 INFO: Creating files in directory P1_uux_ttxg 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd9ab6022e0> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f36e0c7e100> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -771,7 +772,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1
 INFO: Finding symmetric diagrams for subprocess group uux_ttxg 
 INFO: Creating files in directory P0_gg_ttx 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd9ab5b6d00> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f36e0c7e130> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -788,7 +789,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2
 INFO: Finding symmetric diagrams for subprocess group gg_ttx 
 INFO: Creating files in directory P0_uux_ttx 
 [1;32mDEBUG:  kwargs[prefix] = 0 [1;30m[model_handling.py at line 1057][0m [0m
-[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7fd9ab2fca00> [1;30m[export_v4.py at line 6261][0m [0m
+[1;32mDEBUG:  process_exporter_cpp = [0m <PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_OneProcessExporter object at 0x7f36e08b9e20> [1;30m[export_v4.py at line 6261][0m [0m
 INFO: Creating files in directory . 
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.h
 FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for ././CPPProcess.cc
@@ -803,23 +804,23 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./.
 [1;32mDEBUG:  vector, subproc_group,self.opt['vector_size'] = [0m 32 True 32 [1;30m[export_v4.py at line 1871][0m [0m
 INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 
 INFO: Finding symmetric diagrams for subprocess group uux_ttx 
-Generated helas calls for 18 subprocesses (372 diagrams) in 1.280 s
-Wrote files for 810 helas calls in 3.230 s
+Generated helas calls for 18 subprocesses (372 diagrams) in 1.306 s
+Wrote files for 810 helas calls in 3.324 s
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV3 routines[0m
 ALOHA: aloha creates VVVV4 routines[0m
-ALOHA: aloha creates 5 routines in  0.333 s
-[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 203][0m [0m
+ALOHA: aloha creates 5 routines in  0.340 s
+[1;32mDEBUG:  Entering PLUGIN_ProcessExporter.convert_model (create the model) [1;30m[output.py at line 205][0m [0m
 ALOHA: aloha starts to compute helicity amplitudes
 ALOHA: aloha creates VVV1 routines[0m
 ALOHA: aloha creates FFV1 routines[0m
 ALOHA: aloha creates VVVV1 routines[0m
 ALOHA: aloha creates VVVV3 routines[0m
 ALOHA: aloha creates VVVV4 routines[0m
-ALOHA: aloha creates 10 routines in  0.309 s
+ALOHA: aloha creates 10 routines in  0.317 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> VVV1
 <class 'aloha.create_aloha.AbstractRoutine'> FFV1
@@ -832,32 +833,31 @@ ALOHA: aloha creates 10 routines in  0.309 s
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV3
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV4
 <class 'aloha.create_aloha.AbstractRoutine'> VVVV4
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h
-INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. 
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./HelAmps_sm.h
+INFO: Created file HelAmps_sm.h in directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. 
 super_write_set_parameters_onlyfixMajorana (hardcoded=False)
 super_write_set_parameters_onlyfixMajorana (hardcoded=True)
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h
-FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.h
+FileWriter <class 'PLUGIN.CUDACPP_OUTPUT.model_handling.PLUGIN_CPPWriter'> for /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/./Parameters_sm.cc
 INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory 
-INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. 
+INFO: /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. and /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/src/. 
 The option zerowidth_tchannel is modified [True] but will not be written in the configuration files.
 If you want to make this value the default for future session, you can run 'save options --all'
-save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt
+save configuration file to /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt
 INFO: Use Fortran compiler gfortran 
 INFO: Use c++ compiler g++ 
 INFO: Generate web pages 
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j; patch -p4 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.common
 patching file Source/genps.inc
 patching file Source/makefile
 patching file SubProcesses/makefile
 patching file bin/internal/gen_ximprove.py
-Hunk #1 succeeded at 391 (offset 6 lines).
 patching file bin/internal/madevent_interface.py
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P0_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P0_gg_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 patching file driver.f
 patching file matrix1.f
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P0_uux_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P0_uux_ttx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 Hunk #1 succeeded at 539 (offset 55 lines).
 patching file driver.f
@@ -867,7 +867,7 @@ Hunk #2 succeeded at 146 (offset 3 lines).
 Hunk #3 succeeded at 224 (offset 3 lines).
 Hunk #4 succeeded at 252 (offset 3 lines).
 Hunk #5 succeeded at 297 (offset 3 lines).
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gg_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gg_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 patching file driver.f
 patching file matrix1.f
@@ -875,7 +875,7 @@ Hunk #2 succeeded at 159 (offset 16 lines).
 Hunk #3 succeeded at 237 (offset 16 lines).
 Hunk #4 succeeded at 265 (offset 16 lines).
 Hunk #5 succeeded at 310 (offset 16 lines).
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gu_ttxu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gu_ttxu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 Hunk #1 succeeded at 528 (offset 44 lines).
 patching file driver.f
@@ -885,7 +885,7 @@ Hunk #2 succeeded at 162 (offset 19 lines).
 Hunk #3 succeeded at 240 (offset 19 lines).
 Hunk #4 succeeded at 268 (offset 19 lines).
 Hunk #5 succeeded at 313 (offset 19 lines).
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gux_ttxux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_gux_ttxux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 Hunk #1 succeeded at 528 (offset 44 lines).
 patching file driver.f
@@ -895,7 +895,7 @@ Hunk #2 succeeded at 162 (offset 19 lines).
 Hunk #3 succeeded at 240 (offset 19 lines).
 Hunk #4 succeeded at 268 (offset 19 lines).
 Hunk #5 succeeded at 313 (offset 19 lines).
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_uux_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P1_uux_ttxg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 Hunk #1 succeeded at 539 (offset 55 lines).
 patching file driver.f
@@ -905,7 +905,7 @@ Hunk #2 succeeded at 162 (offset 19 lines).
 Hunk #3 succeeded at 240 (offset 19 lines).
 Hunk #4 succeeded at 268 (offset 19 lines).
 Hunk #5 succeeded at 313 (offset 19 lines).
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gg_ttxgg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gg_ttxgg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 patching file driver.f
 patching file matrix1.f
@@ -913,7 +913,7 @@ Hunk #2 succeeded at 191 (offset 48 lines).
 Hunk #3 succeeded at 269 (offset 48 lines).
 Hunk #4 succeeded at 297 (offset 48 lines).
 Hunk #5 succeeded at 342 (offset 48 lines).
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gg_ttxuux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gg_ttxuux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 Hunk #1 succeeded at 517 (offset 33 lines).
 patching file driver.f
@@ -923,7 +923,7 @@ Hunk #2 succeeded at 194 (offset 51 lines).
 Hunk #3 succeeded at 272 (offset 51 lines).
 Hunk #4 succeeded at 300 (offset 51 lines).
 Hunk #5 succeeded at 345 (offset 51 lines).
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gu_ttxgu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gu_ttxgu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 Hunk #1 succeeded at 528 (offset 44 lines).
 patching file driver.f
@@ -933,7 +933,7 @@ Hunk #2 succeeded at 194 (offset 51 lines).
 Hunk #3 succeeded at 272 (offset 51 lines).
 Hunk #4 succeeded at 300 (offset 51 lines).
 Hunk #5 succeeded at 345 (offset 51 lines).
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gux_ttxgux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_gux_ttxgux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 Hunk #1 succeeded at 528 (offset 44 lines).
 patching file driver.f
@@ -943,7 +943,7 @@ Hunk #2 succeeded at 194 (offset 51 lines).
 Hunk #3 succeeded at 272 (offset 51 lines).
 Hunk #4 succeeded at 300 (offset 51 lines).
 Hunk #5 succeeded at 345 (offset 51 lines).
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uc_ttxuc; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uc_ttxuc; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 Hunk #1 succeeded at 555 (offset 71 lines).
 patching file driver.f
@@ -953,7 +953,7 @@ Hunk #2 succeeded at 196 (offset 53 lines).
 Hunk #3 succeeded at 274 (offset 53 lines).
 Hunk #4 succeeded at 302 (offset 53 lines).
 Hunk #5 succeeded at 347 (offset 53 lines).
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_ucx_ttxucx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_ucx_ttxucx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 Hunk #1 succeeded at 627 (offset 143 lines).
 patching file driver.f
@@ -963,7 +963,7 @@ Hunk #2 succeeded at 202 (offset 59 lines).
 Hunk #3 succeeded at 280 (offset 59 lines).
 Hunk #4 succeeded at 308 (offset 59 lines).
 Hunk #5 succeeded at 353 (offset 59 lines).
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uu_ttxuu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uu_ttxuu; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 Hunk #1 succeeded at 539 (offset 55 lines).
 patching file driver.f
@@ -973,7 +973,7 @@ Hunk #2 succeeded at 194 (offset 51 lines).
 Hunk #3 succeeded at 272 (offset 51 lines).
 Hunk #4 succeeded at 300 (offset 51 lines).
 Hunk #5 succeeded at 345 (offset 51 lines).
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxccx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxccx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 Hunk #1 succeeded at 627 (offset 143 lines).
 patching file driver.f
@@ -983,7 +983,7 @@ Hunk #2 succeeded at 202 (offset 59 lines).
 Hunk #3 succeeded at 280 (offset 59 lines).
 Hunk #4 succeeded at 308 (offset 59 lines).
 Hunk #5 succeeded at 353 (offset 59 lines).
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxgg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxgg; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 Hunk #1 succeeded at 539 (offset 55 lines).
 patching file driver.f
@@ -993,7 +993,7 @@ Hunk #2 succeeded at 194 (offset 51 lines).
 Hunk #3 succeeded at 272 (offset 51 lines).
 Hunk #4 succeeded at 300 (offset 51 lines).
 Hunk #5 succeeded at 345 (offset 51 lines).
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxuux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uux_ttxuux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 Hunk #1 succeeded at 539 (offset 55 lines).
 patching file driver.f
@@ -1003,7 +1003,7 @@ Hunk #2 succeeded at 194 (offset 51 lines).
 Hunk #3 succeeded at 272 (offset 51 lines).
 Hunk #4 succeeded at 300 (offset 51 lines).
 Hunk #5 succeeded at 345 (offset 51 lines).
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uxcx_ttxuxcx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uxcx_ttxuxcx; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 Hunk #1 succeeded at 555 (offset 71 lines).
 patching file driver.f
@@ -1013,7 +1013,7 @@ Hunk #2 succeeded at 196 (offset 53 lines).
 Hunk #3 succeeded at 274 (offset 53 lines).
 Hunk #4 succeeded at 302 (offset 53 lines).
 Hunk #5 succeeded at 347 (offset 53 lines).
-DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uxux_ttxuxux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
+DEBUG: cd /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/SubProcesses/P2_uxux_ttxuxux; patch -p6 -i /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/MG5aMC_patches/PROD/patch.P1
 patching file auto_dsig1.f
 Hunk #1 succeeded at 539 (offset 55 lines).
 patching file driver.f
@@ -1023,17 +1023,17 @@ Hunk #2 succeeded at 194 (offset 51 lines).
 Hunk #3 succeeded at 272 (offset 51 lines).
 Hunk #4 succeeded at 300 (offset 51 lines).
 Hunk #5 succeeded at 345 (offset 51 lines).
-[1;32mDEBUG:  p.returncode = [0m 0 [1;30m[output.py at line 238][0m [0m
-Output to directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done.
+[1;32mDEBUG:  p.returncode = [0m 0 [1;30m[output.py at line 241][0m [0m
+Output to directory /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j done.
 Type "launch" to generate events from this process, or see
-/data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README
+/data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/README
 Run "open index.html" to see more information about this process.
 quit
 
-real	0m8.828s
-user	0m8.276s
-sys	0m0.507s
-Code generation completed in 9 seconds
+real	0m10.739s
+user	0m8.492s
+sys	0m0.454s
+Code generation completed in 11 seconds
 ************************************************************
 *                                                          *
 *                      W E L C O M E to                    *
@@ -1054,9 +1054,9 @@ Code generation completed in 9 seconds
 *               Type 'help' for in-line help.              *
 *                                                          *
 ************************************************************
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt  
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
@@ -1084,9 +1084,9 @@ launch in debug mode
 *               Type 'help' for in-line help.              *
 *                                                          *
 ************************************************************
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
-INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo/input/mg5_configuration.txt  
+INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt  
 Using default text editor "vi". Set another one in ./input/mg5_configuration.txt
 Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt
 Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt
diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt
index cdeedc7863..ce678812fe 100644
--- a/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt
+++ b/epochX/cudacpp/pp_tt012j.mad/Cards/me5_configuration.txt
@@ -234,7 +234,7 @@
 # pineappl = pineappl
 
 
-#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo 
+#mg5_path = /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo 
 
 # MG5 MAIN DIRECTORY
-#mg5_path = /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo
+#mg5_path = /data/avalassi/GPU2023/madgraph4gpuBis/MG5aMC/mg5amcnlo
diff --git a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat
index 8af6ddf9c1..2f22f719bc 100644
--- a/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat
+++ b/epochX/cudacpp/pp_tt012j.mad/Cards/proc_card_mg5.dat
@@ -49,5 +49,5 @@ define j = p
 generate p p > t t~ @0
 add process p p > t t~ j @1
 add process p p > t t~ j j @2
-output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False \
---vector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=F\
+alse --vector_size=32
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/counters.cc
deleted file mode 100644
index 3bbdec9387..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/counters.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include "timer.h"
-#define TIMERTYPE std::chrono::high_resolution_clock
-
-#include <cassert>
-#include <cstdio>
-
-// NB1: The C functions counters_xxx_ in this file are called by Fortran code
-// Hence the trailing "_": 'call counters_end()' links to counters_end_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-extern "C"
-{
-  // Now: fortran=-1, cudacpp=0
-  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
-  constexpr unsigned int nimplC = 2;
-  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
-  const char* iimplC2TXT( int iimplC )
-  {
-    const int iimplF = iimplC - 1;
-    switch( iimplF )
-    {
-      case -1: return "Fortran"; break;
-      case +0: return "CudaCpp"; break;
-      default: assert( false ); break;
-    }
-  }
-
-  static mgOnGpu::Timer<TIMERTYPE> program_timer;
-  static float program_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer;
-  static float smatrix1_totaltime = 0;
-  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC];
-  static float smatrix1multi_totaltime[nimplC] = { 0 };
-  static int smatrix1_counter = 0;
-  static int smatrix1multi_counter[nimplC] = { 0 };
-
-  void counters_initialise_()
-  {
-    program_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_start_()
-  {
-    smatrix1_counter++;
-    smatrix1_timer.Start();
-    return;
-  }
-
-  void counters_smatrix1_stop_()
-  {
-    smatrix1_totaltime += smatrix1_timer.GetDuration();
-    return;
-  }
-
-  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_counter[iimplC] += *pnevt;
-    smatrix1multi_timer[iimplC].Start();
-    return;
-  }
-
-  void counters_smatrix1multi_stop_( const int* iimplF )
-  {
-    const unsigned int iimplC = iimplF2C( *iimplF );
-    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration();
-    return;
-  }
-
-  void counters_finalise_()
-  {
-    program_totaltime += program_timer.GetDuration();
-    // Write to stdout
-    float overhead_totaltime = program_totaltime;
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
-    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
-    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
-    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
-      if( smatrix1multi_counter[iimplC] > 0 )
-        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
-                iimplC2TXT( iimplC ),
-                iimplC + 1,
-                smatrix1multi_totaltime[iimplC],
-                smatrix1multi_counter[iimplC],
-                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] );
-    return;
-  }
-}
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/counters.cc
new file mode 120000
index 0000000000..06e29b46f9
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/counters.cc
@@ -0,0 +1 @@
+../counters.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/ompnumthreads.cc
deleted file mode 100644
index 1d004923b9..0000000000
--- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/ompnumthreads.cc
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright (C) 2020-2023 CERN and UCLouvain.
-// Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
-
-#include <ompnumthreads.h>
-
-// NB1: The C function ompnumthreadsNotSetMeansOneThread_ is called by Fortran code
-// Hence the trailing "_": 'call xxx()' links to xxx_
-// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
-
-// NB2: This file also contains C++ code and is built using g++
-// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
-// See https://www.geeksforgeeks.org/extern-c-in-c
-
-#ifdef _OPENMP
-extern "C"
-{
-  void ompnumthreads_not_set_means_one_thread_()
-  {
-    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
-    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
-  }
-}
-#endif
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/ompnumthreads.cc
new file mode 120000
index 0000000000..645dc78215
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/ompnumthreads.cc
@@ -0,0 +1 @@
+../ompnumthreads.cc
\ No newline at end of file
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/counters.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/counters.cc
new file mode 100644
index 0000000000..3bbdec9387
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/counters.cc
@@ -0,0 +1,98 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+
+#include "timer.h"
+#define TIMERTYPE std::chrono::high_resolution_clock
+
+#include <cassert>
+#include <cstdio>
+
+// NB1: The C functions counters_xxx_ in this file are called by Fortran code
+// Hence the trailing "_": 'call counters_finalise()' links to counters_finalise_
+// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
+
+// NB2: This file also contains C++ code and is built using g++
+// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
+// See https://www.geeksforgeeks.org/extern-c-in-c
+
+extern "C"
+{
+  // Now: fortran=-1, cudacpp=0
+  // Eventually: fortran=-1, cuda=0, cpp/none=1, cpp/sse4=2, etc...
+  constexpr unsigned int nimplC = 2;
+  constexpr unsigned int iimplF2C( int iimplF ) { return iimplF + 1; }
+  // Return the printable name for C-side index iimplC (Fortran-side index + 1): 0 is "Fortran", 1 is "CudaCpp".
+  const char* iimplC2TXT( int iimplC )
+  {
+    switch( iimplC - 1 ) // switch on the Fortran-side index (fortran=-1, cudacpp=0)
+    {
+      case -1: return "Fortran";
+      case +0: return "CudaCpp";
+      default: assert( false ); return "Unknown"; // still return a valid string when NDEBUG compiles the assert out
+    }
+  }
+
+  static mgOnGpu::Timer<TIMERTYPE> program_timer; // started in counters_initialise_, read in counters_finalise_
+  static float program_totaltime = 0; // printed with an "s" suffix by counters_finalise_ (presumably seconds - confirm against timer.h)
+  static mgOnGpu::Timer<TIMERTYPE> smatrix1_timer; // started in counters_smatrix1_start_, read in counters_smatrix1_stop_
+  static float smatrix1_totaltime = 0; // accumulated in counters_smatrix1_stop_; not printed by counters_finalise_
+  static mgOnGpu::Timer<TIMERTYPE> smatrix1multi_timer[nimplC]; // one timer per implementation, indexed by iimplC
+  static float smatrix1multi_totaltime[nimplC] = { 0 }; // per-implementation accumulated duration, indexed by iimplC
+  static int smatrix1_counter = 0; // incremented in counters_smatrix1_start_; not printed by counters_finalise_
+  static int smatrix1multi_counter[nimplC] = { 0 }; // per-implementation sum of *pnevt, indexed by iimplC
+
+  void counters_initialise_()
+  {
+    program_timer.Start(); // start the program-wide timer whose duration counters_finalise_ reports
+    return;
+  }
+
+  void counters_smatrix1_start_()
+  {
+    smatrix1_counter++;     // count this call
+    smatrix1_timer.Start(); // start the timer that counters_smatrix1_stop_ reads
+    return;
+  }
+
+  void counters_smatrix1_stop_()
+  {
+    smatrix1_totaltime += smatrix1_timer.GetDuration(); // accumulate the duration reported by the timer started in counters_smatrix1_start_
+    return;
+  }
+
+  void counters_smatrix1multi_start_( const int* iimplF, const int* pnevt )
+  {
+    const unsigned int iimplC = iimplF2C( *iimplF ); // Fortran-side code (fortran=-1, cudacpp=0) to array index
+    assert( iimplC < nimplC );                       // an unknown code must not index past the fixed-size arrays
+    smatrix1multi_counter[iimplC] += *pnevt;         // accumulate the number of events passed by the caller
+    smatrix1multi_timer[iimplC].Start();             // start the timer of this implementation
+  }
+
+  void counters_smatrix1multi_stop_( const int* iimplF )
+  {
+    const unsigned int iimplC = iimplF2C( *iimplF ); // Fortran-side code (fortran=-1, cudacpp=0) to array index
+    assert( iimplC < nimplC );                       // an unknown code must not index past the fixed-size arrays
+    smatrix1multi_totaltime[iimplC] += smatrix1multi_timer[iimplC].GetDuration(); // add this timer's reported duration to the running total
+  }
+
+  void counters_finalise_()
+  {
+    program_totaltime += program_timer.GetDuration();
+    // Write to stdout: program total, overhead (total minus every per-implementation ME time), then one line per implementation
+    float overhead_totaltime = program_totaltime;
+    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ ) overhead_totaltime -= smatrix1multi_totaltime[iimplC];
+    printf( " [COUNTERS] PROGRAM TOTAL          : %9.4fs\n", program_totaltime );
+    printf( " [COUNTERS] Fortran Overhead ( 0 ) : %9.4fs\n", overhead_totaltime );
+    for( unsigned int iimplC = 0; iimplC < nimplC; iimplC++ )
+      if( smatrix1multi_counter[iimplC] > 0 ) // only report implementations that were given at least one event
+        printf( " [COUNTERS] %7s MEs      ( %1d ) : %9.4fs for %8d events => throughput is %8.2E events/s\n",
+                iimplC2TXT( iimplC ),
+                iimplC + 1, // printed label: 1 for Fortran, 2 for CudaCpp (0 labels the overhead line above)
+                smatrix1multi_totaltime[iimplC],
+                smatrix1multi_counter[iimplC],
+                smatrix1multi_counter[iimplC] / smatrix1multi_totaltime[iimplC] ); // events divided by accumulated duration
+    return;
+  }
+}
diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/ompnumthreads.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/ompnumthreads.cc
new file mode 100644
index 0000000000..1d004923b9
--- /dev/null
+++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/ompnumthreads.cc
@@ -0,0 +1,25 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: A. Valassi (Dec 2022) for the MG5aMC CUDACPP plugin.
+// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
+
+#include <ompnumthreads.h>
+
+// NB1: The C function ompnumthreads_not_set_means_one_thread_ is called by Fortran code
+// Hence the trailing "_": 'call xxx()' links to xxx_
+// See http://www.yolinux.com/TUTORIALS/LinuxTutorialMixingFortranAndC.html
+
+// NB2: This file also contains C++ code and is built using g++
+// Hence use 'extern "C"' to avoid name mangling by the C++ compiler
+// See https://www.geeksforgeeks.org/extern-c-in-c
+
+#ifdef _OPENMP // the wrapper below is only compiled when _OPENMP is defined
+extern "C"
+{
+  void ompnumthreads_not_set_means_one_thread_()
+  {
+    const int debuglevel = 0;                        // quiet(-1), info(0), debug(1)
+    ompnumthreadsNotSetMeansOneThread( debuglevel ); // call the inline C++ function defined in the .h file
+  }
+}
+#endif // _OPENMP
diff --git a/epochX/cudacpp/pp_tt012j.mad/mg5.in b/epochX/cudacpp/pp_tt012j.mad/mg5.in
index 6bc40e7968..c22e2d6100 100644
--- a/epochX/cudacpp/pp_tt012j.mad/mg5.in
+++ b/epochX/cudacpp/pp_tt012j.mad/mg5.in
@@ -4,4 +4,4 @@ define j = p
 generate p p > t t~ @0
 add process p p > t t~ j @1
 add process p p > t t~ j j @2
-output madevent pp_tt012j.mad --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp
+output madevent_simd pp_tt012j.mad --hel_recycling=False --vector_size=32