diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 3515a6d..51b81e7 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -30,23 +30,32 @@ jobs:
           cmake -B build -DCMAKE_INSTALL_PREFIX=$HOME/kokkos -DKokkos_CXX_STANDARD=14 -DKokkos_ENABLE_${{ matrix.backend }}=ON
           cmake --build build --parallel 2
           cmake --install build
+      - name: Checkout ALL
+        run: |
+          git clone --depth 1 --branch master https://gitlab.jsc.fz-juelich.de/SLMS/loadbalancing ALL
+      - name: Build ALL
+        working-directory: ALL
+        run: |
+          cmake -B build -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$HOME/ALL
+          cmake --build build --parallel 2
+          cmake --install build
       - name: Checkout Cabana
         uses: actions/checkout@v2.2.0
         with:
-          repository: ECP-copa/Cabana
-          ref: master
+          repository: aetx/Cabana
+          ref: cajita_loadbalancer
           path: Cabana
       - name: Build Cabana
         working-directory: Cabana
         run: |
-          cmake -B build -DCMAKE_INSTALL_PREFIX=$HOME/Cabana -DCMAKE_PREFIX_PATH="$HOME/kokkos" -DCabana_REQUIRE_${{ matrix.backend }}=ON
+          cmake -B build -DCMAKE_INSTALL_PREFIX=$HOME/Cabana -DCMAKE_PREFIX_PATH="$HOME/kokkos;$HOME/ALL" -DCabana_REQUIRE_${{ matrix.backend }}=ON -DCabana_REQUIRE_ALL=ON
           cmake --build build --parallel 2
           cmake --install build
       - name: Checkout code
         uses: actions/checkout@v2.2.0
       - name: Build
         run: |
-          cmake -B build -DCMAKE_PREFIX_PATH="$HOME/kokkos;$HOME/Cabana"
+          cmake -B build -DCMAKE_PREFIX_PATH="$HOME/kokkos;$HOME/Cabana;$HOME/ALL"
           cmake --build build --parallel 2
           cd build && ctest --output-on-failure
       - name: Format
diff --git a/.gitignore b/.gitignore
index a53d962..35aea67 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-*.mp4
\ No newline at end of file
+*.mp4
+*.swp
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a0a242f..1e21d27 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,6 +4,7 @@ cmake_minimum_required(VERSION 3.12)
 project(ExaMPM LANGUAGES CXX VERSION 0.1.0)
 
 include(GNUInstallDirs)
+include(CMakeDependentOption)
 
 # find dependencies
 set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
@@ -16,6 +17,8 @@ if( NOT Cabana_ENABLE_CAJITA )
 endif()
 
 find_package(Silo REQUIRED)
 
+cmake_dependent_option(ExaMPM_ENABLE_LB "Enable Cabana load balancing" ON Cabana_ENABLE_ALL OFF)
+
 # find Clang Format
 find_package( CLANG_FORMAT 10 )
diff --git a/examples/dam_break.cpp b/examples/dam_break.cpp
index e137445..47c4370 100644
--- a/examples/dam_break.cpp
+++ b/examples/dam_break.cpp
@@ -66,7 +66,7 @@ struct ParticleInitFunc
 //---------------------------------------------------------------------------//
 void damBreak( const double cell_size, const int ppc, const int halo_size,
                const double delta_t, const double t_final, const int write_freq,
-               const std::string& device )
+               const std::string& device, const int lb_freq )
 {
     // The dam break domain is in a box on [0,1] in each dimension.
     Kokkos::Array<double, 6> global_box = { 0.0, 0.0, 0.0, 1.0, 1.0, 1.0 };
@@ -87,7 +87,8 @@ void damBreak( const double cell_size, const int ppc, const int halo_size,
     int comm_size;
     MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
     std::array<int, 3> ranks_per_dim = { 1, comm_size, 1 };
-    Cajita::ManualPartitioner partitioner( ranks_per_dim );
+    std::shared_ptr<Cajita::ManualPartitioner> partitioner =
+        std::make_shared<Cajita::ManualPartitioner>( ranks_per_dim );
 
     // Material properties.
     double bulk_modulus = 1.0e5;
@@ -112,7 +113,7 @@ void damBreak( const double cell_size, const int ppc, const int halo_size,
         device, MPI_COMM_WORLD, global_box, global_num_cell, periodic,
         partitioner, halo_size, ParticleInitFunc( cell_size, ppc, density ),
         ppc, bulk_modulus, density, gamma, kappa, delta_t, gravity, bc );
-    solver->solve( t_final, write_freq );
+    solver->solve( t_final, write_freq, lb_freq );
 }
 
 //---------------------------------------------------------------------------//
@@ -143,8 +144,12 @@ int main( int argc, char* argv[] )
     // device type
     std::string device( argv[7] );
 
+    // optional lb frequency
+    int lb_freq = argc > 8 ? std::atoi( argv[8] ) : 10;
+
     // run the problem.
-    damBreak( cell_size, ppc, halo_size, delta_t, t_final, write_freq, device );
+    damBreak( cell_size, ppc, halo_size, delta_t, t_final, write_freq, device,
+              lb_freq );
 
     Kokkos::finalize();
diff --git a/examples/free_fall.cpp b/examples/free_fall.cpp
index 4b4eb19..9bb24f9 100644
--- a/examples/free_fall.cpp
+++ b/examples/free_fall.cpp
@@ -65,7 +65,7 @@ struct ParticleInitFunc
 //---------------------------------------------------------------------------//
 void freeFall( const double cell_size, const int ppc, const int halo_size,
               const double delta_t, const double t_final, const int write_freq,
-              const std::string& device )
+              const std::string& device, const int lb_freq )
 {
     // The dam break domain is in a box on [0,1] in each dimension.
     Kokkos::Array<double, 6> global_box = { -0.5, -0.5, -0.5, 0.5, 0.5, 0.5 };
@@ -86,7 +86,8 @@ void freeFall( const double cell_size, const int ppc, const int halo_size,
     int comm_size;
     MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
     std::array<int, 3> ranks_per_dim = { 1, comm_size, 1 };
-    Cajita::ManualPartitioner partitioner( ranks_per_dim );
+    std::shared_ptr<Cajita::ManualPartitioner> partitioner =
+        std::make_shared<Cajita::ManualPartitioner>( ranks_per_dim );
 
     // Material properties.
     double bulk_modulus = 5.0e5;
@@ -111,7 +112,7 @@ void freeFall( const double cell_size, const int ppc, const int halo_size,
         device, MPI_COMM_WORLD, global_box, global_num_cell, periodic,
         partitioner, halo_size, ParticleInitFunc( cell_size, ppc, density ),
         ppc, bulk_modulus, density, gamma, kappa, delta_t, gravity, bc );
-    solver->solve( t_final, write_freq );
+    solver->solve( t_final, write_freq, lb_freq );
 }
 
 //---------------------------------------------------------------------------//
@@ -142,8 +143,12 @@ int main( int argc, char* argv[] )
     // device type
     std::string device( argv[7] );
 
+    // optional lb frequency
+    int lb_freq = argc > 8 ? std::atoi( argv[8] ) : 10;
+
     // run the problem.
-    freeFall( cell_size, ppc, halo_size, delta_t, t_final, write_freq, device );
+    freeFall( cell_size, ppc, halo_size, delta_t, t_final, write_freq, device,
+              lb_freq );
 
     Kokkos::finalize();
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 1dd8405..70c88e1 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -8,6 +8,7 @@ set(HEADERS
   ExaMPM_Solver.hpp
   ExaMPM_TimeIntegrator.hpp
   ExaMPM_Types.hpp
+  ExaMPM_VTKDomainWriter.hpp
   ExaMPM_VelocityInterpolation.hpp
   )
 
@@ -17,6 +18,10 @@ set(SOURCES
 
 add_library(exampm ${SOURCES})
 
+if(ExaMPM_ENABLE_LB)
+  target_compile_definitions(exampm PUBLIC ExaMPM_ENABLE_LB)
+endif()
+
 target_link_libraries(exampm
   Cabana::cabanacore
   Cabana::Cajita
diff --git a/src/ExaMPM_Mesh.hpp b/src/ExaMPM_Mesh.hpp
index c8336bc..bdf886a 100644
--- a/src/ExaMPM_Mesh.hpp
+++ b/src/ExaMPM_Mesh.hpp
@@ -91,16 +91,25 @@ class Mesh
         }
 
         // Create the global mesh.
-        auto global_mesh = Cajita::createUniformGlobalMesh(
+        _global_mesh = Cajita::createUniformGlobalMesh(
             global_low_corner, global_high_corner, num_cell );
 
         // Build the global grid.
-        auto global_grid = Cajita::createGlobalGrid( comm, global_mesh,
-                                                     periodic, partitioner );
+        _global_grid = Cajita::createGlobalGrid( comm, _global_mesh, periodic,
+                                                 partitioner );
 
         // Build the local grid.
-        int halo_width = std::max( minimum_halo_cell_width, halo_cell_width );
-        _local_grid = Cajita::createLocalGrid( global_grid, halo_width );
+        _halo_width = std::max( minimum_halo_cell_width, halo_cell_width );
+        _local_grid = Cajita::createLocalGrid( _global_grid, _halo_width );
+    }
+
+    // Set a new global grid and update the dependent local values.
+    void newGlobalGrid(
+        std::shared_ptr<Cajita::GlobalGrid<Cajita::UniformMesh<double>>>&
+            global_grid )
+    {
+        _global_grid = global_grid;
+        _local_grid = Cajita::createLocalGrid( _global_grid, _halo_width );
     }
 
     // Get the local grid.
@@ -110,6 +119,19 @@
         return _local_grid;
     }
 
+    // Get the mutable global grid.
+    const std::shared_ptr<Cajita::GlobalGrid<Cajita::UniformMesh<double>>>&
+    globalGrid()
+    {
+        return _global_grid;
+    }
+
+    const std::shared_ptr<Cajita::GlobalMesh<Cajita::UniformMesh<double>>>&
+    globalMesh() const
+    {
+        return _global_mesh;
+    }
+
     // Get the cell size.
     double cellSize() const
     {
@@ -130,6 +152,12 @@
   public:
 
     std::shared_ptr<Cajita::LocalGrid<Cajita::UniformMesh<double>>> _local_grid;
+    std::shared_ptr<Cajita::GlobalGrid<Cajita::UniformMesh<double>>>
+        _global_grid;
+    std::shared_ptr<Cajita::GlobalMesh<Cajita::UniformMesh<double>>>
+        _global_mesh;
+
+    int _halo_width;
 
     Kokkos::Array<int, 3> _min_domain_global_node_index;
     Kokkos::Array<int, 3> _max_domain_global_node_index;
diff --git a/src/ExaMPM_ProblemManager.hpp b/src/ExaMPM_ProblemManager.hpp
index a479846..2d7b1f1 100644
--- a/src/ExaMPM_ProblemManager.hpp
+++ b/src/ExaMPM_ProblemManager.hpp
@@ -116,7 +116,12 @@ class ProblemManager
     {
         initializeParticles( exec_space, *( _mesh->localGrid() ),
                              particles_per_cell, create_functor, _particles );
+        updateMesh( mesh );
+    }
+    void updateMesh( const std::shared_ptr<mesh_type>& mesh )
+    {
+        _mesh = mesh;
 
         auto node_vector_layout =
             Cajita::createArrayLayout( _mesh->localGrid(), 3, Cajita::Node() );
         auto node_scalar_layout =
@@ -259,6 +264,9 @@
     void scatter( Location::Cell, Field::Density ) const
     {
+        printf( "%s: _density extent %lu %lu %lu\n", __func__,
+                _density->view().extent( 0 ), _density->view().extent( 1 ),
+                _density->view().extent( 2 ) );
         _cell_scalar_halo->scatter( execution_space(),
                                     Cajita::ScatterReduce::Sum(), *_density );
     }
 
diff --git a/src/ExaMPM_SiloParticleWriter.hpp b/src/ExaMPM_SiloParticleWriter.hpp
index 99a332a..6b55794 100644
--- a/src/ExaMPM_SiloParticleWriter.hpp
+++ b/src/ExaMPM_SiloParticleWriter.hpp
@@ -413,6 +413,8 @@ void writeTimeStep( const GlobalGridType& global_grid,
     // Mirror the coordinates to the host.
     auto host_coords =
         Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), view );
+    // todo(sschulz): Handle empty domains. Without kokkos debug build this
+    // seems to work. But it should still be handled properly.
 
     // Add the point mesh.
     std::string mesh_name = "particles";
diff --git a/src/ExaMPM_Solver.hpp b/src/ExaMPM_Solver.hpp
index fbd9653..7bd0b02 100644
--- a/src/ExaMPM_Solver.hpp
+++ b/src/ExaMPM_Solver.hpp
@@ -17,11 +17,15 @@
 #include <ExaMPM_ProblemManager.hpp>
 #include <ExaMPM_SiloParticleWriter.hpp>
 #include <ExaMPM_TimeIntegrator.hpp>
+#include <ExaMPM_VTKDomainWriter.hpp>
+
+#ifdef ExaMPM_ENABLE_LB
+#include <Cajita_LoadBalancer.hpp>
+#endif
 
 #include <Cajita.hpp>
 
 #include <Kokkos_Core.hpp>
 
-#include <memory>
 
 #include <mpi.h>
@@ -33,6 +37,8 @@ class SolverBase
   public:
     virtual ~SolverBase() = default;
     virtual void solve( const double t_final, const int write_freq ) = 0;
+    virtual void solve( const double t_final, const int write_freq,
+                        const int lb_freq ) = 0;
 };
 
 //---------------------------------------------------------------------------//
@@ -44,7 +50,7 @@ class Solver : public SolverBase
     Solver( MPI_Comm comm, const Kokkos::Array<double, 6>& global_bounding_box,
             const std::array<int, 3>& global_num_cell,
             const std::array<bool, 3>& periodic,
-            const Cajita::BlockPartitioner<3>& partitioner,
+            const std::shared_ptr<Cajita::ManualPartitioner>& partitioner,
             const int halo_cell_width, const InitFunc& create_functor,
             const int particles_per_cell, const double bulk_modulus,
             const double density, const double gamma, const double kappa,
@@ -54,9 +60,11 @@ class Solver : public SolverBase
         , _gravity( gravity )
         , _bc( bc )
         , _halo_min( 3 )
+        , _comm( comm )
+        , _partitioner( partitioner )
     {
         _mesh = std::make_shared<Mesh<MemorySpace>>(
-            global_bounding_box, global_num_cell, periodic, partitioner,
+            global_bounding_box, global_num_cell, periodic, *partitioner,
             halo_cell_width, _halo_min, comm );
 
         _bc.min = _mesh->minDomainGlobalNodeIndex();
@@ -67,16 +75,58 @@ class Solver : public SolverBase
             bulk_modulus, density, gamma, kappa );
 
         MPI_Comm_rank( comm, &_rank );
+
+#ifdef ExaMPM_ENABLE_LB
+        _lb = std::make_shared<
+            Cajita::Experimental::LoadBalancer<Cajita::UniformMesh<double>>>(
+            _comm, _mesh->globalGrid(), 3. * _mesh->cellSize() );
+#endif
     }
 
     void solve( const double t_final, const int write_freq ) override
     {
+        solve( t_final, write_freq, 10 );
+    }
+
+    void solve( const double t_final, const int write_freq,
+                const int lb_freq ) override
+    {
+        double output_time = 0;
+        double integrate_time = 0;
+        double lb_time = 0;
+        double comm_part_time = 0;
+        Kokkos::Timer timer, output_timer, integrate_timer, lb_timer,
+            comm_part_timer, step_timer;
+
+        output_timer.reset();
         SiloParticleWriter::writeTimeStep(
             _mesh->localGrid()->globalGrid(), 0, 0.0,
             _pm->get( Location::Particle(), Field::Position() ),
             _pm->get( Location::Particle(), Field::Velocity() ),
             _pm->get( Location::Particle(), Field::J() ) );
+        std::string vtk_actual_domain_basename( "domain_act" );
+        std::string vtk_lb_domain_basename( "domain_lb" );
+        std::array<double, 6> vertices;
+
+#ifdef ExaMPM_ENABLE_LB
+        vertices = _lb->getVertices();
+        VTKDomainWriter::writeDomain( _comm, 0, vertices,
+                                      vtk_actual_domain_basename );
+        vertices = _lb->getInternalVertices();
+        VTKDomainWriter::writeDomain( _comm, 0, vertices,
+                                      vtk_lb_domain_basename );
+#endif
+        output_time += output_timer.seconds();
+
+        int total_num_particle;
+        int num_particle = _pm->numParticle();
+        MPI_Reduce( &num_particle, &total_num_particle, 1, MPI_INT, MPI_SUM, 0,
+                    _comm );
+        int comm_size;
+        MPI_Comm_size( _comm, &comm_size );
+        step_timer.reset();
+
         int num_step = t_final / _dt;
         double delta_t = t_final / num_step;
         double time = 0.0;
 
@@ -85,20 +135,85 @@
             if ( 0 == _rank && 0 == t % write_freq )
                 printf( "Step %d / %d\n", t + 1, num_step );
 
+            integrate_timer.reset();
             TimeIntegrator::step( ExecutionSpace(), *_pm, delta_t, _gravity,
                                   _bc );
+            integrate_time += integrate_timer.seconds();
 
+            lb_timer.reset();
+#ifdef ExaMPM_ENABLE_LB
+            if ( lb_freq > 0 && 0 == t % lb_freq )
+            {
+                double work = _pm->numParticle();
+                auto global_grid = _lb->createBalancedGlobalGrid(
+                    _mesh->globalMesh(), *_partitioner, work );
+                _mesh->newGlobalGrid( global_grid );
+                _pm->updateMesh( _mesh );
+            }
+#endif
+            lb_time += lb_timer.seconds();
+
+            comm_part_timer.reset();
             _pm->communicateParticles( _halo_min );
+            comm_part_time += comm_part_timer.seconds();
 
             if ( 0 == t % write_freq )
+            {
+                output_timer.reset();
                 SiloParticleWriter::writeTimeStep(
                     _mesh->localGrid()->globalGrid(), t + 1, time,
                     _pm->get( Location::Particle(), Field::Position() ),
                     _pm->get( Location::Particle(), Field::Velocity() ),
                     _pm->get( Location::Particle(), Field::J() ) );
 
+#ifdef ExaMPM_ENABLE_LB
+                vertices = _lb->getVertices();
+                VTKDomainWriter::writeDomain( _comm, t, vertices,
+                                              vtk_actual_domain_basename );
+                vertices = _lb->getInternalVertices();
+                VTKDomainWriter::writeDomain( _comm, t, vertices,
+                                              vtk_lb_domain_basename );
+                double imbalance = _lb->getImbalance();
+                output_time += output_timer.seconds();
+                output_timer.reset();
+                if ( _rank == 0 )
+                {
+                    double step_time = step_timer.seconds();
+                    std::cout
+                        << std::fixed << std::setprecision( 5 ) << t << " "
+                        << imbalance << " " << write_freq / step_time << " "
+                        << write_freq / step_time * total_num_particle << " "
+                        << write_freq / step_time * total_num_particle /
+                               comm_size
+                        << std::endl;
+                }
+                step_timer.reset();
+                output_time += output_timer.seconds();
+#endif
+            }
+
             time += delta_t;
         }
+        double total_time = timer.seconds();
+        double steps_per_sec = 1. * num_step / total_time;
+        if ( _rank == 0 )
+        {
+            std::cout
+                << std::fixed << std::setprecision( 2 )
+                << "\n#Procs Particles | Time T_Out T_lb T_Int T_Comm_part\n"
+                << comm_size << " " << total_num_particle << " | " << total_time
+                << " " << output_time << " " << lb_time << " " << integrate_time
+                << " " << comm_part_time << " | PERFORMANCE\n"
+                << comm_size << " " << total_num_particle << " | "
+                << "1.0"
+                << " " << output_time / total_time << " "
+                << lb_time / total_time << " " << integrate_time / total_time
+                << " " << comm_part_time / total_time << " | FRACTION\n\n"
+                << "#Steps/s Particlesteps/s Particlesteps/(proc*s)\n"
+                << std::scientific << steps_per_sec << " "
+                << steps_per_sec * total_num_particle << " "
+                << steps_per_sec * total_num_particle / comm_size << std::endl;
+        }
     }
 
   private:
@@ -109,6 +224,13 @@ class Solver : public SolverBase
     std::shared_ptr<Mesh<MemorySpace>> _mesh;
     std::shared_ptr<ProblemManager<MemorySpace>> _pm;
     int _rank;
+    MPI_Comm _comm;
+    std::shared_ptr<Cajita::ManualPartitioner> _partitioner;
+#ifdef ExaMPM_ENABLE_LB
+    std::shared_ptr<
+        Cajita::Experimental::LoadBalancer<Cajita::UniformMesh<double>>>
+        _lb;
+#endif
 };
 
 //---------------------------------------------------------------------------//
@@ -119,7 +241,7 @@ createSolver( const std::string& device, MPI_Comm comm,
               const Kokkos::Array<double, 6>& global_bounding_box,
               const std::array<int, 3>& global_num_cell,
               const std::array<bool, 3>& periodic,
-              const Cajita::BlockPartitioner<3>& partitioner,
+              const std::shared_ptr<Cajita::ManualPartitioner>& partitioner,
               const int halo_cell_width, const InitFunc& create_functor,
               const int particles_per_cell, const double bulk_modulus,
               const double density, const double gamma, const double kappa,
diff --git a/src/ExaMPM_VTKDomainWriter.hpp b/src/ExaMPM_VTKDomainWriter.hpp
new file mode 100644
index 0000000..ff806d6
--- /dev/null
+++ b/src/ExaMPM_VTKDomainWriter.hpp
@@ -0,0 +1,133 @@
+/****************************************************************************
+ * Copyright (c) 2018-2020 by the ExaMPM authors                            *
+ * All rights reserved.                                                     *
+ *                                                                          *
+ * This file is part of the ExaMPM library. ExaMPM is distributed under a  *
+ * BSD 3-clause license. For the licensing terms see the LICENSE file in   *
+ * the top-level directory.                                                *
+ *                                                                          *
+ * SPDX-License-Identifier: BSD-3-Clause                                    *
+ ****************************************************************************/
+
+#ifndef EXAMPM_VTKDOMAINWRITER_HPP
+#define EXAMPM_VTKDOMAINWRITER_HPP
+#include <mpi.h>
+
+#include <array>
+#include <sstream>
+
+namespace ExaMPM
+{
+namespace VTKDomainWriter
+{
+// Write PVTU
+void writeDomainParallelFile( MPI_Comm comm, int time_step,
+                              std::string& basename )
+{
+    // Should only be called from a single rank
+    int size;
+    MPI_Comm_size( comm, &size );
+    std::stringstream filename;
+    filename << basename << "_" << time_step << ".pvtu";
+    FILE* file = fopen( filename.str().c_str(), "w" );
+    fprintf( file, "<?xml version=\"1.0\"?>\n" );
+    fprintf( file, "<VTKFile type=\"PUnstructuredGrid\" version=\"1.0\">\n" );
+    fprintf( file, "<PUnstructuredGrid>\n" );
+    fprintf( file, "\t<PCellData>\n" );
+    fprintf( file, "\t\t<PDataArray type=\"Int32\" Name=\"rank\"/>\n" );
+    fprintf( file, "\t</PCellData>\n" );
+    fprintf( file, "\t<PPoints>\n" );
+    fprintf( file, "\t\t<PDataArray type=\"Float64\" Name=\"Points\" NumberOfComponents=\"3\"/>\n" );
+    fprintf( file, "\t</PPoints>\n" );
+    for ( int i = 0; i < size; ++i )
+        fprintf( file, "\t<Piece Source=\"%s_%d_%d.vtu\"/>\n", basename.c_str(),
+                 time_step, i );
+    fprintf( file, "</PUnstructuredGrid>\n" );
+    fprintf( file, "</VTKFile>\n" );
+    fclose( file );
+}
+// Write VTU for domain (low corner, high corner)
+void writeDomain( MPI_Comm comm, int time_step,
+                  std::array<double, 6>& domain_vertices,
+                  std::string& basename )
+{
+    int rank;
+    MPI_Comm_rank( comm, &rank );
+    if ( rank == 1 )
+        writeDomainParallelFile( comm, time_step, basename );
+    std::stringstream filename;
+    // todo(sschulz): properly format, according to max rank
+    filename << basename << "_" << time_step << "_" << rank << ".vtu";
+    FILE* file = fopen( filename.str().c_str(), "w" );
+    fprintf( file, "<?xml version=\"1.0\"?>\n" );
+    fprintf( file, "<VTKFile type=\"UnstructuredGrid\" version=\"1.0\">\n" );
+    fprintf( file, "<UnstructuredGrid>\n" );
+    std::array<double, 24> vertices;
+    vertices[0 * 3 + 0] = domain_vertices[0];
+    vertices[2 * 3 + 0] = domain_vertices[0];
+    vertices[4 * 3 + 0] = domain_vertices[0];
+    vertices[6 * 3 + 0] = domain_vertices[0];
+    vertices[0 * 3 + 1] = domain_vertices[1];
+    vertices[1 * 3 + 1] = domain_vertices[1];
+    vertices[4 * 3 + 1] = domain_vertices[1];
+    vertices[5 * 3 + 1] = domain_vertices[1];
+    vertices[0 * 3 + 2] = domain_vertices[2];
+    vertices[1 * 3 + 2] = domain_vertices[2];
+    vertices[2 * 3 + 2] = domain_vertices[2];
+    vertices[3 * 3 + 2] = domain_vertices[2];
+    vertices[1 * 3 + 0] = domain_vertices[3];
+    vertices[3 * 3 + 0] = domain_vertices[3];
+    vertices[5 * 3 + 0] = domain_vertices[3];
+    vertices[7 * 3 + 0] = domain_vertices[3];
+    vertices[2 * 3 + 1] = domain_vertices[4];
+    vertices[3 * 3 + 1] = domain_vertices[4];
+    vertices[6 * 3 + 1] = domain_vertices[4];
+    vertices[7 * 3 + 1] = domain_vertices[4];
+    vertices[4 * 3 + 2] = domain_vertices[5];
+    vertices[5 * 3 + 2] = domain_vertices[5];
+    vertices[6 * 3 + 2] = domain_vertices[5];
+    vertices[7 * 3 + 2] = domain_vertices[5];
+    std::array<int, 8> connectivity = { 0, 1, 2, 3, 4, 5, 6, 7 };
+    fprintf( file, "<Piece NumberOfPoints=\"8\" NumberOfCells=\"1\">\n" );
+    fprintf( file, "\t<PointData>\n" );
+    fprintf( file, "\t</PointData>\n" );
+    fprintf( file, "\t<CellData>\n" );
+    fprintf( file, "\t\t<DataArray type=\"Int32\" Name=\"rank\" format=\"ascii\">\n" );
+    fprintf( file, "%d", rank );
+    fprintf( file, "\n\t\t</DataArray>\n" );
+    fprintf( file, "\t</CellData>\n" );
+    fprintf( file, "\t<Points>\n" );
+    fprintf( file, "\t\t<DataArray type=\"Float64\" Name=\"Points\" NumberOfComponents=\"3\" format=\"ascii\">\n" );
+    for ( const double& vert : vertices )
+        fprintf( file, "%g ", vert );
+    fprintf( file, "\n\t\t</DataArray>\n" );
+    fprintf( file, "\t</Points>\n" );
+    fprintf( file, "\t<Cells>\n" );
+    fprintf( file, "\t\t<DataArray type=\"Int64\" Name=\"connectivity\" format=\"ascii\">\n" );
+    for ( const int& conn : connectivity )
+        fprintf( file, "%d ", conn );
+    fprintf( file, "\n\t\t</DataArray>\n" );
+    fprintf( file, "\t\t<DataArray type=\"Int64\" Name=\"offsets\" format=\"ascii\">\n" );
+    fprintf( file, "8\n" );
+    fprintf( file, "\n\t\t</DataArray>\n" );
+    fprintf( file, "\t\t<DataArray type=\"UInt8\" Name=\"types\" format=\"ascii\">\n" );
+ fprintf( file, "11\n" ); + fprintf( file, "\n\t\t\n" ); + fprintf( file, "\t\n" ); + fprintf( file, "\n" ); + fprintf( file, "\n" ); + fprintf( file, "\n" ); + fclose( file ); +} + +} // end namespace VTKDomainWriter +} // end namespace ExaMPM +#endif // EXAMPM_VTKDOMAINWRITER_HPP