diff --git a/benchmark/preconditioner/preconditioner.cpp b/benchmark/preconditioner/preconditioner.cpp index bcb7e955764..cadea8ab45a 100644 --- a/benchmark/preconditioner/preconditioner.cpp +++ b/benchmark/preconditioner/preconditioner.cpp @@ -225,7 +225,6 @@ struct PreconditionerBenchmark : Benchmark { auto x_clone = clone(state.x); auto precond = precond_factory.at(decoded_precond_name)(exec); - std::unique_ptr precond_op; { auto gen_logger = create_operations_logger( FLAGS_gpu_timer, FLAGS_nested_names, exec, @@ -236,7 +235,7 @@ struct PreconditionerBenchmark : Benchmark { exec->get_master()->add_logger(gen_logger); } for (auto i = 0u; i < ic_gen.get_num_repetitions(); ++i) { - precond_op = precond->generate(state.system_matrix); + auto precond_op = precond->generate(state.system_matrix); } if (exec->get_master() != exec) { exec->get_master()->remove_logger(gen_logger); @@ -244,6 +243,9 @@ struct PreconditionerBenchmark : Benchmark { exec->remove_logger(gen_logger); } + // generate it for apply usage + auto precond_op = precond->generate(state.system_matrix); + auto apply_logger = create_operations_logger( FLAGS_gpu_timer, FLAGS_nested_names, exec, precond_case["apply"]["components"], diff --git a/benchmark/solver/solver_common.hpp b/benchmark/solver/solver_common.hpp index 5dc32ec712a..cb01c8c55dd 100644 --- a/benchmark/solver/solver_common.hpp +++ b/benchmark/solver/solver_common.hpp @@ -34,6 +34,11 @@ DEFINE_bool( rel_residual, false, "Use relative residual instead of residual reduction stopping criterion"); +DEFINE_bool(benchmark_from_scratch, false, + "benchmark the solver from scratch everytime which requires " + "workspace initialization everytime. When this is true, the " + "repetition progress will use the solver generated additionally."); + DEFINE_string(solvers, "cg", "A comma-separated list of solvers to run. 
" "Supported values are: bicgstab, bicg, cb_gmres_keep, " @@ -499,7 +504,6 @@ struct SolverBenchmark : Benchmark> { if (FLAGS_detailed && !FLAGS_overhead) { // slow run, get the time of each functions auto x_clone = clone(state.x); - { auto gen_logger = create_operations_logger( FLAGS_gpu_timer, FLAGS_nested_names, exec, @@ -509,10 +513,12 @@ struct SolverBenchmark : Benchmark> { exec->get_master()->add_logger(gen_logger); } - auto precond = precond_factory.at(precond_name)(exec); - solver = generate_solver(exec, give(precond), solver_name, - FLAGS_max_iters) - ->generate(state.system_matrix); + { + auto precond = precond_factory.at(precond_name)(exec); + auto solver = generate_solver(exec, give(precond), + solver_name, FLAGS_max_iters) + ->generate(state.system_matrix); + } exec->remove_logger(gen_logger); if (exec != exec->get_master()) { @@ -520,8 +526,14 @@ struct SolverBenchmark : Benchmark> { } } - if (auto prec = - dynamic_cast(solver.get())) { + // generate it for apply usage + auto precond = precond_factory.at(precond_name)(exec); + auto detailed_solver = generate_solver(exec, give(precond), + solver_name, FLAGS_max_iters) + ->generate(state.system_matrix); + + if (auto prec = dynamic_cast( + detailed_solver.get())) { solver_case["preconditioner"] = json::object(); write_precond_info( clone(exec->get_master(), prec->get_preconditioner()).get(), @@ -537,7 +549,7 @@ struct SolverBenchmark : Benchmark> { exec->get_master()->add_logger(apply_logger); } - solver->apply(state.b, x_clone); + detailed_solver->apply(state.b, x_clone); exec->remove_logger(apply_logger); if (exec != exec->get_master()) { @@ -554,8 +566,8 @@ struct SolverBenchmark : Benchmark> { solver_case["true_residuals"], solver_case["implicit_residuals"], solver_case["iteration_timestamps"]); - solver->add_logger(res_logger); - solver->apply(state.b, x_clone); + detailed_solver->add_logger(res_logger); + detailed_solver->apply(state.b, x_clone); if (!res_logger->has_implicit_res_norms()) { 
solver_case.erase("implicit_residuals"); } @@ -568,17 +580,33 @@ struct SolverBenchmark : Benchmark> { auto generate_timer = get_timer(exec, FLAGS_gpu_timer); auto apply_timer = ic.get_timer(); auto x_clone = clone(state.x); + // If we benchmark from scratch, we generate it here and do operations + // once. We cannot rely on the warmup one because it uses a different + // iteration criterion. + if (FLAGS_benchmark_from_scratch) { + auto precond = precond_factory.at(precond_name)(exec); + solver = gko::share(generate_solver(exec, give(precond), + solver_name, FLAGS_max_iters) + ->generate(state.system_matrix)); + solver->apply(state.b, x_clone); + } for (auto status : ic.run(false)) { auto range = annotate("repetition"); x_clone = clone(state.x); - exec->synchronize(); - generate_timer->tic(); - auto precond = precond_factory.at(precond_name)(exec); - solver = generate_solver(exec, give(precond), solver_name, - FLAGS_max_iters) - ->generate(state.system_matrix); - generate_timer->toc(); + { + exec->synchronize(); + generate_timer->tic(); + auto precond = precond_factory.at(precond_name)(exec); + auto generated_solver = + gko::share(generate_solver(exec, give(precond), solver_name, + FLAGS_max_iters) + ->generate(state.system_matrix)); + generate_timer->toc(); + if (FLAGS_benchmark_from_scratch || !solver) { + solver = generated_solver; + } + } exec->synchronize(); if (ic.get_num_repetitions() == 0) { diff --git a/benchmark/test/reference/distributed_solver.matrix.stdout b/benchmark/test/reference/distributed_solver.matrix.stdout index 67ac333bec5..784cf3b6b86 100644 --- a/benchmark/test/reference/distributed_solver.matrix.stdout +++ b/benchmark/test/reference/distributed_solver.matrix.stdout @@ -14,7 +14,6 @@ "generate": { "components": { "generate()": 1.0, - "free": 1.0, "overhead": 1.0 }, "time": 1.0 diff --git a/benchmark/test/reference/distributed_solver.simple.stdout b/benchmark/test/reference/distributed_solver.simple.stdout index 458115e6ab2..abeb536f9c4 100644
--- a/benchmark/test/reference/distributed_solver.simple.stdout +++ b/benchmark/test/reference/distributed_solver.simple.stdout @@ -16,7 +16,6 @@ "generate": { "components": { "generate()": 1.0, - "free": 1.0, "overhead": 1.0 }, "time": 1.0 diff --git a/benchmark/test/reference/distributed_solver_dcomplex.simple.stdout b/benchmark/test/reference/distributed_solver_dcomplex.simple.stdout index 458115e6ab2..abeb536f9c4 100644 --- a/benchmark/test/reference/distributed_solver_dcomplex.simple.stdout +++ b/benchmark/test/reference/distributed_solver_dcomplex.simple.stdout @@ -16,7 +16,6 @@ "generate": { "components": { "generate()": 1.0, - "free": 1.0, "overhead": 1.0 }, "time": 1.0 diff --git a/benchmark/test/reference/solver.matrix.stdout b/benchmark/test/reference/solver.matrix.stdout index 594a3887921..16ebab11663 100644 --- a/benchmark/test/reference/solver.matrix.stdout +++ b/benchmark/test/reference/solver.matrix.stdout @@ -14,7 +14,6 @@ "generate": { "components": { "generate()": 1.0, - "free": 1.0, "overhead": 1.0 }, "time": 1.0 diff --git a/benchmark/test/reference/solver.reordered.stdout b/benchmark/test/reference/solver.reordered.stdout index c1b826ae3fc..6389ab6bdaa 100644 --- a/benchmark/test/reference/solver.reordered.stdout +++ b/benchmark/test/reference/solver.reordered.stdout @@ -15,7 +15,6 @@ "generate": { "components": { "generate()": 1.0, - "free": 1.0, "overhead": 1.0 }, "time": 1.0 diff --git a/benchmark/test/reference/solver.simple.stdout b/benchmark/test/reference/solver.simple.stdout index 0ee0e4b9a4b..963dc9b1a8c 100644 --- a/benchmark/test/reference/solver.simple.stdout +++ b/benchmark/test/reference/solver.simple.stdout @@ -15,7 +15,6 @@ "generate": { "components": { "generate()": 1.0, - "free": 1.0, "overhead": 1.0 }, "time": 1.0 diff --git a/benchmark/test/reference/solver_dcomplex.simple.stdout b/benchmark/test/reference/solver_dcomplex.simple.stdout index 0ee0e4b9a4b..963dc9b1a8c 100644 --- 
a/benchmark/test/reference/solver_dcomplex.simple.stdout +++ b/benchmark/test/reference/solver_dcomplex.simple.stdout @@ -15,7 +15,6 @@ "generate": { "components": { "generate()": 1.0, - "free": 1.0, "overhead": 1.0 }, "time": 1.0