diff --git a/include/Residual/ResidualGive/applyAGive.inl b/include/Residual/ResidualGive/applyAGive.inl index 2802e0ef5..db10f0ba3 100644 --- a/include/Residual/ResidualGive/applyAGive.inl +++ b/include/Residual/ResidualGive/applyAGive.inl @@ -4,14 +4,16 @@ namespace residual_give { template -static inline void node_apply_a_give(int i_r, int i_theta, double r, double theta, const PolarGrid& grid, - const LevelCacheType& level_cache, bool DirBC_Interior, Vector& result, - ConstVector& x) +static inline void node_apply_a_give(int i_r, int i_theta, const PolarGrid& grid, const LevelCacheType& level_cache, + bool DirBC_Interior, Vector& result, ConstVector& x) { /* ---------------------------------------- */ /* Compute or retrieve stencil coefficients */ /* ---------------------------------------- */ - const int center = grid.index(i_r, i_theta); + const int center = grid.index(i_r, i_theta); + const double r = grid.radius(i_r); + const double theta = grid.theta(i_theta); + double coeff_beta, arr, att, art, detDF; level_cache.obtainValues(i_r, i_theta, center, r, theta, coeff_beta, arr, att, art, detDF); @@ -292,32 +294,97 @@ static inline void node_apply_a_give(int i_r, int i_theta, double r, double thet } // namespace residual_give template -void ResidualGive::applyCircleSection(const int i_r, Vector result, ConstVector x) const +void ResidualGive::applySystemOperator(Vector result, ConstVector x) const { - using residual_give::node_apply_a_give; + assert(result.size() == x.size()); - const PolarGrid& grid = Residual::grid_; + assign(result, 0.0); - const double r = grid.radius(i_r); - for (int i_theta = 0; i_theta < grid.ntheta(); i_theta++) { - const double theta = grid.theta(i_theta); - node_apply_a_give(i_r, i_theta, r, theta, grid, Residual::level_cache_, - Residual::DirBC_Interior_, result, x); - } -} + const PolarGrid& grid = Residual::grid_; + const LevelCacheType& level_cache = Residual::level_cache_; + const bool DirBC_Interior = Residual::DirBC_Interior_; + + const LevelCacheType* level_cache_ptr = &level_cache; + const PolarGrid* grid_ptr = &grid; + + const int num_smoother_circles = grid.numberSmootherCircles(); + const int additional_radial_tasks = grid.ntheta() % 3; + const int num_radial_tasks = grid.ntheta() - additional_radial_tasks; -template -void ResidualGive::applyRadialSection(const int i_theta, Vector result, - ConstVector x) const -{ using residual_give::node_apply_a_give; - const PolarGrid& grid = Residual::grid_; + /* ---------------- */ + /* Circular section */ + /* ---------------- */ + // We parallelize over i_r (step 3) to avoid data race conditions between adjacent circles. + // The i_theta loop is sequential inside the kernel. + for (int start_circle = 0; start_circle < 3; ++start_circle) { + const int num_circular_tasks = (num_smoother_circles - start_circle + 2) / 3; + Kokkos::parallel_for( + "ResidualGive: ApplyA (Circular)", Kokkos::RangePolicy<>(0, num_circular_tasks), + KOKKOS_LAMBDA(const int circle_task) { + const int i_r = start_circle + circle_task * 3; + for (int i_theta = 0; i_theta < grid.ntheta(); i_theta++) { + node_apply_a_give(i_r, i_theta, *grid_ptr, *level_cache_ptr, DirBC_Interior, result, x); + } + }); + Kokkos::fence(); + } - const double theta = grid.theta(i_theta); - for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) { - const double r = grid.radius(i_r); - node_apply_a_give(i_r, i_theta, r, theta, grid, Residual::level_cache_, - Residual::DirBC_Interior_, result, x); + /* -------------- */ + /* Radial section */ + /* -------------- */ + // We parallelize over i_theta (step 3) to avoid data race conditions between adjacent radial lines. + // The i_r loop is sequential inside the kernel. + // Due to periodicity in the angular direction, handle up to 2 additional + // radial lines (i_theta = 0 and 1) before the parallel passes. + for (int i_theta = 0; i_theta < additional_radial_tasks; i_theta++) { + Kokkos::parallel_for( + "ResidualGive: ApplyA (Radial, additional)", Kokkos::RangePolicy<>(0, 1), KOKKOS_LAMBDA(const int) { + for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) { + node_apply_a_give(i_r, i_theta, *grid_ptr, *level_cache_ptr, DirBC_Interior, result, x); + } + }); + Kokkos::fence(); + } + { + const int start_radial = additional_radial_tasks + 0; + const int num_radial_batches = num_radial_tasks / 3; + Kokkos::parallel_for( + "ResidualGive: ApplyA (Radial, pass 0)", Kokkos::RangePolicy<>(0, num_radial_batches), + KOKKOS_LAMBDA(const int radial_task) { + const int i_theta = start_radial + radial_task * 3; + for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) { + node_apply_a_give(i_r, i_theta, *grid_ptr, *level_cache_ptr, DirBC_Interior, result, x); + } + }); + Kokkos::fence(); + } + { + const int start_radial = additional_radial_tasks + 1; + const int num_radial_batches = num_radial_tasks / 3; + Kokkos::parallel_for( + "ResidualGive: ApplyA (Radial, pass 1)", Kokkos::RangePolicy<>(0, num_radial_batches), + KOKKOS_LAMBDA(const int radial_task) { + const int i_theta = start_radial + radial_task * 3; + for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) { + node_apply_a_give(i_r, i_theta, *grid_ptr, *level_cache_ptr, DirBC_Interior, result, x); + } + }); + Kokkos::fence(); + } + { + const int start_radial = additional_radial_tasks + 2; + const int num_radial_batches = num_radial_tasks / 3; + Kokkos::parallel_for( + "ResidualGive: ApplyA (Radial, pass 2)", Kokkos::RangePolicy<>(0, num_radial_batches), + KOKKOS_LAMBDA(const int radial_task) { + const int i_theta = start_radial + radial_task * 3; + for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) { + node_apply_a_give(i_r, i_theta, *grid_ptr, *level_cache_ptr, DirBC_Interior, result, x); + } + }); + Kokkos::fence(); } } +// clang-format on \ No newline at end of file diff --git a/include/Residual/ResidualGive/residualGive.h b/include/Residual/ResidualGive/residualGive.h index 37d1040d1..a29c1dc1b 100644 --- a/include/Residual/ResidualGive/residualGive.h +++ b/include/Residual/ResidualGive/residualGive.h @@ -16,10 +16,6 @@ class ResidualGive : public Residual void computeResidual(Vector result, ConstVector rhs, ConstVector x) const final; void applySystemOperator(Vector result, ConstVector x) const final; - -private: - void applyCircleSection(const int i_r, Vector result, ConstVector x) const; - void applyRadialSection(const int i_theta, Vector result, ConstVector x) const; }; #include "residualGive.inl" diff --git a/include/Residual/ResidualGive/residualGive.inl b/include/Residual/ResidualGive/residualGive.inl index bc50f6202..72fb6e4ee 100644 --- a/include/Residual/ResidualGive/residualGive.inl +++ b/include/Residual/ResidualGive/residualGive.inl @@ -7,105 +7,6 @@ ResidualGive::ResidualGive(const PolarGrid& grid, const LevelCac { } -/* ------------ */ -/* result = A*x */ - -// clang-format off -template -void ResidualGive::applySystemOperator(Vector result, ConstVector x) const -{ - assert(result.size() == x.size()); - - assign(result, 0.0); - - const PolarGrid& grid = Residual::grid_; - - const int num_omp_threads = Residual::num_omp_threads_; - - /* Single-threaded execution */ - if (num_omp_threads == 1) { - for (int i_r = 0; i_r < grid.numberSmootherCircles(); i_r++) { - applyCircleSection(i_r, result, x); - } - for (int i_theta = 0; i_theta < grid.ntheta(); i_theta++) { - applyRadialSection(i_theta, result, x); - } - } - /* Multi-threaded execution */ - else { - const int num_circle_tasks = grid.numberSmootherCircles(); - const int additional_radial_tasks = grid.ntheta() % 3; - const int num_radial_tasks = grid.ntheta() - additional_radial_tasks; - - #pragma omp parallel num_threads(num_omp_threads) - { - /* Circle Section 0 */ - #pragma omp for - for (int circle_task = 0; circle_task < num_circle_tasks; circle_task += 3) { - int i_r = grid.numberSmootherCircles() - circle_task - 1; - applyCircleSection(i_r, result, x); - } - /* Circle Section 1 */ - #pragma omp for - for (int circle_task = 1; circle_task < num_circle_tasks; circle_task += 3) { - int i_r = grid.numberSmootherCircles() - circle_task - 1; - applyCircleSection(i_r, result, x); - } - /* Circle Section 2 */ - #pragma omp for nowait - for (int circle_task = 2; circle_task < num_circle_tasks; circle_task += 3) { - int i_r = grid.numberSmootherCircles() - circle_task - 1; - applyCircleSection(i_r, result, x); - } - - /* Radial Section 0 */ - #pragma omp for - for (int radial_task = 0; radial_task < num_radial_tasks; radial_task += 3) { - if (radial_task > 0) { - int i_theta = radial_task + additional_radial_tasks; - applyRadialSection(i_theta, result, x); - } - else { - if (additional_radial_tasks == 0) { - applyRadialSection(0, result, x); - } - else if (additional_radial_tasks >= 1) { - applyRadialSection(0, result, x); - applyRadialSection(1, result, x); - } - } - } - /* Radial Section 1 */ - #pragma omp for - for (int radial_task = 1; radial_task < num_radial_tasks; radial_task += 3) { - if (radial_task > 1) { - int i_theta = radial_task + additional_radial_tasks; - applyRadialSection(i_theta, result, x); - } - else { - if (additional_radial_tasks == 0) { - applyRadialSection(1, result, x); - } - else if (additional_radial_tasks == 1) { - applyRadialSection(2, result, x); - } - else if (additional_radial_tasks == 2) { - applyRadialSection(2, result, x); - applyRadialSection(3, result, x); - } - } - } - /* Radial Section 2 */ - #pragma omp for - for (int radial_task = 2; radial_task < num_radial_tasks; radial_task += 3) { - int i_theta = radial_task + additional_radial_tasks; - applyRadialSection(i_theta, result, x); - } - } - } -} -// clang-format on - /* ------------------ */ /* result = rhs - A*x */ template