diff --git a/include/ExtrapolatedSmoother/ExtrapolatedSmootherGive/buildTridiagonalAsc.inl b/include/ExtrapolatedSmoother/ExtrapolatedSmootherGive/buildTridiagonalAsc.inl index 40aa90ba..fc8e02ab 100644 --- a/include/ExtrapolatedSmoother/ExtrapolatedSmootherGive/buildTridiagonalAsc.inl +++ b/include/ExtrapolatedSmoother/ExtrapolatedSmootherGive/buildTridiagonalAsc.inl @@ -5,25 +5,24 @@ namespace extrapolated_smoother_give { -static inline void updateMatrixElement(BatchedTridiagonalSolver& solver, int batch, int row, int column, - double value) +static KOKKOS_INLINE_FUNCTION void updateMatrixElement(const BatchedTridiagonalSolver& solver, int batch, + int row, int column, double value) { if (row == column) - solver.main_diagonal(batch, row) += value; + solver.increase_main_diagonal(batch, row, value); else if (row == column - 1) - solver.sub_diagonal(batch, row) += value; + solver.increase_sub_diagonal(batch, row, value); else if (row == 0 && column == solver.matrixDimension() - 1) - solver.cyclic_corner(batch) += value; + solver.increase_cyclic_corner(batch, value); } } // namespace extrapolated_smoother_give template -void ExtrapolatedSmootherGive::nodeBuildTridiagonalSolverMatrices( - int i_r, int i_theta, const PolarGrid& grid, bool DirBC_Interior, - BatchedTridiagonalSolver& circle_tridiagonal_solver, - BatchedTridiagonalSolver& radial_tridiagonal_solver, double arr, double att, double art, double detDF, - double coeff_beta) +KOKKOS_FUNCTION void ExtrapolatedSmootherGive::nodeBuildTridiagonalSolverMatrices( + int i_r, int i_theta, const PolarGrid& grid, const LevelCacheType& level_cache, bool DirBC_Interior, + const BatchedTridiagonalSolver& circle_tridiagonal_solver, + const BatchedTridiagonalSolver& radial_tridiagonal_solver) { using extrapolated_smoother_give::updateMatrixElement; @@ -36,6 +35,16 @@ void ExtrapolatedSmootherGive::nodeBuildTridiagonalSolverMatrice assert(numberSmootherCircles >= 3); assert(lengthRadialSmoother >= 3); + /* ---------------------------------------- */ + /* Compute or retrieve stencil coefficients */ + /* ---------------------------------------- */ + const int center = grid.index(i_r, i_theta); + const double radius = grid.radius(i_r); + const double theta = grid.theta(i_theta); + + double coeff_beta, arr, att, art, detDF; + level_cache.obtainValues(i_r, i_theta, center, radius, theta, coeff_beta, arr, att, art, detDF); + int ptr, offset; int row, column; double value; @@ -1040,108 +1049,72 @@ void ExtrapolatedSmootherGive::nodeBuildTridiagonalSolverMatrice } } -template -void ExtrapolatedSmootherGive::buildTridiagonalCircleSection(int i_r) -{ - const PolarGrid& grid = ExtrapolatedSmoother::grid_; - const LevelCacheType& level_cache = ExtrapolatedSmoother::level_cache_; - const bool DirBC_Interior = ExtrapolatedSmoother::DirBC_Interior_; - - // Access pattern is aligned with the memory layout of the grid data to maximize cache efficiency. - const double r = grid.radius(i_r); - for (int i_theta = 0; i_theta < grid.ntheta(); i_theta++) { - const int global_index = grid.index(i_r, i_theta); - const double theta = grid.theta(i_theta); - - double coeff_beta, arr, att, art, detDF; - level_cache.obtainValues(i_r, i_theta, global_index, r, theta, coeff_beta, arr, att, art, detDF); - - // Build Asc at the current node - nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, DirBC_Interior, circle_tridiagonal_solver_, - radial_tridiagonal_solver_, arr, att, art, detDF, coeff_beta); - } -} - -template -void ExtrapolatedSmootherGive::buildTridiagonalRadialSection(int i_theta) -{ - const PolarGrid& grid = ExtrapolatedSmoother::grid_; - const LevelCacheType& level_cache = ExtrapolatedSmoother::level_cache_; - const bool DirBC_Interior = ExtrapolatedSmoother::DirBC_Interior_; - - // Access pattern is aligned with the memory layout of the grid data to maximize cache efficiency. - const double theta = grid.theta(i_theta); - for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) { - const int global_index = grid.index(i_r, i_theta); - const double r = grid.radius(i_r); - - double coeff_beta, arr, att, art, detDF; - level_cache.obtainValues(i_r, i_theta, global_index, r, theta, coeff_beta, arr, att, art, detDF); - - // Build Asc at the current node - nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, DirBC_Interior, circle_tridiagonal_solver_, - radial_tridiagonal_solver_, arr, att, art, detDF, coeff_beta); - } -} - template void ExtrapolatedSmootherGive::buildTridiagonalSolverMatrices() { const PolarGrid& grid = ExtrapolatedSmoother::grid_; const LevelCacheType& level_cache = ExtrapolatedSmoother::level_cache_; const bool DirBC_Interior = ExtrapolatedSmoother::DirBC_Interior_; - const int num_omp_threads = ExtrapolatedSmoother::num_omp_threads_; - const int num_smoother_circles = grid.numberSmootherCircles(); - const int additional_radial_tasks = grid.ntheta() % 3; - const int num_radial_tasks = grid.ntheta() - additional_radial_tasks; + const BatchedTridiagonalSolver& circle_tridiagonal_solver = circle_tridiagonal_solver_; + const BatchedTridiagonalSolver& radial_tridiagonal_solver = radial_tridiagonal_solver_; /* ---------------- */ /* Circular section */ /* ---------------- */ - // We parallelize the loop with step 3 to avoid data race conditions between adjacent circles. -#pragma omp parallel num_threads(num_omp_threads) - { -#pragma omp for - for (int i_r = 0; i_r < num_smoother_circles; i_r += 3) { - buildTridiagonalCircleSection(i_r); - } /* Implicit barrier */ -#pragma omp for - for (int i_r = 1; i_r < num_smoother_circles; i_r += 3) { - buildTridiagonalCircleSection(i_r); - } /* Implicit barrier */ -#pragma omp for - for (int i_r = 2; i_r < num_smoother_circles; i_r += 3) { - buildTridiagonalCircleSection(i_r); - } /* Implicit barrier */ + // We parallelize over i_r (step 3) to avoid data race conditions between adjacent circles. + // The i_theta loop is sequential inside the kernel. + const int num_circle_tasks = grid.numberSmootherCircles(); + + for (int start_circle = 0; start_circle < 3; ++start_circle) { + const int num_circular_tasks = (num_circle_tasks - start_circle + 2) / 3; + Kokkos::parallel_for( + "SmootherGive: buildTridiagonalSolverMatrices (Circular)", + Kokkos::RangePolicy(0, num_circular_tasks), + KOKKOS_LAMBDA(const int circle_task) { + const int i_r = start_circle + circle_task * 3; + for (int i_theta = 0; i_theta < grid.ntheta(); i_theta++) { + nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, level_cache, DirBC_Interior, + circle_tridiagonal_solver, radial_tridiagonal_solver); + } + }); + Kokkos::fence(); } - /* ---------------- */ + /* -------------- */ /* Radial section */ - /* ---------------- */ - // We parallelize the loop with step 3 to avoid data race conditions between adjacent radial lines. - // Due to the periodicity in the angular direction, we can have at most 2 additional radial tasks - // that are handled serially before the parallel loops. + /* -------------- */ + // We parallelize over i_theta (step 3) to avoid data race conditions between adjacent radial lines. + // The i_r loop is sequential inside the kernel. + // Due to periodicity in the angular direction, handle up to 2 additional + // radial lines (i_theta = 0 and 1) before the parallel passes. + const int additional_radial_tasks = grid.ntheta() % 3; + const int num_radial_tasks = grid.ntheta() - additional_radial_tasks; + for (int i_theta = 0; i_theta < additional_radial_tasks; i_theta++) { - buildTridiagonalRadialSection(i_theta); + Kokkos::parallel_for( + "SmootherGive: buildTridiagonalSolverMatrices (Radial, additional)", + Kokkos::RangePolicy(0, 1), KOKKOS_LAMBDA(const int) { + for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) { + nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, level_cache, DirBC_Interior, + circle_tridiagonal_solver, radial_tridiagonal_solver); + } + }); + Kokkos::fence(); } -#pragma omp parallel num_threads(num_omp_threads) - { -#pragma omp for - for (int radial_task = 0; radial_task < num_radial_tasks; radial_task += 3) { - const int i_theta = radial_task + additional_radial_tasks; - buildTridiagonalRadialSection(i_theta); - } /* Implicit barrier */ -#pragma omp for - for (int radial_task = 1; radial_task < num_radial_tasks; radial_task += 3) { - const int i_theta = radial_task + additional_radial_tasks; - buildTridiagonalRadialSection(i_theta); - } /* Implicit barrier */ -#pragma omp for - for (int radial_task = 2; radial_task < num_radial_tasks; radial_task += 3) { - const int i_theta = radial_task + additional_radial_tasks; - buildTridiagonalRadialSection(i_theta); - } /* Implicit barrier */ + for (int start_radial = 0; start_radial < 3; ++start_radial) { + const int num_radial_batches = (num_radial_tasks - start_radial + 2) / 3; + Kokkos::parallel_for( + "SmootherGive: buildTridiagonalSolverMatrices (Radial)", + Kokkos::RangePolicy(0, num_radial_batches), + KOKKOS_LAMBDA(const int radial_task) { + const int i_theta = additional_radial_tasks + start_radial + radial_task * 3; + for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) { + nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, level_cache, DirBC_Interior, + circle_tridiagonal_solver, radial_tridiagonal_solver); + } + }); + Kokkos::fence(); } } diff --git a/include/ExtrapolatedSmoother/ExtrapolatedSmootherGive/extrapolatedSmootherGive.h b/include/ExtrapolatedSmoother/ExtrapolatedSmootherGive/extrapolatedSmootherGive.h index 32f448d0..a54eee2a 100644 --- a/include/ExtrapolatedSmoother/ExtrapolatedSmootherGive/extrapolatedSmootherGive.h +++ b/include/ExtrapolatedSmoother/ExtrapolatedSmootherGive/extrapolatedSmootherGive.h @@ -137,6 +137,8 @@ class ExtrapolatedSmootherGive : public ExtrapolatedSmoother // Solver object (owns matrix if MUMPS, references if in-house solver). InnerBoundarySolver inner_boundary_solver_; + // Public is required as Cuda needs to be able to get the address of functions enclosing lambda functions +public: /* -------------- */ /* Stencil access */ /* -------------- */ @@ -154,13 +156,12 @@ class ExtrapolatedSmootherGive : public ExtrapolatedSmoother /* --------------- */ // Build all A_sc matrices for circle and radial smoothers. void buildTridiagonalSolverMatrices(); - void buildTridiagonalCircleSection(int i_r); - void buildTridiagonalRadialSection(int i_theta); // Build the tridiagonal solver matrices for a specific node (i_r, i_theta) - void nodeBuildTridiagonalSolverMatrices(int i_r, int i_theta, const PolarGrid& grid, bool DirBC_Interior, - BatchedTridiagonalSolver& circle_tridiagonal_solver, - BatchedTridiagonalSolver& radial_tridiagonal_solver, double arr, - double att, double art, double detDF, double coeff_beta); + static KOKKOS_FUNCTION void + nodeBuildTridiagonalSolverMatrices(int i_r, int i_theta, const PolarGrid& grid, const LevelCacheType& level_cache, + bool DirBC_Interior, + const BatchedTridiagonalSolver& circle_tridiagonal_solver, + const BatchedTridiagonalSolver& radial_tridiagonal_solver); // Build the solver matrix for the interior boundary (i_r = 0) which is non-tridiagonal due to across-origin coupling. InnerBoundaryMatrix buildInteriorBoundarySolverMatrix(); diff --git a/include/ExtrapolatedSmoother/ExtrapolatedSmootherTake/buildTridiagonalAsc.inl b/include/ExtrapolatedSmoother/ExtrapolatedSmootherTake/buildTridiagonalAsc.inl index 13d78f8a..74bb14d3 100644 --- a/include/ExtrapolatedSmoother/ExtrapolatedSmootherTake/buildTridiagonalAsc.inl +++ b/include/ExtrapolatedSmoother/ExtrapolatedSmootherTake/buildTridiagonalAsc.inl @@ -3,25 +3,25 @@ namespace extrapolated_smoother_take { -static inline void updateMatrixElement(BatchedTridiagonalSolver& solver, int batch, int row, int column, - double value) +static KOKKOS_INLINE_FUNCTION void updateMatrixElement(const BatchedTridiagonalSolver& solver, int batch, + int row, int column, double value) { if (row == column) - solver.main_diagonal(batch, row) = value; + solver.set_main_diagonal(batch, row, value); else if (row == column - 1) - solver.sub_diagonal(batch, row) = value; + solver.set_sub_diagonal(batch, row, value); else if (row == 0 && column == solver.matrixDimension() - 1) - solver.cyclic_corner(batch) = value; + solver.set_cyclic_corner(batch, value); } } // namespace extrapolated_smoother_take template -void ExtrapolatedSmootherTake::nodeBuildTridiagonalSolverMatrices( +KOKKOS_FUNCTION void ExtrapolatedSmootherTake::nodeBuildTridiagonalSolverMatrices( int i_r, int i_theta, const PolarGrid& grid, bool DirBC_Interior, - BatchedTridiagonalSolver& circle_tridiagonal_solver, - BatchedTridiagonalSolver& radial_tridiagonal_solver, ConstVector& arr, ConstVector& att, - ConstVector& art, ConstVector& detDF, ConstVector& coeff_beta) + const BatchedTridiagonalSolver& circle_tridiagonal_solver, + const BatchedTridiagonalSolver& radial_tridiagonal_solver, ConstVector& arr, + ConstVector& att, ConstVector& art, ConstVector& detDF, ConstVector& coeff_beta) { using extrapolated_smoother_take::updateMatrixElement; @@ -52,6 +52,7 @@ void ExtrapolatedSmootherTake::nodeBuildTridiagonalSolverMatrice double coeff2 = 0.5 * (k1 + k2) / h2; double coeff3 = 0.5 * (h1 + h2) / k1; double coeff4 = 0.5 * (h1 + h2) / k2; + double coeff5 = 0.25 * (h1 + h2) * (k1 + k2); int i_theta_M1 = grid.wrapThetaIndex(i_theta - 1); int i_theta_P1 = grid.wrapThetaIndex(i_theta + 1); @@ -89,9 +90,9 @@ void ExtrapolatedSmootherTake::nodeBuildTridiagonalSolverMatrice /* Center: (Left, Right, Bottom, Top) */ row = center_index; column = center_index; - value = 0.25 * (h1 + h2) * (k1 + k2) * coeff_beta[center] * std::fabs(detDF[center]) + - coeff1 * (arr[center] + arr[left]) + coeff2 * (arr[center] + arr[right]) + - coeff3 * (att[center] + att[bottom]) + coeff4 * (att[center] + att[top]); + value = coeff5 * coeff_beta[center] * std::fabs(detDF[center]) + coeff1 * (arr[center] + arr[left]) + + coeff2 * (arr[center] + arr[right]) + coeff3 * (att[center] + att[bottom]) + + coeff4 * (att[center] + att[top]); updateMatrixElement(solver, batch, row, column, value); /* Bottom */ @@ -128,9 +129,9 @@ void ExtrapolatedSmootherTake::nodeBuildTridiagonalSolverMatrice /* Center: (Left, Right, Bottom, Top) */ row = center_index; column = center_index; - value = 0.25 * (h1 + h2) * (k1 + k2) * coeff_beta[center] * std::fabs(detDF[center]) + - coeff1 * (arr[center] + arr[left]) + coeff2 * (arr[center] + arr[right]) + - coeff3 * (att[center] + att[bottom]) + coeff4 * (att[center] + att[top]); + value = coeff5 * coeff_beta[center] * std::fabs(detDF[center]) + coeff1 * (arr[center] + arr[left]) + + coeff2 * (arr[center] + arr[right]) + coeff3 * (att[center] + att[bottom]) + + coeff4 * (att[center] + att[top]); updateMatrixElement(solver, batch, row, column, value); } else { @@ -190,6 +191,7 @@ void ExtrapolatedSmootherTake::nodeBuildTridiagonalSolverMatrice double coeff2 = 0.5 * (k1 + k2) / h2; double coeff3 = 0.5 * (h1 + h2) / k1; double coeff4 = 0.5 * (h1 + h2) / k2; + double coeff5 = 0.25 * (h1 + h2) * (k1 + k2); const int i_theta_M1 = grid.wrapThetaIndex(i_theta - 1); const int i_theta_P1 = grid.wrapThetaIndex(i_theta + 1); @@ -232,9 +234,9 @@ void ExtrapolatedSmootherTake::nodeBuildTridiagonalSolverMatrice /* Center: (Left, Right, Bottom, Top) */ row = center_index; column = center_index; - value = 0.25 * (h1 + h2) * (k1 + k2) * coeff_beta[center] * std::fabs(detDF[center]) + - coeff1 * (arr[center] + arr[left]) + coeff2 * (arr[center] + arr[right]) + - coeff3 * (att[center] + att[bottom]) + coeff4 * (att[center] + att[top]); + value = coeff5 * coeff_beta[center] * std::fabs(detDF[center]) + coeff1 * (arr[center] + arr[left]) + + coeff2 * (arr[center] + arr[right]) + coeff3 * (att[center] + att[bottom]) + + coeff4 * (att[center] + att[top]); updateMatrixElement(solver, batch, row, column, value); /* Left */ @@ -306,6 +308,7 @@ void ExtrapolatedSmootherTake::nodeBuildTridiagonalSolverMatrice double coeff2 = 0.5 * (k1 + k2) / h2; double coeff3 = 0.5 * (h1 + h2) / k1; double coeff4 = 0.5 * (h1 + h2) / k2; + double coeff5 = 0.25 * (h1 + h2) * (k1 + k2); const int i_theta_M1 = grid.wrapThetaIndex(i_theta - 1); const int i_theta_P1 = grid.wrapThetaIndex(i_theta + 1); @@ -340,9 +343,9 @@ void ExtrapolatedSmootherTake::nodeBuildTridiagonalSolverMatrice /* Center: (Left, Right, Bottom, Top) */ row = center_index; column = center_index; - value = 0.25 * (h1 + h2) * (k1 + k2) * coeff_beta[center] * std::fabs(detDF[center]) + - coeff1 * (arr[center] + arr[left]) + coeff2 * (arr[center] + arr[right]) + - coeff3 * (att[center] + att[bottom]) + coeff4 * (att[center] + att[top]); + value = coeff5 * coeff_beta[center] * std::fabs(detDF[center]) + coeff1 * (arr[center] + arr[left]) + + coeff2 * (arr[center] + arr[right]) + coeff3 * (att[center] + att[bottom]) + + coeff4 * (att[center] + att[top]); updateMatrixElement(solver, batch, row, column, value); /* Right */ @@ -363,9 +366,9 @@ void ExtrapolatedSmootherTake::nodeBuildTridiagonalSolverMatrice /* Center: (Left, Right, Bottom, Top) */ row = center_index; column = center_index; - value = 0.25 * (h1 + h2) * (k1 + k2) * coeff_beta[center] * std::fabs(detDF[center]) + - coeff1 * (arr[center] + arr[left]) + coeff2 * (arr[center] + arr[right]) + - coeff3 * (att[center] + att[bottom]) + coeff4 * (att[center] + att[top]); + value = coeff5 * coeff_beta[center] * std::fabs(detDF[center]) + coeff1 * (arr[center] + arr[left]) + + coeff2 * (arr[center] + arr[right]) + coeff3 * (att[center] + att[bottom]) + + coeff4 * (att[center] + att[top]); updateMatrixElement(solver, batch, row, column, value); } else { @@ -399,6 +402,7 @@ void ExtrapolatedSmootherTake::nodeBuildTridiagonalSolverMatrice double coeff2 = 0.5 * (k1 + k2) / h2; double coeff3 = 0.5 * (h1 + h2) / k1; double coeff4 = 0.5 * (h1 + h2) / k2; + double coeff5 = 0.25 * (h1 + h2) * (k1 + k2); const int i_theta_M1 = grid.wrapThetaIndex(i_theta - 1); const int i_theta_P1 = grid.wrapThetaIndex(i_theta + 1); @@ -429,9 +433,9 @@ void ExtrapolatedSmootherTake::nodeBuildTridiagonalSolverMatrice /* Center: (Left, Right, Bottom, Top) */ row = center_index; column = center_index; - value = 0.25 * (h1 + h2) * (k1 + k2) * coeff_beta[center] * std::fabs(detDF[center]) + - coeff1 * (arr[center] + arr[left]) + coeff2 * (arr[center] + arr[right]) + - coeff3 * (att[center] + att[bottom]) + coeff4 * (att[center] + att[top]); + value = coeff5 * coeff_beta[center] * std::fabs(detDF[center]) + coeff1 * (arr[center] + arr[left]) + + coeff2 * (arr[center] + arr[right]) + coeff3 * (att[center] + att[bottom]) + + coeff4 * (att[center] + att[top]); updateMatrixElement(solver, batch, row, column, value); /* Left */ @@ -457,9 +461,9 @@ void ExtrapolatedSmootherTake::nodeBuildTridiagonalSolverMatrice /* Center: (Left, Right, Bottom, Top) */ row = center_index; column = center_index; - value = 0.25 * (h1 + h2) * (k1 + k2) * coeff_beta[center] * std::fabs(detDF[center]) + - coeff1 * (arr[center] + arr[left]) + coeff2 * (arr[center] + arr[right]) + - coeff3 * (att[center] + att[bottom]) + coeff4 * (att[center] + att[top]); + value = coeff5 * coeff_beta[center] * std::fabs(detDF[center]) + coeff1 * (arr[center] + arr[left]) + + coeff2 * (arr[center] + arr[right]) + coeff3 * (att[center] + att[bottom]) + + coeff4 * (att[center] + att[top]); updateMatrixElement(solver, batch, row, column, value); } } @@ -521,7 +525,6 @@ void ExtrapolatedSmootherTake::buildTridiagonalSolverMatrices() const PolarGrid& grid = ExtrapolatedSmoother::grid_; const LevelCacheType& level_cache = ExtrapolatedSmoother::level_cache_; const bool DirBC_Interior = ExtrapolatedSmoother::DirBC_Interior_; - const int num_omp_threads = ExtrapolatedSmoother::num_omp_threads_; assert(level_cache.cacheDensityProfileCoefficients()); assert(level_cache.cacheDomainGeometry()); @@ -532,22 +535,37 @@ void ExtrapolatedSmootherTake::buildTridiagonalSolverMatrices() ConstVector detDF = level_cache.detDF(); ConstVector coeff_beta = level_cache.coeff_beta(); -#pragma omp parallel num_threads(num_omp_threads) - { -#pragma omp for nowait - for (int i_r = 0; i_r < grid.numberSmootherCircles(); i_r++) { - for (int i_theta = 0; i_theta < grid.ntheta(); i_theta++) { - nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, DirBC_Interior, circle_tridiagonal_solver_, - radial_tridiagonal_solver_, arr, att, art, detDF, coeff_beta); - } - } - -#pragma omp for nowait - for (int i_theta = 0; i_theta < grid.ntheta(); i_theta++) { - for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) { - nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, DirBC_Interior, circle_tridiagonal_solver_, - radial_tridiagonal_solver_, arr, att, art, detDF, coeff_beta); - } - } - } + const BatchedTridiagonalSolver& circle_tridiagonal_solver = circle_tridiagonal_solver_; + const BatchedTridiagonalSolver& radial_tridiagonal_solver = radial_tridiagonal_solver_; + + /* We split the loops into two regions to better respect the */ + /* access patterns of the smoother and improve cache locality. */ + + // The For loop matches circular access pattern */ + Kokkos::parallel_for( + "Extrapolated Smoother Take: Build Tridiagonal Matrices (Circular)", + Kokkos::MDRangePolicy>( // Rank of the index space + {0, 0}, // Starting point of the index space + {grid.numberSmootherCircles(), grid.ntheta()} // Ending point of the index space + ), + // Kokkos lambda function to execute for each point in the index space + KOKKOS_LAMBDA(const int i_r, const int i_theta) { + nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, DirBC_Interior, circle_tridiagonal_solver, + radial_tridiagonal_solver, arr, att, art, detDF, coeff_beta); + }); + + /* For loop matches radial access pattern */ + Kokkos::parallel_for( + "Extrapolated Smoother Take: Build Tridiagonal Matrices (Radial)", + Kokkos::MDRangePolicy>( // Rank of the index space + {0, grid.numberSmootherCircles()}, // Starting point of the index space + {grid.ntheta(), grid.nr()} // Ending point of the index space + ), + // Kokkos lambda function to execute for each point in the index space + KOKKOS_LAMBDA(const int i_theta, const int i_r) { + nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, DirBC_Interior, circle_tridiagonal_solver, + radial_tridiagonal_solver, arr, att, art, detDF, coeff_beta); + }); + + Kokkos::fence(); } diff --git a/include/ExtrapolatedSmoother/ExtrapolatedSmootherTake/extrapolatedSmootherTake.h b/include/ExtrapolatedSmoother/ExtrapolatedSmootherTake/extrapolatedSmootherTake.h index c14dc32a..3a2bbdeb 100644 --- a/include/ExtrapolatedSmoother/ExtrapolatedSmootherTake/extrapolatedSmootherTake.h +++ b/include/ExtrapolatedSmoother/ExtrapolatedSmootherTake/extrapolatedSmootherTake.h @@ -135,6 +135,8 @@ class ExtrapolatedSmootherTake : public ExtrapolatedSmoother // Solver object (owns matrix if MUMPS, references if in-house solver). InnerBoundarySolver inner_boundary_solver_; + // Public is required as Cuda needs to be able to get the address of functions enclosing lambda functions +public: /* -------------- */ /* Stencil access */ /* -------------- */ @@ -153,12 +155,12 @@ class ExtrapolatedSmootherTake : public ExtrapolatedSmoother // Build all A_sc matrices for circle and radial smoothers. void buildTridiagonalSolverMatrices(); // Build the tridiagonal solver matrices for a specific node (i_r, i_theta) - void nodeBuildTridiagonalSolverMatrices(int i_r, int i_theta, const PolarGrid& grid, bool DirBC_Interior, - BatchedTridiagonalSolver& circle_tridiagonal_solver, - BatchedTridiagonalSolver& radial_tridiagonal_solver, - ConstVector& arr, ConstVector& att, - ConstVector& art, ConstVector& detDF, - ConstVector& coeff_beta); + static KOKKOS_FUNCTION void + nodeBuildTridiagonalSolverMatrices(int i_r, int i_theta, const PolarGrid& grid, bool DirBC_Interior, + const BatchedTridiagonalSolver& circle_tridiagonal_solver, + const BatchedTridiagonalSolver& radial_tridiagonal_solver, + ConstVector& arr, ConstVector& att, ConstVector& art, + ConstVector& detDF, ConstVector& coeff_beta); // Build the solver matrix for the interior boundary (i_r = 0) which is non-tridiagonal due to across-origin coupling. InnerBoundaryMatrix buildInteriorBoundarySolverMatrix(); diff --git a/include/LinearAlgebra/Solvers/tridiagonal_solver.h b/include/LinearAlgebra/Solvers/tridiagonal_solver.h index 77d8601c..a969d543 100644 --- a/include/LinearAlgebra/Solvers/tridiagonal_solver.h +++ b/include/LinearAlgebra/Solvers/tridiagonal_solver.h @@ -30,12 +30,12 @@ class BatchedTridiagonalSolver /* Accessors for sizes */ /* ------------------- */ - int matrixDimension() const + KOKKOS_INLINE_FUNCTION int matrixDimension() const { return matrix_dimension_; } - int batchCount() const + KOKKOS_INLINE_FUNCTION int batchCount() const { return batch_count_; } @@ -44,39 +44,44 @@ class BatchedTridiagonalSolver /* Accessors for matrix entries */ /* ---------------------------- */ - KOKKOS_INLINE_FUNCTION - const T& main_diagonal(const int batch_idx, const int index) const + KOKKOS_INLINE_FUNCTION const T& main_diagonal(const int batch_idx, const int index) const { return main_diagonal_(batch_idx * matrix_dimension_ + index); } - KOKKOS_INLINE_FUNCTION - T& main_diagonal(const int batch_idx, const int index) + KOKKOS_INLINE_FUNCTION void set_main_diagonal(const int batch_idx, const int index, const T& value) const { - return main_diagonal_(batch_idx * matrix_dimension_ + index); + main_diagonal_(batch_idx * matrix_dimension_ + index) = value; + } + KOKKOS_INLINE_FUNCTION void increase_main_diagonal(const int batch_idx, const int index, const T& value) const + { + main_diagonal_(batch_idx * matrix_dimension_ + index) += value; } - KOKKOS_INLINE_FUNCTION - const T& sub_diagonal(const int batch_idx, const int index) const + KOKKOS_INLINE_FUNCTION const T& sub_diagonal(const int batch_idx, const int index) const { return sub_diagonal_(batch_idx * matrix_dimension_ + index); } - KOKKOS_INLINE_FUNCTION - T& sub_diagonal(const int batch_idx, const int index) + KOKKOS_INLINE_FUNCTION void set_sub_diagonal(const int batch_idx, const int index, const T& value) const { - return sub_diagonal_(batch_idx * matrix_dimension_ + index); + sub_diagonal_(batch_idx * matrix_dimension_ + index) = value; } - - KOKKOS_INLINE_FUNCTION - const T& cyclic_corner(const int batch_idx) const + KOKKOS_INLINE_FUNCTION void increase_sub_diagonal(const int batch_idx, const int index, const T& value) const { - return sub_diagonal_(batch_idx * matrix_dimension_ + (matrix_dimension_ - 1)); + sub_diagonal_(batch_idx * matrix_dimension_ + index) += value; } - KOKKOS_INLINE_FUNCTION - T& cyclic_corner(const int batch_idx) + KOKKOS_INLINE_FUNCTION const T& cyclic_corner(const int batch_idx) const { return sub_diagonal_(batch_idx * matrix_dimension_ + (matrix_dimension_ - 1)); } + KOKKOS_INLINE_FUNCTION T& set_cyclic_corner(const int batch_idx, const T& value) const + { + return sub_diagonal_(batch_idx * matrix_dimension_ + (matrix_dimension_ - 1)) = value; + } + KOKKOS_INLINE_FUNCTION void increase_cyclic_corner(const int batch_idx, const T& value) const + { + sub_diagonal_(batch_idx * matrix_dimension_ + (matrix_dimension_ - 1)) += value; + } /* ---------------------------------------------- */ /* Setup: Cholesky Decomposition: A = L * D * L^T */ diff --git a/include/Smoother/SmootherGive/buildTridiagonalAsc.inl b/include/Smoother/SmootherGive/buildTridiagonalAsc.inl index b2bc94bc..53b9f2ba 100644 --- a/include/Smoother/SmootherGive/buildTridiagonalAsc.inl +++ b/include/Smoother/SmootherGive/buildTridiagonalAsc.inl @@ -3,25 +3,24 @@ namespace smoother_give { -static inline void updateMatrixElement(BatchedTridiagonalSolver& solver, int batch, int row, int column, - double value) +static KOKKOS_INLINE_FUNCTION void updateMatrixElement(const BatchedTridiagonalSolver& solver, int batch, + int row, int column, double value) { if (row == column) - solver.main_diagonal(batch, row) += value; + solver.increase_main_diagonal(batch, row, value); else if (row == column - 1) - solver.sub_diagonal(batch, row) += value; + solver.increase_sub_diagonal(batch, row, value); else if (row == 0 && column == solver.matrixDimension() - 1) - solver.cyclic_corner(batch) += value; + solver.increase_cyclic_corner(batch, value); } } // namespace smoother_give template -void SmootherGive::nodeBuildTridiagonalSolverMatrices( - int i_r, int i_theta, const PolarGrid& grid, bool DirBC_Interior, - BatchedTridiagonalSolver& circle_tridiagonal_solver, - BatchedTridiagonalSolver& radial_tridiagonal_solver, double arr, double att, double art, double detDF, - double coeff_beta) +KOKKOS_FUNCTION void SmootherGive::nodeBuildTridiagonalSolverMatrices( + int i_r, int i_theta, const PolarGrid& grid, const LevelCacheType& level_cache, bool DirBC_Interior, + const BatchedTridiagonalSolver& circle_tridiagonal_solver, + const BatchedTridiagonalSolver& radial_tridiagonal_solver) { using smoother_give::updateMatrixElement; @@ -34,6 +33,16 @@ void SmootherGive::nodeBuildTridiagonalSolverMatrices( assert(numberSmootherCircles >= 2); assert(lengthRadialSmoother >= 3); + /* ---------------------------------------- */ + /* Compute or retrieve stencil coefficients */ + /* ---------------------------------------- */ + const int center = grid.index(i_r, i_theta); + const double radius = grid.radius(i_r); + const double theta = grid.theta(i_theta); + + double coeff_beta, arr, att, art, detDF; + level_cache.obtainValues(i_r, i_theta, center, radius, theta, coeff_beta, arr, att, art, detDF); + int ptr, offset; int row, column, col; double value, val; @@ -465,108 +474,72 @@ void SmootherGive::nodeBuildTridiagonalSolverMatrices( } } -template -void SmootherGive::buildTridiagonalCircleSection(int i_r) -{ - const PolarGrid& grid = Smoother::grid_; - const LevelCacheType& level_cache = Smoother::level_cache_; - const bool DirBC_Interior = Smoother::DirBC_Interior_; - - // Access pattern is aligned with the memory layout of the grid data to maximize cache efficiency. - const double r = grid.radius(i_r); - for (int i_theta = 0; i_theta < grid.ntheta(); i_theta++) { - const int global_index = grid.index(i_r, i_theta); - const double theta = grid.theta(i_theta); - - double coeff_beta, arr, att, art, detDF; - level_cache.obtainValues(i_r, i_theta, global_index, r, theta, coeff_beta, arr, att, art, detDF); - - // Build Asc at the current node - nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, DirBC_Interior, circle_tridiagonal_solver_, - radial_tridiagonal_solver_, arr, att, art, detDF, coeff_beta); - } -} - -template -void SmootherGive::buildTridiagonalRadialSection(int i_theta) -{ - const PolarGrid& grid = Smoother::grid_; - const LevelCacheType& level_cache = Smoother::level_cache_; - const bool DirBC_Interior = Smoother::DirBC_Interior_; - - // Access pattern is aligned with the memory layout of the grid data to maximize cache efficiency. - const double theta = grid.theta(i_theta); - for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) { - const int global_index = grid.index(i_r, i_theta); - const double r = grid.radius(i_r); - - double coeff_beta, arr, att, art, detDF; - level_cache.obtainValues(i_r, i_theta, global_index, r, theta, coeff_beta, arr, att, art, detDF); - - // Build Asc at the current node - nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, DirBC_Interior, circle_tridiagonal_solver_, - radial_tridiagonal_solver_, arr, att, art, detDF, coeff_beta); - } -} - template void SmootherGive::buildTridiagonalSolverMatrices() { const PolarGrid& grid = Smoother::grid_; const LevelCacheType& level_cache = Smoother::level_cache_; const bool DirBC_Interior = Smoother::DirBC_Interior_; - const int num_omp_threads = Smoother::num_omp_threads_; - const int num_smoother_circles = grid.numberSmootherCircles(); - const int additional_radial_tasks = grid.ntheta() % 3; - const int num_radial_tasks = grid.ntheta() - additional_radial_tasks; + const BatchedTridiagonalSolver& circle_tridiagonal_solver = circle_tridiagonal_solver_; + const BatchedTridiagonalSolver& radial_tridiagonal_solver = radial_tridiagonal_solver_; /* ---------------- */ /* Circular section */ /* ---------------- */ - // We parallelize the loop with step 3 to avoid data race conditions between adjacent circles. -#pragma omp parallel num_threads(num_omp_threads) - { -#pragma omp for - for (int i_r = 0; i_r < num_smoother_circles; i_r += 3) { - buildTridiagonalCircleSection(i_r); - } /* Implicit barrier */ -#pragma omp for - for (int i_r = 1; i_r < num_smoother_circles; i_r += 3) { - buildTridiagonalCircleSection(i_r); - } /* Implicit barrier */ -#pragma omp for - for (int i_r = 2; i_r < num_smoother_circles; i_r += 3) { - buildTridiagonalCircleSection(i_r); - } /* Implicit barrier */ + // We parallelize over i_r (step 3) to avoid data race conditions between adjacent circles. + // The i_theta loop is sequential inside the kernel. + const int num_circle_tasks = grid.numberSmootherCircles(); + + for (int start_circle = 0; start_circle < 3; ++start_circle) { + const int num_circular_tasks = (num_circle_tasks - start_circle + 2) / 3; + Kokkos::parallel_for( + "SmootherGive: buildTridiagonalSolverMatrices (Circular)", + Kokkos::RangePolicy(0, num_circular_tasks), + KOKKOS_LAMBDA(const int circle_task) { + const int i_r = start_circle + circle_task * 3; + for (int i_theta = 0; i_theta < grid.ntheta(); i_theta++) { + nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, level_cache, DirBC_Interior, + circle_tridiagonal_solver, radial_tridiagonal_solver); + } + }); + Kokkos::fence(); } - /* ---------------- */ + /* -------------- */ /* Radial section */ - /* ---------------- */ - // We parallelize the loop with step 3 to avoid data race conditions between adjacent radial lines. - // Due to the periodicity in the angular direction, we can have at most 2 additional radial tasks - // that are handled serially before the parallel loops. + /* -------------- */ + // We parallelize over i_theta (step 3) to avoid data race conditions between adjacent radial lines. + // The i_r loop is sequential inside the kernel. + // Due to periodicity in the angular direction, handle up to 2 additional + // radial lines (i_theta = 0 and 1) before the parallel passes. + const int additional_radial_tasks = grid.ntheta() % 3; + const int num_radial_tasks = grid.ntheta() - additional_radial_tasks; + for (int i_theta = 0; i_theta < additional_radial_tasks; i_theta++) { - buildTridiagonalRadialSection(i_theta); + Kokkos::parallel_for( + "SmootherGive: buildTridiagonalSolverMatrices (Radial, additional)", + Kokkos::RangePolicy(0, 1), KOKKOS_LAMBDA(const int) { + for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) { + nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, level_cache, DirBC_Interior, + circle_tridiagonal_solver, radial_tridiagonal_solver); + } + }); + Kokkos::fence(); } -#pragma omp parallel num_threads(num_omp_threads) - { -#pragma omp for - for (int radial_task = 0; radial_task < num_radial_tasks; radial_task += 3) { - const int i_theta = radial_task + additional_radial_tasks; - buildTridiagonalRadialSection(i_theta); - } /* Implicit barrier */ -#pragma omp for - for (int radial_task = 1; radial_task < num_radial_tasks; radial_task += 3) { - const int i_theta = radial_task + additional_radial_tasks; - buildTridiagonalRadialSection(i_theta); - } /* Implicit barrier */ -#pragma omp for - for (int radial_task = 2; radial_task < num_radial_tasks; radial_task += 3) { - const int i_theta = radial_task + additional_radial_tasks; - buildTridiagonalRadialSection(i_theta); - } /* Implicit barrier */ + for (int start_radial = 0; start_radial < 3; ++start_radial) { + const int num_radial_batches = (num_radial_tasks - start_radial + 2) / 3; + Kokkos::parallel_for( + "SmootherGive: buildTridiagonalSolverMatrices (Radial)", + Kokkos::RangePolicy(0, num_radial_batches), + KOKKOS_LAMBDA(const int radial_task) { + const int i_theta = additional_radial_tasks + start_radial + radial_task * 3; + for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) { + nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, level_cache, DirBC_Interior, + circle_tridiagonal_solver, radial_tridiagonal_solver); + } + }); + Kokkos::fence(); } } diff --git a/include/Smoother/SmootherGive/smootherGive.h b/include/Smoother/SmootherGive/smootherGive.h index c2988286..b7f2d013 100644 --- a/include/Smoother/SmootherGive/smootherGive.h +++ b/include/Smoother/SmootherGive/smootherGive.h @@ -130,6 +130,8 @@ class SmootherGive : public Smoother // Solver object (owns matrix if MUMPS, references if in-house solver). InnerBoundarySolver inner_boundary_solver_; + // Public is required as Cuda needs to be able to get the address of functions enclosing lambda functions +public: /* -------------- */ /* Stencil access */ /* -------------- */ @@ -147,13 +149,12 @@ class SmootherGive : public Smoother /* --------------- */ // Build all A_sc matrices for circle and radial smoothers. void buildTridiagonalSolverMatrices(); - void buildTridiagonalCircleSection(int i_r); - void buildTridiagonalRadialSection(int i_theta); // Build the tridiagonal solver matrices for a specific node (i_r, i_theta) - void nodeBuildTridiagonalSolverMatrices(int i_r, int i_theta, const PolarGrid& grid, bool DirBC_Interior, - BatchedTridiagonalSolver& circle_tridiagonal_solver, - BatchedTridiagonalSolver& radial_tridiagonal_solver, double arr, - double att, double art, double detDF, double coeff_beta); + static KOKKOS_FUNCTION void + nodeBuildTridiagonalSolverMatrices(int i_r, int i_theta, const PolarGrid& grid, const LevelCacheType& level_cache, + bool DirBC_Interior, + const BatchedTridiagonalSolver& circle_tridiagonal_solver, + const BatchedTridiagonalSolver& radial_tridiagonal_solver); // Build the solver matrix for the interior boundary (i_r = 0) which is non-tridiagonal due to across-origin coupling. InnerBoundaryMatrix buildInteriorBoundarySolverMatrix(); diff --git a/include/Smoother/SmootherTake/buildTridiagonalAsc.inl b/include/Smoother/SmootherTake/buildTridiagonalAsc.inl index 9fd360b2..3c7cd9b4 100644 --- a/include/Smoother/SmootherTake/buildTridiagonalAsc.inl +++ b/include/Smoother/SmootherTake/buildTridiagonalAsc.inl @@ -3,25 +3,25 @@ namespace smoother_take { -static inline void updateMatrixElement(BatchedTridiagonalSolver& solver, int batch, int row, int column, - double value) +static KOKKOS_INLINE_FUNCTION void updateMatrixElement(const BatchedTridiagonalSolver& solver, int batch, + int row, int column, double value) { if (row == column) - solver.main_diagonal(batch, row) = value; + solver.set_main_diagonal(batch, row, value); else if (row == column - 1) - solver.sub_diagonal(batch, row) = value; + solver.set_sub_diagonal(batch, row, value); else if (row == 0 && column == solver.matrixDimension() - 1) - solver.cyclic_corner(batch) = value; + solver.set_cyclic_corner(batch, value); } } // namespace smoother_take template -void SmootherTake::nodeBuildTridiagonalSolverMatrices( +KOKKOS_FUNCTION void SmootherTake::nodeBuildTridiagonalSolverMatrices( int i_r, int i_theta, const PolarGrid& grid, bool DirBC_Interior, - BatchedTridiagonalSolver& circle_tridiagonal_solver, - BatchedTridiagonalSolver& radial_tridiagonal_solver, ConstVector& arr, ConstVector& att, - ConstVector& art, ConstVector& detDF, ConstVector& coeff_beta) + const BatchedTridiagonalSolver& circle_tridiagonal_solver, + const BatchedTridiagonalSolver& radial_tridiagonal_solver, ConstVector& arr, + ConstVector& att, ConstVector& art, ConstVector& detDF, ConstVector& coeff_beta) { using smoother_take::updateMatrixElement; @@ -308,7 +308,6 @@ void SmootherTake::buildTridiagonalSolverMatrices() const PolarGrid& grid = Smoother::grid_; const LevelCacheType& level_cache = Smoother::level_cache_; const bool DirBC_Interior = Smoother::DirBC_Interior_; - const int num_omp_threads = Smoother::num_omp_threads_; assert(level_cache.cacheDensityProfileCoefficients()); assert(level_cache.cacheDomainGeometry()); @@ -319,22 +318,37 @@ void SmootherTake::buildTridiagonalSolverMatrices() ConstVector detDF = level_cache.detDF(); ConstVector coeff_beta = level_cache.coeff_beta(); -#pragma omp parallel num_threads(num_omp_threads) - { -#pragma omp for nowait - for (int i_r = 0; i_r < grid.numberSmootherCircles(); i_r++) { - for (int i_theta = 0; i_theta < grid.ntheta(); i_theta++) { - nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, DirBC_Interior, circle_tridiagonal_solver_, - radial_tridiagonal_solver_, arr, att, art, detDF, coeff_beta); - } - } - -#pragma omp for nowait - for (int i_theta = 0; i_theta < grid.ntheta(); i_theta++) { - for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) { - nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, DirBC_Interior, circle_tridiagonal_solver_, - radial_tridiagonal_solver_, arr, att, art, detDF, coeff_beta); - } - } - } -} \ No newline at end of file + const BatchedTridiagonalSolver& circle_tridiagonal_solver = circle_tridiagonal_solver_; + const BatchedTridiagonalSolver& radial_tridiagonal_solver = radial_tridiagonal_solver_; + + /* We split the loops into two regions to better respect the */ + /* access patterns of the smoother and improve cache locality. */ + + // The For loop matches circular access pattern */ + Kokkos::parallel_for( + "Extrapolated Smoother Take: Build Tridiagonal Matrices (Circular)", + Kokkos::MDRangePolicy>( // Rank of the index space + {0, 0}, // Starting point of the index space + {grid.numberSmootherCircles(), grid.ntheta()} // Ending point of the index space + ), + // Kokkos lambda function to execute for each point in the index space + KOKKOS_LAMBDA(const int i_r, const int i_theta) { + nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, DirBC_Interior, circle_tridiagonal_solver, + radial_tridiagonal_solver, arr, att, art, detDF, coeff_beta); + }); + + /* For loop matches radial access pattern */ + Kokkos::parallel_for( + "Extrapolated Smoother Take: Build Tridiagonal Matrices (Radial)", + Kokkos::MDRangePolicy>( // Rank of the index space + {0, grid.numberSmootherCircles()}, // Starting point of the index space + {grid.ntheta(), grid.nr()} // Ending point of the index space + ), + // Kokkos lambda function to execute for each point in the index space + KOKKOS_LAMBDA(const int i_theta, const int i_r) { + nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, DirBC_Interior, circle_tridiagonal_solver, + radial_tridiagonal_solver, arr, att, art, detDF, coeff_beta); + }); + + Kokkos::fence(); +} diff --git a/include/Smoother/SmootherTake/smootherTake.h b/include/Smoother/SmootherTake/smootherTake.h index 763a406f..01cd9542 100644 --- a/include/Smoother/SmootherTake/smootherTake.h +++ b/include/Smoother/SmootherTake/smootherTake.h @@ -130,6 +130,8 @@ class SmootherTake : public Smoother // Solver object (owns matrix if MUMPS, references if in-house solver). InnerBoundarySolver inner_boundary_solver_; + // Public is required as Cuda needs to be able to get the address of functions enclosing lambda functions +public: /* -------------- */ /* Stencil access */ /* -------------- */ @@ -148,12 +150,12 @@ class SmootherTake : public Smoother // Build all A_sc matrices for circle and radial smoothers. void buildTridiagonalSolverMatrices(); // Build the tridiagonal solver matrices for a specific node (i_r, i_theta) - void nodeBuildTridiagonalSolverMatrices(int i_r, int i_theta, const PolarGrid& grid, bool DirBC_Interior, - BatchedTridiagonalSolver& circle_tridiagonal_solver, - BatchedTridiagonalSolver& radial_tridiagonal_solver, - ConstVector& arr, ConstVector& att, - ConstVector& art, ConstVector& detDF, - ConstVector& coeff_beta); + static KOKKOS_FUNCTION void + nodeBuildTridiagonalSolverMatrices(int i_r, int i_theta, const PolarGrid& grid, bool DirBC_Interior, + const BatchedTridiagonalSolver& circle_tridiagonal_solver, + const BatchedTridiagonalSolver& radial_tridiagonal_solver, + ConstVector& arr, ConstVector& att, ConstVector& art, + ConstVector& detDF, ConstVector& coeff_beta); // Build the solver matrix for the interior boundary (i_r = 0) which is non-tridiagonal due to across-origin coupling. InnerBoundaryMatrix buildInteriorBoundarySolverMatrix(); @@ -167,8 +169,6 @@ class SmootherTake : public Smoother /* Orthogonal application */ /* ---------------------- */ - // Public is required as Cuda needs to be able to get the address of functions enclosing lambda functions -public: // Compute temp = f_sc − A_sc^ortho * u_sc^ortho (precomputed right-hand side) // where x = u_sc and rhs = f_sc void applyAscOrthoBlackCircleSection(ConstVector x, ConstVector rhs, Vector temp); diff --git a/tests/LinearAlgebra/Solvers/tridiagonal_solver.cpp b/tests/LinearAlgebra/Solvers/tridiagonal_solver.cpp index dc0c9d92..d334aa61 100644 --- a/tests/LinearAlgebra/Solvers/tridiagonal_solver.cpp +++ b/tests/LinearAlgebra/Solvers/tridiagonal_solver.cpp @@ -25,25 +25,25 @@ TEST(BatchedTridiagonalSolvers, non_cyclic_tridiagonal_n_4) // System 4: {{5,1,0,0},{1,7,2,0},{0,2,9,3},{0,0,3,11}} * {{a},{b},{c},{d}} = {{4},{5},{6},{7}} // a = 248/355, b = 36/71, c = 267/710, d = 379/710 - solver.main_diagonal(0,0) = 2.0; solver.sub_diagonal(0,0) = 1.0; - solver.main_diagonal(0,1) = 4.0; solver.sub_diagonal(0,1) = 2.0; - solver.main_diagonal(0,2) = 6.0; solver.sub_diagonal(0,2) = 3.0; - solver.main_diagonal(0,3) = 8.0; - - solver.main_diagonal(1,0) = 3.0; solver.sub_diagonal(1,0) = 1.0; - solver.main_diagonal(1,1) = 5.0; solver.sub_diagonal(1,1) = 2.0; - solver.main_diagonal(1,2) = 7.0; solver.sub_diagonal(1,2) = 3.0; - solver.main_diagonal(1,3) = 9.0; - - solver.main_diagonal(2,0) = 4.0; solver.sub_diagonal(2,0) = 1.0; - solver.main_diagonal(2,1) = 6.0; solver.sub_diagonal(2,1) = 2.0; - solver.main_diagonal(2,2) = 8.0; solver.sub_diagonal(2,2) = 3.0; - solver.main_diagonal(2,3) = 10.0; - - solver.main_diagonal(3,0) = 5.0; solver.sub_diagonal(3,0) = 1.0; - solver.main_diagonal(3,1) = 7.0; solver.sub_diagonal(3,1) = 2.0; - solver.main_diagonal(3,2) = 9.0; solver.sub_diagonal(3,2) = 3.0; - solver.main_diagonal(3,3) = 11.0; + solver.set_main_diagonal(0,0, 2.0); solver.set_sub_diagonal(0,0, 1.0); + solver.set_main_diagonal(0,1, 4.0); solver.set_sub_diagonal(0,1, 2.0); + solver.set_main_diagonal(0,2, 6.0); solver.set_sub_diagonal(0,2, 3.0); + solver.set_main_diagonal(0,3, 8.0); + + solver.set_main_diagonal(1,0, 3.0); solver.set_sub_diagonal(1,0, 1.0); + solver.set_main_diagonal(1,1, 5.0); solver.set_sub_diagonal(1,1, 2.0); + solver.set_main_diagonal(1,2, 7.0); solver.set_sub_diagonal(1,2, 3.0); + solver.set_main_diagonal(1,3, 9.0); + + solver.set_main_diagonal(2,0, 4.0); solver.set_sub_diagonal(2,0, 1.0); + solver.set_main_diagonal(2,1, 6.0); solver.set_sub_diagonal(2,1, 2.0); + solver.set_main_diagonal(2,2, 8.0); solver.set_sub_diagonal(2,2, 3.0); + solver.set_main_diagonal(2,3, 10.0); + + solver.set_main_diagonal(3,0, 5.0); solver.set_sub_diagonal(3,0, 1.0); + solver.set_main_diagonal(3,1, 7.0); solver.set_sub_diagonal(3,1, 2.0); + solver.set_main_diagonal(3,2, 9.0); solver.set_sub_diagonal(3,2, 3.0); + solver.set_main_diagonal(3,3, 11.0); Vector rhs("rhs", matrix_dimension * batch_count); @@ -104,25 +104,25 @@ TEST(BatchedTridiagonalSolvers, cyclic_tridiagonal_n_4) // System 4: {{5,1,0,-4},{1,7,2,0},{0,2,9,3},{-4,0,3,11}} * {{a},{b},{c},{d}} = {{4},{5},{6},{7}} // a = 271/162, b = 23/54, c = 14/81, d = 97/81 - solver.main_diagonal(0,0) = 2.0; solver.sub_diagonal(0,0) = 1.0; - solver.main_diagonal(0,1) = 4.0; solver.sub_diagonal(0,1) = 2.0; - solver.main_diagonal(0,2) = 6.0; solver.sub_diagonal(0,2) = 3.0; - solver.main_diagonal(0,3) = 8.0; solver.cyclic_corner(0) = -1.0; + solver.set_main_diagonal(0,0, 2.0); solver.set_sub_diagonal(0,0, 1.0); + solver.set_main_diagonal(0,1, 4.0); solver.set_sub_diagonal(0,1, 2.0); + solver.set_main_diagonal(0,2, 6.0); solver.set_sub_diagonal(0,2, 3.0); + solver.set_main_diagonal(0,3, 8.0); solver.set_cyclic_corner(0, -1.0); - solver.main_diagonal(1,0) = 3.0; solver.sub_diagonal(1,0) = 1.0; - solver.main_diagonal(1,1) = 5.0; solver.sub_diagonal(1,1) = 2.0; - solver.main_diagonal(1,2) = 7.0; solver.sub_diagonal(1,2) = 3.0; - solver.main_diagonal(1,3) = 9.0; solver.cyclic_corner(1) = -2.0; + solver.set_main_diagonal(1,0, 3.0); solver.set_sub_diagonal(1,0, 1.0); + solver.set_main_diagonal(1,1, 5.0); solver.set_sub_diagonal(1,1, 2.0); + solver.set_main_diagonal(1,2, 7.0); solver.set_sub_diagonal(1,2, 3.0); + solver.set_main_diagonal(1,3, 9.0); solver.set_cyclic_corner(1, -2.0); - solver.main_diagonal(2,0) = 4.0; solver.sub_diagonal(2,0) = 1.0; - solver.main_diagonal(2,1) = 6.0; solver.sub_diagonal(2,1) = 2.0; - solver.main_diagonal(2,2) = 8.0; solver.sub_diagonal(2,2) = 3.0; - solver.main_diagonal(2,3) = 10.0; solver.cyclic_corner(2) = -3.0; + solver.set_main_diagonal(2,0, 4.0); solver.set_sub_diagonal(2,0, 1.0); + solver.set_main_diagonal(2,1, 6.0); solver.set_sub_diagonal(2,1, 2.0); + solver.set_main_diagonal(2,2, 8.0); solver.set_sub_diagonal(2,2, 3.0); + solver.set_main_diagonal(2,3, 10.0); solver.set_cyclic_corner(2, -3.0); - solver.main_diagonal(3,0) = 5.0; solver.sub_diagonal(3,0) = 1.0; - solver.main_diagonal(3,1) = 7.0; solver.sub_diagonal(3,1) = 2.0; - solver.main_diagonal(3,2) = 9.0; solver.sub_diagonal(3,2) = 3.0; - solver.main_diagonal(3,3) = 11.0; solver.cyclic_corner(3) = -4.0; + solver.set_main_diagonal(3,0, 5.0); solver.set_sub_diagonal(3,0, 1.0); + solver.set_main_diagonal(3,1, 7.0); solver.set_sub_diagonal(3,1, 2.0); + solver.set_main_diagonal(3,2, 9.0); solver.set_sub_diagonal(3,2, 3.0); + solver.set_main_diagonal(3,3, 11.0); solver.set_cyclic_corner(3, -4.0); Vector rhs("rhs", matrix_dimension * batch_count); @@ -174,25 +174,25 @@ TEST(BatchedTridiagonalSolvers, non_cyclic_diagonal_n_4) BatchedTridiagonalSolver solver(matrix_dimension, batch_count, is_cyclic); - solver.main_diagonal(0,0) = 2.0; - solver.main_diagonal(0,1) = 4.0; - solver.main_diagonal(0,2) = 6.0; - solver.main_diagonal(0,3) = 8.0; + solver.set_main_diagonal(0,0, 2.0); + solver.set_main_diagonal(0,1, 4.0); + solver.set_main_diagonal(0,2, 6.0); + solver.set_main_diagonal(0,3, 8.0); - solver.main_diagonal(1,0) = 3.0; - solver.main_diagonal(1,1) = 5.0; - solver.main_diagonal(1,2) = 7.0; - solver.main_diagonal(1,3) = 9.0; + solver.set_main_diagonal(1,0, 3.0); + solver.set_main_diagonal(1,1, 5.0); + solver.set_main_diagonal(1,2, 7.0); + solver.set_main_diagonal(1,3, 9.0); - solver.main_diagonal(2,0) = 4.0; - solver.main_diagonal(2,1) = 6.0; - solver.main_diagonal(2,2) = 8.0; - solver.main_diagonal(2,3) = 10.0; + solver.set_main_diagonal(2,0, 4.0); + solver.set_main_diagonal(2,1, 6.0); + solver.set_main_diagonal(2,2, 8.0); + solver.set_main_diagonal(2,3, 10.0); - solver.main_diagonal(3,0) = 5.0; - solver.main_diagonal(3,1) = 7.0; - solver.main_diagonal(3,2) = 9.0; - solver.main_diagonal(3,3) = 11.0; + solver.set_main_diagonal(3,0, 5.0); + solver.set_main_diagonal(3,1, 7.0); + solver.set_main_diagonal(3,2, 9.0); + solver.set_main_diagonal(3,3, 11.0); Vector rhs("rhs", matrix_dimension * batch_count); @@ -244,25 +244,25 @@ TEST(BatchedTridiagonalSolvers, cyclic_diagonal_n_4) BatchedTridiagonalSolver solver(matrix_dimension, batch_count, is_cyclic); - solver.main_diagonal(0,0) = 2.0; - solver.main_diagonal(0,1) = 4.0; - solver.main_diagonal(0,2) = 6.0; - solver.main_diagonal(0,3) = 8.0; - - solver.main_diagonal(1,0) = 3.0; - solver.main_diagonal(1,1) = 5.0; - solver.main_diagonal(1,2) = 7.0; - solver.main_diagonal(1,3) = 9.0; - - solver.main_diagonal(2,0) = 4.0; - solver.main_diagonal(2,1) = 6.0; - solver.main_diagonal(2,2) = 8.0; - solver.main_diagonal(2,3) = 10.0; - - solver.main_diagonal(3,0) = 5.0; - solver.main_diagonal(3,1) = 7.0; - solver.main_diagonal(3,2) = 9.0; - solver.main_diagonal(3,3) = 11.0; + solver.set_main_diagonal(0,0, 2.0); + solver.set_main_diagonal(0,1, 4.0); + solver.set_main_diagonal(0,2, 6.0); + solver.set_main_diagonal(0,3, 8.0); + + solver.set_main_diagonal(1,0, 3.0); + solver.set_main_diagonal(1,1, 5.0); + solver.set_main_diagonal(1,2, 7.0); + solver.set_main_diagonal(1,3, 9.0); + + solver.set_main_diagonal(2,0, 4.0); + solver.set_main_diagonal(2,1, 6.0); + solver.set_main_diagonal(2,2, 8.0); + solver.set_main_diagonal(2,3, 10.0); + + solver.set_main_diagonal(3,0, 5.0); + solver.set_main_diagonal(3,1, 7.0); + solver.set_main_diagonal(3,2, 9.0); + solver.set_main_diagonal(3,3, 11.0); Vector rhs("rhs", matrix_dimension * batch_count);