Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -5,25 +5,24 @@
namespace extrapolated_smoother_give
{

static inline void updateMatrixElement(BatchedTridiagonalSolver<double>& solver, int batch, int row, int column,
double value)
static KOKKOS_INLINE_FUNCTION void updateMatrixElement(const BatchedTridiagonalSolver<double>& solver, int batch,
int row, int column, double value)
{
if (row == column)
solver.main_diagonal(batch, row) += value;
solver.increase_main_diagonal(batch, row, value);
else if (row == column - 1)
solver.sub_diagonal(batch, row) += value;
solver.increase_sub_diagonal(batch, row, value);
else if (row == 0 && column == solver.matrixDimension() - 1)
solver.cyclic_corner(batch) += value;
solver.increase_cyclic_corner(batch, value);
}

} // namespace extrapolated_smoother_give

template <class LevelCacheType>
void ExtrapolatedSmootherGive<LevelCacheType>::nodeBuildTridiagonalSolverMatrices(
int i_r, int i_theta, const PolarGrid& grid, bool DirBC_Interior,
BatchedTridiagonalSolver<double>& circle_tridiagonal_solver,
BatchedTridiagonalSolver<double>& radial_tridiagonal_solver, double arr, double att, double art, double detDF,
double coeff_beta)
KOKKOS_FUNCTION void ExtrapolatedSmootherGive<LevelCacheType>::nodeBuildTridiagonalSolverMatrices(
int i_r, int i_theta, const PolarGrid& grid, const LevelCacheType& level_cache, bool DirBC_Interior,
const BatchedTridiagonalSolver<double>& circle_tridiagonal_solver,
const BatchedTridiagonalSolver<double>& radial_tridiagonal_solver)
{
using extrapolated_smoother_give::updateMatrixElement;

Expand All @@ -36,6 +35,16 @@ void ExtrapolatedSmootherGive<LevelCacheType>::nodeBuildTridiagonalSolverMatrice
assert(numberSmootherCircles >= 3);
assert(lengthRadialSmoother >= 3);

/* ---------------------------------------- */
/* Compute or retrieve stencil coefficients */
/* ---------------------------------------- */
const int center = grid.index(i_r, i_theta);
const double radius = grid.radius(i_r);
const double theta = grid.theta(i_theta);

double coeff_beta, arr, att, art, detDF;
level_cache.obtainValues(i_r, i_theta, center, radius, theta, coeff_beta, arr, att, art, detDF);

int ptr, offset;
int row, column;
double value;
Expand Down Expand Up @@ -1040,108 +1049,72 @@ void ExtrapolatedSmootherGive<LevelCacheType>::nodeBuildTridiagonalSolverMatrice
}
}

template <class LevelCacheType>
void ExtrapolatedSmootherGive<LevelCacheType>::buildTridiagonalCircleSection(int i_r)
{
const PolarGrid& grid = ExtrapolatedSmoother<LevelCacheType>::grid_;
const LevelCacheType& level_cache = ExtrapolatedSmoother<LevelCacheType>::level_cache_;
const bool DirBC_Interior = ExtrapolatedSmoother<LevelCacheType>::DirBC_Interior_;

// Access pattern is aligned with the memory layout of the grid data to maximize cache efficiency.
const double r = grid.radius(i_r);
for (int i_theta = 0; i_theta < grid.ntheta(); i_theta++) {
const int global_index = grid.index(i_r, i_theta);
const double theta = grid.theta(i_theta);

double coeff_beta, arr, att, art, detDF;
level_cache.obtainValues(i_r, i_theta, global_index, r, theta, coeff_beta, arr, att, art, detDF);

// Build Asc at the current node
nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, DirBC_Interior, circle_tridiagonal_solver_,
radial_tridiagonal_solver_, arr, att, art, detDF, coeff_beta);
}
}

template <class LevelCacheType>
void ExtrapolatedSmootherGive<LevelCacheType>::buildTridiagonalRadialSection(int i_theta)
{
const PolarGrid& grid = ExtrapolatedSmoother<LevelCacheType>::grid_;
const LevelCacheType& level_cache = ExtrapolatedSmoother<LevelCacheType>::level_cache_;
const bool DirBC_Interior = ExtrapolatedSmoother<LevelCacheType>::DirBC_Interior_;

// Access pattern is aligned with the memory layout of the grid data to maximize cache efficiency.
const double theta = grid.theta(i_theta);
for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) {
const int global_index = grid.index(i_r, i_theta);
const double r = grid.radius(i_r);

double coeff_beta, arr, att, art, detDF;
level_cache.obtainValues(i_r, i_theta, global_index, r, theta, coeff_beta, arr, att, art, detDF);

// Build Asc at the current node
nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, DirBC_Interior, circle_tridiagonal_solver_,
radial_tridiagonal_solver_, arr, att, art, detDF, coeff_beta);
}
}

template <class LevelCacheType>
void ExtrapolatedSmootherGive<LevelCacheType>::buildTridiagonalSolverMatrices()
{
const PolarGrid& grid = ExtrapolatedSmoother<LevelCacheType>::grid_;
const LevelCacheType& level_cache = ExtrapolatedSmoother<LevelCacheType>::level_cache_;
const bool DirBC_Interior = ExtrapolatedSmoother<LevelCacheType>::DirBC_Interior_;
const int num_omp_threads = ExtrapolatedSmoother<LevelCacheType>::num_omp_threads_;

const int num_smoother_circles = grid.numberSmootherCircles();
const int additional_radial_tasks = grid.ntheta() % 3;
const int num_radial_tasks = grid.ntheta() - additional_radial_tasks;
const BatchedTridiagonalSolver<double>& circle_tridiagonal_solver = circle_tridiagonal_solver_;
const BatchedTridiagonalSolver<double>& radial_tridiagonal_solver = radial_tridiagonal_solver_;

/* ---------------- */
/* Circular section */
/* ---------------- */
// We parallelize the loop with step 3 to avoid data race conditions between adjacent circles.
#pragma omp parallel num_threads(num_omp_threads)
{
#pragma omp for
for (int i_r = 0; i_r < num_smoother_circles; i_r += 3) {
buildTridiagonalCircleSection(i_r);
} /* Implicit barrier */
#pragma omp for
for (int i_r = 1; i_r < num_smoother_circles; i_r += 3) {
buildTridiagonalCircleSection(i_r);
} /* Implicit barrier */
#pragma omp for
for (int i_r = 2; i_r < num_smoother_circles; i_r += 3) {
buildTridiagonalCircleSection(i_r);
} /* Implicit barrier */
// We parallelize over i_r (step 3) to avoid data race conditions between adjacent circles.
// The i_theta loop is sequential inside the kernel.
const int num_circle_tasks = grid.numberSmootherCircles();

for (int start_circle = 0; start_circle < 3; ++start_circle) {
const int num_circular_tasks = (num_circle_tasks - start_circle + 2) / 3;
Kokkos::parallel_for(
"SmootherGive: buildTridiagonalSolverMatrices (Circular)",
Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, num_circular_tasks),
KOKKOS_LAMBDA(const int circle_task) {
const int i_r = start_circle + circle_task * 3;
for (int i_theta = 0; i_theta < grid.ntheta(); i_theta++) {
nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, level_cache, DirBC_Interior,
circle_tridiagonal_solver, radial_tridiagonal_solver);
}
});
Kokkos::fence();
}

/* ---------------- */
/* -------------- */
/* Radial section */
/* ---------------- */
// We parallelize the loop with step 3 to avoid data race conditions between adjacent radial lines.
// Due to the periodicity in the angular direction, we can have at most 2 additional radial tasks
// that are handled serially before the parallel loops.
/* -------------- */
// We parallelize over i_theta (step 3) to avoid data race conditions between adjacent radial lines.
// The i_r loop is sequential inside the kernel.
// Due to periodicity in the angular direction, handle up to 2 additional
// radial lines (i_theta = 0 and 1) before the parallel passes.
const int additional_radial_tasks = grid.ntheta() % 3;
const int num_radial_tasks = grid.ntheta() - additional_radial_tasks;

for (int i_theta = 0; i_theta < additional_radial_tasks; i_theta++) {
buildTridiagonalRadialSection(i_theta);
Kokkos::parallel_for(
"SmootherGive: buildTridiagonalSolverMatrices (Radial, additional)",
Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, 1), KOKKOS_LAMBDA(const int) {
for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) {
nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, level_cache, DirBC_Interior,
circle_tridiagonal_solver, radial_tridiagonal_solver);
}
});
Kokkos::fence();
}

#pragma omp parallel num_threads(num_omp_threads)
{
#pragma omp for
for (int radial_task = 0; radial_task < num_radial_tasks; radial_task += 3) {
const int i_theta = radial_task + additional_radial_tasks;
buildTridiagonalRadialSection(i_theta);
} /* Implicit barrier */
#pragma omp for
for (int radial_task = 1; radial_task < num_radial_tasks; radial_task += 3) {
const int i_theta = radial_task + additional_radial_tasks;
buildTridiagonalRadialSection(i_theta);
} /* Implicit barrier */
#pragma omp for
for (int radial_task = 2; radial_task < num_radial_tasks; radial_task += 3) {
const int i_theta = radial_task + additional_radial_tasks;
buildTridiagonalRadialSection(i_theta);
} /* Implicit barrier */
for (int start_radial = 0; start_radial < 3; ++start_radial) {
const int num_radial_batches = (num_radial_tasks - start_radial + 2) / 3;
Kokkos::parallel_for(
"SmootherGive: buildTridiagonalSolverMatrices (Radial)",
Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, num_radial_batches),
KOKKOS_LAMBDA(const int radial_task) {
const int i_theta = additional_radial_tasks + start_radial + radial_task * 3;
for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) {
nodeBuildTridiagonalSolverMatrices(i_r, i_theta, grid, level_cache, DirBC_Interior,
circle_tridiagonal_solver, radial_tridiagonal_solver);
}
});
Kokkos::fence();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,8 @@ class ExtrapolatedSmootherGive : public ExtrapolatedSmoother<LevelCacheType>
// Solver object (owns matrix if MUMPS, references if in-house solver).
InnerBoundarySolver inner_boundary_solver_;

// Public is required as Cuda needs to be able to get the address of functions enclosing lambda functions
public:
/* -------------- */
/* Stencil access */
/* -------------- */
Expand All @@ -154,13 +156,12 @@ class ExtrapolatedSmootherGive : public ExtrapolatedSmoother<LevelCacheType>
/* --------------- */
// Build all A_sc matrices for circle and radial smoothers.
void buildTridiagonalSolverMatrices();
void buildTridiagonalCircleSection(int i_r);
void buildTridiagonalRadialSection(int i_theta);
// Build the tridiagonal solver matrices for a specific node (i_r, i_theta)
void nodeBuildTridiagonalSolverMatrices(int i_r, int i_theta, const PolarGrid& grid, bool DirBC_Interior,
BatchedTridiagonalSolver<double>& circle_tridiagonal_solver,
BatchedTridiagonalSolver<double>& radial_tridiagonal_solver, double arr,
double att, double art, double detDF, double coeff_beta);
static KOKKOS_FUNCTION void
nodeBuildTridiagonalSolverMatrices(int i_r, int i_theta, const PolarGrid& grid, const LevelCacheType& level_cache,
bool DirBC_Interior,
const BatchedTridiagonalSolver<double>& circle_tridiagonal_solver,
const BatchedTridiagonalSolver<double>& radial_tridiagonal_solver);

// Build the solver matrix for the interior boundary (i_r = 0) which is non-tridiagonal due to across-origin coupling.
InnerBoundaryMatrix buildInteriorBoundarySolverMatrix();
Expand Down
Loading
Loading