Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 91 additions & 24 deletions include/Residual/ResidualGive/applyAGive.inl
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,16 @@ namespace residual_give
{

template <class LevelCacheType>
static inline void node_apply_a_give(int i_r, int i_theta, double r, double theta, const PolarGrid& grid,
const LevelCacheType& level_cache, bool DirBC_Interior, Vector<double>& result,
ConstVector<double>& x)
static inline void node_apply_a_give(int i_r, int i_theta, const PolarGrid& grid, const LevelCacheType& level_cache,
bool DirBC_Interior, Vector<double>& result, ConstVector<double>& x)
{
/* ---------------------------------------- */
/* Compute or retrieve stencil coefficients */
/* ---------------------------------------- */
const int center = grid.index(i_r, i_theta);
const int center = grid.index(i_r, i_theta);
const double r = grid.radius(i_r);
const double theta = grid.theta(i_theta);

double coeff_beta, arr, att, art, detDF;
level_cache.obtainValues(i_r, i_theta, center, r, theta, coeff_beta, arr, att, art, detDF);

Expand Down Expand Up @@ -292,32 +294,97 @@ static inline void node_apply_a_give(int i_r, int i_theta, double r, double thet
} // namespace residual_give

template <class LevelCacheType>
void ResidualGive<LevelCacheType>::applyCircleSection(const int i_r, Vector<double> result, ConstVector<double> x) const
void ResidualGive<LevelCacheType>::applySystemOperator(Vector<double> result, ConstVector<double> x) const
{
using residual_give::node_apply_a_give;
assert(result.size() == x.size());

const PolarGrid& grid = Residual<LevelCacheType>::grid_;
assign(result, 0.0);

const double r = grid.radius(i_r);
for (int i_theta = 0; i_theta < grid.ntheta(); i_theta++) {
const double theta = grid.theta(i_theta);
node_apply_a_give(i_r, i_theta, r, theta, grid, Residual<LevelCacheType>::level_cache_,
Residual<LevelCacheType>::DirBC_Interior_, result, x);
}
}
const PolarGrid& grid = Residual<LevelCacheType>::grid_;
const LevelCacheType& level_cache = Residual<LevelCacheType>::level_cache_;
const bool DirBC_Interior = Residual<LevelCacheType>::DirBC_Interior_;

const LevelCacheType* level_cache_ptr = &level_cache;
const PolarGrid* grid_ptr = &grid;
Comment on lines +307 to +308
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As mentioned in a different PR, the use of pointers will prevent this from working on GPU

Copy link
Copy Markdown
Collaborator Author

@julianlitz julianlitz May 6, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removing the Grid pointer already works. The data will not be copied.

But doing the same for the LevelCache not quite:
We need to change the LevelCache member to AllocatableVector so no new copies are made.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to change the LevelCache member to AllocatableVector so no new copies are made.

I don't understand why this would be the case.

We do need to remove references from LevelCache though

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't test it yet. But before merging we just need to be sure that LevelCache isn't fully copied when parsed to a kokkos lambda


const int num_smoother_circles = grid.numberSmootherCircles();
const int additional_radial_tasks = grid.ntheta() % 3;
const int num_radial_tasks = grid.ntheta() - additional_radial_tasks;

template <class LevelCacheType>
void ResidualGive<LevelCacheType>::applyRadialSection(const int i_theta, Vector<double> result,
ConstVector<double> x) const
{
using residual_give::node_apply_a_give;

const PolarGrid& grid = Residual<LevelCacheType>::grid_;
/* ---------------- */
/* Circular section */
/* ---------------- */
// We parallelize over i_r (step 3) to avoid data race conditions between adjacent circles.
// The i_theta loop is sequential inside the kernel.
for (int start_circle = 0; start_circle < 3; ++start_circle) {
const int num_circular_tasks = (num_smoother_circles - start_circle + 2) / 3;
Kokkos::parallel_for(
"ResidualGive: ApplyA (Circular)", Kokkos::RangePolicy<>(0, num_circular_tasks),
KOKKOS_LAMBDA(const int circle_task) {
const int i_r = start_circle + circle_task * 3;
for (int i_theta = 0; i_theta < grid.ntheta(); i_theta++) {
node_apply_a_give(i_r, i_theta, *grid_ptr, *level_cache_ptr, DirBC_Interior, result, x);
}
});
Kokkos::fence();
}

const double theta = grid.theta(i_theta);
for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) {
const double r = grid.radius(i_r);
node_apply_a_give(i_r, i_theta, r, theta, grid, Residual<LevelCacheType>::level_cache_,
Residual<LevelCacheType>::DirBC_Interior_, result, x);
/* -------------- */
/* Radial section */
/* -------------- */
// We parallelize over i_theta (step 3) to avoid data race conditions between adjacent radial lines.
// The i_r loop is sequential inside the kernel.
// Due to periodicity in the angular direction, handle up to 2 additional
// radial lines (i_theta = 0 and 1) before the parallel passes.
for (int i_theta = 0; i_theta < additional_radial_tasks; i_theta++) {
Kokkos::parallel_for(
"ResidualGive: ApplyA (Radial, additional)", Kokkos::RangePolicy<>(0, 1), KOKKOS_LAMBDA(const int) {
for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) {
node_apply_a_give(i_r, i_theta, *grid_ptr, *level_cache_ptr, DirBC_Interior, result, x);
}
});
Kokkos::fence();
}
{
const int start_radial = additional_radial_tasks + 0;
const int num_radial_batches = num_radial_tasks / 3;
Kokkos::parallel_for(
"ResidualGive: ApplyA (Radial, pass 0)", Kokkos::RangePolicy<>(0, num_radial_batches),
KOKKOS_LAMBDA(const int radial_task) {
const int i_theta = start_radial + radial_task * 3;
for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) {
node_apply_a_give(i_r, i_theta, *grid_ptr, *level_cache_ptr, DirBC_Interior, result, x);
}
});
Kokkos::fence();
}
{
const int start_radial = additional_radial_tasks + 1;
const int num_radial_batches = num_radial_tasks / 3;
Kokkos::parallel_for(
"ResidualGive: ApplyA (Radial, pass 1)", Kokkos::RangePolicy<>(0, num_radial_batches),
KOKKOS_LAMBDA(const int radial_task) {
const int i_theta = start_radial + radial_task * 3;
for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) {
node_apply_a_give(i_r, i_theta, *grid_ptr, *level_cache_ptr, DirBC_Interior, result, x);
}
});
Kokkos::fence();
}
{
const int start_radial = additional_radial_tasks + 2;
const int num_radial_batches = num_radial_tasks / 3;
Kokkos::parallel_for(
"ResidualGive: ApplyA (Radial, pass 2)", Kokkos::RangePolicy<>(0, num_radial_batches),
KOKKOS_LAMBDA(const int radial_task) {
const int i_theta = start_radial + radial_task * 3;
for (int i_r = grid.numberSmootherCircles(); i_r < grid.nr(); i_r++) {
node_apply_a_give(i_r, i_theta, *grid_ptr, *level_cache_ptr, DirBC_Interior, result, x);
}
});
Kokkos::fence();
}
}
// clang-format on
4 changes: 0 additions & 4 deletions include/Residual/ResidualGive/residualGive.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,6 @@ class ResidualGive : public Residual<LevelCacheType>
void computeResidual(Vector<double> result, ConstVector<double> rhs, ConstVector<double> x) const final;

void applySystemOperator(Vector<double> result, ConstVector<double> x) const final;

private:
void applyCircleSection(const int i_r, Vector<double> result, ConstVector<double> x) const;
void applyRadialSection(const int i_theta, Vector<double> result, ConstVector<double> x) const;
};

#include "residualGive.inl"
Expand Down
99 changes: 0 additions & 99 deletions include/Residual/ResidualGive/residualGive.inl
Original file line number Diff line number Diff line change
Expand Up @@ -7,105 +7,6 @@ ResidualGive<LevelCacheType>::ResidualGive(const PolarGrid& grid, const LevelCac
{
}

/* ------------ */
/* result = A*x */

// clang-format off
template <class LevelCacheType>
void ResidualGive<LevelCacheType>::applySystemOperator(Vector<double> result, ConstVector<double> x) const
{
assert(result.size() == x.size());

assign(result, 0.0);

const PolarGrid& grid = Residual<LevelCacheType>::grid_;

const int num_omp_threads = Residual<LevelCacheType>::num_omp_threads_;

/* Single-threaded execution */
if (num_omp_threads == 1) {
for (int i_r = 0; i_r < grid.numberSmootherCircles(); i_r++) {
applyCircleSection(i_r, result, x);
}
for (int i_theta = 0; i_theta < grid.ntheta(); i_theta++) {
applyRadialSection(i_theta, result, x);
}
}
/* Multi-threaded execution */
else {
const int num_circle_tasks = grid.numberSmootherCircles();
const int additional_radial_tasks = grid.ntheta() % 3;
const int num_radial_tasks = grid.ntheta() - additional_radial_tasks;

#pragma omp parallel num_threads(num_omp_threads)
{
/* Circle Section 0 */
#pragma omp for
for (int circle_task = 0; circle_task < num_circle_tasks; circle_task += 3) {
int i_r = grid.numberSmootherCircles() - circle_task - 1;
applyCircleSection(i_r, result, x);
}
/* Circle Section 1 */
#pragma omp for
for (int circle_task = 1; circle_task < num_circle_tasks; circle_task += 3) {
int i_r = grid.numberSmootherCircles() - circle_task - 1;
applyCircleSection(i_r, result, x);
}
/* Circle Section 2 */
#pragma omp for nowait
for (int circle_task = 2; circle_task < num_circle_tasks; circle_task += 3) {
int i_r = grid.numberSmootherCircles() - circle_task - 1;
applyCircleSection(i_r, result, x);
}

/* Radial Section 0 */
#pragma omp for
for (int radial_task = 0; radial_task < num_radial_tasks; radial_task += 3) {
if (radial_task > 0) {
int i_theta = radial_task + additional_radial_tasks;
applyRadialSection(i_theta, result, x);
}
else {
if (additional_radial_tasks == 0) {
applyRadialSection(0, result, x);
}
else if (additional_radial_tasks >= 1) {
applyRadialSection(0, result, x);
applyRadialSection(1, result, x);
}
}
}
/* Radial Section 1 */
#pragma omp for
for (int radial_task = 1; radial_task < num_radial_tasks; radial_task += 3) {
if (radial_task > 1) {
int i_theta = radial_task + additional_radial_tasks;
applyRadialSection(i_theta, result, x);
}
else {
if (additional_radial_tasks == 0) {
applyRadialSection(1, result, x);
}
else if (additional_radial_tasks == 1) {
applyRadialSection(2, result, x);
}
else if (additional_radial_tasks == 2) {
applyRadialSection(2, result, x);
applyRadialSection(3, result, x);
}
}
}
/* Radial Section 2 */
#pragma omp for
for (int radial_task = 2; radial_task < num_radial_tasks; radial_task += 3) {
int i_theta = radial_task + additional_radial_tasks;
applyRadialSection(i_theta, result, x);
}
}
}
}
// clang-format on

/* ------------------ */
/* result = rhs - A*x */
template <class LevelCacheType>
Expand Down