-
Notifications
You must be signed in to change notification settings - Fork 1
[#5][#6] Add UniformCheckpointing class; Update cuSZp compressor. #7
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
186 changes: 186 additions & 0 deletions
186
src/Prefetch/include/checkpointing/uniform/UniformCheckpointing.cpp
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,186 @@ | ||
| #pragma once | ||
|
|
||
| #include <stdio.h> | ||
| #include <stdlib.h> | ||
|
|
||
| #include <map> | ||
| #include <string> | ||
| #include <vector> | ||
|
|
||
| #include <fstream> | ||
| #include <iostream> | ||
| #include <sstream> | ||
| #include <cmath> | ||
|
|
||
| #include "../Checkpointing.hpp" | ||
|
|
||
| #include "../../common/GPUZIPLogger.cpp" | ||
|
|
||
| /** | ||
| * @class UniformCheckpointing | ||
| * @brief Implements a checkpointing mechanism using uniform checkpoint spacing. | ||
| * @author Bruno Ortega <brunoteixeira@estudante.ufscar.br> | ||
| * @date Jun 3rd, 2026 | ||
| * | ||
| * This class extends the base `Checkpointing` class to provide specific | ||
| * checkpointing actions (save, restore, forward, backward, terminate) using | ||
| * a fixed-spacing checkpoint distribution strategy. | ||
| * | ||
| * The algorithm stores checkpoints at approximately uniform timestep | ||
| * intervals and, during the adjoint phase, restores the most recent | ||
| * checkpoint and recomputes forward states as needed before executing | ||
| * backward operations. | ||
| */ | ||
| class UniformCheckpointing : public Checkpointing { | ||
|
|
||
| private: | ||
| std::vector<int> checkpoints; //< Vector that stores the timestep value of each checkpoint. | ||
| int checkpoint_idx = 0; //< Checkpoint index to access its timestep value. | ||
| bool adjoint = false; //< Indicates whether execution is currently in | ||
| // the forward or in the adjoint phase. | ||
| bool save = false; //< Controls the two-step checkpoint creation process: | ||
| // first issue FORWARD, then SAVE for the same timestep. | ||
| bool restore = false; //< Controls the two-step restore sequence: | ||
| // first execute BACKWARD at a checkpoint boundary, | ||
| // then issue RESTORE on the next scheduler call. | ||
| int current_ts = 1; //< Current timestep. | ||
| int adj_fwd_ts = 0; //< Current timestep during forward recomputation | ||
| // in the adjoint phase. | ||
|
|
||
| protected: | ||
|
|
||
| /** | ||
| * @brief Resets the internal state of the checkpointing process. | ||
| * | ||
| * Sets `checkpoints`, `checkpoint_idx`, `adjoint`, `save`, `restore`, | ||
| * `current_ts` and `adj_fwd_ts` to their initial values. | ||
| * This is typically called to reinitialize the checkpointing algorithm. | ||
| */ | ||
| void reset() override { | ||
| checkpoints.clear(); | ||
| checkpoint_idx = 0; | ||
| adjoint = false; | ||
| save = false; | ||
| restore = false; | ||
| current_ts = 1; | ||
| adj_fwd_ts = 0; | ||
| } | ||
|
|
||
| /** | ||
| * @brief Sets the checkpoints vector with its timesteps. | ||
| * | ||
| * Computes approximately uniformly spaced checkpoint locations | ||
| * and stores their timestep indices in the internal checkpoint list. | ||
| */ | ||
| void setCheckpoints() { | ||
|
|
||
| checkpoints.push_back(1); | ||
|
|
||
| for (int i = 1; i < snaps; i++) { | ||
| int cp = std::round(i * static_cast<double>(steps) / snaps); | ||
| checkpoints.push_back(cp); | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * @brief Determines the next action to perform in the checkpointing process. | ||
| * | ||
| * @return An `Action` object describing the next step, including its type and | ||
| * relevant parameters. | ||
| */ | ||
| Action getAction() override { | ||
| // Forward from first to last timestep | ||
| if(!adjoint){ | ||
| // At last timestep, forward finishes and adjoint begins | ||
| if(current_ts == steps) { | ||
| adjoint = true; | ||
| checkpoint_idx--; | ||
| return Action(current_ts, ACTION_FORWARD); | ||
| } | ||
|
|
||
| // Apply forward and save for the current timestep | ||
| if(current_ts == checkpoints[checkpoint_idx]) { | ||
| if(!save) { | ||
| save = true; | ||
| return Action(current_ts, ACTION_FORWARD); | ||
| } | ||
| save = false; | ||
| current_ts++; | ||
| checkpoint_idx++; | ||
| return Action(current_ts-1, ACTION_SAVE); | ||
| } | ||
|
|
||
| // Apply forward for the current timestep | ||
| current_ts++; | ||
| return Action(current_ts-1, ACTION_FORWARD); | ||
| } | ||
|
|
||
| // Adjoint from last to first timestep | ||
| // Beginning of a recomputation interval. | ||
| if(current_ts == checkpoints[checkpoint_idx+1] || current_ts == steps){ | ||
|
|
||
| // First visit: execute backward at the interval boundary. | ||
| if(!restore) { | ||
| restore = true; | ||
| return Action(current_ts, ACTION_BACKWARD); | ||
| } | ||
|
|
||
| // No remaining checkpoints to restore: adjoint phase finished. | ||
| if(checkpoint_idx < 0) | ||
| return Action(current_ts, ACTION_TERMINATE); | ||
|
|
||
| // Second visit: restore the previous checkpoint. | ||
| adj_fwd_ts = checkpoints[checkpoint_idx]; | ||
| restore = false; | ||
| current_ts--; | ||
| checkpoint_idx--; | ||
| return Action(checkpoints[checkpoint_idx+1], ACTION_RESTORE); | ||
| } | ||
|
|
||
| // Recompute forward states from the restored checkpoint | ||
| // until reaching the current adjoint timestep. | ||
| if(adj_fwd_ts <= current_ts) { | ||
| adj_fwd_ts++; | ||
| return Action(adj_fwd_ts-1, ACTION_FORWARD); | ||
| } else { // Recomputed state available: execute backward. | ||
| current_ts--; | ||
| adj_fwd_ts = checkpoints[checkpoint_idx+1]; | ||
| return Action(current_ts+1, ACTION_BACKWARD); | ||
| } | ||
|
|
||
| return Action(current_ts, ACTION_ERROR); | ||
| } | ||
|
|
||
|
|
||
| /** | ||
| * @brief Returns the configured number of checkpoints. | ||
| * | ||
| * Uniform checkpointing requires the number of checkpoints (`snaps`) | ||
| * to be explicitly defined during construction. | ||
| * | ||
| * @return The configured number of checkpoints. | ||
| */ | ||
| int getNumberOfCheckpoints() override { | ||
| if (snaps == 0) { | ||
| GPUZIPLogger::Error("There must be set a value for snapshots.\n"); | ||
| } | ||
| return snaps; | ||
| } | ||
|
|
||
| public: | ||
|
|
||
| /** | ||
| * @brief Constructor for the UniformCheckpointing class. | ||
| * | ||
| * @param steps The number of computational steps for which checkpointing is | ||
| * required. | ||
| * @param snaps Total number of checkpoints used by the algorithm. | ||
| * | ||
| * Initializes the base `Checkpointing` class and computes the uniformly | ||
| * distributed checkpoint locations. | ||
| */ | ||
| UniformCheckpointing(int steps, int snaps) | ||
| : Checkpointing(steps, snaps) { | ||
| setCheckpoints(); | ||
| } | ||
| }; | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.