27 changes: 21 additions & 6 deletions common/arg.cpp
@@ -2137,11 +2137,18 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_N_CPU_MOE_DRAFT"));
+    GGML_ASSERT(params.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
-        string_format("max. number of layers to store in VRAM (default: %d)", params.n_gpu_layers),
-        [](common_params & params, int value) {
-            params.n_gpu_layers = value;
+        string_format("max. number of layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)", params.n_gpu_layers == -1 ? "auto" : "all"),
+        [](common_params & params, const std::string & value) {
+            if (value == "auto") {
+                params.n_gpu_layers = -1;
+            } else if (value == "all") {
+                params.n_gpu_layers = -2;
+            } else {
+                params.n_gpu_layers = std::stoi(value);
+            }
             if (!llama_supports_gpu_offload()) {
                 fprintf(stderr, "warning: no usable GPU found, --gpu-layers option will be ignored\n");
                 fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
@@ -3175,11 +3182,19 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.speculative.devices = parse_device_list(value);
         }
     ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_CLI}));
+    GGML_ASSERT(params.speculative.n_gpu_layers < 0); // string_format would need to be extended for a default >= 0
     add_opt(common_arg(
         {"-ngld", "--gpu-layers-draft", "--n-gpu-layers-draft"}, "N",
-        "number of layers to store in VRAM for the draft model",
-        [](common_params & params, int value) {
-            params.speculative.n_gpu_layers = value;
+        string_format("max. number of draft model layers to store in VRAM, either an exact number, 'auto', or 'all' (default: %s)",
+            params.speculative.n_gpu_layers == -1 ? "auto" : "all"),
+        [](common_params & params, const std::string & value) {
+            if (value == "auto") {
+                params.speculative.n_gpu_layers = -1;
+            } else if (value == "all") {
+                params.speculative.n_gpu_layers = -2;
+            } else {
+                params.speculative.n_gpu_layers = std::stoi(value);
+            }
             if (!llama_supports_gpu_offload()) {
                 fprintf(stderr, "warning: no usable GPU found, --gpu-layers-draft option will be ignored\n");
                 fprintf(stderr, "warning: one possible reason is that llama.cpp was compiled without GPU support\n");
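For reference, the value convention both options above now share can be sketched as a small standalone helper (a sketch, not the PR's code verbatim; the helper name parse_n_gpu_layers is hypothetical and std::stoi semantics for numeric input are assumed):

#include <cstdint>
#include <string>

// Sketch of the -ngl / -ngld value mapping introduced above:
//   "auto" -> -1 (the default), "all" -> -2, anything else -> exact layer count.
// std::stoi throws std::invalid_argument on non-numeric input.
static int32_t parse_n_gpu_layers(const std::string & value) {
    if (value == "auto") {
        return -1;
    }
    if (value == "all") {
        return -2;
    }
    return std::stoi(value);
}

In other words, -ngl auto keeps the default of -1, -ngl all requests that every layer (including the output layer) be stored in VRAM, and a plain number requests that exact layer count.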
5 changes: 1 addition & 4 deletions common/common.cpp
@@ -1339,10 +1339,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
         mparams.devices = params.devices.data();
     }
 
-    if (params.n_gpu_layers != -1) {
-        mparams.n_gpu_layers = params.n_gpu_layers;
-    }
-
+    mparams.n_gpu_layers = params.n_gpu_layers;
     mparams.main_gpu = params.main_gpu;
     mparams.split_mode = params.split_mode;
     mparams.tensor_split = params.tensor_split;
2 changes: 1 addition & 1 deletion common/common.h
@@ -329,7 +329,7 @@ struct common_params {
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all
     int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
     float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     bool fit_params = true; // whether to fit unset model/context parameters to free device memory
7 changes: 4 additions & 3 deletions include/llama.h
@@ -286,7 +286,7 @@ extern "C" {
         // NULL-terminated list of buffer types to use for tensors that match a pattern
         const struct llama_model_tensor_buft_override * tensor_buft_overrides;
 
-        int32_t n_gpu_layers; // number of layers to store in VRAM
+        int32_t n_gpu_layers; // number of layers to store in VRAM, a negative value means all layers
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
 
         // the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
@@ -468,8 +468,9 @@ extern "C" {
     LLAMA_API void llama_free(struct llama_context * ctx);
 
     // fits mparams and cparams to free device memory (assumes system memory is unlimited)
-    // returns true if the parameters could be successfully modified to fit device memory
-    // this function is NOT thread safe because it modifies the global llama logger state
+    // - returns true if the parameters could be successfully modified to fit device memory
+    // - this function is NOT thread safe because it modifies the global llama logger state
+    // - only parameters that have the same value as in llama_default_model_params are modified
     LLAMA_API bool llama_params_fit(
             const char * path_model,
             struct llama_model_params * mparams,
6 changes: 3 additions & 3 deletions src/llama-context.cpp
@@ -294,8 +294,8 @@ llama_context::llama_context(
     // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
     bool pipeline_parallel =
         model.n_devices() > 1 &&
-        model.params.n_gpu_layers > (int) model.hparams.n_layer &&
-        model.params.split_mode == LLAMA_SPLIT_MODE_LAYER &&
+        model.n_gpu_layers() > model.hparams.n_layer &&
+        model.split_mode() == LLAMA_SPLIT_MODE_LAYER &&
         cparams.offload_kqv &&
         !model.has_tensor_overrides();
@@ -1571,7 +1571,7 @@ llm_graph_cb llama_context::graph_get_cb() const {
 
         // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
         // FIXME: fix in ggml_backend_sched
-        const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer;
+        const bool full_offload = model.n_gpu_layers() > model.hparams.n_layer;
         if (ubatch.n_tokens < 32 || full_offload) {
             if (il != -1 && strcmp(name, "norm") == 0) {
                 const auto & dev_layer = model.dev_layer(il);
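As a small illustration of the full-offload condition used in both hunks above (the 32-layer model is an assumed example value, not taken from the PR):

#include <cassert>
#include <cstdint>

int main() {
    const uint32_t n_layer            = 32; // hparams.n_layer (assumed example value)
    const uint32_t n_gpu_layers_all   = 33; // what n_gpu_layers() returns for "auto"/"all": n_layer + 1
    const uint32_t n_gpu_layers_exact = 32; // a user request for exactly 32 layers

    // matches the checks above: full offload requires the output layer to be offloaded too
    assert(  n_gpu_layers_all   > n_layer);
    assert(!(n_gpu_layers_exact > n_layer));
    return 0;
}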
14 changes: 11 additions & 3 deletions src/llama-model.cpp
@@ -2329,11 +2329,11 @@ void llama_model::load_vocab(llama_model_loader & ml) {
 
 bool llama_model::load_tensors(llama_model_loader & ml) {
     const auto & split_mode = params.split_mode;
-    const auto & n_gpu_layers = params.n_gpu_layers;
     const auto & use_mlock = params.use_mlock;
     const auto & tensor_split = params.tensor_split;
 
-    const int n_layer = hparams.n_layer;
+    const int n_layer      = hparams.n_layer;
+    const int n_gpu_layers = this->n_gpu_layers();
 
     const bool use_mmap_buffer = true;
 
@@ -6765,6 +6765,14 @@ size_t llama_model::n_devices() const {
     return devices.size();
 }
 
+uint32_t llama_model::n_gpu_layers() const {
+    return params.n_gpu_layers >= 0 ? params.n_gpu_layers : hparams.n_layer + 1;
+}
+
+llama_split_mode llama_model::split_mode() const {
+    return params.split_mode;
+}
+
 std::map<ggml_backend_buffer_type_t, size_t> llama_model::memory_breakdown() const {
     std::map<ggml_backend_buffer_type_t, size_t> ret;
     for (const auto & [ctx, bufs] : pimpl->ctxs_bufs) {
@@ -7662,7 +7670,7 @@ llama_model_params llama_model_default_params() {
     llama_model_params result = {
         /*.devices =*/ nullptr,
         /*.tensor_buft_overrides =*/ nullptr,
-        /*.n_gpu_layers =*/ 999,
+        /*.n_gpu_layers =*/ -1,
         /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
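A minimal sketch of the semantics of the new n_gpu_layers() accessor together with the new default of -1 (the helper name resolve_n_gpu_layers and the 32-layer model are illustrative assumptions, not from the PR):

#include <cassert>
#include <cstdint>

// Mirrors llama_model::n_gpu_layers(): non-negative requests are taken as-is,
// any negative value ("auto" = -1, "all" = -2) resolves to all layers plus the output layer.
static uint32_t resolve_n_gpu_layers(int32_t requested, uint32_t n_layer) {
    return requested >= 0 ? (uint32_t) requested : n_layer + 1;
}

int main() {
    const uint32_t n_layer = 32;                     // assumed example value
    assert(resolve_n_gpu_layers(10, n_layer) == 10); // exact count: partial offload
    assert(resolve_n_gpu_layers(-1, n_layer) == 33); // "auto", the new default in llama_model_default_params
    assert(resolve_n_gpu_layers(-2, n_layer) == 33); // "all"
    return 0;
}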
7 changes: 5 additions & 2 deletions src/llama-model.h
@@ -462,8 +462,6 @@ struct llama_model {
     struct ggml_tensor * dense_2_out_layers = nullptr;
     struct ggml_tensor * dense_3_out_layers = nullptr;
 
-    llama_model_params params;
-
     // gguf metadata
     std::unordered_map<std::string, std::string> gguf_kv;
 
@@ -494,6 +492,9 @@ struct llama_model {
     size_t n_tensors() const;
     size_t n_devices() const;
 
+    uint32_t n_gpu_layers() const;
+    llama_split_mode split_mode() const;
+
     std::map<ggml_backend_buffer_type_t, size_t> memory_breakdown() const;
 
     // total number of parameters in the model
@@ -522,6 +523,8 @@ struct llama_model {
     ggml_cgraph * build_graph(const llm_graph_params & params) const;
 
 private:
+    llama_model_params params;
+
     struct impl;
     std::unique_ptr<impl> pimpl;
 };
8 changes: 4 additions & 4 deletions src/llama.cpp
@@ -181,7 +181,7 @@ static void llama_params_fit_impl(
         }
     }
 
-    int64_t sum_total = 0;
+    int64_t sum_free = 0;
    int64_t sum_projected_free = 0;
    int64_t min_projected_free = INT64_MAX;
    int64_t sum_projected_used = 0;
@@ -197,7 +197,7 @@
         const int64_t projected_used = dmd.mb.total();
         const int64_t projected_free = dmd.free - projected_used;
 
-        sum_total += dmd.total;
+        sum_free += dmd.free;
         sum_projected_used += projected_used;
         sum_projected_free += projected_free;
         min_projected_free = std::min(min_projected_free, projected_free);
@@ -210,10 +210,10 @@
                 projected_free >= 0 ? "surplus" : "deficit");
         }
     }
-    assert(sum_total >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
+    assert(sum_free >= 0 && sum_projected_used >= 0 && sum_projected_ctx >= 0);
     assert(sum_projected_used >= sum_projected_ctx);
     LLAMA_LOG_INFO("%s: projected to use %" PRId64 " MiB of device memory vs. %" PRId64 " MiB of free device memory\n",
         __func__, sum_projected_used/MiB, sum_free/MiB);
     if (min_projected_free >= margin) {
         if (nd == 1) {
             LLAMA_LOG_INFO("%s: will leave %" PRId64 " >= %" PRId64 " MiB of free device memory, no changes needed\n",
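A self-contained sketch of the bookkeeping in llama_params_fit_impl with made-up numbers for two devices; it shows why the per-device minimum is checked rather than only the summed free memory:

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
    const int64_t MiB = 1024 * 1024;
    // assumed example values, not real measurements:
    const int64_t free_mem[2]       = {20480 * MiB,  8192 * MiB}; // dmd.free per device
    const int64_t projected_used[2] = {12288 * MiB,  9216 * MiB}; // dmd.mb.total() per device

    int64_t sum_free = 0, sum_projected_used = 0, min_projected_free = INT64_MAX;
    for (int i = 0; i < 2; i++) {
        const int64_t projected_free = free_mem[i] - projected_used[i];
        sum_free           += free_mem[i];
        sum_projected_used += projected_used[i];
        min_projected_free  = std::min(min_projected_free, projected_free);
    }

    assert(sum_free > sum_projected_used); // the summed free memory would be enough ...
    assert(min_projected_free < 0);        // ... yet the second device still runs a deficit
    return 0;
}

The rename from sum_total to sum_free also makes the reported number match the "free device memory" wording already used in the unchanged log message.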