backend : offload large batches to GPU #6083
Merged (9 commits) on Mar 18, 2024
Changes from 1 commit
fix CUDA split buffers
slaren committed Mar 16, 2024
commit c0fe6298ae0fa1e2e1612d548a45d3b3affb1036
ggml-backend.c: 12 changes (10 additions, 2 deletions)
@@ -1051,8 +1051,9 @@ struct ggml_backend_sched {
     struct ggml_cgraph * graph;
 
     // graph splits
-    struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS];
+    struct ggml_backend_sched_split * splits;
     int n_splits;
+    int splits_capacity;
 
     // pipeline parallelism support
     int n_copies;
@@ -1443,6 +1444,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
         if (node_backend_id != cur_backend_id || offload) {
             split->i_end = i;
             i_split++;
+            if (i_split >= sched->splits_capacity) {
+                sched->splits_capacity *= 2;
+                sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split));
+            }
             GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS);
             split = &sched->splits[i_split];
             split->backend_id = node_backend_id;
@@ -1711,7 +1716,9 @@ ggml_backend_sched_t ggml_backend_sched_new(
 
     sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
 
-    GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES);
+    const int initial_splits_capacity = 16;
+    sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity);
+    sched->splits_capacity = initial_splits_capacity;
 
     for (int b = 0; b < n_backends; b++) {
         sched->backends[b] = backends[b];
@@ -1742,6 +1749,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) {
     }
     ggml_gallocr_free(sched->galloc);
     ggml_free(sched->ctx);
+    free(sched->splits);
     free(sched->hash_set.keys);
     free(sched->tensor_backend_id);
     free(sched->tensor_copies);
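Taken together, the ggml-backend.c hunks replace the fixed splits[GGML_SCHED_MAX_SPLITS] array with a heap-allocated list that starts at 16 entries, doubles via realloc whenever a new split would overflow it, and is released in ggml_backend_sched_free. Below is a minimal, self-contained sketch of the same grow-on-demand pattern; the int_vec type and its functions are illustrative stand-ins rather than ggml code, and allocation-failure handling is omitted just as in the patch.

#include <stdlib.h>

/* Illustrative stand-in for the splits list: a small dynamic array that
 * starts with a fixed capacity and doubles it on demand. */
typedef struct {
    int * data;     /* plays the role of struct ggml_backend_sched_split * splits */
    int   size;     /* used entries, like n_splits */
    int   capacity; /* allocated entries, like splits_capacity */
} int_vec;

static void int_vec_init(int_vec * v) {
    v->capacity = 16; /* same initial capacity the patch chooses */
    v->data     = calloc(v->capacity, sizeof(v->data[0]));
    v->size     = 0;
}

static void int_vec_push(int_vec * v, int value) {
    if (v->size >= v->capacity) {
        v->capacity *= 2; /* geometric growth keeps repeated pushes amortized O(1) */
        v->data = realloc(v->data, v->capacity * sizeof(v->data[0]));
    }
    v->data[v->size++] = value;
}

static void int_vec_free(int_vec * v) {
    free(v->data); /* mirrors the new free(sched->splits) in ggml_backend_sched_free */
    v->data     = NULL;
    v->size     = 0;
    v->capacity = 0;
}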
ggml-cuda.cu: 14 changes (6 additions, 8 deletions)
@@ -10755,6 +10755,8 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
 };
 
 GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
+    ggml_init_cublas();
+
     // FIXME: this is not thread safe
     if (device >= ggml_backend_cuda_get_device_count()) {
         return nullptr;
@@ -11039,6 +11041,8 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
 };
 
 GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
+    ggml_init_cublas();
+
     // FIXME: this is not thread safe
     static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
 
@@ -11389,15 +11393,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
 }
 
 GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
-    const ggml_tensor * dst = op;
-
     const int min_batch_size = 32;
 
-    if (dst->ne[1] > min_batch_size && dst->op != GGML_OP_GET_ROWS) {
-        return true;
-    }
-
-    return false;
+    return op->ne[1] > min_batch_size && op->op != GGML_OP_GET_ROWS;
 }
 
 static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) {
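The rewritten ggml_backend_cuda_offload_op above folds the old if/return chain into a single expression: an op is offloaded when its second dimension ne[1], the batch size at that point in the graph, exceeds 32 and the op is not GGML_OP_GET_ROWS. Here is a sketch of the same check against a simplified tensor type; toy_tensor and toy_offload_op are illustrative names, and the comment on why the threshold exists is an inference from the PR title rather than something stated in the diff.

#include <stdbool.h>
#include <stdint.h>

/* Simplified stand-in for ggml_tensor: only the fields the heuristic reads. */
enum toy_op { TOY_OP_MUL_MAT, TOY_OP_GET_ROWS };

struct toy_tensor {
    int64_t     ne[4]; /* ne[1] is the batch dimension the heuristic checks */
    enum toy_op op;
};

/* Offload only large batches (presumably so the host-to-device transfer cost
 * is amortized over enough work), and never GET_ROWS lookups. */
static bool toy_offload_op(const struct toy_tensor * op) {
    const int min_batch_size = 32;
    return op->ne[1] > min_batch_size && op->op != TOY_OP_GET_ROWS;
}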
@@ -11476,7 +11474,7 @@ static ggml_guid_t ggml_backend_cuda_guid() {
 }
 
 GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) {
-    ggml_init_cublas(); // TODO: remove from ggml.c
+    ggml_init_cublas();
 
     if (device < 0 || device >= ggml_cuda_get_device_count()) {
         fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
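The other ggml-cuda.cu hunks call ggml_init_cublas() at the top of ggml_backend_cuda_buffer_type and ggml_backend_cuda_split_buffer_type, so the CUDA state gets set up even when a buffer type is requested before ggml_backend_cuda_init; that pattern relies on the initializer being safe to call more than once. A minimal sketch of such a run-once guard follows, with an illustrative name, since the body of the real ggml_init_cublas is not part of this diff.

#include <stdbool.h>

/* Illustrative run-once initializer: every entry point can call it, and all
 * calls after the first are no-ops. A plain static flag like this is not
 * thread safe. */
static void toy_init_cuda(void) {
    static bool initialized = false;
    if (initialized) {
        return;
    }
    /* ... device discovery, cuBLAS handle creation, etc. would happen here ... */
    initialized = true;
}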
llama.cpp: 6 changes (5 additions, 1 deletion)
@@ -5039,7 +5039,11 @@ static bool llm_load_tensors(
             ml.get_mapping_range(&first, &last, ctx);
             buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
 #ifdef GGML_USE_CUBLAS
-            ggml_backend_cuda_register_host_buffer((char *) ml.mapping->addr + first, last - first);
+            if (n_layer >= n_gpu_layers) {
+                ggml_backend_cuda_register_host_buffer(
+                    ggml_backend_buffer_get_base(buf),
+                    ggml_backend_buffer_get_size(buf));
+            }
 #endif
         }
 #ifdef GGML_USE_METAL
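Finally, the llama.cpp hunk registers the mmapped weight buffer with CUDA only when the model is not fully offloaded (n_layer >= n_gpu_layers), and takes the pointer and size from the buffer via ggml_backend_buffer_get_base and ggml_backend_buffer_get_size instead of from the mapping arithmetic; registering (pinning) that host memory is presumably what keeps the host-to-device copies of the CPU-resident weights fast when a large batch is offloaded. Below is a hedged sketch of what such a registration helper might do, assuming it wraps cudaHostRegister; the name toy_register_host_buffer, the flag choice, and the error handling are assumptions, not taken from the diff.

#include <cuda_runtime.h>
#include <stdio.h>

/* Hedged sketch: page-lock (pin) a host buffer so host-to-device copies can
 * be DMA'd directly instead of going through a pageable staging copy.
 * Assumes the real ggml_backend_cuda_register_host_buffer does something
 * along these lines. */
static void toy_register_host_buffer(void * base, size_t size) {
    cudaError_t err = cudaHostRegister(base, size, cudaHostRegisterPortable);
    if (err != cudaSuccess) {
        /* Pinning is an optimization: on failure the buffer simply stays pageable. */
        fprintf(stderr, "warning: cudaHostRegister failed: %s\n", cudaGetErrorString(err));
    }
}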