Skip to content

Commit

Permalink
drm/amdkfd: pause autosuspend when creating pdd
Browse files Browse the repository at this point in the history
When using MES, creating a pdd requires talking to the GPU to
set up the relevant context. The code here forgot to wake up the GPU
in case it was suspended; this causes KVM to EFAULT for a passthrough
GPU, for example. This issue can be masked if the GPU was woken up by
something else (e.g. opening the KMS node) first and has not yet gone back to sleep.

v4: do the allocation of proc_ctx_bo in a lazy fashion
when the first queue is created in a process (Felix)

Signed-off-by: Jesse Zhang <[email protected]>
Reviewed-by: Yunxiang Li <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
Cc: [email protected]
  • Loading branch information
Jie1zhang authored and alexdeucher committed Dec 10, 2024
1 parent f4df208 commit 438b39a
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 21 deletions.
15 changes: 15 additions & 0 deletions drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,21 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q,
if (!down_read_trylock(&adev->reset_domain->sem))
return -EIO;

if (!pdd->proc_ctx_cpu_ptr) {
r = amdgpu_amdkfd_alloc_gtt_mem(adev,
AMDGPU_MES_PROC_CTX_SIZE,
&pdd->proc_ctx_bo,
&pdd->proc_ctx_gpu_addr,
&pdd->proc_ctx_cpu_ptr,
false);
if (r) {
dev_err(adev->dev,
"failed to allocate process context bo\n");
return r;
}
memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE);
}

memset(&queue_input, 0x0, sizeof(struct mes_add_queue_input));
queue_input.process_id = qpd->pqm->process->pasid;
queue_input.page_table_base_addr = qpd->page_table_base;
Expand Down
23 changes: 2 additions & 21 deletions drivers/gpu/drm/amd/amdkfd/kfd_process.c
Original file line number Diff line number Diff line change
Expand Up @@ -1076,7 +1076,8 @@ static void kfd_process_destroy_pdds(struct kfd_process *p)

kfd_free_process_doorbells(pdd->dev->kfd, pdd);

if (pdd->dev->kfd->shared_resources.enable_mes)
if (pdd->dev->kfd->shared_resources.enable_mes &&
pdd->proc_ctx_cpu_ptr)
amdgpu_amdkfd_free_gtt_mem(pdd->dev->adev,
&pdd->proc_ctx_bo);
/*
Expand Down Expand Up @@ -1608,7 +1609,6 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev,
struct kfd_process *p)
{
struct kfd_process_device *pdd = NULL;
int retval = 0;

if (WARN_ON_ONCE(p->n_pdds >= MAX_GPU_INSTANCE))
return NULL;
Expand All @@ -1632,21 +1632,6 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev,
pdd->user_gpu_id = dev->id;
atomic64_set(&pdd->evict_duration_counter, 0);

if (dev->kfd->shared_resources.enable_mes) {
retval = amdgpu_amdkfd_alloc_gtt_mem(dev->adev,
AMDGPU_MES_PROC_CTX_SIZE,
&pdd->proc_ctx_bo,
&pdd->proc_ctx_gpu_addr,
&pdd->proc_ctx_cpu_ptr,
false);
if (retval) {
dev_err(dev->adev->dev,
"failed to allocate process context bo\n");
goto err_free_pdd;
}
memset(pdd->proc_ctx_cpu_ptr, 0, AMDGPU_MES_PROC_CTX_SIZE);
}

p->pdds[p->n_pdds++] = pdd;
if (kfd_dbg_is_per_vmid_supported(pdd->dev))
pdd->spi_dbg_override = pdd->dev->kfd2kgd->disable_debug_trap(
Expand All @@ -1658,10 +1643,6 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_node *dev,
idr_init(&pdd->alloc_idr);

return pdd;

err_free_pdd:
kfree(pdd);
return NULL;
}

/**
Expand Down

0 comments on commit 438b39a

Please sign in to comment.