Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_process.c')
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_process.c  138
1 file changed, 65 insertions, 73 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 7a33e06f5..58c1fe542 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -664,7 +664,8 @@ int kfd_process_create_wq(void)
if (!kfd_process_wq)
kfd_process_wq = alloc_workqueue("kfd_process_wq", 0, 0);
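+	/* The restore workqueue is ordered and freezable so that pending
+	 * restore work is deferred while tasks are frozen for system
+	 * suspend/hibernation, rather than having to be flushed or
+	 * cancelled explicitly on suspend.
+	 */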
if (!kfd_restore_wq)
- kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq", 0);
+ kfd_restore_wq = alloc_ordered_workqueue("kfd_restore_wq",
+ WQ_FREEZABLE);
if (!kfd_process_wq || !kfd_restore_wq) {
kfd_process_destroy_wq();
@@ -818,9 +819,9 @@ struct kfd_process *kfd_create_process(struct task_struct *thread)
mutex_lock(&kfd_processes_mutex);
if (kfd_is_locked()) {
- mutex_unlock(&kfd_processes_mutex);
pr_debug("KFD is locked! Cannot create process");
- return ERR_PTR(-EINVAL);
+ process = ERR_PTR(-EINVAL);
+ goto out;
}
/* A prior open of /dev/kfd could have already created the process. */
@@ -1109,6 +1110,7 @@ static void kfd_process_wq_release(struct work_struct *work)
{
struct kfd_process *p = container_of(work, struct kfd_process,
release_work);
+ struct dma_fence *ef;
kfd_process_dequeue_from_all_devices(p);
pqm_uninit(&p->pqm);
@@ -1117,7 +1119,9 @@ static void kfd_process_wq_release(struct work_struct *work)
* destroyed. This allows any BOs to be freed without
* triggering pointless evictions or waiting for fences.
*/
- dma_fence_signal(p->ef);
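+	/* Wait for any concurrent RCU readers of p->ef (such as
+	 * signal_eviction_fence()) to finish before dereferencing the
+	 * pointer without taking an extra reference.
+	 */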
+ synchronize_rcu();
+ ef = rcu_access_pointer(p->ef);
+ dma_fence_signal(ef);
kfd_process_remove_sysfs(p);
@@ -1126,7 +1130,7 @@ static void kfd_process_wq_release(struct work_struct *work)
svm_range_list_fini(p);
kfd_process_destroy_pdds(p);
- dma_fence_put(p->ef);
+ dma_fence_put(ef);
kfd_event_free_process(p);
@@ -1642,6 +1646,7 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
struct amdgpu_fpriv *drv_priv;
struct amdgpu_vm *avm;
struct kfd_process *p;
+ struct dma_fence *ef;
struct kfd_node *dev;
int ret;
@@ -1661,13 +1666,13 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
ret = amdgpu_amdkfd_gpuvm_acquire_process_vm(dev->adev, avm,
&p->kgd_process_info,
- &p->ef);
+ &ef);
if (ret) {
pr_err("Failed to create process VM object\n");
return ret;
}
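+	/* Publish the new eviction fence; p->ef is read under RCU
+	 * protection elsewhere (see signal_eviction_fence()).
+	 */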
+ RCU_INIT_POINTER(p->ef, ef);
pdd->drm_priv = drm_file->private_data;
- atomic64_set(&pdd->tlb_seq, 0);
ret = kfd_process_device_reserve_ib_mem(pdd);
if (ret)
@@ -1909,6 +1914,23 @@ kfd_process_gpuid_from_node(struct kfd_process *p, struct kfd_node *node,
return -EINVAL;
}
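+/* Signal the process eviction fence, if one is installed.
+ *
+ * A reference to the current fence is taken under RCU with
+ * dma_fence_get_rcu_safe(), so a concurrent update of p->ef cannot free
+ * the fence while it is being signaled. Returns -EINVAL if no fence is
+ * installed, otherwise the return value of dma_fence_signal().
+ */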
+static int signal_eviction_fence(struct kfd_process *p)
+{
+ struct dma_fence *ef;
+ int ret;
+
+ rcu_read_lock();
+ ef = dma_fence_get_rcu_safe(&p->ef);
+ rcu_read_unlock();
+ if (!ef)
+ return -EINVAL;
+
+ ret = dma_fence_signal(ef);
+ dma_fence_put(ef);
+
+ return ret;
+}
+
static void evict_process_worker(struct work_struct *work)
{
int ret;
@@ -1921,31 +1943,45 @@ static void evict_process_worker(struct work_struct *work)
* lifetime of this thread, kfd_process p will be valid
*/
p = container_of(dwork, struct kfd_process, eviction_work);
- WARN_ONCE(p->last_eviction_seqno != p->ef->seqno,
- "Eviction fence mismatch\n");
-
- /* Narrow window of overlap between restore and evict work
- * item is possible. Once amdgpu_amdkfd_gpuvm_restore_process_bos
- * unreserves KFD BOs, it is possible to evicted again. But
- * restore has few more steps of finish. So lets wait for any
- * previous restore work to complete
- */
- flush_delayed_work(&p->restore_work);
pr_debug("Started evicting pasid 0x%x\n", p->pasid);
ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_TTM);
if (!ret) {
- dma_fence_signal(p->ef);
- dma_fence_put(p->ef);
- p->ef = NULL;
- queue_delayed_work(kfd_restore_wq, &p->restore_work,
- msecs_to_jiffies(PROCESS_RESTORE_TIME_MS));
+ /* If another thread already signaled the eviction fence,
+	 * they are responsible for stopping the queues and scheduling
+ * the restore work.
+ */
+ if (signal_eviction_fence(p) ||
+ mod_delayed_work(kfd_restore_wq, &p->restore_work,
+ msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)))
+ kfd_process_restore_queues(p);
pr_debug("Finished evicting pasid 0x%x\n", p->pasid);
} else
pr_err("Failed to evict queues of pasid 0x%x\n", p->pasid);
}
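+/* Restore the BOs and queues of a process after an eviction.
+ *
+ * amdgpu_amdkfd_gpuvm_restore_process_bos() validates the process BOs
+ * and installs a new eviction fence through &p->ef before the queues
+ * are restarted.
+ */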
+static int restore_process_helper(struct kfd_process *p)
+{
+ int ret = 0;
+
+ /* VMs may not have been acquired yet during debugging. */
+ if (p->kgd_process_info) {
+ ret = amdgpu_amdkfd_gpuvm_restore_process_bos(
+ p->kgd_process_info, &p->ef);
+ if (ret)
+ return ret;
+ }
+
+ ret = kfd_process_restore_queues(p);
+ if (!ret)
+ pr_debug("Finished restoring pasid 0x%x\n", p->pasid);
+ else
+ pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid);
+
+ return ret;
+}
+
static void restore_process_worker(struct work_struct *work)
{
struct delayed_work *dwork;
@@ -1971,24 +2007,15 @@ static void restore_process_worker(struct work_struct *work)
*/
p->last_restore_timestamp = get_jiffies_64();
- /* VMs may not have been acquired yet during debugging. */
- if (p->kgd_process_info)
- ret = amdgpu_amdkfd_gpuvm_restore_process_bos(p->kgd_process_info,
- &p->ef);
+
+ ret = restore_process_helper(p);
if (ret) {
pr_debug("Failed to restore BOs of pasid 0x%x, retry after %d ms\n",
p->pasid, PROCESS_BACK_OFF_TIME_MS);
- ret = queue_delayed_work(kfd_restore_wq, &p->restore_work,
- msecs_to_jiffies(PROCESS_BACK_OFF_TIME_MS));
- WARN(!ret, "reschedule restore work failed\n");
- return;
+ if (mod_delayed_work(kfd_restore_wq, &p->restore_work,
+ msecs_to_jiffies(PROCESS_RESTORE_TIME_MS)))
+ kfd_process_restore_queues(p);
}
-
- ret = kfd_process_restore_queues(p);
- if (!ret)
- pr_debug("Finished restoring pasid 0x%x\n", p->pasid);
- else
- pr_err("Failed to restore queues of pasid 0x%x\n", p->pasid);
}
void kfd_suspend_all_processes(void)
@@ -1999,14 +2026,9 @@ void kfd_suspend_all_processes(void)
WARN(debug_evictions, "Evicting all processes");
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
- cancel_delayed_work_sync(&p->eviction_work);
- flush_delayed_work(&p->restore_work);
-
if (kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_TRIGGER_SUSPEND))
pr_err("Failed to suspend process 0x%x\n", p->pasid);
- dma_fence_signal(p->ef);
- dma_fence_put(p->ef);
- p->ef = NULL;
+ signal_eviction_fence(p);
}
srcu_read_unlock(&kfd_processes_srcu, idx);
}
@@ -2018,7 +2040,7 @@ int kfd_resume_all_processes(void)
int ret = 0, idx = srcu_read_lock(&kfd_processes_srcu);
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
- if (!queue_delayed_work(kfd_restore_wq, &p->restore_work, 0)) {
+ if (restore_process_helper(p)) {
pr_err("Restore process %d failed during resume\n",
p->pasid);
ret = -EFAULT;
@@ -2059,36 +2081,6 @@ int kfd_reserved_mem_mmap(struct kfd_node *dev, struct kfd_process *process,
KFD_CWSR_TBA_TMA_SIZE, vma->vm_page_prot);
}
-void kfd_flush_tlb(struct kfd_process_device *pdd, enum TLB_FLUSH_TYPE type)
-{
- struct amdgpu_vm *vm = drm_priv_to_vm(pdd->drm_priv);
- uint64_t tlb_seq = amdgpu_vm_tlb_seq(vm);
- struct kfd_node *dev = pdd->dev;
- uint32_t xcc_mask = dev->xcc_mask;
- int xcc = 0;
-
- /*
- * It can be that we race and lose here, but that is extremely unlikely
- * and the worst thing which could happen is that we flush the changes
- * into the TLB once more which is harmless.
- */
- if (atomic64_xchg(&pdd->tlb_seq, tlb_seq) == tlb_seq)
- return;
-
- if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS) {
- /* Nothing to flush until a VMID is assigned, which
- * only happens when the first queue is created.
- */
- if (pdd->qpd.vmid)
- amdgpu_amdkfd_flush_gpu_tlb_vmid(dev->adev,
- pdd->qpd.vmid);
- } else {
- for_each_inst(xcc, xcc_mask)
- amdgpu_amdkfd_flush_gpu_tlb_pasid(
- dev->adev, pdd->process->pasid, type, xcc);
- }
-}
-
/* assumes caller holds process lock. */
int kfd_process_drain_interrupts(struct kfd_process_device *pdd)
{