From 67adb569ddf4aee12de999e02c67c1a168dd1a15 Mon Sep 17 00:00:00 2001
From: Felix Kuehling <Felix.Kuehling@amd.com>
Date: Sat, 13 Jul 2019 02:27:34 -0400
Subject: [PATCH] drm/amdgpu: Fix silent amdgpu_bo_move failures
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Under memory pressure, buffer moves between RAM to VRAM  can
fail when there is no GTT space available. In those cases
amdgpu_bo_move falls back to ttm_bo_move_memcpy, which seems to
succeed, although it doesn't really support non-contiguous or
invisible VRAM. This manifests as VM faults with corrupted page
table entries in KFD eviction stress tests.

Print some helpful messages when lack of GTT space is causing buffer
moves to fail. Check that source and destination memory regions are
supported by ttm_bo_move_memcpy before taking that fallback.

Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Reviewed-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 40 +++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 9892f13bcbc01..2bb9420b77a18 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -489,6 +489,7 @@ static int amdgpu_move_vram_ram(struct ttm_buffer_object *bo, bool evict,
 	placements.flags = TTM_PL_MASK_CACHING | TTM_PL_FLAG_TT;
 	r = ttm_bo_mem_space(bo, &placement, &tmp_mem, ctx);
 	if (unlikely(r)) {
+		pr_err("Failed to find GTT space for blit from VRAM\n");
 		return r;
 	}
 
@@ -547,6 +548,7 @@ static int amdgpu_move_ram_vram(struct ttm_buffer_object *bo, bool evict,
 	placements.flags = TTM_PL_MASK_CACHING | TTM_PL_FLAG_TT;
 	r = ttm_bo_mem_space(bo, &placement, &tmp_mem, ctx);
 	if (unlikely(r)) {
+		pr_err("Failed to find GTT space for blit to VRAM\n");
 		return r;
 	}
 
@@ -566,6 +568,30 @@ static int amdgpu_move_ram_vram(struct ttm_buffer_object *bo, bool evict,
 	return r;
 }
 
+/**
+ * amdgpu_mem_visible - Check that memory can be accessed by ttm_bo_move_memcpy
+ *
+ * Called by amdgpu_bo_move()
+ */
+static bool amdgpu_mem_visible(struct amdgpu_device *adev,
+			       struct ttm_mem_reg *mem)
+{
+	struct drm_mm_node *nodes = mem->mm_node;
+
+	if (mem->mem_type == TTM_PL_SYSTEM ||
+	    mem->mem_type == TTM_PL_TT)
+		return true;
+	if (mem->mem_type != TTM_PL_VRAM)
+		return false;
+
+	/* ttm_mem_reg_ioremap only supports contiguous memory */
+	if (nodes->size != mem->num_pages)
+		return false;
+
+	return ((nodes->start + nodes->size) << PAGE_SHIFT)
+		<= adev->gmc.visible_vram_size;
+}
+
 /**
  * amdgpu_bo_move - Move a buffer object to a new memory location
  *
@@ -610,8 +636,10 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
 		return 0;
 	}
 
-	if (!adev->mman.buffer_funcs_enabled)
+	if (!adev->mman.buffer_funcs_enabled) {
+		r = -ENODEV;
 		goto memcpy;
+	}
 
 	if (old_mem->mem_type == TTM_PL_VRAM &&
 	    new_mem->mem_type == TTM_PL_SYSTEM) {
@@ -626,10 +654,16 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, bool evict,
 
 	if (r) {
 memcpy:
-		r = ttm_bo_move_memcpy(bo, ctx, new_mem);
-		if (r) {
+		/* Check that all memory is CPU accessible */
+		if (!amdgpu_mem_visible(adev, old_mem) ||
+		    !amdgpu_mem_visible(adev, new_mem)) {
+			pr_err("Move buffer fallback to memcpy unavailable\n");
 			return r;
 		}
+
+		r = ttm_bo_move_memcpy(bo, ctx, new_mem);
+		if (r)
+			return r;
 	}
 
 	if (bo->type == ttm_bo_type_device &&
-- 
GitLab