[PATCH v2 1/1] drm/bridge: anx7625: send DPCD command to downstream

2022-01-10 Thread Xin Ji
Send a DPCD command to the downstream sink before anx7625 powers down,
so the downstream monitor enters standby mode.

Signed-off-by: Xin Ji 
---
 drivers/gpu/drm/bridge/analogix/anx7625.c | 40 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 35 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/bridge/analogix/anx7625.c 
b/drivers/gpu/drm/bridge/analogix/anx7625.c
index 33383f83255d..fd2217ae455e 100644
--- a/drivers/gpu/drm/bridge/analogix/anx7625.c
+++ b/drivers/gpu/drm/bridge/analogix/anx7625.c
@@ -129,6 +129,23 @@ static int anx7625_reg_write(struct anx7625_data *ctx,
return ret;
 }
 
+static int anx7625_reg_block_write(struct anx7625_data *ctx,
+  struct i2c_client *client,
+  u8 reg_addr, u8 len, u8 *buf)
+{
+   int ret;
+   struct device *dev = &client->dev;
+
+   i2c_access_workaround(ctx, client);
+
+   ret = i2c_smbus_write_i2c_block_data(client, reg_addr, len, buf);
+   if (ret < 0)
+   DRM_DEV_ERROR(dev, "write i2c block failed id=%x:%x\n",
+ client->addr, reg_addr);
+
+   return ret;
+}
+
 static int anx7625_write_or(struct anx7625_data *ctx,
struct i2c_client *client,
u8 offset, u8 mask)
@@ -214,8 +231,8 @@ static int wait_aux_op_finish(struct anx7625_data *ctx)
return 0;
 }
 
-static int anx7625_aux_dpcd_read(struct anx7625_data *ctx,
-u32 address, u8 len, u8 *buf)
+static int anx7625_aux_dpcd_trans(struct anx7625_data *ctx, u8 op,
+ u32 address, u8 len, u8 *buf)
 {
struct device *dev = &ctx->client->dev;
int ret;
@@ -231,8 +248,7 @@ static int anx7625_aux_dpcd_read(struct anx7625_data *ctx,
addrm = (address >> 8) & 0xFF;
addrh = (address >> 16) & 0xFF;
 
-   cmd = DPCD_CMD(len, DPCD_READ);
-   cmd = ((len - 1) << 4) | 0x09;
+   cmd = DPCD_CMD(len, op);
 
/* Set command and length */
ret = anx7625_reg_write(ctx, ctx->i2c.rx_p0_client,
@@ -246,6 +262,9 @@ static int anx7625_aux_dpcd_read(struct anx7625_data *ctx,
ret |= anx7625_reg_write(ctx, ctx->i2c.rx_p0_client,
 AP_AUX_ADDR_19_16, addrh);
 
+   if (op == DPCD_WRITE)
+   ret |= anx7625_reg_block_write(ctx, ctx->i2c.rx_p0_client,
+  AP_AUX_BUFF_START, len, buf);
/* Enable aux access */
ret |= anx7625_write_or(ctx, ctx->i2c.rx_p0_client,
AP_AUX_CTRL_STATUS, AP_AUX_CTRL_OP_EN);
@@ -263,6 +282,11 @@ static int anx7625_aux_dpcd_read(struct anx7625_data *ctx,
return ret;
}
 
+   /* Write done */
+   if (op == DPCD_WRITE)
+   return 0;
+
+   /* Read done, read out dpcd data */
ret = anx7625_reg_block_read(ctx, ctx->i2c.rx_p0_client,
 AP_AUX_BUFF_START, len, buf);
if (ret < 0) {
@@ -845,7 +869,7 @@ static int anx7625_hdcp_enable(struct anx7625_data *ctx)
}
 
/* Read downstream capability */
-   anx7625_aux_dpcd_read(ctx, 0x68028, 1, &bcap);
+   anx7625_aux_dpcd_trans(ctx, DPCD_READ, 0x68028, 1, &bcap);
if (!(bcap & 0x01)) {
pr_warn("downstream not support HDCP 1.4, cap(%x).\n", bcap);
return 0;
@@ -918,6 +942,7 @@ static void anx7625_dp_stop(struct anx7625_data *ctx)
 {
struct device *dev = &ctx->client->dev;
int ret;
+   u8 data;
 
DRM_DEV_DEBUG_DRIVER(dev, "stop dp output\n");
 
@@ -929,6 +954,11 @@ static void anx7625_dp_stop(struct anx7625_data *ctx)
ret |= anx7625_write_and(ctx, ctx->i2c.tx_p2_client, 0x08, 0x7f);
 
ret |= anx7625_video_mute_control(ctx, 1);
+
+   DRM_DEV_DEBUG_DRIVER(dev, "notify downstream enter into standby\n");
+   /* Downstream monitor enter into standby mode */
+   data = 2;
+   ret |= anx7625_aux_dpcd_trans(ctx, DPCD_WRITE, 0x000600, 1, &data);
if (ret < 0)
DRM_DEV_ERROR(dev, "IO error : mute video fail\n");
 
-- 
2.25.1
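
For reference, the standby write added in anx7625_dp_stop() maps onto the
DRM DP helper constants; this is only a restated sketch, since the driver
drives its own AUX engine rather than going through struct drm_dp_aux:

	#include <drm/drm_dp_helper.h>

	u8 data = DP_SET_POWER_D3;	/* 0x2: put the sink into standby */

	anx7625_aux_dpcd_trans(ctx, DPCD_WRITE,
			       DP_SET_POWER /* 0x600 */, 1, &data);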



Re: Phyr Starter

2022-01-10 Thread Matthew Wilcox
On Mon, Jan 10, 2022 at 08:41:26PM -0400, Jason Gunthorpe wrote:
> On Mon, Jan 10, 2022 at 07:34:49PM +, Matthew Wilcox wrote:
> 
> > Finally, it may be possible to stop using scatterlist to describe the
> > input to the DMA-mapping operation.  We may be able to get struct
> > scatterlist down to just dma_address and dma_length, with chaining
> > handled through an enclosing struct.
> 
> Can you talk about this some more? IMHO one of the key properties of
> the scatterlist is that it can hold huge amounts of pages without
> having to do any kind of special allocation due to the chaining.
> 
> The same will be true of the phyr idea right?

My thinking is that we'd pass a relatively small array of phyr (maybe 16
entries) to get_user_phyr().  If that turned out not to be big enough,
then we have two options; one is to map those 16 ranges with sg and use
the sg chaining functionality before throwing away the phyr and calling
get_user_phyr() again.  The other is to stash those 16 ranges somewhere
(eg a resizing array of some kind) and keep calling get_user_phyr()
to get the next batch of 16; once we've got the entire range, call
sg_map_phyr() passing all of the phyrs.

> > I would like to see phyr replace bio_vec everywhere it's currently used.
> > I don't have time to do that work now because I'm busy with folios.
> > If someone else wants to take that on, I shall cheer from the sidelines.
> > What I do intend to do is:
> 
> I wonder if we've mixed things up, though...
> 
> IMHO there is a lot of optimization to be had by having a
> datastructure that is expressly 'the physical pages underlying a
> contiguous chunk of va'
> 
> If you limit to that scenario then we can be more optimal because
> things like byte granular offsets and size in the interior pages don't
> need to exist. Every interior chunk is always aligned to its order and
> we only need to record the order.
> 
> An overall starting offset and total length allow computing the slice
> of the first/last entry.
> 
> If the physical address is always aligned then we get 12 free bits
> from the min 4k alignment and also only need to store order, not an
> arbitrary byte granular length.
> 
> The win is I think we can meaningfully cover most common cases using
> only 8 bytes per physical chunk. The 12 bits can be used to encode the
> common orders (4k, 2M, 1G, etc) and some smart mechanism to get
> another 16 bits to cover 'everything'.
> 
> IMHO storage density here is quite important, we end up having to keep
> this stuff around for a long time.
> 
> I say this here, because I've always thought bio_vec/etc are more
> general than we actually need, being byte granular at every chunk.

Oh, I can do you one better on the bit-packing scheme.  There's a
representation of every power-of-two that is naturally aligned, using
just one extra bit.  Let's say we have 3 bits of address space and
4 bits to represent any power of two allocation within that address
space:

0000 index-0, order-0
0010 index-1, order-0
...
1110 index-7, order-0
0001 index-0, order-1
0101 index-2, order-1
1001 index-4, order-1
1101 index-6, order-1
0011 index-0, order-2
1011 index-4, order-2
0111 index-0, order-3

1111 has no meaning and can be used to represent an invalid range, if
that's useful.  The lowest clear bit decodes to the order, and
(x & (x+1))/2 gets you the index.
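
For concreteness, a small sketch of that decode (and the matching encode);
the helper names are made up, and x must not be the all-ones invalid
encoding:

	/* The low 'order' bits of x are set, the next bit is clear, and
	 * the (naturally aligned) index lives in the bits above that. */
	static inline unsigned int pot_order(unsigned int x)
	{
		return __builtin_ctz(~x);	/* lowest clear bit */
	}

	static inline unsigned int pot_index(unsigned int x)
	{
		return (x & (x + 1)) / 2;
	}

	/* Assumes index is a multiple of 1 << order (natural alignment). */
	static inline unsigned int pot_encode(unsigned int index,
					      unsigned int order)
	{
		return (index << 1) | ((1U << order) - 1);
	}

So 0101 decodes as order = ctz(~5) = 1, index = (5 & 6) / 2 = 2, matching
the table above.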

That leaves you with another 11 bits to represent something smart about
partial pages.

The question is whether this is the right kind of optimisation to be
doing.  I hear you that we want a dense format, but it's questionable
whether the kind of thing you're suggesting is actually denser than this
scheme.  For example, if we have 1GB pages and userspace happens to have
allocated pages (3, 4, 5, 6, 7, 8, 9, 10) then this can be represented
as a single phyr.  A power-of-two scheme would have us use four entries
(3, 4-7, 8-9, 10).

Using a (dma_addr, size_t) tuple makes coalescing adjacent pages very
cheap.  If I have to walk PTEs looking for pages which can be combined
together, I end up with interesting behaviour where the length of the
list shrinks and expands.  Using the example above, as I walk successive
PUDs, the data struct looks like this:

(3)
(3, 4)
(3, 4-5)
(3, 4-5, 6)
(3, 4-7)
(3, 4-7, 8)
(3, 4-7, 8-9)
(3, 4-7, 8-9, 10)

We could end up with a situation where we stop because the array is
full, even though if we kept going, it'd shrink back down below the
length of the array (in this example, an array of length 2 would stop
when it saw page 6, even though page 7 shrinks it back down again).
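
A minimal sketch of why the (dma_addr, size_t) append is cheap: only the
last entry ever needs checking, unlike the buddy-style merging above that
makes the power-of-two list shrink and regrow. This reuses the struct phyr
({ phys_addr_t addr; size_t len; }) sketched earlier; the helper itself is
hypothetical:

	static bool phyr_append(struct phyr *v, unsigned int *n,
				unsigned int max, phys_addr_t addr, size_t len)
	{
		/* Adjacent to the previous entry: coalesce in place. */
		if (*n && v[*n - 1].addr + v[*n - 1].len == addr) {
			v[*n - 1].len += len;
			return true;
		}
		if (*n == max)
			return false;	/* array full: the stop case above */
		v[*n].addr = addr;
		v[*n].len = len;
		(*n)++;
		return true;
	}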

> What is needed is a full scatterlist replacement, including the IOMMU
> part.
> 
> For the IOMMU I would expect the datastructure to be re-used, we start
> with a list of physical pages and then 'dma map' gives us a list of
> IOVA physical pages, in another allocation, but exactly the same
> datastructure.
> 
> This 'dma map' could return a pointer to the first datastructure if
> there is no iommu, allocate a single entry list if the whole thing can
> be linearly mapped with the iommu, and other baroque cases (like pci
> offset/etc) will need to allocate full array. ie good HW runs fast and
> is memory efficient.

Re: [git pull] drm for 5.17-rc1 (pre-merge window pull)

2022-01-10 Thread Linus Torvalds
On Mon, Jan 10, 2022 at 6:44 PM Linus Torvalds
 wrote:
>
> I'll double-check to see if a revert fixes it at the top of my tree.

Yup. It reverts cleanly, and the end result builds and works fine, and
doesn't show the horrendous flickering.

I have done that revert, and will continue the merge window work.
Somebody else gets to figure out what the actual bug is, but that
commit was horribly broken on my machine (Sapphire Pulse RX 580 8GB,
fwiw).

   Linus


Re: [git pull] drm for 5.17-rc1 (pre-merge window pull)

2022-01-10 Thread Linus Torvalds
On Mon, Jan 10, 2022 at 6:22 PM Linus Torvalds
 wrote:
>
> and I guess I'll do the few more bisections to pick out the exact one.

a896f870f8a5f23ec961d16baffd3fda1f8be57c is the first bad commit.

Attaching the BISECT_LOG in case anybody cares.

I'll double-check to see if a revert fixes it at the top of my tree.

Linus


BISECT_LOG
Description: Binary data


Re: [PATCH] drm/panel: Extend ACX424AKP bindings to ACX424AKM

2022-01-10 Thread Rob Herring
On Mon, 03 Jan 2022 10:35:01 +0100, Linus Walleij wrote:
> The panel ACX424AKP seems to only be used in prototypes, whereas
> real products use the 10 pixels shorter ACX424AKM. Extend the
> ACX424AKP bindings to also cover the ACX424AKM. The ACX424AKM
> was used in a few different mobile phones from Sony Mobile.
> 
> Cc: devicet...@vger.kernel.org
> Cc: phone-de...@vger.kernel.org
> Signed-off-by: Linus Walleij 
> ---
>  .../bindings/display/panel/sony,acx424akp.yaml| 11 +--
>  1 file changed, 9 insertions(+), 2 deletions(-)
> 

Acked-by: Rob Herring 


[PATCH 1/1] Add test for new hsaKmtAvailableMemory library call

2022-01-10 Thread Daniel Phillips
Using DefaultGPUNode now instead of system memory, usage similar to
other tests. Also cleaned up pSmall, which I originally intended to
just let float away on the mistaken assumption that it would be
cleaned up automatically at the end of the test.

Basic test for the new hsaKmtAvailableMemory library call. This is
a standalone test, does not modify any of the other tests just to
be on the safe side. More elaborate tests coming soon.

Signed-off-by: Daniel Phillips 
Change-Id: I645006a89bd8d55ef7b1605611e8ef0c010dad1a
---
 tests/kfdtest/src/KFDMemoryTest.cpp | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tests/kfdtest/src/KFDMemoryTest.cpp 
b/tests/kfdtest/src/KFDMemoryTest.cpp
index 9f62727..d9016de 100644
--- a/tests/kfdtest/src/KFDMemoryTest.cpp
+++ b/tests/kfdtest/src/KFDMemoryTest.cpp
@@ -595,6 +595,26 @@ TEST_F(KFDMemoryTest, MemoryAlloc) {
 TEST_END
 }
 
+// Basic test for hsaKmtAvailableMemory
+TEST_F(KFDMemoryTest, MemoryAllocAll) {
+TEST_START(TESTPROFILE_RUNALL)
+
+int defaultGPUNode = m_NodeInfo.HsaDefaultGPUNode();
+unsigned int* pBig = NULL;
+unsigned int* pSmall = NULL;
+m_MemoryFlags.ui32.NoNUMABind = 1;
+HSAuint64 available;
+
+EXPECT_SUCCESS(hsaKmtAvailableMemory(defaultGPUNode, &available));
+EXPECT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode, available, m_MemoryFlags, reinterpret_cast<void**>(&pBig)));
+EXPECT_NE(HSAKMT_STATUS_SUCCESS, hsaKmtAllocMemory(defaultGPUNode, PAGE_SIZE, m_MemoryFlags, reinterpret_cast<void**>(&pSmall)));
+EXPECT_SUCCESS(hsaKmtFreeMemory(pBig, available));
+EXPECT_SUCCESS(hsaKmtAllocMemory(defaultGPUNode, PAGE_SIZE, m_MemoryFlags, reinterpret_cast<void**>(&pSmall)));
+EXPECT_SUCCESS(hsaKmtFreeMemory(pSmall, PAGE_SIZE));
+
+TEST_END
+}
+
 TEST_F(KFDMemoryTest, AccessPPRMem) {
 TEST_START(TESTPROFILE_RUNALL)
 
-- 
2.34.1



Re: [git pull] drm for 5.17-rc1 (pre-merge window pull)

2022-01-10 Thread Linus Torvalds
On Mon, Jan 10, 2022 at 5:21 PM Linus Torvalds
 wrote:
>
> I'll see if I can bisect it at least partially.

It seems to be very reliable. I can see the flickering even at early
boot before gdb has started - the graphical screen where you type the
encrypted disk password at boot already shows it as you type.

Right now it is

  bad: 9602044d1cc12280e20c5f2cd640ae80f69e
  good: 3867e3704f136beadf5e004b61696ef7f990bee4

so it's going to be one of these:

  9602044d1cc1 drm/amd/display: Fix for the no Audio bug with Tiled Displays
  a896f870f8a5 drm/amd/display: Fix for otg synchronization logic
  aba3c3fede54 drm/amd/display: Clear DPCD lane settings after repeater training
  9311ed1e1241 drm/amd/display: add hdmi disable debug check
  6421f7c750e9 drm/amd/display: Allow DSC on supported MST branch devices
  ebe5ffd8e271 drm/amd/display: Enable P010 for DCN3x ASICs
  c022375ae095 drm/amd/display: Add DP-HDMI FRL PCON Support in DC
  50b1f44ec547 drm/amd/display: Add DP-HDMI FRL PCON SST Support in DM
  81d104f4afbf drm/amdgpu: Don't halt RLC on GFX suspend
  fe9c5c9affc9 drm/amdgpu: Use MAX_HWIP instead of HW_ID_MAX
  370016988665 drm/amdgpu: fix the missed handling for SDMA2 and SDMA3
  6c18ecefaba7 drm/amdgpu: declare static function to fix compiler warning
  94a80b5bc7a2 amdgpu/pm: Modify implmentations of
get_power_profile_mode to use amdgpu_pp_profile_name

and I guess I'll do the few more bisections to pick out the exact one.

 Linus


[PATCH 1/1] Add available memory ioctl for libhsakmt

2022-01-10 Thread Daniel Phillips
Add an ioctl to inquire memory available for allocation by libhsakmt
per node, allowing for space consumed by page translation tables.

This ioctl is the underlying mechanism for the new memory availability
library call posted for review here:

   https://lists.freedesktop.org/archives/amd-gfx/2022-January/073352.html

Signed-off-by: Daniel Phillips 

---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h  |  1 +
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c| 14 ++++++++++++++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c| 17 +++++++++++++++++
 include/uapi/linux/kfd_ioctl.h  | 14 ++++++++++++--
 4 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index fcbc8a9c9e06..64c6c36685d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -266,6 +266,7 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct 
amdgpu_device *adev,
 void amdgpu_amdkfd_gpuvm_release_process_vm(struct amdgpu_device *adev,
void *drm_priv);
 uint64_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *drm_priv);
+size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev);
 int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
struct amdgpu_device *adev, uint64_t va, uint64_t size,
void *drm_priv, struct kgd_mem **mem,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 86a1a6c109d9..b7490a659173 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -190,6 +190,20 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
return ret;
 }
 
+size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev)
+{
+   uint64_t reserved_for_pt =
+   ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
+   size_t available_memory;
+
+   spin_lock(&kfd_mem_limit.mem_limit_lock);
+   available_memory =
+   adev->gmc.real_vram_size -
+   adev->kfd.vram_used - reserved_for_pt;
+   spin_unlock(&kfd_mem_limit.mem_limit_lock);
+   return available_memory;
+}
+
 static void unreserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 4bfc0c8ab764..5c2f6d97ff1c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -486,6 +486,20 @@ static int kfd_ioctl_get_queue_wave_state(struct file 
*filep,
return r;
 }
 
+static int kfd_ioctl_get_available_memory(struct file *filep,
+struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_get_available_memory_args *args = data;
+   struct kfd_dev *dev;
+
+   dev = kfd_device_by_id(args->gpu_id);
+   if (!dev)
+   return -EINVAL;
+
+   args->available = amdgpu_amdkfd_get_available_memory(dev->adev);
+   return 0;
+}
+
 static int kfd_ioctl_set_memory_policy(struct file *filep,
struct kfd_process *p, void *data)
 {
@@ -1959,6 +1973,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_XNACK_MODE,
kfd_ioctl_set_xnack_mode, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_AVAILABLE_MEMORY,
+   kfd_ioctl_get_available_memory, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNTARRAY_SIZE(amdkfd_ioctls)
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index af96af174dc4..94a99add2432 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -32,9 +32,10 @@
  * - 1.4 - Indicate new SRAM EDC bit in device properties
  * - 1.5 - Add SVM API
  * - 1.6 - Query clear flags in SVM get_attr API
+ * - 1.7 - Add available_memory ioctl
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 6
+#define KFD_IOCTL_MINOR_VERSION 7
 
 struct kfd_ioctl_get_version_args {
__u32 major_version;/* from KFD */
@@ -98,6 +99,12 @@ struct kfd_ioctl_get_queue_wave_state_args {
__u32 pad;
 };
 
+struct kfd_ioctl_get_available_memory_args {
+   __u64 available;/* from KFD */
+   __u32 gpu_id;   /* to KFD */
+   __u32 pad;
+};
+
 /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */
 #define KFD_IOC_CACHE_POLICY_COHERENT 0
 #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1
@@ -742,7 +749,10 @@ struct kfd_ioctl_set_xnack_mode_args {
 #define AMDKFD_IOC_SET_XNACK_MODE  \
AMDKFD_IOWR(0x21, struct kfd_ioctl_set_xnack_mode_args)
 
+#define AMDKFD_IOC_AVAILABLE_MEMORY\
+   AMDKFD_IOR(0x22, struct kfd_ioctl_get_available_memory_args)
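
How user space (e.g. the libhsakmt call this backs) might invoke the new
ioctl — a sketch, with the gpu_id discovery and error handling elided:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/kfd_ioctl.h>

	struct kfd_ioctl_get_available_memory_args args = {0};
	int fd = open("/dev/kfd", O_RDWR | O_CLOEXEC);

	args.gpu_id = gpu_id;	/* from the sysfs topology nodes */
	if (ioctl(fd, AMDKFD_IOC_AVAILABLE_MEMORY, &args) == 0)
		printf("node %u: %llu bytes available\n", gpu_id,
		       (unsigned long long)args.available);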

Re: [git pull] drm for 5.17-rc1 (pre-merge window pull)

2022-01-10 Thread Linus Torvalds
On Mon, Jan 10, 2022 at 5:21 PM Linus Torvalds
 wrote:
>
> It also seems to depend a bit on the screen contents - or possibly on
> what else is going on. Hiding the browser window makes it happen less,
> I think. But I suspect that's about "less gpu activity" than anything
> else.

Actually, sometimes "more activity" makes it go away too. Moving a
window around wildly with the mouse makes it *stop* happening.

But moving the mouse over different elements of the screen - or
writing text in the web browser email window - seems to make it worse.

Funky.

It does "feel" to me like some bandwidth limitation, it has kind of
the same behavior that I remember from the bad old times when you were
pushing the video card past a resolution that it could really handle.

But that can't be the case, this card has had no problems with this before.

   Linus


Re: [git pull] drm for 5.17-rc1 (pre-merge window pull)

2022-01-10 Thread Linus Torvalds
On Mon, Jan 10, 2022 at 5:11 PM Alex Deucher  wrote:
>
> We are putting together a system to try and repro the issue.  Does it
> happen with a single monitor or only with two?

Nope. With a single monitor everything seems to look fine. And when I
plug in the second monitor, it immediately starts happening again.

It also seems to depend a bit on the screen contents - or possibly on
what else is going on. Hiding the browser window makes it happen less,
I think. But I suspect that's about "less gpu activity" than anything
else.

I'll see if I can bisect it at least partially.

  Linus


Re: [git pull] drm for 5.17-rc1 (pre-merge window pull)

2022-01-10 Thread Alex Deucher
On Mon, Jan 10, 2022 at 8:04 PM Linus Torvalds
 wrote:
>
> On Mon, Jan 10, 2022 at 2:13 PM Alex Deucher  wrote:
> >
> > Sounds like something related to watermarks.  That said, we haven't
> > really touched the display code for DCE11 cards in quite a while.  Can
> > you provide your dmesg output?
>
> I'm not seeing anything that would look interesting, but here's the
> parts that look relevant for drm..

We are putting together a system to try and repro the issue.  Does it
happen with a single monitor or only with two?

Thanks,

Alex


Re: [git pull] drm for 5.17-rc1 (pre-merge window pull)

2022-01-10 Thread Linus Torvalds
On Mon, Jan 10, 2022 at 2:13 PM Alex Deucher  wrote:
>
> Sounds like something related to watermarks.  That said, we haven't
> really touched the display code for DCE11 cards in quite a while.  Can
> you provide your dmesg output?

I'm not seeing anything that would look interesting, but here's the
parts that look relevant for drm..

  Linus


dmesg-gpu
Description: Binary data


Re: Phyr Starter

2022-01-10 Thread Jason Gunthorpe
On Mon, Jan 10, 2022 at 07:34:49PM +, Matthew Wilcox wrote:

> Finally, it may be possible to stop using scatterlist to describe the
> input to the DMA-mapping operation.  We may be able to get struct
> scatterlist down to just dma_address and dma_length, with chaining
> handled through an enclosing struct.

Can you talk about this some more? IMHO one of the key properties of
the scatterlist is that it can hold huge amounts of pages without
having to do any kind of special allocation due to the chaining.

The same will be true of the phyr idea right?

> I would like to see phyr replace bio_vec everywhere it's currently used.
> I don't have time to do that work now because I'm busy with folios.
> If someone else wants to take that on, I shall cheer from the sidelines.
> What I do intend to do is:

I wonder if we've mixed things up, though...

IMHO there is a lot of optimization to be had by having a
datastructure that is expressly 'the physical pages underlying a
contiguous chunk of va'

If you limit to that scenario then we can be more optimal because
things like byte granular offsets and size in the interior pages don't
need to exist. Every interior chunk is always aligned to its order and
we only need to record the order.

An overall starting offset and total length allow computing the slice
of the first/last entry.

If the physical address is always aligned then we get 12 free bits
from the min 4k alignment and also only need to store order, not an
arbitrary byte granular length.

The win is I think we can meaningfully cover most common cases using
only 8 bytes per physical chunk. The 12 bits can be used to encode the
common orders (4k, 2M, 1G, etc) and some smart mechanism to get
another 16 bits to cover 'everything'.

IMHO storage density here is quite important, we end up having to keep
this stuff around for a long time.

I say this here, because I've always thought bio_vec/etc are more
general than we actually need, being byte granular at every chunk.

>  - Add an interface to gup.c to pin/unpin N phyrs
>  - Add a sg_map_phyrs()
>This will take an array of phyrs and allocate an sg for them
>  - Whatever else I need to do to make one RDMA driver happy with
>this scheme

I spent a lot of time already cleaning all the DMA code in RDMA - it is
now nicely uniform and ready to do this sort of change. I was
expecting it to be a bio_vec, but this is fine too.

What is needed is a full scatterlist replacement, including the IOMMU
part.

For the IOMMU I would expect the datastructure to be re-used, we start
with a list of physical pages and then 'dma map' gives us a list of
IOVA physical pages, in another allocation, but exactly the same
datastructure.

This 'dma map' could return a pointer to the first datastructure if
there is no iommu, allocate a single entry list if the whole thing can
be linearly mapped with the iommu, and other baroque cases (like pci
offset/etc) will need to allocate full array. ie good HW runs fast and
is memory efficient.
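
The shape being described, as a declaration-only sketch (all names here
are hypothetical):

	struct phyr_list {
		unsigned int nr;
		struct phyr ranges[];	/* CPU-physical or IOVA, same layout */
	};

	/*
	 * Returns @phys itself when there is no IOMMU, a freshly allocated
	 * one-entry list when the IOMMU can map the whole thing linearly,
	 * or a full per-range array for the baroque cases (PCI offset etc.).
	 */
	struct phyr_list *phyr_dma_map(struct device *dev,
				       struct phyr_list *phys);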

It would be nice to see a patch sketching showing what this
datastructure could look like.

VFIO would like this structure as well as it currently is a very
inefficient page at a time loop when it iommu maps things.

Jason



[PATCH] dma-buf-map: Fix dot vs comma in example

2022-01-10 Thread Lucas De Marchi
Fix typo: separate arguments with comma rather than dot.

Signed-off-by: Lucas De Marchi 
---
 include/linux/dma-buf-map.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/dma-buf-map.h b/include/linux/dma-buf-map.h
index 278d489e4bdd..19fa0b5ae5ec 100644
--- a/include/linux/dma-buf-map.h
+++ b/include/linux/dma-buf-map.h
@@ -52,13 +52,13 @@
  *
  * struct dma_buf_map map = DMA_BUF_MAP_INIT_VADDR(0xdeadbeaf);
  *
- * dma_buf_map_set_vaddr(&map. 0xdeadbeaf);
+ * dma_buf_map_set_vaddr(&map, 0xdeadbeaf);
  *
  * To set an address in I/O memory, use dma_buf_map_set_vaddr_iomem().
  *
  * .. code-block:: c
  *
- * dma_buf_map_set_vaddr_iomem(&map. 0xdeadbeaf);
+ * dma_buf_map_set_vaddr_iomem(&map, 0xdeadbeaf);
  *
  * Instances of struct dma_buf_map do not have to be cleaned up, but
  * can be cleared to NULL with dma_buf_map_clear(). Cleared mappings
-- 
2.34.1



Re: [Patch v4 18/24] drm/amdkfd: CRIU checkpoint and restore xnack mode

2022-01-10 Thread Felix Kuehling

On 2022-01-05 10:22 a.m., philip yang wrote:



On 2021-12-22 7:37 p.m., Rajneesh Bhardwaj wrote:

Recoverable page faults are represented by the xnack mode setting inside
a kfd process and are used to represent the device page faults. For CR,
we don't consider negative values which are typically used for querying
the current xnack mode without modifying it.

Signed-off-by: Rajneesh Bhardwaj
---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 15 +++++++++++++++
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  1 +
  2 files changed, 16 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 178b0ccfb286..446eb9310915 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1845,6 +1845,11 @@ static int criu_checkpoint_process(struct kfd_process *p,
memset(&process_priv, 0, sizeof(process_priv));
  
  	process_priv.version = KFD_CRIU_PRIV_VERSION;

+   /* For CR, we don't consider negative xnack mode which is used for
+* querying without changing it, here 0 simply means disabled and 1
+* means enabled so retry for finding a valid PTE.
+*/
The negative value used to query the xnack mode is part of the
kfd_ioctl_set_xnack_mode user space ioctl interface, which is not used
by CRIU, so I think this comment is misleading.

+   process_priv.xnack_mode = p->xnack_enabled ? 1 : 0;

change to process_priv.xnack_enabled
  
  	ret = copy_to_user(user_priv_data + *priv_offset,

&process_priv, sizeof(process_priv));
@@ -2231,6 +2236,16 @@ static int criu_restore_process(struct kfd_process *p,
return -EINVAL;
}
  
+	pr_debug("Setting XNACK mode\n");

+   if (process_priv.xnack_mode && !kfd_process_xnack_mode(p, true)) {
+   pr_err("xnack mode cannot be set\n");
+   ret = -EPERM;
+   goto exit;
+   } else {


On GFXv9 GPUs except Aldebaran, this means a process checkpointed with
xnack off could be restored and resumed on a GPU with xnack on; the
shader would continue running successfully, but the driver is not
guaranteed to map svm ranges on the GPU all the time, so if a retry
fault happens, the shader will not recover. Maybe change to:


if (KFD_GC_VERSION(dev) != IP_VERSION(9, 4, 2)) {

The code here was correct. The xnack mode applies to the whole process, 
not just one GPU. The logic for checking the capabilities of all GPUs is 
already in kfd_process_xnack_mode. If XNACK cannot be supported by all 
GPUs, restoring a non-0 XNACK mode will fail.


Any GPU can run in XNACK-disabled mode. So we don't need any limitations 
for process_priv.xnack_enabled == 0.


Regards,
  Felix



	if (process_priv.xnack_enabled != kfd_process_xnack_mode(p, true)) {
		pr_err("xnack mode cannot be set\n");
		ret = -EPERM;
		goto exit;
	}
}

pr_debug("set xnack mode: %d\n", process_priv.xnack_enabled);
p->xnack_enabled = process_priv.xnack_enabled;



+   pr_debug("set xnack mode: %d\n", process_priv.xnack_mode);
+   p->xnack_enabled = process_priv.xnack_mode;
+   }
+
  exit:
return ret;
  }
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 855c162b85ea..d72dda84c18c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1057,6 +1057,7 @@ void kfd_process_set_trap_handler(struct 
qcm_process_device *qpd,
  
  struct kfd_criu_process_priv_data {

uint32_t version;
+   uint32_t xnack_mode;


bool xnack_enabled;

Regards,

Philip


  };
  
  struct kfd_criu_device_priv_data {


Re: [Patch v4 24/24] drm/amdkfd: CRIU resume shared virtual memory ranges

2022-01-10 Thread Felix Kuehling

On 2021-12-22 7:37 p.m., Rajneesh Bhardwaj wrote:

In CRIU resume stage, resume all the shared virtual memory ranges from
the data stored inside the resuming kfd process during CRIU restore
phase. Also set up xnack mode and free up the resources.

Signed-off-by: Rajneesh Bhardwaj 
---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 10 ++++++++++
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
  drivers/gpu/drm/amd/amdkfd/kfd_svm.h |  6 ++++++
  3 files changed, 71 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f7aa15b18f95..6191e37656dd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2759,7 +2759,17 @@ static int criu_resume(struct file *filep,
}
  
  	mutex_lock(&target->mutex);

+   ret = kfd_criu_resume_svm(target);
+   if (ret) {
+   pr_err("kfd_criu_resume_svm failed for %i\n", args->pid);
+   goto exit;
+   }
+
ret =  amdgpu_amdkfd_criu_resume(target->kgd_process_info);
+   if (ret)
+   pr_err("amdgpu_amdkfd_criu_resume failed for %i\n", args->pid);
+
+exit:
mutex_unlock(&target->mutex);
  
  	kfd_unref_process(target);

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index e9f6c63c2a26..bd2dce37f345 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -3427,6 +3427,61 @@ svm_range_get_attr(struct kfd_process *p, struct 
mm_struct *mm,
return 0;
  }
  
+int kfd_criu_resume_svm(struct kfd_process *p)

+{
+   int nattr_common = 4, nattr_accessibility = 1;
+   struct criu_svm_metadata *criu_svm_md = NULL;
+   struct criu_svm_metadata *next = NULL;
+   struct svm_range_list *svms = &p->svms;
+   int i, j, num_attrs, ret = 0;
+   struct mm_struct *mm;
+
+   if (list_empty(&svms->criu_svm_metadata_list)) {
+   pr_debug("No SVM data from CRIU restore stage 2\n");
+   return ret;
+   }
+
+   mm = get_task_mm(p->lead_thread);
+   if (!mm) {
+   pr_err("failed to get mm for the target process\n");
+   return -ESRCH;
+   }
+
+   num_attrs = nattr_common + (nattr_accessibility * p->n_pdds);
+
+   i = j = 0;
+   list_for_each_entry(criu_svm_md, &svms->criu_svm_metadata_list, list) {
+   pr_debug("criu_svm_md[%d]\n\tstart: 0x%llx size: 0x%llx 
(npages)\n",
+i, criu_svm_md->start_addr, criu_svm_md->size);
+   for (j = 0; j < num_attrs; j++) {
+   pr_debug("\ncriu_svm_md[%d]->attrs[%d].type : 0x%x 
\ncriu_svm_md[%d]->attrs[%d].value : 0x%x\n",
+i,j, criu_svm_md->attrs[j].type,
+i,j, criu_svm_md->attrs[j].value);
+   }


Is this super-detailed debug output really needed?

Regards,
  Felix



+
+   ret = svm_range_set_attr(p, mm, criu_svm_md->start_addr,
+criu_svm_md->size, num_attrs,
+criu_svm_md->attrs);
+   if (ret) {
+   pr_err("CRIU: failed to set range attributes\n");
+   goto exit;
+   }
+
+   i++;
+   }
+
+exit:
+   list_for_each_entry_safe(criu_svm_md, next, 
&svms->criu_svm_metadata_list, list) {
+   pr_debug("freeing criu_svm_md[]\n\tstart: 0x%llx\n",
+   criu_svm_md->start_addr);
+   kfree(criu_svm_md);
+   }
+
+   mmput(mm);
+   return ret;
+
+}
+
  int svm_criu_prepare_for_resume(struct kfd_process *p,
struct kfd_criu_svm_range_priv_data *svm_priv)
  {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
index e0c0853f085c..3b5bcb52723c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -195,6 +195,7 @@ int kfd_criu_restore_svm(struct kfd_process *p,
 uint8_t __user *user_priv_ptr,
 uint64_t *priv_data_offset,
 uint64_t max_priv_data_size);
+int kfd_criu_resume_svm(struct kfd_process *p);
  struct kfd_process_device *
  svm_range_get_pdd_by_adev(struct svm_range *prange, struct amdgpu_device 
*adev);
  void svm_range_list_lock_and_flush_work(struct svm_range_list *svms, struct 
mm_struct *mm);
@@ -256,6 +257,11 @@ static inline int kfd_criu_restore_svm(struct kfd_process 
*p,
return -EINVAL;
  }
  
+static inline int kfd_criu_resume_svm(struct kfd_process *p)

+{
+   return 0;
+}
+
  #define KFD_IS_SVM_API_SUPPORTED(dev) false
  
  #endif /* IS_ENABLED(CONFIG_HSA_AMD_SVM) */


Re: [Patch v4 23/24] drm/amdkfd: CRIU prepare for svm resume

2022-01-10 Thread Felix Kuehling

On 2022-01-05 9:43 a.m., philip yang wrote:



On 2021-12-22 7:37 p.m., Rajneesh Bhardwaj wrote:

During CRIU restore phase, the VMAs for the virtual address ranges are
not at their final location yet so in this stage, only cache the data
required to successfully resume the svm ranges during an imminent CRIU
resume phase.

Signed-off-by: Rajneesh Bhardwaj
---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  4 +-
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  5 ++
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 99 
  drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 12 +++
  4 files changed, 118 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 916b8d000317..f7aa15b18f95 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2638,8 +2638,8 @@ static int criu_restore_objects(struct file *filep,
goto exit;
break;
case KFD_CRIU_OBJECT_TYPE_SVM_RANGE:
-   /* TODO: Implement SVM range */
-   *priv_offset += sizeof(struct 
kfd_criu_svm_range_priv_data);
+   ret = kfd_criu_restore_svm(p, (uint8_t __user 
*)args->priv_data,
+priv_offset, 
max_priv_data_size);
if (ret)
goto exit;
break;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 87eb6739a78e..92191c541c29 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -790,6 +790,7 @@ struct svm_range_list {
struct list_headlist;
struct work_struct  deferred_list_work;
struct list_headdeferred_range_list;
+   struct list_headcriu_svm_metadata_list;
spinlock_t  deferred_list_lock;
atomic_tevicted_ranges;
booldrain_pagefaults;
@@ -1148,6 +1149,10 @@ int kfd_criu_restore_event(struct file *devkfd,
   uint8_t __user *user_priv_data,
   uint64_t *priv_data_offset,
   uint64_t max_priv_data_size);
+int kfd_criu_restore_svm(struct kfd_process *p,
+uint8_t __user *user_priv_data,
+uint64_t *priv_data_offset,
+uint64_t max_priv_data_size);
  /* CRIU - End */
  
  /* Queue Context Management */

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 6d59f1bedcf2..e9f6c63c2a26 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -45,6 +45,14 @@
   */
  #define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING  2000
  
+struct criu_svm_metadata {

+   struct list_head list;
+   __u64 start_addr;
+   __u64 size;
+   /* Variable length array of attributes */
+   struct kfd_ioctl_svm_attribute attrs[0];
+};
This data structure is struct kfd_criu_svm_range_priv_data plus a
list_head. Maybe you can add the list_head to struct
kfd_criu_svm_range_priv_data and remove this new data structure; then
you can remove the extra kzalloc/kfree for each svm object resume, and
the function svm_criu_prepare_for_resume could be removed.


Adding list_head to the private structure is a bad idea, because that 
structure is copied to/from user mode. Kernel mode pointers should not 
be exposed to user mode, even in an opaque structure. That's just 
begging for an exploit.


But you could define criu_svm_metadata as

struct criu_svm_metadata {
	struct list_head list;
	struct kfd_criu_svm_range_priv_data data;
};

Then copy_from_user directly into criu_svm_md->data in 
kfd_criu_restore_svm to avoid the double allocation.
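
A rough sketch of that single-allocation path (field names taken from the
patch; svm_attrs_size is assumed to be the already-computed size of the
attrs[] payload):

	criu_svm_md = kzalloc(sizeof(*criu_svm_md) + svm_attrs_size, GFP_KERNEL);
	if (!criu_svm_md)
		return -ENOMEM;

	/* One copy straight into the embedded priv struct plus attrs. */
	if (copy_from_user(&criu_svm_md->data,
			   user_priv_ptr + *priv_data_offset,
			   sizeof(criu_svm_md->data) + svm_attrs_size)) {
		kfree(criu_svm_md);
		return -EFAULT;
	}
	list_add_tail(&criu_svm_md->list, &svms->criu_svm_metadata_list);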


Regards,
  Felix



+
  static void svm_range_evict_svm_bo_worker(struct work_struct *work);
  static bool
  svm_range_cpu_invalidate_pagetables(struct mmu_interval_notifier *mni,
@@ -2753,6 +2761,7 @@ int svm_range_list_init(struct kfd_process *p)
INIT_DELAYED_WORK(&svms->restore_work, svm_range_restore_work);
INIT_WORK(&svms->deferred_list_work, svm_range_deferred_list_work);
INIT_LIST_HEAD(&svms->deferred_range_list);
+   INIT_LIST_HEAD(&svms->criu_svm_metadata_list);
spin_lock_init(&svms->deferred_list_lock);
  
  	for (i = 0; i < p->n_pdds; i++)

@@ -3418,6 +3427,96 @@ svm_range_get_attr(struct kfd_process *p, struct 
mm_struct *mm,
return 0;
  }
  
+int svm_criu_prepare_for_resume(struct kfd_process *p,

+   struct kfd_criu_svm_range_priv_data *svm_priv)
+{
+   int nattr_common = 4, nattr_accessibility = 1;
+   struct criu_svm_metadata *criu_svm_md = NULL;
+   uint64_t svm_attrs_size, svm_object_md

Re: [PATCH 1/1] Add test for hsaKmtAvailableMemory available memory inquiry

2022-01-10 Thread Felix Kuehling

On 2022-01-10 4:48 p.m., Daniel Phillips wrote:

Basic test for the new hsaKmtAvailableMemory library call. This is
a standalone test, does not modify any of the other tests just to
be on the safe side. More elaborate tests coming soon.

Change-Id: I738600d4b74cc5dba6b857e4c793f6b14b7d2283
Signed-off-by: Daniel Phillips 
---
  tests/kfdtest/src/KFDMemoryTest.cpp | 17 +
  1 file changed, 17 insertions(+)

diff --git a/tests/kfdtest/src/KFDMemoryTest.cpp 
b/tests/kfdtest/src/KFDMemoryTest.cpp
index 9f62727..1f93928 100644
--- a/tests/kfdtest/src/KFDMemoryTest.cpp
+++ b/tests/kfdtest/src/KFDMemoryTest.cpp
@@ -595,6 +595,23 @@ TEST_F(KFDMemoryTest, MemoryAlloc) {
  TEST_END
  }
  
+// Basic test for hsaKmtAvailableMemory

+TEST_F(KFDMemoryTest, MemoryAllocAll) {
+TEST_START(TESTPROFILE_RUNALL)
+
+unsigned int* pBig = NULL;
+unsigned int* pSmall = NULL;
+m_MemoryFlags.ui32.NoNUMABind = 1;
+HSAuint64 available;
+EXPECT_SUCCESS(hsaKmtAvailableMemory(0 /* system */, &available));
I don't think you've even implemented this API for system memory. The 
system memory node doesn't have a valid GPUID, so the ioctl will fail.


I'd expect this test to work only for VRAM.



+EXPECT_SUCCESS(hsaKmtAllocMemory(0 /* system */, available, m_MemoryFlags, reinterpret_cast<void**>(&pBig)));
+EXPECT_NE(HSAKMT_STATUS_SUCCESS, hsaKmtAllocMemory(0 /* system */, PAGE_SIZE, m_MemoryFlags, reinterpret_cast<void**>(&pSmall)));
+EXPECT_SUCCESS(hsaKmtFreeMemory(pBig, available));
+EXPECT_SUCCESS(hsaKmtAllocMemory(0 /* system */, PAGE_SIZE, m_MemoryFlags, reinterpret_cast<void**>(&pSmall)));


You're leaking pSmall here.
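
For reference, the v2 of this test earlier in this digest plugs the leak
by mirroring the pBig cleanup:

	EXPECT_SUCCESS(hsaKmtFreeMemory(pSmall, PAGE_SIZE));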

Regards,
  Felix



+
+TEST_END
+}
+
  TEST_F(KFDMemoryTest, AccessPPRMem) {
  TEST_START(TESTPROFILE_RUNALL)
  


Re: [PATCH 1/1] Add available memory ioctl for libhsakmt

2022-01-10 Thread Felix Kuehling

On 2022-01-10 3:54 p.m., Daniel Phillips wrote:

From: Daniel Phillips 


This is weird. Looks like you've set up your user email in your
.gitconfig incorrectly. Or you changed it after you committed this patch
locally.





Add an ioctl to inquire memory available for allocation by libhsakmt
per node, allowing for space consumed by page translation tables.


Other than the missing signed-off-by, this patch is

Reviewed-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h  |  1 +
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c| 14 ++++++++++++++
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c| 17 +++++++++++++++++
  include/uapi/linux/kfd_ioctl.h  | 14 ++++++++++++--
  4 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index fcbc8a9c9e06..64c6c36685d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -266,6 +266,7 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct 
amdgpu_device *adev,
  void amdgpu_amdkfd_gpuvm_release_process_vm(struct amdgpu_device *adev,
void *drm_priv);
  uint64_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *drm_priv);
+size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev);
  int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
struct amdgpu_device *adev, uint64_t va, uint64_t size,
void *drm_priv, struct kgd_mem **mem,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 86a1a6c109d9..b7490a659173 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -190,6 +190,20 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
return ret;
  }
  
+size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev)

+{
+   uint64_t reserved_for_pt =
+   ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
+   size_t available_memory;
+
+   spin_lock(&kfd_mem_limit.mem_limit_lock);
+   available_memory =
+   adev->gmc.real_vram_size -
+   adev->kfd.vram_used - reserved_for_pt;
+   spin_unlock(&kfd_mem_limit.mem_limit_lock);
+   return available_memory;
+}
+
  static void unreserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag)
  {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 4bfc0c8ab764..5c2f6d97ff1c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -486,6 +486,20 @@ static int kfd_ioctl_get_queue_wave_state(struct file 
*filep,
return r;
  }
  
+static int kfd_ioctl_get_available_memory(struct file *filep,

+struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_get_available_memory_args *args = data;
+   struct kfd_dev *dev;
+
+   dev = kfd_device_by_id(args->gpu_id);
+   if (!dev)
+   return -EINVAL;
+
+   args->available = amdgpu_amdkfd_get_available_memory(dev->adev);
+   return 0;
+}
+
  static int kfd_ioctl_set_memory_policy(struct file *filep,
struct kfd_process *p, void *data)
  {
@@ -1959,6 +1973,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
  
  	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_XNACK_MODE,

kfd_ioctl_set_xnack_mode, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_AVAILABLE_MEMORY,
+   kfd_ioctl_get_available_memory, 0),
  };
  
  #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)

diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index af96af174dc4..94a99add2432 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -32,9 +32,10 @@
   * - 1.4 - Indicate new SRAM EDC bit in device properties
   * - 1.5 - Add SVM API
   * - 1.6 - Query clear flags in SVM get_attr API
+ * - 1.7 - Add available_memory ioctl
   */
  #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 6
+#define KFD_IOCTL_MINOR_VERSION 7
  
  struct kfd_ioctl_get_version_args {

__u32 major_version;/* from KFD */
@@ -98,6 +99,12 @@ struct kfd_ioctl_get_queue_wave_state_args {
__u32 pad;
  };
  
+struct kfd_ioctl_get_available_memory_args {

+   __u64 available;/* from KFD */
+   __u32 gpu_id;   /* to KFD */
+   __u32 pad;
+};
+
  /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */
  #define KFD_IOC_CACHE_POLICY_COHERENT 0
  #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1
@@ -742,7 +749,10 @@ struct kfd_ioctl_set_xnack_mode_args {
  #define AMDKFD_IOC_SET_XNACK_MODE \
AMDKFD_IOWR(0x21, struct kfd_ioctl_set_xnack_mode_args)
  
+

Re: [PATCH v3 1/2] drm/panel: panel-boe-tv101wum-nl6: tune the power sequence to avoid leakage

2022-01-10 Thread Kevin Hilman
Jitao Shi  writes:

> "auo,kd101n80-45na" 2st LCD SPEC update, need to modify the timing
> between IOVCC and mipi data.
> The 2st version of SPEC modifies the timing requirements from IOVCC to
> Mipi Data. IOVCC is now required to take precedence over MIPI DATA,
> otherwise there is a risk of leakage. It is recommended that the time
> for MIPI to enter LP11 be postponed after IOVCC (delay20ms).

Similar to what Daniel said on v2:  You're changing the behavior of
*all* users of this panel driver with this patch, in order to fix a
single user (in the next patch.)

> Signed-off-by: Jitao Shi 
> Change-Id: Ic5212e2145a7dbf2efef9e5585904a93e1bc5a28

Please drop gerrit IDs from upstream submissions.

Kevin

> ---
>  drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c | 88 
> +++---
>  include/drm/panel_boe_tv101wum_nl6.h   | 28 
>  2 files changed, 94 insertions(+), 22 deletions(-)
>  create mode 100644 include/drm/panel_boe_tv101wum_nl6.h
>
> diff --git a/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c 
> b/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c
> index db9d0b86d542..02efee06c430 100644
> --- a/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c
> +++ b/drivers/gpu/drm/panel/panel-boe-tv101wum-nl6.c
> @@ -49,7 +49,7 @@ struct boe_panel {
>   struct regulator *avee;
>   struct regulator *avdd;
>   struct gpio_desc *enable_gpio;
> -
> + int powered_refcnt;
>   bool prepared;
>  };
>  
> @@ -488,19 +488,15 @@ static int boe_panel_enter_sleep_mode(struct boe_panel 
> *boe)
>   return 0;
>  }
>  
> -static int boe_panel_unprepare(struct drm_panel *panel)
> +static int boe_panel_power_off(struct drm_panel *panel)
>  {
>   struct boe_panel *boe = to_boe_panel(panel);
> - int ret;
>  
> - if (!boe->prepared)
> - return 0;
> + if (WARN_ON(boe->powered_refcnt == 0))
> + return -EINVAL;
>  
> - ret = boe_panel_enter_sleep_mode(boe);
> - if (ret < 0) {
> - dev_err(panel->dev, "failed to set panel off: %d\n", ret);
> - return ret;
> - }
> + if (--boe->powered_refcnt != 0)
> + return 0;
>  
>   msleep(150);
>  
> @@ -520,17 +516,45 @@ static int boe_panel_unprepare(struct drm_panel *panel)
>   regulator_disable(boe->pp1800);
>   }
>  
> + return 0;
> +}
> +
> +int panel_unprepare_power(struct drm_panel *panel)
> +{
> + if (of_device_is_compatible(panel->dev->of_node, "auo,kd101n80-45na"))
> + return boe_panel_power_off(panel);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL(panel_unprepare_power);
> +
> +static int boe_panel_unprepare(struct drm_panel *panel)
> +{
> + struct boe_panel *boe = to_boe_panel(panel);
> + int ret;
> +
> + if (!boe->prepared)
> + return 0;
> +
> + ret = boe_panel_enter_sleep_mode(boe);
> + if (ret < 0) {
> + dev_err(panel->dev, "failed to set panel off: %d\n", ret);
> + return ret;
> + }
> +
> + boe_panel_power_off(panel);
> +
>   boe->prepared = false;
>  
>   return 0;
>  }
>  
> -static int boe_panel_prepare(struct drm_panel *panel)
> +static int boe_panel_power_on(struct drm_panel *panel)
>  {
>   struct boe_panel *boe = to_boe_panel(panel);
>   int ret;
>  
> - if (boe->prepared)
> + if (++boe->powered_refcnt != 1)
>   return 0;
>  
>   gpiod_set_value(boe->enable_gpio, 0);
> @@ -558,18 +582,8 @@ static int boe_panel_prepare(struct drm_panel *panel)
>   gpiod_set_value(boe->enable_gpio, 1);
>   usleep_range(6000, 1);
>  
> - ret = boe_panel_init_dcs_cmd(boe);
> - if (ret < 0) {
> - dev_err(panel->dev, "failed to init panel: %d\n", ret);
> - goto poweroff;
> - }
> -
> - boe->prepared = true;
> -
>   return 0;
>  
> -poweroff:
> - regulator_disable(boe->avee);
>  poweroffavdd:
>   regulator_disable(boe->avdd);
>  poweroff1v8:
> @@ -580,6 +594,36 @@ static int boe_panel_prepare(struct drm_panel *panel)
>   return ret;
>  }
>  
> +int panel_prepare_power(struct drm_panel *panel)
> +{
> + if (of_device_is_compatible(panel->dev->of_node, "auo,kd101n80-45na"))
> + return boe_panel_power_on(panel);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL(panel_prepare_power);
> +
> +static int boe_panel_prepare(struct drm_panel *panel)
> +{
> + struct boe_panel *boe = to_boe_panel(panel);
> + int ret;
> +
> + boe_panel_power_on(panel);
> +
> + if (boe->prepared)
> + return 0;
> +
> + ret = boe_panel_init_dcs_cmd(boe);
> + if (ret < 0) {
> + dev_err(panel->dev, "failed to init panel: %d\n", ret);
> + return ret;
> + }
> +
> + boe->prepared = true;
> +
> + return 0;
> +}
> +
>  static int boe_panel_enable(struct drm_panel *panel)
>  {
>   msleep(130);
> diff --git a/include/drm/panel_boe_tv101wum_nl6.h 
> b/include/drm/panel_boe_tv101wum_nl6.h
> new file mode 100644
> index 0

Re: [Patch v4 13/24] drm/amdkfd: CRIU checkpoint and restore queue mqds

2022-01-10 Thread Felix Kuehling

On 2021-12-22 7:37 p.m., Rajneesh Bhardwaj wrote:

From: David Yat Sin 

Checkpoint contents of queue MQD's on CRIU dump and restore them during
CRIU restore.

Signed-off-by: David Yat Sin 


David has an update for this patch to fix up the doorbell offset in the 
restored SDMA MQD.


Regards,
  Felix




---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |   2 +-
  drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c   |   2 +-
  .../drm/amd/amdkfd/kfd_device_queue_manager.c |  72 +++-
  .../drm/amd/amdkfd/kfd_device_queue_manager.h |  14 +-
  drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.h  |   7 +
  .../gpu/drm/amd/amdkfd/kfd_mqd_manager_cik.c  |  67 
  .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c  |  68 
  .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   |  68 
  .../gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c   |  69 
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h |   5 +
  .../amd/amdkfd/kfd_process_queue_manager.c| 158 --
  11 files changed, 506 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 3fb155f756fd..146879cd3f2b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -312,7 +312,7 @@ static int kfd_ioctl_create_queue(struct file *filep, 
struct kfd_process *p,
p->pasid,
dev->id);
  
-	err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, NULL,

+   err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, 
NULL, NULL,
&doorbell_offset_in_process);
if (err != 0)
goto err_create_queue;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
index 0c50e67e2b51..3a5303ebcabf 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_dbgdev.c
@@ -185,7 +185,7 @@ static int dbgdev_register_diq(struct kfd_dbgdev *dbgdev)
properties.type = KFD_QUEUE_TYPE_DIQ;
  
  	status = pqm_create_queue(dbgdev->pqm, dbgdev->dev, NULL,

-   &properties, &qid, NULL, NULL);
+   &properties, &qid, NULL, NULL, NULL);
  
  	if (status) {

pr_err("Failed to create DIQ\n");
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index a0f5b8533a03..a92274f9f1f7 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -331,7 +331,8 @@ static void deallocate_vmid(struct device_queue_manager 
*dqm,
  static int create_queue_nocpsch(struct device_queue_manager *dqm,
struct queue *q,
struct qcm_process_device *qpd,
-   const struct kfd_criu_queue_priv_data *qd)
+   const struct kfd_criu_queue_priv_data *qd,
+   const void *restore_mqd)
  {
struct mqd_manager *mqd_mgr;
int retval;
@@ -390,8 +391,14 @@ static int create_queue_nocpsch(struct 
device_queue_manager *dqm,
retval = -ENOMEM;
goto out_deallocate_doorbell;
}
-   mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
-   &q->gart_mqd_addr, &q->properties);
+
+   if (qd)
+   mqd_mgr->restore_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, 
&q->gart_mqd_addr,
+&q->properties, restore_mqd);
+   else
+   mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
+   &q->gart_mqd_addr, &q->properties);
+
if (q->properties.is_active) {
if (!dqm->sched_running) {
WARN_ONCE(1, "Load non-HWS mqd while stopped\n");
@@ -1339,7 +1346,8 @@ static void destroy_kernel_queue_cpsch(struct 
device_queue_manager *dqm,
  
  static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,

struct qcm_process_device *qpd,
-   const struct kfd_criu_queue_priv_data *qd)
+   const struct kfd_criu_queue_priv_data *qd,
+   const void *restore_mqd)
  {
int retval;
struct mqd_manager *mqd_mgr;
@@ -1385,8 +1393,12 @@ static int create_queue_cpsch(struct 
device_queue_manager *dqm, struct queue *q,
 * updates the is_evicted flag but is a no-op otherwise.
 */
q->properties.is_evicted = !!qpd->evicted;
-   mqd_mgr->init_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj,
-   &q->gart_mqd_addr, &q->properties);
+   if (qd)
+   mqd_mgr->restore_mqd(mqd_mgr, &q->mqd, q->mqd_mem_obj, 
&q->gart_mqd_addr,
+&q->properties, restore_mqd);
+   else
+   

Re: [Patch v4 07/24] drm/amdkfd: CRIU Implement KFD resume ioctl

2022-01-10 Thread Felix Kuehling

On 2021-12-22 7:36 p.m., Rajneesh Bhardwaj wrote:

This adds support to create userptr BOs on restore and introduces a new
ioctl to restart memory notifiers for the restored userptr BOs.
When doing CRIU restore, MMU notifications can happen anytime after we call
amdgpu_mn_register. Prevent MMU notifications until we reach stage-4 of the
restore process, i.e. the criu_resume ioctl is received and the process is
ready to be resumed. This ioctl is different from other KFD CRIU ioctls
since it's called by the CRIU master restore process for all the target
processes being resumed by CRIU.

Signed-off-by: David Yat Sin 
Signed-off-by: Rajneesh Bhardwaj 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  6 ++-
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 51 +--
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  | 44 ++--
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  1 +
  drivers/gpu/drm/amd/amdkfd/kfd_process.c  | 35 +++--
  5 files changed, 123 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index fcbc8a9c9e06..5c5fc839f701 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -131,6 +131,7 @@ struct amdkfd_process_info {
atomic_t evicted_bos;
struct delayed_work restore_userptr_work;
struct pid *pid;
+   bool block_mmu_notifications;
  };
  
  int amdgpu_amdkfd_init(void);

@@ -269,7 +270,7 @@ uint64_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void 
*drm_priv);
  int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
struct amdgpu_device *adev, uint64_t va, uint64_t size,
void *drm_priv, struct kgd_mem **mem,
-   uint64_t *offset, uint32_t flags);
+   uint64_t *offset, uint32_t flags, bool criu_resume);
  int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv,
uint64_t *size);
@@ -297,6 +298,9 @@ int amdgpu_amdkfd_gpuvm_import_dmabuf(struct amdgpu_device 
*adev,
  int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
struct tile_config *config);
  void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev);
+void amdgpu_amdkfd_block_mmu_notifications(void *p);
+int amdgpu_amdkfd_criu_resume(void *p);
+
  #if IS_ENABLED(CONFIG_HSA_AMD)
  void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
  void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 90b985436878..5679fb75ec88 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -846,7 +846,8 @@ static void remove_kgd_mem_from_kfd_bo_list(struct kgd_mem 
*mem,
   *
   * Returns 0 for success, negative errno for errors.
   */
-static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
+static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,
+  bool criu_resume)
  {
struct amdkfd_process_info *process_info = mem->process_info;
struct amdgpu_bo *bo = mem->bo;
@@ -868,6 +869,17 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t 
user_addr)
goto out;
}
  
+	if (criu_resume) {

+   /*
+* During a CRIU restore operation, the userptr buffer objects
+* will be validated in the restore_userptr_work worker at a
+* later stage when it is scheduled by another ioctl called by
+* CRIU master process for the target pid for restore.
+*/
+   atomic_inc(&mem->invalid);
+   mutex_unlock(&process_info->lock);
+   return 0;
+   }
ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
if (ret) {
pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
@@ -1240,6 +1252,7 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void 
**process_info,
INIT_DELAYED_WORK(&info->restore_userptr_work,
  amdgpu_amdkfd_restore_userptr_worker);
  
+		info->block_mmu_notifications = false;

*process_info = info;
*ef = dma_fence_get(&info->eviction_fence->base);
}
@@ -1456,10 +1469,37 @@ uint64_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void 
*drm_priv)
return avm->pd_phys_addr;
  }
  
+void amdgpu_amdkfd_block_mmu_notifications(void *p)
+{
+   struct amdkfd_process_info *pinfo = (struct amdkfd_process_info *)p;
+
+   pinfo->block_mmu_notifications = true;
+}
+
+int amdgpu_amdkfd_criu_resume(void *p)
+{
+   int ret = 0;
+   struct amdkfd_process_info *pinfo = (struct amdkfd_process_info *)p;
+
+   mutex_lock(&pinfo->lock);
+   pr_debug("scheduling work

Re: [Patch v4 21/24] drm/amdkfd: CRIU Discover svm ranges

2022-01-10 Thread philip yang

On 2021-12-22 7:37 p.m., Rajneesh Bhardwaj wrote:


A KFD process may contain a number of virtual address ranges for shared
virtual memory management and each such range can have many SVM
attributes spanning across various nodes within the process boundary.
This change reports the total number of such SVM ranges and
their total private data size by extending the PROCESS_INFO op of the
CRIU IOCTL to discover the SVM ranges in the target process; future
patches bring in the required support for checkpoint and restore of
SVM ranges.
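
As a rough sketch, the per-range private data size being accounted here
works out to the following (constants from the kfd_svm.c hunk below):

    /* four process-wide attributes plus one accessibility attribute
     * per GPU, on top of the fixed kfd_criu_svm_range_priv_data header
     */
    size_per_range = sizeof(struct kfd_criu_svm_range_priv_data) +
                     (4 + 1 * num_devices) * sizeof(struct kfd_ioctl_svm_attribute);
    svm_priv_data_size = num_svm_ranges * size_per_range;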


Signed-off-by: Rajneesh Bhardwaj 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 12 +++--
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  5 +-
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 60 
 drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 11 +
 4 files changed, 82 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 446eb9310915..1c25d5e9067c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2089,10 +2089,9 @@ static int criu_get_process_object_info(struct kfd_process *p,
 	uint32_t *num_objects,
 	uint64_t *objs_priv_size)
 {
-	int ret;
-	uint64_t priv_size;
+	uint64_t queues_priv_data_size, svm_priv_data_size, priv_size;
 	uint32_t num_queues, num_events, num_svm_ranges;
-	uint64_t queues_priv_data_size;
+	int ret;
 
 	*num_devices = p->n_pdds;
 	*num_bos = get_process_num_bos(p);
@@ -2102,7 +2101,10 @@ static int criu_get_process_object_info(struct kfd_process *p,
 		return ret;
 
 	num_events = kfd_get_num_events(p);
-	num_svm_ranges = 0; /* TODO: Implement SVM-Ranges */
+
+	ret = svm_range_get_info(p, &num_svm_ranges, &svm_priv_data_size);
+	if (ret)
+		return ret;
 
 	*num_objects = num_queues + num_events + num_svm_ranges;
 
@@ -2112,7 +2114,7 @@ static int criu_get_process_object_info(struct kfd_process *p,
 		priv_size += *num_bos * sizeof(struct kfd_criu_bo_priv_data);
 		priv_size += queues_priv_data_size;
 		priv_size += num_events * sizeof(struct kfd_criu_event_priv_data);
-		/* TODO: Add SVM ranges priv size */
+		priv_size += svm_priv_data_size;
 		*objs_priv_size = priv_size;
 	}
 	return 0;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index d72dda84c18c..87eb6739a78e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1082,7 +1082,10 @@ enum kfd_criu_object_type {
 
 struct kfd_criu_svm_range_priv_data {
 	uint32_t object_type;
-	uint64_t reserved;
+	uint64_t start_addr;
+	uint64_t size;
+	/* Variable length array of attributes */
+	struct kfd_ioctl_svm_attribute attrs[0];
 };
 
 struct kfd_criu_queue_priv_data {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 7c92116153fe..49e05fb5c898 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -3418,6 +3418,66 @@ svm_range_get_attr(struct kfd_process *p, struct mm_struct *mm,
 	return 0;
 }
 
+int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges,
+		   uint64_t *svm_priv_data_size)
+{
+	uint64_t total_size, accessibility_size, common_attr_size;
+	int nattr_common = 4, nattr_accessibility = 1;
+	int num_devices = p->n_pdds;
+	struct svm_range_list *svms;
+	struct svm_range *prange;
+	uint32_t count = 0;
+
+	*svm_priv_data_size = 0;
+
+	svms = &p->svms;


svms is defined as a structure inside kfd_process, not a pointer, so
&p->svms will never be NULL.

+	if (!svms)
+		return -EINVAL;
+
+	mutex_lock(&svms->lock);
+	list_for_each_entry(prange, &svms->list, list) {
+		pr_debug("prange: 0x%p start: 0x%lx\t npages: 0x%llx\t end: 0x%llx\n",
+			 prange, prange->start, prange->npages,
+			 prange->start + prange->npages - 1);
+		count++;
+	}
+	mutex_unlock(&svms->lock);
+
+	*num_svm_ranges = count;
+	/* Only the accessibility attributes need to be queried for all the gpus
+	 * individually, remaining ones are spanned across the entire process
+	 * regardless of the various gpu nodes. Of the remaining attributes,
+	 * KFD_IOCTL_SVM_ATTR_CLR_FLAGS need not be saved.
+	 *
+	 * KFD_IOCTL_SVM_ATTR_PREFERRED_LOC
+	 * KFD_IOCTL_SVM_ATTR_PREFETCH_LOC
+	 * KFD_IOCTL_SVM_ATTR_SET_FLAGS
+	 * KFD_IOCTL_SVM_ATTR_GRANULARITY
+	 *
+	 * ** ACCESSIBILITY ATTRIBUTES **
+	 * (Considered as one, type is altered during query, value is gpuid)
+	 * KFD_IOCTL_SVM_ATTR_ACCESS
+	 * KFD_IOCTL_SVM_ATTR_ACCESS_IN_PLACE
+	 * KFD_IOCTL_SVM_ATTR_NO_ACCESS
+	 */
+	if (*num_svm_ranges > 0) {
+		common_attr_size = sizeof(struct kfd_ioctl_svm_attribute) *
+			nattr_common;
+		accessibility_size = sizeof(struct kfd_ioctl_svm_attribute) *
+			nattr_accessibility * num_devices;
+
+		total_size = sizeof(struct kfd_criu_svm_range_priv_data) +
+			common_attr_size + accessibility_size;
+
+		*svm_priv_data_size = *num_svm_ranges * total_

Re: [Patch v4 06/24] drm/amdkfd: CRIU Implement KFD restore ioctl

2022-01-10 Thread Felix Kuehling

On 2021-12-22 7:36 p.m., Rajneesh Bhardwaj wrote:

This implements the KFD CRIU Restore ioctl that lays the basic
foundation for the CRIU restore operation. It provides support to
create the buffer objects corresponding to Non-Paged system memory
mapped for GPU and/or CPU access and lays basic foundation for the
userptrs buffer objects which will be added in a separate patch.
This ioctl creates various types of buffer objects such as VRAM,
MMIO, Doorbell, GTT based on the data sent from the userspace plugin.
The data mostly contains the previously checkpointed KFD images from
some KFD process.

While restoring a CRIU process, attach old IDR values to newly
created BOs. This also adds minimal GPU mapping support for the single
GPU checkpoint-restore use case.
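
A sketch of how the userspace plugin side might drive this op (buffer
setup and includes elided; the struct and ioctl names come from the
CRIU API patch earlier in this series):

    struct kfd_ioctl_criu_args args = {
        .op             = KFD_CRIU_OP_RESTORE,
        .num_bos        = num_bos,                /* from the checkpoint image */
        .bos            = (uintptr_t)bo_buckets,  /* kfd_criu_bo_bucket array */
        .priv_data      = (uintptr_t)priv_data,
        .priv_data_size = priv_data_size,
    };

    if (ioctl(kfd_fd, AMDKFD_IOC_CRIU_OP, &args))
        return -1;  /* restore failed; target process cannot be resumed */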

Signed-off-by: David Yat Sin 
Signed-off-by: Rajneesh Bhardwaj 
---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 298 ++-
  1 file changed, 297 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index cdbb92972338..c93f74ad073f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -2069,11 +2069,307 @@ static int criu_checkpoint(struct file *filep,
return ret;
  }
  
+static int criu_restore_process(struct kfd_process *p,
+   struct kfd_ioctl_criu_args *args,
+   uint64_t *priv_offset,
+   uint64_t max_priv_data_size)
+{
+   int ret = 0;
+   struct kfd_criu_process_priv_data process_priv;
+
+   if (*priv_offset + sizeof(process_priv) > max_priv_data_size)
+   return -EINVAL;
+
+   ret = copy_from_user(&process_priv,
+   (void __user *)(args->priv_data + *priv_offset),
+   sizeof(process_priv));
+   if (ret) {
+   pr_err("Failed to copy process private information from 
user\n");
+   ret = -EFAULT;
+   goto exit;
+   }
+   *priv_offset += sizeof(process_priv);
+
+   if (process_priv.version != KFD_CRIU_PRIV_VERSION) {
+   pr_err("Invalid CRIU API version (checkpointed:%d 
current:%d)\n",
+   process_priv.version, KFD_CRIU_PRIV_VERSION);
+   return -EINVAL;
+   }
+
+exit:
+   return ret;
+}
+
+static int criu_restore_bos(struct kfd_process *p,
+   struct kfd_ioctl_criu_args *args,
+   uint64_t *priv_offset,
+   uint64_t max_priv_data_size)
+{
+   struct kfd_criu_bo_bucket *bo_buckets;
+   struct kfd_criu_bo_priv_data *bo_privs;
+   bool flush_tlbs = false;
+   int ret = 0, j = 0;
+   uint32_t i;
+
+   if (*priv_offset + (args->num_bos * sizeof(*bo_privs)) > max_priv_data_size)
+   return -EINVAL;
+
+   bo_buckets = kvmalloc_array(args->num_bos, sizeof(*bo_buckets), GFP_KERNEL);
+   if (!bo_buckets)
+   return -ENOMEM;
+
+   ret = copy_from_user(bo_buckets, (void __user *)args->bos,
+args->num_bos * sizeof(*bo_buckets));
+   if (ret) {
+   pr_err("Failed to copy BOs information from user\n");
+   ret = -EFAULT;
+   goto exit;
+   }
+
+   bo_privs = kvmalloc_array(args->num_bos, sizeof(*bo_privs), GFP_KERNEL);
+   if (!bo_privs) {
+   ret = -ENOMEM;
+   goto exit;
+   }
+
+   ret = copy_from_user(bo_privs, (void __user *)args->priv_data + *priv_offset,
+args->num_bos * sizeof(*bo_privs));
+   if (ret) {
+   pr_err("Failed to copy BOs information from user\n");
+   ret = -EFAULT;
+   goto exit;
+   }
+   *priv_offset += args->num_bos * sizeof(*bo_privs);
+
+   /* Create and map new BOs */
+   for (i = 0; i < args->num_bos; i++) {
+   struct kfd_criu_bo_bucket *bo_bucket;
+   struct kfd_criu_bo_priv_data *bo_priv;
+   struct kfd_dev *dev;
+   struct kfd_process_device *pdd;
+   void *mem;
+   u64 offset;
+   int idr_handle;
+
+   bo_bucket = &bo_buckets[i];
+   bo_priv = &bo_privs[i];
+
+   dev = kfd_device_by_id(bo_bucket->gpu_id);
+   if (!dev) {
+   ret = -EINVAL;
+   pr_err("Failed to get pdd\n");
+   goto exit;
+   }
+   pdd = kfd_get_process_device_data(dev, p);
+   if (!pdd) {
+   ret = -EINVAL;
+   pr_err("Failed to get pdd\n");
+   goto exit;
+   }
+
+   pr_debug("kfd restore ioctl - bo_bucket[%d]:\n", i);
+   pr_debug("size = 0x%llx, bo_addr = 0x%llx bo_offset = 0x%llx\n"
+ 

Re: [Patch v4 04/24] drm/amdkfd: CRIU Implement KFD process_info ioctl

2022-01-10 Thread Felix Kuehling

On 2021-12-22 7:36 p.m., Rajneesh Bhardwaj wrote:

This IOCTL is expected to be called as a precursor to the actual
Checkpoint operation. This does the basic discovery into the target
process seized by CRIU and relays the information to the userspace that
utilizes it to start the Checkpoint operation via another dedicated
IOCTL.

The process_info IOCTL determines the number of GPUs and buffer objects
that are associated with the target process, and its process id in the
caller's namespace, since the /proc/pid/mem interface may be used to drain
the contents of the discovered buffer objects in userspace, and getpid
returns the pid of the CRIU dumper process. Also, the pid of a process
inside a container might differ from its global pid, so return
the ns pid.
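
A sketch of the query step from the plugin side (args fields as set by
criu_process_info() below; includes and error handling elided):

    struct kfd_ioctl_criu_args args = { .op = KFD_CRIU_OP_PROCESS_INFO };

    if (ioctl(kfd_fd, AMDKFD_IOC_CRIU_OP, &args) == 0) {
        /* args.pid: target pid in the caller's namespace;
         * args.num_bos and args.priv_data_size size the buffers
         * handed to the subsequent CHECKPOINT op.
         */
    }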

Signed-off-by: Rajneesh Bhardwaj 
Signed-off-by: David Yat Sin 
---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 55 +++-
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  2 +
  drivers/gpu/drm/amd/amdkfd/kfd_process.c | 14 ++
  3 files changed, 70 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 1b863bd84c96..53d7a20e3c06 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1857,6 +1857,41 @@ static int kfd_ioctl_svm(struct file *filep, struct 
kfd_process *p, void *data)
  }
  #endif
  
+uint64_t get_process_num_bos(struct kfd_process *p)
+{
+   uint64_t num_of_bos = 0, i;
+
+   /* Run over all PDDs of the process */
+   for (i = 0; i < p->n_pdds; i++) {
+   struct kfd_process_device *pdd = p->pdds[i];
+   void *mem;
+   int id;
+
+   idr_for_each_entry(&pdd->alloc_idr, mem, id) {
+   struct kgd_mem *kgd_mem = (struct kgd_mem *)mem;
+
+   if ((uint64_t)kgd_mem->va > pdd->gpuvm_base)
+   num_of_bos++;
+   }
+   }
+   return num_of_bos;
+}
+
+static void criu_get_process_object_info(struct kfd_process *p,
+uint32_t *num_bos,
+uint64_t *objs_priv_size)
+{
+   uint64_t priv_size;
+
+   *num_bos = get_process_num_bos(p);
+
+   if (objs_priv_size) {
+   priv_size = sizeof(struct kfd_criu_process_priv_data);
+   priv_size += *num_bos * sizeof(struct kfd_criu_bo_priv_data);
+   *objs_priv_size = priv_size;
+   }
+}
+
  static int criu_checkpoint(struct file *filep,
   struct kfd_process *p,
   struct kfd_ioctl_criu_args *args)
@@ -1889,7 +1924,25 @@ static int criu_process_info(struct file *filep,
struct kfd_process *p,
struct kfd_ioctl_criu_args *args)
  {
-   return 0;
+   int ret = 0;
+
+   mutex_lock(&p->mutex);
+
+   if (!kfd_has_process_device_data(p)) {
+   pr_err("No pdd for given process\n");
+   ret = -ENODEV;
+   goto err_unlock;
+   }
+
+   args->pid = task_pid_nr_ns(p->lead_thread,
+   task_active_pid_ns(p->lead_thread));
+
+   criu_get_process_object_info(p, &args->num_bos, &args->priv_data_size);
+
+   dev_dbg(kfd_device, "Num of bos:%u\n", args->num_bos);
+err_unlock:
+   mutex_unlock(&p->mutex);
+   return ret;
  }
  
  static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index e68f692362bb..4d9bc7af03af 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -950,6 +950,8 @@ void *kfd_process_device_translate_handle(struct 
kfd_process_device *p,
  void kfd_process_device_remove_obj_handle(struct kfd_process_device *pdd,
int handle);
  
+bool kfd_has_process_device_data(struct kfd_process *p);
+
  /* PASIDs */
  int kfd_pasid_init(void);
  void kfd_pasid_exit(void);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index d4c8a6948a9f..f77d556ca0fc 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1456,6 +1456,20 @@ static int init_doorbell_bitmap(struct 
qcm_process_device *qpd,
return 0;
  }
  
+bool kfd_has_process_device_data(struct kfd_process *p)
+{
+   int i;
+
+   for (i = 0; i < p->n_pdds; i++) {
+   struct kfd_process_device *pdd = p->pdds[i];

I think checking p->n_pdds is sufficient. All the pdds with i < n_pdds
should be non-NULL.

Regards,
  Felix

+
+   if (pdd)
+   return true;
+   }
+
+   return false;
+}
+
  struct kfd_process_device *kfd_get_process_device_data(struct kfd_dev *dev,
 

[PATCH v3 06/10] lib: test_hmm add ioctl to get zone device type

2022-01-10 Thread Alex Sierra
New ioctl cmd added to query zone device type. This will be
used once the test_hmm adds zone device coherent type.
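
A sketch of exercising the new command from userspace (a minimal
helper; the device node names come from the test_hmm.sh script):

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include "test_hmm_uapi.h"

    /* query which zone device type a dmirror instance exposes */
    static int query_dev_type(const char *path, __u64 *type)
    {
            struct hmm_dmirror_cmd cmd = { 0 };
            int fd = open(path, O_RDWR);

            if (fd < 0)
                    return -1;
            if (ioctl(fd, HMM_DMIRROR_GET_MEM_DEV_TYPE, &cmd)) {
                    close(fd);
                    return -1;
            }
            *type = cmd.zone_device_type;
            close(fd);
            return 0;
    }

For instance, query_dev_type("/dev/hmm_dmirror0", &t) should report
HMM_DMIRROR_MEMORY_DEVICE_PRIVATE with this patch applied.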

Signed-off-by: Alex Sierra 
---
 lib/test_hmm.c  | 14 ++
 lib/test_hmm_uapi.h |  8 
 2 files changed, 22 insertions(+)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index c259842f6d44..97e48164d56a 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -84,6 +84,7 @@ struct dmirror_chunk {
 struct dmirror_device {
struct cdev cdevice;
struct hmm_devmem   *devmem;
+   unsigned intzone_device_type;
 
unsigned intdevmem_capacity;
unsigned intdevmem_count;
@@ -470,6 +471,7 @@ static bool dmirror_allocate_chunk(struct dmirror_device 
*mdevice,
if (IS_ERR(res))
goto err_devmem;
 
+   mdevice->zone_device_type = HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
devmem->pagemap.range.start = res->start;
devmem->pagemap.range.end = res->end;
@@ -1025,6 +1027,15 @@ static int dmirror_snapshot(struct dmirror *dmirror,
return ret;
 }
 
+static int dmirror_get_device_type(struct dmirror *dmirror,
+   struct hmm_dmirror_cmd *cmd)
+{
+   mutex_lock(&dmirror->mutex);
+   cmd->zone_device_type = dmirror->mdevice->zone_device_type;
+   mutex_unlock(&dmirror->mutex);
+
+   return 0;
+}
 static long dmirror_fops_unlocked_ioctl(struct file *filp,
unsigned int command,
unsigned long arg)
@@ -1075,6 +1086,9 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp,
ret = dmirror_snapshot(dmirror, &cmd);
break;
 
+   case HMM_DMIRROR_GET_MEM_DEV_TYPE:
+   ret = dmirror_get_device_type(dmirror, &cmd);
+   break;
default:
return -EINVAL;
}
diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h
index f14dea5dcd06..17f842f1aa02 100644
--- a/lib/test_hmm_uapi.h
+++ b/lib/test_hmm_uapi.h
@@ -19,6 +19,7 @@
  * @npages: (in) number of pages to read/write
  * @cpages: (out) number of pages copied
  * @faults: (out) number of device page faults seen
+ * @zone_device_type: (out) zone device memory type
  */
 struct hmm_dmirror_cmd {
__u64   addr;
@@ -26,6 +27,7 @@ struct hmm_dmirror_cmd {
__u64   npages;
__u64   cpages;
__u64   faults;
+   __u64   zone_device_type;
 };
 
 /* Expose the address space of the calling process through hmm device file */
@@ -35,6 +37,7 @@ struct hmm_dmirror_cmd {
 #define HMM_DMIRROR_SNAPSHOT   _IOWR('H', 0x03, struct hmm_dmirror_cmd)
 #define HMM_DMIRROR_EXCLUSIVE  _IOWR('H', 0x04, struct hmm_dmirror_cmd)
 #define HMM_DMIRROR_CHECK_EXCLUSIVE_IOWR('H', 0x05, struct hmm_dmirror_cmd)
+#define HMM_DMIRROR_GET_MEM_DEV_TYPE   _IOWR('H', 0x06, struct hmm_dmirror_cmd)
 
 /*
  * Values returned in hmm_dmirror_cmd.ptr for HMM_DMIRROR_SNAPSHOT.
@@ -62,4 +65,9 @@ enum {
HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30,
 };
 
+enum {
+   /* 0 is reserved to catch uninitialized type fields */
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE = 1,
+};
+
 #endif /* _LIB_TEST_HMM_UAPI_H */
-- 
2.32.0



[PATCH v3 10/10] tools: update test_hmm script to support SP config

2022-01-10 Thread Alex Sierra
Add two more parameters to set spm_addr_dev0 & spm_addr_dev1
addresses. These two parameters configure the start SP
addresses for each device in test_hmm driver.
Consequently, this configures zone device type as coherent.

Signed-off-by: Alex Sierra 
---
v2:
Add more mknods for device coherent type. These are represented under
/dev/hmm_mirror2 and /dev/hmm_mirror3, only in case they have created
at probing the hmm-test driver.
---
 tools/testing/selftests/vm/test_hmm.sh | 24 +---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/vm/test_hmm.sh b/tools/testing/selftests/vm/test_hmm.sh
index 0647b525a625..539c9371e592 100755
--- a/tools/testing/selftests/vm/test_hmm.sh
+++ b/tools/testing/selftests/vm/test_hmm.sh
@@ -40,11 +40,26 @@ check_test_requirements()
 
 load_driver()
 {
-   modprobe $DRIVER > /dev/null 2>&1
+   if [ $# -eq 0 ]; then
+   modprobe $DRIVER > /dev/null 2>&1
+   else
+   if [ $# -eq 2 ]; then
+   modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2 > /dev/null 2>&1
+   else
+   echo "Missing module parameters. Make sure pass"\
+   "spm_addr_dev0 and spm_addr_dev1"
+   usage
+   fi
+   fi
if [ $? == 0 ]; then
major=$(awk "\$2==\"HMM_DMIRROR\" {print \$1}" /proc/devices)
mknod /dev/hmm_dmirror0 c $major 0
mknod /dev/hmm_dmirror1 c $major 1
+   if [ $# -eq 2 ]; then
+   mknod /dev/hmm_dmirror2 c $major 2
+   mknod /dev/hmm_dmirror3 c $major 3
+   fi
fi
 }
 
@@ -58,7 +73,7 @@ run_smoke()
 {
echo "Running smoke test. Note, this test provides basic coverage."
 
-   load_driver
+   load_driver $1 $2
$(dirname "${BASH_SOURCE[0]}")/hmm-tests
unload_driver
 }
@@ -75,6 +90,9 @@ usage()
echo "# Smoke testing"
echo "./${TEST_NAME}.sh smoke"
echo
+   echo "# Smoke testing with SPM enabled"
+   echo "./${TEST_NAME}.sh smoke  "
+   echo
exit 0
 }
 
@@ -84,7 +102,7 @@ function run_test()
usage
else
if [ "$1" = "smoke" ]; then
-   run_smoke
+   run_smoke $2 $3
else
usage
fi
-- 
2.32.0



[PATCH v3 08/10] lib: add support for device coherent type in test_hmm

2022-01-10 Thread Alex Sierra
Device Coherent type uses device memory that is coherently accessible by
the CPU. This could be shown as an SP (special purpose) memory range
in the BIOS-e820 memory enumeration. If no SP memory is supported by the
system, this can be faked by setting CONFIG_EFI_FAKE_MEMMAP.

Currently, test_hmm only supports two different SP ranges of at least
256MB size. This could be specified in the kernel parameter variable
efi_fake_mem. Ex. Two SP ranges of 1GB starting at 0x1 &
0x14000 physical address. Ex.
efi_fake_mem=1G@0x1:0x4,1G@0x14000:0x4

Private and coherent device mirror instances can be created in the same
driver probe. This is done by passing the module parameters spm_addr_dev0 &
spm_addr_dev1. In this case, it will create four instances of
device_mirror. The first two correspond to private device type, the
last two to coherent type. Then, they can be easily accessed from user
space through /dev/hmm_mirror<num_device>. Usually num_device 0 and 1
are for private, and 2 and 3 for coherent types. If no module
parameters are passed, only two instances of private type device_mirror
will be created.

Signed-off-by: Alex Sierra 
---
 lib/test_hmm.c  | 247 
 lib/test_hmm_uapi.h |  15 ++-
 2 files changed, 193 insertions(+), 69 deletions(-)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 9edeff52302e..7c641c5a9cfa 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -29,11 +29,22 @@
 
 #include "test_hmm_uapi.h"
 
-#define DMIRROR_NDEVICES   2
+#define DMIRROR_NDEVICES   4
 #define DMIRROR_RANGE_FAULT_TIMEOUT1000
 #define DEVMEM_CHUNK_SIZE  (256 * 1024 * 1024U)
 #define DEVMEM_CHUNKS_RESERVE  16
 
+/*
+ * For device_private pages, dpage is just a dummy struct page
+ * representing a piece of device memory. dmirror_devmem_alloc_page
+ * allocates a real system memory page as backing storage to fake a
+ * real device. zone_device_data points to that backing page. But
+ * for device_coherent memory, the struct page represents real
+ * physical CPU-accessible memory that we can use directly.
+ */
+#define BACKING_PAGE(page) (is_device_private_page((page)) ? \
+  (page)->zone_device_data : (page))
+
 static unsigned long spm_addr_dev0;
 module_param(spm_addr_dev0, long, 0644);
 MODULE_PARM_DESC(spm_addr_dev0,
@@ -122,6 +133,21 @@ static int dmirror_bounce_init(struct dmirror_bounce 
*bounce,
return 0;
 }
 
+static bool dmirror_is_private_zone(struct dmirror_device *mdevice)
+{
+   return (mdevice->zone_device_type ==
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? true : false;
+}
+
+static enum migrate_vma_direction
+   dmirror_select_device(struct dmirror *dmirror)
+{
+   return (dmirror->mdevice->zone_device_type ==
+   HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ?
+   MIGRATE_VMA_SELECT_DEVICE_PRIVATE :
+   MIGRATE_VMA_SELECT_DEVICE_COHERENT;
+}
+
 static void dmirror_bounce_fini(struct dmirror_bounce *bounce)
 {
vfree(bounce->ptr);
@@ -572,16 +598,19 @@ static int dmirror_allocate_chunk(struct dmirror_device 
*mdevice,
 static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice)
 {
struct page *dpage = NULL;
-   struct page *rpage;
+   struct page *rpage = NULL;
 
/*
-* This is a fake device so we alloc real system memory to store
-* our device memory.
+* For ZONE_DEVICE private type, this is a fake device so we alloc real
+* system memory to store our device memory.
+* For ZONE_DEVICE coherent type we use the actual dpage to store the data
+* and ignore rpage.
 */
-   rpage = alloc_page(GFP_HIGHUSER);
-   if (!rpage)
-   return NULL;
-
+   if (dmirror_is_private_zone(mdevice)) {
+   rpage = alloc_page(GFP_HIGHUSER);
+   if (!rpage)
+   return NULL;
+   }
spin_lock(&mdevice->lock);
 
if (mdevice->free_pages) {
@@ -601,7 +630,8 @@ static struct page *dmirror_devmem_alloc_page(struct 
dmirror_device *mdevice)
return dpage;
 
 error:
-   __free_page(rpage);
+   if (rpage)
+   __free_page(rpage);
return NULL;
 }
 
@@ -627,12 +657,15 @@ static void dmirror_migrate_alloc_and_copy(struct 
migrate_vma *args,
 * unallocated pte_none() or read-only zero page.
 */
spage = migrate_pfn_to_page(*src);
+   WARN(spage && is_zone_device_page(spage),
+"page already in device spage pfn: 0x%lx\n",
+page_to_pfn(spage));
 
dpage = dmirror_devmem_alloc_page(mdevice);
if (!dpage)
continue;
 
-   rpage = dpage->zone_device_data;
+   rpage = BACKING_PAGE(dpage);
if (spage)
copy_highpage(rpage, spage);
   

[PATCH v3 09/10] tools: update hmm-test to support device coherent type

2022-01-10 Thread Alex Sierra
Test cases such as migrate_fault and migrate_multiple were modified to
explicitly migrate from device to sys memory without the need for page
faults, when using device coherent type.

The snapshot test case was updated to read the memory device type first
and, based on that, get the proper returned results. A migrate_ping_pong
test case was added to test explicit migration from device to sys memory
for both private and coherent zone types.

Helpers to migrate from device to sys memory and vice versa
were also added.
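
With those helpers (defined in the hunk below), the explicit round trip
used by the ping-pong style tests reduces to a sketch like:

    /* system -> device, then back, without relying on page faults */
    ASSERT_EQ(hmm_migrate_sys_to_dev(self->fd, buffer, npages), 0);
    ASSERT_EQ(hmm_migrate_dev_to_sys(self->fd, buffer, npages), 0);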

Signed-off-by: Alex Sierra 
---
v2:
Set FIXTURE_VARIANT to add multiple device types to the FIXTURE. This
will run all the tests for each device type (private and coherent) in
case both existed during hmm-test driver probed.
---
 tools/testing/selftests/vm/hmm-tests.c | 122 -
 1 file changed, 101 insertions(+), 21 deletions(-)

diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c
index 864f126ffd78..8eb81dfba4b3 100644
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ b/tools/testing/selftests/vm/hmm-tests.c
@@ -44,6 +44,14 @@ struct hmm_buffer {
int fd;
uint64_tcpages;
uint64_tfaults;
+   int zone_device_type;
+};
+
+enum {
+   HMM_PRIVATE_DEVICE_ONE,
+   HMM_PRIVATE_DEVICE_TWO,
+   HMM_COHERENCE_DEVICE_ONE,
+   HMM_COHERENCE_DEVICE_TWO,
 };
 
 #define TWOMEG (1 << 21)
@@ -60,6 +68,21 @@ FIXTURE(hmm)
unsigned intpage_shift;
 };
 
+FIXTURE_VARIANT(hmm)
+{
+   int device_number;
+};
+
+FIXTURE_VARIANT_ADD(hmm, hmm_device_private)
+{
+   .device_number = HMM_PRIVATE_DEVICE_ONE,
+};
+
+FIXTURE_VARIANT_ADD(hmm, hmm_device_coherent)
+{
+   .device_number = HMM_COHERENCE_DEVICE_ONE,
+};
+
 FIXTURE(hmm2)
 {
int fd0;
@@ -68,6 +91,24 @@ FIXTURE(hmm2)
unsigned intpage_shift;
 };
 
+FIXTURE_VARIANT(hmm2)
+{
+   int device_number0;
+   int device_number1;
+};
+
+FIXTURE_VARIANT_ADD(hmm2, hmm2_device_private)
+{
+   .device_number0 = HMM_PRIVATE_DEVICE_ONE,
+   .device_number1 = HMM_PRIVATE_DEVICE_TWO,
+};
+
+FIXTURE_VARIANT_ADD(hmm2, hmm2_device_coherent)
+{
+   .device_number0 = HMM_COHERENCE_DEVICE_ONE,
+   .device_number1 = HMM_COHERENCE_DEVICE_TWO,
+};
+
 static int hmm_open(int unit)
 {
char pathname[HMM_PATH_MAX];
@@ -81,12 +122,19 @@ static int hmm_open(int unit)
return fd;
 }
 
+static bool hmm_is_coherent_type(int dev_num)
+{
+   return (dev_num >= HMM_COHERENCE_DEVICE_ONE);
+}
+
 FIXTURE_SETUP(hmm)
 {
self->page_size = sysconf(_SC_PAGE_SIZE);
self->page_shift = ffs(self->page_size) - 1;
 
-   self->fd = hmm_open(0);
+   self->fd = hmm_open(variant->device_number);
+   if (self->fd < 0 && hmm_is_coherent_type(variant->device_number))
+   SKIP(exit(0), "DEVICE_COHERENT not available");
ASSERT_GE(self->fd, 0);
 }
 
@@ -95,9 +143,11 @@ FIXTURE_SETUP(hmm2)
self->page_size = sysconf(_SC_PAGE_SIZE);
self->page_shift = ffs(self->page_size) - 1;
 
-   self->fd0 = hmm_open(0);
+   self->fd0 = hmm_open(variant->device_number0);
+   if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0))
+   SKIP(exit(0), "DEVICE_COHERENT not available");
ASSERT_GE(self->fd0, 0);
-   self->fd1 = hmm_open(1);
+   self->fd1 = hmm_open(variant->device_number1);
ASSERT_GE(self->fd1, 0);
 }
 
@@ -144,6 +194,7 @@ static int hmm_dmirror_cmd(int fd,
}
buffer->cpages = cmd.cpages;
buffer->faults = cmd.faults;
+   buffer->zone_device_type = cmd.zone_device_type;
 
return 0;
 }
@@ -211,6 +262,20 @@ static void hmm_nanosleep(unsigned int n)
nanosleep(&t, NULL);
 }
 
+static int hmm_migrate_sys_to_dev(int fd,
+  struct hmm_buffer *buffer,
+  unsigned long npages)
+{
+   return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages);
+}
+
+static int hmm_migrate_dev_to_sys(int fd,
+  struct hmm_buffer *buffer,
+  unsigned long npages)
+{
+   return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages);
+}
+
 /*
  * Simple NULL test of device open/close.
  */
@@ -875,7 +940,7 @@ TEST_F(hmm, migrate)
ptr[i] = i;
 
/* Migrate memory to device. */
-   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffer->cpages, npages);
 
@@ -923,7 +988,7 @@ TEST_F(hmm, migrate_fault)
ptr[i] = i;
 
/* Migrate memory to device. */
-   ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_MIGRATE, buffer, npages);
+   ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
ASSERT_EQ(ret, 0);
ASSERT_EQ(buffe

[PATCH v3 07/10] lib: test_hmm add module param for zone device type

2022-01-10 Thread Alex Sierra
In order to configure device coherent in test_hmm, two module parameters
should be passed, which correspond to the SP start address of each of the
two devices: spm_addr_dev0 & spm_addr_dev1. If no parameters are passed,
private device type is configured.

Signed-off-by: Alex Sierra 
---
 lib/test_hmm.c  | 74 +++--
 lib/test_hmm_uapi.h |  1 +
 2 files changed, 53 insertions(+), 22 deletions(-)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 97e48164d56a..9edeff52302e 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -34,6 +34,16 @@
 #define DEVMEM_CHUNK_SIZE  (256 * 1024 * 1024U)
 #define DEVMEM_CHUNKS_RESERVE  16
 
+static unsigned long spm_addr_dev0;
+module_param(spm_addr_dev0, long, 0644);
+MODULE_PARM_DESC(spm_addr_dev0,
+   "Specify start address for SPM (special purpose memory) used 
for device 0. By setting this Coherent device type will be used. Make sure 
spm_addr_dev1 is set too");
+
+static unsigned long spm_addr_dev1;
+module_param(spm_addr_dev1, long, 0644);
+MODULE_PARM_DESC(spm_addr_dev1,
+   "Specify start address for SPM (special purpose memory) used 
for device 1. By setting this Coherent device type will be used. Make sure 
spm_addr_dev0 is set too");
+
 static const struct dev_pagemap_ops dmirror_devmem_ops;
 static const struct mmu_interval_notifier_ops dmirror_min_ops;
 static dev_t dmirror_dev;
@@ -452,29 +462,44 @@ static int dmirror_write(struct dmirror *dmirror, struct 
hmm_dmirror_cmd *cmd)
return ret;
 }
 
-static bool dmirror_allocate_chunk(struct dmirror_device *mdevice,
+static int dmirror_allocate_chunk(struct dmirror_device *mdevice,
   struct page **ppage)
 {
struct dmirror_chunk *devmem;
-   struct resource *res;
+   struct resource *res = NULL;
unsigned long pfn;
unsigned long pfn_first;
unsigned long pfn_last;
void *ptr;
+   int ret = -ENOMEM;
 
devmem = kzalloc(sizeof(*devmem), GFP_KERNEL);
if (!devmem)
-   return false;
+   return ret;
 
-   res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
- "hmm_dmirror");
-   if (IS_ERR(res))
+   switch (mdevice->zone_device_type) {
+   case HMM_DMIRROR_MEMORY_DEVICE_PRIVATE:
+   res = request_free_mem_region(&iomem_resource, DEVMEM_CHUNK_SIZE,
+ "hmm_dmirror");
+   if (IS_ERR_OR_NULL(res))
+   goto err_devmem;
+   devmem->pagemap.range.start = res->start;
+   devmem->pagemap.range.end = res->end;
+   devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
+   break;
+   case HMM_DMIRROR_MEMORY_DEVICE_COHERENT:
+   devmem->pagemap.range.start = (MINOR(mdevice->cdevice.dev) - 2) ?
+   spm_addr_dev0 :
+   spm_addr_dev1;
+   devmem->pagemap.range.end = devmem->pagemap.range.start +
+   DEVMEM_CHUNK_SIZE - 1;
+   devmem->pagemap.type = MEMORY_DEVICE_COHERENT;
+   break;
+   default:
+   ret = -EINVAL;
goto err_devmem;
+   }
 
-   mdevice->zone_device_type = HMM_DMIRROR_MEMORY_DEVICE_PRIVATE;
-   devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
-   devmem->pagemap.range.start = res->start;
-   devmem->pagemap.range.end = res->end;
devmem->pagemap.nr_range = 1;
devmem->pagemap.ops = &dmirror_devmem_ops;
devmem->pagemap.owner = mdevice;
@@ -495,10 +520,14 @@ static bool dmirror_allocate_chunk(struct dmirror_device 
*mdevice,
mdevice->devmem_capacity = new_capacity;
mdevice->devmem_chunks = new_chunks;
}
-
ptr = memremap_pages(&devmem->pagemap, numa_node_id());
-   if (IS_ERR(ptr))
+   if (IS_ERR_OR_NULL(ptr)) {
+   if (ptr)
+   ret = PTR_ERR(ptr);
+   else
+   ret = -EFAULT;
goto err_release;
+   }
 
devmem->mdevice = mdevice;
pfn_first = devmem->pagemap.range.start >> PAGE_SHIFT;
@@ -527,15 +556,17 @@ static bool dmirror_allocate_chunk(struct dmirror_device 
*mdevice,
}
spin_unlock(&mdevice->lock);
 
-   return true;
+   return 0;
 
 err_release:
mutex_unlock(&mdevice->devmem_lock);
-   release_mem_region(devmem->pagemap.range.start, range_len(&devmem->pagemap.range));
+   if (res && devmem->pagemap.type == MEMORY_DEVICE_PRIVATE)
+   release_mem_region(devmem->pagemap.range.start,
+  range_len(&devmem->pagemap.range));
 err_devmem:
kfree(devmem);
 
-   return false;
+   return ret;
 }
 
 static struct page *dmirror_d

[PATCH v3 05/10] drm/amdkfd: coherent type as sys mem on migration to ram

2022-01-10 Thread Alex Sierra
Coherent device type memory, on VRAM to RAM migration, has similar
access as System RAM from the CPU. The migrate flags select the source
pages, which in the Coherent type case should be set to
MIGRATE_VMA_SELECT_DEVICE_COHERENT.

Signed-off-by: Alex Sierra 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index 9e36fe8aea0f..3e405f078ade 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -661,9 +661,12 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct 
svm_range *prange,
migrate.vma = vma;
migrate.start = start;
migrate.end = end;
-   migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
migrate.pgmap_owner = SVM_ADEV_PGMAP_OWNER(adev);
 
+   if (adev->gmc.xgmi.connected_to_cpu)
+   migrate.flags = MIGRATE_VMA_SELECT_DEVICE_COHERENT;
+   else
+   migrate.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
size = 2 * sizeof(*migrate.src) + sizeof(uint64_t) + sizeof(dma_addr_t);
size *= npages;
buf = kvmalloc(size, GFP_KERNEL | __GFP_ZERO);
-- 
2.32.0



[PATCH v3 04/10] drm/amdkfd: add SPM support for SVM

2022-01-10 Thread Alex Sierra
When the CPU is connected through XGMI, it has coherent
access to the VRAM resource. In this case that resource
is taken from the device gmc aperture base.
This resource is used along with the device type, which could
be DEVICE_PRIVATE or DEVICE_COHERENT, to create the device
page map region.

Signed-off-by: Alex Sierra 
Reviewed-by: Felix Kuehling 
---
v7:
Remove lookup_resource call, so export symbol for this function
is not longer required. Patch dropped "kernel: resource:
lookup_resource as exported symbol"
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 29 +++-
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index aeade32ec298..9e36fe8aea0f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -935,7 +935,7 @@ int svm_migrate_init(struct amdgpu_device *adev)
 {
struct kfd_dev *kfddev = adev->kfd.dev;
struct dev_pagemap *pgmap;
-   struct resource *res;
+   struct resource *res = NULL;
unsigned long size;
void *r;
 
@@ -950,28 +950,34 @@ int svm_migrate_init(struct amdgpu_device *adev)
 * should remove reserved size
 */
size = ALIGN(adev->gmc.real_vram_size, 2ULL << 20);
-   res = devm_request_free_mem_region(adev->dev, &iomem_resource, size);
-   if (IS_ERR(res))
-   return -ENOMEM;
+   if (adev->gmc.xgmi.connected_to_cpu) {
+   pgmap->range.start = adev->gmc.aper_base;
+   pgmap->range.end = adev->gmc.aper_base + adev->gmc.aper_size - 1;
+   pgmap->type = MEMORY_DEVICE_COHERENT;
+   } else {
+   res = devm_request_free_mem_region(adev->dev, &iomem_resource, size);
+   if (IS_ERR(res))
+   return -ENOMEM;
+   pgmap->range.start = res->start;
+   pgmap->range.end = res->end;
+   pgmap->type = MEMORY_DEVICE_PRIVATE;
+   }
 
-   pgmap->type = MEMORY_DEVICE_PRIVATE;
pgmap->nr_range = 1;
-   pgmap->range.start = res->start;
-   pgmap->range.end = res->end;
pgmap->ops = &svm_migrate_pgmap_ops;
pgmap->owner = SVM_ADEV_PGMAP_OWNER(adev);
-   pgmap->flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE;
-
+   pgmap->flags = 0;
/* Device manager releases device-specific resources, memory region and
 * pgmap when driver disconnects from device.
 */
r = devm_memremap_pages(adev->dev, pgmap);
if (IS_ERR(r)) {
pr_err("failed to register HMM device memory\n");
-
/* Disable SVM support capability */
pgmap->type = 0;
-   devm_release_mem_region(adev->dev, res->start, resource_size(res));
+   /* res is only set on the MEMORY_DEVICE_PRIVATE path above */
+   if (res)
+   devm_release_mem_region(adev->dev, res->start,
+   res->end - res->start + 1);
return PTR_ERR(r);
}
 
@@ -984,3 +990,4 @@ int svm_migrate_init(struct amdgpu_device *adev)
 
return 0;
 }
+
-- 
2.32.0



[PATCH v3 02/10] mm: add device coherent vma selection for memory migration

2022-01-10 Thread Alex Sierra
This case is used to migrate pages from device memory back to system
memory. Device coherent type memory is cache coherent from the device and
CPU point of view.
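
A sketch of the driver-side selection this enables (migrate_vma_setup(),
migrate_vma_pages() and migrate_vma_finalize() are the existing
migrate_vma API; the pfn arrays and the copy step are elided):

    struct migrate_vma migrate = {
        .vma         = vma,
        .start       = start,
        .end         = end,
        .src         = src_pfns,    /* caller-allocated, npages entries */
        .dst         = dst_pfns,
        .pgmap_owner = owner,       /* must match page->pgmap->owner */
        .flags       = MIGRATE_VMA_SELECT_DEVICE_COHERENT,
    };

    if (migrate_vma_setup(&migrate))
        return -EFAULT;
    /* allocate system pages, copy the data, fill migrate.dst[] ... */
    migrate_vma_pages(&migrate);
    migrate_vma_finalize(&migrate);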

Signed-off-by: Alex Sierra 
---
v2:
condition added when migrations from device coherent pages.
---
 include/linux/migrate.h | 1 +
 mm/migrate.c| 9 +++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index c8077e936691..e74bb0978f6f 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -138,6 +138,7 @@ static inline unsigned long migrate_pfn(unsigned long pfn)
 enum migrate_vma_direction {
MIGRATE_VMA_SELECT_SYSTEM = 1 << 0,
MIGRATE_VMA_SELECT_DEVICE_PRIVATE = 1 << 1,
+   MIGRATE_VMA_SELECT_DEVICE_COHERENT = 1 << 2,
 };
 
 struct migrate_vma {
diff --git a/mm/migrate.c b/mm/migrate.c
index 91018880dc7f..0367f471211a 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2340,8 +2340,6 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
if (is_writable_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
-   if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
-   goto next;
pfn = pte_pfn(pte);
if (is_zero_pfn(pfn)) {
mpfn = MIGRATE_PFN_MIGRATE;
@@ -2349,6 +2347,13 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
goto next;
}
page = vm_normal_page(migrate->vma, addr, pte);
+   if (page && !is_zone_device_page(page) &&
+   !(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
+   goto next;
+   if (page && is_device_coherent_page(page) &&
+   (!(migrate->flags & MIGRATE_VMA_SELECT_DEVICE_COHERENT) ||
+page->pgmap->owner != migrate->pgmap_owner))
+   goto next;
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
-- 
2.32.0



[PATCH v3 01/10] mm: add zone device coherent type memory support

2022-01-10 Thread Alex Sierra
Device memory that is cache coherent from device and CPU point of view.
This is used on platforms that have an advanced system bus (like CAPI
or CXL). Any page of a process can be migrated to such memory. However,
no one should be allowed to pin such memory so that it can always be
evicted.
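
A minimal sketch of how a driver would advertise such memory (this
mirrors the amdkfd patch in this series; the mydev and coherent_base
names are placeholders, not a real API):

    struct dev_pagemap *pgmap = &mydev->pgmap;   /* hypothetical driver state */

    pgmap->type = MEMORY_DEVICE_COHERENT;
    pgmap->range.start = coherent_base;          /* CPU-addressable device memory */
    pgmap->range.end = coherent_base + coherent_size - 1;
    pgmap->nr_range = 1;
    pgmap->ops = &mydev_pgmap_ops;
    pgmap->owner = mydev;

    /* struct pages now back this range, so its pages can be migrated */
    addr = devm_memremap_pages(dev, pgmap);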

Signed-off-by: Alex Sierra 
---
 include/linux/memremap.h |  8 
 include/linux/mm.h   | 16 
 mm/memcontrol.c  |  6 +++---
 mm/memory-failure.c  |  8 ++--
 mm/memremap.c|  5 -
 mm/migrate.c | 21 +
 6 files changed, 50 insertions(+), 14 deletions(-)

diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index c0e9d35889e8..ff4d398edf35 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -39,6 +39,13 @@ struct vmem_altmap {
  * A more complete discussion of unaddressable memory may be found in
  * include/linux/hmm.h and Documentation/vm/hmm.rst.
  *
+ * MEMORY_DEVICE_COHERENT:
+ * Device memory that is cache coherent from device and CPU point of view. This
+ * is used on platforms that have an advanced system bus (like CAPI or CXL). A
+ * driver can hotplug the device memory using ZONE_DEVICE and with that memory
+ * type. Any page of a process can be migrated to such memory. However no one
+ * should be allowed to pin such memory so that it can always be evicted.
+ *
  * MEMORY_DEVICE_FS_DAX:
  * Host memory that has similar access semantics as System RAM i.e. DMA
  * coherent and supports page pinning. In support of coordinating page
@@ -59,6 +66,7 @@ struct vmem_altmap {
 enum memory_type {
/* 0 is reserved to catch uninitialized type fields */
MEMORY_DEVICE_PRIVATE = 1,
+   MEMORY_DEVICE_COHERENT,
MEMORY_DEVICE_FS_DAX,
MEMORY_DEVICE_GENERIC,
MEMORY_DEVICE_PCI_P2PDMA,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 73a52aba448f..fcf96c0fc918 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1162,6 +1162,7 @@ static inline bool page_is_devmap_managed(struct page 
*page)
return false;
switch (page->pgmap->type) {
case MEMORY_DEVICE_PRIVATE:
+   case MEMORY_DEVICE_COHERENT:
case MEMORY_DEVICE_FS_DAX:
return true;
default:
@@ -1191,6 +1192,21 @@ static inline bool is_device_private_page(const struct 
page *page)
page->pgmap->type == MEMORY_DEVICE_PRIVATE;
 }
 
+static inline bool is_device_coherent_page(const struct page *page)
+{
+   return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
+   is_zone_device_page(page) &&
+   page->pgmap->type == MEMORY_DEVICE_COHERENT;
+}
+
+static inline bool is_device_page(const struct page *page)
+{
+   return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
+   is_zone_device_page(page) &&
+   (page->pgmap->type == MEMORY_DEVICE_PRIVATE ||
+   page->pgmap->type == MEMORY_DEVICE_COHERENT);
+}
+
 static inline bool is_pci_p2pdma_page(const struct page *page)
 {
return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6da5020a8656..d0bab0747c73 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5695,8 +5695,8 @@ static int mem_cgroup_move_account(struct page *page,
  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  * target for charge migration. if @target is not NULL, the entry is stored
  * in target->ent.
- *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is MEMORY_DEVICE_PRIVATE
- * (so ZONE_DEVICE page and thus not on the lru).
+ *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is device memory and
+ *   thus not on the lru.
  * For now we such page is charge like a regular page would be as for all
  * intent and purposes it is just special memory taking the place of a
  * regular page.
@@ -5730,7 +5730,7 @@ static enum mc_target_type get_mctgt_type(struct 
vm_area_struct *vma,
 */
if (page_memcg(page) == mc.from) {
ret = MC_TARGET_PAGE;
-   if (is_device_private_page(page))
+   if (is_device_page(page))
ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3e6449f2102a..4cf212e5f432 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1554,12 +1554,16 @@ static int memory_failure_dev_pagemap(unsigned long 
pfn, int flags,
goto unlock;
}
 
-   if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+   switch (pgmap->type) {
+   case MEMORY_DEVICE_PRIVATE:
+   case MEMORY_DEVICE_COHERENT:
/*
-* TODO: Handle HMM pages which may need coordination
+* TODO: Handle device pages which may need coordination
 * with devi

[PATCH v3 00/10] Add MEMORY_DEVICE_COHERENT for coherent device memory mapping

2022-01-10 Thread Alex Sierra
This patch series introduces MEMORY_DEVICE_COHERENT, a type of memory
owned by a device that can be mapped into CPU page tables like
MEMORY_DEVICE_GENERIC and can also be migrated like
MEMORY_DEVICE_PRIVATE.

Christoph, the suggestion to incorporate Ralph Campbell’s refcount
cleanup patch into our hardware page migration patchset originally came
from you, but it proved impractical to do things in that order because
the refcount cleanup introduced a bug with wide ranging structural
implications. Instead, we amended Ralph’s patch so that it could be
applied after merging the migration work. As we saw from the recent
discussion, merging the refcount work is going to take some time and
cooperation between multiple development groups, while the migration
work is ready now and is needed now. So we propose to merge this
patchset first and continue to work with Ralph and others to merge the
refcount cleanup separately, when it is ready.

This patch series is mostly self-contained except for a few places where
it needs to update other subsystems to handle the new memory type.
System stability and performance are not affected according to our
ongoing testing, including xfstests.

How it works: The system BIOS advertises the GPU device memory
(aka VRAM) as SPM (special purpose memory) in the UEFI system address
map.

The amdgpu driver registers the memory with devmap as
MEMORY_DEVICE_COHERENT using devm_memremap_pages. The initial user for
this hardware page migration capability is the Frontier supercomputer
project. This functionality is not AMD-specific. We expect other GPU
vendors to find this functionality useful, and possibly other hardware
types in the future.

Our test nodes in the lab are similar to the Frontier configuration,
with 0.5 TB of system memory plus 256 GB of device memory split across
4 GPUs, all in a single coherent address space. Page migration is
expected to improve application efficiency significantly. We will
report empirical results as they become available.

We extended hmm_test to cover migration of MEMORY_DEVICE_COHERENT. This
patch set builds on HMM and our SVM memory manager already merged in
5.15.

v2:
- test_hmm is now able to create private and coherent device mirror
instances in the same driver probe. This adds more usability to the hmm
test by not having to remove the kernel module for each device type
test (private/coherent type). This is done by passing the module
parameters spm_addr_dev0 & spm_addr_dev1. In this case, it will create
four instances of device_mirror. The first two correspond to private
device type, the last two to coherent type. Then, they can be easily
accessed from user space through /dev/hmm_mirror. Usually
num_device 0 and 1 are for private, and 2 and 3 for coherent types.

- Coherent device type pages at gup are now migrated back to system
memory if they have been long term pinned (FOLL_LONGTERM). The reason
is these pages could eventually interfere with their own device memory
manager. A new hmm_gup_test has been added to the hmm-test to test this
functionality. It makes use of the gup_test module to long term pin
user pages that have been migrated to device memory first.

- Other patch corrections made by Felix, Alistair and Christoph.

v3:
- Based on last v2 feedback we got from Alistair, we've decided to
remove migration logic for FOLL_LONGTERM coherent device type pages at
gup for now. Ideally, this should be done through the kernel mm,
instead of calling the device driver to do it. Currently, there's no
support for migrating device pages based on pfn, mainly because
migrate_pages() relies on pages being LRU pages. Alistair mentioned, he
has started to work on adding this migrate device pages logic. For now,
we fail on get_user_pages call with FOLL_LONGTERM for DEVICE_COHERENT
pages.

- Also, hmm_gup_test has been removed from hmm-test. We plan to include
it again after this migration work is ready.

- Addressed Liam Howlett's feedback changes.

Alex Sierra (10):
  mm: add zone device coherent type memory support
  mm: add device coherent vma selection for memory migration
  mm/gup: fail get_user_pages for LONGTERM dev coherent type
  drm/amdkfd: add SPM support for SVM
  drm/amdkfd: coherent type as sys mem on migration to ram
  lib: test_hmm add ioctl to get zone device type
  lib: test_hmm add module param for zone device type
  lib: add support for device coherent type in test_hmm
  tools: update hmm-test to support device coherent type
  tools: update test_hmm script to support SP config

 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c |  34 ++-
 include/linux/memremap.h |   8 +
 include/linux/migrate.h  |   1 +
 include/linux/mm.h   |  16 ++
 lib/test_hmm.c   | 333 +--
 lib/test_hmm_uapi.h  |  22 +-
 mm/gup.c |   7 +
 mm/memcontrol.c  |   6 +-
 mm/memory-failure.c  | 

[PATCH v3 03/10] mm/gup: fail get_user_pages for LONGTERM dev coherent type

2022-01-10 Thread Alex Sierra
Avoid long term pinning for Coherent device type pages. This could
interfere with their own device memory manager. For now, we are just
returning an error for PIN_LONGTERM Coherent device type pages. Eventually,
these types of pages will get migrated to system memory, once device
page migration support is added.
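
With this change, a long-term pin that lands on a device-coherent page
fails instead of pinning; a sketch (pin_user_pages() is the existing
GUP API, and addr is assumed to be backed by MEMORY_DEVICE_COHERENT
memory):

    struct page *page;
    long ret;

    ret = pin_user_pages(addr, 1, FOLL_LONGTERM | FOLL_WRITE, &page, NULL);
    /* ret is now negative (-EFAULT from the hunk below) rather than 1 */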

Signed-off-by: Alex Sierra 
---
 mm/gup.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/mm/gup.c b/mm/gup.c
index 886d6148d3d0..9c8a075d862d 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1720,6 +1720,12 @@ static long check_and_migrate_movable_pages(unsigned 
long nr_pages,
 * If we get a movable page, since we are going to be pinning
 * these entries, try to move them out if possible.
 */
+   if (is_device_page(head)) {
+   WARN_ON_ONCE(is_device_private_page(head));
+   ret = -EFAULT;
+   goto unpin_pages;
+   }
+
if (!is_pinnable_page(head)) {
if (PageHuge(head)) {
if (!isolate_huge_page(head, &movable_page_list))
@@ -1750,6 +1756,7 @@ static long check_and_migrate_movable_pages(unsigned long 
nr_pages,
if (list_empty(&movable_page_list) && !isolation_error_count)
return nr_pages;
 
+unpin_pages:
if (gup_flags & FOLL_PIN) {
unpin_user_pages(pages, nr_pages);
} else {
-- 
2.32.0



Re: [git pull] drm for 5.17-rc1 (pre-merge window pull)

2022-01-10 Thread Alex Deucher
On Mon, Jan 10, 2022 at 5:05 PM Daniel Vetter  wrote:
>
> On Mon, Jan 10, 2022 at 10:30 PM Linus Torvalds wrote:
> >
> > On Thu, Jan 6, 2022 at 10:12 PM Dave Airlie  wrote:
> > >
> > >   git://anongit.freedesktop.org/drm/drm tags/drm-next-2022-01-07
> >
> > Gaah. I merged things and it built cleanly, and I pushed it out.
> >
> > But then I actually *booted* it, and that's not pretty.
> >
> > It *works", but it's almost unusable because of random scanline
> > flickering.  I'm not sure how to explain it, but it's as if there
> > wasn't quite enough bandwidth on the scan-out, so you get these lines
> > of noise and/or shifted output. They are temporary - so the
> > framebuffer contents themselves is not damaged (although I don't know
> > how the compositor works - maybe the problem happens before scanout).
> >
> > This is on the same Radeon device:
> >
> >49:00.0 VGA compatible controller: Advanced Micro Devices, Inc.
> > [AMD/ATI] Ellesmere [Radeon RX 470/480/570/570X/580/580X/590] (rev e7)
> >
> > with dual 4k monitors.
> >
> > Any idea?

Sounds like something related to watermarks.  That said, we haven't
really touched the display code for DCE11 cards in quite a while.  Can
you provide your dmesg output?

Alex


>
> Since Christian is mostly the compute/memory side, adding some display
> folks for this.
> -Daniel
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch


Re: [Patch v4 03/24] drm/amdkfd: CRIU Introduce Checkpoint-Restore APIs

2022-01-10 Thread Felix Kuehling

On 2021-12-22 7:36 p.m., Rajneesh Bhardwaj wrote:

Checkpoint-Restore in userspace (CRIU) is a powerful tool that can
snapshot a running process and later restore it on the same or a remote
machine, but it expects that processes which have a device file (e.g. GPU)
associated with them provide the necessary driver support to assist CRIU
and its extensible plugin interface. Thus, in order to support the
Checkpoint-Restore of any ROCm process, the AMD Radeon Open Compute
Kernel driver needs to provide a set of new APIs that provide the
necessary VRAM metadata and its contents to a userspace component
(CRIU plugin) that can store it in the form of image files.

This introduces some new ioctls which will be used to checkpoint-restore
any KFD bound user process. KFD doesn't allow any arbitrary ioctl call
unless it is called by the group leader process. Since these ioctls are
expected to be called from a KFD CRIU plugin which has elevated ptrace
privileges and the CAP_CHECKPOINT_RESTORE capability attached to
the file descriptors, modify KFD to allow such calls.

(API redesigned by David Yat Sin)
Suggested-by: Felix Kuehling 
Signed-off-by: David Yat Sin 
Signed-off-by: Rajneesh Bhardwaj 
---
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 94 +++-
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h| 65 +++-
  include/uapi/linux/kfd_ioctl.h   | 79 +++-
  3 files changed, 235 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 4bfc0c8ab764..1b863bd84c96 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -33,6 +33,7 @@
  #include 
  #include 
  #include 
+#include 
  #include 
  #include 
  #include "kfd_priv.h"
@@ -1856,6 +1857,75 @@ static int kfd_ioctl_svm(struct file *filep, struct 
kfd_process *p, void *data)
  }
  #endif
  
+static int criu_checkpoint(struct file *filep,

+  struct kfd_process *p,
+  struct kfd_ioctl_criu_args *args)
+{
+   return 0;
+}
+
+static int criu_restore(struct file *filep,
+   struct kfd_process *p,
+   struct kfd_ioctl_criu_args *args)
+{
+   return 0;
+}
+
+static int criu_unpause(struct file *filep,
+   struct kfd_process *p,
+   struct kfd_ioctl_criu_args *args)
+{
+   return 0;
+}
+
+static int criu_resume(struct file *filep,
+   struct kfd_process *p,
+   struct kfd_ioctl_criu_args *args)
+{
+   return 0;
+}
+
+static int criu_process_info(struct file *filep,
+   struct kfd_process *p,
+   struct kfd_ioctl_criu_args *args)
+{
+   return 0;
+}
+
+static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_criu_args *args = data;
+   int ret;
+
+   dev_dbg(kfd_device, "CRIU operation: %d\n", args->op);
+   switch (args->op) {
+   case KFD_CRIU_OP_PROCESS_INFO:
+   ret = criu_process_info(filep, p, args);
+   break;
+   case KFD_CRIU_OP_CHECKPOINT:
+   ret = criu_checkpoint(filep, p, args);
+   break;
+   case KFD_CRIU_OP_UNPAUSE:
+   ret = criu_unpause(filep, p, args);
+   break;
+   case KFD_CRIU_OP_RESTORE:
+   ret = criu_restore(filep, p, args);
+   break;
+   case KFD_CRIU_OP_RESUME:
+   ret = criu_resume(filep, p, args);
+   break;
+   default:
+   dev_dbg(kfd_device, "Unsupported CRIU operation:%d\n", 
args->op);
+   ret = -EINVAL;
+   break;
+   }
+
+   if (ret)
+   dev_dbg(kfd_device, "CRIU operation:%d err:%d\n", args->op, 
ret);
+
+   return ret;
+}
+
  #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
.cmd_drv = 0, .name = #ioctl}
@@ -1959,6 +2029,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
  
   	AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_XNACK_MODE,
   			kfd_ioctl_set_xnack_mode, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_CRIU_OP,
+   kfd_ioctl_criu, KFD_IOC_FLAG_CHECKPOINT_RESTORE),
  };
  
  #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)

@@ -1973,6 +2046,7 @@ static long kfd_ioctl(struct file *filep, unsigned int 
cmd, unsigned long arg)
char *kdata = NULL;
unsigned int usize, asize;
int retcode = -EINVAL;
+   bool ptrace_attached = false;
  
   	if (nr >= AMDKFD_CORE_IOCTL_COUNT)
   		goto err_i1;
@@ -1998,7 +2072,15 @@ static long kfd_ioctl(struct file *filep, unsigned int 
cmd, unsigned long arg)
 * processes need to create their own KFD device context.
 */
process = filep

Re: [git pull] drm for 5.17-rc1 (pre-merge window pull)

2022-01-10 Thread Daniel Vetter
On Mon, Jan 10, 2022 at 10:30 PM Linus Torvalds wrote:
>
> On Thu, Jan 6, 2022 at 10:12 PM Dave Airlie  wrote:
> >
> >   git://anongit.freedesktop.org/drm/drm tags/drm-next-2022-01-07
>
> Gaah. I merged things and it built cleanly, and I pushed it out.
>
> But then I actually *booted* it, and that's not pretty.
>
> It *works", but it's almost unusable because of random scanline
> flickering.  I'm not sure how to explain it, but it's as if there
> wasn't quite enough bandwidth on the scan-out, so you get these lines
> of noise and/or shifted output. They are temporary - so the
> framebuffer contents themselves is not damaged (although I don't know
> how the compositor works - maybe the problem happens before scanout).
>
> This is on the same Radeon device:
>
>49:00.0 VGA compatible controller: Advanced Micro Devices, Inc.
> [AMD/ATI] Ellesmere [Radeon RX 470/480/570/570X/580/580X/590] (rev e7)
>
> with dual 4k monitors.
>
> Any idea?

Since Christian is mostly the compute/memory side, adding some display
folks for this.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch


[PATCH 1/1] Add test for hsaKmtAvailableMemory available memory inquiry

2022-01-10 Thread Daniel Phillips
Basic test for the new hsaKmtAvailableMemory library call. This is
a standalone test that does not modify any of the other tests, just to
be on the safe side. More elaborate tests are coming soon.

Change-Id: I738600d4b74cc5dba6b857e4c793f6b14b7d2283
Signed-off-by: Daniel Phillips 
---
 tests/kfdtest/src/KFDMemoryTest.cpp | 17 +
 1 file changed, 17 insertions(+)

diff --git a/tests/kfdtest/src/KFDMemoryTest.cpp b/tests/kfdtest/src/KFDMemoryTest.cpp
index 9f62727..1f93928 100644
--- a/tests/kfdtest/src/KFDMemoryTest.cpp
+++ b/tests/kfdtest/src/KFDMemoryTest.cpp
@@ -595,6 +595,23 @@ TEST_F(KFDMemoryTest, MemoryAlloc) {
 TEST_END
 }
 
+// Basic test for hsaKmtAvailableMemory
+TEST_F(KFDMemoryTest, MemoryAllocAll) {
+TEST_START(TESTPROFILE_RUNALL)
+
+unsigned int* pBig = NULL;
+unsigned int* pSmall = NULL;
+m_MemoryFlags.ui32.NoNUMABind = 1;
+HSAuint64 available;
+EXPECT_SUCCESS(hsaKmtAvailableMemory(0 /* system */, &available));
+EXPECT_SUCCESS(hsaKmtAllocMemory(0 /* system */, available, m_MemoryFlags, 
reinterpret_cast(&pBig)));
+EXPECT_NE(HSAKMT_STATUS_SUCCESS, hsaKmtAllocMemory(0 /* system */, 
PAGE_SIZE, m_MemoryFlags, reinterpret_cast(&pSmall)));
+EXPECT_SUCCESS(hsaKmtFreeMemory(pBig, available));
+EXPECT_SUCCESS(hsaKmtAllocMemory(0 /* system */, PAGE_SIZE, m_MemoryFlags, 
reinterpret_cast(&pSmall)));
+
+TEST_END
+}
+
 TEST_F(KFDMemoryTest, AccessPPRMem) {
 TEST_START(TESTPROFILE_RUNALL)
 
-- 
2.34.1



Re: [git pull] drm for 5.17-rc1 (pre-merge window pull)

2022-01-10 Thread Linus Torvalds
On Thu, Jan 6, 2022 at 10:12 PM Dave Airlie  wrote:
>
>   git://anongit.freedesktop.org/drm/drm tags/drm-next-2022-01-07

Gaah. I merged things and it built cleanly, and I pushed it out.

But then I actually *booted* it, and that's not pretty.

It *works", but it's almost unusable because of random scanline
flickering.  I'm not sure how to explain it, but it's as if there
wasn't quite enough bandwidth on the scan-out, so you get these lines
of noise and/or shifted output. They are temporary - so the
framebuffer contents themselves are not damaged (although I don't know
how the compositor works - maybe the problem happens before scanout).

This is on the same Radeon device:

   49:00.0 VGA compatible controller: Advanced Micro Devices, Inc.
[AMD/ATI] Ellesmere [Radeon RX 470/480/570/570X/580/580X/590] (rev e7)

with dual 4k monitors.

Any idea?

  Linus


Re: [git pull] drm for 5.17-rc1 (pre-merge window pull)

2022-01-10 Thread pr-tracker-bot
The pull request you sent on Fri, 7 Jan 2022 16:12:06 +1000:

> git://anongit.freedesktop.org/drm/drm tags/drm-next-2022-01-07

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/8d0749b4f83bf4768ceae45ee6a79e6e7eddfc2a

Thank you!

-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/prtracker.html


[PATCH 1/1] Add hsaKmtAvailableMemory available memory inquiry to libhsakmt

2022-01-10 Thread Daniel Phillips
Add a library call to inquire the memory available for allocation per
node. It uses the AMDKFD_IOC_AVAILABLE_MEMORY ioctl available in KFD
ioctl version 1.7.

Change-Id: Id770fc2261e9e076f2fbce7dcdac640a6354ddbe
---
 include/hsakmt.h  | 11 +++
 include/linux/kfd_ioctl.h | 18 --
 src/memory.c  | 23 +++
 3 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/include/hsakmt.h b/include/hsakmt.h
index ff2d023..abc617f 100644
--- a/include/hsakmt.h
+++ b/include/hsakmt.h
@@ -374,6 +374,17 @@ hsaKmtFreeMemory(
 HSAuint64   SizeInBytes //IN
 );
 
+/**
+  Inquires memory available for allocation as a memory buffer
+*/
+
+HSAKMT_STATUS
+HSAKMTAPI
+hsaKmtAvailableMemory(
+HSAuint32 Node,
+HSAuint64 *AvailableBytes
+);
+
 /**
   Registers with KFD a memory buffer that may be accessed by the GPU
 */
diff --git a/include/linux/kfd_ioctl.h b/include/linux/kfd_ioctl.h
index 039b30b..a81ae37 100644
--- a/include/linux/kfd_ioctl.h
+++ b/include/linux/kfd_ioctl.h
@@ -32,9 +32,10 @@
  * - 1.4 - Indicate new SRAM EDC bit in device properties
  * - 1.5 - Add SVM API
  * - 1.6 - Query clear flags in SVM get_attr API
+ * - 1.7 - Add available_memory ioctl
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 6
+#define KFD_IOCTL_MINOR_VERSION 7
 
 /*
  * Debug revision change log
@@ -761,6 +762,16 @@ struct kfd_ioctl_free_memory_of_gpu_args {
__u64 handle;   /* to KFD */
 };
 
+/* Inquire available memory with kfd_ioctl_get_available_memory
+ *
+ * @available: memory available for alloc
+ */
+struct kfd_ioctl_get_available_memory_args {
+   __u64 available;/* from KFD */
+   __u32 gpu_id;   /* to KFD */
+   __u32 pad;
+};
+
 /* Map memory to one or more GPUs
  *
  * @handle:memory handle returned by alloc
@@ -1240,8 +1251,11 @@ struct kfd_ioctl_set_xnack_mode_args {
 #define AMDKFD_IOC_SET_XNACK_MODE  \
AMDKFD_IOWR(0x21, struct kfd_ioctl_set_xnack_mode_args)
 
+#define AMDKFD_IOC_AVAILABLE_MEMORY\
+   AMDKFD_IOR(0x22, struct kfd_ioctl_get_available_memory_args)
+
 #define AMDKFD_COMMAND_START   0x01
-#define AMDKFD_COMMAND_END 0x22
+#define AMDKFD_COMMAND_END 0x23
 
 /* non-upstream ioctls */
 #define AMDKFD_IOC_IPC_IMPORT_HANDLE\
diff --git a/src/memory.c b/src/memory.c
index 6d2a4f4..b2cd759 100644
--- a/src/memory.c
+++ b/src/memory.c
@@ -199,6 +199,29 @@ HSAKMT_STATUS HSAKMTAPI hsaKmtFreeMemory(void 
*MemoryAddress,
return fmm_release(MemoryAddress);
 }
 
+HSAKMT_STATUS HSAKMTAPI hsaKmtAvailableMemory(HSAuint32 Node, HSAuint64 *AvailableBytes)
+{
+   struct kfd_ioctl_get_available_memory_args args = {};
+   HSAKMT_STATUS result;
+
+   CHECK_KFD_OPEN();
+   CHECK_KFD_MINOR_VERSION(7);
+
+   pr_debug("[%s] node %d\n", __func__, Node);
+
+   result = validate_nodeid(Node, &args.gpu_id);
+   if (result != HSAKMT_STATUS_SUCCESS) {
+   pr_err("[%s] invalid node ID: %d\n", __func__, Node);
+   return result;
+   }
+
+   if (kmtIoctl(kfd_fd, AMDKFD_IOC_AVAILABLE_MEMORY, &args))
+   return HSAKMT_STATUS_ERROR;
+
+   *AvailableBytes = args.available;
+   return HSAKMT_STATUS_SUCCESS;
+}
+
 HSAKMT_STATUS HSAKMTAPI hsaKmtRegisterMemory(void *MemoryAddress,
 HSAuint64 MemorySizeInBytes)
 {
-- 
2.34.1
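For reference, a minimal caller-side sketch of the new inquiry call
(illustrative only; it assumes hsaKmtOpenKFD() has already succeeded,
that node 0 is a valid topology node, and it abbreviates error
handling):

#include <stdio.h>
#include "hsakmt.h"

/* Query how much memory node 0 can still allocate. */
static void print_available(void)
{
	HSAuint64 avail = 0;

	if (hsaKmtAvailableMemory(0 /* node */, &avail) == HSAKMT_STATUS_SUCCESS)
		printf("node 0: %llu bytes available\n",
		       (unsigned long long)avail);
}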



Re: [PATCH] drm/amdkfd: Check for null pointer after calling kmemdup

2022-01-10 Thread Felix Kuehling

On 2022-01-05 10:56 a.m., Felix Kuehling wrote:

Am 2022-01-05 um 4:09 a.m. schrieb Jiasheng Jiang:

Since the allocation can fail, kmemdup() may return a NULL pointer.
Therefore, check 'props2' in order to prevent the dereference of a
NULL pointer.

Fixes: 3a87177eb141 ("drm/amdkfd: Add topology support for dGPUs")
Signed-off-by: Jiasheng Jiang 

Reviewed-by: Felix Kuehling 

I applied the patch to amd-staging-drm-next.

Regards,
  Felix





---
  drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
index c60e82697385..d15380c65c6d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
@@ -410,6 +410,9 @@ static int kfd_parse_subtype_iolink(struct 
crat_subtype_iolink *iolink,
return -ENODEV;
/* same everything but the other direction */
props2 = kmemdup(props, sizeof(*props2), GFP_KERNEL);
+   if (!props2)
+   return -ENOMEM;
+
props2->node_from = id_to;
props2->node_to = id_from;
props2->kobj = NULL;


Re: [PATCH 1/1] Add available memory ioctl for libhsakmt

2022-01-10 Thread Deucher, Alexander
[Public]

This is missing your signed-off-by.  Additionally, for UAPI changes, we need a
link to the patches for the userspace component that will make use of it.

Alex


From: amd-gfx  on behalf of Daniel 
Phillips 
Sent: Monday, January 10, 2022 3:54 PM
To: amd-...@lists.freedesktop.org ; 
dri-devel@lists.freedesktop.org 
Cc: Phillips, Daniel 
Subject: [PATCH 1/1] Add available memory ioctl for libhsakmt

From: Daniel Phillips 

Add an ioctl to inquire memory available for allocation by libhsakmt
per node, allowing for space consumed by page translation tables.
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h  |  1 +
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c| 14 ++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c| 17 +
 include/uapi/linux/kfd_ioctl.h  | 14 --
 4 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index fcbc8a9c9e06..64c6c36685d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -266,6 +266,7 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct 
amdgpu_device *adev,
 void amdgpu_amdkfd_gpuvm_release_process_vm(struct amdgpu_device *adev,
 void *drm_priv);
 uint64_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *drm_priv);
+size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev);
 int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
 struct amdgpu_device *adev, uint64_t va, uint64_t size,
 void *drm_priv, struct kgd_mem **mem,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 86a1a6c109d9..b7490a659173 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -190,6 +190,20 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
 return ret;
 }

+size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev)
+{
+   uint64_t reserved_for_pt =
+   ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
+   size_t available_memory;
+
+   spin_lock(&kfd_mem_limit.mem_limit_lock);
+   available_memory =
+   adev->gmc.real_vram_size -
+   adev->kfd.vram_used - reserved_for_pt;
+   spin_unlock(&kfd_mem_limit.mem_limit_lock);
+   return available_memory;
+}
+
 static void unreserve_mem_limit(struct amdgpu_device *adev,
 uint64_t size, u32 alloc_flag)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 4bfc0c8ab764..5c2f6d97ff1c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -486,6 +486,20 @@ static int kfd_ioctl_get_queue_wave_state(struct file 
*filep,
 return r;
 }

+static int kfd_ioctl_get_available_memory(struct file *filep,
+struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_get_available_memory_args *args = data;
+   struct kfd_dev *dev;
+
+   dev = kfd_device_by_id(args->gpu_id);
+   if (!dev)
+   return -EINVAL;
+
+   args->available = amdgpu_amdkfd_get_available_memory(dev->adev);
+   return 0;
+}
+
 static int kfd_ioctl_set_memory_policy(struct file *filep,
 struct kfd_process *p, void *data)
 {
@@ -1959,6 +1973,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {

 AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_XNACK_MODE,
 kfd_ioctl_set_xnack_mode, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_AVAILABLE_MEMORY,
+   kfd_ioctl_get_available_memory, 0),
 };

 #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls)
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index af96af174dc4..94a99add2432 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -32,9 +32,10 @@
  * - 1.4 - Indicate new SRAM EDC bit in device properties
  * - 1.5 - Add SVM API
  * - 1.6 - Query clear flags in SVM get_attr API
+ * - 1.7 - Add available_memory ioctl
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 6
+#define KFD_IOCTL_MINOR_VERSION 7

 struct kfd_ioctl_get_version_args {
 __u32 major_version;/* from KFD */
@@ -98,6 +99,12 @@ struct kfd_ioctl_get_queue_wave_state_args {
 __u32 pad;
 };

+struct kfd_ioctl_get_available_memory_args {
+   __u64 available;/* from KFD */
+   __u32 gpu_id;   /* to KFD */
+   __u32 pad;
+};
+
 /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */
 #define KFD_IOC_CACHE_POLICY_COHERENT 0
 #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1
@@ -742,7 +749,10 @@ struct kfd_ioctl_set_xnack_mode_args {
 #defin

Re: [git pull] drm for 5.17-rc1 (pre-merge window pull)

2022-01-10 Thread Linus Torvalds
On Thu, Jan 6, 2022 at 10:12 PM Dave Airlie  wrote:
>
> nouveau_fence.c is the only conflict I've seen and I've taken the result from
> our rerere cache in the merge above. It's non trivial, would be good to have
> Christian confirm it as well.

Thanks, that conflict really ended up being one that I would have done
very differently without having had that pointer to your reference
merge. And I would almost certainly have messed it up in the process.

So what I did was to look at your merge resolution (or possibly
Christian's? I don't know how you guys share your trees and the origin
of that rerere), and tried to understand it, and basically recreate
it.

It's not exactly the same (different whitespace and variable
lifetimes), but I think I got the gist of it.

Thanks for the pointer, and hopefully I didn't mess it up _despite_
your merge showing me what I should aim for ;)

   Linus


[PATCH v10 5/5] drm/msm/dp: stop link training after link training 2 failed

2022-01-10 Thread Kuogee Hsieh
Each DP link training session consists of link training 1 followed by
link training 2. There is a maximum of 5 retries of DP link training
before it is declared failed. Link training must be stopped at the end
of link training 2 if it fails, so that the next link training 1 can
start fresh. This patch fixes link compliance test case 4.3.1.13
(Source Device Link Training EQ Fallback Test).

Changes in v10:
--  group into one series

Fixes: 2e0adc765d88 ("drm/msm/dp: do not end dp link training until video is ready")
Signed-off-by: Kuogee Hsieh 
Reviewed-by: Stephen Boyd 
---
 drivers/gpu/drm/msm/dp/dp_ctrl.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/msm/dp/dp_ctrl.c b/drivers/gpu/drm/msm/dp/dp_ctrl.c
index f98df93..245e1b9 100644
--- a/drivers/gpu/drm/msm/dp/dp_ctrl.c
+++ b/drivers/gpu/drm/msm/dp/dp_ctrl.c
@@ -1755,6 +1755,9 @@ int dp_ctrl_on_link(struct dp_ctrl *dp_ctrl)
/* end with failure */
break; /* lane == 1 already */
}
+
+   /* stop link training before starting retraining */
+   dp_ctrl_clear_training_pattern(ctrl);
}
}
 
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



[PATCH v10 4/5] drm/msm/dp: add support of tps4 (training pattern 4) for HBR3

2022-01-10 Thread Kuogee Hsieh
From: Kuogee Hsieh 

Some DP sinks prefer to use TPS4 instead of TPS3 during link training #2.
This patch uses TPS4 to perform link training #2 if the sink's DPCD
supports it.

Changes in V2:
-- replace dp_catalog_ctrl_set_pattern() with dp_catalog_ctrl_set_pattern_state_bit()

Changes in V3:
-- change state_ctrl_bits type to u32 and pattern type to u8

Changes in V4:
-- align } else if { and } else {

Changes in v10:
--  group into one series

Signed-off-by: Kuogee Hsieh 

Reviewed-by: Stephen Boyd 
---
 drivers/gpu/drm/msm/dp/dp_catalog.c | 12 ++--
 drivers/gpu/drm/msm/dp/dp_catalog.h |  2 +-
 drivers/gpu/drm/msm/dp/dp_ctrl.c| 17 -
 3 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/msm/dp/dp_catalog.c 
b/drivers/gpu/drm/msm/dp/dp_catalog.c
index 6ae9b29..64f0b26 100644
--- a/drivers/gpu/drm/msm/dp/dp_catalog.c
+++ b/drivers/gpu/drm/msm/dp/dp_catalog.c
@@ -456,19 +456,19 @@ void dp_catalog_ctrl_config_msa(struct dp_catalog 
*dp_catalog,
dp_write_p0(catalog, MMSS_DP_DSC_DTO, 0x0);
 }
 
-int dp_catalog_ctrl_set_pattern(struct dp_catalog *dp_catalog,
-   u32 pattern)
+int dp_catalog_ctrl_set_pattern_state_bit(struct dp_catalog *dp_catalog,
+   u32 state_bit)
 {
int bit, ret;
u32 data;
struct dp_catalog_private *catalog = container_of(dp_catalog,
struct dp_catalog_private, dp_catalog);
 
-   bit = BIT(pattern - 1);
-   DRM_DEBUG_DP("hw: bit=%d train=%d\n", bit, pattern);
+   bit = BIT(state_bit - 1);
+   DRM_DEBUG_DP("hw: bit=%d train=%d\n", bit, state_bit);
dp_catalog_ctrl_state_ctrl(dp_catalog, bit);
 
-   bit = BIT(pattern - 1) << DP_MAINLINK_READY_LINK_TRAINING_SHIFT;
+   bit = BIT(state_bit - 1) << DP_MAINLINK_READY_LINK_TRAINING_SHIFT;
 
/* Poll for mainlink ready status */
ret = readx_poll_timeout(readl, catalog->io->dp_controller.link.base +
@@ -476,7 +476,7 @@ int dp_catalog_ctrl_set_pattern(struct dp_catalog 
*dp_catalog,
data, data & bit,
POLLING_SLEEP_US, POLLING_TIMEOUT_US);
if (ret < 0) {
-   DRM_ERROR("set pattern for link_train=%d failed\n", pattern);
+   DRM_ERROR("set state_bit for link_train=%d failed\n", 
state_bit);
return ret;
}
return 0;
diff --git a/drivers/gpu/drm/msm/dp/dp_catalog.h 
b/drivers/gpu/drm/msm/dp/dp_catalog.h
index 6965afa..7dea101 100644
--- a/drivers/gpu/drm/msm/dp/dp_catalog.h
+++ b/drivers/gpu/drm/msm/dp/dp_catalog.h
@@ -94,7 +94,7 @@ void dp_catalog_ctrl_mainlink_ctrl(struct dp_catalog 
*dp_catalog, bool enable);
 void dp_catalog_ctrl_config_misc(struct dp_catalog *dp_catalog, u32 cc, u32 tb);
 void dp_catalog_ctrl_config_msa(struct dp_catalog *dp_catalog, u32 rate,
u32 stream_rate_khz, bool fixed_nvid);
-int dp_catalog_ctrl_set_pattern(struct dp_catalog *dp_catalog, u32 pattern);
+int dp_catalog_ctrl_set_pattern_state_bit(struct dp_catalog *dp_catalog, u32 pattern);
 void dp_catalog_ctrl_reset(struct dp_catalog *dp_catalog);
 bool dp_catalog_ctrl_mainlink_ready(struct dp_catalog *dp_catalog);
 void dp_catalog_ctrl_enable_irq(struct dp_catalog *dp_catalog, bool enable);
diff --git a/drivers/gpu/drm/msm/dp/dp_ctrl.c b/drivers/gpu/drm/msm/dp/dp_ctrl.c
index 9c80b49..f98df93 100644
--- a/drivers/gpu/drm/msm/dp/dp_ctrl.c
+++ b/drivers/gpu/drm/msm/dp/dp_ctrl.c
@@ -1083,7 +1083,7 @@ static int dp_ctrl_link_train_1(struct dp_ctrl_private 
*ctrl,
 
*training_step = DP_TRAINING_1;
 
-   ret = dp_catalog_ctrl_set_pattern(ctrl->catalog, DP_TRAINING_PATTERN_1);
+   ret = dp_catalog_ctrl_set_pattern_state_bit(ctrl->catalog, 1);
if (ret)
return ret;
dp_ctrl_train_pattern_set(ctrl, DP_TRAINING_PATTERN_1 |
@@ -1181,7 +1181,8 @@ static int dp_ctrl_link_train_2(struct dp_ctrl_private 
*ctrl,
int *training_step)
 {
int tries = 0, ret = 0;
-   char pattern;
+   u8 pattern;
+   u32 state_ctrl_bit;
int const maximum_retries = 5;
u8 link_status[DP_LINK_STATUS_SIZE];
 
@@ -1189,12 +1190,18 @@ static int dp_ctrl_link_train_2(struct dp_ctrl_private 
*ctrl,
 
*training_step = DP_TRAINING_2;
 
-   if (drm_dp_tps3_supported(ctrl->panel->dpcd))
+   if (drm_dp_tps4_supported(ctrl->panel->dpcd)) {
+   pattern = DP_TRAINING_PATTERN_4;
+   state_ctrl_bit = 4;
+   } else if (drm_dp_tps3_supported(ctrl->panel->dpcd)) {
pattern = DP_TRAINING_PATTERN_3;
-   else
+   state_ctrl_bit = 3;
+   } else {
pattern = DP_TRAINING_PATTERN_2;
+   state_ctrl_bit = 2;
+   }
 
-   ret = dp_catalog_ctrl_set_pattern(ctrl->catalog, pattern);
+   ret = dp_catalog_ctrl_set_pattern_stat

[PATCH v10 3/5] drm/msm/dp: populate connector of struct dp_panel

2022-01-10 Thread Kuogee Hsieh
DP CTS test case 4.2.2.6 deliberately presents a valid EDID with a bad
checksum and expects the DP source to return the correct checksum. During
the drm EDID read, the correct EDID checksum is calculated and stored at
connector::real_edid_checksum.

The problem is that struct dp_panel::connector is never assigned; instead
the connector is stored in struct msm_dp::connector. When we run compliance
test case 4.2.2.6, dp_panel_handle_sink_request() won't have a valid
EDID set in struct dp_panel::edid, so we'll try to use the connector's
real_edid_checksum and hit a NULL pointer dereference because the
connector pointer is never assigned.

Changes in V2:
-- populate panel connector at msm_dp_modeset_init() instead of at dp_panel_read_sink_caps()

Changes in V3:
-- remove unhelpful kernel crash trace commit text
-- remove renaming dp_display parameter to dp

Changes in V4:
-- add more details to commit text

Changes in v10:
--  group into one series

Fixes: 7948fe12d47 ("drm/msm/dp: return correct edid checksum after corrupted edid checksum read")
Signed-off-by: Kuogee Hsieh 

Reviewed-by: Bjorn Andersson 
Reviewed-by: Stephen Boyd 
---
 drivers/gpu/drm/msm/dp/dp_display.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/msm/dp/dp_display.c 
b/drivers/gpu/drm/msm/dp/dp_display.c
index c7f0423..e76e375 100644
--- a/drivers/gpu/drm/msm/dp/dp_display.c
+++ b/drivers/gpu/drm/msm/dp/dp_display.c
@@ -1489,6 +1489,7 @@ int msm_dp_modeset_init(struct msm_dp *dp_display, struct 
drm_device *dev,
struct drm_encoder *encoder)
 {
struct msm_drm_private *priv;
+   struct dp_display_private *dp_priv;
int ret;
 
if (WARN_ON(!encoder) || WARN_ON(!dp_display) || WARN_ON(!dev))
@@ -1497,6 +1498,8 @@ int msm_dp_modeset_init(struct msm_dp *dp_display, struct 
drm_device *dev,
priv = dev->dev_private;
dp_display->drm_dev = dev;
 
+   dp_priv = container_of(dp_display, struct dp_display_private, dp_display);
+
ret = dp_display_request_irq(dp_display);
if (ret) {
DRM_ERROR("request_irq failed, ret=%d\n", ret);
@@ -1514,6 +1517,8 @@ int msm_dp_modeset_init(struct msm_dp *dp_display, struct 
drm_device *dev,
return ret;
}
 
+   dp_priv->panel->connector = dp_display->connector;
+
priv->connectors[priv->num_connectors++] = dp_display->connector;
return 0;
 }
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



[PATCH v10 2/5] drm/msm/dp: do not initialize phy until plugin interrupt received

2022-01-10 Thread Kuogee Hsieh
In the current DP driver, regulators, clocks, irq and phy are grouped
together within functions and are not executed in a symmetric manner.
This increases the difficulty of code maintenance and limits code
scalability. This patch divides the driver's life cycle of operation
into four states: resume (including booting up), dongle plug-in, dongle
unplug and suspend. Regulators, core clocks and irq are grouped together
and enabled at resume (or boot) so that the DP controller is armed and
ready to receive HPD plug-in interrupts. An HPD plug-in interrupt is
generated when a dongle plugs into the DUT (device under test). Once the
HPD plug-in interrupt is received, the DP controller initializes the phy
so that dpcd reads/writes will work and the following link training can
proceed successfully. The DP phy is disabled after the main link is torn
down at the end of the unplug HPD interrupt handler, triggered when the
dongle is unplugged from the DUT. Finally, regulators, core clocks and
irq are disabled at the corresponding suspend.

Changes in V2:
-- removed unnecessary dp_ctrl NULL check
-- removed unnecessary phy init_count and power_count DRM_DEBUG_DP logs
-- remove flip parameter out of dp_ctrl_irq_enable()
-- add fixes tag

Changes in V3:
-- call dp_display_host_phy_init() instead of dp_ctrl_phy_init() at
dp_display_host_init() for eDP

Changes in V4:
-- rewording commit text to match this commit changes

Changes in V5:
-- rebase on top of msm-next branch

Changes in V6:
-- delete flip variable

Changes in V7:
-- dp_ctrl_irq_enable/disabe() merged into dp_ctrl_reset_irq_ctrl()

Changes in V8:
-- add more detail comment regrading dp phy at dp_display_host_init()

Changes in V9:
-- remove set phy_initialized to false when -ECONNRESET detected

Changes in v10:
--  group into one series

Fixes: 8ede2ecc3e5e ("drm/msm/dp: Add DP compliance tests on Snapdragon Chipsets")
Signed-off-by: Kuogee Hsieh 
---
 drivers/gpu/drm/msm/dp/dp_ctrl.c| 80 +
 drivers/gpu/drm/msm/dp/dp_ctrl.h|  8 ++--
 drivers/gpu/drm/msm/dp/dp_display.c | 89 -
 3 files changed, 94 insertions(+), 83 deletions(-)

diff --git a/drivers/gpu/drm/msm/dp/dp_ctrl.c b/drivers/gpu/drm/msm/dp/dp_ctrl.c
index c724cb0..9c80b49 100644
--- a/drivers/gpu/drm/msm/dp/dp_ctrl.c
+++ b/drivers/gpu/drm/msm/dp/dp_ctrl.c
@@ -1365,60 +1365,44 @@ static int dp_ctrl_enable_stream_clocks(struct 
dp_ctrl_private *ctrl)
return ret;
 }
 
-int dp_ctrl_host_init(struct dp_ctrl *dp_ctrl, bool flip, bool reset)
+void dp_ctrl_reset_irq_ctrl(struct dp_ctrl *dp_ctrl, bool enable)
+{
+   struct dp_ctrl_private *ctrl;
+
+   ctrl = container_of(dp_ctrl, struct dp_ctrl_private, dp_ctrl);
+
+   dp_catalog_ctrl_reset(ctrl->catalog);
+
+   if (enable)
+   dp_catalog_ctrl_enable_irq(ctrl->catalog, enable);
+}
+
+void dp_ctrl_phy_init(struct dp_ctrl *dp_ctrl)
 {
struct dp_ctrl_private *ctrl;
struct dp_io *dp_io;
struct phy *phy;
 
-   if (!dp_ctrl) {
-   DRM_ERROR("Invalid input data\n");
-   return -EINVAL;
-   }
-
ctrl = container_of(dp_ctrl, struct dp_ctrl_private, dp_ctrl);
dp_io = &ctrl->parser->io;
phy = dp_io->phy;
 
-   ctrl->dp_ctrl.orientation = flip;
-
-   if (reset)
-   dp_catalog_ctrl_reset(ctrl->catalog);
-
-   DRM_DEBUG_DP("flip=%d\n", flip);
dp_catalog_ctrl_phy_reset(ctrl->catalog);
phy_init(phy);
-   dp_catalog_ctrl_enable_irq(ctrl->catalog, true);
-
-   return 0;
 }
 
-/**
- * dp_ctrl_host_deinit() - Uninitialize DP controller
- * @dp_ctrl: Display Port Driver data
- *
- * Perform required steps to uninitialize DP controller
- * and its resources.
- */
-void dp_ctrl_host_deinit(struct dp_ctrl *dp_ctrl)
+void dp_ctrl_phy_exit(struct dp_ctrl *dp_ctrl)
 {
struct dp_ctrl_private *ctrl;
struct dp_io *dp_io;
struct phy *phy;
 
-   if (!dp_ctrl) {
-   DRM_ERROR("Invalid input data\n");
-   return;
-   }
-
ctrl = container_of(dp_ctrl, struct dp_ctrl_private, dp_ctrl);
dp_io = &ctrl->parser->io;
phy = dp_io->phy;
 
-   dp_catalog_ctrl_enable_irq(ctrl->catalog, false);
+   dp_catalog_ctrl_phy_reset(ctrl->catalog);
phy_exit(phy);
-
-   DRM_DEBUG_DP("Host deinitialized successfully\n");
 }
 
 static bool dp_ctrl_use_fixed_nvid(struct dp_ctrl_private *ctrl)
@@ -1488,7 +1472,10 @@ static int dp_ctrl_deinitialize_mainlink(struct 
dp_ctrl_private *ctrl)
}
 
phy_power_off(phy);
+
+   /* aux channel down, reinit phy */
phy_exit(phy);
+   phy_init(phy);
 
return 0;
 }
@@ -1893,8 +1880,14 @@ int dp_ctrl_off_link_stream(struct dp_ctrl *dp_ctrl)
return ret;
}
 
+   DRM_DEBUG_DP("Before, phy=%x init_count=%d power_on=%d\n",
+   (u32)(uintptr_t)phy, phy->init_count, phy->power_count);
+
phy_power_off(phy);
 

[PATCH v10 1/5] drm/msm/dp: dp_link_parse_sink_count() return immediately if aux read failed

2022-01-10 Thread Kuogee Hsieh
Check the aux read/write status at both dp_link_parse_sink_count()
and dp_link_parse_sink_status_field() to avoid a long timeout delay
when a dp aux read/write times out because the cable is unplugged.
Also make sure the dp controller has been initialized before starting
dpcd reads and writes.

Changes in V4:
-- split this patch as stand alone patch

Changes in v5:
-- rebase on msm-next branch

Changes in v6:
-- add more details commit text

Changes in v10:
--  group into one series

Signed-off-by: Kuogee Hsieh 
Reviewed-by: Stephen Boyd 
Tested-by: Stephen Boyd 
---
 drivers/gpu/drm/msm/dp/dp_display.c | 12 +---
 drivers/gpu/drm/msm/dp/dp_link.c| 19 ++-
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/msm/dp/dp_display.c 
b/drivers/gpu/drm/msm/dp/dp_display.c
index 3d61459..0766752 100644
--- a/drivers/gpu/drm/msm/dp/dp_display.c
+++ b/drivers/gpu/drm/msm/dp/dp_display.c
@@ -692,9 +692,15 @@ static int dp_irq_hpd_handle(struct dp_display_private 
*dp, u32 data)
return 0;
}
 
-   ret = dp_display_usbpd_attention_cb(&dp->pdev->dev);
-   if (ret == -ECONNRESET) { /* cable unplugged */
-   dp->core_initialized = false;
+   /*
+* dp core (ahb/aux clks) must be initialized before
+* irq_hpd is handled
+*/
+   if (dp->core_initialized) {
+   ret = dp_display_usbpd_attention_cb(&dp->pdev->dev);
+   if (ret == -ECONNRESET) { /* cable unplugged */
+   dp->core_initialized = false;
+   }
}
DRM_DEBUG_DP("hpd_state=%d\n", state);
 
diff --git a/drivers/gpu/drm/msm/dp/dp_link.c b/drivers/gpu/drm/msm/dp/dp_link.c
index a5bdfc5..d4d31e5 100644
--- a/drivers/gpu/drm/msm/dp/dp_link.c
+++ b/drivers/gpu/drm/msm/dp/dp_link.c
@@ -737,18 +737,25 @@ static int dp_link_parse_sink_count(struct dp_link 
*dp_link)
return 0;
 }
 
-static void dp_link_parse_sink_status_field(struct dp_link_private *link)
+static int dp_link_parse_sink_status_field(struct dp_link_private *link)
 {
int len = 0;
 
link->prev_sink_count = link->dp_link.sink_count;
-   dp_link_parse_sink_count(&link->dp_link);
+   len = dp_link_parse_sink_count(&link->dp_link);
+   if (len < 0) {
+   DRM_ERROR("DP parse sink count failed\n");
+   return len;
+   }
 
len = drm_dp_dpcd_read_link_status(link->aux,
link->link_status);
-   if (len < DP_LINK_STATUS_SIZE)
+   if (len < DP_LINK_STATUS_SIZE) {
DRM_ERROR("DP link status read failed\n");
-   dp_link_parse_request(link);
+   return len;
+   }
+
+   return dp_link_parse_request(link);
 }
 
 /**
@@ -1023,7 +1030,9 @@ int dp_link_process_request(struct dp_link *dp_link)
 
dp_link_reset_data(link);
 
-   dp_link_parse_sink_status_field(link);
+   ret = dp_link_parse_sink_status_field(link);
+   if (ret)
+   return ret;
 
if (link->request.test_requested == DP_TEST_LINK_EDID_READ) {
dp_link->sink_request |= DP_TEST_LINK_EDID_READ;
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



[PATCH v10 0/5] group dp driver related patches into one series

2022-01-10 Thread Kuogee Hsieh
Group the 5 dp driver related patches below into one series.

Kuogee Hsieh (5):
  drm/msm/dp: dp_link_parse_sink_count() return immediately if aux read
failed
  drm/msm/dp: do not initialize phy until plugin interrupt received
  drm/msm/dp: populate connector of struct dp_panel
  drm/msm/dp: add support of tps4 (training pattern 4) for HBR3
  drm/msm/dp: stop link training after link training 2 failed

 drivers/gpu/drm/msm/dp/dp_catalog.c |  12 ++---
 drivers/gpu/drm/msm/dp/dp_catalog.h |   2 +-
 drivers/gpu/drm/msm/dp/dp_ctrl.c| 100 
 drivers/gpu/drm/msm/dp/dp_ctrl.h|   8 +--
 drivers/gpu/drm/msm/dp/dp_display.c |  98 ---
 drivers/gpu/drm/msm/dp/dp_link.c|  19 +--
 6 files changed, 140 insertions(+), 99 deletions(-)

-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



[PATCH 1/1] Add available memory ioctl for libhsakmt

2022-01-10 Thread Daniel Phillips
From: Daniel Phillips 

Add an ioctl to inquire memory available for allocation by libhsakmt
per node, allowing for space consumed by page translation tables.
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h  |  1 +
 .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c| 14 ++
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c| 17 +
 include/uapi/linux/kfd_ioctl.h  | 14 --
 4 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index fcbc8a9c9e06..64c6c36685d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -266,6 +266,7 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct 
amdgpu_device *adev,
 void amdgpu_amdkfd_gpuvm_release_process_vm(struct amdgpu_device *adev,
void *drm_priv);
 uint64_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *drm_priv);
+size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev);
 int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
struct amdgpu_device *adev, uint64_t va, uint64_t size,
void *drm_priv, struct kgd_mem **mem,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 86a1a6c109d9..b7490a659173 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -190,6 +190,20 @@ static int amdgpu_amdkfd_reserve_mem_limit(struct 
amdgpu_device *adev,
return ret;
 }
 
+size_t amdgpu_amdkfd_get_available_memory(struct amdgpu_device *adev)
+{
+   uint64_t reserved_for_pt =
+   ESTIMATE_PT_SIZE(amdgpu_amdkfd_total_mem_size);
+   size_t available_memory;
+
+   spin_lock(&kfd_mem_limit.mem_limit_lock);
+   available_memory =
+   adev->gmc.real_vram_size -
+   adev->kfd.vram_used - reserved_for_pt;
+   spin_unlock(&kfd_mem_limit.mem_limit_lock);
+   return available_memory;
+}
+
 static void unreserve_mem_limit(struct amdgpu_device *adev,
uint64_t size, u32 alloc_flag)
 {
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 4bfc0c8ab764..5c2f6d97ff1c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -486,6 +486,20 @@ static int kfd_ioctl_get_queue_wave_state(struct file 
*filep,
return r;
 }
 
+static int kfd_ioctl_get_available_memory(struct file *filep,
+struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_get_available_memory_args *args = data;
+   struct kfd_dev *dev;
+
+   dev = kfd_device_by_id(args->gpu_id);
+   if (!dev)
+   return -EINVAL;
+
+   args->available = amdgpu_amdkfd_get_available_memory(dev->adev);
+   return 0;
+}
+
 static int kfd_ioctl_set_memory_policy(struct file *filep,
struct kfd_process *p, void *data)
 {
@@ -1959,6 +1973,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_XNACK_MODE,
kfd_ioctl_set_xnack_mode, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_AVAILABLE_MEMORY,
+   kfd_ioctl_get_available_memory, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)
diff --git a/include/uapi/linux/kfd_ioctl.h b/include/uapi/linux/kfd_ioctl.h
index af96af174dc4..94a99add2432 100644
--- a/include/uapi/linux/kfd_ioctl.h
+++ b/include/uapi/linux/kfd_ioctl.h
@@ -32,9 +32,10 @@
  * - 1.4 - Indicate new SRAM EDC bit in device properties
  * - 1.5 - Add SVM API
  * - 1.6 - Query clear flags in SVM get_attr API
+ * - 1.7 - Add available_memory ioctl
  */
 #define KFD_IOCTL_MAJOR_VERSION 1
-#define KFD_IOCTL_MINOR_VERSION 6
+#define KFD_IOCTL_MINOR_VERSION 7
 
 struct kfd_ioctl_get_version_args {
__u32 major_version;/* from KFD */
@@ -98,6 +99,12 @@ struct kfd_ioctl_get_queue_wave_state_args {
__u32 pad;
 };
 
+struct kfd_ioctl_get_available_memory_args {
+   __u64 available;/* from KFD */
+   __u32 gpu_id;   /* to KFD */
+   __u32 pad;
+};
+
 /* For kfd_ioctl_set_memory_policy_args.default_policy and alternate_policy */
 #define KFD_IOC_CACHE_POLICY_COHERENT 0
 #define KFD_IOC_CACHE_POLICY_NONCOHERENT 1
@@ -742,7 +749,10 @@ struct kfd_ioctl_set_xnack_mode_args {
 #define AMDKFD_IOC_SET_XNACK_MODE  \
AMDKFD_IOWR(0x21, struct kfd_ioctl_set_xnack_mode_args)
 
+#define AMDKFD_IOC_AVAILABLE_MEMORY\
+   AMDKFD_IOR(0x22, struct kfd_ioctl_get_available_memory_args)
+
 #define AMDKFD_COMMAND_START   0x01
-#define AMDKFD_COMMAND_END 0x22
+#define AMDKFD_COMMAND_END 0x23
 
 #endif
-- 
2.34.1
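For illustration, this is roughly how a userspace test could drive the
new ioctl directly (a sketch, not part of the patch; the gpu_id value
is a hypothetical placeholder that would normally be read from the
sysfs topology):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>

int main(void)
{
	struct kfd_ioctl_get_available_memory_args args = {0};
	int fd = open("/dev/kfd", O_RDWR);

	if (fd < 0)
		return 1;

	args.gpu_id = 0x1002;	/* hypothetical; use a real id from sysfs */
	if (ioctl(fd, AMDKFD_IOC_AVAILABLE_MEMORY, &args) == 0)
		printf("available: %llu bytes\n",
		       (unsigned long long)args.available);

	close(fd);
	return 0;
}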



Re: [PATCH 1/4] dt-bindings: backlight: qcom-wled: Add PM6150L compatible

2022-01-10 Thread Rob Herring
On Wed, 29 Dec 2021 18:03:55 +0100, Luca Weiss wrote:
> Document the compatible for the wled block found in PM6150L.
> 
> Signed-off-by: Luca Weiss 
> ---
>  Documentation/devicetree/bindings/leds/backlight/qcom-wled.yaml | 1 +
>  1 file changed, 1 insertion(+)
> 

Acked-by: Rob Herring 


Phyr Starter

2022-01-10 Thread Matthew Wilcox
TLDR: I want to introduce a new data type:

struct phyr {
phys_addr_t addr;
size_t len;
};

and use it to replace bio_vec as well as using it to replace the array
of struct pages used by get_user_pages() and friends.

---

There are two distinct problems I want to address: doing I/O to memory
which does not have a struct page and efficiently doing I/O to large
blobs of physically contiguous memory, regardless of whether it has a
struct page.  There are some other improvements which I regard as minor.

There are many types of memory that one might want to do I/O to that do
not have a struct page, some examples:
 - Memory on a graphics card (or other PCI card, but gfx seems to be
   the primary provider of DRAM on the PCI bus today)
 - DAX, or other pmem (there are some fake pages today, but this is
   mostly a workaround for the IO problem today)
 - Guest memory being accessed from the hypervisor (KVM needs to
   create structpages to make this happen.  Xen doesn't ...)
All of these kinds of memories can be addressed by the CPU and so also
by a bus master.  That is, there is a physical address that the CPU
can use which will address this memory, and there is a way to convert
that to a DMA address which can be programmed into another device.
There's no intent here to support memory which can be accessed by a
complex scheme like writing an address to a control register and then
accessing the memory through a FIFO; this is for memory which can be
accessed by DMA and CPU loads and stores.

For get_user_pages() and friends, we currently fill an array of struct
pages, each one representing PAGE_SIZE bytes.  For an application that
is using 1GB hugepages, writing 2^18 entries is a significant overhead.
It also makes drivers hard to write as they have to recoalesce the
struct pages, even though the VM can tell it whether those 2^18 pages
are contiguous.

On the minor side, struct phyr can represent any mappable chunk of memory.
A bio_vec is limited to 2^32 bytes, while on 64-bit machines a phyr
can represent larger than 4GB.  A phyr is the same size as a bio_vec
on 64 bit (16 bytes), and the same size for 32-bit with PAE (12 bytes).
It is smaller for 32-bit machines without PAE (8 bytes instead of 12).

Finally, it may be possible to stop using scatterlist to describe the
input to the DMA-mapping operation.  We may be able to get struct
scatterlist down to just dma_address and dma_length, with chaining
handled through an enclosing struct.

I would like to see phyr replace bio_vec everywhere it's currently used.
I don't have time to do that work now because I'm busy with folios.
If someone else wants to take that on, I shall cheer from the sidelines.
What I do intend to do is:

 - Add an interface to gup.c to pin/unpin N phyrs
 - Add a sg_map_phyrs()
   This will take an array of phyrs and allocate an sg for them
 - Whatever else I need to do to make one RDMA driver happy with
   this scheme

At that point, I intend to stop and let others more familiar with this
area of the kernel continue the conversion of drivers.
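To make that gup.c interface concrete, here is a rough signature
sketch; the name pin_user_phyrs and its parameters are illustrative
guesses, not settled API:

struct phyr {
	phys_addr_t addr;
	size_t len;
};

/*
 * Illustrative only: pin the user range [start, start + len) and fill
 * at most max_phyrs entries, coalescing physically contiguous pages
 * into a single phyr each.  Returns the number of phyrs filled or a
 * negative errno, mirroring pin_user_pages() semantics.
 */
int pin_user_phyrs(unsigned long start, size_t len, unsigned int gup_flags,
		   struct phyr *phyrs, unsigned int max_phyrs);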

P.S. If you've had the Prodigy song running through your head the whole
time you've been reading this email ... I'm sorry / You're welcome.
If people insist, we can rename this to phys_range or something boring,
but I quite like the spelling of phyr with the pronunciation of "fire".


Re: [PATCH] drm/amd/display: invalid parameter check in dmub_hpd_callback

2022-01-10 Thread Alex Deucher
Applied.  Thanks!

Alex

On Mon, Jan 10, 2022 at 11:34 AM Harry Wentland  wrote:
>
> On 2022-01-09 13:42, José Expósito wrote:
> > The function performs a check on the "adev" input parameter, however, it
> > is used before the check.
> >
> > Initialize the "dev" variable after the sanity check to avoid a possible
> > NULL pointer dereference.
> >
> > Fixes: e27c41d5b0681 ("drm/amd/display: Support for DMUB HPD interrupt 
> > handling")
> > Addresses-Coverity-ID: 1493909 ("Null pointer dereference")
> > Signed-off-by: José Expósito 
>
> Reviewed-by: Harry Wentland 
>
> Harry
>
> > ---
> >  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
> > b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > index e727f1dd2a9a..7fbded7a6d9c 100644
> > --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > @@ -656,7 +656,7 @@ void dmub_hpd_callback(struct amdgpu_device *adev, 
> > struct dmub_notification *not
> >   struct drm_connector_list_iter iter;
> >   struct dc_link *link;
> >   uint8_t link_index = 0;
> > - struct drm_device *dev = adev->dm.ddev;
> > + struct drm_device *dev;
> >
> >   if (adev == NULL)
> >   return;
> > @@ -673,6 +673,7 @@ void dmub_hpd_callback(struct amdgpu_device *adev, 
> > struct dmub_notification *not
> >
> >   link_index = notify->link_index;
> >   link = adev->dm.dc->links[link_index];
> > + dev = adev->dm.ddev;
> >
> >   drm_connector_list_iter_begin(dev, &iter);
> >   drm_for_each_connector_iter(connector, &iter) {
>


Re: [PATCH] drm/msm/dp: Add DisplayPort controller for SM8350

2022-01-10 Thread Rob Herring
On Mon, 27 Dec 2021 20:59:34 -0800, Bjorn Andersson wrote:
> The Qualcomm SM8350 platform comes with a single DisplayPort controller,
> add support for this in the DisplayPort driver.
> 
> Signed-off-by: Bjorn Andersson 
> ---
>  .../devicetree/bindings/display/msm/dp-controller.yaml| 1 +
>  drivers/gpu/drm/msm/dp/dp_display.c   | 8 
>  2 files changed, 9 insertions(+)
> 

Acked-by: Rob Herring 


Re: [PATCH 2/2] drm/panfrost: adjusted job affinity for dual core group GPUs

2022-01-10 Thread Alyssa Rosenzweig
> Whether it's worth the effort depends on whether anyone really cares
> about getting the full performance out of this particular GPU.
> 
> At this stage I think the main UABI change would be to add the opposite
> flag to kbase, (e.g. "PANFROST_JD_DOESNT_NEED_COHERENCY_ON_GPU"[1]) to
> opt-in to allowing the job to run across all cores.
> 
> The second change would be to allow compute jobs to be run on the second
> core group, so another flag: PANFROST_RUN_ON_SECOND_CORE_GROUP.
> 
> But clearly there's little point adding such flags until someone steps
> up to do the Mesa work.

I worry about the maintenance burden (both Mesa and kernel) of adding
UABI only used by a piece of hardware none of us own, and only useful
"sometimes" for that hardware. Doubly so for the second core group
support; currently Mesa doesn't advertise any compute support on
anything older than Mali T760 ... to the best of my knowledge, nobody
has missed that support either...

To be clear I am in favour of merging the patches needed for GLES2 to
work on all Malis, possibly at a performance cost on these dual-core
systems. That's a far cry from the level of support the DDK gave these
chips back in the day ... of course, the DDK doesn't support them at all
anymore, so Panfrost wins there by default! ;)


Re: [RFC PATCH] drm/panfrost: Handle IDVS_GROUP_SIZE feature

2022-01-10 Thread Alyssa Rosenzweig
> > This feature adds an extra IDVS group size field to the JM_CONFIG
> > register. In kbase, the value is configurable via the device tree; kbase
> > uses 0xF as a default if no value is specified. Until we find a device
> > demanding otherwise, let's always set the 0xF default on devices which
> > support this feature mimicking kbase's behaviour.
> 
> This is a performance thing - so I don't think it will break anything if
> this is wrong, it just won't be optimal.

Then interpret my remarks as hardcoding the default until we find a
device where setting to something other than 0xF improves performance
nontrivially. (Read: I am lazy and do not want to write dt-bindings for
something nobody will ever use.)

> > As JM_CONFIG is an undocumented register, it's not clear to me what
> > happens if we fail to include this handling. Index-driven vertex shading
> > already works on Bifrost boards with this feature without this handling.
> > Perhaps this has performance implications? Patch untested for the
> > moment, wanted to give Steven a chance to comment.
> 
> As it's a performance thing you shouldn't see correctness issues with
> not setting it. But 0xF seems to have been chosen as it gave the best
> overall performance (although for individual test content this can
> vary). AFAICT the performance impact isn't massive either.

Good to know, will update the commit message accordingly.

> Reviewed-by: Steven Price 
> 
> Since you've tagged this RFC I won't merge it now, but it looks correct
> to me.

Thanks for the review... I hope you like reviewing Panfrost patches
because I have a Valhall bring-up series waiting o:)

When I get a chance to uprev the kernel on my G52 board I'll see if I
can benchmark the impact of this change, so far this is only
compile-tested. Even if there's no impact the patch should likely go in
to stay consistent with kbase, but hopefully there's a win from this. At
that point I'll send a v2 with your reviewed-by (and hopefully no
changes other than the commit message) and we'll land that.


Re: [PATCH] drm/ttm: Don't inherit GEM object VMAs in child process

2022-01-10 Thread Bhardwaj, Rajneesh

Hi Christian


I have reverted the change from the amd-staging-drm-next as per the 
discussion.  Thank you.



Regards

Rajneesh

On 1/4/2022 1:08 PM, Felix Kuehling wrote:

[+Adrian]

Am 2021-12-23 um 2:05 a.m. schrieb Christian König:


Am 22.12.21 um 21:53 schrieb Daniel Vetter:

On Mon, Dec 20, 2021 at 01:12:51PM -0500, Bhardwaj, Rajneesh wrote:

[SNIP]
Still sounds funky. I think minimally we should have an ack from CRIU
developers that this is officially the right way to solve this
problem. I
really don't want to have random one-off hacks that don't work across
the
board, for a problem where we (drm subsystem) really shouldn't be the
only
one with this problem. Where "this problem" means that the mmap space is
per file description, and not per underlying inode or real device or
whatever. That part sounds like a CRIU problem, and I expect CRIU folks
want a consistent solution across the board for this. Hence please
grab an
ack from them.

Unfortunately it's a KFD design problem. AMD used a single device
node, then mmaped different objects from the same offset to different
processes and expected it to work with the rest of the fs subsystem
without churn.

This may be true for mmaps in the KFD device, but not for mmaps in the
DRM render nodes.



So yes, this is indeed because the mmap space is per file descriptor
for the use case here.

No. This is a different problem.

The problem has to do with the way that DRM manages mmap permissions. In
order to be able to mmap an offset in the render node, there needs to be
a BO that was created in the same render node. If you fork a process, it
inherits the VMA. But KFD doesn't know anything about the inherited BOs
from the parent process. Therefore those BOs don't get checkpointed and
restored in the child process. When the CRIU checkpoint is restored, our
CRIU plugin never creates a BO corresponding to the VMA in the child
process' render node FD. We've also lost the relationship between the
parent and child-process' render node FDs. After "fork" the render node
FD points to the same struct file in parent and child. After restoring
the CRIU checkpoint, they are separate struct files, created by separate
"open" system calls. Therefore the mmap call that restores the VMA fails
in the child process.

At least for KFD, there is no point inheriting BOs from a child process,
because the GPU has no way of accessing the BOs in the child process.
The child process has no GPU address space, no user mode queues, no way
to do anything with the GPU before it completely reinitializes its KFD
context.

We can workaround this issue in user mode with madvise(...,
MADV_DONTFORK). In fact we've already done this for some BOs to avoid a
memory leak in the parent process while a child process exists. But it's
slightly racy because there is a short time window where VMA exists
without the VM_DONTCOPY flag. A fork during that time window could still
create a child process with an inherited VMA.

Therefore a safer solution is to set the vm_flags in the VMA in the
driver when the VMA is first created.
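Concretely, that would look something like the following in the render
node's mmap path (a minimal sketch, not the actual KFD/TTM code):

static int render_node_mmap_sketch(struct file *filp,
				   struct vm_area_struct *vma)
{
	/*
	 * Mark the mapping as not copied on fork(): the effect of
	 * madvise(MADV_DONTFORK), but applied when the VMA is created
	 * so no fork() can race with the flag being set.
	 */
	vma->vm_flags |= VM_DONTCOPY;

	/* ... normal BO mapping setup would continue here ... */
	return 0;
}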

Regards,
   Felix



And thanks for pointing this out, this indeed makes the whole change
extremely questionable.

Regards,
Christian.


Cheers, Daniel



Re: [v2 1/3] dt-bindings: msm/dsi: Add 10nm dsi phy tuning properties

2022-01-10 Thread Rob Herring
On Mon, Jan 10, 2022 at 05:06:03PM +0300, Dmitry Baryshkov wrote:
> On Mon, 10 Jan 2022 at 15:56, Rajeev Nandan  wrote:
> >
> > In most cases, the default values of DSI PHY tuning registers should be
> > sufficient as they are fully optimized. However, in some cases where
> > extreme board parasitics cause the eye shape to degrade, the override
> > bits can be used to improve the signal quality.
> >
> > The general guidelines for DSI PHY tuning include:
> > - High and moderate data rates may benefit from the drive strength and
> >   drive level tuning.
> > - Drive strength tuning will affect the output impedance and may be used
> >   for matching optimization.
> > - Drive level tuning will affect the output levels without affecting the
> >   impedance.
> >
> > The clock and data lanes have a calibration circuitry feature. The drive
> > strength tuning can be done by adjusting rescode offset for hstop/hsbot,
> > and the drive level tuning can be done by adjusting the LDO output level
> > for the HSTX drive.
> >
> > Signed-off-by: Rajeev Nandan 
> > ---
> >
> > Changes in v2:
> >  - More details in the commit text (Stephen Boyd)
> >  - Use human understandable values (Stephen Boyd, Dmitry Baryshkov)
> >  - Do not take values that are going to be unused (Dmitry Baryshkov)
> >
> >  .../bindings/display/msm/dsi-phy-10nm.yaml | 33 
> > ++
> >  1 file changed, 33 insertions(+)
> >
> > diff --git 
> > a/Documentation/devicetree/bindings/display/msm/dsi-phy-10nm.yaml 
> > b/Documentation/devicetree/bindings/display/msm/dsi-phy-10nm.yaml
> > index 4399715..d0eb8f6 100644
> > --- a/Documentation/devicetree/bindings/display/msm/dsi-phy-10nm.yaml
> > +++ b/Documentation/devicetree/bindings/display/msm/dsi-phy-10nm.yaml
> > @@ -35,6 +35,35 @@ properties:
> >Connected to DSI0_MIPI_DSI_PLL_VDDA0P9 pin for sc7180 target and
> >connected to VDDA_MIPI_DSI_0_PLL_0P9 pin for sdm845 target
> 
> Generic note:
> I think these properties should be prefixed with "qcom," prefix.
> 
> >
> > +  phy-rescode-offset-top:
> > +$ref: /schemas/types.yaml#/definitions/uint8-array
> > +minItems: 5
> > +maxItems: 5
> > +description:
> > +  Integer array of offset for pull-up legs rescode for all five lanes.
> > +  To offset the drive strength from the calibrated value in an 
> > increasing
> > +  or decreasing manner, use 6 bit two’s complement values.
> 
> dtc should support negative values, google hints that <(-2)> should work.

Yes, but the schema checks don't check negative values correctly yet. So 
you can use 'int8-array', but just don't use negative values in the 
examples. I'm working on changes that will fix this issue.

What does 6-bit mean? 0x3f is negative? Just sign extend the values and 
specify the valid range instead:

minimum: -32
maximum: 31

Rob
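For clarity: in 6-bit two's complement, 0x3f encodes -1 and 0x20
encodes -32. A driver consuming the raw field would sign-extend it,
for example with the kernel's sign_extend32() helper (illustrative
snippet):

#include <linux/bitops.h>

/* raw holds the 6-bit two's complement field (0x00..0x3f) */
static inline int rescode_offset_sketch(u8 raw)
{
	return sign_extend32(raw, 5);	/* 0x3f -> -1, 0x20 -> -32, 0x1f -> 31 */
}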


[PATCH v2] drm: bridge: nwl-dsi: Drop panel_bridge from nwl_dsi

2022-01-10 Thread Jagan Teki
The panel_bridge pointer is never used anywhere except where it is
looked up in nwl_dsi_bridge_attach().

Drop it from the nwl_dsi structure.

Reviewed-by: Guido Günther 
Signed-off-by: Jagan Teki 
---
Changes for v2:
- collect Guido r-b

Note: This patch is part of the devm_drm_of_get_bridge switch series;
however, the child node support of devm_drm_of_get_bridge is still
under review, so this patch is sent separately as it is not related to
that API switch.
https://patchwork.kernel.org/project/dri-devel/patch/20211210174819.2250178-1-ja...@amarulasolutions.com/

 drivers/gpu/drm/bridge/nwl-dsi.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/bridge/nwl-dsi.c b/drivers/gpu/drm/bridge/nwl-dsi.c
index fc3ad9fab867..9282e61dfbf0 100644
--- a/drivers/gpu/drm/bridge/nwl-dsi.c
+++ b/drivers/gpu/drm/bridge/nwl-dsi.c
@@ -65,7 +65,6 @@ struct nwl_dsi_transfer {
 struct nwl_dsi {
struct drm_bridge bridge;
struct mipi_dsi_host dsi_host;
-   struct drm_bridge *panel_bridge;
struct device *dev;
struct phy *phy;
union phy_configure_opts phy_cfg;
@@ -924,13 +923,11 @@ static int nwl_dsi_bridge_attach(struct drm_bridge 
*bridge,
if (IS_ERR(panel_bridge))
return PTR_ERR(panel_bridge);
}
-   dsi->panel_bridge = panel_bridge;
 
-   if (!dsi->panel_bridge)
+   if (!panel_bridge)
return -EPROBE_DEFER;
 
-   return drm_bridge_attach(bridge->encoder, dsi->panel_bridge, bridge,
-flags);
+   return drm_bridge_attach(bridge->encoder, panel_bridge, bridge, flags);
 }
 
 static void nwl_dsi_bridge_detach(struct drm_bridge *bridge)
-- 
2.25.1



[PATCH v7 6/6] drm/i915: Use struct vma_resource instead of struct vma_snapshot

2022-01-10 Thread Thomas Hellström
There is always a struct vma_resource guaranteed to be alive when we
access a corresponding struct vma_snapshot.

So ditch the latter and instead of allocating vma_snapshots, reference
the already existing vma_resource.

This requires a couple of extra members in struct vma_resource but that's
a small price to pay for the simplification.

v2:
- Fix a missing include and declaration (kernel test robot )

Signed-off-by: Thomas Hellström 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/i915/Makefile |   1 -
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c|  15 +--
 drivers/gpu/drm/i915/gt/intel_engine_cs.c |   9 +-
 drivers/gpu/drm/i915/i915_gpu_error.c |  87 ++--
 drivers/gpu/drm/i915/i915_request.c   |  12 +-
 drivers/gpu/drm/i915/i915_request.h   |   6 +-
 drivers/gpu/drm/i915/i915_vma.c   |  16 +--
 drivers/gpu/drm/i915/i915_vma_resource.c  |   4 +
 drivers/gpu/drm/i915/i915_vma_resource.h  |  28 +++-
 drivers/gpu/drm/i915/i915_vma_snapshot.c  | 125 --
 drivers/gpu/drm/i915/i915_vma_snapshot.h  | 101 --
 11 files changed, 90 insertions(+), 314 deletions(-)
 delete mode 100644 drivers/gpu/drm/i915/i915_vma_snapshot.c
 delete mode 100644 drivers/gpu/drm/i915/i915_vma_snapshot.h

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 98433ad74194..aa86ac33effc 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -175,7 +175,6 @@ i915-y += \
  i915_ttm_buddy_manager.o \
  i915_vma.o \
  i915_vma_resource.o \
- i915_vma_snapshot.o \
  intel_wopcm.o
 
 # general-purpose microcontroller (GuC) support
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 3e359de0e460..cf283b5f6ffe 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -31,7 +31,6 @@
 #include "i915_gem_ioctls.h"
 #include "i915_trace.h"
 #include "i915_user_extensions.h"
-#include "i915_vma_snapshot.h"
 
 struct eb_vma {
struct i915_vma *vma;
@@ -1954,7 +1953,6 @@ static void eb_capture_stage(struct i915_execbuffer *eb)
 {
const unsigned int count = eb->buffer_count;
unsigned int i = count, j;
-   struct i915_vma_snapshot *vsnap;
 
while (i--) {
struct eb_vma *ev = &eb->vma[i];
@@ -1964,11 +1962,6 @@ static void eb_capture_stage(struct i915_execbuffer *eb)
if (!(flags & EXEC_OBJECT_CAPTURE))
continue;
 
-   vsnap = i915_vma_snapshot_alloc(GFP_KERNEL);
-   if (!vsnap)
-   continue;
-
-   i915_vma_snapshot_init(vsnap, vma, "user");
for_each_batch_create_order(eb, j) {
struct i915_capture_list *capture;
 
@@ -1977,10 +1970,9 @@ static void eb_capture_stage(struct i915_execbuffer *eb)
continue;
 
capture->next = eb->capture_lists[j];
-   capture->vma_snapshot = i915_vma_snapshot_get(vsnap);
+   capture->vma_res = i915_vma_resource_get(vma->resource);
eb->capture_lists[j] = capture;
}
-   i915_vma_snapshot_put(vsnap);
}
 }
 
@@ -3283,9 +3275,8 @@ eb_requests_create(struct i915_execbuffer *eb, struct 
dma_fence *in_fence,
 * _onstack interface.
 */
if (eb->batches[i]->vma)
-   
i915_vma_snapshot_init_onstack(&eb->requests[i]->batch_snapshot,
-  eb->batches[i]->vma,
-  "batch");
+   eb->requests[i]->batch_res =
+   i915_vma_resource_get(eb->batches[i]->vma->resource);
if (eb->batch_pool) {
GEM_BUG_ON(intel_context_is_parallel(eb->context));
intel_gt_buffer_pool_mark_active(eb->batch_pool,
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c 
b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 7b793a295475..baa346a49976 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1708,18 +1708,15 @@ static void intel_engine_print_registers(struct 
intel_engine_cs *engine,
 
 static void print_request_ring(struct drm_printer *m, struct i915_request *rq)
 {
-   struct i915_vma_snapshot *vsnap = &rq->batch_snapshot;
+   struct i915_vma_resource *vma_res = rq->batch_res;
void *ring;
int size;
 
-   if (!i915_vma_snapshot_present(vsnap))
-   vsnap = NULL;
-
drm_printf(m,
   "[head %04x, postfix %04x, tail %04x, batch 0x%08x_%08x]:\n",
   rq->head, rq->postfix, rq->tail,
-  vsnap ? up

[PATCH v7 5/6] drm/i915: Asynchronous migration selftest

2022-01-10 Thread Thomas Hellström
Add a selftest to exercise asynchronous migration and unbinding.
Extend the gem_migrate selftest to perform the migrations while
depending on a spinner and a bound vma set up on the migrated
buffer object.

Signed-off-by: Thomas Hellström 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/i915/gem/i915_gem_object.c|  12 ++
 drivers/gpu/drm/i915/gem/i915_gem_object.h|   3 +
 .../drm/i915/gem/selftests/i915_gem_migrate.c | 192 --
 3 files changed, 192 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c 
b/drivers/gpu/drm/i915/gem/i915_gem_object.c
index d87b508b59b1..1a9e1f940a7d 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c
@@ -756,6 +756,18 @@ i915_gem_object_get_moving_fence(struct 
drm_i915_gem_object *obj)
return dma_fence_get(i915_gem_to_ttm(obj)->moving);
 }
 
+void i915_gem_object_set_moving_fence(struct drm_i915_gem_object *obj,
+ struct dma_fence *fence)
+{
+   struct dma_fence **moving = &i915_gem_to_ttm(obj)->moving;
+
+   if (*moving == fence)
+   return;
+
+   dma_fence_put(*moving);
+   *moving = dma_fence_get(fence);
+}
+
 /**
  * i915_gem_object_wait_moving_fence - Wait for the object's moving fence if 
any
  * @obj: The object whose moving fence to wait for.
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.h 
b/drivers/gpu/drm/i915/gem/i915_gem_object.h
index bc448f895ae8..02c37fe4a535 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.h
@@ -523,6 +523,9 @@ i915_gem_object_finish_access(struct drm_i915_gem_object 
*obj)
 struct dma_fence *
 i915_gem_object_get_moving_fence(struct drm_i915_gem_object *obj);
 
+void i915_gem_object_set_moving_fence(struct drm_i915_gem_object *obj,
+ struct dma_fence *fence);
+
 int i915_gem_object_wait_moving_fence(struct drm_i915_gem_object *obj,
  bool intr);
 
diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_migrate.c 
b/drivers/gpu/drm/i915/gem/selftests/i915_gem_migrate.c
index ecb691c81d1e..d534141b2cf7 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_migrate.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_migrate.c
@@ -4,8 +4,13 @@
  */
 
 #include "gt/intel_migrate.h"
+#include "gt/intel_gpu_commands.h"
 #include "gem/i915_gem_ttm_move.h"
 
+#include "i915_deps.h"
+
+#include "selftests/igt_spinner.h"
+
 static int igt_fill_check_buffer(struct drm_i915_gem_object *obj,
 bool fill)
 {
@@ -101,7 +106,8 @@ static int igt_same_create_migrate(void *arg)
 }
 
 static int lmem_pages_migrate_one(struct i915_gem_ww_ctx *ww,
- struct drm_i915_gem_object *obj)
+ struct drm_i915_gem_object *obj,
+ struct i915_vma *vma)
 {
int err;
 
@@ -109,6 +115,24 @@ static int lmem_pages_migrate_one(struct i915_gem_ww_ctx 
*ww,
if (err)
return err;
 
+   if (vma) {
+   err = i915_vma_pin_ww(vma, ww, obj->base.size, 0,
+ 0UL | PIN_OFFSET_FIXED |
+ PIN_USER);
+   if (err) {
+   if (err != -EINTR && err != -ERESTARTSYS &&
+   err != -EDEADLK)
+   pr_err("Failed to pin vma.\n");
+   return err;
+   }
+
+   i915_vma_unpin(vma);
+   }
+
+   /*
+* Migration will implicitly unbind (asynchronously) any bound
+* vmas.
+*/
if (i915_gem_object_is_lmem(obj)) {
err = i915_gem_object_migrate(obj, ww, INTEL_REGION_SMEM);
if (err) {
@@ -149,11 +173,15 @@ static int lmem_pages_migrate_one(struct i915_gem_ww_ctx 
*ww,
return err;
 }
 
-static int igt_lmem_pages_migrate(void *arg)
+static int __igt_lmem_pages_migrate(struct intel_gt *gt,
+   struct i915_address_space *vm,
+   struct i915_deps *deps,
+   struct igt_spinner *spin,
+   struct dma_fence *spin_fence)
 {
-   struct intel_gt *gt = arg;
struct drm_i915_private *i915 = gt->i915;
struct drm_i915_gem_object *obj;
+   struct i915_vma *vma = NULL;
struct i915_gem_ww_ctx ww;
struct i915_request *rq;
int err;
@@ -165,6 +193,14 @@ static int igt_lmem_pages_migrate(void *arg)
if (IS_ERR(obj))
return PTR_ERR(obj);
 
+   if (vm) {
+   vma = i915_vma_instance(obj, vm, NULL);
+   if (IS_ERR(vma)) {
+   err = PTR_ERR(vma);
+   goto out_put;
+   }
+   }
+
/* Initial GPU fill, sync, CPU i

[PATCH v7 4/6] drm/i915: Use vma resources for async unbinding

2022-01-10 Thread Thomas Hellström
Implement async (non-blocking) unbinding by not syncing the vma before
calling unbind on the vma_resource.
Add the resulting unbind fence to the object's dma_resv from where it is
picked up by the ttm migration code.
Ideally these unbind fences should be coalesced with the migration blit
fence to avoid stalling the migration blit waiting for unbind, as they
can certainly go on in parallel, but since we don't yet have a
reasonable data structure to use to coalesce fences and attach the
resulting fence to a timeline, we defer that for now.

Note that with async unbinding, even while the unbind waits for the
preceding bind to complete before unbinding, the vma itself might have been
destroyed in the process, clearing the vma pages. Therefore we can
only allow async unbinding if we have a refcounted sg-list and keep a
refcount on that for the vma resource pages to stay intact until
binding occurs. If this condition is not met, a request for an async
unbind is diverted to a sync unbind.
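
For illustration, the sync fallback described above might look roughly
like the sketch below; the names used (obj->mm.rsgt, resource->bi.pages)
are assumptions for the example's sake, not a quote of the patch:

	/*
	 * Async unbind is only safe when the backing store is refcounted
	 * and still the same store the resource was bound with; otherwise
	 * divert to a synchronous unbind.
	 */
	bool can_async = vma->obj->mm.rsgt &&
			 vma->resource->bi.pages == vma->obj->mm.pages;

	if (!can_async)
		return __i915_vma_unbind(vma);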

v2:
- Use a separate kmem_cache for vma resources for now to isolate their
  memory allocation and aid debugging.
- Move the check for vm closed to the actual unbinding thread. Regardless
  of whether the vm is closed, we need the unbind fence to properly wait
  for capture.
- Clear vma_res::vm on unbind and update its documentation.
v4:
- Take cache coloring into account when searching for vma resources
  pending unbind. (Matthew Auld)
v5:
- Fix timeout and error check in i915_vma_resource_bind_dep_await().
- Avoid taking a reference on the object for async binding if
  async unbind capable.
- Fix braces around a single-line if statement.
v6:
- Fix up the cache coloring adjustment. (Kernel test robot )
- Don't allow async unbinding if the vma_res pages are not the same as
  the object pages. (Matthew Auld)
v7:
- s/unsigned long/u64/ in a number of places (Matthew Auld)

Signed-off-by: Thomas Hellström 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c |  11 +-
 drivers/gpu/drm/i915/gt/intel_ggtt.c |   2 +-
 drivers/gpu/drm/i915/gt/intel_gtt.c  |   4 +
 drivers/gpu/drm/i915/gt/intel_gtt.h  |   3 +
 drivers/gpu/drm/i915/i915_drv.h  |   1 +
 drivers/gpu/drm/i915/i915_gem.c  |  12 +-
 drivers/gpu/drm/i915/i915_module.c   |   3 +
 drivers/gpu/drm/i915/i915_vma.c  | 205 +--
 drivers/gpu/drm/i915/i915_vma.h  |   3 +-
 drivers/gpu/drm/i915/i915_vma_resource.c | 354 +--
 drivers/gpu/drm/i915/i915_vma_resource.h |  48 +++
 11 files changed, 579 insertions(+), 67 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c 
b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
index 8653855d808b..1de306c03aaf 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
@@ -142,7 +142,16 @@ int i915_ttm_move_notify(struct ttm_buffer_object *bo)
struct drm_i915_gem_object *obj = i915_ttm_to_gem(bo);
int ret;
 
-   ret = i915_gem_object_unbind(obj, I915_GEM_OBJECT_UNBIND_ACTIVE);
+   /*
+* Note: The async unbinding here will actually transform the
+* blocking wait for unbind into a wait before finally submitting
+* evict / migration blit and thus stall the migration timeline
+* which may not be good for overall throughput. We should make
+* sure we await the unbind fences *after* the migration blit
+* instead of *before* as we currently do.
+*/
+   ret = i915_gem_object_unbind(obj, I915_GEM_OBJECT_UNBIND_ACTIVE |
+I915_GEM_OBJECT_UNBIND_ASYNC);
if (ret)
return ret;
 
diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c 
b/drivers/gpu/drm/i915/gt/intel_ggtt.c
index e49b6250c4b7..a1b2761bc16e 100644
--- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
@@ -142,7 +142,7 @@ void i915_ggtt_suspend_vm(struct i915_address_space *vm)
continue;
 
if (!i915_vma_is_bound(vma, I915_VMA_GLOBAL_BIND)) {
-   __i915_vma_evict(vma);
+   __i915_vma_evict(vma, false);
drm_mm_remove_node(&vma->node);
}
}
diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c 
b/drivers/gpu/drm/i915/gt/intel_gtt.c
index a94be0306464..46be4197b93f 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.c
@@ -161,6 +161,9 @@ static void __i915_vm_release(struct work_struct *work)
struct i915_address_space *vm =
container_of(work, struct i915_address_space, release_work);
 
+   /* Synchronize async unbinds. */
+   i915_vma_resource_bind_dep_sync_all(vm);
+
vm->cleanup(vm);
i915_address_space_fini(vm);
 
@@ -189,6 +192,7 @@ void i915_address_space_init(struct i915_address_space *vm, 
int subclass)
if (!kref_read(&vm->

[PATCH v7 3/6] drm/i915: Don't pin the object pages during pending vma binds

2022-01-10 Thread Thomas Hellström
A pin-count is already held by vma->pages so taking an additional pin
during async binds is not necessary.

When we introduce async unbinding we have other means of keeping the
object pages alive.

Signed-off-by: Thomas Hellström 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/i915/i915_vma.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 42fff9ddf096..29c770a764aa 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -306,10 +306,8 @@ static void __vma_release(struct dma_fence_work *work)
 {
struct i915_vma_work *vw = container_of(work, typeof(*vw), base);
 
-   if (vw->pinned) {
-   __i915_gem_object_unpin_pages(vw->pinned);
+   if (vw->pinned)
i915_gem_object_put(vw->pinned);
-   }
 
i915_vm_free_pt_stash(vw->vm, &vw->stash);
i915_vm_put(vw->vm);
@@ -478,7 +476,6 @@ int i915_vma_bind(struct i915_vma *vma,
 
work->base.dma.error = 0; /* enable the queue_work() */
 
-   __i915_gem_object_pin_pages(vma->obj);
work->pinned = i915_gem_object_get(vma->obj);
} else {
if (vma->obj) {
-- 
2.31.1



[PATCH v7 2/6] drm/i915: Use the vma resource as argument for gtt binding / unbinding

2022-01-10 Thread Thomas Hellström
When introducing asynchronous unbinding, the vma itself may no longer
be alive when the actual binding or unbinding takes place.

Update the gtt i915_vma_ops accordingly to take a struct i915_vma_resource
instead of a struct i915_vma for the bind_vma() and unbind_vma() ops.
Similarly change the insert_entries() op for struct i915_address_space.

Replace a couple of i915_vma_snapshot members with their newly introduced
i915_vma_resource counterparts, since they have the same lifetime.

Also make sure to avoid changing the struct i915_vma_flags (in particular
the bind flags) async. That should now only be done sync under the
vm mutex.

v2:
- Update the vma_res::bound_flags when binding to the aliased ggtt
v6:
- Remove I915_VMA_ALLOC_BIT (Matthew Auld)
- Change some members of struct i915_vma_resource from unsigned long to u64
  (Matthew Auld)
v7:
- Fix vma resource size parameters to be u64 rather than unsigned long
  (Matthew Auld)

Signed-off-by: Thomas Hellström 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/i915/display/intel_dpt.c  | 27 ++---
 .../gpu/drm/i915/gem/i915_gem_object_types.h  | 27 +
 .../gpu/drm/i915/gem/selftests/huge_pages.c   | 37 +++
 drivers/gpu/drm/i915/gt/gen6_ppgtt.c  | 19 ++--
 drivers/gpu/drm/i915/gt/gen8_ppgtt.c  | 37 +++
 drivers/gpu/drm/i915/gt/intel_engine_cs.c |  4 +-
 drivers/gpu/drm/i915/gt/intel_ggtt.c  | 70 ++---
 drivers/gpu/drm/i915/gt/intel_gtt.h   | 16 +--
 drivers/gpu/drm/i915/gt/intel_ppgtt.c | 22 +++--
 drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c  | 13 ++-
 drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h  |  2 +-
 drivers/gpu/drm/i915/i915_debugfs.c   |  3 +-
 drivers/gpu/drm/i915/i915_gpu_error.c |  6 +-
 drivers/gpu/drm/i915/i915_vma.c   | 24 -
 drivers/gpu/drm/i915/i915_vma.h   | 11 +--
 drivers/gpu/drm/i915/i915_vma_resource.c  |  9 +-
 drivers/gpu/drm/i915/i915_vma_resource.h  | 99 ++-
 drivers/gpu/drm/i915/i915_vma_snapshot.c  |  4 -
 drivers/gpu/drm/i915/i915_vma_snapshot.h  |  8 --
 drivers/gpu/drm/i915/i915_vma_types.h | 14 ++-
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 64 
 drivers/gpu/drm/i915/selftests/mock_gtt.c | 12 +--
 22 files changed, 314 insertions(+), 214 deletions(-)

diff --git a/drivers/gpu/drm/i915/display/intel_dpt.c 
b/drivers/gpu/drm/i915/display/intel_dpt.c
index 8f674745e7e0..63a83d5f85a1 100644
--- a/drivers/gpu/drm/i915/display/intel_dpt.c
+++ b/drivers/gpu/drm/i915/display/intel_dpt.c
@@ -48,7 +48,7 @@ static void dpt_insert_page(struct i915_address_space *vm,
 }
 
 static void dpt_insert_entries(struct i915_address_space *vm,
-  struct i915_vma *vma,
+  struct i915_vma_resource *vma_res,
   enum i915_cache_level level,
   u32 flags)
 {
@@ -64,8 +64,8 @@ static void dpt_insert_entries(struct i915_address_space *vm,
 * not to allow the user to override access to a read only page.
 */
 
-   i = vma->node.start / I915_GTT_PAGE_SIZE;
-   for_each_sgt_daddr(addr, sgt_iter, vma->pages)
+   i = vma_res->start / I915_GTT_PAGE_SIZE;
+   for_each_sgt_daddr(addr, sgt_iter, vma_res->bi.pages)
gen8_set_pte(&base[i++], pte_encode | addr);
 }
 
@@ -76,35 +76,38 @@ static void dpt_clear_range(struct i915_address_space *vm,
 
 static void dpt_bind_vma(struct i915_address_space *vm,
 struct i915_vm_pt_stash *stash,
-struct i915_vma *vma,
+struct i915_vma_resource *vma_res,
 enum i915_cache_level cache_level,
 u32 flags)
 {
-   struct drm_i915_gem_object *obj = vma->obj;
u32 pte_flags;
 
+   if (vma_res->bound_flags)
+   return;
+
/* Applicable to VLV (gen8+ do not support RO in the GGTT) */
pte_flags = 0;
-   if (vma->vm->has_read_only && i915_gem_object_is_readonly(obj))
+   if (vm->has_read_only && vma_res->bi.readonly)
pte_flags |= PTE_READ_ONLY;
-   if (i915_gem_object_is_lmem(obj))
+   if (vma_res->bi.lmem)
pte_flags |= PTE_LM;
 
-   vma->vm->insert_entries(vma->vm, vma, cache_level, pte_flags);
+   vm->insert_entries(vm, vma_res, cache_level, pte_flags);
 
-   vma->page_sizes.gtt = I915_GTT_PAGE_SIZE;
+   vma_res->page_sizes_gtt = I915_GTT_PAGE_SIZE;
 
/*
 * Without aliasing PPGTT there's no difference between
 * GLOBAL/LOCAL_BIND, it's all the same ptes. Hence unconditionally
 * upgrade to both bound if we bind either to avoid double-binding.
 */
-   atomic_or(I915_VMA_GLOBAL_BIND | I915_VMA_LOCAL_BIND, &vma->flags);
+   vma_res->bound_flags = I915_VMA_GLOBAL_BIND | I915_VMA_LOCAL_BIND;
 }
 
-static void dpt_unbind_vma(struct i915_address_space *

[PATCH v7 1/6] drm/i915: Initial introduction of vma resources

2022-01-10 Thread Thomas Hellström
Introduce vma resources, sort of similar to TTM resources, needed for
asynchronous bind management. Initially we will use them to hold
completion of unbinding when we capture data from a vma, but they will
be used extensively in upcoming patches for asynchronous vma unbinding.
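
As a rough sketch of the concept (member names are illustrative only,
not the actual definition from the patch):

	struct i915_vma_resource {
		struct dma_fence unbind_fence;	/* signals unbind completion */
		refcount_t hold_count;		/* keeps the resource alive */
		/* ... a snapshot of the binding, for error capture ... */
	};

A vma can then be destroyed while error capture or a pending unbind
still holds a reference to its resource.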

v6:
- Some documentation updates

Signed-off-by: Thomas Hellström 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/i915/Makefile |   1 +
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c|   2 +-
 drivers/gpu/drm/i915/i915_vma.c   |  55 +++-
 drivers/gpu/drm/i915/i915_vma.h   |  19 ++-
 drivers/gpu/drm/i915/i915_vma_resource.c  | 124 ++
 drivers/gpu/drm/i915/i915_vma_resource.h  |  69 ++
 drivers/gpu/drm/i915/i915_vma_snapshot.c  |  15 +--
 drivers/gpu/drm/i915/i915_vma_snapshot.h  |   7 +-
 drivers/gpu/drm/i915/i915_vma_types.h |   5 +
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c |  99 --
 10 files changed, 333 insertions(+), 63 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/i915_vma_resource.c
 create mode 100644 drivers/gpu/drm/i915/i915_vma_resource.h

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 1b62b9f65196..98433ad74194 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -174,6 +174,7 @@ i915-y += \
  i915_trace_points.o \
  i915_ttm_buddy_manager.o \
  i915_vma.o \
+ i915_vma_resource.o \
  i915_vma_snapshot.o \
  intel_wopcm.o
 
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 9e221ce42707..3e359de0e460 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -1424,7 +1424,7 @@ eb_relocate_entry(struct i915_execbuffer *eb,
mutex_lock(&vma->vm->mutex);
err = i915_vma_bind(target->vma,
target->vma->obj->cache_level,
-   PIN_GLOBAL, NULL);
+   PIN_GLOBAL, NULL, NULL);
mutex_unlock(&vma->vm->mutex);
reloc_cache_remap(&eb->reloc_cache, ev->vma->obj);
if (err)
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 81a611b7d36f..05dcbc259b82 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -38,6 +38,7 @@
 #include "i915_sw_fence_work.h"
 #include "i915_trace.h"
 #include "i915_vma.h"
+#include "i915_vma_resource.h"
 
 static struct kmem_cache *slab_vmas;
 
@@ -381,6 +382,8 @@ static int i915_vma_verify_bind_complete(struct i915_vma 
*vma)
  * @cache_level: mapping cache level
  * @flags: flags like global or local mapping
  * @work: preallocated worker for allocating and binding the PTE
+ * @vma_res: pointer to a preallocated vma resource. The resource is either
+ * consumed or freed.
  *
  * DMA addresses are taken from the scatter-gather table of this object (or of
  * this VMA in case of non-default GGTT views) and PTE entries set up.
@@ -389,7 +392,8 @@ static int i915_vma_verify_bind_complete(struct i915_vma 
*vma)
 int i915_vma_bind(struct i915_vma *vma,
  enum i915_cache_level cache_level,
  u32 flags,
- struct i915_vma_work *work)
+ struct i915_vma_work *work,
+ struct i915_vma_resource *vma_res)
 {
u32 bind_flags;
u32 vma_flags;
@@ -400,11 +404,15 @@ int i915_vma_bind(struct i915_vma *vma,
 
if (GEM_DEBUG_WARN_ON(range_overflows(vma->node.start,
  vma->node.size,
- vma->vm->total)))
+ vma->vm->total))) {
+   kfree(vma_res);
return -ENODEV;
+   }
 
-   if (GEM_DEBUG_WARN_ON(!flags))
+   if (GEM_DEBUG_WARN_ON(!flags)) {
+   kfree(vma_res);
return -EINVAL;
+   }
 
bind_flags = flags;
bind_flags &= I915_VMA_GLOBAL_BIND | I915_VMA_LOCAL_BIND;
@@ -413,11 +421,21 @@ int i915_vma_bind(struct i915_vma *vma,
vma_flags &= I915_VMA_GLOBAL_BIND | I915_VMA_LOCAL_BIND;
 
bind_flags &= ~vma_flags;
-   if (bind_flags == 0)
+   if (bind_flags == 0) {
+   kfree(vma_res);
return 0;
+   }
 
GEM_BUG_ON(!atomic_read(&vma->pages_count));
 
+   if (vma->resource || !vma_res) {
+   /* Rebinding with an additional I915_VMA_*_BIND */
+   GEM_WARN_ON(!vma_flags);
+   kfree(vma_res);
+   } else {
+   i915_vma_resource_init(vma_res);
+   vma->resource = vma_res;
+   }
trace_i915_vma_bind(vma, bind_flags);
if (work && bind_flags & vma->v

[PATCH v7 0/6] drm/i915: Asynchronous vma unbinding

2022-01-10 Thread Thomas Hellström


This patch series introduces infrastructure for asynchronous vma
unbinding. The single use-case enabled initially is buffer object
migration, where we otherwise sync when unbinding vmas before migration.
This in theory allows us to pipeline any number of migrations, but in
practice the number is restricted by a sync wait when filling the
migration context ring. We might want to look at that moving forward if
needed.

The other main use-case is to be able to pipeline vma evictions, for
example with softpinning where a new vma wants to reuse the vm range
of an already active vma. We can't support this just yet because we
need dma_resv locking around vma eviction for that, which is under
implementation.

Patch 1 introduces vma resource first for error capture purposes
Patch 2 changes the vm backend interface to take vma resources rather than vmas
Patch 3 removes an unneeded page pinning
Patch 4 introduces the async unbinding itself, and finally
Patch 5 introduces a selftest
Patch 6 realizes we have duplicated functionality and removes the vma snapshots

v2:
-- Some kernel test robot reports addressed.
-- kmem cache for vma resources, See patch 4
-- Various fixes all over the place. See separate commit messages.
v3:
-- Re-add a missing i915_vma_resource_put()
-- Remove a stray debug printout
v4:
-- Patch series split in two. This is the second part.
-- Take cache coloring into account when searching for vma_resources
   pending unbind. (Matthew Auld)
v5:
-- Add a selftest.
-- Remove page pinning while sync binding.
-- A couple of fixes in i915_vma_resource_bind_dep_await()
v6:
-- Some documentation updates
-- Remove I915_VMA_ALLOC_BIT (Matthew Auld)
-- Change some members of struct i915_vma_resource from unsigned long to u64
   (Matthew Auld)
-- Fix up the cache coloring adjustment. (Kernel test robot )
-- Don't allow async unbinding if the vma_res pages are not the same as
   the object pages. (Matthew Auld)
v7:
-- More s/unsigned long/u64/ changes (Matthew Auld)

Thomas Hellström (6):
  drm/i915: Initial introduction of vma resources
  drm/i915: Use the vma resource as argument for gtt binding / unbinding
  drm/i915: Don't pin the object pages during pending vma binds
  drm/i915: Use vma resources for async unbinding
  drm/i915: Asynchronous migration selftest
  drm/i915: Use struct vma_resource instead of struct vma_snapshot

 drivers/gpu/drm/i915/Makefile |   2 +-
 drivers/gpu/drm/i915/display/intel_dpt.c  |  27 +-
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c|  17 +-
 drivers/gpu/drm/i915/gem/i915_gem_object.c|  12 +
 drivers/gpu/drm/i915/gem/i915_gem_object.h|   3 +
 .../gpu/drm/i915/gem/i915_gem_object_types.h  |  27 +-
 drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c  |  11 +-
 .../gpu/drm/i915/gem/selftests/huge_pages.c   |  37 +-
 .../drm/i915/gem/selftests/i915_gem_migrate.c | 192 +++-
 drivers/gpu/drm/i915/gt/gen6_ppgtt.c  |  19 +-
 drivers/gpu/drm/i915/gt/gen8_ppgtt.c  |  37 +-
 drivers/gpu/drm/i915/gt/intel_engine_cs.c |   9 +-
 drivers/gpu/drm/i915/gt/intel_ggtt.c  |  72 +--
 drivers/gpu/drm/i915/gt/intel_gtt.c   |   4 +
 drivers/gpu/drm/i915/gt/intel_gtt.h   |  19 +-
 drivers/gpu/drm/i915/gt/intel_ppgtt.c |  22 +-
 drivers/gpu/drm/i915/gt/uc/intel_uc_fw.c  |  13 +-
 drivers/gpu/drm/i915/gt/uc/intel_uc_fw.h  |   2 +-
 drivers/gpu/drm/i915/i915_debugfs.c   |   3 +-
 drivers/gpu/drm/i915/i915_drv.h   |   1 +
 drivers/gpu/drm/i915/i915_gem.c   |  12 +-
 drivers/gpu/drm/i915/i915_gpu_error.c |  87 ++--
 drivers/gpu/drm/i915/i915_module.c|   3 +
 drivers/gpu/drm/i915/i915_request.c   |  12 +-
 drivers/gpu/drm/i915/i915_request.h   |   6 +-
 drivers/gpu/drm/i915/i915_vma.c   | 241 +-
 drivers/gpu/drm/i915/i915_vma.h   |  33 +-
 drivers/gpu/drm/i915/i915_vma_resource.c  | 417 ++
 drivers/gpu/drm/i915/i915_vma_resource.h  | 234 ++
 drivers/gpu/drm/i915/i915_vma_snapshot.c  | 134 --
 drivers/gpu/drm/i915/i915_vma_snapshot.h  | 112 -
 drivers/gpu/drm/i915/i915_vma_types.h |  19 +-
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 159 ---
 drivers/gpu/drm/i915/selftests/mock_gtt.c |  12 +-
 34 files changed, 1421 insertions(+), 589 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/i915_vma_resource.c
 create mode 100644 drivers/gpu/drm/i915/i915_vma_resource.h
 delete mode 100644 drivers/gpu/drm/i915/i915_vma_snapshot.c
 delete mode 100644 drivers/gpu/drm/i915/i915_vma_snapshot.h

-- 
2.31.1



Re: [RFC PATCH] drm/panfrost: Handle IDVS_GROUP_SIZE feature

2022-01-10 Thread Steven Price
On 09/01/2022 17:12, Alyssa Rosenzweig wrote:
> The IDVS group size feature was missing. It is used on some Bifrost and
> Valhall GPUs, and is the last kernel-relevant Bifrost feature we're
> missing.
> 
> This feature adds an extra IDVS group size field to the JM_CONFIG
> register. In kbase, the value is configurable via the device tree; kbase
> uses 0xF as a default if no value is specified. Until we find a device
> demanding otherwise, let's always set the 0xF default on devices which
> support this feature mimicking kbase's behaviour.

This is a performance thing - so I don't think it will break anything if
this is wrong, it just won't be optimal.

> As JM_CONFIG is an undocumented register, it's not clear to me what
> happens if we fail to include this handling. Index-driven vertex shading
> already works on Bifrost boards with this feature without this handling.
> Perhaps this has performance implications? Patch untested for the
> moment, wanted to give Steven a chance to comment.

As it's a performance thing you shouldn't see correctness issues with
not setting it. But 0xF seems to have been chosen as it gave the best
overall performance (although for individual test content this can
vary). AFAICT the performance impact isn't massive either.
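
As an aside, a device-tree override along the lines of what kbase does
could look like the sketch below; the property name is hypothetical,
panfrost does not currently parse one:

	u32 idvs_group_size = JM_DEFAULT_IDVS_GROUP_SIZE;

	/* of_property_read_u32() leaves the default in place on error. */
	of_property_read_u32(pfdev->dev->of_node, "arm,idvs-group-size",
			     &idvs_group_size);
	quirks |= idvs_group_size << JM_IDVS_GROUP_SIZE_SHIFT;

That would match kbase's fall-back to 0xF while still letting boards
tune the value.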

> Applies on top of my feature clean up series which should go in first.
> (That's pure cleaunp, this is a behaviour change RFC needing
> discussion.)
> 
> Signed-off-by: Alyssa Rosenzweig 

Reviewed-by: Steven Price 

Since you've tagged this RFC I won't merge it now, but it looks correct
to me.

Thanks,

Steve

> ---
>  drivers/gpu/drm/panfrost/panfrost_features.h | 3 +++
>  drivers/gpu/drm/panfrost/panfrost_gpu.c  | 3 +++
>  drivers/gpu/drm/panfrost/panfrost_regs.h | 1 +
>  3 files changed, 7 insertions(+)
> 
> diff --git a/drivers/gpu/drm/panfrost/panfrost_features.h 
> b/drivers/gpu/drm/panfrost/panfrost_features.h
> index 34f2bae1ec8c..36fadcf9634e 100644
> --- a/drivers/gpu/drm/panfrost/panfrost_features.h
> +++ b/drivers/gpu/drm/panfrost/panfrost_features.h
> @@ -20,6 +20,7 @@ enum panfrost_hw_feature {
>   HW_FEATURE_AARCH64_MMU,
>   HW_FEATURE_TLS_HASHING,
>   HW_FEATURE_THREAD_GROUP_SPLIT,
> + HW_FEATURE_IDVS_GROUP_SIZE,
>   HW_FEATURE_3BIT_EXT_RW_L2_MMU_CONFIG,
>  };
>  
> @@ -74,6 +75,7 @@ enum panfrost_hw_feature {
>   BIT_ULL(HW_FEATURE_FLUSH_REDUCTION) | \
>   BIT_ULL(HW_FEATURE_PROTECTED_MODE) | \
>   BIT_ULL(HW_FEATURE_PROTECTED_DEBUG_MODE) | \
> + BIT_ULL(HW_FEATURE_IDVS_GROUP_SIZE) | \
>   BIT_ULL(HW_FEATURE_COHERENCY_REG))
>  
>  #define hw_features_g76 (\
> @@ -87,6 +89,7 @@ enum panfrost_hw_feature {
>   BIT_ULL(HW_FEATURE_COHERENCY_REG) | \
>   BIT_ULL(HW_FEATURE_AARCH64_MMU) | \
>   BIT_ULL(HW_FEATURE_TLS_HASHING) | \
> + BIT_ULL(HW_FEATURE_IDVS_GROUP_SIZE) | \
>   BIT_ULL(HW_FEATURE_3BIT_EXT_RW_L2_MMU_CONFIG))
>  
>  #define hw_features_g31 (\
> diff --git a/drivers/gpu/drm/panfrost/panfrost_gpu.c 
> b/drivers/gpu/drm/panfrost/panfrost_gpu.c
> index bbe628b306ee..50c8922694d7 100644
> --- a/drivers/gpu/drm/panfrost/panfrost_gpu.c
> +++ b/drivers/gpu/drm/panfrost/panfrost_gpu.c
> @@ -145,6 +145,9 @@ static void panfrost_gpu_init_quirks(struct 
> panfrost_device *pfdev)
>   quirks |= (COHERENCY_ACE_LITE | COHERENCY_ACE) <<
>  JM_FORCE_COHERENCY_FEATURES_SHIFT;
>  
> + if (panfrost_has_hw_feature(pfdev, HW_FEATURE_IDVS_GROUP_SIZE))
> + quirks |= JM_DEFAULT_IDVS_GROUP_SIZE << 
> JM_IDVS_GROUP_SIZE_SHIFT;
> +
>   if (quirks)
>   gpu_write(pfdev, GPU_JM_CONFIG, quirks);
>  
> diff --git a/drivers/gpu/drm/panfrost/panfrost_regs.h 
> b/drivers/gpu/drm/panfrost/panfrost_regs.h
> index 6c5a11ef1ee8..16e776cc82ea 100644
> --- a/drivers/gpu/drm/panfrost/panfrost_regs.h
> +++ b/drivers/gpu/drm/panfrost/panfrost_regs.h
> @@ -208,6 +208,7 @@
>  #define JM_MAX_JOB_THROTTLE_LIMIT0x3F
>  #define JM_FORCE_COHERENCY_FEATURES_SHIFT 2
>  #define JM_IDVS_GROUP_SIZE_SHIFT 16
> +#define JM_DEFAULT_IDVS_GROUP_SIZE   0xF
>  #define JM_MAX_IDVS_GROUP_SIZE   0x3F
>  
>  
> 



Re: [v2 1/3] dt-bindings: msm/dsi: Add 10nm dsi phy tuning properties

2022-01-10 Thread Rob Herring
On Mon, 10 Jan 2022 18:25:35 +0530, Rajeev Nandan wrote:
> In most cases, the default values of DSI PHY tuning registers should be
> sufficient as they are fully optimized. However, in some cases where
> extreme board parasitics cause the eye shape to degrade, the override
> bits can be used to improve the signal quality.
> 
> The general guidelines for DSI PHY tuning include:
> - High and moderate data rates may benefit from the drive strength and
>   drive level tuning.
> - Drive strength tuning will affect the output impedance and may be used
>   for matching optimization.
> - Drive level tuning will affect the output levels without affecting the
>   impedance.
> 
> The clock and data lanes have a calibration circuitry feature. The drive
> strength tuning can be done by adjusting rescode offset for hstop/hsbot,
> and the drive level tuning can be done by adjusting the LDO output level
> for the HSTX drive.
> 
> Signed-off-by: Rajeev Nandan 
> ---
> 
> Changes in v2:
>  - More details in the commit text (Stephen Boyd)
>  - Use human understandable values (Stephen Boyd, Dmitry Baryshkov)
>  - Do not take values that are going to be unused (Dmitry Baryshkov)
> 
>  .../bindings/display/msm/dsi-phy-10nm.yaml | 33 
> ++
>  1 file changed, 33 insertions(+)
> 
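
For context, these overrides end up as properties on the DSI PHY node.
A hypothetical example (property names assumed from this series; the
values are board-specific and shown only for shape):

	dsi0_phy: phy@ae94400 {
		/* ... */
		qcom,phy-rescode-offset-top = /bits/ 8 <0 0 0 0>;
		qcom,phy-rescode-offset-bot = /bits/ 8 <0 0 0 0>;
		qcom,phy-drive-ldo-level = <400>;
	};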

My bot found errors running 'make DT_CHECKER_FLAGS=-m dt_binding_check'
on your patch (DT_CHECKER_FLAGS is new in v5.13):

yamllint warnings/errors:
./Documentation/devicetree/bindings/display/msm/dsi-phy-10nm.yaml:63:54: 
[error] syntax error: mapping values are not allowed here (syntax)

dtschema/dtc warnings/errors:
./Documentation/devicetree/bindings/display/msm/dsi-phy-10nm.yaml:  mapping 
values are not allowed in this context
  in "", line 63, column 54
make[1]: *** Deleting file 
'Documentation/devicetree/bindings/display/msm/dsi-phy-10nm.example.dts'
Traceback (most recent call last):
  File "/usr/local/bin/dt-extract-example", line 46, in 
binding = yaml.load(open(args.yamlfile, encoding='utf-8').read())
  File "/usr/local/lib/python3.8/dist-packages/ruamel/yaml/main.py", line 434, 
in load
return constructor.get_single_data()
  File "/usr/local/lib/python3.8/dist-packages/ruamel/yaml/constructor.py", 
line 119, in get_single_data
node = self.composer.get_single_node()
  File "_ruamel_yaml.pyx", line 706, in _ruamel_yaml.CParser.get_single_node
  File "_ruamel_yaml.pyx", line 724, in _ruamel_yaml.CParser._compose_document
  File "_ruamel_yaml.pyx", line 775, in _ruamel_yaml.CParser._compose_node
  File "_ruamel_yaml.pyx", line 889, in 
_ruamel_yaml.CParser._compose_mapping_node
  File "_ruamel_yaml.pyx", line 775, in _ruamel_yaml.CParser._compose_node
  File "_ruamel_yaml.pyx", line 889, in 
_ruamel_yaml.CParser._compose_mapping_node
  File "_ruamel_yaml.pyx", line 775, in _ruamel_yaml.CParser._compose_node
  File "_ruamel_yaml.pyx", line 891, in 
_ruamel_yaml.CParser._compose_mapping_node
  File "_ruamel_yaml.pyx", line 904, in _ruamel_yaml.CParser._parse_next_event
ruamel.yaml.scanner.ScannerError: mapping values are not allowed in this context
  in "", line 63, column 54
make[1]: *** [Documentation/devicetree/bindings/Makefile:25: 
Documentation/devicetree/bindings/display/msm/dsi-phy-10nm.example.dts] Error 1
make[1]: *** Waiting for unfinished jobs
/builds/robherring/linux-dt-review/Documentation/devicetree/bindings/display/msm/dsi-phy-10nm.yaml:
 ignoring, error parsing file
make: *** [Makefile:1413: dt_binding_check] Error 2

doc reference errors (make refcheckdocs):

See https://patchwork.ozlabs.org/patch/1577891

This check can fail if there are any dependencies. The base for a patch
series is generally the most recent rc1.

If you already ran 'make dt_binding_check' and didn't see the above
error(s), then make sure 'yamllint' is installed and dt-schema is up to
date:

pip3 install dtschema --upgrade

Please check and re-submit.



Re: [git pull] drm fixes for 5.16-rc3

2022-01-10 Thread Linus Torvalds
On Sun, Jan 9, 2022 at 11:38 PM Geert Uytterhoeven  wrote:
>
> The commit that merged this branch made a seemingly innocent change to
> the top Makefile:

"Seemingly" innocent?

Or something darker and more sinister, related to the unrelenting
slaughter of flightless fowl?

You be the judge.

 Linus


Re: [PATCH] drm/panfrost: Update create_bo flags comment

2022-01-10 Thread Steven Price
On 09/01/2022 16:37, Alyssa Rosenzweig wrote:
> Update a comment stating create_bo took no flags, since it now takes a
> bit mask of optional flags NOEXEC and HEAP.
> 
> Signed-off-by: Alyssa Rosenzweig 

Reviewed-by: Steven Price 

I'll push this to drm-misc-next.

Thanks,

Steve

> ---
>  include/uapi/drm/panfrost_drm.h | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/include/uapi/drm/panfrost_drm.h b/include/uapi/drm/panfrost_drm.h
> index 061e700dd06c..9e40277d8185 100644
> --- a/include/uapi/drm/panfrost_drm.h
> +++ b/include/uapi/drm/panfrost_drm.h
> @@ -84,14 +84,14 @@ struct drm_panfrost_wait_bo {
>   __s64 timeout_ns;   /* absolute */
>  };
>  
> +/* Valid flags to pass to drm_panfrost_create_bo */
>  #define PANFROST_BO_NOEXEC   1
>  #define PANFROST_BO_HEAP 2
>  
>  /**
>   * struct drm_panfrost_create_bo - ioctl argument for creating Panfrost BOs.
>   *
> - * There are currently no values for the flags argument, but it may be
> - * used in a future extension.
> + * The flags argument is a bit mask of PANFROST_BO_* flags.
>   */
>  struct drm_panfrost_create_bo {
>   __u32 size;
> 



Re: [PATCH] drm/amd/display: invalid parameter check in dmub_hpd_callback

2022-01-10 Thread Harry Wentland
On 2022-01-09 13:42, José Expósito wrote:
> The function performs a check on the "adev" input parameter, however, it
> is used before the check.
> 
> Initialize the "dev" variable after the sanity check to avoid a possible
> NULL pointer dereference.
> 
> Fixes: e27c41d5b0681 ("drm/amd/display: Support for DMUB HPD interrupt 
> handling")
> Addresses-Coverity-ID: 1493909 ("Null pointer dereference")
> Signed-off-by: José Expósito 

Reviewed-by: Harry Wentland 

Harry

> ---
>  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index e727f1dd2a9a..7fbded7a6d9c 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -656,7 +656,7 @@ void dmub_hpd_callback(struct amdgpu_device *adev, struct 
> dmub_notification *not
>   struct drm_connector_list_iter iter;
>   struct dc_link *link;
>   uint8_t link_index = 0;
> - struct drm_device *dev = adev->dm.ddev;
> + struct drm_device *dev;
>  
>   if (adev == NULL)
>   return;
> @@ -673,6 +673,7 @@ void dmub_hpd_callback(struct amdgpu_device *adev, struct 
> dmub_notification *not
>  
>   link_index = notify->link_index;
>   link = adev->dm.dc->links[link_index];
> + dev = adev->dm.ddev;
>  
>   drm_connector_list_iter_begin(dev, &iter);
>   drm_for_each_connector_iter(connector, &iter) {



Re: [Intel-gfx] [PATCH 2/2] drm/mst: use DP_GET_SINK_COUNT() for sink count in ESI

2022-01-10 Thread Ville Syrjälä
On Tue, Jan 04, 2022 at 08:48:57PM +0200, Jani Nikula wrote:
> Take bit 7 into account when reading sink count from
> DP_DEVICE_SERVICE_IRQ_VECTOR_ESI0.
> 
> Signed-off-by: Jani Nikula 
> ---
>  drivers/gpu/drm/drm_dp_mst_topology.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/drm_dp_mst_topology.c 
> b/drivers/gpu/drm/drm_dp_mst_topology.c
> index f3d79eda94bb..ab4372e9fe43 100644
> --- a/drivers/gpu/drm/drm_dp_mst_topology.c
> +++ b/drivers/gpu/drm/drm_dp_mst_topology.c
> @@ -4196,7 +4196,7 @@ int drm_dp_mst_hpd_irq(struct drm_dp_mst_topology_mgr 
> *mgr, u8 *esi, bool *handl
>   int ret = 0;
>   int sc;
>   *handled = false;
> - sc = esi[0] & 0x3f;
> + sc = DP_GET_SINK_COUNT(esi[0]);

I wouldn't mind a s/sc/sink_count/ as well.
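
For reference, the helper folds bit 7 back in as the MSB of the count,
roughly:

	#define DP_GET_SINK_COUNT(x)	((((x) & 0x80) >> 1) | ((x) & 0x3f))

leaving bit 6 free for CP_READY, so sink counts up to 127 stay
representable.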

Reviewed-by: Ville Syrjälä 

>  
>   if (sc != mgr->sink_count) {
>   mgr->sink_count = sc;
> -- 
> 2.30.2

-- 
Ville Syrjälä
Intel


Re: [Intel-gfx] [PATCH 1/2] drm/dp: note that DPCD 0x2002-0x2003 match 0x200-0x201

2022-01-10 Thread Ville Syrjälä
On Tue, Jan 04, 2022 at 08:48:56PM +0200, Jani Nikula wrote:
> DP_SINK_COUNT_ESI and DP_DEVICE_SERVICE_IRQ_VECTOR_ESI0 have the same
> contents as DP_SINK_COUNT and DP_DEVICE_SERVICE_IRQ_VECTOR,
> respectively.

IIRC there was an oversight in the earlier spec revisions that
showed bit 7 as reserved for one of the locations. But looks like
that got fixed.

Reviewed-by: Ville Syrjälä 

> 
> Signed-off-by: Jani Nikula 
> ---
>  include/drm/drm_dp_helper.h | 7 ++-
>  1 file changed, 2 insertions(+), 5 deletions(-)
> 
> diff --git a/include/drm/drm_dp_helper.h b/include/drm/drm_dp_helper.h
> index 30359e434c3f..98d020835b49 100644
> --- a/include/drm/drm_dp_helper.h
> +++ b/include/drm/drm_dp_helper.h
> @@ -1038,11 +1038,8 @@ struct drm_panel;
>  #define DP_SIDEBAND_MSG_UP_REQ_BASE  0x1600   /* 1.2 MST */
>  
>  /* DPRX Event Status Indicator */
> -#define DP_SINK_COUNT_ESI0x2002   /* 1.2 */
> -/* 0-5 sink count */
> -# define DP_SINK_COUNT_CP_READY (1 << 6)
> -
> -#define DP_DEVICE_SERVICE_IRQ_VECTOR_ESI0   0x2003   /* 1.2 */
> +#define DP_SINK_COUNT_ESI   0x2002   /* same as 0x200 */
> +#define DP_DEVICE_SERVICE_IRQ_VECTOR_ESI0   0x2003   /* same as 0x201 */
>  
>  #define DP_DEVICE_SERVICE_IRQ_VECTOR_ESI1   0x2004   /* 1.2 */
>  # define DP_RX_GTC_MSTR_REQ_STATUS_CHANGE(1 << 0)
> -- 
> 2.30.2

-- 
Ville Syrjälä
Intel


Re: [PATCH 3/3] drm/atomic: Make private objs proper objects

2022-01-10 Thread Ville Syrjälä
On Fri, Dec 31, 2021 at 03:23:31PM +0200, Jani Nikula wrote:
> On Wed, 12 Jul 2017, ville.syrj...@linux.intel.com wrote:
> > From: Ville Syrjälä 
> >
> > Make the atomic private object stuff less special by introducing proper
> > base classes for the object and its state. Drivers can embed these in
> > their own appropriate objects, after which these things will work
> > exactly like the plane/crtc/connector states during atomic operations.
> >
> > v2: Reorder to not depend on drm_dynarray (Daniel)
> >
> > Cc: Dhinakaran Pandiyan 
> > Cc: Daniel Vetter 
> > Reviewed-by: Daniel Vetter  #v1
> > Signed-off-by: Ville Syrjälä 
> 
> Stumbled upon an old commit
> 
> commit a4370c777406c2810e37fafd166ccddecdb2a60c
> Author: Ville Syrjälä 
> Date:   Wed Jul 12 18:51:02 2017 +0300
> 
> drm/atomic: Make private objs proper objects
> 
> which is this patch.
> 
> > @@ -3050,8 +3043,7 @@ struct drm_dp_mst_topology_state 
> > *drm_atomic_get_mst_topology_state(struct drm_a
> > struct drm_device *dev = mgr->dev;
> >  
> > WARN_ON(!drm_modeset_is_locked(&dev->mode_config.connection_mutex));
> > -   return drm_atomic_get_private_obj_state(state, mgr,
> > -   &mst_state_funcs);
> > +   return to_dp_mst_topology_state(drm_atomic_get_private_obj_state(state, 
> > &mgr->base));
> >  }
> >  EXPORT_SYMBOL(drm_atomic_get_mst_topology_state);
> 
> I don't think this combines well with...
> 
> > diff --git a/include/drm/drm_dp_mst_helper.h 
> > b/include/drm/drm_dp_mst_helper.h
> > index 177ab6f86855..d55abb75f29a 100644
> > --- a/include/drm/drm_dp_mst_helper.h
> > +++ b/include/drm/drm_dp_mst_helper.h
> > @@ -404,12 +404,17 @@ struct drm_dp_payload {
> > int vcpi;
> >  };
> >  
> > +#define to_dp_mst_topology_state(x) container_of(x, struct 
> > drm_dp_mst_topology_state, base)
> 
> ...this in case of error pointers that
> drm_atomic_get_private_obj_state() may return.

offsetof(base)==0 so should work in practice.
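
I.e. with the embedded base as the first member, container_of() is a
pointer-identity cast, so e.g.

	mst_state = to_dp_mst_topology_state(
			drm_atomic_get_private_obj_state(state, &mgr->base));
	if (IS_ERR(mst_state))
		return PTR_ERR(mst_state);

still sees the original error pointer. It would only break if base ever
moved to a non-zero offset.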

-- 
Ville Syrjälä
Intel


Re: [PATCH 1/2] drm: exynos: dsi: Convert to bridge driver

2022-01-10 Thread Robert Foss
On Mon, 10 Jan 2022 at 16:35, Jagan Teki  wrote:
>
> Hi Robert,
>
> On Mon, Jan 10, 2022 at 9:02 PM Robert Foss  wrote:
> >
> > Hey Jagan,
> >
> > This is a mistake on my end, I must have been looking at reviewing
> > this series and then accidentally included it with another batch of
> > patches. Thank you for catching this.
>
> Thanks for the response.
>
> >
> > I would suggest reverting these two patches[1][2]. Is that ok with you?
>
> Maybe I will revert 1/2, but 2/2 is valid. Please let me know if you
> have any concerns about reverting 1/2.

Please go ahead!


Re: [git pull] drm final fixes for 5.16

2022-01-10 Thread Daniel Vetter
On Fri, Jan 7, 2022 at 6:42 PM Linus Torvalds
 wrote:
>
> On Thu, Jan 6, 2022 at 7:23 PM Dave Airlie  wrote:
> >
> > There is only the amdgpu runtime pm regression fix in here.
>
> Thanks, from a quick test it works for me - the backlight actually
> does eventually go away.
>
> It does so only on the second time the monitors say "no signal, going
> to power save", but that has been true before too.
>
> So I think there's still some confusion in this area, but it might be
> elsewhere - who knows what Wayland and friends do. At least it doesn't
> look like a regression to me any more.

Well it's not a true fix, just a "go back to exact old behaviour, but
limited to relevant gpus for amdgpu only" so that i915 doesn't
regress. I think there's some more debug to do here and
Alex/Harry&team can look at leisure now :-)

Cheers, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch


Re: [PATCH 1/2] drm: exynos: dsi: Convert to bridge driver

2022-01-10 Thread Jagan Teki
Hi Robert,

On Mon, Jan 10, 2022 at 9:02 PM Robert Foss  wrote:
>
> Hey Jagan,
>
> This is a mistake on my end, I must have been looking at reviewing
> this series and then accidentally included it with another batch of
> patches. Thank you for catching this.

Thanks for the response.

>
> I would suggest reverting these two patches[1][2]. Is that ok with you?

Maybe I will revert 1/2, but 2/2 is valid. Please let me know if you
have any concerns about reverting 1/2.

Thanks,
Jagan.


Re: [PATCH 1/2] drm: exynos: dsi: Convert to bridge driver

2022-01-10 Thread Robert Foss
Hey Jagan,

This is a mistake on my end, I must have been looking at reviewing
this series and then accidentally included it with another batch of
patches. Thank you for catching this.

I would suggest reverting these two patches[1][2]. Is that ok with you?

[1] 
https://cgit.freedesktop.org/drm/drm-misc/commit/?id=92e794fab87af0793403d5e4a547f0be94a0e656
[2] 
https://cgit.freedesktop.org/drm/drm-misc/commit/?id=aee039e66035b66f0c587cc1b0dd32fb04c9a892


On Mon, 10 Jan 2022 at 12:17, Jagan Teki  wrote:
>
> Hi Robert,
>
> On Mon, Nov 22, 2021 at 9:34 PM Marek Szyprowski
>  wrote:
> >
> > On 22.11.2021 16:07, Marek Szyprowski wrote:
> > > On 22.11.2021 15:55, Jagan Teki wrote:
> > >> On Mon, Nov 22, 2021 at 7:59 PM Jagan Teki
> > >>  wrote:
> > >>> On Mon, Nov 22, 2021 at 7:51 PM Jagan Teki
> > >>>  wrote:
> >  On Mon, Nov 22, 2021 at 7:45 PM Marek Szyprowski
> >   wrote:
> > > On 22.11.2021 08:06, Jagan Teki wrote:
> > >> Some display panels come up with a non-DSI output; those can
> > >> be connected to the DSI host by means of an interface bridge
> > >> converter.
> > >>
> > >> Such a DSI to non-DSI bridge converter requires the DSI host to
> > >> handle drm bridge functionality in order to attach the DSI host
> > >> to the interface bridge.
> > >>
> > >> This patch converts the existing driver to a drm bridge driver
> > >> with built-in encoder support for compatibility with existing
> > >> component drivers.
> > >>
> > >> Signed-off-by: Jagan Teki 
> > >> ---
> > >> Note:
> > >> Hi Marek Szyprowski,
> > >>
> > >> Please test this on Panel and Bridge hardware.
> > > I don't have good news, t crashes:
> > >
> > > [drm] Exynos DRM: using 1380.decon device for DMA mapping
> > > operations
> > > exynos-drm exynos-drm: bound 1380.decon (ops decon_component_ops)
> > > exynos-drm exynos-drm: bound 1388.decon (ops decon_component_ops)
> > > exynos-drm exynos-drm: bound 1393.mic (ops
> > > exynos_mic_component_ops)
> > > [drm:drm_bridge_attach] *ERROR* failed to attach bridge
> > > /soc@0/dsi@1390 to encoder TMDS-67: -22
> > > exynos-drm exynos-drm: failed to bind 1390.dsi (ops
> > > exynos_dsi_component_ops): -22
> > > Internal error: synchronous external abort: 96000210 [#1] PREEMPT SMP
> > > Modules linked in:
> > > CPU: 2 PID: 74 Comm: kworker/u16:1 Not tainted 5.16.0-rc1+ #4141
> > > Hardware name: Samsung TM2E board (DT)
> > > Workqueue: events_unbound deferred_probe_work_func
> > > pstate: 8005 (Nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--)
> > > pc : decon_atomic_disable+0x58/0xd4
> > > lr : decon_atomic_disable+0x28/0xd4
> > > sp : 80001390b940
> > > x29: 80001390b940 x28: 80001259a000 x27: 27f39e80
> > > input: stmfts as
> > > /devices/platform/soc@0/14ed.hsi2c/i2c-3/3-0049/input/input0
> > > x26: ffea x25: 25a40280 x24: 0001
> > > x23: 800011b55f98 x22: 315dc000 x21: 2695d100
> > > x20: 27e7a080 x19: 315e6000 x18: 
> > > x17: 645f736f6e797865 x16: 2073706f28206973 x15: 00028ee0
> > > x14: 0028 x13: 0001 x12: 0040
> > > x11: 23c18920 x10: 23c18922 x9 : 8000126352f0
> > > x8 : 23c00270 x7 :  x6 : 23c00268
> > > x5 : 27e7a3a0 x4 : 0001 x3 : 27e7a080
> > > x2 : 0024 x1 : 800013bc8024 x0 : 246117c0
> > > Call trace:
> > >decon_atomic_disable+0x58/0xd4
> > >decon_unbind+0x1c/0x3c
> > >component_unbind+0x38/0x60
> > >component_bind_all+0x16c/0x25c
> > >exynos_drm_bind+0x104/0x1bc
> > >try_to_bring_up_master+0x164/0x1d0
> > >__component_add+0xa8/0x174
> > >component_add+0x14/0x20
> > >hdmi_probe+0x438/0x710
> > >platform_probe+0x68/0xe0
> > >really_probe.part.0+0x9c/0x31c
> > >__driver_probe_device+0x98/0x144
> > >driver_probe_device+0xc8/0x160
> > >__device_attach_driver+0xb8/0x120
> > >bus_for_each_drv+0x78/0xd0
> > >__device_attach+0xd8/0x180
> > >device_initial_probe+0x14/0x20
> > >bus_probe_device+0x9c/0xa4
> > >deferred_probe_work_func+0x88/0xc4
> > >process_one_work+0x288/0x6f0
> > >worker_thread+0x74/0x470
> > >kthread+0x188/0x194
> > >ret_from_fork+0x10/0x20
> > > Code: 11002042 f9481c61 531e7442 8b020021 (88dffc21)
> > > ---[ end trace d73aff585b108954 ]---
> > > Kernel panic - not syncing: synchronous external abort: Fatal
> > > exception
> > > SMP: stopping secondary CPUs
> > > Kernel Offset: disabled
> > > CPU features: 0x2,300071c2,0846
> > > Memory Limit: none
> > > ---[ end Ke

Re: [PATCH v6 4/6] drm/i915: Use vma resources for async unbinding

2022-01-10 Thread Thomas Hellström



On 1/10/22 14:21, Matthew Auld wrote:

On 07/01/2022 14:23, Thomas Hellström wrote:

Implement async (non-blocking) unbinding by not syncing the vma before
calling unbind on the vma_resource.
Add the resulting unbind fence to the object's dma_resv from where it is
picked up by the ttm migration code.
Ideally these unbind fences should be coalesced with the migration blit
fence to avoid stalling the migration blit waiting for unbind, as they
can certainly go on in parallel, but since we don't yet have a
reasonable data structure to use to coalesce fences and attach the
resulting fence to a timeline, we defer that for now.

Note that with async unbinding, even while the unbind waits for the
preceding bind to complete before unbinding, the vma itself might have been
destroyed in the process, clearing the vma pages. Therefore we can
only allow async unbinding if we have a refcounted sg-list and keep a
refcount on that for the vma resource pages to stay intact until
binding occurs. If this condition is not met, a request for an async
unbind is diverted to a sync unbind.

v2:
- Use a separate kmem_cache for vma resources for now to isolate their
   memory allocation and aid debugging.
- Move the check for vm closed to the actual unbinding thread. Regardless
   of whether the vm is closed, we need the unbind fence to properly wait
   for capture.
- Clear vma_res::vm on unbind and update its documentation.
v4:
- Take cache coloring into account when searching for vma resources
   pending unbind. (Matthew Auld)
v5:
- Fix timeout and error check in i915_vma_resource_bind_dep_await().
- Avoid taking a reference on the object for async binding if
   async unbind capable.
- Fix braces around a single-line if statement.
v6:
- Fix up the cache coloring adjustment. (Kernel test robot )
- Don't allow async unbinding if the vma_res pages are not the same as
   the object pages.

Signed-off-by: Thomas Hellström 
---
  drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c |  11 +-
  drivers/gpu/drm/i915/gt/intel_ggtt.c |   2 +-
  drivers/gpu/drm/i915/gt/intel_gtt.c  |   4 +
  drivers/gpu/drm/i915/gt/intel_gtt.h  |   3 +
  drivers/gpu/drm/i915/i915_drv.h  |   1 +
  drivers/gpu/drm/i915/i915_gem.c  |  12 +-
  drivers/gpu/drm/i915/i915_module.c   |   3 +
  drivers/gpu/drm/i915/i915_vma.c  | 205 +--
  drivers/gpu/drm/i915/i915_vma.h  |   3 +-
  drivers/gpu/drm/i915/i915_vma_resource.c | 354 +--
  drivers/gpu/drm/i915/i915_vma_resource.h |  48 +++
  11 files changed, 579 insertions(+), 67 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c 
b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c

index 8653855d808b..1de306c03aaf 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
@@ -142,7 +142,16 @@ int i915_ttm_move_notify(struct 
ttm_buffer_object *bo)

  struct drm_i915_gem_object *obj = i915_ttm_to_gem(bo);
  int ret;
  -    ret = i915_gem_object_unbind(obj, I915_GEM_OBJECT_UNBIND_ACTIVE);
+    /*
+ * Note: The async unbinding here will actually transform the
+ * blocking wait for unbind into a wait before finally submitting
+ * evict / migration blit and thus stall the migration timeline
+ * which may not be good for overall throughput. We should make
+ * sure we await the unbind fences *after* the migration blit
+ * instead of *before* as we currently do.
+ */
+    ret = i915_gem_object_unbind(obj, I915_GEM_OBJECT_UNBIND_ACTIVE |
+ I915_GEM_OBJECT_UNBIND_ASYNC);
  if (ret)
  return ret;
  diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c 
b/drivers/gpu/drm/i915/gt/intel_ggtt.c

index e49b6250c4b7..a1b2761bc16e 100644
--- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
@@ -142,7 +142,7 @@ void i915_ggtt_suspend_vm(struct 
i915_address_space *vm)

  continue;
    if (!i915_vma_is_bound(vma, I915_VMA_GLOBAL_BIND)) {
-    __i915_vma_evict(vma);
+    __i915_vma_evict(vma, false);
  drm_mm_remove_node(&vma->node);
  }
  }
diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c 
b/drivers/gpu/drm/i915/gt/intel_gtt.c

index a94be0306464..46be4197b93f 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.c
@@ -161,6 +161,9 @@ static void __i915_vm_release(struct work_struct 
*work)

  struct i915_address_space *vm =
  container_of(work, struct i915_address_space, release_work);
  +    /* Synchronize async unbinds. */
+    i915_vma_resource_bind_dep_sync_all(vm);
+
  vm->cleanup(vm);
  i915_address_space_fini(vm);
  @@ -189,6 +192,7 @@ void i915_address_space_init(struct 
i915_address_space *vm, int subclass)

  if (!kref_read(&vm->resv_ref))
  kref_init(&vm->resv_ref);
  +    vm->pending_unbind = RB_ROOT_CACHED;
  INIT_WORK(&vm->rele

Re: [PATCH v6 5/6] drm/i915: Asynchronous migration selftest

2022-01-10 Thread Matthew Auld

On 10/01/2022 14:36, Thomas Hellström wrote:


On 1/10/22 14:59, Matthew Auld wrote:

On 07/01/2022 14:23, Thomas Hellström wrote:

Add a selftest to exercise asynchronous migration and -unbinding.
Extend the gem_migrate selftest to perform the migrations while
depending on a spinner and a bound vma set up on the migrated
buffer object.

Signed-off-by: Thomas Hellström 
---
  drivers/gpu/drm/i915/gem/i915_gem_object.c    |  12 ++
  drivers/gpu/drm/i915/gem/i915_gem_object.h    |   3 +
  .../drm/i915/gem/selftests/i915_gem_migrate.c | 192 --
  3 files changed, 192 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c 
b/drivers/gpu/drm/i915/gem/i915_gem_object.c

index d87b508b59b1..1a9e1f940a7d 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c
@@ -756,6 +756,18 @@ i915_gem_object_get_moving_fence(struct 
drm_i915_gem_object *obj)

  return dma_fence_get(i915_gem_to_ttm(obj)->moving);
  }
  +void i915_gem_object_set_moving_fence(struct drm_i915_gem_object 
*obj,

+  struct dma_fence *fence)
+{
+    struct dma_fence **moving = &i915_gem_to_ttm(obj)->moving;
+
+    if (*moving == fence)
+    return;
+
+    dma_fence_put(*moving);
+    *moving = dma_fence_get(fence);
+}
+
  /**
   * i915_gem_object_wait_moving_fence - Wait for the object's moving 
fence if any

   * @obj: The object whose moving fence to wait for.
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.h 
b/drivers/gpu/drm/i915/gem/i915_gem_object.h

index f66d46882ea7..1d178236 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.h
@@ -524,6 +524,9 @@ i915_gem_object_finish_access(struct 
drm_i915_gem_object *obj)

  struct dma_fence *
  i915_gem_object_get_moving_fence(struct drm_i915_gem_object *obj);
  +void i915_gem_object_set_moving_fence(struct drm_i915_gem_object 
*obj,

+  struct dma_fence *fence);
+
  int i915_gem_object_wait_moving_fence(struct drm_i915_gem_object *obj,
    bool intr);
  diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_migrate.c 
b/drivers/gpu/drm/i915/gem/selftests/i915_gem_migrate.c

index ecb691c81d1e..d534141b2cf7 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_migrate.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_migrate.c
@@ -4,8 +4,13 @@
   */
    #include "gt/intel_migrate.h"
+#include "gt/intel_gpu_commands.h"
  #include "gem/i915_gem_ttm_move.h"
  +#include "i915_deps.h"
+
+#include "selftests/igt_spinner.h"
+
  static int igt_fill_check_buffer(struct drm_i915_gem_object *obj,
   bool fill)
  {
@@ -101,7 +106,8 @@ static int igt_same_create_migrate(void *arg)
  }
    static int lmem_pages_migrate_one(struct i915_gem_ww_ctx *ww,
-  struct drm_i915_gem_object *obj)
+  struct drm_i915_gem_object *obj,
+  struct i915_vma *vma)
  {
  int err;
  @@ -109,6 +115,24 @@ static int lmem_pages_migrate_one(struct 
i915_gem_ww_ctx *ww,

  if (err)
  return err;
  +    if (vma) {
+    err = i915_vma_pin_ww(vma, ww, obj->base.size, 0,
+  0UL | PIN_OFFSET_FIXED |
+  PIN_USER);
+    if (err) {
+    if (err != -EINTR && err != ERESTARTSYS &&
+    err != -EDEADLK)
+    pr_err("Failed to pin vma.\n");
+    return err;
+    }
+
+    i915_vma_unpin(vma);
+    }
+
+    /*
+ * Migration will implicitly unbind (asynchronously) any bound
+ * vmas.
+ */
  if (i915_gem_object_is_lmem(obj)) {
  err = i915_gem_object_migrate(obj, ww, INTEL_REGION_SMEM);
  if (err) {
@@ -149,11 +173,15 @@ static int lmem_pages_migrate_one(struct 
i915_gem_ww_ctx *ww,

  return err;
  }
  -static int igt_lmem_pages_migrate(void *arg)
+static int __igt_lmem_pages_migrate(struct intel_gt *gt,
+    struct i915_address_space *vm,
+    struct i915_deps *deps,
+    struct igt_spinner *spin,
+    struct dma_fence *spin_fence)
  {
-    struct intel_gt *gt = arg;
  struct drm_i915_private *i915 = gt->i915;
  struct drm_i915_gem_object *obj;
+    struct i915_vma *vma = NULL;
  struct i915_gem_ww_ctx ww;
  struct i915_request *rq;
  int err;
@@ -165,6 +193,14 @@ static int igt_lmem_pages_migrate(void *arg)
  if (IS_ERR(obj))
  return PTR_ERR(obj);
  +    if (vm) {
+    vma = i915_vma_instance(obj, vm, NULL);
+    if (IS_ERR(vma)) {
+    err = PTR_ERR(vma);
+    goto out_put;
+    }
+    }
+
  /* Initial GPU fill, sync, CPU initialization. */
  for_i915_gem_ww(&ww, err, true) {
  err = i915_gem_object_lock(obj, &ww);
@@ -175,25 +211,23 @@ static int igt_lmem_pages_migrate(void *arg)
  if (err)
  continue;
  -    err = intel_migrate_clear(>->migrate, &ww, 

Re: [PATCH v6 5/6] drm/i915: Asynchronous migration selftest

2022-01-10 Thread Thomas Hellström



On 1/10/22 14:59, Matthew Auld wrote:

On 07/01/2022 14:23, Thomas Hellström wrote:

Add a selftest to exercise asynchronous migration and -unbinding.
Extend the gem_migrate selftest to perform the migrations while
depending on a spinner and a bound vma set up on the migrated
buffer object.

Signed-off-by: Thomas Hellström 
---
  drivers/gpu/drm/i915/gem/i915_gem_object.c    |  12 ++
  drivers/gpu/drm/i915/gem/i915_gem_object.h    |   3 +
  .../drm/i915/gem/selftests/i915_gem_migrate.c | 192 --
  3 files changed, 192 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c 
b/drivers/gpu/drm/i915/gem/i915_gem_object.c

index d87b508b59b1..1a9e1f940a7d 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c
@@ -756,6 +756,18 @@ i915_gem_object_get_moving_fence(struct 
drm_i915_gem_object *obj)

  return dma_fence_get(i915_gem_to_ttm(obj)->moving);
  }
  +void i915_gem_object_set_moving_fence(struct drm_i915_gem_object 
*obj,

+  struct dma_fence *fence)
+{
+    struct dma_fence **moving = &i915_gem_to_ttm(obj)->moving;
+
+    if (*moving == fence)
+    return;
+
+    dma_fence_put(*moving);
+    *moving = dma_fence_get(fence);
+}
+
  /**
   * i915_gem_object_wait_moving_fence - Wait for the object's moving 
fence if any

   * @obj: The object whose moving fence to wait for.
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.h 
b/drivers/gpu/drm/i915/gem/i915_gem_object.h

index f66d46882ea7..1d178236 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.h
@@ -524,6 +524,9 @@ i915_gem_object_finish_access(struct 
drm_i915_gem_object *obj)

  struct dma_fence *
  i915_gem_object_get_moving_fence(struct drm_i915_gem_object *obj);
  +void i915_gem_object_set_moving_fence(struct drm_i915_gem_object 
*obj,

+  struct dma_fence *fence);
+
  int i915_gem_object_wait_moving_fence(struct drm_i915_gem_object *obj,
    bool intr);
  diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_migrate.c 
b/drivers/gpu/drm/i915/gem/selftests/i915_gem_migrate.c

index ecb691c81d1e..d534141b2cf7 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_migrate.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_migrate.c
@@ -4,8 +4,13 @@
   */
    #include "gt/intel_migrate.h"
+#include "gt/intel_gpu_commands.h"
  #include "gem/i915_gem_ttm_move.h"
  +#include "i915_deps.h"
+
+#include "selftests/igt_spinner.h"
+
  static int igt_fill_check_buffer(struct drm_i915_gem_object *obj,
   bool fill)
  {
@@ -101,7 +106,8 @@ static int igt_same_create_migrate(void *arg)
  }
    static int lmem_pages_migrate_one(struct i915_gem_ww_ctx *ww,
-  struct drm_i915_gem_object *obj)
+  struct drm_i915_gem_object *obj,
+  struct i915_vma *vma)
  {
  int err;
  @@ -109,6 +115,24 @@ static int lmem_pages_migrate_one(struct 
i915_gem_ww_ctx *ww,

  if (err)
  return err;
  +    if (vma) {
+    err = i915_vma_pin_ww(vma, ww, obj->base.size, 0,
+  0UL | PIN_OFFSET_FIXED |
+  PIN_USER);
+    if (err) {
+    if (err != -EINTR && err != ERESTARTSYS &&
+    err != -EDEADLK)
+    pr_err("Failed to pin vma.\n");
+    return err;
+    }
+
+    i915_vma_unpin(vma);
+    }
+
+    /*
+ * Migration will implicitly unbind (asynchronously) any bound
+ * vmas.
+ */
  if (i915_gem_object_is_lmem(obj)) {
  err = i915_gem_object_migrate(obj, ww, INTEL_REGION_SMEM);
  if (err) {
@@ -149,11 +173,15 @@ static int lmem_pages_migrate_one(struct 
i915_gem_ww_ctx *ww,

  return err;
  }
  -static int igt_lmem_pages_migrate(void *arg)
+static int __igt_lmem_pages_migrate(struct intel_gt *gt,
+    struct i915_address_space *vm,
+    struct i915_deps *deps,
+    struct igt_spinner *spin,
+    struct dma_fence *spin_fence)
  {
-    struct intel_gt *gt = arg;
  struct drm_i915_private *i915 = gt->i915;
  struct drm_i915_gem_object *obj;
+    struct i915_vma *vma = NULL;
  struct i915_gem_ww_ctx ww;
  struct i915_request *rq;
  int err;
@@ -165,6 +193,14 @@ static int igt_lmem_pages_migrate(void *arg)
  if (IS_ERR(obj))
  return PTR_ERR(obj);
  +    if (vm) {
+    vma = i915_vma_instance(obj, vm, NULL);
+    if (IS_ERR(vma)) {
+    err = PTR_ERR(vma);
+    goto out_put;
+    }
+    }
+
  /* Initial GPU fill, sync, CPU initialization. */
  for_i915_gem_ww(&ww, err, true) {
  err = i915_gem_object_lock(obj, &ww);
@@ -175,25 +211,23 @@ static int igt_lmem_pages_migrate(void *arg)
  if (err)
  continue;
-    err = intel_migrate_clear(&gt->migrate, &ww, NULL,
+    err = intel_migrate_clear(&gt->m

Re: [Intel-gfx] [PATCH v6 6/6] drm/i915: Use struct vma_resource instead of struct vma_snapshot

2022-01-10 Thread Matthew Auld
On Fri, 7 Jan 2022 at 14:24, Thomas Hellström
 wrote:
>
> There is always a struct vma_resource guaranteed to be alive when we
> access a corresponding struct vma_snapshot.
>
> So ditch the latter and instead of allocating vma_snapshots, reference
> the already existing vma_resource.
>
> This requires a couple of extra members in struct vma_resource but that's
> a small price to pay for the simplification.
>
> v2:
> - Fix a missing include and declaration (kernel test robot )
>
> Signed-off-by: Thomas Hellström 
Reviewed-by: Matthew Auld 


Re: [PATCH 2/2] drm/panfrost: adjusted job affinity for dual core group GPUs

2022-01-10 Thread Steven Price
On 24/12/2021 08:56, Alexey Sheplyakov wrote:
> Hi,
> 
> On 23.12.2021 18:11, Alyssa Rosenzweig wrote:
>>> The kernel driver itself can't guess which jobs need such a strict
>>> affinity, so setting proper requirements is the responsibility of
>>> the userspace (Mesa). However the userspace is not smart enough [yet].
>>> Therefore this patch applies the above affinity rule to all jobs on
>>> dual core group GPUs.
>>
>> What does Mesa need to do for this to work "properly"?
> 
> I don't know.
> The blob restricts affinity of jobs with JD_REQ_COHERENT_GROUP requirement.
> In theory jobs without such a requirement can run on any core, but in
> practice all jobs in slots 0, 1 are assigned to core group 0 (with workloads
> I've run - i.e. weston, firefox, glmark2, perhaps it's also SoC dependent).
> So I've forced all jobs in slots 0, 1 to core group 0. Surprisingly this
> (and memory attributes adjustment) appeared to be enough to get panfrost
> working with T628 (on some SoCs). Without these patches GPU locks up in
> a few seconds.

Let me fill in a few details here.

The T628 is pretty unique in that it has two core groups, i.e. more than
one L2 cache. Previous designs (i.e. T604) didn't have enough cores to
require a second core group, and later designs with sufficient cores
have coherent L2 caches so act as a single core group (although the
hardware has multiple L2s, it only reports a single one as they act as
a single cache).

Note that technically the T608, T658 and T678 also exist and have this
problem - but I don't believe any products were produced with these (so
unless you're at Arm with a very unusual FPGA they can be ignored).

The blob/kbase handle this situation with a new flag
JD_REQ_COHERENT_GROUP which specifies that the affinity of a job must
land on a single (coherent) core group, and
JD_REQ_SPECIFIC_COHERENT_GROUP which allows user space to target a
specific group.

In theory fragment shading can be performed over all cores (because a
fragment shader job doesn't need coherency between threads), so doesn't
need the JD_REQ_COHERENT_GROUP flag; vertex shading, however, must run
on the same core group as the tiler (which always lives in core
group 0).

Of course there are various 'tricks' that can happen even within a
fragment shader which might require coherency.

So the expected sequence is that vertex+tiling is restricted to core
group 0, and fragment shading can be run over all cores. Although there
can be issues with performance doing this naïvely because the Job
Manager doesn't necessarily share the GPUs cores fairly between vertex
and fragment jobs. Also note that a cache flush is needed between
running the vertex+tiling and the fragment job to ensure that the extra
core group is coherent - this can be expensive, so it may not be worth
using the second core group in some situations. I'm not sure what logic
the Blob uses for that.

Finally there's GPU compute (i.e. OpenCL): here coherency is usually
required, but there's often more information about the amount of
coherency needed. In this case it is possible to run different job
chains on each core group. This is the only situation there slot 2 is
used, and is the reason for the JS_REQ_SPECIFIC_COHERENT_GROUP flag.
It's also a nightmare for scheduling as the hardware gets upset if the
affinity masks for slot 1 and slot 2 overlap.
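
To make that rule concrete, a minimal sketch of the affinity choice
(not the actual panfrost code; the mask parameters stand in for values
read from the GPU configuration registers):

#include <stdint.h>
#include <stdbool.h>

/* Sketch: pick a job affinity mask for a two-core-group GPU. */
static uint64_t job_affinity(uint64_t core_mask, uint64_t group0_mask,
                             bool needs_coherency)
{
        /*
         * Jobs that need coherency (vertex+tiling, and any fragment
         * shader using cross-thread tricks) must stay on core group 0,
         * next to the tiler; only jobs known not to need coherency may
         * spread over all cores.
         */
        if (needs_coherency)
                return core_mask & group0_mask;

        return core_mask;       /* e.g. plain fragment shading */
}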

>> What are the limitations of the approach implemented here?
> 
> Suboptimal performance.
> 
> 1) There might be job chains which don't care about affinity
>(I haven't seen any of these yet on systems I've got).

You are effectively throwing away half the cores if everything is pinned
to core group 0, so I'm pretty sure the Blob manages to run (some)
fragment jobs without the COHERENT_GROUP flag.

But equally this is a reasonable first step for the kernel driver - we
can make the GPU look like every other GPU by pretending the second core
group doesn't exist.

> 2) There might be dual core group GPUs which don't need such a strict 
> affinity.
>(I haven't seen any dual core group T[78]xx GPUs yet. This doesn't mean 
> such
> GPUs don't exist).

They should all be a single core group (fully coherent L2s).

>> If we need to extend it down the line with a UABI change, what would that 
>> look like?
> 
> I have no idea. And I'm not sure if it's worth the effort (since most jobs
> end up on core group 0 anyway).

Whether it's worth the effort depends on whether anyone really cares
about getting the full performance out of this particular GPU.

At this stage I think the main UABI change would be to add the opposite
flag to kbase, (e.g. "PANFROST_JD_DOESNT_NEED_COHERENCY_ON_GPU"[1]) to
opt-in to allowing the job to run across all cores.

The second change would be to allow compute jobs to be run on the second
core group, so another flag: PANFROST_RUN_ON_SECOND_CORE_GROUP.

But clearly there's little point adding such flags until someone steps
up to do the Mesa work.
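
For illustration only, those two opt-ins might look like this in the
uapi header (placeholder names straight from this thread, not a
committed interface):

/* Hypothetical additions to include/uapi/drm/panfrost_drm.h */
#define PANFROST_JD_DOESNT_NEED_COHERENCY_ON_GPU  (1 << 0)
#define PANFROST_RUN_ON_SECOND_CORE_GROUP         (1 << 1)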

Steve

[1] Bike-shedding the

Re: [v2 2/3] drm/msm/dsi: Add dsi phy tuning configuration support

2022-01-10 Thread Dmitry Baryshkov
On Mon, 10 Jan 2022 at 15:56, Rajeev Nandan  wrote:
>
> Add support for MSM DSI PHY tuning configuration. Current design is
> to support drive strength and drive level/amplitude tuning for
> 10nm PHY version, but this can be extended to other PHY versions.
>
> Signed-off-by: Rajeev Nandan 
> ---
>
> Changes in v2:
>  - New.
>  - Split into generic code and 10nm-specific part (Dmitry Baryshkov)
>
>  drivers/gpu/drm/msm/dsi/phy/dsi_phy.c |  3 +++
>  drivers/gpu/drm/msm/dsi/phy/dsi_phy.h | 16 
>  2 files changed, 19 insertions(+)
>
> diff --git a/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c 
> b/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c
> index 8c65ef6..ee3739d 100644
> --- a/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c
> +++ b/drivers/gpu/drm/msm/dsi/phy/dsi_phy.c
> @@ -739,6 +739,9 @@ static int dsi_phy_driver_probe(struct platform_device 
> *pdev)
> }
> }
>
> +   if (phy->cfg->ops.tuning_cfg_init)
> +   phy->cfg->ops.tuning_cfg_init(phy);

Please rename to parse_dt_properties() or something like that.

> +
> ret = dsi_phy_regulator_init(phy);
> if (ret)
> goto fail;
> diff --git a/drivers/gpu/drm/msm/dsi/phy/dsi_phy.h 
> b/drivers/gpu/drm/msm/dsi/phy/dsi_phy.h
> index b91303a..b559a2b 100644
> --- a/drivers/gpu/drm/msm/dsi/phy/dsi_phy.h
> +++ b/drivers/gpu/drm/msm/dsi/phy/dsi_phy.h
> @@ -25,6 +25,7 @@ struct msm_dsi_phy_ops {
> void (*save_pll_state)(struct msm_dsi_phy *phy);
> int (*restore_pll_state)(struct msm_dsi_phy *phy);
> bool (*set_continuous_clock)(struct msm_dsi_phy *phy, bool enable);
> +   void (*tuning_cfg_init)(struct msm_dsi_phy *phy);
>  };
>
>  struct msm_dsi_phy_cfg {
> @@ -81,6 +82,20 @@ struct msm_dsi_dphy_timing {
>  #define DSI_PIXEL_PLL_CLK  1
>  #define NUM_PROVIDED_CLKS  2
>
> +#define DSI_LANE_MAX   5
> +
> +/**
> + * struct msm_dsi_phy_tuning_cfg - Holds PHY tuning config parameters.
> + * @rescode_offset_top: Offset for pull-up legs rescode.
> + * @rescode_offset_bot: Offset for pull-down legs rescode.
> + * @vreg_ctrl: vreg ctrl to drive LDO level
> + */
> +struct msm_dsi_phy_tuning_cfg {
> +   u8 rescode_offset_top[DSI_LANE_MAX];
> +   u8 rescode_offset_bot[DSI_LANE_MAX];
> +   u8 vreg_ctrl;
> +};

How generic is this? In other words, you are adding a struct with the
generic name to the generic structure. I'd expect that it would be
common to several PHY generations.

> +
>  struct msm_dsi_phy {
> struct platform_device *pdev;
> void __iomem *base;
> @@ -98,6 +113,7 @@ struct msm_dsi_phy {
>
> struct msm_dsi_dphy_timing timing;
> const struct msm_dsi_phy_cfg *cfg;
> +   struct msm_dsi_phy_tuning_cfg tuning_cfg;
>
> enum msm_dsi_phy_usecase usecase;
> bool regulator_ldo_mode;
> --
> 2.7.4
>


-- 
With best wishes
Dmitry


Re: [v2 1/3] dt-bindings: msm/dsi: Add 10nm dsi phy tuning properties

2022-01-10 Thread Dmitry Baryshkov
On Mon, 10 Jan 2022 at 15:56, Rajeev Nandan  wrote:
>
> In most cases, the default values of DSI PHY tuning registers should be
> sufficient as they are fully optimized. However, in some cases where
> extreme board parasitics cause the eye shape to degrade, the override
> bits can be used to improve the signal quality.
>
> The general guidelines for DSI PHY tuning include:
> - High and moderate data rates may benefit from the drive strength and
>   drive level tuning.
> - Drive strength tuning will affect the output impedance and may be used
>   for matching optimization.
> - Drive level tuning will affect the output levels without affecting the
>   impedance.
>
> The clock and data lanes have a calibration circuitry feature. The drive
> strength tuning can be done by adjusting rescode offset for hstop/hsbot,
> and the drive level tuning can be done by adjusting the LDO output level
> for the HSTX drive.
>
> Signed-off-by: Rajeev Nandan 
> ---
>
> Changes in v2:
>  - More details in the commit text (Stephen Boyd)
>  - Use human understandable values (Stephen Boyd, Dmitry Baryshkov)
>  - Do not take values that are going to be unused (Dmitry Baryshkov)
>
>  .../bindings/display/msm/dsi-phy-10nm.yaml | 33 
> ++
>  1 file changed, 33 insertions(+)
>
> diff --git a/Documentation/devicetree/bindings/display/msm/dsi-phy-10nm.yaml 
> b/Documentation/devicetree/bindings/display/msm/dsi-phy-10nm.yaml
> index 4399715..d0eb8f6 100644
> --- a/Documentation/devicetree/bindings/display/msm/dsi-phy-10nm.yaml
> +++ b/Documentation/devicetree/bindings/display/msm/dsi-phy-10nm.yaml
> @@ -35,6 +35,35 @@ properties:
>Connected to DSI0_MIPI_DSI_PLL_VDDA0P9 pin for sc7180 target and
>connected to VDDA_MIPI_DSI_0_PLL_0P9 pin for sdm845 target

Generic note:
I think these properties should carry a "qcom," prefix.

>
> +  phy-rescode-offset-top:
> +$ref: /schemas/types.yaml#/definitions/uint8-array
> +minItems: 5
> +maxItems: 5
> +description:
> +  Integer array of offset for pull-up legs rescode for all five lanes.
> +  To offset the drive strength from the calibrated value in an increasing
> +  or decreasing manner, use 6 bit two’s complement values.

dtc should support negative values, google hints that <(-2)> should work.

> +
> +  phy-rescode-offset-bot:
> +$ref: /schemas/types.yaml#/definitions/uint8-array
> +minItems: 5
> +maxItems: 5
> +description:
> +  Integer array of offset for pull-down legs rescode for all five lanes.
> +  To offset the drive strength from the calibrated value in an increasing
> +  or decreasing manner, use 6 bit two’s complement values.
> +
> +  phy-drive-ldo-level:
> +$ref: /schemas/types.yaml#/definitions/uint8
> +minimum: 0
> +maximum: 7
> +description:
> +  The PHY LDO has an amplitude tuning feature to adjust the LDO output
> +  for the HSTX drive. To offset the drive level from the default value,
> +  supported levels are with the following mapping:
> +  0 = 375mV, 1 = 400mV, 2 = 425mV, 3 = 450mV, 4 = 475mV, 5 = 500mV,
> +  6 = 500mV, 7 = 500mV

No encoding please. Specify the values in the dts and convert them
into the register values in the driver.
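
A sketch of what that could look like on the driver side. The
encodings here are assumptions taken from the description above (25 mV
LDO steps starting at 375 mV, 6-bit two's complement rescode offsets),
not the actual msm PHY code:

#include <stdint.h>

/* Map an LDO drive level in millivolts (375..500) to a register index. */
static uint8_t ldo_mv_to_reg(unsigned int mv)
{
        if (mv < 375)
                mv = 375;
        if (mv > 500)
                mv = 500;
        return (mv - 375) / 25; /* 375 mV -> 0, ..., 500 mV -> 5 */
}

/* Encode a signed rescode offset (-32..31) as a 6-bit two's complement field. */
static uint8_t rescode_offset_to_reg(int offset)
{
        if (offset < -32)
                offset = -32;
        if (offset > 31)
                offset = 31;
        return offset & 0x3f;
}

With that, the DT can carry human-readable millivolt values and signed
offsets directly, and no register encoding leaks into the binding.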

> +
>  required:
>- compatible
>- reg
> @@ -64,5 +93,9 @@ examples:
>   clocks = <&dispcc DISP_CC_MDSS_AHB_CLK>,
><&rpmhcc RPMH_CXO_CLK>;
>   clock-names = "iface", "ref";
> +
> + phy-rescode-offset-top = /bits/ 8 <0x0 0x0 0x0 0x0 0x0>;
> + phy-rescode-offset-bot = /bits/ 8 <0x0 0x0 0x0 0x0 0x0>;
> + phy-drive-ldo-level = /bits/ 8 <1>;

--
With best wishes
Dmitry


Re: [PATCH v6 5/6] drm/i915: Asynchronous migration selftest

2022-01-10 Thread Matthew Auld

On 07/01/2022 14:23, Thomas Hellström wrote:

Add a selftest to exercise asynchronous migration and -unbinding.
Extend the gem_migrate selftest to perform the migrations while
depending on a spinner and a bound vma set up on the migrated
buffer object.

Signed-off-by: Thomas Hellström 
---
  drivers/gpu/drm/i915/gem/i915_gem_object.c|  12 ++
  drivers/gpu/drm/i915/gem/i915_gem_object.h|   3 +
  .../drm/i915/gem/selftests/i915_gem_migrate.c | 192 --
  3 files changed, 192 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c 
b/drivers/gpu/drm/i915/gem/i915_gem_object.c
index d87b508b59b1..1a9e1f940a7d 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c
@@ -756,6 +756,18 @@ i915_gem_object_get_moving_fence(struct 
drm_i915_gem_object *obj)
return dma_fence_get(i915_gem_to_ttm(obj)->moving);
  }
  
+void i915_gem_object_set_moving_fence(struct drm_i915_gem_object *obj,

+ struct dma_fence *fence)
+{
+   struct dma_fence **moving = &i915_gem_to_ttm(obj)->moving;
+
+   if (*moving == fence)
+   return;
+
+   dma_fence_put(*moving);
+   *moving = dma_fence_get(fence);
+}
+
  /**
   * i915_gem_object_wait_moving_fence - Wait for the object's moving fence if 
any
   * @obj: The object whose moving fence to wait for.
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.h 
b/drivers/gpu/drm/i915/gem/i915_gem_object.h
index f66d46882ea7..1d178236 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.h
@@ -524,6 +524,9 @@ i915_gem_object_finish_access(struct drm_i915_gem_object 
*obj)
  struct dma_fence *
  i915_gem_object_get_moving_fence(struct drm_i915_gem_object *obj);
  
+void i915_gem_object_set_moving_fence(struct drm_i915_gem_object *obj,

+ struct dma_fence *fence);
+
  int i915_gem_object_wait_moving_fence(struct drm_i915_gem_object *obj,
  bool intr);
  
diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_migrate.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_migrate.c

index ecb691c81d1e..d534141b2cf7 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_migrate.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_migrate.c
@@ -4,8 +4,13 @@
   */
  
  #include "gt/intel_migrate.h"

+#include "gt/intel_gpu_commands.h"
  #include "gem/i915_gem_ttm_move.h"
  
+#include "i915_deps.h"

+
+#include "selftests/igt_spinner.h"
+
  static int igt_fill_check_buffer(struct drm_i915_gem_object *obj,
 bool fill)
  {
@@ -101,7 +106,8 @@ static int igt_same_create_migrate(void *arg)
  }
  
  static int lmem_pages_migrate_one(struct i915_gem_ww_ctx *ww,

- struct drm_i915_gem_object *obj)
+ struct drm_i915_gem_object *obj,
+ struct i915_vma *vma)
  {
int err;
  
@@ -109,6 +115,24 @@ static int lmem_pages_migrate_one(struct i915_gem_ww_ctx *ww,

if (err)
return err;
  
+	if (vma) {

+   err = i915_vma_pin_ww(vma, ww, obj->base.size, 0,
+ 0UL | PIN_OFFSET_FIXED |
+ PIN_USER);
+   if (err) {
+   if (err != -EINTR && err != -ERESTARTSYS &&
+   err != -EDEADLK)
+   pr_err("Failed to pin vma.\n");
+   return err;
+   }
+
+   i915_vma_unpin(vma);
+   }
+
+   /*
+* Migration will implicitly unbind (asynchronously) any bound
+* vmas.
+*/
if (i915_gem_object_is_lmem(obj)) {
err = i915_gem_object_migrate(obj, ww, INTEL_REGION_SMEM);
if (err) {
@@ -149,11 +173,15 @@ static int lmem_pages_migrate_one(struct i915_gem_ww_ctx 
*ww,
return err;
  }
  
-static int igt_lmem_pages_migrate(void *arg)

+static int __igt_lmem_pages_migrate(struct intel_gt *gt,
+   struct i915_address_space *vm,
+   struct i915_deps *deps,
+   struct igt_spinner *spin,
+   struct dma_fence *spin_fence)
  {
-   struct intel_gt *gt = arg;
struct drm_i915_private *i915 = gt->i915;
struct drm_i915_gem_object *obj;
+   struct i915_vma *vma = NULL;
struct i915_gem_ww_ctx ww;
struct i915_request *rq;
int err;
@@ -165,6 +193,14 @@ static int igt_lmem_pages_migrate(void *arg)
if (IS_ERR(obj))
return PTR_ERR(obj);
  
+	if (vm) {

+   vma = i915_vma_instance(obj, vm, NULL);
+   if (IS_ERR(vma)) {
+   err = PTR_ERR(vma);
+   goto out_put;
+   }
+

Re: [PATCH v6 4/6] drm/i915: Use vma resources for async unbinding

2022-01-10 Thread Matthew Auld

On 07/01/2022 14:23, Thomas Hellström wrote:

Implement async (non-blocking) unbinding by not syncing the vma before
calling unbind on the vma_resource.
Add the resulting unbind fence to the object's dma_resv from where it is
picked up by the ttm migration code.
Ideally these unbind fences should be coalesced with the migration blit
fence to avoid stalling the migration blit waiting for unbind, as they
can certainly go on in parallel, but since we don't yet have a
reasonable data structure to use to coalesce fences and attach the
resulting fence to a timeline, we defer that for now.

Note that with async unbinding, even while the unbind waits for the
preceding bind to complete before unbinding, the vma itself might have been
destroyed in the process, clearing the vma pages. Therefore we can
only allow async unbinding if we have a refcounted sg-list and keep a
refcount on that for the vma resource pages to stay intact until
binding occurs. If this condition is not met, a request for an async
unbind is diverted to a sync unbind.
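
Condensed, with invented helper names, the fallback rule reads roughly:

/* Sketch only; the real logic lives in i915_vma.c / i915_vma_resource.c. */
static struct dma_fence *vma_unbind_maybe_async(struct i915_vma *vma,
                                                bool async)
{
        /*
         * Async unbind is only safe when the vma resource holds its own
         * reference on the backing sg-list; otherwise the pages could
         * disappear under the pending unbind, so fall back to sync.
         */
        if (async && vma_res_has_refcounted_pages(vma))
                return vma_unbind_async(vma);   /* fence signals on completion */

        vma_unbind_sync(vma);                   /* waits for idle, then unbinds */
        return NULL;
}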

v2:
- Use a separate kmem_cache for vma resources for now to isolate their
   memory allocation and aid debugging.
- Move the check for vm closed to the actual unbinding thread. Regardless
   of whether the vm is closed, we need the unbind fence to properly wait
   for capture.
- Clear vma_res::vm on unbind and update its documentation.
v4:
- Take cache coloring into account when searching for vma resources
   pending unbind. (Matthew Auld)
v5:
- Fix timeout and error check in i915_vma_resource_bind_dep_await().
- Avoid taking a reference on the object for async binding if
   async unbind capable.
- Fix braces around a single-line if statement.
v6:
- Fix up the cache coloring adjustment. (Kernel test robot )
- Don't allow async unbinding if the vma_res pages are not the same as
   the object pages.

Signed-off-by: Thomas Hellström 
---
  drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c |  11 +-
  drivers/gpu/drm/i915/gt/intel_ggtt.c |   2 +-
  drivers/gpu/drm/i915/gt/intel_gtt.c  |   4 +
  drivers/gpu/drm/i915/gt/intel_gtt.h  |   3 +
  drivers/gpu/drm/i915/i915_drv.h  |   1 +
  drivers/gpu/drm/i915/i915_gem.c  |  12 +-
  drivers/gpu/drm/i915/i915_module.c   |   3 +
  drivers/gpu/drm/i915/i915_vma.c  | 205 +--
  drivers/gpu/drm/i915/i915_vma.h  |   3 +-
  drivers/gpu/drm/i915/i915_vma_resource.c | 354 +--
  drivers/gpu/drm/i915/i915_vma_resource.h |  48 +++
  11 files changed, 579 insertions(+), 67 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c 
b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
index 8653855d808b..1de306c03aaf 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm_move.c
@@ -142,7 +142,16 @@ int i915_ttm_move_notify(struct ttm_buffer_object *bo)
struct drm_i915_gem_object *obj = i915_ttm_to_gem(bo);
int ret;
  
-	ret = i915_gem_object_unbind(obj, I915_GEM_OBJECT_UNBIND_ACTIVE);

+   /*
+* Note: The async unbinding here will actually transform the
+* blocking wait for unbind into a wait before finally submitting
+* evict / migration blit and thus stall the migration timeline
+* which may not be good for overall throughput. We should make
+* sure we await the unbind fences *after* the migration blit
+* instead of *before* as we currently do.
+*/
+   ret = i915_gem_object_unbind(obj, I915_GEM_OBJECT_UNBIND_ACTIVE |
+I915_GEM_OBJECT_UNBIND_ASYNC);
if (ret)
return ret;
  
diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c b/drivers/gpu/drm/i915/gt/intel_ggtt.c

index e49b6250c4b7..a1b2761bc16e 100644
--- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
@@ -142,7 +142,7 @@ void i915_ggtt_suspend_vm(struct i915_address_space *vm)
continue;
  
  		if (!i915_vma_is_bound(vma, I915_VMA_GLOBAL_BIND)) {

-   __i915_vma_evict(vma);
+   __i915_vma_evict(vma, false);
drm_mm_remove_node(&vma->node);
}
}
diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c 
b/drivers/gpu/drm/i915/gt/intel_gtt.c
index a94be0306464..46be4197b93f 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.c
@@ -161,6 +161,9 @@ static void __i915_vm_release(struct work_struct *work)
struct i915_address_space *vm =
container_of(work, struct i915_address_space, release_work);
  
+	/* Synchronize async unbinds. */

+   i915_vma_resource_bind_dep_sync_all(vm);
+
vm->cleanup(vm);
i915_address_space_fini(vm);
  
@@ -189,6 +192,7 @@ void i915_address_space_init(struct i915_address_space *vm, int subclass)

if (!kref_read(&vm->resv_ref))
kref_init(&vm->resv_ref);
  
+

[PATCH v4 5/7] drm/i915: Remove assert_object_held_shared

2022-01-10 Thread Maarten Lankhorst
This duct tape workaround is no longer required: unbind and destroy are
fixed to take the obj->resv mutex before destroying and obj->mm.lock has
been removed, always requiring obj->resv as well.

Signed-off-by: Maarten Lankhorst 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/i915/gem/i915_gem_object.c  |  4 ++--
 drivers/gpu/drm/i915/gem/i915_gem_object.h  | 14 --
 drivers/gpu/drm/i915/gem/i915_gem_pages.c   | 10 +-
 drivers/gpu/drm/i915/gem/i915_gem_userptr.c |  2 +-
 drivers/gpu/drm/i915/i915_vma.c |  6 +++---
 5 files changed, 11 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.c 
b/drivers/gpu/drm/i915/gem/i915_gem_object.c
index d87b508b59b1..fd34b1a115c4 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.c
@@ -550,7 +550,7 @@ bool i915_gem_object_has_struct_page(const struct 
drm_i915_gem_object *obj)
 #ifdef CONFIG_LOCKDEP
if (IS_DGFX(to_i915(obj->base.dev)) &&
i915_gem_object_evictable((void __force *)obj))
-   assert_object_held_shared(obj);
+   assert_object_held(obj);
 #endif
return obj->mem_flags & I915_BO_FLAG_STRUCT_PAGE;
 }
@@ -569,7 +569,7 @@ bool i915_gem_object_has_iomem(const struct 
drm_i915_gem_object *obj)
 #ifdef CONFIG_LOCKDEP
if (IS_DGFX(to_i915(obj->base.dev)) &&
i915_gem_object_evictable((void __force *)obj))
-   assert_object_held_shared(obj);
+   assert_object_held(obj);
 #endif
return obj->mem_flags & I915_BO_FLAG_IOMEM;
 }
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object.h 
b/drivers/gpu/drm/i915/gem/i915_gem_object.h
index bc448f895ae8..c1cdfaf2d1e3 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_object.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object.h
@@ -157,20 +157,6 @@ i915_gem_object_put(struct drm_i915_gem_object *obj)
 
 #define assert_object_held(obj) dma_resv_assert_held((obj)->base.resv)
 
-/*
- * If more than one potential simultaneous locker, assert held.
- */
-static inline void assert_object_held_shared(const struct drm_i915_gem_object 
*obj)
-{
-   /*
-* Note mm list lookup is protected by
-* kref_get_unless_zero().
-*/
-   if (IS_ENABLED(CONFIG_LOCKDEP) &&
-   kref_read(&obj->base.refcount) > 0)
-   assert_object_held(obj);
-}
-
 static inline int __i915_gem_object_lock(struct drm_i915_gem_object *obj,
 struct i915_gem_ww_ctx *ww,
 bool intr)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_pages.c 
b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
index 7d2211fbe548..a1a785068779 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_pages.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_pages.c
@@ -19,7 +19,7 @@ void __i915_gem_object_set_pages(struct drm_i915_gem_object 
*obj,
bool shrinkable;
int i;
 
-   assert_object_held_shared(obj);
+   assert_object_held(obj);
 
if (i915_gem_object_is_volatile(obj))
obj->mm.madv = I915_MADV_DONTNEED;
@@ -95,7 +95,7 @@ int i915_gem_object_get_pages(struct drm_i915_gem_object 
*obj)
struct drm_i915_private *i915 = to_i915(obj->base.dev);
int err;
 
-   assert_object_held_shared(obj);
+   assert_object_held(obj);
 
if (unlikely(obj->mm.madv != I915_MADV_WILLNEED)) {
drm_dbg(&i915->drm,
@@ -122,7 +122,7 @@ int __i915_gem_object_get_pages(struct drm_i915_gem_object 
*obj)
 
assert_object_held(obj);
 
-   assert_object_held_shared(obj);
+   assert_object_held(obj);
 
if (unlikely(!i915_gem_object_has_pages(obj))) {
GEM_BUG_ON(i915_gem_object_has_pinned_pages(obj));
@@ -191,7 +191,7 @@ __i915_gem_object_unset_pages(struct drm_i915_gem_object 
*obj)
 {
struct sg_table *pages;
 
-   assert_object_held_shared(obj);
+   assert_object_held(obj);
 
pages = fetch_and_zero(&obj->mm.pages);
if (IS_ERR_OR_NULL(pages))
@@ -222,7 +222,7 @@ int __i915_gem_object_put_pages(struct drm_i915_gem_object 
*obj)
return -EBUSY;
 
/* May be called by shrinker from within get_pages() (on another bo) */
-   assert_object_held_shared(obj);
+   assert_object_held(obj);
 
i915_gem_object_release_mmap_offset(obj);
 
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
index 3cc01c30dd62..8a0da441225b 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c
@@ -109,7 +109,7 @@ static void i915_gem_object_userptr_drop_ref(struct 
drm_i915_gem_object *obj)
 {
struct page **pvec = NULL;
 
-   assert_object_held_shared(obj);
+   assert_object_held(obj);
 
if (!--obj->userptr.page_ref) {
pvec = obj->userptr.pvec;
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i

[PATCH v4 6/7] drm/i915: Remove support for unlocked i915_vma unbind

2022-01-10 Thread Maarten Lankhorst
Now that we require the object lock for all ops, some code handling
race conditions can be removed.

This is required to not take short-term pins inside execbuf.
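
After this patch the remaining fast path is just the lock-free loop;
condensed from the diff below, it is roughly:

static bool try_qad_pin(struct i915_vma *vma, unsigned int flags)
{
        unsigned int bound;

        bound = atomic_read(&vma->flags);
        do {
                if (unlikely(flags & ~bound))
                        return false;   /* not bound the way the caller needs */

                if (unlikely(bound & (I915_VMA_OVERFLOW | I915_VMA_ERROR)))
                        return false;

                GEM_BUG_ON(((bound + 1) & I915_VMA_PIN_MASK) == 0);
        } while (!atomic_try_cmpxchg(&vma->flags, &bound, bound + 1));

        return true;    /* pin count bumped atomically, no lock taken */
}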

Signed-off-by: Maarten Lankhorst 
Acked-by: Niranjana Vishwanathapura 
---
 drivers/gpu/drm/i915/i915_vma.c | 55 +
 1 file changed, 8 insertions(+), 47 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 11e10e0d628c..8859feb7d131 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -777,7 +777,6 @@ i915_vma_detach(struct i915_vma *vma)
 static bool try_qad_pin(struct i915_vma *vma, unsigned int flags)
 {
unsigned int bound;
-   bool pinned = true;
 
bound = atomic_read(&vma->flags);
do {
@@ -787,34 +786,10 @@ static bool try_qad_pin(struct i915_vma *vma, unsigned 
int flags)
if (unlikely(bound & (I915_VMA_OVERFLOW | I915_VMA_ERROR)))
return false;
 
-   if (!(bound & I915_VMA_PIN_MASK))
-   goto unpinned;
-
GEM_BUG_ON(((bound + 1) & I915_VMA_PIN_MASK) == 0);
} while (!atomic_try_cmpxchg(&vma->flags, &bound, bound + 1));
 
return true;
-
-unpinned:
-   /*
-* If pin_count==0, but we are bound, check under the lock to avoid
-* racing with a concurrent i915_vma_unbind().
-*/
-   mutex_lock(&vma->vm->mutex);
-   do {
-   if (unlikely(bound & (I915_VMA_OVERFLOW | I915_VMA_ERROR))) {
-   pinned = false;
-   break;
-   }
-
-   if (unlikely(flags & ~bound)) {
-   pinned = false;
-   break;
-   }
-   } while (!atomic_try_cmpxchg(&vma->flags, &bound, bound + 1));
-   mutex_unlock(&vma->vm->mutex);
-
-   return pinned;
 }
 
 static struct scatterlist *
@@ -1153,7 +1128,6 @@ static int
 __i915_vma_get_pages(struct i915_vma *vma)
 {
struct sg_table *pages;
-   int ret;
 
/*
 * The vma->pages are only valid within the lifespan of the borrowed
@@ -1186,18 +1160,16 @@ __i915_vma_get_pages(struct i915_vma *vma)
break;
}
 
-   ret = 0;
if (IS_ERR(pages)) {
-   ret = PTR_ERR(pages);
-   pages = NULL;
drm_err(&vma->vm->i915->drm,
-   "Failed to get pages for VMA view type %u (%d)!\n",
-   vma->ggtt_view.type, ret);
+   "Failed to get pages for VMA view type %u (%ld)!\n",
+   vma->ggtt_view.type, PTR_ERR(pages));
+   return PTR_ERR(pages);
}
 
vma->pages = pages;
 
-   return ret;
+   return 0;
 }
 
 I915_SELFTEST_EXPORT int i915_vma_get_pages(struct i915_vma *vma)
@@ -1229,25 +1201,14 @@ I915_SELFTEST_EXPORT int i915_vma_get_pages(struct 
i915_vma *vma)
 static void __vma_put_pages(struct i915_vma *vma, unsigned int count)
 {
/* We allocate under vma_get_pages, so beware the shrinker */
-   struct sg_table *pages = READ_ONCE(vma->pages);
-
GEM_BUG_ON(atomic_read(&vma->pages_count) < count);
 
if (atomic_sub_return(count, &vma->pages_count) == 0) {
-   /*
-* The atomic_sub_return is a read barrier for the READ_ONCE of
-* vma->pages above.
-*
-* READ_ONCE is safe because this is either called from the same
-* function (i915_vma_pin_ww), or guarded by vma->vm->mutex.
-*
-* TODO: We're leaving vma->pages dangling, until vma->obj->resv
-* lock is required.
-*/
-   if (pages != vma->obj->mm.pages) {
-   sg_free_table(pages);
-   kfree(pages);
+   if (vma->pages != vma->obj->mm.pages) {
+   sg_free_table(vma->pages);
+   kfree(vma->pages);
}
+   vma->pages = NULL;
 
i915_gem_object_unpin_pages(vma->obj);
}
-- 
2.34.1



[PATCH v4 7/7] drm/i915: Remove short-term pins from execbuf, v6.

2022-01-10 Thread Maarten Lankhorst
Add a flag, PIN_VALIDATE, to indicate that we don't need to pin and are
only protected by the object lock.

This removes the need to unpin, which is done by just releasing the
lock.

eb_reserve is slightly reworked for readability, but the same steps
are still done:
- First pass pins with NONBLOCK.
- Second pass unbinds all objects first, then pins.
- Third pass is only called when not all objects are softpinned, and
  unbinds all objects, then calls i915_gem_evict_vm(), then pins.
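
Schematically, the reworked flow is the loop below (eb_unbind() is from
the diff; eb_pin_unbound() is an invented stand-in for the pinning
helper, so treat this as a sketch rather than the exact code):

static int eb_reserve_sketch(struct i915_execbuffer *eb)
{
        unsigned int pass;
        int err;

        for (pass = 0; pass <= 2; pass++) {
                if (pass == 1)
                        /* second pass: unbind all, but keep softpinned slots */
                        eb_unbind(eb, false);

                if (pass == 2) {
                        /* third pass: unbind everything and evict the vm */
                        eb_unbind(eb, true);
                        mutex_lock(&eb->context->vm->mutex);
                        err = i915_gem_evict_vm(eb->context->vm, &eb->ww);
                        mutex_unlock(&eb->context->vm->mutex);
                        if (err)
                                return err;
                }

                /* (re)pin; only the first pass uses PIN_NONBLOCK */
                err = eb_pin_unbound(eb, pass == 0 ? PIN_USER | PIN_NONBLOCK
                                                   : PIN_USER);
                if (err != -ENOSPC)
                        return err;
        }

        return -ENOSPC;
}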

Changes since v1:
- Split out eb_reserve() into separate functions for readability.
Changes since v2:
- Make batch buffer mappable on platforms where only GGTT is available,
  to prevent moving the batch buffer during relocations.
Changes since v3:
- Preserve current behavior for batch buffer, instead be cautious when
  calling i915_gem_object_ggtt_pin_ww, and re-use the current batch vma
  if it's inside ggtt and map-and-fenceable.
- Remove impossible condition check from eb_reserve. (Matt)
Changes since v5:
- Do not even temporarily pin, just call i915_gem_evict_vm() and mark
  all vma's as unpinned.

Signed-off-by: Maarten Lankhorst 
Reviewed-by: Matthew Auld 
---
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 220 +-
 drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c  |   1 -
 drivers/gpu/drm/i915/i915_gem_gtt.h   |   1 +
 drivers/gpu/drm/i915/i915_vma.c   |  24 +-
 4 files changed, 128 insertions(+), 118 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index da35a143af36..cfff194d90e7 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -441,7 +441,7 @@ eb_pin_vma(struct i915_execbuffer *eb,
else
pin_flags = entry->offset & PIN_OFFSET_MASK;
 
-   pin_flags |= PIN_USER | PIN_NOEVICT | PIN_OFFSET_FIXED;
+   pin_flags |= PIN_USER | PIN_NOEVICT | PIN_OFFSET_FIXED | PIN_VALIDATE;
if (unlikely(ev->flags & EXEC_OBJECT_NEEDS_GTT))
pin_flags |= PIN_GLOBAL;
 
@@ -459,17 +459,15 @@ eb_pin_vma(struct i915_execbuffer *eb,
 entry->pad_to_size,
 entry->alignment,
 eb_pin_flags(entry, ev->flags) |
-PIN_USER | PIN_NOEVICT);
+PIN_USER | PIN_NOEVICT | 
PIN_VALIDATE);
if (unlikely(err))
return err;
}
 
if (unlikely(ev->flags & EXEC_OBJECT_NEEDS_FENCE)) {
err = i915_vma_pin_fence(vma);
-   if (unlikely(err)) {
-   i915_vma_unpin(vma);
+   if (unlikely(err))
return err;
-   }
 
if (vma->fence)
ev->flags |= __EXEC_OBJECT_HAS_FENCE;
@@ -485,13 +483,9 @@ eb_pin_vma(struct i915_execbuffer *eb,
 static inline void
 eb_unreserve_vma(struct eb_vma *ev)
 {
-   if (!(ev->flags & __EXEC_OBJECT_HAS_PIN))
-   return;
-
if (unlikely(ev->flags & __EXEC_OBJECT_HAS_FENCE))
__i915_vma_unpin_fence(ev->vma);
 
-   __i915_vma_unpin(ev->vma);
ev->flags &= ~__EXEC_OBJECT_RESERVED;
 }
 
@@ -684,10 +678,8 @@ static int eb_reserve_vma(struct i915_execbuffer *eb,
 
if (unlikely(ev->flags & EXEC_OBJECT_NEEDS_FENCE)) {
err = i915_vma_pin_fence(vma);
-   if (unlikely(err)) {
-   i915_vma_unpin(vma);
+   if (unlikely(err))
return err;
-   }
 
if (vma->fence)
ev->flags |= __EXEC_OBJECT_HAS_FENCE;
@@ -699,85 +691,95 @@ static int eb_reserve_vma(struct i915_execbuffer *eb,
return 0;
 }
 
-static int eb_reserve(struct i915_execbuffer *eb)
+static bool eb_unbind(struct i915_execbuffer *eb, bool force)
 {
const unsigned int count = eb->buffer_count;
-   unsigned int pin_flags = PIN_USER | PIN_NONBLOCK;
+   unsigned int i;
struct list_head last;
+   bool unpinned = false;
+
+   /* Resort *all* the objects into priority order */
+   INIT_LIST_HEAD(&eb->unbound);
+   INIT_LIST_HEAD(&last);
+
+   for (i = 0; i < count; i++) {
+   struct eb_vma *ev = &eb->vma[i];
+   unsigned int flags = ev->flags;
+
+   if (!force && flags & EXEC_OBJECT_PINNED &&
+   flags & __EXEC_OBJECT_HAS_PIN)
+   continue;
+
+   unpinned = true;
+   eb_unreserve_vma(ev);
+
+   if (flags & EXEC_OBJECT_PINNED)
+   /* Pinned must have their slot */
+   list_add(&ev->bind_link, &eb->unbound);
+   else if (flags & __EXEC_OBJECT_NEEDS_MAP)
+   /* Map require the lowest 256MiB (apert

[PATCH v4 3/7] drm/i915: Add object locking to i915_gem_evict_for_node and i915_gem_evict_something

2022-01-10 Thread Maarten Lankhorst
Because we will start to require the obj->resv lock for unbinding,
ensure these shrinker functions also take the lock.

This requires some function signature changes, to ensure that the
ww context is passed around, but is mostly straightforward.

Previously this was split up into several patches, but reworking
should allow for easier bisection.

Signed-off-by: Maarten Lankhorst 
---
 drivers/gpu/drm/i915/gt/intel_ggtt.c  |  2 +-
 drivers/gpu/drm/i915/gt/selftest_hangcheck.c  |  2 +-
 drivers/gpu/drm/i915/gvt/aperture_gm.c|  2 +-
 drivers/gpu/drm/i915/i915_drv.h   |  2 ++
 drivers/gpu/drm/i915/i915_gem_evict.c | 34 +++
 drivers/gpu/drm/i915/i915_gem_gtt.c   |  8 +++--
 drivers/gpu/drm/i915/i915_gem_gtt.h   |  3 ++
 drivers/gpu/drm/i915/i915_vgpu.c  |  2 +-
 drivers/gpu/drm/i915/i915_vma.c   |  9 ++---
 .../gpu/drm/i915/selftests/i915_gem_evict.c   | 17 +-
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 14 
 11 files changed, 63 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c 
b/drivers/gpu/drm/i915/gt/intel_ggtt.c
index ab6c4322dc08..e416e1f12d1a 100644
--- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
@@ -504,7 +504,7 @@ static int ggtt_reserve_guc_top(struct i915_ggtt *ggtt)
GEM_BUG_ON(ggtt->vm.total <= GUC_GGTT_TOP);
size = ggtt->vm.total - GUC_GGTT_TOP;
 
-   ret = i915_gem_gtt_reserve(&ggtt->vm, &ggtt->uc_fw, size,
+   ret = i915_gem_gtt_reserve(&ggtt->vm, NULL, &ggtt->uc_fw, size,
   GUC_GGTT_TOP, I915_COLOR_UNEVICTABLE,
   PIN_NOEVICT);
if (ret)
diff --git a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c 
b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
index 15d63435ec4d..9c21b55b927b 100644
--- a/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
+++ b/drivers/gpu/drm/i915/gt/selftest_hangcheck.c
@@ -1382,7 +1382,7 @@ static int evict_vma(void *data)
complete(&arg->completion);
 
mutex_lock(&vm->mutex);
-   err = i915_gem_evict_for_node(vm, &evict, 0);
+   err = i915_gem_evict_for_node(vm, NULL, &evict, 0);
mutex_unlock(&vm->mutex);
 
return err;
diff --git a/drivers/gpu/drm/i915/gvt/aperture_gm.c 
b/drivers/gpu/drm/i915/gvt/aperture_gm.c
index 0d6d59871308..c08098a167e9 100644
--- a/drivers/gpu/drm/i915/gvt/aperture_gm.c
+++ b/drivers/gpu/drm/i915/gvt/aperture_gm.c
@@ -63,7 +63,7 @@ static int alloc_gm(struct intel_vgpu *vgpu, bool high_gm)
 
	mutex_lock(&gt->ggtt->vm.mutex);
	mmio_hw_access_pre(gt);
-   ret = i915_gem_gtt_insert(&gt->ggtt->vm, node,
+   ret = i915_gem_gtt_insert(&gt->ggtt->vm, NULL, node,
  size, I915_GTT_PAGE_SIZE,
  I915_COLOR_UNEVICTABLE,
  start, end, flags);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index ef121ddef418..88eb203a0742 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1722,11 +1722,13 @@ i915_gem_vm_lookup(struct drm_i915_file_private 
*file_priv, u32 id)
 
 /* i915_gem_evict.c */
 int __must_check i915_gem_evict_something(struct i915_address_space *vm,
+ struct i915_gem_ww_ctx *ww,
  u64 min_size, u64 alignment,
  unsigned long color,
  u64 start, u64 end,
  unsigned flags);
 int __must_check i915_gem_evict_for_node(struct i915_address_space *vm,
+struct i915_gem_ww_ctx *ww,
 struct drm_mm_node *node,
 unsigned int flags);
 int i915_gem_evict_vm(struct i915_address_space *vm,
diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c 
b/drivers/gpu/drm/i915/i915_gem_evict.c
index bfd66f539fc1..f502a617b35c 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -51,6 +51,7 @@ static int ggtt_flush(struct intel_gt *gt)
 
 static bool
 mark_free(struct drm_mm_scan *scan,
+ struct i915_gem_ww_ctx *ww,
  struct i915_vma *vma,
  unsigned int flags,
  struct list_head *unwind)
@@ -58,6 +59,9 @@ mark_free(struct drm_mm_scan *scan,
if (i915_vma_is_pinned(vma))
return false;
 
+   if (!i915_gem_object_trylock(vma->obj, ww))
+   return false;
+
list_add(&vma->evict_link, unwind);
return drm_mm_scan_add_block(scan, &vma->node);
 }
@@ -98,6 +102,7 @@ static bool defer_evict(struct i915_vma *vma)
  */
 int
 i915_gem_evict_something(struct i915_address_space *vm,
+struct i915_gem_ww_ctx *ww,
 u64 min_size, u64 al

[PATCH v4 1/7] drm/i915: Call i915_gem_evict_vm in vm_fault_gtt to prevent new ENOSPC errors, v2.

2022-01-10 Thread Maarten Lankhorst
Now that we cannot unbind and kill the currently locked object directly
because we're removing short-term pinning, we may have to unbind the
object from the GTT manually, using an i915_gem_evict_vm() call.

Changes since v1:
- Remove -ENOSPC warning, can still happen with concurrent mmaps
  where we can't unbind the other mmap because of the lock being held.
  This fixes the gem_mmap_gtt@cpuset tests.

Signed-off-by: Maarten Lankhorst 
---
 drivers/gpu/drm/i915/gem/i915_gem_mman.c | 17 +++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c 
b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
index 5ac2506f4ee8..4337f3c1400c 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
@@ -358,8 +358,21 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf)
vma = i915_gem_object_ggtt_pin_ww(obj, &ww, &view, 0, 
0, flags);
}
 
-   /* The entire mappable GGTT is pinned? Unexpected! */
-   GEM_BUG_ON(vma == ERR_PTR(-ENOSPC));
+   /*
+* The entire mappable GGTT is pinned? Unexpected!
+* Try to evict the object we locked too, as normally we skip it
+* due to lack of short term pinning inside execbuf.
+*/
+   if (vma == ERR_PTR(-ENOSPC)) {
+   ret = mutex_lock_interruptible(&ggtt->vm.mutex);
+   if (!ret) {
+   ret = i915_gem_evict_vm(&ggtt->vm);
+   mutex_unlock(&ggtt->vm.mutex);
+   }
+   if (ret)
+   goto err_reset;
+   vma = i915_gem_object_ggtt_pin_ww(obj, &ww, &view, 0, 
0, flags);
+   }
}
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
-- 
2.34.1



[PATCH v4 2/7] drm/i915: Add locking to i915_gem_evict_vm()

2022-01-10 Thread Maarten Lankhorst
i915_gem_evict_vm will need to be able to evict objects that are
locked by the current ctx. By testing if the current context already
locked the object, we can do this correctly. This allows us to
evict the entire vm even if we already hold some objects' locks.

Previously, this was spread over several commits, but it makes
more sense to commit the changes to i915_gem_evict_vm separately
from the changes to i915_gem_evict_something() and
i915_gem_evict_for_node().
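
The core of the change, condensed from the diff below: an object whose
resv is already locked by our ww context cannot be trylocked again, so
that case is detected explicitly and such vmas go on a separate
eviction list that must not be unlocked afterwards.

	if (ww && (dma_resv_locking_ctx(vma->obj->base.resv) == &ww->ctx)) {
		__i915_vma_pin(vma);
		list_add(&vma->evict_link, &locked_eviction_list);
		continue;
	}

	if (!i915_gem_object_trylock(vma->obj, ww))
		continue;	/* locked by someone else: skip this vma */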

Signed-off-by: Maarten Lankhorst 
---
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c|  2 +-
 drivers/gpu/drm/i915/gem/i915_gem_mman.c  |  2 +-
 drivers/gpu/drm/i915/i915_drv.h   |  3 +-
 drivers/gpu/drm/i915/i915_gem_evict.c | 30 +--
 drivers/gpu/drm/i915/i915_vma.c   |  7 -
 .../gpu/drm/i915/selftests/i915_gem_evict.c   | 10 +--
 6 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
index 5ecc85b96a3d..da35a143af36 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -766,7 +766,7 @@ static int eb_reserve(struct i915_execbuffer *eb)
case 1:
/* Too fragmented, unbind everything and retry */
mutex_lock(&eb->context->vm->mutex);
-   err = i915_gem_evict_vm(eb->context->vm);
+   err = i915_gem_evict_vm(eb->context->vm, &eb->ww);
mutex_unlock(&eb->context->vm->mutex);
if (err)
return err;
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_mman.c 
b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
index 4337f3c1400c..4afad1604a6a 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_mman.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_mman.c
@@ -366,7 +366,7 @@ static vm_fault_t vm_fault_gtt(struct vm_fault *vmf)
if (vma == ERR_PTR(-ENOSPC)) {
ret = mutex_lock_interruptible(&ggtt->vm.mutex);
if (!ret) {
-   ret = i915_gem_evict_vm(&ggtt->vm);
+   ret = i915_gem_evict_vm(&ggtt->vm, &ww);
mutex_unlock(&ggtt->vm.mutex);
}
if (ret)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 2f9336302e6c..ef121ddef418 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1729,7 +1729,8 @@ int __must_check i915_gem_evict_something(struct 
i915_address_space *vm,
 int __must_check i915_gem_evict_for_node(struct i915_address_space *vm,
 struct drm_mm_node *node,
 unsigned int flags);
-int i915_gem_evict_vm(struct i915_address_space *vm);
+int i915_gem_evict_vm(struct i915_address_space *vm,
+ struct i915_gem_ww_ctx *ww);
 
 /* i915_gem_internal.c */
 struct drm_i915_gem_object *
diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c 
b/drivers/gpu/drm/i915/i915_gem_evict.c
index 2b73ddb11c66..bfd66f539fc1 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -367,7 +367,7 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
  * To clarify: This is for freeing up virtual address space, not for freeing
  * memory in e.g. the shrinker.
  */
-int i915_gem_evict_vm(struct i915_address_space *vm)
+int i915_gem_evict_vm(struct i915_address_space *vm, struct i915_gem_ww_ctx 
*ww)
 {
int ret = 0;
 
@@ -388,24 +388,50 @@ int i915_gem_evict_vm(struct i915_address_space *vm)
do {
struct i915_vma *vma, *vn;
LIST_HEAD(eviction_list);
+   LIST_HEAD(locked_eviction_list);
 
list_for_each_entry(vma, &vm->bound_list, vm_link) {
if (i915_vma_is_pinned(vma))
continue;
 
+   /*
+* If we already own the lock, trylock fails. In case 
the resv
+* is shared among multiple objects, we still need the 
object ref.
+*/
+   if (ww && (dma_resv_locking_ctx(vma->obj->base.resv) == 
&ww->ctx)) {
+   __i915_vma_pin(vma);
+   list_add(&vma->evict_link, 
&locked_eviction_list);
+   continue;
+   }
+
+   if (!i915_gem_object_trylock(vma->obj, ww))
+   continue;
+
__i915_vma_pin(vma);
list_add(&vma->evict_link, &eviction_list);
}
-   if (list_empty(&eviction_list))
+   if (list_empty(&eviction_list) && 
list_empty(&locked_eviction_list))
 

[PATCH v4 0/7] drm/i915: Remove short term pins from execbuf by requiring lock to unbind.

2022-01-10 Thread Maarten Lankhorst
Previously, short term pinning in execbuf was required because i915_vma was
effectively independent from objects, and has its own refcount, locking,
lifetime rules and pinning.

This series removes the separate locking, by requiring vma->obj->resv to be
held when pinning and unbinding. This will also be required for VM_BIND work.
Some patches have already been merged, but this contains the remainder of
the conversion.

With the object lock required for pinning and unbinding, the lock is enough to prevent
unbinding when trying to pin with the lock held, for example in execbuf.

This makes binding/unbinding similar to ttm_bo_validate()'s use, which just
cares that an object is in a certain place, without pinning it in place.

Having the VMA be part of the gem bo removes a lot of the vma refcounting, and
makes i915_vma more a part of the bo, instead of its own floating object that
just happens to be associated with a bo. This is also required to make it more compatible
with TTM, and migration in general.

For future work, it makes things a lot simpler and clear. We want to end up
with i915_vma just being a specific mapping of the BO, just like is the
case in other drivers. Removing i915_vma->active is the next step there,
and makes it so that when the object is destroyed, its bindings are destroyed
(after idle), instead of the object being destroyed once its bindings are idle.
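
Schematically, the invariant the series establishes (PIN_VALIDATE is
added in patch 7; the rest uses the existing ww idiom, so this is a
sketch of usage, not new API):

	struct i915_gem_ww_ctx ww;
	int err;

	for_i915_gem_ww(&ww, err, true) {
		err = i915_gem_object_lock(obj, &ww);
		if (err)
			continue;

		/* the binding cannot be unbound while the lock is held */
		err = i915_vma_pin_ww(vma, &ww, size, 0, flags | PIN_VALIDATE);

		/*
		 * With PIN_VALIDATE there is nothing to unpin: dropping
		 * the lock at the end of the transaction releases it.
		 */
	}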

Maarten Lankhorst (7):
  drm/i915: Call i915_gem_evict_vm in vm_fault_gtt to prevent new ENOSPC
errors, v2.
  drm/i915: Add locking to i915_gem_evict_vm()
  drm/i915: Add object locking to i915_gem_evict_for_node and
i915_gem_evict_something
  drm/i915: Add i915_vma_unbind_unlocked, and take obj lock for
i915_vma_unbind, v2.
  drm/i915: Remove assert_object_held_shared
  drm/i915: Remove support for unlocked i915_vma unbind
  drm/i915: Remove short-term pins from execbuf, v6.

 drivers/gpu/drm/i915/display/intel_fb_pin.c   |   2 +-
 .../gpu/drm/i915/gem/i915_gem_execbuffer.c| 220 +-
 drivers/gpu/drm/i915/gem/i915_gem_mman.c  |  17 +-
 drivers/gpu/drm/i915/gem/i915_gem_object.c|   4 +-
 drivers/gpu/drm/i915/gem/i915_gem_object.h|  14 --
 drivers/gpu/drm/i915/gem/i915_gem_pages.c |  10 +-
 drivers/gpu/drm/i915/gem/i915_gem_userptr.c   |   2 +-
 .../gpu/drm/i915/gem/selftests/huge_pages.c   |   2 +-
 .../i915/gem/selftests/i915_gem_client_blt.c  |   2 +-
 .../drm/i915/gem/selftests/i915_gem_mman.c|   6 +
 drivers/gpu/drm/i915/gt/intel_ggtt.c  |  51 +++-
 drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c  |   1 -
 drivers/gpu/drm/i915/gt/selftest_hangcheck.c  |   2 +-
 drivers/gpu/drm/i915/gvt/aperture_gm.c|   2 +-
 drivers/gpu/drm/i915/i915_drv.h   |   5 +-
 drivers/gpu/drm/i915/i915_gem.c   |   2 +
 drivers/gpu/drm/i915/i915_gem_evict.c |  64 -
 drivers/gpu/drm/i915/i915_gem_gtt.c   |   8 +-
 drivers/gpu/drm/i915/i915_gem_gtt.h   |   4 +
 drivers/gpu/drm/i915/i915_vgpu.c  |   2 +-
 drivers/gpu/drm/i915/i915_vma.c   | 122 +-
 drivers/gpu/drm/i915/i915_vma.h   |   1 +
 .../gpu/drm/i915/selftests/i915_gem_evict.c   |  27 ++-
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c |  36 +--
 drivers/gpu/drm/i915/selftests/i915_vma.c |   8 +-
 25 files changed, 361 insertions(+), 253 deletions(-)

-- 
2.34.1



[PATCH v4 4/7] drm/i915: Add i915_vma_unbind_unlocked, and take obj lock for i915_vma_unbind, v2.

2022-01-10 Thread Maarten Lankhorst
We want to remove more members of i915_vma, which requires the locking to be
held more often.

Start requiring gem object lock for i915_vma_unbind, as it's one of the
callers that may unpin pages.

Some special care is needed when evicting, because the last reference to the
object may be held by the VMA, so after __i915_vma_unbind, vma may be garbage,
and we need to cache vma->obj before unlocking.
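
The eviction pattern that falls out of this, condensed into a sketch
(the cached object pointer is what stays safe to touch):

	struct drm_i915_gem_object *obj = vma->obj;	/* cache before unbind */

	__i915_vma_unbind(vma);
	/* vma may be garbage from here on; only the cached obj is safe */
	i915_gem_object_unlock(obj);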

Changes since v1:
- Make trylock failing a WARN. (Matt)
- Remove double i915_vma_wait_for_bind() (Matt)
- Move atomic_set to right before mutex_unlock(), to make it more clear
  they belong together. (Matt)

Signed-off-by: Maarten Lankhorst 
Reviewed-by: Matthew Auld 
---
 drivers/gpu/drm/i915/display/intel_fb_pin.c   |  2 +-
 .../gpu/drm/i915/gem/selftests/huge_pages.c   |  2 +-
 .../i915/gem/selftests/i915_gem_client_blt.c  |  2 +-
 .../drm/i915/gem/selftests/i915_gem_mman.c|  6 +++
 drivers/gpu/drm/i915/gt/intel_ggtt.c  | 49 ---
 drivers/gpu/drm/i915/i915_gem.c   |  2 +
 drivers/gpu/drm/i915/i915_vma.c   | 27 +-
 drivers/gpu/drm/i915/i915_vma.h   |  1 +
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 22 -
 drivers/gpu/drm/i915/selftests/i915_vma.c |  8 +--
 10 files changed, 95 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/i915/display/intel_fb_pin.c 
b/drivers/gpu/drm/i915/display/intel_fb_pin.c
index 31c15e5fca95..9c555f6d1958 100644
--- a/drivers/gpu/drm/i915/display/intel_fb_pin.c
+++ b/drivers/gpu/drm/i915/display/intel_fb_pin.c
@@ -47,7 +47,7 @@ intel_pin_fb_obj_dpt(struct drm_framebuffer *fb,
goto err;
 
if (i915_vma_misplaced(vma, 0, alignment, 0)) {
-   ret = i915_vma_unbind(vma);
+   ret = i915_vma_unbind_unlocked(vma);
if (ret) {
vma = ERR_PTR(ret);
goto err;
diff --git a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c 
b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c
index 11f0aa65f8a3..b14c4e0a58d8 100644
--- a/drivers/gpu/drm/i915/gem/selftests/huge_pages.c
+++ b/drivers/gpu/drm/i915/gem/selftests/huge_pages.c
@@ -647,7 +647,7 @@ static int igt_mock_ppgtt_misaligned_dma(void *arg)
 * pages.
 */
for (offset = 4096; offset < page_size; offset += 4096) {
-   err = i915_vma_unbind(vma);
+   err = i915_vma_unbind_unlocked(vma);
if (err)
goto out_unpin;
 
diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_client_blt.c 
b/drivers/gpu/drm/i915/gem/selftests/i915_gem_client_blt.c
index c08f766e6e15..c8ff8bf0986d 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_client_blt.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_client_blt.c
@@ -318,7 +318,7 @@ static int pin_buffer(struct i915_vma *vma, u64 addr)
int err;
 
if (drm_mm_node_allocated(&vma->node) && vma->node.start != addr) {
-   err = i915_vma_unbind(vma);
+   err = i915_vma_unbind_unlocked(vma);
if (err)
return err;
}
diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c 
b/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c
index f61356b72b1c..ba29767348be 100644
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c
@@ -166,7 +166,9 @@ static int check_partial_mapping(struct drm_i915_gem_object 
*obj,
kunmap(p);
 
 out:
+   i915_gem_object_lock(obj, NULL);
__i915_vma_put(vma);
+   i915_gem_object_unlock(obj);
return err;
 }
 
@@ -261,7 +263,9 @@ static int check_partial_mappings(struct 
drm_i915_gem_object *obj,
if (err)
return err;
 
+   i915_gem_object_lock(obj, NULL);
__i915_vma_put(vma);
+   i915_gem_object_unlock(obj);
 
if (igt_timeout(end_time,
"%s: timed out after tiling=%d stride=%d\n",
@@ -1352,7 +1356,9 @@ static int __igt_mmap_revoke(struct drm_i915_private 
*i915,
 * for other objects. Ergo we have to revoke the previous mmap PTE
 * access as it no longer points to the same object.
 */
+   i915_gem_object_lock(obj, NULL);
err = i915_gem_object_unbind(obj, I915_GEM_OBJECT_UNBIND_ACTIVE);
+   i915_gem_object_unlock(obj);
if (err) {
pr_err("Failed to unbind object!\n");
goto out_unmap;
diff --git a/drivers/gpu/drm/i915/gt/intel_ggtt.c 
b/drivers/gpu/drm/i915/gt/intel_ggtt.c
index e416e1f12d1a..e73d453a0d6b 100644
--- a/drivers/gpu/drm/i915/gt/intel_ggtt.c
+++ b/drivers/gpu/drm/i915/gt/intel_ggtt.c
@@ -129,22 +129,49 @@ void i915_ggtt_suspend_vm(struct i915_address_space *vm)
 
drm_WARN_ON(&vm->i915->drm, !vm->is_ggtt && !vm->is_dpt);
 
+retry:
+   i915_gem_drain_free

[PATCH] drm: bridge: chipone-icn6211: Drop unnecessary bridge type

2022-01-10 Thread Jagan Teki
Explicitly assigning a connector type to the bridge during bridge
addition is optional.

Some bridges, like the ICN6211, have a panel connected, and the
panel driver already takes care of the associated connector type.

Drop it.

Signed-off-by: Jagan Teki 
---
 drivers/gpu/drm/bridge/chipone-icn6211.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/bridge/chipone-icn6211.c 
b/drivers/gpu/drm/bridge/chipone-icn6211.c
index 23c34039ac48..c60170865b74 100644
--- a/drivers/gpu/drm/bridge/chipone-icn6211.c
+++ b/drivers/gpu/drm/bridge/chipone-icn6211.c
@@ -238,7 +238,6 @@ static int chipone_probe(struct mipi_dsi_device *dsi)
return ret;
 
icn->bridge.funcs = &chipone_bridge_funcs;
-   icn->bridge.type = DRM_MODE_CONNECTOR_DPI;
icn->bridge.of_node = dev->of_node;
 
drm_bridge_add(&icn->bridge);
-- 
2.25.1



Re: [PATCH v5 1/3] drm/privacy_screen: Add drvdata in drm_privacy_screen

2022-01-10 Thread Hans de Goede
Hi All,

On 1/7/22 20:02, Rajat Jain wrote:
> Allow a privacy screen provider to stash its private data pointer in the
> drm_privacy_screen, and update the drm_privacy_screen_register() call to
> accept that. Also introduce a *_get_drvdata() so that it can be retrieved
> back when needed.
> 
> This also touches the IBM Thinkpad platform driver, the only user of
> privacy screen today, to pass NULL for now to the updated API.
> 
> Signed-off-by: Rajat Jain 
> Reviewed-by: Hans de Goede 

I've pushed this series to drm-misc-next now.

Regards,

Hans



> ---
> v5: Same as v4
> v4: Added "Reviewed-by" from Hans
> v3: Initial version. Came up due to review comments on v2 of other patches.
> v2: No v2
> v1: No v1
> 
>  drivers/gpu/drm/drm_privacy_screen.c|  5 -
>  drivers/platform/x86/thinkpad_acpi.c|  2 +-
>  include/drm/drm_privacy_screen_driver.h | 13 -
>  3 files changed, 17 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_privacy_screen.c 
> b/drivers/gpu/drm/drm_privacy_screen.c
> index beaf99e9120a..03b149cc455b 100644
> --- a/drivers/gpu/drm/drm_privacy_screen.c
> +++ b/drivers/gpu/drm/drm_privacy_screen.c
> @@ -387,7 +387,8 @@ static void drm_privacy_screen_device_release(struct 
> device *dev)
>   * * An ERR_PTR(errno) on failure.
>   */
>  struct drm_privacy_screen *drm_privacy_screen_register(
> - struct device *parent, const struct drm_privacy_screen_ops *ops)
> + struct device *parent, const struct drm_privacy_screen_ops *ops,
> + void *data)
>  {
>   struct drm_privacy_screen *priv;
>   int ret;
> @@ -404,6 +405,7 @@ struct drm_privacy_screen *drm_privacy_screen_register(
>   priv->dev.parent = parent;
>   priv->dev.release = drm_privacy_screen_device_release;
>   dev_set_name(&priv->dev, "privacy_screen-%s", dev_name(parent));
> + priv->drvdata = data;
>   priv->ops = ops;
>  
>   priv->ops->get_hw_state(priv);
> @@ -439,6 +441,7 @@ void drm_privacy_screen_unregister(struct 
> drm_privacy_screen *priv)
>   mutex_unlock(&drm_privacy_screen_devs_lock);
>  
>   mutex_lock(&priv->lock);
> + priv->drvdata = NULL;
>   priv->ops = NULL;
>   mutex_unlock(&priv->lock);
>  
> diff --git a/drivers/platform/x86/thinkpad_acpi.c 
> b/drivers/platform/x86/thinkpad_acpi.c
> index 341655d711ce..ccbfda2b0095 100644
> --- a/drivers/platform/x86/thinkpad_acpi.c
> +++ b/drivers/platform/x86/thinkpad_acpi.c
> @@ -9782,7 +9782,7 @@ static int tpacpi_lcdshadow_init(struct ibm_init_struct 
> *iibm)
>   return 0;
>  
>   lcdshadow_dev = drm_privacy_screen_register(&tpacpi_pdev->dev,
> - &lcdshadow_ops);
> + &lcdshadow_ops, NULL);
>   if (IS_ERR(lcdshadow_dev))
>   return PTR_ERR(lcdshadow_dev);
>  
> diff --git a/include/drm/drm_privacy_screen_driver.h 
> b/include/drm/drm_privacy_screen_driver.h
> index 24591b607675..4ef246d5706f 100644
> --- a/include/drm/drm_privacy_screen_driver.h
> +++ b/include/drm/drm_privacy_screen_driver.h
> @@ -73,10 +73,21 @@ struct drm_privacy_screen {
>* for more info.
>*/
>   enum drm_privacy_screen_status hw_state;
> + /**
> +  * @drvdata: Private data owned by the privacy screen provider
> +  */
> + void *drvdata;
>  };
>  
> +static inline
> +void *drm_privacy_screen_get_drvdata(struct drm_privacy_screen *priv)
> +{
> + return priv->drvdata;
> +}
> +
>  struct drm_privacy_screen *drm_privacy_screen_register(
> - struct device *parent, const struct drm_privacy_screen_ops *ops);
> + struct device *parent, const struct drm_privacy_screen_ops *ops,
> + void *data);
>  void drm_privacy_screen_unregister(struct drm_privacy_screen *priv);
>  
>  void drm_privacy_screen_call_notifier_chain(struct drm_privacy_screen *priv);
> 
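
As a usage illustration, a hypothetical provider could now stash and
retrieve its private data like this ("my_ps" and its helpers are
invented; the two drm_privacy_screen_*() calls are the API touched by
this patch):

struct my_ps {
	struct drm_privacy_screen *screen;
	/* provider-specific state ... */
};

static void my_ps_get_hw_state(struct drm_privacy_screen *priv)
{
	struct my_ps *ps = drm_privacy_screen_get_drvdata(priv);

	priv->hw_state = my_ps_read_state(ps);	/* invented helper */
}

static const struct drm_privacy_screen_ops my_ps_ops = {
	.get_hw_state = my_ps_get_hw_state,
};

static int my_ps_probe(struct platform_device *pdev)
{
	struct my_ps *ps = devm_kzalloc(&pdev->dev, sizeof(*ps), GFP_KERNEL);

	if (!ps)
		return -ENOMEM;

	/* the third argument is the new drvdata pointer */
	ps->screen = drm_privacy_screen_register(&pdev->dev, &my_ps_ops, ps);
	return PTR_ERR_OR_ZERO(ps->screen);
}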


