Re: [PATCH v3 09/25] cxl/hdm: Add dynamic capacity size support to endpoint decoders

2024-08-23 Thread Jonathan Cameron
On Fri, 16 Aug 2024 09:44:17 -0500
ira.we...@intel.com wrote:

> From: Navneet Singh 
> 
> To support Dynamic Capacity Devices (DCD) endpoint decoders will need to
> map DC partitions (regions).  In addition to assigning the size of the
> DC partition, the decoder must assign any skip value from the previous
> decoder.  This must be done within a contiguous DPA space.
> 
> Two complications arise with Dynamic Capacity regions which did not
> exist with RAM and PMEM partitions.  First, gaps in the DPA space can
> exist between and around the DC partitions.  Second, the Linux resource
> tree does not allow a resource to be marked across existing nodes within
> a tree.
> 
> For clarity, below is an example of a 60GB device with 10GB of RAM,
> 10GB of PMEM and 10GB for each of 2 DC partitions.  The desired CXL
> mapping is 5GB of RAM, 5GB of PMEM, and 5GB of DC1.
> 
>  DPA RANGE
>  (dpa_res)
> 0GB10GB   20GB   30GB   40GB   50GB   60GB
> |--|--|--|--|--|--|
> 
> RAM PMEM  DC0   DC1
>  (ram_res)  (pmem_res)(dc_res[0])   (dc_res[1])
> |--|--| |--| |--|
> 
>  RAMPMEMDC1
> |X||X||--|--|--|X-|
> 0GB   5GB  10GB  15GB 20GB   30GB   40GB   50GB   60GB
> 
> The previous skip resource between RAM and PMEM was always a child of
> the RAM resource and fit nicely [see (S) below].  Because of this
> simplicity this skip resource reference was not stored in any CXL state.
> On release the skip range could be calculated based on the endpoint
> decoder's stored values.
> 
> Now when DC1 is being mapped 4 skip resources must be created as
> children.  One for the PMEM resource (A), two of the parent DPA resource
> (B,D), and one more child of the DC0 resource (C).
> 
> 0GB10GB   20GB   30GB   40GB   50GB   60GB
> |--|--|--|--|--|--|
>| |
> |--|--|| |--|| |--|
> |  |   |  |  |
>(S)(A) (B)(C)(D)
>   v  v   v  v  v
> |X||X||--|--|--|X-|
>skip   skip  skipskip  skip
> 
> Expand the calculation of DPA free space and enhance the logic to
> support this more complex skipping.  To track the potential of multiple
> skip resources an xarray is attached to the endpoint decoder.  The
> existing algorithm between RAM and PMEM is consolidated within the new
> one to streamline the code even though the result is the storage of a
> single skip resource in the xarray.
> 
> Signed-off-by: Navneet Singh 
> Co-developed-by: Ira Weiny 
> Signed-off-by: Ira Weiny 
> 
One query below + request to add a comment on it for when I've
again completely forgotten how this works.

Also a grumpy reviewer comment.

> +static int cxl_reserve_dpa_skip(struct cxl_endpoint_decoder *cxled,
> + resource_size_t base, resource_size_t skipped)
> +{
> + struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
> + struct cxl_port *port = cxled_to_port(cxled);
> + struct cxl_dev_state *cxlds = cxlmd->cxlds;
> + resource_size_t skip_base = base - skipped;
> + struct device *dev = &port->dev;
> + resource_size_t skip_len = 0;
> + int rc, index;
> +

> + index = dc_mode_to_region_index(cxled->mode);
> + for (int i = 0; i <= index; i++) {

I'm not sure why this is <= so maybe a comment?

> + struct resource *dcr = &cxlds->dc_res[i];
> +
> + if (skip_base < dcr->start) {
> + skip_len = dcr->start - skip_base;
> + rc = cxl_request_skip(cxled, skip_base, skip_len);
> + if (rc)
> + return rc;
> + skip_base += skip_len;
> + }
> +
> + if (skip_base == base) {
> + dev_dbg(dev, "skip done DC region %d!\n", i);
> + break;
> + }
> +
> + if (resource_size(dcr) && skip_base <= dcr->end) {
> + if (skip_base > base) {
> + dev_err(dev, "Skip error DC region %d; 
> skip_base %pa; base %pa\n",
> + i, &skip_base, &base);
> + return -ENXIO;
> + }
> +
> + skip_len = dcr->end - skip_base + 1;
> + rc = cxl_request_skip(cxled, skip_base, skip_len);
> + if (rc)
> + return rc;
> + skip_base += skip_len;
> + }
> + }
> +
> + return 0;
> +}

>

Re: [PATCH v3 09/25] cxl/hdm: Add dynamic capacity size support to endpoint decoders

2024-08-22 Thread Ira Weiny
Dave Jiang wrote:
> 
> 
> On 8/16/24 7:44 AM, ira.we...@intel.com wrote:
> > From: Navneet Singh 
> > 

[snip]

> > +static int dc_mode_to_region_index(enum cxl_decoder_mode mode)
> > +{
> > +   return mode - CXL_DECODER_DC0;
> > +}
> > +
> > +static int cxl_request_skip(struct cxl_endpoint_decoder *cxled,
> > +   resource_size_t skip_base, resource_size_t skip_len)
> > +{
> > +   struct cxl_dev_state *cxlds = cxled_to_memdev(cxled)->cxlds;
> > +   const char *name = dev_name(&cxled->cxld.dev);
> > +   struct cxl_port *port = cxled_to_port(cxled);
> > +   struct resource *dpa_res = &cxlds->dpa_res;
> > +   struct device *dev = &port->dev;
> > +   struct resource *res;
> > +   int rc;
> > +
> > +   res = __request_region(dpa_res, skip_base, skip_len, name, 0);
> > +   if (!res)
> > +   return -EBUSY;
> > +
> > +   rc = xa_insert(&cxled->skip_res, skip_base, res, GFP_KERNEL);
> 
> Maybe rename skip_res to skip_xa, given most of the vars in CXL with
> _res are 'struct resource' to avoid confusion. See 'dpa_res' above.
> 

Good idea.
[done]

> > +   if (rc) {
> > +   __release_region(dpa_res, skip_base, skip_len);
> > +   return rc;
> > +   }
> > +
> > +   dev_dbg(dev, "decoder%d.%d: skipped space; %pr\n",
> > +   port->id, cxled->cxld.id, res);
> > +   return 0;
> > +}
> > +
> > +static int cxl_reserve_dpa_skip(struct cxl_endpoint_decoder *cxled,
> > +   resource_size_t base, resource_size_t skipped)
> > +{
> > +   struct cxl_memdev *cxlmd = cxled_to_memdev(cxled);
> > +   struct cxl_port *port = cxled_to_port(cxled);
> > +   struct cxl_dev_state *cxlds = cxlmd->cxlds;
> > +   resource_size_t skip_base = base - skipped;
> > +   struct device *dev = &port->dev;
> > +   resource_size_t skip_len = 0;
> > +   int rc, index;
> > +
> > +   if (resource_size(&cxlds->ram_res) && skip_base <= cxlds->ram_res.end) {
> > +   skip_len = cxlds->ram_res.end - skip_base + 1;
> > +   rc = cxl_request_skip(cxled, skip_base, skip_len);
> > +   if (rc)
> > +   return rc;
> > +   skip_base += skip_len;
> > +   }
> > +
> > +   if (skip_base == base) {
> > +   dev_dbg(dev, "skip done ram!\n");
> > +   return 0;
> > +   }
> > +
> > +   if (resource_size(&cxlds->pmem_res) &&
> > +   skip_base <= cxlds->pmem_res.end) {
> > +   skip_len = cxlds->pmem_res.end - skip_base + 1;
> > +   rc = cxl_request_skip(cxled, skip_base, skip_len);
> > +   if (rc)
> > +   return rc;
> > +   skip_base += skip_len;
> > +   }
> 
> Does 'skip_base == base' need to be checked here again before going to DCD?

No it is checked below...

> 
> DJ
> 
> > +
> > +   index = dc_mode_to_region_index(cxled->mode);
> > +   for (int i = 0; i <= index; i++) {
> > +   struct resource *dcr = &cxlds->dc_res[i];
> > +
> > +   if (skip_base < dcr->start) {
> > +   skip_len = dcr->start - skip_base;
> > +   rc = cxl_request_skip(cxled, skip_base, skip_len);
> > +   if (rc)
> > +   return rc;
> > +   skip_base += skip_len;
> > +   }
> > +
> > +   if (skip_base == base) {
> > +   dev_dbg(dev, "skip done DC region %d!\n", i);
> > +   break;
> > +   }

... here.

After any skips between pmem and the first DC partition.
Ira

> > +
> > +   if (resource_size(dcr) && skip_base <= dcr->end) {
> > +   if (skip_base > base) {
> > +   dev_err(dev, "Skip error DC region %d; 
> > skip_base %pa; base %pa\n",
> > +   i, &skip_base, &base);
> > +   return -ENXIO;
> > +   }
> > +
> > +   skip_len = dcr->end - skip_base + 1;
> > +   rc = cxl_request_skip(cxled, skip_base, skip_len);
> > +   if (rc)
> > +   return rc;
> > +   skip_base += skip_len;
> > +   }
> > +   }
> > +
> > +   return 0;
> > +}
> > +

[snip]



Re: [PATCH v3 09/25] cxl/hdm: Add dynamic capacity size support to endpoint decoders

2024-08-16 Thread Dave Jiang



On 8/16/24 7:44 AM, ira.we...@intel.com wrote:
> From: Navneet Singh 
> 
> To support Dynamic Capacity Devices (DCD) endpoint decoders will need to
> map DC partitions (regions).  In addition to assigning the size of the
> DC partition, the decoder must assign any skip value from the previous
> decoder.  This must be done within a contiguous DPA space.
> 
> Two complications arise with Dynamic Capacity regions which did not
> exist with RAM and PMEM partitions.  First, gaps in the DPA space can
> exist between and around the DC partitions.  Second, the Linux resource
> tree does not allow a resource to be marked across existing nodes within
> a tree.
> 
> For clarity, below is an example of a 60GB device with 10GB of RAM,
> 10GB of PMEM and 10GB for each of 2 DC partitions.  The desired CXL
> mapping is 5GB of RAM, 5GB of PMEM, and 5GB of DC1.
> 
>  DPA RANGE
>  (dpa_res)
> 0GB10GB   20GB   30GB   40GB   50GB   60GB
> |--|--|--|--|--|--|
> 
> RAM PMEM  DC0   DC1
>  (ram_res)  (pmem_res)(dc_res[0])   (dc_res[1])
> |--|--| |--| |--|
> 
>  RAMPMEMDC1
> |X||X||--|--|--|X-|
> 0GB   5GB  10GB  15GB 20GB   30GB   40GB   50GB   60GB
> 
> The previous skip resource between RAM and PMEM was always a child of
> the RAM resource and fit nicely [see (S) below].  Because of this
> simplicity this skip resource reference was not stored in any CXL state.
> On release the skip range could be calculated based on the endpoint
> decoder's stored values.
> 
> Now when DC1 is being mapped 4 skip resources must be created as
> children.  One for the PMEM resource (A), two of the parent DPA resource
> (B,D), and one more child of the DC0 resource (C).
> 
> 0GB10GB   20GB   30GB   40GB   50GB   60GB
> |--|--|--|--|--|--|
>| |
> |--|--|| |--|| |--|
> |  |   |  |  |
>(S)(A) (B)(C)(D)
>   v  v   v  v  v
> |X||X||--|--|--|X-|
>skip   skip  skipskip  skip
> 
> Expand the calculation of DPA free space and enhance the logic to
> support this more complex skipping.  To track the potential of multiple
> skip resources an xarray is attached to the endpoint decoder.  The
> existing algorithm between RAM and PMEM is consolidated within the new
> one to streamline the code even though the result is the storage of a
> single skip resource in the xarray.
> 
> Signed-off-by: Navneet Singh 
> Co-developed-by: Ira Weiny 
> Signed-off-by: Ira Weiny 
> 
> ---
> Changes:
> [Jonathan: Use an example only mapping 1/2 of DC1]
> [iweiny: Update cover letter]
> [iweiny: Fix 0day bugs
>   https://lore.kernel.org/all/202408090138.rb41ybe8-...@intel.com/]
> [djbw/Jonathan: allow more than 1 region per DC partition]
> ---
>  drivers/cxl/core/hdm.c  | 196 
> 
>  drivers/cxl/core/port.c |   2 +
>  drivers/cxl/cxl.h   |   2 +
>  3 files changed, 184 insertions(+), 16 deletions(-)
> 
> diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
> index 3df10517a327..b4a517c6d283 100644
> --- a/drivers/cxl/core/hdm.c
> +++ b/drivers/cxl/core/hdm.c
> @@ -223,6 +223,25 @@ void cxl_dpa_debug(struct seq_file *file, struct 
> cxl_dev_state *cxlds)
>  }
>  EXPORT_SYMBOL_NS_GPL(cxl_dpa_debug, CXL);
>  
> +static void cxl_skip_release(struct cxl_endpoint_decoder *cxled)
> +{
> + struct cxl_dev_state *cxlds = cxled_to_memdev(cxled)->cxlds;
> + struct cxl_port *port = cxled_to_port(cxled);
> + struct device *dev = &port->dev;
> + unsigned long index;
> + void *entry;
> +
> + xa_for_each(&cxled->skip_res, index, entry) {
> + struct resource *res = entry;
> +
> + dev_dbg(dev, "decoder%d.%d: releasing skipped space; %pr\n",
> + port->id, cxled->cxld.id, res);
> + __release_region(&cxlds->dpa_res, res->start,
> +  resource_size(res));
> + xa_erase(&cxled->skip_res, index);
> + }
> +}
> +
>  /*
>   * Must be called in a context that synchronizes against this decoder's
>   * port ->remove() callback (like an endpoint decoder sysfs attribute)
> @@ -233,15 +252,11 @@ static void __cxl_dpa_release(struct 
> cxl_endpoint_decoder *cxled)
>   struct cxl_port *port = cxled_to_port(cxled);
>   struct cxl_dev_state *cxlds = cxlmd->cxlds;
>   struct resource *res = cxled->dpa_res;
> - resource_size_t skip_start;
>  
>   lockdep_assert_held_write(&cxl_dpa_rwsem

[PATCH v3 09/25] cxl/hdm: Add dynamic capacity size support to endpoint decoders

2024-08-16 Thread ira . weiny
From: Navneet Singh 

To support Dynamic Capacity Devices (DCD) endpoint decoders will need to
map DC partitions (regions).  In addition to assigning the size of the
DC partition, the decoder must assign any skip value from the previous
decoder.  This must be done within a contiguous DPA space.

Two complications arise with Dynamic Capacity regions which did not
exist with RAM and PMEM partitions.  First, gaps in the DPA space can
exist between and around the DC partitions.  Second, the Linux resource
tree does not allow a resource to be marked across existing nodes within
a tree.

For clarity, below is an example of a 60GB device with 10GB of RAM,
10GB of PMEM and 10GB for each of 2 DC partitions.  The desired CXL
mapping is 5GB of RAM, 5GB of PMEM, and 5GB of DC1.

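                            DPA RANGE
                            (dpa_res)
0GB        10GB       20GB       30GB       40GB       50GB       60GB
|----------|----------|----------|----------|----------|----------|

RAM        PMEM                  DC0                   DC1
(ram_res)  (pmem_res)            (dc_res[0])           (dc_res[1])
|----------|----------|          |----------|          |----------|

 RAM        PMEM                                        DC1
|XXXXX|----|XXXXX|----|----------|----------|----------|XXXXX-----|
0GB   5GB  10GB  15GB 20GB       30GB       40GB       50GB       60GB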

The previous skip resource between RAM and PMEM was always a child of
the RAM resource and fit nicely [see (S) below].  Because of this
simplicity this skip resource reference was not stored in any CXL state.
On release the skip range could be calculated based on the endpoint
decoder's stored values.

Now when DC1 is being mapped 4 skip resources must be created as
children.  One for the PMEM resource (A), two of the parent DPA resource
(B,D), and one more child of the DC0 resource (C).

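0GB        10GB       20GB       30GB       40GB       50GB       60GB
|----------|----------|----------|----------|----------|----------|
                           |                     |
|----------|----------|    |     |----------|    |     |----------|
        |          |       |          |          |
       (S)        (A)     (B)        (C)        (D)
        v          v       v          v          v
|XXXXX|----|XXXXX|----|----------|----------|----------|XXXXX-----|
       skip       skip    skip       skip       skip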

Expand the calculation of DPA free space and enhance the logic to
support this more complex skipping.  To track the potential of multiple
skip resources an xarray is attached to the endpoint decoder.  The
existing algorithm between RAM and PMEM is consolidated within the new
one to streamline the code even though the result is the storage of a
single skip resource in the xarray.

Signed-off-by: Navneet Singh 
Co-developed-by: Ira Weiny 
Signed-off-by: Ira Weiny 

---
Changes:
[Jonathan: Use an example only mapping 1/2 of DC1]
[iweiny: Update cover letter]
[iweiny: Fix 0day bugs
https://lore.kernel.org/all/202408090138.rb41ybe8-...@intel.com/]
[djbw/Jonathan: allow more than 1 region per DC partition]
---
 drivers/cxl/core/hdm.c  | 196 
 drivers/cxl/core/port.c |   2 +
 drivers/cxl/cxl.h   |   2 +
 3 files changed, 184 insertions(+), 16 deletions(-)

diff --git a/drivers/cxl/core/hdm.c b/drivers/cxl/core/hdm.c
index 3df10517a327..b4a517c6d283 100644
--- a/drivers/cxl/core/hdm.c
+++ b/drivers/cxl/core/hdm.c
@@ -223,6 +223,25 @@ void cxl_dpa_debug(struct seq_file *file, struct 
cxl_dev_state *cxlds)
 }
 EXPORT_SYMBOL_NS_GPL(cxl_dpa_debug, CXL);
 
+static void cxl_skip_release(struct cxl_endpoint_decoder *cxled)
+{
+   struct cxl_dev_state *cxlds = cxled_to_memdev(cxled)->cxlds;
+   struct cxl_port *port = cxled_to_port(cxled);
+   struct device *dev = &port->dev;
+   unsigned long index;
+   void *entry;
+
+   xa_for_each(&cxled->skip_res, index, entry) {
+   struct resource *res = entry;
+
+   dev_dbg(dev, "decoder%d.%d: releasing skipped space; %pr\n",
+   port->id, cxled->cxld.id, res);
+   __release_region(&cxlds->dpa_res, res->start,
+resource_size(res));
+   xa_erase(&cxled->skip_res, index);
+   }
+}
+
 /*
  * Must be called in a context that synchronizes against this decoder's
  * port ->remove() callback (like an endpoint decoder sysfs attribute)
@@ -233,15 +252,11 @@ static void __cxl_dpa_release(struct cxl_endpoint_decoder 
*cxled)
struct cxl_port *port = cxled_to_port(cxled);
struct cxl_dev_state *cxlds = cxlmd->cxlds;
struct resource *res = cxled->dpa_res;
-   resource_size_t skip_start;
 
lockdep_assert_held_write(&cxl_dpa_rwsem);
 
-   /* save @skip_start, before @res is released */
-   skip_start = res->start - cxled->skip;
__release_region(&cxlds->dpa_res, res->start, resource_size(res));
-   if (cxled->skip)
-   __release_regio