Re: [PATCH v2 3/4] PCI/AER: Fetch information for FTrace

2024-02-02 Thread Dan Williams
Wang, Qingshun wrote:
> Fetch and store the data of 3 more registers: "Link Status", "Device
> Control 2", and "Advanced Error Capabilities and Control". This data is
> needed for external observation to better understand ANFE.
> 
> Signed-off-by: "Wang, Qingshun" 
> ---
>  drivers/acpi/apei/ghes.c |  8 +++-
>  drivers/cxl/core/pci.c   | 11 ++-
>  drivers/pci/pci.h|  4 
>  drivers/pci/pcie/aer.c   | 26 --
>  include/linux/aer.h  |  6 --
>  5 files changed, 45 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index 6034039d5cff..047cc01be68c 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -594,7 +594,9 @@ static void ghes_handle_aer(struct acpi_hest_generic_data 
> *gdata)
>   if (pcie_err->validation_bits & CPER_PCIE_VALID_DEVICE_ID &&
>   pcie_err->validation_bits & CPER_PCIE_VALID_AER_INFO) {
>   struct pcie_capability_regs *pcie_caps;
> + u16 device_control_2 = 0;
>   u16 device_status = 0;
> + u16 link_status = 0;
>   unsigned int devfn;
>   int aer_severity;
>   u8 *aer_info;
> @@ -619,7 +621,9 @@ static void ghes_handle_aer(struct acpi_hest_generic_data 
> *gdata)
>  
>   if (pcie_err->validation_bits & CPER_PCIE_VALID_CAPABILITY) {
>   pcie_caps = (struct pcie_capability_regs 
> *)pcie_err->capability;
> + device_control_2 = pcie_caps->device_control_2;
>   device_status = pcie_caps->device_status;
> + link_status = pcie_caps->link_status;
>   }
>  
>   aer_recover_queue(pcie_err->device_id.segment,
> @@ -627,7 +631,9 @@ static void ghes_handle_aer(struct acpi_hest_generic_data 
> *gdata)
> devfn, aer_severity,
> (struct aer_capability_regs *)
> aer_info,
> -   device_status);
> +   device_status,
> +   link_status,
> +   device_control_2);
>   }
>  #endif
>  }
> diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
> index 9111a4415a63..3aa57fe8db42 100644
> --- a/drivers/cxl/core/pci.c
> +++ b/drivers/cxl/core/pci.c
> @@ -903,7 +903,9 @@ static void cxl_handle_rdport_errors(struct cxl_dev_state 
> *cxlds)
>   struct aer_capability_regs aer_regs;
>   struct cxl_dport *dport;
>   struct cxl_port *port;
> + u16 device_control_2;
>   u16 device_status;
> + u16 link_status;
>   int severity;
>  
>   port = cxl_pci_find_port(pdev, );
> @@ -918,10 +920,17 @@ static void cxl_handle_rdport_errors(struct 
> cxl_dev_state *cxlds)
>   if (!cxl_rch_get_aer_severity(_regs, ))
>   return;
>  
> + if (pcie_capability_read_word(pdev, PCI_EXP_DEVCTL2, _control_2))
> + return;
> +
>   if (pcie_capability_read_word(pdev, PCI_EXP_DEVSTA, _status))
>   return;
>  
> - pci_print_aer(pdev, severity, _regs, device_status);
> + if (pcie_capability_read_word(pdev, PCI_EXP_LNKSTA, _status))
> + return;
> +
> + pci_print_aer(pdev, severity, _regs, device_status,
> +   link_status, device_control_2);

Rather than complicate the calling convention of pci_print_aer(), update
the internals of pci_print_aer() to get these extra registers, or
provide a new wrapper interface that satisfies the dependencies and
switch users over to that.  Otherwise multiple touches of the same code
path in one patch set is indicative of the need for a higher level
helper.


Re: [linux-next:master] BUILD REGRESSION c503e3eec382ac708ee7adf874add37b77c5d312

2023-10-31 Thread Dan Williams
Dan Williams wrote:
> Bjorn Helgaas wrote:
> > On Tue, Oct 31, 2023 at 04:35:23AM +0800, kernel test robot wrote:
> > > tree/branch: 
> > > https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master
> > > branch HEAD: c503e3eec382ac708ee7adf874add37b77c5d312  Add linux-next 
> > > specific files for 20231030
> > > 
> > > Error/Warning reports:
> > > ... 
> > > https://lore.kernel.org/oe-kbuild-all/202310302206.pkr5ebdi-...@intel.com
> > 
> > > Error/Warning: (recently discovered and may have been fixed)
> > > 
> > > Warning: MAINTAINERS references a file that doesn't exist: 
> > > Documentation/devicetree/bindings/iio/imu/bosch,bma400.yaml
> > > aarch64-linux-ld: drivers/cxl/core/pci.c:921:(.text+0xbbc): undefined 
> > > reference to `pci_print_aer'
> > > ...
> > > arch/riscv/include/asm/mmio.h:67:(.text+0xd66): undefined reference to 
> > > `pci_print_aer'
> > > csky-linux-ld: pci.c:(.text+0x6e8): undefined reference to `pci_print_aer'
> > > drivers/cxl/core/pci.c:921: undefined reference to `pci_print_aer'
> > > drivers/cxl/core/pci.c:921:(.text+0xbc0): undefined reference to 
> > > `pci_print_aer'
> > > ...
> > > ld: drivers/cxl/core/pci.c:921: undefined reference to `pci_print_aer'
> > > loongarch64-linux-ld: drivers/cxl/core/pci.c:921:(.text+0xa38): undefined 
> > > reference to `pci_print_aer'
> > > pci.c:(.text+0x662): undefined reference to `pci_print_aer'
> > > powerpc-linux-ld: pci.c:(.text+0xf10): undefined reference to 
> > > `pci_print_aer'
> > > riscv64-linux-ld: pci.c:(.text+0x11ec): undefined reference to 
> > > `pci_print_aer'
> > 
> > I have no idea about the above (and all the similar ones below); I
> > assume they all have to do with
> > https://lore.kernel.org/r/20231018171713.1883517-13-rrich...@amd.com
> 
> Yes, I will get this fix folded into cxl/next:
> 
> diff --git a/include/linux/aer.h b/include/linux/aer.h
> index f6ea2f57d808..3db310c19ab7 100644
> --- a/include/linux/aer.h
> +++ b/include/linux/aer.h
> @@ -43,16 +43,20 @@ struct aer_capability_regs {
>  #if defined(CONFIG_PCIEAER)
>  int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
>  int pcie_aer_is_native(struct pci_dev *dev);
> +void pci_print_aer(struct pci_dev *dev, int aer_severity,
> +   struct aer_capability_regs *aer);
>  #else
>  static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
>  {
> return -EINVAL;
>  }
>  static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
> +static inline void pci_print_aer(struct pci_dev *dev, int aer_severity,
> +struct aer_capability_regs *aer)
> +{
> +}
>  #endif
>  
> -void pci_print_aer(struct pci_dev *dev, int aer_severity,
> -   struct aer_capability_regs *aer);
>  int cper_severity_to_aer(int cper_severity);
>  void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
>int severity, struct aer_capability_regs *aer_regs);

Actually, no this was my fault for inadvertantly including this patch in
the branch:

http://lore.kernel.org/r/653c7eb29265c_244c8f29...@dwillia2-xfh.jf.intel.com.notmuch

...which had the desired effect, but at the cost of thrashing
linux-next. Apologies for that.



Re: [linux-next:master] BUILD REGRESSION c503e3eec382ac708ee7adf874add37b77c5d312

2023-10-31 Thread Dan Williams
Bjorn Helgaas wrote:
> On Tue, Oct 31, 2023 at 04:35:23AM +0800, kernel test robot wrote:
> > tree/branch: 
> > https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master
> > branch HEAD: c503e3eec382ac708ee7adf874add37b77c5d312  Add linux-next 
> > specific files for 20231030
> > 
> > Error/Warning reports:
> > ... 
> > https://lore.kernel.org/oe-kbuild-all/202310302206.pkr5ebdi-...@intel.com
> 
> > Error/Warning: (recently discovered and may have been fixed)
> > 
> > Warning: MAINTAINERS references a file that doesn't exist: 
> > Documentation/devicetree/bindings/iio/imu/bosch,bma400.yaml
> > aarch64-linux-ld: drivers/cxl/core/pci.c:921:(.text+0xbbc): undefined 
> > reference to `pci_print_aer'
> > ...
> > arch/riscv/include/asm/mmio.h:67:(.text+0xd66): undefined reference to 
> > `pci_print_aer'
> > csky-linux-ld: pci.c:(.text+0x6e8): undefined reference to `pci_print_aer'
> > drivers/cxl/core/pci.c:921: undefined reference to `pci_print_aer'
> > drivers/cxl/core/pci.c:921:(.text+0xbc0): undefined reference to 
> > `pci_print_aer'
> > ...
> > ld: drivers/cxl/core/pci.c:921: undefined reference to `pci_print_aer'
> > loongarch64-linux-ld: drivers/cxl/core/pci.c:921:(.text+0xa38): undefined 
> > reference to `pci_print_aer'
> > pci.c:(.text+0x662): undefined reference to `pci_print_aer'
> > powerpc-linux-ld: pci.c:(.text+0xf10): undefined reference to 
> > `pci_print_aer'
> > riscv64-linux-ld: pci.c:(.text+0x11ec): undefined reference to 
> > `pci_print_aer'
> 
> I have no idea about the above (and all the similar ones below); I
> assume they all have to do with
> https://lore.kernel.org/r/20231018171713.1883517-13-rrich...@amd.com

Yes, I will get this fix folded into cxl/next:

diff --git a/include/linux/aer.h b/include/linux/aer.h
index f6ea2f57d808..3db310c19ab7 100644
--- a/include/linux/aer.h
+++ b/include/linux/aer.h
@@ -43,16 +43,20 @@ struct aer_capability_regs {
 #if defined(CONFIG_PCIEAER)
 int pci_aer_clear_nonfatal_status(struct pci_dev *dev);
 int pcie_aer_is_native(struct pci_dev *dev);
+void pci_print_aer(struct pci_dev *dev, int aer_severity,
+   struct aer_capability_regs *aer);
 #else
 static inline int pci_aer_clear_nonfatal_status(struct pci_dev *dev)
 {
return -EINVAL;
 }
 static inline int pcie_aer_is_native(struct pci_dev *dev) { return 0; }
+static inline void pci_print_aer(struct pci_dev *dev, int aer_severity,
+struct aer_capability_regs *aer)
+{
+}
 #endif
 
-void pci_print_aer(struct pci_dev *dev, int aer_severity,
-   struct aer_capability_regs *aer);
 int cper_severity_to_aer(int cper_severity);
 void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn,
   int severity, struct aer_capability_regs *aer_regs);


Re: [PATCH v10 13/15] PCI/AER: Forward RCH downstream port-detected errors to the CXL.mem dev handler

2023-09-26 Thread Dan Williams
Terry Bowman wrote:
> Hi Dan,
> 
> On 8/31/23 15:35, Dan Williams wrote:
> > Terry Bowman wrote:
> >> From: Robert Richter 
> >>
> >> In Restricted CXL Device (RCD) mode a CXL device is exposed as an
> >> RCiEP, but CXL downstream and upstream ports are not enumerated and
> >> not visible in the PCIe hierarchy. [1] Protocol and link errors from
> >> these non-enumerated ports are signaled as internal AER errors, either
> >> Uncorrectable Internal Error (UIE) or Corrected Internal Errors (CIE)
> >> via an RCEC.
> >>
> >> Restricted CXL host (RCH) downstream port-detected errors have the
> >> Requester ID of the RCEC set in the RCEC's AER Error Source ID
> >> register. A CXL handler must then inspect the error status in various
> >> CXL registers residing in the dport's component register space (CXL
> >> RAS capability) or the dport's RCRB (PCIe AER extended
> >> capability). [2]
> >>
> >> Errors showing up in the RCEC's error handler must be handled and
> >> connected to the CXL subsystem. Implement this by forwarding the error
> >> to all CXL devices below the RCEC. Since the entire CXL device is
> >> controlled only using PCIe Configuration Space of device 0, function
> >> 0, only pass it there [3]. The error handling is limited to currently
> >> supported devices with the Memory Device class code set (CXL Type 3
> >> Device, PCI_CLASS_MEMORY_CXL, 502h), handle downstream port errors in
> >> the device's cxl_pci driver. Support for other CXL Device Types
> >> (e.g. a CXL.cache Device) can be added later.
> >>
> >> To handle downstream port errors in addition to errors directed to the
> >> CXL endpoint device, a handler must also inspect the CXL RAS and PCIe
> >> AER capabilities of the CXL downstream port the device is connected
> >> to.
> >>
> >> Since CXL downstream port errors are signaled using internal errors,
> >> the handler requires those errors to be unmasked. This is subject of a
> >> follow-on patch.
> >>
> >> The reason for choosing this implementation is that the AER service
> >> driver claims the RCEC device, but does not allow it to register a
> >> custom specific handler to support CXL. Connecting the RCEC hard-wired
> >> with a CXL handler does not work, as the CXL subsystem might not be
> >> present all the time. The alternative to add an implementation to the
> >> portdrv to allow the registration of a custom RCEC error handler isn't
> >> worth doing it as CXL would be its only user. Instead, just check for
> >> an CXL RCEC and pass it down to the connected CXL device's error
> >> handler. With this approach the code can entirely be implemented in
> >> the PCIe AER driver and is independent of the CXL subsystem. The CXL
> >> driver only provides the handler.
> >>
> >> [1] CXL 3.0 spec: 9.11.8 CXL Devices Attached to an RCH
> >> [2] CXL 3.0 spec, 12.2.1.1 RCH Downstream Port-detected Errors
> >> [3] CXL 3.0 spec, 8.1.3 PCIe DVSEC for CXL Devices
> >>
> >> Co-developed-by: Terry Bowman 
> >> Signed-off-by: Terry Bowman 
> >> Signed-off-by: Robert Richter 
> >> Cc: "Oliver O'Halloran" 
> >> Cc: Bjorn Helgaas 
> >> Cc: linuxppc-dev@lists.ozlabs.org
> >> Cc: linux-...@vger.kernel.org
> >> Acked-by: Bjorn Helgaas 
> >> Reviewed-by: Jonathan Cameron 
> >> Reviewed-by: Dave Jiang 
> >> ---
> >>  drivers/pci/pcie/Kconfig | 12 +
> >>  drivers/pci/pcie/aer.c   | 96 +++-
> >>  2 files changed, 106 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/drivers/pci/pcie/Kconfig b/drivers/pci/pcie/Kconfig
> >> index 228652a59f27..4f0e70fafe2d 100644
> >> --- a/drivers/pci/pcie/Kconfig
> >> +++ b/drivers/pci/pcie/Kconfig
> >> @@ -49,6 +49,18 @@ config PCIEAER_INJECT
> >>  gotten from:
> >> 
> >> https://git.kernel.org/cgit/linux/kernel/git/gong.chen/aer-inject.git/
> >>  
> >> +config PCIEAER_CXL
> >> +  bool "PCI Express CXL RAS support for Restricted Hosts (RCH)"
> > 
> > Why the "for Restricted Hosts (RCH)" clarification? I am seeing nothing
> > that prevents this from working with RCECs on VH topologies.
> > 
> 
> The same option can be used in VH mode. Will remove the RCH reference.
> 
> > 
> >> +  default y
> > 
> > Minor, but I think "default PCIEAE

RE: [PATCH v10 13/15] PCI/AER: Forward RCH downstream port-detected errors to the CXL.mem dev handler

2023-08-31 Thread Dan Williams
Terry Bowman wrote:
> From: Robert Richter 
> 
> In Restricted CXL Device (RCD) mode a CXL device is exposed as an
> RCiEP, but CXL downstream and upstream ports are not enumerated and
> not visible in the PCIe hierarchy. [1] Protocol and link errors from
> these non-enumerated ports are signaled as internal AER errors, either
> Uncorrectable Internal Error (UIE) or Corrected Internal Errors (CIE)
> via an RCEC.
> 
> Restricted CXL host (RCH) downstream port-detected errors have the
> Requester ID of the RCEC set in the RCEC's AER Error Source ID
> register. A CXL handler must then inspect the error status in various
> CXL registers residing in the dport's component register space (CXL
> RAS capability) or the dport's RCRB (PCIe AER extended
> capability). [2]
> 
> Errors showing up in the RCEC's error handler must be handled and
> connected to the CXL subsystem. Implement this by forwarding the error
> to all CXL devices below the RCEC. Since the entire CXL device is
> controlled only using PCIe Configuration Space of device 0, function
> 0, only pass it there [3]. The error handling is limited to currently
> supported devices with the Memory Device class code set (CXL Type 3
> Device, PCI_CLASS_MEMORY_CXL, 502h), handle downstream port errors in
> the device's cxl_pci driver. Support for other CXL Device Types
> (e.g. a CXL.cache Device) can be added later.
> 
> To handle downstream port errors in addition to errors directed to the
> CXL endpoint device, a handler must also inspect the CXL RAS and PCIe
> AER capabilities of the CXL downstream port the device is connected
> to.
> 
> Since CXL downstream port errors are signaled using internal errors,
> the handler requires those errors to be unmasked. This is subject of a
> follow-on patch.
> 
> The reason for choosing this implementation is that the AER service
> driver claims the RCEC device, but does not allow it to register a
> custom specific handler to support CXL. Connecting the RCEC hard-wired
> with a CXL handler does not work, as the CXL subsystem might not be
> present all the time. The alternative to add an implementation to the
> portdrv to allow the registration of a custom RCEC error handler isn't
> worth doing it as CXL would be its only user. Instead, just check for
> an CXL RCEC and pass it down to the connected CXL device's error
> handler. With this approach the code can entirely be implemented in
> the PCIe AER driver and is independent of the CXL subsystem. The CXL
> driver only provides the handler.
> 
> [1] CXL 3.0 spec: 9.11.8 CXL Devices Attached to an RCH
> [2] CXL 3.0 spec, 12.2.1.1 RCH Downstream Port-detected Errors
> [3] CXL 3.0 spec, 8.1.3 PCIe DVSEC for CXL Devices
> 
> Co-developed-by: Terry Bowman 
> Signed-off-by: Terry Bowman 
> Signed-off-by: Robert Richter 
> Cc: "Oliver O'Halloran" 
> Cc: Bjorn Helgaas 
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: linux-...@vger.kernel.org
> Acked-by: Bjorn Helgaas 
> Reviewed-by: Jonathan Cameron 
> Reviewed-by: Dave Jiang 
> ---
>  drivers/pci/pcie/Kconfig | 12 +
>  drivers/pci/pcie/aer.c   | 96 +++-
>  2 files changed, 106 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/pci/pcie/Kconfig b/drivers/pci/pcie/Kconfig
> index 228652a59f27..4f0e70fafe2d 100644
> --- a/drivers/pci/pcie/Kconfig
> +++ b/drivers/pci/pcie/Kconfig
> @@ -49,6 +49,18 @@ config PCIEAER_INJECT
> gotten from:
>
> https://git.kernel.org/cgit/linux/kernel/git/gong.chen/aer-inject.git/
>  
> +config PCIEAER_CXL
> + bool "PCI Express CXL RAS support for Restricted Hosts (RCH)"

Why the "for Restricted Hosts (RCH)" clarification? I am seeing nothing
that prevents this from working with RCECs on VH topologies.


> + default y

Minor, but I think "default PCIEAER" makes it slightly clearer that CXL
error handling comes along for the ride with PCIE AER.

> + depends on PCIEAER && CXL_PCI
> + help
> +   Enables error handling of downstream ports of a CXL host
> +   that is operating in RCD mode (Restricted CXL Host, RCH).
> +   The downstream port reports AER errors to a given RCEC.
> +   Errors are handled by the CXL memory device driver.
> +
> +   If unsure, say Y.
> +
>  #
>  # PCI Express ECRC
>  #
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index d3344fcf1f79..c354ca5e8f2b 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -946,14 +946,100 @@ static bool find_source_device(struct pci_dev *parent,
>   return true;
>  }
>  
> +#ifdef CONFIG_PCIEAER_CXL
> +
> +static bool is_cxl_mem_dev(struct pci_dev *dev)
> +{
> + /*
> +  * The capability, status, and control fields in Device 0,
> +  * Function 0 DVSEC control the CXL functionality of the
> +  * entire device (CXL 3.0, 8.1.3).
> +  */
> + if (dev->devfn != PCI_DEVFN(0, 0))
> + return false;
> +
> + /*
> +  * CXL Memory Devices must have the 502h class code set (CXL
> +  * 

RE: [PATCH v5 25/26] PCI/AER: Forward RCH downstream port-detected errors to the CXL.mem dev handler

2023-06-12 Thread Dan Williams
Terry Bowman wrote:
> From: Robert Richter 
> 
> In Restricted CXL Device (RCD) mode a CXL device is exposed as an
> RCiEP, but CXL downstream and upstream ports are not enumerated and
> not visible in the PCIe hierarchy. [1] Protocol and link errors from
> these non-enumerated ports are signaled as internal AER errors, either
> Uncorrectable Internal Error (UIE) or Corrected Internal Errors (CIE)
> via an RCEC.
> 
> Restricted CXL host (RCH) downstream port-detected errors have the
> Requster ID of the RCEC set in the RCEC's AER Error Source ID
> register. A CXL handler must then inspect the error status in various
> CXL registers residing in the dport's component register space (CXL
> RAS capability) or the dport's RCRB (PCIe AER extended
> capability). [2]
> 
> Errors showing up in the RCEC's error handler must be handled and
> connected to the CXL subsystem. Implement this by forwarding the error
> to all CXL devices below the RCEC. Since the entire CXL device is
> controlled only using PCIe Configuration Space of device 0, function
> 0, only pass it there [3]. The error handling is limited to currently
> supported devices with the Memory Device class code set (CXL Type 3
> Device, PCI_CLASS_MEMORY_CXL, 502h), handle downstream port errors in
> the device's cxl_pci driver. Support for other CXL Device Types
> (e.g. a CXL.cache Device) can be added later.
> 
> To handle downstream port errors in addition to errors directed to the
> CXL endpoint device, a handler must also inspect the CXL RAS and PCIe
> AER capabilities of the CXL downstream port the device is connected
> to.
> 
> Since CXL downstream port errors are signaled using internal errors,
> the handler requires those errors to be unmasked. This is subject of a
> follow-on patch.
> 
> The reason for choosing this implementation is that the AER service
> driver claims the RCEC device, but does not allow it to register a
> custom specific handler to support CXL. Connecting the RCEC hard-wired
> with a CXL handler does not work, as the CXL subsystem might not be
> present all the time. The alternative to add an implementation to the
> portdrv to allow the registration of a custom RCEC error handler isn't
> worth doing it as CXL would be its only user. Instead, just check for
> an CXL RCEC and pass it down to the connected CXL device's error
> handler. With this approach the code can entirely be implemented in
> the PCIe AER driver and is independent of the CXL subsystem. The CXL
> driver only provides the handler.
> 
> [1] CXL 3.0 spec: 9.11.8 CXL Devices Attached to an RCH
> [2] CXL 3.0 spec, 12.2.1.1 RCH Downstream Port-detected Errors
> [3] CXL 3.0 spec, 8.1.3 PCIe DVSEC for CXL Devices
> 
> Co-developed-by: Terry Bowman 
> Signed-off-by: Terry Bowman 
> Signed-off-by: Robert Richter 
> Cc: "Oliver O'Halloran" 
> Cc: Bjorn Helgaas 
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: linux-...@vger.kernel.org
> Acked-by: Bjorn Helgaas 
> Reviewed-by: Jonathan Cameron 
> ---
>  drivers/pci/pcie/Kconfig | 12 +
>  drivers/pci/pcie/aer.c   | 96 +++-
>  2 files changed, 106 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/pci/pcie/Kconfig b/drivers/pci/pcie/Kconfig
> index 228652a59f27..4f0e70fafe2d 100644
> --- a/drivers/pci/pcie/Kconfig
> +++ b/drivers/pci/pcie/Kconfig
> @@ -49,6 +49,18 @@ config PCIEAER_INJECT
> gotten from:
>
> https://git.kernel.org/cgit/linux/kernel/git/gong.chen/aer-inject.git/
>  
> +config PCIEAER_CXL
> + bool "PCI Express CXL RAS support for Restricted Hosts (RCH)"
> + default y
> + depends on PCIEAER && CXL_PCI
> + help
> +   Enables error handling of downstream ports of a CXL host
> +   that is operating in RCD mode (Restricted CXL Host, RCH).
> +   The downstream port reports AER errors to a given RCEC.
> +   Errors are handled by the CXL memory device driver.
> +
> +   If unsure, say Y.
> +
>  #
>  # PCI Express ECRC
>  #
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index d3344fcf1f79..c354ca5e8f2b 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -946,14 +946,100 @@ static bool find_source_device(struct pci_dev *parent,
>   return true;
>  }
>  
> +#ifdef CONFIG_PCIEAER_CXL
> +
> +static bool is_cxl_mem_dev(struct pci_dev *dev)
> +{
> + /*
> +  * The capability, status, and control fields in Device 0,
> +  * Function 0 DVSEC control the CXL functionality of the
> +  * entire device (CXL 3.0, 8.1.3).
> +  */
> + if (dev->devfn != PCI_DEVFN(0, 0))
> + return false;
> +
> + /*
> +  * CXL Memory Devices must have the 502h class code set (CXL
> +  * 3.0, 8.1.12.1).
> +  */
> + if ((dev->class >> 8) != PCI_CLASS_MEMORY_CXL)
> + return false;

I think this is ok for now, but I expect it will want to be something
like dev->is_cxl in the future where that is the cached result of:

dev->is_cxl = 

RE: [PATCH v3 6/6] PCI/AER: Unmask RCEC internal errors to enable RCH downstream port error handling

2023-04-17 Thread Dan Williams
Terry Bowman wrote:
> From: Robert Richter 
> 
> RCEC AER corrected and uncorrectable internal errors (CIE/UIE) are
> disabled by default. [1][2] Enable them to receive CXL downstream port
> errors of a Restricted CXL Host (RCH).
> 
> [1] CXL 3.0 Spec, 12.2.1.1 - RCH Downstream Port Detected Errors
> [2] PCIe Base Spec 6.0, 7.8.4.3 Uncorrectable Error Mask Register,
> 7.8.4.6 Correctable Error Mask Register

My comment on patch5 to make CXL link details a first class property of a
'struct pci_dev':

http://lore.kernel.org/r/643debf5af445_1b6629...@dwillia2-xfh.jf.intel.com.notmuch/

...also applies here.

Other than that nothing more from me on this one beyond what Bjorn and
Jonathan have said. I do agree with Robert about being cautious about
only enabling this for CXL devices for now and not all internal errors
for all AER capable devices globally. The rationale being that CXL
devices are a new link on top of PCIe and abuse/reuse internal errors
when they are conceptually functionally equivalent to PCIe link errors.


RE: [PATCH v3 5/6] PCI/AER: Forward RCH downstream port-detected errors to the CXL.mem dev handler

2023-04-17 Thread Dan Williams
Terry Bowman wrote:
> From: Robert Richter 
> 
> In Restricted CXL Device (RCD) mode a CXL device is exposed as an
> RCiEP, but CXL downstream and upstream ports are not enumerated and
> not visible in the PCIe hierarchy. Protocol and link errors are sent
> to an RCEC.
> 
> Restricted CXL host (RCH) downstream port-detected errors are signaled
> as internal AER errors, either Uncorrectable Internal Error (UIE) or
> Corrected Internal Errors (CIE). The error source is the id of the
> RCEC. A CXL handler must then inspect the error status in various CXL
> registers residing in the dport's component register space (CXL RAS
> cap) or the dport's RCRB (AER ext cap). [1]
> 
> Errors showing up in the RCEC's error handler must be handled and
> connected to the CXL subsystem. Implement this by forwarding the error
> to all CXL devices below the RCEC. Since the entire CXL device is
> controlled only using PCIe Configuration Space of device 0, Function
> 0, only pass it there [2]. These devices have the Memory Device class
> code set (PCI_CLASS_MEMORY_CXL, 502h) and the existing cxl_pci driver
> can implement the handler. In addition to errors directed to the CXL
> endpoint device, the handler must also inspect the CXL downstream
> port's CXL RAS and PCIe AER external capabilities that is connected to
> the device.
> 
> Since CXL downstream port errors are signaled using internal errors,
> the handler requires those errors to be unmasked. This is subject of a
> follow-on patch.
> 
> The reason for choosing this implementation is that a CXL RCEC device
> is bound to the AER port driver, but the driver does not allow it to
> register a custom specific handler to support CXL. Connecting the RCEC
> hard-wired with a CXL handler does not work, as the CXL subsystem
> might not be present all the time. The alternative to add an
> implementation to the portdrv to allow the registration of a custom
> RCEC error handler isn't worth doing it as CXL would be its only user.
> Instead, just check for an CXL RCEC and pass it down to the connected
> CXL device's error handler. With this approach the code can entirely
> be implemented in the PCIe AER driver and is independent of the CXL
> subsystem. The CXL driver only provides the handler.
> 
> [1] CXL 3.0 spec, 12.2.1.1 RCH Downstream Port-detected Errors
> [2] CXL 3.0 spec, 8.1.3 PCIe DVSEC for CXL Devices
> 
> Co-developed-by: Terry Bowman 
> Signed-off-by: Robert Richter 
> Signed-off-by: Terry Bowman 
> Cc: "Oliver O'Halloran" 
> Cc: Bjorn Helgaas 
> Cc: Mahesh J Salgaonkar 
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: linux-...@vger.kernel.org
> ---
>  drivers/pci/pcie/Kconfig |  8 ++
>  drivers/pci/pcie/aer.c   | 61 
>  2 files changed, 69 insertions(+)
> 
> diff --git a/drivers/pci/pcie/Kconfig b/drivers/pci/pcie/Kconfig
> index 228652a59f27..b0dbd864d3a3 100644
> --- a/drivers/pci/pcie/Kconfig
> +++ b/drivers/pci/pcie/Kconfig
> @@ -49,6 +49,14 @@ config PCIEAER_INJECT
> gotten from:
>
> https://git.kernel.org/cgit/linux/kernel/git/gong.chen/aer-inject.git/
>  
> +config PCIEAER_CXL
> + bool "PCI Express CXL RAS support"
> + default y
> + depends on PCIEAER && CXL_PCI
> + help
> +   This enables CXL error handling for Restricted CXL Hosts
> +   (RCHs).
> +
>  #
>  # PCI Express ECRC
>  #
> diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
> index 7a25b62d9e01..171a08fd8ebd 100644
> --- a/drivers/pci/pcie/aer.c
> +++ b/drivers/pci/pcie/aer.c
> @@ -946,6 +946,65 @@ static bool find_source_device(struct pci_dev *parent,
>   return true;
>  }
>  
> +#ifdef CONFIG_PCIEAER_CXL
> +
> +static bool is_cxl_mem_dev(struct pci_dev *dev)
> +{
> + /*
> +  * A CXL device is controlled only using PCIe Configuration
> +  * Space of device 0, Function 0.
> +  */
> + if (dev->devfn != PCI_DEVFN(0, 0))
> + return false;
> +
> + /* Right now there is only a CXL.mem driver */
> + if ((dev->class >> 8) != PCI_CLASS_MEMORY_CXL)
> + return false;
> +
> + return true;
> +}

This part feels broken because most the errors of concern here are CXL
link generic and that can involve CXL.cache and CXL.mem errors on
devices that are not PCI_CLASS_MEMORY_CXL. This situation feels like it
wants formal acknowledgement in 'struct pci_dev' that CXL links ride on
top of PCIe links.

If it were not for RCRBs then the PCI core could just do:

dvsec = pci_find_dvsec_capability(pdev, PCI_DVSEC_VENDOR_ID_CXL,
  CXL_DVSEC_FLEXBUS_PORT);

...at bus scan time to identify devices with active CXL links. RCRBs
unfortunately make it so the link presence can not be detected until a
CXL driver is loaded to read that DVSEC out of MMIO space.

However, I still think that looks like a CXL aware driver registering a
'struct cxl_link' (for lack of a better name) object with a
corresponding PCI device. That link can indicate whether this 

[PATCH] cxl/pci: Move tracepoint definitions to drivers/cxl/core/

2022-12-08 Thread Dan Williams
CXL is using tracepoints for reporting RAS capability register payloads
for AER events, and has plans to use tracepoints for the output payload
of Get Poison List and Get Event Records commands. For organization
purposes it would be nice to keep those all under a single + local CXL
trace system. This also organization also potentially helps in the
future when CXL drivers expand beyond generic memory expanders, however
that would also entail a move away from the expander-specific
cxl_dev_state context, save that for later.

Note that the powerpc-specific drivers/misc/cxl/ also defines a 'cxl'
trace system, however, it is unlikely that a single platform will ever
load both drivers simultaneously.

Cc: Steven Rostedt 
Signed-off-by: Dan Williams 
---
This patch is targeting v6.3.  I am sending it out now to enable the
in-flight Event and Poison list patch sets to build upon. It will not
move to a non-rebasing branch until after v6.2-rc2, but in the meantime
I can throw it out on the list and the cxl/preview branch.

 drivers/cxl/core/Makefile  |3 +
 drivers/cxl/core/pci.c |  112 
 drivers/cxl/core/trace.c   |5 ++
 drivers/cxl/core/trace.h   |   11 ++--
 drivers/cxl/cxl.h  |2 +
 drivers/cxl/cxlpci.h   |3 +
 drivers/cxl/pci.c  |  111 
 tools/testing/cxl/Kbuild   |2 +
 8 files changed, 131 insertions(+), 118 deletions(-)
 create mode 100644 drivers/cxl/core/trace.c
 rename include/trace/events/cxl.h => drivers/cxl/core/trace.h (94%)

diff --git a/drivers/cxl/core/Makefile b/drivers/cxl/core/Makefile
index 79c7257f4107..ca4ae31d8f57 100644
--- a/drivers/cxl/core/Makefile
+++ b/drivers/cxl/core/Makefile
@@ -3,6 +3,8 @@ obj-$(CONFIG_CXL_BUS) += cxl_core.o
 obj-$(CONFIG_CXL_SUSPEND) += suspend.o
 
 ccflags-y += -I$(srctree)/drivers/cxl
+CFLAGS_trace.o = -DTRACE_INCLUDE_PATH=. -I$(src)
+
 cxl_core-y := port.o
 cxl_core-y += pmem.o
 cxl_core-y += regs.o
@@ -10,4 +12,5 @@ cxl_core-y += memdev.o
 cxl_core-y += mbox.o
 cxl_core-y += pci.o
 cxl_core-y += hdm.o
+cxl_core-$(CONFIG_TRACING) += trace.o
 cxl_core-$(CONFIG_CXL_REGION) += region.o
diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c
index 57764e9cd19d..1d1492440287 100644
--- a/drivers/cxl/core/pci.c
+++ b/drivers/cxl/core/pci.c
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include "core.h"
+#include "trace.h"
 
 /**
  * DOC: cxl core pci
@@ -622,3 +623,114 @@ void read_cdat_data(struct cxl_port *port)
}
 }
 EXPORT_SYMBOL_NS_GPL(read_cdat_data, CXL);
+
+void cxl_cor_error_detected(struct pci_dev *pdev)
+{
+   struct cxl_dev_state *cxlds = pci_get_drvdata(pdev);
+   struct cxl_memdev *cxlmd = cxlds->cxlmd;
+   struct device *dev = >dev;
+   void __iomem *addr;
+   u32 status;
+
+   if (!cxlds->regs.ras)
+   return;
+
+   addr = cxlds->regs.ras + CXL_RAS_CORRECTABLE_STATUS_OFFSET;
+   status = readl(addr);
+   if (status & CXL_RAS_CORRECTABLE_STATUS_MASK) {
+   writel(status & CXL_RAS_CORRECTABLE_STATUS_MASK, addr);
+   trace_cxl_aer_correctable_error(dev, status);
+   }
+}
+EXPORT_SYMBOL_NS_GPL(cxl_cor_error_detected, CXL);
+
+/* CXL spec rev3.0 8.2.4.16.1 */
+static void header_log_copy(struct cxl_dev_state *cxlds, u32 *log)
+{
+   void __iomem *addr;
+   u32 *log_addr;
+   int i, log_u32_size = CXL_HEADERLOG_SIZE / sizeof(u32);
+
+   addr = cxlds->regs.ras + CXL_RAS_HEADER_LOG_OFFSET;
+   log_addr = log;
+
+   for (i = 0; i < log_u32_size; i++) {
+   *log_addr = readl(addr);
+   log_addr++;
+   addr += sizeof(u32);
+   }
+}
+
+/*
+ * Log the state of the RAS status registers and prepare them to log the
+ * next error status. Return 1 if reset needed.
+ */
+static bool cxl_report_and_clear(struct cxl_dev_state *cxlds)
+{
+   struct cxl_memdev *cxlmd = cxlds->cxlmd;
+   struct device *dev = >dev;
+   u32 hl[CXL_HEADERLOG_SIZE_U32];
+   void __iomem *addr;
+   u32 status;
+   u32 fe;
+
+   if (!cxlds->regs.ras)
+   return false;
+
+   addr = cxlds->regs.ras + CXL_RAS_UNCORRECTABLE_STATUS_OFFSET;
+   status = readl(addr);
+   if (!(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK))
+   return false;
+
+   /* If multiple errors, log header points to first error from ctrl reg */
+   if (hweight32(status) > 1) {
+   addr = cxlds->regs.ras + CXL_RAS_CAP_CONTROL_OFFSET;
+   fe = BIT(FIELD_GET(CXL_RAS_CAP_CONTROL_FE_MASK, readl(addr)));
+   } else {
+   fe = status;
+   }
+
+   header_log_copy(cxlds, hl);
+   trace_cxl_aer_uncorrectable_error(dev, status, fe, hl);
+   writel(status & CXL_RAS_UNCORRECTABLE_STATUS_MASK, addr);
+
+   return true;
+}
+
+pci_ers_result_t cxl_error_detected(struct p

RE: [PATCH v2] mm, hwpoison: Try to recover from copy-on write faults

2022-10-19 Thread Dan Williams
Tony Luck wrote:
> If the kernel is copying a page as the result of a copy-on-write
> fault and runs into an uncorrectable error, Linux will crash because
> it does not have recovery code for this case where poison is consumed
> by the kernel.
> 
> It is easy to set up a test case. Just inject an error into a private
> page, fork(2), and have the child process write to the page.
> 
> I wrapped that neatly into a test at:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/aegl/ras-tools.git
> 
> just enable ACPI error injection and run:
> 
>   # ./einj_mem-uc -f copy-on-write
> 
> Add a new copy_user_highpage_mc() function that uses copy_mc_to_kernel()
> on architectures where that is available (currently x86 and powerpc).
> When an error is detected during the page copy, return VM_FAULT_HWPOISON
> to caller of wp_page_copy(). This propagates up the call stack. Both x86
> and powerpc have code in their fault handler to deal with this code by
> sending a SIGBUS to the application.
> 
> Note that this patch avoids a system crash and signals the process that
> triggered the copy-on-write action. It does not take any action for the
> memory error that is still in the shared page. To handle that a call to
> memory_failure() is needed. But this cannot be done from wp_page_copy()
> because it holds mmap_lock(). Perhaps the architecture fault handlers
> can deal with this loose end in a subsequent patch?
> 
> On Intel/x86 this loose end will often be handled automatically because
> the memory controller provides an additional notification of the h/w
> poison in memory, the handler for this will call memory_failure(). This
> isn't a 100% solution. If there are multiple errors, not all may be
> logged in this way.
> 
> Signed-off-by: Tony Luck 

Just some minor comments below, but you can add:

Reviewed-by: Dan Williams 

> 
> ---
> Changes in V2:
>Naoya Horiguchi:
>   1) Use -EHWPOISON error code instead of minus one.
>   2) Poison path needs also to deal with old_page
>Tony Luck:
>   Rewrote commit message
>   Added some powerpc folks to Cc: list
> ---
>  include/linux/highmem.h | 19 +++
>  mm/memory.c | 28 +++-
>  2 files changed, 38 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/highmem.h b/include/linux/highmem.h
> index e9912da5441b..5967541fbf0e 100644
> --- a/include/linux/highmem.h
> +++ b/include/linux/highmem.h
> @@ -319,6 +319,25 @@ static inline void copy_user_highpage(struct page *to, 
> struct page *from,
>  
>  #endif
>  
> +static inline int copy_user_highpage_mc(struct page *to, struct page *from,
> + unsigned long vaddr, struct 
> vm_area_struct *vma)
> +{
> + unsigned long ret = 0;
> +#ifdef copy_mc_to_kernel
> + char *vfrom, *vto;
> +
> + vfrom = kmap_local_page(from);
> + vto = kmap_local_page(to);
> + ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE);
> + kunmap_local(vto);
> + kunmap_local(vfrom);
> +#else
> + copy_user_highpage(to, from, vaddr, vma);
> +#endif
> +
> + return ret;
> +}
> +

There is likely some small benefit of doing this the idiomatic way and
let grep see that there are multiple definitions of
copy_user_highpage_mc() with an organization like:

#ifdef copy_mc_to_kernel
static inline int copy_user_highpage_mc(struct page *to, struct page *from,
unsigned long vaddr,
struct vm_area_struct *vma)
{
unsigned long ret = 0;
char *vfrom, *vto;
 
vfrom = kmap_local_page(from);
vto = kmap_local_page(to);
ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE);
kunmap_local(vto);
kunmap_local(vfrom);
 
return ret;
}
#else
static inline int copy_user_highpage_mc(struct page *to, struct page *from,
unsigned long vaddr,
struct vm_area_struct *vma)
{   
copy_user_highpage(to, from, vaddr, vma);
return 0;
}
#endif

Per the copy_mc* discussion with Linus I would have called this function
copy_mc_to_user_highpage() to clarify that hwpoison is handled from the
source buffer of the copy.

>  #ifndef __HAVE_ARCH_COPY_HIGHPAGE
>  
>  static inline void copy_highpage(struct page *to, struct page *from)
> diff --git a/mm/memory.c b/mm/memory.c
> index f88c351aecd4..a32556c9b689 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2848,8 +2848,14 @@ static inline int pte_unmap_same(struct vm_fault *vmf)
>   return same;
>  }
>  
> -static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
> -

Re: [PATCH 2/7] mm: Free device private pages have zero refcount

2022-09-29 Thread Dan Williams
Alistair Popple wrote:
> 
> Dan Williams  writes:
> 
> > Alistair Popple wrote:
> >>
> >> Jason Gunthorpe  writes:
> >>
> >> > On Mon, Sep 26, 2022 at 04:03:06PM +1000, Alistair Popple wrote:
> >> >> Since 27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page
> >> >> refcount") device private pages have no longer had an extra reference
> >> >> count when the page is in use. However before handing them back to the
> >> >> owning device driver we add an extra reference count such that free
> >> >> pages have a reference count of one.
> >> >>
> >> >> This makes it difficult to tell if a page is free or not because both
> >> >> free and in use pages will have a non-zero refcount. Instead we should
> >> >> return pages to the drivers page allocator with a zero reference count.
> >> >> Kernel code can then safely use kernel functions such as
> >> >> get_page_unless_zero().
> >> >>
> >> >> Signed-off-by: Alistair Popple 
> >> >> ---
> >> >>  arch/powerpc/kvm/book3s_hv_uvmem.c   | 1 +
> >> >>  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 1 +
> >> >>  drivers/gpu/drm/nouveau/nouveau_dmem.c   | 1 +
> >> >>  lib/test_hmm.c   | 1 +
> >> >>  mm/memremap.c| 5 -
> >> >>  mm/page_alloc.c  | 6 ++
> >> >>  6 files changed, 10 insertions(+), 5 deletions(-)
> >> >
> >> > I think this is a great idea, but I'm surprised no dax stuff is
> >> > touched here?
> >>
> >> free_zone_device_page() shouldn't be called for pgmap->type ==
> >> MEMORY_DEVICE_FS_DAX so I don't think we should have to worry about DAX
> >> there. Except that the folio code looks like it might have introduced a
> >> bug. AFAICT put_page() always calls
> >> put_devmap_managed_page(>page) but folio_put() does not (although
> >> folios_put() does!). So it seems folio_put() won't end up calling
> >> __put_devmap_managed_page_refs() as I think it should.
> >>
> >> I think you're right about the change to __init_zone_device_page() - I
> >> should limit it to DEVICE_PRIVATE/COHERENT pages only. But I need to
> >> look at Dan's patch series more closely as I suspect it might be better
> >> to rebase this patch on top of that.
> >
> > Apologies for the delay I was travelling the past few days. Yes, I think
> > this patch slots in nicely to avoid the introduction of an init_mode
> > [1]:
> >
> > https://lore.kernel.org/nvdimm/166329940343.2786261.6047770378829215962.st...@dwillia2-xfh.jf.intel.com/
> >
> > Mind if I steal it into my series?
> 
> No problem, although I notice Andrew has already merged it into
> mm-unstable. If you end up rebasing your series on top of mine I think
> all that's needed is a patch somewhere in your series to drop the
> various `if (pgmap->type == MEMORY_DEVICE_*)` I added to (hopefully)
> avoid breaking DAX. Assuming DAX takes a pagemap reference on struct
> page allocation something like below.

Yeah, I'll go that route and rebase on top of -mm.

Thanks again.


Re: [PATCH 2/7] mm: Free device private pages have zero refcount

2022-09-29 Thread Dan Williams
Alistair Popple wrote:
> 
> Jason Gunthorpe  writes:
> 
> > On Mon, Sep 26, 2022 at 04:03:06PM +1000, Alistair Popple wrote:
> >> Since 27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page
> >> refcount") device private pages have no longer had an extra reference
> >> count when the page is in use. However before handing them back to the
> >> owning device driver we add an extra reference count such that free
> >> pages have a reference count of one.
> >>
> >> This makes it difficult to tell if a page is free or not because both
> >> free and in use pages will have a non-zero refcount. Instead we should
> >> return pages to the drivers page allocator with a zero reference count.
> >> Kernel code can then safely use kernel functions such as
> >> get_page_unless_zero().
> >>
> >> Signed-off-by: Alistair Popple 
> >> ---
> >>  arch/powerpc/kvm/book3s_hv_uvmem.c   | 1 +
> >>  drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 1 +
> >>  drivers/gpu/drm/nouveau/nouveau_dmem.c   | 1 +
> >>  lib/test_hmm.c   | 1 +
> >>  mm/memremap.c| 5 -
> >>  mm/page_alloc.c  | 6 ++
> >>  6 files changed, 10 insertions(+), 5 deletions(-)
> >
> > I think this is a great idea, but I'm surprised no dax stuff is
> > touched here?
> 
> free_zone_device_page() shouldn't be called for pgmap->type ==
> MEMORY_DEVICE_FS_DAX so I don't think we should have to worry about DAX
> there. Except that the folio code looks like it might have introduced a
> bug. AFAICT put_page() always calls
> put_devmap_managed_page(>page) but folio_put() does not (although
> folios_put() does!). So it seems folio_put() won't end up calling
> __put_devmap_managed_page_refs() as I think it should.
> 
> I think you're right about the change to __init_zone_device_page() - I
> should limit it to DEVICE_PRIVATE/COHERENT pages only. But I need to
> look at Dan's patch series more closely as I suspect it might be better
> to rebase this patch on top of that.

Apologies for the delay I was travelling the past few days. Yes, I think
this patch slots in nicely to avoid the introduction of an init_mode
[1]:

https://lore.kernel.org/nvdimm/166329940343.2786261.6047770378829215962.st...@dwillia2-xfh.jf.intel.com/

Mind if I steal it into my series?


Re: [linux-next] Build failure drivers/cxl/cxl_pmem (powerpc)

2022-07-29 Thread Dan Williams
Michael Ellerman wrote:
> Sachin Sant  writes:
> > Linux-next (5.19.0-rc8-next-20220728) fails to build on powerpc with
> > following error:
> >
> > ERROR: modpost: "memory_add_physaddr_to_nid" [drivers/cxl/cxl_pmem.ko] 
> > undefined!
> > make[1]: *** [scripts/Makefile.modpost:128: modules-only.symvers] Error 1
> >
> > The code in question was last changed by following patch:
> >
> > commit 04ad63f086d1
> >cxl/region: Introduce cxl_pmem_region objects
> 
> This should fix it.
> 
> Dan, do you want to apply that on top of your tree to reduce the window
> of breakage?

Yes, will apply, thanks.


RE: [REPOST PATCH] ndtest: Cleanup all of blk namespace specific code

2022-07-12 Thread Dan Williams
Shivaprasad G Bhat wrote:
> With the nd_namespace_blk and nd_blk_region infrastructures being removed,
> the ndtest still has some references to the old code. So the
> compilation fails as below,
> 
> ../tools/testing/nvdimm/test/ndtest.c:204:25: error: 
> ‘ND_DEVICE_NAMESPACE_BLK’ undeclared here (not in a function); did you mean 
> ‘ND_DEVICE_NAMESPACE_IO’?
>   204 | .type = ND_DEVICE_NAMESPACE_BLK,
>   | ^~~
>   | ND_DEVICE_NAMESPACE_IO
> ../tools/testing/nvdimm/test/ndtest.c: In function ‘ndtest_create_region’:
> ../tools/testing/nvdimm/test/ndtest.c:630:17: error: ‘ndbr_desc’ undeclared 
> (first use in this function); did you mean ‘ndr_desc’?
>   630 | ndbr_desc.enable = ndtest_blk_region_enable;
>   | ^
>   | ndr_desc
> ../tools/testing/nvdimm/test/ndtest.c:630:17: note: each undeclared 
> identifier is reported only once for each function it appears in
> ../tools/testing/nvdimm/test/ndtest.c:630:36: error: 
> ‘ndtest_blk_region_enable’ undeclared (first use in this function)
>   630 | ndbr_desc.enable = ndtest_blk_region_enable;
>   |^~~~
> ../tools/testing/nvdimm/test/ndtest.c:631:35: error: ‘ndtest_blk_do_io’ 
> undeclared (first use in this function); did you mean ‘ndtest_blk_mmio’?
>   631 | ndbr_desc.do_io = ndtest_blk_do_io;
>   |   ^~~~
>   |   ndtest_blk_mmio
> 
> The current patch removes the specific code to cleanup all obsolete
> references.
> 
> Signed-off-by: Shivaprasad G Bhat 

Looks good, applied.



[GIT PULL] LIBNVDIMM update for v5.18

2022-03-29 Thread Dan Williams
Hi Linus, please pull from:

  git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
tags/libnvdimm-for-5.18

...to receive the libnvdimm update for this cycle which includes the
deprecation of block-aperture mode and a new perf events interface for
the papr_scm nvdimm driver. The perf events approach was acked by
PeterZ. You will notice the top commit is less than a week old as
linux-next exposure identified some build failure scenarios. Kajol
turned around a fix and it has appeared in linux-next with no
additional reports. Some other fixups for the removal of
block-aperture mode also generated some follow-on fixes from -next
exposure.

I am not aware of anything else outstanding, please pull.

---

The following changes since commit 754e0b0e35608ed5206d6a67a791563c631cec07:

  Linux 5.17-rc4 (2022-02-13 12:13:30 -0800)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm
tags/libnvdimm-for-5.18

for you to fetch changes up to ada8d8d337ee970860c9844126e634df8076aa11:

  nvdimm/blk: Fix title level (2022-03-23 17:52:33 -0700)


libnvdimm for 5.18

- Add perf support for nvdimm events, initially only for 'papr_scm'
  devices.

- Deprecate the 'block aperture' support in libnvdimm, it only ever
  existed in the specification, not in shipping product.


Dan Williams (6):
  nvdimm/region: Fix default alignment for small regions
  nvdimm/blk: Delete the block-aperture window driver
  nvdimm/namespace: Delete blk namespace consideration in shared paths
  nvdimm/namespace: Delete nd_namespace_blk
  ACPI: NFIT: Remove block aperture support
  nvdimm/region: Delete nd_blk_region infrastructure

Kajol Jain (6):
  drivers/nvdimm: Add nvdimm pmu structure
  drivers/nvdimm: Add perf interface to expose nvdimm performance stats
  powerpc/papr_scm: Add perf interface support
  docs: ABI: sysfs-bus-nvdimm: Document sysfs event format entries
for nvdimm pmu
  drivers/nvdimm: Fix build failure when CONFIG_PERF_EVENTS is not set
  powerpc/papr_scm: Fix build failure when

Lukas Bulwahn (1):
  MAINTAINERS: remove section LIBNVDIMM BLK: MMIO-APERTURE DRIVER

Tom Rix (1):
  nvdimm/blk: Fix title level

 Documentation/ABI/testing/sysfs-bus-nvdimm |  35 ++
 Documentation/driver-api/nvdimm/nvdimm.rst | 406 +--
 MAINTAINERS|  11 -
 arch/powerpc/include/asm/device.h  |   5 +
 arch/powerpc/platforms/pseries/papr_scm.c  | 230 +
 drivers/acpi/nfit/core.c   | 387 +-
 drivers/acpi/nfit/nfit.h   |   6 -
 drivers/nvdimm/Kconfig |  25 +-
 drivers/nvdimm/Makefile|   4 +-
 drivers/nvdimm/blk.c   | 335 ---
 drivers/nvdimm/bus.c   |   2 -
 drivers/nvdimm/dimm_devs.c | 204 +---
 drivers/nvdimm/label.c | 346 +---
 drivers/nvdimm/label.h |   5 +-
 drivers/nvdimm/namespace_devs.c| 506 ++---
 drivers/nvdimm/nd-core.h   |  27 +-
 drivers/nvdimm/nd.h|  13 -
 drivers/nvdimm/nd_perf.c   | 329 +++
 drivers/nvdimm/region.c|  31 +-
 drivers/nvdimm/region_devs.c   | 157 ++---
 include/linux/libnvdimm.h  |  24 --
 include/linux/nd.h |  78 +++--
 include/uapi/linux/ndctl.h |   2 -
 tools/testing/nvdimm/Kbuild|   4 -
 tools/testing/nvdimm/config_check.c|   1 -
 tools/testing/nvdimm/test/ndtest.c |  67 +---
 tools/testing/nvdimm/test/nfit.c   |  23 --
 27 files changed, 833 insertions(+), 2430 deletions(-)
 delete mode 100644 drivers/nvdimm/blk.c
 create mode 100644 drivers/nvdimm/nd_perf.c


Re: [PATCH 2/2] powerpc/papr_scm: Fix build failure when CONFIG_PERF_EVENTS is not set

2022-03-23 Thread Dan Williams
On Wed, Mar 23, 2022 at 3:05 AM Michael Ellerman  wrote:
>
> Dan Williams  writes:
> > On Tue, Mar 22, 2022 at 7:30 AM kajoljain  wrote:
> >> On 3/22/22 03:09, Dan Williams wrote:
> >> > On Fri, Mar 18, 2022 at 4:42 AM Kajol Jain  wrote:
> >> >>
> >> >> The following build failure occures when CONFIG_PERF_EVENTS is not set
> >> >> as generic pmu functions are not visible in that scenario.
> >> >>
> >> >> arch/powerpc/platforms/pseries/papr_scm.c:372:35: error: ‘struct 
> >> >> perf_event’ has no member named ‘attr’
> >> >>  p->nvdimm_events_map[event->attr.config],
> >> >>^~
> >> >> In file included from ./include/linux/list.h:5,
> >> >>  from ./include/linux/kobject.h:19,
> >> >>  from ./include/linux/of.h:17,
> >> >>  from arch/powerpc/platforms/pseries/papr_scm.c:5:
> >> >> arch/powerpc/platforms/pseries/papr_scm.c: In function 
> >> >> ‘papr_scm_pmu_event_init’:
> >> >> arch/powerpc/platforms/pseries/papr_scm.c:389:49: error: ‘struct 
> >> >> perf_event’ has no member named ‘pmu’
> >> >>   struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
> >> >>  ^~
> >> >> ./include/linux/container_of.h:18:26: note: in definition of macro 
> >> >> ‘container_of’
> >> >>   void *__mptr = (void *)(ptr); \
> >> >>   ^~~
> >> >> arch/powerpc/platforms/pseries/papr_scm.c:389:30: note: in expansion of 
> >> >> macro ‘to_nvdimm_pmu’
> >> >>   struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
> >> >>   ^
> >> >> In file included from ./include/linux/bits.h:22,
> >> >>  from ./include/linux/bitops.h:6,
> >> >>  from ./include/linux/of.h:15,
> >> >>  from arch/powerpc/platforms/pseries/papr_scm.c:5:
> >> >>
> >> >> Fix the build issue by adding check for CONFIG_PERF_EVENTS config option
> >> >> and disabling the papr_scm perf interface support incase this config
> >> >> is not set
> >> >>
> >> >> Fixes: 4c08d4bbc089 ("powerpc/papr_scm: Add perf interface support") 
> >> >> (Commit id
> >> >> based on linux-next tree)
> >> >> Signed-off-by: Kajol Jain 
> >> >> ---
> >> >>  arch/powerpc/platforms/pseries/papr_scm.c | 15 +++
> >> >
> >> > This is a bit messier than I would have liked mainly because it dumps
> >> > a bunch of ifdefery into a C file contrary to coding style, "Wherever
> >> > possible, don't use preprocessor conditionals (#if, #ifdef) in .c
> >> > files". I would expect this all to move to an organization like:
> >>
> >> Hi Dan,
> >>   Thanks for reviewing the patches. Inorder to avoid the multiple
> >> ifdefs checks, we can also add stub function for papr_scm_pmu_register.
> >> With that change we will just have one ifdef check for
> >> CONFIG_PERF_EVENTS config in both papr_scm.c and nd.h file. Hence we can
> >> avoid adding new files specific for papr_scm perf interface.
> >>
> >> Below is the code snippet for that change, let me know if looks fine to
> >> you. I tested it
> >> with set/unset PAPR_SCM config value and set/unset PERF_EVENTS config
> >> value combinations.
> >>
> >> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c
> >> b/arch/powerpc/platforms/pseries/papr_scm.c
> >> index 4dd513d7c029..38fabb44d3c3 100644
> >> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> >> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> >> @@ -69,8 +69,6 @@
> >>  #define PAPR_SCM_PERF_STATS_EYECATCHER __stringify(SCMSTATS)
> >>  #define PAPR_SCM_PERF_STATS_VERSION 0x1
> >>
> >> -#define to_nvdimm_pmu(_pmu)container_of(_pmu, struct nvdimm_pmu, pmu)
> >> -
> >>  /* Struct holding a single performance metric */
> >>  struct papr_scm_perf_stat {
> >> u8 stat_id[8];
> >> @@ -346,6 +344,9 @@ static ssize_t drc_pmem_query_stats(struct
> >> papr_scm_priv *p,
> >> return 0;
> >>  }
> >>
> >> +#ifdef CON

Re: [PATCH 2/2] powerpc/papr_scm: Fix build failure when CONFIG_PERF_EVENTS is not set

2022-03-22 Thread Dan Williams
On Tue, Mar 22, 2022 at 7:30 AM kajoljain  wrote:
>
>
>
> On 3/22/22 03:09, Dan Williams wrote:
> > On Fri, Mar 18, 2022 at 4:42 AM Kajol Jain  wrote:
> >>
> >> The following build failure occures when CONFIG_PERF_EVENTS is not set
> >> as generic pmu functions are not visible in that scenario.
> >>
> >> arch/powerpc/platforms/pseries/papr_scm.c:372:35: error: ‘struct 
> >> perf_event’ has no member named ‘attr’
> >>  p->nvdimm_events_map[event->attr.config],
> >>^~
> >> In file included from ./include/linux/list.h:5,
> >>  from ./include/linux/kobject.h:19,
> >>  from ./include/linux/of.h:17,
> >>  from arch/powerpc/platforms/pseries/papr_scm.c:5:
> >> arch/powerpc/platforms/pseries/papr_scm.c: In function 
> >> ‘papr_scm_pmu_event_init’:
> >> arch/powerpc/platforms/pseries/papr_scm.c:389:49: error: ‘struct 
> >> perf_event’ has no member named ‘pmu’
> >>   struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
> >>  ^~
> >> ./include/linux/container_of.h:18:26: note: in definition of macro 
> >> ‘container_of’
> >>   void *__mptr = (void *)(ptr); \
> >>   ^~~
> >> arch/powerpc/platforms/pseries/papr_scm.c:389:30: note: in expansion of 
> >> macro ‘to_nvdimm_pmu’
> >>   struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
> >>   ^
> >> In file included from ./include/linux/bits.h:22,
> >>  from ./include/linux/bitops.h:6,
> >>  from ./include/linux/of.h:15,
> >>  from arch/powerpc/platforms/pseries/papr_scm.c:5:
> >>
> >> Fix the build issue by adding check for CONFIG_PERF_EVENTS config option
> >> and disabling the papr_scm perf interface support incase this config
> >> is not set
> >>
> >> Fixes: 4c08d4bbc089 ("powerpc/papr_scm: Add perf interface support") 
> >> (Commit id
> >> based on linux-next tree)
> >> Signed-off-by: Kajol Jain 
> >> ---
> >>  arch/powerpc/platforms/pseries/papr_scm.c | 15 +++
> >
> > This is a bit messier than I would have liked mainly because it dumps
> > a bunch of ifdefery into a C file contrary to coding style, "Wherever
> > possible, don't use preprocessor conditionals (#if, #ifdef) in .c
> > files". I would expect this all to move to an organization like:
>
> Hi Dan,
>   Thanks for reviewing the patches. Inorder to avoid the multiple
> ifdefs checks, we can also add stub function for papr_scm_pmu_register.
> With that change we will just have one ifdef check for
> CONFIG_PERF_EVENTS config in both papr_scm.c and nd.h file. Hence we can
> avoid adding new files specific for papr_scm perf interface.
>
> Below is the code snippet for that change, let me know if looks fine to
> you. I tested it
> with set/unset PAPR_SCM config value and set/unset PERF_EVENTS config
> value combinations.
>
> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c
> b/arch/powerpc/platforms/pseries/papr_scm.c
> index 4dd513d7c029..38fabb44d3c3 100644
> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> @@ -69,8 +69,6 @@
>  #define PAPR_SCM_PERF_STATS_EYECATCHER __stringify(SCMSTATS)
>  #define PAPR_SCM_PERF_STATS_VERSION 0x1
>
> -#define to_nvdimm_pmu(_pmu)container_of(_pmu, struct nvdimm_pmu, pmu)
> -
>  /* Struct holding a single performance metric */
>  struct papr_scm_perf_stat {
> u8 stat_id[8];
> @@ -346,6 +344,9 @@ static ssize_t drc_pmem_query_stats(struct
> papr_scm_priv *p,
> return 0;
>  }
>
> +#ifdef CONFIG_PERF_EVENTS
> +#define to_nvdimm_pmu(_pmu)container_of(_pmu, struct nvdimm_pmu, pmu)
> +
>  static int papr_scm_pmu_get_value(struct perf_event *event, struct
> device *dev, u64 *count)
>  {
> struct papr_scm_perf_stat *stat;
> @@ -558,6 +559,10 @@ static void papr_scm_pmu_register(struct
> papr_scm_priv *p)
> dev_info(>pdev->dev, "nvdimm pmu didn't register rc=%d\n", rc);
>  }
>
> +#else
> +static inline void papr_scm_pmu_register(struct papr_scm_priv *p) { }

Since this isn't in a header file, it does not need to be marked
"inline" the compiler will figure it out.

> +#endif

It might be time to create:

arch/powerpc/platforms/pseries/papr_scm.h

...there is quite a bit of header material accrued in papr_scm.c and
once the ifdefs start landing in it then it becomes a nominal coding
style issue. That said, this is certainly more palatable than the
previous version. So if you don't want to create papr_scm.h yet for
this, at least make a note in the changelog that the first portion of
arch/powerpc/platforms/pseries/papr_scm.c is effectively papr_scm.h
content and may move there in the future, or something like that.


Re: [PATCH 1/2] drivers/nvdimm: Fix build failure when CONFIG_PERF_EVENTS is not set

2022-03-21 Thread Dan Williams
On Fri, Mar 18, 2022 at 4:42 AM Kajol Jain  wrote:
>
> The following build failure occures when CONFIG_PERF_EVENTS is not set
> as generic pmu functions are not visible in that scenario.
>
> |-- s390-randconfig-r044-20220313
> |   |-- nd_perf.c:(.text):undefined-reference-to-perf_pmu_migrate_context
> |   |-- nd_perf.c:(.text):undefined-reference-to-perf_pmu_register
> |   `-- nd_perf.c:(.text):undefined-reference-to-perf_pmu_unregister
>
> Similar build failure in nds32 architecture:
> nd_perf.c:(.text+0x21e): undefined reference to `perf_pmu_migrate_context'
> nd_perf.c:(.text+0x434): undefined reference to `perf_pmu_register'
> nd_perf.c:(.text+0x57c): undefined reference to `perf_pmu_unregister'
>
> Fix this issue by adding check for CONFIG_PERF_EVENTS config option
> and disabling the nvdimm perf interface incase this config is not set.
>
> Also removed function declaration of perf_pmu_migrate_context,
> perf_pmu_register, perf_pmu_unregister functions from nd.h as these are
> common pmu functions which are part of perf_event.h and since we
> are disabling nvdimm perf interface incase CONFIG_PERF_EVENTS option
> is not set, we not need to declare them in nd.h
>
> Fixes: 0fab1ba6ad6b ("drivers/nvdimm: Add perf interface to expose
> nvdimm performance stats") (Commit id based on linux-next tree)
> Signed-off-by: Kajol Jain 
> Link: https://lore.kernel.org/all/62317124.ybqfu33+s%2fwdvwgj%25...@intel.com/
> Reported-by: kernel test robot 
> ---
>  drivers/nvdimm/Makefile | 2 +-
>  include/linux/nd.h  | 7 ---
>  2 files changed, 5 insertions(+), 4 deletions(-)
>
> ---
> - This fix patch changes are added and tested on top of linux-next tree
>   on the 'next-20220315' branch.
> ---
> diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile
> index 3fb806748716..ba0296dca9db 100644
> --- a/drivers/nvdimm/Makefile
> +++ b/drivers/nvdimm/Makefile
> @@ -15,7 +15,7 @@ nd_e820-y := e820.o
>  libnvdimm-y := core.o
>  libnvdimm-y += bus.o
>  libnvdimm-y += dimm_devs.o
> -libnvdimm-y += nd_perf.o
> +libnvdimm-$(CONFIG_PERF_EVENTS) += nd_perf.o
>  libnvdimm-y += dimm.o
>  libnvdimm-y += region_devs.o
>  libnvdimm-y += region.o
> diff --git a/include/linux/nd.h b/include/linux/nd.h
> index 7b2ccbdc1cbc..a4265eaf5ae8 100644
> --- a/include/linux/nd.h
> +++ b/include/linux/nd.h
> @@ -8,8 +8,10 @@
>  #include 
>  #include 
>  #include 
> +#ifdef CONFIG_PERF_EVENTS
>  #include 

Why must this not be included? Doesn't it already handle the
CONFIG_PERF_EVENTS=n case internally?

>  #include 

I notice now that this platform-device header should have never been
added in the first place, just forward declare:

struct platform_device;

> +#endif
>
>  enum nvdimm_event {
> NVDIMM_REVALIDATE_POISON,
> @@ -25,6 +27,7 @@ enum nvdimm_claim_class {
> NVDIMM_CCLASS_UNKNOWN,
>  };
>
> +#ifdef CONFIG_PERF_EVENTS
>  #define NVDIMM_EVENT_VAR(_id)  event_attr_##_id
>  #define NVDIMM_EVENT_PTR(_id)  (_attr_##_id.attr.attr)

Why must these be inside the ifdef guard?

>
> @@ -63,9 +66,7 @@ extern ssize_t nvdimm_events_sysfs_show(struct device *dev,
>
>  int register_nvdimm_pmu(struct nvdimm_pmu *nvdimm, struct platform_device 
> *pdev);
>  void unregister_nvdimm_pmu(struct nvdimm_pmu *nd_pmu);

Shouldn't there also be stub functions in the CONFIG_PERF_EVENTS=n case?

static inline int register_nvdimm_pmu(struct nvdimm_pmu *nvdimm,
struct platform_device *pdev)
{
return -ENXIO;
}

static inline void unregister_nvdimm_pmu(struct nvdimm_pmu *nd_pmu)
{
}

> -void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu);
> -int perf_pmu_register(struct pmu *pmu, const char *name, int type);
> -void perf_pmu_unregister(struct pmu *pmu);

Yeah, I should have caught these earlier.

> +#endif
>
>  struct nd_device_driver {
> struct device_driver drv;
> --
> 2.31.1
>


Re: [PATCH 2/2] powerpc/papr_scm: Fix build failure when CONFIG_PERF_EVENTS is not set

2022-03-21 Thread Dan Williams
On Mon, Mar 21, 2022 at 2:39 PM Dan Williams  wrote:
>
> On Fri, Mar 18, 2022 at 4:42 AM Kajol Jain  wrote:
> >
> > The following build failure occures when CONFIG_PERF_EVENTS is not set
> > as generic pmu functions are not visible in that scenario.
> >
> > arch/powerpc/platforms/pseries/papr_scm.c:372:35: error: ‘struct 
> > perf_event’ has no member named ‘attr’
> >  p->nvdimm_events_map[event->attr.config],
> >^~
> > In file included from ./include/linux/list.h:5,
> >  from ./include/linux/kobject.h:19,
> >  from ./include/linux/of.h:17,
> >  from arch/powerpc/platforms/pseries/papr_scm.c:5:
> > arch/powerpc/platforms/pseries/papr_scm.c: In function 
> > ‘papr_scm_pmu_event_init’:
> > arch/powerpc/platforms/pseries/papr_scm.c:389:49: error: ‘struct 
> > perf_event’ has no member named ‘pmu’
> >   struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
> >  ^~
> > ./include/linux/container_of.h:18:26: note: in definition of macro 
> > ‘container_of’
> >   void *__mptr = (void *)(ptr); \
> >   ^~~
> > arch/powerpc/platforms/pseries/papr_scm.c:389:30: note: in expansion of 
> > macro ‘to_nvdimm_pmu’
> >   struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
> >   ^
> > In file included from ./include/linux/bits.h:22,
> >  from ./include/linux/bitops.h:6,
> >  from ./include/linux/of.h:15,
> >  from arch/powerpc/platforms/pseries/papr_scm.c:5:
> >
> > Fix the build issue by adding check for CONFIG_PERF_EVENTS config option
> > and disabling the papr_scm perf interface support incase this config
> > is not set
> >
> > Fixes: 4c08d4bbc089 ("powerpc/papr_scm: Add perf interface support") 
> > (Commit id
> > based on linux-next tree)
> > Signed-off-by: Kajol Jain 
> > ---
> >  arch/powerpc/platforms/pseries/papr_scm.c | 15 +++
>
> This is a bit messier than I would have liked mainly because it dumps
> a bunch of ifdefery into a C file contrary to coding style, "Wherever
> possible, don't use preprocessor conditionals (#if, #ifdef) in .c
> files". I would expect this all to move to an organization like:
>
> arch/powerpc/platforms/pseries/papr_scm/main.c
> arch/powerpc/platforms/pseries/papr_scm/perf.c
>
> ...and a new config symbol like:
>
> config PAPR_SCM_PERF
>depends on PAPR_SCM && PERF_EVENTS
>def_bool y
>
> ...with wrappers in header files to make everything compile away
> without any need for main.c to carry an ifdef.
>
> Can you turn a patch like that in the next couple days? Otherwise, I
> think if Linus saw me sending a late breaking compile fix that threw
> coding style out the window he'd have cause to just drop the pull
> request entirely.

Also, please base it on the current state of the libnvdimm-for-next
branch as -next includes some of the SMART health changes leading to
at least one conflict.


Re: [PATCH 2/2] powerpc/papr_scm: Fix build failure when CONFIG_PERF_EVENTS is not set

2022-03-21 Thread Dan Williams
On Fri, Mar 18, 2022 at 4:42 AM Kajol Jain  wrote:
>
> The following build failure occures when CONFIG_PERF_EVENTS is not set
> as generic pmu functions are not visible in that scenario.
>
> arch/powerpc/platforms/pseries/papr_scm.c:372:35: error: ‘struct perf_event’ 
> has no member named ‘attr’
>  p->nvdimm_events_map[event->attr.config],
>^~
> In file included from ./include/linux/list.h:5,
>  from ./include/linux/kobject.h:19,
>  from ./include/linux/of.h:17,
>  from arch/powerpc/platforms/pseries/papr_scm.c:5:
> arch/powerpc/platforms/pseries/papr_scm.c: In function 
> ‘papr_scm_pmu_event_init’:
> arch/powerpc/platforms/pseries/papr_scm.c:389:49: error: ‘struct perf_event’ 
> has no member named ‘pmu’
>   struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
>  ^~
> ./include/linux/container_of.h:18:26: note: in definition of macro 
> ‘container_of’
>   void *__mptr = (void *)(ptr); \
>   ^~~
> arch/powerpc/platforms/pseries/papr_scm.c:389:30: note: in expansion of macro 
> ‘to_nvdimm_pmu’
>   struct nvdimm_pmu *nd_pmu = to_nvdimm_pmu(event->pmu);
>   ^
> In file included from ./include/linux/bits.h:22,
>  from ./include/linux/bitops.h:6,
>  from ./include/linux/of.h:15,
>  from arch/powerpc/platforms/pseries/papr_scm.c:5:
>
> Fix the build issue by adding check for CONFIG_PERF_EVENTS config option
> and disabling the papr_scm perf interface support incase this config
> is not set
>
> Fixes: 4c08d4bbc089 ("powerpc/papr_scm: Add perf interface support") (Commit 
> id
> based on linux-next tree)
> Signed-off-by: Kajol Jain 
> ---
>  arch/powerpc/platforms/pseries/papr_scm.c | 15 +++

This is a bit messier than I would have liked mainly because it dumps
a bunch of ifdefery into a C file contrary to coding style, "Wherever
possible, don't use preprocessor conditionals (#if, #ifdef) in .c
files". I would expect this all to move to an organization like:

arch/powerpc/platforms/pseries/papr_scm/main.c
arch/powerpc/platforms/pseries/papr_scm/perf.c

...and a new config symbol like:

config PAPR_SCM_PERF
   depends on PAPR_SCM && PERF_EVENTS
   def_bool y

...with wrappers in header files to make everything compile away
without any need for main.c to carry an ifdef.

Can you turn a patch like that in the next couple days? Otherwise, I
think if Linus saw me sending a late breaking compile fix that threw
coding style out the window he'd have cause to just drop the pull
request entirely.


Re: linux-next: manual merge of the nvdimm tree with the powerpc tree

2022-03-15 Thread Dan Williams
On Tue, Mar 15, 2022 at 4:21 AM Michael Ellerman  wrote:
>
> Stephen Rothwell  writes:
> > Hi all,
> >
> > Today's linux-next merge of the nvdimm tree got a conflict in:
> >
> >   arch/powerpc/platforms/pseries/papr_scm.c
> >
> > between commit:
> >
> >   bbbca72352bb ("powerpc/papr_scm: Implement initial support for injecting 
> > smart errors")
> >
> > from the powerpc tree and commit:
> >
> >   4c08d4bbc089 ("powerpc/papr_scm: Add perf interface support")
> >
> > from the nvdimm tree.
> >
> > I fixed it up (see below) and can carry the fix as necessary. This
> > is now fixed as far as linux-next is concerned, but any non trivial
> > conflicts should be mentioned to your upstream maintainer when your tree
> > is submitted for merging.  You may also want to consider cooperating
> > with the maintainer of the conflicting tree to minimise any particularly
> > complex conflicts.
>
> Thanks, resolution looks obviously correct.
>
> Dan, this seems benign to me, I don't think any further action is
> required other than mentioning it to Linus.

Yes, it looks ok to me.


Re: [PATCH v7 0/4] Add perf interface to expose nvdimm

2022-03-09 Thread Dan Williams
On Mon, Mar 7, 2022 at 9:27 PM kajoljain  wrote:
>
> Hi Dan,
> Can you take this patch-set if it looks fine to you.
>

Pushed out to my libnvdimm-pending branch for a 0day confirmation
before heading over to linux-next.


Re: [PATCH v6 0/4] Add perf interface to expose nvdimm

2022-02-23 Thread Dan Williams
On Wed, Feb 23, 2022 at 11:07 AM Dan Williams  wrote:
>
> On Fri, Feb 18, 2022 at 10:06 AM Dan Williams  
> wrote:
> >
> > On Thu, Feb 17, 2022 at 8:34 AM Kajol Jain  wrote:
> > >
> > > Patchset adds performance stats reporting support for nvdimm.
> > > Added interface includes support for pmu register/unregister
> > > functions. A structure is added called nvdimm_pmu to be used for
> > > adding arch/platform specific data such as cpumask, nvdimm device
> > > pointer and pmu event functions like event_init/add/read/del.
> > > User could use the standard perf tool to access perf events
> > > exposed via pmu.
> > >
> > > Interface also defines supported event list, config fields for the
> > > event attributes and their corresponding bit values which are exported
> > > via sysfs. Patch 3 exposes IBM pseries platform nmem* device
> > > performance stats using this interface.
> > >
> > > Result from power9 pseries lpar with 2 nvdimm device:
> > >
> > > Ex: List all event by perf list
> > >
> > > command:# perf list nmem
> > >
> > >   nmem0/cache_rh_cnt/[Kernel PMU event]
> > >   nmem0/cache_wh_cnt/[Kernel PMU event]
> > >   nmem0/cri_res_util/[Kernel PMU event]
> > >   nmem0/ctl_res_cnt/ [Kernel PMU event]
> > >   nmem0/ctl_res_tm/  [Kernel PMU event]
> > >   nmem0/fast_w_cnt/  [Kernel PMU event]
> > >   nmem0/host_l_cnt/  [Kernel PMU event]
> > >   nmem0/host_l_dur/  [Kernel PMU event]
> > >   nmem0/host_s_cnt/  [Kernel PMU event]
> > >   nmem0/host_s_dur/  [Kernel PMU event]
> > >   nmem0/med_r_cnt/   [Kernel PMU event]
> > >   nmem0/med_r_dur/   [Kernel PMU event]
> > >   nmem0/med_w_cnt/   [Kernel PMU event]
> > >   nmem0/med_w_dur/   [Kernel PMU event]
> > >   nmem0/mem_life/[Kernel PMU event]
> > >   nmem0/poweron_secs/[Kernel PMU event]
> > >   ...
> > >   nmem1/mem_life/[Kernel PMU event]
> > >   nmem1/poweron_secs/[Kernel PMU event]
> > >
> > > Patch1:
> > > Introduces the nvdimm_pmu structure
> > > Patch2:
> > > Adds common interface to add arch/platform specific data
> > > includes nvdimm device pointer, pmu data along with
> > > pmu event functions. It also defines supported event list
> > > and adds attribute groups for format, events and cpumask.
> > > It also adds code for cpu hotplug support.
> > > Patch3:
> > > Add code in arch/powerpc/platform/pseries/papr_scm.c to expose
> > > nmem* pmu. It fills in the nvdimm_pmu structure with pmu name,
> > > capabilities, cpumask and event functions and then registers
> > > the pmu by adding callbacks to register_nvdimm_pmu.
> > > Patch4:
> > > Sysfs documentation patch
> > >
> > > Changelog
> > > ---
> > > Resend v5 -> v6
> > > - No logic change, just a rebase to latest upstream and
> > >   tested the patchset.
> > >
> > > - Link to the patchset Resend v5: https://lkml.org/lkml/2021/11/15/3979
> > >
> > > v5 -> Resend v5
> > > - Resend the patchset
> > >
> > > - Link to the patchset v5: https://lkml.org/lkml/2021/9/28/643
> > >
> > > v4 -> v5:
> > > - Remove multiple variables defined in nvdimm_pmu structure include
> > >   name and pmu functions(event_int/add/del/read) as they are just
> > >   used to copy them again in pmu variable. Now we are directly doing
> > >   this step in arch specific code as suggested by Dan Williams.
> > >
> > > - Remove attribute group field from nvdimm pmu structure and
> > >   defined these attribute groups in common interface which
> > >   includes format, event list along with cpumask as suggested by
> > >   Dan Williams.
> > >   Since we added static defination for attrbute groups needed in
> > >   common interfac

Re: [PATCH v6 0/4] Add perf interface to expose nvdimm

2022-02-23 Thread Dan Williams
On Fri, Feb 18, 2022 at 10:06 AM Dan Williams  wrote:
>
> On Thu, Feb 17, 2022 at 8:34 AM Kajol Jain  wrote:
> >
> > Patchset adds performance stats reporting support for nvdimm.
> > Added interface includes support for pmu register/unregister
> > functions. A structure is added called nvdimm_pmu to be used for
> > adding arch/platform specific data such as cpumask, nvdimm device
> > pointer and pmu event functions like event_init/add/read/del.
> > User could use the standard perf tool to access perf events
> > exposed via pmu.
> >
> > Interface also defines supported event list, config fields for the
> > event attributes and their corresponding bit values which are exported
> > via sysfs. Patch 3 exposes IBM pseries platform nmem* device
> > performance stats using this interface.
> >
> > Result from power9 pseries lpar with 2 nvdimm device:
> >
> > Ex: List all event by perf list
> >
> > command:# perf list nmem
> >
> >   nmem0/cache_rh_cnt/[Kernel PMU event]
> >   nmem0/cache_wh_cnt/[Kernel PMU event]
> >   nmem0/cri_res_util/[Kernel PMU event]
> >   nmem0/ctl_res_cnt/ [Kernel PMU event]
> >   nmem0/ctl_res_tm/  [Kernel PMU event]
> >   nmem0/fast_w_cnt/  [Kernel PMU event]
> >   nmem0/host_l_cnt/  [Kernel PMU event]
> >   nmem0/host_l_dur/  [Kernel PMU event]
> >   nmem0/host_s_cnt/  [Kernel PMU event]
> >   nmem0/host_s_dur/  [Kernel PMU event]
> >   nmem0/med_r_cnt/   [Kernel PMU event]
> >   nmem0/med_r_dur/   [Kernel PMU event]
> >   nmem0/med_w_cnt/   [Kernel PMU event]
> >   nmem0/med_w_dur/   [Kernel PMU event]
> >   nmem0/mem_life/[Kernel PMU event]
> >   nmem0/poweron_secs/[Kernel PMU event]
> >   ...
> >   nmem1/mem_life/[Kernel PMU event]
> >   nmem1/poweron_secs/[Kernel PMU event]
> >
> > Patch1:
> > Introduces the nvdimm_pmu structure
> > Patch2:
> > Adds common interface to add arch/platform specific data
> > includes nvdimm device pointer, pmu data along with
> > pmu event functions. It also defines supported event list
> > and adds attribute groups for format, events and cpumask.
> > It also adds code for cpu hotplug support.
> > Patch3:
> > Add code in arch/powerpc/platform/pseries/papr_scm.c to expose
> > nmem* pmu. It fills in the nvdimm_pmu structure with pmu name,
> > capabilities, cpumask and event functions and then registers
> > the pmu by adding callbacks to register_nvdimm_pmu.
> > Patch4:
> > Sysfs documentation patch
> >
> > Changelog
> > ---
> > Resend v5 -> v6
> > - No logic change, just a rebase to latest upstream and
> >   tested the patchset.
> >
> > - Link to the patchset Resend v5: https://lkml.org/lkml/2021/11/15/3979
> >
> > v5 -> Resend v5
> > - Resend the patchset
> >
> > - Link to the patchset v5: https://lkml.org/lkml/2021/9/28/643
> >
> > v4 -> v5:
> > - Remove multiple variables defined in nvdimm_pmu structure include
> >   name and pmu functions(event_int/add/del/read) as they are just
> >   used to copy them again in pmu variable. Now we are directly doing
> >   this step in arch specific code as suggested by Dan Williams.
> >
> > - Remove attribute group field from nvdimm pmu structure and
> >   defined these attribute groups in common interface which
> >   includes format, event list along with cpumask as suggested by
> >   Dan Williams.
> >   Since we added static defination for attrbute groups needed in
> >   common interface, removes corresponding code from papr.
> >
> > - Add nvdimm pmu event list with event codes in the common interface.
> >
> > - Remove Acked-by/Reviewed-by/Tested-by tags as code is refactored
> >   to handle review comments from Dan.
>
> I don't think review comments should invalidate the Acked-by tags in
> this case. Nothing fundamentally changed in the approach, and I would
> like to have the perf ack before taking this through the nvdimm tree.
>
> Otherwise this looks good to me.
>
> Peter, might you have a chance to re-Ack this series, or any concerns
> about me retrieving those Acks from the previous postings?

Reached Peter offline and he refreshed his Acked-by.


Re: [PATCH v6 0/4] Add perf interface to expose nvdimm

2022-02-18 Thread Dan Williams
On Thu, Feb 17, 2022 at 8:34 AM Kajol Jain  wrote:
>
> Patchset adds performance stats reporting support for nvdimm.
> Added interface includes support for pmu register/unregister
> functions. A structure is added called nvdimm_pmu to be used for
> adding arch/platform specific data such as cpumask, nvdimm device
> pointer and pmu event functions like event_init/add/read/del.
> User could use the standard perf tool to access perf events
> exposed via pmu.
>
> Interface also defines supported event list, config fields for the
> event attributes and their corresponding bit values which are exported
> via sysfs. Patch 3 exposes IBM pseries platform nmem* device
> performance stats using this interface.
>
> Result from power9 pseries lpar with 2 nvdimm device:
>
> Ex: List all event by perf list
>
> command:# perf list nmem
>
>   nmem0/cache_rh_cnt/[Kernel PMU event]
>   nmem0/cache_wh_cnt/[Kernel PMU event]
>   nmem0/cri_res_util/[Kernel PMU event]
>   nmem0/ctl_res_cnt/ [Kernel PMU event]
>   nmem0/ctl_res_tm/  [Kernel PMU event]
>   nmem0/fast_w_cnt/  [Kernel PMU event]
>   nmem0/host_l_cnt/  [Kernel PMU event]
>   nmem0/host_l_dur/  [Kernel PMU event]
>   nmem0/host_s_cnt/  [Kernel PMU event]
>   nmem0/host_s_dur/  [Kernel PMU event]
>   nmem0/med_r_cnt/   [Kernel PMU event]
>   nmem0/med_r_dur/   [Kernel PMU event]
>   nmem0/med_w_cnt/   [Kernel PMU event]
>   nmem0/med_w_dur/   [Kernel PMU event]
>   nmem0/mem_life/[Kernel PMU event]
>   nmem0/poweron_secs/[Kernel PMU event]
>   ...
>   nmem1/mem_life/[Kernel PMU event]
>   nmem1/poweron_secs/[Kernel PMU event]
>
> Patch1:
> Introduces the nvdimm_pmu structure
> Patch2:
> Adds common interface to add arch/platform specific data
> includes nvdimm device pointer, pmu data along with
> pmu event functions. It also defines supported event list
> and adds attribute groups for format, events and cpumask.
> It also adds code for cpu hotplug support.
> Patch3:
> Add code in arch/powerpc/platform/pseries/papr_scm.c to expose
> nmem* pmu. It fills in the nvdimm_pmu structure with pmu name,
> capabilities, cpumask and event functions and then registers
> the pmu by adding callbacks to register_nvdimm_pmu.
> Patch4:
> Sysfs documentation patch
>
> Changelog
> ---
> Resend v5 -> v6
> - No logic change, just a rebase to latest upstream and
>   tested the patchset.
>
> - Link to the patchset Resend v5: https://lkml.org/lkml/2021/11/15/3979
>
> v5 -> Resend v5
> - Resend the patchset
>
> - Link to the patchset v5: https://lkml.org/lkml/2021/9/28/643
>
> v4 -> v5:
> - Remove multiple variables defined in nvdimm_pmu structure include
>   name and pmu functions(event_int/add/del/read) as they are just
>   used to copy them again in pmu variable. Now we are directly doing
>   this step in arch specific code as suggested by Dan Williams.
>
> - Remove attribute group field from nvdimm pmu structure and
>   defined these attribute groups in common interface which
>   includes format, event list along with cpumask as suggested by
>   Dan Williams.
>   Since we added static defination for attrbute groups needed in
>   common interface, removes corresponding code from papr.
>
> - Add nvdimm pmu event list with event codes in the common interface.
>
> - Remove Acked-by/Reviewed-by/Tested-by tags as code is refactored
>   to handle review comments from Dan.

I don't think review comments should invalidate the Acked-by tags in
this case. Nothing fundamentally changed in the approach, and I would
like to have the perf ack before taking this through the nvdimm tree.

Otherwise this looks good to me.

Peter, might you have a chance to re-Ack this series, or any concerns
about me retrieving those Acks from the previous postings?


Re: [PATCH 06/13] nvdimm/blk: avoid calling del_gendisk() on early failures

2021-11-02 Thread Dan Williams
On Tue, Nov 2, 2021 at 5:10 PM Luis Chamberlain  wrote:
>
> On Fri, Oct 15, 2021 at 05:13:48PM -0700, Dan Williams wrote:
> > On Fri, Oct 15, 2021 at 4:53 PM Luis Chamberlain  wrote:
> > >
> > > If nd_integrity_init() fails we'd get del_gendisk() called,
> > > but that's not correct as we should only call that if we're
> > > done with device_add_disk(). Fix this by providing unwinding
> > > prior to the devm call being registered and moving the devm
> > > registration to the very end.
> > >
> > > This should fix calling del_gendisk() if nd_integrity_init()
> > > fails. I only spotted this issue through code inspection. It
> > > does not fix any real world bug.
> > >
> >
> > Just fyi, I'm preparing patches to delete this driver completely as it
> > is unused by any shipping platform. I hope to get that removal into
> > v5.16.
>
> Curious if are you going to nuking it on v5.16? Otherwise it would stand
> in the way of the last few patches to add __must_check for the final
> add_disk() error handling changes.

True, I don't think I can get it nuked in time, so you can add my
Reviewed-by for this one.


Re: [PATCH 04/13] nvdimm/btt: use goto error labels on btt_blk_init()

2021-10-31 Thread Dan Williams
On Fri, Oct 15, 2021 at 4:53 PM Luis Chamberlain  wrote:
>
> This will make it easier to share common error paths.
>
> Signed-off-by: Luis Chamberlain 
> ---
>  drivers/nvdimm/btt.c | 19 ---
>  1 file changed, 12 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
> index 29cc7325e890..23ee8c005db5 100644
> --- a/drivers/nvdimm/btt.c
> +++ b/drivers/nvdimm/btt.c
> @@ -1520,10 +1520,11 @@ static int btt_blk_init(struct btt *btt)
>  {
> struct nd_btt *nd_btt = btt->nd_btt;
> struct nd_namespace_common *ndns = nd_btt->ndns;
> +   int rc = -ENOMEM;
>
> btt->btt_disk = blk_alloc_disk(NUMA_NO_NODE);
> if (!btt->btt_disk)
> -   return -ENOMEM;
> +   goto out;

I tend to not use a goto when there is nothing to unwind.

The rest looks good to me. After dropping "goto out;" you can add:

Reviewed-by: Dan Williams 


Re: [PATCH 03/13] nvdimm/btt: do not call del_gendisk() if not needed

2021-10-31 Thread Dan Williams
On Fri, Oct 15, 2021 at 4:53 PM Luis Chamberlain  wrote:
>
> We know we don't need del_gendisk() if we haven't added
> the disk, so just skip it. This should fix a bug on older
> kernels, as del_gendisk() became able to deal with
> disks not added only recently, after the patch titled
> "block: add flag for add_disk() completion notation".

Perhaps put this in:

commit $abbrev_commit ("block: add flag for add_disk() completion notation")

...format, but I can't seem to find that commit?

If you're touching the changelog how about one that clarifies the
impact and drops "we"?

"del_gendisk() is not required if the disk has not been added. On
kernels prior to commit $abbrev_commit ("block: add flag for
add_disk() completion notation")
it is mandatory to not call del_gendisk() if the underlying device has
not been through device_add()."

Fixes: 41cd8b70c37a ("libnvdimm, btt: add support for blk integrity")

With that you can add:

Reviewed-by: Dan Williams 


Re: [PATCH 06/13] nvdimm/blk: avoid calling del_gendisk() on early failures

2021-10-15 Thread Dan Williams
On Fri, Oct 15, 2021 at 4:53 PM Luis Chamberlain  wrote:
>
> If nd_integrity_init() fails we'd get del_gendisk() called,
> but that's not correct as we should only call that if we're
> done with device_add_disk(). Fix this by providing unwinding
> prior to the devm call being registered and moving the devm
> registration to the very end.
>
> This should fix calling del_gendisk() if nd_integrity_init()
> fails. I only spotted this issue through code inspection. It
> does not fix any real world bug.
>

Just fyi, I'm preparing patches to delete this driver completely as it
is unused by any shipping platform. I hope to get that removal into
v5.16.


[PATCH v3 10/10] ocxl: Use pci core's DVSEC functionality

2021-10-09 Thread Dan Williams
From: Ben Widawsky 

Reduce maintenance burden of DVSEC query implementation by using the
centralized PCI core implementation.

There are two obvious places to simply drop in the new core
implementation. There remains find_dvsec_from_pos() which would benefit
from using a core implementation. As that change is less trivial it is
reserved for later.

Cc: linuxppc-dev@lists.ozlabs.org
Cc: Andrew Donnellan 
Acked-by: Frederic Barrat  (v1)
Signed-off-by: Ben Widawsky 
Reviewed-by: Andrew Donnellan 
Signed-off-by: Dan Williams 
---
 arch/powerpc/platforms/powernv/ocxl.c |3 ++-
 drivers/misc/ocxl/config.c|   13 +
 2 files changed, 3 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/ocxl.c 
b/arch/powerpc/platforms/powernv/ocxl.c
index 9105efcf242a..28b009b46464 100644
--- a/arch/powerpc/platforms/powernv/ocxl.c
+++ b/arch/powerpc/platforms/powernv/ocxl.c
@@ -107,7 +107,8 @@ static int get_max_afu_index(struct pci_dev *dev, int 
*afu_idx)
int pos;
u32 val;
 
-   pos = find_dvsec_from_pos(dev, OCXL_DVSEC_FUNC_ID, 0);
+   pos = pci_find_dvsec_capability(dev, PCI_VENDOR_ID_IBM,
+   OCXL_DVSEC_FUNC_ID);
if (!pos)
return -ESRCH;
 
diff --git a/drivers/misc/ocxl/config.c b/drivers/misc/ocxl/config.c
index a68738f38252..e401a51596b9 100644
--- a/drivers/misc/ocxl/config.c
+++ b/drivers/misc/ocxl/config.c
@@ -33,18 +33,7 @@
 
 static int find_dvsec(struct pci_dev *dev, int dvsec_id)
 {
-   int vsec = 0;
-   u16 vendor, id;
-
-   while ((vsec = pci_find_next_ext_capability(dev, vsec,
-   OCXL_EXT_CAP_ID_DVSEC))) {
-   pci_read_config_word(dev, vsec + OCXL_DVSEC_VENDOR_OFFSET,
-   );
-   pci_read_config_word(dev, vsec + OCXL_DVSEC_ID_OFFSET, );
-   if (vendor == PCI_VENDOR_ID_IBM && id == dvsec_id)
-   return vsec;
-   }
-   return 0;
+   return pci_find_dvsec_capability(dev, PCI_VENDOR_ID_IBM, dvsec_id);
 }
 
 static int find_dvsec_afu_ctrl(struct pci_dev *dev, u8 afu_idx)



[PATCH v3 08/10] PCI: Add pci_find_dvsec_capability to find designated VSEC

2021-10-09 Thread Dan Williams
From: Ben Widawsky 

Add pci_find_dvsec_capability to locate a Designated Vendor-Specific
Extended Capability with the specified Vendor ID and Capability ID.

The Designated Vendor-Specific Extended Capability (DVSEC) allows one or
more "vendor" specific capabilities that are not tied to the Vendor ID
of the PCI component. Where the DVSEC Vendor may be a standards body
like CXL.

Cc: David E. Box 
Cc: Jonathan Cameron 
Cc: Bjorn Helgaas 
Cc: Dan Williams 
Cc: linux-...@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Cc: Andrew Donnellan 
Cc: Lu Baolu 
Reviewed-by: Frederic Barrat 
Signed-off-by: Ben Widawsky 
Reviewed-by: Andrew Donnellan 
Acked-by: Bjorn Helgaas 
Tested-by: Kan Liang 
Signed-off-by: Kan Liang 
Reviewed-by: Jonathan Cameron 
Signed-off-by: Dan Williams 
---
 drivers/pci/pci.c   |   32 
 include/linux/pci.h |1 +
 2 files changed, 33 insertions(+)

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index ce2ab62b64cf..94ac86ff28b0 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -732,6 +732,38 @@ u16 pci_find_vsec_capability(struct pci_dev *dev, u16 
vendor, int cap)
 }
 EXPORT_SYMBOL_GPL(pci_find_vsec_capability);
 
+/**
+ * pci_find_dvsec_capability - Find DVSEC for vendor
+ * @dev: PCI device to query
+ * @vendor: Vendor ID to match for the DVSEC
+ * @dvsec: Designated Vendor-specific capability ID
+ *
+ * If DVSEC has Vendor ID @vendor and DVSEC ID @dvsec return the capability
+ * offset in config space; otherwise return 0.
+ */
+u16 pci_find_dvsec_capability(struct pci_dev *dev, u16 vendor, u16 dvsec)
+{
+   int pos;
+
+   pos = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_DVSEC);
+   if (!pos)
+   return 0;
+
+   while (pos) {
+   u16 v, id;
+
+   pci_read_config_word(dev, pos + PCI_DVSEC_HEADER1, );
+   pci_read_config_word(dev, pos + PCI_DVSEC_HEADER2, );
+   if (vendor == v && dvsec == id)
+   return pos;
+
+   pos = pci_find_next_ext_capability(dev, pos, 
PCI_EXT_CAP_ID_DVSEC);
+   }
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(pci_find_dvsec_capability);
+
 /**
  * pci_find_parent_resource - return resource region of parent bus of given
  *   region
diff --git a/include/linux/pci.h b/include/linux/pci.h
index cd8aa6fce204..c93ccfa4571b 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1130,6 +1130,7 @@ u16 pci_find_ext_capability(struct pci_dev *dev, int cap);
 u16 pci_find_next_ext_capability(struct pci_dev *dev, u16 pos, int cap);
 struct pci_bus *pci_find_next_bus(const struct pci_bus *from);
 u16 pci_find_vsec_capability(struct pci_dev *dev, u16 vendor, int cap);
+u16 pci_find_dvsec_capability(struct pci_dev *dev, u16 vendor, u16 dvsec);
 
 u64 pci_get_dsn(struct pci_dev *dev);
 



[PATCH v3 00/10] cxl_pci refactor for reusability

2021-10-09 Thread Dan Williams
Changes since v2 [1]:
- Rework some of the changelogs per feedback (Bjorn, and I)
- Move the cxl_register_map refactor earlier in the series to make the
  cxl_setup_pci_regs() refactor easier to read.
- Fix a bug added in v5.14 for handling the error return case
  cxl_pci_map_regblock()
- Split the addition of @base to cxl_register_map to its own patch
- Drop the cxl_pci_dvsec() wrapper (Christoph)
- Drop the SIOV conversion patch given Baolu's feedback about it being
  dead code

[1]: https://lore.kernel.org/r/20210923172647.72738-1-ben.widaw...@intel.com

---
I am helping out with the review feedback on this set while Ben is
focusing on region provisioning. It appears this rework will be suitable
to just carry in cxl/next, no need to make a cross-tree dependency for
"PCI: Add pci_find_dvsec_capability to find designated VSEC" at this
time.

Ben's original cover:

Provide the ability to obtain CXL register blocks as discrete functionality.
This functionality will become useful for other CXL drivers that need access to
CXL register blocks. It is also in line with other additions to core which moves
register mapping functionality.

At the introduction of the CXL driver the only user of CXL MMIO was cxl_pci
(then known as cxl_mem). As the driver has evolved it is clear that cxl_pci will
not be the only entity that needs access to CXL MMIO. This series stops short of
moving the generalized functionality into cxl_core for the sake of getting eyes
on the important foundational bits sooner rather than later. The ultimate plan
is to move much of the code into cxl_core.

Via review of two previous patches [1] & [2] it has been suggested that the bits
which are being used for DVSEC enumeration move into PCI core. As CXL core is
soon going to require these, let's try to get the ball rolling now on making
that happen.

---

[1]: 
https://lore.kernel.org/linux-pci/20210913190131.xiiszmno46qie...@intel.com/
[2]: 
https://lore.kernel.org/linux-cxl/20210920225638.1729482-1-ben.widaw...@intel.com/
[3]: 
https://lore.kernel.org/linux-cxl/20210921220459.2437386-1-ben.widaw...@intel.com/

---

Ben Widawsky (8):
  cxl/pci: Convert register block identifiers to an enum
  cxl/pci: Remove dev_dbg for unknown register blocks
  cxl/pci: Remove pci request/release regions
  cxl/pci: Make more use of cxl_register_map
  cxl/pci: Split cxl_pci_setup_regs()
  PCI: Add pci_find_dvsec_capability to find designated VSEC
  cxl/pci: Use pci core's DVSEC functionality
  ocxl: Use pci core's DVSEC functionality

Dan Williams (2):
  cxl/pci: Fix NULL vs ERR_PTR confusion
  cxl/pci: Add @base to cxl_register_map


 arch/powerpc/platforms/powernv/ocxl.c |3 -
 drivers/cxl/cxl.h |1 
 drivers/cxl/pci.c |  157 +
 drivers/cxl/pci.h |   14 ++-
 drivers/misc/ocxl/config.c|   13 ---
 drivers/pci/pci.c |   32 +++
 include/linux/pci.h   |1 
 7 files changed, 105 insertions(+), 116 deletions(-)

base-commit: ed97afb53365cd03dde266c9644334a558fe5a16


Re: [PATCH v2 9/9] iommu/vt-d: Use pci core's DVSEC functionality

2021-09-28 Thread Dan Williams
On Thu, Sep 23, 2021 at 10:27 AM Ben Widawsky  wrote:
>
> Reduce maintenance burden of DVSEC query implementation by using the
> centralized PCI core implementation.
>
> Cc: io...@lists.linux-foundation.org
> Cc: David Woodhouse 
> Cc: Lu Baolu 
> Signed-off-by: Ben Widawsky 
> ---
>  drivers/iommu/intel/iommu.c | 15 +--
>  1 file changed, 1 insertion(+), 14 deletions(-)
>
> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> index d75f59ae28e6..30c97181f0ae 100644
> --- a/drivers/iommu/intel/iommu.c
> +++ b/drivers/iommu/intel/iommu.c
> @@ -5398,20 +5398,7 @@ static int intel_iommu_disable_sva(struct device *dev)
>   */
>  static int siov_find_pci_dvsec(struct pci_dev *pdev)
>  {
> -   int pos;
> -   u16 vendor, id;
> -
> -   pos = pci_find_next_ext_capability(pdev, 0, 0x23);
> -   while (pos) {
> -   pci_read_config_word(pdev, pos + 4, );
> -   pci_read_config_word(pdev, pos + 8, );
> -   if (vendor == PCI_VENDOR_ID_INTEL && id == 5)
> -   return pos;
> -
> -   pos = pci_find_next_ext_capability(pdev, pos, 0x23);
> -   }
> -
> -   return 0;
> +   return pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_INTEL, 5);
>  }

Same comments as the CXL patch, siov_find_pci_dvsec() doesn't seem to
have a reason to exist anymore. What is 5?


Re: [PATCH v2 7/9] cxl/pci: Use pci core's DVSEC functionality

2021-09-28 Thread Dan Williams
On Thu, Sep 23, 2021 at 10:27 AM Ben Widawsky  wrote:
>
> Reduce maintenance burden of DVSEC query implementation by using the
> centralized PCI core implementation.
>
> Signed-off-by: Ben Widawsky 
> ---
>  drivers/cxl/pci.c | 20 +---
>  1 file changed, 1 insertion(+), 19 deletions(-)
>
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index 5eaf2736f779..79d4d9b16d83 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -340,25 +340,7 @@ static void cxl_pci_unmap_regblock(struct cxl_mem *cxlm, 
> struct cxl_register_map
>
>  static int cxl_pci_dvsec(struct pci_dev *pdev, int dvsec)

cxl_pci_dvsec() has no reason to exist anymore. Let's just have the
caller use pci_find_dvsec_capability() directly.


Re: [PATCH v2 5/9] cxl/pci: Make more use of cxl_register_map

2021-09-28 Thread Dan Williams
On Thu, Sep 23, 2021 at 10:27 AM Ben Widawsky  wrote:
>
> The structure exists to pass around information about register mapping.
> Using it more extensively cleans up many existing functions.

I would have liked to have seen "add @base to cxl_register_map" and
"use @map for @bar and @offset arguments" somewhere in this changelog
to set expectations for what changes are included. That would have
also highlighted that adding a @base to cxl_register_map deserves its
own patch vs the conversion of @bar and @offset to instead use
@map->bar and @map->offset. Can you resend with that split and those
mentions?


Re: [PATCH v2 4/9] cxl/pci: Refactor cxl_pci_setup_regs

2021-09-28 Thread Dan Williams
On Thu, Sep 23, 2021 at 10:27 AM Ben Widawsky  wrote:
>
> In preparation for moving parts of register mapping to cxl_core, the
> cxl_pci driver is refactored to utilize a new helper to find register
> blocks by type.
>
> cxl_pci scanned through all register blocks and mapping the ones that
> the driver will use. This logic is inverted so that the driver
> specifically requests the register blocks from a new helper. Under the
> hood, the same implementation of scanning through all register locator
> DVSEC entries exists.
>
> There are 2 behavioral changes (#2 is arguable):
> 1. A dev_err is introduced if cxl_map_regs fails.
> 2. The previous logic would try to map component registers and device
>registers multiple times if there were present and keep the mapping
>of the last one found (furthest offset in the register locator).
>While this is disallowed in the spec, CXL 2.0 8.1.9: "Each register
>block identifier shall only occur once in the Register Locator DVSEC
>structure" it was how the driver would respond to the spec violation.
>The new logic will take the first found register block by type and
>move on.
>
> Signed-off-by: Ben Widawsky 
>
> ---
> Changes since v1:

No changes? Luckily git am strips this section...

Overall I think this refactor can be broken down further for
readability and cleanup the long standing problem that the driver maps
component registers for no reason. The main contributing factor to
readability is that cxl_setup_pci_regs() still exists after the
refactor, which also contributes to the component register problem. If
the register mapping is up leveled to the caller of
cxl_setup_pci_regs() (and drops mapping component registers) then a
follow-on patch to rename cxl_setup_pci_regs to find_register_block
becomes easier to read. Moving the cxl_register_map array out of
cxl_setup_pci_regs() also makes a later patch to pass component
register enumeration details to the endpoint-port that much cleaner.


Re: [PATCH v2 3/9] cxl/pci: Remove pci request/release regions

2021-09-28 Thread Dan Williams
On Thu, Sep 23, 2021 at 10:26 AM Ben Widawsky  wrote:
>
> Quoting Dan, "... the request + release regions should probably just be
> dropped. It's not like any of the register enumeration would collide
> with someone else who already has the registers mapped. The collision
> only comes when the registers are mapped for their final usage, and that
> will have more precision in the request."

Looks good to me:

Reviewed-by: Dan Williams 

>
> Recommended-by: Dan Williams 

This isn't one of the canonical tags:
Documentation/process/submitting-patches.rst

I'll change this to Suggested-by:

> Signed-off-by: Ben Widawsky 
> ---
>  drivers/cxl/pci.c | 5 -
>  1 file changed, 5 deletions(-)
>
> diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
> index ccc7c2573ddc..7256c236fdb3 100644
> --- a/drivers/cxl/pci.c
> +++ b/drivers/cxl/pci.c
> @@ -453,9 +453,6 @@ static int cxl_pci_setup_regs(struct cxl_mem *cxlm)
> return -ENXIO;
> }
>
> -   if (pci_request_mem_regions(pdev, pci_name(pdev)))
> -   return -ENODEV;
> -
> /* Get the size of the Register Locator DVSEC */
> pci_read_config_dword(pdev, regloc + PCI_DVSEC_HEADER1, _size);
> regloc_size = FIELD_GET(PCI_DVSEC_HEADER1_LENGTH_MASK, regloc_size);
> @@ -499,8 +496,6 @@ static int cxl_pci_setup_regs(struct cxl_mem *cxlm)
> n_maps++;
> }
>
> -   pci_release_mem_regions(pdev);
> -
> for (i = 0; i < n_maps; i++) {
> ret = cxl_map_regs(cxlm, [i]);
> if (ret)
> --
> 2.33.0
>


Re: [PATCH v2 2/9] cxl/pci: Remove dev_dbg for unknown register blocks

2021-09-28 Thread Dan Williams
On Thu, Sep 23, 2021 at 10:27 AM Ben Widawsky  wrote:
>
> While interesting to driver developers, the dev_dbg message doesn't do
> much except clutter up logs. This information should be attainable
> through sysfs, and someday lspci like utilities. This change
> additionally helps reduce the LOC in a subsequent patch to refactor some
> of cxl_pci register mapping.

Looks good to me:

Reviewed-by: Dan Williams 


Re: [PATCH v2 1/9] cxl: Convert "RBI" to enum

2021-09-27 Thread Dan Williams
Please spell out "register block indicator" in the subject so that the
shortlog remains somewhat readable.

On Thu, Sep 23, 2021 at 10:27 AM Ben Widawsky  wrote:
>
> In preparation for passing around the Register Block Indicator (RBI) as
> a parameter, it is desirable to convert the type to an enum so that the
> interface can use a well defined type checked parameter.

C wouldn't type check this unless it failed an integer conversion,
right? It would need to be a struct to get useful type checking.

I don't mind this for the self documenting properties it has for the
functions that will take this as a parameter, but maybe clarify what
you mean by type checked parameter?

>
> As a result of this change, should future versions of the spec add
> sparsely defined identifiers, it could become a problem if checking for
> invalid identifiers since the code currently checks for the max
> identifier. This is not an issue with current spec, and the algorithm to
> obtain the register blocks will change before those possible additions
> are made.

In general let's not spend changelog space trying to guess what future
specs may or may not do. I.e. I think this text can be dropped,
especially because enums can support sparse number spaces.


Re: [PATCH 7/7] ocxl: Use pci core's DVSEC functionality

2021-09-21 Thread Dan Williams
On Tue, Sep 21, 2021 at 3:05 PM Ben Widawsky  wrote:
>
> Reduce maintenance burden of DVSEC query implementation by using the
> centralized PCI core implementation.
>
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: Frederic Barrat 
> Cc: Andrew Donnellan 
> Signed-off-by: Ben Widawsky 
> ---
>  drivers/misc/ocxl/config.c | 13 +
>  1 file changed, 1 insertion(+), 12 deletions(-)
>
> diff --git a/drivers/misc/ocxl/config.c b/drivers/misc/ocxl/config.c
> index a68738f38252..e401a51596b9 100644
> --- a/drivers/misc/ocxl/config.c
> +++ b/drivers/misc/ocxl/config.c
> @@ -33,18 +33,7 @@
>
>  static int find_dvsec(struct pci_dev *dev, int dvsec_id)
>  {
> -   int vsec = 0;
> -   u16 vendor, id;
> -
> -   while ((vsec = pci_find_next_ext_capability(dev, vsec,
> -   OCXL_EXT_CAP_ID_DVSEC))) {
> -   pci_read_config_word(dev, vsec + OCXL_DVSEC_VENDOR_OFFSET,
> -   );
> -   pci_read_config_word(dev, vsec + OCXL_DVSEC_ID_OFFSET, );
> -   if (vendor == PCI_VENDOR_ID_IBM && id == dvsec_id)
> -   return vsec;
> -   }
> -   return 0;
> +   return pci_find_dvsec_capability(dev, PCI_VENDOR_ID_IBM, dvsec_id);
>  }

What about:

arch/powerpc/platforms/powernv/ocxl.c::find_dvsec_from_pos()

...?  With that converted the redundant definitions below:

OCXL_EXT_CAP_ID_DVSEC
OCXL_DVSEC_VENDOR_OFFSET
OCXL_DVSEC_ID_OFFSET

...can be cleaned up in favor of the core definitions.


Re: [RESEND PATCH v4 1/4] drivers/nvdimm: Add nvdimm pmu structure

2021-09-14 Thread Dan Williams
On Tue, Sep 14, 2021 at 9:08 PM Dan Williams  wrote:
>
> On Thu, Sep 9, 2021 at 12:56 AM kajoljain  wrote:
> >
> >
> >
> > On 9/8/21 3:29 AM, Dan Williams wrote:
> > > Hi Kajol,
> > >
> > > Apologies for the delay in responding to this series, some comments below:
> >
> > Hi Dan,
> > No issues, thanks for reviewing the patches.
> >
> > >
> > > On Thu, Sep 2, 2021 at 10:10 PM Kajol Jain  wrote:
> > >>
> > >> A structure is added, called nvdimm_pmu, for performance
> > >> stats reporting support of nvdimm devices. It can be used to add
> > >> nvdimm pmu data such as supported events and pmu event functions
> > >> like event_init/add/read/del with cpu hotplug support.
> > >>
> > >> Acked-by: Peter Zijlstra (Intel) 
> > >> Reviewed-by: Madhavan Srinivasan 
> > >> Tested-by: Nageswara R Sastry 
> > >> Signed-off-by: Kajol Jain 
> > >> ---
> > >>  include/linux/nd.h | 43 +++
> > >>  1 file changed, 43 insertions(+)
> > >>
> > >> diff --git a/include/linux/nd.h b/include/linux/nd.h
> > >> index ee9ad76afbba..712499cf7335 100644
> > >> --- a/include/linux/nd.h
> > >> +++ b/include/linux/nd.h
> > >> @@ -8,6 +8,8 @@
> > >>  #include 
> > >>  #include 
> > >>  #include 
> > >> +#include 
> > >> +#include 
> > >>
> > >>  enum nvdimm_event {
> > >> NVDIMM_REVALIDATE_POISON,
> > >> @@ -23,6 +25,47 @@ enum nvdimm_claim_class {
> > >> NVDIMM_CCLASS_UNKNOWN,
> > >>  };
> > >>
> > >> +/* Event attribute array index */
> > >> +#define NVDIMM_PMU_FORMAT_ATTR 0
> > >> +#define NVDIMM_PMU_EVENT_ATTR  1
> > >> +#define NVDIMM_PMU_CPUMASK_ATTR2
> > >> +#define NVDIMM_PMU_NULL_ATTR   3
> > >> +
> > >> +/**
> > >> + * struct nvdimm_pmu - data structure for nvdimm perf driver
> > >> + *
> > >> + * @name: name of the nvdimm pmu device.
> > >> + * @pmu: pmu data structure for nvdimm performance stats.
> > >> + * @dev: nvdimm device pointer.
> > >> + * @functions(event_init/add/del/read): platform specific pmu functions.
> > >
> > > This is not valid kernel-doc:
> > >
> > > include/linux/nd.h:67: warning: Function parameter or member
> > > 'event_init' not described in 'nvdimm_pmu'
> > > include/linux/nd.h:67: warning: Function parameter or member 'add' not
> > > described in 'nvdimm_pmu'
> > > include/linux/nd.h:67: warning: Function parameter or member 'del' not
> > > described in 'nvdimm_pmu'
> > > include/linux/nd.h:67: warning: Function parameter or member 'read'
> > > not described in 'nvdimm_pmu'
> > >
> > > ...but I think rather than fixing those up 'struct nvdimm_pmu' should be 
> > > pruned.
> > >
> > > It's not clear to me that it is worth the effort to describe these
> > > details to the nvdimm core which is just going to turn around and call
> > > the pmu core. I'd just as soon have the driver call the pmu core
> > > directly, optionally passing in attributes and callbacks that come
> > > from the nvdimm core and/or the nvdimm provider.
> >
> > The intend for adding these callbacks(event_init/add/del/read) is to give
> > flexibility to the nvdimm core to add some common checks/routines if 
> > required
> > in the future. Those checks can be common for all architecture with still 
> > having the
> > ability to call arch/platform specific driver code to use its own routines.
> >
> > But as you said, currently we don't have any common checks and it directly
> > calling platform specific code, so we can get rid of it.
> > Should we remove this part for now?
>
> Yes, lets go direct to the perf api for now and await the need for a
> common core wrapper to present itself.
>
> >
> >
> > >
> > > Otherwise it's also not clear which of these structure members are
> > > used at runtime vs purely used as temporary storage to pass parameters
> > > to the pmu core.
> > >
> > >> + * @attr_groups: data structure for events, formats and cpumask
> > >> + * @cpu: designated cpu for counter access.
> > >> + * @node: node for cpu hotplug notifier link.
> > >> + * @cp

Re: [RESEND PATCH v4 1/4] drivers/nvdimm: Add nvdimm pmu structure

2021-09-14 Thread Dan Williams
On Thu, Sep 9, 2021 at 12:56 AM kajoljain  wrote:
>
>
>
> On 9/8/21 3:29 AM, Dan Williams wrote:
> > Hi Kajol,
> >
> > Apologies for the delay in responding to this series, some comments below:
>
> Hi Dan,
> No issues, thanks for reviewing the patches.
>
> >
> > On Thu, Sep 2, 2021 at 10:10 PM Kajol Jain  wrote:
> >>
> >> A structure is added, called nvdimm_pmu, for performance
> >> stats reporting support of nvdimm devices. It can be used to add
> >> nvdimm pmu data such as supported events and pmu event functions
> >> like event_init/add/read/del with cpu hotplug support.
> >>
> >> Acked-by: Peter Zijlstra (Intel) 
> >> Reviewed-by: Madhavan Srinivasan 
> >> Tested-by: Nageswara R Sastry 
> >> Signed-off-by: Kajol Jain 
> >> ---
> >>  include/linux/nd.h | 43 +++
> >>  1 file changed, 43 insertions(+)
> >>
> >> diff --git a/include/linux/nd.h b/include/linux/nd.h
> >> index ee9ad76afbba..712499cf7335 100644
> >> --- a/include/linux/nd.h
> >> +++ b/include/linux/nd.h
> >> @@ -8,6 +8,8 @@
> >>  #include 
> >>  #include 
> >>  #include 
> >> +#include 
> >> +#include 
> >>
> >>  enum nvdimm_event {
> >> NVDIMM_REVALIDATE_POISON,
> >> @@ -23,6 +25,47 @@ enum nvdimm_claim_class {
> >> NVDIMM_CCLASS_UNKNOWN,
> >>  };
> >>
> >> +/* Event attribute array index */
> >> +#define NVDIMM_PMU_FORMAT_ATTR 0
> >> +#define NVDIMM_PMU_EVENT_ATTR  1
> >> +#define NVDIMM_PMU_CPUMASK_ATTR2
> >> +#define NVDIMM_PMU_NULL_ATTR   3
> >> +
> >> +/**
> >> + * struct nvdimm_pmu - data structure for nvdimm perf driver
> >> + *
> >> + * @name: name of the nvdimm pmu device.
> >> + * @pmu: pmu data structure for nvdimm performance stats.
> >> + * @dev: nvdimm device pointer.
> >> + * @functions(event_init/add/del/read): platform specific pmu functions.
> >
> > This is not valid kernel-doc:
> >
> > include/linux/nd.h:67: warning: Function parameter or member
> > 'event_init' not described in 'nvdimm_pmu'
> > include/linux/nd.h:67: warning: Function parameter or member 'add' not
> > described in 'nvdimm_pmu'
> > include/linux/nd.h:67: warning: Function parameter or member 'del' not
> > described in 'nvdimm_pmu'
> > include/linux/nd.h:67: warning: Function parameter or member 'read'
> > not described in 'nvdimm_pmu'
> >
> > ...but I think rather than fixing those up 'struct nvdimm_pmu' should be 
> > pruned.
> >
> > It's not clear to me that it is worth the effort to describe these
> > details to the nvdimm core which is just going to turn around and call
> > the pmu core. I'd just as soon have the driver call the pmu core
> > directly, optionally passing in attributes and callbacks that come
> > from the nvdimm core and/or the nvdimm provider.
>
> The intend for adding these callbacks(event_init/add/del/read) is to give
> flexibility to the nvdimm core to add some common checks/routines if required
> in the future. Those checks can be common for all architecture with still 
> having the
> ability to call arch/platform specific driver code to use its own routines.
>
> But as you said, currently we don't have any common checks and it directly
> calling platform specific code, so we can get rid of it.
> Should we remove this part for now?

Yes, lets go direct to the perf api for now and await the need for a
common core wrapper to present itself.

>
>
> >
> > Otherwise it's also not clear which of these structure members are
> > used at runtime vs purely used as temporary storage to pass parameters
> > to the pmu core.
> >
> >> + * @attr_groups: data structure for events, formats and cpumask
> >> + * @cpu: designated cpu for counter access.
> >> + * @node: node for cpu hotplug notifier link.
> >> + * @cpuhp_state: state for cpu hotplug notification.
> >> + * @arch_cpumask: cpumask to get designated cpu for counter access.
> >> + */
> >> +struct nvdimm_pmu {
> >> +   const char *name;
> >> +   struct pmu pmu;
> >> +   struct device *dev;
> >> +   int (*event_init)(struct perf_event *event);
> >> +   int  (*add)(struct perf_event *event, int flags);
> >> +   void (*del)(struct perf_event *event, int flags);
> >> +   void (*read)(struct perf_event *event);
> &g

Re: [RESEND PATCH v4 4/4] powerpc/papr_scm: Document papr_scm sysfs event format entries

2021-09-07 Thread Dan Williams
On Thu, Sep 2, 2021 at 10:11 PM Kajol Jain  wrote:
>
> Details is added for the event, cpumask and format attributes
> in the ABI documentation.
>
> Acked-by: Peter Zijlstra (Intel) 
> Reviewed-by: Madhavan Srinivasan 
> Tested-by: Nageswara R Sastry 
> Signed-off-by: Kajol Jain 
> ---
>  Documentation/ABI/testing/sysfs-bus-papr-pmem | 31 +++
>  1 file changed, 31 insertions(+)
>
> diff --git a/Documentation/ABI/testing/sysfs-bus-papr-pmem 
> b/Documentation/ABI/testing/sysfs-bus-papr-pmem
> index 95254cec92bf..4d86252448f8 100644
> --- a/Documentation/ABI/testing/sysfs-bus-papr-pmem
> +++ b/Documentation/ABI/testing/sysfs-bus-papr-pmem
> @@ -61,3 +61,34 @@ Description:
> * "CchRHCnt" : Cache Read Hit Count
> * "CchWHCnt" : Cache Write Hit Count
> * "FastWCnt" : Fast Write Count
> +
> +What:  /sys/devices/nmemX/format
> +Date:  June 2021
> +Contact:   linuxppc-dev , 
> nvd...@lists.linux.dev,
> +Description:   (RO) Attribute group to describe the magic bits
> +that go into perf_event_attr.config for a particular pmu.
> +(See ABI/testing/sysfs-bus-event_source-devices-format).
> +
> +Each attribute under this group defines a bit range of the
> +perf_event_attr.config. Supported attribute is listed
> +below::
> +
> +   event  = "config:0-4"  - event ID
> +
> +   For example::
> +   noopstat = "event=0x1"
> +
> +What:  /sys/devices/nmemX/events

That's not a valid sysfs path. Did you mean /sys/bus/nd/devices/nmemX?

> +Date:  June 2021
> +Contact:   linuxppc-dev , 
> nvd...@lists.linux.dev,
> +Description:(RO) Attribute group to describe performance monitoring
> +events specific to papr-scm. Each attribute in this group 
> describes
> +a single performance monitoring event supported by this 
> nvdimm pmu.
> +The name of the file is the name of the event.
> +(See ABI/testing/sysfs-bus-event_source-devices-events).

Given these events are in the generic namespace the ABI documentation
should be generic as well. So I think move these entries to
Documentation/ABI/testing/sysfs-bus-nvdimm directly.

You can still mention papr-scm, but I would expect something like:

What:   /sys/bus/nd/devices/nmemX/events
Date:   September 2021
KernelVersion:  5.16
Contact:Kajol Jain 
Description:
(RO) Attribute group to describe performance monitoring events
for the nvdimm memory device. Each attribute in this group
describes a single performance monitoring event supported by
this nvdimm pmu.  The name of the file is the name of the event.
(See ABI/testing/sysfs-bus-event_source-devices-events). A
listing of the events supported by a given nvdimm provider type
can be found in Documentation/driver-api/nvdimm/$provider, for
example: Documentation/driver-api/nvdimm/papr-scm.



> +
> +What:  /sys/devices/nmemX/cpumask
> +Date:  June 2021
> +Contact:   linuxppc-dev , 
> nvd...@lists.linux.dev,
> +Description:   (RO) This sysfs file exposes the cpumask which is designated 
> to make
> +HCALLs to retrieve nvdimm pmu event counter data.

Seems this one would be provider generic, so no need to refer to PPC
specific concepts like HCALLs.


Re: [RESEND PATCH v4 1/4] drivers/nvdimm: Add nvdimm pmu structure

2021-09-07 Thread Dan Williams
Hi Kajol,

Apologies for the delay in responding to this series, some comments below:

On Thu, Sep 2, 2021 at 10:10 PM Kajol Jain  wrote:
>
> A structure is added, called nvdimm_pmu, for performance
> stats reporting support of nvdimm devices. It can be used to add
> nvdimm pmu data such as supported events and pmu event functions
> like event_init/add/read/del with cpu hotplug support.
>
> Acked-by: Peter Zijlstra (Intel) 
> Reviewed-by: Madhavan Srinivasan 
> Tested-by: Nageswara R Sastry 
> Signed-off-by: Kajol Jain 
> ---
>  include/linux/nd.h | 43 +++
>  1 file changed, 43 insertions(+)
>
> diff --git a/include/linux/nd.h b/include/linux/nd.h
> index ee9ad76afbba..712499cf7335 100644
> --- a/include/linux/nd.h
> +++ b/include/linux/nd.h
> @@ -8,6 +8,8 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
>
>  enum nvdimm_event {
> NVDIMM_REVALIDATE_POISON,
> @@ -23,6 +25,47 @@ enum nvdimm_claim_class {
> NVDIMM_CCLASS_UNKNOWN,
>  };
>
> +/* Event attribute array index */
> +#define NVDIMM_PMU_FORMAT_ATTR 0
> +#define NVDIMM_PMU_EVENT_ATTR  1
> +#define NVDIMM_PMU_CPUMASK_ATTR2
> +#define NVDIMM_PMU_NULL_ATTR   3
> +
> +/**
> + * struct nvdimm_pmu - data structure for nvdimm perf driver
> + *
> + * @name: name of the nvdimm pmu device.
> + * @pmu: pmu data structure for nvdimm performance stats.
> + * @dev: nvdimm device pointer.
> + * @functions(event_init/add/del/read): platform specific pmu functions.

This is not valid kernel-doc:

include/linux/nd.h:67: warning: Function parameter or member
'event_init' not described in 'nvdimm_pmu'
include/linux/nd.h:67: warning: Function parameter or member 'add' not
described in 'nvdimm_pmu'
include/linux/nd.h:67: warning: Function parameter or member 'del' not
described in 'nvdimm_pmu'
include/linux/nd.h:67: warning: Function parameter or member 'read'
not described in 'nvdimm_pmu'

...but I think rather than fixing those up 'struct nvdimm_pmu' should be pruned.

It's not clear to me that it is worth the effort to describe these
details to the nvdimm core which is just going to turn around and call
the pmu core. I'd just as soon have the driver call the pmu core
directly, optionally passing in attributes and callbacks that come
from the nvdimm core and/or the nvdimm provider.

Otherwise it's also not clear which of these structure members are
used at runtime vs purely used as temporary storage to pass parameters
to the pmu core.

> + * @attr_groups: data structure for events, formats and cpumask
> + * @cpu: designated cpu for counter access.
> + * @node: node for cpu hotplug notifier link.
> + * @cpuhp_state: state for cpu hotplug notification.
> + * @arch_cpumask: cpumask to get designated cpu for counter access.
> + */
> +struct nvdimm_pmu {
> +   const char *name;
> +   struct pmu pmu;
> +   struct device *dev;
> +   int (*event_init)(struct perf_event *event);
> +   int  (*add)(struct perf_event *event, int flags);
> +   void (*del)(struct perf_event *event, int flags);
> +   void (*read)(struct perf_event *event);
> +   /*
> +* Attribute groups for the nvdimm pmu. Index 0 used for
> +* format attribute, index 1 used for event attribute,
> +* index 2 used for cpusmask attribute and index 3 kept as NULL.
> +*/
> +   const struct attribute_group *attr_groups[4];

Following from above, I'd rather this was organized as static
attributes with an is_visible() helper for the groups for any dynamic
aspects. That mirrors the behavior of nvdimm_create() and allows for
device drivers to compose the attribute groups from a core set and /
or a provider specific set.

> +   int cpu;
> +   struct hlist_node node;
> +   enum cpuhp_state cpuhp_state;
> +
> +   /* cpumask provided by arch/platform specific code */
> +   struct cpumask arch_cpumask;
> +};
> +
>  struct nd_device_driver {
> struct device_driver drv;
> unsigned long type;
> --
> 2.26.2
>


Re: [PATCH v3] lockdown,selinux: fix wrong subject in some SELinux lockdown checks

2021-08-31 Thread Dan Williams
On Tue, Aug 31, 2021 at 6:53 AM Paul Moore  wrote:
>
> On Tue, Aug 31, 2021 at 5:09 AM Ondrej Mosnacek  wrote:
> > On Sat, Jun 19, 2021 at 12:18 AM Dan Williams  
> > wrote:
> > > On Wed, Jun 16, 2021 at 1:51 AM Ondrej Mosnacek  
> > > wrote:
>
> ...
>
> > > > diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
> > > > index 2acc6173da36..c1747b6555c7 100644
> > > > --- a/drivers/cxl/mem.c
> > > > +++ b/drivers/cxl/mem.c
> > > > @@ -568,7 +568,7 @@ static bool cxl_mem_raw_command_allowed(u16 opcode)
> > > > if (!IS_ENABLED(CONFIG_CXL_MEM_RAW_COMMANDS))
> > > > return false;
> > > >
> > > > -   if (security_locked_down(LOCKDOWN_NONE))
> > > > +   if (security_locked_down(current_cred(), LOCKDOWN_NONE))
> > >
> > > Acked-by: Dan Williams 
> > >
> > > ...however that usage looks wrong. The expectation is that if kernel
> > > integrity protections are enabled then raw command access should be
> > > disabled. So I think that should be equivalent to LOCKDOWN_PCI_ACCESS
> > > in terms of the command capabilities to filter.
> >
> > Yes, the LOCKDOWN_NONE seems wrong here... but it's a pre-existing bug
> > and I didn't want to go down yet another rabbit hole trying to fix it.
> > I'll look at this again once this patch is settled - it may indeed be
> > as simple as replacing LOCKDOWN_NONE with LOCKDOWN_PCI_ACCESS.
>
> At this point you should be well aware of my distaste for merging
> patches that have known bugs in them.  Yes, this is a pre-existing
> condition, but it seems well within the scope of this work to address
> it as well.
>
> This isn't something that is going to get merged while the merge
> window is open, so at the very least you've got almost two weeks to
> sort this out - please do that.

Yes, apologies, I should have sent the fix shortly after noticing the
problem. I'll get the CXL bug fix out of the way so Ondrej can move
this along.


Re: [PATCH v2 4/4] bus: Make remove callback return void

2021-07-06 Thread Dan Williams
On Tue, Jul 6, 2021 at 8:51 AM Uwe Kleine-König
 wrote:
>
> The driver core ignores the return value of this callback because there
> is only little it can do when a device disappears.
>
> This is the final bit of a long lasting cleanup quest where several
> buses were converted to also return void from their remove callback.
> Additionally some resource leaks were fixed that were caused by drivers
> returning an error code in the expectation that the driver won't go
> away.
>
> With struct bus_type::remove returning void it's prevented that newly
> implemented buses return an ignored error code and so don't anticipate
> wrong expectations for driver authors.
>

>  drivers/cxl/core.c| 3 +--
>  drivers/dax/bus.c | 4 +---
>  drivers/nvdimm/bus.c      | 3 +--

For CXL, DAX, and NVDIMM

Acked-by: Dan Williams 


Re: [RESEND-PATCH v2] powerpc/papr_scm: Add support for reporting dirty-shutdown-count

2021-06-26 Thread Dan Williams
On Thu, Jun 24, 2021 at 1:07 AM Vaibhav Jain  wrote:
>
> Persistent memory devices like NVDIMMs can loose cached writes in case
> something prevents flush on power-fail. Such situations are termed as
> dirty shutdown and are exposed to applications as
> last-shutdown-state (LSS) flag and a dirty-shutdown-counter(DSC) as
> described at [1]. The latter being useful in conditions where multiple
> applications want to detect a dirty shutdown event without racing with
> one another.
>
> PAPR-NVDIMMs have so far only exposed LSS style flags to indicate a
> dirty-shutdown-state. This patch further adds support for DSC via the
> "ibm,persistence-failed-count" device tree property of an NVDIMM. This
> property is a monotonic increasing 64-bit counter thats an indication
> of number of times an NVDIMM has encountered a dirty-shutdown event
> causing persistence loss.
>
> Since this value is not expected to change after system-boot hence
> papr_scm reads & caches its value during NVDIMM probe and exposes it
> as a PAPR sysfs attributed named 'dirty_shutdown' to match the name of
> similarly named NFIT sysfs attribute. Also this value is available to
> libnvdimm via PAPR_PDSM_HEALTH payload. 'struct nd_papr_pdsm_health'
> has been extended to add a new member called 'dimm_dsc' presence of
> which is indicated by the newly introduced PDSM_DIMM_DSC_VALID flag.
>
> References:
> [1] https://pmem.io/documents/Dirty_Shutdown_Handling-V1.0.pdf
>
> Signed-off-by: Vaibhav Jain 
> Reviewed-by: Aneesh Kumar K.V 

Belated:

Acked-by: Dan Williams 

It's looking like CXL will add one of these as well. Might be time to
add a unified location when that happens and deprecate these
bus-specific locations.


Re: [PATCH v3] lockdown,selinux: fix wrong subject in some SELinux lockdown checks

2021-06-18 Thread Dan Williams
On Wed, Jun 16, 2021 at 1:51 AM Ondrej Mosnacek  wrote:
>
> Commit 59438b46471a ("security,lockdown,selinux: implement SELinux
> lockdown") added an implementation of the locked_down LSM hook to
> SELinux, with the aim to restrict which domains are allowed to perform
> operations that would breach lockdown.
>
> However, in several places the security_locked_down() hook is called in
> situations where the current task isn't doing any action that would
> directly breach lockdown, leading to SELinux checks that are basically
> bogus.
>
> To fix this, add an explicit struct cred pointer argument to
> security_lockdown() and define NULL as a special value to pass instead
> of current_cred() in such situations. LSMs that take the subject
> credentials into account can then fall back to some default or ignore
> such calls altogether. In the SELinux lockdown hook implementation, use
> SECINITSID_KERNEL in case the cred argument is NULL.
>
> Most of the callers are updated to pass current_cred() as the cred
> pointer, thus maintaining the same behavior. The following callers are
> modified to pass NULL as the cred pointer instead:
> 1. arch/powerpc/xmon/xmon.c
>  Seems to be some interactive debugging facility. It appears that
>  the lockdown hook is called from interrupt context here, so it
>  should be more appropriate to request a global lockdown decision.
> 2. fs/tracefs/inode.c:tracefs_create_file()
>  Here the call is used to prevent creating new tracefs entries when
>  the kernel is locked down. Assumes that locking down is one-way -
>  i.e. if the hook returns non-zero once, it will never return zero
>  again, thus no point in creating these files. Also, the hook is
>  often called by a module's init function when it is loaded by
>  userspace, where it doesn't make much sense to do a check against
>  the current task's creds, since the task itself doesn't actually
>  use the tracing functionality (i.e. doesn't breach lockdown), just
>  indirectly makes some new tracepoints available to whoever is
>  authorized to use them.
> 3. net/xfrm/xfrm_user.c:copy_to_user_*()
>  Here a cryptographic secret is redacted based on the value returned
>  from the hook. There are two possible actions that may lead here:
>  a) A netlink message XFRM_MSG_GETSA with NLM_F_DUMP set - here the
> task context is relevant, since the dumped data is sent back to
> the current task.
>  b) When adding/deleting/updating an SA via XFRM_MSG_xxxSA, the
> dumped SA is broadcasted to tasks subscribed to XFRM events -
> here the current task context is not relevant as it doesn't
> represent the tasks that could potentially see the secret.
>  It doesn't seem worth it to try to keep using the current task's
>  context in the a) case, since the eventual data leak can be
>  circumvented anyway via b), plus there is no way for the task to
>  indicate that it doesn't care about the actual key value, so the
>  check could generate a lot of "false alert" denials with SELinux.
>  Thus, let's pass NULL instead of current_cred() here faute de
>  mieux.
>
> Improvements-suggested-by: Casey Schaufler 
> Improvements-suggested-by: Paul Moore 
> Fixes: 59438b46471a ("security,lockdown,selinux: implement SELinux lockdown")
> Signed-off-by: Ondrej Mosnacek 
[..]
> diff --git a/drivers/cxl/mem.c b/drivers/cxl/mem.c
> index 2acc6173da36..c1747b6555c7 100644
> --- a/drivers/cxl/mem.c
> +++ b/drivers/cxl/mem.c
> @@ -568,7 +568,7 @@ static bool cxl_mem_raw_command_allowed(u16 opcode)
> if (!IS_ENABLED(CONFIG_CXL_MEM_RAW_COMMANDS))
> return false;
>
> -   if (security_locked_down(LOCKDOWN_NONE))
> +   if (security_locked_down(current_cred(), LOCKDOWN_NONE))

Acked-by: Dan Williams 

...however that usage looks wrong. The expectation is that if kernel
integrity protections are enabled then raw command access should be
disabled. So I think that should be equivalent to LOCKDOWN_PCI_ACCESS
in terms of the command capabilities to filter.


[PATCH v2] libnvdimm/pmem: Fix blk_cleanup_disk() usage

2021-06-07 Thread Dan Williams
The queue_to_disk() helper can not be used after del_gendisk()
communicate @disk via the pgmap->owner.

Otherwise, queue_to_disk() returns NULL resulting in the splat below.

 Kernel attempted to read user page (330) - exploit attempt? (uid: 0)
 BUG: Kernel NULL pointer dereference on read at 0x0330
 Faulting instruction address: 0xc0906344
 Oops: Kernel access of bad area, sig: 11 [#1]
 [..]
 NIP [c0906344] pmem_pagemap_cleanup+0x24/0x40
 LR [c04701d4] memunmap_pages+0x1b4/0x4b0
 Call Trace:
 [c00022cbb9c0] [c09063c8] pmem_pagemap_kill+0x28/0x40 (unreliable)
 [c00022cbb9e0] [c04701d4] memunmap_pages+0x1b4/0x4b0
 [c00022cbba90] [c08b28a0] devm_action_release+0x30/0x50
 [c00022cbbab0] [c08b39c8] release_nodes+0x2f8/0x3e0
 [c00022cbbb60] [c08ac440] 
device_release_driver_internal+0x190/0x2b0
 [c00022cbbba0] [c08a8450] unbind_store+0x130/0x170

Reported-by: Sachin Sant 
Fixes: 87eb73b2ca7c ("nvdimm-pmem: convert to blk_alloc_disk/blk_cleanup_disk")
Link: 
http://lore.kernel.org/r/dfb75ba8-603f-4a35-880b-c5b23ef8f...@linux.vnet.ibm.com
Cc: Christoph Hellwig 
Cc: Ulf Hansson 
Cc: Jens Axboe 
Signed-off-by: Dan Williams 
---
Changes in v2 Improve the changelog.

 drivers/nvdimm/pmem.c |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 31f3c4bd6f72..fc6b78dd2d24 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -337,8 +337,9 @@ static void pmem_pagemap_cleanup(struct dev_pagemap *pgmap)
 {
struct request_queue *q =
container_of(pgmap->ref, struct request_queue, q_usage_counter);
+   struct pmem_device *pmem = pgmap->owner;
 
-   blk_cleanup_disk(queue_to_disk(q));
+   blk_cleanup_disk(pmem->disk);
 }
 
 static void pmem_release_queue(void *pgmap)
@@ -427,6 +428,7 @@ static int pmem_attach_disk(struct device *dev,
q = disk->queue;
 
pmem->disk = disk;
+   pmem->pgmap.owner = pmem;
pmem->pfn_flags = PFN_DEV;
pmem->pgmap.ref = >q_usage_counter;
if (is_nd_pfn(dev)) {



[PATCH] libnvdimm/pmem: Fix blk_cleanup_disk() usage

2021-06-07 Thread Dan Williams
The queue_to_disk() helper can not be used after del_gendisk()
communicate @disk via the pgmap->owner.

Reported-by: Sachin Sant 
Fixes: 87eb73b2ca7c ("nvdimm-pmem: convert to blk_alloc_disk/blk_cleanup_disk")
Cc: Christoph Hellwig 
Cc: Ulf Hansson 
Cc: Jens Axboe 
Signed-off-by: Dan Williams 
---
Jens,

Please take or fold this into your tree after Sachin has a chance
to test it out. It's passing my tests.

 drivers/nvdimm/pmem.c |4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 31f3c4bd6f72..fc6b78dd2d24 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -337,8 +337,9 @@ static void pmem_pagemap_cleanup(struct dev_pagemap *pgmap)
 {
struct request_queue *q =
container_of(pgmap->ref, struct request_queue, q_usage_counter);
+   struct pmem_device *pmem = pgmap->owner;
 
-   blk_cleanup_disk(queue_to_disk(q));
+   blk_cleanup_disk(pmem->disk);
 }
 
 static void pmem_release_queue(void *pgmap)
@@ -427,6 +428,7 @@ static int pmem_attach_disk(struct device *dev,
q = disk->queue;
 
pmem->disk = disk;
+   pmem->pgmap.owner = pmem;
pmem->pfn_flags = PFN_DEV;
pmem->pgmap.ref = >q_usage_counter;
if (is_nd_pfn(dev)) {



Re: [PATCH 17/26] nvdimm-pmem: convert to blk_alloc_disk/blk_cleanup_disk

2021-06-06 Thread Dan Williams
[ add Sachin who reported this commit in -next ]

On Thu, May 20, 2021 at 10:52 PM Christoph Hellwig  wrote:
>
> Convert the nvdimm-pmem driver to use the blk_alloc_disk and
> blk_cleanup_disk helpers to simplify gendisk and request_queue
> allocation.
>
> Signed-off-by: Christoph Hellwig 
> ---
>  drivers/nvdimm/pmem.c | 15 +--
>  1 file changed, 5 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
> index 968b8483c763..9fcd05084564 100644
> --- a/drivers/nvdimm/pmem.c
> +++ b/drivers/nvdimm/pmem.c
> @@ -338,7 +338,7 @@ static void pmem_pagemap_cleanup(struct dev_pagemap 
> *pgmap)
> struct request_queue *q =
> container_of(pgmap->ref, struct request_queue, 
> q_usage_counter);
>
> -   blk_cleanup_queue(q);
> +   blk_cleanup_disk(queue_to_disk(q));

This is broken. This comes after del_gendisk() which means the queue
device is no longer associated with its disk parent. Perhaps @pmem
could be stashed in pgmap->owner and then this can use pmem->disk? Not
see any other readily available ways to get back to the disk from here
after del_gendisk().


Re: [PATCH v2] powerpc/papr_scm: Make 'perf_stats' invisible if perf-stats unavailable

2021-05-12 Thread Dan Williams
On Fri, May 7, 2021 at 4:40 AM Vaibhav Jain  wrote:
>
> In case performance stats for an nvdimm are not available, reading the
> 'perf_stats' sysfs file returns an -ENOENT error. A better approach is
> to make the 'perf_stats' file entirely invisible to indicate that
> performance stats for an nvdimm are unavailable.
>
> So this patch updates 'papr_nd_attribute_group' to add a 'is_visible'
> callback implemented as newly introduced 'papr_nd_attribute_visible()'
> that returns an appropriate mode in case performance stats aren't
> supported in a given nvdimm.
>
> Also the initialization of 'papr_scm_priv.stat_buffer_len' is moved
> from papr_scm_nvdimm_init() to papr_scm_probe() so that it value is
> available when 'papr_nd_attribute_visible()' is called during nvdimm
> initialization.
>
> Fixes: 2d02bf835e57('powerpc/papr_scm: Fetch nvdimm performance stats from 
> PHYP')

Since this has been the exposed ABI since v5.9 perhaps a note /
analysis is needed here that the disappearance of this file will not
break any existing scripts/tools.

> Signed-off-by: Vaibhav Jain 
> ---
> Changelog:
>
> v2:
> * Removed a redundant dev_info() from pap_scm_nvdimm_init() [ Aneesh ]
> * Marked papr_nd_attribute_visible() as static which also fixed the
>   build warning reported by kernel build robot
> ---
>  arch/powerpc/platforms/pseries/papr_scm.c | 35 ---
>  1 file changed, 24 insertions(+), 11 deletions(-)
>
> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
> b/arch/powerpc/platforms/pseries/papr_scm.c
> index e2b69cc3beaf..11e7b90a3360 100644
> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> @@ -907,6 +907,20 @@ static ssize_t flags_show(struct device *dev,
>  }
>  DEVICE_ATTR_RO(flags);
>
> +static umode_t papr_nd_attribute_visible(struct kobject *kobj,
> +struct attribute *attr, int n)
> +{
> +   struct device *dev = container_of(kobj, typeof(*dev), kobj);

This can use the kobj_to_dev() helper.

> +   struct nvdimm *nvdimm = to_nvdimm(dev);
> +   struct papr_scm_priv *p = nvdimm_provider_data(nvdimm);
> +
> +   /* For if perf-stats not available remove perf_stats sysfs */
> +   if (attr == _attr_perf_stats.attr && p->stat_buffer_len == 0)
> +   return 0;
> +
> +   return attr->mode;
> +}
> +
>  /* papr_scm specific dimm attributes */
>  static struct attribute *papr_nd_attributes[] = {
> _attr_flags.attr,
> @@ -916,6 +930,7 @@ static struct attribute *papr_nd_attributes[] = {
>
>  static struct attribute_group papr_nd_attribute_group = {
> .name = "papr",
> +   .is_visible = papr_nd_attribute_visible,
> .attrs = papr_nd_attributes,
>  };
>
> @@ -931,7 +946,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv *p)
> struct nd_region_desc ndr_desc;
> unsigned long dimm_flags;
> int target_nid, online_nid;
> -   ssize_t stat_size;
>
> p->bus_desc.ndctl = papr_scm_ndctl;
> p->bus_desc.module = THIS_MODULE;
> @@ -1016,16 +1030,6 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv 
> *p)
> list_add_tail(>region_list, _nd_regions);
> mutex_unlock(_ndr_lock);
>
> -   /* Try retriving the stat buffer and see if its supported */
> -   stat_size = drc_pmem_query_stats(p, NULL, 0);
> -   if (stat_size > 0) {
> -   p->stat_buffer_len = stat_size;
> -   dev_dbg(>pdev->dev, "Max perf-stat size %lu-bytes\n",
> -   p->stat_buffer_len);
> -   } else {
> -   dev_info(>pdev->dev, "Dimm performance stats 
> unavailable\n");
> -   }
> -
> return 0;
>
>  err:   nvdimm_bus_unregister(p->bus);
> @@ -1102,6 +1106,7 @@ static int papr_scm_probe(struct platform_device *pdev)
> u64 blocks, block_size;
> struct papr_scm_priv *p;
> const char *uuid_str;
> +   ssize_t stat_size;
> u64 uuid[2];
> int rc;
>
> @@ -1179,6 +1184,14 @@ static int papr_scm_probe(struct platform_device *pdev)
> p->res.name  = pdev->name;
> p->res.flags = IORESOURCE_MEM;
>
> +   /* Try retriving the stat buffer and see if its supported */

s/retriving/retrieving/

> +   stat_size = drc_pmem_query_stats(p, NULL, 0);
> +   if (stat_size > 0) {
> +   p->stat_buffer_len = stat_size;
> +   dev_dbg(>pdev->dev, "Max perf-stat size %lu-bytes\n",
> +   p->stat_buffer_len);
> +   }
> +
> rc = papr_scm_nvdimm_init(p);
> if (rc)
> goto err2;
> --
> 2.31.1
>

After the minor fixups above you can add:

Reviewed-by: Dan Williams 

...I assume this will go through the PPC tree.


Re: [PATCH] powerpc/papr_scm: Reduce error severity if nvdimm stats inaccessible

2021-04-15 Thread Dan Williams
On Thu, Apr 15, 2021 at 4:44 AM Vaibhav Jain  wrote:
>
> Thanks for looking into this Dan,
>
> Dan Williams  writes:
>
> > On Wed, Apr 14, 2021 at 5:40 AM Vaibhav Jain  wrote:
> >>
> >> Currently drc_pmem_qeury_stats() generates a dev_err in case
> >> "Enable Performance Information Collection" feature is disabled from
> >> HMC. The error is of the form below:
> >>
> >> papr_scm ibm,persistent-memory:ibm,pmemory@44104001: Failed to query
> >>  performance stats, Err:-10
> >>
> >> This error message confuses users as it implies a possible problem
> >> with the nvdimm even though its due to a disabled feature.
> >>
> >> So we fix this by explicitly handling the H_AUTHORITY error from the
> >> H_SCM_PERFORMANCE_STATS hcall and generating a warning instead of an
> >> error, saying that "Performance stats in-accessible".
> >>
> >> Fixes: 2d02bf835e57('powerpc/papr_scm: Fetch nvdimm performance stats from 
> >> PHYP')
> >> Signed-off-by: Vaibhav Jain 
> >> ---
> >>  arch/powerpc/platforms/pseries/papr_scm.c | 3 +++
> >>  1 file changed, 3 insertions(+)
> >>
> >> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
> >> b/arch/powerpc/platforms/pseries/papr_scm.c
> >> index 835163f54244..9216424f8be3 100644
> >> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> >> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> >> @@ -277,6 +277,9 @@ static ssize_t drc_pmem_query_stats(struct 
> >> papr_scm_priv *p,
> >> dev_err(>pdev->dev,
> >> "Unknown performance stats, Err:0x%016lX\n", 
> >> ret[0]);
> >> return -ENOENT;
> >> +   } else if (rc == H_AUTHORITY) {
> >> +   dev_warn(>pdev->dev, "Performance stats in-accessible");
> >> +   return -EPERM;
> >
> > So userspace can spam the kernel log? Why is kernel log message needed
> > at all? EPERM told the caller what happened.
> Currently this error message is only reported during probe of the
> nvdimm. So userspace cannot directly spam kernel log.

Oh, ok, I saw things like papr_pdsm_fuel_gauge() in the call stack and
thought this was reachable through an ioctl. Sorry for the noise.


Re: [PATCH] powerpc/papr_scm: Reduce error severity if nvdimm stats inaccessible

2021-04-14 Thread Dan Williams
On Wed, Apr 14, 2021 at 5:40 AM Vaibhav Jain  wrote:
>
> Currently drc_pmem_qeury_stats() generates a dev_err in case
> "Enable Performance Information Collection" feature is disabled from
> HMC. The error is of the form below:
>
> papr_scm ibm,persistent-memory:ibm,pmemory@44104001: Failed to query
>  performance stats, Err:-10
>
> This error message confuses users as it implies a possible problem
> with the nvdimm even though its due to a disabled feature.
>
> So we fix this by explicitly handling the H_AUTHORITY error from the
> H_SCM_PERFORMANCE_STATS hcall and generating a warning instead of an
> error, saying that "Performance stats in-accessible".
>
> Fixes: 2d02bf835e57('powerpc/papr_scm: Fetch nvdimm performance stats from 
> PHYP')
> Signed-off-by: Vaibhav Jain 
> ---
>  arch/powerpc/platforms/pseries/papr_scm.c | 3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
> b/arch/powerpc/platforms/pseries/papr_scm.c
> index 835163f54244..9216424f8be3 100644
> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> @@ -277,6 +277,9 @@ static ssize_t drc_pmem_query_stats(struct papr_scm_priv 
> *p,
> dev_err(>pdev->dev,
> "Unknown performance stats, Err:0x%016lX\n", ret[0]);
> return -ENOENT;
> +   } else if (rc == H_AUTHORITY) {
> +   dev_warn(>pdev->dev, "Performance stats in-accessible");
> +   return -EPERM;

So userspace can spam the kernel log? Why is kernel log message needed
at all? EPERM told the caller what happened.


Re: [PATCH] mm/pmem: Avoid inserting hugepage PTE entry with fsdax if hugepage support is disabled

2021-02-05 Thread Dan Williams
[ add Andrew ]

On Thu, Feb 4, 2021 at 6:40 PM Aneesh Kumar K.V
 wrote:
>
> Differentiate between hardware not supporting hugepages and user disabling THP
> via 'echo never > /sys/kernel/mm/transparent_hugepage/enabled'
>
> For the devdax namespace, the kernel handles the above via the
> supported_alignment attribute and failing to initialize the namespace
> if the namespace align value is not supported on the platform.
>
> For the fsdax namespace, the kernel will continue to initialize
> the namespace. This can result in the kernel creating a huge pte
> entry even though the hardware don't support the same.
>
> We do want hugepage support with pmem even if the end-user disabled THP
> via sysfs file (/sys/kernel/mm/transparent_hugepage/enabled). Hence
> differentiate between hardware/firmware lacking support vs user-controlled
> disable of THP and prevent a huge fault if the hardware lacks hugepage
> support.

Looks good to me.

Reviewed-by: Dan Williams 

I assume this will go through Andrew.


Re: [RFC][PATCH 1/2] libnvdimm: Introduce ND_CMD_GET_STAT to retrieve nvdimm statistics

2020-12-07 Thread Dan Williams
[ add perf maintainers ]

On Sun, Nov 8, 2020 at 1:16 PM Vaibhav Jain  wrote:
>
> Implement support for exposing generic nvdimm statistics via newly
> introduced dimm-command ND_CMD_GET_STAT that can be handled by nvdimm
> command handler function and provide values for these statistics back
> to libnvdimm. Following generic nvdimm statistics are defined as an
> enumeration in 'uapi/ndctl.h':
>
> * "media_reads" : Number of media reads that have occurred since reboot.
> * "media_writes" : Number of media writes that have occurred since reboot.
> * "read_requests" : Number of read requests that have occurred since reboot.
> * "write_requests" : Number of write requests that have occurred since reboot.

Perhaps document these as "since device reset"? As I can imagine some
devices might have a mechanism to reset the count outside of "reboot"
which is a bit ambiguous.

> * "total_media_reads" : Total number of media reads that have occurred.
> * "total_media_writes" : Total number of media writes that have occurred.
> * "total_read_requests" : Total number of read requests that have occurred.
> * "total_write_requests" : Total number of write requests that have occurred.
>
> Apart from ND_CMD_GET_STAT ioctl these nvdimm statistics are also
> exposed via sysfs '/stats' directory for easy user-space
> access like below:
>
> /sys/class/nd/ndctl0/device/nmem0/stats # tail -n +1 *
> ==> media_reads <==
> 252197707602
> ==> media_writes <==
> 20684685172
> ==> read_requests <==
> 658810924962
> ==> write_requests <==
> 404464081574

Hmm, I haven't looked but how hard would it be to plumb these to be
perf counter-events. So someone could combine these with other perf
counters?

> In case a specific nvdimm-statistic is not supported than nvdimm
> command handler function can simply return an error (e.g -ENOENT) for
> request to read that nvdimm-statistic.

Makes sense, but I expect the perf route also has a way to enumerate
which statistics / counters are supported. I'm not opposed to also
having them in sysfs, but I think perf support should be a first class
citizen.

>
> The value for a specific nvdimm-stat is exchanged via newly introduced
> 'struct nd_cmd_get_dimm_stat' that hold a single statistics and a
> union of possible values types. Presently only '__s64' type of generic
> attributes are supported. These attributes are defined in
> 'ndvimm/dimm_devs.c' via a helper macro 'NVDIMM_STAT_ATTR'.
>
> Signed-off-by: Vaibhav Jain 
> ---
>  drivers/nvdimm/bus.c   |   6 ++
>  drivers/nvdimm/dimm_devs.c | 109 +
>  drivers/nvdimm/nd.h|   5 ++
>  include/uapi/linux/ndctl.h |  27 +
>  4 files changed, 147 insertions(+)
>
> diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
> index 2304c6183822..d53564477437 100644
> --- a/drivers/nvdimm/bus.c
> +++ b/drivers/nvdimm/bus.c
> @@ -794,6 +794,12 @@ static const struct nd_cmd_desc __nd_cmd_dimm_descs[] = {
> .out_num = 1,
> .out_sizes = { UINT_MAX, },
> },
> +   [ND_CMD_GET_STAT] = {
> +   .in_num = 1,
> +   .in_sizes = { sizeof(struct nd_cmd_get_dimm_stat), },
> +   .out_num = 1,
> +   .out_sizes = { sizeof(struct nd_cmd_get_dimm_stat), },
> +   },
>  };
>
>  const struct nd_cmd_desc *nd_cmd_dimm_desc(int cmd)
> diff --git a/drivers/nvdimm/dimm_devs.c b/drivers/nvdimm/dimm_devs.c
> index b59032e0859b..68aaa294def7 100644
> --- a/drivers/nvdimm/dimm_devs.c
> +++ b/drivers/nvdimm/dimm_devs.c
> @@ -555,6 +555,114 @@ static umode_t nvdimm_firmware_visible(struct kobject 
> *kobj, struct attribute *a
> return a->mode;
>  }
>
> +/* Request a dimm stat from the bus driver */
> +static int __request_dimm_stat(struct nvdimm_bus *nvdimm_bus,
> +  struct nvdimm *dimm, u64 stat_id,
> +  s64 *stat_val)
> +{
> +   struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
> +   struct nd_cmd_get_dimm_stat stat = { .stat_id = stat_id };
> +   int rc, cmd_rc;
> +
> +   if (!test_bit(ND_CMD_GET_STAT, >cmd_mask)) {
> +   pr_debug("CMD_GET_STAT not set for bus driver 0x%lx\n",
> +nd_desc->cmd_mask);
> +   return -ENOENT;
> +   }
> +
> +   /* Is stat requested is known & bus driver supports fetching stats */
> +   if (stat_id <= ND_DIMM_STAT_INVALID || stat_id > ND_DIMM_STAT_MAX) {
> +   WARN(1, "Unknown stat-id(%llu) requested", stat_id);
> +   return -ENOENT;
> +   }
> +
> +   /* Ask bus driver for its stat value */
> +   rc = nd_desc->ndctl(nd_desc, dimm, ND_CMD_GET_STAT,
> +   , sizeof(stat), _rc);
> +   if (rc || cmd_rc) {
> +   pr_debug("Unable to request stat %lld. Error (%d,%d)\n",
> +stat_id, rc, cmd_rc);
> +   return rc ? rc : cmd_rc;
> +   }
> +
> +   /* Indicate error in 

[PATCH] powerpc: fix create_section_mapping compile warning

2020-11-16 Thread Dan Williams
0day robot reports that a recent rework of how
memory_add_physaddr_to_nid() and phys_to_target_node() are declared
resulted in the following new compilation warning:

arch/powerpc/mm/mem.c:91:12: warning: no previous prototype for 
'create_section_mapping' [-Wmissing-prototypes]
   91 | int __weak create_section_mapping(unsigned long start, unsigned long 
end,
  |^~

...fix this by moving the declaration of create_section_mapping()
outside of the CONFIG_NEED_MULTIPLE_NODES ifdef guard, and include an
explicit include of asm/mmzone.h in mem.c. An include of linux/mmzone.h
is not sufficient.

Cc: Michael Ellerman 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Andrew Morton 
Reported-by: kernel test robot 
Signed-off-by: Dan Williams 
---
 arch/powerpc/include/asm/mmzone.h |7 +--
 arch/powerpc/mm/mem.c |1 +
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/mmzone.h 
b/arch/powerpc/include/asm/mmzone.h
index 177fd18caf83..6cda76b57c5d 100644
--- a/arch/powerpc/include/asm/mmzone.h
+++ b/arch/powerpc/include/asm/mmzone.h
@@ -33,8 +33,6 @@ extern struct pglist_data *node_data[];
 extern int numa_cpu_lookup_table[];
 extern cpumask_var_t node_to_cpumask_map[];
 #ifdef CONFIG_MEMORY_HOTPLUG
-extern int create_section_mapping(unsigned long start, unsigned long end,
- int nid, pgprot_t prot);
 extern unsigned long max_pfn;
 u64 memory_hotplug_max(void);
 #else
@@ -48,5 +46,10 @@ u64 memory_hotplug_max(void);
 #define __HAVE_ARCH_RESERVED_KERNEL_PAGES
 #endif
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+extern int create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot);
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_MMZONE_H_ */
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 01ec2a252f09..3fc325bebe4d 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -50,6 +50,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 



Re: error: redefinition of ‘dax_supported’

2020-09-21 Thread Dan Williams
On Mon, Sep 21, 2020 at 11:35 AM Nick Desaulniers
 wrote:
>
> Hello DAX maintainers,
> I noticed our PPC64LE builds failing last night:
> https://travis-ci.com/github/ClangBuiltLinux/continuous-integration/jobs/388047043
> https://travis-ci.com/github/ClangBuiltLinux/continuous-integration/jobs/388047056
> https://travis-ci.com/github/ClangBuiltLinux/continuous-integration/jobs/388047099
> and looking on lore, I see a fresh report from KernelCI against arm:
> https://lore.kernel.org/linux-next/?q=dax_supported
>
> Can you all please take a look?  More concerning is that I see this
> failure on mainline.  It may be interesting to consider how this was
> not spotted on -next.

The failure is fixed with commit 88b67edd7247 ("dax: Fix compilation
for CONFIG_DAX && !CONFIG_FS_DAX"). I rushed the fixes that led to
this regression with insufficient exposure because it was crashing all
users. I thought the 2 kbuild-robot reports I squashed covered all the
config combinations, but there was a straggling report after I sent my
-rc6 pull request.

The baseline process escape for all of this was allowing a unit test
triggerable insta-crash upstream in the first instance necessitating
an urgent fix.


Re: [PATCH 12/20] Documentation: maintainer-entry-profile: eliminate duplicated word

2020-07-07 Thread Dan Williams
On Tue, Jul 7, 2020 at 11:07 AM Randy Dunlap  wrote:
>
> Drop the doubled word "have".
>
> Signed-off-by: Randy Dunlap 
> Cc: Jonathan Corbet 
> Cc: linux-...@vger.kernel.org
> Cc: Dan Williams 
> ---
>  Documentation/maintainer/maintainer-entry-profile.rst |2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> --- 
> linux-next-20200701.orig/Documentation/maintainer/maintainer-entry-profile.rst
> +++ linux-next-20200701/Documentation/maintainer/maintainer-entry-profile.rst
> @@ -31,7 +31,7 @@ Example questions to consider:
>  - What branch should contributors submit against?
>  - Links to any other Maintainer Entry Profiles? For example a
>device-driver may point to an entry for its parent subsystem. This makes
> -  the contributor aware of obligations a maintainer may have have for
> +  the contributor aware of obligations a maintainer may have for
>other maintainers in the submission chain.

Acked-by Dan Williams 


Re: [PATCH 16/20] block: move ->make_request_fn to struct block_device_operations

2020-07-01 Thread Dan Williams
On Wed, Jul 1, 2020 at 2:01 AM Christoph Hellwig  wrote:
>
> The make_request_fn is a little weird in that it sits directly in
> struct request_queue instead of an operation vector.  Replace it with
> a block_device_operations method called submit_bio (which describes much
> better what it does).  Also remove the request_queue argument to it, as
> the queue can be derived pretty trivially from the bio.
>
> Signed-off-by: Christoph Hellwig 
> ---
[..]
>  drivers/nvdimm/blk.c  |  5 +-
>  drivers/nvdimm/btt.c  |  5 +-
>  drivers/nvdimm/pmem.c |  5 +-

For drivers/nvdimm

Acked-by: Dan Williams 


Re: [PATCH v6 6/8] powerpc/pmem: Avoid the barrier in flush routines

2020-06-30 Thread Dan Williams
On Tue, Jun 30, 2020 at 8:09 PM Aneesh Kumar K.V
 wrote:
>
> On 7/1/20 1:15 AM, Dan Williams wrote:
> > On Tue, Jun 30, 2020 at 2:21 AM Aneesh Kumar K.V
> >  wrote:
> > [..]
> >>>> The bio argument isn't for range based flushing, it is for flush
> >>>> operations that need to complete asynchronously.
> >>> How does the block layer determine that the pmem device needs
> >>> asynchronous fushing?
> >>>
> >>
> >>  set_bit(ND_REGION_ASYNC, _desc.flags);
> >>
> >> and dax_synchronous(dev)
> >
> > Yes, but I think it is overkill to have an indirect function call just
> > for a single instruction.
> >
> > How about something like this instead, to share a common pmem_wmb()
> > across x86 and powerpc.
> >
> > diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
> > index 20ff30c2ab93..b14009060c83 100644
> > --- a/drivers/nvdimm/region_devs.c
> > +++ b/drivers/nvdimm/region_devs.c
> > @@ -1180,6 +1180,13 @@ int nvdimm_flush(struct nd_region *nd_region,
> > struct bio *bio)
> >   {
> >  int rc = 0;
> >
> > +   /*
> > +* pmem_wmb() is needed to 'sfence' all previous writes such
> > +* that they are architecturally visible for the platform buffer
> > +* flush.
> > +*/
> > +   pmem_wmb();
> > +
> >  if (!nd_region->flush)
> >  rc = generic_nvdimm_flush(nd_region);
> >  else {
> > @@ -1206,17 +1213,14 @@ int generic_nvdimm_flush(struct nd_region 
> > *nd_region)
> >  idx = this_cpu_add_return(flush_idx, hash_32(current->pid + idx, 
> > 8));
> >
> >  /*
> > -* The first wmb() is needed to 'sfence' all previous writes
> > -* such that they are architecturally visible for the platform
> > -* buffer flush.  Note that we've already arranged for pmem
> > -* writes to avoid the cache via memcpy_flushcache().  The final
> > -* wmb() ensures ordering for the NVDIMM flush write.
> > +* Note that we've already arranged for pmem writes to avoid the
> > +* cache via memcpy_flushcache().  The final wmb() ensures
> > +* ordering for the NVDIMM flush write.
> >   */
> > -   wmb();
>
>
> The series already convert this to pmem_wmb().
>
> >  for (i = 0; i < nd_region->ndr_mappings; i++)
> >  if (ndrd_get_flush_wpq(ndrd, i, 0))
> >  writeq(1, ndrd_get_flush_wpq(ndrd, i, idx));
> > -   wmb();
> > +   pmem_wmb();
>
>
> Should this be pmem_wmb()? This is ordering the above writeq() right?

Correct, this can just be wmb().

>
> >
> >  return 0;
> >   }
> >
>
> This still results in two pmem_wmb() on platforms that doesn't have
> flush_wpq. I was trying to avoid that by adding a nd_region->flush call
> back.

How about skip or exit early out of generic_nvdimm_flush if
ndrd->flush_wpq is NULL? That still saves an indirect branch at the
cost of another conditional, but that should still be worth it.


Re: [PATCH v6 6/8] powerpc/pmem: Avoid the barrier in flush routines

2020-06-30 Thread Dan Williams
On Tue, Jun 30, 2020 at 2:21 AM Aneesh Kumar K.V
 wrote:
[..]
> >> The bio argument isn't for range based flushing, it is for flush
> >> operations that need to complete asynchronously.
> > How does the block layer determine that the pmem device needs
> > asynchronous fushing?
> >
>
> set_bit(ND_REGION_ASYNC, _desc.flags);
>
> and dax_synchronous(dev)

Yes, but I think it is overkill to have an indirect function call just
for a single instruction.

How about something like this instead, to share a common pmem_wmb()
across x86 and powerpc.

diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
index 20ff30c2ab93..b14009060c83 100644
--- a/drivers/nvdimm/region_devs.c
+++ b/drivers/nvdimm/region_devs.c
@@ -1180,6 +1180,13 @@ int nvdimm_flush(struct nd_region *nd_region,
struct bio *bio)
 {
int rc = 0;

+   /*
+* pmem_wmb() is needed to 'sfence' all previous writes such
+* that they are architecturally visible for the platform buffer
+* flush.
+*/
+   pmem_wmb();
+
if (!nd_region->flush)
rc = generic_nvdimm_flush(nd_region);
else {
@@ -1206,17 +1213,14 @@ int generic_nvdimm_flush(struct nd_region *nd_region)
idx = this_cpu_add_return(flush_idx, hash_32(current->pid + idx, 8));

/*
-* The first wmb() is needed to 'sfence' all previous writes
-* such that they are architecturally visible for the platform
-* buffer flush.  Note that we've already arranged for pmem
-* writes to avoid the cache via memcpy_flushcache().  The final
-* wmb() ensures ordering for the NVDIMM flush write.
+* Note that we've already arranged for pmem writes to avoid the
+* cache via memcpy_flushcache().  The final wmb() ensures
+* ordering for the NVDIMM flush write.
 */
-   wmb();
for (i = 0; i < nd_region->ndr_mappings; i++)
if (ndrd_get_flush_wpq(ndrd, i, 0))
writeq(1, ndrd_get_flush_wpq(ndrd, i, idx));
-   wmb();
+   pmem_wmb();

return 0;
 }


Re: [PATCH updated] libnvdimm/nvdimm/flush: Allow architecture to override the flush barrier

2020-06-30 Thread Dan Williams
On Tue, Jun 30, 2020 at 5:48 AM Aneesh Kumar K.V
 wrote:
>
>
> Update patch.
>
> From 1e6aa6c4182e14ec5d6bf878ae44c3f69ebff745 Mon Sep 17 00:00:00 2001
> From: "Aneesh Kumar K.V" 
> Date: Tue, 12 May 2020 20:58:33 +0530
> Subject: [PATCH] libnvdimm/nvdimm/flush: Allow architecture to override the
>  flush barrier
>
> Architectures like ppc64 provide persistent memory specific barriers
> that will ensure that all stores for which the modifications are
> written to persistent storage by preceding dcbfps and dcbstps
> instructions have updated persistent storage before any data
> access or data transfer caused by subsequent instructions is initiated.
> This is in addition to the ordering done by wmb()
>
> Update nvdimm core such that architecture can use barriers other than
> wmb to ensure all previous writes are architecturally visible for
> the platform buffer flush.

Looks good, after a few minor fixups below you can add:

Reviewed-by: Dan Williams 

I'm expecting that these will be merged through the powerpc tree since
they mostly impact powerpc with only minor touches to libnvdimm.

> Signed-off-by: Aneesh Kumar K.V 
> ---
>  Documentation/memory-barriers.txt | 14 ++
>  drivers/md/dm-writecache.c|  2 +-
>  drivers/nvdimm/region_devs.c  |  8 
>  include/asm-generic/barrier.h | 10 ++
>  4 files changed, 29 insertions(+), 5 deletions(-)
>
> diff --git a/Documentation/memory-barriers.txt 
> b/Documentation/memory-barriers.txt
> index eaabc3134294..340273a6b18e 100644
> --- a/Documentation/memory-barriers.txt
> +++ b/Documentation/memory-barriers.txt
> @@ -1935,6 +1935,20 @@ There are some more advanced barrier functions:
>   relaxed I/O accessors and the Documentation/DMA-API.txt file for more
>   information on consistent memory.
>
> + (*) pmem_wmb();
> +
> + This is for use with persistent memory to ensure that stores for which
> + modifications are written to persistent storage have updated the 
> persistent
> + storage.

I think this should be:

s/updated the persistent storage/reached a platform durability domain/

> +
> + For example, after a non-temporal write to pmem region, we use 
> pmem_wmb()
> + to ensures that stores have updated the persistent storage. This ensures

s/ensures/ensure/

...and the same comment about "persistent storage" because pmem_wmb()
as implemented on x86 does not guarantee that the writes have reached
storage it ensures that writes have reached buffers / queues that are
within the ADR (platform persistence / durability) domain.

> + that stores have updated persistent storage before any data access or
> + data transfer caused by subsequent instructions is initiated. This is
> + in addition to the ordering done by wmb().
> +
> + For load from persistent memory, existing read memory barriers are 
> sufficient
> + to ensure read ordering.
>
>  ===
>  IMPLICIT KERNEL MEMORY BARRIERS
> diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
> index 74f3c506f084..00534fa4a384 100644
> --- a/drivers/md/dm-writecache.c
> +++ b/drivers/md/dm-writecache.c
> @@ -536,7 +536,7 @@ static void ssd_commit_superblock(struct dm_writecache 
> *wc)
>  static void writecache_commit_flushed(struct dm_writecache *wc, bool 
> wait_for_ios)
>  {
> if (WC_MODE_PMEM(wc))
> -   wmb();
> +   pmem_wmb();
> else
> ssd_commit_flushed(wc, wait_for_ios);
>  }
> diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
> index 4502f9c4708d..2333b290bdcf 100644
> --- a/drivers/nvdimm/region_devs.c
> +++ b/drivers/nvdimm/region_devs.c
> @@ -1206,13 +1206,13 @@ int generic_nvdimm_flush(struct nd_region *nd_region)
> idx = this_cpu_add_return(flush_idx, hash_32(current->pid + idx, 8));
>
> /*
> -* The first wmb() is needed to 'sfence' all previous writes
> -* such that they are architecturally visible for the platform
> -* buffer flush.  Note that we've already arranged for pmem
> +* The first arch_pmem_flush_barrier() is needed to 'sfence' all

One missed arch_pmem_flush_barrier() rename.

> +* previous writes such that they are architecturally visible for
> +* the platform buffer flush. Note that we've already arranged for 
> pmem
>  * writes to avoid the cache via memcpy_flushcache().  The final
>  * wmb() ensures ordering for the NVDIMM flush write.
>  */
> -   wmb();
> +   pmem_wmb();
> for (i = 0; i < nd_region->ndr_mappings; i++)
> if (ndrd_get_flu

Re: [PATCH v6 5/8] powerpc/pmem/of_pmem: Update of_pmem to use the new barrier instruction.

2020-06-30 Thread Dan Williams
On Mon, Jun 29, 2020 at 10:05 PM Aneesh Kumar K.V
 wrote:
>
> Dan Williams  writes:
>
> > On Mon, Jun 29, 2020 at 6:58 AM Aneesh Kumar K.V
> >  wrote:
> >>
> >> of_pmem on POWER10 can now use phwsync instead of hwsync to ensure
> >> all previous writes are architecturally visible for the platform
> >> buffer flush.
> >>
> >> Signed-off-by: Aneesh Kumar K.V 
> >> ---
> >>  arch/powerpc/include/asm/cacheflush.h | 7 +++
> >>  1 file changed, 7 insertions(+)
> >>
> >> diff --git a/arch/powerpc/include/asm/cacheflush.h 
> >> b/arch/powerpc/include/asm/cacheflush.h
> >> index 54764c6e922d..95782f77d768 100644
> >> --- a/arch/powerpc/include/asm/cacheflush.h
> >> +++ b/arch/powerpc/include/asm/cacheflush.h
> >> @@ -98,6 +98,13 @@ static inline void invalidate_dcache_range(unsigned 
> >> long start,
> >> mb();   /* sync */
> >>  }
> >>
> >> +#define arch_pmem_flush_barrier arch_pmem_flush_barrier
> >> +static inline void  arch_pmem_flush_barrier(void)
> >> +{
> >> +   if (cpu_has_feature(CPU_FTR_ARCH_207S))
> >> +   asm volatile(PPC_PHWSYNC ::: "memory");
> >
> > Shouldn't this fallback to a compatible store-fence in an else statement?
>
> The idea was to avoid calling this on anything else. We ensure that by
> making sure that pmem devices are not initialized on systems without that
> cpu feature. Patch 1 does that. Also, the last patch adds a WARN_ON() to
> catch the usage of this outside pmem devices and on systems without that
> cpu feature.

If patch1 handles this why re-check the cpu-feature in this helper? If
the intent is for these routines to be generic why not have them fall
back to the P8 barrier instructions for example like x86 clwb(). Any
kernel code can call it, and it falls back to a compatible clflush()
call on older cpus. I otherwise don't get the point of patch7.


Re: [PATCH updated] libnvdimm/nvdimm/flush: Allow architecture to override the flush barrier

2020-06-30 Thread Dan Williams
On Mon, Jun 29, 2020 at 10:02 PM Aneesh Kumar K.V
 wrote:
>
> Dan Williams  writes:
>
> > On Mon, Jun 29, 2020 at 1:29 PM Aneesh Kumar K.V
> >  wrote:
> >>
> >> Architectures like ppc64 provide persistent memory specific barriers
> >> that will ensure that all stores for which the modifications are
> >> written to persistent storage by preceding dcbfps and dcbstps
> >> instructions have updated persistent storage before any data
> >> access or data transfer caused by subsequent instructions is initiated.
> >> This is in addition to the ordering done by wmb()
> >>
> >> Update nvdimm core such that architecture can use barriers other than
> >> wmb to ensure all previous writes are architecturally visible for
> >> the platform buffer flush.
> >>
> >> Signed-off-by: Aneesh Kumar K.V 
> >> ---
> >>  drivers/md/dm-writecache.c   | 2 +-
> >>  drivers/nvdimm/region_devs.c | 8 
> >>  include/linux/libnvdimm.h| 4 
> >>  3 files changed, 9 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
> >> index 74f3c506f084..8c6b6dce64e2 100644
> >> --- a/drivers/md/dm-writecache.c
> >> +++ b/drivers/md/dm-writecache.c
> >> @@ -536,7 +536,7 @@ static void ssd_commit_superblock(struct dm_writecache 
> >> *wc)
> >>  static void writecache_commit_flushed(struct dm_writecache *wc, bool 
> >> wait_for_ios)
> >>  {
> >> if (WC_MODE_PMEM(wc))
> >> -   wmb();
> >> +   arch_pmem_flush_barrier();
> >> else
> >> ssd_commit_flushed(wc, wait_for_ios);
> >>  }
> >> diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
> >> index 4502f9c4708d..b308ad09b63d 100644
> >> --- a/drivers/nvdimm/region_devs.c
> >> +++ b/drivers/nvdimm/region_devs.c
> >> @@ -1206,13 +1206,13 @@ int generic_nvdimm_flush(struct nd_region 
> >> *nd_region)
> >> idx = this_cpu_add_return(flush_idx, hash_32(current->pid + idx, 
> >> 8));
> >>
> >> /*
> >> -* The first wmb() is needed to 'sfence' all previous writes
> >> -* such that they are architecturally visible for the platform
> >> -* buffer flush.  Note that we've already arranged for pmem
> >> +* The first arch_pmem_flush_barrier() is needed to 'sfence' all
> >> +* previous writes such that they are architecturally visible for
> >> +* the platform buffer flush. Note that we've already arranged for 
> >> pmem
> >>  * writes to avoid the cache via memcpy_flushcache().  The final
> >>  * wmb() ensures ordering for the NVDIMM flush write.
> >>  */
> >> -   wmb();
> >> +   arch_pmem_flush_barrier();
> >> for (i = 0; i < nd_region->ndr_mappings; i++)
> >> if (ndrd_get_flush_wpq(ndrd, i, 0))
> >> writeq(1, ndrd_get_flush_wpq(ndrd, i, idx));
> >> diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
> >> index 18da4059be09..66f6c65bd789 100644
> >> --- a/include/linux/libnvdimm.h
> >> +++ b/include/linux/libnvdimm.h
> >> @@ -286,4 +286,8 @@ static inline void arch_invalidate_pmem(void *addr, 
> >> size_t size)
> >>  }
> >>  #endif
> >>
> >> +#ifndef arch_pmem_flush_barrier
> >> +#define arch_pmem_flush_barrier() wmb()
> >> +#endif
> >
> > I think it is out of place to define this in libnvdimm.h and it is odd
> > to give it such a long name. The other pmem api helpers like
> > arch_wb_cache_pmem() and arch_invalidate_pmem() are function calls for
> > libnvdimm driver operations, this barrier is just an instruction and
> > is closer to wmb() than the pmem api routine.
> >
> > Since it is a store fence for pmem, so let's just call it pmem_wmb()
> > and define the generic version in include/linux/compiler.h. It should
> > probably also be documented alongside dma_wmb() in
> > Documentation/memory-barriers.txt about why code would use it over
> > wmb(), and why a symmetric pmem_rmb() is not needed.
>
> How about the below? I used pmem_barrier() instead of pmem_wmb().

Why? A barrier() is a bi-directional ordering mechanic for reads and
writes, and the proposed semantics mechanism only orders writes +
persistence. Otherwise the default fallback to wmb() on archs that
don't override it does not m

Re: [PATCH v6 7/8] powerpc/pmem: Add WARN_ONCE to catch the wrong usage of pmem flush functions.

2020-06-29 Thread Dan Williams
On Mon, Jun 29, 2020 at 6:58 AM Aneesh Kumar K.V
 wrote:
>
> We only support persistent memory on P8 and above. This is enforced by the
> firmware and further checked on virtualzied platform during platform init.
> Add WARN_ONCE in pmem flush routines to catch the wrong usage of these.
>
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/include/asm/cacheflush.h | 2 ++
>  arch/powerpc/lib/pmem.c   | 2 ++
>  2 files changed, 4 insertions(+)
>
> diff --git a/arch/powerpc/include/asm/cacheflush.h 
> b/arch/powerpc/include/asm/cacheflush.h
> index 95782f77d768..1ab0fa660497 100644
> --- a/arch/powerpc/include/asm/cacheflush.h
> +++ b/arch/powerpc/include/asm/cacheflush.h
> @@ -103,6 +103,8 @@ static inline void  arch_pmem_flush_barrier(void)
>  {
> if (cpu_has_feature(CPU_FTR_ARCH_207S))
> asm volatile(PPC_PHWSYNC ::: "memory");
> +   else
> +   WARN_ONCE(1, "Using pmem flush on older hardware.");

This seems too late to be making this determination. I'd expect the
driver to fail to successfully bind default if this constraint is not
met.


Re: [PATCH v6 6/8] powerpc/pmem: Avoid the barrier in flush routines

2020-06-29 Thread Dan Williams
On Mon, Jun 29, 2020 at 1:41 PM Aneesh Kumar K.V
 wrote:
>
> Michal Suchánek  writes:
>
> > Hello,
> >
> > On Mon, Jun 29, 2020 at 07:27:20PM +0530, Aneesh Kumar K.V wrote:
> >> nvdimm expect the flush routines to just mark the cache clean. The barrier
> >> that mark the store globally visible is done in nvdimm_flush().
> >>
> >> Update the papr_scm driver to a simplified nvdim_flush callback that do
> >> only the required barrier.
> >>
> >> Signed-off-by: Aneesh Kumar K.V 
> >> ---
> >>  arch/powerpc/lib/pmem.c   |  6 --
> >>  arch/powerpc/platforms/pseries/papr_scm.c | 13 +
> >>  2 files changed, 13 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/arch/powerpc/lib/pmem.c b/arch/powerpc/lib/pmem.c
> >> index 5a61aaeb6930..21210fa676e5 100644
> >> --- a/arch/powerpc/lib/pmem.c
> >> +++ b/arch/powerpc/lib/pmem.c
> >> @@ -19,9 +19,6 @@ static inline void __clean_pmem_range(unsigned long 
> >> start, unsigned long stop)
> >>
> >>  for (i = 0; i < size >> shift; i++, addr += bytes)
> >>  asm volatile(PPC_DCBSTPS(%0, %1): :"i"(0), "r"(addr): 
> >> "memory");
> >> -
> >> -
> >> -asm volatile(PPC_PHWSYNC ::: "memory");
> >>  }
> >>
> >>  static inline void __flush_pmem_range(unsigned long start, unsigned long 
> >> stop)
> >> @@ -34,9 +31,6 @@ static inline void __flush_pmem_range(unsigned long 
> >> start, unsigned long stop)
> >>
> >>  for (i = 0; i < size >> shift; i++, addr += bytes)
> >>  asm volatile(PPC_DCBFPS(%0, %1): :"i"(0), "r"(addr): 
> >> "memory");
> >> -
> >> -
> >> -asm volatile(PPC_PHWSYNC ::: "memory");
> >>  }
> >>
> >>  static inline void clean_pmem_range(unsigned long start, unsigned long 
> >> stop)
> >> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
> >> b/arch/powerpc/platforms/pseries/papr_scm.c
> >> index 9c569078a09f..9a9a0766f8b6 100644
> >> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> >> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> >> @@ -630,6 +630,18 @@ static int papr_scm_ndctl(struct 
> >> nvdimm_bus_descriptor *nd_desc,
> >>
> >>  return 0;
> >>  }
> >> +/*
> >> + * We have made sure the pmem writes are done such that before calling 
> >> this
> >> + * all the caches are flushed/clean. We use dcbf/dcbfps to ensure this. 
> >> Here
> >> + * we just need to add the necessary barrier to make sure the above 
> >> flushes
> >> + * are have updated persistent storage before any data access or data 
> >> transfer
> >> + * caused by subsequent instructions is initiated.
> >> + */
> >> +static int papr_scm_flush_sync(struct nd_region *nd_region, struct bio 
> >> *bio)
> >> +{
> >> +arch_pmem_flush_barrier();
> >> +return 0;
> >> +}
> >>
> >>  static ssize_t flags_show(struct device *dev,
> >>struct device_attribute *attr, char *buf)
> >> @@ -743,6 +755,7 @@ static int papr_scm_nvdimm_init(struct papr_scm_priv 
> >> *p)
> >>  ndr_desc.mapping = 
> >>  ndr_desc.num_mappings = 1;
> >>  ndr_desc.nd_set = >nd_set;
> >> +ndr_desc.flush = papr_scm_flush_sync;
> >
> > AFAICT currently the only device that implements flush is virtio_pmem.
> > How does the nfit driver get away without implementing flush?
>
> generic_nvdimm_flush does the required barrier for nfit. The reason for
> adding ndr_desc.flush call back for papr_scm was to avoid the usage
> of iomem based deep flushing (ndr_region_data.flush_wpq) which is not
> supported by papr_scm.
>
> BTW we do return NULL for ndrd_get_flush_wpq() on power. So the upstream
> code also does the same thing, but in a different way.
>
>
> > Also the flush takes arguments that are completely unused but a user of
> > the pmem region must assume they are used, and call flush() on the
> > region rather than arch_pmem_flush_barrier() directly.
>
> The bio argument can help a pmem driver to do range based flushing in
> case of pmem_make_request. If bio is null then we must assume a full
> device flush.

The bio argument isn't for range based flushing, it is for flush
operations that need to complete asynchronously.

There's no mechanism for the block layer to communicate range based
cache flushing, block-device flushing is assumed to be the device's
entire cache. For pmem that would be the entirety of the cpu cache.
Instead of modeling the cpu cache as a storage device cache it is
modeled as page-cache. Once the fs-layer writes back page-cache /
cpu-cache the storage device is only responsible for flushing those
cache-writes into the persistence domain.

Additionally there is a concept of deep-flush that relegates some
power-fail scenarios to a smaller failure domain. For example consider
the difference between a write arriving at the head of a device-queue
and successfully traversing a device-queue to media. The expectation
of pmem applications is that data is persisted once they reach the
equivalent of the x86 ADR domain, deep-flush is past ADR.


Re: [PATCH v6 5/8] powerpc/pmem/of_pmem: Update of_pmem to use the new barrier instruction.

2020-06-29 Thread Dan Williams
On Mon, Jun 29, 2020 at 6:58 AM Aneesh Kumar K.V
 wrote:
>
> of_pmem on POWER10 can now use phwsync instead of hwsync to ensure
> all previous writes are architecturally visible for the platform
> buffer flush.
>
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  arch/powerpc/include/asm/cacheflush.h | 7 +++
>  1 file changed, 7 insertions(+)
>
> diff --git a/arch/powerpc/include/asm/cacheflush.h 
> b/arch/powerpc/include/asm/cacheflush.h
> index 54764c6e922d..95782f77d768 100644
> --- a/arch/powerpc/include/asm/cacheflush.h
> +++ b/arch/powerpc/include/asm/cacheflush.h
> @@ -98,6 +98,13 @@ static inline void invalidate_dcache_range(unsigned long 
> start,
> mb();   /* sync */
>  }
>
> +#define arch_pmem_flush_barrier arch_pmem_flush_barrier
> +static inline void  arch_pmem_flush_barrier(void)
> +{
> +   if (cpu_has_feature(CPU_FTR_ARCH_207S))
> +   asm volatile(PPC_PHWSYNC ::: "memory");

Shouldn't this fallback to a compatible store-fence in an else statement?


Re: [PATCH updated] libnvdimm/nvdimm/flush: Allow architecture to override the flush barrier

2020-06-29 Thread Dan Williams
On Mon, Jun 29, 2020 at 1:29 PM Aneesh Kumar K.V
 wrote:
>
> Architectures like ppc64 provide persistent memory specific barriers
> that will ensure that all stores for which the modifications are
> written to persistent storage by preceding dcbfps and dcbstps
> instructions have updated persistent storage before any data
> access or data transfer caused by subsequent instructions is initiated.
> This is in addition to the ordering done by wmb()
>
> Update nvdimm core such that architecture can use barriers other than
> wmb to ensure all previous writes are architecturally visible for
> the platform buffer flush.
>
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  drivers/md/dm-writecache.c   | 2 +-
>  drivers/nvdimm/region_devs.c | 8 
>  include/linux/libnvdimm.h| 4 
>  3 files changed, 9 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c
> index 74f3c506f084..8c6b6dce64e2 100644
> --- a/drivers/md/dm-writecache.c
> +++ b/drivers/md/dm-writecache.c
> @@ -536,7 +536,7 @@ static void ssd_commit_superblock(struct dm_writecache 
> *wc)
>  static void writecache_commit_flushed(struct dm_writecache *wc, bool 
> wait_for_ios)
>  {
> if (WC_MODE_PMEM(wc))
> -   wmb();
> +   arch_pmem_flush_barrier();
> else
> ssd_commit_flushed(wc, wait_for_ios);
>  }
> diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
> index 4502f9c4708d..b308ad09b63d 100644
> --- a/drivers/nvdimm/region_devs.c
> +++ b/drivers/nvdimm/region_devs.c
> @@ -1206,13 +1206,13 @@ int generic_nvdimm_flush(struct nd_region *nd_region)
> idx = this_cpu_add_return(flush_idx, hash_32(current->pid + idx, 8));
>
> /*
> -* The first wmb() is needed to 'sfence' all previous writes
> -* such that they are architecturally visible for the platform
> -* buffer flush.  Note that we've already arranged for pmem
> +* The first arch_pmem_flush_barrier() is needed to 'sfence' all
> +* previous writes such that they are architecturally visible for
> +* the platform buffer flush. Note that we've already arranged for 
> pmem
>  * writes to avoid the cache via memcpy_flushcache().  The final
>  * wmb() ensures ordering for the NVDIMM flush write.
>  */
> -   wmb();
> +   arch_pmem_flush_barrier();
> for (i = 0; i < nd_region->ndr_mappings; i++)
> if (ndrd_get_flush_wpq(ndrd, i, 0))
> writeq(1, ndrd_get_flush_wpq(ndrd, i, idx));
> diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
> index 18da4059be09..66f6c65bd789 100644
> --- a/include/linux/libnvdimm.h
> +++ b/include/linux/libnvdimm.h
> @@ -286,4 +286,8 @@ static inline void arch_invalidate_pmem(void *addr, 
> size_t size)
>  }
>  #endif
>
> +#ifndef arch_pmem_flush_barrier
> +#define arch_pmem_flush_barrier() wmb()
> +#endif

I think it is out of place to define this in libnvdimm.h and it is odd
to give it such a long name. The other pmem api helpers like
arch_wb_cache_pmem() and arch_invalidate_pmem() are function calls for
libnvdimm driver operations, this barrier is just an instruction and
is closer to wmb() than the pmem api routine.

Since it is a store fence for pmem, so let's just call it pmem_wmb()
and define the generic version in include/linux/compiler.h. It should
probably also be documented alongside dma_wmb() in
Documentation/memory-barriers.txt about why code would use it over
wmb(), and why a symmetric pmem_rmb() is not needed.


Re: [PATCH v13 2/6] seq_buf: Export seq_buf_printf

2020-06-15 Thread Dan Williams
On Mon, Jun 15, 2020 at 5:56 AM Borislav Petkov  wrote:
>
> On Mon, Jun 15, 2020 at 06:14:03PM +0530, Vaibhav Jain wrote:
> > 'seq_buf' provides a very useful abstraction for writing to a string
> > buffer without needing to worry about it over-flowing. However even
> > though the API has been stable for couple of years now its still not
> > exported to kernel loadable modules limiting its usage.
> >
> > Hence this patch proposes update to 'seq_buf.c' to mark
> > seq_buf_printf() which is part of the seq_buf API to be exported to
> > kernel loadable GPL modules. This symbol will be used in later parts
> > of this patch-set to simplify content creation for a sysfs attribute.
> >
> > Cc: Piotr Maziarz 
> > Cc: Cezary Rojewski 
> > Cc: Christoph Hellwig 
> > Cc: Steven Rostedt 
> > Cc: Borislav Petkov 
> > Acked-by: Steven Rostedt (VMware) 
> > Signed-off-by: Vaibhav Jain 
> > ---
> > Changelog:
> >
> > v12..v13:
> > * None
> >
> > v11..v12:
> > * None
>
> Can you please resend your patchset once a week like everyone else and
> not flood inboxes with it?

Hi Boris,

I gave Vaibhav some long shot hope that his series could be included
in my libnvdimm pull request for -rc1. Save for a last minute clang
report that I misread as a gcc warning, I likely would have included.
This spin is looking to address the last of the comments I had and
something I would consider for -rc2. So, in this case the resends were
requested by me and I'll take the grumbles on Vaibhav's behalf.


Re: [PATCH v11 5/6] ndctl/papr_scm, uapi: Add support for PAPR nvdimm specific methods

2020-06-10 Thread Dan Williams
On Wed, Jun 10, 2020 at 5:10 AM Vaibhav Jain  wrote:
>
> Dan Williams  writes:
>
> > On Tue, Jun 9, 2020 at 10:54 AM Vaibhav Jain  wrote:
> >>
> >> Thanks Dan for the consideration and taking time to look into this.
> >>
> >> My responses below:
> >>
> >> Dan Williams  writes:
> >>
> >> > On Mon, Jun 8, 2020 at 5:16 PM kernel test robot  wrote:
> >> >>
> >> >> Hi Vaibhav,
> >> >>
> >> >> Thank you for the patch! Perhaps something to improve:
> >> >>
> >> >> [auto build test WARNING on powerpc/next]
> >> >> [also build test WARNING on linus/master v5.7 next-20200605]
> >> >> [cannot apply to linux-nvdimm/libnvdimm-for-next scottwood/next]
> >> >> [if your patch is applied to the wrong git tree, please drop us a note 
> >> >> to help
> >> >> improve the system. BTW, we also suggest to use '--base' option to 
> >> >> specify the
> >> >> base tree in git format-patch, please see 
> >> >> https://stackoverflow.com/a/37406982]
> >> >>
> >> >> url:
> >> >> https://github.com/0day-ci/linux/commits/Vaibhav-Jain/powerpc-papr_scm-Add-support-for-reporting-nvdimm-health/20200607-211653
> >> >> base:   
> >> >> https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
> >> >> config: powerpc-randconfig-r016-20200607 (attached as .config)
> >> >> compiler: clang version 11.0.0 (https://github.com/llvm/llvm-project 
> >> >> e429cffd4f228f70c1d9df0e5d77c08590dd9766)
> >> >> reproduce (this is a W=1 build):
> >> >> wget 
> >> >> https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross
> >> >>  -O ~/bin/make.cross
> >> >> chmod +x ~/bin/make.cross
> >> >> # install powerpc cross compiling tool for clang build
> >> >> # apt-get install binutils-powerpc-linux-gnu
> >> >> # save the attached .config to linux build tree
> >> >> COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross 
> >> >> ARCH=powerpc
> >> >>
> >> >> If you fix the issue, kindly add following tag as appropriate
> >> >> Reported-by: kernel test robot 
> >> >>
> >> >> All warnings (new ones prefixed by >>, old ones prefixed by <<):
> >> >>
> >> >> In file included from :1:
> >> >> >> ./usr/include/asm/papr_pdsm.h:69:20: warning: field 'hdr' with 
> >> >> >> variable sized type 'struct nd_cmd_pkg' not at the end of a struct 
> >> >> >> or class is a GNU extension [-Wgnu-variable-sized-type-not-at-end]
> >> >> struct nd_cmd_pkg hdr;  /* Package header containing sub-cmd */
> >> >
> >> > Hi Vaibhav,
> >> >
> >> [.]
> >> > This looks like it's going to need another round to get this fixed. I
> >> > don't think 'struct nd_pdsm_cmd_pkg' should embed a definition of
> >> > 'struct nd_cmd_pkg'. An instance of 'struct nd_cmd_pkg' carries a
> >> > payload that is the 'pdsm' specifics. As the code has it now it's
> >> > defined as a superset of 'struct nd_cmd_pkg' and the compiler warning
> >> > is pointing out a real 'struct' organization problem.
> >> >
> >> > Given the soak time needed in -next after the code is finalized this
> >> > there's no time to do another round of updates and still make the v5.8
> >> > merge window.
> >>
> >> Agreed that this looks bad, a solution will probably need some more
> >> review cycles resulting in this series missing the merge window.
> >>
> >> I am investigating into the possible solutions for this reported issue
> >> and made few observations:
> >>
> >> I see command pkg for Intel, Hpe, Msft and Hyperv families using a
> >> similar layout of embedding nd_cmd_pkg at the head of the
> >> command-pkg. struct nd_pdsm_cmd_pkg is following the same pattern.
> >>
> >> struct nd_pdsm_cmd_pkg {
> >> struct nd_cmd_pkg hdr;
> >> /* other members */
> >> };
> >>
> >> struct ndn_pkg_msft {
> >> struct nd_cmd_pkg gen;
> >> /* other members */
> >> };
> >> struct nd_pkg_intel {
> >> struct nd_cmd_pkg gen;
> >> /

Re: [PATCH v11 5/6] ndctl/papr_scm, uapi: Add support for PAPR nvdimm specific methods

2020-06-09 Thread Dan Williams
On Tue, Jun 9, 2020 at 10:54 AM Vaibhav Jain  wrote:
>
> Thanks Dan for the consideration and taking time to look into this.
>
> My responses below:
>
> Dan Williams  writes:
>
> > On Mon, Jun 8, 2020 at 5:16 PM kernel test robot  wrote:
> >>
> >> Hi Vaibhav,
> >>
> >> Thank you for the patch! Perhaps something to improve:
> >>
> >> [auto build test WARNING on powerpc/next]
> >> [also build test WARNING on linus/master v5.7 next-20200605]
> >> [cannot apply to linux-nvdimm/libnvdimm-for-next scottwood/next]
> >> [if your patch is applied to the wrong git tree, please drop us a note to 
> >> help
> >> improve the system. BTW, we also suggest to use '--base' option to specify 
> >> the
> >> base tree in git format-patch, please see 
> >> https://stackoverflow.com/a/37406982]
> >>
> >> url:
> >> https://github.com/0day-ci/linux/commits/Vaibhav-Jain/powerpc-papr_scm-Add-support-for-reporting-nvdimm-health/20200607-211653
> >> base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git 
> >> next
> >> config: powerpc-randconfig-r016-20200607 (attached as .config)
> >> compiler: clang version 11.0.0 (https://github.com/llvm/llvm-project 
> >> e429cffd4f228f70c1d9df0e5d77c08590dd9766)
> >> reproduce (this is a W=1 build):
> >> wget 
> >> https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross 
> >> -O ~/bin/make.cross
> >> chmod +x ~/bin/make.cross
> >> # install powerpc cross compiling tool for clang build
> >> # apt-get install binutils-powerpc-linux-gnu
> >> # save the attached .config to linux build tree
> >> COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross 
> >> ARCH=powerpc
> >>
> >> If you fix the issue, kindly add following tag as appropriate
> >> Reported-by: kernel test robot 
> >>
> >> All warnings (new ones prefixed by >>, old ones prefixed by <<):
> >>
> >> In file included from :1:
> >> >> ./usr/include/asm/papr_pdsm.h:69:20: warning: field 'hdr' with variable 
> >> >> sized type 'struct nd_cmd_pkg' not at the end of a struct or class is a 
> >> >> GNU extension [-Wgnu-variable-sized-type-not-at-end]
> >> struct nd_cmd_pkg hdr;  /* Package header containing sub-cmd */
> >
> > Hi Vaibhav,
> >
> [.]
> > This looks like it's going to need another round to get this fixed. I
> > don't think 'struct nd_pdsm_cmd_pkg' should embed a definition of
> > 'struct nd_cmd_pkg'. An instance of 'struct nd_cmd_pkg' carries a
> > payload that is the 'pdsm' specifics. As the code has it now it's
> > defined as a superset of 'struct nd_cmd_pkg' and the compiler warning
> > is pointing out a real 'struct' organization problem.
> >
> > Given the soak time needed in -next after the code is finalized this
> > there's no time to do another round of updates and still make the v5.8
> > merge window.
>
> Agreed that this looks bad, a solution will probably need some more
> review cycles resulting in this series missing the merge window.
>
> I am investigating into the possible solutions for this reported issue
> and made few observations:
>
> I see command pkg for Intel, Hpe, Msft and Hyperv families using a
> similar layout of embedding nd_cmd_pkg at the head of the
> command-pkg. struct nd_pdsm_cmd_pkg is following the same pattern.
>
> struct nd_pdsm_cmd_pkg {
> struct nd_cmd_pkg hdr;
> /* other members */
> };
>
> struct ndn_pkg_msft {
> struct nd_cmd_pkg gen;
> /* other members */
> };
> struct nd_pkg_intel {
> struct nd_cmd_pkg gen;
> /* other members */
> };
> struct ndn_pkg_hpe1 {
> struct nd_cmd_pkg gen;
> /* other members */

In those cases the other members are a union and there is no second
variable length array. Perhaps that is why those definitions are not
getting flagged? I'm not seeing anything in ndctl build options that
would explicitly disable this warning, but I'm not sure if the ndctl
build environment is missing this build warning by accident.

Those variable size payloads are also not being used in any code paths
that would look at the size of the command payload, like the kernel
ioctl() path. The payload validation code needs static sizes and the
payload parsing code wants to cast the payload to a known type. I
don't think you can use the same struct definition for both those
cases which is why the ndctl parsing code uses the union layout, but
the kernel command marsha

Re: [PATCH v11 5/6] ndctl/papr_scm, uapi: Add support for PAPR nvdimm specific methods

2020-06-08 Thread Dan Williams
On Mon, Jun 8, 2020 at 5:16 PM kernel test robot  wrote:
>
> Hi Vaibhav,
>
> Thank you for the patch! Perhaps something to improve:
>
> [auto build test WARNING on powerpc/next]
> [also build test WARNING on linus/master v5.7 next-20200605]
> [cannot apply to linux-nvdimm/libnvdimm-for-next scottwood/next]
> [if your patch is applied to the wrong git tree, please drop us a note to help
> improve the system. BTW, we also suggest to use '--base' option to specify the
> base tree in git format-patch, please see 
> https://stackoverflow.com/a/37406982]
>
> url:
> https://github.com/0day-ci/linux/commits/Vaibhav-Jain/powerpc-papr_scm-Add-support-for-reporting-nvdimm-health/20200607-211653
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
> config: powerpc-randconfig-r016-20200607 (attached as .config)
> compiler: clang version 11.0.0 (https://github.com/llvm/llvm-project 
> e429cffd4f228f70c1d9df0e5d77c08590dd9766)
> reproduce (this is a W=1 build):
> wget 
> https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
> ~/bin/make.cross
> chmod +x ~/bin/make.cross
> # install powerpc cross compiling tool for clang build
> # apt-get install binutils-powerpc-linux-gnu
> # save the attached .config to linux build tree
> COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross 
> ARCH=powerpc
>
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot 
>
> All warnings (new ones prefixed by >>, old ones prefixed by <<):
>
> In file included from :1:
> >> ./usr/include/asm/papr_pdsm.h:69:20: warning: field 'hdr' with variable 
> >> sized type 'struct nd_cmd_pkg' not at the end of a struct or class is a 
> >> GNU extension [-Wgnu-variable-sized-type-not-at-end]
> struct nd_cmd_pkg hdr;  /* Package header containing sub-cmd */

Hi Vaibhav,

This looks like it's going to need another round to get this fixed. I
don't think 'struct nd_pdsm_cmd_pkg' should embed a definition of
'struct nd_cmd_pkg'. An instance of 'struct nd_cmd_pkg' carries a
payload that is the 'pdsm' specifics. As the code has it now it's
defined as a superset of 'struct nd_cmd_pkg' and the compiler warning
is pointing out a real 'struct' organization problem.

Given the soak time needed in -next after the code is finalized this
there's no time to do another round of updates and still make the v5.8
merge window.


Re: [PATCH v10 5/6] ndctl/papr_scm, uapi: Add support for PAPR nvdimm specific methods

2020-06-05 Thread Dan Williams
On Fri, Jun 5, 2020 at 12:50 PM Ira Weiny  wrote:
>
> On Fri, Jun 05, 2020 at 05:11:35AM +0530, Vaibhav Jain wrote:
> > Introduce support for PAPR NVDIMM Specific Methods (PDSM) in papr_scm
> > module and add the command family NVDIMM_FAMILY_PAPR to the white list
> > of NVDIMM command sets. Also advertise support for ND_CMD_CALL for the
> > nvdimm command mask and implement necessary scaffolding in the module
> > to handle ND_CMD_CALL ioctl and PDSM requests that we receive.
> >
> > The layout of the PDSM request as we expect from libnvdimm/libndctl is
> > described in newly introduced uapi header 'papr_pdsm.h' which
> > defines a new 'struct nd_pdsm_cmd_pkg' header. This header is used
> > to communicate the PDSM request via member
> > 'nd_cmd_pkg.nd_command' and size of payload that need to be
> > sent/received for servicing the PDSM.
> >
> > A new function is_cmd_valid() is implemented that reads the args to
> > papr_scm_ndctl() and performs sanity tests on them. A new function
> > papr_scm_service_pdsm() is introduced and is called from
> > papr_scm_ndctl() in case of a PDSM request is received via ND_CMD_CALL
> > command from libnvdimm.
> >
> > Cc: "Aneesh Kumar K . V" 
> > Cc: Dan Williams 
> > Cc: Michael Ellerman 
> > Cc: Ira Weiny 
> > Signed-off-by: Vaibhav Jain 
> > ---
> > Changelog:
> >
> > v9..v10:
> > * Simplified 'struct nd_pdsm_cmd_pkg' by removing the
> >   'payload_version' field.
> > * Removed the corrosponding documentation on versioning and backward
> >   compatibility from 'papr_pdsm.h'
> > * Reduced the size of reserved fields to 4-bytes making 'struct
> >   nd_pdsm_cmd_pkg' 64 + 8 bytes long.
> > * Updated is_cmd_valid() to enforce validation checks on pdsm
> >   commands. [ Dan Williams ]
> > * Added check for reserved fields being set to '0' in is_cmd_valid()
> >   [ Ira ]
> > * Moved changes for checking cmd_rc == NULL and logging improvements
> >   to a separate prelim patch [ Ira ].
> > * Moved  pdsm package validation checks from papr_scm_service_pdsm()
> >   to is_cmd_valid().
> > * Marked papr_scm_service_pdsm() return type as 'void' since errors
> >   are reported in nd_pdsm_cmd_pkg.cmd_status field.
> >
> > Resend:
> > * Added ack from Aneesh.
> >
> > v8..v9:
> > * Reduced the usage of term SCM replacing it with appropriate
> >   replacement [ Dan Williams, Aneesh ]
> > * Renamed 'papr_scm_pdsm.h' to 'papr_pdsm.h'
> > * s/PAPR_SCM_PDSM_*/PAPR_PDSM_*/g
> > * s/NVDIMM_FAMILY_PAPR_SCM/NVDIMM_FAMILY_PAPR/g
> > * Minor updates to 'papr_psdm.h' to replace usage of term 'SCM'.
> > * Minor update to patch description.
> >
> > v7..v8:
> > * Removed the 'payload_offset' field from 'struct
> >   nd_pdsm_cmd_pkg'. Instead command payload is always assumed to start
> >   at 'nd_pdsm_cmd_pkg.payload'. [ Aneesh ]
> > * To enable introducing new fields to 'struct nd_pdsm_cmd_pkg',
> >   'reserved' field of 10-bytes is introduced. [ Aneesh ]
> > * Fixed a typo in "Backward Compatibility" section of papr_scm_pdsm.h
> >   [ Ira ]
> >
> > Resend:
> > * None
> >
> > v6..v7 :
> > * Removed the re-definitions of __packed macro from papr_scm_pdsm.h
> >   [Mpe].
> > * Removed the usage of __KERNEL__ macros in papr_scm_pdsm.h [Mpe].
> > * Removed macros that were unused in papr_scm.c from papr_scm_pdsm.h
> >   [Mpe].
> > * Made functions defined in papr_scm_pdsm.h as static inline. [Mpe]
> >
> > v5..v6 :
> > * Changed the usage of the term DSM to PDSM to distinguish it from the
> >   ACPI term [ Dan Williams ]
> > * Renamed papr_scm_dsm.h to papr_scm_pdsm.h and updated various struct
> >   to reflect the new terminology.
> > * Updated the patch description and title to reflect the new terminology.
> > * Squashed patch to introduce new command family in 'ndctl.h' with
> >   this patch [ Dan Williams ]
> > * Updated the papr_scm_pdsm method starting index from 0x1 to 0x0
> >   [ Dan Williams ]
> > * Removed redundant license text from the papr_scm_psdm.h file.
> >   [ Dan Williams ]
> > * s/envelop/envelope/ at various places [ Dan Williams ]
> > * Added '__packed' attribute to command package header to gaurd
> >   against different compiler adding paddings between the fields.
> >   [ Dan Williams]
> > * Converted various pr_debug to dev_debug [ Dan Williams ]
> >
> > v4..v5 :
> > * None
> >
> > v3..v4 :
> > * None
> >
> > v2..v3 :
> > * Updated the patch prefi

Re: [PATCH v10 4/6] powerpc/papr_scm: Improve error logging and handling papr_scm_ndctl()

2020-06-05 Thread Dan Williams
On Fri, Jun 5, 2020 at 10:13 AM Ira Weiny  wrote:
>
> On Fri, Jun 05, 2020 at 05:11:34AM +0530, Vaibhav Jain wrote:
> > Since papr_scm_ndctl() can be called from outside papr_scm, its
> > exposed to the possibility of receiving NULL as value of 'cmd_rc'
> > argument. This patch updates papr_scm_ndctl() to protect against such
> > possibility by assigning it pointer to a local variable in case cmd_rc
> > == NULL.
> >
> > Finally the patch also updates the 'default' clause of the switch-case
> > block removing a 'return' statement thereby ensuring that value of
> > 'cmd_rc' is always logged when papr_scm_ndctl() returns.
> >
> > Cc: "Aneesh Kumar K . V" 
> > Cc: Dan Williams 
> > Cc: Michael Ellerman 
> > Cc: Ira Weiny 
> > Signed-off-by: Vaibhav Jain 
> > ---
> > Changelog:
> >
> > v9..v10
> > * New patch in the series
>
> Thanks for making this a separate patch it is easier to see what is going on
> here.
>
> > ---
> >  arch/powerpc/platforms/pseries/papr_scm.c | 10 --
> >  1 file changed, 8 insertions(+), 2 deletions(-)
> >
> > diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
> > b/arch/powerpc/platforms/pseries/papr_scm.c
> > index 0c091622b15e..6512fe6a2874 100644
> > --- a/arch/powerpc/platforms/pseries/papr_scm.c
> > +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> > @@ -355,11 +355,16 @@ static int papr_scm_ndctl(struct 
> > nvdimm_bus_descriptor *nd_desc,
> >  {
> >   struct nd_cmd_get_config_size *get_size_hdr;
> >   struct papr_scm_priv *p;
> > + int rc;
> >
> >   /* Only dimm-specific calls are supported atm */
> >   if (!nvdimm)
> >   return -EINVAL;
> >
> > + /* Use a local variable in case cmd_rc pointer is NULL */
> > + if (!cmd_rc)
> > + cmd_rc = 
> > +
>
> This protects you from the NULL.  However...
>
> >   p = nvdimm_provider_data(nvdimm);
> >
> >   switch (cmd) {
> > @@ -381,12 +386,13 @@ static int papr_scm_ndctl(struct 
> > nvdimm_bus_descriptor *nd_desc,
> >   break;
> >
> >   default:
> > - return -EINVAL;
> > + dev_dbg(>pdev->dev, "Unknown command = %d\n", cmd);
> > + *cmd_rc = -EINVAL;
>
> ... I think you are conflating rc and cmd_rc...
>
> >   }
> >
> >   dev_dbg(>pdev->dev, "returned with cmd_rc = %d\n", *cmd_rc);
> >
> > - return 0;
> > + return *cmd_rc;
>
> ... this changes the behavior of the current commands.  Now if the underlying
> papr_scm_meta_[get|set]() fails you return that failure as rc rather than 0.
>
> Is that ok?

The expectation is that rc is "did the command get sent to the device,
or did it fail for 'transport' reasons". The role of cmd_rc is to
translate the specific status response of the command into a common
error code. The expectations are:

rc < 0: Error code, Linux terminated the ioctl before talking to hardware

rc == 0: Linux successfully submitted the command to hardware, cmd_rc
is valid for command specific response

rc > 0: Linux successfully submitted the command, but detected that
only a subset of the data was accepted for "write"-style commands, or
that only subset of data was returned for "read"-style commands. I.e.
short-write / short-read semantics. cmd_rc is valid in this case and
its up to userspace to determine if a short transfer is an error or
not.

> Also 'logging cmd_rc' in the invalid cmd case does not seem quite right unless
> you really want rc to be cmd_rc.
>
> The architecture is designed to separate errors which occur in the kernel vs
> errors in the firmware/dimm.  Are they always the same?  The current code
> differentiates them.

Yeah, they're distinct, transport vs end-point / command-specific
status returns.


Re: [RESEND PATCH v9 4/5] ndctl/papr_scm,uapi: Add support for PAPR nvdimm specific methods

2020-06-05 Thread Dan Williams
On Fri, Jun 5, 2020 at 8:22 AM Vaibhav Jain  wrote:
[..]
> > Oh, why not define a maximal health payload with all the attributes
> > you know about today, leave some room for future expansion, and then
> > report a validity flag for each attribute? This is how the "intel"
> > smart-health payload works. If they ever needed to extend the payload
> > they would increase the size and add more validity flags. Old
> > userspace never groks the new fields, new userspace knows to ask for
> > and parse the larger payload.
> >
> > See the flags field in 'struct nd_intel_smart' (in ndctl) and the
> > translation of those flags to ndctl generic attribute flags
> > intel_cmd_smart_get_flags().
> >
> > In general I'd like ndctl to understand the superset of all health
> > attributes across all vendors. For the truly vendor specific ones it
> > would mean that the health flags with a specific "papr_scm" back-end
> > just would never be set on an "intel" device. I.e. look at the "hpe"
> > and "msft" health backends. They only set a subset of the valid flags
> > that could be reported.
>
> Thanks, this sounds good. Infact papr_scm implementation in ndctl does
> advertises support for only a subset of ND_SMART_* flags right now.
>
> Using 'flags' instead of 'version' was indeed discussed during
> v7..v9. However re-looking at the 'msft' and 'hpe' implementations the
> approach of maximal health payload tagged with a flags field looks more
> intuitive and I would prefer implementing this scheme in this patch-set.
>
> The current set health data exchanged with between libndctl and
> papr_scm via 'struct nd_papr_pdsm_health' (e.g various health status
> bits , nvdimm arming status etc) are guaranteed to be always available
> hence associating their availability with a flag wont be much useful as
> the flag will be always set.
>
> However as you suggested, extending the 'struct nd_papr_pdsm_health' in
> future to accommodate new attributes like 'life-remaining' can be done
> via adding them to the end of the struct and setting a flag field to
> indicate its presence.
>
> So I have the following proposal:
> * Add a new '__u32 extension_flags' field at beginning of 'struct
>   nd_papr_pdsm_health'
> * Set the size of the struct to 184-bytes which is the maximum possible
>   size for a pdsm payload.
> * 'papr_scm' kernel driver will currently set 'extension_flag' to 0
>   indicating no extension fields.
>
> * Future patch that adds support for 'life-remaining' add the new-field
>   at the end of known fields in 'struct nd_papr_pdsm_health'.
> * When provided to  papr_scm kernel module, if 'life-remaining' data is
>   available its populated and corresponding flag set in
>   'extension_flags' field indicating its presence.
> * When received by libndctl papr_scm implementation its tests if the
>   extension_flags have associated 'life-remaining' flag set and if yes
>   then return ND_SMART_USED_VALID flag back from
>   ndctl_cmd_smart_get_flags().
>
> Implementing first 3 items above in the current patchset should be
> fairly trivial.
>
> Does that sounds reasonable ?

This sounds good to me.


Re: [RFC PATCH 1/2] libnvdimm: Add prctl control for disabling synchronous fault support.

2020-05-30 Thread Dan Williams
On Sat, May 30, 2020 at 12:18 AM Aneesh Kumar K.V
 wrote:
>
> On 5/30/20 12:52 AM, Dan Williams wrote:
> > On Fri, May 29, 2020 at 3:55 AM Aneesh Kumar K.V
> >  wrote:
> >>
> >> On 5/29/20 3:22 PM, Jan Kara wrote:
> >>> Hi!
> >>>
> >>> On Fri 29-05-20 15:07:31, Aneesh Kumar K.V wrote:
> >>>> Thanks Michal. I also missed Jeff in this email thread.
> >>>
> >>> And I think you'll also need some of the sched maintainers for the prctl
> >>> bits...
> >>>
> >>>> On 5/29/20 3:03 PM, Michal Suchánek wrote:
> >>>>> Adding Jan
> >>>>>
> >>>>> On Fri, May 29, 2020 at 11:11:39AM +0530, Aneesh Kumar K.V wrote:
> >>>>>> With POWER10, architecture is adding new pmem flush and sync 
> >>>>>> instructions.
> >>>>>> The kernel should prevent the usage of MAP_SYNC if applications are 
> >>>>>> not using
> >>>>>> the new instructions on newer hardware.
> >>>>>>
> >>>>>> This patch adds a prctl option MAP_SYNC_ENABLE that can be used to 
> >>>>>> enable
> >>>>>> the usage of MAP_SYNC. The kernel config option is added to allow the 
> >>>>>> user
> >>>>>> to control whether MAP_SYNC should be enabled by default or not.
> >>>>>>
> >>>>>> Signed-off-by: Aneesh Kumar K.V 
> >>> ...
> >>>>>> diff --git a/kernel/fork.c b/kernel/fork.c
> >>>>>> index 8c700f881d92..d5a9a363e81e 100644
> >>>>>> --- a/kernel/fork.c
> >>>>>> +++ b/kernel/fork.c
> >>>>>> @@ -963,6 +963,12 @@ __cacheline_aligned_in_smp 
> >>>>>> DEFINE_SPINLOCK(mmlist_lock);
> >>>>>> static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
> >>>>>> +#ifdef CONFIG_ARCH_MAP_SYNC_DISABLE
> >>>>>> +unsigned long default_map_sync_mask = MMF_DISABLE_MAP_SYNC_MASK;
> >>>>>> +#else
> >>>>>> +unsigned long default_map_sync_mask = 0;
> >>>>>> +#endif
> >>>>>> +
> >>>
> >>> I'm not sure CONFIG is really the right approach here. For a distro that 
> >>> would
> >>> basically mean to disable MAP_SYNC for all PPC kernels unless application
> >>> explicitly uses the right prctl. Shouldn't we rather initialize
> >>> default_map_sync_mask on boot based on whether the CPU we run on requires
> >>> new flush instructions or not? Otherwise the patch looks sensible.
> >>>
> >>
> >> yes that is correct. We ideally want to deny MAP_SYNC only w.r.t
> >> POWER10. But on a virtualized platform there is no easy way to detect
> >> that. We could ideally hook this into the nvdimm driver where we look at
> >> the new compat string ibm,persistent-memory-v2 and then disable MAP_SYNC
> >> if we find a device with the specific value.
> >>
> >> BTW with the recent changes I posted for the nvdimm driver, older kernel
> >> won't initialize persistent memory device on newer hardware. Newer
> >> hardware will present the device to OS with a different device tree
> >> compat string.
> >>
> >> My expectation  w.r.t this patch was, Distro would want to  mark
> >> CONFIG_ARCH_MAP_SYNC_DISABLE=n based on the different application
> >> certification.  Otherwise application will have to end up calling the
> >> prctl(MMF_DISABLE_MAP_SYNC, 0) any way. If that is the case, should this
> >> be dependent on P10?
> >>
> >> With that I am wondering should we even have this patch? Can we expect
> >> userspace get updated to use new instruction?.
> >>
> >> With ppc64 we never had a real persistent memory device available for
> >> end user to try. The available persistent memory stack was using vPMEM
> >> which was presented as a volatile memory region for which there is no
> >> need to use any of the flush instructions. We could safely assume that
> >> as we get applications certified/verified for working with pmem device
> >> on ppc64, they would all be using the new instructions?
> >
> > I think prctl is the wrong interface for this. I was thinking a sysfs
> > interface along the same lines as /sys/block/pmemX/dax/write_cache.
> > That attribute is toggling DAXDEV_WRITE_CACHE for the determination of
> > whether the platform or the kernel needs to handle cache flushing
> > relative to power loss. A similar attribute can be established for
> > DAXDEV_SYNC, it would simply default to off based on a configuration
> > time policy, but be dynamically changeable at runtime via sysfs.
> >
> > These flags are device properties that affect the kernel and
> > userspace's handling of persistence.
> >
>
> That will not handle the scenario with multiple applications using the
> same fsdax mount point where one is updated to use the new instruction
> and the other is not.

Right, it needs to be a global setting / flag day to switch from one
regime to another. Per-process control is a recipe for disaster.


Re: [RFC PATCH 1/2] libnvdimm: Add prctl control for disabling synchronous fault support.

2020-05-29 Thread Dan Williams
On Fri, May 29, 2020 at 3:55 AM Aneesh Kumar K.V
 wrote:
>
> On 5/29/20 3:22 PM, Jan Kara wrote:
> > Hi!
> >
> > On Fri 29-05-20 15:07:31, Aneesh Kumar K.V wrote:
> >> Thanks Michal. I also missed Jeff in this email thread.
> >
> > And I think you'll also need some of the sched maintainers for the prctl
> > bits...
> >
> >> On 5/29/20 3:03 PM, Michal Suchánek wrote:
> >>> Adding Jan
> >>>
> >>> On Fri, May 29, 2020 at 11:11:39AM +0530, Aneesh Kumar K.V wrote:
>  With POWER10, architecture is adding new pmem flush and sync 
>  instructions.
>  The kernel should prevent the usage of MAP_SYNC if applications are not 
>  using
>  the new instructions on newer hardware.
> 
>  This patch adds a prctl option MAP_SYNC_ENABLE that can be used to enable
>  the usage of MAP_SYNC. The kernel config option is added to allow the 
>  user
>  to control whether MAP_SYNC should be enabled by default or not.
> 
>  Signed-off-by: Aneesh Kumar K.V 
> > ...
>  diff --git a/kernel/fork.c b/kernel/fork.c
>  index 8c700f881d92..d5a9a363e81e 100644
>  --- a/kernel/fork.c
>  +++ b/kernel/fork.c
>  @@ -963,6 +963,12 @@ __cacheline_aligned_in_smp 
>  DEFINE_SPINLOCK(mmlist_lock);
> static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
>  +#ifdef CONFIG_ARCH_MAP_SYNC_DISABLE
>  +unsigned long default_map_sync_mask = MMF_DISABLE_MAP_SYNC_MASK;
>  +#else
>  +unsigned long default_map_sync_mask = 0;
>  +#endif
>  +
> >
> > I'm not sure CONFIG is really the right approach here. For a distro that 
> > would
> > basically mean to disable MAP_SYNC for all PPC kernels unless application
> > explicitly uses the right prctl. Shouldn't we rather initialize
> > default_map_sync_mask on boot based on whether the CPU we run on requires
> > new flush instructions or not? Otherwise the patch looks sensible.
> >
>
> yes that is correct. We ideally want to deny MAP_SYNC only w.r.t
> POWER10. But on a virtualized platform there is no easy way to detect
> that. We could ideally hook this into the nvdimm driver where we look at
> the new compat string ibm,persistent-memory-v2 and then disable MAP_SYNC
> if we find a device with the specific value.
>
> BTW with the recent changes I posted for the nvdimm driver, older kernel
> won't initialize persistent memory device on newer hardware. Newer
> hardware will present the device to OS with a different device tree
> compat string.
>
> My expectation  w.r.t this patch was, Distro would want to  mark
> CONFIG_ARCH_MAP_SYNC_DISABLE=n based on the different application
> certification.  Otherwise application will have to end up calling the
> prctl(MMF_DISABLE_MAP_SYNC, 0) any way. If that is the case, should this
> be dependent on P10?
>
> With that I am wondering should we even have this patch? Can we expect
> userspace get updated to use new instruction?.
>
> With ppc64 we never had a real persistent memory device available for
> end user to try. The available persistent memory stack was using vPMEM
> which was presented as a volatile memory region for which there is no
> need to use any of the flush instructions. We could safely assume that
> as we get applications certified/verified for working with pmem device
> on ppc64, they would all be using the new instructions?

I think prctl is the wrong interface for this. I was thinking a sysfs
interface along the same lines as /sys/block/pmemX/dax/write_cache.
That attribute is toggling DAXDEV_WRITE_CACHE for the determination of
whether the platform or the kernel needs to handle cache flushing
relative to power loss. A similar attribute can be established for
DAXDEV_SYNC, it would simply default to off based on a configuration
time policy, but be dynamically changeable at runtime via sysfs.

These flags are device properties that affect the kernel and
userspace's handling of persistence.


Re: [PATCH v8 1/5] powerpc: Document details on H_SCM_HEALTH hcall

2020-05-27 Thread Dan Williams
On Tue, May 26, 2020 at 9:13 PM Vaibhav Jain  wrote:
>
> Add documentation to 'papr_hcalls.rst' describing the bitmap flags
> that are returned from H_SCM_HEALTH hcall as per the PAPR-SCM
> specification.
>

Please do a global s/SCM/PMEM/ or s/SCM/NVDIMM/. It's unfortunate that
we already have 2 ways to describe persistent memory devices, let's
not perpetuate a third so that "grep" has a chance to find
interrelated code across architectures. Other than that this looks
good to me.

> Cc: "Aneesh Kumar K . V" 
> Cc: Dan Williams 
> Cc: Michael Ellerman 
> Cc: Ira Weiny 
> Signed-off-by: Vaibhav Jain 
> ---
> Changelog:
> v7..v8:
> * Added a clarification on bit-ordering of Health Bitmap
>
> Resend:
> * None
>
> v6..v7:
> * None
>
> v5..v6:
> * New patch in the series
> ---
>  Documentation/powerpc/papr_hcalls.rst | 45 ---
>  1 file changed, 41 insertions(+), 4 deletions(-)
>
> diff --git a/Documentation/powerpc/papr_hcalls.rst 
> b/Documentation/powerpc/papr_hcalls.rst
> index 3493631a60f8..45063f305813 100644
> --- a/Documentation/powerpc/papr_hcalls.rst
> +++ b/Documentation/powerpc/papr_hcalls.rst
> @@ -220,13 +220,50 @@ from the LPAR memory.
>  **H_SCM_HEALTH**
>
>  | Input: drcIndex
> -| Out: *health-bitmap, health-bit-valid-bitmap*
> +| Out: *health-bitmap (r4), health-bit-valid-bitmap (r5)*
>  | Return Value: *H_Success, H_Parameter, H_Hardware*
>
>  Given a DRC Index return the info on predictive failure and overall health of
> -the NVDIMM. The asserted bits in the health-bitmap indicate a single 
> predictive
> -failure and health-bit-valid-bitmap indicate which bits in health-bitmap are
> -valid.
> +the NVDIMM. The asserted bits in the health-bitmap indicate one or more 
> states
> +(described in table below) of the NVDIMM and health-bit-valid-bitmap indicate
> +which bits in health-bitmap are valid. The bits are reported in
> +reverse bit ordering for example a value of 0xC400
> +indicates bits 0, 1, and 5 are valid.
> +
> +Health Bitmap Flags:
> +
> ++--+---+
> +|  Bit |   Definition
>   |
> ++==+===+
> +|  00  | SCM device is unable to persist memory contents.
>   |
> +|  | If the system is powered down, nothing will be saved.   
>   |
> ++--+---+
> +|  01  | SCM device failed to persist memory contents. Either contents were 
> not|
> +|  | saved successfully on power down or were not restored properly on   
>   |
> +|  | power up.   
>   |
> ++--+---+
> +|  02  | SCM device contents are persisted from previous IPL. The data from  
>   |
> +|  | the last boot were successfully restored.   
>   |
> ++--+---+
> +|  03  | SCM device contents are not persisted from previous IPL. There was 
> no |
> +|  | data to restore from the last boot. 
>   |
> ++--+---+
> +|  04  | SCM device memory life remaining is critically low  
>   |
> ++--+---+
> +|  05  | SCM device will be garded off next IPL due to failure   
>   |
> ++--+---+
> +|  06  | SCM contents cannot persist due to current platform health status. 
> A  |
> +|  | hardware failure may prevent data from being saved or restored. 
>   |
> ++--+---+
> +|  07  | SCM device is unable to persist memory contents in certain 
> conditions |
> ++--+---+
> +|  08  | SCM device is encrypted 
>   |
> ++--+---+
> +|  09  | SCM device has successfully completed a requested erase or secure   
>   |
> +|  | erase procedure.
>   |
> ++--+---+
> +|10:63 | Reserved / Unused   
>   |
> ++--+---+
>
>  **H_SCM_PERFORMANCE_STATS**
>
> --
> 2.26.2
>


Re: [PATCH v2 3/5] libnvdimm/nvdimm/flush: Allow architecture to override the flush barrier

2020-05-21 Thread Dan Williams
On Thu, May 21, 2020 at 7:39 AM Jeff Moyer  wrote:
>
> Dan Williams  writes:
>
> >> But I agree with your concern that if we have older kernel/applications
> >> that continue to use `dcbf` on future hardware we will end up
> >> having issues w.r.t powerfail consistency. The plan is what you outlined
> >> above as tighter ecosystem control. Considering we don't have a pmem
> >> device generally available, we get both kernel and userspace upgraded
> >> to use these new instructions before such a device is made available.
>
> I thought power already supported NVDIMM-N, no?  So are you saying that
> those devices will continue to work with the existing flushing and
> fencing mechanisms?
>
> > Ok, I think a compile time kernel option with a runtime override
> > satisfies my concern. Does that work for you?
>
> The compile time option only helps when running newer kernels.  I'm not
> sure how you would even begin to audit userspace applications (keep in
> mind, not every application is open source, and not every application
> uses pmdk).  I also question the merits of forcing the administrator to
> make the determination of whether all applications on the system will
> work properly.  Really, you have to rely on the vendor to tell you the
> platform is supported, and at that point, why put further hurdles in the
> way?

I'm thoroughly confused by this. I thought this was exactly the role
of a Linux distribution vendor. ISVs qualify their application on a
hardware-platform + distribution combination and the distribution owns
picking ABI defaults like CONFIG_SYSFS_DEPRECATED regardless of
whether they can guarantee that all apps are updated to the new
semantics.

The administrator is not forced, the administrator if afforded an
override in the extreme case that they find an exception to what was
qualified and need to override the distribution's compile-time choice.

>
> The decision to require different instructions on ppc is unfortunate,
> but one I'm sure we have no control over.  I don't see any merit in the
> kernel disallowing MAP_SYNC access on these platforms.  Ideally, we'd
> have some way of ensuring older kernels don't work with these new
> platforms, but I don't think that's possible.

I see disabling MAP_SYNC as the more targeted form of "ensursing older
kernels don't work.

So I guess we agree that something should break when baseline
assumptions change, we just don't yet agree on where that break should
happen?


Re: [PATCH v2 3/5] libnvdimm/nvdimm/flush: Allow architecture to override the flush barrier

2020-05-21 Thread Dan Williams
On Thu, May 21, 2020 at 10:03 AM Aneesh Kumar K.V
 wrote:
>
> On 5/21/20 8:08 PM, Jeff Moyer wrote:
> > Dan Williams  writes:
> >
> >>> But I agree with your concern that if we have older kernel/applications
> >>> that continue to use `dcbf` on future hardware we will end up
> >>> having issues w.r.t powerfail consistency. The plan is what you outlined
> >>> above as tighter ecosystem control. Considering we don't have a pmem
> >>> device generally available, we get both kernel and userspace upgraded
> >>> to use these new instructions before such a device is made available.
> >
> > I thought power already supported NVDIMM-N, no?  So are you saying that
> > those devices will continue to work with the existing flushing and
> > fencing mechanisms?
> >
>
> yes. these devices can continue to use 'dcbf + hwsync' as long as we are
> running them on P9.
>
>
> >> Ok, I think a compile time kernel option with a runtime override
> >> satisfies my concern. Does that work for you?
> >
> > The compile time option only helps when running newer kernels.  I'm not
> > sure how you would even begin to audit userspace applications (keep in
> > mind, not every application is open source, and not every application
> > uses pmdk).  I also question the merits of forcing the administrator to
> > make the determination of whether all applications on the system will
> > work properly.  Really, you have to rely on the vendor to tell you the
> > platform is supported, and at that point, why put further hurdles in the
> > way?
> >
> > The decision to require different instructions on ppc is unfortunate,
> > but one I'm sure we have no control over.  I don't see any merit in the
> > kernel disallowing MAP_SYNC access on these platforms.  Ideally, we'd
> > have some way of ensuring older kernels don't work with these new
> > platforms, but I don't think that's possible.
> >
>
>
> I am currently looking at the possibility of firmware present these
> devices with different device-tree compat values. So that older
> /existing kernel won't initialize the device on newer systems. Is that a
> good compromise? We still can end up with older userspace and newer
> kernel. One of the option suggested by Jan Kara is to use a prctl flag
> to control that? (intead of kernel parameter option I posted before)
>
>
> > Moving on to the patch itself--Aneesh, have you audited other persistent
> > memory users in the kernel?  For example, drivers/md/dm-writecache.c does
> > this:
> >
> > static void writecache_commit_flushed(struct dm_writecache *wc, bool 
> > wait_for_ios)
> > {
> >   if (WC_MODE_PMEM(wc))
> >   wmb(); <==
> >  else
> >  ssd_commit_flushed(wc, wait_for_ios);
> > }
> >
> > I believe you'll need to make modifications there.
> >
>
> Correct. Thanks for catching that.
>
>
> I don't understand dm much, wondering how this will work with
> non-synchronous DAX device?

That's a good point. DM-writecache needs to be cognizant of things
like virtio-pmem that violate the rule that persisent memory writes
can be flushed by CPU functions rather than calling back into the
driver. It seems we need to always make the flush case a dax_operation
callback to account for this.


Re: [PATCH v2 3/5] libnvdimm/nvdimm/flush: Allow architecture to override the flush barrier

2020-05-19 Thread Dan Williams
On Tue, May 19, 2020 at 6:53 AM Aneesh Kumar K.V
 wrote:
>
> Dan Williams  writes:
>
> > On Mon, May 18, 2020 at 10:30 PM Aneesh Kumar K.V
> >  wrote:
>
> ...
>
> >> Applications using new instructions will behave as expected when running
> >> on P8 and P9. Only future hardware will differentiate between 'dcbf' and
> >> 'dcbfps'
> >
> > Right, this is the problem. Applications using new instructions behave
> > as expected, the kernel has been shipping of_pmem and papr_scm for
> > several cycles now, you're saying that the DAX applications written
> > against those platforms are going to be broken on P8 and P9?
>
> The expecation is that both kernel and userspace would get upgraded to
> use the new instruction before actual persistent memory devices are
> made available.
>
> >
> >> > I'm thinking the kernel
> >> > should go as far as to disable DAX operation by default on new
> >> > hardware until userspace asserts that it is prepared to switch to the
> >> > new implementation. Is there any other way to ensure the forward
> >> > compatibility of deployed ppc64 DAX applications?
> >>
> >> AFAIU there is no released persistent memory hardware on ppc64 platform
> >> and we need to make sure before applications get enabled to use these
> >> persistent memory devices, they should switch to use the new
> >> instruction?
> >
> > Right, I want the kernel to offer some level of safety here because
> > everything you are describing sounds like a flag day conversion. Am I
> > misreading? Is there some other gate that prevents existing users of
> > of_pmem and papr_scm from having their expectations violated when
> > running on P8 / P9 hardware? Maybe there's tighter ecosystem control
> > that I'm just not familiar with, I'm only going off the fact that the
> > kernel has shipped a non-zero number of NVDIMM drivers that build with
> > ARCH=ppc64 for several cycles.
>
> If we are looking at adding changes to kernel that will prevent a kernel
> from running on newer hardware in a specific case, we could as well take
> the changes to get the kernel use the newer instructions right?

Oh, no, I'm not talking about stopping the kernel from running. I'm
simply recommending that support for MAP_SYNC mappings (userspace
managed flushing) be disabled by default on PPC with either a
compile-time or run-time default to assert that userspace has been
audited for legacy applications or that the platform owner is
otherwise willing to take the risk.

> But I agree with your concern that if we have older kernel/applications
> that continue to use `dcbf` on future hardware we will end up
> having issues w.r.t powerfail consistency. The plan is what you outlined
> above as tighter ecosystem control. Considering we don't have a pmem
> device generally available, we get both kernel and userspace upgraded
> to use these new instructions before such a device is made available.

Ok, I think a compile time kernel option with a runtime override
satisfies my concern. Does that work for you?


Re: [PATCH v2 3/5] libnvdimm/nvdimm/flush: Allow architecture to override the flush barrier

2020-05-19 Thread Dan Williams
On Mon, May 18, 2020 at 10:30 PM Aneesh Kumar K.V
 wrote:
>
>
> Hi Dan,
>
> Apologies for the delay in response. I was waiting for feedback from
> hardware team before responding to this email.
>
>
> Dan Williams  writes:
>
> > On Tue, May 12, 2020 at 8:47 PM Aneesh Kumar K.V
> >  wrote:
> >>
> >> Architectures like ppc64 provide persistent memory specific barriers
> >> that will ensure that all stores for which the modifications are
> >> written to persistent storage by preceding dcbfps and dcbstps
> >> instructions have updated persistent storage before any data
> >> access or data transfer caused by subsequent instructions is initiated.
> >> This is in addition to the ordering done by wmb()
> >>
> >> Update nvdimm core such that architecture can use barriers other than
> >> wmb to ensure all previous writes are architecturally visible for
> >> the platform buffer flush.
> >
> > This seems like an exceedingly bad idea, maybe I'm missing something.
> > This implies that the deployed base of DAX applications using the old
> > instruction sequence are going to regress on new hardware that
> > requires the new instructions to be deployed.
>
>
> pmdk support for ppc64 is still work in progress and there is pull
> request to switch pmdk to use new instruction.

Ok.

>
> https://github.com/tuliom/pmdk/commit/fix-flush
>
> All userspace applications will be switched to use the new
> instructions. The new instructions are designed such that when running on P8
> and P9 they behave as 'dcbf' and 'hwsync'.

Sure, makes sense.

> Applications using new instructions will behave as expected when running
> on P8 and P9. Only future hardware will differentiate between 'dcbf' and
> 'dcbfps'

Right, this is the problem. Applications using new instructions behave
as expected, the kernel has been shipping of_pmem and papr_scm for
several cycles now, you're saying that the DAX applications written
against those platforms are going to be broken on P8 and P9?

> > I'm thinking the kernel
> > should go as far as to disable DAX operation by default on new
> > hardware until userspace asserts that it is prepared to switch to the
> > new implementation. Is there any other way to ensure the forward
> > compatibility of deployed ppc64 DAX applications?
>
> AFAIU there is no released persistent memory hardware on ppc64 platform
> and we need to make sure before applications get enabled to use these
> persistent memory devices, they should switch to use the new
> instruction?

Right, I want the kernel to offer some level of safety here because
everything you are describing sounds like a flag day conversion. Am I
misreading? Is there some other gate that prevents existing users of
of_pmem and papr_scm from having their expectations violated when
running on P8 / P9 hardware? Maybe there's tighter ecosystem control
that I'm just not familiar with, I'm only going off the fact that the
kernel has shipped a non-zero number of NVDIMM drivers that build with
ARCH=ppc64 for several cycles.


Re: remove a few uses of ->queuedata

2020-05-13 Thread Dan Williams
On Tue, May 12, 2020 at 1:08 AM Christoph Hellwig  wrote:
>
> On Sat, May 09, 2020 at 08:07:14AM -0700, Dan Williams wrote:
> > > which are all used in the I/O submission path (generic_make_request /
> > > generic_make_request_checks).  This is mostly a prep cleanup patch to
> > > also remove the pointless queue argument from ->make_request - then
> > > ->queue is an extra dereference and extra churn.
> >
> > Ah ok. If the changelogs had been filled in with something like "In
> > preparation for removing @q from make_request_fn, stop using
> > ->queuedata", I probably wouldn't have looked twice.
> >
> > For the nvdimm/ driver updates you can add:
> >
> > Reviewed-by: Dan Williams 
> >
> > ...or just let me know if you want me to pick those up through the nvdimm 
> > tree.
>
> I'd love you to pick them up through the nvdimm tree.  Do you want
> to fix up the commit message yourself?

Will do, thanks.


Re: [PATCH v2 3/5] libnvdimm/nvdimm/flush: Allow architecture to override the flush barrier

2020-05-13 Thread Dan Williams
On Tue, May 12, 2020 at 8:47 PM Aneesh Kumar K.V
 wrote:
>
> Architectures like ppc64 provide persistent memory specific barriers
> that will ensure that all stores for which the modifications are
> written to persistent storage by preceding dcbfps and dcbstps
> instructions have updated persistent storage before any data
> access or data transfer caused by subsequent instructions is initiated.
> This is in addition to the ordering done by wmb()
>
> Update nvdimm core such that architecture can use barriers other than
> wmb to ensure all previous writes are architecturally visible for
> the platform buffer flush.

This seems like an exceedingly bad idea, maybe I'm missing something.
This implies that the deployed base of DAX applications using the old
instruction sequence are going to regress on new hardware that
requires the new instructions to be deployed. I'm thinking the kernel
should go as far as to disable DAX operation by default on new
hardware until userspace asserts that it is prepared to switch to the
new implementation. Is there any other way to ensure the forward
compatibility of deployed ppc64 DAX applications?

>
> Signed-off-by: Aneesh Kumar K.V 
> ---
>  drivers/nvdimm/region_devs.c | 8 
>  include/linux/libnvdimm.h| 4 
>  2 files changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/nvdimm/region_devs.c b/drivers/nvdimm/region_devs.c
> index ccbb5b43b8b2..88ea34a9c7fd 100644
> --- a/drivers/nvdimm/region_devs.c
> +++ b/drivers/nvdimm/region_devs.c
> @@ -1216,13 +1216,13 @@ int generic_nvdimm_flush(struct nd_region *nd_region)
> idx = this_cpu_add_return(flush_idx, hash_32(current->pid + idx, 8));
>
> /*
> -* The first wmb() is needed to 'sfence' all previous writes
> -* such that they are architecturally visible for the platform
> -* buffer flush.  Note that we've already arranged for pmem
> +* The first arch_pmem_flush_barrier() is needed to 'sfence' all
> +* previous writes such that they are architecturally visible for
> +* the platform buffer flush. Note that we've already arranged for 
> pmem
>  * writes to avoid the cache via memcpy_flushcache().  The final
>  * wmb() ensures ordering for the NVDIMM flush write.
>  */
> -   wmb();
> +   arch_pmem_flush_barrier();
> for (i = 0; i < nd_region->ndr_mappings; i++)
> if (ndrd_get_flush_wpq(ndrd, i, 0))
> writeq(1, ndrd_get_flush_wpq(ndrd, i, idx));
> diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h
> index 18da4059be09..66f6c65bd789 100644
> --- a/include/linux/libnvdimm.h
> +++ b/include/linux/libnvdimm.h
> @@ -286,4 +286,8 @@ static inline void arch_invalidate_pmem(void *addr, 
> size_t size)
>  }
>  #endif
>
> +#ifndef arch_pmem_flush_barrier
> +#define arch_pmem_flush_barrier() wmb()
> +#endif
> +
>  #endif /* __LIBNVDIMM_H__ */
> --
> 2.26.2
>


Re: remove a few uses of ->queuedata

2020-05-09 Thread Dan Williams
On Sat, May 9, 2020 at 1:24 AM Christoph Hellwig  wrote:
>
> On Fri, May 08, 2020 at 11:04:45AM -0700, Dan Williams wrote:
> > On Fri, May 8, 2020 at 9:16 AM Christoph Hellwig  wrote:
> > >
> > > Hi all,
> > >
> > > various bio based drivers use queue->queuedata despite already having
> > > set up disk->private_data, which can be used just as easily.  This
> > > series cleans them up to only use a single private data pointer.
> >
> > ...but isn't the queue pretty much guaranteed to be cache hot and the
> > gendisk cache cold? I'm not immediately seeing what else needs the
> > gendisk in the I/O path. Is there another motivation I'm missing?
>
> ->private_data is right next to the ->queue pointer, pat0 and part_tbl
> which are all used in the I/O submission path (generic_make_request /
> generic_make_request_checks).  This is mostly a prep cleanup patch to
> also remove the pointless queue argument from ->make_request - then
> ->queue is an extra dereference and extra churn.

Ah ok. If the changelogs had been filled in with something like "In
preparation for removing @q from make_request_fn, stop using
->queuedata", I probably wouldn't have looked twice.

For the nvdimm/ driver updates you can add:

Reviewed-by: Dan Williams 

...or just let me know if you want me to pick those up through the nvdimm tree.


Re: remove a few uses of ->queuedata

2020-05-08 Thread Dan Williams
On Fri, May 8, 2020 at 9:16 AM Christoph Hellwig  wrote:
>
> Hi all,
>
> various bio based drivers use queue->queuedata despite already having
> set up disk->private_data, which can be used just as easily.  This
> series cleans them up to only use a single private data pointer.

...but isn't the queue pretty much guaranteed to be cache hot and the
gendisk cache cold? I'm not immediately seeing what else needs the
gendisk in the I/O path. Is there another motivation I'm missing?


Re: [PATCH v2 2/3] mm/memory_hotplug: Introduce MHP_NO_FIRMWARE_MEMMAP

2020-05-02 Thread Dan Williams
On Sat, May 2, 2020 at 2:27 AM David Hildenbrand  wrote:
>
> >> Now, let's clarify what I want regarding virtio-mem:
> >>
> >> 1. kexec should not add virtio-mem memory to the initial firmware
> >>memmap. The driver has to be in charge as discussed.
> >> 2. kexec should not place kexec images onto virtio-mem memory. That
> >>would end badly.
> >> 3. kexec should still dump virtio-mem memory via kdump.
> >
> > Ok, but then seems to say to me that dax/kmem is a different type of
> > (driver managed) than virtio-mem and it's confusing to try to apply
> > the same meaning. Why not just call your type for the distinct type it
> > is "System RAM (virtio-mem)" and let any other driver managed memory
> > follow the same "System RAM ($driver)" format if it wants?
>
> I had the same idea but discarded it because it seemed to uglify the
> add_memory() interface (passing yet another parameter only relevant for
> driver managed memory). Maybe we really want a new one, because I like
> that idea:
>
> /*
>  * Add special, driver-managed memory to the system as system ram.
>  * The resource_name is expected to have the name format "System RAM
>  * ($DRIVER)", so user space (esp. kexec-tools)" can special-case it.
>  *
>  * For this memory, no entries in /sys/firmware/memmap are created,
>  * as this memory won't be part of the raw firmware-provided memory map
>  * e.g., after a reboot. Also, the created memory resource is flagged
>  * with IORESOURCE_MEM_DRIVER_MANAGED, so in-kernel users can special-
>  * case this memory (e.g., not place kexec images onto it).
>  */
> int add_memory_driver_managed(int nid, u64 start, u64 size,
>   const char *resource_name);
>
>
> If we'd ever have to special case it even more in the kernel, we could
> allow to specify further resource flags. While passing the driver name
> instead of the resource_name would be an option, this way we don't have
> to hand craft new resource strings for added memory resources.
>
> Thoughts?

Looks useful to me and simplifies walking /proc/iomem. I personally
like the safety of the string just being the $driver component of the
name, but I won't lose sleep if the interface stays freeform like you
propose.


Re: [PATCH v2 2/3] mm/memory_hotplug: Introduce MHP_NO_FIRMWARE_MEMMAP

2020-05-01 Thread Dan Williams
On Fri, May 1, 2020 at 2:11 PM David Hildenbrand  wrote:
>
> On 01.05.20 22:12, Dan Williams wrote:
[..]
> >>> Consider the case of EFI Special Purpose (SP) Memory that is
> >>> marked EFI Conventional Memory with the SP attribute. In that case the
> >>> firmware memory map marked it as conventional RAM, but the kernel
> >>> optionally marks it as System RAM vs Soft Reserved. The 2008 patch
> >>> simply does not consider that case. I'm not sure strict textualism
> >>> works for coding decisions.
> >>
> >> I am no expert on that matter (esp EFI). But looking at the users of
> >> firmware_map_add_early(), the single user is in arch/x86/kernel/e820.c
> >> . So the single source of /sys/firmware/memmap is (besides hotplug) e820.
> >>
> >> "'e820_table_firmware': the original firmware version passed to us by
> >> the bootloader - not modified by the kernel. ... inform the user about
> >> the firmware's notion of memory layout via /sys/firmware/memmap"
> >> (arch/x86/kernel/e820.c)
> >>
> >> How is the EFI Special Purpose (SP) Memory represented in e820?
> >> /sys/firmware/memmap is really simple: just dump in e820. No policies IIUC.
> >
> > e820 now has a Soft Reserved translation for this which means "try to
> > reserve, but treat as System RAM is ok too". It seems generically
> > useful to me that the toggle for determining whether Soft Reserved or
> > System RAM shows up /sys/firmware/memmap is a determination that
> > policy can make. The kernel need not preemptively block it.
>
> So, I think I have to clarify something here. We do have two ways to kexec
>
> 1. kexec_load(): User space (kexec-tools) crafts the memmap (e.g., using
> /sys/firmware/memmap on x86-64) and selects memory where to place the
> kexec images (e.g., using /proc/iomem)
>
> 2. kexec_file_load(): The kernel reuses the (basically) raw firmware
> memmap and selects memory where to place kexec images.
>
> We are talking about changing 1, to behave like 2 in regards to
> dax/kmem. 2. does currently not add any hotplugged memory to the
> fixed-up e820, and it should be fixed regarding hotplugged DIMMs that
> would appear in e820 after a reboot.
>
> Now, all these policy discussions are nice and fun, but I don't really
> see a good reason to (ab)use /sys/firmware/memmap for that (e.g., parent
> properties). If you want to be able to make this configurable, then
> e.g., add a way to configure this in the kernel (for example along with
> kmem) to make 1. and 2. behave the same way. Otherwise, you really only
> can change 1.

That's clearer.

>
>
> Now, let's clarify what I want regarding virtio-mem:
>
> 1. kexec should not add virtio-mem memory to the initial firmware
>memmap. The driver has to be in charge as discussed.
> 2. kexec should not place kexec images onto virtio-mem memory. That
>would end badly.
> 3. kexec should still dump virtio-mem memory via kdump.

Ok, but then seems to say to me that dax/kmem is a different type of
(driver managed) than virtio-mem and it's confusing to try to apply
the same meaning. Why not just call your type for the distinct type it
is "System RAM (virtio-mem)" and let any other driver managed memory
follow the same "System RAM ($driver)" format if it wants?


Re: [PATCH v2 2/3] mm/memory_hotplug: Introduce MHP_NO_FIRMWARE_MEMMAP

2020-05-01 Thread Dan Williams
On Fri, May 1, 2020 at 12:18 PM David Hildenbrand  wrote:
>
> On 01.05.20 20:43, Dan Williams wrote:
> > On Fri, May 1, 2020 at 11:14 AM David Hildenbrand  wrote:
> >>
> >> On 01.05.20 20:03, Dan Williams wrote:
> >>> On Fri, May 1, 2020 at 10:51 AM David Hildenbrand  
> >>> wrote:
> >>>>
> >>>> On 01.05.20 19:45, David Hildenbrand wrote:
> >>>>> On 01.05.20 19:39, Dan Williams wrote:
> >>>>>> On Fri, May 1, 2020 at 10:21 AM David Hildenbrand  
> >>>>>> wrote:
> >>>>>>>
> >>>>>>> On 01.05.20 18:56, Dan Williams wrote:
> >>>>>>>> On Fri, May 1, 2020 at 2:34 AM David Hildenbrand  
> >>>>>>>> wrote:
> >>>>>>>>>
> >>>>>>>>> On 01.05.20 00:24, Andrew Morton wrote:
> >>>>>>>>>> On Thu, 30 Apr 2020 20:43:39 +0200 David Hildenbrand 
> >>>>>>>>>>  wrote:
> >>>>>>>>>>
> >>>>>>>>>>>>
> >>>>>>>>>>>> Why does the firmware map support hotplug entries?
> >>>>>>>>>>>
> >>>>>>>>>>> I assume:
> >>>>>>>>>>>
> >>>>>>>>>>> The firmware memmap was added primarily for x86-64 kexec (and 
> >>>>>>>>>>> still, is
> >>>>>>>>>>> mostly used on x86-64 only IIRC). There, we had ACPI hotplug. 
> >>>>>>>>>>> When DIMMs
> >>>>>>>>>>> get hotplugged on real HW, they get added to e820. Same applies to
> >>>>>>>>>>> memory added via HyperV balloon (unless memory is unplugged via
> >>>>>>>>>>> ballooning and you reboot ... the the e820 is changed as well). I 
> >>>>>>>>>>> assume
> >>>>>>>>>>> we wanted to be able to reflect that, to make kexec look like a 
> >>>>>>>>>>> real reboot.
> >>>>>>>>>>>
> >>>>>>>>>>> This worked for a while. Then came dax/kmem. Now comes virtio-mem.
> >>>>>>>>>>>
> >>>>>>>>>>>
> >>>>>>>>>>> But I assume only Andrew can enlighten us.
> >>>>>>>>>>>
> >>>>>>>>>>> @Andrew, any guidance here? Should we really add all memory to the
> >>>>>>>>>>> firmware memmap, even if this contradicts with the existing
> >>>>>>>>>>> documentation? (especially, if the actual firmware memmap will 
> >>>>>>>>>>> *not*
> >>>>>>>>>>> contain that memory after a reboot)
> >>>>>>>>>>
> >>>>>>>>>> For some reason that patch is misattributed - it was authored by
> >>>>>>>>>> Shaohui Zheng , who hasn't been heard 
> >>>>>>>>>> from in
> >>>>>>>>>> a decade.  I looked through the email discussion from that time 
> >>>>>>>>>> and I'm
> >>>>>>>>>> not seeing anything useful.  But I wasn't able to locate Dave 
> >>>>>>>>>> Hansen's
> >>>>>>>>>> review comments.
> >>>>>>>>>
> >>>>>>>>> Okay, thanks for checking. I think the documentation from 2008 is 
> >>>>>>>>> pretty
> >>>>>>>>> clear what has to be done here. I will add some of these details to 
> >>>>>>>>> the
> >>>>>>>>> patch description.
> >>>>>>>>>
> >>>>>>>>> Also, now that I know that esp. kexec-tools already don't consider
> >>>>>>>>> dax/kmem memory properly (memory will not get dumped via kdump) and
> >>>>>>>>> won't really suffer from a name change in /proc/iomem, I will go 
> >>>>>>>>> back to
> >>>>>>>>> the MHP_DRIVER_MANAGED approach and
> >>>>>>>>> 1. Don't create firmware memma

Re: [PATCH v2 2/3] mm/memory_hotplug: Introduce MHP_NO_FIRMWARE_MEMMAP

2020-05-01 Thread Dan Williams
On Fri, May 1, 2020 at 11:14 AM David Hildenbrand  wrote:
>
> On 01.05.20 20:03, Dan Williams wrote:
> > On Fri, May 1, 2020 at 10:51 AM David Hildenbrand  wrote:
> >>
> >> On 01.05.20 19:45, David Hildenbrand wrote:
> >>> On 01.05.20 19:39, Dan Williams wrote:
> >>>> On Fri, May 1, 2020 at 10:21 AM David Hildenbrand  
> >>>> wrote:
> >>>>>
> >>>>> On 01.05.20 18:56, Dan Williams wrote:
> >>>>>> On Fri, May 1, 2020 at 2:34 AM David Hildenbrand  
> >>>>>> wrote:
> >>>>>>>
> >>>>>>> On 01.05.20 00:24, Andrew Morton wrote:
> >>>>>>>> On Thu, 30 Apr 2020 20:43:39 +0200 David Hildenbrand 
> >>>>>>>>  wrote:
> >>>>>>>>
> >>>>>>>>>>
> >>>>>>>>>> Why does the firmware map support hotplug entries?
> >>>>>>>>>
> >>>>>>>>> I assume:
> >>>>>>>>>
> >>>>>>>>> The firmware memmap was added primarily for x86-64 kexec (and 
> >>>>>>>>> still, is
> >>>>>>>>> mostly used on x86-64 only IIRC). There, we had ACPI hotplug. When 
> >>>>>>>>> DIMMs
> >>>>>>>>> get hotplugged on real HW, they get added to e820. Same applies to
> >>>>>>>>> memory added via HyperV balloon (unless memory is unplugged via
> >>>>>>>>> ballooning and you reboot ... the the e820 is changed as well). I 
> >>>>>>>>> assume
> >>>>>>>>> we wanted to be able to reflect that, to make kexec look like a 
> >>>>>>>>> real reboot.
> >>>>>>>>>
> >>>>>>>>> This worked for a while. Then came dax/kmem. Now comes virtio-mem.
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> But I assume only Andrew can enlighten us.
> >>>>>>>>>
> >>>>>>>>> @Andrew, any guidance here? Should we really add all memory to the
> >>>>>>>>> firmware memmap, even if this contradicts with the existing
> >>>>>>>>> documentation? (especially, if the actual firmware memmap will *not*
> >>>>>>>>> contain that memory after a reboot)
> >>>>>>>>
> >>>>>>>> For some reason that patch is misattributed - it was authored by
> >>>>>>>> Shaohui Zheng , who hasn't been heard from 
> >>>>>>>> in
> >>>>>>>> a decade.  I looked through the email discussion from that time and 
> >>>>>>>> I'm
> >>>>>>>> not seeing anything useful.  But I wasn't able to locate Dave 
> >>>>>>>> Hansen's
> >>>>>>>> review comments.
> >>>>>>>
> >>>>>>> Okay, thanks for checking. I think the documentation from 2008 is 
> >>>>>>> pretty
> >>>>>>> clear what has to be done here. I will add some of these details to 
> >>>>>>> the
> >>>>>>> patch description.
> >>>>>>>
> >>>>>>> Also, now that I know that esp. kexec-tools already don't consider
> >>>>>>> dax/kmem memory properly (memory will not get dumped via kdump) and
> >>>>>>> won't really suffer from a name change in /proc/iomem, I will go back 
> >>>>>>> to
> >>>>>>> the MHP_DRIVER_MANAGED approach and
> >>>>>>> 1. Don't create firmware memmap entries
> >>>>>>> 2. Name the resource "System RAM (driver managed)"
> >>>>>>> 3. Flag the resource via something like IORESOURCE_MEM_DRIVER_MANAGED.
> >>>>>>>
> >>>>>>> This way, kernel users and user space can figure out that this memory
> >>>>>>> has different semantics and handle it accordingly - I think that was
> >>>>>>> what Eric was asking for.
> >>>>>>>
> >>>>>>> Of course, open for suggestions.
> >>>>>>
> >>>>>> I'm still more of a fan of this bei

Re: [PATCH v2 2/3] mm/memory_hotplug: Introduce MHP_NO_FIRMWARE_MEMMAP

2020-05-01 Thread Dan Williams
On Fri, May 1, 2020 at 10:51 AM David Hildenbrand  wrote:
>
> On 01.05.20 19:45, David Hildenbrand wrote:
> > On 01.05.20 19:39, Dan Williams wrote:
> >> On Fri, May 1, 2020 at 10:21 AM David Hildenbrand  wrote:
> >>>
> >>> On 01.05.20 18:56, Dan Williams wrote:
> >>>> On Fri, May 1, 2020 at 2:34 AM David Hildenbrand  
> >>>> wrote:
> >>>>>
> >>>>> On 01.05.20 00:24, Andrew Morton wrote:
> >>>>>> On Thu, 30 Apr 2020 20:43:39 +0200 David Hildenbrand 
> >>>>>>  wrote:
> >>>>>>
> >>>>>>>>
> >>>>>>>> Why does the firmware map support hotplug entries?
> >>>>>>>
> >>>>>>> I assume:
> >>>>>>>
> >>>>>>> The firmware memmap was added primarily for x86-64 kexec (and still, 
> >>>>>>> is
> >>>>>>> mostly used on x86-64 only IIRC). There, we had ACPI hotplug. When 
> >>>>>>> DIMMs
> >>>>>>> get hotplugged on real HW, they get added to e820. Same applies to
> >>>>>>> memory added via HyperV balloon (unless memory is unplugged via
> >>>>>>> ballooning and you reboot ... the the e820 is changed as well). I 
> >>>>>>> assume
> >>>>>>> we wanted to be able to reflect that, to make kexec look like a real 
> >>>>>>> reboot.
> >>>>>>>
> >>>>>>> This worked for a while. Then came dax/kmem. Now comes virtio-mem.
> >>>>>>>
> >>>>>>>
> >>>>>>> But I assume only Andrew can enlighten us.
> >>>>>>>
> >>>>>>> @Andrew, any guidance here? Should we really add all memory to the
> >>>>>>> firmware memmap, even if this contradicts with the existing
> >>>>>>> documentation? (especially, if the actual firmware memmap will *not*
> >>>>>>> contain that memory after a reboot)
> >>>>>>
> >>>>>> For some reason that patch is misattributed - it was authored by
> >>>>>> Shaohui Zheng , who hasn't been heard from in
> >>>>>> a decade.  I looked through the email discussion from that time and I'm
> >>>>>> not seeing anything useful.  But I wasn't able to locate Dave Hansen's
> >>>>>> review comments.
> >>>>>
> >>>>> Okay, thanks for checking. I think the documentation from 2008 is pretty
> >>>>> clear what has to be done here. I will add some of these details to the
> >>>>> patch description.
> >>>>>
> >>>>> Also, now that I know that esp. kexec-tools already don't consider
> >>>>> dax/kmem memory properly (memory will not get dumped via kdump) and
> >>>>> won't really suffer from a name change in /proc/iomem, I will go back to
> >>>>> the MHP_DRIVER_MANAGED approach and
> >>>>> 1. Don't create firmware memmap entries
> >>>>> 2. Name the resource "System RAM (driver managed)"
> >>>>> 3. Flag the resource via something like IORESOURCE_MEM_DRIVER_MANAGED.
> >>>>>
> >>>>> This way, kernel users and user space can figure out that this memory
> >>>>> has different semantics and handle it accordingly - I think that was
> >>>>> what Eric was asking for.
> >>>>>
> >>>>> Of course, open for suggestions.
> >>>>
> >>>> I'm still more of a fan of this being communicated by "System RAM"
> >>>
> >>> I was mentioning somewhere in this thread that "System RAM" inside a
> >>> hierarchy (like dax/kmem) will already be basically ignored by
> >>> kexec-tools. So, placing it inside a hierarchy already makes it look
> >>> special already.
> >>>
> >>> But after all, as we have to change kexec-tools either way, we can
> >>> directly go ahead and flag it properly as special (in case there will
> >>> ever be other cases where we could no longer distinguish it).
> >>>
> >>>> being parented especially because that tells you something about how
> >>>> the memory is driver-managed and which mechanism might be in play.
> >>>
> >>> The 

Re: [PATCH v2 2/3] mm/memory_hotplug: Introduce MHP_NO_FIRMWARE_MEMMAP

2020-05-01 Thread Dan Williams
On Fri, May 1, 2020 at 10:21 AM David Hildenbrand  wrote:
>
> On 01.05.20 18:56, Dan Williams wrote:
> > On Fri, May 1, 2020 at 2:34 AM David Hildenbrand  wrote:
> >>
> >> On 01.05.20 00:24, Andrew Morton wrote:
> >>> On Thu, 30 Apr 2020 20:43:39 +0200 David Hildenbrand  
> >>> wrote:
> >>>
> >>>>>
> >>>>> Why does the firmware map support hotplug entries?
> >>>>
> >>>> I assume:
> >>>>
> >>>> The firmware memmap was added primarily for x86-64 kexec (and still, is
> >>>> mostly used on x86-64 only IIRC). There, we had ACPI hotplug. When DIMMs
> >>>> get hotplugged on real HW, they get added to e820. Same applies to
> >>>> memory added via HyperV balloon (unless memory is unplugged via
> >>>> ballooning and you reboot ... the the e820 is changed as well). I assume
> >>>> we wanted to be able to reflect that, to make kexec look like a real 
> >>>> reboot.
> >>>>
> >>>> This worked for a while. Then came dax/kmem. Now comes virtio-mem.
> >>>>
> >>>>
> >>>> But I assume only Andrew can enlighten us.
> >>>>
> >>>> @Andrew, any guidance here? Should we really add all memory to the
> >>>> firmware memmap, even if this contradicts with the existing
> >>>> documentation? (especially, if the actual firmware memmap will *not*
> >>>> contain that memory after a reboot)
> >>>
> >>> For some reason that patch is misattributed - it was authored by
> >>> Shaohui Zheng , who hasn't been heard from in
> >>> a decade.  I looked through the email discussion from that time and I'm
> >>> not seeing anything useful.  But I wasn't able to locate Dave Hansen's
> >>> review comments.
> >>
> >> Okay, thanks for checking. I think the documentation from 2008 is pretty
> >> clear what has to be done here. I will add some of these details to the
> >> patch description.
> >>
> >> Also, now that I know that esp. kexec-tools already don't consider
> >> dax/kmem memory properly (memory will not get dumped via kdump) and
> >> won't really suffer from a name change in /proc/iomem, I will go back to
> >> the MHP_DRIVER_MANAGED approach and
> >> 1. Don't create firmware memmap entries
> >> 2. Name the resource "System RAM (driver managed)"
> >> 3. Flag the resource via something like IORESOURCE_MEM_DRIVER_MANAGED.
> >>
> >> This way, kernel users and user space can figure out that this memory
> >> has different semantics and handle it accordingly - I think that was
> >> what Eric was asking for.
> >>
> >> Of course, open for suggestions.
> >
> > I'm still more of a fan of this being communicated by "System RAM"
>
> I was mentioning somewhere in this thread that "System RAM" inside a
> hierarchy (like dax/kmem) will already be basically ignored by
> kexec-tools. So, placing it inside a hierarchy already makes it look
> special already.
>
> But after all, as we have to change kexec-tools either way, we can
> directly go ahead and flag it properly as special (in case there will
> ever be other cases where we could no longer distinguish it).
>
> > being parented especially because that tells you something about how
> > the memory is driver-managed and which mechanism might be in play.
>
> The could be communicated to some degree via the resource hierarchy.
>
> E.g.,
>
> [root@localhost ~]# cat /proc/iomem
> ...
> 14000-33fff : Persistent Memory
>   14000-1481f : namespace0.0
>   15000-33fff : dax0.0
> 15000-33fff : System RAM (driver managed)
>
> vs.
>
>:/# cat /proc/iomem
> [...]
> 14000-333ff : virtio-mem (virtio0)
>   14000-147ff : System RAM (driver managed)
>   14800-14fff : System RAM (driver managed)
>   15000-157ff : System RAM (driver managed)
>
> Good enough for my taste.
>
> > What about adding an optional /sys/firmware/memmap/X/parent attribute.
>
> I really don't want any firmware memmap entries for something that is
> not part of the firmware provided memmap. In addition,
> /sys/firmware/memmap/ is still a fairly x86_64 specific thing. Only mips
> and two arm configs enable it at all.
>
> So, IMHO, /sys/firmware/memmap/ is definitely not the way to go.

I think that's a policy decision and policy decisions do not belong in
the kernel. Give the tooling the opportunity to decide whether System
RAM stays that way over a kexec. The parenthetical reference otherwise
looks out of place to me in the /proc/iomem output. What makes it
"driver managed" is how the kernel handles it, not how the kernel
names it.


Re: [PATCH v2 2/3] mm/memory_hotplug: Introduce MHP_NO_FIRMWARE_MEMMAP

2020-05-01 Thread Dan Williams
On Fri, May 1, 2020 at 2:34 AM David Hildenbrand  wrote:
>
> On 01.05.20 00:24, Andrew Morton wrote:
> > On Thu, 30 Apr 2020 20:43:39 +0200 David Hildenbrand  
> > wrote:
> >
> >>>
> >>> Why does the firmware map support hotplug entries?
> >>
> >> I assume:
> >>
> >> The firmware memmap was added primarily for x86-64 kexec (and still, is
> >> mostly used on x86-64 only IIRC). There, we had ACPI hotplug. When DIMMs
> >> get hotplugged on real HW, they get added to e820. Same applies to
> >> memory added via HyperV balloon (unless memory is unplugged via
> >> ballooning and you reboot ... the the e820 is changed as well). I assume
> >> we wanted to be able to reflect that, to make kexec look like a real 
> >> reboot.
> >>
> >> This worked for a while. Then came dax/kmem. Now comes virtio-mem.
> >>
> >>
> >> But I assume only Andrew can enlighten us.
> >>
> >> @Andrew, any guidance here? Should we really add all memory to the
> >> firmware memmap, even if this contradicts with the existing
> >> documentation? (especially, if the actual firmware memmap will *not*
> >> contain that memory after a reboot)
> >
> > For some reason that patch is misattributed - it was authored by
> > Shaohui Zheng , who hasn't been heard from in
> > a decade.  I looked through the email discussion from that time and I'm
> > not seeing anything useful.  But I wasn't able to locate Dave Hansen's
> > review comments.
>
> Okay, thanks for checking. I think the documentation from 2008 is pretty
> clear what has to be done here. I will add some of these details to the
> patch description.
>
> Also, now that I know that esp. kexec-tools already don't consider
> dax/kmem memory properly (memory will not get dumped via kdump) and
> won't really suffer from a name change in /proc/iomem, I will go back to
> the MHP_DRIVER_MANAGED approach and
> 1. Don't create firmware memmap entries
> 2. Name the resource "System RAM (driver managed)"
> 3. Flag the resource via something like IORESOURCE_MEM_DRIVER_MANAGED.
>
> This way, kernel users and user space can figure out that this memory
> has different semantics and handle it accordingly - I think that was
> what Eric was asking for.
>
> Of course, open for suggestions.

I'm still more of a fan of this being communicated by "System RAM"
being parented especially because that tells you something about how
the memory is driver-managed and which mechanism might be in play.
What about adding an optional /sys/firmware/memmap/X/parent attribute.
This lets tooling check if it cares via that interface and lets it
lookup the related infrastructure to interact with if it would do
something different for virtio-mem vs dax/kmem?


Re: [PATCH v2 2/3] mm/memory_hotplug: Introduce MHP_NO_FIRMWARE_MEMMAP

2020-04-30 Thread Dan Williams
On Thu, Apr 30, 2020 at 11:44 AM David Hildenbrand  wrote:
>
>  >>> If the class of memory is different then please by all means let's mark
> >>> it differently in struct resource so everyone knows it is different.
> >>> But that difference needs to be more than hotplug.
> >>>
> >>> That difference needs to be the hypervisor loaned us memory and might
> >>> take it back at any time, or this memory is persistent and so it has
> >>> these different characteristics so don't use it as ordinary ram.
> >>
> >> Yes, and I think kmem took an excellent approach of explicitly putting
> >> that "System RAM" into a resource hierarchy. That "System RAM" won't
> >> show up as a root node under /proc/iomem (see patch #3), which already
> >> results in kexec-tools to treat it in a special way. I am thinking about
> >> doing the same for virtio-mem.
> >
> > Reading this and your patch cover letters again my concern is that
> > the justification seems to be letting the tail wag the dog.
> >
> > You want kexec-tools to behave in a certain way so you are changing the
> > kernel.
> >
> > Rather it should be change the kernel to clearly reflect reality and if
> > you can get away without a change to kexec-tools that is a bonus.
> >
>
> Right, because user space has to have a way to figure out what to do.
>
> But talking about the firmware memmap, indicating something via a "raw
> firmware-provided memory map", that is not actually in the "raw
> firmware-provided memory map" feels wrong to me. (below)
>
>
> >>> That information is also useful to other people looking at the system
> >>> and seeing what is going on.
> >>>
> >>> Just please don't muddle the concepts, or assume that whatever subset of
> >>> hotplug memory you are dealing with is the only subset.
> >>
> >> I can certainly rephrase the subject/description/comment, stating that
> >> this is not to be used for ordinary hotplugged DIMMs - only when the
> >> device driver is under control to decide what to do with that memory -
> >> especially when kexec'ing.
> >>
> >> (previously, I called this flag MHP_DRIVER_MANAGED, but I think
> >> MHP_NO_FIRMWARE_MEMMAP is clearer, we just need a better description)
> >>
> >> Would that make it clearer?
> >
> > I am not certain, but Andrew Morton deliberately added that
> > firmware_map_add_hotplug call.  Which means that there is a reason
> > for putting hotplugged memory in the firmware map.
> >
> > So the justification needs to take that reason into account.  The
> > justification can not be it is hotplugged therefore it should not belong
> > in the firmware memory map.  Unless you can show that
> > firmware_map_add_hotplug that was actually a bug and should be removed.
> > But as it has been that way since 2010 that seems like a long shot.
> >
> > So my question is what is right for the firmware map?
>
> We have documentation for that since 2008. Andrews patch is from 2010.
>
> Documentation/ABI/testing/sysfs-firmware-memmap
>
> It clearly talks about "raw firmware-provided memory map" and why the
> interface was introduced at all ("on most architectures that
> firmware-provided memory map is modified afterwards by the kernel itself").
>
> >
> > Why does the firmware map support hotplug entries?
>
> I assume:
>
> The firmware memmap was added primarily for x86-64 kexec (and still, is
> mostly used on x86-64 only IIRC). There, we had ACPI hotplug. When DIMMs
> get hotplugged on real HW, they get added to e820. Same applies to
> memory added via HyperV balloon (unless memory is unplugged via
> ballooning and you reboot ... the the e820 is changed as well). I assume
> we wanted to be able to reflect that, to make kexec look like a real reboot.

I can at least say that this breakdown makes sense to me. Traditional
memory hotplug results in permanent change to the raw firmware memory
map reported by the host at next reboot. These device-driver-owned
memory regions really want a hotplug policy per-kernel boot instance
and should fall back to the default reserved state at reboot (kexec or
otherwise). When I say hotplug-policy I mean whether the current
kernel wants to treat the device range as System RAM or leave it as
device-managed. The intent is that the follow-on kernel needs to
re-decide the device policy.

>
> This worked for a while. Then came dax/kmem. Now comes virtio-mem.
>


Re: [PATCH v1 2/3] mm/memory_hotplug: Introduce MHP_DRIVER_MANAGED

2020-04-30 Thread Dan Williams
On Thu, Apr 30, 2020 at 1:21 AM David Hildenbrand  wrote:
> >> Just because we decided to use some DAX memory in the current kernel as
> >> system ram, doesn't mean we should make that decision for the kexec
> >> kernel (e.g., using it as initial memory, placing kexec binaries onto
> >> it, etc.). This is also not what we would observe during a real reboot.
> >
> > Agree.
> >
> >> I can see that the "System RAM" resource will show up as child resource
> >> under the device e.g., in /proc/iomem.
> >>
> >> However, entries in /sys/firmware/memmap/ are created as "System RAM".
> >
> > True. Do you think this rename should just be limited to what type
> > /sys/firmware/memmap/ emits? I have the concern, but no proof
>
> We could split this patch into
>
> MHP_NO_FIRMWARE_MEMMAP (create firmware memmap entries)
>
> and
>
> MHP_DRIVER_MANAGED (name of the resource)
>
> See below, the latter might not be needed.
>
> > currently, that there are /proc/iomem walkers that explicitly look for
> > "System RAM", but might be thrown off by "System RAM (driver
> > managed)". I was not aware of /sys/firmware/memmap until about 5
> > minutes ago.
>
> The only two users of /proc/iomem I am aware of are kexec-tools and some
> s390x tools.
>
> kexec-tools on x86-64 uses /sys/firmware/memmap to craft the initial
> memmap, but uses /proc/iomem to
> a) Find places for kexec images
> b) Detect memory regions to dump via kdump
>
> I am not yet sure if we really need the "System RAM (driver managed)"
> part. If we can teach kexec-tools to
> a) Don't place kexec images on "System RAM" that has a parent resource
> (most likely requires kexec-tools changes)
> b) Consider for kdump "System RAM" that has a parent resource
> we might be able to avoid renaming that. (I assume that's already done)
>
> E.g., regarding virtio-mem (patch #3) I am currently also looking into
> creating a parent resource instead, like dax/kmem to avoid the rename:
>
> :/# cat /proc/iomem
> -0fff : Reserved
> [...]
> 1-13fff : System RAM
> 14000-33fff : virtio0
>   14000-147ff : System RAM
>   14800-14fff : System RAM
>   15000-157ff : System RAM
> 34000-303fff : virtio1
>   34000-347ff : System RAM
> 328000-32 : PCI Bus :00

Looks good to me if it flies with kexec-tools.


Re: [PATCH v1 2/3] mm/memory_hotplug: Introduce MHP_DRIVER_MANAGED

2020-04-30 Thread Dan Williams
On Thu, Apr 30, 2020 at 12:20 AM David Hildenbrand  wrote:
>
> On 29.04.20 18:08, David Hildenbrand wrote:
> > Some paravirtualized devices that add memory via add_memory() and
> > friends (esp. virtio-mem) don't want to create entries in
> > /sys/firmware/memmap/ - primarily to hinder kexec from adding this
> > memory to the boot memmap of the kexec kernel.
> >
> > In fact, such memory is never exposed via the firmware (e.g., e820), but
> > only via the device, so exposing this memory via /sys/firmware/memmap/ is
> > wrong:
> >  "kexec needs the raw firmware-provided memory map to setup the
> >   parameter segment of the kernel that should be booted with
> >   kexec. Also, the raw memory map is useful for debugging. For
> >   that reason, /sys/firmware/memmap is an interface that provides
> >   the raw memory map to userspace." [1]
> >
> > We want to let user space know that memory which is always detected,
> > added, and managed via a (device) driver - like memory managed by
> > virtio-mem - is special. It cannot be used for placing kexec segments
> > and the (device) driver is responsible for re-adding memory that
> > (eventually shrunk/grown/defragmented) memory after a reboot/kexec. It
> > should e.g., not be added to a fixed up firmware memmap. However, it should
> > be dumped by kdump.
> >
> > Also, such memory could behave differently than an ordinary DIMM - e.g.,
> > memory managed by virtio-mem can have holes inside added memory resource,
> > which should not be touched, especially for writing.
> >
> > Let's expose that memory as "System RAM (driver managed)" e.g., via
> > /pro/iomem.
> >
> > We don't have to worry about firmware_map_remove() on the removal path.
> > If there is no entry, it will simply return with -EINVAL.
> >
> > [1] 
> > https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-firmware-memmap
> >
> > Cc: Andrew Morton 
> > Cc: Michal Hocko 
> > Cc: Pankaj Gupta 
> > Cc: Wei Yang 
> > Cc: Baoquan He 
> > Cc: Eric Biederman 
> > Signed-off-by: David Hildenbrand 
> > ---
> >  include/linux/memory_hotplug.h |  8 
> >  mm/memory_hotplug.c| 20 
> >  2 files changed, 24 insertions(+), 4 deletions(-)
> >
> > diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
> > index bf0e3edb8688..cc538584b39e 100644
> > --- a/include/linux/memory_hotplug.h
> > +++ b/include/linux/memory_hotplug.h
> > @@ -68,6 +68,14 @@ struct mhp_params {
> >   pgprot_t pgprot;
> >  };
> >
> > +/* Flags used for add_memory() and friends. */
> > +
> > +/*
> > + * Don't create entries in /sys/firmware/memmap/ and expose memory as
> > + * "System RAM (driver managed)" in e.g., /proc/iomem
> > + */
> > +#define MHP_DRIVER_MANAGED   1
> > +
> >  /*
> >   * Zone resizing functions
> >   *
> > diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> > index ebdf6541d074..cfa0721280aa 100644
> > --- a/mm/memory_hotplug.c
> > +++ b/mm/memory_hotplug.c
> > @@ -98,11 +98,11 @@ void mem_hotplug_done(void)
> >  u64 max_mem_size = U64_MAX;
> >
> >  /* add this memory to iomem resource */
> > -static struct resource *register_memory_resource(u64 start, u64 size)
> > +static struct resource *register_memory_resource(u64 start, u64 size,
> > +  const char *resource_name)
> >  {
> >   struct resource *res;
> >   unsigned long flags =  IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
> > - char *resource_name = "System RAM";
> >
> >   /*
> >* Make sure value parsed from 'mem=' only restricts memory adding
> > @@ -1058,7 +1058,8 @@ int __ref add_memory_resource(int nid, struct 
> > resource *res,
> >   BUG_ON(ret);
> >
> >   /* create new memmap entry */
> > - firmware_map_add_hotplug(start, start + size, "System RAM");
> > + if (!(flags & MHP_DRIVER_MANAGED))
> > + firmware_map_add_hotplug(start, start + size, "System RAM");
> >
> >   /* device_online() will take the lock when calling online_pages() */
> >   mem_hotplug_done();
> > @@ -1081,10 +1082,21 @@ int __ref add_memory_resource(int nid, struct 
> > resource *res,
> >  /* requires device_hotplug_lock, see add_memory_resource() */
> >  int __ref __add_memory(int nid, u64 start, u64 size, unsigned long flags)
> >  {
> > + const char *resource_name = "System RAM";
> >   struct resource *res;
> >   int ret;
> >
> > - res = register_memory_resource(start, size);
> > + /*
> > +  * Indicate that memory managed by a driver is special. It's always
> > +  * detected and added via a driver, should not be given to the kexec
> > +  * kernel for booting when manually crafting the firmware memmap, and
> > +  * no kexec segments should be placed on it. However, kdump should
> > +  * dump this memory.
> > +  */
> > + if (flags & MHP_DRIVER_MANAGED)
> > + resource_name = "System RAM (driver managed)";
> > +
> > + res = register_memory_resource(start, size, 

Re: [PATCH v5 4/4] powerpc/papr_scm: Implement support for DSM_PAPR_SCM_HEALTH

2020-04-03 Thread Dan Williams
On Tue, Mar 31, 2020 at 7:33 AM Vaibhav Jain  wrote:
>
> This patch implements support for papr_scm command
> 'DSM_PAPR_SCM_HEALTH' that returns a newly introduced 'struct
> nd_papr_scm_dimm_health_stat' instance containing dimm health
> information back to user space in response to ND_CMD_CALL. This
> functionality is implemented in newly introduced papr_scm_get_health()
> that queries the scm-dimm health information and then copies these bitmaps
> to the package payload whose layout is defined by 'struct
> papr_scm_ndctl_health'.
>
> The patch also introduces a new member a new member 'struct
> papr_scm_priv.health' thats an instance of 'struct
> nd_papr_scm_dimm_health_stat' to cache the health information of a
> scm-dimm. As a result functions drc_pmem_query_health() and
> papr_flags_show() are updated to populate and use this new struct
> instead of two be64 integers that we earlier used.

Link to HCALL specification?

>
> Signed-off-by: Vaibhav Jain 
> ---
> Changelog:
>
> v4..v5: None
>
> v3..v4: Call the DSM_PAPR_SCM_HEALTH service function from
> papr_scm_service_dsm() instead of papr_scm_ndctl(). [Aneesh]
>
> v2..v3: Updated struct nd_papr_scm_dimm_health_stat_v1 to use '__xx'
> types as its exported to the userspace [Aneesh]
> Changed the constants DSM_PAPR_SCM_DIMM_XX indicating dimm
> health from enum to #defines [Aneesh]
>
> v1..v2: New patch in the series
> ---
>  arch/powerpc/include/uapi/asm/papr_scm_dsm.h |  40 +++
>  arch/powerpc/platforms/pseries/papr_scm.c| 109 ---
>  2 files changed, 132 insertions(+), 17 deletions(-)
>
> diff --git a/arch/powerpc/include/uapi/asm/papr_scm_dsm.h 
> b/arch/powerpc/include/uapi/asm/papr_scm_dsm.h
> index c039a49b41b4..8265125304ca 100644
> --- a/arch/powerpc/include/uapi/asm/papr_scm_dsm.h
> +++ b/arch/powerpc/include/uapi/asm/papr_scm_dsm.h
> @@ -132,6 +132,7 @@ struct nd_papr_scm_cmd_pkg {
>   */
>  enum dsm_papr_scm {
> DSM_PAPR_SCM_MIN =  0x1,
> +   DSM_PAPR_SCM_HEALTH,
> DSM_PAPR_SCM_MAX,
>  };
>
> @@ -158,4 +159,43 @@ static void *papr_scm_pcmd_to_payload(struct 
> nd_papr_scm_cmd_pkg *pcmd)
> else
> return (void *)((__u8 *) pcmd + pcmd->payload_offset);
>  }
> +
> +/* Various scm-dimm health indicators */
> +#define DSM_PAPR_SCM_DIMM_HEALTHY   0
> +#define DSM_PAPR_SCM_DIMM_UNHEALTHY 1
> +#define DSM_PAPR_SCM_DIMM_CRITICAL  2
> +#define DSM_PAPR_SCM_DIMM_FATAL 3
> +
> +/*
> + * Struct exchanged between kernel & ndctl in for PAPR_DSM_PAPR_SMART_HEALTH
> + * Various bitflags indicate the health status of the dimm.
> + *
> + * dimm_unarmed: Dimm not armed. So contents wont persist.
> + * dimm_bad_shutdown   : Previous shutdown did not persist contents.
> + * dimm_bad_restore: Contents from previous shutdown werent restored.
> + * dimm_scrubbed   : Contents of the dimm have been scrubbed.
> + * dimm_locked : Contents of the dimm cant be modified until CEC 
> reboot
> + * dimm_encrypted  : Contents of dimm are encrypted.
> + * dimm_health : Dimm health indicator.
> + */
> +struct nd_papr_scm_dimm_health_stat_v1 {
> +   __u8 dimm_unarmed;
> +   __u8 dimm_bad_shutdown;
> +   __u8 dimm_bad_restore;
> +   __u8 dimm_scrubbed;
> +   __u8 dimm_locked;
> +   __u8 dimm_encrypted;
> +   __u16 dimm_health;
> +};

Does the structure pack the same across different compilers and configurations?

> +
> +/*
> + * Typedef the current struct for dimm_health so that any application
> + * or kernel recompiled after introducing a new version automatically
> + * supports the new version.
> + */
> +#define nd_papr_scm_dimm_health_stat nd_papr_scm_dimm_health_stat_v1
> +
> +/* Current version number for the dimm health struct */
> +#define ND_PAPR_SCM_DIMM_HEALTH_VERSION 1
> +
>  #endif /* _UAPI_ASM_POWERPC_PAPR_SCM_DSM_H_ */
> diff --git a/arch/powerpc/platforms/pseries/papr_scm.c 
> b/arch/powerpc/platforms/pseries/papr_scm.c
> index e8ce96d2249e..ce94762954e0 100644
> --- a/arch/powerpc/platforms/pseries/papr_scm.c
> +++ b/arch/powerpc/platforms/pseries/papr_scm.c
> @@ -47,8 +47,7 @@ struct papr_scm_priv {
> struct mutex dimm_mutex;
>
> /* Health information for the dimm */
> -   __be64 health_bitmap;
> -   __be64 health_bitmap_valid;
> +   struct nd_papr_scm_dimm_health_stat health;
>  };
>
>  static int drc_pmem_bind(struct papr_scm_priv *p)
> @@ -158,6 +157,7 @@ static int drc_pmem_query_health(struct papr_scm_priv *p)
>  {
> unsigned long ret[PLPAR_HCALL_BUFSIZE];
> int64_t rc;
> +   __be64 health;
>
> rc = plpar_hcall(H_SCM_HEALTH, ret, p->drc_index);
> if (rc != H_SUCCESS) {
> @@ -172,13 +172,41 @@ static int drc_pmem_query_health(struct papr_scm_priv 
> *p)
> return rc;
>
> /* Store the retrieved health information in dimm platform data */
> -   p->health_bitmap = ret[0];
> -   

Re: [PATCH v5 3/4] powerpc/papr_scm,uapi: Add support for handling PAPR DSM commands

2020-04-03 Thread Dan Williams
On Tue, Mar 31, 2020 at 7:33 AM Vaibhav Jain  wrote:
>
> Implement support for handling PAPR DSM commands in papr_scm
> module. We advertise support for ND_CMD_CALL for the dimm command mask
> and implement necessary scaffolding in the module to handle ND_CMD_CALL
> ioctl and DSM commands that we receive.

They aren't ACPI Device Specific Methods in the papr_scm case, right?
I'd call them what the papr_scm specification calls them and replace
"DSM" throughout.

> The layout of the DSM commands as we expect from libnvdimm/libndctl is
> described in newly introduced uapi header 'papr_scm_dsm.h' which
> defines a new 'struct nd_papr_scm_cmd_pkg' header. This header is used
> to communicate the DSM command via 'nd_pkg_papr_scm->nd_command' and
> size of payload that need to be sent/received for servicing the DSM.
>
> The PAPR DSM commands are assigned indexes started from 0x1 to
> prevent them from overlapping ND_CMD_* values and also makes handling
> dimm commands in papr_scm_ndctl().

You don't necessarily need to have command number separation like
that. The function number spaces are unique per family.

> A new function cmd_to_func() is
> implemented that reads the args to papr_scm_ndctl() and performs
> sanity tests on them. In case of a DSM command being sent via
> ND_CMD_CALL a newly introduced function papr_scm_service_dsm() is
> called to handle the request.
>
> Signed-off-by: Vaibhav Jain 
>
> ---
> Changelog:
>
> v4..v5: Fixed a bug in new implementation of papr_scm_ndctl().
>
> v3..v4: Updated papr_scm_ndctl() to delegate DSM command handling to a
> different function papr_scm_service_dsm(). [Aneesh]
>
> v2..v3: Updated the nd_papr_scm_cmd_pkg to use __xx types as its
> exported to the userspace [Aneesh]
>
> v1..v2: New patch in the series.
> ---
>  arch/powerpc/include/uapi/asm/papr_scm_dsm.h | 161 +++
>  arch/powerpc/platforms/pseries/papr_scm.c|  97 ++-
>  2 files changed, 252 insertions(+), 6 deletions(-)
>  create mode 100644 arch/powerpc/include/uapi/asm/papr_scm_dsm.h
>
> diff --git a/arch/powerpc/include/uapi/asm/papr_scm_dsm.h 
> b/arch/powerpc/include/uapi/asm/papr_scm_dsm.h
> new file mode 100644
> index ..c039a49b41b4
> --- /dev/null
> +++ b/arch/powerpc/include/uapi/asm/papr_scm_dsm.h
> @@ -0,0 +1,161 @@
> +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
> +/*
> + * PAPR SCM Device specific methods and struct for libndctl and ndctl
> + *
> + * (C) Copyright IBM 2020
> + *
> + * Author: Vaibhav Jain 
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2, or (at your option)
> + * any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.

These 2 paragraphs of redundant license text can be dropped. The SPDX
line is sufficient.

> + */
> +
> +#ifndef _UAPI_ASM_POWERPC_PAPR_SCM_DSM_H_
> +#define _UAPI_ASM_POWERPC_PAPR_SCM_DSM_H_
> +
> +#include 
> +
> +#ifdef __KERNEL__
> +#include 
> +#else
> +#include 
> +#endif
> +
> +/*
> + * DSM Envelope:
> + *
> + * The ioctl ND_CMD_CALL transfers data between user-space and kernel via
> + * 'envelopes' which consists of a header and user-defined payload sections.
> + * The header is described by 'struct nd_papr_scm_cmd_pkg' which expects a
> + * payload following it and offset of which relative to the struct is 
> provided
> + * by 'nd_papr_scm_cmd_pkg.payload_offset'. *
> + *
> + *  +-+-+---+
> + *  |   64-Bytes  |   8-Bytes   |   Max 184-Bytes   |
> + *  +-+-+---+
> + *  |   nd_papr_scm_cmd_pkg |   |
> + *  |-+ |   |
> + *  |  nd_cmd_pkg | |   |
> + *  +-+-+---+
> + *  | nd_family   ||   |
> + *  | nd_size_out | cmd_status  |  |
> + *  | nd_size_in  | payload_version |  PAYLOAD |
> + *  | nd_command  | payload_offset ->  |
> + *  | nd_fw_size  | |  |
> + *  +-+-+---+
> + *
> + * DSM Header:
> + *
> + * The header is defined as 'struct nd_papr_scm_cmd_pkg' which embeds a
> + * 'struct nd_cmd_pkg' instance. The DSM command is assigned to member
> + * 'nd_cmd_pkg.nd_command'. Apart from size information of the envelop which 
> is

 s/envelop/envelope/

There's a 

Re: [PATCH v5 2/4] ndctl/uapi: Introduce NVDIMM_FAMILY_PAPR_SCM as a new NVDIMM DSM family

2020-04-03 Thread Dan Williams
On Tue, Mar 31, 2020 at 7:33 AM Vaibhav Jain  wrote:
>
> Add PAPR-scm family of DSM command-set to the white list of NVDIMM
> command sets.
>
> Signed-off-by: Vaibhav Jain 
> ---
> Changelog:
>
> v4..v5 : None
>
> v3..v4 : None
>
> v2..v3 : Updated the patch prefix to 'ndctl/uapi' [Aneesh]
>
> v1..v2 : None
> ---
>  include/uapi/linux/ndctl.h | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h
> index de5d90212409..99fb60600ef8 100644
> --- a/include/uapi/linux/ndctl.h
> +++ b/include/uapi/linux/ndctl.h
> @@ -244,6 +244,7 @@ struct nd_cmd_pkg {
>  #define NVDIMM_FAMILY_HPE2 2
>  #define NVDIMM_FAMILY_MSFT 3
>  #define NVDIMM_FAMILY_HYPERV 4
> +#define NVDIMM_FAMILY_PAPR_SCM 5

Looks good, but please squash it with patch 3.


  1   2   3   4   5   >