Re: [PATCH] powerpc/legacy_serial: Fix UBSAN: array-index-out-of-bounds

2021-05-10 Thread Christophe Leroy




On 11/05/2021 at 03:16, Michael Ellerman wrote:

Segher Boessenkool  writes:


On Sat, May 08, 2021 at 06:36:21AM +, Christophe Leroy wrote:

UBSAN complains when a pointer is calculated with an invalid
'legacy_serial_console' index, although the index is verified
before dereferencing the pointer.


Addressing like this is UB already.

You could just move this:


-   if (legacy_serial_console < 0)
-   return 0;


to before


-   struct legacy_serial_info *info = &legacy_serial_infos[legacy_serial_console];
-   struct plat_serial8250_port *port = &legacy_serial_ports[legacy_serial_console];


and no other change is necessary.


Yeah I sent a v2 doing that, thanks.



I wanted something looking similar to setup_legacy_serial_console(), but of 
course this also works.

Christophe


[PATCH kernel v2] powerpc/makefile: Do not redefine $(CPP) for preprocessor

2021-05-10 Thread Alexey Kardashevskiy
The $(CPP) macro (preprocessing only) is already defined in the top-level
Makefile. However, powerpc redefines it and adds $(KBUILD_CFLAGS), which
results in flag duplication. That is not a big deal by itself, except for
flags which depend on other flags and which the compiler checks as it
parses the command line.

Specifically, scripts/Makefile.build:304 generates ksyms for .S files.
If clang+llvm+sanitizer are enabled, this results in

-emit-llvm-bc -fno-lto -flto -fvisibility=hidden \
 -fsanitize=cfi-mfcall -fno-lto  ...

in the clang command line and triggers error:

clang-13: error: invalid argument '-fsanitize=cfi-mfcall' only allowed with 
'-flto'

This removes the unnecessary CPP redefinition. This works fine as in most
places KBUILD_CFLAGS is passed to $CPP, except for
arch/powerpc/kernel/vdso64/vdso(32|64).lds (and probably some others,
not yet detected). To fix the vdso, we:
1. explicitly add -m(big|little)-endian to $CPP;
2. (for clang) add $CLANG_FLAGS to $KBUILD_CPPFLAGS, as otherwise clang
silently ignores -m(big|little)-endian if the build platform does not
support big endian (such as x86), so --prefix= is required.

While at it, remove some duplication from CPPFLAGS_vdso(32|64)
as cmd_cpp_lds_S passes those flags anyway. It still puzzles me why we
need the -C (preserve comments in the preprocessor output) flag here.

Signed-off-by: Alexey Kardashevskiy 
---
Changes:
v2:
* fix KBUILD_CPPFLAGS
* add CLANG_FLAGS to CPPFLAGS
---
 Makefile| 1 +
 arch/powerpc/Makefile   | 3 ++-
 arch/powerpc/kernel/vdso32/Makefile | 2 +-
 arch/powerpc/kernel/vdso64/Makefile | 2 +-
 4 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 72af8e423f11..13acd2183d55 100644
--- a/Makefile
+++ b/Makefile
@@ -591,6 +591,7 @@ CLANG_FLAGS += --prefix=$(GCC_TOOLCHAIN_DIR)$(notdir $(CROSS_COMPILE))
 endif
 CLANG_FLAGS+= -Werror=unknown-warning-option
 KBUILD_CFLAGS  += $(CLANG_FLAGS)
+KBUILD_CPPFLAGS+= $(CLANG_FLAGS)
 KBUILD_AFLAGS  += $(CLANG_FLAGS)
 export CLANG_FLAGS
 endif
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index 3212d076ac6a..306bfd2797ad 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -76,6 +76,7 @@ endif
 
 ifdef CONFIG_CPU_LITTLE_ENDIAN
 KBUILD_CFLAGS  += -mlittle-endian
+KBUILD_CPPFLAGS+= -mlittle-endian
 KBUILD_LDFLAGS += -EL
 LDEMULATION:= lppc
 GNUTARGET  := powerpcle
@@ -83,6 +84,7 @@ MULTIPLEWORD  := -mno-multiple
 KBUILD_CFLAGS_MODULE += $(call cc-option,-mno-save-toc-indirect)
 else
 KBUILD_CFLAGS += $(call cc-option,-mbig-endian)
+KBUILD_CPPFLAGS += $(call cc-option,-mbig-endian)
 KBUILD_LDFLAGS += -EB
 LDEMULATION:= ppc
 GNUTARGET  := powerpc
@@ -208,7 +210,6 @@ KBUILD_CPPFLAGS += -I $(srctree)/arch/$(ARCH) $(asinstr)
 KBUILD_AFLAGS  += $(AFLAGS-y)
 KBUILD_CFLAGS  += $(call cc-option,-msoft-float)
 KBUILD_CFLAGS  += -pipe $(CFLAGS-y)
-CPP= $(CC) -E $(KBUILD_CFLAGS)
 
 CHECKFLAGS += -m$(BITS) -D__powerpc__ -D__powerpc$(BITS)__
 ifdef CONFIG_CPU_BIG_ENDIAN
diff --git a/arch/powerpc/kernel/vdso32/Makefile b/arch/powerpc/kernel/vdso32/Makefile
index 7d9a6fee0e3d..ea001c6df1fa 100644
--- a/arch/powerpc/kernel/vdso32/Makefile
+++ b/arch/powerpc/kernel/vdso32/Makefile
@@ -44,7 +44,7 @@ asflags-y := -D__VDSO32__ -s
 
 obj-y += vdso32_wrapper.o
 targets += vdso32.lds
-CPPFLAGS_vdso32.lds += -P -C -Upowerpc
+CPPFLAGS_vdso32.lds += -C
 
 # link rule for the .so file, .lds has to be first
$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) $(obj)/vgettimeofday.o FORCE
diff --git a/arch/powerpc/kernel/vdso64/Makefile b/arch/powerpc/kernel/vdso64/Makefile
index 2813e3f98db6..07eadba48c7a 100644
--- a/arch/powerpc/kernel/vdso64/Makefile
+++ b/arch/powerpc/kernel/vdso64/Makefile
@@ -30,7 +30,7 @@ ccflags-y := -shared -fno-common -fno-builtin -nostdlib \
 asflags-y := -D__VDSO64__ -s
 
 targets += vdso64.lds
-CPPFLAGS_vdso64.lds += -P -C -U$(ARCH)
+CPPFLAGS_vdso64.lds += -C
 
 # link rule for the .so file, .lds has to be first
$(obj)/vdso64.so.dbg: $(src)/vdso64.lds $(obj-vdso64) $(obj)/vgettimeofday.o FORCE
-- 
2.30.2



Re: [V3 PATCH 06/16] powerpc/pseries/vas: Define VAS/NXGZIP HCALLs and structs

2021-05-10 Thread Haren Myneni
On Mon, 2021-05-10 at 15:49 +1000, Nicholas Piggin wrote:
> Excerpts from Haren Myneni's message of April 18, 2021 7:05 am:
> > This patch adds HCALLs and other definitions. Also define structs
> > that are used in VAS implementation on powerVM.
> > 
> > Signed-off-by: Haren Myneni 
> > ---
> >  arch/powerpc/include/asm/hvcall.h|  7 ++
> >  arch/powerpc/include/asm/vas.h   | 28 
> >  arch/powerpc/platforms/pseries/vas.h | 96
> > 
> >  3 files changed, 131 insertions(+)
> >  create mode 100644 arch/powerpc/platforms/pseries/vas.h
> > 
> > diff --git a/arch/powerpc/include/asm/hvcall.h
> > b/arch/powerpc/include/asm/hvcall.h
> > index ed6086d57b22..accbb7f6f272 100644
> > --- a/arch/powerpc/include/asm/hvcall.h
> > +++ b/arch/powerpc/include/asm/hvcall.h
> > @@ -294,6 +294,13 @@
> >  #define H_RESIZE_HPT_COMMIT0x370
> >  #define H_REGISTER_PROC_TBL0x37C
> >  #define H_SIGNAL_SYS_RESET 0x380
> > +#define H_ALLOCATE_VAS_WINDOW     0x388
> > +#define H_MODIFY_VAS_WINDOW       0x38C
> > +#define H_DEALLOCATE_VAS_WINDOW   0x390
> > +#define H_QUERY_VAS_WINDOW        0x394
> > +#define H_QUERY_VAS_CAPABILITIES  0x398
> > +#define H_QUERY_NX_CAPABILITIES   0x39C
> > +#define H_GET_NX_FAULT            0x3A0
> 
> These should be spaces.
> 
> >  #define H_INT_GET_SOURCE_INFO   0x3A8
> >  #define H_INT_SET_SOURCE_CONFIG 0x3AC
> >  #define H_INT_GET_SOURCE_CONFIG 0x3B0
> > diff --git a/arch/powerpc/include/asm/vas.h
> > b/arch/powerpc/include/asm/vas.h
> > index f928bf4c7e98..d15784506a54 100644
> > --- a/arch/powerpc/include/asm/vas.h
> > +++ b/arch/powerpc/include/asm/vas.h
> > @@ -179,6 +179,7 @@ struct vas_tx_win_attr {
> > bool rx_win_ord_mode;
> >  };
> >  
> > +#ifdef CONFIG_PPC_POWERNV
> >  /*
> >   * Helper to map a chip id to VAS id.
> >   * For POWER9, this is a 1:1 mapping. In the future this maybe a
> > 1:N
> > @@ -243,6 +244,33 @@ int vas_paste_crb(struct vas_window *win, int
> > offset, bool re);
> >  int vas_register_api_powernv(struct module *mod, enum vas_cop_type
> > cop_type,
> >  const char *name);
> >  void vas_unregister_api_powernv(void);
> > +#endif
> > +
> > +#ifdef CONFIG_PPC_PSERIES
> > +
> > +/* VAS Capabilities */
> > +#define VAS_GZIP_QOS_FEAT  0x1
> > +#define VAS_GZIP_DEF_FEAT  0x2
> > +#define VAS_GZIP_QOS_FEAT_BIT  (1UL << (63 -
> > VAS_GZIP_QOS_FEAT)) /* Bit 1 */
> > +#define VAS_GZIP_DEF_FEAT_BIT  (1UL << (63 -
> > VAS_GZIP_DEF_FEAT)) /* Bit 2 */
> 
> Use PPC_BIT for these.
> 
> > +
> > +/* NX Capabilities */
> > +#define VAS_NX_GZIP_FEAT       0x1
> > +#define VAS_NX_GZIP_FEAT_BIT   (1UL << (63 - VAS_NX_GZIP_FEAT)) /* Bit 1 */
> > +#define VAS_DESCR_LEN          8
> > +
> > +struct vas_all_capabs_be {
> > +   __be64  descriptor;
> > +   __be64  feat_type;
> > +} __packed __aligned(0x1000);
> > +
> > +struct vas_all_capabs {
> > +   charname[VAS_DESCR_LEN + 1];
> > +   u64 descriptor;
> > +   u64 feat_type;
> > +};
> 
> You're using _be for the struct that is passed to the hcall, and a 
> non-postfixed one for something the driver uses internally? It seems
> like buf or buffer, or hv_ prefix is typically used rather than be
> (host 
> kernel could be BE).
> 
> struct hv_query_vas_capabilities_buffer for example.

Can I use "hv_vas_all_caps_buf" instead of vas_all_capabs_be?
> 
> Does the hcall really require 0x1000 alignment?

The PAPR user mode NX spec says "resultBuffer : The logical real address of a
size-aligned 4K buffer to store VAS capabilities in" (Section:
H_QUERY_VAS_CAPABILITIES HCALL)
> 
> > +
> > +#endif
> >  
> >  /*
> >   * Register / unregister coprocessor type to VAS API which will be
> > exported
> > diff --git a/arch/powerpc/platforms/pseries/vas.h
> > b/arch/powerpc/platforms/pseries/vas.h
> > new file mode 100644
> > index ..208682fffa57
> > --- /dev/null
> > +++ b/arch/powerpc/platforms/pseries/vas.h
> > @@ -0,0 +1,96 @@
> > +/* SPDX-License-Identifier: GPL-2.0-or-later */
> > +/*
> > + * Copyright 2020-21 IBM Corp.
> > + */
> > +
> > +#ifndef _VAS_H
> > +#define _VAS_H
> > +#include 
> > +#include 
> > +#include 
> > +
> > +/*
> > + * VAS window modify flags
> > + */
> > +#define VAS_MOD_WIN_CLOSE      (1UL << 63)
> > +#define VAS_MOD_WIN_JOBS_KILL  (1UL << (63 - 1))
> > +#define VAS_MOD_WIN_DR         (1UL << (63 - 3))
> > +#define VAS_MOD_WIN_PR         (1UL << (63 - 4))
> > +#define VAS_MOD_WIN_SF         (1UL << (63 - 5))
> > +#define VAS_MOD_WIN_TA         (1UL << (63 - 6))
> > +#define VAS_MOD_WIN_FLAGS      (VAS_MOD_WIN_JOBS_KILL | VAS_MOD_WIN_DR | \
> > +                                VAS_MOD_WIN_PR | VAS_MOD_WIN_SF)
> > +
> > +#define VAS_WIN_ACTIVE         0x0
> > +#define VAS_WIN_CLOSED         0x1
> > +#define VAS_WIN_INACTIVE       0x2 /* Inactive due to HW failure */
> > +/* Process of being modified, deallocated, or quiesced */
> > 

Re: [V3 PATCH 07/16] powerpc/vas: Define QoS credit flag to allocate window

2021-05-10 Thread Haren Myneni
On Mon, 2021-05-10 at 15:54 +1000, Nicholas Piggin wrote:
> Excerpts from Haren Myneni's message of April 18, 2021 7:06 am:
> > pHyp introduces two different types of credits: Default and Quality
> > of service (QoS).
> > 
> > The total number of default credits available on each LPAR depends
> > on CPU resources configured. But these credits can be shared or
> > over-committed across LPARs in shared mode which can result in
> > paste command failure (RMA_busy). To avoid NX HW contention, phyp
> > introduces a QoS credit type which guarantees access to NX
> > resources. The system admins can assign QoS credits for each LPAR
> > via HMC.
> > 
> > Default credit type is used to allocate a VAS window by default as
> > on powerVM implementation. But the process can pass
> > VAS_WIN_QOS_CREDITS
> 
> There's some interchanging of pHyp and PowerVM in the series.
> 
> PowerVM is probably the better term to use, with uppercase P.
> Unless you mean PAPR or pseries etc.
> 
> I think you can say the PAPR VAS spec has two different types of 
> credits, rather than say a specific hypervisor is introducing them.

DEF and QoS credit types are introduced by the hypervisor, not by the VAS
PAPR spec. We did not have these types on powerNV.

> 
> > flag with VAS_TX_WIN_OPEN ioctl to open QoS type window.
> > 
> > Signed-off-by: Haren Myneni 
> > ---
> >  arch/powerpc/include/uapi/asm/vas-api.h | 6 +-
> >  1 file changed, 5 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/powerpc/include/uapi/asm/vas-api.h
> > b/arch/powerpc/include/uapi/asm/vas-api.h
> > index ebd4b2424785..eb7c8694174f 100644
> > --- a/arch/powerpc/include/uapi/asm/vas-api.h
> > +++ b/arch/powerpc/include/uapi/asm/vas-api.h
> > @@ -13,11 +13,15 @@
> >  #define VAS_MAGIC  'v'
> >  #define VAS_TX_WIN_OPEN_IOW(VAS_MAGIC, 0x20, struct
> > vas_tx_win_open_attr)
> >  
> > +/* Flags to VAS TX open window ioctl */
> > +/* To allocate a window with QoS credit, otherwise default credit
> > is used */
> > +#define VAS_WIN_QOS_CREDITS        0x0001
> > +
> >  struct vas_tx_win_open_attr {
> 
> Some consistency of naming might help, VAS_TX_WIN_FLAG_QOS_CREDIT.

Sure, will change.
> 
> > __u32   version;
> > __s16   vas_id; /* specific instance of vas or -1 for
> > default */
> > __u16   reserved1;
> > -   __u64   flags;  /* Future use */
> > +   __u64   flags;
> > __u64   reserved2[6];
> >  };
> >  
> > -- 
> > 2.18.2
> > 
> > 
> > 



Re: [V3 PATCH 09/16] powerpc/pseries/vas: Implement to get all capabilities

2021-05-10 Thread Haren Myneni
On Mon, 2021-05-10 at 16:13 +1000, Nicholas Piggin wrote:
> Excerpts from Haren Myneni's message of April 18, 2021 7:08 am:
> > pHyp provides various VAS capabilities such as GZIP default and QoS
> > capabilities which are used to determine the total number of credits
> > available in the LPAR, maximum window credits, maximum LPAR credits,
> > whether usermode copy/paste is supported, and so on.
> > 
> > So first retrieve overall vas capabilities using
> > H_QUERY_VAS_CAPABILITIES HCALL which tells the specific features
> > that
> > are available. Then retrieve the specific capabilities by using the
> > feature type in H_QUERY_VAS_CAPABILITIES HCALL.
> > 
> > pHyp supports only GZIP default and GZIP QoS capabilities right
> > now.
> 
> Changelog and title could use a bit of work.
> 
> > Signed-off-by: Haren Myneni 
> > ---
> >  arch/powerpc/platforms/pseries/vas.c | 130
> > +++
> >  1 file changed, 130 insertions(+)
> > 
> > diff --git a/arch/powerpc/platforms/pseries/vas.c
> > b/arch/powerpc/platforms/pseries/vas.c
> > index 06960151477c..35946fb02995 100644
> > --- a/arch/powerpc/platforms/pseries/vas.c
> > +++ b/arch/powerpc/platforms/pseries/vas.c
> > @@ -30,6 +30,13 @@
> >  /* phyp allows one credit per window right now */
> >  #define DEF_WIN_CREDS  1
> >  
> > +static struct vas_all_capabs capabs_all;
> 
> Does this name come from PAPR? If not, capabilities or caps are
> better 
> for readability than capabs.

No, capabs is not from PAPR; I just added capabs as a short name for
capabilities. I will change it to caps as suggested.

> 
> > +static int copypaste_feat;
> 
> Should be a bool? And what does it mean? copy-paste is a host core 
> capability.

copypaste_feat can be a bool. It is used to export the user space API only
if reading all capabilities is successful.
> 
> > +
> > +struct vas_capabs vcapabs[VAS_MAX_FEAT_TYPE];
> > +
> > +DEFINE_MUTEX(vas_pseries_mutex);
> 
> Can these be made static if they're only used here, and export them
> if a 
> future patch uses them (or add the header declaration now).
> 
> 
> > +
> >  static int64_t hcall_return_busy_check(int64_t rc)
> >  {
> > /* Check if we are stalled for some time */
> > @@ -215,3 +222,126 @@ int plpar_vas_query_capabilities(const u64
> > hcall, u8 query_type,
> > return -EIO;
> > }
> >  }
> > +
> > +/*
> > + * Get the specific capabilities based on the feature type.
> > + * Right now supports GZIP default and GZIP QoS capabilities.
> > + */
> > +static int get_vas_capabilities(u8 feat, enum vas_cop_feat_type
> > type,
> > +   struct vas_ct_capabs_be *capab_be)
> > +{
> > +   struct vas_ct_capabs *capab;
> > +   struct vas_capabs *vcapab;
> > +   int rc = 0;
> > +
> > +   vcapab = &vcapabs[type];
> > +   memset(vcapab, 0, sizeof(*vcapab));
> > +   INIT_LIST_HEAD(>list);
> > +
> > +   capab = &vcapab->capab;
> > +
> > +   rc = plpar_vas_query_capabilities(H_QUERY_VAS_CAPABILITIES,
> > feat,
> > + (u64)virt_to_phys(capab_be));
> > +   if (rc)
> > +   return rc;
> > +
> > +   capab->user_mode = capab_be->user_mode;
> > +   if (!(capab->user_mode & VAS_COPY_PASTE_USER_MODE)) {
> > +   pr_err("User space COPY/PASTE is not supported\n");
> > +   return -ENOTSUPP;
> > +   }
> > +
> > +   snprintf(capab->name, VAS_DESCR_LEN + 1, "%.8s",
> > +(char *)&capab_be->descriptor);
> > +   capab->descriptor = be64_to_cpu(capab_be->descriptor);
> > +   capab->win_type = capab_be->win_type;
> > +   if (capab->win_type >= VAS_MAX_FEAT_TYPE) {
> > +   pr_err("Unsupported window type %u\n", capab->win_type);
> > +   return -EINVAL;
> > +   }
> > +   capab->max_lpar_creds = be16_to_cpu(capab_be->max_lpar_creds);
> > +   capab->max_win_creds = be16_to_cpu(capab_be->max_win_creds);
> > +   atomic_set(&capab->target_lpar_creds,
> > +  be16_to_cpu(capab_be->target_lpar_creds));
> > +   if (feat == VAS_GZIP_DEF_FEAT) {
> > +   capab->def_lpar_creds = be16_to_cpu(capab_be->def_lpar_creds);
> > +
> > +   if (capab->max_win_creds < DEF_WIN_CREDS) {
> > +   pr_err("Window creds(%u) > max allowed window
> > creds(%u)\n",
> > +  DEF_WIN_CREDS, capab->max_win_creds);
> > +   return -EINVAL;
> > +   }
> > +   }
> > +
> > +   copypaste_feat = 1;
> > +
> > +   return 0;
> > +}
> > +
> > +static int __init pseries_vas_init(void)
> > +{
> > +   struct vas_ct_capabs_be *ct_capabs_be;
> > +   struct vas_all_capabs_be *capabs_be;
> > +   int rc;
> > +
> > +   /*
> > +* Linux supports user space COPY/PASTE only with Radix
> > +*/
> > +   if (!radix_enabled()) {
> > +   pr_err("API is supported only with radix page
> > tables\n");
> > +   return -ENOTSUPP;
> > +   }
> > +
> > +   capabs_be = kmalloc(sizeof(*capabs_be), GFP_KERNEL);
> > +   if (!capabs_be)
> > +   return -ENOMEM;
> > +   /*
> > +* Get VAS overall capabilities 

Re: [PATCH V3 02/16] powerpc/vas: Move VAS API to common book3s platform

2021-05-10 Thread Michael Ellerman
Nicholas Piggin  writes:
> Excerpts from Haren Myneni's message of April 18, 2021 7:02 am:
>> 
>> Using the same /dev/crypto/nx-gzip interface for both powerNV and
>> pseries.
>
> The pseries NX driver will use the powernv VAS API ?
>
>> So this patch creates platforms/book3s/ and moves VAS API
>> to that directory. The actual functionality is not changed.
>> 
>> Common interface functions such as open, window open ioctl, mmap
>> and close are moved to arch/powerpc/platforms/book3s/vas-api.c.
>> Added hooks to call platform specific code, but the underlying
>> powerNV code in these functions is not changed.
>
> Even so, could you do one patch that just moves, and another that
> adds the ops struct?
>
>> 
>> Signed-off-by: Haren Myneni 
>> ---
>>  arch/powerpc/include/asm/vas.h| 22 ++-
>>  arch/powerpc/platforms/Kconfig|  1 +
>>  arch/powerpc/platforms/Makefile   |  1 +
>>  arch/powerpc/platforms/book3s/Kconfig | 15 +
>>  arch/powerpc/platforms/book3s/Makefile|  2 +
>
> The usual place for these would be arch/powerpc/sysdev/vas. E.g., see
> arch/powerpc/sysdev/xive.

You're right that is the usual place, but is it a good place? :)

Using platforms/book3s was my suggestion:

  https://lore.kernel.org/linuxppc-dev/87k0p6s5lo@mpe.ellerman.id.au/


But I don't feel that strongly about it, maybe just dumping things in
sysdev is easier.

cheers


[PATCH] s390/hvc_iucv: Drop unnecessary NULL check after container_of

2021-05-10 Thread Guenter Roeck
The result of container_of() operations is never NULL unless the extracted
element is the first element of the embedded structure. This is not the
case here. The NULL check is therefore unnecessary and misleading.
Remove it.
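
For illustration, here is a minimal sketch (hypothetical types, not from this
driver) of why such a NULL check is dead code whenever the member is not the
first element of the containing structure:

/* Sketch only: container_of() subtracts the member offset, so for any
 * valid 'work' pointer the computed container pointer cannot be NULL. */
#include <linux/kernel.h>
#include <linux/workqueue.h>

struct foo {
	int a;				/* offset 0 */
	struct work_struct sndbuf_work;	/* non-zero offset */
};

static void foo_work_fn(struct work_struct *work)
{
	struct foo *f = container_of(work, struct foo, sndbuf_work);

	/* "if (!f) return;" here could never trigger. */
	pr_info("a=%d\n", f->a);
}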

This change was made automatically with the following Coccinelle script.

@@
type t;
identifier v;
statement s;
@@

<+...
(
  t v = container_of(...);
|
  v = container_of(...);
)
  ...
  when != v
- if (\( !v \| v == NULL \) ) s
...+>

Signed-off-by: Guenter Roeck 
---
 drivers/tty/hvc/hvc_iucv.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/tty/hvc/hvc_iucv.c b/drivers/tty/hvc/hvc_iucv.c
index 2af1e5751bd6..3bd03ae01bf5 100644
--- a/drivers/tty/hvc/hvc_iucv.c
+++ b/drivers/tty/hvc/hvc_iucv.c
@@ -438,8 +438,6 @@ static void hvc_iucv_sndbuf_work(struct work_struct *work)
struct hvc_iucv_private *priv;
 
priv = container_of(work, struct hvc_iucv_private, sndbuf_work.work);
-   if (!priv)
-   return;
 
spin_lock_bh(&priv->lock);
hvc_iucv_send(priv);
-- 
2.25.1



Re: [PATCH v3 1/3] audit: replace magic audit syscall class numbers with macros

2021-05-10 Thread Paul Moore
On Fri, Apr 30, 2021 at 4:36 PM Richard Guy Briggs  wrote:
>
> Replace audit syscall class magic numbers with macros.
>
> This required putting the macros into new header file
> include/linux/auditscm.h since the syscall macros were included for both 64
> bit and 32 bit in any compat code, causing redefinition warnings.

The ifndef/define didn't protect against redeclaration?  Huh.  Maybe
I'm not thinking about this correctly, or the arch specific code is
doing something wonky ...

Regardless, assuming that it is necessary, I would prefer if we called
it auditsc.h instead of auditscm.h; the latter makes me think of
sockets and not syscalls.

> Signed-off-by: Richard Guy Briggs 
> ---
>  MAINTAINERS|  1 +
>  arch/alpha/kernel/audit.c  |  8 
>  arch/ia64/kernel/audit.c   |  8 
>  arch/parisc/kernel/audit.c |  8 
>  arch/parisc/kernel/compat_audit.c  |  9 +
>  arch/powerpc/kernel/audit.c| 10 +-
>  arch/powerpc/kernel/compat_audit.c | 11 ++-
>  arch/s390/kernel/audit.c   | 10 +-
>  arch/s390/kernel/compat_audit.c| 11 ++-
>  arch/sparc/kernel/audit.c  | 10 +-
>  arch/sparc/kernel/compat_audit.c   | 11 ++-
>  arch/x86/ia32/audit.c  | 11 ++-
>  arch/x86/kernel/audit_64.c |  8 
>  include/linux/audit.h  |  1 +
>  include/linux/auditscm.h   | 23 +++
>  kernel/auditsc.c   | 12 ++--
>  lib/audit.c| 10 +-
>  lib/compat_audit.c | 11 ++-
>  18 files changed, 102 insertions(+), 71 deletions(-)
>  create mode 100644 include/linux/auditscm.h

...

> diff --git a/include/linux/auditscm.h b/include/linux/auditscm.h
> new file mode 100644
> index ..1c4f0ead5931
> --- /dev/null
> +++ b/include/linux/auditscm.h
> @@ -0,0 +1,23 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +/* auditscm.h -- Auditing support syscall macros
> + *
> + * Copyright 2021 Red Hat Inc., Durham, North Carolina.
> + * All Rights Reserved.
> + *
> + * Author: Richard Guy Briggs 
> + */
> +#ifndef _LINUX_AUDITSCM_H_
> +#define _LINUX_AUDITSCM_H_
> +
> +enum auditsc_class_t {
> +   AUDITSC_NATIVE = 0,
> +   AUDITSC_COMPAT,
> +   AUDITSC_OPEN,
> +   AUDITSC_OPENAT,
> +   AUDITSC_SOCKETCALL,
> +   AUDITSC_EXECVE,
> +
> +   AUDITSC_NVALS /* count */
> +};
> +
> +#endif

-- 
paul moore
www.paul-moore.com


Re: [PATCH] powerpc/legacy_serial: Fix UBSAN: array-index-out-of-bounds

2021-05-10 Thread Michael Ellerman
Segher Boessenkool  writes:

> On Sat, May 08, 2021 at 06:36:21AM +, Christophe Leroy wrote:
>> UBSAN complains when a pointer is calculated with an invalid
>> 'legacy_serial_console' index, although the index is verified
>> before dereferencing the pointer.
>
> Addressing like this is UB already.
>
> You could just move this:
>
>> -if (legacy_serial_console < 0)
>> -return 0;
>
> to before
>
>> -struct legacy_serial_info *info = &legacy_serial_infos[legacy_serial_console];
>> -struct plat_serial8250_port *port = &legacy_serial_ports[legacy_serial_console];
>
> and no other change is necessary.

Yeah I sent a v2 doing that, thanks.

cheers


[PATCH v2] powerpc/legacy_serial: Fix UBSAN: array-index-out-of-bounds

2021-05-10 Thread Michael Ellerman
From: Christophe Leroy 

UBSAN complains when a pointer is calculated with an invalid
'legacy_serial_console' index, although the index is verified
before dereferencing the pointer.

Fix it by checking 'legacy_serial_console' validity before
calculating pointers.

Fixes: 0bd3f9e953bd ("powerpc/legacy_serial: Use early_ioremap()")
Reported-by: Paul Menzel 
Signed-off-by: Christophe Leroy 
Signed-off-by: Michael Ellerman 
---
 arch/powerpc/kernel/legacy_serial.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

v2: mpe: Just move the assignment of port/info below the check, as
suggested by Segher.

diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c
index 8b2c1a8553a0..cfc03e016ff2 100644
--- a/arch/powerpc/kernel/legacy_serial.c
+++ b/arch/powerpc/kernel/legacy_serial.c
@@ -356,13 +356,16 @@ static void __init setup_legacy_serial_console(int console)
 
 static int __init ioremap_legacy_serial_console(void)
 {
-   struct legacy_serial_info *info = &legacy_serial_infos[legacy_serial_console];
-   struct plat_serial8250_port *port = &legacy_serial_ports[legacy_serial_console];
+   struct plat_serial8250_port *port;
+   struct legacy_serial_info *info;
void __iomem *vaddr;
 
if (legacy_serial_console < 0)
return 0;
 
+   info = &legacy_serial_infos[legacy_serial_console];
+   port = &legacy_serial_ports[legacy_serial_console];
+
if (!info->early_addr)
return 0;
 
-- 
2.25.1



[PATCH -next] ppc: boot: Fix a typo in the file decompress.c

2021-05-10 Thread Zhang Jianhua
s/input buffer/output buffer/
s/length of the input buffer/length of the output buffer/



Signed-off-by: Zhang Jianhua 
---
 arch/powerpc/boot/decompress.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/boot/decompress.c b/arch/powerpc/boot/decompress.c
index 6098b879ac97..977eb15a6d17 100644
--- a/arch/powerpc/boot/decompress.c
+++ b/arch/powerpc/boot/decompress.c
@@ -99,8 +99,8 @@ static void print_err(char *s)
  * partial_decompress - decompresses part or all of a compressed buffer
  * @inbuf:   input buffer
  * @input_size:  length of the input buffer
- * @outbuf:  input buffer
- * @output_size: length of the input buffer
+ * @outbuf:  output buffer
+ * @output_size: length of the output buffer
  * @skip number of output bytes to ignore
  *
  * This function takes compressed data from inbuf, decompresses and write it to
-- 
2.17.1



Re: [PATCH] powerpc/legacy_serial: Fix UBSAN: array-index-out-of-bounds

2021-05-10 Thread Segher Boessenkool
On Sat, May 08, 2021 at 06:36:21AM +, Christophe Leroy wrote:
> UBSAN complains when a pointer is calculated with an invalid
> 'legacy_serial_console' index, although the index is verified
> before dereferencing the pointer.

Addressing like this is UB already.

You could just move this:

> - if (legacy_serial_console < 0)
> - return 0;

to before

> - struct legacy_serial_info *info = &legacy_serial_infos[legacy_serial_console];
> - struct plat_serial8250_port *port = &legacy_serial_ports[legacy_serial_console];

and no other change is necessary.
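
To illustrate the point: forming the address of an out-of-range array element
is undefined behaviour even if the pointer is never dereferenced, so the index
check has to come before the pointer computation. A minimal sketch (not the
kernel code):

/* Minimal sketch, not the kernel code: UBSAN flags the first variant
 * because the out-of-range address is computed before the check. */
static int table[4];

static int bad_order(int idx)
{
	int *p = &table[idx];	/* already UB when idx is e.g. -1 */

	if (idx < 0 || idx >= 4)
		return 0;
	return *p;
}

static int good_order(int idx)
{
	if (idx < 0 || idx >= 4)
		return 0;
	return table[idx];	/* address formed only after the check */
}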


Segher


Re: [V3 PATCH 12/16] powerpc/pseries/vas: sysfs interface to export capabilities

2021-05-10 Thread Haren Myneni
On Mon, 2021-05-10 at 16:34 +1000, Nicholas Piggin wrote:
> Excerpts from Haren Myneni's message of April 18, 2021 7:10 am:
> > pHyp provides GZIP default and GZIP QoS capabilities which give
> > the total number of credits available in the LPAR. This patch
> > creates sysfs entries and exports LPAR credits, the currently used
> > and the available credits for each feature.
> > 
> > /sys/kernel/vas/VasCaps/VDefGzip: (default GZIP capabilities)
> > avail_lpar_creds /* Available credits to use */
> > target_lpar_creds /* Total credits available which can be
> >  /* changed with DLPAR operation */
> > used_lpar_creds  /* Used credits */
> 
> /sys/kernel/ is not an appropriate directory to put it in. Also
> camel 
> case is not thought very highly of these days.

These capabilities are VAS specific ones (powerpc kernel), not for an
individual coprocessor. Not sure where to add them in sysfs.

> 
> And s/capabs/caps/g applies here (and all other patches).

Thanks, will change. 
> 
> Thanks,
> Nick



Re: [V3 PATCH 15/16] crypto/nx: Get NX capabilities for GZIP coprocessor type

2021-05-10 Thread Haren Myneni
On Mon, 2021-05-10 at 16:38 +1000, Nicholas Piggin wrote:
> Excerpts from Haren Myneni's message of April 18, 2021 7:12 am:
> > phyp provides NX capabilities which give the recommended minimum
> > compression / decompression length and the maximum request buffer size
> > in bytes.
> > 
> > Changes to get the NX overall capabilities which point to the specific
> > features phyp supports. Then retrieve the NXGZIP specific capabilities.
> > 
> > Signed-off-by: Haren Myneni 
> > ---
> >  drivers/crypto/nx/nx-common-pseries.c | 83
> > +++
> >  1 file changed, 83 insertions(+)
> > 
> > diff --git a/drivers/crypto/nx/nx-common-pseries.c
> > b/drivers/crypto/nx/nx-common-pseries.c
> > index 9a40fca8a9e6..49224870d05e 100644
> > --- a/drivers/crypto/nx/nx-common-pseries.c
> > +++ b/drivers/crypto/nx/nx-common-pseries.c
> > @@ -9,6 +9,7 @@
> >   */
> >  
> >  #include 
> > +#include 
> >  #include 
> >  
> >  #include "nx-842.h"
> > @@ -20,6 +21,24 @@ MODULE_DESCRIPTION("842 H/W Compression driver
> > for IBM Power processors");
> >  MODULE_ALIAS_CRYPTO("842");
> >  MODULE_ALIAS_CRYPTO("842-nx");
> >  
> > +struct nx_ct_capabs_be {
> 
> What does "ct" mean? I've seen it in a few other places too.

ct means coprocessor type, such as 842 and GZIP. phyp provides only GZIP
capabilities right now, but this struct may be extended to other types
in the future.

> 
> > +   __be64  descriptor;
> > +   __be64  req_max_processed_len;  /* Max bytes in one GZIP
> > request */
> > +   __be64  min_compress_len;   /* Min compression size in
> > bytes */
> > +   __be64  min_decompress_len; /* Min decompression size
> > in bytes */
> > +} __packed __aligned(0x1000);
> > +
> > +struct nx_ct_capabs {
> > +   charname[VAS_DESCR_LEN + 1];
> > +   u64 descriptor;
> > +   u64 req_max_processed_len;  /* Max bytes in one GZIP request */
> > +   u64 min_compress_len;   /* Min compression in bytes */
> > +   u64 min_decompress_len; /* Min decompression in bytes */
> > +};
> > +
> > +u64 capab_feat = 0;
> 
> Why is this here and not a local variable?

capab_feat is used to add / delete sysfs entries.

> 
> > +struct nx_ct_capabs nx_ct_capab;
> 
> It's okay and generally better to use the same name as the struct
> name
> in this situation, i.e.,
> 
> "struct nx_ct_capabs nx_ct_capabs"
> 
> (modulo static / caps / etc)

Sure, will change.

> 
> Thanks,
> Nick



Re: [V3 PATCH 03/16] powerpc/vas: Create take/drop task reference functions

2021-05-10 Thread Haren Myneni
On Mon, 2021-05-10 at 15:28 +1000, Nicholas Piggin wrote:
> Excerpts from Haren Myneni's message of April 18, 2021 7:03 am:
> > Take a task reference when each window is opened and drop it during
> > close. This functionality is needed for both powerNV and pseries. So
> > this patch defines the existing code as functions in the common book3s
> > platform vas-api.c
> > 
> > Signed-off-by: Haren Myneni 
> > ---
> >  arch/powerpc/include/asm/vas.h  | 20 
> >  arch/powerpc/platforms/book3s/vas-api.c | 51
> > ++
> >  arch/powerpc/platforms/powernv/vas-fault.c  | 10 ++--
> >  arch/powerpc/platforms/powernv/vas-window.c | 57 ++---
> > 
> >  arch/powerpc/platforms/powernv/vas.h|  6 +--
> >  5 files changed, 83 insertions(+), 61 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/vas.h
> > b/arch/powerpc/include/asm/vas.h
> > index 6bbade60d8f4..2daaa1a2a9a9 100644
> > --- a/arch/powerpc/include/asm/vas.h
> > +++ b/arch/powerpc/include/asm/vas.h
> > @@ -5,6 +5,9 @@
> >  
> >  #ifndef _ASM_POWERPC_VAS_H
> >  #define _ASM_POWERPC_VAS_H
> > +#include 
> > +#include 
> > +#include 
> >  #include 
> >  
> >  
> > @@ -60,6 +63,22 @@ struct vas_user_win_ops {
> > int (*close_win)(void *);
> >  };
> >  
> > +struct vas_win_task {
> > +   struct pid *pid;/* Thread group ID of owner */
> > +   struct pid *tgid;   /* Linux process mm_struct */
> > +   struct mm_struct *mm;   /* Linux process mm_struct */
> > +};
> 
> Looks okay, happy with struct vas_win_task? (and vas_user_win_ops)?
> 
> I'd be happier to have everything related to vas windows prefixed
> with 
> vas_window_ consistently, and have _user be present always for
> userspace
> windows, but you have to read and type it.

I will change to vas_window_task and vas_user_window_ops. struct
vas_window is common for both kernel and user space window, but struct
vas_window_task is needed only for user space.

> 
> > +
> > +static inline void vas_drop_reference_task(struct vas_win_task
> > *task)
> 
> This is not dropping a reference task, but a task reference. And
> it's 
> not really a task reference as far as Linux understands, but a
> reference on pid (not task) and mm related to an open vas window. And
> specifically a user window (with corresponding vas_user_win_ops).

Yes, we get and put references to pid and mm. Hence I named it
reference_task to cover both pid and mm. Would you prefer
vas_put_reference_pid_mm()?

> 
> Could it be called a 'struct vas_window_user_ref' instead?

To support LPM and DLPAR, I am planning to add the VMA (mapping address) in
struct vas_window_task. But I can change it to 'struct
vas_window_user_ref' instead of vas_window_task.

Thanks
Haren 

> 
> Thanks,
> Nick
> 
> > +{
> > +   /* Drop references to pid and mm */
> > +   put_pid(task->pid);
> > +   if (task->mm) {
> > +   mm_context_remove_vas_window(task->mm);
> > +   mmdrop(task->mm);
> > +   }



Re: [PATCH V3 02/16] powerpc/vas: Move VAS API to common book3s platform

2021-05-10 Thread Haren Myneni
On Mon, 2021-05-10 at 15:19 +1000, Nicholas Piggin wrote:
> Excerpts from Haren Myneni's message of April 18, 2021 7:02 am:
> > Using the same /dev/crypto/nx-gzip interface for both powerNV and
> > pseries.
> 
> The pseries NX driver will use the powernv VAS API ?
Yes, both powerNV and pseries drivers use the same VAS API.

> 
> > So this patch creates platforms/book3s/ and moves VAS API
> > to that directory. The actual functionality is not changed.
> > 
> > Common interface functions such as open, window open ioctl, mmap
> > and close are moved to arch/powerpc/platforms/book3s/vas-api.c.
> > Added hooks to call platform specific code, but the underlying
> > powerNV code in these functions is not changed.
> 
> Even so, could you do one patch that just moves, and another that
> adds the ops struct?

Sure, I will create separate patches. 

> 
> > Signed-off-by: Haren Myneni 
> > ---
> >  arch/powerpc/include/asm/vas.h| 22 ++-
> >  arch/powerpc/platforms/Kconfig|  1 +
> >  arch/powerpc/platforms/Makefile   |  1 +
> >  arch/powerpc/platforms/book3s/Kconfig | 15 +
> >  arch/powerpc/platforms/book3s/Makefile|  2 +
> 
> The usual place for these would be arch/powerpc/sysdev/vas. E.g., see
> arch/powerpc/sysdev/xive.

It was recommended to create book3s/ and move the common vas-api code there.

> 
> >  .../platforms/{powernv => book3s}/vas-api.c   | 64 ++-
> > ---
> >  arch/powerpc/platforms/powernv/Kconfig| 14 
> >  arch/powerpc/platforms/powernv/Makefile   |  2 +-
> >  arch/powerpc/platforms/powernv/vas-window.c   | 66
> > +++
> >  9 files changed, 143 insertions(+), 44 deletions(-)
> >  create mode 100644 arch/powerpc/platforms/book3s/Kconfig
> >  create mode 100644 arch/powerpc/platforms/book3s/Makefile
> >  rename arch/powerpc/platforms/{powernv => book3s}/vas-api.c (83%)
> > 
> > diff --git a/arch/powerpc/include/asm/vas.h
> > b/arch/powerpc/include/asm/vas.h
> > index 41f73fae7ab8..6bbade60d8f4 100644
> > --- a/arch/powerpc/include/asm/vas.h
> > +++ b/arch/powerpc/include/asm/vas.h
> > @@ -5,6 +5,8 @@
> >  
> >  #ifndef _ASM_POWERPC_VAS_H
> >  #define _ASM_POWERPC_VAS_H
> > +#include 
> > +
> >  
> >  struct vas_window;
> >  
> > @@ -48,6 +50,16 @@ enum vas_cop_type {
> > VAS_COP_TYPE_MAX,
> >  };
> >  
> > +/*
> > + * User space window operations used for powernv and powerVM
> > + */
> > +struct vas_user_win_ops {
> > +   struct vas_window * (*open_win)(struct vas_tx_win_open_attr *,
> > +   enum vas_cop_type);
> > +   u64 (*paste_addr)(void *);
> > +   int (*close_win)(void *);
> > +};
> > +
> >  /*
> >   * Receive window attributes specified by the (in-kernel) owner of
> > window.
> >   */
> > @@ -161,6 +173,9 @@ int vas_copy_crb(void *crb, int offset);
> >   * assumed to be true for NX windows.
> >   */
> >  int vas_paste_crb(struct vas_window *win, int offset, bool re);
> > +int vas_register_api_powernv(struct module *mod, enum vas_cop_type
> > cop_type,
> > +const char *name);
> > +void vas_unregister_api_powernv(void);
> >  
> >  /*
> >   * Register / unregister coprocessor type to VAS API which will be
> > exported
> > @@ -170,8 +185,9 @@ int vas_paste_crb(struct vas_window *win, int
> > offset, bool re);
> >   * Only NX GZIP coprocessor type is supported now, but this API
> > can be
> >   * used for others in future.
> >   */
> > -int vas_register_api_powernv(struct module *mod, enum vas_cop_type
> > cop_type,
> > -const char *name);
> > -void vas_unregister_api_powernv(void);
> > +int vas_register_coproc_api(struct module *mod, enum vas_cop_type
> > cop_type,
> > +   const char *name,
> > +   struct vas_user_win_ops *vops);
> > +void vas_unregister_coproc_api(void);
> >  
> >  #endif /* __ASM_POWERPC_VAS_H */
> > diff --git a/arch/powerpc/platforms/Kconfig
> > b/arch/powerpc/platforms/Kconfig
> > index 7a5e8f4541e3..594544a65b02 100644
> > --- a/arch/powerpc/platforms/Kconfig
> > +++ b/arch/powerpc/platforms/Kconfig
> > @@ -20,6 +20,7 @@ source
> > "arch/powerpc/platforms/embedded6xx/Kconfig"
> >  source "arch/powerpc/platforms/44x/Kconfig"
> >  source "arch/powerpc/platforms/40x/Kconfig"
> >  source "arch/powerpc/platforms/amigaone/Kconfig"
> > +source "arch/powerpc/platforms/book3s/Kconfig"
> >  
> >  config KVM_GUEST
> > bool "KVM Guest support"
> > diff --git a/arch/powerpc/platforms/Makefile
> > b/arch/powerpc/platforms/Makefile
> > index 143d4417f6cc..0e75d7df387b 100644
> > --- a/arch/powerpc/platforms/Makefile
> > +++ b/arch/powerpc/platforms/Makefile
> > @@ -22,3 +22,4 @@ obj-$(CONFIG_PPC_CELL)+= cell/
> >  obj-$(CONFIG_PPC_PS3)  += ps3/
> >  obj-$(CONFIG_EMBEDDED6xx)  += embedded6xx/
> >  obj-$(CONFIG_AMIGAONE) += amigaone/
> > +obj-$(CONFIG_PPC_BOOK3S)   += book3s/
> > diff --git a/arch/powerpc/platforms/book3s/Kconfig
> > 

Re: [V3 PATCH 05/16] powerpc/vas: Define and use common vas_window struct

2021-05-10 Thread Haren Myneni
On Mon, 2021-05-10 at 15:37 +1000, Nicholas Piggin wrote:
> Excerpts from Haren Myneni's message of April 18, 2021 7:04 am:
> > 
> 
> Empty email?
> 
> Thanks,
> Nick

Sorry, My mistake. Here is the patch and will add in V4 series.

[V3 PATCH 05/16] powerpc/vas:  Define and use common vas_window struct

The same vas_window struct is used on powerNV and pseries. So this patch
changes struct vas_window to support both platforms and makes the
corresponding modifications in the powerNV VAS code.

On powerNV, vas_window is used for both TX and RX windows, whereas on
powerVM it is used only for TX windows. So some elements are specific to
these platforms.

Signed-off-by: Haren Myneni 
---
 arch/powerpc/include/asm/vas.h  |  48 
 arch/powerpc/platforms/powernv/vas-debug.c  |  12 +-
 arch/powerpc/platforms/powernv/vas-fault.c  |   4 +-
 arch/powerpc/platforms/powernv/vas-trace.h  |   6 +-
 arch/powerpc/platforms/powernv/vas-window.c | 129 +++-
 arch/powerpc/platforms/powernv/vas.h|  38 +-
 6 files changed, 135 insertions(+), 102 deletions(-)

diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
index 66bf8fb1a1be..f928bf4c7e98 100644
--- a/arch/powerpc/include/asm/vas.h
+++ b/arch/powerpc/include/asm/vas.h
@@ -69,6 +69,54 @@ struct vas_win_task {
struct mm_struct *mm;   /* Linux process mm_struct */
 };
 
+/*
+ * In-kernel state a VAS window. One per window.
+ * powerVM: Used only for Tx windows.
+ * powerNV: Used for both Tx and Rx windows.
+ */
+struct vas_window {
+   u32 winid;
+   u32 wcreds_max; /* Window credits */
+   enum vas_cop_type cop;
+   struct vas_win_task task;
+   char *dbgname;
+   struct dentry *dbgdir;
+   union {
+   /* powerNV specific data */
+   struct {
+   void *vinst;/* points to VAS instance */
+   bool tx_win;/* True if send window */
+   bool nx_win;/* True if NX window */
+   bool user_win;  /* True if user space window */
+   void *hvwc_map; /* HV window context */
+   void *uwc_map;  /* OS/User window context */
+
+   /* Fields applicable only to send windows */
+   void *paste_kaddr;
+   char *paste_addr_name;
+   struct vas_window *rxwin;
+
+   atomic_t num_txwins;/* Only for receive windows */
+   } pnv;
+   struct {
+   u64 win_addr;   /* Physical paste address */
+   u8 win_type;/* QoS or Default window */
+   u8 status;
+   u32 complete_irq;   /* Completion interrupt */
+   u32 fault_irq;  /* Fault interrupt */
+   u64 domain[6];  /* Associativity domain Ids */
+   /* this window is allocated */
+   u64 util;
+
+   /* List of windows opened which is used for LPM */
+   struct list_head win_list;
+   u64 flags;
+   char *name;
+   int fault_virq;
+   } lpar;
+   };
+};
+
 static inline void vas_drop_reference_task(struct vas_win_task *task)
 {
/* Drop references to pid and mm */
diff --git a/arch/powerpc/platforms/powernv/vas-debug.c b/arch/powerpc/platforms/powernv/vas-debug.c
index 41fa90d2f4ab..80f735449ab8 100644
--- a/arch/powerpc/platforms/powernv/vas-debug.c
+++ b/arch/powerpc/platforms/powernv/vas-debug.c
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "vas.h"
 
 static struct dentry *vas_debugfs;
@@ -33,11 +34,11 @@ static int info_show(struct seq_file *s, void *private)
mutex_lock(&vas_mutex);
 
/* ensure window is not unmapped */
-   if (!window->hvwc_map)
+   if (!window->pnv.hvwc_map)
goto unlock;
 
seq_printf(s, "Type: %s, %s\n", cop_to_str(window->cop),
-   window->tx_win ? "Send" : "Receive");
+   window->pnv.tx_win ? "Send" : "Receive");
seq_printf(s, "Pid : %d\n", vas_window_pid(window));
 
 unlock:
@@ -60,7 +61,7 @@ static int hvwc_show(struct seq_file *s, void *private)
mutex_lock(&vas_mutex);
 
/* ensure window is not unmapped */
-   if (!window->hvwc_map)
+   if (!window->pnv.hvwc_map)
goto unlock;
 
print_reg(s, window, VREG(LPID));
@@ -115,9 +116,10 @@ void vas_window_free_dbgdir(struct vas_window *window)
 
 void vas_window_init_dbgdir(struct vas_window *window)
 {
+   struct vas_instance *vinst = window->pnv.vinst;
struct dentry *d;
 
-   if (!window->vinst->dbgdir)
+   if (!vinst->dbgdir)
return;
 
window->dbgname = kzalloc(16, GFP_KERNEL);
@@ -126,7 

Re: [V3 PATCH 01/16] powerpc/powernv/vas: Rename register/unregister functions

2021-05-10 Thread Haren Myneni
On Mon, 2021-05-10 at 15:10 +1000, Nicholas Piggin wrote:
> Excerpts from Haren Myneni's message of April 18, 2021 7:00 am:
> > powerNV and pseries drivers register / unregister to the
> > corresponding
> > VAS code separately. So rename powerNV VAS API register/unregister
> > functions.
> 
> The pseries VAS driver will have different calls for registering a
> coprocessor driver, you mean?
> 
> It certainly looks the same 
> 
> (from patch 13)
>   ret = vas_register_api_pseries(THIS_MODULE, VAS_COP_TYPE_GZIP,
>  "nx-gzip");
> 
> So I guess it's just a matter of the driver being different enough
> that 
> there is no benefit to making this call common (and branching to
> pseries
> or powernv dynamically).

Thanks for your review and comments on all patches. 

Yes, we have separate drivers nx-common-powernv/nx-common-pseries for
powerNV and pseries. 
The API registration flow is (see the sketch below):
- the driver calls the platform specific API
   vas_register_api_powernv/pseries
- the platform specific code calls the common API with its vas_user_win_ops
   vas_register_coproc_api()
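
A reduced sketch of that layering (the two registration functions follow the
naming used in the series; the driver init function and the ops variable here
are illustrative only):

/* Illustrative call chain only: driver -> platform API -> common API. */
static int __init nx_gzip_pseries_init(void)	/* hypothetical driver init */
{
	return vas_register_api_pseries(THIS_MODULE, VAS_COP_TYPE_GZIP,
					"nx-gzip");
}

int vas_register_api_pseries(struct module *mod, enum vas_cop_type cop_type,
			     const char *name)
{
	/* platform code passes its own vas_user_win_ops to the common code */
	return vas_register_coproc_api(mod, cop_type, name,
				       &vas_pseries_win_ops);	/* hypothetical ops */
}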

> 
> Reviewed-by: Nicholas Piggin 
> 
> > Signed-off-by: Haren Myneni 
> > ---
> >  arch/powerpc/include/asm/vas.h   |  6 +++---
> >  arch/powerpc/platforms/powernv/vas-api.c | 10 +-
> >  drivers/crypto/nx/nx-common-powernv.c|  6 +++---
> >  3 files changed, 11 insertions(+), 11 deletions(-)
> > 
> > diff --git a/arch/powerpc/include/asm/vas.h
> > b/arch/powerpc/include/asm/vas.h
> > index e33f80b0ea81..41f73fae7ab8 100644
> > --- a/arch/powerpc/include/asm/vas.h
> > +++ b/arch/powerpc/include/asm/vas.h
> > @@ -170,8 +170,8 @@ int vas_paste_crb(struct vas_window *win, int
> > offset, bool re);
> >   * Only NX GZIP coprocessor type is supported now, but this API
> > can be
> >   * used for others in future.
> >   */
> > -int vas_register_coproc_api(struct module *mod, enum vas_cop_type
> > cop_type,
> > -   const char *name);
> > -void vas_unregister_coproc_api(void);
> > +int vas_register_api_powernv(struct module *mod, enum vas_cop_type
> > cop_type,
> > +const char *name);
> > +void vas_unregister_api_powernv(void);
> >  
> >  #endif /* __ASM_POWERPC_VAS_H */
> > diff --git a/arch/powerpc/platforms/powernv/vas-api.c
> > b/arch/powerpc/platforms/powernv/vas-api.c
> > index 98ed5d8c5441..72d8ce39e56c 100644
> > --- a/arch/powerpc/platforms/powernv/vas-api.c
> > +++ b/arch/powerpc/platforms/powernv/vas-api.c
> > @@ -207,8 +207,8 @@ static struct file_operations coproc_fops = {
> >   * Supporting only nx-gzip coprocessor type now, but this API code
> >   * extended to other coprocessor types later.
> >   */
> > -int vas_register_coproc_api(struct module *mod, enum vas_cop_type
> > cop_type,
> > -   const char *name)
> > +int vas_register_api_powernv(struct module *mod, enum vas_cop_type
> > cop_type,
> > +const char *name)
> >  {
> > int rc = -EINVAL;
> > dev_t devno;
> > @@ -262,9 +262,9 @@ int vas_register_coproc_api(struct module *mod,
> > enum vas_cop_type cop_type,
> > unregister_chrdev_region(coproc_device.devt, 1);
> > return rc;
> >  }
> > -EXPORT_SYMBOL_GPL(vas_register_coproc_api);
> > +EXPORT_SYMBOL_GPL(vas_register_api_powernv);
> >  
> > -void vas_unregister_coproc_api(void)
> > +void vas_unregister_api_powernv(void)
> >  {
> > dev_t devno;
> >  
> > @@ -275,4 +275,4 @@ void vas_unregister_coproc_api(void)
> > class_destroy(coproc_device.class);
> > unregister_chrdev_region(coproc_device.devt, 1);
> >  }
> > -EXPORT_SYMBOL_GPL(vas_unregister_coproc_api);
> > +EXPORT_SYMBOL_GPL(vas_unregister_api_powernv);
> > diff --git a/drivers/crypto/nx/nx-common-powernv.c
> > b/drivers/crypto/nx/nx-common-powernv.c
> > index 13c65deda8e9..88d728415bb2 100644
> > --- a/drivers/crypto/nx/nx-common-powernv.c
> > +++ b/drivers/crypto/nx/nx-common-powernv.c
> > @@ -1090,8 +1090,8 @@ static __init int
> > nx_compress_powernv_init(void)
> >  * normal FIFO priority is assigned for userspace.
> >  * 842 compression is supported only in kernel.
> >  */
> > -   ret = vas_register_coproc_api(THIS_MODULE,
> > VAS_COP_TYPE_GZIP,
> > -   "nx-gzip");
> > +   ret = vas_register_api_powernv(THIS_MODULE,
> > VAS_COP_TYPE_GZIP,
> > +  "nx-gzip");
> >  
> > /*
> >  * GZIP is not supported in kernel right now.
> > @@ -1127,7 +1127,7 @@ static void __exit
> > nx_compress_powernv_exit(void)
> >  * use. So delete this API use for GZIP engine.
> >  */
> > if (!nx842_ct)
> > -   vas_unregister_coproc_api();
> > +   vas_unregister_api_powernv();
> >  
> > crypto_unregister_alg(_powernv_alg);
> >  
> > -- 
> > 2.18.2
> > 
> > 
> > 



Re: [PATCH 01/11] PCI: Use sysfs_emit() and sysfs_emit_at() in "show" functions

2021-05-10 Thread Logan Gunthorpe



On 2021-05-09 10:14 p.m., Krzysztof Wilczyński wrote:
> The sysfs_emit() and sysfs_emit_at() functions were introduced to make
> it less ambiguous which function is preferred when writing to the output
> buffer in a device attribute's "show" callback [1].
> 
> Convert the PCI sysfs object "show" functions from sprintf(), snprintf()
> and scnprintf() to sysfs_emit() and sysfs_emit_at() accordingly, as the
> latter is aware of the PAGE_SIZE buffer and correctly returns the number
> of bytes written into the buffer.
> 
> No functional change intended.
> 
> [1] Documentation/filesystems/sysfs.rst
> 
> Related to:
>   commit ad025f8e46f3 ("PCI/sysfs: Use sysfs_emit() and sysfs_emit_at() in 
> "show" functions")
> 
> Signed-off-by: Krzysztof Wilczyński 

Thanks, this is a great cleanup. I've reviewed the entire series.

Reviewed-by: Logan Gunthorpe 

I agree that the new lines that are missing should be added.
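
For reference, a minimal sketch of the conversion being described, using a
hypothetical attribute rather than one actually touched by the series:

/* Hypothetical "show" callback: sysfs_emit() knows buf is a full PAGE_SIZE
 * page and returns the number of bytes written into it. */
#include <linux/pci.h>
#include <linux/sysfs.h>

static ssize_t vendor_show(struct device *dev, struct device_attribute *attr,
			   char *buf)
{
	struct pci_dev *pdev = to_pci_dev(dev);

	return sysfs_emit(buf, "0x%04x\n", pdev->vendor);
}
static DEVICE_ATTR_RO(vendor);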

Logan


Re: [PATCH v6 08/15] swiotlb: Bounce data from/to restricted DMA pool if available

2021-05-10 Thread Christoph Hellwig
> +static inline bool is_dev_swiotlb_force(struct device *dev)
> +{
> +#ifdef CONFIG_DMA_RESTRICTED_POOL
> + if (dev->dma_io_tlb_mem)
> + return true;
> +#endif /* CONFIG_DMA_RESTRICTED_POOL */
> + return false;
> +}
> +

>   /* If SWIOTLB is active, use its maximum mapping size */
>   if (is_swiotlb_active(dev) &&
> - (dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE))
> + (dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE ||
> +  is_dev_swiotlb_force(dev)))

This is a mess.  I think the right way is to have an always_bounce flag
in the io_tlb_mem structure instead.  Then the global swiotlb_force can
go away and be replaced with this and the fact that having no
io_tlb_mem structure at all means forced no buffering (after a little
refactoring).
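
A rough sketch of that direction (the field and helper names here are
illustrative, not an existing API):

/* Illustrative only: carry the bounce-forcing policy in the pool itself. */
struct io_tlb_mem {
	unsigned long nslabs;
	bool force_bounce;	/* hypothetical: this pool always bounces */
	/* remaining fields as in today's swiotlb */
};

static inline bool is_swiotlb_force_bounce(struct device *dev)
{
	struct io_tlb_mem *mem = dev->dma_io_tlb_mem;

	return mem && mem->force_bounce;
}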


Re: [PATCH v6 05/15] swiotlb: Add a new get_io_tlb_mem getter

2021-05-10 Thread Christoph Hellwig
> +static inline struct io_tlb_mem *get_io_tlb_mem(struct device *dev)
> +{
> +#ifdef CONFIG_DMA_RESTRICTED_POOL
> + if (dev && dev->dma_io_tlb_mem)
> + return dev->dma_io_tlb_mem;
> +#endif /* CONFIG_DMA_RESTRICTED_POOL */
> +
> + return io_tlb_default_mem;

Given that we're also looking into a not addressing restricted pool
I'd rather always assign the active pool to dev->dma_io_tlb_mem and
do away with this helper.
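
A sketch of what that could look like (purely illustrative; the hook name is
made up):

/* Illustrative only: if every device always carries a valid pointer,
 * callers can use dev->dma_io_tlb_mem directly and the getter goes away. */
static void swiotlb_set_default_mem(struct device *dev)	/* hypothetical hook */
{
	if (!dev->dma_io_tlb_mem)
		dev->dma_io_tlb_mem = io_tlb_default_mem;
}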


Re: [PATCH v6 04/15] swiotlb: Add restricted DMA pool initialization

2021-05-10 Thread Christoph Hellwig
> +#ifdef CONFIG_DMA_RESTRICTED_POOL
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#endif

I don't think any of this belongs into swiotlb.c.  Marking
swiotlb_init_io_tlb_mem non-static and having all this code in a separate
file is probably a better idea.

> +#ifdef CONFIG_DMA_RESTRICTED_POOL
> +static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
> + struct device *dev)
> +{
> + struct io_tlb_mem *mem = rmem->priv;
> + unsigned long nslabs = rmem->size >> IO_TLB_SHIFT;
> +
> + if (dev->dma_io_tlb_mem)
> + return 0;
> +
> + /* Since multiple devices can share the same pool, the private data,
> +  * io_tlb_mem struct, will be initialized by the first device attached
> +  * to it.
> +  */

This is not the normal kernel comment style.

> +#ifdef CONFIG_ARM
> + if (!PageHighMem(pfn_to_page(PHYS_PFN(rmem->base)))) {
> + kfree(mem);
> + return -EINVAL;
> + }
> +#endif /* CONFIG_ARM */

And this is weird.  Why would ARM have such a restriction?  And if we have
such restrictions it absolutely belongs into an arch helper.

> + swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, false);
> +
> + rmem->priv = mem;
> +
> +#ifdef CONFIG_DEBUG_FS
> + if (!debugfs_dir)
> + debugfs_dir = debugfs_create_dir("swiotlb", NULL);
> +
> + swiotlb_create_debugfs(mem, rmem->name, debugfs_dir);

Doesn't the debugfs_create_dir belong into swiotlb_create_debugfs?  Also
please use IS_ENABLED or a stub to avoid ifdefs like this.
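
For example, the debugfs part could lose the #ifdef entirely (sketch only; the
debugfs calls already compile to no-op stubs when CONFIG_DEBUG_FS=n):

/* Sketch: rely on IS_ENABLED() and the debugfs stubs instead of #ifdef. */
if (IS_ENABLED(CONFIG_DEBUG_FS)) {
	if (!debugfs_dir)
		debugfs_dir = debugfs_create_dir("swiotlb", NULL);
	swiotlb_create_debugfs(mem, rmem->name, debugfs_dir);
}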


Re: [PATCH v6 02/15] swiotlb: Refactor swiotlb_create_debugfs

2021-05-10 Thread Christoph Hellwig
Looks good,

Reviewed-by: Christoph Hellwig 


Re: [PATCH v6 01/15] swiotlb: Refactor swiotlb init functions

2021-05-10 Thread Christoph Hellwig
Looks good,

Reviewed-by: Christoph Hellwig 


[PATCH v1 1/1] powerpc/prom_init: Move custom isspace() to its own namespace

2021-05-10 Thread Andy Shevchenko
If for some reason any of the headers includes ctype.h,
we will have a name collision. Avoid this by moving isspace()
to a dedicated namespace.
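
To illustrate the collision (a sketch, not code from this file): prom_init.c
carries its own relocation-safe macros, while linux/ctype.h provides macros of
the same names, so pulling both into one translation unit triggers
redefinition warnings:

/* Sketch only: two definitions of the same name in one translation unit. */
#include <linux/ctype.h>			/* provides isdigit(), ... */
#define isdigit(c) ('0' <= (c) && (c) <= '9')	/* local copy -> redefinition warning */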

First appearance of the code is in the commit cf68787b68a2
("powerpc/prom_init: Evaluate mem kernel parameter for early allocation").

Reported-by: kernel test robot 
Signed-off-by: Andy Shevchenko 
---
 arch/powerpc/kernel/prom_init.c | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 41ed7e33d897..6845cbbc0cd4 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -701,13 +701,13 @@ static int __init prom_setprop(phandle node, const char 
*nodename,
 }
 
 /* We can't use the standard versions because of relocation headaches. */
-#define isxdigit(c)(('0' <= (c) && (c) <= '9') \
-|| ('a' <= (c) && (c) <= 'f') \
-|| ('A' <= (c) && (c) <= 'F'))
+#define prom_isxdigit(c)   (('0' <= (c) && (c) <= '9') \
+|| ('a' <= (c) && (c) <= 'f') \
+|| ('A' <= (c) && (c) <= 'F'))
 
-#define isdigit(c) ('0' <= (c) && (c) <= '9')
-#define islower(c) ('a' <= (c) && (c) <= 'z')
-#define toupper(c) (islower(c) ? ((c) - 'a' + 'A') : (c))
+#define prom_isdigit(c)    ('0' <= (c) && (c) <= '9')
+#define prom_islower(c)    ('a' <= (c) && (c) <= 'z')
+#define prom_toupper(c)    (prom_islower(c) ? ((c) - 'a' + 'A') : (c))
 
 static unsigned long prom_strtoul(const char *cp, const char **endp)
 {
@@ -716,14 +716,14 @@ static unsigned long prom_strtoul(const char *cp, const 
char **endp)
if (*cp == '0') {
base = 8;
cp++;
-   if (toupper(*cp) == 'X') {
+   if (prom_toupper(*cp) == 'X') {
cp++;
base = 16;
}
}
 
-   while (isxdigit(*cp) &&
-  (value = isdigit(*cp) ? *cp - '0' : toupper(*cp) - 'A' + 10) < base) {
+   while (prom_isxdigit(*cp) &&
+  (value = prom_isdigit(*cp) ? *cp - '0' : prom_toupper(*cp) - 'A' + 10) < base) {
result = result * base + value;
cp++;
}
-- 
2.30.2



Re: [RFC PATCH 0/7] Memory hotplug/hotremove at subsection size

2021-05-10 Thread Zi Yan
On 7 May 2021, at 10:00, David Hildenbrand wrote:

> On 07.05.21 13:55, Michal Hocko wrote:
>> [I haven't read through respective patches due to lack of time but let
>>   me comment on the general idea and the underlying justification]
>>
>> On Thu 06-05-21 17:31:09, David Hildenbrand wrote:
>>> On 06.05.21 17:26, Zi Yan wrote:
 From: Zi Yan 

 Hi all,

 This patchset tries to remove the restriction on memory hotplug/hotremove
 granularity, which is always greater or equal to memory section size[1].
 With the patchset, kernel is able to online/offline memory at a size 
 independent
 of memory section size, as small as 2MB (the subsection size).
>>>
>>> ... which doesn't make any sense as we can only online/offline whole memory
>>> block devices.
>>
>> Agreed. The subsection thingy is just a hack to work around pmem
>> alignment problems. For the real memory hotplug it is quite hard to
>> argue for reasonable hotplug scenarios for very small physical memory
>> ranges wrt. to the existing sparsemem memory model.
>>
 The motivation is to increase MAX_ORDER of the buddy allocator and 
 pageblock
 size without increasing memory hotplug/hotremove granularity at the same 
 time,
>>>
>>> Gah, no. Please no. No.
>>
>> Agreed. Those are completely independent concepts. MAX_ORDER can be
>> really arbitrary irrespective of the section size with the vmemmap sparse
>> model. The existing restriction is due to old sparse model not being
>> able to do page pointer arithmetic across memory sections. Is there any
>> reason to stick with that memory model for an advance feature you are
>> working on?

No. I just want to increase MAX_ORDER. If the existing restriction can
be removed, that will be great.

>
> I gave it some more thought yesterday. I guess the first thing we should look 
> into is increasing MAX_ORDER and leaving pageblock_order and section size as 
> is -- finding out what we have to tweak to get that up and running. Once we 
> have that in place, we can actually look into better fragmentation avoidance 
> etc. One step at a time.

It makes sense to me.

>
> Because that change itself might require some thought. Requiring that bigger 
> MAX_ORDER depends on SPARSE_VMEMMAP is something reasonable to do.

OK, if with SPARSE_VMEMMAP MAX_ORDER can be set to be bigger than
SECTION_SIZE, it is perfectly OK to me. Since 1GB THP support, which I
want to add ultimately, will require SPARSE_VMEMMAP too (otherwise,
all page++ will need to be changed to nth_page(page,1)).

>
> As stated somewhere here already, we'll have to look into making 
> alloc_contig_range() (and main users CMA and virtio-mem) independent of 
> MAX_ORDER and mainly rely on pageblock_order. The current handling in 
> alloc_contig_range() is far from optimal as we have to isolate a whole 
> MAX_ORDER - 1 page -- and on ZONE_NORMAL we'll fail easily if any part 
> contains something unmovable although we don't even want to allocate that 
> part. I actually have that on my list (to be able to fully support 
> pageblock_order instead of MAX_ORDER -1 chunks in virtio-mem), however didn't 
> have time to look into it.

So in your mind, for gigantic page allocation (> MAX_ORDER), alloc_contig_range()
should be used instead of the buddy allocator while pageblock_order is kept at a
small granularity like 2MB. Is that the case? Isn’t it going to have a high failure
rate when any of the pageblocks within a gigantic page range (like 1GB) becomes
unmovable?
Are you thinking of an additional mechanism/policy to prevent such a thing happening
as an additional step for gigantic page allocation? Like your ZONE_PREFER_MOVABLE
idea?

>
> Further, page onlining / offlining code and early init code most probably 
> also needs care if MAX_ORDER - 1 crosses sections. Memory holes we might 
> suddenly have in MAX_ORDER - 1 pages might become a problem and will have to 
> be handled. Not sure which other code has to be tweaked (compaction? page 
> isolation?).

Can you elaborate a little more? From what I understand, memory holes mean valid
valid
PFNs are not contiguous before and after a hole, so pfn++ will not work, but
struct pages are still virtually contiguous assuming SPARSE_VMEMMAP, meaning 
page++
would still work. So when MAX_ORDER - 1 crosses sections, additional code would 
be
needed instead of simple pfn++. Is there anything I am missing?

BTW, to test a system with memory holes, do you know if there is an easy way
of adding random memory holes to an x86_64 VM, which could help reveal
potential missing pieces in the code? Changing the BIOS-e820 table might be
one way, but I have no idea how to do it in QEMU.

>
> Figuring out what needs care itself might take quite some effort.
>
> One thing I was thinking about as well: The bigger our MAX_ORDER, the slower 
> it could be to allocate smaller pages. If we have 1G pages, splitting them 
> down to 4k then takes 8 additional steps if I'm not wrong. Of course, that's 
> the worst case. 

[PATCH 1/1] powerpc/pseries/ras: Delete a redundant condition branch

2021-05-10 Thread Zhen Lei
The statement of the last "if (xxx)" branch is the same as that of the "else"
branch. Delete it to simplify the code.

No functional change.

Signed-off-by: Zhen Lei 
---
 arch/powerpc/platforms/pseries/ras.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/ras.c 
b/arch/powerpc/platforms/pseries/ras.c
index 9d4ef65da7f395f..2f636308cf60430 100644
--- a/arch/powerpc/platforms/pseries/ras.c
+++ b/arch/powerpc/platforms/pseries/ras.c
@@ -593,8 +593,6 @@ static int mce_handle_err_virtmode(struct pt_regs *regs,
mce_err.severity = MCE_SEV_SEVERE;
else if (severity == RTAS_SEVERITY_ERROR)
mce_err.severity = MCE_SEV_SEVERE;
-   else if (severity == RTAS_SEVERITY_FATAL)
-   mce_err.severity = MCE_SEV_FATAL;
else
mce_err.severity = MCE_SEV_FATAL;
 
-- 
2.26.0.106.g9fadedd




Re: [PATCH] ppc64/numa: consider the max numa node for migratable LPAR

2021-05-10 Thread Laurent Dufour

Le 10/05/2021 à 12:21, Srikar Dronamraju a écrit :

* Laurent Dufour  [2021-04-29 20:19:01]:


When a LPAR is migratable, we should consider the maximum possible NUMA
node instead the number of NUMA node from the actual system.

The DT property 'ibm,current-associativity-domains' is defining the maximum
number of nodes the LPAR can see when running on that box. But if the LPAR
is being migrated on another box, it may seen up to the nodes defined by
'ibm,max-associativity-domains'. So if a LPAR is migratable, that value
should be used.

Unfortunately, there is no easy way to know if a LPAR is migratable or
not. The hypervisor is exporting the property 'ibm,migratable-partition' in
the case it set to migrate partition, but that would not mean that the
current partition is migratable.

Without that patch, when a LPAR is started on a 2 nodes box and then
migrated to a 3 nodes box, the hypervisor may spread the LPAR's CPUs on the
3rd node. In that case if a CPU from that 3rd node is added to the LPAR, it
will be wrongly assigned to the node because the kernel has been set to use




up to 2 nodes (the configuration of the departure node). With that patch
applies, the CPU is correctly added to the 3rd node.


You probably meant, "With this patch applied"

Also you may want to add a fixes tag:


I'll fix "that" and add the fixes tag.


Cc: Srikar Dronamraju 
Signed-off-by: Laurent Dufour 
---
  arch/powerpc/mm/numa.c | 14 +++---
  1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index f2bf98bdcea2..673fa6e47850 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -893,7 +893,7 @@ static void __init setup_node_data(int nid, u64 start_pfn, 
u64 end_pfn)
  static void __init find_possible_nodes(void)
  {
struct device_node *rtas;
-   const __be32 *domains;
+   const __be32 *domains = NULL;
int prop_length, max_nodes;
u32 i;

@@ -909,9 +909,14 @@ static void __init find_possible_nodes(void)
 * it doesn't exist, then fallback on ibm,max-associativity-domains.
 * Current denotes what the platform can support compared to max
 * which denotes what the Hypervisor can support.
+*
+* If the LPAR is migratable, new nodes might be activated after a LPM,
+* so we should consider the max number in that case.
 */
-   domains = of_get_property(rtas, "ibm,current-associativity-domains",
-   _length);
+   if (!of_get_property(of_root, "ibm,migratable-partition", NULL))
+   domains = of_get_property(rtas,
+ "ibm,current-associativity-domains",
+ _length);
if (!domains) {
domains = of_get_property(rtas, "ibm,max-associativity-domains",
_length);
@@ -920,6 +925,9 @@ static void __init find_possible_nodes(void)
}

max_nodes = of_read_number([min_common_depth], 1);
+   printk(KERN_INFO "Partition configured for %d NUMA nodes.\n",
+  max_nodes);
+


Another nit:
you may want to make this pr_info instead of printk


Sure !


for (i = 0; i < max_nodes; i++) {
if (!node_possible(i))
node_set(i, node_possible_map);
--
2.31.1



Otherwise looks good to me.

Reviewed-by: Srikar Dronamraju 


Thanks Srikar, I'll add your review tag in the v2.




Re: [PATCH 01/11] PCI: Use sysfs_emit() and sysfs_emit_at() in "show" functions

2021-05-10 Thread Krzysztof Wilczyński
[+cc Joe for visibility]

[...]
>   spin_lock(_alignment_lock);
>   if (resource_alignment_param)
> - count = scnprintf(buf, PAGE_SIZE, "%s", 
> resource_alignment_param);
> + count = sysfs_emit(buf, "%s", resource_alignment_param);
>   spin_unlock(_alignment_lock);

Following the work that Joe did recently, see:

  
https://lore.kernel.org/lkml/aa1819fa5faf786573df298e5e2e7d357ba7d4ad.ca...@perches.com/

I think we ought to also add the missing newline to our sysfs_emit() and
sysfs_emit_at() users, like the one above and the following:

  drivers/pci/pci-sysfs.c
  540:  return sysfs_emit(buf, "%pOF", np);

To keep things correct and consistent.
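
As a hedged sketch of the kind of change being suggested (mirroring the hunk
quoted above; the exact function signature in the PCI code may differ):

static ssize_t resource_alignment_show(struct bus_type *bus, char *buf)
{
	ssize_t count = 0;

	spin_lock(&resource_alignment_lock);
	if (resource_alignment_param)
		count = sysfs_emit(buf, "%s\n", resource_alignment_param);
	spin_unlock(&resource_alignment_lock);

	return count;
}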

Bjorn, I can follow-up with a small patch after this one, or send a v2,
or, if that would be OK with you, then you could fix it during merging,
provided you decide to merge things as-is.

Krzysztof


Re: [PATCH] ppc64/numa: consider the max numa node for migratable LPAR

2021-05-10 Thread Srikar Dronamraju
* Laurent Dufour  [2021-04-29 20:19:01]:

> When a LPAR is migratable, we should consider the maximum possible NUMA
> node instead the number of NUMA node from the actual system.
> 
> The DT property 'ibm,current-associativity-domains' is defining the maximum
> number of nodes the LPAR can see when running on that box. But if the LPAR
> is being migrated on another box, it may seen up to the nodes defined by
> 'ibm,max-associativity-domains'. So if a LPAR is migratable, that value
> should be used.
> 
> Unfortunately, there is no easy way to know if a LPAR is migratable or
> not. The hypervisor is exporting the property 'ibm,migratable-partition' in
> the case it set to migrate partition, but that would not mean that the
> current partition is migratable.
> 
> Without that patch, when a LPAR is started on a 2 nodes box and then
> migrated to a 3 nodes box, the hypervisor may spread the LPAR's CPUs on the
> 3rd node. In that case if a CPU from that 3rd node is added to the LPAR, it
> will be wrongly assigned to the node because the kernel has been set to use


> up to 2 nodes (the configuration of the departure node). With that patch
> applies, the CPU is correctly added to the 3rd node.

You probably meant, "With this patch applied"

Also you may want to add a fixes tag:

> Cc: Srikar Dronamraju 
> Signed-off-by: Laurent Dufour 
> ---
>  arch/powerpc/mm/numa.c | 14 +++---
>  1 file changed, 11 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index f2bf98bdcea2..673fa6e47850 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -893,7 +893,7 @@ static void __init setup_node_data(int nid, u64 
> start_pfn, u64 end_pfn)
>  static void __init find_possible_nodes(void)
>  {
>   struct device_node *rtas;
> - const __be32 *domains;
> + const __be32 *domains = NULL;
>   int prop_length, max_nodes;
>   u32 i;
> 
> @@ -909,9 +909,14 @@ static void __init find_possible_nodes(void)
>* it doesn't exist, then fallback on ibm,max-associativity-domains.
>* Current denotes what the platform can support compared to max
>* which denotes what the Hypervisor can support.
> +  *
> +  * If the LPAR is migratable, new nodes might be activated after a LPM,
> +  * so we should consider the max number in that case.
>*/
> - domains = of_get_property(rtas, "ibm,current-associativity-domains",
> - _length);
> + if (!of_get_property(of_root, "ibm,migratable-partition", NULL))
> + domains = of_get_property(rtas,
> +   "ibm,current-associativity-domains",
> +   _length);
>   if (!domains) {
>   domains = of_get_property(rtas, "ibm,max-associativity-domains",
>   _length);
> @@ -920,6 +925,9 @@ static void __init find_possible_nodes(void)
>   }
> 
>   max_nodes = of_read_number([min_common_depth], 1);
> + printk(KERN_INFO "Partition configured for %d NUMA nodes.\n",
> +max_nodes);
> +

Another nit:
you may want to make this pr_info instead of printk

>   for (i = 0; i < max_nodes; i++) {
>   if (!node_possible(i))
>   node_set(i, node_possible_map);
> -- 
> 2.31.1
> 

Otherwise looks good to me.

Reviewed-by: Srikar Dronamraju 

-- 
Thanks and Regards
Srikar Dronamraju


Re: [PATCH v5 00/16] Restricted DMA

2021-05-10 Thread Claire Chang
v6: https://lore.kernel.org/patchwork/cover/1423201/


[PATCH v6 15/15] of: Add plumbing for restricted DMA pool

2021-05-10 Thread Claire Chang
If a device is not behind an IOMMU, we look up the device node and set
up the restricted DMA when a restricted-dma-pool region is present.

Signed-off-by: Claire Chang 
---
 drivers/of/address.c| 25 +
 drivers/of/device.c |  3 +++
 drivers/of/of_private.h |  5 +
 3 files changed, 33 insertions(+)

diff --git a/drivers/of/address.c b/drivers/of/address.c
index aca94c348bd4..c562a9ff5f0b 100644
--- a/drivers/of/address.c
+++ b/drivers/of/address.c
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1112,6 +1113,30 @@ bool of_dma_is_coherent(struct device_node *np)
 }
 EXPORT_SYMBOL_GPL(of_dma_is_coherent);
 
+int of_dma_set_restricted_buffer(struct device *dev)
+{
+   struct device_node *node;
+   int count, i;
+
+   if (!dev->of_node)
+   return 0;
+
+   count = of_property_count_elems_of_size(dev->of_node, "memory-region",
+   sizeof(phandle));
+   for (i = 0; i < count; i++) {
+   node = of_parse_phandle(dev->of_node, "memory-region", i);
+   /* There might be multiple memory regions, but only one
+* restriced-dma-pool region is allowed.
+*/
+   if (of_device_is_compatible(node, "restricted-dma-pool") &&
+   of_device_is_available(node))
+   return of_reserved_mem_device_init_by_idx(
+   dev, dev->of_node, i);
+   }
+
+   return 0;
+}
+
 /**
  * of_mmio_is_nonposted - Check if device uses non-posted MMIO
  * @np:device node
diff --git a/drivers/of/device.c b/drivers/of/device.c
index c5a9473a5fb1..d8d865223e51 100644
--- a/drivers/of/device.c
+++ b/drivers/of/device.c
@@ -165,6 +165,9 @@ int of_dma_configure_id(struct device *dev, struct 
device_node *np,
 
arch_setup_dma_ops(dev, dma_start, size, iommu, coherent);
 
+   if (!iommu)
+   return of_dma_set_restricted_buffer(dev);
+
return 0;
 }
 EXPORT_SYMBOL_GPL(of_dma_configure_id);
diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h
index d717efbd637d..e9237f5eff48 100644
--- a/drivers/of/of_private.h
+++ b/drivers/of/of_private.h
@@ -163,12 +163,17 @@ struct bus_dma_region;
 #if defined(CONFIG_OF_ADDRESS) && defined(CONFIG_HAS_DMA)
 int of_dma_get_range(struct device_node *np,
const struct bus_dma_region **map);
+int of_dma_set_restricted_buffer(struct device *dev);
 #else
 static inline int of_dma_get_range(struct device_node *np,
const struct bus_dma_region **map)
 {
return -ENODEV;
 }
+static inline int of_dma_get_restricted_buffer(struct device *dev)
+{
+   return -ENODEV;
+}
 #endif
 
 #endif /* _LINUX_OF_PRIVATE_H */
-- 
2.31.1.607.g51e8a6a459-goog



[PATCH v6 14/15] dt-bindings: of: Add restricted DMA pool

2021-05-10 Thread Claire Chang
Introduce the new compatible string, restricted-dma-pool, for restricted
DMA. One can specify the address and length of the restricted DMA memory
region by restricted-dma-pool in the reserved-memory node.

Signed-off-by: Claire Chang 
---
 .../reserved-memory/reserved-memory.txt   | 27 +++
 1 file changed, 27 insertions(+)

diff --git 
a/Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt 
b/Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt
index e8d3096d922c..284aea659015 100644
--- a/Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt
+++ b/Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt
@@ -51,6 +51,23 @@ compatible (optional) - standard definition
   used as a shared pool of DMA buffers for a set of devices. It can
   be used by an operating system to instantiate the necessary pool
   management subsystem if necessary.
+- restricted-dma-pool: This indicates a region of memory meant to be
+  used as a pool of restricted DMA buffers for a set of devices. The
+  memory region would be the only region accessible to those devices.
+  When using this, the no-map and reusable properties must not be set,
+  so the operating system can create a virtual mapping that will be 
used
+  for synchronization. The main purpose for restricted DMA is to
+  mitigate the lack of DMA access control on systems without an IOMMU,
+  which could result in the DMA accessing the system memory at
+  unexpected times and/or unexpected addresses, possibly leading to 
data
+  leakage or corruption. The feature on its own provides a basic level
+  of protection against the DMA overwriting buffer contents at
+  unexpected times. However, to protect against general data leakage 
and
+  system memory corruption, the system needs to provide way to lock 
down
+  the memory access, e.g., MPU. Note that since coherent allocation
+  needs remapping, one must set up another device coherent pool by
+  shared-dma-pool and use dma_alloc_from_dev_coherent instead for 
atomic
+  coherent allocation.
 - vendor specific string in the form ,[-]
 no-map (optional) - empty property
 - Indicates the operating system must not create a virtual mapping
@@ -120,6 +137,11 @@ one for multimedia processing (named 
multimedia-memory@7700, 64MiB).
compatible = "acme,multimedia-memory";
reg = <0x7700 0x400>;
};
+
+   restricted_dma_mem_reserved: restricted_dma_mem_reserved {
+   compatible = "restricted-dma-pool";
+   reg = <0x5000 0x40>;
+   };
};
 
/* ... */
@@ -138,4 +160,9 @@ one for multimedia processing (named 
multimedia-memory@7700, 64MiB).
memory-region = <_reserved>;
/* ... */
};
+
+   pcie_device: pcie_device@0,0 {
+   memory-region = <_dma_mem_reserved>;
+   /* ... */
+   };
 };
-- 
2.31.1.607.g51e8a6a459-goog



[PATCH v6 13/15] dma-direct: Allocate memory from restricted DMA pool if available

2021-05-10 Thread Claire Chang
The restricted DMA pool is preferred if available.

The restricted DMA pools provide a basic level of protection against the
DMA overwriting buffer contents at unexpected times. However, to protect
against general data leakage and system memory corruption, the system
needs to provide a way to lock down the memory access, e.g., MPU.

Note that since coherent allocation needs remapping, one must set up
another device coherent pool by shared-dma-pool and use
dma_alloc_from_dev_coherent instead for atomic coherent allocation.

Signed-off-by: Claire Chang 
---
 kernel/dma/direct.c | 38 +-
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index eb4098323bbc..0d521f78c7b9 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -78,6 +78,10 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t 
phys, size_t size)
 static void __dma_direct_free_pages(struct device *dev, struct page *page,
size_t size)
 {
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+   if (swiotlb_free(dev, page, size))
+   return;
+#endif
dma_free_contiguous(dev, page, size);
 }
 
@@ -92,7 +96,17 @@ static struct page *__dma_direct_alloc_pages(struct device 
*dev, size_t size,
 
gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
   _limit);
-   page = dma_alloc_contiguous(dev, size, gfp);
+
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+   page = swiotlb_alloc(dev, size);
+   if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
+   __dma_direct_free_pages(dev, page, size);
+   page = NULL;
+   }
+#endif
+
+   if (!page)
+   page = dma_alloc_contiguous(dev, size, gfp);
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
dma_free_contiguous(dev, page, size);
page = NULL;
@@ -148,7 +162,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
gfp |= __GFP_NOWARN;
 
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
-   !force_dma_unencrypted(dev)) {
+   !force_dma_unencrypted(dev) && !is_dev_swiotlb_force(dev)) {
page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
if (!page)
return NULL;
@@ -161,18 +175,23 @@ void *dma_direct_alloc(struct device *dev, size_t size,
}
 
if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
-   !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
-   !dev_is_dma_coherent(dev))
+   !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev) &&
+   !is_dev_swiotlb_force(dev))
return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
 
/*
 * Remapping or decrypting memory may block. If either is required and
 * we can't block, allocate the memory from the atomic pools.
+* If restricted DMA (i.e., is_dev_swiotlb_force) is required, one must
+* set up another device coherent pool by shared-dma-pool and use
+* dma_alloc_from_dev_coherent instead.
 */
if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
!gfpflags_allow_blocking(gfp) &&
(force_dma_unencrypted(dev) ||
-(IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && 
!dev_is_dma_coherent(dev
+(IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
+ !dev_is_dma_coherent(dev))) &&
+   !is_dev_swiotlb_force(dev))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
 
/* we always manually zero the memory once we are done */
@@ -253,15 +272,15 @@ void dma_direct_free(struct device *dev, size_t size,
unsigned int page_order = get_order(size);
 
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
-   !force_dma_unencrypted(dev)) {
+   !force_dma_unencrypted(dev) && !is_dev_swiotlb_force(dev)) {
/* cpu_addr is a struct page cookie, not a kernel address */
dma_free_contiguous(dev, cpu_addr, size);
return;
}
 
if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
-   !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
-   !dev_is_dma_coherent(dev)) {
+   !IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && !dev_is_dma_coherent(dev) &&
+   !is_dev_swiotlb_force(dev)) {
arch_dma_free(dev, size, cpu_addr, dma_addr, attrs);
return;
}
@@ -289,7 +308,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, 
size_t size,
void *ret;
 
if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
-   force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp))
+   force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp) &&
+   !is_dev_swiotlb_force(dev))
return dma_direct_alloc_from_pool(dev, size, dma_handle, 

[PATCH v6 12/15] swiotlb: Add restricted DMA alloc/free support.

2021-05-10 Thread Claire Chang
Add the functions swiotlb_{alloc,free}() to support memory allocation
from the restricted DMA pool.

Signed-off-by: Claire Chang 
---
 include/linux/swiotlb.h |  4 
 kernel/dma/swiotlb.c| 35 +--
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 0c5a18d9cf89..e8cf49bd90c5 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -134,6 +134,10 @@ unsigned int swiotlb_max_segment(void);
 size_t swiotlb_max_mapping_size(struct device *dev);
 bool is_swiotlb_active(struct device *dev);
 void __init swiotlb_adjust_size(unsigned long size);
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+struct page *swiotlb_alloc(struct device *dev, size_t size);
+bool swiotlb_free(struct device *dev, struct page *page, size_t size);
+#endif /* CONFIG_DMA_RESTRICTED_POOL */
 #else
 #define swiotlb_force SWIOTLB_NO_FORCE
 static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index fa11787e4b95..d99d5f902259 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -457,8 +457,9 @@ static int find_slots(struct device *dev, phys_addr_t 
orig_addr,
 
index = wrap = wrap_index(mem, ALIGN(mem->index, stride));
do {
-   if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) !=
-   (orig_addr & iotlb_align_mask)) {
+   if (orig_addr &&
+   (slot_addr(tbl_dma_addr, index) & iotlb_align_mask) !=
+   (orig_addr & iotlb_align_mask)) {
index = wrap_index(mem, index + 1);
continue;
}
@@ -701,6 +702,36 @@ late_initcall(swiotlb_create_default_debugfs);
 #endif
 
 #ifdef CONFIG_DMA_RESTRICTED_POOL
+struct page *swiotlb_alloc(struct device *dev, size_t size)
+{
+   struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
+   phys_addr_t tlb_addr;
+   int index;
+
+   if (!mem)
+   return NULL;
+
+   index = find_slots(dev, 0, size);
+   if (index == -1)
+   return NULL;
+
+   tlb_addr = slot_addr(mem->start, index);
+
+   return pfn_to_page(PFN_DOWN(tlb_addr));
+}
+
+bool swiotlb_free(struct device *dev, struct page *page, size_t size)
+{
+   phys_addr_t tlb_addr = page_to_phys(page);
+
+   if (!is_swiotlb_buffer(dev, tlb_addr))
+   return false;
+
+   release_slots(dev, tlb_addr);
+
+   return true;
+}
+
 static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
struct device *dev)
 {
-- 
2.31.1.607.g51e8a6a459-goog



[PATCH v6 11/15] dma-direct: Add a new wrapper __dma_direct_free_pages()

2021-05-10 Thread Claire Chang
Add a new wrapper __dma_direct_free_pages() that will be useful later
for swiotlb_free().

Signed-off-by: Claire Chang 
---
 kernel/dma/direct.c | 14 ++
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 078f7087e466..eb4098323bbc 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -75,6 +75,12 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t 
phys, size_t size)
min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
 }
 
+static void __dma_direct_free_pages(struct device *dev, struct page *page,
+   size_t size)
+{
+   dma_free_contiguous(dev, page, size);
+}
+
 static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
gfp_t gfp)
 {
@@ -237,7 +243,7 @@ void *dma_direct_alloc(struct device *dev, size_t size,
return NULL;
}
 out_free_pages:
-   dma_free_contiguous(dev, page, size);
+   __dma_direct_free_pages(dev, page, size);
return NULL;
 }
 
@@ -273,7 +279,7 @@ void dma_direct_free(struct device *dev, size_t size,
else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
arch_dma_clear_uncached(cpu_addr, size);
 
-   dma_free_contiguous(dev, dma_direct_to_page(dev, dma_addr), size);
+   __dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size);
 }
 
 struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
@@ -310,7 +316,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, 
size_t size,
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
return page;
 out_free_pages:
-   dma_free_contiguous(dev, page, size);
+   __dma_direct_free_pages(dev, page, size);
return NULL;
 }
 
@@ -329,7 +335,7 @@ void dma_direct_free_pages(struct device *dev, size_t size,
if (force_dma_unencrypted(dev))
set_memory_encrypted((unsigned long)vaddr, 1 << page_order);
 
-   dma_free_contiguous(dev, page, size);
+   __dma_direct_free_pages(dev, page, size);
 }
 
 #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \
-- 
2.31.1.607.g51e8a6a459-goog



[PATCH v6 10/15] swiotlb: Refactor swiotlb_tbl_unmap_single

2021-05-10 Thread Claire Chang
Add a new function, release_slots, to make the code reusable for supporting
different bounce buffer pools, e.g. restricted DMA pool.

Signed-off-by: Claire Chang 
---
 kernel/dma/swiotlb.c | 35 ---
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index ef04d8f7708f..fa11787e4b95 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -550,27 +550,15 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, 
phys_addr_t orig_addr,
return tlb_addr;
 }
 
-/*
- * tlb_addr is the physical address of the bounce buffer to unmap.
- */
-void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr,
- size_t mapping_size, enum dma_data_direction dir,
- unsigned long attrs)
+static void release_slots(struct device *dev, phys_addr_t tlb_addr)
 {
-   struct io_tlb_mem *mem = get_io_tlb_mem(hwdev);
+   struct io_tlb_mem *mem = get_io_tlb_mem(dev);
unsigned long flags;
-   unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr);
+   unsigned int offset = swiotlb_align_offset(dev, tlb_addr);
int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
int nslots = nr_slots(mem->slots[index].alloc_size + offset);
int count, i;
 
-   /*
-* First, sync the memory before unmapping the entry
-*/
-   if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
-   (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
-   swiotlb_bounce(hwdev, tlb_addr, mapping_size, DMA_FROM_DEVICE);
-
/*
 * Return the buffer to the free list by setting the corresponding
 * entries to indicate the number of contiguous entries available.
@@ -605,6 +593,23 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, 
phys_addr_t tlb_addr,
spin_unlock_irqrestore(>lock, flags);
 }
 
+/*
+ * tlb_addr is the physical address of the bounce buffer to unmap.
+ */
+void swiotlb_tbl_unmap_single(struct device *dev, phys_addr_t tlb_addr,
+ size_t mapping_size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+   /*
+* First, sync the memory before unmapping the entry
+*/
+   if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
+   (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL))
+   swiotlb_bounce(dev, tlb_addr, mapping_size, DMA_FROM_DEVICE);
+
+   release_slots(dev, tlb_addr);
+}
+
 void swiotlb_sync_single_for_device(struct device *dev, phys_addr_t tlb_addr,
size_t size, enum dma_data_direction dir)
 {
-- 
2.31.1.607.g51e8a6a459-goog



[PATCH v6 09/15] swiotlb: Move alloc_size to find_slots

2021-05-10 Thread Claire Chang
Move the maintenance of alloc_size to find_slots for better code
reusability later.

Signed-off-by: Claire Chang 
---
 kernel/dma/swiotlb.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 3f1ad080a4bc..ef04d8f7708f 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -482,8 +482,11 @@ static int find_slots(struct device *dev, phys_addr_t 
orig_addr,
return -1;
 
 found:
-   for (i = index; i < index + nslots; i++)
+   for (i = index; i < index + nslots; i++) {
mem->slots[i].list = 0;
+   mem->slots[i].alloc_size =
+   alloc_size - ((i - index) << IO_TLB_SHIFT);
+   }
for (i = index - 1;
 io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 &&
 mem->slots[i].list; i--)
@@ -538,11 +541,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, 
phys_addr_t orig_addr,
 * This is needed when we sync the memory.  Then we sync the buffer if
 * needed.
 */
-   for (i = 0; i < nr_slots(alloc_size + offset); i++) {
+   for (i = 0; i < nr_slots(alloc_size + offset); i++)
mem->slots[index + i].orig_addr = slot_addr(orig_addr, i);
-   mem->slots[index + i].alloc_size =
-   alloc_size - (i << IO_TLB_SHIFT);
-   }
tlb_addr = slot_addr(mem->start, index) + offset;
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
(dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
-- 
2.31.1.607.g51e8a6a459-goog



[PATCH v6 08/15] swiotlb: Bounce data from/to restricted DMA pool if available

2021-05-10 Thread Claire Chang
Regardless of swiotlb setting, the restricted DMA pool is preferred if
available.

The restricted DMA pools provide a basic level of protection against the
DMA overwriting buffer contents at unexpected times. However, to protect
against general data leakage and system memory corruption, the system
needs to provide a way to lock down the memory access, e.g., MPU.

Note that is_dev_swiotlb_force doesn't check if
swiotlb_force == SWIOTLB_FORCE. Otherwise the memory allocation behavior
with default swiotlb will be changed by the following patch
("dma-direct: Allocate memory from restricted DMA pool if available").

Signed-off-by: Claire Chang 
---
 include/linux/swiotlb.h | 13 +
 kernel/dma/direct.c |  3 ++-
 kernel/dma/direct.h |  3 ++-
 kernel/dma/swiotlb.c|  8 
 4 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index c530c976d18b..0c5a18d9cf89 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -120,6 +120,15 @@ static inline bool is_swiotlb_buffer(struct device *dev, 
phys_addr_t paddr)
return mem && paddr >= mem->start && paddr < mem->end;
 }
 
+static inline bool is_dev_swiotlb_force(struct device *dev)
+{
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+   if (dev->dma_io_tlb_mem)
+   return true;
+#endif /* CONFIG_DMA_RESTRICTED_POOL */
+   return false;
+}
+
 void __init swiotlb_exit(void);
 unsigned int swiotlb_max_segment(void);
 size_t swiotlb_max_mapping_size(struct device *dev);
@@ -131,6 +140,10 @@ static inline bool is_swiotlb_buffer(struct device *dev, 
phys_addr_t paddr)
 {
return false;
 }
+static inline bool is_dev_swiotlb_force(struct device *dev)
+{
+   return false;
+}
 static inline void swiotlb_exit(void)
 {
 }
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 7a88c34d0867..078f7087e466 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -496,7 +496,8 @@ size_t dma_direct_max_mapping_size(struct device *dev)
 {
/* If SWIOTLB is active, use its maximum mapping size */
if (is_swiotlb_active(dev) &&
-   (dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE))
+   (dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE ||
+is_dev_swiotlb_force(dev)))
return swiotlb_max_mapping_size(dev);
return SIZE_MAX;
 }
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index 13e9e7158d94..f94813674e23 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -87,7 +87,8 @@ static inline dma_addr_t dma_direct_map_page(struct device 
*dev,
phys_addr_t phys = page_to_phys(page) + offset;
dma_addr_t dma_addr = phys_to_dma(dev, phys);
 
-   if (unlikely(swiotlb_force == SWIOTLB_FORCE))
+   if (unlikely(swiotlb_force == SWIOTLB_FORCE) ||
+   is_dev_swiotlb_force(dev))
return swiotlb_map(dev, phys, size, dir, attrs);
 
if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 81bed3d2c771..3f1ad080a4bc 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -347,7 +347,7 @@ void __init swiotlb_exit(void)
 static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t 
size,
   enum dma_data_direction dir)
 {
-   struct io_tlb_mem *mem = io_tlb_default_mem;
+   struct io_tlb_mem *mem = get_io_tlb_mem(dev);
int index = (tlb_addr - mem->start) >> IO_TLB_SHIFT;
phys_addr_t orig_addr = mem->slots[index].orig_addr;
size_t alloc_size = mem->slots[index].alloc_size;
@@ -429,7 +429,7 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, 
unsigned int index)
 static int find_slots(struct device *dev, phys_addr_t orig_addr,
size_t alloc_size)
 {
-   struct io_tlb_mem *mem = io_tlb_default_mem;
+   struct io_tlb_mem *mem = get_io_tlb_mem(dev);
unsigned long boundary_mask = dma_get_seg_boundary(dev);
dma_addr_t tbl_dma_addr =
phys_to_dma_unencrypted(dev, mem->start) & boundary_mask;
@@ -506,7 +506,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, 
phys_addr_t orig_addr,
size_t mapping_size, size_t alloc_size,
enum dma_data_direction dir, unsigned long attrs)
 {
-   struct io_tlb_mem *mem = io_tlb_default_mem;
+   struct io_tlb_mem *mem = get_io_tlb_mem(dev);
unsigned int offset = swiotlb_align_offset(dev, orig_addr);
unsigned int i;
int index;
@@ -557,7 +557,7 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, 
phys_addr_t tlb_addr,
  size_t mapping_size, enum dma_data_direction dir,
  unsigned long attrs)
 {
-   struct io_tlb_mem *mem = io_tlb_default_mem;
+   struct io_tlb_mem *mem = get_io_tlb_mem(hwdev);
unsigned long flags;
unsigned int offset = 

[PATCH v6 07/15] swiotlb: Update is_swiotlb_active to add a struct device argument

2021-05-10 Thread Claire Chang
Update is_swiotlb_active to add a struct device argument. This will be
useful later to allow for restricted DMA pool.

Signed-off-by: Claire Chang 
---
 drivers/gpu/drm/i915/gem/i915_gem_internal.c | 2 +-
 drivers/gpu/drm/nouveau/nouveau_ttm.c| 2 +-
 drivers/pci/xen-pcifront.c   | 2 +-
 include/linux/swiotlb.h  | 4 ++--
 kernel/dma/direct.c  | 2 +-
 kernel/dma/swiotlb.c | 4 ++--
 6 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_internal.c 
b/drivers/gpu/drm/i915/gem/i915_gem_internal.c
index ce6b664b10aa..7d48c433446b 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_internal.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_internal.c
@@ -42,7 +42,7 @@ static int i915_gem_object_get_pages_internal(struct 
drm_i915_gem_object *obj)
 
max_order = MAX_ORDER;
 #ifdef CONFIG_SWIOTLB
-   if (is_swiotlb_active()) {
+   if (is_swiotlb_active(NULL)) {
unsigned int max_segment;
 
max_segment = swiotlb_max_segment();
diff --git a/drivers/gpu/drm/nouveau/nouveau_ttm.c 
b/drivers/gpu/drm/nouveau/nouveau_ttm.c
index e8b506a6685b..2a2ae6d6cf6d 100644
--- a/drivers/gpu/drm/nouveau/nouveau_ttm.c
+++ b/drivers/gpu/drm/nouveau/nouveau_ttm.c
@@ -321,7 +321,7 @@ nouveau_ttm_init(struct nouveau_drm *drm)
}
 
 #if IS_ENABLED(CONFIG_SWIOTLB) && IS_ENABLED(CONFIG_X86)
-   need_swiotlb = is_swiotlb_active();
+   need_swiotlb = is_swiotlb_active(NULL);
 #endif
 
ret = ttm_device_init(>ttm.bdev, _bo_driver, drm->dev->dev,
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index b7a8f3a1921f..6d548ce53ce7 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -693,7 +693,7 @@ static int pcifront_connect_and_init_dma(struct 
pcifront_device *pdev)
 
spin_unlock(_dev_lock);
 
-   if (!err && !is_swiotlb_active()) {
+   if (!err && !is_swiotlb_active(NULL)) {
err = pci_xen_swiotlb_init_late();
if (err)
dev_err(>xdev->dev, "Could not setup SWIOTLB!\n");
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 2a6cca07540b..c530c976d18b 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -123,7 +123,7 @@ static inline bool is_swiotlb_buffer(struct device *dev, 
phys_addr_t paddr)
 void __init swiotlb_exit(void);
 unsigned int swiotlb_max_segment(void);
 size_t swiotlb_max_mapping_size(struct device *dev);
-bool is_swiotlb_active(void);
+bool is_swiotlb_active(struct device *dev);
 void __init swiotlb_adjust_size(unsigned long size);
 #else
 #define swiotlb_force SWIOTLB_NO_FORCE
@@ -143,7 +143,7 @@ static inline size_t swiotlb_max_mapping_size(struct device 
*dev)
return SIZE_MAX;
 }
 
-static inline bool is_swiotlb_active(void)
+static inline bool is_swiotlb_active(struct device *dev)
 {
return false;
 }
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 84c9feb5474a..7a88c34d0867 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -495,7 +495,7 @@ int dma_direct_supported(struct device *dev, u64 mask)
 size_t dma_direct_max_mapping_size(struct device *dev)
 {
/* If SWIOTLB is active, use its maximum mapping size */
-   if (is_swiotlb_active() &&
+   if (is_swiotlb_active(dev) &&
(dma_addressing_limited(dev) || swiotlb_force == SWIOTLB_FORCE))
return swiotlb_max_mapping_size(dev);
return SIZE_MAX;
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 4ea027b75013..81bed3d2c771 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -662,9 +662,9 @@ size_t swiotlb_max_mapping_size(struct device *dev)
return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE;
 }
 
-bool is_swiotlb_active(void)
+bool is_swiotlb_active(struct device *dev)
 {
-   return io_tlb_default_mem != NULL;
+   return get_io_tlb_mem(dev) != NULL;
 }
 EXPORT_SYMBOL_GPL(is_swiotlb_active);
 
-- 
2.31.1.607.g51e8a6a459-goog



[PATCH v6 06/15] swiotlb: Update is_swiotlb_buffer to add a struct device argument

2021-05-10 Thread Claire Chang
Update is_swiotlb_buffer to add a struct device argument. This will be
useful later to allow for restricted DMA pool.

Signed-off-by: Claire Chang 
---
 drivers/iommu/dma-iommu.c | 12 ++--
 drivers/xen/swiotlb-xen.c |  2 +-
 include/linux/swiotlb.h   |  6 +++---
 kernel/dma/direct.c   |  6 +++---
 kernel/dma/direct.h   |  6 +++---
 5 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 7bcdd1205535..a5df35bfd150 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -504,7 +504,7 @@ static void __iommu_dma_unmap_swiotlb(struct device *dev, 
dma_addr_t dma_addr,
 
__iommu_dma_unmap(dev, dma_addr, size);
 
-   if (unlikely(is_swiotlb_buffer(phys)))
+   if (unlikely(is_swiotlb_buffer(dev, phys)))
swiotlb_tbl_unmap_single(dev, phys, size, dir, attrs);
 }
 
@@ -575,7 +575,7 @@ static dma_addr_t __iommu_dma_map_swiotlb(struct device 
*dev, phys_addr_t phys,
}
 
iova = __iommu_dma_map(dev, phys, aligned_size, prot, dma_mask);
-   if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(phys))
+   if (iova == DMA_MAPPING_ERROR && is_swiotlb_buffer(dev, phys))
swiotlb_tbl_unmap_single(dev, phys, org_size, dir, attrs);
return iova;
 }
@@ -781,7 +781,7 @@ static void iommu_dma_sync_single_for_cpu(struct device 
*dev,
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu(phys, size, dir);
 
-   if (is_swiotlb_buffer(phys))
+   if (is_swiotlb_buffer(dev, phys))
swiotlb_sync_single_for_cpu(dev, phys, size, dir);
 }
 
@@ -794,7 +794,7 @@ static void iommu_dma_sync_single_for_device(struct device 
*dev,
return;
 
phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
-   if (is_swiotlb_buffer(phys))
+   if (is_swiotlb_buffer(dev, phys))
swiotlb_sync_single_for_device(dev, phys, size, dir);
 
if (!dev_is_dma_coherent(dev))
@@ -815,7 +815,7 @@ static void iommu_dma_sync_sg_for_cpu(struct device *dev,
if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu(sg_phys(sg), sg->length, dir);
 
-   if (is_swiotlb_buffer(sg_phys(sg)))
+   if (is_swiotlb_buffer(dev, sg_phys(sg)))
swiotlb_sync_single_for_cpu(dev, sg_phys(sg),
sg->length, dir);
}
@@ -832,7 +832,7 @@ static void iommu_dma_sync_sg_for_device(struct device *dev,
return;
 
for_each_sg(sgl, sg, nelems, i) {
-   if (is_swiotlb_buffer(sg_phys(sg)))
+   if (is_swiotlb_buffer(dev, sg_phys(sg)))
swiotlb_sync_single_for_device(dev, sg_phys(sg),
   sg->length, dir);
 
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 4c89afc0df62..0c6ed09f8513 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -100,7 +100,7 @@ static int is_xen_swiotlb_buffer(struct device *dev, 
dma_addr_t dma_addr)
 * in our domain. Therefore _only_ check address within our domain.
 */
if (pfn_valid(PFN_DOWN(paddr)))
-   return is_swiotlb_buffer(paddr);
+   return is_swiotlb_buffer(dev, paddr);
return 0;
 }
 
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index b469f04cca26..2a6cca07540b 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -113,9 +113,9 @@ static inline struct io_tlb_mem *get_io_tlb_mem(struct 
device *dev)
return io_tlb_default_mem;
 }
 
-static inline bool is_swiotlb_buffer(phys_addr_t paddr)
+static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
 {
-   struct io_tlb_mem *mem = io_tlb_default_mem;
+   struct io_tlb_mem *mem = get_io_tlb_mem(dev);
 
return mem && paddr >= mem->start && paddr < mem->end;
 }
@@ -127,7 +127,7 @@ bool is_swiotlb_active(void);
 void __init swiotlb_adjust_size(unsigned long size);
 #else
 #define swiotlb_force SWIOTLB_NO_FORCE
-static inline bool is_swiotlb_buffer(phys_addr_t paddr)
+static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
 {
return false;
 }
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index f737e3347059..84c9feb5474a 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -343,7 +343,7 @@ void dma_direct_sync_sg_for_device(struct device *dev,
for_each_sg(sgl, sg, nents, i) {
phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
 
-   if (unlikely(is_swiotlb_buffer(paddr)))
+   if (unlikely(is_swiotlb_buffer(dev, paddr)))
swiotlb_sync_single_for_device(dev, paddr, sg->length,
   dir);
 
@@ -369,7 +369,7 @@ void dma_direct_sync_sg_for_cpu(struct device 

[PATCH v6 05/15] swiotlb: Add a new get_io_tlb_mem getter

2021-05-10 Thread Claire Chang
Add a new getter, get_io_tlb_mem, to help select the io_tlb_mem struct.
The restricted DMA pool is preferred if available.

Signed-off-by: Claire Chang 
---
 include/linux/swiotlb.h | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 03ad6e3b4056..b469f04cca26 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -2,6 +2,7 @@
 #ifndef __LINUX_SWIOTLB_H
 #define __LINUX_SWIOTLB_H
 
+#include 
 #include 
 #include 
 #include 
@@ -102,6 +103,16 @@ struct io_tlb_mem {
 };
 extern struct io_tlb_mem *io_tlb_default_mem;
 
+static inline struct io_tlb_mem *get_io_tlb_mem(struct device *dev)
+{
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+   if (dev && dev->dma_io_tlb_mem)
+   return dev->dma_io_tlb_mem;
+#endif /* CONFIG_DMA_RESTRICTED_POOL */
+
+   return io_tlb_default_mem;
+}
+
 static inline bool is_swiotlb_buffer(phys_addr_t paddr)
 {
struct io_tlb_mem *mem = io_tlb_default_mem;
-- 
2.31.1.607.g51e8a6a459-goog



[PATCH v6 04/15] swiotlb: Add restricted DMA pool initialization

2021-05-10 Thread Claire Chang
Add the initialization function to create restricted DMA pools from
matching reserved-memory nodes.

Signed-off-by: Claire Chang 
---
 include/linux/device.h  |  4 +++
 include/linux/swiotlb.h |  3 +-
 kernel/dma/swiotlb.c| 79 +
 3 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/include/linux/device.h b/include/linux/device.h
index 38a2071cf776..4987608ea4ff 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -416,6 +416,7 @@ struct dev_links_info {
  * @dma_pools: Dma pools (if dma'ble device).
  * @dma_mem:   Internal for coherent mem override.
  * @cma_area:  Contiguous memory area for dma allocations
+ * @dma_io_tlb_mem: Internal for swiotlb io_tlb_mem override.
  * @archdata:  For arch-specific additions.
  * @of_node:   Associated device tree node.
  * @fwnode:Associated device node supplied by platform firmware.
@@ -521,6 +522,9 @@ struct device {
 #ifdef CONFIG_DMA_CMA
struct cma *cma_area;   /* contiguous memory area for dma
   allocations */
+#endif
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+   struct io_tlb_mem *dma_io_tlb_mem;
 #endif
/* arch specific additions */
struct dev_archdata archdata;
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 216854a5e513..03ad6e3b4056 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -72,7 +72,8 @@ extern enum swiotlb_force swiotlb_force;
  * range check to see if the memory was in fact allocated by this
  * API.
  * @nslabs:The number of IO TLB blocks (in groups of 64) between @start and
- * @end. This is command line adjustable via setup_io_tlb_npages.
+ * @end. For default swiotlb, this is command line adjustable via
+ * setup_io_tlb_npages.
  * @used:  The number of used IO TLB block.
  * @list:  The free list describing the number of free entries available
  * from each index.
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 858475bd6923..4ea027b75013 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -39,6 +39,13 @@
 #ifdef CONFIG_DEBUG_FS
 #include 
 #endif
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+#include 
+#include 
+#include 
+#include 
+#include 
+#endif
 
 #include 
 #include 
@@ -687,3 +694,75 @@ static int __init swiotlb_create_default_debugfs(void)
 late_initcall(swiotlb_create_default_debugfs);
 
 #endif
+
+#ifdef CONFIG_DMA_RESTRICTED_POOL
+static int rmem_swiotlb_device_init(struct reserved_mem *rmem,
+   struct device *dev)
+{
+   struct io_tlb_mem *mem = rmem->priv;
+   unsigned long nslabs = rmem->size >> IO_TLB_SHIFT;
+
+   if (dev->dma_io_tlb_mem)
+   return 0;
+
+   /* Since multiple devices can share the same pool, the private data,
+* io_tlb_mem struct, will be initialized by the first device attached
+* to it.
+*/
+   if (!mem) {
+   mem = kzalloc(struct_size(mem, slots, nslabs), GFP_KERNEL);
+   if (!mem)
+   return -ENOMEM;
+#ifdef CONFIG_ARM
+   if (!PageHighMem(pfn_to_page(PHYS_PFN(rmem->base {
+   kfree(mem);
+   return -EINVAL;
+   }
+#endif /* CONFIG_ARM */
+   swiotlb_init_io_tlb_mem(mem, rmem->base, nslabs, false);
+
+   rmem->priv = mem;
+
+#ifdef CONFIG_DEBUG_FS
+   if (!debugfs_dir)
+   debugfs_dir = debugfs_create_dir("swiotlb", NULL);
+
+   swiotlb_create_debugfs(mem, rmem->name, debugfs_dir);
+#endif /* CONFIG_DEBUG_FS */
+   }
+
+   dev->dma_io_tlb_mem = mem;
+
+   return 0;
+}
+
+static void rmem_swiotlb_device_release(struct reserved_mem *rmem,
+   struct device *dev)
+{
+   if (dev)
+   dev->dma_io_tlb_mem = NULL;
+}
+
+static const struct reserved_mem_ops rmem_swiotlb_ops = {
+   .device_init = rmem_swiotlb_device_init,
+   .device_release = rmem_swiotlb_device_release,
+};
+
+static int __init rmem_swiotlb_setup(struct reserved_mem *rmem)
+{
+   unsigned long node = rmem->fdt_node;
+
+   if (of_get_flat_dt_prop(node, "reusable", NULL) ||
+   of_get_flat_dt_prop(node, "linux,cma-default", NULL) ||
+   of_get_flat_dt_prop(node, "linux,dma-default", NULL) ||
+   of_get_flat_dt_prop(node, "no-map", NULL))
+   return -EINVAL;
+
+   rmem->ops = _swiotlb_ops;
+   pr_info("Reserved memory: created device swiotlb memory pool at %pa, 
size %ld MiB\n",
+   >base, (unsigned long)rmem->size / SZ_1M);
+   return 0;
+}
+
+RESERVEDMEM_OF_DECLARE(dma, "restricted-dma-pool", rmem_swiotlb_setup);
+#endif /* CONFIG_DMA_RESTRICTED_POOL */
-- 
2.31.1.607.g51e8a6a459-goog



[PATCH v6 03/15] swiotlb: Add DMA_RESTRICTED_POOL

2021-05-10 Thread Claire Chang
Add a new kconfig symbol, DMA_RESTRICTED_POOL, for restricted DMA pool.

Signed-off-by: Claire Chang 
---
 kernel/dma/Kconfig | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index 77b405508743..3e961dc39634 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -80,6 +80,20 @@ config SWIOTLB
bool
select NEED_DMA_MAP_STATE
 
+config DMA_RESTRICTED_POOL
+   bool "DMA Restricted Pool"
+   depends on OF && OF_RESERVED_MEM
+   select SWIOTLB
+   help
+ This enables support for restricted DMA pools which provide a level of
+ DMA memory protection on systems with limited hardware protection
+ capabilities, such as those lacking an IOMMU.
+
+ For more information see
+ 

+ and .
+ If unsure, say "n".
+
 #
 # Should be selected if we can mmap non-coherent mappings to userspace.
 # The only thing that is really required is a way to set an uncached bit
-- 
2.31.1.607.g51e8a6a459-goog



[PATCH v6 02/15] swiotlb: Refactor swiotlb_create_debugfs

2021-05-10 Thread Claire Chang
Split the debugfs creation to make the code reusable for supporting
different bounce buffer pools, e.g. restricted DMA pool.

Signed-off-by: Claire Chang 
---
 kernel/dma/swiotlb.c | 22 --
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index d3232fc19385..858475bd6923 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -64,6 +64,7 @@
 enum swiotlb_force swiotlb_force;
 
 struct io_tlb_mem *io_tlb_default_mem;
+static struct dentry *debugfs_dir;
 
 /*
  * Max segment that we can provide which (if pages are contingous) will
@@ -662,18 +663,27 @@ EXPORT_SYMBOL_GPL(is_swiotlb_active);
 
 #ifdef CONFIG_DEBUG_FS
 
-static int __init swiotlb_create_debugfs(void)
+static void swiotlb_create_debugfs(struct io_tlb_mem *mem, const char *name,
+  struct dentry *node)
 {
-   struct io_tlb_mem *mem = io_tlb_default_mem;
-
if (!mem)
-   return 0;
-   mem->debugfs = debugfs_create_dir("swiotlb", NULL);
+   return;
+
+   mem->debugfs = debugfs_create_dir(name, node);
debugfs_create_ulong("io_tlb_nslabs", 0400, mem->debugfs, >nslabs);
debugfs_create_ulong("io_tlb_used", 0400, mem->debugfs, >used);
+}
+
+static int __init swiotlb_create_default_debugfs(void)
+{
+   struct io_tlb_mem *mem = io_tlb_default_mem;
+
+   swiotlb_create_debugfs(mem, "swiotlb", NULL);
+   debugfs_dir = mem->debugfs;
+
return 0;
 }
 
-late_initcall(swiotlb_create_debugfs);
+late_initcall(swiotlb_create_default_debugfs);
 
 #endif
-- 
2.31.1.607.g51e8a6a459-goog



[PATCH v6 01/15] swiotlb: Refactor swiotlb init functions

2021-05-10 Thread Claire Chang
Add a new function, swiotlb_init_io_tlb_mem, for the io_tlb_mem struct
initialization to make the code reusable.

Note that we now also call set_memory_decrypted in swiotlb_init_with_tbl.

Signed-off-by: Claire Chang 
---
 kernel/dma/swiotlb.c | 51 ++--
 1 file changed, 25 insertions(+), 26 deletions(-)

diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 8ca7d505d61c..d3232fc19385 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -168,9 +168,30 @@ void __init swiotlb_update_mem_attributes(void)
memset(vaddr, 0, bytes);
 }
 
-int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
+static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
+   unsigned long nslabs, bool late_alloc)
 {
+   void *vaddr = phys_to_virt(start);
unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
+
+   mem->nslabs = nslabs;
+   mem->start = start;
+   mem->end = mem->start + bytes;
+   mem->index = 0;
+   mem->late_alloc = late_alloc;
+   spin_lock_init(>lock);
+   for (i = 0; i < mem->nslabs; i++) {
+   mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
+   mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
+   mem->slots[i].alloc_size = 0;
+   }
+
+   set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);
+   memset(vaddr, 0, bytes);
+}
+
+int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
+{
struct io_tlb_mem *mem;
size_t alloc_size;
 
@@ -186,16 +207,8 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long 
nslabs, int verbose)
if (!mem)
panic("%s: Failed to allocate %zu bytes align=0x%lx\n",
  __func__, alloc_size, PAGE_SIZE);
-   mem->nslabs = nslabs;
-   mem->start = __pa(tlb);
-   mem->end = mem->start + bytes;
-   mem->index = 0;
-   spin_lock_init(>lock);
-   for (i = 0; i < mem->nslabs; i++) {
-   mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
-   mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
-   mem->slots[i].alloc_size = 0;
-   }
+
+   swiotlb_init_io_tlb_mem(mem, __pa(tlb), nslabs, false);
 
io_tlb_default_mem = mem;
if (verbose)
@@ -282,7 +295,6 @@ swiotlb_late_init_with_default_size(size_t default_size)
 int
 swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
 {
-   unsigned long bytes = nslabs << IO_TLB_SHIFT, i;
struct io_tlb_mem *mem;
 
if (swiotlb_force == SWIOTLB_NO_FORCE)
@@ -297,20 +309,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs)
if (!mem)
return -ENOMEM;
 
-   mem->nslabs = nslabs;
-   mem->start = virt_to_phys(tlb);
-   mem->end = mem->start + bytes;
-   mem->index = 0;
-   mem->late_alloc = 1;
-   spin_lock_init(>lock);
-   for (i = 0; i < mem->nslabs; i++) {
-   mem->slots[i].list = IO_TLB_SEGSIZE - io_tlb_offset(i);
-   mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
-   mem->slots[i].alloc_size = 0;
-   }
-
-   set_memory_decrypted((unsigned long)tlb, bytes >> PAGE_SHIFT);
-   memset(tlb, 0, bytes);
+   swiotlb_init_io_tlb_mem(mem, virt_to_phys(tlb), nslabs, true);
 
io_tlb_default_mem = mem;
swiotlb_print_info();
-- 
2.31.1.607.g51e8a6a459-goog



[PATCH v6 00/15] Restricted DMA

2021-05-10 Thread Claire Chang
From: Claire Chang 

This series implements mitigations for lack of DMA access control on
systems without an IOMMU, which could result in the DMA accessing the
system memory at unexpected times and/or unexpected addresses, possibly
leading to data leakage or corruption.

For example, we plan to use the PCI-e bus for Wi-Fi and that PCI-e bus is
not behind an IOMMU. As PCI-e, by design, gives the device full access to
system memory, a vulnerability in the Wi-Fi firmware could easily escalate
to a full system exploit (remote wifi exploits: [1a], [1b] that shows a
full chain of exploits; [2], [3]).

To mitigate the security concerns, we introduce restricted DMA. Restricted
DMA utilizes the existing swiotlb to bounce streaming DMA in and out of a
specially allocated region and does memory allocation from the same region.
The feature on its own provides a basic level of protection against the DMA
overwriting buffer contents at unexpected times. However, to protect
against general data leakage and system memory corruption, the system needs
to provide a way to restrict the DMA to a predefined memory region (this is
usually done at firmware level, e.g. MPU in ATF on some ARM platforms [4]).

[1a] 
https://googleprojectzero.blogspot.com/2017/04/over-air-exploiting-broadcoms-wi-fi_4.html
[1b] 
https://googleprojectzero.blogspot.com/2017/04/over-air-exploiting-broadcoms-wi-fi_11.html
[2] https://blade.tencent.com/en/advisories/qualpwn/
[3] 
https://www.bleepingcomputer.com/news/security/vulnerabilities-found-in-highly-popular-firmware-for-wifi-chips/
[4] 
https://github.com/ARM-software/arm-trusted-firmware/blob/master/plat/mediatek/mt8183/drivers/emi_mpu/emi_mpu.c#L132
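
As an illustration only, with invented names: from a driver's point of view
the bouncing is transparent. The usual streaming DMA API is used, and the
restricted pool is selected underneath when the device's DT node references
a restricted-dma-pool region:

#include <linux/dma-mapping.h>

static int wifi_tx_one(struct device *dev, void *buf, size_t len)
{
	dma_addr_t dma = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

	if (dma_mapping_error(dev, dma))
		return -ENOMEM;

	/* ... kick off the transfer and wait for completion ... */

	dma_unmap_single(dev, dma, len, DMA_TO_DEVICE);
	return 0;
}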

v6:
Address the comments in v5

v5:
Rebase on latest linux-next
https://lore.kernel.org/patchwork/cover/1416899/

v4:
- Fix spinlock bad magic
- Use rmem->name for debugfs entry
- Address the comments in v3
https://lore.kernel.org/patchwork/cover/1378113/

v3:
Using only one reserved memory region for both streaming DMA and memory
allocation.
https://lore.kernel.org/patchwork/cover/1360992/

v2:
Building on top of swiotlb.
https://lore.kernel.org/patchwork/cover/1280705/

v1:
Using dma_map_ops.
https://lore.kernel.org/patchwork/cover/1271660/

Claire Chang (15):
  swiotlb: Refactor swiotlb init functions
  swiotlb: Refactor swiotlb_create_debugfs
  swiotlb: Add DMA_RESTRICTED_POOL
  swiotlb: Add restricted DMA pool initialization
  swiotlb: Add a new get_io_tlb_mem getter
  swiotlb: Update is_swiotlb_buffer to add a struct device argument
  swiotlb: Update is_swiotlb_active to add a struct device argument
  swiotlb: Bounce data from/to restricted DMA pool if available
  swiotlb: Move alloc_size to find_slots
  swiotlb: Refactor swiotlb_tbl_unmap_single
  dma-direct: Add a new wrapper __dma_direct_free_pages()
  swiotlb: Add restricted DMA alloc/free support.
  dma-direct: Allocate memory from restricted DMA pool if available
  dt-bindings: of: Add restricted DMA pool
  of: Add plumbing for restricted DMA pool

 .../reserved-memory/reserved-memory.txt   |  27 ++
 drivers/gpu/drm/i915/gem/i915_gem_internal.c  |   2 +-
 drivers/gpu/drm/nouveau/nouveau_ttm.c |   2 +-
 drivers/iommu/dma-iommu.c |  12 +-
 drivers/of/address.c  |  25 ++
 drivers/of/device.c   |   3 +
 drivers/of/of_private.h   |   5 +
 drivers/pci/xen-pcifront.c|   2 +-
 drivers/xen/swiotlb-xen.c |   2 +-
 include/linux/device.h|   4 +
 include/linux/swiotlb.h   |  41 ++-
 kernel/dma/Kconfig|  14 +
 kernel/dma/direct.c   |  63 +++--
 kernel/dma/direct.h   |   9 +-
 kernel/dma/swiotlb.c  | 242 +-
 15 files changed, 356 insertions(+), 97 deletions(-)

-- 
2.31.1.607.g51e8a6a459-goog



Re: [PATCH v4 06/11] powerpc/pseries/iommu: Add ddw_property_create() and refactor enable_ddw()

2021-05-10 Thread Alexey Kardashevskiy




On 5/1/21 02:31, Leonardo Bras wrote:

Code used to create a ddw property that was previously scattered in
enable_ddw() is now gathered in ddw_property_create(), which deals with
allocation and filling of the property, leaving it ready for
of_add_property(), which now occurs in sequence.

This created an opportunity to reorganize the second part of enable_ddw():

Without this patch enable_ddw() does, in order:
kzalloc() property & members, create_ddw(), fill ddwprop inside property,
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
of_add_property(), and list_add().

With this patch enable_ddw() does, in order:
create_ddw(), ddw_property_create(), of_add_property(),
ddw_list_new_entry(), do tce_setrange_multi_pSeriesLP_walk in all memory,
and list_add().

This change requires of_remove_property() in case anything fails after
of_add_property(), but we get to do tce_setrange_multi_pSeriesLP_walk
in all memory, which looks like the most expensive operation, only if
everything else succeeds.

Signed-off-by: Leonardo Bras 



Reviewed-by: Alexey Kardashevskiy 




---
  arch/powerpc/platforms/pseries/iommu.c | 93 --
  1 file changed, 57 insertions(+), 36 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 955cf095416c..5a70ecd579b8 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -1122,6 +1122,35 @@ static void reset_dma_window(struct pci_dev *dev, struct 
device_node *par_dn)
 ret);
  }
  
+static struct property *ddw_property_create(const char *propname, u32 liobn, u64 dma_addr,

+   u32 page_shift, u32 window_shift)
+{
+   struct dynamic_dma_window_prop *ddwprop;
+   struct property *win64;
+
+   win64 = kzalloc(sizeof(*win64), GFP_KERNEL);
+   if (!win64)
+   return NULL;
+
+   win64->name = kstrdup(propname, GFP_KERNEL);
+   ddwprop = kzalloc(sizeof(*ddwprop), GFP_KERNEL);
+   win64->value = ddwprop;
+   win64->length = sizeof(*ddwprop);
+   if (!win64->name || !win64->value) {
+   kfree(win64->name);
+   kfree(win64->value);
+   kfree(win64);
+   return NULL;
+   }
+
+   ddwprop->liobn = cpu_to_be32(liobn);
+   ddwprop->dma_base = cpu_to_be64(dma_addr);
+   ddwprop->tce_shift = cpu_to_be32(page_shift);
+   ddwprop->window_shift = cpu_to_be32(window_shift);
+
+   return win64;
+}
+
  /* Return largest page shift based on "IO Page Sizes" output of 
ibm,query-pe-dma-window. */
  static int iommu_get_page_shift(u32 query_page_size)
  {
@@ -1167,11 +1196,11 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
struct ddw_query_response query;
struct ddw_create_response create;
int page_shift;
+   u64 win_addr;
struct device_node *dn;
u32 ddw_avail[DDW_APPLICABLE_SIZE];
struct direct_window *window;
struct property *win64 = NULL;
-   struct dynamic_dma_window_prop *ddwprop;
struct failed_ddw_pdn *fpdn;
bool default_win_removed = false;
bool pmem_present;
@@ -1286,65 +1315,54 @@ static bool enable_ddw(struct pci_dev *dev, struct 
device_node *pdn)
1ULL << page_shift);
goto out_failed;
}
-   win64 = kzalloc(sizeof(struct property), GFP_KERNEL);
-   if (!win64) {
-   dev_info(&dev->dev,
-   "couldn't allocate property for 64bit dma window\n");
-   goto out_failed;
-   }
-   win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL);
-   win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL);
-   win64->length = sizeof(*ddwprop);
-   if (!win64->name || !win64->value) {
-   dev_info(&dev->dev,
-   "couldn't allocate property name and value\n");
-   goto out_free_prop;
-   }
  
   	ret = create_ddw(dev, ddw_avail, &create, page_shift, len);

if (ret != 0)
-   goto out_free_prop;
-
-   ddwprop->liobn = cpu_to_be32(create.liobn);
-   ddwprop->dma_base = cpu_to_be64(((u64)create.addr_hi << 32) |
-   create.addr_lo);
-   ddwprop->tce_shift = cpu_to_be32(page_shift);
-   ddwprop->window_shift = cpu_to_be32(len);
+   goto out_failed;
  
   	dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %pOF\n",

  create.liobn, dn);
  
-	window = ddw_list_new_entry(pdn, ddwprop);

+   win_addr = ((u64)create.addr_hi << 32) | create.addr_lo;
+   win64 = ddw_property_create(DIRECT64_PROPNAME, create.liobn, win_addr,
+   page_shift, len);
+   if (!win64) {
+   dev_info(&dev->dev,
+"couldn't allocate property, property name, or 
value\n");
+   goto out_remove_win;
+   }
+
+   

Re: [PATCH v4 07/11] powerpc/pseries/iommu: Reorganize iommu_table_setparms*() with new helper

2021-05-10 Thread Alexey Kardashevskiy




On 5/1/21 02:31, Leonardo Bras wrote:

Add a new helper _iommu_table_setparms(), and use it in
iommu_table_setparms() and iommu_table_setparms_lpar() to avoid duplicated
code.

Also, setting tbl->it_ops was happening outside iommu_table_setparms*(),
so move it to the new helper. Since we need the iommu_table_ops to be
declared before used, move iommu_table_lpar_multi_ops and
iommu_table_pseries_ops to before their respective iommu_table_setparms*().

Signed-off-by: Leonardo Bras 



This does not apply anymore as it conflicts with my 4be518d838809e2135.



---
  arch/powerpc/platforms/pseries/iommu.c | 100 -
  1 file changed, 50 insertions(+), 50 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c 
b/arch/powerpc/platforms/pseries/iommu.c
index 5a70ecd579b8..89cb6e9e9f31 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -53,6 +53,11 @@ enum {
DDW_EXT_QUERY_OUT_SIZE = 2
  };
  
+#ifdef CONFIG_IOMMU_API

+static int tce_exchange_pseries(struct iommu_table *tbl, long index, unsigned 
long *tce,
+   enum dma_data_direction *direction, bool 
realmode);
+#endif



Instead of declaring this so far from the code which needs it, maybe
add


struct iommu_table_ops iommu_table_lpar_multi_ops;

right before iommu_table_setparms() (as the struct is what you actually
want there), and you won't need to move iommu_table_pseries_ops as well.




+
  static struct iommu_table *iommu_pseries_alloc_table(int node)
  {
struct iommu_table *tbl;
@@ -501,6 +506,28 @@ static int tce_setrange_multi_pSeriesLP_walk(unsigned long 
start_pfn,
return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg);
  }
  
+static inline void _iommu_table_setparms(struct iommu_table *tbl, unsigned long busno,



The underscore is confusing, maybe iommu_table_do_setparms()?
iommu_table_setparms_common()? Not sure. I cannot recall a single
function with just one leading underscore, I suspect I was pushed back 
when I tried adding one ages ago :) "inline" seems excessive, the 
compiler will probably figure it out anyway.





+unsigned long liobn, unsigned long 
win_addr,
+unsigned long window_size, unsigned 
long page_shift,
+unsigned long base, struct 
iommu_table_ops *table_ops)



Make "base" a pointer. Or, better, just keep setting it directly in 
iommu_table_setparms() rather than passing 0 around.


The same comment about "liobn" - set it in iommu_table_setparms_lpar(). 
The reviewer will see what field matters in what situation imho.





+{
+   tbl->it_busno = busno;
+   tbl->it_index = liobn;
+   tbl->it_offset = win_addr >> page_shift;
+   tbl->it_size = window_size >> page_shift;
+   tbl->it_page_shift = page_shift;
+   tbl->it_base = base;
+   tbl->it_blocksize = 16;
+   tbl->it_type = TCE_PCI;
+   tbl->it_ops = table_ops;
+}
+
+struct iommu_table_ops iommu_table_pseries_ops = {
+   .set = tce_build_pSeries,
+   .clear = tce_free_pSeries,
+   .get = tce_get_pseries
+};
+
  static void iommu_table_setparms(struct pci_controller *phb,
 struct device_node *dn,
 struct iommu_table *tbl)
@@ -509,8 +536,13 @@ static void iommu_table_setparms(struct pci_controller 
*phb,
const unsigned long *basep;
const u32 *sizep;
  
-	node = phb->dn;

+   /* Test if we are going over 2GB of DMA space */
+   if (phb->dma_window_base_cur + phb->dma_window_size > SZ_2G) {
+   udbg_printf("PCI_DMA: Unexpected number of IOAs under this 
PHB.\n");
+   panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n");
+   }
  
+	node = phb->dn;

basep = of_get_property(node, "linux,tce-base", NULL);
sizep = of_get_property(node, "linux,tce-size", NULL);
if (basep == NULL || sizep == NULL) {
@@ -519,33 +551,25 @@ static void iommu_table_setparms(struct pci_controller 
*phb,
return;
}
  
-	tbl->it_base = (unsigned long)__va(*basep);

+   _iommu_table_setparms(tbl, phb->bus->number, 0, 
phb->dma_window_base_cur,
+ phb->dma_window_size, IOMMU_PAGE_SHIFT_4K,
+ (unsigned long)__va(*basep), 
&iommu_table_pseries_ops);
  
  	if (!is_kdump_kernel())

memset((void *)tbl->it_base, 0, *sizep);
  
-	tbl->it_busno = phb->bus->number;

-   tbl->it_page_shift = IOMMU_PAGE_SHIFT_4K;
-
-   /* Units of tce entries */
-   tbl->it_offset = phb->dma_window_base_cur >> tbl->it_page_shift;
-
-   /* Test if we are going over 2GB of DMA space */
-   if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) {
-   udbg_printf("PCI_DMA: Unexpected number of IOAs under this 
PHB.\n");
-   

Re: [PATCH v4 02/11] powerpc/kernel/iommu: Add new iommu_table_in_use() helper

2021-05-10 Thread Alexey Kardashevskiy




On 5/1/21 02:31, Leonardo Bras wrote:

Having a function to check if the iommu table has any allocation helps
decide whether a tbl can be reset for using a new DMA window.

It should be enough to replace all instances of !bitmap_empty(tbl...).
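For illustration, the substitution described above looks roughly like this
at a call site (a representative example, not a hunk from this patch):

-	if (!bitmap_empty(tbl->it_map, tbl->it_size))
+	if (iommu_table_in_use(tbl))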

iommu_table_in_use() skips reserved memory, so we don't need to worry about
releasing it before testing. This causes iommu_table_release_pages() to
become unnecessary, given it is only used to remove reserved memory for
testing.

Also, only allow storing reserved memory values in tbl if they are valid
in the table, so there is no need to check it in the new helper.

Signed-off-by: Leonardo Bras 
---
  arch/powerpc/include/asm/iommu.h |  1 +
  arch/powerpc/kernel/iommu.c  | 65 
  2 files changed, 34 insertions(+), 32 deletions(-)

diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index deef7c94d7b6..bf3b84128525 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -154,6 +154,7 @@ extern int iommu_tce_table_put(struct iommu_table *tbl);
   */
  extern struct iommu_table *iommu_init_table(struct iommu_table *tbl,
int nid, unsigned long res_start, unsigned long res_end);
+bool iommu_table_in_use(struct iommu_table *tbl);
  
  #define IOMMU_TABLE_GROUP_MAX_TABLES	2
  
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c

index ad82dda81640..5e168bd91401 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -691,32 +691,24 @@ static void iommu_table_reserve_pages(struct iommu_table 
*tbl,
if (tbl->it_offset == 0)
set_bit(0, tbl->it_map);
  
-	tbl->it_reserved_start = res_start;

-   tbl->it_reserved_end = res_end;
-
-   /* Check if res_start..res_end isn't empty and overlaps the table */
-   if (res_start && res_end &&
-   (tbl->it_offset + tbl->it_size < res_start ||
-res_end < tbl->it_offset))
-   return;
+   if (res_start < tbl->it_offset)
+   res_start = tbl->it_offset;
  
-	for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)

-   set_bit(i - tbl->it_offset, tbl->it_map);
-}
+   if (res_end > (tbl->it_offset + tbl->it_size))
+   res_end = tbl->it_offset + tbl->it_size;
  
-static void iommu_table_release_pages(struct iommu_table *tbl)

-{
-   int i;
+   /* Check if res_start..res_end is a valid range in the table */
+   if (res_start >= res_end) {
+   tbl->it_reserved_start = tbl->it_offset;
+   tbl->it_reserved_end = tbl->it_offset;
+   return;
+   }
  
-	/*

-* In case we have reserved the first bit, we should not emit
-* the warning below.
-*/
-   if (tbl->it_offset == 0)
-   clear_bit(0, tbl->it_map);
+   tbl->it_reserved_start = res_start;
+   tbl->it_reserved_end = res_end;
  
  	for (i = tbl->it_reserved_start; i < tbl->it_reserved_end; ++i)

-   clear_bit(i - tbl->it_offset, tbl->it_map);
+   set_bit(i - tbl->it_offset, tbl->it_map);



git produced a messy chunk here. The new logic is:


static void iommu_table_reserve_pages(struct iommu_table *tbl,
unsigned long res_start, unsigned long res_end)
{
int i;

WARN_ON_ONCE(res_end < res_start);
/*
 * Reserve page 0 so it will not be used for any mappings.
 * This avoids buggy drivers that consider page 0 to be invalid
 * to crash the machine or even lose data.
 */
if (tbl->it_offset == 0)
set_bit(0, tbl->it_map);

if (res_start < tbl->it_offset)
res_start = tbl->it_offset;

if (res_end > (tbl->it_offset + tbl->it_size))
res_end = tbl->it_offset + tbl->it_size;

/* Check if res_start..res_end is a valid range in the table */
if (res_start >= res_end) {
tbl->it_reserved_start = tbl->it_offset;
tbl->it_reserved_end = tbl->it_offset;
return;
}


It is just hard to read. A code reviewer would assume res_end >=
res_start (since there is a WARN_ON) but later we allow res_end to be
less than res_start.


but maybe it is just me :)
Otherwise looks good.

Reviewed-by: Alexey Kardashevskiy 



  }
  
  /*

@@ -781,6 +773,22 @@ struct iommu_table *iommu_init_table(struct iommu_table 
*tbl, int nid,
return tbl;
  }
  
+bool iommu_table_in_use(struct iommu_table *tbl)

+{
+   unsigned long start = 0, end;
+
+   /* ignore reserved bit0 */
+   if (tbl->it_offset == 0)
+   start = 1;
+   end = tbl->it_reserved_start - tbl->it_offset;
+   if (find_next_bit(tbl->it_map, end, start) != end)
+   return true;
+
+   start = tbl->it_reserved_end - tbl->it_offset;
+   end = tbl->it_size;
+   return find_next_bit(tbl->it_map, end, start) 

Re: [PATCH] powerpc/pmu: Make the generic compat PMU use the architected events

2021-05-10 Thread Madhavan Srinivasan



On 5/4/21 1:13 PM, Paul Mackerras wrote:

This changes generic-compat-pmu.c so that it only uses architected
events defined in Power ISA v3.0B, rather than event encodings which,
while common to all the IBM Power Systems implementations, are
nevertheless implementation-specific rather than architected.  The



Yeah, as you pointed out, this was aimed at IBM system implementations.
Thanks for the patch; it looks fine to me.

Reviewed-by: Madhavan Srinivasan 

I can send a follow-up patch to return EINVAL via check_attr_config when
any field other than pmc and pmcsel has a non-zero value.
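A rough sketch of such a check, assuming the pmc (bits 16-19) and pmcsel
(bits 0-7) field positions from the raw event encoding shown in the patch
below; the actual follow-up patch may well differ:

static int check_attr_config(struct perf_event *ev)
{
	u64 val = ev->attr.config;

	/* Only the pmc and pmcsel fields of the raw event code may be set */
	if (val & ~0xf00ffULL)
		return -EINVAL;

	return 0;
}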


intention is that any CPU implementation designed to conform to Power
ISA v3.0B or later can use generic-compat-pmu.c.

In addition to the existing events for cycles and instructions, this
adds several other architected events, including alternative encodings
for some events.  In order to make it possible to measure cycles and
instructions at the same time as each other, we set the CC5-6RUN bit
in MMCR0, which makes PMC5 and PMC6 count instructions and cycles
regardless of the run bit, so their events are now PM_CYC and
PM_INST_CMPL rather than PM_RUN_CYC and PM_RUN_INST_CMPL (the latter
are still available via other event codes).

Note that POWER9 has an erratum where one architected event
(PM_FLOP_CMPL, floating-point operations completed, code 0x100f4) does
not work correctly.  Given that there is a specific PMU driver for P9
which will be used in preference to generic-compat-pmu.c, that is not
a real problem.

Signed-off-by: Paul Mackerras 
---
  arch/powerpc/perf/generic-compat-pmu.c | 170 +++--
  1 file changed, 134 insertions(+), 36 deletions(-)

diff --git a/arch/powerpc/perf/generic-compat-pmu.c 
b/arch/powerpc/perf/generic-compat-pmu.c
index eb8a6aaf4cc1..695975227e60 100644
--- a/arch/powerpc/perf/generic-compat-pmu.c
+++ b/arch/powerpc/perf/generic-compat-pmu.c
@@ -14,45 +14,119 @@
   *
  *        28        24        20        16        12         8         4         0
   * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - 
- - - |
- * [ pmc ]   [unit ]   [ ]   m   [pmcxsel  
  ]
- * | |
- * | *- mark
- * |
- * |
- * *- combine
- *
- * Below uses IBM bit numbering.
- *
- * MMCR1[x:y] = unit(PMCxUNIT)
- * MMCR1[24]   = pmc1combine[0]
- * MMCR1[25]   = pmc1combine[1]
- * MMCR1[26]   = pmc2combine[0]
- * MMCR1[27]   = pmc2combine[1]
- * MMCR1[28]   = pmc3combine[0]
- * MMCR1[29]   = pmc3combine[1]
- * MMCR1[30]   = pmc4combine[0]
- * MMCR1[31]   = pmc4combine[1]
- *
+ * [ pmc ]   [pmcxsel  
  ]
   */
  
  /*

- * Some power9 event codes.
+ * Event codes defined in ISA v3.0B
   */
  #define EVENT(_name, _code)   _name = _code,
  
  enum {

-EVENT(PM_CYC,  0x0001e)
-EVENT(PM_INST_CMPL,0x2)
+   /* Cycles, alternate code */
+   EVENT(PM_CYC_ALT,   0x100f0)
+   /* One or more instructions completed in a cycle */
+   EVENT(PM_CYC_INST_CMPL, 0x100f2)
+   /* Floating-point instruction completed */
+   EVENT(PM_FLOP_CMPL, 0x100f4)
+   /* Instruction ERAT/L1-TLB miss */
+   EVENT(PM_L1_ITLB_MISS,  0x100f6)
+   /* All instructions completed and none available */
+   EVENT(PM_NO_INST_AVAIL, 0x100f8)
+   /* A load-type instruction completed (ISA v3.0+) */
+   EVENT(PM_LD_CMPL,   0x100fc)
+   /* Instruction completed, alternate code (ISA v3.0+) */
+   EVENT(PM_INST_CMPL_ALT, 0x100fe)
+   /* A store-type instruction completed */
+   EVENT(PM_ST_CMPL,   0x200f0)
+   /* Instruction Dispatched */
+   EVENT(PM_INST_DISP, 0x200f2)
+   /* Run_cycles */
+   EVENT(PM_RUN_CYC,   0x200f4)
+   /* Data ERAT/L1-TLB miss/reload */
+   EVENT(PM_L1_DTLB_RELOAD,0x200f6)
+   /* Taken branch completed */
+   EVENT(PM_BR_TAKEN_CMPL, 0x200fa)
+   /* Demand iCache Miss */
+   EVENT(PM_L1_ICACHE_MISS,0x200fc)
+   /* L1 Dcache reload from memory */
+   EVENT(PM_L1_RELOAD_FROM_MEM,0x200fe)
+   /* L1 Dcache store miss */
+   EVENT(PM_ST_MISS_L1,0x300f0)
+   /* Alternate code for PM_INST_DISP */
+   EVENT(PM_INST_DISP_ALT, 0x300f2)
+   /* Branch direction or target mispredicted */
+   EVENT(PM_BR_MISPREDICT, 0x300f6)
+   /* Data TLB miss/reload */
+   

Re: [V3 PATCH 15/16] crypto/nx: Get NX capabilities for GZIP coprocessor type

2021-05-10 Thread Nicholas Piggin
Excerpts from Haren Myneni's message of April 18, 2021 7:12 am:
> 
> phyp provides NX capabilities which give the recommended minimum
> compression / decompression length and the maximum request buffer size
> in bytes.
> 
> Retrieve the overall NX capabilities first, which point to the specific
> features phyp supports, then retrieve the NXGZIP-specific capabilities.
> 
> Signed-off-by: Haren Myneni 
> ---
>  drivers/crypto/nx/nx-common-pseries.c | 83 +++
>  1 file changed, 83 insertions(+)
> 
> diff --git a/drivers/crypto/nx/nx-common-pseries.c 
> b/drivers/crypto/nx/nx-common-pseries.c
> index 9a40fca8a9e6..49224870d05e 100644
> --- a/drivers/crypto/nx/nx-common-pseries.c
> +++ b/drivers/crypto/nx/nx-common-pseries.c
> @@ -9,6 +9,7 @@
>   */
>  
>  #include 
> +#include 
>  #include 
>  
>  #include "nx-842.h"
> @@ -20,6 +21,24 @@ MODULE_DESCRIPTION("842 H/W Compression driver for IBM 
> Power processors");
>  MODULE_ALIAS_CRYPTO("842");
>  MODULE_ALIAS_CRYPTO("842-nx");
>  
> +struct nx_ct_capabs_be {

What does "ct" mean? I've seen it in a few other places too.

> + __be64  descriptor;
> + __be64  req_max_processed_len;  /* Max bytes in one GZIP request */
> + __be64  min_compress_len;   /* Min compression size in bytes */
> + __be64  min_decompress_len; /* Min decompression size in bytes */
> +} __packed __aligned(0x1000);
> +
> +struct nx_ct_capabs {
> + charname[VAS_DESCR_LEN + 1];
> + u64 descriptor;
> + u64 req_max_processed_len;  /* Max bytes in one GZIP request */
> + u64 min_compress_len;   /* Min compression in bytes */
> + u64 min_decompress_len; /* Min decompression in bytes */
> +};
> +
> +u64 capab_feat = 0;

Why is this here and not a local variable?

> +struct nx_ct_capabs nx_ct_capab;

It's okay and generally better to use the same name as the struct name
in this situation, i.e.,

"struct nx_ct_capabs nx_ct_capabs"

(modulo static / caps / etc)

Thanks,
Nick


Re: [V3 PATCH 12/16] powerpc/pseries/vas: sysfs interface to export capabilities

2021-05-10 Thread Nicholas Piggin
Excerpts from Haren Myneni's message of April 18, 2021 7:10 am:
> 
> pHyp provides GZIP default and GZIP QoS capabilities which give
> the total number of credits available in the LPAR. This patch
> creates sysfs entries and exports LPAR credits, the currently used
> and the available credits for each feature.
> 
> /sys/kernel/vas/VasCaps/VDefGzip: (default GZIP capabilities)
>   avail_lpar_creds /* Available credits to use */
>   target_lpar_creds /* Total credits available which can be
>/* changed with DLPAR operation */
>   used_lpar_creds  /* Used credits */

/sys/kernel/ is not an appropriate directory to put it in. Also camel 
case is not thought very highly of these days.

And s/capabs/caps/g applies here (and all other patches).

Thanks,
Nick


Re: [V3 PATCH 10/16] powerpc/pseries/vas: Integrate API with open/close windows

2021-05-10 Thread Nicholas Piggin
Excerpts from Haren Myneni's message of April 18, 2021 7:08 am:
> +static int deallocate_free_window(struct vas_window *win)
> +{
> + int rc = 0;
> +
> + rc = plpar_vas_deallocate_window(win->winid);
> + if (!rc)
> + kfree(win->lpar.name);

Oh, did this kfree sneak in here? The allocation appears in patch 11
I think.

Thanks,
Nick



Re: [V3 PATCH 10/16] powerpc/pseries/vas: Integrate API with open/close windows

2021-05-10 Thread Nicholas Piggin
Excerpts from Haren Myneni's message of April 18, 2021 7:08 am:
> 
> This patch adds VAS window allocation/close with the corresponding
> HCALLs. It also integrates with the existing user space VAS API and
> provides register/unregister functions to the NX pseries driver.
> 
> The driver register function is used to create the user space
> interface (/dev/crypto/nx-gzip) and unregister to remove this entry.
> 
> The user space process opens this device node and makes an ioctl
> to allocate a VAS window. The close interface is used to deallocate
> the window.
> 
> Signed-off-by: Haren Myneni 
> ---
>  arch/powerpc/include/asm/vas.h  |   5 +
>  arch/powerpc/platforms/book3s/Kconfig   |   2 +-
>  arch/powerpc/platforms/pseries/Makefile |   1 +
>  arch/powerpc/platforms/pseries/vas.c| 212 
>  4 files changed, 219 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/vas.h b/arch/powerpc/include/asm/vas.h
> index d15784506a54..aa1974aba27e 100644
> --- a/arch/powerpc/include/asm/vas.h
> +++ b/arch/powerpc/include/asm/vas.h
> @@ -270,6 +270,11 @@ struct vas_all_capabs {
>   u64 feat_type;
>  };
>  
> +int plpar_vas_query_capabilities(const u64 hcall, u8 query_type,
> +  u64 result);
> +int vas_register_api_pseries(struct module *mod,
> +  enum vas_cop_type cop_type, const char *name);
> +void vas_unregister_api_pseries(void);
>  #endif
>  
>  /*
> diff --git a/arch/powerpc/platforms/book3s/Kconfig 
> b/arch/powerpc/platforms/book3s/Kconfig
> index 51e14db83a79..bed21449e8e5 100644
> --- a/arch/powerpc/platforms/book3s/Kconfig
> +++ b/arch/powerpc/platforms/book3s/Kconfig
> @@ -1,7 +1,7 @@
>  # SPDX-License-Identifier: GPL-2.0
>  config PPC_VAS
>   bool "IBM Virtual Accelerator Switchboard (VAS)"
> - depends on PPC_POWERNV && PPC_64K_PAGES
> + depends on (PPC_POWERNV || PPC_PSERIES) && PPC_64K_PAGES
>   default y
>   help
> This enables support for IBM Virtual Accelerator Switchboard (VAS).
> diff --git a/arch/powerpc/platforms/pseries/Makefile 
> b/arch/powerpc/platforms/pseries/Makefile
> index c8a2b0b05ac0..4cda0ef87be0 100644
> --- a/arch/powerpc/platforms/pseries/Makefile
> +++ b/arch/powerpc/platforms/pseries/Makefile
> @@ -30,3 +30,4 @@ obj-$(CONFIG_PPC_SVM)   += svm.o
>  obj-$(CONFIG_FA_DUMP)+= rtas-fadump.o
>  
>  obj-$(CONFIG_SUSPEND)+= suspend.o
> +obj-$(CONFIG_PPC_VAS)+= vas.o
> diff --git a/arch/powerpc/platforms/pseries/vas.c 
> b/arch/powerpc/platforms/pseries/vas.c
> index 35946fb02995..0ade0d6d728f 100644
> --- a/arch/powerpc/platforms/pseries/vas.c
> +++ b/arch/powerpc/platforms/pseries/vas.c
> @@ -222,6 +222,218 @@ int plpar_vas_query_capabilities(const u64 hcall, u8 
> query_type,
>   return -EIO;
>   }
>  }
> +EXPORT_SYMBOL_GPL(plpar_vas_query_capabilities);
> +
> +/*
> + * Allocate window and setup IRQ mapping.
> + */
> +static int allocate_setup_window(struct vas_window *txwin,
> +  u64 *domain, u8 wintype)
> +{
> + int rc;
> +
> + rc = plpar_vas_allocate_window(txwin, domain, wintype, DEF_WIN_CREDS);
> + if (rc)
> + return rc;
> +
> + txwin->wcreds_max = DEF_WIN_CREDS;
> +
> + return 0;
> +}
> +
> +static struct vas_window *vas_allocate_window(struct vas_tx_win_open_attr 
> *uattr,
> +   enum vas_cop_type cop_type)
> +{
> + long domain[PLPAR_HCALL9_BUFSIZE] = {VAS_DEFAULT_DOMAIN_ID};
> + struct vas_ct_capabs *ct_capab;
> + struct vas_capabs *capabs;
> + struct vas_window *txwin;
> + int rc;
> +
> + txwin = kzalloc(sizeof(*txwin), GFP_KERNEL);
> + if (!txwin)
> + return ERR_PTR(-ENOMEM);
> +
> + /*
> +  * A VAS window can have many credits which means that many
> +  * requests can be issued simultaneously. But phyp restricts
> +  * one credit per window.
> +  * phyp introduces 2 different types of credits:
> +  * Default credit type (Uses normal priority FIFO):
> +  *  A limited number of credits are assigned to partitions
> +  *  based on processor entitlement. But these credits may be
> +  *  over-committed on a system depending on whether the CPUs
> +  *  are in shared or dedicated modes - that is, more requests
> +  *  may be issued across the system than NX can service at
> +  *  once which can result in paste command failure (RMA_busy).
> +  *  Then the process has to resend requests or fall-back to
> +  *  SW compression.
> +  * Quality of Service (QoS) credit type (Uses high priority FIFO):
> +  *  To avoid NX HW contention, the system admins can assign
> +  *  QoS credits for each LPAR so that this partition is
> +  *  guaranteed access to NX resources. These credits are
> +  *  assigned to partitions via the HMC.

Re: [V3 PATCH 09/16] powerpc/pseries/vas: Implement to get all capabilities

2021-05-10 Thread Nicholas Piggin
Excerpts from Haren Myneni's message of April 18, 2021 7:08 am:
> 
> pHyp provides various VAS capabilities such as GZIP default and QoS
> capabilities which are used to determine the total number of credits
> available in the LPAR, maximum window credits, maximum LPAR credits,
> whether usermode copy/paste is supported, and so on.
> 
> So first retrieve overall vas capabilities using
> H_QUERY_VAS_CAPABILITIES HCALL which tells the specific features that
> are available. Then retrieve the specific capabilities by using the
> feature type in H_QUERY_VAS_CAPABILITIES HCALL.
> 
> pHyp supports only GZIP default and GZIP QoS capabilities right now.

Changelog and title could use a bit of work.

> 
> Signed-off-by: Haren Myneni 
> ---
>  arch/powerpc/platforms/pseries/vas.c | 130 +++
>  1 file changed, 130 insertions(+)
> 
> diff --git a/arch/powerpc/platforms/pseries/vas.c 
> b/arch/powerpc/platforms/pseries/vas.c
> index 06960151477c..35946fb02995 100644
> --- a/arch/powerpc/platforms/pseries/vas.c
> +++ b/arch/powerpc/platforms/pseries/vas.c
> @@ -30,6 +30,13 @@
>  /* phyp allows one credit per window right now */
>  #define DEF_WIN_CREDS1
>  
> +static struct vas_all_capabs capabs_all;

Does this name come from PAPR? If not, capabilities or caps are better 
for readability than capabs.

> +static int copypaste_feat;

Should be a bool? And what does it mean? copy-paste is a host core 
capability.

> +
> +struct vas_capabs vcapabs[VAS_MAX_FEAT_TYPE];
> +
> +DEFINE_MUTEX(vas_pseries_mutex);

Can these be made static if they're only used here, and export them if a 
future patch uses them (or add the header declaration now).


> +
>  static int64_t hcall_return_busy_check(int64_t rc)
>  {
>   /* Check if we are stalled for some time */
> @@ -215,3 +222,126 @@ int plpar_vas_query_capabilities(const u64 hcall, u8 
> query_type,
>   return -EIO;
>   }
>  }
> +
> +/*
> + * Get the specific capabilities based on the feature type.
> + * Right now supports GZIP default and GZIP QoS capabilities.
> + */
> +static int get_vas_capabilities(u8 feat, enum vas_cop_feat_type type,
> + struct vas_ct_capabs_be *capab_be)
> +{
> + struct vas_ct_capabs *capab;
> + struct vas_capabs *vcapab;
> + int rc = 0;
> +
> + vcapab = &vcapabs[type];
> + memset(vcapab, 0, sizeof(*vcapab));
> + INIT_LIST_HEAD(&vcapab->list);
> +
> + capab = &vcapab->capab;
> +
> + rc = plpar_vas_query_capabilities(H_QUERY_VAS_CAPABILITIES, feat,
> +   (u64)virt_to_phys(capab_be));
> + if (rc)
> + return rc;
> +
> + capab->user_mode = capab_be->user_mode;
> + if (!(capab->user_mode & VAS_COPY_PASTE_USER_MODE)) {
> + pr_err("User space COPY/PASTE is not supported\n");
> + return -ENOTSUPP;
> + }
> +
> + snprintf(capab->name, VAS_DESCR_LEN + 1, "%.8s",
> +  (char *)&capab_be->descriptor);
> + capab->descriptor = be64_to_cpu(capab_be->descriptor);
> + capab->win_type = capab_be->win_type;
> + if (capab->win_type >= VAS_MAX_FEAT_TYPE) {
> + pr_err("Unsupported window type %u\n", capab->win_type);
> + return -EINVAL;
> + }
> + capab->max_lpar_creds = be16_to_cpu(capab_be->max_lpar_creds);
> + capab->max_win_creds = be16_to_cpu(capab_be->max_win_creds);
> + atomic_set(&capab->target_lpar_creds,
> +be16_to_cpu(capab_be->target_lpar_creds));
> + if (feat == VAS_GZIP_DEF_FEAT) {
> + capab->def_lpar_creds = be16_to_cpu(capab_be->def_lpar_creds);
> +
> + if (capab->max_win_creds < DEF_WIN_CREDS) {
> + pr_err("Window creds(%u) > max allowed window 
> creds(%u)\n",
> +DEF_WIN_CREDS, capab->max_win_creds);
> + return -EINVAL;
> + }
> + }
> +
> + copypaste_feat = 1;
> +
> + return 0;
> +}
> +
> +static int __init pseries_vas_init(void)
> +{
> + struct vas_ct_capabs_be *ct_capabs_be;
> + struct vas_all_capabs_be *capabs_be;
> + int rc;
> +
> + /*
> +  * Linux supports user space COPY/PASTE only with Radix
> +  */
> + if (!radix_enabled()) {
> + pr_err("API is supported only with radix page tables\n");
> + return -ENOTSUPP;
> + }
> +
> + capabs_be = kmalloc(sizeof(*capabs_be), GFP_KERNEL);
> + if (!capabs_be)
> + return -ENOMEM;
> + /*
> +  * Get VAS overall capabilities by passing 0 to feature type.
> +  */
> + rc = plpar_vas_query_capabilities(H_QUERY_VAS_CAPABILITIES, 0,
> +   (u64)virt_to_phys(capabs_be));
> + if (rc)
> + goto out;
> +
> + snprintf(capabs_all.name, VAS_DESCR_LEN, "%.7s",
> +  (char *)&capabs_be->descriptor);
> + capabs_all.descriptor = be64_to_cpu(capabs_be->descriptor);
> + capabs_all.feat_type = be64_to_cpu(capabs_be->feat_type);
> +
> 

Re: Kernel crosscompilers

2021-05-10 Thread Michael Ellerman
Christophe Leroy  writes:
> Le 08/05/2021 à 08:46, Christophe Leroy a écrit :
>> Le 06/05/2021 à 16:17, Arnd Bergmann a écrit :
>>> On Thu, May 6, 2021 at 2:42 PM Christophe Leroy
>>>  wrote:

 Hello Arnd,

 May I ask you whether you plan to build cross compilers based on GCC 11.1 
 at
 https://mirrors.edge.kernel.org/pub/tools/crosstool/ ?
>>>
>>> Hi Christophe,
>>>
>>> I've built a snapshot a few days before the release, that one is
>>> identical to 11.1
>>> except for the reported version number. I've tried to ask around for
>>> help testing
>>> this, but so far I have not heard from anyone.
>>>
>>> Building a new set of compilers takes around a day on my build box, so I 
>>> want
>>> to make sure I don't have to do it more often than necessary. If you are 
>>> able
>>> to give the binaries a spin, preferably on a ppc64le or arm64 host, please 
>>> let
>>> me know how it goes and I'll rebuilt them on the release tag.
>>>
>> 
>> Hi Arnd,
>> 
>> I don't have any ppc or arm host I can build on.
>> I'm building on x86 for powerpc embedded boards.
>> 
>> I have tried your GCC 11 snapshot, I get something booting but it crashes 
>> when launching init.
>> 
>> [    7.368410] init[1]: bad frame in sys_sigreturn: 7fb2fd60 nip 001083cc lr 
>> 001083c4
>> [    7.376283] Kernel panic - not syncing: Attempted to kill init! 
>> exitcode=0x000b
>> [    7.383680] CPU: 0 PID: 1 Comm: init Not tainted 
>> 5.12.0-s3k-dev-16316-g9e799d5df185 #5054
>> [    7.391767] Call Trace:
>> [    7.394174] [c9023db0] [c00211e8] panic+0x130/0x304 (unreliable)
>> [    7.400112] [c9023e10] [c0024e68] do_exit+0x874/0x910
>> [    7.405104] [c9023e50] [c0024f80] do_group_exit+0x40/0xc4
>> [    7.410440] [c9023e60] [c004] get_signal+0x1d8/0x93c
>> [    7.415689] [c9023ec0] [c0007f34] do_notify_resume+0x6c/0x314
>> [    7.421369] [c9023f20] [c000d580] syscall_exit_prepare+0x120/0x184
>> [    7.427479] [c9023f30] [c001101c] ret_from_syscall+0xc/0x28
>> 
>> Something is going wrong with asm goto output. I implemented get_user() 
>> helpers with asm goto this 
>> cycle (commit 5cd29b1fd3e8). I tested it with CLANG before submitting, it 
>> was working.
>> 
>> Seems like there is something wrong with it with GCC11. When forcing 
>> CONFIG_CC_HAS_ASM_GOTO_OUTPUT 
>> to 'n', the kernel boots ok.
>> 
>
> I found the problem, that's due to r10 register being reused by GCC in the 
> copy loop below:
>
>10:7d 09 03 a6 mtctr   r8
>14:80 ca 00 00 lwz r6,0(r10)
>18:80 ea 00 04 lwz r7,4(r10)
>1c:90 c9 00 08 stw r6,8(r9)
>20:90 e9 00 0c stw r7,12(r9)
>24:39 0a 00 08 addir8,r10,8
>28:39 29 00 08 addir9,r9,8
> =>2c: 81 4a 00 08 lwz r10,8(r10)
>30:81 6a 00 0c lwz r11,12(r10)
>34:91 49 00 08 stw r10,8(r9)
>38:91 69 00 0c stw r11,12(r9)
>3c:39 48 00 08 addir10,r8,8
>40:39 29 00 08 addir9,r9,8
>44:42 00 ff d0 bdnz14 <__unsafe_restore_general_regs+0x14>
>
> earlyclobber modifier is missing in the CONFIG_CC_HAS_ASM_GOTO_OUTPUT version 
> of __get_user_asm2_goto().
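A minimal sketch of that kind of fix, based on the __get_user_asm2_goto()
definition in arch/powerpc/include/asm/uaccess.h with an earlyclobber ("&")
added to the output constraint so GCC cannot reuse the address register for
the first half of the output; the constraints in the merged patch may differ:

#define __get_user_asm2_goto(x, addr, label)			\
	asm_volatile_goto(					\
		"1:	lwz%X1 %0, %1\n"			\
		"2:	lwz%X1 %L0, %L1\n"			\
		EX_TABLE(1b, %l2)				\
		EX_TABLE(2b, %l2)				\
		: "=&r" (x)					\
		: "m" (*addr)					\
		:						\
		: label)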

Thanks for tracking that down. I hit it last week when testing Arnd's
compilers but hadn't had time to find the root cause.

cheers


Re: [V3 PATCH 08/16] powerpc/pseries/VAS: Implement allocate/modify/deallocate HCALLS

2021-05-10 Thread Nicholas Piggin
Excerpts from Haren Myneni's message of April 18, 2021 7:07 am:
> 
> This patch adds the following HCALLs which are used to allocate,
> modify and deallocate VAS windows.

I don't like to be nitpicky about the language, but this is adding the 
hcall wrappers. Implementing the hcall would be adding it to KVM. 
Otherwise looks okay I think.

Reviewed-by: Nicholas Piggin 

> H_ALLOCATE_VAS_WINDOW: Allocate VAS window
> H_DEALLOCATE_VAS_WINDOW: Close VAS window
> H_MODIFY_VAS_WINDOW: Setup window before using
> 
> Also adds phyp call (H_QUERY_VAS_CAPABILITIES) to get all VAS
> capabilities that phyp provides.

"PAPR hcall to get VAS capabilities provided by the hypervisor"

Thanks,
Nick

> 
> Signed-off-by: Haren Myneni 
> ---
>  arch/powerpc/platforms/pseries/vas.c | 217 +++
>  1 file changed, 217 insertions(+)
>  create mode 100644 arch/powerpc/platforms/pseries/vas.c
> 
> diff --git a/arch/powerpc/platforms/pseries/vas.c 
> b/arch/powerpc/platforms/pseries/vas.c
> new file mode 100644
> index ..06960151477c
> --- /dev/null
> +++ b/arch/powerpc/platforms/pseries/vas.c
> @@ -0,0 +1,217 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Copyright 2020-21 IBM Corp.
> + */
> +
> +#define pr_fmt(fmt) "vas: " fmt
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include "vas.h"
> +
> +#define  VAS_INVALID_WIN_ADDRESS 0xFFFFFFFFFFFFFFFFul
> +#define  VAS_DEFAULT_DOMAIN_ID   0xFFFFFFFFFFFFFFFFul
> +/* Authority Mask Register (AMR) value is not supported in */
> +/* linux implementation. So pass '0' to modify window HCALL */
> +#define  VAS_AMR_VALUE   0
> +/* phyp allows one credit per window right now */
> +#define DEF_WIN_CREDS1
> +
> +static int64_t hcall_return_busy_check(int64_t rc)
> +{
> + /* Check if we are stalled for some time */
> + if (H_IS_LONG_BUSY(rc)) {
> + msleep(get_longbusy_msecs(rc));
> + rc = H_BUSY;
> + } else if (rc == H_BUSY) {
> + cond_resched();
> + }
> +
> + return rc;
> +}
> +
> +/*
> + * Allocate VAS window HCALL
> + */
> +static int plpar_vas_allocate_window(struct vas_window *win, u64 *domain,
> +  u8 wintype, u16 credits)
> +{
> + long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
> + int64_t rc;
> +
> + do {
> + rc = plpar_hcall9(H_ALLOCATE_VAS_WINDOW, retbuf, wintype,
> +   credits, domain[0], domain[1], domain[2],
> +   domain[3], domain[4], domain[5]);
> +
> + rc = hcall_return_busy_check(rc);
> + } while (rc == H_BUSY);
> +
> + switch (rc) {
> + case H_SUCCESS:
> + win->winid = retbuf[0];
> + win->lpar.win_addr = retbuf[1];
> + win->lpar.complete_irq = retbuf[2];
> + win->lpar.fault_irq = retbuf[3];
> + if (win->lpar.win_addr == VAS_INVALID_WIN_ADDRESS) {
> + pr_err("HCALL(%x): COPY/PASTE is not supported\n",
> + H_ALLOCATE_VAS_WINDOW);
> + return -ENOTSUPP;
> + }
> + return 0;
> + case H_PARAMETER:
> + pr_err("HCALL(%x): Invalid window type (%u)\n",
> + H_ALLOCATE_VAS_WINDOW, wintype);
> + return -EINVAL;
> + case H_P2:
> + pr_err("HCALL(%x): Credits(%u) exceed maximum window credits\n",
> + H_ALLOCATE_VAS_WINDOW, credits);
> + return -EINVAL;
> + case H_COP_HW:
> + pr_err("HCALL(%x): User-mode COPY/PASTE is not supported\n",
> + H_ALLOCATE_VAS_WINDOW);
> + return -ENOTSUPP;
> + case H_RESOURCE:
> + pr_err("HCALL(%x): LPAR credit limit exceeds window limit\n",
> + H_ALLOCATE_VAS_WINDOW);
> + return -EPERM;
> + case H_CONSTRAINED:
> + pr_err("HCALL(%x): Credits (%u) are not available\n",
> + H_ALLOCATE_VAS_WINDOW, credits);
> + return -EPERM;
> + default:
> + pr_err("HCALL(%x): Unexpected error %lld\n",
> + H_ALLOCATE_VAS_WINDOW, rc);
> + return -EIO;
> + }
> +}
> +
> +/*
> + * Deallocate VAS window HCALL.
> + */
> +static int plpar_vas_deallocate_window(u64 winid)
> +{
> + int64_t rc;
> +
> + do {
> + rc = plpar_hcall_norets(H_DEALLOCATE_VAS_WINDOW, winid);
> +
> + rc = hcall_return_busy_check(rc);
> + } while (rc == H_BUSY);
> +
> + switch (rc) {
> + case H_SUCCESS:
> + return 0;
> + case H_PARAMETER:
> + pr_err("HCALL(%x): Invalid window ID %llu\n",
> + H_DEALLOCATE_VAS_WINDOW, winid);
> + return -EINVAL;
> + case H_STATE:
> +