[tip: x86/cleanups] x86/nmi: Remove edac.h include leftover

2020-05-15 Thread tip-bot2 for Borislav Petkov
The following commit has been merged into the x86/cleanups branch of tip:

Commit-ID: 6255c161a08564e4f3995db31f3d64a5fd24738b
Gitweb:
https://git.kernel.org/tip/6255c161a08564e4f3995db31f3d64a5fd24738b
Author:Borislav Petkov 
AuthorDate:Fri, 15 May 2020 20:21:21 +02:00
Committer: Borislav Petkov 
CommitterDate: Sat, 16 May 2020 07:47:57 +02:00

x86/nmi: Remove edac.h include leftover

... which

  db47d5f85646 ("x86/nmi, EDAC: Get rid of DRAM error reporting thru PCI SERR 
NMI")

forgot to remove.

No functional changes.

Signed-off-by: Borislav Petkov 
Link: https://lkml.kernel.org/r/20200515182246.3553-1...@alien8.de
---
 arch/x86/kernel/nmi.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 6407ea2..bdcc514 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -25,10 +25,6 @@
 #include 
 #include 
 
-#if defined(CONFIG_EDAC)
-#include 
-#endif
-
 #include 
 #include 
 #include 


Re: mmotm 2020-05-15-16-29 uploaded

2020-05-15 Thread Stephen Rothwell
Hi Andrew,

On Fri, 15 May 2020 16:30:18 -0700 Andrew Morton  
wrote:
>
> * mm-introduce-external-memory-hinting-api.patch

The above patch should have

#define __NR_process_madvise 443

not 442, in arch/arm64/include/asm/unistd32.h

and

 442common  fsinfo  sys_fsinfo
+443common  process_madvise sys_process_madvise

in arch/microblaze/kernel/syscalls/syscall.tbl

> * mm-introduce-external-memory-hinting-api-fix.patch

The above patch should have

#define __NR_process_madvise 443

not 442

> * mm-support-vector-address-ranges-for-process_madvise-fix.patch

The above patch should have

#define __NR_process_madvise 443

not 442 in arch/arm64/include/asm/unistd32.h

-- 
Cheers,
Stephen Rothwell


pgpTuOvbopPEg.pgp
Description: OpenPGP digital signature


[PATCH v5 1/2] powerpc/rtas: Move type/struct definitions from rtas.h into rtas-types.h

2020-05-15 Thread Leonardo Bras
In order to get any rtas* struct into other headers, including rtas.h
may cause a lot of errors, regarding include dependency needed for
inline functions.

Create rtas-types.h and move there all type/struct definitions
from rtas.h, then include rtas-types.h into rtas.h.

Also, as suggested by checkpatch.pl, replace uint8_t with u8, and keep
the same type pattern for the whole file, as they are the same
according to powerpc/boot/types.h.

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/rtas-types.h | 126 ++
 arch/powerpc/include/asm/rtas.h   | 118 +---
 2 files changed, 127 insertions(+), 117 deletions(-)
 create mode 100644 arch/powerpc/include/asm/rtas-types.h

diff --git a/arch/powerpc/include/asm/rtas-types.h 
b/arch/powerpc/include/asm/rtas-types.h
new file mode 100644
index ..87354e28f160
--- /dev/null
+++ b/arch/powerpc/include/asm/rtas-types.h
@@ -0,0 +1,126 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _POWERPC_RTAS_TYPES_H
+#define _POWERPC_RTAS_TYPES_H
+#ifdef __KERNEL__
+
+#include 
+
+typedef __be32 rtas_arg_t;
+
+struct rtas_args {
+   __be32 token;
+   __be32 nargs;
+   __be32 nret;
+   rtas_arg_t args[16];
+   rtas_arg_t *rets; /* Pointer to return values in args[]. */
+};
+
+struct rtas_t {
+   unsigned long entry;/* physical address pointer */
+   unsigned long base; /* physical address pointer */
+   unsigned long size;
+   arch_spinlock_t lock;
+   struct rtas_args args;
+   struct device_node *dev;/* virtual address pointer */
+};
+
+struct rtas_suspend_me_data {
+   atomic_t working; /* number of cpus accessing this struct */
+   atomic_t done;
+   int token; /* ibm,suspend-me */
+   atomic_t error;
+   struct completion *complete; /* wait on this until working == 0 */
+};
+
+struct rtas_error_log {
+   /* Byte 0 */
+   u8  byte0;  /* Architectural version */
+
+   /* Byte 1 */
+   u8  byte1;
+   /* 
+* XXX  3: Severity level of error
+*XX2: Degree of recovery
+*  X   1: Extended log present?
+*   XX 2: Reserved
+*/
+
+   /* Byte 2 */
+   u8  byte2;
+   /* 
+*  4: Initiator of event
+*  4: Target of failed operation
+*/
+   u8  byte3;  /* General event or error*/
+   __be32  extended_log_length;/* length in bytes */
+   unsigned char   buffer[1];  /* Start of extended log */
+   /* Variable length.  */
+};
+
+/* RTAS general extended event log, Version 6. The extended log starts
+ * from "buffer" field of struct rtas_error_log defined above.
+ */
+struct rtas_ext_event_log_v6 {
+   /* Byte 0 */
+   u8 byte0;
+   /* 
+* X1: Log valid
+*  X   1: Unrecoverable error
+*   X  1: Recoverable (correctable or successfully retried)
+*X 1: Bypassed unrecoverable error (degraded operation)
+* X1: Predictive error
+*  X   1: "New" log (always 1 for data returned from RTAS)
+*   X  1: Big Endian
+*X 1: Reserved
+*/
+
+   /* Byte 1 */
+   u8 byte1;   /* reserved */
+
+   /* Byte 2 */
+   u8 byte2;
+   /* 
+* X1: Set to 1 (indicating log is in PowerPC format)
+*  XXX 3: Reserved
+*  4: Log format used for bytes 12-2047
+*/
+
+   /* Byte 3 */
+   u8 byte3;   /* reserved */
+   /* Byte 4-11 */
+   u8 reserved[8]; /* reserved */
+   /* Byte 12-15 */
+   __be32  company_id; /* Company ID of the company*/
+   /* that defines the format for  */
+   /* the vendor specific log type */
+   /* Byte 16-end of log */
+   u8 vendor_log[1];   /* Start of vendor specific log */
+   /* Variable length. */
+};
+
+/* Vendor specific Platform Event Log Format, Version 6, section header */
+struct pseries_errorlog {
+   __be16 id;  /* 0x00 2-byte ASCII section ID */
+   __be16 length;  /* 0x02 Section length in bytes */
+   u8 version; /* 0x04 Section version */
+   u8 subtype; /* 0x05 Section subtype */
+   __be16 creator_component;   /* 0x06 Creator component ID*/
+   u8 data[];  /* 0x08 Start of section data   */
+};
+
+/* RTAS pseries hotplug errorlog section */
+struct 

[PATCH v5 2/2] powerpc/rtas: Implement reentrant rtas call

2020-05-15 Thread Leonardo Bras
Implement rtas_call_reentrant() for reentrant rtas-calls:
"ibm,int-on", "ibm,int-off", "ibm,get-xive" and "ibm,set-xive".

On LoPAPR Version 1.1 (March 24, 2016), from 7.3.10.1 to 7.3.10.4,
items 2 and 3 say:

2 - For the PowerPC External Interrupt option: The * call must be
reentrant to the number of processors on the platform.
3 - For the PowerPC External Interrupt option: The * argument call
buffer for each simultaneous call must be physically unique.

So, these rtas-calls can be called in a lockless way, if using
a different buffer for each cpu doing such rtas call.

For this, it was suggested to add the buffer (struct rtas_args)
in the PACA struct, so each cpu can have its own buffer.
The PACA struct received a pointer to rtas buffer, which is
allocated in the memory range available to rtas 32-bit.

Reentrant rtas calls are useful to avoid deadlocks in crashing,
where rtas-calls are needed, but some other thread crashed holding
the rtas.lock.

This is a backtrace of a deadlock from a kdump testing environment:

  #0 arch_spin_lock
  #1  lock_rtas ()
  #2  rtas_call (token=8204, nargs=1, nret=1, outputs=0x0)
  #3  ics_rtas_mask_real_irq (hw_irq=4100)
  #4  machine_kexec_mask_interrupts
  #5  default_machine_crash_shutdown
  #6  machine_crash_shutdown
  #7  __crash_kexec
  #8  crash_kexec
  #9  oops_end

Signed-off-by: Leonardo Bras 
---
 arch/powerpc/include/asm/paca.h |  2 ++
 arch/powerpc/include/asm/rtas.h |  1 +
 arch/powerpc/kernel/paca.c  | 20 +++
 arch/powerpc/kernel/rtas.c  | 52 +
 arch/powerpc/sysdev/xics/ics-rtas.c | 22 ++--
 5 files changed, 86 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index e3cc9eb9204d..87cd9c2220cc 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -270,6 +271,7 @@ struct paca_struct {
 #ifdef CONFIG_MMIOWB
struct mmiowb_state mmiowb_state;
 #endif
+   struct rtas_args *reentrant_args;
 } cacheline_aligned;
 
 extern void copy_mm_to_paca(struct mm_struct *mm);
diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
index c35c5350b7e4..fa7509c85881 100644
--- a/arch/powerpc/include/asm/rtas.h
+++ b/arch/powerpc/include/asm/rtas.h
@@ -236,6 +236,7 @@ extern struct rtas_t rtas;
 extern int rtas_token(const char *service);
 extern int rtas_service_present(const char *service);
 extern int rtas_call(int token, int, int, int *, ...);
+int rtas_call_reentrant(int token, int nargs, int nret, int *outputs, ...);
 void rtas_call_unlocked(struct rtas_args *args, int token, int nargs,
int nret, ...);
 extern void __noreturn rtas_restart(char *cmd);
diff --git a/arch/powerpc/kernel/paca.c b/arch/powerpc/kernel/paca.c
index 3f91ccaa9c74..88c9b61489fc 100644
--- a/arch/powerpc/kernel/paca.c
+++ b/arch/powerpc/kernel/paca.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "setup.h"
 
@@ -164,6 +165,23 @@ static struct slb_shadow * __init new_slb_shadow(int cpu, 
unsigned long limit)
 
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
+/**
+ * new_rtas_args() - Allocates rtas args
+ * @cpu:   CPU number
+ * @limit: Memory limit for this allocation
+ *
+ * Allocates a struct rtas_args and return it's pointer.
+ *
+ * Return: Pointer to allocated rtas_args
+ */
+static struct rtas_args * __init new_rtas_args(int cpu, unsigned long limit)
+{
+   limit = min_t(unsigned long, limit, RTAS_INSTANTIATE_MAX);
+
+   return alloc_paca_data(sizeof(struct rtas_args), L1_CACHE_BYTES,
+  limit, cpu);
+}
+
 /* The Paca is an array with one entry per processor.  Each contains an
  * lppaca, which contains the information shared between the
  * hypervisor and Linux.
@@ -202,6 +220,7 @@ void __init __nostackprotector initialise_paca(struct 
paca_struct *new_paca, int
/* For now -- if we have threads this will be adjusted later */
new_paca->tcd_ptr = _paca->tcd;
 #endif
+   new_paca->reentrant_args = NULL;
 }
 
 /* Put the paca pointer into r13 and SPRG_PACA */
@@ -274,6 +293,7 @@ void __init allocate_paca(int cpu)
 #ifdef CONFIG_PPC_BOOK3S_64
paca->slb_shadow_ptr = new_slb_shadow(cpu, limit);
 #endif
+   paca->reentrant_args = new_rtas_args(cpu, limit);
paca_struct_size += sizeof(struct paca_struct);
 }
 
diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index c5fa251b8950..6e22eb4fc0e7 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -41,6 +41,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /* This is here deliberately so it's only used in this file */
 void enter_rtas(unsigned long);
@@ -483,6 +484,57 @@ int rtas_call(int token, int nargs, int nret, int 
*outputs, ...)
 }
 EXPORT_SYMBOL(rtas_call);
 
+/**
+ * rtas_call_reentrant() - Used for reentrant 

[PATCH v5 0/2] Implement reentrant rtas call

2020-05-15 Thread Leonardo Bras
Patch 2 implement rtas_call_reentrant() for reentrant rtas-calls:
"ibm,int-on", "ibm,int-off", "ibm,get-xive" and "ibm,set-xive",
according to LoPAPR Version 1.1 (March 24, 2016).

For that, it's necessary that every call uses a different
rtas buffer (rtas_args). Paul Mackerras suggested using the PACA
structure for creating a per-cpu buffer for these calls.

Patch 1 was necessary to make PACA have a 'struct rtas_args' member.

Reentrant rtas calls can be useful to avoid deadlocks in crashing,
where rtas-calls are needed, but some other thread crashed holding
the rtas.lock.

This is a backtrace of a deadlock from a kdump testing environment:

  #0 arch_spin_lock
  #1  lock_rtas () 
  #2  rtas_call (token=8204, nargs=1, nret=1, outputs=0x0)
  #3  ics_rtas_mask_real_irq (hw_irq=4100) 
  #4  machine_kexec_mask_interrupts
  #5  default_machine_crash_shutdown
  #6  machine_crash_shutdown 
  #7  __crash_kexec
  #8  crash_kexec
  #9  oops_end

Signed-off-by: Leonardo Bras 

---
Changes since v4:
- Instead of having the full buffer on PACA, adds only a pointer and
  allocate it during allocate_paca(), making sure it's in a memory
  range available for RTAS (32-bit). (Thanks Nick Piggin!)

Changes since v3:
- Adds protection from preemption and interruption

Changes since v2:
- Fixed build failure from ppc64e, by including spinlock_types.h on 
  rtas-types.h
- Improved commit messages

Changes since v1:
- Moved buffer from stack to PACA (as suggested by Paul Mackerras)
- Added missing output bits
- Improve documentation following kernel-doc format (as suggested by
  Nathan Lynch)


Leonardo Bras (2):
  powerpc/rtas: Move type/struct definitions from rtas.h into
rtas-types.h
  powerpc/rtas: Implement reentrant rtas call

 arch/powerpc/include/asm/paca.h   |   2 +
 arch/powerpc/include/asm/rtas-types.h | 126 ++
 arch/powerpc/include/asm/rtas.h   | 119 +---
 arch/powerpc/kernel/rtas.c|  42 +
 arch/powerpc/sysdev/xics/ics-rtas.c   |  22 ++---
 5 files changed, 183 insertions(+), 128 deletions(-)
 create mode 100644 arch/powerpc/include/asm/rtas-types.h

-- 
2.25.4



Re: [PATCH v2 04/15] bnxt: use new module_firmware_crashed()

2020-05-15 Thread Vasundhara Volam
On Sat, May 16, 2020 at 3:00 AM Luis Chamberlain  wrote:
>
> This makes use of the new module_firmware_crashed() to help
> annotate when firmware for device drivers crash. When firmware
> crashes devices can sometimes become unresponsive, and recovery
> sometimes requires a driver unload / reload and in the worst cases
> a reboot.
>
> Using a taint flag allows us to annotate when this happens clearly.
>
> Cc: Michael Chan 
> Signed-off-by: Luis Chamberlain 
> ---
>  drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c 
> b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
> index dd0c3f227009..5ba1bd0734e9 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
> @@ -3503,6 +3503,7 @@ static int bnxt_get_dump_data(struct net_device *dev, 
> struct ethtool_dump *dump,
>
> dump->flag = bp->dump_flag;
> if (dump->flag == BNXT_DUMP_CRASH) {
> +   module_firmware_crashed();
This is not the right place to annotate the taint flag.

Here the driver is just copying the dump after error recovery which is collected
by firmware to DDR, when firmware detects fatal conditions. Driver and firmware
will be healthy when the user calls this command.

Also, users can call this command a thousand times when there is no crash.

I will propose a patch to use this wrapper in the error recovery path,
where the driver
may not be able to recover.

>  #ifdef CONFIG_TEE_BNXT_FW
> return tee_bnxt_copy_coredump(buf, 0, dump->len);
>  #endif
> --
> 2.26.2
>
Nacked-by: Vasundhara Volam 


Re: [PATCH V2 2/2] rbtree_latch: don't need to check seq when it found a node

2020-05-15 Thread Michel Lespinasse
On Fri, May 15, 2020 at 9:52 PM Lai Jiangshan
 wrote:
>
> On Sat, May 16, 2020 at 12:28 PM Michel Lespinasse  wrote:
> >
> > On Fri, May 15, 2020 at 03:59:09PM +, Lai Jiangshan wrote:
> > > latch_tree_find() should be protected by caller via RCU or so.
> > > When it find a node in an attempt, the node must be a valid one
> > > in RCU's point's of view even the tree is (being) updated with a
> > > new node with the same key which is entirely subject to timing
> > > anyway.
> >
> > I'm not sure I buy this. Even if we get a valid node, is it the one we
> > were searching for ? I don't see how this could be guaranteed if the
> > read raced with a tree rebalancing.
>
> It is valid because ops->comp() returns 0 and it should be
> the one we were searching for unless ops->comp() is wrong.
> The searched one could be possible just deleted, but it is still
> a legitimate searched result in RCU's point's of view.
>
> A tree rebalancing can cause a searching fails to find
> an existing target. This is the job of read_seqcount_retry()
> to tell you to retry.

Ah, yes, this is correct. It wouldn't work if we wanted to return the
next higher key for example, but it does work for exact matches. Nice!

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.


Re: [PATCH V2 2/2] rbtree_latch: don't need to check seq when it found a node

2020-05-15 Thread Lai Jiangshan
On Sat, May 16, 2020 at 12:28 PM Michel Lespinasse  wrote:
>
> On Fri, May 15, 2020 at 03:59:09PM +, Lai Jiangshan wrote:
> > latch_tree_find() should be protected by caller via RCU or so.
> > When it find a node in an attempt, the node must be a valid one
> > in RCU's point's of view even the tree is (being) updated with a
> > new node with the same key which is entirely subject to timing
> > anyway.
>
> I'm not sure I buy this. Even if we get a valid node, is it the one we
> were searching for ? I don't see how this could be guaranteed if the
> read raced with a tree rebalancing.

It is valid because ops->comp() returns 0 and it should be
the one we were searching for unless ops->comp() is wrong.
The searched one could possibly have just been deleted, but it is still
a legitimate search result from RCU's point of view.

A tree rebalancing can cause a search to fail to find
an existing target. This is the job of read_seqcount_retry()
to tell you to retry.

>
> --
> Michel "Walken" Lespinasse
> A program is never fully debugged until the last user dies.


[GIT PULL] io_uring fixes for 5.7-rc6

2020-05-15 Thread Jens Axboe
Hi Linus,

Two small fixes that should go into this release:

- Check and handle zero length splice (Pavel)

- Fix a regression in this merge window for fixed files used with polled
  block IO.

Please pull!


  git://git.kernel.dk/linux-block.git tags/io_uring-5.7-2020-05-15



Jens Axboe (1):
  io_uring: polled fixed file must go through free iteration

Pavel Begunkov (1):
  io_uring: fix zero len do_splice()

 fs/io_uring.c | 17 -
 1 file changed, 8 insertions(+), 9 deletions(-)

-- 
Jens Axboe



Re: [PATCH] drivers: block: use set_current_state macro

2020-05-15 Thread Chaitanya Kulkarni
On 05/07/2020 12:20 AM, Xu Wang wrote:
> Use set_current_state macro instead of current->state = TASK_RUNNING.
>
> Signed-off-by: Xu Wang

Reviewed-by: Chaitanya Kulkarni 


Re: [PATCH V2 2/2] rbtree_latch: don't need to check seq when it found a node

2020-05-15 Thread Michel Lespinasse
On Fri, May 15, 2020 at 03:59:09PM +, Lai Jiangshan wrote:
> latch_tree_find() should be protected by caller via RCU or so.
> When it find a node in an attempt, the node must be a valid one
> in RCU's point's of view even the tree is (being) updated with a
> new node with the same key which is entirely subject to timing
> anyway.

I'm not sure I buy this. Even if we get a valid node, is it the one we
were searching for ? I don't see how this could be guaranteed if the
read raced with a tree rebalancing.

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.


[PATCH] crypto: caam - make soc match data optional

2020-05-15 Thread Andrey Smirnov
Vybrid devices don't have any clocks that need to be taken care of, so
make clock data optional on i.MX.

Signed-off-by: Andrey Smirnov 
Cc: Chris Healy 
Cc: Horia Geantă 
Cc: Herbert Xu 
Cc: Fabio Estevam 
Cc: linux-...@nxp.com
Cc: linux-cry...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/crypto/caam/ctrl.c | 7 +--
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/crypto/caam/ctrl.c b/drivers/crypto/caam/ctrl.c
index 4fcdd262e581..6aba430793cc 100644
--- a/drivers/crypto/caam/ctrl.c
+++ b/drivers/crypto/caam/ctrl.c
@@ -630,12 +630,7 @@ static int caam_probe(struct platform_device *pdev)
imx_soc_match = soc_device_match(caam_imx_soc_table);
caam_imx = (bool)imx_soc_match;

-   if (imx_soc_match) {
-   if (!imx_soc_match->data) {
-   dev_err(dev, "No clock data provided for i.MX SoC");
-   return -EINVAL;
-   }
-
+   if (imx_soc_match && imx_soc_match->data) {
ret = init_clocks(dev, imx_soc_match->data);
if (ret)
return ret;
--
2.21.3


Re: [PATCH v2 15/15] mwl8k: use new module_firmware_crashed()

2020-05-15 Thread Rafael Aquini
On Fri, May 15, 2020 at 09:28:46PM +, Luis Chamberlain wrote:
> This makes use of the new module_firmware_crashed() to help
> annotate when firmware for device drivers crash. When firmware
> crashes devices can sometimes become unresponsive, and recovery
> sometimes requires a driver unload / reload and in the worst cases
> a reboot.
> 
> Using a taint flag allows us to annotate when this happens clearly.
> 
> Cc: linux-wirel...@vger.kernel.org
> Cc: Lennert Buytenhek 
> Cc: Kalle Valo 
> Cc: "Gustavo A. R. Silva" 
> Cc: Johannes Berg 
> Cc: Ganapathi Bhat 
> Signed-off-by: Luis Chamberlain 
> ---
>  drivers/net/wireless/marvell/mwl8k.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/net/wireless/marvell/mwl8k.c 
> b/drivers/net/wireless/marvell/mwl8k.c
> index 97f23f93f6e7..d609ef1bb879 100644
> --- a/drivers/net/wireless/marvell/mwl8k.c
> +++ b/drivers/net/wireless/marvell/mwl8k.c
> @@ -1551,6 +1551,7 @@ static int mwl8k_tx_wait_empty(struct ieee80211_hw *hw)
>* the firmware has crashed
>*/
>   if (priv->hw_restart_in_progress) {
> + module_firmware_crashed();
>   if (priv->hw_restart_owner == current)
>   return 0;
>   else
> -- 
> 2.26.2
> 
Acked-by: Rafael Aquini 



Re: [PATCH v2 14/15] brcm80211: use new module_firmware_crashed()

2020-05-15 Thread Rafael Aquini
On Fri, May 15, 2020 at 09:28:45PM +, Luis Chamberlain wrote:
> This makes use of the new module_firmware_crashed() to help
> annotate when firmware for device drivers crash. When firmware
> crashes devices can sometimes become unresponsive, and recovery
> sometimes requires a driver unload / reload and in the worst cases
> a reboot.
> 
> Using a taint flag allows us to annotate when this happens clearly.
> 
> Cc: linux-wirel...@vger.kernel.org
> Cc: brcm80211-dev-list@broadcom.com
> Cc: brcm80211-dev-l...@cypress.com
> Cc: Arend van Spriel 
> Cc: Franky Lin 
> Cc: Hante Meuleman 
> Cc: Chi-Hsien Lin 
> Cc: Wright Feng 
> Cc: Kalle Valo 
> Cc: "Rafał Miłecki" 
> Cc: Pieter-Paul Giesberts 
> Signed-off-by: Luis Chamberlain 
> ---
>  drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c 
> b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c
> index c88655acc78c..d623f83568b3 100644
> --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c
> +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/core.c
> @@ -1393,6 +1393,7 @@ void brcmf_fw_crashed(struct device *dev)
>   struct brcmf_pub *drvr = bus_if->drvr;
>  
>   bphy_err(drvr, "Firmware has halted or crashed\n");
> + module_firmware_crashed();
>  
>   brcmf_dev_coredump(dev);
>  
> -- 
> 2.26.2
> 
Acked-by: Rafael Aquini 



Re: [PATCH v2 13/15] ath6kl: use new module_firmware_crashed()

2020-05-15 Thread Rafael Aquini
On Fri, May 15, 2020 at 09:28:44PM +, Luis Chamberlain wrote:
> This makes use of the new module_firmware_crashed() to help
> annotate when firmware for device drivers crash. When firmware
> crashes devices can sometimes become unresponsive, and recovery
> sometimes requires a driver unload / reload and in the worst cases
> a reboot.
> 
> Using a taint flag allows us to annotate when this happens clearly.
> 
> Cc: linux-wirel...@vger.kernel.org
> Cc: ath...@lists.infradead.org
> Cc: Kalle Valo 
> Signed-off-by: Luis Chamberlain 
> ---
>  drivers/net/wireless/ath/ath6kl/hif.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/net/wireless/ath/ath6kl/hif.c 
> b/drivers/net/wireless/ath/ath6kl/hif.c
> index d1942537ea10..cfd838607544 100644
> --- a/drivers/net/wireless/ath/ath6kl/hif.c
> +++ b/drivers/net/wireless/ath/ath6kl/hif.c
> @@ -120,6 +120,7 @@ static int ath6kl_hif_proc_dbg_intr(struct ath6kl_device 
> *dev)
>   int ret;
>  
>   ath6kl_warn("firmware crashed\n");
> + module_firmware_crashed();
>  
>   /*
>* read counter to clear the interrupt, the debug error interrupt is
> -- 
> 2.26.2
> 
Acked-by: Rafael Aquini 



Re: [PATCH v2 11/15] wimax/i2400m: use new module_firmware_crashed()

2020-05-15 Thread Rafael Aquini
On Fri, May 15, 2020 at 09:28:42PM +, Luis Chamberlain wrote:
> This makes use of the new module_firmware_crashed() to help
> annotate when firmware for device drivers crash. When firmware
> crashes devices can sometimes become unresponsive, and recovery
> sometimes requires a driver unload / reload and in the worst cases
> a reboot.
> 
> Using a taint flag allows us to annotate when this happens clearly.
> 
> Cc: linux-wi...@intel.com
> Cc: Inaky Perez-Gonzalez 
> Signed-off-by: Luis Chamberlain 
> ---
>  drivers/net/wimax/i2400m/rx.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/net/wimax/i2400m/rx.c b/drivers/net/wimax/i2400m/rx.c
> index c9fb619a9e01..307a62ca183f 100644
> --- a/drivers/net/wimax/i2400m/rx.c
> +++ b/drivers/net/wimax/i2400m/rx.c
> @@ -712,6 +712,7 @@ void __i2400m_roq_queue(struct i2400m *i2400m, struct 
> i2400m_roq *roq,
>   dev_err(dev, "SW BUG? failed to insert packet\n");
>   dev_err(dev, "ERX: roq %p [ws %u] skb %p nsn %d sn %u\n",
>   roq, roq->ws, skb, nsn, roq_data->sn);
> + module_firmware_crashed();
>   skb_queue_walk(>queue, skb_itr) {
>   roq_data_itr = (struct i2400m_roq_data *) _itr->cb;
>   nsn_itr = __i2400m_roq_nsn(roq, roq_data_itr->sn);
> -- 
> 2.26.2
> 
Acked-by: Rafael Aquini 



Re: [PATCH v2 12/15] ath10k: use new module_firmware_crashed()

2020-05-15 Thread Rafael Aquini
On Fri, May 15, 2020 at 09:28:43PM +, Luis Chamberlain wrote:
> This makes use of the new module_firmware_crashed() to help
> annotate when firmware for device drivers crash. When firmware
> crashes devices can sometimes become unresponsive, and recovery
> sometimes requires a driver unload / reload and in the worst cases
> a reboot.
> 
> Using a taint flag allows us to annotate when this happens clearly.
> 
> Cc: linux-wirel...@vger.kernel.org
> Cc: ath...@lists.infradead.org
> Cc: Kalle Valo 
> Signed-off-by: Luis Chamberlain 
> ---
>  drivers/net/wireless/ath/ath10k/pci.c  | 2 ++
>  drivers/net/wireless/ath/ath10k/sdio.c | 2 ++
>  drivers/net/wireless/ath/ath10k/snoc.c | 1 +
>  3 files changed, 5 insertions(+)
> 
> diff --git a/drivers/net/wireless/ath/ath10k/pci.c 
> b/drivers/net/wireless/ath/ath10k/pci.c
> index 1d941d53fdc9..6bd0f3b518b9 100644
> --- a/drivers/net/wireless/ath/ath10k/pci.c
> +++ b/drivers/net/wireless/ath/ath10k/pci.c
> @@ -1767,6 +1767,7 @@ static void ath10k_pci_fw_dump_work(struct work_struct 
> *work)
>   scnprintf(guid, sizeof(guid), "n/a");
>  
>   ath10k_err(ar, "firmware crashed! (guid %s)\n", guid);
> + module_firmware_crashed();
>   ath10k_print_driver_info(ar);
>   ath10k_pci_dump_registers(ar, crash_data);
>   ath10k_ce_dump_registers(ar, crash_data);
> @@ -2837,6 +2838,7 @@ static int ath10k_pci_hif_power_up(struct ath10k *ar,
>   if (ret) {
>   if (ath10k_pci_has_fw_crashed(ar)) {
>   ath10k_warn(ar, "firmware crashed during chip reset\n");
> + module_firmware_crashed();
>   ath10k_pci_fw_crashed_clear(ar);
>   ath10k_pci_fw_crashed_dump(ar);
>   }
> diff --git a/drivers/net/wireless/ath/ath10k/sdio.c 
> b/drivers/net/wireless/ath/ath10k/sdio.c
> index e2aff2254a40..d34ad289380f 100644
> --- a/drivers/net/wireless/ath/ath10k/sdio.c
> +++ b/drivers/net/wireless/ath/ath10k/sdio.c
> @@ -794,6 +794,7 @@ static int ath10k_sdio_mbox_proc_dbg_intr(struct ath10k 
> *ar)
>  
>   /* TODO: Add firmware crash handling */
>   ath10k_warn(ar, "firmware crashed\n");
> + module_firmware_crashed();
>  
>   /* read counter to clear the interrupt, the debug error interrupt is
>* counter 0.
> @@ -915,6 +916,7 @@ static int ath10k_sdio_mbox_proc_cpu_intr(struct ath10k 
> *ar)
>   if (cpu_int_status & MBOX_CPU_STATUS_ENABLE_ASSERT_MASK) {
>   ath10k_err(ar, "firmware crashed!\n");
>   queue_work(ar->workqueue, >restart_work);
> + module_firmware_crashed();
>   }
>   return ret;
>  }
> diff --git a/drivers/net/wireless/ath/ath10k/snoc.c 
> b/drivers/net/wireless/ath/ath10k/snoc.c
> index 354d49b1cd45..7cfc123c345c 100644
> --- a/drivers/net/wireless/ath/ath10k/snoc.c
> +++ b/drivers/net/wireless/ath/ath10k/snoc.c
> @@ -1451,6 +1451,7 @@ void ath10k_snoc_fw_crashed_dump(struct ath10k *ar)
>   scnprintf(guid, sizeof(guid), "n/a");
>  
>   ath10k_err(ar, "firmware crashed! (guid %s)\n", guid);
> + module_firmware_crashed();
>   ath10k_print_driver_info(ar);
>   ath10k_msa_dump_memory(ar, crash_data);
>   mutex_unlock(>dump_mutex);
> -- 
> 2.26.2
> 
Acked-by: Rafael Aquini 



Re: [PATCH v2 09/15] qed: use new module_firmware_crashed()

2020-05-15 Thread Rafael Aquini
On Fri, May 15, 2020 at 09:28:40PM +, Luis Chamberlain wrote:
> This makes use of the new module_firmware_crashed() to help
> annotate when firmware for device drivers crash. When firmware
> crashes devices can sometimes become unresponsive, and recovery
> sometimes requires a driver unload / reload and in the worst cases
> a reboot.
> 
> Using a taint flag allows us to annotate when this happens clearly.
> 
> Cc: Ariel Elior 
> Cc: gr-everest-linux...@marvell.com
> Reviewed-by: Igor Russkikh 
> Signed-off-by: Luis Chamberlain 
> ---
>  drivers/net/ethernet/qlogic/qed/qed_mcp.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c 
> b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
> index 9624616806e7..aea200d465ef 100644
> --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c
> +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c
> @@ -566,6 +566,7 @@ _qed_mcp_cmd_and_union(struct qed_hwfn *p_hwfn,
>   DP_NOTICE(p_hwfn,
> "The MFW failed to respond to command 0x%08x [param 
> 0x%08x].\n",
> p_mb_params->cmd, p_mb_params->param);
> + module_firmware_crashed();
>   qed_mcp_print_cpu_info(p_hwfn, p_ptt);
>  
>   spin_lock_bh(_hwfn->mcp_info->cmd_lock);
> -- 
> 2.26.2
> 
Acked-by: Rafael Aquini 



Re: [PATCH v2 10/15] soc: qcom: ipa: use new module_firmware_crashed()

2020-05-15 Thread Rafael Aquini
On Fri, May 15, 2020 at 09:28:41PM +, Luis Chamberlain wrote:
> This makes use of the new module_firmware_crashed() to help
> annotate when firmware for device drivers crash. When firmware
> crashes devices can sometimes become unresponsive, and recovery
> sometimes requires a driver unload / reload and in the worst cases
> a reboot.
> 
> Using a taint flag allows us to annotate when this happens clearly.
> 
> Cc: Alex Elder 
> Signed-off-by: Luis Chamberlain 
> ---
>  drivers/net/ipa/ipa_modem.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/net/ipa/ipa_modem.c b/drivers/net/ipa/ipa_modem.c
> index ed10818dd99f..1790b87446ed 100644
> --- a/drivers/net/ipa/ipa_modem.c
> +++ b/drivers/net/ipa/ipa_modem.c
> @@ -285,6 +285,7 @@ static void ipa_modem_crashed(struct ipa *ipa)
>   struct device *dev = >pdev->dev;
>   int ret;
>  
> + module_firmware_crashed();
>   ipa_endpoint_modem_pause_all(ipa, true);
>  
>   ipa_endpoint_modem_hol_block_clear_all(ipa);
> -- 
> 2.26.2
> 
Acked-by: Rafael Aquini 



Re: [PATCH v2 08/15] ehea: use new module_firmware_crashed()

2020-05-15 Thread Rafael Aquini
On Fri, May 15, 2020 at 09:28:39PM +, Luis Chamberlain wrote:
> This makes use of the new module_firmware_crashed() to help
> annotate when firmware for device drivers crash. When firmware
> crashes devices can sometimes become unresponsive, and recovery
> sometimes requires a driver unload / reload and in the worst cases
> a reboot.
> 
> Using a taint flag allows us to annotate when this happens clearly.
> 
> Cc: Douglas Miller 
> Signed-off-by: Luis Chamberlain 
> ---
>  drivers/net/ethernet/ibm/ehea/ehea_main.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c 
> b/drivers/net/ethernet/ibm/ehea/ehea_main.c
> index 0273fb7a9d01..6ae35067003f 100644
> --- a/drivers/net/ethernet/ibm/ehea/ehea_main.c
> +++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c
> @@ -3285,6 +3285,8 @@ static void ehea_crash_handler(void)
>  {
>   int i;
>  
> + module_firmware_crashed();
> +
>   if (ehea_fw_handles.arr)
>   for (i = 0; i < ehea_fw_handles.num_entries; i++)
>   ehea_h_free_resource(ehea_fw_handles.arr[i].adh,
> -- 
> 2.26.2
> 
Acked-by: Rafael Aquini 



Re: [PATCH v2 07/15] cxgb4: use new module_firmware_crashed()

2020-05-15 Thread Rafael Aquini
On Fri, May 15, 2020 at 09:28:38PM +, Luis Chamberlain wrote:
> This makes use of the new module_firmware_crashed() to help
> annotate when firmware for device drivers crash. When firmware
> crashes devices can sometimes become unresponsive, and recovery
> sometimes requires a driver unload / reload and in the worst cases
> a reboot.
> 
> Using a taint flag allows us to annotate when this happens clearly.
> 
> Cc: Vishal Kulkarni 
> Signed-off-by: Luis Chamberlain 
> ---
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c 
> b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> index a70018f067aa..c67fc86c0e42 100644
> --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
> @@ -3646,6 +3646,7 @@ void t4_fatal_err(struct adapter *adap)
>* could be exposed to the adapter.  RDMA MWs for example...
>*/
>   t4_shutdown_adapter(adap);
> + module_firmware_crashed();
>   for_each_port(adap, port) {
>   struct net_device *dev = adap->port[port];
>  
> -- 
> 2.26.2
> 
Acked-by: Rafael Aquini 



Re: [PATCH v4 2/2] powerpc/rtas: Implement reentrant rtas call

2020-05-15 Thread Leonardo Bras
Hello Nick,

On Fri, 2020-05-15 at 17:30 +1000, Nicholas Piggin wrote:
> Excerpts from Leonardo Bras's message of May 15, 2020 9:51 am:
> > Implement rtas_call_reentrant() for reentrant rtas-calls:
> > "ibm,int-on", "ibm,int-off",ibm,get-xive" and  "ibm,set-xive".
> > 
> > On LoPAPR Version 1.1 (March 24, 2016), from 7.3.10.1 to 7.3.10.4,
> > items 2 and 3 say:
> > 
> > 2 - For the PowerPC External Interrupt option: The * call must be
> > reentrant to the number of processors on the platform.
> > 3 - For the PowerPC External Interrupt option: The * argument call
> > buffer for each simultaneous call must be physically unique.
> > 
> > So, these rtas-calls can be called in a lockless way, if using
> > a different buffer for each cpu doing such rtas call.
> 
> What about rtas_call_unlocked? Do the callers need to take the rtas 
> lock?
> 
> Machine checks must call ibm,nmi-interlock too, which we really don't 
> want to take a lock for either. Hopefully that's in a class of its own
> and we can essentially ignore with respect to other rtas calls.
> 
> The spec is pretty vague too :(
> 
> "The ibm,get-xive call must be reentrant to the number of processors on 
> the platform."
> 
> This suggests ibm,get-xive can be called concurrently by multiple
> processors. It doesn't say anything about being re-entrant against any 
> of the other re-entrant calls. Maybe that could be reasonably assumed,
> but I don't know if it's reasonable to assume it can be called 
> concurrently with a *non-reentrant* call, is it?

This was discussed on a previous version of the patchset:

https://lore.kernel.org/linuxppc-dev/875zcy2v8o@linux.ibm.com/

He checked with partition firmware development and these calls can be
used concurrently with arbitrary other RTAS calls.

> 
> > For this, it was suggested to add the buffer (struct rtas_args)
> > in the PACA struct, so each cpu can have it's own buffer.
> 
> You can't do this, paca is not limited to RTAS_INSTANTIATE_MAX.
> Which is good, because I didn't want you to add another 88 bytes to the 
> paca :) Can you make it a pointer and allocate it separately? Check
> the slb_shadow allocation, you could use a similar pattern.

Sure, I will send the next version with this change.

> 
> The other option would be to have just one more rtas args, and have the 
> crashing CPU always that. That would skirt the re-entrancy issue -- the
> concurrency is only ever a last resort. Would be a bit tricker though.

It seems a good idea, but I would like to try the previous alternative
first.

> Thanks,
> Nick

Thank you Nick! 



Re: [PATCH v2 06/15] liquidio: use new module_firmware_crashed()

2020-05-15 Thread Rafael Aquini
On Fri, May 15, 2020 at 09:28:37PM +, Luis Chamberlain wrote:
> This makes use of the new module_firmware_crashed() to help
> annotate when firmware for device drivers crash. When firmware
> crashes devices can sometimes become unresponsive, and recovery
> sometimes requires a driver unload / reload and in the worst cases
> a reboot.
> 
> Using a taint flag allows us to annotate when this happens clearly.
> 
> Cc: Derek Chickles 
> Cc: Satanand Burla 
> Cc: Felix Manlunas 
> Reviewed-by: Derek Chickles 
> Signed-off-by: Luis Chamberlain 
> ---
>  drivers/net/ethernet/cavium/liquidio/lio_main.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c 
> b/drivers/net/ethernet/cavium/liquidio/lio_main.c
> index 66d31c018c7e..f18085262982 100644
> --- a/drivers/net/ethernet/cavium/liquidio/lio_main.c
> +++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c
> @@ -801,6 +801,7 @@ static int liquidio_watchdog(void *param)
>   continue;
>  
>   WRITE_ONCE(oct->cores_crashed, true);
> + module_firmware_crashed();
>   other_oct = get_other_octeon_device(oct);
>   if (other_oct)
>   WRITE_ONCE(other_oct->cores_crashed, true);
> -- 
> 2.26.2
> 
Acked-by: Rafael Aquini 



Re: [PATCH v2 05/15] bna: use new module_firmware_crashed()

2020-05-15 Thread Rafael Aquini
On Fri, May 15, 2020 at 09:28:36PM +, Luis Chamberlain wrote:
> This makes use of the new module_firmware_crashed() to help
> annotate when firmware for device drivers crash. When firmware
> crashes devices can sometimes become unresponsive, and recovery
> sometimes requires a driver unload / reload and in the worst cases
> a reboot.
> 
> Using a taint flag allows us to annotate when this happens clearly.
> 
> Cc: Rasesh Mody 
> Cc: Sudarsana Kalluru 
> Cc: gr-linux-nic-...@marvell.com
> Signed-off-by: Luis Chamberlain 
> ---
>  drivers/net/ethernet/brocade/bna/bfa_ioc.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/net/ethernet/brocade/bna/bfa_ioc.c 
> b/drivers/net/ethernet/brocade/bna/bfa_ioc.c
> index e17bfc87da90..b3f44a912574 100644
> --- a/drivers/net/ethernet/brocade/bna/bfa_ioc.c
> +++ b/drivers/net/ethernet/brocade/bna/bfa_ioc.c
> @@ -927,6 +927,7 @@ bfa_iocpf_sm_disabled(struct bfa_iocpf *iocpf, enum 
> iocpf_event event)
>  static void
>  bfa_iocpf_sm_initfail_sync_entry(struct bfa_iocpf *iocpf)
>  {
> + module_firmware_crashed();
>   bfa_nw_ioc_debug_save_ftrc(iocpf->ioc);
>   bfa_ioc_hw_sem_get(iocpf->ioc);
>  }
> -- 
> 2.26.2
> 
Acked-by: Rafael Aquini 



Re: [PATCH v2 04/15] bnxt: use new module_firmware_crashed()

2020-05-15 Thread Rafael Aquini
On Fri, May 15, 2020 at 09:28:35PM +, Luis Chamberlain wrote:
> This makes use of the new module_firmware_crashed() to help
> annotate when firmware for device drivers crash. When firmware
> crashes devices can sometimes become unresponsive, and recovery
> sometimes requires a driver unload / reload and in the worst cases
> a reboot.
> 
> Using a taint flag allows us to annotate when this happens clearly.
> 
> Cc: Michael Chan 
> Signed-off-by: Luis Chamberlain 
> ---
>  drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c 
> b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
> index dd0c3f227009..5ba1bd0734e9 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
> @@ -3503,6 +3503,7 @@ static int bnxt_get_dump_data(struct net_device *dev, 
> struct ethtool_dump *dump,
>  
>   dump->flag = bp->dump_flag;
>   if (dump->flag == BNXT_DUMP_CRASH) {
> + module_firmware_crashed();
>  #ifdef CONFIG_TEE_BNXT_FW
>   return tee_bnxt_copy_coredump(buf, 0, dump->len);
>  #endif
> -- 
> 2.26.2
> 
Acked-by: Rafael Aquini 



Re: [PATCH v2 03/15] bnx2x: use new module_firmware_crashed()

2020-05-15 Thread Rafael Aquini
On Fri, May 15, 2020 at 09:28:34PM +, Luis Chamberlain wrote:
> This makes use of the new module_firmware_crashed() to help
> annotate when firmware for device drivers crash. When firmware
> crashes devices can sometimes become unresponsive, and recovery
> sometimes requires a driver unload / reload and in the worst cases
> a reboot.
> 
> Using a taint flag allows us to annotate when this happens clearly.
> 
> Cc: Ariel Elior 
> Cc: Sudarsana Kalluru 
> CC: gr-everest-linux...@marvell.com
> Signed-off-by: Luis Chamberlain 
> ---
>  drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c 
> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
> index db5107e7937c..c38b8c9c8af0 100644
> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
> @@ -909,6 +909,7 @@ void bnx2x_panic_dump(struct bnx2x *bp, bool disable_int)
>   bp->eth_stats.unrecoverable_error++;
>   DP(BNX2X_MSG_STATS, "stats_state - DISABLED\n");
>  
> + module_firmware_crashed();
>   BNX2X_ERR("begin crash dump -\n");
>  
>   /* Indices */
> -- 
> 2.26.2
> 
Acked-by: Rafael Aquini 



Re: [PATCH v2 02/15] ethernet/839: use new module_firmware_crashed()

2020-05-15 Thread Rafael Aquini
On Fri, May 15, 2020 at 09:28:33PM +, Luis Chamberlain wrote:
> This makes use of the new module_firmware_crashed() to help
> annotate when firmware for device drivers crash. When firmware
> crashes devices can sometimes become unresponsive, and recovery
> sometimes requires a driver unload / reload and in the worst cases
> a reboot.
> 
> Using a taint flag allows us to annotate when this happens clearly.
> 
> Cc: "Michael S. Tsirkin" 
> Cc: Shannon Nelson 
> Cc: Jakub Kicinski 
> Cc: Heiner Kallweit 
> Signed-off-by: Luis Chamberlain 
> ---
>  drivers/net/ethernet/8390/axnet_cs.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/8390/axnet_cs.c 
> b/drivers/net/ethernet/8390/axnet_cs.c
> index aeae7966a082..8ad0200db8e9 100644
> --- a/drivers/net/ethernet/8390/axnet_cs.c
> +++ b/drivers/net/ethernet/8390/axnet_cs.c
> @@ -1358,9 +1358,11 @@ static void ei_receive(struct net_device *dev)
>*/
>   if ((netif_msg_rx_err(ei_local)) &&
>   this_frame != ei_local->current_page &&
> - (this_frame != 0x0 || rxing_page != 0xFF))
> + (this_frame != 0x0 || rxing_page != 0xFF)) {
> + module_firmware_crashed();
>   netdev_err(dev, "mismatched read page pointers %2x vs 
> %2x\n",
>  this_frame, ei_local->current_page);
> + }
>   
>   if (this_frame == rxing_page)   /* Read all the frames? */
>   break;  /* Done for now */
> -- 
> 2.26.2
> 
Acked-by: Rafael Aquini 



Re: [PATCH v2 01/15] taint: add module firmware crash taint support

2020-05-15 Thread Rafael Aquini
On Fri, May 15, 2020 at 09:28:32PM +, Luis Chamberlain wrote:
> Device driver firmware can crash, and sometimes, this can leave your
> system in a state which makes the device or subsystem completely
> useless. Detecting this by inspecting /proc/sys/kernel/tainted instead
> of scraping some magical words from the kernel log, which is driver
> specific, is much easier. So instead provide a helper which lets drivers
> annotate this.
> 
> Once this happens, scrapers can easily look for modules taint flags
> for a firmware crash. This will taint both the kernel and respective
> calling module.
> 
> The new helper module_firmware_crashed() uses LOCKDEP_STILL_OK as this
> fact should in no way shape or form affect lockdep. This taint is device
> driver specific.
> 
> Signed-off-by: Luis Chamberlain 
> ---
>  Documentation/admin-guide/tainted-kernels.rst |  6 ++
>  include/linux/kernel.h|  3 ++-
>  include/linux/module.h| 13 +
>  include/trace/events/module.h |  3 ++-
>  kernel/module.c   |  5 +++--
>  kernel/panic.c|  1 +
>  tools/debugging/kernel-chktaint   |  7 +++
>  7 files changed, 34 insertions(+), 4 deletions(-)
> 

Reviewed-by: Rafael Aquini 



Re: [PATCH V2] dynamic_debug: Add an option to enable dynamic debug for modules only

2020-05-15 Thread Orson Zhai
On Fri, May 15, 2020 at 5:55 PM Petr Mladek  wrote:
>
> On Thu 2020-04-23 00:02:48, Orson Zhai wrote:
> > On Wed, Apr 22, 2020 at 10:25 PM Leon Romanovsky  wrote:
> > >
> > > On Wed, Apr 22, 2020 at 09:06:08PM +0800, Orson Zhai wrote:
> > > > On Tue, Apr 21, 2020 at 3:10 AM Leon Romanovsky  wrote:
> > > > My motivation came from the concept of GKI (Generic Kernel Image) in 
> > > > Android.
> > > > Google will release a common kernel image (binary) to all of the 
> > > > Android system
> > > > vendors in the world instead of letting them to build their owns as 
> > > > before.
> > > > Every SoC vendor's device drivers will be provided in kernel modules 
> > > > only.
> > > > By my patch, the driver owners could debug their modules in field (say
> > > > production releases)
> > > > without having to enable dynamic debug for the whole GKI.
> > >
> > > Will Google release that binary with CONFIG_DYNAMIC_DEBUG_CORE disabled?
> > >
> > In Google's plan, there will be only one GKI (no debug version) for
> > one Android version per kernel version per year.
>
> Are there plans to use modules with debug messages enabled on production
> systems?

Yes, but in a managed way. They are not being enabled directly into the log buffer.
Users / FAEs (Field Application Engineers) might choose to open or
close every single one on-the-fly.

>
> IMHO, the debug messages are primary needed during development and
> when fixing bugs. I am sure that developers will want to enable many
> more features that will help with debugging and which will be disabled
> on production systems.

I agree with you in general speaking.
For a real production build we usually keep a few critical debugging
methods in case of some
potential bugs which are extremely hard to find in production testing.
Dynamic debug is one of these methods.
I assume it is widely used for the maintenance of PCs or servers because I
can find it is enabled in some
popular Linux distribution configs.

Here is the search result from my PC with Ubuntu default installation.
zhai@ThinkPad:/boot$ cat config-4.15.0-99-generic | grep DYNAMIC_DEBUG
CONFIG_DYNAMIC_DEBUG=y

>
> I expect that Google will not release only the single binary. They
> should release also the sources and build configuration. Then

Yes, they have released the source and configuration which could be freely
downloaded from Google's website.

> developers might build their own versions with the needed debugging
> features enabled.

Yes, we do have the debug build for this.
But as I mentioned above, it is a little bit different for my requirement.
Actually my patch is to address the problem for embedded systems where
image size needs to be
considered when CONFIG_DYNAMIC_DEBUG is being enabled globally.

For a "make allyesconfig" build, 2,335,704 bytes will be increased by
enabling CONFIG_DYNAMIC_DEBUG.
It is trivial for a PC or server but might matter for an embedded product.
So my patch is to give users an option to
only enable dynamic debug for modules, especially in this GKI case.

Thanks
Orson
>
> Best Regards,
> Petr


Re: [RFC PATCH 00/13] scsi: ufs: Add HPB Support

2020-05-15 Thread Bart Van Assche
On 2020-05-15 03:30, Avri Altman wrote:
> NAND flash-based storage devices need to translate the logical
> addresses of the IO requests to their corresponding physical addresses of
> the flash storage.  As the internal SRAM of the storage device cannot
> accommodate the entire L2P mapping table, the device can only partially
> load the L2P data it needs based on the incoming LBAs. As a result,
> cache misses - which are more frequent in random read access, can result
> in serious performance degradation.  To remedy the above, UFS3.1 offers
> the Host Performance Booster (HPB) feature, which uses the host’s system
> memory as a cache for L2P map data. The basic concept of HPB is to
> cache L2P mapping entries in host system memory so that both physical
> block address (PBA) and logical block address (LBA) can be delivered in
> HPB read command.  Not to be confused with host-managed-FTL, HPB is
> merely about NAND flash cost reduction.
> 
> HPB, by its nature, imposes an interesting question regarding the proper
> location of its components across the storage stack. On init it needs
> to detect the HPB capabilities and parameters, which can only be done
> from the LLD level.  On the other hand, it needs to send scsi
> commands as part of its cache management, which preferably should be
> done from scsi mid-layer,  and compose the "HPB_READ" command, which
> should be done as part of scsi command setup path.
> Therefore, we came up with a 2 module layout: ufshpb in the ufs driver,
> and scsi_dh_ufshpb under scsi device handler.
> 
> The ufshpb module bears 2 main duties. During initialization, it reads
> the hpb configuration from the device. Later on, during the scan host
> phase, it attaches the scsi device and activates the scsi hpb device
> handler.  The second duty mainly concerns supporting the HPB cache
> management. The scsi hpb device handler will perform the cache
> management and send the HPB_READ command. The modules will communicate
> via the standard device handler API: the handler_data opaque pointer,
> and the set_params op-mode.
> 
> This series has borrowed heavily from a HPB implementation that was
> published as part of the pixel3 code, authored by:
> Yongmyung Lee  and
> Jinyoung Choi .
> 
> We kept some of its design and implementation details. We made some
> minor modifications to adopt the latest spec changes (HPB1.0 was not
> closed when the driver was initially published), and also divide the
> implementation between the scsi handler and the ufs modules, instead of
> a single module in the original driver, which simplified the
> implementation to a great deal and resulted in far less code. One more
> big difference is that the Pixel3 driver support device managed mode,
> while we are supporting host managed mode, which reflects heavily on the
> cache management decision process.

Hi Avri,

Thank you for having taken the time to publish your work. The way this
series has been split into individual patches makes reviewing easy.
Additionally, the cover letter and patch descriptions are very
informative, insightful and well written. However, I'm concerned about a
key aspect of the implementation, namely relying on a device handler to
alter the meaning of a block layer request. My concern about this
approach is that at most one device handler can be associated with a
SCSI LLD. If in the future more functionality would be added to the UFS
spec and if it would be desirable to implement that functionality as a
new kernel module, it won't be possible to implement that functionality
as a new device handler. So I think that not relying on the device
handler infrastructure is more future proof because that removes the
restrictions we have to deal with when using the device handler framework.
 Thanks,

Bart.


[PATCH v2] net: revert "net: get rid of an signed integer overflow in ip_idents_reserve()"

2020-05-15 Thread Shaokun Zhang
From: Yuqi Jin 

Commit adb03115f459 ("net: get rid of an signed integer overflow in 
ip_idents_reserve()")
used atomic_cmpxchg to replace "atomic_add_return" inside the function
"ip_idents_reserve". The reason was to avoid a UBSAN warning.
However, this change has caused a performance degradation, and in GCC-8,
fno-strict-overflow is now mapped to -fwrapv -fwrapv-pointer
and signed integer overflow is now undefined by default at all
optimization levels[1]. Moreover, it was a bug in UBSAN vs -fwrapv
/-fno-strict-overflow, so let's revert it safely.

[1] https://gcc.gnu.org/gcc-8/changes.html

Suggested-by: Peter Zijlstra 
Suggested-by: Eric Dumazet 
Cc: "David S. Miller" 
Cc: Alexey Kuznetsov 
Cc: Hideaki YOSHIFUJI 
Cc: Jakub Kicinski 
Cc: Jiri Pirko 
Cc: Arvind Sankar 
Cc: Peter Zijlstra 
Cc: Eric Dumazet 
Cc: Jiong Wang 
Signed-off-by: Yuqi Jin 
Signed-off-by: Shaokun Zhang 
---
ChangLog:
* Revise the commit log
* Add some comments. If it's wholly unnecessary, we
can remove it.

Patch v1: 
https://patchwork.ozlabs.org/project/netdev/patch/1579058620-26684-1-git-send-email-zhangshao...@hisilicon.com/

 net/ipv4/route.c | 14 ++
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 788c69d9bfe0..455871d6b3a0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -491,18 +491,16 @@ u32 ip_idents_reserve(u32 hash, int segs)
atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
u32 old = READ_ONCE(*p_tstamp);
u32 now = (u32)jiffies;
-   u32 new, delta = 0;
+   u32 delta = 0;
 
if (old != now && cmpxchg(p_tstamp, old, now) == old)
delta = prandom_u32_max(now - old);
 
-   /* Do not use atomic_add_return() as it makes UBSAN unhappy */
-   do {
-   old = (u32)atomic_read(p_id);
-   new = old + delta + segs;
-   } while (atomic_cmpxchg(p_id, old, new) != old);
-
-   return new - segs;
+   /* If UBSAN reports an error there, please make sure your compiler
+* supports -fno-strict-overflow before reporting it that was a bug
+* in UBSAN, and it has been fixed in GCC-8.
+*/
+   return atomic_add_return(segs + delta, p_id) - segs;
 }
 EXPORT_SYMBOL(ip_idents_reserve);
 
-- 
2.7.4



Re: [PATCH 14/18] maccess: allow architectures to provide kernel probing directly

2020-05-15 Thread Masami Hiramatsu
Hi Christoph,

On Wed, 13 May 2020 18:00:34 +0200
Christoph Hellwig  wrote:

> Provide alternative versions of probe_kernel_read, probe_kernel_write
> and strncpy_from_kernel_unsafe that don't need set_fs magic, but instead
> use arch hooks that are modelled after unsafe_{get,put}_user to access
> kernel memory in an exception safe way.

This patch seems to introduce a new implementation of probe_kernel_read/write()
and strncpy_from_kernel_unsafe(), but also drops copy_from/to_kernel_nofault()
and strncpy_from_kernel_nofault() if HAVE_ARCH_PROBE_KERNEL is defined.
As a result, this causes a link error with BPF and kprobe events.

BTW, what is the difference of *_unsafe() and *_nofault()?
(maybe we make those to *_nofault() finally?)

Thank you,

> 
> Signed-off-by: Christoph Hellwig 
> ---
>  mm/maccess.c | 76 
>  1 file changed, 76 insertions(+)
> 
> diff --git a/mm/maccess.c b/mm/maccess.c
> index 9773e2253b495..e9efe2f98e34a 100644
> --- a/mm/maccess.c
> +++ b/mm/maccess.c
> @@ -12,6 +12,81 @@ bool __weak probe_kernel_read_allowed(void *dst, const 
> void *unsafe_src,
>   return true;
>  }
>  
> +#ifdef HAVE_ARCH_PROBE_KERNEL
> +
> +#define probe_kernel_read_loop(dst, src, len, type, err_label)   
> \
> + while (len >= sizeof(type)) {   \
> + arch_kernel_read(dst, src, type, err_label);\
> + dst += sizeof(type);\
> + src += sizeof(type);\
> + len -= sizeof(type);\
> + }
> +
> +long probe_kernel_read(void *dst, const void *src, size_t size)
> +{
> + if (!probe_kernel_read_allowed(dst, src, size))
> + return -EFAULT;
> +
> + pagefault_disable();
> + probe_kernel_read_loop(dst, src, size, u64, Efault);
> + probe_kernel_read_loop(dst, src, size, u32, Efault);
> + probe_kernel_read_loop(dst, src, size, u16, Efault);
> + probe_kernel_read_loop(dst, src, size, u8, Efault);
> + pagefault_enable();
> + return 0;
> +Efault:
> + pagefault_enable();
> + return -EFAULT;
> +}
> +EXPORT_SYMBOL_GPL(probe_kernel_read);
> +
> +#define probe_kernel_write_loop(dst, src, len, type, err_label)  
> \
> + while (len >= sizeof(type)) {   \
> + arch_kernel_write(dst, src, type, err_label);   \
> + dst += sizeof(type);\
> + src += sizeof(type);\
> + len -= sizeof(type);\
> + }
> +
> +long probe_kernel_write(void *dst, const void *src, size_t size)
> +{
> + pagefault_disable();
> + probe_kernel_write_loop(dst, src, size, u64, Efault);
> + probe_kernel_write_loop(dst, src, size, u32, Efault);
> + probe_kernel_write_loop(dst, src, size, u16, Efault);
> + probe_kernel_write_loop(dst, src, size, u8, Efault);
> + pagefault_enable();
> + return 0;
> +Efault:
> + pagefault_enable();
> + return -EFAULT;
> +}
> +
> +long strncpy_from_kernel_unsafe(char *dst, const void *unsafe_addr, long 
> count)
> +{
> + const void *src = unsafe_addr;
> +
> + if (unlikely(count <= 0))
> + return 0;
> + if (!probe_kernel_read_allowed(dst, unsafe_addr, count))
> + return -EFAULT;
> +
> + pagefault_disable();
> + do {
> + arch_kernel_read(dst, src, u8, Efault);
> + dst++;
> + src++;
> + } while (dst[-1] && src - unsafe_addr < count);
> + pagefault_enable();
> +
> + dst[-1] = '\0';
> + return src - unsafe_addr;
> +Efault:
> + pagefault_enable();
> + dst[-1] = '\0';
> + return -EFAULT;
> +}
> +#else /* HAVE_ARCH_PROBE_KERNEL */
>  /**
>   * probe_kernel_read(): safely attempt to read from kernel-space
>   * @dst: pointer to the buffer that shall take the data
> @@ -114,6 +189,7 @@ long strncpy_from_kernel_nofault(char *dst, const void 
> *unsafe_addr, long count)
>  
>   return ret ? -EFAULT : src - unsafe_addr;
>  }
> +#endif /* HAVE_ARCH_PROBE_KERNEL */
>  
>  /**
>   * probe_user_read(): safely attempt to read from a user-space location
> -- 
> 2.26.2
> 


-- 
Masami Hiramatsu 


Re: [PATCH updated v2] sched/fair: core wide cfs task priority comparison

2020-05-15 Thread Aaron Lu
On Thu, May 14, 2020 at 03:02:48PM +0200, Peter Zijlstra wrote:
> On Fri, May 08, 2020 at 08:34:57PM +0800, Aaron Lu wrote:
> > With this said, I realized a workaround for the issue described above:
> > when the core went from 'compatible mode'(step 1-3) to 'incompatible
> > mode'(step 4), reset all root level sched entities' vruntime to be the
> > same as the core wide min_vruntime. After all, the core is transforming
> > from two runqueue mode to single runqueue mode... I think this can solve
> > the issue to some extent but I may miss other scenarios.
> 
> A little something like so, this syncs min_vruntime when we switch to
> single queue mode. This is very much SMT2 only, I got my head in twist
> when thikning about more siblings, I'll have to try again later.

Thanks a lot for the patch, I now see that "there is no need to adjust
every se's vruntime". :-)

> This very much retains the horrible approximation of S we always do.
> 
> Also, it is _completely_ untested...

I've been testing it.

One problem below.

> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4293,10 +4281,11 @@ static struct task_struct *
>  pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>  {
>   struct task_struct *next, *max = NULL;
> + int old_active = 0, new_active = 0;
>   const struct sched_class *class;
>   const struct cpumask *smt_mask;
> - int i, j, cpu;
>   bool need_sync = false;
> + int i, j, cpu;
>  
>   cpu = cpu_of(rq);
>   if (cpu_is_offline(cpu))
> @@ -4349,10 +4338,14 @@ pick_next_task(struct rq *rq, struct tas
>   rq_i->core_pick = NULL;
>  
>   if (rq_i->core_forceidle) {
> + // XXX is_idle_task(rq_i->curr) && rq_i->nr_running ??
>   need_sync = true;
>   rq_i->core_forceidle = false;
>   }
>  
> + if (!is_idle_task(rq_i->curr))
> + old_active++;
> +
>   if (i != cpu)
>   update_rq_clock(rq_i);
>   }
> @@ -4463,8 +4456,12 @@ next_class:;
>  
>   WARN_ON_ONCE(!rq_i->core_pick);
>  
> - if (is_idle_task(rq_i->core_pick) && rq_i->nr_running)
> - rq_i->core_forceidle = true;
> + if (is_idle_task(rq_i->core_pick)) {
> + if (rq_i->nr_running)
> + rq_i->core_forceidle = true;
> + } else {
> + new_active++;
> + }
>  
>   if (i == cpu)
>   continue;
> @@ -4476,6 +4473,16 @@ next_class:;
>   WARN_ON_ONCE(!cookie_match(next, rq_i->core_pick));
>   }
>  
> + /* XXX SMT2 only */
> + if (new_active == 1 && old_active > 1) {

There is a case when incompatible task appears but we failed to 'drop
into single-rq mode' per the above condition check. The TLDR is: when
there is a task that sits on the sibling rq with the same cookie as
'max', new_active will be 2 instead of 1 and that would cause us missing
the chance to do a sync of core min_vruntime.

This is how it happens:
1) 2 tasks of the same cgroup with different weight running on 2 siblings,
   say cg0_A with weight 1024 bound at cpu0 and cg0_B with weight 2 bound
   at cpu1(assume cpu0 and cpu1 are siblings);
2) Since new_active == 2, we didn't trigger min_vruntime sync. For
   simplicity, let's assume both siblings' root cfs_rq's min_vruntime and
   core_vruntime are all at 0 now;
3) let the two tasks run a while;
4) a new task cg1_C of another cgroup gets queued on cpu1. Since cpu1's
   existing task has a very small weight, its cfs_rq's min_vruntime can
   be much larger than cpu0's cfs_rq min_vruntime. So cg1_C's vruntime is
   much larger than cg0_A's and the 'max' of the core wide task
   selection goes to cg0_A;
5) Now I suppose we should drop into single-rq mode and by doing a sync
   of core min_vruntime, cg1_C's turn shall come. But the problem is, our
   current selection logic prefer not to waste CPU time so after decides
   cg0_A as the 'max', the sibling will also do a cookie_pick() and
   get cg0_B to run. This is where the problem arises: new_active is 2
   instead of the expected 1.
6) Due to we didn't do the sync of core min_vruntime, the newly queued
   cg1_C shall wait a long time before cg0_A's vruntime catches up.

One naive way of precisely determining when to drop into single-rq mode is
to track how many tasks of a particular tag exist and use that to
decide if the core is in compatible mode(all tasks belong to the same
cgroup, IOW, have the same core_cookie) or not and act accordingly,
except that: does this sound too complex and inefficient?...

> + /*
> +  * We just dropped into single-rq mode, increment the sequence
> +  * count to trigger the vruntime sync.
> +  */
> + rq->core->core_sync_seq++;
> + }
> + rq->core->core_active = new_active;
> +

Re: [RFC PATCH 12/13] scsi: dh: ufshpb: Add prep_fn handler

2020-05-15 Thread Bart Van Assche
On 2020-05-15 03:30, Avri Altman wrote:
> diff --git a/drivers/scsi/device_handler/scsi_dh_ufshpb.c 
> b/drivers/scsi/device_handler/scsi_dh_ufshpb.c
> index affc143..04e3d56 100644
> --- a/drivers/scsi/device_handler/scsi_dh_ufshpb.c
> +++ b/drivers/scsi/device_handler/scsi_dh_ufshpb.c
> @@ -15,6 +15,7 @@
>  #include 
>  #include 
>  #include 
> +#include "../sd.h"

Please add a comment that explains why this include directive is necessary.

> +static void __update_read_counters(struct ufshpb_dh_lun *hpb,
> +struct ufshpb_region *r,
> +struct ufshpb_subregion *s, u64 nr_blocks)
> +{
> + enum ufshpb_state s_state;
> +
> + atomic64_add(nr_blocks, >reads);
> + atomic64_add(nr_blocks, >reads);
> +
> + /* normalize read counters if needed */
> + if (atomic64_read(>reads) >= READ_NORMALIZATION * entries_per_region)
> + queue_work(hpb->wq, >reads_normalization_work);
> +
> + rcu_read_lock();
> + s_state = s->state;
> + rcu_read_unlock();

We don't use locking in the Linux kernel to read a scalar that can be
read with a single load instruction, even if it can be modified while it
is being read.

> +/* Call this on read from prep_fn */
> +static bool ufshpb_test_block_dirty(struct ufshpb_dh_data *h,
> + struct request *rq, u64 start_lba,
> + u32 nr_blocks)
> +{
> + struct ufshpb_dh_lun *hpb = h->hpb;
> + u64 end_lba = start_lba + nr_blocks;
> + unsigned int region = ufshpb_lba_to_region(start_lba);
> + unsigned int subregion = ufshpb_lba_to_subregion(start_lba);
> + struct ufshpb_region *r = hpb->region_tbl + region;
> + struct ufshpb_subregion *s = r->subregion_tbl + subregion;
> + enum ufshpb_state s_state;
> +
> + __update_rw_counters(hpb, start_lba, end_lba, REQ_OP_READ);
> +
> + rcu_read_lock();
> + s_state = s->state;
> + rcu_read_unlock();
> +
> + if (s_state != HPB_STATE_ACTIVE)
> + return true;
> +
> + return (atomic64_read(>writes) >= SET_AS_DIRTY);
> +}

No parentheses around returned values please.

>  /*
>   * ufshpb_dispatch - ufshpb state machine
>   * make the various decisions based on region/subregion state & events
> @@ -631,6 +875,9 @@ static void ufshpb_work_handler(struct work_struct *work)
>   ufshpb_dispatch(s->hpb, s->r, s);
>  
>   mutex_unlock(>state_lock);
> +
> + /* the subregion state might has changed */
> + synchronize_rcu();
>  }

What is the purpose of this synchronize_rcu() call? This is the first
time that I see a synchronize_rcu() call at the end of a work handler.

>  static int ufshpb_activate_pinned_regions(struct ufshpb_dh_data *h, bool 
> init)
> @@ -679,6 +926,12 @@ static int ufshpb_activate_pinned_regions(struct 
> ufshpb_dh_data *h, bool init)
>   set_bit(start_idx + i, hpb->pinned_map);
>   }
>  
> + /*
> +  * those subregions of the pinned regions changed their state - they
> +  * are active now
> +  */
> + synchronize_rcu();
> +
>   return ret;
>  }

Same question here: what is the purpose of this synchronize_rcu() call?

> @@ -713,6 +966,9 @@ static void ufshpb_lun_reset_work_handler(struct 
> work_struct *work)
>   __region_reset(hpb, r);
>   }
>  
> + /* update rcu post lun reset */
> + synchronize_rcu();
> +

Also here: what is the purpose of this synchronize_rcu() call?

> +/*
> + * ufshpb_prep_fn - Construct HPB_READ when possible
> + */
> +static blk_status_t ufshpb_prep_fn(struct scsi_device *sdev, struct request 
> *rq)
> +{
> + struct ufshpb_dh_data *h = sdev->handler_data;
> + struct ufshpb_dh_lun *hpb = h->hpb;
> + u64 lba = sectors_to_logical(sdev, blk_rq_pos(rq));
> + u32 nr_blocks = sectors_to_logical(sdev, blk_rq_sectors(rq));
> +
> + if (op_is_write(req_op(rq)) || op_is_discard(req_op(rq))) {
> + ufshpb_set_block_dirty(h, rq, lba, nr_blocks);
> + goto out;
> + }
> +
> + if (req_op(rq) != REQ_OP_READ || nr_blocks > 255)
> + goto out;
> +
> + if (ufshpb_test_block_dirty(h, rq, lba, nr_blocks))
> + goto out;
> +
> + ufshpb_prepare_hpb_read_cmd(rq, hpb, lba, (u8)nr_blocks);
> +
> +out:
> + return BLK_STS_OK;
> +}

So this prep function calls ufshpb_prepare_hpb_read_cmd(), and that
function does the following:

memcpy(scsi_req(rq)->cmd, cmnd, sizeof(cmnd));

I'm not sure that such a construct would be acceptable in any SCSI LLD
or device handler. The SCSI CDB is overwritten without updating the
other members of the request structure, e.g. the page pointers in the
bvecs of the bio attached to a request structure. What will e.g. happen
if rq_for_each_segment() would be called? Will it iterate over the data
buffer of the original REQ_OP_READ or will it iterate over the data
buffer of the UFSHPB_READ command?

Bart.


[PATCH net-next v1] hinic: add set_channels ethtool_ops support

2020-05-15 Thread Luo bin
add support to change TX/RX queue number with ethtool -L

Signed-off-by: Luo bin 
---
 .../net/ethernet/huawei/hinic/hinic_ethtool.c | 46 +++
 .../net/ethernet/huawei/hinic/hinic_main.c|  2 +-
 drivers/net/ethernet/huawei/hinic/hinic_tx.c  |  5 ++
 3 files changed, 44 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c 
b/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c
index ace18d258049..9796c1fbe132 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c
@@ -619,14 +619,43 @@ static void hinic_get_channels(struct net_device *netdev,
struct hinic_dev *nic_dev = netdev_priv(netdev);
struct hinic_hwdev *hwdev = nic_dev->hwdev;
 
-   channels->max_rx = hwdev->nic_cap.max_qps;
-   channels->max_tx = hwdev->nic_cap.max_qps;
-   channels->max_other = 0;
-   channels->max_combined = 0;
-   channels->rx_count = hinic_hwdev_num_qps(hwdev);
-   channels->tx_count = hinic_hwdev_num_qps(hwdev);
-   channels->other_count = 0;
-   channels->combined_count = 0;
+   channels->max_combined = nic_dev->max_qps;
+   channels->combined_count = hinic_hwdev_num_qps(hwdev);
+}
+
+static int hinic_set_channels(struct net_device *netdev,
+ struct ethtool_channels *channels)
+{
+   struct hinic_dev *nic_dev = netdev_priv(netdev);
+   unsigned int count = channels->combined_count;
+   int err;
+
+   if (!count) {
+   netif_err(nic_dev, drv, netdev,
+ "Unsupported combined_count: 0\n");
+   return -EINVAL;
+   }
+
+   netif_info(nic_dev, drv, netdev, "Set max combined queue number from %d 
to %d\n",
+  hinic_hwdev_num_qps(nic_dev->hwdev), count);
+
+   if (netif_running(netdev)) {
+   netif_info(nic_dev, drv, netdev, "Restarting netdev\n");
+   hinic_close(netdev);
+
+   nic_dev->hwdev->nic_cap.num_qps = count;
+
+   err = hinic_open(netdev);
+   if (err) {
+   netif_err(nic_dev, drv, netdev,
+ "Failed to open netdev\n");
+   return -EFAULT;
+   }
+   } else {
+   nic_dev->hwdev->nic_cap.num_qps = count;
+   }
+
+   return 0;
 }
 
 static int hinic_get_rss_hash_opts(struct hinic_dev *nic_dev,
@@ -1219,6 +1248,7 @@ static const struct ethtool_ops hinic_ethtool_ops = {
.get_ringparam = hinic_get_ringparam,
.set_ringparam = hinic_set_ringparam,
.get_channels = hinic_get_channels,
+   .set_channels = hinic_set_channels,
.get_rxnfc = hinic_get_rxnfc,
.set_rxnfc = hinic_set_rxnfc,
.get_rxfh_key_size = hinic_get_rxfh_key_size,
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c 
b/drivers/net/ethernet/huawei/hinic/hinic_main.c
index e3ff119fe341..2c07b03bf6e5 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_main.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c
@@ -326,7 +326,6 @@ static void hinic_enable_rss(struct hinic_dev *nic_dev)
int i, node, err = 0;
u16 num_cpus = 0;
 
-   nic_dev->max_qps = hinic_hwdev_max_num_qps(hwdev);
if (nic_dev->max_qps <= 1) {
nic_dev->flags &= ~HINIC_RSS_ENABLE;
nic_dev->rss_limit = nic_dev->max_qps;
@@ -1043,6 +1042,7 @@ static int nic_dev_init(struct pci_dev *pdev)
nic_dev->rq_depth = HINIC_RQ_DEPTH;
nic_dev->sriov_info.hwdev = hwdev;
nic_dev->sriov_info.pdev = pdev;
+   nic_dev->max_qps = num_qps;
 
sema_init(_dev->mgmt_lock, 1);
 
diff --git a/drivers/net/ethernet/huawei/hinic/hinic_tx.c 
b/drivers/net/ethernet/huawei/hinic/hinic_tx.c
index 4c66a0bc1b28..6da761d7a6ef 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_tx.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_tx.c
@@ -470,6 +470,11 @@ netdev_tx_t hinic_xmit_frame(struct sk_buff *skb, struct 
net_device *netdev)
struct hinic_txq *txq;
struct hinic_qp *qp;
 
+   if (unlikely(!netif_carrier_ok(netdev))) {
+   dev_kfree_skb_any(skb);
+   return NETDEV_TX_OK;
+   }
+
txq = _dev->txqs[q_id];
qp = container_of(txq->sq, struct hinic_qp, sq);
 
-- 
2.17.1



[PATCH v5 7/7] loop: be paranoid on exit and prevent new additions / removals

2020-05-15 Thread Luis Chamberlain
Be pedantic on removal as well and hold the mutex.
This should prevent uses of addition while we exit.

Reviewed-by: Ming Lei 
Signed-off-by: Luis Chamberlain 
---
 drivers/block/loop.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 14372df0f354..54fbcbd930de 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -2333,6 +2333,8 @@ static void __exit loop_exit(void)
 
range = max_loop ? max_loop << part_shift : 1UL << MINORBITS;
 
+   mutex_lock(_ctl_mutex);
+
idr_for_each(_index_idr, _exit_cb, NULL);
idr_destroy(_index_idr);
 
@@ -2340,6 +2342,8 @@ static void __exit loop_exit(void)
unregister_blkdev(LOOP_MAJOR, "loop");
 
misc_deregister(_misc);
+
+   mutex_unlock(_ctl_mutex);
 }
 
 module_init(loop_init);
-- 
2.26.2



[PATCH v5 3/7] block: revert back to synchronous request_queue removal

2020-05-15 Thread Luis Chamberlain
Commit dc9edc44de6c ("block: Fix a blk_exit_rl() regression") merged on
v4.12 moved the work behind blk_release_queue() into a workqueue after a
splat floated around which indicated some work on blk_release_queue()
could sleep in blk_exit_rl(). This splat would be possible when a driver
called blk_put_queue() or blk_cleanup_queue() (which calls blk_put_queue()
as its final call) from an atomic context.

blk_put_queue() decrements the refcount for the request_queue kobject,
and upon reaching 0 blk_release_queue() is called. Although blk_exit_rl()
is now removed through commit db6d9952356 ("block: remove request_list code")
on v5.0, we reserve the right to be able to sleep within blk_release_queue()
context.

The last reference for the request_queue must not be called from atomic
context. *When* the last reference to the request_queue reaches 0 varies,
and so let's take the opportunity to document when that is expected to
happen and also document the context of the related calls as best as possible
so we can avoid future issues, and with the hopes that the synchronous
request_queue removal sticks.

We revert back to synchronous request_queue removal because asynchronous
removal creates a regression with expected userspace interaction with
several drivers. An example is when removing the loopback driver, one
uses ioctls from userspace to do so, but upon return and if successful,
one expects the device to be removed. Likewise if one races to add another
device the new one may not be added as it is still being removed. This was
expected behavior before and it now fails as the device is still present
and busy still. Moving to asynchronous request_queue removal could have
broken many scripts which relied on the removal to have been completed if
there was no error. Document this expectation as well so that this
doesn't regress userspace again.

Using asynchronous request_queue removal however has helped us find
other bugs. In the future we can test what could break with this
arrangement by enabling CONFIG_DEBUG_KOBJECT_RELEASE.

While at it, update the docs with the context expectations for the
request_queue / gendisk refcount decrement, and make these
expectations explicit by using might_sleep().

Cc: Bart Van Assche 
Cc: Omar Sandoval 
Cc: Hannes Reinecke 
Cc: Nicolai Stange 
Cc: Greg Kroah-Hartman 
Cc: Michal Hocko 
Cc: yu kuai 
Suggested-by: Nicolai Stange 
Fixes: dc9edc44de6c ("block: Fix a blk_exit_rl() regression")
Reviewed-by: Christoph Hellwig 
Reviewed-by: Bart Van Assche 
Signed-off-by: Luis Chamberlain 
---
 block/blk-core.c   |  8 
 block/blk-sysfs.c  | 43 +-
 block/genhd.c  | 17 +
 include/linux/blkdev.h |  2 --
 4 files changed, 47 insertions(+), 23 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 94216fa16a05..8a785d16033b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -327,6 +327,9 @@ EXPORT_SYMBOL_GPL(blk_clear_pm_only);
  *
  * Decrements the refcount of the request_queue kobject. When this reaches 0
  * we'll have blk_release_queue() called.
+ *
+ * Context: Any context, but the last reference must not be dropped from
+ *  atomic context.
  */
 void blk_put_queue(struct request_queue *q)
 {
@@ -359,9 +362,14 @@ EXPORT_SYMBOL_GPL(blk_set_queue_dying);
  *
  * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
  * put it.  All future requests will be failed immediately with -ENODEV.
+ *
+ * Context: can sleep
  */
 void blk_cleanup_queue(struct request_queue *q)
 {
+   /* cannot be called from atomic context */
+   might_sleep();
+
WARN_ON_ONCE(blk_queue_registered(q));
 
/* mark @q DYING, no new request or merges will be allowed afterwards */
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 02643e149d5e..561624d4cc4e 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -873,22 +873,32 @@ static void blk_exit_queue(struct request_queue *q)
bdi_put(q->backing_dev_info);
 }
 
-
 /**
- * __blk_release_queue - release a request queue
- * @work: pointer to the release_work member of the request queue to be 
released
+ * blk_release_queue - releases all allocated resources of the request_queue
+ * @kobj: pointer to a kobject, whose container is a request_queue
+ *
+ * This function releases all allocated resources of the request queue.
+ *
+ * The struct request_queue refcount is incremented with blk_get_queue() and
+ * decremented with blk_put_queue(). Once the refcount reaches 0 this function
+ * is called.
+ *
+ * For drivers that have a request_queue on a gendisk and added with
+ * __device_add_disk() the refcount to request_queue will reach 0 with
+ * the last put_disk() called by the driver. For drivers which don't use
+ * __device_add_disk() this happens with blk_cleanup_queue().
  *
- * Description:
- * This function is called when a block device is being unregistered. The
- * process of releasing a request 

[PATCH v5 6/7] blktrace: break out of blktrace setup on concurrent calls

2020-05-15 Thread Luis Chamberlain
We use one blktrace per request_queue, that means one per the entire
disk.  So we cannot run one blktrace on say /dev/vda and then /dev/vda1,
or just two calls on /dev/vda.

We check for concurrent setup only at the very end of the blktrace setup though.

If we try to run two concurrent blktraces on the same block device the
second one will fail, and the first one seems to go on. However when
one tries to kill the first one one will see things like this:

The kernel will show these:

```
debugfs: File 'dropped' in directory 'nvme1n1' already present!
debugfs: File 'msg' in directory 'nvme1n1' already present!
debugfs: File 'trace0' in directory 'nvme1n1' already present!
```

And userspace just sees this error message for the second call:

```
blktrace /dev/nvme1n1
BLKTRACESETUP(2) /dev/nvme1n1 failed: 5/Input/output error
```

The first userspace process #1 will also claim that the files
were taken underneath their nose as well. The files are taken
away from the first process given that when the second blktrace
fails, it will follow up with a BLKTRACESTOP and BLKTRACETEARDOWN.
This means that even if go-happy process #1 is waiting for blktrace
data, we *have* been asked to take teardown the blktrace.

This can easily be reproduced with break-blktrace [0] run_0005.sh test.

Just break out early if we know we're already going to fail, this will
prevent trying to create the files all over again, which we know still
exist.

[0] https://github.com/mcgrof/break-blktrace
Signed-off-by: Luis Chamberlain 
---
 kernel/trace/blktrace.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 6c10a1427de2..ac6650828d49 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -3,6 +3,9 @@
  * Copyright (C) 2006 Jens Axboe 
  *
  */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include 
 #include 
 #include 
@@ -493,6 +496,16 @@ static int do_blk_trace_setup(struct request_queue *q, 
char *name, dev_t dev,
 */
strreplace(buts->name, '/', '_');
 
+   /*
+* bdev can be NULL, as with scsi-generic, this is as helpful as
+* we can be.
+*/
+   if (q->blk_trace) {
+   pr_warn("Concurrent blktraces are not allowed on %s\n",
+   buts->name);
+   return -EBUSY;
+   }
+
bt = kzalloc(sizeof(*bt), GFP_KERNEL);
if (!bt)
return -ENOMEM;
-- 
2.26.2



[PATCH v5 0/7] block: fix blktrace debugfs use after free

2020-05-15 Thread Luis Chamberlain
On this v5 I've split up the first patch into 3, one for comments,
another for context / might_sleep() updates, and the last the big
revert back to synchronous request_queue removal. I didn't update
the context for the put / decrements for gendisk & request_queue
as they would be updated in the next patch.

Since the first 3 patches are a reflection of the original one, I've
left the Reviewed-by's collected in place.

I've changed the kzalloc() / snprintf() to just kasprintf() as requested
by Bart. Since it was not clear that we don't have the bdev on
do_blk_trace_setup() for the patch titled "blktrace: break out of
blktrace setup on concurrent calls", I've added a comment so that
someone doesn't later try to add a dev_printk() or the like.

I've also addressed a compilation issue with debugfs disabled reported
by 0-day on the patch titled "blktrace: fix debugfs use after free". It
was missing a "static inline" on a function. I've also moved the new
declarations underneath the "#ifdef CONFIG_BLOCK" on include/linux/genhd.h,
I previously had them outside of this block.

I've left in place the scsi-generic blktrace support given I didn't receive any
feedback to kill it. This ensures this works as it used to.

Since these are minor changes I've given this a spin with break-blktrace
tests I have written and also ran blktrace with a scsi-generic media
changer. Both sg0 (the controller) and sg1 worked as expected.

These changes are based on linux-next tag next-20200515, and can also be
found on my git tree:

https://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux-next.git/log/?h=20200515-blktrace-fixes

Luis Chamberlain (7):
  block: add docs for gendisk / request_queue refcount helpers
  block: clarify context for gendisk / request_queue refcount increment
helpers
  block: revert back to synchronous request_queue removal
  block: move main block debugfs initialization to its own file
  blktrace: fix debugfs use after free
  blktrace: break out of blktrace setup on concurrent calls
  loop: be paranoid on exit and prevent new additions / removals

 block/Makefile   |  10 +-
 block/blk-core.c |  32 --
 block/blk-debugfs.c  | 197 +++
 block/blk-mq-debugfs.c   |   5 -
 block/blk-sysfs.c|  46 
 block/blk.h  |  24 +
 block/bsg.c  |   2 +
 block/genhd.c|  73 -
 block/partitions/core.c  |   9 ++
 drivers/block/loop.c |   4 +
 drivers/scsi/ch.c|   1 +
 drivers/scsi/sg.c|  75 +
 drivers/scsi/st.c|   2 +
 include/linux/blkdev.h   |   6 +-
 include/linux/blktrace_api.h |   1 -
 include/linux/genhd.h|  69 
 kernel/trace/blktrace.c  |  37 +--
 17 files changed, 545 insertions(+), 48 deletions(-)
 create mode 100644 block/blk-debugfs.c

-- 
2.26.2



[PATCH v5 1/7] block: add docs for gendisk / request_queue refcount helpers

2020-05-15 Thread Luis Chamberlain
This adds documentation for the gendisk / request_queue refcount
helpers.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Bart Van Assche 
Signed-off-by: Luis Chamberlain 
---
 block/blk-core.c | 13 +
 block/genhd.c| 50 +++-
 2 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 5847993738f1..e438c3b0815b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -321,6 +321,13 @@ void blk_clear_pm_only(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_clear_pm_only);
 
+/**
+ * blk_put_queue - decrement the request_queue refcount
+ * @q: the request_queue structure to decrement the refcount for
+ *
+ * Decrements the refcount of the request_queue kobject. When this reaches 0
+ * we'll have blk_release_queue() called.
+ */
 void blk_put_queue(struct request_queue *q)
 {
kobject_put(>kobj);
@@ -598,6 +605,12 @@ struct request_queue *blk_alloc_queue(make_request_fn 
make_request, int node_id)
 }
 EXPORT_SYMBOL(blk_alloc_queue);
 
+/**
+ * blk_get_queue - increment the request_queue refcount
+ * @q: the request_queue structure to increment the refcount for
+ *
+ * Increment the refcount of the request_queue kobject.
+ */
 bool blk_get_queue(struct request_queue *q)
 {
if (likely(!blk_queue_dying(q))) {
diff --git a/block/genhd.c b/block/genhd.c
index afdb2c3e5b22..af910e6a0233 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -908,6 +908,20 @@ static void invalidate_partition(struct gendisk *disk, int 
partno)
bdput(bdev);
 }
 
+/**
+ * del_gendisk - remove the gendisk
+ * @disk: the struct gendisk to remove
+ *
+ * Removes the gendisk and all its associated resources. This deletes the
+ * partitions associated with the gendisk, and unregisters the associated
+ * request_queue.
+ *
+ * This is the counter to the respective __device_add_disk() call.
+ *
+ * The final removal of the struct gendisk happens when its refcount reaches 0
+ * with put_disk(), which should be called after del_gendisk(), if
+ * __device_add_disk() was used.
+ */
 void del_gendisk(struct gendisk *disk)
 {
struct disk_part_iter piter;
@@ -1539,6 +1553,23 @@ int disk_expand_part_tbl(struct gendisk *disk, int 
partno)
return 0;
 }
 
+/**
+ * disk_release - releases all allocated resources of the gendisk
+ * @dev: the device representing this disk
+ *
+ * This function releases all allocated resources of the gendisk.
+ *
+ * The struct gendisk refcount is incremented with get_gendisk() or
+ * get_disk_and_module(), and its refcount is decremented with
+ * put_disk_and_module() or put_disk(). Once the refcount reaches 0 this
+ * function is called.
+ *
+ * Drivers which used __device_add_disk() have a gendisk with a request_queue
+ * assigned. Since the request_queue sits on top of the gendisk for these
+ * drivers we also call blk_put_queue() for them, and we expect the
+ * request_queue refcount to reach 0 at this point, and so the request_queue
+ * will also be freed prior to the disk.
+ */
 static void disk_release(struct device *dev)
 {
struct gendisk *disk = dev_to_disk(dev);
@@ -1748,6 +1779,13 @@ struct gendisk *__alloc_disk_node(int minors, int 
node_id)
 }
 EXPORT_SYMBOL(__alloc_disk_node);
 
+/**
+ * get_disk_and_module - increments the gendisk and gendisk fops module 
refcount
+ * @disk: the struct gendisk to increment the refcount for
+ *
+ * This increments the refcount for the struct gendisk, and the gendisk's
+ * fops module owner.
+ */
 struct kobject *get_disk_and_module(struct gendisk *disk)
 {
struct module *owner;
@@ -1768,6 +1806,13 @@ struct kobject *get_disk_and_module(struct gendisk *disk)
 }
 EXPORT_SYMBOL(get_disk_and_module);
 
+/**
+ * put_disk - decrements the gendisk refcount
+ * @disk: the struct gendisk to decrement the refcount for
+ *
+ * This decrements the refcount for the struct gendisk. When this reaches 0
+ * we'll have disk_release() called.
+ */
 void put_disk(struct gendisk *disk)
 {
if (disk)
@@ -1775,7 +1820,10 @@ void put_disk(struct gendisk *disk)
 }
 EXPORT_SYMBOL(put_disk);
 
-/*
+/**
+ * put_disk_and_module - decrements the module and gendisk refcount
+ * @disk: the struct gendisk to decrement the refcount for
+ *
  * This is a counterpart of get_disk_and_module() and thus also of
  * get_gendisk().
  */
-- 
2.26.2



[PATCH v5 2/7] block: clarify context for gendisk / request_queue refcount increment helpers

2020-05-15 Thread Luis Chamberlain
Let us clarify the context under which the helpers to increment the
refcount for the gendisk and request_queue can be called under. We
make this explicit on the places where we may sleep with might_sleep().

We don't address the decrement context yet, as that needs some extra
work and fixes, but will be addressed in the next patch.

Reviewed-by: Christoph Hellwig 
Reviewed-by: Bart Van Assche 
Signed-off-by: Luis Chamberlain 
---
 block/blk-core.c | 2 ++
 block/genhd.c| 6 ++
 2 files changed, 8 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index e438c3b0815b..94216fa16a05 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -610,6 +610,8 @@ EXPORT_SYMBOL(blk_alloc_queue);
  * @q: the request_queue structure to increment the refcount for
  *
  * Increment the refcount of the request_queue kobject.
+ *
+ * Context: Any context.
  */
 bool blk_get_queue(struct request_queue *q)
 {
diff --git a/block/genhd.c b/block/genhd.c
index af910e6a0233..598bd32ad28c 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1017,11 +1017,15 @@ static ssize_t disk_badblocks_store(struct device *dev,
  *
  * This function gets the structure containing partitioning
  * information for the given device @devt.
+ *
+ * Context: can sleep
  */
 struct gendisk *get_gendisk(dev_t devt, int *partno)
 {
struct gendisk *disk = NULL;
 
+   might_sleep();
+
if (MAJOR(devt) != BLOCK_EXT_MAJOR) {
struct kobject *kobj;
 
@@ -1785,6 +1789,8 @@ EXPORT_SYMBOL(__alloc_disk_node);
  *
  * This increments the refcount for the struct gendisk, and the gendisk's
  * fops module owner.
+ *
+ * Context: Any context.
  */
 struct kobject *get_disk_and_module(struct gendisk *disk)
 {
-- 
2.26.2



[PATCH v5 4/7] block: move main block debugfs initialization to its own file

2020-05-15 Thread Luis Chamberlain
make_request-based drivers and request-based drivers share some
debugfs code. By moving this into its own file it makes it easier
to expand and audit this shared code.

This patch contains no functional changes.

Cc: Bart Van Assche 
Cc: Omar Sandoval 
Cc: Hannes Reinecke 
Cc: Nicolai Stange 
Cc: Greg Kroah-Hartman 
Cc: Michal Hocko 
Cc: yu kuai 
Reviewed-by: Greg Kroah-Hartman 
Reviewed-by: Bart Van Assche 
Signed-off-by: Luis Chamberlain 
---
 block/Makefile  | 10 +++---
 block/blk-core.c|  9 +
 block/blk-debugfs.c | 15 +++
 block/blk.h |  8 
 4 files changed, 31 insertions(+), 11 deletions(-)
 create mode 100644 block/blk-debugfs.c

diff --git a/block/Makefile b/block/Makefile
index 78719169fb2a..ec4b17f9dd93 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,7 +8,8 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
-   genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o
+   genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
+   debugfs.o
 
 obj-$(CONFIG_BOUNCE)   += bounce.o
 obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o
@@ -32,8 +33,11 @@ obj-$(CONFIG_BLK_MQ_VIRTIO)  += blk-mq-virtio.o
 obj-$(CONFIG_BLK_MQ_RDMA)  += blk-mq-rdma.o
 obj-$(CONFIG_BLK_DEV_ZONED)+= blk-zoned.o
 obj-$(CONFIG_BLK_WBT)  += blk-wbt.o
-obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
-obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
+
+debugfs-$(CONFIG_DEBUG_FS) += blk-debugfs.o
+debugfs-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
+debugfs-$(CONFIG_BLK_DEBUG_FS_ZONED)   += blk-mq-debugfs-zoned.o
+
 obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
 obj-$(CONFIG_BLK_PM)   += blk-pm.o
 obj-$(CONFIG_BLK_INLINE_ENCRYPTION)+= keyslot-manager.o blk-crypto.o
diff --git a/block/blk-core.c b/block/blk-core.c
index 8a785d16033b..d40648958767 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -51,10 +51,6 @@
 #include "blk-pm.h"
 #include "blk-rq-qos.h"
 
-#ifdef CONFIG_DEBUG_FS
-struct dentry *blk_debugfs_root;
-#endif
-
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
@@ -1880,10 +1876,7 @@ int __init blk_dev_init(void)
 
blk_requestq_cachep = kmem_cache_create("request_queue",
sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
-
-#ifdef CONFIG_DEBUG_FS
-   blk_debugfs_root = debugfs_create_dir("block", NULL);
-#endif
+   blk_debugfs_register();
 
return 0;
 }
diff --git a/block/blk-debugfs.c b/block/blk-debugfs.c
new file mode 100644
index ..19091e1effc0
--- /dev/null
+++ b/block/blk-debugfs.c
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Shared request-based / make_request-based functionality
+ */
+#include 
+#include 
+#include 
+
+struct dentry *blk_debugfs_root;
+
+void blk_debugfs_register(void)
+{
+   blk_debugfs_root = debugfs_create_dir("block", NULL);
+}
diff --git a/block/blk.h b/block/blk.h
index fc00537026a0..ee309233f95e 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -458,4 +458,12 @@ int bio_add_hw_page(struct request_queue *q, struct bio 
*bio,
struct page *page, unsigned int len, unsigned int offset,
unsigned int max_sectors, bool *same_page);
 
+#ifdef CONFIG_DEBUG_FS
+void blk_debugfs_register(void);
+#else
+static inline void blk_debugfs_register(void)
+{
+}
+#endif /* CONFIG_DEBUG_FS */
+
 #endif /* BLK_INTERNAL_H */
-- 
2.26.2



[PATCH v5 5/7] blktrace: fix debugfs use after free

2020-05-15 Thread Luis Chamberlain
On commit 6ac93117ab00 ("blktrace: use existing disk debugfs directory")
merged on v4.12 Omar fixed the original blktrace code for request-based
drivers (multiqueue). This however left in place a possible crash, if you
happen to abuse blktrace while racing to remove / add a device.

We used to use asynchronous removal of the request_queue, and with that
the issue was easier to reproduce. Now that we have reverted to
synchronous removal of the request_queue, the issue is still possible to
reproduce, its however just a bit more difficult.

We essentially run two instances of break-blktrace which add/remove
a loop device, and setup a blktrace and just never tear the blktrace
down. We do this twice in parallel. This is easily reproduced with the
break-blktrace run_0004.sh script.

We can end up with two types of panics each reflecting where we
race, one a failed blktrace setup:

[  252.426751] debugfs: Directory 'loop0' with parent 'block' already present!
[  252.432265] BUG: kernel NULL pointer dereference, address: 00a0
[  252.436592] #PF: supervisor write access in kernel mode
[  252.439822] #PF: error_code(0x0002) - not-present page
[  252.442967] PGD 0 P4D 0
[  252.444656] Oops: 0002 [#1] SMP NOPTI
[  252.446972] CPU: 10 PID: 1153 Comm: break-blktrace Tainted: GE   
  5.7.0-rc2-next-20200420+ #164
[  252.452673] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.13.0-1 04/01/2014
[  252.456343] RIP: 0010:down_write+0x15/0x40
[  252.458146] Code: eb ca e8 ae 22 8d ff cc cc cc cc cc cc cc cc cc cc cc cc
   cc cc 0f 1f 44 00 00 55 48 89 fd e8 52 db ff ff 31 c0 ba 01 00
   00 00  48 0f b1 55 00 75 0f 48 8b 04 25 c0 8b 01 00 48 89
   45 08 5d
[  252.463638] RSP: 0018:a626415abcc8 EFLAGS: 00010246
[  252.464950] RAX:  RBX: 958c25f0f5c0 RCX: ff81
[  252.466727] RDX: 0001 RSI: ff81 RDI: 00a0
[  252.468482] RBP: 00a0 R08:  R09: 0001
[  252.470014] R10:  R11: 958d1f9227ff R12: 
[  252.471473] R13: 958c25ea5380 R14: 8cce15f1 R15: 00a0
[  252.473346] FS:  7f2e69dee540() GS:958c2fc8() 
knlGS:
[  252.475225] CS:  0010 DS:  ES:  CR0: 80050033
[  252.476267] CR2: 00a0 CR3: 000427d10004 CR4: 00360ee0
[  252.477526] DR0:  DR1:  DR2: 
[  252.478776] DR3:  DR6: fffe0ff0 DR7: 0400
[  252.479866] Call Trace:
[  252.480322]  simple_recursive_removal+0x4e/0x2e0
[  252.481078]  ? debugfs_remove+0x60/0x60
[  252.481725]  ? relay_destroy_buf+0x77/0xb0
[  252.482662]  debugfs_remove+0x40/0x60
[  252.483518]  blk_remove_buf_file_callback+0x5/0x10
[  252.484328]  relay_close_buf+0x2e/0x60
[  252.484930]  relay_open+0x1ce/0x2c0
[  252.485520]  do_blk_trace_setup+0x14f/0x2b0
[  252.486187]  __blk_trace_setup+0x54/0xb0
[  252.486803]  blk_trace_ioctl+0x90/0x140
[  252.487423]  ? do_sys_openat2+0x1ab/0x2d0
[  252.488053]  blkdev_ioctl+0x4d/0x260
[  252.488636]  block_ioctl+0x39/0x40
[  252.489139]  ksys_ioctl+0x87/0xc0
[  252.489675]  __x64_sys_ioctl+0x16/0x20
[  252.490380]  do_syscall_64+0x52/0x180
[  252.491032]  entry_SYSCALL_64_after_hwframe+0x44/0xa9

And the other on the device removal:

[  128.528940] debugfs: Directory 'loop0' with parent 'block' already present!
[  128.615325] BUG: kernel NULL pointer dereference, address: 00a0
[  128.619537] #PF: supervisor write access in kernel mode
[  128.622700] #PF: error_code(0x0002) - not-present page
[  128.625842] PGD 0 P4D 0
[  128.627585] Oops: 0002 [#1] SMP NOPTI
[  128.629871] CPU: 12 PID: 544 Comm: break-blktrace Tainted: GE
 5.7.0-rc2-next-20200420+ #164
[  128.635595] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.13.0-1 04/01/2014
[  128.640471] RIP: 0010:down_write+0x15/0x40
[  128.643041] Code: eb ca e8 ae 22 8d ff cc cc cc cc cc cc cc cc cc cc cc cc
   cc cc 0f 1f 44 00 00 55 48 89 fd e8 52 db ff ff 31 c0 ba 01 00
   00 00  48 0f b1 55 00 75 0f 65 48 8b 04 25 c0 8b 01 00 48 89
   45 08 5d
[  128.650180] RSP: 0018:a9c3c05ebd78 EFLAGS: 00010246
[  128.651820] RAX:  RBX: 8ae9a6370240 RCX: ff81
[  128.653942] RDX: 0001 RSI: ff81 RDI: 00a0
[  128.655720] RBP: 00a0 R08: 0002 R09: 8ae9afd2d3d0
[  128.657400] R10: 0056 R11:  R12: 
[  128.659099] R13:  R14: 0003 R15: 00a0
[  128.660500] FS:  7febfd995540() GS:8ae9afd0() 
knlGS:
[  128.662204] CS:  0010 DS:  ES:  CR0: 80050033
[  128.663426] CR2: 00a0 CR3: 000420042003 CR4: 00360ee0
[  128.664776] DR0: 

Re: [RFC PATCH 11/13] scsi: Allow device handler set their own CDB

2020-05-15 Thread Bart Van Assche
On 2020-05-15 03:30, Avri Altman wrote:
> Allow scsi device handler handle their own CDB and ship it down the
> stream of scsi passthrough command setup flow, without any further
> interventions.
> 
> This is useful for setting DRV-OP that implements vendor commands via
> the scsi device handlers framework.
> 
> Signed-off-by: Avri Altman 
> ---
>  drivers/scsi/scsi_lib.c | 5 +++--
>  drivers/scsi/sd.c   | 9 +
>  2 files changed, 12 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
> index 6b6dd40..4e98714 100644
> --- a/drivers/scsi/scsi_lib.c
> +++ b/drivers/scsi/scsi_lib.c
> @@ -1148,14 +1148,15 @@ static blk_status_t scsi_setup_fs_cmnd(struct 
> scsi_device *sdev,
>  {
>   struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
>  
> + cmd->cmnd = scsi_req(req)->cmd = scsi_req(req)->__cmd;
> + memset(cmd->cmnd, 0, BLK_MAX_CDB);
> +
>   if (unlikely(sdev->handler && sdev->handler->prep_fn)) {
>   blk_status_t ret = sdev->handler->prep_fn(sdev, req);
>   if (ret != BLK_STS_OK)
>   return ret;
>   }
>  
> - cmd->cmnd = scsi_req(req)->cmd = scsi_req(req)->__cmd;
> - memset(cmd->cmnd, 0, BLK_MAX_CDB);
>   return scsi_cmd_to_driver(cmd)->init_command(cmd);
>  }
>  
> diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
> index a793cb0..246bef8 100644
> --- a/drivers/scsi/sd.c
> +++ b/drivers/scsi/sd.c
> @@ -1221,6 +1221,14 @@ static blk_status_t sd_setup_read_write_cmnd(struct 
> scsi_cmnd *cmd)
>   } else if (sdp->use_16_for_rw || (nr_blocks > 0x)) {
>   ret = sd_setup_rw16_cmnd(cmd, write, lba, nr_blocks,
>protect | fua);
> + } else if (unlikely(sdp->handler && blk_rq_is_private(rq))) {
> + /*
> +  * scsi device handler that implements vendor commands -
> +  * the command was already constructed in the device handler's
> +  * prep_fn, so no need to do anything here
> +  */
> + rq->cmd_flags = REQ_OP_READ;
> + ret = BLK_STS_OK;
>   } else if ((nr_blocks > 0xff) || (lba > 0x1f) ||
>  sdp->use_10_for_rw || protect) {
>   ret = sd_setup_rw10_cmnd(cmd, write, lba, nr_blocks,
> @@ -1285,6 +1293,7 @@ static blk_status_t sd_init_command(struct scsi_cmnd 
> *cmd)
>   return sd_setup_write_same_cmnd(cmd);
>   case REQ_OP_FLUSH:
>   return sd_setup_flush_cmnd(cmd);
> + case REQ_OP_DRV_IN:
>   case REQ_OP_READ:
>   case REQ_OP_WRITE:
>   return sd_setup_read_write_cmnd(cmd);

The above looks weird to me. My understanding is that
scsi_setup_fs_cmnd() is intended for operations like REQ_OP_READ,
REQ_OP_WRITE and REQ_OP_DISCARD. I don't think that it is appropriate to
pass REQ_OP_DRV_IN requests to scsi_setup_fs_cmnd() and/or
sd_init_command().

Thanks,

Bart.




Re: [PATCH] tick/nohz: Narrow down noise while setting current task's tick dependency

2020-05-15 Thread Paul E. McKenney
On Fri, May 15, 2020 at 02:34:29AM +0200, Frederic Weisbecker wrote:
> So far setting a tick dependency on any task, including current, used to
> trigger an IPI to all CPUs. That's of course suboptimal but it wasn't
> an issue as long as it was only used by posix-cpu-timers on nohz_full,
> a combo that nobody seemed to use in real life.
> 
> But RCU started to use task tick dependency on current task to fix
> stall issues on callbacks processing. These trigger regular and
> undesired system wide IPIs on nohz_full.
> 
> The fix is very easy while setting a tick dependency on the current
> task, only its CPU needs an IPI.

This passes moderate rcutorture testing.  If you want me to take it, please
let me know, and otherwise:

Tested-by: Paul E. McKenney 

> Fixes: 6a949b7af82d (rcu: Force on tick when invoking lots of callbacks)
> Reported-by: Matt Fleming 
> Signed-off-by: Frederic Weisbecker 
> Cc: sta...@kernel.org
> Cc: Paul E. McKenney 
> Cc: Thomas Gleixner 
> Cc: Peter Zijlstra 
> Cc: Ingo Molnar 
> ---
>  kernel/time/tick-sched.c | 22 +++---
>  1 file changed, 15 insertions(+), 7 deletions(-)
> 
> diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
> index 3e2dc9b8858c..f0199a4ba1ad 100644
> --- a/kernel/time/tick-sched.c
> +++ b/kernel/time/tick-sched.c
> @@ -351,16 +351,24 @@ void tick_nohz_dep_clear_cpu(int cpu, enum 
> tick_dep_bits bit)
>  EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);
>  
>  /*
> - * Set a per-task tick dependency. Posix CPU timers need this in order to 
> elapse
> - * per task timers.
> + * Set a per-task tick dependency. RCU need this. Also posix CPU timers
> + * in order to elapse per task timers.
>   */
>  void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
>  {
> - /*
> -  * We could optimize this with just kicking the target running the task
> -  * if that noise matters for nohz full users.
> -  */
> - tick_nohz_dep_set_all(>tick_dep_mask, bit);
> + if (!atomic_fetch_or(BIT(bit), >tick_dep_mask)) {
> + if (tsk == current) {
> + preempt_disable();
> + tick_nohz_full_kick();
> + preempt_enable();
> + } else {
> + /*
> +  * Some future tick_nohz_full_kick_task()
> +  * should optimize this.
> +  */
> + tick_nohz_full_kick_all();
> + }
> + }
>  }
>  EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);
>  
> -- 
> 2.26.2
> 


Re: [PATCH 2/4] proc/sysctl: add shared variables -1

2020-05-15 Thread Xiaoming Ni

On 2020/5/16 10:47, Kees Cook wrote:

On Sat, May 16, 2020 at 10:32:19AM +0800, Xiaoming Ni wrote:

On 2020/5/16 0:05, Kees Cook wrote:

On Fri, May 15, 2020 at 05:06:28PM +0800, Xiaoming Ni wrote:

On 2020/5/15 16:06, Kees Cook wrote:

On Fri, May 15, 2020 at 12:33:42PM +0800, Xiaoming Ni wrote:

Add the shared variable SYSCTL_NEG_ONE to replace the variable neg_one
used in both sysctl_writes_strict and hung_task_warnings.

Signed-off-by: Xiaoming Ni 
---
fs/proc/proc_sysctl.c | 2 +-
include/linux/sysctl.h| 1 +
kernel/hung_task_sysctl.c | 3 +--
kernel/sysctl.c   | 3 +--


How about doing this refactoring in advance of the extraction patch?

Before  advance of the extraction patch, neg_one is only used in one file,
does it seem to have no value for refactoring?


I guess it doesn't matter much, but I think it's easier to review in the
sense that neg_one is first extracted and then later everything else is
moved.


Later, when more features sysctl interface is moved to the code file, there
will be more variables that need to be extracted.
So should I only extract the neg_one variable here, or should I extract all
the variables used by multiple features?


Hmm -- if you're going to do a consolidation pass, then nevermind, I
don't think order will matter then.

Thank you for the cleanup! Sorry we're giving you back-and-forth advice!

-Kees



Sorry, I don't fully understand.
Does this mean that there is no need to adjust the patch order or the 
order of variables in sysctl_vals?

Should I extract only SYSCTL_NEG_ONE or should I extract all variables?

Thanks
Xiaoming Ni



Re: [RFC PATCH 09/13] scsi: ufshpb: Add response API

2020-05-15 Thread Bart Van Assche
On 2020-05-15 03:30, Avri Altman wrote:
> +#define RSP_DATA_SEG_LEN (sizeof(struct ufshpb_sense_data))

The name of this macro is almost as long as the expression it replaces.
It may make the code easier to read by dropping this macro and using the
sizeof() expression directly.

> + struct tasklet_struct rsp_tasklet;

Why a tasklet instead of e.g. a work_struct? Tasklets can cause nasty
problems, e.g. CPU lockup complaints if too much work is done in tasklet
context.

> +static void ufshpb_dh_notify(struct ufshpb_lun *hpb,
> +  struct ufshpb_sense_data *sense)
> +{
> + struct ufs_hba *hba = shost_priv(hpb->sdev->host);
> +
> + spin_lock(hba->host->host_lock);
> +
> + if (scsi_device_get(hpb->sdev)) {
> + spin_unlock(hba->host->host_lock);
> + return;
> + }
> +
> + scsi_dh_set_params(hpb->sdev->request_queue, (const char *)sense);
> +
> + scsi_device_put(hpb->sdev);
> +
> + spin_unlock(hba->host->host_lock);
> +}

To me this looks like slight abuse of the scsi_dh_set_params() function.
The documentation of that function mentions clearly that the second
argument is an ASCII string and not e.g. sense data.

Has this driver been tested on a system with lockdep enabled? I don't
think that it is acceptable to use spin_lock() in tasklet context.

> +static void ufshpb_tasklet_fn(unsigned long priv)
> +{
> + struct ufshpb_lun *hpb = (struct ufshpb_lun *)priv;
> + struct ufshpb_rsp_element *rsp_elem = NULL;
> + unsigned long flags;
> +
> + while (1) {
> + spin_lock_irqsave(>rsp_list_lock, flags);
> + rsp_elem = ufshpb_get_rsp_elem(hpb, >lh_rsp);
> + spin_unlock_irqrestore(>rsp_list_lock, flags);
> +
> + if (!rsp_elem)
> + return;
> +
> + ufshpb_dh_notify(hpb, _elem->sense_data);
> +
> + spin_lock_irqsave(>rsp_list_lock, flags);
> + list_add_tail(_elem->list, >lh_rsp_free);
> + spin_unlock_irqrestore(>rsp_list_lock, flags);
> + }
> +}

Please schedule work instead of using tasklet context.

Thanks,

Bart.


Re: [PATCH] bus: mhi: core: Use current ee in intvec handler

2020-05-15 Thread bbhatt

On 2020-05-14 19:17, Jeffrey Hugo wrote:

The intvec handler stores the cached ee in a local variable for use in
processing the intvec.  It should instead use the current ee which is
read at the beginning of the intvec in case the intvec is related
to

an ee change.  Otherwise, the handler might make the wrong decision
based on an incorrect ee.

Fixes: 3000f85b8f47 (bus: mhi: core: Add support for basic PM 
operations)

Signed-off-by: Jeffrey Hugo 
---
 drivers/bus/mhi/core/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/bus/mhi/core/main.c b/drivers/bus/mhi/core/main.c
index 7272a5a..0a41fe5 100644
--- a/drivers/bus/mhi/core/main.c
+++ b/drivers/bus/mhi/core/main.c
@@ -386,8 +386,8 @@ irqreturn_t mhi_intvec_threaded_handler(int
irq_number, void *dev)
write_lock_irq(_cntrl->pm_lock);
if (MHI_REG_ACCESS_VALID(mhi_cntrl->pm_state)) {
state = mhi_get_mhi_state(mhi_cntrl);
-   ee = mhi_cntrl->ee;
mhi_cntrl->ee = mhi_get_exec_env(mhi_cntrl);
+   ee = mhi_cntrl->ee;
}

if (state == MHI_STATE_SYS_ERR) {

Hi Jeff,

Let's hold off on this change for now please as we have a good set of
bug fixes and improvements coming in very soon. They're only pending 
post

to LKML.

Thanks


Re: [PATCH net-next] hinic: add set_channels ethtool_ops support

2020-05-15 Thread luobin (L)



On 2020/5/16 2:13, Michal Kubecek wrote:

On Fri, May 15, 2020 at 12:35:47AM +, Luo bin wrote:

add support to change TX/RX queue number with ethtool -L

Signed-off-by: Luo bin 
---
  .../net/ethernet/huawei/hinic/hinic_ethtool.c | 67 +--
  .../net/ethernet/huawei/hinic/hinic_hw_dev.c  |  7 ++
  .../net/ethernet/huawei/hinic/hinic_hw_dev.h  |  2 +
  .../net/ethernet/huawei/hinic/hinic_main.c|  5 +-
  drivers/net/ethernet/huawei/hinic/hinic_tx.c  |  5 ++
  5 files changed, 79 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c 
b/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c
index ace18d258049..92a0e3bd19c3 100644
--- a/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c
+++ b/drivers/net/ethernet/huawei/hinic/hinic_ethtool.c
@@ -619,14 +619,68 @@ static void hinic_get_channels(struct net_device *netdev,
struct hinic_dev *nic_dev = netdev_priv(netdev);
struct hinic_hwdev *hwdev = nic_dev->hwdev;
  
-	channels->max_rx = hwdev->nic_cap.max_qps;

-   channels->max_tx = hwdev->nic_cap.max_qps;
+   channels->max_rx = 0;
+   channels->max_tx = 0;
channels->max_other = 0;
-   channels->max_combined = 0;
-   channels->rx_count = hinic_hwdev_num_qps(hwdev);
-   channels->tx_count = hinic_hwdev_num_qps(hwdev);
+   channels->max_combined = nic_dev->max_qps;
+   channels->rx_count = 0;
+   channels->tx_count = 0;
channels->other_count = 0;
-   channels->combined_count = 0;
+   channels->combined_count = hinic_hwdev_num_qps(hwdev);
+}
+
+int hinic_set_channels(struct net_device *netdev,
+  struct ethtool_channels *channels)
+{
+   struct hinic_dev *nic_dev = netdev_priv(netdev);
+   unsigned int count = channels->combined_count;
+   int err;
+
+   if (!count) {
+   netif_err(nic_dev, drv, netdev,
+ "Unsupported combined_count: 0\n");
+   return -EINVAL;
+   }
+
+   if (channels->tx_count || channels->rx_count || channels->other_count) {
+   netif_err(nic_dev, drv, netdev,
+ "Setting rx/tx/other count not supported\n");
+   return -EINVAL;
+   }

With max_* reported as 0, these will be caught in ethnl_set_channels()
or ethtool_set_channels().
---Will fix. Thanks

+   if (!(nic_dev->flags & HINIC_RSS_ENABLE)) {
+   netif_err(nic_dev, drv, netdev,
+ "This function doesn't support RSS, only support 1 queue 
pair\n");
+   return -EOPNOTSUPP;
+   }

I'm not sure if the request should fail even if requested count is
actually 1.


+   if (count > nic_dev->max_qps) {
+   netif_err(nic_dev, drv, netdev,
+ "Combined count %d exceeds limit %d\n",
+ count, nic_dev->max_qps);
+   return -EINVAL;
+   }

As above, this check has been already performed in ethnl_set_channels()
or ethtool_set_channels().
---Will fix. Thanks

+   netif_info(nic_dev, drv, netdev, "Set max combined queue number from %d to 
%d\n",
+  hinic_hwdev_num_qps(nic_dev->hwdev), count);

We have netlink notifications now, is it necessary to log successful
changes?
---I think it contributes to locating defects for developers.

Michal
.


Re: [PATCH v10 01/26] Documentation/x86: Add CET description

2020-05-15 Thread H.J. Lu
On Fri, May 15, 2020 at 4:56 PM Dave Hansen  wrote:
>
> On 5/15/20 4:29 PM, Yu-cheng Yu wrote:
> > On Fri, 2020-05-15 at 15:43 -0700, Dave Hansen wrote:
> >> Basically, if there ends up being a bug in an app that violates the
> >> shadow stack rules, the app is broken, period.  The only recourse is to
> >> have the kernel disable CET and reboot.
> >>
> >> Is that right?
> >
> > You must be talking about init or any of the system daemons, right?
> > Assuming we let the app itself start CET with an arch_prctl(), why would 
> > that be
> > different from the current approach?
>
> You're getting ahead of me a bit here.
>
> I'm actually not asking directly about the prctls() or advocating for a
> different approach.  The MPX approach of _requiring the app to make a
> prctl() was actually pretty nasty because sometimes threads got created
> before the prctl() could get called.  Apps ended up inadvertently
> half-MPX-enabled.  Not fun.
>
> Let's say we have an app doing silly things like retpolines.  (Lots of
> app do lots of silly things).  It gets compiled in a distro but never
> runs on a system with CET.  The app gets run for the first time on a
> system with CET.  App goes boom.  Not init, just some random app, say
> /usr/bin/ldapsearch.

I designed and implemented the CET toolchain and run-time in such a way
that it is very difficult to happen.   Basically, CET won't be enabled on such
an app.

> What's my recourse as an end user?  I want to run my app and turn off
> CET for that app.  How can I do that?

The CET OS I designed turns CET off for you and you don't have to do
anything.

>  Can a binary compiled without CET run CET-enabled code?
> >>>
> >>> Partially yes, but in reality somewhat difficult.
> >> ...
> >>> - If a not-CET application does fork(), and the child wants to turn on 
> >>> CET, it
> >>> would be difficult to manage the stack frames, unless the child knows 
> >>> what is is
> >>> doing.
> >>
> >> It might be hard to do, but it is possible with the patches you posted?
> >
> > It is possible to add an arch_prctl() to turn on CET.  That is simple from 
> > the
> > kernel's perspective, but difficult for the application.  Once the app 
> > enables
> > shadow stack, it has to take care not to return beyond the function call 
> > layers
> > before that point.  It can no longer do longjmp or ucontext swaps to 
> > anything
> > before that point.  It will also be complicated if the app enables shadow 
> > stack
> > in a signal handler.
>
> Yu-cheng, I'm having a very hard time getting direct answers to my
> questions.  Could you endeavor to give succinct, direct answers?  If you
> want to give a longer, conditioned answer, that's great.  But, I'd
> appreciate if you could please focus first on clearly answering the
> questions that I'm asking.
>
> Let me try again:
>
> Is it possible with the patches in this series to run a single-
> threaded binary which was has GNU_PROPERTY_X86_FEATURE_1_SHSTK
> unset to run with shadow stack protection?

Yes, you can.  I added such capabilities for testing purposes.  But your
application
will crash as soon as there is a CET violation.  My CET software design is very
flexible.  It can accommodate different requirements.  We are working
with 2 OSVs
to enable CET in their OSes.  So far we haven't run into any
unexpected issues.

> I think the answer is an unambiguous: "No".  But I'd like to hear it
> from you.
>
> >>  I think you're saying that the CET-enabled binary would do
> >> arch_setup_elf_property() when it was first exec()'d.  Later, it could
> >> use the new prctl(ARCH_X86_CET_DISABLE) to disable its shadow stack,
> >> then fork() and the child would not be using CET.  Right?
> >>
> >> What is ARCH_X86_CET_DISABLE used for, anyway?
> >
> > Both the parent and the child can do ARCH_X86_CET_DISABLE, if CET is
> > not locked.
>
> Could you please describe a real-world example of why
> ARCH_X86_CET_DISABLE exists?  What kinds of apps will use it, or *are*
> using it?  Why was it created in the first place?
>
> >>> The JIT examples I mentioned previously run with CET enabled from the
> >>> beginning.  Do you have a reason to do this?  In other words, if the JIT 
> >>> code
> >>> needs CET, the app could have started with CET in the first place.
> >>
> >> Let's say I have a JIT'd sandbox.  I want the sandbox to be
> >> CET-protected, but the JIT engine itself not to be.
> >
> > I do not have any objections to this use case, but it needs some cautions as
> > stated above.  It will be much easier and cleaner if the sandbox is in a
> > separate exec'ed task with CET on.
>
> OK, great suggestion!  Could you do some research and look at the
> various sandboxing techniques?  Is imposing this requirement for a
> separate exec'd task reasonable?  Does it fit nicely with their existing
> models?  How about the Chrome browser and Firefox sandboxs?
>
>  Does this *code* work?  Could you please indicate which JITs have been
>  enabled to use 

Re: [PATCH v10 01/26] Documentation/x86: Add CET description

2020-05-15 Thread Yu-cheng Yu
On Fri, 2020-05-15 at 16:56 -0700, Dave Hansen wrote:
> On 5/15/20 4:29 PM, Yu-cheng Yu wrote:
> > On Fri, 2020-05-15 at 15:43 -0700, Dave Hansen wrote:
> > > Basically, if there ends up being a bug in an app that violates the
> > > shadow stack rules, the app is broken, period.  The only recourse is to
> > > have the kernel disable CET and reboot.
> > > 
> > > Is that right?
> > 
> > You must be talking about init or any of the system daemons, right?
> > Assuming we let the app itself start CET with an arch_prctl(), why would 
> > that be
> > different from the current approach?
> 
> You're getting ahead of me a bit here.
> 
> I'm actually not asking directly about the prctls() or advocating for a
> different approach.  The MPX approach of _requiring the app to make a
> prctl() was actually pretty nasty because sometimes threads got created
> before the prctl() could get called.  Apps ended up inadvertently
> half-MPX-enabled.  Not fun.
> 
> Let's say we have an app doing silly things like retpolines.  (Lots of
> app do lots of silly things).  It gets compiled in a distro but never
> runs on a system with CET.  The app gets run for the first time on a
> system with CET.  App goes boom.  Not init, just some random app, say
> /usr/bin/ldapsearch.
> 
> What's my recourse as an end user?  I want to run my app and turn off
> CET for that app.  How can I do that?

GLIBC_TUNABLES=glibc.tune.hwcaps=-SHSTK,-IBT

> > > > > Can a binary compiled without CET run CET-enabled code?
> > > > 
> > > > Partially yes, but in reality somewhat difficult.
> > > ...
> > > > - If a not-CET application does fork(), and the child wants to turn on 
> > > > CET, it
> > > > would be difficult to manage the stack frames, unless the child knows 
> > > > what is is
> > > > doing.  
> > > 
> > > It might be hard to do, but it is possible with the patches you posted?
> > 
> > It is possible to add an arch_prctl() to turn on CET.  That is simple from 
> > the
> > kernel's perspective, but difficult for the application.  Once the app 
> > enables
> > shadow stack, it has to take care not to return beyond the function call 
> > layers
> > before that point.  It can no longer do longjmp or ucontext swaps to 
> > anything
> > before that point.  It will also be complicated if the app enables shadow 
> > stack
> > in a signal handler.
> 
> Yu-cheng, I'm having a very hard time getting direct answers to my
> questions.  Could you endeavor to give succinct, direct answers?  If you
> want to give a longer, conditioned answer, that's great.  But, I'd
> appreciate if you could please focus first on clearly answering the
> questions that I'm asking.
> 
> Let me try again:
> 
>   Is it possible with the patches in this series to run a single-
>   threaded binary which was has GNU_PROPERTY_X86_FEATURE_1_SHSTK
>   unset to run with shadow stack protection?
> 
> I think the answer is an unambiguous: "No".  But I'd like to hear it
> from you.

No!

> > >  I think you're saying that the CET-enabled binary would do
> > > arch_setup_elf_property() when it was first exec()'d.  Later, it could
> > > use the new prctl(ARCH_X86_CET_DISABLE) to disable its shadow stack,
> > > then fork() and the child would not be using CET.  Right?
> > > 
> > > What is ARCH_X86_CET_DISABLE used for, anyway?
> > 
> > Both the parent and the child can do ARCH_X86_CET_DISABLE, if CET is
> > not locked.
> 
> Could you please describe a real-world example of why
> ARCH_X86_CET_DISABLE exists?  What kinds of apps will use it, or *are*
> using it?  Why was it created in the first place?

Currently, ld-linux turns off CET if the binary being loaded does not support
CET.

> > > > The JIT examples I mentioned previously run with CET enabled from the
> > > > beginning.  Do you have a reason to do this?  In other words, if the 
> > > > JIT code
> > > > needs CET, the app could have started with CET in the first place.
> > > 
> > > Let's say I have a JIT'd sandbox.  I want the sandbox to be
> > > CET-protected, but the JIT engine itself not to be.
> > 
> > I do not have any objections to this use case, but it needs some cautions as
> > stated above.  It will be much easier and cleaner if the sandbox is in a
> > separate exec'ed task with CET on.
> 
> OK, great suggestion!  Could you do some research and look at the
> various sandboxing techniques?  Is imposing this requirement for a
> separate exec'd task reasonable?  Does it fit nicely with their existing
> models?  How about the Chrome browser and Firefox sandboxs?

I will check.

> > > > > Does this *code* work?  Could you please indicate which JITs have been
> > > > > enabled to use the code in this series?  How much of the new ABI is 
> > > > > in use?
> > > > 
> > > > JIT does not necessarily use all of the ABI.  The JIT changes mainly 
> > > > fix stack
> > > > frames and insert ENDBRs.  I do not work on JIT.  What I found is LLVM 
> > > > JIT fixes
> > > > are tested and in the master branch.  Sljit fixes are in the 

Re: [PATCH 2/4] proc/sysctl: add shared variables -1

2020-05-15 Thread Kees Cook
On Sat, May 16, 2020 at 10:32:19AM +0800, Xiaoming Ni wrote:
> On 2020/5/16 0:05, Kees Cook wrote:
> > On Fri, May 15, 2020 at 05:06:28PM +0800, Xiaoming Ni wrote:
> > > On 2020/5/15 16:06, Kees Cook wrote:
> > > > On Fri, May 15, 2020 at 12:33:42PM +0800, Xiaoming Ni wrote:
> > > > > Add the shared variable SYSCTL_NEG_ONE to replace the variable neg_one
> > > > > used in both sysctl_writes_strict and hung_task_warnings.
> > > > > 
> > > > > Signed-off-by: Xiaoming Ni 
> > > > > ---
> > > > >fs/proc/proc_sysctl.c | 2 +-
> > > > >include/linux/sysctl.h| 1 +
> > > > >kernel/hung_task_sysctl.c | 3 +--
> > > > >kernel/sysctl.c   | 3 +--
> > > > 
> > > > How about doing this refactoring in advance of the extraction patch?
> > > Before  advance of the extraction patch, neg_one is only used in one file,
> > > does it seem to have no value for refactoring?
> > 
> > I guess it doesn't matter much, but I think it's easier to review in the
> > sense that neg_one is first extracted and then later everything else is
> > moved.
> > 
> Later, when more features sysctl interface is moved to the code file, there
> will be more variables that need to be extracted.
> So should I only extract the neg_one variable here, or should I extract all
> the variables used by multiple features?

Hmm -- if you're going to do a consolidation pass, then nevermind, I
don't think order will matter then.

Thank you for the cleanup! Sorry we're giving you back-and-forth advice!

-Kees

-- 
Kees Cook


Re: [RFC PATCH 07/13] scsi: scsi_dh: ufshpb: Add ufshpb state machine

2020-05-15 Thread Bart Van Assche
On 2020-05-15 03:30, Avri Altman wrote:
> @@ -17,6 +18,13 @@
>  
>  #define UFSHPB_NAME  "ufshpb"
>  
> +#define UFSHPB_WRITE_BUFFER (0xfa)
> +#define WRITE_BUFFER_TIMEOUT (3 * HZ)
> +#define WRITE_BUFFER_RETRIES (3)
> +#define UFSHPB_READ_BUFFER (0xf9)
> +#define READ_BUFFER_TIMEOUT (3 * HZ)
> +#define READ_BUFFER_RETRIES (3)

Parentheses around expressions are normal but parentheses around
constants are unusual. I think the parentheses around constants can be
left out.

> +#define to_subregion() (container_of(work, struct ufshpb_subregion, 
> hpb_work))

Could this have been defined as an inline function?

> @@ -76,6 +118,7 @@ struct ufshpb_subregion {
>   * @writes - sum over subregions @writes
>   * @region - region index
>   * @active_subregions - actual active subregions
> + * @evicted - to indicated if this region is currently being evicted
>   */
>  struct ufshpb_region {
>   struct ufshpb_subregion *subregion_tbl;
> @@ -85,6 +128,7 @@ struct ufshpb_region {
>   unsigned int region;
>  
>   atomic_t active_subregions;
> + atomic_t evicted;
>  };

Declaring a state variable as atomic_t is unusual. How are changes of
the @evicted member variable serialized?

>  /**
> @@ -93,6 +137,7 @@ struct ufshpb_region {
>   * @lh_map_ctx - list head of mapping context
>   * @map_list_lock - to protect mapping context list operations
>   * @region_tbl - regions/subregions table
> + * @pinned_map - to mark pinned regions
>   * @sdev - scsi device for that lun
>   * @regions_per_lun
>   * @subregions_per_lun - lun size is not guaranteed to be region aligned
> @@ -105,6 +150,7 @@ struct ufshpb_dh_lun {
>   struct list_head lh_map_ctx;
>   spinlock_t map_list_lock;
>   struct ufshpb_region *region_tbl;
> + unsigned long *pinned_map;
>   struct scsi_device *sdev;
>  
>   unsigned int regions_per_lun;
> @@ -113,6 +159,10 @@ struct ufshpb_dh_lun {
>   unsigned int max_active_regions;
>  
>   atomic_t active_regions;
> +
> + struct mutex eviction_lock;
> +
> + struct workqueue_struct *wq;
>  };

Please document what the eviction_lock protects.

> +static inline void ufshpb_set_write_buf_cmd(unsigned char *cmd,
> + unsigned int region)
> +{
> + cmd[0] = UFSHPB_WRITE_BUFFER;
> + cmd[1] = 0x01;
> + put_unaligned_be16(region, [2]);
> +}

Please follow the example of the sd driver and use the verb "setup"
instead of "set" for functions that initialize a SCSI CDB.

> +static int ufshpb_submit_write_buf_cmd(struct scsi_device *sdev,
> +unsigned int region)
> +{
> + unsigned char cmd[10] = {};
> + struct scsi_sense_hdr sshdr = {};
> + u64 flags = REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
> + REQ_FAILFAST_DRIVER;
> + int timeout = WRITE_BUFFER_TIMEOUT;
> + int cmd_retries = WRITE_BUFFER_RETRIES;
> + int ret = 0;
> +
> + ufshpb_set_write_buf_cmd(cmd, region);
> +
> + ret = scsi_execute(sdev, cmd, DMA_NONE, NULL, 0, NULL, ,
> +timeout, cmd_retries, flags, 0, NULL);
> +
> + /* HPB spec does not define any error handling */
> + sdev_printk(KERN_INFO, sdev, "%s: WRITE_BUFFER %s result %d\n",
> + UFSHPB_NAME, ret ? "failed" : "succeeded", ret);
> +
> + return ret;
> +}

I don't think that unconditionally printing the result of the WRITE
BUFFER command is acceptable. How about only reporting failures?

> +static void ufshpb_set_read_buf_cmd(unsigned char *cmd, unsigned int region,
> + unsigned int subregion,
> + unsigned int alloc_len)
> +{
> + cmd[0] = UFSHPB_READ_BUFFER;
> + cmd[1] = 0x01;
> + put_unaligned_be16(region, [2]);
> + put_unaligned_be16(subregion, [4]);
> +
> + cmd[6] = alloc_len >> 16;
> + cmd[7] = (alloc_len >> 8) & 0xff;
> + cmd[8] = alloc_len & 0xff;
> + cmd[9] = 0x00;
> +}

Please use put_unaligned_be24() instead of open-coding it.

> +static int ufshpb_subregion_alloc_pages(struct ufshpb_dh_lun *hpb,
> + struct ufshpb_subregion *s)
> +{
> + struct ufshpb_map_ctx *mctx;
> +
> + spin_lock(>map_list_lock);
> + mctx = list_first_entry_or_null(>lh_map_ctx,
> + struct ufshpb_map_ctx, list);
> + if (!mctx) {
> + spin_unlock(>map_list_lock);
> + return -EINVAL;
> + }
> +
> + list_del_init(>list);
> + spin_unlock(>map_list_lock);
> +
> + s->mctx = mctx;
> + mctx->pages = (char *)__get_free_pages(GFP_KERNEL, order);
> + if (!mctx->pages)
> + return -ENOMEM;
> +
> + return 0;
> +}

Relying on higher order pages is not acceptable because memory gets
fragmented easily. See also
https://elinux.org/images/a/a8/Controlling_Linux_Memory_Fragmentation_and_Higher_Order_Allocation_Failure-_Analysis%2C_Observations_and_Results.pdf.

> + 

Re: [PATCH v3 4/6] pstore/ram: Introduce max_reason and convert dump_oops

2020-05-15 Thread Kees Cook
On Tue, May 12, 2020 at 06:35:04PM -0500, Tyler Hicks wrote:
> On 2020-05-06 14:15:21, Kees Cook wrote:
> > @@ -954,7 +965,11 @@ static void __init ramoops_register_dummy(void)
> > pdata.console_size = ramoops_console_size;
> > pdata.ftrace_size = ramoops_ftrace_size;
> > pdata.pmsg_size = ramoops_pmsg_size;
> > -   pdata.dump_oops = dump_oops;
> > +   /* Parse deprecated module param "dump_oops" into "max_reason". */
> > +   if (ramoops_dump_oops != -1)
> > +   pdata.max_reason = ramoops_dump_oops ? KMSG_DUMP_OOPS
> > +: KMSG_DUMP_PANIC;
> > +   pdata.max_reason = ramoops_max_reason;
> 
> This isn't quite right. We're conditionally assigning pdata.max_reason
> and then immediately re-assigning it.
> 
> IIUC, we're just missing an else block and it should look like this:
> 
>   /* Parse deprecated module param "dump_oops" into "max_reason". */
>   if (ramoops_dump_oops != -1)
>   pdata.max_reason = ramoops_dump_oops ? KMSG_DUMP_OOPS
>: KMSG_DUMP_PANIC;
>   else
>   pdata.max_reason = ramoops_max_reason;

Whoops, I forgot to CC you Tyler! This should be fixed now here:
https://lore.kernel.org/lkml/20200515184434.8470-6-keesc...@chromium.org/

-- 
Kees Cook


Re: [PATCH v10 01/26] Documentation/x86: Add CET description

2020-05-15 Thread H.J. Lu
On Fri, May 15, 2020 at 5:13 PM Andrew Cooper  wrote:
>
> On 15/05/2020 23:43, Dave Hansen wrote:
> > On 5/15/20 2:33 PM, Yu-cheng Yu wrote:
> >> On Fri, 2020-05-15 at 11:39 -0700, Dave Hansen wrote:
> >>> On 5/12/20 4:20 PM, Yu-cheng Yu wrote:
> >>> Can a binary compiled with CET run without CET?
> >> Yes, but a few details:
> >>
> >> - The shadow stack is transparent to the application.  A CET application 
> >> does
> >> not have anything different from a non-CET application.  However, if a CET
> >> application uses any CET instructions (e.g. INCSSP), it must first check 
> >> if CET
> >> is turned on.
> >> - If an application is compiled for IBT, the compiler inserts ENDBRs at 
> >> branch
> >> targets.  These are nops if IBT is not on.
> > I appreciate the detailed response, but it wasn't quite what I was
> > asking.  Let's ignore IBT for now and just talk about shadow stacks.
> >
> > An app compiled with the new ELF flags and running on a CET-enabled
> > kernel and CPU will start off with shadow stacks allocated and enabled,
> > right?  It can turn its shadow stack off per-thread with the new prctl.
> >  But, otherwise, it's stuck, the only way to turn shadow stacks off at
> > startup would be editing the binary.
> >
> > Basically, if there ends up being a bug in an app that violates the
> > shadow stack rules, the app is broken, period.  The only recourse is to
> > have the kernel disable CET and reboot.
> >
> > Is that right?
>
> If I may interject with the experience of having got supervisor shadow
> stacks working for Xen.
>
> Turning shadow stacks off is quite easy - clear MSR_U_CET.SHSTK_EN and
> the shadow stack will stay in whatever state it was in, and you can
> largely forget about it.  (Of course, in a sandbox scenario, it would be
> prudent to prevent the ability to disable shadow stacks.)
>
> Turning shadow stacks on is much more tricky.  You cannot enable it in
> any function you intend to return from, as the divergence between the
> stack and shadow stack will constitute a control flow violation.
>
>
> When it comes to binaries,  you can reasonably arrange for clone() to
> start a thread on a new stack/shstk, as you can prepare both stacks
> suitably before execution starts.
>
> You cannot reasonably implement a system call for "turn shadow stacks on
> for me", because you'll crash on the ret out of the VDSO from the system
> call.  It would be possible to conceive of an exec()-like system call
> which is "discard my current stack, turn on shstk, and start me on this
> new stack/shstk".
>
> In principle, with a pair of system calls to atomically manage the ststk
> settings and stack switching, it might possible to construct a
> `run_with_shstk_enabled(func, stack, shstk)` API which executes in the
> current threads context and doesn't explode.
>
> Fork() is a problem when shadow stacks are disabled in the parent.  The
> moment shadow stacks are disabled, the regular stack diverges from the
> shadow stack.  A CET-enabled app which turns off shstk and then fork()'s
> must have the child inherit the shstk-off property.  If the child were
> to start with shstk enabled, it would explode almost immediately due to
> the parent's stack divergence which it inherited.
>
>
> Finally seeing as the question was asked but not answered, it is
> actually quite easy to figure out whether shadow stacks are enabled in
> the current thread.
>
> mov $1, %eax
> rdsspd  %eax

This is for 32-bit mode.  I use

/* Check if shadow stack is in use.  */
xorl%esi, %esi
rdsspq  %rsi
testq   %rsi, %rsi
/* Normal return if shadow stack isn't in use.  */
je  L(no_shstk)

> cmp $1, %eax
> je  no_shstk
> ...
> no_shsk:
>
> rdssp is allocated from the hint nop encoding space, and the minimum
> alignment of the shadow stack pointer is 4.  On older parts, or with
> shstk disabled (either at the system level, or for the thread), the $1
> will be preserved in %eax, while if CET is active, it will be clobbered
> with something that has the bottom two bits clear.
>
> It turns out this is a lifesaver for codepaths (e.g. the NMI handler)
> which need to use other CET instructions which aren't from the hint nop
> space, and run before the BSP can set everything up.
>
> ~Andrew



-- 
H.J.


[PATCH] drm/i915: Mark check_shadow_context_ppgtt as maybe unused

2020-05-15 Thread Nathan Chancellor
When CONFIG_DRM_I915_DEBUG_GEM is not set, clang warns:

drivers/gpu/drm/i915/gvt/scheduler.c:884:1: warning: function
'check_shadow_context_ppgtt' is not needed and will not be emitted
[-Wunneeded-internal-declaration]
check_shadow_context_ppgtt(struct execlist_ring_context *c, struct
intel_vgpu_mm *m)
^
1 warning generated.

This warning is similar to -Wunused-function but rather than warning
that the function is completely unused, it warns that it is used in some
expression within the file but that expression will be evaluated to a
constant or be optimized away in the final assembly, essentially making
it appeared used but really isn't. Usually, this happens when a function
or variable is only used in sizeof, where it will appear to be used but
will be evaluated at compile time and not be required to be emitted.

In this case, the function is only used in GEM_BUG_ON, which is defined
as BUILD_BUG_ON_INVALID, which intentionally follows this pattern. To
fix this warning, add __maybe_unused to make it clear that this is
intentional depending on the configuration.

Fixes: bec3df930fbd ("drm/i915/gvt: Support PPGTT table load command")
Link: https://github.com/ClangBuiltLinux/linux/issues/1027
Signed-off-by: Nathan Chancellor 
---
 drivers/gpu/drm/i915/gvt/scheduler.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gvt/scheduler.c 
b/drivers/gpu/drm/i915/gvt/scheduler.c
index f776c92de8d7..0fb1df71c637 100644
--- a/drivers/gpu/drm/i915/gvt/scheduler.c
+++ b/drivers/gpu/drm/i915/gvt/scheduler.c
@@ -880,7 +880,7 @@ static void update_guest_pdps(struct intel_vgpu *vgpu,
gpa + i * 8, [7 - i], 4);
 }
 
-static bool
+static __maybe_unused bool
 check_shadow_context_ppgtt(struct execlist_ring_context *c, struct 
intel_vgpu_mm *m)
 {
if (m->ppgtt_mm.root_entry_type == GTT_TYPE_PPGTT_ROOT_L4_ENTRY) {

base-commit: bdecf38f228bcca73b31ada98b5b7ba1215eb9c9
-- 
2.26.2



Re: Possibility of conflicting memory types in lazier TLB mode?

2020-05-15 Thread Nicholas Piggin
Excerpts from Rik van Riel's message of May 16, 2020 5:24 am:
> On Fri, 2020-05-15 at 16:50 +1000, Nicholas Piggin wrote:
>> 
>> But what about if there are (real, not speculative) stores in the
>> store 
>> queue still on the lazy thread from when it was switched, that have
>> not 
>> yet become coherent? The page is freed by another CPU and reallocated
>> for something that maps it as nocache. Do you have a coherency
>> problem 
>> there?
>> 
>> Ensuring the store queue is drained when switching to lazy seems like
>> it 
>> would fix it, maybe context switch code does that already or you
>> have 
>> some other trick or reason it's not a problem. Am I way off base
>> here?
> 
> On x86, all stores become visible in-order globally.
> 
> I suspect that
> means any pending stores in the queue
> would become visible to the rest of the system before
> the store to the "current" cpu-local variable, as
> well as other writes from the context switch code
> become visible to the rest of the system.
> 
> Is that too naive a way of preventing the scenario you
> describe?
> 
> What am I overlooking?

I'm concerned if the physical address gets mapped with different 
cacheability attributes where that ordering is not enforced by cache 
coherency

 "The PAT allows any memory type to be specified in the page tables, and 
 therefore it is possible to have a single physical page mapped to two 
 or more different linear addresses, each with different memory types. 
 Intel does not support this practice because it may lead to undefined 
 operations that can result in a system failure. In particular, a WC 
 page must never be aliased to a cacheable page because WC writes may 
 not check the processor caches." -- Vol. 3A 11-35

Maybe I'm over thinking it, and this would never happen anyway because 
if anyone were to map a RAM page WC, they might always have to ensure 
all processor caches are flushed first anyway so perhaps this is just a 
non-issue?

Thanks,
Nick


Re: [PATCH 2/4] proc/sysctl: add shared variables -1

2020-05-15 Thread Xiaoming Ni

On 2020/5/16 0:05, Kees Cook wrote:

On Fri, May 15, 2020 at 05:06:28PM +0800, Xiaoming Ni wrote:

On 2020/5/15 16:06, Kees Cook wrote:

On Fri, May 15, 2020 at 12:33:42PM +0800, Xiaoming Ni wrote:

Add the shared variable SYSCTL_NEG_ONE to replace the variable neg_one
used in both sysctl_writes_strict and hung_task_warnings.

Signed-off-by: Xiaoming Ni 
---
   fs/proc/proc_sysctl.c | 2 +-
   include/linux/sysctl.h| 1 +
   kernel/hung_task_sysctl.c | 3 +--
   kernel/sysctl.c   | 3 +--


How about doing this refactoring in advance of the extraction patch?

Before  advance of the extraction patch, neg_one is only used in one file,
does it seem to have no value for refactoring?


I guess it doesn't matter much, but I think it's easier to review in the
sense that neg_one is first extracted and then later everything else is
moved.

Later, when more features sysctl interface is moved to the code file, 
there will be more variables that need to be extracted.
So should I only extract the neg_one variable here, or should I extract 
all the variables used by multiple features?


 fs/proc/proc_sysctl.c  |  2 +-
 include/linux/sysctl.h | 11 ---
 kernel/sysctl.c| 37 -
 3 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index b6f5d45..3f77e64 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -23,7 +23,7 @@
 static const struct inode_operations proc_sys_dir_operations;

 /* shared constants to be used in various sysctls */
-const int sysctl_vals[] = { 0, 1, INT_MAX };
+const int sysctl_vals[] = { -1, 0, 1, 2, 4, 100, 1000, INT_MAX };
 EXPORT_SYMBOL(sysctl_vals);

 /* Support for permanently empty directories */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 43f8ef9..bf97c30 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -38,9 +38,14 @@
 struct ctl_dir;

 /* Keep the same order as in fs/proc/proc_sysctl.c */
-#define SYSCTL_ZERO((void *)&sysctl_vals[0])
-#define SYSCTL_ONE ((void *)&sysctl_vals[1])
-#define SYSCTL_INT_MAX ((void *)&sysctl_vals[2])
+#define SYSCTL_NEG_ONE ((void *)&sysctl_vals[0])
+#define SYSCTL_ZERO((void *)&sysctl_vals[1])
+#define SYSCTL_ONE ((void *)&sysctl_vals[2])
+#define SYSCTL_TWO ((void *)&sysctl_vals[3])
+#define SYSCTL_FOUR((void *)&sysctl_vals[4])
+#define SYSCTL_ONE_HUNDRED ((void *)&sysctl_vals[5])
+#define SYSCTL_ONE_THOUSAND((void *)&sysctl_vals[6])
+#define SYSCTL_INT_MAX ((void *)&sysctl_vals[7])

 extern const int sysctl_vals[];

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5dd6d01..efe6172 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -118,14 +118,9 @@

 /* Constants used for minimum and  maximum */

-static int __maybe_unused neg_one = -1;
-static int __maybe_unused two = 2;
-static int __maybe_unused four = 4;
 static unsigned long zero_ul;
 static unsigned long one_ul = 1;
 static unsigned long long_max = LONG_MAX;
-static int one_hundred = 100;
-static int one_thousand = 1000;
 #ifdef CONFIG_PRINTK
 static int ten_thousand = 1;
 #endif
@@ -534,7 +529,7 @@ static int sysrq_sysctl_handler(struct ctl_table 
*table, int write,

.maxlen = sizeof(int),
.mode   = 0644,
.proc_handler   = proc_dointvec_minmax,
-   .extra1 = &neg_one,
+   .extra1 = SYSCTL_NEG_ONE,
.extra2 = SYSCTL_ONE,
},
 #endif
@@ -865,7 +860,7 @@ static int sysrq_sysctl_handler(struct ctl_table 
*table, int write,

.mode   = 0644,
.proc_handler   = proc_dointvec_minmax_sysadmin,
.extra1 = SYSCTL_ZERO,
-   .extra2 = &two,
+   .extra2 = SYSCTL_TWO,
},
 #endif
{
@@ -1043,7 +1038,7 @@ static int sysrq_sysctl_handler(struct ctl_table 
*table, int write,

.mode   = 0644,
.proc_handler   = perf_cpu_time_max_percent_handler,
.extra1 = SYSCTL_ZERO,
-   .extra2 = &one_hundred,
+   .extra2 = SYSCTL_ONE_HUNDRED,
},
{
.procname   = "perf_event_max_stack",
@@ -1061,7 +1056,7 @@ static int sysrq_sysctl_handler(struct ctl_table 
*table, int write,

.mode   = 0644,
.proc_handler   = perf_event_max_stack_handler,
.extra1 = SYSCTL_ZERO,
-   .extra2 = &one_thousand,
+   .extra2 = SYSCTL_ONE_THOUSAND,
},
 #endif
{
@@ -1136,7 +1131,7 @@ static int sysrq_sysctl_handler(struct ctl_table 
*table, int write,

.mode   = 0644,
.proc_handler   = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
-   .extra2 = &two,
+   .extra2 = SYSCTL_TWO,
},
{
.procname   = 

[PATCH 1/2] dt-bindings: iio: adc: Add binding for current-from-voltage

2020-05-15 Thread Jonathan Bakker
Some devices may require a current adc, but only have a voltage
ADC onboard.  In order to read the current, they have a resistor
connected to the ADC.  Add bindings for this possibility.

Signed-off-by: Jonathan Bakker 
---
 .../iio/adc/linux,current-from-voltage.yaml   | 47 +++
 1 file changed, 47 insertions(+)
 create mode 100644 
Documentation/devicetree/bindings/iio/adc/linux,current-from-voltage.yaml

diff --git 
a/Documentation/devicetree/bindings/iio/adc/linux,current-from-voltage.yaml 
b/Documentation/devicetree/bindings/iio/adc/linux,current-from-voltage.yaml
new file mode 100644
index ..385d317607c3
--- /dev/null
+++ b/Documentation/devicetree/bindings/iio/adc/linux,current-from-voltage.yaml
@@ -0,0 +1,47 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/iio/adc/linux,current-from-voltage.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Current ADC from voltage ADC and resistor
+
+maintainers:
+  - Jonathan Bakker 
+
+properties:
+  compatible:
+const: linux,current-from-voltage
+
+  io-channel-names:
+const: adc
+
+  io-channels:
+maxItems: 1
+description: Voltage ADC channel
+
+  linux,resistor-ohms:
+description: Strength of resistor connected to voltage ADC
+
+  "#io-channel-cells":
+const: 0
+
+required:
+  - compatible
+  - io-channel-names
+  - io-channels
+  - linux,resistor-ohms
+  - "#io-channel-cells"
+
+examples:
+  - |
+current-from-voltage {
+  compatible = "linux,current-from-voltage";
+  io-channel-names = "adc";
+  io-channels = <&adc 9>;
+  linux,resistor-ohms = <47>;
+  #io-channel-cells = <0>;
+  io-channel-ranges;
+};
+
+...
-- 
2.20.1



[PATCH 2/2] iio: adc: Add current-from-voltage driver

2020-05-15 Thread Jonathan Bakker
Some devices may require a current adc, but only have a voltage
ADC onboard.  In order to read the current, they have a resistor
connected to the ADC.

Suggested-by: Jonathan Cameron 
Signed-off-by: Jonathan Bakker 
---
 MAINTAINERS|   8 ++
 drivers/iio/adc/Kconfig|   9 ++
 drivers/iio/adc/Makefile   |   1 +
 drivers/iio/adc/current-from-voltage.c | 123 +
 4 files changed, 141 insertions(+)
 create mode 100644 drivers/iio/adc/current-from-voltage.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 2926327e4976..094cf512b403 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4503,6 +4503,14 @@ T:   git git://linuxtv.org/media_tree.git
 F: Documentation/devicetree/bindings/media/allwinner,sun6i-a31-csi.yaml
 F: drivers/media/platform/sunxi/sun6i-csi/
 
+CURRENT ADC FROM VOLTAGE ADC DRIVER
+M: Jonathan Bakker 
+L: linux-...@vger.kernel.org
+S: Maintained
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/jic23/iio.git
+F: 
Documentation/devicetree/bindings/iio/adc/linux,current-from-voltage.yaml
+F: drivers/iio/adc/current-from-voltage.c
+
 CW1200 WLAN driver
 M: Solomon Peachy 
 S: Maintained
diff --git a/drivers/iio/adc/Kconfig b/drivers/iio/adc/Kconfig
index 12bb8b7ca1ff..84e6ccb36024 100644
--- a/drivers/iio/adc/Kconfig
+++ b/drivers/iio/adc/Kconfig
@@ -344,6 +344,15 @@ config CPCAP_ADC
  This driver can also be built as a module. If so, the module will be
  called cpcap-adc.
 
+config CURRENT_FROM_VOLTAGE
+   tristate "Current from voltage shim driver"
+   help
+ Say yes here to build support for a shim driver converting a voltage
+ ADC coupled with a resistor to a current ADC.
+
+ To compile this driver as a module, choose M here: the module will be
+ called current-from-voltage.
+
 config DA9150_GPADC
tristate "Dialog DA9150 GPADC driver support"
depends on MFD_DA9150
diff --git a/drivers/iio/adc/Makefile b/drivers/iio/adc/Makefile
index 637807861112..d293184fc32a 100644
--- a/drivers/iio/adc/Makefile
+++ b/drivers/iio/adc/Makefile
@@ -35,6 +35,7 @@ obj-$(CONFIG_BCM_IPROC_ADC) += bcm_iproc_adc.o
 obj-$(CONFIG_BERLIN2_ADC) += berlin2-adc.o
 obj-$(CONFIG_CC10001_ADC) += cc10001_adc.o
 obj-$(CONFIG_CPCAP_ADC) += cpcap-adc.o
+obj-$(CONFIG_CURRENT_FROM_VOLTAGE) += current-from-voltage.o
 obj-$(CONFIG_DA9150_GPADC) += da9150-gpadc.o
 obj-$(CONFIG_DLN2_ADC) += dln2-adc.o
 obj-$(CONFIG_ENVELOPE_DETECTOR) += envelope-detector.o
diff --git a/drivers/iio/adc/current-from-voltage.c 
b/drivers/iio/adc/current-from-voltage.c
new file mode 100644
index ..69cb18e0995b
--- /dev/null
+++ b/drivers/iio/adc/current-from-voltage.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Driver for converting a resistor and a voltage ADC to a current ADC
+ *
+ * Copyright (C) 2020 Jonathan Bakker 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct shim {
+   struct iio_channel *adc;
+   u32 resistor_value;
+};
+
+static int shim_read_raw(struct iio_dev *indio_dev,
+ struct iio_chan_spec const *chan,
+ int *val, int *val2, long mask)
+{
+   struct shim *shim = iio_priv(indio_dev);
+   int ret;
+
+   switch (mask) {
+   case IIO_CHAN_INFO_RAW:
+   ret = iio_read_channel_processed(shim->adc, val);
+   if (ret < 0) {
+   dev_err(&indio_dev->dev, "fail reading voltage ADC\n");
+   return ret;
+   }
+
+   return IIO_VAL_INT;
+   case IIO_CHAN_INFO_SCALE:
+   *val = 1;
+   *val2 = shim->resistor_value;
+
+   return IIO_VAL_FRACTIONAL;
+   }
+
+   return -EINVAL;
+}
+
+static const struct iio_chan_spec shim_iio_channel = {
+   .type = IIO_CURRENT,
+   .info_mask_separate = BIT(IIO_CHAN_INFO_RAW)
+   | BIT(IIO_CHAN_INFO_SCALE),
+};
+
+static const struct iio_info shim_info = {
+   .read_raw = &shim_read_raw,
+};
+
+static int shim_probe(struct platform_device *pdev)
+{
+   struct device *dev = >dev;
+   struct iio_dev *indio_dev;
+   struct shim *shim;
+   enum iio_chan_type type;
+   int ret;
+
+   indio_dev = devm_iio_device_alloc(dev, sizeof(*shim));
+   if (!indio_dev)
+   return -ENOMEM;
+
+   platform_set_drvdata(pdev, indio_dev);
+   shim = iio_priv(indio_dev);
+
+   indio_dev->name = dev_name(dev);
+   indio_dev->dev.parent = dev;
+   indio_dev->dev.of_node = dev->of_node;
+   indio_dev->info = &shim_info;
+   indio_dev->channels = &shim_iio_channel;
+   indio_dev->num_channels = 1;
+
+   shim->adc = devm_iio_channel_get(dev, "adc");
+   if (IS_ERR(shim->adc)) {
+   if (PTR_ERR(shim->adc) != -EPROBE_DEFER)
+   dev_err(dev, 

[PATCH 0/2] iio: adc: Add a current from voltage driver

2020-05-15 Thread Jonathan Bakker
In the discussion around adding the GP2A002 light driver, there came
up the question of what to do when a system emulates a current ADC
by using a voltage ADC and a resistor.  Rather than adding it on
a per-driver basis, it was suggested(1) to add a minimal IIO driver
to support this situation.

The new driver is fairly simple - it simply takes a voltage ADC and
a resistor value in ohms exposed as the scale and outputs a current.

It has been tested on a first-gen Galaxy S device which has the above
mentioned GP2A002 chip connected to the voltage ADC resistor complex.

1) https://lore.kernel.org/linux-iio/20200202150843.762c6897@archlinux/

Jonathan Bakker (2):
  dt-bindings: iio: adc: Add binding for current-from-voltage
  iio: adc: Add current-from-voltage driver

 .../iio/adc/linux,current-from-voltage.yaml   |  47 +++
 MAINTAINERS   |   8 ++
 drivers/iio/adc/Kconfig   |   9 ++
 drivers/iio/adc/Makefile  |   1 +
 drivers/iio/adc/current-from-voltage.c| 123 ++
 5 files changed, 188 insertions(+)
 create mode 100644 
Documentation/devicetree/bindings/iio/adc/linux,current-from-voltage.yaml
 create mode 100644 drivers/iio/adc/current-from-voltage.c

-- 
2.20.1



[PATCH] net/packet: simply allocations in alloc_one_pg_vec_page

2020-05-15 Thread Shakeel Butt
Currently the initial allocation for pg_vec buffers are done through
page allocator with __GFP_NORETRY, the first fallbacks is vzalloc and
the second fallback is page allocator without __GFP_NORETRY.

First, there is no need to do vzalloc if the order is 0 and second the
vzalloc internally use GFP_KERNEL for each individual page allocation
and thus there is no need to have any fallback after vzalloc.

Signed-off-by: Shakeel Butt 
---
 net/packet/af_packet.c | 15 ---
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 29bd405adbbd..d6f96b9f5b01 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -4243,19 +4243,12 @@ static char *alloc_one_pg_vec_page(unsigned long order)
if (buffer)
return buffer;
 
-   /* __get_free_pages failed, fall back to vmalloc */
-   buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
-   if (buffer)
-   return buffer;
+   /* __get_free_pages failed, fall back to vmalloc for high order. */
+   if (order)
+   return vzalloc(array_size((1 << order), PAGE_SIZE));
 
-   /* vmalloc failed, lets dig into swap here */
gfp_flags &= ~__GFP_NORETRY;
-   buffer = (char *) __get_free_pages(gfp_flags, order);
-   if (buffer)
-   return buffer;
-
-   /* complete and utter failure */
-   return NULL;
+   return (char *)__get_free_pages(gfp_flags, order);
 }
 
 static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
-- 
2.26.2.761.g0e0b3e54be-goog



[PATCH v3 1/2] MIPS: Loongson: Build ATI Radeon GPU driver as module

2020-05-15 Thread Tiezhu Yang
When ATI Radeon GPU driver has been compiled directly into the kernel
instead of as a module, we should make sure the firmware for the model
(check available ones in /lib/firmware/radeon) is built-in to the kernel
as well, otherwise there exists the following fatal error during GPU init,
change CONFIG_DRM_RADEON=y to CONFIG_DRM_RADEON=m to fix it.

[1.900997] [drm] Loading RS780 Microcode
[1.905077] radeon :01:05.0: Direct firmware load for 
radeon/RS780_pfp.bin failed with error -2
[1.914140] r600_cp: Failed to load firmware "radeon/RS780_pfp.bin"
[1.920405] [drm:r600_init] *ERROR* Failed to load firmware!
[1.926069] radeon :01:05.0: Fatal error during GPU init
[1.931729] [drm] radeon: finishing device.

Fixes: 024e6a8b5bb1 ("MIPS: Loongson: Add a Loongson-3 default config file")
Signed-off-by: Tiezhu Yang 
---

v2:
  - Modify the patch subject and update the commit message

v3:
  - No changes

 arch/mips/configs/loongson3_defconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/configs/loongson3_defconfig 
b/arch/mips/configs/loongson3_defconfig
index 6768c16..4df2434 100644
--- a/arch/mips/configs/loongson3_defconfig
+++ b/arch/mips/configs/loongson3_defconfig
@@ -230,7 +230,7 @@ CONFIG_MEDIA_CAMERA_SUPPORT=y
 CONFIG_MEDIA_USB_SUPPORT=y
 CONFIG_USB_VIDEO_CLASS=m
 CONFIG_DRM=y
-CONFIG_DRM_RADEON=y
+CONFIG_DRM_RADEON=m
 CONFIG_FB_RADEON=y
 CONFIG_LCD_CLASS_DEVICE=y
 CONFIG_LCD_PLATFORM=m
-- 
2.1.0



[PATCH v3 2/2] MIPS: Remove not used 8250-platform.c

2020-05-15 Thread Tiezhu Yang
When CONFIG_HAVE_STD_PC_SERIAL_PORT is set, there exists build errors
of 8250-platform.c due to linux/module.h is not included.

CONFIG_HAVE_STD_PC_SERIAL_PORT is not used in arch/mips for many years,
8250-platform.c is also not built and used, so it is not necessary to
fix the build errors, just remove the not used file 8250-platform.c and
the related code in Kconfig and Makefile.

Signed-off-by: Tiezhu Yang 
---

v2:
  - No changes

v3:
  - Remove not used 8250-platform.c

 arch/mips/Kconfig|  3 ---
 arch/mips/kernel/8250-platform.c | 46 
 arch/mips/kernel/Makefile|  2 --
 3 files changed, 51 deletions(-)
 delete mode 100644 arch/mips/kernel/8250-platform.c

diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index bfa9cd9..b2ff77f 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -1358,9 +1358,6 @@ config MIPS_L1_CACHE_SHIFT
default "4" if MIPS_L1_CACHE_SHIFT_4
default "5"
 
-config HAVE_STD_PC_SERIAL_PORT
-   bool
-
 config ARC_CMDLINE_ONLY
bool
 
diff --git a/arch/mips/kernel/8250-platform.c b/arch/mips/kernel/8250-platform.c
deleted file mode 100644
index 5c6b2ab..000
--- a/arch/mips/kernel/8250-platform.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Copyright (C) 2007 Ralf Baechle (r...@linux-mips.org)
- */
-#include 
-#include 
-
-#define PORT(base, int)
\
-{  \
-   .iobase = base, \
-   .irq= int,  \
-   .uartclk= 1843200,  \
-   .iotype = UPIO_PORT,\
-   .flags  = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST,\
-   .regshift   = 0,\
-}
-
-static struct plat_serial8250_port uart8250_data[] = {
-   PORT(0x3F8, 4),
-   PORT(0x2F8, 3),
-   PORT(0x3E8, 4),
-   PORT(0x2E8, 3),
-   { },
-};
-
-static struct platform_device uart8250_device = {
-   .name   = "serial8250",
-   .id = PLAT8250_DEV_PLATFORM,
-   .dev= {
-   .platform_data  = uart8250_data,
-   },
-};
-
-static int __init uart8250_init(void)
-{
-   return platform_device_register(&uart8250_device);
-}
-
-module_init(uart8250_init);
-
-MODULE_AUTHOR("Ralf Baechle ");
-MODULE_LICENSE("GPL");
-MODULE_DESCRIPTION("Generic 8250 UART probe driver");
diff --git a/arch/mips/kernel/Makefile b/arch/mips/kernel/Makefile
index d6e97df..8c7a043 100644
--- a/arch/mips/kernel/Makefile
+++ b/arch/mips/kernel/Makefile
@@ -98,8 +98,6 @@ obj-$(CONFIG_MIPSR2_TO_R6_EMULATOR)   += mips-r2-to-r6-emul.o
 
 CFLAGS_cpu-bugs64.o= $(shell if $(CC) $(KBUILD_CFLAGS) -Wa,-mdaddi -c -o 
/dev/null -x c /dev/null >/dev/null 2>&1; then echo "-DHAVE_AS_SET_DADDI"; fi)
 
-obj-$(CONFIG_HAVE_STD_PC_SERIAL_PORT)  += 8250-platform.o
-
 obj-$(CONFIG_PERF_EVENTS)  += perf_event.o
 obj-$(CONFIG_HW_PERF_EVENTS)   += perf_event_mipsxx.o
 
-- 
2.1.0



Re: [RFC PATCH 06/13] scsi: scsi_dh: ufshpb: Prepare for L2P cache management

2020-05-15 Thread Bart Van Assche
On 2020-05-15 03:30, Avri Altman wrote:
> +static int ufshpb_mempool_init(struct ufshpb_dh_lun *hpb)
> +{
> + unsigned int max_active_subregions = hpb->max_active_regions *
> + subregions_per_region;
> + int i;
> +
> + INIT_LIST_HEAD(&hpb->lh_map_ctx);
> + spin_lock_init(&hpb->map_list_lock);
> +
> + for (i = 0 ; i < max_active_subregions; i++) {
> + struct ufshpb_map_ctx *mctx =
> + kzalloc(sizeof(struct ufshpb_map_ctx), GFP_KERNEL);
> +
> + if (!mctx) {
> + /*
> +  * mctxs already added in lh_map_ctx will be removed in
> +  * detach
> +  */
> + return -ENOMEM;
> + }
> +
> + /* actual page allocation is done upon subregion activation */
> +
> + INIT_LIST_HEAD(&mctx->list);
> + list_add(&mctx->list, &hpb->lh_map_ctx);
> + }
> +
> + return 0;
> +
> +}

Could kmem_cache_create() have been used instead of implementing yet
another memory pool implementation?

> +static int ufshpb_region_tbl_init(struct ufshpb_dh_lun *hpb)
> +{
> + struct ufshpb_region *regions;
> + int i, j;
> +
> + regions = kcalloc(hpb->regions_per_lun, sizeof(*regions), GFP_KERNEL);
> + if (!regions)
> + return -ENOMEM;
> +
> + atomic_set(&hpb->active_regions, 0);
> +
> + for (i = 0 ; i < hpb->regions_per_lun; i++) {
> + struct ufshpb_region *r = regions + i;
> + struct ufshpb_subregion *subregions;
> +
> + subregions = kcalloc(subregions_per_region, sizeof(*subregions),
> +  GFP_KERNEL);
> + if (!subregions)
> + goto release_mem;
> +
> + for (j = 0; j < subregions_per_region; j++) {
> + struct ufshpb_subregion *s = subregions + j;
> +
> + s->hpb = hpb;
> + s->r = r;
> + s->region = i;
> + s->subregion = j;
> + }
> +
> + r->subregion_tbl = subregions;
> + r->hpb = hpb;
> + r->region = i;
> + }
> +
> + hpb->region_tbl = regions;
> +
> + return 0;

Could kvmalloc() have been used to allocate multiple subregion data
structures instead of calling kcalloc() multiple times?

> + spin_lock(&hpb->map_list_lock);
> +
> + list_for_each_entry_safe(mctx, next, &hpb->lh_map_ctx, list) {
> + list_del(&mctx->list);
> + kfree(mctx);
> + }
> +
> + spin_unlock(&hpb->map_list_lock);

Spinlocks should be held during a short time. I'm not sure that's the
case for the above loop.

Thanks,

Bart.


Re: [RFC PATCH 05/13] scsi: ufs: ufshpb: Disable HPB if no HPB-enabled luns

2020-05-15 Thread Bart Van Assche
On 2020-05-15 03:30, Avri Altman wrote:
> @@ -368,6 +390,8 @@ int ufshpb_probe(struct ufs_hba *hba)
>   if (ret)
>   goto out;
>  
> + INIT_DELAYED_WORK(>hpb_disable_work, ufshpb_disable_work);
> + schedule_delayed_work(>hpb_disable_work, 60 * HZ);
>  out:
>   kfree(dev_desc);
>   if (ret) {

Calling INIT_DELAYED_WORK() just before schedule_delayed_work() is a bad
practice. If cancel_delayed_work() gets called before
INIT_DELAYED_WORK() then it will encounter something that it not
expects. If cancel_delayed_work() and INIT_DELAYED_WORK() get called
concurrently than that will trigger a race condition. It is better to
call INIT_DELAYED_WORK() from the context that allocates the data
structure in which the work structure is embedded.

Thanks,

Bart.




Re: [PATCH 3/9] fs/ext4: Disallow encryption if inode is DAX

2020-05-15 Thread Eric Biggers
On Tue, May 12, 2020 at 10:43:18PM -0700, ira.we...@intel.com wrote:
> From: Ira Weiny 
> 
> Encryption and DAX are incompatible.  Changing the DAX mode due to a
> change in Encryption mode is wrong without a corresponding
> address_space_operations update.
> 
> Make the 2 options mutually exclusive by returning an error if DAX was
> set first.
> 
> Furthermore, clarify the documentation of the exclusivity and how that
> will work.
> 
> Signed-off-by: Ira Weiny 
> 
> ---
> Changes:
>   remove WARN_ON_ONCE
>   Add documentation to the encrypt doc WRT DAX
> ---
>  Documentation/filesystems/fscrypt.rst |  4 +++-
>  fs/ext4/super.c   | 10 +-
>  2 files changed, 4 insertions(+), 10 deletions(-)
> 
> diff --git a/Documentation/filesystems/fscrypt.rst 
> b/Documentation/filesystems/fscrypt.rst
> index aa072112cfff..1475b8d52fef 100644
> --- a/Documentation/filesystems/fscrypt.rst
> +++ b/Documentation/filesystems/fscrypt.rst
> @@ -1038,7 +1038,9 @@ astute users may notice some differences in behavior:
>  - The ext4 filesystem does not support data journaling with encrypted
>regular files.  It will fall back to ordered data mode instead.
>  
> -- DAX (Direct Access) is not supported on encrypted files.
> +- DAX (Direct Access) is not supported on encrypted files.  Attempts to 
> enable
> +  DAX on an encrypted file will fail.  Mount options will _not_ enable DAX on
> +  encrypted files.
>  
>  - The st_size of an encrypted symlink will not necessarily give the
>length of the symlink target as required by POSIX.  It will actually
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index bf5fcb477f66..9873ab27e3fa 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1320,7 +1320,7 @@ static int ext4_set_context(struct inode *inode, const 
> void *ctx, size_t len,
>   if (inode->i_ino == EXT4_ROOT_INO)
>   return -EPERM;
>  
> - if (WARN_ON_ONCE(IS_DAX(inode) && i_size_read(inode)))
> + if (IS_DAX(inode))
>   return -EINVAL;
>  
>   res = ext4_convert_inline_data(inode);
> @@ -1344,10 +1344,6 @@ static int ext4_set_context(struct inode *inode, const 
> void *ctx, size_t len,
>   ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
>   ext4_clear_inode_state(inode,
>   EXT4_STATE_MAY_INLINE_DATA);
> - /*
> -  * Update inode->i_flags - S_ENCRYPTED will be enabled,
> -  * S_DAX may be disabled
> -  */
>   ext4_set_inode_flags(inode);
>   }
>   return res;
> @@ -1371,10 +1367,6 @@ static int ext4_set_context(struct inode *inode, const 
> void *ctx, size_t len,
>   ctx, len, 0);
>   if (!res) {
>   ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
> - /*
> -  * Update inode->i_flags - S_ENCRYPTED will be enabled,
> -  * S_DAX may be disabled
> -  */
>   ext4_set_inode_flags(inode);
>   res = ext4_mark_inode_dirty(handle, inode);
>   if (res)

I'm confused by the ext4_set_context() change.

ext4_set_context() is only called when FS_IOC_SET_ENCRYPTION_POLICY sets an
encryption policy on an empty directory, *or* when a new inode (regular, dir, or
symlink) is created in an encrypted directory (thus inheriting encryption from
its parent).

So when is it reachable when IS_DAX()?  Is the issue that the DAX flag can now
be set on directories?  The commit message doesn't seem to be talking about
directories.  Is the behavior we want is that on an (empty) directory with the
DAX flag set, FS_IOC_SET_ENCRYPTION_POLICY should fail with EINVAL?

I don't see why the i_size_read(inode) check is there though, so I think you're
at least right to remove that.

- Eric


Re: [RFC PATCH 02/13] scsi: ufshpb: Init part I - Read HPB config

2020-05-15 Thread Bart Van Assche
On 2020-05-15 03:30, Avri Altman wrote:
> +struct ufshpb_lun_config *ufshpb_luns_conf;
> +struct ufshpb_lun *ufshpb_luns;
> +static unsigned long ufshpb_lun_map[BITS_TO_LONGS(UFSHPB_MAX_LUNS)];
> +static u8 ufshpb_lun_lookup[UFSHPB_MAX_LUNS];
> +
> +/**
> + * ufshpb_remove - ufshpb cleanup
> + *
> + * Should be called when unloading the driver.
> + */
> +void ufshpb_remove(struct ufs_hba *hba)
> +{
> + kfree(ufshpb_conf);
> + kfree(ufshpb_luns_conf);
> + kfree(ufshpb_luns);
> + ufshcd_query_flag(hba, UPIU_QUERY_OPCODE_SET_FLAG,
> +   QUERY_FLAG_IDN_HPB_RESET, 0, NULL);
> +}
> +
> +static int ufshpb_hpb_init(void)
> +{
> + u8 num_hpb_luns = ufshpb_conf->num_hpb_luns;
> + int i;
> +
> + ufshpb_luns = kcalloc(num_hpb_luns, sizeof(*ufshpb_luns), GFP_KERNEL);
> + if (!ufshpb_luns)
> + return -ENOMEM;
> +
> + for (i = 0; i < num_hpb_luns; i++) {
> + struct ufshpb_lun *hpb = ufshpb_luns + i;
> +
> + hpb->lun = (ufshpb_luns_conf + i)->lun;
> + }
> +
> + return 0;
> +}

Do the ufshpb_lun... data structures perhaps duplicate SCSI core data
structures? If so, please don't introduce duplicates of SCSI core data
structures.

Thanks,

Bart.


Re: [RFC PATCH 04/13] scsi: ufs: ufshpb: Init part II - Attach scsi device

2020-05-15 Thread Bart Van Assche
On 2020-05-15 03:30, Avri Altman wrote:
> @@ -4628,6 +4628,9 @@ static int ufshcd_slave_alloc(struct scsi_device *sdev)
>  
>   ufshcd_get_lu_power_on_wp_status(hba, sdev);
>  
> + if (ufshcd_is_hpb_supported(hba))
> + ufshpb_attach_sdev(hba, sdev);
> +
>   return 0;
>  }

This approach is unusual because:
- Direct calls from the ufshcd module into the ufshpb module make it
  impossible to unload the latter kernel module.
- No other SCSI LLD calls directly into a device handler.

Thanks,

Bart.


Re: [PATCH 2/9] fs/ext4: Disallow verity if inode is DAX

2020-05-15 Thread Eric Biggers
On Tue, May 12, 2020 at 10:43:17PM -0700, ira.we...@intel.com wrote:
> From: Ira Weiny 
> 
> Verity and DAX are incompatible.  Changing the DAX mode due to a verity
> flag change is wrong without a corresponding address_space_operations
> update.
> 
> Make the 2 options mutually exclusive by returning an error if DAX was
> set first.
> 
> (Setting DAX is already disabled if Verity is set first.)
> 
> Signed-off-by: Ira Weiny 
> 
> ---
> Changes:
>   remove WARN_ON_ONCE
>   Add documentation for DAX/Verity exclusivity
> ---
>  Documentation/filesystems/ext4/verity.rst | 7 +++
>  fs/ext4/verity.c  | 3 +++
>  2 files changed, 10 insertions(+)
> 
> diff --git a/Documentation/filesystems/ext4/verity.rst 
> b/Documentation/filesystems/ext4/verity.rst
> index 3e4c0ee0e068..51ab1aa17e59 100644
> --- a/Documentation/filesystems/ext4/verity.rst
> +++ b/Documentation/filesystems/ext4/verity.rst
> @@ -39,3 +39,10 @@ is encrypted as well as the data itself.
>  
>  Verity files cannot have blocks allocated past the end of the verity
>  metadata.
> +
> +Verity and DAX
> +--
> +
> +Verity and DAX are not compatible and attempts to set both of these flags on 
> a
> +file will fail.
> +

If you build the documentation, this shows up as its own subsection
"2.13. Verity and DAX" alongside "2.12. Verity files", which looks odd.
I think you should delete this new subsection header so that this paragraph goes
in the existing "Verity files" subsection.

Also, Documentation/filesystems/fsverity.rst already mentions DAX (similar to
fscrypt.rst).  Is it intentional that you added this to the ext4-specific
documentation instead?

- Eric


Re: [RFC PATCH 03/13] scsi: scsi_dh: Introduce scsi_dh_ufshpb

2020-05-15 Thread Bart Van Assche
On 2020-05-15 03:30, Avri Altman wrote:
> +static int ufshpb_attach(struct scsi_device *sdev)
> +{
> + struct ufshpb_dh_data *h;
> +
> + h = kzalloc(sizeof(*h), GFP_KERNEL);
> + if (!h)
> + return SCSI_DH_NOMEM;
> +
> + sdev_printk(KERN_INFO, sdev, "%s: attached to sdev (lun) %llu\n",
> + UFSHPB_NAME, sdev->lun);
> +
> + sdev->handler_data = h;
> +
> + return SCSI_DH_OK;
> +}

I think that all other SCSI device handlers check in their .attach
function whether the @sdev SCSI device is supported by the device
handler. I don't see any such check in the above function?

Bart.


Re: [RFC PATCH 02/13] scsi: ufshpb: Init part I - Read HPB config

2020-05-15 Thread Bart Van Assche
On 2020-05-15 03:30, Avri Altman wrote:
> diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c
> index 426073a..bffe699 100644
> --- a/drivers/scsi/ufs/ufshcd.c
> +++ b/drivers/scsi/ufs/ufshcd.c
> @@ -50,6 +50,7 @@
>  #include "ufs_bsg.h"
>  #include 
>  #include 
> +#include "ufshpb.h"
>  
>  #define CREATE_TRACE_POINTS
>  #include 
> @@ -7341,6 +7342,9 @@ static int ufshcd_add_lus(struct ufs_hba *hba)
>   hba->clk_scaling.is_allowed = true;
>   }
>  
> + if (ufshcd_is_hpb_supported(hba))
> + ufshpb_probe(hba);
> +
>   ufs_bsg_probe(hba);
>   scsi_scan_host(hba->host);
>   pm_runtime_put_sync(hba->dev);

This looks weird to me because of the following reasons:
- If there are direct calls from the ufshcd module into the ufshpb
  module and if there are direct calls from the ufshpb module into the
  ufshcd module, why has ufshpb been implemented as a kernel module? Or
  in other words, will it ever be possible to load the ufshcd module
  without loading the ufshpb module?
- Patch 3/13 makes ufshpb a device handler. There are no direct calls
  from any upstream SCSI LLD to any upstream device handler. However,
  this patch adds a direct call from the ufshcd module to the ufshpb
  module which is a device handler.

Bart.


Re: [PATCH] memcg: expose root cgroup's memory.stat

2020-05-15 Thread Chris Down

Hey Shakeel,

Shakeel Butt writes:

One way to measure the efficiency of memory reclaim is to look at the
ratio (pgscan+pgrefill)/pgsteal. However at the moment these stats are
not updated consistently at the system level and the ratio of these is
not very meaningful. The pgsteal and pgscan are updated for only global
reclaim while pgrefill gets updated for global as well as cgroup
reclaim.

Please note that this difference is only for system level vmstats. The
cgroup stats returned by memory.stat are actually consistent. The
cgroup's pgsteal contains number of reclaimed pages for global as well
as cgroup reclaim. So, one way to get the system level stats is to get
these stats from root's memory.stat, so, expose memory.stat for the root
cgroup.

from Johannes Weiner:
There are subtle differences between /proc/vmstat and
memory.stat, and cgroup-aware code that wants to watch the full
hierarchy currently has to know about these intricacies and
translate semantics back and forth.

Generally having the fully recursive memory.stat at the root
level could help a broader range of usecases.

Signed-off-by: Shakeel Butt 
Suggested-by: Johannes Weiner 


The patch looks great, thanks. To that extent you can add my ack:

Acked-by: Chris Down 

One concern about the API now exposed, though: to a new cgroup v2 user this 
looks fairly dodgy as a sole stat file (except for cgroup.stat) at the root. If 
I used cgroup v2 for the first time and only saw memory.stat and cgroup.stat 
there, but for some reason io.stat and cpu.stat are not available at the root 
but are available everywhere else, I think my first thought would be that the 
cgroup v2 developers must have been on some strong stuff when they came up with 
this ;-)


Even if they're only really duplicating information available elsewhere right 
now, have you considered exposing the rest of the stat files as well so that 
the API maintains a bit more consistency? As a bonus, that also means userspace 
applications can parse in the same way from the root down.


Re: [PATCH v4 4/5] blktrace: break out of blktrace setup on concurrent calls

2020-05-15 Thread Luis Chamberlain
On Mon, May 11, 2020 at 7:39 AM Luis Chamberlain  wrote:
>
> On Sat, May 09, 2020 at 06:09:38PM -0700, Bart Van Assche wrote:
> > How about using the block device name instead of the partition name in
> > the error message since the concurrency context is the block device and
> > not the partition?
>
> blk device argument can be NULL here. sg-generic is one case.

I'm going to add a comment about this, as it is easily forgotten.

  Luis


Re: [PATCH AUTOSEL 4.14 39/39] crypto: xts - simplify error handling in ->create()

2020-05-15 Thread Eric Biggers
On Thu, May 14, 2020 at 08:55:30PM -0400, Sasha Levin wrote:
> On Thu, May 14, 2020 at 12:08:43PM -0700, Eric Biggers wrote:
> > On Thu, May 14, 2020 at 02:54:56PM -0400, Sasha Levin wrote:
> > > From: Eric Biggers 
> > > 
> > > [ Upstream commit 732e540953477083082e999ff553622c59cffd5f ]
> > > 
> > > Simplify the error handling in the XTS template's ->create() function by
> > > taking advantage of crypto_drop_skcipher() now accepting (as a no-op) a
> > > spawn that hasn't been grabbed yet.
> > > 
> > > Signed-off-by: Eric Biggers 
> > > Signed-off-by: Herbert Xu 
> > > Signed-off-by: Sasha Levin 
> > 
> > Please don't backport this patch.  It's a cleanup (not a fix) that depends 
> > on
> > patches in 5.6, which you don't seem to be backporting.
> 
> For 5.6-4.19 I grabbed these to take:
> 
>   1a263ae60b04 ("gcc-10: avoid shadowing standard library 'free()' in 
> crypto")
> 
> cleanly. I'll drop it as it's mostly to avoid silly gcc10 warnings, but
> I just wanted to let you know the reason they ended up here.
> 

If the gcc 10 warning fix is needed, then you should just backport it on its
own.  It just renames a function, so it seems it's trivial to fix the conflict?

- Eric


[PATCH] mm: use only pidfd for process_madvise syscall

2020-05-15 Thread Minchan Kim
Based on discussion[1], people didn't feel we need to support both
pid and pidfd for every new coming API[2] so this patch keeps only
pidfd. This patch also changes flags's type with "unsigned int".
So finally, the API is as follows,

  ssize_t process_madvise(int pidfd, const struct iovec *iovec,
unsigned long vlen, int advice, unsigned int flags);

DESCRIPTION
  The process_madvise() system call is used to give advice or directions
  to the kernel about the address ranges from external process as well as
  local process. It provides the advice to address ranges of process
  described by iovec and vlen. The goal of such advice is to improve system
  or application performance.

  The pidfd selects the process referred to by the PID file descriptor
  specified in pidfd. (See pidfd_open(2) for further information)

  The pointer iovec points to an array of iovec structures, defined in
   as:

struct iovec {
void *iov_base; /* starting address */
size_t iov_len; /* number of bytes to be advised */
};

  The iovec describes address ranges beginning at address(iov_base)
  and with size length of bytes(iov_len).

  The vlen represents the number of elements in iovec.

  The advice is indicated in the advice argument, which is one of the
  following at this moment if the target process specified by pidfd is
  external.

MADV_COLD
MADV_PAGEOUT
MADV_MERGEABLE
MADV_UNMERGEABLE

  Permission to provide a hint to external process is governed by a
  ptrace access mode PTRACE_MODE_ATTACH_FSCREDS check; see ptrace(2).

  The process_madvise supports every advice madvise(2) has if target
  process is in same thread group with calling process so user could
  use process_madvise(2) to extend existing madvise(2) to support
  vector address ranges.

RETURN VALUE
  On success, process_madvise() returns the number of bytes advised.
  This return value may be less than the total number of requested
  bytes, if an error occurred. The caller should check return value
  to determine whether a partial advice occurred.

[1] 
https://lore.kernel.org/linux-mm/20200509124817.xmrvsrq3mla6b76k@wittgenstein/
[2] 
https://lore.kernel.org/linux-mm/9d849087-3359-c4ab-fbec-859e8186c...@virtuozzo.com/
Signed-off-by: Minchan Kim 
---
 mm/madvise.c | 42 +-
 1 file changed, 13 insertions(+), 29 deletions(-)

diff --git a/mm/madvise.c b/mm/madvise.c
index d3fbbe52d230..35c9b220146a 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1229,8 +1229,8 @@ static int process_madvise_vec(struct task_struct 
*target_task,
return ret;
 }
 
-static ssize_t do_process_madvise(int which, pid_t upid, struct iov_iter *iter,
-  int behavior, unsigned long flags)
+static ssize_t do_process_madvise(int pidfd, struct iov_iter *iter,
+   int behavior, unsigned int flags)
 {
ssize_t ret;
struct pid *pid;
@@ -1241,26 +1241,12 @@ static ssize_t do_process_madvise(int which, pid_t 
upid, struct iov_iter *iter,
if (flags != 0)
return -EINVAL;
 
-   switch (which) {
-   case P_PID:
-   if (upid <= 0)
-   return -EINVAL;
-
-   pid = find_get_pid(upid);
-   if (!pid)
-   return -ESRCH;
-   break;
-   case P_PIDFD:
-   if (upid < 0)
-   return -EINVAL;
-
-   pid = pidfd_get_pid(upid);
-   if (IS_ERR(pid))
-   return PTR_ERR(pid);
-   break;
-   default:
+   if (pidfd < 0)
return -EINVAL;
-   }
+
+   pid = pidfd_get_pid(pidfd);
+   if (IS_ERR(pid))
+   return PTR_ERR(pid);
 
task = get_pid_task(pid, PIDTYPE_PID);
if (!task) {
@@ -1292,9 +1278,8 @@ static ssize_t do_process_madvise(int which, pid_t upid, 
struct iov_iter *iter,
return ret;
 }
 
-SYSCALL_DEFINE6(process_madvise, int, which, pid_t, upid,
-   const struct iovec __user *, vec, unsigned long, vlen,
-   int, behavior, unsigned long, flags)
+SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
+   unsigned long, vlen, int, behavior, unsigned int, flags)
 {
ssize_t ret;
struct iovec iovstack[UIO_FASTIOV];
@@ -1303,19 +1288,18 @@ SYSCALL_DEFINE6(process_madvise, int, which, pid_t, 
upid,
 
ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), , );
if (ret >= 0) {
-   ret = do_process_madvise(which, upid, , behavior, flags);
+   ret = do_process_madvise(pidfd, , behavior, flags);
kfree(iov);
}
return ret;
 }
 
 #ifdef CONFIG_COMPAT

Re: [PATCH][next] USB: EHCI: ehci-mv: fix less than zero comparisonof an unsigned int

2020-05-15 Thread Alan Stern
On Sat, May 16, 2020 at 07:23:42AM +0800, Tang Bin wrote:
> Hi Alan & Colin:
> 
> On 2020/5/16 1:21, Alan Stern wrote:
> > On Fri, May 15, 2020 at 05:54:53PM +0100, Colin King wrote:
> > > From: Colin Ian King 
> > > 
> > > The comparison of hcd->irq to less than zero for an error check will
> > > never be true because hcd->irq is an unsigned int.  Fix this by
> > > assigning the int retval to the return of platform_get_irq and checking
> > > this for the -ve error condition and assigning hcd->irq to retval.
> > > 
> > > Addresses-Coverity: ("Unsigned compared against 0")
> > > Fixes: c856b4b0fdb5 ("USB: EHCI: ehci-mv: fix error handling in 
> > > mv_ehci_probe()")
> > > Signed-off-by: Colin Ian King 
> > > ---
> > Thanks to Coverity for spotting this.  Any reason why it didn't spot
> > exactly the same mistake in the ohci-da8xx.c driver?
> 
> I just looked at the code and wondered whether it would be more appropriate
> to modify the header file "hcd.h".  Since 'irq' might be an negative, why
> not just modify the variables in the 'struct usb_hcd',  'unsigned int 
> irq'--> 'int irq'? After all, it's a public one.

I think the code in the drivers should be changed.  The reason is if 
platform_get_irq() returns a negative value, then that value should be 
what the probe function returns.

Alan Stern


Re: [PATCH v4] Make initramfs honor CONFIG_DEVTMPFS_MOUNT

2020-05-15 Thread Rob Landley
On 5/14/20 11:50 PM, Randy Dunlap wrote:
> Hi Rob,

Um, hi.

> You need to send this patch to some maintainer who could merge it.

The implicit "if" is "you expect the kernel bureaucracy to merge anything Not
Invented Here", and the 3 year gap since the last version is because I stopped:

  https://landley.net/notes-2017.html#14-09-2017

To be honest I didn't think anyone would notice this. They don't usually:

  http://lkml.iu.edu/hypermail/linux/kernel/2002.2/00083.html
  https://www.spinics.net/lists/linux-sh/msg56844.html

It just seems polite to post things that got shipped to customers.

> And it uses the wrong multi-line comment format.

Offending comment removed.

Rob
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index f046d21..97352d4 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -48,16 +48,10 @@  config DEVTMPFS_MOUNT
 	bool "Automount devtmpfs at /dev, after the kernel mounted the rootfs"
 	depends on DEVTMPFS
 	help
-	  This will instruct the kernel to automatically mount the
-	  devtmpfs filesystem at /dev, directly after the kernel has
-	  mounted the root filesystem. The behavior can be overridden
-	  with the commandline parameter: devtmpfs.mount=0|1.
-	  This option does not affect initramfs based booting, here
-	  the devtmpfs filesystem always needs to be mounted manually
-	  after the rootfs is mounted.
-	  With this option enabled, it allows to bring up a system in
-	  rescue mode with init=/bin/sh, even when the /dev directory
-	  on the rootfs is completely empty.
+	  Automatically mount devtmpfs at /dev on the root filesystem, which
+	  lets the system come up in rescue mode with [rd]init=/bin/sh.
+	  Override with devtmpfs.mount=0 on the commandline. Initramfs can
+	  create a /dev dir as needed, other rootfs needs the mount point.
 
 config STANDALONE
 	bool "Select only drivers that don't need compile-time external firmware"
diff --git a/fs/namespace.c b/fs/namespace.c
index f8893dc..06057d7 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2417,7 +2417,16 @@  static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags)
 	err = -EBUSY;
 	if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb &&
 	path->mnt->mnt_root == path->dentry)
+	{
+		if (IS_ENABLED(CONFIG_DEVTMPFS_MOUNT) &&
+		!strcmp(path->mnt->mnt_sb->s_type->name, "devtmpfs"))
+		{
+			printk(KERN_WARNING "Debian bug workaround for devtmpfs overmount.");
+
+			err = 0;
+		}
 		goto unlock;
+	}
 
 	err = -EINVAL;
 	if (d_is_symlink(newmnt->mnt.mnt_root))
diff --git a/init/main.c b/init/main.c
index 0ee9c686..0d8e5ec 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1065,12 +1065,6 @@  static noinline void __init kernel_init_freeable(void)
 
 	do_basic_setup();
 
-	/* Open the /dev/console on the rootfs, this should never fail */
-	if (ksys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
-		pr_err("Warning: unable to open an initial console.\n");
-
-	(void) ksys_dup(0);
-	(void) ksys_dup(0);
 	/*
 	 * check if there is an early userspace init.  If yes, let it do all
 	 * the work
@@ -1082,8 +1076,17 @@  static noinline void __init kernel_init_freeable(void)
 			ramdisk_execute_command, 0) != 0) {
 		ramdisk_execute_command = NULL;
 		prepare_namespace();
+	} else if (IS_ENABLED(CONFIG_DEVTMPFS_MOUNT)) {
+		sys_mkdir("/dev", 0755);
+		devtmpfs_mount("/dev");
 	}
 
+	/* Open the /dev/console on the rootfs, this should never fail */
+	if (ksys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
+		pr_err("Warning: unable to open an initial console.\n");
+	(void) ksys_dup(0);
+	(void) ksys_dup(0);
+
 	/*
 	 * Ok, we have completed the initial bootup, and
 	 * we're essentially up and running. Get rid of the


Re: [PATCH v5 0/7] firmware: add partial read support in request_firmware_into_buf

2020-05-15 Thread Luis Chamberlain
On Fri, May 15, 2020 at 04:28:08PM -0700, Scott Branden wrote:
> Hi Luis,
> 
> On 2020-05-15 1:47 p.m., Luis Chamberlain wrote:
> > On Wed, May 13, 2020 at 12:23:59PM -0400, Mimi Zohar wrote:
> > > Hi Scott,
> > > 
> > > On Thu, 2020-05-07 at 17:27 -0700, Scott Branden wrote:
> > > > Please consider this version series ready for upstream acceptance.
> > > > 
> > > > This patch series adds partial read support in 
> > > > request_firmware_into_buf.
> > > > In order to accept the enhanced API it has been requested that kernel
> > > > selftests and upstreamed driver utilize the API enhancement and so
> > > > are included in this patch series.
> > > > 
> > > > Also in this patch series is the addition of a new Broadcom VK driver
> > > > utilizing the new request_firmware_into_buf enhanced API.
> > > Up to now, the firmware blob was read into memory allowing IMA to
> > > verify the file signature.  With this change, ima_post_read_file()
> > > will not be able to verify the file signature.
> > > 
> > > (I don't think any of the other LSMs are on this hook, but you might
> > > want to Cc the LSM or integrity mailing list.)
> > Scott, so it sounds we need a resolution for pread for IMA for file
> > signature verification. It seems that can be addressed though. Feel
> > free to submit the u32 flag changes which you picked up on though in
> > the meantime.
> Sure, I can submit a new patch to change the existing enum to
> a u32. 

Great thanks!

> For the new pread flags I am adding I could also leave as
> a u32 or change from a u32 to an enum since there is currently only
> one flag in use.  Then, in the future if another flag was added we would
> need
> to change it back to a u32.

Yes that approach works well, enum for now.


  Luis


[PATCH v2] mm/hmm/test: use xa_for_each_range instead of looping

2020-05-15 Thread Ralph Campbell
The test driver uses an xa_array to store virtual to physical address
translations for a simulated hardware device. The MMU notifier
invalidation callback is used to keep the table consistent with the CPU
page table and is frequently called only for a page or two. However, if
the test process exits unexpectedly or is killed, the range can be
[0..ULONG_MAX] in which case calling xa_erase() for every possible PFN
results in CPU timeouts.
Use xa_for_each_range() to efficiently erase entries in the range.

Signed-off-by: Ralph Campbell 
---

This patch is based on Jason Gunthorpe's hmm tree and should be folded
into the ("mm/hmm/test: add selftest driver for HMM") patch once this
patch is reviewed, etc.

v1 -> v2:
Use xa_for_each_range() instead of special casing [0..ULONG_MAX].

 lib/test_hmm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 8b36c26b717b..5c1858e325ba 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -196,13 +196,15 @@ static void dmirror_do_update(struct dmirror *dmirror, 
unsigned long start,
  unsigned long end)
 {
unsigned long pfn;
+   void *entry;
 
/*
 * The XArray doesn't hold references to pages since it relies on
 * the mmu notifier to clear page pointers when they become stale.
 * Therefore, it is OK to just clear the entry.
 */
-   for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++)
+   xa_for_each_range(>pt, pfn, entry, start >> PAGE_SHIFT,
+ end >> PAGE_SHIFT)
xa_erase(>pt, pfn);
 }
 
-- 
2.20.1



[PATCH] f2fs: fix checkpoint=disable:%u%%

2020-05-15 Thread Jaegeuk Kim
When parsing the mount option, we don't have sbi->user_block_count.
Should do it after getting it.

Signed-off-by: Jaegeuk Kim 
---
 fs/f2fs/f2fs.h  |  1 +
 fs/f2fs/super.c | 25 +++--
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 51863e4f5d4e0..fb180020e175c 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -139,6 +139,7 @@ struct f2fs_mount_info {
int fs_mode;/* fs mode: LFS or ADAPTIVE */
int bggc_mode;  /* bggc mode: off, on or sync */
bool test_dummy_encryption; /* test dummy encryption */
+   block_t unusable_cap_perc;  /* percentage for cap */
block_t unusable_cap;   /* Amount of space allowed to be
 * unusable when disabling checkpoint
 */
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 441eaaf9739c4..a71da699cb2d5 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -284,6 +284,22 @@ static inline void limit_reserve_root(struct f2fs_sb_info 
*sbi)
   F2FS_OPTION(sbi).s_resgid));
 }
 
+static inline void adjust_unusable_cap_perc(struct f2fs_sb_info *sbi)
+{
+   if (!F2FS_OPTION(sbi).unusable_cap_perc)
+   return;
+
+   if (F2FS_OPTION(sbi).unusable_cap_perc == 100)
+   F2FS_OPTION(sbi).unusable_cap = sbi->user_block_count;
+   else
+   F2FS_OPTION(sbi).unusable_cap = (sbi->user_block_count / 100) *
+   F2FS_OPTION(sbi).unusable_cap_perc;
+
+   f2fs_info(sbi, "Adjust unusable cap for checkpoint=disable = %u / %u%%",
+   F2FS_OPTION(sbi).unusable_cap,
+   F2FS_OPTION(sbi).unusable_cap_perc);
+}
+
 static void init_once(void *foo)
 {
struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo;
@@ -785,12 +801,7 @@ static int parse_options(struct super_block *sb, char 
*options)
return -EINVAL;
if (arg < 0 || arg > 100)
return -EINVAL;
-   if (arg == 100)
-   F2FS_OPTION(sbi).unusable_cap =
-   sbi->user_block_count;
-   else
-   F2FS_OPTION(sbi).unusable_cap =
-   (sbi->user_block_count / 100) * arg;
+   F2FS_OPTION(sbi).unusable_cap_perc = arg;
set_opt(sbi, DISABLE_CHECKPOINT);
break;
case Opt_checkpoint_disable_cap:
@@ -1840,6 +1851,7 @@ static int f2fs_remount(struct super_block *sb, int 
*flags, char *data)
(test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0);
 
limit_reserve_root(sbi);
+   adjust_unusable_cap_perc(sbi);
*flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME);
return 0;
 restore_gc:
@@ -3516,6 +3528,7 @@ static int f2fs_fill_super(struct super_block *sb, void 
*data, int silent)
sbi->reserved_blocks = 0;
sbi->current_reserved_blocks = 0;
limit_reserve_root(sbi);
+   adjust_unusable_cap_perc(sbi);
 
for (i = 0; i < NR_INODE_TYPE; i++) {
INIT_LIST_HEAD(>inode_list[i]);
-- 
2.26.2.761.g0e0b3e54be-goog



Re: [PATCH v10 2/3] mfd: add Gateworks System Controller core driver

2020-05-15 Thread kbuild test robot
Hi Tim,

I love your patch! Yet something to improve:

[auto build test ERROR on ljones-mfd/for-mfd-next]
[also build test ERROR on hwmon/hwmon-next linus/master v5.7-rc5 next-20200515]
[if your patch is applied to the wrong git tree, please drop us a note to help
improve the system. BTW, we also suggest to use '--base' option to specify the
base tree in git format-patch, please see https://stackoverflow.com/a/37406982]

url:
https://github.com/0day-ci/linux/commits/Tim-Harvey/Add-support-for-the-Gateworks-System-Controller/20200515-232142
base:   https://git.kernel.org/pub/scm/linux/kernel/git/lee/mfd.git for-mfd-next
config: x86_64-allyesconfig (attached as .config)
compiler: clang version 11.0.0 (https://github.com/llvm/llvm-project 
9d4cf5bd421fb6467ff5f00e26a37527246dd4d6)
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# install x86_64 cross compiling tool for clang build
# apt-get install binutils-x86-64-linux-gnu
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kbuild test robot 

All errors (new ones prefixed by >>, old ones prefixed by <<):

>> drivers/mfd/gateworks-gsc.c:270:14: error: use of undeclared identifier 
>> 'gsc_id_table'
.id_table   = gsc_id_table,
^
1 error generated.

vim +/gsc_id_table +270 drivers/mfd/gateworks-gsc.c

   264  
   265  static struct i2c_driver gsc_driver = {
   266  .driver = {
   267  .name   = "gateworks-gsc",
   268  .of_match_table = gsc_of_match,
   269  },
 > 270  .id_table   = gsc_id_table,
   271  .probe_new  = gsc_probe,
   272  .remove = gsc_remove,
   273  };
   274  module_i2c_driver(gsc_driver);
   275  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


.config.gz
Description: application/gzip


Re: [PATCH] mm/hmm/test: destroy xa_array instead of looping

2020-05-15 Thread Ralph Campbell



On 5/15/20 4:15 PM, Jason Gunthorpe wrote:

On Wed, May 13, 2020 at 02:45:07PM -0700, Ralph Campbell wrote:

The test driver uses an xa_array to store virtual to physical address
translations for a simulated hardware device. The MMU notifier
invalidation callback is used to keep the table consistent with the CPU
page table and is frequently called only for a page or two. However, if
the test process exits unexpectedly or is killed, the range can be
[0..ULONG_MAX] in which case calling xa_erase() for every possible PFN
results in CPU timeouts. Munmap() can result in a large range being
invalidated but in that case, the xa_array is likely to contain entries
that need to be invalidated.
Check for [0..ULONG_MAX] explicitly and just destroy the whole table.

Signed-off-by: Ralph Campbell 

This patch is based on Jason Gunthorpe's hmm tree and should be folded
into the ("mm/hmm/test: add selftest driver for HMM") patch once this
patch is reviewed, etc.

  lib/test_hmm.c | 6 ++
  1 file changed, 6 insertions(+)

diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index 8b36c26b717b..b89852ec3c29 100644
+++ b/lib/test_hmm.c
@@ -201,7 +201,13 @@ static void dmirror_do_update(struct dmirror *dmirror, 
unsigned long start,
 * The XArray doesn't hold references to pages since it relies on
 * the mmu notifier to clear page pointers when they become stale.
 * Therefore, it is OK to just clear the entry.
+* However, if the entire address space is being invalidated, it
+* takes too long to clear them one at a time so destroy the array.
 */
+   if (start == 0 && end == ULONG_MAX) {
+   xa_destroy(>pt);
+   return;
+   }
for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++)
xa_erase(>pt, pfn);
  }


Just use xa_for_each_range() instead of the naive loop, it already
optimizes against membership and avoids the need for the xa_destroy
hack

Jason



For some reason I had looked at that and rejected it but of course, it works
fine. :-)
Thanks!


[PATCH 1/1] Documentation: security: core.rst: add missing argument

2020-05-15 Thread Ben Boeckel
From: Ben Boeckel 

This argument was just never documented in the first place.

Signed-off-by: Ben Boeckel 
---
 Documentation/security/keys/core.rst | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/Documentation/security/keys/core.rst 
b/Documentation/security/keys/core.rst
index d9b0b859018b..c26b9e7d47c2 100644
--- a/Documentation/security/keys/core.rst
+++ b/Documentation/security/keys/core.rst
@@ -920,10 +920,14 @@ The keyctl syscall functions are:
 
long keyctl(KEYCTL_PKEY_QUERY,
key_serial_t key_id, unsigned long reserved,
+   const char* params,
struct keyctl_pkey_query *info);
 
- Get information about an asymmetric key.  The information is returned in
- the keyctl_pkey_query struct::
+ Get information about an asymmetric key.  Specific algorithms and
+ encodings may be queried by using the ``params`` argument.  This is a
+ string containing a space- or tab-separated string of key-value pairs.
+ Currently supported keys include ``enc`` and ``hash``.  The information
+ is returned in the keyctl_pkey_query struct::
 
__u32   supported_ops;
__u32   key_size;
-- 
2.25.4



[PATCH 0/1] Document keyctl(KEYCTL_PKEY_QUERY) arguments correctly

2020-05-15 Thread Ben Boeckel
From: Ben Boeckel 

This is the way the code parses the arguments and libkeyutils calls the
syscall.

Note on the email split: I'm still in the process of migrating emails
for various usages hence the email From mismatch here (I've migrated my
list subscription, but not my general contribution email).

Ben Boeckel (1):
  Documentation: security: core.rst: add missing argument

 Documentation/security/keys/core.rst | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)


base-commit: 12bf0b632ed090358cbf03e323e5342212d0b2e4
-- 
2.25.4



Re: [PATCH v4 3/9] usb: dwc3: Increase timeout for CmdAct cleared by device controller

2020-05-15 Thread Thinh Nguyen
Hi,

Jun Li wrote:
>> -Original Message-
>> From: Felipe Balbi  On Behalf Of Felipe Balbi
>> Sent: 2020年5月15日 17:31
>> To: Jun Li 
>> Cc: John Stultz ; lkml 
>> ; Yu
>> Chen ; Greg Kroah-Hartman ; 
>> Rob
>> Herring ; Mark Rutland ; ShuFan Lee
>> ; Heikki Krogerus ;
>> Suzuki K Poulose ; Chunfeng Yun
>> ; Hans de Goede ; Andy 
>> Shevchenko
>> ; Valentin Schneider ;
>> Jack Pham ; Linux USB List 
>> ; open
>> list:OPEN FIRMWARE AND FLATTENED DEVICE TREE BINDINGS 
>> ;
>> Peter Chen ; Jun Li ; Thinh Nguyen
>> 
>> Subject: Re: [PATCH v4 3/9] usb: dwc3: Increase timeout for CmdAct cleared 
>> by device
>> controller
>>
>>
>> Hi,
>>
>> Jun Li  writes:
 @@ -397,12 +407,18 @@ int dwc3_send_gadget_ep_cmd(struct dwc3_ep *dep, 
 unsigned
>> cmd,
  dwc3_gadget_ep_get_transfer_index(dep);
  }

 -   if (saved_config) {
 +   if (saved_hs_config) {
  reg = dwc3_readl(dwc->regs, DWC3_GUSB2PHYCFG(0));
 -   reg |= saved_config;
 +   reg |= saved_hs_config;
  dwc3_writel(dwc->regs, DWC3_GUSB2PHYCFG(0), reg);
  }

 +   if (saved_ss_config) {
 +   reg = dwc3_readl(dwc->regs, DWC3_GUSB3PIPECTL(0));
 +   reg |= saved_ss_config;
 +   dwc3_writel(dwc->regs, DWC3_GUSB3PIPECTL(0), reg);
 +   }
 +
  return ret;
   }
>>> Unfortunately this way can't work, once the SS PHY enters P3, disable
>>> suspend_en can't force SS PHY exit P3, unless do this at the very
>>> beginning to prevent SS PHY entering P3(e.g. add "snps,dis_u3_susphy_quirk" 
>>> for
>> test).
>>
>> It sounds like you have a quirky PHY.
>  From what I got from the IC design, the behavior of DWC3_GUSB3PIPECTL_SUSPHY
> bit should be as what I said, not a quirky.
>
> Hi Thinh, could you comment this?

You only need to wake up the usb2 phy when issuing the command while 
running in highspeed or below. If you're running in SS or higher, 
internally the controller does it for you for usb3 phy. In Jun's case, 
it seems like it takes longer for his phy to wake up.

IMO, in this case, I think it's fine to increase the command timeout.

BR,
Thinh



Re: [PATCH] memcg: expose root cgroup's memory.stat

2020-05-15 Thread Shakeel Butt
On Fri, May 15, 2020 at 11:09 AM Johannes Weiner  wrote:
>
> On Fri, May 15, 2020 at 10:49:22AM -0700, Shakeel Butt wrote:
> > On Fri, May 15, 2020 at 8:00 AM Roman Gushchin  wrote:
> > > On Fri, May 15, 2020 at 06:44:44AM -0700, Shakeel Butt wrote:
> > > > On Fri, May 15, 2020 at 6:24 AM Johannes Weiner  
> > > > wrote:
> > > > > You're right. It should only bypass the page_counter, but still set
> > > > > page->mem_cgroup = root_mem_cgroup, just like user pages.
> > >
> > > What about kernel threads? We consider them belonging to the root memory
> > > cgroup. Should their memory consumption being considered in root-level 
> > > stats?
> > >
> > > I'm not sure we really want it, but I guess we need to document how
> > > kernel threads are handled.
> >
> > What will be the cons of updating root-level stats for kthreads?
>
> Should kernel threads be doing GFP_ACCOUNT allocations without
> memalloc_use_memcg()? GFP_ACCOUNT implies that the memory consumption
> can be significant and should be attributed to userspace activity.
>
> If the kernel thread has no userspace entity to blame, it seems to
> imply the same thing as a !GFP_ACCOUNT allocation: shared public
> infrastructure, not interesting to account to any specific cgroup.
>
> I'm not sure if we have such allocations right now. But IMO we should
> not account anything from kthreads, or interrupts for that matter,
> /unless/ there is a specific active_memcg that was set by the kthread
> or the interrupt.

I totally agree with you but I think your response is about memory
charging in IRQ/kthread context, a topic of a parallel patch from
Zefan Li at [1].

Here we are discussing stats update for kthreads e.g. should we update
root memcg's MEMCG_KERNEL_STACK_KB stat when we allocate stack for
kernel threads?

[1] http://lkml.kernel.org/r/3a721f62-5a66-8bc5-247b-5c8b7c51c...@huawei.com


Re: [PATCH v10 01/26] Documentation/x86: Add CET description

2020-05-15 Thread Andrew Cooper
On 15/05/2020 23:43, Dave Hansen wrote:
> On 5/15/20 2:33 PM, Yu-cheng Yu wrote:
>> On Fri, 2020-05-15 at 11:39 -0700, Dave Hansen wrote:
>>> On 5/12/20 4:20 PM, Yu-cheng Yu wrote:
>>> Can a binary compiled with CET run without CET?
>> Yes, but a few details:
>>
>> - The shadow stack is transparent to the application.  A CET application does
>> not have anything different from a non-CET application.  However, if a CET
>> application uses any CET instructions (e.g. INCSSP), it must first check if 
>> CET
>> is turned on.
>> - If an application is compiled for IBT, the compiler inserts ENDBRs at 
>> branch
>> targets.  These are nops if IBT is not on.
> I appreciate the detailed response, but it wasn't quite what I was
> asking.  Let's ignore IBT for now and just talk about shadow stacks.
>
> An app compiled with the new ELF flags and running on a CET-enabled
> kernel and CPU will start off with shadow stacks allocated and enabled,
> right?  It can turn its shadow stack off per-thread with the new prctl.
>  But, otherwise, it's stuck, the only way to turn shadow stacks off at
> startup would be editing the binary.
>
> Basically, if there ends up being a bug in an app that violates the
> shadow stack rules, the app is broken, period.  The only recourse is to
> have the kernel disable CET and reboot.
>
> Is that right?

If I may interject with the experience of having got supervisor shadow
stacks working for Xen.

Turning shadow stacks off is quite easy - clear MSR_U_CET.SHSTK_EN and
the shadow stack will stay in whatever state it was in, and you can
largely forget about it.  (Of course, in a sandbox scenario, it would be
prudent to prevent the ability to disable shadow stacks.)

Turning shadow stacks on is much more tricky.  You cannot enable it in
any function you intend to return from, as the divergence between the
stack and shadow stack will constitute a control flow violation.


When it comes to binaries,  you can reasonably arrange for clone() to
start a thread on a new stack/shstk, as you can prepare both stacks
suitably before execution starts.

You cannot reasonably implement a system call for "turn shadow stacks on
for me", because you'll crash on the ret out of the VDSO from the system
call.  It would be possible to conceive of an exec()-like system call
which is "discard my current stack, turn on shstk, and start me on this
new stack/shstk".

In principle, with a pair of system calls to atomically manage the shstk
settings and stack switching, it might be possible to construct a
`run_with_shstk_enabled(func, stack, shstk)` API which executes in the
current threads context and doesn't explode.

Fork() is a problem when shadow stacks are disabled in the parent.  The
moment shadow stacks are disabled, the regular stack diverges from the
shadow stack.  A CET-enabled app which turns off shstk and then fork()'s
must have the child inherit the shstk-off property.  If the child were
to start with shstk enabled, it would explode almost immediately due to
the parent's stack divergence which it inherited.


Finally seeing as the question was asked but not answered, it is
actually quite easy to figure out whether shadow stacks are enabled in
the current thread.

    mov $1, %eax
    rdsspd  %eax
    cmp $1, %eax
    je  no_shstk
    ...
no_shstk:

rdssp is allocated from the hint nop encoding space, and the minimum
alignment of the shadow stack pointer is 4.  On older parts, or with
shstk disabled (either at the system level, or for the thread), the $1
will be preserved in %eax, while if CET is active, it will be clobbered
with something that has the bottom two bits clear.

It turns out this is a lifesaver for codepaths (e.g. the NMI handler)
which need to use other CET instructions which aren't from the hint nop
space, and run before the BSP can set everything up.

~Andrew


[patch V6 00/37] x86/entry: Rework leftovers and merge plan

2020-05-15 Thread Thomas Gleixner
Folks!

This is V6 of the rework series. V5 can be found here:

  https://lore.kernel.org/r/20200512210059.056244...@linutronix.de

The V6 leftover series is based on:

  git://git.kernel.org/pub/scm/linux/kernel/git/tglx/devel.git entry-base-v6

which is the reworked base series from part 1-4 of the original 5 part
series with a few changes which are described in detail below in the merge
plan section.

V6 has the following changes vs. V5:

- Rebased on top entry-base-v6

- Addressed Steven's request to split up the hardware latency detector.
  These are 3 patches now as I couldn't resist cleaning up the
  timestamping mess in that code before splitting it up.

- Dropped the KVM/SVM change as that is going to be routed
  differently. See below.

The full series is available from:

  git://git.kernel.org/pub/scm/linux/kernel/git/tglx/devel.git entry-v6-the-rest

On top of that the kvm changes are applied for completeness and available
from:

  git://git.kernel.org/pub/scm/linux/kernel/git/tglx/devel.git entry-v6-full


Merge plan:
---

After figuring out that the entry pile and next are not really happy with
each other, I spent quite some time to come up with a plan.

The goal was to:

- not let Stephen Rothwell grow more grey hair when trying to resolve
  the conflicts

- allow the affected trees (RCU and KVM) to take a small part of the
  series into their trees while making sure that the x86/entry branch
  is complete and contains the required RCU and KVM changes as well.

About 10 hours of patch tetris later the solution looks like this:

  I've reshuffled the patches so that they are grouped by subsystem instead
  of having the cross tree/subsystem patches close to the actual usage site
  in the x86/entry series.

  This allowed me to tag these subsytem parts and they contain just the
  minimal subset of changes to be able to build and boot.

The resulting tag list is:

  - noinstr-lds-2020-05-15

A single commit containing the vmlinux.lds.h change which introduces
the noinstr.text section.

  - noinstr-core-2020-05-15

Based on noinstr-lds-2020-05-15 and contains the core changes

  - noinstr-core-for-kvm-2020-05-15

Subset of noinstr-core-2020-05-15 which is required to let KVM pull
the KVM async pagefault cleanup and base the guest_enter/exit() and
noinstr changes on top.

  - noinstr-rcu-nmi-2020-05-15

Based on the core/rcu branch in the tip tree. It has merged in
noinstr-lds-2020-05-15 and contains the nmi_enter/exit() changes along
with the noinstr section changes on top.

This tag is intended to be pulled by Paul into his rcu/next branch so
he can sort the conflicts and base further work on top.

  - noinstr-x86-kvm-2020-05-15

Based on noinstr-core-for-kvm-2020-05-15 and contains the async page
fault cleanup which goes into x86/entry so the IDTENTRY conversion of
#PF which also touches the async pagefault code can be applied on top

This tag is intended to be pulled by Paolo into his next branch so he
can work against these changes and the merged result is also target for
the rebased version of the KVM guest_enter/exit() changes. These are
not part of the entry-v6-base tag. I'm going to post them as a separate
series because the original ones are conflicting with work in that area
in the KVM tree.

  - noinstr-kcsan-2020-05015, noinstr-kprobes-2020-05-15,
noinstr-objtool-2020-05-15

TIP tree internal tags which I added to reduce the brain-melt.

The x86/entry branch is based on the TIP x86/entry branch and has the
following branches and tags merged and patches from part 1-4 applied:

- x86/asm because this has conflicting changes vs. #DF

- A small set of preparatory changes and fixes which are independent
  of the noinstr mechanics

- noinstr-objtool-2020-05-15
- noinstr-core-2020-05-15
- noinstr-kprobes-2020-05-15
- noinstr-rcu-nmi-2020-05-15
- noinstr-kcsan-2020-05015
- noinstr-x86-kvm-2020-05-15

- The part 1-4 patches up to

51336ff8b658 ("x86/entry: Convert double fault exception to 
IDTENTRY_DF")

  This is tagged as entry-v6-base

The remaining patches in this leftover series will be applied on top.

If this works for all maintainers involved, then I'm going to pull the tags
and branches into the tip-tree which makes them immutable.

If not, please tell me ASAP that I should restart the patch tetris session
after hiding in a brown paperbag for some time to recover from brain melt.

Thanks,

tglx




[patch V6 08/37] x86/entry/64: Move do_softirq_own_stack() to C

2020-05-15 Thread Thomas Gleixner


The first step to get rid of the ENTER/LEAVE_IRQ_STACK ASM macro maze.  Use
the new C code helpers to move do_softirq_own_stack() out of ASM code.

Signed-off-by: Thomas Gleixner 

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 3b8da9f09297..bdf8391b2f95 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1145,19 +1145,6 @@ SYM_FUNC_START(asm_call_on_stack)
ret
 SYM_FUNC_END(asm_call_on_stack)
 
-/* Call softirq on interrupt stack. Interrupts are off. */
-.pushsection .text, "ax"
-SYM_FUNC_START(do_softirq_own_stack)
-   pushq   %rbp
-   mov %rsp, %rbp
-   ENTER_IRQ_STACK regs=0 old_rsp=%r11
-   call__do_softirq
-   LEAVE_IRQ_STACK regs=0
-   leaveq
-   ret
-SYM_FUNC_END(do_softirq_own_stack)
-.popsection
-
 #ifdef CONFIG_XEN_PV
 /*
  * A note on the "critical region" in our callback handler.
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 12df3a4abfdd..62cff52e03c5 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -20,6 +20,7 @@
 #include 
 
 #include 
+#include 
 #include 
 #include 
 
@@ -70,3 +71,11 @@ int irq_init_percpu_irqstack(unsigned int cpu)
return 0;
return map_irq_stack(cpu);
 }
+
+void do_softirq_own_stack(void)
+{
+   if (irqstack_active())
+   __do_softirq();
+   else
+   run_on_irqstack(__do_softirq, NULL);
+}



[patch V6 15/37] x86/entry: Change exit path of xen_failsafe_callback

2020-05-15 Thread Thomas Gleixner


xen_failsafe_callback is invoked from XEN for two cases:

  1. Fault while reloading DS, ES, FS or GS
  2. Fault while executing IRET

#1 retries the IRET after XEN has fixed up the segments.
#2 injects a #GP which kills the task

For #1 there is no reason to go through the full exception return path
because the task's TIF state is still the same. So just going straight to
the IRET path is good enough.

Signed-off-by: Thomas Gleixner 
Cc: Boris Ostrovsky 
Cc: Juergen Gross 

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index dd8064ffdf12..65fd41fe77d9 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1352,7 +1352,7 @@ SYM_FUNC_START(xen_failsafe_callback)
 5: pushl   $-1 /* orig_ax = -1 => not a system 
call */
SAVE_ALL
ENCODE_FRAME_POINTER
-   jmp ret_from_exception
+   jmp handle_exception_return
 
 .section .fixup, "ax"
 6: xorl%eax, %eax
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1157d63b3682..55f612032793 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1175,7 +1175,7 @@ SYM_CODE_START(xen_failsafe_callback)
pushq   $-1 /* orig_ax = -1 => not a system call */
PUSH_AND_CLEAR_REGS
ENCODE_FRAME_POINTER
-   jmp error_exit
+   jmp error_return
 SYM_CODE_END(xen_failsafe_callback)
 #endif /* CONFIG_XEN_PV */
 



[patch V6 04/37] x86: Make hardware latency tracing explicit

2020-05-15 Thread Thomas Gleixner


The hardware latency tracer calls into trace_sched_clock and ends up in
various instrumentable functions which is problematic vs. the kprobe
handling especially the text poke machinery. It's invoked from
nmi_enter/exit(), i.e. non-instrumentable code.

Use nmi_enter/exit_notrace() instead. These variants do not invoke the
hardware latency tracer which avoids chasing down complex callchains to
make them non-instrumentable.

The real interesting measurement is the actual NMI handler. Add an explicit
invocation for the hardware latency tracer to it.

#DB and #BP are uninteresting as they really should not be in use when
analyzing hardware induced latencies.

If #DF hits, hardware latency is definitely not interesting anymore and in
case of a machine check the hardware latency is not the most troublesome
issue either.

Signed-off-by: Thomas Gleixner 

---
 arch/x86/kernel/cpu/mce/core.c |4 ++--
 arch/x86/kernel/nmi.c  |6 --
 arch/x86/kernel/traps.c|   12 +++-
 3 files changed, 13 insertions(+), 9 deletions(-)

--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1916,7 +1916,7 @@ static __always_inline void exc_machine_
mce_check_crashing_cpu())
return;
 
-   nmi_enter();
+   nmi_enter_notrace();
/*
 * The call targets are marked noinstr, but objtool can't figure
 * that out because it's an indirect call. Annotate it.
@@ -1924,7 +1924,7 @@ static __always_inline void exc_machine_
instrumentation_begin();
machine_check_vector(regs);
instrumentation_end();
-   nmi_exit();
+   nmi_exit_notrace();
 }
 
 static __always_inline void exc_machine_check_user(struct pt_regs *regs)
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -334,6 +334,7 @@ static noinstr void default_do_nmi(struc
__this_cpu_write(last_nmi_rip, regs->ip);
 
instrumentation_begin();
+   ftrace_nmi_handler_enter();
 
handled = nmi_handle(NMI_LOCAL, regs);
__this_cpu_add(nmi_stats.normal, handled);
@@ -420,6 +421,7 @@ static noinstr void default_do_nmi(struc
unknown_nmi_error(reason, regs);
 
 out:
+   ftrace_nmi_handler_exit();
instrumentation_end();
 }
 
@@ -536,14 +538,14 @@ DEFINE_IDTENTRY_NMI(exc_nmi)
}
 #endif
 
-   nmi_enter();
+   nmi_enter_notrace();
 
inc_irq_stat(__nmi_count);
 
if (!ignore_nmis)
default_do_nmi(regs);
 
-   nmi_exit();
+   nmi_exit_notrace();
 
 #ifdef CONFIG_X86_64
if (unlikely(this_cpu_read(update_debug_stack))) {
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -387,7 +387,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault)
}
 #endif
 
-   nmi_enter();
+   nmi_enter_notrace();
instrumentation_begin();
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
 
@@ -632,12 +632,14 @@ DEFINE_IDTENTRY_RAW(exc_int3)
instrumentation_end();
idtentry_exit(regs);
} else {
-   nmi_enter();
+   nmi_enter_notrace();
instrumentation_begin();
+   ftrace_nmi_handler_enter();
if (!do_int3(regs))
die("int3", regs, 0);
+   ftrace_nmi_handler_exit();
instrumentation_end();
-   nmi_exit();
+   nmi_exit_notrace();
}
 }
 
@@ -849,7 +851,7 @@ static void noinstr handle_debug(struct
 static __always_inline void exc_debug_kernel(struct pt_regs *regs,
 unsigned long dr6)
 {
-   nmi_enter();
+   nmi_enter_notrace();
/*
 * The SDM says "The processor clears the BTF flag when it
 * generates a debug exception."  Clear TIF_BLOCKSTEP to keep
@@ -871,7 +873,7 @@ static __always_inline void exc_debug_ke
if (dr6)
handle_debug(regs, dr6, false);
 
-   nmi_exit();
+   nmi_exit_notrace();
 }
 
 static __always_inline void exc_debug_user(struct pt_regs *regs,



[patch V6 18/37] x86/irq: Use generic irq_regs implementation

2020-05-15 Thread Thomas Gleixner


The only difference is the name of the per-CPU variable: irq_regs
vs. __irq_regs, but the accessor functions are identical.

Remove the pointless copy and use the generic variant.

Signed-off-by: Thomas Gleixner 

diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h
deleted file mode 100644
index 187ce59aea28..
--- a/arch/x86/include/asm/irq_regs.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Per-cpu current frame pointer - the location of the last exception frame on
- * the stack, stored in the per-cpu area.
- *
- * Jeremy Fitzhardinge 
- */
-#ifndef _ASM_X86_IRQ_REGS_H
-#define _ASM_X86_IRQ_REGS_H
-
-#include 
-
-#define ARCH_HAS_OWN_IRQ_REGS
-
-DECLARE_PER_CPU(struct pt_regs *, irq_regs);
-
-static inline struct pt_regs *get_irq_regs(void)
-{
-   return __this_cpu_read(irq_regs);
-}
-
-static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
-{
-   struct pt_regs *old_regs;
-
-   old_regs = get_irq_regs();
-   __this_cpu_write(irq_regs, new_regs);
-
-   return old_regs;
-}
-
-#endif /* _ASM_X86_IRQ_REGS_32_H */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index c7965ff429c5..252065d32ab5 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -26,9 +26,6 @@
 DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
 EXPORT_PER_CPU_SYMBOL(irq_stat);
 
-DEFINE_PER_CPU(struct pt_regs *, irq_regs);
-EXPORT_PER_CPU_SYMBOL(irq_regs);
-
 atomic_t irq_err_count;
 
 /*



[patch V6 21/37] x86/entry: Add IRQENTRY_IRQ macro

2020-05-15 Thread Thomas Gleixner


Provide a separate IDTENTRY macro for device interrupts. Similar to
IDTENTRY_ERRORCODE with the addition of invoking irq_enter/exit_rcu() and
providing the errorcode as a 'u8' argument to the C function, which
truncates the sign extended vector number.

Signed-off-by: Thomas Gleixner 

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index ce113d5613b6..5ad4893ac31c 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -751,6 +751,20 @@ SYM_CODE_START(\asmsym)
 SYM_CODE_END(\asmsym)
 .endm
 
+.macro idtentry_irq vector cfunc
+   .p2align CONFIG_X86_L1_CACHE_SHIFT
+SYM_CODE_START_LOCAL(asm_\cfunc)
+   ASM_CLAC
+   SAVE_ALL switch_stacks=1
+   ENCODE_FRAME_POINTER
+   movl%esp, %eax
+   movlPT_ORIG_EAX(%esp), %edx /* get the vector from stack */
+   movl$-1, PT_ORIG_EAX(%esp)  /* no syscall to restart */
+   call\cfunc
+   jmp handle_exception_return
+SYM_CODE_END(asm_\cfunc)
+.endm
+
 /*
  * Include the defines which emit the idt entries which are shared
  * shared between 32 and 64 bit.
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9c8529035e58..613f606e6dbf 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -528,6 +528,20 @@ SYM_CODE_END(\asmsym)
 .endm
 
 /*
+ * Interrupt entry/exit.
+ *
+ * The interrupt stubs push (vector) onto the stack, which is the error_code
+ * position of idtentry exceptions, and jump to one of the two idtentry points
+ * (common/spurious).
+ *
+ * common_interrupt is a hotpath, align it to a cache line
+ */
+.macro idtentry_irq vector cfunc
+   .p2align CONFIG_X86_L1_CACHE_SHIFT
+   idtentry \vector asm_\cfunc \cfunc has_error_code=1
+.endm
+
+/*
  * MCE and DB exceptions
  */
 #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8)
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 1fad257372e3..5ea35a9f4562 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -163,6 +163,49 @@ __visible noinstr void func(struct pt_regs *regs)
 #define DEFINE_IDTENTRY_RAW_ERRORCODE(func)\
 __visible noinstr void func(struct pt_regs *regs, unsigned long error_code)
 
+/**
+ * DECLARE_IDTENTRY_IRQ - Declare functions for device interrupt IDT entry
+ *   points (common/spurious)
+ * @vector:Vector number (ignored for C)
+ * @func:  Function name of the entry point
+ *
+ * Maps to DECLARE_IDTENTRY_ERRORCODE()
+ */
+#define DECLARE_IDTENTRY_IRQ(vector, func) \
+   DECLARE_IDTENTRY_ERRORCODE(vector, func)
+
+/**
+ * DEFINE_IDTENTRY_IRQ - Emit code for device interrupt IDT entry points
+ * @func:  Function name of the entry point
+ *
+ * The vector number is pushed by the low level entry stub and handed
+ * to the function as error_code argument which needs to be truncated
+ * to an u8 because the push is sign extending.
+ *
+ * On 64bit idtentry_enter/exit() are invoked in the ASM entry code before
+ * and after switching to the interrupt stack. On 32bit this happens in C.
+ *
+ * irq_enter/exit_rcu() are invoked before the function body and the
+ * KVM L1D flush request is set.
+ */
+#define DEFINE_IDTENTRY_IRQ(func)  \
+static __always_inline void __##func(struct pt_regs *regs, u8 vector); \
+   \
+__visible noinstr void func(struct pt_regs *regs,  \
+   unsigned long error_code)   \
+{  \
+   idtentry_enter(regs);   \
+   instrumentation_begin();\
+   irq_enter_rcu();\
+   kvm_set_cpu_l1tf_flush_l1d();   \
+   __##func (regs, (u8)error_code);\
+   irq_exit_rcu(); \
+   lockdep_hardirq_exit(); \
+   instrumentation_end();  \
+   idtentry_exit(regs);\
+}  \
+   \
+static __always_inline void __##func(struct pt_regs *regs, u8 vector)
 
 #ifdef CONFIG_X86_64
 /**
@@ -295,6 +338,10 @@ __visible noinstr void func(struct pt_regs *regs,  
\
 #define DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func)   \
DECLARE_IDTENTRY_ERRORCODE(vector, func)
 
+/* Entries for common/spurious (device) interrupts */
+#define DECLARE_IDTENTRY_IRQ(vector, func) \
+   

[patch V6 07/37] x86/entry: Provide helpers for execute on irqstack

2020-05-15 Thread Thomas Gleixner


Device interrupt handlers and system vector handlers are executed on the
interrupt stack. The stack switch happens in the low level assembly entry
code. This conflicts with the efforts to consolidate the exit code in C to
ensure correctness vs. RCU and tracing.

As there is no way to move #DB away from IST due to the MOV SS issue, the
requirements vs. #DB and NMI for switching to the interrupt stack do not
exist anymore. The only requirement is that interrupts are disabled.

That allows to move the stack switching to C code which simplifies the
entry/exit handling further because it allows to switch stacks after
handling the entry and on exit before handling RCU, return to usermode and
kernel preemption in the same way as for regular exceptions.

The initial attempt of having the stack switching in inline ASM caused too
much headache vs. objtool and the unwinder. After analysing the use cases
it was agreed on that having the stack switch in ASM for the price of an
indirect call is acceptable as the main users are indirect call heavy
anyway and the few system vectors which are empty shells (scheduler IPI and
KVM posted interrupt vectors) can run from the regular stack.

Provide helper functions to check whether the interrupt stack is already
active and whether stack switching is required.

64 bit only for now. 32 bit has a variant of that already. Once this is
cleaned up the two implementations might be consolidated as a cleanup on
top.

Signed-off-by: Thomas Gleixner 

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 902691b35e7e..3b8da9f09297 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1106,6 +1106,45 @@ SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs)
 SYM_CODE_END(.Lbad_gs)
.previous
 
+/*
+ * rdi: New stack pointer points to the top word of the stack
+ * rsi: Function pointer
+ * rdx: Function argument (can be NULL if none)
+ */
+SYM_FUNC_START(asm_call_on_stack)
+   /*
+* Save the frame pointer unconditionally. This allows the ORC
+* unwinder to handle the stack switch.
+*/
+   pushq   %rbp
+   mov %rsp, %rbp
+
+   /*
+* The unwinder relies on the word at the top of the new stack
+* page linking back to the previous RSP.
+*/
+   mov %rsp, (%rdi)
+   mov %rdi, %rsp
+   /* Move the argument to the right place */
+   mov %rdx, %rdi
+
+1:
+   .pushsection .discard.instr_begin
+   .long 1b - .
+   .popsection
+
+   CALL_NOSPEC rsi
+
+2:
+   .pushsection .discard.instr_end
+   .long 2b - .
+   .popsection
+
+   /* Restore the previous stack pointer from RBP. */
+   leaveq
+   ret
+SYM_FUNC_END(asm_call_on_stack)
+
 /* Call softirq on interrupt stack. Interrupts are off. */
 .pushsection .text, "ax"
 SYM_FUNC_START(do_softirq_own_stack)
diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h
new file mode 100644
index ..d58135152ee8
--- /dev/null
+++ b/arch/x86/include/asm/irq_stack.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_IRQ_STACK_H
+#define _ASM_X86_IRQ_STACK_H
+
+#include 
+
+#include 
+
+#ifdef CONFIG_X86_64
+static __always_inline bool irqstack_active(void)
+{
+   return __this_cpu_read(irq_count) != -1;
+}
+
+void asm_call_on_stack(void *sp, void *func, void *arg);
+
+static __always_inline void run_on_irqstack(void *func, void *arg)
+{
+   void *tos = __this_cpu_read(hardirq_stack_ptr);
+
+   lockdep_assert_irqs_disabled();
+
+   __this_cpu_add(irq_count, 1);
+   asm_call_on_stack(tos - 8, func, arg);
+   __this_cpu_sub(irq_count, 1);
+}
+
+#else /* CONFIG_X86_64 */
+static inline bool irqstack_active(void) { return false; }
+static inline void run_on_irqstack(void *func, void *arg) { }
+#endif /* !CONFIG_X86_64 */
+
+static __always_inline bool irq_needs_irq_stack(struct pt_regs *regs)
+{
+   if (IS_ENABLED(CONFIG_X86_32))
+   return false;
+   return !user_mode(regs) && !irqstack_active();
+}
+
+#endif



[patch V6 02/37] tracing/hwlat: Split ftrace_nmi_enter/exit()

2020-05-15 Thread Thomas Gleixner
The hardware latency tracer calls into timekeeping and ends up in
various instrumentable functions which is problematic vs. the kprobe
handling especially the text poke machinery. It's invoked from
nmi_enter/exit(), i.e. non-instrumentable code.

Split it into two parts:

 1) NMI counter, only invoked on nmi_enter() and noinstr safe

 2) NMI timestamping, to be invoked from instrumentable code

Move it into the rcu is watching regions of nmi_enter/exit() even
if there is no actual RCU dependency right now but there is also
no point in having it early.

The actual split of nmi_enter/exit() is done in a separate step.

Requested-by: Steven Rostedt 
Signed-off-by: Thomas Gleixner 
---
 include/linux/ftrace_irq.h |   31 +++
 include/linux/hardirq.h|5 +++--
 kernel/trace/trace_hwlat.c |   19 ---
 3 files changed, 34 insertions(+), 21 deletions(-)

--- a/include/linux/ftrace_irq.h
+++ b/include/linux/ftrace_irq.h
@@ -4,23 +4,30 @@
 
 #ifdef CONFIG_HWLAT_TRACER
 extern bool trace_hwlat_callback_enabled;
-extern void trace_hwlat_callback(bool enter);
-#endif
+extern void trace_hwlat_count_nmi(void);
+extern void trace_hwlat_timestamp(bool enter);
 
-static inline void ftrace_nmi_enter(void)
+static __always_inline void ftrace_count_nmi(void)
 {
-#ifdef CONFIG_HWLAT_TRACER
-   if (trace_hwlat_callback_enabled)
-   trace_hwlat_callback(true);
-#endif
+   if (unlikely(trace_hwlat_callback_enabled))
+   trace_hwlat_count_nmi();
 }
 
-static inline void ftrace_nmi_exit(void)
+static __always_inline void ftrace_nmi_handler_enter(void)
 {
-#ifdef CONFIG_HWLAT_TRACER
-   if (trace_hwlat_callback_enabled)
-   trace_hwlat_callback(false);
-#endif
+   if (unlikely(trace_hwlat_callback_enabled))
+   trace_hwlat_timestamp(true);
 }
 
+static __always_inline void ftrace_nmi_handler_exit(void)
+{
+   if (unlikely(trace_hwlat_callback_enabled))
+   trace_hwlat_timestamp(false);
+}
+#else /* CONFIG_HWLAT_TRACER */
+static inline void ftrace_count_nmi(void) {}
+static inline void ftrace_nmi_handler_enter(void) {}
+static inline void ftrace_nmi_handler_exit(void) {}
+#endif
+
 #endif /* _LINUX_FTRACE_IRQ_H */
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -82,20 +82,21 @@ extern void irq_exit(void);
arch_nmi_enter();   \
printk_nmi_enter(); \
lockdep_off();  \
-   ftrace_nmi_enter(); \
BUG_ON(in_nmi() == NMI_MASK);   \
__preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);   \
rcu_nmi_enter();\
lockdep_hardirq_enter();\
+   ftrace_count_nmi(); \
+   ftrace_nmi_handler_enter(); \
} while (0)
 
 #define nmi_exit() \
do {\
+   ftrace_nmi_handler_exit();  \
lockdep_hardirq_exit(); \
rcu_nmi_exit(); \
BUG_ON(!in_nmi());  \
__preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET);   \
-   ftrace_nmi_exit();  \
lockdep_on();   \
printk_nmi_exit();  \
arch_nmi_exit();\
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -132,21 +132,26 @@ static void trace_hwlat_sample(struct hw
 }
 
 /*
+ * Count NMIs in nmi_enter(). Does not take timestamps
+ * because the timestamping callchain cannot be invoked
+ * from noinstr sections.
+ */
+noinstr void trace_hwlat_count_nmi(void)
+{
+   if (smp_processor_id() == nmi_cpu)
+   nmi_count++;
+}
+
+/*
  * Timestamping uses ktime_get_mono_fast(), the NMI safe access to
  * CLOCK_MONOTONIC.
  */
-void trace_hwlat_callback(bool enter)
+void trace_hwlat_timestamp(bool enter)
 {
-   if (smp_processor_id() != nmi_cpu)
-   return;
-
if (enter)
nmi_ts_start = ktime_get_mono_fast_ns();
else
nmi_total_ts += ktime_get_mono_fast_ns() - nmi_ts_start;
-
-   if (enter)
-   nmi_count++;
 }
 
 /**



[patch V6 05/37] genirq: Provide irq_enter/exit_rcu()

2020-05-15 Thread Thomas Gleixner


irq_enter()/exit() include the RCU handling. To properly separate the RCU
handling provide variants which contain only the non-RCU related
functionality.

Signed-off-by: Thomas Gleixner 

diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index a4c5a1df067e..f6f25fab34cb 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -43,7 +43,11 @@ extern void rcu_nmi_exit(void);
 /*
  * Enter irq context (on NO_HZ, update jiffies):
  */
-extern void irq_enter(void);
+void irq_enter(void);
+/*
+ * Like irq_enter(), but RCU is already watching.
+ */
+void irq_enter_rcu(void);
 
 /*
  * Exit irq context without processing softirqs:
@@ -58,7 +62,12 @@ extern void irq_enter(void);
 /*
  * Exit irq context and process softirqs if needed:
  */
-extern void irq_exit(void);
+void irq_exit(void);
+
+/*
+ * Like irq_exit(), but return with RCU watching.
+ */
+void irq_exit_rcu(void);
 
 #ifndef arch_nmi_enter
 #define arch_nmi_enter()   do { } while (0)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index a47c6dd57452..beb8e3a66c7c 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -339,12 +339,11 @@ asmlinkage __visible void do_softirq(void)
local_irq_restore(flags);
 }
 
-/*
- * Enter an interrupt context.
+/**
+ * irq_enter_rcu - Enter an interrupt context with RCU watching
  */
-void irq_enter(void)
+void irq_enter_rcu(void)
 {
-   rcu_irq_enter();
if (is_idle_task(current) && !in_interrupt()) {
/*
 * Prevent raise_softirq from needlessly waking up ksoftirqd
@@ -354,10 +353,18 @@ void irq_enter(void)
tick_irq_enter();
_local_bh_enable();
}
-
__irq_enter();
 }
 
+/**
+ * irq_enter - Enter an interrupt context including RCU update
+ */
+void irq_enter(void)
+{
+   rcu_irq_enter();
+   irq_enter_rcu();
+}
+
 static inline void invoke_softirq(void)
 {
if (ksoftirqd_running(local_softirq_pending()))
@@ -397,10 +404,12 @@ static inline void tick_irq_exit(void)
 #endif
 }
 
-/*
- * Exit an interrupt context. Process softirqs if needed and possible:
+/**
+ * irq_exit_rcu() - Exit an interrupt context without updating RCU
+ *
+ * Also processes softirqs if needed and possible.
  */
-void irq_exit(void)
+void irq_exit_rcu(void)
 {
 #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
local_irq_disable();
@@ -413,6 +422,16 @@ void irq_exit(void)
invoke_softirq();
 
tick_irq_exit();
+}
+
+/**
+ * irq_exit - Exit an interrupt context, update RCU and lockdep
+ *
+ * Also processes softirqs if needed and possible.
+ */
+void irq_exit(void)
+{
+   irq_exit_rcu();
rcu_irq_exit();
 /* must be last! */
lockdep_hardirq_exit();



[patch V6 13/37] x86/entry: Switch page fault exception to IDTENTRY_RAW

2020-05-15 Thread Thomas Gleixner


Convert page fault exceptions to IDTENTRY_RAW:
  - Implement the C entry point with DEFINE_IDTENTRY_RAW
  - Add the CR2 read into the exception handler
  - Add the idtentry_enter/exit_cond_rcu() invocations in
in the regular page fault handler and use the regular
idtentry_enter/exit() for the async PF part.
  - Emit the ASM stub with DECLARE_IDTENTRY_RAW
  - Remove the ASM idtentry in 64bit
  - Remove the CR2 read from 64bit
  - Remove the open coded ASM entry code in 32bit
  - Fixup the XEN/PV code
  - Remove the old prototypes

No functional change.

Signed-off-by: Thomas Gleixner 

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 6ac890d5c9d8..3c3ca6fbe58e 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1395,36 +1395,6 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vector, 
HYPERV_STIMER0_VECTOR,
 
 #endif /* CONFIG_HYPERV */
 
-SYM_CODE_START(page_fault)
-   ASM_CLAC
-   pushl   $do_page_fault
-   jmp common_exception_read_cr2
-SYM_CODE_END(page_fault)
-
-SYM_CODE_START_LOCAL_NOALIGN(common_exception_read_cr2)
-   /* the function address is in %gs's slot on the stack */
-   SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
-
-   ENCODE_FRAME_POINTER
-
-   /* fixup %gs */
-   GS_TO_REG %ecx
-   movlPT_GS(%esp), %edi
-   REG_TO_PTGS %ecx
-   SET_KERNEL_GS %ecx
-
-   GET_CR2_INTO(%ecx)  # might clobber %eax
-
-   /* fixup orig %eax */
-   movlPT_ORIG_EAX(%esp), %edx # get the error code
-   movl$-1, PT_ORIG_EAX(%esp)  # no syscall to restart
-
-   TRACE_IRQS_OFF
-   movl%esp, %eax  # pt_regs pointer
-   CALL_NOSPEC edi
-   jmp ret_from_exception
-SYM_CODE_END(common_exception_read_cr2)
-
 SYM_CODE_START_LOCAL_NOALIGN(common_exception)
/* the function address is in %gs's slot on the stack */
SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1d700bde232b..e061c48d0ae2 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -506,15 +506,6 @@ SYM_CODE_END(spurious_entries_start)
callerror_entry
UNWIND_HINT_REGS
 
-   .if \vector == X86_TRAP_PF
-   /*
-* Store CR2 early so subsequent faults cannot clobber it. Use 
R12 as
-* intermediate storage as RDX can be clobbered in 
enter_from_user_mode().
-* GET_CR2_INTO can clobber RAX.
-*/
-   GET_CR2_INTO(%r12);
-   .endif
-
.if \sane == 0
TRACE_IRQS_OFF
 
@@ -533,10 +524,6 @@ SYM_CODE_END(spurious_entries_start)
movq$-1, ORIG_RAX(%rsp) /* no syscall to restart */
.endif
 
-   .if \vector == X86_TRAP_PF
-   movq%r12, %rdx  /* Move CR2 into 3rd argument */
-   .endif
-
call\cfunc
 
.if \sane == 0
@@ -1060,12 +1047,6 @@ apicinterrupt IRQ_WORK_VECTOR
irq_work_interrupt  smp_irq_work_interrupt
 #endif
 
 /*
- * Exception entry points.
- */
-
-idtentry   X86_TRAP_PF page_fault  do_page_fault   
has_error_code=1
-
-/*
  * Reload gs selector with exception handling
  * edi:  new selector
  *
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index ccd572fd6583..4e4975df7ce0 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -364,7 +364,8 @@ DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_GP, 
exc_general_protection);
 DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_AC,exc_alignment_check);
 
 /* Raw exception entries which need extra work */
-DECLARE_IDTENTRY_RAW(X86_TRAP_BP,  exc_int3);
+DECLARE_IDTENTRY_RAW(X86_TRAP_BP,  exc_int3);
+DECLARE_IDTENTRY_RAW_ERRORCODE(X86_TRAP_PF,exc_page_fault);
 
 #ifdef CONFIG_X86_MCE
 DECLARE_IDTENTRY_MCE(X86_TRAP_MC,  exc_machine_check);
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index f5a2e438a878..d7de360eec74 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -9,17 +9,6 @@
 #include 
 #include/* TRAP_TRACE, ... */
 
-#define dotraplinkage __visible
-
-asmlinkage void page_fault(void);
-asmlinkage void async_page_fault(void);
-
-#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
-asmlinkage void xen_page_fault(void);
-#endif
-
-dotraplinkage void do_page_fault(struct pt_regs *regs, unsigned long 
error_code, unsigned long address);
-
 #ifdef CONFIG_X86_64
 asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs);
 asmlinkage __visible notrace
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index df66e9dc2087..b4bd866568ee 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -59,7 +59,7 @@ static const __initconst struct idt_data 

[patch V6 24/37] x86/entry: Convert APIC interrupts to IDTENTRY_SYSVEC

2020-05-15 Thread Thomas Gleixner


Convert APIC interrupts to IDTENTRY_SYSVEC
  - Implement the C entry point with DEFINE_IDTENTRY_SYSVEC
  - Emit the ASM stub with DECLARE_IDTENTRY_SYSVEC
  - Remove the ASM idtentries in 64bit
  - Remove the BUILD_INTERRUPT entries in 32bit
  - Remove the old prototypes

No functional change.

Signed-off-by: Thomas Gleixner 

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 2c0eb5c2100a..6e629e5d5047 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -965,9 +965,6 @@ apicinterrupt3 REBOOT_VECTOR			reboot_interrupt	smp_reboot_interrupt
 apicinterrupt3 UV_BAU_MESSAGE  uv_bau_message_intr1
uv_bau_message_interrupt
 #endif
 
-apicinterrupt LOCAL_TIMER_VECTOR   apic_timer_interrupt
smp_apic_timer_interrupt
-apicinterrupt X86_PLATFORM_IPI_VECTOR  x86_platform_ipi
smp_x86_platform_ipi
-
 #ifdef CONFIG_HAVE_KVM
 apicinterrupt3 POSTED_INTR_VECTOR  kvm_posted_intr_ipi 
smp_kvm_posted_intr_ipi
 apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR   kvm_posted_intr_wakeup_ipi  
smp_kvm_posted_intr_wakeup_ipi
@@ -992,9 +989,6 @@ apicinterrupt CALL_FUNCTION_VECTOR  
call_function_interrupt smp_call_function_i
 apicinterrupt RESCHEDULE_VECTORreschedule_interrupt
smp_reschedule_interrupt
 #endif
 
-apicinterrupt ERROR_APIC_VECTORerror_interrupt 
smp_error_interrupt
-apicinterrupt SPURIOUS_APIC_VECTOR spurious_apic_interrupt 
smp_spurious_apic_interrupt
-
 #ifdef CONFIG_IRQ_WORK
 apicinterrupt IRQ_WORK_VECTOR  irq_work_interrupt  
smp_irq_work_interrupt
 #endif
diff --git a/arch/x86/include/asm/entry_arch.h 
b/arch/x86/include/asm/entry_arch.h
index cd57ce6134c9..d10d6d807e73 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -33,11 +33,6 @@ BUILD_INTERRUPT(kvm_posted_intr_nested_ipi, 
POSTED_INTR_NESTED_VECTOR)
  */
 #ifdef CONFIG_X86_LOCAL_APIC
 
-BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
-BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
-BUILD_INTERRUPT(spurious_apic_interrupt,SPURIOUS_APIC_VECTOR)
-BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
-
 #ifdef CONFIG_IRQ_WORK
 BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR)
 #endif
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 3213d36b92d3..1765993360e7 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,16 +29,12 @@
 #include 
 
 /* Interrupt handlers registered during init_IRQ */
-extern asmlinkage void apic_timer_interrupt(void);
-extern asmlinkage void x86_platform_ipi(void);
 extern asmlinkage void kvm_posted_intr_ipi(void);
 extern asmlinkage void kvm_posted_intr_wakeup_ipi(void);
 extern asmlinkage void kvm_posted_intr_nested_ipi(void);
-extern asmlinkage void error_interrupt(void);
 extern asmlinkage void irq_work_interrupt(void);
 extern asmlinkage void uv_bau_message_intr1(void);
 
-extern asmlinkage void spurious_apic_interrupt(void);
 extern asmlinkage void thermal_interrupt(void);
 extern asmlinkage void reschedule_interrupt(void);
 
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 43658fcedae8..6154d1e75fce 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -567,6 +567,14 @@ DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER,   
common_interrupt);
 DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER,   spurious_interrupt);
 #endif
 
+/* System vector entry points */
+#ifdef CONFIG_X86_LOCAL_APIC
+DECLARE_IDTENTRY_SYSVEC(ERROR_APIC_VECTOR, sysvec_error_interrupt);
+DECLARE_IDTENTRY_SYSVEC(SPURIOUS_APIC_VECTOR,  
sysvec_spurious_apic_interrupt);
+DECLARE_IDTENTRY_SYSVEC(LOCAL_TIMER_VECTOR,
sysvec_apic_timer_interrupt);
+DECLARE_IDTENTRY_SYSVEC(X86_PLATFORM_IPI_VECTOR,   
sysvec_x86_platform_ipi);
+#endif
+
 #undef X86_TRAP_OTHER
 
 #endif
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index ae10083b7631..112c85673179 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -44,7 +44,6 @@ extern void __init init_IRQ(void);
 void arch_trigger_cpumask_backtrace(const struct cpumask *mask,
bool exclude_self);
 
-extern __visible void smp_x86_platform_ipi(struct pt_regs *regs);
 #define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace
 #endif
 
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 97e6945bfce8..933934c3e173 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -40,9 +40,6 @@ asmlinkage void smp_threshold_interrupt(struct pt_regs *regs);
 asmlinkage void smp_deferred_error_interrupt(struct pt_regs *regs);
 #endif
 
-void smp_apic_timer_interrupt(struct pt_regs *regs);
-void 

[patch V6 30/37] x86/entry: Convert reschedule interrupt to IDTENTRY_RAW

2020-05-15 Thread Thomas Gleixner


The scheduler IPI does not need the full interrupt entry handling logic
when the entry is from kernel mode.

Even if tracing is enabled the only requirement is that RCU is watching and
preempt_count has the hardirq bit on.

The NOHZ tick state does not have to be adjusted. If the tick is not
running then the CPU is in idle and the idle exit will restore the
tick. Softinterrupts are not raised here, so handling them on return is not
required either.

User mode entry must go through the regular entry path as it will invoke
the scheduler on return so context tracking needs to be in the correct
state.

Use IDTENTRY_RAW and the RCU conditional variants of idtentry_enter/exit()
to guarantee that RCU is watching even if the IPI hits a RCU idle section.

Remove the tracepoint static key conditional which is incomplete
vs. tracing anyway because e.g. ack_APIC_irq() calls out into
instrumentable code.

Avoid the overhead of irq time accounting and introduce variants of
__irq_enter/exit() so instrumentation observes the correct preempt count
state.

Spare the switch to the interrupt stack as the IPI is not going to use only
a minimal amount of stack space.

Signed-off-by: Thomas Gleixner 

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 943ffd64363a..38dc4d1f7a7b 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -956,10 +956,6 @@ apicinterrupt3 \num \sym \do_sym
 POP_SECTION_IRQENTRY
 .endm
 
-#ifdef CONFIG_SMP
-apicinterrupt RESCHEDULE_VECTORreschedule_interrupt
smp_reschedule_interrupt
-#endif
-
 /*
  * Reload gs selector with exception handling
  * edi:  new selector
diff --git a/arch/x86/include/asm/entry_arch.h 
b/arch/x86/include/asm/entry_arch.h
index a01bb74244ac..3e841ed5c17a 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -10,6 +10,3 @@
  * is no hardware IRQ pin equivalent for them, they are triggered
  * through the ICC by us (IPIs)
  */
-#ifdef CONFIG_SMP
-BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
-#endif
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index fd5e7c8825e1..74c12437401e 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -28,9 +28,6 @@
 #include 
 #include 
 
-/* Interrupt handlers registered during init_IRQ */
-extern asmlinkage void reschedule_interrupt(void);
-
 #ifdef CONFIG_X86_LOCAL_APIC
 struct irq_data;
 struct pci_dev;
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 1bedae4f297a..a380303089cd 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -576,6 +576,7 @@ DECLARE_IDTENTRY_SYSVEC(X86_PLATFORM_IPI_VECTOR,
sysvec_x86_platform_ipi);
 #endif
 
 #ifdef CONFIG_SMP
+DECLARE_IDTENTRY(RESCHEDULE_VECTOR,sysvec_reschedule_ipi);
 DECLARE_IDTENTRY_SYSVEC(IRQ_MOVE_CLEANUP_VECTOR,   
sysvec_irq_move_cleanup);
 DECLARE_IDTENTRY_SYSVEC(REBOOT_VECTOR, sysvec_reboot);
 DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR,   
sysvec_call_function_single);
diff --git a/arch/x86/include/asm/trace/common.h 
b/arch/x86/include/asm/trace/common.h
index 57c8da027d99..f0f9bcdb74d9 100644
--- a/arch/x86/include/asm/trace/common.h
+++ b/arch/x86/include/asm/trace/common.h
@@ -5,12 +5,8 @@
 DECLARE_STATIC_KEY_FALSE(trace_pagefault_key);
 #define trace_pagefault_enabled()			\
	static_branch_unlikely(&trace_pagefault_key)
-DECLARE_STATIC_KEY_FALSE(trace_resched_ipi_key);
-#define trace_resched_ipi_enabled()			\
-	static_branch_unlikely(&trace_resched_ipi_key)
 #else
 static inline bool trace_pagefault_enabled(void) { return false; }
-static inline bool trace_resched_ipi_enabled(void) { return false; }
 #endif
 
 #endif
diff --git a/arch/x86/include/asm/trace/irq_vectors.h 
b/arch/x86/include/asm/trace/irq_vectors.h
index 33b9d0f0aafe..88e7f0f3bf62 100644
--- a/arch/x86/include/asm/trace/irq_vectors.h
+++ b/arch/x86/include/asm/trace/irq_vectors.h
@@ -10,9 +10,6 @@
 
 #ifdef CONFIG_X86_LOCAL_APIC
 
-extern int trace_resched_ipi_reg(void);
-extern void trace_resched_ipi_unreg(void);
-
 DECLARE_EVENT_CLASS(x86_irq_vector,
 
TP_PROTO(int vector),
@@ -37,18 +34,6 @@ DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \
TP_PROTO(int vector),   \
TP_ARGS(vector), NULL, NULL);
 
-#define DEFINE_RESCHED_IPI_EVENT(name) \
-DEFINE_EVENT_FN(x86_irq_vector, name##_entry,  \
-   TP_PROTO(int vector),   \
-   TP_ARGS(vector),\
-   trace_resched_ipi_reg,  \
-   trace_resched_ipi_unreg);   \
-DEFINE_EVENT_FN(x86_irq_vector, name##_exit,   \
-   TP_PROTO(int vector),   \
-   TP_ARGS(vector),\
-   trace_resched_ipi_reg,  \
-   trace_resched_ipi_unreg);
-
 /*
  * local_timer - called when 

  1   2   3   4   5   6   7   8   9   10   >