Re: [PATCH] softirq: add irq off checking for __raise_softirq_irqoff

2020-08-12 Thread Peter Zijlstra
On Thu, Aug 06, 2020 at 12:07:29PM +0800, Jiafei Pan wrote:
> __raise_softirq_irqoff will update per-CPU mask of pending softirqs,
> it need to be called in irq disabled context in order to keep it atomic
> operation, otherwise it will be interrupted by hardware interrupt,
> and per-CPU softirqs pending mask will be corrupted, the result is
> there will be unexpected issue, for example hrtimer soft irq will
> be losed and soft hrtimer will never be expire and handled.
> 
> Adding irqs disabled checking here to provide warning in irqs enabled
> context.
> 
> Signed-off-by: Jiafei Pan 
> ---
>  kernel/softirq.c | 5 +
>  1 file changed, 5 insertions(+)
> 
> diff --git a/kernel/softirq.c b/kernel/softirq.c
> index bf88d7f62433..11f61e54a3ae 100644
> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -481,6 +481,11 @@ void raise_softirq(unsigned int nr)
>  
>  void __raise_softirq_irqoff(unsigned int nr)
>  {
> + /* This function can only be called in irq disabled context,
> +  * otherwise or_softirq_pending will be interrupted by hardware
> +  * interrupt, so that there will be unexpected issue.
> +  */

Comment style is wrong, also I'm not sure the comment is really
helpfull.

> + WARN_ON_ONCE(!irqs_disabled());

lockdep_assert_irqs_disabled();

>   trace_softirq_raise(nr);
>   or_softirq_pending(1UL << nr);
>  }


Re: [PATCH v3 2/2] powerpc/uaccess: Add pre-update addressing to __get_user_asm() and __put_user_asm()

2020-08-12 Thread Christophe Leroy




Le 12/08/2020 à 21:37, Segher Boessenkool a écrit :

On Wed, Aug 12, 2020 at 12:25:17PM +, Christophe Leroy wrote:

Enable pre-update addressing mode in __get_user_asm() and __put_user_asm()

Signed-off-by: Christophe Leroy 
---
v3: new, splited out from patch 1.


It still looks fine to me, you can keep my Reviewed-by: :-)



Ah yes thanks, forgot it when I commited it.

Reviewed-by: Segher Boessenkool 

Christophe


Re: [PATCH v4 1/7] pinctrl: qcom: Add msmgpio irqchip flags

2020-08-12 Thread Maulik Shah

Hi,

On 8/12/2020 1:02 AM, Stephen Boyd wrote:

Can the subject be more specific? "pinctrl: qcom: Set IRQCHIP_SET_TYPE_MASKED 
flag"?


Sure i can update subject in v5.

Thanks,
Maulik



Quoting Maulik Shah (2020-08-10 04:20:54)

Add irqchip specific flags for msmgpio irqchip to mask non wakeirqs
during suspend and mask before setting irq type.

Masking before changing type should make sure any spurious interrupt
is not detected during this operation.

Fixes: e35a6ae0eb3a ("pinctrl/msm: Setup GPIO chip in hierarchy")
Acked-by: Linus Walleij 
Signed-off-by: Maulik Shah 
---


--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of 
Code Aurora Forum, hosted by The Linux Foundation



Re: [PATCH v2] riscv: Setup exception vector for nommu platform

2020-08-12 Thread Damien Le Moal
On 2020/08/13 12:40, Qiu Wenbo wrote:
> Exception vector is missing on nommu platform and that is an issue.
> This patch is tested in Sipeed Maix Bit Dev Board.
> 
> Fixes: 79b1feba5455 ("RISC-V: Setup exception vector early")
> Suggested-by: Anup Patel 
> Suggested-by: Atish Patra 
> Signed-off-by: Qiu Wenbo 

Please add a cc stable #5.8 tag. Kendryte support is in 5.8 stable.

> ---
>  arch/riscv/kernel/head.S | 25 +
>  1 file changed, 17 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
> index d0c5c316e9bb..0a4e81b8dc79 100644
> --- a/arch/riscv/kernel/head.S
> +++ b/arch/riscv/kernel/head.S
> @@ -77,16 +77,10 @@ relocate:
>   csrw CSR_SATP, a0
>  .align 2
>  1:
> - /* Set trap vector to exception handler */
> - la a0, handle_exception
> + /* Set trap vector to spin forever to help debug */
> + la a0, .Lsecondary_park
>   csrw CSR_TVEC, a0
>  
> - /*
> -  * Set sup0 scratch register to 0, indicating to exception vector that
> -  * we are presently executing in kernel.
> -  */
> - csrw CSR_SCRATCH, zero
> -
>   /* Reload the global pointer */
>  .option push
>  .option norelax
> @@ -144,9 +138,23 @@ secondary_start_common:
>   la a0, swapper_pg_dir
>   call relocate
>  #endif
> + call setup_trap_vector
>   tail smp_callin
>  #endif /* CONFIG_SMP */
>  
> +.align 2
> +setup_trap_vector:
> + /* Set trap vector to exception handler */
> + la a0, handle_exception
> + csrw CSR_TVEC, a0
> +
> + /*
> +  * Set sup0 scratch register to 0, indicating to exception vector that
> +  * we are presently executing in kernel.
> +  */
> + csrw CSR_SCRATCH, zero
> + ret
> +
>  .Lsecondary_park:
>   /* We lack SMP support or have too many harts, so park this hart */
>   wfi
> @@ -240,6 +248,7 @@ clear_bss_done:
>   call relocate
>  #endif /* CONFIG_MMU */
>  
> + call setup_trap_vector
>   /* Restore C environment */
>   la tp, init_task
>   sw zero, TASK_TI_CPU(tp)
> 


-- 
Damien Le Moal
Western Digital Research


Re: WARN_ON_ONCE(1) in iomap_dio_actor()

2020-08-12 Thread Dave Chinner
On Mon, Aug 10, 2020 at 10:03:03PM -0400, Qian Cai wrote:
> On Sun, Jul 26, 2020 at 04:24:12PM +0100, Christoph Hellwig wrote:
> > On Fri, Jul 24, 2020 at 02:24:32PM -0400, Qian Cai wrote:
> > > On Fri, Jun 19, 2020 at 05:17:47PM -0700, Matthew Wilcox wrote:
> > > > On Fri, Jun 19, 2020 at 05:17:50PM -0400, Qian Cai wrote:
> > > > > Running a syscall fuzzer by a normal user could trigger this,
> > > > > 
> > > > > [55649.32][T515839] WARNING: CPU: 6 PID: 515839 at 
> > > > > fs/iomap/direct-io.c:391 iomap_dio_actor+0x29c/0x420
> > > > ...
> > > > > 371 static loff_t
> > > > > 372 iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
> > > > > 373 void *data, struct iomap *iomap, struct iomap 
> > > > > *srcmap)
> > > > > 374 {
> > > > > 375 struct iomap_dio *dio = data;
> > > > > 376
> > > > > 377 switch (iomap->type) {
> > > > > 378 case IOMAP_HOLE:
> > > > > 379 if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
> > > > > 380 return -EIO;
> > > > > 381 return iomap_dio_hole_actor(length, dio);
> > > > > 382 case IOMAP_UNWRITTEN:
> > > > > 383 if (!(dio->flags & IOMAP_DIO_WRITE))
> > > > > 384 return iomap_dio_hole_actor(length, dio);
> > > > > 385 return iomap_dio_bio_actor(inode, pos, length, 
> > > > > dio, iomap);
> > > > > 386 case IOMAP_MAPPED:
> > > > > 387 return iomap_dio_bio_actor(inode, pos, length, 
> > > > > dio, iomap);
> > > > > 388 case IOMAP_INLINE:
> > > > > 389 return iomap_dio_inline_actor(inode, pos, length, 
> > > > > dio, iomap);
> > > > > 390 default:
> > > > > 391 WARN_ON_ONCE(1);
> > > > > 392 return -EIO;
> > > > > 393 }
> > > > > 394 }
> > > > > 
> > > > > Could that be iomap->type == IOMAP_DELALLOC ? Looking throught the 
> > > > > logs,
> > > > > it contains a few pread64() calls until this happens,
> > > > 
> > > > It _shouldn't_ be able to happen.  XFS writes back ranges which exist
> > > > in the page cache upon seeing an O_DIRECT I/O.  So it's not supposed to
> > > > be possible for there to be an extent which is waiting for the contents
> > > > of the page cache to be written back.
> > > 
> > > Okay, it is IOMAP_DELALLOC. We have,
> > 
> > Can you share the fuzzer?  If we end up with delalloc space here we
> > probably need to fix a bug in the cache invalidation code.
> 
> Here is a simple reproducer (I believe it can also be reproduced using 
> xfstests
> generic/503 on a plain xfs without DAX when SCRATCH_MNT == TEST_DIR),
> 
> # git clone https://gitlab.com/cailca/linux-mm
> # cd linux-mm; make
> # ./random 14

Ok:

file.fd_write = safe_open("./testfile", O_RDWR|O_CREAT);

file.fd_read = safe_open("./testfile", O_RDWR|O_CREAT|O_DIRECT);

file.ptr = safe_mmap(NULL, fsize, PROT_READ|PROT_WRITE, MAP_SHARED,
file.fd_write, 0);

So this is all IO to the same inode

and you loop

while !done {

do {
rc = pread(file.fd_read, file.ptr + read, fsize - read,
read);
if (rc > 0)
read += rc;
} while (rc > 0);

rc = safe_fallocate(file.fd_write,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
0, fsize);
}

On two threads at once?

So, essentially, you do a DIO read into a mmap()d range from the
same file, with DIO read ascending and the mmap() range descending,
then once that is done you hole punch the file and do it again?

IOWs, this is a racing page_mkwrite()/DIO read workload, and the
moment the two threads hit the same block of the file with a
DIO read and a page_mkwrite at the same time, it throws a warning.

Well, that's completely expected behaviour. DIO is not serialised
against mmap() access at all, and so if the page_mkwrite occurs
between the writeback and the iomap_apply() call in the dio path,
then it will see the delalloc block taht the page-mkwrite allocated.

No sane application would ever do this, it's behaviour as expected,
so I don't think there's anything to care about here.

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com


[PATCH] rtc: cmos: initialize rtc time when reading alarm

2020-08-12 Thread Victor Ding
cmos_read_alarm() may leave certain fields of a struct rtc_time
untouched; therefore, these fields contain garbage if not properly
initialized, leading to inconsistent values when converting into
time64_t.
This patch to set all fields of a struct rtc_time to -1 before calling
cmos_read_alarm().

Signed-off-by: Victor Ding 
---

 drivers/rtc/rtc-cmos.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c
index bcc96ab7793f..c99af567780d 100644
--- a/drivers/rtc/rtc-cmos.c
+++ b/drivers/rtc/rtc-cmos.c
@@ -1006,6 +1006,7 @@ static int cmos_suspend(struct device *dev)
enable_irq_wake(cmos->irq);
}
 
+   memset(>saved_wkalrm.time, -1, sizeof(struct rtc_time));
cmos_read_alarm(dev, >saved_wkalrm);
 
dev_dbg(dev, "suspend%s, ctrl %02x\n",
@@ -1054,6 +1055,7 @@ static void cmos_check_wkalrm(struct device *dev)
return;
}
 
+   memset(_alarm.time, -1, sizeof(struct rtc_time));
cmos_read_alarm(dev, _alarm);
t_current_expires = rtc_tm_to_time64(_alarm.time);
t_saved_expires = rtc_tm_to_time64(>saved_wkalrm.time);
-- 
2.28.0.236.gb10cc79966-goog



Re: [RFC 2/7] KVM: VMX: Expose IA32_PKRS MSR

2020-08-12 Thread Chenyi Qiang




On 8/13/2020 5:21 AM, Jim Mattson wrote:

On Fri, Aug 7, 2020 at 1:46 AM Chenyi Qiang  wrote:


Protection Keys for Supervisor Pages (PKS) uses IA32_PKRS MSR (PKRS) at
index 0x6E1 to allow software to manage supervisor protection key
rights. For performance consideration, PKRS intercept will be disabled
so that the guest can access the PKRS without VM exits.
PKS introduces dedicated control fields in VMCS to switch PKRS, which
only does the retore part. In addition, every VM exit saves PKRS into
the guest-state area in VMCS, while VM enter won't save the host value
due to the expectation that the host won't change the MSR often. Update
the host's value in VMCS manually if the MSR has been changed by the
kernel since the last time the VMCS was run.
The function get_current_pkrs() in arch/x86/mm/pkeys.c exports the
per-cpu variable pkrs_cache to avoid frequent rdmsr of PKRS.

Signed-off-by: Chenyi Qiang 
---



diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 11e4df560018..df2c2e733549 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -289,6 +289,7 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
 dest->ds_sel = src->ds_sel;
 dest->es_sel = src->es_sel;
  #endif
+   dest->pkrs = src->pkrs;


Why isn't this (and other PKRS code) inside the #ifdef CONFIG_X86_64?
PKRS isn't usable outside of long mode, is it?



Yes, I'm also thinking about whether to put all pks code into 
CONFIG_X86_64. The kernel implementation also wrap its pks code inside 
CONFIG_ARCH_HAS_SUPERVISOR_PKEYS which has dependency with CONFIG_X86_64.
However, maybe this can help when host kernel disable PKS but the guest 
enable it. What do you think about this?




  }

  static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 7a3675fddec2..39ec3d0c844b 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -40,6 +40,7 @@ struct vmcs_host_state {
  #ifdef CONFIG_X86_64
 u16   ds_sel, es_sel;
  #endif
+   u32   pkrs;


One thing that does seem odd throughout is that PKRS is a 64-bit
resource, but we store and pass around only 32-bits. Yes, the top 32
bits are currently reserved, but the same can be said of, say, cr4, a
few lines above this. Yet, we store and pass around cr4 as 64-bits.
How do we decide?



IMO, If the high part of PKRS is zero-reserved, it's OK to use u32. I 
define it as u32 just to follow the definition pkrs_cache in kernel code.



  };

  struct vmcs_controls_shadow {



@@ -1163,6 +1164,20 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
  */
 host_state->ldt_sel = kvm_read_ldt();

+   /*
+* Update the host pkrs vmcs field before vcpu runs.
+* The setting of VM_EXIT_LOAD_IA32_PKRS can ensure
+* kvm_cpu_cap_has(X86_FEATURE_PKS) &&
+* guest_cpuid_has(vcpu, X86_FEATURE_VMX).
+*/
+   if (vm_exit_controls_get(vmx) & VM_EXIT_LOAD_IA32_PKRS) {
+   host_pkrs = get_current_pkrs();
+   if (unlikely(host_pkrs != host_state->pkrs)) {
+   vmcs_write64(HOST_IA32_PKRS, host_pkrs);
+   host_state->pkrs = host_pkrs;
+   }
+   }
+
  #ifdef CONFIG_X86_64
 savesegment(ds, host_state->ds_sel);
 savesegment(es, host_state->es_sel);
@@ -1951,6 +1966,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info)
 else
 msr_info->data = vmx->pt_desc.guest.addr_a[index / 2];
 break;
+   case MSR_IA32_PKRS:
+   if (!kvm_cpu_cap_has(X86_FEATURE_PKS) ||
+   (!msr_info->host_initiated &&
+   !guest_cpuid_has(vcpu, X86_FEATURE_PKS)))


Could this be simplified if we required
kvm_cpu_cap_has(X86_FEATURE_PKS) as a prerequisite for
guest_cpuid_has(vcpu, X86_FEATURE_PKS)? If not, what is the expected
behavior if guest_cpuid_has(vcpu, X86_FEATURE_PKS) is true and
kvm_cpu_cap_has(X86_FEATURE_PKS)  is false?



Yes, kvm_cpu_cap_has() is a prerequisite for guest_cpuid_has() as we 
have done the check in vmx_cpuid_update(). Here I add the 
kvm_cpu_cap_has() check to ensure the vmcs_read(GUEST_IA32_PKRS) can 
execute correctly in case of the userspace access.



+   return 1;
+   msr_info->data = vmcs_read64(GUEST_IA32_PKRS);
+   break;
 case MSR_TSC_AUX:
 if (!msr_info->host_initiated &&
 !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))



@@ -7230,6 +7267,26 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
 vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
  }

+static void vmx_update_pkrs_cfg(struct kvm_vcpu *vcpu)
+{
+   struct vcpu_vmx *vmx = to_vmx(vcpu);
+   unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+   bool pks_supported = 

Re: [PATCH v2] riscv: Setup exception vector for nommu platform

2020-08-12 Thread Atish Patra
On Wed, Aug 12, 2020 at 8:40 PM Qiu Wenbo  wrote:
>
> Exception vector is missing on nommu platform and that is an issue.
> This patch is tested in Sipeed Maix Bit Dev Board.
>
> Fixes: 79b1feba5455 ("RISC-V: Setup exception vector early")
> Suggested-by: Anup Patel 
> Suggested-by: Atish Patra 
> Signed-off-by: Qiu Wenbo 
> ---
>  arch/riscv/kernel/head.S | 25 +
>  1 file changed, 17 insertions(+), 8 deletions(-)
>
> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
> index d0c5c316e9bb..0a4e81b8dc79 100644
> --- a/arch/riscv/kernel/head.S
> +++ b/arch/riscv/kernel/head.S
> @@ -77,16 +77,10 @@ relocate:
> csrw CSR_SATP, a0
>  .align 2
>  1:
> -   /* Set trap vector to exception handler */
> -   la a0, handle_exception
> +   /* Set trap vector to spin forever to help debug */
> +   la a0, .Lsecondary_park
> csrw CSR_TVEC, a0
>
> -   /*
> -* Set sup0 scratch register to 0, indicating to exception vector that
> -* we are presently executing in kernel.
> -*/
> -   csrw CSR_SCRATCH, zero
> -
> /* Reload the global pointer */
>  .option push
>  .option norelax
> @@ -144,9 +138,23 @@ secondary_start_common:
> la a0, swapper_pg_dir
> call relocate
>  #endif
> +   call setup_trap_vector
> tail smp_callin
>  #endif /* CONFIG_SMP */
>
> +.align 2
> +setup_trap_vector:
> +   /* Set trap vector to exception handler */
> +   la a0, handle_exception
> +   csrw CSR_TVEC, a0
> +
> +   /*
> +* Set sup0 scratch register to 0, indicating to exception vector that
> +* we are presently executing in kernel.
> +*/
> +   csrw CSR_SCRATCH, zero
> +   ret
> +
>  .Lsecondary_park:
> /* We lack SMP support or have too many harts, so park this hart */
> wfi
> @@ -240,6 +248,7 @@ clear_bss_done:
> call relocate
>  #endif /* CONFIG_MMU */
>
> +   call setup_trap_vector
> /* Restore C environment */
> la tp, init_task
> sw zero, TASK_TI_CPU(tp)
> --
> 2.28.0
>
@palmer: Can you queue this for the next part2 PR ?

Reviewed-by: Atish Patra 
-- 
Regards,
Atish


Re: [PATCH 3/3] usb: Add Kconfig and Makefile changes to build brcmstb-usb-pinmap

2020-08-12 Thread Greg Kroah-Hartman
On Wed, Aug 12, 2020 at 04:20:18PM -0400, Al Cooper wrote:
> From: Al Cooper 
> 
> Add Kconfig and Makefile changes to build brcmstb-usb-pinmap and
> update MAINTAINERS for the new driver.

This can be part of the previous patch, or at least the Kconfig and
Makefile changes should be there so that we build the code when we add
it.

> refs #SWLINUX-5537

What is this?

> 
> Signed-off-by: Al Cooper 
> ---
>  MAINTAINERS   | 8 
>  drivers/usb/host/Kconfig  | 4 
>  drivers/usb/host/Makefile | 1 +
>  3 files changed, 13 insertions(+)
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index f0569cf304ca..3a44ac61899b 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -3527,6 +3527,14 @@ S: Maintained
>  F:   Documentation/devicetree/bindings/usb/brcm,bcm7445-ehci.yaml
>  F:   drivers/usb/host/ehci-brcm.*
>  
> +BROADCOM BRCMSTB USB PIN MAP DRIVER
> +M:   Al Cooper 
> +L:   linux-...@vger.kernel.org
> +L:   bcm-kernel-feedback-l...@broadcom.com
> +S:   Maintained
> +F:   Documentation/devicetree/bindings/usb/brcm,usb-pinmap.yaml
> +F:   drivers/usb/host/brcmstb-usb-pinmap.c
> +
>  BROADCOM BRCMSTB USB2 and USB3 PHY DRIVER
>  M:   Al Cooper 
>  L:   linux-kernel@vger.kernel.org
> diff --git a/drivers/usb/host/Kconfig b/drivers/usb/host/Kconfig
> index 1cb3004ea7b2..9c285053bb0c 100644
> --- a/drivers/usb/host/Kconfig
> +++ b/drivers/usb/host/Kconfig
> @@ -109,12 +109,16 @@ endif # USB_XHCI_HCD
>  config USB_EHCI_BRCMSTB
> tristate
>  
> +config BRCM_USB_PINMAP
> +   tristate
> +
>  config USB_BRCMSTB
>   tristate "Broadcom STB USB support"
>   depends on (ARCH_BRCMSTB && PHY_BRCM_USB) || COMPILE_TEST
>   select USB_OHCI_HCD_PLATFORM if USB_OHCI_HCD
>   select USB_EHCI_BRCMSTB if USB_EHCI_HCD
>   select USB_XHCI_PLATFORM if USB_XHCI_HCD
> + select BRCM_USB_PINMAP
>   help
> Enables support for XHCI, EHCI and OHCI host controllers
> found in Broadcom STB SoC's.
> diff --git a/drivers/usb/host/Makefile b/drivers/usb/host/Makefile
> index bc731332fed9..0e63ef94790d 100644
> --- a/drivers/usb/host/Makefile
> +++ b/drivers/usb/host/Makefile
> @@ -90,3 +90,4 @@ obj-$(CONFIG_USB_HCD_BCMA)  += bcma-hcd.o
>  obj-$(CONFIG_USB_HCD_SSB)+= ssb-hcd.o
>  obj-$(CONFIG_USB_FOTG210_HCD)+= fotg210-hcd.o
>  obj-$(CONFIG_USB_MAX3421_HCD)+= max3421-hcd.o
> +obj-$(CONFIG_BRCM_USB_PINMAP)+= brcmstb-usb-pinmap.o

Shouldn't this driver be in usb/misc/ with other drivers like this?  Why
host?

Wait, why is this a separate driver at all?  Why not just build it into
the USB_BRCMSTB driver?

thanks,

greg k-h


Re: [PATCH 2/2] iommu/iova: Free global iova rcache on iova alloc failure

2020-08-12 Thread Vijayanand Jitta



On 8/12/2020 8:46 PM, Joerg Roedel wrote:
> On Mon, Aug 03, 2020 at 03:30:48PM +0530, Vijayanand Jitta wrote:
>> ping?
> 
> Please repost when v5.9-rc1 is released and add
> 
>   Robin Murphy 
> 
> on your Cc list.
> 
> Thanks,
> 
>   Joerg
> 

Sure, will do.

Thanks,
Vijay
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a
member of Code Aurora Forum, hosted by The Linux Foundation


Re: [PATCH net] bonding: show saner speed for broadcast mode

2020-08-12 Thread Jay Vosburgh
Jarod Wilson  wrote:

>Broadcast mode bonds transmit a copy of all traffic simultaneously out of
>all interfaces, so the "speed" of the bond isn't really the aggregate of
>all interfaces, but rather, the speed of the lowest active interface.

Did you mean "slowest" here?

>Also, the type of the speed field is u32, not unsigned long, so adjust
>that accordingly, as required to make min() function here without
>complaining about mismatching types.
>
>Fixes: bb5b052f751b ("bond: add support to read speed and duplex via ethtool")
>CC: Jay Vosburgh 
>CC: Veaceslav Falico 
>CC: Andy Gospodarek 
>CC: "David S. Miller" 
>CC: net...@vger.kernel.org
>Signed-off-by: Jarod Wilson 

Did you notice this by inspection, or did it come up in use
somewhere?  I can't recall ever hearing of anyone using broadcast mode,
so I'm curious if there is a use for it, but this change seems
reasonable enough regardless.

-J

Acked-by: Jay Vosburgh 


>---
> drivers/net/bonding/bond_main.c | 21 ++---
> 1 file changed, 18 insertions(+), 3 deletions(-)
>
>diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
>index 5ad43aaf76e5..c853ca67058c 100644
>--- a/drivers/net/bonding/bond_main.c
>+++ b/drivers/net/bonding/bond_main.c
>@@ -4552,13 +4552,23 @@ static netdev_tx_t bond_start_xmit(struct sk_buff 
>*skb, struct net_device *dev)
>   return ret;
> }
> 
>+static u32 bond_mode_bcast_speed(struct slave *slave, u32 speed)
>+{
>+  if (speed == 0 || speed == SPEED_UNKNOWN)
>+  speed = slave->speed;
>+  else
>+  speed = min(speed, slave->speed);
>+
>+  return speed;
>+}
>+
> static int bond_ethtool_get_link_ksettings(struct net_device *bond_dev,
>  struct ethtool_link_ksettings *cmd)
> {
>   struct bonding *bond = netdev_priv(bond_dev);
>-  unsigned long speed = 0;
>   struct list_head *iter;
>   struct slave *slave;
>+  u32 speed = 0;
> 
>   cmd->base.duplex = DUPLEX_UNKNOWN;
>   cmd->base.port = PORT_OTHER;
>@@ -4570,8 +4580,13 @@ static int bond_ethtool_get_link_ksettings(struct 
>net_device *bond_dev,
>*/
>   bond_for_each_slave(bond, slave, iter) {
>   if (bond_slave_can_tx(slave)) {
>-  if (slave->speed != SPEED_UNKNOWN)
>-  speed += slave->speed;
>+  if (slave->speed != SPEED_UNKNOWN) {
>+  if (BOND_MODE(bond) == BOND_MODE_BROADCAST)
>+  speed = bond_mode_bcast_speed(slave,
>+speed);
>+  else
>+  speed += slave->speed;
>+  }
>   if (cmd->base.duplex == DUPLEX_UNKNOWN &&
>   slave->duplex != DUPLEX_UNKNOWN)
>   cmd->base.duplex = slave->duplex;
>-- 
>2.20.1
>


RE: [PATCH RFC v2 00/18] Add VFIO mediated device support and DEV-MSI support for the idxd driver

2020-08-12 Thread Tian, Kevin
> From: Jason Wang 
> Sent: Thursday, August 13, 2020 12:34 PM
> 
> 
> On 2020/8/12 下午12:05, Tian, Kevin wrote:
> >> The problem is that if we tie all controls via VFIO uAPI, the other
> >> subsystem like vDPA is likely to duplicate them. I wonder if there is a
> >> way to decouple the vSVA out of VFIO uAPI?
> > vSVA is a per-device (either pdev or mdev) feature thus naturally should
> > be managed by its device driver (VFIO or vDPA). From this angle some
> > duplication is inevitable given VFIO and vDPA are orthogonal passthrough
> > frameworks. Within the kernel the majority of vSVA handling is done by
> > IOMMU and IOASID modules thus most logic are shared.
> 
> 
> So why not introduce vSVA uAPI at IOMMU or IOASID layer?

One may ask a similar question why IOMMU doesn't expose map/unmap
as uAPI...

> 
> 
> >
> >>>If an userspace DMA interface can be easily
> >>> adapted to be a passthrough one, it might be the choice.
> >> It's not that easy even for VFIO which requires a lot of new uAPIs and
> >> infrastructures(e.g mdev) to be invented.
> >>
> >>
> >>> But for idxd,
> >>> we see mdev a much better fit here, given the big difference between
> >>> what userspace DMA requires and what guest driver requires in this hw.
> >> A weak point for mdev is that it can't serve kernel subsystem other than
> >> VFIO. In this case, you need some other infrastructures (like [1]) to do
> >> this.
> > mdev is not exclusive from kernel usages. It's perfectly fine for a driver
> > to reserve some work queues for host usages, while wrapping others
> > into mdevs.
> 
> 
> I meant you may want slices to be an independent device from the kernel
> point of view:
> 
> E.g for ethernet devices, you may want 10K mdevs to be passed to guest.
> 
> Similarly, you may want 10K net devices which is connected to the kernel
> networking subsystems.
> 
> In this case it's not simply reserving queues but you need some other
> type of device abstraction. There could be some kind of duplication
> between this and mdev.
> 

yes, some abstraction required but isn't it what the driver should
care about instead of mdev framework itself? If the driver reports
the same set of resource to both mdev and networking, it needs to
make sure when the resource is claimed in one interface then it
should be marked in-use in another. e.g. each mdev includes a
available_intances attribute. the driver could report 10k available
instances initially and then update it to 5K when another 5K is used
for net devices later.

Mdev definitely has its usage limitations. Some may be improved 
in the future, some may not. But those are distracting from the
original purpose of this thread (mdev vs. userspace DMA) and better
be discussed in other places e.g. LPC... 

Thanks
Kevin


Re: [PATCH v3 2/2] power: supply: sbs-battery: don't assume i2c errors as battery disconnect

2020-08-12 Thread Nicolas Boichat
On Thu, Aug 13, 2020 at 1:11 PM Ikjoon Jang  wrote:
>
> Current sbs-battery considers all smbus errors as disconnection events
> when battery-detect pin isn't supplied, and restored to present state back
> when any successful transaction is made.
>
> This can lead to unlimited state changes between present and !present
> when one unsupported command was requested and other following commands
> were successful, e.g. udev rules tries to read multiple properties.
>
> This patch checks battery presence by reading known good command to
> check battery existence.
>
> Signed-off-by: Ikjoon Jang 

Looks good now, AFAICT. Thanks!

Reviewed-by: Nicolas Boichat 

> ---
>  drivers/power/supply/sbs-battery.c | 25 +
>  1 file changed, 17 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/power/supply/sbs-battery.c 
> b/drivers/power/supply/sbs-battery.c
> index 6acb4ea25d2a..2e32fde04628 100644
> --- a/drivers/power/supply/sbs-battery.c
> +++ b/drivers/power/supply/sbs-battery.c
> @@ -878,10 +878,17 @@ static int sbs_get_property(struct power_supply *psy,
> if (!chip->enable_detection)
> goto done;
>
> -   if (!chip->gpio_detect &&
> -   chip->is_present != (ret >= 0)) {
> -   sbs_update_presence(chip, (ret >= 0));
> -   power_supply_changed(chip->power_supply);
> +   if (!chip->gpio_detect && chip->is_present != (ret >= 0)) {
> +   bool old_present = chip->is_present;
> +   union power_supply_propval val;
> +
> +   ret = sbs_get_battery_presence_and_health(
> +   client, POWER_SUPPLY_PROP_PRESENT, );
> +
> +   sbs_update_presence(chip, !ret && val.intval);
> +
> +   if (old_present != chip->is_present)
> +   power_supply_changed(chip->power_supply);
> }
>
>  done:
> @@ -1067,11 +1074,13 @@ static int sbs_probe(struct i2c_client *client)
>  * to the battery.
>  */
> if (!(force_load || chip->gpio_detect)) {
> -   rc = sbs_read_word_data(client, sbs_data[REG_STATUS].addr);
> +   union power_supply_propval val;
>
> -   if (rc < 0) {
> -   dev_err(>dev, "%s: Failed to get device 
> status\n",
> -   __func__);
> +   rc = sbs_get_battery_presence_and_health(
> +   client, POWER_SUPPLY_PROP_PRESENT, );
> +   if (rc < 0 || !val.intval) {
> +   dev_err(>dev, "Failed to get present 
> status\n");
> +   rc = -ENODEV;
> goto exit_psupply;
> }
> }
> --
> 2.28.0.236.gb10cc79966-goog
>


Re: WARNING: locking bug in l2cap_chan_del

2020-08-12 Thread syzbot
syzbot has found a reproducer for the following issue on:

HEAD commit:06a7a37b ipv4: tunnel: fix compilation on ARCH=um
git tree:   net
console output: https://syzkaller.appspot.com/x/log.txt?x=115caa1690
kernel config:  https://syzkaller.appspot.com/x/.config?x=7bb894f55faf8242
dashboard link: https://syzkaller.appspot.com/bug?extid=01d7fc00b2a0419d01cc
compiler:   gcc (GCC) 10.1.0-syz 20200507
syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=13fb564a90

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+01d7fc00b2a0419d0...@syzkaller.appspotmail.com

[ cut here ]
DEBUG_LOCKS_WARN_ON(1)
WARNING: CPU: 0 PID: 5 at kernel/locking/lockdep.c:183 hlock_class 
kernel/locking/lockdep.c:183 [inline]
WARNING: CPU: 0 PID: 5 at kernel/locking/lockdep.c:183 hlock_class 
kernel/locking/lockdep.c:172 [inline]
WARNING: CPU: 0 PID: 5 at kernel/locking/lockdep.c:183 check_wait_context 
kernel/locking/lockdep.c:4100 [inline]
WARNING: CPU: 0 PID: 5 at kernel/locking/lockdep.c:183 
__lock_acquire+0x1674/0x5640 kernel/locking/lockdep.c:4376
Kernel panic - not syncing: panic_on_warn set ...
CPU: 0 PID: 5 Comm: kworker/0:0 Not tainted 5.8.0-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
01/01/2011
Workqueue: events l2cap_chan_timeout
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x18f/0x20d lib/dump_stack.c:118
 panic+0x2e3/0x75c kernel/panic.c:231
 __warn.cold+0x20/0x45 kernel/panic.c:600
 report_bug+0x1bd/0x210 lib/bug.c:198
 handle_bug+0x38/0x90 arch/x86/kernel/traps.c:235
 exc_invalid_op+0x14/0x40 arch/x86/kernel/traps.c:255
 asm_exc_invalid_op+0x12/0x20 arch/x86/include/asm/idtentry.h:536
RIP: 0010:hlock_class kernel/locking/lockdep.c:183 [inline]
RIP: 0010:hlock_class kernel/locking/lockdep.c:172 [inline]
RIP: 0010:check_wait_context kernel/locking/lockdep.c:4100 [inline]
RIP: 0010:__lock_acquire+0x1674/0x5640 kernel/locking/lockdep.c:4376
Code: d2 0f 85 f1 36 00 00 44 8b 15 f0 8e 57 09 45 85 d2 0f 85 1c fa ff ff 48 
c7 c6 80 af 4b 88 48 c7 c7 80 aa 4b 88 e8 ce 36 eb ff <0f> 0b e9 02 fa ff ff c7 
44 24 38 fe ff ff ff 41 bf 01 00 00 00 c7
RSP: 0018:c9cbf8e0 EFLAGS: 00010086
RAX:  RBX: 0004 RCX: 
RDX: 8880a95a2140 RSI: 815d8eb7 RDI: f52000197f0e
RBP: 8880a95a2ab0 R08:  R09: 89bcb3c3
R10: 07d2 R11: 0001 R12: 
R13: 19a1 R14: 8880a95a2140 R15: 0004
 lock_acquire+0x1f1/0xad0 kernel/locking/lockdep.c:5005
 __raw_spin_lock_bh include/linux/spinlock_api_smp.h:135 [inline]
 _raw_spin_lock_bh+0x2f/0x40 kernel/locking/spinlock.c:175
 spin_lock_bh include/linux/spinlock.h:359 [inline]
 lock_sock_nested+0x3b/0x110 net/core/sock.c:3040
 l2cap_sock_teardown_cb+0x88/0x400 net/bluetooth/l2cap_sock.c:1520
 l2cap_chan_del+0xad/0x1300 net/bluetooth/l2cap_core.c:618
 l2cap_chan_close+0x118/0xb10 net/bluetooth/l2cap_core.c:823
 l2cap_chan_timeout+0x173/0x450 net/bluetooth/l2cap_core.c:436
 process_one_work+0x94c/0x1670 kernel/workqueue.c:2269
 worker_thread+0x64c/0x1120 kernel/workqueue.c:2415
 kthread+0x3b5/0x4a0 kernel/kthread.c:292
 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294
Kernel Offset: disabled
Rebooting in 86400 seconds..



Re: POC: Alternative solution: Re: [PATCH 0/4] printk: reimplement LOG_CONT handling

2020-08-12 Thread Sergey Senozhatsky
On (20/08/13 02:30), John Ogness wrote:
> 2. I haven't yet figured out how to preserve calling context when a
> newline appears. For example:
> 
> pr_info("text");
> pr_cont(" 1");
> pr_cont(" 2\n");
> pr_cont("3");
> pr_cont(" 4\n");
>
> For "3" the calling context (info, timestamp) is lost because with "2"
> the record is finalized. Perhaps the above is invalid usage of LOG_CONT?

This is not an unseen pattern, I'm afraid. And the problem here can
be more general:

pr_info("text");
pr_cont("1");
exception/IRQ/NMI
pr_alert("text\n");
pr_cont("2");
pr_cont("\n");

I guess the solution would be to store "last log_level" in task_struct
and get current (new) timestamp for broken cont line?

We have this problem now. E.g. early boot messages from one of my boxes:

6,173,41837,-;x86: Booting SMP configuration:
6,174,41838,-; node  #0, CPUs:  #1 #2 #3 #4
4,175,44993,-;MDS CPU bug present and SMT on, data leak possible. See 
https://www.kernel.org/doc/...
4,176,44993,c; #5 #6 #7

"CPUs: #1 #2 #3 #4 #5 #6 #7" is supposed to be one cont line with
loglevel 6. But it gets interrupted and flushed so that "#5 #6 #7"
appears with the different loglevel.

-ss


Re: [PATCH 1/3] mm: don't call activate_page() on new ksm pages

2020-08-12 Thread Yang Shi
On Tue, Aug 11, 2020 at 9:04 PM Yu Zhao  wrote:
>
> lru_cache_add_active_or_unevictable() already adds new ksm pages to
> active lru. Calling activate_page() isn't really necessary in this
> case.
>
> Signed-off-by: Yu Zhao 
> ---
>  mm/swapfile.c | 10 +-
>  1 file changed, 5 insertions(+), 5 deletions(-)
>
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 6c26916e95fd..cf115ea26a20 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -1913,16 +1913,16 @@ static int unuse_pte(struct vm_area_struct *vma, 
> pmd_t *pmd,
>pte_mkold(mk_pte(page, vma->vm_page_prot)));
> if (page == swapcache) {
> page_add_anon_rmap(page, vma, addr, false);
> +   /*
> +* Move the page to the active list so it is not
> +* immediately swapped out again after swapon.
> +*/
> +   activate_page(page);

Actually I think we could just remove this activate_page() call with
Joonsoo's anonymous page workingset series merged. The active bit will
be taken care by workingset_refault().

> } else { /* ksm created a completely new copy */
> page_add_new_anon_rmap(page, vma, addr, false);
> lru_cache_add_active_or_unevictable(page, vma);

And it looks the latest linus's tree already changed this to
lru_cache_add_inactive_or_unevictable() by commit b518154e59
("mm/vmscan: protect the workingset on anonymous LRU")

Added Joonsoo in this loop.

> }
> swap_free(entry);
> -   /*
> -* Move the page to the active list so it is not
> -* immediately swapped out again after swapon.
> -*/
> -   activate_page(page);
>  out:
> pte_unmap_unlock(pte, ptl);
> if (page != swapcache) {
> --
> 2.28.0.236.gb10cc79966-goog
>
>


drivers/char/agp/i460-agp.c:254:19: sparse: sparse: incorrect type in assignment (different address spaces)

2020-08-12 Thread kernel test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 
master
head:   dc06fe51d26efc100ac74121607c01a454867c91
commit: 670d0a4b10704667765f7d18f7592993d02783aa sparse: use identifiers to 
define address spaces
date:   8 weeks ago
config: ia64-randconfig-s031-20200813 (attached as .config)
compiler: ia64-linux-gcc (GCC) 9.3.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# apt-get install sparse
# sparse version: v0.6.2-168-g9554805c-dirty
git checkout 670d0a4b10704667765f7d18f7592993d02783aa
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross C=1 
CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=ia64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 


sparse warnings: (new ones prefixed by >>)

>> drivers/char/agp/i460-agp.c:254:19: sparse: sparse: incorrect type in 
>> assignment (different address spaces) @@ expected void *static 
>> [assigned] [toplevel] gatt @@ got void [noderef] __iomem * @@
   drivers/char/agp/i460-agp.c:254:19: sparse: expected void *static 
[assigned] [toplevel] gatt
>> drivers/char/agp/i460-agp.c:254:19: sparse: got void [noderef] __iomem *
>> drivers/char/agp/i460-agp.c:266:17: sparse: sparse: incorrect type in 
>> argument 2 (different address spaces) @@ expected void volatile 
>> [noderef] __iomem *addr @@ got unsigned int [usertype] * @@
>> drivers/char/agp/i460-agp.c:266:17: sparse: expected void volatile 
>> [noderef] __iomem *addr
   drivers/char/agp/i460-agp.c:266:17: sparse: got unsigned int [usertype] *
>> drivers/char/agp/i460-agp.c:267:9: sparse: sparse: incorrect type in 
>> argument 1 (different address spaces) @@ expected void const volatile 
>> [noderef] __iomem *addr @@ got unsigned int [usertype] * @@
>> drivers/char/agp/i460-agp.c:267:9: sparse: expected void const volatile 
>> [noderef] __iomem *addr
   drivers/char/agp/i460-agp.c:267:9: sparse: got unsigned int [usertype] *
   drivers/char/agp/i460-agp.c:281:17: sparse: sparse: incorrect type in 
argument 2 (different address spaces) @@ expected void volatile [noderef] 
__iomem *addr @@ got unsigned int [usertype] * @@
   drivers/char/agp/i460-agp.c:281:17: sparse: expected void volatile 
[noderef] __iomem *addr
   drivers/char/agp/i460-agp.c:281:17: sparse: got unsigned int [usertype] *
   drivers/char/agp/i460-agp.c:282:9: sparse: sparse: incorrect type in 
argument 1 (different address spaces) @@ expected void const volatile 
[noderef] __iomem *addr @@ got unsigned int [usertype] * @@
   drivers/char/agp/i460-agp.c:282:9: sparse: expected void const volatile 
[noderef] __iomem *addr
   drivers/char/agp/i460-agp.c:282:9: sparse: got unsigned int [usertype] *
>> drivers/char/agp/i460-agp.c:284:21: sparse: sparse: incorrect type in 
>> argument 1 (different address spaces) @@ expected void volatile 
>> [noderef] __iomem *addr @@ got void *static [assigned] [toplevel] gatt @@
   drivers/char/agp/i460-agp.c:284:21: sparse: expected void volatile 
[noderef] __iomem *addr
   drivers/char/agp/i460-agp.c:284:21: sparse: got void *static [assigned] 
[toplevel] gatt
   drivers/char/agp/i460-agp.c:318:22: sparse: sparse: incorrect type in 
argument 1 (different address spaces) @@ expected void const volatile 
[noderef] __iomem *addr @@ got unsigned int [usertype] * @@
   drivers/char/agp/i460-agp.c:318:22: sparse: expected void const volatile 
[noderef] __iomem *addr
   drivers/char/agp/i460-agp.c:318:22: sparse: got unsigned int [usertype] *
   drivers/char/agp/i460-agp.c:318:22: sparse: sparse: incorrect type in 
argument 1 (different address spaces) @@ expected void const volatile 
[noderef] __iomem *addr @@ got unsigned int [usertype] * @@
   drivers/char/agp/i460-agp.c:318:22: sparse: expected void const volatile 
[noderef] __iomem *addr
   drivers/char/agp/i460-agp.c:318:22: sparse: got unsigned int [usertype] *
   drivers/char/agp/i460-agp.c:319:25: sparse: sparse: incorrect type in 
argument 1 (different address spaces) @@ expected void const volatile 
[noderef] __iomem *addr @@ got unsigned int [usertype] * @@
   drivers/char/agp/i460-agp.c:319:25: sparse: expected void const volatile 
[noderef] __iomem *addr
   drivers/char/agp/i460-agp.c:319:25: sparse: got unsigned int [usertype] *
   drivers/char/agp/i460-agp.c:330:25: sparse: sparse: incorrect type in 
argument 2 (different address spaces) @@ expected void volatile [noderef] 
__iomem *addr @@ got unsigned int [usertype] * @@
   drivers/char/agp/i460-agp.c:330:25: sparse: expected void volatile 
[noderef] __iomem *addr
   drivers/char/agp/i460-agp.c:330:25: sparse: got unsigned int [usertype] *
   

Re: [PATCH] ftrace: Fixup lockdep assert held of text_mutex

2020-08-12 Thread Palmer Dabbelt

On Thu, 06 Aug 2020 22:01:01 PDT (-0700), guo...@kernel.org wrote:

On Fri, Aug 7, 2020 at 12:01 PM Steven Rostedt  wrote:


On Fri, 7 Aug 2020 10:59:16 +0800
Guo Ren  wrote:
> >
> > This looks like a bug in the lockdep_assert_held() in whatever arch
> > (riscv) is running.
> Seems you think it's a bug of arch implementation with the wrong usage
> of text_mutex?
>
> Also @riscv maintainer,
> How about modifying it in riscv's code? we still need to solve it.
>
> 
> diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h
> index ace8a6e..fb266c3 100644
> --- a/arch/riscv/include/asm/ftrace.h
> +++ b/arch/riscv/include/asm/ftrace.h
> @@ -23,6 +23,12 @@ static inline unsigned long
> ftrace_call_adjust(unsigned long addr)
>
>  struct dyn_arch_ftrace {
>  };
> +
> +#ifdef CONFIG_DYNAMIC_FTRACE
> +struct dyn_ftrace;
> +int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec);
> +#define ftrace_init_nop ftrace_init_nop
> +#endif
>  #endif
>
>  #ifdef CONFIG_DYNAMIC_FTRACE
> diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c
> index 2ff63d0..9e9f7c0 100644
> --- a/arch/riscv/kernel/ftrace.c
> +++ b/arch/riscv/kernel/ftrace.c
> @@ -97,6 +97,17 @@ int ftrace_make_nop(struct module *mod, struct
> dyn_ftrace *rec,
> return __ftrace_modify_call(rec->ip, addr, false);
>  }
>
> +int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec)
> +{
> +   int ret;
> +
> +   mutex_lock(_mutex);
> +   ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);

Looking at x86, we have the following code:

static int ftrace_poke_late = 0;

int ftrace_arch_code_modify_prepare(void)
__acquires(_mutex)
{
/*
 * Need to grab text_mutex to prevent a race from module loading
 * and live kernel patching from changing the text permissions while
 * ftrace has it set to "read/write".
 */
mutex_lock(_mutex);
ftrace_poke_late = 1;
return 0;
}

int ftrace_arch_code_modify_post_process(void)
__releases(_mutex)
{
/*
 * ftrace_make_{call,nop}() may be called during
 * module load, and we need to finish the text_poke_queue()
 * that they do, here.
 */
text_poke_finish();
ftrace_poke_late = 0;
mutex_unlock(_mutex);
return 0;
}

And if ftrace_poke_late is not set, then ftrace_make_nop() does direct
modification (calls text_poke_early(), which is basically a memcpy).

This path doesn't have any checks against text_mutex being held,
because it only happens at boot up.

The solution is ok for me, but I want to get riscv maintainer's
opinion before the next patch.
@Paul Walmsley
@Palmer Dabbelt


Sorry, I'm not really sure what's going on here.  I'm not really seeing code
that matches this in our port right now, so maybe this is aginst some other
tree?  If it's the RISC-V kprobes patch set then I was hoping to take a look at
that tomorrow (or I guess a bit earlier this week, but I had some surprise work
stuff to do).  IIRC there were a handful of races in the last patch set I saw,
but it's been a while so I don't remember for sure.

That said, I certainly wouldn't be surprised if there's a locking bug in our
ftrace stuff.  It'd be way easier for me to figure out what's going on if you
have a concrete suggestion as to how to fix the issues -- even if it's just a
workaround.





> +   mutex_unlock(_mutex);
> +
> +   return ret;
> +}
> +
>  int ftrace_update_ftrace_func(ftrace_func_t func)
>  {
> int ret = __ftrace_modify_call((unsigned long)_call,
> ---
>
> > > --- a/kernel/trace/ftrace.c
> > > +++ b/kernel/trace/ftrace.c
> > > @@ -26,6 +26,7 @@
> > >  #include 
> > >  #include 
> > >  #include 
> > > +#include 
> > >  #include 
> > >  #include 
> > >  #include 
> > > @@ -6712,9 +6713,11 @@ void __init ftrace_init(void)
> >
> > ftrace_init() is called before SMP is initialized. Nothing else should
> > be running here. That means grabbing a mutex is useless.
> I don't agree, ftrace_init are modifying kernel text, so we should
> give the lock of text_mutex to keep semantic consistency.


Did you test your patch on x86 with lockdep?

Ah.., no :P



ftrace_process_locs() grabs the ftrace_lock, which I believe is held
when text_mutex is taken in other locations. So this will probably not
work anyway.

text_mutex isn't to be taken at the ftrace level.

Yes, currently it seemed only to be used by kernel/kprobes.c.


Re: [PATCH v2 15/20] fuse, dax: Take ->i_mmap_sem lock during dax page fault

2020-08-12 Thread Dave Chinner
On Wed, Aug 12, 2020 at 05:10:12PM -0400, Vivek Goyal wrote:
> On Wed, Aug 12, 2020 at 11:23:45AM +1000, Dave Chinner wrote:
> > On Tue, Aug 11, 2020 at 01:55:30PM -0400, Vivek Goyal wrote:
> > > On Tue, Aug 11, 2020 at 08:22:38AM +1000, Dave Chinner wrote:
> > > > On Fri, Aug 07, 2020 at 03:55:21PM -0400, Vivek Goyal wrote:
> > > > > We need some kind of locking mechanism here. Normal file systems like
> > > > > ext4 and xfs seems to take their own semaphore to protect agains
> > > > > truncate while fault is going on.
> > > > > 
> > > > > We have additional requirement to protect against fuse dax memory 
> > > > > range
> > > > > reclaim. When a range has been selected for reclaim, we need to make 
> > > > > sure
> > > > > no other read/write/fault can try to access that memory range while
> > > > > reclaim is in progress. Once reclaim is complete, lock will be 
> > > > > released
> > > > > and read/write/fault will trigger allocation of fresh dax range.
> > > > > 
> > > > > Taking inode_lock() is not an option in fault path as lockdep 
> > > > > complains
> > > > > about circular dependencies. So define a new fuse_inode->i_mmap_sem.
> > > > 
> > > > That's precisely why filesystems like ext4 and XFS define their own
> > > > rwsem.
> > > > 
> > > > Note that this isn't a DAX requirement - the page fault
> > > > serialisation is actually a requirement of hole punching...
> > > 
> > > Hi Dave,
> > > 
> > > I noticed that fuse code currently does not seem to have a rwsem which
> > > can provide mutual exclusion between truncation/hole_punch path
> > > and page fault path. I am wondering does that mean there are issues
> > > with existing code or something else makes it unnecessary to provide
> > > this mutual exlusion.
> > 
> > I don't know enough about the fuse implementation to say. What I'm
> > saying is that nothing in the core mm/ or VFS serilises page cache
> > access to the data against direct filesystem manipulations of the
> > underlying filesystem structures.
> 
> Hi Dave,
> 
> Got it. I was checking nfs and they also seem to be calling filemap_fault
> and not taking any locks to block faults. fallocate() (nfs42_fallocate)
> seems to block read/write/aio/dio but does not seem to do anything
> about blocking faults. I am wondering if remote filesystem are
> little different in this aspect. Especially fuse does not maintain
> any filesystem block/extent data. It is file server which is doing
> all that.

I suspect they have all the same problems, and worse, the behaviour
will largely be dependent on the server side behaviour that is out
of the user's control.

Essentially, nobody except us XFS folks seem to regard hole punching
corrupting data or exposing stale data as being a problem that needs
to be avoided or fixed. The only reason ext4 has the i_mmap_sem is
because ext4 wanted to support DAX, and us XFS developers said "DAX
absolutely requires that the filesystem can lock out physical access
to the storage" and so they had no choice in the matter.

Other than that, nobody really seems to understand or care about all
these nasty little mmap() corner cases that we've seen corrupt user
data or expose stale data to users over many years.

> > i.e. nothing in the VFS or page fault IO path prevents this race
> > condition:
> > 
> > P0  P1
> > fallocate
> > page cache invalidation
> > page fault
> > read data
> > punch out data extents
> > 
> >  > backing store allocated>
> > 
> > 
> > That's where the ext4 and XFS internal rwsem come into play:
> > 
> > fallocate
> > down_write(mmaplock)
> > page cache invalidation
> > page fault
> > down_read(mmaplock)
> > 
> > punch out data
> > up_write(mmaplock)
> > 
> > 
> > 
> > 
> > And there's not stale data exposure to userspace.
> 
> Got it. I noticed that both fuse/nfs seem to have reversed the
> order of operation. They call server to punch out data first
> and then truncate page cache. And that should mean that even
> if mmap reader will not see stale data after fallocate(punch_hole)
> has finished.

Yes, but that doesn't prevent page fault races from occuring, it
just changes the nature of them.. Such as.

> > There is nothing stopping, say, memory reclaim from reclaiming pages
> > over the range while the hole is being punched, then having the
> > application refault them while the backing store is being freed.
> > While the page fault read IO is in progress, there's nothing
> > stopping the filesystem from freeing those blocks, nor reallocating
> > them and writing something else to them (e.g. metadata). So they
> > could read someone elses data.
> > 
> > Even worse: the page fault is a write fault, it lands in a 

Re: KMSAN: uninit-value in can_receive (2)

2020-08-12 Thread syzbot
syzbot has found a reproducer for the following issue on:

HEAD commit:ce8056d1 wip: changed copy_from_user where instrumented
git tree:   https://github.com/google/kmsan.git master
console output: https://syzkaller.appspot.com/x/log.txt?x=1041a3be90
kernel config:  https://syzkaller.appspot.com/x/.config?x=3afe005fb99591f
dashboard link: https://syzkaller.appspot.com/bug?extid=3f3837e61a48d32b495f
compiler:   clang version 10.0.0 (https://github.com/llvm/llvm-project/ 
c2443155a0fb245c8f17f2c1c72b6ea391e86e81)
syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=160fbd0690
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=12a14e6e90

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+3f3837e61a48d32b4...@syzkaller.appspotmail.com

=
BUG: KMSAN: uninit-value in can_receive+0x26b/0x630 net/can/af_can.c:650
CPU: 1 PID: 8475 Comm: syz-executor984 Not tainted 5.8.0-rc5-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
01/01/2011
Call Trace:
 
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x21c/0x280 lib/dump_stack.c:118
 kmsan_report+0xf7/0x1e0 mm/kmsan/kmsan_report.c:121
 __msan_warning+0x58/0xa0 mm/kmsan/kmsan_instr.c:215
 can_receive+0x26b/0x630 net/can/af_can.c:650
 can_rcv+0x1fb/0x410 net/can/af_can.c:686
 __netif_receive_skb_one_core net/core/dev.c:5281 [inline]
 __netif_receive_skb+0x265/0x670 net/core/dev.c:5395
 process_backlog+0x50d/0xba0 net/core/dev.c:6239
 napi_poll+0x43b/0xfd0 net/core/dev.c:6684
 net_rx_action+0x35c/0xd40 net/core/dev.c:6752
 __do_softirq+0x2ea/0x7f5 kernel/softirq.c:293
 asm_call_on_stack+0xf/0x20 arch/x86/entry/entry_64.S:711
 
 __run_on_irqstack arch/x86/include/asm/irq_stack.h:23 [inline]
 run_on_irqstack_cond arch/x86/include/asm/irq_stack.h:50 [inline]
 do_softirq_own_stack+0x7c/0xa0 arch/x86/kernel/irq_64.c:77
 invoke_softirq kernel/softirq.c:390 [inline]
 __irq_exit_rcu+0x226/0x270 kernel/softirq.c:420
 irq_exit_rcu+0xe/0x10 kernel/softirq.c:432
 sysvec_apic_timer_interrupt+0x107/0x130 arch/x86/kernel/apic/apic.c:1091
 asm_sysvec_apic_timer_interrupt+0x12/0x20 arch/x86/include/asm/idtentry.h:593
RIP: 0010:_raw_spin_unlock_irqrestore+0x4b/0x70 kernel/locking/spinlock.c:192
Code: 00 8b b8 88 0c 00 00 48 8b 00 48 85 c0 75 28 48 89 df e8 b8 6e 4b f1 c6 
00 00 c6 03 00 4d 85 e4 75 1c 4c 89 7d d8 ff 75 d8 9d <48> 83 c4 08 5b 41 5c 41 
5e 41 5f 5d c3 e8 53 74 4b f1 eb d1 44 89
RSP: 0018:8880b8b0f880 EFLAGS: 0282
RAX: 88821fd3bc00 RBX: 88812fd1dc00 RCX: 00021fc9cc00
RDX: 88821fc9cc00 RSI: 04a0 RDI: 88812fd1dc00
RBP: 8880b8b0f8a8 R08: ea0f R09: 88812fffa000
R10: 0004 R11:  R12: 
R13: 8880b656c2e8 R14:  R15: 0282
 unlock_hrtimer_base kernel/time/hrtimer.c:898 [inline]
 hrtimer_start_range_ns+0x459/0x4e0 kernel/time/hrtimer.c:1136
 hrtimer_start include/linux/hrtimer.h:421 [inline]
 j1939_tp_schedule_txtimer+0x132/0x1b0 net/can/j1939/transport.c:671
 j1939_sk_send_loop net/can/j1939/socket.c:1047 [inline]
 j1939_sk_sendmsg+0x1cc0/0x2950 net/can/j1939/socket.c:1160
 sock_sendmsg_nosec net/socket.c:652 [inline]
 sock_sendmsg net/socket.c:672 [inline]
 sys_sendmsg+0xc82/0x1240 net/socket.c:2352
 ___sys_sendmsg net/socket.c:2406 [inline]
 __sys_sendmsg+0x6d1/0x840 net/socket.c:2439
 __do_sys_sendmsg net/socket.c:2448 [inline]
 __se_sys_sendmsg+0x97/0xb0 net/socket.c:2446
 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2446
 do_syscall_64+0xad/0x160 arch/x86/entry/common.c:386
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x443559
Code: Bad RIP value.
RSP: 002b:7dad8558 EFLAGS: 0246 ORIG_RAX: 002e
RAX: ffda RBX: 0003 RCX: 00443559
RDX:  RSI: 2180 RDI: 0003
RBP: 7dad8560 R08: 01bb R09: 01bb
R10: 01bb R11: 0246 R12: 7dad8570
R13:  R14:  R15: 

Uninit was created at:
 kmsan_save_stack_with_flags mm/kmsan/kmsan.c:144 [inline]
 kmsan_internal_poison_shadow+0x66/0xd0 mm/kmsan/kmsan.c:127
 kmsan_slab_alloc+0x8a/0xe0 mm/kmsan/kmsan_hooks.c:80
 slab_alloc_node mm/slub.c:2839 [inline]
 __kmalloc_node_track_caller+0xeab/0x12e0 mm/slub.c:4478
 __kmalloc_reserve net/core/skbuff.c:142 [inline]
 __alloc_skb+0x35f/0xb30 net/core/skbuff.c:210
 alloc_skb include/linux/skbuff.h:1083 [inline]
 j1939_tp_tx_dat_new net/can/j1939/transport.c:568 [inline]
 j1939_xtp_do_tx_ctl net/can/j1939/transport.c:628 [inline]
 j1939_tp_tx_ctl net/can/j1939/transport.c:646 [inline]
 j1939_session_tx_rts net/can/j1939/transport.c:714 [inline]
 j1939_xtp_txnext_transmiter net/can/j1939/transport.c:832 [inline]
 j1939_tp_txtimer+0x402c/0x6980 net/can/j1939/transport.c:1095
 

[PATCH v3 1/2] power: supply: sbs-battery: combine get_presence_and_health

2020-08-12 Thread Ikjoon Jang
This patch enables calling sbs_get_battery_presence_and_health()
without checking its chip type. No functional changes.

Signed-off-by: Ikjoon Jang 
---
 drivers/power/supply/sbs-battery.c | 73 +++---
 1 file changed, 36 insertions(+), 37 deletions(-)

diff --git a/drivers/power/supply/sbs-battery.c 
b/drivers/power/supply/sbs-battery.c
index 83b9924033bd..6acb4ea25d2a 100644
--- a/drivers/power/supply/sbs-battery.c
+++ b/drivers/power/supply/sbs-battery.c
@@ -389,37 +389,6 @@ static bool sbs_bat_needs_calibration(struct i2c_client 
*client)
return !!(ret & BIT(7));
 }
 
-static int sbs_get_battery_presence_and_health(
-   struct i2c_client *client, enum power_supply_property psp,
-   union power_supply_propval *val)
-{
-   int ret;
-
-   /* Dummy command; if it succeeds, battery is present. */
-   ret = sbs_read_word_data(client, sbs_data[REG_STATUS].addr);
-
-   if (ret < 0) { /* battery not present*/
-   if (psp == POWER_SUPPLY_PROP_PRESENT) {
-   val->intval = 0;
-   return 0;
-   }
-   return ret;
-   }
-
-   if (psp == POWER_SUPPLY_PROP_PRESENT)
-   val->intval = 1; /* battery present */
-   else { /* POWER_SUPPLY_PROP_HEALTH */
-   if (sbs_bat_needs_calibration(client)) {
-   val->intval = POWER_SUPPLY_HEALTH_CALIBRATION_REQUIRED;
-   } else {
-   /* SBS spec doesn't have a general health command. */
-   val->intval = POWER_SUPPLY_HEALTH_UNKNOWN;
-   }
-   }
-
-   return 0;
-}
-
 static int sbs_get_ti_battery_presence_and_health(
struct i2c_client *client, enum power_supply_property psp,
union power_supply_propval *val)
@@ -478,6 +447,41 @@ static int sbs_get_ti_battery_presence_and_health(
return 0;
 }
 
+static int sbs_get_battery_presence_and_health(
+   struct i2c_client *client, enum power_supply_property psp,
+   union power_supply_propval *val)
+{
+   struct sbs_info *chip = i2c_get_clientdata(client);
+   int ret;
+
+   if (chip->flags & SBS_FLAGS_TI_BQ20ZX5)
+   return sbs_get_ti_battery_presence_and_health(client, psp, val);
+
+   /* Dummy command; if it succeeds, battery is present. */
+   ret = sbs_read_word_data(client, sbs_data[REG_STATUS].addr);
+
+   if (ret < 0) { /* battery not present*/
+   if (psp == POWER_SUPPLY_PROP_PRESENT) {
+   val->intval = 0;
+   return 0;
+   }
+   return ret;
+   }
+
+   if (psp == POWER_SUPPLY_PROP_PRESENT)
+   val->intval = 1; /* battery present */
+   else { /* POWER_SUPPLY_PROP_HEALTH */
+   if (sbs_bat_needs_calibration(client)) {
+   val->intval = POWER_SUPPLY_HEALTH_CALIBRATION_REQUIRED;
+   } else {
+   /* SBS spec doesn't have a general health command. */
+   val->intval = POWER_SUPPLY_HEALTH_UNKNOWN;
+   }
+   }
+
+   return 0;
+}
+
 static int sbs_get_battery_property(struct i2c_client *client,
int reg_offset, enum power_supply_property psp,
union power_supply_propval *val)
@@ -780,12 +784,7 @@ static int sbs_get_property(struct power_supply *psy,
switch (psp) {
case POWER_SUPPLY_PROP_PRESENT:
case POWER_SUPPLY_PROP_HEALTH:
-   if (chip->flags & SBS_FLAGS_TI_BQ20ZX5)
-   ret = sbs_get_ti_battery_presence_and_health(client,
-psp, val);
-   else
-   ret = sbs_get_battery_presence_and_health(client, psp,
- val);
+   ret = sbs_get_battery_presence_and_health(client, psp, val);
 
/* this can only be true if no gpio is used */
if (psp == POWER_SUPPLY_PROP_PRESENT)
-- 
2.28.0.236.gb10cc79966-goog



[PATCH v3 2/2] power: supply: sbs-battery: don't assume i2c errors as battery disconnect

2020-08-12 Thread Ikjoon Jang
Current sbs-battery considers all smbus errors as disconnection events
when battery-detect pin isn't supplied, and restored to present state back
when any successful transaction is made.

This can lead to unlimited state changes between present and !present
when one unsupported command was requested and other following commands
were successful, e.g. udev rules tries to read multiple properties.

This patch checks battery presence by reading known good command to
check battery existence.

Signed-off-by: Ikjoon Jang 
---
 drivers/power/supply/sbs-battery.c | 25 +
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/drivers/power/supply/sbs-battery.c 
b/drivers/power/supply/sbs-battery.c
index 6acb4ea25d2a..2e32fde04628 100644
--- a/drivers/power/supply/sbs-battery.c
+++ b/drivers/power/supply/sbs-battery.c
@@ -878,10 +878,17 @@ static int sbs_get_property(struct power_supply *psy,
if (!chip->enable_detection)
goto done;
 
-   if (!chip->gpio_detect &&
-   chip->is_present != (ret >= 0)) {
-   sbs_update_presence(chip, (ret >= 0));
-   power_supply_changed(chip->power_supply);
+   if (!chip->gpio_detect && chip->is_present != (ret >= 0)) {
+   bool old_present = chip->is_present;
+   union power_supply_propval val;
+
+   ret = sbs_get_battery_presence_and_health(
+   client, POWER_SUPPLY_PROP_PRESENT, );
+
+   sbs_update_presence(chip, !ret && val.intval);
+
+   if (old_present != chip->is_present)
+   power_supply_changed(chip->power_supply);
}
 
 done:
@@ -1067,11 +1074,13 @@ static int sbs_probe(struct i2c_client *client)
 * to the battery.
 */
if (!(force_load || chip->gpio_detect)) {
-   rc = sbs_read_word_data(client, sbs_data[REG_STATUS].addr);
+   union power_supply_propval val;
 
-   if (rc < 0) {
-   dev_err(>dev, "%s: Failed to get device 
status\n",
-   __func__);
+   rc = sbs_get_battery_presence_and_health(
+   client, POWER_SUPPLY_PROP_PRESENT, );
+   if (rc < 0 || !val.intval) {
+   dev_err(>dev, "Failed to get present status\n");
+   rc = -ENODEV;
goto exit_psupply;
}
}
-- 
2.28.0.236.gb10cc79966-goog



[PATCH v3 0/2] power: supply: sbs-battery: fix presence check

2020-08-12 Thread Ikjoon Jang
When gpio detection is not supplied, presence state transitions depend
on every smbus transfer result from get_property(). This patch tries to
check battery presence again with well supported command before
state transition.

Changes:
v3: check return value of get_presence_and_health()
v2: combine get_presence_and_health functions to reuse

Ikjoon Jang (2):
  power: supply: sbs-battery: combine get_presence_and_health
  power: supply: sbs-battery: don't assume i2c errors as battery
disconnect

 drivers/power/supply/sbs-battery.c | 98 --
 1 file changed, 53 insertions(+), 45 deletions(-)

-- 
2.28.0.236.gb10cc79966-goog



Re: [PATCH] perf parse-events: Set exclude_guest for user-space counting

2020-08-12 Thread Jin, Yao

Hi Arnaldo,

On 8/12/2020 8:55 PM, Arnaldo Carvalho de Melo wrote:

Em Wed, Aug 12, 2020 at 09:15:04AM -0300, Arnaldo Carvalho de Melo escreveu:

Em Wed, Aug 12, 2020 at 02:59:53PM +0800, Jin Yao escreveu:

Currently if we run 'perf record -e cycles:u', exclude_guest is 0.

But it doesn't make sense that we request for user-space counting
but we also get the guest report.

To keep perf semantics consistent and clear, this patch sets
exclude_guest for user-space counting.


Applied, and also added this, that you should consider doing in the
future (modulo the "Committer testing:" header :) ):

Committer testing:

Before:

   # perf record -e cycles:u
   ^C[ perf record: Woken up 1 times to write data ]
   [ perf record: Captured and wrote 1.231 MB perf.data (91 samples) ]
   #
   # perf evlist -v
   cycles:u: size: 120, { sample_period, sample_freq }: 4000, sample_type: 
IP|TID|TIME|ID|CPU|PERIOD, read_format: ID, disabled: 1, inherit: 1, 
exclude_kernel: 1, exclude_hv: 1, freq: 1, sample_id_all: 1
   
   #

After:

   # perf record -e cycles:u
   ^C[ perf record: Woken up 1 times to write data ]
   [ perf record: Captured and wrote 1.263 MB perf.data (403 samples) ]
   #
   # perf evlist -v
   cycles:u: size: 120, { sample_period, sample_freq }: 4000, sample_type: 
IP|TID|TIME|ID|CPU|PERIOD, read_format: ID, disabled: 1, inherit: 1, 
exclude_kernel: 1, exclude_hv: 1, freq: 1, sample_id_all: 1, exclude_guest: 1
   #


Also, please run 'perf test', as this will require changes to some
expected perf_event_attr setups:

[root@quaco ~]# perf test "event definition"
  6: Parse event definition strings: FAILED!
[root@quaco ~]#

- Arnaldo



Sorry for the perf test failure! I will post v2 to fix this issue.

Thanks
Jin Yao


Re: Recursive/circular locking in serial8250_console_write/serial8250_do_startup

2020-08-12 Thread Guenter Roeck
On Wed, Aug 12, 2020 at 08:48:13AM -0700, Guenter Roeck wrote:
> Hi,
> 
> crbug.com/1114800 reports a hard lockup due to circular locking in the
> 8250 console driver. This is seen if CONFIG_PROVE_LOCKING is enabled.
> 
> Problem is as follows:
> - serial8250_do_startup() locks the serial (console) port.
> - serial8250_do_startup() then disables interrupts if interrupts are
>   shared, by calling disable_irq_nosync().
> - disable_irq_nosync() calls __irq_get_desc_lock() to lock the interrupt
>   descriptor.
> - __irq_get_desc_lock() calls lock_acquire()
> - If CONFIG_PROVE_LOCKING is enabled, validate_chain() and check_noncircular()
>   are called and identify a potential locking error.
> - This locking error is reported via printk, which ultimately calls
>   serial8250_console_write().
> - serial8250_console_write() tries to lock the serial console port.
>   Since it is already locked, the system hangs and ultimately reports
>   a hard lockup.
> 
> I understand we'll need to figure out and fix what lockdep complains about,
> and I am working on that. However, even if that is fixed, we'll need a
> solution for the recursive lock: Fixing the lockdep problem doesn't
> guarantee that a similar problem (or some other log message) won't be
> detected and reported sometime in the future while serial8250_do_startup()
> holds the console port lock.
> 
> Ideas, anyone ? Everything I came up with so far seems clumsy and hackish.
> 

Turns out the situation is a bit worse than I thought. disable_irq_nosync(),
when called from serial8250_do_startup(), locks the interrupt descriptor.
The order of locking is
serial port lock
  interrupt descriptor lock

At the same time, __setup_irq() locks the interrupt descriptor as well.
With the descriptor locked, it may report an error message using pr_err().
This in turn may call serial8250_console_write(), which will try to lock
the console serial port. The lock sequence is
interrupt descriptor lock
  serial port lock

I added the lockdep splat to the bug log at crbug.com/1114800.

Effectively, I think, this means we can't call disable_irq_nosync()
while holding a serial port lock, or at least not while holding a
serial port lock that is associated with a console.

The problem was introduced (or, rather, exposed) with upstream commit
7febbcbc48fc ("serial: 8250: Check UPF_IRQ_SHARED in advance").

Guenter


Re: [PATCH v17 1/3] dt-bindings: Add bindings for Mediatek matrix keypad

2020-08-12 Thread Yingjoe Chen
On Wed, 2020-08-12 at 15:13 -0700, Dmitry Torokhov wrote:
> Hi,
> 
> On Tue, Aug 11, 2020 at 09:47:23AM +0800, Yingjoe Chen wrote:
> > Hi,
> > 
> > 
> > On Mon, 2020-08-10 at 14:40 +0800, Fengping Yu wrote:
> > > From: "fengping.yu" 
> > > 
> > > This patch add devicetree bindings for Mediatek matrix keypad driver.
> > > 
> > > Signed-off-by: fengping.yu 
> > > ---
> > >  .../devicetree/bindings/input/mtk-kpd.yaml| 87 +++
> > >  1 file changed, 87 insertions(+)
> > >  create mode 100644 Documentation/devicetree/bindings/input/mtk-kpd.yaml
> > > 
> > > diff --git a/Documentation/devicetree/bindings/input/mtk-kpd.yaml 
> > > b/Documentation/devicetree/bindings/input/mtk-kpd.yaml
> > > new file mode 100644
> > > index ..d74dd8a6fbde
> > 
> > <...>
> > 
> > 
> > > +  keypad,num-columns:
> > > +description: Number of column lines connected to the keypad 
> > > controller,
> > > +it is not equal to PCB columns number, instead you should add 
> > > required value
> > > +for each IC. If not specified, the default value is 1.
> > > +
> > > +  keypad,num-rows:
> > > +description: Number of row lines connected to the keypad controller, 
> > > it is
> > > +not equal to PCB rows number, instead you should add required value 
> > > for each IC.
> > > +If not specified, the default value is 1.
> > 
> > Your source code can't really handle dts without rows/columns
> > properties. Also, the default value doesn't make any sense. No IC will
> > have rows or columns set to 1.
> > 
> > Since these are IC specified, not board specified, I think you should
> > just have the correct numbers in driver.
> 
> It is actually property of board to decide how many keys it wants to
> wire up. In extreme case it will be a single key, i.e. number of rows
> and columns will indeed be 1.
> 
> Thanks.
> 

From the binding "it is not equal to PCB columns number, instead you
should add required value for each IC."
Driver code use this to calculate bit position in register, which is IC
dependent.

Joe.C



Re: [RFC 7/7] KVM: VMX: Enable PKS for nested VM

2020-08-12 Thread Chenyi Qiang




On 8/11/2020 8:05 AM, Jim Mattson wrote:

On Fri, Aug 7, 2020 at 1:47 AM Chenyi Qiang  wrote:


PKS MSR passes through guest directly. Configure the MSR to match the
L0/L1 settings so that nested VM runs PKS properly.

Signed-off-by: Chenyi Qiang 
---
  arch/x86/kvm/vmx/nested.c | 32 
  arch/x86/kvm/vmx/vmcs12.c |  2 ++
  arch/x86/kvm/vmx/vmcs12.h |  6 +-
  arch/x86/kvm/vmx/vmx.c| 10 ++
  arch/x86/kvm/vmx/vmx.h|  1 +
  5 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index df2c2e733549..1f9823d21ecd 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -647,6 +647,12 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct 
kvm_vcpu *vcpu,
 MSR_IA32_PRED_CMD,
 MSR_TYPE_W);

+   if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PKRS))
+   nested_vmx_disable_intercept_for_msr(
+   msr_bitmap_l1, msr_bitmap_l0,
+   MSR_IA32_PKRS,
+   MSR_TYPE_R | MSR_TYPE_W);


What if L1 intercepts only *reads* of MSR_IA32_PKRS?


 kvm_vcpu_unmap(vcpu, _vmx(vcpu)->nested.msr_bitmap_map, false);

 return true;



@@ -2509,6 +2519,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct 
vmcs12 *vmcs12,
 if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending ||
 !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
 vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs);
+
+   if (kvm_cpu_cap_has(X86_FEATURE_PKS) &&


Is the above check superfluous? I would assume that the L1 guest can't
set VM_ENTRY_LOAD_IA32_PKRS unless this is true.



I enforce this check to ensure vmcs_write to the Guest_IA32_PKRS without 
error. if deleted, vmcs_write to GUEST_IA32_PKRS may executed when PKS 
is unsupported.



+   (!vmx->nested.nested_run_pending ||
+!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS)))
+   vmcs_write64(GUEST_IA32_PKRS, vmx->nested.vmcs01_guest_pkrs);


This doesn't seem right to me. On the target of a live migration, with
L2 active at the time the snapshot was taken (i.e.,
vmx->nested.nested_run_pending=0), it looks like we're going to try to
overwrite the current L2 PKRS value with L1's PKRS value (except that
in this situation, vmx->nested.vmcs01_guest_pkrs should actually be
0). Am I missing something?



We overwrite the L2 PKRS with L1's value when L2 doesn't support PKS. 
Because the L1's VM_ENTRY_LOAD_IA32_PKRS is off, we need to migrate L1's 
PKRS to L2.



 vmx_set_rflags(vcpu, vmcs12->guest_rflags);

 /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the




@@ -3916,6 +3943,8 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu 
*vcpu,
 vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS);
 if (kvm_mpx_supported())
 vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+   if (kvm_cpu_cap_has(X86_FEATURE_PKS))


Shouldn't we be checking to see if the *virtual* CPU supports PKS
before writing anything into vmcs12->guest_ia32_pkrs?



Yes, It's reasonable.


+   vmcs12->guest_ia32_pkrs = vmcs_read64(GUEST_IA32_PKRS);

 vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false;
  }


Re: [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes

2020-08-12 Thread David Ahern
On 8/12/20 1:51 AM, Andrei Vagin wrote:
> 
> I rebased the task_diag patches on top of v5.8:
> https://github.com/avagin/linux-task-diag/tree/v5.8-task-diag

Thanks for updating the patches.

> 
> /proc/pid files have three major limitations:
> * Requires at least three syscalls per process per file
>   open(), read(), close()
> * Variety of formats, mostly text based
>   The kernel spent time to encode binary data into a text format and
>   then tools like top and ps spent time to decode them back to a binary
>   format.
> * Sometimes slow due to extra attributes
>   For example, /proc/PID/smaps contains a lot of useful informations
>   about memory mappings and memory consumption for each of them. But
>   even if we don't need memory consumption fields, the kernel will
>   spend time to collect this information.

that's what I recall as well.

> 
> More details and numbers are in this article:
> https://avagin.github.io/how-fast-is-procfs
> 
> This new interface doesn't have only one of these limitations, but
> task_diag doesn't have all of them.
> 
> And I compared how fast each of these interfaces:
> 
> The test environment:
> CPU: Intel(R) Core(TM) i5-6300U CPU @ 2.40GHz
> RAM: 16GB
> kernel: v5.8 with task_diag and /proc/all patches.
> 100K processes:
> $ ps ax | wc -l
> 10228

100k processes but showing 10k here??

> 
> $ time cat /proc/all/status > /dev/null
> 
> real  0m0.577s
> user  0m0.017s
> sys   0m0.559s
> 
> task_proc_all is used to read /proc/pid/status for all tasks:
> https://github.com/avagin/linux-task-diag/blob/master/tools/testing/selftests/task_diag/task_proc_all.c
> 
> $ time ./task_proc_all status
> tasks: 100230
> 
> real  0m0.924s
> user  0m0.054s
> sys   0m0.858s
> 
> 
> /proc/all/status is about 40% faster than /proc/*/status.
> 
> Now let's take a look at the perf output:
> 
> $ time perf record -g cat /proc/all/status > /dev/null
> $ perf report
> -   98.08% 1.38%  cat  [kernel.vmlinux]  [k] entry_SYSCALL_64
>- 96.70% entry_SYSCALL_64
>   - do_syscall_64
>  - 94.97% ksys_read
> - 94.80% vfs_read
>- 94.58% proc_reg_read
>   - seq_read
>  - 87.95% proc_pid_status
> + 13.10% seq_put_decimal_ull_width
> - 11.69% task_mem
>+ 9.48% seq_put_decimal_ull_width
> + 10.63% seq_printf
> - 10.35% cpuset_task_status_allowed
>+ seq_printf
> - 9.84% render_sigset_t
>  1.61% seq_putc
>+ 1.61% seq_puts
> + 4.99% proc_task_name
> + 4.11% seq_puts
> - 3.76% render_cap_t
>  2.38% seq_put_hex_ll
>+ 1.25% seq_puts
>   2.64% __task_pid_nr_ns
> + 1.54% get_task_mm
> + 1.34% __lock_task_sighand
> + 0.70% from_kuid_munged
>   0.61% get_task_cred
>   0.56% seq_putc
>   0.52% hugetlb_report_usage
>   0.52% from_kgid_munged
>  + 4.30% proc_all_next
>  + 0.82% _copy_to_user 
> 
> We can see that the kernel spent more than 50% of the time to encode binary
> data into a text format.
> 
> Now let's see how fast task_diag:
> 
> $ time ./task_diag_all all -c -q
> 
> real  0m0.087s
> user  0m0.001s
> sys   0m0.082s
> 
> Maybe we need resurrect the task_diag series instead of inventing
> another less-effective interface...

I think the netlink message design is the better way to go. As system
sizes continue to increase (> 100 cpus is common now) you need to be
able to pass the right data to userspace as fast as possible to keep up
with what can be a very dynamic userspace and set of processes.

When you first proposed this idea I was working on systems with >= 1k
cpus and the netlink option was able to keep up with a 'make -j N' on
those systems. `perf record` walking /proc would never finish
initializing - I had to add a "done initializing" message to know when
to start a test. With the task_diag approach, perf could collect the
data in short order and move on to recording data.



[PATCH] selftests: rtnetlink: load fou module for kci_test_encap_fou()

2020-08-12 Thread Po-Hsu Lin
The kci_test_encap_fou() test from kci_test_encap() in rtnetlink.sh
needs the fou module to work. Otherwise it will fail with:

  $ ip netns exec "$testns" ip fou add port  ipproto 47
  RTNETLINK answers: No such file or directory
  Error talking to the kernel

Add the CONFIG_NET_FOU into the config file as well.

Signed-off-by: Po-Hsu Lin 
---
 tools/testing/selftests/net/config   | 1 +
 tools/testing/selftests/net/rtnetlink.sh | 6 ++
 2 files changed, 7 insertions(+)

diff --git a/tools/testing/selftests/net/config 
b/tools/testing/selftests/net/config
index 3b42c06b..96d2763 100644
--- a/tools/testing/selftests/net/config
+++ b/tools/testing/selftests/net/config
@@ -31,3 +31,4 @@ CONFIG_NET_SCH_ETF=m
 CONFIG_NET_SCH_NETEM=y
 CONFIG_TEST_BLACKHOLE_DEV=m
 CONFIG_KALLSYMS=y
+CONFIG_NET_FOU
diff --git a/tools/testing/selftests/net/rtnetlink.sh 
b/tools/testing/selftests/net/rtnetlink.sh
index bdbf4b3..7931b65 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -521,6 +521,11 @@ kci_test_encap_fou()
return $ksft_skip
fi
 
+   if ! /sbin/modprobe -q -n fou; then
+   echo "SKIP: module fou is not found"
+   return $ksft_skip
+   fi
+   /sbin/modprobe -q fou
ip -netns "$testns" fou add port  ipproto 47 2>/dev/null
if [ $? -ne 0 ];then
echo "FAIL: can't add fou port , skipping test"
@@ -541,6 +546,7 @@ kci_test_encap_fou()
return 1
fi
 
+   /sbin/modprobe -q -r fou
echo "PASS: fou"
 }
 
-- 
2.7.4



drivers/char/agp/i460-agp.c:254:19: sparse: expected void gatt

2020-08-12 Thread kernel test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 
master
head:   dc06fe51d26efc100ac74121607c01a454867c91
commit: df41017eafd267c08acbfff99d34e4f96bbfbc92 ia64: remove support for 
machvecs
date:   12 months ago
config: ia64-randconfig-s031-20200813 (attached as .config)
compiler: ia64-linux-gcc (GCC) 9.3.0
reproduce:
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# apt-get install sparse
# sparse version: v0.6.2-168-g9554805c-dirty
git checkout df41017eafd267c08acbfff99d34e4f96bbfbc92
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross C=1 
CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=ia64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 


sparse warnings: (new ones prefixed by >>)

   drivers/char/agp/i460-agp.c:254:19: sparse: sparse: incorrect type in 
assignment (different address spaces) @@ expected void *static [assigned] 
[toplevel] gatt @@ got void [noderef]  * @@
>> drivers/char/agp/i460-agp.c:254:19: sparse: expected void *static 
>> [assigned] [toplevel] gatt
   drivers/char/agp/i460-agp.c:254:19: sparse: got void [noderef]  *
   drivers/char/agp/i460-agp.c:266:17: sparse: sparse: incorrect type in 
argument 2 (different address spaces) @@ expected void volatile [noderef] 
 *addr @@ got unsigned int [usertype] * @@
   drivers/char/agp/i460-agp.c:266:17: sparse: expected void volatile 
[noderef]  *addr
>> drivers/char/agp/i460-agp.c:266:17: sparse: got unsigned int [usertype] *
   drivers/char/agp/i460-agp.c:267:9: sparse: sparse: incorrect type in 
argument 1 (different address spaces) @@ expected void const volatile 
[noderef]  *addr @@ got unsigned int [usertype] * @@
   drivers/char/agp/i460-agp.c:267:9: sparse: expected void const volatile 
[noderef]  *addr
   drivers/char/agp/i460-agp.c:267:9: sparse: got unsigned int [usertype] *
   drivers/char/agp/i460-agp.c:281:17: sparse: sparse: incorrect type in 
argument 2 (different address spaces) @@ expected void volatile [noderef] 
 *addr @@ got unsigned int [usertype] * @@
   drivers/char/agp/i460-agp.c:281:17: sparse: expected void volatile 
[noderef]  *addr
   drivers/char/agp/i460-agp.c:281:17: sparse: got unsigned int [usertype] *
   drivers/char/agp/i460-agp.c:282:9: sparse: sparse: incorrect type in 
argument 1 (different address spaces) @@ expected void const volatile 
[noderef]  *addr @@ got unsigned int [usertype] * @@
   drivers/char/agp/i460-agp.c:282:9: sparse: expected void const volatile 
[noderef]  *addr
   drivers/char/agp/i460-agp.c:282:9: sparse: got unsigned int [usertype] *
   drivers/char/agp/i460-agp.c:284:21: sparse: sparse: incorrect type in 
argument 1 (different address spaces) @@ expected void volatile [noderef] 
 *addr @@ got void *static [assigned] [toplevel] gatt @@
   drivers/char/agp/i460-agp.c:284:21: sparse: expected void volatile 
[noderef]  *addr
>> drivers/char/agp/i460-agp.c:284:21: sparse: got void *static [assigned] 
>> [toplevel] gatt
   drivers/char/agp/i460-agp.c:318:22: sparse: sparse: incorrect type in 
argument 1 (different address spaces) @@ expected void const volatile 
[noderef]  *addr @@ got unsigned int [usertype] * @@
   drivers/char/agp/i460-agp.c:318:22: sparse: expected void const volatile 
[noderef]  *addr
   drivers/char/agp/i460-agp.c:318:22: sparse: got unsigned int [usertype] *
   drivers/char/agp/i460-agp.c:318:22: sparse: sparse: incorrect type in 
argument 1 (different address spaces) @@ expected void const volatile 
[noderef]  *addr @@ got unsigned int [usertype] * @@
   drivers/char/agp/i460-agp.c:318:22: sparse: expected void const volatile 
[noderef]  *addr
   drivers/char/agp/i460-agp.c:318:22: sparse: got unsigned int [usertype] *
   drivers/char/agp/i460-agp.c:319:25: sparse: sparse: incorrect type in 
argument 1 (different address spaces) @@ expected void const volatile 
[noderef]  *addr @@ got unsigned int [usertype] * @@
   drivers/char/agp/i460-agp.c:319:25: sparse: expected void const volatile 
[noderef]  *addr
   drivers/char/agp/i460-agp.c:319:25: sparse: got unsigned int [usertype] *
   drivers/char/agp/i460-agp.c:330:25: sparse: sparse: incorrect type in 
argument 2 (different address spaces) @@ expected void volatile [noderef] 
 *addr @@ got unsigned int [usertype] * @@
   drivers/char/agp/i460-agp.c:330:25: sparse: expected void volatile 
[noderef]  *addr
   drivers/char/agp/i460-agp.c:330:25: sparse: got unsigned int [usertype] *
   drivers/char/agp/i460-agp.c:332:9: sparse: sparse: incorrect type in 
argument 1 (different address spaces) @@ expected void const volatile 
[noderef]  *addr @@ got unsigned int [usertype] * @@
   

Re: [PATCH RFC v2 00/18] Add VFIO mediated device support and DEV-MSI support for the idxd driver

2020-08-12 Thread Jason Wang



On 2020/8/12 下午12:05, Tian, Kevin wrote:

The problem is that if we tie all controls via VFIO uAPI, the other
subsystem like vDPA is likely to duplicate them. I wonder if there is a
way to decouple the vSVA out of VFIO uAPI?

vSVA is a per-device (either pdev or mdev) feature thus naturally should
be managed by its device driver (VFIO or vDPA). From this angle some
duplication is inevitable given VFIO and vDPA are orthogonal passthrough
frameworks. Within the kernel the majority of vSVA handling is done by
IOMMU and IOASID modules thus most logic are shared.



So why not introduce vSVA uAPI at IOMMU or IOASID layer?





   If an userspace DMA interface can be easily
adapted to be a passthrough one, it might be the choice.

It's not that easy even for VFIO which requires a lot of new uAPIs and
infrastructures(e.g mdev) to be invented.



But for idxd,
we see mdev a much better fit here, given the big difference between
what userspace DMA requires and what guest driver requires in this hw.

A weak point for mdev is that it can't serve kernel subsystem other than
VFIO. In this case, you need some other infrastructures (like [1]) to do
this.

mdev is not exclusive from kernel usages. It's perfectly fine for a driver
to reserve some work queues for host usages, while wrapping others
into mdevs.



I meant you may want slices to be an independent device from the kernel 
point of view:


E.g for ethernet devices, you may want 10K mdevs to be passed to guest.

Similarly, you may want 10K net devices which is connected to the kernel 
networking subsystems.


In this case it's not simply reserving queues but you need some other 
type of device abstraction. There could be some kind of duplication 
between this and mdev.


Thanks




Thanks
Kevin





Re: [PATCH] opp: Fix dev_pm_opp_set_rate() to not return early

2020-08-12 Thread Viresh Kumar
On 11-08-20, 14:09, Stephen Boyd wrote:
> This is a goto maze! Any chance we can clean this up?

I have sent a short series in reply to this series, please have a
look. It should look better now.

-- 
viresh


[PATCH V2 1/4] opp: Enable resources again if they were disabled earlier

2020-08-12 Thread Viresh Kumar
From: Rajendra Nayak 

dev_pm_opp_set_rate() can now be called with freq = 0 in order
to either drop performance or bandwidth votes or to disable
regulators on platforms which support them.

In such cases, a subsequent call to dev_pm_opp_set_rate() with
the same frequency ends up returning early because 'old_freq == freq'

Instead make it fall through and put back the dropped performance
and bandwidth votes and/or enable back the regulators.

Cc: v5.3+  # v5.3+
Fixes: cd7ea582 ("opp: Make dev_pm_opp_set_rate() handle freq = 0 to drop 
performance votes")
Reported-by: Sajida Bhanu 
Reviewed-by: Sibi Sankar 
Reported-by: Matthias Kaehlcke 
Tested-by: Matthias Kaehlcke 
Signed-off-by: Rajendra Nayak 
[ Viresh: Don't skip clk_set_rate() and massaged changelog ]
Signed-off-by: Viresh Kumar 
---
Hi Rajendra,

I wasn't able to test this stuff, please give it a try. I have
simplified your patch and cleaned up a bunch of stuff as well.

 drivers/opp/core.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index bdb028c7793d..9668ea04cc80 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -934,10 +934,13 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long 
target_freq)
 
/* Return early if nothing to do */
if (old_freq == freq) {
-   dev_dbg(dev, "%s: old/new frequencies (%lu Hz) are same, 
nothing to do\n",
-   __func__, freq);
-   ret = 0;
-   goto put_opp_table;
+   if (!opp_table->required_opp_tables && !opp_table->regulators &&
+   !opp_table->paths) {
+   dev_dbg(dev, "%s: old/new frequencies (%lu Hz) are 
same, nothing to do\n",
+   __func__, freq);
+   ret = 0;
+   goto put_opp_table;
+   }
}
 
/*
-- 
2.14.1



[PATCH 2/4] opp: Track device's resources configuration status

2020-08-12 Thread Viresh Kumar
The OPP core needs to track if the resources of devices are enabled or
configured or not, as it disables the resources when target_freq is set
to 0.

Handle that with a separate variable to make it easy to maintain.

Also note that we will unconditionally call clk_set_rate() in the case
where the resources are getting enabled again. This shouldn't have any
side effects anyway.

Signed-off-by: Viresh Kumar 
---
 drivers/opp/core.c | 37 ++---
 drivers/opp/opp.h  |  2 ++
 2 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 9668ea04cc80..e8882e7fd8a5 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -888,22 +888,18 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long 
target_freq)
}
 
if (unlikely(!target_freq)) {
+   ret = 0;
+
+   if (!opp_table->enabled)
+   goto put_opp_table;
+
/*
 * Some drivers need to support cases where some platforms may
 * have OPP table for the device, while others don't and
 * opp_set_rate() just needs to behave like clk_set_rate().
 */
-   if (!_get_opp_count(opp_table)) {
-   ret = 0;
-   goto put_opp_table;
-   }
-
-   if (!opp_table->required_opp_tables && !opp_table->regulators &&
-   !opp_table->paths) {
-   dev_err(dev, "target frequency can't be 0\n");
-   ret = -EINVAL;
+   if (!_get_opp_count(opp_table))
goto put_opp_table;
-   }
 
ret = _set_opp_bw(opp_table, NULL, dev, true);
if (ret)
@@ -915,6 +911,9 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long 
target_freq)
}
 
ret = _set_required_opps(dev, opp_table, NULL);
+   if (!ret)
+   opp_table->enabled = false;
+
goto put_opp_table;
}
 
@@ -933,14 +932,11 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long 
target_freq)
old_freq = clk_get_rate(clk);
 
/* Return early if nothing to do */
-   if (old_freq == freq) {
-   if (!opp_table->required_opp_tables && !opp_table->regulators &&
-   !opp_table->paths) {
-   dev_dbg(dev, "%s: old/new frequencies (%lu Hz) are 
same, nothing to do\n",
-   __func__, freq);
-   ret = 0;
-   goto put_opp_table;
-   }
+   if (opp_table->enabled && old_freq == freq) {
+   dev_dbg(dev, "%s: old/new frequencies (%lu Hz) are same, 
nothing to do\n",
+   __func__, freq);
+   ret = 0;
+   goto put_opp_table;
}
 
/*
@@ -1001,8 +997,11 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long 
target_freq)
dev_err(dev, "Failed to set required opps: %d\n", ret);
}
 
-   if (!ret)
+   if (!ret) {
ret = _set_opp_bw(opp_table, opp, dev, false);
+   if (!ret)
+   opp_table->enabled = true;
+   }
 
 put_opp:
dev_pm_opp_put(opp);
diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
index e51646ff279e..bd35802acc6e 100644
--- a/drivers/opp/opp.h
+++ b/drivers/opp/opp.h
@@ -152,6 +152,7 @@ enum opp_table_access {
  * property).
  * @paths: Interconnect path handles
  * @path_count: Number of interconnect paths
+ * @enabled: Set to true if the device's resources are enabled/configured.
  * @genpd_performance_state: Device's power domain support performance state.
  * @is_genpd: Marks if the OPP table belongs to a genpd.
  * @set_opp: Platform specific set_opp callback
@@ -198,6 +199,7 @@ struct opp_table {
bool regulator_enabled;
struct icc_path **paths;
unsigned int path_count;
+   bool enabled;
bool genpd_performance_state;
bool is_genpd;
 
-- 
2.14.1



[PATCH 3/4] opp: Reused enabled flag and remove regulator_enabled

2020-08-12 Thread Viresh Kumar
The common "enabled" flag can be used here instead of
"regulator_enabled" now.

Signed-off-by: Viresh Kumar 
---
 drivers/opp/core.c | 13 +++--
 drivers/opp/opp.h  |  2 --
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index e8882e7fd8a5..5f5da257f58a 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -703,12 +703,10 @@ static int _generic_set_opp_regulator(struct opp_table 
*opp_table,
 * Enable the regulator after setting its voltages, otherwise it breaks
 * some boot-enabled regulators.
 */
-   if (unlikely(!opp_table->regulator_enabled)) {
+   if (unlikely(!opp_table->enabled)) {
ret = regulator_enable(reg);
if (ret < 0)
dev_warn(dev, "Failed to enable regulator: %d", ret);
-   else
-   opp_table->regulator_enabled = true;
}
 
return 0;
@@ -905,10 +903,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long 
target_freq)
if (ret)
goto put_opp_table;
 
-   if (opp_table->regulator_enabled) {
-   regulator_disable(opp_table->regulators[0]);
-   opp_table->regulator_enabled = false;
-   }
+   regulator_disable(opp_table->regulators[0]);
 
ret = _set_required_opps(dev, opp_table, NULL);
if (!ret)
@@ -1795,11 +1790,9 @@ void dev_pm_opp_put_regulators(struct opp_table 
*opp_table)
/* Make sure there are no concurrent readers while updating opp_table */
WARN_ON(!list_empty(_table->opp_list));
 
-   if (opp_table->regulator_enabled) {
+   if (opp_table->enabled) {
for (i = opp_table->regulator_count - 1; i >= 0; i--)
regulator_disable(opp_table->regulators[i]);
-
-   opp_table->regulator_enabled = false;
}
 
for (i = opp_table->regulator_count - 1; i >= 0; i--)
diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h
index bd35802acc6e..0c3de3f6db5c 100644
--- a/drivers/opp/opp.h
+++ b/drivers/opp/opp.h
@@ -147,7 +147,6 @@ enum opp_table_access {
  * @clk: Device's clock handle
  * @regulators: Supply regulators
  * @regulator_count: Number of power supply regulators. Its value can be -1
- * @regulator_enabled: Set to true if regulators were previously enabled.
  * (uninitialized), 0 (no opp-microvolt property) or > 0 (has opp-microvolt
  * property).
  * @paths: Interconnect path handles
@@ -196,7 +195,6 @@ struct opp_table {
struct clk *clk;
struct regulator **regulators;
int regulator_count;
-   bool regulator_enabled;
struct icc_path **paths;
unsigned int path_count;
bool enabled;
-- 
2.14.1



[PATCH 4/4] opp: Split out _opp_set_rate_zero()

2020-08-12 Thread Viresh Kumar
Create separate routine _opp_set_rate_zero() to handle !target_freq
case.

Signed-off-by: Viresh Kumar 
---
 drivers/opp/core.c | 52 +---
 1 file changed, 29 insertions(+), 23 deletions(-)

diff --git a/drivers/opp/core.c b/drivers/opp/core.c
index 5f5da257f58a..e198b57efcf8 100644
--- a/drivers/opp/core.c
+++ b/drivers/opp/core.c
@@ -860,6 +860,34 @@ int dev_pm_opp_set_bw(struct device *dev, struct 
dev_pm_opp *opp)
 }
 EXPORT_SYMBOL_GPL(dev_pm_opp_set_bw);
 
+static int _opp_set_rate_zero(struct device *dev, struct opp_table *opp_table)
+{
+   int ret;
+
+   if (!opp_table->enabled)
+   return 0;
+
+   /*
+* Some drivers need to support cases where some platforms may
+* have OPP table for the device, while others don't and
+* opp_set_rate() just needs to behave like clk_set_rate().
+*/
+   if (!_get_opp_count(opp_table))
+   return 0;
+
+   ret = _set_opp_bw(opp_table, NULL, dev, true);
+   if (ret)
+   return ret;
+
+   regulator_disable(opp_table->regulators[0]);
+
+   ret = _set_required_opps(dev, opp_table, NULL);
+   if (!ret)
+   opp_table->enabled = false;
+
+   return ret;
+}
+
 /**
  * dev_pm_opp_set_rate() - Configure new OPP based on frequency
  * @dev:device for which we do this operation
@@ -886,29 +914,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long 
target_freq)
}
 
if (unlikely(!target_freq)) {
-   ret = 0;
-
-   if (!opp_table->enabled)
-   goto put_opp_table;
-
-   /*
-* Some drivers need to support cases where some platforms may
-* have OPP table for the device, while others don't and
-* opp_set_rate() just needs to behave like clk_set_rate().
-*/
-   if (!_get_opp_count(opp_table))
-   goto put_opp_table;
-
-   ret = _set_opp_bw(opp_table, NULL, dev, true);
-   if (ret)
-   goto put_opp_table;
-
-   regulator_disable(opp_table->regulators[0]);
-
-   ret = _set_required_opps(dev, opp_table, NULL);
-   if (!ret)
-   opp_table->enabled = false;
-
+   ret = _opp_set_rate_zero(dev, opp_table);
goto put_opp_table;
}
 
-- 
2.14.1



Re: [RFC PATCH 00/16] Core scheduling v6

2020-08-12 Thread Li, Aubrey
On 2020/8/13 7:08, Joel Fernandes wrote:
> On Wed, Aug 12, 2020 at 10:01:24AM +0800, Li, Aubrey wrote:
>> Hi Joel,
>>
>> On 2020/8/10 0:44, Joel Fernandes wrote:
>>> Hi Aubrey,
>>>
>>> Apologies for replying late as I was still looking into the details.
>>>
>>> On Wed, Aug 05, 2020 at 11:57:20AM +0800, Li, Aubrey wrote:
>>> [...]
 +/*
 + * Core scheduling policy:
 + * - CORE_SCHED_DISABLED: core scheduling is disabled.
 + * - CORE_COOKIE_MATCH: tasks with same cookie can run
 + * on the same core concurrently.
 + * - CORE_COOKIE_TRUST: trusted task can run with kernel
thread on the same core concurrently. 
 + * - CORE_COOKIE_LONELY: tasks with cookie can run only
 + * with idle thread on the same core.
 + */
 +enum coresched_policy {
 +   CORE_SCHED_DISABLED,
 +   CORE_SCHED_COOKIE_MATCH,
 +  CORE_SCHED_COOKIE_TRUST,
 +   CORE_SCHED_COOKIE_LONELY,
 +};

 We can set policy to CORE_COOKIE_TRUST of uperf cgroup and fix this kind
 of performance regression. Not sure if this sounds attractive?
>>>
>>> Instead of this, I think it can be something simpler IMHO:
>>>
>>> 1. Consider all cookie-0 task as trusted. (Even right now, if you apply the
>>>core-scheduling patchset, such tasks will share a core and sniff on each
>>>other. So let us not pretend that such tasks are not trusted).
>>>
>>> 2. All kernel threads and idle task would have a cookie 0 (so that will 
>>> cover
>>>ksoftirqd reported in your original issue).
>>>
>>> 3. Add a config option (CONFIG_SCHED_CORE_DEFAULT_TASKS_UNTRUSTED). Default
>>>enable it. Setting this option would tag all tasks that are forked from a
>>>cookie-0 task with their own cookie. Later on, such tasks can be added to
>>>a group. This cover's PeterZ's ask about having 'default untrusted').
>>>(Users like ChromeOS that don't want to userspace system processes to be
>>>tagged can disable this option so such tasks will be cookie-0).
>>>
>>> 4. Allow prctl/cgroup interfaces to create groups of tasks and override the
>>>above behaviors.
>>
>> How does uperf in a cgroup work with ksoftirqd? Are you suggesting I set 
>> uperf's
>> cookie to be cookie-0 via prctl?
> 
> Yes, but let me try to understand better. There are 2 problems here I think:
> 
> 1. ksoftirqd getting idled when HT is turned on, because uperf is sharing a
> core with it: This should not be any worse than SMT OFF, because even SMT OFF
> would also reduce ksoftirqd's CPU time just core sched is doing. Sure
> core-scheduling adds some overhead with IPIs but such a huge drop of perf is
> strange. Peter any thoughts on that?
> 
> 2. Interface: To solve the performance problem, you are saying you want uperf
> to share a core with ksoftirqd so that it is not forced into idle.  Why not
> just keep uperf out of the cgroup?

I guess this is unacceptable for who runs their apps in container and vm.

Thanks,
-Aubrey

> Then it will have cookie 0 and be able to
> share core with kernel threads. About user-user isolation that you need, if
> you tag any "untrusted" threads by adding it to CGroup, then there will
> automatically isolated from uperf while allowing uperf to share CPU with
> kernel threads.
> 
> Please let me know your thoughts and thanks,
> 
>  - Joel
> 
>>
>> Thanks,
>> -Aubrey
>>>
>>> 5. Document everything clearly so the semantics are clear both to the
>>>developers of core scheduling and to system administrators.
>>>
>>> Note that, with the concept of "system trusted cookie", we can also do
>>> optimizations like:
>>> 1. Disable STIBP when switching into trusted tasks.
>>> 2. Disable L1D flushing / verw stuff for L1TF/MDS issues, when switching 
>>> into
>>>trusted tasks.
>>>
>>> At least #1 seems to be biting enabling HT on ChromeOS right now, and one
>>> other engineer requested I do something like #2 already.
>>>
>>> Once we get full-syscall isolation working, threads belonging to a process
>>> can also share a core so those can just share a core with the task-group
>>> leader.
>>>
> Is the uperf throughput worse with SMT+core-scheduling versus no-SMT ?

 This is a good question, from the data we measured by uperf,
 SMT+core-scheduling is 28.2% worse than no-SMT, :(
>>>
>>> This is worrying for sure. :-(. We ought to debug/profile it more to see 
>>> what
>>> is causing the overhead. Me/Vineeth added it as a topic for LPC as well.
>>>
>>> Any other thoughts from others on this?
>>>
>>> thanks,
>>>
>>>  - Joel
>>>
>>>
> thanks,
>
>  - Joel
> PS: I am planning to write a patch behind a CONFIG option that tags
> all processes (default untrusted) so everything gets a cookie which
> some folks said was how they wanted (have a whitelist instead of
> blacklist).
>

>>



[rcu:dev.2020.08.11a 110/112] ERROR: modpost: "rcu_read_unlock_strict" undefined!

2020-08-12 Thread kernel test robot
Hi Paul,

First bad commit (maybe != root cause):

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git 
dev.2020.08.11a
head:   9030fc13639a71b15466520f09ca07b80fc30ed1
commit: 601cd69fa4bb0127bb774cc852203fd9fdf5e269 [110/112] rcuperf: Change 
rcuperf to rcuscale
config: riscv-allmodconfig (attached as .config)
compiler: riscv64-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
git checkout 601cd69fa4bb0127bb774cc852203fd9fdf5e269
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross 
ARCH=riscv 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All errors (new ones prefixed by >>, old ones prefixed by <<):

ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/wireless/intel/iwlegacy/iwl4965.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/wireless/intel/iwlegacy/iwlegacy.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/wireless/ath/ath11k/ath11k.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/wireless/ath/wcn36xx/wcn36xx.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/wireless/ath/ath10k/ath10k_core.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/wireless/ath/carl9170/carl9170.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/wireless/ath/ath9k/ath9k_htc.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/wireless/ath/ath9k/ath9k_hw.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/wireless/ath/ath9k/ath9k.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/wan/lapbether.ko] 
undefined!
ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/wan/hdlc_cisco.ko] 
undefined!
ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/hamradio/bpqether.ko] 
undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/via/via-velocity.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/ti/ti_cpsw_new.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/ti/ti_cpsw.ko] 
undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/socionext/netsec.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/sfc/sfc.ko] 
undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/qlogic/qede/qede.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/qlogic/netxen/netxen_nic.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/qlogic/qlcnic/qlcnic.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/netronome/nfp/nfp.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/mscc/mscc_ocelot_common.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/mellanox/mlxsw/mlxsw_spectrum.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/mellanox/mlxsw/mlxsw_core.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/mellanox/mlx4/mlx4_en.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/mellanox/mlx4/mlx4_core.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/marvell/mvneta.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/intel/ice/ice.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/intel/fm10k/fm10k.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/intel/iavf/iavf.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/intel/i40e/i40e.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/intel/ixgbevf/ixgbevf.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/intel/ixgbe/ixgbe.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/intel/igc/igc.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/intel/igb/igb.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/chelsio/cxgb4/cxgb4.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/chelsio/cxgb3/cxgb3.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/cavium/thunder/nicvf.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/broadcom/bnxt/bnxt_en.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/broadcom/bnx2x/bnx2x.ko] undefined!
ERROR: modpost: "rcu_read_unlock_strict" 
[drivers/net/ethernet/broadcom/cnic.ko] 

Re: KMSAN: uninit-value in joydev_connect

2020-08-12 Thread syzbot
syzbot has found a reproducer for the following issue on:

HEAD commit:ce8056d1 wip: changed copy_from_user where instrumented
git tree:   https://github.com/google/kmsan.git master
console output: https://syzkaller.appspot.com/x/log.txt?x=127b9fba90
kernel config:  https://syzkaller.appspot.com/x/.config?x=3afe005fb99591f
dashboard link: https://syzkaller.appspot.com/bug?extid=6a1bb5a33a0b128085bc
compiler:   clang version 10.0.0 (https://github.com/llvm/llvm-project/ 
c2443155a0fb245c8f17f2c1c72b6ea391e86e81)
syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=1472674e90
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=15a1339a90

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+6a1bb5a33a0b12808...@syzkaller.appspotmail.com

usb 1-1: config 0 interface 219 altsetting 0 endpoint 0xA has invalid 
wMaxPacketSize 0
usb 1-1: New USB device found, idVendor=078c, idProduct=1002, bcdDevice=e6.47
usb 1-1: New USB device strings: Mfr=0, Product=0, SerialNumber=0
usb 1-1: config 0 descriptor??
gtco 1-1:0.219: Collection level already at zero
input: GTCO_CalComp as 
/devices/platform/dummy_hcd.0/usb1/1-1/1-1:0.219/input/input5
=
BUG: KMSAN: uninit-value in joydev_connect+0x10c0/0x1920 
drivers/input/joydev.c:958
CPU: 1 PID: 27 Comm: kworker/1:1 Not tainted 5.8.0-rc5-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
01/01/2011
Workqueue: usb_hub_wq hub_event
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x21c/0x280 lib/dump_stack.c:118
 kmsan_report+0xf7/0x1e0 mm/kmsan/kmsan_report.c:121
 __msan_warning+0x58/0xa0 mm/kmsan/kmsan_instr.c:215
 joydev_connect+0x10c0/0x1920 drivers/input/joydev.c:958
 input_attach_handler drivers/input/input.c:1031 [inline]
 input_register_device+0x1d7b/0x21c0 drivers/input/input.c:2229
 gtco_probe+0x32ce/0x39b0 drivers/input/tablet/gtco.c:990
 usb_probe_interface+0xece/0x1550 drivers/usb/core/driver.c:374
 really_probe+0xf20/0x20b0 drivers/base/dd.c:529
 driver_probe_device+0x293/0x390 drivers/base/dd.c:701
 __device_attach_driver+0x63f/0x830 drivers/base/dd.c:807
 bus_for_each_drv+0x2ca/0x3f0 drivers/base/bus.c:431
 __device_attach+0x4e2/0x7f0 drivers/base/dd.c:873
 devic



Re: [PATCH v2 2/2] power: supply: sbs-battery: don't assume every i2c errors as battery disconnect

2020-08-12 Thread Nicolas Boichat
On Tue, Aug 11, 2020 at 2:59 PM Ikjoon Jang  wrote:
>
> Current sbs-battery considers all smbus errors as diconnection events

disconnection

> when battery-detect pin isn't supplied, and restored to present state back
> on any successful transaction were made.

when any... is made

>
> This can leads

lead

> to unlimited state changes between present and !present
> when one unsupported command was requested and other following commands
> were successful, e.g. udev rules tries to read multiple properties.
>
> This patch checks battery presence by reading known good command to
> check battery existence.
>
> Signed-off-by: Ikjoon Jang 
> ---
> v2: fix return value checking of sbs_get_battery_presence_and_health()
> ---
>  drivers/power/supply/sbs-battery.c | 26 +-
>  1 file changed, 17 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/power/supply/sbs-battery.c 
> b/drivers/power/supply/sbs-battery.c
> index 6acb4ea25d2a..db964a470ebc 100644
> --- a/drivers/power/supply/sbs-battery.c
> +++ b/drivers/power/supply/sbs-battery.c
> @@ -878,10 +878,17 @@ static int sbs_get_property(struct power_supply *psy,
> if (!chip->enable_detection)
> goto done;
>
> -   if (!chip->gpio_detect &&
> -   chip->is_present != (ret >= 0)) {
> -   sbs_update_presence(chip, (ret >= 0));
> -   power_supply_changed(chip->power_supply);
> +   if (!chip->gpio_detect && chip->is_present != (ret >=0)) {
> +   bool old_present = chip->is_present;
> +   union power_supply_propval val;
> +
> +   sbs_get_battery_presence_and_health(
> +   client, POWER_SUPPLY_PROP_PRESENT, );
> +
> +   sbs_update_presence(chip, val.intval);

I don't think you can/should assume that val.intval will be set
correctly if the return value is negative (even if that's what the
functions currently do, it'd be too easy to accidentally change them).

So I still think you need to have:

ret = sbs_get_battery_presence_and_health...

sbs_update_presence(chip, !ret && val.intval);

> +
> +   if (old_present != chip->is_present)
> +   power_supply_changed(chip->power_supply);
> }
>
>  done:
> @@ -1067,11 +1074,12 @@ static int sbs_probe(struct i2c_client *client)
>  * to the battery.
>  */
> if (!(force_load || chip->gpio_detect)) {
> -   rc = sbs_read_word_data(client, sbs_data[REG_STATUS].addr);
> -
> -   if (rc < 0) {
> -   dev_err(>dev, "%s: Failed to get device 
> status\n",
> -   __func__);
> +   union power_supply_propval val;
> +   sbs_get_battery_presence_and_health(
> +   client, POWER_SUPPLY_PROP_PRESENT, );
> +   if (!val.intval) {
> +   dev_err(>dev, "Failed to get present 
> status\n");
> +   rc = -ENODEV;
> goto exit_psupply;
> }
> }
> --
> 2.28.0.236.gb10cc79966-goog
>


RE: [PATCH v3] exfat: remove EXFAT_SB_DIRTY flag

2020-08-12 Thread Namjae Jeon
> Thanks for thinking on this complicated issue.
> 
> 
> > Most of the NAND flash devices and HDDs have wear leveling and bad sector 
> > replacement algorithms
> applied.
> > So I think that the life of the boot sector will not be exhausted first.
> 
> I'm not too worried about the life of the boot-sector.
> I'm worried about write failures caused by external factors.
> (power failure/system down/vibration/etc. during writing) They rarely occur 
> on SD cards, but occur on
> many HDDs, some SSDs and USB storages by 0.1% or more.
Hard disk and SSD do not guarantee atomic write of a sector unit?

> Especially with AFT-HDD, not only boot-sector but also the following multiple 
> sectors become
> unreadable.
Other file systems will also be unstable on a such HW.

> It is not possible to completely solve this problem, as long as writing to 
> the boot-sector.
> (I think it's a exFAT's specification defect) The only effective way to 
> reduce this problem is to
> reduce writes to the boot-sector.
exFAT's specification defect... Well..
Even though the boot sector is corrupted, It can be recovered using the backup 
boot sector
through fsck.
> 
> 
> > Currently the volume dirty/clean policy of exfat-fs is not perfect,
> 
> Thank you for sharing the problem with you.
> 
> 
> > but I think it behaves similarly to the policy of MS Windows.
> 
> On Windows10, the dirty flag is cleared after more than 15 seconds after all 
> write operations are
> completed.
> (dirty-flag is never updated during the write operation continues)
> 
> 
> > Therefore,
> > I think code improvements should be made to reduce volume flag records 
> > while maintaining the current
> policy.
> 
> Current policy is inconsistent.
> As I wrote last mail, the problem with the current implementation is that the 
> dirty-flag may not be
> cleared after the write operation.(even if sync is enabled or disabled) 
> Because, some write operations
> clear the dirty-flag but some don't clear.
> Unmount or sync command is the only way to ensure that the dirty-flag is 
> cleared.
> This has no effect on clearing the dirty-flag after a write operations, it 
> only increases risk of
> destroying the boot-sector.
>   - Clear the dirty-flag after every write operation.
>   - Never clear the dirty-flag after every write operation.
> Unless unified to either one,  I think that sync policy cannot be consistent.
> 
> How do you think?
> 
> 
> BR
> ---
> etsuhiro Kohada 




[RFC PATCH 0/3] Re: [PATCH v17 14/21] mm/compaction: do page isolation first in compaction

2020-08-12 Thread Alexander Duyck
Here are the patches I had discussed earlier to address the issues in
isolate_migratepages_block.

They are based on the tree at:
 https://github.com/alexshi/linux.git lrunext 

The first patch is mostly cleanup to address the RCU locking in the
function. The second addresses the test_and_set_skip issue, and the third
relocates PageCompound.

I did some digging into the history of the skip bits and since they are
only supposed to be a hint I thought we could probably just drop the
testing portion of the call since the LRU flag is preventing more than one
thread from accessing the function anyway so it would make sense to just
switch it to a set operation similar to what happens when low_pfn ==
end_pfn at the end of the call.

I have only had a chance to build test these since rebasing on the tree. In
addition I am not 100% certain the PageCompound changes are correct as they
operate on the assumption that get_page_unless_zero is enough to keep a
compound page from being split up. I plan on doing some testing tomorrow,
but thought I would push these out now so that we could discuss them.

---

Alexander Duyck (3):
  mm: Drop locked from isolate_migratepages_block
  mm: Drop use of test_and_set_skip in favor of just setting skip
  mm: Identify compound pages sooner in isolate_migratepages_block


 mm/compaction.c |  126 +++
 1 file changed, 44 insertions(+), 82 deletions(-)

--


[RFC PATCH 1/3] mm: Drop locked from isolate_migratepages_block

2020-08-12 Thread Alexander Duyck
From: Alexander Duyck 

We can drop the need for the locked variable by making use of the
lruvec_holds_page_lru_lock function. By doing this we can avoid some rcu
locking ugliness for the case where the lruvec is still holding the LRU
lock associated with the page. Instead we can just use the lruvec and if it
is NULL we assume the lock was released.

Signed-off-by: Alexander Duyck 
---
 mm/compaction.c |   45 -
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index b99c96c4862d..5021a18ef722 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -803,9 +803,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
 {
pg_data_t *pgdat = cc->zone->zone_pgdat;
unsigned long nr_scanned = 0, nr_isolated = 0;
-   struct lruvec *lruvec;
+   struct lruvec *lruvec = NULL;
unsigned long flags = 0;
-   struct lruvec *locked = NULL;
struct page *page = NULL, *valid_page = NULL;
unsigned long start_pfn = low_pfn;
bool skip_on_failure = false;
@@ -866,9 +865,9 @@ static bool too_many_isolated(pg_data_t *pgdat)
 * a fatal signal is pending.
 */
if (!(low_pfn % SWAP_CLUSTER_MAX)) {
-   if (locked) {
-   unlock_page_lruvec_irqrestore(locked, flags);
-   locked = NULL;
+   if (lruvec) {
+   unlock_page_lruvec_irqrestore(lruvec, flags);
+   lruvec = NULL;
}
 
if (fatal_signal_pending(current)) {
@@ -949,9 +948,9 @@ static bool too_many_isolated(pg_data_t *pgdat)
 */
if (unlikely(__PageMovable(page)) &&
!PageIsolated(page)) {
-   if (locked) {
-   unlock_page_lruvec_irqrestore(locked, 
flags);
-   locked = NULL;
+   if (lruvec) {
+   unlock_page_lruvec_irqrestore(lruvec, 
flags);
+   lruvec = NULL;
}
 
if (!isolate_movable_page(page, isolate_mode))
@@ -992,16 +991,13 @@ static bool too_many_isolated(pg_data_t *pgdat)
if (!TestClearPageLRU(page))
goto isolate_fail_put;
 
-   rcu_read_lock();
-   lruvec = mem_cgroup_page_lruvec(page, pgdat);
-
/* If we already hold the lock, we can skip some rechecking */
-   if (lruvec != locked) {
-   if (locked)
-   unlock_page_lruvec_irqrestore(locked, flags);
+   if (!lruvec || !lruvec_holds_page_lru_lock(page, lruvec)) {
+   if (lruvec)
+   unlock_page_lruvec_irqrestore(lruvec, flags);
 
+   lruvec = mem_cgroup_page_lruvec(page, pgdat);
compact_lock_irqsave(>lru_lock, , cc);
-   locked = lruvec;
rcu_read_unlock();
 
lruvec_memcg_debug(lruvec, page);
@@ -1023,8 +1019,7 @@ static bool too_many_isolated(pg_data_t *pgdat)
SetPageLRU(page);
goto isolate_fail_put;
}
-   } else
-   rcu_read_unlock();
+   }
 
/* The whole page is taken off the LRU; skip the tail pages. */
if (PageCompound(page))
@@ -1057,9 +1052,9 @@ static bool too_many_isolated(pg_data_t *pgdat)
 
 isolate_fail_put:
/* Avoid potential deadlock in freeing page under lru_lock */
-   if (locked) {
-   unlock_page_lruvec_irqrestore(locked, flags);
-   locked = NULL;
+   if (lruvec) {
+   unlock_page_lruvec_irqrestore(lruvec, flags);
+   lruvec = NULL;
}
put_page(page);
 
@@ -1073,9 +1068,9 @@ static bool too_many_isolated(pg_data_t *pgdat)
 * page anyway.
 */
if (nr_isolated) {
-   if (locked) {
-   unlock_page_lruvec_irqrestore(locked, flags);
-   locked = NULL;
+   if (lruvec) {
+   unlock_page_lruvec_irqrestore(lruvec, flags);
+   lruvec = NULL;
}
putback_movable_pages(>migratepages);
cc->nr_migratepages = 0;
@@ -1102,8 +1097,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
page = NULL;
 
 

[RFC PATCH 2/3] mm: Drop use of test_and_set_skip in favor of just setting skip

2020-08-12 Thread Alexander Duyck
From: Alexander Duyck 

The only user of test_and_set_skip was isolate_migratepages_block and it
was using it after a call that was testing and clearing the LRU flag. As
such it really didn't need to be behind the LRU lock anymore as it wasn't
really fulfilling its purpose.

With that being the case we can simply drop the bit and instead directly
just call the set_pageblock_skip function if the page we are working on is
the valid_page at the start of the pageblock. It shouldn't be possible for
us to encounter the bit being set since we obtained the LRU flag for the
first page in the pageblock which means we would have exclusive access to
setting the skip bit. As such we don't need to worry about the abort case
since no other thread will be able to call what used to be
test_and_set_skip.

Since we have dropped the late abort case we can drop the code that was
clearing the LRU flag and calling page_put since the abort case will now
not be holding a reference to a page.

Signed-off-by: Alexander Duyck 
---
 mm/compaction.c |   50 +++---
 1 file changed, 7 insertions(+), 43 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index 5021a18ef722..c1e9918f9dd4 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -399,29 +399,6 @@ void reset_isolation_suitable(pg_data_t *pgdat)
}
 }
 
-/*
- * Sets the pageblock skip bit if it was clear. Note that this is a hint as
- * locks are not required for read/writers. Returns true if it was already set.
- */
-static bool test_and_set_skip(struct compact_control *cc, struct page *page,
-   unsigned long pfn)
-{
-   bool skip;
-
-   /* Do no update if skip hint is being ignored */
-   if (cc->ignore_skip_hint)
-   return false;
-
-   if (!IS_ALIGNED(pfn, pageblock_nr_pages))
-   return false;
-
-   skip = get_pageblock_skip(page);
-   if (!skip && !cc->no_set_skip_hint)
-   skip = !set_pageblock_skip(page);
-
-   return skip;
-}
-
 static void update_cached_migrate(struct compact_control *cc, unsigned long 
pfn)
 {
struct zone *zone = cc->zone;
@@ -480,12 +457,6 @@ static inline void update_pageblock_skip(struct 
compact_control *cc,
 static void update_cached_migrate(struct compact_control *cc, unsigned long 
pfn)
 {
 }
-
-static bool test_and_set_skip(struct compact_control *cc, struct page *page,
-   unsigned long pfn)
-{
-   return false;
-}
 #endif /* CONFIG_COMPACTION */
 
 /*
@@ -895,7 +866,6 @@ static bool too_many_isolated(pg_data_t *pgdat)
if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) {
if (!cc->ignore_skip_hint && get_pageblock_skip(page)) {
low_pfn = end_pfn;
-   page = NULL;
goto isolate_abort;
}
valid_page = page;
@@ -991,6 +961,13 @@ static bool too_many_isolated(pg_data_t *pgdat)
if (!TestClearPageLRU(page))
goto isolate_fail_put;
 
+   /* Indicate that we want exclusive access to this pageblock */
+   if (page == valid_page) {
+   skip_updated = true;
+   if (!cc->ignore_skip_hint)
+   set_pageblock_skip(page);
+   }
+
/* If we already hold the lock, we can skip some rechecking */
if (!lruvec || !lruvec_holds_page_lru_lock(page, lruvec)) {
if (lruvec)
@@ -1002,13 +979,6 @@ static bool too_many_isolated(pg_data_t *pgdat)
 
lruvec_memcg_debug(lruvec, page);
 
-   /* Try get exclusive access under lock */
-   if (!skip_updated) {
-   skip_updated = true;
-   if (test_and_set_skip(cc, page, low_pfn))
-   goto isolate_abort;
-   }
-
/*
 * Page become compound since the non-locked check,
 * and it's on LRU. It can only be a THP so the order
@@ -1094,15 +1064,9 @@ static bool too_many_isolated(pg_data_t *pgdat)
if (unlikely(low_pfn > end_pfn))
low_pfn = end_pfn;
 
-   page = NULL;
-
 isolate_abort:
if (lruvec)
unlock_page_lruvec_irqrestore(lruvec, flags);
-   if (page) {
-   SetPageLRU(page);
-   put_page(page);
-   }
 
/*
 * Updated the cached scanner pfn once the pageblock has been scanned



[RFC PATCH 3/3] mm: Identify compound pages sooner in isolate_migratepages_block

2020-08-12 Thread Alexander Duyck
From: Alexander Duyck 

Since we are holding a reference to the page much sooner in
isolate_migratepages_block we could move the PageCompound check out of the
LRU locked section and instead just place it after get_page_unless_zero. By
doing this we can allow any of the items that might trigger a failure to
trigger a failure for the compound page rather than the order 0 page and as
a result we should be able to process the pageblock faster.

In addition by testing for PageCompound sooner we can avoid having the LRU
flag cleared and then reset in the exception case. As a result this should
prevent any possible races where another thread might be attempting to pull
the LRU pages from the list.

Signed-off-by: Alexander Duyck 
---
 mm/compaction.c |   33 ++---
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index c1e9918f9dd4..3803f129fd6a 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -954,6 +954,24 @@ static bool too_many_isolated(pg_data_t *pgdat)
if (unlikely(!get_page_unless_zero(page)))
goto isolate_fail;
 
+   /*
+* Page is compound. We know the order before we know if it is
+* on the LRU so we cannot assume it is THP. However since the
+* page will have the LRU validated shortly we can use the value
+* to skip over this page for now or validate the LRU is set and
+* then isolate the entire compound page if we are isolating to
+* generate a CMA page.
+*/
+   if (PageCompound(page)) {
+   const unsigned int order = compound_order(page);
+
+   if (likely(order < MAX_ORDER))
+   low_pfn += (1UL << order) - 1;
+
+   if (!cc->alloc_contig)
+   goto isolate_fail_put;
+   }
+
if (__isolate_lru_page_prepare(page, isolate_mode) != 0)
goto isolate_fail_put;
 
@@ -978,23 +996,8 @@ static bool too_many_isolated(pg_data_t *pgdat)
rcu_read_unlock();
 
lruvec_memcg_debug(lruvec, page);
-
-   /*
-* Page become compound since the non-locked check,
-* and it's on LRU. It can only be a THP so the order
-* is safe to read and it's 0 for tail pages.
-*/
-   if (unlikely(PageCompound(page) && !cc->alloc_contig)) {
-   low_pfn += compound_nr(page) - 1;
-   SetPageLRU(page);
-   goto isolate_fail_put;
-   }
}
 
-   /* The whole page is taken off the LRU; skip the tail pages. */
-   if (PageCompound(page))
-   low_pfn += compound_nr(page) - 1;
-
/* Successfully isolated */
del_page_from_lru_list(page, lruvec, page_lru(page));
mod_node_page_state(page_pgdat(page),



Re: file metadata via fs API (was: [GIT PULL] Filesystem Information)

2020-08-12 Thread Jeffrey E Altman
On 8/12/2020 2:18 PM, Linus Torvalds (torva...@linux-foundation.org) wrote:
> What's wrong with fstatfs()? All the extra magic metadata seems to not
> really be anything people really care about.
> 
> What people are actually asking for seems to be some unique mount ID,
> and we have 16 bytes of spare information in 'struct statfs64'.
> 
> All the other fancy fsinfo stuff seems to be "just because", and like
> complete overdesign.

Hi Linus,

Is there any existing method by which userland applications can
determine the properties of the filesystem in which a directory or file
is stored in a filesystem agnostic manner?

Over the past year I've observed the opendev/openstack community
struggle with performance issues caused by rsync's inability to
determine if the source and destination object's last update time have
the same resolution and valid time range.  If the source file system
supports 100 nanosecond granularity and the destination file system
supports one second granularity, any source file with a non-zero
fractional seconds timestamp will appear to have changed compared to the
copy in the destination filesystem which discarded the fractional
seconds during the last sync.  Sure, the end user could use the
--modify-window=1 option to inform rsync to add fuzz to the comparisons,
but that introduces the possibility that a file updated a fraction of a
second after an rsync execution would not synchronize the file on the
next run when both source and target have fine grained timestamps.  If
the userland sync processes have access to the source and destination
filesystem time capabilities, they can make more intelligent decisions
without explicit user input.  At a minimum, the timestamp properties
that are important to know include the range of valid timestamps and the
resolution.  Some filesystems support unsigned 32-bit time starting with
UNIX epoch.  Others signed 32-bit time with UNIX epoch.  Still others
FAT, NTFS, etc use alternative epochs and range and resolutions.

Another case where lack of filesystem properties is problematic is "df
--local" which currently relies upon string comparisons of file system
name strings to determine if the underlying file system is local or
remote.  This requires that the gnulib maintainers have knowledge of all
file systems implementations, their published names, and which category
they belong to.  Patches have been accepted in the past year to add
"smb3", "afs", and "gpfs" to the list of remote file systems.  There are
many more remote filesystems that have yet to be added including
"cephfs", "lustre", "gluster", etc.

In many cases, the filesystem properties cannot be inferred from the
filesystem name.  For network file systems, these properties might
depend upon the remote server capabilities or even the properties
associated with a particular volume or share.  Consider the case of a
remote file server that supports 64-bit 100ns time but which for
backward compatibility exports certain volumes or shares with more
restrictive capabilities. Or the case of a network file system protocol
that has evolved over time and gained new capabilities.

For the AFS community, fsinfo offers a method of exposing some server
and volume properties that are obtained via "path ioctls" in OpenAFS and
AuriStorFS.  Some example of properties that might be exposed include
answers to questions such as:

 * what is the volume cell id? perhaps a uuid.
 * what is the volume id in the cell? unsigned 64-bit integer
 * where is a mounted volume hosted? which fileservers, named by uuid
 * what is the block size? 1K, 4K, ...
 * how many blocks are in use or available?
 * what is the quota (thin provisioning), if any?
 * what is the reserved space (fat provisioning), if any?
 * how many vnodes are present?
 * what is the vnode count limit, if any?
 * when was the volume created and last updated?
 * what is the file size limit?
 * are byte range locks supported?
 * are mandatory locks supported?
 * how many entries can be created within a directory?
 * are cross-directory hard links supported?
 * are directories just-send-8, case-sensitive, case-preserving, or
   case-insensitive?
 * if not just-send-8, what character set is used?
 * if Unicode, what normalization rules? etc.
 * are per-object acls supported?
 * what volume maximum acl is assigned, if any?
 * what volume security policy (authn, integ, priv) is assigned, if any?
 * what is the replication policy, if any?
 * what is the volume encryption policy, if any?
 * what is the volume compression policy, if any?
 * are server-to-server copies supported?
 * which of atime, ctime and mtime does the volume support?
 * what is the permitted timestamp range and resolution?
 * are xattrs supported?
 * what is the xattr maximum name length?
 * what is the xattr maximum object size?
 * is the volume currently reachable?
 * is the volume immutable?
 * etc ...

Its true that there isn't widespread use of these filesystem properties
by today's 

Re: INFO: task hung in pipe_release (2)

2020-08-12 Thread syzbot
syzbot has bisected this issue to:

commit fddb5d430ad9fa91b49b1d34d0202ffe2fa0e179
Author: Aleksa Sarai 
Date:   Sat Jan 18 12:07:59 2020 +

open: introduce openat2(2) syscall

bisection log:  https://syzkaller.appspot.com/x/bisect.txt?x=164e716a90
start commit:   6ba1b005 Merge tag 'asm-generic-fixes-5.8' of git://git.ke..
git tree:   upstream
final oops: https://syzkaller.appspot.com/x/report.txt?x=154e716a90
console output: https://syzkaller.appspot.com/x/log.txt?x=114e716a90
kernel config:  https://syzkaller.appspot.com/x/.config?x=84f076779e989e69
dashboard link: https://syzkaller.appspot.com/bug?extid=61acc40a49a3e46e25ea
syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=142ae22490

Reported-by: syzbot+61acc40a49a3e46e2...@syzkaller.appspotmail.com
Fixes: fddb5d430ad9 ("open: introduce openat2(2) syscall")

For information about bisection process see: https://goo.gl/tpsmEJ#bisection


[PATCH net] bonding: show saner speed for broadcast mode

2020-08-12 Thread Jarod Wilson
Broadcast mode bonds transmit a copy of all traffic simultaneously out of
all interfaces, so the "speed" of the bond isn't really the aggregate of
all interfaces, but rather, the speed of the lowest active interface.

Also, the type of the speed field is u32, not unsigned long, so adjust
that accordingly, as required to make min() function here without
complaining about mismatching types.

Fixes: bb5b052f751b ("bond: add support to read speed and duplex via ethtool")
CC: Jay Vosburgh 
CC: Veaceslav Falico 
CC: Andy Gospodarek 
CC: "David S. Miller" 
CC: net...@vger.kernel.org
Signed-off-by: Jarod Wilson 
---
 drivers/net/bonding/bond_main.c | 21 ++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 5ad43aaf76e5..c853ca67058c 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -4552,13 +4552,23 @@ static netdev_tx_t bond_start_xmit(struct sk_buff *skb, 
struct net_device *dev)
return ret;
 }
 
+static u32 bond_mode_bcast_speed(struct slave *slave, u32 speed)
+{
+   if (speed == 0 || speed == SPEED_UNKNOWN)
+   speed = slave->speed;
+   else
+   speed = min(speed, slave->speed);
+
+   return speed;
+}
+
 static int bond_ethtool_get_link_ksettings(struct net_device *bond_dev,
   struct ethtool_link_ksettings *cmd)
 {
struct bonding *bond = netdev_priv(bond_dev);
-   unsigned long speed = 0;
struct list_head *iter;
struct slave *slave;
+   u32 speed = 0;
 
cmd->base.duplex = DUPLEX_UNKNOWN;
cmd->base.port = PORT_OTHER;
@@ -4570,8 +4580,13 @@ static int bond_ethtool_get_link_ksettings(struct 
net_device *bond_dev,
 */
bond_for_each_slave(bond, slave, iter) {
if (bond_slave_can_tx(slave)) {
-   if (slave->speed != SPEED_UNKNOWN)
-   speed += slave->speed;
+   if (slave->speed != SPEED_UNKNOWN) {
+   if (BOND_MODE(bond) == BOND_MODE_BROADCAST)
+   speed = bond_mode_bcast_speed(slave,
+ speed);
+   else
+   speed += slave->speed;
+   }
if (cmd->base.duplex == DUPLEX_UNKNOWN &&
slave->duplex != DUPLEX_UNKNOWN)
cmd->base.duplex = slave->duplex;
-- 
2.20.1



Re: [PATCH v2] nvme: Use spin_lock_irq() when taking the ctrl->lock

2020-08-12 Thread Logan Gunthorpe



On 2020-08-12 6:32 p.m., Keith Busch wrote:
> There's an unrelated whitespace change in nvme_init_identify().
> Otherwise, looks fine.

Oops, sorry. can this be fixed up when it's merged?

Logan


Re: [PATCH v17 14/21] mm/compaction: do page isolation first in compaction

2020-08-12 Thread Alex Shi



在 2020/8/13 上午10:17, Alexander Duyck 写道:
>> zone lock is probability better. you can try and test.
> So I spent a good chunk of today looking the code over and what I
> realized is that we probably don't even really need to have this code
> protected by the zone lock since the LRU bit in the pageblock should
> do most of the work for us. In addition we can get rid of the test
> portion of this and just make it a set only operation if I am not
> mistaken.
> 
> the LRU flag is cleared then you are creating a situation where
> multiple processes will be stomping all over each other as you can
> have each thread essentially take a page via the LRU flag, but only
> one thread will process a page and it could skip over all other pages
> that preemptively had their LRU flag cleared.
 It increase a bit crowd here, but lru_lock do reduce some them, and 
 skip_bit
 could stop each other in a array check(bitmap). So compare to whole node
 lru_lock, the net profit is clear in patch 17.
>>> My concern is that what you can end up with is multiple threads all
>>> working over the same pageblock for isolation. With the old code the
>>> LRU lock was used to make certain that test_and_set_skip was being
>>> synchronized on the first page in the pageblock so you would only have
>>> one thread going through and working a single pageblock. However after
>>> your changes it doesn't seem like the test_and_set_skip has that
>>> protection since only one thread will ever be able to successfully
>>> call it for the first page in the pageblock assuming that the LRU flag
>>> is set on the first page in the pageblock block.
>>>
> If you take a look at the test_and_set_skip the function only acts on
> the pageblock aligned PFN for a given range. WIth the changes you have
> in place now that would mean that only one thread would ever actually
> call this function anyway since the first PFN would take the LRU flag
> so no other thread could follow through and test or set the bit as
 Is this good for only one process could do test_and_set_skip? is that
 the 'skip' meaning to be?
>>> So only one thread really getting to fully use test_and_set_skip is
>>> good, however the issue is that there is nothing to synchronize the
>>> testing from the other threads. As a result the other threads could
>>> have isolated other pages within the pageblock before the thread that
>>> is calling test_and_set_skip will get to complete the setting of the
>>> skip bit. This will result in isolation failures for the thread that
>>> set the skip bit which may be undesirable behavior.
>>>
>>> With the old code the threads were all synchronized on testing the
>>> first PFN in the pageblock while holding the LRU lock and that is what
>>> we lost. My concern is the cases where skip_on_failure == true are
>>> going to fail much more often now as the threads can easily interfere
>>> with each other.
>> I have a patch to fix this, which is on
>> https://github.com/alexshi/linux.git lrunext
> I don't think that patch helps to address anything. You are now
> failing to set the bit in the case that something modifies the
> pageblock flags while you are attempting to do so. I think it would be
> better to just leave the cmpxchg loop as it is.

It do increae the case-lru-file-mmap-read in vm-scalibity about 3% performance.
Yes, I am glad to see it can be make better.


> 
> well. The expectation before was that all threads would encounter this
> test and either proceed after setting the bit for the first PFN or
> abort after testing the first PFN. With you changes only the first
> thread actually runs this test and then it and the others will likely
> encounter multiple failures as they are all clearing LRU bits
> simultaneously and tripping each other up. That is why the skip bit
> must have a test and set done before you even get to the point of
> clearing the LRU flag.
 It make the things warse in my machine, would you like to have a try by 
 yourself?
>>> I plan to do that. I have already been working on a few things to
>>> clean up and optimize your patch set further. I will try to submit an
>>> RFC this evening so we can discuss.
>>>
>> Glad to see your new code soon. Would you like do it base on
>> https://github.com/alexshi/linux.git lrunext
> I can rebase off of that tree. It may add another half hour or so. I
> have barely had any time to test my code. When I enabled some of the
> debugging features in the kernel related to using the vm-scalability
> tests the boot time became incredibly slow so I may just make certain
> I can boot and not mess the system up before submitting my patches as
> an RFC. I can probably try testing them more tomorrow.
> 
>>> The point I was getting at with the PageCompound check is that instead
>>> of needing the LRU lock you should be able to look at PageCompound as
>>> soon as you call 

[PATCH] mm: include CMA pages in lowmem_reserve at boot

2020-08-12 Thread Doug Berger
The lowmem_reserve arrays provide a means of applying pressure
against allocations from lower zones that were targeted at
higher zones. Its values are a function of the number of pages
managed by higher zones and are assigned by a call to the
setup_per_zone_lowmem_reserve() function.

The function is initially called at boot time by the function
init_per_zone_wmark_min() and may be called later by accesses
of the /proc/sys/vm/lowmem_reserve_ratio sysctl file.

The function init_per_zone_wmark_min() was moved up from a
module_init to a core_initcall to resolve a sequencing issue
with khugepaged. Unfortunately this created a sequencing issue
with CMA page accounting.

The CMA pages are added to the managed page count of a zone
when cma_init_reserved_areas() is called at boot also as a
core_initcall. This makes it uncertain whether the CMA pages
will be added to the managed page counts of their zones before
or after the call to init_per_zone_wmark_min() as it becomes
dependent on link order. With the current link order the pages
are added to the managed count after the lowmem_reserve arrays
are initialized at boot.

This means the lowmem_reserve values at boot may be lower than
the values used later if /proc/sys/vm/lowmem_reserve_ratio is
accessed even if the ratio values are unchanged.

In many cases the difference is not significant, but in others
it may have an affect.

This commit breaks the link order dependency by invoking
init_per_zone_wmark_min() as a postcore_initcall so that the
CMA pages have the chance to be properly accounted in their
zone(s) and allowing the lowmem_reserve arrays to receive
consistent values.

Fixes: bc22af74f271 ("mm: update min_free_kbytes from khugepaged after core 
initialization")
Signed-off-by: Doug Berger 
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8b7d0ecf30b1..f3e340ec2b6b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7887,7 +7887,7 @@ int __meminit init_per_zone_wmark_min(void)
 
return 0;
 }
-core_initcall(init_per_zone_wmark_min)
+postcore_initcall(init_per_zone_wmark_min)
 
 /*
  * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
-- 
2.7.4



Re: WARNING in compat_do_ebt_get_ctl

2020-08-12 Thread syzbot
syzbot has found a reproducer for the following issue on:

HEAD commit:fb893de3 Merge tag 'tag-chrome-platform-for-v5.9' of git:/..
git tree:   upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=1742b31c90
kernel config:  https://syzkaller.appspot.com/x/.config?x=f1fedc63022bf07e
dashboard link: https://syzkaller.appspot.com/bug?extid=5accb5c62faa1d346480
compiler:   gcc (GCC) 10.1.0-syz 20200507
userspace arch: i386
syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=13280fd690
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=1409f4a690

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+5accb5c62faa1d346...@syzkaller.appspotmail.com

[ cut here ]
Buffer overflow detected (80 < 137)!
WARNING: CPU: 0 PID: 6853 at include/linux/thread_info.h:134 copy_overflow 
include/linux/thread_info.h:134 [inline]
WARNING: CPU: 0 PID: 6853 at include/linux/thread_info.h:134 check_copy_size 
include/linux/thread_info.h:143 [inline]
WARNING: CPU: 0 PID: 6853 at include/linux/thread_info.h:134 copy_to_user 
include/linux/uaccess.h:151 [inline]
WARNING: CPU: 0 PID: 6853 at include/linux/thread_info.h:134 
compat_do_ebt_get_ctl+0x47e/0x500 net/bridge/netfilter/ebtables.c:2270
Kernel panic - not syncing: panic_on_warn set ...
CPU: 0 PID: 6853 Comm: syz-executor171 Not tainted 5.8.0-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
01/01/2011
Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x18f/0x20d lib/dump_stack.c:118
 panic+0x2e3/0x75c kernel/panic.c:231
 __warn.cold+0x20/0x45 kernel/panic.c:600
 report_bug+0x1bd/0x210 lib/bug.c:198
 handle_bug+0x38/0x90 arch/x86/kernel/traps.c:234
 exc_invalid_op+0x14/0x40 arch/x86/kernel/traps.c:254
 asm_exc_invalid_op+0x12/0x20 arch/x86/include/asm/idtentry.h:536
RIP: 0010:copy_overflow include/linux/thread_info.h:134 [inline]
RIP: 0010:check_copy_size include/linux/thread_info.h:143 [inline]
RIP: 0010:copy_to_user include/linux/uaccess.h:151 [inline]
RIP: 0010:compat_do_ebt_get_ctl+0x47e/0x500 net/bridge/netfilter/ebtables.c:2270
Code: ba fd ff ff 4c 89 f7 e8 60 07 a2 fa e9 ad fd ff ff e8 36 18 62 fa 4c 89 
e2 be 50 00 00 00 48 c7 c7 40 b9 0e 89 e8 94 1f 33 fa <0f> 0b e9 dc fd ff ff 41 
bc f2 ff ff ff e9 4f fe ff ff e8 3b 07 a2
RSP: 0018:c90005667ae8 EFLAGS: 00010282
RAX:  RBX: 192000accf5e RCX: 
RDX: 88809458a280 RSI: 815dbce7 RDI: f52000accf4f
RBP: 8a8faa60 R08: 0001 R09: 8880ae6318e7
R10:  R11: 35383654 R12: 0089
R13: 2000 R14: c90005667d80 R15: c90005667b20
 do_ebt_get_ctl+0x2b4/0x790 net/bridge/netfilter/ebtables.c:2317
 nf_getsockopt+0x72/0xd0 net/netfilter/nf_sockopt.c:116
 ip_getsockopt net/ipv4/ip_sockglue.c:1778 [inline]
 ip_getsockopt+0x164/0x1c0 net/ipv4/ip_sockglue.c:1757
 tcp_getsockopt+0x86/0xd0 net/ipv4/tcp.c:3884
 __sys_getsockopt+0x219/0x4c0 net/socket.c:2179
 __do_sys_getsockopt net/socket.c:2194 [inline]
 __se_sys_getsockopt net/socket.c:2191 [inline]
 __ia32_sys_getsockopt+0xb9/0x150 net/socket.c:2191
 do_syscall_32_irqs_on arch/x86/entry/common.c:84 [inline]
 __do_fast_syscall_32+0x57/0x80 arch/x86/entry/common.c:126
 do_fast_syscall_32+0x2f/0x70 arch/x86/entry/common.c:149
 entry_SYSENTER_compat_after_hwframe+0x4d/0x5c
RIP: 0023:0xf7f91569
Code: 10 05 03 74 b8 01 10 06 03 74 b4 01 10 07 03 74 b0 01 10 08 03 74 d8 01 
00 00 00 00 00 00 00 00 00 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 
eb 0d 90 90 90 90 90 90 90 90 90 90 90 90
RSP: 002b:ffdae08c EFLAGS: 0292 ORIG_RAX: 016d
RAX: ffda RBX: 0003 RCX: 
RDX: 0082 RSI: 2000 RDI: 2100
RBP: 0012 R08:  R09: 
R10:  R11:  R12: 
R13:  R14:  R15: 
Kernel Offset: disabled
Rebooting in 86400 seconds..



Re: file metadata via fs API

2020-08-12 Thread Ian Kent
On Wed, 2020-08-12 at 12:50 -0700, Linus Torvalds wrote:
> On Wed, Aug 12, 2020 at 12:34 PM Steven Whitehouse <
> swhit...@redhat.com> wrote:
> > The point of this is to give us the ability to monitor mounts from
> > userspace.
> 
> We haven't had that before, I don't see why it's suddenly such a big
> deal.

Because there's a trend occurring in user space where there are
frequent and persistent mount changes that cause high overhead.

I've seen the number of problems building up over the last few months
that are essentially the same problem that I wanted to resolve. And
that's related to side effects of autofs using a large number of
mounts.

The problems are real.

> 
> The notification side I understand. Polling /proc files is not the
> answer.

Yep, that's one aspect, getting the information about a mount without
reading the entire mount table seems like the sensible thing to do to
allow for a more efficient notification mechanism.

> 
> But the whole "let's design this crazy subsystem for it" seems way
> overkill. I don't see anybody caring that deeply.
> 
> It really smells like "do it because we can, not because we must".
> 
> Who the hell cares about monitoring mounts at a kHz frequencies? If
> this is for MIS use, you want a nice GUI and not wasting CPU time
> polling.

That part of the problem still remains.

The kernel sending a continuous stream of wake ups under load does
also introduce a resource problem but that's probably something to
handle in user space.

> 
> I'm starting to ignore the pull requests from David Howells, because
> by now they have had the same pattern for a couple of years now:
> esoteric new interfaces that seem overdesigned for corner-cases that
> I'm not seeing people clamoring for.
> 
> I need (a) proof this is actualyl something real users care about and
> (b) way more open discussion and implementation from multiple
> parties.
> 
> Because right now it looks like a small in-cabal of a couple of
> people
> who have wild ideas but I'm not seeing the wider use of it.
> 
> Convince me otherwise. AGAIN. This is the exact same issue I had with
> the notification queues that I really wanted actual use-cases for,
> and
> feedback from actual outside users.
> 
> I really think this is engineering for its own sake, rather than
> responding to actual user concerns.
> 
>Linus



[PATCH] drivers/input/misc: Use kobj_to_dev() instead

2020-08-12 Thread Wang Qing
Use kobj_to_dev() instead of container_of()

Signed-off-by: Wang Qing 
---
 drivers/input/misc/ims-pcu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/input/misc/ims-pcu.c b/drivers/input/misc/ims-pcu.c
index d8dbfc0..0879d96
--- a/drivers/input/misc/ims-pcu.c
+++ b/drivers/input/misc/ims-pcu.c
@@ -1228,7 +1228,7 @@ static struct attribute *ims_pcu_attrs[] = {
 static umode_t ims_pcu_is_attr_visible(struct kobject *kobj,
   struct attribute *attr, int n)
 {
-   struct device *dev = container_of(kobj, struct device, kobj);
+   struct device *dev = kobj_to_dev(kobj);
struct usb_interface *intf = to_usb_interface(dev);
struct ims_pcu *pcu = usb_get_intfdata(intf);
umode_t mode = attr->mode;
-- 
2.7.4



[PATCH] drm/hisilicon: Fix build error of no type of module_init

2020-08-12 Thread Tian Tao
Add missing include to fix build error:
hibmc_drm_drv.c:385:1: warning: data definition has no type or storage
class [enabled by default]
hibmc_drm_drv.c:385:1: error: type defaults to ‘int’ in declaration
of ‘module_init’ [-Werror=implicit-int]
hibmc_drm_drv.c:385:1: warning: parameter names (without types) in function
of ‘module_exit’ [-Werror=implicit-int]
hibmc_drm_drv.c:385:292:1: warning: parameter names (without types) in
function declaration [enabled by default]

Signed-off-by: Tian Tao 
Reported-by: kernel test robot 
---
 drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c 
b/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c
index 1ae360d..2b4f821 100644
--- a/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c
+++ b/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c
@@ -11,6 +11,7 @@
  * Jianhua Li 
  */
 
+#include 
 #include 
 
 #include 
-- 
2.7.4



[PATCH v2] riscv: Setup exception vector for nommu platform

2020-08-12 Thread Qiu Wenbo
Exception vector is missing on nommu platform and that is an issue.
This patch is tested in Sipeed Maix Bit Dev Board.

Fixes: 79b1feba5455 ("RISC-V: Setup exception vector early")
Suggested-by: Anup Patel 
Suggested-by: Atish Patra 
Signed-off-by: Qiu Wenbo 
---
 arch/riscv/kernel/head.S | 25 +
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
index d0c5c316e9bb..0a4e81b8dc79 100644
--- a/arch/riscv/kernel/head.S
+++ b/arch/riscv/kernel/head.S
@@ -77,16 +77,10 @@ relocate:
csrw CSR_SATP, a0
 .align 2
 1:
-   /* Set trap vector to exception handler */
-   la a0, handle_exception
+   /* Set trap vector to spin forever to help debug */
+   la a0, .Lsecondary_park
csrw CSR_TVEC, a0
 
-   /*
-* Set sup0 scratch register to 0, indicating to exception vector that
-* we are presently executing in kernel.
-*/
-   csrw CSR_SCRATCH, zero
-
/* Reload the global pointer */
 .option push
 .option norelax
@@ -144,9 +138,23 @@ secondary_start_common:
la a0, swapper_pg_dir
call relocate
 #endif
+   call setup_trap_vector
tail smp_callin
 #endif /* CONFIG_SMP */
 
+.align 2
+setup_trap_vector:
+   /* Set trap vector to exception handler */
+   la a0, handle_exception
+   csrw CSR_TVEC, a0
+
+   /*
+* Set sup0 scratch register to 0, indicating to exception vector that
+* we are presently executing in kernel.
+*/
+   csrw CSR_SCRATCH, zero
+   ret
+
 .Lsecondary_park:
/* We lack SMP support or have too many harts, so park this hart */
wfi
@@ -240,6 +248,7 @@ clear_bss_done:
call relocate
 #endif /* CONFIG_MMU */
 
+   call setup_trap_vector
/* Restore C environment */
la tp, init_task
sw zero, TASK_TI_CPU(tp)
-- 
2.28.0



[PATCH] drivers/greybus: Use kobj_to_dev() instead

2020-08-12 Thread Wang Qing
Use kobj_to_dev() instead of container_of()

Signed-off-by: Wang Qing 
---
 drivers/greybus/interface.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/greybus/interface.c b/drivers/greybus/interface.c
index 67dbe6f..de167f1
--- a/drivers/greybus/interface.c
+++ b/drivers/greybus/interface.c
@@ -620,7 +620,7 @@ static struct attribute *interface_common_attrs[] = {
 static umode_t interface_unipro_is_visible(struct kobject *kobj,
   struct attribute *attr, int n)
 {
-   struct device *dev = container_of(kobj, struct device, kobj);
+   struct device *dev = kobj_to_dev(kobj);
struct gb_interface *intf = to_gb_interface(dev);
 
switch (intf->type) {
@@ -635,7 +635,7 @@ static umode_t interface_unipro_is_visible(struct kobject 
*kobj,
 static umode_t interface_greybus_is_visible(struct kobject *kobj,
struct attribute *attr, int n)
 {
-   struct device *dev = container_of(kobj, struct device, kobj);
+   struct device *dev = kobj_to_dev(kobj);
struct gb_interface *intf = to_gb_interface(dev);
 
switch (intf->type) {
@@ -649,7 +649,7 @@ static umode_t interface_greybus_is_visible(struct kobject 
*kobj,
 static umode_t interface_power_is_visible(struct kobject *kobj,
  struct attribute *attr, int n)
 {
-   struct device *dev = container_of(kobj, struct device, kobj);
+   struct device *dev = kobj_to_dev(kobj);
struct gb_interface *intf = to_gb_interface(dev);
 
switch (intf->type) {
-- 
2.7.4



Re: [PATCH] binfmt_flat: revert "binfmt_flat: don't offset the data start"

2020-08-12 Thread Greg Ungerer

Hi Max,

On 9/8/20 4:37 am, Max Filippov wrote:

binfmt_flat loader uses the gap between text and data to store data
segment pointers for the libraries. Even in the absence of shared
libraries it stores at least one pointer to the executable's own data
segment. Text and data can go back to back in the flat binary image and
without offsetting data segment last few instructions in the text
segment may get corrupted by the data segment pointer.


Yep, your right, it does.

I will push this into the m68knommu git tree next week (once the merge
window is closed), and make sure it gets to Linus for rc series soon
after that.

Thanks
Greg



Fix it by reverting commit a2357223c50a ("binfmt_flat: don't offset the
data start").

Cc: sta...@vger.kernel.org
Fixes: a2357223c50a ("binfmt_flat: don't offset the data start")
Signed-off-by: Max Filippov 
---
  fs/binfmt_flat.c | 20 
  1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index f2f9086ebe98..b9c658e0548e 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -576,7 +576,7 @@ static int load_flat_file(struct linux_binprm *bprm,
goto err;
}
  
-		len = data_len + extra;

+   len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned 
long);
len = PAGE_ALIGN(len);
realdatastart = vm_mmap(NULL, 0, len,
PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
@@ -590,7 +590,9 @@ static int load_flat_file(struct linux_binprm *bprm,
vm_munmap(textpos, text_len);
goto err;
}
-   datapos = ALIGN(realdatastart, FLAT_DATA_ALIGN);
+   datapos = ALIGN(realdatastart +
+   MAX_SHARED_LIBS * sizeof(unsigned long),
+   FLAT_DATA_ALIGN);
  
  		pr_debug("Allocated data+bss+stack (%u bytes): %lx\n",

 data_len + bss_len + stack_len, datapos);
@@ -620,7 +622,7 @@ static int load_flat_file(struct linux_binprm *bprm,
memp_size = len;
} else {
  
-		len = text_len + data_len + extra;

+   len = text_len + data_len + extra + MAX_SHARED_LIBS * 
sizeof(u32);
len = PAGE_ALIGN(len);
textpos = vm_mmap(NULL, 0, len,
PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
@@ -635,7 +637,9 @@ static int load_flat_file(struct linux_binprm *bprm,
}
  
  		realdatastart = textpos + ntohl(hdr->data_start);

-   datapos = ALIGN(realdatastart, FLAT_DATA_ALIGN);
+   datapos = ALIGN(realdatastart +
+   MAX_SHARED_LIBS * sizeof(u32),
+   FLAT_DATA_ALIGN);
  
  		reloc = (__be32 __user *)

(datapos + (ntohl(hdr->reloc_start) - text_len));
@@ -652,9 +656,8 @@ static int load_flat_file(struct linux_binprm *bprm,
 (text_len + full_data
  - sizeof(struct flat_hdr)),
 0);
-   if (datapos != realdatastart)
-   memmove((void *)datapos, (void *)realdatastart,
-   full_data);
+   memmove((void *) datapos, (void *) realdatastart,
+   full_data);
  #else
/*
 * This is used on MMU systems mainly for testing.
@@ -710,7 +713,8 @@ static int load_flat_file(struct linux_binprm *bprm,
if (IS_ERR_VALUE(result)) {
ret = result;
pr_err("Unable to read code+data+bss, errno %d\n", ret);
-   vm_munmap(textpos, text_len + data_len + extra);
+   vm_munmap(textpos, text_len + data_len + extra +
+   MAX_SHARED_LIBS * sizeof(u32));
goto err;
}
}



Re: KMSAN: uninit-value in ath9k_htc_rx_msg

2020-08-12 Thread syzbot
syzbot has found a reproducer for the following issue on:

HEAD commit:ce8056d1 wip: changed copy_from_user where instrumented
git tree:   https://github.com/google/kmsan.git master
console output: https://syzkaller.appspot.com/x/log.txt?x=12985a1690
kernel config:  https://syzkaller.appspot.com/x/.config?x=3afe005fb99591f
dashboard link: https://syzkaller.appspot.com/bug?extid=2ca247c2d60c7023de7f
compiler:   clang version 10.0.0 (https://github.com/llvm/llvm-project/ 
c2443155a0fb245c8f17f2c1c72b6ea391e86e81)
syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=1468efe290
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=10bb9fba90

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+2ca247c2d60c7023d...@syzkaller.appspotmail.com

=
BUG: KMSAN: uninit-value in ath9k_htc_rx_msg+0x28f/0x1f50 
drivers/net/wireless/ath/ath9k/htc_hst.c:410
CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.8.0-rc5-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
01/01/2011
Call Trace:
 
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x21c/0x280 lib/dump_stack.c:118
 kmsan_report+0xf7/0x1e0 mm/kmsan/kmsan_report.c:121
 __msan_warning+0x58/0xa0 mm/kmsan/kmsan_instr.c:215
 ath9k_htc_rx_msg+0x28f/0x1f50 drivers/net/wireless/ath/ath9k/htc_hst.c:410
 ath9k_hif_usb_rx_stream drivers/net/wireless/ath/ath9k/hif_usb.c:638 [inline]
 ath9k_hif_usb_rx_cb+0x1841/0x1d10 drivers/net/wireless/ath/ath9k/hif_usb.c:671
 __usb_hcd_giveback_urb+0x687/0x870 drivers/usb/core/hcd.c:1650
 usb_hcd_giveback_urb+0x1cb/0x730 drivers/usb/core/hcd.c:1716
 dummy_timer+0xd98/0x71c0 drivers/usb/gadget/udc/dummy_hcd.c:1967
 call_timer_fn+0x226/0x550 kernel/time/timer.c:1404
 expire_timers+0x4fc/0x780 kernel/time/timer.c:1449
 __run_timers+0xaf4/0xd30 kernel/time/timer.c:1773
 run_timer_softirq+0x2d/0x50 kernel/time/timer.c:1786
 __do_softirq+0x2ea/0x7f5 kernel/softirq.c:293
 asm_call_on_stack+0xf/0x20 arch/x86/entry/entry_64.S:711
 
 __run_on_irqstack arch/x86/include/asm/irq_stack.h:23 [inline]
 run_on_irqstack_cond arch/x86/include/asm/irq_stack.h:50 [inline]
 do_softirq_own_stack+0x7c/0xa0 arch/x86/kernel/irq_64.c:77
 invoke_softirq kernel/softirq.c:390 [inline]
 __irq_exit_rcu+0x226/0x270 kernel/softirq.c:420
 irq_exit_rcu+0xe/0x10 kernel/softirq.c:432
 sysvec_apic_timer_interrupt+0x107/0x130 arch/x86/kernel/apic/apic.c:1091
 asm_sysvec_apic_timer_interrupt+0x12/0x20 arch/x86/include/asm/idtentry.h:593
RIP: 0010:native_irq_disable arch/x86/include/asm/irqflags.h:49 [inline]
RIP: 0010:arch_local_irq_disable arch/x86/include/asm/irqflags.h:89 [inline]
RIP: 0010:acpi_safe_halt drivers/acpi/processor_idle.c:112 [inline]
RIP: 0010:acpi_idle_do_entry drivers/acpi/processor_idle.c:525 [inline]
RIP: 0010:acpi_idle_enter+0x817/0xeb0 drivers/acpi/processor_idle.c:651
Code: 85 db 74 0a f7 d3 44 21 fb 48 85 db 74 32 4d 85 ff 75 3a 48 8b 5d a0 e9 
0c 00 00 00 e8 12 b2 78 fb 0f 00 2d 25 15 1c 0b fb f4  eb 5a 84 c0 8b 7d 90 
0f 45 7d 94 e8 d8 9a f4 fb e9 74 fc ff ff
RSP: 0018:88812df93bc8 EFLAGS: 0246
RAX:  RBX: 8881dfefce70 RCX: 00012db88000
RDX: 88812df88000 RSI:  RDI: 
RBP: 88812df93ca0 R08: 86420acc R09: 88812fffa000
R10: 0002 R11: 88812df88000 R12: 88812df889d8
R13: 8881dfefcc64 R14:  R15: 
 cpuidle_enter_state+0x860/0x12b0 drivers/cpuidle/cpuidle.c:235
 cpuidle_enter+0xe3/0x170 drivers/cpuidle/cpuidle.c:346
 call_cpuidle kernel/sched/idle.c:126 [inline]
 cpuidle_idle_call kernel/sched/idle.c:214 [inline]
 do_idle+0x668/0x810 kernel/sched/idle.c:276
 cpu_startup_entry+0x45/0x50 kernel/sched/idle.c:372
 start_secondary+0x1bf/0x240 arch/x86/kernel/smpboot.c:268
 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:243

Uninit was created at:
 kmsan_save_stack_with_flags+0x3c/0x90 mm/kmsan/kmsan.c:144
 kmsan_internal_alloc_meta_for_pages mm/kmsan/kmsan_shadow.c:269 [inline]
 kmsan_alloc_page+0xc5/0x1a0 mm/kmsan/kmsan_shadow.c:293
 __alloc_pages_nodemask+0xdf0/0x1030 mm/page_alloc.c:4889
 __alloc_pages include/linux/gfp.h:509 [inline]
 __alloc_pages_node include/linux/gfp.h:522 [inline]
 alloc_pages_node include/linux/gfp.h:536 [inline]
 __page_frag_cache_refill mm/page_alloc.c:4964 [inline]
 page_frag_alloc+0x35b/0x880 mm/page_alloc.c:4994
 __netdev_alloc_skb+0x2a8/0xc90 net/core/skbuff.c:451
 __dev_alloc_skb include/linux/skbuff.h:2813 [inline]
 ath9k_hif_usb_rx_stream drivers/net/wireless/ath/ath9k/hif_usb.c:620 [inline]
 ath9k_hif_usb_rx_cb+0xe5a/0x1d10 drivers/net/wireless/ath/ath9k/hif_usb.c:671
 __usb_hcd_giveback_urb+0x687/0x870 drivers/usb/core/hcd.c:1650
 usb_hcd_giveback_urb+0x1cb/0x730 drivers/usb/core/hcd.c:1716
 dummy_timer+0xd98/0x71c0 drivers/usb/gadget/udc/dummy_hcd.c:1967
 call_timer_fn+0x226/0x550 kernel/time/timer.c:1404
 

[PATCH v6] Add MediaTek MT6779 devapc driver

2020-08-12 Thread Neal Liu
These patch series introduce a MediaTek MT6779 devapc driver.

MediaTek bus fabric provides TrustZone security support and data protection to 
prevent slaves from being accessed by unexpected masters.
The security violation is logged and sent to the processor for further analysis 
or countermeasures.

Any occurrence of security violation would raise an interrupt, and it will be 
handled by mtk-devapc driver.
The violation information is printed in order to find the murderer.

changes since v5:
- remove redundant write reg operation.
- use static variable of vio_dbgs instead.
- add stop_devapc() if driver is removed.

changes since v4:
- refactor data structure.
- merge two simple functions into one.
- refactor register setting to prevent too many function call overhead.

changes since v3:
- revise violation handling flow to make it more easily to understand
  hardware behavior.
- add more comments to understand how hardware works.

changes since v2:
- pass platform info through DT data.
- remove unnecessary function.
- remove slave_type because it always equals to 1 in current support SoC.
- use vio_idx_num instread of list all devices' index.
- add more comments to describe hardware behavior.

changes since v1:
- move SoC specific part to DT data.
- remove unnecessary boundary check.
- remove unnecessary data type declaration.
- use read_poll_timeout() instread of for loop polling.
- revise coding style elegantly.


*** BLURB HERE ***

Neal Liu (2):
  dt-bindings: devapc: add bindings for mtk-devapc
  soc: mediatek: add mt6779 devapc driver

 .../bindings/soc/mediatek/devapc.yaml |  58 
 drivers/soc/mediatek/Kconfig  |   9 +
 drivers/soc/mediatek/Makefile |   1 +
 drivers/soc/mediatek/mtk-devapc.c | 320 ++
 4 files changed, 388 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/soc/mediatek/devapc.yaml
 create mode 100644 drivers/soc/mediatek/mtk-devapc.c

-- 
2.18.0


[PATCH v6 2/2] soc: mediatek: add mt6779 devapc driver

2020-08-12 Thread Neal Liu
MediaTek bus fabric provides TrustZone security support and data
protection to prevent slaves from being accessed by unexpected
masters.
The security violation is logged and sent to the processor for
further analysis or countermeasures.

Any occurrence of security violation would raise an interrupt, and
it will be handled by mtk-devapc driver. The violation
information is printed in order to find the murderer.

Signed-off-by: Neal Liu 
---
 drivers/soc/mediatek/Kconfig  |9 ++
 drivers/soc/mediatek/Makefile |1 +
 drivers/soc/mediatek/mtk-devapc.c |  320 +
 3 files changed, 330 insertions(+)
 create mode 100644 drivers/soc/mediatek/mtk-devapc.c

diff --git a/drivers/soc/mediatek/Kconfig b/drivers/soc/mediatek/Kconfig
index 59a56cd..1177c98 100644
--- a/drivers/soc/mediatek/Kconfig
+++ b/drivers/soc/mediatek/Kconfig
@@ -17,6 +17,15 @@ config MTK_CMDQ
  time limitation, such as updating display configuration during the
  vblank.
 
+config MTK_DEVAPC
+   tristate "Mediatek Device APC Support"
+   help
+ Say yes here to enable support for Mediatek Device APC driver.
+ This driver is mainly used to handle the violation which catches
+ unexpected transaction.
+ The violation information is logged for further analysis or
+ countermeasures.
+
 config MTK_INFRACFG
bool "MediaTek INFRACFG Support"
select REGMAP
diff --git a/drivers/soc/mediatek/Makefile b/drivers/soc/mediatek/Makefile
index 01f9f87..abfd4ba 100644
--- a/drivers/soc/mediatek/Makefile
+++ b/drivers/soc/mediatek/Makefile
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_MTK_CMDQ) += mtk-cmdq-helper.o
+obj-$(CONFIG_MTK_DEVAPC) += mtk-devapc.o
 obj-$(CONFIG_MTK_INFRACFG) += mtk-infracfg.o
 obj-$(CONFIG_MTK_PMIC_WRAP) += mtk-pmic-wrap.o
 obj-$(CONFIG_MTK_SCPSYS) += mtk-scpsys.o
diff --git a/drivers/soc/mediatek/mtk-devapc.c 
b/drivers/soc/mediatek/mtk-devapc.c
new file mode 100644
index 000..5189b3f
--- /dev/null
+++ b/drivers/soc/mediatek/mtk-devapc.c
@@ -0,0 +1,320 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2020 MediaTek Inc.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define VIO_MOD_TO_REG_IND(m)  ((m) / 32)
+#define VIO_MOD_TO_REG_OFF(m)  ((m) % 32)
+
+struct mtk_devapc_vio_dbgs {
+   union {
+   u32 vio_dbg0;
+   struct {
+   u32 mstid:16;
+   u32 dmnid:6;
+   u32 vio_w:1;
+   u32 vio_r:1;
+   u32 addr_h:4;
+   u32 resv:4;
+   } dbg0_bits;
+   };
+
+   u32 vio_dbg1;
+};
+
+struct mtk_devapc_data {
+   u32 vio_idx_num;
+   u32 vio_mask_offset;
+   u32 vio_sta_offset;
+   u32 vio_dbg0_offset;
+   u32 vio_dbg1_offset;
+   u32 apc_con_offset;
+   u32 vio_shift_sta_offset;
+   u32 vio_shift_sel_offset;
+   u32 vio_shift_con_offset;
+};
+
+struct mtk_devapc_context {
+   struct device *dev;
+   void __iomem *infra_base;
+   struct clk *infra_clk;
+   const struct mtk_devapc_data *data;
+};
+
+static void clear_vio_status(struct mtk_devapc_context *ctx)
+{
+   void __iomem *reg;
+   int i;
+
+   reg = ctx->infra_base + ctx->data->vio_sta_offset;
+
+   for (i = 0; i < VIO_MOD_TO_REG_IND(ctx->data->vio_idx_num - 1); i++)
+   writel(GENMASK(31, 0), reg + 4 * i);
+
+   writel(GENMASK(VIO_MOD_TO_REG_OFF(ctx->data->vio_idx_num - 1), 0),
+  reg + 4 * i);
+}
+
+static void mask_module_irq(struct mtk_devapc_context *ctx, bool mask)
+{
+   void __iomem *reg;
+   u32 val;
+   int i;
+
+   reg = ctx->infra_base + ctx->data->vio_mask_offset;
+
+   if (mask)
+   val = GENMASK(31, 0);
+   else
+   val = 0;
+
+   for (i = 0; i < VIO_MOD_TO_REG_IND(ctx->data->vio_idx_num - 1); i++)
+   writel(val, reg + 4 * i);
+
+   val = readl(reg + 4 * i);
+   if (mask)
+   val |= GENMASK(VIO_MOD_TO_REG_OFF(ctx->data->vio_idx_num - 1),
+  0);
+   else
+   val &= ~GENMASK(VIO_MOD_TO_REG_OFF(ctx->data->vio_idx_num - 1),
+   0);
+
+   writel(val, reg + 4 * i);
+}
+
+#define PHY_DEVAPC_TIMEOUT 0x1
+
+/*
+ * devapc_sync_vio_dbg - do "shift" mechansim" to get full violation 
information.
+ *   shift mechanism is depends on devapc hardware design.
+ *   Mediatek devapc set multiple slaves as a group.
+ *   When violation is triggered, violation info is kept
+ *   inside devapc hardware.
+ *   Driver should do shift mechansim to sync full 
violation
+ *   info to VIO_DBGs registers.
+ *
+ */
+static int 

[PATCH v6 1/2] dt-bindings: devapc: add bindings for mtk-devapc

2020-08-12 Thread Neal Liu
Add bindings for mtk-devapc.

Signed-off-by: Neal Liu 
---
 .../devicetree/bindings/soc/mediatek/devapc.yaml   |   58 
 1 file changed, 58 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/soc/mediatek/devapc.yaml

diff --git a/Documentation/devicetree/bindings/soc/mediatek/devapc.yaml 
b/Documentation/devicetree/bindings/soc/mediatek/devapc.yaml
new file mode 100644
index 000..6c763f8
--- /dev/null
+++ b/Documentation/devicetree/bindings/soc/mediatek/devapc.yaml
@@ -0,0 +1,58 @@
+# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+# # Copyright 2020 MediaTek Inc.
+%YAML 1.2
+---
+$id: "http://devicetree.org/schemas/soc/mediatek/devapc.yaml#;
+$schema: "http://devicetree.org/meta-schemas/core.yaml#;
+
+title: MediaTek Device Access Permission Control driver
+
+description: |
+  MediaTek bus fabric provides TrustZone security support and data
+  protection to prevent slaves from being accessed by unexpected masters.
+  The security violation is logged and sent to the processor for further
+  analysis and countermeasures.
+
+maintainers:
+  - Neal Liu 
+
+properties:
+  compatible:
+enum:
+  - mediatek,mt6779-devapc
+
+  reg:
+description: The base address of devapc register bank
+maxItems: 1
+
+  interrupts:
+description: A single interrupt specifier
+maxItems: 1
+
+  clocks:
+description: Contains module clock source and clock names
+maxItems: 1
+
+  clock-names:
+description: Names of the clocks list in clocks property
+maxItems: 1
+
+required:
+  - compatible
+  - reg
+  - interrupts
+  - clocks
+  - clock-names
+
+examples:
+  - |
+#include 
+#include 
+
+devapc: devapc@10207000 {
+  compatible = "mediatek,mt6779-devapc";
+  reg = <0x10207000 0x1000>;
+  interrupts = ;
+  clocks = <_ao CLK_INFRA_DEVICE_APC>;
+  clock-names = "devapc-infra-clock";
+};
-- 
1.7.9.5


Re: [PATCH v2] lib/cmdline: prevent unintented access to address

2020-08-12 Thread Randy Dunlap
On 8/12/20 8:07 PM, Seungil Kang wrote:
> When args = "\"\0", "i" will be 0 and args[i-1] is used. (*lib/cmdline.c +238)
> Because of "i" is an unsigned int type, the function will access at 
> args[0x].
> It can make a crash.
> 
> Signed-off-by: Seungil Kang 
> ---
> 
> Thanks for your review, my comments below
> 
>> Can you be less ambiguous with the args value? (Perhaps provide a hexdump of 
>> it
> for better understanding)
> 
>  This kind of args as hexdump below can cause crash.
>  
>  : 736f 6d65 7468 696e 6731 3d73 6f6d 655f  something1=some_
>  0010: 7661 6c75 6573 2022      values "
>  
>  The args end with "\"\0".
> 
>> Please, use proper punctuation, I'm lost where is the sentence and what are 
>> the
> logical parts of them.
> 
>  I'm sorry to confuse you. I fix the commit msg
> 
>> Can you point out to the code that calls this and leads to a crash?
> 
>  *lib/cmdlinc + 201 ~, next_arg function with args = "\"\0"
>  
>  char *next_arg(char *args, char **param, char **val) <-- args = "\"\0".
>  {
> unsigned int i, equals = 0;
> int in_quote = 0, quoted = 0;
> char *next;
> 
> if (*args == '"') {   <-- *args == '"' is a true condition,
> args++;   <-- args++, so *args = '\0'.
> in_quote = 1;
> quoted = 1;   <-- quoted also set 1.
> }
> 
> for (i = 0; args[i]; i++) { <-- when reached this point, i = 0, and 
> arg[0] == '\0',
> so for loop is skipped.
> if (isspace(args[i]) && !in_quote)
> break;
> if (equals == 0) {
> if (args[i] == '=')
> equals = i;
> }
> if (args[i] == '"')
> in_quote = !in_quote;
> }
> 
> *param = args;
> if (!equals)
> *val = NULL;
> else {
> args[equals] = '\0';
> *val = args + equals + 1;
> 
> /* Don't include quotes in value. */
> if (**val == '"') {
> (*val)++;
> if (args[i-1] == '"')
> args[i-1] = '\0';
> }
> }
> if (quoted && args[i-1] == '"') <-- When reached this point, quoted 
> is still set 1, 
> "i" is still 0, and "i" is 
> unsigned int type,
> so address will be {address of 
> args} + 0x.
> It can make a crash.
> args[i-1] = '\0';
> 
> if (args[i]) {
> args[i] = '\0';
> next = args + i + 1;
> } else
> next = args + i;
> 
> /* Chew up trailing spaces. */
> return skip_spaces(next);
>  }
> 
> 
>> Can you provide a KUnit test module which can check the case?
> 
>  If necessary, I will make it and share it.

Hi,
Have you tested this patch?
If so, how?


> 
>  lib/cmdline.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/lib/cmdline.c b/lib/cmdline.c
> index fbb9981a04a4..2fd29d7723b2 100644
> --- a/lib/cmdline.c
> +++ b/lib/cmdline.c
> @@ -200,7 +200,7 @@ bool parse_option_str(const char *str, const char *option)
>   */
>  char *next_arg(char *args, char **param, char **val)
>  {
> - unsigned int i, equals = 0;
> + int i, equals = 0;
>   int in_quote = 0, quoted = 0;
>   char *next;
>  
> 

thanks.
-- 
~Randy



Re: [PATCH tip/core/rcu 06/12] rcu: Do full report for .need_qs for strict GPs

2020-08-12 Thread Paul E. McKenney
On Thu, Aug 13, 2020 at 02:50:27AM +0200, Jann Horn wrote:
> On Thu, Aug 13, 2020 at 12:57 AM  wrote:
> > The rcu_preempt_deferred_qs_irqrestore() function is invoked at
> > the end of an RCU read-side critical section (for example, directly
> > from rcu_read_unlock()) and, if .need_qs is set, invokes rcu_qs() to
> > report the new quiescent state.  This works, except that rcu_qs() only
> > updates per-CPU state, leaving reporting of the actual quiescent state
> > to a later call to rcu_report_qs_rdp(), for example from within a later
> > RCU_SOFTIRQ instance.  Although this approach is exactly what you want if
> > you are more concerned about efficiency than about short grace periods,
> > in CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, short grace periods are
> > the name of the game.
> >
> > This commit therefore makes rcu_preempt_deferred_qs_irqrestore() directly
> > invoke rcu_report_qs_rdp() in CONFIG_RCU_STRICT_GRACE_PERIOD=y, thus
> > shortening grace periods.
> 
> Ooh, I'm very happy about this series! :)

Glad you like it!  And I hope that it helps!

One usability concern is whether rcutree.rcu_unlock_delay needs to be
applied only some small fraction of the time in order to allow the delay
to be large (a couple hundred microseconds?)  while still avoiding doing
too much more damage to timing and performance than absolutely necessary.

> > diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
> > index 7ed55c5..1761ff4 100644
> > --- a/kernel/rcu/tree_plugin.h
> > +++ b/kernel/rcu/tree_plugin.h
> > @@ -459,8 +459,12 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct 
> > *t, unsigned long flags)
> > return;
> > }
> > t->rcu_read_unlock_special.s = 0;
> > -   if (special.b.need_qs)
> > -   rcu_qs();
> > +   if (special.b.need_qs) {
> > +   if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
> > +   rcu_report_qs_rdp(rdp->cpu, rdp);
> 
> Not an issue with this patch specifically, but: I'm looking at
> rcu_report_qs_rdp(), and some of the parts that I do vaguely
> understand look a bit off to me.
> 
> rcu_report_qs_rdp() is given a CPU number as first argument, but never
> actually uses that argument. (And the only existing caller also passes
> in rdp->cpu, just like this patch.) I guess that argument can go away?
> 
> The comment above rcu_report_qs_rdp() claims that it "must be called
> from the specified CPU", but there is a branch in there that
> specifically checks whether that is true ("if (rdp->cpu ==
> smp_processor_id())"). As far as I can tell, rcu_report_qs_rdp() is,
> as the comment says, indeed never invoked with another CPU's rcu_data
> (only invoked via rcu_core() -> rcu_check_quiescent_state() ->
> rcu_report_qs_rdp(), and rcu_core() looks up "rdp =
> raw_cpu_ptr(_data)"). So perhaps if there is a check for whether
> rdp belongs to the current CPU, that check should have a WARN_ON(), or
> something like that, since it indicates that the API contract
> specified in the comment was violated?

It looks like you are correct, and that the first parameter can be
dropped, the "if" you mention replaced by a WARN_ON_ONCE(), and
the body of that "if" be unconditional.  I have it on my list, and
if it still looks correct in the cold hard light of dawn, I will
apply it with your Reported-by.

And thank you very much for looking it over!

Thanx, Paul


[PATCH] drivers/dax: Use kobj_to_dev() instead

2020-08-12 Thread Wang Qing
Use kobj_to_dev() instead of container_of()

Signed-off-by: Wang Qing 
---
 drivers/dax/bus.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c
index df238c8..24625d2
--- a/drivers/dax/bus.c
+++ b/drivers/dax/bus.c
@@ -331,7 +331,7 @@ static DEVICE_ATTR_RO(numa_node);
 
 static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, int 
n)
 {
-   struct device *dev = container_of(kobj, struct device, kobj);
+   struct device *dev = kobj_to_dev(kobj);
struct dev_dax *dev_dax = to_dev_dax(dev);
 
if (a == _attr_target_node.attr && dev_dax_target_node(dev_dax) < 0)
-- 
2.7.4



drivers/md/dm-ebs-target.c:264:4: warning: Variable 'r' is reassigned a value before the old one has been used.

2020-08-12 Thread kernel test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 
master
head:   dc06fe51d26efc100ac74121607c01a454867c91
commit: d3c7b35c20d60650bac8b55c17b194adda03a979 dm: add emulated block size 
target
date:   3 months ago
compiler: sparc-linux-gcc (GCC) 9.3.0

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 


cppcheck warnings: (new ones prefixed by >>)

>> drivers/md/dm-ebs-target.c:264:4: warning: Variable 'r' is reassigned a 
>> value before the old one has been used. [redundantAssignment]
r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), >dev);
  ^
   drivers/md/dm-ebs-target.c:237:4: note: Variable 'r' is reassigned a value 
before the old one has been used.
r = -EINVAL;
  ^
   drivers/md/dm-ebs-target.c:264:4: note: Variable 'r' is reassigned a value 
before the old one has been used.
r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), >dev);
  ^

vim +/r +264 drivers/md/dm-ebs-target.c

   208  
   209  /*
   210   * Construct an emulated block size mapping:
[]
   211   *
   212   * : path of the underlying device
   213   * : offset in 512 bytes sectors into 
   214   * : emulated block size in units of 512 bytes exposed to the 
upper layer
   215   * []: underlying block size in units of 512 bytes imposed on the 
lower layer;
   216   *  optional, if not supplied, retrieve logical block size from 
underlying device
   217   */
   218  static int ebs_ctr(struct dm_target *ti, unsigned int argc, char **argv)
   219  {
   220  int r;
   221  unsigned short tmp1;
   222  unsigned long long tmp;
   223  char dummy;
   224  struct ebs_c *ec;
   225  
   226  if (argc < 3 || argc > 4) {
   227  ti->error = "Invalid argument count";
   228  return -EINVAL;
   229  }
   230  
   231  ec = ti->private = kzalloc(sizeof(*ec), GFP_KERNEL);
   232  if (!ec) {
   233  ti->error = "Cannot allocate ebs context";
   234  return -ENOMEM;
   235  }
   236  
   237  r = -EINVAL;
   238  if (sscanf(argv[1], "%llu%c", , ) != 1 ||
   239  tmp != (sector_t)tmp ||
   240  (sector_t)tmp >= ti->len) {
   241  ti->error = "Invalid device offset sector";
   242  goto bad;
   243  }
   244  ec->start = tmp;
   245  
   246  if (sscanf(argv[2], "%hu%c", , ) != 1 ||
   247  !__ebs_check_bs(tmp1) ||
   248  to_bytes(tmp1) > PAGE_SIZE) {
   249  ti->error = "Invalid emulated block size";
   250  goto bad;
   251  }
   252  ec->e_bs = tmp1;
   253  
   254  if (argc > 3) {
   255  if (sscanf(argv[3], "%hu%c", , ) != 1 || 
!__ebs_check_bs(tmp1)) {
   256  ti->error = "Invalid underlying block size";
   257  goto bad;
   258  }
   259  ec->u_bs = tmp1;
   260  ec->u_bs_set = true;
   261  } else
   262  ec->u_bs_set = false;
   263  
 > 264  r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), 
 > >dev);
   265  if (r) {
   266  ti->error = "Device lookup failed";
   267  ec->dev = NULL;
   268  goto bad;
   269  }
   270  
   271  r = -EINVAL;
   272  if (!ec->u_bs_set) {
   273  ec->u_bs = 
to_sector(bdev_logical_block_size(ec->dev->bdev));
   274  if (!__ebs_check_bs(ec->u_bs)) {
   275  ti->error = "Invalid retrieved underlying block 
size";
   276  goto bad;
   277  }
   278  }
   279  
   280  if (!ec->u_bs_set && ec->e_bs == ec->u_bs)
   281  DMINFO("Emulation superfluous: emulated equal to 
underlying block size");
   282  
   283  if (__block_mod(ec->start, ec->u_bs)) {
   284  ti->error = "Device offset must be multiple of 
underlying block size";
   285  goto bad;
   286  }
   287  
   288  ec->bufio = dm_bufio_client_create(ec->dev->bdev, 
to_bytes(ec->u_bs), 1, 0, NULL, NULL);
   289  if (IS_ERR(ec->bufio)) {
   290  ti->error = "Cannot create dm bufio client";
   291  r = PTR_ERR(ec->bufio);
   292  ec->bufio = NULL;
   293  goto bad;
   294  }
   295  
   296  ec->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, 
WQ_MEM_RECLAIM);
   297  if (!ec->wq) {
   298  ti->error = "Cannot create dm-" DM_MSG_PREFIX " 
workqueue";
   299  r = -ENOMEM;
   300  goto bad;
   301  }
   302  
   303  ec->block_shift = __ffs(ec->u_bs);
   

drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c:5638 bnx2x_link_settings_status() warn: signedness bug returning '(-22)'

2020-08-12 Thread kernel test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 
master
head:   7c2a69f610e64c8dec6a06a66e721f4ce1dd783a
commit: 2c855d73f2f6107f5b8ebc45f8b934bf7f4419e0 bnx2x: Remove read_status_t 
function casts
date:   9 months ago
config: ia64-randconfig-m031-20200811 (attached as .config)
compiler: ia64-linux-gcc (GCC) 9.3.0

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

New smatch warnings:
drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c:5638 
bnx2x_link_settings_status() warn: signedness bug returning '(-22)'

Old smatch warnings:
drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c:7184 
bnx2x_8073_8727_external_rom_boot() error: uninitialized symbol 'fw_ver1'.

vim +5638 drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c

239d686d494f10e drivers/net/bnx2x_link.c Eilon 
Greenstein 2009-08-12  5613  
2c855d73f2f6107 drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c Kees Cook  
  2019-11-14  5614  static u8 bnx2x_link_settings_status(struct bnx2x_phy *phy,
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5615   struct link_params *params,
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5616   struct link_vars *vars)
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5617  {
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5618  struct bnx2x *bp = params->bp;
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5619  
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5620  u16 gp_status, duplex = DUPLEX_HALF, link_up = 0, 
speed_mask;
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5621  int rc = 0;
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5622  
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5623  /* Read gp_status */
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5624  CL22_RD_OVER_CL45(bp, phy,
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5625MDIO_REG_BANK_GP_STATUS,
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5626MDIO_GP_STATUS_TOP_AN_STATUS1,
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5627_status);
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5628  if (gp_status & 
MDIO_GP_STATUS_TOP_AN_STATUS1_DUPLEX_STATUS)
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5629  duplex = DUPLEX_FULL;
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5630  if (gp_status & 
MDIO_GP_STATUS_TOP_AN_STATUS1_LINK_STATUS)
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5631  link_up = 1;
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5632  speed_mask = gp_status & GP_STATUS_SPEED_MASK;
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5633  DP(NETIF_MSG_LINK, "gp_status 0x%x, is_link_up %d, 
speed_mask 0x%x\n",
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5634 gp_status, link_up, speed_mask);
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5635  rc = bnx2x_get_link_speed_duplex(phy, params, vars, 
link_up, speed_mask,
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5636   duplex);
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5637  if (rc == -EINVAL)
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14 @5638  return rc;
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5639  
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5640  if (gp_status & 
MDIO_GP_STATUS_TOP_AN_STATUS1_LINK_STATUS) {
3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c   Yaniv Rosner   
  2011-06-14  5641  if (SINGLE_MEDIA_DIRECT(params)) {
430d172a635c3dd drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c Yaniv Rosner   
  2012-09-11  5642  

Re: [PATCH v8 2/4] scsi: ufs: Introduce HPB feature

2020-08-12 Thread Bart Van Assche
On 2020-08-12 20:00, Daejun Park wrote:
> On 2020-08-06 02:11, Daejun Park wrote:
>>> +static int ufshpb_create_sysfs(struct ufs_hba *hba, struct ufshpb_lu *hpb)
>>> +{
>>> +int ret;
>>> +
>>> +ufshpb_stat_init(hpb);
>>> +
>>> +kobject_init(>kobj, _ktype);
>>> +mutex_init(>sysfs_lock);
>>> +
>>> +ret = kobject_add(>kobj, kobject_get(>dev->kobj),
>>> +  "ufshpb_lu%d", hpb->lun);
>>> +
>>> +if (ret)
>>> +return ret;
>>> +
>>> +ret = sysfs_create_group(>kobj, _sysfs_group);
>>> +
>>> +if (ret) {
>>> +dev_err(hba->dev, "ufshpb_lu%d create file error\n", hpb->lun);
>>> +return ret;
>>> +}
>>> +
>>> +dev_info(hba->dev, "ufshpb_lu%d sysfs adds uevent", hpb->lun);
>>> +kobject_uevent(>kobj, KOBJ_ADD);
>>> +
>>> +return 0;
>>> +}
>>
>> Please attach these sysfs attributes to /sys/class/scsi_device/*/device
>> instead of creating a new kobject. Consider using the following
>> scsi_host_template member to define LUN sysfs attributes:
> 
> I am not rejecting your comment. But I added kobject for distinguishing 
> between other attributes and attributes related to HPB feature.
> If you think it's pointless, I'll fix it.

Hi Daejun,

I see two reasons to add these sysfs attributes under
/sys/class/scsi_device/*/device:
- This makes the behavior of the UFS driver similar to that of other Linux
  SCSI LLD drivers.
- This makes it easier for people who want to write udev rules that read
  from these attributes. Since ufshpb_lu%d is attached to the UFS controller
  it is not clear to me which attributes will appear first in sysfs - the
  SCSI device attributes or the ufshpb_lu%d attributes. If there are only
  SCSI device attributes there is no such ambiguity and hence authors of
  udev rules won't have to worry about this race condition.

>>> +void ufshpb_remove(struct ufs_hba *hba)
>>> +{
>>> +struct ufshpb_lu *hpb, *n_hpb;
>>> +struct ufsf_feature_info *ufsf;
>>> +struct scsi_device *sdev;
>>> +
>>> +ufsf = >ufsf;
>>> +
>>> +list_for_each_entry_safe(hpb, n_hpb, _hpb_lu, list_hpb_lu) {
>>> +ufshpb_set_state(hpb, HPB_FAILED);
>>> +
>>> +sdev = hpb->sdev_ufs_lu;
>>> +sdev->hostdata = NULL;
>>> +
>>> +ufshpb_destroy_region_tbl(hpb);
>>> +
>>> +list_del_init(>list_hpb_lu);
>>> +ufshpb_remove_sysfs(hpb);
>>> +
>>> +kfree(hpb);
>>> +}
>>> +
>>> +dev_info(hba->dev, "ufshpb: remove success\n");
>>> +}
>>
>> Should the code in the body of the above loop perhaps be called from inside
>> ufshcd_slave_destroy()?
> 
> Moving other stuffs in the loop is good idea, but removing attributes is 
> problem.
> To avoid adding new kobject, I will try to use sysfs_merge_group() 
> for adding attributes. To delete merged attributes, sysfs_unmerge_group() 
> should be called. But sysfs_remove_groups() is called before calling 
> ufshcd_slave_destroy().

Hmm ... I don't see why the sdev_groups host template attribute can't be used?

Please don't use sysfs_merge_group() and sysfs_unmerge_group() because that
would create a race condition against udev rules if these functions are called
after the device core has emitted a KOBJ_ADD event.

Thanks,

Bart.


Re: [PATCH v8 3/4] scsi: ufs: L2P map management for HPB read

2020-08-12 Thread Daejun Park
On 2020-08-06 02:15, Daejun Park wrote:
> > +req->end_io_data = (void *)map_req;
> 
> Please leave the (void *) cast out since explicit casts from a non-void
> to a void pointer are not necessary in C.

OK, I will fix it.
 
> > +static inline struct
> > +ufshpb_rsp_field *ufshpb_get_hpb_rsp(struct ufshcd_lrb *lrbp)
> > +{
> > +return (struct ufshpb_rsp_field 
> > *)>ucd_rsp_ptr->sr.sense_data_len;
> > +}
> 
> Please introduce a union in struct utp_cmd_rsp instead of using casts
> to reinterpret a part of a data structure.

OK. I will introduce a union in struct utp_cmd_rsp and use it.

> > +/* routine : isr (ufs) */
> 
> The above comment looks very cryptic. Should it perhaps be expanded?
> 
> > +struct ufshpb_active_field {
> > +__be16 active_rgn;
> > +__be16 active_srgn;
> > +} __packed;
> 
> Since "__packed" is not necessary for the above data structure, please
> remove it. Note: a typical approach in the Linux kernel to verify that
> the compiler has not inserted any padding bytes is to add a BUILD_BUG_ON()
> statement in an initialization function that verifies the size of ABI data
> structures. See also the output of the following command:
> 
> git grep -nH 'BUILD_BUG_ON.sizeof.*!='

OK, I didn't know about it. Thanks.

> > +struct ufshpb_rsp_field {
> > +__be16 sense_data_len;
> > +u8 desc_type;
> > +u8 additional_len;
> > +u8 hpb_type;
> > +u8 reserved;
> > +u8 active_rgn_cnt;
> > +u8 inactive_rgn_cnt;
> > +struct ufshpb_active_field hpb_active_field[2];
> > +__be16 hpb_inactive_field[2];
> > +} __packed;
> 
> I think the above __packed statement should also be left out.

OK, I will remove it.

Thanks,
Daejun


Re: [PATCH v8 4/4] scsi: ufs: Prepare HPB read for cached sub-region

2020-08-12 Thread Daejun Park
On 2020-08-06 02:18, Daejun Park wrote:
> > +static inline u32 ufshpb_get_lpn(struct scsi_cmnd *cmnd)
> > +{
> > +return blk_rq_pos(cmnd->request) >>
> > +(ilog2(cmnd->device->sector_size) - 9);
> > +}
> 
> Please use sectors_to_logical() from drivers/scsi/sd.h instead of open-coding
> that function.

OK, I will.

> > +static inline unsigned int ufshpb_get_len(struct scsi_cmnd *cmnd)
> > +{
> > +return blk_rq_sectors(cmnd->request) >>
> > +(ilog2(cmnd->device->sector_size) - 9);
> > +}
> 
> Same comment here.

OK
 
> > +/* routine : READ10 -> HPB_READ  */
> 
> Please expand this comment.

OK

Thanks,
Daejun


[PATCH] dma-debug: fix debug_dma_assert_idle(), use rcu_read_lock()

2020-08-12 Thread Hugh Dickins
Since commit 2a9127fcf229 ("mm: rewrite wait_on_page_bit_common() logic")
improved unlock_page(), it has become more noticeable how cow_user_page()
in a kernel with CONFIG_DMA_API_DEBUG=y can create and suffer from heavy
contention on DMA debug's radix_lock in debug_dma_assert_idle().

It is only doing a lookup: use rcu_read_lock() and rcu_read_unlock()
instead; though that does require the static ents[] to be moved onstack...

...but, hold on, isn't that radix_tree_gang_lookup() and loop doing quite
the wrong thing: searching CACHELINES_PER_PAGE entries for an exact match
with the first cacheline of the page in question?  radix_tree_gang_lookup()
is the right tool for the job, but we need nothing more than to check the
first entry it can find, reporting if that falls anywhere within the page.

(Is RCU safe here?  As safe as using the spinlock was.  The entries are
never freed, so don't need to be freed by RCU.  They may be reused, and
there is a faint chance of a race, with an offending entry reused while
printing its error info; but the spinlock did not prevent that either,
and I agree that it's not worth worrying about.)

Fixes: 3b7a6418c749 ("dma debug: account for cachelines and read-only mappings 
in overlap tracking")
Signed-off-by: Hugh Dickins 
---

 kernel/dma/debug.c |   27 +--
 1 file changed, 9 insertions(+), 18 deletions(-)

--- v5.9-rc/kernel/dma/debug.c  2020-08-05 18:17:57.544203766 -0700
+++ linux/kernel/dma/debug.c2020-08-12 19:53:33.159070245 -0700
@@ -565,11 +565,8 @@ static void active_cacheline_remove(stru
  */
 void debug_dma_assert_idle(struct page *page)
 {
-   static struct dma_debug_entry *ents[CACHELINES_PER_PAGE];
-   struct dma_debug_entry *entry = NULL;
-   void **results = (void **) 
-   unsigned int nents, i;
-   unsigned long flags;
+   struct dma_debug_entry *entry;
+   unsigned long pfn;
phys_addr_t cln;
 
if (dma_debug_disabled())
@@ -578,20 +575,14 @@ void debug_dma_assert_idle(struct page *
if (!page)
return;
 
-   cln = (phys_addr_t) page_to_pfn(page) << CACHELINE_PER_PAGE_SHIFT;
-   spin_lock_irqsave(_lock, flags);
-   nents = radix_tree_gang_lookup(_active_cacheline, results, cln,
-  CACHELINES_PER_PAGE);
-   for (i = 0; i < nents; i++) {
-   phys_addr_t ent_cln = to_cacheline_number(ents[i]);
+   pfn = page_to_pfn(page);
+   cln = (phys_addr_t) pfn << CACHELINE_PER_PAGE_SHIFT;
 
-   if (ent_cln == cln) {
-   entry = ents[i];
-   break;
-   } else if (ent_cln >= cln + CACHELINES_PER_PAGE)
-   break;
-   }
-   spin_unlock_irqrestore(_lock, flags);
+   rcu_read_lock();
+   if (!radix_tree_gang_lookup(_active_cacheline, (void **) ,
+   cln, 1) || entry->pfn != pfn)
+   entry = NULL;
+   rcu_read_unlock();
 
if (!entry)
return;


Re: [PATCH] perf parse-events: Set exclude_guest for user-space counting

2020-08-12 Thread Jin, Yao

Hi Like,

On 8/12/2020 9:02 PM, Like Xu wrote:

On 2020/8/12 20:15, Arnaldo Carvalho de Melo wrote:

Em Wed, Aug 12, 2020 at 02:59:53PM +0800, Jin Yao escreveu:

Currently if we run 'perf record -e cycles:u', exclude_guest is 0.

But it doesn't make sense that we request for user-space counting
but we also get the guest report.



Please hold the horse and allow this possibility.

Some authorized perf users on the host may
only want to count (KVM) guest user space events.

Thanks,
Like Xu



Without this patch, if we don't set the ":u" modifier, exclude_guest = 1.

perf record -e cycles ./div
perf evlist -v
cycles: size: 120, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD, 
read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, 
sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1


So this patch doesn't change perf's original behavior.

Thanks
Jin Yao


To keep perf semantics consistent and clear, this patch sets
exclude_guest for user-space counting.


Applied, and also added this, that you should consider doing in the
future (modulo the "Committer testing:" header :) ):

Committer testing:

Before:

   # perf record -e cycles:u
   ^C[ perf record: Woken up 1 times to write data ]
   [ perf record: Captured and wrote 1.231 MB perf.data (91 samples) ]
   #
   # perf evlist -v
   cycles:u: size: 120, { sample_period, sample_freq }: 4000, sample_type: 
IP|TID|TIME|ID|CPU|PERIOD, read_format: ID, disabled: 1, inherit: 1, exclude_kernel: 1, 
exclude_hv: 1, freq: 1, sample_id_all: 1

   
   #

After:

   # perf record -e cycles:u
   ^C[ perf record: Woken up 1 times to write data ]
   [ perf record: Captured and wrote 1.263 MB perf.data (403 samples) ]
   #
   # perf evlist -v
   cycles:u: size: 120, { sample_period, sample_freq }: 4000, sample_type: 
IP|TID|TIME|ID|CPU|PERIOD, read_format: ID, disabled: 1, inherit: 1, exclude_kernel: 1, 
exclude_hv: 1, freq: 1, sample_id_all: 1, exclude_guest: 1

   #



I.e. show actual command output before and after that demonstrates the
problem and then the solution.


Signed-off-by: Jin Yao 
---
  tools/perf/util/parse-events.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 9f7260e69113..4d809f1fe269 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -1794,6 +1794,8 @@ static int get_event_modifier(struct event_modifier *mod, 
char *str,
  if (*str == 'u') {
  if (!exclude)
  exclude = eu = ek = eh = 1;
+    if (!exclude_GH)
+    eG = 1;
  eu = 0;
  } else if (*str == 'k') {
  if (!exclude)
--
2.17.1







[PATCH v2] lib/cmdline: prevent unintented access to address

2020-08-12 Thread Seungil Kang
When args = "\"\0", "i" will be 0 and args[i-1] is used. (*lib/cmdline.c +238)
Because of "i" is an unsigned int type, the function will access at 
args[0x].
It can make a crash.

Signed-off-by: Seungil Kang 
---

Thanks for your review, my comments below

> Can you be less ambiguous with the args value? (Perhaps provide a hexdump of 
> it
for better understanding)

 This kind of args as hexdump below can cause crash.
 
 : 736f 6d65 7468 696e 6731 3d73 6f6d 655f  something1=some_
 0010: 7661 6c75 6573 2022      values "
 
 The args end with "\"\0".

> Please, use proper punctuation, I'm lost where is the sentence and what are 
> the
logical parts of them.

 I'm sorry to confuse you. I fix the commit msg

> Can you point out to the code that calls this and leads to a crash?

 *lib/cmdlinc + 201 ~, next_arg function with args = "\"\0"
 
 char *next_arg(char *args, char **param, char **val) <-- args = "\"\0".
 {
unsigned int i, equals = 0;
int in_quote = 0, quoted = 0;
char *next;

if (*args == '"') {   <-- *args == '"' is a true condition,
args++;   <-- args++, so *args = '\0'.
in_quote = 1;
quoted = 1;   <-- quoted also set 1.
}

for (i = 0; args[i]; i++) { <-- when reached this point, i = 0, and 
arg[0] == '\0',
so for loop is skipped.
if (isspace(args[i]) && !in_quote)
break;
if (equals == 0) {
if (args[i] == '=')
equals = i;
}
if (args[i] == '"')
in_quote = !in_quote;
}

*param = args;
if (!equals)
*val = NULL;
else {
args[equals] = '\0';
*val = args + equals + 1;

/* Don't include quotes in value. */
if (**val == '"') {
(*val)++;
if (args[i-1] == '"')
args[i-1] = '\0';
}
}
if (quoted && args[i-1] == '"') <-- When reached this point, quoted is 
still set 1, 
"i" is still 0, and "i" is unsigned 
int type,
so address will be {address of 
args} + 0x.
It can make a crash.
args[i-1] = '\0';

if (args[i]) {
args[i] = '\0';
next = args + i + 1;
} else
next = args + i;

/* Chew up trailing spaces. */
return skip_spaces(next);
 }


> Can you provide a KUnit test module which can check the case?

 If necessary, I will make it and share it.

 lib/cmdline.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/cmdline.c b/lib/cmdline.c
index fbb9981a04a4..2fd29d7723b2 100644
--- a/lib/cmdline.c
+++ b/lib/cmdline.c
@@ -200,7 +200,7 @@ bool parse_option_str(const char *str, const char *option)
  */
 char *next_arg(char *args, char **param, char **val)
 {
-   unsigned int i, equals = 0;
+   int i, equals = 0;
int in_quote = 0, quoted = 0;
char *next;
 
-- 
2.17.1



RE: [PATCH] softirq: add irq off checking for __raise_softirq_irqoff

2020-08-12 Thread Jiafei Pan
Any comments? Thanks.

@Steven Rostedt, I thinks irq off checking is necessary especially for 
Preempt-RT kernel, because some context may be changed from irq off to irq on 
when enable Preempt RT, I once met a issue that hrtimer soft irq is lost when 
enabled Preempt RT, finally I found napi_schedule_irqoff is called in hardware 
interrupt handler, there maybe no issue for non RT kernel, but for Preempt RT, 
interrupt is threaded, so irq is on in interrupt handler, the result is 
__raise_softirq_irqoff is called in irq on context, so that per-CPU softirq 
masking is corrupted because of the process of updating of soft irq masking is 
interrupted and not a atomic operation , and then caused hrtimer soft irq is 
lost. So I think adding irq status checking in __raise_softirq_irqoff can 
report such issue directly and help us to find the root cause of such issue.

I know that there may be performance impaction to add extra checking here, if 
it is the case, how about to include it in some debug configuration items? Such 
as CONFIG_DEBUG_PREEMPT or other debug items?

Best Regards,
Jiafei.

-Original Message-
From: Jiafei Pan  
Sent: Thursday, August 6, 2020 12:07 PM
To: pet...@infradead.org; mi...@kernel.org; t...@linutronix.de; 
rost...@goodmis.org; romain.per...@gmail.com; w...@kernel.org
Cc: linux-kernel@vger.kernel.org; linux-rt-us...@vger.kernel.org; Jiafei Pan 
; Leo Li ; Vladimir Oltean 
; Jiafei Pan 
Subject: [PATCH] softirq: add irq off checking for __raise_softirq_irqoff

__raise_softirq_irqoff will update per-CPU mask of pending softirqs, it need to 
be called in irq disabled context in order to keep it atomic operation, 
otherwise it will be interrupted by hardware interrupt, and per-CPU softirqs 
pending mask will be corrupted, the result is there will be unexpected issue, 
for example hrtimer soft irq will be losed and soft hrtimer will never be 
expire and handled.

Adding irqs disabled checking here to provide warning in irqs enabled context.

Signed-off-by: Jiafei Pan 
---
 kernel/softirq.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/kernel/softirq.c b/kernel/softirq.c index 
bf88d7f62433..11f61e54a3ae 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -481,6 +481,11 @@ void raise_softirq(unsigned int nr)
 
 void __raise_softirq_irqoff(unsigned int nr)  {
+   /* This function can only be called in irq disabled context,
+* otherwise or_softirq_pending will be interrupted by hardware
+* interrupt, so that there will be unexpected issue.
+*/
+   WARN_ON_ONCE(!irqs_disabled());
trace_softirq_raise(nr);
or_softirq_pending(1UL << nr);
 }
--
2.17.1



Re: [PATCH v8 2/4] scsi: ufs: Introduce HPB feature

2020-08-12 Thread Daejun Park
Hi Bart,

On 2020-08-06 02:11, Daejun Park wrote:
> > This is a patch for the HPB feature.
> > This patch adds HPB function calls to UFS core driver.
> > 
> > The mininum size of the memory pool used in the HPB is implemented as a
>   ^^^
>   minimum?

I will fix it.

> > Kconfig parameter (SCSI_UFS_HPB_HOST_MEM), so that it can be configurable.
> 
> > +config SCSI_UFS_HPB
> > +bool "Support UFS Host Performance Booster"
> > +depends on SCSI_UFSHCD
> > +help
> > +  A UFS HPB Feature improves random read performance. It caches
>   ^ ^^^
>   The?  feature?

I will fix it.

> > +  L2P map of UFS to host DRAM. The driver uses HPB read command
> > +  by piggybacking physical page number for bypassing FTL's L2P address
> > +  translation.
> 
> Please explain what L2P and FTL mean. Not everyone is familiar with SSD
> internals.

I added full name of the abbreviation.


L2P (logical to physical) map of UFS to host DRAM. The driver uses HPB read 
command
 ^^^
by piggybacking physical page number for bypassing FTL (flash translation layer)


> > +config SCSI_UFS_HPB_HOST_MEM
> > +int "Host-side cached memory size (KB) for HPB support"
> > +default 32
> > +depends on SCSI_UFS_HPB
> > +help
> > +  The mininum size of the memory pool used in the HPB module. It can
> > +  be configurable by the user. If this value is larger than required
> > +  memory size, kernel resizes cached memory size.
>   ^^^ ^^
>  reduces?cache size?
> 
> Please make this a kernel module parameter instead of a compile-time constant.

OK, I will change it.
 
> > +#ifndef CONFIG_SCSI_UFS_HPB
> > +static void ufshpb_resume(struct ufs_hba *hba) {}
> > +static void ufshpb_suspend(struct ufs_hba *hba) {}
> > +static void ufshpb_reset(struct ufs_hba *hba) {}
> > +static void ufshpb_reset_host(struct ufs_hba *hba) {}
> > +static void ufshpb_rsp_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) 
> > {}
> > +static void ufshpb_prep(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) {}
> > +static void ufshpb_remove(struct ufs_hba *hba) {}
> > +static void ufshpb_scan_feature(struct ufs_hba *hba) {}
> > +#endif
> 
> Please move these definitions into ufshpb.h since that is the
> recommended Linux kernel coding style.

OK, I will move them.

> > diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h
> > index b2ef18f1b746..904c19796e09 100644
> > --- a/drivers/scsi/ufs/ufshcd.h
> > +++ b/drivers/scsi/ufs/ufshcd.h
> > @@ -47,6 +47,9 @@
> >  #include "ufs.h"
> >  #include "ufs_quirks.h"
> >  #include "ufshci.h"
> > +#ifdef CONFIG_SCSI_UFS_HPB
> > +#include "ufshpb.h"
> > +#endif
> 
> Please move #ifdef CONFIG_SCSI_UFS_HPB / #endif into ufshpb.h. From
> Documentation/process/4.Coding.rst: "As a general rule, #ifdef use
> should be confined to header files whenever possible."

OK, I will fix it.

> > +struct ufsf_feature_info {
> > +atomic_t slave_conf_cnt;
> > +wait_queue_head_t sdev_wait;
> > +};
> 
> Please add a comment above this data structure that explains the role
> of this data structure and also what "ufsf" stands for.

"ufsf" is stands for ufs feature. I wiil add comments for the data structure.

> > +static int ufshpb_create_sysfs(struct ufs_hba *hba, struct ufshpb_lu *hpb);
> 
> I don't think that this forward declaration is necessary so please leave it
> out.

OK, I will remove it.

> > +static inline int ufshpb_is_valid_srgn(struct ufshpb_region *rgn,
> > + struct ufshpb_subregion *srgn)
> > +{
> > +return rgn->rgn_state != HPB_RGN_INACTIVE &&
> > +srgn->srgn_state == HPB_SRGN_VALID;
> > +}
> 
> Please do not declare functions inside .c files inline but instead let
> the compiler decide which functions to inline. Modern compilers are really
> good at this.

I didn't know about it. Thanks.

> > +static struct kobj_type ufshpb_ktype = {
> > +.sysfs_ops = _sysfs_ops,
> > +.release = NULL,
> > +};
> 
> If the release method of a kobj_type is NULL that is a strong sign that
> there is something wrong ...
> 
> > +static int ufshpb_create_sysfs(struct ufs_hba *hba, struct ufshpb_lu *hpb)
> > +{
> > +int ret;
> > +
> > +ufshpb_stat_init(hpb);
> > +
> > +kobject_init(>kobj, _ktype);
> > +mutex_init(>sysfs_lock);
> > +
> > +ret = kobject_add(>kobj, kobject_get(>dev->kobj),
> > +  "ufshpb_lu%d", hpb->lun);
> > +
> > +if (ret)
> > +return ret;
> > +
> > +ret = sysfs_create_group(>kobj, _sysfs_group);
> > +
> > +if (ret) {
> > +dev_err(hba->dev, "ufshpb_lu%d create file error\n", hpb->lun);
> > +return ret;
> > +}
> > +
> > +dev_info(hba->dev, "ufshpb_lu%d sysfs adds uevent", hpb->lun);
> > +kobject_uevent(>kobj, KOBJ_ADD);
> > +
> > +return 0;
> > 

[PATCH] acpi/nfit: Use kobj_to_dev() instead

2020-08-12 Thread Wang Qing
Use kobj_to_dev() instead of container_of()

Signed-off-by: Wang Qing 
---
 drivers/acpi/nfit/core.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c
index fa4500f..3bb350b
--- a/drivers/acpi/nfit/core.c
+++ b/drivers/acpi/nfit/core.c
@@ -1382,7 +1382,7 @@ static bool ars_supported(struct nvdimm_bus *nvdimm_bus)
 
 static umode_t nfit_visible(struct kobject *kobj, struct attribute *a, int n)
 {
-   struct device *dev = container_of(kobj, struct device, kobj);
+   struct device *dev = kobj_to_dev(kobj);
struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev);
 
if (a == _attr_scrub.attr && !ars_supported(nvdimm_bus))
@@ -1667,7 +1667,7 @@ static struct attribute *acpi_nfit_dimm_attributes[] = {
 static umode_t acpi_nfit_dimm_attr_visible(struct kobject *kobj,
struct attribute *a, int n)
 {
-   struct device *dev = container_of(kobj, struct device, kobj);
+   struct device *dev = kobj_to_dev(kobj);
struct nvdimm *nvdimm = to_nvdimm(dev);
struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
 
-- 
2.7.4



[PATCH v2] mm: LMK, adjust oom_score_adj when fork a new process

2020-08-12 Thread hui yang
From: YangHui 

Also it rely on inheritance,But there are some things you need't inheriting
if all children oom_score_adj is -1000,the oom is meaningless

Signed-off-by: YangHui 
---
 include/uapi/linux/oom.h | 1 +
 kernel/fork.c| 4 ++--
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/oom.h b/include/uapi/linux/oom.h
index 750b1c5..0251f23 100644
--- a/include/uapi/linux/oom.h
+++ b/include/uapi/linux/oom.h
@@ -8,6 +8,7 @@
  */
 #define OOM_SCORE_ADJ_MIN  (-1000)
 #define OOM_SCORE_ADJ_MAX  1000
+#define OOM_SCORE_ADJ_DEFAULT  0
 
 /*
  * /proc//oom_adj set to -17 protects from the oom killer for legacy
diff --git a/kernel/fork.c b/kernel/fork.c
index 4d32190..9dfa388 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1584,8 +1584,8 @@ static int copy_signal(unsigned long clone_flags, struct 
task_struct *tsk)
tty_audit_fork(sig);
sched_autogroup_fork(sig);
 
-   sig->oom_score_adj = current->signal->oom_score_adj;
-   sig->oom_score_adj_min = current->signal->oom_score_adj_min;
+   sig->oom_score_adj = OOM_SCORE_ADJ_DEFAULT;
+   sig->oom_score_adj_min = OOM_SCORE_ADJ_MIN;
 
mutex_init(>cred_guard_mutex);
mutex_init(>exec_update_mutex);
-- 
2.7.4



RE: [PATCH 1/2] exfat: add NameLength check when extracting name

2020-08-12 Thread Namjae Jeon
> Thank you for your reply.
> 
> >> -static void exfat_get_uniname_from_ext_entry(struct super_block *sb,
> >> -  struct exfat_chain *p_dir, int entry, unsigned short *uniname)
> >> +static int exfat_get_uniname_from_name_entries(struct 
> >> exfat_entry_set_cache *es,
> >> +  struct exfat_uni_name *uniname)
> >>   {
> >> -  int i;
> >> -  struct exfat_entry_set_cache *es;
> >> +  int n, l, i;
> >>struct exfat_dentry *ep;
> >>
> >> -  es = exfat_get_dentry_set(sb, p_dir, entry, ES_ALL_ENTRIES);
> >> -  if (!es)
> >> -  return;
> >> +  uniname->name_len = es->de_stream->name_len;
> >> +  if (uniname->name_len == 0)
> >> +  return -EIO;
> > Can we validate ->name_len and name entry ->type in exfat_get_dentry_set() ?
> 
> Yes.
> As I wrote in a previous email, entry type validation, name-length 
> validation, and name extraction
> should not be separated, so implement all of these in exfat_get_dentry_set().
> It can be easily implemented by adding uniname to exfat_entry_set_cache and 
> calling
> exfat_get_uniname_from_name_entries() from exfat_get_dentry_set().
No, We can check stream->name_len and name entry type in exfat_get_dentry_set().
And you are already checking entry type with TYPE_SECONDARY in
exfat_get_dentry_set(). Why do we have to check twice?

>
> However, that would be over-implementation.
> Not all callers of exfat_get_dentry_set() need a name.
Where? It will not checked with ES_2_ENTRIES.

> It is enough to validate the name when it is needed.
> This is a file-system driver, not fsck.
Sorry, I don't understand what you are talking about. If there is a problem
in ondisk-metadata, Filesystem should return error.

> Validation is possible in exfat_get_dentry_set(), but unnecessary.
> 
> Why do you want to validate the name in exfat_get_dentry_set()?
exfat_get_dentry_set validates file, stream entry. And you are trying to check
name entries with type_secondary. In addition, trying add the checksum check.
Conversely, Why would you want to add those checks to exfat_get_dentry_set()?
Why do we check only name entries separately? Aren't you intent to return
validated entry set in exfat_get_dentry_set()?
> 
> 
> BR
> ---
> Tetsuhiro Kohada 



drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:2135:41: sparse: sparse: incorrect type in argument 1 (different address spaces)

2020-08-12 Thread kernel test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 
master
head:   dc06fe51d26efc100ac74121607c01a454867c91
commit: 670d0a4b10704667765f7d18f7592993d02783aa sparse: use identifiers to 
define address spaces
date:   8 weeks ago
config: i386-randconfig-s001-20200813 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
reproduce:
# apt-get install sparse
# sparse version: v0.6.2-168-g9554805c-dirty
git checkout 670d0a4b10704667765f7d18f7592993d02783aa
# save the attached .config to linux build tree
make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=i386 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 


sparse warnings: (new ones prefixed by >>)

>> drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:2135:41: sparse: 
>> sparse: incorrect type in argument 1 (different address spaces) @@ 
>> expected void *reg @@ got unsigned int [noderef] __iomem * @@
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:2135:41: sparse: 
expected void *reg
>> drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:2135:41: sparse: 
>> got unsigned int [noderef] __iomem *
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:157:33: sparse: sparse: 
incorrect type in argument 2 (different base types) @@ expected unsigned 
short [usertype] uid_hi @@ got restricted __be16 [usertype] @@
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:157:33: sparse: 
expected unsigned short [usertype] uid_hi
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:157:33: sparse: got 
restricted __be16 [usertype]
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:157:45: sparse: sparse: 
incorrect type in argument 3 (different base types) @@ expected unsigned 
int [usertype] uid_lo @@ got restricted __be32 [usertype] @@
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:157:45: sparse: 
expected unsigned int [usertype] uid_lo
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:157:45: sparse: got 
restricted __be32 [usertype]
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:157:56: sparse: sparse: 
incorrect type in argument 4 (different base types) @@ expected unsigned 
short [usertype] seqid @@ got restricted __be16 [usertype] @@
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:157:56: sparse: 
expected unsigned short [usertype] seqid
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:157:56: sparse: got 
restricted __be16 [usertype]
>> drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:306:26: sparse: sparse: 
>> incorrect type in argument 1 (different address spaces) @@ expected void 
>> [noderef] __iomem * @@ got void *reg @@
>> drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:306:26: sparse: 
>> expected void [noderef] __iomem *
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:306:26: sparse: got 
void *reg
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:336:33: sparse: sparse: 
incorrect type in argument 1 (different address spaces) @@ expected void 
*reg @@ got unsigned int [noderef] __iomem * @@
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:336:33: sparse: 
expected void *reg
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:336:33: sparse: got 
unsigned int [noderef] __iomem *
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:343:33: sparse: sparse: 
incorrect type in argument 1 (different address spaces) @@ expected void 
*reg @@ got unsigned int [noderef] __iomem * @@
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:343:33: sparse: 
expected void *reg
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:343:33: sparse: got 
unsigned int [noderef] __iomem *
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:356:33: sparse: sparse: 
incorrect type in argument 1 (different address spaces) @@ expected void 
*reg @@ got unsigned int [noderef] __iomem * @@
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:356:33: sparse: 
expected void *reg
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:356:33: sparse: got 
unsigned int [noderef] __iomem *
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:397:33: sparse: sparse: 
incorrect type in argument 1 (different address spaces) @@ expected void 
*reg @@ got unsigned int [noderef] __iomem * @@
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:397:33: sparse: 
expected void *reg
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:397:33: sparse: got 
unsigned int [noderef] __iomem *
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:466:41: sparse: sparse: 
incorrect type in argument 1 (different address spaces) @@ expected void 
*reg @@ got unsigned int [noderef] __iomem * @@
   drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:466:41: sparse: 
expected void *reg
   

Re: [PATCH v2] nvme: Use spin_lock_irq() when taking the ctrl->lock

2020-08-12 Thread Chaitanya Kulkarni
On 8/12/20 16:25, Logan Gunthorpe wrote:
> When locking the ctrl->lock spinlock IRQs need to be disabled to avoid a
> dead lock. The new spin_lock() calls recently added produce the
> following lockdep warning when running the blktest nvme/003:
> 
>  
>  WARNING: inconsistent lock state
>  
>  inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
>  ksoftirqd/2/22 [HC0[0]:SC1[1]:HE0:SE0] takes:
>  888276a8c4c0 (>lock){+.?.}-{2:2}, at: 
> nvme_keep_alive_end_io+0x50/0xc0
>  {SOFTIRQ-ON-W} state was registered at:
>lock_acquire+0x164/0x500
>_raw_spin_lock+0x28/0x40
>nvme_get_effects_log+0x37/0x1c0
>nvme_init_identify+0x9e4/0x14f0
>nvme_reset_work+0xadd/0x2360
>process_one_work+0x66b/0xb70
>worker_thread+0x6e/0x6c0
>kthread+0x1e7/0x210
>ret_from_fork+0x22/0x30
>  irq event stamp: 1449221
>  hardirqs last  enabled at (1449220): [] 
> ktime_get+0xf9/0x140
>  hardirqs last disabled at (1449221): [] 
> _raw_spin_lock_irqsave+0x25/0x60
>  softirqs last  enabled at (1449210): [] 
> __do_softirq+0x447/0x595
>  softirqs last disabled at (1449215): [] 
> run_ksoftirqd+0x35/0x50
> 
>  other info that might help us debug this:
>   Possible unsafe locking scenario:
> 
> CPU0
> 
>lock(>lock);
>
>  lock(>lock);
> 
>   *** DEADLOCK ***
> 
>  no locks held by ksoftirqd/2/22.
> 
>  stack backtrace:
>  CPU: 2 PID: 22 Comm: ksoftirqd/2 Not tainted 
> 5.8.0-rc4-eid-vmlocalyes-dbg-00157-g7236657c6b3a #1450
>  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.12.0-1 
> 04/01/2014
>  Call Trace:
>   dump_stack+0xc8/0x11a
>   print_usage_bug.cold.63+0x235/0x23e
>   mark_lock+0xa9c/0xcf0
>   __lock_acquire+0xd9a/0x2b50
>   lock_acquire+0x164/0x500
>   _raw_spin_lock_irqsave+0x40/0x60
>   nvme_keep_alive_end_io+0x50/0xc0
>   blk_mq_end_request+0x158/0x210
>   nvme_complete_rq+0x146/0x500
>   nvme_loop_complete_rq+0x26/0x30 [nvme_loop]
>   blk_done_softirq+0x187/0x1e0
>   __do_softirq+0x118/0x595
>   run_ksoftirqd+0x35/0x50
>   smpboot_thread_fn+0x1d3/0x310
>   kthread+0x1e7/0x210
>   ret_from_fork+0x22/0x30
> 
> Fixes: be93e87e7802 ("nvme: support for multiple Command Sets Supported and 
> Effects log pages")
> Signed-off-by: Logan Gunthorpe

Thanks for this fix, looks good.

Tested-by: Chaitanya Kulkarni 
Reviewed-by: Chaitanya Kulkarni 


Re: [PATCH] tools build feature: Quote CC and CXX for their arguments

2020-08-12 Thread Tom Hebb
On Wed, Aug 12, 2020 at 3:15 PM Daniel Díaz  wrote:
>
> When using a cross-compilation environment, such as OpenEmbedded,
> the CC an CXX variables are set to something more than just a
> command: there are arguments (such as --sysroot) that need to be
> passed on to the compiler so that the right set of headers and
> libraries are used.
>
> For the particular case that our systems detected, CC is set to
> the following:
>
>   export CC="aarch64-linaro-linux-gcc  
> --sysroot=/oe/build/tmp/work/machine/perf/1.0-r9/recipe-sysroot"
>
> Without quotes, detection is as follows:
>
>   Auto-detecting system features:
>   ... dwarf: [ OFF ]
>   ...dwarf_getlocations: [ OFF ]
>   ... glibc: [ OFF ]
>   ...  gtk2: [ OFF ]
>   ...libbfd: [ OFF ]
>   ...libcap: [ OFF ]
>   ...libelf: [ OFF ]
>   ...   libnuma: [ OFF ]
>   ...numa_num_possible_cpus: [ OFF ]
>   ...   libperl: [ OFF ]
>   ... libpython: [ OFF ]
>   ... libcrypto: [ OFF ]
>   ... libunwind: [ OFF ]
>   ...libdw-dwarf-unwind: [ OFF ]
>   ...  zlib: [ OFF ]
>   ...  lzma: [ OFF ]
>   ... get_cpuid: [ OFF ]
>   ...   bpf: [ OFF ]
>   ...libaio: [ OFF ]
>   ...   libzstd: [ OFF ]
>   ...disassembler-four-args: [ OFF ]
>
>   Makefile.config:414: *** No gnu/libc-version.h found, please install 
> glibc-dev[el].  Stop.
>   Makefile.perf:230: recipe for target 'sub-make' failed
>   make[1]: *** [sub-make] Error 2
>   Makefile:69: recipe for target 'all' failed
>   make: *** [all] Error 2
>
> With CC and CXX quoted, some of those features are now detected.
>
> Fixes: e3232c2f39ac ("tools build feature: Use CC and CXX from parent")
>
> Signed-off-by: Daniel Díaz 

Whoops, I'm the one who introduced this issue. Fix looks good, thanks!

Reviewed-by: Thomas Hebb 
Fixes: e3232c2f39ac ("tools build feature: Use CC and CXX from parent")

> ---
>  tools/build/Makefile.feature | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature
> index 774f0b0ca28a..e7818b44b48e 100644
> --- a/tools/build/Makefile.feature
> +++ b/tools/build/Makefile.feature
> @@ -8,7 +8,7 @@ endif
>
>  feature_check = $(eval $(feature_check_code))
>  define feature_check_code
> -  feature-$(1) := $(shell $(MAKE) OUTPUT=$(OUTPUT_FEATURES) CC=$(CC) 
> CXX=$(CXX) CFLAGS="$(EXTRA_CFLAGS) $(FEATURE_CHECK_CFLAGS-$(1))" 
> CXXFLAGS="$(EXTRA_CXXFLAGS) $(FEATURE_CHECK_CXXFLAGS-$(1))" 
> LDFLAGS="$(LDFLAGS) $(FEATURE_CHECK_LDFLAGS-$(1))" -C $(feature_dir) 
> $(OUTPUT_FEATURES)test-$1.bin >/dev/null 2>/dev/null && echo 1 || echo 0)
> +  feature-$(1) := $(shell $(MAKE) OUTPUT=$(OUTPUT_FEATURES) CC="$(CC)" 
> CXX="$(CXX)" CFLAGS="$(EXTRA_CFLAGS) $(FEATURE_CHECK_CFLAGS-$(1))" 
> CXXFLAGS="$(EXTRA_CXXFLAGS) $(FEATURE_CHECK_CXXFLAGS-$(1))" 
> LDFLAGS="$(LDFLAGS) $(FEATURE_CHECK_LDFLAGS-$(1))" -C $(feature_dir) 
> $(OUTPUT_FEATURES)test-$1.bin >/dev/null 2>/dev/null && echo 1 || echo 0)

We should probably also be quoting the arguments that expand $(OUTPUT_FEATURES)
too, although trying to handle path names with spaces is probably a
lost cause anyway.

>  endef
>
>  feature_set = $(eval $(feature_set_code))
> --
> 2.25.1
>


Re: [PATCH v8 2/4] scsi: ufs: Introduce HPB feature

2020-08-12 Thread Daejun Park
Hi Bart,

On 2020-08-06 02:11, Daejun Park wrote:
> > +static void ufshpb_issue_hpb_reset_query(struct ufs_hba *hba)
> > +{
> > +int err;
> > +int retries;
> > +
> > +for (retries = 0; retries < HPB_RESET_REQ_RETRIES; retries++) {
> > +err = ufshcd_query_flag(hba, UPIU_QUERY_OPCODE_SET_FLAG,
> > +QUERY_FLAG_IDN_HPB_RESET, 0, NULL);
> > +if (err)
> > +dev_dbg(hba->dev,
> > +"%s: failed with error %d, retries %d\n",
> > +__func__, err, retries);
> > +else
> > +break;
> > +}
> > +
> > +if (err) {
> > +dev_err(hba->dev,
> > +"%s setting fHpbReset flag failed with error %d\n",
> > +__func__, err);
> > +return;
> > +}
> > +}
> 
> Please change the "break" into an early return, remove the last
> occurrence "if (err)" and remove the final return statement.

OK, I will.

> 
> > +static void ufshpb_check_hpb_reset_query(struct ufs_hba *hba)
> > +{
> > +int err;
> > +bool flag_res = true;
> > +int try = 0;
> > +
> > +/* wait for the device to complete HPB reset query */
> > +do {
> > +if (++try == HPB_RESET_REQ_RETRIES)
> > +break;
> > +
> > +dev_info(hba->dev,
> > +"%s start flag reset polling %d times\n",
> > +__func__, try);
> > +
> > +/* Poll fHpbReset flag to be cleared */
> > +err = ufshcd_query_flag(hba, UPIU_QUERY_OPCODE_READ_FLAG,
> > +QUERY_FLAG_IDN_HPB_RESET, 0, _res);
> > +usleep_range(1000, 1100);
> > +} while (flag_res);
> > +
> > +if (err) {
> > +dev_err(hba->dev,
> > +"%s reading fHpbReset flag failed with error %d\n",
> > +__func__, err);
> > +return;
> > +}
> > +
> > +if (flag_res) {
> > +dev_err(hba->dev,
> > +"%s fHpbReset was not cleared by the device\n",
> > +__func__);
> > +}
> > +}
> 
> Should "polling %d times" perhaps be changed into "attempt %d"?

I will change it.

> The "if (err)" statement may be reached without "err" having been
> initialized. Please fix.

OK, I will initialize err to 0.

> Additionally, please change the do-while loop into a for-loop, e.g. as
> follows:
> 
> for (try = 0; try < HPB_RESET_REQ_RETRIES; try++)
> ...

OK, I will change do-while to for-loop.

Thanks,

Daejun


Re: [PATCH v8 1/4] scsi: ufs: Add UFS feature related parameter

2020-08-12 Thread Daejun Park
Hi Bart,

> On 2020-08-06 02:02, Daejun Park wrote:
> > @@ -537,6 +548,7 @@ struct ufs_dev_info {
> >  u8 *model;
> >  u16 wspecversion;
> >  u32 clk_gating_wait_us;
> > +u8 b_ufs_feature_sup;
> >  u32 d_ext_ufs_feature_sup;
> >  u8 b_wb_buffer_type;
> >  u32 d_wb_alloc_units;
> > 
> 
> Hmm ... shouldn't this variable be introduced in the patch that introduces
> the code that sets and uses this variable?

OK, I will move this variable to 2/4 patch.

> How about making it clear in the patch subject that this patch adds protocol
> constants related to HPB?

The subject will be changed :
"Add UFS feature related parameter -> Adds constants related to HPB"
 
> Otherwise this patch looks good to me.
> 
> Bart.

Thanks,

Daejun


Re: [PATCH v17 14/21] mm/compaction: do page isolation first in compaction

2020-08-12 Thread Alexander Duyck
On Wed, Aug 12, 2020 at 6:47 PM Alex Shi  wrote:
>
>
>
> 在 2020/8/13 上午12:51, Alexander Duyck 写道:
> > On Wed, Aug 12, 2020 at 4:44 AM Alex Shi  wrote:
> >>
> >>
> >>
> >> 在 2020/8/11 下午10:47, Alexander Duyck 写道:
> >>> On Tue, Aug 11, 2020 at 1:23 AM Alex Shi  
> >>> wrote:
> 
> 
> 
>  在 2020/8/10 下午10:41, Alexander Duyck 写道:
> > On Mon, Aug 10, 2020 at 6:10 AM Alex Shi  
> > wrote:
> >>
> >>
> >>
> >> 在 2020/8/7 下午10:51, Alexander Duyck 写道:
> >>> I wonder if this entire section shouldn't be restructured. This is the
> >>> only spot I can see where we are resetting the LRU flag instead of
> >>> pulling the page from the LRU list with the lock held. Looking over
> >>> the code it seems like something like that should be possible. I am
> >>> not sure the LRU lock is really protecting us in either the
> >>> PageCompound check nor the skip bits. It seems like holding a
> >>> reference on the page should prevent it from switching between
> >>> compound or not, and the skip bits are per pageblock with the LRU bits
> >>> being per node/memcg which I would think implies that we could have
> >>> multiple LRU locks that could apply to a single skip bit.
> >>
> >> Hi Alexander,
> >>
> >> I don't find problem yet on compound or skip bit usage. Would you 
> >> clarify the
> >> issue do you concerned?
> >>
> >> Thanks!
> >
> > The point I was getting at is that the LRU lock is being used to
> > protect these and with your changes I don't think that makes sense
> > anymore.
> >
> > The skip bits are per-pageblock bits. With your change the LRU lock is
> > now per memcg first and then per node. As such I do not believe it
> > really provides any sort of exclusive access to the skip bits. I still
> > have to look into this more, but it seems like you need a lock per
> > either section or zone that can be used to protect those bits and deal
> > with this sooner rather than waiting until you have found an LRU page.
> > The one part that is confusing though is that the definition of the
> > skip bits seems to call out that they are a hint since they are not
> > protected by a lock, but that is exactly what has been happening here.
> >
> 
>  The skip bits are safe here, since even it race with other skip action,
>  It will still skip out. The skip action is try not to compaction too 
>  much,
>  not a exclusive action needs avoid race.
> >>>
> >>> That would be the case if it didn't have the impact that they
> >>> currently do on the compaction process. What I am getting at is that a
> >>> race was introduced when you placed this test between the clearing of
> >>> the LRU flag and the actual pulling of the page from the LRU list. So
> >>> if you tested the skip bits before clearing the LRU flag then I would
> >>> be okay with the code, however because it is triggering an abort after
> >>
> >> Hi Alexander,
> >>
> >> Thanks a lot for comments and suggestions!
> >>
> >> I have tried your suggestion:
> >>
> >> Signed-off-by: Alex Shi 
> >> ---
> >>  mm/compaction.c | 14 +++---
> >>  1 file changed, 7 insertions(+), 7 deletions(-)
> >>
> >> diff --git a/mm/compaction.c b/mm/compaction.c
> >> index b99c96c4862d..6c881dee8c9a 100644
> >> --- a/mm/compaction.c
> >> +++ b/mm/compaction.c
> >> @@ -988,6 +988,13 @@ static bool too_many_isolated(pg_data_t *pgdat)
> >> if (__isolate_lru_page_prepare(page, isolate_mode) != 0)
> >> goto isolate_fail_put;
> >>
> >> +   /* Try get exclusive access under lock */
> >> +   if (!skip_updated) {
> >> +   skip_updated = true;
> >> +   if (test_and_set_skip(cc, page, low_pfn))
> >> +   goto isolate_fail_put;
> >> +   }
> >> +
> >> /* Try isolate the page */
> >> if (!TestClearPageLRU(page))
> >> goto isolate_fail_put;
> >
> > I would have made this much sooner. Probably before you call
> > get_page_unless_zero so as to avoid the unnecessary atomic operations.
> >
> >> @@ -1006,13 +1013,6 @@ static bool too_many_isolated(pg_data_t *pgdat)
> >>
> >> lruvec_memcg_debug(lruvec, page);
> >>
> >> -   /* Try get exclusive access under lock */
> >> -   if (!skip_updated) {
> >> -   skip_updated = true;
> >> -   if (test_and_set_skip(cc, page, low_pfn))
> >> -   goto isolate_abort;
> >> -   }
> >> -
> >> /*
> >>  * Page become compound since the non-locked check,
> >>  * and it's on LRU. It can only be a THP so the 
> >> order
> >> --
> >>
> >> Performance of 

[GIT PULL] xfs: small fixes for 5.9-rc1

2020-08-12 Thread Darrick J. Wong
Hi Linus,

Please pull these two small fixes that have come in during the past
week.  The branch merges cleanly with upstream as of a few minutes ago,
so please let me know if anything strange happens.

--D

The following changes since commit 818d5a91559ffe1e1f2095dcbbdb96c13fdb94ec:

  fs/xfs: Support that ioctl(SETXFLAGS/GETXFLAGS) can set/get inode DAX on XFS. 
(2020-07-28 20:28:20 -0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git tags/xfs-5.9-merge-8

for you to fetch changes up to 96cf2a2c75567ff56195fe3126d497a2e7e4379f:

  xfs: Fix UBSAN null-ptr-deref in xfs_sysfs_init (2020-08-07 11:50:17 -0700)


Fixes for 5.9-rc1:
- Fix duplicated words in comments.
- Fix an ubsan complaint about null pointer arithmetic.


Eiichi Tsukata (1):
  xfs: Fix UBSAN null-ptr-deref in xfs_sysfs_init

Randy Dunlap (1):
  xfs: delete duplicated words + other fixes

 fs/xfs/libxfs/xfs_sb.c| 2 +-
 fs/xfs/xfs_attr_list.c| 2 +-
 fs/xfs/xfs_buf_item.c | 2 +-
 fs/xfs/xfs_buf_item_recover.c | 2 +-
 fs/xfs/xfs_dquot.c| 2 +-
 fs/xfs/xfs_export.c   | 2 +-
 fs/xfs/xfs_inode.c| 4 ++--
 fs/xfs/xfs_inode_item.c   | 4 ++--
 fs/xfs/xfs_iomap.c| 2 +-
 fs/xfs/xfs_log_cil.c  | 2 +-
 fs/xfs/xfs_log_recover.c  | 2 +-
 fs/xfs/xfs_refcount_item.c| 2 +-
 fs/xfs/xfs_reflink.c  | 2 +-
 fs/xfs/xfs_sysfs.h| 6 --
 fs/xfs/xfs_trans_ail.c| 4 ++--
 15 files changed, 21 insertions(+), 19 deletions(-)


[PATCH v2 0/2] Try to release mmap_lock temporarily in smaps_rollup

2020-08-12 Thread Chinwen Chang
Recently, we have observed some janky issues caused by unpleasantly long
contention on mmap_lock which is held by smaps_rollup when probing large
processes. To address the problem, we let smaps_rollup detect if anyone
wants to acquire mmap_lock for write attempts. If yes, just release the
lock temporarily to ease the contention.

smaps_rollup is a procfs interface which allows users to summarize the
process's memory usage without the overhead of seq_* calls. Android uses it
to sample the memory usage of various processes to balance its memory pool
sizes. If no one wants to take the lock for write requests, smaps_rollup
with this patch will behave like the original one.

Although there are on-going mmap_lock optimizations like range-based locks,
the lock applied to smaps_rollup would be the coarse one, which is hard to
avoid the occurrence of aforementioned issues. So the detection and
temporary release for write attempts on mmap_lock in smaps_rollup is still
necessary.

Change since v1:
- If current VMA is freed after dropping the lock, it will return
- incomplete result. To fix this issue, refine the code flow as
- suggested by Steve. [1]

[1] https://lore.kernel.org/lkml/bf40676e-b14b-44cd-75ce-419c70194...@arm.com/


Chinwen Chang (2):
  mmap locking API: add mmap_lock_is_contended()
  mm: proc: smaps_rollup: do not stall write attempts on mmap_lock

 fs/proc/task_mmu.c| 57 ++-
 include/linux/mmap_lock.h |  5 
 2 files changed, 61 insertions(+), 1 deletion(-)

[PATCH v2 1/2] mmap locking API: add mmap_lock_is_contended()

2020-08-12 Thread Chinwen Chang
Add new API to query if someone wants to acquire mmap_lock
for write attempts.

Using this instead of rwsem_is_contended makes it more tolerant
of future changes to the lock type.

Signed-off-by: Chinwen Chang 
---
 include/linux/mmap_lock.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h
index 0707671..18e7eae 100644
--- a/include/linux/mmap_lock.h
+++ b/include/linux/mmap_lock.h
@@ -87,4 +87,9 @@ static inline void mmap_assert_write_locked(struct mm_struct 
*mm)
VM_BUG_ON_MM(!rwsem_is_locked(>mmap_lock), mm);
 }
 
+static inline int mmap_lock_is_contended(struct mm_struct *mm)
+{
+   return rwsem_is_contended(>mmap_lock);
+}
+
 #endif /* _LINUX_MMAP_LOCK_H */
-- 
1.9.1


[PATCH v2 2/2] mm: proc: smaps_rollup: do not stall write attempts on mmap_lock

2020-08-12 Thread Chinwen Chang
smaps_rollup will try to grab mmap_lock and go through the whole vma
list until it finishes the iterating. When encountering large processes,
the mmap_lock will be held for a longer time, which may block other
write requests like mmap and munmap from progressing smoothly.

There are upcoming mmap_lock optimizations like range-based locks, but
the lock applied to smaps_rollup would be the coarse type, which doesn't
avoid the occurrence of unpleasant contention.

To solve aforementioned issue, we add a check which detects whether
anyone wants to grab mmap_lock for write attempts.

Change since v1:
- If current VMA is freed after dropping the lock, it will return
- incomplete result. To fix this issue, refine the code flow as
- suggested by Steve. [1]

[1] https://lore.kernel.org/lkml/bf40676e-b14b-44cd-75ce-419c70194...@arm.com/

Signed-off-by: Chinwen Chang 
---
 fs/proc/task_mmu.c | 56 +-
 1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index dbda449..23b3a447 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -853,9 +853,63 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
 
hold_task_mempolicy(priv);
 
-   for (vma = priv->mm->mmap; vma; vma = vma->vm_next) {
+   for (vma = priv->mm->mmap; vma;) {
smap_gather_stats(vma, );
last_vma_end = vma->vm_end;
+
+   /*
+* Release mmap_lock temporarily if someone wants to
+* access it for write request.
+*/
+   if (mmap_lock_is_contended(mm)) {
+   mmap_read_unlock(mm);
+   ret = mmap_read_lock_killable(mm);
+   if (ret) {
+   release_task_mempolicy(priv);
+   goto out_put_mm;
+   }
+
+   /*
+* After dropping the lock, there are three cases to
+* consider. See the following example for explanation.
+*
+*   +--+--+---+
+*   | VMA1 | VMA2 | VMA3  |
+*   +--+--+---+
+*   |  |  |   |
+*  4k 8k 16k 400k
+*
+* Suppose we drop the lock after reading VMA2 due to
+* contention, then we get:
+*
+*  last_vma_end = 16k
+*
+* 1) VMA2 is freed, but VMA3 exists:
+*
+*find_vma(mm, 16k - 1) will return VMA3.
+*In this case, just continue from VMA3.
+*
+* 2) VMA2 still exists:
+*
+*find_vma(mm, 16k - 1) will return VMA2.
+*Iterate the loop like the original one.
+*
+* 3) No more VMAs can be found:
+*
+*find_vma(mm, 16k - 1) will return NULL.
+*No more things to do, just break.
+*/
+   vma = find_vma(mm, last_vma_end - 1);
+   /* Case 3 above */
+   if (!vma)
+   break;
+
+   /* Case 1 above */
+   if (vma->vm_start >= last_vma_end)
+   continue;
+   }
+   /* Case 2 above */
+   vma = vma->vm_next;
}
 
show_vma_header_prefix(m, priv->mm->mmap->vm_start,
-- 
1.9.1


Re: [PATCH v2] PCI: Introduce flag for detached virtual functions

2020-08-12 Thread Oliver O'Halloran
On Thu, Aug 13, 2020 at 6:33 AM Alex Williamson
 wrote:
>
> On Wed, 12 Aug 2020 15:21:11 -0400
> Matthew Rosato  wrote:
>
> > @@ -521,7 +522,8 @@ static int vfio_basic_config_read(struct 
> > vfio_pci_device *vdev, int pos,
> >   count = vfio_default_config_read(vdev, pos, count, perm, offset, val);
> >
> >   /* Mask in virtual memory enable for SR-IOV devices */
> > - if (offset == PCI_COMMAND && vdev->pdev->is_virtfn) {
> > + if ((offset == PCI_COMMAND) &&
> > + (vdev->pdev->is_virtfn || vdev->pdev->detached_vf)) {
> >   u16 cmd = le16_to_cpu(*(__le16 *)>vconfig[PCI_COMMAND]);
> >   u32 tmp_val = le32_to_cpu(*val);
> >
> > @@ -1734,7 +1736,8 @@ int vfio_config_init(struct vfio_pci_device *vdev)
> >vconfig[PCI_INTERRUPT_PIN]);
> >
> >   vconfig[PCI_INTERRUPT_PIN] = 0; /* Gratuitous for good VFs */
> > -
> > + }
> > + if (pdev->is_virtfn || pdev->detached_vf) {
> >   /*
> >* VFs do no implement the memory enable bit of the COMMAND
> >* register therefore we'll not have it set in our initial
> > diff --git a/include/linux/pci.h b/include/linux/pci.h
> > index 8355306..23a6972 100644
> > --- a/include/linux/pci.h
> > +++ b/include/linux/pci.h
> > @@ -445,6 +445,7 @@ struct pci_dev {
> >   unsigned intis_probed:1;/* Device probing in progress 
> > */
> >   unsigned intlink_active_reporting:1;/* Device capable of 
> > reporting link active */
> >   unsigned intno_vf_scan:1;   /* Don't scan for VFs after 
> > IOV enablement */
> > + unsigned intdetached_vf:1;  /* VF without local PF access 
> > */
>
> Is there too much implicit knowledge in defining a "detached VF"?  For
> example, why do we know that we can skip the portion of
> vfio_config_init() that copies the vendor and device IDs from the
> struct pci_dev into the virtual config space?  It's true on s390x, but
> I think that's because we know that firmware emulates those registers
> for us.
>
> We also skip the INTx pin register sanity checking.  Do we do
> that because we haven't installed the broken device into an s390x
> system?  Because we know firmware manages that for us too?  Or simply
> because s390x doesn't support INTx anyway, and therefore it's another
> architecture implicit decision?

Agreed. Any hacks we put in for normal VFs are going to be needed for
the passed-though VF case. Only applying the memory space enable
workaround doesn't make sense to me either.

> If detached_vf is really equivalent to is_virtfn for all cases that
> don't care about referencing physfn on the pci_dev, then we should
> probably have a macro to that effect.

A pci_is_virtfn() helper would be better than open coding both checks
everywhere. That said, it might be solving the wrong problem. The
union between ->physfn and ->sriov has always seemed like a footgun to
me so we might be better off switching the users who want a physfn to
a helper instead. i.e.

struct pci_dev *pci_get_vf_physfn(struct pci_dev *vf)
{
if (!vf->is_virtfn)
return NULL;

return vf->physfn;
}

...

pf = pci_get_vf_physfn(vf)
if (pf)
/* do pf things */

Then we can just use ->is_virtfn for the normal and detached cases.

Oliver


Re: [PATCH linux-5.2.y-rt only] hrtimer: correct the logic for grab expiry lock

2020-08-12 Thread Zhantao Tang



在 2020/8/12 下午7:45, Rasmus Villemoes 写道:

On 12/08/2020 12.50, zhantao.t...@windriver.com wrote:

From: Zhantao Tang 

In commit: 47b6de0b7f22 ("hrtimer: Add a missing bracket and hide `migration_base' 
on !SMP")
a inline function is_migration_base() is introduced. But
the logic of the hrtimer_grab_expiry_lock was changed.

This patch is to correct it.


Yup, same patch sent back in April, which also had a fixes tag for 5.2.

https://lore.kernel.org/lkml/20200428144026.5882-1-rasmus.villem...@prevas.dk/

It got picked up for 4.19-rt, dunno why it wasn't for 5.2-rt.


Yes, currently 5.2 rt kernel not fix the issue.

And I checked the 4.19 rt kernel, but  did not find the implementation 
of hrtimer_grab_expiry_lock().


It used hrtimer_wait_for_timer() and the logic is OK.


Thanks,

Zhantao



Rasmus


Re: [PATCH v2] PCI: Introduce flag for detached virtual functions

2020-08-12 Thread Oliver O'Halloran
On Thu, Aug 13, 2020 at 5:21 AM Matthew Rosato  wrote:
>
> s390x has the notion of providing VFs to the kernel in a manner
> where the associated PF is inaccessible other than via firmware.
> These are not treated as typical VFs and access to them is emulated
> by underlying firmware which can still access the PF.  After
> abafbc55 however these detached VFs were no longer able to work
> with vfio-pci as the firmware does not provide emulation of the
> PCI_COMMAND_MEMORY bit.  In this case, let's explicitly recognize
> these detached VFs so that vfio-pci can allow memory access to
> them again.

Hmm, cool. I think we have a similar feature on pseries so that's
probably broken too.

> Signed-off-by: Matthew Rosato 
> ---
>  arch/s390/pci/pci.c|  8 
>  drivers/vfio/pci/vfio_pci_config.c | 11 +++
>  include/linux/pci.h|  1 +
>  3 files changed, 16 insertions(+), 4 deletions(-)
>
> diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
> index 3902c9f..04ac76d 100644
> --- a/arch/s390/pci/pci.c
> +++ b/arch/s390/pci/pci.c
> @@ -581,6 +581,14 @@ int pcibios_enable_device(struct pci_dev *pdev, int mask)
>  {
> struct zpci_dev *zdev = to_zpci(pdev);
>
> +   /*
> +* If we have a VF on a non-multifunction bus, it must be a VF that is
> +* detached from its parent PF.  We rely on firmware emulation to
> +* provide underlying PF details.
> +*/
> +   if (zdev->vfn && !zdev->zbus->multifunction)
> +   pdev->detached_vf = 1;

The enable hook seems like it's a bit too late for this sort of
screwing around with the pci_dev. Anything in the setup path that
looks at ->detached_vf would see it cleared while anything that looks
after the device is enabled will see it set. Can this go into
pcibios_add_device() or a fixup instead?


Re: [RFC PATCH] printk: Change timestamp to triplet as mono, boot and real

2020-08-12 Thread Sergey Senozhatsky
On (20/08/11 15:02), Petr Mladek wrote:
> On Tue 2020-08-11 14:05:12, Thomas Gleixner wrote:
> > Petr Mladek  writes:
> > > At least "crash" tool would need an update anyway. AFAIK, it checks
> > > the size of struct printk_log and refuses to read it when it changes.
> > >
> > > It means that the hack with VMCOREINFO_FIELD_OFFSET probably is not
> > > needed because we would need to update the crashdump-related tools anyway.
> > >
> > > Well, the timing is good. We are about to switch the printk ring
> > > buffer into a lockless one. It requires updating the crashdump tools
> > > as well. We could do this at the same time. The lockless ring buffer
> > > already is in linux-next. It is aimed for 5.10 or 5.11.
> > ...
> > > It would be great to synchronize all these changes changes of the
> > > printk log buffer structures.
> > 
> > I agree that having one update is a good thing, but pretty please can we
> > finally make progress with this and not create yet another dependency?
> 
> To make it clear. I definitely do not want to block lockless printk by
> this.
> 
> BTW: I am not 100% convinced that storing all three timestamps is
> worth it. It increases the code complexity, metadata size. It needs
> an interface with the userspace that has to stay backward compatible.

Can we, perhaps, store those various "alternative" timestamps in dict so
then whoever wants to read them can just parse the dict key:value pairs
attach to each printk message?

> Also it still will be racy because the timestamp is taken when the message
> is printed. It might be "long" before or after the event that
> it talks about.
>
> There is still the alternative to print all three timestamps regularly
> for those interested. It is less user convenient but much easier
> to maintain.

Yes, that's a nice alternative.

-ss


Re: [PATCH v2 0/4] Perf tool: Enable Arm arch timer counter and arm-spe's timestamp

2020-08-12 Thread Leo Yan
On Wed, Aug 12, 2020 at 03:53:34PM -0300, Arnaldo Carvalho de Melo wrote:
> Em Wed, Aug 12, 2020 at 10:06:53AM -0600, Mathieu Poirier escreveu:
> > Hi Arnaldo,
> > 
> > On Fri, 7 Aug 2020 at 01:16, Leo Yan  wrote:
> > >
> > > This patch set is to enable Arm arch timer counter and Arm SPE is the
> > > first customer to use arch timer counter for its timestamp.
> > >
> > > Patches 01 ~ 03 enables Arm arch timer counter in Perf tool; patch 01 is
> > > to retrieve arch timer's parameters from mmaped page; patch 02 provides
> > > APIs for the conversion between arch timer's counter and time; patch 03
> > > adds a test for patches 01 and 02.
> > >
> > > As the first customer to use Arm arch timer counter in perf tool, patch
> > > 04 is to generate sample's timestamp for ARM SPE AUX trace data.
> > >
> > > This patch set has been rebased on perf/core branch with the latest
> > > commit c4735d990268 ("perf evsel: Don't set
> > > sample_regs_intr/sample_regs_user for dummy event").
> > 
> > The ARM SPE perf tools code is orphan and I don't have the cycles to
> > pick it up.  Leo has spent a lot of time in that code and as such I
> > suggest that he starts maintaining it, probably following the same
> > kind of arrangement you and I have for coresight.
> 
> Thats ok with me, I think we should reflect that on the MAINTAINERS
> file, right?

Very appreciate for the promoting, Mathieu and Arnaldo.

> We have this already:
> 
> PERFORMANCE EVENTS SUBSYSTEM ARM64 PMU EVENTS
> R:  John Garry 
> R:  Will Deacon 
> L:  linux-arm-ker...@lists.infradead.org (moderated for non-subscribers)
> S:  Supported
> F:  tools/perf/pmu-events/arch/arm64/
> 
> I think we should have entries for CoreSight and ARM SPE, one listing
> you as the maintainer and the other listing Leo, right?
> 
> Leo, it would be good for you to try and find someone at your
> organization or elsewhere that could help with this, this way I would
> wait a bit, see if someone else Acks it, and collect those, processing
> it from you after a while if nobody chimes in.

I will follow up this and might offline to check with relevant
maintainers (Will and John) for what's good way to proceed.

> Please continue providing 'perf test' regression tests, etc, to help
> with maintaining this code being checked.

Sure, I'm glad to continue the maintaining and testing works :)

Thanks,
Leo


Re: [PATCH v3 0/6] tracing: export event trace and trace_marker

2020-08-12 Thread Tingwei Zhang
On Thu, Aug 13, 2020 at 12:49:25AM +0800, Mathieu Poirier wrote:
> On Tue, Aug 11, 2020 at 11:57:20AM +0800, Tingwei Zhang wrote:
> > Ftrace has ability to export trace packets to other destination.
> > Currently, only function trace can be exported. This series extends the
> > support to event trace and trace_maker. STM is one possible destination
> to
> > export ftrace. Use separate channel for each CPU to avoid mixing up
> packets
> > from different CPUs together.
> 
> As the get_maintainer script points out, you are missing the maintainer of
> the
> system trace subsystem.
> 
Sorry for missing that. I resent the patch and add Alexander.

> > 
> > Change from v2:
> > Change flag definition to BIT(). (Steven)
> > Add comment in stm_ftrace_write() to clarify it's safe to use 
> > smp_processor_id() here since preempt is disabled. (Steven) 
> > 
> > Change from v1:
> > All changes are suggested by Steven Rostedt.
> > User separate flag to control function trace, event trace and trace
> mark.
> > Allocate channels according to num_possible_cpu() dynamically.
> > Move ftrace_exports routines up so all ftrace can use them.
> > 
> > Tingwei Zhang (6):
> >   stm class: ftrace: change dependency to TRACING
> >   tracing: add flag to control different traces
> >   tracing: add trace_export support for event trace
> >   tracing: add trace_export support for trace_marker
> >   stm class: ftrace: enable supported trace export flag
> >   stm class: ftrace: use different channel accroding to CPU
> > 
> >  drivers/hwtracing/stm/Kconfig  |   2 +-
> >  drivers/hwtracing/stm/ftrace.c |   7 +-
> >  include/linux/trace.h  |   7 +
> >  kernel/trace/trace.c   | 270 ++---
> >  4 files changed, 159 insertions(+), 127 deletions(-)
> > 
> > -- 
> > The Qualcomm Innovation Center, Inc. is a member of the Code Aurora
> Forum,
> > a Linux Foundation Collaborative Project
> > 


[PATCH v3 6/6] stm class: ftrace: use different channel accroding to CPU

2020-08-12 Thread Tingwei Zhang
To avoid mixup of packets from differnt ftrace packets simultaneously,
use different channel for packets from different CPU.

Signed-off-by: Tingwei Zhang 
Reviewed-by: Steven Rostedt (VMware) 
---
 drivers/hwtracing/stm/ftrace.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/hwtracing/stm/ftrace.c b/drivers/hwtracing/stm/ftrace.c
index c694a6e692d1..ebf29489919c 100644
--- a/drivers/hwtracing/stm/ftrace.c
+++ b/drivers/hwtracing/stm/ftrace.c
@@ -37,8 +37,10 @@ static void notrace
 stm_ftrace_write(struct trace_export *export, const void *buf, unsigned int 
len)
 {
struct stm_ftrace *stm = container_of(export, struct stm_ftrace, 
ftrace);
+   /* This is called from trace system with preemption disabled */
+   unsigned int cpu = smp_processor_id();
 
-   stm_source_write(>data, STM_FTRACE_CHAN, buf, len);
+   stm_source_write(>data, STM_FTRACE_CHAN + cpu, buf, len);
 }
 
 static int stm_ftrace_link(struct stm_source_data *data)
@@ -63,6 +65,7 @@ static int __init stm_ftrace_init(void)
 {
int ret;
 
+   stm_ftrace.data.nr_chans = num_possible_cpus();
ret = stm_source_register_device(NULL, _ftrace.data);
if (ret)
pr_err("Failed to register stm_source - ftrace.\n");
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



[PATCH v3 5/6] stm class: ftrace: enable supported trace export flag

2020-08-12 Thread Tingwei Zhang
Set flags for trace_export. Export function trace, event trace
and trace marker to stm.

Signed-off-by: Tingwei Zhang 
Reviewed-by: Steven Rostedt (VMware) 
---
 drivers/hwtracing/stm/ftrace.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/hwtracing/stm/ftrace.c b/drivers/hwtracing/stm/ftrace.c
index ce868e095410..c694a6e692d1 100644
--- a/drivers/hwtracing/stm/ftrace.c
+++ b/drivers/hwtracing/stm/ftrace.c
@@ -46,6 +46,8 @@ static int stm_ftrace_link(struct stm_source_data *data)
struct stm_ftrace *sf = container_of(data, struct stm_ftrace, data);
 
sf->ftrace.write = stm_ftrace_write;
+   sf->ftrace.flags = TRACE_EXPORT_FUNCTION | TRACE_EXPORT_EVENT
+   | TRACE_EXPORT_MARKER;
 
return register_ftrace_export(>ftrace);
 }
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



Re: [PATCH v17 14/21] mm/compaction: do page isolation first in compaction

2020-08-12 Thread Alex Shi



在 2020/8/13 上午12:51, Alexander Duyck 写道:
> On Wed, Aug 12, 2020 at 4:44 AM Alex Shi  wrote:
>>
>>
>>
>> 在 2020/8/11 下午10:47, Alexander Duyck 写道:
>>> On Tue, Aug 11, 2020 at 1:23 AM Alex Shi  wrote:



 在 2020/8/10 下午10:41, Alexander Duyck 写道:
> On Mon, Aug 10, 2020 at 6:10 AM Alex Shi  
> wrote:
>>
>>
>>
>> 在 2020/8/7 下午10:51, Alexander Duyck 写道:
>>> I wonder if this entire section shouldn't be restructured. This is the
>>> only spot I can see where we are resetting the LRU flag instead of
>>> pulling the page from the LRU list with the lock held. Looking over
>>> the code it seems like something like that should be possible. I am
>>> not sure the LRU lock is really protecting us in either the
>>> PageCompound check nor the skip bits. It seems like holding a
>>> reference on the page should prevent it from switching between
>>> compound or not, and the skip bits are per pageblock with the LRU bits
>>> being per node/memcg which I would think implies that we could have
>>> multiple LRU locks that could apply to a single skip bit.
>>
>> Hi Alexander,
>>
>> I don't find problem yet on compound or skip bit usage. Would you 
>> clarify the
>> issue do you concerned?
>>
>> Thanks!
>
> The point I was getting at is that the LRU lock is being used to
> protect these and with your changes I don't think that makes sense
> anymore.
>
> The skip bits are per-pageblock bits. With your change the LRU lock is
> now per memcg first and then per node. As such I do not believe it
> really provides any sort of exclusive access to the skip bits. I still
> have to look into this more, but it seems like you need a lock per
> either section or zone that can be used to protect those bits and deal
> with this sooner rather than waiting until you have found an LRU page.
> The one part that is confusing though is that the definition of the
> skip bits seems to call out that they are a hint since they are not
> protected by a lock, but that is exactly what has been happening here.
>

 The skip bits are safe here, since even it race with other skip action,
 It will still skip out. The skip action is try not to compaction too much,
 not a exclusive action needs avoid race.
>>>
>>> That would be the case if it didn't have the impact that they
>>> currently do on the compaction process. What I am getting at is that a
>>> race was introduced when you placed this test between the clearing of
>>> the LRU flag and the actual pulling of the page from the LRU list. So
>>> if you tested the skip bits before clearing the LRU flag then I would
>>> be okay with the code, however because it is triggering an abort after
>>
>> Hi Alexander,
>>
>> Thanks a lot for comments and suggestions!
>>
>> I have tried your suggestion:
>>
>> Signed-off-by: Alex Shi 
>> ---
>>  mm/compaction.c | 14 +++---
>>  1 file changed, 7 insertions(+), 7 deletions(-)
>>
>> diff --git a/mm/compaction.c b/mm/compaction.c
>> index b99c96c4862d..6c881dee8c9a 100644
>> --- a/mm/compaction.c
>> +++ b/mm/compaction.c
>> @@ -988,6 +988,13 @@ static bool too_many_isolated(pg_data_t *pgdat)
>> if (__isolate_lru_page_prepare(page, isolate_mode) != 0)
>> goto isolate_fail_put;
>>
>> +   /* Try get exclusive access under lock */
>> +   if (!skip_updated) {
>> +   skip_updated = true;
>> +   if (test_and_set_skip(cc, page, low_pfn))
>> +   goto isolate_fail_put;
>> +   }
>> +
>> /* Try isolate the page */
>> if (!TestClearPageLRU(page))
>> goto isolate_fail_put;
> 
> I would have made this much sooner. Probably before you call
> get_page_unless_zero so as to avoid the unnecessary atomic operations.
> 
>> @@ -1006,13 +1013,6 @@ static bool too_many_isolated(pg_data_t *pgdat)
>>
>> lruvec_memcg_debug(lruvec, page);
>>
>> -   /* Try get exclusive access under lock */
>> -   if (!skip_updated) {
>> -   skip_updated = true;
>> -   if (test_and_set_skip(cc, page, low_pfn))
>> -   goto isolate_abort;
>> -   }
>> -
>> /*
>>  * Page become compound since the non-locked check,
>>  * and it's on LRU. It can only be a THP so the order
>> --
>>
>> Performance of case-lru-file-mmap-read in vm-scalibity is dropped a bit. not
>> helpful
> 
> So one issue with this change is that it is still too late to be of
> much benefit. Really you should probably be doing this much sooner,
> for example somewhere before the get_page_unless_zero(). Also the
> thing that still has 

[PATCH v3 0/6] tracing: export event trace and trace_marker

2020-08-12 Thread Tingwei Zhang
Ftrace has ability to export trace packets to other destination.
Currently, only function trace can be exported. This series extends the
support to event trace and trace_maker. STM is one possible destination to
export ftrace. Use separate channel for each CPU to avoid mixing up packets
from different CPUs together.

Change from v2:
Change flag definition to BIT(). (Steven)
Add comment in stm_ftrace_write() to clarify it's safe to use 
smp_processor_id() here since preempt is disabled. (Steven) 

Change from v1:
All changes are suggested by Steven Rostedt.
User separate flag to control function trace, event trace and trace mark.
Allocate channels according to num_possible_cpu() dynamically.
Move ftrace_exports routines up so all ftrace can use them.

Tingwei Zhang (6):
  stm class: ftrace: change dependency to TRACING
  tracing: add flag to control different traces
  tracing: add trace_export support for event trace
  tracing: add trace_export support for trace_marker
  stm class: ftrace: enable supported trace export flag
  stm class: ftrace: use different channel accroding to CPU

 drivers/hwtracing/stm/Kconfig  |   2 +-
 drivers/hwtracing/stm/ftrace.c |   7 +-
 include/linux/trace.h  |   7 +
 kernel/trace/trace.c   | 270 ++---
 4 files changed, 159 insertions(+), 127 deletions(-)

-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



[PATCH v3 2/6] tracing: add flag to control different traces

2020-08-12 Thread Tingwei Zhang
More traces like event trace or trace marker will be supported.
Add flag for difference traces, so that they can be controlled
separately. Move current function trace to it's own flag
instead of global ftrace enable flag.

Signed-off-by: Tingwei Zhang 
Reviewed-by: Steven Rostedt (VMware) 
---
 include/linux/trace.h |  5 +
 kernel/trace/trace.c  | 36 +++-
 2 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/include/linux/trace.h b/include/linux/trace.h
index 7fd86d3c691f..bc93063dda39 100644
--- a/include/linux/trace.h
+++ b/include/linux/trace.h
@@ -3,6 +3,9 @@
 #define _LINUX_TRACE_H
 
 #ifdef CONFIG_TRACING
+
+#define TRACE_EXPORT_FUNCTION  BIT(0)
+
 /*
  * The trace export - an export of Ftrace output. The trace_export
  * can process traces and export them to a registered destination as
@@ -15,10 +18,12 @@
  * next- pointer to the next trace_export
  * write   - copy traces which have been delt with ->commit() to
  *   the destination
+ * flags   - which ftrace to be exported
  */
 struct trace_export {
struct trace_export __rcu   *next;
void (*write)(struct trace_export *, const void *, unsigned int);
+   int flags;
 };
 
 int register_ftrace_export(struct trace_export *export);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bb62269724d5..8f1e66831e9e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2747,33 +2747,37 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer 
*buffer,
 
 static void
 trace_process_export(struct trace_export *export,
-  struct ring_buffer_event *event)
+  struct ring_buffer_event *event, int flag)
 {
struct trace_entry *entry;
unsigned int size = 0;
 
-   entry = ring_buffer_event_data(event);
-   size = ring_buffer_event_length(event);
-   export->write(export, entry, size);
+   if (export->flags & flag) {
+   entry = ring_buffer_event_data(event);
+   size = ring_buffer_event_length(event);
+   export->write(export, entry, size);
+   }
 }
 
 static DEFINE_MUTEX(ftrace_export_lock);
 
 static struct trace_export __rcu *ftrace_exports_list __read_mostly;
 
-static DEFINE_STATIC_KEY_FALSE(ftrace_exports_enabled);
+static DEFINE_STATIC_KEY_FALSE(trace_function_exports_enabled);
 
-static inline void ftrace_exports_enable(void)
+static inline void ftrace_exports_enable(struct trace_export *export)
 {
-   static_branch_enable(_exports_enabled);
+   if (export->flags & TRACE_EXPORT_FUNCTION)
+   static_branch_inc(_function_exports_enabled);
 }
 
-static inline void ftrace_exports_disable(void)
+static inline void ftrace_exports_disable(struct trace_export *export)
 {
-   static_branch_disable(_exports_enabled);
+   if (export->flags & TRACE_EXPORT_FUNCTION)
+   static_branch_dec(_function_exports_enabled);
 }
 
-static void ftrace_exports(struct ring_buffer_event *event)
+static void ftrace_exports(struct ring_buffer_event *event, int flag)
 {
struct trace_export *export;
 
@@ -2781,7 +2785,7 @@ static void ftrace_exports(struct ring_buffer_event 
*event)
 
export = rcu_dereference_raw_check(ftrace_exports_list);
while (export) {
-   trace_process_export(export, event);
+   trace_process_export(export, event, flag);
export = rcu_dereference_raw_check(export->next);
}
 
@@ -2821,8 +2825,7 @@ rm_trace_export(struct trace_export **list, struct 
trace_export *export)
 static inline void
 add_ftrace_export(struct trace_export **list, struct trace_export *export)
 {
-   if (*list == NULL)
-   ftrace_exports_enable();
+   ftrace_exports_enable(export);
 
add_trace_export(list, export);
 }
@@ -2833,8 +2836,7 @@ rm_ftrace_export(struct trace_export **list, struct 
trace_export *export)
int ret;
 
ret = rm_trace_export(list, export);
-   if (*list == NULL)
-   ftrace_exports_disable();
+   ftrace_exports_disable(export);
 
return ret;
 }
@@ -2887,8 +2889,8 @@ trace_function(struct trace_array *tr,
entry->parent_ip= parent_ip;
 
if (!call_filter_check_discard(call, entry, buffer, event)) {
-   if (static_branch_unlikely(_exports_enabled))
-   ftrace_exports(event);
+   if (static_branch_unlikely(_function_exports_enabled))
+   ftrace_exports(event, TRACE_EXPORT_FUNCTION);
__buffer_unlock_commit(buffer, event);
}
 }
-- 
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project



[PATCH v3 3/6] tracing: add trace_export support for event trace

2020-08-12 Thread Tingwei Zhang
Only function traces can be exported to other destinations currently.
This patch exports event trace as well. Move trace export related
function to the beginning of file so other trace can call
trace_process_export() to export.

Signed-off-by: Tingwei Zhang 
Reviewed-by: Steven Rostedt (VMware) 
---
 include/linux/trace.h |   1 +
 kernel/trace/trace.c  | 259 ++
 2 files changed, 135 insertions(+), 125 deletions(-)

diff --git a/include/linux/trace.h b/include/linux/trace.h
index bc93063dda39..5a01eeffb254 100644
--- a/include/linux/trace.h
+++ b/include/linux/trace.h
@@ -5,6 +5,7 @@
 #ifdef CONFIG_TRACING
 
 #define TRACE_EXPORT_FUNCTION  BIT(0)
+#define TRACE_EXPORT_EVENT BIT(1)
 
 /*
  * The trace export - an export of Ftrace output. The trace_export
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8f1e66831e9e..2f9302a8b322 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -251,6 +251,138 @@ unsigned long long ns2usecs(u64 nsec)
return nsec;
 }
 
+static void
+trace_process_export(struct trace_export *export,
+  struct ring_buffer_event *event, int flag)
+{
+   struct trace_entry *entry;
+   unsigned int size = 0;
+
+   if (export->flags & flag) {
+   entry = ring_buffer_event_data(event);
+   size = ring_buffer_event_length(event);
+   export->write(export, entry, size);
+   }
+}
+
+static DEFINE_MUTEX(ftrace_export_lock);
+
+static struct trace_export __rcu *ftrace_exports_list __read_mostly;
+
+static DEFINE_STATIC_KEY_FALSE(trace_function_exports_enabled);
+static DEFINE_STATIC_KEY_FALSE(trace_event_exports_enabled);
+
+static inline void ftrace_exports_enable(struct trace_export *export)
+{
+   if (export->flags & TRACE_EXPORT_FUNCTION)
+   static_branch_inc(_function_exports_enabled);
+
+   if (export->flags & TRACE_EXPORT_EVENT)
+   static_branch_inc(_event_exports_enabled);
+}
+
+static inline void ftrace_exports_disable(struct trace_export *export)
+{
+   if (export->flags & TRACE_EXPORT_FUNCTION)
+   static_branch_dec(_function_exports_enabled);
+
+   if (export->flags & TRACE_EXPORT_EVENT)
+   static_branch_dec(_event_exports_enabled);
+}
+
+static void ftrace_exports(struct ring_buffer_event *event, int flag)
+{
+   struct trace_export *export;
+
+   preempt_disable_notrace();
+
+   export = rcu_dereference_raw_check(ftrace_exports_list);
+   while (export) {
+   trace_process_export(export, event, flag);
+   export = rcu_dereference_raw_check(export->next);
+   }
+
+   preempt_enable_notrace();
+}
+
+static inline void
+add_trace_export(struct trace_export **list, struct trace_export *export)
+{
+   rcu_assign_pointer(export->next, *list);
+   /*
+* We are entering export into the list but another
+* CPU might be walking that list. We need to make sure
+* the export->next pointer is valid before another CPU sees
+* the export pointer included into the list.
+*/
+   rcu_assign_pointer(*list, export);
+}
+
+static inline int
+rm_trace_export(struct trace_export **list, struct trace_export *export)
+{
+   struct trace_export **p;
+
+   for (p = list; *p != NULL; p = &(*p)->next)
+   if (*p == export)
+   break;
+
+   if (*p != export)
+   return -1;
+
+   rcu_assign_pointer(*p, (*p)->next);
+
+   return 0;
+}
+
+static inline void
+add_ftrace_export(struct trace_export **list, struct trace_export *export)
+{
+   ftrace_exports_enable(export);
+
+   add_trace_export(list, export);
+}
+
+static inline int
+rm_ftrace_export(struct trace_export **list, struct trace_export *export)
+{
+   int ret;
+
+   ret = rm_trace_export(list, export);
+   ftrace_exports_disable(export);
+
+   return ret;
+}
+
+int register_ftrace_export(struct trace_export *export)
+{
+   if (WARN_ON_ONCE(!export->write))
+   return -1;
+
+   mutex_lock(_export_lock);
+
+   add_ftrace_export(_exports_list, export);
+
+   mutex_unlock(_export_lock);
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(register_ftrace_export);
+
+int unregister_ftrace_export(struct trace_export *export)
+{
+   int ret;
+
+   mutex_lock(_export_lock);
+
+   ret = rm_ftrace_export(_exports_list, export);
+
+   mutex_unlock(_export_lock);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(unregister_ftrace_export);
+
 /* trace_flags holds trace_options default values */
 #define TRACE_DEFAULT_FLAGS\
(FUNCTION_DEFAULT_FLAGS |   \
@@ -2702,6 +2834,8 @@ void trace_event_buffer_commit(struct trace_event_buffer 
*fbuffer)
if (static_key_false(_printk_key.key))
output_printk(fbuffer);
 
+   if 

  1   2   3   4   5   6   7   8   9   10   >