Re: [PATCH] softirq: add irq off checking for __raise_softirq_irqoff
On Thu, Aug 06, 2020 at 12:07:29PM +0800, Jiafei Pan wrote: > __raise_softirq_irqoff will update per-CPU mask of pending softirqs, > it need to be called in irq disabled context in order to keep it atomic > operation, otherwise it will be interrupted by hardware interrupt, > and per-CPU softirqs pending mask will be corrupted, the result is > there will be unexpected issue, for example hrtimer soft irq will > be losed and soft hrtimer will never be expire and handled. > > Adding irqs disabled checking here to provide warning in irqs enabled > context. > > Signed-off-by: Jiafei Pan > --- > kernel/softirq.c | 5 + > 1 file changed, 5 insertions(+) > > diff --git a/kernel/softirq.c b/kernel/softirq.c > index bf88d7f62433..11f61e54a3ae 100644 > --- a/kernel/softirq.c > +++ b/kernel/softirq.c > @@ -481,6 +481,11 @@ void raise_softirq(unsigned int nr) > > void __raise_softirq_irqoff(unsigned int nr) > { > + /* This function can only be called in irq disabled context, > + * otherwise or_softirq_pending will be interrupted by hardware > + * interrupt, so that there will be unexpected issue. > + */ Comment style is wrong, also I'm not sure the comment is really helpfull. > + WARN_ON_ONCE(!irqs_disabled()); lockdep_assert_irqs_disabled(); > trace_softirq_raise(nr); > or_softirq_pending(1UL << nr); > }
Re: [PATCH v3 2/2] powerpc/uaccess: Add pre-update addressing to __get_user_asm() and __put_user_asm()
Le 12/08/2020 à 21:37, Segher Boessenkool a écrit : On Wed, Aug 12, 2020 at 12:25:17PM +, Christophe Leroy wrote: Enable pre-update addressing mode in __get_user_asm() and __put_user_asm() Signed-off-by: Christophe Leroy --- v3: new, splited out from patch 1. It still looks fine to me, you can keep my Reviewed-by: :-) Ah yes thanks, forgot it when I commited it. Reviewed-by: Segher Boessenkool Christophe
Re: [PATCH v4 1/7] pinctrl: qcom: Add msmgpio irqchip flags
Hi, On 8/12/2020 1:02 AM, Stephen Boyd wrote: Can the subject be more specific? "pinctrl: qcom: Set IRQCHIP_SET_TYPE_MASKED flag"? Sure i can update subject in v5. Thanks, Maulik Quoting Maulik Shah (2020-08-10 04:20:54) Add irqchip specific flags for msmgpio irqchip to mask non wakeirqs during suspend and mask before setting irq type. Masking before changing type should make sure any spurious interrupt is not detected during this operation. Fixes: e35a6ae0eb3a ("pinctrl/msm: Setup GPIO chip in hierarchy") Acked-by: Linus Walleij Signed-off-by: Maulik Shah --- -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation
Re: [PATCH v2] riscv: Setup exception vector for nommu platform
On 2020/08/13 12:40, Qiu Wenbo wrote: > Exception vector is missing on nommu platform and that is an issue. > This patch is tested in Sipeed Maix Bit Dev Board. > > Fixes: 79b1feba5455 ("RISC-V: Setup exception vector early") > Suggested-by: Anup Patel > Suggested-by: Atish Patra > Signed-off-by: Qiu Wenbo Please add a cc stable #5.8 tag. Kendryte support is in 5.8 stable. > --- > arch/riscv/kernel/head.S | 25 + > 1 file changed, 17 insertions(+), 8 deletions(-) > > diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S > index d0c5c316e9bb..0a4e81b8dc79 100644 > --- a/arch/riscv/kernel/head.S > +++ b/arch/riscv/kernel/head.S > @@ -77,16 +77,10 @@ relocate: > csrw CSR_SATP, a0 > .align 2 > 1: > - /* Set trap vector to exception handler */ > - la a0, handle_exception > + /* Set trap vector to spin forever to help debug */ > + la a0, .Lsecondary_park > csrw CSR_TVEC, a0 > > - /* > - * Set sup0 scratch register to 0, indicating to exception vector that > - * we are presently executing in kernel. > - */ > - csrw CSR_SCRATCH, zero > - > /* Reload the global pointer */ > .option push > .option norelax > @@ -144,9 +138,23 @@ secondary_start_common: > la a0, swapper_pg_dir > call relocate > #endif > + call setup_trap_vector > tail smp_callin > #endif /* CONFIG_SMP */ > > +.align 2 > +setup_trap_vector: > + /* Set trap vector to exception handler */ > + la a0, handle_exception > + csrw CSR_TVEC, a0 > + > + /* > + * Set sup0 scratch register to 0, indicating to exception vector that > + * we are presently executing in kernel. > + */ > + csrw CSR_SCRATCH, zero > + ret > + > .Lsecondary_park: > /* We lack SMP support or have too many harts, so park this hart */ > wfi > @@ -240,6 +248,7 @@ clear_bss_done: > call relocate > #endif /* CONFIG_MMU */ > > + call setup_trap_vector > /* Restore C environment */ > la tp, init_task > sw zero, TASK_TI_CPU(tp) > -- Damien Le Moal Western Digital Research
Re: WARN_ON_ONCE(1) in iomap_dio_actor()
On Mon, Aug 10, 2020 at 10:03:03PM -0400, Qian Cai wrote: > On Sun, Jul 26, 2020 at 04:24:12PM +0100, Christoph Hellwig wrote: > > On Fri, Jul 24, 2020 at 02:24:32PM -0400, Qian Cai wrote: > > > On Fri, Jun 19, 2020 at 05:17:47PM -0700, Matthew Wilcox wrote: > > > > On Fri, Jun 19, 2020 at 05:17:50PM -0400, Qian Cai wrote: > > > > > Running a syscall fuzzer by a normal user could trigger this, > > > > > > > > > > [55649.32][T515839] WARNING: CPU: 6 PID: 515839 at > > > > > fs/iomap/direct-io.c:391 iomap_dio_actor+0x29c/0x420 > > > > ... > > > > > 371 static loff_t > > > > > 372 iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, > > > > > 373 void *data, struct iomap *iomap, struct iomap > > > > > *srcmap) > > > > > 374 { > > > > > 375 struct iomap_dio *dio = data; > > > > > 376 > > > > > 377 switch (iomap->type) { > > > > > 378 case IOMAP_HOLE: > > > > > 379 if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE)) > > > > > 380 return -EIO; > > > > > 381 return iomap_dio_hole_actor(length, dio); > > > > > 382 case IOMAP_UNWRITTEN: > > > > > 383 if (!(dio->flags & IOMAP_DIO_WRITE)) > > > > > 384 return iomap_dio_hole_actor(length, dio); > > > > > 385 return iomap_dio_bio_actor(inode, pos, length, > > > > > dio, iomap); > > > > > 386 case IOMAP_MAPPED: > > > > > 387 return iomap_dio_bio_actor(inode, pos, length, > > > > > dio, iomap); > > > > > 388 case IOMAP_INLINE: > > > > > 389 return iomap_dio_inline_actor(inode, pos, length, > > > > > dio, iomap); > > > > > 390 default: > > > > > 391 WARN_ON_ONCE(1); > > > > > 392 return -EIO; > > > > > 393 } > > > > > 394 } > > > > > > > > > > Could that be iomap->type == IOMAP_DELALLOC ? Looking throught the > > > > > logs, > > > > > it contains a few pread64() calls until this happens, > > > > > > > > It _shouldn't_ be able to happen. XFS writes back ranges which exist > > > > in the page cache upon seeing an O_DIRECT I/O. So it's not supposed to > > > > be possible for there to be an extent which is waiting for the contents > > > > of the page cache to be written back. > > > > > > Okay, it is IOMAP_DELALLOC. We have, > > > > Can you share the fuzzer? If we end up with delalloc space here we > > probably need to fix a bug in the cache invalidation code. > > Here is a simple reproducer (I believe it can also be reproduced using > xfstests > generic/503 on a plain xfs without DAX when SCRATCH_MNT == TEST_DIR), > > # git clone https://gitlab.com/cailca/linux-mm > # cd linux-mm; make > # ./random 14 Ok: file.fd_write = safe_open("./testfile", O_RDWR|O_CREAT); file.fd_read = safe_open("./testfile", O_RDWR|O_CREAT|O_DIRECT); file.ptr = safe_mmap(NULL, fsize, PROT_READ|PROT_WRITE, MAP_SHARED, file.fd_write, 0); So this is all IO to the same inode and you loop while !done { do { rc = pread(file.fd_read, file.ptr + read, fsize - read, read); if (rc > 0) read += rc; } while (rc > 0); rc = safe_fallocate(file.fd_write, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, fsize); } On two threads at once? So, essentially, you do a DIO read into a mmap()d range from the same file, with DIO read ascending and the mmap() range descending, then once that is done you hole punch the file and do it again? IOWs, this is a racing page_mkwrite()/DIO read workload, and the moment the two threads hit the same block of the file with a DIO read and a page_mkwrite at the same time, it throws a warning. Well, that's completely expected behaviour. DIO is not serialised against mmap() access at all, and so if the page_mkwrite occurs between the writeback and the iomap_apply() call in the dio path, then it will see the delalloc block taht the page-mkwrite allocated. No sane application would ever do this, it's behaviour as expected, so I don't think there's anything to care about here. Cheers, Dave. -- Dave Chinner da...@fromorbit.com
[PATCH] rtc: cmos: initialize rtc time when reading alarm
cmos_read_alarm() may leave certain fields of a struct rtc_time untouched; therefore, these fields contain garbage if not properly initialized, leading to inconsistent values when converting into time64_t. This patch to set all fields of a struct rtc_time to -1 before calling cmos_read_alarm(). Signed-off-by: Victor Ding --- drivers/rtc/rtc-cmos.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index bcc96ab7793f..c99af567780d 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -1006,6 +1006,7 @@ static int cmos_suspend(struct device *dev) enable_irq_wake(cmos->irq); } + memset(>saved_wkalrm.time, -1, sizeof(struct rtc_time)); cmos_read_alarm(dev, >saved_wkalrm); dev_dbg(dev, "suspend%s, ctrl %02x\n", @@ -1054,6 +1055,7 @@ static void cmos_check_wkalrm(struct device *dev) return; } + memset(_alarm.time, -1, sizeof(struct rtc_time)); cmos_read_alarm(dev, _alarm); t_current_expires = rtc_tm_to_time64(_alarm.time); t_saved_expires = rtc_tm_to_time64(>saved_wkalrm.time); -- 2.28.0.236.gb10cc79966-goog
Re: [RFC 2/7] KVM: VMX: Expose IA32_PKRS MSR
On 8/13/2020 5:21 AM, Jim Mattson wrote: On Fri, Aug 7, 2020 at 1:46 AM Chenyi Qiang wrote: Protection Keys for Supervisor Pages (PKS) uses IA32_PKRS MSR (PKRS) at index 0x6E1 to allow software to manage supervisor protection key rights. For performance consideration, PKRS intercept will be disabled so that the guest can access the PKRS without VM exits. PKS introduces dedicated control fields in VMCS to switch PKRS, which only does the retore part. In addition, every VM exit saves PKRS into the guest-state area in VMCS, while VM enter won't save the host value due to the expectation that the host won't change the MSR often. Update the host's value in VMCS manually if the MSR has been changed by the kernel since the last time the VMCS was run. The function get_current_pkrs() in arch/x86/mm/pkeys.c exports the per-cpu variable pkrs_cache to avoid frequent rdmsr of PKRS. Signed-off-by: Chenyi Qiang --- diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 11e4df560018..df2c2e733549 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -289,6 +289,7 @@ static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx, dest->ds_sel = src->ds_sel; dest->es_sel = src->es_sel; #endif + dest->pkrs = src->pkrs; Why isn't this (and other PKRS code) inside the #ifdef CONFIG_X86_64? PKRS isn't usable outside of long mode, is it? Yes, I'm also thinking about whether to put all pks code into CONFIG_X86_64. The kernel implementation also wrap its pks code inside CONFIG_ARCH_HAS_SUPERVISOR_PKEYS which has dependency with CONFIG_X86_64. However, maybe this can help when host kernel disable PKS but the guest enable it. What do you think about this? } static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 7a3675fddec2..39ec3d0c844b 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -40,6 +40,7 @@ struct vmcs_host_state { #ifdef CONFIG_X86_64 u16 ds_sel, es_sel; #endif + u32 pkrs; One thing that does seem odd throughout is that PKRS is a 64-bit resource, but we store and pass around only 32-bits. Yes, the top 32 bits are currently reserved, but the same can be said of, say, cr4, a few lines above this. Yet, we store and pass around cr4 as 64-bits. How do we decide? IMO, If the high part of PKRS is zero-reserved, it's OK to use u32. I define it as u32 just to follow the definition pkrs_cache in kernel code. }; struct vmcs_controls_shadow { @@ -1163,6 +1164,20 @@ void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) */ host_state->ldt_sel = kvm_read_ldt(); + /* +* Update the host pkrs vmcs field before vcpu runs. +* The setting of VM_EXIT_LOAD_IA32_PKRS can ensure +* kvm_cpu_cap_has(X86_FEATURE_PKS) && +* guest_cpuid_has(vcpu, X86_FEATURE_VMX). +*/ + if (vm_exit_controls_get(vmx) & VM_EXIT_LOAD_IA32_PKRS) { + host_pkrs = get_current_pkrs(); + if (unlikely(host_pkrs != host_state->pkrs)) { + vmcs_write64(HOST_IA32_PKRS, host_pkrs); + host_state->pkrs = host_pkrs; + } + } + #ifdef CONFIG_X86_64 savesegment(ds, host_state->ds_sel); savesegment(es, host_state->es_sel); @@ -1951,6 +1966,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) else msr_info->data = vmx->pt_desc.guest.addr_a[index / 2]; break; + case MSR_IA32_PKRS: + if (!kvm_cpu_cap_has(X86_FEATURE_PKS) || + (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_PKS))) Could this be simplified if we required kvm_cpu_cap_has(X86_FEATURE_PKS) as a prerequisite for guest_cpuid_has(vcpu, X86_FEATURE_PKS)? If not, what is the expected behavior if guest_cpuid_has(vcpu, X86_FEATURE_PKS) is true and kvm_cpu_cap_has(X86_FEATURE_PKS) is false? Yes, kvm_cpu_cap_has() is a prerequisite for guest_cpuid_has() as we have done the check in vmx_cpuid_update(). Here I add the kvm_cpu_cap_has() check to ensure the vmcs_read(GUEST_IA32_PKRS) can execute correctly in case of the userspace access. + return 1; + msr_info->data = vmcs_read64(GUEST_IA32_PKRS); + break; case MSR_TSC_AUX: if (!msr_info->host_initiated && !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP)) @@ -7230,6 +7267,26 @@ static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4)); } +static void vmx_update_pkrs_cfg(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap; + bool pks_supported =
Re: [PATCH v2] riscv: Setup exception vector for nommu platform
On Wed, Aug 12, 2020 at 8:40 PM Qiu Wenbo wrote: > > Exception vector is missing on nommu platform and that is an issue. > This patch is tested in Sipeed Maix Bit Dev Board. > > Fixes: 79b1feba5455 ("RISC-V: Setup exception vector early") > Suggested-by: Anup Patel > Suggested-by: Atish Patra > Signed-off-by: Qiu Wenbo > --- > arch/riscv/kernel/head.S | 25 + > 1 file changed, 17 insertions(+), 8 deletions(-) > > diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S > index d0c5c316e9bb..0a4e81b8dc79 100644 > --- a/arch/riscv/kernel/head.S > +++ b/arch/riscv/kernel/head.S > @@ -77,16 +77,10 @@ relocate: > csrw CSR_SATP, a0 > .align 2 > 1: > - /* Set trap vector to exception handler */ > - la a0, handle_exception > + /* Set trap vector to spin forever to help debug */ > + la a0, .Lsecondary_park > csrw CSR_TVEC, a0 > > - /* > -* Set sup0 scratch register to 0, indicating to exception vector that > -* we are presently executing in kernel. > -*/ > - csrw CSR_SCRATCH, zero > - > /* Reload the global pointer */ > .option push > .option norelax > @@ -144,9 +138,23 @@ secondary_start_common: > la a0, swapper_pg_dir > call relocate > #endif > + call setup_trap_vector > tail smp_callin > #endif /* CONFIG_SMP */ > > +.align 2 > +setup_trap_vector: > + /* Set trap vector to exception handler */ > + la a0, handle_exception > + csrw CSR_TVEC, a0 > + > + /* > +* Set sup0 scratch register to 0, indicating to exception vector that > +* we are presently executing in kernel. > +*/ > + csrw CSR_SCRATCH, zero > + ret > + > .Lsecondary_park: > /* We lack SMP support or have too many harts, so park this hart */ > wfi > @@ -240,6 +248,7 @@ clear_bss_done: > call relocate > #endif /* CONFIG_MMU */ > > + call setup_trap_vector > /* Restore C environment */ > la tp, init_task > sw zero, TASK_TI_CPU(tp) > -- > 2.28.0 > @palmer: Can you queue this for the next part2 PR ? Reviewed-by: Atish Patra -- Regards, Atish
Re: [PATCH 3/3] usb: Add Kconfig and Makefile changes to build brcmstb-usb-pinmap
On Wed, Aug 12, 2020 at 04:20:18PM -0400, Al Cooper wrote: > From: Al Cooper > > Add Kconfig and Makefile changes to build brcmstb-usb-pinmap and > update MAINTAINERS for the new driver. This can be part of the previous patch, or at least the Kconfig and Makefile changes should be there so that we build the code when we add it. > refs #SWLINUX-5537 What is this? > > Signed-off-by: Al Cooper > --- > MAINTAINERS | 8 > drivers/usb/host/Kconfig | 4 > drivers/usb/host/Makefile | 1 + > 3 files changed, 13 insertions(+) > > diff --git a/MAINTAINERS b/MAINTAINERS > index f0569cf304ca..3a44ac61899b 100644 > --- a/MAINTAINERS > +++ b/MAINTAINERS > @@ -3527,6 +3527,14 @@ S: Maintained > F: Documentation/devicetree/bindings/usb/brcm,bcm7445-ehci.yaml > F: drivers/usb/host/ehci-brcm.* > > +BROADCOM BRCMSTB USB PIN MAP DRIVER > +M: Al Cooper > +L: linux-...@vger.kernel.org > +L: bcm-kernel-feedback-l...@broadcom.com > +S: Maintained > +F: Documentation/devicetree/bindings/usb/brcm,usb-pinmap.yaml > +F: drivers/usb/host/brcmstb-usb-pinmap.c > + > BROADCOM BRCMSTB USB2 and USB3 PHY DRIVER > M: Al Cooper > L: linux-kernel@vger.kernel.org > diff --git a/drivers/usb/host/Kconfig b/drivers/usb/host/Kconfig > index 1cb3004ea7b2..9c285053bb0c 100644 > --- a/drivers/usb/host/Kconfig > +++ b/drivers/usb/host/Kconfig > @@ -109,12 +109,16 @@ endif # USB_XHCI_HCD > config USB_EHCI_BRCMSTB > tristate > > +config BRCM_USB_PINMAP > + tristate > + > config USB_BRCMSTB > tristate "Broadcom STB USB support" > depends on (ARCH_BRCMSTB && PHY_BRCM_USB) || COMPILE_TEST > select USB_OHCI_HCD_PLATFORM if USB_OHCI_HCD > select USB_EHCI_BRCMSTB if USB_EHCI_HCD > select USB_XHCI_PLATFORM if USB_XHCI_HCD > + select BRCM_USB_PINMAP > help > Enables support for XHCI, EHCI and OHCI host controllers > found in Broadcom STB SoC's. > diff --git a/drivers/usb/host/Makefile b/drivers/usb/host/Makefile > index bc731332fed9..0e63ef94790d 100644 > --- a/drivers/usb/host/Makefile > +++ b/drivers/usb/host/Makefile > @@ -90,3 +90,4 @@ obj-$(CONFIG_USB_HCD_BCMA) += bcma-hcd.o > obj-$(CONFIG_USB_HCD_SSB)+= ssb-hcd.o > obj-$(CONFIG_USB_FOTG210_HCD)+= fotg210-hcd.o > obj-$(CONFIG_USB_MAX3421_HCD)+= max3421-hcd.o > +obj-$(CONFIG_BRCM_USB_PINMAP)+= brcmstb-usb-pinmap.o Shouldn't this driver be in usb/misc/ with other drivers like this? Why host? Wait, why is this a separate driver at all? Why not just build it into the USB_BRCMSTB driver? thanks, greg k-h
Re: [PATCH 2/2] iommu/iova: Free global iova rcache on iova alloc failure
On 8/12/2020 8:46 PM, Joerg Roedel wrote: > On Mon, Aug 03, 2020 at 03:30:48PM +0530, Vijayanand Jitta wrote: >> ping? > > Please repost when v5.9-rc1 is released and add > > Robin Murphy > > on your Cc list. > > Thanks, > > Joerg > Sure, will do. Thanks, Vijay -- QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, hosted by The Linux Foundation
Re: [PATCH net] bonding: show saner speed for broadcast mode
Jarod Wilson wrote: >Broadcast mode bonds transmit a copy of all traffic simultaneously out of >all interfaces, so the "speed" of the bond isn't really the aggregate of >all interfaces, but rather, the speed of the lowest active interface. Did you mean "slowest" here? >Also, the type of the speed field is u32, not unsigned long, so adjust >that accordingly, as required to make min() function here without >complaining about mismatching types. > >Fixes: bb5b052f751b ("bond: add support to read speed and duplex via ethtool") >CC: Jay Vosburgh >CC: Veaceslav Falico >CC: Andy Gospodarek >CC: "David S. Miller" >CC: net...@vger.kernel.org >Signed-off-by: Jarod Wilson Did you notice this by inspection, or did it come up in use somewhere? I can't recall ever hearing of anyone using broadcast mode, so I'm curious if there is a use for it, but this change seems reasonable enough regardless. -J Acked-by: Jay Vosburgh >--- > drivers/net/bonding/bond_main.c | 21 ++--- > 1 file changed, 18 insertions(+), 3 deletions(-) > >diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c >index 5ad43aaf76e5..c853ca67058c 100644 >--- a/drivers/net/bonding/bond_main.c >+++ b/drivers/net/bonding/bond_main.c >@@ -4552,13 +4552,23 @@ static netdev_tx_t bond_start_xmit(struct sk_buff >*skb, struct net_device *dev) > return ret; > } > >+static u32 bond_mode_bcast_speed(struct slave *slave, u32 speed) >+{ >+ if (speed == 0 || speed == SPEED_UNKNOWN) >+ speed = slave->speed; >+ else >+ speed = min(speed, slave->speed); >+ >+ return speed; >+} >+ > static int bond_ethtool_get_link_ksettings(struct net_device *bond_dev, > struct ethtool_link_ksettings *cmd) > { > struct bonding *bond = netdev_priv(bond_dev); >- unsigned long speed = 0; > struct list_head *iter; > struct slave *slave; >+ u32 speed = 0; > > cmd->base.duplex = DUPLEX_UNKNOWN; > cmd->base.port = PORT_OTHER; >@@ -4570,8 +4580,13 @@ static int bond_ethtool_get_link_ksettings(struct >net_device *bond_dev, >*/ > bond_for_each_slave(bond, slave, iter) { > if (bond_slave_can_tx(slave)) { >- if (slave->speed != SPEED_UNKNOWN) >- speed += slave->speed; >+ if (slave->speed != SPEED_UNKNOWN) { >+ if (BOND_MODE(bond) == BOND_MODE_BROADCAST) >+ speed = bond_mode_bcast_speed(slave, >+speed); >+ else >+ speed += slave->speed; >+ } > if (cmd->base.duplex == DUPLEX_UNKNOWN && > slave->duplex != DUPLEX_UNKNOWN) > cmd->base.duplex = slave->duplex; >-- >2.20.1 >
RE: [PATCH RFC v2 00/18] Add VFIO mediated device support and DEV-MSI support for the idxd driver
> From: Jason Wang > Sent: Thursday, August 13, 2020 12:34 PM > > > On 2020/8/12 下午12:05, Tian, Kevin wrote: > >> The problem is that if we tie all controls via VFIO uAPI, the other > >> subsystem like vDPA is likely to duplicate them. I wonder if there is a > >> way to decouple the vSVA out of VFIO uAPI? > > vSVA is a per-device (either pdev or mdev) feature thus naturally should > > be managed by its device driver (VFIO or vDPA). From this angle some > > duplication is inevitable given VFIO and vDPA are orthogonal passthrough > > frameworks. Within the kernel the majority of vSVA handling is done by > > IOMMU and IOASID modules thus most logic are shared. > > > So why not introduce vSVA uAPI at IOMMU or IOASID layer? One may ask a similar question why IOMMU doesn't expose map/unmap as uAPI... > > > > > >>>If an userspace DMA interface can be easily > >>> adapted to be a passthrough one, it might be the choice. > >> It's not that easy even for VFIO which requires a lot of new uAPIs and > >> infrastructures(e.g mdev) to be invented. > >> > >> > >>> But for idxd, > >>> we see mdev a much better fit here, given the big difference between > >>> what userspace DMA requires and what guest driver requires in this hw. > >> A weak point for mdev is that it can't serve kernel subsystem other than > >> VFIO. In this case, you need some other infrastructures (like [1]) to do > >> this. > > mdev is not exclusive from kernel usages. It's perfectly fine for a driver > > to reserve some work queues for host usages, while wrapping others > > into mdevs. > > > I meant you may want slices to be an independent device from the kernel > point of view: > > E.g for ethernet devices, you may want 10K mdevs to be passed to guest. > > Similarly, you may want 10K net devices which is connected to the kernel > networking subsystems. > > In this case it's not simply reserving queues but you need some other > type of device abstraction. There could be some kind of duplication > between this and mdev. > yes, some abstraction required but isn't it what the driver should care about instead of mdev framework itself? If the driver reports the same set of resource to both mdev and networking, it needs to make sure when the resource is claimed in one interface then it should be marked in-use in another. e.g. each mdev includes a available_intances attribute. the driver could report 10k available instances initially and then update it to 5K when another 5K is used for net devices later. Mdev definitely has its usage limitations. Some may be improved in the future, some may not. But those are distracting from the original purpose of this thread (mdev vs. userspace DMA) and better be discussed in other places e.g. LPC... Thanks Kevin
Re: [PATCH v3 2/2] power: supply: sbs-battery: don't assume i2c errors as battery disconnect
On Thu, Aug 13, 2020 at 1:11 PM Ikjoon Jang wrote: > > Current sbs-battery considers all smbus errors as disconnection events > when battery-detect pin isn't supplied, and restored to present state back > when any successful transaction is made. > > This can lead to unlimited state changes between present and !present > when one unsupported command was requested and other following commands > were successful, e.g. udev rules tries to read multiple properties. > > This patch checks battery presence by reading known good command to > check battery existence. > > Signed-off-by: Ikjoon Jang Looks good now, AFAICT. Thanks! Reviewed-by: Nicolas Boichat > --- > drivers/power/supply/sbs-battery.c | 25 + > 1 file changed, 17 insertions(+), 8 deletions(-) > > diff --git a/drivers/power/supply/sbs-battery.c > b/drivers/power/supply/sbs-battery.c > index 6acb4ea25d2a..2e32fde04628 100644 > --- a/drivers/power/supply/sbs-battery.c > +++ b/drivers/power/supply/sbs-battery.c > @@ -878,10 +878,17 @@ static int sbs_get_property(struct power_supply *psy, > if (!chip->enable_detection) > goto done; > > - if (!chip->gpio_detect && > - chip->is_present != (ret >= 0)) { > - sbs_update_presence(chip, (ret >= 0)); > - power_supply_changed(chip->power_supply); > + if (!chip->gpio_detect && chip->is_present != (ret >= 0)) { > + bool old_present = chip->is_present; > + union power_supply_propval val; > + > + ret = sbs_get_battery_presence_and_health( > + client, POWER_SUPPLY_PROP_PRESENT, ); > + > + sbs_update_presence(chip, !ret && val.intval); > + > + if (old_present != chip->is_present) > + power_supply_changed(chip->power_supply); > } > > done: > @@ -1067,11 +1074,13 @@ static int sbs_probe(struct i2c_client *client) > * to the battery. > */ > if (!(force_load || chip->gpio_detect)) { > - rc = sbs_read_word_data(client, sbs_data[REG_STATUS].addr); > + union power_supply_propval val; > > - if (rc < 0) { > - dev_err(>dev, "%s: Failed to get device > status\n", > - __func__); > + rc = sbs_get_battery_presence_and_health( > + client, POWER_SUPPLY_PROP_PRESENT, ); > + if (rc < 0 || !val.intval) { > + dev_err(>dev, "Failed to get present > status\n"); > + rc = -ENODEV; > goto exit_psupply; > } > } > -- > 2.28.0.236.gb10cc79966-goog >
Re: WARNING: locking bug in l2cap_chan_del
syzbot has found a reproducer for the following issue on: HEAD commit:06a7a37b ipv4: tunnel: fix compilation on ARCH=um git tree: net console output: https://syzkaller.appspot.com/x/log.txt?x=115caa1690 kernel config: https://syzkaller.appspot.com/x/.config?x=7bb894f55faf8242 dashboard link: https://syzkaller.appspot.com/bug?extid=01d7fc00b2a0419d01cc compiler: gcc (GCC) 10.1.0-syz 20200507 syz repro: https://syzkaller.appspot.com/x/repro.syz?x=13fb564a90 IMPORTANT: if you fix the issue, please add the following tag to the commit: Reported-by: syzbot+01d7fc00b2a0419d0...@syzkaller.appspotmail.com [ cut here ] DEBUG_LOCKS_WARN_ON(1) WARNING: CPU: 0 PID: 5 at kernel/locking/lockdep.c:183 hlock_class kernel/locking/lockdep.c:183 [inline] WARNING: CPU: 0 PID: 5 at kernel/locking/lockdep.c:183 hlock_class kernel/locking/lockdep.c:172 [inline] WARNING: CPU: 0 PID: 5 at kernel/locking/lockdep.c:183 check_wait_context kernel/locking/lockdep.c:4100 [inline] WARNING: CPU: 0 PID: 5 at kernel/locking/lockdep.c:183 __lock_acquire+0x1674/0x5640 kernel/locking/lockdep.c:4376 Kernel panic - not syncing: panic_on_warn set ... CPU: 0 PID: 5 Comm: kworker/0:0 Not tainted 5.8.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events l2cap_chan_timeout Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x18f/0x20d lib/dump_stack.c:118 panic+0x2e3/0x75c kernel/panic.c:231 __warn.cold+0x20/0x45 kernel/panic.c:600 report_bug+0x1bd/0x210 lib/bug.c:198 handle_bug+0x38/0x90 arch/x86/kernel/traps.c:235 exc_invalid_op+0x14/0x40 arch/x86/kernel/traps.c:255 asm_exc_invalid_op+0x12/0x20 arch/x86/include/asm/idtentry.h:536 RIP: 0010:hlock_class kernel/locking/lockdep.c:183 [inline] RIP: 0010:hlock_class kernel/locking/lockdep.c:172 [inline] RIP: 0010:check_wait_context kernel/locking/lockdep.c:4100 [inline] RIP: 0010:__lock_acquire+0x1674/0x5640 kernel/locking/lockdep.c:4376 Code: d2 0f 85 f1 36 00 00 44 8b 15 f0 8e 57 09 45 85 d2 0f 85 1c fa ff ff 48 c7 c6 80 af 4b 88 48 c7 c7 80 aa 4b 88 e8 ce 36 eb ff <0f> 0b e9 02 fa ff ff c7 44 24 38 fe ff ff ff 41 bf 01 00 00 00 c7 RSP: 0018:c9cbf8e0 EFLAGS: 00010086 RAX: RBX: 0004 RCX: RDX: 8880a95a2140 RSI: 815d8eb7 RDI: f52000197f0e RBP: 8880a95a2ab0 R08: R09: 89bcb3c3 R10: 07d2 R11: 0001 R12: R13: 19a1 R14: 8880a95a2140 R15: 0004 lock_acquire+0x1f1/0xad0 kernel/locking/lockdep.c:5005 __raw_spin_lock_bh include/linux/spinlock_api_smp.h:135 [inline] _raw_spin_lock_bh+0x2f/0x40 kernel/locking/spinlock.c:175 spin_lock_bh include/linux/spinlock.h:359 [inline] lock_sock_nested+0x3b/0x110 net/core/sock.c:3040 l2cap_sock_teardown_cb+0x88/0x400 net/bluetooth/l2cap_sock.c:1520 l2cap_chan_del+0xad/0x1300 net/bluetooth/l2cap_core.c:618 l2cap_chan_close+0x118/0xb10 net/bluetooth/l2cap_core.c:823 l2cap_chan_timeout+0x173/0x450 net/bluetooth/l2cap_core.c:436 process_one_work+0x94c/0x1670 kernel/workqueue.c:2269 worker_thread+0x64c/0x1120 kernel/workqueue.c:2415 kthread+0x3b5/0x4a0 kernel/kthread.c:292 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294 Kernel Offset: disabled Rebooting in 86400 seconds..
Re: POC: Alternative solution: Re: [PATCH 0/4] printk: reimplement LOG_CONT handling
On (20/08/13 02:30), John Ogness wrote: > 2. I haven't yet figured out how to preserve calling context when a > newline appears. For example: > > pr_info("text"); > pr_cont(" 1"); > pr_cont(" 2\n"); > pr_cont("3"); > pr_cont(" 4\n"); > > For "3" the calling context (info, timestamp) is lost because with "2" > the record is finalized. Perhaps the above is invalid usage of LOG_CONT? This is not an unseen pattern, I'm afraid. And the problem here can be more general: pr_info("text"); pr_cont("1"); exception/IRQ/NMI pr_alert("text\n"); pr_cont("2"); pr_cont("\n"); I guess the solution would be to store "last log_level" in task_struct and get current (new) timestamp for broken cont line? We have this problem now. E.g. early boot messages from one of my boxes: 6,173,41837,-;x86: Booting SMP configuration: 6,174,41838,-; node #0, CPUs: #1 #2 #3 #4 4,175,44993,-;MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/... 4,176,44993,c; #5 #6 #7 "CPUs: #1 #2 #3 #4 #5 #6 #7" is supposed to be one cont line with loglevel 6. But it gets interrupted and flushed so that "#5 #6 #7" appears with the different loglevel. -ss
Re: [PATCH 1/3] mm: don't call activate_page() on new ksm pages
On Tue, Aug 11, 2020 at 9:04 PM Yu Zhao wrote: > > lru_cache_add_active_or_unevictable() already adds new ksm pages to > active lru. Calling activate_page() isn't really necessary in this > case. > > Signed-off-by: Yu Zhao > --- > mm/swapfile.c | 10 +- > 1 file changed, 5 insertions(+), 5 deletions(-) > > diff --git a/mm/swapfile.c b/mm/swapfile.c > index 6c26916e95fd..cf115ea26a20 100644 > --- a/mm/swapfile.c > +++ b/mm/swapfile.c > @@ -1913,16 +1913,16 @@ static int unuse_pte(struct vm_area_struct *vma, > pmd_t *pmd, >pte_mkold(mk_pte(page, vma->vm_page_prot))); > if (page == swapcache) { > page_add_anon_rmap(page, vma, addr, false); > + /* > +* Move the page to the active list so it is not > +* immediately swapped out again after swapon. > +*/ > + activate_page(page); Actually I think we could just remove this activate_page() call with Joonsoo's anonymous page workingset series merged. The active bit will be taken care by workingset_refault(). > } else { /* ksm created a completely new copy */ > page_add_new_anon_rmap(page, vma, addr, false); > lru_cache_add_active_or_unevictable(page, vma); And it looks the latest linus's tree already changed this to lru_cache_add_inactive_or_unevictable() by commit b518154e59 ("mm/vmscan: protect the workingset on anonymous LRU") Added Joonsoo in this loop. > } > swap_free(entry); > - /* > -* Move the page to the active list so it is not > -* immediately swapped out again after swapon. > -*/ > - activate_page(page); > out: > pte_unmap_unlock(pte, ptl); > if (page != swapcache) { > -- > 2.28.0.236.gb10cc79966-goog > >
drivers/char/agp/i460-agp.c:254:19: sparse: sparse: incorrect type in assignment (different address spaces)
tree: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master head: dc06fe51d26efc100ac74121607c01a454867c91 commit: 670d0a4b10704667765f7d18f7592993d02783aa sparse: use identifiers to define address spaces date: 8 weeks ago config: ia64-randconfig-s031-20200813 (attached as .config) compiler: ia64-linux-gcc (GCC) 9.3.0 reproduce: wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # apt-get install sparse # sparse version: v0.6.2-168-g9554805c-dirty git checkout 670d0a4b10704667765f7d18f7592993d02783aa # save the attached .config to linux build tree COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=ia64 If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot sparse warnings: (new ones prefixed by >>) >> drivers/char/agp/i460-agp.c:254:19: sparse: sparse: incorrect type in >> assignment (different address spaces) @@ expected void *static >> [assigned] [toplevel] gatt @@ got void [noderef] __iomem * @@ drivers/char/agp/i460-agp.c:254:19: sparse: expected void *static [assigned] [toplevel] gatt >> drivers/char/agp/i460-agp.c:254:19: sparse: got void [noderef] __iomem * >> drivers/char/agp/i460-agp.c:266:17: sparse: sparse: incorrect type in >> argument 2 (different address spaces) @@ expected void volatile >> [noderef] __iomem *addr @@ got unsigned int [usertype] * @@ >> drivers/char/agp/i460-agp.c:266:17: sparse: expected void volatile >> [noderef] __iomem *addr drivers/char/agp/i460-agp.c:266:17: sparse: got unsigned int [usertype] * >> drivers/char/agp/i460-agp.c:267:9: sparse: sparse: incorrect type in >> argument 1 (different address spaces) @@ expected void const volatile >> [noderef] __iomem *addr @@ got unsigned int [usertype] * @@ >> drivers/char/agp/i460-agp.c:267:9: sparse: expected void const volatile >> [noderef] __iomem *addr drivers/char/agp/i460-agp.c:267:9: sparse: got unsigned int [usertype] * drivers/char/agp/i460-agp.c:281:17: sparse: sparse: incorrect type in argument 2 (different address spaces) @@ expected void volatile [noderef] __iomem *addr @@ got unsigned int [usertype] * @@ drivers/char/agp/i460-agp.c:281:17: sparse: expected void volatile [noderef] __iomem *addr drivers/char/agp/i460-agp.c:281:17: sparse: got unsigned int [usertype] * drivers/char/agp/i460-agp.c:282:9: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected void const volatile [noderef] __iomem *addr @@ got unsigned int [usertype] * @@ drivers/char/agp/i460-agp.c:282:9: sparse: expected void const volatile [noderef] __iomem *addr drivers/char/agp/i460-agp.c:282:9: sparse: got unsigned int [usertype] * >> drivers/char/agp/i460-agp.c:284:21: sparse: sparse: incorrect type in >> argument 1 (different address spaces) @@ expected void volatile >> [noderef] __iomem *addr @@ got void *static [assigned] [toplevel] gatt @@ drivers/char/agp/i460-agp.c:284:21: sparse: expected void volatile [noderef] __iomem *addr drivers/char/agp/i460-agp.c:284:21: sparse: got void *static [assigned] [toplevel] gatt drivers/char/agp/i460-agp.c:318:22: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected void const volatile [noderef] __iomem *addr @@ got unsigned int [usertype] * @@ drivers/char/agp/i460-agp.c:318:22: sparse: expected void const volatile [noderef] __iomem *addr drivers/char/agp/i460-agp.c:318:22: sparse: got unsigned int [usertype] * drivers/char/agp/i460-agp.c:318:22: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected void const volatile [noderef] __iomem *addr @@ got unsigned int [usertype] * @@ drivers/char/agp/i460-agp.c:318:22: sparse: expected void const volatile [noderef] __iomem *addr drivers/char/agp/i460-agp.c:318:22: sparse: got unsigned int [usertype] * drivers/char/agp/i460-agp.c:319:25: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected void const volatile [noderef] __iomem *addr @@ got unsigned int [usertype] * @@ drivers/char/agp/i460-agp.c:319:25: sparse: expected void const volatile [noderef] __iomem *addr drivers/char/agp/i460-agp.c:319:25: sparse: got unsigned int [usertype] * drivers/char/agp/i460-agp.c:330:25: sparse: sparse: incorrect type in argument 2 (different address spaces) @@ expected void volatile [noderef] __iomem *addr @@ got unsigned int [usertype] * @@ drivers/char/agp/i460-agp.c:330:25: sparse: expected void volatile [noderef] __iomem *addr drivers/char/agp/i460-agp.c:330:25: sparse: got unsigned int [usertype] *
Re: [PATCH] ftrace: Fixup lockdep assert held of text_mutex
On Thu, 06 Aug 2020 22:01:01 PDT (-0700), guo...@kernel.org wrote: On Fri, Aug 7, 2020 at 12:01 PM Steven Rostedt wrote: On Fri, 7 Aug 2020 10:59:16 +0800 Guo Ren wrote: > > > > This looks like a bug in the lockdep_assert_held() in whatever arch > > (riscv) is running. > Seems you think it's a bug of arch implementation with the wrong usage > of text_mutex? > > Also @riscv maintainer, > How about modifying it in riscv's code? we still need to solve it. > > > diff --git a/arch/riscv/include/asm/ftrace.h b/arch/riscv/include/asm/ftrace.h > index ace8a6e..fb266c3 100644 > --- a/arch/riscv/include/asm/ftrace.h > +++ b/arch/riscv/include/asm/ftrace.h > @@ -23,6 +23,12 @@ static inline unsigned long > ftrace_call_adjust(unsigned long addr) > > struct dyn_arch_ftrace { > }; > + > +#ifdef CONFIG_DYNAMIC_FTRACE > +struct dyn_ftrace; > +int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec); > +#define ftrace_init_nop ftrace_init_nop > +#endif > #endif > > #ifdef CONFIG_DYNAMIC_FTRACE > diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c > index 2ff63d0..9e9f7c0 100644 > --- a/arch/riscv/kernel/ftrace.c > +++ b/arch/riscv/kernel/ftrace.c > @@ -97,6 +97,17 @@ int ftrace_make_nop(struct module *mod, struct > dyn_ftrace *rec, > return __ftrace_modify_call(rec->ip, addr, false); > } > > +int ftrace_init_nop(struct module *mod, struct dyn_ftrace *rec) > +{ > + int ret; > + > + mutex_lock(_mutex); > + ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); Looking at x86, we have the following code: static int ftrace_poke_late = 0; int ftrace_arch_code_modify_prepare(void) __acquires(_mutex) { /* * Need to grab text_mutex to prevent a race from module loading * and live kernel patching from changing the text permissions while * ftrace has it set to "read/write". */ mutex_lock(_mutex); ftrace_poke_late = 1; return 0; } int ftrace_arch_code_modify_post_process(void) __releases(_mutex) { /* * ftrace_make_{call,nop}() may be called during * module load, and we need to finish the text_poke_queue() * that they do, here. */ text_poke_finish(); ftrace_poke_late = 0; mutex_unlock(_mutex); return 0; } And if ftrace_poke_late is not set, then ftrace_make_nop() does direct modification (calls text_poke_early(), which is basically a memcpy). This path doesn't have any checks against text_mutex being held, because it only happens at boot up. The solution is ok for me, but I want to get riscv maintainer's opinion before the next patch. @Paul Walmsley @Palmer Dabbelt Sorry, I'm not really sure what's going on here. I'm not really seeing code that matches this in our port right now, so maybe this is aginst some other tree? If it's the RISC-V kprobes patch set then I was hoping to take a look at that tomorrow (or I guess a bit earlier this week, but I had some surprise work stuff to do). IIRC there were a handful of races in the last patch set I saw, but it's been a while so I don't remember for sure. That said, I certainly wouldn't be surprised if there's a locking bug in our ftrace stuff. It'd be way easier for me to figure out what's going on if you have a concrete suggestion as to how to fix the issues -- even if it's just a workaround. > + mutex_unlock(_mutex); > + > + return ret; > +} > + > int ftrace_update_ftrace_func(ftrace_func_t func) > { > int ret = __ftrace_modify_call((unsigned long)_call, > --- > > > > --- a/kernel/trace/ftrace.c > > > +++ b/kernel/trace/ftrace.c > > > @@ -26,6 +26,7 @@ > > > #include > > > #include > > > #include > > > +#include > > > #include > > > #include > > > #include > > > @@ -6712,9 +6713,11 @@ void __init ftrace_init(void) > > > > ftrace_init() is called before SMP is initialized. Nothing else should > > be running here. That means grabbing a mutex is useless. > I don't agree, ftrace_init are modifying kernel text, so we should > give the lock of text_mutex to keep semantic consistency. Did you test your patch on x86 with lockdep? Ah.., no :P ftrace_process_locs() grabs the ftrace_lock, which I believe is held when text_mutex is taken in other locations. So this will probably not work anyway. text_mutex isn't to be taken at the ftrace level. Yes, currently it seemed only to be used by kernel/kprobes.c.
Re: [PATCH v2 15/20] fuse, dax: Take ->i_mmap_sem lock during dax page fault
On Wed, Aug 12, 2020 at 05:10:12PM -0400, Vivek Goyal wrote: > On Wed, Aug 12, 2020 at 11:23:45AM +1000, Dave Chinner wrote: > > On Tue, Aug 11, 2020 at 01:55:30PM -0400, Vivek Goyal wrote: > > > On Tue, Aug 11, 2020 at 08:22:38AM +1000, Dave Chinner wrote: > > > > On Fri, Aug 07, 2020 at 03:55:21PM -0400, Vivek Goyal wrote: > > > > > We need some kind of locking mechanism here. Normal file systems like > > > > > ext4 and xfs seems to take their own semaphore to protect agains > > > > > truncate while fault is going on. > > > > > > > > > > We have additional requirement to protect against fuse dax memory > > > > > range > > > > > reclaim. When a range has been selected for reclaim, we need to make > > > > > sure > > > > > no other read/write/fault can try to access that memory range while > > > > > reclaim is in progress. Once reclaim is complete, lock will be > > > > > released > > > > > and read/write/fault will trigger allocation of fresh dax range. > > > > > > > > > > Taking inode_lock() is not an option in fault path as lockdep > > > > > complains > > > > > about circular dependencies. So define a new fuse_inode->i_mmap_sem. > > > > > > > > That's precisely why filesystems like ext4 and XFS define their own > > > > rwsem. > > > > > > > > Note that this isn't a DAX requirement - the page fault > > > > serialisation is actually a requirement of hole punching... > > > > > > Hi Dave, > > > > > > I noticed that fuse code currently does not seem to have a rwsem which > > > can provide mutual exclusion between truncation/hole_punch path > > > and page fault path. I am wondering does that mean there are issues > > > with existing code or something else makes it unnecessary to provide > > > this mutual exlusion. > > > > I don't know enough about the fuse implementation to say. What I'm > > saying is that nothing in the core mm/ or VFS serilises page cache > > access to the data against direct filesystem manipulations of the > > underlying filesystem structures. > > Hi Dave, > > Got it. I was checking nfs and they also seem to be calling filemap_fault > and not taking any locks to block faults. fallocate() (nfs42_fallocate) > seems to block read/write/aio/dio but does not seem to do anything > about blocking faults. I am wondering if remote filesystem are > little different in this aspect. Especially fuse does not maintain > any filesystem block/extent data. It is file server which is doing > all that. I suspect they have all the same problems, and worse, the behaviour will largely be dependent on the server side behaviour that is out of the user's control. Essentially, nobody except us XFS folks seem to regard hole punching corrupting data or exposing stale data as being a problem that needs to be avoided or fixed. The only reason ext4 has the i_mmap_sem is because ext4 wanted to support DAX, and us XFS developers said "DAX absolutely requires that the filesystem can lock out physical access to the storage" and so they had no choice in the matter. Other than that, nobody really seems to understand or care about all these nasty little mmap() corner cases that we've seen corrupt user data or expose stale data to users over many years. > > i.e. nothing in the VFS or page fault IO path prevents this race > > condition: > > > > P0 P1 > > fallocate > > page cache invalidation > > page fault > > read data > > punch out data extents > > > > > backing store allocated> > > > > > > That's where the ext4 and XFS internal rwsem come into play: > > > > fallocate > > down_write(mmaplock) > > page cache invalidation > > page fault > > down_read(mmaplock) > > > > punch out data > > up_write(mmaplock) > > > > > > > > > > And there's not stale data exposure to userspace. > > Got it. I noticed that both fuse/nfs seem to have reversed the > order of operation. They call server to punch out data first > and then truncate page cache. And that should mean that even > if mmap reader will not see stale data after fallocate(punch_hole) > has finished. Yes, but that doesn't prevent page fault races from occuring, it just changes the nature of them.. Such as. > > There is nothing stopping, say, memory reclaim from reclaiming pages > > over the range while the hole is being punched, then having the > > application refault them while the backing store is being freed. > > While the page fault read IO is in progress, there's nothing > > stopping the filesystem from freeing those blocks, nor reallocating > > them and writing something else to them (e.g. metadata). So they > > could read someone elses data. > > > > Even worse: the page fault is a write fault, it lands in a
Re: KMSAN: uninit-value in can_receive (2)
syzbot has found a reproducer for the following issue on: HEAD commit:ce8056d1 wip: changed copy_from_user where instrumented git tree: https://github.com/google/kmsan.git master console output: https://syzkaller.appspot.com/x/log.txt?x=1041a3be90 kernel config: https://syzkaller.appspot.com/x/.config?x=3afe005fb99591f dashboard link: https://syzkaller.appspot.com/bug?extid=3f3837e61a48d32b495f compiler: clang version 10.0.0 (https://github.com/llvm/llvm-project/ c2443155a0fb245c8f17f2c1c72b6ea391e86e81) syz repro: https://syzkaller.appspot.com/x/repro.syz?x=160fbd0690 C reproducer: https://syzkaller.appspot.com/x/repro.c?x=12a14e6e90 IMPORTANT: if you fix the issue, please add the following tag to the commit: Reported-by: syzbot+3f3837e61a48d32b4...@syzkaller.appspotmail.com = BUG: KMSAN: uninit-value in can_receive+0x26b/0x630 net/can/af_can.c:650 CPU: 1 PID: 8475 Comm: syz-executor984 Not tainted 5.8.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x21c/0x280 lib/dump_stack.c:118 kmsan_report+0xf7/0x1e0 mm/kmsan/kmsan_report.c:121 __msan_warning+0x58/0xa0 mm/kmsan/kmsan_instr.c:215 can_receive+0x26b/0x630 net/can/af_can.c:650 can_rcv+0x1fb/0x410 net/can/af_can.c:686 __netif_receive_skb_one_core net/core/dev.c:5281 [inline] __netif_receive_skb+0x265/0x670 net/core/dev.c:5395 process_backlog+0x50d/0xba0 net/core/dev.c:6239 napi_poll+0x43b/0xfd0 net/core/dev.c:6684 net_rx_action+0x35c/0xd40 net/core/dev.c:6752 __do_softirq+0x2ea/0x7f5 kernel/softirq.c:293 asm_call_on_stack+0xf/0x20 arch/x86/entry/entry_64.S:711 __run_on_irqstack arch/x86/include/asm/irq_stack.h:23 [inline] run_on_irqstack_cond arch/x86/include/asm/irq_stack.h:50 [inline] do_softirq_own_stack+0x7c/0xa0 arch/x86/kernel/irq_64.c:77 invoke_softirq kernel/softirq.c:390 [inline] __irq_exit_rcu+0x226/0x270 kernel/softirq.c:420 irq_exit_rcu+0xe/0x10 kernel/softirq.c:432 sysvec_apic_timer_interrupt+0x107/0x130 arch/x86/kernel/apic/apic.c:1091 asm_sysvec_apic_timer_interrupt+0x12/0x20 arch/x86/include/asm/idtentry.h:593 RIP: 0010:_raw_spin_unlock_irqrestore+0x4b/0x70 kernel/locking/spinlock.c:192 Code: 00 8b b8 88 0c 00 00 48 8b 00 48 85 c0 75 28 48 89 df e8 b8 6e 4b f1 c6 00 00 c6 03 00 4d 85 e4 75 1c 4c 89 7d d8 ff 75 d8 9d <48> 83 c4 08 5b 41 5c 41 5e 41 5f 5d c3 e8 53 74 4b f1 eb d1 44 89 RSP: 0018:8880b8b0f880 EFLAGS: 0282 RAX: 88821fd3bc00 RBX: 88812fd1dc00 RCX: 00021fc9cc00 RDX: 88821fc9cc00 RSI: 04a0 RDI: 88812fd1dc00 RBP: 8880b8b0f8a8 R08: ea0f R09: 88812fffa000 R10: 0004 R11: R12: R13: 8880b656c2e8 R14: R15: 0282 unlock_hrtimer_base kernel/time/hrtimer.c:898 [inline] hrtimer_start_range_ns+0x459/0x4e0 kernel/time/hrtimer.c:1136 hrtimer_start include/linux/hrtimer.h:421 [inline] j1939_tp_schedule_txtimer+0x132/0x1b0 net/can/j1939/transport.c:671 j1939_sk_send_loop net/can/j1939/socket.c:1047 [inline] j1939_sk_sendmsg+0x1cc0/0x2950 net/can/j1939/socket.c:1160 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg net/socket.c:672 [inline] sys_sendmsg+0xc82/0x1240 net/socket.c:2352 ___sys_sendmsg net/socket.c:2406 [inline] __sys_sendmsg+0x6d1/0x840 net/socket.c:2439 __do_sys_sendmsg net/socket.c:2448 [inline] __se_sys_sendmsg+0x97/0xb0 net/socket.c:2446 __x64_sys_sendmsg+0x4a/0x70 net/socket.c:2446 do_syscall_64+0xad/0x160 arch/x86/entry/common.c:386 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x443559 Code: Bad RIP value. RSP: 002b:7dad8558 EFLAGS: 0246 ORIG_RAX: 002e RAX: ffda RBX: 0003 RCX: 00443559 RDX: RSI: 2180 RDI: 0003 RBP: 7dad8560 R08: 01bb R09: 01bb R10: 01bb R11: 0246 R12: 7dad8570 R13: R14: R15: Uninit was created at: kmsan_save_stack_with_flags mm/kmsan/kmsan.c:144 [inline] kmsan_internal_poison_shadow+0x66/0xd0 mm/kmsan/kmsan.c:127 kmsan_slab_alloc+0x8a/0xe0 mm/kmsan/kmsan_hooks.c:80 slab_alloc_node mm/slub.c:2839 [inline] __kmalloc_node_track_caller+0xeab/0x12e0 mm/slub.c:4478 __kmalloc_reserve net/core/skbuff.c:142 [inline] __alloc_skb+0x35f/0xb30 net/core/skbuff.c:210 alloc_skb include/linux/skbuff.h:1083 [inline] j1939_tp_tx_dat_new net/can/j1939/transport.c:568 [inline] j1939_xtp_do_tx_ctl net/can/j1939/transport.c:628 [inline] j1939_tp_tx_ctl net/can/j1939/transport.c:646 [inline] j1939_session_tx_rts net/can/j1939/transport.c:714 [inline] j1939_xtp_txnext_transmiter net/can/j1939/transport.c:832 [inline] j1939_tp_txtimer+0x402c/0x6980 net/can/j1939/transport.c:1095
[PATCH v3 1/2] power: supply: sbs-battery: combine get_presence_and_health
This patch enables calling sbs_get_battery_presence_and_health() without checking its chip type. No functional changes. Signed-off-by: Ikjoon Jang --- drivers/power/supply/sbs-battery.c | 73 +++--- 1 file changed, 36 insertions(+), 37 deletions(-) diff --git a/drivers/power/supply/sbs-battery.c b/drivers/power/supply/sbs-battery.c index 83b9924033bd..6acb4ea25d2a 100644 --- a/drivers/power/supply/sbs-battery.c +++ b/drivers/power/supply/sbs-battery.c @@ -389,37 +389,6 @@ static bool sbs_bat_needs_calibration(struct i2c_client *client) return !!(ret & BIT(7)); } -static int sbs_get_battery_presence_and_health( - struct i2c_client *client, enum power_supply_property psp, - union power_supply_propval *val) -{ - int ret; - - /* Dummy command; if it succeeds, battery is present. */ - ret = sbs_read_word_data(client, sbs_data[REG_STATUS].addr); - - if (ret < 0) { /* battery not present*/ - if (psp == POWER_SUPPLY_PROP_PRESENT) { - val->intval = 0; - return 0; - } - return ret; - } - - if (psp == POWER_SUPPLY_PROP_PRESENT) - val->intval = 1; /* battery present */ - else { /* POWER_SUPPLY_PROP_HEALTH */ - if (sbs_bat_needs_calibration(client)) { - val->intval = POWER_SUPPLY_HEALTH_CALIBRATION_REQUIRED; - } else { - /* SBS spec doesn't have a general health command. */ - val->intval = POWER_SUPPLY_HEALTH_UNKNOWN; - } - } - - return 0; -} - static int sbs_get_ti_battery_presence_and_health( struct i2c_client *client, enum power_supply_property psp, union power_supply_propval *val) @@ -478,6 +447,41 @@ static int sbs_get_ti_battery_presence_and_health( return 0; } +static int sbs_get_battery_presence_and_health( + struct i2c_client *client, enum power_supply_property psp, + union power_supply_propval *val) +{ + struct sbs_info *chip = i2c_get_clientdata(client); + int ret; + + if (chip->flags & SBS_FLAGS_TI_BQ20ZX5) + return sbs_get_ti_battery_presence_and_health(client, psp, val); + + /* Dummy command; if it succeeds, battery is present. */ + ret = sbs_read_word_data(client, sbs_data[REG_STATUS].addr); + + if (ret < 0) { /* battery not present*/ + if (psp == POWER_SUPPLY_PROP_PRESENT) { + val->intval = 0; + return 0; + } + return ret; + } + + if (psp == POWER_SUPPLY_PROP_PRESENT) + val->intval = 1; /* battery present */ + else { /* POWER_SUPPLY_PROP_HEALTH */ + if (sbs_bat_needs_calibration(client)) { + val->intval = POWER_SUPPLY_HEALTH_CALIBRATION_REQUIRED; + } else { + /* SBS spec doesn't have a general health command. */ + val->intval = POWER_SUPPLY_HEALTH_UNKNOWN; + } + } + + return 0; +} + static int sbs_get_battery_property(struct i2c_client *client, int reg_offset, enum power_supply_property psp, union power_supply_propval *val) @@ -780,12 +784,7 @@ static int sbs_get_property(struct power_supply *psy, switch (psp) { case POWER_SUPPLY_PROP_PRESENT: case POWER_SUPPLY_PROP_HEALTH: - if (chip->flags & SBS_FLAGS_TI_BQ20ZX5) - ret = sbs_get_ti_battery_presence_and_health(client, -psp, val); - else - ret = sbs_get_battery_presence_and_health(client, psp, - val); + ret = sbs_get_battery_presence_and_health(client, psp, val); /* this can only be true if no gpio is used */ if (psp == POWER_SUPPLY_PROP_PRESENT) -- 2.28.0.236.gb10cc79966-goog
[PATCH v3 2/2] power: supply: sbs-battery: don't assume i2c errors as battery disconnect
Current sbs-battery considers all smbus errors as disconnection events when battery-detect pin isn't supplied, and restored to present state back when any successful transaction is made. This can lead to unlimited state changes between present and !present when one unsupported command was requested and other following commands were successful, e.g. udev rules tries to read multiple properties. This patch checks battery presence by reading known good command to check battery existence. Signed-off-by: Ikjoon Jang --- drivers/power/supply/sbs-battery.c | 25 + 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/power/supply/sbs-battery.c b/drivers/power/supply/sbs-battery.c index 6acb4ea25d2a..2e32fde04628 100644 --- a/drivers/power/supply/sbs-battery.c +++ b/drivers/power/supply/sbs-battery.c @@ -878,10 +878,17 @@ static int sbs_get_property(struct power_supply *psy, if (!chip->enable_detection) goto done; - if (!chip->gpio_detect && - chip->is_present != (ret >= 0)) { - sbs_update_presence(chip, (ret >= 0)); - power_supply_changed(chip->power_supply); + if (!chip->gpio_detect && chip->is_present != (ret >= 0)) { + bool old_present = chip->is_present; + union power_supply_propval val; + + ret = sbs_get_battery_presence_and_health( + client, POWER_SUPPLY_PROP_PRESENT, ); + + sbs_update_presence(chip, !ret && val.intval); + + if (old_present != chip->is_present) + power_supply_changed(chip->power_supply); } done: @@ -1067,11 +1074,13 @@ static int sbs_probe(struct i2c_client *client) * to the battery. */ if (!(force_load || chip->gpio_detect)) { - rc = sbs_read_word_data(client, sbs_data[REG_STATUS].addr); + union power_supply_propval val; - if (rc < 0) { - dev_err(>dev, "%s: Failed to get device status\n", - __func__); + rc = sbs_get_battery_presence_and_health( + client, POWER_SUPPLY_PROP_PRESENT, ); + if (rc < 0 || !val.intval) { + dev_err(>dev, "Failed to get present status\n"); + rc = -ENODEV; goto exit_psupply; } } -- 2.28.0.236.gb10cc79966-goog
[PATCH v3 0/2] power: supply: sbs-battery: fix presence check
When gpio detection is not supplied, presence state transitions depend on every smbus transfer result from get_property(). This patch tries to check battery presence again with well supported command before state transition. Changes: v3: check return value of get_presence_and_health() v2: combine get_presence_and_health functions to reuse Ikjoon Jang (2): power: supply: sbs-battery: combine get_presence_and_health power: supply: sbs-battery: don't assume i2c errors as battery disconnect drivers/power/supply/sbs-battery.c | 98 -- 1 file changed, 53 insertions(+), 45 deletions(-) -- 2.28.0.236.gb10cc79966-goog
Re: [PATCH] perf parse-events: Set exclude_guest for user-space counting
Hi Arnaldo, On 8/12/2020 8:55 PM, Arnaldo Carvalho de Melo wrote: Em Wed, Aug 12, 2020 at 09:15:04AM -0300, Arnaldo Carvalho de Melo escreveu: Em Wed, Aug 12, 2020 at 02:59:53PM +0800, Jin Yao escreveu: Currently if we run 'perf record -e cycles:u', exclude_guest is 0. But it doesn't make sense that we request for user-space counting but we also get the guest report. To keep perf semantics consistent and clear, this patch sets exclude_guest for user-space counting. Applied, and also added this, that you should consider doing in the future (modulo the "Committer testing:" header :) ): Committer testing: Before: # perf record -e cycles:u ^C[ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 1.231 MB perf.data (91 samples) ] # # perf evlist -v cycles:u: size: 120, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ID|CPU|PERIOD, read_format: ID, disabled: 1, inherit: 1, exclude_kernel: 1, exclude_hv: 1, freq: 1, sample_id_all: 1 # After: # perf record -e cycles:u ^C[ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 1.263 MB perf.data (403 samples) ] # # perf evlist -v cycles:u: size: 120, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ID|CPU|PERIOD, read_format: ID, disabled: 1, inherit: 1, exclude_kernel: 1, exclude_hv: 1, freq: 1, sample_id_all: 1, exclude_guest: 1 # Also, please run 'perf test', as this will require changes to some expected perf_event_attr setups: [root@quaco ~]# perf test "event definition" 6: Parse event definition strings: FAILED! [root@quaco ~]# - Arnaldo Sorry for the perf test failure! I will post v2 to fix this issue. Thanks Jin Yao
Re: Recursive/circular locking in serial8250_console_write/serial8250_do_startup
On Wed, Aug 12, 2020 at 08:48:13AM -0700, Guenter Roeck wrote: > Hi, > > crbug.com/1114800 reports a hard lockup due to circular locking in the > 8250 console driver. This is seen if CONFIG_PROVE_LOCKING is enabled. > > Problem is as follows: > - serial8250_do_startup() locks the serial (console) port. > - serial8250_do_startup() then disables interrupts if interrupts are > shared, by calling disable_irq_nosync(). > - disable_irq_nosync() calls __irq_get_desc_lock() to lock the interrupt > descriptor. > - __irq_get_desc_lock() calls lock_acquire() > - If CONFIG_PROVE_LOCKING is enabled, validate_chain() and check_noncircular() > are called and identify a potential locking error. > - This locking error is reported via printk, which ultimately calls > serial8250_console_write(). > - serial8250_console_write() tries to lock the serial console port. > Since it is already locked, the system hangs and ultimately reports > a hard lockup. > > I understand we'll need to figure out and fix what lockdep complains about, > and I am working on that. However, even if that is fixed, we'll need a > solution for the recursive lock: Fixing the lockdep problem doesn't > guarantee that a similar problem (or some other log message) won't be > detected and reported sometime in the future while serial8250_do_startup() > holds the console port lock. > > Ideas, anyone ? Everything I came up with so far seems clumsy and hackish. > Turns out the situation is a bit worse than I thought. disable_irq_nosync(), when called from serial8250_do_startup(), locks the interrupt descriptor. The order of locking is serial port lock interrupt descriptor lock At the same time, __setup_irq() locks the interrupt descriptor as well. With the descriptor locked, it may report an error message using pr_err(). This in turn may call serial8250_console_write(), which will try to lock the console serial port. The lock sequence is interrupt descriptor lock serial port lock I added the lockdep splat to the bug log at crbug.com/1114800. Effectively, I think, this means we can't call disable_irq_nosync() while holding a serial port lock, or at least not while holding a serial port lock that is associated with a console. The problem was introduced (or, rather, exposed) with upstream commit 7febbcbc48fc ("serial: 8250: Check UPF_IRQ_SHARED in advance"). Guenter
Re: [PATCH v17 1/3] dt-bindings: Add bindings for Mediatek matrix keypad
On Wed, 2020-08-12 at 15:13 -0700, Dmitry Torokhov wrote: > Hi, > > On Tue, Aug 11, 2020 at 09:47:23AM +0800, Yingjoe Chen wrote: > > Hi, > > > > > > On Mon, 2020-08-10 at 14:40 +0800, Fengping Yu wrote: > > > From: "fengping.yu" > > > > > > This patch add devicetree bindings for Mediatek matrix keypad driver. > > > > > > Signed-off-by: fengping.yu > > > --- > > > .../devicetree/bindings/input/mtk-kpd.yaml| 87 +++ > > > 1 file changed, 87 insertions(+) > > > create mode 100644 Documentation/devicetree/bindings/input/mtk-kpd.yaml > > > > > > diff --git a/Documentation/devicetree/bindings/input/mtk-kpd.yaml > > > b/Documentation/devicetree/bindings/input/mtk-kpd.yaml > > > new file mode 100644 > > > index ..d74dd8a6fbde > > > > <...> > > > > > > > + keypad,num-columns: > > > +description: Number of column lines connected to the keypad > > > controller, > > > +it is not equal to PCB columns number, instead you should add > > > required value > > > +for each IC. If not specified, the default value is 1. > > > + > > > + keypad,num-rows: > > > +description: Number of row lines connected to the keypad controller, > > > it is > > > +not equal to PCB rows number, instead you should add required value > > > for each IC. > > > +If not specified, the default value is 1. > > > > Your source code can't really handle dts without rows/columns > > properties. Also, the default value doesn't make any sense. No IC will > > have rows or columns set to 1. > > > > Since these are IC specified, not board specified, I think you should > > just have the correct numbers in driver. > > It is actually property of board to decide how many keys it wants to > wire up. In extreme case it will be a single key, i.e. number of rows > and columns will indeed be 1. > > Thanks. > From the binding "it is not equal to PCB columns number, instead you should add required value for each IC." Driver code use this to calculate bit position in register, which is IC dependent. Joe.C
Re: [RFC 7/7] KVM: VMX: Enable PKS for nested VM
On 8/11/2020 8:05 AM, Jim Mattson wrote: On Fri, Aug 7, 2020 at 1:47 AM Chenyi Qiang wrote: PKS MSR passes through guest directly. Configure the MSR to match the L0/L1 settings so that nested VM runs PKS properly. Signed-off-by: Chenyi Qiang --- arch/x86/kvm/vmx/nested.c | 32 arch/x86/kvm/vmx/vmcs12.c | 2 ++ arch/x86/kvm/vmx/vmcs12.h | 6 +- arch/x86/kvm/vmx/vmx.c| 10 ++ arch/x86/kvm/vmx/vmx.h| 1 + 5 files changed, 50 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index df2c2e733549..1f9823d21ecd 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -647,6 +647,12 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W); + if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PKRS)) + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PKRS, + MSR_TYPE_R | MSR_TYPE_W); What if L1 intercepts only *reads* of MSR_IA32_PKRS? kvm_vcpu_unmap(vcpu, _vmx(vcpu)->nested.msr_bitmap_map, false); return true; @@ -2509,6 +2519,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (kvm_mpx_supported() && (!vmx->nested.nested_run_pending || !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))) vmcs_write64(GUEST_BNDCFGS, vmx->nested.vmcs01_guest_bndcfgs); + + if (kvm_cpu_cap_has(X86_FEATURE_PKS) && Is the above check superfluous? I would assume that the L1 guest can't set VM_ENTRY_LOAD_IA32_PKRS unless this is true. I enforce this check to ensure vmcs_write to the Guest_IA32_PKRS without error. if deleted, vmcs_write to GUEST_IA32_PKRS may executed when PKS is unsupported. + (!vmx->nested.nested_run_pending || +!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PKRS))) + vmcs_write64(GUEST_IA32_PKRS, vmx->nested.vmcs01_guest_pkrs); This doesn't seem right to me. On the target of a live migration, with L2 active at the time the snapshot was taken (i.e., vmx->nested.nested_run_pending=0), it looks like we're going to try to overwrite the current L2 PKRS value with L1's PKRS value (except that in this situation, vmx->nested.vmcs01_guest_pkrs should actually be 0). Am I missing something? We overwrite the L2 PKRS with L1's value when L2 doesn't support PKS. Because the L1's VM_ENTRY_LOAD_IA32_PKRS is off, we need to migrate L1's PKRS to L2. vmx_set_rflags(vcpu, vmcs12->guest_rflags); /* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the @@ -3916,6 +3943,8 @@ static void sync_vmcs02_to_vmcs12_rare(struct kvm_vcpu *vcpu, vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS); if (kvm_mpx_supported()) vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS); + if (kvm_cpu_cap_has(X86_FEATURE_PKS)) Shouldn't we be checking to see if the *virtual* CPU supports PKS before writing anything into vmcs12->guest_ia32_pkrs? Yes, It's reasonable. + vmcs12->guest_ia32_pkrs = vmcs_read64(GUEST_IA32_PKRS); vmx->nested.need_sync_vmcs02_to_vmcs12_rare = false; }
Re: [RFC PATCH 0/5] Introduce /proc/all/ to gather stats from all processes
On 8/12/20 1:51 AM, Andrei Vagin wrote: > > I rebased the task_diag patches on top of v5.8: > https://github.com/avagin/linux-task-diag/tree/v5.8-task-diag Thanks for updating the patches. > > /proc/pid files have three major limitations: > * Requires at least three syscalls per process per file > open(), read(), close() > * Variety of formats, mostly text based > The kernel spent time to encode binary data into a text format and > then tools like top and ps spent time to decode them back to a binary > format. > * Sometimes slow due to extra attributes > For example, /proc/PID/smaps contains a lot of useful informations > about memory mappings and memory consumption for each of them. But > even if we don't need memory consumption fields, the kernel will > spend time to collect this information. that's what I recall as well. > > More details and numbers are in this article: > https://avagin.github.io/how-fast-is-procfs > > This new interface doesn't have only one of these limitations, but > task_diag doesn't have all of them. > > And I compared how fast each of these interfaces: > > The test environment: > CPU: Intel(R) Core(TM) i5-6300U CPU @ 2.40GHz > RAM: 16GB > kernel: v5.8 with task_diag and /proc/all patches. > 100K processes: > $ ps ax | wc -l > 10228 100k processes but showing 10k here?? > > $ time cat /proc/all/status > /dev/null > > real 0m0.577s > user 0m0.017s > sys 0m0.559s > > task_proc_all is used to read /proc/pid/status for all tasks: > https://github.com/avagin/linux-task-diag/blob/master/tools/testing/selftests/task_diag/task_proc_all.c > > $ time ./task_proc_all status > tasks: 100230 > > real 0m0.924s > user 0m0.054s > sys 0m0.858s > > > /proc/all/status is about 40% faster than /proc/*/status. > > Now let's take a look at the perf output: > > $ time perf record -g cat /proc/all/status > /dev/null > $ perf report > - 98.08% 1.38% cat [kernel.vmlinux] [k] entry_SYSCALL_64 >- 96.70% entry_SYSCALL_64 > - do_syscall_64 > - 94.97% ksys_read > - 94.80% vfs_read >- 94.58% proc_reg_read > - seq_read > - 87.95% proc_pid_status > + 13.10% seq_put_decimal_ull_width > - 11.69% task_mem >+ 9.48% seq_put_decimal_ull_width > + 10.63% seq_printf > - 10.35% cpuset_task_status_allowed >+ seq_printf > - 9.84% render_sigset_t > 1.61% seq_putc >+ 1.61% seq_puts > + 4.99% proc_task_name > + 4.11% seq_puts > - 3.76% render_cap_t > 2.38% seq_put_hex_ll >+ 1.25% seq_puts > 2.64% __task_pid_nr_ns > + 1.54% get_task_mm > + 1.34% __lock_task_sighand > + 0.70% from_kuid_munged > 0.61% get_task_cred > 0.56% seq_putc > 0.52% hugetlb_report_usage > 0.52% from_kgid_munged > + 4.30% proc_all_next > + 0.82% _copy_to_user > > We can see that the kernel spent more than 50% of the time to encode binary > data into a text format. > > Now let's see how fast task_diag: > > $ time ./task_diag_all all -c -q > > real 0m0.087s > user 0m0.001s > sys 0m0.082s > > Maybe we need resurrect the task_diag series instead of inventing > another less-effective interface... I think the netlink message design is the better way to go. As system sizes continue to increase (> 100 cpus is common now) you need to be able to pass the right data to userspace as fast as possible to keep up with what can be a very dynamic userspace and set of processes. When you first proposed this idea I was working on systems with >= 1k cpus and the netlink option was able to keep up with a 'make -j N' on those systems. `perf record` walking /proc would never finish initializing - I had to add a "done initializing" message to know when to start a test. With the task_diag approach, perf could collect the data in short order and move on to recording data.
[PATCH] selftests: rtnetlink: load fou module for kci_test_encap_fou()
The kci_test_encap_fou() test from kci_test_encap() in rtnetlink.sh needs the fou module to work. Otherwise it will fail with: $ ip netns exec "$testns" ip fou add port ipproto 47 RTNETLINK answers: No such file or directory Error talking to the kernel Add the CONFIG_NET_FOU into the config file as well. Signed-off-by: Po-Hsu Lin --- tools/testing/selftests/net/config | 1 + tools/testing/selftests/net/rtnetlink.sh | 6 ++ 2 files changed, 7 insertions(+) diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config index 3b42c06b..96d2763 100644 --- a/tools/testing/selftests/net/config +++ b/tools/testing/selftests/net/config @@ -31,3 +31,4 @@ CONFIG_NET_SCH_ETF=m CONFIG_NET_SCH_NETEM=y CONFIG_TEST_BLACKHOLE_DEV=m CONFIG_KALLSYMS=y +CONFIG_NET_FOU diff --git a/tools/testing/selftests/net/rtnetlink.sh b/tools/testing/selftests/net/rtnetlink.sh index bdbf4b3..7931b65 100755 --- a/tools/testing/selftests/net/rtnetlink.sh +++ b/tools/testing/selftests/net/rtnetlink.sh @@ -521,6 +521,11 @@ kci_test_encap_fou() return $ksft_skip fi + if ! /sbin/modprobe -q -n fou; then + echo "SKIP: module fou is not found" + return $ksft_skip + fi + /sbin/modprobe -q fou ip -netns "$testns" fou add port ipproto 47 2>/dev/null if [ $? -ne 0 ];then echo "FAIL: can't add fou port , skipping test" @@ -541,6 +546,7 @@ kci_test_encap_fou() return 1 fi + /sbin/modprobe -q -r fou echo "PASS: fou" } -- 2.7.4
drivers/char/agp/i460-agp.c:254:19: sparse: expected void gatt
tree: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master head: dc06fe51d26efc100ac74121607c01a454867c91 commit: df41017eafd267c08acbfff99d34e4f96bbfbc92 ia64: remove support for machvecs date: 12 months ago config: ia64-randconfig-s031-20200813 (attached as .config) compiler: ia64-linux-gcc (GCC) 9.3.0 reproduce: wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # apt-get install sparse # sparse version: v0.6.2-168-g9554805c-dirty git checkout df41017eafd267c08acbfff99d34e4f96bbfbc92 # save the attached .config to linux build tree COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=ia64 If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot sparse warnings: (new ones prefixed by >>) drivers/char/agp/i460-agp.c:254:19: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected void *static [assigned] [toplevel] gatt @@ got void [noderef] * @@ >> drivers/char/agp/i460-agp.c:254:19: sparse: expected void *static >> [assigned] [toplevel] gatt drivers/char/agp/i460-agp.c:254:19: sparse: got void [noderef] * drivers/char/agp/i460-agp.c:266:17: sparse: sparse: incorrect type in argument 2 (different address spaces) @@ expected void volatile [noderef] *addr @@ got unsigned int [usertype] * @@ drivers/char/agp/i460-agp.c:266:17: sparse: expected void volatile [noderef] *addr >> drivers/char/agp/i460-agp.c:266:17: sparse: got unsigned int [usertype] * drivers/char/agp/i460-agp.c:267:9: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected void const volatile [noderef] *addr @@ got unsigned int [usertype] * @@ drivers/char/agp/i460-agp.c:267:9: sparse: expected void const volatile [noderef] *addr drivers/char/agp/i460-agp.c:267:9: sparse: got unsigned int [usertype] * drivers/char/agp/i460-agp.c:281:17: sparse: sparse: incorrect type in argument 2 (different address spaces) @@ expected void volatile [noderef] *addr @@ got unsigned int [usertype] * @@ drivers/char/agp/i460-agp.c:281:17: sparse: expected void volatile [noderef] *addr drivers/char/agp/i460-agp.c:281:17: sparse: got unsigned int [usertype] * drivers/char/agp/i460-agp.c:282:9: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected void const volatile [noderef] *addr @@ got unsigned int [usertype] * @@ drivers/char/agp/i460-agp.c:282:9: sparse: expected void const volatile [noderef] *addr drivers/char/agp/i460-agp.c:282:9: sparse: got unsigned int [usertype] * drivers/char/agp/i460-agp.c:284:21: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected void volatile [noderef] *addr @@ got void *static [assigned] [toplevel] gatt @@ drivers/char/agp/i460-agp.c:284:21: sparse: expected void volatile [noderef] *addr >> drivers/char/agp/i460-agp.c:284:21: sparse: got void *static [assigned] >> [toplevel] gatt drivers/char/agp/i460-agp.c:318:22: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected void const volatile [noderef] *addr @@ got unsigned int [usertype] * @@ drivers/char/agp/i460-agp.c:318:22: sparse: expected void const volatile [noderef] *addr drivers/char/agp/i460-agp.c:318:22: sparse: got unsigned int [usertype] * drivers/char/agp/i460-agp.c:318:22: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected void const volatile [noderef] *addr @@ got unsigned int [usertype] * @@ drivers/char/agp/i460-agp.c:318:22: sparse: expected void const volatile [noderef] *addr drivers/char/agp/i460-agp.c:318:22: sparse: got unsigned int [usertype] * drivers/char/agp/i460-agp.c:319:25: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected void const volatile [noderef] *addr @@ got unsigned int [usertype] * @@ drivers/char/agp/i460-agp.c:319:25: sparse: expected void const volatile [noderef] *addr drivers/char/agp/i460-agp.c:319:25: sparse: got unsigned int [usertype] * drivers/char/agp/i460-agp.c:330:25: sparse: sparse: incorrect type in argument 2 (different address spaces) @@ expected void volatile [noderef] *addr @@ got unsigned int [usertype] * @@ drivers/char/agp/i460-agp.c:330:25: sparse: expected void volatile [noderef] *addr drivers/char/agp/i460-agp.c:330:25: sparse: got unsigned int [usertype] * drivers/char/agp/i460-agp.c:332:9: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected void const volatile [noderef] *addr @@ got unsigned int [usertype] * @@
Re: [PATCH RFC v2 00/18] Add VFIO mediated device support and DEV-MSI support for the idxd driver
On 2020/8/12 下午12:05, Tian, Kevin wrote: The problem is that if we tie all controls via VFIO uAPI, the other subsystem like vDPA is likely to duplicate them. I wonder if there is a way to decouple the vSVA out of VFIO uAPI? vSVA is a per-device (either pdev or mdev) feature thus naturally should be managed by its device driver (VFIO or vDPA). From this angle some duplication is inevitable given VFIO and vDPA are orthogonal passthrough frameworks. Within the kernel the majority of vSVA handling is done by IOMMU and IOASID modules thus most logic are shared. So why not introduce vSVA uAPI at IOMMU or IOASID layer? If an userspace DMA interface can be easily adapted to be a passthrough one, it might be the choice. It's not that easy even for VFIO which requires a lot of new uAPIs and infrastructures(e.g mdev) to be invented. But for idxd, we see mdev a much better fit here, given the big difference between what userspace DMA requires and what guest driver requires in this hw. A weak point for mdev is that it can't serve kernel subsystem other than VFIO. In this case, you need some other infrastructures (like [1]) to do this. mdev is not exclusive from kernel usages. It's perfectly fine for a driver to reserve some work queues for host usages, while wrapping others into mdevs. I meant you may want slices to be an independent device from the kernel point of view: E.g for ethernet devices, you may want 10K mdevs to be passed to guest. Similarly, you may want 10K net devices which is connected to the kernel networking subsystems. In this case it's not simply reserving queues but you need some other type of device abstraction. There could be some kind of duplication between this and mdev. Thanks Thanks Kevin
Re: [PATCH] opp: Fix dev_pm_opp_set_rate() to not return early
On 11-08-20, 14:09, Stephen Boyd wrote: > This is a goto maze! Any chance we can clean this up? I have sent a short series in reply to this series, please have a look. It should look better now. -- viresh
[PATCH V2 1/4] opp: Enable resources again if they were disabled earlier
From: Rajendra Nayak dev_pm_opp_set_rate() can now be called with freq = 0 in order to either drop performance or bandwidth votes or to disable regulators on platforms which support them. In such cases, a subsequent call to dev_pm_opp_set_rate() with the same frequency ends up returning early because 'old_freq == freq' Instead make it fall through and put back the dropped performance and bandwidth votes and/or enable back the regulators. Cc: v5.3+ # v5.3+ Fixes: cd7ea582 ("opp: Make dev_pm_opp_set_rate() handle freq = 0 to drop performance votes") Reported-by: Sajida Bhanu Reviewed-by: Sibi Sankar Reported-by: Matthias Kaehlcke Tested-by: Matthias Kaehlcke Signed-off-by: Rajendra Nayak [ Viresh: Don't skip clk_set_rate() and massaged changelog ] Signed-off-by: Viresh Kumar --- Hi Rajendra, I wasn't able to test this stuff, please give it a try. I have simplified your patch and cleaned up a bunch of stuff as well. drivers/opp/core.c | 11 +++ 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index bdb028c7793d..9668ea04cc80 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -934,10 +934,13 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) /* Return early if nothing to do */ if (old_freq == freq) { - dev_dbg(dev, "%s: old/new frequencies (%lu Hz) are same, nothing to do\n", - __func__, freq); - ret = 0; - goto put_opp_table; + if (!opp_table->required_opp_tables && !opp_table->regulators && + !opp_table->paths) { + dev_dbg(dev, "%s: old/new frequencies (%lu Hz) are same, nothing to do\n", + __func__, freq); + ret = 0; + goto put_opp_table; + } } /* -- 2.14.1
[PATCH 2/4] opp: Track device's resources configuration status
The OPP core needs to track if the resources of devices are enabled or configured or not, as it disables the resources when target_freq is set to 0. Handle that with a separate variable to make it easy to maintain. Also note that we will unconditionally call clk_set_rate() in the case where the resources are getting enabled again. This shouldn't have any side effects anyway. Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 37 ++--- drivers/opp/opp.h | 2 ++ 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 9668ea04cc80..e8882e7fd8a5 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -888,22 +888,18 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) } if (unlikely(!target_freq)) { + ret = 0; + + if (!opp_table->enabled) + goto put_opp_table; + /* * Some drivers need to support cases where some platforms may * have OPP table for the device, while others don't and * opp_set_rate() just needs to behave like clk_set_rate(). */ - if (!_get_opp_count(opp_table)) { - ret = 0; - goto put_opp_table; - } - - if (!opp_table->required_opp_tables && !opp_table->regulators && - !opp_table->paths) { - dev_err(dev, "target frequency can't be 0\n"); - ret = -EINVAL; + if (!_get_opp_count(opp_table)) goto put_opp_table; - } ret = _set_opp_bw(opp_table, NULL, dev, true); if (ret) @@ -915,6 +911,9 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) } ret = _set_required_opps(dev, opp_table, NULL); + if (!ret) + opp_table->enabled = false; + goto put_opp_table; } @@ -933,14 +932,11 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) old_freq = clk_get_rate(clk); /* Return early if nothing to do */ - if (old_freq == freq) { - if (!opp_table->required_opp_tables && !opp_table->regulators && - !opp_table->paths) { - dev_dbg(dev, "%s: old/new frequencies (%lu Hz) are same, nothing to do\n", - __func__, freq); - ret = 0; - goto put_opp_table; - } + if (opp_table->enabled && old_freq == freq) { + dev_dbg(dev, "%s: old/new frequencies (%lu Hz) are same, nothing to do\n", + __func__, freq); + ret = 0; + goto put_opp_table; } /* @@ -1001,8 +997,11 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) dev_err(dev, "Failed to set required opps: %d\n", ret); } - if (!ret) + if (!ret) { ret = _set_opp_bw(opp_table, opp, dev, false); + if (!ret) + opp_table->enabled = true; + } put_opp: dev_pm_opp_put(opp); diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h index e51646ff279e..bd35802acc6e 100644 --- a/drivers/opp/opp.h +++ b/drivers/opp/opp.h @@ -152,6 +152,7 @@ enum opp_table_access { * property). * @paths: Interconnect path handles * @path_count: Number of interconnect paths + * @enabled: Set to true if the device's resources are enabled/configured. * @genpd_performance_state: Device's power domain support performance state. * @is_genpd: Marks if the OPP table belongs to a genpd. * @set_opp: Platform specific set_opp callback @@ -198,6 +199,7 @@ struct opp_table { bool regulator_enabled; struct icc_path **paths; unsigned int path_count; + bool enabled; bool genpd_performance_state; bool is_genpd; -- 2.14.1
[PATCH 3/4] opp: Reused enabled flag and remove regulator_enabled
The common "enabled" flag can be used here instead of "regulator_enabled" now. Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 13 +++-- drivers/opp/opp.h | 2 -- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index e8882e7fd8a5..5f5da257f58a 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -703,12 +703,10 @@ static int _generic_set_opp_regulator(struct opp_table *opp_table, * Enable the regulator after setting its voltages, otherwise it breaks * some boot-enabled regulators. */ - if (unlikely(!opp_table->regulator_enabled)) { + if (unlikely(!opp_table->enabled)) { ret = regulator_enable(reg); if (ret < 0) dev_warn(dev, "Failed to enable regulator: %d", ret); - else - opp_table->regulator_enabled = true; } return 0; @@ -905,10 +903,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) if (ret) goto put_opp_table; - if (opp_table->regulator_enabled) { - regulator_disable(opp_table->regulators[0]); - opp_table->regulator_enabled = false; - } + regulator_disable(opp_table->regulators[0]); ret = _set_required_opps(dev, opp_table, NULL); if (!ret) @@ -1795,11 +1790,9 @@ void dev_pm_opp_put_regulators(struct opp_table *opp_table) /* Make sure there are no concurrent readers while updating opp_table */ WARN_ON(!list_empty(_table->opp_list)); - if (opp_table->regulator_enabled) { + if (opp_table->enabled) { for (i = opp_table->regulator_count - 1; i >= 0; i--) regulator_disable(opp_table->regulators[i]); - - opp_table->regulator_enabled = false; } for (i = opp_table->regulator_count - 1; i >= 0; i--) diff --git a/drivers/opp/opp.h b/drivers/opp/opp.h index bd35802acc6e..0c3de3f6db5c 100644 --- a/drivers/opp/opp.h +++ b/drivers/opp/opp.h @@ -147,7 +147,6 @@ enum opp_table_access { * @clk: Device's clock handle * @regulators: Supply regulators * @regulator_count: Number of power supply regulators. Its value can be -1 - * @regulator_enabled: Set to true if regulators were previously enabled. * (uninitialized), 0 (no opp-microvolt property) or > 0 (has opp-microvolt * property). * @paths: Interconnect path handles @@ -196,7 +195,6 @@ struct opp_table { struct clk *clk; struct regulator **regulators; int regulator_count; - bool regulator_enabled; struct icc_path **paths; unsigned int path_count; bool enabled; -- 2.14.1
[PATCH 4/4] opp: Split out _opp_set_rate_zero()
Create separate routine _opp_set_rate_zero() to handle !target_freq case. Signed-off-by: Viresh Kumar --- drivers/opp/core.c | 52 +--- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/drivers/opp/core.c b/drivers/opp/core.c index 5f5da257f58a..e198b57efcf8 100644 --- a/drivers/opp/core.c +++ b/drivers/opp/core.c @@ -860,6 +860,34 @@ int dev_pm_opp_set_bw(struct device *dev, struct dev_pm_opp *opp) } EXPORT_SYMBOL_GPL(dev_pm_opp_set_bw); +static int _opp_set_rate_zero(struct device *dev, struct opp_table *opp_table) +{ + int ret; + + if (!opp_table->enabled) + return 0; + + /* +* Some drivers need to support cases where some platforms may +* have OPP table for the device, while others don't and +* opp_set_rate() just needs to behave like clk_set_rate(). +*/ + if (!_get_opp_count(opp_table)) + return 0; + + ret = _set_opp_bw(opp_table, NULL, dev, true); + if (ret) + return ret; + + regulator_disable(opp_table->regulators[0]); + + ret = _set_required_opps(dev, opp_table, NULL); + if (!ret) + opp_table->enabled = false; + + return ret; +} + /** * dev_pm_opp_set_rate() - Configure new OPP based on frequency * @dev:device for which we do this operation @@ -886,29 +914,7 @@ int dev_pm_opp_set_rate(struct device *dev, unsigned long target_freq) } if (unlikely(!target_freq)) { - ret = 0; - - if (!opp_table->enabled) - goto put_opp_table; - - /* -* Some drivers need to support cases where some platforms may -* have OPP table for the device, while others don't and -* opp_set_rate() just needs to behave like clk_set_rate(). -*/ - if (!_get_opp_count(opp_table)) - goto put_opp_table; - - ret = _set_opp_bw(opp_table, NULL, dev, true); - if (ret) - goto put_opp_table; - - regulator_disable(opp_table->regulators[0]); - - ret = _set_required_opps(dev, opp_table, NULL); - if (!ret) - opp_table->enabled = false; - + ret = _opp_set_rate_zero(dev, opp_table); goto put_opp_table; } -- 2.14.1
Re: [RFC PATCH 00/16] Core scheduling v6
On 2020/8/13 7:08, Joel Fernandes wrote: > On Wed, Aug 12, 2020 at 10:01:24AM +0800, Li, Aubrey wrote: >> Hi Joel, >> >> On 2020/8/10 0:44, Joel Fernandes wrote: >>> Hi Aubrey, >>> >>> Apologies for replying late as I was still looking into the details. >>> >>> On Wed, Aug 05, 2020 at 11:57:20AM +0800, Li, Aubrey wrote: >>> [...] +/* + * Core scheduling policy: + * - CORE_SCHED_DISABLED: core scheduling is disabled. + * - CORE_COOKIE_MATCH: tasks with same cookie can run + * on the same core concurrently. + * - CORE_COOKIE_TRUST: trusted task can run with kernel thread on the same core concurrently. + * - CORE_COOKIE_LONELY: tasks with cookie can run only + * with idle thread on the same core. + */ +enum coresched_policy { + CORE_SCHED_DISABLED, + CORE_SCHED_COOKIE_MATCH, + CORE_SCHED_COOKIE_TRUST, + CORE_SCHED_COOKIE_LONELY, +}; We can set policy to CORE_COOKIE_TRUST of uperf cgroup and fix this kind of performance regression. Not sure if this sounds attractive? >>> >>> Instead of this, I think it can be something simpler IMHO: >>> >>> 1. Consider all cookie-0 task as trusted. (Even right now, if you apply the >>>core-scheduling patchset, such tasks will share a core and sniff on each >>>other. So let us not pretend that such tasks are not trusted). >>> >>> 2. All kernel threads and idle task would have a cookie 0 (so that will >>> cover >>>ksoftirqd reported in your original issue). >>> >>> 3. Add a config option (CONFIG_SCHED_CORE_DEFAULT_TASKS_UNTRUSTED). Default >>>enable it. Setting this option would tag all tasks that are forked from a >>>cookie-0 task with their own cookie. Later on, such tasks can be added to >>>a group. This cover's PeterZ's ask about having 'default untrusted'). >>>(Users like ChromeOS that don't want to userspace system processes to be >>>tagged can disable this option so such tasks will be cookie-0). >>> >>> 4. Allow prctl/cgroup interfaces to create groups of tasks and override the >>>above behaviors. >> >> How does uperf in a cgroup work with ksoftirqd? Are you suggesting I set >> uperf's >> cookie to be cookie-0 via prctl? > > Yes, but let me try to understand better. There are 2 problems here I think: > > 1. ksoftirqd getting idled when HT is turned on, because uperf is sharing a > core with it: This should not be any worse than SMT OFF, because even SMT OFF > would also reduce ksoftirqd's CPU time just core sched is doing. Sure > core-scheduling adds some overhead with IPIs but such a huge drop of perf is > strange. Peter any thoughts on that? > > 2. Interface: To solve the performance problem, you are saying you want uperf > to share a core with ksoftirqd so that it is not forced into idle. Why not > just keep uperf out of the cgroup? I guess this is unacceptable for who runs their apps in container and vm. Thanks, -Aubrey > Then it will have cookie 0 and be able to > share core with kernel threads. About user-user isolation that you need, if > you tag any "untrusted" threads by adding it to CGroup, then there will > automatically isolated from uperf while allowing uperf to share CPU with > kernel threads. > > Please let me know your thoughts and thanks, > > - Joel > >> >> Thanks, >> -Aubrey >>> >>> 5. Document everything clearly so the semantics are clear both to the >>>developers of core scheduling and to system administrators. >>> >>> Note that, with the concept of "system trusted cookie", we can also do >>> optimizations like: >>> 1. Disable STIBP when switching into trusted tasks. >>> 2. Disable L1D flushing / verw stuff for L1TF/MDS issues, when switching >>> into >>>trusted tasks. >>> >>> At least #1 seems to be biting enabling HT on ChromeOS right now, and one >>> other engineer requested I do something like #2 already. >>> >>> Once we get full-syscall isolation working, threads belonging to a process >>> can also share a core so those can just share a core with the task-group >>> leader. >>> > Is the uperf throughput worse with SMT+core-scheduling versus no-SMT ? This is a good question, from the data we measured by uperf, SMT+core-scheduling is 28.2% worse than no-SMT, :( >>> >>> This is worrying for sure. :-(. We ought to debug/profile it more to see >>> what >>> is causing the overhead. Me/Vineeth added it as a topic for LPC as well. >>> >>> Any other thoughts from others on this? >>> >>> thanks, >>> >>> - Joel >>> >>> > thanks, > > - Joel > PS: I am planning to write a patch behind a CONFIG option that tags > all processes (default untrusted) so everything gets a cookie which > some folks said was how they wanted (have a whitelist instead of > blacklist). > >>
[rcu:dev.2020.08.11a 110/112] ERROR: modpost: "rcu_read_unlock_strict" undefined!
Hi Paul, First bad commit (maybe != root cause): tree: https://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev.2020.08.11a head: 9030fc13639a71b15466520f09ca07b80fc30ed1 commit: 601cd69fa4bb0127bb774cc852203fd9fdf5e269 [110/112] rcuperf: Change rcuperf to rcuscale config: riscv-allmodconfig (attached as .config) compiler: riscv64-linux-gcc (GCC) 9.3.0 reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross git checkout 601cd69fa4bb0127bb774cc852203fd9fdf5e269 # save the attached .config to linux build tree COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=riscv If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot All errors (new ones prefixed by >>, old ones prefixed by <<): ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/wireless/intel/iwlegacy/iwl4965.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/wireless/intel/iwlegacy/iwlegacy.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/wireless/ath/ath11k/ath11k.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/wireless/ath/wcn36xx/wcn36xx.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/wireless/ath/ath10k/ath10k_core.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/wireless/ath/carl9170/carl9170.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/wireless/ath/ath9k/ath9k_htc.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/wireless/ath/ath9k/ath9k_hw.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/wireless/ath/ath9k/ath9k.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/wan/lapbether.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/wan/hdlc_cisco.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/hamradio/bpqether.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/via/via-velocity.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/ti/ti_cpsw_new.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/ti/ti_cpsw.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/socionext/netsec.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/sfc/sfc.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/qlogic/qede/qede.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/qlogic/netxen/netxen_nic.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/qlogic/qlcnic/qlcnic.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/netronome/nfp/nfp.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/mscc/mscc_ocelot_common.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/mellanox/mlxsw/mlxsw_spectrum.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/mellanox/mlxsw/mlxsw_core.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/mellanox/mlx4/mlx4_en.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/mellanox/mlx4/mlx4_core.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/marvell/mvneta.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/intel/ice/ice.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/intel/fm10k/fm10k.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/intel/iavf/iavf.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/intel/i40e/i40e.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/intel/ixgbevf/ixgbevf.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/intel/ixgbe/ixgbe.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/intel/igc/igc.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/intel/igb/igb.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/chelsio/cxgb4/cxgb4.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/chelsio/cxgb3/cxgb3.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/cavium/thunder/nicvf.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/broadcom/bnxt/bnxt_en.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/broadcom/bnx2x/bnx2x.ko] undefined! ERROR: modpost: "rcu_read_unlock_strict" [drivers/net/ethernet/broadcom/cnic.ko]
Re: KMSAN: uninit-value in joydev_connect
syzbot has found a reproducer for the following issue on: HEAD commit:ce8056d1 wip: changed copy_from_user where instrumented git tree: https://github.com/google/kmsan.git master console output: https://syzkaller.appspot.com/x/log.txt?x=127b9fba90 kernel config: https://syzkaller.appspot.com/x/.config?x=3afe005fb99591f dashboard link: https://syzkaller.appspot.com/bug?extid=6a1bb5a33a0b128085bc compiler: clang version 10.0.0 (https://github.com/llvm/llvm-project/ c2443155a0fb245c8f17f2c1c72b6ea391e86e81) syz repro: https://syzkaller.appspot.com/x/repro.syz?x=1472674e90 C reproducer: https://syzkaller.appspot.com/x/repro.c?x=15a1339a90 IMPORTANT: if you fix the issue, please add the following tag to the commit: Reported-by: syzbot+6a1bb5a33a0b12808...@syzkaller.appspotmail.com usb 1-1: config 0 interface 219 altsetting 0 endpoint 0xA has invalid wMaxPacketSize 0 usb 1-1: New USB device found, idVendor=078c, idProduct=1002, bcdDevice=e6.47 usb 1-1: New USB device strings: Mfr=0, Product=0, SerialNumber=0 usb 1-1: config 0 descriptor?? gtco 1-1:0.219: Collection level already at zero input: GTCO_CalComp as /devices/platform/dummy_hcd.0/usb1/1-1/1-1:0.219/input/input5 = BUG: KMSAN: uninit-value in joydev_connect+0x10c0/0x1920 drivers/input/joydev.c:958 CPU: 1 PID: 27 Comm: kworker/1:1 Not tainted 5.8.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: usb_hub_wq hub_event Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x21c/0x280 lib/dump_stack.c:118 kmsan_report+0xf7/0x1e0 mm/kmsan/kmsan_report.c:121 __msan_warning+0x58/0xa0 mm/kmsan/kmsan_instr.c:215 joydev_connect+0x10c0/0x1920 drivers/input/joydev.c:958 input_attach_handler drivers/input/input.c:1031 [inline] input_register_device+0x1d7b/0x21c0 drivers/input/input.c:2229 gtco_probe+0x32ce/0x39b0 drivers/input/tablet/gtco.c:990 usb_probe_interface+0xece/0x1550 drivers/usb/core/driver.c:374 really_probe+0xf20/0x20b0 drivers/base/dd.c:529 driver_probe_device+0x293/0x390 drivers/base/dd.c:701 __device_attach_driver+0x63f/0x830 drivers/base/dd.c:807 bus_for_each_drv+0x2ca/0x3f0 drivers/base/bus.c:431 __device_attach+0x4e2/0x7f0 drivers/base/dd.c:873 devic
Re: [PATCH v2 2/2] power: supply: sbs-battery: don't assume every i2c errors as battery disconnect
On Tue, Aug 11, 2020 at 2:59 PM Ikjoon Jang wrote: > > Current sbs-battery considers all smbus errors as diconnection events disconnection > when battery-detect pin isn't supplied, and restored to present state back > on any successful transaction were made. when any... is made > > This can leads lead > to unlimited state changes between present and !present > when one unsupported command was requested and other following commands > were successful, e.g. udev rules tries to read multiple properties. > > This patch checks battery presence by reading known good command to > check battery existence. > > Signed-off-by: Ikjoon Jang > --- > v2: fix return value checking of sbs_get_battery_presence_and_health() > --- > drivers/power/supply/sbs-battery.c | 26 +- > 1 file changed, 17 insertions(+), 9 deletions(-) > > diff --git a/drivers/power/supply/sbs-battery.c > b/drivers/power/supply/sbs-battery.c > index 6acb4ea25d2a..db964a470ebc 100644 > --- a/drivers/power/supply/sbs-battery.c > +++ b/drivers/power/supply/sbs-battery.c > @@ -878,10 +878,17 @@ static int sbs_get_property(struct power_supply *psy, > if (!chip->enable_detection) > goto done; > > - if (!chip->gpio_detect && > - chip->is_present != (ret >= 0)) { > - sbs_update_presence(chip, (ret >= 0)); > - power_supply_changed(chip->power_supply); > + if (!chip->gpio_detect && chip->is_present != (ret >=0)) { > + bool old_present = chip->is_present; > + union power_supply_propval val; > + > + sbs_get_battery_presence_and_health( > + client, POWER_SUPPLY_PROP_PRESENT, ); > + > + sbs_update_presence(chip, val.intval); I don't think you can/should assume that val.intval will be set correctly if the return value is negative (even if that's what the functions currently do, it'd be too easy to accidentally change them). So I still think you need to have: ret = sbs_get_battery_presence_and_health... sbs_update_presence(chip, !ret && val.intval); > + > + if (old_present != chip->is_present) > + power_supply_changed(chip->power_supply); > } > > done: > @@ -1067,11 +1074,12 @@ static int sbs_probe(struct i2c_client *client) > * to the battery. > */ > if (!(force_load || chip->gpio_detect)) { > - rc = sbs_read_word_data(client, sbs_data[REG_STATUS].addr); > - > - if (rc < 0) { > - dev_err(>dev, "%s: Failed to get device > status\n", > - __func__); > + union power_supply_propval val; > + sbs_get_battery_presence_and_health( > + client, POWER_SUPPLY_PROP_PRESENT, ); > + if (!val.intval) { > + dev_err(>dev, "Failed to get present > status\n"); > + rc = -ENODEV; > goto exit_psupply; > } > } > -- > 2.28.0.236.gb10cc79966-goog >
RE: [PATCH v3] exfat: remove EXFAT_SB_DIRTY flag
> Thanks for thinking on this complicated issue. > > > > Most of the NAND flash devices and HDDs have wear leveling and bad sector > > replacement algorithms > applied. > > So I think that the life of the boot sector will not be exhausted first. > > I'm not too worried about the life of the boot-sector. > I'm worried about write failures caused by external factors. > (power failure/system down/vibration/etc. during writing) They rarely occur > on SD cards, but occur on > many HDDs, some SSDs and USB storages by 0.1% or more. Hard disk and SSD do not guarantee atomic write of a sector unit? > Especially with AFT-HDD, not only boot-sector but also the following multiple > sectors become > unreadable. Other file systems will also be unstable on a such HW. > It is not possible to completely solve this problem, as long as writing to > the boot-sector. > (I think it's a exFAT's specification defect) The only effective way to > reduce this problem is to > reduce writes to the boot-sector. exFAT's specification defect... Well.. Even though the boot sector is corrupted, It can be recovered using the backup boot sector through fsck. > > > > Currently the volume dirty/clean policy of exfat-fs is not perfect, > > Thank you for sharing the problem with you. > > > > but I think it behaves similarly to the policy of MS Windows. > > On Windows10, the dirty flag is cleared after more than 15 seconds after all > write operations are > completed. > (dirty-flag is never updated during the write operation continues) > > > > Therefore, > > I think code improvements should be made to reduce volume flag records > > while maintaining the current > policy. > > Current policy is inconsistent. > As I wrote last mail, the problem with the current implementation is that the > dirty-flag may not be > cleared after the write operation.(even if sync is enabled or disabled) > Because, some write operations > clear the dirty-flag but some don't clear. > Unmount or sync command is the only way to ensure that the dirty-flag is > cleared. > This has no effect on clearing the dirty-flag after a write operations, it > only increases risk of > destroying the boot-sector. > - Clear the dirty-flag after every write operation. > - Never clear the dirty-flag after every write operation. > Unless unified to either one, I think that sync policy cannot be consistent. > > How do you think? > > > BR > --- > etsuhiro Kohada
[RFC PATCH 0/3] Re: [PATCH v17 14/21] mm/compaction: do page isolation first in compaction
Here are the patches I had discussed earlier to address the issues in isolate_migratepages_block. They are based on the tree at: https://github.com/alexshi/linux.git lrunext The first patch is mostly cleanup to address the RCU locking in the function. The second addresses the test_and_set_skip issue, and the third relocates PageCompound. I did some digging into the history of the skip bits and since they are only supposed to be a hint I thought we could probably just drop the testing portion of the call since the LRU flag is preventing more than one thread from accessing the function anyway so it would make sense to just switch it to a set operation similar to what happens when low_pfn == end_pfn at the end of the call. I have only had a chance to build test these since rebasing on the tree. In addition I am not 100% certain the PageCompound changes are correct as they operate on the assumption that get_page_unless_zero is enough to keep a compound page from being split up. I plan on doing some testing tomorrow, but thought I would push these out now so that we could discuss them. --- Alexander Duyck (3): mm: Drop locked from isolate_migratepages_block mm: Drop use of test_and_set_skip in favor of just setting skip mm: Identify compound pages sooner in isolate_migratepages_block mm/compaction.c | 126 +++ 1 file changed, 44 insertions(+), 82 deletions(-) --
[RFC PATCH 1/3] mm: Drop locked from isolate_migratepages_block
From: Alexander Duyck We can drop the need for the locked variable by making use of the lruvec_holds_page_lru_lock function. By doing this we can avoid some rcu locking ugliness for the case where the lruvec is still holding the LRU lock associated with the page. Instead we can just use the lruvec and if it is NULL we assume the lock was released. Signed-off-by: Alexander Duyck --- mm/compaction.c | 45 - 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index b99c96c4862d..5021a18ef722 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -803,9 +803,8 @@ static bool too_many_isolated(pg_data_t *pgdat) { pg_data_t *pgdat = cc->zone->zone_pgdat; unsigned long nr_scanned = 0, nr_isolated = 0; - struct lruvec *lruvec; + struct lruvec *lruvec = NULL; unsigned long flags = 0; - struct lruvec *locked = NULL; struct page *page = NULL, *valid_page = NULL; unsigned long start_pfn = low_pfn; bool skip_on_failure = false; @@ -866,9 +865,9 @@ static bool too_many_isolated(pg_data_t *pgdat) * a fatal signal is pending. */ if (!(low_pfn % SWAP_CLUSTER_MAX)) { - if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); - locked = NULL; + if (lruvec) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; } if (fatal_signal_pending(current)) { @@ -949,9 +948,9 @@ static bool too_many_isolated(pg_data_t *pgdat) */ if (unlikely(__PageMovable(page)) && !PageIsolated(page)) { - if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); - locked = NULL; + if (lruvec) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; } if (!isolate_movable_page(page, isolate_mode)) @@ -992,16 +991,13 @@ static bool too_many_isolated(pg_data_t *pgdat) if (!TestClearPageLRU(page)) goto isolate_fail_put; - rcu_read_lock(); - lruvec = mem_cgroup_page_lruvec(page, pgdat); - /* If we already hold the lock, we can skip some rechecking */ - if (lruvec != locked) { - if (locked) - unlock_page_lruvec_irqrestore(locked, flags); + if (!lruvec || !lruvec_holds_page_lru_lock(page, lruvec)) { + if (lruvec) + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = mem_cgroup_page_lruvec(page, pgdat); compact_lock_irqsave(>lru_lock, , cc); - locked = lruvec; rcu_read_unlock(); lruvec_memcg_debug(lruvec, page); @@ -1023,8 +1019,7 @@ static bool too_many_isolated(pg_data_t *pgdat) SetPageLRU(page); goto isolate_fail_put; } - } else - rcu_read_unlock(); + } /* The whole page is taken off the LRU; skip the tail pages. */ if (PageCompound(page)) @@ -1057,9 +1052,9 @@ static bool too_many_isolated(pg_data_t *pgdat) isolate_fail_put: /* Avoid potential deadlock in freeing page under lru_lock */ - if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); - locked = NULL; + if (lruvec) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; } put_page(page); @@ -1073,9 +1068,9 @@ static bool too_many_isolated(pg_data_t *pgdat) * page anyway. */ if (nr_isolated) { - if (locked) { - unlock_page_lruvec_irqrestore(locked, flags); - locked = NULL; + if (lruvec) { + unlock_page_lruvec_irqrestore(lruvec, flags); + lruvec = NULL; } putback_movable_pages(>migratepages); cc->nr_migratepages = 0; @@ -1102,8 +1097,8 @@ static bool too_many_isolated(pg_data_t *pgdat) page = NULL;
[RFC PATCH 2/3] mm: Drop use of test_and_set_skip in favor of just setting skip
From: Alexander Duyck The only user of test_and_set_skip was isolate_migratepages_block and it was using it after a call that was testing and clearing the LRU flag. As such it really didn't need to be behind the LRU lock anymore as it wasn't really fulfilling its purpose. With that being the case we can simply drop the bit and instead directly just call the set_pageblock_skip function if the page we are working on is the valid_page at the start of the pageblock. It shouldn't be possible for us to encounter the bit being set since we obtained the LRU flag for the first page in the pageblock which means we would have exclusive access to setting the skip bit. As such we don't need to worry about the abort case since no other thread will be able to call what used to be test_and_set_skip. Since we have dropped the late abort case we can drop the code that was clearing the LRU flag and calling page_put since the abort case will now not be holding a reference to a page. Signed-off-by: Alexander Duyck --- mm/compaction.c | 50 +++--- 1 file changed, 7 insertions(+), 43 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index 5021a18ef722..c1e9918f9dd4 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -399,29 +399,6 @@ void reset_isolation_suitable(pg_data_t *pgdat) } } -/* - * Sets the pageblock skip bit if it was clear. Note that this is a hint as - * locks are not required for read/writers. Returns true if it was already set. - */ -static bool test_and_set_skip(struct compact_control *cc, struct page *page, - unsigned long pfn) -{ - bool skip; - - /* Do no update if skip hint is being ignored */ - if (cc->ignore_skip_hint) - return false; - - if (!IS_ALIGNED(pfn, pageblock_nr_pages)) - return false; - - skip = get_pageblock_skip(page); - if (!skip && !cc->no_set_skip_hint) - skip = !set_pageblock_skip(page); - - return skip; -} - static void update_cached_migrate(struct compact_control *cc, unsigned long pfn) { struct zone *zone = cc->zone; @@ -480,12 +457,6 @@ static inline void update_pageblock_skip(struct compact_control *cc, static void update_cached_migrate(struct compact_control *cc, unsigned long pfn) { } - -static bool test_and_set_skip(struct compact_control *cc, struct page *page, - unsigned long pfn) -{ - return false; -} #endif /* CONFIG_COMPACTION */ /* @@ -895,7 +866,6 @@ static bool too_many_isolated(pg_data_t *pgdat) if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) { if (!cc->ignore_skip_hint && get_pageblock_skip(page)) { low_pfn = end_pfn; - page = NULL; goto isolate_abort; } valid_page = page; @@ -991,6 +961,13 @@ static bool too_many_isolated(pg_data_t *pgdat) if (!TestClearPageLRU(page)) goto isolate_fail_put; + /* Indicate that we want exclusive access to this pageblock */ + if (page == valid_page) { + skip_updated = true; + if (!cc->ignore_skip_hint) + set_pageblock_skip(page); + } + /* If we already hold the lock, we can skip some rechecking */ if (!lruvec || !lruvec_holds_page_lru_lock(page, lruvec)) { if (lruvec) @@ -1002,13 +979,6 @@ static bool too_many_isolated(pg_data_t *pgdat) lruvec_memcg_debug(lruvec, page); - /* Try get exclusive access under lock */ - if (!skip_updated) { - skip_updated = true; - if (test_and_set_skip(cc, page, low_pfn)) - goto isolate_abort; - } - /* * Page become compound since the non-locked check, * and it's on LRU. It can only be a THP so the order @@ -1094,15 +1064,9 @@ static bool too_many_isolated(pg_data_t *pgdat) if (unlikely(low_pfn > end_pfn)) low_pfn = end_pfn; - page = NULL; - isolate_abort: if (lruvec) unlock_page_lruvec_irqrestore(lruvec, flags); - if (page) { - SetPageLRU(page); - put_page(page); - } /* * Updated the cached scanner pfn once the pageblock has been scanned
[RFC PATCH 3/3] mm: Identify compound pages sooner in isolate_migratepages_block
From: Alexander Duyck Since we are holding a reference to the page much sooner in isolate_migratepages_block we could move the PageCompound check out of the LRU locked section and instead just place it after get_page_unless_zero. By doing this we can allow any of the items that might trigger a failure to trigger a failure for the compound page rather than the order 0 page and as a result we should be able to process the pageblock faster. In addition by testing for PageCompound sooner we can avoid having the LRU flag cleared and then reset in the exception case. As a result this should prevent any possible races where another thread might be attempting to pull the LRU pages from the list. Signed-off-by: Alexander Duyck --- mm/compaction.c | 33 ++--- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/mm/compaction.c b/mm/compaction.c index c1e9918f9dd4..3803f129fd6a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -954,6 +954,24 @@ static bool too_many_isolated(pg_data_t *pgdat) if (unlikely(!get_page_unless_zero(page))) goto isolate_fail; + /* +* Page is compound. We know the order before we know if it is +* on the LRU so we cannot assume it is THP. However since the +* page will have the LRU validated shortly we can use the value +* to skip over this page for now or validate the LRU is set and +* then isolate the entire compound page if we are isolating to +* generate a CMA page. +*/ + if (PageCompound(page)) { + const unsigned int order = compound_order(page); + + if (likely(order < MAX_ORDER)) + low_pfn += (1UL << order) - 1; + + if (!cc->alloc_contig) + goto isolate_fail_put; + } + if (__isolate_lru_page_prepare(page, isolate_mode) != 0) goto isolate_fail_put; @@ -978,23 +996,8 @@ static bool too_many_isolated(pg_data_t *pgdat) rcu_read_unlock(); lruvec_memcg_debug(lruvec, page); - - /* -* Page become compound since the non-locked check, -* and it's on LRU. It can only be a THP so the order -* is safe to read and it's 0 for tail pages. -*/ - if (unlikely(PageCompound(page) && !cc->alloc_contig)) { - low_pfn += compound_nr(page) - 1; - SetPageLRU(page); - goto isolate_fail_put; - } } - /* The whole page is taken off the LRU; skip the tail pages. */ - if (PageCompound(page)) - low_pfn += compound_nr(page) - 1; - /* Successfully isolated */ del_page_from_lru_list(page, lruvec, page_lru(page)); mod_node_page_state(page_pgdat(page),
Re: file metadata via fs API (was: [GIT PULL] Filesystem Information)
On 8/12/2020 2:18 PM, Linus Torvalds (torva...@linux-foundation.org) wrote: > What's wrong with fstatfs()? All the extra magic metadata seems to not > really be anything people really care about. > > What people are actually asking for seems to be some unique mount ID, > and we have 16 bytes of spare information in 'struct statfs64'. > > All the other fancy fsinfo stuff seems to be "just because", and like > complete overdesign. Hi Linus, Is there any existing method by which userland applications can determine the properties of the filesystem in which a directory or file is stored in a filesystem agnostic manner? Over the past year I've observed the opendev/openstack community struggle with performance issues caused by rsync's inability to determine if the source and destination object's last update time have the same resolution and valid time range. If the source file system supports 100 nanosecond granularity and the destination file system supports one second granularity, any source file with a non-zero fractional seconds timestamp will appear to have changed compared to the copy in the destination filesystem which discarded the fractional seconds during the last sync. Sure, the end user could use the --modify-window=1 option to inform rsync to add fuzz to the comparisons, but that introduces the possibility that a file updated a fraction of a second after an rsync execution would not synchronize the file on the next run when both source and target have fine grained timestamps. If the userland sync processes have access to the source and destination filesystem time capabilities, they can make more intelligent decisions without explicit user input. At a minimum, the timestamp properties that are important to know include the range of valid timestamps and the resolution. Some filesystems support unsigned 32-bit time starting with UNIX epoch. Others signed 32-bit time with UNIX epoch. Still others FAT, NTFS, etc use alternative epochs and range and resolutions. Another case where lack of filesystem properties is problematic is "df --local" which currently relies upon string comparisons of file system name strings to determine if the underlying file system is local or remote. This requires that the gnulib maintainers have knowledge of all file systems implementations, their published names, and which category they belong to. Patches have been accepted in the past year to add "smb3", "afs", and "gpfs" to the list of remote file systems. There are many more remote filesystems that have yet to be added including "cephfs", "lustre", "gluster", etc. In many cases, the filesystem properties cannot be inferred from the filesystem name. For network file systems, these properties might depend upon the remote server capabilities or even the properties associated with a particular volume or share. Consider the case of a remote file server that supports 64-bit 100ns time but which for backward compatibility exports certain volumes or shares with more restrictive capabilities. Or the case of a network file system protocol that has evolved over time and gained new capabilities. For the AFS community, fsinfo offers a method of exposing some server and volume properties that are obtained via "path ioctls" in OpenAFS and AuriStorFS. Some example of properties that might be exposed include answers to questions such as: * what is the volume cell id? perhaps a uuid. * what is the volume id in the cell? unsigned 64-bit integer * where is a mounted volume hosted? which fileservers, named by uuid * what is the block size? 1K, 4K, ... * how many blocks are in use or available? * what is the quota (thin provisioning), if any? * what is the reserved space (fat provisioning), if any? * how many vnodes are present? * what is the vnode count limit, if any? * when was the volume created and last updated? * what is the file size limit? * are byte range locks supported? * are mandatory locks supported? * how many entries can be created within a directory? * are cross-directory hard links supported? * are directories just-send-8, case-sensitive, case-preserving, or case-insensitive? * if not just-send-8, what character set is used? * if Unicode, what normalization rules? etc. * are per-object acls supported? * what volume maximum acl is assigned, if any? * what volume security policy (authn, integ, priv) is assigned, if any? * what is the replication policy, if any? * what is the volume encryption policy, if any? * what is the volume compression policy, if any? * are server-to-server copies supported? * which of atime, ctime and mtime does the volume support? * what is the permitted timestamp range and resolution? * are xattrs supported? * what is the xattr maximum name length? * what is the xattr maximum object size? * is the volume currently reachable? * is the volume immutable? * etc ... Its true that there isn't widespread use of these filesystem properties by today's
Re: INFO: task hung in pipe_release (2)
syzbot has bisected this issue to: commit fddb5d430ad9fa91b49b1d34d0202ffe2fa0e179 Author: Aleksa Sarai Date: Sat Jan 18 12:07:59 2020 + open: introduce openat2(2) syscall bisection log: https://syzkaller.appspot.com/x/bisect.txt?x=164e716a90 start commit: 6ba1b005 Merge tag 'asm-generic-fixes-5.8' of git://git.ke.. git tree: upstream final oops: https://syzkaller.appspot.com/x/report.txt?x=154e716a90 console output: https://syzkaller.appspot.com/x/log.txt?x=114e716a90 kernel config: https://syzkaller.appspot.com/x/.config?x=84f076779e989e69 dashboard link: https://syzkaller.appspot.com/bug?extid=61acc40a49a3e46e25ea syz repro: https://syzkaller.appspot.com/x/repro.syz?x=142ae22490 Reported-by: syzbot+61acc40a49a3e46e2...@syzkaller.appspotmail.com Fixes: fddb5d430ad9 ("open: introduce openat2(2) syscall") For information about bisection process see: https://goo.gl/tpsmEJ#bisection
[PATCH net] bonding: show saner speed for broadcast mode
Broadcast mode bonds transmit a copy of all traffic simultaneously out of all interfaces, so the "speed" of the bond isn't really the aggregate of all interfaces, but rather, the speed of the lowest active interface. Also, the type of the speed field is u32, not unsigned long, so adjust that accordingly, as required to make min() function here without complaining about mismatching types. Fixes: bb5b052f751b ("bond: add support to read speed and duplex via ethtool") CC: Jay Vosburgh CC: Veaceslav Falico CC: Andy Gospodarek CC: "David S. Miller" CC: net...@vger.kernel.org Signed-off-by: Jarod Wilson --- drivers/net/bonding/bond_main.c | 21 ++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 5ad43aaf76e5..c853ca67058c 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4552,13 +4552,23 @@ static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev) return ret; } +static u32 bond_mode_bcast_speed(struct slave *slave, u32 speed) +{ + if (speed == 0 || speed == SPEED_UNKNOWN) + speed = slave->speed; + else + speed = min(speed, slave->speed); + + return speed; +} + static int bond_ethtool_get_link_ksettings(struct net_device *bond_dev, struct ethtool_link_ksettings *cmd) { struct bonding *bond = netdev_priv(bond_dev); - unsigned long speed = 0; struct list_head *iter; struct slave *slave; + u32 speed = 0; cmd->base.duplex = DUPLEX_UNKNOWN; cmd->base.port = PORT_OTHER; @@ -4570,8 +4580,13 @@ static int bond_ethtool_get_link_ksettings(struct net_device *bond_dev, */ bond_for_each_slave(bond, slave, iter) { if (bond_slave_can_tx(slave)) { - if (slave->speed != SPEED_UNKNOWN) - speed += slave->speed; + if (slave->speed != SPEED_UNKNOWN) { + if (BOND_MODE(bond) == BOND_MODE_BROADCAST) + speed = bond_mode_bcast_speed(slave, + speed); + else + speed += slave->speed; + } if (cmd->base.duplex == DUPLEX_UNKNOWN && slave->duplex != DUPLEX_UNKNOWN) cmd->base.duplex = slave->duplex; -- 2.20.1
Re: [PATCH v2] nvme: Use spin_lock_irq() when taking the ctrl->lock
On 2020-08-12 6:32 p.m., Keith Busch wrote: > There's an unrelated whitespace change in nvme_init_identify(). > Otherwise, looks fine. Oops, sorry. can this be fixed up when it's merged? Logan
Re: [PATCH v17 14/21] mm/compaction: do page isolation first in compaction
在 2020/8/13 上午10:17, Alexander Duyck 写道: >> zone lock is probability better. you can try and test. > So I spent a good chunk of today looking the code over and what I > realized is that we probably don't even really need to have this code > protected by the zone lock since the LRU bit in the pageblock should > do most of the work for us. In addition we can get rid of the test > portion of this and just make it a set only operation if I am not > mistaken. > > the LRU flag is cleared then you are creating a situation where > multiple processes will be stomping all over each other as you can > have each thread essentially take a page via the LRU flag, but only > one thread will process a page and it could skip over all other pages > that preemptively had their LRU flag cleared. It increase a bit crowd here, but lru_lock do reduce some them, and skip_bit could stop each other in a array check(bitmap). So compare to whole node lru_lock, the net profit is clear in patch 17. >>> My concern is that what you can end up with is multiple threads all >>> working over the same pageblock for isolation. With the old code the >>> LRU lock was used to make certain that test_and_set_skip was being >>> synchronized on the first page in the pageblock so you would only have >>> one thread going through and working a single pageblock. However after >>> your changes it doesn't seem like the test_and_set_skip has that >>> protection since only one thread will ever be able to successfully >>> call it for the first page in the pageblock assuming that the LRU flag >>> is set on the first page in the pageblock block. >>> > If you take a look at the test_and_set_skip the function only acts on > the pageblock aligned PFN for a given range. WIth the changes you have > in place now that would mean that only one thread would ever actually > call this function anyway since the first PFN would take the LRU flag > so no other thread could follow through and test or set the bit as Is this good for only one process could do test_and_set_skip? is that the 'skip' meaning to be? >>> So only one thread really getting to fully use test_and_set_skip is >>> good, however the issue is that there is nothing to synchronize the >>> testing from the other threads. As a result the other threads could >>> have isolated other pages within the pageblock before the thread that >>> is calling test_and_set_skip will get to complete the setting of the >>> skip bit. This will result in isolation failures for the thread that >>> set the skip bit which may be undesirable behavior. >>> >>> With the old code the threads were all synchronized on testing the >>> first PFN in the pageblock while holding the LRU lock and that is what >>> we lost. My concern is the cases where skip_on_failure == true are >>> going to fail much more often now as the threads can easily interfere >>> with each other. >> I have a patch to fix this, which is on >> https://github.com/alexshi/linux.git lrunext > I don't think that patch helps to address anything. You are now > failing to set the bit in the case that something modifies the > pageblock flags while you are attempting to do so. I think it would be > better to just leave the cmpxchg loop as it is. It do increae the case-lru-file-mmap-read in vm-scalibity about 3% performance. Yes, I am glad to see it can be make better. > > well. The expectation before was that all threads would encounter this > test and either proceed after setting the bit for the first PFN or > abort after testing the first PFN. With you changes only the first > thread actually runs this test and then it and the others will likely > encounter multiple failures as they are all clearing LRU bits > simultaneously and tripping each other up. That is why the skip bit > must have a test and set done before you even get to the point of > clearing the LRU flag. It make the things warse in my machine, would you like to have a try by yourself? >>> I plan to do that. I have already been working on a few things to >>> clean up and optimize your patch set further. I will try to submit an >>> RFC this evening so we can discuss. >>> >> Glad to see your new code soon. Would you like do it base on >> https://github.com/alexshi/linux.git lrunext > I can rebase off of that tree. It may add another half hour or so. I > have barely had any time to test my code. When I enabled some of the > debugging features in the kernel related to using the vm-scalability > tests the boot time became incredibly slow so I may just make certain > I can boot and not mess the system up before submitting my patches as > an RFC. I can probably try testing them more tomorrow. > >>> The point I was getting at with the PageCompound check is that instead >>> of needing the LRU lock you should be able to look at PageCompound as >>> soon as you call
[PATCH] mm: include CMA pages in lowmem_reserve at boot
The lowmem_reserve arrays provide a means of applying pressure against allocations from lower zones that were targeted at higher zones. Its values are a function of the number of pages managed by higher zones and are assigned by a call to the setup_per_zone_lowmem_reserve() function. The function is initially called at boot time by the function init_per_zone_wmark_min() and may be called later by accesses of the /proc/sys/vm/lowmem_reserve_ratio sysctl file. The function init_per_zone_wmark_min() was moved up from a module_init to a core_initcall to resolve a sequencing issue with khugepaged. Unfortunately this created a sequencing issue with CMA page accounting. The CMA pages are added to the managed page count of a zone when cma_init_reserved_areas() is called at boot also as a core_initcall. This makes it uncertain whether the CMA pages will be added to the managed page counts of their zones before or after the call to init_per_zone_wmark_min() as it becomes dependent on link order. With the current link order the pages are added to the managed count after the lowmem_reserve arrays are initialized at boot. This means the lowmem_reserve values at boot may be lower than the values used later if /proc/sys/vm/lowmem_reserve_ratio is accessed even if the ratio values are unchanged. In many cases the difference is not significant, but in others it may have an affect. This commit breaks the link order dependency by invoking init_per_zone_wmark_min() as a postcore_initcall so that the CMA pages have the chance to be properly accounted in their zone(s) and allowing the lowmem_reserve arrays to receive consistent values. Fixes: bc22af74f271 ("mm: update min_free_kbytes from khugepaged after core initialization") Signed-off-by: Doug Berger --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8b7d0ecf30b1..f3e340ec2b6b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7887,7 +7887,7 @@ int __meminit init_per_zone_wmark_min(void) return 0; } -core_initcall(init_per_zone_wmark_min) +postcore_initcall(init_per_zone_wmark_min) /* * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so -- 2.7.4
Re: WARNING in compat_do_ebt_get_ctl
syzbot has found a reproducer for the following issue on: HEAD commit:fb893de3 Merge tag 'tag-chrome-platform-for-v5.9' of git:/.. git tree: upstream console output: https://syzkaller.appspot.com/x/log.txt?x=1742b31c90 kernel config: https://syzkaller.appspot.com/x/.config?x=f1fedc63022bf07e dashboard link: https://syzkaller.appspot.com/bug?extid=5accb5c62faa1d346480 compiler: gcc (GCC) 10.1.0-syz 20200507 userspace arch: i386 syz repro: https://syzkaller.appspot.com/x/repro.syz?x=13280fd690 C reproducer: https://syzkaller.appspot.com/x/repro.c?x=1409f4a690 IMPORTANT: if you fix the issue, please add the following tag to the commit: Reported-by: syzbot+5accb5c62faa1d346...@syzkaller.appspotmail.com [ cut here ] Buffer overflow detected (80 < 137)! WARNING: CPU: 0 PID: 6853 at include/linux/thread_info.h:134 copy_overflow include/linux/thread_info.h:134 [inline] WARNING: CPU: 0 PID: 6853 at include/linux/thread_info.h:134 check_copy_size include/linux/thread_info.h:143 [inline] WARNING: CPU: 0 PID: 6853 at include/linux/thread_info.h:134 copy_to_user include/linux/uaccess.h:151 [inline] WARNING: CPU: 0 PID: 6853 at include/linux/thread_info.h:134 compat_do_ebt_get_ctl+0x47e/0x500 net/bridge/netfilter/ebtables.c:2270 Kernel panic - not syncing: panic_on_warn set ... CPU: 0 PID: 6853 Comm: syz-executor171 Not tainted 5.8.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x18f/0x20d lib/dump_stack.c:118 panic+0x2e3/0x75c kernel/panic.c:231 __warn.cold+0x20/0x45 kernel/panic.c:600 report_bug+0x1bd/0x210 lib/bug.c:198 handle_bug+0x38/0x90 arch/x86/kernel/traps.c:234 exc_invalid_op+0x14/0x40 arch/x86/kernel/traps.c:254 asm_exc_invalid_op+0x12/0x20 arch/x86/include/asm/idtentry.h:536 RIP: 0010:copy_overflow include/linux/thread_info.h:134 [inline] RIP: 0010:check_copy_size include/linux/thread_info.h:143 [inline] RIP: 0010:copy_to_user include/linux/uaccess.h:151 [inline] RIP: 0010:compat_do_ebt_get_ctl+0x47e/0x500 net/bridge/netfilter/ebtables.c:2270 Code: ba fd ff ff 4c 89 f7 e8 60 07 a2 fa e9 ad fd ff ff e8 36 18 62 fa 4c 89 e2 be 50 00 00 00 48 c7 c7 40 b9 0e 89 e8 94 1f 33 fa <0f> 0b e9 dc fd ff ff 41 bc f2 ff ff ff e9 4f fe ff ff e8 3b 07 a2 RSP: 0018:c90005667ae8 EFLAGS: 00010282 RAX: RBX: 192000accf5e RCX: RDX: 88809458a280 RSI: 815dbce7 RDI: f52000accf4f RBP: 8a8faa60 R08: 0001 R09: 8880ae6318e7 R10: R11: 35383654 R12: 0089 R13: 2000 R14: c90005667d80 R15: c90005667b20 do_ebt_get_ctl+0x2b4/0x790 net/bridge/netfilter/ebtables.c:2317 nf_getsockopt+0x72/0xd0 net/netfilter/nf_sockopt.c:116 ip_getsockopt net/ipv4/ip_sockglue.c:1778 [inline] ip_getsockopt+0x164/0x1c0 net/ipv4/ip_sockglue.c:1757 tcp_getsockopt+0x86/0xd0 net/ipv4/tcp.c:3884 __sys_getsockopt+0x219/0x4c0 net/socket.c:2179 __do_sys_getsockopt net/socket.c:2194 [inline] __se_sys_getsockopt net/socket.c:2191 [inline] __ia32_sys_getsockopt+0xb9/0x150 net/socket.c:2191 do_syscall_32_irqs_on arch/x86/entry/common.c:84 [inline] __do_fast_syscall_32+0x57/0x80 arch/x86/entry/common.c:126 do_fast_syscall_32+0x2f/0x70 arch/x86/entry/common.c:149 entry_SYSENTER_compat_after_hwframe+0x4d/0x5c RIP: 0023:0xf7f91569 Code: 10 05 03 74 b8 01 10 06 03 74 b4 01 10 07 03 74 b0 01 10 08 03 74 d8 01 00 00 00 00 00 00 00 00 00 51 52 55 89 e5 0f 34 cd 80 <5d> 5a 59 c3 90 90 90 90 eb 0d 90 90 90 90 90 90 90 90 90 90 90 90 RSP: 002b:ffdae08c EFLAGS: 0292 ORIG_RAX: 016d RAX: ffda RBX: 0003 RCX: RDX: 0082 RSI: 2000 RDI: 2100 RBP: 0012 R08: R09: R10: R11: R12: R13: R14: R15: Kernel Offset: disabled Rebooting in 86400 seconds..
Re: file metadata via fs API
On Wed, 2020-08-12 at 12:50 -0700, Linus Torvalds wrote: > On Wed, Aug 12, 2020 at 12:34 PM Steven Whitehouse < > swhit...@redhat.com> wrote: > > The point of this is to give us the ability to monitor mounts from > > userspace. > > We haven't had that before, I don't see why it's suddenly such a big > deal. Because there's a trend occurring in user space where there are frequent and persistent mount changes that cause high overhead. I've seen the number of problems building up over the last few months that are essentially the same problem that I wanted to resolve. And that's related to side effects of autofs using a large number of mounts. The problems are real. > > The notification side I understand. Polling /proc files is not the > answer. Yep, that's one aspect, getting the information about a mount without reading the entire mount table seems like the sensible thing to do to allow for a more efficient notification mechanism. > > But the whole "let's design this crazy subsystem for it" seems way > overkill. I don't see anybody caring that deeply. > > It really smells like "do it because we can, not because we must". > > Who the hell cares about monitoring mounts at a kHz frequencies? If > this is for MIS use, you want a nice GUI and not wasting CPU time > polling. That part of the problem still remains. The kernel sending a continuous stream of wake ups under load does also introduce a resource problem but that's probably something to handle in user space. > > I'm starting to ignore the pull requests from David Howells, because > by now they have had the same pattern for a couple of years now: > esoteric new interfaces that seem overdesigned for corner-cases that > I'm not seeing people clamoring for. > > I need (a) proof this is actualyl something real users care about and > (b) way more open discussion and implementation from multiple > parties. > > Because right now it looks like a small in-cabal of a couple of > people > who have wild ideas but I'm not seeing the wider use of it. > > Convince me otherwise. AGAIN. This is the exact same issue I had with > the notification queues that I really wanted actual use-cases for, > and > feedback from actual outside users. > > I really think this is engineering for its own sake, rather than > responding to actual user concerns. > >Linus
[PATCH] drivers/input/misc: Use kobj_to_dev() instead
Use kobj_to_dev() instead of container_of() Signed-off-by: Wang Qing --- drivers/input/misc/ims-pcu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/misc/ims-pcu.c b/drivers/input/misc/ims-pcu.c index d8dbfc0..0879d96 --- a/drivers/input/misc/ims-pcu.c +++ b/drivers/input/misc/ims-pcu.c @@ -1228,7 +1228,7 @@ static struct attribute *ims_pcu_attrs[] = { static umode_t ims_pcu_is_attr_visible(struct kobject *kobj, struct attribute *attr, int n) { - struct device *dev = container_of(kobj, struct device, kobj); + struct device *dev = kobj_to_dev(kobj); struct usb_interface *intf = to_usb_interface(dev); struct ims_pcu *pcu = usb_get_intfdata(intf); umode_t mode = attr->mode; -- 2.7.4
[PATCH] drm/hisilicon: Fix build error of no type of module_init
Add missing include to fix build error: hibmc_drm_drv.c:385:1: warning: data definition has no type or storage class [enabled by default] hibmc_drm_drv.c:385:1: error: type defaults to ‘int’ in declaration of ‘module_init’ [-Werror=implicit-int] hibmc_drm_drv.c:385:1: warning: parameter names (without types) in function of ‘module_exit’ [-Werror=implicit-int] hibmc_drm_drv.c:385:292:1: warning: parameter names (without types) in function declaration [enabled by default] Signed-off-by: Tian Tao Reported-by: kernel test robot --- drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c b/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c index 1ae360d..2b4f821 100644 --- a/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c +++ b/drivers/gpu/drm/hisilicon/hibmc/hibmc_drm_drv.c @@ -11,6 +11,7 @@ * Jianhua Li */ +#include #include #include -- 2.7.4
[PATCH v2] riscv: Setup exception vector for nommu platform
Exception vector is missing on nommu platform and that is an issue. This patch is tested in Sipeed Maix Bit Dev Board. Fixes: 79b1feba5455 ("RISC-V: Setup exception vector early") Suggested-by: Anup Patel Suggested-by: Atish Patra Signed-off-by: Qiu Wenbo --- arch/riscv/kernel/head.S | 25 + 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index d0c5c316e9bb..0a4e81b8dc79 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -77,16 +77,10 @@ relocate: csrw CSR_SATP, a0 .align 2 1: - /* Set trap vector to exception handler */ - la a0, handle_exception + /* Set trap vector to spin forever to help debug */ + la a0, .Lsecondary_park csrw CSR_TVEC, a0 - /* -* Set sup0 scratch register to 0, indicating to exception vector that -* we are presently executing in kernel. -*/ - csrw CSR_SCRATCH, zero - /* Reload the global pointer */ .option push .option norelax @@ -144,9 +138,23 @@ secondary_start_common: la a0, swapper_pg_dir call relocate #endif + call setup_trap_vector tail smp_callin #endif /* CONFIG_SMP */ +.align 2 +setup_trap_vector: + /* Set trap vector to exception handler */ + la a0, handle_exception + csrw CSR_TVEC, a0 + + /* +* Set sup0 scratch register to 0, indicating to exception vector that +* we are presently executing in kernel. +*/ + csrw CSR_SCRATCH, zero + ret + .Lsecondary_park: /* We lack SMP support or have too many harts, so park this hart */ wfi @@ -240,6 +248,7 @@ clear_bss_done: call relocate #endif /* CONFIG_MMU */ + call setup_trap_vector /* Restore C environment */ la tp, init_task sw zero, TASK_TI_CPU(tp) -- 2.28.0
[PATCH] drivers/greybus: Use kobj_to_dev() instead
Use kobj_to_dev() instead of container_of() Signed-off-by: Wang Qing --- drivers/greybus/interface.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/greybus/interface.c b/drivers/greybus/interface.c index 67dbe6f..de167f1 --- a/drivers/greybus/interface.c +++ b/drivers/greybus/interface.c @@ -620,7 +620,7 @@ static struct attribute *interface_common_attrs[] = { static umode_t interface_unipro_is_visible(struct kobject *kobj, struct attribute *attr, int n) { - struct device *dev = container_of(kobj, struct device, kobj); + struct device *dev = kobj_to_dev(kobj); struct gb_interface *intf = to_gb_interface(dev); switch (intf->type) { @@ -635,7 +635,7 @@ static umode_t interface_unipro_is_visible(struct kobject *kobj, static umode_t interface_greybus_is_visible(struct kobject *kobj, struct attribute *attr, int n) { - struct device *dev = container_of(kobj, struct device, kobj); + struct device *dev = kobj_to_dev(kobj); struct gb_interface *intf = to_gb_interface(dev); switch (intf->type) { @@ -649,7 +649,7 @@ static umode_t interface_greybus_is_visible(struct kobject *kobj, static umode_t interface_power_is_visible(struct kobject *kobj, struct attribute *attr, int n) { - struct device *dev = container_of(kobj, struct device, kobj); + struct device *dev = kobj_to_dev(kobj); struct gb_interface *intf = to_gb_interface(dev); switch (intf->type) { -- 2.7.4
Re: [PATCH] binfmt_flat: revert "binfmt_flat: don't offset the data start"
Hi Max, On 9/8/20 4:37 am, Max Filippov wrote: binfmt_flat loader uses the gap between text and data to store data segment pointers for the libraries. Even in the absence of shared libraries it stores at least one pointer to the executable's own data segment. Text and data can go back to back in the flat binary image and without offsetting data segment last few instructions in the text segment may get corrupted by the data segment pointer. Yep, your right, it does. I will push this into the m68knommu git tree next week (once the merge window is closed), and make sure it gets to Linus for rc series soon after that. Thanks Greg Fix it by reverting commit a2357223c50a ("binfmt_flat: don't offset the data start"). Cc: sta...@vger.kernel.org Fixes: a2357223c50a ("binfmt_flat: don't offset the data start") Signed-off-by: Max Filippov --- fs/binfmt_flat.c | 20 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index f2f9086ebe98..b9c658e0548e 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -576,7 +576,7 @@ static int load_flat_file(struct linux_binprm *bprm, goto err; } - len = data_len + extra; + len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); len = PAGE_ALIGN(len); realdatastart = vm_mmap(NULL, 0, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); @@ -590,7 +590,9 @@ static int load_flat_file(struct linux_binprm *bprm, vm_munmap(textpos, text_len); goto err; } - datapos = ALIGN(realdatastart, FLAT_DATA_ALIGN); + datapos = ALIGN(realdatastart + + MAX_SHARED_LIBS * sizeof(unsigned long), + FLAT_DATA_ALIGN); pr_debug("Allocated data+bss+stack (%u bytes): %lx\n", data_len + bss_len + stack_len, datapos); @@ -620,7 +622,7 @@ static int load_flat_file(struct linux_binprm *bprm, memp_size = len; } else { - len = text_len + data_len + extra; + len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(u32); len = PAGE_ALIGN(len); textpos = vm_mmap(NULL, 0, len, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); @@ -635,7 +637,9 @@ static int load_flat_file(struct linux_binprm *bprm, } realdatastart = textpos + ntohl(hdr->data_start); - datapos = ALIGN(realdatastart, FLAT_DATA_ALIGN); + datapos = ALIGN(realdatastart + + MAX_SHARED_LIBS * sizeof(u32), + FLAT_DATA_ALIGN); reloc = (__be32 __user *) (datapos + (ntohl(hdr->reloc_start) - text_len)); @@ -652,9 +656,8 @@ static int load_flat_file(struct linux_binprm *bprm, (text_len + full_data - sizeof(struct flat_hdr)), 0); - if (datapos != realdatastart) - memmove((void *)datapos, (void *)realdatastart, - full_data); + memmove((void *) datapos, (void *) realdatastart, + full_data); #else /* * This is used on MMU systems mainly for testing. @@ -710,7 +713,8 @@ static int load_flat_file(struct linux_binprm *bprm, if (IS_ERR_VALUE(result)) { ret = result; pr_err("Unable to read code+data+bss, errno %d\n", ret); - vm_munmap(textpos, text_len + data_len + extra); + vm_munmap(textpos, text_len + data_len + extra + + MAX_SHARED_LIBS * sizeof(u32)); goto err; } }
Re: KMSAN: uninit-value in ath9k_htc_rx_msg
syzbot has found a reproducer for the following issue on: HEAD commit:ce8056d1 wip: changed copy_from_user where instrumented git tree: https://github.com/google/kmsan.git master console output: https://syzkaller.appspot.com/x/log.txt?x=12985a1690 kernel config: https://syzkaller.appspot.com/x/.config?x=3afe005fb99591f dashboard link: https://syzkaller.appspot.com/bug?extid=2ca247c2d60c7023de7f compiler: clang version 10.0.0 (https://github.com/llvm/llvm-project/ c2443155a0fb245c8f17f2c1c72b6ea391e86e81) syz repro: https://syzkaller.appspot.com/x/repro.syz?x=1468efe290 C reproducer: https://syzkaller.appspot.com/x/repro.c?x=10bb9fba90 IMPORTANT: if you fix the issue, please add the following tag to the commit: Reported-by: syzbot+2ca247c2d60c7023d...@syzkaller.appspotmail.com = BUG: KMSAN: uninit-value in ath9k_htc_rx_msg+0x28f/0x1f50 drivers/net/wireless/ath/ath9k/htc_hst.c:410 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.8.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x21c/0x280 lib/dump_stack.c:118 kmsan_report+0xf7/0x1e0 mm/kmsan/kmsan_report.c:121 __msan_warning+0x58/0xa0 mm/kmsan/kmsan_instr.c:215 ath9k_htc_rx_msg+0x28f/0x1f50 drivers/net/wireless/ath/ath9k/htc_hst.c:410 ath9k_hif_usb_rx_stream drivers/net/wireless/ath/ath9k/hif_usb.c:638 [inline] ath9k_hif_usb_rx_cb+0x1841/0x1d10 drivers/net/wireless/ath/ath9k/hif_usb.c:671 __usb_hcd_giveback_urb+0x687/0x870 drivers/usb/core/hcd.c:1650 usb_hcd_giveback_urb+0x1cb/0x730 drivers/usb/core/hcd.c:1716 dummy_timer+0xd98/0x71c0 drivers/usb/gadget/udc/dummy_hcd.c:1967 call_timer_fn+0x226/0x550 kernel/time/timer.c:1404 expire_timers+0x4fc/0x780 kernel/time/timer.c:1449 __run_timers+0xaf4/0xd30 kernel/time/timer.c:1773 run_timer_softirq+0x2d/0x50 kernel/time/timer.c:1786 __do_softirq+0x2ea/0x7f5 kernel/softirq.c:293 asm_call_on_stack+0xf/0x20 arch/x86/entry/entry_64.S:711 __run_on_irqstack arch/x86/include/asm/irq_stack.h:23 [inline] run_on_irqstack_cond arch/x86/include/asm/irq_stack.h:50 [inline] do_softirq_own_stack+0x7c/0xa0 arch/x86/kernel/irq_64.c:77 invoke_softirq kernel/softirq.c:390 [inline] __irq_exit_rcu+0x226/0x270 kernel/softirq.c:420 irq_exit_rcu+0xe/0x10 kernel/softirq.c:432 sysvec_apic_timer_interrupt+0x107/0x130 arch/x86/kernel/apic/apic.c:1091 asm_sysvec_apic_timer_interrupt+0x12/0x20 arch/x86/include/asm/idtentry.h:593 RIP: 0010:native_irq_disable arch/x86/include/asm/irqflags.h:49 [inline] RIP: 0010:arch_local_irq_disable arch/x86/include/asm/irqflags.h:89 [inline] RIP: 0010:acpi_safe_halt drivers/acpi/processor_idle.c:112 [inline] RIP: 0010:acpi_idle_do_entry drivers/acpi/processor_idle.c:525 [inline] RIP: 0010:acpi_idle_enter+0x817/0xeb0 drivers/acpi/processor_idle.c:651 Code: 85 db 74 0a f7 d3 44 21 fb 48 85 db 74 32 4d 85 ff 75 3a 48 8b 5d a0 e9 0c 00 00 00 e8 12 b2 78 fb 0f 00 2d 25 15 1c 0b fb f4 eb 5a 84 c0 8b 7d 90 0f 45 7d 94 e8 d8 9a f4 fb e9 74 fc ff ff RSP: 0018:88812df93bc8 EFLAGS: 0246 RAX: RBX: 8881dfefce70 RCX: 00012db88000 RDX: 88812df88000 RSI: RDI: RBP: 88812df93ca0 R08: 86420acc R09: 88812fffa000 R10: 0002 R11: 88812df88000 R12: 88812df889d8 R13: 8881dfefcc64 R14: R15: cpuidle_enter_state+0x860/0x12b0 drivers/cpuidle/cpuidle.c:235 cpuidle_enter+0xe3/0x170 drivers/cpuidle/cpuidle.c:346 call_cpuidle kernel/sched/idle.c:126 [inline] cpuidle_idle_call kernel/sched/idle.c:214 [inline] do_idle+0x668/0x810 kernel/sched/idle.c:276 cpu_startup_entry+0x45/0x50 kernel/sched/idle.c:372 start_secondary+0x1bf/0x240 arch/x86/kernel/smpboot.c:268 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:243 Uninit was created at: kmsan_save_stack_with_flags+0x3c/0x90 mm/kmsan/kmsan.c:144 kmsan_internal_alloc_meta_for_pages mm/kmsan/kmsan_shadow.c:269 [inline] kmsan_alloc_page+0xc5/0x1a0 mm/kmsan/kmsan_shadow.c:293 __alloc_pages_nodemask+0xdf0/0x1030 mm/page_alloc.c:4889 __alloc_pages include/linux/gfp.h:509 [inline] __alloc_pages_node include/linux/gfp.h:522 [inline] alloc_pages_node include/linux/gfp.h:536 [inline] __page_frag_cache_refill mm/page_alloc.c:4964 [inline] page_frag_alloc+0x35b/0x880 mm/page_alloc.c:4994 __netdev_alloc_skb+0x2a8/0xc90 net/core/skbuff.c:451 __dev_alloc_skb include/linux/skbuff.h:2813 [inline] ath9k_hif_usb_rx_stream drivers/net/wireless/ath/ath9k/hif_usb.c:620 [inline] ath9k_hif_usb_rx_cb+0xe5a/0x1d10 drivers/net/wireless/ath/ath9k/hif_usb.c:671 __usb_hcd_giveback_urb+0x687/0x870 drivers/usb/core/hcd.c:1650 usb_hcd_giveback_urb+0x1cb/0x730 drivers/usb/core/hcd.c:1716 dummy_timer+0xd98/0x71c0 drivers/usb/gadget/udc/dummy_hcd.c:1967 call_timer_fn+0x226/0x550 kernel/time/timer.c:1404
[PATCH v6] Add MediaTek MT6779 devapc driver
These patch series introduce a MediaTek MT6779 devapc driver. MediaTek bus fabric provides TrustZone security support and data protection to prevent slaves from being accessed by unexpected masters. The security violation is logged and sent to the processor for further analysis or countermeasures. Any occurrence of security violation would raise an interrupt, and it will be handled by mtk-devapc driver. The violation information is printed in order to find the murderer. changes since v5: - remove redundant write reg operation. - use static variable of vio_dbgs instead. - add stop_devapc() if driver is removed. changes since v4: - refactor data structure. - merge two simple functions into one. - refactor register setting to prevent too many function call overhead. changes since v3: - revise violation handling flow to make it more easily to understand hardware behavior. - add more comments to understand how hardware works. changes since v2: - pass platform info through DT data. - remove unnecessary function. - remove slave_type because it always equals to 1 in current support SoC. - use vio_idx_num instread of list all devices' index. - add more comments to describe hardware behavior. changes since v1: - move SoC specific part to DT data. - remove unnecessary boundary check. - remove unnecessary data type declaration. - use read_poll_timeout() instread of for loop polling. - revise coding style elegantly. *** BLURB HERE *** Neal Liu (2): dt-bindings: devapc: add bindings for mtk-devapc soc: mediatek: add mt6779 devapc driver .../bindings/soc/mediatek/devapc.yaml | 58 drivers/soc/mediatek/Kconfig | 9 + drivers/soc/mediatek/Makefile | 1 + drivers/soc/mediatek/mtk-devapc.c | 320 ++ 4 files changed, 388 insertions(+) create mode 100644 Documentation/devicetree/bindings/soc/mediatek/devapc.yaml create mode 100644 drivers/soc/mediatek/mtk-devapc.c -- 2.18.0
[PATCH v6 2/2] soc: mediatek: add mt6779 devapc driver
MediaTek bus fabric provides TrustZone security support and data protection to prevent slaves from being accessed by unexpected masters. The security violation is logged and sent to the processor for further analysis or countermeasures. Any occurrence of security violation would raise an interrupt, and it will be handled by mtk-devapc driver. The violation information is printed in order to find the murderer. Signed-off-by: Neal Liu --- drivers/soc/mediatek/Kconfig |9 ++ drivers/soc/mediatek/Makefile |1 + drivers/soc/mediatek/mtk-devapc.c | 320 + 3 files changed, 330 insertions(+) create mode 100644 drivers/soc/mediatek/mtk-devapc.c diff --git a/drivers/soc/mediatek/Kconfig b/drivers/soc/mediatek/Kconfig index 59a56cd..1177c98 100644 --- a/drivers/soc/mediatek/Kconfig +++ b/drivers/soc/mediatek/Kconfig @@ -17,6 +17,15 @@ config MTK_CMDQ time limitation, such as updating display configuration during the vblank. +config MTK_DEVAPC + tristate "Mediatek Device APC Support" + help + Say yes here to enable support for Mediatek Device APC driver. + This driver is mainly used to handle the violation which catches + unexpected transaction. + The violation information is logged for further analysis or + countermeasures. + config MTK_INFRACFG bool "MediaTek INFRACFG Support" select REGMAP diff --git a/drivers/soc/mediatek/Makefile b/drivers/soc/mediatek/Makefile index 01f9f87..abfd4ba 100644 --- a/drivers/soc/mediatek/Makefile +++ b/drivers/soc/mediatek/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_MTK_CMDQ) += mtk-cmdq-helper.o +obj-$(CONFIG_MTK_DEVAPC) += mtk-devapc.o obj-$(CONFIG_MTK_INFRACFG) += mtk-infracfg.o obj-$(CONFIG_MTK_PMIC_WRAP) += mtk-pmic-wrap.o obj-$(CONFIG_MTK_SCPSYS) += mtk-scpsys.o diff --git a/drivers/soc/mediatek/mtk-devapc.c b/drivers/soc/mediatek/mtk-devapc.c new file mode 100644 index 000..5189b3f --- /dev/null +++ b/drivers/soc/mediatek/mtk-devapc.c @@ -0,0 +1,320 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 MediaTek Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define VIO_MOD_TO_REG_IND(m) ((m) / 32) +#define VIO_MOD_TO_REG_OFF(m) ((m) % 32) + +struct mtk_devapc_vio_dbgs { + union { + u32 vio_dbg0; + struct { + u32 mstid:16; + u32 dmnid:6; + u32 vio_w:1; + u32 vio_r:1; + u32 addr_h:4; + u32 resv:4; + } dbg0_bits; + }; + + u32 vio_dbg1; +}; + +struct mtk_devapc_data { + u32 vio_idx_num; + u32 vio_mask_offset; + u32 vio_sta_offset; + u32 vio_dbg0_offset; + u32 vio_dbg1_offset; + u32 apc_con_offset; + u32 vio_shift_sta_offset; + u32 vio_shift_sel_offset; + u32 vio_shift_con_offset; +}; + +struct mtk_devapc_context { + struct device *dev; + void __iomem *infra_base; + struct clk *infra_clk; + const struct mtk_devapc_data *data; +}; + +static void clear_vio_status(struct mtk_devapc_context *ctx) +{ + void __iomem *reg; + int i; + + reg = ctx->infra_base + ctx->data->vio_sta_offset; + + for (i = 0; i < VIO_MOD_TO_REG_IND(ctx->data->vio_idx_num - 1); i++) + writel(GENMASK(31, 0), reg + 4 * i); + + writel(GENMASK(VIO_MOD_TO_REG_OFF(ctx->data->vio_idx_num - 1), 0), + reg + 4 * i); +} + +static void mask_module_irq(struct mtk_devapc_context *ctx, bool mask) +{ + void __iomem *reg; + u32 val; + int i; + + reg = ctx->infra_base + ctx->data->vio_mask_offset; + + if (mask) + val = GENMASK(31, 0); + else + val = 0; + + for (i = 0; i < VIO_MOD_TO_REG_IND(ctx->data->vio_idx_num - 1); i++) + writel(val, reg + 4 * i); + + val = readl(reg + 4 * i); + if (mask) + val |= GENMASK(VIO_MOD_TO_REG_OFF(ctx->data->vio_idx_num - 1), + 0); + else + val &= ~GENMASK(VIO_MOD_TO_REG_OFF(ctx->data->vio_idx_num - 1), + 0); + + writel(val, reg + 4 * i); +} + +#define PHY_DEVAPC_TIMEOUT 0x1 + +/* + * devapc_sync_vio_dbg - do "shift" mechansim" to get full violation information. + * shift mechanism is depends on devapc hardware design. + * Mediatek devapc set multiple slaves as a group. + * When violation is triggered, violation info is kept + * inside devapc hardware. + * Driver should do shift mechansim to sync full violation + * info to VIO_DBGs registers. + * + */ +static int
[PATCH v6 1/2] dt-bindings: devapc: add bindings for mtk-devapc
Add bindings for mtk-devapc. Signed-off-by: Neal Liu --- .../devicetree/bindings/soc/mediatek/devapc.yaml | 58 1 file changed, 58 insertions(+) create mode 100644 Documentation/devicetree/bindings/soc/mediatek/devapc.yaml diff --git a/Documentation/devicetree/bindings/soc/mediatek/devapc.yaml b/Documentation/devicetree/bindings/soc/mediatek/devapc.yaml new file mode 100644 index 000..6c763f8 --- /dev/null +++ b/Documentation/devicetree/bindings/soc/mediatek/devapc.yaml @@ -0,0 +1,58 @@ +# SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause) +# # Copyright 2020 MediaTek Inc. +%YAML 1.2 +--- +$id: "http://devicetree.org/schemas/soc/mediatek/devapc.yaml#; +$schema: "http://devicetree.org/meta-schemas/core.yaml#; + +title: MediaTek Device Access Permission Control driver + +description: | + MediaTek bus fabric provides TrustZone security support and data + protection to prevent slaves from being accessed by unexpected masters. + The security violation is logged and sent to the processor for further + analysis and countermeasures. + +maintainers: + - Neal Liu + +properties: + compatible: +enum: + - mediatek,mt6779-devapc + + reg: +description: The base address of devapc register bank +maxItems: 1 + + interrupts: +description: A single interrupt specifier +maxItems: 1 + + clocks: +description: Contains module clock source and clock names +maxItems: 1 + + clock-names: +description: Names of the clocks list in clocks property +maxItems: 1 + +required: + - compatible + - reg + - interrupts + - clocks + - clock-names + +examples: + - | +#include +#include + +devapc: devapc@10207000 { + compatible = "mediatek,mt6779-devapc"; + reg = <0x10207000 0x1000>; + interrupts = ; + clocks = <_ao CLK_INFRA_DEVICE_APC>; + clock-names = "devapc-infra-clock"; +}; -- 1.7.9.5
Re: [PATCH v2] lib/cmdline: prevent unintented access to address
On 8/12/20 8:07 PM, Seungil Kang wrote: > When args = "\"\0", "i" will be 0 and args[i-1] is used. (*lib/cmdline.c +238) > Because of "i" is an unsigned int type, the function will access at > args[0x]. > It can make a crash. > > Signed-off-by: Seungil Kang > --- > > Thanks for your review, my comments below > >> Can you be less ambiguous with the args value? (Perhaps provide a hexdump of >> it > for better understanding) > > This kind of args as hexdump below can cause crash. > > : 736f 6d65 7468 696e 6731 3d73 6f6d 655f something1=some_ > 0010: 7661 6c75 6573 2022 values " > > The args end with "\"\0". > >> Please, use proper punctuation, I'm lost where is the sentence and what are >> the > logical parts of them. > > I'm sorry to confuse you. I fix the commit msg > >> Can you point out to the code that calls this and leads to a crash? > > *lib/cmdlinc + 201 ~, next_arg function with args = "\"\0" > > char *next_arg(char *args, char **param, char **val) <-- args = "\"\0". > { > unsigned int i, equals = 0; > int in_quote = 0, quoted = 0; > char *next; > > if (*args == '"') { <-- *args == '"' is a true condition, > args++; <-- args++, so *args = '\0'. > in_quote = 1; > quoted = 1; <-- quoted also set 1. > } > > for (i = 0; args[i]; i++) { <-- when reached this point, i = 0, and > arg[0] == '\0', > so for loop is skipped. > if (isspace(args[i]) && !in_quote) > break; > if (equals == 0) { > if (args[i] == '=') > equals = i; > } > if (args[i] == '"') > in_quote = !in_quote; > } > > *param = args; > if (!equals) > *val = NULL; > else { > args[equals] = '\0'; > *val = args + equals + 1; > > /* Don't include quotes in value. */ > if (**val == '"') { > (*val)++; > if (args[i-1] == '"') > args[i-1] = '\0'; > } > } > if (quoted && args[i-1] == '"') <-- When reached this point, quoted > is still set 1, > "i" is still 0, and "i" is > unsigned int type, > so address will be {address of > args} + 0x. > It can make a crash. > args[i-1] = '\0'; > > if (args[i]) { > args[i] = '\0'; > next = args + i + 1; > } else > next = args + i; > > /* Chew up trailing spaces. */ > return skip_spaces(next); > } > > >> Can you provide a KUnit test module which can check the case? > > If necessary, I will make it and share it. Hi, Have you tested this patch? If so, how? > > lib/cmdline.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/lib/cmdline.c b/lib/cmdline.c > index fbb9981a04a4..2fd29d7723b2 100644 > --- a/lib/cmdline.c > +++ b/lib/cmdline.c > @@ -200,7 +200,7 @@ bool parse_option_str(const char *str, const char *option) > */ > char *next_arg(char *args, char **param, char **val) > { > - unsigned int i, equals = 0; > + int i, equals = 0; > int in_quote = 0, quoted = 0; > char *next; > > thanks. -- ~Randy
Re: [PATCH tip/core/rcu 06/12] rcu: Do full report for .need_qs for strict GPs
On Thu, Aug 13, 2020 at 02:50:27AM +0200, Jann Horn wrote: > On Thu, Aug 13, 2020 at 12:57 AM wrote: > > The rcu_preempt_deferred_qs_irqrestore() function is invoked at > > the end of an RCU read-side critical section (for example, directly > > from rcu_read_unlock()) and, if .need_qs is set, invokes rcu_qs() to > > report the new quiescent state. This works, except that rcu_qs() only > > updates per-CPU state, leaving reporting of the actual quiescent state > > to a later call to rcu_report_qs_rdp(), for example from within a later > > RCU_SOFTIRQ instance. Although this approach is exactly what you want if > > you are more concerned about efficiency than about short grace periods, > > in CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, short grace periods are > > the name of the game. > > > > This commit therefore makes rcu_preempt_deferred_qs_irqrestore() directly > > invoke rcu_report_qs_rdp() in CONFIG_RCU_STRICT_GRACE_PERIOD=y, thus > > shortening grace periods. > > Ooh, I'm very happy about this series! :) Glad you like it! And I hope that it helps! One usability concern is whether rcutree.rcu_unlock_delay needs to be applied only some small fraction of the time in order to allow the delay to be large (a couple hundred microseconds?) while still avoiding doing too much more damage to timing and performance than absolutely necessary. > > diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h > > index 7ed55c5..1761ff4 100644 > > --- a/kernel/rcu/tree_plugin.h > > +++ b/kernel/rcu/tree_plugin.h > > @@ -459,8 +459,12 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct > > *t, unsigned long flags) > > return; > > } > > t->rcu_read_unlock_special.s = 0; > > - if (special.b.need_qs) > > - rcu_qs(); > > + if (special.b.need_qs) { > > + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) > > + rcu_report_qs_rdp(rdp->cpu, rdp); > > Not an issue with this patch specifically, but: I'm looking at > rcu_report_qs_rdp(), and some of the parts that I do vaguely > understand look a bit off to me. > > rcu_report_qs_rdp() is given a CPU number as first argument, but never > actually uses that argument. (And the only existing caller also passes > in rdp->cpu, just like this patch.) I guess that argument can go away? > > The comment above rcu_report_qs_rdp() claims that it "must be called > from the specified CPU", but there is a branch in there that > specifically checks whether that is true ("if (rdp->cpu == > smp_processor_id())"). As far as I can tell, rcu_report_qs_rdp() is, > as the comment says, indeed never invoked with another CPU's rcu_data > (only invoked via rcu_core() -> rcu_check_quiescent_state() -> > rcu_report_qs_rdp(), and rcu_core() looks up "rdp = > raw_cpu_ptr(_data)"). So perhaps if there is a check for whether > rdp belongs to the current CPU, that check should have a WARN_ON(), or > something like that, since it indicates that the API contract > specified in the comment was violated? It looks like you are correct, and that the first parameter can be dropped, the "if" you mention replaced by a WARN_ON_ONCE(), and the body of that "if" be unconditional. I have it on my list, and if it still looks correct in the cold hard light of dawn, I will apply it with your Reported-by. And thank you very much for looking it over! Thanx, Paul
[PATCH] drivers/dax: Use kobj_to_dev() instead
Use kobj_to_dev() instead of container_of() Signed-off-by: Wang Qing --- drivers/dax/bus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dax/bus.c b/drivers/dax/bus.c index df238c8..24625d2 --- a/drivers/dax/bus.c +++ b/drivers/dax/bus.c @@ -331,7 +331,7 @@ static DEVICE_ATTR_RO(numa_node); static umode_t dev_dax_visible(struct kobject *kobj, struct attribute *a, int n) { - struct device *dev = container_of(kobj, struct device, kobj); + struct device *dev = kobj_to_dev(kobj); struct dev_dax *dev_dax = to_dev_dax(dev); if (a == _attr_target_node.attr && dev_dax_target_node(dev_dax) < 0) -- 2.7.4
drivers/md/dm-ebs-target.c:264:4: warning: Variable 'r' is reassigned a value before the old one has been used.
tree: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master head: dc06fe51d26efc100ac74121607c01a454867c91 commit: d3c7b35c20d60650bac8b55c17b194adda03a979 dm: add emulated block size target date: 3 months ago compiler: sparc-linux-gcc (GCC) 9.3.0 If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot cppcheck warnings: (new ones prefixed by >>) >> drivers/md/dm-ebs-target.c:264:4: warning: Variable 'r' is reassigned a >> value before the old one has been used. [redundantAssignment] r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), >dev); ^ drivers/md/dm-ebs-target.c:237:4: note: Variable 'r' is reassigned a value before the old one has been used. r = -EINVAL; ^ drivers/md/dm-ebs-target.c:264:4: note: Variable 'r' is reassigned a value before the old one has been used. r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), >dev); ^ vim +/r +264 drivers/md/dm-ebs-target.c 208 209 /* 210 * Construct an emulated block size mapping: [] 211 * 212 * : path of the underlying device 213 * : offset in 512 bytes sectors into 214 * : emulated block size in units of 512 bytes exposed to the upper layer 215 * []: underlying block size in units of 512 bytes imposed on the lower layer; 216 * optional, if not supplied, retrieve logical block size from underlying device 217 */ 218 static int ebs_ctr(struct dm_target *ti, unsigned int argc, char **argv) 219 { 220 int r; 221 unsigned short tmp1; 222 unsigned long long tmp; 223 char dummy; 224 struct ebs_c *ec; 225 226 if (argc < 3 || argc > 4) { 227 ti->error = "Invalid argument count"; 228 return -EINVAL; 229 } 230 231 ec = ti->private = kzalloc(sizeof(*ec), GFP_KERNEL); 232 if (!ec) { 233 ti->error = "Cannot allocate ebs context"; 234 return -ENOMEM; 235 } 236 237 r = -EINVAL; 238 if (sscanf(argv[1], "%llu%c", , ) != 1 || 239 tmp != (sector_t)tmp || 240 (sector_t)tmp >= ti->len) { 241 ti->error = "Invalid device offset sector"; 242 goto bad; 243 } 244 ec->start = tmp; 245 246 if (sscanf(argv[2], "%hu%c", , ) != 1 || 247 !__ebs_check_bs(tmp1) || 248 to_bytes(tmp1) > PAGE_SIZE) { 249 ti->error = "Invalid emulated block size"; 250 goto bad; 251 } 252 ec->e_bs = tmp1; 253 254 if (argc > 3) { 255 if (sscanf(argv[3], "%hu%c", , ) != 1 || !__ebs_check_bs(tmp1)) { 256 ti->error = "Invalid underlying block size"; 257 goto bad; 258 } 259 ec->u_bs = tmp1; 260 ec->u_bs_set = true; 261 } else 262 ec->u_bs_set = false; 263 > 264 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), > >dev); 265 if (r) { 266 ti->error = "Device lookup failed"; 267 ec->dev = NULL; 268 goto bad; 269 } 270 271 r = -EINVAL; 272 if (!ec->u_bs_set) { 273 ec->u_bs = to_sector(bdev_logical_block_size(ec->dev->bdev)); 274 if (!__ebs_check_bs(ec->u_bs)) { 275 ti->error = "Invalid retrieved underlying block size"; 276 goto bad; 277 } 278 } 279 280 if (!ec->u_bs_set && ec->e_bs == ec->u_bs) 281 DMINFO("Emulation superfluous: emulated equal to underlying block size"); 282 283 if (__block_mod(ec->start, ec->u_bs)) { 284 ti->error = "Device offset must be multiple of underlying block size"; 285 goto bad; 286 } 287 288 ec->bufio = dm_bufio_client_create(ec->dev->bdev, to_bytes(ec->u_bs), 1, 0, NULL, NULL); 289 if (IS_ERR(ec->bufio)) { 290 ti->error = "Cannot create dm bufio client"; 291 r = PTR_ERR(ec->bufio); 292 ec->bufio = NULL; 293 goto bad; 294 } 295 296 ec->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 297 if (!ec->wq) { 298 ti->error = "Cannot create dm-" DM_MSG_PREFIX " workqueue"; 299 r = -ENOMEM; 300 goto bad; 301 } 302 303 ec->block_shift = __ffs(ec->u_bs);
drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c:5638 bnx2x_link_settings_status() warn: signedness bug returning '(-22)'
tree: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master head: 7c2a69f610e64c8dec6a06a66e721f4ce1dd783a commit: 2c855d73f2f6107f5b8ebc45f8b934bf7f4419e0 bnx2x: Remove read_status_t function casts date: 9 months ago config: ia64-randconfig-m031-20200811 (attached as .config) compiler: ia64-linux-gcc (GCC) 9.3.0 If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot New smatch warnings: drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c:5638 bnx2x_link_settings_status() warn: signedness bug returning '(-22)' Old smatch warnings: drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c:7184 bnx2x_8073_8727_external_rom_boot() error: uninitialized symbol 'fw_ver1'. vim +5638 drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c 239d686d494f10e drivers/net/bnx2x_link.c Eilon Greenstein 2009-08-12 5613 2c855d73f2f6107 drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c Kees Cook 2019-11-14 5614 static u8 bnx2x_link_settings_status(struct bnx2x_phy *phy, 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5615 struct link_params *params, 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5616 struct link_vars *vars) 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5617 { 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5618 struct bnx2x *bp = params->bp; 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5619 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5620 u16 gp_status, duplex = DUPLEX_HALF, link_up = 0, speed_mask; 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5621 int rc = 0; 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5622 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5623 /* Read gp_status */ 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5624 CL22_RD_OVER_CL45(bp, phy, 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5625MDIO_REG_BANK_GP_STATUS, 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5626MDIO_GP_STATUS_TOP_AN_STATUS1, 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5627_status); 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5628 if (gp_status & MDIO_GP_STATUS_TOP_AN_STATUS1_DUPLEX_STATUS) 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5629 duplex = DUPLEX_FULL; 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5630 if (gp_status & MDIO_GP_STATUS_TOP_AN_STATUS1_LINK_STATUS) 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5631 link_up = 1; 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5632 speed_mask = gp_status & GP_STATUS_SPEED_MASK; 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5633 DP(NETIF_MSG_LINK, "gp_status 0x%x, is_link_up %d, speed_mask 0x%x\n", 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5634 gp_status, link_up, speed_mask); 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5635 rc = bnx2x_get_link_speed_duplex(phy, params, vars, link_up, speed_mask, 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5636 duplex); 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5637 if (rc == -EINVAL) 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 @5638 return rc; 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5639 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5640 if (gp_status & MDIO_GP_STATUS_TOP_AN_STATUS1_LINK_STATUS) { 3c9ada227c56c6f drivers/net/bnx2x/bnx2x_link.c Yaniv Rosner 2011-06-14 5641 if (SINGLE_MEDIA_DIRECT(params)) { 430d172a635c3dd drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c Yaniv Rosner 2012-09-11 5642
Re: [PATCH v8 2/4] scsi: ufs: Introduce HPB feature
On 2020-08-12 20:00, Daejun Park wrote: > On 2020-08-06 02:11, Daejun Park wrote: >>> +static int ufshpb_create_sysfs(struct ufs_hba *hba, struct ufshpb_lu *hpb) >>> +{ >>> +int ret; >>> + >>> +ufshpb_stat_init(hpb); >>> + >>> +kobject_init(>kobj, _ktype); >>> +mutex_init(>sysfs_lock); >>> + >>> +ret = kobject_add(>kobj, kobject_get(>dev->kobj), >>> + "ufshpb_lu%d", hpb->lun); >>> + >>> +if (ret) >>> +return ret; >>> + >>> +ret = sysfs_create_group(>kobj, _sysfs_group); >>> + >>> +if (ret) { >>> +dev_err(hba->dev, "ufshpb_lu%d create file error\n", hpb->lun); >>> +return ret; >>> +} >>> + >>> +dev_info(hba->dev, "ufshpb_lu%d sysfs adds uevent", hpb->lun); >>> +kobject_uevent(>kobj, KOBJ_ADD); >>> + >>> +return 0; >>> +} >> >> Please attach these sysfs attributes to /sys/class/scsi_device/*/device >> instead of creating a new kobject. Consider using the following >> scsi_host_template member to define LUN sysfs attributes: > > I am not rejecting your comment. But I added kobject for distinguishing > between other attributes and attributes related to HPB feature. > If you think it's pointless, I'll fix it. Hi Daejun, I see two reasons to add these sysfs attributes under /sys/class/scsi_device/*/device: - This makes the behavior of the UFS driver similar to that of other Linux SCSI LLD drivers. - This makes it easier for people who want to write udev rules that read from these attributes. Since ufshpb_lu%d is attached to the UFS controller it is not clear to me which attributes will appear first in sysfs - the SCSI device attributes or the ufshpb_lu%d attributes. If there are only SCSI device attributes there is no such ambiguity and hence authors of udev rules won't have to worry about this race condition. >>> +void ufshpb_remove(struct ufs_hba *hba) >>> +{ >>> +struct ufshpb_lu *hpb, *n_hpb; >>> +struct ufsf_feature_info *ufsf; >>> +struct scsi_device *sdev; >>> + >>> +ufsf = >ufsf; >>> + >>> +list_for_each_entry_safe(hpb, n_hpb, _hpb_lu, list_hpb_lu) { >>> +ufshpb_set_state(hpb, HPB_FAILED); >>> + >>> +sdev = hpb->sdev_ufs_lu; >>> +sdev->hostdata = NULL; >>> + >>> +ufshpb_destroy_region_tbl(hpb); >>> + >>> +list_del_init(>list_hpb_lu); >>> +ufshpb_remove_sysfs(hpb); >>> + >>> +kfree(hpb); >>> +} >>> + >>> +dev_info(hba->dev, "ufshpb: remove success\n"); >>> +} >> >> Should the code in the body of the above loop perhaps be called from inside >> ufshcd_slave_destroy()? > > Moving other stuffs in the loop is good idea, but removing attributes is > problem. > To avoid adding new kobject, I will try to use sysfs_merge_group() > for adding attributes. To delete merged attributes, sysfs_unmerge_group() > should be called. But sysfs_remove_groups() is called before calling > ufshcd_slave_destroy(). Hmm ... I don't see why the sdev_groups host template attribute can't be used? Please don't use sysfs_merge_group() and sysfs_unmerge_group() because that would create a race condition against udev rules if these functions are called after the device core has emitted a KOBJ_ADD event. Thanks, Bart.
Re: [PATCH v8 3/4] scsi: ufs: L2P map management for HPB read
On 2020-08-06 02:15, Daejun Park wrote: > > +req->end_io_data = (void *)map_req; > > Please leave the (void *) cast out since explicit casts from a non-void > to a void pointer are not necessary in C. OK, I will fix it. > > +static inline struct > > +ufshpb_rsp_field *ufshpb_get_hpb_rsp(struct ufshcd_lrb *lrbp) > > +{ > > +return (struct ufshpb_rsp_field > > *)>ucd_rsp_ptr->sr.sense_data_len; > > +} > > Please introduce a union in struct utp_cmd_rsp instead of using casts > to reinterpret a part of a data structure. OK. I will introduce a union in struct utp_cmd_rsp and use it. > > +/* routine : isr (ufs) */ > > The above comment looks very cryptic. Should it perhaps be expanded? > > > +struct ufshpb_active_field { > > +__be16 active_rgn; > > +__be16 active_srgn; > > +} __packed; > > Since "__packed" is not necessary for the above data structure, please > remove it. Note: a typical approach in the Linux kernel to verify that > the compiler has not inserted any padding bytes is to add a BUILD_BUG_ON() > statement in an initialization function that verifies the size of ABI data > structures. See also the output of the following command: > > git grep -nH 'BUILD_BUG_ON.sizeof.*!=' OK, I didn't know about it. Thanks. > > +struct ufshpb_rsp_field { > > +__be16 sense_data_len; > > +u8 desc_type; > > +u8 additional_len; > > +u8 hpb_type; > > +u8 reserved; > > +u8 active_rgn_cnt; > > +u8 inactive_rgn_cnt; > > +struct ufshpb_active_field hpb_active_field[2]; > > +__be16 hpb_inactive_field[2]; > > +} __packed; > > I think the above __packed statement should also be left out. OK, I will remove it. Thanks, Daejun
Re: [PATCH v8 4/4] scsi: ufs: Prepare HPB read for cached sub-region
On 2020-08-06 02:18, Daejun Park wrote: > > +static inline u32 ufshpb_get_lpn(struct scsi_cmnd *cmnd) > > +{ > > +return blk_rq_pos(cmnd->request) >> > > +(ilog2(cmnd->device->sector_size) - 9); > > +} > > Please use sectors_to_logical() from drivers/scsi/sd.h instead of open-coding > that function. OK, I will. > > +static inline unsigned int ufshpb_get_len(struct scsi_cmnd *cmnd) > > +{ > > +return blk_rq_sectors(cmnd->request) >> > > +(ilog2(cmnd->device->sector_size) - 9); > > +} > > Same comment here. OK > > +/* routine : READ10 -> HPB_READ */ > > Please expand this comment. OK Thanks, Daejun
[PATCH] dma-debug: fix debug_dma_assert_idle(), use rcu_read_lock()
Since commit 2a9127fcf229 ("mm: rewrite wait_on_page_bit_common() logic") improved unlock_page(), it has become more noticeable how cow_user_page() in a kernel with CONFIG_DMA_API_DEBUG=y can create and suffer from heavy contention on DMA debug's radix_lock in debug_dma_assert_idle(). It is only doing a lookup: use rcu_read_lock() and rcu_read_unlock() instead; though that does require the static ents[] to be moved onstack... ...but, hold on, isn't that radix_tree_gang_lookup() and loop doing quite the wrong thing: searching CACHELINES_PER_PAGE entries for an exact match with the first cacheline of the page in question? radix_tree_gang_lookup() is the right tool for the job, but we need nothing more than to check the first entry it can find, reporting if that falls anywhere within the page. (Is RCU safe here? As safe as using the spinlock was. The entries are never freed, so don't need to be freed by RCU. They may be reused, and there is a faint chance of a race, with an offending entry reused while printing its error info; but the spinlock did not prevent that either, and I agree that it's not worth worrying about.) Fixes: 3b7a6418c749 ("dma debug: account for cachelines and read-only mappings in overlap tracking") Signed-off-by: Hugh Dickins --- kernel/dma/debug.c | 27 +-- 1 file changed, 9 insertions(+), 18 deletions(-) --- v5.9-rc/kernel/dma/debug.c 2020-08-05 18:17:57.544203766 -0700 +++ linux/kernel/dma/debug.c2020-08-12 19:53:33.159070245 -0700 @@ -565,11 +565,8 @@ static void active_cacheline_remove(stru */ void debug_dma_assert_idle(struct page *page) { - static struct dma_debug_entry *ents[CACHELINES_PER_PAGE]; - struct dma_debug_entry *entry = NULL; - void **results = (void **) - unsigned int nents, i; - unsigned long flags; + struct dma_debug_entry *entry; + unsigned long pfn; phys_addr_t cln; if (dma_debug_disabled()) @@ -578,20 +575,14 @@ void debug_dma_assert_idle(struct page * if (!page) return; - cln = (phys_addr_t) page_to_pfn(page) << CACHELINE_PER_PAGE_SHIFT; - spin_lock_irqsave(_lock, flags); - nents = radix_tree_gang_lookup(_active_cacheline, results, cln, - CACHELINES_PER_PAGE); - for (i = 0; i < nents; i++) { - phys_addr_t ent_cln = to_cacheline_number(ents[i]); + pfn = page_to_pfn(page); + cln = (phys_addr_t) pfn << CACHELINE_PER_PAGE_SHIFT; - if (ent_cln == cln) { - entry = ents[i]; - break; - } else if (ent_cln >= cln + CACHELINES_PER_PAGE) - break; - } - spin_unlock_irqrestore(_lock, flags); + rcu_read_lock(); + if (!radix_tree_gang_lookup(_active_cacheline, (void **) , + cln, 1) || entry->pfn != pfn) + entry = NULL; + rcu_read_unlock(); if (!entry) return;
Re: [PATCH] perf parse-events: Set exclude_guest for user-space counting
Hi Like, On 8/12/2020 9:02 PM, Like Xu wrote: On 2020/8/12 20:15, Arnaldo Carvalho de Melo wrote: Em Wed, Aug 12, 2020 at 02:59:53PM +0800, Jin Yao escreveu: Currently if we run 'perf record -e cycles:u', exclude_guest is 0. But it doesn't make sense that we request for user-space counting but we also get the guest report. Please hold the horse and allow this possibility. Some authorized perf users on the host may only want to count (KVM) guest user space events. Thanks, Like Xu Without this patch, if we don't set the ":u" modifier, exclude_guest = 1. perf record -e cycles ./div perf evlist -v cycles: size: 120, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|PERIOD, read_format: ID, disabled: 1, inherit: 1, mmap: 1, comm: 1, freq: 1, enable_on_exec: 1, task: 1, sample_id_all: 1, exclude_guest: 1, mmap2: 1, comm_exec: 1, ksymbol: 1, bpf_event: 1 So this patch doesn't change perf's original behavior. Thanks Jin Yao To keep perf semantics consistent and clear, this patch sets exclude_guest for user-space counting. Applied, and also added this, that you should consider doing in the future (modulo the "Committer testing:" header :) ): Committer testing: Before: # perf record -e cycles:u ^C[ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 1.231 MB perf.data (91 samples) ] # # perf evlist -v cycles:u: size: 120, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ID|CPU|PERIOD, read_format: ID, disabled: 1, inherit: 1, exclude_kernel: 1, exclude_hv: 1, freq: 1, sample_id_all: 1 # After: # perf record -e cycles:u ^C[ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 1.263 MB perf.data (403 samples) ] # # perf evlist -v cycles:u: size: 120, { sample_period, sample_freq }: 4000, sample_type: IP|TID|TIME|ID|CPU|PERIOD, read_format: ID, disabled: 1, inherit: 1, exclude_kernel: 1, exclude_hv: 1, freq: 1, sample_id_all: 1, exclude_guest: 1 # I.e. show actual command output before and after that demonstrates the problem and then the solution. Signed-off-by: Jin Yao --- tools/perf/util/parse-events.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 9f7260e69113..4d809f1fe269 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1794,6 +1794,8 @@ static int get_event_modifier(struct event_modifier *mod, char *str, if (*str == 'u') { if (!exclude) exclude = eu = ek = eh = 1; + if (!exclude_GH) + eG = 1; eu = 0; } else if (*str == 'k') { if (!exclude) -- 2.17.1
[PATCH v2] lib/cmdline: prevent unintented access to address
When args = "\"\0", "i" will be 0 and args[i-1] is used. (*lib/cmdline.c +238) Because of "i" is an unsigned int type, the function will access at args[0x]. It can make a crash. Signed-off-by: Seungil Kang --- Thanks for your review, my comments below > Can you be less ambiguous with the args value? (Perhaps provide a hexdump of > it for better understanding) This kind of args as hexdump below can cause crash. : 736f 6d65 7468 696e 6731 3d73 6f6d 655f something1=some_ 0010: 7661 6c75 6573 2022 values " The args end with "\"\0". > Please, use proper punctuation, I'm lost where is the sentence and what are > the logical parts of them. I'm sorry to confuse you. I fix the commit msg > Can you point out to the code that calls this and leads to a crash? *lib/cmdlinc + 201 ~, next_arg function with args = "\"\0" char *next_arg(char *args, char **param, char **val) <-- args = "\"\0". { unsigned int i, equals = 0; int in_quote = 0, quoted = 0; char *next; if (*args == '"') { <-- *args == '"' is a true condition, args++; <-- args++, so *args = '\0'. in_quote = 1; quoted = 1; <-- quoted also set 1. } for (i = 0; args[i]; i++) { <-- when reached this point, i = 0, and arg[0] == '\0', so for loop is skipped. if (isspace(args[i]) && !in_quote) break; if (equals == 0) { if (args[i] == '=') equals = i; } if (args[i] == '"') in_quote = !in_quote; } *param = args; if (!equals) *val = NULL; else { args[equals] = '\0'; *val = args + equals + 1; /* Don't include quotes in value. */ if (**val == '"') { (*val)++; if (args[i-1] == '"') args[i-1] = '\0'; } } if (quoted && args[i-1] == '"') <-- When reached this point, quoted is still set 1, "i" is still 0, and "i" is unsigned int type, so address will be {address of args} + 0x. It can make a crash. args[i-1] = '\0'; if (args[i]) { args[i] = '\0'; next = args + i + 1; } else next = args + i; /* Chew up trailing spaces. */ return skip_spaces(next); } > Can you provide a KUnit test module which can check the case? If necessary, I will make it and share it. lib/cmdline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/cmdline.c b/lib/cmdline.c index fbb9981a04a4..2fd29d7723b2 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -200,7 +200,7 @@ bool parse_option_str(const char *str, const char *option) */ char *next_arg(char *args, char **param, char **val) { - unsigned int i, equals = 0; + int i, equals = 0; int in_quote = 0, quoted = 0; char *next; -- 2.17.1
RE: [PATCH] softirq: add irq off checking for __raise_softirq_irqoff
Any comments? Thanks. @Steven Rostedt, I thinks irq off checking is necessary especially for Preempt-RT kernel, because some context may be changed from irq off to irq on when enable Preempt RT, I once met a issue that hrtimer soft irq is lost when enabled Preempt RT, finally I found napi_schedule_irqoff is called in hardware interrupt handler, there maybe no issue for non RT kernel, but for Preempt RT, interrupt is threaded, so irq is on in interrupt handler, the result is __raise_softirq_irqoff is called in irq on context, so that per-CPU softirq masking is corrupted because of the process of updating of soft irq masking is interrupted and not a atomic operation , and then caused hrtimer soft irq is lost. So I think adding irq status checking in __raise_softirq_irqoff can report such issue directly and help us to find the root cause of such issue. I know that there may be performance impaction to add extra checking here, if it is the case, how about to include it in some debug configuration items? Such as CONFIG_DEBUG_PREEMPT or other debug items? Best Regards, Jiafei. -Original Message- From: Jiafei Pan Sent: Thursday, August 6, 2020 12:07 PM To: pet...@infradead.org; mi...@kernel.org; t...@linutronix.de; rost...@goodmis.org; romain.per...@gmail.com; w...@kernel.org Cc: linux-kernel@vger.kernel.org; linux-rt-us...@vger.kernel.org; Jiafei Pan ; Leo Li ; Vladimir Oltean ; Jiafei Pan Subject: [PATCH] softirq: add irq off checking for __raise_softirq_irqoff __raise_softirq_irqoff will update per-CPU mask of pending softirqs, it need to be called in irq disabled context in order to keep it atomic operation, otherwise it will be interrupted by hardware interrupt, and per-CPU softirqs pending mask will be corrupted, the result is there will be unexpected issue, for example hrtimer soft irq will be losed and soft hrtimer will never be expire and handled. Adding irqs disabled checking here to provide warning in irqs enabled context. Signed-off-by: Jiafei Pan --- kernel/softirq.c | 5 + 1 file changed, 5 insertions(+) diff --git a/kernel/softirq.c b/kernel/softirq.c index bf88d7f62433..11f61e54a3ae 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -481,6 +481,11 @@ void raise_softirq(unsigned int nr) void __raise_softirq_irqoff(unsigned int nr) { + /* This function can only be called in irq disabled context, +* otherwise or_softirq_pending will be interrupted by hardware +* interrupt, so that there will be unexpected issue. +*/ + WARN_ON_ONCE(!irqs_disabled()); trace_softirq_raise(nr); or_softirq_pending(1UL << nr); } -- 2.17.1
Re: [PATCH v8 2/4] scsi: ufs: Introduce HPB feature
Hi Bart, On 2020-08-06 02:11, Daejun Park wrote: > > This is a patch for the HPB feature. > > This patch adds HPB function calls to UFS core driver. > > > > The mininum size of the memory pool used in the HPB is implemented as a > ^^^ > minimum? I will fix it. > > Kconfig parameter (SCSI_UFS_HPB_HOST_MEM), so that it can be configurable. > > > +config SCSI_UFS_HPB > > +bool "Support UFS Host Performance Booster" > > +depends on SCSI_UFSHCD > > +help > > + A UFS HPB Feature improves random read performance. It caches > ^ ^^^ > The? feature? I will fix it. > > + L2P map of UFS to host DRAM. The driver uses HPB read command > > + by piggybacking physical page number for bypassing FTL's L2P address > > + translation. > > Please explain what L2P and FTL mean. Not everyone is familiar with SSD > internals. I added full name of the abbreviation. L2P (logical to physical) map of UFS to host DRAM. The driver uses HPB read command ^^^ by piggybacking physical page number for bypassing FTL (flash translation layer) > > +config SCSI_UFS_HPB_HOST_MEM > > +int "Host-side cached memory size (KB) for HPB support" > > +default 32 > > +depends on SCSI_UFS_HPB > > +help > > + The mininum size of the memory pool used in the HPB module. It can > > + be configurable by the user. If this value is larger than required > > + memory size, kernel resizes cached memory size. > ^^^ ^^ > reduces?cache size? > > Please make this a kernel module parameter instead of a compile-time constant. OK, I will change it. > > +#ifndef CONFIG_SCSI_UFS_HPB > > +static void ufshpb_resume(struct ufs_hba *hba) {} > > +static void ufshpb_suspend(struct ufs_hba *hba) {} > > +static void ufshpb_reset(struct ufs_hba *hba) {} > > +static void ufshpb_reset_host(struct ufs_hba *hba) {} > > +static void ufshpb_rsp_upiu(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) > > {} > > +static void ufshpb_prep(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) {} > > +static void ufshpb_remove(struct ufs_hba *hba) {} > > +static void ufshpb_scan_feature(struct ufs_hba *hba) {} > > +#endif > > Please move these definitions into ufshpb.h since that is the > recommended Linux kernel coding style. OK, I will move them. > > diff --git a/drivers/scsi/ufs/ufshcd.h b/drivers/scsi/ufs/ufshcd.h > > index b2ef18f1b746..904c19796e09 100644 > > --- a/drivers/scsi/ufs/ufshcd.h > > +++ b/drivers/scsi/ufs/ufshcd.h > > @@ -47,6 +47,9 @@ > > #include "ufs.h" > > #include "ufs_quirks.h" > > #include "ufshci.h" > > +#ifdef CONFIG_SCSI_UFS_HPB > > +#include "ufshpb.h" > > +#endif > > Please move #ifdef CONFIG_SCSI_UFS_HPB / #endif into ufshpb.h. From > Documentation/process/4.Coding.rst: "As a general rule, #ifdef use > should be confined to header files whenever possible." OK, I will fix it. > > +struct ufsf_feature_info { > > +atomic_t slave_conf_cnt; > > +wait_queue_head_t sdev_wait; > > +}; > > Please add a comment above this data structure that explains the role > of this data structure and also what "ufsf" stands for. "ufsf" is stands for ufs feature. I wiil add comments for the data structure. > > +static int ufshpb_create_sysfs(struct ufs_hba *hba, struct ufshpb_lu *hpb); > > I don't think that this forward declaration is necessary so please leave it > out. OK, I will remove it. > > +static inline int ufshpb_is_valid_srgn(struct ufshpb_region *rgn, > > + struct ufshpb_subregion *srgn) > > +{ > > +return rgn->rgn_state != HPB_RGN_INACTIVE && > > +srgn->srgn_state == HPB_SRGN_VALID; > > +} > > Please do not declare functions inside .c files inline but instead let > the compiler decide which functions to inline. Modern compilers are really > good at this. I didn't know about it. Thanks. > > +static struct kobj_type ufshpb_ktype = { > > +.sysfs_ops = _sysfs_ops, > > +.release = NULL, > > +}; > > If the release method of a kobj_type is NULL that is a strong sign that > there is something wrong ... > > > +static int ufshpb_create_sysfs(struct ufs_hba *hba, struct ufshpb_lu *hpb) > > +{ > > +int ret; > > + > > +ufshpb_stat_init(hpb); > > + > > +kobject_init(>kobj, _ktype); > > +mutex_init(>sysfs_lock); > > + > > +ret = kobject_add(>kobj, kobject_get(>dev->kobj), > > + "ufshpb_lu%d", hpb->lun); > > + > > +if (ret) > > +return ret; > > + > > +ret = sysfs_create_group(>kobj, _sysfs_group); > > + > > +if (ret) { > > +dev_err(hba->dev, "ufshpb_lu%d create file error\n", hpb->lun); > > +return ret; > > +} > > + > > +dev_info(hba->dev, "ufshpb_lu%d sysfs adds uevent", hpb->lun); > > +kobject_uevent(>kobj, KOBJ_ADD); > > + > > +return 0; > >
[PATCH] acpi/nfit: Use kobj_to_dev() instead
Use kobj_to_dev() instead of container_of() Signed-off-by: Wang Qing --- drivers/acpi/nfit/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index fa4500f..3bb350b --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -1382,7 +1382,7 @@ static bool ars_supported(struct nvdimm_bus *nvdimm_bus) static umode_t nfit_visible(struct kobject *kobj, struct attribute *a, int n) { - struct device *dev = container_of(kobj, struct device, kobj); + struct device *dev = kobj_to_dev(kobj); struct nvdimm_bus *nvdimm_bus = to_nvdimm_bus(dev); if (a == _attr_scrub.attr && !ars_supported(nvdimm_bus)) @@ -1667,7 +1667,7 @@ static struct attribute *acpi_nfit_dimm_attributes[] = { static umode_t acpi_nfit_dimm_attr_visible(struct kobject *kobj, struct attribute *a, int n) { - struct device *dev = container_of(kobj, struct device, kobj); + struct device *dev = kobj_to_dev(kobj); struct nvdimm *nvdimm = to_nvdimm(dev); struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm); -- 2.7.4
[PATCH v2] mm: LMK, adjust oom_score_adj when fork a new process
From: YangHui Also it rely on inheritance,But there are some things you need't inheriting if all children oom_score_adj is -1000,the oom is meaningless Signed-off-by: YangHui --- include/uapi/linux/oom.h | 1 + kernel/fork.c| 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/oom.h b/include/uapi/linux/oom.h index 750b1c5..0251f23 100644 --- a/include/uapi/linux/oom.h +++ b/include/uapi/linux/oom.h @@ -8,6 +8,7 @@ */ #define OOM_SCORE_ADJ_MIN (-1000) #define OOM_SCORE_ADJ_MAX 1000 +#define OOM_SCORE_ADJ_DEFAULT 0 /* * /proc//oom_adj set to -17 protects from the oom killer for legacy diff --git a/kernel/fork.c b/kernel/fork.c index 4d32190..9dfa388 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1584,8 +1584,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) tty_audit_fork(sig); sched_autogroup_fork(sig); - sig->oom_score_adj = current->signal->oom_score_adj; - sig->oom_score_adj_min = current->signal->oom_score_adj_min; + sig->oom_score_adj = OOM_SCORE_ADJ_DEFAULT; + sig->oom_score_adj_min = OOM_SCORE_ADJ_MIN; mutex_init(>cred_guard_mutex); mutex_init(>exec_update_mutex); -- 2.7.4
RE: [PATCH 1/2] exfat: add NameLength check when extracting name
> Thank you for your reply. > > >> -static void exfat_get_uniname_from_ext_entry(struct super_block *sb, > >> - struct exfat_chain *p_dir, int entry, unsigned short *uniname) > >> +static int exfat_get_uniname_from_name_entries(struct > >> exfat_entry_set_cache *es, > >> + struct exfat_uni_name *uniname) > >> { > >> - int i; > >> - struct exfat_entry_set_cache *es; > >> + int n, l, i; > >>struct exfat_dentry *ep; > >> > >> - es = exfat_get_dentry_set(sb, p_dir, entry, ES_ALL_ENTRIES); > >> - if (!es) > >> - return; > >> + uniname->name_len = es->de_stream->name_len; > >> + if (uniname->name_len == 0) > >> + return -EIO; > > Can we validate ->name_len and name entry ->type in exfat_get_dentry_set() ? > > Yes. > As I wrote in a previous email, entry type validation, name-length > validation, and name extraction > should not be separated, so implement all of these in exfat_get_dentry_set(). > It can be easily implemented by adding uniname to exfat_entry_set_cache and > calling > exfat_get_uniname_from_name_entries() from exfat_get_dentry_set(). No, We can check stream->name_len and name entry type in exfat_get_dentry_set(). And you are already checking entry type with TYPE_SECONDARY in exfat_get_dentry_set(). Why do we have to check twice? > > However, that would be over-implementation. > Not all callers of exfat_get_dentry_set() need a name. Where? It will not checked with ES_2_ENTRIES. > It is enough to validate the name when it is needed. > This is a file-system driver, not fsck. Sorry, I don't understand what you are talking about. If there is a problem in ondisk-metadata, Filesystem should return error. > Validation is possible in exfat_get_dentry_set(), but unnecessary. > > Why do you want to validate the name in exfat_get_dentry_set()? exfat_get_dentry_set validates file, stream entry. And you are trying to check name entries with type_secondary. In addition, trying add the checksum check. Conversely, Why would you want to add those checks to exfat_get_dentry_set()? Why do we check only name entries separately? Aren't you intent to return validated entry set in exfat_get_dentry_set()? > > > BR > --- > Tetsuhiro Kohada
drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:2135:41: sparse: sparse: incorrect type in argument 1 (different address spaces)
tree: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git master head: dc06fe51d26efc100ac74121607c01a454867c91 commit: 670d0a4b10704667765f7d18f7592993d02783aa sparse: use identifiers to define address spaces date: 8 weeks ago config: i386-randconfig-s001-20200813 (attached as .config) compiler: gcc-9 (Debian 9.3.0-15) 9.3.0 reproduce: # apt-get install sparse # sparse version: v0.6.2-168-g9554805c-dirty git checkout 670d0a4b10704667765f7d18f7592993d02783aa # save the attached .config to linux build tree make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=i386 If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot sparse warnings: (new ones prefixed by >>) >> drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:2135:41: sparse: >> sparse: incorrect type in argument 1 (different address spaces) @@ >> expected void *reg @@ got unsigned int [noderef] __iomem * @@ drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:2135:41: sparse: expected void *reg >> drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:2135:41: sparse: >> got unsigned int [noderef] __iomem * drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:157:33: sparse: sparse: incorrect type in argument 2 (different base types) @@ expected unsigned short [usertype] uid_hi @@ got restricted __be16 [usertype] @@ drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:157:33: sparse: expected unsigned short [usertype] uid_hi drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:157:33: sparse: got restricted __be16 [usertype] drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:157:45: sparse: sparse: incorrect type in argument 3 (different base types) @@ expected unsigned int [usertype] uid_lo @@ got restricted __be32 [usertype] @@ drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:157:45: sparse: expected unsigned int [usertype] uid_lo drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:157:45: sparse: got restricted __be32 [usertype] drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:157:56: sparse: sparse: incorrect type in argument 4 (different base types) @@ expected unsigned short [usertype] seqid @@ got restricted __be16 [usertype] @@ drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:157:56: sparse: expected unsigned short [usertype] seqid drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:157:56: sparse: got restricted __be16 [usertype] >> drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:306:26: sparse: sparse: >> incorrect type in argument 1 (different address spaces) @@ expected void >> [noderef] __iomem * @@ got void *reg @@ >> drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:306:26: sparse: >> expected void [noderef] __iomem * drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:306:26: sparse: got void *reg drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:336:33: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected void *reg @@ got unsigned int [noderef] __iomem * @@ drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:336:33: sparse: expected void *reg drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:336:33: sparse: got unsigned int [noderef] __iomem * drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:343:33: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected void *reg @@ got unsigned int [noderef] __iomem * @@ drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:343:33: sparse: expected void *reg drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:343:33: sparse: got unsigned int [noderef] __iomem * drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:356:33: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected void *reg @@ got unsigned int [noderef] __iomem * @@ drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:356:33: sparse: expected void *reg drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:356:33: sparse: got unsigned int [noderef] __iomem * drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:397:33: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected void *reg @@ got unsigned int [noderef] __iomem * @@ drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:397:33: sparse: expected void *reg drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:397:33: sparse: got unsigned int [noderef] __iomem * drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:466:41: sparse: sparse: incorrect type in argument 1 (different address spaces) @@ expected void *reg @@ got unsigned int [noderef] __iomem * @@ drivers/net/ethernet/oki-semi/pch_gbe/pch_gbe_main.c:466:41: sparse: expected void *reg
Re: [PATCH v2] nvme: Use spin_lock_irq() when taking the ctrl->lock
On 8/12/20 16:25, Logan Gunthorpe wrote: > When locking the ctrl->lock spinlock IRQs need to be disabled to avoid a > dead lock. The new spin_lock() calls recently added produce the > following lockdep warning when running the blktest nvme/003: > > > WARNING: inconsistent lock state > > inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. > ksoftirqd/2/22 [HC0[0]:SC1[1]:HE0:SE0] takes: > 888276a8c4c0 (>lock){+.?.}-{2:2}, at: > nvme_keep_alive_end_io+0x50/0xc0 > {SOFTIRQ-ON-W} state was registered at: >lock_acquire+0x164/0x500 >_raw_spin_lock+0x28/0x40 >nvme_get_effects_log+0x37/0x1c0 >nvme_init_identify+0x9e4/0x14f0 >nvme_reset_work+0xadd/0x2360 >process_one_work+0x66b/0xb70 >worker_thread+0x6e/0x6c0 >kthread+0x1e7/0x210 >ret_from_fork+0x22/0x30 > irq event stamp: 1449221 > hardirqs last enabled at (1449220): [] > ktime_get+0xf9/0x140 > hardirqs last disabled at (1449221): [] > _raw_spin_lock_irqsave+0x25/0x60 > softirqs last enabled at (1449210): [] > __do_softirq+0x447/0x595 > softirqs last disabled at (1449215): [] > run_ksoftirqd+0x35/0x50 > > other info that might help us debug this: > Possible unsafe locking scenario: > > CPU0 > >lock(>lock); > > lock(>lock); > > *** DEADLOCK *** > > no locks held by ksoftirqd/2/22. > > stack backtrace: > CPU: 2 PID: 22 Comm: ksoftirqd/2 Not tainted > 5.8.0-rc4-eid-vmlocalyes-dbg-00157-g7236657c6b3a #1450 > Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.12.0-1 > 04/01/2014 > Call Trace: > dump_stack+0xc8/0x11a > print_usage_bug.cold.63+0x235/0x23e > mark_lock+0xa9c/0xcf0 > __lock_acquire+0xd9a/0x2b50 > lock_acquire+0x164/0x500 > _raw_spin_lock_irqsave+0x40/0x60 > nvme_keep_alive_end_io+0x50/0xc0 > blk_mq_end_request+0x158/0x210 > nvme_complete_rq+0x146/0x500 > nvme_loop_complete_rq+0x26/0x30 [nvme_loop] > blk_done_softirq+0x187/0x1e0 > __do_softirq+0x118/0x595 > run_ksoftirqd+0x35/0x50 > smpboot_thread_fn+0x1d3/0x310 > kthread+0x1e7/0x210 > ret_from_fork+0x22/0x30 > > Fixes: be93e87e7802 ("nvme: support for multiple Command Sets Supported and > Effects log pages") > Signed-off-by: Logan Gunthorpe Thanks for this fix, looks good. Tested-by: Chaitanya Kulkarni Reviewed-by: Chaitanya Kulkarni
Re: [PATCH] tools build feature: Quote CC and CXX for their arguments
On Wed, Aug 12, 2020 at 3:15 PM Daniel Díaz wrote: > > When using a cross-compilation environment, such as OpenEmbedded, > the CC an CXX variables are set to something more than just a > command: there are arguments (such as --sysroot) that need to be > passed on to the compiler so that the right set of headers and > libraries are used. > > For the particular case that our systems detected, CC is set to > the following: > > export CC="aarch64-linaro-linux-gcc > --sysroot=/oe/build/tmp/work/machine/perf/1.0-r9/recipe-sysroot" > > Without quotes, detection is as follows: > > Auto-detecting system features: > ... dwarf: [ OFF ] > ...dwarf_getlocations: [ OFF ] > ... glibc: [ OFF ] > ... gtk2: [ OFF ] > ...libbfd: [ OFF ] > ...libcap: [ OFF ] > ...libelf: [ OFF ] > ... libnuma: [ OFF ] > ...numa_num_possible_cpus: [ OFF ] > ... libperl: [ OFF ] > ... libpython: [ OFF ] > ... libcrypto: [ OFF ] > ... libunwind: [ OFF ] > ...libdw-dwarf-unwind: [ OFF ] > ... zlib: [ OFF ] > ... lzma: [ OFF ] > ... get_cpuid: [ OFF ] > ... bpf: [ OFF ] > ...libaio: [ OFF ] > ... libzstd: [ OFF ] > ...disassembler-four-args: [ OFF ] > > Makefile.config:414: *** No gnu/libc-version.h found, please install > glibc-dev[el]. Stop. > Makefile.perf:230: recipe for target 'sub-make' failed > make[1]: *** [sub-make] Error 2 > Makefile:69: recipe for target 'all' failed > make: *** [all] Error 2 > > With CC and CXX quoted, some of those features are now detected. > > Fixes: e3232c2f39ac ("tools build feature: Use CC and CXX from parent") > > Signed-off-by: Daniel Díaz Whoops, I'm the one who introduced this issue. Fix looks good, thanks! Reviewed-by: Thomas Hebb Fixes: e3232c2f39ac ("tools build feature: Use CC and CXX from parent") > --- > tools/build/Makefile.feature | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature > index 774f0b0ca28a..e7818b44b48e 100644 > --- a/tools/build/Makefile.feature > +++ b/tools/build/Makefile.feature > @@ -8,7 +8,7 @@ endif > > feature_check = $(eval $(feature_check_code)) > define feature_check_code > - feature-$(1) := $(shell $(MAKE) OUTPUT=$(OUTPUT_FEATURES) CC=$(CC) > CXX=$(CXX) CFLAGS="$(EXTRA_CFLAGS) $(FEATURE_CHECK_CFLAGS-$(1))" > CXXFLAGS="$(EXTRA_CXXFLAGS) $(FEATURE_CHECK_CXXFLAGS-$(1))" > LDFLAGS="$(LDFLAGS) $(FEATURE_CHECK_LDFLAGS-$(1))" -C $(feature_dir) > $(OUTPUT_FEATURES)test-$1.bin >/dev/null 2>/dev/null && echo 1 || echo 0) > + feature-$(1) := $(shell $(MAKE) OUTPUT=$(OUTPUT_FEATURES) CC="$(CC)" > CXX="$(CXX)" CFLAGS="$(EXTRA_CFLAGS) $(FEATURE_CHECK_CFLAGS-$(1))" > CXXFLAGS="$(EXTRA_CXXFLAGS) $(FEATURE_CHECK_CXXFLAGS-$(1))" > LDFLAGS="$(LDFLAGS) $(FEATURE_CHECK_LDFLAGS-$(1))" -C $(feature_dir) > $(OUTPUT_FEATURES)test-$1.bin >/dev/null 2>/dev/null && echo 1 || echo 0) We should probably also be quoting the arguments that expand $(OUTPUT_FEATURES) too, although trying to handle path names with spaces is probably a lost cause anyway. > endef > > feature_set = $(eval $(feature_set_code)) > -- > 2.25.1 >
Re: [PATCH v8 2/4] scsi: ufs: Introduce HPB feature
Hi Bart, On 2020-08-06 02:11, Daejun Park wrote: > > +static void ufshpb_issue_hpb_reset_query(struct ufs_hba *hba) > > +{ > > +int err; > > +int retries; > > + > > +for (retries = 0; retries < HPB_RESET_REQ_RETRIES; retries++) { > > +err = ufshcd_query_flag(hba, UPIU_QUERY_OPCODE_SET_FLAG, > > +QUERY_FLAG_IDN_HPB_RESET, 0, NULL); > > +if (err) > > +dev_dbg(hba->dev, > > +"%s: failed with error %d, retries %d\n", > > +__func__, err, retries); > > +else > > +break; > > +} > > + > > +if (err) { > > +dev_err(hba->dev, > > +"%s setting fHpbReset flag failed with error %d\n", > > +__func__, err); > > +return; > > +} > > +} > > Please change the "break" into an early return, remove the last > occurrence "if (err)" and remove the final return statement. OK, I will. > > > +static void ufshpb_check_hpb_reset_query(struct ufs_hba *hba) > > +{ > > +int err; > > +bool flag_res = true; > > +int try = 0; > > + > > +/* wait for the device to complete HPB reset query */ > > +do { > > +if (++try == HPB_RESET_REQ_RETRIES) > > +break; > > + > > +dev_info(hba->dev, > > +"%s start flag reset polling %d times\n", > > +__func__, try); > > + > > +/* Poll fHpbReset flag to be cleared */ > > +err = ufshcd_query_flag(hba, UPIU_QUERY_OPCODE_READ_FLAG, > > +QUERY_FLAG_IDN_HPB_RESET, 0, _res); > > +usleep_range(1000, 1100); > > +} while (flag_res); > > + > > +if (err) { > > +dev_err(hba->dev, > > +"%s reading fHpbReset flag failed with error %d\n", > > +__func__, err); > > +return; > > +} > > + > > +if (flag_res) { > > +dev_err(hba->dev, > > +"%s fHpbReset was not cleared by the device\n", > > +__func__); > > +} > > +} > > Should "polling %d times" perhaps be changed into "attempt %d"? I will change it. > The "if (err)" statement may be reached without "err" having been > initialized. Please fix. OK, I will initialize err to 0. > Additionally, please change the do-while loop into a for-loop, e.g. as > follows: > > for (try = 0; try < HPB_RESET_REQ_RETRIES; try++) > ... OK, I will change do-while to for-loop. Thanks, Daejun
Re: [PATCH v8 1/4] scsi: ufs: Add UFS feature related parameter
Hi Bart, > On 2020-08-06 02:02, Daejun Park wrote: > > @@ -537,6 +548,7 @@ struct ufs_dev_info { > > u8 *model; > > u16 wspecversion; > > u32 clk_gating_wait_us; > > +u8 b_ufs_feature_sup; > > u32 d_ext_ufs_feature_sup; > > u8 b_wb_buffer_type; > > u32 d_wb_alloc_units; > > > > Hmm ... shouldn't this variable be introduced in the patch that introduces > the code that sets and uses this variable? OK, I will move this variable to 2/4 patch. > How about making it clear in the patch subject that this patch adds protocol > constants related to HPB? The subject will be changed : "Add UFS feature related parameter -> Adds constants related to HPB" > Otherwise this patch looks good to me. > > Bart. Thanks, Daejun
Re: [PATCH v17 14/21] mm/compaction: do page isolation first in compaction
On Wed, Aug 12, 2020 at 6:47 PM Alex Shi wrote: > > > > 在 2020/8/13 上午12:51, Alexander Duyck 写道: > > On Wed, Aug 12, 2020 at 4:44 AM Alex Shi wrote: > >> > >> > >> > >> 在 2020/8/11 下午10:47, Alexander Duyck 写道: > >>> On Tue, Aug 11, 2020 at 1:23 AM Alex Shi > >>> wrote: > > > > 在 2020/8/10 下午10:41, Alexander Duyck 写道: > > On Mon, Aug 10, 2020 at 6:10 AM Alex Shi > > wrote: > >> > >> > >> > >> 在 2020/8/7 下午10:51, Alexander Duyck 写道: > >>> I wonder if this entire section shouldn't be restructured. This is the > >>> only spot I can see where we are resetting the LRU flag instead of > >>> pulling the page from the LRU list with the lock held. Looking over > >>> the code it seems like something like that should be possible. I am > >>> not sure the LRU lock is really protecting us in either the > >>> PageCompound check nor the skip bits. It seems like holding a > >>> reference on the page should prevent it from switching between > >>> compound or not, and the skip bits are per pageblock with the LRU bits > >>> being per node/memcg which I would think implies that we could have > >>> multiple LRU locks that could apply to a single skip bit. > >> > >> Hi Alexander, > >> > >> I don't find problem yet on compound or skip bit usage. Would you > >> clarify the > >> issue do you concerned? > >> > >> Thanks! > > > > The point I was getting at is that the LRU lock is being used to > > protect these and with your changes I don't think that makes sense > > anymore. > > > > The skip bits are per-pageblock bits. With your change the LRU lock is > > now per memcg first and then per node. As such I do not believe it > > really provides any sort of exclusive access to the skip bits. I still > > have to look into this more, but it seems like you need a lock per > > either section or zone that can be used to protect those bits and deal > > with this sooner rather than waiting until you have found an LRU page. > > The one part that is confusing though is that the definition of the > > skip bits seems to call out that they are a hint since they are not > > protected by a lock, but that is exactly what has been happening here. > > > > The skip bits are safe here, since even it race with other skip action, > It will still skip out. The skip action is try not to compaction too > much, > not a exclusive action needs avoid race. > >>> > >>> That would be the case if it didn't have the impact that they > >>> currently do on the compaction process. What I am getting at is that a > >>> race was introduced when you placed this test between the clearing of > >>> the LRU flag and the actual pulling of the page from the LRU list. So > >>> if you tested the skip bits before clearing the LRU flag then I would > >>> be okay with the code, however because it is triggering an abort after > >> > >> Hi Alexander, > >> > >> Thanks a lot for comments and suggestions! > >> > >> I have tried your suggestion: > >> > >> Signed-off-by: Alex Shi > >> --- > >> mm/compaction.c | 14 +++--- > >> 1 file changed, 7 insertions(+), 7 deletions(-) > >> > >> diff --git a/mm/compaction.c b/mm/compaction.c > >> index b99c96c4862d..6c881dee8c9a 100644 > >> --- a/mm/compaction.c > >> +++ b/mm/compaction.c > >> @@ -988,6 +988,13 @@ static bool too_many_isolated(pg_data_t *pgdat) > >> if (__isolate_lru_page_prepare(page, isolate_mode) != 0) > >> goto isolate_fail_put; > >> > >> + /* Try get exclusive access under lock */ > >> + if (!skip_updated) { > >> + skip_updated = true; > >> + if (test_and_set_skip(cc, page, low_pfn)) > >> + goto isolate_fail_put; > >> + } > >> + > >> /* Try isolate the page */ > >> if (!TestClearPageLRU(page)) > >> goto isolate_fail_put; > > > > I would have made this much sooner. Probably before you call > > get_page_unless_zero so as to avoid the unnecessary atomic operations. > > > >> @@ -1006,13 +1013,6 @@ static bool too_many_isolated(pg_data_t *pgdat) > >> > >> lruvec_memcg_debug(lruvec, page); > >> > >> - /* Try get exclusive access under lock */ > >> - if (!skip_updated) { > >> - skip_updated = true; > >> - if (test_and_set_skip(cc, page, low_pfn)) > >> - goto isolate_abort; > >> - } > >> - > >> /* > >> * Page become compound since the non-locked check, > >> * and it's on LRU. It can only be a THP so the > >> order > >> -- > >> > >> Performance of
[GIT PULL] xfs: small fixes for 5.9-rc1
Hi Linus, Please pull these two small fixes that have come in during the past week. The branch merges cleanly with upstream as of a few minutes ago, so please let me know if anything strange happens. --D The following changes since commit 818d5a91559ffe1e1f2095dcbbdb96c13fdb94ec: fs/xfs: Support that ioctl(SETXFLAGS/GETXFLAGS) can set/get inode DAX on XFS. (2020-07-28 20:28:20 -0700) are available in the Git repository at: git://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git tags/xfs-5.9-merge-8 for you to fetch changes up to 96cf2a2c75567ff56195fe3126d497a2e7e4379f: xfs: Fix UBSAN null-ptr-deref in xfs_sysfs_init (2020-08-07 11:50:17 -0700) Fixes for 5.9-rc1: - Fix duplicated words in comments. - Fix an ubsan complaint about null pointer arithmetic. Eiichi Tsukata (1): xfs: Fix UBSAN null-ptr-deref in xfs_sysfs_init Randy Dunlap (1): xfs: delete duplicated words + other fixes fs/xfs/libxfs/xfs_sb.c| 2 +- fs/xfs/xfs_attr_list.c| 2 +- fs/xfs/xfs_buf_item.c | 2 +- fs/xfs/xfs_buf_item_recover.c | 2 +- fs/xfs/xfs_dquot.c| 2 +- fs/xfs/xfs_export.c | 2 +- fs/xfs/xfs_inode.c| 4 ++-- fs/xfs/xfs_inode_item.c | 4 ++-- fs/xfs/xfs_iomap.c| 2 +- fs/xfs/xfs_log_cil.c | 2 +- fs/xfs/xfs_log_recover.c | 2 +- fs/xfs/xfs_refcount_item.c| 2 +- fs/xfs/xfs_reflink.c | 2 +- fs/xfs/xfs_sysfs.h| 6 -- fs/xfs/xfs_trans_ail.c| 4 ++-- 15 files changed, 21 insertions(+), 19 deletions(-)
[PATCH v2 0/2] Try to release mmap_lock temporarily in smaps_rollup
Recently, we have observed some janky issues caused by unpleasantly long contention on mmap_lock which is held by smaps_rollup when probing large processes. To address the problem, we let smaps_rollup detect if anyone wants to acquire mmap_lock for write attempts. If yes, just release the lock temporarily to ease the contention. smaps_rollup is a procfs interface which allows users to summarize the process's memory usage without the overhead of seq_* calls. Android uses it to sample the memory usage of various processes to balance its memory pool sizes. If no one wants to take the lock for write requests, smaps_rollup with this patch will behave like the original one. Although there are on-going mmap_lock optimizations like range-based locks, the lock applied to smaps_rollup would be the coarse one, which is hard to avoid the occurrence of aforementioned issues. So the detection and temporary release for write attempts on mmap_lock in smaps_rollup is still necessary. Change since v1: - If current VMA is freed after dropping the lock, it will return - incomplete result. To fix this issue, refine the code flow as - suggested by Steve. [1] [1] https://lore.kernel.org/lkml/bf40676e-b14b-44cd-75ce-419c70194...@arm.com/ Chinwen Chang (2): mmap locking API: add mmap_lock_is_contended() mm: proc: smaps_rollup: do not stall write attempts on mmap_lock fs/proc/task_mmu.c| 57 ++- include/linux/mmap_lock.h | 5 2 files changed, 61 insertions(+), 1 deletion(-)
[PATCH v2 1/2] mmap locking API: add mmap_lock_is_contended()
Add new API to query if someone wants to acquire mmap_lock for write attempts. Using this instead of rwsem_is_contended makes it more tolerant of future changes to the lock type. Signed-off-by: Chinwen Chang --- include/linux/mmap_lock.h | 5 + 1 file changed, 5 insertions(+) diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 0707671..18e7eae 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -87,4 +87,9 @@ static inline void mmap_assert_write_locked(struct mm_struct *mm) VM_BUG_ON_MM(!rwsem_is_locked(>mmap_lock), mm); } +static inline int mmap_lock_is_contended(struct mm_struct *mm) +{ + return rwsem_is_contended(>mmap_lock); +} + #endif /* _LINUX_MMAP_LOCK_H */ -- 1.9.1
[PATCH v2 2/2] mm: proc: smaps_rollup: do not stall write attempts on mmap_lock
smaps_rollup will try to grab mmap_lock and go through the whole vma list until it finishes the iterating. When encountering large processes, the mmap_lock will be held for a longer time, which may block other write requests like mmap and munmap from progressing smoothly. There are upcoming mmap_lock optimizations like range-based locks, but the lock applied to smaps_rollup would be the coarse type, which doesn't avoid the occurrence of unpleasant contention. To solve aforementioned issue, we add a check which detects whether anyone wants to grab mmap_lock for write attempts. Change since v1: - If current VMA is freed after dropping the lock, it will return - incomplete result. To fix this issue, refine the code flow as - suggested by Steve. [1] [1] https://lore.kernel.org/lkml/bf40676e-b14b-44cd-75ce-419c70194...@arm.com/ Signed-off-by: Chinwen Chang --- fs/proc/task_mmu.c | 56 +- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index dbda449..23b3a447 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -853,9 +853,63 @@ static int show_smaps_rollup(struct seq_file *m, void *v) hold_task_mempolicy(priv); - for (vma = priv->mm->mmap; vma; vma = vma->vm_next) { + for (vma = priv->mm->mmap; vma;) { smap_gather_stats(vma, ); last_vma_end = vma->vm_end; + + /* +* Release mmap_lock temporarily if someone wants to +* access it for write request. +*/ + if (mmap_lock_is_contended(mm)) { + mmap_read_unlock(mm); + ret = mmap_read_lock_killable(mm); + if (ret) { + release_task_mempolicy(priv); + goto out_put_mm; + } + + /* +* After dropping the lock, there are three cases to +* consider. See the following example for explanation. +* +* +--+--+---+ +* | VMA1 | VMA2 | VMA3 | +* +--+--+---+ +* | | | | +* 4k 8k 16k 400k +* +* Suppose we drop the lock after reading VMA2 due to +* contention, then we get: +* +* last_vma_end = 16k +* +* 1) VMA2 is freed, but VMA3 exists: +* +*find_vma(mm, 16k - 1) will return VMA3. +*In this case, just continue from VMA3. +* +* 2) VMA2 still exists: +* +*find_vma(mm, 16k - 1) will return VMA2. +*Iterate the loop like the original one. +* +* 3) No more VMAs can be found: +* +*find_vma(mm, 16k - 1) will return NULL. +*No more things to do, just break. +*/ + vma = find_vma(mm, last_vma_end - 1); + /* Case 3 above */ + if (!vma) + break; + + /* Case 1 above */ + if (vma->vm_start >= last_vma_end) + continue; + } + /* Case 2 above */ + vma = vma->vm_next; } show_vma_header_prefix(m, priv->mm->mmap->vm_start, -- 1.9.1
Re: [PATCH v2] PCI: Introduce flag for detached virtual functions
On Thu, Aug 13, 2020 at 6:33 AM Alex Williamson wrote: > > On Wed, 12 Aug 2020 15:21:11 -0400 > Matthew Rosato wrote: > > > @@ -521,7 +522,8 @@ static int vfio_basic_config_read(struct > > vfio_pci_device *vdev, int pos, > > count = vfio_default_config_read(vdev, pos, count, perm, offset, val); > > > > /* Mask in virtual memory enable for SR-IOV devices */ > > - if (offset == PCI_COMMAND && vdev->pdev->is_virtfn) { > > + if ((offset == PCI_COMMAND) && > > + (vdev->pdev->is_virtfn || vdev->pdev->detached_vf)) { > > u16 cmd = le16_to_cpu(*(__le16 *)>vconfig[PCI_COMMAND]); > > u32 tmp_val = le32_to_cpu(*val); > > > > @@ -1734,7 +1736,8 @@ int vfio_config_init(struct vfio_pci_device *vdev) > >vconfig[PCI_INTERRUPT_PIN]); > > > > vconfig[PCI_INTERRUPT_PIN] = 0; /* Gratuitous for good VFs */ > > - > > + } > > + if (pdev->is_virtfn || pdev->detached_vf) { > > /* > >* VFs do no implement the memory enable bit of the COMMAND > >* register therefore we'll not have it set in our initial > > diff --git a/include/linux/pci.h b/include/linux/pci.h > > index 8355306..23a6972 100644 > > --- a/include/linux/pci.h > > +++ b/include/linux/pci.h > > @@ -445,6 +445,7 @@ struct pci_dev { > > unsigned intis_probed:1;/* Device probing in progress > > */ > > unsigned intlink_active_reporting:1;/* Device capable of > > reporting link active */ > > unsigned intno_vf_scan:1; /* Don't scan for VFs after > > IOV enablement */ > > + unsigned intdetached_vf:1; /* VF without local PF access > > */ > > Is there too much implicit knowledge in defining a "detached VF"? For > example, why do we know that we can skip the portion of > vfio_config_init() that copies the vendor and device IDs from the > struct pci_dev into the virtual config space? It's true on s390x, but > I think that's because we know that firmware emulates those registers > for us. > > We also skip the INTx pin register sanity checking. Do we do > that because we haven't installed the broken device into an s390x > system? Because we know firmware manages that for us too? Or simply > because s390x doesn't support INTx anyway, and therefore it's another > architecture implicit decision? Agreed. Any hacks we put in for normal VFs are going to be needed for the passed-though VF case. Only applying the memory space enable workaround doesn't make sense to me either. > If detached_vf is really equivalent to is_virtfn for all cases that > don't care about referencing physfn on the pci_dev, then we should > probably have a macro to that effect. A pci_is_virtfn() helper would be better than open coding both checks everywhere. That said, it might be solving the wrong problem. The union between ->physfn and ->sriov has always seemed like a footgun to me so we might be better off switching the users who want a physfn to a helper instead. i.e. struct pci_dev *pci_get_vf_physfn(struct pci_dev *vf) { if (!vf->is_virtfn) return NULL; return vf->physfn; } ... pf = pci_get_vf_physfn(vf) if (pf) /* do pf things */ Then we can just use ->is_virtfn for the normal and detached cases. Oliver
Re: [PATCH linux-5.2.y-rt only] hrtimer: correct the logic for grab expiry lock
在 2020/8/12 下午7:45, Rasmus Villemoes 写道: On 12/08/2020 12.50, zhantao.t...@windriver.com wrote: From: Zhantao Tang In commit: 47b6de0b7f22 ("hrtimer: Add a missing bracket and hide `migration_base' on !SMP") a inline function is_migration_base() is introduced. But the logic of the hrtimer_grab_expiry_lock was changed. This patch is to correct it. Yup, same patch sent back in April, which also had a fixes tag for 5.2. https://lore.kernel.org/lkml/20200428144026.5882-1-rasmus.villem...@prevas.dk/ It got picked up for 4.19-rt, dunno why it wasn't for 5.2-rt. Yes, currently 5.2 rt kernel not fix the issue. And I checked the 4.19 rt kernel, but did not find the implementation of hrtimer_grab_expiry_lock(). It used hrtimer_wait_for_timer() and the logic is OK. Thanks, Zhantao Rasmus
Re: [PATCH v2] PCI: Introduce flag for detached virtual functions
On Thu, Aug 13, 2020 at 5:21 AM Matthew Rosato wrote: > > s390x has the notion of providing VFs to the kernel in a manner > where the associated PF is inaccessible other than via firmware. > These are not treated as typical VFs and access to them is emulated > by underlying firmware which can still access the PF. After > abafbc55 however these detached VFs were no longer able to work > with vfio-pci as the firmware does not provide emulation of the > PCI_COMMAND_MEMORY bit. In this case, let's explicitly recognize > these detached VFs so that vfio-pci can allow memory access to > them again. Hmm, cool. I think we have a similar feature on pseries so that's probably broken too. > Signed-off-by: Matthew Rosato > --- > arch/s390/pci/pci.c| 8 > drivers/vfio/pci/vfio_pci_config.c | 11 +++ > include/linux/pci.h| 1 + > 3 files changed, 16 insertions(+), 4 deletions(-) > > diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c > index 3902c9f..04ac76d 100644 > --- a/arch/s390/pci/pci.c > +++ b/arch/s390/pci/pci.c > @@ -581,6 +581,14 @@ int pcibios_enable_device(struct pci_dev *pdev, int mask) > { > struct zpci_dev *zdev = to_zpci(pdev); > > + /* > +* If we have a VF on a non-multifunction bus, it must be a VF that is > +* detached from its parent PF. We rely on firmware emulation to > +* provide underlying PF details. > +*/ > + if (zdev->vfn && !zdev->zbus->multifunction) > + pdev->detached_vf = 1; The enable hook seems like it's a bit too late for this sort of screwing around with the pci_dev. Anything in the setup path that looks at ->detached_vf would see it cleared while anything that looks after the device is enabled will see it set. Can this go into pcibios_add_device() or a fixup instead?
Re: [RFC PATCH] printk: Change timestamp to triplet as mono, boot and real
On (20/08/11 15:02), Petr Mladek wrote: > On Tue 2020-08-11 14:05:12, Thomas Gleixner wrote: > > Petr Mladek writes: > > > At least "crash" tool would need an update anyway. AFAIK, it checks > > > the size of struct printk_log and refuses to read it when it changes. > > > > > > It means that the hack with VMCOREINFO_FIELD_OFFSET probably is not > > > needed because we would need to update the crashdump-related tools anyway. > > > > > > Well, the timing is good. We are about to switch the printk ring > > > buffer into a lockless one. It requires updating the crashdump tools > > > as well. We could do this at the same time. The lockless ring buffer > > > already is in linux-next. It is aimed for 5.10 or 5.11. > > ... > > > It would be great to synchronize all these changes changes of the > > > printk log buffer structures. > > > > I agree that having one update is a good thing, but pretty please can we > > finally make progress with this and not create yet another dependency? > > To make it clear. I definitely do not want to block lockless printk by > this. > > BTW: I am not 100% convinced that storing all three timestamps is > worth it. It increases the code complexity, metadata size. It needs > an interface with the userspace that has to stay backward compatible. Can we, perhaps, store those various "alternative" timestamps in dict so then whoever wants to read them can just parse the dict key:value pairs attach to each printk message? > Also it still will be racy because the timestamp is taken when the message > is printed. It might be "long" before or after the event that > it talks about. > > There is still the alternative to print all three timestamps regularly > for those interested. It is less user convenient but much easier > to maintain. Yes, that's a nice alternative. -ss
Re: [PATCH v2 0/4] Perf tool: Enable Arm arch timer counter and arm-spe's timestamp
On Wed, Aug 12, 2020 at 03:53:34PM -0300, Arnaldo Carvalho de Melo wrote: > Em Wed, Aug 12, 2020 at 10:06:53AM -0600, Mathieu Poirier escreveu: > > Hi Arnaldo, > > > > On Fri, 7 Aug 2020 at 01:16, Leo Yan wrote: > > > > > > This patch set is to enable Arm arch timer counter and Arm SPE is the > > > first customer to use arch timer counter for its timestamp. > > > > > > Patches 01 ~ 03 enables Arm arch timer counter in Perf tool; patch 01 is > > > to retrieve arch timer's parameters from mmaped page; patch 02 provides > > > APIs for the conversion between arch timer's counter and time; patch 03 > > > adds a test for patches 01 and 02. > > > > > > As the first customer to use Arm arch timer counter in perf tool, patch > > > 04 is to generate sample's timestamp for ARM SPE AUX trace data. > > > > > > This patch set has been rebased on perf/core branch with the latest > > > commit c4735d990268 ("perf evsel: Don't set > > > sample_regs_intr/sample_regs_user for dummy event"). > > > > The ARM SPE perf tools code is orphan and I don't have the cycles to > > pick it up. Leo has spent a lot of time in that code and as such I > > suggest that he starts maintaining it, probably following the same > > kind of arrangement you and I have for coresight. > > Thats ok with me, I think we should reflect that on the MAINTAINERS > file, right? Very appreciate for the promoting, Mathieu and Arnaldo. > We have this already: > > PERFORMANCE EVENTS SUBSYSTEM ARM64 PMU EVENTS > R: John Garry > R: Will Deacon > L: linux-arm-ker...@lists.infradead.org (moderated for non-subscribers) > S: Supported > F: tools/perf/pmu-events/arch/arm64/ > > I think we should have entries for CoreSight and ARM SPE, one listing > you as the maintainer and the other listing Leo, right? > > Leo, it would be good for you to try and find someone at your > organization or elsewhere that could help with this, this way I would > wait a bit, see if someone else Acks it, and collect those, processing > it from you after a while if nobody chimes in. I will follow up this and might offline to check with relevant maintainers (Will and John) for what's good way to proceed. > Please continue providing 'perf test' regression tests, etc, to help > with maintaining this code being checked. Sure, I'm glad to continue the maintaining and testing works :) Thanks, Leo
Re: [PATCH v3 0/6] tracing: export event trace and trace_marker
On Thu, Aug 13, 2020 at 12:49:25AM +0800, Mathieu Poirier wrote: > On Tue, Aug 11, 2020 at 11:57:20AM +0800, Tingwei Zhang wrote: > > Ftrace has ability to export trace packets to other destination. > > Currently, only function trace can be exported. This series extends the > > support to event trace and trace_maker. STM is one possible destination > to > > export ftrace. Use separate channel for each CPU to avoid mixing up > packets > > from different CPUs together. > > As the get_maintainer script points out, you are missing the maintainer of > the > system trace subsystem. > Sorry for missing that. I resent the patch and add Alexander. > > > > Change from v2: > > Change flag definition to BIT(). (Steven) > > Add comment in stm_ftrace_write() to clarify it's safe to use > > smp_processor_id() here since preempt is disabled. (Steven) > > > > Change from v1: > > All changes are suggested by Steven Rostedt. > > User separate flag to control function trace, event trace and trace > mark. > > Allocate channels according to num_possible_cpu() dynamically. > > Move ftrace_exports routines up so all ftrace can use them. > > > > Tingwei Zhang (6): > > stm class: ftrace: change dependency to TRACING > > tracing: add flag to control different traces > > tracing: add trace_export support for event trace > > tracing: add trace_export support for trace_marker > > stm class: ftrace: enable supported trace export flag > > stm class: ftrace: use different channel accroding to CPU > > > > drivers/hwtracing/stm/Kconfig | 2 +- > > drivers/hwtracing/stm/ftrace.c | 7 +- > > include/linux/trace.h | 7 + > > kernel/trace/trace.c | 270 ++--- > > 4 files changed, 159 insertions(+), 127 deletions(-) > > > > -- > > The Qualcomm Innovation Center, Inc. is a member of the Code Aurora > Forum, > > a Linux Foundation Collaborative Project > >
[PATCH v3 6/6] stm class: ftrace: use different channel accroding to CPU
To avoid mixup of packets from differnt ftrace packets simultaneously, use different channel for packets from different CPU. Signed-off-by: Tingwei Zhang Reviewed-by: Steven Rostedt (VMware) --- drivers/hwtracing/stm/ftrace.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/hwtracing/stm/ftrace.c b/drivers/hwtracing/stm/ftrace.c index c694a6e692d1..ebf29489919c 100644 --- a/drivers/hwtracing/stm/ftrace.c +++ b/drivers/hwtracing/stm/ftrace.c @@ -37,8 +37,10 @@ static void notrace stm_ftrace_write(struct trace_export *export, const void *buf, unsigned int len) { struct stm_ftrace *stm = container_of(export, struct stm_ftrace, ftrace); + /* This is called from trace system with preemption disabled */ + unsigned int cpu = smp_processor_id(); - stm_source_write(>data, STM_FTRACE_CHAN, buf, len); + stm_source_write(>data, STM_FTRACE_CHAN + cpu, buf, len); } static int stm_ftrace_link(struct stm_source_data *data) @@ -63,6 +65,7 @@ static int __init stm_ftrace_init(void) { int ret; + stm_ftrace.data.nr_chans = num_possible_cpus(); ret = stm_source_register_device(NULL, _ftrace.data); if (ret) pr_err("Failed to register stm_source - ftrace.\n"); -- The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project
[PATCH v3 5/6] stm class: ftrace: enable supported trace export flag
Set flags for trace_export. Export function trace, event trace and trace marker to stm. Signed-off-by: Tingwei Zhang Reviewed-by: Steven Rostedt (VMware) --- drivers/hwtracing/stm/ftrace.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/hwtracing/stm/ftrace.c b/drivers/hwtracing/stm/ftrace.c index ce868e095410..c694a6e692d1 100644 --- a/drivers/hwtracing/stm/ftrace.c +++ b/drivers/hwtracing/stm/ftrace.c @@ -46,6 +46,8 @@ static int stm_ftrace_link(struct stm_source_data *data) struct stm_ftrace *sf = container_of(data, struct stm_ftrace, data); sf->ftrace.write = stm_ftrace_write; + sf->ftrace.flags = TRACE_EXPORT_FUNCTION | TRACE_EXPORT_EVENT + | TRACE_EXPORT_MARKER; return register_ftrace_export(>ftrace); } -- The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project
Re: [PATCH v17 14/21] mm/compaction: do page isolation first in compaction
在 2020/8/13 上午12:51, Alexander Duyck 写道: > On Wed, Aug 12, 2020 at 4:44 AM Alex Shi wrote: >> >> >> >> 在 2020/8/11 下午10:47, Alexander Duyck 写道: >>> On Tue, Aug 11, 2020 at 1:23 AM Alex Shi wrote: 在 2020/8/10 下午10:41, Alexander Duyck 写道: > On Mon, Aug 10, 2020 at 6:10 AM Alex Shi > wrote: >> >> >> >> 在 2020/8/7 下午10:51, Alexander Duyck 写道: >>> I wonder if this entire section shouldn't be restructured. This is the >>> only spot I can see where we are resetting the LRU flag instead of >>> pulling the page from the LRU list with the lock held. Looking over >>> the code it seems like something like that should be possible. I am >>> not sure the LRU lock is really protecting us in either the >>> PageCompound check nor the skip bits. It seems like holding a >>> reference on the page should prevent it from switching between >>> compound or not, and the skip bits are per pageblock with the LRU bits >>> being per node/memcg which I would think implies that we could have >>> multiple LRU locks that could apply to a single skip bit. >> >> Hi Alexander, >> >> I don't find problem yet on compound or skip bit usage. Would you >> clarify the >> issue do you concerned? >> >> Thanks! > > The point I was getting at is that the LRU lock is being used to > protect these and with your changes I don't think that makes sense > anymore. > > The skip bits are per-pageblock bits. With your change the LRU lock is > now per memcg first and then per node. As such I do not believe it > really provides any sort of exclusive access to the skip bits. I still > have to look into this more, but it seems like you need a lock per > either section or zone that can be used to protect those bits and deal > with this sooner rather than waiting until you have found an LRU page. > The one part that is confusing though is that the definition of the > skip bits seems to call out that they are a hint since they are not > protected by a lock, but that is exactly what has been happening here. > The skip bits are safe here, since even it race with other skip action, It will still skip out. The skip action is try not to compaction too much, not a exclusive action needs avoid race. >>> >>> That would be the case if it didn't have the impact that they >>> currently do on the compaction process. What I am getting at is that a >>> race was introduced when you placed this test between the clearing of >>> the LRU flag and the actual pulling of the page from the LRU list. So >>> if you tested the skip bits before clearing the LRU flag then I would >>> be okay with the code, however because it is triggering an abort after >> >> Hi Alexander, >> >> Thanks a lot for comments and suggestions! >> >> I have tried your suggestion: >> >> Signed-off-by: Alex Shi >> --- >> mm/compaction.c | 14 +++--- >> 1 file changed, 7 insertions(+), 7 deletions(-) >> >> diff --git a/mm/compaction.c b/mm/compaction.c >> index b99c96c4862d..6c881dee8c9a 100644 >> --- a/mm/compaction.c >> +++ b/mm/compaction.c >> @@ -988,6 +988,13 @@ static bool too_many_isolated(pg_data_t *pgdat) >> if (__isolate_lru_page_prepare(page, isolate_mode) != 0) >> goto isolate_fail_put; >> >> + /* Try get exclusive access under lock */ >> + if (!skip_updated) { >> + skip_updated = true; >> + if (test_and_set_skip(cc, page, low_pfn)) >> + goto isolate_fail_put; >> + } >> + >> /* Try isolate the page */ >> if (!TestClearPageLRU(page)) >> goto isolate_fail_put; > > I would have made this much sooner. Probably before you call > get_page_unless_zero so as to avoid the unnecessary atomic operations. > >> @@ -1006,13 +1013,6 @@ static bool too_many_isolated(pg_data_t *pgdat) >> >> lruvec_memcg_debug(lruvec, page); >> >> - /* Try get exclusive access under lock */ >> - if (!skip_updated) { >> - skip_updated = true; >> - if (test_and_set_skip(cc, page, low_pfn)) >> - goto isolate_abort; >> - } >> - >> /* >> * Page become compound since the non-locked check, >> * and it's on LRU. It can only be a THP so the order >> -- >> >> Performance of case-lru-file-mmap-read in vm-scalibity is dropped a bit. not >> helpful > > So one issue with this change is that it is still too late to be of > much benefit. Really you should probably be doing this much sooner, > for example somewhere before the get_page_unless_zero(). Also the > thing that still has
[PATCH v3 0/6] tracing: export event trace and trace_marker
Ftrace has ability to export trace packets to other destination. Currently, only function trace can be exported. This series extends the support to event trace and trace_maker. STM is one possible destination to export ftrace. Use separate channel for each CPU to avoid mixing up packets from different CPUs together. Change from v2: Change flag definition to BIT(). (Steven) Add comment in stm_ftrace_write() to clarify it's safe to use smp_processor_id() here since preempt is disabled. (Steven) Change from v1: All changes are suggested by Steven Rostedt. User separate flag to control function trace, event trace and trace mark. Allocate channels according to num_possible_cpu() dynamically. Move ftrace_exports routines up so all ftrace can use them. Tingwei Zhang (6): stm class: ftrace: change dependency to TRACING tracing: add flag to control different traces tracing: add trace_export support for event trace tracing: add trace_export support for trace_marker stm class: ftrace: enable supported trace export flag stm class: ftrace: use different channel accroding to CPU drivers/hwtracing/stm/Kconfig | 2 +- drivers/hwtracing/stm/ftrace.c | 7 +- include/linux/trace.h | 7 + kernel/trace/trace.c | 270 ++--- 4 files changed, 159 insertions(+), 127 deletions(-) -- The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project
[PATCH v3 2/6] tracing: add flag to control different traces
More traces like event trace or trace marker will be supported. Add flag for difference traces, so that they can be controlled separately. Move current function trace to it's own flag instead of global ftrace enable flag. Signed-off-by: Tingwei Zhang Reviewed-by: Steven Rostedt (VMware) --- include/linux/trace.h | 5 + kernel/trace/trace.c | 36 +++- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/include/linux/trace.h b/include/linux/trace.h index 7fd86d3c691f..bc93063dda39 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -3,6 +3,9 @@ #define _LINUX_TRACE_H #ifdef CONFIG_TRACING + +#define TRACE_EXPORT_FUNCTION BIT(0) + /* * The trace export - an export of Ftrace output. The trace_export * can process traces and export them to a registered destination as @@ -15,10 +18,12 @@ * next- pointer to the next trace_export * write - copy traces which have been delt with ->commit() to * the destination + * flags - which ftrace to be exported */ struct trace_export { struct trace_export __rcu *next; void (*write)(struct trace_export *, const void *, unsigned int); + int flags; }; int register_ftrace_export(struct trace_export *export); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index bb62269724d5..8f1e66831e9e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2747,33 +2747,37 @@ trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, static void trace_process_export(struct trace_export *export, - struct ring_buffer_event *event) + struct ring_buffer_event *event, int flag) { struct trace_entry *entry; unsigned int size = 0; - entry = ring_buffer_event_data(event); - size = ring_buffer_event_length(event); - export->write(export, entry, size); + if (export->flags & flag) { + entry = ring_buffer_event_data(event); + size = ring_buffer_event_length(event); + export->write(export, entry, size); + } } static DEFINE_MUTEX(ftrace_export_lock); static struct trace_export __rcu *ftrace_exports_list __read_mostly; -static DEFINE_STATIC_KEY_FALSE(ftrace_exports_enabled); +static DEFINE_STATIC_KEY_FALSE(trace_function_exports_enabled); -static inline void ftrace_exports_enable(void) +static inline void ftrace_exports_enable(struct trace_export *export) { - static_branch_enable(_exports_enabled); + if (export->flags & TRACE_EXPORT_FUNCTION) + static_branch_inc(_function_exports_enabled); } -static inline void ftrace_exports_disable(void) +static inline void ftrace_exports_disable(struct trace_export *export) { - static_branch_disable(_exports_enabled); + if (export->flags & TRACE_EXPORT_FUNCTION) + static_branch_dec(_function_exports_enabled); } -static void ftrace_exports(struct ring_buffer_event *event) +static void ftrace_exports(struct ring_buffer_event *event, int flag) { struct trace_export *export; @@ -2781,7 +2785,7 @@ static void ftrace_exports(struct ring_buffer_event *event) export = rcu_dereference_raw_check(ftrace_exports_list); while (export) { - trace_process_export(export, event); + trace_process_export(export, event, flag); export = rcu_dereference_raw_check(export->next); } @@ -2821,8 +2825,7 @@ rm_trace_export(struct trace_export **list, struct trace_export *export) static inline void add_ftrace_export(struct trace_export **list, struct trace_export *export) { - if (*list == NULL) - ftrace_exports_enable(); + ftrace_exports_enable(export); add_trace_export(list, export); } @@ -2833,8 +2836,7 @@ rm_ftrace_export(struct trace_export **list, struct trace_export *export) int ret; ret = rm_trace_export(list, export); - if (*list == NULL) - ftrace_exports_disable(); + ftrace_exports_disable(export); return ret; } @@ -2887,8 +2889,8 @@ trace_function(struct trace_array *tr, entry->parent_ip= parent_ip; if (!call_filter_check_discard(call, entry, buffer, event)) { - if (static_branch_unlikely(_exports_enabled)) - ftrace_exports(event); + if (static_branch_unlikely(_function_exports_enabled)) + ftrace_exports(event, TRACE_EXPORT_FUNCTION); __buffer_unlock_commit(buffer, event); } } -- The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project
[PATCH v3 3/6] tracing: add trace_export support for event trace
Only function traces can be exported to other destinations currently. This patch exports event trace as well. Move trace export related function to the beginning of file so other trace can call trace_process_export() to export. Signed-off-by: Tingwei Zhang Reviewed-by: Steven Rostedt (VMware) --- include/linux/trace.h | 1 + kernel/trace/trace.c | 259 ++ 2 files changed, 135 insertions(+), 125 deletions(-) diff --git a/include/linux/trace.h b/include/linux/trace.h index bc93063dda39..5a01eeffb254 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -5,6 +5,7 @@ #ifdef CONFIG_TRACING #define TRACE_EXPORT_FUNCTION BIT(0) +#define TRACE_EXPORT_EVENT BIT(1) /* * The trace export - an export of Ftrace output. The trace_export diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8f1e66831e9e..2f9302a8b322 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -251,6 +251,138 @@ unsigned long long ns2usecs(u64 nsec) return nsec; } +static void +trace_process_export(struct trace_export *export, + struct ring_buffer_event *event, int flag) +{ + struct trace_entry *entry; + unsigned int size = 0; + + if (export->flags & flag) { + entry = ring_buffer_event_data(event); + size = ring_buffer_event_length(event); + export->write(export, entry, size); + } +} + +static DEFINE_MUTEX(ftrace_export_lock); + +static struct trace_export __rcu *ftrace_exports_list __read_mostly; + +static DEFINE_STATIC_KEY_FALSE(trace_function_exports_enabled); +static DEFINE_STATIC_KEY_FALSE(trace_event_exports_enabled); + +static inline void ftrace_exports_enable(struct trace_export *export) +{ + if (export->flags & TRACE_EXPORT_FUNCTION) + static_branch_inc(_function_exports_enabled); + + if (export->flags & TRACE_EXPORT_EVENT) + static_branch_inc(_event_exports_enabled); +} + +static inline void ftrace_exports_disable(struct trace_export *export) +{ + if (export->flags & TRACE_EXPORT_FUNCTION) + static_branch_dec(_function_exports_enabled); + + if (export->flags & TRACE_EXPORT_EVENT) + static_branch_dec(_event_exports_enabled); +} + +static void ftrace_exports(struct ring_buffer_event *event, int flag) +{ + struct trace_export *export; + + preempt_disable_notrace(); + + export = rcu_dereference_raw_check(ftrace_exports_list); + while (export) { + trace_process_export(export, event, flag); + export = rcu_dereference_raw_check(export->next); + } + + preempt_enable_notrace(); +} + +static inline void +add_trace_export(struct trace_export **list, struct trace_export *export) +{ + rcu_assign_pointer(export->next, *list); + /* +* We are entering export into the list but another +* CPU might be walking that list. We need to make sure +* the export->next pointer is valid before another CPU sees +* the export pointer included into the list. +*/ + rcu_assign_pointer(*list, export); +} + +static inline int +rm_trace_export(struct trace_export **list, struct trace_export *export) +{ + struct trace_export **p; + + for (p = list; *p != NULL; p = &(*p)->next) + if (*p == export) + break; + + if (*p != export) + return -1; + + rcu_assign_pointer(*p, (*p)->next); + + return 0; +} + +static inline void +add_ftrace_export(struct trace_export **list, struct trace_export *export) +{ + ftrace_exports_enable(export); + + add_trace_export(list, export); +} + +static inline int +rm_ftrace_export(struct trace_export **list, struct trace_export *export) +{ + int ret; + + ret = rm_trace_export(list, export); + ftrace_exports_disable(export); + + return ret; +} + +int register_ftrace_export(struct trace_export *export) +{ + if (WARN_ON_ONCE(!export->write)) + return -1; + + mutex_lock(_export_lock); + + add_ftrace_export(_exports_list, export); + + mutex_unlock(_export_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(register_ftrace_export); + +int unregister_ftrace_export(struct trace_export *export) +{ + int ret; + + mutex_lock(_export_lock); + + ret = rm_ftrace_export(_exports_list, export); + + mutex_unlock(_export_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_export); + /* trace_flags holds trace_options default values */ #define TRACE_DEFAULT_FLAGS\ (FUNCTION_DEFAULT_FLAGS | \ @@ -2702,6 +2834,8 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) if (static_key_false(_printk_key.key)) output_printk(fbuffer); + if