Re: linux-next: manual merge of the userns tree with the mips tree
On Tue, Aug 08, 2017 at 03:10:04PM +1000, Stephen Rothwell wrote: (Maciej added to cc.) > Hi Eric, > > Today's linux-next merge of the userns tree got a conflict in: > > arch/mips/kernel/traps.c > > between commit: > > 260a789828aa ("MIPS: signal: Remove unreachable code from > force_fcr31_sig().") > > from the mips tree and commit: > > ea1b75cf9138 ("signal/mips: Document a conflict with SI_USER with SIGFPE") > > from the userns tree. > > I fixed it up (the former removed the code updated by the latter) and > can carry the fix as necessary. This is now fixed as far as linux-next > is concerned, but any non trivial conflicts should be mentioned to your > upstream maintainer when your tree is submitted for merging. You may > also want to consider cooperating with the maintainer of the conflicting > tree to minimise any particularly complex conflicts. Eric, after yesterday's emails on the topic I think commit ea1b75cf9138 ("signal/ mips: Document a conflict with SI_USER with SIGFPE") should be dropped. Ralf
Re: linux-next: manual merge of the userns tree with the mips tree
On Tue, Aug 08, 2017 at 03:10:04PM +1000, Stephen Rothwell wrote: (Maciej added to cc.) > Hi Eric, > > Today's linux-next merge of the userns tree got a conflict in: > > arch/mips/kernel/traps.c > > between commit: > > 260a789828aa ("MIPS: signal: Remove unreachable code from > force_fcr31_sig().") > > from the mips tree and commit: > > ea1b75cf9138 ("signal/mips: Document a conflict with SI_USER with SIGFPE") > > from the userns tree. > > I fixed it up (the former removed the code updated by the latter) and > can carry the fix as necessary. This is now fixed as far as linux-next > is concerned, but any non trivial conflicts should be mentioned to your > upstream maintainer when your tree is submitted for merging. You may > also want to consider cooperating with the maintainer of the conflicting > tree to minimise any particularly complex conflicts. Eric, after yesterday's emails on the topic I think commit ea1b75cf9138 ("signal/ mips: Document a conflict with SI_USER with SIGFPE") should be dropped. Ralf
[PATCH] mmc: mxcmmc: Handle return value of clk_prepare_enable
clk_prepare_enable() can fail here and we must check its return value. Signed-off-by: Arvind Yadav--- drivers/mmc/host/mxcmmc.c | 25 +++-- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/host/mxcmmc.c b/drivers/mmc/host/mxcmmc.c index fb3ca82..c016820 100644 --- a/drivers/mmc/host/mxcmmc.c +++ b/drivers/mmc/host/mxcmmc.c @@ -1098,8 +1098,13 @@ static int mxcmci_probe(struct platform_device *pdev) goto out_free; } - clk_prepare_enable(host->clk_per); - clk_prepare_enable(host->clk_ipg); + ret = clk_prepare_enable(host->clk_per); + if (ret) + goto out_free; + + ret = clk_prepare_enable(host->clk_ipg); + if (ret) + goto out_clk_per_put; mxcmci_softreset(host); @@ -1168,8 +1173,9 @@ static int mxcmci_probe(struct platform_device *pdev) dma_release_channel(host->dma); out_clk_put: - clk_disable_unprepare(host->clk_per); clk_disable_unprepare(host->clk_ipg); +out_clk_per_put: + clk_disable_unprepare(host->clk_per); out_free: mmc_free_host(mmc); @@ -1212,10 +1218,17 @@ static int __maybe_unused mxcmci_resume(struct device *dev) { struct mmc_host *mmc = dev_get_drvdata(dev); struct mxcmci_host *host = mmc_priv(mmc); + int ret; - clk_prepare_enable(host->clk_per); - clk_prepare_enable(host->clk_ipg); - return 0; + ret = clk_prepare_enable(host->clk_per); + if (ret) + return ret; + + ret = clk_prepare_enable(host->clk_ipg); + if (ret) + clk_disable_unprepare(host->clk_per); + + return ret; } static SIMPLE_DEV_PM_OPS(mxcmci_pm_ops, mxcmci_suspend, mxcmci_resume); -- 1.9.1
[PATCH] mmc: mxcmmc: Handle return value of clk_prepare_enable
clk_prepare_enable() can fail here and we must check its return value. Signed-off-by: Arvind Yadav --- drivers/mmc/host/mxcmmc.c | 25 +++-- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/host/mxcmmc.c b/drivers/mmc/host/mxcmmc.c index fb3ca82..c016820 100644 --- a/drivers/mmc/host/mxcmmc.c +++ b/drivers/mmc/host/mxcmmc.c @@ -1098,8 +1098,13 @@ static int mxcmci_probe(struct platform_device *pdev) goto out_free; } - clk_prepare_enable(host->clk_per); - clk_prepare_enable(host->clk_ipg); + ret = clk_prepare_enable(host->clk_per); + if (ret) + goto out_free; + + ret = clk_prepare_enable(host->clk_ipg); + if (ret) + goto out_clk_per_put; mxcmci_softreset(host); @@ -1168,8 +1173,9 @@ static int mxcmci_probe(struct platform_device *pdev) dma_release_channel(host->dma); out_clk_put: - clk_disable_unprepare(host->clk_per); clk_disable_unprepare(host->clk_ipg); +out_clk_per_put: + clk_disable_unprepare(host->clk_per); out_free: mmc_free_host(mmc); @@ -1212,10 +1218,17 @@ static int __maybe_unused mxcmci_resume(struct device *dev) { struct mmc_host *mmc = dev_get_drvdata(dev); struct mxcmci_host *host = mmc_priv(mmc); + int ret; - clk_prepare_enable(host->clk_per); - clk_prepare_enable(host->clk_ipg); - return 0; + ret = clk_prepare_enable(host->clk_per); + if (ret) + return ret; + + ret = clk_prepare_enable(host->clk_ipg); + if (ret) + clk_disable_unprepare(host->clk_per); + + return ret; } static SIMPLE_DEV_PM_OPS(mxcmci_pm_ops, mxcmci_suspend, mxcmci_resume); -- 1.9.1
Re: [lkp-robot] [mm] 7674270022: will-it-scale.per_process_ops -19.3% regression
Nadav Amitwrote: > Minchan Kim wrote: > >> Hi, >> >> On Tue, Aug 08, 2017 at 09:19:23AM +0800, kernel test robot wrote: >>> Greeting, >>> >>> FYI, we noticed a -19.3% regression of will-it-scale.per_process_ops due to >>> commit: >>> >>> >>> commit: 76742700225cad9df49f05399381ac3f1ec3dc60 ("mm: fix >>> MADV_[FREE|DONTNEED] TLB flush miss problem") >>> url: >>> https://github.com/0day-ci/linux/commits/Nadav-Amit/mm-migrate-prevent-racy-access-to-tlb_flush_pending/20170802-205715 >>> >>> >>> in testcase: will-it-scale >>> on test machine: 88 threads Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz with >>> 64G memory >>> with following parameters: >>> >>> nr_task: 16 >>> mode: process >>> test: brk1 >>> cpufreq_governor: performance >>> >>> test-description: Will It Scale takes a testcase and runs it from 1 through >>> to n parallel copies to see if the testcase will scale. It builds both a >>> process and threads based test in order to see any differences between the >>> two. >>> test-url: https://github.com/antonblanchard/will-it-scale >> >> Thanks for the report. >> Could you explain what kinds of workload you are testing? >> >> Does it calls frequently madvise(MADV_DONTNEED) in parallel on multiple >> threads? > > According to the description it is "testcase:brk increase/decrease of one > page”. According to the mode it spawns multiple processes, not threads. > > Since a single page is unmapped each time, and the iTLB-loads increase > dramatically, I would suspect that for some reason a full TLB flush is > caused during do_munmap(). > > If I find some free time, I’ll try to profile the workload - but feel free > to beat me to it. The root-cause appears to be that tlb_finish_mmu() does not call dec_tlb_flush_pending() - as it should. Any chance you can take care of it? Having said that it appears that cpumask_any_but() is really inefficient since it does not have an optimization for the case in which small_const_nbits(nbits)==true. 
When I find some free time, I’ll try to deal with it. Thanks, Nadav
Re: [lkp-robot] [mm] 7674270022: will-it-scale.per_process_ops -19.3% regression
Nadav Amit wrote: > Minchan Kim wrote: > >> Hi, >> >> On Tue, Aug 08, 2017 at 09:19:23AM +0800, kernel test robot wrote: >>> Greeting, >>> >>> FYI, we noticed a -19.3% regression of will-it-scale.per_process_ops due to >>> commit: >>> >>> >>> commit: 76742700225cad9df49f05399381ac3f1ec3dc60 ("mm: fix >>> MADV_[FREE|DONTNEED] TLB flush miss problem") >>> url: >>> https://github.com/0day-ci/linux/commits/Nadav-Amit/mm-migrate-prevent-racy-access-to-tlb_flush_pending/20170802-205715 >>> >>> >>> in testcase: will-it-scale >>> on test machine: 88 threads Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz with >>> 64G memory >>> with following parameters: >>> >>> nr_task: 16 >>> mode: process >>> test: brk1 >>> cpufreq_governor: performance >>> >>> test-description: Will It Scale takes a testcase and runs it from 1 through >>> to n parallel copies to see if the testcase will scale. It builds both a >>> process and threads based test in order to see any differences between the >>> two. >>> test-url: https://github.com/antonblanchard/will-it-scale >> >> Thanks for the report. >> Could you explain what kinds of workload you are testing? >> >> Does it calls frequently madvise(MADV_DONTNEED) in parallel on multiple >> threads? > > According to the description it is "testcase:brk increase/decrease of one > page”. According to the mode it spawns multiple processes, not threads. > > Since a single page is unmapped each time, and the iTLB-loads increase > dramatically, I would suspect that for some reason a full TLB flush is > caused during do_munmap(). > > If I find some free time, I’ll try to profile the workload - but feel free > to beat me to it. The root-cause appears to be that tlb_finish_mmu() does not call dec_tlb_flush_pending() - as it should. Any chance you can take care of it? Having said that it appears that cpumask_any_but() is really inefficient since it does not have an optimization for the case in which small_const_nbits(nbits)==true. 
When I find some free time, I’ll try to deal with it. Thanks, Nadav
[PATCH] mmc: wmt-sdmmc: Handle return value of clk_prepare_enable
clk_prepare_enable() can fail here and we must check its return value. Signed-off-by: Arvind Yadav --- drivers/mmc/host/wmt-sdmmc.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/wmt-sdmmc.c b/drivers/mmc/host/wmt-sdmmc.c index 21ebba8..e64f930 100644 --- a/drivers/mmc/host/wmt-sdmmc.c +++ b/drivers/mmc/host/wmt-sdmmc.c @@ -856,7 +856,9 @@ static int wmt_mci_probe(struct platform_device *pdev) goto fail5; } - clk_prepare_enable(priv->clk_sdmmc); + ret = clk_prepare_enable(priv->clk_sdmmc); + if (ret) + goto fail6; /* configure the controller to a known 'ready' state */ wmt_reset_hardware(mmc); @@ -866,6 +868,8 @@ static int wmt_mci_probe(struct platform_device *pdev) dev_info(&pdev->dev, "WMT SDHC Controller initialized\n"); return 0; +fail6: + clk_put(priv->clk_sdmmc); fail5: free_irq(dma_irq, priv); fail4: -- 1.9.1
[PATCH] mmc: wmt-sdmmc: Handle return value of clk_prepare_enable
clk_prepare_enable() can fail here and we must check its return value. Signed-off-by: Arvind Yadav --- drivers/mmc/host/wmt-sdmmc.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/wmt-sdmmc.c b/drivers/mmc/host/wmt-sdmmc.c index 21ebba8..e64f930 100644 --- a/drivers/mmc/host/wmt-sdmmc.c +++ b/drivers/mmc/host/wmt-sdmmc.c @@ -856,7 +856,9 @@ static int wmt_mci_probe(struct platform_device *pdev) goto fail5; } - clk_prepare_enable(priv->clk_sdmmc); + ret = clk_prepare_enable(priv->clk_sdmmc); + if (ret) + goto fail6; /* configure the controller to a known 'ready' state */ wmt_reset_hardware(mmc); @@ -866,6 +868,8 @@ static int wmt_mci_probe(struct platform_device *pdev) dev_info(>dev, "WMT SDHC Controller initialized\n"); return 0; +fail6: + clk_put(priv->clk_sdmmc); fail5: free_irq(dma_irq, priv); fail4: -- 1.9.1
[PATCH 1/4] usb: mtu3: add generic compatible string
The mtu3 driver is a generic driver for MediaTek usb3 DRD IP, add a generic compatible to avoid confusion when support new SoCs but use a compatible with specific SoC's name "mt8173". Signed-off-by: Chunfeng Yun --- drivers/usb/mtu3/mtu3_plat.c |1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/mtu3/mtu3_plat.c b/drivers/usb/mtu3/mtu3_plat.c index 0d3ebb3..088e3e6 100644 --- a/drivers/usb/mtu3/mtu3_plat.c +++ b/drivers/usb/mtu3/mtu3_plat.c @@ -500,6 +500,7 @@ static int __maybe_unused mtu3_resume(struct device *dev) static const struct of_device_id mtu3_of_match[] = { {.compatible = "mediatek,mt8173-mtu3",}, + {.compatible = "mediatek,mtu3",}, {}, }; -- 1.7.9.5
[PATCH 1/4] usb: mtu3: add generic compatible string
The mtu3 driver is a generic driver for MediaTek usb3 DRD IP, add a generic compatible to avoid confusion when support new SoCs but use a compatible with specific SoC's name "mt8173". Signed-off-by: Chunfeng Yun --- drivers/usb/mtu3/mtu3_plat.c |1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/mtu3/mtu3_plat.c b/drivers/usb/mtu3/mtu3_plat.c index 0d3ebb3..088e3e6 100644 --- a/drivers/usb/mtu3/mtu3_plat.c +++ b/drivers/usb/mtu3/mtu3_plat.c @@ -500,6 +500,7 @@ static int __maybe_unused mtu3_resume(struct device *dev) static const struct of_device_id mtu3_of_match[] = { {.compatible = "mediatek,mt8173-mtu3",}, + {.compatible = "mediatek,mtu3",}, {}, }; -- 1.7.9.5
[PATCH 2/4] usb: xhci-mtk: add generic compatible string
The xhci-mtk driver is a generic driver for MediaTek xHCI IP, add a generic compatible to avoid confusion when support new SoCs but use a compatible with specific SoC's name "mt8173". Signed-off-by: Chunfeng Yun --- drivers/usb/host/xhci-mtk.c |1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/host/xhci-mtk.c b/drivers/usb/host/xhci-mtk.c index 67d5dc7..d2934b9 100644 --- a/drivers/usb/host/xhci-mtk.c +++ b/drivers/usb/host/xhci-mtk.c @@ -795,6 +795,7 @@ static int __maybe_unused xhci_mtk_resume(struct device *dev) #ifdef CONFIG_OF static const struct of_device_id mtk_xhci_of_match[] = { { .compatible = "mediatek,mt8173-xhci"}, + { .compatible = "mediatek,xhci-mtk"}, { }, }; MODULE_DEVICE_TABLE(of, mtk_xhci_of_match); -- 1.7.9.5
[PATCH 2/4] usb: xhci-mtk: add generic compatible string
The xhci-mtk driver is a generic driver for MediaTek xHCI IP, add a generic compatible to avoid confusion when support new SoCs but use a compatible with specific SoC's name "mt8173". Signed-off-by: Chunfeng Yun --- drivers/usb/host/xhci-mtk.c |1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/host/xhci-mtk.c b/drivers/usb/host/xhci-mtk.c index 67d5dc7..d2934b9 100644 --- a/drivers/usb/host/xhci-mtk.c +++ b/drivers/usb/host/xhci-mtk.c @@ -795,6 +795,7 @@ static int __maybe_unused xhci_mtk_resume(struct device *dev) #ifdef CONFIG_OF static const struct of_device_id mtk_xhci_of_match[] = { { .compatible = "mediatek,mt8173-xhci"}, + { .compatible = "mediatek,xhci-mtk"}, { }, }; MODULE_DEVICE_TABLE(of, mtk_xhci_of_match); -- 1.7.9.5
[PATCH 3/4] dt-bindings: mt8173-mtu3: add generic compatible and rename file
The mt8173-mtu3.txt actually holds the bindings for all mediatek SoCs with usb3 DRD IP, so add a generic compatible and change the name to mtu3.txt. Signed-off-by: Chunfeng Yun--- .../bindings/usb/{mt8173-mtu3.txt => mtu3.txt} |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) rename Documentation/devicetree/bindings/usb/{mt8173-mtu3.txt => mtu3.txt} (95%) diff --git a/Documentation/devicetree/bindings/usb/mt8173-mtu3.txt b/Documentation/devicetree/bindings/usb/mtu3.txt similarity index 95% rename from Documentation/devicetree/bindings/usb/mt8173-mtu3.txt rename to Documentation/devicetree/bindings/usb/mtu3.txt index 1d7c3bc..832741d 100644 --- a/Documentation/devicetree/bindings/usb/mt8173-mtu3.txt +++ b/Documentation/devicetree/bindings/usb/mtu3.txt @@ -1,7 +1,9 @@ The device node for Mediatek USB3.0 DRD controller Required properties: - - compatible : should be "mediatek,mt8173-mtu3" + - compatible : should be one of + "mediatek,mt8173-mtu3" (deprecated, use "mediatek,mtu3" instead), + "mediatek,mtu3" - reg : specifies physical base address and size of the registers - reg-names: should be "mac" for device IP and "ippc" for IP port control - interrupts : interrupt used by the device IP @@ -44,7 +46,7 @@ Optional properties: Sub-nodes: The xhci should be added as subnode to mtu3 as shown in the following example if host mode is enabled. The DT binding details of xhci can be found in: -Documentation/devicetree/bindings/usb/mt8173-xhci.txt +Documentation/devicetree/bindings/usb/xhci-mtk.txt Example: ssusb: usb@11271000 { -- 1.7.9.5
[PATCH 3/4] dt-bindings: mt8173-mtu3: add generic compatible and rename file
The mt8173-mtu3.txt actually holds the bindings for all mediatek SoCs with usb3 DRD IP, so add a generic compatible and change the name to mtu3.txt. Signed-off-by: Chunfeng Yun --- .../bindings/usb/{mt8173-mtu3.txt => mtu3.txt} |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) rename Documentation/devicetree/bindings/usb/{mt8173-mtu3.txt => mtu3.txt} (95%) diff --git a/Documentation/devicetree/bindings/usb/mt8173-mtu3.txt b/Documentation/devicetree/bindings/usb/mtu3.txt similarity index 95% rename from Documentation/devicetree/bindings/usb/mt8173-mtu3.txt rename to Documentation/devicetree/bindings/usb/mtu3.txt index 1d7c3bc..832741d 100644 --- a/Documentation/devicetree/bindings/usb/mt8173-mtu3.txt +++ b/Documentation/devicetree/bindings/usb/mtu3.txt @@ -1,7 +1,9 @@ The device node for Mediatek USB3.0 DRD controller Required properties: - - compatible : should be "mediatek,mt8173-mtu3" + - compatible : should be one of + "mediatek,mt8173-mtu3" (deprecated, use "mediatek,mtu3" instead), + "mediatek,mtu3" - reg : specifies physical base address and size of the registers - reg-names: should be "mac" for device IP and "ippc" for IP port control - interrupts : interrupt used by the device IP @@ -44,7 +46,7 @@ Optional properties: Sub-nodes: The xhci should be added as subnode to mtu3 as shown in the following example if host mode is enabled. The DT binding details of xhci can be found in: -Documentation/devicetree/bindings/usb/mt8173-xhci.txt +Documentation/devicetree/bindings/usb/xhci-mtk.txt Example: ssusb: usb@11271000 { -- 1.7.9.5
[PATCH 4/4] dt-bindings: mt8173-xhci: add generic compatible and rename file
The mt8173-xhci.txt actually holds the bindings for all mediatek SoCs with xHCI controller, so add a generic compatible and change the name to xhci-mtk.txt to reflect that. Signed-off-by: Chunfeng Yun--- .../bindings/usb/{mt8173-xhci.txt => xhci-mtk.txt} | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) rename Documentation/devicetree/bindings/usb/{mt8173-xhci.txt => xhci-mtk.txt} (92%) diff --git a/Documentation/devicetree/bindings/usb/mt8173-xhci.txt b/Documentation/devicetree/bindings/usb/xhci-mtk.txt similarity index 92% rename from Documentation/devicetree/bindings/usb/mt8173-xhci.txt rename to Documentation/devicetree/bindings/usb/xhci-mtk.txt index 0acfc8a..1ce77c7 100644 --- a/Documentation/devicetree/bindings/usb/mt8173-xhci.txt +++ b/Documentation/devicetree/bindings/usb/xhci-mtk.txt @@ -11,7 +11,9 @@ into two parts. Required properties: - - compatible : should contain "mediatek,mt8173-xhci" + - compatible : should be one of + "mediatek,mt8173-xhci" (deprecated, use "mediatek,xhci-mtk" instead), + "mediatek,xhci-mtk" - reg : specifies physical base address and size of the registers - reg-names: should be "mac" for xHCI MAC and "ippc" for IP port control - interrupts : interrupt used by the controller @@ -68,10 +70,12 @@ usb30: usb@1127 { In the case, xhci is added as subnode to mtu3. An example and the DT binding details of mtu3 can be found in: -Documentation/devicetree/bindings/usb/mt8173-mtu3.txt +Documentation/devicetree/bindings/usb/mtu3.txt Required properties: - - compatible : should contain "mediatek,mt8173-xhci" + - compatible : should be one of + "mediatek,mt8173-xhci" (deprecated, use "mediatek,xhci-mtk" instead), + "mediatek,xhci-mtk" - reg : specifies physical base address and size of the registers - reg-names: should be "mac" for xHCI MAC - interrupts : interrupt used by the host controller -- 1.7.9.5
[PATCH 4/4] dt-bindings: mt8173-xhci: add generic compatible and rename file
The mt8173-xhci.txt actually holds the bindings for all mediatek SoCs with xHCI controller, so add a generic compatible and change the name to xhci-mtk.txt to reflect that. Signed-off-by: Chunfeng Yun --- .../bindings/usb/{mt8173-xhci.txt => xhci-mtk.txt} | 10 +++--- 1 file changed, 7 insertions(+), 3 deletions(-) rename Documentation/devicetree/bindings/usb/{mt8173-xhci.txt => xhci-mtk.txt} (92%) diff --git a/Documentation/devicetree/bindings/usb/mt8173-xhci.txt b/Documentation/devicetree/bindings/usb/xhci-mtk.txt similarity index 92% rename from Documentation/devicetree/bindings/usb/mt8173-xhci.txt rename to Documentation/devicetree/bindings/usb/xhci-mtk.txt index 0acfc8a..1ce77c7 100644 --- a/Documentation/devicetree/bindings/usb/mt8173-xhci.txt +++ b/Documentation/devicetree/bindings/usb/xhci-mtk.txt @@ -11,7 +11,9 @@ into two parts. Required properties: - - compatible : should contain "mediatek,mt8173-xhci" + - compatible : should be one of + "mediatek,mt8173-xhci" (deprecated, use "mediatek,xhci-mtk" instead), + "mediatek,xhci-mtk" - reg : specifies physical base address and size of the registers - reg-names: should be "mac" for xHCI MAC and "ippc" for IP port control - interrupts : interrupt used by the controller @@ -68,10 +70,12 @@ usb30: usb@1127 { In the case, xhci is added as subnode to mtu3. An example and the DT binding details of mtu3 can be found in: -Documentation/devicetree/bindings/usb/mt8173-mtu3.txt +Documentation/devicetree/bindings/usb/mtu3.txt Required properties: - - compatible : should contain "mediatek,mt8173-xhci" + - compatible : should be one of + "mediatek,mt8173-xhci" (deprecated, use "mediatek,xhci-mtk" instead), + "mediatek,xhci-mtk" - reg : specifies physical base address and size of the registers - reg-names: should be "mac" for xHCI MAC - interrupts : interrupt used by the host controller -- 1.7.9.5
[PATCH] ARM: dts: DRA7: Add pcie1 dt node for EP mode
Add pcie1 dt node in order for the controller to operate in endpoint mode. However since none of the dra7 based boards have slots configured to operate in endpoint mode, keep EP mode disabled. Signed-off-by: Kishon Vijay Abraham I--- arch/arm/boot/dts/am571x-idk.dts| 9 + arch/arm/boot/dts/am572x-idk.dts| 7 ++- arch/arm/boot/dts/am57xx-beagle-x15-common.dtsi | 7 ++- arch/arm/boot/dts/dra7-evm.dts | 4 arch/arm/boot/dts/dra7.dtsi | 23 ++- arch/arm/boot/dts/dra72-evm-common.dtsi | 4 6 files changed, 51 insertions(+), 3 deletions(-) diff --git a/arch/arm/boot/dts/am571x-idk.dts b/arch/arm/boot/dts/am571x-idk.dts index adc70fb091a2..0c0bb4e93f25 100644 --- a/arch/arm/boot/dts/am571x-idk.dts +++ b/arch/arm/boot/dts/am571x-idk.dts @@ -96,3 +96,12 @@ status = "okay"; }; }; + +_rc { + status = "okay"; + gpios = < 23 GPIO_ACTIVE_HIGH>; +}; + +_ep { + gpios = < 23 GPIO_ACTIVE_HIGH>; +}; diff --git a/arch/arm/boot/dts/am572x-idk.dts b/arch/arm/boot/dts/am572x-idk.dts index 940fcbe5380b..5ff75004afcf 100644 --- a/arch/arm/boot/dts/am572x-idk.dts +++ b/arch/arm/boot/dts/am572x-idk.dts @@ -88,7 +88,12 @@ load-gpios = < 19 GPIO_ACTIVE_LOW>; }; - { +_rc { + status = "okay"; + gpios = < 23 GPIO_ACTIVE_HIGH>; +}; + +_ep { gpios = < 23 GPIO_ACTIVE_HIGH>; }; diff --git a/arch/arm/boot/dts/am57xx-beagle-x15-common.dtsi b/arch/arm/boot/dts/am57xx-beagle-x15-common.dtsi index fdfe5b16b806..d433a50cd18a 100644 --- a/arch/arm/boot/dts/am57xx-beagle-x15-common.dtsi +++ b/arch/arm/boot/dts/am57xx-beagle-x15-common.dtsi @@ -570,7 +570,12 @@ }; }; - { +_rc { + status = "ok"; + gpios = < 8 GPIO_ACTIVE_LOW>; +}; + +_ep { gpios = < 8 GPIO_ACTIVE_LOW>; }; diff --git a/arch/arm/boot/dts/dra7-evm.dts b/arch/arm/boot/dts/dra7-evm.dts index f47fc4daf062..57bd75909d96 100644 --- a/arch/arm/boot/dts/dra7-evm.dts +++ b/arch/arm/boot/dts/dra7-evm.dts @@ -720,3 +720,7 @@ status = "okay"; }; }; + +_rc { + status = "okay"; +}; diff --git a/arch/arm/boot/dts/dra7.dtsi b/arch/arm/boot/dts/dra7.dtsi index 
0f0f6f58bd18..e6f2c6a15dc1 100644 --- a/arch/arm/boot/dts/dra7.dtsi +++ b/arch/arm/boot/dts/dra7.dtsi @@ -196,6 +196,7 @@ scm_conf1: scm_conf@1c04 { compatible = "syscon"; reg = <0x1c04 0x0020>; + #syscon-cells = <2>; }; scm_conf_pcie: scm_conf@1c24 { @@ -287,7 +288,11 @@ #address-cells = <1>; ranges = <0x5100 0x5100 0x3000 0x00x2000 0x1000>; - pcie1: pcie@5100 { + /** +* To enable PCI endpoint mode, disable the pcie1_rc +* node and enable pcie1_ep mode. +*/ + pcie1_rc: pcie@5100 { compatible = "ti,dra7-pcie"; reg = <0x5100 0x2000>, <0x51002000 0x14c>, <0x1000 0x2000>; reg-names = "rc_dbics", "ti_conf", "config"; @@ -309,12 +314,28 @@ <0 0 0 2 _intc 2>, <0 0 0 3 _intc 3>, <0 0 0 4 _intc 4>; + status = "disabled"; pcie1_intc: interrupt-controller { interrupt-controller; #address-cells = <0>; #interrupt-cells = <1>; }; }; + + pcie1_ep: pcie_ep@5100 { + compatible = "ti,dra7-pcie-ep"; + reg = <0x5100 0x28>, <0x51002000 0x14c>, <0x51001000 0x28>, <0x1000 0x1000>; + reg-names = "ep_dbics", "ti_conf", "ep_dbics2", "addr_space"; + interrupts = <0 232 0x4>; + num-lanes = <1>; + num-ib-windows = <4>; + num-ob-windows = <16>; + ti,hwmods = "pcie1"; + phys = <_phy>; + phy-names = "pcie-phy0"; + ti,syscon-unaligned-access = <_conf1 0x14 2>; + status = "disabled"; + }; }; axi@1 { diff --git
[PATCH] ARM: dts: DRA7: Add pcie1 dt node for EP mode
Add pcie1 dt node in order for the controller to operate in endpoint mode. However since none of the dra7 based boards have slots configured to operate in endpoint mode, keep EP mode disabled. Signed-off-by: Kishon Vijay Abraham I --- arch/arm/boot/dts/am571x-idk.dts| 9 + arch/arm/boot/dts/am572x-idk.dts| 7 ++- arch/arm/boot/dts/am57xx-beagle-x15-common.dtsi | 7 ++- arch/arm/boot/dts/dra7-evm.dts | 4 arch/arm/boot/dts/dra7.dtsi | 23 ++- arch/arm/boot/dts/dra72-evm-common.dtsi | 4 6 files changed, 51 insertions(+), 3 deletions(-) diff --git a/arch/arm/boot/dts/am571x-idk.dts b/arch/arm/boot/dts/am571x-idk.dts index adc70fb091a2..0c0bb4e93f25 100644 --- a/arch/arm/boot/dts/am571x-idk.dts +++ b/arch/arm/boot/dts/am571x-idk.dts @@ -96,3 +96,12 @@ status = "okay"; }; }; + +_rc { + status = "okay"; + gpios = < 23 GPIO_ACTIVE_HIGH>; +}; + +_ep { + gpios = < 23 GPIO_ACTIVE_HIGH>; +}; diff --git a/arch/arm/boot/dts/am572x-idk.dts b/arch/arm/boot/dts/am572x-idk.dts index 940fcbe5380b..5ff75004afcf 100644 --- a/arch/arm/boot/dts/am572x-idk.dts +++ b/arch/arm/boot/dts/am572x-idk.dts @@ -88,7 +88,12 @@ load-gpios = < 19 GPIO_ACTIVE_LOW>; }; - { +_rc { + status = "okay"; + gpios = < 23 GPIO_ACTIVE_HIGH>; +}; + +_ep { gpios = < 23 GPIO_ACTIVE_HIGH>; }; diff --git a/arch/arm/boot/dts/am57xx-beagle-x15-common.dtsi b/arch/arm/boot/dts/am57xx-beagle-x15-common.dtsi index fdfe5b16b806..d433a50cd18a 100644 --- a/arch/arm/boot/dts/am57xx-beagle-x15-common.dtsi +++ b/arch/arm/boot/dts/am57xx-beagle-x15-common.dtsi @@ -570,7 +570,12 @@ }; }; - { +_rc { + status = "ok"; + gpios = < 8 GPIO_ACTIVE_LOW>; +}; + +_ep { gpios = < 8 GPIO_ACTIVE_LOW>; }; diff --git a/arch/arm/boot/dts/dra7-evm.dts b/arch/arm/boot/dts/dra7-evm.dts index f47fc4daf062..57bd75909d96 100644 --- a/arch/arm/boot/dts/dra7-evm.dts +++ b/arch/arm/boot/dts/dra7-evm.dts @@ -720,3 +720,7 @@ status = "okay"; }; }; + +_rc { + status = "okay"; +}; diff --git a/arch/arm/boot/dts/dra7.dtsi b/arch/arm/boot/dts/dra7.dtsi index 
0f0f6f58bd18..e6f2c6a15dc1 100644 --- a/arch/arm/boot/dts/dra7.dtsi +++ b/arch/arm/boot/dts/dra7.dtsi @@ -196,6 +196,7 @@ scm_conf1: scm_conf@1c04 { compatible = "syscon"; reg = <0x1c04 0x0020>; + #syscon-cells = <2>; }; scm_conf_pcie: scm_conf@1c24 { @@ -287,7 +288,11 @@ #address-cells = <1>; ranges = <0x5100 0x5100 0x3000 0x00x2000 0x1000>; - pcie1: pcie@5100 { + /** +* To enable PCI endpoint mode, disable the pcie1_rc +* node and enable pcie1_ep mode. +*/ + pcie1_rc: pcie@5100 { compatible = "ti,dra7-pcie"; reg = <0x5100 0x2000>, <0x51002000 0x14c>, <0x1000 0x2000>; reg-names = "rc_dbics", "ti_conf", "config"; @@ -309,12 +314,28 @@ <0 0 0 2 _intc 2>, <0 0 0 3 _intc 3>, <0 0 0 4 _intc 4>; + status = "disabled"; pcie1_intc: interrupt-controller { interrupt-controller; #address-cells = <0>; #interrupt-cells = <1>; }; }; + + pcie1_ep: pcie_ep@5100 { + compatible = "ti,dra7-pcie-ep"; + reg = <0x5100 0x28>, <0x51002000 0x14c>, <0x51001000 0x28>, <0x1000 0x1000>; + reg-names = "ep_dbics", "ti_conf", "ep_dbics2", "addr_space"; + interrupts = <0 232 0x4>; + num-lanes = <1>; + num-ib-windows = <4>; + num-ob-windows = <16>; + ti,hwmods = "pcie1"; + phys = <_phy>; + phy-names = "pcie-phy0"; + ti,syscon-unaligned-access = <_conf1 0x14 2>; + status = "disabled"; + }; }; axi@1 { diff --git
[PATCH] memory: mtk-smi: Handle return value of clk_prepare_enable
clk_prepare_enable() can fail here and we must check its return value. Signed-off-by: Arvind Yadav --- drivers/memory/mtk-smi.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/memory/mtk-smi.c b/drivers/memory/mtk-smi.c index 4afbc41..edf36f0 100644 --- a/drivers/memory/mtk-smi.c +++ b/drivers/memory/mtk-smi.c @@ -321,6 +321,7 @@ static int mtk_smi_common_probe(struct platform_device *pdev) struct resource *res; const struct of_device_id *of_id; enum mtk_smi_gen smi_gen; + int ret; if (!dev->pm_domain) return -EPROBE_DEFER; @@ -359,7 +360,9 @@ static int mtk_smi_common_probe(struct platform_device *pdev) if (IS_ERR(common->clk_async)) return PTR_ERR(common->clk_async); - clk_prepare_enable(common->clk_async); + ret = clk_prepare_enable(common->clk_async); + if (ret) + return ret; } pm_runtime_enable(dev); platform_set_drvdata(pdev, common); -- 1.9.1
[PATCH] memory: mtk-smi: Handle return value of clk_prepare_enable
clk_prepare_enable() can fail here and we must check its return value. Signed-off-by: Arvind Yadav --- drivers/memory/mtk-smi.c | 5 - 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/memory/mtk-smi.c b/drivers/memory/mtk-smi.c index 4afbc41..edf36f0 100644 --- a/drivers/memory/mtk-smi.c +++ b/drivers/memory/mtk-smi.c @@ -321,6 +321,7 @@ static int mtk_smi_common_probe(struct platform_device *pdev) struct resource *res; const struct of_device_id *of_id; enum mtk_smi_gen smi_gen; + int ret; if (!dev->pm_domain) return -EPROBE_DEFER; @@ -359,7 +360,9 @@ static int mtk_smi_common_probe(struct platform_device *pdev) if (IS_ERR(common->clk_async)) return PTR_ERR(common->clk_async); - clk_prepare_enable(common->clk_async); + ret = clk_prepare_enable(common->clk_async); + if (ret) + return ret; } pm_runtime_enable(dev); platform_set_drvdata(pdev, common); -- 1.9.1
Re: [RFC PATCH 0/1] Add hugetlbfs support to memfd_create()
Hi, I am one foot out of office and will be offline for two days so I didn't get to review the patch yet but this information is an useful information about the usecase that should be in the patch directly for future reference. On Mon 07-08-17 16:47:51, Mike Kravetz wrote: > This patch came out of discussions in this e-mail thread [1]. > > The Oracle JVM team is developing a new garbage collection model. This > new model requires multiple mappings of the same anonymous memory. One > straight forward way to accomplish this is with memfd_create. They can > use the returned fd to create multiple mappings of the same memory. > > The JVM today has an option to use (static hugetlb) huge pages. If this > option is specified, they would like to use the same garbage collection > model requiring multiple mappings to the same memory. Using hugetlbfs, > it is possible to explicitly mount a filesystem and specify file paths > in order to get an fd that can be used for multiple mappings. However, > this introduces additional system admin work and coordination. > > Ideally they would like to get a hugetlbfs fd without requiring explicit > mounting of a filesystem. Today, mmap and shmget can make use of > hugetlbfs without explicitly mounting a filesystem. The patch adds this > functionality to hugetlbfs. > > A new flag MFD_HUGETLB is introduced to request a hugetlbfs file. Like > other system calls where hugetlb can be requested, the huge page size > can be encoded in the flags argument is the non-default huge page size > is desired. hugetlbfs does not support sealing operations, therefore > specifying MFD_ALLOW_SEALING with MFD_HUGETLB will result in EINVAL. > > Of course, the memfd_man page would need updating if this type of > functionality moves forward. 
> > [1] https://lkml.org/lkml/2017/7/6/564 > > Mike Kravetz (1): > mm/shmem: add hugetlbfs support to memfd_create() > > include/uapi/linux/memfd.h | 24 > mm/shmem.c | 37 +++-- > 2 files changed, 55 insertions(+), 6 deletions(-) > > -- > 2.7.5 -- Michal Hocko SUSE Labs
Re: [RFC PATCH 0/1] Add hugetlbfs support to memfd_create()
Hi, I am one foot out of office and will be offline for two days so I didn't get to review the patch yet but this is useful information about the use case that should be in the patch directly for future reference. On Mon 07-08-17 16:47:51, Mike Kravetz wrote: > This patch came out of discussions in this e-mail thread [1]. > > The Oracle JVM team is developing a new garbage collection model. This > new model requires multiple mappings of the same anonymous memory. One > straight forward way to accomplish this is with memfd_create. They can > use the returned fd to create multiple mappings of the same memory. > > The JVM today has an option to use (static hugetlb) huge pages. If this > option is specified, they would like to use the same garbage collection > model requiring multiple mappings to the same memory. Using hugetlbfs, > it is possible to explicitly mount a filesystem and specify file paths > in order to get an fd that can be used for multiple mappings. However, > this introduces additional system admin work and coordination. > > Ideally they would like to get a hugetlbfs fd without requiring explicit > mounting of a filesystem. Today, mmap and shmget can make use of > hugetlbfs without explicitly mounting a filesystem. The patch adds this > functionality to hugetlbfs. > > A new flag MFD_HUGETLB is introduced to request a hugetlbfs file. Like > other system calls where hugetlb can be requested, the huge page size > can be encoded in the flags argument if the non-default huge page size > is desired. hugetlbfs does not support sealing operations, therefore > specifying MFD_ALLOW_SEALING with MFD_HUGETLB will result in EINVAL. > > Of course, the memfd_create man page would need updating if this type of > functionality moves forward. 
> > [1] https://lkml.org/lkml/2017/7/6/564 > > Mike Kravetz (1): > mm/shmem: add hugetlbfs support to memfd_create() > > include/uapi/linux/memfd.h | 24 > mm/shmem.c | 37 +++-- > 2 files changed, 55 insertions(+), 6 deletions(-) > > -- > 2.7.5 -- Michal Hocko SUSE Labs
Re: [PATCH] mm: ratelimit PFNs busy info message
Andrew Mortonwrites: > On Wed, 2 Aug 2017 13:44:57 -0400 Jonathan Toppins > wrote: > >> The RDMA subsystem can generate several thousand of these messages per >> second eventually leading to a kernel crash. Ratelimit these messages >> to prevent this crash. > > Well... why are all these EBUSY's occurring? It sounds inefficient (at > least) but if it is expected, normal and unavoidable then perhaps we > should just remove that message altogether? We see them on powerpc sometimes when CMA is unable to make large allocations for the hash table of a KVM guest. At least in that context they're not useful, CMA will try the allocation again, and if it really can't allocate then CMA will print more useful information itself. So I'd vote for dropping the message and letting the callers decide what to do. cheers
Re: [PATCH] mm: ratelimit PFNs busy info message
Andrew Morton writes: > On Wed, 2 Aug 2017 13:44:57 -0400 Jonathan Toppins > wrote: > >> The RDMA subsystem can generate several thousand of these messages per >> second eventually leading to a kernel crash. Ratelimit these messages >> to prevent this crash. > > Well... why are all these EBUSY's occurring? It sounds inefficient (at > least) but if it is expected, normal and unavoidable then perhaps we > should just remove that message altogether? We see them on powerpc sometimes when CMA is unable to make large allocations for the hash table of a KVM guest. At least in that context they're not useful, CMA will try the allocation again, and if it really can't allocate then CMA will print more useful information itself. So I'd vote for dropping the message and letting the callers decide what to do. cheers
Re: [RESEND PATCH] bcache: Don't reinvent the wheel but use existing llist API
On 2017/8/8 下午12:12, Byungchul Park wrote: > On Mon, Aug 07, 2017 at 06:18:35PM +0800, Coly Li wrote: >> On 2017/8/7 下午4:38, Byungchul Park wrote: >>> Although llist provides proper APIs, they are not used. Make them used. >>> >>> Signed-off-by: Byungchul Park> Only have a question about why not using llist_for_each_entry(), it's > > Hello, > > The reason is to keep the original logic unchanged. The logic already > does as if it's the safe version against removal. > >> still OK with llist_for_each_entry_safe(). The rested part is good to me. >> >> Acked-by: Coly Li >> >>> --- >>> drivers/md/bcache/closure.c | 17 +++-- >>> 1 file changed, 3 insertions(+), 14 deletions(-) >>> >>> diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c >>> index 864e673..1841d03 100644 >>> --- a/drivers/md/bcache/closure.c >>> +++ b/drivers/md/bcache/closure.c >>> @@ -64,27 +64,16 @@ void closure_put(struct closure *cl) >>> void __closure_wake_up(struct closure_waitlist *wait_list) >>> { >>> struct llist_node *list; >>> - struct closure *cl; >>> + struct closure *cl, *t; >>> struct llist_node *reverse = NULL; >>> >>> list = llist_del_all(_list->list); >>> >>> /* We first reverse the list to preserve FIFO ordering and fairness */ >>> - >>> - while (list) { >>> - struct llist_node *t = list; >>> - list = llist_next(list); >>> - >>> - t->next = reverse; >>> - reverse = t; >>> - } >>> + reverse = llist_reverse_order(list); >>> >>> /* Then do the wakeups */ >>> - >>> - while (reverse) { >>> - cl = container_of(reverse, struct closure, list); >>> - reverse = llist_next(reverse); >>> - >>> + llist_for_each_entry_safe(cl, t, reverse, list) { >> >> Just wondering why not using llist_for_each_entry(), or you use the >> _safe version on purpose ? > > If I use llist_for_each_entry(), then it would change the original > behavior. Is it ok? > I feel llist_for_each_entry() keeps the original behavior, and variable 't' can be removed. 
Anyway, either llist_for_each_entry() or llist_for_each_entry_safe() works correctly here. Either one you use is OK with me; thanks for your informative reply :-) -- Coly Li
Re: [RESEND PATCH] bcache: Don't reinvent the wheel but use existing llist API
On 2017/8/8 下午12:12, Byungchul Park wrote: > On Mon, Aug 07, 2017 at 06:18:35PM +0800, Coly Li wrote: >> On 2017/8/7 下午4:38, Byungchul Park wrote: >>> Although llist provides proper APIs, they are not used. Make them used. >>> >>> Signed-off-by: Byungchul Park > Only have a question about why not using llist_for_each_entry(), it's > > Hello, > > The reason is to keep the original logic unchanged. The logic already > does as if it's the safe version against removal. > >> still OK with llist_for_each_entry_safe(). The rested part is good to me. >> >> Acked-by: Coly Li >> >>> --- >>> drivers/md/bcache/closure.c | 17 +++-- >>> 1 file changed, 3 insertions(+), 14 deletions(-) >>> >>> diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c >>> index 864e673..1841d03 100644 >>> --- a/drivers/md/bcache/closure.c >>> +++ b/drivers/md/bcache/closure.c >>> @@ -64,27 +64,16 @@ void closure_put(struct closure *cl) >>> void __closure_wake_up(struct closure_waitlist *wait_list) >>> { >>> struct llist_node *list; >>> - struct closure *cl; >>> + struct closure *cl, *t; >>> struct llist_node *reverse = NULL; >>> >>> list = llist_del_all(_list->list); >>> >>> /* We first reverse the list to preserve FIFO ordering and fairness */ >>> - >>> - while (list) { >>> - struct llist_node *t = list; >>> - list = llist_next(list); >>> - >>> - t->next = reverse; >>> - reverse = t; >>> - } >>> + reverse = llist_reverse_order(list); >>> >>> /* Then do the wakeups */ >>> - >>> - while (reverse) { >>> - cl = container_of(reverse, struct closure, list); >>> - reverse = llist_next(reverse); >>> - >>> + llist_for_each_entry_safe(cl, t, reverse, list) { >> >> Just wondering why not using llist_for_each_entry(), or you use the >> _safe version on purpose ? > > If I use llist_for_each_entry(), then it would change the original > behavior. Is it ok? > I feel llist_for_each_entry() keeps the original behavior, and variable 't' can be removed. 
Anyway, either llist_for_each_entry() or llist_for_each_entry_safe() works correctly here. Either one you use is OK with me; thanks for your informative reply :-) -- Coly Li
Re: [PATCH] arm64: correct modules range of kernel virtual memory layout
On Tue, 2017-08-08 at 12:44 +0800, Miles Chen wrote: > On Mon, 2017-08-07 at 15:01 +0100, Will Deacon wrote: > > On Mon, Aug 07, 2017 at 02:18:00PM +0100, Ard Biesheuvel wrote: > > > On 7 August 2017 at 14:16, Will Deaconwrote: > > > > On Mon, Aug 07, 2017 at 07:04:46PM +0800, Miles Chen wrote: > > > >> The commit f80fb3a3d508 ("arm64: add support for kernel ASLR") > > > >> moved module virtual address to > > > >> [module_alloc_base, module_alloc_base + MODULES_VSIZE). > > > >> > > > >> Display module information of the virtual kernel > > > >> memory layout by using module_alloc_base. > > > >> > > > >> testing output: > > > >> 1) Current implementation: > > > >> Virtual kernel memory layout: > > > >> modules : 0xff80 - 0xff800800 ( 128 MB) > > > >> 2) this patch + KASLR: > > > >> Virtual kernel memory layout: > > > >> modules : 0xff800056 - 0xff800856 ( 128 MB) > > > >> 3) this patch + KASLR and a dummy seed: > > > >> Virtual kernel memory layout: > > > >> modules : 0xffa7df637000 - 0xffa7e7637000 ( 128 MB) > > > >> > > > >> Signed-off-by: Miles Chen > > > >> --- > > > >> arch/arm64/mm/init.c | 5 +++-- > > > >> 1 file changed, 3 insertions(+), 2 deletions(-) > > > > > > > > Does this mean the modules code in our pt dumper is busted > > > > (arch/arm64/mm/dump.c)? Also, what about KASAN, which uses these > > > > addresses > > > > too (in kasan_init)? Should we just remove MODULES_VADDR and MODULES_END > > > > altogether? > > > > > > > > > > I don't think we need this patch. The 'module' line simply prints the > > > VA region that is reserved for modules. The fact that we end up > > > putting them elsewhere when running randomized does not necessarily > > > mean this line should reflect that. > > > > I was more concerned by other users of MODULES_VADDR tbh, although I see > > now that we don't randomize the module region if kasan is enabled. 
Still, > > the kcore code adds the modules region as a separate area (distinct from > > vmalloc) if MODULES_VADDR is defined, the page table dumping code uses > > MODULES_VADDR to identify the module region and I think we'll get false > > positives from is_vmalloc_or_module_addr, which again uses the static > > region. > > > > So, given that MODULES_VADDR never points at the module area, can't we get > > rid of it? > > Agreed. MODULES_VADDR should be phased out. Considering the kernel > modules live somewhere between [VMALLOC_START, VMALLOC_END) now: > (arch/arm64/kernel/module.c:module_alloc). I suggest the following > changes: > > 1. is_vmalloc_or_module_addr() should return is_vmalloc_addr() directly > 2. arch/arm64/mm/dump.c does not need MODULES_VADDR and MODULES_END. > 3. kasan uses [module_alloc_base, module_alloc_base + MODULES_VSIZE) to > get the shadow memory? (the kernel modules still live in this range when > kasan is enabled) > 4. remove modules line in kernel memory layout > (optional, thanks for Ard's feedback) > 5. remove MODULE_VADDR, MODULES_END definition I was wrong about this. is_vmalloc_or_module_addr() is defined in mm/vmalloc and it uses MODULES_VADDR and MODULES_END. Maybe it is better to give MODULES_VADDR and MODULES_END proper values, not remove them. > Miles > > > > Will > >
Re: [PATCH] arm64: correct modules range of kernel virtual memory layout
On Tue, 2017-08-08 at 12:44 +0800, Miles Chen wrote: > On Mon, 2017-08-07 at 15:01 +0100, Will Deacon wrote: > > On Mon, Aug 07, 2017 at 02:18:00PM +0100, Ard Biesheuvel wrote: > > > On 7 August 2017 at 14:16, Will Deacon wrote: > > > > On Mon, Aug 07, 2017 at 07:04:46PM +0800, Miles Chen wrote: > > > >> The commit f80fb3a3d508 ("arm64: add support for kernel ASLR") > > > >> moved module virtual address to > > > >> [module_alloc_base, module_alloc_base + MODULES_VSIZE). > > > >> > > > >> Display module information of the virtual kernel > > > >> memory layout by using module_alloc_base. > > > >> > > > >> testing output: > > > >> 1) Current implementation: > > > >> Virtual kernel memory layout: > > > >> modules : 0xff80 - 0xff800800 ( 128 MB) > > > >> 2) this patch + KASLR: > > > >> Virtual kernel memory layout: > > > >> modules : 0xff800056 - 0xff800856 ( 128 MB) > > > >> 3) this patch + KASLR and a dummy seed: > > > >> Virtual kernel memory layout: > > > >> modules : 0xffa7df637000 - 0xffa7e7637000 ( 128 MB) > > > >> > > > >> Signed-off-by: Miles Chen > > > >> --- > > > >> arch/arm64/mm/init.c | 5 +++-- > > > >> 1 file changed, 3 insertions(+), 2 deletions(-) > > > > > > > > Does this mean the modules code in our pt dumper is busted > > > > (arch/arm64/mm/dump.c)? Also, what about KASAN, which uses these > > > > addresses > > > > too (in kasan_init)? Should we just remove MODULES_VADDR and MODULES_END > > > > altogether? > > > > > > > > > > I don't think we need this patch. The 'module' line simply prints the > > > VA region that is reserved for modules. The fact that we end up > > > putting them elsewhere when running randomized does not necessarily > > > mean this line should reflect that. > > > > I was more concerned by other users of MODULES_VADDR tbh, although I see > > now that we don't randomize the module region if kasan is enabled. 
Still, > > the kcore code adds the modules region as a separate area (distinct from > > vmalloc) if MODULES_VADDR is defined, the page table dumping code uses > > MODULES_VADDR to identify the module region and I think we'll get false > > positives from is_vmalloc_or_module_addr, which again uses the static > > region. > > > > So, given that MODULES_VADDR never points at the module area, can't we get > > rid of it? > > Agreed. MODULES_VADDR should be phased out. Considering the kernel > modules live somewhere between [VMALLOC_START, VMALLOC_END) now: > (arch/arm64/kernel/module.c:module_alloc). I suggest the following > changes: > > 1. is_vmalloc_or_module_addr() should return is_vmalloc_addr() directly > 2. arch/arm64/mm/dump.c does not need MODULES_VADDR and MODULES_END. > 3. kasan uses [module_alloc_base, module_alloc_base + MODULES_VSIZE) to > get the shadow memory? (the kernel modules still live in this range when > kasan is enabled) > 4. remove modules line in kernel memory layout > (optional, thanks for Ard's feedback) > 5. remove MODULE_VADDR, MODULES_END definition I was wrong about this. is_vmalloc_or_module_addr() is defined in mm/vmalloc and it uses MODULES_VADDR and MODULES_END. Maybe it is better to give MODULES_VADDR and MODULES_END proper values, not remove them. > Miles > > > > Will > >
[PATCH 2/2] ARM: dts: am572x-idk: Fix GPIO polarity for MMC1 card detect
The GPIO polarity for MMC1 card detect is set to '0' which means active-high. However the polarity should be active-low. Fix it here. Signed-off-by: Kishon Vijay Abraham I--- arch/arm/boot/dts/am572x-idk.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/am572x-idk.dts b/arch/arm/boot/dts/am572x-idk.dts index 9da6d83ca185..940fcbe5380b 100644 --- a/arch/arm/boot/dts/am572x-idk.dts +++ b/arch/arm/boot/dts/am572x-idk.dts @@ -81,7 +81,7 @@ vmmc-supply = <_3d>; vmmc_aux-supply = <_reg>; bus-width = <4>; - cd-gpios = < 27 0>; /* gpio 219 */ + cd-gpios = < 27 GPIO_ACTIVE_LOW>; /* gpio 219 */ }; { -- 2.11.0
[PATCH 1/2] ARM: dts: am571x-idk: Fix GPIO polarity for MMC1 card detect
The GPIO polarity for MMC1 card detect is set to '0' which means active-high. However the polarity should be active-low. Fix it here. Signed-off-by: Kishon Vijay Abraham I--- arch/arm/boot/dts/am571x-idk.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/am571x-idk.dts b/arch/arm/boot/dts/am571x-idk.dts index 7b207835b2d1..adc70fb091a2 100644 --- a/arch/arm/boot/dts/am571x-idk.dts +++ b/arch/arm/boot/dts/am571x-idk.dts @@ -68,7 +68,7 @@ status = "okay"; vmmc-supply = <_reg>; bus-width = <4>; - cd-gpios = < 27 0>; /* gpio 219 */ + cd-gpios = < 27 GPIO_ACTIVE_LOW>; /* gpio 219 */ }; _dwc3_2 { -- 2.11.0
[PATCH 2/2] ARM: dts: am572x-idk: Fix GPIO polarity for MMC1 card detect
The GPIO polarity for MMC1 card detect is set to '0' which means active-high. However the polarity should be active-low. Fix it here. Signed-off-by: Kishon Vijay Abraham I --- arch/arm/boot/dts/am572x-idk.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/am572x-idk.dts b/arch/arm/boot/dts/am572x-idk.dts index 9da6d83ca185..940fcbe5380b 100644 --- a/arch/arm/boot/dts/am572x-idk.dts +++ b/arch/arm/boot/dts/am572x-idk.dts @@ -81,7 +81,7 @@ vmmc-supply = <_3d>; vmmc_aux-supply = <_reg>; bus-width = <4>; - cd-gpios = < 27 0>; /* gpio 219 */ + cd-gpios = < 27 GPIO_ACTIVE_LOW>; /* gpio 219 */ }; { -- 2.11.0
[PATCH 1/2] ARM: dts: am571x-idk: Fix GPIO polarity for MMC1 card detect
The GPIO polarity for MMC1 card detect is set to '0' which means active-high. However the polarity should be active-low. Fix it here. Signed-off-by: Kishon Vijay Abraham I --- arch/arm/boot/dts/am571x-idk.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/am571x-idk.dts b/arch/arm/boot/dts/am571x-idk.dts index 7b207835b2d1..adc70fb091a2 100644 --- a/arch/arm/boot/dts/am571x-idk.dts +++ b/arch/arm/boot/dts/am571x-idk.dts @@ -68,7 +68,7 @@ status = "okay"; vmmc-supply = <_reg>; bus-width = <4>; - cd-gpios = < 27 0>; /* gpio 219 */ + cd-gpios = < 27 GPIO_ACTIVE_LOW>; /* gpio 219 */ }; _dwc3_2 { -- 2.11.0
[PATCH] mmc: host: omap_hsmmc: Add CMD23 capability to omap_hsmmc driver
omap_hsmmc driver always relied on CMD12 to stop transmission. However if CMD12 is not issued at the correct timing, the card will indicate a out of range error. With certain cards in some of the DRA7 based boards, -EIO error is observed. By Adding CMD23 capability, the MMC core will send MMC_SET_BLOCK_COUNT command before MMC_READ_MULTIPLE_BLOCK/MMC_WRITE_MULTIPLE_BLOCK commands. commit a04e6bae9e6f12 ("mmc: core: check also R1 response for stop commands") exposed this bug in omap_hsmmc driver. Signed-off-by: Kishon Vijay Abraham I--- drivers/mmc/host/omap_hsmmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 04ff3c97a535..2ab4788d021f 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -2086,7 +2086,7 @@ static int omap_hsmmc_probe(struct platform_device *pdev) mmc->max_seg_size = mmc->max_req_size; mmc->caps |= MMC_CAP_MMC_HIGHSPEED | MMC_CAP_SD_HIGHSPEED | -MMC_CAP_WAIT_WHILE_BUSY | MMC_CAP_ERASE; +MMC_CAP_WAIT_WHILE_BUSY | MMC_CAP_ERASE | MMC_CAP_CMD23; mmc->caps |= mmc_pdata(host)->caps; if (mmc->caps & MMC_CAP_8_BIT_DATA) -- 2.11.0
[PATCH] mmc: host: omap_hsmmc: Add CMD23 capability to omap_hsmmc driver
omap_hsmmc driver always relied on CMD12 to stop transmission. However if CMD12 is not issued at the correct timing, the card will indicate a out of range error. With certain cards in some of the DRA7 based boards, -EIO error is observed. By Adding CMD23 capability, the MMC core will send MMC_SET_BLOCK_COUNT command before MMC_READ_MULTIPLE_BLOCK/MMC_WRITE_MULTIPLE_BLOCK commands. commit a04e6bae9e6f12 ("mmc: core: check also R1 response for stop commands") exposed this bug in omap_hsmmc driver. Signed-off-by: Kishon Vijay Abraham I --- drivers/mmc/host/omap_hsmmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 04ff3c97a535..2ab4788d021f 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -2086,7 +2086,7 @@ static int omap_hsmmc_probe(struct platform_device *pdev) mmc->max_seg_size = mmc->max_req_size; mmc->caps |= MMC_CAP_MMC_HIGHSPEED | MMC_CAP_SD_HIGHSPEED | -MMC_CAP_WAIT_WHILE_BUSY | MMC_CAP_ERASE; +MMC_CAP_WAIT_WHILE_BUSY | MMC_CAP_ERASE | MMC_CAP_CMD23; mmc->caps |= mmc_pdata(host)->caps; if (mmc->caps & MMC_CAP_8_BIT_DATA) -- 2.11.0
linux-next: manual merge of the userns tree with the mips tree
Hi Eric, Today's linux-next merge of the userns tree got a conflict in: arch/mips/kernel/traps.c between commit: 260a789828aa ("MIPS: signal: Remove unreachable code from force_fcr31_sig().") from the mips tree and commit: ea1b75cf9138 ("signal/mips: Document a conflict with SI_USER with SIGFPE") from the userns tree. I fixed it up (the former removed the code updated by the latter) and can carry the fix as necessary. This is now fixed as far as linux-next is concerned, but any non trivial conflicts should be mentioned to your upstream maintainer when your tree is submitted for merging. You may also want to consider cooperating with the maintainer of the conflicting tree to minimise any particularly complex conflicts. -- Cheers, Stephen Rothwell
linux-next: manual merge of the userns tree with the mips tree
Hi Eric, Today's linux-next merge of the userns tree got a conflict in: arch/mips/kernel/traps.c between commit: 260a789828aa ("MIPS: signal: Remove unreachable code from force_fcr31_sig().") from the mips tree and commit: ea1b75cf9138 ("signal/mips: Document a conflict with SI_USER with SIGFPE") from the userns tree. I fixed it up (the former removed the code updated by the latter) and can carry the fix as necessary. This is now fixed as far as linux-next is concerned, but any non trivial conflicts should be mentioned to your upstream maintainer when your tree is submitted for merging. You may also want to consider cooperating with the maintainer of the conflicting tree to minimise any particularly complex conflicts. -- Cheers, Stephen Rothwell
[PATCH v2] scheduler: enhancement to show_state_filter
Sometimes we want to get tasks in TASK_RUNNING sepcifically, instead of dump all tasks. For example, when the loadavg are high, we want to dump tasks in TASK_RUNNING and TASK_UNINTERRUPTIBLE, which contribute to system load. But mostly there're lots of tasks in Sleep state, which occupies almost all of the kernel log buffer, even overflows it, that causes the useful messages get lost. Although we can enlarge the kernel log buffer, but that's not a good idea. So I made this change to make the show_state_filter more flexible, and then we can dump the tasks in TASK_RUNNING specifically. Signed-off-by: Yafang Shao--- drivers/tty/sysrq.c | 2 +- include/linux/sched.h | 1 + include/linux/sched/debug.h | 6 -- kernel/sched/core.c | 7 --- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 3ffc1ce..86db51b 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -291,7 +291,7 @@ static void sysrq_handle_showstate(int key) static void sysrq_handle_showstate_blocked(int key) { - show_state_filter(TASK_UNINTERRUPTIBLE); + show_state_filter(TASK_UNINTERRUPTIBLE << 1); } static struct sysrq_key_op sysrq_showstate_blocked_op = { .handler= sysrq_handle_showstate_blocked, diff --git a/include/linux/sched.h b/include/linux/sched.h index 8337e2d..318f149 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -82,6 +82,7 @@ #define TASK_NOLOAD1024 #define TASK_NEW 2048 #define TASK_STATE_MAX 4096 +#define TASK_ALL_BITS ((TASK_STATE_MAX << 1) - 1) #define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn" diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h index e0eaee5..c844689 100644 --- a/include/linux/sched/debug.h +++ b/include/linux/sched/debug.h @@ -1,6 +1,8 @@ #ifndef _LINUX_SCHED_DEBUG_H #define _LINUX_SCHED_DEBUG_H +#include + /* * Various scheduler/task debugging interfaces: */ @@ -10,13 +12,13 @@ extern void dump_cpu_task(int cpu); /* - * Only dump TASK_* tasks. 
(0 for all tasks) + * Only dump TASK_* tasks. (TASK_ALL_BITS for all tasks) */ extern void show_state_filter(unsigned long state_filter); static inline void show_state(void) { - show_state_filter(0); + show_state_filter(TASK_ALL_BITS); } struct pt_regs; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0869b20..f9b9529 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5161,19 +5161,20 @@ void show_state_filter(unsigned long state_filter) */ touch_nmi_watchdog(); touch_all_softlockup_watchdogs(); - if (!state_filter || (p->state & state_filter)) + /* in case we want to set TASK_RUNNING specifically */ + if ((p->state != TASK_RUNNING ? p->state << 1 : 1) & state_filter) sched_show_task(p); } #ifdef CONFIG_SCHED_DEBUG - if (!state_filter) + if (state_filter == TASK_ALL_BITS) sysrq_sched_debug_show(); #endif rcu_read_unlock(); /* * Only show locks if all tasks are dumped: */ - if (!state_filter) + if (state_filter == TASK_ALL_BITS) debug_show_all_locks(); } -- 1.8.3.1
[PATCH v2] scheduler: enhancement to show_state_filter
Sometimes we want to get tasks in TASK_RUNNING sepcifically, instead of dump all tasks. For example, when the loadavg are high, we want to dump tasks in TASK_RUNNING and TASK_UNINTERRUPTIBLE, which contribute to system load. But mostly there're lots of tasks in Sleep state, which occupies almost all of the kernel log buffer, even overflows it, that causes the useful messages get lost. Although we can enlarge the kernel log buffer, but that's not a good idea. So I made this change to make the show_state_filter more flexible, and then we can dump the tasks in TASK_RUNNING specifically. Signed-off-by: Yafang Shao --- drivers/tty/sysrq.c | 2 +- include/linux/sched.h | 1 + include/linux/sched/debug.h | 6 -- kernel/sched/core.c | 7 --- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 3ffc1ce..86db51b 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -291,7 +291,7 @@ static void sysrq_handle_showstate(int key) static void sysrq_handle_showstate_blocked(int key) { - show_state_filter(TASK_UNINTERRUPTIBLE); + show_state_filter(TASK_UNINTERRUPTIBLE << 1); } static struct sysrq_key_op sysrq_showstate_blocked_op = { .handler= sysrq_handle_showstate_blocked, diff --git a/include/linux/sched.h b/include/linux/sched.h index 8337e2d..318f149 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -82,6 +82,7 @@ #define TASK_NOLOAD1024 #define TASK_NEW 2048 #define TASK_STATE_MAX 4096 +#define TASK_ALL_BITS ((TASK_STATE_MAX << 1) - 1) #define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn" diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h index e0eaee5..c844689 100644 --- a/include/linux/sched/debug.h +++ b/include/linux/sched/debug.h @@ -1,6 +1,8 @@ #ifndef _LINUX_SCHED_DEBUG_H #define _LINUX_SCHED_DEBUG_H +#include + /* * Various scheduler/task debugging interfaces: */ @@ -10,13 +12,13 @@ extern void dump_cpu_task(int cpu); /* - * Only dump TASK_* tasks. 
(0 for all tasks) + * Only dump TASK_* tasks. (TASK_ALL_BITS for all tasks) */ extern void show_state_filter(unsigned long state_filter); static inline void show_state(void) { - show_state_filter(0); + show_state_filter(TASK_ALL_BITS); } struct pt_regs; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0869b20..f9b9529 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5161,19 +5161,20 @@ void show_state_filter(unsigned long state_filter) */ touch_nmi_watchdog(); touch_all_softlockup_watchdogs(); - if (!state_filter || (p->state & state_filter)) + /* in case we want to set TASK_RUNNING specifically */ + if ((p->state != TASK_RUNNING ? p->state << 1 : 1) & state_filter) sched_show_task(p); } #ifdef CONFIG_SCHED_DEBUG - if (!state_filter) + if (state_filter == TASK_ALL_BITS) sysrq_sched_debug_show(); #endif rcu_read_unlock(); /* * Only show locks if all tasks are dumped: */ - if (!state_filter) + if (state_filter == TASK_ALL_BITS) debug_show_all_locks(); } -- 1.8.3.1
[PATCH] spi/bcm63xx-hspi: fix error return code in bcm63xx_hsspi_probe()
platform_get_irq() returns an error code, but the spi-bcm63xx-hsspi driver ignores it and always returns -ENXIO. This is not correct and, prevents -EPROBE_DEFER from being propagated properly. Notice that platform_get_irq() no longer returns 0 on error: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e330b9a6bb35dc7097a4f02cb1ae7b6f96df92af Print and propagate the return value of platform_get_irq on failure. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva--- drivers/spi/spi-bcm63xx-hsspi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-bcm63xx-hsspi.c b/drivers/spi/spi-bcm63xx-hsspi.c index 475a790..cbcba61 100644 --- a/drivers/spi/spi-bcm63xx-hsspi.c +++ b/drivers/spi/spi-bcm63xx-hsspi.c @@ -338,8 +338,8 @@ static int bcm63xx_hsspi_probe(struct platform_device *pdev) irq = platform_get_irq(pdev, 0); if (irq < 0) { - dev_err(dev, "no irq\n"); - return -ENXIO; + dev_err(dev, "no irq: %d\n", irq); + return irq; } res_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); -- 2.5.0
[PATCH] spi/bcm63xx-hspi: fix error return code in bcm63xx_hsspi_probe()
platform_get_irq() returns an error code, but the spi-bcm63xx-hsspi driver ignores it and always returns -ENXIO. This is not correct and, prevents -EPROBE_DEFER from being propagated properly. Notice that platform_get_irq() no longer returns 0 on error: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e330b9a6bb35dc7097a4f02cb1ae7b6f96df92af Print and propagate the return value of platform_get_irq on failure. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva --- drivers/spi/spi-bcm63xx-hsspi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-bcm63xx-hsspi.c b/drivers/spi/spi-bcm63xx-hsspi.c index 475a790..cbcba61 100644 --- a/drivers/spi/spi-bcm63xx-hsspi.c +++ b/drivers/spi/spi-bcm63xx-hsspi.c @@ -338,8 +338,8 @@ static int bcm63xx_hsspi_probe(struct platform_device *pdev) irq = platform_get_irq(pdev, 0); if (irq < 0) { - dev_err(dev, "no irq\n"); - return -ENXIO; + dev_err(dev, "no irq: %d\n", irq); + return irq; } res_mem = platform_get_resource(pdev, IORESOURCE_MEM, 0); -- 2.5.0
[PATCH] spi/bcm63xx: fix error return code in bcm63xx_spi_probe()
platform_get_irq() returns an error code, but the spi-bcm63xx driver ignores it and always returns -ENXIO. This is not correct and, prevents -EPROBE_DEFER from being propagated properly. Notice that platform_get_irq() no longer returns 0 on error: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e330b9a6bb35dc7097a4f02cb1ae7b6f96df92af Print and propagate the return value of platform_get_irq on failure. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva--- drivers/spi/spi-bcm63xx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-bcm63xx.c b/drivers/spi/spi-bcm63xx.c index 84c7356..bfe5754 100644 --- a/drivers/spi/spi-bcm63xx.c +++ b/drivers/spi/spi-bcm63xx.c @@ -530,8 +530,8 @@ static int bcm63xx_spi_probe(struct platform_device *pdev) irq = platform_get_irq(pdev, 0); if (irq < 0) { - dev_err(dev, "no irq\n"); - return -ENXIO; + dev_err(dev, "no irq: %d\n", irq); + return irq; } clk = devm_clk_get(dev, "spi"); -- 2.5.0
[PATCH] spi/bcm63xx: fix error return code in bcm63xx_spi_probe()
platform_get_irq() returns an error code, but the spi-bcm63xx driver ignores it and always returns -ENXIO. This is not correct and, prevents -EPROBE_DEFER from being propagated properly. Notice that platform_get_irq() no longer returns 0 on error: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e330b9a6bb35dc7097a4f02cb1ae7b6f96df92af Print and propagate the return value of platform_get_irq on failure. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva --- drivers/spi/spi-bcm63xx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-bcm63xx.c b/drivers/spi/spi-bcm63xx.c index 84c7356..bfe5754 100644 --- a/drivers/spi/spi-bcm63xx.c +++ b/drivers/spi/spi-bcm63xx.c @@ -530,8 +530,8 @@ static int bcm63xx_spi_probe(struct platform_device *pdev) irq = platform_get_irq(pdev, 0); if (irq < 0) { - dev_err(dev, "no irq\n"); - return -ENXIO; + dev_err(dev, "no irq: %d\n", irq); + return irq; } clk = devm_clk_get(dev, "spi"); -- 2.5.0
Re: [MD] Crash with 4.12+ kernel and high disk load -- bisected to 4ad23a976413: MD: use per-cpu counter for writes_pending
On Mon, Aug 07, 2017 at 01:20:25PM +0200, Dominik Brodowski wrote: > Neil, Shaohua, > > following up on David R's bug message: I have observed something similar > on v4.12.[345] and v4.13-rc4, but not on v4.11. This is a RAID1 (on bare > metal partitions, /dev/sdaX and /dev/sdbY linked together). In case it > matters: Further upwards are cryptsetup, a DM volume group, then logical > volumes, and then filesystems (ext4, but also happened with xfs). > > In a tedious bisect (the bug wasn't as quickly reproducible as I would like, > but happened when I repeatedly created large lvs and filled them with some > content, while compiling kernels in parallel), I was able to track this > down to: > > > commit 4ad23a976413aa57fe5ba7a25953dc35ccca5b71 > Author: NeilBrown> Date: Wed Mar 15 14:05:14 2017 +1100 > > MD: use per-cpu counter for writes_pending > > The 'writes_pending' counter is used to determine when the > array is stable so that it can be marked in the superblock > as "Clean". Consequently it needs to be updated frequently > but only checked for zero occasionally. Recent changes to > raid5 cause the count to be updated even more often - once > per 4K rather than once per bio. This provided > justification for making the updates more efficient. > > ... > > > CC'ing t...@kernel.org, as 4ad23a976413 is the first (and only?) user > of percpu_ref_switch_to_atomic_sync() introduced in 210f7cdcf088. > > Applying a415c0f10627 on top of 4ad23a976413 does *not* fix the issue, but > reverting all of a2bfc6753065, a415c0f10627 and 4ad23a976413 seems to fix > the issue for v4.12.5. Spent some time to check this one, unfortunately I can't find how that patch makes rcu stall. the percpu part looks good to me too. Can you double check if reverting 4ad23a976413aa57 makes the issue go away? When the rcu stall happens, what the /sys/block/md/md0/array_state? please also attach /proc/mdstat. 
When you say the mdx_raid1 threads are in 'R' state, can you double check if the /proc/pid/stack always 0xff? Thanks, Shaohua > In addition, I can provide the following stack traces, which appear in dmesg > around the time the system becomes more or less unusuable, with one or more > of the md[0123]_raid1 threads in the "R" state. > > ... ... > [ 142.275244] INFO: rcu_sched self-detected stall on CPU > [ 142.275386] 4-...: (5999 ticks this GP) idle=d8a/141/0 > softirq=2404/2404 fqs=2954 > [ 142.275441] (t=6000 jiffies g=645 c=644 q=199031) > [ 142.275490] NMI backtrace for cpu 4 > [ 142.275537] CPU: 4 PID: 1164 Comm: md2_raid1 Not tainted 4.12.4 #2 > [ 142.275586] Hardware name: MSI MS-7522/MSI X58 Pro (MS-7522) , BIOS > V8.14B8 11/09/2012 > [ 142.275640] Call Trace: > [ 142.275683] > [ 142.275728] dump_stack+0x4d/0x6a > [ 142.275775] nmi_cpu_backtrace+0x9b/0xa0 > [ 142.275822] ? irq_force_complete_move+0xf0/0xf0 > [ 142.275869] nmi_trigger_cpumask_backtrace+0x8f/0xc0 > [ 142.275918] arch_trigger_cpumask_backtrace+0x14/0x20 > [ 142.275967] rcu_dump_cpu_stacks+0x8f/0xd9 > [ 142.276016] rcu_check_callbacks+0x62e/0x780 > [ 142.276064] ? 
acct_account_cputime+0x17/0x20 > [ 142.276111] update_process_times+0x2a/0x50 > [ 142.276159] tick_sched_handle.isra.18+0x2d/0x30 > [ 142.276222] tick_sched_timer+0x38/0x70 > [ 142.276283] __hrtimer_run_queues+0xbe/0x120 > [ 142.276345] hrtimer_interrupt+0xa3/0x190 > [ 142.276408] local_apic_timer_interrupt+0x33/0x60 > [ 142.276471] smp_apic_timer_interrupt+0x33/0x50 > [ 142.276534] apic_timer_interrupt+0x86/0x90 > [ 142.276598] RIP: 0010:__wake_up+0x44/0x50 > [ 142.276658] RSP: 0018:c9f8fd88 EFLAGS: 0246 ORIG_RAX: > ff10 > [ 142.276742] RAX: 81a84bc0 RBX: 880235cf8800 RCX: > > [ 142.276809] RDX: 81a84bd8 RSI: 0246 RDI: > 81a84bd0 > [ 142.276876] RBP: c9f8fd98 R08: R09: > 0001 > [ 142.276943] R10: R11: R12: > 880235cf8800 > [ 142.277009] R13: 880235eb2c28 R14: 0001 R15: > > [ 142.277076] > [ 142.277136] md_check_recovery+0x30b/0x4a0 > [ 142.277199] raid1d+0x4c/0x810 > [ 142.277258] md_thread+0x11a/0x150 > [ 142.277319] ? md_thread+0x11a/0x150 > [ 142.277379] ? __wake_up_common+0x80/0x80 > [ 142.277442] kthread+0x11a/0x150 > [ 142.277502] ? find_pers+0x70/0x70 > [ 142.277562] ? __kthread_create_on_node+0x140/0x140 > [ 142.277625] ret_from_fork+0x22/0x30 > > ... or this one (on v4.12.5): > [ 1294.560172] INFO: rcu_sched self-detected stall on CPU > [ 1294.560285] 2-...: (6000 ticks this GP) idle=f06/141/0 > softirq=140681/140681 fqs=2988 > [ 1294.560365] (t=6001 jiffies g=28666 c=28665 q=129416) > [ 1294.560426] NMI backtrace for cpu 2 > [ 1294.560483] CPU: 2 PID: 1173 Comm: md3_raid1 Not tainted 4.12.5 #1 > [ 1294.560543]
Re: [MD] Crash with 4.12+ kernel and high disk load -- bisected to 4ad23a976413: MD: use per-cpu counter for writes_pending
On Mon, Aug 07, 2017 at 01:20:25PM +0200, Dominik Brodowski wrote: > Neil, Shaohua, > > following up on David R's bug message: I have observed something similar > on v4.12.[345] and v4.13-rc4, but not on v4.11. This is a RAID1 (on bare > metal partitions, /dev/sdaX and /dev/sdbY linked together). In case it > matters: Further upwards are cryptsetup, a DM volume group, then logical > volumes, and then filesystems (ext4, but also happened with xfs). > > In a tedious bisect (the bug wasn't as quickly reproducible as I would like, > but happened when I repeatedly created large lvs and filled them with some > content, while compiling kernels in parallel), I was able to track this > down to: > > > commit 4ad23a976413aa57fe5ba7a25953dc35ccca5b71 > Author: NeilBrown > Date: Wed Mar 15 14:05:14 2017 +1100 > > MD: use per-cpu counter for writes_pending > > The 'writes_pending' counter is used to determine when the > array is stable so that it can be marked in the superblock > as "Clean". Consequently it needs to be updated frequently > but only checked for zero occasionally. Recent changes to > raid5 cause the count to be updated even more often - once > per 4K rather than once per bio. This provided > justification for making the updates more efficient. > > ... > > > CC'ing t...@kernel.org, as 4ad23a976413 is the first (and only?) user > of percpu_ref_switch_to_atomic_sync() introduced in 210f7cdcf088. > > Applying a415c0f10627 on top of 4ad23a976413 does *not* fix the issue, but > reverting all of a2bfc6753065, a415c0f10627 and 4ad23a976413 seems to fix > the issue for v4.12.5. Spent some time to check this one, unfortunately I can't find how that patch makes rcu stall. the percpu part looks good to me too. Can you double check if reverting 4ad23a976413aa57 makes the issue go away? When the rcu stall happens, what the /sys/block/md/md0/array_state? please also attach /proc/mdstat. 
When you say the mdx_raid1 threads are in 'R' state, can you double check if the /proc/pid/stack always 0xff? Thanks, Shaohua > In addition, I can provide the following stack traces, which appear in dmesg > around the time the system becomes more or less unusuable, with one or more > of the md[0123]_raid1 threads in the "R" state. > > ... ... > [ 142.275244] INFO: rcu_sched self-detected stall on CPU > [ 142.275386] 4-...: (5999 ticks this GP) idle=d8a/141/0 > softirq=2404/2404 fqs=2954 > [ 142.275441] (t=6000 jiffies g=645 c=644 q=199031) > [ 142.275490] NMI backtrace for cpu 4 > [ 142.275537] CPU: 4 PID: 1164 Comm: md2_raid1 Not tainted 4.12.4 #2 > [ 142.275586] Hardware name: MSI MS-7522/MSI X58 Pro (MS-7522) , BIOS > V8.14B8 11/09/2012 > [ 142.275640] Call Trace: > [ 142.275683] > [ 142.275728] dump_stack+0x4d/0x6a > [ 142.275775] nmi_cpu_backtrace+0x9b/0xa0 > [ 142.275822] ? irq_force_complete_move+0xf0/0xf0 > [ 142.275869] nmi_trigger_cpumask_backtrace+0x8f/0xc0 > [ 142.275918] arch_trigger_cpumask_backtrace+0x14/0x20 > [ 142.275967] rcu_dump_cpu_stacks+0x8f/0xd9 > [ 142.276016] rcu_check_callbacks+0x62e/0x780 > [ 142.276064] ? 
acct_account_cputime+0x17/0x20 > [ 142.276111] update_process_times+0x2a/0x50 > [ 142.276159] tick_sched_handle.isra.18+0x2d/0x30 > [ 142.276222] tick_sched_timer+0x38/0x70 > [ 142.276283] __hrtimer_run_queues+0xbe/0x120 > [ 142.276345] hrtimer_interrupt+0xa3/0x190 > [ 142.276408] local_apic_timer_interrupt+0x33/0x60 > [ 142.276471] smp_apic_timer_interrupt+0x33/0x50 > [ 142.276534] apic_timer_interrupt+0x86/0x90 > [ 142.276598] RIP: 0010:__wake_up+0x44/0x50 > [ 142.276658] RSP: 0018:c9f8fd88 EFLAGS: 0246 ORIG_RAX: > ff10 > [ 142.276742] RAX: 81a84bc0 RBX: 880235cf8800 RCX: > > [ 142.276809] RDX: 81a84bd8 RSI: 0246 RDI: > 81a84bd0 > [ 142.276876] RBP: c9f8fd98 R08: R09: > 0001 > [ 142.276943] R10: R11: R12: > 880235cf8800 > [ 142.277009] R13: 880235eb2c28 R14: 0001 R15: > > [ 142.277076] > [ 142.277136] md_check_recovery+0x30b/0x4a0 > [ 142.277199] raid1d+0x4c/0x810 > [ 142.277258] md_thread+0x11a/0x150 > [ 142.277319] ? md_thread+0x11a/0x150 > [ 142.277379] ? __wake_up_common+0x80/0x80 > [ 142.277442] kthread+0x11a/0x150 > [ 142.277502] ? find_pers+0x70/0x70 > [ 142.277562] ? __kthread_create_on_node+0x140/0x140 > [ 142.277625] ret_from_fork+0x22/0x30 > > ... or this one (on v4.12.5): > [ 1294.560172] INFO: rcu_sched self-detected stall on CPU > [ 1294.560285] 2-...: (6000 ticks this GP) idle=f06/141/0 > softirq=140681/140681 fqs=2988 > [ 1294.560365] (t=6001 jiffies g=28666 c=28665 q=129416) > [ 1294.560426] NMI backtrace for cpu 2 > [ 1294.560483] CPU: 2 PID: 1173 Comm: md3_raid1 Not tainted 4.12.5 #1 > [ 1294.560543] Hardware name: MSI
Re: linux-next: Signed-off-by missing for commit in the scsi-mkp tree
On Tue, 8 Aug 2017, Stephen Rothwell wrote: > Hi Martin, > > Commit > > facfc963ae92 ("scsi: g_NCR5380: Two DTC436 PDMA workarounds") > > is missing a Signed-off-by from its author. > Sorry about that. The patch was a joint effort. Ondrej, would you please send your "Signed-off-by" tag so that Martin can amend this commit (if need be). --
Re: linux-next: Signed-off-by missing for commit in the scsi-mkp tree
On Tue, 8 Aug 2017, Stephen Rothwell wrote: > Hi Martin, > > Commit > > facfc963ae92 ("scsi: g_NCR5380: Two DTC436 PDMA workarounds") > > is missing a Signed-off-by from its author. > Sorry about that. The patch was a joint effort. Ondrej, would you please send your "Signed-off-by" tag so that Martin can amend this commit (if need be). --
[PATCH] usb: dwc3: omap: fix error return code in dwc3_omap_probe()
platform_get_irq() returns an error code, but the dwc3-omap driver ignores it and always returns -EINVAL. This is not correct and, prevents -EPROBE_DEFER from being propagated properly. Notice that platform_get_irq() no longer returns 0 on error: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e330b9a6bb35dc7097a4f02cb1ae7b6f96df92af Print and propagate the return value of platform_get_irq on failure. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva--- drivers/usb/dwc3/dwc3-omap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/dwc3/dwc3-omap.c b/drivers/usb/dwc3/dwc3-omap.c index f5aaa0c..3530795 100644 --- a/drivers/usb/dwc3/dwc3-omap.c +++ b/drivers/usb/dwc3/dwc3-omap.c @@ -478,8 +478,8 @@ static int dwc3_omap_probe(struct platform_device *pdev) irq = platform_get_irq(pdev, 0); if (irq < 0) { - dev_err(dev, "missing IRQ resource\n"); - return -EINVAL; + dev_err(dev, "missing IRQ resource: %d\n", irq); + return irq; } res = platform_get_resource(pdev, IORESOURCE_MEM, 0); -- 2.5.0
[PATCH] usb: dwc3: omap: fix error return code in dwc3_omap_probe()
platform_get_irq() returns an error code, but the dwc3-omap driver ignores it and always returns -EINVAL. This is not correct and, prevents -EPROBE_DEFER from being propagated properly. Notice that platform_get_irq() no longer returns 0 on error: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e330b9a6bb35dc7097a4f02cb1ae7b6f96df92af Print and propagate the return value of platform_get_irq on failure. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva --- drivers/usb/dwc3/dwc3-omap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/dwc3/dwc3-omap.c b/drivers/usb/dwc3/dwc3-omap.c index f5aaa0c..3530795 100644 --- a/drivers/usb/dwc3/dwc3-omap.c +++ b/drivers/usb/dwc3/dwc3-omap.c @@ -478,8 +478,8 @@ static int dwc3_omap_probe(struct platform_device *pdev) irq = platform_get_irq(pdev, 0); if (irq < 0) { - dev_err(dev, "missing IRQ resource\n"); - return -EINVAL; + dev_err(dev, "missing IRQ resource: %d\n", irq); + return irq; } res = platform_get_resource(pdev, IORESOURCE_MEM, 0); -- 2.5.0
Re: [PATCH 0/6] In-kernel QMI handling
On Mon 07 Aug 12:19 PDT 2017, Marcel Holtmann wrote: > Hi Bjorn, > > >>> This series starts by moving the common definitions of the QMUX > >>> protocol to the > >>> uapi header, as they are shared with clients - both in kernel and > >>> userspace. > >>> > >>> This series then introduces in-kernel helper functions for aiding the > >>> handling > >>> of QMI encoded messages in the kernel. QMI encoding is a wire-format > >>> used in > >>> exchanging messages between the majority of QRTR clients and > >>> services. > >> > >> This raises a few red-flags for me. > > > > I'm glad it does. In discussions with the responsible team within > > Qualcomm I've highlighted a number of concerns about enabling this > > support in the kernel. Together we're continuously looking into what > > should be pushed out to user space, and trying to not introduce > > unnecessary new users. > > > >> So far, we've kept almost everything QMI related in userspace and > >> handled all QMI control-channel messages from libraries like libqmi or > >> uqmi via the cdc-wdm driver and the "rmnet" interface via the qmi_wwan > >> driver. The kernel drivers just serve as the transport. > >> > > > > The path that was taken to support the MSM-style devices was to > > implement net/qrtr, which exposes a socket interface to abstract the > > physical transports (QMUX or IPCROUTER in Qualcomm terminology). > > > > As I share you view on letting the kernel handle the transportation only > > the task of keeping track of registered services (service id -> node and > > port mapping) was done in a user space process and so far we've only > > ever have to deal with QMI encoded messages in various user space tools. > > I think that the transport and multiplexing can be in the kernel as > long as it is done as proper subsystem. Similar to Phonet or CAIF. > Meaning it should have a well defined socket interface that can be > easily used from userspace, but also a clean in-kernel interface > handling. 
> In a mobile Qualcomm device there's a few different components involved here: message routing, QMUX protocol and QMI-encoding. The downstream Qualcomm kernel implements the two first in the IPCROUTER, upstream this is split between the kernel net/qrtr and a user space service-register implementing the QMUX protocol for knowing where services are located. The common encoding of messages passed between endpoints of the message routing is QMI, which is made an affair totally that of each client. > If Qualcomm is supportive of this effort and is willing to actually > assist and/or open some of the specs or interface descriptions, then > this is a good thing. Service registration and cleanup is really done > best in the kernel. Same applies to multiplexing. Trying to do > multiplexing in userspace is always cumbersome and leads to overhead > that is of no gain. For example within oFono, we had to force > everything to go via oFono since it was the only sane way of handling > it. Other approaches were error prone and full of race conditions. You > need a central entity that can clean up. > The current upstream solution depends on a collaboration between net/qrtr and the user space service register for figuring out whom to send messages to. After that muxing et al is handled by the socket interface and service registry does not need to be involved. Qualcomm is very supporting of this solution and we're collaborating on transitioning "downstream" to use this implementation. > For the definition of an UAPI to share some code, I am actually not > sure that is such a good idea. For example the QMI code in oFono > follows a way simpler approach. And I am not convinced that all the > macros are actually beneficial. For example, the whole netlink macros > are pretty cumbersome. Adding some Documentation/qmi.txt on how the > wire format looks like and what is expected seems to be a way better > approach. 
> The socket interface provided by the kernel expects some knowledge of the QMUX protocol, for service management. The majority of this knowledge is already public, but I agree that it would be good to gather this in a document. The common data structure for the control message is what I've put in the uapi, as this is used by anyone dealing with control messages. When it comes to the QMI-encoded messages these are application specific, just like e.g. protobuf definitions are application specific. As the core infrastructure is becoming available upstream and boards like the DB410c and DB820c aim to be supported by open solutions we will have a natural place to discuss publication of at least some of the application level protocols. Regards, Bjorn
[PATCH] spi: xlp: fix error return code in xlp_spi_probe()
platform_get_irq() returns an error code, but the spi-xlp driver ignores it and always returns -EINVAL. This is not correct and, prevents -EPROBE_DEFER from being propagated properly. Notice that platform_get_irq() no longer returns 0 on error: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e330b9a6bb35dc7097a4f02cb1ae7b6f96df92af Print and propagate the return value of platform_get_irq on failure. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva--- drivers/spi/spi-xlp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-xlp.c b/drivers/spi/spi-xlp.c index 80cb4d6..74a01b0 100644 --- a/drivers/spi/spi-xlp.c +++ b/drivers/spi/spi-xlp.c @@ -393,8 +393,8 @@ static int xlp_spi_probe(struct platform_device *pdev) irq = platform_get_irq(pdev, 0); if (irq < 0) { - dev_err(>dev, "no IRQ resource found\n"); - return -EINVAL; + dev_err(>dev, "no IRQ resource found: %d\n", irq); + return irq; } err = devm_request_irq(>dev, irq, xlp_spi_interrupt, 0, pdev->name, xspi); -- 2.5.0
Re: [PATCH 0/6] In-kernel QMI handling
On Mon 07 Aug 12:19 PDT 2017, Marcel Holtmann wrote: > Hi Bjorn, > > >>> This series starts by moving the common definitions of the QMUX > >>> protocol to the > >>> uapi header, as they are shared with clients - both in kernel and > >>> userspace. > >>> > >>> This series then introduces in-kernel helper functions for aiding the > >>> handling > >>> of QMI encoded messages in the kernel. QMI encoding is a wire-format > >>> used in > >>> exchanging messages between the majority of QRTR clients and > >>> services. > >> > >> This raises a few red-flags for me. > > > > I'm glad it does. In discussions with the responsible team within > > Qualcomm I've highlighted a number of concerns about enabling this > > support in the kernel. Together we're continuously looking into what > > should be pushed out to user space, and trying to not introduce > > unnecessary new users. > > > >> So far, we've kept almost everything QMI related in userspace and > >> handled all QMI control-channel messages from libraries like libqmi or > >> uqmi via the cdc-wdm driver and the "rmnet" interface via the qmi_wwan > >> driver. The kernel drivers just serve as the transport. > >> > > > > The path that was taken to support the MSM-style devices was to > > implement net/qrtr, which exposes a socket interface to abstract the > > physical transports (QMUX or IPCROUTER in Qualcomm terminology). > > > > As I share you view on letting the kernel handle the transportation only > > the task of keeping track of registered services (service id -> node and > > port mapping) was done in a user space process and so far we've only > > ever have to deal with QMI encoded messages in various user space tools. > > I think that the transport and multiplexing can be in the kernel as > long as it is done as proper subsystem. Similar to Phonet or CAIF. > Meaning it should have a well defined socket interface that can be > easily used from userspace, but also a clean in-kernel interface > handling. 
> In a mobile Qualcomm device there's a few different components involved here: message routing, QMUX protocol and QMI-encoding. The downstream Qualcomm kernel implements the two first in the IPCROUTER, upstream this is split between the kernel net/qrtr and a user space service-register implementing the QMUX protocol for knowing where services are located. The common encoding of messages passed between endpoints of the message routing is QMI, which is made an affair totally that of each client. > If Qualcomm is supportive of this effort and is willing to actually > assist and/or open some of the specs or interface descriptions, then > this is a good thing. Service registration and cleanup is really done > best in the kernel. Same applies to multiplexing. Trying to do > multiplexing in userspace is always cumbersome and leads to overhead > that is of no gain. For example within oFono, we had to force > everything to go via oFono since it was the only sane way of handling > it. Other approaches were error prone and full of race conditions. You > need a central entity that can clean up. > The current upstream solution depends on a collaboration between net/qrtr and the user space service register for figuring out whom to send messages to. After that muxing et al is handled by the socket interface and service registry does not need to be involved. Qualcomm is very supporting of this solution and we're collaborating on transitioning "downstream" to use this implementation. > For the definition of an UAPI to share some code, I am actually not > sure that is such a good idea. For example the QMI code in oFono > follows a way simpler approach. And I am not convinced that all the > macros are actually beneficial. For example, the whole netlink macros > are pretty cumbersome. Adding some Documentation/qmi.txt on how the > wire format looks like and what is expected seems to be a way better > approach. 
> The socket interface provided by the kernel expects some knowledge of the QMUX protocol, for service management. The majority of this knowledge is already public, but I agree that it would be good to gather this in a document. The common data structure for the control message is what I've put in the uapi, as this is used by anyone dealing with control messages. When it comes to the QMI-encoded messages these are application specific, just like e.g. protobuf definitions are application specific. As the core infrastructure is becoming available upstream and boards like the DB410c and DB820c aim to be supported by open solutions we will have a natural place to discuss publication of at least some of the application level protocols. Regards, Bjorn
[PATCH] spi: xlp: fix error return code in xlp_spi_probe()
platform_get_irq() returns an error code, but the spi-xlp driver ignores it and always returns -EINVAL. This is not correct and, prevents -EPROBE_DEFER from being propagated properly. Notice that platform_get_irq() no longer returns 0 on error: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e330b9a6bb35dc7097a4f02cb1ae7b6f96df92af Print and propagate the return value of platform_get_irq on failure. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva --- drivers/spi/spi-xlp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-xlp.c b/drivers/spi/spi-xlp.c index 80cb4d6..74a01b0 100644 --- a/drivers/spi/spi-xlp.c +++ b/drivers/spi/spi-xlp.c @@ -393,8 +393,8 @@ static int xlp_spi_probe(struct platform_device *pdev) irq = platform_get_irq(pdev, 0); if (irq < 0) { - dev_err(>dev, "no IRQ resource found\n"); - return -EINVAL; + dev_err(>dev, "no IRQ resource found: %d\n", irq); + return irq; } err = devm_request_irq(>dev, irq, xlp_spi_interrupt, 0, pdev->name, xspi); -- 2.5.0
[PATCH 1/2] PCI: iproc: Implement PCI hotplug support
This patch implements PCI hotplug support for iproc family chipsets. Iproc based SOC (e.g. Stingray) does not have hotplug controller integrated. Hence, standard PCI hotplug framework hooks cannot be used. e.g. controlled power up/down of slot. The mechanism, for e.g. Stingray has adopted for PCI hotplug is as follows: PCI present lines are input to GPIOs depending on the type of connector (x2, x4, x8). GPIO array needs to be present if hotplug is supported. HW implementation is SOC/Board specific, and also it depends on how add-in card is designed (e.g. how many present pins are implemented). If x8 card is connected, then it might be possible that all the 3 present pins could go low, or at least one pin goes low. If x4 card is connected, then it might be possible that 2 present pins go low, or at least one pin goes low. The implementation essentially takes care of following: > Initializing hotplug irq thread. > Detecting the endpoint device based on link state. > Handling PERST and detecting the plugged devices. > Ordered hot plug-out, where User is expected to write 1 to /sys/bus/pci/devices/<BDF>/remove > Handling spurious interrupt > Handling multiple interrupts and makes sure that card is enumerated only once. 
Signed-off-by: Oza PawandeepReviewed-by: Ray Jui diff --git a/drivers/pci/host/pcie-iproc-platform.c b/drivers/pci/host/pcie-iproc-platform.c index 9512960..e51bbd2 100644 --- a/drivers/pci/host/pcie-iproc-platform.c +++ b/drivers/pci/host/pcie-iproc-platform.c @@ -89,6 +89,9 @@ static int iproc_pcie_pltfm_probe(struct platform_device *pdev) pcie->need_ob_cfg = true; } + if (of_property_read_bool(np, "brcm,pci-hotplug")) + pcie->enable_hotplug = true; + /* PHY use is optional */ pcie->phy = devm_phy_get(dev, "pcie-phy"); if (IS_ERR(pcie->phy)) { diff --git a/drivers/pci/host/pcie-iproc.c b/drivers/pci/host/pcie-iproc.c index ee40651..c6d1add 100644 --- a/drivers/pci/host/pcie-iproc.c +++ b/drivers/pci/host/pcie-iproc.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "pcie-iproc.h" @@ -65,6 +66,17 @@ #define PCIE_DL_ACTIVE_SHIFT 2 #define PCIE_DL_ACTIVE BIT(PCIE_DL_ACTIVE_SHIFT) +#define CFG_RC_LTSSM 0x1cf8 +#define CFG_RC_PHY_CTL 0x1804 +#define CFG_RC_LTSSM_TIMEOUT 1000 +#define CFG_RC_LTSSM_STATE_MASK 0xff +#define CFG_RC_LTSSM_STATE_L10x1 + +#define CFG_RC_CLR_LTSSM_HIST_SHIFT 29 +#define CFG_RC_CLR_LTSSM_HIST_MASK BIT(CFG_RC_CLR_LTSSM_HIST_SHIFT) +#define CFG_RC_CLR_RECOV_HIST_SHIFT 31 +#define CFG_RC_CLR_RECOV_HIST_MASK BIT(CFG_RC_CLR_RECOV_HIST_SHIFT) + #define APB_ERR_EN_SHIFT 0 #define APB_ERR_EN BIT(APB_ERR_EN_SHIFT) @@ -1306,12 +1318,106 @@ static int iproc_pcie_rev_init(struct iproc_pcie *pcie) return 0; } +static bool iproc_pci_hp_check_ltssm(struct iproc_pcie *pcie) +{ + struct pci_bus *bus = pcie->root_bus; + u32 val, timeout = CFG_RC_LTSSM_TIMEOUT; + + /* Clear LTSSM history. */ + pci_bus_read_config_dword(pcie->root_bus, 0, + CFG_RC_PHY_CTL, ); + pci_bus_write_config_dword(bus, 0, CFG_RC_PHY_CTL, + val | CFG_RC_CLR_RECOV_HIST_MASK | + CFG_RC_CLR_LTSSM_HIST_MASK); + /* write back the origional value. 
*/ + pci_bus_write_config_dword(bus, 0, CFG_RC_PHY_CTL, val); + + do { + pci_bus_read_config_dword(pcie->root_bus, 0, + CFG_RC_LTSSM, ); + /* check link state to see if link moved to L1 state. */ + if ((val & CFG_RC_LTSSM_STATE_MASK) == +CFG_RC_LTSSM_STATE_L1) + return true; + timeout--; + usleep_range(500, 1000); + } while (timeout); + + return false; +} + +static irqreturn_t iproc_pci_hotplug_thread(int irq, void *data) +{ + struct iproc_pcie *pcie = data; + struct pci_bus *bus = pcie->root_bus, *child; + bool link_status; + + iproc_pcie_perst_ctrl(pcie, true); + iproc_pcie_perst_ctrl(pcie, false); + + link_status = iproc_pci_hp_check_ltssm(pcie); + + if (link_status && + !iproc_pcie_check_link(pcie, bus) && + !pcie->ep_is_present) { + pci_rescan_bus(bus); + list_for_each_entry(child, >children, node) + pcie_bus_configure_settings(child); + pcie->ep_is_present = true; + dev_info(pcie->dev, +"PCI Hotplug: \n"); + } else if (link_status && pcie->ep_is_present) + /* +* ep_is_present makes sure, enumuration done only once. +* So it can handle spurious intrrupts, and also if we +
[PATCH 0/2] PCI hotplug feature
These patches bring in PCI hotplug support for iproc family chipsets. It includes DT binding documentation update and implementation in iproc pcie RC driver. This patch set is made on top of the following patches. [PATCH v6 2/2] PCI: iproc: add device shutdown for PCI RC [PATCH v6 1/2] PCI: iproc: Retry request when CRS returned from EP Oza Pawandeep (2): PCI: iproc: Implement PCI hotplug support PCI: iproc: Add optional brcm,pci-hotplug .../devicetree/bindings/pci/brcm,iproc-pcie.txt| 23 drivers/pci/host/pcie-iproc-platform.c | 3 + drivers/pci/host/pcie-iproc.c | 130 - drivers/pci/host/pcie-iproc.h | 7 ++ 4 files changed, 157 insertions(+), 6 deletions(-) -- 1.9.1
[PATCH 1/2] PCI: iproc: Implement PCI hotplug support
This patch implements PCI hotplug support for iproc family chipsets. Iproc based SOC (e.g. Stingray) does not have hotplug controller integrated. Hence, standard PCI hotplug framework hooks cannot be used. e.g. controlled power up/down of slot. The mechanism, for e.g. Stingray has adopted for PCI hotplug is as follows: PCI present lines are input to GPIOs depending on the type of connector (x2, x4, x8). GPIO array needs to be present if hotplug is supported. HW implementation is SOC/Board specific, and also it depends on how add-in card is designed (e.g. how many present pins are implemented). If x8 card is connected, then it might be possible that all the 3 present pins could go low, or at least one pin goes low. If x4 card is connected, then it might be possible that 2 present pins go low, or at least one pin goes low. The implementation essentially takes care of following: > Initializing hotplug irq thread. > Detecting the endpoint device based on link state. > Handling PERST and detecting the plugged devices. > Ordered hot plug-out, where User is expected to write 1 to /sys/bus/pci/devices/<BDF>/remove > Handling spurious interrupt > Handling multiple interrupts and makes sure that card is enumerated only once. 
Signed-off-by: Oza Pawandeep Reviewed-by: Ray Jui diff --git a/drivers/pci/host/pcie-iproc-platform.c b/drivers/pci/host/pcie-iproc-platform.c index 9512960..e51bbd2 100644 --- a/drivers/pci/host/pcie-iproc-platform.c +++ b/drivers/pci/host/pcie-iproc-platform.c @@ -89,6 +89,9 @@ static int iproc_pcie_pltfm_probe(struct platform_device *pdev) pcie->need_ob_cfg = true; } + if (of_property_read_bool(np, "brcm,pci-hotplug")) + pcie->enable_hotplug = true; + /* PHY use is optional */ pcie->phy = devm_phy_get(dev, "pcie-phy"); if (IS_ERR(pcie->phy)) { diff --git a/drivers/pci/host/pcie-iproc.c b/drivers/pci/host/pcie-iproc.c index ee40651..c6d1add 100644 --- a/drivers/pci/host/pcie-iproc.c +++ b/drivers/pci/host/pcie-iproc.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "pcie-iproc.h" @@ -65,6 +66,17 @@ #define PCIE_DL_ACTIVE_SHIFT 2 #define PCIE_DL_ACTIVE BIT(PCIE_DL_ACTIVE_SHIFT) +#define CFG_RC_LTSSM 0x1cf8 +#define CFG_RC_PHY_CTL 0x1804 +#define CFG_RC_LTSSM_TIMEOUT 1000 +#define CFG_RC_LTSSM_STATE_MASK 0xff +#define CFG_RC_LTSSM_STATE_L10x1 + +#define CFG_RC_CLR_LTSSM_HIST_SHIFT 29 +#define CFG_RC_CLR_LTSSM_HIST_MASK BIT(CFG_RC_CLR_LTSSM_HIST_SHIFT) +#define CFG_RC_CLR_RECOV_HIST_SHIFT 31 +#define CFG_RC_CLR_RECOV_HIST_MASK BIT(CFG_RC_CLR_RECOV_HIST_SHIFT) + #define APB_ERR_EN_SHIFT 0 #define APB_ERR_EN BIT(APB_ERR_EN_SHIFT) @@ -1306,12 +1318,106 @@ static int iproc_pcie_rev_init(struct iproc_pcie *pcie) return 0; } +static bool iproc_pci_hp_check_ltssm(struct iproc_pcie *pcie) +{ + struct pci_bus *bus = pcie->root_bus; + u32 val, timeout = CFG_RC_LTSSM_TIMEOUT; + + /* Clear LTSSM history. */ + pci_bus_read_config_dword(pcie->root_bus, 0, + CFG_RC_PHY_CTL, ); + pci_bus_write_config_dword(bus, 0, CFG_RC_PHY_CTL, + val | CFG_RC_CLR_RECOV_HIST_MASK | + CFG_RC_CLR_LTSSM_HIST_MASK); + /* write back the origional value. 
*/ + pci_bus_write_config_dword(bus, 0, CFG_RC_PHY_CTL, val); + + do { + pci_bus_read_config_dword(pcie->root_bus, 0, + CFG_RC_LTSSM, &val); + /* check link state to see if link moved to L1 state. */ + if ((val & CFG_RC_LTSSM_STATE_MASK) == +CFG_RC_LTSSM_STATE_L1) + return true; + timeout--; + usleep_range(500, 1000); + } while (timeout); + + return false; +} + +static irqreturn_t iproc_pci_hotplug_thread(int irq, void *data) +{ + struct iproc_pcie *pcie = data; + struct pci_bus *bus = pcie->root_bus, *child; + bool link_status; + + iproc_pcie_perst_ctrl(pcie, true); + iproc_pcie_perst_ctrl(pcie, false); + + link_status = iproc_pci_hp_check_ltssm(pcie); + + if (link_status && + !iproc_pcie_check_link(pcie, bus) && + !pcie->ep_is_present) { + pci_rescan_bus(bus); + list_for_each_entry(child, &bus->children, node) + pcie_bus_configure_settings(child); + pcie->ep_is_present = true; + dev_info(pcie->dev, +"PCI Hotplug: \n"); + } else if (link_status && pcie->ep_is_present) + /* +* ep_is_present makes sure, enumeration done only once. +* So it can handle spurious interrupts, and also if we +* get multiple interrupts for all the
[PATCH 0/2] PCI hotplug feature
These patches bring in PCI hotplug support for iproc family chipsets. It includes DT binding documentation update and, implementation in iproc pcie RC driver. These patch set is made on top of following patches. [PATCH v6 2/2] PCI: iproc: add device shutdown for PCI RC [PATCH v6 1/2] PCI: iproc: Retry request when CRS returned from EP Oza Pawandeep (2): PCI: iproc: Implement PCI hotplug support PCI: iproc: Add optional brcm,pci-hotplug .../devicetree/bindings/pci/brcm,iproc-pcie.txt| 23 drivers/pci/host/pcie-iproc-platform.c | 3 + drivers/pci/host/pcie-iproc.c | 130 - drivers/pci/host/pcie-iproc.h | 7 ++ 4 files changed, 157 insertions(+), 6 deletions(-) -- 1.9.1
[PATCH 2/2] PCI: iproc: Add optional brcm,pci-hotplug
Add description for optional device tree property 'brcm,pci-hotplug' for PCI hotplug feature. Signed-off-by: Oza PawandeepReviewed-by: Ray Jui diff --git a/Documentation/devicetree/bindings/pci/brcm,iproc-pcie.txt b/Documentation/devicetree/bindings/pci/brcm,iproc-pcie.txt index b8e48b4..a3bad24 100644 --- a/Documentation/devicetree/bindings/pci/brcm,iproc-pcie.txt +++ b/Documentation/devicetree/bindings/pci/brcm,iproc-pcie.txt @@ -72,6 +72,29 @@ Optional properties: - brcm,pcie-msi-inten: Needs to be present for some older iProc platforms that require the interrupt enable registers to be set explicitly to enable MSI +Optional properties: +- brcm,pci-hotplug: PCI hotplug feature is supported. + +If the brcm,pcie-hotplug property is present, the following properties become +effective: + +- brcm,prsnt-gpio: Array of gpios, needs to be present if Hotplug is supported. + +PCI hotplug implementation is SOC/Board specific, and also it depends on +how add-in card is designed (e.g. how many present pins are implemented). + +If x8 card is connected, then it might be possible that all the +3 present pins could go low, or at least one pin goes low. + +If x4 card is connected, then it might be possible that 2 present +pins go low, or at least one pin goes low. + +Example: +brcm,prsnt-gpio: < 32 1>, < 33 1>; +This is x4 connector: monitoring max 2 present lines. +brcm,prsnt-gpio: < 32 1>, < 33 1>, < 34 1>; +This is x8 connector: monitoring max 3 present lines. + Example: pcie0: pcie@18012000 { compatible = "brcm,iproc-pcie"; -- 1.9.1
[PATCH 2/2] PCI: iproc: Add optional brcm,pci-hotplug
Add description for optional device tree property 'brcm,pci-hotplug' for PCI hotplug feature. Signed-off-by: Oza Pawandeep Reviewed-by: Ray Jui diff --git a/Documentation/devicetree/bindings/pci/brcm,iproc-pcie.txt b/Documentation/devicetree/bindings/pci/brcm,iproc-pcie.txt index b8e48b4..a3bad24 100644 --- a/Documentation/devicetree/bindings/pci/brcm,iproc-pcie.txt +++ b/Documentation/devicetree/bindings/pci/brcm,iproc-pcie.txt @@ -72,6 +72,29 @@ Optional properties: - brcm,pcie-msi-inten: Needs to be present for some older iProc platforms that require the interrupt enable registers to be set explicitly to enable MSI +Optional properties: +- brcm,pci-hotplug: PCI hotplug feature is supported. + +If the brcm,pcie-hotplug property is present, the following properties become +effective: + +- brcm,prsnt-gpio: Array of gpios, needs to be present if Hotplug is supported. + +PCI hotplug implementation is SOC/Board specific, and also it depends on +how add-in card is designed (e.g. how many present pins are implemented). + +If x8 card is connected, then it might be possible that all the +3 present pins could go low, or at least one pin goes low. + +If x4 card is connected, then it might be possible that 2 present +pins go low, or at least one pin goes low. + +Example: +brcm,prsnt-gpio: < 32 1>, < 33 1>; +This is x4 connector: monitoring max 2 present lines. +brcm,prsnt-gpio: < 32 1>, < 33 1>, < 34 1>; +This is x8 connector: monitoring max 3 present lines. + Example: pcie0: pcie@18012000 { compatible = "brcm,iproc-pcie"; -- 1.9.1
Re: [PATCH] arm64: correct modules range of kernel virtual memory layout
On Mon, 2017-08-07 at 15:01 +0100, Will Deacon wrote: > On Mon, Aug 07, 2017 at 02:18:00PM +0100, Ard Biesheuvel wrote: > > On 7 August 2017 at 14:16, Will Deaconwrote: > > > On Mon, Aug 07, 2017 at 07:04:46PM +0800, Miles Chen wrote: > > >> The commit f80fb3a3d508 ("arm64: add support for kernel ASLR") > > >> moved module virtual address to > > >> [module_alloc_base, module_alloc_base + MODULES_VSIZE). > > >> > > >> Display module information of the virtual kernel > > >> memory layout by using module_alloc_base. > > >> > > >> testing output: > > >> 1) Current implementation: > > >> Virtual kernel memory layout: > > >> modules : 0xff80 - 0xff800800 ( 128 MB) > > >> 2) this patch + KASLR: > > >> Virtual kernel memory layout: > > >> modules : 0xff800056 - 0xff800856 ( 128 MB) > > >> 3) this patch + KASLR and a dummy seed: > > >> Virtual kernel memory layout: > > >> modules : 0xffa7df637000 - 0xffa7e7637000 ( 128 MB) > > >> > > >> Signed-off-by: Miles Chen > > >> --- > > >> arch/arm64/mm/init.c | 5 +++-- > > >> 1 file changed, 3 insertions(+), 2 deletions(-) > > > > > > Does this mean the modules code in our pt dumper is busted > > > (arch/arm64/mm/dump.c)? Also, what about KASAN, which uses these addresses > > > too (in kasan_init)? Should we just remove MODULES_VADDR and MODULES_END > > > altogether? > > > > > > > I don't think we need this patch. The 'module' line simply prints the > > VA region that is reserved for modules. The fact that we end up > > putting them elsewhere when running randomized does not necessarily > > mean this line should reflect that. > > I was more concerned by other users of MODULES_VADDR tbh, although I see > now that we don't randomize the module region if kasan is enabled. 
Still, > the kcore code adds the modules region as a separate area (distinct from > vmalloc) if MODULES_VADDR is defined, the page table dumping code uses > MODULES_VADDR to identify the module region and I think we'll get false > positives from is_vmalloc_or_module_addr, which again uses the static > region. > > So, given that MODULES_VADDR never points at the module area, can't we get > rid of it? Agreed. MODULES_VADDR should be phased out. Considering the kernel modules live somewhere between [VMALLOC_START, VMALLOC_END) now: (arch/arm64/kernel/module.c:module_alloc). I suggest the following changes: 1. is_vmalloc_or_module_addr() should return is_vmalloc_addr() directly 2. arch/arm64/mm/dump.c does not need MODULES_VADDR and MODULES_END. 3. kasan uses [module_alloc_base, module_alloc_base + MODULES_VSIZE) to get the shadow memory? (the kernel modules still live in this range when kasan is enabled) 4. remove modules line in kernel memory layout (optional, thanks for Ard's feedback) 5. remove MODULES_VADDR, MODULES_END definition Miles > > Will
Re: [PATCH] arm64: correct modules range of kernel virtual memory layout
On Mon, 2017-08-07 at 15:01 +0100, Will Deacon wrote: > On Mon, Aug 07, 2017 at 02:18:00PM +0100, Ard Biesheuvel wrote: > > On 7 August 2017 at 14:16, Will Deacon wrote: > > > On Mon, Aug 07, 2017 at 07:04:46PM +0800, Miles Chen wrote: > > >> The commit f80fb3a3d508 ("arm64: add support for kernel ASLR") > > >> moved module virtual address to > > >> [module_alloc_base, module_alloc_base + MODULES_VSIZE). > > >> > > >> Display module information of the virtual kernel > > >> memory layout by using module_alloc_base. > > >> > > >> testing output: > > >> 1) Current implementation: > > >> Virtual kernel memory layout: > > >> modules : 0xff80 - 0xff800800 ( 128 MB) > > >> 2) this patch + KASLR: > > >> Virtual kernel memory layout: > > >> modules : 0xff800056 - 0xff800856 ( 128 MB) > > >> 3) this patch + KASLR and a dummy seed: > > >> Virtual kernel memory layout: > > >> modules : 0xffa7df637000 - 0xffa7e7637000 ( 128 MB) > > >> > > >> Signed-off-by: Miles Chen > > >> --- > > >> arch/arm64/mm/init.c | 5 +++-- > > >> 1 file changed, 3 insertions(+), 2 deletions(-) > > > > > > Does this mean the modules code in our pt dumper is busted > > > (arch/arm64/mm/dump.c)? Also, what about KASAN, which uses these addresses > > > too (in kasan_init)? Should we just remove MODULES_VADDR and MODULES_END > > > altogether? > > > > > > > I don't think we need this patch. The 'module' line simply prints the > > VA region that is reserved for modules. The fact that we end up > > putting them elsewhere when running randomized does not necessarily > > mean this line should reflect that. > > I was more concerned by other users of MODULES_VADDR tbh, although I see > now that we don't randomize the module region if kasan is enabled. 
Still, > the kcore code adds the modules region as a separate area (distinct from > vmalloc) if MODULES_VADDR is defined, the page table dumping code uses > MODULES_VADDR to identify the module region and I think we'll get false > positives from is_vmalloc_or_module_addr, which again uses the static > region. > > So, given that MODULES_VADDR never points at the module area, can't we get > rid of it? Agreed.MODULES_VADDR should be phased out. Considering the kernel modules live somewhere between [VMALLOC_START, VMALLOC_END) now: (arch/arm64/kernel/module.c:module_alloc). I suggest the following changes: 1. is_vmalloc_or_module_addr() should return is_vmalloc_addr() directly 2. arch/arm64/mm/dump.c does not need MODULES_VADDR and MODULES_END. 3. kasan uses [module_alloc_base, module_alloc_base + MODULES_VSIZE) to get the shadow memory? (the kernel modules still live in this range when kasan is enabled) 4. remove modules line in kernel memory layout (optional, thanks for Ard's feedback) 5. remove MODULE_VADDR, MODULES_END definition Miles > > Will
[PATCH v2] perf/core: Avoid context switch overheads
From: "leilei.lin" A performance issue caused by less strict check in task sched when these tasks were once attached by per-task perf_event. A task will alloc task->perf_event_ctxp[ctxn] when it was called by perf_event_open, and task->perf_event_ctxp[ctxn] would not ever be freed to NULL. __perf_event_task_sched_in() if (task->perf_event_ctxp[ctxn]) // here is always true perf_event_context_sched_in() // operate pmu 50% at most performance overhead was observed under some extreme test case. Therefore, add a more strict check as to ctx->nr_events, when ctx->nr_events == 0, it's no need to continue. Signed-off-by: leilei.lin --- kernel/events/core.c | 9 + 1 file changed, 9 insertions(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index 426c2ff..3d86695 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3180,6 +3180,13 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, return; perf_ctx_lock(cpuctx, ctx); + /* +* We must check ctx->nr_events while holding ctx->lock, such +* that we serialize against perf_install_in_context(). +*/ + if (!cpuctx->task_ctx && !ctx->nr_events) + goto unlock; + perf_pmu_disable(ctx->pmu); /* * We want to keep the following priority order: @@ -3193,6 +3200,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, ctx, task); perf_pmu_enable(ctx->pmu); + +unlock: perf_ctx_unlock(cpuctx, ctx); } -- 2.8.4.31.g9ed660f
[PATCH v2] perf/core: Avoid context switch overheads
From: "leilei.lin" A performance issue caused by less strickly check in task sched when these tasks were once attached by per-task perf_event. A task will alloc task->perf_event_ctxp[ctxn] when it was called by perf_event_open, and task->perf_event_ctxp[ctxn] would not ever be freed to NULL. __perf_event_task_sched_in() if (task->perf_event_ctxp[ctxn]) // here is always true perf_event_context_sched_in() // operate pmu 50% at most performance overhead was observed under some extreme test case. Therefor, add a more strick check as to ctx->nr_events, when ctx->nr_events == 0, it's no need to continue. Signed-off-by: leilei.lin --- kernel/events/core.c | 9 + 1 file changed, 9 insertions(+) diff --git a/kernel/events/core.c b/kernel/events/core.c index 426c2ff..3d86695 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3180,6 +3180,13 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, return; perf_ctx_lock(cpuctx, ctx); + /* +* We must check ctx->nr_events while holding ctx->lock, such +* that we serialize against perf_install_in_context(). +*/ + if (!cpuctx->task_ctx && !ctx->nr_events) + goto unlock; + perf_pmu_disable(ctx->pmu); /* * We want to keep the following priority order: @@ -3193,6 +3200,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); perf_event_sched_in(cpuctx, ctx, task); perf_pmu_enable(ctx->pmu); + +unlock: perf_ctx_unlock(cpuctx, ctx); } -- 2.8.4.31.g9ed660f
[PATCH 2/3] autofs - make disc device user accessible
The autofs miscellaneous device ioctls that shouldn't require CAP_SYS_ADMIN need to be accessible to user space applications in order to be able to get information about autofs mounts. The module checks capabilities so the miscellaneous device should be fine with broad permissions. Signed-off-by: Ian Kent Cc: Colin Walters Cc: Ondrej Holy --- fs/autofs4/dev-ioctl.c |3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index dd9f1bebb5a3..218a4ecc75cc 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -733,7 +733,8 @@ static const struct file_operations _dev_ioctl_fops = { static struct miscdevice _autofs_dev_ioctl_misc = { .minor = AUTOFS_MINOR, .name = AUTOFS_DEVICE_NAME, - .fops = &_dev_ioctl_fops + .fops = &_dev_ioctl_fops, + .mode = 0644, }; MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
[PATCH 2/3] autofs - make disc device user accessible
The autofs miscellanous device ioctls that shouldn't require CAP_SYS_ADMIN need to be accessible to user space applications in order to be able to get information about autofs mounts. The module checks capabilities so the miscelaneous device should be fine with broad permissions. Signed-off-by: Ian Kent Cc: Colin Walters Cc: Ondrej Holy --- fs/autofs4/dev-ioctl.c |3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index dd9f1bebb5a3..218a4ecc75cc 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -733,7 +733,8 @@ static const struct file_operations _dev_ioctl_fops = { static struct miscdevice _autofs_dev_ioctl_misc = { .minor = AUTOFS_MINOR, .name = AUTOFS_DEVICE_NAME, - .fops = &_dev_ioctl_fops + .fops = &_dev_ioctl_fops, + .mode = 0644, }; MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
[PATCH 1/3] autofs - fix AT_NO_AUTOMOUNT not being honored
The fstatat(2) and statx() calls can pass the flag AT_NO_AUTOMOUNT which is meant to clear the LOOKUP_AUTOMOUNT flag and prevent triggering of an automount by the call. But this flag is unconditionally cleared for all stat family system calls except statx(). stat family system calls have always triggered mount requests for the negative dentry case in follow_automount() which is intended but prevents the fstatat(2) and statx() AT_NO_AUTOMOUNT case from being handled. In order to handle the AT_NO_AUTOMOUNT for both system calls the negative dentry case in follow_automount() needs to be changed to return ENOENT when the LOOKUP_AUTOMOUNT flag is clear (and the other required flags are clear). AFAICT this change doesn't have any noticable side effects and may, in some use cases (although I didn't see it in testing) prevent unnecessary callbacks to the automount daemon. It's also possible that a stat family call has been made with a path that is in the process of being mounted by some other process. But stat family calls should return the automount state of the path as it is "now" so it shouldn't wait for mount completion. This is the same semantic as the positive dentry case already handled. Signed-off-by: Ian KentCc: David Howells Cc: Colin Walters Cc: Ondrej Holy --- fs/namei.c | 15 --- include/linux/fs.h |3 +-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index ddb6a7c2b3d4..1180f9c58093 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1129,9 +1129,18 @@ static int follow_automount(struct path *path, struct nameidata *nd, * of the daemon to instantiate them before they can be used. 
*/ if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | - LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && - path->dentry->d_inode) - return -EISDIR; + LOOKUP_OPEN | LOOKUP_CREATE | + LOOKUP_AUTOMOUNT))) { + /* Positive dentry that isn't meant to trigger an +* automount, EISDIR will allow it to be used, +* otherwise there's no mount here "now" so return +* ENOENT. +*/ + if (path->dentry->d_inode) + return -EISDIR; + else + return -ENOENT; + } if (path->dentry->d_sb->s_user_ns != _user_ns) return -EACCES; diff --git a/include/linux/fs.h b/include/linux/fs.h index 6e1fd5d21248..37c96f52e48e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3022,8 +3022,7 @@ static inline int vfs_lstat(const char __user *name, struct kstat *stat) static inline int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flags) { - return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT, -stat, STATX_BASIC_STATS); + return vfs_statx(dfd, filename, flags, stat, STATX_BASIC_STATS); } static inline int vfs_fstat(int fd, struct kstat *stat) {
[PATCH 3/3] autofs - make dev ioctl version and ismountpoint user accessible
Some of the autofs miscellaneous device ioctls need to be accessable to user space applications without CAP_SYS_ADMIN to get information about autofs mounts. Signed-off-by: Ian KentCc: Colin Walters Cc: Ondrej Holy --- fs/autofs4/dev-ioctl.c | 12 include/uapi/linux/auto_dev-ioctl.h |2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 218a4ecc75cc..ea8b3a1cddd2 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -628,10 +628,6 @@ static int _autofs_dev_ioctl(unsigned int command, ioctl_fn fn = NULL; int err = 0; - /* only root can play with this */ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST); cmd = _IOC_NR(command); @@ -640,6 +636,14 @@ static int _autofs_dev_ioctl(unsigned int command, return -ENOTTY; } + /* Only root can use ioctls other than AUTOFS_DEV_IOCTL_VERSION_CMD +* and AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD +*/ + if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD && + cmd != AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + /* Copy the parameters into kernel space. */ param = copy_dev_ioctl(user); if (IS_ERR(param)) diff --git a/include/uapi/linux/auto_dev-ioctl.h b/include/uapi/linux/auto_dev-ioctl.h index 744b3d060968..5558db8e6646 100644 --- a/include/uapi/linux/auto_dev-ioctl.h +++ b/include/uapi/linux/auto_dev-ioctl.h @@ -16,7 +16,7 @@ #define AUTOFS_DEVICE_NAME "autofs" #define AUTOFS_DEV_IOCTL_VERSION_MAJOR 1 -#define AUTOFS_DEV_IOCTL_VERSION_MINOR 0 +#define AUTOFS_DEV_IOCTL_VERSION_MINOR 1 #define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl)
[PATCH 1/3] autofs - fix AT_NO_AUTOMOUNT not being honored
The fstatat(2) and statx() calls can pass the flag AT_NO_AUTOMOUNT which is meant to clear the LOOKUP_AUTOMOUNT flag and prevent triggering of an automount by the call. But this flag is unconditionally cleared for all stat family system calls except statx(). stat family system calls have always triggered mount requests for the negative dentry case in follow_automount() which is intended but prevents the fstatat(2) and statx() AT_NO_AUTOMOUNT case from being handled. In order to handle the AT_NO_AUTOMOUNT for both system calls the negative dentry case in follow_automount() needs to be changed to return ENOENT when the LOOKUP_AUTOMOUNT flag is clear (and the other required flags are clear). AFAICT this change doesn't have any noticable side effects and may, in some use cases (although I didn't see it in testing) prevent unnecessary callbacks to the automount daemon. It's also possible that a stat family call has been made with a path that is in the process of being mounted by some other process. But stat family calls should return the automount state of the path as it is "now" so it shouldn't wait for mount completion. This is the same semantic as the positive dentry case already handled. Signed-off-by: Ian Kent Cc: David Howells Cc: Colin Walters Cc: Ondrej Holy --- fs/namei.c | 15 --- include/linux/fs.h |3 +-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index ddb6a7c2b3d4..1180f9c58093 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1129,9 +1129,18 @@ static int follow_automount(struct path *path, struct nameidata *nd, * of the daemon to instantiate them before they can be used. 
*/ if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | - LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && - path->dentry->d_inode) - return -EISDIR; + LOOKUP_OPEN | LOOKUP_CREATE | + LOOKUP_AUTOMOUNT))) { + /* Positive dentry that isn't meant to trigger an +* automount, EISDIR will allow it to be used, +* otherwise there's no mount here "now" so return +* ENOENT. +*/ + if (path->dentry->d_inode) + return -EISDIR; + else + return -ENOENT; + } if (path->dentry->d_sb->s_user_ns != _user_ns) return -EACCES; diff --git a/include/linux/fs.h b/include/linux/fs.h index 6e1fd5d21248..37c96f52e48e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3022,8 +3022,7 @@ static inline int vfs_lstat(const char __user *name, struct kstat *stat) static inline int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flags) { - return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT, -stat, STATX_BASIC_STATS); + return vfs_statx(dfd, filename, flags, stat, STATX_BASIC_STATS); } static inline int vfs_fstat(int fd, struct kstat *stat) {
[PATCH 3/3] autofs - make dev ioctl version and ismountpoint user accessible
Some of the autofs miscellaneous device ioctls need to be accessable to user space applications without CAP_SYS_ADMIN to get information about autofs mounts. Signed-off-by: Ian Kent Cc: Colin Walters Cc: Ondrej Holy --- fs/autofs4/dev-ioctl.c | 12 include/uapi/linux/auto_dev-ioctl.h |2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 218a4ecc75cc..ea8b3a1cddd2 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -628,10 +628,6 @@ static int _autofs_dev_ioctl(unsigned int command, ioctl_fn fn = NULL; int err = 0; - /* only root can play with this */ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST); cmd = _IOC_NR(command); @@ -640,6 +636,14 @@ static int _autofs_dev_ioctl(unsigned int command, return -ENOTTY; } + /* Only root can use ioctls other than AUTOFS_DEV_IOCTL_VERSION_CMD +* and AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD +*/ + if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD && + cmd != AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + /* Copy the parameters into kernel space. */ param = copy_dev_ioctl(user); if (IS_ERR(param)) diff --git a/include/uapi/linux/auto_dev-ioctl.h b/include/uapi/linux/auto_dev-ioctl.h index 744b3d060968..5558db8e6646 100644 --- a/include/uapi/linux/auto_dev-ioctl.h +++ b/include/uapi/linux/auto_dev-ioctl.h @@ -16,7 +16,7 @@ #define AUTOFS_DEVICE_NAME "autofs" #define AUTOFS_DEV_IOCTL_VERSION_MAJOR 1 -#define AUTOFS_DEV_IOCTL_VERSION_MINOR 0 +#define AUTOFS_DEV_IOCTL_VERSION_MINOR 1 #define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl)
[PATCH] thermal: rockchip: fix error return code in rockchip_thermal_probe()
platform_get_irq() returns an error code, but the rockchip_thermal driver ignores it and always returns -EINVAL. This is not correct and prevents -EPROBE_DEFER from being propagated properly. Notice that platform_get_irq() no longer returns 0 on error: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e330b9a6bb35dc7097a4f02cb1ae7b6f96df92af Print and propagate the return value of platform_get_irq on failure. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva --- drivers/thermal/rockchip_thermal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/rockchip_thermal.c b/drivers/thermal/rockchip_thermal.c index 4c77965..6ca9747 100644 --- a/drivers/thermal/rockchip_thermal.c +++ b/drivers/thermal/rockchip_thermal.c @@ -1068,8 +1068,8 @@ static int rockchip_thermal_probe(struct platform_device *pdev) irq = platform_get_irq(pdev, 0); if (irq < 0) { - dev_err(&pdev->dev, "no irq resource?\n"); - return -EINVAL; + dev_err(&pdev->dev, "no irq resource: %d\n", irq); + return irq; } thermal = devm_kzalloc(&pdev->dev, sizeof(struct rockchip_thermal_data), -- 2.5.0
[PATCH] thermal: rockchip: fix error return code in rockchip_thermal_probe()
platform_get_irq() returns an error code, but the rockchip_thermal driver ignores it and always returns -EINVAL. This is not correct and, prevents -EPROBE_DEFER from being propagated properly. Notice that platform_get_irq() no longer returns 0 on error: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e330b9a6bb35dc7097a4f02cb1ae7b6f96df92af Print and propagate the return value of platform_get_irq on failure. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva --- drivers/thermal/rockchip_thermal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/rockchip_thermal.c b/drivers/thermal/rockchip_thermal.c index 4c77965..6ca9747 100644 --- a/drivers/thermal/rockchip_thermal.c +++ b/drivers/thermal/rockchip_thermal.c @@ -1068,8 +1068,8 @@ static int rockchip_thermal_probe(struct platform_device *pdev) irq = platform_get_irq(pdev, 0); if (irq < 0) { - dev_err(>dev, "no irq resource?\n"); - return -EINVAL; + dev_err(>dev, "no irq resource: %d\n", irq); + return irq; } thermal = devm_kzalloc(>dev, sizeof(struct rockchip_thermal_data), -- 2.5.0
Re: [PATCH 3/3] IPI: Avoid to use 2 cache lines for one call_single_data
Peter Zijlstrawrites: > On Sat, Aug 05, 2017 at 08:47:02AM +0800, Huang, Ying wrote: >> Yes. That looks good. So you will prepare the final patch? Or you >> hope me to do that? > > I was hoping you'd do it ;-) Thanks! Here is the updated patch Best Regards, Huang, Ying -->8-- >From 957735e9ff3922368286540dab852986fc7b23b5 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 7 Aug 2017 16:55:33 +0800 Subject: [PATCH -v3] IPI: Avoid to use 2 cache lines for one call_single_data struct call_single_data is used in IPI to transfer information between CPUs. Its size is bigger than sizeof(unsigned long) and less than cache line size. Now, it is allocated with no explicit alignment requirement. This makes it possible for allocated call_single_data to cross 2 cache lines. So that double the number of the cache lines that need to be transferred among CPUs. This is resolved by requiring call_single_data to be aligned with the size of call_single_data. Now the size of call_single_data is the power of 2. If we add new fields to call_single_data, we may need to add pads to make sure the size of new definition is the power of 2. Fortunately, this is enforced by gcc, which will report error for not power of 2 alignment requirement. To set alignment requirement of call_single_data to the size of call_single_data, a struct definition and a typedef is used. To test the effect of the patch, we use the vm-scalability multiple thread swap test case (swap-w-seq-mt). The test will create multiple threads and each thread will eat memory until all RAM and part of swap is used, so that huge number of IPI will be triggered when unmapping memory. In the test, the throughput of memory writing improves ~5% compared with misaligned call_single_data because of faster IPI. 
[Add call_single_data_t and align with size of call_single_data] Suggested-by: Peter Zijlstra Signed-off-by: "Huang, Ying" Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Michael Ellerman Cc: Borislav Petkov Cc: Thomas Gleixner Cc: Juergen Gross Cc: Aaron Lu --- arch/mips/kernel/smp.c | 6 ++-- block/blk-softirq.c| 2 +- drivers/block/null_blk.c | 2 +- drivers/cpuidle/coupled.c | 10 +++ drivers/net/ethernet/cavium/liquidio/lio_main.c| 2 +- drivers/net/ethernet/cavium/liquidio/octeon_droq.h | 2 +- include/linux/blkdev.h | 2 +- include/linux/netdevice.h | 2 +- include/linux/smp.h| 8 -- kernel/sched/sched.h | 2 +- kernel/smp.c | 32 -- kernel/up.c| 2 +- 12 files changed, 39 insertions(+), 33 deletions(-) diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 770d4d1516cb..bd8ba5472bca 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -648,12 +648,12 @@ EXPORT_SYMBOL(flush_tlb_one); #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST static DEFINE_PER_CPU(atomic_t, tick_broadcast_count); -static DEFINE_PER_CPU(struct call_single_data, tick_broadcast_csd); +static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd); void tick_broadcast(const struct cpumask *mask) { atomic_t *count; - struct call_single_data *csd; + call_single_data_t *csd; int cpu; for_each_cpu(cpu, mask) { @@ -674,7 +674,7 @@ static void tick_broadcast_callee(void *info) static int __init tick_broadcast_init(void) { - struct call_single_data *csd; + call_single_data_t *csd; int cpu; for (cpu = 0; cpu < NR_CPUS; cpu++) { diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 87b7df4851bf..07125e7941f4 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -60,7 +60,7 @@ static void trigger_softirq(void *data) static int raise_blk_irq(int cpu, struct request *rq) { if (cpu_online(cpu)) { - struct call_single_data *data = >csd; + call_single_data_t *data = >csd; data->func = trigger_softirq; data->info = rq; diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 
85c24cace973..81142ce781da 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -13,7 +13,7 @@ struct nullb_cmd { struct list_head list; struct llist_node ll_list; - struct call_single_data csd; + call_single_data_t csd; struct request *rq; struct bio *bio; unsigned int tag; diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c index 71e586d7df71..147f38ea0fcd
Re: [PATCH 3/3] IPI: Avoid to use 2 cache lines for one call_single_data
Peter Zijlstra writes: > On Sat, Aug 05, 2017 at 08:47:02AM +0800, Huang, Ying wrote: >> Yes. That looks good. So you will prepare the final patch? Or you >> hope me to do that? > > I was hoping you'd do it ;-) Thanks! Here is the updated patch Best Regards, Huang, Ying -->8-- >From 957735e9ff3922368286540dab852986fc7b23b5 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Mon, 7 Aug 2017 16:55:33 +0800 Subject: [PATCH -v3] IPI: Avoid to use 2 cache lines for one call_single_data struct call_single_data is used in IPI to transfer information between CPUs. Its size is bigger than sizeof(unsigned long) and less than cache line size. Now, it is allocated with no explicit alignment requirement. This makes it possible for allocated call_single_data to cross 2 cache lines. So that double the number of the cache lines that need to be transferred among CPUs. This is resolved by requiring call_single_data to be aligned with the size of call_single_data. Now the size of call_single_data is the power of 2. If we add new fields to call_single_data, we may need to add pads to make sure the size of new definition is the power of 2. Fortunately, this is enforced by gcc, which will report error for not power of 2 alignment requirement. To set alignment requirement of call_single_data to the size of call_single_data, a struct definition and a typedef is used. To test the effect of the patch, we use the vm-scalability multiple thread swap test case (swap-w-seq-mt). The test will create multiple threads and each thread will eat memory until all RAM and part of swap is used, so that huge number of IPI will be triggered when unmapping memory. In the test, the throughput of memory writing improves ~5% compared with misaligned call_single_data because of faster IPI. 
[Add call_single_data_t and align with size of call_single_data] Suggested-by: Peter Zijlstra Signed-off-by: "Huang, Ying" Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Michael Ellerman Cc: Borislav Petkov Cc: Thomas Gleixner Cc: Juergen Gross Cc: Aaron Lu --- arch/mips/kernel/smp.c | 6 ++-- block/blk-softirq.c| 2 +- drivers/block/null_blk.c | 2 +- drivers/cpuidle/coupled.c | 10 +++ drivers/net/ethernet/cavium/liquidio/lio_main.c| 2 +- drivers/net/ethernet/cavium/liquidio/octeon_droq.h | 2 +- include/linux/blkdev.h | 2 +- include/linux/netdevice.h | 2 +- include/linux/smp.h| 8 -- kernel/sched/sched.h | 2 +- kernel/smp.c | 32 -- kernel/up.c| 2 +- 12 files changed, 39 insertions(+), 33 deletions(-) diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 770d4d1516cb..bd8ba5472bca 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -648,12 +648,12 @@ EXPORT_SYMBOL(flush_tlb_one); #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST static DEFINE_PER_CPU(atomic_t, tick_broadcast_count); -static DEFINE_PER_CPU(struct call_single_data, tick_broadcast_csd); +static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd); void tick_broadcast(const struct cpumask *mask) { atomic_t *count; - struct call_single_data *csd; + call_single_data_t *csd; int cpu; for_each_cpu(cpu, mask) { @@ -674,7 +674,7 @@ static void tick_broadcast_callee(void *info) static int __init tick_broadcast_init(void) { - struct call_single_data *csd; + call_single_data_t *csd; int cpu; for (cpu = 0; cpu < NR_CPUS; cpu++) { diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 87b7df4851bf..07125e7941f4 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -60,7 +60,7 @@ static void trigger_softirq(void *data) static int raise_blk_irq(int cpu, struct request *rq) { if (cpu_online(cpu)) { - struct call_single_data *data = >csd; + call_single_data_t *data = >csd; data->func = trigger_softirq; data->info = rq; diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 
85c24cace973..81142ce781da 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -13,7 +13,7 @@ struct nullb_cmd { struct list_head list; struct llist_node ll_list; - struct call_single_data csd; + call_single_data_t csd; struct request *rq; struct bio *bio; unsigned int tag; diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c index 71e586d7df71..147f38ea0fcd 100644 --- a/drivers/cpuidle/coupled.c +++ b/drivers/cpuidle/coupled.c @@ -119,13 +119,13 @@ struct cpuidle_coupled { #define CPUIDLE_COUPLED_NOT_IDLE (-1) -static DEFINE_PER_CPU(struct call_single_data,
[PATCH v5 1/1] usb:host:xhci support option to disable the xHCI USB2 HW LPM
XHCI specification 1.1 does not require xHCI-compliant controllers to always enable hardware USB2 LPM. However, the current xHCI driver always enable it when seeing HLC=1. This patch supports an option for users to control disabling USB2 Hardware LPM via DT/ACPI attribute. This option is needed in case user would like to disable this feature. For example, their xHCI controller has its USB2 HW LPM broken. Signed-off-by: Tung NguyenSigned-off-by: Thang Q. Nguyen Acked-by: Rob Herring --- Changes since v4: - When HW LPM is optionally disabled, explicitly disable HLE, RWE, ... - Update codes to work with kernel 4.13-rc4 - Add Acked-By from Rob Herring Changes since v3: - Bypass updating LPM parameters when HW LPM is optionally disabled. Changes since v2: - Change code to disable HW LPM as an option for user which is set via ACPI/DT. Changes since v1: - Update DT/ACPI attribute and corresponding codes from HLE to LPM to be consistent with other attribute names. --- Documentation/devicetree/bindings/usb/usb-xhci.txt |1 + drivers/usb/host/xhci-plat.c |3 +++ drivers/usb/host/xhci.c|2 +- drivers/usb/host/xhci.h|1 + 4 files changed, 6 insertions(+), 1 deletions(-) diff --git a/Documentation/devicetree/bindings/usb/usb-xhci.txt b/Documentation/devicetree/bindings/usb/usb-xhci.txt index 2d80b60..ae6e484 100644 --- a/Documentation/devicetree/bindings/usb/usb-xhci.txt +++ b/Documentation/devicetree/bindings/usb/usb-xhci.txt @@ -26,6 +26,7 @@ Required properties: Optional properties: - clocks: reference to a clock + - usb2-lpm-disable: indicate if we don't want to enable USB2 HW LPM - usb3-lpm-capable: determines if platform is USB3 LPM capable - quirk-broken-port-ped: set if the controller has broken port disable mechanism diff --git a/drivers/usb/host/xhci-plat.c b/drivers/usb/host/xhci-plat.c index c04144b..9028fb5 100644 --- a/drivers/usb/host/xhci-plat.c +++ b/drivers/usb/host/xhci-plat.c @@ -267,6 +267,9 @@ static int xhci_plat_probe(struct platform_device *pdev) goto 
disable_clk; } + if (device_property_read_bool(sysdev, "usb2-lpm-disable")) + xhci->quirks |= XHCI_HW_LPM_DISABLE; + if (device_property_read_bool(sysdev, "usb3-lpm-capable")) xhci->quirks |= XHCI_LPM_SUPPORT; diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index b2ff1ff..3a8e75f 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -4087,7 +4087,7 @@ static int xhci_set_usb2_hardware_lpm(struct usb_hcd *hcd, xhci_dbg(xhci, "%s port %d USB2 hardware LPM\n", enable ? "enable" : "disable", port_num + 1); - if (enable) { + if (enable && !(xhci->quirks & XHCI_HW_LPM_DISABLE)) { /* Host supports BESL timeout instead of HIRD */ if (udev->usb2_hw_lpm_besl_capable) { /* if device doesn't have a preferred BESL value use a diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index e3e9352..5d89c51 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -1821,6 +1821,7 @@ struct xhci_hcd { #define XHCI_LIMIT_ENDPOINT_INTERVAL_7 (1 << 26) #define XHCI_U2_DISABLE_WAKE (1 << 27) #define XHCI_ASMEDIA_MODIFY_FLOWCONTROL(1 << 28) +#define XHCI_HW_LPM_DISABLE(1 << 29) unsigned intnum_active_eps; unsigned intlimit_active_eps; -- 1.7.1
[PATCH v5 1/1] usb:host:xhci support option to disable the xHCI USB2 HW LPM
XHCI specification 1.1 does not require xHCI-compliant controllers to always enable hardware USB2 LPM. However, the current xHCI driver always enable it when seeing HLC=1. This patch supports an option for users to control disabling USB2 Hardware LPM via DT/ACPI attribute. This option is needed in case user would like to disable this feature. For example, their xHCI controller has its USB2 HW LPM broken. Signed-off-by: Tung Nguyen Signed-off-by: Thang Q. Nguyen Acked-by: Rob Herring --- Changes since v4: - When HW LPM is optionally disabled, explicitly disable HLE, RWE, ... - Update codes to work with kernel 4.13-rc4 - Add Acked-By from Rob Herring Changes since v3: - Bypass updating LPM parameters when HW LPM is optionally disabled. Changes since v2: - Change code to disable HW LPM as an option for user which is set via ACPI/DT. Changes since v1: - Update DT/ACPI attribute and corresponding codes from HLE to LPM to be consistent with other attribute names. --- Documentation/devicetree/bindings/usb/usb-xhci.txt |1 + drivers/usb/host/xhci-plat.c |3 +++ drivers/usb/host/xhci.c|2 +- drivers/usb/host/xhci.h|1 + 4 files changed, 6 insertions(+), 1 deletions(-) diff --git a/Documentation/devicetree/bindings/usb/usb-xhci.txt b/Documentation/devicetree/bindings/usb/usb-xhci.txt index 2d80b60..ae6e484 100644 --- a/Documentation/devicetree/bindings/usb/usb-xhci.txt +++ b/Documentation/devicetree/bindings/usb/usb-xhci.txt @@ -26,6 +26,7 @@ Required properties: Optional properties: - clocks: reference to a clock + - usb2-lpm-disable: indicate if we don't want to enable USB2 HW LPM - usb3-lpm-capable: determines if platform is USB3 LPM capable - quirk-broken-port-ped: set if the controller has broken port disable mechanism diff --git a/drivers/usb/host/xhci-plat.c b/drivers/usb/host/xhci-plat.c index c04144b..9028fb5 100644 --- a/drivers/usb/host/xhci-plat.c +++ b/drivers/usb/host/xhci-plat.c @@ -267,6 +267,9 @@ static int xhci_plat_probe(struct platform_device *pdev) goto 
disable_clk; } + if (device_property_read_bool(sysdev, "usb2-lpm-disable")) + xhci->quirks |= XHCI_HW_LPM_DISABLE; + if (device_property_read_bool(sysdev, "usb3-lpm-capable")) xhci->quirks |= XHCI_LPM_SUPPORT; diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index b2ff1ff..3a8e75f 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -4087,7 +4087,7 @@ static int xhci_set_usb2_hardware_lpm(struct usb_hcd *hcd, xhci_dbg(xhci, "%s port %d USB2 hardware LPM\n", enable ? "enable" : "disable", port_num + 1); - if (enable) { + if (enable && !(xhci->quirks & XHCI_HW_LPM_DISABLE)) { /* Host supports BESL timeout instead of HIRD */ if (udev->usb2_hw_lpm_besl_capable) { /* if device doesn't have a preferred BESL value use a diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index e3e9352..5d89c51 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -1821,6 +1821,7 @@ struct xhci_hcd { #define XHCI_LIMIT_ENDPOINT_INTERVAL_7 (1 << 26) #define XHCI_U2_DISABLE_WAKE (1 << 27) #define XHCI_ASMEDIA_MODIFY_FLOWCONTROL(1 << 28) +#define XHCI_HW_LPM_DISABLE(1 << 29) unsigned intnum_active_eps; unsigned intlimit_active_eps; -- 1.7.1
Re: [PATCH -mm] mm: Clear to access sub-page last when clearing huge page
Mike Kravetz writes: > On 08/07/2017 12:21 AM, Huang, Ying wrote: >> From: Huang Ying >> >> Huge page helps to reduce TLB miss rate, but it has higher cache >> footprint, sometimes this may cause some issue. For example, when >> clearing huge page on x86_64 platform, the cache footprint is 2M. But >> on a Xeon E5 v3 2699 CPU, there are 18 cores, 36 threads, and only 45M >> LLC (last level cache). That is, in average, there are 2.5M LLC for >> each core and 1.25M LLC for each thread. If the cache pressure is >> heavy when clearing the huge page, and we clear the huge page from the >> begin to the end, it is possible that the begin of huge page is >> evicted from the cache after we finishing clearing the end of the huge >> page. And it is possible for the application to access the begin of >> the huge page after clearing the huge page. >> >> To help the above situation, in this patch, when we clear a huge page, >> the order to clear sub-pages is changed. In quite some situation, we >> can get the address that the application will access after we clear >> the huge page, for example, in a page fault handler. Instead of >> clearing the huge page from begin to end, we will clear the sub-pages >> farthest from the the sub-page to access firstly, and clear the >> sub-page to access last. This will make the sub-page to access most >> cache-hot and sub-pages around it more cache-hot too. If we cannot >> know the address the application will access, the begin of the huge >> page is assumed to be the the address the application will access. >> >> With this patch, the throughput increases ~28.3% in vm-scalability >> anon-w-seq test case with 72 processes on a 2 socket Xeon E5 v3 2699 >> system (36 cores, 72 threads). The test case creates 72 processes, >> each process mmap a big anonymous memory area and writes to it from >> the begin to the end. For each process, other processes could be seen >> as other workload which generates heavy cache pressure. 
At the same >> time, the cache miss rate reduced from ~33.4% to ~31.7%, the >> IPC (instruction per cycle) increased from 0.56 to 0.74, and the time >> spent in user space is reduced ~7.9% >> >> Thanks Andi Kleen to propose to use address to access to determine the >> order of sub-pages to clear. >> >> The hugetlbfs access address could be improved, will do that in >> another patch. > > hugetlb_fault masks off the actual faulting address with, > address &= huge_page_mask(h); > before calling hugetlb_no_page. > > But, we could pass down the actual (unmasked) address to take advantage > of this optimization for hugetlb faults as well. hugetlb_fault is the > only caller of hugetlb_no_page, so this should be pretty straight forward. > > Were you thinking of additional improvements? No. I am thinking of something like this. If the basic idea is accepted, I plan to add better support like this for hugetlbfs in another patch. Best Regards, Huang, Ying
Re: [PATCH -mm] mm: Clear to access sub-page last when clearing huge page
Mike Kravetz writes: > On 08/07/2017 12:21 AM, Huang, Ying wrote: >> From: Huang Ying >> >> Huge page helps to reduce TLB miss rate, but it has higher cache >> footprint, sometimes this may cause some issue. For example, when >> clearing huge page on x86_64 platform, the cache footprint is 2M. But >> on a Xeon E5 v3 2699 CPU, there are 18 cores, 36 threads, and only 45M >> LLC (last level cache). That is, in average, there are 2.5M LLC for >> each core and 1.25M LLC for each thread. If the cache pressure is >> heavy when clearing the huge page, and we clear the huge page from the >> begin to the end, it is possible that the begin of huge page is >> evicted from the cache after we finishing clearing the end of the huge >> page. And it is possible for the application to access the begin of >> the huge page after clearing the huge page. >> >> To help the above situation, in this patch, when we clear a huge page, >> the order to clear sub-pages is changed. In quite some situation, we >> can get the address that the application will access after we clear >> the huge page, for example, in a page fault handler. Instead of >> clearing the huge page from begin to end, we will clear the sub-pages >> farthest from the the sub-page to access firstly, and clear the >> sub-page to access last. This will make the sub-page to access most >> cache-hot and sub-pages around it more cache-hot too. If we cannot >> know the address the application will access, the begin of the huge >> page is assumed to be the the address the application will access. >> >> With this patch, the throughput increases ~28.3% in vm-scalability >> anon-w-seq test case with 72 processes on a 2 socket Xeon E5 v3 2699 >> system (36 cores, 72 threads). The test case creates 72 processes, >> each process mmap a big anonymous memory area and writes to it from >> the begin to the end. For each process, other processes could be seen >> as other workload which generates heavy cache pressure. 
At the same >> time, the cache miss rate reduced from ~33.4% to ~31.7%, the >> IPC (instruction per cycle) increased from 0.56 to 0.74, and the time >> spent in user space is reduced ~7.9% >> >> Thanks Andi Kleen to propose to use address to access to determine the >> order of sub-pages to clear. >> >> The hugetlbfs access address could be improved, will do that in >> another patch. > > hugetlb_fault masks off the actual faulting address with, > address &= huge_page_mask(h); > before calling hugetlb_no_page. > > But, we could pass down the actual (unmasked) address to take advantage > of this optimization for hugetlb faults as well. hugetlb_fault is the > only caller of hugetlb_no_page, so this should be pretty straight forward. > > Were you thinking of additional improvements? No. I am thinking of something like this. If the basic idea is accepted, I plan to add better support like this for hugetlbfs in another patch. Best Regards, Huang, Ying
Re: [lkp-robot] [mm] 7674270022: will-it-scale.per_process_ops -19.3% regression
Minchan Kim wrote: > Hi, > > On Tue, Aug 08, 2017 at 09:19:23AM +0800, kernel test robot wrote: >> Greeting, >> >> FYI, we noticed a -19.3% regression of will-it-scale.per_process_ops due to >> commit: >> >> >> commit: 76742700225cad9df49f05399381ac3f1ec3dc60 ("mm: fix >> MADV_[FREE|DONTNEED] TLB flush miss problem") >> url: >> https://github.com/0day-ci/linux/commits/Nadav-Amit/mm-migrate-prevent-racy-access-to-tlb_flush_pending/20170802-205715 >> >> >> in testcase: will-it-scale >> on test machine: 88 threads Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz with >> 64G memory >> with following parameters: >> >> nr_task: 16 >> mode: process >> test: brk1 >> cpufreq_governor: performance >> >> test-description: Will It Scale takes a testcase and runs it from 1 through >> to n parallel copies to see if the testcase will scale. It builds both a >> process and threads based test in order to see any differences between the >> two. >> test-url: https://github.com/antonblanchard/will-it-scale > > Thanks for the report. > Could you explain what kinds of workload you are testing? > > Does it calls frequently madvise(MADV_DONTNEED) in parallel on multiple > threads? According to the description it is "testcase:brk increase/decrease of one page”. According to the mode it spawns multiple processes, not threads. Since a single page is unmapped each time, and the iTLB-loads increase dramatically, I would suspect that for some reason a full TLB flush is caused during do_munmap(). If I find some free time, I’ll try to profile the workload - but feel free to beat me to it. Nadav
Re: [lkp-robot] [mm] 7674270022: will-it-scale.per_process_ops -19.3% regression
Minchan Kim wrote: > Hi, > > On Tue, Aug 08, 2017 at 09:19:23AM +0800, kernel test robot wrote: >> Greeting, >> >> FYI, we noticed a -19.3% regression of will-it-scale.per_process_ops due to >> commit: >> >> >> commit: 76742700225cad9df49f05399381ac3f1ec3dc60 ("mm: fix >> MADV_[FREE|DONTNEED] TLB flush miss problem") >> url: >> https://github.com/0day-ci/linux/commits/Nadav-Amit/mm-migrate-prevent-racy-access-to-tlb_flush_pending/20170802-205715 >> >> >> in testcase: will-it-scale >> on test machine: 88 threads Intel(R) Xeon(R) CPU E5-2699 v4 @ 2.20GHz with >> 64G memory >> with following parameters: >> >> nr_task: 16 >> mode: process >> test: brk1 >> cpufreq_governor: performance >> >> test-description: Will It Scale takes a testcase and runs it from 1 through >> to n parallel copies to see if the testcase will scale. It builds both a >> process and threads based test in order to see any differences between the >> two. >> test-url: https://github.com/antonblanchard/will-it-scale > > Thanks for the report. > Could you explain what kinds of workload you are testing? > > Does it calls frequently madvise(MADV_DONTNEED) in parallel on multiple > threads? According to the description it is "testcase:brk increase/decrease of one page”. According to the mode it spawns multiple processes, not threads. Since a single page is unmapped each time, and the iTLB-loads increase dramatically, I would suspect that for some reason a full TLB flush is caused during do_munmap(). If I find some free time, I’ll try to profile the workload - but feel free to beat me to it. Nadav
Re: [v5] wlcore: add missing nvs file name info for wilink8
* Reizer, Eyal [170807 00:47]: > Hi Tony, > > > * Reizer, Eyal [170807 00:32]: > > > The following commits: > > > c815fde wlcore: spi: Populate config firmware data > > > d776fc8 wlcore: sdio: Populate config firmware data > > > > > > Populated the nvs entry for wilink6 and wilink7 only while it is > > > still needed for wilink8 as well. > > > This broke user space backward compatibility when upgrading from older > > > kernels, as the alternate mac address would not be read from the nvs that > > is > > > present in the file system (lib/firmware/ti-connectivity/wl1271-nvs.bin) > > > causing mac address change of the wlan interface. > > > > > > This patch fix this and update the structure field with the same default > > > nvs file name that has been used before. > > > > > > In addition, some distros hold a default wl1271-nvs.bin in the file > > > system with a bogus mac address (deadbeef...) that for a wl18xx device > > > also overrides the mac address that is stored inside the device. > > > Warn users about this bogus mac address and use a random mac instead > > > > Hmm looks pretty good to me except for one more thing I just noticed. > > > > Why don't you just use the hardware mac address instead of a random > > mac address on wl18xx device when you see a bogus nvs file? > > > > I agree that this would have been better but the problem is that hardware > mac address is available for wilink8 only. Wilink6/7 don't have one stored. > The wlcore code responsible for handling mac address is common code > and there is method for detecting between them in this module. Care to clarify a bit.. Are you saying wilink8 will use the hardware mac address in case of bogus nvs file? Regards, Tony
Re: [v5] wlcore: add missing nvs file name info for wilink8
* Reizer, Eyal [170807 00:47]: > Hi Tony, > > > * Reizer, Eyal [170807 00:32]: > > > The following commits: > > > c815fde wlcore: spi: Populate config firmware data > > > d776fc8 wlcore: sdio: Populate config firmware data > > > > > > Populated the nvs entry for wilink6 and wilink7 only while it is > > > still needed for wilink8 as well. > > > This broke user space backward compatibility when upgrading from older > > > kernels, as the alternate mac address would not be read from the nvs that > > is > > > present in the file system (lib/firmware/ti-connectivity/wl1271-nvs.bin) > > > causing mac address change of the wlan interface. > > > > > > This patch fix this and update the structure field with the same default > > > nvs file name that has been used before. > > > > > > In addition, some distros hold a default wl1271-nvs.bin in the file > > > system with a bogus mac address (deadbeef...) that for a wl18xx device > > > also overrides the mac address that is stored inside the device. > > > Warn users about this bogus mac address and use a random mac instead > > > > Hmm looks pretty good to me except for one more thing I just noticed. > > > > Why don't you just use the hardware mac address instead of a random > > mac address on wl18xx device when you see a bogus nvs file? > > > > I agree that this would have been better but the problem is that hardware > mac address is available for wilink8 only. Wilink6/7 don't have one stored. > The wlcore code responsible for handling mac address is common code > and there is method for detecting between them in this module. Care to clarify a bit.. Are you saying wilink8 will use the hardware mac address in case of bogus nvs file? Regards, Tony
Re: [PATCH] gpio: uniphier: add UniPhier GPIO controller driver
On Tuesday 08 August 2017 06:36 AM, Masahiro Yamada wrote: > Hi Linus, > > 2017-08-08 0:37 GMT+09:00 Linus Walleij: >> On Mon, Aug 7, 2017 at 3:50 PM, Masahiro Yamada >> wrote: >> >>> Adding "interrupts" property in DT causes >>> of_pupulate_default_populate() to assign virtual IRQ numbers >>> before driver probing. So it does not work well with IRQ domain hierarchy. >> >> I think I heard some noise about this the week before. >> >>> For pinctrl/stm32/pinctrl-stm32.c, >>> I do not see "interrupts", so it just straight maps the irq numbers. >> >> I think OMAP and DaVinci does someting similar too. This is from a recent >> DaVinci patch from Keerthy: >> >> +Example for 66AK2G: >> + >> +gpio0: gpio@2603000 { >> + compatible = "ti,k2g-gpio", "ti,keystone-gpio"; >> + reg = <0x02603000 0x100>; >> + gpio-controller; >> + #gpio-cells = <2>; >> + interrupts = , >> + , >> + , >> + , >> + , >> + , >> + , >> + , >> + ; >> + interrupt-controller; >> + #interrupt-cells = <2>; >> + ti,ngpio = <144>; >> + ti,davinci-gpio-unbanked = <0>; >> + clocks = <_clks 0x001b 0x0>; >> + clock-names = "gpio"; >> +}; >> >> >> That looks fairly similar. >> > > I do not think so. > > > I do not see .alloc hook in drivers/gpio/gpio-davinci.c > so this driver is unrelated to IRQ domain hierarchy. Hi Masahiro, Yes CONFIG_IRQ_DOMAIN_HIERARCHY is not enabled in keystone_defconfig or davinci_all_defconfig. Regards, Keerthy > > > > > >
Re: [PATCH] gpio: uniphier: add UniPhier GPIO controller driver
On Tuesday 08 August 2017 06:36 AM, Masahiro Yamada wrote: > Hi Linus, > > 2017-08-08 0:37 GMT+09:00 Linus Walleij : >> On Mon, Aug 7, 2017 at 3:50 PM, Masahiro Yamada >> wrote: >> >>> Adding "interrupts" property in DT causes >>> of_pupulate_default_populate() to assign virtual IRQ numbers >>> before driver probing. So it does not work well with IRQ domain hierarchy. >> >> I think I heard some noise about this the week before. >> >>> For pinctrl/stm32/pinctrl-stm32.c, >>> I do not see "interrupts", so it just straight maps the irq numbers. >> >> I think OMAP and DaVinci does someting similar too. This is from a recent >> DaVinci patch from Keerthy: >> >> +Example for 66AK2G: >> + >> +gpio0: gpio@2603000 { >> + compatible = "ti,k2g-gpio", "ti,keystone-gpio"; >> + reg = <0x02603000 0x100>; >> + gpio-controller; >> + #gpio-cells = <2>; >> + interrupts = , >> + , >> + , >> + , >> + , >> + , >> + , >> + , >> + ; >> + interrupt-controller; >> + #interrupt-cells = <2>; >> + ti,ngpio = <144>; >> + ti,davinci-gpio-unbanked = <0>; >> + clocks = <_clks 0x001b 0x0>; >> + clock-names = "gpio"; >> +}; >> >> >> That looks fairly similar. >> > > I do not think so. > > > I do not see .alloc hook in drivers/gpio/gpio-davinci.c > so this driver is unrelated to IRQ domain hierarchy. Hi Masahiro, Yes CONFIG_IRQ_DOMAIN_HIERARCHY is not enabled in keystone_defconfig or davinci_all_defconfig. Regards, Keerthy > > > > > >
Re: [PATCH 13/18] power: supply: bq24190_charger: Export 5V boost converter as regulator
* Hans de Goede [170806 05:37]: > Register the 5V boost converter as a regulator named > "regulator-bq24190-usb-vbus". Note the name includes "bq24190" because > the bq24190 family is also used on ACPI devices where there are no > device-tree phandles, so regulator_get will fallback to the name and thus > it must be unique on the system. Nice, this makes VBUS easy to use for USB PHY drivers :) Tony
Re: [PATCH 13/18] power: supply: bq24190_charger: Export 5V boost converter as regulator
* Hans de Goede [170806 05:37]: > Register the 5V boost converter as a regulator named > "regulator-bq24190-usb-vbus". Note the name includes "bq24190" because > the bq24190 family is also used on ACPI devices where there are no > device-tree phandles, so regulator_get will fallback to the name and thus > it must be unique on the system. Nice, this makes VBUS easy to use for USB PHY drivers :) Tony
[PATCH] usb: imx21-hcd: fix error return code in imx21_probe()
platform_get_irq() returns an error code, but the imx21-hcd driver ignores it and always returns -ENXIO. This is not correct, and prevents -EPROBE_DEFER from being propagated properly. Notice that platform_get_irq() no longer returns 0 on error: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e330b9a6bb35dc7097a4f02cb1ae7b6f96df92af Print error message and propagate the return value of platform_get_irq on failure. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva --- drivers/usb/host/imx21-hcd.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/usb/host/imx21-hcd.c b/drivers/usb/host/imx21-hcd.c index f542045..e25d72e 100644 --- a/drivers/usb/host/imx21-hcd.c +++ b/drivers/usb/host/imx21-hcd.c @@ -1849,8 +1849,10 @@ static int imx21_probe(struct platform_device *pdev) if (!res) return -ENODEV; irq = platform_get_irq(pdev, 0); - if (irq < 0) - return -ENXIO; + if (irq < 0) { + dev_err(>dev, "Failed to get IRQ: %d\n", irq); + return irq; + } hcd = usb_create_hcd(_hc_driver, >dev, dev_name(>dev)); -- 2.5.0
Re: [RESEND PATCH] bcache: Don't reinvent the wheel but use existing llist API
On Mon, Aug 07, 2017 at 06:18:35PM +0800, Coly Li wrote: > On 2017/8/7 下午4:38, Byungchul Park wrote: > > Although llist provides proper APIs, they are not used. Make them used. > > > > Signed-off-by: Byungchul Park Only have a question about why not using llist_for_each_entry(), it's Hello, The reason is to keep the original logic unchanged. The logic already does as if it's the safe version against removal. > still OK with llist_for_each_entry_safe(). The rested part is good to me. > > Acked-by: Coly Li > > > --- > > drivers/md/bcache/closure.c | 17 +++-- > > 1 file changed, 3 insertions(+), 14 deletions(-) > > > > diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c > > index 864e673..1841d03 100644 > > --- a/drivers/md/bcache/closure.c > > +++ b/drivers/md/bcache/closure.c > > @@ -64,27 +64,16 @@ void closure_put(struct closure *cl) > > void __closure_wake_up(struct closure_waitlist *wait_list) > > { > > struct llist_node *list; > > - struct closure *cl; > > + struct closure *cl, *t; > > struct llist_node *reverse = NULL; > > > > list = llist_del_all(_list->list); > > > > /* We first reverse the list to preserve FIFO ordering and fairness */ > > - > > - while (list) { > > - struct llist_node *t = list; > > - list = llist_next(list); > > - > > - t->next = reverse; > > - reverse = t; > > - } > > + reverse = llist_reverse_order(list); > > > > /* Then do the wakeups */ > > - > > - while (reverse) { > > - cl = container_of(reverse, struct closure, list); > > - reverse = llist_next(reverse); > > - > > + llist_for_each_entry_safe(cl, t, reverse, list) { > > Just wondering why not using llist_for_each_entry(), or you use the > _safe version on purpose ? If I use llist_for_each_entry(), then it would change the original behavior. Is it ok? Thank you, Byungchul
[PATCH] usb: imx21-hcd: fix error return code in imx21_probe()
platform_get_irq() returns an error code, but the imx21-hcd driver ignores it and always returns -ENXIO. This is not correct, and prevents -EPROBE_DEFER from being propagated properly. Notice that platform_get_irq() no longer returns 0 on error: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=e330b9a6bb35dc7097a4f02cb1ae7b6f96df92af Print error message and propagate the return value of platform_get_irq on failure. This issue was detected with the help of Coccinelle. Signed-off-by: Gustavo A. R. Silva --- drivers/usb/host/imx21-hcd.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/usb/host/imx21-hcd.c b/drivers/usb/host/imx21-hcd.c index f542045..e25d72e 100644 --- a/drivers/usb/host/imx21-hcd.c +++ b/drivers/usb/host/imx21-hcd.c @@ -1849,8 +1849,10 @@ static int imx21_probe(struct platform_device *pdev) if (!res) return -ENODEV; irq = platform_get_irq(pdev, 0); - if (irq < 0) - return -ENXIO; + if (irq < 0) { + dev_err(>dev, "Failed to get IRQ: %d\n", irq); + return irq; + } hcd = usb_create_hcd(_hc_driver, >dev, dev_name(>dev)); -- 2.5.0
Re: [RESEND PATCH] bcache: Don't reinvent the wheel but use existing llist API
On Mon, Aug 07, 2017 at 06:18:35PM +0800, Coly Li wrote: > On 2017/8/7 下午4:38, Byungchul Park wrote: > > Although llist provides proper APIs, they are not used. Make them used. > > > > Signed-off-by: Byungchul Park Only have a question about why not using llist_for_each_entry(), it's Hello, The reason is to keep the original logic unchanged. The logic already does as if it's the safe version against removal. > still OK with llist_for_each_entry_safe(). The rested part is good to me. > > Acked-by: Coly Li > > > --- > > drivers/md/bcache/closure.c | 17 +++-- > > 1 file changed, 3 insertions(+), 14 deletions(-) > > > > diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c > > index 864e673..1841d03 100644 > > --- a/drivers/md/bcache/closure.c > > +++ b/drivers/md/bcache/closure.c > > @@ -64,27 +64,16 @@ void closure_put(struct closure *cl) > > void __closure_wake_up(struct closure_waitlist *wait_list) > > { > > struct llist_node *list; > > - struct closure *cl; > > + struct closure *cl, *t; > > struct llist_node *reverse = NULL; > > > > list = llist_del_all(_list->list); > > > > /* We first reverse the list to preserve FIFO ordering and fairness */ > > - > > - while (list) { > > - struct llist_node *t = list; > > - list = llist_next(list); > > - > > - t->next = reverse; > > - reverse = t; > > - } > > + reverse = llist_reverse_order(list); > > > > /* Then do the wakeups */ > > - > > - while (reverse) { > > - cl = container_of(reverse, struct closure, list); > > - reverse = llist_next(reverse); > > - > > + llist_for_each_entry_safe(cl, t, reverse, list) { > > Just wondering why not using llist_for_each_entry(), or you use the > _safe version on purpose ? If I use llist_for_each_entry(), then it would change the original behavior. Is it ok? Thank you, Byungchul
Re: [PATCH 3.18 00/50] 3.18.64-stable review
On 08/07/2017 12:34 PM, Greg Kroah-Hartman wrote: On Sat, Aug 05, 2017 at 12:11:19PM -0700, Guenter Roeck wrote: On 08/05/2017 08:43 AM, Greg Kroah-Hartman wrote: On Sat, Aug 05, 2017 at 08:02:17AM +0200, Willy Tarreau wrote: On Sat, Aug 05, 2017 at 07:55:11AM +0200, Willy Tarreau wrote: On Fri, Aug 04, 2017 at 07:51:07PM -0700, Greg Kroah-Hartman wrote: On Fri, Aug 04, 2017 at 07:46:57PM -0700, Greg Kroah-Hartman wrote: On Fri, Aug 04, 2017 at 06:43:50PM -0700, Guenter Roeck wrote: On 08/04/2017 04:15 PM, Greg Kroah-Hartman wrote: This is the start of the stable review cycle for the 3.18.64 release. There are 50 patches in this series, all will be posted as a response to this one. If anyone has any issues with these being applied, please let me know. Responses should be made by Sun Aug 6 23:15:34 UTC 2017. Anything received after that time might be too late. Preliminary: Lots of lib/string.c:31:32: fatal error: asm/word-at-a-time.h affecting several architectures. alpha: lib/string.c:217:4: error: implicit declaration of function 'zero_bytemask' Hm, I think I need to add c753bf34c94e ("word-at-a-time.h: support zero_bytemask() on alpha and tile"), right? Any other arches failing? Hm, that doesn't work, do we care about tile? :) Let me see how deep this hole is, I just wanted to get strscpy into 3.18 to fix a bug... I suspect you'll need this one which came as part of the strscpy() series between 4.2 and 4.3 (though I have not tested) : commit a6e2f029ae34f41adb6ae3812c32c5d326e1abd2 Author: Chris MetcalfDate: Wed Apr 29 12:48:40 2015 -0400 Make asm/word-at-a-time.h available on all architectures Added the x86 implementation of word-at-a-time to the generic version, which previously only supported big-endian. (...) OK I just applied it on top of 3.18.64-rc1 and it allowed me to build mips which previously broke. It will not apply as-is, you'll need to drop the change for arch/nios2/include/asm/Kbuild, and after that it's OK. 
Thanks for that, I've now queued that patch up. Better, but there are still some errors. powerpc: lib/string.c: In function 'strscpy': lib/string.c:217:4: error: implicit declaration of function 'zero_bytemask' tile: arch/tile/gxio/mpipe.c:46:15: error: conflicting types for 'strscpy' include/linux/string.h:29:22: note: previous declaration of 'strscpy' was here Missing patches: 7a5692e6e533 ("arch/powerpc: provide zero_bytemask() for big-endian") 30059d494a72 ("tile: use global strscpy() rather than private copy") Thanks for these, I'll queue them up. And do a -rc2 in a few days as this was a mess... Getting there. With v3.18.63-62-gc7d9ae0: Build results: total: 136 pass: 136 fail: 0 Qemu test results: total: 111 pass: 111 fail: 0 Details are available at http://kerneltests.org/builders. Guenter
Re: [PATCH 3.18 00/50] 3.18.64-stable review
On 08/07/2017 12:34 PM, Greg Kroah-Hartman wrote: On Sat, Aug 05, 2017 at 12:11:19PM -0700, Guenter Roeck wrote: On 08/05/2017 08:43 AM, Greg Kroah-Hartman wrote: On Sat, Aug 05, 2017 at 08:02:17AM +0200, Willy Tarreau wrote: On Sat, Aug 05, 2017 at 07:55:11AM +0200, Willy Tarreau wrote: On Fri, Aug 04, 2017 at 07:51:07PM -0700, Greg Kroah-Hartman wrote: On Fri, Aug 04, 2017 at 07:46:57PM -0700, Greg Kroah-Hartman wrote: On Fri, Aug 04, 2017 at 06:43:50PM -0700, Guenter Roeck wrote: On 08/04/2017 04:15 PM, Greg Kroah-Hartman wrote: This is the start of the stable review cycle for the 3.18.64 release. There are 50 patches in this series, all will be posted as a response to this one. If anyone has any issues with these being applied, please let me know. Responses should be made by Sun Aug 6 23:15:34 UTC 2017. Anything received after that time might be too late. Preliminary: Lots of lib/string.c:31:32: fatal error: asm/word-at-a-time.h affecting several architectures. alpha: lib/string.c:217:4: error: implicit declaration of function 'zero_bytemask' Hm, I think I need to add c753bf34c94e ("word-at-a-time.h: support zero_bytemask() on alpha and tile"), right? Any other arches failing? Hm, that doesn't work, do we care about tile? :) Let me see how deep this hole is, I just wanted to get strscpy into 3.18 to fix a bug... I suspect you'll need this one which came as part of the strscpy() series between 4.2 and 4.3 (though I have not tested) : commit a6e2f029ae34f41adb6ae3812c32c5d326e1abd2 Author: Chris Metcalf Date: Wed Apr 29 12:48:40 2015 -0400 Make asm/word-at-a-time.h available on all architectures Added the x86 implementation of word-at-a-time to the generic version, which previously only supported big-endian. (...) OK I just applied it on top of 3.18.64-rc1 and it allowed me to build mips which previously broke. It will not apply as-is, you'll need to drop the change for arch/nios2/include/asm/Kbuild, and after that it's OK. 
Thanks for that, I've now queued that patch up. Better, but there are still some errors. powerpc: lib/string.c: In function 'strscpy': lib/string.c:217:4: error: implicit declaration of function 'zero_bytemask' tile: arch/tile/gxio/mpipe.c:46:15: error: conflicting types for 'strscpy' include/linux/string.h:29:22: note: previous declaration of 'strscpy' was here Missing patches: 7a5692e6e533 ("arch/powerpc: provide zero_bytemask() for big-endian") 30059d494a72 ("tile: use global strscpy() rather than private copy") Thanks for these, I'll queue them up. And do a -rc2 in a few days as this was a mess... Getting there. With v3.18.63-62-gc7d9ae0: Build results: total: 136 pass: 136 fail: 0 Qemu test results: total: 111 pass: 111 fail: 0 Details are available at http://kerneltests.org/builders. Guenter
[PATCH] f2fs: fix some cases with reserved_blocks
Signed-off-by: Yunlong Song--- fs/f2fs/recovery.c | 3 ++- fs/f2fs/super.c| 9 + 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index a3d0261..e288319 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -51,7 +51,8 @@ bool space_for_roll_forward(struct f2fs_sb_info *sbi) { s64 nalloc = percpu_counter_sum_positive(>alloc_valid_block_count); - if (sbi->last_valid_block_count + nalloc > sbi->user_block_count) + if (sbi->last_valid_block_count + nalloc + + sbi->reserved_blocks > sbi->user_block_count) return false; return true; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4c1bdcb..c644bf5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -946,6 +946,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 id = huge_encode_dev(sb->s_bdev->bd_dev); block_t total_count, user_block_count, start_count, ovp_count; u64 avail_node_count; + block_t avail_user_block_count; total_count = le64_to_cpu(sbi->raw_super->block_count); user_block_count = sbi->user_block_count; @@ -953,16 +954,16 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; buf->f_type = F2FS_SUPER_MAGIC; buf->f_bsize = sbi->blocksize; + avail_user_block_count = user_block_count - sbi->reserved_blocks; buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; - buf->f_bavail = user_block_count - valid_user_blocks(sbi) - - sbi->reserved_blocks; + buf->f_bavail = avail_user_block_count - valid_user_blocks(sbi); avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - if (avail_node_count > user_block_count) { - buf->f_files = user_block_count; + if (avail_node_count > avail_user_block_count) { + buf->f_files = avail_user_block_count; buf->f_ffree = buf->f_bavail; } else { buf->f_files = avail_node_count; -- 1.8.5.2
[PATCH] f2fs: fix some cases with reserved_blocks
Signed-off-by: Yunlong Song --- fs/f2fs/recovery.c | 3 ++- fs/f2fs/super.c| 9 + 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index a3d0261..e288319 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -51,7 +51,8 @@ bool space_for_roll_forward(struct f2fs_sb_info *sbi) { s64 nalloc = percpu_counter_sum_positive(>alloc_valid_block_count); - if (sbi->last_valid_block_count + nalloc > sbi->user_block_count) + if (sbi->last_valid_block_count + nalloc + + sbi->reserved_blocks > sbi->user_block_count) return false; return true; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4c1bdcb..c644bf5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -946,6 +946,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 id = huge_encode_dev(sb->s_bdev->bd_dev); block_t total_count, user_block_count, start_count, ovp_count; u64 avail_node_count; + block_t avail_user_block_count; total_count = le64_to_cpu(sbi->raw_super->block_count); user_block_count = sbi->user_block_count; @@ -953,16 +954,16 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; buf->f_type = F2FS_SUPER_MAGIC; buf->f_bsize = sbi->blocksize; + avail_user_block_count = user_block_count - sbi->reserved_blocks; buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; - buf->f_bavail = user_block_count - valid_user_blocks(sbi) - - sbi->reserved_blocks; + buf->f_bavail = avail_user_block_count - valid_user_blocks(sbi); avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - if (avail_node_count > user_block_count) { - buf->f_files = user_block_count; + if (avail_node_count > avail_user_block_count) { + buf->f_files = avail_user_block_count; buf->f_ffree = buf->f_bavail; } else { buf->f_files = avail_node_count; -- 1.8.5.2
Re: Possible race in pc87413_wdt.ko
On 08/07/2017 06:22 AM, Anton Volkov wrote: Hello. While searching for races in the Linux kernel I've come across "drivers/watchdog/pc87413_wdt.ko" module. Here is a question that I came up with while analyzing results. Lines are given using the info from Linux v4.12. Consider the following case: Thread 1: Thread 2: pc87413_init misc_register(&pc87413_miscdev) -> pc87413_get_swc_base_addr pc87413_open -> pc87413_refresh -> pc87413_swc_bank3 swc_base_addr = ... (pc87413_wdt.c: line 133)(pc87413_wdt.c: line 146) So in this case preemptive registration of the device leads to a possibility of race between the initialization process and a callback to the registered device. Is this race feasible from your point of view? And if it is, is it possible to move the device registration a bit further down in the pc87413_init function? Yes, the race is feasible, and it is possible to move the device registration function (though the preferred solution would be to convert the driver to use the watchdog subsystem). The code looks pretty bad as written. Just not sure if it is worth bothering about it. I suspect no one is using that driver anymore (the datasheet is from 2001). Might as well just declare it obsolete and wait for someone to scream. Guenter
Re: Possible race in pc87413_wdt.ko
On 08/07/2017 06:22 AM, Anton Volkov wrote: Hello. While searching for races in the Linux kernel I've come across "drivers/watchdog/pc87413_wdt.ko" module. Here is a question that I came up with while analyzing results. Lines are given using the info from Linux v4.12. Consider the following case: Thread 1: Thread 2: pc87413_init misc_register(&pc87413_miscdev) -> pc87413_get_swc_base_addr pc87413_open -> pc87413_refresh -> pc87413_swc_bank3 swc_base_addr = ... (pc87413_wdt.c: line 133)(pc87413_wdt.c: line 146) So in this case preemptive registration of the device leads to a possibility of race between the initialization process and a callback to the registered device. Is this race feasible from your point of view? And if it is, is it possible to move the device registration a bit further down in the pc87413_init function? Yes, the race is feasible, and it is possible to move the device registration function (though the preferred solution would be to convert the driver to use the watchdog subsystem). The code looks pretty bad as written. Just not sure if it is worth bothering about it. I suspect no one is using that driver anymore (the datasheet is from 2001). Might as well just declare it obsolete and wait for someone to scream. Guenter
Re: [PATCH -mm] mm: Clear to access sub-page last when clearing huge page
On 08/07/2017 12:21 AM, Huang, Ying wrote: > From: Huang Ying> > Huge page helps to reduce TLB miss rate, but it has higher cache > footprint, sometimes this may cause some issue. For example, when > clearing huge page on x86_64 platform, the cache footprint is 2M. But > on a Xeon E5 v3 2699 CPU, there are 18 cores, 36 threads, and only 45M > LLC (last level cache). That is, in average, there are 2.5M LLC for > each core and 1.25M LLC for each thread. If the cache pressure is > heavy when clearing the huge page, and we clear the huge page from the > begin to the end, it is possible that the begin of huge page is > evicted from the cache after we finishing clearing the end of the huge > page. And it is possible for the application to access the begin of > the huge page after clearing the huge page. > > To help the above situation, in this patch, when we clear a huge page, > the order to clear sub-pages is changed. In quite some situation, we > can get the address that the application will access after we clear > the huge page, for example, in a page fault handler. Instead of > clearing the huge page from begin to end, we will clear the sub-pages > farthest from the the sub-page to access firstly, and clear the > sub-page to access last. This will make the sub-page to access most > cache-hot and sub-pages around it more cache-hot too. If we cannot > know the address the application will access, the begin of the huge > page is assumed to be the the address the application will access. > > With this patch, the throughput increases ~28.3% in vm-scalability > anon-w-seq test case with 72 processes on a 2 socket Xeon E5 v3 2699 > system (36 cores, 72 threads). The test case creates 72 processes, > each process mmap a big anonymous memory area and writes to it from > the begin to the end. For each process, other processes could be seen > as other workload which generates heavy cache pressure. 
At the same > time, the cache miss rate reduced from ~33.4% to ~31.7%, the > IPC (instruction per cycle) increased from 0.56 to 0.74, and the time > spent in user space is reduced ~7.9% > > Thanks Andi Kleen to propose to use address to access to determine the > order of sub-pages to clear. > > The hugetlbfs access address could be improved, will do that in > another patch. hugetlb_fault masks off the actual faulting address with, address &= huge_page_mask(h); before calling hugetlb_no_page. But, we could pass down the actual (unmasked) address to take advantage of this optimization for hugetlb faults as well. hugetlb_fault is the only caller of hugetlb_no_page, so this should be pretty straight forward. Were you thinking of additional improvements? -- Mike Kravetz
Re: [PATCH -mm] mm: Clear to access sub-page last when clearing huge page
On 08/07/2017 12:21 AM, Huang, Ying wrote: > From: Huang Ying > > Huge page helps to reduce TLB miss rate, but it has higher cache > footprint, sometimes this may cause some issue. For example, when > clearing huge page on x86_64 platform, the cache footprint is 2M. But > on a Xeon E5 v3 2699 CPU, there are 18 cores, 36 threads, and only 45M > LLC (last level cache). That is, in average, there are 2.5M LLC for > each core and 1.25M LLC for each thread. If the cache pressure is > heavy when clearing the huge page, and we clear the huge page from the > begin to the end, it is possible that the begin of huge page is > evicted from the cache after we finishing clearing the end of the huge > page. And it is possible for the application to access the begin of > the huge page after clearing the huge page. > > To help the above situation, in this patch, when we clear a huge page, > the order to clear sub-pages is changed. In quite some situation, we > can get the address that the application will access after we clear > the huge page, for example, in a page fault handler. Instead of > clearing the huge page from begin to end, we will clear the sub-pages > farthest from the the sub-page to access firstly, and clear the > sub-page to access last. This will make the sub-page to access most > cache-hot and sub-pages around it more cache-hot too. If we cannot > know the address the application will access, the begin of the huge > page is assumed to be the the address the application will access. > > With this patch, the throughput increases ~28.3% in vm-scalability > anon-w-seq test case with 72 processes on a 2 socket Xeon E5 v3 2699 > system (36 cores, 72 threads). The test case creates 72 processes, > each process mmap a big anonymous memory area and writes to it from > the begin to the end. For each process, other processes could be seen > as other workload which generates heavy cache pressure. 
At the same > time, the cache miss rate reduced from ~33.4% to ~31.7%, the > IPC (instruction per cycle) increased from 0.56 to 0.74, and the time > spent in user space is reduced ~7.9% > > Thanks Andi Kleen to propose to use address to access to determine the > order of sub-pages to clear. > > The hugetlbfs access address could be improved, will do that in > another patch. hugetlb_fault masks off the actual faulting address with, address &= huge_page_mask(h); before calling hugetlb_no_page. But, we could pass down the actual (unmasked) address to take advantage of this optimization for hugetlb faults as well. hugetlb_fault is the only caller of hugetlb_no_page, so this should be pretty straight forward. Were you thinking of additional improvements? -- Mike Kravetz
[PATCH v2 3/4] KVM: s390: implements the kvm_arch_vcpu_in_kernel()
This implements the kvm_arch_vcpu_in_kernel() for s390. Signed-off-by: Longpeng(Mike) --- arch/s390/kvm/kvm-s390.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0b0c689..e46177b 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2449,7 +2449,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) { - return false; + return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE); } EXPORT_SYMBOL_GPL(kvm_arch_vcpu_in_kernel); -- 1.8.3.1
[PATCH v2 2/4] KVM: X86: implement the logic for spinlock optimization
1. Implements the kvm_arch_vcpu_in_kernel(), because get_cpl requires vcpu_load, so we must cache the result(whether the vcpu was preempted when its cpl=0) in kvm_vcpu_arch. 2. Add ->spin_in_kernel hook, because we can get benefit from VMX. Signed-off-by: Longpeng(Mike)--- arch/x86/include/asm/kvm_host.h | 5 + arch/x86/kvm/hyperv.c | 2 +- arch/x86/kvm/svm.c | 8 +++- arch/x86/kvm/vmx.c | 16 +++- arch/x86/kvm/x86.c | 7 ++- 5 files changed, 34 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 87ac4fb..d2b2d57 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -688,6 +688,9 @@ struct kvm_vcpu_arch { /* GPA available (AMD only) */ bool gpa_available; + + /* be preempted when it's in kernel-mode(cpl=0) */ + bool preempted_in_kernel; }; struct kvm_lpage_info { @@ -1057,6 +1060,8 @@ struct kvm_x86_ops { void (*cancel_hv_timer)(struct kvm_vcpu *vcpu); void (*setup_mce)(struct kvm_vcpu *vcpu); + + bool (*spin_in_kernel)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index cd0e6e6..dec5e8a 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1268,7 +1268,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) switch (code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: - kvm_vcpu_on_spin(vcpu, kvm_arch_vcpu_in_kernel(vcpu)); + kvm_vcpu_on_spin(vcpu, kvm_x86_ops->spin_in_kernel(vcpu)); break; case HVCALL_POST_MESSAGE: case HVCALL_SIGNAL_EVENT: diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e6ed24e..ccb6df7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3751,7 +3751,7 @@ static int pause_interception(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &(svm->vcpu); - kvm_vcpu_on_spin(vcpu, kvm_arch_vcpu_in_kernel(vcpu)); + kvm_vcpu_on_spin(vcpu, kvm_x86_ops->spin_in_kernel(vcpu)); return 1; } @@ -5364,6 +5364,11 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu) vcpu->arch.mcg_cap &= 0x1ff; } 
+static bool svm_spin_in_kernel(struct kvm_vcpu *vcpu) +{ + return svm_get_cpl(vcpu) == 0; +} + static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -5476,6 +5481,7 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu) .deliver_posted_interrupt = svm_deliver_avic_intr, .update_pi_irte = svm_update_pi_irte, .setup_mce = svm_setup_mce, + .spin_in_kernel = svm_spin_in_kernel, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9d6223a..297a158 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6761,7 +6761,8 @@ static int handle_pause(struct kvm_vcpu *vcpu) if (ple_gap) grow_ple_window(vcpu); - kvm_vcpu_on_spin(vcpu, kvm_arch_vcpu_in_kernel(vcpu)); + /* See comments in vmx_spin_in_kernel() */ + kvm_vcpu_on_spin(vcpu, true); return kvm_skip_emulated_instruction(vcpu); } @@ -11636,6 +11637,17 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) ~FEATURE_CONTROL_LMCE; } +static bool vmx_spin_in_kernel(struct kvm_vcpu *vcpu) +{ + /* +* Intel sdm vol3 ch-25.1.3 says: The “PAUSE-loop exiting” +* VM-execution control is ignored if CPL > 0. OTOH, KVM +* never set PAUSE_EXITING and just set PLE if supported, +* so the vcpu must be CPL=0 if it gets a PAUSE exit. 
+*/ + return true; +} + static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -11763,6 +11775,8 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) #endif .setup_mce = vmx_setup_mce, + + .spin_in_kernel = vmx_spin_in_kernel, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4430be6..28299b9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2881,6 +2881,10 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { int idx; + + if (vcpu->preempted) + vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu); + /* * Disable page faults because we're in atomic context here. * kvm_write_guest_offset_cached() would call might_fault() @@ -7992,6 +7996,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm_pmu_init(vcpu); vcpu->arch.pending_external_vector = -1; + vcpu->arch.preempted_in_kernel = false; kvm_hv_vcpu_init(vcpu); @@ -8441,7 +8446,7 @@ int
[PATCH v2 3/4] KVM: s390: implements the kvm_arch_vcpu_in_kernel()
This implements the kvm_arch_vcpu_in_kernel() for s390. Signed-off-by: Longpeng(Mike) --- arch/s390/kvm/kvm-s390.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 0b0c689..e46177b 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2449,7 +2449,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu) { - return false; + return !(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE); } EXPORT_SYMBOL_GPL(kvm_arch_vcpu_in_kernel); -- 1.8.3.1
[PATCH v2 2/4] KVM: X86: implement the logic for spinlock optimization
1. Implements the kvm_arch_vcpu_in_kernel(), because get_cpl requires vcpu_load, so we must cache the result(whether the vcpu was preempted when its cpl=0) in kvm_vcpu_arch. 2. Add ->spin_in_kernel hook, because we can get benefit from VMX. Signed-off-by: Longpeng(Mike) --- arch/x86/include/asm/kvm_host.h | 5 + arch/x86/kvm/hyperv.c | 2 +- arch/x86/kvm/svm.c | 8 +++- arch/x86/kvm/vmx.c | 16 +++- arch/x86/kvm/x86.c | 7 ++- 5 files changed, 34 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 87ac4fb..d2b2d57 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -688,6 +688,9 @@ struct kvm_vcpu_arch { /* GPA available (AMD only) */ bool gpa_available; + + /* be preempted when it's in kernel-mode(cpl=0) */ + bool preempted_in_kernel; }; struct kvm_lpage_info { @@ -1057,6 +1060,8 @@ struct kvm_x86_ops { void (*cancel_hv_timer)(struct kvm_vcpu *vcpu); void (*setup_mce)(struct kvm_vcpu *vcpu); + + bool (*spin_in_kernel)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index cd0e6e6..dec5e8a 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1268,7 +1268,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) switch (code) { case HVCALL_NOTIFY_LONG_SPIN_WAIT: - kvm_vcpu_on_spin(vcpu, kvm_arch_vcpu_in_kernel(vcpu)); + kvm_vcpu_on_spin(vcpu, kvm_x86_ops->spin_in_kernel(vcpu)); break; case HVCALL_POST_MESSAGE: case HVCALL_SIGNAL_EVENT: diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e6ed24e..ccb6df7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3751,7 +3751,7 @@ static int pause_interception(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &(svm->vcpu); - kvm_vcpu_on_spin(vcpu, kvm_arch_vcpu_in_kernel(vcpu)); + kvm_vcpu_on_spin(vcpu, kvm_x86_ops->spin_in_kernel(vcpu)); return 1; } @@ -5364,6 +5364,11 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu) vcpu->arch.mcg_cap &= 0x1ff; } 
+static bool svm_spin_in_kernel(struct kvm_vcpu *vcpu) +{ + return svm_get_cpl(vcpu) == 0; +} + static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -5476,6 +5481,7 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu) .deliver_posted_interrupt = svm_deliver_avic_intr, .update_pi_irte = svm_update_pi_irte, .setup_mce = svm_setup_mce, + .spin_in_kernel = svm_spin_in_kernel, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9d6223a..297a158 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6761,7 +6761,8 @@ static int handle_pause(struct kvm_vcpu *vcpu) if (ple_gap) grow_ple_window(vcpu); - kvm_vcpu_on_spin(vcpu, kvm_arch_vcpu_in_kernel(vcpu)); + /* See comments in vmx_spin_in_kernel() */ + kvm_vcpu_on_spin(vcpu, true); return kvm_skip_emulated_instruction(vcpu); } @@ -11636,6 +11637,17 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) ~FEATURE_CONTROL_LMCE; } +static bool vmx_spin_in_kernel(struct kvm_vcpu *vcpu) +{ + /* +* Intel sdm vol3 ch-25.1.3 says: The “PAUSE-loop exiting” +* VM-execution control is ignored if CPL > 0. OTOH, KVM +* never set PAUSE_EXITING and just set PLE if supported, +* so the vcpu must be CPL=0 if it gets a PAUSE exit. 
+*/ + return true; +} + static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -11763,6 +11775,8 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) #endif .setup_mce = vmx_setup_mce, + + .spin_in_kernel = vmx_spin_in_kernel, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4430be6..28299b9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2881,6 +2881,10 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { int idx; + + if (vcpu->preempted) + vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu); + /* * Disable page faults because we're in atomic context here. * kvm_write_guest_offset_cached() would call might_fault() @@ -7992,6 +7996,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm_pmu_init(vcpu); vcpu->arch.pending_external_vector = -1; + vcpu->arch.preempted_in_kernel = false; kvm_hv_vcpu_init(vcpu); @@ -8441,7 +8446,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)