[PATCH v5 5/7] dmaengine: xilinx_dma: autodetect whether the HW supports scatter-gather
The AXIDMA and CDMA HW can be either direct-access or scatter-gather version. These are SW incompatible. The driver can handle both versions: a DT property was used to tell the driver whether to assume the HW is in scatter-gather mode. This patch makes the driver to autodetect this information. The DT property is not required anymore. No changes for VDMA. Cc: Rob Herring Cc: Mark Rutland Cc: devicet...@vger.kernel.org Cc: Radhey Shyam Pandey Signed-off-by: Andrea Merello Reviewed-by: Radhey Shyam Pandey --- Changes in v2: - autodetect only in !VDMA case Changes in v3: - cc DT maintainers/ML Changes in v4: - fix typos in commit message Changes in v5: None --- drivers/dma/xilinx/xilinx_dma.c | 14 ++ 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c index b17f24e4ec35..78d0f2f8225e 100644 --- a/drivers/dma/xilinx/xilinx_dma.c +++ b/drivers/dma/xilinx/xilinx_dma.c @@ -86,6 +86,7 @@ #define XILINX_DMA_DMASR_DMA_DEC_ERR BIT(6) #define XILINX_DMA_DMASR_DMA_SLAVE_ERR BIT(5) #define XILINX_DMA_DMASR_DMA_INT_ERR BIT(4) +#define XILINX_DMA_DMASR_SG_MASK BIT(3) #define XILINX_DMA_DMASR_IDLE BIT(1) #define XILINX_DMA_DMASR_HALTEDBIT(0) #define XILINX_DMA_DMASR_DELAY_MASKGENMASK(31, 24) @@ -407,7 +408,6 @@ struct xilinx_dma_config { * @dev: Device Structure * @common: DMA device structure * @chan: Driver specific DMA channel - * @has_sg: Specifies whether Scatter-Gather is present or not * @mcdma: Specifies whether Multi-Channel is present or not * @flush_on_fsync: Flush on frame sync * @ext_addr: Indicates 64 bit addressing is supported by dma device @@ -427,7 +427,6 @@ struct xilinx_dma_device { struct device *dev; struct dma_device common; struct xilinx_dma_chan *chan[XILINX_DMA_MAX_CHANS_PER_DEVICE]; - bool has_sg; bool mcdma; u32 flush_on_fsync; bool ext_addr; @@ -2400,7 +2399,6 @@ static int xilinx_dma_chan_probe(struct xilinx_dma_device *xdev, chan->dev = xdev->dev; chan->xdev = xdev; - 
chan->has_sg = xdev->has_sg; chan->desc_pendingcount = 0x0; chan->ext_addr = xdev->ext_addr; /* This variable ensures that descriptors are not @@ -2493,6 +2491,15 @@ static int xilinx_dma_chan_probe(struct xilinx_dma_device *xdev, chan->stop_transfer = xilinx_dma_stop_transfer; } + /* check if SG is enabled (only for AXIDMA and CDMA) */ + if (xdev->dma_config->dmatype != XDMA_TYPE_VDMA) { + if (dma_ctrl_read(chan, XILINX_DMA_REG_DMASR) & + XILINX_DMA_DMASR_SG_MASK) + chan->has_sg = true; + dev_dbg(chan->dev, "ch %d: SG %s\n", chan->id, + chan->has_sg ? "enabled" : "disabled"); + } + /* Initialize the tasklet */ tasklet_init(>tasklet, xilinx_dma_do_tasklet, (unsigned long)chan); @@ -2631,7 +2638,6 @@ static int xilinx_dma_probe(struct platform_device *pdev) return PTR_ERR(xdev->regs); /* Retrieve the DMA engine properties from the device tree */ - xdev->has_sg = of_property_read_bool(node, "xlnx,include-sg"); xdev->max_buffer_len = GENMASK(XILINX_DMA_MAX_TRANS_LEN_MAX - 1, 0); if (xdev->dma_config->dmatype == XDMA_TYPE_AXIDMA) { -- 2.17.1
[PATCH v5 4/7] dmaengine: xilinx_dma: program hardware supported buffer length
From: Radhey Shyam Pandey AXI-DMA IP supports configurable (c_sg_length_width) buffer length register width, hence read buffer length (xlnx,sg-length-width) DT property and ensure that driver doesn't program buffer length exceeding the supported limit. For VDMA and CDMA there is no change. Cc: Rob Herring Cc: Mark Rutland Cc: devicet...@vger.kernel.org Signed-off-by: Radhey Shyam Pandey Signed-off-by: Michal Simek Signed-off-by: Andrea Merello [rebase, reword] --- Changes in v2: - drop original patch and replace with the one in Xilinx tree Changes in v3: - cc DT maintainers/ML Changes in v4: - upper bound for the property should be 26, not 23 - add warn for width > 23 as per xilinx original patch - rework due to changes introduced in 1/6 Changes in v5: None --- drivers/dma/xilinx/xilinx_dma.c | 36 + 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c index aaa6de8a70e4..b17f24e4ec35 100644 --- a/drivers/dma/xilinx/xilinx_dma.c +++ b/drivers/dma/xilinx/xilinx_dma.c @@ -158,7 +158,9 @@ #define XILINX_DMA_REG_BTT 0x28 /* AXI DMA Specific Masks/Bit fields */ -#define XILINX_DMA_MAX_TRANS_LEN GENMASK(22, 0) +#define XILINX_DMA_MAX_TRANS_LEN_MIN 8 +#define XILINX_DMA_MAX_TRANS_LEN_MAX 23 +#define XILINX_DMA_V2_MAX_TRANS_LEN_MAX26 #define XILINX_DMA_CR_COALESCE_MAX GENMASK(23, 16) #define XILINX_DMA_CR_CYCLIC_BD_EN_MASKBIT(4) #define XILINX_DMA_CR_COALESCE_SHIFT 16 @@ -418,6 +420,7 @@ struct xilinx_dma_config { * @rxs_clk: DMA s2mm stream clock * @nr_channels: Number of channels DMA device supports * @chan_id: DMA channel identifier + * @max_buffer_len: Max buffer length */ struct xilinx_dma_device { void __iomem *regs; @@ -437,6 +440,7 @@ struct xilinx_dma_device { struct clk *rxs_clk; u32 nr_channels; u32 chan_id; + u32 max_buffer_len; }; /* Macros */ @@ -964,7 +968,7 @@ static int xilinx_dma_calc_copysize(struct xilinx_dma_chan *chan, int size, int done) { size_t copy = min_t(size_t, size - 
done, -XILINX_DMA_MAX_TRANS_LEN); + chan->xdev->max_buffer_len); if ((copy + done < size) && chan->xdev->common.copy_align) { @@ -1011,7 +1015,7 @@ static enum dma_status xilinx_dma_tx_status(struct dma_chan *dchan, list_for_each_entry(segment, >segments, node) { hw = >hw; residue += (hw->control - hw->status) & - XILINX_DMA_MAX_TRANS_LEN; + chan->xdev->max_buffer_len; } } spin_unlock_irqrestore(>lock, flags); @@ -1263,7 +1267,7 @@ static void xilinx_cdma_start_transfer(struct xilinx_dma_chan *chan) /* Start the transfer */ dma_ctrl_write(chan, XILINX_DMA_REG_BTT, - hw->control & XILINX_DMA_MAX_TRANS_LEN); + hw->control & chan->xdev->max_buffer_len); } list_splice_tail_init(>pending_list, >active_list); @@ -1366,7 +1370,7 @@ static void xilinx_dma_start_transfer(struct xilinx_dma_chan *chan) /* Start the transfer */ dma_ctrl_write(chan, XILINX_DMA_REG_BTT, - hw->control & XILINX_DMA_MAX_TRANS_LEN); + hw->control & chan->xdev->max_buffer_len); } list_splice_tail_init(>pending_list, >active_list); @@ -1727,7 +1731,7 @@ xilinx_cdma_prep_memcpy(struct dma_chan *dchan, dma_addr_t dma_dst, struct xilinx_cdma_tx_segment *segment; struct xilinx_cdma_desc_hw *hw; - if (!len || len > XILINX_DMA_MAX_TRANS_LEN) + if (!len || len > chan->xdev->max_buffer_len) return NULL; desc = xilinx_dma_alloc_tx_descriptor(chan); @@ -2596,7 +2600,7 @@ static int xilinx_dma_probe(struct platform_device *pdev) struct xilinx_dma_device *xdev; struct device_node *child, *np = pdev->dev.of_node; struct resource *io; - u32 num_frames, addr_width; + u32 num_frames, addr_width, len_width; int i, err; /* Allocate and initialize the DMA engine structure */ @@ -2628,8 +2632,24 @@ static int xilinx_dma_probe(struct platform_device *pdev) /* Retrieve the DMA engine properties from the device tree */ xdev->has_sg = of_property_read_bool(node, "xlnx,include-sg"); - if (xdev->dma_config->dmatype == XDMA_TYPE_AXIDMA) + xdev->max_buffer_len = GENMASK(XILINX_DMA_MAX_TRANS_LEN_MAX - 1, 0); + + if 
(xdev->dma_config->dmatype == XDMA_TYPE_AXIDMA) { xdev->mcdma = of_property_read_bool(node,
[PATCH v5 3/7] dt-bindings: dmaengine: xilinx_dma: add optional xlnx,sg-length-width property
The width of the "length register" cannot be autodetected, and it is now specified with a DT property. Add documentation for it. Cc: Rob Herring Cc: Mark Rutland Cc: devicet...@vger.kernel.org Cc: Radhey Shyam Pandey Signed-off-by: Andrea Merello Reviewed-by: Radhey Shyam Pandey --- Changes in v2: - change property name - property is now optional - cc DT maintainer Changes in v3: - reword - cc DT maintainerS and ML Changes in v4: - specify the unit, the valid range and the default value Changes in v5: - commit message trivial fix - fix spaces before tab --- Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt | 4 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt b/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt index a2b8bfaec43c..5df4eac7300c 100644 --- a/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt +++ b/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt @@ -41,6 +41,10 @@ Optional properties: - xlnx,include-sg: Tells configured for Scatter-mode in the hardware. Optional properties for AXI DMA: +- xlnx,sg-length-width: Should be set to the width in bits of the length + register as configured in h/w. Takes values {8...26}. If the property + is missing or invalid then the default value 23 is used. This is the + maximum value that is supported by all IP versions. - xlnx,mcdma: Tells whether configured for multi-channel mode in the hardware. Optional properties for VDMA: - xlnx,flush-fsync: Tells which channel to Flush on Frame sync. -- 2.17.1
[PATCH v5 1/7] dmaengine: xilinx_dma: commonize DMA copy size calculation
This patch removes a bit of duplicated code by introducing a new function that implements calculations for DMA copy size. Suggested-by: Vinod Koul Signed-off-by: Andrea Merello --- Changes in v4: - introduce this patch in the patch series Changes in v5: None --- drivers/dma/xilinx/xilinx_dma.c | 20 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c index 27b523530c4a..a3aaa0e34cc7 100644 --- a/drivers/dma/xilinx/xilinx_dma.c +++ b/drivers/dma/xilinx/xilinx_dma.c @@ -952,6 +952,19 @@ static int xilinx_dma_alloc_chan_resources(struct dma_chan *dchan) return 0; } +/** + * xilinx_dma_calc_copysize - Calculate the amount of data to copy + * @size: Total data that needs to be copied + * @done: Amount of data that has been already copied + * + * Return: Amount of data that has to be copied + */ +static int xilinx_dma_calc_copysize(int size, int done) +{ + return min_t(size_t, size - done, +XILINX_DMA_MAX_TRANS_LEN); +} + /** * xilinx_dma_tx_status - Get DMA transaction status * @dchan: DMA channel @@ -1791,8 +1804,8 @@ static struct dma_async_tx_descriptor *xilinx_dma_prep_slave_sg( * Calculate the maximum number of bytes to transfer, * making sure it is less than the hw limit */ - copy = min_t(size_t, sg_dma_len(sg) - sg_used, -XILINX_DMA_MAX_TRANS_LEN); + copy = xilinx_dma_calc_copysize(sg_dma_len(sg), + sg_used); hw = >hw; /* Fill in the descriptor */ @@ -1896,8 +1909,7 @@ static struct dma_async_tx_descriptor *xilinx_dma_prep_dma_cyclic( * Calculate the maximum number of bytes to transfer, * making sure it is less than the hw limit */ - copy = min_t(size_t, period_len - sg_used, -XILINX_DMA_MAX_TRANS_LEN); + copy = xilinx_dma_calc_copysize(period_len, sg_used); hw = >hw; xilinx_axidma_buf(chan, hw, buf_addr, sg_used, period_len * i); -- 2.17.1
[PATCH v5 2/7] dmaengine: xilinx_dma: in axidma slave_sg and dma_cyclic mode align split descriptors
Whenever a single or cyclic transaction is prepared, the driver could eventually split it over several SG descriptors in order to deal with the HW maximum transfer length. This could end up in DMA operations starting from a misaligned address. This seems fatal for the HW if DRE (Data Realignment Engine) is not enabled. This patch eventually adjusts the transfer size in order to make sure all operations start from an aligned address. Cc: Radhey Shyam Pandey Signed-off-by: Andrea Merello Reviewed-by: Radhey Shyam Pandey --- Changes in v2: - don't introduce copy_mask field, rather rely on already-esistent copy_align field. Suggested by Radhey Shyam Pandey - reword title Changes in v3: - fix bug introduced in v2: wrong copy size when DRE is enabled - use implementation suggested by Radhey Shyam Pandey Changes in v4: - rework on the top of 1/6 Changes in v5: - fix typo in commit title - add hint about "DRE" meaning in commit message --- drivers/dma/xilinx/xilinx_dma.c | 22 ++ 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c index a3aaa0e34cc7..aaa6de8a70e4 100644 --- a/drivers/dma/xilinx/xilinx_dma.c +++ b/drivers/dma/xilinx/xilinx_dma.c @@ -954,15 +954,28 @@ static int xilinx_dma_alloc_chan_resources(struct dma_chan *dchan) /** * xilinx_dma_calc_copysize - Calculate the amount of data to copy + * @chan: Driver specific DMA channel * @size: Total data that needs to be copied * @done: Amount of data that has been already copied * * Return: Amount of data that has to be copied */ -static int xilinx_dma_calc_copysize(int size, int done) +static int xilinx_dma_calc_copysize(struct xilinx_dma_chan *chan, + int size, int done) { - return min_t(size_t, size - done, + size_t copy = min_t(size_t, size - done, XILINX_DMA_MAX_TRANS_LEN); + + if ((copy + done < size) && + chan->xdev->common.copy_align) { + /* +* If this is not the last descriptor, make sure +* the next one will be properly aligned +*/ + 
copy = rounddown(copy, +(1 << chan->xdev->common.copy_align)); + } + return copy; } /** @@ -1804,7 +1817,7 @@ static struct dma_async_tx_descriptor *xilinx_dma_prep_slave_sg( * Calculate the maximum number of bytes to transfer, * making sure it is less than the hw limit */ - copy = xilinx_dma_calc_copysize(sg_dma_len(sg), + copy = xilinx_dma_calc_copysize(chan, sg_dma_len(sg), sg_used); hw = >hw; @@ -1909,7 +1922,8 @@ static struct dma_async_tx_descriptor *xilinx_dma_prep_dma_cyclic( * Calculate the maximum number of bytes to transfer, * making sure it is less than the hw limit */ - copy = xilinx_dma_calc_copysize(period_len, sg_used); + copy = xilinx_dma_calc_copysize(chan, + period_len, sg_used); hw = >hw; xilinx_axidma_buf(chan, hw, buf_addr, sg_used, period_len * i); -- 2.17.1
[PATCH v5 7/7] dmaengine: xilinx_dma: Drop SG support for VDMA IP
xilinx_vdma_start_transfer() is used only for VDMA IP, still it contains conditional code on has_sg variable. has_sg is set only whenever the HW does support SG mode, that is never true for VDMA IP. This patch drops the never-taken branches. Signed-off-by: Andrea Merello --- Changes in V4: introduced this patch in series Changes in v5: None --- drivers/dma/xilinx/xilinx_dma.c | 84 + 1 file changed, 32 insertions(+), 52 deletions(-) diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c index 78d0f2f8225e..07ceadef0a00 100644 --- a/drivers/dma/xilinx/xilinx_dma.c +++ b/drivers/dma/xilinx/xilinx_dma.c @@ -1093,6 +1093,8 @@ static void xilinx_vdma_start_transfer(struct xilinx_dma_chan *chan) struct xilinx_dma_tx_descriptor *desc, *tail_desc; u32 reg, j; struct xilinx_vdma_tx_segment *tail_segment; + struct xilinx_vdma_tx_segment *segment, *last = NULL; + int i = 0; /* This function was invoked with lock held */ if (chan->err) @@ -1112,14 +1114,6 @@ static void xilinx_vdma_start_transfer(struct xilinx_dma_chan *chan) tail_segment = list_last_entry(_desc->segments, struct xilinx_vdma_tx_segment, node); - /* -* If hardware is idle, then all descriptors on the running lists are -* done, start new transfers -*/ - if (chan->has_sg) - dma_ctrl_write(chan, XILINX_DMA_REG_CURDESC, - desc->async_tx.phys); - /* Configure the hardware using info in the config structure */ reg = dma_ctrl_read(chan, XILINX_DMA_REG_DMACR); @@ -1128,15 +1122,11 @@ static void xilinx_vdma_start_transfer(struct xilinx_dma_chan *chan) else reg &= ~XILINX_DMA_DMACR_FRAMECNT_EN; - /* -* With SG, start with circular mode, so that BDs can be fetched. 
-* In direct register mode, if not parking, enable circular mode -*/ - if (chan->has_sg || !config->park) - reg |= XILINX_DMA_DMACR_CIRC_EN; - + /* If not parking, enable circular mode */ if (config->park) reg &= ~XILINX_DMA_DMACR_CIRC_EN; + else + reg |= XILINX_DMA_DMACR_CIRC_EN; dma_ctrl_write(chan, XILINX_DMA_REG_DMACR, reg); @@ -1158,48 +1148,38 @@ static void xilinx_vdma_start_transfer(struct xilinx_dma_chan *chan) return; /* Start the transfer */ - if (chan->has_sg) { - dma_ctrl_write(chan, XILINX_DMA_REG_TAILDESC, - tail_segment->phys); - list_splice_tail_init(>pending_list, >active_list); - chan->desc_pendingcount = 0; - } else { - struct xilinx_vdma_tx_segment *segment, *last = NULL; - int i = 0; - - if (chan->desc_submitcount < chan->num_frms) - i = chan->desc_submitcount; - - list_for_each_entry(segment, >segments, node) { - if (chan->ext_addr) - vdma_desc_write_64(chan, - XILINX_VDMA_REG_START_ADDRESS_64(i++), - segment->hw.buf_addr, - segment->hw.buf_addr_msb); - else - vdma_desc_write(chan, + if (chan->desc_submitcount < chan->num_frms) + i = chan->desc_submitcount; + + list_for_each_entry(segment, >segments, node) { + if (chan->ext_addr) + vdma_desc_write_64(chan, + XILINX_VDMA_REG_START_ADDRESS_64(i++), + segment->hw.buf_addr, + segment->hw.buf_addr_msb); + else + vdma_desc_write(chan, XILINX_VDMA_REG_START_ADDRESS(i++), segment->hw.buf_addr); - last = segment; - } - - if (!last) - return; + last = segment; + } - /* HW expects these parameters to be same for one transaction */ - vdma_desc_write(chan, XILINX_DMA_REG_HSIZE, last->hw.hsize); - vdma_desc_write(chan, XILINX_DMA_REG_FRMDLY_STRIDE, - last->hw.stride); - vdma_desc_write(chan, XILINX_DMA_REG_VSIZE, last->hw.vsize); + if (!last) + return; - chan->desc_submitcount++; - chan->desc_pendingcount--; - list_del(>node); - list_add_tail(>node, >active_list); - if (chan->desc_submitcount == chan->num_frms) - chan->desc_submitcount = 0; - } + /* HW expects these
[PATCH v5 6/7] dt-bindings: dmaengine: xilinx_dma: drop has-sg property
This property is not needed anymore, because the driver now autodetects it. Delete references in documentation. Cc: Rob Herring Cc: Mark Rutland Cc: devicet...@vger.kernel.org Cc: Radhey Shyam Pandey Signed-off-by: Andrea Merello Reviewed-by: Radhey Shyam Pandey Reviewed-by: Rob Herring --- Changes in v2: - cc DT maintainer Changes in v3: - cc DT maintainerS/ML Changes in v4: None Changes in v5: None --- Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt b/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt index 5df4eac7300c..6303ce7fcc3d 100644 --- a/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt +++ b/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt @@ -37,9 +37,6 @@ Required properties: Required properties for VDMA: - xlnx,num-fstores: Should be the number of framebuffers as configured in h/w. -Optional properties: -- xlnx,include-sg: Tells configured for Scatter-mode in - the hardware. Optional properties for AXI DMA: - xlnx,sg-length-width: Should be set to the width in bits of the length register as configured in h/w. Takes values {8...26}. If the property -- 2.17.1
Re: [PATCH AUTOSEL 4.14 27/67] ARM: exynos: Define EINT_WAKEUP_MASK registers for S5Pv210 and Exynos5433
On Fri, 7 Sep 2018 at 02:54, Sasha Levin wrote: > > From: Krzysztof Kozlowski > > [ Upstream commit e5cda42c16d89720c29678f51d95a119490ef7d8 ] > > S5Pv210 and Exynos5433/Exynos7 have different address of > EINT_WAKEUP_MASK register. Rename existing S5P_EINT_WAKEUP_MASK to > avoid confusion and add new ones. This should not be backported to stable. It does not fix anything but prepares the code for a8be2af0218c ("pinctrl: samsung: Write external wakeup interrupt mask"). Best regards, Krzysztof > > Signed-off-by: Krzysztof Kozlowski > Cc: Tomasz Figa > Cc: Sylwester Nawrocki > Acked-by: Tomasz Figa > Tested-by: Marek Szyprowski > Signed-off-by: Sasha Levin > --- > arch/arm/mach-exynos/suspend.c | 2 +- > include/linux/soc/samsung/exynos-regs-pmu.h | 6 +- > 2 files changed, 6 insertions(+), 2 deletions(-) > > diff --git a/arch/arm/mach-exynos/suspend.c b/arch/arm/mach-exynos/suspend.c > index b529ba04ed16..a6a4ba334147 100644 > --- a/arch/arm/mach-exynos/suspend.c > +++ b/arch/arm/mach-exynos/suspend.c > @@ -279,7 +279,7 @@ static int exynos5420_cpu_suspend(unsigned long arg) > static void exynos_pm_set_wakeup_mask(void) > { > /* Set wake-up mask registers */ > - pmu_raw_writel(exynos_get_eint_wake_mask(), S5P_EINT_WAKEUP_MASK); > + pmu_raw_writel(exynos_get_eint_wake_mask(), EXYNOS_EINT_WAKEUP_MASK); > pmu_raw_writel(exynos_irqwake_intmask & ~(1 << 31), S5P_WAKEUP_MASK); > } > > diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h > b/include/linux/soc/samsung/exynos-regs-pmu.h > index bebdde5dccd6..f248e7e079b7 100644 > --- a/include/linux/soc/samsung/exynos-regs-pmu.h > +++ b/include/linux/soc/samsung/exynos-regs-pmu.h > @@ -46,7 +46,7 @@ > #define EXYNOS_SWRESET 0x0400 > > #define S5P_WAKEUP_STAT0x0600 > -#define S5P_EINT_WAKEUP_MASK 0x0604 > +#define EXYNOS_EINT_WAKEUP_MASK0x0604 > #define S5P_WAKEUP_MASK0x0608 > #define S5P_WAKEUP_MASK2 0x0614 > > @@ -184,6 +184,9 @@ > #define S5P_CORE_WAKEUP_FROM_LOCAL_CFG (0x3 << 8) > #define S5P_CORE_AUTOWAKEUP_EN (1 << 
31) > > +/* Only for S5Pv210 */ > +#define S5PV210_EINT_WAKEUP_MASK 0xC004 > + > /* Only for EXYNOS4210 */ > #define S5P_CMU_CLKSTOP_LCD1_LOWPWR0x1154 > #define S5P_CMU_RESET_LCD1_LOWPWR 0x1174 > @@ -645,6 +648,7 @@ > | EXYNOS5420_KFC_USE_STANDBY_WFI3) > > /* For EXYNOS5433 */ > +#define EXYNOS5433_EINT_WAKEUP_MASK(0x060C) > #define EXYNOS5433_USBHOST30_PHY_CONTROL (0x0728) > #define EXYNOS5433_PAD_RETENTION_AUD_OPTION(0x3028) > #define EXYNOS5433_PAD_RETENTION_MMC2_OPTION (0x30C8) > -- > 2.17.1
Re: [RFC PATCH 00/11] Avoid synchronous TLB invalidation for intermediate page-table entries on arm64
On 09/05/2018 08:28 AM, Will Deacon wrote: > On Tue, Sep 04, 2018 at 02:38:02PM -0400, Jon Masters wrote: >> On 08/24/2018 11:52 AM, Will Deacon wrote: >> >>> I hacked up this RFC on the back of the recent changes to the mmu_gather >>> stuff in mainline. It's had a bit of testing and it looks pretty good so >>> far. >> >> I will request the server folks go and test this. You'll probably >> remember a couple of parts we've seen where aggressive walker caches >> ended up (correctly) seeing stale page table entries and we had all >> manner of horrifically hard to debug problems. We have some fairly nice >> reproducers that were able to find this last time that we can test. > > Cheers, Jon, that would be very helpful. You're probably best off using > my (rebasing) tlb branch rather than picking the RFC: > > git://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git tlb > > Let me know if you'd prefer something stable (I can tag it with a date). That would be useful. I've prodded each of the Arm server SoC vendors I work with via our weekly call to have them each specifically check this. A tag would be helpful to that effort I expect. They all claim to be watching this thread now, so we'll see if they see cabbages here. Jon. -- Computer Architect | Sent from my Fedora powered laptop
Re: [LKP] [tty] 0b4f83d510: INFO:task_blocked_for_more_than#seconds
On 09/07/2018, 06:50 AM, kernel test robot wrote: > FYI, we noticed the following commit (built with gcc-7): > > commit: 0b4f83d510f6fef6bb9da25f122c8d733d50516f ("[PATCH 2/4] tty: Hold > tty_ldisc_lock() during tty_reopen()") > url: > https://github.com/0day-ci/linux/commits/Dmitry-Safonov/tty-Hold-write-ldisc-sem-in-tty_reopen/20180829-165618 > base: https://git.kernel.org/cgit/linux/kernel/git/gregkh/tty.git tty-testing > > in testcase: trinity > with following parameters: > > runtime: 300s > > test-description: Trinity is a linux system call fuzz tester. > test-url: http://codemonkey.org.uk/projects/trinity/ > > > on test machine: qemu-system-x86_64 -enable-kvm -m 256M > > caused below changes (please refer to attached dmesg/kmsg for entire > log/backtrace): > > > +--+++ > | | 58dd163974 | 0b4f83d510 | > +--+++ > | boot_successes | 14 | 4 | > | boot_failures| 0 | 6 | > | INFO:task_blocked_for_more_than#seconds | 0 | 6 | > | Kernel_panic-not_syncing:hung_task:blocked_tasks | 0 | 6 | > +--+++ > > > > [ 244.816801] INFO: task validate_data:655 blocked for more than 120 seconds. > [ 244.818833] Not tainted 4.18.0-11684-g0b4f83d #1 > [ 244.820028] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables > this message. > [ 244.826965] validate_data D0 655623 0x2002 > [ 244.828279] Call Trace: > [ 244.828958] ? __schedule+0x843/0x950 > [ 244.830173] ? __ldsem_down_read_nested+0x1c4/0x3b0 > [ 244.834903] schedule+0x31/0x70 > [ 244.835665] schedule_timeout+0x34/0x760 > [ 244.836613] ? ftrace_likely_update+0x35/0x60 > [ 244.837683] ? __ldsem_down_read_nested+0x1c4/0x3b0 > [ 244.838818] ? ftrace_likely_update+0x35/0x60 > [ 244.840127] ? ftrace_likely_update+0x35/0x60 > [ 244.845947] ? __ldsem_down_read_nested+0x1c4/0x3b0 > [ 244.847882] __ldsem_down_read_nested+0x23a/0x3b0 > [ 244.849886] ? tty_ldisc_ref_wait+0x25/0x50 > [ 244.853807] tty_ldisc_ref_wait+0x25/0x50 > [ 244.854946] tty_compat_ioctl+0x8a/0x120 > [ 244.855928] ? 
this_tty+0x80/0x80 > [ 244.856742] __ia32_compat_sys_ioctl+0xc28/0x1ce0 > [ 244.857981] do_int80_syscall_32+0x1d2/0x5f0 > [ 244.859003] entry_INT80_compat+0x88/0xa0 > [ 244.859972] INFO: task dnsmasq:668 blocked for more than 120 seconds. > [ 244.868315] Not tainted 4.18.0-11684-g0b4f83d #1 > [ 244.869583] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables > this message. > [ 244.871744] dnsmasq D0 668 1 0x2002 > [ 244.873063] Call Trace: > [ 244.873697] ? __schedule+0x843/0x950 > [ 244.874572] ? __ldsem_down_read_nested+0x1c4/0x3b0 > [ 244.875725] schedule+0x31/0x70 > [ 244.876576] schedule_timeout+0x34/0x760 > [ 244.877573] ? ftrace_likely_update+0x35/0x60 > [ 244.878660] ? __ldsem_down_read_nested+0x1c4/0x3b0 > [ 244.879872] ? ftrace_likely_update+0x35/0x60 > [ 244.890522] ? ftrace_likely_update+0x35/0x60 > [ 244.891572] ? __ldsem_down_read_nested+0x1c4/0x3b0 > [ 244.892746] __ldsem_down_read_nested+0x23a/0x3b0 > [ 244.893861] ? tty_ldisc_ref_wait+0x25/0x50 > [ 244.894841] tty_ldisc_ref_wait+0x25/0x50 > [ 244.895911] tty_compat_ioctl+0x8a/0x120 > [ 244.896916] ? this_tty+0x80/0x80 > [ 244.897717] __ia32_compat_sys_ioctl+0xc28/0x1ce0 > [ 244.898821] do_int80_syscall_32+0x1d2/0x5f0 > [ 244.899830] entry_INT80_compat+0x88/0xa0 > [ 244.909466] INFO: task dropbear:734 blocked for more than 120 seconds. > [ 244.911173] Not tainted 4.18.0-11684-g0b4f83d #1 > [ 244.912394] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables > this message. > [ 244.914176] dropbearD0 734 1 0x2002 > [ 244.915446] Call Trace: > [ 244.916068] ? __schedule+0x843/0x950 > [ 244.916945] ? __ldsem_down_read_nested+0x1c4/0x3b0 > [ 244.918076] schedule+0x31/0x70 > [ 244.918832] schedule_timeout+0x34/0x760 > [ 244.919781] ? ftrace_likely_update+0x35/0x60 > [ 244.921104] ? __ldsem_down_read_nested+0x1c4/0x3b0 > [ 244.922304] ? ftrace_likely_update+0x35/0x60 > [ 244.923347] ? ftrace_likely_update+0x35/0x60 > [ 244.924369] ? 
__ldsem_down_read_nested+0x1c4/0x3b0 > [ 244.925496] __ldsem_down_read_nested+0x23a/0x3b0 > [ 244.926598] ? tty_ldisc_ref_wait+0x25/0x50 > [ 244.927578] tty_ldisc_ref_wait+0x25/0x50 > [ 244.928526] tty_compat_ioctl+0x8a/0x120 > [ 244.929449] ? this_tty+0x80/0x80 > [ 244.930240] __ia32_compat_sys_ioctl+0xc28/0x1ce0 > [ 244.940083] do_int80_syscall_32+0x1d2/0x5f0 > [ 244.941310] entry_INT80_compat+0x88/0xa0 > [ 244.944070] > [ 244.944070] Showing all locks held
Re: [PATCH v2 1/8] perf/x86: add a function to get the lbr stack
On 09/07/2018 11:28 AM, Andi Kleen wrote: +int perf_get_lbr_stack(struct perf_lbr_stack *stack) +{ + stack->lbr_nr = x86_pmu.lbr_nr; + stack->lbr_tos = x86_pmu.lbr_tos; + stack->lbr_from = x86_pmu.lbr_from; + stack->lbr_to = x86_pmu.lbr_to; + + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO) + stack->lbr_info = MSR_LBR_INFO_0; + else + stack->lbr_info = 0; Seems weird to export the enum value if the enum isn't exported. How can it be used? I'm not sure about the issue. The caller gets the value of MSR_LBR_INFO_0 (not the enum, LBR_FORMAT_INFO) only when the hardware supports it. If hardware doesn't support it, just sets it to 0, and there will be no lbr info msr to be passed through. Best, Wei
Re: [PATCH V3 00/26] C-SKY(csky) Linux Kernel Port
On Thu, Sep 06, 2018 at 07:08:18PM -0700, Guenter Roeck wrote: > Hi, > > On Wed, Sep 05, 2018 at 08:07:39PM +0800, Guo Ren wrote: > > This is the 3th version patchset to add the Linux kernel port for > > C-SKY(csky). > > Thanks to everyone who provided feedback on the previous version. > > > > This patchset adds architecture support to Linux for C-SKY's 32-bit embedded > > CPU cores and the patches are based on linux-4.18.4 > > > > There are two ABI versions with several CPU cores in this patchset: > > ABIv1: ck610 (16-bit instruction, 32-bit data path, VIPT Cache ...) > > ABIv2: ck807 ck810 ck860 (16/32-bit variable length instruction, PIPT > > Cache, > > SMP ...) > > > > My key question is about upstream toolchain support. > The buildroot clone tells me > > $ git describe csky/master > 2017.11-2111-ge9cc5a5 > > and > > $ git log --oneline origin/master..csky/master | wc >11807436 57104 > > with > $ git remote -v > csky https://gitlab.com/c-sky/buildroot.git > origingit://git.buildroot.net/buildroot > > So it looks like there are more thasn a thousand patches on top of > buildroot. Adding an architecture to buildroot should only take a > single patch, or maybe a few, but not more than a thousand. > This strongly suggests that a lot of changes are not upstream > but only available in the buildroot clone. csky https://gitlab.com/c-sky/buildroot.git is our CI environment based on buildroot and it's so miscellaneous. We won't upstream it directly and we'll prepare another patch set for buildroot.org update after kernel, glibc upstreamed. > When are we going to see all those changes in upstream gcc, binutils, > and qemu ? I don't really want to dig through more than a thousand > patches in a buildroot clone to find out details about the status > of upstream toolchain support. Ok, you want to use upstream gcc, binutils to build the kernel. I'll give the tips in next version patch. Best Regards Guo Ren
Re: [PATCH V3 19/26] dt-bindings: timer: gx6605s SOC timer
On Thu, Sep 06, 2018 at 10:02:29AM +0800, Guo Ren wrote: > On Wed, Sep 05, 2018 at 07:47:29PM -0500, Rob Herring wrote: > > On Wed, Sep 5, 2018 at 7:09 AM Guo Ren wrote: > > > > > > Signed-off-by: Guo Ren > > > --- > > > .../bindings/timer/csky,gx6605s-timer.txt | 46 > > > ++ > > > 1 file changed, 46 insertions(+) > Ok, change to "timer0: timer@0x0020a000" Ok, change to "timer0: timer@20a000"
[PATCH v8 0/3]: perf: reduce data loss when profiling highly parallel CPU bound workloads
Currently in record mode the tool implements trace writing serially. The algorithm loops over mapped per-cpu data buffers and stores ready data chunks into a trace file using write() system call. In some circumstances the kernel may lack free space in a buffer because the other buffer's half is not yet written to disk due to some other buffer's data writing by the tool at the moment. Thus serial trace writing implementation may cause the kernel to lose profiling data and that is what is observed when profiling highly parallel CPU bound workloads on machines with big number of cores. Experiment with profiling matrix multiplication code executing 128 threads on Intel Xeon Phi (KNM) with 272 cores, like below, demonstrates data loss metrics value of 98%: /usr/bin/time perf record -o /tmp/perf-ser.data -a -N -B -T -R -g \ --call-graph dwarf,1024 --user-regs=IP,SP,BP \ --switch-events -e cycles,instructions,ref-cycles,software/period=1,name=cs,config=0x3/Duk -- \ matrix.gcc Data loss metrics is the ratio lost_time/elapsed_time where lost_time is the sum of time intervals containing PERF_RECORD_LOST records and elapsed_time is the elapsed application run time under profiling. Applying asynchronous trace streaming through Posix AIO API (http://man7.org/linux/man-pages/man7/aio.7.html) lowers data loss metrics value providing 2x improvement - lowering 98% loss to almost 0%.
--- Alexey Budankov (3): perf util: map data buffer for preserving collected data perf record: enable asynchronous trace writing perf record: extend trace writing to multi AIO tools/perf/builtin-record.c | 166 ++-- tools/perf/perf.h | 1 + tools/perf/util/evlist.c| 7 +- tools/perf/util/evlist.h| 3 +- tools/perf/util/mmap.c | 114 ++ tools/perf/util/mmap.h | 11 ++- 6 files changed, 277 insertions(+), 25 deletions(-) --- Changes in v8: - run the whole thing thru checkpatch.pl and corrected found issues except lines longer than 80 symbols - corrected comments alignment and formatting - moved multi AIO implementation into 3rd patch in the series - implemented explicit cblocks array allocation - split AIO completion check into separate record__aio_complete() - set nr_cblocks default to 1 and max allowed value to 4 Changes in v7: - implemented handling record.aio setting from perfconfig file Changes in v6: - adjusted setting of priorities for cblocks; - handled errno == EAGAIN case from aio_write() return; Changes in v5: - resolved livelock on perf record -e intel_pt// -- dd if=/dev/zero of=/dev/null count=10 - data loss metrics decreased from 25% to 2x in trialed configuration; - reshaped layout of data structures; - implemented --aio option; - avoided nanosleep() prior calling aio_suspend(); - switched to per-cpu aio multi buffer record__aio_sync(); - record_mmap_read_sync() now does global sync just before switching trace file or collection stop; Changes in v4: - converted mmap()/munmap() to malloc()/free() for mmap->data buffer management - converted void *bf to struct perf_mmap *md in signatures - written comment in perf_mmap__push() just before perf_mmap__get(); - written comment in record__mmap_read_sync() on possible restarting of aio_write() operation and releasing perf_mmap object after all; - added perf_mmap__put() for the cases of failed aio_write(); Changes in v3: - written comments about nanosleep(0.5ms) call prior aio_suspend() to cope with intrusiveness of 
its implementation in glibc; - written comments about rationale behind copying profiling data into mmap->data buffer; Changes in v2: - converted zalloc() to calloc() for allocation of mmap_aio array, - cleared typo and adjusted fallback branch code;
[PATCH] Input: elantech - enable middle button of touchpad on ThinkPad P72
Adding 2 new touchpad IDs to support middle button support. Cc: sta...@vger.kernel.org Signed-off-by: Aaron Ma --- drivers/input/mouse/elantech.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/input/mouse/elantech.c b/drivers/input/mouse/elantech.c index 44f57cf6675b..2d95e8d93cc7 100644 --- a/drivers/input/mouse/elantech.c +++ b/drivers/input/mouse/elantech.c @@ -1178,6 +1178,8 @@ static const struct dmi_system_id elantech_dmi_has_middle_button[] = { static const char * const middle_button_pnp_ids[] = { "LEN2131", /* ThinkPad P52 w/ NFC */ "LEN2132", /* ThinkPad P52 */ + "LEN2133", /* ThinkPad P72 w/ NFC */ + "LEN2134", /* ThinkPad P72 */ NULL }; -- 2.17.1
[PATCH v8 1/3]: perf util: map data buffer for preserving collected data
The map->data buffer is used to preserve map->base profiling data for writing to disk. AIO map->cblock is used to queue corresponding map->data buffer for asynchronous writing. Signed-off-by: Alexey Budankov --- Changes in v7: - implemented handling record.aio setting from perfconfig file Changes in v6: - adjusted setting of priorities for cblocks; Changes in v5: - reshaped layout of data structures; - implemented --aio option; Changes in v4: - converted mmap()/munmap() to malloc()/free() for mmap->data buffer management Changes in v2: - converted zalloc() to calloc() for allocation of mmap_aio array, - cleared typo and adjusted fallback branch code; --- tools/perf/util/mmap.c | 25 + tools/perf/util/mmap.h | 3 +++ 2 files changed, 28 insertions(+) diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index fc832676a798..e53038d76445 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -155,6 +155,8 @@ void __weak auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp __mayb void perf_mmap__munmap(struct perf_mmap *map) { + if (map->data) + zfree(>data); if (map->base != NULL) { munmap(map->base, perf_mmap__mmap_len(map)); map->base = NULL; @@ -166,6 +168,7 @@ void perf_mmap__munmap(struct perf_mmap *map) int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd) { + int delta_max; /* * The last one will be done at perf_mmap__consume(), so that we * make sure we don't prevent tools from consuming every last event in @@ -190,6 +193,28 @@ int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd) map->base = NULL; return -1; } + delta_max = sysconf(_SC_AIO_PRIO_DELTA_MAX); + map->data = malloc(perf_mmap__mmap_len(map)); + if (!map->data) { + pr_debug2("failed to allocate data buffer, error %d\n", + errno); + return -1; + } + /* +* Use cblock.aio_fildes value different from -1 +* to denote started aio write operation on the +* cblock so it requires explicit record__aio_sync() +* call prior the cblock may be 
reused again. +*/ + map->cblock.aio_fildes = -1; + /* +* Allocate cblock with max priority delta to +* have faster aio_write() calls because queued +* requests are kept in separate per-prio queues +* and adding a new request iterates thru shorter +* per-prio list. +*/ + map->cblock.aio_reqprio = delta_max; map->fd = fd; if (auxtrace_mmap__mmap(>auxtrace_mmap, diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index d82294db1295..1974e621e36b 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "auxtrace.h" #include "event.h" @@ -25,6 +26,8 @@ struct perf_mmap { bool overwrite; struct auxtrace_mmap auxtrace_mmap; char event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8); + void *data; + struct aiocb cblock; }; /*
Re: [PATCH AUTOSEL 4.18 043/131] ASoC: soc-pcm: Use delay set in component pointer function
On 9/7/2018 5:53 AM, Sasha Levin wrote: > On Mon, Sep 03, 2018 at 12:16:26PM +0100, Mark Brown wrote: >> On Sun, Sep 02, 2018 at 01:03:55PM +, Sasha Levin wrote: >>> From: Akshu Agrawal >>> >>> [ Upstream commit 9fb4c2bf130b922c77c16a8368732699799c40de ] >>> >>> Take into account the base delay set in pointer callback. >>> >>> There are cases where a pointer function populates >>> runtime->delay, such as: >>> ./sound/pci/hda/hda_controller.c >>> ./sound/soc/intel/atom/sst-mfld-platform-pcm.c >> >> I'm worried that if anyone notices this at all they will have already >> compensated for the delays in userspace and therefore this will cause >> them to see problems as they get double compenstation for delays. > > But what happens when they update to a newer Stable? They're going to > hit that issue anyways. > Drivers which had exposed this delay in pointer function but have compensated for the issue in userspace are likely to see the problem of double delay when the update happens. I don't know what is the best way to communicate that the issue is fixed in the kernel and userspace compensation isn't required. But more likely I think the delay was just getting left out and there wouldn't have been a compensation in userspace. Thanks, Akshu
Re: [PATCH] sched/fair: vruntime should normalize when switching from fair
On 06/09/18 16:25, Dietmar Eggemann wrote: > Hi Juri, > > On 08/23/2018 11:54 PM, Juri Lelli wrote: > > On 23/08/18 18:52, Dietmar Eggemann wrote: > > > Hi, > > > > > > On 08/21/2018 01:54 AM, Miguel de Dios wrote: > > > > On 08/17/2018 11:27 AM, Steve Muckle wrote: > > > > > From: John Dias > > [...] > > > > > > > I tried to catch this issue on my Arm64 Juno board using pi_test (and a > > > slightly adapted pip_test (usleep_val = 1500 and keep low as cfs)) from > > > rt-tests but wasn't able to do so. > > > > > > # pi_stress --inversions=1 --duration=1 --groups=1 --sched > > > id=low,policy=cfs > > > > > > Starting PI Stress Test > > > Number of thread groups: 1 > > > Duration of test run: 1 seconds > > > Number of inversions per group: 1 > > > Admin thread SCHED_FIFO priority 4 > > > 1 groups of 3 threads will be created > > >High thread SCHED_FIFO priority 3 > > > Med thread SCHED_FIFO priority 2 > > > Low thread SCHED_OTHER nice 0 > > > > > > # ./pip_stress > > > > > > In both cases, the cfs task entering rt_mutex_setprio() is queued, so > > > dequeue_task_fair()->dequeue_entity(), which subtracts > > > cfs_rq->min_vruntime > > > from se->vruntime, is called on it before it gets the rt prio. > > > > > > Maybe it requires a very specific use of the pthread library to provoke > > > this > > > issue by making sure that the cfs tasks really blocks/sleeps? > > > > Maybe one could play with rt-app to recreate such specific use case? > > > > https://github.com/scheduler-tools/rt-app/blob/master/doc/tutorial.txt#L459 > > I played a little bit with rt-app on hikey960 to re-create Steve's test > program. Oh, nice! Thanks for sharing what you have got. > Since there is no semaphore support (sem_wait(), sem_post()) I used > condition variables (wait: pthread_cond_wait() , signal: > pthread_cond_signal()). It's not really the same since this is stateless but > sleeps before the signals help to maintain the state in this easy example. 
> > This provokes the vruntime issue e.g. for cpus 0,4 and it doesn't for 0,1: > > > "global": { > "calibration" : 130, > "pi_enabled" : true > }, > "tasks": { > "rt_task": { > "loop" : 100, > "policy" : "SCHED_FIFO", > "cpus" : [0], > > "lock" : "b_mutex", > "wait" : { "ref" : "b_cond", "mutex" : "b_mutex" }, > "unlock" : "b_mutex", > "sleep" : 3000, > "lock1" : "a_mutex", > "signal" : "a_cond", > "unlock1" : "a_mutex", > "lock2" : "pi-mutex", > "unlock2" : "pi-mutex" > }, > "cfs_task": { > "loop" : 100, > "policy" : "SCHED_OTHER", > "cpus" : [4], > > "lock" : "pi-mutex", > "sleep" : 3000, > "lock1" : "b_mutex", > "signal" : "b_cond", > "unlock" : "b_mutex", > "lock2" : "a_mutex", > "wait" : { "ref" : "a_cond", "mutex" : "a_mutex" }, > "unlock1" : "a_mutex", > "unlock2" : "pi-mutex" > } > } > } > > Adding semaphores is possible but rt-app has no easy way to initialize > individual objects, e.g. sem_init(..., value). The only way I see is via the > global section, like "pi_enabled". But then, this is true for all objects of > this kind (in this case mutexes)? Right, global section should work fine. Why do you think this is a problem/limitation? > So the following couple of lines extension to rt-app works because both > semaphores can be initialized to 0: > > { > "global": { > "calibration" : 130, > "pi_enabled" : true > }, > "tasks": { > "rt_task": { > "loop" : 100, > "policy" : "SCHED_FIFO", > "cpus" : [0], > > "sem_wait" : "b_sem", > "sleep" : 1000, > "sem_post" : "a_sem", > > "lock" : "pi-mutex", > "unlock" : "pi-mutex" > }, > "cfs_task": { > "loop" : 100, > "policy" : "SCHED_OTHER", > "cpus" : [4], > > "lock" : "pi-mutex", > "sleep" : 1000, > "sem_post" : "b_sem", > "sem_wait" : "a_sem", > "unlock" : "pi-mutex" > } > } > } > > Any thoughts on that? I can see something like this as infrastructure to > create a regression test case based on rt-app and standard ftrace. Agree. 
I guess we should add your first example to the repo (you'd be very welcome to create a PR) already and then work to support the second?
[PATCH v8 2/3]: perf record: enable asynchronous trace writing
Trace file offset is calculated and updated linearly prior enqueuing aio write at record__pushfn(). record__aio_sync() blocks till completion of started AIO operation and then proceeds. record__mmap_read_sync() implements a barrier for all incomplete aio write requests. Signed-off-by: Alexey Budankov --- Changes in v8: - split AIO completion check into separate record__aio_complete() Changes in v6: - handled errno == EAGAIN case from aio_write(); Changes in v5: - data loss metrics decreased from 25% to 2x in trialed configuration; - avoided nanosleep() prior calling aio_suspend(); - switched to per cpu multi record__aio_sync() aio - record_mmap_read_sync() now does global barrier just before switching trace file or collection stop; - resolved livelock on perf record -e intel_pt// -- dd if=/dev/zero of=/dev/null count=10 Changes in v4: - converted void *bf to struct perf_mmap *md in signatures - written comment in perf_mmap__push() just before perf_mmap__get(); - written comment in record__mmap_read_sync() on possible restarting of aio_write() operation and releasing perf_mmap object after all; - added perf_mmap__put() for the cases of failed aio_write(); Changes in v3: - written comments about nanosleep(0.5ms) call prior aio_suspend() to cope with intrusiveness of its implementation in glibc; - written comments about rationale behind coping profiling data into mmap->data buffer; --- tools/perf/builtin-record.c | 128 +++- tools/perf/util/mmap.c | 54 ++- tools/perf/util/mmap.h | 2 +- 3 files changed, 169 insertions(+), 15 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 22ebeb92ac51..d4857572cf33 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -121,6 +121,93 @@ static int record__write(struct record *rec, void *bf, size_t size) return 0; } +static int record__aio_write(struct aiocb *cblock, int trace_fd, + void *buf, size_t size, off_t off) +{ + int rc; + + cblock->aio_fildes = trace_fd; + 
cblock->aio_buf= buf; + cblock->aio_nbytes = size; + cblock->aio_offset = off; + cblock->aio_sigevent.sigev_notify = SIGEV_NONE; + + do { + rc = aio_write(cblock); + if (rc == 0) { + break; + } else if (errno != EAGAIN) { + cblock->aio_fildes = -1; + pr_err("failed to queue perf data, error: %m\n"); + break; + } + } while (1); + + return rc; +} + +static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock) +{ + void *rem_buf; + off_t rem_off; + size_t rem_size; + int rc, aio_errno; + ssize_t aio_ret, written; + + aio_errno = aio_error(cblock); + if (aio_errno == EINPROGRESS) + return 0; + + written = aio_ret = aio_return(cblock); + if (aio_ret < 0) { + if (!(aio_errno == EINTR)) + pr_err("failed to write perf data, error: %m\n"); + written = 0; + } + + rem_size = cblock->aio_nbytes - written; + + if (rem_size == 0) { + cblock->aio_fildes = -1; + /* +* md->refcount is incremented in perf_mmap__push() for +* every enqueued aio write request so decrement it because +* the request is now complete. +*/ + perf_mmap__put(md); + rc = 1; + } else { + /* +* aio write request may require restart with the +* reminder if the kernel didn't write whole +* chunk at once. 
+*/ + rem_off = cblock->aio_offset + written; + rem_buf = (void *)(cblock->aio_buf + written); + record__aio_write(cblock, cblock->aio_fildes, + rem_buf, rem_size, rem_off); + rc = 0; + } + + return rc; +} + +static void record__aio_sync(struct perf_mmap *md) +{ + struct aiocb *cblock = &md->cblock; + struct timespec timeout = { 0, 1000 * 1000 * 1 }; // 1ms + + do { + if (cblock->aio_fildes == -1 || record__aio_complete(md, cblock)) + return; + + while (aio_suspend((const struct aiocb **)&cblock, 1, &timeout)) { + if (!(errno == EAGAIN || errno == EINTR)) + pr_err("failed to sync perf data, error: %m\n"); + } + } while (1); +} + static int process_synthesized_event(struct perf_tool *tool, union perf_event *event, struct perf_sample *sample __maybe_unused, @@ -130,12 +217,27 @@ static int process_synthesized_event(struct
Re: [PATCH resend 0/2] irqchip: convert to SPDX for Renesas drivers
On Fri, 07 Sep 2018 02:50:13 +0100, Kuninori Morimoto wrote: > > > Hi Thomas, Marc, Jason > > 2weeks passed. I resend this patch again > > Kuninori Morimoto (2): > pinctrl: sh-pfc: convert to SPDX identifiers > pinctrl: rza1: convert to SPDX identifiers > > drivers/pinctrl/pinctrl-rza1.c | 5 + > drivers/pinctrl/sh-pfc/Kconfig | 1 + > drivers/pinctrl/sh-pfc/core.c| 5 + > drivers/pinctrl/sh-pfc/core.h| 7 ++- > drivers/pinctrl/sh-pfc/gpio.c| 5 + > drivers/pinctrl/sh-pfc/pfc-emev2.c | 5 + > drivers/pinctrl/sh-pfc/pfc-r8a73a4.c | 15 +-- > drivers/pinctrl/sh-pfc/pfc-r8a7740.c | 15 +-- > drivers/pinctrl/sh-pfc/pfc-r8a7778.c | 10 +- > drivers/pinctrl/sh-pfc/pfc-r8a7779.c | 14 +- > drivers/pinctrl/sh-pfc/pfc-r8a7790.c | 15 +-- > drivers/pinctrl/sh-pfc/pfc-r8a7791.c | 5 + > drivers/pinctrl/sh-pfc/pfc-r8a7792.c | 5 + > drivers/pinctrl/sh-pfc/pfc-r8a7794.c | 5 + > drivers/pinctrl/sh-pfc/pfc-r8a7795-es1.c | 5 + > drivers/pinctrl/sh-pfc/pfc-r8a7795.c | 5 + > drivers/pinctrl/sh-pfc/pfc-r8a7796.c | 5 + > drivers/pinctrl/sh-pfc/pfc-r8a77970.c| 5 + > drivers/pinctrl/sh-pfc/pfc-r8a77995.c| 5 + > drivers/pinctrl/sh-pfc/pfc-sh7203.c | 5 + > drivers/pinctrl/sh-pfc/pfc-sh7264.c | 5 + > drivers/pinctrl/sh-pfc/pfc-sh7269.c | 5 + > drivers/pinctrl/sh-pfc/pfc-sh73a0.c | 15 +-- > drivers/pinctrl/sh-pfc/pfc-sh7720.c | 5 + > drivers/pinctrl/sh-pfc/pfc-sh7723.c | 5 + > drivers/pinctrl/sh-pfc/pfc-sh7724.c | 5 + > drivers/pinctrl/sh-pfc/pfc-sh7734.c | 5 + > drivers/pinctrl/sh-pfc/pfc-sh7757.c | 5 + > drivers/pinctrl/sh-pfc/pfc-sh7785.c | 5 + > drivers/pinctrl/sh-pfc/pfc-sh7786.c | 5 + > drivers/pinctrl/sh-pfc/pfc-shx3.c| 5 + > drivers/pinctrl/sh-pfc/pinctrl.c | 5 + > drivers/pinctrl/sh-pfc/sh_pfc.h | 7 ++- > 33 files changed, 35 insertions(+), 184 deletions(-) [+ Linus] If I trust the diffstat, should this be sent to the pinctrl maintainer instead? M. -- Jazz is not dead, it just smell funny.
Re: [PATCH] vme: remove unneeded kfree
On Thu, Sep 06, 2018 at 10:04:49PM -0700, Linus Torvalds wrote: > On Thu, Sep 6, 2018 at 1:51 AM Ding Xiang > wrote: > > > > put_device will call vme_dev_release to free vdev, kfree is > > unnecessary here. > > That does seem to be the case. I think "unnecessary" is overly kind, > it does seem to be a double free. > > Looks like the issue was introduced back in 2013 by commit > def1820d25fa ("vme: add missing put_device() after device_register() > fails"). > > It seems you should *either* kfree() the vdev, _or_ do put_device(), > but doing both seems wrong. You should only ever call put_device() after you have created the structure, the documentation should say that somewhere... > I presume the device_register() has never failed, and this being > vme-only I'm guessing there isn't a vibrant testing community. > > Greg? It's the correct fix, I'll queue it up soon, thanks. greg k-h
[PATCH 1/2] mtd: rawnand: denali: remove ->dev_ready() hook
The Denali NAND IP has no way to read out the current signal level of the R/B# pin. Instead, denali_dev_ready() checks if the R/B# transition has already happened. (The INTR__INT_ACT interrupt is asserted at the rising edge of the R/B# pin.) It is not a correct way to implement the ->dev_ready() hook. In fact, it has a drawback; in the nand_scan_ident phase, the chip detection iterates over maxchips until it fails to find a homogeneous chip. For the last loop, nand_reset() fails if no chip is there. If ->dev_ready hook exists, nand_command(_lp) calls nand_wait_ready() after NAND_CMD_RESET. However, we know denali_dev_ready() never returns 1 unless there exists a chip that toggles R/B# in that chip select. Then, nand_wait_ready() just ends up with wasting 400 msec, in the end, shows the "timeout while waiting for chip to become ready" warning. Let's remove the mis-implemented dev_ready hook, and fallback to sending the NAND_CMD_STATUS and nand_wait_status_ready(), which bails out more quickly. Signed-off-by: Masahiro Yamada --- drivers/mtd/nand/raw/denali.c | 22 +- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/drivers/mtd/nand/raw/denali.c b/drivers/mtd/nand/raw/denali.c index f88a5dc..f069184 100644 --- a/drivers/mtd/nand/raw/denali.c +++ b/drivers/mtd/nand/raw/denali.c @@ -203,18 +203,6 @@ static uint32_t denali_wait_for_irq(struct denali_nand_info *denali, return denali->irq_status; } -static uint32_t denali_check_irq(struct denali_nand_info *denali) -{ - unsigned long flags; - uint32_t irq_status; - - spin_lock_irqsave(>irq_lock, flags); - irq_status = denali->irq_status; - spin_unlock_irqrestore(>irq_lock, flags); - - return irq_status; -} - static void denali_read_buf(struct mtd_info *mtd, uint8_t *buf, int len) { struct denali_nand_info *denali = mtd_to_denali(mtd); @@ -294,7 +282,7 @@ static void denali_cmd_ctrl(struct mtd_info *mtd, int dat, unsigned int ctrl) return; /* -* Some commands are followed by chip->dev_ready or chip->waitfunc. 
+* Some commands are followed by chip->waitfunc. * irq_status must be cleared here to catch the R/B# interrupt later. */ if (ctrl & NAND_CTRL_CHANGE) @@ -303,13 +291,6 @@ static void denali_cmd_ctrl(struct mtd_info *mtd, int dat, unsigned int ctrl) denali->host_write(denali, DENALI_BANK(denali) | type, dat); } -static int denali_dev_ready(struct mtd_info *mtd) -{ - struct denali_nand_info *denali = mtd_to_denali(mtd); - - return !!(denali_check_irq(denali) & INTR__INT_ACT); -} - static int denali_check_erased_page(struct mtd_info *mtd, struct nand_chip *chip, uint8_t *buf, unsigned long uncor_ecc_flags, @@ -1349,7 +1330,6 @@ int denali_init(struct denali_nand_info *denali) chip->write_byte = denali_write_byte; chip->read_word = denali_read_word; chip->cmd_ctrl = denali_cmd_ctrl; - chip->dev_ready = denali_dev_ready; chip->waitfunc = denali_waitfunc; if (features & FEATURES__INDEX_ADDR) { -- 2.7.4
[PATCH 0/2] mtd: rawnand: denali: clean-up unnecessary hook and device reset
As I replied to Boris [1], I took a closer look for further cleanups. I test this series on my board. Remove mis-implemented ->dev_ready hook. Remove unnecessary device resetting because nand_scan_ident() reset devices anyway. [1] http://patchwork.ozlabs.org/patch/960160/ Masahiro Yamada (2): mtd: rawnand: denali: remove ->dev_ready() hook mtd: rawnand: denali: remove denali_reset_banks() drivers/mtd/nand/raw/denali.c | 51 +-- 1 file changed, 1 insertion(+), 50 deletions(-) -- 2.7.4
[PATCH 2/2] mtd: rawnand: denali: remove denali_reset_banks()
In nand_scan_ident(), the controller driver resets every NAND chip. This is done by sending NAND_CMD_RESET. The Denali IP provides another way to do the equivalent thing; if a bit is set in the DEVICE_RESET register, the controller sends the RESET command to the corresponding device. denali_reset_banks() uses it to reset all devices beforehand. This redundant reset sequence was needed to know the actual number of chips before calling nand_scan_ident(); if DEVICE_RESET fails, there is no chip in that chip select. Then, denali_reset_banks() sets denali->max_banks to the number of detected chips. As commit f486287d2372 ("mtd: nand: denali: fix bank reset function to detect the number of chips") explained, nand_scan_ident() issued Set Features (0xEF) command to all CS lines, some of which may not be connected with a chip. Then, the driver would wait for R/B# response, which never happens. This problem was solved by commit 107b7d6a7ad4 ("mtd: rawnand: avoid setting again the timings to mode 0 after a reset"). In the current code, nand_setup_data_interface() is called from nand_scan_tail(), which is invoked after the chip detection. Now, we can really remove the redundant denali_nand_banks() by simply passing the maximum number of chip selects supported by this IP (typically 4 or 8) to nand_scan(). Let's leave all the chip detection process to nand_scan_ident(). 
Signed-off-by: Masahiro Yamada --- drivers/mtd/nand/raw/denali.c | 29 - 1 file changed, 29 deletions(-) diff --git a/drivers/mtd/nand/raw/denali.c b/drivers/mtd/nand/raw/denali.c index f069184..d1ae968 100644 --- a/drivers/mtd/nand/raw/denali.c +++ b/drivers/mtd/nand/raw/denali.c @@ -1040,29 +1040,6 @@ static int denali_setup_data_interface(struct mtd_info *mtd, int chipnr, return 0; } -static void denali_reset_banks(struct denali_nand_info *denali) -{ - u32 irq_status; - int i; - - for (i = 0; i < denali->max_banks; i++) { - denali->active_bank = i; - - denali_reset_irq(denali); - - iowrite32(DEVICE_RESET__BANK(i), - denali->reg + DEVICE_RESET); - - irq_status = denali_wait_for_irq(denali, - INTR__RST_COMP | INTR__INT_ACT | INTR__TIME_OUT); - if (!(irq_status & INTR__INT_ACT)) - break; - } - - dev_dbg(denali->dev, "%d chips connected\n", i); - denali->max_banks = i; -} - static void denali_hw_init(struct denali_nand_info *denali) { /* @@ -1311,12 +1288,6 @@ int denali_init(struct denali_nand_info *denali) } denali_enable_irq(denali); - denali_reset_banks(denali); - if (!denali->max_banks) { - /* Error out earlier if no chip is found for some reasons. */ - ret = -ENODEV; - goto disable_irq; - } denali->active_bank = DENALI_INVALID_BANK; -- 2.7.4
Re: [PATCH 1/2] platform/chrome: Move mfd/cros_ec_lpc* includes to drivers/platform.
Hi Enric, On Wed, Jul 18, 2018 at 06:09:55PM +0200, Enric Balletbo i Serra wrote: > The cros-ec-lpc driver lives in drivers/platform because is platform > specific, however there are two includes (cros_ec_lpc_mec.h and > cros_ec_lpc_reg.h) that lives in include/linux/mfd. These two includes > are only used for the platform driver and are not really related to the > MFD subsystem, so move the includes from include/linux/mfd to > drivers/platform/chrome. > > Signed-off-by: Enric Balletbo i Serra Thanks. Applied to my working branch for v4.20. -- Benson Leung Staff Software Engineer Chrome OS Kernel Google Inc. ble...@google.com Chromium OS Project ble...@chromium.org signature.asc Description: PGP signature
Re: [PATCH v7 1/2] leds: core: Introduce LED pattern trigger
Hi! > +What:/sys/class/leds//hw_pattern > +Date:September 2018 > +KernelVersion: 4.20 > +Description: > + Specify a hardware pattern for the SC27XX LED. For the SC27XX > + LED controller, it only supports 4 hardware patterns to > configure > + the low time, rise time, high time and fall time for the > breathing > + mode, and each stage duration unit is 125ms. So the format of > + the hardware pattern values should be: > + "brightness_1 duration_1 brightness_2 duration_2 brightness_3 > + duration_3 brightness_4 duration_4". > > In this case low time and high time can be easily described with > use of the proposed [brightness delta_t] tuples. It is not equally > obvious in case of rise time and fall time. > > I can imagine hw pattern that would require defining blink rate > over period of time, or blink rate during rise/fall time - in the > latter case we would have odd number of pattern components. Probably > it wouldn't be a big deal, we'd need one "padding" value, but still > there's room for improvement IMHO. Well, you can describe blinking while rising, it is just going to be awkward as you'll need to give precise times/brightnesses for each blinking, and pattern will become long. I'm sure some hardware can do that (the led in N900 can compute prime numbers, it can blink while changing brightness, too). OTOH people tend to use pretty simple patterns on their LEDs, so we should be fine. Pavel -- (english) http://www.livejournal.com/~pavelmachek (cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html signature.asc Description: Digital signature
Re: [PATCH] printk/tracing: Do not trace printk_nmi_enter()
On Wed, Sep 05, 2018 at 09:33:34PM -0400, Steven Rostedt wrote: > do_idle { > > [interrupts enabled] > > [interrupts disabled] > TRACE_IRQS_OFF [lockdep says irqs off] > [...] > TRACE_IRQS_IRET > test if pt_regs say return to interrupts enabled [yes] > TRACE_IRQS_ON [lockdep says irqs are on] > > > nmi_enter() { > printk_nmi_enter() [traced by ftrace] > [ hit ftrace breakpoint ] > > TRACE_IRQS_OFF [lockdep says irqs off] > [...] > TRACE_IRQS_IRET [return from breakpoint] > test if pt_regs say interrupts enabled [no] > [iret back to interrupt] > [iret back to code] > > tick_nohz_idle_enter() { > > lockdep_assert_irqs_enabled() [lockdep say no!] Isn't the problem that we muck with the IRQ state from NMI context? We shouldn't be doing that. The thing is, since we trace the IRQ state from within IRQ-disable, since that's the only IRQ-safe option, it is very much not NMI-safe. Your patch might avoid the symptom, but I don't think it cures the fundamental problem.
Re: [PATCH v9 3/6] kernel/reboot.c: export pm_power_off_prepare
Hi Mark, On Thu, Sep 06, 2018 at 11:15:17AM +0100, Mark Brown wrote: > On Mon, Aug 27, 2018 at 09:48:16AM +0800, Shawn Guo wrote: > > > Can you ACK on those two regulator patches, so that I can queue this > > series up on IMX tree? > > I was expecting to get a pull request with the precursor patches in it - > the regulator driver seems to get a moderate amount of development so > there's a reasonable risk of conflicts. Is there anything I can or should do? -- Pengutronix e.K. | | Industrial Linux Solutions | http://www.pengutronix.de/ | Peiner Str. 6-8, 31137 Hildesheim, Germany | Phone: +49-5121-206917-0| Amtsgericht Hildesheim, HRA 2686 | Fax: +49-5121-206917- | signature.asc Description: PGP signature
Re: [PATCH 0/9] psi: pressure stall information for CPU, memory, and IO v4
On Thu, Sep 6, 2018 at 5:43 AM, Johannes Weiner wrote: > Peter, do the changes from v3 look sane to you? > > If there aren't any further objections, I was hoping we could get this > lined up for 4.20. That would be excellent. I just retested the latest version at http://git.cmpxchg.org/cgit.cgi/linux-psi.git (Linux 4.18) and the results are great. Test setup: Endless OS GeminiLake N4200 low end laptop 2GB RAM swap (and zram swap) disabled Baseline test: open a handful of large-ish apps and several website tabs in Google Chrome. Results: after a couple of minutes, system is excessively thrashing, mouse cursor can barely be moved, UI is not responding to mouse clicks, so it's impractical to recover from this situation as an ordinary user Add my simple killer: https://gist.github.com/dsd/a8988bf0b81a6163475988120fe8d9cd Results: when the thrashing causes the UI to become sluggish, the killer steps in and kills something (usually a chrome tab), and the system remains usable. I repeatedly opened more apps and more websites over a 15 minute period but I wasn't able to get the system to a point of UI unresponsiveness. Thanks, Daniel
[PATCH v8 3/3]: perf record: extend trace writing to multi AIO
Multi AIO trace writing allows caching more kernel data into userspace memory postponing trace writing for the sake of overall profiling data thruput increase. It could be seen as kernel data buffer extension into userspace memory. With aio-cblocks option value different from 1, current default value, tool has capability to cache more and more data into user space along with delegating spill to AIO. That allows avoiding suspend at record__aio_sync() between calls of record__mmap_read_evlist() and increase profiling data thruput for the cost of userspace memory. Signed-off-by: Alexey Budankov --- tools/perf/builtin-record.c | 55 +++--- tools/perf/perf.h | 1 + tools/perf/util/evlist.c| 7 ++-- tools/perf/util/evlist.h| 3 +- tools/perf/util/mmap.c | 83 +++-- tools/perf/util/mmap.h | 10 +++--- 6 files changed, 114 insertions(+), 45 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index d4857572cf33..6361098a5898 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -192,16 +192,35 @@ static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock) return rc; } -static void record__aio_sync(struct perf_mmap *md) +static int record__aio_sync(struct perf_mmap *md, bool sync_all) { - struct aiocb *cblock = >cblock; + struct aiocb **aiocb = md->aiocb; + struct aiocb *cblocks = md->cblocks; struct timespec timeout = { 0, 1000 * 1000 * 1 }; // 1ms + int i, do_suspend; do { - if (cblock->aio_fildes == -1 || record__aio_complete(md, cblock)) - return; + do_suspend = 0; + for (i = 0; i < md->nr_cblocks; ++i) { + if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, [i])) { + if (sync_all) + aiocb[i] = NULL; + else + return i; + } else { + /* +* Started aio write is not complete yet +* so it has to be waited before the +* next allocation. 
+*/ + aiocb[i] = [i]; + do_suspend = 1; + } + } + if (!do_suspend) + return -1; - while (aio_suspend((const struct aiocb**), 1, )) { + while (aio_suspend((const struct aiocb **)aiocb, md->nr_cblocks, )) { if (!(errno == EAGAIN || errno == EINTR)) pr_err("failed to sync perf data, error: %m\n"); } @@ -428,7 +447,8 @@ static int record__mmap_evlist(struct record *rec, if (perf_evlist__mmap_ex(evlist, opts->mmap_pages, opts->auxtrace_mmap_pages, -opts->auxtrace_snapshot_mode) < 0) { +opts->auxtrace_snapshot_mode, +opts->nr_cblocks) < 0) { if (errno == EPERM) { pr_err("Permission error mapping pages.\n" "Consider increasing " @@ -621,7 +641,7 @@ static void record__mmap_read_sync(struct record *rec) for (i = 0; i < evlist->nr_mmaps; i++) { struct perf_mmap *map = [i]; if (map->base) - record__aio_sync(map); + record__aio_sync(map, true); } } @@ -629,7 +649,7 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli bool overwrite) { u64 bytes_written = rec->bytes_written; - int i; + int i, idx; int rc = 0; struct perf_mmap *maps; @@ -648,11 +668,12 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli if (maps[i].base) { /* -* Call record__aio_sync() to wait till map->data buffer -* becomes available after previous aio write request. +* Call record__aio_sync() to get some free map->data +* buffer or wait if all of previously started aio +* writes are still incomplete. */ - record__aio_sync([i]); - if (perf_mmap__push([i], rec, record__pushfn) != 0) { + idx = record__aio_sync([i], false); + if (perf_mmap__push([i], rec, idx, record__pushfn) != 0) { rc = -1;
Re: [PATCH] sched/fair: fix load_balance redo for null imbalance
Le Friday 07 Sep 2018 à 13:37:49 (+0200), Peter Zijlstra a écrit : > On Fri, Sep 07, 2018 at 09:51:04AM +0200, Vincent Guittot wrote: > > It can happen that load_balance finds a busiest group and then a busiest rq > > but the calculated imbalance is in fact null. > > Cute. Does that happen often? I have a use case with RT tasks that reproduces the problem regularly. It happens at least when we have CPUs with different capacity either because of heterogeous CPU or because of RT/DL reducing available capacity for cfs I have put the call path that trigs the problem below and accroding to the comment it seems that we can reach similar state when playing with priority. > > > If the calculated imbalance is null, it's useless to try to find a busiest > > rq as no task will be migrated and we can return immediately. > > > > This situation can happen with heterogeneous system or smp system when RT > > tasks are decreasing the capacity of some CPUs. > > Is it the result of one of those "force_balance" conditions in > find_busiest_group() ? Should we not fix that to then return NULL > instead? 
The UC is: We have a newly_idle load balance that is triggered when RT task becomes idle ( but I think that I have seen that with idle load balance too) we trigger: if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && busiest->group_no_capacity) goto force_balance; In calculate_imbalance we use the path /* * Avg load of busiest sg can be less and avg load of local sg can * be greater than avg load across all sgs of sd because avg load * factors in sg capacity and sgs with smaller group_type are * skipped when updating the busiest sg: */ if (busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) { env->imbalance = 0; return fix_small_imbalance(env, sds); } but fix_small_imbalance finally decides to return without modifying imbalance like here if (busiest->avg_load + scaled_busy_load_per_task >= local->avg_load + (scaled_busy_load_per_task * imbn)) { env->imbalance = busiest->load_per_task; return; } Besides this patch, I'm preparing another patch in fix small imbalance to ensure 1 task per CPU in similar situation but according to the comment above, we can reach this situation because of tasks priority
Re: [PATCH 4/4] sched/numa: Do not move imbalanced load purely on the basis of an idle CPU
On Fri, Sep 07, 2018 at 01:33:09PM +0200, Peter Zijlstra wrote: > > --- > > kernel/sched/fair.c | 2 +- > > 1 file changed, 1 insertion(+), 1 deletion(-) > > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c > > index d59d3e00a480..d4c289c11012 100644 > > --- a/kernel/sched/fair.c > > +++ b/kernel/sched/fair.c > > @@ -1560,7 +1560,7 @@ static bool task_numa_compare(struct task_numa_env > > *env, > > goto unlock; > > > > if (!cur) { > > - if (maymove || imp > env->best_imp) > > + if (maymove) > > goto assign; > > else > > goto unlock; > > Srikar's patch here: > > > http://lkml.kernel.org/r/1533276841-16341-4-git-send-email-sri...@linux.vnet.ibm.com > > Also frobs this condition, but in a less radical way. Does that yield > similar results? I can check. I do wonder of course if the less radical approach just means that automatic NUMA balancing and the load balancer simply disagree about placement at a different time. It'll take a few days to have an answer as the battery of workloads to check this take ages. -- Mel Gorman SUSE Labs
Re: [GIT PULL] ext4 updates for 3.11
Digging up an email thread from 2013... On Wed, Jul 03, 2013 at 01:29:41PM +1000, Dave Chinner wrote: > On Tue, Jul 02, 2013 at 06:01:11PM -0700, Greg KH wrote: > > On Tue, Jul 02, 2013 at 05:58:15PM -0700, Linus Torvalds wrote: > > > On Tue, Jul 2, 2013 at 5:54 PM, Greg KH wrote: > > > > On Tue, Jul 02, 2013 at 05:02:21PM -0700, Linus Torvalds wrote: > > > >> > > > >> I'm really not convinced this whole Lustre thing was correctly > > > >> handled. Merging it into stable and yet being in such bad shape that > > > >> it isn't enabled even there? I just dunno. But I have the turd in my > > > >> tree now, let's hope it gets fixed up. > > > > > > > > It's in "staging", not "stable" :) > > > > > > Yes. But what was the reason to actually merge it even there? And once > > > it gets merged, disabling it again rather than fixing the problems it > > > has? > > > > The problems turned out to be too big, too late in the merge cycle for > > me to be able to take them (they still aren't even done, as I don't have > > a working set of patches yet.) So I just disabled it from the build to > > give Andreas and team time to get it working properly. > > > > I could have just removed it, but I thought I would give them a chance. > > > > > This is a filesystem that Intel apparently wants to push. I think it > > > would have been a better idea to push back a bit and say "at least > > > clean it up a bit first". It's not like Intel is one of the clueless > > > companies that couldn't have done so and need help from the community. > > > > For this filesystem, it seems that they don't have any resources to do > > this work and are relying on the community to help out. Which is odd, > > but big companies are strange some times... > > Didn't we learn this lesson already with POHMELFS? i.e. 
that dumping > filesystem code in staging on the assumption "the community" will > fix it up when nobody in "the community" uses or can even test that > filesystem is a broken development model Dave, and Linus, you were totally right here. Sorry for not listening to you before, my fault. The lustre developers never got their act together and probably by this being in staging, it only prolonged the agony of everyone involved. greg k-h
Re: [PATCH] printk/tracing: Do not trace printk_nmi_enter()
On Fri, 7 Sep 2018 15:45:32 +0200 Peter Zijlstra wrote: > Yes really, we should not muck with the IRQ state from NMI context. Right, and we didn't. Your patch didn't change anything, but allow for printk_nmi_enter/exit() to be traced by ftrace, but that's wrong to begin with because ftrace_nmi_enter() hasn't been called yet. -- Steve
Re: [PATCH 8/9] psi: pressure stall information for CPU, memory, and IO
On Fri, Sep 07, 2018 at 12:16:34PM +0200, Peter Zijlstra wrote: > On Tue, Aug 28, 2018 at 01:22:57PM -0400, Johannes Weiner wrote: > > +enum psi_states { > > + PSI_IO_SOME, > > + PSI_IO_FULL, > > + PSI_MEM_SOME, > > + PSI_MEM_FULL, > > + PSI_CPU_SOME, > > + /* Only per-CPU, to weigh the CPU in the global average: */ > > + PSI_NONIDLE, > > + NR_PSI_STATES, > > +}; > > > +static u32 get_recent_time(struct psi_group *group, int cpu, > > + enum psi_states state) > > +{ > > + struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); > > + unsigned int seq; > > + u32 time, delta; > > + > > + do { > > + seq = read_seqcount_begin(>seq); > > + > > + time = groupc->times[state]; > > + /* > > +* In addition to already concluded states, we also > > +* incorporate currently active states on the CPU, > > +* since states may last for many sampling periods. > > +* > > +* This way we keep our delta sampling buckets small > > +* (u32) and our reported pressure close to what's > > +* actually happening. > > +*/ > > + if (test_state(groupc->tasks, state)) > > + time += cpu_clock(cpu) - groupc->state_start; > > + } while (read_seqcount_retry(>seq, seq)); > > + > > + delta = time - groupc->times_prev[state]; > > + groupc->times_prev[state] = time; > > + > > + return delta; > > +} > > > +static bool update_stats(struct psi_group *group) > > +{ > > + u64 deltas[NR_PSI_STATES - 1] = { 0, }; > > + unsigned long missed_periods = 0; > > + unsigned long nonidle_total = 0; > > + u64 now, expires, period; > > + int cpu; > > + int s; > > + > > + mutex_lock(>stat_lock); > > + > > + /* > > +* Collect the per-cpu time buckets and average them into a > > +* single time sample that is normalized to wallclock time. > > +* > > +* For averaging, each CPU is weighted by its non-idle time in > > +* the sampling period. This eliminates artifacts from uneven > > +* loading, or even entirely idle CPUs. 
> > +*/ > > + for_each_possible_cpu(cpu) { > > + u32 nonidle; > > + > > + nonidle = get_recent_time(group, cpu, PSI_NONIDLE); > > + nonidle = nsecs_to_jiffies(nonidle); > > + nonidle_total += nonidle; > > + > > + for (s = 0; s < PSI_NONIDLE; s++) { > > + u32 delta; > > + > > + delta = get_recent_time(group, cpu, s); > > + deltas[s] += (u64)delta * nonidle; > > + } > > + } > > This does the whole seqcount thing 6x, which is a bit of a waste. [...] > It's a bit cumbersome, but that's because of C. I was actually debating exactly this with Suren before, but since this is a super cold path I went with readability. I was also thinking that restarts could happen quite regularly under heavy scheduler load, and so keeping the individual retry sections small could be helpful - but I didn't instrument this in any way. No strong opinion from me, I can send an updated patch if you prefer.
Re: [PATCH v11 0/3] remain and optimize memblock_next_valid_pfn on arm and arm64
On Thu, Sep 06, 2018 at 01:24:22PM +0200, Ard Biesheuvel wrote: > On 22 August 2018 at 05:07, Jia He wrote: > > Commit b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns > > where possible") optimized the loop in memmap_init_zone(). But it causes > > possible panic bug. So Daniel Vacek reverted it later. > > > > But as suggested by Daniel Vacek, it is fine to using memblock to skip > > gaps and finding next valid frame with CONFIG_HAVE_ARCH_PFN_VALID. > > > > More from what Daniel said: > > "On arm and arm64, memblock is used by default. But generic version of > > pfn_valid() is based on mem sections and memblock_next_valid_pfn() does > > not always return the next valid one but skips more resulting in some > > valid frames to be skipped (as if they were invalid). And that's why > > kernel was eventually crashing on some !arm machines." > > > > About the performance consideration: > > As said by James in b92df1de5, > > "I have tested this patch on a virtual model of a Samurai CPU with a > > sparse memory map. The kernel boot time drops from 109 to 62 seconds." > > Thus it would be better if we remain memblock_next_valid_pfn on arm/arm64. > > > > Besides we can remain memblock_next_valid_pfn, there is still some room > > for improvement. After this set, I can see the time overhead of memmap_init > > is reduced from 27956us to 13537us in my armv8a server(QDF2400 with 96G > > memory, pagesize 64k). I believe arm server will benefit more if memory is > > larger than TBs > > > > OK so we can summarize the benefits of this series as follows: > - boot time on a virtual model of a Samurai CPU drops from 109 to 62 seconds > - boot time on a QDF2400 arm64 server with 96 GB of RAM drops by ~15 > *milliseconds* > > Google was not very helpful in figuring out what a Samurai CPU is and > why we should care about the boot time of Linux running on a virtual > model of it, and the 15 ms speedup is not that compelling either. 
> > Apologies to Jia that it took 11 revisions to reach this conclusion, > but in /my/ opinion, tweaking the fragile memblock/pfn handling code > for this reason is totally unjustified, and we're better off > disregarding these patches. Oh, we're talking about a *simulator* for the significant boot time improvement here? I didn't realise that, so I agree that the premise of this patch set looks pretty questionable given how much "fun" we've had with the memmap on arm and arm64. Will
Re: [PATCH] printk/tracing: Do not trace printk_nmi_enter()
On (09/07/18 16:03), Peter Zijlstra wrote: > > > > I would even argue that placing printk_nmi_enter() between > > lockdep_off() and ftrace_nmi_enter() is wrong because if in the future > > printk_nmi_enter() were to do any ftrace tracing, it wont be caught, as > > it was by having it before lockdep_off(). > > > > printk_nmi_enter() should not muck with IRQ state, nor should it do any > > ftrace tracing. Since ftrace mucks with IRQ state when it gets enabled > > or disabled, it will screw up lockdep, and lockdep will complain. That > > way we can use lockdep not being off to catch this bug. > > The very bestest solution is to rm -rf printk ;-) Talented, capable and tremendously clever people had spent decades on making printk what it is today. I feel responsible for respecting that effort and, thus, my vote would be to keep printk around for a while. ... we also support !CONFIG_PRINTK builds ;) -ss
Re: [tip:x86/paravirt] x86/paravirt: Move the pv_irq_ops under the PARAVIRT_XXL umbrella
On 07/09/18 16:49, Borislav Petkov wrote: > Hi Jürgen, > > On Mon, Sep 03, 2018 at 08:01:40AM -0700, tip-bot for Juergen Gross wrote: >> Commit-ID: 6da63eb241a05b0e676d68975e793c0521387141 >> Gitweb: >> https://git.kernel.org/tip/6da63eb241a05b0e676d68975e793c0521387141 >> Author: Juergen Gross >> AuthorDate: Tue, 28 Aug 2018 09:40:24 +0200 >> Committer: Thomas Gleixner >> CommitDate: Mon, 3 Sep 2018 16:50:36 +0200 >> >> x86/paravirt: Move the pv_irq_ops under the PARAVIRT_XXL umbrella >> >> All of the paravirt ops defined in pv_irq_ops are for Xen PV guests >> or VSMP only. Define them only if CONFIG_PARAVIRT_XXL is set. >> >> Signed-off-by: Juergen Gross >> Signed-off-by: Thomas Gleixner >> Cc: xen-de...@lists.xenproject.org >> Cc: virtualizat...@lists.linux-foundation.org >> Cc: akata...@vmware.com >> Cc: ru...@rustcorp.com.au >> Cc: boris.ostrov...@oracle.com >> Cc: h...@zytor.com >> Link: https://lkml.kernel.org/r/20180828074026.820-14-jgr...@suse.com >> >> --- >> arch/x86/include/asm/irqflags.h | 8 +--- >> arch/x86/include/asm/paravirt.h | 6 +++--- >> arch/x86/include/asm/paravirt_types.h | 3 ++- >> arch/x86/kernel/asm-offsets.c | 2 +- >> arch/x86/kernel/asm-offsets_64.c | 2 +- >> arch/x86/kernel/paravirt.c| 2 +- >> arch/x86/kernel/paravirt_patch_32.c | 4 ++-- >> arch/x86/kernel/paravirt_patch_64.c | 4 +++- >> arch/x86/kernel/vsmp_64.c | 2 +- >> 9 files changed, 15 insertions(+), 18 deletions(-) > > this one is breaking the randconfig builds with the following error > (failure case simplified): > > $ make arch/x86/entry/entry_64.o > DESCEND objtool > CALLscripts/checksyscalls.sh > AS arch/x86/entry/entry_64.o > In file included from arch/x86/entry/entry_64.S:33:0: > ./arch/x86/include/asm/paravirt.h:938:0: warning: "SAVE_FLAGS" redefined > #define SAVE_FLAGS(clobbers)\ And the fixing patch is already there: https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/patch/?id=b7a5eb6aafa95fce45fc4dcbc195cb232fa1b76d Juergen
Re: [PATCH v6 1/5] seccomp: add a return code to trap to userspace
Hey Tyler, On Thu, Sep 06, 2018 at 10:15:12PM +, Tyler Hicks wrote: > > +Users can read via ``ioctl(SECCOMP_NOTIF_RECV)`` (or ``poll()``) on a > > seccomp > > +notification fd to receive a ``struct seccomp_notif``, which contains five > > +members: the input length of the structure, a globally unique ``id``, the > > This documentation says that id is "globally unique" but an in-code > comment below says "this is unique for this filter". IIUC, the id is > only guaranteed to be unique for the filter so this documentation should > be updated slightly to make it clear that the id is only global in those > terms. Yup, thanks. > > +``pid`` of the task which triggered this request (which may be 0 if the > > task is > > +in a pid ns not visible from the listener's pid namespace), a flag > > representing > > +whether or not the notification is a result of a non-fatal signal, and the > > +``data`` passed to seccomp. Userspace can then make a decision based on > > this > > +information about what to do, and ``ioctl(SECCOMP_NOTIF_SEND)`` a response, > > +indicating what should be returned to userspace. The ``id`` member of > > ``struct > > +seccomp_notif_resp`` should be the same ``id`` as in ``struct > > seccomp_notif``. > > + > > +It is worth noting that ``struct seccomp_data`` contains the values of > > register > > +arguments to the syscall, but does not contain pointers to memory. The > > task's > > +memory is accessible to suitably privileged traces via ``ptrace()`` or > > +``/proc/pid/map_files/``. However, care should be taken to avoid the TOCTOU > > +mentioned above in this document: all arguments being read from the > > tracee's > > +memory should be read into the tracer's memory before any policy decisions > > are > > +made. This allows for an atomic decision on syscall arguments. 
> > + > > Sysctls > > === > > > > diff --git a/arch/Kconfig b/arch/Kconfig > > index 6801123932a5..42f3585d925d 100644 > > --- a/arch/Kconfig > > +++ b/arch/Kconfig > > @@ -419,6 +419,15 @@ config SECCOMP_FILTER > > > > See Documentation/userspace-api/seccomp_filter.rst for details. > > > > +config SECCOMP_USER_NOTIFICATION > > Did someone request a Kconfig option for this new feature? If not, I > think that nuking the Kconfig option would reduce the test matrix. No > other filter flags have their own build time option but maybe it makes > sense in this case if this filter flag exposes the kernel to significant > new attack surface since there's more to this than just a new filter > flag. > > If someone has a requirement to disable this feature, maybe it'd be > better to leave the decision up to the distro *and* the admin via a > sysctl instead of taking the admin out of the decision with a build > time option. No, there was no explicit request by anyone, I just did it so I wouldn't offend anyone with this code. I'll drop it for the next version. > > /** > > * struct seccomp_filter - container for seccomp BPF programs > > * > > @@ -66,6 +114,30 @@ struct seccomp_filter { > > bool log; > > struct seccomp_filter *prev; > > struct bpf_prog *prog; > > + > > +#ifdef CONFIG_SECCOMP_USER_NOTIFICATION > > + /* > > +* A semaphore that users of this notification can wait on for > > +* changes. Actual reads and writes are still controlled with > > +* filter->notify_lock. > > +*/ > > + struct semaphore request; > > + > > + /* A lock for all notification-related accesses. */ > > + struct mutex notify_lock; > > + > > + /* Is there currently an attached listener? */ > > + bool has_listener; > > + > > + /* The id of the next request. */ > > + u64 next_id; > > + > > + /* A list of struct seccomp_knotif elements. */ > > + struct list_head notifications; > > + > > + /* A wait queue for poll. 
*/ > > + wait_queue_head_t wqh; > > +#endif > > I suspect that these additions would benefit from better struct packing > since there could be a lot of seccomp_filter structs floating around in > memory on a system with a large number of running containers or > otherwise sandboxed processes. > > IIRC, there's a 3 byte hole following the log member that could be used > by has_listener, at least, and I'm not sure how the rest of the new > members affect things. Ok, I'll take a look. > > +static void seccomp_do_user_notification(int this_syscall, > > +struct seccomp_filter *match, > > +const struct seccomp_data *sd) > > +{ > > + int err; > > + long ret = 0; > > + struct seccomp_knotif n = {}; > > + > > + mutex_lock(>notify_lock); > > + err = -ENOSYS; > > + if (!match->has_listener) > > + goto out; > > + > > + n.pid = task_pid(current); > > + n.state = SECCOMP_NOTIFY_INIT; > > + n.data = sd; > > + n.id = seccomp_next_notify_id(match); > > + init_completion(); > > + > > + list_add(, >notifications); > > + wake_up_poll(>wqh, EPOLLIN | EPOLLRDNORM); > > + > > +
Re: BUG: bad usercopy in __check_object_size (2)
On 2018/09/08 0:29, syzbot wrote: > syzbot has found a reproducer for the following crash on: > > HEAD commit: 28619527b8a7 Merge git://git.kernel.org/pub/scm/linux/kern.. > git tree: bpf > console output: https://syzkaller.appspot.com/x/log.txt?x=124e64d140 > kernel config: https://syzkaller.appspot.com/x/.config?x=62e9b447c16085cf > dashboard link: https://syzkaller.appspot.com/bug?extid=a3c9d2673837ccc0f22b > compiler: gcc (GCC) 8.0.1 20180413 (experimental) > syz repro: https://syzkaller.appspot.com/x/repro.syz?x=179f9cd140 > C reproducer: https://syzkaller.appspot.com/x/repro.c?x=11b3e8be40 > > IMPORTANT: if you fix the bug, please add the following tag to the commit: > Reported-by: syzbot+a3c9d2673837ccc0f...@syzkaller.appspotmail.com > > entry_SYSCALL_64_after_hwframe+0x49/0xbe > RIP: 0033:0x440479 > usercopy: Kernel memory overwrite attempt detected to spans multiple pages > (offset 0, size 64)! Kees, is this because check_page_span() is failing to allow on-stack variable u8 opcodes[OPCODE_BUFSIZE]; which by chance crossed PAGE_SIZE boundary?
Re: [PATCH V3] spi: spi-geni-qcom: Add SPI driver support for GENI based QUP
Hi, On Fri, Sep 7, 2018 at 3:00 AM, wrote: >> In v2, I said: >> >>> I'm not sure where to comment about this, so adding it to the end: >>> >>> Between v1 and v2 you totally removed all the locking. Presumably >>> this is because you didn't want to hold the lock in >>> handle_fifo_timeout() while waiting for the completion. IMO taking >>> the lock out was the wrong thing to do. You should keep it, but just >>> drop the lock before wait_for_completion_timeout() and add it back >>> afterwards. Specifically you _don't_ want the IRQ and timeout code >>> stomping on each other. >> >> >> ...but still no spinlock? > > I see there is no need of taking the spinlock as timeout will be handled > after the calculated time as per data size and speed. > There is 99.9% less chances of interrupt during the timeout handler. >> >> >> >> https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/1201081 The thing is, we want it to be 100% reliable, not 99.9% reliable. Is it somehow wrong to add the spinlock? ...or are you noticing performance problems with the spinlock there? It's just nice not to have to think about it. -Doug
Re: [PATCH RFC LKMM 1/7] tools/memory-model: Add extra ordering for locks and remove it for ordinary release/acquire
On 9/7/2018 9:09 AM, Will Deacon wrote: > On Fri, Sep 07, 2018 at 12:00:19PM -0400, Alan Stern wrote: >> On Thu, 6 Sep 2018, Andrea Parri wrote: >> Have you noticed any part of the generic code that relies on ordinary acquire-release (rather than atomic RMW acquire-release) in order to implement locking constructs? >>> >>> There are several places in code where the "lock-acquire" seems to be >>> provided by an atomic_cond_read_acquire/smp_cond_load_acquire: I have >>> mentioned one in qspinlock in this thread; qrwlock and mcs_spinlock >>> provide other examples (grep for the primitives...). >>> >>> As long as we don't consider these primitive as RMW (which would seem >>> odd...) or as acquire for which "most people expect strong ordering" >>> (see above), these provides other examples for the _gap_ I mentioned. >> >> Okay, now I understand your objection. It does appear that on RISC-V, >> if nowhere else, the current implementations of qspinlock, qrwlock, >> etc. will not provide "RCtso" ordering. >> >> The discussions surrounding this topic have been so lengthy and >> confusing that I have lost track of any comments Palmer or Daniel may >> have made concerning this potential problem. >> >> One possible resolution would be to define smp_cond_load_acquire() >> specially on RISC-V so that it provided the same ordering guarantees as >> RMW-acquire. (Plus adding a comment in the asm-generic/barrier.h >> pointing out the necessity for the stronger guarantee on all >> architectures.) >> >> Another would be to replace the usages of atomic/smp_cond_load_acquire >> in the locking constructs with a new function that would otherwise be >> the same but would provide the ordering guarantee we want. >> >> Do you think either of these would be an adequate fix? > > I didn't think RISC-V used qspinlock or qrwlock, so I'm not sure there's > actually anything to fix, is there? 
> > Will I've also lost track of whether the current preference is or is not for RCtso, or in which subset of cases RCtso is currently preferred. For whichever cases do in fact need to be RCtso, the RISC-V approach would still be the same as what I've written in the past, as far as I can tell [1]. In a nutshell, if a data structure uses only atomics with .aq/.rl, RISC-V provides RCtso already anyway. If a data structure uses fences, or mixes fences and atomics, we can replace a "fence r,rw" or a "fence rw,w" with a "fence.tso" (== fence r,rw + fence rw,w) as necessary, at the cost of some amount of performance. I suppose the answer to the question of whether smp_cond_load_acquire() needs to change depends on where exactly RCtso is needed, and which data structures actually use that vs. some other macro. Does that answer your question Alan? Does it make sense? [1] https://lore.kernel.org/lkml/11b27d32-4a8a-3f84-0f25-723095ef1...@nvidia.com/ Dan
Re: [PATCH v4 0/3] mtd: rawnand: ams-delta: Cleanups and optimizations
Hi, * Janusz Krzysztofik [180905 20:56]: > On Wednesday, September 5, 2018 8:47:57 AM CEST Miquel Raynal wrote: > > Patch 2/3 does not apply on nand/next. Indeed the driver does not look > > the same as in the diff. > > That's because I built it on top of my former series from the mid of July, > containing "[PATCH v2 2/3 v4] mtd: rawnand: ams-delta: use GPIO lookup > table". > It was acked by you, Miquel, and supposed to be merged via linux-omap tree. Hmm I thought the plan was for dependencies to clear and then merge the rest via various driver trees.. Or at least I don't have the patch above tagged anywhere for me to merge. Then again, I try to forget everything posted before -rc1 just to stay sane :) > > I don't see any changes on my side that could > > explain this so perhaps you could rebase on top of 4.19-rc2 (or > > nand/next, as you wish) and resend the series? > > As far as I can see, Tony hasn't applied that series yet, so maybe I can > still > move that patch out of there and insert it into this series in front of the > other 3 patches and resend. That would however make patch 3/3 of that old > series depend on this one. > > Tony, what do you think? Yes please resend based on v4.19-rc1 or MTD next. If there are still pending dependencies, please let us know and we can set up an immutable branch against v4.19-rc1 with those for MTD and me to merge in as needed. Regards, Tony
[PATCH 1/4] 9p: acl: fix uninitialized iattr access
From: Dominique Martinet iattr is passed to v9fs_vfs_setattr_dotl which does send various values from iattr over the wire, even if it tells the server to only look at iattr.ia_valid fields this could leak some stack data. Addresses-Coverity-ID: 1195601 ("Uninitalized scalar variable") Signed-off-by: Dominique Martinet --- fs/9p/acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 082d227fa56b..6261719f6f2a 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -276,7 +276,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, switch (handler->flags) { case ACL_TYPE_ACCESS: if (acl) { - struct iattr iattr; + struct iattr iattr = { 0 }; struct posix_acl *old_acl = acl; retval = posix_acl_update_mode(inode, _mode, ); -- 2.17.1
[PATCH 3/4] 9p: p9dirent_read: check network-provided name length
From: Dominique Martinet strcpy to dirent->d_name could overflow the buffer, use strscpy to check the provided string length and error out if the size was too big. While we are here, make the function return an error when the pdu parsing failed, instead of returning the pdu offset as if it had been a success... Addresses-Coverity-ID: 139133 ("Copy into fixed size buffer") Signed-off-by: Dominique Martinet --- net/9p/protocol.c | 12 +--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/9p/protocol.c b/net/9p/protocol.c index b4d80c533f89..462ba144cb39 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -623,13 +623,19 @@ int p9dirent_read(struct p9_client *clnt, char *buf, int len, if (ret) { p9_debug(P9_DEBUG_9P, "<<< p9dirent_read failed: %d\n", ret); trace_9p_protocol_dump(clnt, _pdu); - goto out; + return ret; } - strcpy(dirent->d_name, nameptr); + ret = strscpy(dirent->d_name, nameptr, sizeof(dirent->d_name)); + if (ret < 0) { + p9_debug(P9_DEBUG_ERROR, +"On the wire dirent name too long: %s\n", +nameptr); + kfree(nameptr); + return ret; + } kfree(nameptr); -out: return fake_pdu.offset; } EXPORT_SYMBOL(p9dirent_read); -- 2.17.1
[PATCH 2/4] 9p/rdma: remove useless check in cm_event_handler
From: Dominique Martinet the client c is always dereferenced to get the rdma struct, so c has to be a valid pointer at this point. Gcc would optimize that away but let's make coverity happy... Addresses-Coverity-ID: 102778 ("Dereference before null check") Signed-off-by: Dominique Martinet --- net/9p/trans_rdma.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 9719bc4d9424..119103bfa82e 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -274,8 +274,7 @@ p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DISCONNECTED: if (rdma) rdma->state = P9_RDMA_CLOSED; - if (c) - c->status = Disconnected; + c->status = Disconnected; break; case RDMA_CM_EVENT_TIMEWAIT_EXIT: -- 2.17.1
Re: [PATCH v3 0/4] pci-dra7xx: Enable errata i870 workaround for RC mode
* Vignesh R [180810 10:10]: > > > On Wednesday 08 August 2018 10:27 PM, Lorenzo Pieralisi wrote: > > On Tue, Jul 24, 2018 at 11:01:46PM +0530, Vignesh R wrote: > >> Make workaround for errata i870 applicable in Host mode as > >> well(previously it was enabled only for EP mode) as per errata > >> documentation: http://www.ti.com/lit/er/sprz450/sprz450.pdf > >> > >> Tested on DRA72 EVM > >> > >> Tony, > >> > >> If you are okay with the series, could you pick this via omap tree? > >> All ACKs are in place and Lorenzo is okay with PCIe bits to go along with > >> rest of DTS changes. > > > > I think we have missed the v4.19 merge window by now - > > Right. I didn't get any response from Tony. Sorry catching up with pending mails.. I try hard to not touch anything except fixes around -rc6 time. > > please let me know if I can drop this series from the PCI patch queue. > > > > Ok, I will resend the patch after 4.19-rc. Thanks! FYI, I'm untagging this thread too. Please post the dts changes separately once the dependencies (if any) have cleared. Regards, Tony
Re: [PATCH 4.4 31/43] mm: fix cache mode tracking in vm_insert_mixed()
On Tue, 2018-08-14 at 19:18 +0200, Greg Kroah-Hartman wrote: > 4.4-stable review patch. If anyone has any objections, please let me know. > > -- > > From: Dan Williams > > commit 87744ab3832b83ba71b931f86f9cfdb000d07da5 upstream > > vm_insert_mixed() unlike vm_insert_pfn_prot() and vmf_insert_pfn_pmd(), > fails to check the pgprot_t it uses for the mapping against the one > recorded in the memtype tracking tree. Add the missing call to > track_pfn_insert() to preclude cases where incompatible aliased mappings > are established for a given physical address range. [...] This apparently breaks a number of DRM drivers. The upstream fixes are: 8ef4227615e1 x86/io: add interface to reserve io memtype for a resource range. (v1.1) 7cf321d118a8 drm/drivers: add support for using the arch wc mapping API. They appear to apply cleanly to 4.4-stable. They are included in 4.9 so no other stable branch needs them. Ben. -- Ben Hutchings, Software Developer Codethink Ltd https://www.codethink.co.uk/ Dale House, 35 Dale Street Manchester, M1 2HF, United Kingdom
Re: [PATCH 6/7] HID: logitech-hidpp: support the G700 over wireless
Hi Benjamin, On Fri, 7 Sep 2018 at 03:35, Benjamin Tissoires wrote: > > The G700 is using a non unifying receiver, so it's easy to add its support > in hid-logitech-hidpp now. > [snip] > @@ -3671,6 +3671,9 @@ static const struct hid_device_id hidpp_devices[] = { > { /* Solar Keyboard Logitech K750 */ > LDJ_DEVICE(0x4002), > .driver_data = HIDPP_QUIRK_CLASS_K750 }, > + { /* G700 over Wireless */ > + HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, > USB_DEVICE_ID_LOGITECH_G700_RECEIVER), > + .driver_data = HIDPP_QUIRK_RECEIVER | HIDPP_QUIRK_UNIFYING }, As someone who's new to the codebase, it seems rather confusing to me that HIDPP_QUIRK_UNIFYING would be present here for a device that doesn't use a Unifying receiver. Am I misunderstanding, or should we consider renaming the quirk or adding some clarifying comment? (Similarly for the G900 in the next patch.) > > { LDJ_DEVICE(HID_ANY_ID) }, > > -- > 2.14.3 > Thanks, Harry Cutts Chrome OS Touch/Input team
Re: [PATCH RFC LKMM 1/7] tools/memory-model: Add extra ordering for locks and remove it for ordinary release/acquire
On Fri, 7 Sep 2018, Daniel Lustig wrote: > On 9/7/2018 9:09 AM, Will Deacon wrote: > > On Fri, Sep 07, 2018 at 12:00:19PM -0400, Alan Stern wrote: > >> On Thu, 6 Sep 2018, Andrea Parri wrote: > >> > Have you noticed any part of the generic code that relies on ordinary > acquire-release (rather than atomic RMW acquire-release) in order to > implement locking constructs? > >>> > >>> There are several places in code where the "lock-acquire" seems to be > >>> provided by an atomic_cond_read_acquire/smp_cond_load_acquire: I have > >>> mentioned one in qspinlock in this thread; qrwlock and mcs_spinlock > >>> provide other examples (grep for the primitives...). > >>> > >>> As long as we don't consider these primitive as RMW (which would seem > >>> odd...) or as acquire for which "most people expect strong ordering" > >>> (see above), these provides other examples for the _gap_ I mentioned. > >> > >> Okay, now I understand your objection. It does appear that on RISC-V, > >> if nowhere else, the current implementations of qspinlock, qrwlock, > >> etc. will not provide "RCtso" ordering. > >> > >> The discussions surrounding this topic have been so lengthy and > >> confusing that I have lost track of any comments Palmer or Daniel may > >> have made concerning this potential problem. > >> > >> One possible resolution would be to define smp_cond_load_acquire() > >> specially on RISC-V so that it provided the same ordering guarantees as > >> RMW-acquire. (Plus adding a comment in the asm-generic/barrier.h > >> pointing out the necessity for the stronger guarantee on all > >> architectures.) > >> > >> Another would be to replace the usages of atomic/smp_cond_load_acquire > >> in the locking constructs with a new function that would otherwise be > >> the same but would provide the ordering guarantee we want. > >> > >> Do you think either of these would be an adequate fix? 
> > > > I didn't think RISC-V used qspinlock or qrwlock, so I'm not sure there's > > actually anything to fix, is there? > > > > Will > > I've also lost track of whether the current preference is or is not for > RCtso, or in which subset of cases RCtso is currently preferred. For > whichever cases do in fact need to be RCtso, the RISC-V approach would > still be the same as what I've written in the past, as far as I can > tell [1]. The patch which Paul plans to send in for the next merge window makes the LKMM require RCtso ordering for spinlocks, and by extension, for all locking operations. As I understand it, the current RISC-V implementation of spinlocks does provide this ordering. We have discussed creating another patch for the LKMM which would require RMW-acquire/ordinary-release also to have RCtso ordering. Nobody has written the patch yet, but it would be straightforward. The rationale is that many types of locks are implemented in terms of RMW-acquire, so if the locks are required to be RCtso then so should the lower-level operations they are built from. Will feels strongly (and Linus agrees) that the LKMM should not require ordinary acquire and release to be any stronger than RCpc. The issue that Andrea raised has to do with qspinlock, qrwlock, and mcs_spinlock, which are implemented using smp_cond_load_acquire() instead of RMW-acquire. This provides only the ordering properties of smp_load_acquire(), namely RCpc, which means that qspinlocks etc. might not be RCtso. Since we do want locks to be RCtso, the question is how to resolve this discrepancy. > In a nutshell, if a data structure uses only atomics with .aq/.rl, > RISC-V provides RCtso already anyway. If a data structure uses fences, > or mixes fences and atomics, we can replace a "fence r,rw" or a > "fence rw,w" with a "fence.tso" (== fence r,rw + fence rw,w) as > necessary, at the cost of some amount of performance. 
> > I suppose the answer to the question of whether smp_cond_load_acquire() > needs to change depends on where exactly RCtso is needed, and which > data structures actually use that vs. some other macro. > > Does that answer your question Alan? Does it make sense? On all other architectures, as far as I know, smp_cond_load_acquire() is in fact RCtso. Any changes would only be needed on RISC-V. A quick grep of the kernel source (not quite up-to-date, unfortunately) turns up only the following additional usages of smp_cond_load_acquire(): It is used in kernel/smp.c for csd_lock(); I don't know what that is meant for. It is also used in the scheduler core (kernel/sched/core.c). I don't know what ordering requirements the scheduler has for it, but Peter does. There's a usage in drivers/iommu/arm-smmu-v3.c, but no comment to explain why it is needed. To tell the truth, I'm not aware of any code in the kernel that actually _needs_ RCtso ordering for locks, but Peter and Will are quite firm that it should be required. Linus would actually like locks to be
Re: [PATCH V2 6/8] input: stpmic1: add stpmic1 onkey driver
Hi Pascal, On Fri, Sep 07, 2018 at 12:59:45PM +, Pascal PAILLET-LME wrote: > From: pascal paillet > > The stpmic1 pmic is able to manage an onkey button. This driver exposes > the stpmic1 onkey as an input device. It can also be configured to > shut-down the power supplies on a long key-press with an adjustable > duration. > > Signed-off-by: pascal paillet > --- > changes in v2: > * the hardware component has been renamed from stpmu1 to stpmic1 ! > * change headers > * handle remarks from Dmitry > * the irq is threaded because is is nested in a thread; I have added a > comment. > Dmitry, I'm sorry, but I did not catch your comment regarding usage of > "generic device property API.". could you tell more ? You basically do s/of_property_/device_property_/ and that's it. > > drivers/input/misc/Kconfig | 11 ++ > drivers/input/misc/Makefile| 2 + > drivers/input/misc/stpmic1_onkey.c | 257 > + > 3 files changed, 270 insertions(+) > create mode 100644 drivers/input/misc/stpmic1_onkey.c > > diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig > index c25606e..cc82dad 100644 > --- a/drivers/input/misc/Kconfig > +++ b/drivers/input/misc/Kconfig > @@ -841,4 +841,15 @@ config INPUT_RAVE_SP_PWRBUTTON > To compile this driver as a module, choose M here: the > module will be called rave-sp-pwrbutton. > > +config INPUT_STPMIC1_ONKEY > + tristate "STPMIC1 PMIC Onkey support" > + depends on MFD_STPMIC1 > + help > + Say Y to enable support of onkey embedded into STPMIC1 PMIC. onkey > + can be used to wakeup from low power modes and force a shut-down on > + long press. > + > + To compile this driver as a module, choose M here: the > + module will be called stpmic1_onkey. 
> + > endif > diff --git a/drivers/input/misc/Makefile b/drivers/input/misc/Makefile > index 72cde28..f0e11b0 100644 > --- a/drivers/input/misc/Makefile > +++ b/drivers/input/misc/Makefile > @@ -70,6 +70,7 @@ obj-$(CONFIG_INPUT_SGI_BTNS)+= sgi_btns.o > obj-$(CONFIG_INPUT_SIRFSOC_ONKEY)+= sirfsoc-onkey.o > obj-$(CONFIG_INPUT_SOC_BUTTON_ARRAY) += soc_button_array.o > obj-$(CONFIG_INPUT_SPARCSPKR)+= sparcspkr.o > +obj-$(CONFIG_INPUT_STPMIC1_ONKEY)+= stpmic1_onkey.o > obj-$(CONFIG_INPUT_TPS65218_PWRBUTTON) += tps65218-pwrbutton.o > obj-$(CONFIG_INPUT_TWL4030_PWRBUTTON)+= twl4030-pwrbutton.o > obj-$(CONFIG_INPUT_TWL4030_VIBRA)+= twl4030-vibra.o > @@ -80,3 +81,4 @@ obj-$(CONFIG_INPUT_WM831X_ON) += wm831x-on.o > obj-$(CONFIG_INPUT_XEN_KBDDEV_FRONTEND) += xen-kbdfront.o > obj-$(CONFIG_INPUT_YEALINK) += yealink.o > obj-$(CONFIG_INPUT_IDEAPAD_SLIDEBAR) += ideapad_slidebar.o > + > diff --git a/drivers/input/misc/stpmic1_onkey.c > b/drivers/input/misc/stpmic1_onkey.c > new file mode 100644 > index 000..170d879 > --- /dev/null > +++ b/drivers/input/misc/stpmic1_onkey.c > @@ -0,0 +1,257 @@ > +// SPDX-License-Identifier: GPL-2.0 > +// Copyright (C) STMicroelectronics 2018 > +// Author: Pascal Paillet for STMicroelectronics. 
> + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +/** > + * struct stpmic1_onkey - OnKey data > + * @pmic:pointer to STPMIC1 PMIC device > + * @input_dev: pointer to input device > + * @irq_falling: irq that we are hooked on to > + * @irq_rising: irq that we are hooked on to > + */ > +struct stpmic1_onkey { > + struct stpmic1_dev *pmic; > + struct input_dev *input_dev; > + int irq_falling; > + int irq_rising; > +}; > + > +/** > + * struct pmic_onkey_config - configuration of pmic PONKEYn > + * @turnoff_enabled: value to enable turnoff condition > + * @cc_flag_clear: value to clear CC flag in case of PowerOff > + * trigger by longkey press > + * @onkey_pullup_val:value of PONKEY PullUp (active or > inactive) > + * @long_press_time_val: value for long press h/w shutdown event > + */ > +struct pmic_onkey_config { > + bool turnoff_enabled; > + bool cc_flag_clear; > + u8 onkey_pullup_val; > + u8 long_press_time_val; > +}; > + > +static irqreturn_t onkey_falling_irq(int irq, void *ponkey) > +{ > + struct stpmic1_onkey *onkey = ponkey; > + struct input_dev *input_dev = onkey->input_dev; > + > + input_report_key(input_dev, KEY_POWER, 1); > + pm_wakeup_event(input_dev->dev.parent, 0); > + input_sync(input_dev); > + > + dev_dbg(_dev->dev, "Pwr Onkey Falling Interrupt received\n"); > + > + return IRQ_HANDLED; > +} > + > +static irqreturn_t onkey_rising_irq(int irq, void *ponkey) > +{ > + struct stpmic1_onkey *onkey = ponkey; > + struct input_dev *input_dev = onkey->input_dev; > + > + input_report_key(input_dev, KEY_POWER, 0); > + pm_wakeup_event(input_dev->dev.parent, 0); > + input_sync(input_dev); > +
Re: [PATCH v2 0/3] mtd concat device driver
Apologies, again, I seem not to be able to handle git-send-mail correctly, the cover letter got lost in operation (using get_maintainers on a cover letter is not a good idea). Here it is again: Hi everybody, when porting my router board from a mach-file based OpenWRT target to a device-tree based target, I found that there is no generic way to create a mtd_concat device from within the dts. The following patches attempt to provide that possibility. This is a second roll of that patch series, the first one can be seen at [1]. Apologies for not including the correct recipients in the first roll. In this first discussion, concerns were raised that a driver for a "virtual" device like this might have no place in the device tree system. However, I would argue that this is very similar to specifying the partitions of a mtd device, which can also be done in the device tree. In fact, I believe this is the only way to be able to specify the partitions of such a concat device in the dts file (but I'm happy to be corrected if I'm mistaken). I have made the example in the dt-binding documentation a little bit more expressive in this detail. In this second roll I have also addressed all issues that reviewers have brought up so far, hopefully to their satisfaction. 
Best Regards Bernhard [1] http://lists.infradead.org/pipermail/linux-mtd/2018-September/083832.html Bernhard Frauendienst (3): mtd: core: add get_mtd_device_by_node dt-bindings: add bindings for mtd-concat devices mtd: mtdconcat: add dt driver for concat devices .../devicetree/bindings/mtd/mtd-concat.txt | 36 + drivers/mtd/Kconfig | 2 + drivers/mtd/Makefile | 3 + drivers/mtd/composite/Kconfig | 12 ++ drivers/mtd/composite/Makefile | 7 + drivers/mtd/composite/virt_concat.c | 128 ++ drivers/mtd/mtdcore.c | 38 ++ include/linux/mtd/mtd.h | 2 + 8 files changed, 228 insertions(+) create mode 100644 Documentation/devicetree/bindings/mtd/mtd-concat.txt create mode 100644 drivers/mtd/composite/Kconfig create mode 100644 drivers/mtd/composite/Makefile create mode 100644 drivers/mtd/composite/virt_concat.c -- 2.17.1
Re: Conflict between sparse and commit cafa0010cd51f ("Raise the minimum required gcc version to 4.6")
On Fri, Sep 07, 2018 at 10:22:56AM -0700, Nick Desaulniers wrote: > On Fri, Sep 7, 2018 at 7:34 AM Christophe LEROY > wrote: > > > > Cc linux-spa...@vger.kernel.org > > > > Le 07/09/2018 à 14:22, Christophe Leroy a écrit : > > > Since commit cafa0010cd51f ("Raise the minimum required gcc version to > > > 4.6"), sparse check fails as follows: > > > > > > [root@pc16082vm linux-powerpc]# make C=2 arch/powerpc/kernel/process.o > > >CALLscripts/checksyscalls.sh > > >CHECK scripts/mod/empty.c > > > ./include/linux/compiler-gcc.h:14:3: error: Sorry, your compiler is too > > > old - please upgrade it. > > >CHECK arch/powerpc/kernel/process.c > > > ./include/linux/compiler-gcc.h:14:3: error: Sorry, your compiler is too > > > old - please upgrade it. > > > > > > > > > I have sparse version 0.5.2 > > > > > > What can be done to fix that ? > > > > > > Christophe > > Oof, sorry Christophe. Looks like that's the latest version of sparse: > https://sparse.wiki.kernel.org/index.php/Main_Page#News > > I'm curious what sparse expands __GNUC__, __GNUC_MINOR__, and > __GNUC_PATCHLEVEL__ to? Pre commit cafa0010cd51f, it MUST be > expanding them to something, otherwise you'd have seen the error then, > too. The previous check was GCC < 3.3, now it's GCC < 4.6. Sparse expand these macros to the same version than the compiler used to compile GCC. I find a bit strange though to have sparse v0.5.2 but using an old compiler. Also, it's worth to look at what is said in this email: https://lore.kernel.org/lkml/ca+55afzyenzr2gzlr-dwponjmnygyody+6awlcvnaywiazu...@mail.gmail.com/ -- Luc
[PATCH] ring-buffer: Allow for rescheduling when removing pages
When reducing ring buffer size, pages are removed by scheduling a work item on each CPU for the corresponding CPU ring buffer. After the pages are removed from ring buffer linked list, the pages are free()d in a tight loop. The loop does not give up CPU until all pages are removed. In a worst case behavior, when a lot of pages are to be freed, it can cause system stall. After the pages are removed from the list, the free() can happen while the work is rescheduled. Add a check for need_resched() within the loop to prevent the system hangup. Reported-by: Jason Behmer Signed-off-by: Vaibhav Nagarnaik --- kernel/trace/ring_buffer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1d92d4a982fd..bc1789df7c53 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1546,6 +1546,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) tmp_iter_page = first_page; do { + if (need_resched()) + schedule(); + to_remove_page = tmp_iter_page; rb_inc_page(cpu_buffer, _iter_page); -- 2.19.0.rc2.392.g5ba43deb5a-goog
Re: [PATCH] ring-buffer: Allow for rescheduling when removing pages
On Fri, 7 Sep 2018 11:21:31 -0700 Vaibhav Nagarnaik wrote: > When reducing ring buffer size, pages are removed by scheduling a work > item on each CPU for the corresponding CPU ring buffer. After the pages > are removed from ring buffer linked list, the pages are free()d in a > tight loop. The loop does not give up CPU until all pages are removed. > In a worst case behavior, when lot of pages are to be freed, it can > cause system stall. > > After the pages are removed from the list, the free() can happen while > the work is rescheduled. Add a check for need_sched() within the loop > to prevent the system hangup. > > Reported-by: Jason Behmer > Signed-off-by: Vaibhav Nagarnaik > --- > kernel/trace/ring_buffer.c | 3 +++ > 1 file changed, 3 insertions(+) > > diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c > index 1d92d4a982fd..bc1789df7c53 100644 > --- a/kernel/trace/ring_buffer.c > +++ b/kernel/trace/ring_buffer.c > @@ -1546,6 +1546,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, > unsigned long nr_pages) > tmp_iter_page = first_page; > > do { > + if (need_resched()) > + schedule(); > + Hi, thanks for the patch, but the proper way to do this is to stick in: cond_resched(); And that should solve it for you. Want to send in another patch? -- Steve > to_remove_page = tmp_iter_page; > rb_inc_page(cpu_buffer, _iter_page); >
[PATCH] sched/fair: fix 1 task per CPU
When CPUs have different capacity because of RT/DL tasks or micro-architecture or max frequency differences, there are situation where the imbalance is not correctly set to migrate waiting task on the idle CPU. The UC uses the force_balance case : if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && busiest->group_no_capacity) goto force_balance; But calculate_imbalance fails to set the right amount of load to migrate a task because of the special condition: busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load) Add in fix_small_imbalance, this special case that triggered the force balance in order to make sure that the amount of load to migrate will be enough. Signed-off-by: Vincent Guittot --- kernel/sched/fair.c | 14 ++ 1 file changed, 14 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 309c93f..57b4d83 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8048,6 +8048,20 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) local = >local_stat; busiest = >busiest_stat; + /* +* There is available capacity in local group and busiest group is +* overloaded but calculate_imbalance can't compute the amount of load +* to migrate because they became meaningless because asymetric +* capacity between group. In such case, we only want to migrate at +* least one tasks of the busiest group and rely of the average load +* per task to ensure the migration. +*/ + if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && + busiest->group_no_capacity) { + env->imbalance = busiest->load_per_task; + return; + } + if (!local->sum_nr_running) local->load_per_task = cpu_avg_load_per_task(env->dst_cpu); else if (busiest->load_per_task > local->load_per_task) -- 2.7.4
Applied "spi: pic32: remove unnecessary of_node_get()" to the spi tree
The patch spi: pic32: remove unnecessary of_node_get() has been applied to the spi tree at https://git.kernel.org/pub/scm/linux/kernel/git/broonie/spi.git All being well this means that it will be integrated into the linux-next tree (usually sometime in the next 24 hours) and sent to Linus during the next merge window (or sooner if it is a bug fix), however if problems are discovered then the patch may be dropped or reverted. You may get further e-mails resulting from automated or manual testing and review of the tree, please engage with people reporting problems and send followup patches addressing any issues that are reported if needed. If any updates are required or you are submitting further changes they should be sent as incremental updates against current git, existing patches will not be replaced. Please add any relevant lists and maintainers to the CCs when replying to this mail. Thanks, Mark >From b9a947dd756b7af84ababa57e0524788f91a5382 Mon Sep 17 00:00:00 2001 From: Alexey Khoroshilov Date: Fri, 7 Sep 2018 01:16:54 +0300 Subject: [PATCH] spi: pic32: remove unnecessary of_node_get() Almost all spi drivers assign spi master->dev.of_node from its parent platform device without additional refcounting. It seems of_node_get() in pic32_spi_probe() is unnecessary and there is no corresponding of_node_put(). Found by Linux Driver Verification project (linuxtesting.org). 
Signed-off-by: Alexey Khoroshilov Signed-off-by: Mark Brown --- drivers/spi/spi-pic32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-pic32.c b/drivers/spi/spi-pic32.c index f8a45af1fa9f..46ff76193ee1 100644 --- a/drivers/spi/spi-pic32.c +++ b/drivers/spi/spi-pic32.c @@ -774,7 +774,7 @@ static int pic32_spi_probe(struct platform_device *pdev) if (ret) goto err_master; - master->dev.of_node = of_node_get(pdev->dev.of_node); + master->dev.of_node = pdev->dev.of_node; master->mode_bits = SPI_MODE_3 | SPI_MODE_0 | SPI_CS_HIGH; master->num_chipselect = 1; /* single chip-select */ master->max_speed_hz= clk_get_rate(pic32s->clk); -- 2.19.0.rc1
Applied "regmap: fix comment for regmap.use_single_write" to the regmap tree
The patch regmap: fix comment for regmap.use_single_write has been applied to the regmap tree at https://git.kernel.org/pub/scm/linux/kernel/git/broonie/regmap.git All being well this means that it will be integrated into the linux-next tree (usually sometime in the next 24 hours) and sent to Linus during the next merge window (or sooner if it is a bug fix), however if problems are discovered then the patch may be dropped or reverted. You may get further e-mails resulting from automated or manual testing and review of the tree, please engage with people reporting problems and send followup patches addressing any issues that are reported if needed. If any updates are required or you are submitting further changes they should be sent as incremental updates against current git, existing patches will not be replaced. Please add any relevant lists and maintainers to the CCs when replying to this mail. Thanks, Mark >From 9ad8eb0168ab76786f65d4b80ce082980f79a1d9 Mon Sep 17 00:00:00 2001 From: David Frey Date: Sat, 1 Sep 2018 09:50:40 -0700 Subject: [PATCH] regmap: fix comment for regmap.use_single_write Signed-off-by: David Frey Signed-off-by: Mark Brown --- drivers/base/regmap/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h index a6bf34d6394e..16414ccace96 100644 --- a/drivers/base/regmap/internal.h +++ b/drivers/base/regmap/internal.h @@ -149,7 +149,7 @@ struct regmap { /* if set, converts bulk read to single read */ bool use_single_read; - /* if set, converts bulk read to single read */ + /* if set, converts bulk write to single write */ bool use_single_write; /* if set, the device supports multi write mode */ bool can_multi_write; -- 2.19.0.rc1
Re: [PATCH RFC LKMM 1/7] tools/memory-model: Add extra ordering for locks and remove it for ordinary release/acquire
On Fri, Sep 07, 2018 at 12:00:19PM -0400, Alan Stern wrote: > On Thu, 6 Sep 2018, Andrea Parri wrote: > > > > Have you noticed any part of the generic code that relies on ordinary > > > acquire-release (rather than atomic RMW acquire-release) in order to > > > implement locking constructs? > > > > There are several places in code where the "lock-acquire" seems to be > > provided by an atomic_cond_read_acquire/smp_cond_load_acquire: I have > > mentioned one in qspinlock in this thread; qrwlock and mcs_spinlock > > provide other examples (grep for the primitives...). > > > > As long as we don't consider these primitive as RMW (which would seem > > odd...) or as acquire for which "most people expect strong ordering" > > (see above), these provides other examples for the _gap_ I mentioned. > > Okay, now I understand your objection. It does appear that on RISC-V, > if nowhere else, the current implementations of qspinlock, qrwlock, > etc. will not provide "RCtso" ordering. > > The discussions surrounding this topic have been so lengthy and > confusing that I have lost track of any comments Palmer or Daniel may > have made concerning this potential problem. > > One possible resolution would be to define smp_cond_load_acquire() > specially on RISC-V so that it provided the same ordering guarantees as > RMW-acquire. (Plus adding a comment in the asm-generic/barrier.h > pointing out the necessity for the stronger guarantee on all > architectures.) > > Another would be to replace the usages of atomic/smp_cond_load_acquire > in the locking constructs with a new function that would otherwise be > the same but would provide the ordering guarantee we want. > > Do you think either of these would be an adequate fix? I didn't think RISC-V used qspinlock or qrwlock, so I'm not sure there's actually anything to fix, is there? Will
[PATCH 2/6] md: convert to kvmalloc
The code really just wants a big flat buffer, so just do that. Signed-off-by: Kent Overstreet Cc: Shaohua Li Cc: linux-r...@vger.kernel.org --- drivers/md/raid5-ppl.c | 7 ++-- drivers/md/raid5.c | 82 +++--- drivers/md/raid5.h | 9 ++--- 3 files changed, 45 insertions(+), 53 deletions(-) diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 3a7c363265..5911810101 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include "md.h" @@ -165,7 +164,7 @@ ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu, struct dma_async_tx_descriptor *tx) { int disks = sh->disks; - struct page **srcs = flex_array_get(percpu->scribble, 0); + struct page **srcs = percpu->scribble; int count = 0, pd_idx = sh->pd_idx, i; struct async_submit_ctl submit; @@ -196,8 +195,8 @@ ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu, } init_async_submit(, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx, - NULL, sh, flex_array_get(percpu->scribble, 0) - + sizeof(struct page *) * (sh->disks + 2)); + NULL, sh, percpu->scribble + + sizeof(struct page *) * (sh->disks + 2)); if (count == 1) tx = async_memcpy(sh->ppl_page, srcs[0], 0, 0, PAGE_SIZE, diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2031506a0e..d5603946dc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -54,7 +54,6 @@ #include #include #include -#include #include #include @@ -1399,19 +1398,14 @@ static void ops_complete_compute(void *stripe_head_ref) static addr_conv_t *to_addr_conv(struct stripe_head *sh, struct raid5_percpu *percpu, int i) { - void *addr; - - addr = flex_array_get(percpu->scribble, i); - return addr + sizeof(struct page *) * (sh->disks + 2); + return percpu->scribble + i * percpu->scribble_obj_size + + sizeof(struct page *) * (sh->disks + 2); } /* return a pointer to the address conversion region of the scribble buffer */ static struct page **to_addr_page(struct 
raid5_percpu *percpu, int i) { - void *addr; - - addr = flex_array_get(percpu->scribble, i); - return addr; + return percpu->scribble + i * percpu->scribble_obj_size; } static struct dma_async_tx_descriptor * @@ -2240,21 +2234,23 @@ static int grow_stripes(struct r5conf *conf, int num) * calculate over all devices (not just the data blocks), using zeros in place * of the P and Q blocks. */ -static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags) +static int scribble_alloc(struct raid5_percpu *percpu, + int num, int cnt, gfp_t flags) { - struct flex_array *ret; - size_t len; + size_t obj_size = + sizeof(struct page *) * (num+2) + + sizeof(addr_conv_t) * (num+2); + void *scribble; - len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); - ret = flex_array_alloc(len, cnt, flags); - if (!ret) - return NULL; - /* always prealloc all elements, so no locking is required */ - if (flex_array_prealloc(ret, 0, cnt, flags)) { - flex_array_free(ret); - return NULL; - } - return ret; + scribble = kvmalloc_array(cnt, obj_size, flags); + if (!scribble) + return -ENOMEM; + + kvfree(percpu->scribble); + + percpu->scribble = scribble; + percpu->scribble_obj_size = obj_size; + return 0; } static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) @@ -2272,23 +2268,18 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) return 0; mddev_suspend(conf->mddev); get_online_cpus(); + for_each_present_cpu(cpu) { struct raid5_percpu *percpu; - struct flex_array *scribble; percpu = per_cpu_ptr(conf->percpu, cpu); - scribble = scribble_alloc(new_disks, - new_sectors / STRIPE_SECTORS, - GFP_NOIO); - - if (scribble) { - flex_array_free(percpu->scribble); - percpu->scribble = scribble; - } else { - err = -ENOMEM; + err = scribble_alloc(percpu, new_disks, +new_sectors / STRIPE_SECTORS, +GFP_NOIO); + if (err) break; - } } + put_online_cpus(); mddev_resume(conf->mddev);
[PATCH 3/6] selinux: convert to kvmalloc
The flex arrays were being used for constant sized arrays, so there's no benefit to using flex_arrays over something simpler. Signed-off-by: Kent Overstreet Cc: linux-security-mod...@vger.kernel.org --- security/selinux/ss/avtab.c | 40 +- security/selinux/ss/avtab.h | 4 +- security/selinux/ss/conditional.c | 6 +- security/selinux/ss/policydb.c| 122 -- security/selinux/ss/policydb.h| 12 +-- security/selinux/ss/services.c| 22 ++ 6 files changed, 62 insertions(+), 144 deletions(-) diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c index a2c9148b06..5a7fd5f0b7 100644 --- a/security/selinux/ss/avtab.c +++ b/security/selinux/ss/avtab.c @@ -93,12 +93,10 @@ avtab_insert_node(struct avtab *h, int hvalue, newnode->next = prev->next; prev->next = newnode; } else { - newnode->next = flex_array_get_ptr(h->htable, hvalue); - if (flex_array_put_ptr(h->htable, hvalue, newnode, - GFP_KERNEL|__GFP_ZERO)) { - kmem_cache_free(avtab_node_cachep, newnode); - return NULL; - } + struct avtab_node **n = >htable[hvalue]; + + newnode->next = *n; + *n = newnode; } h->nel++; @@ -111,11 +109,11 @@ static int avtab_insert(struct avtab *h, struct avtab_key *key, struct avtab_dat struct avtab_node *prev, *cur, *newnode; u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD); - if (!h || !h->htable) + if (!h) return -EINVAL; hvalue = avtab_hash(key, h->mask); - for (prev = NULL, cur = flex_array_get_ptr(h->htable, hvalue); + for (prev = NULL, cur = h->htable[hvalue]; cur; prev = cur, cur = cur->next) { if (key->source_type == cur->key.source_type && @@ -156,10 +154,10 @@ avtab_insert_nonunique(struct avtab *h, struct avtab_key *key, struct avtab_datu struct avtab_node *prev, *cur; u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD); - if (!h || !h->htable) + if (!h) return NULL; hvalue = avtab_hash(key, h->mask); - for (prev = NULL, cur = flex_array_get_ptr(h->htable, hvalue); + for (prev = NULL, cur = h->htable[hvalue]; cur; prev = cur, cur = 
cur->next) { if (key->source_type == cur->key.source_type && @@ -186,11 +184,11 @@ struct avtab_datum *avtab_search(struct avtab *h, struct avtab_key *key) struct avtab_node *cur; u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD); - if (!h || !h->htable) + if (!h) return NULL; hvalue = avtab_hash(key, h->mask); - for (cur = flex_array_get_ptr(h->htable, hvalue); cur; + for (cur = h->htable[hvalue]; cur; cur = cur->next) { if (key->source_type == cur->key.source_type && key->target_type == cur->key.target_type && @@ -222,11 +220,11 @@ avtab_search_node(struct avtab *h, struct avtab_key *key) struct avtab_node *cur; u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD); - if (!h || !h->htable) + if (!h) return NULL; hvalue = avtab_hash(key, h->mask); - for (cur = flex_array_get_ptr(h->htable, hvalue); cur; + for (cur = h->htable[hvalue]; cur; cur = cur->next) { if (key->source_type == cur->key.source_type && key->target_type == cur->key.target_type && @@ -281,11 +279,11 @@ void avtab_destroy(struct avtab *h) int i; struct avtab_node *cur, *temp; - if (!h || !h->htable) + if (!h) return; for (i = 0; i < h->nslot; i++) { - cur = flex_array_get_ptr(h->htable, i); + cur = h->htable[i]; while (cur) { temp = cur; cur = cur->next; @@ -295,7 +293,7 @@ void avtab_destroy(struct avtab *h) kmem_cache_free(avtab_node_cachep, temp); } } - flex_array_free(h->htable); + kvfree(h->htable); h->htable = NULL; h->nslot = 0; h->mask = 0; @@ -303,6 +301,7 @@ void avtab_destroy(struct avtab *h) int avtab_init(struct avtab *h) { + kvfree(h->htable); h->htable = NULL; h->nel = 0; return 0; @@ -329,8 +328,7 @@ int avtab_alloc(struct avtab *h, u32 nrules) nslot = MAX_AVTAB_HASH_BUCKETS; mask = nslot - 1; - h->htable = flex_array_alloc(sizeof(struct avtab_node *), nslot, -GFP_KERNEL | __GFP_ZERO); + h->htable = kvmalloc_array(nslot, sizeof(void *), GFP_KERNEL);
[PATCH 4/6] Generic radix trees
Very simple radix tree implementation that supports storing arbitrary size entries, up to PAGE_SIZE - upcoming patches will convert existing flex_array users to genradixes. The new genradix code has a much simpler API and implementation, and doesn't have a hard limit on the number of elements like flex_array does. Signed-off-by: Kent Overstreet --- include/linux/generic-radix-tree.h | 222 + lib/Makefile | 3 +- lib/generic-radix-tree.c | 180 +++ 3 files changed, 404 insertions(+), 1 deletion(-) create mode 100644 include/linux/generic-radix-tree.h create mode 100644 lib/generic-radix-tree.c diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h new file mode 100644 index 00..3328813322 --- /dev/null +++ b/include/linux/generic-radix-tree.h @@ -0,0 +1,222 @@ +#ifndef _LINUX_GENERIC_RADIX_TREE_H +#define _LINUX_GENERIC_RADIX_TREE_H + +/* + * Generic radix trees/sparse arrays: + * + * Very simple and minimalistic, supporting arbitrary size entries up to + * PAGE_SIZE. + * + * A genradix is defined with the type it will store, like so: + * static GENRADIX(struct foo) foo_genradix; + * + * The main operations are: + * - genradix_init(radix) - initialize an empty genradix + * + * - genradix_free(radix) - free all memory owned by the genradix and + * reinitialize it + * + * - genradix_ptr(radix, idx) - gets a pointer to the entry at idx, returning + * NULL if that entry does not exist + * + * - genradix_ptr_alloc(radix, idx, gfp) - gets a pointer to an entry, + * allocating it if necessary + * + * - genradix_for_each(radix, iter, p) - iterate over each entry in a genradix + * + * The radix tree allocates one page of entries at a time, so entries may exist + * that were never explicitly allocated - they will be initialized to all + * zeroes. + * + * Internally, a genradix is just a radix tree of pages, and indexing works in + * terms of byte offsets. 
The wrappers in this header file use sizeof on the + * type the radix contains to calculate a byte offset from the index - see + * __idx_to_offset. + */ + +#include +#include +#include +#include + +struct genradix_node; + +struct __genradix { + struct genradix_node*root; + size_t depth; +}; + +#define __GENRADIX_INITIALIZER \ + { \ + .tree = { \ + .root = NULL, \ + .depth = 0, \ + } \ + } + +/* + * We use a 0 size array to stash the type we're storing without taking any + * space at runtime - then the various accessor macros can use typeof() to get + * to it for casts/sizeof - we also force the alignment so that storing a type + * with a ridiculous alignment doesn't blow up the alignment or size of the + * genradix. + */ + +#define GENRADIX(_type)\ +struct { \ + struct __genradix tree; \ + _type type[0] __aligned(1); \ +} + +#define DEFINE_GENRADIX(_name, _type) \ + GENRADIX(_type) _name = __GENRADIX_INITIALIZER + +/** + * genradix_init - initialize a genradix + * @_radix:genradix to initialize + * + * Does not fail + */ +#define genradix_init(_radix) \ +do { \ + *(_radix) = (typeof(*_radix)) __GENRADIX_INITIALIZER; \ +} while (0) + +void __genradix_free(struct __genradix *); + +/** + * genradix_free: free all memory owned by a genradix + * + * After freeing, @_radix will be reinitialized and empty + */ +#define genradix_free(_radix) __genradix_free(&(_radix)->tree) + +static inline size_t __idx_to_offset(size_t idx, size_t obj_size) +{ + if (__builtin_constant_p(obj_size)) + BUILD_BUG_ON(obj_size > PAGE_SIZE); + else + BUG_ON(obj_size > PAGE_SIZE); + + if (!is_power_of_2(obj_size)) { + size_t objs_per_page = PAGE_SIZE / obj_size; + + return (idx / objs_per_page) * PAGE_SIZE + + (idx % objs_per_page) * obj_size; + } else { + return idx * obj_size; + } +} + +#define __genradix_cast(_radix)(typeof((_radix)->type[0]) *) +#define __genradix_obj_size(_radix)sizeof((_radix)->type[0]) +#define __genradix_idx_to_offset(_radix, _idx) \ + __idx_to_offset(_idx, 
__genradix_obj_size(_radix)) + +void *__genradix_ptr(struct __genradix *, size_t); + +/** + * genradix_ptr - get a pointer to a genradix entry + * @_radix:genradix
[PATCH 6/6] Drop flex_arrays
All existing users have been converted to generic radix trees Signed-off-by: Kent Overstreet Acked-by: Dave Hansen --- Documentation/core-api/flexible-arrays.rst | 130 --- Documentation/flexible-arrays.txt | 123 --- include/linux/flex_array.h | 149 include/linux/poison.h | 3 - lib/Makefile | 2 +- lib/flex_array.c | 398 - tools/include/linux/poison.h | 3 - 7 files changed, 1 insertion(+), 807 deletions(-) delete mode 100644 Documentation/core-api/flexible-arrays.rst delete mode 100644 Documentation/flexible-arrays.txt delete mode 100644 include/linux/flex_array.h delete mode 100644 lib/flex_array.c diff --git a/Documentation/core-api/flexible-arrays.rst b/Documentation/core-api/flexible-arrays.rst deleted file mode 100644 index b6b85a1b51..00 --- a/Documentation/core-api/flexible-arrays.rst +++ /dev/null @@ -1,130 +0,0 @@ - -=== -Using flexible arrays in the kernel -=== - -Large contiguous memory allocations can be unreliable in the Linux kernel. -Kernel programmers will sometimes respond to this problem by allocating -pages with :c:func:`vmalloc()`. This solution not ideal, though. On 32-bit -systems, memory from vmalloc() must be mapped into a relatively small address -space; it's easy to run out. On SMP systems, the page table changes required -by vmalloc() allocations can require expensive cross-processor interrupts on -all CPUs. And, on all systems, use of space in the vmalloc() range increases -pressure on the translation lookaside buffer (TLB), reducing the performance -of the system. - -In many cases, the need for memory from vmalloc() can be eliminated by piecing -together an array from smaller parts; the flexible array library exists to make -this task easier. - -A flexible array holds an arbitrary (within limits) number of fixed-sized -objects, accessed via an integer index. Sparse arrays are handled -reasonably well. Only single-page allocations are made, so memory -allocation failures should be relatively rare. 
The down sides are that the -arrays cannot be indexed directly, individual object size cannot exceed the -system page size, and putting data into a flexible array requires a copy -operation. It's also worth noting that flexible arrays do no internal -locking at all; if concurrent access to an array is possible, then the -caller must arrange for appropriate mutual exclusion. - -The creation of a flexible array is done with :c:func:`flex_array_alloc()`:: - -#include - -struct flex_array *flex_array_alloc(int element_size, - unsigned int total, - gfp_t flags); - -The individual object size is provided by ``element_size``, while total is the -maximum number of objects which can be stored in the array. The flags -argument is passed directly to the internal memory allocation calls. With -the current code, using flags to ask for high memory is likely to lead to -notably unpleasant side effects. - -It is also possible to define flexible arrays at compile time with:: - -DEFINE_FLEX_ARRAY(name, element_size, total); - -This macro will result in a definition of an array with the given name; the -element size and total will be checked for validity at compile time. - -Storing data into a flexible array is accomplished with a call to -:c:func:`flex_array_put()`:: - -int flex_array_put(struct flex_array *array, unsigned int element_nr, - void *src, gfp_t flags); - -This call will copy the data from src into the array, in the position -indicated by ``element_nr`` (which must be less than the maximum specified when -the array was created). If any memory allocations must be performed, flags -will be used. The return value is zero on success, a negative error code -otherwise. - -There might possibly be a need to store data into a flexible array while -running in some sort of atomic context; in this situation, sleeping in the -memory allocator would be a bad thing. That can be avoided by using -``GFP_ATOMIC`` for the flags value, but, often, there is a better way. 
The -trick is to ensure that any needed memory allocations are done before -entering atomic context, using :c:func:`flex_array_prealloc()`:: - -int flex_array_prealloc(struct flex_array *array, unsigned int start, - unsigned int nr_elements, gfp_t flags); - -This function will ensure that memory for the elements indexed in the range -defined by ``start`` and ``nr_elements`` has been allocated. Thereafter, a -``flex_array_put()`` call on an element in that range is guaranteed not to -block. - -Getting data back out of the array is done with :c:func:`flex_array_get()`:: - -void *flex_array_get(struct flex_array *fa, unsigned int element_nr); -
Re: [PATCH 0/3] ARM: OMAP1: ams-delta: Clean up GPIO setup for MODEM
* Janusz Krzysztofik [180820 11:16]: > > Convert modem related GPIO setup from integer space to GPIO descriptors. > Also, restore original initialization order of the MODEM device and its > related GPIO pins. > > Cleanup of MODEM related regulator setup is postponed while waiting for > upcoming conversion of fixed regulator API to GPIO descriptors. > > > Janusz Krzysztofik (3): > ARM: OMAP1: ams-delta: assign MODEM IRQ from GPIO descriptor > ARM: OMAP1: ams-delta: initialize latch2 pins to safe values > ARM: OMAP1: ams-delta: register MODEM device earlier Janusz, can you please repost this series based on v4.19-rc1 with Linus' acks? At least the header file has moved around now. And as this also conflicts with your earlier patch "ARM: OMAP1: ams-delta: assign MODEM IRQ from GPIO descriptor" please repost that too in the same series. If you have other arch/arm/*omap*/* related patches then please repost those too, these are the only ones I still had tagged :) Regards, Tony
[PATCH 1/6] openvswitch: convert to kvmalloc
There was no real need for this code to be using flexarrays, it's just implementing a hash table - ideally it would be using rhashtables, but that conversion would be significantly more complicated. Signed-off-by: Kent Overstreet Cc: Pravin B Shelar Cc: d...@openvswitch.org --- net/openvswitch/flow.h | 1 - net/openvswitch/flow_netlink.h | 1 - net/openvswitch/flow_table.c | 51 -- net/openvswitch/flow_table.h | 3 +- 4 files changed, 13 insertions(+), 43 deletions(-) diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index c670dd24b8..4f06278166 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index 6657606b2b..66f9553758 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -30,7 +30,6 @@ #include #include #include -#include #include #include diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index 80ea2a7185..cfb0098c9a 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -111,29 +111,6 @@ int ovs_flow_tbl_count(const struct flow_table *table) return table->count; } -static struct flex_array *alloc_buckets(unsigned int n_buckets) -{ - struct flex_array *buckets; - int i, err; - - buckets = flex_array_alloc(sizeof(struct hlist_head), - n_buckets, GFP_KERNEL); - if (!buckets) - return NULL; - - err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL); - if (err) { - flex_array_free(buckets); - return NULL; - } - - for (i = 0; i < n_buckets; i++) - INIT_HLIST_HEAD((struct hlist_head *) - flex_array_get(buckets, i)); - - return buckets; -} - static void flow_free(struct sw_flow *flow) { int cpu; @@ -168,31 +145,30 @@ void ovs_flow_free(struct sw_flow *flow, bool deferred) flow_free(flow); } -static void free_buckets(struct flex_array *buckets) -{ - flex_array_free(buckets); -} - - static void 
__table_instance_destroy(struct table_instance *ti) { - free_buckets(ti->buckets); + kvfree(ti->buckets); kfree(ti); } static struct table_instance *table_instance_alloc(int new_size) { struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL); + int i; if (!ti) return NULL; - ti->buckets = alloc_buckets(new_size); - + ti->buckets = kvmalloc_array(new_size, sizeof(struct hlist_head), +GFP_KERNEL); if (!ti->buckets) { kfree(ti); return NULL; } + + for (i = 0; i < new_size; i++) + INIT_HLIST_HEAD(>buckets[i]); + ti->n_buckets = new_size; ti->node_ver = 0; ti->keep_flows = false; @@ -249,7 +225,7 @@ static void table_instance_destroy(struct table_instance *ti, for (i = 0; i < ti->n_buckets; i++) { struct sw_flow *flow; - struct hlist_head *head = flex_array_get(ti->buckets, i); + struct hlist_head *head = >buckets[i]; struct hlist_node *n; int ver = ti->node_ver; int ufid_ver = ufid_ti->node_ver; @@ -294,7 +270,7 @@ struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, ver = ti->node_ver; while (*bucket < ti->n_buckets) { i = 0; - head = flex_array_get(ti->buckets, *bucket); + head = >buckets[*bucket]; hlist_for_each_entry_rcu(flow, head, flow_table.node[ver]) { if (i < *last) { i++; @@ -313,8 +289,7 @@ struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash) { hash = jhash_1word(hash, ti->hash_seed); - return flex_array_get(ti->buckets, - (hash & (ti->n_buckets - 1))); + return >buckets[hash & (ti->n_buckets - 1)]; } static void table_instance_insert(struct table_instance *ti, @@ -347,9 +322,7 @@ static void flow_table_copy_flows(struct table_instance *old, /* Insert in new table. 
*/ for (i = 0; i < old->n_buckets; i++) { struct sw_flow *flow; - struct hlist_head *head; - - head = flex_array_get(old->buckets, i); + struct hlist_head *head = >buckets[i]; if (ufid) hlist_for_each_entry(flow, head, diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h index 2dd9900f53..de5ec6cf51 100644 --- a/net/openvswitch/flow_table.h +++
[PATCH 0/6] flex_arrays -> genradix; prep work for bcachefs
Generic radix trees are a dead simple radix tree implementation that can store types of different sizes, needed for bcachefs. The patch series was sent out previously and was pretty uncontroversial - this is a respin that converts most users to just use kvmalloc. Kent Overstreet (6): openvswitch: convert to kvmalloc md: convert to kvmalloc selinux: convert to kvmalloc Generic radix trees proc: commit to genradix Drop flex_arrays Documentation/core-api/flexible-arrays.rst | 130 --- Documentation/flexible-arrays.txt | 123 --- drivers/md/raid5-ppl.c | 7 +- drivers/md/raid5.c | 82 ++--- drivers/md/raid5.h | 9 +- fs/proc/base.c | 43 +-- include/linux/flex_array.h | 149 include/linux/generic-radix-tree.h | 222 include/linux/poison.h | 3 - lib/Makefile | 5 +- lib/flex_array.c | 398 - lib/generic-radix-tree.c | 180 ++ net/openvswitch/flow.h | 1 - net/openvswitch/flow_netlink.h | 1 - net/openvswitch/flow_table.c | 51 +-- net/openvswitch/flow_table.h | 3 +- security/selinux/ss/avtab.c| 40 +-- security/selinux/ss/avtab.h| 4 +- security/selinux/ss/conditional.c | 6 +- security/selinux/ss/policydb.c | 122 ++- security/selinux/ss/policydb.h | 12 +- security/selinux/ss/services.c | 22 +- tools/include/linux/poison.h | 3 - 23 files changed, 540 insertions(+), 1076 deletions(-) delete mode 100644 Documentation/core-api/flexible-arrays.rst delete mode 100644 Documentation/flexible-arrays.txt delete mode 100644 include/linux/flex_array.h create mode 100644 include/linux/generic-radix-tree.h delete mode 100644 lib/flex_array.c create mode 100644 lib/generic-radix-tree.c -- 2.19.0.rc2
Re: [PATCH v13 11/13] platform/x86: Intel SGX driver
On Thu, Sep 06, 2018 at 05:50:01PM -0700, Joe Perches wrote: > On Thu, 2018-09-06 at 19:35 +0200, Miguel Ojeda wrote: > > > Which one is right and why the kernel tree is polluted with C99-headers > > > when they do not pass checkpatch.pl? > > checkpatch ignores c99 headers since 2016. Jarkko was referring to c99 comments for the SPDX license. checkpatch explicitly requires c-style comments for headers and assembly files as dictated by Documentation/process/license-rules.rst. $ grep -r SPDX **/*.h | grep \/\/ | wc -l 665 $ grep -r SPDX **/*.S | grep \/\/ | wc -l 22 $ git show 9f3a89926d6df commit 9f3a89926d6dfc30a4fd1bbcb92cc7b218d3786d Author: Rob Herring Date: Tue Apr 10 16:33:13 2018 -0700 checkpatch.pl: add SPDX license tag check Add SPDX license tag check based on the rules defined in Documentation/process/license-rules.rst. To summarize, SPDX license tags should be on the 1st line (or 2nd line in scripts) using the appropriate comment style for the file type. Link: http://lkml.kernel.org/r/20180202154026.15298-1-r...@kernel.org Signed-off-by: Rob Herring Signed-off-by: Joe Perches Acked-by: Greg Kroah-Hartman Acked-by: Philippe Ombredanne Cc: Andy Whitcroft Cc: Joe Perches Cc: Thomas Gleixner Cc: Igor Stoppa Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index b464a4c3f863..0f022b56f117 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2257,6 +2257,8 @@ sub process { my $camelcase_file_seeded = 0; + my $checklicenseline = 1; + sanitise_line_reset(); my $line; foreach my $rawline (@rawlines) { @@ -2448,6 +2450,7 @@ sub process { } else { $check = $check_orig; } + $checklicenseline = 1; next; } @@ -2911,6 +2914,30 @@ sub process { } } +# check for using SPDX license tag at beginning of files + if ($realline == $checklicenseline) { + if ($rawline =~ /^[ \+]\s*\#\!\s*\//) { + $checklicenseline = 2; + } elsif ($rawline =~ /^\+/) { + my $comment = ""; + if 
($realfile =~ /\.(h|s|S)$/) { + $comment = '/*'; + } elsif ($realfile =~ /\.(c|dts|dtsi)$/) { + $comment = '//'; + } elsif (($checklicenseline == 2) || $realfile =~ /\.(sh|pl|py|awk|tc)$/) { + $comment = '#'; + } elsif ($realfile =~ /\.rst$/) { + $comment = '..'; + } + + if ($comment !~ /^$/ && + $rawline !~ /^\+\Q$comment\E SPDX-License-Identifier: /) { + WARN("SPDX_LICENSE_TAG", +"Missing or malformed SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr); + } + } + } + # check we are in a valid source file if not then ignore this hunk next if ($realfile !~ /\.(h|c|s|S|sh|dtsi|dts)$/);
Re: [PATCH] mfd: ti-lmu: constify mfd_cell tables
Pavel On 09/07/2018 04:39 AM, Pavel Machek wrote: > On Wed 2018-08-29 11:31:08, Pavel Machek wrote: >> From: Sebastian Reichel >> >> mfd: ti-lmu: constify mfd_cell tables >> >> Add const attribute to all mfd_cell structures. >> >> Signed-off-by: Sebastian Reichel >> Signed-off-by: Pavel Machek > > Lee, I guess this is for you to apply. Any news there? > > There are more patches ready, As I stated in another email thread. I don't see the need for this level of LMU framework. Here is the reference thread https://lore.kernel.org/patchwork/patch/982550/ > > https://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap.git/log/?h=droid4-pending-v4.19 > > and it would be good to get them in. (Alternatively, you can just > cherry-pick them from droid4-pending-v4.19). > > Thanks, > Pavel > > >> diff --git a/drivers/mfd/ti-lmu.c b/drivers/mfd/ti-lmu.c >> index cfb411c..990437e 100644 >> --- a/drivers/mfd/ti-lmu.c >> +++ b/drivers/mfd/ti-lmu.c >> @@ -25,7 +25,7 @@ >> #include >> >> struct ti_lmu_data { >> -struct mfd_cell *cells; >> +const struct mfd_cell *cells; >> int num_cells; >> unsigned int max_register; >> }; >> @@ -63,7 +63,7 @@ static void ti_lmu_disable_hw(struct ti_lmu *lmu) >> gpio_set_value(lmu->en_gpio, 0); >> } >> >> -static struct mfd_cell lm3532_devices[] = { >> +static const struct mfd_cell lm3532_devices[] = { >> { >> .name = "ti-lmu-backlight", >> .id= LM3532, >> @@ -78,7 +78,7 @@ static struct mfd_cell lm3532_devices[] = { >> .of_compatible = "ti,lm363x-regulator", \ >> } \ >> >> -static struct mfd_cell lm3631_devices[] = { >> +static const struct mfd_cell lm3631_devices[] = { >> LM363X_REGULATOR(LM3631_BOOST), >> LM363X_REGULATOR(LM3631_LDO_CONT), >> LM363X_REGULATOR(LM3631_LDO_OREF), >> @@ -91,7 +91,7 @@ static struct mfd_cell lm3631_devices[] = { >> }, >> }; >> >> -static struct mfd_cell lm3632_devices[] = { >> +static const struct mfd_cell lm3632_devices[] = { >> LM363X_REGULATOR(LM3632_BOOST), >> LM363X_REGULATOR(LM3632_LDO_POS), >> 
LM363X_REGULATOR(LM3632_LDO_NEG), >> @@ -102,7 +102,7 @@ static struct mfd_cell lm3632_devices[] = { >> }, >> }; >> >> -static struct mfd_cell lm3633_devices[] = { >> +static const struct mfd_cell lm3633_devices[] = { >> { >> .name = "ti-lmu-backlight", >> .id= LM3633, >> @@ -120,7 +120,7 @@ static struct mfd_cell lm3633_devices[] = { >> }, >> }; >> >> -static struct mfd_cell lm3695_devices[] = { >> +static const struct mfd_cell lm3695_devices[] = { >> { >> .name = "ti-lmu-backlight", >> .id= LM3695, >> @@ -128,7 +128,7 @@ static struct mfd_cell lm3695_devices[] = { >> }, >> }; >> >> -static struct mfd_cell lm3697_devices[] = { >> +static const struct mfd_cell lm3697_devices[] = { >> { >> .name = "ti-lmu-backlight", >> .id= LM3697, >> > > > -- -- Dan Murphy
Re: [PATCH 2/6] md: convert to kvmalloc
On Fri, Sep 07, 2018 at 12:56:31PM -0400, Kent Overstreet wrote: > @@ -165,7 +164,7 @@ ops_run_partial_parity(struct stripe_head *sh, struct > raid5_percpu *percpu, > struct dma_async_tx_descriptor *tx) > { > int disks = sh->disks; > - struct page **srcs = flex_array_get(percpu->scribble, 0); > + struct page **srcs = percpu->scribble; > int count = 0, pd_idx = sh->pd_idx, i; > struct async_submit_ctl submit; > > @@ -196,8 +195,8 @@ ops_run_partial_parity(struct stripe_head *sh, struct > raid5_percpu *percpu, > } > > init_async_submit(, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx, > - NULL, sh, flex_array_get(percpu->scribble, 0) > - + sizeof(struct page *) * (sh->disks + 2)); > + NULL, sh, percpu->scribble + > + sizeof(struct page *) * (sh->disks + 2)); I think this would read better written as: init_async_submit(, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx, NULL, sh, srcs + sh->disks + 2); > static addr_conv_t *to_addr_conv(struct stripe_head *sh, >struct raid5_percpu *percpu, int i) > { > - void *addr; > - > - addr = flex_array_get(percpu->scribble, i); > - return addr + sizeof(struct page *) * (sh->disks + 2); > + return percpu->scribble + i * percpu->scribble_obj_size + > + sizeof(struct page *) * (sh->disks + 2); > } > > /* return a pointer to the address conversion region of the scribble buffer > */ > static struct page **to_addr_page(struct raid5_percpu *percpu, int i) > { > - void *addr; > - > - addr = flex_array_get(percpu->scribble, i); > - return addr; > + return percpu->scribble + i * percpu->scribble_obj_size; > } Perhaps this would be better as ... 
static struct page **to_addr_page(struct raid5_percpu *percpu, int i) { - void *addr; - - addr = flex_array_get(percpu->scribble, i); - return addr; + return percpu->scribble + i * percpu->scribble_obj_size; } static addr_conv_t *to_addr_conv(struct stripe_head *sh, struct raid5_percpu *percpu, int i) { - void *addr; - - addr = flex_array_get(percpu->scribble, i); - return addr + sizeof(struct page *) * (sh->disks + 2); + return to_addr_page(percpu, i) + sh->disks + 2; } The rest looks good.
Re: [PATCH] ASoC: max98373: usleep_range() needs include/delay.h
On Fri, Sep 07, 2018 at 10:52:24AM -0700, Grant Grundler wrote: > On Fri, Sep 7, 2018 at 5:11 AM Mark Brown wrote: > > Note that this isn't causing a warning upstream, presumably due to an > > implicit inclusion that isn't present in the v4.4 kernel that you appear > > to be using, or gets missed due to config differences. > Ok. Is this just an observation or are these reasons to not accept the > change? An observation, you should already have a mail about it being applied. signature.asc Description: PGP signature
[PATCH v6 5/5] x86/kvm: Avoid dynamic allocation of pvclock data when SEV is active
Currently, the per-cpu pvclock data is allocated dynamically when cpu > HVC_BOOT_ARRAY_SIZE. The physical address of this variable is shared between the guest and the hypervisor hence it must be mapped as unencrypted (ie. C=0) when SEV is active. The C-bit works on a page, hence we will be required to perform a full 4k page allocation to store a single 32-byte pvclock variable. It will waste fairly sizeable amount of memory since each CPU will be doing a separate 4k allocation. Let's define a second array for the SEV case to statically allocate for NR_CPUS and put this array in .data..decrypted section so that its mapped with C=0 during boot. The .data..decrypted section has a big chunk of memory that is currently unused. And since second array will be used only when memory encryption is active hence free it when encryption is not active. Signed-off-by: Brijesh Singh Suggested-by: Sean Christopherson Cc: Tom Lendacky Cc: k...@vger.kernel.org Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: linux-kernel@vger.kernel.org Cc: Paolo Bonzini Cc: Sean Christopherson Cc: k...@vger.kernel.org Cc: "Radim Krčmář" --- arch/x86/include/asm/mem_encrypt.h | 4 arch/x86/kernel/kvmclock.c | 14 ++ arch/x86/kernel/vmlinux.lds.S | 3 +++ arch/x86/mm/init.c | 3 +++ arch/x86/mm/mem_encrypt.c | 10 ++ 5 files changed, 34 insertions(+) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 802b2eb..cc46584 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -48,11 +48,13 @@ int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size); /* Architecture __weak replacement functions */ void __init mem_encrypt_init(void); +void __init free_decrypted_mem(void); bool sme_active(void); bool sev_active(void); #define __decrypted __attribute__((__section__(".data..decrypted"))) +#define __decrypted_aux __attribute__((__section__(".data..decrypted.aux"))) #else /* !CONFIG_AMD_MEM_ENCRYPT */ @@ -80,6 +82,7 @@ static inline int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; } #define __decrypted +#define __decrypted_aux #endif /* CONFIG_AMD_MEM_ENCRYPT */ @@ -93,6 +96,7 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; #define __sme_pa_nodebug(x)(__pa_nodebug(x) | sme_me_mask) extern char __start_data_decrypted[], __end_data_decrypted[]; +extern char __start_data_decrypted_aux[]; #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 376fd3a..6086b56 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -65,6 +65,15 @@ static struct pvclock_vsyscall_time_info static struct pvclock_wall_clock wall_clock __decrypted; static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); +#ifdef CONFIG_AMD_MEM_ENCRYPT +/* + * The auxiliary array will be used when SEV is active. 
In non-SEV case, + * it will be freed by free_decrypted_mem(). + */ +static struct pvclock_vsyscall_time_info + hv_clock_aux[NR_CPUS] __decrypted_aux; +#endif + static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) { return _cpu_read(hv_clock_per_cpu)->pvti; @@ -269,6 +278,11 @@ static int kvmclock_setup_percpu(unsigned int cpu) /* Use the static page for the first CPUs, allocate otherwise */ if (cpu < HVC_BOOT_ARRAY_SIZE) p = _clock_boot[cpu]; +#ifdef CONFIG_AMD_MEM_ENCRYPT + /* Use the static page from auxiliary array instead of allocating it. */ + else if (sev_active()) + p = _clock_aux[cpu - HVC_BOOT_ARRAY_SIZE]; +#endif else p = kzalloc(sizeof(*p), GFP_KERNEL); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 4cb1064..bde287a 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -77,6 +77,9 @@ jiffies_64 = jiffies; . = ALIGN(PMD_SIZE);\ __start_data_decrypted = .; \ *(.data..decrypted);\ + . = ALIGN(PAGE_SIZE); \ + __start_data_decrypted_aux = .; \ + *(.data..decrypted.aux);\ . = ALIGN(PMD_SIZE);\ __end_data_decrypted = .; \ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 7a8fc26..052b279 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -815,9 +815,12 @@ void free_kernel_image_pages(void *begin, void *end) set_memory_np_noalias(begin_ul, len_pages); } +void __weak free_decrypted_mem(void) { } + void __ref free_initmem(void) {
[PATCH v6 3/5] x86/mm: add .data..decrypted section to hold shared variables
kvmclock defines few static variables which are shared with the hypervisor during the kvmclock initialization. When SEV is active, memory is encrypted with a guest-specific key, and if guest OS wants to share the memory region with hypervisor then it must clear the C-bit before sharing it. Currently, we use kernel_physical_mapping_init() to split large pages before clearing the C-bit on shared pages. But it fails when called from the kvmclock initialization (mainly because memblock allocator is not ready that early during boot). Add a __decrypted section attribute which can be used when defining such shared variable. The so-defined variables will be placed in the .data..decrypted section. This section is mapped with C=0 early during boot, we also ensure that the initialized values are updated to match with C=0 (i.e perform an in-place decryption). The .data..decrypted section is PMD-aligned and sized so that we avoid the need to split the large pages when mapping the section. The sme_encrypt_kernel() was used to perform the in-place encryption of the Linux kernel and initrd when SME is active. The routine has been enhanced to decrypt the .data..decrypted section for both SME and SEV cases. Signed-off-by: Brijesh Singh Reviewed-by: Tom Lendacky Cc: Tom Lendacky Cc: k...@vger.kernel.org Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: linux-kernel@vger.kernel.org Cc: Paolo Bonzini Cc: Sean Christopherson Cc: k...@vger.kernel.org Cc: "Radim Krčmář" --- arch/x86/include/asm/mem_encrypt.h | 6 +++ arch/x86/kernel/head64.c | 11 + arch/x86/kernel/vmlinux.lds.S | 17 +++ arch/x86/mm/mem_encrypt_identity.c | 94 -- 4 files changed, 113 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index c064383..802b2eb 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -52,6 +52,8 @@ void __init mem_encrypt_init(void); bool sme_active(void); bool sev_active(void); +#define __decrypted __attribute__((__section__(".data..decrypted"))) + #else /* !CONFIG_AMD_MEM_ENCRYPT */ #define sme_me_mask0ULL @@ -77,6 +79,8 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; static inline int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; } +#define __decrypted + #endif /* CONFIG_AMD_MEM_ENCRYPT */ /* @@ -88,6 +92,8 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; #define __sme_pa(x)(__pa(x) | sme_me_mask) #define __sme_pa_nodebug(x)(__pa_nodebug(x) | sme_me_mask) +extern char __start_data_decrypted[], __end_data_decrypted[]; + #endif /* __ASSEMBLY__ */ #endif /* __X86_MEM_ENCRYPT_H__ */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 8047379..af39d68 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -112,6 +112,7 @@ static bool __head check_la57_support(unsigned long physaddr) unsigned long __head __startup_64(unsigned long physaddr, struct boot_params *bp) { + unsigned long vaddr, vaddr_end; unsigned long load_delta, *p; unsigned long pgtable_flags; pgdval_t *pgd; @@ -234,6 +235,16 @@ unsigned long __head __startup_64(unsigned long physaddr, /* Encrypt the kernel and related (if SME is active) */ sme_encrypt_kernel(bp); + /* Clear the memory encryption mask from 
the .data..decrypted section. */ + if (mem_encrypt_active()) { + vaddr = (unsigned long)__start_data_decrypted; + vaddr_end = (unsigned long)__end_data_decrypted; + for (; vaddr < vaddr_end; vaddr += PMD_SIZE) { + i = pmd_index(vaddr); + pmd[i] -= sme_get_me_mask(); + } + } + /* * Return the SME encryption mask (if SME is active) to be used as a * modifier for the initial pgdir entry programmed into CR3. diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 8bde0a4..4cb1064 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -65,6 +65,21 @@ jiffies_64 = jiffies; #define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE); #define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE); +/* + * This section contains data which will be mapped as decrypted. Memory + * encryption operates on a page basis. Make this section PMD-aligned + * to avoid spliting the pages while mapping the section early. + * + * Note: We use a separate section so that only this section gets + * decrypted to avoid exposing more than we wish. + */ +#define DATA_DECRYPTED \ + . = ALIGN(PMD_SIZE);\ + __start_data_decrypted = .; \ +
[PATCH v6 4/5] x86/kvm: use __decrypted attribute in shared variables
Commit: 368a540e0232 (x86/kvmclock: Remove memblock dependency) caused SEV guest regression. When SEV is active, we map the shared variables (wall_clock and hv_clock_boot) with C=0 to ensure that both the guest and the hypervisor are able to access the data. To map the variables we use kernel_physical_mapping_init() to split the large pages, but splitting large pages requires allocating a new PMD, which fails now that kvmclock initialization is called early during boot. Recently we added a special .data..decrypted section to hold the shared variables. This section is mapped with C=0 early during boot. Use __decrypted attribute to put the wall_clock and hv_clock_boot in .data..decrypted section so that they are mapped with C=0. Signed-off-by: Brijesh Singh Reviewed-by: Tom Lendacky Fixes: 368a540e0232 ("x86/kvmclock: Remove memblock dependency") Cc: Tom Lendacky Cc: k...@vger.kernel.org Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: linux-kernel@vger.kernel.org Cc: Paolo Bonzini Cc: Sean Christopherson Cc: k...@vger.kernel.org Cc: "Radim Krčmář" --- arch/x86/kernel/kvmclock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1e67646..376fd3a 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -61,8 +61,8 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) static struct pvclock_vsyscall_time_info - hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE); -static struct pvclock_wall_clock wall_clock; + hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __decrypted __aligned(PAGE_SIZE); +static struct pvclock_wall_clock wall_clock __decrypted; static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) -- 2.7.4
Re: Patch "arm64: mm: always enable CONFIG_HOLES_IN_ZONE" has been added to the 4.9-stable tree
On Fri, Sep 07, 2018 at 02:57:51PM +0200, gre...@linuxfoundation.org wrote: > > This is a note to let you know that I've just added the patch titled > > arm64: mm: always enable CONFIG_HOLES_IN_ZONE > > to the 4.9-stable tree which can be found at: > > http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary > > The filename of the patch is: > arm64-mm-always-enable-config_holes_in_zone.patch > and it can be found in the queue-4.9 subdirectory. > > If you, or anyone else, feels it should not be added to the stable tree, > please let know about it. > > > From f52bb98f5aded4c43e52f5ce19fb83f7261e9e73 Mon Sep 17 00:00:00 2001 > From: James Morse > Date: Thu, 30 Aug 2018 16:05:32 +0100 > Subject: arm64: mm: always enable CONFIG_HOLES_IN_ZONE > > From: James Morse > > commit f52bb98f5aded4c43e52f5ce19fb83f7261e9e73 upstream. > > Commit 6d526ee26ccd ("arm64: mm: enable CONFIG_HOLES_IN_ZONE for NUMA") > only enabled HOLES_IN_ZONE for NUMA systems because the NUMA code was > choking on the missing zone for nomap pages. This problem doesn't just > apply to NUMA systems. > > If the architecture doesn't set HAVE_ARCH_PFN_VALID, pfn_valid() will > return true if the pfn is part of a valid sparsemem section. > > When working with multiple pages, the mm code uses pfn_valid_within() > to test each page it uses within the sparsemem section is valid. On > most systems memory comes in MAX_ORDER_NR_PAGES chunks which all > have valid/initialised struct pages. In this case pfn_valid_within() > is optimised out. > > Systems where this isn't true (e.g. due to nomap) should set > HOLES_IN_ZONE and provide HAVE_ARCH_PFN_VALID so that mm tests each > page as it works with it. > > Currently non-NUMA arm64 systems can't enable HOLES_IN_ZONE, leading to > a VM_BUG_ON(): > > | page:fdff802e1780 is uninitialized and poisoned > | raw: > | raw: > | page dumped because: VM_BUG_ON_PAGE(PagePoisoned(p)) > | [ cut here ] > | kernel BUG at include/linux/mm.h:978! 
> | Internal error: Oops - BUG: 0 [#1] PREEMPT SMP > [...] > | CPU: 1 PID: 25236 Comm: dd Not tainted 4.18.0 #7 > | Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 > | pstate: 4085 (nZcv daIf -PAN -UAO) > | pc : move_freepages_block+0x144/0x248 > | lr : move_freepages_block+0x144/0x248 > | sp : fe0071177680 > [...] > | Process dd (pid: 25236, stack limit = 0x94cc07fb) > | Call trace: > | move_freepages_block+0x144/0x248 > | steal_suitable_fallback+0x100/0x16c > | get_page_from_freelist+0x440/0xb20 > | __alloc_pages_nodemask+0xe8/0x838 > | new_slab+0xd4/0x418 > | ___slab_alloc.constprop.27+0x380/0x4a8 > | __slab_alloc.isra.21.constprop.26+0x24/0x34 > | kmem_cache_alloc+0xa8/0x180 > | alloc_buffer_head+0x1c/0x90 > | alloc_page_buffers+0x68/0xb0 > | create_empty_buffers+0x20/0x1ec > | create_page_buffers+0xb0/0xf0 > | __block_write_begin_int+0xc4/0x564 > | __block_write_begin+0x10/0x18 > | block_write_begin+0x48/0xd0 > | blkdev_write_begin+0x28/0x30 > | generic_perform_write+0x98/0x16c > | __generic_file_write_iter+0x138/0x168 > | blkdev_write_iter+0x80/0xf0 > | __vfs_write+0xe4/0x10c > | vfs_write+0xb4/0x168 > | ksys_write+0x44/0x88 > | sys_write+0xc/0x14 > | el0_svc_naked+0x30/0x34 > | Code: aa1303e0 90001a01 91296421 94008902 (d421) > | ---[ end trace 1601ba47f6e883fe ]--- > > Remove the NUMA dependency. > > Link: https://www.spinics.net/lists/arm-kernel/msg671851.html > Cc: > Cc: Ard Biesheuvel > Reported-by: Mikulas Patocka > Reviewed-by: Pavel Tatashin > Tested-by: Mikulas Patocka > Signed-off-by: James Morse > Signed-off-by: Will Deacon > Signed-off-by: Greg Kroah-Hartman > > --- > arch/arm64/Kconfig |1 - > 1 file changed, 1 deletion(-) > > --- a/arch/arm64/Kconfig > +++ b/arch/arm64/Kconfig > @@ -631,7 +631,6 @@ config HAVE_SETUP_PER_CPU_AREA > > config NEED_PER_CPU_EMBED_FIRST_CHUNK Looks like git got confused here, this isn't HOLES_IN_ZONE. 
Additionally, commit 6d526ee26ccd ("arm64: mm: enable CONFIG_HOLES_IN_ZONE for NUMA") that introduced it to this file didn't appear until 4.11 so this patch can be dropped from 4.9. > def_bool y > - depends on NUMA > > source kernel/Kconfig.preempt > source kernel/Kconfig.hz > > > Patches currently in stable-queue which might be from james.mo...@arm.com are > > queue-4.9/arm64-mm-always-enable-config_holes_in_zone.patch
Re: [PATCH] of: Split up name & type in modalias generation
On Fri, Sep 7, 2018 at 9:22 AM Thierry Reding wrote: > > From: Thierry Reding > > The kernel's vsnprintf() implementation discards all alpha-numeric > characters following a %p conversion specifier. This is done in order to > generically skip any of the various modifiers that the kernel supports. > Unfortunately, the OF modalias is generated with a format string that > violates the assumption made by vsnprintf(): > > of:N%pOFnT%s > > While processing the above format string, vsnprintf() will eat the 'T' > character, assuming that it belongs to the preceding %p specifier. This > results in a modalias with an incompatible format, which in turn causes > the automatic loading of drivers based on modalias to no longer work. > > To fix this, split up the generation of the name & type fields into two > separate snprintf() calls to avoid confusing the parser. > > Fixes: 73813f8483b1 ("of: Convert to using %pOFn instead of device_node.name") > Signed-off-by: Thierry Reding > --- > Note that a more elegant fix would be to make the %p format specifier > parser report back the exact number of characters consumed. I briefly > tried to implement it, but quickly ran into numerous special cases > that make this solution rather involved. > > I can spend some more time to improve this in general if that's what we > ultimately want, but I think this patch is a better short-term fix to > work around the issue. See my reply on the original patch. I've updated the patch in my dt/next branch with the fix to use %c. Rob
Re: [PATCH RFC LKMM 1/7] tools/memory-model: Add extra ordering for locks and remove it for ordinary release/acquire
On Thu, 6 Sep 2018, Andrea Parri wrote: > > Have you noticed any part of the generic code that relies on ordinary > > acquire-release (rather than atomic RMW acquire-release) in order to > > implement locking constructs? > > There are several places in code where the "lock-acquire" seems to be > provided by an atomic_cond_read_acquire/smp_cond_load_acquire: I have > mentioned one in qspinlock in this thread; qrwlock and mcs_spinlock > provide other examples (grep for the primitives...). > > As long as we don't consider these primitive as RMW (which would seem > odd...) or as acquire for which "most people expect strong ordering" > (see above), these provides other examples for the _gap_ I mentioned. Okay, now I understand your objection. It does appear that on RISC-V, if nowhere else, the current implementations of qspinlock, qrwlock, etc. will not provide "RCtso" ordering. The discussions surrounding this topic have been so lengthy and confusing that I have lost track of any comments Palmer or Daniel may have made concerning this potential problem. One possible resolution would be to define smp_cond_load_acquire() specially on RISC-V so that it provided the same ordering guarantees as RMW-acquire. (Plus adding a comment in the asm-generic/barrier.h pointing out the necessity for the stronger guarantee on all architectures.) Another would be to replace the usages of atomic/smp_cond_load_acquire in the locking constructs with a new function that would otherwise be the same but would provide the ordering guarantee we want. Do you think either of these would be an adequate fix? Alan
Re: [PATCH] apparmor: Fix network performance issue in aa_label_sk_perm
On 09/06/2018 09:33 PM, Tony Jones wrote: > The netperf benchmark shows a 5.73% reduction in throughput for > small (64 byte) transfers by unconfined tasks. > > DEFINE_AUDIT_SK() in aa_label_sk_perm() should not be performed > unconditionally, rather only when the label is confined. > > netperf-tcp > 56974a6fc^ 56974a6fc > Min 64 563.48 ( 0.00%) 531.17 ( -5.73%) > Min 128 1056.92 ( 0.00%) 999.44 ( -5.44%) > Min 256 1945.95 ( 0.00%) 1867.97 ( -4.01%) > Min 1024 6761.40 ( 0.00%) 6364.23 ( -5.87%) > Min 2048 0.53 ( 0.00%)10606.20 ( -4.54%) > Min 3312 13692.67 ( 0.00%)13158.41 ( -3.90%) > Min 4096 14926.29 ( 0.00%)14457.46 ( -3.14%) > Min 8192 18399.34 ( 0.00%)18091.65 ( -1.67%) > Min 1638421384.13 ( 0.00%)21158.05 ( -1.06%) > Hmean 64 564.96 ( 0.00%) 534.38 ( -5.41%) > Hmean 128 1064.42 ( 0.00%) 1010.12 ( -5.10%) > Hmean 256 1965.85 ( 0.00%) 1879.16 ( -4.41%) > Hmean 1024 6839.77 ( 0.00%) 6478.70 ( -5.28%) > Hmean 2048 11154.80 ( 0.00%)10671.13 ( -4.34%) > Hmean 3312 13838.12 ( 0.00%)13249.01 ( -4.26%) > Hmean 4096 15009.99 ( 0.00%)14561.36 ( -2.99%) > Hmean 8192 18975.57 ( 0.00%)18326.54 ( -3.42%) > Hmean 1638421440.44 ( 0.00%)21324.59 ( -0.54%) > Stddev64 1.24 ( 0.00%)2.85 (-130.64%) > Stddev128 4.51 ( 0.00%)6.53 ( -44.84%) > Stddev256 11.67 ( 0.00%)8.50 ( 27.16%) > Stddev102448.33 ( 0.00%) 75.07 ( -55.34%) > Stddev204854.82 ( 0.00%) 65.16 ( -18.86%) > Stddev3312 153.57 ( 0.00%) 56.29 ( 63.35%) > Stddev4096 100.25 ( 0.00%) 88.50 ( 11.72%) > Stddev8192 358.13 ( 0.00%) 169.99 ( 52.54%) > Stddev16384 43.99 ( 0.00%) 141.82 (-222.39%) > > Signed-off-by: Tony Jones > Fixes: 56974a6fcfef ("apparmor: add base infastructure for socket > mediation") hey Tony, thanks for the patch, I am curious did you're investigation look into what parts of DEFINE_AUDIT_SK are causing the issue? 
regardless, I have pulled it into apparmor next > --- > security/apparmor/net.c | 15 +-- > 1 file changed, 9 insertions(+), 6 deletions(-) > > diff --git a/security/apparmor/net.c b/security/apparmor/net.c > index bb24cfa0a164..d5d72dd1ca1f 100644 > --- a/security/apparmor/net.c > +++ b/security/apparmor/net.c > @@ -146,17 +146,20 @@ int aa_af_perm(struct aa_label *label, const char *op, > u32 request, u16 family, > static int aa_label_sk_perm(struct aa_label *label, const char *op, u32 > request, > struct sock *sk) > { > - struct aa_profile *profile; > - DEFINE_AUDIT_SK(sa, op, sk); > + int error = 0; > > AA_BUG(!label); > AA_BUG(!sk); > > - if (unconfined(label)) > - return 0; > + if (!unconfined(label)) { > + struct aa_profile *profile; > + DEFINE_AUDIT_SK(sa, op, sk); > > - return fn_for_each_confined(label, profile, > - aa_profile_af_sk_perm(profile, , request, sk)); > + error = fn_for_each_confined(label, profile, > + aa_profile_af_sk_perm(profile, , request, sk)); > + } > + > + return error; > } > > int aa_sk_perm(const char *op, u32 request, struct sock *sk) >
[PATCH 0/4] 9p coverity fixes
From: Dominique Martinet Since we already had one coverity fix for 9p, I figured I could request an account and look at stuff that actually could matter. The leak of glock.client_id wasn't found by coverity but when I was looking at a false positive there, of the rest the rdma one is useless but the other two are pretty important -- I will probably mark the three useful ones to backport to stable kernels. As usual, comments more than welcome, but I'll probably push them to linux-next along with the other patches that need test after testing the whole batch together next week. Dominique Martinet (4): 9p: acl: fix uninitialized iattr access 9p/rdma: remove useless check in cm_event_handler 9p: p9dirent_read: check network-provided name length 9p locks: fix glock.client_id leak in do_lock fs/9p/acl.c | 2 +- fs/9p/vfs_file.c| 16 ++-- net/9p/protocol.c | 12 +--- net/9p/trans_rdma.c | 3 +-- 4 files changed, 25 insertions(+), 8 deletions(-) -- 2.17.1
[PATCH 4/4] 9p locks: fix glock.client_id leak in do_lock
From: Dominique Martinet the 9p client code overwrites our glock.client_id pointing to a static buffer by an allocated string holding the network provided value which we do not care about; free and reset the value as appropriate. This is almost identical to the leak in v9fs_file_getlock() fixed by Al Viro in commit ce85dd58ad5a6 ("9p: we are leaking glock.client_id in v9fs_file_getlock()"), which was returned as an error by a coverity false positive -- while we are here attempt to make the code slightly more robust to future change of the net/9p/client code and hopefully more clear to coverity that there is no problem. Signed-off-by: Dominique Martinet --- fs/9p/vfs_file.c | 16 ++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 73857ebaedfb..a25efa782fcc 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -208,6 +208,14 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) if (schedule_timeout_interruptible(v9ses->session_lock_timeout) != 0) break; + /* +* p9_client_lock_dotl overwrites flock.client_id with the +* server message, free and reuse the client name +*/ + if (flock.client_id != fid->clnt->name) { + kfree(flock.client_id); + flock.client_id = fid->clnt->name; + } } /* map 9p status to VFS status */ @@ -239,6 +247,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) locks_lock_file_wait(filp, fl); fl->fl_type = fl_type; } + if (flock.client_id != fid->clnt->name) + kfree(flock.client_id); out: return res; } @@ -273,7 +283,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) res = p9_client_getlock_dotl(fid, ); if (res < 0) - return res; + goto out; /* map 9p lock type to os lock type */ switch (glock.type) { case P9_LOCK_TYPE_RDLCK: @@ -294,7 +304,9 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) fl->fl_end = glock.start + glock.length - 1; fl->fl_pid = -glock.proc_id; } - 
kfree(glock.client_id); +out: + if (glock.client_id != fid->clnt->name) + kfree(glock.client_id); return res; } -- 2.17.1
Re: [PATCH 01/12] blkcg: fix ref count issue with bio_blkcg using task_css
On Thu, Sep 06, 2018 at 05:10:34PM -0400, Dennis Zhou wrote: > From: "Dennis Zhou (Facebook)" > > The accessor function bio_blkcg either returns the blkcg associated with > the bio or finds one in the current context. This can cause an issue > when trying to associate a bio with a blkcg. Particularly, it's the > third case that is problematic: > > return css_to_blkcg(task_css(current, io_cgrp_id)); > > As the above may race against task migration and the cgroup exiting, it > is not always ok to take a reference on the blkcg returned from > bio_blkcg. > > This patch adds association ahead of calling bio_blkcg rather than > after. This makes association a required and explicit step along the > code paths for calling bio_blkcg. blk_get_rl is modified as well to get > a reference to the blkcg it may use and blk_put_rl will always put the > reference back. Association is also moved above the bio_blkcg call to > ensure it will not return NULL in blk-iolatency. > > BFQ and CFQ utilize this flaw, but due to the complexity, I do not want > to address this in this series. I've created a private version of the > function with notes not to use it describing the flaw. Hopefully soon, > that code can be cleaned up. > > Signed-off-by: Dennis Zhou Acked-by: Tejun Heo -- tejun
[PATCH] firmware: arm_scmi: use strlcpy to ensure NULL-terminated strings
Replace all the memcpy() for copying name strings from the firmware with strlcpy() to make sure we are bounded by the source buffer size and we also always have NULL-terminated strings. This is needed to avoid out of bounds accesses if the firmware returns a non-terminated string. Reported-by: Olof Johansson Signed-off-by: Sudeep Holla --- drivers/firmware/arm_scmi/base.c| 2 +- drivers/firmware/arm_scmi/clock.c | 2 +- drivers/firmware/arm_scmi/perf.c| 2 +- drivers/firmware/arm_scmi/power.c | 2 +- drivers/firmware/arm_scmi/sensors.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) Hi Olof, Let me know if this is rc/fix material or need to wait for v4.20 ? Regards, Sudeep diff --git a/drivers/firmware/arm_scmi/base.c b/drivers/firmware/arm_scmi/base.c index 9dff33ea6416..204390297f4b 100644 --- a/drivers/firmware/arm_scmi/base.c +++ b/drivers/firmware/arm_scmi/base.c @@ -208,7 +208,7 @@ static int scmi_base_discover_agent_get(const struct scmi_handle *handle, ret = scmi_do_xfer(handle, t); if (!ret) - memcpy(name, t->rx.buf, SCMI_MAX_STR_SIZE); + strlcpy(name, t->rx.buf, SCMI_MAX_STR_SIZE); scmi_xfer_put(handle, t); diff --git a/drivers/firmware/arm_scmi/clock.c b/drivers/firmware/arm_scmi/clock.c index e4119eb34986..30fc04e28431 100644 --- a/drivers/firmware/arm_scmi/clock.c +++ b/drivers/firmware/arm_scmi/clock.c @@ -111,7 +111,7 @@ static int scmi_clock_attributes_get(const struct scmi_handle *handle, ret = scmi_do_xfer(handle, t); if (!ret) - memcpy(clk->name, attr->name, SCMI_MAX_STR_SIZE); + strlcpy(clk->name, attr->name, SCMI_MAX_STR_SIZE); else clk->name[0] = '\0'; diff --git a/drivers/firmware/arm_scmi/perf.c b/drivers/firmware/arm_scmi/perf.c index 721e6c57beae..c3b0041defee 100644 --- a/drivers/firmware/arm_scmi/perf.c +++ b/drivers/firmware/arm_scmi/perf.c @@ -168,7 +168,7 @@ scmi_perf_domain_attributes_get(const struct scmi_handle *handle, u32 domain, le32_to_cpu(attr->sustained_perf_level); dom_info->mult_factor = (dom_info->sustained_freq_khz * 
1000) / dom_info->sustained_perf_level; - memcpy(dom_info->name, attr->name, SCMI_MAX_STR_SIZE); + strlcpy(dom_info->name, attr->name, SCMI_MAX_STR_SIZE); } scmi_xfer_put(handle, t); diff --git a/drivers/firmware/arm_scmi/power.c b/drivers/firmware/arm_scmi/power.c index cfa033b05aed..62f3401a1f01 100644 --- a/drivers/firmware/arm_scmi/power.c +++ b/drivers/firmware/arm_scmi/power.c @@ -106,7 +106,7 @@ scmi_power_domain_attributes_get(const struct scmi_handle *handle, u32 domain, dom_info->state_set_notify = SUPPORTS_STATE_SET_NOTIFY(flags); dom_info->state_set_async = SUPPORTS_STATE_SET_ASYNC(flags); dom_info->state_set_sync = SUPPORTS_STATE_SET_SYNC(flags); - memcpy(dom_info->name, attr->name, SCMI_MAX_STR_SIZE); + strlcpy(dom_info->name, attr->name, SCMI_MAX_STR_SIZE); } scmi_xfer_put(handle, t); diff --git a/drivers/firmware/arm_scmi/sensors.c b/drivers/firmware/arm_scmi/sensors.c index 27f2092b9882..b53d5cc9c9f6 100644 --- a/drivers/firmware/arm_scmi/sensors.c +++ b/drivers/firmware/arm_scmi/sensors.c @@ -140,7 +140,7 @@ static int scmi_sensor_description_get(const struct scmi_handle *handle, s = >sensors[desc_index + cnt]; s->id = le32_to_cpu(buf->desc[cnt].id); s->type = SENSOR_TYPE(attrh); - memcpy(s->name, buf->desc[cnt].name, SCMI_MAX_STR_SIZE); + strlcpy(s->name, buf->desc[cnt].name, SCMI_MAX_STR_SIZE); } desc_index += num_returned; -- 2.7.4
Re: [PATCH 2/2] Add tests for memory.oom.group
On Fri, Sep 07, 2018 at 09:49:24AM -0700, jgka...@fb.com wrote: > From: Jay Kamat > > Add tests for memory.oom.group for the following cases: > - Killing all processes in a leaf cgroup, but leaving the > parent untouched > - Killing all processes in a parent and leaf cgroup > - Keeping processes marked by OOM_SCORE_ADJ_MIN alive when considered > for being killed by the group oom killer. > > Signed-off-by: Jay Kamat Acked-by: Roman Gushchin
[PATCH 5/6] proc: commit to genradix
the new generic radix trees have a simpler API and implementation, and no limitations on number of elements, so all flex_array users are being converted Signed-off-by: Kent Overstreet Cc: Al Viro --- fs/proc/base.c | 43 +++ 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index aaffc0c302..e11fbb390a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include #include @@ -92,7 +93,6 @@ #include #include #include -#include #include #include #include "internal.h" @@ -2128,11 +2128,12 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) struct task_struct *task; struct mm_struct *mm; unsigned long nr_files, pos, i; - struct flex_array *fa = NULL; - struct map_files_info info; + GENRADIX(struct map_files_info) fa; struct map_files_info *p; int ret; + genradix_init(); + ret = -ENOENT; task = get_proc_task(file_inode(file)); if (!task) @@ -2164,35 +2165,22 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) */ for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { - if (vma->vm_file && ++pos > ctx->pos) - nr_files++; - } + if (!vma->vm_file) + continue; + if (++pos <= ctx->pos) + continue; - if (nr_files) { - fa = flex_array_alloc(sizeof(info), nr_files, - GFP_KERNEL); - if (!fa || flex_array_prealloc(fa, 0, nr_files, - GFP_KERNEL)) { + p = genradix_ptr_alloc(, nr_files++, GFP_KERNEL); + if (!p) { ret = -ENOMEM; - if (fa) - flex_array_free(fa); up_read(>mmap_sem); mmput(mm); goto out_put_task; } - for (i = 0, vma = mm->mmap, pos = 2; vma; - vma = vma->vm_next) { - if (!vma->vm_file) - continue; - if (++pos <= ctx->pos) - continue; - info.start = vma->vm_start; - info.end = vma->vm_end; - info.mode = vma->vm_file->f_mode; - if (flex_array_put(fa, i++, , GFP_KERNEL)) - BUG(); - } + p->start = vma->vm_start; + p->end = vma->vm_end; + p->mode = vma->vm_file->f_mode; } up_read(>mmap_sem); mmput(mm); @@ -2201,7 +2189,7 @@ 
proc_map_files_readdir(struct file *file, struct dir_context *ctx) char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */ unsigned int len; - p = flex_array_get(fa, i); + p = genradix_ptr(, i); len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end); if (!proc_fill_cache(file, ctx, buf, len, @@ -2211,12 +2199,11 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) break; ctx->pos++; } - if (fa) - flex_array_free(fa); out_put_task: put_task_struct(task); out: + genradix_free(); return ret; } -- 2.19.0.rc2
Re: [PATCH 4.4 105/124] xen-netfront: wait xenbus state change when load module manually
On 09/06/2018 08:21 PM, Ben Hutchings wrote: > On Sat, 2018-08-04 at 11:01 +0200, Greg Kroah-Hartman wrote: >> 4.4-stable review patch. If anyone has any objections, please let me know. >> >> -- >> >> From: Xiao Liang >> >> [ Upstream commit 822fb18a82abaf4ee7058793d95d340f5dab7bfc ] >> >> When loading module manually, after call xenbus_switch_state to initializes >> the state of the netfront device, the driver state did not change so fast >> that may lead no dev created in latest kernel. This patch adds wait to make >> sure xenbus knows the driver is not in closed/unknown state. > [...] >> --- a/drivers/net/xen-netfront.c >> +++ b/drivers/net/xen-netfront.c >> @@ -86,6 +86,7 @@ struct netfront_cb { >> /* IRQ name is queue name with "-tx" or "-rx" appended */ >> #define IRQ_NAME_SIZE (QUEUE_NAME_SIZE + 3) >> >> +static DECLARE_WAIT_QUEUE_HEAD(module_load_q); >> static DECLARE_WAIT_QUEUE_HEAD(module_unload_q); >> >> struct netfront_stats { >> @@ -1335,6 +1336,11 @@ static struct net_device *xennet_create_ >> netif_carrier_off(netdev); >> >> xenbus_switch_state(dev, XenbusStateInitialising); >> +wait_event(module_load_q, >> + xenbus_read_driver_state(dev->otherend) != >> + XenbusStateClosed && >> + xenbus_read_driver_state(dev->otherend) != >> + XenbusStateUnknown); >> return netdev; >> >> exit: > This won't work; it will hang. Something (maybe netback_changed()?) > needs to wake up tasks on the module_load_q. https://lkml.org/lkml/2018/9/7/691 -boris
Re: [PATCH v2 00/32] Device Tree Updates for GTA04 (A3/A4/A5 variants)
* Tony Lindgren [180829 19:40]: > * H. Nikolaus Schaller [180829 07:24]: > > Hi OMAP3 DTS Maintainers, > > is there any progress in merging this patch series? > > Looks good to me in general, I'll be getting into applying > patches for v4.20 in few days. Applying all into omap-for-v4.20/dt finally thanks. Regards, Tony
Re: Regression in next with filesystem context concept
Hi, On Fri, 7 Sep 2018 09:10:23 -0700 Tony Lindgren wrote: > * David Howells [180907 08:51]: > > Tony Lindgren wrote: > > > > > Looks like next-20180906 now has a regression where mounting > > > root won't work with commit fd0002870b45 ("vfs: Implement a > > > filesystem superblock creation/configuration context"). > > > > Am I right in thinking you're not using any of the LSMs? > > Assuming LSM as in Documentation/lsm.txt, right not using any. > > BTW, I don't think this issue shows up with ramdisk either, > so that's probably why for example kernelci.org does not > show errors. > I have also similar experience with my automated tests (automated alarming does not work yet ;-)), I am still in the beginning. I do there a ramdisk boot to create an overlay mount with the fresh modules on top of an ordinary rootfs. initramfs mount is ok, but the microsd card fails. Testing from a ramdisk I get: / # ls -l /dev/mmcblk0p2 brw---1 00 179, 2 Jan 1 1970 /dev/mmcblk0p2 / # mount /dev/mmcblk0p2 /mnt/ [ 682.819061] Filesystem requires source device [ 682.825103] Filesystem requires source device [ 682.830810] Filesystem requires source device [ 682.836303] Filesystem requires source device [ 682.843078] Filesystem requires source device [ 682.847991] Filesystem requires source device [ 682.853149] Filesystem requires source device mount: mounting /dev/mmcblk0p2 on /mnt/ failed: No such file or directory 64GB microsd at omap_hsmmc correctly recognized. Last known successful boot: next-20180830 so you are not alone with such problems. will investigate further Regards, Andreas pgpHygN3ZoJxx.pgp Description: OpenPGP digital signature
[RESEND PATCH] mm: percpu: remove unnecessary unlikely()
WARN_ON() already contains an unlikely(), so it's not necessary to wrap it into another. Signed-off-by: Igor Stoppa Acked-by: Dennis Zhou Cc: Tejun Heo Cc: zijun_hu Cc: Christoph Lameter Cc: linux...@kvack.org Cc: linux-kernel@vger.kernel.org --- mm/percpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/percpu.c b/mm/percpu.c index a749d4d96e3e..f5c2796fe63e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2588,7 +2588,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, BUG_ON(ai->nr_groups != 1); upa = ai->alloc_size/ai->unit_size; nr_g0_units = roundup(num_possible_cpus(), upa); - if (unlikely(WARN_ON(ai->groups[0].nr_units != nr_g0_units))) { + if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) { pcpu_free_alloc_info(ai); return -EINVAL; } -- 2.17.1
Re: [PATCH v2 2/9] nios2: build .dtb files in dts directory
On Thu, Sep 6, 2018 at 9:21 PM Ley Foon Tan wrote: > > On Wed, 2018-09-05 at 18:53 -0500, Rob Herring wrote: > > Align nios2 with other architectures which build the dtb files in the > > same directory as the dts files. This is also in line with most other > > build targets which are located in the same directory as the source. > > This move will help enable the 'dtbs' target which builds all the > > dtbs > > regardless of kernel config. > > > > This transition could break some scripts if they expect dtb files in > > the old location. > > > > Cc: Ley Foon Tan > > Cc: nios2-...@lists.rocketboards.org > > Signed-off-by: Rob Herring > > --- > > Please ack so I can take the whole series via the DT tree. > > > > arch/nios2/Makefile | 4 ++-- > > arch/nios2/boot/Makefile | 4 > > arch/nios2/boot/dts/Makefile | 1 + > > 3 files changed, 3 insertions(+), 6 deletions(-) > > create mode 100644 arch/nios2/boot/dts/Makefile > > > > diff --git a/arch/nios2/Makefile b/arch/nios2/Makefile > > index 8673a79dca9c..50eece1c6adb 100644 > > --- a/arch/nios2/Makefile > > +++ b/arch/nios2/Makefile > > @@ -59,10 +59,10 @@ archclean: > > $(Q)$(MAKE) $(clean)=$(nios2-boot) > > > > %.dtb: | scripts > > - $(Q)$(MAKE) $(build)=$(nios2-boot) $(nios2-boot)/$@ > > + $(Q)$(MAKE) $(build)=$(nios2-boot)/dts $(nios2-boot)/dts/$@ > > > > dtbs: > > - $(Q)$(MAKE) $(build)=$(nios2-boot) $(nios2-boot)/$@ > > + $(Q)$(MAKE) $(build)=$(nios2-boot)/dts > > > > $(BOOT_TARGETS): vmlinux > > $(Q)$(MAKE) $(build)=$(nios2-boot) $(nios2-boot)/$@ > > diff --git a/arch/nios2/boot/Makefile b/arch/nios2/boot/Makefile > > index 2ba23a679732..007586094dde 100644 > > --- a/arch/nios2/boot/Makefile > > +++ b/arch/nios2/boot/Makefile > > @@ -47,10 +47,6 @@ obj-$(CONFIG_NIOS2_DTB_SOURCE_BOOL) += > > linked_dtb.o > > > > targets += $(dtb-y) > > > > -# Rule to build device tree blobs with make command > > -$(obj)/%.dtb: $(src)/dts/%.dts FORCE > > - $(call if_changed_dep,dtc) > > - > > $(obj)/dtbs: $(addprefix $(obj)/, $(dtb-y)) 
> > > > install: > > diff --git a/arch/nios2/boot/dts/Makefile > > b/arch/nios2/boot/dts/Makefile > > new file mode 100644 > > index ..f66554cd5c45 > > --- /dev/null > > +++ b/arch/nios2/boot/dts/Makefile > > @@ -0,0 +1 @@ > > +# SPDX-License-Identifier: GPL-2.0 > > -- > > 2.17.1 > > > Hi Rob > > I have synced your all-dtbs branch from here: https://git.kernel.org/pu > b/scm/linux/kernel/git/robh/linux.git/log/?h=all-dtbs > > It shows error when compile kernel image and also when "make > dtbs_install". Can you fetch the branch again and try it. I fixed a few dependency issues. > make dtbs_install > make[1]: *** No rule to make target > 'arch/nios2/boot/dts/arch/nios2/boot/dts/10m50_devboard.dtb', needed by > 'arch/nios2/boot/dts/arch/nios2/boot/dts/10m50_devboard.dtb.S'. Stop. What is the value of CONFIG_NIOS2_DTB_SOURCE? As patch 3 notes, it now should not have any path. If that's a problem, I could take the basename to strip the path, but then sub directories wouldn't work either. BTW, next up, I want to consolidate the config variables for built-in dtbs. Rob
Re: Conflict between sparse and commit cafa0010cd51f ("Raise the minimum required gcc version to 4.6")
On Fri, Sep 7, 2018 at 11:13 AM Luc Van Oostenryck wrote: > > On Fri, Sep 07, 2018 at 10:22:56AM -0700, Nick Desaulniers wrote: > > On Fri, Sep 7, 2018 at 7:34 AM Christophe LEROY > > wrote: > > > > > > Cc linux-spa...@vger.kernel.org > > > > > > Le 07/09/2018 à 14:22, Christophe Leroy a écrit : > > > > Since commit cafa0010cd51f ("Raise the minimum required gcc version to > > > > 4.6"), sparse check fails as follows: > > > > > > > > [root@pc16082vm linux-powerpc]# make C=2 arch/powerpc/kernel/process.o > > > >CALLscripts/checksyscalls.sh > > > >CHECK scripts/mod/empty.c > > > > ./include/linux/compiler-gcc.h:14:3: error: Sorry, your compiler is too > > > > old - please upgrade it. > > > >CHECK arch/powerpc/kernel/process.c > > > > ./include/linux/compiler-gcc.h:14:3: error: Sorry, your compiler is too > > > > old - please upgrade it. > > > > > > > > > > > > I have sparse version 0.5.2 > > > > > > > > What can be done to fix that ? > > > > > > > > Christophe > > > > Oof, sorry Christophe. Looks like that's the latest version of sparse: > > https://sparse.wiki.kernel.org/index.php/Main_Page#News > > > > I'm curious what sparse expands __GNUC__, __GNUC_MINOR__, and > > __GNUC_PATCHLEVEL__ to? Pre commit cafa0010cd51f, it MUST be > > expanding them to something, otherwise you'd have seen the error then, > > too. The previous check was GCC < 3.3, now it's GCC < 4.6. > > Sparse expand these macros to the same version than the compiler used > to compile GCC. I find a bit strange though to have sparse v0.5.2 but > using an old compiler. So Christophe must have a version of gcc < 4.6 installed somewhere? Does sparse use `cc`? If so, Christophe, does your `ls -l $(which cc)` point to an old version of gcc maybe? > > Also, it's worth to look at what is said in this email: > > https://lore.kernel.org/lkml/ca+55afzyenzr2gzlr-dwponjmnygyody+6awlcvnaywiazu...@mail.gmail.com/ > > > -- Luc -- Thanks, ~Nick Desaulniers
Re: [PATCH 1/2] Fix cg_read_strcmp()
Shuah Khan writes: > On 09/07/2018 10:49 AM, jgka...@fb.com wrote: >> From: Jay Kamat >> >> Fix a couple issues with cg_read_strcmp(), to improve correctness of >> cgroup tests >> - Fix cg_read_strcmp() always returning 0 for empty "needle" strings >> - Fix a memory leak in cg_read_strcmp() >> >> Fixes: 84092dbcf901 ("selftests: cgroup: add memory controller self-tests") >> >> Signed-off-by: Jay Kamat >> --- >> tools/testing/selftests/cgroup/cgroup_util.c | 17 ++--- >> 1 file changed, 14 insertions(+), 3 deletions(-) >> >> diff --git a/tools/testing/selftests/cgroup/cgroup_util.c >> b/tools/testing/selftests/cgroup/cgroup_util.c >> index 1e9e3c470561..8b644ea39725 100644 >> --- a/tools/testing/selftests/cgroup/cgroup_util.c >> +++ b/tools/testing/selftests/cgroup/cgroup_util.c >> @@ -89,17 +89,28 @@ int cg_read(const char *cgroup, const char *control, >> char *buf, size_t len) >> int cg_read_strcmp(const char *cgroup, const char *control, >> const char *expected) >> { >> -size_t size = strlen(expected) + 1; >> +size_t size; >> char *buf; >> +int ret; >> + >> +/* Handle the case of comparing against empty string */ >> +if (!expected) >> +size = 32; > > This doesn't look right. I would think expected shouldn't be null? > It gets used below. > >> +else >> +size = strlen(expected) + 1; >> >> buf = malloc(size); >> if (!buf) >> return -1; >> >> -if (cg_read(cgroup, control, buf, size)) >> +if (cg_read(cgroup, control, buf, size)) { >> +free(buf); >> return -1; >> +} >> >> -return strcmp(expected, buf); >> +ret = strcmp(expected, buf); > > If expected is null, what's the point in running the test? > Is empty "needle" string a valid test scenario? 
There are a couple places where an empty "needle" string is used currently: - cg_test_proc_killed (newly added in the next patch): Verify cgroup.procs is empty (there are no processes running) - test_memcg_oom_events: Verify cgroup.procs is empty Previously, when passing in an empty needle string, this function would always return 0, as the size allocated (1) would not be enough to read any data in 'cg_read', and strcmp would compare two null strings. > >> +free(buf); >> +return ret; >> } >> >> int cg_read_strstr(const char *cgroup, const char *control, const char >> *needle) >> > > thanks, > -- Shuah I could definitely remove the unneeded strcmp in the null 'expected' case, but I am worried it would feel a bit too hacky or add too much duplication. Would something like this be the best solution? If you had something else in mind (or if I'm misunderstanding something), please let me know, and I'll update the patchset! size_t size; char *buf; int ret; /* Handle the case of comparing against empty string */ if (!expected) size = 32; else size = strlen(expected) + 1; buf = malloc(size); if (!buf) return -1; if (cg_read(cgroup, control, buf, size)) { free(buf); return -1; } if (!expected) ret = !buf; else ret = strcmp(expected, buf); free(buf); return ret; Thanks, -Jay
Re: Conflict between sparse and commit cafa0010cd51f ("Raise the minimum required gcc version to 4.6")
On Fri, Sep 07, 2018 at 11:19:43AM -0700, Nick Desaulniers wrote: > On Fri, Sep 7, 2018 at 11:13 AM Luc Van Oostenryck wrote: > > > > Sparse expands these macros to the same version as the compiler used > > to compile GCC. I find it a bit strange though to have sparse v0.5.2 but > > using an old compiler. > > So Christophe must have a version of gcc < 4.6 installed somewhere? It looks so. > Does sparse use `cc`? By default sparse uses gcc (this can be overridden by using CC=...). -- Luc
[PATCH v2] optee: allow to work without static shared memory
From: Volodymyr Babchuk On virtualized systems it is possible that OP-TEE will provide only dynamic shared memory support. So it is fine to boot without static SHM enabled if dynamic one is supported. Signed-off-by: Volodymyr Babchuk --- Changes from v1: Patch can now be applied to vanilla kernel instead of linaro's op-tee branch drivers/tee/optee/core.c | 80 +--- 1 file changed, 49 insertions(+), 31 deletions(-) diff --git a/drivers/tee/optee/core.c b/drivers/tee/optee/core.c index e1aafe8..efd2e5c 100644 --- a/drivers/tee/optee/core.c +++ b/drivers/tee/optee/core.c @@ -419,9 +419,35 @@ static bool optee_msg_exchange_capabilities(optee_invoke_fn *invoke_fn, return true; } +static struct tee_shm_pool *optee_config_dyn_shm(void) +{ + struct tee_shm_pool_mgr *priv_mgr; + struct tee_shm_pool_mgr *dmabuf_mgr; + void *rc; + + rc = optee_shm_pool_alloc_pages(); + if (IS_ERR(rc)) + return rc; + priv_mgr = rc; + + rc = optee_shm_pool_alloc_pages(); + if (IS_ERR(rc)) { + tee_shm_pool_mgr_destroy(priv_mgr); + return rc; + } + dmabuf_mgr = rc; + + rc = tee_shm_pool_alloc(priv_mgr, dmabuf_mgr); + if (IS_ERR(rc)) { + tee_shm_pool_mgr_destroy(priv_mgr); + tee_shm_pool_mgr_destroy(dmabuf_mgr); + } + + return rc; +} + static struct tee_shm_pool * -optee_config_shm_memremap(optee_invoke_fn *invoke_fn, void **memremaped_shm, - u32 sec_caps) +optee_config_shm_memremap(optee_invoke_fn *invoke_fn, void **memremaped_shm) { union { struct arm_smccc_res smccc; @@ -436,10 +462,11 @@ optee_config_shm_memremap(optee_invoke_fn *invoke_fn, void **memremaped_shm, struct tee_shm_pool_mgr *priv_mgr; struct tee_shm_pool_mgr *dmabuf_mgr; void *rc; + const int sz = OPTEE_SHM_NUM_PRIV_PAGES * PAGE_SIZE; invoke_fn(OPTEE_SMC_GET_SHM_CONFIG, 0, 0, 0, 0, 0, 0, 0, ); if (res.result.status != OPTEE_SMC_RETURN_OK) { - pr_info("shm service not available\n"); + pr_err("static shm service not available\n"); return ERR_PTR(-ENOENT); } @@ -465,28 +492,15 @@ optee_config_shm_memremap(optee_invoke_fn
*invoke_fn, void **memremaped_shm, } vaddr = (unsigned long)va; - /* -* If OP-TEE can work with unregistered SHM, we will use own pool -* for private shm -*/ - if (sec_caps & OPTEE_SMC_SEC_CAP_DYNAMIC_SHM) { - rc = optee_shm_pool_alloc_pages(); - if (IS_ERR(rc)) - goto err_memunmap; - priv_mgr = rc; - } else { - const size_t sz = OPTEE_SHM_NUM_PRIV_PAGES * PAGE_SIZE; - - rc = tee_shm_pool_mgr_alloc_res_mem(vaddr, paddr, sz, - 3 /* 8 bytes aligned */); - if (IS_ERR(rc)) - goto err_memunmap; - priv_mgr = rc; - - vaddr += sz; - paddr += sz; - size -= sz; - } + rc = tee_shm_pool_mgr_alloc_res_mem(vaddr, paddr, sz, + 3 /* 8 bytes aligned */); + if (IS_ERR(rc)) + goto err_memunmap; + priv_mgr = rc; + + vaddr += sz; + paddr += sz; + size -= sz; rc = tee_shm_pool_mgr_alloc_res_mem(vaddr, paddr, size, PAGE_SHIFT); if (IS_ERR(rc)) @@ -552,7 +566,7 @@ static optee_invoke_fn *get_invoke_func(struct device_node *np) static struct optee *optee_probe(struct device_node *np) { optee_invoke_fn *invoke_fn; - struct tee_shm_pool *pool; + struct tee_shm_pool *pool = ERR_PTR(-EINVAL); struct optee *optee = NULL; void *memremaped_shm = NULL; struct tee_device *teedev; @@ -581,13 +595,17 @@ static struct optee *optee_probe(struct device_node *np) } /* -* We have no other option for shared memory, if secure world -* doesn't have any reserved memory we can use we can't continue. +* Try to use dynamic shared memory if possible */ - if (!(sec_caps & OPTEE_SMC_SEC_CAP_HAVE_RESERVED_SHM)) - return ERR_PTR(-EINVAL); + if (sec_caps & OPTEE_SMC_SEC_CAP_DYNAMIC_SHM) + pool = optee_config_dyn_shm(); + + /* +* If dynamic shared memory is not available or failed - try static one +*/ + if (IS_ERR(pool) && (sec_caps & OPTEE_SMC_SEC_CAP_HAVE_RESERVED_SHM)) + pool = optee_config_shm_memremap(invoke_fn, _shm); - pool = optee_config_shm_memremap(invoke_fn, _shm, sec_caps); if (IS_ERR(pool)) return (void *)pool; -- 2.7.4
[GIT PULL] MD update for 4.19-rc2
Hi, Please pull MD fixes for 4.19-rc2: - Fix a locking issue for md-cluster from Guoqing - Fix a sync crash for raid10 from Ni - Fix a reshape bug with raid5 cache enabled from Me Thanks, Shaohua The following changes since commit 420f51f4ab6bce6e580390729fadb89c31123636: Merge tag 'arm64-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux (2018-08-31 09:20:30 -0700) are available in the Git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/shli/md.git tags/md/4.19-rc2 for you to fetch changes up to 41a95041126522a921fb73df22cbdd520dfdebad: md-cluster: release RESYNC lock after the last resync message (2018-08-31 17:38:10 -0700) Guoqing Jiang (1): md-cluster: release RESYNC lock after the last resync message Shaohua Li (1): md/raid5-cache: disable reshape completely Xiao Ni (1): RAID10 BUG_ON in raise_barrier when force is true and conf->barrier is 0 drivers/md/md-cluster.c | 10 +- drivers/md/raid10.c | 5 - drivers/md/raid5-log.h | 5 + drivers/md/raid5.c | 6 +++--- 4 files changed, 17 insertions(+), 9 deletions(-)
Re: [PATCH] mtd: rawnand: denali: add DT property to specify skipped bytes in OOB
Hi Boris, 2018-09-07 23:53 GMT+09:00 Boris Brezillon : > On Fri, 7 Sep 2018 23:42:53 +0900 > Masahiro Yamada wrote: > >> Hi Boris, >> >> 2018-09-07 23:08 GMT+09:00 Boris Brezillon : >> > Hi Masahiro, >> > >> > On Fri, 7 Sep 2018 19:56:23 +0900 >> > Masahiro Yamada wrote: >> > >> >> NAND devices need additional data area (OOB) for error correction, >> >> but it is also used for Bad Block Marker (BBM). In many cases, the >> >> first byte in OOB is used for BBM, but the location actually depends >> >> on chip vendors. The NAND controller should preserve the precious >> >> BBM to keep track of bad blocks. >> >> >> >> In Denali IP, the SPARE_AREA_SKIP_BYTES register is used to specify >> >> the number of bytes to skip from the start of OOB. The ECC engine >> >> will automatically skip the specified number of bytes when it gets >> >> access to OOB area. >> >> >> >> The same value for SPARE_AREA_SKIP_BYTES should be used between >> >> firmware and the operating system if you intend to use the NAND >> >> device across the control hand-off. >> >> >> >> In fact, the current denali.c code expects firmware to have already >> >> set the SPARE_AREA_SKIP_BYTES register, then reads the value out. >> >> >> >> If no firmware (or bootloader) has initialized the controller, the >> >> register value is zero, which is the default after power-on-reset. >> >> >> >> In other words, the Linux driver cannot initialize the controller >> >> by itself. You cannot support the reset control either because >> >> resetting the controller will get register values lost. >> >> >> >> This commit adds a way to specify it via DT. If the property >> >> "denali,oob-skip-bytes" exists, the value will be set to the register. >> > >> > Hm, do we really need to make this config customizable? I mean, either >> > you have a large-page NAND (page > 512 bytes) and the 2 first bytes >> > must be reserved for the BBM or you have a small-page NAND and the BBM >> > is at position 4 and 5. 
Are you sure people configure that differently? >> > Don't you always have SPARE_AREA_SKIP_BYTES set to 6 or 2? >> >> >> As I said in the patch description, >> I need to use the same SPARE_AREA_SKIP_BYTES value >> across firmware, boot-loader, Linux, and whatever. >> >> I want to set the value to 8 for my platform >> because the on-chip boot ROM expects 8. >> I cannot change it since the boot ROM is hard-wired. >> >> >> The boot ROM skips 8 bytes in OOB >> when it loads images from the on-board NAND device. >> >> So, when I update the image from U-Boot or Linux, >> I need to make sure to set the register to 8. >> >> If I update the image with a different value, >> the Boot ROM fails to boot. >> >> >> >> When the system has booted from NAND, >> the register is already set to 8. It works. >> >> However, when the system has booted from eMMC, >> the register is not initialized by anyone. >> I am searching for a way to set the register to 8 >> in this case. >> >> >> The boot ROM in SOCFPGA might expect a different value, >> I am not sure. > > Okay, then why not having a per-compatible value if it's related to the > BootROM? Unless the BootROM is part of the FPGA and can be > reprogrammed. FPGA is unrelated here. Neither the boot ROM nor the Denali core is re-programmable. I hesitate to associate the number of skipped bytes with the compatible string because it is not a parameter of the Denali IP. Rather, it is the matter of "how we use the OOB", so I want to leave room for customization like nand-ecc-strength etc. even if the boot ROM happens to expect a particular value. If you prefer a per-compatible value, I can do that, but I believe the NAND core and the boot ROM are orthogonal. > I'd really prefer not having a generic property that > allows you to put anything you want. -- Best Regards Masahiro Yamada
Re: Regression in next with filesystem context concept
* David Howells [180907 08:51]: > Tony Lindgren wrote: > > > Looks like next-20180906 now has a regression where mounting > > root won't work with commit fd0002870b45 ("vfs: Implement a > > filesystem superblock creation/configuration context"). > > Am I right in thinking you're not using any of the LSMs? Assuming LSM as in Documentation/lsm.txt, right, not using any. BTW, I don't think this issue shows up with ramdisk either, so that's probably why, for example, kernelci.org does not show errors. Regards, Tony
[PATCH] cpufreq: remove unnecessary unlikely()
WARN_ON() already contains an unlikely(), so it's not necessary to wrap it into another. Signed-off-by: Igor Stoppa Cc: Srivatsa S. Bhat Cc: "Rafael J. Wysocki" Cc: linux...@vger.kernel.org Cc: linux-kernel@vger.kernel.org --- drivers/cpufreq/cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index f53fb41efb7b..7aa3dcad2175 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -403,7 +403,7 @@ EXPORT_SYMBOL_GPL(cpufreq_freq_transition_begin); void cpufreq_freq_transition_end(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs, int transition_failed) { - if (unlikely(WARN_ON(!policy->transition_ongoing))) + if (WARN_ON(!policy->transition_ongoing)) return; cpufreq_notify_post_transition(policy, freqs, transition_failed); -- 2.17.1
Re: [PATCH v2 3/3] x86/pti/64: Remove the SYSCALL64 entry trampoline
On Mon, Sep 03, 2018 at 03:59:44PM -0700, Andy Lutomirski wrote: > The SYSCALL64 trampoline has a couple of nice properties: > > - The usual sequence of SWAPGS followed by two GS-relative accesses to >set up RSP is somewhat slow because the GS-relative accesses need >to wait for SWAPGS to finish. The trampoline approach allows >RIP-relative accesses to set up RSP, which avoids the stall. > > - The trampoline avoids any percpu access before CR3 is set up, >which means that no percpu memory needs to be mapped in the user >page tables. This prevents using Meltdown to read any percpu memory >outside the cpu_entry_area and prevents using timing leaks >to directly locate the percpu areas. > > The downsides of using a trampoline may outweigh the upsides, however. > It adds an extra non-contiguous I$ cache line to system calls, and it > forces an indirect jump to transfer control back to the normal kernel > text after CR3 is set up. The latter is because x86 lacks a 64-bit > direct jump instruction that could jump from the trampoline to the entry > text. With retpolines enabled, the indirect jump is extremely slow. > > This patch changes the code to map the percpu TSS into the user page > tables to allow the non-trampoline SYSCALL64 path to work under PTI. > This does not add a new direct information leak, since the TSS is > readable by Meltdown from the cpu_entry_area alias regardless. It > does allow a timing attack to locate the percpu area, but KASLR is > more or less a lost cause against local attack on CPUs vulnerable to > Meltdown regardless. As far as I'm concerned, on current hardware, > KASLR is only useful to mitigate remote attacks that try to attack > the kernel without first gaining RCE against a vulnerable user > process. > > On Skylake, with CONFIG_RETPOLINE=y and KPTI on, this reduces > syscall overhead from ~237ns to ~228ns. 
> > There is a possible alternative approach: we could instead move the > trampoline within 2G of the entry text and make a separate copy for > each CPU. Then we could use a direct jump to rejoin the normal > entry path. > > Signed-off-by: Andy Lutomirski The following commit should also be reverted: 4d99e4136580 ("perf machine: Workaround missing maps for x86 PTI entry trampolines") -- Josh
[REGRESSION] Commit 5745392e0c2b ("PCI: Apply the new generic I/O management on PCI IO hosts") breaks PCI for legacy virtio devices with kvmtool on arm64
Hi all, I'm seeing a regression in Linux guests since 4.17 under kvmtool, where legacy virtio devices using the PCI transport fail to probe. Legacy virtio PCI devices must be accessed via "I/O space" (e.g. BAR0, which is IORESOURCE_IO) and kvmtool assigns this to the guest physical range 0x0 - 0x1. On arm64, when the virtio legacy PCI driver calls pci_iomap() for this BAR, it expands to ioport_map(): static inline void __iomem *ioport_map(unsigned long port, unsigned int nr) { return PCI_IOBASE + (port & MMIO_UPPER_LIMIT); } Since the indirect PIO changes, MMIO_UPPER_LIMIT is defined as: /* * We reserve 0x4000 bytes for Indirect IO as so far this library is only * used by the HiSilicon LPC Host. If needed, we can reserve a wider IO * area by redefining the macro below. */ #define PIO_INDIRECT_SIZE 0x4000 #define MMIO_UPPER_LIMIT (IO_SPACE_LIMIT - PIO_INDIRECT_SIZE) which corrupts the BAR address. For example, kvmtool has the BAR pointing at 0x6200 on my system, but pci_iomap() actually maps offset 0x2200. Changing PIO_INDIRECT_SIZE to 0 gets things working again. Since this stuff doesn't revert nicely, I'm not sure how to proceed. Any thoughts? Generally, having a per-platform magic constant hardcoded in the PCI mapping code makes me feel slightly ill... Cheers, Will
Re: [PATCH 8/9] psi: pressure stall information for CPU, memory, and IO
On Fri, Sep 07, 2018 at 04:58:58PM +0200, Peter Zijlstra wrote: > On Fri, Sep 07, 2018 at 10:44:22AM -0400, Johannes Weiner wrote: > > > > This does the whole seqcount thing 6x, which is a bit of a waste. > > > > [...] > > > > > It's a bit cumbersome, but that's because of C. > > > > I was actually debating exactly this with Suren before, but since this > > is a super cold path I went with readability. I was also thinking that > > restarts could happen quite regularly under heavy scheduler load, and > > so keeping the individual retry sections small could be helpful - but > > I didn't instrument this in any way. > > I was hoping going over the whole thing once would reduce the time we > need to keep that line in shared mode and reduce traffic. And yes, this > path is cold, but I was thinking about reducing the interference on the > remote CPU. > > Alternatively, we memcpy the whole line under the seqlock and then do > everything later. > > Also, this only has a single cpu_clock() invocation. Good points. How about the below? 
It's still pretty readable, and generates compact code inside the now single retry section: 81ed464f: 44 89 ffmov%r15d,%edi 81ed4652: e8 00 00 00 00 callq 81ed4657 81ed4653: R_X86_64_PLT32 sched_clock_cpu-0x4 memcpy(times, groupc->times, sizeof(groupc->times)); 81ed4657: 49 8b 14 24 mov(%r12),%rdx state_start = groupc->state_start; 81ed465b: 48 8b 4b 50 mov0x50(%rbx),%rcx memcpy(times, groupc->times, sizeof(groupc->times)); 81ed465f: 48 89 54 24 30 mov%rdx,0x30(%rsp) 81ed4664: 49 8b 54 24 08 mov0x8(%r12),%rdx 81ed4669: 48 89 54 24 38 mov%rdx,0x38(%rsp) 81ed466e: 49 8b 54 24 10 mov0x10(%r12),%rdx 81ed4673: 48 89 54 24 40 mov%rdx,0x40(%rsp) memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); 81ed4678: 49 8b 55 00 mov0x0(%r13),%rdx 81ed467c: 48 89 54 24 24 mov%rdx,0x24(%rsp) 81ed4681: 41 8b 55 08 mov0x8(%r13),%edx 81ed4685: 89 54 24 2c mov%edx,0x2c(%rsp) --- diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 0f07749b60a4..595414599b98 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -197,17 +197,26 @@ static bool test_state(unsigned int *tasks, enum psi_states state) } } -static u32 get_recent_time(struct psi_group *group, int cpu, - enum psi_states state) +static void get_recent_times(struct psi_group *group, int cpu, u32 *times) { struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu); + unsigned int tasks[NR_PSI_TASK_COUNTS]; + u64 now, state_start; unsigned int seq; - u32 time, delta; + int s; + /* Snapshot a coherent view of the CPU state */ do { seq = read_seqcount_begin(>seq); + now = cpu_clock(cpu); + memcpy(times, groupc->times, sizeof(groupc->times)); + memcpy(tasks, groupc->tasks, sizeof(groupc->tasks)); + state_start = groupc->state_start; + } while (read_seqcount_retry(>seq, seq)); - time = groupc->times[state]; + /* Calculate state time deltas against the previous snapshot */ + for (s = 0; s < NR_PSI_STATES; s++) { + u32 delta; /* * In addition to already concluded states, we also * incorporate currently active states on the 
CPU, @@ -217,14 +226,14 @@ static u32 get_recent_time(struct psi_group *group, int cpu, * (u32) and our reported pressure close to what's * actually happening. */ - if (test_state(groupc->tasks, state)) - time += cpu_clock(cpu) - groupc->state_start; - } while (read_seqcount_retry(>seq, seq)); + if (test_state(tasks, s)) + times[s] += now - state_start; - delta = time - groupc->times_prev[state]; - groupc->times_prev[state] = time; + delta = times[s] - groupc->times_prev[s]; + groupc->times_prev[s] = times[s]; - return delta; + times[s] = delta; + } } static void calc_avgs(unsigned long avg[3], int missed_periods, @@ -267,18 +276,16 @@ static bool update_stats(struct psi_group *group) * loading, or even entirely idle CPUs. */ for_each_possible_cpu(cpu) { + u32 times[NR_PSI_STATES]; u32 nonidle; - nonidle = get_recent_time(group, cpu, PSI_NONIDLE); - nonidle = nsecs_to_jiffies(nonidle); -
Re: [PATCH 3/6] selinux: convert to kvmalloc
On Sat, Sep 08, 2018 at 02:08:03AM +0900, Tetsuo Handa wrote: > On 2018/09/08 1:56, Kent Overstreet wrote: > > @@ -329,8 +328,7 @@ int avtab_alloc(struct avtab *h, u32 nrules) > > nslot = MAX_AVTAB_HASH_BUCKETS; > > mask = nslot - 1; > > > > - h->htable = flex_array_alloc(sizeof(struct avtab_node *), nslot, > > -GFP_KERNEL | __GFP_ZERO); > > + h->htable = kvmalloc_array(nslot, sizeof(void *), GFP_KERNEL); > > if (!h->htable) > > return -ENOMEM; > > > > kvmalloc_array() does not imply __GFP_ZERO. Thanks, fixed