[PATCH v5 5/7] dmaengine: xilinx_dma: autodetect whether the HW supports scatter-gather

2018-09-07 Thread Andrea Merello
The AXIDMA and CDMA HW can be either the direct-access or the
scatter-gather version. These two versions are SW incompatible.

The driver can handle both versions: a DT property was used to
tell the driver whether to assume the HW is in scatter-gather mode.

This patch makes the driver autodetect this information. The DT
property is not required anymore.

No changes for VDMA.

Cc: Rob Herring 
Cc: Mark Rutland 
Cc: devicet...@vger.kernel.org
Cc: Radhey Shyam Pandey 
Signed-off-by: Andrea Merello 
Reviewed-by: Radhey Shyam Pandey 
---
Changes in v2:
- autodetect only in !VDMA case
Changes in v3:
- cc DT maintainers/ML
Changes in v4:
- fix typos in commit message
Changes in v5:
None
---
 drivers/dma/xilinx/xilinx_dma.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c
index b17f24e4ec35..78d0f2f8225e 100644
--- a/drivers/dma/xilinx/xilinx_dma.c
+++ b/drivers/dma/xilinx/xilinx_dma.c
@@ -86,6 +86,7 @@
 #define XILINX_DMA_DMASR_DMA_DEC_ERR   BIT(6)
 #define XILINX_DMA_DMASR_DMA_SLAVE_ERR BIT(5)
 #define XILINX_DMA_DMASR_DMA_INT_ERR   BIT(4)
+#define XILINX_DMA_DMASR_SG_MASK   BIT(3)
 #define XILINX_DMA_DMASR_IDLE  BIT(1)
 #define XILINX_DMA_DMASR_HALTEDBIT(0)
 #define XILINX_DMA_DMASR_DELAY_MASKGENMASK(31, 24)
@@ -407,7 +408,6 @@ struct xilinx_dma_config {
  * @dev: Device Structure
  * @common: DMA device structure
  * @chan: Driver specific DMA channel
- * @has_sg: Specifies whether Scatter-Gather is present or not
  * @mcdma: Specifies whether Multi-Channel is present or not
  * @flush_on_fsync: Flush on frame sync
  * @ext_addr: Indicates 64 bit addressing is supported by dma device
@@ -427,7 +427,6 @@ struct xilinx_dma_device {
struct device *dev;
struct dma_device common;
struct xilinx_dma_chan *chan[XILINX_DMA_MAX_CHANS_PER_DEVICE];
-   bool has_sg;
bool mcdma;
u32 flush_on_fsync;
bool ext_addr;
@@ -2400,7 +2399,6 @@ static int xilinx_dma_chan_probe(struct xilinx_dma_device 
*xdev,
 
chan->dev = xdev->dev;
chan->xdev = xdev;
-   chan->has_sg = xdev->has_sg;
chan->desc_pendingcount = 0x0;
chan->ext_addr = xdev->ext_addr;
/* This variable ensures that descriptors are not
@@ -2493,6 +2491,15 @@ static int xilinx_dma_chan_probe(struct 
xilinx_dma_device *xdev,
chan->stop_transfer = xilinx_dma_stop_transfer;
}
 
+   /* check if SG is enabled (only for AXIDMA and CDMA) */
+   if (xdev->dma_config->dmatype != XDMA_TYPE_VDMA) {
+   if (dma_ctrl_read(chan, XILINX_DMA_REG_DMASR) &
+   XILINX_DMA_DMASR_SG_MASK)
+   chan->has_sg = true;
+   dev_dbg(chan->dev, "ch %d: SG %s\n", chan->id,
+   chan->has_sg ? "enabled" : "disabled");
+   }
+
/* Initialize the tasklet */
	tasklet_init(&chan->tasklet, xilinx_dma_do_tasklet,
(unsigned long)chan);
@@ -2631,7 +2638,6 @@ static int xilinx_dma_probe(struct platform_device *pdev)
return PTR_ERR(xdev->regs);
 
/* Retrieve the DMA engine properties from the device tree */
-   xdev->has_sg = of_property_read_bool(node, "xlnx,include-sg");
xdev->max_buffer_len = GENMASK(XILINX_DMA_MAX_TRANS_LEN_MAX - 1, 0);
 
if (xdev->dma_config->dmatype == XDMA_TYPE_AXIDMA) {
-- 
2.17.1



[PATCH v5 4/7] dmaengine: xilinx_dma: program hardware supported buffer length

2018-09-07 Thread Andrea Merello
From: Radhey Shyam Pandey 

The AXI-DMA IP supports a configurable (c_sg_length_width) buffer length
register width, hence read the buffer length (xlnx,sg-length-width) DT
property and ensure that the driver doesn't program a buffer length
exceeding the supported limit. For VDMA and CDMA there is no change.
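
The tail of the diff below is cut off by the archive; the property
handling it introduces can be sketched as follows (reconstructed from
the hunk context and the changelog, so treat the exact warning
messages as illustrative):

	if (!of_property_read_u32(node, "xlnx,sg-length-width",
				  &len_width)) {
		if (len_width < XILINX_DMA_MAX_TRANS_LEN_MIN ||
		    len_width > XILINX_DMA_V2_MAX_TRANS_LEN_MAX) {
			dev_warn(xdev->dev,
				 "invalid xlnx,sg-length-width, using default\n");
		} else {
			if (len_width > XILINX_DMA_MAX_TRANS_LEN_MAX)
				dev_warn(xdev->dev,
					 "Please ensure that IP supports buffer length > 23 bits\n");
			xdev->max_buffer_len = GENMASK(len_width - 1, 0);
		}
	}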

Cc: Rob Herring 
Cc: Mark Rutland 
Cc: devicet...@vger.kernel.org
Signed-off-by: Radhey Shyam Pandey 
Signed-off-by: Michal Simek 
Signed-off-by: Andrea Merello  [rebase, reword]
---
Changes in v2:
- drop original patch and replace with the one in Xilinx tree
Changes in v3:
- cc DT maintainers/ML
Changes in v4:
- upper bound for the property should be 26, not 23
- add warn for width > 23 as per xilinx original patch
- rework due to changes introduced in 1/6
Changes in v5:
None
---
 drivers/dma/xilinx/xilinx_dma.c | 36 ++++++++++++++++++++++++++++--------
 1 file changed, 28 insertions(+), 8 deletions(-)

diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c
index aaa6de8a70e4..b17f24e4ec35 100644
--- a/drivers/dma/xilinx/xilinx_dma.c
+++ b/drivers/dma/xilinx/xilinx_dma.c
@@ -158,7 +158,9 @@
 #define XILINX_DMA_REG_BTT 0x28
 
 /* AXI DMA Specific Masks/Bit fields */
-#define XILINX_DMA_MAX_TRANS_LEN   GENMASK(22, 0)
+#define XILINX_DMA_MAX_TRANS_LEN_MIN   8
+#define XILINX_DMA_MAX_TRANS_LEN_MAX   23
+#define XILINX_DMA_V2_MAX_TRANS_LEN_MAX26
 #define XILINX_DMA_CR_COALESCE_MAX GENMASK(23, 16)
 #define XILINX_DMA_CR_CYCLIC_BD_EN_MASKBIT(4)
 #define XILINX_DMA_CR_COALESCE_SHIFT   16
@@ -418,6 +420,7 @@ struct xilinx_dma_config {
  * @rxs_clk: DMA s2mm stream clock
  * @nr_channels: Number of channels DMA device supports
  * @chan_id: DMA channel identifier
+ * @max_buffer_len: Max buffer length
  */
 struct xilinx_dma_device {
void __iomem *regs;
@@ -437,6 +440,7 @@ struct xilinx_dma_device {
struct clk *rxs_clk;
u32 nr_channels;
u32 chan_id;
+   u32 max_buffer_len;
 };
 
 /* Macros */
@@ -964,7 +968,7 @@ static int xilinx_dma_calc_copysize(struct xilinx_dma_chan 
*chan,
int size, int done)
 {
size_t copy = min_t(size_t, size - done,
-XILINX_DMA_MAX_TRANS_LEN);
+   chan->xdev->max_buffer_len);
 
if ((copy + done < size) &&
chan->xdev->common.copy_align) {
@@ -1011,7 +1015,7 @@ static enum dma_status xilinx_dma_tx_status(struct 
dma_chan *dchan,
list_for_each_entry(segment, &desc->segments, node) {
hw = &segment->hw;
residue += (hw->control - hw->status) &
-  XILINX_DMA_MAX_TRANS_LEN;
+  chan->xdev->max_buffer_len;
}
}
spin_unlock_irqrestore(&chan->lock, flags);
@@ -1263,7 +1267,7 @@ static void xilinx_cdma_start_transfer(struct 
xilinx_dma_chan *chan)
 
/* Start the transfer */
dma_ctrl_write(chan, XILINX_DMA_REG_BTT,
-   hw->control & XILINX_DMA_MAX_TRANS_LEN);
+   hw->control & chan->xdev->max_buffer_len);
}
 
list_splice_tail_init(&chan->pending_list, &chan->active_list);
@@ -1366,7 +1370,7 @@ static void xilinx_dma_start_transfer(struct 
xilinx_dma_chan *chan)
 
/* Start the transfer */
dma_ctrl_write(chan, XILINX_DMA_REG_BTT,
-  hw->control & XILINX_DMA_MAX_TRANS_LEN);
+  hw->control & chan->xdev->max_buffer_len);
}
 
list_splice_tail_init(&chan->pending_list, &chan->active_list);
@@ -1727,7 +1731,7 @@ xilinx_cdma_prep_memcpy(struct dma_chan *dchan, 
dma_addr_t dma_dst,
struct xilinx_cdma_tx_segment *segment;
struct xilinx_cdma_desc_hw *hw;
 
-   if (!len || len > XILINX_DMA_MAX_TRANS_LEN)
+   if (!len || len > chan->xdev->max_buffer_len)
return NULL;
 
desc = xilinx_dma_alloc_tx_descriptor(chan);
@@ -2596,7 +2600,7 @@ static int xilinx_dma_probe(struct platform_device *pdev)
struct xilinx_dma_device *xdev;
struct device_node *child, *np = pdev->dev.of_node;
struct resource *io;
-   u32 num_frames, addr_width;
+   u32 num_frames, addr_width, len_width;
int i, err;
 
/* Allocate and initialize the DMA engine structure */
@@ -2628,8 +2632,24 @@ static int xilinx_dma_probe(struct platform_device *pdev)
 
/* Retrieve the DMA engine properties from the device tree */
xdev->has_sg = of_property_read_bool(node, "xlnx,include-sg");
-   if (xdev->dma_config->dmatype == XDMA_TYPE_AXIDMA)
+   xdev->max_buffer_len = GENMASK(XILINX_DMA_MAX_TRANS_LEN_MAX - 1, 0);
+
+   if (xdev->dma_config->dmatype == XDMA_TYPE_AXIDMA) {
xdev->mcdma = of_property_read_bool(node, "xlnx,mcdma");

[PATCH v5 3/7] dt-bindings: dmaengine: xilinx_dma: add optional xlnx,sg-length-width property

2018-09-07 Thread Andrea Merello
The width of the "length register" cannot be autodetected, and it is now
specified with a DT property. Add documentation for it.

Cc: Rob Herring 
Cc: Mark Rutland 
Cc: devicet...@vger.kernel.org
Cc: Radhey Shyam Pandey 
Signed-off-by: Andrea Merello 
Reviewed-by: Radhey Shyam Pandey 
---
Changes in v2:
- change property name
- property is now optional
- cc DT maintainer
Changes in v3:
- reword
- cc DT maintainerS and ML
Changes in v4:
- specify the unit, the valid range and the default value
Changes in v5:
- commit message trivial fix
- fix spaces before tab
---
 Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt 
b/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt
index a2b8bfaec43c..5df4eac7300c 100644
--- a/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt
+++ b/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt
@@ -41,6 +41,10 @@ Optional properties:
 - xlnx,include-sg: Tells configured for Scatter-mode in
the hardware.
 Optional properties for AXI DMA:
+- xlnx,sg-length-width: Should be set to the width in bits of the length
+   register as configured in h/w. Takes values {8...26}. If the property
+   is missing or invalid then the default value 23 is used. This is the
+   maximum value that is supported by all IP versions.
 - xlnx,mcdma: Tells whether configured for multi-channel mode in the hardware.
 Optional properties for VDMA:
 - xlnx,flush-fsync: Tells which channel to Flush on Frame sync.
-- 
2.17.1



[PATCH v5 1/7] dmaengine: xilinx_dma: commonize DMA copy size calculation

2018-09-07 Thread Andrea Merello
This patch removes a bit of duplicated code by introducing a new
function that implements calculations for DMA copy size.

Suggested-by: Vinod Koul 
Signed-off-by: Andrea Merello 
---
Changes in v4:
- introduce this patch in the patch series
Changes in v5:
None
---
 drivers/dma/xilinx/xilinx_dma.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c
index 27b523530c4a..a3aaa0e34cc7 100644
--- a/drivers/dma/xilinx/xilinx_dma.c
+++ b/drivers/dma/xilinx/xilinx_dma.c
@@ -952,6 +952,19 @@ static int xilinx_dma_alloc_chan_resources(struct dma_chan 
*dchan)
return 0;
 }
 
+/**
+ * xilinx_dma_calc_copysize - Calculate the amount of data to copy
+ * @size: Total data that needs to be copied
+ * @done: Amount of data that has been already copied
+ *
+ * Return: Amount of data that has to be copied
+ */
+static int xilinx_dma_calc_copysize(int size, int done)
+{
+   return min_t(size_t, size - done,
+XILINX_DMA_MAX_TRANS_LEN);
+}
+
 /**
  * xilinx_dma_tx_status - Get DMA transaction status
  * @dchan: DMA channel
@@ -1791,8 +1804,8 @@ static struct dma_async_tx_descriptor 
*xilinx_dma_prep_slave_sg(
 * Calculate the maximum number of bytes to transfer,
 * making sure it is less than the hw limit
 */
-   copy = min_t(size_t, sg_dma_len(sg) - sg_used,
-XILINX_DMA_MAX_TRANS_LEN);
+   copy = xilinx_dma_calc_copysize(sg_dma_len(sg),
+   sg_used);
hw = &segment->hw;
 
/* Fill in the descriptor */
@@ -1896,8 +1909,7 @@ static struct dma_async_tx_descriptor 
*xilinx_dma_prep_dma_cyclic(
 * Calculate the maximum number of bytes to transfer,
 * making sure it is less than the hw limit
 */
-   copy = min_t(size_t, period_len - sg_used,
-XILINX_DMA_MAX_TRANS_LEN);
+   copy = xilinx_dma_calc_copysize(period_len, sg_used);
hw = &segment->hw;
xilinx_axidma_buf(chan, hw, buf_addr, sg_used,
  period_len * i);
-- 
2.17.1



[PATCH v5 2/7] dmaengine: xilinx_dma: in axidma slave_sg and dma_cyclic mode align split descriptors

2018-09-07 Thread Andrea Merello
Whenever a single or cyclic transaction is prepared, the driver
may end up splitting it over several SG descriptors in order
to deal with the HW maximum transfer length.

This could result in DMA operations starting from a misaligned
address, which seems fatal for the HW if DRE (Data Realignment Engine)
is not enabled.

This patch adjusts the transfer size in order to make sure all
operations start from an aligned address: every descriptor except the
last one is shrunk to a multiple of the copy alignment, so that the
following descriptor starts aligned.

Cc: Radhey Shyam Pandey 
Signed-off-by: Andrea Merello 
Reviewed-by: Radhey Shyam Pandey 
---
Changes in v2:
- don't introduce copy_mask field, rather rely on already-existent
  copy_align field. Suggested by Radhey Shyam Pandey
- reword title
Changes in v3:
- fix bug introduced in v2: wrong copy size when DRE is enabled
- use implementation suggested by Radhey Shyam Pandey
Changes in v4:
- rework on the top of 1/6
Changes in v5:
- fix typo in commit title
- add hint about "DRE" meaning in commit message
---
 drivers/dma/xilinx/xilinx_dma.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c
index a3aaa0e34cc7..aaa6de8a70e4 100644
--- a/drivers/dma/xilinx/xilinx_dma.c
+++ b/drivers/dma/xilinx/xilinx_dma.c
@@ -954,15 +954,28 @@ static int xilinx_dma_alloc_chan_resources(struct 
dma_chan *dchan)
 
 /**
  * xilinx_dma_calc_copysize - Calculate the amount of data to copy
+ * @chan: Driver specific DMA channel
  * @size: Total data that needs to be copied
  * @done: Amount of data that has been already copied
  *
  * Return: Amount of data that has to be copied
  */
-static int xilinx_dma_calc_copysize(int size, int done)
+static int xilinx_dma_calc_copysize(struct xilinx_dma_chan *chan,
+   int size, int done)
 {
-   return min_t(size_t, size - done,
+   size_t copy = min_t(size_t, size - done,
 XILINX_DMA_MAX_TRANS_LEN);
+
+   if ((copy + done < size) &&
+   chan->xdev->common.copy_align) {
+   /*
+* If this is not the last descriptor, make sure
+* the next one will be properly aligned
+*/
+   copy = rounddown(copy,
+(1 << chan->xdev->common.copy_align));
+   }
+   return copy;
 }
 
 /**
@@ -1804,7 +1817,7 @@ static struct dma_async_tx_descriptor 
*xilinx_dma_prep_slave_sg(
 * Calculate the maximum number of bytes to transfer,
 * making sure it is less than the hw limit
 */
-   copy = xilinx_dma_calc_copysize(sg_dma_len(sg),
+   copy = xilinx_dma_calc_copysize(chan, sg_dma_len(sg),
sg_used);
hw = &segment->hw;
 
@@ -1909,7 +1922,8 @@ static struct dma_async_tx_descriptor 
*xilinx_dma_prep_dma_cyclic(
 * Calculate the maximum number of bytes to transfer,
 * making sure it is less than the hw limit
 */
-   copy = xilinx_dma_calc_copysize(period_len, sg_used);
+   copy = xilinx_dma_calc_copysize(chan,
+   period_len, sg_used);
hw = &segment->hw;
xilinx_axidma_buf(chan, hw, buf_addr, sg_used,
  period_len * i);
-- 
2.17.1



[PATCH v5 7/7] dmaengine: xilinx_dma: Drop SG support for VDMA IP

2018-09-07 Thread Andrea Merello
xilinx_vdma_start_transfer() is used only for the VDMA IP, yet it
contains code conditional on the has_sg variable. has_sg is set only
when the HW supports SG mode, which is never true for the VDMA IP.

This patch drops the never-taken branches.

Signed-off-by: Andrea Merello 
---
Changes in V4: introduced this patch in series
Changes in v5:
None
---
 drivers/dma/xilinx/xilinx_dma.c | 84 +
 1 file changed, 32 insertions(+), 52 deletions(-)

diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c
index 78d0f2f8225e..07ceadef0a00 100644
--- a/drivers/dma/xilinx/xilinx_dma.c
+++ b/drivers/dma/xilinx/xilinx_dma.c
@@ -1093,6 +1093,8 @@ static void xilinx_vdma_start_transfer(struct 
xilinx_dma_chan *chan)
struct xilinx_dma_tx_descriptor *desc, *tail_desc;
u32 reg, j;
struct xilinx_vdma_tx_segment *tail_segment;
+   struct xilinx_vdma_tx_segment *segment, *last = NULL;
+   int i = 0;
 
/* This function was invoked with lock held */
if (chan->err)
@@ -1112,14 +1114,6 @@ static void xilinx_vdma_start_transfer(struct 
xilinx_dma_chan *chan)
	tail_segment = list_last_entry(&tail_desc->segments,
   struct xilinx_vdma_tx_segment, node);
 
-   /*
-* If hardware is idle, then all descriptors on the running lists are
-* done, start new transfers
-*/
-   if (chan->has_sg)
-   dma_ctrl_write(chan, XILINX_DMA_REG_CURDESC,
-   desc->async_tx.phys);
-
/* Configure the hardware using info in the config structure */
reg = dma_ctrl_read(chan, XILINX_DMA_REG_DMACR);
 
@@ -1128,15 +1122,11 @@ static void xilinx_vdma_start_transfer(struct 
xilinx_dma_chan *chan)
else
reg &= ~XILINX_DMA_DMACR_FRAMECNT_EN;
 
-   /*
-* With SG, start with circular mode, so that BDs can be fetched.
-* In direct register mode, if not parking, enable circular mode
-*/
-   if (chan->has_sg || !config->park)
-   reg |= XILINX_DMA_DMACR_CIRC_EN;
-
+   /* If not parking, enable circular mode */
if (config->park)
reg &= ~XILINX_DMA_DMACR_CIRC_EN;
+   else
+   reg |= XILINX_DMA_DMACR_CIRC_EN;
 
dma_ctrl_write(chan, XILINX_DMA_REG_DMACR, reg);
 
@@ -1158,48 +1148,38 @@ static void xilinx_vdma_start_transfer(struct 
xilinx_dma_chan *chan)
return;
 
/* Start the transfer */
-   if (chan->has_sg) {
-   dma_ctrl_write(chan, XILINX_DMA_REG_TAILDESC,
-   tail_segment->phys);
-   list_splice_tail_init(&chan->pending_list, &chan->active_list);
-   chan->desc_pendingcount = 0;
-   } else {
-   struct xilinx_vdma_tx_segment *segment, *last = NULL;
-   int i = 0;
-
-   if (chan->desc_submitcount < chan->num_frms)
-   i = chan->desc_submitcount;
-
-   list_for_each_entry(segment, &desc->segments, node) {
-   if (chan->ext_addr)
-   vdma_desc_write_64(chan,
-   XILINX_VDMA_REG_START_ADDRESS_64(i++),
-   segment->hw.buf_addr,
-   segment->hw.buf_addr_msb);
-   else
-   vdma_desc_write(chan,
+   if (chan->desc_submitcount < chan->num_frms)
+   i = chan->desc_submitcount;
+
+   list_for_each_entry(segment, &desc->segments, node) {
+   if (chan->ext_addr)
+   vdma_desc_write_64(chan,
+  XILINX_VDMA_REG_START_ADDRESS_64(i++),
+  segment->hw.buf_addr,
+  segment->hw.buf_addr_msb);
+   else
+   vdma_desc_write(chan,
XILINX_VDMA_REG_START_ADDRESS(i++),
segment->hw.buf_addr);
 
-   last = segment;
-   }
-
-   if (!last)
-   return;
+   last = segment;
+   }
 
-   /* HW expects these parameters to be same for one transaction */
-   vdma_desc_write(chan, XILINX_DMA_REG_HSIZE, last->hw.hsize);
-   vdma_desc_write(chan, XILINX_DMA_REG_FRMDLY_STRIDE,
-   last->hw.stride);
-   vdma_desc_write(chan, XILINX_DMA_REG_VSIZE, last->hw.vsize);
+   if (!last)
+   return;
 
-   chan->desc_submitcount++;
-   chan->desc_pendingcount--;
-   list_del(&desc->node);
-   list_add_tail(&desc->node, &chan->active_list);
-   if (chan->desc_submitcount == chan->num_frms)
-   chan->desc_submitcount = 0;
-   }
+   /* HW expects these parameters to be same for one transaction */

[PATCH v5 6/7] dt-bindings: dmaengine: xilinx_dma: drop has-sg property

2018-09-07 Thread Andrea Merello
This property is not needed anymore, because the driver now autodetects it.
Delete references in documentation.

Cc: Rob Herring 
Cc: Mark Rutland 
Cc: devicet...@vger.kernel.org
Cc: Radhey Shyam Pandey 
Signed-off-by: Andrea Merello 
Reviewed-by: Radhey Shyam Pandey 
Reviewed-by: Rob Herring 
---
Changes in v2:
- cc DT maintainer
Changes in v3:
- cc DT maintainerS/ML
Changes in v4:
None
Changes in v5:
None
---
 Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt 
b/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt
index 5df4eac7300c..6303ce7fcc3d 100644
--- a/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt
+++ b/Documentation/devicetree/bindings/dma/xilinx/xilinx_dma.txt
@@ -37,9 +37,6 @@ Required properties:
 Required properties for VDMA:
 - xlnx,num-fstores: Should be the number of framebuffers as configured in h/w.
 
-Optional properties:
-- xlnx,include-sg: Tells configured for Scatter-mode in
-   the hardware.
 Optional properties for AXI DMA:
 - xlnx,sg-length-width: Should be set to the width in bits of the length
register as configured in h/w. Takes values {8...26}. If the property
-- 
2.17.1



Re: [PATCH AUTOSEL 4.14 27/67] ARM: exynos: Define EINT_WAKEUP_MASK registers for S5Pv210 and Exynos5433

2018-09-07 Thread Krzysztof Kozlowski
On Fri, 7 Sep 2018 at 02:54, Sasha Levin  wrote:
>
> From: Krzysztof Kozlowski 
>
> [ Upstream commit e5cda42c16d89720c29678f51d95a119490ef7d8 ]
>
> S5Pv210 and Exynos5433/Exynos7 have different address of
> EINT_WAKEUP_MASK register.  Rename existing S5P_EINT_WAKEUP_MASK to
> avoid confusion and add new ones.

This should not be backported to stable. It does not fix anything but
prepares the code for a8be2af0218c ("pinctrl: samsung: Write external
wakeup interrupt mask").

Best regards,
Krzysztof

>
> Signed-off-by: Krzysztof Kozlowski 
> Cc: Tomasz Figa 
> Cc: Sylwester Nawrocki 
> Acked-by: Tomasz Figa 
> Tested-by: Marek Szyprowski 
> Signed-off-by: Sasha Levin 
> ---
>  arch/arm/mach-exynos/suspend.c  | 2 +-
>  include/linux/soc/samsung/exynos-regs-pmu.h | 6 +++++-
>  2 files changed, 6 insertions(+), 2 deletions(-)
>
> diff --git a/arch/arm/mach-exynos/suspend.c b/arch/arm/mach-exynos/suspend.c
> index b529ba04ed16..a6a4ba334147 100644
> --- a/arch/arm/mach-exynos/suspend.c
> +++ b/arch/arm/mach-exynos/suspend.c
> @@ -279,7 +279,7 @@ static int exynos5420_cpu_suspend(unsigned long arg)
>  static void exynos_pm_set_wakeup_mask(void)
>  {
> /* Set wake-up mask registers */
> -   pmu_raw_writel(exynos_get_eint_wake_mask(), S5P_EINT_WAKEUP_MASK);
> +   pmu_raw_writel(exynos_get_eint_wake_mask(), EXYNOS_EINT_WAKEUP_MASK);
> pmu_raw_writel(exynos_irqwake_intmask & ~(1 << 31), S5P_WAKEUP_MASK);
>  }
>
> diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h 
> b/include/linux/soc/samsung/exynos-regs-pmu.h
> index bebdde5dccd6..f248e7e079b7 100644
> --- a/include/linux/soc/samsung/exynos-regs-pmu.h
> +++ b/include/linux/soc/samsung/exynos-regs-pmu.h
> @@ -46,7 +46,7 @@
>  #define EXYNOS_SWRESET 0x0400
>
>  #define S5P_WAKEUP_STAT0x0600
> -#define S5P_EINT_WAKEUP_MASK   0x0604
> +#define EXYNOS_EINT_WAKEUP_MASK0x0604
>  #define S5P_WAKEUP_MASK0x0608
>  #define S5P_WAKEUP_MASK2   0x0614
>
> @@ -184,6 +184,9 @@
>  #define S5P_CORE_WAKEUP_FROM_LOCAL_CFG (0x3 << 8)
>  #define S5P_CORE_AUTOWAKEUP_EN (1 << 31)
>
> +/* Only for S5Pv210 */
> +#define S5PV210_EINT_WAKEUP_MASK   0xC004
> +
>  /* Only for EXYNOS4210 */
>  #define S5P_CMU_CLKSTOP_LCD1_LOWPWR0x1154
>  #define S5P_CMU_RESET_LCD1_LOWPWR  0x1174
> @@ -645,6 +648,7 @@
>  | EXYNOS5420_KFC_USE_STANDBY_WFI3)
>
>  /* For EXYNOS5433 */
> +#define EXYNOS5433_EINT_WAKEUP_MASK(0x060C)
>  #define EXYNOS5433_USBHOST30_PHY_CONTROL   (0x0728)
>  #define EXYNOS5433_PAD_RETENTION_AUD_OPTION(0x3028)
>  #define EXYNOS5433_PAD_RETENTION_MMC2_OPTION   (0x30C8)
> --
> 2.17.1


Re: [RFC PATCH 00/11] Avoid synchronous TLB invalidation for intermediate page-table entries on arm64

2018-09-07 Thread Jon Masters
On 09/05/2018 08:28 AM, Will Deacon wrote:
> On Tue, Sep 04, 2018 at 02:38:02PM -0400, Jon Masters wrote:
>> On 08/24/2018 11:52 AM, Will Deacon wrote:
>>
>>> I hacked up this RFC on the back of the recent changes to the mmu_gather
>>> stuff in mainline. It's had a bit of testing and it looks pretty good so
>>> far.
>>
>> I will request the server folks go and test this. You'll probably
>> remember a couple of parts we've seen where aggressive walker caches
>> ended up (correctly) seeing stale page table entries and we had all
>> manner of horrifically hard to debug problems. We have some fairly nice
>> reproducers that were able to find this last time that we can test.
> 
> Cheers, Jon, that would be very helpful. You're probably best off using
> my (rebasing) tlb branch rather than picking the RFC:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/will/linux.git tlb
> 
> Let me know if you'd prefer something stable (I can tag it with a date).

That would be useful. I've prodded each of the Arm server SoC vendors I
work with via our weekly call to have them each specifically check this.
A tag would be helpful to that effort I expect. They all claim to be
watching this thread now, so we'll see if they see cabbages here.

Jon.

-- 
Computer Architect | Sent from my Fedora powered laptop


Re: [LKP] [tty] 0b4f83d510: INFO:task_blocked_for_more_than#seconds

2018-09-07 Thread Jiri Slaby
On 09/07/2018, 06:50 AM, kernel test robot wrote:
> FYI, we noticed the following commit (built with gcc-7):
> 
> commit: 0b4f83d510f6fef6bb9da25f122c8d733d50516f ("[PATCH 2/4] tty: Hold 
> tty_ldisc_lock() during tty_reopen()")
> url: 
> https://github.com/0day-ci/linux/commits/Dmitry-Safonov/tty-Hold-write-ldisc-sem-in-tty_reopen/20180829-165618
> base: https://git.kernel.org/cgit/linux/kernel/git/gregkh/tty.git tty-testing
> 
> in testcase: trinity
> with following parameters:
> 
>   runtime: 300s
> 
> test-description: Trinity is a linux system call fuzz tester.
> test-url: http://codemonkey.org.uk/projects/trinity/
> 
> 
> on test machine: qemu-system-x86_64 -enable-kvm -m 256M
> 
> caused below changes (please refer to attached dmesg/kmsg for entire 
> log/backtrace):
> 
> 
> +--+++
> |  | 58dd163974 | 0b4f83d510 |
> +--+++
> | boot_successes   | 14 | 4  |
> | boot_failures| 0  | 6  |
> | INFO:task_blocked_for_more_than#seconds  | 0  | 6  |
> | Kernel_panic-not_syncing:hung_task:blocked_tasks | 0  | 6  |
> +--+++
> 
> 
> 
> [  244.816801] INFO: task validate_data:655 blocked for more than 120 seconds.
> [  244.818833]   Not tainted 4.18.0-11684-g0b4f83d #1
> [  244.820028] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables 
> this message.
> [  244.826965] validate_data   D0   655623 0x2002
> [  244.828279] Call Trace:
> [  244.828958]  ? __schedule+0x843/0x950
> [  244.830173]  ? __ldsem_down_read_nested+0x1c4/0x3b0
> [  244.834903]  schedule+0x31/0x70
> [  244.835665]  schedule_timeout+0x34/0x760
> [  244.836613]  ? ftrace_likely_update+0x35/0x60
> [  244.837683]  ? __ldsem_down_read_nested+0x1c4/0x3b0
> [  244.838818]  ? ftrace_likely_update+0x35/0x60
> [  244.840127]  ? ftrace_likely_update+0x35/0x60
> [  244.845947]  ? __ldsem_down_read_nested+0x1c4/0x3b0
> [  244.847882]  __ldsem_down_read_nested+0x23a/0x3b0
> [  244.849886]  ? tty_ldisc_ref_wait+0x25/0x50
> [  244.853807]  tty_ldisc_ref_wait+0x25/0x50
> [  244.854946]  tty_compat_ioctl+0x8a/0x120
> [  244.855928]  ? this_tty+0x80/0x80
> [  244.856742]  __ia32_compat_sys_ioctl+0xc28/0x1ce0
> [  244.857981]  do_int80_syscall_32+0x1d2/0x5f0
> [  244.859003]  entry_INT80_compat+0x88/0xa0
> [  244.859972] INFO: task dnsmasq:668 blocked for more than 120 seconds.
> [  244.868315]   Not tainted 4.18.0-11684-g0b4f83d #1
> [  244.869583] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables 
> this message.
> [  244.871744] dnsmasq D0   668  1 0x2002
> [  244.873063] Call Trace:
> [  244.873697]  ? __schedule+0x843/0x950
> [  244.874572]  ? __ldsem_down_read_nested+0x1c4/0x3b0
> [  244.875725]  schedule+0x31/0x70
> [  244.876576]  schedule_timeout+0x34/0x760
> [  244.877573]  ? ftrace_likely_update+0x35/0x60
> [  244.878660]  ? __ldsem_down_read_nested+0x1c4/0x3b0
> [  244.879872]  ? ftrace_likely_update+0x35/0x60
> [  244.890522]  ? ftrace_likely_update+0x35/0x60
> [  244.891572]  ? __ldsem_down_read_nested+0x1c4/0x3b0
> [  244.892746]  __ldsem_down_read_nested+0x23a/0x3b0
> [  244.893861]  ? tty_ldisc_ref_wait+0x25/0x50
> [  244.894841]  tty_ldisc_ref_wait+0x25/0x50
> [  244.895911]  tty_compat_ioctl+0x8a/0x120
> [  244.896916]  ? this_tty+0x80/0x80
> [  244.897717]  __ia32_compat_sys_ioctl+0xc28/0x1ce0
> [  244.898821]  do_int80_syscall_32+0x1d2/0x5f0
> [  244.899830]  entry_INT80_compat+0x88/0xa0
> [  244.909466] INFO: task dropbear:734 blocked for more than 120 seconds.
> [  244.911173]   Not tainted 4.18.0-11684-g0b4f83d #1
> [  244.912394] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables 
> this message.
> [  244.914176] dropbearD0   734  1 0x2002
> [  244.915446] Call Trace:
> [  244.916068]  ? __schedule+0x843/0x950
> [  244.916945]  ? __ldsem_down_read_nested+0x1c4/0x3b0
> [  244.918076]  schedule+0x31/0x70
> [  244.918832]  schedule_timeout+0x34/0x760
> [  244.919781]  ? ftrace_likely_update+0x35/0x60
> [  244.921104]  ? __ldsem_down_read_nested+0x1c4/0x3b0
> [  244.922304]  ? ftrace_likely_update+0x35/0x60
> [  244.923347]  ? ftrace_likely_update+0x35/0x60
> [  244.924369]  ? __ldsem_down_read_nested+0x1c4/0x3b0
> [  244.925496]  __ldsem_down_read_nested+0x23a/0x3b0
> [  244.926598]  ? tty_ldisc_ref_wait+0x25/0x50
> [  244.927578]  tty_ldisc_ref_wait+0x25/0x50
> [  244.928526]  tty_compat_ioctl+0x8a/0x120
> [  244.929449]  ? this_tty+0x80/0x80
> [  244.930240]  __ia32_compat_sys_ioctl+0xc28/0x1ce0
> [  244.940083]  do_int80_syscall_32+0x1d2/0x5f0
> [  244.941310]  entry_INT80_compat+0x88/0xa0
> [  244.944070] 
> [  244.944070] Showing all locks held 

Re: [PATCH v2 1/8] perf/x86: add a function to get the lbr stack

2018-09-07 Thread Wei Wang

On 09/07/2018 11:28 AM, Andi Kleen wrote:

+int perf_get_lbr_stack(struct perf_lbr_stack *stack)
+{
+   stack->lbr_nr = x86_pmu.lbr_nr;
+   stack->lbr_tos = x86_pmu.lbr_tos;
+   stack->lbr_from = x86_pmu.lbr_from;
+   stack->lbr_to = x86_pmu.lbr_to;
+
+   if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+   stack->lbr_info = MSR_LBR_INFO_0;
+   else
+   stack->lbr_info = 0;

Seems weird to export the enum value if the enum isn't exported.
How can it be used?



I'm not sure about the issue. The caller gets the value of
MSR_LBR_INFO_0 (not the enum LBR_FORMAT_INFO) only when the hardware
supports it. If the hardware doesn't support it, it is just set to 0,
and there will be no LBR info MSR to be passed through.


Best,
Wei


Re: [PATCH V3 00/26] C-SKY(csky) Linux Kernel Port

2018-09-07 Thread Guo Ren
On Thu, Sep 06, 2018 at 07:08:18PM -0700, Guenter Roeck wrote:
> Hi,
> 
> On Wed, Sep 05, 2018 at 08:07:39PM +0800, Guo Ren wrote:
> > This is the 3rd version patchset to add the Linux kernel port for
> > C-SKY(csky).
> > Thanks to everyone who provided feedback on the previous version.
> > 
> > This patchset adds architecture support to Linux for C-SKY's 32-bit embedded
> > CPU cores and the patches are based on linux-4.18.4
> > 
> > There are two ABI versions with several CPU cores in this patchset:
> >   ABIv1: ck610 (16-bit instruction, 32-bit data path, VIPT Cache ...)
> >   ABIv2: ck807 ck810 ck860 (16/32-bit variable length instruction, PIPT 
> > Cache,
> >  SMP ...)
> > 
> 
> My key question is about upstream toolchain support.
> The buildroot clone tells me
> 
> $ git describe csky/master
> 2017.11-2111-ge9cc5a5
> 
> and
> 
> $ git log --oneline origin/master..csky/master  | wc
>    1180    7436   57104
> 
> with
> $ git remote -v
> csky  https://gitlab.com/c-sky/buildroot.git 
> origin    git://git.buildroot.net/buildroot
> 
> So it looks like there are more thasn a thousand patches on top of
> buildroot. Adding an architecture to buildroot should only take a
> single patch, or maybe a few, but not more than a thousand.
> This strongly suggests that a lot of changes are not upstream
> but only available in the buildroot clone.
  csky  https://gitlab.com/c-sky/buildroot.git is our CI environment
  based on buildroot, and it's quite miscellaneous.
  We won't upstream it directly; we'll prepare another patch set for
  buildroot.org after the kernel and glibc are upstreamed.
 
> When are we going to see all those changes in upstream gcc, binutils,
> and qemu ? I don't really want to dig through more than a thousand
> patches in a buildroot clone to find out details about the status
> of upstream toolchain support.
  Ok, you want to use upstream gcc and binutils to build the kernel. I'll
  give the tips in the next version of the patch set.

Best Regards
 Guo Ren


Re: [PATCH V3 19/26] dt-bindings: timer: gx6605s SOC timer

2018-09-07 Thread Guo Ren
On Thu, Sep 06, 2018 at 10:02:29AM +0800, Guo Ren wrote:
> On Wed, Sep 05, 2018 at 07:47:29PM -0500, Rob Herring wrote:
> > On Wed, Sep 5, 2018 at 7:09 AM Guo Ren  wrote:
> > >
> > > Signed-off-by: Guo Ren 
> > > ---
> > >  .../bindings/timer/csky,gx6605s-timer.txt  | 46 
> > > ++
> > >  1 file changed, 46 insertions(+)

> Ok, change to "timer0: timer@0x0020a000"
  Ok, change to "timer0: timer@20a000"



[PATCH v8 0/3]: perf: reduce data loss when profiling highly parallel CPU bound workloads

2018-09-07 Thread Alexey Budankov


Currently, in record mode the tool implements trace writing serially.
The algorithm loops over mapped per-cpu data buffers and stores
ready data chunks into a trace file using the write() system call.

Under some circumstances the kernel may lack free space in a buffer
because one half of that buffer is not yet written to disk while the
tool is busy writing some other buffer's data at that moment.

Thus the serial trace writing implementation may cause the kernel
to lose profiling data, and that is what is observed when profiling
highly parallel CPU bound workloads on machines with a big number
of cores.

An experiment profiling matrix multiplication code executing 128
threads on an Intel Xeon Phi (KNM) with 272 cores, like below,
demonstrates a data loss metric value of 98%:

/usr/bin/time perf record -o /tmp/perf-ser.data -a -N -B -T -R -g \
--call-graph dwarf,1024 --user-regs=IP,SP,BP \
--switch-events -e 
cycles,instructions,ref-cycles,software/period=1,name=cs,config=0x3/Duk -- \
matrix.gcc

The data loss metric is the ratio lost_time/elapsed_time, where
lost_time is the sum of the time intervals containing PERF_RECORD_LOST
records and elapsed_time is the elapsed application run time under
profiling. For example, if 4.9s of a 5s profiled run falls into such
intervals, the metric is 98%.

Applying asynchronous trace streaming through the Posix AIO API
(http://man7.org/linux/man-pages/man7/aio.7.html)
lowers the data loss metric value - in the trialed configuration,
from a 98% loss to almost 0%.
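
For reference, the queue-then-wait Posix AIO pattern the series builds
on can be sketched as a minimal self-contained userspace program
(illustrative only, not part of the patches; the file name is
arbitrary, and on glibc it needs linking with -lrt):

	#include <aio.h>
	#include <errno.h>
	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		static char buf[4096];          /* stands in for a data chunk */
		struct aiocb cb;
		const struct aiocb *list[1] = { &cb };
		int fd = open("trace.out", O_WRONLY | O_CREAT | O_TRUNC, 0644);

		if (fd < 0)
			return 1;

		memset(&cb, 0, sizeof(cb));
		cb.aio_fildes = fd;
		cb.aio_buf = buf;
		cb.aio_nbytes = sizeof(buf);
		cb.aio_offset = 0;              /* caller tracks the file offset */
		cb.aio_sigevent.sigev_notify = SIGEV_NONE;

		if (aio_write(&cb))             /* enqueue; returns immediately */
			return 1;

		while (aio_error(&cb) == EINPROGRESS)
			aio_suspend(list, 1, NULL); /* block until completion */

		close(fd);
		return aio_return(&cb) == (ssize_t)sizeof(buf) ? 0 : 1;
	}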

---
 Alexey Budankov (3):
perf util: map data buffer for preserving collected data
perf record: enable asynchronous trace writing
perf record: extend trace writing to multi AIO
 
 tools/perf/builtin-record.c | 166 ++--
 tools/perf/perf.h   |   1 +
 tools/perf/util/evlist.c|   7 +-
 tools/perf/util/evlist.h|   3 +-
 tools/perf/util/mmap.c  | 114 ++
 tools/perf/util/mmap.h  |  11 ++-
 6 files changed, 277 insertions(+), 25 deletions(-)

---
 Changes in v8:
 - run the whole thing thru checkpatch.pl and corrected found issues except
   lines longer than 80 symbols
 - corrected comments alignment and formatting
 - moved multi AIO implementation into 3rd patch in the series
 - implemented explicit cblocks array allocation
 - split AIO completion check into separate record__aio_complete()
 - set nr_cblocks default to 1 and max allowed value to 4
 Changes in v7:
 - implemented handling record.aio setting from perfconfig file
 Changes in v6:
 - adjusted setting of priorities for cblocks;
 - handled errno == EAGAIN case from aio_write() return;
 Changes in v5:
 - resolved livelock on perf record -e intel_pt// -- dd if=/dev/zero 
of=/dev/null count=10
 - data loss metrics decreased from 25% to 2x in trialed configuration;
 - reshaped layout of data structures;
 - implemented --aio option;
 - avoided nanosleep() prior calling aio_suspend();
 - switched to per-cpu aio multi buffer record__aio_sync();
 - record_mmap_read_sync() now does global sync just before 
   switching trace file or collection stop;
 Changes in v4:
 - converted mmap()/munmap() to malloc()/free() for mmap->data buffer management
 - converted void *bf to struct perf_mmap *md in signatures
 - written comment in perf_mmap__push() just before perf_mmap__get();
 - written comment in record__mmap_read_sync() on possible restarting 
   of aio_write() operation and releasing perf_mmap object after all;
 - added perf_mmap__put() for the cases of failed aio_write();
 Changes in v3:
 - written comments about nanosleep(0.5ms) call prior aio_suspend()
   to cope with intrusiveness of its implementation in glibc;
 - written comments about rationale behind coping profiling data 
   into mmap->data buffer;
 Changes in v2:
 - converted zalloc() to calloc() for allocation of mmap_aio array,
 - cleared typo and adjusted fallback branch code;


[PATCH] Input: elantech - enable middle button of touchpad on ThinkPad P72

2018-09-07 Thread Aaron Ma
Add 2 new touchpad IDs to enable middle button support.

Cc: sta...@vger.kernel.org
Signed-off-by: Aaron Ma 
---
 drivers/input/mouse/elantech.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/input/mouse/elantech.c b/drivers/input/mouse/elantech.c
index 44f57cf6675b..2d95e8d93cc7 100644
--- a/drivers/input/mouse/elantech.c
+++ b/drivers/input/mouse/elantech.c
@@ -1178,6 +1178,8 @@ static const struct dmi_system_id 
elantech_dmi_has_middle_button[] = {
 static const char * const middle_button_pnp_ids[] = {
"LEN2131", /* ThinkPad P52 w/ NFC */
"LEN2132", /* ThinkPad P52 */
+   "LEN2133", /* ThinkPad P72 w/ NFC */
+   "LEN2134", /* ThinkPad P72 */
NULL
 };
 
-- 
2.17.1



[PATCH v8 1/3]: perf util: map data buffer for preserving collected data

2018-09-07 Thread Alexey Budankov


The map->data buffer is used to preserve map->base profiling data
for writing to disk. The AIO map->cblock is used to queue the
corresponding map->data buffer for asynchronous writing.

Signed-off-by: Alexey Budankov 
---
 Changes in v7:
  - implemented handling record.aio setting from perfconfig file
 Changes in v6:
  - adjusted setting of priorities for cblocks;
 Changes in v5:
  - reshaped layout of data structures;
  - implemented --aio option;
 Changes in v4:
  - converted mmap()/munmap() to malloc()/free() for mmap->data buffer 
management 
 Changes in v2:
  - converted zalloc() to calloc() for allocation of mmap_aio array,
  - cleared typo and adjusted fallback branch code;
---
 tools/perf/util/mmap.c | 25 +++++++++++++++++++++++++
 tools/perf/util/mmap.h |  3 +++
 2 files changed, 28 insertions(+)

diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c
index fc832676a798..e53038d76445 100644
--- a/tools/perf/util/mmap.c
+++ b/tools/perf/util/mmap.c
@@ -155,6 +155,8 @@ void __weak auxtrace_mmap_params__set_idx(struct 
auxtrace_mmap_params *mp __mayb
 
 void perf_mmap__munmap(struct perf_mmap *map)
 {
+   if (map->data)
+   zfree(&map->data);
if (map->base != NULL) {
munmap(map->base, perf_mmap__mmap_len(map));
map->base = NULL;
@@ -166,6 +168,7 @@ void perf_mmap__munmap(struct perf_mmap *map)
 
 int perf_mmap__mmap(struct perf_mmap *map, struct mmap_params *mp, int fd)
 {
+   int delta_max;
/*
 * The last one will be done at perf_mmap__consume(), so that we
 * make sure we don't prevent tools from consuming every last event in
@@ -190,6 +193,28 @@ int perf_mmap__mmap(struct perf_mmap *map, struct 
mmap_params *mp, int fd)
map->base = NULL;
return -1;
}
+   delta_max = sysconf(_SC_AIO_PRIO_DELTA_MAX);
+   map->data = malloc(perf_mmap__mmap_len(map));
+   if (!map->data) {
+   pr_debug2("failed to allocate data buffer, error %d\n",
+   errno);
+   return -1;
+   }
+   /*
+* Use cblock.aio_fildes value different from -1
+* to denote started aio write operation on the
+* cblock so it requires explicit record__aio_sync()
+* call prior the cblock may be reused again.
+*/
+   map->cblock.aio_fildes = -1;
+   /*
+* Allocate cblock with max priority delta to
+* have faster aio_write() calls because queued
+* requests are kept in separate per-prio queues
+* and adding a new request iterates thru shorter
+* per-prio list.
+*/
+   map->cblock.aio_reqprio = delta_max;
map->fd = fd;
 
	if (auxtrace_mmap__mmap(&map->auxtrace_mmap,
diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h
index d82294db1295..1974e621e36b 100644
--- a/tools/perf/util/mmap.h
+++ b/tools/perf/util/mmap.h
@@ -6,6 +6,7 @@
 #include <linux/types.h>
 #include <asm/barrier.h>
 #include <stdbool.h>
+#include <aio.h>
 #include "auxtrace.h"
 #include "event.h"
 
@@ -25,6 +26,8 @@ struct perf_mmap {
bool overwrite;
struct auxtrace_mmap auxtrace_mmap;
char event_copy[PERF_SAMPLE_MAX_SIZE] __aligned(8);
+   void *data;
+   struct aiocb cblock;
 };
 
 /*



Re: [PATCH AUTOSEL 4.18 043/131] ASoC: soc-pcm: Use delay set in component pointer function

2018-09-07 Thread Agrawal, Akshu



On 9/7/2018 5:53 AM, Sasha Levin wrote:
> On Mon, Sep 03, 2018 at 12:16:26PM +0100, Mark Brown wrote:
>> On Sun, Sep 02, 2018 at 01:03:55PM +, Sasha Levin wrote:
>>> From: Akshu Agrawal 
>>>
>>> [ Upstream commit 9fb4c2bf130b922c77c16a8368732699799c40de ]
>>>
>>> Take into account the base delay set in pointer callback.
>>>
>>> There are cases where a pointer function populates
>>> runtime->delay, such as:
>>> ./sound/pci/hda/hda_controller.c
>>> ./sound/soc/intel/atom/sst-mfld-platform-pcm.c
>>
>> I'm worried that if anyone notices this at all they will have already
>> compensated for the delays in userspace and therefore this will cause
>> them to see problems as they get double compenstation for delays.
> 
> But what happens when they update to a newer Stable? They're going to
> hit that issue anyways.
> 

Drivers which had exposed this delay in the pointer function but have
compensated for the issue in userspace are likely to see the problem of
double delay when the update happens.
I don't know what is the best way to communicate that the issue is fixed
in the kernel and userspace compensation isn't required.

But more likely I think the delay was just getting left out and there
wouldn't have been any compensation in userspace.

Thanks,
Akshu


Re: [PATCH] sched/fair: vruntime should normalize when switching from fair

2018-09-07 Thread Juri Lelli
On 06/09/18 16:25, Dietmar Eggemann wrote:
> Hi Juri,
> 
> On 08/23/2018 11:54 PM, Juri Lelli wrote:
> > On 23/08/18 18:52, Dietmar Eggemann wrote:
> > > Hi,
> > > 
> > > On 08/21/2018 01:54 AM, Miguel de Dios wrote:
> > > > On 08/17/2018 11:27 AM, Steve Muckle wrote:
> > > > > From: John Dias 
> 
> [...]
> 
> > > 
> > > I tried to catch this issue on my Arm64 Juno board using pi_test (and a
> > > slightly adapted pip_test (usleep_val = 1500 and keep low as cfs)) from
> > > rt-tests but wasn't able to do so.
> > > 
> > > # pi_stress --inversions=1 --duration=1 --groups=1 --sched 
> > > id=low,policy=cfs
> > > 
> > > Starting PI Stress Test
> > > Number of thread groups: 1
> > > Duration of test run: 1 seconds
> > > Number of inversions per group: 1
> > >   Admin thread SCHED_FIFO priority 4
> > > 1 groups of 3 threads will be created
> > >High thread SCHED_FIFO priority 3
> > > Med thread SCHED_FIFO priority 2
> > > Low thread SCHED_OTHER nice 0
> > > 
> > > # ./pip_stress
> > > 
> > > In both cases, the cfs task entering  rt_mutex_setprio() is queued, so
> > > dequeue_task_fair()->dequeue_entity(), which subtracts 
> > > cfs_rq->min_vruntime
> > > from se->vruntime, is called on it before it gets the rt prio.
> > > 
> > > Maybe it requires a very specific use of the pthread library to provoke 
> > > this
> > > issue by making sure that the cfs tasks really blocks/sleeps?
> > 
> > Maybe one could play with rt-app to recreate such specific use case?
> > 
> > https://github.com/scheduler-tools/rt-app/blob/master/doc/tutorial.txt#L459
> 
> I played a little bit with rt-app on hikey960 to re-create Steve's test
> program.

Oh, nice! Thanks for sharing what you have got.

> Since there is no semaphore support (sem_wait(), sem_post()) I used
> condition variables (wait: pthread_cond_wait() , signal:
> pthread_cond_signal()). It's not really the same since this is stateless but
> sleeps before the signals help to maintain the state in this easy example.
> 
> This provokes the vruntime issue e.g. for cpus 0,4 and it doesn't for 0,1:
> 
> {
> "global": {
> "calibration" : 130,
>   "pi_enabled" : true
> },
> "tasks": {
> "rt_task": {
>   "loop" : 100,
>   "policy" : "SCHED_FIFO",
>   "cpus" : [0],
> 
>   "lock" : "b_mutex",
>   "wait" : { "ref" : "b_cond", "mutex" : "b_mutex" },
>   "unlock" : "b_mutex",
>   "sleep" : 3000,
>   "lock1" : "a_mutex",
>   "signal" : "a_cond",
>   "unlock1" : "a_mutex",
>   "lock2" : "pi-mutex",
>   "unlock2" : "pi-mutex"
> },
>   "cfs_task": {
>   "loop" : 100,
>   "policy" : "SCHED_OTHER",
>   "cpus" : [4],
> 
>   "lock" : "pi-mutex",
>   "sleep" : 3000,
>   "lock1" : "b_mutex",
>   "signal" : "b_cond",
>   "unlock" : "b_mutex",
>   "lock2" : "a_mutex",
>   "wait" : { "ref" : "a_cond", "mutex" : "a_mutex" },
>   "unlock1" : "a_mutex",
>   "unlock2" : "pi-mutex"
>   }
> }
> }
> 
> Adding semaphores is possible but rt-app has no easy way to initialize
> individual objects, e.g. sem_init(..., value). The only way I see is via the
> global section, like "pi_enabled". But then, this is true for all objects of
> this kind (in this case mutexes)?

Right, global section should work fine. Why do you think this is a
problem/limitation?

> So the following couple of lines extension to rt-app works because both
> semaphores can be initialized to 0:
> 
>  {
> "global": {
> "calibration" : 130,
>   "pi_enabled" : true
> },
> "tasks": {
> "rt_task": {
>   "loop" : 100,
>   "policy" : "SCHED_FIFO",
>   "cpus" : [0],
> 
>   "sem_wait" : "b_sem",
>   "sleep" : 1000,
>   "sem_post" : "a_sem",
> 
>   "lock" : "pi-mutex",
>   "unlock" : "pi-mutex"
> },
>   "cfs_task": {
>   "loop" : 100,
>   "policy" : "SCHED_OTHER",
>   "cpus" : [4],
> 
>   "lock" : "pi-mutex",
>   "sleep" : 1000,
>   "sem_post" : "b_sem",
>   "sem_wait" : "a_sem",
>   "unlock" : "pi-mutex"
>   }
> }
> }
> 
> Any thoughts on that? I can see something like this as infrastructure to
> create a regression test case based on rt-app and standard ftrace.

Agree. I guess we should add your first example to the repo (you'd be
very welcome to create a PR) already and then work to support the second?


[PATCH v8 2/3]: perf record: enable asynchronous trace writing

2018-09-07 Thread Alexey Budankov


The trace file offset is calculated and updated linearly prior to
enqueuing an aio write at record__pushfn().

record__aio_sync() blocks till completion of the started AIO operation
and then proceeds.

record__mmap_read_sync() implements a barrier for all incomplete
aio write requests.

Signed-off-by: Alexey Budankov 
---
 Changes in v8:
 -  split AIO completion check into separate record__aio_complete()
 Changes in v6:
 - handled errno == EAGAIN case from aio_write();
 Changes in v5:
 - data loss metrics decreased from 25% to 2x in trialed configuration;
 - avoided nanosleep() prior calling aio_suspend();
 - switched to per cpu multi record__aio_sync() aio
 - record_mmap_read_sync() now does global barrier just before 
   switching trace file or collection stop;
 - resolved livelock on perf record -e intel_pt// -- dd if=/dev/zero 
of=/dev/null count=10
 Changes in v4:
 - converted void *bf to struct perf_mmap *md in signatures
 - written comment in perf_mmap__push() just before perf_mmap__get();
 - written comment in record__mmap_read_sync() on possible restarting 
   of aio_write() operation and releasing perf_mmap object after all;
 - added perf_mmap__put() for the cases of failed aio_write();
 Changes in v3:
 - written comments about nanosleep(0.5ms) call prior aio_suspend()
   to cope with intrusiveness of its implementation in glibc;
 - written comments about rationale behind coping profiling data 
   into mmap->data buffer;
---
 tools/perf/builtin-record.c | 128 +++-
 tools/perf/util/mmap.c  |  54 ++-
 tools/perf/util/mmap.h  |   2 +-
 3 files changed, 169 insertions(+), 15 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 22ebeb92ac51..d4857572cf33 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -121,6 +121,93 @@ static int record__write(struct record *rec, void *bf, 
size_t size)
return 0;
 }
 
+static int record__aio_write(struct aiocb *cblock, int trace_fd,
+   void *buf, size_t size, off_t off)
+{
+   int rc;
+
+   cblock->aio_fildes = trace_fd;
+   cblock->aio_buf    = buf;
+   cblock->aio_nbytes = size;
+   cblock->aio_offset = off;
+   cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
+
+   do {
+   rc = aio_write(cblock);
+   if (rc == 0) {
+   break;
+   } else if (errno != EAGAIN) {
+   cblock->aio_fildes = -1;
+   pr_err("failed to queue perf data, error: %m\n");
+   break;
+   }
+   } while (1);
+
+   return rc;
+}
+
+static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
+{
+   void *rem_buf;
+   off_t rem_off;
+   size_t rem_size;
+   int rc, aio_errno;
+   ssize_t aio_ret, written;
+
+   aio_errno = aio_error(cblock);
+   if (aio_errno == EINPROGRESS)
+   return 0;
+
+   written = aio_ret = aio_return(cblock);
+   if (aio_ret < 0) {
+   if (!(aio_errno == EINTR))
+   pr_err("failed to write perf data, error: %m\n");
+   written = 0;
+   }
+
+   rem_size = cblock->aio_nbytes - written;
+
+   if (rem_size == 0) {
+   cblock->aio_fildes = -1;
+   /*
+* md->refcount is incremented in perf_mmap__push() for
+* every enqueued aio write request so decrement it because
+* the request is now complete.
+*/
+   perf_mmap__put(md);
+   rc = 1;
+   } else {
+   /*
+* aio write request may require restart with the
+* remainder if the kernel didn't write the whole
+* chunk at once.
+*/
+   rem_off = cblock->aio_offset + written;
+   rem_buf = (void *)(cblock->aio_buf + written);
+   record__aio_write(cblock, cblock->aio_fildes,
+   rem_buf, rem_size, rem_off);
+   rc = 0;
+   }
+
+   return rc;
+}
+
+static void record__aio_sync(struct perf_mmap *md)
+{
+   struct aiocb *cblock = &md->cblock;
+   struct timespec timeout = { 0, 1000 * 1000  * 1 }; // 1ms
+
+   do {
+   if (cblock->aio_fildes == -1 || record__aio_complete(md, cblock))
+   return;
+
+   while (aio_suspend((const struct aiocb **)&cblock, 1, &timeout)) {
+   if (!(errno == EAGAIN || errno == EINTR))
+   pr_err("failed to sync perf data, error: %m\n");
+   }
+   } while (1);
+}
+
 static int process_synthesized_event(struct perf_tool *tool,
 union perf_event *event,
 struct perf_sample *sample __maybe_unused,
@@ -130,12 +217,27 @@ static int process_synthesized_event(struct 

Re: [PATCH resend 0/2] irqchip: convert to SPDX for Renesas drivers

2018-09-07 Thread Marc Zyngier
On Fri, 07 Sep 2018 02:50:13 +0100,
Kuninori Morimoto  wrote:
> 
> 
> Hi Thomas, Marc, Jason
> 
> 2weeks passed. I resend this patch again
> 
> Kuninori Morimoto (2):
>   pinctrl: sh-pfc: convert to SPDX identifiers
>   pinctrl: rza1: convert to SPDX identifiers
> 
>  drivers/pinctrl/pinctrl-rza1.c   |  5 +
>  drivers/pinctrl/sh-pfc/Kconfig   |  1 +
>  drivers/pinctrl/sh-pfc/core.c|  5 +
>  drivers/pinctrl/sh-pfc/core.h|  7 ++-
>  drivers/pinctrl/sh-pfc/gpio.c|  5 +
>  drivers/pinctrl/sh-pfc/pfc-emev2.c   |  5 +
>  drivers/pinctrl/sh-pfc/pfc-r8a73a4.c | 15 +--
>  drivers/pinctrl/sh-pfc/pfc-r8a7740.c | 15 +--
>  drivers/pinctrl/sh-pfc/pfc-r8a7778.c | 10 +-
>  drivers/pinctrl/sh-pfc/pfc-r8a7779.c | 14 +-
>  drivers/pinctrl/sh-pfc/pfc-r8a7790.c | 15 +--
>  drivers/pinctrl/sh-pfc/pfc-r8a7791.c |  5 +
>  drivers/pinctrl/sh-pfc/pfc-r8a7792.c |  5 +
>  drivers/pinctrl/sh-pfc/pfc-r8a7794.c |  5 +
>  drivers/pinctrl/sh-pfc/pfc-r8a7795-es1.c |  5 +
>  drivers/pinctrl/sh-pfc/pfc-r8a7795.c |  5 +
>  drivers/pinctrl/sh-pfc/pfc-r8a7796.c |  5 +
>  drivers/pinctrl/sh-pfc/pfc-r8a77970.c|  5 +
>  drivers/pinctrl/sh-pfc/pfc-r8a77995.c|  5 +
>  drivers/pinctrl/sh-pfc/pfc-sh7203.c  |  5 +
>  drivers/pinctrl/sh-pfc/pfc-sh7264.c  |  5 +
>  drivers/pinctrl/sh-pfc/pfc-sh7269.c  |  5 +
>  drivers/pinctrl/sh-pfc/pfc-sh73a0.c  | 15 +--
>  drivers/pinctrl/sh-pfc/pfc-sh7720.c  |  5 +
>  drivers/pinctrl/sh-pfc/pfc-sh7723.c  |  5 +
>  drivers/pinctrl/sh-pfc/pfc-sh7724.c  |  5 +
>  drivers/pinctrl/sh-pfc/pfc-sh7734.c  |  5 +
>  drivers/pinctrl/sh-pfc/pfc-sh7757.c  |  5 +
>  drivers/pinctrl/sh-pfc/pfc-sh7785.c  |  5 +
>  drivers/pinctrl/sh-pfc/pfc-sh7786.c  |  5 +
>  drivers/pinctrl/sh-pfc/pfc-shx3.c|  5 +
>  drivers/pinctrl/sh-pfc/pinctrl.c |  5 +
>  drivers/pinctrl/sh-pfc/sh_pfc.h  |  7 ++-
>  33 files changed, 35 insertions(+), 184 deletions(-)

[+ Linus]

If I trust the diffstat, should this be sent to the pinctrl maintainer
instead?

M.

-- 
Jazz is not dead, it just smell funny.


Re: [PATCH] vme: remove unneeded kfree

2018-09-07 Thread Greg Kroah-Hartman
On Thu, Sep 06, 2018 at 10:04:49PM -0700, Linus Torvalds wrote:
> On Thu, Sep 6, 2018 at 1:51 AM Ding Xiang
>  wrote:
> >
> > put_device will call vme_dev_release to free vdev, kfree is
> > unnecessary here.
> 
> That does seem to be the case.  I think "unnecessary" is overly kind,
> it does seem to be a double free.
> 
> Looks like the issue was introduced back in 2013 by commit
> def1820d25fa ("vme: add missing put_device() after device_register()
> fails").
> 
> It seems you should *either* kfree() the vdev, _or_ do put_device(),
> but doing both seems wrong.

You should only ever call put_device() after you have created the
structure, the documentation should say that somewhere...

> I presume the device_register() has never failed, and this being
> vme-only I'm guessing there isn't a vibrant testing community.
> 
> Greg?

It's the correct fix, I'll queue it up soon, thanks.

greg k-h


[PATCH 1/2] mtd: rawnand: denali: remove ->dev_ready() hook

2018-09-07 Thread Masahiro Yamada
The Denali NAND IP has no way to read out the current signal level
of the R/B# pin.  Instead, denali_dev_ready() checks if the R/B#
transition has already happened. (The INTR__INT_ACT interrupt is
asserted at the rising edge of the R/B# pin.)  It is not a correct
way to implement the ->dev_ready() hook.

In fact, it has a drawback; in the nand_scan_ident phase, the chip
detection iterates over maxchips until it fails to find a homogeneous
chip.  For the last loop, nand_reset() fails if no chip is there.

If the ->dev_ready hook exists, nand_command(_lp) calls nand_wait_ready()
after NAND_CMD_RESET.  However, we know denali_dev_ready() never
returns 1 unless there exists a chip that toggles R/B# on that chip
select.  Then, nand_wait_ready() just ends up wasting 400 msec and,
in the end, showing the "timeout while waiting for chip to become ready"
warning.

Let's remove the mis-implemented dev_ready hook, and fallback to
sending the NAND_CMD_STATUS and nand_wait_status_ready(), which
bails out more quickly.
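
For contrast, a correct ->dev_ready() is only possible on controllers
that can sample the current R/B# level, along these lines (EXAMPLE_*
register and bit names are hypothetical, purely for illustration):

	/* return the *current* pin level, not a latched past transition */
	static int example_dev_ready(struct mtd_info *mtd)
	{
		struct example_nand *host = mtd_to_example(mtd);

		return !!(readl(host->regs + EXAMPLE_STATUS) &
			  EXAMPLE_STATUS_RB);
	}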

Signed-off-by: Masahiro Yamada 
---

 drivers/mtd/nand/raw/denali.c | 22 +---------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

diff --git a/drivers/mtd/nand/raw/denali.c b/drivers/mtd/nand/raw/denali.c
index f88a5dc..f069184 100644
--- a/drivers/mtd/nand/raw/denali.c
+++ b/drivers/mtd/nand/raw/denali.c
@@ -203,18 +203,6 @@ static uint32_t denali_wait_for_irq(struct 
denali_nand_info *denali,
return denali->irq_status;
 }
 
-static uint32_t denali_check_irq(struct denali_nand_info *denali)
-{
-   unsigned long flags;
-   uint32_t irq_status;
-
-   spin_lock_irqsave(&denali->irq_lock, flags);
-   irq_status = denali->irq_status;
-   spin_unlock_irqrestore(&denali->irq_lock, flags);
-
-   return irq_status;
-}
-
 static void denali_read_buf(struct mtd_info *mtd, uint8_t *buf, int len)
 {
struct denali_nand_info *denali = mtd_to_denali(mtd);
@@ -294,7 +282,7 @@ static void denali_cmd_ctrl(struct mtd_info *mtd, int dat, 
unsigned int ctrl)
return;
 
/*
-* Some commands are followed by chip->dev_ready or chip->waitfunc.
+* Some commands are followed by chip->waitfunc.
 * irq_status must be cleared here to catch the R/B# interrupt later.
 */
if (ctrl & NAND_CTRL_CHANGE)
@@ -303,13 +291,6 @@ static void denali_cmd_ctrl(struct mtd_info *mtd, int dat, 
unsigned int ctrl)
denali->host_write(denali, DENALI_BANK(denali) | type, dat);
 }
 
-static int denali_dev_ready(struct mtd_info *mtd)
-{
-   struct denali_nand_info *denali = mtd_to_denali(mtd);
-
-   return !!(denali_check_irq(denali) & INTR__INT_ACT);
-}
-
 static int denali_check_erased_page(struct mtd_info *mtd,
struct nand_chip *chip, uint8_t *buf,
unsigned long uncor_ecc_flags,
@@ -1349,7 +1330,6 @@ int denali_init(struct denali_nand_info *denali)
chip->write_byte = denali_write_byte;
chip->read_word = denali_read_word;
chip->cmd_ctrl = denali_cmd_ctrl;
-   chip->dev_ready = denali_dev_ready;
chip->waitfunc = denali_waitfunc;
 
if (features & FEATURES__INDEX_ADDR) {
-- 
2.7.4



[PATCH 0/2] mtd: rawnand: denali: clean-up unnecessary hook and device reset

2018-09-07 Thread Masahiro Yamada


As I replied to Boris [1],
I took a closer look for further cleanups.
I tested this series on my board.

Remove mis-implemented ->dev_ready hook.
Remove unnecessary device resetting because
nand_scan_ident() resets devices anyway.

[1] http://patchwork.ozlabs.org/patch/960160/



Masahiro Yamada (2):
  mtd: rawnand: denali: remove ->dev_ready() hook
  mtd: rawnand: denali: remove denali_reset_banks()

 drivers/mtd/nand/raw/denali.c | 51 +--
 1 file changed, 1 insertion(+), 50 deletions(-)

-- 
2.7.4



[PATCH 2/2] mtd: rawnand: denali: remove denali_reset_banks()

2018-09-07 Thread Masahiro Yamada
In nand_scan_ident(), the controller driver resets every NAND chip.
This is done by sending NAND_CMD_RESET.  The Denali IP provides
another way to do the equivalent thing; if a bit is set in the
DEVICE_RESET register, the controller sends the RESET command to
the corresponding device.  denali_reset_banks() uses it to reset
all devices beforehand.

This redundant reset sequence was needed to know the actual number
of chips before calling nand_scan_ident(); if DEVICE_RESET fails,
there is no chip in that chip select.  Then, denali_reset_banks()
sets denali->max_banks to the number of detected chips.

As commit f486287d2372 ("mtd: nand: denali: fix bank reset function
to detect the number of chips") explained, nand_scan_ident() issued
Set Features (0xEF) command to all CS lines, some of which may not be
connected with a chip. Then, the driver would wait for R/B# response,
which never happens.

This problem was solved by commit 107b7d6a7ad4 ("mtd: rawnand: avoid
setting again the timings to mode 0 after a reset").  In the current
code, nand_setup_data_interface() is called from nand_scan_tail(),
which is invoked after the chip detection.

Now, we can really remove the redundant denali_reset_banks() by simply
passing the maximum number of chip selects supported by this IP
(typically 4 or 8) to nand_scan().  Let's leave all the chip detection
process to nand_scan_ident().
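
In code terms, the detection then reduces to something like this (a sketch
against the 4.19-era raw NAND API; the exact call site in denali_init() may
differ):

	/* let the core probe every chip select; absent chips are
	 * simply not detected */
	ret = nand_scan(mtd, denali->max_banks);
	if (ret)
		goto disable_irq;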

Signed-off-by: Masahiro Yamada 
---

 drivers/mtd/nand/raw/denali.c | 29 -
 1 file changed, 29 deletions(-)

diff --git a/drivers/mtd/nand/raw/denali.c b/drivers/mtd/nand/raw/denali.c
index f069184..d1ae968 100644
--- a/drivers/mtd/nand/raw/denali.c
+++ b/drivers/mtd/nand/raw/denali.c
@@ -1040,29 +1040,6 @@ static int denali_setup_data_interface(struct mtd_info *mtd, int chipnr,
return 0;
 }
 
-static void denali_reset_banks(struct denali_nand_info *denali)
-{
-   u32 irq_status;
-   int i;
-
-   for (i = 0; i < denali->max_banks; i++) {
-   denali->active_bank = i;
-
-   denali_reset_irq(denali);
-
-   iowrite32(DEVICE_RESET__BANK(i),
- denali->reg + DEVICE_RESET);
-
-   irq_status = denali_wait_for_irq(denali,
-   INTR__RST_COMP | INTR__INT_ACT | INTR__TIME_OUT);
-   if (!(irq_status & INTR__INT_ACT))
-   break;
-   }
-
-   dev_dbg(denali->dev, "%d chips connected\n", i);
-   denali->max_banks = i;
-}
-
 static void denali_hw_init(struct denali_nand_info *denali)
 {
/*
@@ -1311,12 +1288,6 @@ int denali_init(struct denali_nand_info *denali)
}
 
denali_enable_irq(denali);
-   denali_reset_banks(denali);
-   if (!denali->max_banks) {
-   /* Error out earlier if no chip is found for some reasons. */
-   ret = -ENODEV;
-   goto disable_irq;
-   }
 
denali->active_bank = DENALI_INVALID_BANK;
 
-- 
2.7.4



Re: [PATCH 1/2] platform/chrome: Move mfd/cros_ec_lpc* includes to drivers/platform.

2018-09-07 Thread Benson Leung
Hi Enric,

On Wed, Jul 18, 2018 at 06:09:55PM +0200, Enric Balletbo i Serra wrote:
> The cros-ec-lpc driver lives in drivers/platform because is platform
> specific, however there are two includes (cros_ec_lpc_mec.h and
> cros_ec_lpc_reg.h) that lives in include/linux/mfd. These two includes
> are only used for the platform driver and are not really related to the
> MFD subsystem, so move the includes from include/linux/mfd to
> drivers/platform/chrome.
> 
> Signed-off-by: Enric Balletbo i Serra 

Thanks. Applied to my working branch for v4.20.

-- 
Benson Leung
Staff Software Engineer
Chrome OS Kernel
Google Inc.
ble...@google.com
Chromium OS Project
ble...@chromium.org




Re: [PATCH v7 1/2] leds: core: Introduce LED pattern trigger

2018-09-07 Thread Pavel Machek
Hi!

> +What:/sys/class/leds//hw_pattern
> +Date:September 2018
> +KernelVersion:   4.20
> +Description:
> + Specify a hardware pattern for the SC27XX LED. For the SC27XX
> + LED controller, it only supports 4 hardware patterns to 
> configure
> + the low time, rise time, high time and fall time for the 
> breathing
> + mode, and each stage duration unit is 125ms. So the format of
> + the hardware pattern values should be:
> + "brightness_1 duration_1 brightness_2 duration_2 brightness_3
> + duration_3 brightness_4 duration_4".
> 
> In this case low time and high time can be easily described with
> use of the proposed [brightness delta_t] tuples. It is not equally
> obvious in case of rise time and fall time.
> 
> I can imagine hw pattern that would require defining blink rate
> over period of time, or blink rate during rise/fall time - in the
> latter case we would have odd number of pattern components. Probably
> it wouldn't be a big deal, we'd need one "padding" value, but still
> there's room for improvement IMHO.

Well, you can describe blinking while rising, it is just going to be
awkward as you'll need to give precise times/brightnesses for each
blinking, and pattern will become long.

I'm sure some hardware can do that (the led in N900 can compute prime
numbers, it can blink while changing brightness, too).

OTOH people tend to use pretty simple patterns on their LEDs, so we
should be fine.

Pavel
-- 
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) 
http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html




Re: [PATCH] printk/tracing: Do not trace printk_nmi_enter()

2018-09-07 Thread Peter Zijlstra
On Wed, Sep 05, 2018 at 09:33:34PM -0400, Steven Rostedt wrote:
>   do_idle {
> 
> [interrupts enabled]
> 
>  [interrupts disabled]
>   TRACE_IRQS_OFF [lockdep says irqs off]
>   [...]
>   TRACE_IRQS_IRET
>   test if pt_regs say return to interrupts enabled [yes]
>   TRACE_IRQS_ON [lockdep says irqs are on]
> 
>   
>   nmi_enter() {
>   printk_nmi_enter() [traced by ftrace]
>   [ hit ftrace breakpoint ]
>   
>   TRACE_IRQS_OFF [lockdep says irqs off]
>   [...]
>   TRACE_IRQS_IRET [return from breakpoint]
>  test if pt_regs say interrupts enabled [no]
>  [iret back to interrupt]
>  [iret back to code]
> 
> tick_nohz_idle_enter() {
> 
>   lockdep_assert_irqs_enabled() [lockdep say no!]

Isn't the problem that we muck with the IRQ state from NMI context? We
shouldn't be doing that.

The thing is, since we trace the IRQ state from within IRQ-disable,
since that's the only IRQ-safe option, it is very much not NMI-safe.

Your patch might avoid the symptom, but I don't think it cures the
fundamental problem.


Re: [PATCH v9 3/6] kernel/reboot.c: export pm_power_off_prepare

2018-09-07 Thread Oleksij Rempel
Hi Mark,

On Thu, Sep 06, 2018 at 11:15:17AM +0100, Mark Brown wrote:
> On Mon, Aug 27, 2018 at 09:48:16AM +0800, Shawn Guo wrote:
> 
> > Can you ACK on those two regulator patches, so that I can queue this
> > series up on IMX tree?
> 
> I was expecting to get a pull request with the precursor patches in it -
> the regulator driver seems to get a moderate amount of development so
> there's a reasonable risk of conflicts.

Is there anything I can or should do?

-- 
Pengutronix e.K.   | |
Industrial Linux Solutions | http://www.pengutronix.de/  |
Peiner Str. 6-8, 31137 Hildesheim, Germany | Phone: +49-5121-206917-0|
Amtsgericht Hildesheim, HRA 2686   | Fax:   +49-5121-206917- |




Re: [PATCH 0/9] psi: pressure stall information for CPU, memory, and IO v4

2018-09-07 Thread Daniel Drake
On Thu, Sep 6, 2018 at 5:43 AM, Johannes Weiner  wrote:
> Peter, do the changes from v3 look sane to you?
>
> If there aren't any further objections, I was hoping we could get this
> lined up for 4.20.

That would be excellent. I just retested the latest version at
http://git.cmpxchg.org/cgit.cgi/linux-psi.git (Linux 4.18) and the
results are great.

Test setup:
Endless OS
GeminiLake N4200 low end laptop
2GB RAM
swap (and zram swap) disabled

Baseline test: open a handful of large-ish apps and several website
tabs in Google Chrome.
Results: after a couple of minutes, system is excessively thrashing,
mouse cursor can barely be moved, UI is not responding to mouse
clicks, so it's impractical to recover from this situation as an
ordinary user

Add my simple killer:
https://gist.github.com/dsd/a8988bf0b81a6163475988120fe8d9cd
Results: when the thrashing causes the UI to become sluggish, the
killer steps in and kills something (usually a chrome tab), and the
system remains usable. I repeatedly opened more apps and more websites
over a 15 minute period but I wasn't able to get the system to a point
of UI unresponsiveness.

Thanks,
Daniel


[PATCH v8 3/3]: perf record: extend trace writing to multi AIO

2018-09-07 Thread Alexey Budankov


Multi AIO trace writing allows caching more kernel data into userspace
memory, postponing trace writing for the sake of an overall profiling
data throughput increase. It can be seen as an extension of the kernel
data buffer into userspace memory.

With an aio-cblocks option value different from 1, the current default,
the tool can cache more and more data into user space while delegating
the spill to AIO.

That avoids the suspend at record__aio_sync() between calls of
record__mmap_read_evlist() and increases profiling data throughput at
the cost of userspace memory.
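
The underlying POSIX AIO pattern -- reuse a completed control block if one
exists, otherwise suspend on the whole pool -- can be sketched standalone
like this (my illustration, not the perf code itself; a reaped block is
assumed to have aio_fildes reset to -1 by the caller):

#include <aio.h>
#include <errno.h>
#include <time.h>

/* Return the index of a reusable control block; block if all are in flight. */
static int pick_free_cblock(struct aiocb *cb, const struct aiocb **list, int n)
{
	struct timespec timeout = { 0, 1000 * 1000 }; /* 1ms */
	int i;

	for (;;) {
		for (i = 0; i < n; i++) {
			if (cb[i].aio_fildes == -1 ||
			    aio_error(&cb[i]) != EINPROGRESS)
				return i;	/* unused or completed slot */
			list[i] = &cb[i];	/* still in flight */
		}
		/* every block is busy: wait until at least one completes */
		if (aio_suspend(list, n, &timeout) &&
		    errno != EAGAIN && errno != EINTR)
			return -1;
	}
}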

Signed-off-by: Alexey Budankov 
---
 tools/perf/builtin-record.c | 55 +++---
 tools/perf/perf.h   |  1 +
 tools/perf/util/evlist.c|  7 ++--
 tools/perf/util/evlist.h|  3 +-
 tools/perf/util/mmap.c  | 83 +++--
 tools/perf/util/mmap.h  | 10 +++---
 6 files changed, 114 insertions(+), 45 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index d4857572cf33..6361098a5898 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -192,16 +192,35 @@ static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
return rc;
 }
 
-static void record__aio_sync(struct perf_mmap *md)
+static int record__aio_sync(struct perf_mmap *md, bool sync_all)
 {
-   struct aiocb *cblock = &md->cblock;
+   struct aiocb **aiocb = md->aiocb;
+   struct aiocb *cblocks = md->cblocks;
struct timespec timeout = { 0, 1000 * 1000  * 1 }; // 1ms
+   int i, do_suspend;
 
do {
-   if (cblock->aio_fildes == -1 || record__aio_complete(md, cblock))
-   return;
+   do_suspend = 0;
+   for (i = 0; i < md->nr_cblocks; ++i) {
+   if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
+   if (sync_all)
+   aiocb[i] = NULL;
+   else
+   return i;
+   } else {
+   /*
+* Started aio write is not complete yet
+* so it has to be waited before the
+* next allocation.
+*/
+   aiocb[i] = &cblocks[i];
+   do_suspend = 1;
+   }
+   }
+   if (!do_suspend)
+   return -1;
 
-   while (aio_suspend((const struct aiocb**)&cblock, 1, &timeout)) {
+   while (aio_suspend((const struct aiocb **)aiocb, md->nr_cblocks, &timeout)) {
if (!(errno == EAGAIN || errno == EINTR))
pr_err("failed to sync perf data, error: %m\n");
}
@@ -428,7 +447,8 @@ static int record__mmap_evlist(struct record *rec,
 
if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
 opts->auxtrace_mmap_pages,
-opts->auxtrace_snapshot_mode) < 0) {
+opts->auxtrace_snapshot_mode,
+opts->nr_cblocks) < 0) {
if (errno == EPERM) {
pr_err("Permission error mapping pages.\n"
   "Consider increasing "
@@ -621,7 +641,7 @@ static void record__mmap_read_sync(struct record *rec)
for (i = 0; i < evlist->nr_mmaps; i++) {
struct perf_mmap *map = &maps[i];
if (map->base)
-   record__aio_sync(map);
+   record__aio_sync(map, true);
}
 }
 
@@ -629,7 +649,7 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli
bool overwrite)
 {
u64 bytes_written = rec->bytes_written;
-   int i;
+   int i, idx;
int rc = 0;
struct perf_mmap *maps;
 
@@ -648,11 +668,12 @@ static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evli
 
if (maps[i].base) {
/*
-* Call record__aio_sync() to wait till map->data buffer
-* becomes available after previous aio write request.
+* Call record__aio_sync() to get some free map->data
+* buffer or wait if all of previously started aio
+* writes are still incomplete.
 */
-   record__aio_sync(&maps[i]);
-   if (perf_mmap__push(&maps[i], rec, record__pushfn) != 0) {
+   idx = record__aio_sync(&maps[i], false);
+   if (perf_mmap__push(&maps[i], rec, idx, record__pushfn) != 0) {
rc = -1;

Re: [PATCH] sched/fair: fix load_balance redo for null imbalance

2018-09-07 Thread Vincent Guittot
Le Friday 07 Sep 2018 à 13:37:49 (+0200), Peter Zijlstra a écrit :
> On Fri, Sep 07, 2018 at 09:51:04AM +0200, Vincent Guittot wrote:
> > It can happen that load_balance finds a busiest group and then a busiest rq
> > but the calculated imbalance is in fact null.
> 
> Cute. Does that happen often?

I have a use case with RT tasks that reproduces the problem regularly.
It happens at least when we have CPUs with different capacity, either because
of heterogeneous CPUs or because of RT/DL reducing the available capacity for
cfs. I have put the call path that triggers the problem below, and according
to the comment it seems that we can reach a similar state when playing with
priorities.

> 
> > If the calculated imbalance is null, it's useless to try to find a busiest
> > rq as no task will be migrated and we can return immediately.
> > 
> > This situation can happen with heterogeneous system or smp system when RT
> > tasks are decreasing the capacity of some CPUs.
> 
> Is it the result of one of those "force_balance" conditions in
> find_busiest_group() ? Should we not fix that to then return NULL
> instead?

The use case is:
We have a newly_idle load balance that is triggered when an RT task becomes
idle (but I think I have seen that with idle load balance too)

we trigger:
if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
busiest->group_no_capacity)
goto force_balance;

In calculate_imbalance we use the path
/*
 * Avg load of busiest sg can be less and avg load of local sg can
 * be greater than avg load across all sgs of sd because avg load
 * factors in sg capacity and sgs with smaller group_type are
 * skipped when updating the busiest sg:
 */
if (busiest->avg_load <= sds->avg_load ||
local->avg_load >= sds->avg_load) {
env->imbalance = 0;
return fix_small_imbalance(env, sds);
}

but fix_small_imbalance() finally decides to return without modifying the
imbalance, like here:
if (busiest->avg_load + scaled_busy_load_per_task >=
local->avg_load + (scaled_busy_load_per_task * imbn)) {
env->imbalance = busiest->load_per_task;
return;
}

Besides this patch, I'm preparing another patch for fix_small_imbalance() to
ensure 1 task per CPU in a similar situation, but according to the comment
above, we can reach this situation because of task priorities.
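
For reference, the shape of the proposed bail-out is roughly the following
(a sketch; the exact placement inside load_balance() is an assumption, not
the final patch):

	group = find_busiest_group(&env);
	if (!group) {
		schedstat_inc(sd->lb_nobusyg[idle]);
		goto out_balanced;
	}

	/* a busiest group was found but there is nothing to migrate */
	if (!env.imbalance)
		goto out_balanced;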



Re: [PATCH 4/4] sched/numa: Do not move imbalanced load purely on the basis of an idle CPU

2018-09-07 Thread Mel Gorman
On Fri, Sep 07, 2018 at 01:33:09PM +0200, Peter Zijlstra wrote:
> > ---
> >  kernel/sched/fair.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index d59d3e00a480..d4c289c11012 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -1560,7 +1560,7 @@ static bool task_numa_compare(struct task_numa_env 
> > *env,
> > goto unlock;
> >  
> > if (!cur) {
> > -   if (maymove || imp > env->best_imp)
> > +   if (maymove)
> > goto assign;
> > else
> > goto unlock;
> 
> Srikar's patch here:
> 
>   
> http://lkml.kernel.org/r/1533276841-16341-4-git-send-email-sri...@linux.vnet.ibm.com
> 
> Also frobs this condition, but in a less radical way. Does that yield
> similar results?

I can check. I do wonder of course if the less radical approach just means
that automatic NUMA balancing and the load balancer simply disagree about
placement at a different time. It'll take a few days to have an answer as
the battery of workloads to check this takes ages.

-- 
Mel Gorman
SUSE Labs


Re: [GIT PULL] ext4 updates for 3.11

2018-09-07 Thread Greg KH
Digging up an email thread from 2013...

On Wed, Jul 03, 2013 at 01:29:41PM +1000, Dave Chinner wrote:
> On Tue, Jul 02, 2013 at 06:01:11PM -0700, Greg KH wrote:
> > On Tue, Jul 02, 2013 at 05:58:15PM -0700, Linus Torvalds wrote:
> > > On Tue, Jul 2, 2013 at 5:54 PM, Greg KH  wrote:
> > > > On Tue, Jul 02, 2013 at 05:02:21PM -0700, Linus Torvalds wrote:
> > > >>
> > > >> I'm really not convinced this whole Lustre thing was correctly
> > > >> handled. Merging it into stable and yet being in such bad shape that
> > > >> it isn't enabled even there? I just dunno. But I have the turd in my
> > > >> tree now, let's hope it gets fixed up.
> > > >
> > > > It's in "staging", not "stable" :)
> > > 
> > > Yes. But what was the reason to actually merge it even there? And once
> > > it gets merged, disabling it again rather than fixing the problems it
> > > has?
> > 
> > The problems turned out to be too big, too late in the merge cycle for
> > me to be able to take them (they still aren't even done, as I don't have
> > a working set of patches yet.)  So I just disabled it from the build to
> > give Andreas and team time to get it working properly.
> > 
> > I could have just removed it, but I thought I would give them a chance.
> > 
> > > This is a filesystem that Intel apparently wants to push. I think it
> > > would have been a better idea to push back a bit and say "at least
> > > clean it up a bit first". It's not like Intel is one of the clueless
> > > companies that couldn't have done so and need help from the community.
> > 
> > For this filesystem, it seems that they don't have any resources to do
> > this work and are relying on the community to help out.  Which is odd,
> > but big companies are strange some times...
> 
> Didn't we learn this lesson already with POHMELFS? i.e. that dumping
> filesystem code in staging on the assumption "the community" will
> fix it up when nobody in "the community" uses or can even test that
> filesystem is a broken development model

Dave, and Linus, you were totally right here.  Sorry for not listening
to you before, my fault.  The lustre developers never got their act
together and probably by this being in staging, it only prolonged the
agony of everyone involved.

greg k-h


Re: [PATCH] printk/tracing: Do not trace printk_nmi_enter()

2018-09-07 Thread Steven Rostedt
On Fri, 7 Sep 2018 15:45:32 +0200
Peter Zijlstra  wrote:

> Yes really, we should not muck with the IRQ state from NMI context.

Right, and we didn't. Your patch didn't change anything but allow for
printk_nmi_enter/exit() to be traced by ftrace, and that's wrong to
begin with because ftrace_nmi_enter() hasn't been called yet.

-- Steve


Re: [PATCH 8/9] psi: pressure stall information for CPU, memory, and IO

2018-09-07 Thread Johannes Weiner
On Fri, Sep 07, 2018 at 12:16:34PM +0200, Peter Zijlstra wrote:
> On Tue, Aug 28, 2018 at 01:22:57PM -0400, Johannes Weiner wrote:
> > +enum psi_states {
> > +   PSI_IO_SOME,
> > +   PSI_IO_FULL,
> > +   PSI_MEM_SOME,
> > +   PSI_MEM_FULL,
> > +   PSI_CPU_SOME,
> > +   /* Only per-CPU, to weigh the CPU in the global average: */
> > +   PSI_NONIDLE,
> > +   NR_PSI_STATES,
> > +};
> 
> > +static u32 get_recent_time(struct psi_group *group, int cpu,
> > +  enum psi_states state)
> > +{
> > +   struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
> > +   unsigned int seq;
> > +   u32 time, delta;
> > +
> > +   do {
> > +   seq = read_seqcount_begin(&groupc->seq);
> > +
> > +   time = groupc->times[state];
> > +   /*
> > +* In addition to already concluded states, we also
> > +* incorporate currently active states on the CPU,
> > +* since states may last for many sampling periods.
> > +*
> > +* This way we keep our delta sampling buckets small
> > +* (u32) and our reported pressure close to what's
> > +* actually happening.
> > +*/
> > +   if (test_state(groupc->tasks, state))
> > +   time += cpu_clock(cpu) - groupc->state_start;
> > +   } while (read_seqcount_retry(&groupc->seq, seq));
> > +
> > +   delta = time - groupc->times_prev[state];
> > +   groupc->times_prev[state] = time;
> > +
> > +   return delta;
> > +}
> 
> > +static bool update_stats(struct psi_group *group)
> > +{
> > +   u64 deltas[NR_PSI_STATES - 1] = { 0, };
> > +   unsigned long missed_periods = 0;
> > +   unsigned long nonidle_total = 0;
> > +   u64 now, expires, period;
> > +   int cpu;
> > +   int s;
> > +
> > +   mutex_lock(&group->stat_lock);
> > +
> > +   /*
> > +* Collect the per-cpu time buckets and average them into a
> > +* single time sample that is normalized to wallclock time.
> > +*
> > +* For averaging, each CPU is weighted by its non-idle time in
> > +* the sampling period. This eliminates artifacts from uneven
> > +* loading, or even entirely idle CPUs.
> > +*/
> > +   for_each_possible_cpu(cpu) {
> > +   u32 nonidle;
> > +
> > +   nonidle = get_recent_time(group, cpu, PSI_NONIDLE);
> > +   nonidle = nsecs_to_jiffies(nonidle);
> > +   nonidle_total += nonidle;
> > +
> > +   for (s = 0; s < PSI_NONIDLE; s++) {
> > +   u32 delta;
> > +
> > +   delta = get_recent_time(group, cpu, s);
> > +   deltas[s] += (u64)delta * nonidle;
> > +   }
> > +   }
> 
> This does the whole seqcount thing 6x, which is a bit of a waste.

[...]

> It's a bit cumbersome, but that's because of C.

I was actually debating exactly this with Suren before, but since this
is a super cold path I went with readability. I was also thinking that
restarts could happen quite regularly under heavy scheduler load, and
so keeping the individual retry sections small could be helpful - but
I didn't instrument this in any way.

No strong opinion from me, I can send an updated patch if you prefer.
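
For reference, folding everything into one retry section would read roughly
like this (my paraphrase, reusing the field names from the quoted patch; the
real code may differ):

	static void get_recent_times(struct psi_group *group, int cpu,
				     u32 *deltas)
	{
		struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
		u32 times[NR_PSI_STATES];
		unsigned int seq;
		int s;

		/* one seqcount retry section for all states */
		do {
			seq = read_seqcount_begin(&groupc->seq);
			for (s = 0; s < NR_PSI_STATES; s++) {
				times[s] = groupc->times[s];
				if (test_state(groupc->tasks, s))
					times[s] += cpu_clock(cpu) -
						    groupc->state_start;
			}
		} while (read_seqcount_retry(&groupc->seq, seq));

		for (s = 0; s < NR_PSI_STATES; s++) {
			deltas[s] = times[s] - groupc->times_prev[s];
			groupc->times_prev[s] = times[s];
		}
	}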


Re: [PATCH v11 0/3] remain and optimize memblock_next_valid_pfn on arm and arm64

2018-09-07 Thread Will Deacon
On Thu, Sep 06, 2018 at 01:24:22PM +0200, Ard Biesheuvel wrote:
> On 22 August 2018 at 05:07, Jia He  wrote:
> > Commit b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns
> > where possible") optimized the loop in memmap_init_zone(). But it causes
> > possible panic bug. So Daniel Vacek reverted it later.
> >
> > But as suggested by Daniel Vacek, it is fine to using memblock to skip
> > gaps and finding next valid frame with CONFIG_HAVE_ARCH_PFN_VALID.
> >
> > More from what Daniel said:
> > "On arm and arm64, memblock is used by default. But generic version of
> > pfn_valid() is based on mem sections and memblock_next_valid_pfn() does
> > not always return the next valid one but skips more resulting in some
> > valid frames to be skipped (as if they were invalid). And that's why
> > kernel was eventually crashing on some !arm machines."
> >
> > About the performance consideration:
> > As said by James in b92df1de5,
> > "I have tested this patch on a virtual model of a Samurai CPU with a
> > sparse memory map.  The kernel boot time drops from 109 to 62 seconds."
> > Thus it would be better if we remain memblock_next_valid_pfn on arm/arm64.
> >
> > Besides we can remain memblock_next_valid_pfn, there is still some room
> > for improvement. After this set, I can see the time overhead of memmap_init
> > is reduced from 27956us to 13537us in my armv8a server(QDF2400 with 96G
> > memory, pagesize 64k). I believe arm server will benefit more if memory is
> > larger than TBs
> >
> 
> OK so we can summarize the benefits of this series as follows:
> - boot time on a virtual model of a Samurai CPU drops from 109 to 62 seconds
> - boot time on a QDF2400 arm64 server with 96 GB of RAM drops by ~15
> *milliseconds*
> 
> Google was not very helpful in figuring out what a Samurai CPU is and
> why we should care about the boot time of Linux running on a virtual
> model of it, and the 15 ms speedup is not that compelling either.
> 
> Apologies to Jia that it took 11 revisions to reach this conclusion,
> but in /my/ opinion, tweaking the fragile memblock/pfn handling code
> for this reason is totally unjustified, and we're better off
> disregarding these patches.

Oh, we're talking about a *simulator* for the significant boot time
improvement here? I didn't realise that, so I agree that the premise of
this patch set looks pretty questionable given how much "fun" we've had
with the memmap on arm and arm64.

Will


Re: [PATCH] printk/tracing: Do not trace printk_nmi_enter()

2018-09-07 Thread Sergey Senozhatsky
On (09/07/18 16:03), Peter Zijlstra wrote:
> > 
> > I would even argue that placing printk_nmi_enter() between
> > lockdep_off() and ftrace_nmi_enter() is wrong because if in the future
> > printk_nmi_enter() were to do any ftrace tracing, it wont be caught, as
> > it was by having it before lockdep_off().
> > 
> > printk_nmi_enter() should not muck with IRQ state, nor should it do any
> > ftrace tracing. Since ftrace mucks with IRQ state when it gets enabled
> > or disabled, it will screw up lockdep, and lockdep will complain. That
> > way we can use lockdep not being off to catch this bug.
> 
> The very bestest solution is to rm -rf printk ;-)

Talented, capable and tremendously clever people had spent decades on
making printk what it is today. I feel responsible for respecting that
effort and, thus, my vote would be to keep printk around for a while.
... we also support !CONFIG_PRINTK builds ;)

-ss


Re: [tip:x86/paravirt] x86/paravirt: Move the pv_irq_ops under the PARAVIRT_XXL umbrella

2018-09-07 Thread Juergen Gross
On 07/09/18 16:49, Borislav Petkov wrote:
> Hi Jürgen,
> 
> On Mon, Sep 03, 2018 at 08:01:40AM -0700, tip-bot for Juergen Gross wrote:
>> Commit-ID:  6da63eb241a05b0e676d68975e793c0521387141
>> Gitweb: 
>> https://git.kernel.org/tip/6da63eb241a05b0e676d68975e793c0521387141
>> Author: Juergen Gross 
>> AuthorDate: Tue, 28 Aug 2018 09:40:24 +0200
>> Committer:  Thomas Gleixner 
>> CommitDate: Mon, 3 Sep 2018 16:50:36 +0200
>>
>> x86/paravirt: Move the pv_irq_ops under the PARAVIRT_XXL umbrella
>>
>> All of the paravirt ops defined in pv_irq_ops are for Xen PV guests
>> or VSMP only. Define them only if CONFIG_PARAVIRT_XXL is set.
>>
>> Signed-off-by: Juergen Gross 
>> Signed-off-by: Thomas Gleixner 
>> Cc: xen-de...@lists.xenproject.org
>> Cc: virtualizat...@lists.linux-foundation.org
>> Cc: akata...@vmware.com
>> Cc: ru...@rustcorp.com.au
>> Cc: boris.ostrov...@oracle.com
>> Cc: h...@zytor.com
>> Link: https://lkml.kernel.org/r/20180828074026.820-14-jgr...@suse.com
>>
>> ---
>>  arch/x86/include/asm/irqflags.h   | 8 +---
>>  arch/x86/include/asm/paravirt.h   | 6 +++---
>>  arch/x86/include/asm/paravirt_types.h | 3 ++-
>>  arch/x86/kernel/asm-offsets.c | 2 +-
>>  arch/x86/kernel/asm-offsets_64.c  | 2 +-
>>  arch/x86/kernel/paravirt.c| 2 +-
>>  arch/x86/kernel/paravirt_patch_32.c   | 4 ++--
>>  arch/x86/kernel/paravirt_patch_64.c   | 4 +++-
>>  arch/x86/kernel/vsmp_64.c | 2 +-
>>  9 files changed, 15 insertions(+), 18 deletions(-)
> 
> this one is breaking the randconfig builds with the following error
> (failure case simplified):
> 
> $ make arch/x86/entry/entry_64.o
>   DESCEND  objtool
>   CALLscripts/checksyscalls.sh
>   AS  arch/x86/entry/entry_64.o
> In file included from arch/x86/entry/entry_64.S:33:0:
> ./arch/x86/include/asm/paravirt.h:938:0: warning: "SAVE_FLAGS" redefined
>  #define SAVE_FLAGS(clobbers)\

And the fixing patch is already there:

https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/patch/?id=b7a5eb6aafa95fce45fc4dcbc195cb232fa1b76d


Juergen


Re: [PATCH v6 1/5] seccomp: add a return code to trap to userspace

2018-09-07 Thread Tycho Andersen
Hey Tyler,

On Thu, Sep 06, 2018 at 10:15:12PM +, Tyler Hicks wrote:
> > +Users can read via ``ioctl(SECCOMP_NOTIF_RECV)``  (or ``poll()``) on a 
> > seccomp
> > +notification fd to receive a ``struct seccomp_notif``, which contains five
> > +members: the input length of the structure, a globally unique ``id``, the
> 
> This documentation says that id is "globally unique" but an in-code
> comment below says "this is unique for this filter". IIUC, the id is
> only guaranteed to be unique for the filter so this documentation should
> be updated slightly to make it clear that the id is only global in those
> terms.

Yup, thanks.

> > +``pid`` of the task which triggered this request (which may be 0 if the 
> > task is
> > +in a pid ns not visible from the listener's pid namespace), a flag 
> > representing
> > +whether or not the notification is a result of a non-fatal signal, and the
> > +``data`` passed to seccomp. Userspace can then make a decision based on 
> > this
> > +information about what to do, and ``ioctl(SECCOMP_NOTIF_SEND)`` a response,
> > +indicating what should be returned to userspace. The ``id`` member of 
> > ``struct
> > +seccomp_notif_resp`` should be the same ``id`` as in ``struct 
> > seccomp_notif``.
> > +
> > +It is worth noting that ``struct seccomp_data`` contains the values of 
> > register
> > +arguments to the syscall, but does not contain pointers to memory. The 
> > task's
> > +memory is accessible to suitably privileged traces via ``ptrace()`` or
> > +``/proc/pid/map_files/``. However, care should be taken to avoid the TOCTOU
> > +mentioned above in this document: all arguments being read from the 
> > tracee's
> > +memory should be read into the tracer's memory before any policy decisions 
> > are
> > +made. This allows for an atomic decision on syscall arguments.
> > +
> >  Sysctls
> >  ===
> >  
> > diff --git a/arch/Kconfig b/arch/Kconfig
> > index 6801123932a5..42f3585d925d 100644
> > --- a/arch/Kconfig
> > +++ b/arch/Kconfig
> > @@ -419,6 +419,15 @@ config SECCOMP_FILTER
> >  
> >   See Documentation/userspace-api/seccomp_filter.rst for details.
> >  
> > +config SECCOMP_USER_NOTIFICATION
> 
> Did someone request a Kconfig option for this new feature? If not, I
> think that nuking the Kconfig option would reduce the test matrix. No
> other filter flags have their own build time option but maybe it makes
> sense in this case if this filter flag exposes the kernel to significant
> new attack surface since there's more to this than just a new filter
> flag.
> 
> If someone has a requirement to disable this feature, maybe it'd be
> better to leave the decision up to the distro *and* the admin via a
> sysctl instead of taking the admin out of the decision with a build
> time option.

No, there was no explicit request by anyone, I just did it so I
wouldn't offend anyone with this code. I'll drop it for the next
version.
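
If it ever comes back as a runtime knob instead, a minimal sysctl would look
something like this (all names here are invented for illustration):

	static int zero;
	static int one = 1;
	static int sysctl_seccomp_user_notify = 1;

	static struct ctl_table seccomp_table[] = {
		{
			.procname	= "user_notification_enabled",
			.data		= &sysctl_seccomp_user_notify,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec_minmax,
			.extra1		= &zero,
			.extra2		= &one,
		},
		{ }
	};

with the flag checked once in seccomp_set_mode_filter() before the new
filter flag is honored.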

> >  /**
> >   * struct seccomp_filter - container for seccomp BPF programs
> >   *
> > @@ -66,6 +114,30 @@ struct seccomp_filter {
> > bool log;
> > struct seccomp_filter *prev;
> > struct bpf_prog *prog;
> > +
> > +#ifdef CONFIG_SECCOMP_USER_NOTIFICATION
> > +   /*
> > +* A semaphore that users of this notification can wait on for
> > +* changes. Actual reads and writes are still controlled with
> > +* filter->notify_lock.
> > +*/
> > +   struct semaphore request;
> > +
> > +   /* A lock for all notification-related accesses. */
> > +   struct mutex notify_lock;
> > +
> > +   /* Is there currently an attached listener? */
> > +   bool has_listener;
> > +
> > +   /* The id of the next request. */
> > +   u64 next_id;
> > +
> > +   /* A list of struct seccomp_knotif elements. */
> > +   struct list_head notifications;
> > +
> > +   /* A wait queue for poll. */
> > +   wait_queue_head_t wqh;
> > +#endif
> 
> I suspect that these additions would benefit from better struct packing
> since there could be a lot of seccomp_filter structs floating around in
> memory on a system with a large number of running containers or
> otherwise sandboxed processes.
> 
> IIRC, there's a 3 byte hole following the log member that could be used
> by has_listener, at least, and I'm not sure how the rest of the new
> members affect things.

Ok, I'll take a look.
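
For illustration, the repacking amounts to a member reorder along these
lines (a sketch; 'usage' and the exact surrounding members are assumed from
the upstream struct, and real offsets depend on config):

	struct seccomp_filter {
		refcount_t usage;
		bool log;
		bool has_listener;	/* now shares the hole after 'log' */
		struct seccomp_filter *prev;
		struct bpf_prog *prog;
		struct semaphore request;
		struct mutex notify_lock;
		u64 next_id;
		struct list_head notifications;
		wait_queue_head_t wqh;
	};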

> > +static void seccomp_do_user_notification(int this_syscall,
> > +struct seccomp_filter *match,
> > +const struct seccomp_data *sd)
> > +{
> > +   int err;
> > +   long ret = 0;
> > +   struct seccomp_knotif n = {};
> > +
> > +   mutex_lock(&match->notify_lock);
> > +   err = -ENOSYS;
> > +   if (!match->has_listener)
> > +   goto out;
> > +
> > +   n.pid = task_pid(current);
> > +   n.state = SECCOMP_NOTIFY_INIT;
> > +   n.data = sd;
> > +   n.id = seccomp_next_notify_id(match);
> > +   init_completion(&n.ready);
> > +
> > +   list_add(&n.list, &match->notifications);
> > +   wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
> > +
> > +  

Re: BUG: bad usercopy in __check_object_size (2)

2018-09-07 Thread Tetsuo Handa
On 2018/09/08 0:29, syzbot wrote:
> syzbot has found a reproducer for the following crash on:
> 
> HEAD commit:    28619527b8a7 Merge git://git.kernel.org/pub/scm/linux/kern..
> git tree:   bpf
> console output: https://syzkaller.appspot.com/x/log.txt?x=124e64d140
> kernel config:  https://syzkaller.appspot.com/x/.config?x=62e9b447c16085cf
> dashboard link: https://syzkaller.appspot.com/bug?extid=a3c9d2673837ccc0f22b
> compiler:   gcc (GCC) 8.0.1 20180413 (experimental)
> syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=179f9cd140
> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=11b3e8be40
> 
> IMPORTANT: if you fix the bug, please add the following tag to the commit:
> Reported-by: syzbot+a3c9d2673837ccc0f...@syzkaller.appspotmail.com
> 
>  entry_SYSCALL_64_after_hwframe+0x49/0xbe
> RIP: 0033:0x440479
> usercopy: Kernel memory overwrite attempt detected to spans multiple pages 
> (offset 0, size 64)!

Kees, is this because check_page_span() is failing to allow on-stack variable

   u8 opcodes[OPCODE_BUFSIZE];

which by chance crossed PAGE_SIZE boundary?


Re: [PATCH V3] spi: spi-geni-qcom: Add SPI driver support for GENI based QUP

2018-09-07 Thread Doug Anderson
Hi,

On Fri, Sep 7, 2018 at 3:00 AM,   wrote:
>> In v2, I said:
>>
>>> I'm not sure where to comment about this, so adding it to the end:
>>>
>>> Between v1 and v2 you totally removed all the locking.  Presumably
>>> this is because you didn't want to hold the lock in
>>> handle_fifo_timeout() while waiting for the completion.  IMO taking
>>> the lock out was the wrong thing to do.  You should keep it, but just
>>> drop the lock before wait_for_completion_timeout() and add it back
>>> afterwards.  Specifically you _don't_ want the IRQ and timeout code
>>> stomping on each other.
>>
>>
>> ...but still no spinlock?
>
> I see there is no need of taking the spinlock as timeout will be handled
> after the calculated time as per data size and speed.
> There is 99.9% less chances of interrupt during the timeout handler.
>>
>>
>>
>> https://chromium-review.googlesource.com/c/chromiumos/third_party/kernel/+/1201081

The thing is, we want it to be 100% reliable, not 99.9% reliable.  Is
it somehow wrong to add the spinlock?  ...or are you noticing
performance problems with the spinlock there?  It's just nice not to
have to think about it.
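
For reference, the pattern from the v2 comment is roughly the following (a
sketch; the lock, completion, and timeout names are placeholders, not the
driver's actual identifiers):

	spin_lock_irq(&mas->lock);
	reinit_completion(&mas->cancel_done);
	/* ... issue the cancel/abort command ... */
	spin_unlock_irq(&mas->lock);

	/* sleep unlocked so the IRQ handler can run and complete us */
	time_left = wait_for_completion_timeout(&mas->cancel_done, HZ);

	spin_lock_irq(&mas->lock);
	if (!time_left)
		dev_err(mas->dev, "cancel/abort timed out\n");
	/* ... clean up state shared with the IRQ handler ... */
	spin_unlock_irq(&mas->lock);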

-Doug


Re: [PATCH RFC LKMM 1/7] tools/memory-model: Add extra ordering for locks and remove it for ordinary release/acquire

2018-09-07 Thread Daniel Lustig
On 9/7/2018 9:09 AM, Will Deacon wrote:
> On Fri, Sep 07, 2018 at 12:00:19PM -0400, Alan Stern wrote:
>> On Thu, 6 Sep 2018, Andrea Parri wrote:
>>
 Have you noticed any part of the generic code that relies on ordinary 
 acquire-release (rather than atomic RMW acquire-release) in order to 
 implement locking constructs?
>>>
>>> There are several places in code where the "lock-acquire" seems to be
>>> provided by an atomic_cond_read_acquire/smp_cond_load_acquire: I have
>>> mentioned one in qspinlock in this thread; qrwlock and mcs_spinlock
>>> provide other examples (grep for the primitives...).
>>>
>>> As long as we don't consider these primitive as RMW (which would seem
>>> odd...) or as acquire for which "most people expect strong ordering"
>>> (see above), these provides other examples for the _gap_ I mentioned.
>>
>> Okay, now I understand your objection.  It does appear that on RISC-V,
>> if nowhere else, the current implementations of qspinlock, qrwlock,
>> etc. will not provide "RCtso" ordering.
>>
>> The discussions surrounding this topic have been so lengthy and 
>> confusing that I have lost track of any comments Palmer or Daniel may 
>> have made concerning this potential problem.
>>
>> One possible resolution would be to define smp_cond_load_acquire() 
>> specially on RISC-V so that it provided the same ordering guarantees as 
>> RMW-acquire.  (Plus adding a comment in the asm-generic/barrier.h 
>> pointing out the necessity for the stronger guarantee on all 
>> architectures.)
>>
>> Another would be to replace the usages of atomic/smp_cond_load_acquire 
>> in the locking constructs with a new function that would otherwise be 
>> the same but would provide the ordering guarantee we want.
>>
>> Do you think either of these would be an adequate fix?
> 
> I didn't think RISC-V used qspinlock or qrwlock, so I'm not sure there's
> actually anything to fix, is there?
> 
> Will

I've also lost track of whether the current preference is or is not for
RCtso, or in which subset of cases RCtso is currently preferred.  For
whichever cases do in fact need to be RCtso, the RISC-V approach would
still be the same as what I've written in the past, as far as I can
tell [1].

In a nutshell, if a data structure uses only atomics with .aq/.rl,
RISC-V provides RCtso already anyway.  If a data structure uses fences,
or mixes fences and atomics, we can replace a "fence r,rw" or a
"fence rw,w" with a "fence.tso" (== fence r,rw + fence rw,w) as
necessary, at the cost of some amount of performance.

I suppose the answer to the question of whether smp_cond_load_acquire()
needs to change depends on where exactly RCtso is needed, and which
data structures actually use that vs. some other macro. 

Does that answer your question Alan?  Does it make sense?

[1] 
https://lore.kernel.org/lkml/11b27d32-4a8a-3f84-0f25-723095ef1...@nvidia.com/

Dan


Re: [PATCH v4 0/3] mtd: rawnand: ams-delta: Cleanups and optimizations

2018-09-07 Thread Tony Lindgren
Hi,

* Janusz Krzysztofik  [180905 20:56]:
> On Wednesday, September 5, 2018 8:47:57 AM CEST Miquel Raynal wrote:
> > Patch 2/3 does not apply on nand/next. Indeed the driver does not look
> > the same as in the diff.
> 
> That's because I built it on top of my former series from the mid of July, 
> containing "[PATCH v2 2/3 v4] mtd: rawnand: ams-delta: use GPIO lookup 
> table". 
> It was acked by you, Miquel, and supposed to be merged via linux-omap tree.

Hmm I thought the plan was for dependencies to clear and then
merge the rest via various driver trees.. Or at least I don't have
the patch above tagged anywhere for me to merge. Then again, I
try to forget everything posted before -rc1 just to stay sane :)

> > I don't see any changes on my side that could
> > explain this so perhaps you could rebase on top of 4.19-rc2 (or
> > nand/next, as you wish) and resend the series?
> 
> As far as I can see, Tony hasn't applied that series yet, so maybe I can 
> still 
> move that patch out of there and insert it into this series in front of the 
> other 3 patches and resend. That would however make patch 3/3 of that old 
> series depend on this one.
> 
> Tony, what do you think?

Yes please resend based on v4.19-rc1 or MTD next. If there
are still pending dependencies, please let us know and we can
set up an immutable branch against v4.19-rc1 with those for
MTD and me to merge in as needed.

Regards,

Tony


[PATCH 1/4] 9p: acl: fix uninitialized iattr access

2018-09-07 Thread Dominique Martinet
From: Dominique Martinet 

iattr is passed to v9fs_vfs_setattr_dotl, which sends various values
from iattr over the wire; even though it tells the server to only look
at the fields flagged in iattr.ia_valid, this could leak some stack data.

Addresses-Coverity-ID: 1195601 ("Uninitalized scalar variable")
Signed-off-by: Dominique Martinet 
---
 fs/9p/acl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 082d227fa56b..6261719f6f2a 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -276,7 +276,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
switch (handler->flags) {
case ACL_TYPE_ACCESS:
if (acl) {
-   struct iattr iattr;
+   struct iattr iattr = { 0 };
struct posix_acl *old_acl = acl;
 
retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl);
-- 
2.17.1



[PATCH 3/4] 9p: p9dirent_read: check network-provided name length

2018-09-07 Thread Dominique Martinet
From: Dominique Martinet 

strcpy to dirent->d_name could overflow the buffer; use strscpy to check
the provided string length and error out if the size was too big.

While we are here, make the function return an error when the pdu
parsing failed, instead of returning the pdu offset as if it had been a
success...
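
For readers unfamiliar with it, strscpy() returns the number of bytes copied
(excluding the NUL), or -E2BIG when the source did not fit, while still
NUL-terminating the destination; for example:

	char name[8];
	ssize_t n;

	n = strscpy(name, "short", sizeof(name));	/* n == 5 */
	n = strscpy(name, "name-too-long", sizeof(name));
	/* n == -E2BIG, name[] holds a truncated, NUL-terminated copy */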

Addresses-Coverity-ID: 139133 ("Copy into fixed size buffer")
Signed-off-by: Dominique Martinet 
---
 net/9p/protocol.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/net/9p/protocol.c b/net/9p/protocol.c
index b4d80c533f89..462ba144cb39 100644
--- a/net/9p/protocol.c
+++ b/net/9p/protocol.c
@@ -623,13 +623,19 @@ int p9dirent_read(struct p9_client *clnt, char *buf, int len,
if (ret) {
p9_debug(P9_DEBUG_9P, "<<< p9dirent_read failed: %d\n", ret);
trace_9p_protocol_dump(clnt, &fake_pdu);
-   goto out;
+   return ret;
}
 
-   strcpy(dirent->d_name, nameptr);
+   ret = strscpy(dirent->d_name, nameptr, sizeof(dirent->d_name));
+   if (ret < 0) {
+   p9_debug(P9_DEBUG_ERROR,
+"On the wire dirent name too long: %s\n",
+nameptr);
+   kfree(nameptr);
+   return ret;
+   }
kfree(nameptr);
 
-out:
return fake_pdu.offset;
 }
 EXPORT_SYMBOL(p9dirent_read);
-- 
2.17.1



[PATCH 2/4] 9p/rdma: remove useless check in cm_event_handler

2018-09-07 Thread Dominique Martinet
From: Dominique Martinet 

The client c is always dereferenced to get the rdma struct, so c has to
be a valid pointer at this point.
GCC would optimize that away, but let's make Coverity happy...

Addresses-Coverity-ID: 102778 ("Dereference before null check")
Signed-off-by: Dominique Martinet 
---
 net/9p/trans_rdma.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 9719bc4d9424..119103bfa82e 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -274,8 +274,7 @@ p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
case RDMA_CM_EVENT_DISCONNECTED:
if (rdma)
rdma->state = P9_RDMA_CLOSED;
-   if (c)
-   c->status = Disconnected;
+   c->status = Disconnected;
break;
 
case RDMA_CM_EVENT_TIMEWAIT_EXIT:
-- 
2.17.1



Re: [PATCH v3 0/4] pci-dra7xx: Enable errata i870 workaround for RC mode

2018-09-07 Thread Tony Lindgren
* Vignesh R  [180810 10:10]:
> 
> 
> On Wednesday 08 August 2018 10:27 PM, Lorenzo Pieralisi wrote:
> > On Tue, Jul 24, 2018 at 11:01:46PM +0530, Vignesh R wrote:
> >> Make workaround for errata i870 applicable in Host mode as
> >> well(previously it was enabled only for EP mode) as per errata
> >> documentation: http://www.ti.com/lit/er/sprz450/sprz450.pdf
> >>
> >> Tested on DRA72 EVM
> >>
> >> Tony,
> >>
> >> If you are okay with the series, could you pick this via omap tree?
> >> All ACKs are in place and Lorenzo is okay with PCIe bits to go along with
> >> rest of DTS changes.
> > 
> > I think we have missed the v4.19 merge window by now - 
> 
> Right. I didn't get any response from Tony.

Sorry, catching up with pending mails. I try hard not to touch
anything except fixes around -rc6 time.

> > please let me know if I can drop this series from the PCI patch queue.
> > 
> 
> Ok, I will resend the patch after 4.19-rc. Thanks!

FYI, I'm untagging this thread too. Please post the dts
changes separately once the dependencies (if any) have
cleared.

Regards,

Tony


Re: [PATCH 4.4 31/43] mm: fix cache mode tracking in vm_insert_mixed()

2018-09-07 Thread Ben Hutchings
On Tue, 2018-08-14 at 19:18 +0200, Greg Kroah-Hartman wrote:
> 4.4-stable review patch.  If anyone has any objections, please let me know.
> 
> --
> 
> From: Dan Williams 
> 
> commit 87744ab3832b83ba71b931f86f9cfdb000d07da5 upstream
> 
> vm_insert_mixed() unlike vm_insert_pfn_prot() and vmf_insert_pfn_pmd(),
> fails to check the pgprot_t it uses for the mapping against the one
> recorded in the memtype tracking tree.  Add the missing call to
> track_pfn_insert() to preclude cases where incompatible aliased mappings
> are established for a given physical address range.
[...]

This apparently breaks a number of DRM drivers.  The upstream fixes
are:

8ef4227615e1 x86/io: add interface to reserve io memtype for a resource range. 
(v1.1)
7cf321d118a8 drm/drivers: add support for using the arch wc mapping API.

They appear to apply cleanly to 4.4-stable.  They are included in 4.9
so no other stable branch needs them.

Ben.

-- 
Ben Hutchings, Software Developer Codethink Ltd
https://www.codethink.co.uk/ Dale House, 35 Dale Street
 Manchester, M1 2HF, United Kingdom


Re: [PATCH 6/7] HID: logitech-hidpp: support the G700 over wireless

2018-09-07 Thread Harry Cutts
Hi Benjamin,

On Fri, 7 Sep 2018 at 03:35, Benjamin Tissoires
 wrote:
>
> The G700 is using a non unifying receiver, so it's easy to add its support
> in hid-logitech-hidpp now.
> [snip]
> @@ -3671,6 +3671,9 @@ static const struct hid_device_id hidpp_devices[] = {
> { /* Solar Keyboard Logitech K750 */
>   LDJ_DEVICE(0x4002),
>   .driver_data = HIDPP_QUIRK_CLASS_K750 },
> +   { /* G700 over Wireless */
> + HID_USB_DEVICE(USB_VENDOR_ID_LOGITECH, USB_DEVICE_ID_LOGITECH_G700_RECEIVER),
> + .driver_data = HIDPP_QUIRK_RECEIVER | HIDPP_QUIRK_UNIFYING },

As someone who's new to the codebase, it seems rather confusing to me
that HIDPP_QUIRK_UNIFYING would be present here for a device that
doesn't use a Unifying receiver. Am I misunderstanding, or should we
consider renaming the quirk or adding some clarifying comment?
(Similarly for the G900 in the next patch.)

>
> { LDJ_DEVICE(HID_ANY_ID) },
>
> --
> 2.14.3
>

Thanks,

Harry Cutts
Chrome OS Touch/Input team


Re: [PATCH RFC LKMM 1/7] tools/memory-model: Add extra ordering for locks and remove it for ordinary release/acquire

2018-09-07 Thread Alan Stern
On Fri, 7 Sep 2018, Daniel Lustig wrote:

> On 9/7/2018 9:09 AM, Will Deacon wrote:
> > On Fri, Sep 07, 2018 at 12:00:19PM -0400, Alan Stern wrote:
> >> On Thu, 6 Sep 2018, Andrea Parri wrote:
> >>
>  Have you noticed any part of the generic code that relies on ordinary 
>  acquire-release (rather than atomic RMW acquire-release) in order to 
>  implement locking constructs?
> >>>
> >>> There are several places in code where the "lock-acquire" seems to be
> >>> provided by an atomic_cond_read_acquire/smp_cond_load_acquire: I have
> >>> mentioned one in qspinlock in this thread; qrwlock and mcs_spinlock
> >>> provide other examples (grep for the primitives...).
> >>>
> >>> As long as we don't consider these primitive as RMW (which would seem
> >>> odd...) or as acquire for which "most people expect strong ordering"
> >>> (see above), these provides other examples for the _gap_ I mentioned.
> >>
> >> Okay, now I understand your objection.  It does appear that on RISC-V,
> >> if nowhere else, the current implementations of qspinlock, qrwlock,
> >> etc. will not provide "RCtso" ordering.
> >>
> >> The discussions surrounding this topic have been so lengthy and 
> >> confusing that I have lost track of any comments Palmer or Daniel may 
> >> have made concerning this potential problem.
> >>
> >> One possible resolution would be to define smp_cond_load_acquire() 
> >> specially on RISC-V so that it provided the same ordering guarantees as 
> >> RMW-acquire.  (Plus adding a comment in the asm-generic/barrier.h 
> >> pointing out the necessity for the stronger guarantee on all 
> >> architectures.)
> >>
> >> Another would be to replace the usages of atomic/smp_cond_load_acquire 
> >> in the locking constructs with a new function that would otherwise be 
> >> the same but would provide the ordering guarantee we want.
> >>
> >> Do you think either of these would be an adequate fix?
> > 
> > I didn't think RISC-V used qspinlock or qrwlock, so I'm not sure there's
> > actually anything to fix, is there?
> > 
> > Will
> 
> I've also lost track of whether the current preference is or is not for
> RCtso, or in which subset of cases RCtso is currently preferred.  For
> whichever cases do in fact need to be RCtso, the RISC-V approach would
> still be the same as what I've written in the past, as far as I can
> tell [1].

The patch which Paul plans to send in for the next merge window makes 
the LKMM require RCtso ordering for spinlocks, and by extension, for 
all locking operations.  As I understand it, the current RISC-V 
implementation of spinlocks does provide this ordering.

We have discussed creating another patch for the LKMM which would
require RMW-acquire/ordinary-release also to have RCtso ordering.  
Nobody has written the patch yet, but it would be straightforward.  The
rationale is that many types of locks are implemented in terms of
RMW-acquire, so if the locks are required to be RCtso then so should
the lower-level operations they are built from.

Will feels strongly (and Linus agrees) that the LKMM should not require
ordinary acquire and release to be any stronger than RCpc.

The issue that Andrea raised has to do with qspinlock, qrwlock, and
mcs_spinlock, which are implemented using smp_cond_load_acquire()  
instead of RMW-acquire.  This provides only the ordering properties of
smp_load_acquire(), namely RCpc, which means that qspinlocks etc. might
not be RCtso.

Since we do want locks to be RCtso, the question is how to resolve this 
discrepancy.
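
For context, the idiom in question looks like this (a representative sketch
of the qspinlock-style wait, where VAL is the macro's name for the freshly
re-read value):

	/* spin until the locked/pending bits clear; the final load is
	 * the lock-acquire */
	val = atomic_cond_read_acquire(&lock->val,
				       !(VAL & _Q_LOCKED_PENDING_MASK));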

> In a nutshell, if a data structure uses only atomics with .aq/.rl,
> RISC-V provides RCtso already anyway.  If a data structure uses fences,
> or mixes fences and atomics, we can replace a "fence r,rw" or a
> "fence rw,w" with a "fence.tso" (== fence r,rw + fence rw,w) as
> necessary, at the cost of some amount of performance.
> 
> I suppose the answer to the question of whether smp_cond_load_acquire()
> needs to change depends on where exactly RCtso is needed, and which
> data structures actually use that vs. some other macro. 
> 
> Does that answer your question Alan?  Does it make sense?

On all other architectures, as far as I know, smp_cond_load_acquire() 
is in fact RCtso.  Any changes would only be needed on RISC-V.

A quick grep of the kernel source (not quite up-to-date, unfortunately)  
turns up only the following additional usages of 
smp_cond_load_acquire():

It is used in kernel/smp.c for csd_lock(); I don't know what 
that is meant for.

It is also used in the scheduler core (kernel/sched/core.c).  I 
don't know what ordering requirements the scheduler has for it,
but Peter does.

There's a usage in drivers/iommu/arm-smmu-v3.c, but no comment 
to explain why it is needed.

To tell the truth, I'm not aware of any code in the kernel that
actually _needs_ RCtso ordering for locks, but Peter and Will are quite
firm that it should be required.  Linus would actually like locks to be

Re: [PATCH V2 6/8] input: stpmic1: add stpmic1 onkey driver

2018-09-07 Thread dmitry.torok...@gmail.com
Hi Pascal,

On Fri, Sep 07, 2018 at 12:59:45PM +, Pascal PAILLET-LME wrote:
> From: pascal paillet 
> 
> The stpmic1 pmic is able to manage an onkey button. This driver exposes
> the stpmic1 onkey as an input device. It can also be configured to
> shut-down the power supplies on a long key-press with an adjustable
> duration.
> 
> Signed-off-by: pascal paillet 
> ---
> changes in v2:
> * the hardware component has been renamed from stpmu1 to stpmic1 !
> * change headers
> * handle remarks from Dmitry
> * the irq is threaded because it is nested in a thread; I have added a
> comment.
> Dmitry, I'm sorry, but I did not catch your comment regarding usage of
> the "generic device property API". Could you tell me more?

You basically do

s/of_property_/device_property_/

and that's it.
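
Concretely, with a made-up property name (the actual bindings are whatever
the driver documents):

	u32 val;

	/* before: OF-only */
	of_property_read_u32(dev->of_node, "st,long-press-time", &val);

	/* after: works for DT and ACPI alike */
	device_property_read_u32(dev, "st,long-press-time", &val);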

> 
>  drivers/input/misc/Kconfig |  11 ++
>  drivers/input/misc/Makefile|   2 +
>  drivers/input/misc/stpmic1_onkey.c | 257 
> +
>  3 files changed, 270 insertions(+)
>  create mode 100644 drivers/input/misc/stpmic1_onkey.c
> 
> diff --git a/drivers/input/misc/Kconfig b/drivers/input/misc/Kconfig
> index c25606e..cc82dad 100644
> --- a/drivers/input/misc/Kconfig
> +++ b/drivers/input/misc/Kconfig
> @@ -841,4 +841,15 @@ config INPUT_RAVE_SP_PWRBUTTON
> To compile this driver as a module, choose M here: the
> module will be called rave-sp-pwrbutton.
>  
> +config INPUT_STPMIC1_ONKEY
> + tristate "STPMIC1 PMIC Onkey support"
> + depends on MFD_STPMIC1
> + help
> +   Say Y to enable support of onkey embedded into STPMIC1 PMIC. onkey
> +   can be used to wakeup from low power modes and force a shut-down on
> +   long press.
> +
> +   To compile this driver as a module, choose M here: the
> +   module will be called stpmic1_onkey.
> +
>  endif
> diff --git a/drivers/input/misc/Makefile b/drivers/input/misc/Makefile
> index 72cde28..f0e11b0 100644
> --- a/drivers/input/misc/Makefile
> +++ b/drivers/input/misc/Makefile
> @@ -70,6 +70,7 @@ obj-$(CONFIG_INPUT_SGI_BTNS)+= sgi_btns.o
>  obj-$(CONFIG_INPUT_SIRFSOC_ONKEY)+= sirfsoc-onkey.o
>  obj-$(CONFIG_INPUT_SOC_BUTTON_ARRAY) += soc_button_array.o
>  obj-$(CONFIG_INPUT_SPARCSPKR)+= sparcspkr.o
> +obj-$(CONFIG_INPUT_STPMIC1_ONKEY)+= stpmic1_onkey.o
>  obj-$(CONFIG_INPUT_TPS65218_PWRBUTTON)   += tps65218-pwrbutton.o
>  obj-$(CONFIG_INPUT_TWL4030_PWRBUTTON)+= twl4030-pwrbutton.o
>  obj-$(CONFIG_INPUT_TWL4030_VIBRA)+= twl4030-vibra.o
> @@ -80,3 +81,4 @@ obj-$(CONFIG_INPUT_WM831X_ON)   += wm831x-on.o
>  obj-$(CONFIG_INPUT_XEN_KBDDEV_FRONTEND)  += xen-kbdfront.o
>  obj-$(CONFIG_INPUT_YEALINK)  += yealink.o
>  obj-$(CONFIG_INPUT_IDEAPAD_SLIDEBAR) += ideapad_slidebar.o
> +
> diff --git a/drivers/input/misc/stpmic1_onkey.c 
> b/drivers/input/misc/stpmic1_onkey.c
> new file mode 100644
> index 000..170d879
> --- /dev/null
> +++ b/drivers/input/misc/stpmic1_onkey.c
> @@ -0,0 +1,257 @@
> +// SPDX-License-Identifier: GPL-2.0
> +// Copyright (C) STMicroelectronics 2018
> +// Author: Pascal Paillet  for STMicroelectronics.
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +/**
> + * struct stpmic1_onkey - OnKey data
> + * @pmic:pointer to STPMIC1 PMIC device
> + * @input_dev:   pointer to input device
> + * @irq_falling: irq that we are hooked on to
> + * @irq_rising:  irq that we are hooked on to
> + */
> +struct stpmic1_onkey {
> + struct stpmic1_dev *pmic;
> + struct input_dev *input_dev;
> + int irq_falling;
> + int irq_rising;
> +};
> +
> +/**
> + * struct pmic_onkey_config - configuration of pmic PONKEYn
> + * @turnoff_enabled: value to enable turnoff condition
> + * @cc_flag_clear:   value to clear CC flag in case of PowerOff
> + * trigger by longkey press
> + * @onkey_pullup_val:value of PONKEY PullUp (active or 
> inactive)
> + * @long_press_time_val: value for long press h/w shutdown event
> + */
> +struct pmic_onkey_config {
> + bool turnoff_enabled;
> + bool cc_flag_clear;
> + u8 onkey_pullup_val;
> + u8 long_press_time_val;
> +};
> +
> +static irqreturn_t onkey_falling_irq(int irq, void *ponkey)
> +{
> + struct stpmic1_onkey *onkey = ponkey;
> + struct input_dev *input_dev = onkey->input_dev;
> +
> + input_report_key(input_dev, KEY_POWER, 1);
> + pm_wakeup_event(input_dev->dev.parent, 0);
> + input_sync(input_dev);
> +
> + dev_dbg(_dev->dev, "Pwr Onkey Falling Interrupt received\n");
> +
> + return IRQ_HANDLED;
> +}
> +
> +static irqreturn_t onkey_rising_irq(int irq, void *ponkey)
> +{
> + struct stpmic1_onkey *onkey = ponkey;
> + struct input_dev *input_dev = onkey->input_dev;
> +
> + input_report_key(input_dev, KEY_POWER, 0);
> + pm_wakeup_event(input_dev->dev.parent, 0);
> + input_sync(input_dev);
> +

Re: [PATCH v2 0/3] mtd concat device driver

2018-09-07 Thread Bernhard Frauendienst

Apologies again, I seem not to be able to handle git-send-email
correctly; the cover letter got lost in operation (using get_maintainers
on a cover letter is not a good idea). Here it is again:


Hi everybody,

when porting my router board from a mach-file based OpenWRT target to a
device-tree based target, I found that there is no generic way to create
a mtd_concat device from within the dts. The following patches attempt
to provide that possibility.

This is a second roll of that patch series, the first one can be seen at
[1]. Apologies for not including the correct recipients in the first
roll.

In this first discussion, concerns were raised that a driver for a
"virtual" device like this might have no place in the device tree
system. However, I would argue that this is very similar to specifying the
partitions of a mtd device, which can also be done in the device tree. In
fact, I believe this is the only way to be able to specify the
partitions of such a concat device in the dts file (but I'm happy to be
corrected if I'm mistaken).
I have made the example in the dt-binding documentation a little bit
more expressive in this detail.
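
For illustration, with the helper from patch 1/3 the concat driver can
resolve each component roughly like this (the signature and error convention
are my assumptions from the patch titles, not verified against the code):

	struct mtd_info *mtd = get_mtd_device_by_node(np);

	if (IS_ERR(mtd))
		return PTR_ERR(mtd); /* e.g. -EPROBE_DEFER if not registered yet */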

In this second roll I have also addressed all issues that reviewers have
brought up so far, hopefully to their satisfaction.

Best Regards
Bernhard

[1] 
http://lists.infradead.org/pipermail/linux-mtd/2018-September/083832.html



Bernhard Frauendienst (3):
  mtd: core: add get_mtd_device_by_node
  dt-bindings: add bindings for mtd-concat devices
  mtd: mtdconcat: add dt driver for concat devices

 .../devicetree/bindings/mtd/mtd-concat.txt    |  36 +
 drivers/mtd/Kconfig   |   2 +
 drivers/mtd/Makefile  |   3 +
 drivers/mtd/composite/Kconfig |  12 ++
 drivers/mtd/composite/Makefile    |   7 +
 drivers/mtd/composite/virt_concat.c   | 128 ++
 drivers/mtd/mtdcore.c |  38 ++
 include/linux/mtd/mtd.h   |   2 +
 8 files changed, 228 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/mtd/mtd-concat.txt
 create mode 100644 drivers/mtd/composite/Kconfig
 create mode 100644 drivers/mtd/composite/Makefile
 create mode 100644 drivers/mtd/composite/virt_concat.c

--
2.17.1



Re: Conflict between sparse and commit cafa0010cd51f ("Raise the minimum required gcc version to 4.6")

2018-09-07 Thread Luc Van Oostenryck
On Fri, Sep 07, 2018 at 10:22:56AM -0700, Nick Desaulniers wrote:
> On Fri, Sep 7, 2018 at 7:34 AM Christophe LEROY  
> wrote:
> >
> > Cc linux-spa...@vger.kernel.org
> >
> > On 07/09/2018 at 14:22, Christophe Leroy wrote:
> > > Since commit cafa0010cd51f ("Raise the minimum required gcc version to
> > > 4.6"), sparse check fails as follows:
> > >
> > > [root@pc16082vm linux-powerpc]# make C=2 arch/powerpc/kernel/process.o
> > >CALLscripts/checksyscalls.sh
> > >CHECK   scripts/mod/empty.c
> > > ./include/linux/compiler-gcc.h:14:3: error: Sorry, your compiler is too
> > > old - please upgrade it.
> > >CHECK   arch/powerpc/kernel/process.c
> > > ./include/linux/compiler-gcc.h:14:3: error: Sorry, your compiler is too
> > > old - please upgrade it.
> > >
> > >
> > > I have sparse version 0.5.2
> > >
> > > What can be done to fix that ?
> > >
> > > Christophe
> 
> Oof, sorry Christophe.  Looks like that's the latest version of sparse:
> https://sparse.wiki.kernel.org/index.php/Main_Page#News
> 
> I'm curious what sparse expands __GNUC__, __GNUC_MINOR__, and
> __GNUC_PATCHLEVEL__ to?  Pre commit cafa0010cd51f, it MUST be
> expanding them to something, otherwise you'd have seen the error then,
> too.  The previous check was GCC < 3.3, now it's GCC < 4.6.

Sparse expands these macros to the same version as the compiler that was
used to build sparse itself. I find it a bit strange, though, to have
sparse v0.5.2 but built with an old compiler.
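
For reference, it is easy to check what a given compiler (or sparse, via
its cgcc wrapper) reports for these macros; a minimal test program,
relying only on the standard predefined macros:

	#include <stdio.h>

	int main(void)
	{
		/* sparse predefines these to match the compiler it was built with */
		printf("%d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
		return 0;
	}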
 
Also, it's worth looking at what is said in this email:
  
https://lore.kernel.org/lkml/ca+55afzyenzr2gzlr-dwponjmnygyody+6awlcvnaywiazu...@mail.gmail.com/


-- Luc


[PATCH] ring-buffer: Allow for rescheduling when removing pages

2018-09-07 Thread Vaibhav Nagarnaik
When reducing ring buffer size, pages are removed by scheduling a work
item on each CPU for the corresponding CPU ring buffer. After the pages
are removed from the ring buffer linked list, the pages are free()d in a
tight loop. The loop does not give up the CPU until all pages are removed.
In the worst case, when a lot of pages are to be freed, this can cause a
system stall.

After the pages are removed from the list, the free() can happen while
the work is rescheduled. Add a check for need_resched() within the loop
to prevent a system hang.

Reported-by: Jason Behmer 
Signed-off-by: Vaibhav Nagarnaik 
---
 kernel/trace/ring_buffer.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1d92d4a982fd..bc1789df7c53 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1546,6 +1546,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, 
unsigned long nr_pages)
tmp_iter_page = first_page;
 
do {
+   if (need_resched())
+   schedule();
+
to_remove_page = tmp_iter_page;
rb_inc_page(cpu_buffer, &tmp_iter_page);
 
-- 
2.19.0.rc2.392.g5ba43deb5a-goog



Re: [PATCH] ring-buffer: Allow for rescheduling when removing pages

2018-09-07 Thread Steven Rostedt
On Fri,  7 Sep 2018 11:21:31 -0700
Vaibhav Nagarnaik  wrote:

> When reducing ring buffer size, pages are removed by scheduling a work
> item on each CPU for the corresponding CPU ring buffer. After the pages
> are removed from the ring buffer linked list, the pages are free()d in a
> tight loop. The loop does not give up the CPU until all pages are removed.
> In the worst case, when a lot of pages are to be freed, this can cause a
> system stall.
> 
> After the pages are removed from the list, the free() can happen while
> the work is rescheduled. Add a check for need_resched() within the loop
> to prevent a system hang.
> 
> Reported-by: Jason Behmer 
> Signed-off-by: Vaibhav Nagarnaik 
> ---
>  kernel/trace/ring_buffer.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
> index 1d92d4a982fd..bc1789df7c53 100644
> --- a/kernel/trace/ring_buffer.c
> +++ b/kernel/trace/ring_buffer.c
> @@ -1546,6 +1546,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, 
> unsigned long nr_pages)
>   tmp_iter_page = first_page;
>  
>   do {
> + if (need_resched())
> + schedule();
> +

Hi, thanks for the patch, but the proper way to do this is to stick in:

cond_resched();

And that should solve it for you. Want to send in another patch?

-- Steve

>   to_remove_page = tmp_iter_page;
>   rb_inc_page(cpu_buffer, &tmp_iter_page);
>  
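
For reference, with cond_resched() the removal loop would look roughly
like this (a sketch only; the loop's termination condition here is
illustrative):

	do {
		cond_resched();	/* yields the CPU only when a reschedule is due */

		to_remove_page = tmp_iter_page;
		rb_inc_page(cpu_buffer, &tmp_iter_page);

		/* ... unlink and free to_remove_page ... */
	} while (to_remove_page != last_page);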



[PATCH] sched/fair: fix 1 task per CPU

2018-09-07 Thread Vincent Guittot
When CPUs have different capacity because of RT/DL tasks or
micro-architecture or max frequency differences, there are situations where
the imbalance is not correctly set to migrate a waiting task to the idle CPU.

The use case hits the force_balance case:
if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
busiest->group_no_capacity)
goto force_balance;

But calculate_imbalance fails to set the right amount of load to migrate
a task because of the special condition:
  busiest->avg_load <= sds->avg_load || local->avg_load >= sds->avg_load)

Add to fix_small_imbalance() the special case that triggered the force
balance, in order to make sure that the amount of load to migrate will be
enough.

Signed-off-by: Vincent Guittot 
---
 kernel/sched/fair.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 309c93f..57b4d83 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8048,6 +8048,20 @@ void fix_small_imbalance(struct lb_env *env, struct 
sd_lb_stats *sds)
local = &sds->local_stat;
busiest = &sds->busiest_stat;
 
+   /*
+* There is available capacity in the local group and the busiest group
+* is overloaded, but calculate_imbalance can't compute the amount of
+* load to migrate because the load figures become meaningless with
+* asymmetric capacity between the groups. In such a case, we only want
+* to migrate at least one task of the busiest group and rely on the
+* average load per task to ensure the migration.
+*/
+   if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
+   busiest->group_no_capacity) {
+   env->imbalance = busiest->load_per_task;
+   return;
+   }
+
if (!local->sum_nr_running)
local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
else if (busiest->load_per_task > local->load_per_task)
-- 
2.7.4



Applied "spi: pic32: remove unnecessary of_node_get()" to the spi tree

2018-09-07 Thread Mark Brown
The patch

   spi: pic32: remove unnecessary of_node_get()

has been applied to the spi tree at

   https://git.kernel.org/pub/scm/linux/kernel/git/broonie/spi.git 

All being well this means that it will be integrated into the linux-next
tree (usually sometime in the next 24 hours) and sent to Linus during
the next merge window (or sooner if it is a bug fix), however if
problems are discovered then the patch may be dropped or reverted.  

You may get further e-mails resulting from automated or manual testing
and review of the tree, please engage with people reporting problems and
send followup patches addressing any issues that are reported if needed.

If any updates are required or you are submitting further changes they
should be sent as incremental updates against current git, existing
patches will not be replaced.

Please add any relevant lists and maintainers to the CCs when replying
to this mail.

Thanks,
Mark

>From b9a947dd756b7af84ababa57e0524788f91a5382 Mon Sep 17 00:00:00 2001
From: Alexey Khoroshilov 
Date: Fri, 7 Sep 2018 01:16:54 +0300
Subject: [PATCH] spi: pic32: remove unnecessary of_node_get()

Almost all spi drivers assign spi master->dev.of_node from
its parent platform device without additional refcounting.
It seems of_node_get() in pic32_spi_probe() is unnecessary
and there is no corresponding of_node_put().

Found by Linux Driver Verification project (linuxtesting.org).

Signed-off-by: Alexey Khoroshilov 
Signed-off-by: Mark Brown 
---
 drivers/spi/spi-pic32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/spi/spi-pic32.c b/drivers/spi/spi-pic32.c
index f8a45af1fa9f..46ff76193ee1 100644
--- a/drivers/spi/spi-pic32.c
+++ b/drivers/spi/spi-pic32.c
@@ -774,7 +774,7 @@ static int pic32_spi_probe(struct platform_device *pdev)
if (ret)
goto err_master;
 
-   master->dev.of_node = of_node_get(pdev->dev.of_node);
+   master->dev.of_node = pdev->dev.of_node;
master->mode_bits   = SPI_MODE_3 | SPI_MODE_0 | SPI_CS_HIGH;
master->num_chipselect  = 1; /* single chip-select */
master->max_speed_hz= clk_get_rate(pic32s->clk);
-- 
2.19.0.rc1
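
For context, the refcounting rule the patch relies on: of_node_get() and
of_node_put() must balance, as in this sketch:

	struct device_node *np;

	np = of_node_get(pdev->dev.of_node);	/* takes a reference */
	/* ... use np ... */
	of_node_put(np);			/* must balance the get */

Assigning pdev->dev.of_node directly, as the patch now does, takes no
extra reference, so no of_node_put() is needed.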



Applied "regmap: fix comment for regmap.use_single_write" to the regmap tree

2018-09-07 Thread Mark Brown
The patch

   regmap: fix comment for regmap.use_single_write

has been applied to the regmap tree at

   https://git.kernel.org/pub/scm/linux/kernel/git/broonie/regmap.git 

All being well this means that it will be integrated into the linux-next
tree (usually sometime in the next 24 hours) and sent to Linus during
the next merge window (or sooner if it is a bug fix), however if
problems are discovered then the patch may be dropped or reverted.  

You may get further e-mails resulting from automated or manual testing
and review of the tree, please engage with people reporting problems and
send followup patches addressing any issues that are reported if needed.

If any updates are required or you are submitting further changes they
should be sent as incremental updates against current git, existing
patches will not be replaced.

Please add any relevant lists and maintainers to the CCs when replying
to this mail.

Thanks,
Mark

>From 9ad8eb0168ab76786f65d4b80ce082980f79a1d9 Mon Sep 17 00:00:00 2001
From: David Frey 
Date: Sat, 1 Sep 2018 09:50:40 -0700
Subject: [PATCH] regmap: fix comment for regmap.use_single_write

Signed-off-by: David Frey 
Signed-off-by: Mark Brown 
---
 drivers/base/regmap/internal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h
index a6bf34d6394e..16414ccace96 100644
--- a/drivers/base/regmap/internal.h
+++ b/drivers/base/regmap/internal.h
@@ -149,7 +149,7 @@ struct regmap {
 
/* if set, converts bulk read to single read */
bool use_single_read;
-   /* if set, converts bulk read to single read */
+   /* if set, converts bulk write to single write */
bool use_single_write;
/* if set, the device supports multi write mode */
bool can_multi_write;
-- 
2.19.0.rc1



Re: [PATCH RFC LKMM 1/7] tools/memory-model: Add extra ordering for locks and remove it for ordinary release/acquire

2018-09-07 Thread Will Deacon
On Fri, Sep 07, 2018 at 12:00:19PM -0400, Alan Stern wrote:
> On Thu, 6 Sep 2018, Andrea Parri wrote:
> 
> > > Have you noticed any part of the generic code that relies on ordinary 
> > > acquire-release (rather than atomic RMW acquire-release) in order to 
> > > implement locking constructs?
> > 
> > There are several places in code where the "lock-acquire" seems to be
> > provided by an atomic_cond_read_acquire/smp_cond_load_acquire: I have
> > mentioned one in qspinlock in this thread; qrwlock and mcs_spinlock
> > provide other examples (grep for the primitives...).
> > 
> > As long as we don't consider these primitives as RMW (which would seem
> > odd...) or as acquire for which "most people expect strong ordering"
> > (see above), these provide other examples of the _gap_ I mentioned.
> 
> Okay, now I understand your objection.  It does appear that on RISC-V,
> if nowhere else, the current implementations of qspinlock, qrwlock,
> etc. will not provide "RCtso" ordering.
> 
> The discussions surrounding this topic have been so lengthy and 
> confusing that I have lost track of any comments Palmer or Daniel may 
> have made concerning this potential problem.
> 
> One possible resolution would be to define smp_cond_load_acquire() 
> specially on RISC-V so that it provided the same ordering guarantees as 
> RMW-acquire.  (Plus adding a comment in the asm-generic/barrier.h 
> pointing out the necessity for the stronger guarantee on all 
> architectures.)
> 
> Another would be to replace the usages of atomic/smp_cond_load_acquire 
> in the locking constructs with a new function that would otherwise be 
> the same but would provide the ordering guarantee we want.
> 
> Do you think either of these would be an adequate fix?

I didn't think RISC-V used qspinlock or qrwlock, so I'm not sure there's
actually anything to fix, is there?

Will
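
For reference, the primitive under discussion has this generic shape (a
sketch; VAL is the macro naming the value just loaded inside the
condition expression):

	u32 *ptr, val;	/* ptr points at the lock word */

	/* spin until the condition holds; the final load has acquire semantics */
	val = smp_cond_load_acquire(ptr, VAL == 0);

Whether that acquire is strong enough to give the locks built on top of
it RCtso ordering on RISC-V is exactly the question raised above.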


[PATCH 2/6] md: convert to kvmalloc

2018-09-07 Thread Kent Overstreet
The code really just wants a big flat buffer, so just do that.

Signed-off-by: Kent Overstreet 
Cc: Shaohua Li 
Cc: linux-r...@vger.kernel.org
---
 drivers/md/raid5-ppl.c |  7 ++--
 drivers/md/raid5.c | 82 +++---
 drivers/md/raid5.h |  9 ++---
 3 files changed, 45 insertions(+), 53 deletions(-)

diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 3a7c363265..5911810101 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -16,7 +16,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include "md.h"
@@ -165,7 +164,7 @@ ops_run_partial_parity(struct stripe_head *sh, struct 
raid5_percpu *percpu,
   struct dma_async_tx_descriptor *tx)
 {
int disks = sh->disks;
-   struct page **srcs = flex_array_get(percpu->scribble, 0);
+   struct page **srcs = percpu->scribble;
int count = 0, pd_idx = sh->pd_idx, i;
struct async_submit_ctl submit;
 
@@ -196,8 +195,8 @@ ops_run_partial_parity(struct stripe_head *sh, struct 
raid5_percpu *percpu,
}
 
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx,
- NULL, sh, flex_array_get(percpu->scribble, 0)
- + sizeof(struct page *) * (sh->disks + 2));
+ NULL, sh, percpu->scribble +
+ sizeof(struct page *) * (sh->disks + 2));
 
if (count == 1)
tx = async_memcpy(sh->ppl_page, srcs[0], 0, 0, PAGE_SIZE,
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2031506a0e..d5603946dc 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -54,7 +54,6 @@
 #include 
 #include 
 #include 
-#include 
 
 #include 
 #include 
@@ -1399,19 +1398,14 @@ static void ops_complete_compute(void *stripe_head_ref)
 static addr_conv_t *to_addr_conv(struct stripe_head *sh,
 struct raid5_percpu *percpu, int i)
 {
-   void *addr;
-
-   addr = flex_array_get(percpu->scribble, i);
-   return addr + sizeof(struct page *) * (sh->disks + 2);
+   return percpu->scribble + i * percpu->scribble_obj_size +
+   sizeof(struct page *) * (sh->disks + 2);
 }
 
 /* return a pointer to the address conversion region of the scribble buffer */
 static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
 {
-   void *addr;
-
-   addr = flex_array_get(percpu->scribble, i);
-   return addr;
+   return percpu->scribble + i * percpu->scribble_obj_size;
 }
 
 static struct dma_async_tx_descriptor *
@@ -2240,21 +2234,23 @@ static int grow_stripes(struct r5conf *conf, int num)
  * calculate over all devices (not just the data blocks), using zeros in place
  * of the P and Q blocks.
  */
-static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
+static int scribble_alloc(struct raid5_percpu *percpu,
+ int num, int cnt, gfp_t flags)
 {
-   struct flex_array *ret;
-   size_t len;
+   size_t obj_size =
+   sizeof(struct page *) * (num+2) +
+   sizeof(addr_conv_t) * (num+2);
+   void *scribble;
 
-   len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
-   ret = flex_array_alloc(len, cnt, flags);
-   if (!ret)
-   return NULL;
-   /* always prealloc all elements, so no locking is required */
-   if (flex_array_prealloc(ret, 0, cnt, flags)) {
-   flex_array_free(ret);
-   return NULL;
-   }
-   return ret;
+   scribble = kvmalloc_array(cnt, obj_size, flags);
+   if (!scribble)
+   return -ENOMEM;
+
+   kvfree(percpu->scribble);
+
+   percpu->scribble = scribble;
+   percpu->scribble_obj_size = obj_size;
+   return 0;
 }
 
 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
@@ -2272,23 +2268,18 @@ static int resize_chunks(struct r5conf *conf, int 
new_disks, int new_sectors)
return 0;
mddev_suspend(conf->mddev);
get_online_cpus();
+
for_each_present_cpu(cpu) {
struct raid5_percpu *percpu;
-   struct flex_array *scribble;
 
percpu = per_cpu_ptr(conf->percpu, cpu);
-   scribble = scribble_alloc(new_disks,
- new_sectors / STRIPE_SECTORS,
- GFP_NOIO);
-
-   if (scribble) {
-   flex_array_free(percpu->scribble);
-   percpu->scribble = scribble;
-   } else {
-   err = -ENOMEM;
+   err = scribble_alloc(percpu, new_disks,
+new_sectors / STRIPE_SECTORS,
+GFP_NOIO);
+   if (err)
break;
-   }
}
+
put_online_cpus();
mddev_resume(conf->mddev);

[PATCH 3/6] selinux: convert to kvmalloc

2018-09-07 Thread Kent Overstreet
The flex arrays were being used for constant sized arrays, so there's no
benefit to using flex_arrays over something simpler.

Signed-off-by: Kent Overstreet 
Cc: linux-security-mod...@vger.kernel.org
---
 security/selinux/ss/avtab.c   |  40 +-
 security/selinux/ss/avtab.h   |   4 +-
 security/selinux/ss/conditional.c |   6 +-
 security/selinux/ss/policydb.c| 122 --
 security/selinux/ss/policydb.h|  12 +--
 security/selinux/ss/services.c|  22 ++
 6 files changed, 62 insertions(+), 144 deletions(-)

diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c
index a2c9148b06..5a7fd5f0b7 100644
--- a/security/selinux/ss/avtab.c
+++ b/security/selinux/ss/avtab.c
@@ -93,12 +93,10 @@ avtab_insert_node(struct avtab *h, int hvalue,
newnode->next = prev->next;
prev->next = newnode;
} else {
-   newnode->next = flex_array_get_ptr(h->htable, hvalue);
-   if (flex_array_put_ptr(h->htable, hvalue, newnode,
-  GFP_KERNEL|__GFP_ZERO)) {
-   kmem_cache_free(avtab_node_cachep, newnode);
-   return NULL;
-   }
+   struct avtab_node **n = &h->htable[hvalue];
+
+   newnode->next = *n;
+   *n = newnode;
}
 
h->nel++;
@@ -111,11 +109,11 @@ static int avtab_insert(struct avtab *h, struct avtab_key 
*key, struct avtab_dat
struct avtab_node *prev, *cur, *newnode;
u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD);
 
-   if (!h || !h->htable)
+   if (!h)
return -EINVAL;
 
hvalue = avtab_hash(key, h->mask);
-   for (prev = NULL, cur = flex_array_get_ptr(h->htable, hvalue);
+   for (prev = NULL, cur = h->htable[hvalue];
 cur;
 prev = cur, cur = cur->next) {
if (key->source_type == cur->key.source_type &&
@@ -156,10 +154,10 @@ avtab_insert_nonunique(struct avtab *h, struct avtab_key 
*key, struct avtab_datu
struct avtab_node *prev, *cur;
u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD);
 
-   if (!h || !h->htable)
+   if (!h)
return NULL;
hvalue = avtab_hash(key, h->mask);
-   for (prev = NULL, cur = flex_array_get_ptr(h->htable, hvalue);
+   for (prev = NULL, cur = h->htable[hvalue];
 cur;
 prev = cur, cur = cur->next) {
if (key->source_type == cur->key.source_type &&
@@ -186,11 +184,11 @@ struct avtab_datum *avtab_search(struct avtab *h, struct 
avtab_key *key)
struct avtab_node *cur;
u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD);
 
-   if (!h || !h->htable)
+   if (!h)
return NULL;
 
hvalue = avtab_hash(key, h->mask);
-   for (cur = flex_array_get_ptr(h->htable, hvalue); cur;
+   for (cur = h->htable[hvalue]; cur;
 cur = cur->next) {
if (key->source_type == cur->key.source_type &&
key->target_type == cur->key.target_type &&
@@ -222,11 +220,11 @@ avtab_search_node(struct avtab *h, struct avtab_key *key)
struct avtab_node *cur;
u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD);
 
-   if (!h || !h->htable)
+   if (!h)
return NULL;
 
hvalue = avtab_hash(key, h->mask);
-   for (cur = flex_array_get_ptr(h->htable, hvalue); cur;
+   for (cur = h->htable[hvalue]; cur;
 cur = cur->next) {
if (key->source_type == cur->key.source_type &&
key->target_type == cur->key.target_type &&
@@ -281,11 +279,11 @@ void avtab_destroy(struct avtab *h)
int i;
struct avtab_node *cur, *temp;
 
-   if (!h || !h->htable)
+   if (!h)
return;
 
for (i = 0; i < h->nslot; i++) {
-   cur = flex_array_get_ptr(h->htable, i);
+   cur = h->htable[i];
while (cur) {
temp = cur;
cur = cur->next;
@@ -295,7 +293,7 @@ void avtab_destroy(struct avtab *h)
kmem_cache_free(avtab_node_cachep, temp);
}
}
-   flex_array_free(h->htable);
+   kvfree(h->htable);
h->htable = NULL;
h->nslot = 0;
h->mask = 0;
@@ -303,6 +301,7 @@ void avtab_destroy(struct avtab *h)
 
 int avtab_init(struct avtab *h)
 {
+   kvfree(h->htable);
h->htable = NULL;
h->nel = 0;
return 0;
@@ -329,8 +328,7 @@ int avtab_alloc(struct avtab *h, u32 nrules)
nslot = MAX_AVTAB_HASH_BUCKETS;
mask = nslot - 1;
 
-   h->htable = flex_array_alloc(sizeof(struct avtab_node *), nslot,
-GFP_KERNEL | __GFP_ZERO);
+   h->htable = kvmalloc_array(nslot, sizeof(void *), GFP_KERNEL);

[PATCH 4/6] Generic radix trees

2018-09-07 Thread Kent Overstreet
Very simple radix tree implementation that supports storing arbitrary
size entries, up to PAGE_SIZE - upcoming patches will convert existing
flex_array users to genradixes. The new genradix code has a much simpler
API and implementation, and doesn't have a hard limit on the number of
elements like flex_array does.

Signed-off-by: Kent Overstreet 
---
 include/linux/generic-radix-tree.h | 222 +
 lib/Makefile   |   3 +-
 lib/generic-radix-tree.c   | 180 +++
 3 files changed, 404 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/generic-radix-tree.h
 create mode 100644 lib/generic-radix-tree.c

diff --git a/include/linux/generic-radix-tree.h 
b/include/linux/generic-radix-tree.h
new file mode 100644
index 00..3328813322
--- /dev/null
+++ b/include/linux/generic-radix-tree.h
@@ -0,0 +1,222 @@
+#ifndef _LINUX_GENERIC_RADIX_TREE_H
+#define _LINUX_GENERIC_RADIX_TREE_H
+
+/*
+ * Generic radix trees/sparse arrays:
+ *
+ * Very simple and minimalistic, supporting arbitrary size entries up to
+ * PAGE_SIZE.
+ *
+ * A genradix is defined with the type it will store, like so:
+ * static GENRADIX(struct foo) foo_genradix;
+ *
+ * The main operations are:
+ * - genradix_init(radix) - initialize an empty genradix
+ *
+ * - genradix_free(radix) - free all memory owned by the genradix and
+ *   reinitialize it
+ *
+ * - genradix_ptr(radix, idx) - gets a pointer to the entry at idx, returning
+ *   NULL if that entry does not exist
+ *
+ * - genradix_ptr_alloc(radix, idx, gfp) - gets a pointer to an entry,
+ *   allocating it if necessary
+ *
+ * - genradix_for_each(radix, iter, p) - iterate over each entry in a genradix
+ *
+ * The radix tree allocates one page of entries at a time, so entries may exist
+ * that were never explicitly allocated - they will be initialized to all
+ * zeroes.
+ *
+ * Internally, a genradix is just a radix tree of pages, and indexing works in
+ * terms of byte offsets. The wrappers in this header file use sizeof on the
+ * type the radix contains to calculate a byte offset from the index - see
+ * __idx_to_offset.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+struct genradix_node;
+
+struct __genradix {
+   struct genradix_node*root;
+   size_t  depth;
+};
+
+#define __GENRADIX_INITIALIZER \
+   {   \
+   .tree = {   \
+   .root = NULL,   \
+   .depth = 0, \
+   }   \
+   }
+
+/*
+ * We use a 0 size array to stash the type we're storing without taking any
+ * space at runtime - then the various accessor macros can use typeof() to get
+ * to it for casts/sizeof - we also force the alignment so that storing a type
+ * with a ridiculous alignment doesn't blow up the alignment or size of the
+ * genradix.
+ */
+
+#define GENRADIX(_type)\
+struct {   \
+   struct __genradix   tree;   \
+   _type   type[0] __aligned(1);   \
+}
+
+#define DEFINE_GENRADIX(_name, _type)  \
+   GENRADIX(_type) _name = __GENRADIX_INITIALIZER
+
+/**
+ * genradix_init - initialize a genradix
+ * @_radix:genradix to initialize
+ *
+ * Does not fail
+ */
+#define genradix_init(_radix)  \
+do {   \
+   *(_radix) = (typeof(*_radix)) __GENRADIX_INITIALIZER;   \
+} while (0)
+
+void __genradix_free(struct __genradix *);
+
+/**
+ * genradix_free: free all memory owned by a genradix
+ *
+ * After freeing, @_radix will be reinitialized and empty
+ */
+#define genradix_free(_radix)  __genradix_free(&(_radix)->tree)
+
+static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
+{
+   if (__builtin_constant_p(obj_size))
+   BUILD_BUG_ON(obj_size > PAGE_SIZE);
+   else
+   BUG_ON(obj_size > PAGE_SIZE);
+
+   if (!is_power_of_2(obj_size)) {
+   size_t objs_per_page = PAGE_SIZE / obj_size;
+
+   return (idx / objs_per_page) * PAGE_SIZE +
+   (idx % objs_per_page) * obj_size;
+   } else {
+   return idx * obj_size;
+   }
+}
+
+#define __genradix_cast(_radix)(typeof((_radix)->type[0]) *)
+#define __genradix_obj_size(_radix)sizeof((_radix)->type[0])
+#define __genradix_idx_to_offset(_radix, _idx) \
+   __idx_to_offset(_idx, __genradix_obj_size(_radix))
+
+void *__genradix_ptr(struct __genradix *, size_t);
+
+/**
+ * genradix_ptr - get a pointer to a genradix entry
+ * @_radix:genradix 
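
Based on the API described in the header comments above, usage would look
roughly like this (a sketch; struct foo and the index are illustrative):

	static GENRADIX(struct foo) foo_genradix;
	struct foo *p;

	genradix_init(&foo_genradix);

	/* allocates intermediate nodes and the backing page as needed */
	p = genradix_ptr_alloc(&foo_genradix, idx, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	genradix_free(&foo_genradix);	/* frees everything and reinitializes */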

[PATCH 6/6] Drop flex_arrays

2018-09-07 Thread Kent Overstreet
All existing users have been converted to generic radix trees

Signed-off-by: Kent Overstreet 
Acked-by: Dave Hansen 
---
 Documentation/core-api/flexible-arrays.rst | 130 ---
 Documentation/flexible-arrays.txt  | 123 ---
 include/linux/flex_array.h | 149 
 include/linux/poison.h |   3 -
 lib/Makefile   |   2 +-
 lib/flex_array.c   | 398 -
 tools/include/linux/poison.h   |   3 -
 7 files changed, 1 insertion(+), 807 deletions(-)
 delete mode 100644 Documentation/core-api/flexible-arrays.rst
 delete mode 100644 Documentation/flexible-arrays.txt
 delete mode 100644 include/linux/flex_array.h
 delete mode 100644 lib/flex_array.c

diff --git a/Documentation/core-api/flexible-arrays.rst 
b/Documentation/core-api/flexible-arrays.rst
deleted file mode 100644
index b6b85a1b51..00
--- a/Documentation/core-api/flexible-arrays.rst
+++ /dev/null
@@ -1,130 +0,0 @@
-
-===
-Using flexible arrays in the kernel
-===
-
-Large contiguous memory allocations can be unreliable in the Linux kernel.
-Kernel programmers will sometimes respond to this problem by allocating
-pages with :c:func:`vmalloc()`.  This solution not ideal, though.  On 32-bit
-systems, memory from vmalloc() must be mapped into a relatively small address
-space; it's easy to run out.  On SMP systems, the page table changes required
-by vmalloc() allocations can require expensive cross-processor interrupts on
-all CPUs.  And, on all systems, use of space in the vmalloc() range increases
-pressure on the translation lookaside buffer (TLB), reducing the performance
-of the system.
-
-In many cases, the need for memory from vmalloc() can be eliminated by piecing
-together an array from smaller parts; the flexible array library exists to make
-this task easier.
-
-A flexible array holds an arbitrary (within limits) number of fixed-sized
-objects, accessed via an integer index.  Sparse arrays are handled
-reasonably well.  Only single-page allocations are made, so memory
-allocation failures should be relatively rare.  The down sides are that the
-arrays cannot be indexed directly, individual object size cannot exceed the
-system page size, and putting data into a flexible array requires a copy
-operation.  It's also worth noting that flexible arrays do no internal
-locking at all; if concurrent access to an array is possible, then the
-caller must arrange for appropriate mutual exclusion.
-
-The creation of a flexible array is done with :c:func:`flex_array_alloc()`::
-
-#include 
-
-struct flex_array *flex_array_alloc(int element_size,
-   unsigned int total,
-   gfp_t flags);
-
-The individual object size is provided by ``element_size``, while total is the
-maximum number of objects which can be stored in the array.  The flags
-argument is passed directly to the internal memory allocation calls.  With
-the current code, using flags to ask for high memory is likely to lead to
-notably unpleasant side effects.
-
-It is also possible to define flexible arrays at compile time with::
-
-DEFINE_FLEX_ARRAY(name, element_size, total);
-
-This macro will result in a definition of an array with the given name; the
-element size and total will be checked for validity at compile time.
-
-Storing data into a flexible array is accomplished with a call to
-:c:func:`flex_array_put()`::
-
-int flex_array_put(struct flex_array *array, unsigned int element_nr,
-  void *src, gfp_t flags);
-
-This call will copy the data from src into the array, in the position
-indicated by ``element_nr`` (which must be less than the maximum specified when
-the array was created).  If any memory allocations must be performed, flags
-will be used.  The return value is zero on success, a negative error code
-otherwise.
-
-There might possibly be a need to store data into a flexible array while
-running in some sort of atomic context; in this situation, sleeping in the
-memory allocator would be a bad thing.  That can be avoided by using
-``GFP_ATOMIC`` for the flags value, but, often, there is a better way.  The
-trick is to ensure that any needed memory allocations are done before
-entering atomic context, using :c:func:`flex_array_prealloc()`::
-
-int flex_array_prealloc(struct flex_array *array, unsigned int start,
-   unsigned int nr_elements, gfp_t flags);
-
-This function will ensure that memory for the elements indexed in the range
-defined by ``start`` and ``nr_elements`` has been allocated.  Thereafter, a
-``flex_array_put()`` call on an element in that range is guaranteed not to
-block.
-
-Getting data back out of the array is done with :c:func:`flex_array_get()`::
-
-void *flex_array_get(struct flex_array *fa, unsigned int element_nr);
-

Re: [PATCH 0/3] ARM: OMAP1: ams-delta: Clean up GPIO setup for MODEM

2018-09-07 Thread Tony Lindgren
* Janusz Krzysztofik  [180820 11:16]:
> 
> Convert modem related GPIO setup from integer space to GPIO descriptors.
> Also, restore original initialization order of the MODEM device and its
> related GPIO pins.
> 
> Cleanup of MODEM related regulator setup is postponed while waiting for
> upcoming conversion of fixed regulator API to GPIO descriptors.
> 
> 
> Janusz Krzysztofik (3):
>   ARM: OMAP1: ams-delta: assign MODEM IRQ from GPIO descriptor
>   ARM: OMAP1: ams-delta: initialize latch2 pins to safe values
>   ARM: OMAP1: ams-delta: register MODEM device earlier

Janusz, can you please repost this series based on v4.19-rc1
with Linus' acks?

At least the header file has moved around now.

And as this also conflicts with your earlier patch
"ARM: OMAP1: ams-delta: assign MODEM IRQ from GPIO descriptor"
please repost that too in the same series.

If you have other arch/arm/*omap*/* related patches then
please repost those too, these are the only ones I still had
tagged :)

Regards,

Tony



[PATCH 1/6] openvswitch: convert to kvmalloc

2018-09-07 Thread Kent Overstreet
There was no real need for this code to be using flex_arrays; it's just
implementing a hash table. Ideally it would be using rhashtables, but
that conversion would be significantly more complicated.

Signed-off-by: Kent Overstreet 
Cc: Pravin B Shelar 
Cc: d...@openvswitch.org
---
 net/openvswitch/flow.h |  1 -
 net/openvswitch/flow_netlink.h |  1 -
 net/openvswitch/flow_table.c   | 51 --
 net/openvswitch/flow_table.h   |  3 +-
 4 files changed, 13 insertions(+), 43 deletions(-)

diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index c670dd24b8..4f06278166 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -30,7 +30,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 6657606b2b..66f9553758 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -30,7 +30,6 @@
 #include 
 #include 
 #include 
-#include 
 
 #include 
 #include 
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index 80ea2a7185..cfb0098c9a 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -111,29 +111,6 @@ int ovs_flow_tbl_count(const struct flow_table *table)
return table->count;
 }
 
-static struct flex_array *alloc_buckets(unsigned int n_buckets)
-{
-   struct flex_array *buckets;
-   int i, err;
-
-   buckets = flex_array_alloc(sizeof(struct hlist_head),
-  n_buckets, GFP_KERNEL);
-   if (!buckets)
-   return NULL;
-
-   err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL);
-   if (err) {
-   flex_array_free(buckets);
-   return NULL;
-   }
-
-   for (i = 0; i < n_buckets; i++)
-   INIT_HLIST_HEAD((struct hlist_head *)
-   flex_array_get(buckets, i));
-
-   return buckets;
-}
-
 static void flow_free(struct sw_flow *flow)
 {
int cpu;
@@ -168,31 +145,30 @@ void ovs_flow_free(struct sw_flow *flow, bool deferred)
flow_free(flow);
 }
 
-static void free_buckets(struct flex_array *buckets)
-{
-   flex_array_free(buckets);
-}
-
-
 static void __table_instance_destroy(struct table_instance *ti)
 {
-   free_buckets(ti->buckets);
+   kvfree(ti->buckets);
kfree(ti);
 }
 
 static struct table_instance *table_instance_alloc(int new_size)
 {
struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
+   int i;
 
if (!ti)
return NULL;
 
-   ti->buckets = alloc_buckets(new_size);
-
+   ti->buckets = kvmalloc_array(new_size, sizeof(struct hlist_head),
+GFP_KERNEL);
if (!ti->buckets) {
kfree(ti);
return NULL;
}
+
+   for (i = 0; i < new_size; i++)
+   INIT_HLIST_HEAD(&ti->buckets[i]);
+
ti->n_buckets = new_size;
ti->node_ver = 0;
ti->keep_flows = false;
@@ -249,7 +225,7 @@ static void table_instance_destroy(struct table_instance 
*ti,
 
for (i = 0; i < ti->n_buckets; i++) {
struct sw_flow *flow;
-   struct hlist_head *head = flex_array_get(ti->buckets, i);
+   struct hlist_head *head = &ti->buckets[i];
struct hlist_node *n;
int ver = ti->node_ver;
int ufid_ver = ufid_ti->node_ver;
@@ -294,7 +270,7 @@ struct sw_flow *ovs_flow_tbl_dump_next(struct 
table_instance *ti,
ver = ti->node_ver;
while (*bucket < ti->n_buckets) {
i = 0;
-   head = flex_array_get(ti->buckets, *bucket);
+   head = &ti->buckets[*bucket];
hlist_for_each_entry_rcu(flow, head, flow_table.node[ver]) {
if (i < *last) {
i++;
@@ -313,8 +289,7 @@ struct sw_flow *ovs_flow_tbl_dump_next(struct 
table_instance *ti,
 static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash)
 {
hash = jhash_1word(hash, ti->hash_seed);
-   return flex_array_get(ti->buckets,
-   (hash & (ti->n_buckets - 1)));
+   return &ti->buckets[hash & (ti->n_buckets - 1)];
 }
 
 static void table_instance_insert(struct table_instance *ti,
@@ -347,9 +322,7 @@ static void flow_table_copy_flows(struct table_instance 
*old,
/* Insert in new table. */
for (i = 0; i < old->n_buckets; i++) {
struct sw_flow *flow;
-   struct hlist_head *head;
-
-   head = flex_array_get(old->buckets, i);
+   struct hlist_head *head = &old->buckets[i];
 
if (ufid)
hlist_for_each_entry(flow, head,
diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
index 2dd9900f53..de5ec6cf51 100644
--- a/net/openvswitch/flow_table.h
+++ 

[PATCH 0/6] flex_arrays -> genradix; prep work for bcachefs

2018-09-07 Thread Kent Overstreet
Generic radix trees are a dead simple radix tree implementation that can store
types of different sizes, needed for bcachefs.

The patch series was sent out previously and was pretty uncontroversial - this
is a respin that converts most users to just use kvmalloc.
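
The conversions share one basic idiom; here is a sketch (illustrative,
not lifted from any single patch):

	struct hlist_head *table;
	int i;

	/* kvmalloc_array falls back to vmalloc if the kmalloc would fail */
	table = kvmalloc_array(n_buckets, sizeof(*table), GFP_KERNEL);
	if (!table)
		return -ENOMEM;

	for (i = 0; i < n_buckets; i++)
		INIT_HLIST_HEAD(&table[i]);

	/* ... later: kvfree() handles both allocation paths ... */
	kvfree(table);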

Kent Overstreet (6):
  openvswitch: convert to kvmalloc
  md: convert to kvmalloc
  selinux: convert to kvmalloc
  Generic radix trees
  proc: commit to genradix
  Drop flex_arrays

 Documentation/core-api/flexible-arrays.rst | 130 ---
 Documentation/flexible-arrays.txt  | 123 ---
 drivers/md/raid5-ppl.c |   7 +-
 drivers/md/raid5.c |  82 ++---
 drivers/md/raid5.h |   9 +-
 fs/proc/base.c |  43 +--
 include/linux/flex_array.h | 149 
 include/linux/generic-radix-tree.h | 222 
 include/linux/poison.h |   3 -
 lib/Makefile   |   5 +-
 lib/flex_array.c   | 398 -
 lib/generic-radix-tree.c   | 180 ++
 net/openvswitch/flow.h |   1 -
 net/openvswitch/flow_netlink.h |   1 -
 net/openvswitch/flow_table.c   |  51 +--
 net/openvswitch/flow_table.h   |   3 +-
 security/selinux/ss/avtab.c|  40 +--
 security/selinux/ss/avtab.h|   4 +-
 security/selinux/ss/conditional.c  |   6 +-
 security/selinux/ss/policydb.c | 122 ++-
 security/selinux/ss/policydb.h |  12 +-
 security/selinux/ss/services.c |  22 +-
 tools/include/linux/poison.h   |   3 -
 23 files changed, 540 insertions(+), 1076 deletions(-)
 delete mode 100644 Documentation/core-api/flexible-arrays.rst
 delete mode 100644 Documentation/flexible-arrays.txt
 delete mode 100644 include/linux/flex_array.h
 create mode 100644 include/linux/generic-radix-tree.h
 delete mode 100644 lib/flex_array.c
 create mode 100644 lib/generic-radix-tree.c

-- 
2.19.0.rc2



Re: [PATCH v13 11/13] platform/x86: Intel SGX driver

2018-09-07 Thread Sean Christopherson
On Thu, Sep 06, 2018 at 05:50:01PM -0700, Joe Perches wrote:
> On Thu, 2018-09-06 at 19:35 +0200, Miguel Ojeda wrote:
> > > Which one is right and why the kernel tree is polluted with C99-headers
> > > when they do not pass checkpatch.pl?
> 
> checkpatch ignores c99 headers since 2016.

Jarkko was referring to c99 comments for the SPDX license.  checkpatch
explicitly requires c-style comments for headers and assembly files as
dictated by Documentation/process/license-rules.rst.  

$ grep -r SPDX **/*.h | grep \/\/ | wc -l
665

$ grep -r SPDX **/*.S | grep \/\/ | wc -l
22
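
For illustration, the comment styles that license-rules.rst asks for look
like this, on the first line of the file (second line for scripts):

	// SPDX-License-Identifier: GPL-2.0        (.c files)
	/* SPDX-License-Identifier: GPL-2.0 */     (.h and .S files)
	# SPDX-License-Identifier: GPL-2.0         (scripts)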

$ git show 9f3a89926d6df
commit 9f3a89926d6dfc30a4fd1bbcb92cc7b218d3786d
Author: Rob Herring 
Date:   Tue Apr 10 16:33:13 2018 -0700

checkpatch.pl: add SPDX license tag check

Add SPDX license tag check based on the rules defined in
Documentation/process/license-rules.rst.  To summarize, SPDX license
tags should be on the 1st line (or 2nd line in scripts) using the
appropriate comment style for the file type.

Link: http://lkml.kernel.org/r/20180202154026.15298-1-r...@kernel.org
Signed-off-by: Rob Herring 
Signed-off-by: Joe Perches 
Acked-by: Greg Kroah-Hartman 
Acked-by: Philippe Ombredanne 
Cc: Andy Whitcroft 
Cc: Joe Perches 
Cc: Thomas Gleixner 
Cc: Igor Stoppa 
Cc: Jonathan Corbet 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index b464a4c3f863..0f022b56f117 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -2257,6 +2257,8 @@ sub process {

my $camelcase_file_seeded = 0;

+   my $checklicenseline = 1;
+
sanitise_line_reset();
my $line;
foreach my $rawline (@rawlines) {
@@ -2448,6 +2450,7 @@ sub process {
} else {
$check = $check_orig;
}
+   $checklicenseline = 1;
next;
}

@@ -2911,6 +2914,30 @@ sub process {
}
}

+# check for using SPDX license tag at beginning of files
+   if ($realline == $checklicenseline) {
+   if ($rawline =~ /^[ \+]\s*\#\!\s*\//) {
+   $checklicenseline = 2;
+   } elsif ($rawline =~ /^\+/) {
+   my $comment = "";
+   if ($realfile =~ /\.(h|s|S)$/) {
+   $comment = '/*';
+   } elsif ($realfile =~ /\.(c|dts|dtsi)$/) {
+   $comment = '//';
+   } elsif (($checklicenseline == 2) || $realfile =~ /\.(sh|pl|py|awk|tc)$/) {
+   $comment = '#';
+   } elsif ($realfile =~ /\.rst$/) {
+   $comment = '..';
+   }
+
+   if ($comment !~ /^$/ &&
+   $rawline !~ /^\+\Q$comment\E SPDX-License-Identifier: /) {
+   WARN("SPDX_LICENSE_TAG",
+"Missing or malformed 
SPDX-License-Identifier tag in line $checklicenseline\n" . $herecurr);
+   }
+   }
+   }
+
 # check we are in a valid source file if not then ignore this hunk
next if ($realfile !~ /\.(h|c|s|S|sh|dtsi|dts)$/);





Re: [PATCH] mfd: ti-lmu: constify mfd_cell tables

2018-09-07 Thread Dan Murphy
Pavel

On 09/07/2018 04:39 AM, Pavel Machek wrote:
> On Wed 2018-08-29 11:31:08, Pavel Machek wrote:
>> From: Sebastian Reichel 
>>
>> mfd: ti-lmu: constify mfd_cell tables
>> 
>> Add const attribute to all mfd_cell structures.
>> 
>> Signed-off-by: Sebastian Reichel 
>> Signed-off-by: Pavel Machek 
> 
> Lee, I guess this is for you to apply. Any news there?
> 
> There are more patches ready,

As I stated in another email thread, I don't see the need for this level of
LMU framework.

Here is the reference thread

https://lore.kernel.org/patchwork/patch/982550/

> 
> https://git.kernel.org/pub/scm/linux/kernel/git/tmlind/linux-omap.git/log/?h=droid4-pending-v4.19
> 
> and it would be good to get them in. (Alternatively, you can just
> cherry-pick them from droid4-pending-v4.19).
> 
> Thanks,
>   Pavel
>   
> 
>> diff --git a/drivers/mfd/ti-lmu.c b/drivers/mfd/ti-lmu.c
>> index cfb411c..990437e 100644
>> --- a/drivers/mfd/ti-lmu.c
>> +++ b/drivers/mfd/ti-lmu.c
>> @@ -25,7 +25,7 @@
>>  #include 
>>  
>>  struct ti_lmu_data {
>> -struct mfd_cell *cells;
>> +const struct mfd_cell *cells;
>>  int num_cells;
>>  unsigned int max_register;
>>  };
>> @@ -63,7 +63,7 @@ static void ti_lmu_disable_hw(struct ti_lmu *lmu)
>>  gpio_set_value(lmu->en_gpio, 0);
>>  }
>>  
>> -static struct mfd_cell lm3532_devices[] = {
>> +static const struct mfd_cell lm3532_devices[] = {
>>  {
>>  .name  = "ti-lmu-backlight",
>>  .id= LM3532,
>> @@ -78,7 +78,7 @@ static struct mfd_cell lm3532_devices[] = {
>>  .of_compatible = "ti,lm363x-regulator", \
>>  }   \
>>  
>> -static struct mfd_cell lm3631_devices[] = {
>> +static const struct mfd_cell lm3631_devices[] = {
>>  LM363X_REGULATOR(LM3631_BOOST),
>>  LM363X_REGULATOR(LM3631_LDO_CONT),
>>  LM363X_REGULATOR(LM3631_LDO_OREF),
>> @@ -91,7 +91,7 @@ static struct mfd_cell lm3631_devices[] = {
>>  },
>>  };
>>  
>> -static struct mfd_cell lm3632_devices[] = {
>> +static const struct mfd_cell lm3632_devices[] = {
>>  LM363X_REGULATOR(LM3632_BOOST),
>>  LM363X_REGULATOR(LM3632_LDO_POS),
>>  LM363X_REGULATOR(LM3632_LDO_NEG),
>> @@ -102,7 +102,7 @@ static struct mfd_cell lm3632_devices[] = {
>>  },
>>  };
>>  
>> -static struct mfd_cell lm3633_devices[] = {
>> +static const struct mfd_cell lm3633_devices[] = {
>>  {
>>  .name  = "ti-lmu-backlight",
>>  .id= LM3633,
>> @@ -120,7 +120,7 @@ static struct mfd_cell lm3633_devices[] = {
>>  },
>>  };
>>  
>> -static struct mfd_cell lm3695_devices[] = {
>> +static const struct mfd_cell lm3695_devices[] = {
>>  {
>>  .name  = "ti-lmu-backlight",
>>  .id= LM3695,
>> @@ -128,7 +128,7 @@ static struct mfd_cell lm3695_devices[] = {
>>  },
>>  };
>>  
>> -static struct mfd_cell lm3697_devices[] = {
>> +static const struct mfd_cell lm3697_devices[] = {
>>  {
>>  .name  = "ti-lmu-backlight",
>>  .id= LM3697,
>>
> 
> 
> 


-- 
--
Dan Murphy


Re: [PATCH 2/6] md: convert to kvmalloc

2018-09-07 Thread Matthew Wilcox
On Fri, Sep 07, 2018 at 12:56:31PM -0400, Kent Overstreet wrote:
> @@ -165,7 +164,7 @@ ops_run_partial_parity(struct stripe_head *sh, struct 
> raid5_percpu *percpu,
>  struct dma_async_tx_descriptor *tx)
>  {
>   int disks = sh->disks;
> - struct page **srcs = flex_array_get(percpu->scribble, 0);
> + struct page **srcs = percpu->scribble;
>   int count = 0, pd_idx = sh->pd_idx, i;
>   struct async_submit_ctl submit;
>  
> @@ -196,8 +195,8 @@ ops_run_partial_parity(struct stripe_head *sh, struct 
> raid5_percpu *percpu,
>   }
>  
>   init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx,
> -   NULL, sh, flex_array_get(percpu->scribble, 0)
> -   + sizeof(struct page *) * (sh->disks + 2));
> +   NULL, sh, percpu->scribble +
> +   sizeof(struct page *) * (sh->disks + 2));

I think this would read better written as:

init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx,
  NULL, sh, srcs + sh->disks + 2);

>  static addr_conv_t *to_addr_conv(struct stripe_head *sh,
>struct raid5_percpu *percpu, int i)
>  {
> - void *addr;
> -
> - addr = flex_array_get(percpu->scribble, i);
> - return addr + sizeof(struct page *) * (sh->disks + 2);
> + return percpu->scribble + i * percpu->scribble_obj_size +
> + sizeof(struct page *) * (sh->disks + 2);
>  }
>  
>  /* return a pointer to the address conversion region of the scribble buffer 
> */
>  static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
>  {
> - void *addr;
> -
> - addr = flex_array_get(percpu->scribble, i);
> - return addr;
> + return percpu->scribble + i * percpu->scribble_obj_size;
>  }

Perhaps this would be better as ...

 static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
 {
-   void *addr;
-
-   addr = flex_array_get(percpu->scribble, i);
-   return addr;
+   return percpu->scribble + i * percpu->scribble_obj_size;
 }

 static addr_conv_t *to_addr_conv(struct stripe_head *sh,
 struct raid5_percpu *percpu, int i)
 {
-   void *addr;
-
-   addr = flex_array_get(percpu->scribble, i);
-   return addr + sizeof(struct page *) * (sh->disks + 2);
+   return to_addr_page(percpu, i) + sh->disks + 2;
 }


The rest looks good.


Re: [PATCH] ASoC: max98373: usleep_range() needs include/delay.h

2018-09-07 Thread Mark Brown
On Fri, Sep 07, 2018 at 10:52:24AM -0700, Grant Grundler wrote:
> On Fri, Sep 7, 2018 at 5:11 AM Mark Brown  wrote:

> > Note that this isn't causing a warning upstream, presumably due to an
> > implicit inclusion that isn't present in the v4.4 kernel that you appear
> > to be using, or gets missed due to config differences.

> Ok. Is this just an observation or are these reasons to not accept the
> change?

An observation, you should already have a mail about it being applied.  


signature.asc
Description: PGP signature


[PATCH v6 5/5] x86/kvm: Avoid dynamic allocation of pvclock data when SEV is active

2018-09-07 Thread Brijesh Singh
Currently, the per-cpu pvclock data is allocated dynamically when
cpu > HVC_BOOT_ARRAY_SIZE. The physical address of this variable is
shared between the guest and the hypervisor, hence it must be mapped as
unencrypted (i.e. C=0) when SEV is active.

The C-bit works at page granularity, hence we would be required to perform
a full 4k page allocation to store a single 32-byte pvclock variable. That
would waste a fairly sizeable amount of memory, since each CPU would be
doing a separate 4k allocation. Let's define a second array for the SEV
case, statically allocated for NR_CPUS, and put this array in the
.data..decrypted section so that it is mapped with C=0 during boot. The
.data..decrypted section has a big chunk of memory that is currently
unused. And since the second array is used only when memory encryption is
active, free it when encryption is not active.

Signed-off-by: Brijesh Singh 
Suggested-by: Sean Christopherson 
Cc: Tom Lendacky 
Cc: k...@vger.kernel.org
Cc: Thomas Gleixner 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: linux-kernel@vger.kernel.org
Cc: Paolo Bonzini 
Cc: Sean Christopherson 
Cc: k...@vger.kernel.org
Cc: "Radim Krčmář" 
---
 arch/x86/include/asm/mem_encrypt.h |  4 
 arch/x86/kernel/kvmclock.c | 14 ++
 arch/x86/kernel/vmlinux.lds.S  |  3 +++
 arch/x86/mm/init.c |  3 +++
 arch/x86/mm/mem_encrypt.c  | 10 ++
 5 files changed, 34 insertions(+)

diff --git a/arch/x86/include/asm/mem_encrypt.h 
b/arch/x86/include/asm/mem_encrypt.h
index 802b2eb..cc46584 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -48,11 +48,13 @@ int __init early_set_memory_encrypted(unsigned long vaddr, 
unsigned long size);
 
 /* Architecture __weak replacement functions */
 void __init mem_encrypt_init(void);
+void __init free_decrypted_mem(void);
 
 bool sme_active(void);
 bool sev_active(void);
 
 #define __decrypted __attribute__((__section__(".data..decrypted")))
+#define __decrypted_aux __attribute__((__section__(".data..decrypted.aux")))
 
 #else  /* !CONFIG_AMD_MEM_ENCRYPT */
 
@@ -80,6 +82,7 @@ static inline int __init
 early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; }
 
 #define __decrypted
+#define __decrypted_aux
 
 #endif /* CONFIG_AMD_MEM_ENCRYPT */
 
@@ -93,6 +96,7 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long 
size) { return 0;
 #define __sme_pa_nodebug(x)(__pa_nodebug(x) | sme_me_mask)
 
 extern char __start_data_decrypted[], __end_data_decrypted[];
+extern char __start_data_decrypted_aux[];
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 376fd3a..6086b56 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -65,6 +65,15 @@ static struct pvclock_vsyscall_time_info
 static struct pvclock_wall_clock wall_clock __decrypted;
 static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
 
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+/*
+ * The auxiliary array will be used when SEV is active. In the non-SEV case,
+ * it will be freed by free_decrypted_mem().
+ */
+static struct pvclock_vsyscall_time_info
+   hv_clock_aux[NR_CPUS] __decrypted_aux;
+#endif
+
 static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
 {
return &this_cpu_read(hv_clock_per_cpu)->pvti;
@@ -269,6 +278,11 @@ static int kvmclock_setup_percpu(unsigned int cpu)
/* Use the static page for the first CPUs, allocate otherwise */
if (cpu < HVC_BOOT_ARRAY_SIZE)
p = &hv_clock_boot[cpu];
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+   /* Use the static page from auxiliary array instead of allocating it. */
+   else if (sev_active())
p = &hv_clock_aux[cpu - HVC_BOOT_ARRAY_SIZE];
+#endif
else
p = kzalloc(sizeof(*p), GFP_KERNEL);
 
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 4cb1064..bde287a 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -77,6 +77,9 @@ jiffies_64 = jiffies;
. = ALIGN(PMD_SIZE);\
__start_data_decrypted = .; \
*(.data..decrypted);\
+   . = ALIGN(PAGE_SIZE);   \
+   __start_data_decrypted_aux = .; \
+   *(.data..decrypted.aux);\
. = ALIGN(PMD_SIZE);\
__end_data_decrypted = .;   \
 
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 7a8fc26..052b279 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -815,9 +815,12 @@ void free_kernel_image_pages(void *begin, void *end)
set_memory_np_noalias(begin_ul, len_pages);
 }
 
+void __weak free_decrypted_mem(void) { }
+
 void __ref free_initmem(void)
 {

[PATCH v6 3/5] x86/mm: add .data..decrypted section to hold shared variables

2018-09-07 Thread Brijesh Singh
kvmclock defines a few static variables which are shared with the
hypervisor during kvmclock initialization.

When SEV is active, memory is encrypted with a guest-specific key, and
if the guest OS wants to share a memory region with the hypervisor then it
must clear the C-bit before sharing it. Currently, we use
kernel_physical_mapping_init() to split large pages before clearing the
C-bit on shared pages. But it fails when called from the kvmclock
initialization (mainly because the memblock allocator is not ready that
early during boot).

Add a __decrypted section attribute which can be used when defining
such shared variables. The so-defined variables will be placed in the
.data..decrypted section. This section is mapped with C=0 early during
boot; we also ensure that the initialized values are updated to match
with C=0 (i.e. perform an in-place decryption). The .data..decrypted
section is PMD-aligned and sized so that we avoid the need to split the
large pages when mapping the section.

sme_encrypt_kernel() was used to perform the in-place encryption of the
Linux kernel and initrd when SME is active. The routine has been enhanced
to decrypt the .data..decrypted section for both the SME and SEV cases.

Signed-off-by: Brijesh Singh 
Reviewed-by: Tom Lendacky 
Cc: Tom Lendacky 
Cc: k...@vger.kernel.org
Cc: Thomas Gleixner 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: linux-kernel@vger.kernel.org
Cc: Paolo Bonzini 
Cc: Sean Christopherson 
Cc: k...@vger.kernel.org
Cc: "Radim Krčmář" 
---
 arch/x86/include/asm/mem_encrypt.h |  6 +++
 arch/x86/kernel/head64.c   | 11 +
 arch/x86/kernel/vmlinux.lds.S  | 17 +++
 arch/x86/mm/mem_encrypt_identity.c | 94 --
 4 files changed, 113 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/mem_encrypt.h 
b/arch/x86/include/asm/mem_encrypt.h
index c064383..802b2eb 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -52,6 +52,8 @@ void __init mem_encrypt_init(void);
 bool sme_active(void);
 bool sev_active(void);
 
+#define __decrypted __attribute__((__section__(".data..decrypted")))
+
 #else  /* !CONFIG_AMD_MEM_ENCRYPT */
 
 #define sme_me_mask0ULL
@@ -77,6 +79,8 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long 
size) { return 0;
 static inline int __init
 early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; }
 
+#define __decrypted
+
 #endif /* CONFIG_AMD_MEM_ENCRYPT */
 
 /*
@@ -88,6 +92,8 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long 
size) { return 0;
 #define __sme_pa(x)(__pa(x) | sme_me_mask)
 #define __sme_pa_nodebug(x)(__pa_nodebug(x) | sme_me_mask)
 
+extern char __start_data_decrypted[], __end_data_decrypted[];
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* __X86_MEM_ENCRYPT_H__ */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 8047379..af39d68 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -112,6 +112,7 @@ static bool __head check_la57_support(unsigned long 
physaddr)
 unsigned long __head __startup_64(unsigned long physaddr,
  struct boot_params *bp)
 {
+   unsigned long vaddr, vaddr_end;
unsigned long load_delta, *p;
unsigned long pgtable_flags;
pgdval_t *pgd;
@@ -234,6 +235,16 @@ unsigned long __head __startup_64(unsigned long physaddr,
/* Encrypt the kernel and related (if SME is active) */
sme_encrypt_kernel(bp);
 
+   /* Clear the memory encryption mask from the .data..decrypted section. */
+   if (mem_encrypt_active()) {
+   vaddr = (unsigned long)__start_data_decrypted;
+   vaddr_end = (unsigned long)__end_data_decrypted;
+   for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
+   i = pmd_index(vaddr);
+   pmd[i] -= sme_get_me_mask();
+   }
+   }
+
/*
 * Return the SME encryption mask (if SME is active) to be used as a
 * modifier for the initial pgdir entry programmed into CR3.
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 8bde0a4..4cb1064 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -65,6 +65,21 @@ jiffies_64 = jiffies;
 #define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE);
 #define ALIGN_ENTRY_TEXT_END   . = ALIGN(PMD_SIZE);
 
+/*
+ * This section contains data which will be mapped as decrypted. Memory
+ * encryption operates on a page basis. Make this section PMD-aligned
+ * to avoid splitting the pages while mapping the section early.
+ *
+ * Note: We use a separate section so that only this section gets
+ * decrypted to avoid exposing more than we wish.
+ */
+#define DATA_DECRYPTED \
+   . = ALIGN(PMD_SIZE);\
+   __start_data_decrypted = .; \
+   

[PATCH v6 4/5] x86/kvm: use __decrypted attribute in shared variables

2018-09-07 Thread Brijesh Singh
Commit 368a540e0232 ("x86/kvmclock: Remove memblock dependency")
caused an SEV guest regression. When SEV is active, we map the shared
variables (wall_clock and hv_clock_boot) with C=0 to ensure that both
the guest and the hypervisor are able to access the data. To map the
variables we use kernel_physical_mapping_init() to split the large pages,
but splitting large pages requires allocating a new PMD, which fails now
that kvmclock initialization is called early during boot.

Recently we added a special .data..decrypted section to hold the shared
variables. This section is mapped with C=0 early during boot. Use the
__decrypted attribute to put wall_clock and hv_clock_boot in the
.data..decrypted section so that they are mapped with C=0.

Signed-off-by: Brijesh Singh 
Reviewed-by: Tom Lendacky 
Fixes: 368a540e0232 ("x86/kvmclock: Remove memblock dependency")
Cc: Tom Lendacky 
Cc: k...@vger.kernel.org
Cc: Thomas Gleixner 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: linux-kernel@vger.kernel.org
Cc: Paolo Bonzini 
Cc: Sean Christopherson 
Cc: k...@vger.kernel.org
Cc: "Radim Krčmář" 
---
 arch/x86/kernel/kvmclock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 1e67646..376fd3a 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -61,8 +61,8 @@ early_param("no-kvmclock-vsyscall", 
parse_no_kvmclock_vsyscall);
(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
 
 static struct pvclock_vsyscall_time_info
-   hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE);
-static struct pvclock_wall_clock wall_clock;
+   hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __decrypted __aligned(PAGE_SIZE);
+static struct pvclock_wall_clock wall_clock __decrypted;
 static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
 
 static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
-- 
2.7.4



Re: Patch "arm64: mm: always enable CONFIG_HOLES_IN_ZONE" has been added to the 4.9-stable tree

2018-09-07 Thread Nathan Chancellor
On Fri, Sep 07, 2018 at 02:57:51PM +0200, gre...@linuxfoundation.org wrote:
> 
> This is a note to let you know that I've just added the patch titled
> 
> arm64: mm: always enable CONFIG_HOLES_IN_ZONE
> 
> to the 4.9-stable tree which can be found at:
> 
> http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary
> 
> The filename of the patch is:
>  arm64-mm-always-enable-config_holes_in_zone.patch
> and it can be found in the queue-4.9 subdirectory.
> 
> If you, or anyone else, feels it should not be added to the stable tree,
> please let  know about it.
> 
> 
> From f52bb98f5aded4c43e52f5ce19fb83f7261e9e73 Mon Sep 17 00:00:00 2001
> From: James Morse 
> Date: Thu, 30 Aug 2018 16:05:32 +0100
> Subject: arm64: mm: always enable CONFIG_HOLES_IN_ZONE
> 
> From: James Morse 
> 
> commit f52bb98f5aded4c43e52f5ce19fb83f7261e9e73 upstream.
> 
> Commit 6d526ee26ccd ("arm64: mm: enable CONFIG_HOLES_IN_ZONE for NUMA")
> only enabled HOLES_IN_ZONE for NUMA systems because the NUMA code was
> choking on the missing zone for nomap pages. This problem doesn't just
> apply to NUMA systems.
> 
> If the architecture doesn't set HAVE_ARCH_PFN_VALID, pfn_valid() will
> return true if the pfn is part of a valid sparsemem section.
> 
> When working with multiple pages, the mm code uses pfn_valid_within()
> to test that each page it uses within the sparsemem section is valid. On
> most systems memory comes in MAX_ORDER_NR_PAGES chunks which all
> have valid/initialised struct pages. In this case pfn_valid_within()
> is optimised out.
> 
> Systems where this isn't true (e.g. due to nomap) should set
> HOLES_IN_ZONE and provide HAVE_ARCH_PFN_VALID so that mm tests each
> page as it works with it.
> 
> Currently non-NUMA arm64 systems can't enable HOLES_IN_ZONE, leading to
> a VM_BUG_ON():
> 
> | page:fdff802e1780 is uninitialized and poisoned
> | raw:    
> | raw:    
> | page dumped because: VM_BUG_ON_PAGE(PagePoisoned(p))
> | [ cut here ]
> | kernel BUG at include/linux/mm.h:978!
> | Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
> [...]
> | CPU: 1 PID: 25236 Comm: dd Not tainted 4.18.0 #7
> | Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015
> | pstate: 4085 (nZcv daIf -PAN -UAO)
> | pc : move_freepages_block+0x144/0x248
> | lr : move_freepages_block+0x144/0x248
> | sp : fe0071177680
> [...]
> | Process dd (pid: 25236, stack limit = 0x94cc07fb)
> | Call trace:
> |  move_freepages_block+0x144/0x248
> |  steal_suitable_fallback+0x100/0x16c
> |  get_page_from_freelist+0x440/0xb20
> |  __alloc_pages_nodemask+0xe8/0x838
> |  new_slab+0xd4/0x418
> |  ___slab_alloc.constprop.27+0x380/0x4a8
> |  __slab_alloc.isra.21.constprop.26+0x24/0x34
> |  kmem_cache_alloc+0xa8/0x180
> |  alloc_buffer_head+0x1c/0x90
> |  alloc_page_buffers+0x68/0xb0
> |  create_empty_buffers+0x20/0x1ec
> |  create_page_buffers+0xb0/0xf0
> |  __block_write_begin_int+0xc4/0x564
> |  __block_write_begin+0x10/0x18
> |  block_write_begin+0x48/0xd0
> |  blkdev_write_begin+0x28/0x30
> |  generic_perform_write+0x98/0x16c
> |  __generic_file_write_iter+0x138/0x168
> |  blkdev_write_iter+0x80/0xf0
> |  __vfs_write+0xe4/0x10c
> |  vfs_write+0xb4/0x168
> |  ksys_write+0x44/0x88
> |  sys_write+0xc/0x14
> |  el0_svc_naked+0x30/0x34
> | Code: aa1303e0 90001a01 91296421 94008902 (d421)
> | ---[ end trace 1601ba47f6e883fe ]---
> 
> Remove the NUMA dependency.
> 
> Link: https://www.spinics.net/lists/arm-kernel/msg671851.html
> Cc: 
> Cc: Ard Biesheuvel 
> Reported-by: Mikulas Patocka 
> Reviewed-by: Pavel Tatashin 
> Tested-by: Mikulas Patocka 
> Signed-off-by: James Morse 
> Signed-off-by: Will Deacon 
> Signed-off-by: Greg Kroah-Hartman 
> 
> ---
>  arch/arm64/Kconfig |1 -
>  1 file changed, 1 deletion(-)
> 
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -631,7 +631,6 @@ config HAVE_SETUP_PER_CPU_AREA
>  
>  config NEED_PER_CPU_EMBED_FIRST_CHUNK

Looks like git got confused here, this isn't HOLES_IN_ZONE.

Additionally, commit 6d526ee26ccd ("arm64: mm: enable
CONFIG_HOLES_IN_ZONE for NUMA") that introduced it to this file didn't
appear until 4.11 so this patch can be dropped from 4.9.

>   def_bool y
> - depends on NUMA
>  
>  source kernel/Kconfig.preempt
>  source kernel/Kconfig.hz
> 
> 
> Patches currently in stable-queue which might be from james.mo...@arm.com are
> 
> queue-4.9/arm64-mm-always-enable-config_holes_in_zone.patch
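For reference, the pfn_valid_within() pattern James describes boils
down to this (simplified sketch, not the exact mm code):

  /* walk a range of pages inside one sparsemem section */
  for (pfn = start_pfn; pfn < end_pfn; pfn++) {
          if (!pfn_valid_within(pfn))   /* folds to true unless HOLES_IN_ZONE */
                  continue;
          page = pfn_to_page(pfn);
          /* safe: struct page is valid and initialised here */
  }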


Re: [PATCH] of: Split up name & type in modalias generation

2018-09-07 Thread Rob Herring
On Fri, Sep 7, 2018 at 9:22 AM Thierry Reding  wrote:
>
> From: Thierry Reding 
>
> The kernel's vsnprintf() implementation discards all alpha-numeric
> characters following a %p conversion specifier. This is done in order to
> generically skip any of the various modifiers that the kernel supports.
> Unfortunately, the OF modalias is generated with a format string that
> violates the assumption made by vsnprintf():
>
> of:N%pOFnT%s
>
> While processing the above format string, vsnprintf() will eat the 'T'
> character, assuming that it belongs to the preceding %p specifier. This
> results in a modalias with an incompatible format, which in turn causes
> the automatic loading of drivers based on modalias to no longer work.
>
> To fix this, split up the generation of the name & type fields into two
> separate snprintf() calls to avoid confusing the parser.
>
> Fixes: 73813f8483b1 ("of: Convert to using %pOFn instead of device_node.name")
> Signed-off-by: Thierry Reding 
> ---
> Note that a more elegant fix would be to make the %p format specifier
> parser report back the exact number of characters consumed. I briefly
> tried to implement it, but quickly ran into numerous special cases
> that make this solution rather involved.
>
> I can spend some more time to improve this in general if that's what we
> ultimately want, but I think this patch is a better short-term fix to
> workaround the issue.

See my reply on the original patch. I've updated the patch in my
dt/next branch with the fix to use %c.

Rob
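
For reference, the two shapes of the fix discussed in this thread look
roughly like this (sketch only; str, len, and type are placeholders,
not the exact variables in drivers/of/device.c):

  /* option 1: split the calls so 'T' never follows a %p specifier */
  csize = snprintf(str, len, "of:N%pOFn", dev->of_node);
  csize += snprintf(str + csize, len > csize ? len - csize : 0,
                    "T%s", type);

  /* option 2: emit the 'T' via %c so vsnprintf() cannot swallow it */
  csize = snprintf(str, len, "of:N%pOFn%c%s", dev->of_node, 'T', type);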


Re: [PATCH RFC LKMM 1/7] tools/memory-model: Add extra ordering for locks and remove it for ordinary release/acquire

2018-09-07 Thread Alan Stern
On Thu, 6 Sep 2018, Andrea Parri wrote:

> > Have you noticed any part of the generic code that relies on ordinary 
> > acquire-release (rather than atomic RMW acquire-release) in order to 
> > implement locking constructs?
> 
> There are several places in code where the "lock-acquire" seems to be
> provided by an atomic_cond_read_acquire/smp_cond_load_acquire: I have
> mentioned one in qspinlock in this thread; qrwlock and mcs_spinlock
> provide other examples (grep for the primitives...).
> 
> As long as we don't consider these primitives as RMW (which would seem
> odd...) or as acquires for which "most people expect strong ordering"
> (see above), these provide other examples of the _gap_ I mentioned.

Okay, now I understand your objection.  It does appear that on RISC-V,
if nowhere else, the current implementations of qspinlock, qrwlock,
etc. will not provide "RCtso" ordering.

The discussions surrounding this topic have been so lengthy and 
confusing that I have lost track of any comments Palmer or Daniel may 
have made concerning this potential problem.

One possible resolution would be to define smp_cond_load_acquire()
specially on RISC-V so that it provides the same ordering guarantees as
RMW-acquire.  (Plus adding a comment in asm-generic/barrier.h
pointing out the necessity for the stronger guarantee on all
architectures.)
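
(A rough sketch of that first option, purely illustrative -- whether a
full barrier is the right strengthening on RISC-V is exactly the open
question:)

  /* hypothetical arch override: upgrade the acquire to RCsc strength */
  #define smp_cond_load_acquire(ptr, cond_expr) ({                      \
          typeof(*(ptr)) _val = smp_cond_load_relaxed(ptr, cond_expr);  \
          smp_mb();  /* full barrier, stronger than a plain acquire */  \
          _val;                                                         \
  })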

Another would be to replace the usages of atomic/smp_cond_load_acquire 
in the locking constructs with a new function that would otherwise be 
the same but would provide the ordering guarantee we want.

Do you think either of these would be an adequate fix?

Alan



Re: [PATCH] apparmor: Fix network performance issue in aa_label_sk_perm

2018-09-07 Thread John Johansen
On 09/06/2018 09:33 PM, Tony Jones wrote:
> The netperf benchmark shows a 5.73% reduction in throughput for 
> small (64 byte) transfers by unconfined tasks.
> 
> DEFINE_AUDIT_SK() in aa_label_sk_perm() should not be performed 
> unconditionally, rather only when the label is confined.
> 
> netperf-tcp
>                     56974a6fc^             56974a6fc
> Min    64         563.48 (   0.00%)      531.17 (  -5.73%)
> Min    128       1056.92 (   0.00%)      999.44 (  -5.44%)
> Min    256       1945.95 (   0.00%)     1867.97 (  -4.01%)
> Min    1024      6761.40 (   0.00%)     6364.23 (  -5.87%)
> Min    2048     11110.53 (   0.00%)    10606.20 (  -4.54%)
> Min    3312     13692.67 (   0.00%)    13158.41 (  -3.90%)
> Min    4096     14926.29 (   0.00%)    14457.46 (  -3.14%)
> Min    8192     18399.34 (   0.00%)    18091.65 (  -1.67%)
> Min    16384    21384.13 (   0.00%)    21158.05 (  -1.06%)
> Hmean  64         564.96 (   0.00%)      534.38 (  -5.41%)
> Hmean  128       1064.42 (   0.00%)     1010.12 (  -5.10%)
> Hmean  256       1965.85 (   0.00%)     1879.16 (  -4.41%)
> Hmean  1024      6839.77 (   0.00%)     6478.70 (  -5.28%)
> Hmean  2048     11154.80 (   0.00%)    10671.13 (  -4.34%)
> Hmean  3312     13838.12 (   0.00%)    13249.01 (  -4.26%)
> Hmean  4096     15009.99 (   0.00%)    14561.36 (  -2.99%)
> Hmean  8192     18975.57 (   0.00%)    18326.54 (  -3.42%)
> Hmean  16384    21440.44 (   0.00%)    21324.59 (  -0.54%)
> Stddev 64           1.24 (   0.00%)        2.85 (-130.64%)
> Stddev 128          4.51 (   0.00%)        6.53 ( -44.84%)
> Stddev 256         11.67 (   0.00%)        8.50 (  27.16%)
> Stddev 1024        48.33 (   0.00%)       75.07 ( -55.34%)
> Stddev 2048        54.82 (   0.00%)       65.16 ( -18.86%)
> Stddev 3312       153.57 (   0.00%)       56.29 (  63.35%)
> Stddev 4096       100.25 (   0.00%)       88.50 (  11.72%)
> Stddev 8192       358.13 (   0.00%)      169.99 (  52.54%)
> Stddev 16384       43.99 (   0.00%)      141.82 (-222.39%)
> 
> Signed-off-by: Tony Jones 
> Fixes: 56974a6fcfef ("apparmor: add base infastructure for socket mediation")

hey Tony,

thanks for the patch. I am curious, did your investigation look
into which parts of DEFINE_AUDIT_SK are causing the issue?

Regardless, I have pulled it into apparmor next.

> ---
>  security/apparmor/net.c | 15 +--
>  1 file changed, 9 insertions(+), 6 deletions(-)
> 
> diff --git a/security/apparmor/net.c b/security/apparmor/net.c
> index bb24cfa0a164..d5d72dd1ca1f 100644
> --- a/security/apparmor/net.c
> +++ b/security/apparmor/net.c
> @@ -146,17 +146,20 @@ int aa_af_perm(struct aa_label *label, const char *op, u32 request, u16 family,
>  static int aa_label_sk_perm(struct aa_label *label, const char *op, u32 request,
>   struct sock *sk)
>  {
> - struct aa_profile *profile;
> - DEFINE_AUDIT_SK(sa, op, sk);
> + int error = 0;
>  
>   AA_BUG(!label);
>   AA_BUG(!sk);
>  
> - if (unconfined(label))
> - return 0;
> + if (!unconfined(label)) {
> + struct aa_profile *profile;
> + DEFINE_AUDIT_SK(sa, op, sk);
>  
> - return fn_for_each_confined(label, profile,
> - aa_profile_af_sk_perm(profile, &sa, request, sk));
> + error = fn_for_each_confined(label, profile,
> + aa_profile_af_sk_perm(profile, &sa, request, sk));
> + }
> +
> + return error;
>  }
>  
>  int aa_sk_perm(const char *op, u32 request, struct sock *sk)
> 



[PATCH 0/4] 9p coverity fixes

2018-09-07 Thread Dominique Martinet
From: Dominique Martinet 

Since we already had one coverity fix for 9p, I figured I could request
an account and look at stuff that actually could matter.

The leak of glock.client_id wasn't found by coverity, but rather while I
was looking at a false positive there. Of the rest, the rdma one is of
little consequence, but the other two are pretty important -- I will
probably mark the three useful ones for backport to stable kernels.

As usual, comments are more than welcome, but I'll probably push them to
linux-next along with the other patches that need testing, after testing
the whole batch together next week.


Dominique Martinet (4):
  9p: acl: fix uninitialized iattr access
  9p/rdma: remove useless check in cm_event_handler
  9p: p9dirent_read: check network-provided name length
  9p locks: fix glock.client_id leak in do_lock

 fs/9p/acl.c |  2 +-
 fs/9p/vfs_file.c| 16 ++--
 net/9p/protocol.c   | 12 +---
 net/9p/trans_rdma.c |  3 +--
 4 files changed, 25 insertions(+), 8 deletions(-)

-- 
2.17.1



[PATCH 4/4] 9p locks: fix glock.client_id leak in do_lock

2018-09-07 Thread Dominique Martinet
From: Dominique Martinet 

The 9p client code overwrites our glock.client_id, which points to a
static buffer, with an allocated string holding the network-provided
value that we do not care about; free and reset the value as
appropriate.

This is almost identical to the leak in v9fs_file_getlock() fixed by
Al Viro in commit ce85dd58ad5a6 ("9p: we are leaking glock.client_id
in v9fs_file_getlock()"), which turned up through a coverity false
positive -- while we are here, attempt to make the code slightly more
robust against future changes of the net/9p/client code and hopefully
clearer to coverity that there is no problem.

Signed-off-by: Dominique Martinet 
---
 fs/9p/vfs_file.c | 16 ++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 73857ebaedfb..a25efa782fcc 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -208,6 +208,14 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
if (schedule_timeout_interruptible(v9ses->session_lock_timeout)
!= 0)
break;
+   /*
+* p9_client_lock_dotl overwrites flock.client_id with the
+* server message, free and reuse the client name
+*/
+   if (flock.client_id != fid->clnt->name) {
+   kfree(flock.client_id);
+   flock.client_id = fid->clnt->name;
+   }
}
 
/* map 9p status to VFS status */
@@ -239,6 +247,8 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
locks_lock_file_wait(filp, fl);
fl->fl_type = fl_type;
}
+   if (flock.client_id != fid->clnt->name)
+   kfree(flock.client_id);
 out:
return res;
 }
@@ -273,7 +283,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
 
res = p9_client_getlock_dotl(fid, &glock);
if (res < 0)
-   return res;
+   goto out;
/* map 9p lock type to os lock type */
switch (glock.type) {
case P9_LOCK_TYPE_RDLCK:
@@ -294,7 +304,9 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
fl->fl_end = glock.start + glock.length - 1;
fl->fl_pid = -glock.proc_id;
}
-   kfree(glock.client_id);
+out:
+   if (glock.client_id != fid->clnt->name)
+   kfree(glock.client_id);
return res;
 }
 
-- 
2.17.1



Re: [PATCH 01/12] blkcg: fix ref count issue with bio_blkcg using task_css

2018-09-07 Thread Tejun Heo
On Thu, Sep 06, 2018 at 05:10:34PM -0400, Dennis Zhou wrote:
> From: "Dennis Zhou (Facebook)" 
> 
> The accessor function bio_blkcg either returns the blkcg associated with
> the bio or finds one in the current context. This can cause an issue
> when trying to associate a bio with a blkcg. Particularly, it's the
> third case that is problematic:
> 
>   return css_to_blkcg(task_css(current, io_cgrp_id));
> 
> As the above may race against task migration and the cgroup exiting, it
> is not always ok to take a reference on the blkcg returned from
> bio_blkcg.
> 
> This patch adds association ahead of calling bio_blkcg rather than
> after. This makes association a required and explicit step along the
> code paths for calling bio_blkcg. blk_get_rl is modified as well to get
> a reference to the blkcg it may use and blk_put_rl will always put the
> reference back. Association is also moved above the bio_blkcg call to
> ensure it will not return NULL in blk-iolatency.
> 
> BFQ and CFQ utilize this flaw, but due to the complexity, I do not want
> to address this in this series. I've created a private version of the
> function with notes not to use it describing the flaw. Hopefully soon,
> that code can be cleaned up.
> 
> Signed-off-by: Dennis Zhou 

Acked-by: Tejun Heo 

-- 
tejun
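
For readers following along, the association pattern the series moves
to looks roughly like this (sketch; bio_associate_blkcg() is the helper
available at this point, but treat the exact calls as illustrative
rather than the final diff):

  /* before: ref whatever bio_blkcg() finds at call time - racy */
  blkcg = bio_blkcg(bio); /* may fall back to task_css(current, io_cgrp_id) */

  /* after: associate explicitly first, so the bio pins its own css ref */
  bio_associate_blkcg(bio, css);
  blkcg = bio_blkcg(bio); /* now stable for the bio's lifetime */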


[PATCH] firmware: arm_scmi: use strlcpy to ensure NULL-terminated strings

2018-09-07 Thread Sudeep Holla
Replace all the memcpy() calls that copy name strings from the firmware
with strlcpy() to make sure we are bounded by the destination buffer size
and always end up with NULL-terminated strings.

This is needed to avoid out of bounds accesses if the firmware returns
a non-terminated string.
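
The difference in a nutshell (illustrative contrast; fw_str stands in
for the raw buffer returned by the firmware):

  char name[SCMI_MAX_STR_SIZE];

  memcpy(name, fw_str, SCMI_MAX_STR_SIZE);  /* blind copy, no NUL guarantee */
  strlcpy(name, fw_str, SCMI_MAX_STR_SIZE); /* bounded write, always terminated */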

Reported-by: Olof Johansson 
Signed-off-by: Sudeep Holla 
---
 drivers/firmware/arm_scmi/base.c| 2 +-
 drivers/firmware/arm_scmi/clock.c   | 2 +-
 drivers/firmware/arm_scmi/perf.c| 2 +-
 drivers/firmware/arm_scmi/power.c   | 2 +-
 drivers/firmware/arm_scmi/sensors.c | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

Hi Olof,

Let me know if this is rc/fix material or needs to wait for v4.20?

Regards,
Sudeep

diff --git a/drivers/firmware/arm_scmi/base.c b/drivers/firmware/arm_scmi/base.c
index 9dff33ea6416..204390297f4b 100644
--- a/drivers/firmware/arm_scmi/base.c
+++ b/drivers/firmware/arm_scmi/base.c
@@ -208,7 +208,7 @@ static int scmi_base_discover_agent_get(const struct scmi_handle *handle,

ret = scmi_do_xfer(handle, t);
if (!ret)
-   memcpy(name, t->rx.buf, SCMI_MAX_STR_SIZE);
+   strlcpy(name, t->rx.buf, SCMI_MAX_STR_SIZE);

scmi_xfer_put(handle, t);

diff --git a/drivers/firmware/arm_scmi/clock.c b/drivers/firmware/arm_scmi/clock.c
index e4119eb34986..30fc04e28431 100644
--- a/drivers/firmware/arm_scmi/clock.c
+++ b/drivers/firmware/arm_scmi/clock.c
@@ -111,7 +111,7 @@ static int scmi_clock_attributes_get(const struct scmi_handle *handle,

ret = scmi_do_xfer(handle, t);
if (!ret)
-   memcpy(clk->name, attr->name, SCMI_MAX_STR_SIZE);
+   strlcpy(clk->name, attr->name, SCMI_MAX_STR_SIZE);
else
clk->name[0] = '\0';

diff --git a/drivers/firmware/arm_scmi/perf.c b/drivers/firmware/arm_scmi/perf.c
index 721e6c57beae..c3b0041defee 100644
--- a/drivers/firmware/arm_scmi/perf.c
+++ b/drivers/firmware/arm_scmi/perf.c
@@ -168,7 +168,7 @@ scmi_perf_domain_attributes_get(const struct scmi_handle *handle, u32 domain,
le32_to_cpu(attr->sustained_perf_level);
dom_info->mult_factor = (dom_info->sustained_freq_khz * 1000) /
dom_info->sustained_perf_level;
-   memcpy(dom_info->name, attr->name, SCMI_MAX_STR_SIZE);
+   strlcpy(dom_info->name, attr->name, SCMI_MAX_STR_SIZE);
}

scmi_xfer_put(handle, t);
diff --git a/drivers/firmware/arm_scmi/power.c b/drivers/firmware/arm_scmi/power.c
index cfa033b05aed..62f3401a1f01 100644
--- a/drivers/firmware/arm_scmi/power.c
+++ b/drivers/firmware/arm_scmi/power.c
@@ -106,7 +106,7 @@ scmi_power_domain_attributes_get(const struct scmi_handle *handle, u32 domain,
dom_info->state_set_notify = SUPPORTS_STATE_SET_NOTIFY(flags);
dom_info->state_set_async = SUPPORTS_STATE_SET_ASYNC(flags);
dom_info->state_set_sync = SUPPORTS_STATE_SET_SYNC(flags);
-   memcpy(dom_info->name, attr->name, SCMI_MAX_STR_SIZE);
+   strlcpy(dom_info->name, attr->name, SCMI_MAX_STR_SIZE);
}

scmi_xfer_put(handle, t);
diff --git a/drivers/firmware/arm_scmi/sensors.c b/drivers/firmware/arm_scmi/sensors.c
index 27f2092b9882..b53d5cc9c9f6 100644
--- a/drivers/firmware/arm_scmi/sensors.c
+++ b/drivers/firmware/arm_scmi/sensors.c
@@ -140,7 +140,7 @@ static int scmi_sensor_description_get(const struct scmi_handle *handle,
s = >sensors[desc_index + cnt];
s->id = le32_to_cpu(buf->desc[cnt].id);
s->type = SENSOR_TYPE(attrh);
-   memcpy(s->name, buf->desc[cnt].name, SCMI_MAX_STR_SIZE);
+   strlcpy(s->name, buf->desc[cnt].name, SCMI_MAX_STR_SIZE);
}

desc_index += num_returned;
--
2.7.4



Re: [PATCH 2/2] Add tests for memory.oom.group

2018-09-07 Thread Roman Gushchin
On Fri, Sep 07, 2018 at 09:49:24AM -0700, jgka...@fb.com wrote:
> From: Jay Kamat 
> 
> Add tests for memory.oom.group for the following cases:
> - Killing all processes in a leaf cgroup, but leaving the
>   parent untouched
> - Killing all processes in a parent and leaf cgroup
> - Keeping processes marked by OOM_SCORE_ADJ_MIN alive when considered
>   for being killed by the group oom killer.
> 
> Signed-off-by: Jay Kamat 

Acked-by: Roman Gushchin 



[PATCH 5/6] proc: commit to genradix

2018-09-07 Thread Kent Overstreet
The new generic radix trees have a simpler API and implementation, and
no limitation on the number of elements, so all flex_array users are
being converted.
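
The conversion below reduces to this usage pattern (sketch assembled
from the hunks that follow):

  GENRADIX(struct map_files_info) fa;         /* radix tree of that entry type */
  struct map_files_info *p;

  genradix_init(&fa);
  p = genradix_ptr_alloc(&fa, i, GFP_KERNEL); /* slot i, allocating as needed */
  if (!p)
          return -ENOMEM;
  p = genradix_ptr(&fa, i);                   /* slot i, NULL if never allocated */
  genradix_free(&fa);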

Signed-off-by: Kent Overstreet 
Cc: Al Viro 
---
 fs/proc/base.c | 43 +++
 1 file changed, 15 insertions(+), 28 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index aaffc0c302..e11fbb390a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -59,6 +59,7 @@
 #include 
 #include 
 #include 
+#include <linux/generic-radix-tree.h>
 #include 
 #include 
 #include 
@@ -92,7 +93,6 @@
 #include 
 #include 
 #include 
-#include <linux/flex_array.h>
 #include 
 #include 
 #include "internal.h"
@@ -2128,11 +2128,12 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
struct task_struct *task;
struct mm_struct *mm;
unsigned long nr_files, pos, i;
-   struct flex_array *fa = NULL;
-   struct map_files_info info;
+   GENRADIX(struct map_files_info) fa;
struct map_files_info *p;
int ret;
 
+   genradix_init(&fa);
+
ret = -ENOENT;
task = get_proc_task(file_inode(file));
if (!task)
@@ -2164,35 +2165,22 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
 */
 
for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
-   if (vma->vm_file && ++pos > ctx->pos)
-   nr_files++;
-   }
+   if (!vma->vm_file)
+   continue;
+   if (++pos <= ctx->pos)
+   continue;
 
-   if (nr_files) {
-   fa = flex_array_alloc(sizeof(info), nr_files,
-   GFP_KERNEL);
-   if (!fa || flex_array_prealloc(fa, 0, nr_files,
-   GFP_KERNEL)) {
+   p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL);
+   if (!p) {
ret = -ENOMEM;
-   if (fa)
-   flex_array_free(fa);
up_read(&mm->mmap_sem);
mmput(mm);
goto out_put_task;
}
-   for (i = 0, vma = mm->mmap, pos = 2; vma;
-   vma = vma->vm_next) {
-   if (!vma->vm_file)
-   continue;
-   if (++pos <= ctx->pos)
-   continue;
 
-   info.start = vma->vm_start;
-   info.end = vma->vm_end;
-   info.mode = vma->vm_file->f_mode;
-   if (flex_array_put(fa, i++, &info, GFP_KERNEL))
-   BUG();
-   }
+   p->start = vma->vm_start;
+   p->end = vma->vm_end;
+   p->mode = vma->vm_file->f_mode;
}
up_read(&mm->mmap_sem);
mmput(mm);
@@ -2201,7 +2189,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */
unsigned int len;
 
-   p = flex_array_get(fa, i);
+   p = genradix_ptr(&fa, i);
len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end);
if (!proc_fill_cache(file, ctx,
  buf, len,
@@ -2211,12 +2199,11 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx)
break;
ctx->pos++;
}
-   if (fa)
-   flex_array_free(fa);
 
 out_put_task:
put_task_struct(task);
 out:
+   genradix_free(&fa);
return ret;
 }
 
-- 
2.19.0.rc2



Re: [PATCH 4.4 105/124] xen-netfront: wait xenbus state change when load module manually

2018-09-07 Thread Boris Ostrovsky
On 09/06/2018 08:21 PM, Ben Hutchings wrote:
> On Sat, 2018-08-04 at 11:01 +0200, Greg Kroah-Hartman wrote:
>> 4.4-stable review patch.  If anyone has any objections, please let me know.
>>
>> --
>>
>> From: Xiao Liang 
>>
>> [ Upstream commit 822fb18a82abaf4ee7058793d95d340f5dab7bfc ]
>>
>> When loading the module manually, after calling xenbus_switch_state to
>> initialize the state of the netfront device, the driver state may not change
>> fast enough, which can lead to no dev being created on recent kernels. This
>> patch adds a wait to make sure xenbus knows the driver is not in the
>> closed/unknown state.
> [...]
>> --- a/drivers/net/xen-netfront.c
>> +++ b/drivers/net/xen-netfront.c
>> @@ -86,6 +86,7 @@ struct netfront_cb {
>>  /* IRQ name is queue name with "-tx" or "-rx" appended */
>>  #define IRQ_NAME_SIZE (QUEUE_NAME_SIZE + 3)
>>  
>> +static DECLARE_WAIT_QUEUE_HEAD(module_load_q);
>>  static DECLARE_WAIT_QUEUE_HEAD(module_unload_q);
>>  
>>  struct netfront_stats {
>> @@ -1335,6 +1336,11 @@ static struct net_device *xennet_create_
>>  netif_carrier_off(netdev);
>>  
>>  xenbus_switch_state(dev, XenbusStateInitialising);
>> +wait_event(module_load_q,
>> +   xenbus_read_driver_state(dev->otherend) !=
>> +   XenbusStateClosed &&
>> +   xenbus_read_driver_state(dev->otherend) !=
>> +   XenbusStateUnknown);
>>  return netdev;
>>  
>>   exit:
> This won't work; it will hang.  Something (maybe netback_changed()?)
> needs to wake up tasks on the module_load_q.


https://lkml.org/lkml/2018/9/7/691


-boris
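
(The referenced fix adds a wake-up along these lines -- sketch only,
assuming it lands in the otherend state-change handler; see the link
above for the real patch:)

  static void netback_changed(struct xenbus_device *dev,
                              enum xenbus_state backend_state)
  {
          /* ... existing state handling ... */
          wake_up_all(&module_load_q);
  }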


Re: [PATCH v2 00/32] Device Tree Updates for GTA04 (A3/A4/A5 variants)

2018-09-07 Thread Tony Lindgren
* Tony Lindgren  [180829 19:40]:
> * H. Nikolaus Schaller  [180829 07:24]:
> > Hi OMAP3 DTS Maintainers,
> > is there any progress in merging this patch series?
> 
> Looks good to me in general, I'll be getting into applying
> patches for v4.20 in few days.

Applying all into omap-for-v4.20/dt finally thanks.

Regards,

Tony


Re: Regression in next with filesystem context concept

2018-09-07 Thread Andreas Kemnade
Hi,

On Fri, 7 Sep 2018 09:10:23 -0700
Tony Lindgren  wrote:

> * David Howells  [180907 08:51]:
> > Tony Lindgren  wrote:
> >   
> > > Looks like next-20180906 now has a regression where mounting
> > > root won't work with commit fd0002870b45 ("vfs: Implement a
> > > filesystem superblock creation/configuration context").  
> > 
> > Am I right in thinking you're not using any of the LSMs?  
> 
> Assuming LSM as in Documentation/lsm.txt, right not using any.
> 
> BTW, I don't think this issue shows up with ramdisk either,
> so that's probably why for example kernelci.org does not
> show errors.
> 

I have also similar experience with my automated tests (automated
alarming does not work yet ;-)), I am still in the beginning.
I do there a ramdisk boot to create an overlay mount with the fresh
modules on top of an ordinary rootfs. initramfs mount is
ok, but the microsd card fails.

Testing from a ramdisk I get:
/ # ls -l /dev/mmcblk0p2 
brw---1 00 179,   2 Jan  1  1970 /dev/mmcblk0p2
/ # mount /dev/mmcblk0p2 /mnt/
[  682.819061] Filesystem requires source device
[  682.825103] Filesystem requires source device
[  682.830810] Filesystem requires source device
[  682.836303] Filesystem requires source device
[  682.843078] Filesystem requires source device
[  682.847991] Filesystem requires source device
[  682.853149] Filesystem requires source device
mount: mounting /dev/mmcblk0p2 on /mnt/ failed: No such file or directory

64GB microSD at omap_hsmmc correctly recognized.
Last known successful boot: next-20180830

So you are not alone with such problems.
I will investigate further.

Regards,
Andreas




[RESEND PATCH] mm: percpu: remove unnecessary unlikely()

2018-09-07 Thread Igor Stoppa
WARN_ON() already contains an unlikely(), so it's not necessary to
wrap it into another.
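
For reference, the generic definition -- lightly simplified from
include/asm-generic/bug.h -- shows the hint is already built in:

  #define WARN_ON(condition) ({                   \
          int __ret_warn_on = !!(condition);      \
          if (unlikely(__ret_warn_on))            \
                  __WARN();                       \
          unlikely(__ret_warn_on);                \
  })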

Signed-off-by: Igor Stoppa 
Acked-by: Dennis Zhou 
Cc: Tejun Heo 
Cc: zijun_hu 
Cc: Christoph Lameter 
Cc: linux...@kvack.org
Cc: linux-kernel@vger.kernel.org
---
 mm/percpu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/percpu.c b/mm/percpu.c
index a749d4d96e3e..f5c2796fe63e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -2588,7 +2588,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
BUG_ON(ai->nr_groups != 1);
upa = ai->alloc_size/ai->unit_size;
nr_g0_units = roundup(num_possible_cpus(), upa);
-   if (unlikely(WARN_ON(ai->groups[0].nr_units != nr_g0_units))) {
+   if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
pcpu_free_alloc_info(ai);
return -EINVAL;
}
-- 
2.17.1



Re: [PATCH v2 2/9] nios2: build .dtb files in dts directory

2018-09-07 Thread Rob Herring
On Thu, Sep 6, 2018 at 9:21 PM Ley Foon Tan  wrote:
>
> On Wed, 2018-09-05 at 18:53 -0500, Rob Herring wrote:
> > Align nios2 with other architectures which build the dtb files in the
> > same directory as the dts files. This is also in line with most other
> > build targets which are located in the same directory as the source.
> > This move will help enable the 'dtbs' target which builds all the
> > dtbs
> > regardless of kernel config.
> >
> > This transition could break some scripts if they expect dtb files in
> > the old location.
> >
> > Cc: Ley Foon Tan 
> > Cc: nios2-...@lists.rocketboards.org
> > Signed-off-by: Rob Herring 
> > ---
> > Please ack so I can take the whole series via the DT tree.
> >
> >  arch/nios2/Makefile  | 4 ++--
> >  arch/nios2/boot/Makefile | 4 
> >  arch/nios2/boot/dts/Makefile | 1 +
> >  3 files changed, 3 insertions(+), 6 deletions(-)
> >  create mode 100644 arch/nios2/boot/dts/Makefile
> >
> > diff --git a/arch/nios2/Makefile b/arch/nios2/Makefile
> > index 8673a79dca9c..50eece1c6adb 100644
> > --- a/arch/nios2/Makefile
> > +++ b/arch/nios2/Makefile
> > @@ -59,10 +59,10 @@ archclean:
> > $(Q)$(MAKE) $(clean)=$(nios2-boot)
> >
> >  %.dtb: | scripts
> > -   $(Q)$(MAKE) $(build)=$(nios2-boot) $(nios2-boot)/$@
> > +   $(Q)$(MAKE) $(build)=$(nios2-boot)/dts $(nios2-boot)/dts/$@
> >
> >  dtbs:
> > -   $(Q)$(MAKE) $(build)=$(nios2-boot) $(nios2-boot)/$@
> > +   $(Q)$(MAKE) $(build)=$(nios2-boot)/dts
> >
> >  $(BOOT_TARGETS): vmlinux
> > $(Q)$(MAKE) $(build)=$(nios2-boot) $(nios2-boot)/$@
> > diff --git a/arch/nios2/boot/Makefile b/arch/nios2/boot/Makefile
> > index 2ba23a679732..007586094dde 100644
> > --- a/arch/nios2/boot/Makefile
> > +++ b/arch/nios2/boot/Makefile
> > @@ -47,10 +47,6 @@ obj-$(CONFIG_NIOS2_DTB_SOURCE_BOOL) +=
> > linked_dtb.o
> >
> >  targets += $(dtb-y)
> >
> > -# Rule to build device tree blobs with make command
> > -$(obj)/%.dtb: $(src)/dts/%.dts FORCE
> > -   $(call if_changed_dep,dtc)
> > -
> >  $(obj)/dtbs: $(addprefix $(obj)/, $(dtb-y))
> >
> >  install:
> > diff --git a/arch/nios2/boot/dts/Makefile b/arch/nios2/boot/dts/Makefile
> > new file mode 100644
> > index ..f66554cd5c45
> > --- /dev/null
> > +++ b/arch/nios2/boot/dts/Makefile
> > @@ -0,0 +1 @@
> > +# SPDX-License-Identifier: GPL-2.0
> > --
> > 2.17.1
> >
> Hi Rob
>
> I have synced your all-dtbs branch from here: https://git.kernel.org/pu
> b/scm/linux/kernel/git/robh/linux.git/log/?h=all-dtbs
>
> It shows an error when compiling the kernel image and also when running
> "make dtbs_install".

Can you fetch the branch again and try it? I fixed a few dependency issues.

> make dtbs_install
> make[1]: *** No rule to make target
> 'arch/nios2/boot/dts/arch/nios2/boot/dts/10m50_devboard.dtb', needed by
> 'arch/nios2/boot/dts/arch/nios2/boot/dts/10m50_devboard.dtb.S'.  Stop.

What is the value of CONFIG_NIOS2_DTB_SOURCE? As patch 3 notes, it now
should not have any path.

If that's a problem, I could take the basename to strip the path, but
then sub directories wouldn't work either.

BTW, next up, I want to consolidate the config variables for built-in dtbs.

Rob


Re: Conflict between sparse and commit cafa0010cd51f ("Raise the minimum required gcc version to 4.6")

2018-09-07 Thread Nick Desaulniers
On Fri, Sep 7, 2018 at 11:13 AM Luc Van Oostenryck
 wrote:
>
> On Fri, Sep 07, 2018 at 10:22:56AM -0700, Nick Desaulniers wrote:
> > On Fri, Sep 7, 2018 at 7:34 AM Christophe LEROY  
> > wrote:
> > >
> > > Cc linux-spa...@vger.kernel.org
> > >
> > > On 07/09/2018 at 14:22, Christophe Leroy wrote:
> > > > Since commit cafa0010cd51f ("Raise the minimum required gcc version to
> > > > 4.6"), sparse check fails as follows:
> > > >
> > > > [root@pc16082vm linux-powerpc]# make C=2 arch/powerpc/kernel/process.o
> > > >CALLscripts/checksyscalls.sh
> > > >CHECK   scripts/mod/empty.c
> > > > ./include/linux/compiler-gcc.h:14:3: error: Sorry, your compiler is too
> > > > old - please upgrade it.
> > > >CHECK   arch/powerpc/kernel/process.c
> > > > ./include/linux/compiler-gcc.h:14:3: error: Sorry, your compiler is too
> > > > old - please upgrade it.
> > > >
> > > >
> > > > I have sparse version 0.5.2
> > > >
> > > > What can be done to fix that ?
> > > >
> > > > Christophe
> >
> > Oof, sorry Christophe.  Looks like that's the latest version of sparse:
> > https://sparse.wiki.kernel.org/index.php/Main_Page#News
> >
> > I'm curious what sparse expands __GNUC__, __GNUC_MINOR__, and
> > __GNUC_PATCHLEVEL__ to?  Pre commit cafa0010cd51f, it MUST be
> > expanding them to something, otherwise you'd have seen the error then,
> > too.  The previous check was GCC < 3.3, now it's GCC < 4.6.
>
> Sparse expands these macros to the same version as the compiler used
> to compile sparse itself. I find it a bit strange though to have sparse
> v0.5.2 built with such an old compiler.

So Christophe must have a version of gcc < 4.6 installed somewhere?
Does sparse use `cc`? If so, Christophe, does your `ls -l $(which cc)`
point to an old version of gcc maybe?
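
(For reference, a trivial probe of what a given compiler -- and hence a
sparse binary built with it -- reports; sketch, compile and run with the
compiler in question:)

  /* gcc_version.c: print the version macros the compiler defines */
  #include <stdio.h>

  int main(void)
  {
          printf("%d.%d.%d\n", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__);
          return 0;
  }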

>
> Also, it's worth to look at what is said in this email:
>   
> https://lore.kernel.org/lkml/ca+55afzyenzr2gzlr-dwponjmnygyody+6awlcvnaywiazu...@mail.gmail.com/
>
>
> -- Luc



-- 
Thanks,
~Nick Desaulniers


Re: [PATCH 1/2] Fix cg_read_strcmp()

2018-09-07 Thread Jay Kamat


Shuah Khan writes:

> On 09/07/2018 10:49 AM, jgka...@fb.com wrote:
>> From: Jay Kamat 
>>
>> Fix a couple issues with cg_read_strcmp(), to improve correctness of
>> cgroup tests
>> - Fix cg_read_strcmp() always returning 0 for empty "needle" strings
>> - Fix a memory leak in cg_read_strcmp()
>>
>> Fixes: 84092dbcf901 ("selftests: cgroup: add memory controller self-tests")
>>
>> Signed-off-by: Jay Kamat 
>> ---
>>  tools/testing/selftests/cgroup/cgroup_util.c | 17 ++---
>>  1 file changed, 14 insertions(+), 3 deletions(-)
>>
>> diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c
>> index 1e9e3c470561..8b644ea39725 100644
>> --- a/tools/testing/selftests/cgroup/cgroup_util.c
>> +++ b/tools/testing/selftests/cgroup/cgroup_util.c
>> @@ -89,17 +89,28 @@ int cg_read(const char *cgroup, const char *control, char *buf, size_t len)
>>  int cg_read_strcmp(const char *cgroup, const char *control,
>> const char *expected)
>>  {
>> -size_t size = strlen(expected) + 1;
>> +size_t size;
>>  char *buf;
>> +int ret;
>> +
>> +/* Handle the case of comparing against empty string */
>> +if (!expected)
>> +size = 32;
>
> This doesn't look right. I would think expected shouldn't be null?
> It gets used below.
>
>> +else
>> +size = strlen(expected) + 1;
>>
>>  buf = malloc(size);
>>  if (!buf)
>>  return -1;
>>
>> -if (cg_read(cgroup, control, buf, size))
>> +if (cg_read(cgroup, control, buf, size)) {
>> +free(buf);
>>  return -1;
>> +}
>>
>> -return strcmp(expected, buf);
>> +ret = strcmp(expected, buf);
>
> If expected is null, what's the point in running the test?
> Is an empty "needle" string a valid test scenario?

There are a couple places where an empty "needle" string is used currently:

- cg_test_proc_killed (newly added in the next patch): Verify cgroup.procs is
  empty (there are no processes running)
- test_memcg_oom_events: Verify cgroup.procs is empty

Previously, when passing in an empty needle string, this function would always
return 0, as the size allocated (1) would not be enough to read any data in
'cg_read', and strcmp would compare two empty strings.

>
>> +free(buf);
>> +return ret;
>>  }
>>
>>  int cg_read_strstr(const char *cgroup, const char *control, const char 
>> *needle)
>>
>
> thanks,
> -- Shuah

I could definitely remove the unneeded strcmp in the null 'expected' case, but
I am worried it would feel a bit too hacky or add too much duplication.

Would something like this be the best solution? If you had something else in
mind (or if I'm misunderstanding something), please let me know, and I'll
update the patchset!

size_t size;
char *buf;
int ret;

/* Handle the case of comparing against empty string */
if (!expected)
size = 32;
else
size = strlen(expected) + 1;

buf = malloc(size);
if (!buf)
return -1;

if (cg_read(cgroup, control, buf, size)) {
free(buf);
return -1;
}

if (!expected)
ret = buf[0] ? 1 : 0;
else
ret = strcmp(expected, buf);
free(buf);
return ret;

Thanks,
-Jay


Re: Conflict between sparse and commit cafa0010cd51f ("Raise the minimum required gcc version to 4.6")

2018-09-07 Thread Luc Van Oostenryck
On Fri, Sep 07, 2018 at 11:19:43AM -0700, Nick Desaulniers wrote:
> On Fri, Sep 7, 2018 at 11:13 AM Luc Van Oostenryck wrote:
> >
> > Sparse expand these macros to the same version than the compiler used
> > to compile GCC. I find a bit strange though to have sparse v0.5.2 but
> > using an old compiler.
> 
> So Christophe must have a version of gcc < 4.6 installed somewhere?

It looks so.

> Does sparse use `cc`?

By default sparse use gcc (this can be overriden by using CC=...).

-- Luc


[PATCH v2] optee: allow to work without static shared memory

2018-09-07 Thread Volodymyr Babchuk
From: Volodymyr Babchuk 

On virtualized systems it is possible that OP-TEE will provide
only dynamic shared memory support. So it is fine to boot
without static SHM enabled if a dynamic one is supported.

Signed-off-by: Volodymyr Babchuk 
---

 Changes from v1:
   Patch can now be applied to the vanilla kernel instead of
   linaro's op-tee branch

 drivers/tee/optee/core.c | 80 +---
 1 file changed, 49 insertions(+), 31 deletions(-)

diff --git a/drivers/tee/optee/core.c b/drivers/tee/optee/core.c
index e1aafe8..efd2e5c 100644
--- a/drivers/tee/optee/core.c
+++ b/drivers/tee/optee/core.c
@@ -419,9 +419,35 @@ static bool optee_msg_exchange_capabilities(optee_invoke_fn *invoke_fn,
return true;
 }
 
+static struct tee_shm_pool *optee_config_dyn_shm(void)
+{
+   struct tee_shm_pool_mgr *priv_mgr;
+   struct tee_shm_pool_mgr *dmabuf_mgr;
+   void *rc;
+
+   rc = optee_shm_pool_alloc_pages();
+   if (IS_ERR(rc))
+   return rc;
+   priv_mgr = rc;
+
+   rc = optee_shm_pool_alloc_pages();
+   if (IS_ERR(rc)) {
+   tee_shm_pool_mgr_destroy(priv_mgr);
+   return rc;
+   }
+   dmabuf_mgr = rc;
+
+   rc = tee_shm_pool_alloc(priv_mgr, dmabuf_mgr);
+   if (IS_ERR(rc)) {
+   tee_shm_pool_mgr_destroy(priv_mgr);
+   tee_shm_pool_mgr_destroy(dmabuf_mgr);
+   }
+
+   return rc;
+}
+
 static struct tee_shm_pool *
-optee_config_shm_memremap(optee_invoke_fn *invoke_fn, void **memremaped_shm,
- u32 sec_caps)
+optee_config_shm_memremap(optee_invoke_fn *invoke_fn, void **memremaped_shm)
 {
union {
struct arm_smccc_res smccc;
@@ -436,10 +436,11 @@ optee_config_shm_memremap(optee_invoke_fn *invoke_fn, void **memremaped_shm,
struct tee_shm_pool_mgr *priv_mgr;
struct tee_shm_pool_mgr *dmabuf_mgr;
void *rc;
+   const int sz = OPTEE_SHM_NUM_PRIV_PAGES * PAGE_SIZE;
 
invoke_fn(OPTEE_SMC_GET_SHM_CONFIG, 0, 0, 0, 0, 0, 0, 0, &res);
if (res.result.status != OPTEE_SMC_RETURN_OK) {
-   pr_info("shm service not available\n");
+   pr_err("static shm service not available\n");
return ERR_PTR(-ENOENT);
}
 
@@ -465,28 +492,15 @@ optee_config_shm_memremap(optee_invoke_fn *invoke_fn, void **memremaped_shm,
}
vaddr = (unsigned long)va;
 
-   /*
-* If OP-TEE can work with unregistered SHM, we will use own pool
-* for private shm
-*/
-   if (sec_caps & OPTEE_SMC_SEC_CAP_DYNAMIC_SHM) {
-   rc = optee_shm_pool_alloc_pages();
-   if (IS_ERR(rc))
-   goto err_memunmap;
-   priv_mgr = rc;
-   } else {
-   const size_t sz = OPTEE_SHM_NUM_PRIV_PAGES * PAGE_SIZE;
-
-   rc = tee_shm_pool_mgr_alloc_res_mem(vaddr, paddr, sz,
-   3 /* 8 bytes aligned */);
-   if (IS_ERR(rc))
-   goto err_memunmap;
-   priv_mgr = rc;
-
-   vaddr += sz;
-   paddr += sz;
-   size -= sz;
-   }
+   rc = tee_shm_pool_mgr_alloc_res_mem(vaddr, paddr, sz,
+   3 /* 8 bytes aligned */);
+   if (IS_ERR(rc))
+   goto err_memunmap;
+   priv_mgr = rc;
+
+   vaddr += sz;
+   paddr += sz;
+   size -= sz;
 
rc = tee_shm_pool_mgr_alloc_res_mem(vaddr, paddr, size, PAGE_SHIFT);
if (IS_ERR(rc))
@@ -552,7 +566,7 @@ static optee_invoke_fn *get_invoke_func(struct device_node *np)
 static struct optee *optee_probe(struct device_node *np)
 {
optee_invoke_fn *invoke_fn;
-   struct tee_shm_pool *pool;
+   struct tee_shm_pool *pool = ERR_PTR(-EINVAL);
struct optee *optee = NULL;
void *memremaped_shm = NULL;
struct tee_device *teedev;
@@ -581,13 +595,17 @@ static struct optee *optee_probe(struct device_node *np)
}
 
/*
-* We have no other option for shared memory, if secure world
-* doesn't have any reserved memory we can use we can't continue.
+* Try to use dynamic shared memory if possible
 */
-   if (!(sec_caps & OPTEE_SMC_SEC_CAP_HAVE_RESERVED_SHM))
-   return ERR_PTR(-EINVAL);
+   if (sec_caps & OPTEE_SMC_SEC_CAP_DYNAMIC_SHM)
+   pool = optee_config_dyn_shm();
+
+   /*
+* If dynamic shared memory is not available or failed - try static one
+*/
+   if (IS_ERR(pool) && (sec_caps & OPTEE_SMC_SEC_CAP_HAVE_RESERVED_SHM))
+   pool = optee_config_shm_memremap(invoke_fn, &memremaped_shm);
 
-   pool = optee_config_shm_memremap(invoke_fn, &memremaped_shm, sec_caps);
if (IS_ERR(pool))
return (void *)pool;
 
-- 
2.7.4



[GIT PULL] MD update for 4.19-rc2

2018-09-07 Thread Shaohua Li
Hi,
Please pull MD fixes for 4.19-rc2:
- Fix a locking issue for md-cluster from Guoqing
- Fix a sync crash for raid10 from Ni
- Fix a reshape bug with raid5 cache enabled from me

Thanks,
Shaohua

The following changes since commit 420f51f4ab6bce6e580390729fadb89c31123636:

  Merge tag 'arm64-fixes' of 
git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux (2018-08-31 09:20:30 
-0700)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/shli/md.git tags/md/4.19-rc2

for you to fetch changes up to 41a95041126522a921fb73df22cbdd520dfdebad:

  md-cluster: release RESYNC lock after the last resync message (2018-08-31 
17:38:10 -0700)


Guoqing Jiang (1):
  md-cluster: release RESYNC lock after the last resync message

Shaohua Li (1):
  md/raid5-cache: disable reshape completely

Xiao Ni (1):
  RAID10 BUG_ON in raise_barrier when force is true and conf->barrier is 0

 drivers/md/md-cluster.c | 10 +-
 drivers/md/raid10.c |  5 -
 drivers/md/raid5-log.h  |  5 +
 drivers/md/raid5.c  |  6 +++---
 4 files changed, 17 insertions(+), 9 deletions(-)


Re: [PATCH] mtd: rawnand: denali: add DT property to specify skipped bytes in OOB

2018-09-07 Thread Masahiro Yamada
Hi Boris,

2018-09-07 23:53 GMT+09:00 Boris Brezillon :
> On Fri, 7 Sep 2018 23:42:53 +0900
> Masahiro Yamada  wrote:
>
>> Hi Boris,
>>
>> 2018-09-07 23:08 GMT+09:00 Boris Brezillon :
>> > Hi Masahiro,
>> >
>> > On Fri,  7 Sep 2018 19:56:23 +0900
>> > Masahiro Yamada  wrote:
>> >
>> >> NAND devices need additional data area (OOB) for error correction,
>> >> but it is also used for Bad Block Marker (BBM).  In many cases, the
>> >> first byte in OOB is used for BBM, but the location actually depends
>> >> on chip vendors.  The NAND controller should preserve the precious
>> >> BBM to keep track of bad blocks.
>> >>
>> >> In Denali IP, the SPARE_AREA_SKIP_BYTES register is used to specify
>> >> the number of bytes to skip from the start of OOB.  The ECC engine
>> >> will automatically skip the specified number of bytes when it gets
>> >> access to OOB area.
>> >>
>> >> The same value for SPARE_AREA_SKIP_BYTES should be used between
>> >> firmware and the operating system if you intend to use the NAND
>> >> device across the control hand-off.
>> >>
>> >> In fact, the current denali.c code expects firmware to have already
>> >> set the SPARE_AREA_SKIP_BYTES register, then reads the value out.
>> >>
>> >> If no firmware (or bootloader) has initialized the controller, the
>> >> register value is zero, which is the default after power-on-reset.
>> >>
>> >> In other words, the Linux driver cannot initialize the controller
>> >> by itself.  You cannot support the reset control either because
>> >> resetting the controller will get register values lost.
>> >>
>> >> This commit adds a way to specify it via DT.  If the property
>> >> "denali,oob-skip-bytes" exists, the value will be set to the register.
>> >
>> > Hm, do we really need to make this config customizable? I mean, either
>> > you have a large-page NAND (page > 512 bytes) and the 2 first bytes
>> > must be reserved for the BBM or you have a small-page NAND and the BBM
>> > is at position 4 and 5. Are you sure people configure that differently?
>> > Don't you always have SPARE_AREA_SKIP_BYTES set to 6 or 2?
>>
>>
>> As I said in the patch description,
>> I need to use the same SPARE_AREA_SKIP_BYTES value
>> across firmware, boot-loader, Linux, and whatever.
>>
>> I want to set the value to 8 for my platform
>> because the on-chip boot ROM expects 8.
>> I cannot change it since the boot ROM is hard-wired.
>>
>>
>> The boot ROM skips 8 bytes in OOB
>> when it loads images from the on-board NAND device.
>>
>> So, when I update the image from U-Boot or Linux,
>> I need to make sure to set the register to 8.
>>
>> If I update the image with a different value,
>> the Boot ROM fails to boot.
>>
>>
>>
>> When the system has booted from NAND,
>> the register is already set to 8.  It works.
>>
>> However, when the system has booted from eMMC,
>> the register is not initialized by anyone.
>> I am searching for a way to set the register to 8
>> in this case.
>>
>>
>> The boot ROM in SOCFPGA might expect a different value,
>> I am not sure.
>
> Okay, then why not having a per-compatible value if it's related to the
> BootROM? Unless the BootROM is part of the FPGA and can be
> reprogrammed.

FPGA is unrelated here.

Neither the boot ROM nor the Denali core is re-programmable.



I hesitate to associate the number of skipped bytes
with the compatible string because it is not a parameter
of the Denali IP.


Rather, it is a matter of "how we use the OOB",
so I want to leave room for customization like nand-ecc-strength etc.
even if the boot ROM happens to expect a particular value.


If you prefer a per-compatible value, I can do that,
but I believe the NAND core and the boot ROM are orthogonal.



> I'd really prefer not having a generic property that
> allows you to put anything you want.






-- 
Best Regards
Masahiro Yamada


Re: Regression in next with filesystem context concept

2018-09-07 Thread Tony Lindgren
* David Howells  [180907 08:51]:
> Tony Lindgren  wrote:
> 
> > Looks like next-20180906 now has a regression where mounting
> > root won't work with commit fd0002870b45 ("vfs: Implement a
> > filesystem superblock creation/configuration context").
> 
> Am I right in thinking you're not using any of the LSMs?

Assuming LSM as in Documentation/lsm.txt, right not using any.

BTW, I don't think this issue shows up with ramdisk either,
so that's probably why for example kernelci.org does not
show errors.

Regards,

Tony


[PATCH] cpufreq: remove unnecessary unlikely()

2018-09-07 Thread Igor Stoppa
WARN_ON() already contains an unlikely(), so it's not necessary to wrap it
into another.

Signed-off-by: Igor Stoppa 
Cc: Srivatsa S. Bhat 
Cc: "Rafael J. Wysocki" 
Cc: linux...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/cpufreq/cpufreq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index f53fb41efb7b..7aa3dcad2175 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -403,7 +403,7 @@ EXPORT_SYMBOL_GPL(cpufreq_freq_transition_begin);
 void cpufreq_freq_transition_end(struct cpufreq_policy *policy,
struct cpufreq_freqs *freqs, int transition_failed)
 {
-   if (unlikely(WARN_ON(!policy->transition_ongoing)))
+   if (WARN_ON(!policy->transition_ongoing))
return;
 
cpufreq_notify_post_transition(policy, freqs, transition_failed);
-- 
2.17.1



Re: [PATCH v2 3/3] x86/pti/64: Remove the SYSCALL64 entry trampoline

2018-09-07 Thread Josh Poimboeuf
On Mon, Sep 03, 2018 at 03:59:44PM -0700, Andy Lutomirski wrote:
> The SYSCALL64 trampoline has a couple of nice properties:
> 
>  - The usual sequence of SWAPGS followed by two GS-relative accesses to
>set up RSP is somewhat slow because the GS-relative accesses need
>to wait for SWAPGS to finish.  The trampoline approach allows
>RIP-relative accesses to set up RSP, which avoids the stall.
> 
>  - The trampoline avoids any percpu access before CR3 is set up,
>which means that no percpu memory needs to be mapped in the user
>page tables.  This prevents using Meltdown to read any percpu memory
>outside the cpu_entry_area and prevents using timing leaks
>to directly locate the percpu areas.
> 
> The downsides of using a trampoline may outweigh the upsides, however.
> It adds an extra non-contiguous I$ cache line to system calls, and it
> forces an indirect jump to transfer control back to the normal kernel
> text after CR3 is set up.  The latter is because x86 lacks a 64-bit
> direct jump instruction that could jump from the trampoline to the entry
> text.  With retpolines enabled, the indirect jump is extremely slow.
> 
> This patch changes the code to map the percpu TSS into the user page
> tables to allow the non-trampoline SYSCALL64 path to work under PTI.
> This does not add a new direct information leak, since the TSS is
> readable by Meltdown from the cpu_entry_area alias regardless.  It
> does allow a timing attack to locate the percpu area, but KASLR is
> more or less a lost cause against local attack on CPUs vulnerable to
> Meltdown regardless.  As far as I'm concerned, on current hardware,
> KASLR is only useful to mitigate remote attacks that try to attack
> the kernel without first gaining RCE against a vulnerable user
> process.
> 
> On Skylake, with CONFIG_RETPOLINE=y and KPTI on, this reduces
> syscall overhead from ~237ns to ~228ns.
> 
> There is a possible alternative approach: we could instead move the
> trampoline within 2G of the entry text and make a separate copy for
> each CPU.  Then we could use a direct jump to rejoin the normal
> entry path.
> 
> Signed-off-by: Andy Lutomirski 

The following commit should also be reverted:

  4d99e4136580 ("perf machine: Workaround missing maps for x86 PTI entry 
trampolines")

-- 
Josh


[REGRESSION] Commit 5745392e0c2b ("PCI: Apply the new generic I/O management on PCI IO hosts") breaks PCI for legacy virtio devices with kvmtool on arm64

2018-09-07 Thread Will Deacon
Hi all,

I'm seeing a regression in Linux guests since 4.17 under kvmtool, where
legacy virtio devices using the PCI transport fail to probe. Legacy virtio
PCI devices must be accessed via "I/O space" (e.g. BAR0, which is
IORESOURCE_IO) and kvmtool assigns this to the guest physical range
0x0 - 0x10000.

On arm64, when the virtio legacy PCI driver calls pci_iomap() for this BAR,
it expands to ioport_map():

  static inline void __iomem *ioport_map(unsigned long port, unsigned int nr)
  {
return PCI_IOBASE + (port & MMIO_UPPER_LIMIT);
  }

Since the indirect PIO changes, MMIO_UPPER_LIMIT is defined as:

  /*
   * We reserve 0x4000 bytes for Indirect IO as so far this library is only
   * used by the HiSilicon LPC Host. If needed, we can reserve a wider IO
   * area by redefining the macro below.
   */
  #define PIO_INDIRECT_SIZE 0x4000
  #define MMIO_UPPER_LIMIT (IO_SPACE_LIMIT - PIO_INDIRECT_SIZE)

which corrupts the BAR address. For example, kvmtool has the BAR pointing
at 0x6200 on my system, but pci_iomap() actually maps offset 0x2200.
Changing PIO_INDIRECT_SIZE to 0 gets things working again.

Since this stuff doesn't revert nicely, I'm not sure how to proceed. Any
thoughts? Generally, having a per-platform magic constant hardcoded in
the PCI mapping code makes me feel slightly ill...

Cheers,

Will


Re: [PATCH 8/9] psi: pressure stall information for CPU, memory, and IO

2018-09-07 Thread Johannes Weiner
On Fri, Sep 07, 2018 at 04:58:58PM +0200, Peter Zijlstra wrote:
> On Fri, Sep 07, 2018 at 10:44:22AM -0400, Johannes Weiner wrote:
> 
> > > This does the whole seqcount thing 6x, which is a bit of a waste.
> > 
> > [...]
> > 
> > > It's a bit cumbersome, but that's because of C.
> > 
> > I was actually debating exactly this with Suren before, but since this
> > is a super cold path I went with readability. I was also thinking that
> > restarts could happen quite regularly under heavy scheduler load, and
> > so keeping the individual retry sections small could be helpful - but
> > I didn't instrument this in any way.
> 
> I was hoping going over the whole thing once would reduce the time we
> need to keep that line in shared mode and reduce traffic. And yes, this
> path is cold, but I was thinking about reducing the interference on the
> remote CPU.
> 
> Alternatively, we memcpy the whole line under the seqlock and then do
> everything later.
> 
> Also, this only has a single cpu_clock() invocation.

Good points.

How about the below? It's still pretty readable, and generates compact
code inside the now single retry section:

81ed464f:   44 89 ff                mov    %r15d,%edi
81ed4652:   e8 00 00 00 00          callq  81ed4657
                        81ed4653: R_X86_64_PLT32        sched_clock_cpu-0x4
        memcpy(times, groupc->times, sizeof(groupc->times));
81ed4657:   49 8b 14 24             mov    (%r12),%rdx
        state_start = groupc->state_start;
81ed465b:   48 8b 4b 50             mov    0x50(%rbx),%rcx
        memcpy(times, groupc->times, sizeof(groupc->times));
81ed465f:   48 89 54 24 30          mov    %rdx,0x30(%rsp)
81ed4664:   49 8b 54 24 08          mov    0x8(%r12),%rdx
81ed4669:   48 89 54 24 38          mov    %rdx,0x38(%rsp)
81ed466e:   49 8b 54 24 10          mov    0x10(%r12),%rdx
81ed4673:   48 89 54 24 40          mov    %rdx,0x40(%rsp)
        memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
81ed4678:   49 8b 55 00             mov    0x0(%r13),%rdx
81ed467c:   48 89 54 24 24          mov    %rdx,0x24(%rsp)
81ed4681:   41 8b 55 08             mov    0x8(%r13),%edx
81ed4685:   89 54 24 2c             mov    %edx,0x2c(%rsp)

---

diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 0f07749b60a4..595414599b98 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -197,17 +197,26 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
}
 }
 
-static u32 get_recent_time(struct psi_group *group, int cpu,
-  enum psi_states state)
+static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
 {
struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
+   unsigned int tasks[NR_PSI_TASK_COUNTS];
+   u64 now, state_start;
unsigned int seq;
-   u32 time, delta;
+   int s;
 
+   /* Snapshot a coherent view of the CPU state */
do {
seq = read_seqcount_begin(&groupc->seq);
+   now = cpu_clock(cpu);
+   memcpy(times, groupc->times, sizeof(groupc->times));
+   memcpy(tasks, groupc->tasks, sizeof(groupc->tasks));
+   state_start = groupc->state_start;
} while (read_seqcount_retry(&groupc->seq, seq));
 
-   time = groupc->times[state];
+   /* Calculate state time deltas against the previous snapshot */
+   for (s = 0; s < NR_PSI_STATES; s++) {
+   u32 delta;
/*
 * In addition to already concluded states, we also
 * incorporate currently active states on the CPU,
@@ -217,14 +226,14 @@ static u32 get_recent_time(struct psi_group *group, int cpu,
 * (u32) and our reported pressure close to what's
 * actually happening.
 */
-   if (test_state(groupc->tasks, state))
-   time += cpu_clock(cpu) - groupc->state_start;
-   } while (read_seqcount_retry(&groupc->seq, seq));
+   if (test_state(tasks, s))
+   times[s] += now - state_start;
 
-   delta = time - groupc->times_prev[state];
-   groupc->times_prev[state] = time;
+   delta = times[s] - groupc->times_prev[s];
+   groupc->times_prev[s] = times[s];
 
-   return delta;
+   times[s] = delta;
+   }
 }
 
 static void calc_avgs(unsigned long avg[3], int missed_periods,
@@ -267,18 +276,16 @@ static bool update_stats(struct psi_group *group)
 * loading, or even entirely idle CPUs.
 */
for_each_possible_cpu(cpu) {
+   u32 times[NR_PSI_STATES];
u32 nonidle;
 
-   nonidle = get_recent_time(group, cpu, PSI_NONIDLE);
-   nonidle = nsecs_to_jiffies(nonidle);
-

Re: [PATCH 3/6] selinux: convert to kvmalloc

2018-09-07 Thread Kent Overstreet
On Sat, Sep 08, 2018 at 02:08:03AM +0900, Tetsuo Handa wrote:
> On 2018/09/08 1:56, Kent Overstreet wrote:
> > @@ -329,8 +328,7 @@ int avtab_alloc(struct avtab *h, u32 nrules)
> > nslot = MAX_AVTAB_HASH_BUCKETS;
> > mask = nslot - 1;
> >  
> > -   h->htable = flex_array_alloc(sizeof(struct avtab_node *), nslot,
> > -GFP_KERNEL | __GFP_ZERO);
> > +   h->htable = kvmalloc_array(nslot, sizeof(void *), GFP_KERNEL);
> > if (!h->htable)
> > return -ENOMEM;
> >  
> 
> kvmalloc_array() does not imply __GFP_ZERO.

Thanks, fixed
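
(For the record, either zeroing form addresses Tetsuo's point -- a
sketch, assuming kvcalloc(), which is available as of v4.18:)

  h->htable = kvcalloc(nslot, sizeof(void *), GFP_KERNEL);
  /* or equivalently: */
  h->htable = kvmalloc_array(nslot, sizeof(void *), GFP_KERNEL | __GFP_ZERO);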


  1   2   3   4   5   6   7   8   9   10   >