date:20180709

Re: [PATCH 2/2] dt-bindings: arm: mediatek: cleanup MT7623N reference boards

2018-07-09 Thread John Crispin





On 10/07/18 07:09, Ryder Lee wrote:

Cleanup binding document to get rid of unsupported reference boards
for MT7623N.

Cc: John Crispin 
Cc: Sean Wang 
Signed-off-by: Ryder Lee 

Acked-by: John Crispin 

---
  Documentation/devicetree/bindings/arm/mediatek.txt | 3 ---
  1 file changed, 3 deletions(-)

diff --git a/Documentation/devicetree/bindings/arm/mediatek.txt 
b/Documentation/devicetree/bindings/arm/mediatek.txt
index 7d21ab3..c07ddf4 100644
--- a/Documentation/devicetree/bindings/arm/mediatek.txt
+++ b/Documentation/devicetree/bindings/arm/mediatek.txt
@@ -59,9 +59,6 @@ Supported boards:
  - Reference board for MT7623n with eMMC:
  Required root node properties:
- compatible = "mediatek,mt7623n-rfb-emmc", "mediatek,mt7623";
-- Reference  board for MT7623n with NAND:
-Required root node properties:
-  - compatible = "mediatek,mt7623n-rfb-nand", "mediatek,mt7623";
  - Bananapi BPI-R2 board:
- compatible = "bananapi,bpi-r2", "mediatek,mt7623";
  - MTK mt8127 tablet moose EVB:

Re: [PATCH 1/2] arm: dts: mt7623: cleanup MT7623N NAND dts file

2018-07-09 Thread John Crispin





On 10/07/18 07:09, Ryder Lee wrote:

Normally, we didn't release this kind of baord to user. This specific
board exists only in the early stage of development inside MediaTek -
and that may confuse peoples.

Hence this patch removes related files accordingly.

Cc: John Crispin 
Cc: Sean Wang 
Signed-off-by: Ryder Lee 

Acked-by: John Crispin 

---
  arch/arm/boot/dts/Makefile |  1 -
  arch/arm/boot/dts/mt7623n-rfb-nand.dts | 73 -
  arch/arm/boot/dts/mt7623n-rfb.dtsi | 86 --
  3 files changed, 160 deletions(-)
  delete mode 100644 arch/arm/boot/dts/mt7623n-rfb-nand.dts
  delete mode 100644 arch/arm/boot/dts/mt7623n-rfb.dtsi

diff --git a/arch/arm/boot/dts/Makefile b/arch/arm/boot/dts/Makefile
index 37a3de7..dde494e 100644
--- a/arch/arm/boot/dts/Makefile
+++ b/arch/arm/boot/dts/Makefile
@@ -1168,7 +1168,6 @@ dtb-$(CONFIG_ARCH_MEDIATEK) += \
mt7623a-rfb-emmc.dtb \
mt7623a-rfb-nand.dtb \
mt7623n-rfb-emmc.dtb \
-   mt7623n-rfb-nand.dtb \
mt7623n-bananapi-bpi-r2.dtb \
mt8127-moose.dtb \
mt8135-evbp1.dtb
diff --git a/arch/arm/boot/dts/mt7623n-rfb-nand.dts 
b/arch/arm/boot/dts/mt7623n-rfb-nand.dts
deleted file mode 100644
index 96ff3c9..000
--- a/arch/arm/boot/dts/mt7623n-rfb-nand.dts
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2017 MediaTek Inc.
- * Author: John Crispin 
- *
- */
-
-/dts-v1/;
-#include "mt7623n-rfb.dtsi"
-
-/ {
-   model = "MediaTek MT7623N NAND reference board";
-   compatible = "mediatek,mt7623n-rfb-nand", "mediatek,mt7623";
-};
-
- {
-   status = "okay";
-};
-
- {
-   status = "okay";
-   pinctrl-names = "default";
-   pinctrl-0 = <_pins_default>;
-
-   nand@0 {
-   reg = <0>;
-   spare_per_sector = <64>;
-   nand-ecc-mode = "hw";
-   nand-ecc-strength = <12>;
-   nand-ecc-step-size = <1024>;
-
-   partitions {
-   compatible = "fixed-partitions";
-   #address-cells = <1>;
-   #size-cells = <1>;
-
-   partition@0 {
-   label = "preloader";
-   reg = <0x0 0x4>;
-   };
-
-   partition@4 {
-   label = "uboot";
-   reg = <0x4 0x8>;
-   };
-
-   partition@c {
-   label = "uboot-env";
-   reg = <0xC 0x4>;
-   };
-
-   partition@14 {
-   label = "bootimg";
-   reg = <0x14 0x200>;
-   };
-
-   partition@214 {
-   label = "recovery";
-   reg = <0x214 0x200>;
-   };
-
-   partition@414 {
-   label = "rootfs";
-   reg = <0x414 0x100>;
-   };
-
-   partition@514 {
-   label = "usrdata";
-   reg = <0x514 0x100>;
-   };
-   };
-   };
-};
diff --git a/arch/arm/boot/dts/mt7623n-rfb.dtsi 
b/arch/arm/boot/dts/mt7623n-rfb.dtsi
deleted file mode 100644
index 5c5cc7d..000
--- a/arch/arm/boot/dts/mt7623n-rfb.dtsi
+++ /dev/null
@@ -1,86 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2017 MediaTek Inc.
- * Author: John Crispin 
- *Sean Wang 
- *
- */
-
-/dts-v1/;
-#include "mt7623.dtsi"
-#include "mt6323.dtsi"
-
-/ {
-   aliases {
-   serial0 = 
-   serial1 = 
-   serial2 = 
-   };
-
-   chosen {
-   stdout-path = "serial2:115200n8";
-   };
-
-   cpus {
-   cpu0 {
-   proc-supply = <_vproc_reg>;
-   };
-
-   cpu1 {
-   proc-supply = <_vproc_reg>;
-   };
-
-   cpu2 {
-   proc-supply = <_vproc_reg>;
-   };
-
-   cpu3 {
-   proc-supply = <_vproc_reg>;
-   };
-   };
-
-   memory@8000 {
-   device_type = "memory";
-   reg = <0 0x8000 0 0x4000>;
-   };
-
-   usb_p1_vbus: regulator-5v {
-   compatible = "regulator-fixed";
-   regulator-name = "usb_vbus";
-   regulator-min-microvolt = <500>;
-   regulator-max-microvolt = <500>;
-   gpio = < 135 GPIO_ACTIVE_HIGH>;
-   enable-active-high;
-   };
-};
-
- {
-   vmmc-supply =

Re: [PATCH 2/2] dt-bindings: arm: mediatek: cleanup MT7623N reference boards

2018-07-09 Thread John Crispin





On 10/07/18 07:09, Ryder Lee wrote:

Cleanup binding document to get rid of unsupported reference boards
for MT7623N.

Cc: John Crispin 
Cc: Sean Wang 
Signed-off-by: Ryder Lee 

Acked-by: John Crispin 

---
  Documentation/devicetree/bindings/arm/mediatek.txt | 3 ---
  1 file changed, 3 deletions(-)

diff --git a/Documentation/devicetree/bindings/arm/mediatek.txt 
b/Documentation/devicetree/bindings/arm/mediatek.txt
index 7d21ab3..c07ddf4 100644
--- a/Documentation/devicetree/bindings/arm/mediatek.txt
+++ b/Documentation/devicetree/bindings/arm/mediatek.txt
@@ -59,9 +59,6 @@ Supported boards:
  - Reference board for MT7623n with eMMC:
  Required root node properties:
- compatible = "mediatek,mt7623n-rfb-emmc", "mediatek,mt7623";
-- Reference  board for MT7623n with NAND:
-Required root node properties:
-  - compatible = "mediatek,mt7623n-rfb-nand", "mediatek,mt7623";
  - Bananapi BPI-R2 board:
- compatible = "bananapi,bpi-r2", "mediatek,mt7623";
  - MTK mt8127 tablet moose EVB:

Re: [PATCH 1/2] arm: dts: mt7623: cleanup MT7623N NAND dts file

2018-07-09 Thread John Crispin





On 10/07/18 07:09, Ryder Lee wrote:

Normally, we didn't release this kind of baord to user. This specific
board exists only in the early stage of development inside MediaTek -
and that may confuse peoples.

Hence this patch removes related files accordingly.

Cc: John Crispin 
Cc: Sean Wang 
Signed-off-by: Ryder Lee 

Acked-by: John Crispin 

---
  arch/arm/boot/dts/Makefile |  1 -
  arch/arm/boot/dts/mt7623n-rfb-nand.dts | 73 -
  arch/arm/boot/dts/mt7623n-rfb.dtsi | 86 --
  3 files changed, 160 deletions(-)
  delete mode 100644 arch/arm/boot/dts/mt7623n-rfb-nand.dts
  delete mode 100644 arch/arm/boot/dts/mt7623n-rfb.dtsi

diff --git a/arch/arm/boot/dts/Makefile b/arch/arm/boot/dts/Makefile
index 37a3de7..dde494e 100644
--- a/arch/arm/boot/dts/Makefile
+++ b/arch/arm/boot/dts/Makefile
@@ -1168,7 +1168,6 @@ dtb-$(CONFIG_ARCH_MEDIATEK) += \
mt7623a-rfb-emmc.dtb \
mt7623a-rfb-nand.dtb \
mt7623n-rfb-emmc.dtb \
-   mt7623n-rfb-nand.dtb \
mt7623n-bananapi-bpi-r2.dtb \
mt8127-moose.dtb \
mt8135-evbp1.dtb
diff --git a/arch/arm/boot/dts/mt7623n-rfb-nand.dts 
b/arch/arm/boot/dts/mt7623n-rfb-nand.dts
deleted file mode 100644
index 96ff3c9..000
--- a/arch/arm/boot/dts/mt7623n-rfb-nand.dts
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2017 MediaTek Inc.
- * Author: John Crispin 
- *
- */
-
-/dts-v1/;
-#include "mt7623n-rfb.dtsi"
-
-/ {
-   model = "MediaTek MT7623N NAND reference board";
-   compatible = "mediatek,mt7623n-rfb-nand", "mediatek,mt7623";
-};
-
- {
-   status = "okay";
-};
-
- {
-   status = "okay";
-   pinctrl-names = "default";
-   pinctrl-0 = <_pins_default>;
-
-   nand@0 {
-   reg = <0>;
-   spare_per_sector = <64>;
-   nand-ecc-mode = "hw";
-   nand-ecc-strength = <12>;
-   nand-ecc-step-size = <1024>;
-
-   partitions {
-   compatible = "fixed-partitions";
-   #address-cells = <1>;
-   #size-cells = <1>;
-
-   partition@0 {
-   label = "preloader";
-   reg = <0x0 0x4>;
-   };
-
-   partition@4 {
-   label = "uboot";
-   reg = <0x4 0x8>;
-   };
-
-   partition@c {
-   label = "uboot-env";
-   reg = <0xC 0x4>;
-   };
-
-   partition@14 {
-   label = "bootimg";
-   reg = <0x14 0x200>;
-   };
-
-   partition@214 {
-   label = "recovery";
-   reg = <0x214 0x200>;
-   };
-
-   partition@414 {
-   label = "rootfs";
-   reg = <0x414 0x100>;
-   };
-
-   partition@514 {
-   label = "usrdata";
-   reg = <0x514 0x100>;
-   };
-   };
-   };
-};
diff --git a/arch/arm/boot/dts/mt7623n-rfb.dtsi 
b/arch/arm/boot/dts/mt7623n-rfb.dtsi
deleted file mode 100644
index 5c5cc7d..000
--- a/arch/arm/boot/dts/mt7623n-rfb.dtsi
+++ /dev/null
@@ -1,86 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2017 MediaTek Inc.
- * Author: John Crispin 
- *Sean Wang 
- *
- */
-
-/dts-v1/;
-#include "mt7623.dtsi"
-#include "mt6323.dtsi"
-
-/ {
-   aliases {
-   serial0 = 
-   serial1 = 
-   serial2 = 
-   };
-
-   chosen {
-   stdout-path = "serial2:115200n8";
-   };
-
-   cpus {
-   cpu0 {
-   proc-supply = <_vproc_reg>;
-   };
-
-   cpu1 {
-   proc-supply = <_vproc_reg>;
-   };
-
-   cpu2 {
-   proc-supply = <_vproc_reg>;
-   };
-
-   cpu3 {
-   proc-supply = <_vproc_reg>;
-   };
-   };
-
-   memory@8000 {
-   device_type = "memory";
-   reg = <0 0x8000 0 0x4000>;
-   };
-
-   usb_p1_vbus: regulator-5v {
-   compatible = "regulator-fixed";
-   regulator-name = "usb_vbus";
-   regulator-min-microvolt = <500>;
-   regulator-max-microvolt = <500>;
-   gpio = < 135 GPIO_ACTIVE_HIGH>;
-   enable-active-high;
-   };
-};
-
- {
-   vmmc-supply =

RE: Re: devfreq relation with pm qos

2018-07-09 Thread MyungJoo Ham

> + dev freq maintainters.
> 
> On Mon, Jul 9, 2018 at 3:37 AM, noman pouigt  wrote:
> > folks,
> >
> > I am trying to figure out the relationship between PM QOS
> > with devfreq framework. I see this thread[1] where MyungJoo
> > talks about QOS and devfreq but that control is through
> > sysfs but I don't see any relation of pm qos (kernel/power/qos.c)
> > with devfreq directly as devfreq is not calling any of the QOS
> > api's. Is this intended?
> >
> > Isn't QOS value update using pm_qos_update_request has a
> > direct relation with devfreq drivers i.e. setting the value to
> > high or low selects the corresponding voltage and frequency
> > setting in devfreq framework? I went through the devfreq
> > drivers and couldn't find that relationship. Is this by design or
> > I am missing something very obvious?
> >
> > [1] https://lwn.net/Articles/484161/

Hello,

Unfortunately, the suggested concept in the referred article was not ever
accepted in the mainline.

Cheers,
MyungJo

RE: Re: devfreq relation with pm qos

2018-07-09 Thread MyungJoo Ham

> + dev freq maintainters.
> 
> On Mon, Jul 9, 2018 at 3:37 AM, noman pouigt  wrote:
> > folks,
> >
> > I am trying to figure out the relationship between PM QOS
> > with devfreq framework. I see this thread[1] where MyungJoo
> > talks about QOS and devfreq but that control is through
> > sysfs but I don't see any relation of pm qos (kernel/power/qos.c)
> > with devfreq directly as devfreq is not calling any of the QOS
> > api's. Is this intended?
> >
> > Isn't QOS value update using pm_qos_update_request has a
> > direct relation with devfreq drivers i.e. setting the value to
> > high or low selects the corresponding voltage and frequency
> > setting in devfreq framework? I went through the devfreq
> > drivers and couldn't find that relationship. Is this by design or
> > I am missing something very obvious?
> >
> > [1] https://lwn.net/Articles/484161/

Hello,

Unfortunately, the suggested concept in the referred article was not ever
accepted in the mainline.

Cheers,
MyungJo

Re: [PATCH v3 0/4] clk: clk: Add functions to save/restore clock context en-masse

2018-07-09 Thread Tony Lindgren

* Keerthy  [180621 01:18]:
> Deep enough power saving mode can result into losing context of the clock
> registers also, and they need to be restored once coming back from the power
> saving mode. Hence add functions to save/restore clock context.

Patches 1 to 3 look good to me:

Acked-by: Tony Lindgren 

Stephen, if no more comments, can you please set up an immutable
branch for me as otherwise the last patch might cause merge
conflicts?

Regards,

Tony

Re: [PATCH v3 0/4] clk: clk: Add functions to save/restore clock context en-masse

2018-07-09 Thread Tony Lindgren

* Keerthy  [180621 01:18]:
> Deep enough power saving mode can result into losing context of the clock
> registers also, and they need to be restored once coming back from the power
> saving mode. Hence add functions to save/restore clock context.

Patches 1 to 3 look good to me:

Acked-by: Tony Lindgren 

Stephen, if no more comments, can you please set up an immutable
branch for me as otherwise the last patch might cause merge
conflicts?

Regards,

Tony

Re: [RFC] dmaengine: Add metadat_ops for dma_async_tx_descriptor

2018-07-09 Thread Vinod



Hey Peter,

Sorry for late response on this..

On 01-06-18, 13:24, Peter Ujfalusi wrote:
> If the DMA supports per descriptor metadata it can implement the attach,
> get_ptr/set_len callbacks.
> 
> Client drivers must only use either attach or get_ptr/set_len to avoid
> miss configuration.
> 
> Wrappers are also added for the metadata_ops:
> dmaengine_desc_attach_metadata()
> dmaengine_desc_get_metadata_ptr()
> dmaengine_desc_set_metadata_len()
> 
> Signed-off-by: Peter Ujfalusi 
> ---
> Hi,
> 
> since attachments are bouncing back, I send the patch separately
> 
> Regards,
> Peter
> 
>  include/linux/dmaengine.h | 50 +++
>  1 file changed, 50 insertions(+)
> 
> diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
> index 51fbb861e84b..ac42ace36aa3 100644
> --- a/include/linux/dmaengine.h
> +++ b/include/linux/dmaengine.h
> @@ -491,6 +491,18 @@ struct dmaengine_unmap_data {
>   dma_addr_t addr[0];
>  };
>  
> +struct dma_async_tx_descriptor;
> +
> +struct dma_descriptor_metadata_ops {
> + int (*attach)(struct dma_async_tx_descriptor *desc, void *data,
> +   size_t len);

How does one detach? When should the client free up the memory, IOW when
does dma driver drop ref to data.

> +
> + void *(*get_ptr)(struct dma_async_tx_descriptor *desc,
> +  size_t *payload_len, size_t *max_len);

so what is this supposed to do..?

> + int (*set_len)(struct dma_async_tx_descriptor *desc,
> +size_t payload_len);

attach already has length, so how does this help?

> +};
> +
>  /**
>   * struct dma_async_tx_descriptor - async transaction descriptor
>   * ---dma generic offload fields---
> @@ -520,6 +532,7 @@ struct dma_async_tx_descriptor {
>   dma_async_tx_callback_result callback_result;
>   void *callback_param;
>   struct dmaengine_unmap_data *unmap;
> + struct dma_descriptor_metadata_ops *metadata_ops;
>  #ifdef CONFIG_ASYNC_TX_ENABLE_CHANNEL_SWITCH
>   struct dma_async_tx_descriptor *next;
>   struct dma_async_tx_descriptor *parent;
> @@ -932,6 +945,43 @@ static inline struct dma_async_tx_descriptor 
> *dmaengine_prep_dma_memcpy(
>   len, flags);
>  }
>  
> +static inline int dmaengine_desc_attach_metadata(
> + struct dma_async_tx_descriptor *desc, void *data, size_t len)
> +{
> + if (!desc)
> + return 0;
> +
> + if (!desc->metadata_ops || !desc->metadata_ops->attach)
> + return -ENOTSUPP;
> +
> + return desc->metadata_ops->attach(desc, data, len);
> +}
> +
> +static inline void *dmaengine_desc_get_metadata_ptr(
> + struct dma_async_tx_descriptor *desc, size_t *payload_len,
> + size_t *max_len)
> +{
> + if (!desc)
> + return NULL;
> +
> + if (!desc->metadata_ops || !desc->metadata_ops->get_ptr)
> + return ERR_PTR(-ENOTSUPP);
> +
> + return desc->metadata_ops->get_ptr(desc, payload_len, max_len);
> +}
> +
> +static inline int dmaengine_desc_set_metadata_len(
> + struct dma_async_tx_descriptor *desc, size_t payload_len)
> +{
> + if (!desc)
> + return 0;
> +
> + if (!desc->metadata_ops || !desc->metadata_ops->set_len)
> + return -ENOTSUPP;
> +
> + return desc->metadata_ops->set_len(desc, payload_len);
> +}
> +
>  /**
>   * dmaengine_terminate_all() - Terminate all active DMA transfers
>   * @chan: The channel for which to terminate the transfers
> -- 
> Peter
> 
> Texas Instruments Finland Oy, Porkkalankatu 22, 00180 Helsinki.
> Y-tunnus/Business ID: 0615521-4. Kotipaikka/Domicile: Helsinki
> 
> --
> To unsubscribe from this list: send the line "unsubscribe dmaengine" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
~Vinod

Re: [RFC] dmaengine: Add metadat_ops for dma_async_tx_descriptor

2018-07-09 Thread Vinod



Hey Peter,

Sorry for late response on this..

On 01-06-18, 13:24, Peter Ujfalusi wrote:
> If the DMA supports per descriptor metadata it can implement the attach,
> get_ptr/set_len callbacks.
> 
> Client drivers must only use either attach or get_ptr/set_len to avoid
> miss configuration.
> 
> Wrappers are also added for the metadata_ops:
> dmaengine_desc_attach_metadata()
> dmaengine_desc_get_metadata_ptr()
> dmaengine_desc_set_metadata_len()
> 
> Signed-off-by: Peter Ujfalusi 
> ---
> Hi,
> 
> since attachments are bouncing back, I send the patch separately
> 
> Regards,
> Peter
> 
>  include/linux/dmaengine.h | 50 +++
>  1 file changed, 50 insertions(+)
> 
> diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
> index 51fbb861e84b..ac42ace36aa3 100644
> --- a/include/linux/dmaengine.h
> +++ b/include/linux/dmaengine.h
> @@ -491,6 +491,18 @@ struct dmaengine_unmap_data {
>   dma_addr_t addr[0];
>  };
>  
> +struct dma_async_tx_descriptor;
> +
> +struct dma_descriptor_metadata_ops {
> + int (*attach)(struct dma_async_tx_descriptor *desc, void *data,
> +   size_t len);

How does one detach? When should the client free up the memory, IOW when
does dma driver drop ref to data.

> +
> + void *(*get_ptr)(struct dma_async_tx_descriptor *desc,
> +  size_t *payload_len, size_t *max_len);

so what is this supposed to do..?

> + int (*set_len)(struct dma_async_tx_descriptor *desc,
> +size_t payload_len);

attach already has length, so how does this help?

> +};
> +
>  /**
>   * struct dma_async_tx_descriptor - async transaction descriptor
>   * ---dma generic offload fields---
> @@ -520,6 +532,7 @@ struct dma_async_tx_descriptor {
>   dma_async_tx_callback_result callback_result;
>   void *callback_param;
>   struct dmaengine_unmap_data *unmap;
> + struct dma_descriptor_metadata_ops *metadata_ops;
>  #ifdef CONFIG_ASYNC_TX_ENABLE_CHANNEL_SWITCH
>   struct dma_async_tx_descriptor *next;
>   struct dma_async_tx_descriptor *parent;
> @@ -932,6 +945,43 @@ static inline struct dma_async_tx_descriptor 
> *dmaengine_prep_dma_memcpy(
>   len, flags);
>  }
>  
> +static inline int dmaengine_desc_attach_metadata(
> + struct dma_async_tx_descriptor *desc, void *data, size_t len)
> +{
> + if (!desc)
> + return 0;
> +
> + if (!desc->metadata_ops || !desc->metadata_ops->attach)
> + return -ENOTSUPP;
> +
> + return desc->metadata_ops->attach(desc, data, len);
> +}
> +
> +static inline void *dmaengine_desc_get_metadata_ptr(
> + struct dma_async_tx_descriptor *desc, size_t *payload_len,
> + size_t *max_len)
> +{
> + if (!desc)
> + return NULL;
> +
> + if (!desc->metadata_ops || !desc->metadata_ops->get_ptr)
> + return ERR_PTR(-ENOTSUPP);
> +
> + return desc->metadata_ops->get_ptr(desc, payload_len, max_len);
> +}
> +
> +static inline int dmaengine_desc_set_metadata_len(
> + struct dma_async_tx_descriptor *desc, size_t payload_len)
> +{
> + if (!desc)
> + return 0;
> +
> + if (!desc->metadata_ops || !desc->metadata_ops->set_len)
> + return -ENOTSUPP;
> +
> + return desc->metadata_ops->set_len(desc, payload_len);
> +}
> +
>  /**
>   * dmaengine_terminate_all() - Terminate all active DMA transfers
>   * @chan: The channel for which to terminate the transfers
> -- 
> Peter
> 
> Texas Instruments Finland Oy, Porkkalankatu 22, 00180 Helsinki.
> Y-tunnus/Business ID: 0615521-4. Kotipaikka/Domicile: Helsinki
> 
> --
> To unsubscribe from this list: send the line "unsubscribe dmaengine" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

-- 
~Vinod

Re: [PATCH v5 2/6] clk: ti: dra7: Add clkctrl clock data for the mcan clocks

2018-07-09 Thread Tony Lindgren

* Faiz Abbas  [180709 16:50]:
> Add clkctrl data for the m_can clocks and register it within the
> clkctrl driver

I'll apply patches 2 to 5 to omap-for-v4.19/ti-sysc. The clock
patch is trivial enough to not have to wait for Tero to be back
online.

And I'll apply the dts changes into omap-for-v4.19/dt-mcan based
on omap-for-v4.19/ti-sysc because of the dependency to the clocks.

Regards,

Tony

Re: [PATCH v5 2/6] clk: ti: dra7: Add clkctrl clock data for the mcan clocks

2018-07-09 Thread Tony Lindgren

* Faiz Abbas  [180709 16:50]:
> Add clkctrl data for the m_can clocks and register it within the
> clkctrl driver

I'll apply patches 2 to 5 to omap-for-v4.19/ti-sysc. The clock
patch is trivial enough to not have to wait for Tero to be back
online.

And I'll apply the dts changes into omap-for-v4.19/dt-mcan based
on omap-for-v4.19/ti-sysc because of the dependency to the clocks.

Regards,

Tony

Re: [PATCH v3] cpufreq / CPPC: Add cpuinfo_cur_freq support for CPPC

2018-07-09 Thread George Cherian


Hi Prakash,


On 07/09/2018 10:12 PM, Prakash, Prashanth wrote:


Hi George,


On 7/9/2018 4:10 AM, George Cherian wrote:

Per Section 8.4.7.1.3 of ACPI 6.2, The platform provides performance
feedback via set of performance counters. To determine the actual
performance level delivered over time, OSPM may read a set of
performance counters from the Reference Performance Counter Register
and the Delivered Performance Counter Register.

OSPM calculates the delivered performance over a given time period by
taking a beginning and ending snapshot of both the reference and
delivered performance counters, and calculating:

delivered_perf = reference_perf X (delta of delivered_perf counter / delta of 
reference_perf counter).

Implement the above and hook this to the cpufreq->get method.

Signed-off-by: George Cherian 
Acked-by: Viresh Kumar 
---
  drivers/cpufreq/cppc_cpufreq.c | 44 ++
  1 file changed, 44 insertions(+)

diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index a9d3eec..61132e8 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -296,10 +296,54 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy 
*policy)
   return ret;
  }

+static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu,
+  struct cppc_perf_fb_ctrs fb_ctrs_t0,
+  struct cppc_perf_fb_ctrs fb_ctrs_t1)
+{
+ u64 delta_reference, delta_delivered;
+ u64 reference_perf, delivered_perf;
+
+ reference_perf = fb_ctrs_t0.reference_perf;
+
+ delta_reference = (u32)fb_ctrs_t1.reference -
+ (u32)fb_ctrs_t0.reference;
+ delta_delivered = (u32)fb_ctrs_t1.delivered -
+ (u32)fb_ctrs_t0.delivered;

Why (u32)? These registers can be 64bits and that's why cppc_perf_fb_ctrs
have 64b fields for reference and delivered counters.

Moreover, the integer math is incorrect. You can run into a scenario where
t1.ref/del < t0.ref/del,  thus setting a negative number to u64! The likelihood
of this is very high especially when you throw away the higher 32bits.


Because of binary representation, unsigned subtraction will work even if
t1.ref/del < t0.ref/del. So essentially, the code should look like
this,

static inline u64 get_delta(u64 t1, u64 t0)
{
if (t1 > t0 || t0 > ~(u32)0)
return t1 - t0;

return (u32)t1 - (u32)t0;
}

As a further optimization, I used (u32) since that also works,
as long as the momentary delta at any point is not greater than 2 ^ 32.
I don't foresee any reason for any platform to increment the counters at
an interval greater than 2 ^ 32.


To keep things simple, do something like below:

if (t1.reference <= t0.reference || t1.delivered <= t0.delivered) {
  /* Atleast one of them should have overflowed */
  return desired_perf;
}
else {
  compute the delivered perf using the counters.
}


No need to do like this as this is tested and found working across 
counter overruns in our platform.



+
+ /* Check to avoid divide-by zero */
+ if (delta_reference || delta_delivered)
+ delivered_perf = (reference_perf * delta_delivered) /
+ delta_reference;
+ else
+ delivered_perf = cpu->perf_ctrls.desired_perf;
+
+ return cppc_cpufreq_perf_to_khz(cpu, delivered_perf);
+}
+
+static unsigned int cppc_cpufreq_get_rate(unsigned int cpunum)
+{
+ struct cppc_perf_fb_ctrs fb_ctrs_t0 = {0}, fb_ctrs_t1 = {0};
+ struct cppc_cpudata *cpu = all_cpu_data[cpunum];
+ int ret;
+
+ ret = cppc_get_perf_ctrs(cpunum, _ctrs_t0);
+ if (ret)
+ return ret;
+
+ udelay(2); /* 2usec delay between sampling */
+
+ ret = cppc_get_perf_ctrs(cpunum, _ctrs_t1);
+ if (ret)
+ return ret;
+
+ return cppc_get_rate_from_fbctrs(cpu, fb_ctrs_t0, fb_ctrs_t1);
+}
+
  static struct cpufreq_driver cppc_cpufreq_driver = {
   .flags = CPUFREQ_CONST_LOOPS,
   .verify = cppc_verify_policy,
   .target = cppc_cpufreq_set_target,
+ .get = cppc_cpufreq_get_rate,
   .init = cppc_cpufreq_cpu_init,
   .stop_cpu = cppc_cpufreq_stop_cpu,
   .name = "cppc_cpufreq",

Re: [PATCH v3] cpufreq / CPPC: Add cpuinfo_cur_freq support for CPPC

2018-07-09 Thread George Cherian


Hi Prakash,


On 07/09/2018 10:12 PM, Prakash, Prashanth wrote:


Hi George,


On 7/9/2018 4:10 AM, George Cherian wrote:

Per Section 8.4.7.1.3 of ACPI 6.2, The platform provides performance
feedback via set of performance counters. To determine the actual
performance level delivered over time, OSPM may read a set of
performance counters from the Reference Performance Counter Register
and the Delivered Performance Counter Register.

OSPM calculates the delivered performance over a given time period by
taking a beginning and ending snapshot of both the reference and
delivered performance counters, and calculating:

delivered_perf = reference_perf X (delta of delivered_perf counter / delta of 
reference_perf counter).

Implement the above and hook this to the cpufreq->get method.

Signed-off-by: George Cherian 
Acked-by: Viresh Kumar 
---
  drivers/cpufreq/cppc_cpufreq.c | 44 ++
  1 file changed, 44 insertions(+)

diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index a9d3eec..61132e8 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -296,10 +296,54 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy 
*policy)
   return ret;
  }

+static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu,
+  struct cppc_perf_fb_ctrs fb_ctrs_t0,
+  struct cppc_perf_fb_ctrs fb_ctrs_t1)
+{
+ u64 delta_reference, delta_delivered;
+ u64 reference_perf, delivered_perf;
+
+ reference_perf = fb_ctrs_t0.reference_perf;
+
+ delta_reference = (u32)fb_ctrs_t1.reference -
+ (u32)fb_ctrs_t0.reference;
+ delta_delivered = (u32)fb_ctrs_t1.delivered -
+ (u32)fb_ctrs_t0.delivered;

Why (u32)? These registers can be 64bits and that's why cppc_perf_fb_ctrs
have 64b fields for reference and delivered counters.

Moreover, the integer math is incorrect. You can run into a scenario where
t1.ref/del < t0.ref/del,  thus setting a negative number to u64! The likelihood
of this is very high especially when you throw away the higher 32bits.


Because of binary representation, unsigned subtraction will work even if
t1.ref/del < t0.ref/del. So essentially, the code should look like
this,

static inline u64 get_delta(u64 t1, u64 t0)
{
if (t1 > t0 || t0 > ~(u32)0)
return t1 - t0;

return (u32)t1 - (u32)t0;
}

As a further optimization, I used (u32) since that also works,
as long as the momentary delta at any point is not greater than 2 ^ 32.
I don't foresee any reason for any platform to increment the counters at
an interval greater than 2 ^ 32.


To keep things simple, do something like below:

if (t1.reference <= t0.reference || t1.delivered <= t0.delivered) {
  /* Atleast one of them should have overflowed */
  return desired_perf;
}
else {
  compute the delivered perf using the counters.
}


No need to do like this as this is tested and found working across 
counter overruns in our platform.



+
+ /* Check to avoid divide-by zero */
+ if (delta_reference || delta_delivered)
+ delivered_perf = (reference_perf * delta_delivered) /
+ delta_reference;
+ else
+ delivered_perf = cpu->perf_ctrls.desired_perf;
+
+ return cppc_cpufreq_perf_to_khz(cpu, delivered_perf);
+}
+
+static unsigned int cppc_cpufreq_get_rate(unsigned int cpunum)
+{
+ struct cppc_perf_fb_ctrs fb_ctrs_t0 = {0}, fb_ctrs_t1 = {0};
+ struct cppc_cpudata *cpu = all_cpu_data[cpunum];
+ int ret;
+
+ ret = cppc_get_perf_ctrs(cpunum, _ctrs_t0);
+ if (ret)
+ return ret;
+
+ udelay(2); /* 2usec delay between sampling */
+
+ ret = cppc_get_perf_ctrs(cpunum, _ctrs_t1);
+ if (ret)
+ return ret;
+
+ return cppc_get_rate_from_fbctrs(cpu, fb_ctrs_t0, fb_ctrs_t1);
+}
+
  static struct cpufreq_driver cppc_cpufreq_driver = {
   .flags = CPUFREQ_CONST_LOOPS,
   .verify = cppc_verify_policy,
   .target = cppc_cpufreq_set_target,
+ .get = cppc_cpufreq_get_rate,
   .init = cppc_cpufreq_cpu_init,
   .stop_cpu = cppc_cpufreq_stop_cpu,
   .name = "cppc_cpufreq",

Re: [PATCH] ib_srpt: use kvmalloc to allocate ring pointers

2018-07-09 Thread Leon Romanovsky

On Mon, Jul 09, 2018 at 04:51:08PM +0300, Jan Dakinevich wrote:
> An array of pointers to SRPT contexts in ib_device is over 30KiB even
> in default case, in which an amount of contexts is 4095. The patch
> is intended to weed out large contigous allocation for non-DMA memory.

kvmalloc* doesn't "weed out" large contiguous allocations, but tries to
allocate them and gracefully fallback to vmalloc if kmalloc fails.

More on that, for allocations less than page (64KB for PowerPC), it will
not call to vmalloc at all, which is fine too, because pages granularity.

If you want to get rid of "contiguous allocations", use vmalloc instead.

Thanks

>
> Signed-off-by: Jan Dakinevich 
> ---
>  drivers/infiniband/ulp/srpt/ib_srpt.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c 
> b/drivers/infiniband/ulp/srpt/ib_srpt.c
> index 3081c62..de167af 100644
> --- a/drivers/infiniband/ulp/srpt/ib_srpt.c
> +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
> @@ -720,7 +720,7 @@ static struct srpt_ioctx **srpt_alloc_ioctx_ring(struct 
> srpt_device *sdev,
>   WARN_ON(ioctx_size != sizeof(struct srpt_recv_ioctx)
>   && ioctx_size != sizeof(struct srpt_send_ioctx));
>
> - ring = kmalloc_array(ring_size, sizeof(ring[0]), GFP_KERNEL);
> + ring = kvmalloc_array(ring_size, sizeof(ring[0]), GFP_KERNEL);
>   if (!ring)
>   goto out;
>   for (i = 0; i < ring_size; ++i) {
> @@ -734,7 +734,7 @@ static struct srpt_ioctx **srpt_alloc_ioctx_ring(struct 
> srpt_device *sdev,
>  err:
>   while (--i >= 0)
>   srpt_free_ioctx(sdev, ring[i], dma_size, dir);
> - kfree(ring);
> + kvfree(ring);
>   ring = NULL;
>  out:
>   return ring;
> @@ -759,7 +759,7 @@ static void srpt_free_ioctx_ring(struct srpt_ioctx 
> **ioctx_ring,
>
>   for (i = 0; i < ring_size; ++i)
>   srpt_free_ioctx(sdev, ioctx_ring[i], dma_size, dir);
> - kfree(ioctx_ring);
> + kvfree(ioctx_ring);
>  }
>
>  /**
> --
> 2.1.4
>


signature.asc
Description: PGP signature

Re: [PATCH] ib_srpt: use kvmalloc to allocate ring pointers

2018-07-09 Thread Leon Romanovsky

On Mon, Jul 09, 2018 at 04:51:08PM +0300, Jan Dakinevich wrote:
> An array of pointers to SRPT contexts in ib_device is over 30KiB even
> in default case, in which an amount of contexts is 4095. The patch
> is intended to weed out large contigous allocation for non-DMA memory.

kvmalloc* doesn't "weed out" large contiguous allocations, but tries to
allocate them and gracefully fallback to vmalloc if kmalloc fails.

More on that, for allocations less than page (64KB for PowerPC), it will
not call to vmalloc at all, which is fine too, because pages granularity.

If you want to get rid of "contiguous allocations", use vmalloc instead.

Thanks

>
> Signed-off-by: Jan Dakinevich 
> ---
>  drivers/infiniband/ulp/srpt/ib_srpt.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c 
> b/drivers/infiniband/ulp/srpt/ib_srpt.c
> index 3081c62..de167af 100644
> --- a/drivers/infiniband/ulp/srpt/ib_srpt.c
> +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
> @@ -720,7 +720,7 @@ static struct srpt_ioctx **srpt_alloc_ioctx_ring(struct 
> srpt_device *sdev,
>   WARN_ON(ioctx_size != sizeof(struct srpt_recv_ioctx)
>   && ioctx_size != sizeof(struct srpt_send_ioctx));
>
> - ring = kmalloc_array(ring_size, sizeof(ring[0]), GFP_KERNEL);
> + ring = kvmalloc_array(ring_size, sizeof(ring[0]), GFP_KERNEL);
>   if (!ring)
>   goto out;
>   for (i = 0; i < ring_size; ++i) {
> @@ -734,7 +734,7 @@ static struct srpt_ioctx **srpt_alloc_ioctx_ring(struct 
> srpt_device *sdev,
>  err:
>   while (--i >= 0)
>   srpt_free_ioctx(sdev, ring[i], dma_size, dir);
> - kfree(ring);
> + kvfree(ring);
>   ring = NULL;
>  out:
>   return ring;
> @@ -759,7 +759,7 @@ static void srpt_free_ioctx_ring(struct srpt_ioctx 
> **ioctx_ring,
>
>   for (i = 0; i < ring_size; ++i)
>   srpt_free_ioctx(sdev, ioctx_ring[i], dma_size, dir);
> - kfree(ioctx_ring);
> + kvfree(ioctx_ring);
>  }
>
>  /**
> --
> 2.1.4
>


signature.asc
Description: PGP signature

Re: mm,tlb: revert 4647706ebeee?

2018-07-09 Thread Nicholas Piggin

On Mon, 9 Jul 2018 17:13:56 -0700
Andrew Morton  wrote:

> On Sun, 8 Jul 2018 01:25:38 +1000 Nicholas Piggin  wrote:
> 
> > On Fri, 06 Jul 2018 13:03:55 -0400
> > Rik van Riel  wrote:
> >   
> > > Hello,
> > > 
> > > It looks like last summer, there were 2 sets of patches
> > > in flight to fix the issue of simultaneous mprotect/madvise
> > > calls unmapping PTEs, and some pages not being flushed from
> > > the TLB before returning to userspace.
> > > 
> > > Minchan posted these patches:
> > > 56236a59556c ("mm: refactor TLB gathering API")
> > > 99baac21e458 ("mm: fix MADV_[FREE|DONTNEED] TLB flush miss problem")
> > > 
> > > Around the same time, Mel posted:
> > > 4647706ebeee ("mm: always flush VMA ranges affected by zap_page_range")
> > > 
> > > They both appear to solve the same bug.
> > > 
> > > Only one of the two solutions is needed.
> > > 
> > > However, 4647706ebeee appears to introduce extra TLB
> > > flushes - one per VMA, instead of one over the entire
> > > range unmapped, and also extra flushes when there are
> > > no simultaneous unmappers of the same mm.
> > > 
> > > For that reason, it seems like we should revert
> > > 4647706ebeee and keep only Minchan's solution in
> > > the kernel.
> > > 
> > > Am I overlooking any reason why we should not revert
> > > 4647706ebeee?  
> > 
> > Yes I think so. Discussed here recently:
> > 
> > https://marc.info/?l=linux-mm=152878780528037=2  
> 
> Unclear if that was an ack ;)
>

Sure, I'm thinking Rik's mail is a ack for my patch :)

No actually I think it's okay, but was in the middle of testing
my series when Aneesh pointed out a bit was missing from powerpc,
so I had to go off and fix that, I think that's upstream now. So
need to go back and re-test this revert.

Wouldn't hurt for other arch maintainers to have a look I guess
(cc linux-arch):

The problem powerpc had is that mmu_gather flushing will flush a
single page size based on the ptes it encounters when we zap. If
we hit a different page size, it flushes and switches to the new
size. If we have concurrent zaps on the same range, the other
thread may have cleared a large page pte so we won't see that and
will only do a small page flush for that range. Which means we can
return before the other thread invalidated our TLB for the large
pages in the range we wanted to flush.

I suspect most arches are probably okay, but if you make any TLB
flush choices based on the pte contents, then you could be exposed.
Except in the case of archs like sparc and powerpc/hash which do
the flushing in arch_leave_lazy_mmu_mode(), because that is called
under the same page table lock, so there can't be concurrent zap.

A quick look through the archs doesn't show anything obvious, but
please take a look at your arch.

And I'll try to do a bit more testing.

Thanks,
Nick

Re: mm,tlb: revert 4647706ebeee?

2018-07-09 Thread Nicholas Piggin

On Mon, 9 Jul 2018 17:13:56 -0700
Andrew Morton  wrote:

> On Sun, 8 Jul 2018 01:25:38 +1000 Nicholas Piggin  wrote:
> 
> > On Fri, 06 Jul 2018 13:03:55 -0400
> > Rik van Riel  wrote:
> >   
> > > Hello,
> > > 
> > > It looks like last summer, there were 2 sets of patches
> > > in flight to fix the issue of simultaneous mprotect/madvise
> > > calls unmapping PTEs, and some pages not being flushed from
> > > the TLB before returning to userspace.
> > > 
> > > Minchan posted these patches:
> > > 56236a59556c ("mm: refactor TLB gathering API")
> > > 99baac21e458 ("mm: fix MADV_[FREE|DONTNEED] TLB flush miss problem")
> > > 
> > > Around the same time, Mel posted:
> > > 4647706ebeee ("mm: always flush VMA ranges affected by zap_page_range")
> > > 
> > > They both appear to solve the same bug.
> > > 
> > > Only one of the two solutions is needed.
> > > 
> > > However, 4647706ebeee appears to introduce extra TLB
> > > flushes - one per VMA, instead of one over the entire
> > > range unmapped, and also extra flushes when there are
> > > no simultaneous unmappers of the same mm.
> > > 
> > > For that reason, it seems like we should revert
> > > 4647706ebeee and keep only Minchan's solution in
> > > the kernel.
> > > 
> > > Am I overlooking any reason why we should not revert
> > > 4647706ebeee?  
> > 
> > Yes I think so. Discussed here recently:
> > 
> > https://marc.info/?l=linux-mm=152878780528037=2  
> 
> Unclear if that was an ack ;)
>

Sure, I'm thinking Rik's mail is a ack for my patch :)

No actually I think it's okay, but was in the middle of testing
my series when Aneesh pointed out a bit was missing from powerpc,
so I had to go off and fix that, I think that's upstream now. So
need to go back and re-test this revert.

Wouldn't hurt for other arch maintainers to have a look I guess
(cc linux-arch):

The problem powerpc had is that mmu_gather flushing will flush a
single page size based on the ptes it encounters when we zap. If
we hit a different page size, it flushes and switches to the new
size. If we have concurrent zaps on the same range, the other
thread may have cleared a large page pte so we won't see that and
will only do a small page flush for that range. Which means we can
return before the other thread invalidated our TLB for the large
pages in the range we wanted to flush.

I suspect most arches are probably okay, but if you make any TLB
flush choices based on the pte contents, then you could be exposed.
Except in the case of archs like sparc and powerpc/hash which do
the flushing in arch_leave_lazy_mmu_mode(), because that is called
under the same page table lock, so there can't be concurrent zap.

A quick look through the archs doesn't show anything obvious, but
please take a look at your arch.

And I'll try to do a bit more testing.

Thanks,
Nick

Re: [LKP] [lkp-robot] [brd] 316ba5736c: aim7.jobs-per-min -11.2% regression

2018-07-09 Thread kemi

Hi, SeongJae
  Do you have any input for this regression? thanks

On 2018年06月04日 13:52, kernel test robot wrote:
> 
> Greeting,
> 
> FYI, we noticed a -11.2% regression of aim7.jobs-per-min due to commit:
> 
> 
> commit: 316ba5736c9caa5dbcd84085989862d2df57431d ("brd: Mark as 
> non-rotational")
> https://git.kernel.org/cgit/linux/kernel/git/axboe/linux-block.git 
> for-4.18/block
> 
> in testcase: aim7
> on test machine: 40 threads Intel(R) Xeon(R) CPU E5-2690 v2 @ 3.00GHz with 
> 384G memory
> with following parameters:
> 
>   disk: 1BRD_48G
>   fs: btrfs
>   test: disk_rw
>   load: 1500
>   cpufreq_governor: performance
> 
> test-description: AIM7 is a traditional UNIX system level benchmark suite 
> which is used to test and measure the performance of multiuser system.
> test-url: https://sourceforge.net/projects/aimbench/files/aim-suite7/
> 
> 
> 
> Details are as below:
> -->
> 
> =
> compiler/cpufreq_governor/disk/fs/kconfig/load/rootfs/tbox_group/test/testcase:
>   
> gcc-7/performance/1BRD_48G/btrfs/x86_64-rhel-7.2/1500/debian-x86_64-2016-08-31.cgz/lkp-ivb-ep01/disk_rw/aim7
> 
> commit: 
>   522a777566 ("block: consolidate struct request timestamp fields")
>   316ba5736c ("brd: Mark as non-rotational")
> 
> 522a777566f56696 316ba5736c9caa5dbcd8408598 
>  -- 
>  %stddev %change %stddev
>  \  |\  
>  28321   -11.2%  25147aim7.jobs-per-min
> 318.19   +12.6% 358.23aim7.time.elapsed_time
> 318.19   +12.6% 358.23aim7.time.elapsed_time.max
>1437526 ±  2% +14.6%1646849 ±  2%  
> aim7.time.involuntary_context_switches
>  11986   +14.2%  13691aim7.time.system_time
>  73.06 ±  2%  -3.6%  70.43aim7.time.user_time
>2449470 ±  2% -25.0%1837521 ±  4%  
> aim7.time.voluntary_context_switches
>  20.25 ± 58%   +1681.5% 360.75 ±109%  numa-meminfo.node1.Mlocked
> 456062   -16.3% 381859softirqs.SCHED
>   9015 ±  7% -21.3%   7098 ± 22%  meminfo.CmaFree
>  47.50 ± 58%   +1355.8% 691.50 ± 92%  meminfo.Mlocked
>   5.24 ±  3%  -1.23.99 ±  2%  mpstat.cpu.idle%
>   0.61 ±  2%  -0.10.52 ±  2%  mpstat.cpu.usr%
>  16627   +12.8%  18762 ±  4%  slabinfo.Acpi-State.active_objs
>  16627   +12.9%  18775 ±  4%  slabinfo.Acpi-State.num_objs
>  57.00 ±  2% +17.5%  67.00vmstat.procs.r
>  20936   -24.8%  15752 ±  2%  vmstat.system.cs
>  45474-1.7%  44681vmstat.system.in
>   6.50 ± 59%   +1157.7%  81.75 ± 75%  numa-vmstat.node0.nr_mlock
> 242870 ±  3% +13.2% 274913 ±  7%  numa-vmstat.node0.nr_written
>   2278 ±  7% -22.6%   1763 ± 21%  numa-vmstat.node1.nr_free_cma
>   4.75 ± 58%   +1789.5%  89.75 ±109%  numa-vmstat.node1.nr_mlock
>   88018135 ±  3% -48.9%   44980457 ±  7%  cpuidle.C1.time
>1398288 ±  3% -51.1% 683493 ±  9%  cpuidle.C1.usage
>3499814 ±  2% -38.5%2153158 ±  5%  cpuidle.C1E.time
>  52722 ±  4% -45.6%  28692 ±  6%  cpuidle.C1E.usage
>9865857 ±  3% -40.1%5905155 ±  5%  cpuidle.C3.time
>  69656 ±  2% -42.6%  39990 ±  5%  cpuidle.C3.usage
> 590856 ±  2% -12.3% 517910cpuidle.C6.usage
>  46160 ±  7% -53.7%  21372 ± 11%  cpuidle.POLL.time
>   1716 ±  7% -46.6% 916.25 ± 14%  cpuidle.POLL.usage
> 197656+4.1% 205732proc-vmstat.nr_active_file
> 191867+4.1% 199647proc-vmstat.nr_dirty
> 509282+1.6% 517318proc-vmstat.nr_file_pages
>   2282 ±  8% -24.4%   1725 ± 22%  proc-vmstat.nr_free_cma
> 357.50   +10.6% 395.25 ±  2%  proc-vmstat.nr_inactive_file
>  11.50 ± 58%   +1397.8% 172.25 ± 93%  proc-vmstat.nr_mlock
> 970355 ±  4% +14.6%549 ±  8%  proc-vmstat.nr_written
> 197984+4.1% 206034proc-vmstat.nr_zone_active_file
> 357.50   +10.6% 395.25 ±  2%  
> proc-vmstat.nr_zone_inactive_file
> 192282+4.1% 200126
> proc-vmstat.nr_zone_write_pending
>7901465 ±  3% -14.0%6795016 ± 16%  proc-vmstat.pgalloc_movable
> 886101   +10.2% 976329proc-vmstat.pgfault
>  2.169e+12   +15.2%  2.497e+12perf-stat.branch-instructions
>   0.41-0.10.35perf-stat.branch-miss-rate%
>  31.19 ±  2%  +1.6   32.82perf-stat.cache-miss-rate%
>  9.116e+09+8.3%  9.869e+09perf-stat.cache-misses
>

Re: [LKP] [lkp-robot] [brd] 316ba5736c: aim7.jobs-per-min -11.2% regression

2018-07-09 Thread kemi

Hi, SeongJae
  Do you have any input for this regression? thanks

On 2018年06月04日 13:52, kernel test robot wrote:
> 
> Greeting,
> 
> FYI, we noticed a -11.2% regression of aim7.jobs-per-min due to commit:
> 
> 
> commit: 316ba5736c9caa5dbcd84085989862d2df57431d ("brd: Mark as 
> non-rotational")
> https://git.kernel.org/cgit/linux/kernel/git/axboe/linux-block.git 
> for-4.18/block
> 
> in testcase: aim7
> on test machine: 40 threads Intel(R) Xeon(R) CPU E5-2690 v2 @ 3.00GHz with 
> 384G memory
> with following parameters:
> 
>   disk: 1BRD_48G
>   fs: btrfs
>   test: disk_rw
>   load: 1500
>   cpufreq_governor: performance
> 
> test-description: AIM7 is a traditional UNIX system level benchmark suite 
> which is used to test and measure the performance of multiuser system.
> test-url: https://sourceforge.net/projects/aimbench/files/aim-suite7/
> 
> 
> 
> Details are as below:
> -->
> 
> =
> compiler/cpufreq_governor/disk/fs/kconfig/load/rootfs/tbox_group/test/testcase:
>   
> gcc-7/performance/1BRD_48G/btrfs/x86_64-rhel-7.2/1500/debian-x86_64-2016-08-31.cgz/lkp-ivb-ep01/disk_rw/aim7
> 
> commit: 
>   522a777566 ("block: consolidate struct request timestamp fields")
>   316ba5736c ("brd: Mark as non-rotational")
> 
> 522a777566f56696 316ba5736c9caa5dbcd8408598 
>  -- 
>  %stddev %change %stddev
>  \  |\  
>  28321   -11.2%  25147aim7.jobs-per-min
> 318.19   +12.6% 358.23aim7.time.elapsed_time
> 318.19   +12.6% 358.23aim7.time.elapsed_time.max
>1437526 ±  2% +14.6%1646849 ±  2%  
> aim7.time.involuntary_context_switches
>  11986   +14.2%  13691aim7.time.system_time
>  73.06 ±  2%  -3.6%  70.43aim7.time.user_time
>2449470 ±  2% -25.0%1837521 ±  4%  
> aim7.time.voluntary_context_switches
>  20.25 ± 58%   +1681.5% 360.75 ±109%  numa-meminfo.node1.Mlocked
> 456062   -16.3% 381859softirqs.SCHED
>   9015 ±  7% -21.3%   7098 ± 22%  meminfo.CmaFree
>  47.50 ± 58%   +1355.8% 691.50 ± 92%  meminfo.Mlocked
>   5.24 ±  3%  -1.23.99 ±  2%  mpstat.cpu.idle%
>   0.61 ±  2%  -0.10.52 ±  2%  mpstat.cpu.usr%
>  16627   +12.8%  18762 ±  4%  slabinfo.Acpi-State.active_objs
>  16627   +12.9%  18775 ±  4%  slabinfo.Acpi-State.num_objs
>  57.00 ±  2% +17.5%  67.00vmstat.procs.r
>  20936   -24.8%  15752 ±  2%  vmstat.system.cs
>  45474-1.7%  44681vmstat.system.in
>   6.50 ± 59%   +1157.7%  81.75 ± 75%  numa-vmstat.node0.nr_mlock
> 242870 ±  3% +13.2% 274913 ±  7%  numa-vmstat.node0.nr_written
>   2278 ±  7% -22.6%   1763 ± 21%  numa-vmstat.node1.nr_free_cma
>   4.75 ± 58%   +1789.5%  89.75 ±109%  numa-vmstat.node1.nr_mlock
>   88018135 ±  3% -48.9%   44980457 ±  7%  cpuidle.C1.time
>1398288 ±  3% -51.1% 683493 ±  9%  cpuidle.C1.usage
>3499814 ±  2% -38.5%2153158 ±  5%  cpuidle.C1E.time
>  52722 ±  4% -45.6%  28692 ±  6%  cpuidle.C1E.usage
>9865857 ±  3% -40.1%5905155 ±  5%  cpuidle.C3.time
>  69656 ±  2% -42.6%  39990 ±  5%  cpuidle.C3.usage
> 590856 ±  2% -12.3% 517910cpuidle.C6.usage
>  46160 ±  7% -53.7%  21372 ± 11%  cpuidle.POLL.time
>   1716 ±  7% -46.6% 916.25 ± 14%  cpuidle.POLL.usage
> 197656+4.1% 205732proc-vmstat.nr_active_file
> 191867+4.1% 199647proc-vmstat.nr_dirty
> 509282+1.6% 517318proc-vmstat.nr_file_pages
>   2282 ±  8% -24.4%   1725 ± 22%  proc-vmstat.nr_free_cma
> 357.50   +10.6% 395.25 ±  2%  proc-vmstat.nr_inactive_file
>  11.50 ± 58%   +1397.8% 172.25 ± 93%  proc-vmstat.nr_mlock
> 970355 ±  4% +14.6%549 ±  8%  proc-vmstat.nr_written
> 197984+4.1% 206034proc-vmstat.nr_zone_active_file
> 357.50   +10.6% 395.25 ±  2%  
> proc-vmstat.nr_zone_inactive_file
> 192282+4.1% 200126
> proc-vmstat.nr_zone_write_pending
>7901465 ±  3% -14.0%6795016 ± 16%  proc-vmstat.pgalloc_movable
> 886101   +10.2% 976329proc-vmstat.pgfault
>  2.169e+12   +15.2%  2.497e+12perf-stat.branch-instructions
>   0.41-0.10.35perf-stat.branch-miss-rate%
>  31.19 ±  2%  +1.6   32.82perf-stat.cache-miss-rate%
>  9.116e+09+8.3%  9.869e+09perf-stat.cache-misses
>

Re: [PATCH v2] objtool: move libelf detection to Kconfig from Makefile

2018-07-09 Thread Ingo Molnar



* Josh Poimboeuf  wrote:

> Since we switched the x86_64 default to the ORC unwinder, a lot of
> people have switched over.  But this patch will reverse (or at least
> slow down) that trend, because almost nobody has the libelf devel
> packaged installed by default.  So over time, it will effectively make
> frame pointers the default again in many cases.  That's exactly what we
> *don't* want to do.  It will also cause people to accidentally re-enable
> frame pointers when they thought they had ORC.

Yeah, agreed - turning ORC off like that is a non-starter.

Thanks,

Ingo

Re: [PATCH v2] objtool: move libelf detection to Kconfig from Makefile

2018-07-09 Thread Ingo Molnar



* Josh Poimboeuf  wrote:

> Since we switched the x86_64 default to the ORC unwinder, a lot of
> people have switched over.  But this patch will reverse (or at least
> slow down) that trend, because almost nobody has the libelf devel
> packaged installed by default.  So over time, it will effectively make
> frame pointers the default again in many cases.  That's exactly what we
> *don't* want to do.  It will also cause people to accidentally re-enable
> frame pointers when they thought they had ORC.

Yeah, agreed - turning ORC off like that is a non-starter.

Thanks,

Ingo

Re: [PATCH -mm -v4 02/21] mm, THP, swap: Make CONFIG_THP_SWAP depends on CONFIG_SWAP

2018-07-09 Thread Huang, Ying

Dave Hansen  writes:

> On 07/09/2018 06:19 PM, Huang, Ying wrote:
>> Dave Hansen  writes:
>> 
  config THP_SWAP
def_bool y
 -  depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP
 +  depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP
help
>>>
>>> This seems like a bug-fix.  Is there a reason this didn't cause problems
>>> up to now?
>> Yes.  The original code has some problem in theory, but not in practice
>> because all code enclosed by
>> 
>> #ifdef CONFIG_THP_SWAP
>> #endif
>> 
>> are in swapfile.c.  But that will be not true in this patchset.
>
> That's great info for the changelog.

Sure.  Will add it.

Best Regards,
Huang, Ying

Re: [PATCH -mm -v4 02/21] mm, THP, swap: Make CONFIG_THP_SWAP depends on CONFIG_SWAP

2018-07-09 Thread Huang, Ying

Dave Hansen  writes:

> On 07/09/2018 06:19 PM, Huang, Ying wrote:
>> Dave Hansen  writes:
>> 
  config THP_SWAP
def_bool y
 -  depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP
 +  depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP
help
>>>
>>> This seems like a bug-fix.  Is there a reason this didn't cause problems
>>> up to now?
>> Yes.  The original code has some problem in theory, but not in practice
>> because all code enclosed by
>> 
>> #ifdef CONFIG_THP_SWAP
>> #endif
>> 
>> are in swapfile.c.  But that will be not true in this patchset.
>
> That's great info for the changelog.

Sure.  Will add it.

Best Regards,
Huang, Ying

RE: [PATCH] mmc: core: improve rationality of bus width setting for HS400es

2018-07-09 Thread 方洪杰

> On 9 July 2018 at 08:47, Hongjie Fang  wrote:
> > mmc_select_hs400es() calls mmc_select_bus_width() which will try to
> > set 4bit transfer mode if fail to set 8bit mode. The problem is that
> > the bus width should not be set to 4bit in HS400es mode.
> 
> I guess it fails because there is something wrong. Can you please
> elaborate under what circumstance the problem occurs?
> 
This problem very occasionally occurred when system resume from
suspend state. The card stuck in the programming state after setting
8bit mode. After that, mmc_select_bus_width() will continue to set
4bit mode and return without errors. Of course, the following R/W
requests will be all failing.

The problem scene is hard to reproduce. Maybe it happened
because there is something wrong about bus clock or core voltage
stability when system do resume.

> >
> > Signed-off-by: Hongjie Fang 
> > ---
> >  drivers/mmc/core/mmc.c | 23 +--
> >  1 file changed, 21 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
> > index 4466f5d..94c3cc5 100644
> > --- a/drivers/mmc/core/mmc.c
> > +++ b/drivers/mmc/core/mmc.c
> > @@ -1337,9 +1337,28 @@ static int mmc_select_hs400es(struct mmc_card *card)
> > if (err)
> > goto out_err;
> >
> > -   err = mmc_select_bus_width(card);
> > -   if (err < 0)
> > +   /* Switch card to 8 bit mode */
> > +   err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
> > +EXT_CSD_BUS_WIDTH,
> > +EXT_CSD_BUS_WIDTH_8,
> > +card->ext_csd.generic_cmd6_time);
> > +   if (err) {
> > +   pr_err("%s: switch to 8 bit mode for hs400es failed, 
> > err:%d\n",
> > +   mmc_hostname(host), err);
> > +   goto out_err;
> > +   }
> > +
> > +   mmc_set_bus_width(host, MMC_BUS_WIDTH_8);
> > +
> > +   if (!(host->caps & MMC_CAP_BUS_WIDTH_TEST))
> > +   err = mmc_compare_ext_csds(card, MMC_BUS_WIDTH_8);
> > +   else
> > +   err = mmc_bus_test(card, MMC_BUS_WIDTH_8);
> > +   if (err) {
> > +   pr_err("%s: test 8 bit mode for hs400es failed, err:%d\n",
> > +   mmc_hostname(host), err);
> > goto out_err;
> 
> Lot's open coding, no I don't like it.
> 
> Wouldn't it instead be possible to check the return value from
> mmc_select_bus_width(), as it should tell what width it manage to
> select. Then if it isn't 8-bit, then we should bail out. Does that
> work?
> 
After setting 4bit mode, it can't work because it cannot back to
HS200 mode like the HS400 mode.
For HS400es, if fail to set 8bit mode, maybe it is more reasonable
that return error directly.

> > +   }
> >
> > /* Switch card to HS mode */
> > err = __mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
> > --
> > 1.9.1
> >
> 
> Kind regards
> Uffe


B
Hongjie

RE: [PATCH] mmc: core: improve rationality of bus width setting for HS400es

2018-07-09 Thread 方洪杰

> On 9 July 2018 at 08:47, Hongjie Fang  wrote:
> > mmc_select_hs400es() calls mmc_select_bus_width() which will try to
> > set 4bit transfer mode if fail to set 8bit mode. The problem is that
> > the bus width should not be set to 4bit in HS400es mode.
> 
> I guess it fails because there is something wrong. Can you please
> elaborate under what circumstance the problem occurs?
> 
This problem very occasionally occurred when system resume from
suspend state. The card stuck in the programming state after setting
8bit mode. After that, mmc_select_bus_width() will continue to set
4bit mode and return without errors. Of course, the following R/W
requests will be all failing.

The problem scene is hard to reproduce. Maybe it happened
because there is something wrong about bus clock or core voltage
stability when system do resume.

> >
> > Signed-off-by: Hongjie Fang 
> > ---
> >  drivers/mmc/core/mmc.c | 23 +--
> >  1 file changed, 21 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
> > index 4466f5d..94c3cc5 100644
> > --- a/drivers/mmc/core/mmc.c
> > +++ b/drivers/mmc/core/mmc.c
> > @@ -1337,9 +1337,28 @@ static int mmc_select_hs400es(struct mmc_card *card)
> > if (err)
> > goto out_err;
> >
> > -   err = mmc_select_bus_width(card);
> > -   if (err < 0)
> > +   /* Switch card to 8 bit mode */
> > +   err = mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
> > +EXT_CSD_BUS_WIDTH,
> > +EXT_CSD_BUS_WIDTH_8,
> > +card->ext_csd.generic_cmd6_time);
> > +   if (err) {
> > +   pr_err("%s: switch to 8 bit mode for hs400es failed, 
> > err:%d\n",
> > +   mmc_hostname(host), err);
> > +   goto out_err;
> > +   }
> > +
> > +   mmc_set_bus_width(host, MMC_BUS_WIDTH_8);
> > +
> > +   if (!(host->caps & MMC_CAP_BUS_WIDTH_TEST))
> > +   err = mmc_compare_ext_csds(card, MMC_BUS_WIDTH_8);
> > +   else
> > +   err = mmc_bus_test(card, MMC_BUS_WIDTH_8);
> > +   if (err) {
> > +   pr_err("%s: test 8 bit mode for hs400es failed, err:%d\n",
> > +   mmc_hostname(host), err);
> > goto out_err;
> 
> Lot's open coding, no I don't like it.
> 
> Wouldn't it instead be possible to check the return value from
> mmc_select_bus_width(), as it should tell what width it manage to
> select. Then if it isn't 8-bit, then we should bail out. Does that
> work?
> 
After setting 4bit mode, it can't work because it cannot back to
HS200 mode like the HS400 mode.
For HS400es, if fail to set 8bit mode, maybe it is more reasonable
that return error directly.

> > +   }
> >
> > /* Switch card to HS mode */
> > err = __mmc_switch(card, EXT_CSD_CMD_SET_NORMAL,
> > --
> > 1.9.1
> >
> 
> Kind regards
> Uffe


B
Hongjie

[PATCH 1/2] arm: dts: mt7623: cleanup MT7623N NAND dts file

2018-07-09 Thread Ryder Lee

Normally, we didn't release this kind of baord to user. This specific
board exists only in the early stage of development inside MediaTek -
and that may confuse peoples.

Hence this patch removes related files accordingly.

Cc: John Crispin 
Cc: Sean Wang 
Signed-off-by: Ryder Lee 
---
 arch/arm/boot/dts/Makefile |  1 -
 arch/arm/boot/dts/mt7623n-rfb-nand.dts | 73 -
 arch/arm/boot/dts/mt7623n-rfb.dtsi | 86 --
 3 files changed, 160 deletions(-)
 delete mode 100644 arch/arm/boot/dts/mt7623n-rfb-nand.dts
 delete mode 100644 arch/arm/boot/dts/mt7623n-rfb.dtsi

diff --git a/arch/arm/boot/dts/Makefile b/arch/arm/boot/dts/Makefile
index 37a3de7..dde494e 100644
--- a/arch/arm/boot/dts/Makefile
+++ b/arch/arm/boot/dts/Makefile
@@ -1168,7 +1168,6 @@ dtb-$(CONFIG_ARCH_MEDIATEK) += \
mt7623a-rfb-emmc.dtb \
mt7623a-rfb-nand.dtb \
mt7623n-rfb-emmc.dtb \
-   mt7623n-rfb-nand.dtb \
mt7623n-bananapi-bpi-r2.dtb \
mt8127-moose.dtb \
mt8135-evbp1.dtb
diff --git a/arch/arm/boot/dts/mt7623n-rfb-nand.dts 
b/arch/arm/boot/dts/mt7623n-rfb-nand.dts
deleted file mode 100644
index 96ff3c9..000
--- a/arch/arm/boot/dts/mt7623n-rfb-nand.dts
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2017 MediaTek Inc.
- * Author: John Crispin 
- *
- */
-
-/dts-v1/;
-#include "mt7623n-rfb.dtsi"
-
-/ {
-   model = "MediaTek MT7623N NAND reference board";
-   compatible = "mediatek,mt7623n-rfb-nand", "mediatek,mt7623";
-};
-
- {
-   status = "okay";
-};
-
- {
-   status = "okay";
-   pinctrl-names = "default";
-   pinctrl-0 = <_pins_default>;
-
-   nand@0 {
-   reg = <0>;
-   spare_per_sector = <64>;
-   nand-ecc-mode = "hw";
-   nand-ecc-strength = <12>;
-   nand-ecc-step-size = <1024>;
-
-   partitions {
-   compatible = "fixed-partitions";
-   #address-cells = <1>;
-   #size-cells = <1>;
-
-   partition@0 {
-   label = "preloader";
-   reg = <0x0 0x4>;
-   };
-
-   partition@4 {
-   label = "uboot";
-   reg = <0x4 0x8>;
-   };
-
-   partition@c {
-   label = "uboot-env";
-   reg = <0xC 0x4>;
-   };
-
-   partition@14 {
-   label = "bootimg";
-   reg = <0x14 0x200>;
-   };
-
-   partition@214 {
-   label = "recovery";
-   reg = <0x214 0x200>;
-   };
-
-   partition@414 {
-   label = "rootfs";
-   reg = <0x414 0x100>;
-   };
-
-   partition@514 {
-   label = "usrdata";
-   reg = <0x514 0x100>;
-   };
-   };
-   };
-};
diff --git a/arch/arm/boot/dts/mt7623n-rfb.dtsi 
b/arch/arm/boot/dts/mt7623n-rfb.dtsi
deleted file mode 100644
index 5c5cc7d..000
--- a/arch/arm/boot/dts/mt7623n-rfb.dtsi
+++ /dev/null
@@ -1,86 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2017 MediaTek Inc.
- * Author: John Crispin 
- *Sean Wang 
- *
- */
-
-/dts-v1/;
-#include "mt7623.dtsi"
-#include "mt6323.dtsi"
-
-/ {
-   aliases {
-   serial0 = 
-   serial1 = 
-   serial2 = 
-   };
-
-   chosen {
-   stdout-path = "serial2:115200n8";
-   };
-
-   cpus {
-   cpu0 {
-   proc-supply = <_vproc_reg>;
-   };
-
-   cpu1 {
-   proc-supply = <_vproc_reg>;
-   };
-
-   cpu2 {
-   proc-supply = <_vproc_reg>;
-   };
-
-   cpu3 {
-   proc-supply = <_vproc_reg>;
-   };
-   };
-
-   memory@8000 {
-   device_type = "memory";
-   reg = <0 0x8000 0 0x4000>;
-   };
-
-   usb_p1_vbus: regulator-5v {
-   compatible = "regulator-fixed";
-   regulator-name = "usb_vbus";
-   regulator-min-microvolt = <500>;
-   regulator-max-microvolt = <500>;
-   gpio = < 135 GPIO_ACTIVE_HIGH>;
-   enable-active-high;
-   };
-};
-
- {
-   vmmc-supply = <_vemc3v3_reg>;
-   vqmmc-supply = <_vio18_reg>;
-};
-
- {
-   vmmc-supply

Re: linux-next: manual merge of the rdma tree with the rdma-fixes tree

2018-07-09 Thread Leon Romanovsky

On Tue, Jul 10, 2018 at 11:17:40AM +1000, Stephen Rothwell wrote:
> Hi all,
>
> Today's linux-next merge of the rdma tree got a conflict in:
>
>   drivers/infiniband/core/uverbs_cmd.c
>
> between commit:
>
>   4fae7f170416 ("RDMA/uverbs: Fix slab-out-of-bounds in 
> ib_uverbs_ex_create_flow")
>
> from the rdma-fixes tree and commit:
>
>   2cc1e3b80942 ("IB/uverbs: Replace file->ucontext with file in uverbs_cmd.c")
>
> from the rdma tree.
>
> I fixed it up (see below) and can carry the fix as necessary. This
> is now fixed as far as linux-next is concerned, but any non trivial
> conflicts should be mentioned to your upstream maintainer when your tree
> is submitted for merging.  You may also want to consider cooperating
> with the maintainer of the conflicting tree to minimise any particularly
> complex conflicts.
>
> --

Thanks Stephen, Looks good.


signature.asc
Description: PGP signature

[PATCH 2/2] dt-bindings: arm: mediatek: cleanup MT7623N reference boards

2018-07-09 Thread Ryder Lee

Cleanup binding document to get rid of unsupported reference boards
for MT7623N.

Cc: John Crispin 
Cc: Sean Wang 
Signed-off-by: Ryder Lee 
---
 Documentation/devicetree/bindings/arm/mediatek.txt | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/Documentation/devicetree/bindings/arm/mediatek.txt 
b/Documentation/devicetree/bindings/arm/mediatek.txt
index 7d21ab3..c07ddf4 100644
--- a/Documentation/devicetree/bindings/arm/mediatek.txt
+++ b/Documentation/devicetree/bindings/arm/mediatek.txt
@@ -59,9 +59,6 @@ Supported boards:
 - Reference board for MT7623n with eMMC:
 Required root node properties:
   - compatible = "mediatek,mt7623n-rfb-emmc", "mediatek,mt7623";
-- Reference  board for MT7623n with NAND:
-Required root node properties:
-  - compatible = "mediatek,mt7623n-rfb-nand", "mediatek,mt7623";
 - Bananapi BPI-R2 board:
   - compatible = "bananapi,bpi-r2", "mediatek,mt7623";
 - MTK mt8127 tablet moose EVB:
-- 
1.9.1

[PATCH 1/2] arm: dts: mt7623: cleanup MT7623N NAND dts file

2018-07-09 Thread Ryder Lee

Normally, we didn't release this kind of baord to user. This specific
board exists only in the early stage of development inside MediaTek -
and that may confuse peoples.

Hence this patch removes related files accordingly.

Cc: John Crispin 
Cc: Sean Wang 
Signed-off-by: Ryder Lee 
---
 arch/arm/boot/dts/Makefile |  1 -
 arch/arm/boot/dts/mt7623n-rfb-nand.dts | 73 -
 arch/arm/boot/dts/mt7623n-rfb.dtsi | 86 --
 3 files changed, 160 deletions(-)
 delete mode 100644 arch/arm/boot/dts/mt7623n-rfb-nand.dts
 delete mode 100644 arch/arm/boot/dts/mt7623n-rfb.dtsi

diff --git a/arch/arm/boot/dts/Makefile b/arch/arm/boot/dts/Makefile
index 37a3de7..dde494e 100644
--- a/arch/arm/boot/dts/Makefile
+++ b/arch/arm/boot/dts/Makefile
@@ -1168,7 +1168,6 @@ dtb-$(CONFIG_ARCH_MEDIATEK) += \
mt7623a-rfb-emmc.dtb \
mt7623a-rfb-nand.dtb \
mt7623n-rfb-emmc.dtb \
-   mt7623n-rfb-nand.dtb \
mt7623n-bananapi-bpi-r2.dtb \
mt8127-moose.dtb \
mt8135-evbp1.dtb
diff --git a/arch/arm/boot/dts/mt7623n-rfb-nand.dts 
b/arch/arm/boot/dts/mt7623n-rfb-nand.dts
deleted file mode 100644
index 96ff3c9..000
--- a/arch/arm/boot/dts/mt7623n-rfb-nand.dts
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2017 MediaTek Inc.
- * Author: John Crispin 
- *
- */
-
-/dts-v1/;
-#include "mt7623n-rfb.dtsi"
-
-/ {
-   model = "MediaTek MT7623N NAND reference board";
-   compatible = "mediatek,mt7623n-rfb-nand", "mediatek,mt7623";
-};
-
- {
-   status = "okay";
-};
-
- {
-   status = "okay";
-   pinctrl-names = "default";
-   pinctrl-0 = <_pins_default>;
-
-   nand@0 {
-   reg = <0>;
-   spare_per_sector = <64>;
-   nand-ecc-mode = "hw";
-   nand-ecc-strength = <12>;
-   nand-ecc-step-size = <1024>;
-
-   partitions {
-   compatible = "fixed-partitions";
-   #address-cells = <1>;
-   #size-cells = <1>;
-
-   partition@0 {
-   label = "preloader";
-   reg = <0x0 0x4>;
-   };
-
-   partition@4 {
-   label = "uboot";
-   reg = <0x4 0x8>;
-   };
-
-   partition@c {
-   label = "uboot-env";
-   reg = <0xC 0x4>;
-   };
-
-   partition@14 {
-   label = "bootimg";
-   reg = <0x14 0x200>;
-   };
-
-   partition@214 {
-   label = "recovery";
-   reg = <0x214 0x200>;
-   };
-
-   partition@414 {
-   label = "rootfs";
-   reg = <0x414 0x100>;
-   };
-
-   partition@514 {
-   label = "usrdata";
-   reg = <0x514 0x100>;
-   };
-   };
-   };
-};
diff --git a/arch/arm/boot/dts/mt7623n-rfb.dtsi 
b/arch/arm/boot/dts/mt7623n-rfb.dtsi
deleted file mode 100644
index 5c5cc7d..000
--- a/arch/arm/boot/dts/mt7623n-rfb.dtsi
+++ /dev/null
@@ -1,86 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2017 MediaTek Inc.
- * Author: John Crispin 
- *Sean Wang 
- *
- */
-
-/dts-v1/;
-#include "mt7623.dtsi"
-#include "mt6323.dtsi"
-
-/ {
-   aliases {
-   serial0 = 
-   serial1 = 
-   serial2 = 
-   };
-
-   chosen {
-   stdout-path = "serial2:115200n8";
-   };
-
-   cpus {
-   cpu0 {
-   proc-supply = <_vproc_reg>;
-   };
-
-   cpu1 {
-   proc-supply = <_vproc_reg>;
-   };
-
-   cpu2 {
-   proc-supply = <_vproc_reg>;
-   };
-
-   cpu3 {
-   proc-supply = <_vproc_reg>;
-   };
-   };
-
-   memory@8000 {
-   device_type = "memory";
-   reg = <0 0x8000 0 0x4000>;
-   };
-
-   usb_p1_vbus: regulator-5v {
-   compatible = "regulator-fixed";
-   regulator-name = "usb_vbus";
-   regulator-min-microvolt = <500>;
-   regulator-max-microvolt = <500>;
-   gpio = < 135 GPIO_ACTIVE_HIGH>;
-   enable-active-high;
-   };
-};
-
- {
-   vmmc-supply = <_vemc3v3_reg>;
-   vqmmc-supply = <_vio18_reg>;
-};
-
- {
-   vmmc-supply

Re: linux-next: manual merge of the rdma tree with the rdma-fixes tree

2018-07-09 Thread Leon Romanovsky

On Tue, Jul 10, 2018 at 11:17:40AM +1000, Stephen Rothwell wrote:
> Hi all,
>
> Today's linux-next merge of the rdma tree got a conflict in:
>
>   drivers/infiniband/core/uverbs_cmd.c
>
> between commit:
>
>   4fae7f170416 ("RDMA/uverbs: Fix slab-out-of-bounds in 
> ib_uverbs_ex_create_flow")
>
> from the rdma-fixes tree and commit:
>
>   2cc1e3b80942 ("IB/uverbs: Replace file->ucontext with file in uverbs_cmd.c")
>
> from the rdma tree.
>
> I fixed it up (see below) and can carry the fix as necessary. This
> is now fixed as far as linux-next is concerned, but any non trivial
> conflicts should be mentioned to your upstream maintainer when your tree
> is submitted for merging.  You may also want to consider cooperating
> with the maintainer of the conflicting tree to minimise any particularly
> complex conflicts.
>
> --

Thanks Stephen, Looks good.


signature.asc
Description: PGP signature

[PATCH 2/2] dt-bindings: arm: mediatek: cleanup MT7623N reference boards

2018-07-09 Thread Ryder Lee

Cleanup binding document to get rid of unsupported reference boards
for MT7623N.

Cc: John Crispin 
Cc: Sean Wang 
Signed-off-by: Ryder Lee 
---
 Documentation/devicetree/bindings/arm/mediatek.txt | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/Documentation/devicetree/bindings/arm/mediatek.txt 
b/Documentation/devicetree/bindings/arm/mediatek.txt
index 7d21ab3..c07ddf4 100644
--- a/Documentation/devicetree/bindings/arm/mediatek.txt
+++ b/Documentation/devicetree/bindings/arm/mediatek.txt
@@ -59,9 +59,6 @@ Supported boards:
 - Reference board for MT7623n with eMMC:
 Required root node properties:
   - compatible = "mediatek,mt7623n-rfb-emmc", "mediatek,mt7623";
-- Reference  board for MT7623n with NAND:
-Required root node properties:
-  - compatible = "mediatek,mt7623n-rfb-nand", "mediatek,mt7623";
 - Bananapi BPI-R2 board:
   - compatible = "bananapi,bpi-r2", "mediatek,mt7623";
 - MTK mt8127 tablet moose EVB:
-- 
1.9.1

Re: [PATCH 7/7] aio: implement io_pgetevents

2018-07-09 Thread Andrei Vagin

On Sun, Jul 08, 2018 at 10:44:00PM +0200, Christoph Hellwig wrote:
> On Wed, Jul 04, 2018 at 04:21:16PM +0200, Adrian Reber wrote:
> > In file included from /usr/include/linux/signal.h:5,
> >  from /usr/include/linux/aio_abi.h:32,
> >  from include.c:2:
> > /usr/include/asm/signal.h:16:23: error: conflicting types for ‘sigset_t’
> >  typedef unsigned long sigset_t;
> >^~~~
> > In file included from /usr/include/signal.h:35,
> >  from include.c:1:
> > /usr/include/bits/types/sigset_t.h:7:20: note: previous declaration of 
> > ‘sigset_t’ was here
> >  typedef __sigset_t sigset_t;
> 
> I guess we could do something like the patch below, although it is
> rather ugly:
> 
> diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
> index 75846164290e..b7705ad66d78 100644
> --- a/include/uapi/linux/aio_abi.h
> +++ b/include/uapi/linux/aio_abi.h
> @@ -29,7 +29,11 @@
>  
>  #include 
>  #include 
> +#ifdef __KERNEL__
>  #include 
> +#else
> +#include 
> +#endif

I think we can not do this because this header specifies the kernel
API, but signal.h is provided by libc and sigset_t can be defined
differently there:

[avagin@laptop ~]$ cat test.c 
#ifdef TEST_LINUX_SIGNAL
#  include 
#  include 
#else
#  include 
#endif
#include 
int main()
{
printf("sizeof(sigset_t) = %d\n", sizeof(sigset_t));
return 0;
}
[avagin@laptop ~]$ gcc -DTEST_LINUX_SIGNAL test.c && ./a.out
sizeof(sigset_t) = 8
[avagin@laptop ~]$ gcc test.c && ./a.out
sizeof(sigset_t) = 128

[avagin@laptop include]$ rpm -qf /usr/include/signal.h 
glibc-headers-2.27-8.fc28.i686
glibc-headers-2.27-8.fc28.x86_64
[avagin@laptop include]$ rpm -qf /usr/include/linux/signal.h 
kernel-headers-4.16.5-300.fc28.x86_64

>  #include 
>  
>  typedef __kernel_ulong_t aio_context_t;

Re: [PATCH 7/7] aio: implement io_pgetevents

2018-07-09 Thread Andrei Vagin

On Sun, Jul 08, 2018 at 10:44:00PM +0200, Christoph Hellwig wrote:
> On Wed, Jul 04, 2018 at 04:21:16PM +0200, Adrian Reber wrote:
> > In file included from /usr/include/linux/signal.h:5,
> >  from /usr/include/linux/aio_abi.h:32,
> >  from include.c:2:
> > /usr/include/asm/signal.h:16:23: error: conflicting types for ‘sigset_t’
> >  typedef unsigned long sigset_t;
> >^~~~
> > In file included from /usr/include/signal.h:35,
> >  from include.c:1:
> > /usr/include/bits/types/sigset_t.h:7:20: note: previous declaration of 
> > ‘sigset_t’ was here
> >  typedef __sigset_t sigset_t;
> 
> I guess we could do something like the patch below, although it is
> rather ugly:
> 
> diff --git a/include/uapi/linux/aio_abi.h b/include/uapi/linux/aio_abi.h
> index 75846164290e..b7705ad66d78 100644
> --- a/include/uapi/linux/aio_abi.h
> +++ b/include/uapi/linux/aio_abi.h
> @@ -29,7 +29,11 @@
>  
>  #include 
>  #include 
> +#ifdef __KERNEL__
>  #include 
> +#else
> +#include 
> +#endif

I think we can not do this because this header specifies the kernel
API, but signal.h is provided by libc and sigset_t can be defined
differently there:

[avagin@laptop ~]$ cat test.c 
#ifdef TEST_LINUX_SIGNAL
#  include 
#  include 
#else
#  include 
#endif
#include 
int main()
{
printf("sizeof(sigset_t) = %d\n", sizeof(sigset_t));
return 0;
}
[avagin@laptop ~]$ gcc -DTEST_LINUX_SIGNAL test.c && ./a.out
sizeof(sigset_t) = 8
[avagin@laptop ~]$ gcc test.c && ./a.out
sizeof(sigset_t) = 128

[avagin@laptop include]$ rpm -qf /usr/include/signal.h 
glibc-headers-2.27-8.fc28.i686
glibc-headers-2.27-8.fc28.x86_64
[avagin@laptop include]$ rpm -qf /usr/include/linux/signal.h 
kernel-headers-4.16.5-300.fc28.x86_64

>  #include 
>  
>  typedef __kernel_ulong_t aio_context_t;

Re: [PATCH 0/12 V2] ipc: cleanups & bugfixes, rhashtable update

2018-07-09 Thread Manfred Spraul


Hi Davidlohr,

On 07/09/2018 10:09 PM, Davidlohr Bueso wrote:

On Mon, 09 Jul 2018, Manfred Spraul wrote:


@Davidlohr:
Please double check that I have taken the correct patches, and
that I didn't break anything.


Everything seems ok.

Patch 8 had an alternative patch that didn't change nowarn semantics for
the rhashtable resizing operations (https://lkml.org/lkml/2018/6/22/732),
but nobody complained about the one you picked up (and also has 
Michal's ack).



Which patch do you prefer?
I have seen two versions, and if I have picked up the wrong one, then I 
can change it.


--
    Manfred

Re: [PATCH 0/12 V2] ipc: cleanups & bugfixes, rhashtable update

2018-07-09 Thread Manfred Spraul


Hi Davidlohr,

On 07/09/2018 10:09 PM, Davidlohr Bueso wrote:

On Mon, 09 Jul 2018, Manfred Spraul wrote:


@Davidlohr:
Please double check that I have taken the correct patches, and
that I didn't break anything.


Everything seems ok.

Patch 8 had an alternative patch that didn't change nowarn semantics for
the rhashtable resizing operations (https://lkml.org/lkml/2018/6/22/732),
but nobody complained about the one you picked up (and also has 
Michal's ack).



Which patch do you prefer?
I have seen two versions, and if I have picked up the wrong one, then I 
can change it.


--
    Manfred

Re: [PATCH V2] MIPS: implement smp_cond_load_acquire() for Loongson-3

2018-07-09 Thread Huacai Chen

Hi, Paul and Peter,

I think we find the real root cause, READ_ONCE() doesn't need any
barriers, the problematic code is queued_spin_lock_slowpath() in
kernel/locking/qspinlock.c:

if (old & _Q_TAIL_MASK) {
prev = decode_tail(old);

/* Link @node into the waitqueue. */
WRITE_ONCE(prev->next, node);

pv_wait_node(node, prev);
arch_mcs_spin_lock_contended(>locked);

/*
 * While waiting for the MCS lock, the next pointer may have
 * been set by another lock waiter. We optimistically load
 * the next pointer & prefetch the cacheline for writing
 * to reduce latency in the upcoming MCS unlock operation.
 */
next = READ_ONCE(node->next);
if (next)
prefetchw(next);
}

After WRITE_ONCE(prev->next, node); arch_mcs_spin_lock_contended()
enter a READ_ONCE() loop, so the effect of WRITE_ONCE() is invisible
by other cores because of the write buffer. As a result,
arch_mcs_spin_lock_contended() will wait for ever because the waiters
of prev->next will wait for ever. I think the right way to fix this is
flush SFB after this WRITE_ONCE(), but I don't have a good solution:
1, MIPS has wbflush() which can be used to flush SFB, but other archs
don't have;
2, Every arch has mb(), and add mb() after WRITE_ONCE() can actually
solve Loongson's problem, but in syntax, mb() is different from
wbflush();
3, Maybe we can define a Loongson-specific WRITE_ONCE(), but not every
WRITE_ONCE() need wbflush(), we only need wbflush() between
WRITE_ONCE() and a READ_ONCE() loop.

Any ideas?

Huacai


On Tue, Jul 10, 2018 at 12:49 AM, Paul Burton  wrote:
> Hi Huacai,
>
> On Mon, Jul 09, 2018 at 10:26:38AM +0800, Huacai Chen wrote:
>> After commit 7f56b58a92aaf2c ("locking/mcs: Use smp_cond_load_acquire()
>> in MCS spin loop") Loongson-3 fails to boot. This is because Loongson-3
>> has SFB (Store Fill Buffer) and the weak-ordering may cause READ_ONCE()
>> to get an old value in a tight loop. So in smp_cond_load_acquire() we
>> need a __smp_rmb() before the READ_ONCE() loop.
>>
>> This patch introduce a Loongson-specific smp_cond_load_acquire(). And
>> it should be backported to as early as linux-4.5, in which release the
>> smp_cond_acquire() is introduced.
>>
>> There may be other cases where memory barriers is needed, we will fix
>> them one by one.
>>
>> Cc: sta...@vger.kernel.org
>> Signed-off-by: Huacai Chen 
>> ---
>>  arch/mips/include/asm/barrier.h | 18 ++
>>  1 file changed, 18 insertions(+)
>>
>> diff --git a/arch/mips/include/asm/barrier.h 
>> b/arch/mips/include/asm/barrier.h
>> index a5eb1bb..e8c4c63 100644
>> --- a/arch/mips/include/asm/barrier.h
>> +++ b/arch/mips/include/asm/barrier.h
>> @@ -222,6 +222,24 @@
>>  #define __smp_mb__before_atomic()__smp_mb__before_llsc()
>>  #define __smp_mb__after_atomic() smp_llsc_mb()
>>
>> +#ifdef CONFIG_CPU_LOONGSON3
>> +/* Loongson-3 need a __smp_rmb() before READ_ONCE() loop */
>> +#define smp_cond_load_acquire(ptr, cond_expr)\
>> +({   \
>> + typeof(ptr) __PTR = (ptr);  \
>> + typeof(*ptr) VAL;   \
>> + __smp_rmb();\
>> + for (;;) {  \
>> + VAL = READ_ONCE(*__PTR);\
>> + if (cond_expr)  \
>> + break;  \
>> + cpu_relax();\
>> + }   \
>> + __smp_rmb();\
>> + VAL;\
>> +})
>> +#endif   /* CONFIG_CPU_LOONGSON3 */
>
> The discussion on v1 of this patch [1] seemed to converge on the view
> that Loongson suffers from the same problem as ARM platforms which
> enable the CONFIG_ARM_ERRATA_754327 workaround, and that we might
> require a similar workaround.
>
> Is there a reason you've not done that, and instead tweaked your patch
> that's specific to the smp_cond_load_acquire() case? I'm not comfortable
> with fixing just this one case when there could be many similar
> problematic pieces of code you just haven't hit yet.
>
> Please also keep the LKMM maintainers on copy for this - their feedback
> will be valuable & I'll be much more comfortable applying a workaround
> for Loongson's behavior here if it's one that they're OK with.
>
> Thanks,
> Paul
>
> [1] https://www.linux-mips.org/archives/linux-mips/2018-06/msg00139.html

Re: [PATCH V2] MIPS: implement smp_cond_load_acquire() for Loongson-3

2018-07-09 Thread Huacai Chen

Hi, Paul and Peter,

I think we find the real root cause, READ_ONCE() doesn't need any
barriers, the problematic code is queued_spin_lock_slowpath() in
kernel/locking/qspinlock.c:

if (old & _Q_TAIL_MASK) {
prev = decode_tail(old);

/* Link @node into the waitqueue. */
WRITE_ONCE(prev->next, node);

pv_wait_node(node, prev);
arch_mcs_spin_lock_contended(>locked);

/*
 * While waiting for the MCS lock, the next pointer may have
 * been set by another lock waiter. We optimistically load
 * the next pointer & prefetch the cacheline for writing
 * to reduce latency in the upcoming MCS unlock operation.
 */
next = READ_ONCE(node->next);
if (next)
prefetchw(next);
}

After WRITE_ONCE(prev->next, node); arch_mcs_spin_lock_contended()
enter a READ_ONCE() loop, so the effect of WRITE_ONCE() is invisible
by other cores because of the write buffer. As a result,
arch_mcs_spin_lock_contended() will wait for ever because the waiters
of prev->next will wait for ever. I think the right way to fix this is
flush SFB after this WRITE_ONCE(), but I don't have a good solution:
1, MIPS has wbflush() which can be used to flush SFB, but other archs
don't have;
2, Every arch has mb(), and add mb() after WRITE_ONCE() can actually
solve Loongson's problem, but in syntax, mb() is different from
wbflush();
3, Maybe we can define a Loongson-specific WRITE_ONCE(), but not every
WRITE_ONCE() need wbflush(), we only need wbflush() between
WRITE_ONCE() and a READ_ONCE() loop.

Any ideas?

Huacai


On Tue, Jul 10, 2018 at 12:49 AM, Paul Burton  wrote:
> Hi Huacai,
>
> On Mon, Jul 09, 2018 at 10:26:38AM +0800, Huacai Chen wrote:
>> After commit 7f56b58a92aaf2c ("locking/mcs: Use smp_cond_load_acquire()
>> in MCS spin loop") Loongson-3 fails to boot. This is because Loongson-3
>> has SFB (Store Fill Buffer) and the weak-ordering may cause READ_ONCE()
>> to get an old value in a tight loop. So in smp_cond_load_acquire() we
>> need a __smp_rmb() before the READ_ONCE() loop.
>>
>> This patch introduce a Loongson-specific smp_cond_load_acquire(). And
>> it should be backported to as early as linux-4.5, in which release the
>> smp_cond_acquire() is introduced.
>>
>> There may be other cases where memory barriers is needed, we will fix
>> them one by one.
>>
>> Cc: sta...@vger.kernel.org
>> Signed-off-by: Huacai Chen 
>> ---
>>  arch/mips/include/asm/barrier.h | 18 ++
>>  1 file changed, 18 insertions(+)
>>
>> diff --git a/arch/mips/include/asm/barrier.h 
>> b/arch/mips/include/asm/barrier.h
>> index a5eb1bb..e8c4c63 100644
>> --- a/arch/mips/include/asm/barrier.h
>> +++ b/arch/mips/include/asm/barrier.h
>> @@ -222,6 +222,24 @@
>>  #define __smp_mb__before_atomic()__smp_mb__before_llsc()
>>  #define __smp_mb__after_atomic() smp_llsc_mb()
>>
>> +#ifdef CONFIG_CPU_LOONGSON3
>> +/* Loongson-3 need a __smp_rmb() before READ_ONCE() loop */
>> +#define smp_cond_load_acquire(ptr, cond_expr)\
>> +({   \
>> + typeof(ptr) __PTR = (ptr);  \
>> + typeof(*ptr) VAL;   \
>> + __smp_rmb();\
>> + for (;;) {  \
>> + VAL = READ_ONCE(*__PTR);\
>> + if (cond_expr)  \
>> + break;  \
>> + cpu_relax();\
>> + }   \
>> + __smp_rmb();\
>> + VAL;\
>> +})
>> +#endif   /* CONFIG_CPU_LOONGSON3 */
>
> The discussion on v1 of this patch [1] seemed to converge on the view
> that Loongson suffers from the same problem as ARM platforms which
> enable the CONFIG_ARM_ERRATA_754327 workaround, and that we might
> require a similar workaround.
>
> Is there a reason you've not done that, and instead tweaked your patch
> that's specific to the smp_cond_load_acquire() case? I'm not comfortable
> with fixing just this one case when there could be many similar
> problematic pieces of code you just haven't hit yet.
>
> Please also keep the LKMM maintainers on copy for this - their feedback
> will be valuable & I'll be much more comfortable applying a workaround
> for Loongson's behavior here if it's one that they're OK with.
>
> Thanks,
> Paul
>
> [1] https://www.linux-mips.org/archives/linux-mips/2018-06/msg00139.html

Re: [PATCH v2] objtool: move libelf detection to Kconfig from Makefile

2018-07-09 Thread Josh Poimboeuf

On Tue, Jul 10, 2018 at 12:47:49PM +0900, Masahiro Yamada wrote:
> Hi.
> 
> 
> 2018-07-10 11:29 GMT+09:00 Josh Poimboeuf :
> > On Tue, Jul 10, 2018 at 10:35:16AM +0900, Masahiro Yamada wrote:
> >> Currently, users are allowed to enable STACK_VALIDATION regardless
> >> of the compiler capability.  The top-level Makefile warns or breaks
> >> the build if it turns out that the host compiler cannot link libelf.
> >>
> >> Move the libelf test to Kconfig so that users can enable the feature
> >> only when the host compiler can build the objtool.  The ugly check
> >> in the Makefile will go away.
> >>
> >> Signed-off-by: Masahiro Yamada 
> >> Acked-by: Josh Poimboeuf 
> >
> > Actually, looking at this again, I want to rescind my ack.
> >
> > This patches changes the behavior in a subtle (but important) way.
> >
> > Before, if I did "make defconfig", it would *always* choose the ORC
> > unwinder.  Then, if I didn't have libelf-devel, the build would fail and
> > it would tell me what I need to install.
> >
> > But now with this patch, if I do "make defconfig", the generated config
> > actually changes based on what I happen to have installed on my build
> > system.  If I don't have libelf-devel, then it silently chooses the
> > non-default unwinder (frame pointer).
> >
> > This is a subtle difference, but IMO it's dangerous, because it will
> > silently enable the frame pointer unwinder for the majority of new
> > systems, even though it's not the default.
> >
> > I strongly prefer the original behavior, because we really want to
> > encourage people to use the ORC unwinder, even if that means they have
> > to install a package to build it.
> >
> > Also -- in general -- I suspect that silently changing the defaults like
> > this will create a lot of bad surprises.  The output of "make defconfig"
> > should be predictable and not dependent on what RPMs I happen to have
> > installed.
> 
> 
> 
> Actually, we had similar discussion for stack protector.
> 
> 
> First, Kees liked to let the build fail
> instead of disabling the stack protector silently:
> 
> https://lkml.org/lkml/2018/2/9/697
> 
> 
> 
> Linus said:
> But yes, I also reacted to your earlier " It can't silently rewrite it
> to _REGULAR because the compiler support for _STRONG regressed."
> Because it damn well can. If the compiler doesn't support
> -fstack-protector-strong, we can just fall back on -fstack-protector.
> Silently. No extra crazy complex logic for that either.
> 
> (https://lkml.org/lkml/2018/2/10/281)
> 
> 
> 
> I hope this is the same pattern.

I wasn't a part of the -fstack-protector conversation, but I doubt it's
the same pattern.  We're trying to phase out frame pointers, for several
reasons.  One big reason is that they cause a general slowdown across
the entire kernel.

Since we switched the x86_64 default to the ORC unwinder, a lot of
people have switched over.  But this patch will reverse (or at least
slow down) that trend, because almost nobody has the libelf devel
packaged installed by default.  So over time, it will effectively make
frame pointers the default again in many cases.  That's exactly what we
*don't* want to do.  It will also cause people to accidentally re-enable
frame pointers when they thought they had ORC.

-- 
Josh

Re: [PATCH v2] objtool: move libelf detection to Kconfig from Makefile

2018-07-09 Thread Josh Poimboeuf

On Tue, Jul 10, 2018 at 12:47:49PM +0900, Masahiro Yamada wrote:
> Hi.
> 
> 
> 2018-07-10 11:29 GMT+09:00 Josh Poimboeuf :
> > On Tue, Jul 10, 2018 at 10:35:16AM +0900, Masahiro Yamada wrote:
> >> Currently, users are allowed to enable STACK_VALIDATION regardless
> >> of the compiler capability.  The top-level Makefile warns or breaks
> >> the build if it turns out that the host compiler cannot link libelf.
> >>
> >> Move the libelf test to Kconfig so that users can enable the feature
> >> only when the host compiler can build the objtool.  The ugly check
> >> in the Makefile will go away.
> >>
> >> Signed-off-by: Masahiro Yamada 
> >> Acked-by: Josh Poimboeuf 
> >
> > Actually, looking at this again, I want to rescind my ack.
> >
> > This patches changes the behavior in a subtle (but important) way.
> >
> > Before, if I did "make defconfig", it would *always* choose the ORC
> > unwinder.  Then, if I didn't have libelf-devel, the build would fail and
> > it would tell me what I need to install.
> >
> > But now with this patch, if I do "make defconfig", the generated config
> > actually changes based on what I happen to have installed on my build
> > system.  If I don't have libelf-devel, then it silently chooses the
> > non-default unwinder (frame pointer).
> >
> > This is a subtle difference, but IMO it's dangerous, because it will
> > silently enable the frame pointer unwinder for the majority of new
> > systems, even though it's not the default.
> >
> > I strongly prefer the original behavior, because we really want to
> > encourage people to use the ORC unwinder, even if that means they have
> > to install a package to build it.
> >
> > Also -- in general -- I suspect that silently changing the defaults like
> > this will create a lot of bad surprises.  The output of "make defconfig"
> > should be predictable and not dependent on what RPMs I happen to have
> > installed.
> 
> 
> 
> Actually, we had similar discussion for stack protector.
> 
> 
> First, Kees liked to let the build fail
> instead of disabling the stack protector silently:
> 
> https://lkml.org/lkml/2018/2/9/697
> 
> 
> 
> Linus said:
> But yes, I also reacted to your earlier " It can't silently rewrite it
> to _REGULAR because the compiler support for _STRONG regressed."
> Because it damn well can. If the compiler doesn't support
> -fstack-protector-strong, we can just fall back on -fstack-protector.
> Silently. No extra crazy complex logic for that either.
> 
> (https://lkml.org/lkml/2018/2/10/281)
> 
> 
> 
> I hope this is the same pattern.

I wasn't a part of the -fstack-protector conversation, but I doubt it's
the same pattern.  We're trying to phase out frame pointers, for several
reasons.  One big reason is that they cause a general slowdown across
the entire kernel.

Since we switched the x86_64 default to the ORC unwinder, a lot of
people have switched over.  But this patch will reverse (or at least
slow down) that trend, because almost nobody has the libelf devel
packaged installed by default.  So over time, it will effectively make
frame pointers the default again in many cases.  That's exactly what we
*don't* want to do.  It will also cause people to accidentally re-enable
frame pointers when they thought they had ORC.

-- 
Josh

Re: linux-next: build warnings after merge of the rdma tree

2018-07-09 Thread Jason Gunthorpe

On Tue, Jul 10, 2018 at 02:05:12PM +1000, Stephen Rothwell wrote:
> Hi Jason,
> 
> On Mon, 9 Jul 2018 21:41:57 -0600 Jason Gunthorpe  wrote:
> > 
> > What compiler is producing these? I got nothing from 0-day build
> > service or my local gcc-7..
> 
> The x86 compiler is a v7.3.1 cross compiler hosted on PowerPC LE and
> built from sources.

Curiouser and curiouser.. Mine is

gcc version 7.3.0 (Ubuntu 7.3.0-21ubuntu1~16.04) 

> > They are false positives and I guess we need to put the
> > uninitialized_var back that was hiding them.
> 
> Rats.  :-(

Indeed.. I dislike that macro.

Jason

Re: linux-next: build warnings after merge of the rdma tree

2018-07-09 Thread Jason Gunthorpe

On Tue, Jul 10, 2018 at 02:05:12PM +1000, Stephen Rothwell wrote:
> Hi Jason,
> 
> On Mon, 9 Jul 2018 21:41:57 -0600 Jason Gunthorpe  wrote:
> > 
> > What compiler is producing these? I got nothing from 0-day build
> > service or my local gcc-7..
> 
> The x86 compiler is a v7.3.1 cross compiler hosted on PowerPC LE and
> built from sources.

Curiouser and curiouser.. Mine is

gcc version 7.3.0 (Ubuntu 7.3.0-21ubuntu1~16.04) 

> > They are false positives and I guess we need to put the
> > uninitialized_var back that was hiding them.
> 
> Rats.  :-(

Indeed.. I dislike that macro.

Jason

Re: linux-next: build warnings after merge of the rdma tree

2018-07-09 Thread Stephen Rothwell

Hi Jason,

On Mon, 9 Jul 2018 21:41:57 -0600 Jason Gunthorpe  wrote:
> 
> What compiler is producing these? I got nothing from 0-day build
> service or my local gcc-7..

The x86 compiler is a v7.3.1 cross compiler hosted on PowerPC LE and
built from sources.

> They are false positives and I guess we need to put the
> uninitialized_var back that was hiding them.

Rats.  :-(

> Also curious that the powerpc compiler gets a different set..

The powerpc builds are done with:

$ gcc --version
gcc (Debian 7.3.0-21) 7.3.0

-- 
Cheers,
Stephen Rothwell


pgpbY9v8zxu_1.pgp
Description: OpenPGP digital signature

Re: linux-next: build warnings after merge of the rdma tree

2018-07-09 Thread Stephen Rothwell

Hi Jason,

On Mon, 9 Jul 2018 21:41:57 -0600 Jason Gunthorpe  wrote:
> 
> What compiler is producing these? I got nothing from 0-day build
> service or my local gcc-7..

The x86 compiler is a v7.3.1 cross compiler hosted on PowerPC LE and
built from sources.

> They are false positives and I guess we need to put the
> uninitialized_var back that was hiding them.

Rats.  :-(

> Also curious that the powerpc compiler gets a different set..

The powerpc builds are done with:

$ gcc --version
gcc (Debian 7.3.0-21) 7.3.0

-- 
Cheers,
Stephen Rothwell


pgpbY9v8zxu_1.pgp
Description: OpenPGP digital signature

Re: Kernel 4.17.4 lockup

2018-07-09 Thread Dave Hansen

On 07/09/2018 07:14 PM, H.J. Lu wrote:
>> I'd really want to see this reproduced without KASLR to make the oops
>> easier to read.  It would also be handy to try your workload with all
>> the pedantic debugging: KASAN, slab debugging, DEBUG_PAGE_ALLOC, etc...
>> and see if it still triggers.
> How can I turn them on at boot time?

The only thing you can add at boot time is slab debugging, and it's
probably the most useless of the three that I listed since you're not
actually seeing any slab corruption.

The rest are compile-time options.

Re: Kernel 4.17.4 lockup

2018-07-09 Thread Dave Hansen

On 07/09/2018 07:14 PM, H.J. Lu wrote:
>> I'd really want to see this reproduced without KASLR to make the oops
>> easier to read.  It would also be handy to try your workload with all
>> the pedantic debugging: KASAN, slab debugging, DEBUG_PAGE_ALLOC, etc...
>> and see if it still triggers.
> How can I turn them on at boot time?

The only thing you can add at boot time is slab debugging, and it's
probably the most useless of the three that I listed since you're not
actually seeing any slab corruption.

The rest are compile-time options.

Re: [PATCH v2] objtool: move libelf detection to Kconfig from Makefile

2018-07-09 Thread Masahiro Yamada

Hi.


2018-07-10 11:29 GMT+09:00 Josh Poimboeuf :
> On Tue, Jul 10, 2018 at 10:35:16AM +0900, Masahiro Yamada wrote:
>> Currently, users are allowed to enable STACK_VALIDATION regardless
>> of the compiler capability.  The top-level Makefile warns or breaks
>> the build if it turns out that the host compiler cannot link libelf.
>>
>> Move the libelf test to Kconfig so that users can enable the feature
>> only when the host compiler can build the objtool.  The ugly check
>> in the Makefile will go away.
>>
>> Signed-off-by: Masahiro Yamada 
>> Acked-by: Josh Poimboeuf 
>
> Actually, looking at this again, I want to rescind my ack.
>
> This patches changes the behavior in a subtle (but important) way.
>
> Before, if I did "make defconfig", it would *always* choose the ORC
> unwinder.  Then, if I didn't have libelf-devel, the build would fail and
> it would tell me what I need to install.
>
> But now with this patch, if I do "make defconfig", the generated config
> actually changes based on what I happen to have installed on my build
> system.  If I don't have libelf-devel, then it silently chooses the
> non-default unwinder (frame pointer).
>
> This is a subtle difference, but IMO it's dangerous, because it will
> silently enable the frame pointer unwinder for the majority of new
> systems, even though it's not the default.
>
> I strongly prefer the original behavior, because we really want to
> encourage people to use the ORC unwinder, even if that means they have
> to install a package to build it.
>
> Also -- in general -- I suspect that silently changing the defaults like
> this will create a lot of bad surprises.  The output of "make defconfig"
> should be predictable and not dependent on what RPMs I happen to have
> installed.



Actually, we had similar discussion for stack protector.


First, Kees liked to let the build fail
instead of disabling the stack protector silently:

https://lkml.org/lkml/2018/2/9/697



Linus said:
But yes, I also reacted to your earlier " It can't silently rewrite it
to _REGULAR because the compiler support for _STRONG regressed."
Because it damn well can. If the compiler doesn't support
-fstack-protector-strong, we can just fall back on -fstack-protector.
Silently. No extra crazy complex logic for that either.

(https://lkml.org/lkml/2018/2/10/281)



I hope this is the same pattern.



-- 
Best Regards
Masahiro Yamada

Re: [PATCH v2] objtool: move libelf detection to Kconfig from Makefile

2018-07-09 Thread Masahiro Yamada

Hi.


2018-07-10 11:29 GMT+09:00 Josh Poimboeuf :
> On Tue, Jul 10, 2018 at 10:35:16AM +0900, Masahiro Yamada wrote:
>> Currently, users are allowed to enable STACK_VALIDATION regardless
>> of the compiler capability.  The top-level Makefile warns or breaks
>> the build if it turns out that the host compiler cannot link libelf.
>>
>> Move the libelf test to Kconfig so that users can enable the feature
>> only when the host compiler can build the objtool.  The ugly check
>> in the Makefile will go away.
>>
>> Signed-off-by: Masahiro Yamada 
>> Acked-by: Josh Poimboeuf 
>
> Actually, looking at this again, I want to rescind my ack.
>
> This patches changes the behavior in a subtle (but important) way.
>
> Before, if I did "make defconfig", it would *always* choose the ORC
> unwinder.  Then, if I didn't have libelf-devel, the build would fail and
> it would tell me what I need to install.
>
> But now with this patch, if I do "make defconfig", the generated config
> actually changes based on what I happen to have installed on my build
> system.  If I don't have libelf-devel, then it silently chooses the
> non-default unwinder (frame pointer).
>
> This is a subtle difference, but IMO it's dangerous, because it will
> silently enable the frame pointer unwinder for the majority of new
> systems, even though it's not the default.
>
> I strongly prefer the original behavior, because we really want to
> encourage people to use the ORC unwinder, even if that means they have
> to install a package to build it.
>
> Also -- in general -- I suspect that silently changing the defaults like
> this will create a lot of bad surprises.  The output of "make defconfig"
> should be predictable and not dependent on what RPMs I happen to have
> installed.



Actually, we had similar discussion for stack protector.


First, Kees liked to let the build fail
instead of disabling the stack protector silently:

https://lkml.org/lkml/2018/2/9/697



Linus said:
But yes, I also reacted to your earlier " It can't silently rewrite it
to _REGULAR because the compiler support for _STRONG regressed."
Because it damn well can. If the compiler doesn't support
-fstack-protector-strong, we can just fall back on -fstack-protector.
Silently. No extra crazy complex logic for that either.

(https://lkml.org/lkml/2018/2/10/281)



I hope this is the same pattern.



-- 
Best Regards
Masahiro Yamada

Re: 4.18rc3 TX2 boot failure with "ACPICA: AML parser: attempt to continue loading table after error"

2018-07-09 Thread Jeremy Linton


Hi,

On 07/09/2018 04:28 PM, Rafael J. Wysocki wrote:

On Mon, Jul 9, 2018 at 10:45 PM, Jeremy Linton  wrote:

Hi,

First thanks for the patch..

On 07/08/2018 04:14 AM, Rafael J. Wysocki wrote:


On Monday, July 2, 2018 11:41:42 PM CEST Jeremy Linton wrote:


Hi,

I'm experiencing two problems with commit 5088814a6e931 which is
"ACPICA: AML parser: attempt to continue loading table after error"

The first is this boot failure on a thunderX2:

[   10.770098] ACPI Error: Ignore error and continue table load



[trimming]


]---

Which does appear to be the result of some bad data in the table, but it
was working with 4.17, and reverting this commit solves the problem.



Does the patch below make any difference?

---
   drivers/acpi/acpica/psobject.c |3 +++
   1 file changed, 3 insertions(+)

Index: linux-pm/drivers/acpi/acpica/psobject.c
===
--- linux-pm.orig/drivers/acpi/acpica/psobject.c
+++ linux-pm/drivers/acpi/acpica/psobject.c
@@ -39,6 +39,9 @@ static acpi_status acpi_ps_get_aml_opcod
 ACPI_FUNCTION_TRACE_PTR(ps_get_aml_opcode, walk_state);
 walk_state->aml = walk_state->parser_state.aml;
+   if (!walk_state->aml)
+   return AE_CTRL_PARSE_CONTINUE;
+



Well this seems to avoid the crash, but now it hangs right after on the
"Ignore error and continue table load" message.


Well, maybe we should just abort in that case.

I'm wondering what happens if you replace the return statement in the
patch above with

return_ACPI_STATUS(AE_AML_BAD_OPCODE)


Yes, that is where I went when I applied the patch but I used 
AE_CTRL_TERMINATE, which terminates the loop in acpi_ps_parse_loop() and 
that appears to successfully finish/terminate the initial parsing pass. 
But, it then crashes in acpi_ns_lookup called via the 
acpi_walk_resources sequences that goes through ut_evalute_object() due 
to the path/scope_info->scope.node being ACPI_ROOT_OBJECT (-1) and 
bypassing the null check. Adding a ACPI_ROOT_OBJECT check as well as the 
null checks in acpi_ns_lookup results in a successful boot. Tracking 
down how the terminate (or whatever) is leaving the info->prefix_node 
(in acpi_ns_evaluate) set to ROOT_OBJECT instead of null, is something I 
don't yet understand.


Anyway, I tried Using BAD_OPCODE rather than TERMINATE and it seems to 
have the same basic result as PARSE_CONTINUE.

Re: 4.18rc3 TX2 boot failure with "ACPICA: AML parser: attempt to continue loading table after error"

2018-07-09 Thread Jeremy Linton


Hi,

On 07/09/2018 04:28 PM, Rafael J. Wysocki wrote:

On Mon, Jul 9, 2018 at 10:45 PM, Jeremy Linton  wrote:

Hi,

First thanks for the patch..

On 07/08/2018 04:14 AM, Rafael J. Wysocki wrote:


On Monday, July 2, 2018 11:41:42 PM CEST Jeremy Linton wrote:


Hi,

I'm experiencing two problems with commit 5088814a6e931 which is
"ACPICA: AML parser: attempt to continue loading table after error"

The first is this boot failure on a thunderX2:

[   10.770098] ACPI Error: Ignore error and continue table load



[trimming]


]---

Which does appear to be the result of some bad data in the table, but it
was working with 4.17, and reverting this commit solves the problem.



Does the patch below make any difference?

---
   drivers/acpi/acpica/psobject.c |3 +++
   1 file changed, 3 insertions(+)

Index: linux-pm/drivers/acpi/acpica/psobject.c
===
--- linux-pm.orig/drivers/acpi/acpica/psobject.c
+++ linux-pm/drivers/acpi/acpica/psobject.c
@@ -39,6 +39,9 @@ static acpi_status acpi_ps_get_aml_opcod
 ACPI_FUNCTION_TRACE_PTR(ps_get_aml_opcode, walk_state);
 walk_state->aml = walk_state->parser_state.aml;
+   if (!walk_state->aml)
+   return AE_CTRL_PARSE_CONTINUE;
+



Well this seems to avoid the crash, but now it hangs right after on the
"Ignore error and continue table load" message.


Well, maybe we should just abort in that case.

I'm wondering what happens if you replace the return statement in the
patch above with

return_ACPI_STATUS(AE_AML_BAD_OPCODE)


Yes, that is where I went when I applied the patch but I used 
AE_CTRL_TERMINATE, which terminates the loop in acpi_ps_parse_loop() and 
that appears to successfully finish/terminate the initial parsing pass. 
But, it then crashes in acpi_ns_lookup called via the 
acpi_walk_resources sequences that goes through ut_evalute_object() due 
to the path/scope_info->scope.node being ACPI_ROOT_OBJECT (-1) and 
bypassing the null check. Adding a ACPI_ROOT_OBJECT check as well as the 
null checks in acpi_ns_lookup results in a successful boot. Tracking 
down how the terminate (or whatever) is leaving the info->prefix_node 
(in acpi_ns_evaluate) set to ROOT_OBJECT instead of null, is something I 
don't yet understand.


Anyway, I tried Using BAD_OPCODE rather than TERMINATE and it seems to 
have the same basic result as PARSE_CONTINUE.

Re: linux-next: build warnings after merge of the rdma tree

2018-07-09 Thread Jason Gunthorpe

On Tue, Jul 10, 2018 at 11:33:42AM +1000, Stephen Rothwell wrote:
> Hi all,
> 
> After merging the rdma tree, today's linux-next build (x86_64
> allmodconfig) produced these warnings:
> 
> In file included from include/linux/printk.h:336:0,
>  from include/linux/kernel.h:14,
>  from arch/x86/include/asm/percpu.h:45,
>  from arch/x86/include/asm/current.h:6,
>  from include/linux/mutex.h:14,
>  from drivers/infiniband/hw/cxgb4/iw_cxgb4.h:34,
>  from drivers/infiniband/hw/cxgb4/cq.c:33:
> drivers/infiniband/hw/cxgb4/cq.c: In function '__c4iw_poll_cq_one':
> include/linux/dynamic_debug.h:127:3: warning: 'cqe.u.gen.wrid_hi' may be used 
> uninitialized in this function [-Wmaybe-uninitialized]
>__dynamic_pr_debug(, pr_fmt(fmt), \
>^~
> drivers/infiniband/hw/cxgb4/cq.c:674:16: note: 'cqe.u.gen.wrid_hi' was 
> declared here
>   struct t4_cqe cqe;
> ^~~
> In file included from include/linux/printk.h:336:0,
>  from include/linux/kernel.h:14,
>  from arch/x86/include/asm/percpu.h:45,
>  from arch/x86/include/asm/current.h:6,
>  from include/linux/mutex.h:14,
>  from drivers/infiniband/hw/cxgb4/iw_cxgb4.h:34,
>  from drivers/infiniband/hw/cxgb4/cq.c:33:
> include/linux/dynamic_debug.h:127:3: warning: 'cqe.u.gen.wrid_low' may be 
> used uninitialized in this function [-Wmaybe-uninitialized]
>__dynamic_pr_debug(, pr_fmt(fmt), \
>^~
> drivers/infiniband/hw/cxgb4/cq.c:674:16: note: 'cqe.u.gen.wrid_low' was 
> declared here
>   struct t4_cqe cqe;
> ^~~
> In file included from include/linux/printk.h:336:0,
>  from include/linux/kernel.h:14,
>  from arch/x86/include/asm/percpu.h:45,
>  from arch/x86/include/asm/current.h:6,
>  from include/linux/mutex.h:14,
>  from drivers/infiniband/hw/cxgb4/iw_cxgb4.h:34,
>  from drivers/infiniband/hw/cxgb4/cq.c:33:
> include/linux/dynamic_debug.h:127:3: warning: 'cqe.len' may be used 
> uninitialized in this function [-Wmaybe-uninitialized]
>__dynamic_pr_debug(, pr_fmt(fmt), \
>^~
> drivers/infiniband/hw/cxgb4/cq.c:674:16: note: 'cqe.len' was declared here
>   struct t4_cqe cqe;
> ^~~
> 
> Introduced by commit
> 
>   4ab39e2f98f2 ("RDMA/cxgb4: Make c4iw_poll_cq_one() easier to analyze")
> 
> Again, I can't easily tell if these are false positives or not.

What compiler is producing these? I got nothing from 0-day build
service or my local gcc-7..

They are false positives and I guess we need to put the
uninitialized_var back that was hiding them.

Also curious that the powerpc compiler gets a different set..

Thanks,
Jason

Re: linux-next: build warnings after merge of the rdma tree

2018-07-09 Thread Jason Gunthorpe

On Tue, Jul 10, 2018 at 11:33:42AM +1000, Stephen Rothwell wrote:
> Hi all,
> 
> After merging the rdma tree, today's linux-next build (x86_64
> allmodconfig) produced these warnings:
> 
> In file included from include/linux/printk.h:336:0,
>  from include/linux/kernel.h:14,
>  from arch/x86/include/asm/percpu.h:45,
>  from arch/x86/include/asm/current.h:6,
>  from include/linux/mutex.h:14,
>  from drivers/infiniband/hw/cxgb4/iw_cxgb4.h:34,
>  from drivers/infiniband/hw/cxgb4/cq.c:33:
> drivers/infiniband/hw/cxgb4/cq.c: In function '__c4iw_poll_cq_one':
> include/linux/dynamic_debug.h:127:3: warning: 'cqe.u.gen.wrid_hi' may be used 
> uninitialized in this function [-Wmaybe-uninitialized]
>__dynamic_pr_debug(, pr_fmt(fmt), \
>^~
> drivers/infiniband/hw/cxgb4/cq.c:674:16: note: 'cqe.u.gen.wrid_hi' was 
> declared here
>   struct t4_cqe cqe;
> ^~~
> In file included from include/linux/printk.h:336:0,
>  from include/linux/kernel.h:14,
>  from arch/x86/include/asm/percpu.h:45,
>  from arch/x86/include/asm/current.h:6,
>  from include/linux/mutex.h:14,
>  from drivers/infiniband/hw/cxgb4/iw_cxgb4.h:34,
>  from drivers/infiniband/hw/cxgb4/cq.c:33:
> include/linux/dynamic_debug.h:127:3: warning: 'cqe.u.gen.wrid_low' may be 
> used uninitialized in this function [-Wmaybe-uninitialized]
>__dynamic_pr_debug(, pr_fmt(fmt), \
>^~
> drivers/infiniband/hw/cxgb4/cq.c:674:16: note: 'cqe.u.gen.wrid_low' was 
> declared here
>   struct t4_cqe cqe;
> ^~~
> In file included from include/linux/printk.h:336:0,
>  from include/linux/kernel.h:14,
>  from arch/x86/include/asm/percpu.h:45,
>  from arch/x86/include/asm/current.h:6,
>  from include/linux/mutex.h:14,
>  from drivers/infiniband/hw/cxgb4/iw_cxgb4.h:34,
>  from drivers/infiniband/hw/cxgb4/cq.c:33:
> include/linux/dynamic_debug.h:127:3: warning: 'cqe.len' may be used 
> uninitialized in this function [-Wmaybe-uninitialized]
>__dynamic_pr_debug(, pr_fmt(fmt), \
>^~
> drivers/infiniband/hw/cxgb4/cq.c:674:16: note: 'cqe.len' was declared here
>   struct t4_cqe cqe;
> ^~~
> 
> Introduced by commit
> 
>   4ab39e2f98f2 ("RDMA/cxgb4: Make c4iw_poll_cq_one() easier to analyze")
> 
> Again, I can't easily tell if these are false positives or not.

What compiler is producing these? I got nothing from 0-day build
service or my local gcc-7..

They are false positives and I guess we need to put the
uninitialized_var back that was hiding them.

Also curious that the powerpc compiler gets a different set..

Thanks,
Jason

Re: [PATCH 1/7] mm: allocate mm_cpumask dynamically based on nr_cpu_ids

2018-07-09 Thread Mike Galbraith

On Mon, 2018-07-09 at 17:38 -0400, Rik van Riel wrote:
> 
> I added your code, and Signed-off-By in patch
> 1 for version 5 of the series.

No objection, but no need (like taking credit for fixing a typo:).

Re: [PATCH 1/7] mm: allocate mm_cpumask dynamically based on nr_cpu_ids

2018-07-09 Thread Mike Galbraith

On Mon, 2018-07-09 at 17:38 -0400, Rik van Riel wrote:
> 
> I added your code, and Signed-off-By in patch
> 1 for version 5 of the series.

No objection, but no need (like taking credit for fixing a typo:).

[PATCH v4 3/3] serial: 8250_dw: add fractional divisor support

2018-07-09 Thread Jisheng Zhang

For Synopsys DesignWare 8250 uart which version >= 4.00a, there's a
valid divisor latch fraction register. The fractional divisor width is
4bits ~ 6bits.

Now the preparation is done, it's easy to add the feature support.
This patch firstly tries to get the fractional divisor width during
probe, then setups dw specific get_divisor() and set_divisor() hook.

Signed-off-by: Jisheng Zhang 
---
 drivers/tty/serial/8250/8250_dw.c | 45 +++
 1 file changed, 45 insertions(+)

diff --git a/drivers/tty/serial/8250/8250_dw.c 
b/drivers/tty/serial/8250/8250_dw.c
index fa8a00e8c9c6..ad08d7a3b93b 100644
--- a/drivers/tty/serial/8250/8250_dw.c
+++ b/drivers/tty/serial/8250/8250_dw.c
@@ -31,6 +31,7 @@
 
 /* Offsets for the DesignWare specific registers */
 #define DW_UART_USR0x1f /* UART Status Register */
+#define DW_UART_DLF0xc0 /* Divisor Latch Fraction Register */
 #define DW_UART_CPR0xf4 /* Component Parameter Register */
 #define DW_UART_UCV0xf8 /* UART Component Version */
 
@@ -55,6 +56,7 @@
 
 struct dw8250_data {
u8  usr_reg;
+   u8  dlf_size;
int line;
int msr_mask_on;
int msr_mask_off;
@@ -366,6 +368,37 @@ static bool dw8250_idma_filter(struct dma_chan *chan, void 
*param)
return param == chan->device->dev->parent;
 }
 
+/*
+ * divisor = div(I) + div(F)
+ * "I" means integer, "F" means fractional
+ * quot = div(I) = clk / (16 * baud)
+ * frac = div(F) * 2^dlf_size
+ *
+ * let rem = clk % (16 * baud)
+ * we have: div(F) * (16 * baud) = rem
+ * so frac = 2^dlf_size * rem / (16 * baud) = (rem << dlf_size) / (16 * baud)
+ */
+static unsigned int dw8250_get_divisor(struct uart_port *p,
+  unsigned int baud,
+  unsigned int *frac)
+{
+   unsigned int quot, rem;
+   struct dw8250_data *d = p->private_data;
+
+   quot = p->uartclk / (16 * baud);
+   rem = p->uartclk % (16 * baud);
+   *frac = DIV_ROUND_CLOSEST(rem << d->dlf_size, 16 * baud);
+
+   return quot;
+}
+
+static void dw8250_set_divisor(struct uart_port *p, unsigned int baud,
+  unsigned int quot, unsigned int quot_frac)
+{
+   dw8250_writel_ext(p, DW_UART_DLF, quot_frac);
+   serial8250_do_set_divisor(p, baud, quot, quot_frac);
+}
+
 static void dw8250_quirks(struct uart_port *p, struct dw8250_data *data)
 {
if (p->dev->of_node) {
@@ -426,6 +459,18 @@ static void dw8250_setup_port(struct uart_port *p)
dev_dbg(p->dev, "Designware UART version %c.%c%c\n",
(reg >> 24) & 0xff, (reg >> 16) & 0xff, (reg >> 8) & 0xff);
 
+   dw8250_writel_ext(p, DW_UART_DLF, ~0U);
+   reg = dw8250_readl_ext(p, DW_UART_DLF);
+   dw8250_writel_ext(p, DW_UART_DLF, 0);
+
+   if (reg) {
+   struct dw8250_data *d = p->private_data;
+
+   d->dlf_size = fls(reg);
+   p->get_divisor = dw8250_get_divisor;
+   p->set_divisor = dw8250_set_divisor;
+   }
+
reg = dw8250_readl_ext(p, DW_UART_CPR);
if (!reg)
return;
-- 
2.18.0

[PATCH v4 3/3] serial: 8250_dw: add fractional divisor support

2018-07-09 Thread Jisheng Zhang

For Synopsys DesignWare 8250 uart which version >= 4.00a, there's a
valid divisor latch fraction register. The fractional divisor width is
4bits ~ 6bits.

Now the preparation is done, it's easy to add the feature support.
This patch firstly tries to get the fractional divisor width during
probe, then setups dw specific get_divisor() and set_divisor() hook.

Signed-off-by: Jisheng Zhang 
---
 drivers/tty/serial/8250/8250_dw.c | 45 +++
 1 file changed, 45 insertions(+)

diff --git a/drivers/tty/serial/8250/8250_dw.c 
b/drivers/tty/serial/8250/8250_dw.c
index fa8a00e8c9c6..ad08d7a3b93b 100644
--- a/drivers/tty/serial/8250/8250_dw.c
+++ b/drivers/tty/serial/8250/8250_dw.c
@@ -31,6 +31,7 @@
 
 /* Offsets for the DesignWare specific registers */
 #define DW_UART_USR0x1f /* UART Status Register */
+#define DW_UART_DLF0xc0 /* Divisor Latch Fraction Register */
 #define DW_UART_CPR0xf4 /* Component Parameter Register */
 #define DW_UART_UCV0xf8 /* UART Component Version */
 
@@ -55,6 +56,7 @@
 
 struct dw8250_data {
u8  usr_reg;
+   u8  dlf_size;
int line;
int msr_mask_on;
int msr_mask_off;
@@ -366,6 +368,37 @@ static bool dw8250_idma_filter(struct dma_chan *chan, void 
*param)
return param == chan->device->dev->parent;
 }
 
+/*
+ * divisor = div(I) + div(F)
+ * "I" means integer, "F" means fractional
+ * quot = div(I) = clk / (16 * baud)
+ * frac = div(F) * 2^dlf_size
+ *
+ * let rem = clk % (16 * baud)
+ * we have: div(F) * (16 * baud) = rem
+ * so frac = 2^dlf_size * rem / (16 * baud) = (rem << dlf_size) / (16 * baud)
+ */
+static unsigned int dw8250_get_divisor(struct uart_port *p,
+  unsigned int baud,
+  unsigned int *frac)
+{
+   unsigned int quot, rem;
+   struct dw8250_data *d = p->private_data;
+
+   quot = p->uartclk / (16 * baud);
+   rem = p->uartclk % (16 * baud);
+   *frac = DIV_ROUND_CLOSEST(rem << d->dlf_size, 16 * baud);
+
+   return quot;
+}
+
+static void dw8250_set_divisor(struct uart_port *p, unsigned int baud,
+  unsigned int quot, unsigned int quot_frac)
+{
+   dw8250_writel_ext(p, DW_UART_DLF, quot_frac);
+   serial8250_do_set_divisor(p, baud, quot, quot_frac);
+}
+
 static void dw8250_quirks(struct uart_port *p, struct dw8250_data *data)
 {
if (p->dev->of_node) {
@@ -426,6 +459,18 @@ static void dw8250_setup_port(struct uart_port *p)
dev_dbg(p->dev, "Designware UART version %c.%c%c\n",
(reg >> 24) & 0xff, (reg >> 16) & 0xff, (reg >> 8) & 0xff);
 
+   dw8250_writel_ext(p, DW_UART_DLF, ~0U);
+   reg = dw8250_readl_ext(p, DW_UART_DLF);
+   dw8250_writel_ext(p, DW_UART_DLF, 0);
+
+   if (reg) {
+   struct dw8250_data *d = p->private_data;
+
+   d->dlf_size = fls(reg);
+   p->get_divisor = dw8250_get_divisor;
+   p->set_divisor = dw8250_set_divisor;
+   }
+
reg = dw8250_readl_ext(p, DW_UART_CPR);
if (!reg)
return;
-- 
2.18.0

[PATCH v4 2/3] serial: 8250: export serial8250_do_set_divisor()

2018-07-09 Thread Jisheng Zhang

Some drivers could call serial8250_do_set_divisor() to complete its
own set_divisor routine. Export this symbol for code reusing.

Signed-off-by: Jisheng Zhang 
---
 drivers/tty/serial/8250/8250_port.c | 5 +++--
 include/linux/serial_8250.h | 3 +++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/tty/serial/8250/8250_port.c 
b/drivers/tty/serial/8250/8250_port.c
index ce0dc17f18ee..945f8dc2d50f 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -2580,8 +2580,8 @@ static unsigned char serial8250_compute_lcr(struct 
uart_8250_port *up,
return cval;
 }
 
-static void serial8250_do_set_divisor(struct uart_port *port, unsigned int 
baud,
-   unsigned int quot, unsigned int quot_frac)
+void serial8250_do_set_divisor(struct uart_port *port, unsigned int baud,
+  unsigned int quot, unsigned int quot_frac)
 {
struct uart_8250_port *up = up_to_u8250p(port);
 
@@ -2612,6 +2612,7 @@ static void serial8250_do_set_divisor(struct uart_port 
*port, unsigned int baud,
serial_port_out(port, 0x2, quot_frac);
}
 }
+EXPORT_SYMBOL_GPL(serial8250_do_set_divisor);
 
 static void serial8250_set_divisor(struct uart_port *port, unsigned int baud,
   unsigned int quot, unsigned int quot_frac)
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 76b9db71e489..18e21427bce4 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -160,6 +160,9 @@ extern void serial8250_do_shutdown(struct uart_port *port);
 extern void serial8250_do_pm(struct uart_port *port, unsigned int state,
 unsigned int oldstate);
 extern void serial8250_do_set_mctrl(struct uart_port *port, unsigned int 
mctrl);
+extern void serial8250_do_set_divisor(struct uart_port *port, unsigned int 
baud,
+ unsigned int quot,
+ unsigned int quot_frac);
 extern int fsl8250_handle_irq(struct uart_port *port);
 int serial8250_handle_irq(struct uart_port *port, unsigned int iir);
 unsigned char serial8250_rx_chars(struct uart_8250_port *up, unsigned char 
lsr);
-- 
2.18.0

[PATCH v4 2/3] serial: 8250: export serial8250_do_set_divisor()

2018-07-09 Thread Jisheng Zhang

Some drivers could call serial8250_do_set_divisor() to complete its
own set_divisor routine. Export this symbol for code reusing.

Signed-off-by: Jisheng Zhang 
---
 drivers/tty/serial/8250/8250_port.c | 5 +++--
 include/linux/serial_8250.h | 3 +++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/tty/serial/8250/8250_port.c 
b/drivers/tty/serial/8250/8250_port.c
index ce0dc17f18ee..945f8dc2d50f 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -2580,8 +2580,8 @@ static unsigned char serial8250_compute_lcr(struct 
uart_8250_port *up,
return cval;
 }
 
-static void serial8250_do_set_divisor(struct uart_port *port, unsigned int 
baud,
-   unsigned int quot, unsigned int quot_frac)
+void serial8250_do_set_divisor(struct uart_port *port, unsigned int baud,
+  unsigned int quot, unsigned int quot_frac)
 {
struct uart_8250_port *up = up_to_u8250p(port);
 
@@ -2612,6 +2612,7 @@ static void serial8250_do_set_divisor(struct uart_port 
*port, unsigned int baud,
serial_port_out(port, 0x2, quot_frac);
}
 }
+EXPORT_SYMBOL_GPL(serial8250_do_set_divisor);
 
 static void serial8250_set_divisor(struct uart_port *port, unsigned int baud,
   unsigned int quot, unsigned int quot_frac)
diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h
index 76b9db71e489..18e21427bce4 100644
--- a/include/linux/serial_8250.h
+++ b/include/linux/serial_8250.h
@@ -160,6 +160,9 @@ extern void serial8250_do_shutdown(struct uart_port *port);
 extern void serial8250_do_pm(struct uart_port *port, unsigned int state,
 unsigned int oldstate);
 extern void serial8250_do_set_mctrl(struct uart_port *port, unsigned int 
mctrl);
+extern void serial8250_do_set_divisor(struct uart_port *port, unsigned int 
baud,
+ unsigned int quot,
+ unsigned int quot_frac);
 extern int fsl8250_handle_irq(struct uart_port *port);
 int serial8250_handle_irq(struct uart_port *port, unsigned int iir);
 unsigned char serial8250_rx_chars(struct uart_8250_port *up, unsigned char 
lsr);
-- 
2.18.0

[PATCH v4 1/3] serial: 8250: introduce get_divisor() and set_divisor() hook

2018-07-09 Thread Jisheng Zhang

Add these two hooks so that they can be overridden with driver specific
implementations.

Signed-off-by: Jisheng Zhang 
Reviewed-by: Andy Shevchenko 
---
 drivers/tty/serial/8250/8250_core.c |  4 
 drivers/tty/serial/8250/8250_port.c | 27 +++
 include/linux/serial_core.h |  7 +++
 3 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/drivers/tty/serial/8250/8250_core.c 
b/drivers/tty/serial/8250/8250_core.c
index 9342fc2ee7df..a0bb77290747 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -1023,6 +1023,10 @@ int serial8250_register_8250_port(struct uart_8250_port 
*up)
uart->port.get_mctrl = up->port.get_mctrl;
if (up->port.set_mctrl)
uart->port.set_mctrl = up->port.set_mctrl;
+   if (up->port.get_divisor)
+   uart->port.get_divisor = up->port.get_divisor;
+   if (up->port.set_divisor)
+   uart->port.set_divisor = up->port.set_divisor;
if (up->port.startup)
uart->port.startup = up->port.startup;
if (up->port.shutdown)
diff --git a/drivers/tty/serial/8250/8250_port.c 
b/drivers/tty/serial/8250/8250_port.c
index 709fe6b4265c..ce0dc17f18ee 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -2498,9 +2498,9 @@ static unsigned int npcm_get_divisor(struct 
uart_8250_port *up,
return DIV_ROUND_CLOSEST(port->uartclk, 16 * baud + 2) - 2;
 }
 
-static unsigned int serial8250_get_divisor(struct uart_port *port,
-  unsigned int baud,
-  unsigned int *frac)
+static unsigned int serial8250_do_get_divisor(struct uart_port *port,
+ unsigned int baud,
+ unsigned int *frac)
 {
struct uart_8250_port *up = up_to_u8250p(port);
unsigned int quot;
@@ -2532,6 +2532,16 @@ static unsigned int serial8250_get_divisor(struct 
uart_port *port,
return quot;
 }
 
+static unsigned int serial8250_get_divisor(struct uart_port *port,
+  unsigned int baud,
+  unsigned int *frac)
+{
+   if (port->get_divisor)
+   return port->get_divisor(port, baud, frac);
+
+   return serial8250_do_get_divisor(port, baud, frac);
+}
+
 static unsigned char serial8250_compute_lcr(struct uart_8250_port *up,
tcflag_t c_cflag)
 {
@@ -2570,7 +2580,7 @@ static unsigned char serial8250_compute_lcr(struct 
uart_8250_port *up,
return cval;
 }
 
-static void serial8250_set_divisor(struct uart_port *port, unsigned int baud,
+static void serial8250_do_set_divisor(struct uart_port *port, unsigned int 
baud,
unsigned int quot, unsigned int quot_frac)
 {
struct uart_8250_port *up = up_to_u8250p(port);
@@ -2603,6 +2613,15 @@ static void serial8250_set_divisor(struct uart_port 
*port, unsigned int baud,
}
 }
 
+static void serial8250_set_divisor(struct uart_port *port, unsigned int baud,
+  unsigned int quot, unsigned int quot_frac)
+{
+   if (port->set_divisor)
+   port->set_divisor(port, baud, quot, quot_frac);
+   else
+   serial8250_do_set_divisor(port, baud, quot, quot_frac);
+}
+
 static unsigned int serial8250_get_baud_rate(struct uart_port *port,
 struct ktermios *termios,
 struct ktermios *old)
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 06ea4eeb09ab..406edae44ca3 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -127,6 +127,13 @@ struct uart_port {
 struct ktermios *);
unsigned int(*get_mctrl)(struct uart_port *);
void(*set_mctrl)(struct uart_port *, unsigned int);
+   unsigned int(*get_divisor)(struct uart_port *,
+  unsigned int baud,
+  unsigned int *frac);
+   void(*set_divisor)(struct uart_port *,
+  unsigned int baud,
+  unsigned int quot,
+  unsigned int quot_frac);
int (*startup)(struct uart_port *port);
void(*shutdown)(struct uart_port *port);
void(*throttle)(struct uart_port *port);
-- 
2.18.0

[PATCH v4 1/3] serial: 8250: introduce get_divisor() and set_divisor() hook

2018-07-09 Thread Jisheng Zhang

Add these two hooks so that they can be overridden with driver specific
implementations.

Signed-off-by: Jisheng Zhang 
Reviewed-by: Andy Shevchenko 
---
 drivers/tty/serial/8250/8250_core.c |  4 
 drivers/tty/serial/8250/8250_port.c | 27 +++
 include/linux/serial_core.h |  7 +++
 3 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/drivers/tty/serial/8250/8250_core.c 
b/drivers/tty/serial/8250/8250_core.c
index 9342fc2ee7df..a0bb77290747 100644
--- a/drivers/tty/serial/8250/8250_core.c
+++ b/drivers/tty/serial/8250/8250_core.c
@@ -1023,6 +1023,10 @@ int serial8250_register_8250_port(struct uart_8250_port 
*up)
uart->port.get_mctrl = up->port.get_mctrl;
if (up->port.set_mctrl)
uart->port.set_mctrl = up->port.set_mctrl;
+   if (up->port.get_divisor)
+   uart->port.get_divisor = up->port.get_divisor;
+   if (up->port.set_divisor)
+   uart->port.set_divisor = up->port.set_divisor;
if (up->port.startup)
uart->port.startup = up->port.startup;
if (up->port.shutdown)
diff --git a/drivers/tty/serial/8250/8250_port.c 
b/drivers/tty/serial/8250/8250_port.c
index 709fe6b4265c..ce0dc17f18ee 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -2498,9 +2498,9 @@ static unsigned int npcm_get_divisor(struct 
uart_8250_port *up,
return DIV_ROUND_CLOSEST(port->uartclk, 16 * baud + 2) - 2;
 }
 
-static unsigned int serial8250_get_divisor(struct uart_port *port,
-  unsigned int baud,
-  unsigned int *frac)
+static unsigned int serial8250_do_get_divisor(struct uart_port *port,
+ unsigned int baud,
+ unsigned int *frac)
 {
struct uart_8250_port *up = up_to_u8250p(port);
unsigned int quot;
@@ -2532,6 +2532,16 @@ static unsigned int serial8250_get_divisor(struct 
uart_port *port,
return quot;
 }
 
+static unsigned int serial8250_get_divisor(struct uart_port *port,
+  unsigned int baud,
+  unsigned int *frac)
+{
+   if (port->get_divisor)
+   return port->get_divisor(port, baud, frac);
+
+   return serial8250_do_get_divisor(port, baud, frac);
+}
+
 static unsigned char serial8250_compute_lcr(struct uart_8250_port *up,
tcflag_t c_cflag)
 {
@@ -2570,7 +2580,7 @@ static unsigned char serial8250_compute_lcr(struct 
uart_8250_port *up,
return cval;
 }
 
-static void serial8250_set_divisor(struct uart_port *port, unsigned int baud,
+static void serial8250_do_set_divisor(struct uart_port *port, unsigned int 
baud,
unsigned int quot, unsigned int quot_frac)
 {
struct uart_8250_port *up = up_to_u8250p(port);
@@ -2603,6 +2613,15 @@ static void serial8250_set_divisor(struct uart_port 
*port, unsigned int baud,
}
 }
 
+static void serial8250_set_divisor(struct uart_port *port, unsigned int baud,
+  unsigned int quot, unsigned int quot_frac)
+{
+   if (port->set_divisor)
+   port->set_divisor(port, baud, quot, quot_frac);
+   else
+   serial8250_do_set_divisor(port, baud, quot, quot_frac);
+}
+
 static unsigned int serial8250_get_baud_rate(struct uart_port *port,
 struct ktermios *termios,
 struct ktermios *old)
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 06ea4eeb09ab..406edae44ca3 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -127,6 +127,13 @@ struct uart_port {
 struct ktermios *);
unsigned int(*get_mctrl)(struct uart_port *);
void(*set_mctrl)(struct uart_port *, unsigned int);
+   unsigned int(*get_divisor)(struct uart_port *,
+  unsigned int baud,
+  unsigned int *frac);
+   void(*set_divisor)(struct uart_port *,
+  unsigned int baud,
+  unsigned int quot,
+  unsigned int quot_frac);
int (*startup)(struct uart_port *port);
void(*shutdown)(struct uart_port *port);
void(*throttle)(struct uart_port *port);
-- 
2.18.0

[PATCH v4 0/3] serial: 8250_dw: add fractional divisor support

2018-07-09 Thread Jisheng Zhang

For Synopsys DesignWare 8250 uart which version >= 4.00a, there's a
valid divisor latch fraction register. The fractional divisor width is
4bits ~ 6bits.

patch1 introduces necessary hooks to 8250 core.
patch2 exports serial8250_do_set_divisor()
patch3 implements the fractional divisor support for Synopsys DW 8250.

Since v3:
 - simplify the dw8250_get_divisor() implementation again.

Since v2:
 - rebase to tty-next branch, since I need one patch from Andy which
   is in tty-next
 - drop the patch "serial: 8250: let serial8250_get_divisor() get
   uart_port * as param" since it's in tty-next now.
 - add a new patch to export serial8250_do_set_divisor(), and reuse it
   to complete dw8250_set_divisor().
 - remove DW 8250 version check, since the DLF register always exists
   and if fractional divisor isn't supported, the register read as 0
 - add comments to explain how dw8250_get_divisor() get quot and frac.
 - the frac calcuation is simplified with well implemented GENMASK
 - Add Andy's Reviewed-by tag to patch1.

Since v1:
 - add an extra patch to let serial8250_get_divisor() get uart_port *
   as param
 - take Andy's suggestions to "integrates hooks in the same way like
   it's done for the rest of 8250 ones". Many thanks to Andy.

Jisheng Zhang (3):
  serial: 8250: introduce get_divisor() and set_divisor() hook
  serial: 8250: export serial8250_do_set_divisor()
  serial: 8250_dw: add fractional divisor support

 drivers/tty/serial/8250/8250_core.c |  4 +++
 drivers/tty/serial/8250/8250_dw.c   | 45 +
 drivers/tty/serial/8250/8250_port.c | 30 +++
 include/linux/serial_8250.h |  3 ++
 include/linux/serial_core.h |  7 +
 5 files changed, 84 insertions(+), 5 deletions(-)

-- 
2.18.0

[PATCH v4 0/3] serial: 8250_dw: add fractional divisor support

2018-07-09 Thread Jisheng Zhang

For Synopsys DesignWare 8250 uart which version >= 4.00a, there's a
valid divisor latch fraction register. The fractional divisor width is
4bits ~ 6bits.

patch1 introduces necessary hooks to 8250 core.
patch2 exports serial8250_do_set_divisor()
patch3 implements the fractional divisor support for Synopsys DW 8250.

Since v3:
 - simplify the dw8250_get_divisor() implementation again.

Since v2:
 - rebase to tty-next branch, since I need one patch from Andy which
   is in tty-next
 - drop the patch "serial: 8250: let serial8250_get_divisor() get
   uart_port * as param" since it's in tty-next now.
 - add a new patch to export serial8250_do_set_divisor(), and reuse it
   to complete dw8250_set_divisor().
 - remove DW 8250 version check, since the DLF register always exists
   and if fractional divisor isn't supported, the register read as 0
 - add comments to explain how dw8250_get_divisor() get quot and frac.
 - the frac calcuation is simplified with well implemented GENMASK
 - Add Andy's Reviewed-by tag to patch1.

Since v1:
 - add an extra patch to let serial8250_get_divisor() get uart_port *
   as param
 - take Andy's suggestions to "integrates hooks in the same way like
   it's done for the rest of 8250 ones". Many thanks to Andy.

Jisheng Zhang (3):
  serial: 8250: introduce get_divisor() and set_divisor() hook
  serial: 8250: export serial8250_do_set_divisor()
  serial: 8250_dw: add fractional divisor support

 drivers/tty/serial/8250/8250_core.c |  4 +++
 drivers/tty/serial/8250/8250_dw.c   | 45 +
 drivers/tty/serial/8250/8250_port.c | 30 +++
 include/linux/serial_8250.h |  3 ++
 include/linux/serial_core.h |  7 +
 5 files changed, 84 insertions(+), 5 deletions(-)

-- 
2.18.0

Re: [PATCH 0/3] fix: enable early printing of hashed pointers

2018-07-09 Thread Tobin C. Harding

On Mon, Jul 09, 2018 at 10:07:16PM -0400, Steven Rostedt wrote:
> On Tue, 10 Jul 2018 10:25:16 +1000
> "Tobin C. Harding"  wrote:
> 
> > Since I'm a massive noob I did not realise that since v7 of this set was
> > applied to random-next already that doing incremental versions was
> > pointless.
> 
> I wouldn't say you are a noob anymore. You are making the same types of
> mistakes that seasoned kernel developers make ;-)

Thanks for the vote of confidence Steve :)

Re: [PATCH 0/3] fix: enable early printing of hashed pointers

2018-07-09 Thread Tobin C. Harding

On Mon, Jul 09, 2018 at 10:07:16PM -0400, Steven Rostedt wrote:
> On Tue, 10 Jul 2018 10:25:16 +1000
> "Tobin C. Harding"  wrote:
> 
> > Since I'm a massive noob I did not realise that since v7 of this set was
> > applied to random-next already that doing incremental versions was
> > pointless.
> 
> I wouldn't say you are a noob anymore. You are making the same types of
> mistakes that seasoned kernel developers make ;-)

Thanks for the vote of confidence Steve :)

Re: [PATCH v3 3/3] uio: fix crash after the device is unregistered

2018-07-09 Thread Xiubo Li


On 2018/7/10 1:06, Mike Christie wrote:

On 07/06/2018 08:28 PM, Xiubo Li wrote:

On 2018/7/7 2:23, Mike Christie wrote:

On 07/05/2018 09:57 PM, xiu...@redhat.com wrote:

   static irqreturn_t uio_interrupt(int irq, void *dev_id)
   {
   struct uio_device *idev = (struct uio_device *)dev_id;
-irqreturn_t ret = idev->info->handler(irq, idev->info);
+irqreturn_t ret;
+
+mutex_lock(>info_lock);
+if (!idev->info) {
+ret = IRQ_NONE;
+goto out;
+}
   +ret = idev->info->handler(irq, idev->info);
   if (ret == IRQ_HANDLED)
   uio_event_notify(idev->info);
   +out:
+mutex_unlock(>info_lock);
   return ret;
   }

Do you need the interrupt related changes in this patch and the first
one?

Actually, the NULL checking is not a must, we can remove this. But the
lock/unlock is needed.

   When we do uio_unregister_device -> free_irq does free_irq return
when there are no longer running interrupt handlers that we requested?

If that is not the case then I think we can hit a similar bug. We do:

__uio_register_device -> device_register -> device's refcount goes to
zero so we do -> uio_device_release -> kfree(idev)

and if it is possible the interrupt handler could still run after
free_irq then we would end up doing:

uio_interrupt -> mutex_lock(>info_lock) -> idev access freed
memory.

I think this shouldn't happen. Because the free_irq function does not
return until any executing interrupts for this IRQ have completed.


If free_irq returns after executing interrupts and does not allow new
executions what is the lock protecting in uio_interrupt?

I meant idev->info->handler(irq, idev->info), it may should be protected 
by the related lock in each driver.


Thanks，

Re: [PATCH v3 3/3] uio: fix crash after the device is unregistered

2018-07-09 Thread Xiubo Li


On 2018/7/10 1:06, Mike Christie wrote:

On 07/06/2018 08:28 PM, Xiubo Li wrote:

On 2018/7/7 2:23, Mike Christie wrote:

On 07/05/2018 09:57 PM, xiu...@redhat.com wrote:

   static irqreturn_t uio_interrupt(int irq, void *dev_id)
   {
   struct uio_device *idev = (struct uio_device *)dev_id;
-irqreturn_t ret = idev->info->handler(irq, idev->info);
+irqreturn_t ret;
+
+mutex_lock(>info_lock);
+if (!idev->info) {
+ret = IRQ_NONE;
+goto out;
+}
   +ret = idev->info->handler(irq, idev->info);
   if (ret == IRQ_HANDLED)
   uio_event_notify(idev->info);
   +out:
+mutex_unlock(>info_lock);
   return ret;
   }

Do you need the interrupt related changes in this patch and the first
one?

Actually, the NULL checking is not a must, we can remove this. But the
lock/unlock is needed.

   When we do uio_unregister_device -> free_irq does free_irq return
when there are no longer running interrupt handlers that we requested?

If that is not the case then I think we can hit a similar bug. We do:

__uio_register_device -> device_register -> device's refcount goes to
zero so we do -> uio_device_release -> kfree(idev)

and if it is possible the interrupt handler could still run after
free_irq then we would end up doing:

uio_interrupt -> mutex_lock(>info_lock) -> idev access freed
memory.

I think this shouldn't happen. Because the free_irq function does not
return until any executing interrupts for this IRQ have completed.


If free_irq returns after executing interrupts and does not allow new
executions what is the lock protecting in uio_interrupt?

I meant idev->info->handler(irq, idev->info), it may should be protected 
by the related lock in each driver.


Thanks，

Re: [PATCH v3 3/3] uio: fix crash after the device is unregistered

2018-07-09 Thread Xiubo Li


On 2018/7/10 0:40, Mike Christie wrote:

On 07/06/2018 08:47 PM, Xiubo Li wrote:

On 2018/7/7 2:58, Mike Christie wrote:

On 07/05/2018 09:57 PM, xiu...@redhat.com wrote:

   void uio_event_notify(struct uio_info *info)
   {
-struct uio_device *idev = info->uio_dev;
+struct uio_device *idev;
+
+if (!info)
+return;
+
+idev = info->uio_dev;
   

For this one too, I am not sure if it is needed.

uio_interrupt -> uio_event_notify. See other mail.

driver XYZ -> uio_event_notify. I think drivers need to handle this and
set some bits and/or perform some cleanup to make sure they are not
calling uio_event_notify after it has called uio_unregister_device. The
problem with the above test is if they do not they could have called
uio_unregister_device right after the info test so you could still hit
the problem.

When we are tcmu_destroy_device(), if the netlink notify event to
userspace is not successful then the TCMU will call the uio unregister,
which will set the idev->info = NULL, without close and deleting the
device in userspace. But the TCMU could still queue cmds to the ring
buffer, then the uio_event_notify will be called.

Before tcmu_destroy_device is called LIO will have stopped new IO and
waited for outstanding IO to be completed, so it would never be called
after uio_unregister_device. If it does, it's a bug in LIO.
Yeah, So this should be fixed now. Only hit one crash seem related to 
uio of this code with lower kernel before.


Thanks,
BRs

Re: [PATCH v3 3/3] uio: fix crash after the device is unregistered

2018-07-09 Thread Xiubo Li


On 2018/7/10 0:40, Mike Christie wrote:

On 07/06/2018 08:47 PM, Xiubo Li wrote:

On 2018/7/7 2:58, Mike Christie wrote:

On 07/05/2018 09:57 PM, xiu...@redhat.com wrote:

   void uio_event_notify(struct uio_info *info)
   {
-struct uio_device *idev = info->uio_dev;
+struct uio_device *idev;
+
+if (!info)
+return;
+
+idev = info->uio_dev;
   

For this one too, I am not sure if it is needed.

uio_interrupt -> uio_event_notify. See other mail.

driver XYZ -> uio_event_notify. I think drivers need to handle this and
set some bits and/or perform some cleanup to make sure they are not
calling uio_event_notify after it has called uio_unregister_device. The
problem with the above test is if they do not they could have called
uio_unregister_device right after the info test so you could still hit
the problem.

When we are tcmu_destroy_device(), if the netlink notify event to
userspace is not successful then the TCMU will call the uio unregister,
which will set the idev->info = NULL, without close and deleting the
device in userspace. But the TCMU could still queue cmds to the ring
buffer, then the uio_event_notify will be called.

Before tcmu_destroy_device is called LIO will have stopped new IO and
waited for outstanding IO to be completed, so it would never be called
after uio_unregister_device. If it does, it's a bug in LIO.
Yeah, So this should be fixed now. Only hit one crash seem related to 
uio of this code with lower kernel before.


Thanks,
BRs

Re: [PATCH v2] objtool: move libelf detection to Kconfig from Makefile

2018-07-09 Thread Josh Poimboeuf

On Tue, Jul 10, 2018 at 10:35:16AM +0900, Masahiro Yamada wrote:
> Currently, users are allowed to enable STACK_VALIDATION regardless
> of the compiler capability.  The top-level Makefile warns or breaks
> the build if it turns out that the host compiler cannot link libelf.
> 
> Move the libelf test to Kconfig so that users can enable the feature
> only when the host compiler can build the objtool.  The ugly check
> in the Makefile will go away.
> 
> Signed-off-by: Masahiro Yamada 
> Acked-by: Josh Poimboeuf 

Actually, looking at this again, I want to rescind my ack.

This patches changes the behavior in a subtle (but important) way.

Before, if I did "make defconfig", it would *always* choose the ORC
unwinder.  Then, if I didn't have libelf-devel, the build would fail and
it would tell me what I need to install.

But now with this patch, if I do "make defconfig", the generated config
actually changes based on what I happen to have installed on my build
system.  If I don't have libelf-devel, then it silently chooses the
non-default unwinder (frame pointer).

This is a subtle difference, but IMO it's dangerous, because it will
silently enable the frame pointer unwinder for the majority of new
systems, even though it's not the default.

I strongly prefer the original behavior, because we really want to
encourage people to use the ORC unwinder, even if that means they have
to install a package to build it.

Also -- in general -- I suspect that silently changing the defaults like
this will create a lot of bad surprises.  The output of "make defconfig"
should be predictable and not dependent on what RPMs I happen to have
installed.

-- 
Josh

Re: [PATCH v2] objtool: move libelf detection to Kconfig from Makefile

2018-07-09 Thread Josh Poimboeuf

On Tue, Jul 10, 2018 at 10:35:16AM +0900, Masahiro Yamada wrote:
> Currently, users are allowed to enable STACK_VALIDATION regardless
> of the compiler capability.  The top-level Makefile warns or breaks
> the build if it turns out that the host compiler cannot link libelf.
> 
> Move the libelf test to Kconfig so that users can enable the feature
> only when the host compiler can build the objtool.  The ugly check
> in the Makefile will go away.
> 
> Signed-off-by: Masahiro Yamada 
> Acked-by: Josh Poimboeuf 

Actually, looking at this again, I want to rescind my ack.

This patches changes the behavior in a subtle (but important) way.

Before, if I did "make defconfig", it would *always* choose the ORC
unwinder.  Then, if I didn't have libelf-devel, the build would fail and
it would tell me what I need to install.

But now with this patch, if I do "make defconfig", the generated config
actually changes based on what I happen to have installed on my build
system.  If I don't have libelf-devel, then it silently chooses the
non-default unwinder (frame pointer).

This is a subtle difference, but IMO it's dangerous, because it will
silently enable the frame pointer unwinder for the majority of new
systems, even though it's not the default.

I strongly prefer the original behavior, because we really want to
encourage people to use the ORC unwinder, even if that means they have
to install a package to build it.

Also -- in general -- I suspect that silently changing the defaults like
this will create a lot of bad surprises.  The output of "make defconfig"
should be predictable and not dependent on what RPMs I happen to have
installed.

-- 
Josh

[PATCH] arm64: neon: Fix function may_use_simd() return error status

2018-07-09 Thread Yandong.Zhao

From: Yandong Zhao 

Operations for contexts where we do not want to do any checks for
preemptions.  Unless strictly necessary, always use this_cpu_read()
instead.  Because of the kernel_neon_busy here we have to make sure
that it is the current cpu.

Signed-off-by: Yandong Zhao 
---
 arch/arm64/include/asm/simd.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/simd.h b/arch/arm64/include/asm/simd.h
index fa8b3fe..8b97f8b 100644
--- a/arch/arm64/include/asm/simd.h
+++ b/arch/arm64/include/asm/simd.h
@@ -29,7 +29,8 @@
 static __must_check inline bool may_use_simd(void)
 {
/*
-* The raw_cpu_read() is racy if called with preemption enabled.
+* The this_cpu_read() is racy if called with preemption enabled,
+* since the task my subsequently migrate to another CPU.
 * This is not a bug: kernel_neon_busy is only set when
 * preemption is disabled, so we cannot migrate to another CPU
 * while it is set, nor can we migrate to a CPU where it is set.
@@ -42,7 +43,7 @@ static __must_check inline bool may_use_simd(void)
 * false.
 */
return !in_irq() && !irqs_disabled() && !in_nmi() &&
-   !raw_cpu_read(kernel_neon_busy);
+   !this_cpu_read(kernel_neon_busy);
 }
 
 #else /* ! CONFIG_KERNEL_MODE_NEON */
-- 
1.9.1

[PATCH] arm64: neon: Fix function may_use_simd() return error status

2018-07-09 Thread Yandong.Zhao

From: Yandong Zhao 

Operations for contexts where we do not want to do any checks for
preemptions.  Unless strictly necessary, always use this_cpu_read()
instead.  Because of the kernel_neon_busy here we have to make sure
that it is the current cpu.

Signed-off-by: Yandong Zhao 
---
 arch/arm64/include/asm/simd.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/simd.h b/arch/arm64/include/asm/simd.h
index fa8b3fe..8b97f8b 100644
--- a/arch/arm64/include/asm/simd.h
+++ b/arch/arm64/include/asm/simd.h
@@ -29,7 +29,8 @@
 static __must_check inline bool may_use_simd(void)
 {
/*
-* The raw_cpu_read() is racy if called with preemption enabled.
+* The this_cpu_read() is racy if called with preemption enabled,
+* since the task my subsequently migrate to another CPU.
 * This is not a bug: kernel_neon_busy is only set when
 * preemption is disabled, so we cannot migrate to another CPU
 * while it is set, nor can we migrate to a CPU where it is set.
@@ -42,7 +43,7 @@ static __must_check inline bool may_use_simd(void)
 * false.
 */
return !in_irq() && !irqs_disabled() && !in_nmi() &&
-   !raw_cpu_read(kernel_neon_busy);
+   !this_cpu_read(kernel_neon_busy);
 }
 
 #else /* ! CONFIG_KERNEL_MODE_NEON */
-- 
1.9.1

Re: [PATCH] x86/mm: fix cpu stuck issue in __change_page_attr_set_clr

2018-07-09 Thread Yang, Bin

On Wed, 2018-07-04 at 16:01 +0200, Thomas Gleixner wrote:
> On Wed, 4 Jul 2018, Yang, Bin wrote:
> > e820 table:
> > =
> > 
> > [0.00] BIOS-e820: [mem 0x-
> > 0x0009fbff]
> > usable
> > [0.00] BIOS-e820: [mem 0x0009fc00-
> > 0x0009]
> > reserved
> > [0.00] BIOS-e820: [mem 0x000f-
> > 0x000f]
> > reserved
> > [0.00] BIOS-e820: [mem 0x0010-
> > 0xbffd]
> > usable
> > [0.00] BIOS-e820: [mem 0xbffe-
> > 0xbfff]
> > reserved
> > [0.00] BIOS-e820: [mem 0xfeffc000-
> > 0xfeff]
> > reserved
> > [0.00] BIOS-e820: [mem 0xfffc-
> > 0x]
> > reserved
> > [0.00] BIOS-e820: [mem 0x0001-
> > 0x00013fff]
> > usable
> > 
> > call chain:
> > ==
> > 
> > ...
> > => free_init_pages(what="initrd" or "unused kernel",
> > begin=9b26b000, end=9b26c000); begin and end
> > addresses
> > are random. The begin/end value above is just for reference.
> > 
> > => set_memory_rw()
> > => change_page_attr_set()
> > => change_page_attr_set_clr()
> > => __change_page_attr_set_clr(); cpa->numpages is 512 on my board
> > if
> > what=="unused kernel"
> > => __change_page_attr()
> > => try_preserve_large_page(); address=9b26bfacf000, pfn=8,
> > level=3; and the check loop count is 262144, exit loop after 861
> > usecs
> > on my board
> 
> You are talking about the static_protections() check, right?
> 
> > the actual problem
> > ===
> > sometimes, free_init_pages returns after hundreds of secounds. The
> > major impact is kernel boot time.
> 
> That's the symptom you are observing. The problem is in the
> try_to_preserve_large_page() logic.
> 
> The address range from the example above is:
> 
>0x9b26b000 - 0x9b26c000
> 
> i.e. 256 MB.
> 
> So the first call with address 0x9b26b000 will try to
> preserve the
> 1GB page, but it's obvious that if pgrot changes that this has to
> split the
> 1GB page.
> 
> The current code is stupid in that regard simply because it does the
> static_protection() check loop _before_ checking:
> 
>   1) Whether the new and the old pgprot are the same
>   
>   2) Whether the address range which needs to be changed is aligned
> to and
>  covers the full large mapping
> 
> So it does the static_protections() loop before #1 and #2 and checks
> the
> full 1GB page wise, which makes it loop 262144 times.
> 
> With your magic 'cache' logic this will still loop exactly 262144
> times at
> least on the first invocation because there is no valid information
> in that
> 'cache'. So I really have no idea how your patch makes any difference
> unless it is breaking stuff left and right in very subtle ways.
> 
> If there is a second invocation with the same pgprot on that very
> same
> range, then I can see it somehow having that effect by chance, but
> not by
> design.
> 
> But this is all voodoo programming and there is a very obvious and
> simple
> solution for this:
> 
>   The check for pgprot_val(new_prot) == pgprot_val(old_port) can
> definitely
>   be done _before_ the check loop. The check loop is pointless in
> that
>   case, really. If there is a mismatch anywhere then it existed
> already and
>   instead of yelling loudly the checking would just discover it,
> enforce
>   the split and that would in the worst case preserve the old wrong
> mapping
>   for those pages.
> 
>   What we want there is a debug mechanism which catches such cases,
> but is
>   not effective on production kernels and runs once or twice during
> boot.
> 
>   The range check whether the address is aligned to the large page
> and
>   covers the full large page (1G or 2M) is also obvious to do
> _before_ that
>   check, because if the requested range does not fit and has a
> different
>   pgprot_val() then it will decide to split after the check anyway.
> 
>   The check loop itself is stupid as well. Instead of looping in 4K
> steps
>   the thing can be rewritten to check for overlapping ranges and then
> check
>   explicitely for those. If there is no overlap in the first place
> the
>   whole loop thing can be avoided, but that's a pure optimization and
> needs
>   more thought than the straight forward and obvious solution to the
>   problem at hand.
> 
> Untested patch just moving the quick checks to the obvious right
> place
> below.
> 
> Thanks,
> 
>   tglx
> 
> 8<-
> --- a/arch/x86/mm/pageattr.c
> +++ b/arch/x86/mm/pageattr.c
> @@ -628,47 +628,61 @@ try_preserve_large_page(pte_t *kpte, uns
>   new_prot = static_protections(req_prot, address, pfn);
>  
>   /*
> -  * We need to check the full range, whether
> -  * static_protection() requires a different pgprot for one
> of
> -  * the pages in the range we try to preserve:
> +  * If there are no changes,

Re: [PATCH] x86/mm: fix cpu stuck issue in __change_page_attr_set_clr

2018-07-09 Thread Yang, Bin

On Wed, 2018-07-04 at 16:01 +0200, Thomas Gleixner wrote:
> On Wed, 4 Jul 2018, Yang, Bin wrote:
> > e820 table:
> > =
> > 
> > [0.00] BIOS-e820: [mem 0x-
> > 0x0009fbff]
> > usable
> > [0.00] BIOS-e820: [mem 0x0009fc00-
> > 0x0009]
> > reserved
> > [0.00] BIOS-e820: [mem 0x000f-
> > 0x000f]
> > reserved
> > [0.00] BIOS-e820: [mem 0x0010-
> > 0xbffd]
> > usable
> > [0.00] BIOS-e820: [mem 0xbffe-
> > 0xbfff]
> > reserved
> > [0.00] BIOS-e820: [mem 0xfeffc000-
> > 0xfeff]
> > reserved
> > [0.00] BIOS-e820: [mem 0xfffc-
> > 0x]
> > reserved
> > [0.00] BIOS-e820: [mem 0x0001-
> > 0x00013fff]
> > usable
> > 
> > call chain:
> > ==
> > 
> > ...
> > => free_init_pages(what="initrd" or "unused kernel",
> > begin=9b26b000, end=9b26c000); begin and end
> > addresses
> > are random. The begin/end value above is just for reference.
> > 
> > => set_memory_rw()
> > => change_page_attr_set()
> > => change_page_attr_set_clr()
> > => __change_page_attr_set_clr(); cpa->numpages is 512 on my board
> > if
> > what=="unused kernel"
> > => __change_page_attr()
> > => try_preserve_large_page(); address=9b26bfacf000, pfn=8,
> > level=3; and the check loop count is 262144, exit loop after 861
> > usecs
> > on my board
> 
> You are talking about the static_protections() check, right?
> 
> > the actual problem
> > ===
> > sometimes, free_init_pages returns after hundreds of secounds. The
> > major impact is kernel boot time.
> 
> That's the symptom you are observing. The problem is in the
> try_to_preserve_large_page() logic.
> 
> The address range from the example above is:
> 
>0x9b26b000 - 0x9b26c000
> 
> i.e. 256 MB.
> 
> So the first call with address 0x9b26b000 will try to
> preserve the
> 1GB page, but it's obvious that if pgrot changes that this has to
> split the
> 1GB page.
> 
> The current code is stupid in that regard simply because it does the
> static_protection() check loop _before_ checking:
> 
>   1) Whether the new and the old pgprot are the same
>   
>   2) Whether the address range which needs to be changed is aligned
> to and
>  covers the full large mapping
> 
> So it does the static_protections() loop before #1 and #2 and checks
> the
> full 1GB page wise, which makes it loop 262144 times.
> 
> With your magic 'cache' logic this will still loop exactly 262144
> times at
> least on the first invocation because there is no valid information
> in that
> 'cache'. So I really have no idea how your patch makes any difference
> unless it is breaking stuff left and right in very subtle ways.
> 
> If there is a second invocation with the same pgprot on that very
> same
> range, then I can see it somehow having that effect by chance, but
> not by
> design.
> 
> But this is all voodoo programming and there is a very obvious and
> simple
> solution for this:
> 
>   The check for pgprot_val(new_prot) == pgprot_val(old_port) can
> definitely
>   be done _before_ the check loop. The check loop is pointless in
> that
>   case, really. If there is a mismatch anywhere then it existed
> already and
>   instead of yelling loudly the checking would just discover it,
> enforce
>   the split and that would in the worst case preserve the old wrong
> mapping
>   for those pages.
> 
>   What we want there is a debug mechanism which catches such cases,
> but is
>   not effective on production kernels and runs once or twice during
> boot.
> 
>   The range check whether the address is aligned to the large page
> and
>   covers the full large page (1G or 2M) is also obvious to do
> _before_ that
>   check, because if the requested range does not fit and has a
> different
>   pgprot_val() then it will decide to split after the check anyway.
> 
>   The check loop itself is stupid as well. Instead of looping in 4K
> steps
>   the thing can be rewritten to check for overlapping ranges and then
> check
>   explicitely for those. If there is no overlap in the first place
> the
>   whole loop thing can be avoided, but that's a pure optimization and
> needs
>   more thought than the straight forward and obvious solution to the
>   problem at hand.
> 
> Untested patch just moving the quick checks to the obvious right
> place
> below.
> 
> Thanks,
> 
>   tglx
> 
> 8<-
> --- a/arch/x86/mm/pageattr.c
> +++ b/arch/x86/mm/pageattr.c
> @@ -628,47 +628,61 @@ try_preserve_large_page(pte_t *kpte, uns
>   new_prot = static_protections(req_prot, address, pfn);
>  
>   /*
> -  * We need to check the full range, whether
> -  * static_protection() requires a different pgprot for one
> of
> -  * the pages in the range we try to preserve:
> +  * If there are no changes,

Re: KASAN: use-after-free Write in _free_event

2018-07-09 Thread syzbot


syzbot has found a reproducer for the following crash on:

HEAD commit:092150a25cb7 Merge branch 'for-linus' of git://git.kernel...
git tree:   upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=16627cc240
kernel config:  https://syzkaller.appspot.com/x/.config?x=25856fac4e580aa7
dashboard link: https://syzkaller.appspot.com/bug?extid=a24c397a29ad22d86c98
compiler:   gcc (GCC) 8.0.1 20180413 (experimental)
syzkaller repro:https://syzkaller.appspot.com/x/repro.syz?x=14d45afc40
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=16dd496840

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+a24c397a29ad22d86...@syzkaller.appspotmail.com

==
BUG: KASAN: use-after-free in atomic_dec_and_test  
include/asm-generic/atomic-instrumented.h:222 [inline]
BUG: KASAN: use-after-free in put_task_struct include/linux/sched/task.h:95  
[inline]
BUG: KASAN: use-after-free in _free_event+0x48d/0x1440  
kernel/events/core.c:4451

Write of size 4 at addr 8801ab6360e0 by task syz-executor205/5237

CPU: 1 PID: 5237 Comm: syz-executor205 Not tainted 4.18.0-rc4+ #139
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011

Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113
 print_address_description+0x6c/0x20b mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412
 check_memory_region_inline mm/kasan/kasan.c:260 [inline]
 check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267
 kasan_check_write+0x14/0x20 mm/kasan/kasan.c:278
 atomic_dec_and_test include/asm-generic/atomic-instrumented.h:222 [inline]
 put_task_struct include/linux/sched/task.h:95 [inline]
 _free_event+0x48d/0x1440 kernel/events/core.c:4451
 free_event+0xb4/0x180 kernel/events/core.c:4472
 perf_event_release_kernel+0x7d5/0x1050 kernel/events/core.c:4633
 perf_release+0x37/0x50 kernel/events/core.c:4647
 __fput+0x355/0x8b0 fs/file_table.c:209
 fput+0x15/0x20 fs/file_table.c:243
 task_work_run+0x1ec/0x2a0 kernel/task_work.c:113
 exit_task_work include/linux/task_work.h:22 [inline]
 do_exit+0x1b08/0x2750 kernel/exit.c:865
 do_group_exit+0x177/0x440 kernel/exit.c:968
 get_signal+0x88e/0x1970 kernel/signal.c:2468
 do_signal+0x9c/0x21c0 arch/x86/kernel/signal.c:816
 exit_to_usermode_loop+0x2e0/0x370 arch/x86/entry/common.c:162
 prepare_exit_to_usermode+0x342/0x3b0 arch/x86/entry/common.c:197
 retint_user+0x8/0x18
RIP: 0033:  (null)
Code: Bad RIP value.
RSP: 002b:2048 EFLAGS: 00010217
RAX:  RBX: 006dbc24 RCX: 00445d89
RDX: 2100 RSI: 2040 RDI: 
RBP:  R08: 2080 R09: 
R10: 2000 R11: 0246 R12: 006dbc20
R13: 0030656c69662f2e R14: 00408912 R15: 0006

Allocated by task 5236:
 save_stack+0x43/0xd0 mm/kasan/kasan.c:448
 set_track mm/kasan/kasan.c:460 [inline]
 kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553
 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:490
 kmem_cache_alloc_node+0x144/0x780 mm/slab.c:3644
 alloc_task_struct_node kernel/fork.c:157 [inline]
 dup_task_struct kernel/fork.c:779 [inline]
 copy_process.part.39+0x16b5/0x7220 kernel/fork.c:1641
 copy_process kernel/fork.c:1616 [inline]
 _do_fork+0x291/0x12a0 kernel/fork.c:2099
 __do_sys_clone kernel/fork.c:2206 [inline]
 __se_sys_clone kernel/fork.c:2200 [inline]
 __x64_sys_clone+0xbf/0x150 kernel/fork.c:2200
 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Freed by task 5236:
 save_stack+0x43/0xd0 mm/kasan/kasan.c:448
 set_track mm/kasan/kasan.c:460 [inline]
 __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521
 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
 __cache_free mm/slab.c:3498 [inline]
 kmem_cache_free+0x86/0x2d0 mm/slab.c:3756
 free_task_struct kernel/fork.c:162 [inline]
 free_task+0x16e/0x1f0 kernel/fork.c:390
 copy_process.part.39+0x15c9/0x7220 kernel/fork.c:2034
 copy_process kernel/fork.c:1616 [inline]
 _do_fork+0x291/0x12a0 kernel/fork.c:2099
 __do_sys_clone kernel/fork.c:2206 [inline]
 __se_sys_clone kernel/fork.c:2200 [inline]
 __x64_sys_clone+0xbf/0x150 kernel/fork.c:2200
 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

The buggy address belongs to the object at 8801ab6360c0
 which belongs to the cache task_struct of size 5952
The buggy address is located 32 bytes inside of
 5952-byte region [8801ab6360c0, 8801ab637800)
The buggy address belongs to the page:
page:ea0006ad8d80 count:1 mapcount:0 mapping:8801da94b200 index:0x0  
compound_mapcount: 0

flags: 0x2fffc008100(slab|head)
raw: 02fffc008100 ea0006ad9008 ea00075f5888 8801da94b200
raw:  8801ab6360c0

Re: KASAN: use-after-free Write in _free_event

2018-07-09 Thread syzbot


syzbot has found a reproducer for the following crash on:

HEAD commit:092150a25cb7 Merge branch 'for-linus' of git://git.kernel...
git tree:   upstream
console output: https://syzkaller.appspot.com/x/log.txt?x=16627cc240
kernel config:  https://syzkaller.appspot.com/x/.config?x=25856fac4e580aa7
dashboard link: https://syzkaller.appspot.com/bug?extid=a24c397a29ad22d86c98
compiler:   gcc (GCC) 8.0.1 20180413 (experimental)
syzkaller repro:https://syzkaller.appspot.com/x/repro.syz?x=14d45afc40
C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=16dd496840

IMPORTANT: if you fix the bug, please add the following tag to the commit:
Reported-by: syzbot+a24c397a29ad22d86...@syzkaller.appspotmail.com

==
BUG: KASAN: use-after-free in atomic_dec_and_test  
include/asm-generic/atomic-instrumented.h:222 [inline]
BUG: KASAN: use-after-free in put_task_struct include/linux/sched/task.h:95  
[inline]
BUG: KASAN: use-after-free in _free_event+0x48d/0x1440  
kernel/events/core.c:4451

Write of size 4 at addr 8801ab6360e0 by task syz-executor205/5237

CPU: 1 PID: 5237 Comm: syz-executor205 Not tainted 4.18.0-rc4+ #139
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS  
Google 01/01/2011

Call Trace:
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x1c9/0x2b4 lib/dump_stack.c:113
 print_address_description+0x6c/0x20b mm/kasan/report.c:256
 kasan_report_error mm/kasan/report.c:354 [inline]
 kasan_report.cold.7+0x242/0x2fe mm/kasan/report.c:412
 check_memory_region_inline mm/kasan/kasan.c:260 [inline]
 check_memory_region+0x13e/0x1b0 mm/kasan/kasan.c:267
 kasan_check_write+0x14/0x20 mm/kasan/kasan.c:278
 atomic_dec_and_test include/asm-generic/atomic-instrumented.h:222 [inline]
 put_task_struct include/linux/sched/task.h:95 [inline]
 _free_event+0x48d/0x1440 kernel/events/core.c:4451
 free_event+0xb4/0x180 kernel/events/core.c:4472
 perf_event_release_kernel+0x7d5/0x1050 kernel/events/core.c:4633
 perf_release+0x37/0x50 kernel/events/core.c:4647
 __fput+0x355/0x8b0 fs/file_table.c:209
 fput+0x15/0x20 fs/file_table.c:243
 task_work_run+0x1ec/0x2a0 kernel/task_work.c:113
 exit_task_work include/linux/task_work.h:22 [inline]
 do_exit+0x1b08/0x2750 kernel/exit.c:865
 do_group_exit+0x177/0x440 kernel/exit.c:968
 get_signal+0x88e/0x1970 kernel/signal.c:2468
 do_signal+0x9c/0x21c0 arch/x86/kernel/signal.c:816
 exit_to_usermode_loop+0x2e0/0x370 arch/x86/entry/common.c:162
 prepare_exit_to_usermode+0x342/0x3b0 arch/x86/entry/common.c:197
 retint_user+0x8/0x18
RIP: 0033:  (null)
Code: Bad RIP value.
RSP: 002b:2048 EFLAGS: 00010217
RAX:  RBX: 006dbc24 RCX: 00445d89
RDX: 2100 RSI: 2040 RDI: 
RBP:  R08: 2080 R09: 
R10: 2000 R11: 0246 R12: 006dbc20
R13: 0030656c69662f2e R14: 00408912 R15: 0006

Allocated by task 5236:
 save_stack+0x43/0xd0 mm/kasan/kasan.c:448
 set_track mm/kasan/kasan.c:460 [inline]
 kasan_kmalloc+0xc4/0xe0 mm/kasan/kasan.c:553
 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:490
 kmem_cache_alloc_node+0x144/0x780 mm/slab.c:3644
 alloc_task_struct_node kernel/fork.c:157 [inline]
 dup_task_struct kernel/fork.c:779 [inline]
 copy_process.part.39+0x16b5/0x7220 kernel/fork.c:1641
 copy_process kernel/fork.c:1616 [inline]
 _do_fork+0x291/0x12a0 kernel/fork.c:2099
 __do_sys_clone kernel/fork.c:2206 [inline]
 __se_sys_clone kernel/fork.c:2200 [inline]
 __x64_sys_clone+0xbf/0x150 kernel/fork.c:2200
 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

Freed by task 5236:
 save_stack+0x43/0xd0 mm/kasan/kasan.c:448
 set_track mm/kasan/kasan.c:460 [inline]
 __kasan_slab_free+0x11a/0x170 mm/kasan/kasan.c:521
 kasan_slab_free+0xe/0x10 mm/kasan/kasan.c:528
 __cache_free mm/slab.c:3498 [inline]
 kmem_cache_free+0x86/0x2d0 mm/slab.c:3756
 free_task_struct kernel/fork.c:162 [inline]
 free_task+0x16e/0x1f0 kernel/fork.c:390
 copy_process.part.39+0x15c9/0x7220 kernel/fork.c:2034
 copy_process kernel/fork.c:1616 [inline]
 _do_fork+0x291/0x12a0 kernel/fork.c:2099
 __do_sys_clone kernel/fork.c:2206 [inline]
 __se_sys_clone kernel/fork.c:2200 [inline]
 __x64_sys_clone+0xbf/0x150 kernel/fork.c:2200
 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x49/0xbe

The buggy address belongs to the object at 8801ab6360c0
 which belongs to the cache task_struct of size 5952
The buggy address is located 32 bytes inside of
 5952-byte region [8801ab6360c0, 8801ab637800)
The buggy address belongs to the page:
page:ea0006ad8d80 count:1 mapcount:0 mapping:8801da94b200 index:0x0  
compound_mapcount: 0

flags: 0x2fffc008100(slab|head)
raw: 02fffc008100 ea0006ad9008 ea00075f5888 8801da94b200
raw:  8801ab6360c0

Re: what trees/branches to test on syzbot

2018-07-09 Thread Tetsuo Handa

Andrew Morton wrote:
> On Sat, 7 Jul 2018 08:26:32 +0900 Tetsuo Handa 
>  wrote:
> 
> > Hello Andrew,
> > 
> > It seems that syzbot (experimentally ?) restarted testing linux-next.
> > 
> > May I ask you to carry temporarily debug printk() patch at
> > https://groups.google.com/d/msg/syzkaller-bugs/E8M8WTqt034/OpadOICfCAAJ
> > for "INFO: task hung in __sb_start_write" case?
> > 
> > The bug should be reproduced within a day if executed under syzbot 
> > environment.
> > Thus, I'm sure that we don't need to carry this patch for long.
> 
> Sure, I can add that.

Thank you.

>Let's get the build warning sorted out first,
> please.  Any old silly workaround will suffice in a developer-only
> debug patch.

The build warning is about mips architecture rather than this patch itself,
for x86_64 builds fine. You can add this patch despite the build warning.

Re: Kernel 4.17.4 lockup

2018-07-09 Thread H.J. Lu

On Mon, Jul 9, 2018 at 5:44 PM, Dave Hansen  wrote:
> ...  cc'ing a few folks who I know have been looking at this code
> lately.  The full oops is below if any of you want to take a look.
>
> OK, well, annotating the disassembly a bit:
>
>> (gdb) disass free_pages_and_swap_cache
>> Dump of assembler code for function free_pages_and_swap_cache:
>>0x8124c0d0 <+0>: callq  0x81a017a0 <__fentry__>
>>0x8124c0d5 <+5>: push   %r14
>>0x8124c0d7 <+7>: push   %r13
>>0x8124c0d9 <+9>: push   %r12
>>0x8124c0db <+11>: mov%rdi,%r12 // %r12 = pages
>>0x8124c0de <+14>: push   %rbp
>>0x8124c0df <+15>: mov%esi,%ebp // %ebp = nr
>>0x8124c0e1 <+17>: push   %rbx
>>0x8124c0e2 <+18>: callq  0x81205a10 
>>0x8124c0e7 <+23>: test   %ebp,%ebp // test nr==0
>>0x8124c0e9 <+25>: jle0x8124c156 
>> 
>>0x8124c0eb <+27>: lea-0x1(%rbp),%eax
>>0x8124c0ee <+30>: mov%r12,%rbx // %rbx = pages
>>0x8124c0f1 <+33>: lea0x8(%r12,%rax,8),%r14 // load [nr] 
>> into %r14?
>>0x8124c0f6 <+38>: mov(%rbx),%r13   // %r13 = pages[i]
>>0x8124c0f9 <+41>: mov0x20(%r13),%rdx   // 
>> GPF here.
> %r13 is 64-byte aligned, so looks like a halfway reasonable 'struct page *'.
>
> %R14 looks OK (0x93d4abb5f000) because it points to the end of a
> dynamically-allocated (not on-stack) mmu_gather_batch page.  %RBX is
> pointing 50 pages up from the start of the previous page.  That makes it
> the 48th page in pages[] after a pointer and two integers in the
> beginning of the structure.  That 48 is important because it's way
> larger than the on-stack size of 8.
>
> It's hard to make much sense of %R13 (pages[48] / 0xfffbf0809e304bc0)
> because the vmemmap addresses get randomized.  But, I _think_ that's too
> high of an address for a 4-level paging vmemmap[] entry.  Does anybody
> else know offhand?
>
> I'd really want to see this reproduced without KASLR to make the oops
> easier to read.  It would also be handy to try your workload with all
> the pedantic debugging: KASAN, slab debugging, DEBUG_PAGE_ALLOC, etc...
> and see if it still triggers.

How can I turn them on at boot time?

> Some relevant functions and structures below for reference.
>
> void free_pages_and_swap_cache(struct page **pages, int nr)
> {
> for (i = 0; i < nr; i++)
> free_swap_cache(pages[i]);
> }
>
>
> static void tlb_flush_mmu_free(struct mmu_gather *tlb)
> {
> for (batch = >local; batch && batch->nr;
>  batch = batch->next) {
> free_pages_and_swap_cache(batch->pages, batch->nr);
> }
>
> zap_pte_range()
> {
> if (force_flush)
> tlb_flush_mmu_free(tlb);
> }
>
> ... all the way up to the on-stack-allocated mmu_gather:
>
> void zap_page_range(struct vm_area_struct *vma, unsigned long start,
> unsigned long size)
> {
> struct mmu_gather tlb;
>
>
> #define MMU_GATHER_BUNDLE   8
>
> struct mmu_gather {
> ...
> struct mmu_gather_batch local;
> struct page *__pages[MMU_GATHER_BUNDLE];
> }
>
> struct mmu_gather_batch {
> struct mmu_gather_batch *next;
> unsigned intnr;
> unsigned intmax;
> struct page *pages[0];
> };
>
> #define MAX_GATHER_BATCH\
> ((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *))
>



-- 
H.J.

Re: what trees/branches to test on syzbot

2018-07-09 Thread Tetsuo Handa

Andrew Morton wrote:
> On Sat, 7 Jul 2018 08:26:32 +0900 Tetsuo Handa 
>  wrote:
> 
> > Hello Andrew,
> > 
> > It seems that syzbot (experimentally ?) restarted testing linux-next.
> > 
> > May I ask you to carry temporarily debug printk() patch at
> > https://groups.google.com/d/msg/syzkaller-bugs/E8M8WTqt034/OpadOICfCAAJ
> > for "INFO: task hung in __sb_start_write" case?
> > 
> > The bug should be reproduced within a day if executed under syzbot 
> > environment.
> > Thus, I'm sure that we don't need to carry this patch for long.
> 
> Sure, I can add that.

Thank you.

>Let's get the build warning sorted out first,
> please.  Any old silly workaround will suffice in a developer-only
> debug patch.

The build warning is about mips architecture rather than this patch itself,
for x86_64 builds fine. You can add this patch despite the build warning.

Re: Kernel 4.17.4 lockup

2018-07-09 Thread H.J. Lu

On Mon, Jul 9, 2018 at 5:44 PM, Dave Hansen  wrote:
> ...  cc'ing a few folks who I know have been looking at this code
> lately.  The full oops is below if any of you want to take a look.
>
> OK, well, annotating the disassembly a bit:
>
>> (gdb) disass free_pages_and_swap_cache
>> Dump of assembler code for function free_pages_and_swap_cache:
>>0x8124c0d0 <+0>: callq  0x81a017a0 <__fentry__>
>>0x8124c0d5 <+5>: push   %r14
>>0x8124c0d7 <+7>: push   %r13
>>0x8124c0d9 <+9>: push   %r12
>>0x8124c0db <+11>: mov%rdi,%r12 // %r12 = pages
>>0x8124c0de <+14>: push   %rbp
>>0x8124c0df <+15>: mov%esi,%ebp // %ebp = nr
>>0x8124c0e1 <+17>: push   %rbx
>>0x8124c0e2 <+18>: callq  0x81205a10 
>>0x8124c0e7 <+23>: test   %ebp,%ebp // test nr==0
>>0x8124c0e9 <+25>: jle0x8124c156 
>> 
>>0x8124c0eb <+27>: lea-0x1(%rbp),%eax
>>0x8124c0ee <+30>: mov%r12,%rbx // %rbx = pages
>>0x8124c0f1 <+33>: lea0x8(%r12,%rax,8),%r14 // load [nr] 
>> into %r14?
>>0x8124c0f6 <+38>: mov(%rbx),%r13   // %r13 = pages[i]
>>0x8124c0f9 <+41>: mov0x20(%r13),%rdx   // 
>> GPF here.
> %r13 is 64-byte aligned, so looks like a halfway reasonable 'struct page *'.
>
> %R14 looks OK (0x93d4abb5f000) because it points to the end of a
> dynamically-allocated (not on-stack) mmu_gather_batch page.  %RBX is
> pointing 50 pages up from the start of the previous page.  That makes it
> the 48th page in pages[] after a pointer and two integers in the
> beginning of the structure.  That 48 is important because it's way
> larger than the on-stack size of 8.
>
> It's hard to make much sense of %R13 (pages[48] / 0xfffbf0809e304bc0)
> because the vmemmap addresses get randomized.  But, I _think_ that's too
> high of an address for a 4-level paging vmemmap[] entry.  Does anybody
> else know offhand?
>
> I'd really want to see this reproduced without KASLR to make the oops
> easier to read.  It would also be handy to try your workload with all
> the pedantic debugging: KASAN, slab debugging, DEBUG_PAGE_ALLOC, etc...
> and see if it still triggers.

How can I turn them on at boot time?

> Some relevant functions and structures below for reference.
>
> void free_pages_and_swap_cache(struct page **pages, int nr)
> {
> for (i = 0; i < nr; i++)
> free_swap_cache(pages[i]);
> }
>
>
> static void tlb_flush_mmu_free(struct mmu_gather *tlb)
> {
> for (batch = >local; batch && batch->nr;
>  batch = batch->next) {
> free_pages_and_swap_cache(batch->pages, batch->nr);
> }
>
> zap_pte_range()
> {
> if (force_flush)
> tlb_flush_mmu_free(tlb);
> }
>
> ... all the way up to the on-stack-allocated mmu_gather:
>
> void zap_page_range(struct vm_area_struct *vma, unsigned long start,
> unsigned long size)
> {
> struct mmu_gather tlb;
>
>
> #define MMU_GATHER_BUNDLE   8
>
> struct mmu_gather {
> ...
> struct mmu_gather_batch local;
> struct page *__pages[MMU_GATHER_BUNDLE];
> }
>
> struct mmu_gather_batch {
> struct mmu_gather_batch *next;
> unsigned intnr;
> unsigned intmax;
> struct page *pages[0];
> };
>
> #define MAX_GATHER_BATCH\
> ((PAGE_SIZE - sizeof(struct mmu_gather_batch)) / sizeof(void *))
>



-- 
H.J.

Re: [PATCH 3/3] docs: Fix docs for boot parameter debug_boot_weak_hash

2018-07-09 Thread Steven Rostedt

On Tue, 10 Jul 2018 10:25:19 +1000
"Tobin C. Harding"  wrote:

> Recently boot parameter 'debug_boot_weak_hash' was added.  Latter
> discussion on LKML developed more descriptive documentation but due to
> the programmers lack of understanding (*cough* me) on how linux-next
> works these improvements never got picked up.
> 
> Update documentation for boot parameter 'debug_boot_weak_hash' as
> discussed (patch posted and acked) on LKML.
> 
> Fixes: bfe80ed3d7c76 ("vsprintf: add command line option 
> debug_boot_weak_hash")
> Acked-by: Randy Dunlap 
> Signed-off-by: Tobin C. Harding 

Reviewed-by: Steven Rostedt (VMware) 

-- Steve

> ---
>  Documentation/admin-guide/kernel-parameters.txt | 13 ++---
>  1 file changed, 6 insertions(+), 7 deletions(-)
> 
> diff --git a/Documentation/admin-guide/kernel-parameters.txt 
> b/Documentation/admin-guide/kernel-parameters.txt
> index 31c9b76058a7..e3b1c84d19c6 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -749,13 +749,12 @@
>   debug   [KNL] Enable kernel debugging (events log level).
>  
>   debug_boot_weak_hash
> - [KNL] Enable printing pointers early in the boot
> - sequence.  If enabled, we use a weak hash instead of
> - siphash to hash pointers.  Use this option if you need
> - to see pointer values during early boot (i.e you are
> - seeing instances of '(___ptrval___)').
> - Cryptographically insecure, please do not use on
> - production kernels.
> + [KNL] Enable printing [hashed] pointers early in the
> + boot sequence.  If enabled, we use a weak hash instead
> + of siphash to hash pointers.  Use this option if you are
> + seeing instances of '(___ptrval___)') and need to see a
> + value (hashed pointer) instead. Cryptographically
> + insecure, please do not use on production kernels.
>  
>   debug_locks_verbose=
>   [KNL] verbose self-tests

Re: [PATCH 3/3] docs: Fix docs for boot parameter debug_boot_weak_hash

2018-07-09 Thread Steven Rostedt

On Tue, 10 Jul 2018 10:25:19 +1000
"Tobin C. Harding"  wrote:

> Recently boot parameter 'debug_boot_weak_hash' was added.  Latter
> discussion on LKML developed more descriptive documentation but due to
> the programmers lack of understanding (*cough* me) on how linux-next
> works these improvements never got picked up.
> 
> Update documentation for boot parameter 'debug_boot_weak_hash' as
> discussed (patch posted and acked) on LKML.
> 
> Fixes: bfe80ed3d7c76 ("vsprintf: add command line option 
> debug_boot_weak_hash")
> Acked-by: Randy Dunlap 
> Signed-off-by: Tobin C. Harding 

Reviewed-by: Steven Rostedt (VMware) 

-- Steve

> ---
>  Documentation/admin-guide/kernel-parameters.txt | 13 ++---
>  1 file changed, 6 insertions(+), 7 deletions(-)
> 
> diff --git a/Documentation/admin-guide/kernel-parameters.txt 
> b/Documentation/admin-guide/kernel-parameters.txt
> index 31c9b76058a7..e3b1c84d19c6 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -749,13 +749,12 @@
>   debug   [KNL] Enable kernel debugging (events log level).
>  
>   debug_boot_weak_hash
> - [KNL] Enable printing pointers early in the boot
> - sequence.  If enabled, we use a weak hash instead of
> - siphash to hash pointers.  Use this option if you need
> - to see pointer values during early boot (i.e you are
> - seeing instances of '(___ptrval___)').
> - Cryptographically insecure, please do not use on
> - production kernels.
> + [KNL] Enable printing [hashed] pointers early in the
> + boot sequence.  If enabled, we use a weak hash instead
> + of siphash to hash pointers.  Use this option if you are
> + seeing instances of '(___ptrval___)') and need to see a
> + value (hashed pointer) instead. Cryptographically
> + insecure, please do not use on production kernels.
>  
>   debug_locks_verbose=
>   [KNL] verbose self-tests

Re: [PATCH 2/3] vsprintf: Remove incorrect EXPORT_SYMBOL

2018-07-09 Thread Steven Rostedt

On Tue, 10 Jul 2018 10:25:18 +1000
"Tobin C. Harding"  wrote:

> Recently boot variable 'debug_boot_weak_hash' was added (by me).  In a
> classic case of cargo cult programming the novice developer added a
> macro call to EXPORT_SYMBOL().  This is wrong and was pointed out on
> LKML.  As pointed out EXPORT_SYMBOL() cannot be used on static
> variables.
> 
> Remove incorrect usage of macro EXPORT_SYMBOL()
> 
> Fixes: bfe80ed3d7c76 ("vsprintf: add command line option 
> debug_boot_weak_hash")
> Signed-off-by: Tobin C. Harding 

Revieved-by: Steven Rostedt (VMware) 

Hmm, not sure if Ted rebases his trees or not, but if he does, he could
also fold this into that commit.

-- Steve

> ---
>  lib/vsprintf.c | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/lib/vsprintf.c b/lib/vsprintf.c
> index cdc2c355dff5..fe834a201f3d 100644
> --- a/lib/vsprintf.c
> +++ b/lib/vsprintf.c
> @@ -1655,7 +1655,6 @@ static 
> DEFINE_STATIC_KEY_TRUE(not_filled_random_ptr_key);
>  
>  /* Make pointers available for printing early in the boot sequence. */
>  static int debug_boot_weak_hash __ro_after_init;
> -EXPORT_SYMBOL(debug_boot_weak_hash);
>  
>  static int __init debug_boot_weak_hash_enable(char *str)
>  {

Re: [PATCH 2/3] vsprintf: Remove incorrect EXPORT_SYMBOL

2018-07-09 Thread Steven Rostedt

On Tue, 10 Jul 2018 10:25:18 +1000
"Tobin C. Harding"  wrote:

> Recently boot variable 'debug_boot_weak_hash' was added (by me).  In a
> classic case of cargo cult programming the novice developer added a
> macro call to EXPORT_SYMBOL().  This is wrong and was pointed out on
> LKML.  As pointed out EXPORT_SYMBOL() cannot be used on static
> variables.
> 
> Remove incorrect usage of macro EXPORT_SYMBOL()
> 
> Fixes: bfe80ed3d7c76 ("vsprintf: add command line option 
> debug_boot_weak_hash")
> Signed-off-by: Tobin C. Harding 

Revieved-by: Steven Rostedt (VMware) 

Hmm, not sure if Ted rebases his trees or not, but if he does, he could
also fold this into that commit.

-- Steve

> ---
>  lib/vsprintf.c | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/lib/vsprintf.c b/lib/vsprintf.c
> index cdc2c355dff5..fe834a201f3d 100644
> --- a/lib/vsprintf.c
> +++ b/lib/vsprintf.c
> @@ -1655,7 +1655,6 @@ static 
> DEFINE_STATIC_KEY_TRUE(not_filled_random_ptr_key);
>  
>  /* Make pointers available for printing early in the boot sequence. */
>  static int debug_boot_weak_hash __ro_after_init;
> -EXPORT_SYMBOL(debug_boot_weak_hash);
>  
>  static int __init debug_boot_weak_hash_enable(char *str)
>  {

Re: [PATCH v3 3/3] serial: 8250_dw: add fractional divisor support

2018-07-09 Thread Jisheng Zhang

Hi Andy,

On Mon, 9 Jul 2018 16:32:41 +0800 Jisheng Zhang wrote:

> Hi Andy,
> 
> On Mon, 9 Jul 2018 16:23:15 +0800 Jisheng Zhang wrote:
> 
> > For Synopsys DesignWare 8250 uart which version >= 4.00a, there's a
> > valid divisor latch fraction register. The fractional divisor width is
> > 4bits ~ 6bits.
> > 
> > Now the preparation is done, it's easy to add the feature support.
> > This patch firstly tries to get the fractional divisor width during
> > probe, then setups dw specific get_divisor() and set_divisor() hook.
> > 
> > Signed-off-by: Jisheng Zhang 
> > ---
> >  drivers/tty/serial/8250/8250_dw.c | 45 +++
> >  1 file changed, 45 insertions(+)
> > 
> > diff --git a/drivers/tty/serial/8250/8250_dw.c 
> > b/drivers/tty/serial/8250/8250_dw.c
> > index fa8a00e8c9c6..e90c3d229f00 100644
> > --- a/drivers/tty/serial/8250/8250_dw.c
> > +++ b/drivers/tty/serial/8250/8250_dw.c
> > @@ -31,6 +31,7 @@
> >  
> >  /* Offsets for the DesignWare specific registers */
> >  #define DW_UART_USR0x1f /* UART Status Register */
> > +#define DW_UART_DLF0xc0 /* Divisor Latch Fraction Register */
> >  #define DW_UART_CPR0xf4 /* Component Parameter Register */
> >  #define DW_UART_UCV0xf8 /* UART Component Version */
> >  
> > @@ -55,6 +56,7 @@
> >  
> >  struct dw8250_data {
> > u8  usr_reg;
> > +   u8  dlf_size;
> > int line;
> > int msr_mask_on;
> > int msr_mask_off;
> > @@ -366,6 +368,37 @@ static bool dw8250_idma_filter(struct dma_chan *chan, 
> > void *param)
> > return param == chan->device->dev->parent;
> >  }
> >  
> > +/*
> > + * divisor = clk / (16 * baud) = div(I) + div(F)
> > + * "I" means integer, "F" means fractional
> > + *
> > + * 2^dlf_size * clk / (16 * baud) = 2^dlf_size * (div(I) + div(F))
> > + * so, (clk << (dlf_siz - 4)) / baud = (div(I) + div(F)) << dlf_size
> > + *
> > + * let quot = DIV_ROUND_CLOSEST(clk << (dlf_size - 4), baud), we get
> > + * div(I) = quot >> dlf_size
> > + * div(F) = quot & dlf_mask, where dlf_mask = GENMASK(dlf_size - 1, 0)
> > + */
> > +static unsigned int dw8250_get_divisor(struct uart_port *p,
> > +  unsigned int baud,
> > +  unsigned int *frac)
> > +{
> > +   unsigned int quot;
> > +   struct dw8250_data *d = p->private_data;
> > +
> > +   quot = DIV_ROUND_CLOSEST(p->uartclk << (d->dlf_size - 4), baud);
> > +   *frac = quot & GENMASK(d->dlf_size - 1, 0);
> > +
> > +   return quot >> d->dlf_size;  
> 
> After more consideration, I sent out this version for the following two
> points:
> 
> 1. the max frac divisor width is 6bits now, but we dunno whether future IP
> extends it or not. This patch version can still support > 6bits width except
> the overflow concern. But fixing overflow(if there is) is as simple as using\
> the DIV_ROUND_CLOSEST_ULL macro.
> 
> 2. this version makes use of well implemented GENMASK to get the dlf_mask,
> the micro is well understood, I think. so the code is simplified as well.
> 
> 3. the magic "4" is explained in the comments, I hope it could help the
> code.

I agree with your "making the code simple" concern. So I have a consideration
again. I think I made the code complex unnecessarily, the simplest get_divisor 
code
could be:

get_divisor()
{
unsigned int quot, rem;

quot = clk / (16 * baud);
rem = clk % (16 *baud);
*frac = DIV_ROUND_CLOSEST(rem << dlf_size, 16*baud);

return quot;
}

Compared with previous version, this one adds one more div instruction,
but remove several "shift, and" instructions, the performance isn't that bad.
From another side, even this version gets a trivial performance regression,
the get_divisor() doesn't sit on the hot code path. Making the code simpler is
more important.

I will send a new version.

Thanks

Re: [PATCH v3 3/3] serial: 8250_dw: add fractional divisor support

2018-07-09 Thread Jisheng Zhang

Hi Andy,

On Mon, 9 Jul 2018 16:32:41 +0800 Jisheng Zhang wrote:

> Hi Andy,
> 
> On Mon, 9 Jul 2018 16:23:15 +0800 Jisheng Zhang wrote:
> 
> > For Synopsys DesignWare 8250 uart which version >= 4.00a, there's a
> > valid divisor latch fraction register. The fractional divisor width is
> > 4bits ~ 6bits.
> > 
> > Now the preparation is done, it's easy to add the feature support.
> > This patch firstly tries to get the fractional divisor width during
> > probe, then setups dw specific get_divisor() and set_divisor() hook.
> > 
> > Signed-off-by: Jisheng Zhang 
> > ---
> >  drivers/tty/serial/8250/8250_dw.c | 45 +++
> >  1 file changed, 45 insertions(+)
> > 
> > diff --git a/drivers/tty/serial/8250/8250_dw.c 
> > b/drivers/tty/serial/8250/8250_dw.c
> > index fa8a00e8c9c6..e90c3d229f00 100644
> > --- a/drivers/tty/serial/8250/8250_dw.c
> > +++ b/drivers/tty/serial/8250/8250_dw.c
> > @@ -31,6 +31,7 @@
> >  
> >  /* Offsets for the DesignWare specific registers */
> >  #define DW_UART_USR0x1f /* UART Status Register */
> > +#define DW_UART_DLF0xc0 /* Divisor Latch Fraction Register */
> >  #define DW_UART_CPR0xf4 /* Component Parameter Register */
> >  #define DW_UART_UCV0xf8 /* UART Component Version */
> >  
> > @@ -55,6 +56,7 @@
> >  
> >  struct dw8250_data {
> > u8  usr_reg;
> > +   u8  dlf_size;
> > int line;
> > int msr_mask_on;
> > int msr_mask_off;
> > @@ -366,6 +368,37 @@ static bool dw8250_idma_filter(struct dma_chan *chan, 
> > void *param)
> > return param == chan->device->dev->parent;
> >  }
> >  
> > +/*
> > + * divisor = clk / (16 * baud) = div(I) + div(F)
> > + * "I" means integer, "F" means fractional
> > + *
> > + * 2^dlf_size * clk / (16 * baud) = 2^dlf_size * (div(I) + div(F))
> > + * so, (clk << (dlf_siz - 4)) / baud = (div(I) + div(F)) << dlf_size
> > + *
> > + * let quot = DIV_ROUND_CLOSEST(clk << (dlf_size - 4), baud), we get
> > + * div(I) = quot >> dlf_size
> > + * div(F) = quot & dlf_mask, where dlf_mask = GENMASK(dlf_size - 1, 0)
> > + */
> > +static unsigned int dw8250_get_divisor(struct uart_port *p,
> > +  unsigned int baud,
> > +  unsigned int *frac)
> > +{
> > +   unsigned int quot;
> > +   struct dw8250_data *d = p->private_data;
> > +
> > +   quot = DIV_ROUND_CLOSEST(p->uartclk << (d->dlf_size - 4), baud);
> > +   *frac = quot & GENMASK(d->dlf_size - 1, 0);
> > +
> > +   return quot >> d->dlf_size;  
> 
> After more consideration, I sent out this version for the following two
> points:
> 
> 1. the max frac divisor width is 6bits now, but we dunno whether future IP
> extends it or not. This patch version can still support > 6bits width except
> the overflow concern. But fixing overflow(if there is) is as simple as using\
> the DIV_ROUND_CLOSEST_ULL macro.
> 
> 2. this version makes use of well implemented GENMASK to get the dlf_mask,
> the micro is well understood, I think. so the code is simplified as well.
> 
> 3. the magic "4" is explained in the comments, I hope it could help the
> code.

I agree with your "making the code simple" concern. So I have a consideration
again. I think I made the code complex unnecessarily, the simplest get_divisor 
code
could be:

get_divisor()
{
unsigned int quot, rem;

quot = clk / (16 * baud);
rem = clk % (16 *baud);
*frac = DIV_ROUND_CLOSEST(rem << dlf_size, 16*baud);

return quot;
}

Compared with previous version, this one adds one more div instruction,
but remove several "shift, and" instructions, the performance isn't that bad.
From another side, even this version gets a trivial performance regression,
the get_divisor() doesn't sit on the hot code path. Making the code simpler is
more important.

I will send a new version.

Thanks

Re: [PATCH 0/3] fix: enable early printing of hashed pointers

2018-07-09 Thread Steven Rostedt

On Tue, 10 Jul 2018 10:25:16 +1000
"Tobin C. Harding"  wrote:

> Since I'm a massive noob I did not realise that since v7 of this set was
> applied to random-next already that doing incremental versions was
> pointless.

I wouldn't say you are a noob anymore. You are making the same types of
mistakes that seasoned kernel developers make ;-)

-- Steve

Re: [PATCH 0/3] fix: enable early printing of hashed pointers

2018-07-09 Thread Steven Rostedt

On Tue, 10 Jul 2018 10:25:16 +1000
"Tobin C. Harding"  wrote:

> Since I'm a massive noob I did not realise that since v7 of this set was
> applied to random-next already that doing incremental versions was
> pointless.

I wouldn't say you are a noob anymore. You are making the same types of
mistakes that seasoned kernel developers make ;-)

-- Steve

Re: [PATCH 1/3] random: Remove unnecessary cast

2018-07-09 Thread Steven Rostedt

On Tue, 10 Jul 2018 10:25:17 +1000
"Tobin C. Harding"  wrote:

> Currently we are casting an argument to the macro min_t().  This is
> unnecessary since the macro already includes a cast.
> 
> Remove unnecessary cast from argument to macro min_t()
> 
> Signed-off-by: Tobin C. Harding 

Reviewed-by: Steven Rostedt (VMware) 

-- Steve

> ---
>  drivers/char/random.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/char/random.c b/drivers/char/random.c
> index d686aa2a129b..03bd13876901 100644
> --- a/drivers/char/random.c
> +++ b/drivers/char/random.c
> @@ -1736,7 +1736,7 @@ int __must_check get_random_bytes_arch(void *buf, int 
> nbytes)
>   trace_get_random_bytes_arch(left, _RET_IP_);
>   while (left) {
>   unsigned long v;
> - int chunk = min_t(int, left, (int)sizeof(unsigned long));
> + int chunk = min_t(int, left, sizeof(unsigned long));
>  
>   if (!arch_get_random_long())
>   break;

Re: [PATCH 1/3] random: Remove unnecessary cast

2018-07-09 Thread Steven Rostedt

On Tue, 10 Jul 2018 10:25:17 +1000
"Tobin C. Harding"  wrote:

> Currently we are casting an argument to the macro min_t().  This is
> unnecessary since the macro already includes a cast.
> 
> Remove unnecessary cast from argument to macro min_t()
> 
> Signed-off-by: Tobin C. Harding 

Reviewed-by: Steven Rostedt (VMware) 

-- Steve

> ---
>  drivers/char/random.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/char/random.c b/drivers/char/random.c
> index d686aa2a129b..03bd13876901 100644
> --- a/drivers/char/random.c
> +++ b/drivers/char/random.c
> @@ -1736,7 +1736,7 @@ int __must_check get_random_bytes_arch(void *buf, int 
> nbytes)
>   trace_get_random_bytes_arch(left, _RET_IP_);
>   while (left) {
>   unsigned long v;
> - int chunk = min_t(int, left, (int)sizeof(unsigned long));
> + int chunk = min_t(int, left, sizeof(unsigned long));
>  
>   if (!arch_get_random_long())
>   break;

Re: [PATCH -mm -v4 02/21] mm, THP, swap: Make CONFIG_THP_SWAP depends on CONFIG_SWAP

2018-07-09 Thread Dave Hansen

On 07/09/2018 06:19 PM, Huang, Ying wrote:
> Dave Hansen  writes:
> 
>>>  config THP_SWAP
>>> def_bool y
>>> -   depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP
>>> +   depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP
>>> help
>>
>> This seems like a bug-fix.  Is there a reason this didn't cause problems
>> up to now?
> Yes.  The original code has some problem in theory, but not in practice
> because all code enclosed by
> 
> #ifdef CONFIG_THP_SWAP
> #endif
> 
> are in swapfile.c.  But that will be not true in this patchset.

That's great info for the changelog.

Re: [PATCH -mm -v4 02/21] mm, THP, swap: Make CONFIG_THP_SWAP depends on CONFIG_SWAP

2018-07-09 Thread Dave Hansen

On 07/09/2018 06:19 PM, Huang, Ying wrote:
> Dave Hansen  writes:
> 
>>>  config THP_SWAP
>>> def_bool y
>>> -   depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP
>>> +   depends on TRANSPARENT_HUGEPAGE && ARCH_WANTS_THP_SWAP && SWAP
>>> help
>>
>> This seems like a bug-fix.  Is there a reason this didn't cause problems
>> up to now?
> Yes.  The original code has some problem in theory, but not in practice
> because all code enclosed by
> 
> #ifdef CONFIG_THP_SWAP
> #endif
> 
> are in swapfile.c.  But that will be not true in this patchset.

That's great info for the changelog.

Re: [patch] mm, vmacache: hash addresses based on pmd

2018-07-09 Thread David Rientjes

On Mon, 9 Jul 2018, Andrew Morton wrote:

> > When perf profiling a wide variety of different workloads, it was found
> > that vmacache_find() had higher than expected cost: up to 0.08% of cpu
> > utilization in some cases.  This was found to rival other core VM
> > functions such as alloc_pages_vma() with thp enabled and default
> > mempolicy, and the conditionals in __get_vma_policy().
> > 
> > VMACACHE_HASH() determines which of the four per-task_struct slots a vma
> > is cached for a particular address.  This currently depends on the pfn,
> > so pfn 5212 occupies a different vmacache slot than its neighboring
> > pfn 5213.
> > 
> > vmacache_find() iterates through all four of current's vmacache slots
> > when looking up an address.  Hashing based on pfn, an address has
> > ~1/VMACACHE_SIZE chance of being cached in the first vmacache slot, or
> > about 25%, *if* the vma is cached.
> > 
> > This patch hashes an address by its pmd instead of pte to optimize for
> > workloads with good spatial locality.  This results in a higher
> > probability of vmas being cached in the first slot that is checked:
> > normally ~70% on the same workloads instead of 25%.
> 
> Was the improvement quantifiable?
> 

I've run page fault testing to answer this question on Haswell since the 
initial profiling was done over a wide variety of user-controlled 
workloads and there's no guarantee that such profiling would be a fair 
comparison either way.  For page faulting it's either falling below our 
testing levels of 0.02%, or is right at 0.02%.  Running without the patch 
it's 0.05-0.06% overhead.

> Surprised.  That little array will all be in CPU cache and that loop
> should execute pretty quickly?  If it's *that* sensitive then let's zap
> the no-longer-needed WARN_ON.  And we could hide all the event counting
> behind some developer-only ifdef.
> 

Those vmevents are only defined for CONFIG_DEBUG_VM_VMACACHE, so no change 
needed there.  The WARN_ON() could be moved under the same config option.  
I assume that if such a config option exists that at least somebody is 
interested in debugging mm/vmacache.c once in a while.

> Did you consider LRU-sorting the array instead?
> 

It adds 40 bytes to struct task_struct, but I'm not sure the least 
recently used is the first preferred check.  If I do 
madvise(MADV_DONTNEED) from a malloc implementation where I don't control 
what is free()'d and I'm constantly freeing back to the same hugepages, 
for example, I may always get first slot cache hits with this patch as 
opposed to the 25% chance that the current implementation has (and perhaps 
an lru would as well).

I'm sure that I could construct a workload where LRU would be better and 
could show that the added footprint were worthwhile, but I could also 
construct a workload where the current implementation based on pfn would 
outperform all of these.  It simply turns out that on the user-controlled 
workloads that I was profiling that hashing based on pmd was the win.

Re: [PATCH v2 2/2] drivers: core: Remove glue dirs from sysfs earlier

2018-07-09 Thread Benjamin Herrenschmidt

On Mon, 2018-07-09 at 17:33 -0700, Linus Torvalds wrote:
> On Mon, Jul 9, 2018 at 5:29 PM Benjamin Herrenschmidt
>  wrote:
> > 
> > For devices with a class, we create a "glue" directory between
> > the parent device and the new device with the class name.
> > 
> > This directory is never "explicitely" removed when empty however,
> 
> explicitly
> 
> Is the mis-spelling why you had the quotes? I do find that spelling in
> the kernel, but not in drivers/base/.

No idea :-) Just my poor english, I'm not sure why I put quotes, I
think I meant *explictly* as enphasis, not sure.

> > This fixes it by instead doing an explicit kobject_del() when
> > the glue dir is empty, by keeping track of the number of
> > child devices of the gluedir.
> 
> Ack. This looks good to me.
> 
> I didn't see your 1/2 - you should probably re-send that one too so
> that Greg doesn't have to fish for it. But I'll Ack that one too in
> this same email regardless.

OK, I didn't re-send it. Greg just nak'ed it though :-)

Cheers,
Ben.

Re: [patch] mm, vmacache: hash addresses based on pmd

2018-07-09 Thread David Rientjes

On Mon, 9 Jul 2018, Andrew Morton wrote:

> > When perf profiling a wide variety of different workloads, it was found
> > that vmacache_find() had higher than expected cost: up to 0.08% of cpu
> > utilization in some cases.  This was found to rival other core VM
> > functions such as alloc_pages_vma() with thp enabled and default
> > mempolicy, and the conditionals in __get_vma_policy().
> > 
> > VMACACHE_HASH() determines which of the four per-task_struct slots a vma
> > is cached for a particular address.  This currently depends on the pfn,
> > so pfn 5212 occupies a different vmacache slot than its neighboring
> > pfn 5213.
> > 
> > vmacache_find() iterates through all four of current's vmacache slots
> > when looking up an address.  Hashing based on pfn, an address has
> > ~1/VMACACHE_SIZE chance of being cached in the first vmacache slot, or
> > about 25%, *if* the vma is cached.
> > 
> > This patch hashes an address by its pmd instead of pte to optimize for
> > workloads with good spatial locality.  This results in a higher
> > probability of vmas being cached in the first slot that is checked:
> > normally ~70% on the same workloads instead of 25%.
> 
> Was the improvement quantifiable?
> 

I've run page fault testing to answer this question on Haswell since the 
initial profiling was done over a wide variety of user-controlled 
workloads and there's no guarantee that such profiling would be a fair 
comparison either way.  For page faulting it's either falling below our 
testing levels of 0.02%, or is right at 0.02%.  Running without the patch 
it's 0.05-0.06% overhead.

> Surprised.  That little array will all be in CPU cache and that loop
> should execute pretty quickly?  If it's *that* sensitive then let's zap
> the no-longer-needed WARN_ON.  And we could hide all the event counting
> behind some developer-only ifdef.
> 

Those vmevents are only defined for CONFIG_DEBUG_VM_VMACACHE, so no change 
needed there.  The WARN_ON() could be moved under the same config option.  
I assume that if such a config option exists that at least somebody is 
interested in debugging mm/vmacache.c once in a while.

> Did you consider LRU-sorting the array instead?
> 

It adds 40 bytes to struct task_struct, but I'm not sure the least 
recently used is the first preferred check.  If I do 
madvise(MADV_DONTNEED) from a malloc implementation where I don't control 
what is free()'d and I'm constantly freeing back to the same hugepages, 
for example, I may always get first slot cache hits with this patch as 
opposed to the 25% chance that the current implementation has (and perhaps 
an lru would as well).

I'm sure that I could construct a workload where LRU would be better and 
could show that the added footprint were worthwhile, but I could also 
construct a workload where the current implementation based on pfn would 
outperform all of these.  It simply turns out that on the user-controlled 
workloads that I was profiling that hashing based on pmd was the win.

Re: [PATCH v2 2/2] drivers: core: Remove glue dirs from sysfs earlier

2018-07-09 Thread Benjamin Herrenschmidt

On Mon, 2018-07-09 at 17:33 -0700, Linus Torvalds wrote:
> On Mon, Jul 9, 2018 at 5:29 PM Benjamin Herrenschmidt
>  wrote:
> > 
> > For devices with a class, we create a "glue" directory between
> > the parent device and the new device with the class name.
> > 
> > This directory is never "explicitely" removed when empty however,
> 
> explicitly
> 
> Is the mis-spelling why you had the quotes? I do find that spelling in
> the kernel, but not in drivers/base/.

No idea :-) Just my poor english, I'm not sure why I put quotes, I
think I meant *explictly* as enphasis, not sure.

> > This fixes it by instead doing an explicit kobject_del() when
> > the glue dir is empty, by keeping track of the number of
> > child devices of the gluedir.
> 
> Ack. This looks good to me.
> 
> I didn't see your 1/2 - you should probably re-send that one too so
> that Greg doesn't have to fish for it. But I'll Ack that one too in
> this same email regardless.

OK, I didn't re-send it. Greg just nak'ed it though :-)

Cheers,
Ben.

[PATCH v2] objtool: move libelf detection to Kconfig from Makefile

2018-07-09 Thread Masahiro Yamada

Currently, users are allowed to enable STACK_VALIDATION regardless
of the compiler capability.  The top-level Makefile warns or breaks
the build if it turns out that the host compiler cannot link libelf.

Move the libelf test to Kconfig so that users can enable the feature
only when the host compiler can build the objtool.  The ugly check
in the Makefile will go away.

Signed-off-by: Masahiro Yamada 
Acked-by: Josh Poimboeuf 
---

Changes in v2:
  - Move package information to help of STACK_VALIDATION

 Makefile   | 14 +-
 arch/x86/Kconfig   |  2 +-
 arch/x86/Kconfig.debug |  2 +-
 lib/Kconfig.debug  |  6 +-
 scripts/Makefile.build |  2 --
 5 files changed, 8 insertions(+), 18 deletions(-)

diff --git a/Makefile b/Makefile
index 925c55f..65befa0 100644
--- a/Makefile
+++ b/Makefile
@@ -927,19 +927,7 @@ endif
 export mod_sign_cmd
 
 ifdef CONFIG_STACK_VALIDATION
-  has_libelf := $(call try-run,\
-   echo "int main() {}" | $(HOSTCC) -xc -o /dev/null -lelf -,1,0)
-  ifeq ($(has_libelf),1)
-objtool_target := tools/objtool FORCE
-  else
-ifdef CONFIG_UNWINDER_ORC
-  $(error "Cannot generate ORC metadata for CONFIG_UNWINDER_ORC=y, please 
install libelf-dev, libelf-devel or elfutils-libelf-devel")
-else
-  $(warning "Cannot use CONFIG_STACK_VALIDATION=y, please install 
libelf-dev, libelf-devel or elfutils-libelf-devel")
-endif
-SKIP_STACK_VALIDATION := 1
-export SKIP_STACK_VALIDATION
-  endif
+objtool_target := tools/objtool
 endif
 
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f1dbb4e..c1ded99 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -434,7 +434,7 @@ config GOLDFISH
 config RETPOLINE
bool "Avoid speculative indirect branches in kernel"
default y
-   select STACK_VALIDATION if HAVE_STACK_VALIDATION
+   imply STACK_VALIDATION
help
  Compile kernel with the retpoline compiler options to guard against
  kernel-to-user data leaks by avoiding speculative indirect
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index c6dd1d9..4713b3c 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -364,7 +364,7 @@ choice
 config UNWINDER_ORC
bool "ORC unwinder"
depends on X86_64
-   select STACK_VALIDATION
+   depends on STACK_VALIDATION
---help---
  This option enables the ORC (Oops Rewind Capability) unwinder for
  unwinding kernel stack traces.  It uses a custom data format which is
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 8838d11..f4d48af 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -364,7 +364,7 @@ config FRAME_POINTER
 config STACK_VALIDATION
bool "Compile-time stack metadata validation"
depends on HAVE_STACK_VALIDATION
-   default n
+   depends on $(success,echo "int main() {}" | $(HOSTCC) -xc -o /dev/null 
-lelf -)
help
  Add compile-time checks to validate stack metadata, including frame
  pointers (if CONFIG_FRAME_POINTER is enabled).  This helps ensure
@@ -373,6 +373,10 @@ config STACK_VALIDATION
  This is also a prerequisite for generation of ORC unwind data, which
  is needed for CONFIG_UNWINDER_ORC.
 
+ To enable this, the host compiler needs to be able to link libelf.
+ If it is missing on your system, please install libelf-dev,
+ libelf-devel or elfutils-libelf-devel.
+
  For more information, see
  tools/objtool/Documentation/stack-validation.txt.
 
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index e7889f4..154205b 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -243,7 +243,6 @@ endif # -record-mcount
 endif # CONFIG_FTRACE_MCOUNT_RECORD
 
 ifdef CONFIG_STACK_VALIDATION
-ifneq ($(SKIP_STACK_VALIDATION),1)
 
 __objtool_obj := $(objtree)/tools/objtool/objtool
 
@@ -282,7 +281,6 @@ objtool_obj = $(if $(patsubst y%,, \

$(OBJECT_FILES_NON_STANDARD_$(basetarget).o)$(OBJECT_FILES_NON_STANDARD)n), \
$(__objtool_obj))
 
-endif # SKIP_STACK_VALIDATION
 endif # CONFIG_STACK_VALIDATION
 
 # Rebuild all objects when objtool changes, or is enabled/disabled.
-- 
2.7.4

[PATCH v2] objtool: move libelf detection to Kconfig from Makefile

2018-07-09 Thread Masahiro Yamada

Currently, users are allowed to enable STACK_VALIDATION regardless
of the compiler capability.  The top-level Makefile warns or breaks
the build if it turns out that the host compiler cannot link libelf.

Move the libelf test to Kconfig so that users can enable the feature
only when the host compiler can build the objtool.  The ugly check
in the Makefile will go away.

Signed-off-by: Masahiro Yamada 
Acked-by: Josh Poimboeuf 
---

Changes in v2:
  - Move package information to help of STACK_VALIDATION

 Makefile   | 14 +-
 arch/x86/Kconfig   |  2 +-
 arch/x86/Kconfig.debug |  2 +-
 lib/Kconfig.debug  |  6 +-
 scripts/Makefile.build |  2 --
 5 files changed, 8 insertions(+), 18 deletions(-)

diff --git a/Makefile b/Makefile
index 925c55f..65befa0 100644
--- a/Makefile
+++ b/Makefile
@@ -927,19 +927,7 @@ endif
 export mod_sign_cmd
 
 ifdef CONFIG_STACK_VALIDATION
-  has_libelf := $(call try-run,\
-   echo "int main() {}" | $(HOSTCC) -xc -o /dev/null -lelf -,1,0)
-  ifeq ($(has_libelf),1)
-objtool_target := tools/objtool FORCE
-  else
-ifdef CONFIG_UNWINDER_ORC
-  $(error "Cannot generate ORC metadata for CONFIG_UNWINDER_ORC=y, please 
install libelf-dev, libelf-devel or elfutils-libelf-devel")
-else
-  $(warning "Cannot use CONFIG_STACK_VALIDATION=y, please install 
libelf-dev, libelf-devel or elfutils-libelf-devel")
-endif
-SKIP_STACK_VALIDATION := 1
-export SKIP_STACK_VALIDATION
-  endif
+objtool_target := tools/objtool
 endif
 
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f1dbb4e..c1ded99 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -434,7 +434,7 @@ config GOLDFISH
 config RETPOLINE
bool "Avoid speculative indirect branches in kernel"
default y
-   select STACK_VALIDATION if HAVE_STACK_VALIDATION
+   imply STACK_VALIDATION
help
  Compile kernel with the retpoline compiler options to guard against
  kernel-to-user data leaks by avoiding speculative indirect
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index c6dd1d9..4713b3c 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -364,7 +364,7 @@ choice
 config UNWINDER_ORC
bool "ORC unwinder"
depends on X86_64
-   select STACK_VALIDATION
+   depends on STACK_VALIDATION
---help---
  This option enables the ORC (Oops Rewind Capability) unwinder for
  unwinding kernel stack traces.  It uses a custom data format which is
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 8838d11..f4d48af 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -364,7 +364,7 @@ config FRAME_POINTER
 config STACK_VALIDATION
bool "Compile-time stack metadata validation"
depends on HAVE_STACK_VALIDATION
-   default n
+   depends on $(success,echo "int main() {}" | $(HOSTCC) -xc -o /dev/null 
-lelf -)
help
  Add compile-time checks to validate stack metadata, including frame
  pointers (if CONFIG_FRAME_POINTER is enabled).  This helps ensure
@@ -373,6 +373,10 @@ config STACK_VALIDATION
  This is also a prerequisite for generation of ORC unwind data, which
  is needed for CONFIG_UNWINDER_ORC.
 
+ To enable this, the host compiler needs to be able to link libelf.
+ If it is missing on your system, please install libelf-dev,
+ libelf-devel or elfutils-libelf-devel.
+
  For more information, see
  tools/objtool/Documentation/stack-validation.txt.
 
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index e7889f4..154205b 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -243,7 +243,6 @@ endif # -record-mcount
 endif # CONFIG_FTRACE_MCOUNT_RECORD
 
 ifdef CONFIG_STACK_VALIDATION
-ifneq ($(SKIP_STACK_VALIDATION),1)
 
 __objtool_obj := $(objtree)/tools/objtool/objtool
 
@@ -282,7 +281,6 @@ objtool_obj = $(if $(patsubst y%,, \

$(OBJECT_FILES_NON_STANDARD_$(basetarget).o)$(OBJECT_FILES_NON_STANDARD)n), \
$(__objtool_obj))
 
-endif # SKIP_STACK_VALIDATION
 endif # CONFIG_STACK_VALIDATION
 
 # Rebuild all objects when objtool changes, or is enabled/disabled.
-- 
2.7.4

linux-next: build warnings after merge of the rdma tree

2018-07-09 Thread Stephen Rothwell

Hi all,

After merging the rdma tree, today's linux-next build (x86_64
allmodconfig) produced these warnings:

In file included from include/linux/printk.h:336:0,
 from include/linux/kernel.h:14,
 from arch/x86/include/asm/percpu.h:45,
 from arch/x86/include/asm/current.h:6,
 from include/linux/mutex.h:14,
 from drivers/infiniband/hw/cxgb4/iw_cxgb4.h:34,
 from drivers/infiniband/hw/cxgb4/cq.c:33:
drivers/infiniband/hw/cxgb4/cq.c: In function '__c4iw_poll_cq_one':
include/linux/dynamic_debug.h:127:3: warning: 'cqe.u.gen.wrid_hi' may be used 
uninitialized in this function [-Wmaybe-uninitialized]
   __dynamic_pr_debug(, pr_fmt(fmt), \
   ^~
drivers/infiniband/hw/cxgb4/cq.c:674:16: note: 'cqe.u.gen.wrid_hi' was declared 
here
  struct t4_cqe cqe;
^~~
In file included from include/linux/printk.h:336:0,
 from include/linux/kernel.h:14,
 from arch/x86/include/asm/percpu.h:45,
 from arch/x86/include/asm/current.h:6,
 from include/linux/mutex.h:14,
 from drivers/infiniband/hw/cxgb4/iw_cxgb4.h:34,
 from drivers/infiniband/hw/cxgb4/cq.c:33:
include/linux/dynamic_debug.h:127:3: warning: 'cqe.u.gen.wrid_low' may be used 
uninitialized in this function [-Wmaybe-uninitialized]
   __dynamic_pr_debug(, pr_fmt(fmt), \
   ^~
drivers/infiniband/hw/cxgb4/cq.c:674:16: note: 'cqe.u.gen.wrid_low' was 
declared here
  struct t4_cqe cqe;
^~~
In file included from include/linux/printk.h:336:0,
 from include/linux/kernel.h:14,
 from arch/x86/include/asm/percpu.h:45,
 from arch/x86/include/asm/current.h:6,
 from include/linux/mutex.h:14,
 from drivers/infiniband/hw/cxgb4/iw_cxgb4.h:34,
 from drivers/infiniband/hw/cxgb4/cq.c:33:
include/linux/dynamic_debug.h:127:3: warning: 'cqe.len' may be used 
uninitialized in this function [-Wmaybe-uninitialized]
   __dynamic_pr_debug(, pr_fmt(fmt), \
   ^~
drivers/infiniband/hw/cxgb4/cq.c:674:16: note: 'cqe.len' was declared here
  struct t4_cqe cqe;
^~~

Introduced by commit

  4ab39e2f98f2 ("RDMA/cxgb4: Make c4iw_poll_cq_one() easier to analyze")

Again, I can't easily tell if these are false positives or not.

-- 
Cheers,
Stephen Rothwell


pgp3FHnKV3M0j.pgp
Description: OpenPGP digital signature

linux-next: build warnings after merge of the rdma tree

2018-07-09 Thread Stephen Rothwell

Hi all,

After merging the rdma tree, today's linux-next build (x86_64
allmodconfig) produced these warnings:

In file included from include/linux/printk.h:336:0,
 from include/linux/kernel.h:14,
 from arch/x86/include/asm/percpu.h:45,
 from arch/x86/include/asm/current.h:6,
 from include/linux/mutex.h:14,
 from drivers/infiniband/hw/cxgb4/iw_cxgb4.h:34,
 from drivers/infiniband/hw/cxgb4/cq.c:33:
drivers/infiniband/hw/cxgb4/cq.c: In function '__c4iw_poll_cq_one':
include/linux/dynamic_debug.h:127:3: warning: 'cqe.u.gen.wrid_hi' may be used 
uninitialized in this function [-Wmaybe-uninitialized]
   __dynamic_pr_debug(, pr_fmt(fmt), \
   ^~
drivers/infiniband/hw/cxgb4/cq.c:674:16: note: 'cqe.u.gen.wrid_hi' was declared 
here
  struct t4_cqe cqe;
^~~
In file included from include/linux/printk.h:336:0,
 from include/linux/kernel.h:14,
 from arch/x86/include/asm/percpu.h:45,
 from arch/x86/include/asm/current.h:6,
 from include/linux/mutex.h:14,
 from drivers/infiniband/hw/cxgb4/iw_cxgb4.h:34,
 from drivers/infiniband/hw/cxgb4/cq.c:33:
include/linux/dynamic_debug.h:127:3: warning: 'cqe.u.gen.wrid_low' may be used 
uninitialized in this function [-Wmaybe-uninitialized]
   __dynamic_pr_debug(, pr_fmt(fmt), \
   ^~
drivers/infiniband/hw/cxgb4/cq.c:674:16: note: 'cqe.u.gen.wrid_low' was 
declared here
  struct t4_cqe cqe;
^~~
In file included from include/linux/printk.h:336:0,
 from include/linux/kernel.h:14,
 from arch/x86/include/asm/percpu.h:45,
 from arch/x86/include/asm/current.h:6,
 from include/linux/mutex.h:14,
 from drivers/infiniband/hw/cxgb4/iw_cxgb4.h:34,
 from drivers/infiniband/hw/cxgb4/cq.c:33:
include/linux/dynamic_debug.h:127:3: warning: 'cqe.len' may be used 
uninitialized in this function [-Wmaybe-uninitialized]
   __dynamic_pr_debug(, pr_fmt(fmt), \
   ^~
drivers/infiniband/hw/cxgb4/cq.c:674:16: note: 'cqe.len' was declared here
  struct t4_cqe cqe;
^~~

Introduced by commit

  4ab39e2f98f2 ("RDMA/cxgb4: Make c4iw_poll_cq_one() easier to analyze")

Again, I can't easily tell if these are false positives or not.

-- 
Cheers,
Stephen Rothwell


pgp3FHnKV3M0j.pgp
Description: OpenPGP digital signature

1 2 3 4 5 6 7 8 9 10 >

1 - 100 of 1608 matches

Mail list logo