Re: [PATCH kernel v4 08/11] powerpc/powernv/ioda2: Export debug helper pe_level_printk()

2016-05-02 Thread Alistair Popple
On Tue, 3 May 2016 15:46:33 Alistair Popple wrote:
> There's one call to pr_warn() in pnv_npu_disable_bypass() that could arguably
> be converted to pe_warn(), but we can clean that up later as the patch looks
> fine and I'm assuming subsequent patches make use of these.

And inevitably the next patch in the series cleans that up anyway. Feel free 
to ignore the noise above :-)

> Reviewed-By: Alistair Popple 
> 
> On Fri, 29 Apr 2016 18:55:21 Alexey Kardashevskiy wrote:
> > This exports debugging helper pe_level_printk() and corresponding macros
> > so they can be used in npu-dma.c.
> > 
> > Signed-off-by: Alexey Kardashevskiy 
> > ---
> >  arch/powerpc/platforms/powernv/pci-ioda.c | 9 +
> >  arch/powerpc/platforms/powernv/pci.h  | 9 +
> >  2 files changed, 10 insertions(+), 8 deletions(-)
> > 
> > diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
> b/arch/powerpc/platforms/powernv/pci-ioda.c
> > index 272521e..db7695f 100644
> > --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> > +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> > @@ -56,7 +56,7 @@
> >  
> >  static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
> >  
> > -static void pe_level_printk(const struct pnv_ioda_pe *pe, const char 
> *level,
> > +void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
> > const char *fmt, ...)
> >  {
> > struct va_format vaf;
> > @@ -87,13 +87,6 @@ static void pe_level_printk(const struct pnv_ioda_pe 
*pe, 
> const char *level,
> > va_end(args);
> >  }
> >  
> > -#define pe_err(pe, fmt, ...)   \
> > -   pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__)
> > -#define pe_warn(pe, fmt, ...)  \
> > -   pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__)
> > -#define pe_info(pe, fmt, ...)  \
> > -   pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)
> > -
> >  static bool pnv_iommu_bypass_disabled __read_mostly;
> >  
> >  static int __init iommu_setup(char *str)
> > diff --git a/arch/powerpc/platforms/powernv/pci.h 
> b/arch/powerpc/platforms/powernv/pci.h
> > index d574a9d..485e5b1 100644
> > --- a/arch/powerpc/platforms/powernv/pci.h
> > +++ b/arch/powerpc/platforms/powernv/pci.h
> > @@ -236,6 +236,15 @@ extern void pnv_pci_dma_bus_setup(struct pci_bus 
*bus);
> >  extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
> >  extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
> >  
> > +extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char 
> *level,
> > +   const char *fmt, ...);
> > +#define pe_err(pe, fmt, ...)   \
> > +   pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__)
> > +#define pe_warn(pe, fmt, ...)  \
> > +   pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__)
> > +#define pe_info(pe, fmt, ...)  \
> > +   pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)
> > +
> >  /* Nvlink functions */
> >  extern void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe);
> >  extern void pnv_npu_setup_dma_pe(struct pnv_ioda_pe *npe);
> > 



Re: [PATCH 2/2] zram: user per-cpu compression streams

2016-05-02 Thread Sergey Senozhatsky
On (05/03/16 14:40), Minchan Kim wrote:
[..]
> > At least, we need sanity check code, still?
> > Otherwise, user can echo "garbage" > /sys/xxx/max_comp_stream and then
> > cat /sys/xxx/max_comp_stream returns num_online_cpus.
> 
> One more thing,
> 
> User:
> echo 4 > /sys/xxx/max_comp_streams
> cat /sys/xxx/max_comp_streams
> 8

sure, it can also be

cat /sys/xxx/max_comp_streams
5
cat /sys/xxx/max_comp_streams
6
cat /sys/xxx/max_comp_streams
7
cat /sys/xxx/max_comp_streams
3

depending on the availability of CPUs. but why would user space
constantly check max_comp_streams?

> which is rather weird?
> 
> We should keep the user's value and return it to the user although it's
> technically lying. IMO, it would be the best way to prevent confusion for the
> user until we remove max_comp_streams finally.

well, I preferred to show the actual state of the device. besides,
does anyone really do

write buffer to file
if (success)
read from file and compare with the buffer

?
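
for the record, spelled out that check is a trivial user-space sketch along
these lines (hypothetical code, helper name made up; not something any known
tool actually does):

        #include <stdio.h>
        #include <string.h>

        /* write val to a sysfs attr, read it back and compare */
        static int set_and_verify(const char *path, const char *val)
        {
                char buf[64] = "";
                FILE *f = fopen(path, "w");

                if (!f)
                        return -1;
                fputs(val, f);
                fclose(f);

                f = fopen(path, "r");
                if (!f)
                        return -1;
                fgets(buf, sizeof(buf), f);
                fclose(f);

                /* with per-cpu streams this can mismatch even though the write succeeded */
                return strncmp(buf, val, strlen(val)) == 0;
        }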

-ss


[PATCH 2/2] extcon: gpio: add DT binding doc for extcon-gpio

2016-05-02 Thread Venkat Reddy Talla
Adding DT binding doc for the extcon gpio properties.

Signed-off-by: Venkat Reddy Talla 
---
 .../devicetree/bindings/extcon/extcon-gpio.txt| 19 +++
 1 file changed, 19 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/extcon/extcon-gpio.txt

diff --git a/Documentation/devicetree/bindings/extcon/extcon-gpio.txt 
b/Documentation/devicetree/bindings/extcon/extcon-gpio.txt
new file mode 100644
index 000..4a49c23
--- /dev/null
+++ b/Documentation/devicetree/bindings/extcon/extcon-gpio.txt
@@ -0,0 +1,19 @@
+EXTCON FOR GPIO
+
+Required Properties:
+ - compatible : Should be "extcon-gpio";
+
+Optional Properties:
+ - extcon-gpio,name: Name of extcon device.
+ - gpio: gpio number.
+ - extcon-gpio,irq-flags: IRQ flags for GPIO.
+ - extcon-gpio,debounce: Debounce time in ms.
+ - extcon-gpio,connection-state-low: boolean, Connection state with
+gpio state. True if gpio low means connected.
+
+extcon-gpio {
+   compatible = "extcon-gpio";
+extcon-gpio,name = "VBUS";
+   gpio = < 20 0>;
+   extcon-gpio,cable-names = ;
+};
-- 
2.1.4



Re: [PATCH v2 00/12] sched/fair: Optimize and clean up sched averages

2016-05-02 Thread Yuyang Du
Hi,

This patch series should have no perceivable changes to load
and util except that load's range is increased by 1024.

My initial tests suggest that. See attached figures. The workload
is running 100us out of every 200us, and 2000us out of every 8000us.
Again fixed workload, fixed CPU, and fixed frequency.
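
For reference, a workload with that shape can be generated by a simple
duty-cycle loop along the lines of the sketch below; this only illustrates
the described pattern and is not the actual test harness used here.

        #include <time.h>
        #include <unistd.h>

        static long long now_ns(void)
        {
                struct timespec ts;

                clock_gettime(CLOCK_MONOTONIC, &ts);
                return ts.tv_sec * 1000000000LL + ts.tv_nsec;
        }

        /* busy for run_us out of every period_us, e.g. 100/200 or 2000/8000 */
        static void duty_cycle(long run_us, long period_us)
        {
                for (;;) {
                        long long start = now_ns();

                        while (now_ns() - start < run_us * 1000LL)
                                ;       /* spin */
                        usleep(period_us - run_us);
                }
        }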
 
In addition, of course, I believe the code should be cleaner and
more efficient after the patches.

Thanks,
Yuyang

On Tue, May 03, 2016 at 05:54:26AM +0800, Yuyang Du wrote:
> Hi Peter,
> 
> This patch series combines the previous cleanup and optimization
> series. And as you and Ingo suggested, the increased kernel load
> scale is reinstated when on 64BIT and FAIR_GROUP_SCHED. In addition
> to that, the changes include Vincent's fix, typo fixes, and changelog
> and comment rewording.
> 
> Thanks,
> Yuyang
> 
> Yuyang Du (12):
>   sched/fair: Optimize sum computation with a lookup table
>   sched/fair: Rename variable names for sched averages
>   sched/fair: Change the variable to hold the number of periods to
> 32bit integer
>   sched/fair: Add __always_inline compiler attribute to
> __accumulate_sum()
>   sched/fair: Optimize __update_sched_avg()
>   documentation: Add scheduler/sched-avg.txt
>   sched/fair: Generalize the load/util averages resolution definition
>   sched/fair: Remove SCHED_LOAD_SHIFT and SCHED_LOAD_SCALE
>   sched/fair: Add introduction to the sched average metrics
>   sched/fair: Remove scale_load_down() for load_avg
>   sched/fair: Rename scale_load() and scale_load_down()
>   sched/fair: Enable increased scale for kernel load
> 
>  Documentation/scheduler/sched-avg.txt |  137 
>  include/linux/sched.h |   81 ++-
>  kernel/sched/core.c   |8 +-
>  kernel/sched/fair.c   |  398 
> +
>  kernel/sched/sched.h  |   48 ++--
>  5 files changed, 439 insertions(+), 233 deletions(-)
>  create mode 100644 Documentation/scheduler/sched-avg.txt
> 
> -- 
> 1.7.9.5


[PATCH 1/2] extcon: gpio: add device tree support for extcon-gpio

2016-05-02 Thread Venkat Reddy Talla
Adding device tree support for the extcon-gpio driver.

Signed-off-by: Venkat Reddy Talla 
---
 drivers/extcon/extcon-gpio.c   | 80 +++---
 include/linux/extcon/extcon-gpio.h |  2 +
 2 files changed, 76 insertions(+), 6 deletions(-)

diff --git a/drivers/extcon/extcon-gpio.c b/drivers/extcon/extcon-gpio.c
index d023789..b7fae7e 100644
--- a/drivers/extcon/extcon-gpio.c
+++ b/drivers/extcon/extcon-gpio.c
@@ -28,6 +28,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 struct gpio_extcon_data {
struct extcon_dev *edev;
@@ -90,21 +92,79 @@ static int gpio_extcon_init(struct device *dev, struct 
gpio_extcon_data *data)
return 0;
 }
 
+static struct gpio_extcon_pdata *gpio_extcon_of_pdata(
+   struct platform_device *pdev)
+{
+   struct gpio_extcon_pdata *pdata;
+   struct device_node *np = pdev->dev.of_node;
+   int gpio;
+   u32 pval;
+   int ret;
+
+   pdata = devm_kzalloc(>dev, sizeof(*pdata), GFP_KERNEL);
+   if (!pdata)
+   return ERR_PTR(-ENOMEM);
+
+   gpio = of_get_named_gpio(np, "gpio", 0);
+   if (gpio < 0)
+   return ERR_PTR(gpio);
+
+   pdata->gpio = gpio;
+
+   ret = of_property_read_u32(np, "extcon-gpio,irq-flags", );
+   if (!ret)
+   pdata->irq_flags = pval;
+   else
+   pdata->irq_flags = IRQF_TRIGGER_RISING |
+   IRQF_TRIGGER_FALLING;
+
+   ret = of_property_read_u32(np, "extcon-gpio,debounce", );
+   if (!ret)
+   pdata->debounce = pval;
+
+   pdata->gpio_active_low = of_property_read_bool(np,
+   "extcon-gpio,connection-state-low");
+
+   pdata->extcon_cable_cnt = of_property_count_u32_elems(np,
+   "extcon-gpio,cable-names");
+   if (pdata->extcon_cable_cnt <= 0) {
+   dev_err(>dev, "not found out cable names\n");
+   return ERR_PTR(-EINVAL);
+   }
+
+   pdata->extcon_cable_names = devm_kzalloc(>dev,
+   (pdata->extcon_cable_cnt) *
+   sizeof(*pdata->extcon_cable_names), GFP_KERNEL);
+   if (!pdata->extcon_cable_names)
+   return ERR_PTR(-ENOMEM);
+
+   ret = of_property_read_u32_array(np, "extcon-gpio,cable-names",
+   pdata->extcon_cable_names, pdata->extcon_cable_cnt);
+   if (ret)
+   return ERR_PTR(-EINVAL);
+
+   return pdata;
+}
+
 static int gpio_extcon_probe(struct platform_device *pdev)
 {
struct gpio_extcon_pdata *pdata = dev_get_platdata(>dev);
struct gpio_extcon_data *data;
int ret;
 
-   if (!pdata)
-   return -EBUSY;
-   if (!pdata->irq_flags || pdata->extcon_id > EXTCON_NONE)
-   return -EINVAL;
-
data = devm_kzalloc(>dev, sizeof(struct gpio_extcon_data),
   GFP_KERNEL);
if (!data)
return -ENOMEM;
+
+   if (!pdata && pdev->dev.of_node)
+   pdata = gpio_extcon_of_pdata(pdev);
+
+   if (IS_ERR(pdata))
+   return PTR_ERR(pdata);
+   if (!pdata->irq_flags || !pdata->extcon_cable_names)
+   return -EINVAL;
+
data->pdata = pdata;
 
/* Initialize the gpio */
@@ -113,7 +173,8 @@ static int gpio_extcon_probe(struct platform_device *pdev)
return ret;
 
/* Allocate the memory of extcon devie and register extcon device */
-   data->edev = devm_extcon_dev_allocate(>dev, >extcon_id);
+   data->edev = devm_extcon_dev_allocate(>dev,
+   pdata->extcon_cable_names);
if (IS_ERR(data->edev)) {
dev_err(>dev, "failed to allocate extcon device\n");
return -ENOMEM;
@@ -167,12 +228,19 @@ static int gpio_extcon_resume(struct device *dev)
 
 static SIMPLE_DEV_PM_OPS(gpio_extcon_pm_ops, NULL, gpio_extcon_resume);
 
+static const struct of_device_id of_extcon_gpio_tbl[] = {
+   { .compatible = "extcon-gpio", },
+   { /* end */ }
+};
+MODULE_DEVICE_TABLE(of, of_extcon_gpio_tbl);
+
 static struct platform_driver gpio_extcon_driver = {
.probe  = gpio_extcon_probe,
.remove = gpio_extcon_remove,
.driver = {
.name   = "extcon-gpio",
.pm = _extcon_pm_ops,
+   .of_match_table = of_extcon_gpio_tbl,
},
 };
 
diff --git a/include/linux/extcon/extcon-gpio.h 
b/include/linux/extcon/extcon-gpio.h
index 7cacafb..c27df9b 100644
--- a/include/linux/extcon/extcon-gpio.h
+++ b/include/linux/extcon/extcon-gpio.h
@@ -36,6 +36,8 @@
  */
 struct gpio_extcon_pdata {
unsigned int extcon_id;
+   unsigned int *extcon_cable_names;
+   int extcon_cable_cnt;
unsigned gpio;
bool gpio_active_low;
unsigned long debounce;
-- 
2.1.4



Re: [PATCH kernel v4 08/11] powerpc/powernv/ioda2: Export debug helper pe_level_printk()

2016-05-02 Thread Alistair Popple
There's one call to pr_warn() in pnv_npu_disable_bypass() that could arguably 
be converted to pe_warn(), but we can clean that up later as the patch looks 
fine and I'm assuming subsequent patches make use of these.
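
For illustration, such a conversion is a one-line change once the macros are
visible from npu-dma.c; the message and argument below are made up, not the
actual code in pnv_npu_disable_bypass():

        /* before: no PE context in the log line */
        pr_warn("Failed to disable bypass window\n");

        /* after: pe_warn() prefixes the message with the PE identifiers */
        pe_warn(npe, "Failed to disable bypass window\n");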

Reviewed-By: Alistair Popple 

On Fri, 29 Apr 2016 18:55:21 Alexey Kardashevskiy wrote:
> This exports debugging helper pe_level_printk() and corresponding macros
> so they can be used in npu-dma.c.
> 
> Signed-off-by: Alexey Kardashevskiy 
> ---
>  arch/powerpc/platforms/powernv/pci-ioda.c | 9 +
>  arch/powerpc/platforms/powernv/pci.h  | 9 +
>  2 files changed, 10 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
> index 272521e..db7695f 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda.c
> @@ -56,7 +56,7 @@
>  
>  static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
>  
> -static void pe_level_printk(const struct pnv_ioda_pe *pe, const char 
*level,
> +void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
>   const char *fmt, ...)
>  {
>   struct va_format vaf;
> @@ -87,13 +87,6 @@ static void pe_level_printk(const struct pnv_ioda_pe *pe, 
const char *level,
>   va_end(args);
>  }
>  
> -#define pe_err(pe, fmt, ...) \
> - pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__)
> -#define pe_warn(pe, fmt, ...)\
> - pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__)
> -#define pe_info(pe, fmt, ...)\
> - pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)
> -
>  static bool pnv_iommu_bypass_disabled __read_mostly;
>  
>  static int __init iommu_setup(char *str)
> diff --git a/arch/powerpc/platforms/powernv/pci.h 
b/arch/powerpc/platforms/powernv/pci.h
> index d574a9d..485e5b1 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -236,6 +236,15 @@ extern void pnv_pci_dma_bus_setup(struct pci_bus *bus);
>  extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
>  extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
>  
> +extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char 
*level,
> + const char *fmt, ...);
> +#define pe_err(pe, fmt, ...) \
> + pe_level_printk(pe, KERN_ERR, fmt, ##__VA_ARGS__)
> +#define pe_warn(pe, fmt, ...)\
> + pe_level_printk(pe, KERN_WARNING, fmt, ##__VA_ARGS__)
> +#define pe_info(pe, fmt, ...)\
> + pe_level_printk(pe, KERN_INFO, fmt, ##__VA_ARGS__)
> +
>  /* Nvlink functions */
>  extern void pnv_npu_init_dma_pe(struct pnv_ioda_pe *npe);
>  extern void pnv_npu_setup_dma_pe(struct pnv_ioda_pe *npe);
> 



Re: [PATCH 2/2] zram: user per-cpu compression streams

2016-05-02 Thread Sergey Senozhatsky
On (05/03/16 14:23), Minchan Kim wrote:
[..]
> > -   zram->max_comp_streams = num;
> > -   ret = len;
> > -out:
> > -   up_write(>init_lock);
> > -   return ret;
> 
> At least, we need sanity check code, still?
> Otherwise, user can echo "garbage" > /sys/xxx/max_comp_stream and then
> cat /sys/xxx/max_comp_stream returns num_online_cpus.

hm, I couldn't find any reason to keep the check. we completely
ignore the value anyway, cat /sys/xxx/max_comp_stream will always
return num_online_cpus(), regardless of the correctness of the supplied
data; `garbage', `2', `1024', `32' make no difference.
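
to make that concrete, the end state being discussed boils down to handlers of
roughly this shape (a sketch of the idea only, not the posted patch; whether
store() keeps an input check is exactly the open question above):

        static ssize_t max_comp_streams_show(struct device *dev,
                        struct device_attribute *attr, char *buf)
        {
                /* report the real number of streams: one per online CPU */
                return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus());
        }

        static ssize_t max_comp_streams_store(struct device *dev,
                        struct device_attribute *attr, const char *buf, size_t len)
        {
                /* the written value is accepted but otherwise ignored */
                return len;
        }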

-ss


Re: [PATCH 3/3] platform/chrome: pstore: Move to larger record size.

2016-05-02 Thread Benson Leung
On Mon, Feb 15, 2016 at 3:58 PM, Enric Balletbo i Serra
 wrote:
> From: Olof Johansson 
>
> Accidentally specified a smaller record size, bring it back
> to the same size as we had when we used the config file.
>
> Signed-off-by: Olof Johansson 
> Signed-off-by: Enric Balletbo i Serra 
> Reviewed-by: Sameer Nanda 

Reviewed-by: Benson Leung 

-- 
Benson Leung
Senior Software Engineer, Chrom* OS
ble...@chromium.org


Re: [PATCH 2/2] zram: user per-cpu compression streams

2016-05-02 Thread Minchan Kim
On Tue, May 03, 2016 at 02:23:24PM +0900, Minchan Kim wrote:
> On Mon, May 02, 2016 at 05:06:00PM +0900, Sergey Senozhatsky wrote:
> > On (05/02/16 16:25), Sergey Senozhatsky wrote:
> > [..]
> > > > Trivial:
> > > > We could remove max_strm now and change description.
> > > 
> > > oh, yes.
> > 
> > how about something like this? remove max_comp_streams entirely, but
> > leave the attr. if we keep zram->max_comp_streams and return its value
> > (set by user space) from the show() handler, we are technically lying;
> > because the actual number of streams is now num_online_cpus().
> 
> Yes, we should have limited the value to num_online_cpus from the
> beginning.
> 
> > 
> > 
> > ===8<===8<===
> > 
> > From: Sergey Senozhatsky 
> > Subject: [PATCH] zram: remove max_comp_streams internals
> > 
> > Remove the internal part of max_comp_streams interface, since we
> > switched to per-cpu streams. We will keep RW max_comp_streams attr
> > around, because:
> > 
> > a) we may (silently) switch back to idle compression streams list
> >and don't want to disturb user space
> > b) max_comp_streams attr must wait for the next 'lay off cycle';
> >we give user space 2 years to adjust before we remove/downgrade
> >the attr, and there are already several attrs scheduled for
> >removal in 4.11, so it's too late for max_comp_streams.
> > 
> > Signed-off-by: Sergey Senozhatsky 
> > ---
> >  drivers/block/zram/zcomp.c|  7 +--
> >  drivers/block/zram/zcomp.h|  2 +-
> >  drivers/block/zram/zram_drv.c | 47 
> > +++
> >  drivers/block/zram/zram_drv.h |  1 -
> >  4 files changed, 14 insertions(+), 43 deletions(-)
> > 
> > diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
> > index d4159e4..d4de9cb 100644
> > --- a/drivers/block/zram/zcomp.c
> > +++ b/drivers/block/zram/zcomp.c
> > @@ -95,11 +95,6 @@ bool zcomp_available_algorithm(const char *comp)
> > return find_backend(comp) != NULL;
> >  }
> >  
> > -bool zcomp_set_max_streams(struct zcomp *comp, int num_strm)
> > -{
> > -   return true;
> > -}
> > -
> >  struct zcomp_strm *zcomp_strm_find(struct zcomp *comp)
> >  {
> > return *get_cpu_ptr(comp->stream);
> > @@ -211,7 +206,7 @@ void zcomp_destroy(struct zcomp *comp)
> >   * case of allocation error, or any other error potentially
> >   * returned by functions zcomp_strm_{multi,single}_create.
> >   */
> > -struct zcomp *zcomp_create(const char *compress, int max_strm)
> > +struct zcomp *zcomp_create(const char *compress)
> >  {
> > struct zcomp *comp;
> > struct zcomp_backend *backend;
> > diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
> > index aba8c21..ffd88cb 100644
> > --- a/drivers/block/zram/zcomp.h
> > +++ b/drivers/block/zram/zcomp.h
> > @@ -45,7 +45,7 @@ struct zcomp {
> >  ssize_t zcomp_available_show(const char *comp, char *buf);
> >  bool zcomp_available_algorithm(const char *comp);
> >  
> > -struct zcomp *zcomp_create(const char *comp, int max_strm);
> > +struct zcomp *zcomp_create(const char *comp);
> >  void zcomp_destroy(struct zcomp *comp);
> >  
> >  struct zcomp_strm *zcomp_strm_find(struct zcomp *comp);
> > diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> > index cad1751..817e511 100644
> > --- a/drivers/block/zram/zram_drv.c
> > +++ b/drivers/block/zram/zram_drv.c
> > @@ -304,46 +304,25 @@ static ssize_t mem_used_max_store(struct device *dev,
> > return len;
> >  }
> >  
> > +/*
> > + * We switched to per-cpu streams and this attr is not needed anymore.
> > + * However, we will keep it around for some time, because:
> > + * a) we may revert per-cpu streams in the future
> > + * b) it's visible to user space and we need to follow our 2 years
> > + *retirement rule; but we already have a number of 'soon to be
> > + *altered' attrs, so max_comp_streams need to wait for the next
> > + *layoff cycle.
> > + */
> 
> Thanks for nice comment.
> 
> >  static ssize_t max_comp_streams_show(struct device *dev,
> > struct device_attribute *attr, char *buf)
> >  {
> > -   int val;
> > -   struct zram *zram = dev_to_zram(dev);
> > -
> > -   down_read(>init_lock);
> > -   val = zram->max_comp_streams;
> > -   up_read(>init_lock);
> > -
> > -   return scnprintf(buf, PAGE_SIZE, "%d\n", val);
> > +   return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus());
> >  }
> >  
> >  static ssize_t max_comp_streams_store(struct device *dev,
> > struct device_attribute *attr, const char *buf, size_t len)
> >  {
> > -   int num;
> > -   struct zram *zram = dev_to_zram(dev);
> > -   int ret;
> > -
> > -   ret = kstrtoint(buf, 0, );
> > -   if (ret < 0)
> > -   return ret;
> > -   if (num < 1)
> > -   return -EINVAL;
> > -
> > -   down_write(>init_lock);
> > -   if (init_done(zram)) {
> > -   if (!zcomp_set_max_streams(zram->comp, num)) {
> > -   

[PATCH v2 05/12] sched/fair: Optimize __update_sched_avg()

2016-05-02 Thread Yuyang Du
__update_sched_avg() has these steps:
1. add the left of the last incomplete period
2. decay old sum
3. accumulate new sum since last_update_time
4. add the current incomplete period
5. update averages

Previously, we computed steps 1, 3, and 4 separately, which made each of
them ugly in code and costly in overhead. But they actually all do the
same thing, so we combine them. The result is much cleaner code and fewer
CPU cycles.
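
Putting the three parts together: with p = period_contrib carried over from
the last update, n = the number of period boundaries crossed and r = the
remainder in the current incomplete period, the combined contribution computed
below works out to (notation as in the existing comments, y^32 = 1/2):

        contrib = (1024 - p) * y^n              /* step 1: rest of the last period, decayed */
                + \Sum 1024*y^k, 1 <= k <= n-1  /* step 3: the full periods in between */
                + r                             /* step 4: the current incomplete period */

For n == 0 this degenerates to just the newly elapsed time.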

Signed-off-by: Yuyang Du 
---
 kernel/sched/fair.c |  185 +--
 1 file changed, 92 insertions(+), 93 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1655280..ea99c2c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -668,7 +668,7 @@ static unsigned long task_h_load(struct task_struct *p);
  */
 #define SCHED_AVG_HALFLIFE 32  /* number of periods as a half-life */
 #define SCHED_AVG_MAX 47742/* maximum possible sched avg */
-#define SCHED_AVG_MAX_N 345/* number of full periods to produce 
SCHED_AVG_MAX */
+#define SCHED_AVG_MAX_N 347/* number of full periods to produce 
SCHED_AVG_MAX */
 
 /* Give new sched_entity start runnable values to heavy its load in infant 
time */
 void init_entity_runnable_average(struct sched_entity *se)
@@ -2606,7 +2606,7 @@ static const u32 __accumulated_sum_N[] = {
 
 /*
  * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
- * lower integers.
+ * lower integers. Since n < SCHED_AVG_MAX_N, n/SCHED_AVG_HALFLIFE < 11
  */
 static const u32 __accumulated_sum_N32[] = {
0, 23371, 35056, 40899, 43820, 45281,
@@ -2649,20 +2649,31 @@ static __always_inline u64 __decay_sum(u64 val, u32 n)
  * We can compute this efficiently by combining:
  * y^32 = 1/2 with precomputed \Sum 1024*y^n   (where n < 32)
  */
-static __always_inline u32 __accumulate_sum(u32 n)
+static __always_inline u32
+__accumulate_sum(u32 periods, u32 period_contrib, u32 remainder)
 {
-   u32 contrib = 0;
+   u32 contrib;
 
-   if (likely(n <= SCHED_AVG_HALFLIFE))
-   return __accumulated_sum_N[n];
-   else if (unlikely(n >= SCHED_AVG_MAX_N))
+   if (!periods)
+   return remainder - period_contrib;
+
+   if (unlikely(periods >= SCHED_AVG_MAX_N))
return SCHED_AVG_MAX;
 
-   /* Since n < SCHED_AVG_MAX_N, n/SCHED_AVG_HALFLIFE < 11 */
-   contrib = __accumulated_sum_N32[n/SCHED_AVG_HALFLIFE];
-   n %= SCHED_AVG_HALFLIFE;
-   contrib = __decay_sum(contrib, n);
-   return contrib + __accumulated_sum_N[n];
+   remainder += __decay_sum((u64)(1024 - period_contrib), periods);
+
+   periods -= 1;
+   if (likely(periods <= SCHED_AVG_HALFLIFE))
+   contrib = __accumulated_sum_N[periods];
+   else {
+   /*(periods>>5) = (periods/SCHED_AVG_HALFLIFE) */
+   contrib = __accumulated_sum_N32[periods/SCHED_AVG_HALFLIFE];
+   periods %= SCHED_AVG_HALFLIFE;
+   contrib = __decay_sum(contrib, periods);
+   contrib += __accumulated_sum_N[periods];
+   }
+
+   return contrib + remainder;
 }
 
 #if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT 
!= 10
@@ -2671,6 +2682,55 @@ static __always_inline u32 __accumulate_sum(u32 n)
 
 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
 
+static __always_inline u32 accumulate_sum(u64 delta, struct sched_avg *sa,
+   struct cfs_rq *cfs_rq, int cpu, unsigned long weight, int running)
+{
+   u32 contrib, periods;
+   unsigned long scale_freq, scale_cpu;
+
+   scale_freq = arch_scale_freq_capacity(NULL, cpu);
+   scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+   delta += sa->period_contrib;
+   periods = delta >> 10; /* A period is 1024us (~1ms) */
+
+   /*
+* Accumulating *_sum has two steps.
+*
+* Step 1: decay old *_sum if we crossed period boundaries.
+*/
+   if (periods) {
+   sa->load_sum = __decay_sum(sa->load_sum, periods);
+   if (cfs_rq) {
+   cfs_rq->runnable_load_sum =
+   __decay_sum(cfs_rq->runnable_load_sum, periods);
+   }
+   sa->util_sum = __decay_sum((u64)(sa->util_sum), periods);
+   }
+
+   /*
+* Step 2: accumulate new *_sum since last_update_time. This at most has
+* three parts (at least one part): (1) remainder of incomplete last
+* period, (2) full periods since (1), and (3) incomplete current 
period.
+*
+* Fortunately, we can (and should) do all these three at once.
+*/
+   delta %= 1024;
+   contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+   sa->period_contrib = delta;
+
+   contrib = cap_scale(contrib, scale_freq);
+   if (weight) {
+   sa->load_sum += weight * contrib;
+   if (cfs_rq)
+ 

Re: [PATCH 1/3] platform/chrome: chromeos_laptop: Add Leon Touch

2016-05-02 Thread Benson Leung
Hi Enric,

On Mon, Feb 15, 2016 at 3:58 PM, Enric Balletbo i Serra
 wrote:
> From: Gene Chen 
>
> Add support for Leon touch devices, which is the same as
> slippy/falco/peppy/wolf on the same buses using the LynxPoint-LP I2C via
> the i2c-designware-pci driver.
>
> Based on the following patch:
> https://chromium-review.googlesource.com/#/c/168351/
>
> Signed-off-by: Gene Chen 
> Reviewed-by: Benson Leung 
> Signed-off-by: Enric Balletbo i Serra 

I'll have to nack this patch. This actually already landed here :
963cb6f platform/chrome: chromeos_laptop - Add Toshiba CB35 Touch

As we've been upstreaming these patches, I've been changing the names
in this file to reflect the public model names instead of the
codenames we used for development. Sorry for the confusion.




-- 
Benson Leung
Senior Software Engineer, Chrom* OS
ble...@chromium.org


[PATCH v2 08/12] sched/fair: Remove SCHED_LOAD_SHIFT and SCHED_LOAD_SCALE

2016-05-02 Thread Yuyang Du
After cleaning up the sched metrics, these two definitions, which cause
ambiguity, are no longer needed. Use NICE_0_LOAD_SHIFT and NICE_0_LOAD
instead (the names clearly indicate what they are).

Suggested-by: Ben Segall 
Signed-off-by: Yuyang Du 
---
 kernel/sched/fair.c  |4 ++--
 kernel/sched/sched.h |   22 +++---
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 69bfb07..fa79820 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -721,7 +721,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
 {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct sched_avg *sa = >avg;
-   long cap = (long)(scale_load_down(SCHED_LOAD_SCALE) - 
cfs_rq->avg.util_avg) / 2;
+   long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
 
if (cap > 0) {
if (cfs_rq->avg.util_avg != 0) {
@@ -7017,7 +7017,7 @@ static inline void calculate_imbalance(struct lb_env 
*env, struct sd_lb_stats *s
if (busiest->group_type == group_overloaded &&
local->group_type   == group_overloaded) {
load_above_capacity = busiest->sum_nr_running *
-   SCHED_LOAD_SCALE;
+ scale_load_down(NICE_0_LOAD);
if (load_above_capacity > busiest->group_capacity)
load_above_capacity -= busiest->group_capacity;
else
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 996a137..1a3be6f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -54,25 +54,25 @@ static inline void cpu_load_update_active(struct rq 
*this_rq) { }
  * increased costs.
  */
 #if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage 
under light load  */
-# define SCHED_LOAD_SHIFT  (SCHED_FIXEDPOINT_SHIFT + 
SCHED_FIXEDPOINT_SHIFT)
+# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + 
SCHED_FIXEDPOINT_SHIFT)
 # define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT)
 # define scale_load_down(w)((w) >> SCHED_FIXEDPOINT_SHIFT)
 #else
-# define SCHED_LOAD_SHIFT  (SCHED_FIXEDPOINT_SHIFT)
+# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT)
 # define scale_load(w) (w)
 # define scale_load_down(w)(w)
 #endif
 
-#define SCHED_LOAD_SCALE   (1L << SCHED_LOAD_SHIFT)
-
 /*
- * NICE_0's weight (visible to user) and its load (invisible to user) have
- * independent ranges, but they should be well calibrated. We use scale_load()
- * and scale_load_down(w) to convert between them, the following must be true:
- * scale_load(sched_prio_to_weight[20]) == NICE_0_LOAD
+ * Task weight (visible to user) and its load (invisible to user) have
+ * independent resolution, but they should be well calibrated. We use
+ * scale_load() and scale_load_down(w) to convert between them. The
+ * following must be true:
+ *
+ *  scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD
+ *
  */
-#define NICE_0_LOADSCHED_LOAD_SCALE
-#define NICE_0_SHIFT   SCHED_LOAD_SHIFT
+#define NICE_0_LOAD(1L << NICE_0_LOAD_SHIFT)
 
 /*
  * Single value that decides SCHED_DEADLINE internal math precision.
@@ -861,7 +861,7 @@ DECLARE_PER_CPU(struct sched_domain *, sd_asym);
 struct sched_group_capacity {
atomic_t ref;
/*
-* CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
+* CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
 * for a single CPU.
 */
unsigned int capacity;
-- 
1.7.9.5



Re: [PATCH 1/3] platform/chrome: chromeos_laptop: Add Leon Touch

2016-05-02 Thread Benson Leung
Hi Enric,

On Mon, Feb 15, 2016 at 3:58 PM, Enric Balletbo i Serra
 wrote:
> From: Gene Chen 
>
> Add support for Leon touch devices, which is the same as
> slippy/falco/peppy/wolf on the same buses using the LynxPoint-LP I2C via
> the i2c-designware-pci driver.
>
> Based on the following patch:
> https://chromium-review.googlesource.com/#/c/168351/
>
> Signed-off-by: Gene Chen 
> Reviewed-by: Benson Leung 
> Signed-off-by: Enric Balletbo i Serra 

I'll have to nack this patch. This actually already landed here:
963cb6f platform/chrome: chromeos_laptop - Add Toshiba CB35 Touch

As we've been upstreaming these patches, I've been changing the names
in this file to reflect the public model names instead of the
codenames we used for development. Sorry for the confusion.




-- 
Benson Leung
Senior Software Engineer, Chrom* OS
ble...@chromium.org


[PATCH v2 12/12] sched/fair: Enable increased scale for kernel load

2016-05-02 Thread Yuyang Du
The increased scale, or precision, for kernel load has been disabled
since commit e4c2fb0d5776 ("sched: Disable (revert) SCHED_LOAD_SCALE
increase"). But we do need it when we have task groups, especially on
bigger machines; otherwise, we will likely run out of precision for
load distribution.

So we reinstate it, and resolve to fix whatever power regression may
be seen.
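
As a rough, self-contained illustration of what the increased scale means
(a userspace sketch, not part of the patch; it assumes SCHED_FIXEDPOINT_SHIFT
is 10, as elsewhere in this series, and mirrors the high-resolution branch):

#include <stdio.h>

#define SCHED_FIXEDPOINT_SHIFT	10
/* the CONFIG_64BIT && CONFIG_FAIR_GROUP_SCHED branch of the patch */
#define NICE_0_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
#define user_to_kernel_load(w)	((w) << SCHED_FIXEDPOINT_SHIFT)
#define kernel_to_user_load(w)	((w) >> SCHED_FIXEDPOINT_SHIFT)
#define NICE_0_LOAD		(1UL << NICE_0_LOAD_SHIFT)

int main(void)
{
	/* the user-visible nice-0 weight stays 1024, the kernel load becomes 1M */
	unsigned long user = 1024;
	unsigned long kernel = user_to_kernel_load(user);

	printf("NICE_0_LOAD  = %lu\n", NICE_0_LOAD);			/* 1048576 */
	printf("kernel load  = %lu\n", kernel);				/* 1048576 */
	printf("back to user = %lu\n", kernel_to_user_load(kernel));	/* 1024 */
	return 0;
}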

Suggested-by: Ingo Molnar 
Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Yuyang Du 
---
 kernel/sched/sched.h |   51 +-
 1 file changed, 25 insertions(+), 26 deletions(-)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 871da67..5f66a2c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -42,37 +42,36 @@ static inline void cpu_load_update_active(struct rq 
*this_rq) { }
 #define NS_TO_JIFFIES(TIME)((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
 
 /*
- * Increase resolution of nice-level calculations for 64-bit architectures.
- * The extra resolution improves shares distribution and load balancing of
- * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
- * hierarchies, especially on larger systems. This is not a user-visible change
- * and does not change the user-interface for setting shares/weights.
+ * Task weight (visible and set by user) and its load (invisible to user)
+ * can have independent ranges. We increase the scale of load for 64-bit
+ * architectures. The extra precision improves share distribution and
+ * load balancing of low-weight task groups (e.g., nice +19 on an autogroup),
+ * deeper taskgroup hierarchies, especially on larger systems. This is not
+ * a user-visible change and does not change the user-interface for setting
+ * shares/weights. We increase resolution only if we have enough bits to allow
+ * this increased precision (i.e., BITS_PER_LONG > 32). The costs for 
increasing
+ * resolution when BITS_PER_LONG <= 32 are pretty high and the returns do not
+ * justify the increased costs.
  *
- * We increase resolution only if we have enough bits to allow this increased
- * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
- * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
- * increased costs.
- */
-#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage 
under light load  */
-# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + 
SCHED_FIXEDPOINT_SHIFT)
-# define user_to_kernel_load(w)((w) << SCHED_FIXEDPOINT_SHIFT)
-# define kernel_to_user_load(w)((w) >> SCHED_FIXEDPOINT_SHIFT)
-#else
-# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT)
-# define user_to_kernel_load(w)(w)
-# define kernel_to_user_load(w)(w)
-#endif
-
-/*
- * Task weight (visible to user) and its load (invisible to user) have
- * independent resolution, but they should be well calibrated. We use
- * user_to_kernel_load() and kernel_to_user_load(w) to convert between
- * them. The following must be true:
+ * Therefore, the user load and kernel should be well expressed to make them
+ * easily exchanged. We use user_to_kernel_load() and kernel_to_user_load(w)
+ * to convert between them.
  *
+ * Following equations are a simple illustration of their relationship:
  * user_to_kernel_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == 
NICE_0_LOAD
  * kernel_to_user_load(NICE_0_LOAD) == 
sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]
  */
-#define NICE_0_LOAD(1L << NICE_0_LOAD_SHIFT)
+#if defined(CONFIG_64BIT) && defined(CONFIG_FAIR_GROUP_SCHED)
+#define NICE_0_LOAD_SHIFT  (SCHED_FIXEDPOINT_SHIFT + 
SCHED_FIXEDPOINT_SHIFT)
+#define user_to_kernel_load(w) (w << SCHED_FIXEDPOINT_SHIFT)
+#define kernel_to_user_load(w) (w >> SCHED_FIXEDPOINT_SHIFT)
+#else
+#define NICE_0_LOAD_SHIFT  (SCHED_FIXEDPOINT_SHIFT)
+#define user_to_kernel_load(w) (w)
+#define kernel_to_user_load(w) (w)
+#endif
+
+#define NICE_0_LOAD(1UL << NICE_0_LOAD_SHIFT)
 
 /*
  * Single value that decides SCHED_DEADLINE internal math precision.
-- 
1.7.9.5



[PATCH v2 04/12] sched/fair: Add __always_inline compiler attribute to __accumulate_sum()

2016-05-02 Thread Yuyang Du
The sibling helpers (e.g., __decay_sum()) already have it. If code size
is not a concern, __accumulate_sum() should have it too.

Signed-off-by: Yuyang Du 
---
 kernel/sched/fair.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 17bc721..1655280 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2649,7 +2649,7 @@ static __always_inline u64 __decay_sum(u64 val, u32 n)
  * We can compute this efficiently by combining:
  * y^32 = 1/2 with precomputed \Sum 1024*y^n   (where n < 32)
  */
-static u32 __accumulate_sum(u32 n)
+static __always_inline u32 __accumulate_sum(u32 n)
 {
u32 contrib = 0;
 
-- 
1.7.9.5



[PATCH v2 09/12] sched/fair: Add introduction to the sched average metrics

2016-05-02 Thread Yuyang Du
These sched metrics have become complex enough that they deserve an
introduction, so document them at their definitions.
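
For a concrete feel of the formulas documented below, here is a small
userspace sketch (not part of the patch) that plugs assumed ratios into
them; the 50%/25% figures and the 1024 capacity scale are illustrative
assumptions only:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024	/* assumed, as elsewhere in the series */

int main(void)
{
	double runnable = 0.50;		/* task runnable 50% of the time (assumed) */
	double running  = 0.25;		/* task running 25% of the time (assumed) */
	unsigned long load = 1024;	/* nice-0 weight */

	/* load_avg = runnable% * load */
	printf("load_avg ~= %.0f\n", runnable * load);			/* ~512 */
	/* util_avg = running% * SCHED_CAPACITY_SCALE */
	printf("util_avg ~= %.0f\n", running * SCHED_CAPACITY_SCALE);	/* ~256 */
	return 0;
}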

Signed-off-by: Yuyang Du 
---
 include/linux/sched.h |   60 -
 1 file changed, 49 insertions(+), 11 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 33e7929..a7cddd6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1211,18 +1211,56 @@ struct load_weight {
 };
 
 /*
- * The load_avg/util_avg accumulates an infinite geometric series.
- * 1) load_avg factors frequency scaling into the amount of time that a
- * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
- * aggregated such weights of all runnable and blocked sched_entities.
- * 2) util_avg factors frequency and cpu capacity scaling into the amount of 
time
- * that a sched_entity is running on a CPU, in the range 
[0..SCHED_CAPACITY_SCALE].
- * For cfs_rq, it is the aggregated such times of all runnable and
+ * The load_avg/util_avg accumulates an infinite geometric series
+ * (see __update_sched_avg() in kernel/sched/fair.c).
+ *
+ * [load_avg definition]
+ *
+ * load_avg = runnable% * scale_load_down(load)
+ *
+ * where runnable% is the time ratio that a sched_entity is runnable.
+ * For cfs_rq, it is the aggregated such load_avg of all runnable and
  * blocked sched_entities.
- * The 64 bit load_sum can:
- * 1) for cfs_rq, afford 4353082796 (=2^64/47742/88761) entities with
- * the highest weight (=88761) always runnable, we should not overflow
- * 2) for entity, support any load.weight always runnable
+ *
+ * load_avg may also take frequency scaling into account:
+ *
+ * load_avg = runnable% * scale_load_down(load) * freq%
+ *
+ * where freq% is the CPU frequency normalize to the highest frequency
+ *
+ * [util_avg definition]
+ *
+ * util_avg = running% * SCHED_CAPACITY_SCALE
+ *
+ * where running% is the time ratio that a sched_entity is running on
+ * a CPU. For cfs_rq, it is the aggregated such util_avg of all runnable
+ * and blocked sched_entities.
+ *
+ * util_avg may also factor frequency scaling and CPU capacity scaling:
+ *
+ * util_avg = running% * SCHED_CAPACITY_SCALE * freq% * capacity%
+ *
+ * where freq% is the same as above, and capacity% is the CPU capacity
+ * normalized to the greatest capacity (due to uarch differences, etc).
+ *
+ * N.B., the above ratios (runnable%, running%, freq%, and capacity%)
+ * themselves are in the range of [0, 1]. To do fixed point arithmetic,
+ * we therefore scale them to as large range as necessary. This is for
+ * example reflected by util_avg's SCHED_CAPACITY_SCALE.
+ *
+ * [Overflow issue]
+ *
+ * The 64bit load_sum can have 4353082796 (=2^64/47742/88761) entities
+ * with the highest load (=88761) always runnable on a single cfs_rq, we
+ * should not overflow as the number already hits PID_MAX_LIMIT.
+ *
+ * For all other cases (including 32bit kernel), struct load_weight's
+ * weight will overflow first before we do, because:
+ *
+ *Max(load_avg) <= Max(load.weight)
+ *
+ * Then, it is the load_weight's responsibility to consider overflow
+ * issues.
  */
 struct sched_avg {
u64 last_update_time, load_sum;
-- 
1.7.9.5



[PATCH v2 10/12] sched/fair: Remove scale_load_down() for load_avg

2016-05-02 Thread Yuyang Du
Currently, load_avg = scale_load_down(load) * runnable%. The extra scaling
down of load does not make much sense, because load_avg is primarily THE
load, and on top of that we take runnable time into account.

We therefore remove scale_load_down() for load_avg. But we need to
carefully consider the overflow risk if load has the higher fixed point
range (2*SCHED_FIXEDPOINT_SHIFT). The only case where an overflow may
occur because of us is a 64bit kernel with the increased fixed point
range. In that case, the 64bit load_sum can accommodate 4251057
(=2^64/47742/88761/1024) entities with the highest load (=88761*1024)
always runnable on one single cfs_rq, which should be fine. Even if an
overflow does occur, the load average would not have been useful in that
situation anyway, and afterwards, if the machine survives, the load will
correct itself very quickly, in no more than ~2 seconds (=32ms*64).
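
The overflow bounds quoted above can be checked with a few lines of
userspace arithmetic (a sketch, not part of the patch; 47742 is the
converged SCHED_AVG_MAX and 88761 the highest nice -20 weight used in
this series):

#include <stdio.h>

int main(void)
{
	unsigned long long max = ~0ULL;			/* ~2^64 */

	/* small fixed point range: highest load is 88761 */
	printf("%llu\n", max / 47742 / 88761);		/* ~4353082796 */
	/* large fixed point range: highest load is 88761*1024 */
	printf("%llu\n", max / 47742 / 88761 / 1024);	/* ~4251057 */
	return 0;
}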

[update calculate_imbalance]
Signed-off-by: Vincent Guittot 
Signed-off-by: Yuyang Du 
---
 include/linux/sched.h |   19 ++-
 kernel/sched/fair.c   |   19 +--
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a7cddd6..b718cb0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1216,7 +1216,7 @@ struct load_weight {
  *
  * [load_avg definition]
  *
- * load_avg = runnable% * scale_load_down(load)
+ * load_avg = runnable% * load
  *
  * where runnable% is the time ratio that a sched_entity is runnable.
  * For cfs_rq, it is the aggregated such load_avg of all runnable and
@@ -1224,7 +1224,7 @@ struct load_weight {
  *
  * load_avg may also take frequency scaling into account:
  *
- * load_avg = runnable% * scale_load_down(load) * freq%
+ * load_avg = runnable% * load * freq%
  *
  * where freq% is the CPU frequency normalize to the highest frequency
  *
@@ -1250,9 +1250,18 @@ struct load_weight {
  *
  * [Overflow issue]
  *
- * The 64bit load_sum can have 4353082796 (=2^64/47742/88761) entities
- * with the highest load (=88761) always runnable on a single cfs_rq, we
- * should not overflow as the number already hits PID_MAX_LIMIT.
+ * On 64bit kernel:
+ *
+ * When load has small fixed point range (SCHED_FIXEDPOINT_SHIFT), the
+ * 64bit load_sum can have 4353082796 (=2^64/47742/88761) tasks with
+ * the highest load (=88761) always runnable on a cfs_rq, we should
+ * not overflow as the number already hits PID_MAX_LIMIT.
+ *
+ * When load has large fixed point range (2*SCHED_FIXEDPOINT_SHIFT),
+ * the 64bit load_sum can have 4251057 (=2^64/47742/88761/1024) tasks
+ * with the highest load (=88761*1024) always runnable on ONE cfs_rq,
+ * we should be fine. Even if the overflow occurs at the end of day,
+ * at the time the load_avg won't be useful anyway in that situation.
  *
  * For all other cases (including 32bit kernel), struct load_weight's
  * weight will overflow first before we do, because:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fa79820..504803e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -682,7 +682,7 @@ void init_entity_runnable_average(struct sched_entity *se)
 * will definitely be update (after enqueue).
 */
sa->period_contrib = 1023;
-   sa->load_avg = scale_load_down(se->load.weight);
+   sa->load_avg = se->load.weight;
sa->load_sum = sa->load_avg * SCHED_AVG_MAX;
/*
 * At this point, util_avg won't be used in select_task_rq_fair anyway
@@ -2927,7 +2927,7 @@ update_cfs_rq_sched_avg(u64 now, struct cfs_rq *cfs_rq, 
bool update_freq)
}
 
decayed = __update_sched_avg(now, cpu_of(rq_of(cfs_rq)), sa,
-   scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, 
cfs_rq);
+cfs_rq->load.weight, cfs_rq->curr != NULL, 
cfs_rq);
 
 #ifndef CONFIG_64BIT
smp_wmb();
@@ -2952,8 +2952,7 @@ static inline void update_sched_avg(struct sched_entity 
*se, int update_tg)
 * Track task load average for carrying it to new CPU after migrated, 
and
 * track group sched_entity load average for task_h_load calc in 
migration
 */
-   __update_sched_avg(now, cpu, &se->avg,
-  se->on_rq * scale_load_down(se->load.weight),
+   __update_sched_avg(now, cpu, &se->avg, se->on_rq * se->load.weight,
   cfs_rq->curr == se, NULL);
 
if (update_cfs_rq_sched_avg(now, cfs_rq, true) && update_tg)
@@ -2992,7 +2991,7 @@ skip_aging:
 static void detach_entity_sched_avg(struct cfs_rq *cfs_rq, struct sched_entity 
*se)
 {
__update_sched_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
-  &se->avg, se->on_rq * 
scale_load_down(se->load.weight),
+  &se->avg, se->on_rq * se->load.weight,
   cfs_rq->curr == se, NULL);
 
cfs_rq->avg.load_avg = 

[PATCH v2 11/12] sched/fair: Rename scale_load() and scale_load_down()

2016-05-02 Thread Yuyang Du
Rename scale_load() and scale_load_down() to user_to_kernel_load()
and kernel_to_user_load() respectively. This helps us tag them
clearly and avoid confusion.

[update calculate_imbalance]
Signed-off-by: Vincent Guittot 
Signed-off-by: Yuyang Du 
---
 kernel/sched/core.c  |8 
 kernel/sched/fair.c  |   18 --
 kernel/sched/sched.h |   16 
 3 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c82ca6e..349d776 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -698,12 +698,12 @@ static void set_load_weight(struct task_struct *p)
 * SCHED_IDLE tasks get minimal weight:
 */
if (idle_policy(p->policy)) {
-   load->weight = scale_load(WEIGHT_IDLEPRIO);
+   load->weight = user_to_kernel_load(WEIGHT_IDLEPRIO);
load->inv_weight = WMULT_IDLEPRIO;
return;
}
 
-   load->weight = scale_load(sched_prio_to_weight[prio]);
+   load->weight = user_to_kernel_load(sched_prio_to_weight[prio]);
load->inv_weight = sched_prio_to_wmult[prio];
 }
 
@@ -8184,7 +8184,7 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
 static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
struct cftype *cftype, u64 shareval)
 {
-   return sched_group_set_shares(css_tg(css), scale_load(shareval));
+   return sched_group_set_shares(css_tg(css), 
user_to_kernel_load(shareval));
 }
 
 static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
@@ -8192,7 +8192,7 @@ static u64 cpu_shares_read_u64(struct cgroup_subsys_state 
*css,
 {
struct task_group *tg = css_tg(css);
 
-   return (u64) scale_load_down(tg->shares);
+   return (u64) kernel_to_user_load(tg->shares);
 }
 
 #ifdef CONFIG_CFS_BANDWIDTH
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 504803e..26bd0df 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -189,7 +189,7 @@ static void __update_inv_weight(struct load_weight *lw)
if (likely(lw->inv_weight))
return;
 
-   w = scale_load_down(lw->weight);
+   w = kernel_to_user_load(lw->weight);
 
if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
lw->inv_weight = 1;
@@ -210,10 +210,14 @@ static void __update_inv_weight(struct load_weight *lw)
  *
  * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
  * weight/lw.weight <= 1, and therefore our shift will also be positive.
+ *
+ * Note load.weight falls back to user load scale (i.e., NICE_0's load is
+ * 1024), instead of possibly increased kernel load scale (i.e., NICE_0's
+ * load is NICE_0_LOAD) due to multiplication and division efficiency.
  */
 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct 
load_weight *lw)
 {
-   u64 fact = scale_load_down(weight);
+   u64 fact = kernel_to_user_load(weight);
int shift = WMULT_SHIFT;
 
__update_inv_weight(lw);
@@ -7015,10 +7019,11 @@ static inline void calculate_imbalance(struct lb_env 
*env, struct sd_lb_stats *s
 */
if (busiest->group_type == group_overloaded &&
local->group_type   == group_overloaded) {
+   unsigned long min_cpu_load =
+   busiest->group_capacity * NICE_0_LOAD / 
SCHED_CAPACITY_SCALE;
load_above_capacity = busiest->sum_nr_running * NICE_0_LOAD;
-   if (load_above_capacity > scale_load(busiest->group_capacity))
-   load_above_capacity -=
-   scale_load(busiest->group_capacity);
+   if (load_above_capacity > min_cpu_load)
+   load_above_capacity -= min_cpu_load;
else
load_above_capacity = ~0UL;
}
@@ -8572,7 +8577,8 @@ int sched_group_set_shares(struct task_group *tg, 
unsigned long shares)
if (!tg->se[0])
return -EINVAL;
 
-   shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
+   shares = clamp(shares, user_to_kernel_load(MIN_SHARES),
+  user_to_kernel_load(MAX_SHARES));
 
mutex_lock(&shares_mutex);
if (tg->shares == shares)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1a3be6f..871da67 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -55,22 +55,22 @@ static inline void cpu_load_update_active(struct rq 
*this_rq) { }
  */
 #if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage 
under light load  */
 # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + 
SCHED_FIXEDPOINT_SHIFT)
-# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT)
-# define scale_load_down(w)((w) >> SCHED_FIXEDPOINT_SHIFT)
+# define user_to_kernel_load(w)((w) << SCHED_FIXEDPOINT_SHIFT)
+# define kernel_to_user_load(w)   

[PATCH v2 07/12] sched/fair: Generalize the load/util averages resolution definition

2016-05-02 Thread Yuyang Du
Integer metrics need fixed point arithmetic. In sched/fair, a few
metrics, including weight, load, load_avg, util_avg, freq, and capacity,
may have different fixed point ranges.

In order to avoid errors relating to the fixed point range of these
metrics, we define a basic fixed point range, and then express all the
metrics in terms of that basic range.

The basic range is 1024. Further, one can apply this basic range
recursively to obtain a larger range.

As pointed out by Ben Segall, weight (visible to user, e.g., NICE-0 has
1024) and load (e.g., NICE_0_LOAD) have independent ranges, but they
must be well calibrated.
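
A tiny sketch of the idea (not part of the patch): the basic range is a
Q10 fixed point scale, and applying the shift twice gives a Q20 scale;
the 0.75 example value is an illustrative assumption:

#include <stdio.h>

#define SCHED_FIXEDPOINT_SHIFT	10
#define SCHED_FIXEDPOINT_SCALE	(1L << SCHED_FIXEDPOINT_SHIFT)

int main(void)
{
	/* 0.75 in the basic (Q10) range and in the doubled (Q20) range */
	long q10 = 3 * SCHED_FIXEDPOINT_SCALE / 4;	/* 768 */
	long q20 = q10 << SCHED_FIXEDPOINT_SHIFT;	/* 786432 */

	printf("basic range:   %ld / %ld\n", q10, SCHED_FIXEDPOINT_SCALE);
	printf("doubled range: %ld / %ld\n", q20,
	       SCHED_FIXEDPOINT_SCALE * SCHED_FIXEDPOINT_SCALE);
	return 0;
}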

Signed-off-by: Yuyang Du 
---
 include/linux/sched.h |   16 +---
 kernel/sched/fair.c   |4 
 kernel/sched/sched.h  |   15 ++-
 3 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ad9454d..33e7929 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -937,9 +937,19 @@ enum cpu_idle_type {
 };
 
 /*
+ * Integer metrics need fixed point arithmetic, e.g., sched/fair
+ * has a few: load, load_avg, util_avg, freq, and capacity.
+ *
+ * We define a basic fixed point arithmetic range, and then formalize
+ * all these metrics based on that basic range.
+ */
+# define SCHED_FIXEDPOINT_SHIFT10
+# define SCHED_FIXEDPOINT_SCALE(1L << SCHED_FIXEDPOINT_SHIFT)
+
+/*
  * Increase resolution of cpu_capacity calculations
  */
-#define SCHED_CAPACITY_SHIFT   10
+#define SCHED_CAPACITY_SHIFT   SCHED_FIXEDPOINT_SHIFT
 #define SCHED_CAPACITY_SCALE   (1L << SCHED_CAPACITY_SHIFT)
 
 /*
@@ -1205,8 +1215,8 @@ struct load_weight {
  * 1) load_avg factors frequency scaling into the amount of time that a
  * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
  * aggregated such weights of all runnable and blocked sched_entities.
- * 2) util_avg factors frequency and cpu scaling into the amount of time
- * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
+ * 2) util_avg factors frequency and cpu capacity scaling into the amount of 
time
+ * that a sched_entity is running on a CPU, in the range 
[0..SCHED_CAPACITY_SCALE].
  * For cfs_rq, it is the aggregated such times of all runnable and
  * blocked sched_entities.
  * The 64 bit load_sum can:
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ea99c2c..69bfb07 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2676,10 +2676,6 @@ __accumulate_sum(u32 periods, u32 period_contrib, u32 
remainder)
return contrib + remainder;
 }
 
-#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT 
!= 10
-#error "load tracking assumes 2^10 as unit"
-#endif
-
 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
 
 static __always_inline u32 accumulate_sum(u64 delta, struct sched_avg *sa,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 69da6fc..996a137 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -54,18 +54,23 @@ static inline void cpu_load_update_active(struct rq 
*this_rq) { }
  * increased costs.
  */
 #if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage 
under light load  */
-# define SCHED_LOAD_RESOLUTION 10
-# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
-# define scale_load_down(w)((w) >> SCHED_LOAD_RESOLUTION)
+# define SCHED_LOAD_SHIFT  (SCHED_FIXEDPOINT_SHIFT + 
SCHED_FIXEDPOINT_SHIFT)
+# define scale_load(w) ((w) << SCHED_FIXEDPOINT_SHIFT)
+# define scale_load_down(w)((w) >> SCHED_FIXEDPOINT_SHIFT)
 #else
-# define SCHED_LOAD_RESOLUTION 0
+# define SCHED_LOAD_SHIFT  (SCHED_FIXEDPOINT_SHIFT)
 # define scale_load(w) (w)
 # define scale_load_down(w)(w)
 #endif
 
-#define SCHED_LOAD_SHIFT   (10 + SCHED_LOAD_RESOLUTION)
 #define SCHED_LOAD_SCALE   (1L << SCHED_LOAD_SHIFT)
 
+/*
+ * NICE_0's weight (visible to user) and its load (invisible to user) have
+ * independent ranges, but they should be well calibrated. We use scale_load()
+ * and scale_load_down(w) to convert between them, the following must be true:
+ * scale_load(sched_prio_to_weight[20]) == NICE_0_LOAD
+ */
 #define NICE_0_LOADSCHED_LOAD_SCALE
 #define NICE_0_SHIFT   SCHED_LOAD_SHIFT
 
-- 
1.7.9.5



[PATCH v2 06/12] documentation: Add scheduler/sched-avg.txt

2016-05-02 Thread Yuyang Du
This doc file has the programs to generate the constants to compute
sched averages.

Signed-off-by: Yuyang Du 
---
 Documentation/scheduler/sched-avg.txt |  137 +
 1 file changed, 137 insertions(+)
 create mode 100644 Documentation/scheduler/sched-avg.txt

diff --git a/Documentation/scheduler/sched-avg.txt 
b/Documentation/scheduler/sched-avg.txt
new file mode 100644
index 000..ae4132f
--- /dev/null
+++ b/Documentation/scheduler/sched-avg.txt
@@ -0,0 +1,137 @@
+The following programs are used to generate the constants for
+computing sched averages.
+
+==
+   C program (compile with -lm)
+==
+
+#include 
+#include 
+
+#define HALFLIFE 32
+#define SHIFT 32
+
+double y;
+
+void calc_decay_inv_multiply() {
+   int i;
+   unsigned int x;
+
+   printf("static const u32 __decay_inv_multiply_N[] = {");
+   for(i = 0; i < HALFLIFE; i++) {
+   x = ((1UL<<32)-1)*pow(y, i);
+
+   if (i % 6 == 0) printf("\n\t");
+   printf("0x%8x, ", x);
+   }
+   printf("\n};\n\n");
+}
+
+int sum = 1024;
+void calc_accumulated_sum() {
+   int i;
+
+   printf("static const u32 __accumulated_sum_N[] = {\n\t0,");
+   for(i = 1; i <= HALFLIFE; i++) {
+   if (i == 1)
+   sum *= y;
+   else
+   sum = sum*y + 1024*y;
+
+   if (i % 11 == 0) printf("\n\t");
+   printf("%5d,", sum);
+   }
+   printf("\n};\n\n");
+}
+
+int n = 1;
+/* first period */
+long max = 1024;
+
+void calc_converged_max() {
+   long last = 0, y_inv = ((1UL<<32)-1)*y;
+
+   for (; ; n++) {
+   if (n > 1)
+   max = ((max*y_inv)>>SHIFT) + 1024;
+   /*
+* This is the same as:
+* max = max*y + 1024;
+*/
+
+   if (last == max)
+   break;
+
+   last = max;
+   }
+   n--;
+   printf("#define SCHED_AVG_HALFLIFE %d\n", HALFLIFE);
+   printf("#define SCHED_AVG_MAX %ld\n", max);
+   printf("#define SCHED_AVG_MAX_N %d\n\n", n);
+}
+
+void calc_accumulated_sum_32() {
+   int i, x = sum;
+
+   printf("static const u32 __accumulated_sum_N32[] = {\n\t 0,");
+   for(i = 1; i <= n/HALFLIFE+1; i++) {
+   if (i > 1)
+   x = x/2 + sum;
+
+   if (i % 6 == 0) printf("\n\t");
+   printf("%6d,", x);
+   }
+   printf("\n};\n\n");
+}
+
+void main() {
+   y = pow(0.5, 1/(double)HALFLIFE);
+
+   calc_decay_inv_multiply();
+   calc_accumulated_sum();
+   calc_converged_max();
+   calc_accumulated_sum_32();
+}
+
+==
+   Python script if you speak snake
+==
+
+#!/usr/bin/env python
+
+print " #: yN_inv   yN_sum"
+print "---"
+y = (0.5)**(1/32.0)
+x = 2**32
+xx = 1024
+for i in range(0, 32):
+   if i == 0:
+   x = x-1
+   xx = xx*y
+   else:
+   x = x*y
+   xx = int(xx*y + 1024*y)
+   print "%2d: %#x %8d" % (i, int(x), int(xx))
+
+print
+print " #:  sum_N32"
+print ""
+xxx = xx
+for i in range(0, 11):
+   if i > 0:
+   xxx = xxx/2 + xx
+   print "%2d: %8d" % (i, xxx)
+
+print
+print "  n: max"
+print ""
+max = 1024
+old = 0
+i = 2
+while (1):
+   max = int(max*y + 1024)
+   if old == max:
+   break
+   i = i+1
+   old = max
+print "%3d: %7d" % (i-1, max)
-- 
1.7.9.5



[PATCH v2 01/12] sched/fair: Optimize sum computation with a lookup table

2016-05-02 Thread Yuyang Du
__compute_runnable_contrib() uses a loop to compute the sum, whereas a
table lookup can do it faster, in constant time.

The program to generate the constants is located at:
Documentation/scheduler/sched-avg.txt
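
The identity the lookup table relies on can be checked in floating point
with a short userspace sketch (not part of the patch, compile with -lm;
y is the per-period decay with y^32 = 1/2, and n = 100 is an arbitrary
assumed number of periods):

#include <stdio.h>
#include <math.h>

int main(void)
{
	double y = pow(0.5, 1.0 / 32.0);
	unsigned int n = 100;	/* arbitrary number of periods (assumed) */
	double loop = 0.0, full = 0.0, rem = 0.0, split;
	unsigned int k;

	/* the old way: sum 1024*y^k for k = 1..n */
	for (k = 1; k <= n; k++)
		loop += 1024.0 * pow(y, k);

	/* the new way: whole 32-period blocks, decayed, plus the remainder */
	for (k = 1; k <= (n / 32) * 32; k++)
		full += 1024.0 * pow(y, k);
	for (k = 1; k <= n % 32; k++)
		rem += 1024.0 * pow(y, k);
	split = full * pow(y, n % 32) + rem;

	printf("loop = %.3f, split = %.3f\n", loop, split);
	return 0;
}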

Signed-off-by: Yuyang Du 
Reviewed-by: Morten Rasmussen 
Acked-by: Vincent Guittot 
Signed-off-by: Yuyang Du 
---
 kernel/sched/fair.c |   20 
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b8a33ab..e803f11 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2603,6 +2603,15 @@ static const u32 runnable_avg_yN_sum[] = {
 };
 
 /*
+ * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
+ * lower integers.
+ */
+static const u32 __accumulated_sum_N32[] = {
+   0, 23371, 35056, 40899, 43820, 45281,
+   46011, 46376, 46559, 46650, 46696, 46719,
+};
+
+/*
  * Approximate:
  *   val * y^n,where y^32 ~= 0.5 (~1 scheduling period)
  */
@@ -2650,14 +2659,9 @@ static u32 __compute_runnable_contrib(u64 n)
else if (unlikely(n >= LOAD_AVG_MAX_N))
return LOAD_AVG_MAX;
 
-   /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
-   do {
-   contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
-   contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
-
-   n -= LOAD_AVG_PERIOD;
-   } while (n > LOAD_AVG_PERIOD);
-
+   /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
+   contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
+   n %= LOAD_AVG_PERIOD;
contrib = decay_load(contrib, n);
return contrib + runnable_avg_yN_sum[n];
 }
-- 
1.7.9.5



[patch added to 3.12-stable] workqueue: fix ghost PENDING flag while doing MQ IO

2016-05-02 Thread Jiri Slaby
From: Roman Pen 

This patch has been added to the 3.12 stable tree. If you have any
objections, please let us know.

===

commit 346c09f80459a3ad97df1816d6d606169a51001a upstream.

A bug in the workqueue code leads to a stalled IO request in the MQ
ctx->rq_list, with the following backtrace:

[  601.347452] INFO: task kworker/u129:5:1636 blocked for more than 120 seconds.
[  601.347574]   Tainted: G   O4.4.5-1-storage+ #6
[  601.347651] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this 
message.
[  601.348142] kworker/u129:5  D 880803077988 0  1636  2 0x
[  601.348519] Workqueue: ibnbd_server_fileio_wq 
ibnbd_dev_file_submit_io_worker [ibnbd_server]
[  601.348999]  880803077988 88080466b900 8808033f9c80 
880803078000
[  601.349662]  880807c95000 7fff 815b0920 
880803077ad0
[  601.350333]  8808030779a0 815b01d5  
880803077a38
[  601.350965] Call Trace:
[  601.351203]  [] ? bit_wait+0x60/0x60
[  601.351444]  [] schedule+0x35/0x80
[  601.351709]  [] schedule_timeout+0x192/0x230
[  601.351958]  [] ? blk_flush_plug_list+0xc7/0x220
[  601.352208]  [] ? ktime_get+0x37/0xa0
[  601.352446]  [] ? bit_wait+0x60/0x60
[  601.352688]  [] io_schedule_timeout+0xa4/0x110
[  601.352951]  [] ? _raw_spin_unlock_irqrestore+0xe/0x10
[  601.353196]  [] bit_wait_io+0x1b/0x70
[  601.353440]  [] __wait_on_bit+0x5d/0x90
[  601.353689]  [] wait_on_page_bit+0xc0/0xd0
[  601.353958]  [] ? autoremove_wake_function+0x40/0x40
[  601.354200]  [] __filemap_fdatawait_range+0xe4/0x140
[  601.354441]  [] filemap_fdatawait_range+0x14/0x30
[  601.354688]  [] filemap_write_and_wait_range+0x3f/0x70
[  601.354932]  [] blkdev_fsync+0x1b/0x50
[  601.355193]  [] vfs_fsync_range+0x49/0xa0
[  601.355432]  [] blkdev_write_iter+0xca/0x100
[  601.355679]  [] __vfs_write+0xaa/0xe0
[  601.355925]  [] vfs_write+0xa9/0x1a0
[  601.356164]  [] kernel_write+0x38/0x50

The underlying device is a null_blk, with default parameters:

  queue_mode= MQ
  submit_queues = 1

Verification that nullb0 has something inflight:

root@pserver8:~# cat /sys/block/nullb0/inflight
   01
root@pserver8:~# find /sys/block/nullb0/mq/0/cpu* -name rq_list -print -exec 
cat {} \;
...
/sys/block/nullb0/mq/0/cpu2/rq_list
CTX pending:
8838038e2400
...

During debug it became clear that stalled request is always inserted in
the rq_list from the following path:

   save_stack_trace_tsk + 34
   blk_mq_insert_requests + 231
   blk_mq_flush_plug_list + 281
   blk_flush_plug_list + 199
   wait_on_page_bit + 192
   __filemap_fdatawait_range + 228
   filemap_fdatawait_range + 20
   filemap_write_and_wait_range + 63
   blkdev_fsync + 27
   vfs_fsync_range + 73
   blkdev_write_iter + 202
   __vfs_write + 170
   vfs_write + 169
   kernel_write + 56

So blk_flush_plug_list() was called with from_schedule == true.

If from_schedule is true, that means that finally blk_mq_insert_requests()
offloads execution of __blk_mq_run_hw_queue() and uses kblockd workqueue,
i.e. it calls kblockd_schedule_delayed_work_on().

That means, that we race with another CPU, which is about to execute
__blk_mq_run_hw_queue() work.

Further debugging shows the following traces from different CPUs:

  CPU#0  CPU#1
  -- ---
  request A inserted
  STORE hctx->ctx_map[0] bit marked
  kblockd_schedule...() returns 1
  
 request B inserted
 STORE hctx->ctx_map[1] bit marked
 kblockd_schedule...() returns 0
  *** WORK PENDING bit is cleared ***
  flush_busy_ctxs() is executed, but
  bit 1, set by CPU#1, is not observed

As a result request B pended forever.

This behaviour can be explained by speculative LOAD of hctx->ctx_map on
CPU#0, which is reordered with clear of PENDING bit and executed _before_
actual STORE of bit 1 on CPU#1.

The proper fix is an explicit full memory barrier, which guarantees
that the clearing of the PENDING bit is executed before all possible
speculative LOADS or STORES inside the actual work function.
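
The store-buffering pattern described above, and why a full barrier cures
it, can be modelled with plain C11 atomics (a userspace analogy only, not
the actual workqueue fix; names and values are illustrative assumptions):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/*
 * Each side stores one flag and then loads the other. Without the
 * seq_cst fences (smp_mb() in the kernel), both loads may observe the
 * old values: the worker misses request B and the submitter sees
 * PENDING still set, so the work is never re-queued.
 */
static atomic_int pending  = 1;	/* models the WORK PENDING bit */
static atomic_int ctx_map1 = 0;	/* models the ctx_map bit set by CPU#1 */

static void *cpu0_worker(void *arg)
{
	atomic_store_explicit(&pending, 0, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* the full barrier */
	if (atomic_load_explicit(&ctx_map1, memory_order_relaxed))
		puts("CPU#0: sees request B, will flush it");
	return NULL;
}

static void *cpu1_submitter(void *arg)
{
	atomic_store_explicit(&ctx_map1, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	if (!atomic_exchange_explicit(&pending, 1, memory_order_relaxed))
		puts("CPU#1: PENDING was clear, re-queue the work");
	return NULL;
}

int main(void)
{
	pthread_t t0, t1;

	pthread_create(&t0, NULL, cpu0_worker, NULL);
	pthread_create(&t1, NULL, cpu1_submitter, NULL);
	pthread_join(t0, NULL);
	pthread_join(t1, NULL);
	/* with the fences, at least one of the two messages always prints */
	return 0;
}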

Signed-off-by: Roman Pen 
Cc: Gioh Kim 
Cc: Michael Wang 
Cc: Tejun Heo 
Cc: Jens Axboe 
Cc: linux-bl...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Tejun Heo 
Signed-off-by: Jiri Slaby 
---
 kernel/workqueue.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index bb5f920268d7..2bc1257e420f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -622,6 +622,35 @@ static void set_work_pool_and_clear_pending(struct 
work_struct *work,
 */
smp_wmb();

[PATCH v2 03/12] sched/fair: Change the variable to hold the number of periods to 32bit integer

2016-05-02 Thread Yuyang Du
In the sched average update, a period is about 1 ms, so a 32-bit unsigned
integer can hold approximately a maximum of 49 (= 2^32/1000/3600/24) days'
worth of periods, which is big enough; a 64-bit variable is needless.
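
As a quick check of the arithmetic (taking a period as exactly 1 ms, as
the changelog does):

    2^32 periods * 1 ms/period = 4,294,967,296 ms ~= 4.29 * 10^6 s ~= 49.7 days

With the true period of 1024*1024 ns the bound is 2^52 ns ~= 52 days, so
32 bits comfortably cover any realistic gap between two updates.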

Signed-off-by: Yuyang Du 
---
 kernel/sched/fair.c |   27 +--
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 74eaeab..17bc721 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2619,18 +2619,13 @@ static const u32 __accumulated_sum_N32[] = {
  * n is the number of periods past; a period is ~1ms
  * m is called half-life in exponential decay; here it is 
SCHED_AVG_HALFLIFE=32.
  */
-static __always_inline u64 __decay_sum(u64 val, u64 n)
+static __always_inline u64 __decay_sum(u64 val, u32 n)
 {
-   unsigned int local_n;
-
if (!n)
return val;
else if (unlikely(n > SCHED_AVG_HALFLIFE * 63))
return 0;
 
-   /* after bounds checking we can collapse to 32-bit */
-   local_n = n;
-
/*
 * As y^PERIOD = 1/2, we can combine
 *y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
@@ -2638,12 +2633,12 @@ static __always_inline u64 __decay_sum(u64 val, u64 n)
 *
 * To achieve constant time decay_load.
 */
-   if (unlikely(local_n >= SCHED_AVG_HALFLIFE)) {
-   val >>= local_n / SCHED_AVG_HALFLIFE;
-   local_n %= SCHED_AVG_HALFLIFE;
+   if (unlikely(n >= SCHED_AVG_HALFLIFE)) {
+   val >>= n / SCHED_AVG_HALFLIFE;
+   n %= SCHED_AVG_HALFLIFE;
}
 
-   val = mul_u64_u32_shr(val, __decay_inv_multiply_N[local_n], 32);
+   val = mul_u64_u32_shr(val, __decay_inv_multiply_N[n], 32);
return val;
 }
 
@@ -2654,7 +2649,7 @@ static __always_inline u64 __decay_sum(u64 val, u64 n)
  * We can compute this efficiently by combining:
  * y^32 = 1/2 with precomputed \Sum 1024*y^n   (where n < 32)
  */
-static u32 __accumulate_sum(u64 n)
+static u32 __accumulate_sum(u32 n)
 {
u32 contrib = 0;
 
@@ -2708,8 +2703,8 @@ static __always_inline int
 __update_sched_avg(u64 now, int cpu, struct sched_avg *sa,
  unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
-   u64 delta, scaled_delta, periods;
-   u32 contrib;
+   u64 delta, scaled_delta;
+   u32 contrib, periods;
unsigned int delta_w, scaled_delta_w, decayed = 0;
unsigned long scale_freq, scale_cpu;
 
@@ -2762,7 +2757,11 @@ __update_sched_avg(u64 now, int cpu, struct sched_avg 
*sa,
 
delta -= delta_w;
 
-   /* Figure out how many additional periods this update spans */
+   /*
+* Figure out how many additional periods this update spans.
+* A period is 1024*1024ns or ~1ms, so a 32bit integer can hold
+* approximately a maximum of 49 (=2^32/1000/3600/24) days.
+*/
periods = delta / 1024;
delta %= 1024;
 
-- 
1.7.9.5



[PATCH v2 01/12] sched/fair: Optimize sum computation with a lookup table

2016-05-02 Thread Yuyang Du
__compute_runnable_contrib() uses a loop to compute the sum, whereas a
table lookup can do it faster, in constant time.

The program to generate the constants is located at:
Documentation/scheduler/sched-avg.txt
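
For intuition, the identity the lookup relies on (split n into full
32-period chunks plus a remainder, decay the cached chunk sum by the
remainder, then add the remainder sum) can be checked with a few lines of
standalone userspace C. This is a floating-point sketch only; the kernel
tables additionally scale by 1024 and floor the values:

    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
            const int period = 32;                   /* LOAD_AVG_PERIOD */
            const double y = pow(0.5, 1.0 / period); /* y^32 == 0.5 */
            int n = 100;                             /* any n < LOAD_AVG_MAX_N */
            int q = n / period, r = n % period;
            double direct = 0, chunk = 0, rest = 0;
            int k;

            for (k = 1; k <= n; k++)          /* what the old loop summed */
                    direct += pow(y, k);
            for (k = 1; k <= q * period; k++) /* cached in __accumulated_sum_N32[q] */
                    chunk += pow(y, k);
            for (k = 1; k <= r; k++)          /* cached in runnable_avg_yN_sum[r] */
                    rest += pow(y, k);

            /* one decay of the cached chunk plus the remainder == the loop */
            printf("%.6f == %.6f\n", direct, pow(y, r) * chunk + rest);
            return 0;
    }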

Signed-off-by: Yuyang Du 
Reviewed-by: Morten Rasmussen 
Acked-by: Vincent Guittot 
Signed-off-by: Yuyang Du 
---
 kernel/sched/fair.c |   20 
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b8a33ab..e803f11 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2603,6 +2603,15 @@ static const u32 runnable_avg_yN_sum[] = {
 };
 
 /*
+ * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
+ * lower integers.
+ */
+static const u32 __accumulated_sum_N32[] = {
+   0, 23371, 35056, 40899, 43820, 45281,
+   46011, 46376, 46559, 46650, 46696, 46719,
+};
+
+/*
  * Approximate:
  *   val * y^n,where y^32 ~= 0.5 (~1 scheduling period)
  */
@@ -2650,14 +2659,9 @@ static u32 __compute_runnable_contrib(u64 n)
else if (unlikely(n >= LOAD_AVG_MAX_N))
return LOAD_AVG_MAX;
 
-   /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
-   do {
-   contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
-   contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
-
-   n -= LOAD_AVG_PERIOD;
-   } while (n > LOAD_AVG_PERIOD);
-
+   /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
+   contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
+   n %= LOAD_AVG_PERIOD;
contrib = decay_load(contrib, n);
return contrib + runnable_avg_yN_sum[n];
 }
-- 
1.7.9.5



[patch added to 3.12-stable] workqueue: fix ghost PENDING flag while doing MQ IO

2016-05-02 Thread Jiri Slaby
From: Roman Pen 

This patch has been added to the 3.12 stable tree. If you have any
objections, please let us know.

===

commit 346c09f80459a3ad97df1816d6d606169a51001a upstream.

The bug in a workqueue leads to a stalled IO request in MQ ctx->rq_list
with the following backtrace:

[  601.347452] INFO: task kworker/u129:5:1636 blocked for more than 120 seconds.
[  601.347574]   Tainted: G   O4.4.5-1-storage+ #6
[  601.347651] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this 
message.
[  601.348142] kworker/u129:5  D 880803077988 0  1636  2 0x
[  601.348519] Workqueue: ibnbd_server_fileio_wq 
ibnbd_dev_file_submit_io_worker [ibnbd_server]
[  601.348999]  880803077988 88080466b900 8808033f9c80 
880803078000
[  601.349662]  880807c95000 7fff 815b0920 
880803077ad0
[  601.350333]  8808030779a0 815b01d5  
880803077a38
[  601.350965] Call Trace:
[  601.351203]  [] ? bit_wait+0x60/0x60
[  601.351444]  [] schedule+0x35/0x80
[  601.351709]  [] schedule_timeout+0x192/0x230
[  601.351958]  [] ? blk_flush_plug_list+0xc7/0x220
[  601.352208]  [] ? ktime_get+0x37/0xa0
[  601.352446]  [] ? bit_wait+0x60/0x60
[  601.352688]  [] io_schedule_timeout+0xa4/0x110
[  601.352951]  [] ? _raw_spin_unlock_irqrestore+0xe/0x10
[  601.353196]  [] bit_wait_io+0x1b/0x70
[  601.353440]  [] __wait_on_bit+0x5d/0x90
[  601.353689]  [] wait_on_page_bit+0xc0/0xd0
[  601.353958]  [] ? autoremove_wake_function+0x40/0x40
[  601.354200]  [] __filemap_fdatawait_range+0xe4/0x140
[  601.354441]  [] filemap_fdatawait_range+0x14/0x30
[  601.354688]  [] filemap_write_and_wait_range+0x3f/0x70
[  601.354932]  [] blkdev_fsync+0x1b/0x50
[  601.355193]  [] vfs_fsync_range+0x49/0xa0
[  601.355432]  [] blkdev_write_iter+0xca/0x100
[  601.355679]  [] __vfs_write+0xaa/0xe0
[  601.355925]  [] vfs_write+0xa9/0x1a0
[  601.356164]  [] kernel_write+0x38/0x50

The underlying device is a null_blk, with default parameters:

  queue_mode= MQ
  submit_queues = 1

Verification that nullb0 has something inflight:

root@pserver8:~# cat /sys/block/nullb0/inflight
   01
root@pserver8:~# find /sys/block/nullb0/mq/0/cpu* -name rq_list -print -exec 
cat {} \;
...
/sys/block/nullb0/mq/0/cpu2/rq_list
CTX pending:
8838038e2400
...

During debugging it became clear that the stalled request is always
inserted into the rq_list from the following path:

   save_stack_trace_tsk + 34
   blk_mq_insert_requests + 231
   blk_mq_flush_plug_list + 281
   blk_flush_plug_list + 199
   wait_on_page_bit + 192
   __filemap_fdatawait_range + 228
   filemap_fdatawait_range + 20
   filemap_write_and_wait_range + 63
   blkdev_fsync + 27
   vfs_fsync_range + 73
   blkdev_write_iter + 202
   __vfs_write + 170
   vfs_write + 169
   kernel_write + 56

So blk_flush_plug_list() was called with from_schedule == true.

If from_schedule is true, that means that finally blk_mq_insert_requests()
offloads execution of __blk_mq_run_hw_queue() and uses kblockd workqueue,
i.e. it calls kblockd_schedule_delayed_work_on().

That means, that we race with another CPU, which is about to execute
__blk_mq_run_hw_queue() work.

Further debugging shows the following traces from different CPUs:

  CPU#0  CPU#1
  -- ---
  request A inserted
  STORE hctx->ctx_map[0] bit marked
  kblockd_schedule...() returns 1
  
 request B inserted
 STORE hctx->ctx_map[1] bit marked
 kblockd_schedule...() returns 0
  *** WORK PENDING bit is cleared ***
  flush_busy_ctxs() is executed, but
  bit 1, set by CPU#1, is not observed

As a result request B pended forever.

This behaviour can be explained by a speculative LOAD of hctx->ctx_map on
CPU#0, which is reordered with the clear of the PENDING bit and executed
_before_ the actual STORE of bit 1 on CPU#1.

The proper fix is an explicit full memory barrier, which guarantees that
the clear of the PENDING bit is executed before all possible speculative
LOADs or STOREs inside the actual work function.
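
In barrier terms, what is required is a full barrier between the STORE
that clears PENDING and the first LOAD the work function performs.
Roughly (a sketch of the idea only; the actual hunk is quoted below):

    set_work_data(work, ...);   /* clears WORK_STRUCT_PENDING_BIT */
    smp_mb();                   /* full barrier: make the PENDING clear
                                 * visible before any LOAD issued by the
                                 * work function, here flush_busy_ctxs()
                                 * reading hctx->ctx_map; the pre-existing
                                 * smp_wmb() only orders earlier STOREs
                                 * against the clear and cannot order the
                                 * clear against later LOADs */
    work->func(work);           /* __blk_mq_run_hw_queue() */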

Signed-off-by: Roman Pen 
Cc: Gioh Kim 
Cc: Michael Wang 
Cc: Tejun Heo 
Cc: Jens Axboe 
Cc: linux-bl...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Tejun Heo 
Signed-off-by: Jiri Slaby 
---
 kernel/workqueue.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index bb5f920268d7..2bc1257e420f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -622,6 +622,35 @@ static void set_work_pool_and_clear_pending(struct 
work_struct *work,
 */
smp_wmb();
set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
+   /*
+* The following mb guarantees that previous clear of a PENDING bit
+* will not be reordered 


[PATCH v2 00/12] sched/fair: Optimize and clean up sched averages

2016-05-02 Thread Yuyang Du
Hi Peter,

This patch series combines the previous cleanup and optimization
series. And as you and Ingo suggested, the increased kernel load
scale is reinstated when on 64BIT and FAIR_GROUP_SCHED. In addition
to that, the changes include Vincent's fix, typo fixes, and changelog
and comment rewording.

Thanks,
Yuyang

Yuyang Du (12):
  sched/fair: Optimize sum computation with a lookup table
  sched/fair: Rename variable names for sched averages
  sched/fair: Change the variable to hold the number of periods to
32bit integer
  sched/fair: Add __always_inline compiler attribute to
__accumulate_sum()
  sched/fair: Optimize __update_sched_avg()
  documentation: Add scheduler/sched-avg.txt
  sched/fair: Generalize the load/util averages resolution definition
  sched/fair: Remove SCHED_LOAD_SHIFT and SCHED_LOAD_SCALE
  sched/fair: Add introduction to the sched average metrics
  sched/fair: Remove scale_load_down() for load_avg
  sched/fair: Rename scale_load() and scale_load_down()
  sched/fair: Enable increased scale for kernel load

 Documentation/scheduler/sched-avg.txt |  137 
 include/linux/sched.h |   81 ++-
 kernel/sched/core.c   |8 +-
 kernel/sched/fair.c   |  398 +
 kernel/sched/sched.h  |   48 ++--
 5 files changed, 439 insertions(+), 233 deletions(-)
 create mode 100644 Documentation/scheduler/sched-avg.txt

-- 
1.7.9.5




[PATCH v2 02/12] sched/fair: Rename variable names for sched averages

2016-05-02 Thread Yuyang Du
The names of the sched averages (including load_avg and util_avg) have
been changed and added over the past couple of years; some of the names
are a bit confusing, especially to people reading them for the first
time. This patch attempts to make the names more self-explanatory. Some
comments are updated too.
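
For reference, the constants behind the new names encode a plain
geometric series: a half-life of 32 periods means a decay factor
y = 0.5^(1/32) ~= 0.97857 per ~1 ms period, so a contribution loses half
of its weight every ~32 ms, and the running sum 1024 * (1 + y + y^2 + ...)
is bounded by 1024 / (1 - y) ~= 47.8k, which is where SCHED_AVG_MAX = 47742
(the old LOAD_AVG_MAX) comes from once the integer rounding used to build
the tables is taken into account.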

Signed-off-by: Yuyang Du 
---
 kernel/sched/fair.c |  209 ++-
 1 file changed, 107 insertions(+), 102 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e803f11..74eaeab 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -660,13 +660,15 @@ static int select_idle_sibling(struct task_struct *p, int 
cpu);
 static unsigned long task_h_load(struct task_struct *p);
 
 /*
- * We choose a half-life close to 1 scheduling period.
- * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
- * dependent on this value.
+ * Note: everything in sched average calculation, including
+ * __decay_inv_multiply_N, __accumulated_sum_N, __accumulated_sum_N32,
+ * SCHED_AVG_MAX, and SCHED_AVG_MAX_N are all dependent on and only on
+ * (1) exponential decay, (2) a period of 1024*1024ns (~1ms), and (3)
+ * a half-life of 32 periods.
  */
-#define LOAD_AVG_PERIOD 32
-#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
-#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
+#define SCHED_AVG_HALFLIFE 32  /* number of periods as a half-life */
+#define SCHED_AVG_MAX 47742/* maximum possible sched avg */
+#define SCHED_AVG_MAX_N 345/* number of full periods to produce 
SCHED_AVG_MAX */
 
 /* Give new sched_entity start runnable values to heavy its load in infant 
time */
 void init_entity_runnable_average(struct sched_entity *se)
@@ -681,7 +683,7 @@ void init_entity_runnable_average(struct sched_entity *se)
 */
sa->period_contrib = 1023;
sa->load_avg = scale_load_down(se->load.weight);
-   sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
+   sa->load_sum = sa->load_avg * SCHED_AVG_MAX;
/*
 * At this point, util_avg won't be used in select_task_rq_fair anyway
 */
@@ -731,7 +733,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
} else {
sa->util_avg = cap;
}
-   sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+   sa->util_sum = sa->util_avg * SCHED_AVG_MAX;
}
 }
 
@@ -1834,7 +1836,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, 
u64 *period)
*period = now - p->last_task_numa_placement;
} else {
delta = p->se.avg.load_sum / p->se.load.weight;
-   *period = LOAD_AVG_MAX;
+   *period = SCHED_AVG_MAX;
}
 
p->last_sum_exec_runtime = runtime;
@@ -2583,7 +2585,7 @@ static inline void update_cfs_shares(struct cfs_rq 
*cfs_rq)
 
 #ifdef CONFIG_SMP
 /* Precomputed fixed inverse multiplies for multiplication by y^n */
-static const u32 runnable_avg_yN_inv[] = {
+static const u32 __decay_inv_multiply_N[] = {
0x, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
@@ -2596,7 +2598,7 @@ static const u32 runnable_avg_yN_inv[] = {
  * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
  * over-estimates when re-combining.
  */
-static const u32 runnable_avg_yN_sum[] = {
+static const u32 __accumulated_sum_N[] = {
0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
@@ -2612,16 +2614,18 @@ static const u32 __accumulated_sum_N32[] = {
 };
 
 /*
- * Approximate:
- *   val * y^n,where y^32 ~= 0.5 (~1 scheduling period)
+ * val * y^n, where y^m ~= 0.5
+ *
+ * n is the number of periods past; a period is ~1ms
+ * m is called half-life in exponential decay; here it is 
SCHED_AVG_HALFLIFE=32.
  */
-static __always_inline u64 decay_load(u64 val, u64 n)
+static __always_inline u64 __decay_sum(u64 val, u64 n)
 {
unsigned int local_n;
 
if (!n)
return val;
-   else if (unlikely(n > LOAD_AVG_PERIOD * 63))
+   else if (unlikely(n > SCHED_AVG_HALFLIFE * 63))
return 0;
 
/* after bounds checking we can collapse to 32-bit */
@@ -2634,36 +2638,36 @@ static __always_inline u64 decay_load(u64 val, u64 n)
 *
 * To achieve constant time decay_load.
 */
-   if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
-   val >>= local_n / LOAD_AVG_PERIOD;
-   local_n %= LOAD_AVG_PERIOD;
+   if (unlikely(local_n >= SCHED_AVG_HALFLIFE)) {
+   val >>= local_n / SCHED_AVG_HALFLIFE;
+   local_n %= SCHED_AVG_HALFLIFE;
  

RE: [PATCH 5/5] vfio-pci: Allow to mmap MSI-X table if interrupt remapping is supported

2016-05-02 Thread Tian, Kevin
> From: Yongji Xie
> Sent: Wednesday, April 27, 2016 8:43 PM
> 
> This patch enables mmapping MSI-X tables if hardware supports
> interrupt remapping which can ensure that a given pci device
> can only shoot the MSIs assigned for it.
> 
> With MSI-X table mmapped, we also need to expose the
> read/write interface which will be used to access MSI-X table.
> 
> Signed-off-by: Yongji Xie 

A curious question here. Does "allow to mmap MSI-X" essentially
mean that a KVM guest can then directly read/write the physical MSI-X
structure?

Thanks
Kevin


RE: [PATCH 5/5] vfio-pci: Allow to mmap MSI-X table if interrupt remapping is supported

2016-05-02 Thread Tian, Kevin
> From: Yongji Xie
> Sent: Wednesday, April 27, 2016 8:43 PM
> 
> This patch enables mmapping MSI-X tables if hardware supports
> interrupt remapping which can ensure that a given pci device
> can only shoot the MSIs assigned for it.
> 
> With MSI-X table mmapped, we also need to expose the
> read/write interface which will be used to access MSI-X table.
> 
> Signed-off-by: Yongji Xie 

A curious question here. Does "allow to mmap MSI-X" essentially
mean that KVM guest can directly read/write physical MSI-X
structure then?

Thanks
Kevin


Re: [PATCH 7/9] powerpc/powernv: Add platform support for stop instruction

2016-05-02 Thread Michael Neuling

> diff --git a/arch/powerpc/include/asm/cputable.h 
> b/arch/powerpc/include/asm/cputable.h
> index df4fb5f..a4739a1 100644
> --- a/arch/powerpc/include/asm/cputable.h
> +++ b/arch/powerpc/include/asm/cputable.h
> @@ -205,6 +205,7 @@ enum {
>  #define CPU_FTR_DABRX
> LONG_ASM_CONST(0x0800)
>  #define CPU_FTR_PMAO_BUG LONG_ASM_CONST(0x1000)
>  #define CPU_FTR_SUBCORE  
> LONG_ASM_CONST(0x2000)
> +#define CPU_FTR_STOP_INST    LONG_ASM_CONST(0x4000)

In general, we are putting all the POWER9 features under CPU_FTR_ARCH_300.
Is there a reason you need this separate bit?

CPU_FTR bits are fairly scarce these days.

Mikey


Re: [PATCH 7/9] powerpc/powernv: Add platform support for stop instruction

2016-05-02 Thread Michael Neuling

> diff --git a/arch/powerpc/include/asm/cputable.h 
> b/arch/powerpc/include/asm/cputable.h
> index df4fb5f..a4739a1 100644
> --- a/arch/powerpc/include/asm/cputable.h
> +++ b/arch/powerpc/include/asm/cputable.h
> @@ -205,6 +205,7 @@ enum {
>  #define CPU_FTR_DABRX
> LONG_ASM_CONST(0x0800)
>  #define CPU_FTR_PMAO_BUG LONG_ASM_CONST(0x1000)
>  #define CPU_FTR_SUBCORE  
> LONG_ASM_CONST(0x2000)
> +#define CPU_FTR_STOP_INSTLONG_ASM_CONST(0x4000)

In general, we are putting all the POWER9 features under CPU_FTR_ARCH_300.
Is there a reason you need this separate bit?

CPU_FTR bits are fairly scarce these days.

Mikey


[PATCH 4/6] mm/page_owner: introduce split_page_owner and replace manual handling

2016-05-02 Thread js1304
From: Joonsoo Kim 

split_page() calls set_page_owner() to set up page_owner for each page.
But it has the drawback that the head page and the tail pages get
slightly different stacktraces, because the callsite of set_page_owner()
differs. To avoid this problem, this patch copies the head page's
page_owner to the others. It needs to introduce a new function,
split_page_owner(), but it also removes the other function,
get_page_owner_gfp(), so it looks like a good trade.
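
As a small illustration of where this matters (an illustrative snippet,
not part of the patch): any caller that allocates a high-order block and
then hands the pieces out individually goes through this path.

    struct page *page = alloc_pages(GFP_KERNEL, 2); /* one order-2 block,
                                                       error handling omitted */

    split_page(page, 2);    /* split_page_owner() copies the head page's
                             * recorded stacktrace to page+1 .. page+3, so
                             * all four order-0 pages now report the same
                             * allocation stack as the head page */
    __free_page(page + 3);  /* the pieces can be used and freed separately */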

Signed-off-by: Joonsoo Kim 
---
 include/linux/page_owner.h | 12 +---
 mm/page_alloc.c|  8 ++--
 mm/page_owner.c|  7 +--
 3 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h
index 46f1b93..30583ab 100644
--- a/include/linux/page_owner.h
+++ b/include/linux/page_owner.h
@@ -10,7 +10,7 @@ extern struct page_ext_operations page_owner_ops;
 extern void __reset_page_owner(struct page *page, unsigned int order);
 extern void __set_page_owner(struct page *page,
unsigned int order, gfp_t gfp_mask);
-extern gfp_t __get_page_owner_gfp(struct page *page);
+extern void __split_page_owner(struct page *page, unsigned int order);
 extern void __copy_page_owner(struct page *oldpage, struct page *newpage);
 extern void __set_page_owner_migrate_reason(struct page *page, int reason);
 extern void __dump_page_owner(struct page *page);
@@ -28,12 +28,10 @@ static inline void set_page_owner(struct page *page,
__set_page_owner(page, order, gfp_mask);
 }
 
-static inline gfp_t get_page_owner_gfp(struct page *page)
+static inline void split_page_owner(struct page *page, unsigned int order)
 {
if (static_branch_unlikely(&page_owner_inited))
-   return __get_page_owner_gfp(page);
-   else
-   return 0;
+   __split_page_owner(page, order);
 }
 static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
 {
@@ -58,9 +56,9 @@ static inline void set_page_owner(struct page *page,
unsigned int order, gfp_t gfp_mask)
 {
 }
-static inline gfp_t get_page_owner_gfp(struct page *page)
+static inline void split_page_owner(struct page *page,
+   unsigned int order)
 {
-   return 0;
 }
 static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5b632be..7cefc90 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2466,7 +2466,6 @@ void free_hot_cold_page_list(struct list_head *list, bool 
cold)
 void split_page(struct page *page, unsigned int order)
 {
int i;
-   gfp_t gfp_mask;
 
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!page_count(page), page);
@@ -2480,12 +2479,9 @@ void split_page(struct page *page, unsigned int order)
split_page(virt_to_page(page[0].shadow), order);
 #endif
 
-   gfp_mask = get_page_owner_gfp(page);
-   set_page_owner(page, 0, gfp_mask);
-   for (i = 1; i < (1 << order); i++) {
+   for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
-   set_page_owner(page + i, 0, gfp_mask);
-   }
+   split_page_owner(page, order);
 }
 EXPORT_SYMBOL_GPL(split_page);
 
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 6693959..7b5a834 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -86,11 +86,14 @@ void __set_page_owner_migrate_reason(struct page *page, int 
reason)
page_ext->last_migrate_reason = reason;
 }
 
-gfp_t __get_page_owner_gfp(struct page *page)
+void __split_page_owner(struct page *page, unsigned int order)
 {
struct page_ext *page_ext = lookup_page_ext(page);
+   int i;
 
-   return page_ext->gfp_mask;
+   page_ext->order = 0;
+   for (i = 1; i < (1 << order); i++)
+   __copy_page_owner(page, page + i);
 }
 
 void __copy_page_owner(struct page *oldpage, struct page *newpage)
-- 
1.9.1





[PATCH 6/6] mm/page_owner: use stackdepot to store stacktrace

2016-05-02 Thread js1304
From: Joonsoo Kim 

Currently, we store each page's allocation stacktrace in the
corresponding page_ext structure, and it requires a lot of memory. This
causes the problem that a memory-tight system doesn't work well when
page_owner is enabled. Moreover, even with this large memory
consumption, we cannot get a full stacktrace because we allocate the
memory at boot time and maintain just 8 stacktrace slots to balance
memory consumption. We could increase the number of slots, but that
would make the system unusable or change its behaviour.

To solve the problem, this patch uses stackdepot to store stacktraces.
It obviously provides memory savings, but there is a drawback:
stackdepot could fail.

stackdepot allocates memory at runtime, so it could fail if the system
does not have enough memory. But most allocation stacks are generated
very early, when there is still plenty of memory, so failure would not
happen easily. And one failure means that we miss just one page's
allocation stacktrace, so it would not be a big problem. In this patch,
when a memory allocation failure happens, we store a special stacktrace
handle in the page whose stacktrace failed to be saved. With it, the
user can still estimate memory usage properly even if a failure happens.

Memory saving looks as follows. (Boot of a 4GB memory system with page_owner)

92274688 bytes -> 25165824 bytes

That is a 72% reduction in static allocation size. Even if we add in the
size of the dynamically allocated memory, it would not be that big,
because stacktraces are mostly duplicated.

Note that the implementation looks more complex than one might imagine
because there is a recursion issue: stackdepot uses the page allocator,
and page_owner is called at page allocation. Using stackdepot in
page_owner could therefore re-enter the page allocator and then
page_owner again. To detect and avoid this, whenever we obtain a
stacktrace, recursion is checked, and page_owner is set to dummy
information if it is found. Dummy information means that this page was
allocated for the page_owner feature itself (such as by stackdepot),
which is understandable behaviour for the user.
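
To make the recursion handling concrete, the flow is roughly the following
(an illustrative sketch; local names and the exact check are assumptions,
not necessarily the code in the diff below):

    static noinline depot_stack_handle_t save_stack(gfp_t flags)
    {
            unsigned long entries[PAGE_OWNER_STACK_DEPTH];
            struct stack_trace trace = {
                    .nr_entries = 0,
                    .entries = entries,
                    .max_entries = PAGE_OWNER_STACK_DEPTH,
                    .skip = 0,
            };
            depot_stack_handle_t handle;
            int i, count;

            save_stack_trace(&trace);

            /*
             * If our caller's frame shows up twice in the trace, we got
             * here from within page_owner/stackdepot itself (stackdepot
             * allocated a page, which called back into page_owner):
             * record the preallocated dummy handle instead of recursing.
             */
            for (i = 0, count = 0; i < trace.nr_entries; i++) {
                    if (trace.entries[i] == _RET_IP_ && ++count == 2)
                            return dummy_handle;
            }

            handle = depot_save_stack(&trace, flags);
            if (!handle)
                    handle = failure_handle;  /* depot allocation failed */

            return handle;
    }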

Signed-off-by: Joonsoo Kim 
---
 include/linux/page_ext.h |   4 +-
 lib/Kconfig.debug|   1 +
 mm/page_owner.c  | 128 ---
 3 files changed, 114 insertions(+), 19 deletions(-)

diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index e1fe7cf..03f2a3e 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -3,6 +3,7 @@
 
 #include 
 #include 
+#include <linux/stackdepot.h>
 
 struct pglist_data;
 struct page_ext_operations {
@@ -44,9 +45,8 @@ struct page_ext {
 #ifdef CONFIG_PAGE_OWNER
unsigned int order;
gfp_t gfp_mask;
-   unsigned int nr_entries;
int last_migrate_reason;
-   unsigned long trace_entries[8];
+   depot_stack_handle_t handle;
 #endif
 };
 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 5d57177..a32fd24 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -248,6 +248,7 @@ config PAGE_OWNER
depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
select DEBUG_FS
select STACKTRACE
+   select STACKDEPOT
select PAGE_EXTENSION
help
  This keeps track of what call chain is the owner of a page, may
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 7b5a834..7875de5 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -7,11 +7,18 @@
 #include 
 #include 
 #include 
+#include <linux/stackdepot.h>
+
 #include "internal.h"
 
+#define PAGE_OWNER_STACK_DEPTH (64)
+
 static bool page_owner_disabled = true;
 DEFINE_STATIC_KEY_FALSE(page_owner_inited);
 
+static depot_stack_handle_t dummy_handle;
+static depot_stack_handle_t failure_handle;
+
 static void init_early_allocated_pages(void);
 
 static int early_page_owner_param(char *buf)
@@ -34,11 +41,41 @@ static bool need_page_owner(void)
return true;
 }
 
+static noinline void register_dummy_stack(void)
+{
+   unsigned long entries[4];
+   struct stack_trace dummy;
+
+   dummy.nr_entries = 0;
+   dummy.max_entries = ARRAY_SIZE(entries);
+   dummy.entries = &entries[0];
+   dummy.skip = 0;
+
+   save_stack_trace(&dummy);
+   dummy_handle = depot_save_stack(&dummy, GFP_KERNEL);
+}
+
+static noinline void register_failure_stack(void)
+{
+   unsigned long entries[4];
+   struct stack_trace failure;
+
+   failure.nr_entries = 0;
+   failure.max_entries = ARRAY_SIZE(entries);
+   failure.entries = &entries[0];
+   failure.skip = 0;
+
+   save_stack_trace(&failure);
+   failure_handle = depot_save_stack(&failure, GFP_KERNEL);
+}
+
 static void init_page_owner(void)
 {
if (page_owner_disabled)
return;
 
+   register_dummy_stack();
+   register_failure_stack();
static_branch_enable(&page_owner_inited);
init_early_allocated_pages();
 }
@@ -59,21 +96,56 @@ void __reset_page_owner(struct page *page, unsigned int 
order)
}
 }
 
-void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
+static inline bool 

Re: [PATCH 2/2] zram: user per-cpu compression streams

2016-05-02 Thread Minchan Kim
On Mon, May 02, 2016 at 05:06:00PM +0900, Sergey Senozhatsky wrote:
> On (05/02/16 16:25), Sergey Senozhatsky wrote:
> [..]
> > > Trivial:
> > > We could remove max_strm now and change description.
> > 
> > oh, yes.
> 
> how about something like this? remove max_comp_streams entirely, but
> leave the attr. if we keep zram->max_comp_streams and return its value
> (set by user space) from the show() handler, we are technically lying;
> because the actual number of streams is now num_online_cpus().

Yes, we should have limited the value to num_online_cpus() from the
beginning.
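
For context, with per-cpu streams the compression fast path boils down to
something like this (a sketch only; the compress call is merely
indicative):

    zstrm = zcomp_strm_find(zram->comp);   /* get_cpu_ptr(): this CPU's
                                            * stream, preemption disabled */
    ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen);
    zcomp_strm_release(zram->comp, zstrm); /* put_cpu_ptr() */

so there is exactly one stream per online CPU and nothing left for
max_comp_streams to limit.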

> 
> 
> ===8<===8<===
> 
> From: Sergey Senozhatsky 
> Subject: [PATCH] zram: remove max_comp_streams internals
> 
> Remove the internal part of max_comp_streams interface, since we
> switched to per-cpu streams. We will keep RW max_comp_streams attr
> around, because:
> 
> a) we may (silently) switch back to idle compression streams list
>and don't want to disturb user space
> b) max_comp_streams attr must wait for the next 'lay off cycle';
>we give user space 2 years to adjust before we remove/downgrade
>the attr, and there are already several attrs scheduled for
>removal in 4.11, so it's too late for max_comp_streams.
> 
> Signed-off-by: Sergey Senozhatsky 
> ---
>  drivers/block/zram/zcomp.c|  7 +--
>  drivers/block/zram/zcomp.h|  2 +-
>  drivers/block/zram/zram_drv.c | 47 
> +++
>  drivers/block/zram/zram_drv.h |  1 -
>  4 files changed, 14 insertions(+), 43 deletions(-)
> 
> diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
> index d4159e4..d4de9cb 100644
> --- a/drivers/block/zram/zcomp.c
> +++ b/drivers/block/zram/zcomp.c
> @@ -95,11 +95,6 @@ bool zcomp_available_algorithm(const char *comp)
>   return find_backend(comp) != NULL;
>  }
>  
> -bool zcomp_set_max_streams(struct zcomp *comp, int num_strm)
> -{
> - return true;
> -}
> -
>  struct zcomp_strm *zcomp_strm_find(struct zcomp *comp)
>  {
>   return *get_cpu_ptr(comp->stream);
> @@ -211,7 +206,7 @@ void zcomp_destroy(struct zcomp *comp)
>   * case of allocation error, or any other error potentially
>   * returned by functions zcomp_strm_{multi,single}_create.
>   */
> -struct zcomp *zcomp_create(const char *compress, int max_strm)
> +struct zcomp *zcomp_create(const char *compress)
>  {
>   struct zcomp *comp;
>   struct zcomp_backend *backend;
> diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
> index aba8c21..ffd88cb 100644
> --- a/drivers/block/zram/zcomp.h
> +++ b/drivers/block/zram/zcomp.h
> @@ -45,7 +45,7 @@ struct zcomp {
>  ssize_t zcomp_available_show(const char *comp, char *buf);
>  bool zcomp_available_algorithm(const char *comp);
>  
> -struct zcomp *zcomp_create(const char *comp, int max_strm);
> +struct zcomp *zcomp_create(const char *comp);
>  void zcomp_destroy(struct zcomp *comp);
>  
>  struct zcomp_strm *zcomp_strm_find(struct zcomp *comp);
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index cad1751..817e511 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -304,46 +304,25 @@ static ssize_t mem_used_max_store(struct device *dev,
>   return len;
>  }
>  
> +/*
> + * We switched to per-cpu streams and this attr is not needed anymore.
> + * However, we will keep it around for some time, because:
> + * a) we may revert per-cpu streams in the future
> + * b) it's visible to user space and we need to follow our 2 years
> + *retirement rule; but we already have a number of 'soon to be
> + *altered' attrs, so max_comp_streams need to wait for the next
> + *layoff cycle.
> + */

Thanks for nice comment.

>  static ssize_t max_comp_streams_show(struct device *dev,
>   struct device_attribute *attr, char *buf)
>  {
> - int val;
> - struct zram *zram = dev_to_zram(dev);
> -
> - down_read(&zram->init_lock);
> - val = zram->max_comp_streams;
> - up_read(&zram->init_lock);
> -
> - return scnprintf(buf, PAGE_SIZE, "%d\n", val);
> + return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus());
>  }
>  
>  static ssize_t max_comp_streams_store(struct device *dev,
>   struct device_attribute *attr, const char *buf, size_t len)
>  {
> - int num;
> - struct zram *zram = dev_to_zram(dev);
> - int ret;
> -
> - ret = kstrtoint(buf, 0, &num);
> - if (ret < 0)
> - return ret;
> - if (num < 1)
> - return -EINVAL;
> -
> - down_write(&zram->init_lock);
> - if (init_done(zram)) {
> - if (!zcomp_set_max_streams(zram->comp, num)) {
> - pr_info("Cannot change max compression streams\n");
> - ret = -EINVAL;
> - goto out;
> - }
> - }
> -
> - zram->max_comp_streams = num;
> - ret = len;
> -out:
> - up_write(&zram->init_lock);
> - return ret;

At least, we need sanity check code, still?
Otherwise, user can 

[PATCH 2/6] mm/page_owner: initialize page owner without holding the zone lock

2016-05-02 Thread js1304
From: Joonsoo Kim 

It's not necessary to initialize page_owner while holding the zone lock;
doing so just causes more contention on the zone lock. It's not a big
problem since this is only a debug feature, but it is still better than
before, so do it. This is also a preparation step for using stackdepot
in the page_owner feature: stackdepot allocates new pages when there is
no reserved space left, and holding the zone lock in that case would
cause a deadlock.
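
The deadlock the last sentence refers to is roughly this call chain (a
sketch, not a captured trace):

    spin_lock_irqsave(&zone->lock, flags)
      __isolate_free_page()
        set_page_owner()                /* old placement, under zone->lock */
          depot_save_stack()            /* stackdepot needs a backing page */
            alloc_pages()
              get_page_from_freelist()
                spin_lock(&zone->lock)  /* already held -> deadlock */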

Signed-off-by: Joonsoo Kim 
---
 mm/compaction.c | 3 +++
 mm/page_alloc.c | 2 --
 mm/page_isolation.c | 9 ++---
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/mm/compaction.c b/mm/compaction.c
index ecf0252..dbad95b 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include <linux/page_owner.h>
 #include "internal.h"
 
 #ifdef CONFIG_COMPACTION
@@ -80,6 +81,8 @@ static void map_pages(struct list_head *list)
arch_alloc_page(page, order);
kernel_map_pages(page, nr_pages, 1);
kasan_alloc_pages(page, order);
+
+   set_page_owner(page, order, __GFP_MOVABLE);
if (order)
split_page(page, order);
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 60d7f10..5b632be 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2514,8 +2514,6 @@ int __isolate_free_page(struct page *page, unsigned int 
order)
zone->free_area[order].nr_free--;
rmv_page_order(page);
 
-   set_page_owner(page, order, __GFP_MOVABLE);
-
/* Set the pageblock if the isolated page is at least a pageblock */
if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1;
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 612122b..927f5ee 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -7,6 +7,7 @@
 #include 
 #include 
 #include 
+#include <linux/page_owner.h>
 #include "internal.h"
 
 #define CREATE_TRACE_POINTS
@@ -108,8 +109,6 @@ static void unset_migratetype_isolate(struct page *page, 
unsigned migratetype)
if (pfn_valid_within(page_to_pfn(buddy)) &&
!is_migrate_isolate_page(buddy)) {
__isolate_free_page(page, order);
-   kernel_map_pages(page, (1 << order), 1);
-   set_page_refcounted(page);
isolated_page = page;
}
}
@@ -128,8 +127,12 @@ static void unset_migratetype_isolate(struct page *page, 
unsigned migratetype)
zone->nr_isolate_pageblock--;
 out:
spin_unlock_irqrestore(&zone->lock, flags);
-   if (isolated_page)
+   if (isolated_page) {
+   kernel_map_pages(page, (1 << order), 1);
+   set_page_refcounted(page);
+   set_page_owner(page, order, __GFP_MOVABLE);
__free_pages(isolated_page, order);
+   }
 }
 
 static inline struct page *
-- 
1.9.1



[PATCH 3/6] mm/page_owner: copy last_migrate_reason in copy_page_owner()

2016-05-02 Thread js1304
From: Joonsoo Kim 

Currently, copy_page_owner() doesn't copy all the owner information.
It skips last_migrate_reason because copy_page_owner() is used for
migration, where it will be set properly soon afterwards. But a
following patch will use copy_page_owner(), and this skip would cause
the problem that an allocated page has an uninitialized
last_migrate_reason. To prevent that, this patch also copies
last_migrate_reason in copy_page_owner().

Signed-off-by: Joonsoo Kim 
---
 mm/page_owner.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/page_owner.c b/mm/page_owner.c
index 792b56d..6693959 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -101,6 +101,7 @@ void __copy_page_owner(struct page *oldpage, struct page 
*newpage)
 
new_ext->order = old_ext->order;
new_ext->gfp_mask = old_ext->gfp_mask;
+   new_ext->last_migrate_reason = old_ext->last_migrate_reason;
new_ext->nr_entries = old_ext->nr_entries;
 
for (i = 0; i < ARRAY_SIZE(new_ext->trace_entries); i++)
-- 
1.9.1



[PATCH 5/6] tools/vm/page_owner: increase temporary buffer size

2016-05-02 Thread js1304
From: Joonsoo Kim 

Page owner will be changed to store a deeper stacktrace, so the current
temporary buffer size isn't enough. Increase it.

Signed-off-by: Joonsoo Kim 
---
 tools/vm/page_owner_sort.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c
index 77147b4..f1c055f 100644
--- a/tools/vm/page_owner_sort.c
+++ b/tools/vm/page_owner_sort.c
@@ -79,12 +79,12 @@ static void add_list(char *buf, int len)
}
 }
 
-#define BUF_SIZE   1024
+#define BUF_SIZE   (128 * 1024)
 
 int main(int argc, char **argv)
 {
FILE *fin, *fout;
-   char buf[BUF_SIZE];
+   char *buf;
int ret, i, count;
struct block_list *list2;
struct stat st;
@@ -107,6 +107,11 @@ int main(int argc, char **argv)
max_size = st.st_size / 100; /* hack ... */
 
list = malloc(max_size * sizeof(*list));
+   buf = malloc(BUF_SIZE);
+   if (!list || !buf) {
+   printf("Out of memory\n");
+   exit(1);
+   }
 
for ( ; ; ) {
ret = read_block(buf, BUF_SIZE, fin);
-- 
1.9.1



[PATCH 1/6] mm/compaction: split freepages without holding the zone lock

2016-05-02 Thread js1304
From: Joonsoo Kim 

We don't need to split freepages while holding the zone lock. Doing so
only causes more contention on the zone lock, which is not desirable.

Signed-off-by: Joonsoo Kim 
---
 include/linux/mm.h |  1 -
 mm/compaction.c| 42 ++
 mm/page_alloc.c| 27 ---
 3 files changed, 30 insertions(+), 40 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 7b52750..9608f33 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -523,7 +523,6 @@ void __put_page(struct page *page);
 void put_pages_list(struct list_head *pages);
 
 void split_page(struct page *page, unsigned int order);
-int split_free_page(struct page *page);
 
 /*
  * Compound pages have a destructor function.  Provide a
diff --git a/mm/compaction.c b/mm/compaction.c
index c9a95c1..ecf0252 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -65,13 +65,31 @@ static unsigned long release_freepages(struct list_head 
*freelist)
 
 static void map_pages(struct list_head *list)
 {
-   struct page *page;
+   unsigned int i, order, nr_pages;
+   struct page *page, *next;
+   LIST_HEAD(tmp_list);
+
+   list_for_each_entry_safe(page, next, list, lru) {
+   list_del(&page->lru);
+
+   order = page_private(page);
+   nr_pages = 1 << order;
+   set_page_private(page, 0);
+   set_page_refcounted(page);
+
+   arch_alloc_page(page, order);
+   kernel_map_pages(page, nr_pages, 1);
+   kasan_alloc_pages(page, order);
+   if (order)
+   split_page(page, order);
 
-   list_for_each_entry(page, list, lru) {
-   arch_alloc_page(page, 0);
-   kernel_map_pages(page, 1, 1);
-   kasan_alloc_pages(page, 0);
+   for (i = 0; i < nr_pages; i++) {
+   list_add(&page->lru, &tmp_list);
+   page++;
+   }
}
+
+   list_splice(&tmp_list, list);
 }
 
 static inline bool migrate_async_suitable(int migratetype)
@@ -368,12 +386,13 @@ static unsigned long isolate_freepages_block(struct 
compact_control *cc,
unsigned long flags = 0;
bool locked = false;
unsigned long blockpfn = *start_pfn;
+   unsigned int order;
 
cursor = pfn_to_page(blockpfn);
 
/* Isolate free pages. */
for (; blockpfn < end_pfn; blockpfn++, cursor++) {
-   int isolated, i;
+   int isolated;
struct page *page = cursor;
 
/*
@@ -439,13 +458,12 @@ static unsigned long isolate_freepages_block(struct 
compact_control *cc,
goto isolate_fail;
}
 
-   /* Found a free page, break it into order-0 pages */
-   isolated = split_free_page(page);
+   /* Found a free page, will break it into order-0 pages */
+   order = page_order(page);
+   isolated = __isolate_free_page(page, page_order(page));
+   set_page_private(page, order);
total_isolated += isolated;
-   for (i = 0; i < isolated; i++) {
-   list_add(&page->lru, freelist);
-   page++;
-   }
+   list_add_tail(&page->lru, freelist);
 
/* If a page was split, advance to the end of it */
if (isolated) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5dd65d9..60d7f10 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2532,33 +2532,6 @@ int __isolate_free_page(struct page *page, unsigned int 
order)
 }
 
 /*
- * Similar to split_page except the page is already free. As this is only
- * being used for migration, the migratetype of the block also changes.
- * As this is called with interrupts disabled, the caller is responsible
- * for calling arch_alloc_page() and kernel_map_page() after interrupts
- * are enabled.
- *
- * Note: this is probably too low level an operation for use in drivers.
- * Please consult with lkml before using this in your driver.
- */
-int split_free_page(struct page *page)
-{
-   unsigned int order;
-   int nr_pages;
-
-   order = page_order(page);
-
-   nr_pages = __isolate_free_page(page, order);
-   if (!nr_pages)
-   return 0;
-
-   /* Split into individual pages */
-   set_page_refcounted(page);
-   split_page(page, order);
-   return nr_pages;
-}
-
-/*
  * Update NUMA hit/miss statistics
  *
  * Must be called with interrupts disabled.
-- 
1.9.1



[PATCH 0/6] mm/page_owner: use stackdepot to store stacktrace

2016-05-02 Thread js1304
From: Joonsoo Kim 

This patchset changes the way the stacktrace is stored in page_owner in
order to reduce memory usage. Below is the motivation for this patchset,
copied from patch 6.

Currently, we store each page's allocation stacktrace in the corresponding
page_ext structure and it requires a lot of memory. This causes the problem
that a memory-tight system doesn't work well when page_owner is enabled.
Moreover, even with this large memory consumption, we cannot get a full
stacktrace because we allocate the memory at boot time and only maintain
8 stacktrace slots to balance memory consumption. We could increase the
number of slots, but that would make the system unusable or change system
behaviour.

To solve the problem, this patch uses stackdepot to store stacktrace.

Thanks.

Joonsoo Kim (6):
  mm/compaction: split freepages without holding the zone lock
  mm/page_owner: initialize page owner without holding the zone lock
  mm/page_owner: copy last_migrate_reason in copy_page_owner()
  mm/page_owner: introduce split_page_owner and replace manual handling
  tools/vm/page_owner: increase temporary buffer size
  mm/page_owner: use stackdepot to store stacktrace

 include/linux/mm.h |   1 -
 include/linux/page_ext.h   |   4 +-
 include/linux/page_owner.h |  12 ++--
 lib/Kconfig.debug  |   1 +
 mm/compaction.c|  45 +++
 mm/page_alloc.c|  37 +---
 mm/page_isolation.c|   9 ++-
 mm/page_owner.c| 136 ++---
 tools/vm/page_owner_sort.c |   9 ++-
 9 files changed, 173 insertions(+), 81 deletions(-)

-- 
1.9.1



Re: [PATCH 1/1] simplified security.nscapability xattr

2016-05-02 Thread Serge E. Hallyn
Quoting Andrew G. Morgan (mor...@kernel.org):
> On 2 May 2016 6:04 p.m., "Eric W. Biederman"  wrote:
> >
> > "Serge E. Hallyn"  writes:
> >
> > > On Tue, Apr 26, 2016 at 03:39:54PM -0700, Kees Cook wrote:
> > >> On Tue, Apr 26, 2016 at 3:26 PM, Serge E. Hallyn 
> wrote:
> > >> > Quoting Kees Cook (keesc...@chromium.org):
> > >> >> On Fri, Apr 22, 2016 at 10:26 AM,   wrote:
> > >> >> > From: Serge Hallyn 
> > > ...
> > >> >> This looks like userspace must knowingly be aware that it is in a
> > >> >> namespace and to DTRT instead of it being translated by the kernel
> > >> >> when setxattr is called under !init_user_ns?
> > >> >
> > >> > Yes - my libcap2 patch checks /proc/self/uid_map to decide that.  If
> that
> > >> > shows you are in init_user_ns then it uses security.capability,
> otherwise
> > >> > it uses security.nscapability.
> > >> >
> > >> > I've occasionally considered having the xattr code do the quiet
> > >> > substitution if need be.
> > >> >
> > >> > In fact, much of this structure comes from when I was still trying to
> > >> > do multiple values per xattr.  Given what we're doing here, we could
> > >> > keep the xattr contents exactly the same, just changing the name.
> > >> > So userspace could just get and set security.capability;  if you are
> > >> > in a non-init user_ns, if security.capability is set then you cannot
> > >> > set it;  if security.capability is not set, then the kernel writes
> > >> > security.nscapability instead and returns success.
> > >> >
> > >> > I don't like magic, but this might be just straightforward enough
> > >> > to not be offensive.  Thoughts?
> > >>
> > >> Yeah, I think it might be better to have the magic in this case, since
> > >> it seems weird to just reject setxattr if a tool didn't realize it was
> > >> in a namespace. I'm not sure -- it is also nice to have an explicit
> > >> API here.
> > >>
> > >> I would defer to Eric or Michael on that. I keep going back and forth,
> > >> though I suspect it's probably best to do what you already have
> > >> (explicit API).
> > >
> > > Michael, Eric, what do you think?  The choice we're making here is
> > > whether we should
> > >
> > > 1. Keep a nice simple separate pair of xattrs, the pre-existing
> > > security.capability which can only be written from init_user_ns,
> > > and the new (in this patch) security.nscapability which you can
> > > write to any file where you are privileged wrt the file.
> > >
> > > 2. Make security.capability somewhat 'magic' - if someone in a
> > > non-initial user ns tries to write it and has privilege wrt the
> > > file, then the kernel silently writes security.nscapability instead.
> > >
> > > The biggest drawback of (1) would be any tar-like program trying
> > > to restore a file which had security.capability, needing to know
> > > to detect its userns and write the security.nscapability instead.
> > > The drawback of (2) is ~\o/~ magic.
> >
> > Apologies for not having followed this more closely before.
> >
> > I don't like either option.  I think we will be in much better shape if
> > we upgrade the capability xattr.  It seems totally wrong or at least
> > confusing for a file to have both capability xattrs.
> >
> > Just using security.capability allows us to confront any weird issues
> > with mixing both the old semantics and the new semantics.
> >
> > We had previously discussioned extending the capbility a little and
> > adding a uid who needed to be the root uid in a user namespace, to be
> > valid.  Using the owner of the file seems simpler, and even a little
> > more transparent as this makes the security.capability xattr a limited
> > form of setuid (which it semantically is).
> >
> > So I believe the new semantics in general are an improvement.
> >
> >
> > Given the expected use case let me ask as simple question: Are there any
> > known cases where the owner of a setcap exectuable is not root?
> >
> > I expect the pile of setcap exectuables is small enough we can go
> > through the top distros and look at all of the setcap executlables.
> >
> >
> > If there is not a need to support setcap executables owned by non-root,
> > I suspect the right play is to just change the semantics to always treat
> > the security.capability attribute this way.
> >
> 
> I guess I'm confused how we have strayed so far that this isn't an obvious
> requirement. Uid=0 as being the root of privilege was the basic problem
> that capabilities were designed to change.

The task executing the file can be any uid mapped into the namespace.  The
file only has to be owned by the root of the user_ns.  Which I agree is
unfortunate.  We can work around it by putting the root uid into the xattr
itself (which still isn't orthogonal but allows the file to at least be
owned by non-root), but the problem then is that a task needs to know its
global root k_uid just to write the xattr.
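
For illustration only, a rough sketch of that idea (a hypothetical layout,
not something defined by this patch) would extend the xattr payload with
the root id:

	/* hypothetical sketch of "root uid in the xattr itself" */
	struct vfs_ns_cap_data {
		__le32 magic_etc;		/* version + effective bit */
		struct {
			__le32 permitted;
			__le32 inheritable;
		} data[VFS_CAP_U32];
		__le32 rootid;			/* kuid of the user_ns root */
	};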

> Uid is an acl concept. Capabilities are supposed to be independent of that.
> 
> If we want to support NS file 

[PATCH for v4.6] lib/stackdepot: avoid to return 0 handle

2016-05-02 Thread js1304
From: Joonsoo Kim 

Recently, we allowed saving a stacktrace whose hashed value is 0. This
causes the problem that stackdepot could return a 0 handle even on
success, so users of stackdepot cannot distinguish success from failure.
To solve this, this patch adds 1 bit to the handle and makes a valid
handle non-zero by setting that bit. After that, a valid handle will
never be 0, and a 0 handle will correctly represent failure.
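
For illustration only, a caller-side sketch (not part of this patch) of how
users can now rely on a zero handle meaning failure:

	/* sketch; assumes "trace" was filled by save_stack_trace() */
	depot_stack_handle_t handle;

	handle = depot_save_stack(&trace, GFP_NOWAIT);
	if (!handle)
		return;		/* 0 now reliably means "not stored" */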

Fixes: 4e25769c ("lib/stackdepot.c: allow the stack trace hash
to be zero")
Signed-off-by: Joonsoo Kim 
---
 lib/stackdepot.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 9e0b031..53ad6c0 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -42,12 +42,14 @@
 
 #define DEPOT_STACK_BITS (sizeof(depot_stack_handle_t) * 8)
 
+#define STACK_ALLOC_NULL_PROTECTION_BITS 1
 #define STACK_ALLOC_ORDER 2 /* 'Slab' size order for stack depot, 4 pages */
 #define STACK_ALLOC_SIZE (1LL << (PAGE_SHIFT + STACK_ALLOC_ORDER))
 #define STACK_ALLOC_ALIGN 4
 #define STACK_ALLOC_OFFSET_BITS (STACK_ALLOC_ORDER + PAGE_SHIFT - \
STACK_ALLOC_ALIGN)
-#define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - STACK_ALLOC_OFFSET_BITS)
+#define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - \
+   STACK_ALLOC_NULL_PROTECTION_BITS - STACK_ALLOC_OFFSET_BITS)
 #define STACK_ALLOC_SLABS_CAP 1024
 #define STACK_ALLOC_MAX_SLABS \
(((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_SLABS_CAP) ? \
@@ -59,6 +61,7 @@ union handle_parts {
struct {
u32 slabindex : STACK_ALLOC_INDEX_BITS;
u32 offset : STACK_ALLOC_OFFSET_BITS;
+   u32 valid : STACK_ALLOC_NULL_PROTECTION_BITS;
};
 };
 
@@ -136,6 +139,7 @@ static struct stack_record *depot_alloc_stack(unsigned long 
*entries, int size,
stack->size = size;
stack->handle.slabindex = depot_index;
stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN;
+   stack->handle.valid = 1;
memcpy(stack->entries, entries, size * sizeof(unsigned long));
depot_offset += required_size;
 
-- 
1.9.1



[PATCH 1/1] drivers: acpi: add CPU id to cooling device type of processor driver

2016-05-02 Thread Eduardo Valentin
Currently, in an ACPI based system, the processor driver registers
one cooling device per processor. However, the cooling device type
is the same for each processor. For example, on a system with four
processors, the sysfs reading of each cooling device would look like:
ebv@besouro ~ $ cat /sys/class/thermal/cooling_device*/type
Processor
Processor
Processor
Processor

which turns out to be fine. But some parts of the thermal code may use
the type to identify participating devices in a thermal zone. Besides,
adding notifications to user space may produce messages that confuse
the listener.

For this reason, this patch adds the processor ID to the cooling device
type. After this change, the cooling device listing in the same example
would look like this:
ebv@besouro ~ $ cat /sys/class/thermal/cooling_device*/type
Processor.0
Processor.1
Processor.2
Processor.3

allowing easier identification of the cooling device target.

Cc: "Rafael J. Wysocki" 
Cc: Len Brown 
Cc: linux-a...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Eduardo Valentin 
---
 drivers/acpi/processor_driver.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c
index d2fa8cb..6e982c1 100644
--- a/drivers/acpi/processor_driver.c
+++ b/drivers/acpi/processor_driver.c
@@ -163,6 +163,7 @@ static struct notifier_block acpi_cpu_notifier = {
 static int acpi_pss_perf_init(struct acpi_processor *pr,
struct acpi_device *device)
 {
+   char cdev_name[THERMAL_NAME_LENGTH];
int result = 0;
 
acpi_processor_ppc_has_changed(pr, 0);
@@ -172,7 +173,8 @@ static int acpi_pss_perf_init(struct acpi_processor *pr,
if (pr->flags.throttling)
pr->flags.limit = 1;
 
-   pr->cdev = thermal_cooling_device_register("Processor", device,
+   snprintf(cdev_name, sizeof(cdev_name), "Processor.%d", pr->id);
+   pr->cdev = thermal_cooling_device_register(cdev_name, device,
   &processor_cooling_ops);
if (IS_ERR(pr->cdev)) {
result = PTR_ERR(pr->cdev);
-- 
2.1.4



Re: [PATCH 2/2] zram: user per-cpu compression streams

2016-05-02 Thread Minchan Kim
On Tue, May 03, 2016 at 01:29:02PM +0900, Sergey Senozhatsky wrote:
> On (05/03/16 11:30), Sergey Senozhatsky wrote:
> > > We are concerning about returing back to no per-cpu options but actually,
> > > I don't want. If duplicate compression is really problem(But It's really
> > > unlikely), we should try to solve the problem itself with different way
> > > rather than roll-back to old, first of all.
> > > 
> > > I hope we can. So let's not add big worry about adding new dup stat. :)
> > 
> > ok, no prob. do you want it a separate sysfs node or a column in mm_stat?
> > I'd prefer mm_stat column, or somewhere in those cumulative files; not a
> > dedicated node: we decided to get rid of them some time ago.
> > 
> 
> will io_stat node work for you?

Firstly, I thought io_stat would be better. However, on second thought,
I want to withdraw.

I think io_stat should go away.

failed_read
failed_write
invalid_io

I think the above things are really unneeded. If something fails, the
upper layer on top of zram, for example FSes or swap, should emit the
warning. So, I don't think we need to maintain them in the zram layer.

notify_free

It's a kind of discard command from the block device's point of view, so
I think the general block layer should take care of it like read and
write. If block does that, the only thing remaining for notify_free is
zram_slot_free_notify, so I think we can move it from io_stat to mm_stat
because it's related to memory, not block I/O.

With the above in mind, I suggest we don't add anything more to io_stat
from now on and remove it at some point.
Instead, let's add the new dup stat.

What do you think about it?


> I'll submit a formal patch later today. when you have time, can you
> take a look at http://marc.info/?l=linux-kernel=146217628030970 ?

Oops, sorry, I missed it. I will take a look.

> I think we can fold this one into 0002. it will make 0002 slightly
> bigger, but there nothing complicated in there, just cleanup.
> 
> 
> 
> From: Sergey Senozhatsky 
> Subject: [PATCH] zram: export the number of re-compressions
> 
> Make the number of re-compressions visible via the io_stat node,
> so we will be able to track down any issues caused by per-cpu
> compression streams.
> 
> Signed-off-by: Sergey Senozhatsky 
> Suggested-by: Minchan Kim 
> ---
>  Documentation/blockdev/zram.txt | 3 +++
>  drivers/block/zram/zram_drv.c   | 7 +--
>  drivers/block/zram/zram_drv.h   | 1 +
>  3 files changed, 9 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
> index 5bda503..386d260 100644
> --- a/Documentation/blockdev/zram.txt
> +++ b/Documentation/blockdev/zram.txt
> @@ -183,6 +183,8 @@ mem_limit RWthe maximum amount of memory ZRAM 
> can use to store
>  pages_compacted   ROthe number of pages freed during compaction
>  (available only via zram/mm_stat node)
>  compact   WOtrigger memory compaction
> +num_recompressROthe number of times fast compression paths failed
> +and zram performed re-compression via a slow path
>  
>  WARNING
>  ===
> @@ -215,6 +217,7 @@ whitespace:
>   failed_writes
>   invalid_io
>   notify_free
> + num_recompress
>  
>  File /sys/block/zram/mm_stat
>  
> diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
> index 817e511..11b19c9 100644
> --- a/drivers/block/zram/zram_drv.c
> +++ b/drivers/block/zram/zram_drv.c
> @@ -395,7 +395,8 @@ static ssize_t io_stat_show(struct device *dev,
>   (u64)atomic64_read(&zram->stats.failed_reads),
>   (u64)atomic64_read(&zram->stats.failed_writes),
>   (u64)atomic64_read(&zram->stats.invalid_io),
> - (u64)atomic64_read(&zram->stats.notify_free));
> + (u64)atomic64_read(&zram->stats.notify_free),
> + (u64)atomic64_read(&zram->stats.num_recompress));
>   up_read(&zram->init_lock);
>  
>   return ret;
> @@ -721,8 +722,10 @@ compress_again:
>  
>   handle = zs_malloc(meta->mem_pool, clen,
>   GFP_NOIO | __GFP_HIGHMEM);
> - if (handle)
> + if (handle) {
> + atomic64_inc(&zram->stats.num_recompress);
>   goto compress_again;
> + }
>  
>   pr_err("Error allocating memory for compressed page: %u, 
> size=%zu\n",
>   index, clen);
> diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
> index 06b1636..78d7e8f 100644
> --- a/drivers/block/zram/zram_drv.h
> +++ b/drivers/block/zram/zram_drv.h
> @@ -85,6 +85,7 @@ struct zram_stats {
>   atomic64_t zero_pages;  /* no. of zero filled pages */
>   atomic64_t pages_stored;/* no. of pages currently stored */
>   atomic_long_t max_used_pages;   /* no. of maximum pages stored */
> + atomic64_t num_recompress; /* no. of failed 

Re: [PATCH 1/1] simplified security.nscapability xattr

2016-05-02 Thread Eric W. Biederman
"Andrew G. Morgan"  writes:

> On 2 May 2016 6:04 p.m., "Eric W. Biederman" 
> wrote:
>>
>> "Serge E. Hallyn"  writes:
>>
>> > On Tue, Apr 26, 2016 at 03:39:54PM -0700, Kees Cook wrote:
>> >> On Tue, Apr 26, 2016 at 3:26 PM, Serge E. Hallyn
>  wrote:
>> >> > Quoting Kees Cook (keesc...@chromium.org):
>> >> >> On Fri, Apr 22, 2016 at 10:26 AM, 
> wrote:
>> >> >> > From: Serge Hallyn 
>> > ...
>> >> >> This looks like userspace must knowingly be aware that it is
> in a
>> >> >> namespace and to DTRT instead of it being translated by the
> kernel
>> >> >> when setxattr is called under !init_user_ns?
>> >> >
>> >> > Yes - my libcap2 patch checks /proc/self/uid_map to decide
> that. If that
>> >> > shows you are in init_user_ns then it uses security.capability,
> otherwise
>> >> > it uses security.nscapability.
>> >> >
>> >> > I've occasionally considered having the xattr code do the quiet
>> >> > substitution if need be.
>> >> >
>> >> > In fact, much of this structure comes from when I was still
> trying to
>> >> > do multiple values per xattr. Given what we're doing here, we
> could
>> >> > keep the xattr contents exactly the same, just changing the
> name.
>> >> > So userspace could just get and set security.capability; if you
> are
>> >> > in a non-init user_ns, if security.capability is set then you
> cannot
>> >> > set it; if security.capability is not set, then the kernel
> writes
>> >> > security.nscapability instead and returns success.
>> >> >
>> >> > I don't like magic, but this might be just straightforward
> enough
>> >> > to not be offensive. Thoughts?
>> >>
>> >> Yeah, I think it might be better to have the magic in this case,
> since
>> >> it seems weird to just reject setxattr if a tool didn't realize
> it was
>> >> in a namespace. I'm not sure -- it is also nice to have an
> explicit
>> >> API here.
>> >>
>> >> I would defer to Eric or Michael on that. I keep going back and
> forth,
>> >> though I suspect it's probably best to do what you already have
>> >> (explicit API).
>> >
>> > Michael, Eric, what do you think? The choice we're making here is
>> > whether we should
>> >
>> > 1. Keep a nice simple separate pair of xattrs, the pre-existing
>> > security.capability which can only be written from init_user_ns,
>> > and the new (in this patch) security.nscapability which you can
>> > write to any file where you are privileged wrt the file.
>> >
>> > 2. Make security.capability somewhat 'magic' - if someone in a
>> > non-initial user ns tries to write it and has privilege wrt the
>> > file, then the kernel silently writes security.nscapability
> instead.
>> >
>> > The biggest drawback of (1) would be any tar-like program trying
>> > to restore a file which had security.capability, needing to know
>> > to detect its userns and write the security.nscapability instead.
>> > The drawback of (2) is ~\o/~ magic.
>>
>> Apologies for not having followed this more closely before.
>>
>> I don't like either option. I think we will be in much better shape
> if
>> we upgrade the capability xattr. It seems totally wrong or at least
>> confusing for a file to have both capability xattrs.
>>
>> Just using security.capability allows us to confront any weird
> issues
>> with mixing both the old semantics and the new semantics.
>>
>> We had previously discussioned extending the capbility a little and
>> adding a uid who needed to be the root uid in a user namespace, to
> be
>> valid. Using the owner of the file seems simpler, and even a little
>> more transparent as this makes the security.capability xattr a
> limited
>> form of setuid (which it semantically is).
>>
>> So I believe the new semantics in general are an improvement.
>>
>>
>> Given the expected use case let me ask as simple question: Are there
> any
>> known cases where the owner of a setcap exectuable is not root?
>>
>> I expect the pile of setcap exectuables is small enough we can go
>> through the top distros and look at all of the setcap executlables.
>>
>>
>> If there is not a need to support setcap executables owned by
> non-root,
>> I suspect the right play is to just change the semantics to always
> treat
>> the security.capability attribute this way.
>>
>
> I guess I'm confused how we have strayed so far that this isn't an
> obvious requirement. Uid=0 as being the root of privilege was the
> basic problem that capabilities were designed to change.

uid==0 as the owner of a file is slightly different from uid==0 of a
running process.  Last I checked, programs installed as part of a
distribution are owned by root by default.

> Uid is an acl concept. Capabilities are supposed to be independent of
> that.

I don't have a clue what you mean.  Posix capabilities on executables
are part of discretionary access control.  Whatever their rules, posix
capabilities are just watered-down versions of the permissions of
a setuid root executable.  I don't think anyone has ever actually run a
system with setuid root executables not being 

[REGRESSION] asix: Lots of asix_rx_fixup() errors and slow transmissions

2016-05-02 Thread John Stultz
In testing with HiKey, we found that since commit 3f30b158eba5c60
(asix: On RX avoid creating bad Ethernet frames), we're seeing lots of
noise during network transfers:

[  239.027993] asix 1-1.1:1.0 eth0: asix_rx_fixup() Data Header
synchronisation was lost, remaining 988
[  239.037310] asix 1-1.1:1.0 eth0: asix_rx_fixup() Bad Header Length
0x54ebb5ec, offset 4
[  239.045519] asix 1-1.1:1.0 eth0: asix_rx_fixup() Bad Header Length
0xcdffe7a2, offset 4
[  239.275044] asix 1-1.1:1.0 eth0: asix_rx_fixup() Data Header
synchronisation was lost, remaining 988
[  239.284355] asix 1-1.1:1.0 eth0: asix_rx_fixup() Bad Header Length
0x1d36f59d, offset 4
[  239.292541] asix 1-1.1:1.0 eth0: asix_rx_fixup() Bad Header Length
0xaef3c1e9, offset 4
[  239.518996] asix 1-1.1:1.0 eth0: asix_rx_fixup() Data Header
synchronisation was lost, remaining 988
[  239.528300] asix 1-1.1:1.0 eth0: asix_rx_fixup() Bad Header Length
0x2881912, offset 4
[  239.536413] asix 1-1.1:1.0 eth0: asix_rx_fixup() Bad Header Length
0x5638f7e2, offset 4


And network throughput ends up being pretty bursty and slow, with an
overall throughput of at best ~30kB/s.

Looking through the commits since the v4.1 kernel where we didn't see
this, I narrowed the regression down, and reverting the following two
commits seems to avoid the problem:

6a570814cd430fa5ef4f278e8046dcf12ee63f13 asix: Continue processing URB
if no RX netdev buffer
3f30b158eba5c604b6e0870027eef5d19fc9271d asix: On RX avoid creating
bad Ethernet frames

With these reverted, we don't see all the error messages, and we see
better ~1.1MB/s throughput (I've got a mouse plugged in, so I think
the usb host is only running at "full-speed" mode here).

This worries me some, as the patches describe trying to fix the very
issue they seem to cause, so I suspect a revert isn't the correct
solution, but I'm not sure why we're seeing such trouble when the patch
authors did not.  I'd be happy to do further testing of patches if
folks have any ideas.

Originally Reported-by: Yongqin Liu 

thanks
-john


[PATCH v5 2/2] kvm: introduce KVM_MAX_VCPU_ID

2016-05-02 Thread Greg Kurz
The KVM_MAX_VCPUS define provides the maximum number of vCPUs per guest, and
also the upper limit for vCPU ids. This is okay for all archs except PowerPC
which can have higher ids, depending on the cpu/core/thread topology. In the
worst case (single threaded guest, host with 8 threads per core), it limits
the maximum number of vCPUs to KVM_MAX_VCPUS / 8.

This patch separates the vCPU numbering from the total number of vCPUs, with
the introduction of KVM_MAX_VCPU_ID, as the maximal valid value for vCPU ids
plus one.

The corresponding KVM_CAP_MAX_VCPU_ID allows userspace to validate vCPU ids
before passing them to KVM_CREATE_VCPU.

Only PowerPC gets unlimited vCPU ids for the moment. This patch doesn't
change anything for other archs.

Suggested-by: Radim Krcmar 
Signed-off-by: Greg Kurz 
---
 Documentation/virtual/kvm/api.txt   |   10 --
 arch/powerpc/include/asm/kvm_host.h |2 ++
 arch/powerpc/kvm/powerpc.c  |3 +++
 include/linux/kvm_host.h|4 
 include/uapi/linux/kvm.h|1 +
 virt/kvm/kvm_main.c |2 +-
 6 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 4d0542c5206b..2da127f21ffc 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -199,8 +199,8 @@ Type: vm ioctl
 Parameters: vcpu id (apic id on x86)
 Returns: vcpu fd on success, -1 on error
 
-This API adds a vcpu to a virtual machine.  The vcpu id is a small integer
-in the range [0, max_vcpus).
+This API adds a vcpu to a virtual machine. No more than max_vcpus may be added.
+The vcpu id is an integer in the range [0, max_vcpu_id).
 
 The recommended max_vcpus value can be retrieved using the KVM_CAP_NR_VCPUS of
 the KVM_CHECK_EXTENSION ioctl() at run-time.
@@ -212,6 +212,12 @@ cpus max.
 If the KVM_CAP_MAX_VCPUS does not exist, you should assume that max_vcpus is
 same as the value returned from KVM_CAP_NR_VCPUS.
 
+The maximum possible value for max_vcpu_id can be retrieved using the
+KVM_CAP_MAX_VCPU_ID of the KVM_CHECK_EXTENSION ioctl() at run-time.
+
+If the KVM_CAP_MAX_VCPU_ID does not exist, you should assume that max_vcpu_id
+is the same as the value returned from KVM_CAP_MAX_VCPUS.
+
 On powerpc using book3s_hv mode, the vcpus are mapped onto virtual
 threads in one or more virtual CPU cores.  (This is because the
 hardware requires all the hardware threads in a CPU core to be in the
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index d7b343170453..6b4b78d6131b 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -39,6 +39,8 @@
 #define KVM_MAX_VCPUS  NR_CPUS
 #define KVM_MAX_VCORES NR_CPUS
 #define KVM_USER_MEM_SLOTS 512
+#define KVM_MAX_VCPU_IDINT_MAX
+
 
 #ifdef CONFIG_KVM_MMIO
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 6a68730774ee..bef0e6e5e8d0 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -580,6 +580,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_MAX_VCPUS:
r = KVM_MAX_VCPUS;
break;
+   case KVM_CAP_MAX_VCPU_ID:
+   r = KVM_MAX_VCPU_ID;
+   break;
 #ifdef CONFIG_PPC_BOOK3S_64
case KVM_CAP_PPC_GET_SMMU_INFO:
r = 1;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 23bfe1bd159c..3b4efa1c088c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -35,6 +35,10 @@
 
 #include 
 
+#ifndef KVM_MAX_VCPU_ID
+#define KVM_MAX_VCPU_ID KVM_MAX_VCPUS
+#endif
+
 /*
  * The bit 16 ~ bit 31 of kvm_memory_region::flags are internally used
  * in kvm, other bits are visible for userspace which are defined in
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index a7f1f8032ec1..05ebf475104c 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -865,6 +865,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_SPAPR_TCE_64 125
 #define KVM_CAP_ARM_PMU_V3 126
 #define KVM_CAP_VCPU_ATTRIBUTES 127
+#define KVM_CAP_MAX_VCPU_ID 128
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4fd482fb9260..210ab88466fd 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2272,7 +2272,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 
id)
int r;
struct kvm_vcpu *vcpu;
 
-   if (id >= KVM_MAX_VCPUS)
+   if (id >= KVM_MAX_VCPU_ID)
return -EINVAL;
 
vcpu = kvm_arch_vcpu_create(kvm, id);



[PATCH] sched/debug: fix idle balance factors aren't printed out if w/o CONFIG_SCHEDSTATS

2016-05-02 Thread Wanpeng Li
From: Wanpeng Li 

max_idle_balance_cost and avg_idle, which are used to capture short idle,
are not associated with schedstats; however, the information from these
two factors isn't printed out without CONFIG_SCHEDSTATS.

This patch fixes it by moving the max_idle_balance_cost and avg_idle
printouts out of CONFIG_SCHEDSTATS.

Signed-off-by: Wanpeng Li 
---
 kernel/sched/debug.c |   10 +-
 1 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4fbc3bd..cf905f6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -626,15 +626,16 @@ do {  
\
 #undef P
 #undef PN
 
-#ifdef CONFIG_SCHEDSTATS
-#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
-#define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n);
-
 #ifdef CONFIG_SMP
+#define P64(n) SEQ_printf(m, "  .%-30s: %Ld\n", #n, rq->n);
P64(avg_idle);
P64(max_idle_balance_cost);
+#undef P64
 #endif
 
+#ifdef CONFIG_SCHEDSTATS
+#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
+
if (schedstat_enabled()) {
P(yld_count);
P(sched_count);
@@ -644,7 +645,6 @@ do {
\
}
 
 #undef P
-#undef P64
 #endif
spin_lock_irqsave(_debug_lock, flags);
print_cfs_stats(m, cpu);
-- 
1.7.1
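
Once this is applied, both values show up in /proc/sched_debug whether or
not CONFIG_SCHEDSTATS is set. A small sketch that assumes CONFIG_SCHED_DEBUG
is enabled (so the file exists) and simply filters the relevant lines:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/sched_debug", "r");

	if (!f) {
		perror("/proc/sched_debug");
		return 1;
	}
	/* print the per-runqueue avg_idle / max_idle_balance_cost lines */
	while (fgets(line, sizeof(line), f)) {
		if (strstr(line, "avg_idle") ||
		    strstr(line, "max_idle_balance_cost"))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}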



Re: [PATCH 2/2] zram: user per-cpu compression streams

2016-05-02 Thread Sergey Senozhatsky
On (05/03/16 11:30), Sergey Senozhatsky wrote:
> > We are concerned about returning back to the no-per-cpu option but actually,
> > I don't want that. If duplicate compression is really a problem (but it's really
> > unlikely), we should try to solve the problem itself in a different way
> > rather than roll back to the old code, first of all.
> > 
> > I hope we can. So let's not add big worry about adding new dup stat. :)
> 
> ok, no prob. do you want it a separate sysfs node or a column in mm_stat?
> I'd prefer mm_stat column, or somewhere in those cumulative files; not a
> dedicated node: we decided to get rid of them some time ago.
> 

Will the io_stat node work for you?
I'll submit a formal patch later today. When you have time, can you
take a look at http://marc.info/?l=linux-kernel=146217628030970 ?
I think we can fold this one into 0002. It will make 0002 slightly
bigger, but there's nothing complicated in there, just cleanup.



From: Sergey Senozhatsky 
Subject: [PATCH] zram: export the number of re-compressions

Make the number of re-compressions visible via the io_stat node,
so we will be able to track down any issues caused by per-cpu
compression streams.

Signed-off-by: Sergey Senozhatsky 
Suggested-by: Minchan Kim 
---
 Documentation/blockdev/zram.txt | 3 +++
 drivers/block/zram/zram_drv.c   | 7 +--
 drivers/block/zram/zram_drv.h   | 1 +
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt
index 5bda503..386d260 100644
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -183,6 +183,8 @@ mem_limit RWthe maximum amount of memory ZRAM 
can use to store
 pages_compacted   ROthe number of pages freed during compaction
 (available only via zram/mm_stat node)
 compact   WOtrigger memory compaction
+num_recompressROthe number of times fast compression paths failed
+and zram performed re-compression via a slow path
 
 WARNING
 ===
@@ -215,6 +217,7 @@ whitespace:
failed_writes
invalid_io
notify_free
+   num_recompress
 
 File /sys/block/zram/mm_stat
 
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 817e511..11b19c9 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -395,7 +395,8 @@ static ssize_t io_stat_show(struct device *dev,
(u64)atomic64_read(>stats.failed_reads),
(u64)atomic64_read(>stats.failed_writes),
(u64)atomic64_read(>stats.invalid_io),
-   (u64)atomic64_read(>stats.notify_free));
+   (u64)atomic64_read(>stats.notify_free),
+   (u64)atomic64_read(>stats.num_recompress));
up_read(>init_lock);
 
return ret;
@@ -721,8 +722,10 @@ compress_again:
 
handle = zs_malloc(meta->mem_pool, clen,
GFP_NOIO | __GFP_HIGHMEM);
-   if (handle)
+   if (handle) {
+   atomic64_inc(>stats.num_recompress);
goto compress_again;
+   }
 
pr_err("Error allocating memory for compressed page: %u, 
size=%zu\n",
index, clen);
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index 06b1636..78d7e8f 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -85,6 +85,7 @@ struct zram_stats {
atomic64_t zero_pages;  /* no. of zero filled pages */
atomic64_t pages_stored;/* no. of pages currently stored */
atomic_long_t max_used_pages;   /* no. of maximum pages stored */
+   atomic64_t num_recompress; /* no. of failed compression fast paths */
 };
 
 struct zram_meta {
-- 
2.8.2
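
A quick way to watch the new counter, sketched under the assumption that
the device is zram0 and this patch is applied (the fifth io_stat column is
then num_recompress):

#include <stdio.h>

int main(void)
{
	unsigned long long failed_reads, failed_writes, invalid_io,
			   notify_free, num_recompress = 0;
	FILE *f = fopen("/sys/block/zram0/io_stat", "r");

	if (!f) {
		perror("io_stat");
		return 1;
	}
	/* four columns without the patch, five with it */
	if (fscanf(f, "%llu %llu %llu %llu %llu", &failed_reads,
		   &failed_writes, &invalid_io, &notify_free,
		   &num_recompress) >= 4)
		printf("num_recompress=%llu\n", num_recompress);
	fclose(f);
	return 0;
}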



linux-next: manual merge of the block tree with Linus' tree

2016-05-02 Thread Stephen Rothwell
Hi Jens,

Today's linux-next merge of the block tree got a conflict in:

  drivers/nvme/host/pci.c

between commit:

  9bf2b972afea ("NVMe: Fix reset/remove race")

from Linus' tree and commit:

  bb8d261e0888 ("nvme: introduce a controller state machine")

from the block tree.

I fixed it up (I think - see below) and can carry the fix as necessary.
This is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc drivers/nvme/host/pci.c
index 4fd733ff72b1,077e9bf6a1b8..
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@@ -1864,10 -1789,8 +1789,11 @@@ static void nvme_reset_work(struct work
if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
nvme_dev_disable(dev, false);
  
-   if (test_bit(NVME_CTRL_REMOVING, >flags))
++  if (dev->ctrl.state != NVME_CTRL_DELETING)
 +  goto out;
 +
-   set_bit(NVME_CTRL_RESETTING, >flags);
+   if (!nvme_change_ctrl_state(>ctrl, NVME_CTRL_RESETTING))
+   goto out;
  
result = nvme_pci_enable(dev);
if (result)
@@@ -2086,12 -2014,11 +2017,10 @@@ static void nvme_remove(struct pci_dev 
  {
struct nvme_dev *dev = pci_get_drvdata(pdev);
  
-   set_bit(NVME_CTRL_REMOVING, >flags);
 -  del_timer_sync(>watchdog_timer);
 -
+   nvme_change_ctrl_state(>ctrl, NVME_CTRL_DELETING);
+ 
pci_set_drvdata(pdev, NULL);
-   flush_work(>async_work);
 +  flush_work(>reset_work);
-   flush_work(>scan_work);
-   nvme_remove_namespaces(>ctrl);
nvme_uninit_ctrl(>ctrl);
nvme_dev_disable(dev, true);
flush_work(>reset_work);


Re: [PATCH v2 0/3] AM57/DRA7 Clock Tree DTSI Fix-ups

2016-05-02 Thread Lokesh Vutla


On Monday 02 May 2016 10:42 PM, J.D. Schroeder wrote:
> This series of patches fixes several discrepancies between the
> AM57/DRA7 clock tree description and the actual hardware behavior and
> frequencies. With these changes a more complete picture of the clock
> tree is represented for a few of the clocks and their resulting
> frequencies.

Tested on my DRA74-evm.

Tested-by: Lokesh Vutla 

Thanks and regards,
Lokesh

> 
> v2 Changes:
> * Rebased on linux-next as requested by Tony Lindgren
> * Eliminated previous patch 2 as another change fixing the same thing
>   was merged in eea08802f586acd6aef377d1b4a541821013cc0b
> * Added to the commit message in patch 2 to clarify the source of the
>   clock being internal to the processor
> * Added a new patch 3 to fix a new warning introduced by
>   eea08802f586acd6aef377d1b4a541821013cc0b
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-omap" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


Re: [PATCH 1/2] ASoC: rockchip: i2s: separate capture and playback

2016-05-02 Thread sugar

Hi John,

On 4/30/2016 15:00, John Keeping Wrote:

Hi Enric,

On Fri, Apr 29, 2016 at 04:59:27PM +0200, Enric Balletbo Serra wrote:

2015-12-09 11:32 GMT+01:00 John Keeping :

If we only clear the tx/rx state when both are disabled it is not
possible to start/stop one multiple times while the other is running.
Since the two are independently controlled, treat them as such and
remove the false dependency between capture and playback.

Signed-off-by: John Keeping 
---
  sound/soc/rockchip/rockchip_i2s.c | 72 +--
  1 file changed, 32 insertions(+), 40 deletions(-)

diff --git a/sound/soc/rockchip/rockchip_i2s.c 
b/sound/soc/rockchip/rockchip_i2s.c
index 83b1b9c..acc6225 100644
--- a/sound/soc/rockchip/rockchip_i2s.c
+++ b/sound/soc/rockchip/rockchip_i2s.c
@@ -82,8 +82,8 @@ static void rockchip_snd_txctrl(struct rk_i2s_dev *i2s, int 
on)
I2S_DMACR_TDE_ENABLE, I2S_DMACR_TDE_ENABLE);

 regmap_update_bits(i2s->regmap, I2S_XFER,
-  I2S_XFER_TXS_START | I2S_XFER_RXS_START,
-  I2S_XFER_TXS_START | I2S_XFER_RXS_START);
+  I2S_XFER_TXS_START,
+  I2S_XFER_TXS_START);

 i2s->tx_start = true;
 } else {
@@ -92,27 +92,23 @@ static void rockchip_snd_txctrl(struct rk_i2s_dev *i2s, int 
on)
 regmap_update_bits(i2s->regmap, I2S_DMACR,
I2S_DMACR_TDE_ENABLE, 
I2S_DMACR_TDE_DISABLE);

-   if (!i2s->rx_start) {
-   regmap_update_bits(i2s->regmap, I2S_XFER,
-  I2S_XFER_TXS_START |
-  I2S_XFER_RXS_START,
-  I2S_XFER_TXS_STOP |
-  I2S_XFER_RXS_STOP);
+   regmap_update_bits(i2s->regmap, I2S_XFER,
+  I2S_XFER_TXS_START,
+  I2S_XFER_TXS_STOP);

-   regmap_update_bits(i2s->regmap, I2S_CLR,
-  I2S_CLR_TXC | I2S_CLR_RXC,
-  I2S_CLR_TXC | I2S_CLR_RXC);
+   regmap_update_bits(i2s->regmap, I2S_CLR,
+  I2S_CLR_TXC,
+  I2S_CLR_TXC);

-   regmap_read(i2s->regmap, I2S_CLR, );
+   regmap_read(i2s->regmap, I2S_CLR, );

-   /* Should wait for clear operation to finish */
-   while (val) {
-   regmap_read(i2s->regmap, I2S_CLR, );
-   retry--;
-   if (!retry) {
-   dev_warn(i2s->dev, "fail to clear\n");
-   break;
-   }
+   /* Should wait for clear operation to finish */
+   while (val & I2S_CLR_TXC) {
+   regmap_read(i2s->regmap, I2S_CLR, );
+   retry--;
+   if (!retry) {
+   dev_warn(i2s->dev, "fail to clear\n");
+   break;
 }
 }
 }
@@ -128,8 +124,8 @@ static void rockchip_snd_rxctrl(struct rk_i2s_dev *i2s, int 
on)
I2S_DMACR_RDE_ENABLE, I2S_DMACR_RDE_ENABLE);

 regmap_update_bits(i2s->regmap, I2S_XFER,
-  I2S_XFER_TXS_START | I2S_XFER_RXS_START,
-  I2S_XFER_TXS_START | I2S_XFER_RXS_START);
+  I2S_XFER_RXS_START,
+  I2S_XFER_RXS_START);

 i2s->rx_start = true;
 } else {
@@ -138,27 +134,23 @@ static void rockchip_snd_rxctrl(struct rk_i2s_dev *i2s, 
int on)
 regmap_update_bits(i2s->regmap, I2S_DMACR,
I2S_DMACR_RDE_ENABLE, 
I2S_DMACR_RDE_DISABLE);

-   if (!i2s->tx_start) {
-   regmap_update_bits(i2s->regmap, I2S_XFER,
-  I2S_XFER_TXS_START |
-  I2S_XFER_RXS_START,
-  I2S_XFER_TXS_STOP |
-  I2S_XFER_RXS_STOP);
+   regmap_update_bits(i2s->regmap, I2S_XFER,
+  I2S_XFER_RXS_START,
+  I2S_XFER_RXS_STOP);

-   regmap_update_bits(i2s->regmap, I2S_CLR,
-  I2S_CLR_TXC | I2S_CLR_RXC,
-  I2S_CLR_TXC | I2S_CLR_RXC);
+   

Re: [PATCH v3 12/16] rtc: powerpc: provide rtc_class_ops directly

2016-05-02 Thread Michael Ellerman
On Thu, 2016-04-28 at 00:34 +0200, Arnd Bergmann wrote:

> The rtc-generic driver provides an architecture specific
> wrapper on top of the generic rtc_class_ops abstraction,
> and powerpc has another abstraction on top, which is a bit
> silly.
> 
> This changes the powerpc rtc-generic device to provide its
> rtc_class_ops directly, to reduce the number of layers
> by one.
> 
> Signed-off-by: Arnd Bergmann 
> ---
>  arch/powerpc/kernel/time.c | 29 -
>  drivers/rtc/rtc-generic.c  |  2 +-
>  2 files changed, 29 insertions(+), 2 deletions(-)

If this hits linux-next it will go through my automated boot testing, which
hopefully would be sufficient to catch any bugs in this patch, cross fingers.

I don't know jack about all the layers of RTC mess, so my ack is basically
worthless here. But if you like you can have one anyway :)

Acked-by: Michael Ellerman 

cheers



Re: [Question] Missing data after DMA read transfer - mm issue with transparent huge page?

2016-05-02 Thread Hugh Dickins
On Fri, 29 Apr 2016, Nicolas Morey Chaisemartin wrote:

> Hi everyone,
> 
> This is a repost from a different address as it seems the previous one ended 
> in Gmail junk due to a domain error..

linux-kernel is a very high volume list which few are reading:
that also will account for your lack of response so far
(apart from the indefatigable Alan).

I've added linux-mm, and some people from another thread regarding
THP and get_user_pages() pins which has been discussed in recent days.

Make no mistake, the issue you're raising here is definitely not the
same as that one (which is specifically about the new THP refcounting
in v4.5+, whereas you're reporting a problem you've seen in both a
v3.10-based kernel and in v4.5).  But I think their heads are in
gear, much more so than mine, and likely to spot something.

> I added more info found while blindly debugging the issue.
> 
> Short version:
> I'm having an issue with direct DMA transfer from a device to host memory.
> It seems some of the data is not transferring to the appropriate page.
> 
> Some more details:
> I'm debugging a home made PCI driver for our board (Kalray), attached to a 
> x86_64 host running centos7 (3.10.0-327.el7.x86_64)
> 
> In the current case, a userland application transfers back and forth data 
> through read/write operations on a file.
> On the kernel side, it triggers DMA transfers through the PCI to/from our 
> board memory.
> 
> We followed what pretty much all docs said about direct I/O to user buffers:
> 
> 1) get_user_pages() (in the current case, it's at most 16 pages at once)
> 2) convert to a scatterlist
> 3) pci_map_sg
> 4) eventually coalesce sg (Intel IOMMU is enabled, so it's usually possible)
> 4) A lot of DMA engine handling code, using the dmaengine layer and virt-dma
> 5) wait for transfer complete, in the mean time, go back to (1) to schedule 
> more work, if any
> 6) pci_unmap_sg
> 7) for read (card2host) transfer, set_page_dirty_lock
> 8) page_cache_release
> 
> In 99,% it works perfectly.
> However, I have one userland application where a few pages are not written by 
> a read (card2host) transfer.
> The buffer is memset to a different value beforehand so I can check that 
> nothing has overwritten them.
> 
> I know (PCI protocol analyser) that the data left our board for the "right" 
> address (the one set in the sg by pci_map_sg).
> I tried reading the data between the pci_unmap_sg and the set_page_dirty, 
> using
> uint32_t *addr = page_address(trans->pages[0]);
> dev_warn(>pdev->dev, "val = %x\n", *addr);
> and it has the expected value.
> But if I try to copy_from_user (using the address coming from userland, the 
> one passed to get_user_pages), the data has not been written and I see the 
> memset value.
> 
> New infos:
> 
> The issue happens with IOMMU on or off.
> I compiled a kernel with DMA_API_DEBUG enabled and got no warnings or errors.
> 
> I digged a little bit deeper with my very small understanding of linux mm and 
> I discovered that:
>  * we are using transparent huge pages
>  * the page 'not transferred' are the last few of a huge page
> More precisely:
> - We have several transfer in flight from the same user buffer
> - Each transfer is 16 pages long
> - At one point in time, we start transferring from another huge page 
> (transfers are still in flight from the previous one)
> - When a transfer from the previous huge page completes, I dumped the 
> mapcount of the pages from the previous transfers,
>   they are all 0. The pages are still mapped to dma at this point.
> - A get_user_page to the address of the completed transfer returns a 
> different struct page * than the one I had.
> But this is before I have unmapped/put_page them back. From my understanding 
> this should not have happened.
> 
> I tried the same code with a kernel 4.5 and encountered the same issue
> 
> Disabling transparent huge pages makes the issue disappear
> 
> Thanks in advance

It does look to me as if pages are being migrated, despite being pinned
by get_user_pages(): and that would be wrong.  Originally I intended
to suggest that THP is probably merely the cause of compaction, with
compaction causing the page migration.  But you posted very interesting
details in an earlier mail on 27th April from :

> I ran some more tests:
> 
> * Test is OK if transparent huge tlb are disabled
> 
> * For all the page where data are not transfered, and only those pages, a 
> call to get_user_page(user vaddr) just before dma_unmap_sg returns a 
> different page from the original one.
> [436477.927279] mppa :03:00.0: org_page= ea0009f60080 cur page = 
> ea00074e0080
> [436477.927298] page:ea0009f60080 count:0 mapcount:1 mapping:  
> (null) index:0x2
> [436477.927314] page flags: 0x2f8000(tail)
> [436477.927354] page dumped because: org_page
> [436477.927369] page:ea00074e0080 count:0 mapcount:1 mapping:  
> (null) index:0x2
> [436477.927382] 
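
For reference, here is a condensed sketch of the pin/map/unmap/dirty/release
flow described in steps 1)-8) above, using v4.5-era API names. It is not the
Kalray driver code, only an illustration with most error handling omitted:

#include <linux/mm.h>
#include <linux/scatterlist.h>
#include <linux/dma-mapping.h>

static int card2host_read(struct device *dev, unsigned long uaddr,
			  struct page **pages, int nr_pages)
{
	struct sg_table sgt;
	int i, mapped;

	/* 1) pin the user pages; write=1 because the device writes into them */
	if (get_user_pages_fast(uaddr, nr_pages, 1, pages) != nr_pages)
		return -EFAULT;

	/* 2)+3) build a scatterlist and map it for the PCI device */
	if (sg_alloc_table_from_pages(&sgt, pages, nr_pages, 0,
				      (unsigned long)nr_pages * PAGE_SIZE,
				      GFP_KERNEL))
		return -ENOMEM;
	mapped = dma_map_sg(dev, sgt.sgl, sgt.nents, DMA_FROM_DEVICE);

	/* 4)+5) hand sgt.sgl (mapped entries) to the DMA engine and wait */

	/* 6)-8) unmap, mark the pages dirty, drop the pin */
	dma_unmap_sg(dev, sgt.sgl, sgt.nents, DMA_FROM_DEVICE);
	for (i = 0; i < nr_pages; i++) {
		set_page_dirty_lock(pages[i]);
		put_page(pages[i]);
	}
	sg_free_table(&sgt);
	return mapped ? 0 : -EIO;
}
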

Re: Fail to build tools/all

2016-05-02 Thread Sean Fu
On Mon, May 02, 2016 at 12:20:08PM +0200, Thomas Renninger wrote:
> On Sunday, May 01, 2016 01:11:33 PM Sean Fu wrote:
> > Hi guys:
> > I encountered a build error when running "make V=1 tools/all".
> > Shall we write a patch to fix it?
> 
> This is not a bug.
> 
> > The following is error log.
> > 
> >  start ==
> > commit 05cf8077e54b20dddb756eaa26f3aeb5c38dd3cf
> > Merge: cf78031 db5dd0d
> > Author: Linus Torvalds 
> > Date:   Fri Apr 1 20:03:33 2016 -0500
> > = end  =
> > 
> > == start ===
> > mkdir -p
> > /home/sean/work/source/upstream/kernel.linus/linux/tools/power/cpupower
> > && make O=/home/sean/work/source/upstream/kernel.linus/linux
> > subdir=tools/power/cpupower --no-print-directory -C power/cpupower
> > gcc  -DVERSION=\"4.6.rc1.190.g05cf80\" -DPACKAGE=\"cpupower\"
> > -DPACKAGE_BUGREPORT=\"linux...@vger.kernel.org\" -D_GNU_SOURCE -pipe
> > -DNLS -Wall -Wchar-subscripts -Wpointer-arith -Wsign-compare
> > -Wno-pointer-sign -Wdeclaration-after-statement -Wshadow -O1 -g -DDEBUG
> > -I./lib -I ./utils -o
> > /home/sean/work/source/upstream/kernel.linus/linux/utils/helpers/amd.o
> > -c utils/helpers/amd.c
> > utils/helpers/amd.c:7:21: fatal error: pci/pci.h: No such file or
> > directory
> >  #include 
> 
> You need pciutils header and library:
> ftp://ftp.kernel.org/pub/software/utils/pciutils
> 
> Thomas
Thanks

