date:20130104

[PATCH v2] input: keyboard: tegra: add support for rows/cols configuration from dt

2013-01-04 Thread Laxman Dewangan

The NVIDIA Tegra KBC has a maximum of 24 pins to form a matrix keypad.
Any pin can be configured as a row or a column. At most 8 pins can be
columns and at most 16 pins can be rows.

Remove the assumption that the first 16 pins are always used as rows
and the remaining pins as columns, and add properties for configuring
pins as either rows or columns from DT. Update the devicetree binding
document accordingly.

Signed-off-by: Laxman Dewangan 
---
Changes from V1:
- renames kbc-rows and kbc-cols to kbc-row-pins and kbc-col-pins.
- cleanups.

 .../bindings/input/nvidia,tegra20-kbc.txt  |   22 ++
 drivers/input/keyboard/tegra-kbc.c |   74 +++-
 2 files changed, 79 insertions(+), 17 deletions(-)

diff --git a/Documentation/devicetree/bindings/input/nvidia,tegra20-kbc.txt 
b/Documentation/devicetree/bindings/input/nvidia,tegra20-kbc.txt
index 72683be..2995fae 100644
--- a/Documentation/devicetree/bindings/input/nvidia,tegra20-kbc.txt
+++ b/Documentation/devicetree/bindings/input/nvidia,tegra20-kbc.txt
@@ -1,7 +1,18 @@
 * Tegra keyboard controller
+The key controller has maximum 24 pins to make matrix keypad. Any pin
+can be configured as row or column. The maximum column pin can be 8
+and maximum row pins can be 16 for Tegra20/Tegra30.
 
 Required properties:
 - compatible: "nvidia,tegra20-kbc"
+- reg: Register base address of KBC.
+- interrupts: Interrupt number for the KBC.
+- nvidia,kbc-row-pins: The KBC pins which are configured as row. This is an
+  array of pin numbers which is used as rows.
+- nvidia,kbc-col-pins: The KBC pins which are configured as column. This is an
+  array of pin numbers which is used as column.
+- linux,keymap: The keymap for keys as described in the binding document
+  devicetree/bindings/input/matrix-keymap.txt.
 
 Optional properties, in addition to those specified by the shared
 matrix-keyboard bindings:
@@ -19,5 +30,16 @@ Example:
 keyboard: keyboard {
compatible = "nvidia,tegra20-kbc";
reg = <0x7000e200 0x100>;
+   interrupts = <0 85 0x04>;
nvidia,ghost-filter;
+   nvidia,debounce-delay-ms = <640>;
+   nvidia,kbc-row-pins = <0 1 2>;/* pin 0, 1, 2 as rows */
+   nvidia,kbc-col-pins = <11 12 13>; /* pin 11, 12, 13 as columns */
+   linux,keymap = <0x00000074
+   0x00010067
+   0x00020066
+   0x01010068
+   0x02000069
+   0x02010070
+   0x02020071>;
 };
diff --git a/drivers/input/keyboard/tegra-kbc.c 
b/drivers/input/keyboard/tegra-kbc.c
index c036425..b65971d 100644
--- a/drivers/input/keyboard/tegra-kbc.c
+++ b/drivers/input/keyboard/tegra-kbc.c
@@ -614,13 +614,16 @@ static struct tegra_kbc_platform_data 
*tegra_kbc_dt_parse_pdata(
struct device_node *np = pdev->dev.of_node;
u32 prop;
int i;
-
-   if (!np)
-   return NULL;
+   u32 num_rows = 0;
+   u32 num_cols = 0;
+   u32 cols_cfg[KBC_MAX_GPIO];
+   u32 rows_cfg[KBC_MAX_GPIO];
+   int proplen;
+   int ret;
 
pdata = devm_kzalloc(>dev, sizeof(*pdata), GFP_KERNEL);
if (!pdata)
-   return NULL;
+   return ERR_PTR(-ENOMEM);
 
if (!of_property_read_u32(np, "nvidia,debounce-delay-ms", ))
pdata->debounce_cnt = prop;
@@ -634,18 +637,55 @@ static struct tegra_kbc_platform_data 
*tegra_kbc_dt_parse_pdata(
if (of_find_property(np, "nvidia,wakeup-source", NULL))
pdata->wakeup = true;
 
-   /*
-* All currently known keymaps with device tree support use the same
-* pin_cfg, so set it up here.
-*/
-   for (i = 0; i < KBC_MAX_ROW; i++) {
-   pdata->pin_cfg[i].num = i;
-   pdata->pin_cfg[i].type = PIN_CFG_ROW;
+   if (!of_get_property(np, "nvidia,kbc-row-pins", )) {
+   dev_err(>dev, "property nvidia,kbc-row-pins not found\n");
+   return ERR_PTR(-ENOENT);
+   }
+   num_rows = proplen / sizeof(u32);
+
+   if (!of_get_property(np, "nvidia,kbc-col-pins", )) {
+   dev_err(>dev, "property nvidia,kbc-col-pins not found\n");
+   return ERR_PTR(-ENOENT);
+   }
+   num_cols = proplen / sizeof(u32);
+
+   if (!of_get_property(np, "linux,keymap", )) {
+   dev_err(>dev, "property linux,keymap not found\n");
+   return ERR_PTR(-ENOENT);
+   }
+
+   if (!num_rows || !num_cols || ((num_rows + num_cols) > KBC_MAX_GPIO)) {
+   dev_err(>dev,
+   "keypad rows/columns not porperly specified\n");
+   return ERR_PTR(-EINVAL);
}
 
-   for (i = 0; i < KBC_MAX_COL; i++) {
-   pdata->pin_cfg[KBC_MAX_ROW + i].num = i;
-   pdata->pin_cfg[KBC_MAX_ROW + i].type = PIN_CFG_COL;
+   /* Set all pins as non-configured */
+   for (i = 0; i < KBC_MAX_GPIO; i++)
+

[PATCH v2 4/4] input: keyboard: tegra: remove default key mapping

2013-01-04 Thread Laxman Dewangan

The Tegra KBC driver has a default key mapping for the 16x8 configuration.
The key mapping can be provided through platform data or through DT,
and the mapping varies from platform to platform, hence this default
mapping is not very useful. Remove the default mapping to reduce the
number of code lines in the driver.

Signed-off-by: Laxman Dewangan 
---
Changes from V1:
- None

 drivers/input/keyboard/tegra-kbc.c |  156 +---
 1 files changed, 1 insertions(+), 155 deletions(-)

diff --git a/drivers/input/keyboard/tegra-kbc.c 
b/drivers/input/keyboard/tegra-kbc.c
index ef7a0ac..c6e4985 100644
--- a/drivers/input/keyboard/tegra-kbc.c
+++ b/drivers/input/keyboard/tegra-kbc.c
@@ -87,147 +87,6 @@ struct tegra_kbc {
struct clk *clk;
 };
 
-static const u32 tegra_kbc_default_keymap[] = {
-   KEY(0, 2, KEY_W),
-   KEY(0, 3, KEY_S),
-   KEY(0, 4, KEY_A),
-   KEY(0, 5, KEY_Z),
-   KEY(0, 7, KEY_FN),
-
-   KEY(1, 7, KEY_LEFTMETA),
-
-   KEY(2, 6, KEY_RIGHTALT),
-   KEY(2, 7, KEY_LEFTALT),
-
-   KEY(3, 0, KEY_5),
-   KEY(3, 1, KEY_4),
-   KEY(3, 2, KEY_R),
-   KEY(3, 3, KEY_E),
-   KEY(3, 4, KEY_F),
-   KEY(3, 5, KEY_D),
-   KEY(3, 6, KEY_X),
-
-   KEY(4, 0, KEY_7),
-   KEY(4, 1, KEY_6),
-   KEY(4, 2, KEY_T),
-   KEY(4, 3, KEY_H),
-   KEY(4, 4, KEY_G),
-   KEY(4, 5, KEY_V),
-   KEY(4, 6, KEY_C),
-   KEY(4, 7, KEY_SPACE),
-
-   KEY(5, 0, KEY_9),
-   KEY(5, 1, KEY_8),
-   KEY(5, 2, KEY_U),
-   KEY(5, 3, KEY_Y),
-   KEY(5, 4, KEY_J),
-   KEY(5, 5, KEY_N),
-   KEY(5, 6, KEY_B),
-   KEY(5, 7, KEY_BACKSLASH),
-
-   KEY(6, 0, KEY_MINUS),
-   KEY(6, 1, KEY_0),
-   KEY(6, 2, KEY_O),
-   KEY(6, 3, KEY_I),
-   KEY(6, 4, KEY_L),
-   KEY(6, 5, KEY_K),
-   KEY(6, 6, KEY_COMMA),
-   KEY(6, 7, KEY_M),
-
-   KEY(7, 1, KEY_EQUAL),
-   KEY(7, 2, KEY_RIGHTBRACE),
-   KEY(7, 3, KEY_ENTER),
-   KEY(7, 7, KEY_MENU),
-
-   KEY(8, 4, KEY_RIGHTSHIFT),
-   KEY(8, 5, KEY_LEFTSHIFT),
-
-   KEY(9, 5, KEY_RIGHTCTRL),
-   KEY(9, 7, KEY_LEFTCTRL),
-
-   KEY(11, 0, KEY_LEFTBRACE),
-   KEY(11, 1, KEY_P),
-   KEY(11, 2, KEY_APOSTROPHE),
-   KEY(11, 3, KEY_SEMICOLON),
-   KEY(11, 4, KEY_SLASH),
-   KEY(11, 5, KEY_DOT),
-
-   KEY(12, 0, KEY_F10),
-   KEY(12, 1, KEY_F9),
-   KEY(12, 2, KEY_BACKSPACE),
-   KEY(12, 3, KEY_3),
-   KEY(12, 4, KEY_2),
-   KEY(12, 5, KEY_UP),
-   KEY(12, 6, KEY_PRINT),
-   KEY(12, 7, KEY_PAUSE),
-
-   KEY(13, 0, KEY_INSERT),
-   KEY(13, 1, KEY_DELETE),
-   KEY(13, 3, KEY_PAGEUP),
-   KEY(13, 4, KEY_PAGEDOWN),
-   KEY(13, 5, KEY_RIGHT),
-   KEY(13, 6, KEY_DOWN),
-   KEY(13, 7, KEY_LEFT),
-
-   KEY(14, 0, KEY_F11),
-   KEY(14, 1, KEY_F12),
-   KEY(14, 2, KEY_F8),
-   KEY(14, 3, KEY_Q),
-   KEY(14, 4, KEY_F4),
-   KEY(14, 5, KEY_F3),
-   KEY(14, 6, KEY_1),
-   KEY(14, 7, KEY_F7),
-
-   KEY(15, 0, KEY_ESC),
-   KEY(15, 1, KEY_GRAVE),
-   KEY(15, 2, KEY_F5),
-   KEY(15, 3, KEY_TAB),
-   KEY(15, 4, KEY_F1),
-   KEY(15, 5, KEY_F2),
-   KEY(15, 6, KEY_CAPSLOCK),
-   KEY(15, 7, KEY_F6),
-
-   /* Software Handled Function Keys */
-   KEY(20, 0, KEY_KP7),
-
-   KEY(21, 0, KEY_KP9),
-   KEY(21, 1, KEY_KP8),
-   KEY(21, 2, KEY_KP4),
-   KEY(21, 4, KEY_KP1),
-
-   KEY(22, 1, KEY_KPSLASH),
-   KEY(22, 2, KEY_KP6),
-   KEY(22, 3, KEY_KP5),
-   KEY(22, 4, KEY_KP3),
-   KEY(22, 5, KEY_KP2),
-   KEY(22, 7, KEY_KP0),
-
-   KEY(27, 1, KEY_KPASTERISK),
-   KEY(27, 3, KEY_KPMINUS),
-   KEY(27, 4, KEY_KPPLUS),
-   KEY(27, 5, KEY_KPDOT),
-
-   KEY(28, 5, KEY_VOLUMEUP),
-
-   KEY(29, 3, KEY_HOME),
-   KEY(29, 4, KEY_END),
-   KEY(29, 5, KEY_BRIGHTNESSDOWN),
-   KEY(29, 6, KEY_VOLUMEDOWN),
-   KEY(29, 7, KEY_BRIGHTNESSUP),
-
-   KEY(30, 0, KEY_NUMLOCK),
-   KEY(30, 1, KEY_SCROLLLOCK),
-   KEY(30, 2, KEY_MUTE),
-
-   KEY(31, 4, KEY_HELP),
-};
-
-static const
-struct matrix_keymap_data tegra_kbc_default_keymap_data = {
-   .keymap = tegra_kbc_default_keymap,
-   .keymap_size= ARRAY_SIZE(tegra_kbc_default_keymap),
-};
-
 static void tegra_kbc_report_released_keys(struct input_dev *input,
   unsigned short old_keycodes[],
   unsigned int old_num_keys,
@@ -701,26 +560,13 @@ static int tegra_kbd_setup_keymap(struct tegra_kbc *kbc)
const struct tegra_kbc_platform_data *pdata = kbc->pdata;
const struct matrix_keymap_data *keymap_data = pdata->keymap_data;
unsigned int keymap_rows = KBC_MAX_KEY;
-   int retval;
 
if (keymap_data && pdata->use_fn_map)
keymap_rows *= 2;
 
-   retval = matrix_keypad_build_keymap(keymap_data, NULL,
+   return

[PATCH V2 0/4] input: keyboard: tegra: cleanups and DT supports

2013-01-04 Thread Laxman Dewangan

This patch series:
 - fixes a build warning,
 - uses devm_* for allocation,
 - makes the column/row configuration available through DT, and
 - removes the rarely used default key mapping table.

Changes from V1:
- renames the row and column pin property arrays.
- nit cleanups.

Laxman Dewangan (4):
  input: keyboard: tegra: fix build warning
  input: keyboard: tegra: use devm_* for resource allocation
  input: keyboard: tegra: add support for rows/cols configuration from
dt
  input: keyboard: tegra: remove default key mapping

 .../bindings/input/nvidia,tegra20-kbc.txt  |   22 ++
 drivers/input/keyboard/tegra-kbc.c |  345 ++--
 2 files changed, 118 insertions(+), 249 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2 2/4] input: keyboard: tegra: use devm_* for resource allocation

2013-01-04 Thread Laxman Dewangan

Use devm_* for memory, clock, input device allocation. This reduces
code for freeing these resources.

Signed-off-by: Laxman Dewangan 
---
Changes from V1:
None

 drivers/input/keyboard/tegra-kbc.c |   93 +++-
 1 files changed, 28 insertions(+), 65 deletions(-)

diff --git a/drivers/input/keyboard/tegra-kbc.c 
b/drivers/input/keyboard/tegra-kbc.c
index f1d3ba0..c036425 100644
--- a/drivers/input/keyboard/tegra-kbc.c
+++ b/drivers/input/keyboard/tegra-kbc.c
@@ -618,7 +618,7 @@ static struct tegra_kbc_platform_data 
*tegra_kbc_dt_parse_pdata(
if (!np)
return NULL;
 
-   pdata = kzalloc(sizeof(*pdata), GFP_KERNEL);
+   pdata = devm_kzalloc(>dev, sizeof(*pdata), GFP_KERNEL);
if (!pdata)
return NULL;
 
@@ -700,33 +700,36 @@ static int tegra_kbc_probe(struct platform_device *pdev)
if (!pdata)
pdata = tegra_kbc_dt_parse_pdata(pdev);
 
-   if (!pdata)
+   if (!pdata) {
+   dev_err(>dev, "Platform data missing\n");
return -EINVAL;
-
-   if (!tegra_kbc_check_pin_cfg(pdata, >dev, _rows)) {
-   err = -EINVAL;
-   goto err_free_pdata;
}
 
+   if (!tegra_kbc_check_pin_cfg(pdata, >dev, _rows))
+   return -EINVAL;
+
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
if (!res) {
dev_err(>dev, "failed to get I/O memory\n");
-   err = -ENXIO;
-   goto err_free_pdata;
+   return -ENXIO;
}
 
irq = platform_get_irq(pdev, 0);
if (irq < 0) {
dev_err(>dev, "failed to get keyboard IRQ\n");
-   err = -ENXIO;
-   goto err_free_pdata;
+   return -ENXIO;
+   }
+
+   kbc = devm_kzalloc(>dev, sizeof(*kbc), GFP_KERNEL);
+   if (!kbc) {
+   dev_err(>dev, "failed to alloc memory for kbc\n");
+   return -ENOMEM;
}
 
-   kbc = kzalloc(sizeof(*kbc), GFP_KERNEL);
-   input_dev = input_allocate_device();
-   if (!kbc || !input_dev) {
-   err = -ENOMEM;
-   goto err_free_mem;
+   input_dev = devm_input_allocate_device(>dev);
+   if (!input_dev) {
+   dev_err(>dev, "failed to allocate input device\n");
+   return -ENOMEM;
}
 
kbc->pdata = pdata;
@@ -735,25 +738,16 @@ static int tegra_kbc_probe(struct platform_device *pdev)
spin_lock_init(>lock);
setup_timer(>timer, tegra_kbc_keypress_timer, (unsigned long)kbc);
 
-   res = request_mem_region(res->start, resource_size(res), pdev->name);
-   if (!res) {
-   dev_err(>dev, "failed to request I/O memory\n");
-   err = -EBUSY;
-   goto err_free_mem;
-   }
-
-   kbc->mmio = ioremap(res->start, resource_size(res));
+   kbc->mmio = devm_request_and_ioremap(>dev, res);
if (!kbc->mmio) {
-   dev_err(>dev, "failed to remap I/O memory\n");
-   err = -ENXIO;
-   goto err_free_mem_region;
+   dev_err(>dev, "Cannot request memregion/iomap address\n");
+   return -EADDRNOTAVAIL;
}
 
-   kbc->clk = clk_get(>dev, NULL);
+   kbc->clk = devm_clk_get(>dev, NULL);
if (IS_ERR(kbc->clk)) {
dev_err(>dev, "failed to get keyboard clock\n");
-   err = PTR_ERR(kbc->clk);
-   goto err_iounmap;
+   return PTR_ERR(kbc->clk);
}
 
/*
@@ -778,9 +772,9 @@ static int tegra_kbc_probe(struct platform_device *pdev)
input_dev->close = tegra_kbc_close;
 
err = tegra_kbd_setup_keymap(kbc);
-   if (err) {
+   if (err < 0) {
dev_err(>dev, "failed to setup keymap\n");
-   goto err_put_clk;
+   return err;
}
 
__set_bit(EV_REP, input_dev->evbit);
@@ -790,15 +784,15 @@ static int tegra_kbc_probe(struct platform_device *pdev)
 
err = request_irq(kbc->irq, tegra_kbc_isr,
  IRQF_NO_SUSPEND | IRQF_TRIGGER_HIGH, pdev->name, kbc);
-   if (err) {
+   if (err < 0) {
dev_err(>dev, "failed to request keyboard IRQ\n");
-   goto err_put_clk;
+   return err;
}
 
disable_irq(kbc->irq);
 
err = input_register_device(kbc->idev);
-   if (err) {
+   if (err < 0) {
dev_err(>dev, "failed to register input device\n");
goto err_free_irq;
}
@@ -810,46 +804,15 @@ static int tegra_kbc_probe(struct platform_device *pdev)
 
 err_free_irq:
free_irq(kbc->irq, pdev);
-err_put_clk:
-   clk_put(kbc->clk);
-err_iounmap:
-   iounmap(kbc->mmio);
-err_free_mem_region:
-   release_mem_region(res->start, resource_size(res));
-err_free_mem:
-   input_free_device(input_dev);
-   kfree(kbc);
-err_free_pdata:
-   if

[PATCH v2 1/4] input: keyboard: tegra: fix build warning

2013-01-04 Thread Laxman Dewangan

Fix the following build warning when building driver with CONFIG_PM_SLEEP
not selected.

tegra-kbc.c:360:13: warning: 'tegra_kbc_set_keypress_interrupt' defined but not 
used [-Wunused-function]

Signed-off-by: Laxman Dewangan 
---
Changes from V1:
- none

 drivers/input/keyboard/tegra-kbc.c |   24 
 1 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/drivers/input/keyboard/tegra-kbc.c 
b/drivers/input/keyboard/tegra-kbc.c
index c76f968..f1d3ba0 100644
--- a/drivers/input/keyboard/tegra-kbc.c
+++ b/drivers/input/keyboard/tegra-kbc.c
@@ -357,18 +357,6 @@ static void tegra_kbc_set_fifo_interrupt(struct tegra_kbc 
*kbc, bool enable)
writel(val, kbc->mmio + KBC_CONTROL_0);
 }
 
-static void tegra_kbc_set_keypress_interrupt(struct tegra_kbc *kbc, bool 
enable)
-{
-   u32 val;
-
-   val = readl(kbc->mmio + KBC_CONTROL_0);
-   if (enable)
-   val |= KBC_CONTROL_KEYPRESS_INT_EN;
-   else
-   val &= ~KBC_CONTROL_KEYPRESS_INT_EN;
-   writel(val, kbc->mmio + KBC_CONTROL_0);
-}
-
 static void tegra_kbc_keypress_timer(unsigned long data)
 {
struct tegra_kbc *kbc = (struct tegra_kbc *)data;
@@ -866,6 +854,18 @@ static int tegra_kbc_remove(struct platform_device *pdev)
 }
 
 #ifdef CONFIG_PM_SLEEP
+static void tegra_kbc_set_keypress_interrupt(struct tegra_kbc *kbc, bool 
enable)
+{
+   u32 val;
+
+   val = readl(kbc->mmio + KBC_CONTROL_0);
+   if (enable)
+   val |= KBC_CONTROL_KEYPRESS_INT_EN;
+   else
+   val &= ~KBC_CONTROL_KEYPRESS_INT_EN;
+   writel(val, kbc->mmio + KBC_CONTROL_0);
+}
+
 static int tegra_kbc_suspend(struct device *dev)
 {
struct platform_device *pdev = to_platform_device(dev);
-- 
1.7.1.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: mmap() scalability in the presence of the MAP_POPULATE flag

2013-01-04 Thread Michel Lespinasse

On Fri, Jan 4, 2013 at 10:40 PM, Roman Dubtsov  wrote:
> On Fri, 2013-01-04 at 03:57 -0800, Michel Lespinasse wrote:
>> If this doesn't help, could you please send me your test case ? I
>> think you described enough of it that I would be able to reproduce it
>> given some time, but it's just easier if you send me a short C file :)
>
> It does not, the results are more or less the same. I've attached my
> testcase. It does map anonymous memory. It also uses OpenMP for
> threading because I'm lazy, so it requires passing -fopenmp to gcc and
> the number of threads it runs is defined via OMP_NUM_THREADS environment
> variable. There are also two macros that influence test's behavior:
>
> - POPULATE_VIA_LOOP -- makes the test populate memory using a loop
> - POPULATE_VIA_MMAP -- makes the test populate memory via MAP_POPULATE
>
> If none of the macros are defined, the test does not populate memory.

Heh, very interesting. As it turns out, the problem gets MUCH worse as
the number of threads increases.

We are populating the anon mapping with huge pages. In the
POPULATE_VIA_LOOP case, we are just taking a page fault every 2MB and
filling it up with a zeroed huge page - most of the runtime comes from
clearing the huge page.

In the POPULATE_VIA_MMAP case, follow_page() is called at 4KB increment
addresses, and it takes the mm->page_table_lock 511 times out of 512
(that is, every time it falls within a huge page that's just been
populated). So all OMP_NUM_THREADS threads are constantly bouncing
over the mm->page_table_lock, and getting terrible performance as a
result.

Thanks for the report. I don't have a patch just now, but this does
seem very solvable.

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] writeback: fix writeback cache thrashing

2013-01-04 Thread Fengguang Wu

On Fri, Jan 04, 2013 at 11:26:43PM -0600, Simon Jeons wrote:
> On Sat, 2013-01-05 at 11:26 +0800, Fengguang Wu wrote:
> > > > > Hi Namjae,
> > > > >
> > > > > Why use bdi_stat_error here? What's the meaning of its comment 
> > > > > "maximal
> > > > > error of a stat counter"?
> > > > Hi Simon,
> > > > 
> > > > As you know bdi stats (BDI_RECLAIMABLE, BDI_WRITEBACK …) are kept in
> > > > percpu counters.
> > > > When these percpu counters are incremented/decremented simultaneously
> > > > on multiple CPUs by small amount (individual cpu counter less than
> > > > threshold BDI_STAT_BATCH),
> > > > it is possible that we get approximate value (not exact value) of
> > > > these percpu counters.
> > > > In order, to handle these percpu counter error we have used
> > > > bdi_stat_error. bdi_stat_error is the maximum error which can happen
> > > > in percpu bdi stats accounting.
> > > > 
> > > > bdi_stat(bdi, BDI_RECLAIMABLE);
> > > >  -> This will give approximate value of BDI_RECLAIMABLE by reading
> > > > previous value of percpu count.
> > > > 
> > > > bdi_stat_sum(bdi, BDI_RECLAIMABLE);
> > > >  ->This will give exact value of BDI_RECLAIMABLE. It will take lock
> > > > and add current percpu count of individual CPUs.
> > > >It is not recommended to use it frequently as it is expensive. We
> > > > can better use “bdi_stat” and work with approx value of bdi stats.
> > > > 
> > > 
> > > Hi Namjae, thanks for your clarify.
> > > 
> > > But why compare error stat count to bdi_bground_thresh? What's the
> > 
> > It's not comparing bdi_stat_error to bdi_bground_thresh, but rather,
> > in concept, comparing bdi_stat (with error bound adjustments) to
> > bdi_bground_thresh.
> > 
> > > relationship between them? I also see bdi_stat_error compare to
> > > bdi_thresh/bdi_dirty in function balance_dirty_pages. 
> > 
> 
> Hi Fengguang,
> 
> > Here, it's trying to use bdi_stat_sum(), the accurate (however more
> > costly) version of bdi_stat(), if the error would possibly be large:
> 
> Why error is large use bdi_stat_sum and error is few use bdi_stat?

It's the opposite. Please check this per-cpu counter routine to get an idea:

/*
 * Add up all the per-cpu counts, return the result.  This is a more accurate
 * but much slower version of percpu_counter_read_positive()
 */ 
s64 __percpu_counter_sum(struct percpu_counter *fbc)

> > 
> > if (bdi_thresh < 2 * bdi_stat_error(bdi)) {
> > bdi_reclaimable = bdi_stat_sum(bdi, 
> > BDI_RECLAIMABLE);
> > //...
> > } else {
> > bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
> > //...
> > }
> > 
> > Here the comment should have explained it well:
> > 
> >  * In theory 1 page is enough to keep the comsumer-producer
> >  * pipe going: the flusher cleans 1 page => the task 
> > dirties 1
> >  * more page. However bdi_dirty has accounting errors.  So 
> > use
> 
> Why bdi_dirty has accounting errors?

Because it typically uses bdi_stat() to get the rough sum of the per-cpu
counters.
 
Thanks,
Fengguang

> >  * the larger and more IO friendly bdi_stat_error.
> >  */
> > if (bdi_dirty <= bdi_stat_error(bdi))
> > break;
> > 
> > 
> > Thanks,
> > Fengguang
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH V3 6/8] memcg: Don't account root_mem_cgroup page statistics

2013-01-04 Thread Sha Zhengju

On Fri, Dec 28, 2012 at 9:04 AM, Kamezawa Hiroyuki
 wrote:
> (2012/12/26 2:27), Sha Zhengju wrote:
>> From: Sha Zhengju 
>>
>> If memcg is enabled and no non-root memcg exists, all allocated pages
>> belongs to root_mem_cgroup and go through root memcg statistics routines
>> which brings some overheads. So for the sake of performance, we can give
>> up accounting stats of root memcg for MEM_CGROUP_STAT_FILE_MAPPED/FILE_DIRTY
>> /WRITEBACK and instead we pay special attention while showing root
>> memcg numbers in memcg_stat_show(): as we don't account root memcg stats
>> anymore, the root_mem_cgroup->stat numbers are actually 0. But because of
>> hierachy, figures of root_mem_cgroup may just represent numbers of pages
>> used by its own tasks(not belonging to any other child cgroup). So here we
>> fake these root numbers by using stats of global state and all other memcg.
>> That is for root memcg:
>>   nr(MEM_CGROUP_STAT_FILE_MAPPED) = global_page_state(NR_FILE_MAPPED) -
>>sum_of_all_memcg(MEM_CGROUP_STAT_FILE_MAPPED);
>> Dirty/Writeback pages accounting are in the similar way.
>>
>> Signed-off-by: Sha Zhengju 
>
> isn't it better to use mem_cgroup_is_root() call rather than
> direct comparison (memcg == root_mem_cgroup) ?
>

Okay, it's better to use the wrapper.

> Anyway, Ack to this approach.
>

Thanks for reviewing!


Regards,
Sha
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH RFT] regulator: lp8788-ldo: Use ldo->en_pin to check if regulator is enabled by external pin

2013-01-04 Thread Axel Lin

ldo->en_pin is set iff the regulator is enabled by external pin.

This patch sets ldo->en_pin to NULL if lp8788_gpio_request_ldo_en() fails, then
we can use it to determine if the regulator is controlled by external pin or
register.

lp8788_get_ldo_enable_mode(), lp8788_ldo_ctrl_by_extern_pin() and
lp8788_ldo_is_enabled_by_extern_pin() functions are not used now, remove them.

Signed-off-by: Axel Lin 
---
 drivers/regulator/lp8788-ldo.c |  107 ++--
 1 file changed, 14 insertions(+), 93 deletions(-)

diff --git a/drivers/regulator/lp8788-ldo.c b/drivers/regulator/lp8788-ldo.c
index 416bb60..cd5a14a 100644
--- a/drivers/regulator/lp8788-ldo.c
+++ b/drivers/regulator/lp8788-ldo.c
@@ -88,11 +88,6 @@
 #define ENABLE GPIOF_OUT_INIT_HIGH
 #define DISABLEGPIOF_OUT_INIT_LOW
 
-enum lp8788_enable_mode {
-   REGISTER,
-   EXTPIN,
-};
-
 enum lp8788_ldo_id {
DLDO1,
DLDO2,
@@ -189,114 +184,38 @@ static enum lp8788_ldo_id lp8788_aldo_id[] = {
ALDO10,
 };
 
-/* DLDO 7, 9 and 11, ALDO 1 ~ 5 and 7
-   : can be enabled either by external pin or by i2c register */
-static enum lp8788_enable_mode
-lp8788_get_ldo_enable_mode(struct lp8788_ldo *ldo, enum lp8788_ldo_id id)
-{
-   int ret;
-   u8 val, mask;
-
-   ret = lp8788_read_byte(ldo->lp, LP8788_EN_SEL, );
-   if (ret)
-   return ret;
-
-   switch (id) {
-   case DLDO7:
-   mask =  LP8788_EN_SEL_DLDO7_M;
-   break;
-   case DLDO9:
-   case DLDO11:
-   mask =  LP8788_EN_SEL_DLDO911_M;
-   break;
-   case ALDO1:
-   mask =  LP8788_EN_SEL_ALDO1_M;
-   break;
-   case ALDO2 ... ALDO4:
-   mask =  LP8788_EN_SEL_ALDO234_M;
-   break;
-   case ALDO5:
-   mask =  LP8788_EN_SEL_ALDO5_M;
-   break;
-   case ALDO7:
-   mask =  LP8788_EN_SEL_ALDO7_M;
-   break;
-   default:
-   return REGISTER;
-   }
-
-   return val & mask ? EXTPIN : REGISTER;
-}
-
-static int lp8788_ldo_ctrl_by_extern_pin(struct lp8788_ldo *ldo, int pinstate)
-{
-   struct lp8788_ldo_enable_pin *pin = ldo->en_pin;
-
-   if (!pin)
-   return -EINVAL;
-
-   if (gpio_is_valid(pin->gpio))
-   gpio_set_value(pin->gpio, pinstate);
-
-   return 0;
-}
-
-static int lp8788_ldo_is_enabled_by_extern_pin(struct lp8788_ldo *ldo)
-{
-   struct lp8788_ldo_enable_pin *pin = ldo->en_pin;
-
-   if (!pin)
-   return -EINVAL;
-
-   return gpio_get_value(pin->gpio) ? 1 : 0;
-}
-
 static int lp8788_ldo_enable(struct regulator_dev *rdev)
 {
struct lp8788_ldo *ldo = rdev_get_drvdata(rdev);
-   enum lp8788_ldo_id id = rdev_get_id(rdev);
-   enum lp8788_enable_mode mode = lp8788_get_ldo_enable_mode(ldo, id);
 
-   switch (mode) {
-   case EXTPIN:
-   return lp8788_ldo_ctrl_by_extern_pin(ldo, ENABLE);
-   case REGISTER:
+   if (ldo->en_pin) {
+   gpio_set_value(ldo->en_pin->gpio, ENABLE);
+   return 0;
+   } else {
return regulator_enable_regmap(rdev);
-   default:
-   return -EINVAL;
}
 }
 
 static int lp8788_ldo_disable(struct regulator_dev *rdev)
 {
struct lp8788_ldo *ldo = rdev_get_drvdata(rdev);
-   enum lp8788_ldo_id id = rdev_get_id(rdev);
-   enum lp8788_enable_mode mode = lp8788_get_ldo_enable_mode(ldo, id);
 
-   switch (mode) {
-   case EXTPIN:
-   return lp8788_ldo_ctrl_by_extern_pin(ldo, DISABLE);
-   case REGISTER:
+   if (ldo->en_pin) {
+   gpio_set_value(ldo->en_pin->gpio, DISABLE);
+   return 0;
+   } else {
return regulator_disable_regmap(rdev);
-   default:
-   return -EINVAL;
}
 }
 
 static int lp8788_ldo_is_enabled(struct regulator_dev *rdev)
 {
struct lp8788_ldo *ldo = rdev_get_drvdata(rdev);
-   enum lp8788_ldo_id id = rdev_get_id(rdev);
-   enum lp8788_enable_mode mode = lp8788_get_ldo_enable_mode(ldo, id);
 
-   switch (mode) {
-   case EXTPIN:
-   return lp8788_ldo_is_enabled_by_extern_pin(ldo);
-   case REGISTER:
+   if (ldo->en_pin)
+   return gpio_get_value(ldo->en_pin->gpio) ? 1 : 0;
+   else
return regulator_is_enabled_regmap(rdev);
-   default:
-   return -EINVAL;
-   }
 }
 
 static int lp8788_ldo_enable_time(struct regulator_dev *rdev)
@@ -696,8 +615,10 @@ static int lp8788_config_ldo_enable_mode(struct 
platform_device *pdev,
ldo->en_pin = pdata->ldo_pin[enable_id];
 
ret = lp8788_gpio_request_ldo_en(pdev, ldo, enable_id);
-   if (ret)
+   if (ret) {
+   ldo->en_pin = NULL;
goto set_default_ldo_enable_mode;
+   }
 
return

Re: [PATCH 4/4] input: keyboard: tegra: remove default key mapping

2013-01-04 Thread Laxman Dewangan


On Saturday 05 January 2013 01:22 AM, Stephen Warren wrote:

On 01/04/2013 04:02 AM, Laxman Dewangan wrote:

Tegra KBC driver have the default key mapping for 16x8 configuration.
The key mapping can be provided through platform data or through DT
and the mapping varies from platform to platform, hence this default
mapping is not so useful. Remove the default mapping to reduce the code
lines of the driver.

Overall, I agree with the intent of this change, but I /think/ the
default keymap described here actually /is/ useful for the
Seaboard/Springbank platforms. I guess the KBC driver isn't in the
device tree for that (or any) board yet, so removing this keymap isn't
really a regression, but it would be nice if you could come up with a
change to add suitable keymaps to the device tree files too.


OK, I will push the patch for enabling keys in seaboard along with the 
other changes for DT files for tegra.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v4 3/5] KVM: x86: clean up reexecute_instruction

2013-01-04 Thread Xiao Guangrong

On 01/05/2013 06:21 AM, Marcelo Tosatti wrote:
> On Fri, Jan 04, 2013 at 09:55:40PM +0800, Xiao Guangrong wrote:
>> Little cleanup for reexecute_instruction, also use gpa_to_gfn in
>> retry_instruction
>>
>> Signed-off-by: Xiao Guangrong 
>> ---
>>  arch/x86/kvm/x86.c |   13 ++---
>>  1 files changed, 6 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 1c9c834..ad39018 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -4761,19 +4761,18 @@ static bool reexecute_instruction(struct kvm_vcpu 
>> *vcpu, gva_t gva)
>>  if (tdp_enabled)
>>  return false;
>>
>> +gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
>> +if (gpa == UNMAPPED_GVA)
>> +return true; /* let cpu generate fault */
>> +
> 
> Why change from _system to _read here? Purely cleanup patch should
> have no logical changes.

Ouch, my mistake, will drop this change.

> 
> BTW, there is not much logic in using reexecute_instruction() at
> for x86_decode_insn (checks in reexecute_instruction() assume 
> write to the cr2, for instance).
> Fault propagation for x86_decode_insn seems completly broken
> (which is perhaps why reexecute_instruction() there survived).

Currently, reexecute_instruction can work only if it is called on the page
fault path, where cr2 is valid. On other paths cr2 is 0, which is never
mapped in the guest since it is the NULL pointer, so reexecute_instruction
always retries the instruction.

Yes, as you pointed out, it would be better if the fault address could be
obtained from x86_decode_insn. I will consider it later.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: mmap() scalability in the presence of the MAP_POPULATE flag

2013-01-04 Thread Roman Dubtsov

On Fri, 2013-01-04 at 03:57 -0800, Michel Lespinasse wrote:
> On Fri, Jan 04, 2013 at 12:09:37AM +0700, Roman Dubtsov wrote:
> > On Wed, 2013-01-02 at 16:09 -0800, Michel Lespinasse wrote:
> > > > Is there an interest in fixing this or concurrent mmaps() from the same
> > > > process are too much of a corner case to worry about it?
> > > 
> > > Funny this comes up again. I actually have a patch series that is
> > > supposed to do that:
> > > [PATCH 0/9] Avoid populating unbounded num of ptes with mmap_sem held
> > > 
> > > However, the patches are still pending, didn't get much review
> > > (probably not enough for Andrew to take them at this point), and I
> > > think everyone forgot about them during the winter break.
> > > 
> > > Care to have a look at that thread and see if it works for you ?
> > > 
> > > (caveat: you will possibly also need "[PATCH 10/9] mm: make
> > > do_mmap_pgoff return populate as a size in bytes, not as a bool" to
> > > make the series actually work for you)
> > 
> > I applied the patches on top of 3.7.1. Here're the results for 4 threads
> > concurrently mmap()-ing 10 64MB buffers in a loop without munmap()-s.
> > The data is from a Nehalem i7-920 single-socket 4-core CPU. I've also
> > added the older data I have for the 3.6.11 (patched and not) for
> > reference.
> > 
> > 3.6.11 vanilla, do not populate: 0.001 seconds
> > 3.6.11 vanilla, populate via a loop: 0.216 seconds
> > 3.6.11 vanilla, populate via MAP_POPULATE: 0.358 seconds 
> > 
> > 3.6.11 + crude patch, do not populate: 0.002 seconds
> > 3.6.11 + crude patch, populate via loop: 0.215 seconds
> > 3.6.11 + crude patch, populate via MAP_POPULATE: 0.217 seconds
> > 
> > 3.7.1 vanilla, do not populate: 0.001 seconds
> > 3.7.1 vanilla, populate via a loop: 0.216 seconds
> > 3.7.1 vanilla, populate via MAP_POPULATE: 0.411 seconds
> > 
> > 3.7.1 + patch series, do not populate: 0.001 seconds
> > 3.7.1 + patch series, populate via loop: 0.216 seconds
> > 3.7.1 + patch series, populate via MAP_POPULATE: 0.273 seconds
> > 
> > So, the patch series mentioned above do improve performance but as far
> > as I can read the benchmarking data there's still some performance left
> > on the table.
> 
> Interesting. I expect you are using anon memory, so it's likely that
> mm_populate() holds the mmap_sem read side for the entire duration of
> the 64MB populate.
> 
> Just curious, does the following help ?
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index e4ab66b94bb8..f65a4b3b2141 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1627,6 +1627,12 @@ static inline int stack_guard_page(struct 
> vm_area_struct *vma, unsigned long add
>  stack_guard_page_end(vma, addr+PAGE_SIZE);
>  }
>  
> +/* not upstreamable as is, just for the sake of testing */
> +static inline int rwsem_is_contended(struct rw_semaphore *sem)
> +{
> + return (sem->count < 0);
> +}
> +
>  /**
>   * __get_user_pages() - pin user pages in memory
>   * @tsk: task_struct of target task
> @@ -1854,6 +1860,11 @@ next_page:
>   i++;
>   start += PAGE_SIZE;
>   nr_pages--;
> + if (nonblocking && rwsem_is_contended(&mm->mmap_sem)) {
> + up_read(&mm->mmap_sem);
> + *nonblocking = 0;
> + return i;
> + }
>   } while (nr_pages && start < vma->vm_end);
>   } while (nr_pages);
>   return i;
> 
> Linus didn't like rwsem_is_contended() when I implemented the mlock
> side of this a couple years ago, but maybe we can change his mind now.
> 
> If this doesn't help, could you please send me your test case ? I
> think you described enough of it that I would be able to reproduce it
> given some time, but it's just easier if you send me a short C file :)
> 

It does not, the results are more or less the same. I've attached my
testcase. It does map anonymous memory. It also uses OpenMP for
threading because I'm lazy, so it requires passing -fopenmp to gcc and
the number of threads it runs is defined via OMP_NUM_THREADS environment
variable. There are also two macros that influence test's behavior:

- POPULATE_VIA_LOOP -- makes the test populate memory using a loop
- POPULATE_VIA_MMAP -- makes the test populate memory via MAP_POPULATE

If none of the macros are defined, the test does not populate memory.

#include 
#include 
#include 

#include "omp.h"

#ifndef BUF_SIZE
#define BUF_SIZE (64 * 1024 * 1024)
#endif

#ifndef PAGE_SIZE
#define PAGE_SIZE (4 * 1024)
#endif

#ifdef POPULATE_VIA_MMAP
#define MMAP_FLAGS (MAP_ANONYMOUS | MAP_PRIVATE | MAP_POPULATE)
#else
#define MMAP_FLAGS (MAP_ANONYMOUS | MAP_PRIVATE)
#endif

int main(int argc, char **argv)
{
#pragma omp parallel
	{
	}

	double t0 = omp_get_wtime();
#pragma omp parallel
	{
		int i;
		for (i = 0; i < 10; i++) {
			char *p = mmap(NULL, BUF_SIZE,
PROT_READ | PROT_WRITE, MMAP_FLAGS, -1, 0);
#ifdef POPULATE_VIA_LOOP

Re: [PATCH 0/6] Introducing Device Tree Overlays

2013-01-04 Thread Joel A Fernandes

Hi Richard,

On Fri, Jan 4, 2013 at 9:35 PM, Richard Cochran
 wrote:
> On Fri, Jan 04, 2013 at 09:31:04PM +0200, Pantelis Antoniou wrote:
>> The following patchset introduces Device Tree overlays, a method
>> of dynamically altering the kernel's live Device Tree.
>
> It would be nice to know the motivation for this code.
>
> What is the use case? What problem or issue is being addressed?

The problem being addressed is discussed in this thread:
http://permalink.gmane.org/gmane.linux.kernel/1389017

Regards,
Joel
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] mm: thp: Acquire the anon_vma rwsem for lock during split

2013-01-04 Thread Zhouping Liu


On 01/04/2013 10:08 PM, Mel Gorman wrote:

Zhouping, please test this patch.


Tested it, the issue is gone with following patch.

Tested-by: Zhouping Liu 

Thanks,
Zhouping



Andrea and Hugh, any comments on whether this could be improved?

---8<---
mm: thp: Acquire the anon_vma rwsem for lock during split

Zhouping Liu reported the following against 3.8-rc1 when running a mmap
testcase from LTP.

[  588.143072] mapcount 0 page_mapcount 3
[  588.147471] [ cut here ]
[  588.152856] kernel BUG at mm/huge_memory.c:1798!
[  588.158125] invalid opcode:  [#1] SMP
[  588.162882] Modules linked in: ip6table_filter ip6_tables ebtable_nat 
ebtables bnep bluetooth rfkill iptable_mangle ipt_REJECT nf_conntrack_ipv4 
nf_defrag_ipv4 xt_conntrack nf_conntrack iptable_filter
+ip_tables be2iscsi iscsi_boot_sysfs bnx2i cnic uio cxgb4i cxgb4 cxgb3i cxgb3 
mdio libcxgbi ib_iser rdma_cm ib_addr iw_cm ib_cm ib_sa ib_mad ib_core 
iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi vfat fat
+dm_mirror dm_region_hash dm_log dm_mod cdc_ether iTCO_wdt i7core_edac coretemp 
usbnet iTCO_vendor_support mii crc32c_intel edac_core lpc_ich shpchp ioatdma 
mfd_core i2c_i801 pcspkr serio_raw bnx2 microcode dca
+vhost_net tun macvtap macvlan kvm_intel kvm uinput mgag200 sr_mod cdrom 
i2c_algo_bit sd_mod drm_kms_helper crc_t10dif ata_generic pata_acpi ttm 
ata_piix drm libata i2c_core megaraid_sas

[  588.246517] CPU 1
[  588.248636] Pid: 23217, comm: mmap10 Not tainted 3.8.0-rc1mainline+ #17 IBM 
IBM System x3400 M3 Server -[7379I08]-/69Y4356
[  588.262171] RIP: 0010:[]  [] 
__split_huge_page+0x677/0x6d0
[  588.272067] RSP: :88017a03fc08  EFLAGS: 00010293
[  588.278235] RAX: 0003 RBX: 88027a6c22e0 RCX: 34d2
[  588.286394] RDX: 748b RSI: 0046 RDI: 0246
[  588.294216] RBP: 88017a03fcb8 R08: 819d2440 R09: 054a
[  588.302441] R10: 00aa R11:  R12: 
[  588.310495] R13: 7f4f11a0 R14: 880179e96e00 R15: ea0005c08000
[  588.318640] FS:  7f4f11f4a740() GS:88017bc2() 
knlGS:
[  588.327894] CS:  0010 DS:  ES:  CR0: 8005003b
[  588.334569] CR2: 0037e9ebb404 CR3: 00017a436000 CR4: 07e0
[  588.342718] DR0:  DR1:  DR2: 
[  588.350861] DR3:  DR6: 0ff0 DR7: 0400
[  588.359134] Process mmap10 (pid: 23217, threadinfo 88017a03e000, task 
880172dd32e0)
[  588.368667] Stack:
[  588.370960]  88017a540ec8 88017a03fc20 816017b5 
88017a03fc88
[  588.379566]  812fa014  880279ebd5c0 
f4f11a4c
[  588.388150]  0007f4f11f49 0007f4f11a00 88017a540ef0 
88017a540ee8
[  588.396711] Call Trace:
[  588.455106]  [] ? rwsem_down_read_failed+0x15/0x17
[  588.518106]  [] ? call_rwsem_down_read_failed+0x14/0x30
[  588.580897]  [] ? down_read+0x24/0x2b
[  588.642630]  [] split_huge_page+0x68/0xb0
[  588.703814]  [] __split_huge_page_pmd+0x134/0x330
[  588.766064]  [] ? pte_alloc_one+0x37/0x50
[  588.826460]  [] split_huge_page_pmd_mm+0x51/0x60
[  588.887746]  [] split_huge_page_address+0x3b/0x50
[  588.948673]  [] __vma_adjust_trans_huge+0x9c/0xf0
[  589.008660]  [] vma_adjust+0x684/0x750
[  589.066328]  [] __split_vma.isra.28+0x1fa/0x220
[  589.123497]  [] ? __switch_to+0x181/0x4a0
[  589.180704]  [] do_munmap+0xf9/0x420
[  589.237461]  [] ? __schedule+0x3cc/0x7b0
[  589.294520]  [] vm_munmap+0x4e/0x70
[  589.350784]  [] sys_munmap+0x2b/0x40
[  589.406971]  [] system_call_fastpath+0x16/0x1b

Alexander Beregalov reported a very similar bug and Hillf Danton identified
that commit 5a505085 (mm/rmap: Convert the struct anon_vma::mutex to an
rwsem) and commit 4fc3f1d6 (mm/rmap, migration: Make rmap_walk_anon()
and try_to_unmap_anon() more scalable) were likely the problem. Reverting
these commits was reported to solve the problem.

Despite the reason for these commits, NUMA balancing is not the direct
source of the problem. split_huge_page() expected the anon_vma lock to be
exclusive to serialise the whole split operation. Ordinarily it is expected
that the anon_vma lock would only be required when updating the avcs but
THP also uses it. The locking requirements for THP are complex and there
is some overlap but broadly speaking they include the following

1. mmap_sem for read or write prevents THPs being created underneath
2. anon_vma is taken for write if collapsing a huge page
3. mm->page_table_lock should be taken when checking if pmd_trans_huge as
split_huge_page can run in parallel
4. wait_split_huge_page uses anon_vma taken for write mode to serialise
against other THP operations
5. compound_lock is used to serialise between
__split_huge_page_refcount() and gup

split_huge_page takes anon_vma for read but that does not serialise against
parallel

RE: [PATCH] tools: fix a typo in hv_set_ifconfig.sh

2013-01-04 Thread KY Srinivasan



> -Original Message-
> From: Jason Wang [mailto:jasow...@redhat.com]
> Sent: Saturday, January 05, 2013 12:03 AM
> To: gre...@linuxfoundation.org; KY Srinivasan; Haiyang Zhang; linux-
> ker...@vger.kernel.org
> Cc: tho...@redhat.com; Jason Wang
> Subject: [PATCH] tools: fix a typo in hv_set_ifconfig.sh
> 
> Signed-off-by: Jason Wang 
Acked-by: K. Y. Srinivasan 

> ---
>  tools/hv/hv_set_ifconfig.sh |2 +-
>  1 files changed, 1 insertions(+), 1 deletions(-)
> 
> diff --git a/tools/hv/hv_set_ifconfig.sh b/tools/hv/hv_set_ifconfig.sh
> index 3e9427e..daf7ec0 100755
> --- a/tools/hv/hv_set_ifconfig.sh
> +++ b/tools/hv/hv_set_ifconfig.sh
> @@ -65,4 +65,4 @@ cp $1 /etc/sysconfig/network-scripts/
>  interface=$(echo $1 | awk -F - '{ print $2 }')
> 
>  /sbin/ifdown $interface 2>/dev/null
> -/sbin/ifup $interfac 2>/dev/null
> +/sbin/ifup $interface 2>/dev/null
> --
> 1.7.1
> 
> 


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] Perf: add anonymous huge page recognition

2013-01-04 Thread Joshua Zhu

Judging anonymous memory's vm_area_struct,
perf_mmap_event's filename will be set to
"//anon" indicating this vma belongs to
anonymous memory.
Once hugepage is used, vma's vm_file points
to hugetlbfs. In this way, this vma will not
be regarded as anonymous memory by is_anon_memory()
in perf user space utility.

Signed-off-by: Joshua Zhu 
---
 tools/perf/util/map.c |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c
index 0328d45..ff94425 100644
--- a/tools/perf/util/map.c
+++ b/tools/perf/util/map.c
@@ -19,7 +19,8 @@ const char *map_type__name[MAP__NR_TYPES] = {
 
 static inline int is_anon_memory(const char *filename)
 {
-   return strcmp(filename, "//anon") == 0;
+   return !strcmp(filename, "//anon") ||
+  !strcmp(filename, "/anon_hugepage (deleted)");
 }
 
 static inline int is_no_dso_memory(const char *filename)
-- 
1.7.0.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] writeback: fix writeback cache thrashing

2013-01-04 Thread Simon Jeons

On Sat, 2013-01-05 at 11:26 +0800, Fengguang Wu wrote:
> > > > Hi Namjae,
> > > >
> > > > Why use bdi_stat_error here? What's the meaning of its comment "maximal
> > > > error of a stat counter"?
> > > Hi Simon,
> > > 
> > > As you know bdi stats (BDI_RECLAIMABLE, BDI_WRITEBACK …) are kept in
> > > percpu counters.
> > > When these percpu counters are incremented/decremented simultaneously
> > > on multiple CPUs by small amount (individual cpu counter less than
> > > threshold BDI_STAT_BATCH),
> > > it is possible that we get approximate value (not exact value) of
> > > these percpu counters.
> > > In order, to handle these percpu counter error we have used
> > > bdi_stat_error. bdi_stat_error is the maximum error which can happen
> > > in percpu bdi stats accounting.
> > > 
> > > bdi_stat(bdi, BDI_RECLAIMABLE);
> > >  -> This will give approximate value of BDI_RECLAIMABLE by reading
> > > previous value of percpu count.
> > > 
> > > bdi_stat_sum(bdi, BDI_RECLAIMABLE);
> > >  ->This will give exact value of BDI_RECLAIMABLE. It will take lock
> > > and add current percpu count of individual CPUs.
> > >It is not recommended to use it frequently as it is expensive. We
> > > can better use “bdi_stat” and work with approx value of bdi stats.
> > > 
> > 
> > Hi Namjae, thanks for your clarify.
> > 
> > But why compare error stat count to bdi_bground_thresh? What's the
> 
> It's not comparing bdi_stat_error to bdi_bground_thresh, but rather,
> in concept, comparing bdi_stat (with error bound adjustments) to
> bdi_bground_thresh.
> 
> > relationship between them? I also see bdi_stat_error compare to
> > bdi_thresh/bdi_dirty in function balance_dirty_pages. 
> 

Hi Fengguang,

> Here, it's trying to use bdi_stat_sum(), the accurate (however more
> costly) version of bdi_stat(), if the error would possibly be large:

Why use bdi_stat_sum when the error is large, and bdi_stat when the error is small?

> 
> if (bdi_thresh < 2 * bdi_stat_error(bdi)) {
> bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
> //...
> } else {
> bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
> //...
> }
> 
> Here the comment should have explained it well:
> 
>  * In theory 1 page is enough to keep the consumer-producer
>  * pipe going: the flusher cleans 1 page => the task dirties 1
>  * more page. However bdi_dirty has accounting errors.  So use

Why does bdi_dirty have accounting errors?

>  * the larger and more IO friendly bdi_stat_error.
>  */
> if (bdi_dirty <= bdi_stat_error(bdi))
> break;
> 
> 
> Thanks,
> Fengguang


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 22/49] mm: mempolicy: Add MPOL_MF_LAZY

2013-01-04 Thread Simon Jeons

On Fri, 2012-12-07 at 10:23 +, Mel Gorman wrote:
> From: Lee Schermerhorn 
> 
> NOTE: Once again there is a lot of patch stealing and the end result
>   is sufficiently different that I had to drop the signed-offs.
>   Will re-add if the original authors are ok with that.
> 
> This patch adds another mbind() flag to request "lazy migration".  The
> flag, MPOL_MF_LAZY, modifies MPOL_MF_MOVE* such that the selected
> pages are marked PROT_NONE. The pages will be migrated in the fault
> path on "first touch", if the policy dictates at that time.
> 
> "Lazy Migration" will allow testing of migrate-on-fault via mbind().
> Also allows applications to specify that only subsequently touched
> pages be migrated to obey new policy, instead of all pages in range.
> This can be useful for multi-threaded applications working on a
> large shared data area that is initialized by an initial thread
> resulting in all pages on one [or a few, if overflowed] nodes.
> After PROT_NONE, the pages in regions assigned to the worker threads
> will be automatically migrated local to the threads on 1st touch.
> 
> Signed-off-by: Mel Gorman 
> Reviewed-by: Rik van Riel 
> ---
>  include/linux/mm.h |5 ++
>  include/uapi/linux/mempolicy.h |   13 ++-
>  mm/mempolicy.c |  185 
> 
>  3 files changed, 185 insertions(+), 18 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index fa16152..471185e 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1551,6 +1551,11 @@ static inline pgprot_t vm_get_page_prot(unsigned long 
> vm_flags)
>  }
>  #endif
>  
> +#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
> +void change_prot_numa(struct vm_area_struct *vma,
> + unsigned long start, unsigned long end);
> +#endif
> +
>  struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long 
> addr);
>  int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
>   unsigned long pfn, unsigned long size, pgprot_t);
> diff --git a/include/uapi/linux/mempolicy.h b/include/uapi/linux/mempolicy.h
> index 472de8a..6a1baae 100644
> --- a/include/uapi/linux/mempolicy.h
> +++ b/include/uapi/linux/mempolicy.h
> @@ -49,9 +49,16 @@ enum mpol_rebind_step {
>  
>  /* Flags for mbind */
>  #define MPOL_MF_STRICT   (1<<0)  /* Verify existing pages in the mapping 
> */
> -#define MPOL_MF_MOVE (1<<1)  /* Move pages owned by this process to conform 
> to mapping */
> -#define MPOL_MF_MOVE_ALL (1<<2)  /* Move every page to conform to 
> mapping */
> -#define MPOL_MF_INTERNAL (1<<3)  /* Internal flags start here */
> +#define MPOL_MF_MOVE  (1<<1) /* Move pages owned by this process to conform
> +to policy */
> +#define MPOL_MF_MOVE_ALL (1<<2)  /* Move every page to conform to policy 
> */
> +#define MPOL_MF_LAZY  (1<<3) /* Modifies '_MOVE:  lazy migrate on fault */
> +#define MPOL_MF_INTERNAL (1<<4)  /* Internal flags start here */
> +
> +#define MPOL_MF_VALID(MPOL_MF_STRICT   | \
> +  MPOL_MF_MOVE | \
> +  MPOL_MF_MOVE_ALL | \
> +  MPOL_MF_LAZY)
>  
>  /*
>   * Internal flags that share the struct mempolicy flags word with
> diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> index df1466d..51d3ebd 100644
> --- a/mm/mempolicy.c
> +++ b/mm/mempolicy.c
> @@ -90,6 +90,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include 
>  #include 
> @@ -565,6 +566,145 @@ static inline int check_pgd_range(struct vm_area_struct 
> *vma,
>   return 0;
>  }
>  
> +#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE
> +/*
> + * Here we search for not shared page mappings (mapcount == 1) and we
> + * set up the pmd/pte_numa on those mappings so the very next access
> + * will fire a NUMA hinting page fault.
> + */
> +static int
> +change_prot_numa_range(struct mm_struct *mm, struct vm_area_struct *vma,
> + unsigned long address)
> +{
> + pgd_t *pgd;
> + pud_t *pud;
> + pmd_t *pmd;
> + pte_t *pte, *_pte;
> + struct page *page;
> + unsigned long _address, end;
> + spinlock_t *ptl;
> + int ret = 0;
> +
> + VM_BUG_ON(address & ~PAGE_MASK);
> +
> + pgd = pgd_offset(mm, address);
> + if (!pgd_present(*pgd))
> + goto out;
> +
> + pud = pud_offset(pgd, address);
> + if (!pud_present(*pud))
> + goto out;
> +
> + pmd = pmd_offset(pud, address);
> + if (pmd_none(*pmd))
> + goto out;
> +
> + if (pmd_trans_huge_lock(pmd, vma) == 1) {
> + int page_nid;
> + ret = HPAGE_PMD_NR;
> +
> + VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> +
> + if (pmd_numa(*pmd)) {
> + spin_unlock(&mm->page_table_lock);
> + goto out;
> + }
> +
> + page = pmd_page(*pmd);
> +
> + /* only check non-shared

[PATCH] tools: fix a typo in hv_set_ifconfig.sh

2013-01-04 Thread Jason Wang

Signed-off-by: Jason Wang 
---
 tools/hv/hv_set_ifconfig.sh |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/tools/hv/hv_set_ifconfig.sh b/tools/hv/hv_set_ifconfig.sh
index 3e9427e..daf7ec0 100755
--- a/tools/hv/hv_set_ifconfig.sh
+++ b/tools/hv/hv_set_ifconfig.sh
@@ -65,4 +65,4 @@ cp $1 /etc/sysconfig/network-scripts/
 interface=$(echo $1 | awk -F - '{ print $2 }')
 
 /sbin/ifdown $interface 2>/dev/null
-/sbin/ifup $interfac 2>/dev/null
+/sbin/ifup $interface 2>/dev/null
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH V3 4/8] memcg: add per cgroup dirty pages accounting

2013-01-04 Thread Sha Zhengju

On Wed, Jan 2, 2013 at 6:44 PM, Michal Hocko  wrote:
> On Wed 26-12-12 01:26:07, Sha Zhengju wrote:
>> From: Sha Zhengju 
>>
>> This patch adds memcg routines to count dirty pages, which allows memory 
>> controller
>> to maintain an accurate view of the amount of its dirty memory and can 
>> provide some
>> info for users while cgroup's direct reclaim is working.
>
> I guess you meant targeted resp. (hard/soft) limit reclaim here,
> right? It is true that this is direct reclaim but it is not clear to me

Yes, I meant memcg hard/soft reclaim here which is triggered directly
by allocation and is distinct from background kswapd reclaim (global).

> why the usefulness should be limited to the reclaim for users. I would
> understand this if the users was in fact in-kernel users.
>

One of the reasons I'm trying to account the dirty pages is to get a
broader overall view of memory usage because memcg hard/soft
reclaim may have an effect on the response time of user applications.
Yeah, the beneficiary can be application administrator or kernel users.  :P

> [...]
>> To prevent AB/BA deadlock mentioned by Greg Thelen in previous version
>> (https://lkml.org/lkml/2012/7/30/227), we adjust the lock order:
>> ->private_lock --> mapping->tree_lock --> memcg->move_lock.
>> So we need to make mapping->tree_lock ahead of TestSetPageDirty in 
>> __set_page_dirty()
>> and __set_page_dirty_nobuffers(). But in order to avoid useless spinlock 
>> contention,
>> a preliminary PageDirty() check is added.
>
> But there is another AA deadlock here I believe.
> page_remove_rmap
>   mem_cgroup_begin_update_page_stat <<< 1
>   set_page_dirty
> __set_page_dirty_buffers
>   __set_page_dirty
> mem_cgroup_begin_update_page_stat   <<< 2
>   move_lock_mem_cgroup
> spin_lock_irqsave(&memcg->move_lock, *flags);
>
> mem_cgroup_begin_update_page_stat is not recursive wrt. locking AFAICS
> because we might race with the moving charges:
> CPU0CPU1
> page_remove_rmap
> mem_cgroup_can_attach
>   mem_cgroup_begin_update_page_stat (1)
> rcu_read_lock
>   mem_cgroup_start_move
> atomic_inc(&memcg_moving)
> 
> atomic_inc(&memcg->moving_account)
> synchronize_rcu
> __mem_cgroup_begin_update_page_stat
>   mem_cgroup_stolen <<< TRUE
>   move_lock_mem_cgroup
>   [...]
> mem_cgroup_begin_update_page_stat (2)
>   __mem_cgroup_begin_update_page_stat
> mem_cgroup_stolen <<< still TRUE
> move_lock_mem_cgroup  <<< DEADLOCK
>   [...]
>   mem_cgroup_end_update_page_stat
> rcu_unlock
>   # wake up from 
> synchronize_rcu
> [...]
> mem_cgroup_move_task
>   mem_cgroup_move_charge
> walk_page_range
>   mem_cgroup_move_account
> move_lock_mem_cgroup
>
>
> Maybe I have missed some other locking which would prevent this from
> happening but the locking relations are really complicated in this area
> so if mem_cgroup_{begin,end}_update_page_stat might be called
> recursively then we need a fat comment which justifies that.
>

Ohhh...good catching!  I didn't notice there is a recursive call of
mem_cgroup_{begin,end}_update_page_stat in page_remove_rmap().
The mem_cgroup_{begin,end}_update_page_stat() design has depressed
me a lot recently as the lock granularity is a little bigger than I thought.
Not only the resource but also some code logic is in the range of locking
which may be deadlock prone. The problem still exists if we are trying to
add stat account of other memcg page later, may I make bold to suggest
that we dig into the lock again...

But with regard to the current lock implementation, I doubt if we can
account MEM_CGROUP_STAT_FILE_{MAPPED, DIRTY} in one breath and just
try to get move_lock once in the beginning. IMHO we can make
mem_cgroup_{begin,end}_update_page_stat() recursion-aware and what I'm
thinking now is changing memcg->move_lock to rw-spinlock from the
original spinlock:
mem_cgroup_{begin,end}_update_page_stat() try to get the read lock which make it
reenterable and memcg moving task side try to get the write spinlock.
Then the race may be following:

CPU0CPU1
page_remove_rmap
mem_cgroup_can_attach
  mem_cgroup_begin_update_page_stat (1)
rcu_read_lock

Re: no config opt for k8temp in 3.6.11

2013-01-04 Thread Steven A. DuChene

When I do a grep for K8 in my .config file all I get is the following:

> grep -i k8 /usr/local/hugedisk/src/linux-3.6.11/.config
CONFIG_MK8=y
CONFIG_X86_POWERNOW_K8=y

When I try your grep command I get:

> grep -E "^CONFIG_SENSORS" /usr/local/hugedisk/src/linux-3.6.11/.config|grep K
CONFIG_SENSORS_K10TEMP=m

I will try doing a make mrproper and see if anything changes.

Thanks for the reply.
--
Steven DuChene


-Original Message-
>From: Borislav Petkov 
>Sent: Jan 4, 2013 3:35 PM
>To: "Steven A. DuChene" 
>Cc: linux-kernel@vger.kernel.org
>Subject: Re: no config opt for k8temp in 3.6.11
>
>On Fri, Jan 04, 2013 at 09:25:14AM -0500, Steven A. DuChene wrote:
>> I have a few systems with AMD processors. One is a AMD Phenom II based
>> system which uses a k10temp kernel module to get the CPU temperature.
>> Another system has an Athlon64 processor which apparently is supposed
>> to use a k8temp module to do the same thing. I have built a 3.6.11
>> kernel for each system that otherwise runs OpenSuSE-11.X. On the
>> Phenom system the k10temp module gets built and performs as
>> expected. However on the Athlon system even though I see the k8temp.c
>> source file in the drivers/hwmon directory there does not seem to be
>> a way in the kernel config files available from the 3.6.11 kernel
>> sources to actually select that the k8temp module should be built. Is
>> this some oversight in the makeup of the kernel config files or am I
>> missing something obvious?
>
>That's strange, I can select all the AMD power drivers on 3.6.11:
>
>$ grep -E "^CONFIG_SENSORS" .config
>CONFIG_SENSORS_K8TEMP=m
>CONFIG_SENSORS_K10TEMP=m
>CONFIG_SENSORS_FAM15H_POWER=m
>
>Can you do 'make mrproper' and try regenerating your config? But
>remember, mrproper will remove your .config so you might want to save
>that upfront in case you need it.
>
>HTH.
>
>-- 
>Regards/Gruss,
>Boris.



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v7u1 26/31] x86: Don't enable swiotlb if there is not enough ram for it

2013-01-04 Thread Yinghai Lu

On Fri, Jan 4, 2013 at 6:02 PM, Shuah Khan  wrote:
> I applied your patch to 3.6.11 and changed the panic() to pr_info()
> and also changed enough_mem_for_swiotlb() to always return false to
> simulate not enough memory condition as this system does have enough
> memory.
>
> So at least on this AMD system, your patch will result in a panic.

ok, thanks for testing.

if enough_mem_for_swiotlb() return false really,  allocating buffer
for swiotlb with bootmem would panic already, right?

so this patch just delay the panic a while for AMD system with
unhandled devices by IOMMU.

Thanks

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[ANNOUNCE] 3.2.36-rt54

2013-01-04 Thread Steven Rostedt


Dear RT Folks,

I'm pleased to announce the 3.2.36-rt54 stable release.


This release is just an update to the new stable 3.2.36 version
and no RT specific changes have been made.


You can get this release via the git tree at:

  git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-stable-rt.git

  Head SHA1: cb81da1cc94200564e52411e0f31f3f867a80141


Or to build 3.2.36-rt54 directly, the following patches should be applied:

  http://www.kernel.org/pub/linux/kernel/v3.x/linux-3.2.tar.xz

  http://www.kernel.org/pub/linux/kernel/v3.x/patch-3.2.36.xz

  
http://www.kernel.org/pub/linux/kernel/projects/rt/3.2/patch-3.2.36-rt54.patch.xz



Enjoy,

-- Steve



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 0/6] Introducing Device Tree Overlays

2013-01-04 Thread Richard Cochran

On Fri, Jan 04, 2013 at 09:31:04PM +0200, Pantelis Antoniou wrote:
> The following patchset introduces Device Tree overlays, a method
> of dynamically altering the kernel's live Device Tree.

It would be nice to know the motivation for this code.

What is the use case? What problem or issue is being addressed?

Thanks,
Richard
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC PATCH 1/2] efi: Make 'efi_enabled' a function to

2013-01-04 Thread Guo Chao

query EFI facilities
Reply-To: <1357219085-4312-2-git-send-email-m...@console-pimps.org> 
In-Reply-To: <1357219085-4312-2-git-send-email-m...@console-pimps.org> 

> diff --git a/init/main.c b/init/main.c
> index e33e09d..e71d924 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -604,7 +604,7 @@ asmlinkage void __init start_kernel(void)
>   pidmap_init();
>   anon_vma_init();
>  #ifdef CONFIG_X86
> - if (efi_enabled)
> + if (efi_enabled(EFI_RUNTIME_SERVICES))
>   efi_enter_virtual_mode();
>  #endif
>   thread_info_cache_init();
> @@ -632,7 +632,7 @@ asmlinkage void __init start_kernel(void)
>   acpi_early_init(); /* before LAPIC and SMP init */
>   sfi_init_late();
> 
> - if (efi_enabled) {
> + if (efi_enabled(EFI_RUNTIME_SERVICES)) {
>   efi_late_init();
>   efi_free_boot_services();
>   }

I just wonder why we compile efi code away explicitly by CONFIG_X86 in
one place and implicitly by if (0) in another place, in the same
function.

Thanks,
Guo Chao

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] writeback: fix writeback cache thrashing

2013-01-04 Thread Fengguang Wu

> > > Hi Namjae,
> > >
> > > Why use bdi_stat_error here? What's the meaning of its comment "maximal
> > > error of a stat counter"?
> > Hi Simon,
> > 
> > As you know bdi stats (BDI_RECLAIMABLE, BDI_WRITEBACK …) are kept in
> > percpu counters.
> > When these percpu counters are incremented/decremented simultaneously
> > on multiple CPUs by small amount (individual cpu counter less than
> > threshold BDI_STAT_BATCH),
> > it is possible that we get approximate value (not exact value) of
> > these percpu counters.
> > In order, to handle these percpu counter error we have used
> > bdi_stat_error. bdi_stat_error is the maximum error which can happen
> > in percpu bdi stats accounting.
> > 
> > bdi_stat(bdi, BDI_RECLAIMABLE);
> >  -> This will give approximate value of BDI_RECLAIMABLE by reading
> > previous value of percpu count.
> > 
> > bdi_stat_sum(bdi, BDI_RECLAIMABLE);
> >  ->This will give exact value of BDI_RECLAIMABLE. It will take lock
> > and add current percpu count of individual CPUs.
> >It is not recommended to use it frequently as it is expensive. We
> > can better use “bdi_stat” and work with approx value of bdi stats.
> > 
> 
> Hi Namjae, thanks for your clarify.
> 
> But why compare error stat count to bdi_bground_thresh? What's the

It's not comparing bdi_stat_error to bdi_bground_thresh, but rather,
in concept, comparing bdi_stat (with error bound adjustments) to
bdi_bground_thresh.

> relationship between them? I also see bdi_stat_error compare to
> bdi_thresh/bdi_dirty in function balance_dirty_pages. 

Here, it's trying to use bdi_stat_sum(), the accurate (however more
costly) version of bdi_stat(), if the error would possibly be large:

if (bdi_thresh < 2 * bdi_stat_error(bdi)) {
bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
//...
} else {
bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
//...
}

Here the comment should have explained it well:

 * In theory 1 page is enough to keep the consumer-producer
 * pipe going: the flusher cleans 1 page => the task dirties 1
 * more page. However bdi_dirty has accounting errors.  So use
 * the larger and more IO friendly bdi_stat_error.
 */
if (bdi_dirty <= bdi_stat_error(bdi))
break;


Thanks,
Fengguang
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[RESEND PATCH v1] driver core: fix possible missing of device probe

2013-01-04 Thread Ming Lei

Inside bus_add_driver(), one device might be added (device_add()) to
the bus, or probed via deferred probe, just after driver_attach()
completes and before
'klist_add_tail(&priv->knode_bus, &bus->p->klist_drivers)',
so the device won't be probed by this driver.

This patch moves the below line

'klist_add_tail(&priv->knode_bus, &bus->p->klist_drivers)'

before driver_attach() inside bus_add_driver() to fix the
problem.

Signed-off-by: Ming Lei 
---
v1:
- remove memory barrier part of previous commit log because
klist lock of 'bus->p->klist_drivers' is held during both
adding driver into the list and iterating the driver list,
so the new added driver can always be perceived immediately
on other CPUs.
---
 drivers/base/bus.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/base/bus.c b/drivers/base/bus.c
index 181ed26..3b5bddb 100644
--- a/drivers/base/bus.c
+++ b/drivers/base/bus.c
@@ -714,12 +714,12 @@ int bus_add_driver(struct device_driver *drv)
if (error)
goto out_unregister;
 
+   klist_add_tail(&priv->knode_bus, &bus->p->klist_drivers);
if (drv->bus->p->drivers_autoprobe) {
error = driver_attach(drv);
if (error)
goto out_unregister;
}
-   klist_add_tail(&priv->knode_bus, &bus->p->klist_drivers);
module_add_driver(drv->owner, drv);
 
error = driver_create_file(drv, _attr_uevent);
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] writeback: fix writeback cache thrashing

2013-01-04 Thread Fengguang Wu

Hi Namjae,

On Sun, Dec 30, 2012 at 02:59:50PM +0900, Namjae Jeon wrote:
> From: Namjae Jeon 
> 
> Consider Process A: huge I/O on sda
> doing heavy write operation - dirty memory becomes more
> than dirty_background_ratio
> on HDD - flusher thread flush-8:0
> 
> Consider Process B: small I/O on sdb
> doing while [1]; read 1024K + rewrite 1024K + sleep 2sec
> on Flash device - flusher thread flush-8:16
> 
> As Process A is a heavy dirtier, dirty memory becomes more
> than dirty_background_thresh. Due to this, below check becomes
> true(checking global_page_state in over_bground_thresh)
> for all bdi devices(even for very small dirtied bdi - sdb):
> 
> In this case, even small cached data on 'sdb' is forced to flush
> and writeback cache thrashing happens.
> 
> When we added debug prints inside above 'if' condition and ran
> above Process A(heavy dirtier on bdi with flush-8:0) and
> Process B(1024K frequent read/rewrite on bdi with flush-8:16)
> we got below prints:
> 
> [Test setup: ARM dual core CPU, 512 MB RAM]
> 
> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  56064 KB
> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  56704 KB
> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 84720 KB
> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 94720 KB
> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   384 KB
> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   960 KB
> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =64 KB
> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 92160 KB

> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   256 KB
> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   768 KB
> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =64 KB
> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   256 KB
> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   320 KB
> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE = 0 KB

Yeah, that IO pattern is not good. Perhaps it's 6 small IOs in /one/
second?  However that's not quite in line with "sleep 2sec" in your
workload description. Note that I assume flush-8:0 works on a hard
disk, so each flush-8:0 line indicates roughly 1 second interval
elapsed. It would be much more clear if the printk timestamps are
turned on (CONFIG_PRINTK_TIME=y).

> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 92032 KB
> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 91968 KB
> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   192 KB
> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =  1024 KB
> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =64 KB
> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   192 KB
> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   576 KB
> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE = 0 KB
> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 84352 KB
> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   192 KB
> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   512 KB
> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE = 0 KB
> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 92608 KB
> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 92544 KB
> 
> As mentioned in above log, when global dirty memory > global background_thresh
> small cached data is also forced to flush by flush-8:16.
> 
> If removing global background_thresh checking code, we can reduce cache
> thrashing of frequently used small data.
> And It will be great if we can reserve a portion of writeback cache using
> min_ratio.
 
> After applying patch:
> $ echo 5 > /sys/block/sdb/bdi/min_ratio
> $ cat /sys/block/sdb/bdi/min_ratio
> 5

The below log looks all perfect. However the min_ratio setup is a
problem. If possible, I'd like the final patch being able to work
reasonably well with min_ratio=0 (the system default), too.

> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  56064 KB
> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  56704 KB
> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  84160 KB
> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  96960 KB
> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  94080 KB
> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  93120 KB
> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  93120 KB
> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  91520 KB
> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  89600 KB
> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  93696 KB
> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  93696 KB
> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  72960 KB
> [over_bground_thresh]:

[PATCH 1/1]linux-usb:optimize to match the Huawei USB storage devices and support new switch command

2013-01-04 Thread fangxiaozhi 00110321

From: fangxiaozhi 

1. Optimize the match rules with a new macro for Huawei USB storage devices,
   to avoid loading the USB storage driver for the modem interface
   of Huawei devices.
2. Add support for the new switch command for new Huawei USB dongles.

Signed-off-by: fangxiaozhi 

diff -uprN linux-3.8-rc2_orig/drivers/usb/storage/initializers.c 
linux-3.8-rc2/drivers/usb/storage/initializers.c
--- linux-3.8-rc2_orig/drivers/usb/storage/initializers.c   2013-01-04 
10:12:01.441356344 +0800
+++ linux-3.8-rc2/drivers/usb/storage/initializers.c2013-01-04 
10:55:49.512500933 +0800
@@ -92,8 +92,8 @@ int usb_stor_ucr61s2b_init(struct us_dat
return 0;
 }
 
-/* This places the HUAWEI E220 devices in multi-port mode */
-int usb_stor_huawei_e220_init(struct us_data *us)
+/* This places the HUAWEI usb dongles in multi-port mode */
+static int usb_stor_huawei_feature_init(struct us_data *us)
 {
int result;
 
@@ -104,3 +104,75 @@ int usb_stor_huawei_e220_init(struct us_
US_DEBUGP("Huawei mode set result is %d\n", result);
return 0;
 }
+
+/* This function will send
+ * a scsi switch command called 'rewind' to huawei dongle.
+ * When the dongle receives this command at the first time,
+ * it will reboot immediately,
+ * after rebooted, it will ignore this command and do nothing,
+ * if it receives this command again.
+ * So it is  unnecessary to read its response. */
+static int usb_stor_huawei_scsi_init(struct us_data *us)
+{
+   int result = 0;
+   int act_len = 0;
+   struct bulk_cb_wrap *bcbw = (struct bulk_cb_wrap *) us->iobuf;
+   char rewind_cmd[] = {0x11, 0x06, 0x20, 0x00, 0x00, 0x01, 0x01, 0x00,
+   0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+   
+   memset(bcbw, 0, sizeof(struct bulk_cb_wrap));
+   bcbw->Signature = cpu_to_le32(US_BULK_CB_SIGN);
+   bcbw->Tag = 0;
+   bcbw->DataTransferLength = 0;
+   bcbw->Flags = bcbw->Lun = 0;
+   bcbw->Length = sizeof(rewind_cmd);
+   memcpy(bcbw->CDB, rewind_cmd, sizeof(rewind_cmd));
+
+   result = usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe, bcbw,
+   US_BULK_CB_WRAP_LEN, &act_len);
+   US_DEBUGP("transfer actual length=%d, result=%d\n", act_len, result);
+   return result;
+}
+
+/* usb_stor_huawei_dongles_pid: try to find the supported Huawei USB dongles
+ * In Huawei, they assign the following product IDs
+ * for all of their mobile broadband dongles,
+ * including the new dongles in the future.
+ * So if the product ID is not included in this list,
+ * it means it is not Huawei's mobile broadband dongles.
+ */
+static int usb_stor_huawei_dongles_pid(struct us_data *us)
+{
+   struct usb_interface_descriptor *idesc;
+   int idProduct;
+   
+   idesc = &us->pusb_intf->cur_altsetting->desc;
+   idProduct = us->pusb_dev->descriptor.idProduct;
+   /* The first port is CDROM,
+* means the dongle in the single port mode,
+* and a switch command is required to be sent. */
+   if (idesc && idesc->bInterfaceNumber == 0) {
+   if ((idProduct == 0x1001)
+   || (idProduct == 0x1003)
+   || (idProduct == 0x1004)
+   || (idProduct >= 0x1401 && idProduct < 0x1501)
+   || (idProduct > 0x1504 && idProduct <= 0x1600)
+   || (idProduct >= 0x1c02 && idProduct <= 0x2202)) {
+   return 1;
+   }
+   }
+   return 0;
+}
+
+int usb_stor_huawei_init(struct us_data *us)
+{
+   int result = 0;
+   
+   if (usb_stor_huawei_dongles_pid(us)) {
+   if (us->pusb_dev->descriptor.idProduct >= 0x1446)
+   result = usb_stor_huawei_scsi_init(us);
+   else
+   result = usb_stor_huawei_feature_init(us);
+   }
+   return result;
+}
diff -uprN linux-3.8-rc2_orig/drivers/usb/storage/initializers.h 
linux-3.8-rc2/drivers/usb/storage/initializers.h
--- linux-3.8-rc2_orig/drivers/usb/storage/initializers.h   2013-01-04 
10:12:01.445356294 +0800
+++ linux-3.8-rc2/drivers/usb/storage/initializers.h2013-01-04 
10:35:03.427079144 +0800
@@ -46,5 +46,5 @@ int usb_stor_euscsi_init(struct us_data 
  * flash reader */
 int usb_stor_ucr61s2b_init(struct us_data *us);
 
-/* This places the HUAWEI E220 devices in multi-port mode */
-int usb_stor_huawei_e220_init(struct us_data *us);
+/* This places the HUAWEI usb dongles in multi-port mode */
+int usb_stor_huawei_init(struct us_data *us);
Binary files linux-3.8-rc2_orig/drivers/usb/storage/initializers.o and 
linux-3.8-rc2/drivers/usb/storage/initializers.o differ
diff -uprN linux-3.8-rc2_orig/drivers/usb/storage/unusual_devs.h 
linux-3.8-rc2/drivers/usb/storage/unusual_devs.h
--- linux-3.8-rc2_orig/drivers/usb/storage/unusual_devs.h   2013-01-04 
10:12:01.445356294 +0800

Re: [PATCH 2/2] clk: tegra30: Convert clk out to composite clk

2013-01-04 Thread Prashant Gaikwad


On Friday 04 January 2013 09:55 PM, Stephen Warren wrote:

On 01/03/2013 10:51 PM, Prashant Gaikwad wrote:

Convert clk out to composite clock type which removes
the mux clock.

Signed-off-by: Prashant Gaikwad 
---
This patch is rebased on ccf-rework for Tegra patch series. It is just to show
how clk-composite can be used, not to be merged. If patch 1 is accepted then
I would like to merge this patch to ccf-rework series.

Just so I'm clear, is the intent that patch 1 of this series gets
reviewed/accepted, and then you'll repost an updated version of the
Tegra CCF rework series that relies on patch 1? If so, patch 1 would
need to be either taken through the Tegra tree, or put into a separate
branch in the clock tree, so the Tegra tree can merge it as a dependency
of the Tegra CCF rework branch.


Yes, that is my plan but you can tell whatever you are comfortable with. 
I will re-order the dependencies.



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v7 6/6] USB: forbid memory allocation with I/O during bus reset

2013-01-04 Thread Ming Lei

If one storage interface or usb network interface(iSCSI case)
exists in current configuration, memory allocation with
GFP_KERNEL during usb_device_reset() might trigger I/O transfer
on the storage interface itself and cause deadlock because
the 'us->dev_mutex' is held in .pre_reset() and the storage
interface can't do I/O transfer when the reset is triggered
by other interface, or the error handling can't be completed
if the reset is triggered by the storage itself(error handling path).

Cc: Alan Stern 
Cc: Oliver Neukum 
Signed-off-by: Ming Lei 
--
v5:
- use inline memalloc_noio_save()
v4:
- mark current memalloc_noio for every usb device reset
---
 drivers/usb/core/hub.c |   13 +
 1 file changed, 13 insertions(+)

diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
index a815fd2..698922e 100644
--- a/drivers/usb/core/hub.c
+++ b/drivers/usb/core/hub.c
@@ -5040,6 +5040,7 @@ int usb_reset_device(struct usb_device *udev)
 {
int ret;
int i;
+   unsigned int noio_flag;
struct usb_host_config *config = udev->actconfig;
 
if (udev->state == USB_STATE_NOTATTACHED ||
@@ -5049,6 +5050,17 @@ int usb_reset_device(struct usb_device *udev)
return -EINVAL;
}
 
+   /*
+* Don't allocate memory with GFP_KERNEL in current
+* context to avoid possible deadlock if usb mass
+* storage interface or usbnet interface(iSCSI case)
+* is included in current configuration. The easiest
+* approach is to do it for every device reset,
+* because the device 'memalloc_noio' flag may have
+* not been set before resetting the usb device.
+*/
+   noio_flag = memalloc_noio_save();
+
/* Prevent autosuspend during the reset */
usb_autoresume_device(udev);
 
@@ -5093,6 +5105,7 @@ int usb_reset_device(struct usb_device *udev)
}
 
usb_autosuspend_device(udev);
+   memalloc_noio_restore(noio_flag);
return ret;
 }
 EXPORT_SYMBOL_GPL(usb_reset_device);
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/1]linux-usb:Define a new macro for USB storage match rules

2013-01-04 Thread fangxiaozhi 00110321

From: fangxiaozhi 

1. Define a new macro for USB storage match rules: 
matching with Vendor ID and interface descriptors.

Signed-off-by: fangxiaozhi 

diff -uprN linux-3.8-rc2_orig/drivers/usb/storage/usb.c 
linux-3.8-rc2/drivers/usb/storage/usb.c
--- linux-3.8-rc2_orig/drivers/usb/storage/usb.c2013-01-04 
10:12:01.421356594 +0800
+++ linux-3.8-rc2/drivers/usb/storage/usb.c 2013-01-04 10:15:06.404043992 
+0800
@@ -120,6 +120,17 @@ MODULE_PARM_DESC(quirks, "supplemental l
.useTransport = use_transport,  \
 }
 
+#define UNUSUAL_VENDOR_INTF(idVendor, cl, sc, pr, \
+   vendor_name, product_name, use_protocol, use_transport, \
+   init_function, Flags) \
+{ \
+   .vendorName = vendor_name,  \
+   .productName = product_name,\
+   .useProtocol = use_protocol,\
+   .useTransport = use_transport,  \
+   .initFunction = init_function,  \
+}
+
 static struct us_unusual_dev us_unusual_dev_list[] = {
 #  include "unusual_devs.h"
{ } /* Terminating entry */
@@ -131,6 +142,7 @@ static struct us_unusual_dev for_dynamic
 #undef UNUSUAL_DEV
 #undef COMPLIANT_DEV
 #undef USUAL_DEV
+#undef UNUSUAL_VENDOR_INTF
 
 #ifdef CONFIG_LOCKDEP
 
diff -uprN linux-3.8-rc2_orig/drivers/usb/storage/usual-tables.c 
linux-3.8-rc2/drivers/usb/storage/usual-tables.c
--- linux-3.8-rc2_orig/drivers/usb/storage/usual-tables.c   2013-01-04 
10:12:01.446356281 +0800
+++ linux-3.8-rc2/drivers/usb/storage/usual-tables.c2013-01-04 
10:15:20.186871683 +0800
@@ -41,6 +41,19 @@
 #define USUAL_DEV(useProto, useTrans) \
 { USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, useProto, useTrans) }
 
+/* Define the device is matched with Vendor ID and interface descriptors */
+#define UNUSUAL_VENDOR_INTF(id_vendor, cl, sc, pr, \
+   vendorName, productName, useProtocol, useTransport, \
+   initFunction, flags) \
+{ \
+   .match_flags = USB_DEVICE_ID_MATCH_INT_INFO \
+   | USB_DEVICE_ID_MATCH_VENDOR, \
+   .idVendor= (id_vendor), \
+   .bInterfaceClass = (cl), \
+   .bInterfaceSubClass = (sc), \
+   .bInterfaceProtocol = (pr), \
+   .driver_info = (flags) }
+
 struct usb_device_id usb_storage_usb_ids[] = {
 #  include "unusual_devs.h"
{ } /* Terminating entry */
@@ -50,6 +63,7 @@ MODULE_DEVICE_TABLE(usb, usb_storage_usb
 #undef UNUSUAL_DEV
 #undef COMPLIANT_DEV
 #undef USUAL_DEV
+#undef UNUSUAL_VENDOR_INTF
 
 /*
  * The table of devices to ignore

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/2] clk: Add composite clock type

2013-01-04 Thread Prashant Gaikwad


On Saturday 05 January 2013 03:48 AM, Stephen Boyd wrote:

On 01/03/13 21:51, Prashant Gaikwad wrote:

diff --git a/drivers/clk/Makefile b/drivers/clk/Makefile
index f0b269a..baf7608 100644
--- a/drivers/clk/Makefile
+++ b/drivers/clk/Makefile
@@ -2,7 +2,8 @@
  obj-$(CONFIG_HAVE_CLK)+= clk-devres.o
  obj-$(CONFIG_CLKDEV_LOOKUP)   += clkdev.o
  obj-$(CONFIG_COMMON_CLK)  += clk.o clk-fixed-rate.o clk-gate.o \
-  clk-mux.o clk-divider.o clk-fixed-factor.o
+  clk-mux.o clk-divider.o clk-fixed-factor.o \
+  clk-composite.o

This list is getting a little out of hand. Should we sort it
alphabetically and put each file on one line?


Do you want me to do it in this patch?




  # SoCs specific
  obj-$(CONFIG_ARCH_BCM2835)+= clk-bcm2835.o
  obj-$(CONFIG_ARCH_NOMADIK)+= clk-nomadik.o
diff --git a/drivers/clk/clk-composite.c b/drivers/clk/clk-composite.c
new file mode 100644
index 000..8634dbf
--- /dev/null
+++ b/drivers/clk/clk-composite.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2012, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#define to_clk_composite(_hw) container_of(_hw, struct clk_composite, hw)
+
+static u8 clk_composite_get_parent(struct clk_hw *hw)
+{
+   struct clk_composite *composite = to_clk_composite(hw);
+   const struct clk_ops *mux_ops = composite->mux_ops;
+   struct clk_hw *mux_hw = composite->mux_hw;
+
+   mux_hw->clk = hw->clk;

Looks like this is already done down in the register function. Why are
we doing it again here and in each op?


Some ops get called during clk_init, which is before clk_register returns.



+
+   return mux_ops->get_parent(mux_hw);
+}

[snip]

+struct clk *clk_register_composite(struct device *dev, const char *name,
+   const char **parent_names, int num_parents,
+   struct clk_hw *mux_hw, const struct clk_ops *mux_ops,
+   struct clk_hw *div_hw, const struct clk_ops *div_ops,
+   struct clk_hw *gate_hw, const struct clk_ops *gate_ops,
+   unsigned long flags)
+{
+   struct clk *clk;
+   struct clk_init_data init;
+   struct clk_composite *composite;
+   struct clk_ops *clk_composite_ops;
+
+   composite = kzalloc(sizeof(struct clk_ops), GFP_KERNEL);

sizeof(*composite) != sizeof(struct clk_ops)


Thanks.


+   if (!composite) {
+   pr_err("%s: could not allocate composite clk\n", __func__);
+   return ERR_PTR(-ENOMEM);
+   }
+
+   init.name = name;
+   init.flags = flags | CLK_IS_BASIC;
+   init.parent_names = parent_names;
+   init.num_parents = num_parents;
+
+   /* allocate the clock ops */
+   clk_composite_ops = kzalloc(sizeof(struct clk_ops), GFP_KERNEL);

This one looks right though. Perhaps you should change style to use
sizeof(*clk_composite_ops) so that the above mistake doesn't happen.


Sure.


+   if (!clk_composite_ops) {
+   pr_err("%s: could not allocate clk ops\n", __func__);
+   kfree(composite);
+   return ERR_PTR(-ENOMEM);
+   }
+
+   if (mux_hw && mux_ops) {
+   if (!mux_ops->get_parent || !mux_ops->set_parent) {
+   clk = ERR_PTR(-EINVAL);
+   goto err;
+   }
+
+   composite->mux_hw = mux_hw;
+   composite->mux_ops = mux_ops;
+   clk_composite_ops->get_parent = clk_composite_get_parent;
+   clk_composite_ops->set_parent = clk_composite_set_parent;
+   }
+
+   if (div_hw && div_ops) {
+   if (!div_ops->recalc_rate || !div_ops->round_rate ||
+   !div_ops->set_rate) {
+   clk = ERR_PTR(-EINVAL);
+   goto err;
+   }
+
+   composite->div_hw = div_hw;
+   composite->div_ops = div_ops;
+   clk_composite_ops->recalc_rate = clk_composite_recalc_rate;
+   clk_composite_ops->round_rate = clk_composite_round_rate;
+   clk_composite_ops->set_rate = clk_composite_set_rate;
+   }
+
+   if (gate_hw && gate_ops) {
+   if (!gate_ops->is_enabled || !gate_ops->enable ||
+

Re: [PATCH V3 2/8] Make TestSetPageDirty and dirty page accounting in one func

2013-01-04 Thread Sha Zhengju

Hi Michal,

Sorry for my late response, I'm just back from vacation. : )

On Wed, Jan 2, 2013 at 5:08 PM, Michal Hocko  wrote:
> On Wed 26-12-12 01:22:36, Sha Zhengju wrote:
>> From: Sha Zhengju 
>>
>> Commit a8e7d49a(Fix race in create_empty_buffers() vs 
>> __set_page_dirty_buffers())
>> extracts TestSetPageDirty from __set_page_dirty and is far away from
>> account_page_dirtied. But it's better to make the two operations in one 
>> single
>> function to keep modular. So in order to avoid the potential race mentioned 
>> in
>> commit a8e7d49a, we can hold private_lock until __set_page_dirty completes.
>> There's no deadlock between ->private_lock and ->tree_lock after 
>> confirmation.
>
> Could you be more specific here? E.g. quote mm/filemap.c comment I have
> mentioned during the first round of review?
>

Okay, sorry for forgetting the comment. I'll add it next round.

>> It's a prepare patch for following memcg dirty page accounting patches.
>>
>>
>> Here is some test numbers that before/after this patch:
>> Test steps(Mem-4g, ext4):
>> drop_cache; sync
>> fio 
>> (ioengine=sync/write/buffered/bs=4k/size=1g/numjobs=2/group_reporting/thread)
>
> Could also add some rationale why you think this test is relevant?
>

The test aims to measure the performance impact of lock
contention when writing in parallel
to the same file. I'll add the rationale in the next version too.

Thanks for reviewing!


Regards,
Sha

>> We test it for 10 times and get the average numbers:
>> Before:
>> write: io=2048.0MB, bw=254117KB/s, iops=63528.9 , runt=  8279msec
>> lat (usec): min=1 , max=742361 , avg=30.918, stdev=1601.02
>> After:
>> write: io=2048.0MB, bw=254044KB/s, iops=63510.3 , runt=  8274.4msec
>> lat (usec): min=1 , max=856333 , avg=31.043, stdev=1769.32
>>
>> Note that the impact is little(<1%).
>>
>>
>> Signed-off-by: Sha Zhengju 
>> Reviewed-by: Michal Hocko 
>> ---
>>  fs/buffer.c |   24 
>>  1 file changed, 12 insertions(+), 12 deletions(-)
>>
>> diff --git a/fs/buffer.c b/fs/buffer.c
>> index c017a2d..3b032b9 100644
>> --- a/fs/buffer.c
>> +++ b/fs/buffer.c
>> @@ -609,9 +609,15 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
>>   * If warn is true, then emit a warning if the page is not uptodate and has
>>   * not been truncated.
>>   */
>> -static void __set_page_dirty(struct page *page,
>> +static int __set_page_dirty(struct page *page,
>>   struct address_space *mapping, int warn)
>>  {
>> + if (unlikely(!mapping))
>> + return !TestSetPageDirty(page);
>> +
>> + if (TestSetPageDirty(page))
>> + return 0;
>> +
>>   spin_lock_irq(>tree_lock);
>>   if (page->mapping) {/* Race with truncate? */
>>   WARN_ON_ONCE(warn && !PageUptodate(page));
>> @@ -621,6 +627,8 @@ static void __set_page_dirty(struct page *page,
>>   }
>>   spin_unlock_irq(>tree_lock);
>>   __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
>> +
>> + return 1;
>>  }
>>
>>  /*
>> @@ -666,11 +674,9 @@ int __set_page_dirty_buffers(struct page *page)
>>   bh = bh->b_this_page;
>>   } while (bh != head);
>>   }
>> - newly_dirty = !TestSetPageDirty(page);
>> + newly_dirty = __set_page_dirty(page, mapping, 1);
>>   spin_unlock(>private_lock);
>>
>> - if (newly_dirty)
>> - __set_page_dirty(page, mapping, 1);
>>   return newly_dirty;
>>  }
>>  EXPORT_SYMBOL(__set_page_dirty_buffers);
>> @@ -1125,14 +1131,8 @@ void mark_buffer_dirty(struct buffer_head *bh)
>>   return;
>>   }
>>
>> - if (!test_set_buffer_dirty(bh)) {
>> - struct page *page = bh->b_page;
>> - if (!TestSetPageDirty(page)) {
>> - struct address_space *mapping = page_mapping(page);
>> - if (mapping)
>> - __set_page_dirty(page, mapping, 0);
>> - }
>> - }
>> + if (!test_set_buffer_dirty(bh))
>> + __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0);
>>  }
>>  EXPORT_SYMBOL(mark_buffer_dirty);
>>
>> --
>> 1.7.9.5
>>
>
> --
> Michal Hocko
> SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] x86,apic: Blacklist x2APIC on some platforms

2013-01-04 Thread Youquan Song

On Tue, Dec 18, 2012 at 09:42:30AM -0800, Yinghai Lu wrote:
> On Tue, Dec 18, 2012 at 9:33 AM, H. Peter Anvin  wrote:
> > On 12/18/2012 09:07 AM, Youquan Song wrote:
> >> Blacklist x2apic when Nvidia graphics enabled on Lenovo ThinkPad T420.
> >> Also set blacklist x2apic for Lenovo ThinkPad W520 and L520.
> >
> > I thought we had gotten reports that the Nvidia correlation was false?
> 
> that's T520.

Hi hpa,

Yinghai's T520 works when x2APIC enabled, so do not need to blacklist.

Would you like to take the patch?

Thanks
-Youquan
 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[v3 PATCH 3/9] watchdog/at91sam9_wdt: Convert to use the watchdog framework

2013-01-04 Thread Wenyou Yang

According to Documentation/watchdog/convert_drivers_to_kernel_api.txt,
remove the file_operations struct, miscdevice, and obsolete includes

Because of the inherent characteristics of the at91sam watchdog, add the
watchdog operations: at91wdt_start, at91wdt_stop and at91wdt_ping.

Signed-off-by: Wenyou Yang 
Cc: w...@iguana.be
Cc: linux-watch...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/watchdog/at91sam9_wdt.c |  198 ++-
 1 file changed, 70 insertions(+), 128 deletions(-)

diff --git a/drivers/watchdog/at91sam9_wdt.c b/drivers/watchdog/at91sam9_wdt.c
index f10a897..febd028 100644
--- a/drivers/watchdog/at91sam9_wdt.c
+++ b/drivers/watchdog/at91sam9_wdt.c
@@ -13,16 +13,17 @@
  * The Watchdog Timer Mode Register can be only written to once. If the
  * timeout need to be set from Linux, be sure that the bootstrap or the
  * bootloader doesn't write to this register.
+ * The Watchdog Timer default is running with maximum counter value
+ * (WDV=0xfff) at reset, i.e., at power-up. It MUST be either disabled
+ * or be reprogrammed within the maximum margin(16s).
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include 
-#include 
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -31,7 +32,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 
 #include "at91sam9_wdt.h"
@@ -65,8 +65,6 @@ module_param(nowayout, bool, 0);
 MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started "
"(default=" __MODULE_STRING(WATCHDOG_NOWAYOUT) ")");
 
-static void at91_ping(unsigned long data);
-
 struct at91wdt_drvdata {
void __iomem*phybase;
boolis_enable;  /* indicate if the watchdog is eabled */
@@ -99,7 +97,7 @@ static inline void at91_wdt_reset(struct at91wdt_drvdata 
*driver_data)
 /*
  * Timer tick
  */
-static void at91_ping(unsigned long data)
+static void at91wdt_timer_tick(unsigned long data)
 {
struct watchdog_device *wddev = (struct watchdog_device *)data;
struct at91wdt_drvdata *driver_data = watchdog_get_drvdata(wddev);
@@ -107,45 +105,30 @@ static void at91_ping(unsigned long data)
if (time_before(jiffies, driver_data->next_heartbeat)) {
at91_wdt_reset(driver_data);
mod_timer(_data->timer, jiffies + WDT_TIMEOUT);
+
+   if (!watchdog_is_open(wddev))
+   driver_data->next_heartbeat = jiffies
+   + wddev->timeout * HZ;
} else
pr_crit("I will reset your machine !\n");
 }
 
-/*
- * Watchdog device is opened, and watchdog starts running.
- */
-static int at91_wdt_open(struct inode *inode, struct file *file)
-{
-   driver_data->next_heartbeat = jiffies + heartbeat * HZ;
-   mod_timer(_data->timer, jiffies + WDT_TIMEOUT);
-
-   return nonseekable_open(inode, file);
-}
-
-/*
- * Close the watchdog device.
- */
-static int at91_wdt_close(struct inode *inode, struct file *file)
-{
-   del_timer(_data->timer);
-
-   return 0;
-}
-
-/*
- * Set the watchdog time interval in 1/256Hz (write-once)
- * Counter is 12 bit.
- */
-static int at91_wdt_settimeout(unsigned int timeout)
+static int at91wdt_enable(struct watchdog_device *wddev, unsigned int timeout)
 {
+   struct at91wdt_drvdata *driver_data = watchdog_get_drvdata(wddev);
unsigned int reg;
-   unsigned int mr;
 
-   /* Check if disabled */
-   mr = wdt_read(AT91_WDT_MR);
-   if (mr & AT91_WDT_WDDIS) {
-   pr_err("sorry, watchdog is disabled\n");
-   return -EIO;
+   /*
+* Check if the watchdog is disabled,
+* if disabled, the reason is the bootstrap or the bootloader has
+* written the Watchdog Timer Mode Register to disable the
+* watchdog timer
+*/
+   reg = wdt_read(driver_data, AT91_WDT_MR);
+   if (reg & AT91_WDT_WDDIS) {
+   driver_data->is_enable = false;
+   pr_info("sorry, watchdog is disabled\n");
+   return -1;
}
 
/*
@@ -159,7 +142,9 @@ static int at91_wdt_settimeout(unsigned int timeout)
| AT91_WDT_WDDBGHLT /* disabled in debug mode */
| AT91_WDT_WDD  /* restart at any time */
| (timeout & AT91_WDT_WDV);  /* timer value */
-   wdt_write(AT91_WDT_MR, reg);
+   wdt_write(driver_data, AT91_WDT_MR, reg);
+
+   driver_data->is_enable = true;
 
return 0;
 }
@@ -170,99 +155,61 @@ static const struct watchdog_info at91_wdt_info = {
WDIOF_MAGICCLOSE,
 };
 
-/*
- * Handle commands from user-space.
- */
-static long at91_wdt_ioctl(struct file *file,
-   unsigned int cmd, unsigned long arg)
+static int at91wdt_start(struct watchdog_device *wddev)
 {
-   void __user *argp = (void __user *)arg;
-   int __user *p = argp;
-   int new_value;
-
-   switch (cmd)

[v3 PATCH 2/9] watchdog/at91sam9_wdt: Remove at91wdt_private and add at91wdt_drvdata struct

2013-01-04 Thread Wenyou Yang

Remove the global variable at91wdt_private, add the struct at91wdt_drvdata
as a substitute, and set it as the driver data of the at91wdt_wdd.

Signed-off-by: Wenyou Yang 
Cc: w...@iguana.be
Cc: linux-watch...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/watchdog/at91sam9_wdt.c |   88 +--
 1 file changed, 47 insertions(+), 41 deletions(-)

diff --git a/drivers/watchdog/at91sam9_wdt.c b/drivers/watchdog/at91sam9_wdt.c
index d864dc4..f10a897 100644
--- a/drivers/watchdog/at91sam9_wdt.c
+++ b/drivers/watchdog/at91sam9_wdt.c
@@ -38,11 +38,6 @@
 
 #define DRV_NAME "AT91SAM9 Watchdog"
 
-#define wdt_read(field) \
-   __raw_readl(at91wdt_private.base + field)
-#define wdt_write(field, val) \
-   __raw_writel((val), at91wdt_private.base + field)
-
 /* AT91SAM9 watchdog runs a 12bit counter @ 256Hz,
  * use this to convert a watchdog
  * value from/to milliseconds.
@@ -72,23 +67,33 @@ MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once 
started "
 
 static void at91_ping(unsigned long data);
 
-static struct {
-   void __iomem *base;
-   unsigned long next_heartbeat;   /* the next_heartbeat for the timer */
-   unsigned long open;
-   char expect_close;
-   struct timer_list timer;/* The timer that pings the watchdog */
-} at91wdt_private;
+struct at91wdt_drvdata {
+   void __iomem*phybase;
+   boolis_enable;  /* indicate if the watchdog is eabled */
+   unsigned long   next_heartbeat; /* the next_heartbeat for the timer */
+   struct timer_list   timer;  /* The timer that pings the watchdog */
+};
 
 /* . */
 
+static inline unsigned int wdt_read(struct at91wdt_drvdata *driver_data,
+   unsigned int field)
+{
+   return readl_relaxed(driver_data->phybase + field);
+}
+
+static inline void wdt_write(struct at91wdt_drvdata *driver_data,
+   unsigned int field, unsigned int val)
+{
+   writel_relaxed((val), driver_data->phybase + field);
+}
 
 /*
  * Reload the watchdog timer.  (ie, pat the watchdog)
  */
-static inline void at91_wdt_reset(void)
+static inline void at91_wdt_reset(struct at91wdt_drvdata *driver_data)
 {
-   wdt_write(AT91_WDT_CR, AT91_WDT_KEY | AT91_WDT_WDRSTT);
+   wdt_write(driver_data, AT91_WDT_CR, AT91_WDT_KEY | AT91_WDT_WDRSTT);
 }
 
 /*
@@ -96,10 +101,12 @@ static inline void at91_wdt_reset(void)
  */
 static void at91_ping(unsigned long data)
 {
-   if (time_before(jiffies, at91wdt_private.next_heartbeat) ||
-   (!nowayout && !at91wdt_private.open)) {
-   at91_wdt_reset();
-   mod_timer(_private.timer, jiffies + WDT_TIMEOUT);
+   struct watchdog_device *wddev = (struct watchdog_device *)data;
+   struct at91wdt_drvdata *driver_data = watchdog_get_drvdata(wddev);
+
+   if (time_before(jiffies, driver_data->next_heartbeat)) {
+   at91_wdt_reset(driver_data);
+   mod_timer(_data->timer, jiffies + WDT_TIMEOUT);
} else
pr_crit("I will reset your machine !\n");
 }
@@ -109,11 +116,8 @@ static void at91_ping(unsigned long data)
  */
 static int at91_wdt_open(struct inode *inode, struct file *file)
 {
-   if (test_and_set_bit(0, _private.open))
-   return -EBUSY;
-
-   at91wdt_private.next_heartbeat = jiffies + heartbeat * HZ;
-   mod_timer(_private.timer, jiffies + WDT_TIMEOUT);
+   driver_data->next_heartbeat = jiffies + heartbeat * HZ;
+   mod_timer(_data->timer, jiffies + WDT_TIMEOUT);
 
return nonseekable_open(inode, file);
 }
@@ -123,13 +127,8 @@ static int at91_wdt_open(struct inode *inode, struct file 
*file)
  */
 static int at91_wdt_close(struct inode *inode, struct file *file)
 {
-   clear_bit(0, _private.open);
+   del_timer(_data->timer);
 
-   /* stop internal ping */
-   if (!at91wdt_private.expect_close)
-   del_timer(_private.timer);
-
-   at91wdt_private.expect_close = 0;
return 0;
 }
 
@@ -191,7 +190,7 @@ static long at91_wdt_ioctl(struct file *file,
return put_user(0, p);
 
case WDIOC_KEEPALIVE:
-   at91wdt_private.next_heartbeat = jiffies + heartbeat * HZ;
+   driver_data->next_heartbeat = jiffies + heartbeat * HZ;
return 0;
 
case WDIOC_SETTIMEOUT:
@@ -199,7 +198,7 @@ static long at91_wdt_ioctl(struct file *file,
return -EFAULT;
 
heartbeat = new_value;
-   at91wdt_private.next_heartbeat = jiffies + heartbeat * HZ;
+   driver_data->next_heartbeat = jiffies + heartbeat * HZ;
 
return put_user(new_value, p);  /* return current value */
 
@@ -222,20 +221,16 @@ static ssize_t at91_wdt_write(struct file *file, const 
char *data, size_t len,
if (!nowayout) {

[v3 PATCH 4/9] watchdog/at91sam9_wdt: Adjust the options of watchdog_info

2013-01-04 Thread Wenyou Yang

Since the Watchdog Timer Mode Register can be written only once,
the watchdog_info shall not support the WDIOF_SETTIMEOUT
and WDIOF_MAGICCLOSE options, so remove them.

Signed-off-by: Wenyou Yang 
Cc: w...@iguana.be
Cc: linux-watch...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/watchdog/at91sam9_wdt.c |3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/watchdog/at91sam9_wdt.c b/drivers/watchdog/at91sam9_wdt.c
index febd028..a3d1a09 100644
--- a/drivers/watchdog/at91sam9_wdt.c
+++ b/drivers/watchdog/at91sam9_wdt.c
@@ -151,8 +151,7 @@ static int at91wdt_enable(struct watchdog_device *wddev, 
unsigned int timeout)
 
 static const struct watchdog_info at91_wdt_info = {
.identity   = DRV_NAME,
-   .options= WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING |
-   WDIOF_MAGICCLOSE,
+   .options= WDIOF_KEEPALIVEPING,
 };
 
 static int at91wdt_start(struct watchdog_device *wddev)
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/7] clk: tegra: Use common of_clk_init() function

2013-01-04 Thread Prashant Gaikwad


On Friday 04 January 2013 10:00 PM, Stephen Warren wrote:

On 01/04/2013 12:00 AM, Prashant Gaikwad wrote:

Use common of_clk_init() function for clocks initialization.
  drivers/clk/tegra/clk-tegra20.c |3 ++-
  drivers/clk/tegra/clk-tegra30.c |3 ++-

Oh, so this series is written assuming that the Tegra CCF rework is
already applied then? That makes the dependencies quite painful, since I
think we'll end up with the following order being needed:

1) clk: Add composite clock type
-> This would usually go through the clk tree.
2) The Tegra CCF rework series
-> This must go through the Tegra tree due to lots of dependencies
and merge conflicts with other Tegra patches.
3) This series
-> This would usually go through the clk tree.

Is it possible to re-order the dependencies as (1) (3) (2), so that Mike
can apply (1) and (3) to the clock tree, then I can use the clk tree as
the basis for a branch in the Tegra tree to apply (2) and all the other
Tegra patches that will conflict with (2)?


If Mike approves the concept and implementation in (1) and (3) then I 
will repost (2) and (3) with dependencies re-ordered.



--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[v3 PATCH 6/9] watchdog/at91sam9_wdt: Remove the __initdata of at91wdt_wdd

2013-01-04 Thread Wenyou Yang

This variable will be used in the timer handler, so it must remain valid
after initialization.

Signed-off-by: Wenyou Yang 
Cc: w...@iguana.be
Cc: linux-watch...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/watchdog/at91sam9_wdt.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/watchdog/at91sam9_wdt.c b/drivers/watchdog/at91sam9_wdt.c
index 53ede84..94be9d6 100644
--- a/drivers/watchdog/at91sam9_wdt.c
+++ b/drivers/watchdog/at91sam9_wdt.c
@@ -196,7 +196,7 @@ static struct watchdog_ops at91wdt_ops = {
.ping = at91wdt_ping,
 };
 
-static struct watchdog_device at91wdt_wdd __initdata = {
+static struct watchdog_device at91wdt_wdd = {
.timeout = WDT_HEARTBEAT,
.min_timeout = MIN_HEARTBEAT,
.max_timeout = MAX_HEARTBEAT,
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[v3 PATCH 7/9] watchdog/at91sam9_wdt: Use module_platform_driver()

2013-01-04 Thread Wenyou Yang

Using module_platform_driver() replaces module_init() and module_exit()
and makes the code simpler.

Remove '__init' annotation from the function 'at91wdt_probe'
since the driver becomes hot-plug aware now.

Signed-off-by: Wenyou Yang 
Cc: w...@iguana.be
Cc: linux-watch...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/watchdog/at91sam9_wdt.c |   16 +++-
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/drivers/watchdog/at91sam9_wdt.c b/drivers/watchdog/at91sam9_wdt.c
index 94be9d6..7c13dda 100644
--- a/drivers/watchdog/at91sam9_wdt.c
+++ b/drivers/watchdog/at91sam9_wdt.c
@@ -204,7 +204,7 @@ static struct watchdog_device at91wdt_wdd = {
.ops = _ops,
 };
 
-static int __init at91wdt_probe(struct platform_device *pdev)
+static int at91wdt_probe(struct platform_device *pdev)
 {
struct at91wdt_drvdata *driver_data;
struct resource *r;
@@ -273,6 +273,7 @@ MODULE_DEVICE_TABLE(of, at91_wdt_dt_ids);
 #endif
 
 static struct platform_driver at91wdt_driver = {
+   .probe  = at91wdt_probe,
.remove = __exit_p(at91wdt_remove),
.driver = {
.name   = "at91_wdt",
@@ -281,18 +282,7 @@ static struct platform_driver at91wdt_driver = {
},
 };
 
-static int __init at91sam_wdt_init(void)
-{
-   return platform_driver_probe(_driver, at91wdt_probe);
-}
-
-static void __exit at91sam_wdt_exit(void)
-{
-   platform_driver_unregister(_driver);
-}
-
-module_init(at91sam_wdt_init);
-module_exit(at91sam_wdt_exit);
+module_platform_driver(at91wdt_driver);
 
 MODULE_AUTHOR("Renaud CERRATO ");
 MODULE_DESCRIPTION("Watchdog driver for Atmel AT91SAM9x processors");
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[v3 PATCH 9/9] ARM: dts: add the watchdog nodes for at91sam9g25ek boards

2013-01-04 Thread Wenyou Yang

Tested on the at91sam9g25ek boards

Signed-off-by: Wenyou Yang 
Cc: li...@arm.linux.org.uk
Cc: linux-kernel@vger.kernel.org
---
 arch/arm/boot/dts/at91sam9x5ek.dtsi |4 
 1 file changed, 4 insertions(+)

diff --git a/arch/arm/boot/dts/at91sam9x5ek.dtsi 
b/arch/arm/boot/dts/at91sam9x5ek.dtsi
index 8a7cf1d..afddf75 100644
--- a/arch/arm/boot/dts/at91sam9x5ek.dtsi
+++ b/arch/arm/boot/dts/at91sam9x5ek.dtsi
@@ -69,6 +69,10 @@
status = "okay";
};
 
+   watchdog@fe40 {
+   status = "okay";
+   };
+
pinctrl@f400 {
mmc0 {
pinctrl_board_mmc0: mmc0-board {
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[v3 PATCH 8/9] ARM: dts: add the watchdog nodes for at91sam9x5 and at91sam9n12 SoC

2013-01-04 Thread Wenyou Yang

Signed-off-by: Wenyou Yang 
Cc: li...@arm.linux.org.uk
Cc: linux-kernel@vger.kernel.org
---
 arch/arm/boot/dts/at91sam9n12.dtsi |6 ++
 arch/arm/boot/dts/at91sam9x5.dtsi  |6 ++
 2 files changed, 12 insertions(+)

diff --git a/arch/arm/boot/dts/at91sam9n12.dtsi 
b/arch/arm/boot/dts/at91sam9n12.dtsi
index e9efb34..f60bbbc 100644
--- a/arch/arm/boot/dts/at91sam9n12.dtsi
+++ b/arch/arm/boot/dts/at91sam9n12.dtsi
@@ -355,6 +355,12 @@
#size-cells = <0>;
status = "disabled";
};
+
+   watchdog@fe40 {
+   compatible = "atmel,at91sam9260-wdt";
+   reg = <0xfe40 0x10>;
+   status = "disabled";
+   };
};
 
nand0: nand@4000 {
diff --git a/arch/arm/boot/dts/at91sam9x5.dtsi 
b/arch/arm/boot/dts/at91sam9x5.dtsi
index 40ac3a4..cb3ffb2 100644
--- a/arch/arm/boot/dts/at91sam9x5.dtsi
+++ b/arch/arm/boot/dts/at91sam9x5.dtsi
@@ -473,6 +473,12 @@
trigger-value = <0x6>;
};
};
+
+   watchdog@fe40 {
+   compatible = "atmel,at91sam9260-wdt";
+   reg = <0xfe40 0x10>;
+   status = "disabled";
+   };
};
 
nand0: nand@4000 {
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[v3 PATCH 5/9] watchdog/at91sam9_wdt: Add nowayout helpers to Watchdog Timer Driver Kernel API

2013-01-04 Thread Wenyou Yang

Signed-off-by: Wenyou Yang 
Cc: w...@iguana.be
Cc: linux-watch...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
---
 drivers/watchdog/at91sam9_wdt.c |2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/watchdog/at91sam9_wdt.c b/drivers/watchdog/at91sam9_wdt.c
index a3d1a09..53ede84 100644
--- a/drivers/watchdog/at91sam9_wdt.c
+++ b/drivers/watchdog/at91sam9_wdt.c
@@ -235,6 +235,8 @@ static int __init at91wdt_probe(struct platform_device 
*pdev)
return ret;
}
 
+   watchdog_set_nowayout(_wdd, nowayout);
+
watchdog_init_timeout(_wdd, heartbeat, pdev->dev.of_node);
 
ret = at91wdt_enable(_wdd, ms_to_ticks(WDT_HW_TIMEOUT * 1000));
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

RE: [PATCH 1/6] ACPI / PM: Change the way power transitions to D3cold are carried out

2013-01-04 Thread Zheng, Lv

Thanks for your patch. It might be useful, as the _PR3 off method of I2C hosts
and targets might be different from their _PR0 off method: the ACPI BIOS may
implement protection in the _PR3 off method in order not to break transactions
during the powering-off process.
Since I2C is a wired-AND logic bus, if a device drives SDA or SCL LOW during the
powering-off process, it will pull the bus from HIGH to LOW, which may break the
current transaction on the bus that is targeted at another slave device.

I just wonder one more thing which is not related to the ACPI BIOS.
If busses like I2C have such "non-hotpluggable" nature, we need to cut power of 
single target device only when there are not any transactions visible in the 
same segment.

How could an equivalent solution be implemented in the Linux kernel for I2C 
busses?
It could be useful for those platforms without such firmware deployed to 
protect the OS.

It seems we may need to redesign acpi_device_set_power/acpi_device_power_state 
to meet the following requirements:
1. suspend/resume -> One function can be used to switch device power state from 
0 to 3 or 3 to 0.
2. poweroff -> One function can be used to cut device power currently applied.
And we may need new interface in the power core as "poweroff" (maybe also 
poweron) or likewise for platform_suspend/hibernate_ops.

Then I2C in the kernel can also implement a solution putting all of the devices 
(the masters and all of their slaves) in one segment (where there is wired-AND 
logic) into D3 (suspend all non-hotpluggable devices in one segment), and 
"poweroff" one of them when there are not any transactions visible on the bus.

Thanks and best regards
-Lv

> -Original Message-
> From: Rafael J. Wysocki [mailto:r...@sisk.pl]
> Sent: Saturday, January 05, 2013 6:00 AM
> To: ACPI Devel Maling List
> Cc: LKML; Len Brown; Zheng, Lv; Huang, Ying
> Subject: [PATCH 1/6] ACPI / PM: Change the way power transitions to D3cold
> are carried out
> 
> From: Rafael J. Wysocki 
> 
> During power transitions into D3cold from any shallower power states we are
> supposed to transition the device into D3hot and remove power from it
> afterward, but the current code in acpi_device_set_power() doesn't work this
> way.
> 
> At the same time, though, we need to be careful enough to preserve
> backwards compatibility for systems that don't distinguish between D3hot and
> D3cold (e.g. designed before ACPI 4).
> 
> Modify acpi_device_set_power() so that it works in accordance with the
> expectations in both cases.
> 
> Signed-off-by: Rafael J. Wysocki 
> ---
>  drivers/acpi/bus.c |   12 ++--
>  1 file changed, 10 insertions(+), 2 deletions(-)
> 
> Index: linux/drivers/acpi/bus.c
> 
> ===
> --- linux.orig/drivers/acpi/bus.c
> +++ linux/drivers/acpi/bus.c
> @@ -270,6 +270,7 @@ int acpi_device_set_power(struct acpi_de
>   int result = 0;
>   acpi_status status = AE_OK;
>   char object_name[5] = { '_', 'P', 'S', '0' + state, '\0' };
> + bool cut_power = false;
> 
>   if (!device || (state < ACPI_STATE_D0) || (state > ACPI_STATE_D3_COLD))
>   return -EINVAL;
> @@ -294,9 +295,13 @@ int acpi_device_set_power(struct acpi_de
>   return -ENODEV;
>   }
> 
> - /* For D3cold we should execute _PS3, not _PS4. */
> - if (state == ACPI_STATE_D3_COLD)
> + /* For D3cold we should first transition into D3hot. */
> + if (state == ACPI_STATE_D3_COLD
> + &&
> device->power.states[ACPI_STATE_D3_COLD].flags.os_accessible) {
> + state = ACPI_STATE_D3_HOT;
>   object_name[3] = '3';
> + cut_power = true;
> + }
> 
>   /*
>* Transition Power
> @@ -341,6 +346,9 @@ int acpi_device_set_power(struct acpi_de
>   }
>   }
> 
> + if (cut_power)
> + result = acpi_power_transition(device, ACPI_STATE_D3_COLD);
> +
>end:
>   if (result)
>   printk(KERN_WARNING PREFIX

N�r��yb�X��ǧv�^�)޺{.n�+{zX����ܨ}���Ơz�:+v���zZ+��+zf���h���~i���z��w���?�&�)ߢf��^jǫy�m��@A�a���
0��h���i

[PATCH v7 4/6] net/core: apply pm_runtime_set_memalloc_noio on network devices

2013-01-04 Thread Ming Lei

Deadlock might be caused by allocating memory with GFP_KERNEL in the
runtime_resume and runtime_suspend callbacks of network devices in the
iSCSI situation, so mark network devices and their ancestors as
'memalloc_noio' with the introduced pm_runtime_set_memalloc_noio().

Cc: "David S. Miller" 
Cc: Eric Dumazet 
Cc: David Decotigny 
Cc: Tom Herbert 
Cc: Ingo Molnar 
Signed-off-by: Ming Lei 
--
v7:
- rebase on v3.8-rc2-next-20130104

v4:
- call pm_runtime_set_memalloc_noio(ddev, true) after
device_add
---
 net/core/net-sysfs.c |5 +
 1 file changed, 5 insertions(+)

diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 29c884a..67e00b2 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "net-sysfs.h"
 
@@ -1409,6 +1410,8 @@ void netdev_unregister_kobject(struct net_device * net)
 
remove_queue_kobjects(net);
 
+   pm_runtime_set_memalloc_noio(dev, false);
+
device_del(dev);
 }
 
@@ -1453,6 +1456,8 @@ int netdev_register_kobject(struct net_device *net)
return error;
}
 
+   pm_runtime_set_memalloc_noio(dev, true);
+
return error;
 }
 
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH V3 2/8] Make TestSetPageDirty and dirty page accounting in one func

2013-01-04 Thread Sha Zhengju

Hi Kame,

Sorry for the late response, I'm just back from vacation. : )

On Fri, Dec 28, 2012 at 8:39 AM, Kamezawa Hiroyuki
 wrote:
> (2012/12/26 2:22), Sha Zhengju wrote:
>> From: Sha Zhengju 
>>
>> Commit a8e7d49a(Fix race in create_empty_buffers() vs 
>> __set_page_dirty_buffers())
>> extracts TestSetPageDirty from __set_page_dirty and is far away from
>> account_page_dirtied. But it's better to make the two operations in one 
>> single
>> function to keep modular. So in order to avoid the potential race mentioned 
>> in
>> commit a8e7d49a, we can hold private_lock until __set_page_dirty completes.
>> There's no deadlock between ->private_lock and ->tree_lock after 
>> confirmation.
>> It's a prepare patch for following memcg dirty page accounting patches.
>>
>>
>> Here is some test numbers that before/after this patch:
>> Test steps(Mem-4g, ext4):
>> drop_cache; sync
>> fio 
>> (ioengine=sync/write/buffered/bs=4k/size=1g/numjobs=2/group_reporting/thread)
>>
>> We test it for 10 times and get the average numbers:
>> Before:
>> write: io=2048.0MB, bw=254117KB/s, iops=63528.9 , runt=  8279msec
>> lat (usec): min=1 , max=742361 , avg=30.918, stdev=1601.02
>> After:
>> write: io=2048.0MB, bw=254044KB/s, iops=63510.3 , runt=  8274.4msec
>> lat (usec): min=1 , max=856333 , avg=31.043, stdev=1769.32
>>
>> Note that the impact is little(<1%).
>>
>>
>> Signed-off-by: Sha Zhengju 
>> Reviewed-by: Michal Hocko 
>
> Acked-by: KAMEZAWA Hiroyuki 
>
> Hmm,..this change should be double-checked by vfs, I/O guys...
>

Now it seems they haven't paid attention here... I'll push it soon for
more review.

> increasing hold time of mapping->private_lock doesn't affect performance ?
>
>

Yes, as pointed out by Fengguang in the previous round, mapping->private_lock
and mapping->tree_lock are often contended locks; in a dd testcase they have
the top #1 and #2 contention.
So the numbers above try to measure the impact of lock contention with multiple
threads (numjobs=2) writing to the same file in parallel, and it seems the
impact is little (<1%).
I'm not sure if the test case is enough; any advice is welcome! : )
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: linux-next: manual merge of the driver-core.current tree with Linus tree

2013-01-04 Thread Greg KH

On Wed, Jan 02, 2013 at 10:37:26AM +1100, Stephen Rothwell wrote:
> Hi Greg,
> 
> Today's linux-next merge of the driver-core.current tree got a conflict
> in drivers/i2c/busses/i2c-au1550.c between commit 0b255e927d47 ("i2c:
> remove __dev* attributes from subsystem") from Linus' tree and commit
> eeb30d064414 ("Drivers: i2c: remove __dev* attributes") from the
> driver-core.current tree.
> 
> I fixed it up (I used the driver-core.current version) and can carry the
> fix as necessary (no action is required).

All of these should no longer be an issue due to the driver-core.current
tree getting merged into Linus's tree.

Hopefully it should not be a merge issue for you anymore as well.

thanks,

greg k-h
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v7 5/6] PM / Runtime: force memory allocation with no I/O during Runtime PM callback

2013-01-04 Thread Ming Lei

This patch applies the introduced memalloc_noio_save() and
memalloc_noio_restore() to force memory allocation with no I/O
during runtime_resume/runtime_suspend callback on device with
the flag of 'memalloc_noio' set.

Cc: Alan Stern 
Cc: Oliver Neukum 
Cc: Rafael J. Wysocki 
Signed-off-by: Ming Lei 
--
v7:
- move memalloc_noio_save/memalloc_noio_restore into
rpm_callback to avoid code duplication, as suggested
by Rafael
v5:
- use inline memalloc_noio_save()
v4:
- runtime_suspend need this too because rpm_resume may wait for
completion of concurrent runtime_suspend, so deadlock still may
be triggered in runtime_suspend path.
---
 drivers/base/power/runtime.c |   19 ++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index cd92e1c..1244930 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -348,7 +348,24 @@ static int rpm_callback(int (*cb)(struct device *), struct 
device *dev)
if (!cb)
return -ENOSYS;
 
-   retval = __rpm_callback(cb, dev);
+   if (dev->power.memalloc_noio) {
+   unsigned int noio_flag;
+
+   /*
+* Deadlock might be caused if memory allocation with
+* GFP_KERNEL happens inside runtime_suspend and
+* runtime_resume callbacks of one block device's
+* ancestor or the block device itself. Network
+* device might be thought as part of iSCSI block
+* device, so network device and its ancestor should
+* be marked as memalloc_noio too.
+*/
+   noio_flag = memalloc_noio_save();
+   retval = __rpm_callback(cb, dev);
+   memalloc_noio_restore(noio_flag);
+   } else {
+   retval = __rpm_callback(cb, dev);
+   }
 
dev->power.runtime_error = retval;
return retval != -EACCES ? retval : -EIO;
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v7 3/6] block/genhd.c: apply pm_runtime_set_memalloc_noio on block devices

2013-01-04 Thread Ming Lei

This patch applies the introduced pm_runtime_set_memalloc_noio on
block devices so that the PM core will teach mm not to allocate memory with
GFP_IOFS when calling the runtime_resume and runtime_suspend callbacks
for block devices and their ancestors.

Cc: Jens Axboe 
Signed-off-by: Ming Lei 
--
v5:
- fix code style and one typo
v4:
- call pm_runtime_set_memalloc_noio(ddev, true) after device_add
---
 block/genhd.c |   10 ++
 1 file changed, 10 insertions(+)

diff --git a/block/genhd.c b/block/genhd.c
index 4125beb..2eb64a3 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "blk.h"
 
@@ -534,6 +535,14 @@ static void register_disk(struct gendisk *disk)
return;
}
}
+
+   /*
+* avoid probable deadlock caused by allocating memory with
+* GFP_KERNEL in runtime_resume callback of its all ancestor
+* devices
+*/
+   pm_runtime_set_memalloc_noio(ddev, true);
+
disk->part0.holder_dir = kobject_create_and_add("holders", >kobj);
disk->slave_dir = kobject_create_and_add("slaves", >kobj);
 
@@ -663,6 +672,7 @@ void del_gendisk(struct gendisk *disk)
disk->driverfs_dev = NULL;
if (!sysfs_deprecated)
sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
+   pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
device_del(disk_to_dev(disk));
 }
 EXPORT_SYMBOL(del_gendisk);
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v7 2/6] PM / Runtime: introduce pm_runtime_set_memalloc_noio()

2013-01-04 Thread Ming Lei

The patch introduces the memalloc_noio flag in 'struct dev_pm_info'
to help the PM core teach mm not to allocate memory with the GFP_KERNEL
flag, in order to avoid a probable deadlock.

As explained in the comment, any GFP_KERNEL allocation inside
runtime_resume() or runtime_suspend() on any device in
the path from a block or network device to the root device
in the device tree may cause deadlock; the introduced
pm_runtime_set_memalloc_noio() sets or clears the flag on
the devices in the path recursively.

Cc: Alan Stern 
Cc: "Rafael J. Wysocki" 
Signed-off-by: Ming Lei 
--
v7:
- optimize on pm_runtime_set_memalloc_noio(true)

v5:
- fix code style error
- add comment on clear the device memalloc_noio flag
v4:
- rename memalloc_noio_resume as memalloc_noio
- remove pm_runtime_get_memalloc_noio()
- add comments on pm_runtime_set_memalloc_noio
v3:
- introduce pm_runtime_get_memalloc_noio()
- hold one global lock on pm_runtime_set_memalloc_noio
- hold device power lock when accessing memalloc_noio_resume
  flag suggested by Alan Stern
- implement pm_runtime_set_memalloc_noio without recursion
  suggested by Alan Stern
v2:
- introduce pm_runtime_set_memalloc_noio()
---
 drivers/base/power/runtime.c |   70 ++
 include/linux/pm.h   |1 +
 include/linux/pm_runtime.h   |3 ++
 3 files changed, 74 insertions(+)

diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 3148b10..cd92e1c 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -124,6 +124,76 @@ unsigned long pm_runtime_autosuspend_expiration(struct 
device *dev)
 }
 EXPORT_SYMBOL_GPL(pm_runtime_autosuspend_expiration);
 
+static int dev_memalloc_noio(struct device *dev, void *data)
+{
+   return dev->power.memalloc_noio;
+}
+
+/*
+ * pm_runtime_set_memalloc_noio - Set a device's memalloc_noio flag.
+ * @dev: Device to handle.
+ * @enable: True for setting the flag and False for clearing the flag.
+ *
+ * Set the flag for all devices in the path from the device to the
+ * root device in the device tree if @enable is true, otherwise clear
+ * the flag for devices in the path whose siblings don't set the flag.
+ *
+ * The function should only be called by block device, or network
+ * device driver for solving the deadlock problem during runtime
+ * resume/suspend:
+ *
+ * If memory allocation with GFP_KERNEL is called inside runtime
+ * resume/suspend callback of any one of its ancestors(or the
+ * block device itself), the deadlock may be triggered inside the
+ * memory allocation since it might not complete until the block
+ * device becomes active and the involed page I/O finishes. The
+ * situation is pointed out first by Alan Stern. Network device
+ * are involved in iSCSI kind of situation.
+ *
+ * The lock of dev_hotplug_mutex is held in the function for handling
+ * hotplug race because pm_runtime_set_memalloc_noio() may be called
+ * in async probe().
+ *
+ * The function should be called between device_add() and device_del()
+ * on the affected device(block/network device).
+ */
+void pm_runtime_set_memalloc_noio(struct device *dev, bool enable)
+{
+   static DEFINE_MUTEX(dev_hotplug_mutex);
+
+   mutex_lock(_hotplug_mutex);
+   for (;;) {
+   bool enabled;
+
+   /* hold power lock since bitfield is not SMP-safe. */
+   spin_lock_irq(>power.lock);
+   enabled = dev->power.memalloc_noio;
+   dev->power.memalloc_noio = enable;
+   spin_unlock_irq(>power.lock);
+
+   /*
+* not need to enable ancestors any more if the device
+* has been enabled.
+*/
+   if (enabled && enable)
+   break;
+
+   dev = dev->parent;
+
+   /*
+* clear flag of the parent device only if all the
+* children don't set the flag because ancestor's
+* flag was set by any one of the descendants.
+*/
+   if (!dev || (!enable &&
+device_for_each_child(dev, NULL,
+  dev_memalloc_noio)))
+   break;
+   }
+   mutex_unlock(_hotplug_mutex);
+}
+EXPORT_SYMBOL_GPL(pm_runtime_set_memalloc_noio);
+
 /**
  * rpm_check_suspend_allowed - Test whether a device may be suspended.
  * @dev: Device to test.
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 03d7bb1..1a8a69d 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -538,6 +538,7 @@ struct dev_pm_info {
unsigned intirq_safe:1;
unsigned intuse_autosuspend:1;
unsigned inttimer_autosuspends:1;
+   unsigned intmemalloc_noio:1;
enum rpm_request

[PATCH v7 1/6] mm: teach mm by current context info to not do I/O during memory allocation

2013-01-04 Thread Ming Lei

This patch introduces PF_MEMALLOC_NOIO on process flag('flags' field of
'struct task_struct'), so that the flag can be set by one task
to avoid doing I/O inside memory allocation in the task's context.

The patch tries to solve one deadlock problem caused by block device,
and the problem may happen at least in the below situations:

- during block device runtime resume, if memory allocation with
GFP_KERNEL is called inside runtime resume callback of any one
of its ancestors(or the block device itself), the deadlock may be
triggered inside the memory allocation since it might not complete
until the block device becomes active and the involved page I/O finishes.
The situation is pointed out first by Alan Stern. It is not a good
approach to convert all GFP_KERNEL[1] in the path into GFP_NOIO because
several subsystems may be involved(for example, PCI, USB and SCSI may
be involved for usb mass storage device, network devices involved too
in the iSCSI case)

- during block device runtime suspend, because runtime resume need
to wait for completion of concurrent runtime suspend.

- during error handling of usb mass storage device, USB bus reset
will be put on the device, so there shouldn't be any
memory allocation with GFP_KERNEL during USB bus reset, otherwise
the deadlock similar with above may be triggered. Unfortunately, any
usb device may include one mass storage interface in theory, so it
requires all usb interface drivers to handle the situation. In fact,
most usb drivers don't know how to handle bus reset on the device
and don't provide .pre_reset() and .post_reset() callback at all, so
USB core has to unbind and bind driver for these devices. So it
is still not practical to resort to GFP_NOIO for solving the problem.

Also the introduced solution can be used by block subsystem or block
drivers too, for example, set the PF_MEMALLOC_NOIO flag before doing
actual I/O transfer.

It is not a good idea to convert all these GFP_KERNEL in the
affected path into GFP_NOIO because these functions doing that may be
implemented as library and will be called in many other contexts.

In fact, memalloc_noio_flags() can convert some of current static GFP_NOIO
allocation into GFP_KERNEL back in other non-affected contexts, at least
almost all GFP_NOIO in USB subsystem can be converted into GFP_KERNEL
after applying the approach and make allocation with GFP_NOIO
only happen in runtime resume/bus reset/block I/O transfer contexts
generally.

[1], several GFP_KERNEL allocation examples in runtime resume path

- pci subsystem
acpi_os_allocate
<-acpi_ut_allocate
<-ACPI_ALLOCATE_ZEROED
<-acpi_evaluate_object
<-__acpi_bus_set_power
<-acpi_bus_set_power
<-acpi_pci_set_power_state

<-platform_pci_set_power_state

<-pci_platform_power_transition

<-__pci_complete_power_transition

<-pci_set_power_state

<-pci_restore_standard_config

<-pci_pm_runtime_resume
- usb subsystem
usb_get_status
<-finish_port_resume
<-usb_port_resume
<-generic_resume
<-usb_resume_device
<-usb_resume_both
<-usb_runtime_resume

- some individual usb drivers
usblp, uvc, gspca, most of dvb-usb-v2 media drivers, cpia2, az6007, 

That is just what I have found.  Unfortunately, this allocation can
only be found by human being now, and there should be many not found
since any function in the resume path(call tree) may allocate memory
with GFP_KERNEL.

Cc: Alan Stern 
Cc: Oliver Neukum 
Cc: Jiri Kosina 
Cc: Andrew Morton 
Cc: Mel Gorman 
Cc: KAMEZAWA Hiroyuki 
Cc: Michal Hocko 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
Cc: "Rafael J. Wysocki" 
Signed-off-by: Minchan Kim 
Signed-off-by: Ming Lei 
--
v7:
- fix type of 'flags' in memalloc_noio_save()/memalloc_noio_restore()
- rebase on v3.8-rc2-next-20130104

v6:
- replace GFP_IO with __GFP_IO to fix compile failure

v5:
- use inline instead of macro to define memalloc_noio_*
- replace memalloc_noio() with memalloc_noio_flags() to
make code neater
- don't clear GFP_FS because no GFP_IO means
that allocation won't enter device driver as pointed by
Andrew Morton

v4:
- fix comment
v3:
- no chan

[PATCH v7 0/6] solve deadlock caused by memory allocation with I/O

2013-01-04 Thread Ming Lei

Hi,

This patchset tries to solve one deadlock problem which might be caused
by memory allocation with block I/O during runtime PM and block device
error handling path. Traditionally, the problem is addressed by passing
GFP_NOIO statically to mm, but that is not an effective solution, see
detailed description in patch 1's commit log.

This patch set introduces one process flag and tries to fix the deadlock
problem on block device/network device during runtime PM or usb bus reset.

The 1st one is the change on include/sched.h and mm.

The 2nd patch introduces the flag of memalloc_noio on 'dev_pm_info',
and pm_runtime_set_memalloc_noio(), so that PM Core can teach mm to not
allocate mm with GFP_IO during the runtime_resume callback only on
device with the flag set.

The following 2 patches apply the introduced pm_runtime_set_memalloc_noio()
to mark all devices as memalloc_noio_resume in the path from the block or
network device to the root device in device tree.

The last 2 patches are applied against PM and USB subsystem to demonstrate
how to use the introduced mechanism to fix the deadlock problem.

Andrew, could you queue these patches into your tree since V6 fixes all
your concerns and it looks like no one objects to these patches?

Change logs:
V7:
- rebase on v3.8-rc2-next-20130104
- move memalloc_noio_save/memalloc_noio_restore into
rpm_callback to avoid code duplication, as suggested
by Rafael
- optimize on pm_runtime_set_memalloc_noio(true)
- fix type of 'flags' in memalloc_noio_save()/memalloc_noio_restore()
V6:
- fix one compile failure(1/6), and only one line change
V5:
- don't clear GFP_FS
- coding style fix
- add comments
- see details in individual change logs
V4:
- patches from the 2nd to the 6th changed
- call pm_runtime_set_memalloc_noio() after device_add() as pointed
by Alan
- set PF_MEMALLOC_NOIO during runtime_suspend()
V3:
- patch 2/6 and 5/6 changed, see their commit log
- remove RFC from title since several guys have expressed that
it is a reasonable solution
V2:
- remove changes on 'may_writepage' and 'may_swap'(1/6)
- unset GFP_IOFS in try_to_free_pages() path(1/6)
- introduce pm_runtime_set_memalloc_noio()
- only apply the mechanism on block/network device and its ancestors
for runtime resume context
V1:
- take Minchan's change to avoid the check in alloc_page hot path
- change the helpers' style into save/restore as suggested by Alan
- memory allocation with no io in usb bus reset path for all devices
as suggested by Greg and Oliver

 block/genhd.c|   10 +
 drivers/base/power/runtime.c |   89 +-
 drivers/usb/core/hub.c   |   13 ++
 include/linux/pm.h   |1 +
 include/linux/pm_runtime.h   |3 ++
 include/linux/sched.h|   22 +++
 mm/page_alloc.c  |9 -
 mm/vmscan.c  |4 +-
 net/core/net-sysfs.c |5 +++
 9 files changed, 152 insertions(+), 4 deletions(-)


Thanks,
--
Ming Lei

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

RE: [3.8-{rc1,rc2}] ata1.00: failed to get Identify Device Data, Emask 0x1

2013-01-04 Thread Huang, Shane

> OK, I see the patch I mentioned to fix the problem was later reverted [1].
> The real fix is "libata: replace sata_settings with devslp_timing" [2].

Yes, please use [2] which can also be found in kernel bugzilla #51881
and is pending on Jeff's acceptance. Sorry for the trouble to you guys.

Thanks,
Shane

Re: [PATCH] x86,perf: Add IvyBridge EP support

2013-01-04 Thread Youquan Song


Would you like to take it? It is needed by Linux OSVs.

Thanks
-Youquan

On Tue, Dec 18, 2012 at 12:20:23PM -0500, Youquan Song wrote:
> Run in perf utility at Ivybridge EP server, encounter "not supported" event
> 
> L1-dcache-loads 
> L1-dcache-load-misses   
> L1-dcache-stores
> L1-dcache-store-misses  
> L1-dcache-prefetches
> L1-dcache-prefetch-misses
> 
> This patch add the support for this processor.
> 
> Reviewed-by: Andi Kleen 
> Signed-off-by: Youquan Song 
> ---
>  arch/x86/kernel/cpu/perf_event_intel.c |1 +
>  1 files changed, 1 insertions(+), 0 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/perf_event_intel.c 
> b/arch/x86/kernel/cpu/perf_event_intel.c
> index 324bb52..aea3503 100644
> --- a/arch/x86/kernel/cpu/perf_event_intel.c
> +++ b/arch/x86/kernel/cpu/perf_event_intel.c
> @@ -2075,6 +2075,7 @@ __init int intel_pmu_init(void)
>   pr_cont("SandyBridge events, ");
>   break;
>   case 58: /* IvyBridge */
> + case 62: /* IvyBridge EP */
>   memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
>  sizeof(hw_cache_event_ids));
>   memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
> -- 
> 1.6.4.2
> 
 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: load/unload dccp module caused oops

2013-01-04 Thread CAI Qian



- Original Message -
> From: "Christoph Lameter" 
> To: "CAI Qian" 
> Cc: net...@vger.kernel.org, "Dave Miller" , 
> sta...@vger.kernel.org, "linux-kernel"
> , "Pekka Enberg" , "Glauber 
> Costa" 
> Sent: Friday, January 4, 2013 11:05:35 PM
> Subject: Re: load/unload dccp module caused oops
> 
> See the fix available here:
> 
> https://patchwork.kernel.org/patch/1909861/
Excellent! Thanks Christoph.

Tested-by: CAI Qian 
> 
> 
> On Fri, 4 Jan 2013, CAI Qian wrote:
> 
> > The bisecting pointed out this commit fixed the problem in
> > the mainline.
> >
> > 3c58346525d82625e68e24f071804c2dc057b6f4
> > slab: Simplify bootstrap
> >
> > However, simply back-ported this single commit to the 3.7.1
> > stable wasn't enough to fix it. My guess is that there are
> > some other slab/slub commits required to fix this. Keep digging...
> >
> > The kernel config used the SLUB,
> > http://people.redhat.com/qcai/stable/.config
> >
> > CAI Qian
> >
> > - Original Message -
> > > From: "CAI Qian" 
> > > To: net...@vger.kernel.org
> > > Cc: "Dave Miller" , sta...@vger.kernel.org
> > > Sent: Friday, January 4, 2013 9:57:43 AM
> > > Subject: Re: load/unload dccp module caused
> > >
> > > Adding the netdev as Dave suggested.
> > >
> > > - Original Message -
> > > > From: "CAI Qian" 
> > > > To: sta...@vger.kernel.org
> > > > Cc: "Dave Miller" 
> > > > Sent: Monday, December 31, 2012 5:42:59 PM
> > > > Subject: load/unload dccp module caused
> > > >
> > > > Just a head up that load and then unload the dccp module
> > > > caused an oops below using the current stable kernel - v3.7.1.
> > > > Some additional data point here: the mainline v3.6 release has
> > > > no such problem, so this looks like a regression. The mainline
> > > > v3.8-rc1 also has no such problem, so it looks like it has
> > > > already been fixed there but looks like yet queued up for the
> > > > stable yet (tested a few commits in Greg's stable-queue and
> > > > Dave's net-stable queue did not find anything obvious to fix
> > > > this). I am in-process to bisect to figure out the one that
> > > > need to back-port right now.
> > > >
> > > > [   93.809573]
> > > > =
> > > > [   93.809577] BUG kmalloc-16 (Tainted: GB   ): Objects
> > > > remaining in kmalloc-16 on kmem_cache_close()
> > > > [   93.809580]
> > > > -
> > > > [   93.809580]
> > > > ...
> > > > [  356.336244] INFO: Object 0xc000fa1f0aa0 @offset=2720
> > > > [  356.336247] INFO: Object 0xc000fa1f0ab0 @offset=2736
> > > > [  356.336249] INFO: Object 0xc000fa1f0ac0 @offset=2752
> > > > [  356.336254] INFO: Object 0xc000fa1f0ad0 @offset=2768
> > > > [  356.336257] INFO: Object 0xc000fa1f0ae0 @offset=2784
> > > > [  356.336259] INFO: Object 0xc000fa1f0af0 @offset=2800
> > > > [  356.336262] INFO: Object 0xc000fa1f0b80 @offset=2944
> > > > [  356.336264] INFO: Object 0xc000fa1f0bd0 @offset=3024
> > > > [  356.336271] INFO: Object 0xc000fa1f1870 @offset=6256
> > > > [  356.336274] INFO: Object 0xc000fa1f1880 @offset=6272
> > > > [  356.336276] INFO: Object 0xc000fa1f1890 @offset=6288
> > > > [  356.346976] INFO: Object 0xc000fa1f18a0 @offset=6304
> > > > [  356.346979] INFO: Object 0xc000fa1f18b0 @offset=6320
> > > > [  356.346981] INFO: Object 0xc000fa1f1950 @offset=6480
> > > > [  356.346986] INFO: Object 0xc000fa1f1960 @offset=6496
> > > > [  356.346989] INFO: Object 0xc000fa1f1970 @offset=6512
> > > > [  356.346991] INFO: Object 0xc000fa1f1980 @offset=6528
> > > > [  356.346994] INFO: Object 0xc000fa1f1990 @offset=6544
> > > > [  356.346997] INFO: Object 0xc000fa1f19a0 @offset=6560
> > > > [  356.346999] INFO: Object 0xc000fa1f19b0 @offset=6576
> > > > [  356.347005] INFO: Object 0xc000fa1f19c0 @offset=6592
> > > > [  356.347008] INFO: Object 0xc000fa1f19d0 @offset=6608
> > > > [  356.347010] INFO: Object 0xc000fa1f19e0 @offset=6624
> > > > [  356.347012] INFO: Object 0xc000fa1f19f0 @offset=6640
> > > > [  356.347081] kmem_cache_destroy kmalloc-16: Slab cache still
> > > > has
> > > > objects
> > > > ...
> > > > [441283.322161] BUG: unable to handle kernel NULL pointer
> > > > dereference
> > > > at   (null)
> > > > [441283.331020] IP: []
> > > > __kmem_cache_shutdown+0xa9/0x2f0
> > > > [441283.338320] PGD 105568f067 PUD 104a086067 PMD 0
> > > > [441283.343600] Oops:  [#1] SMP
> > > > [441283.347318] Modules linked in: dccp(-) nf_tproxy_core
> > > > deflate
> > > > zlib_deflate lzo nls_koi8_u nls_cp932 ts_kmp sctp libcrc32c
> > > > binfmt_misc des_generic md4 nls_utf8 cifs dns_resolver sg
> > > > iTCO_wdt
> > > > kvm_intel igb iTCO_vendor_support coretemp kvm crc32c_intel
> > > > lpc_ich
> > > > i7core_edac edac_core i2c_i801 i2c_core mfd_core pcspkr
> > > > microcode
> > > > ioatdma dca sr_mod cdrom ata_generic

Re: [PATCH v7u1 26/31] x86: Don't enable swiotlb if there is not enough ram for it

2013-01-04 Thread Shuah Khan

On Fri, Jan 4, 2013 at 4:55 PM, Yinghai Lu  wrote:
> On Fri, Jan 4, 2013 at 3:21 PM, Shuah Khan  wrote:
>
>> Please see attached dmesg for full log. I can do some testing on this
>> system with your patch if you would like.
>
> That would be great.
>
> Please try
> git://git.kernel.org/pub/scm/linux/kernel/git/yinghai/linux-yinghai.git
> for-x86-boot
> or just this patch.
>
> Too bad, I can not access AMD systems with IOMMU support.
>
> Thanks a lot.

I tried your patch on my AMD system. I did change the patch to print
warning instead of panic() and it did trigger the condition for
panic.:

[5.376654] AMD-Vi: Found IOMMU at :00:00.2 cap 0x40
[5.376717]
[5.376799] pci :00:00.2: irq 72 for MSI/MSI-X

It would have panic'ed here:

[5.388858] AMD-Vi: can not enable swiotlb for unhandled devices by
AMD iommu!

[5.388964] AMD-Vi: Lazy IO/TLB flushing enabled
[5.389324] LVT offset 0 assigned for vector 0x400

I applied your patch to 3.6.11 and changed the panic() to pr_info()
and also changed enough_mem_for_swiotlb() to always return false to
simulate not enough memory condition as this system does have enough
memory.

So at least on this AMD system, your patch will result in a panic.

Thanks,
-- Shuah
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/1] tracing: remove deprecated power trace API

2013-01-04 Thread Paul Gortmaker

The text in Documentation said it would be removed in 2.6.41;
the text in the Kconfig said removal in the 3.1 release.  Either
way you look at it, we are well past both, so push it off a cliff.

Note that the POWER_CSTATE and the POWER_PSTATE are part of the
legacy tracing API.  Remove all tracepoints which use these flags.
As can be seen from context, most already have a trace entry via
trace_cpu_idle anyways.

Also, the cpufreq/cpufreq.c PSTATE one is actually unpaired, as
compared to the CSTATE ones which all have a clear start/stop.
As part of this, the trace_power_frequency also becomes orphaned,
so it too is deleted.

Signed-off-by: Paul Gortmaker 
---
 Documentation/trace/events-power.txt | 27 +--
 arch/arm/mach-omap2/pm34xx.c |  2 -
 arch/x86/kernel/process.c|  6 ---
 drivers/cpufreq/cpufreq.c|  1 -
 drivers/cpuidle/cpuidle.c|  2 -
 include/trace/events/power.h | 92 
 kernel/trace/Kconfig | 15 --
 kernel/trace/power-traces.c  |  3 --
 8 files changed, 1 insertion(+), 147 deletions(-)

diff --git a/Documentation/trace/events-power.txt 
b/Documentation/trace/events-power.txt
index cf794af..e1498ff 100644
--- a/Documentation/trace/events-power.txt
+++ b/Documentation/trace/events-power.txt
@@ -17,7 +17,7 @@ Cf. include/trace/events/power.h for the events definitions.
 1. Power state switch events
 
 
-1.1 New trace API
+1.1 Trace API
 -
 
 A 'cpu' event class gathers the CPU-related events: cpuidle and
@@ -41,31 +41,6 @@ The event which has 'state=4294967295' in the trace is very 
important to the use
 space tools which are using it to detect the end of the current state, and so 
to
 correctly draw the states diagrams and to calculate accurate statistics etc.
 
-1.2 DEPRECATED trace API
-
-
-A new Kconfig option CONFIG_EVENT_POWER_TRACING_DEPRECATED with the default 
value of
-'y' has been created. This allows the legacy trace power API to be used 
conjointly
-with the new trace API.
-The Kconfig option, the old trace API (in include/trace/events/power.h) and the
-old trace points will disappear in a future release (namely 2.6.41).
-
-power_start"type=%lu state=%lu cpu_id=%lu"
-power_frequency"type=%lu state=%lu cpu_id=%lu"
-power_end  "cpu_id=%lu"
-
-The 'type' parameter takes one of those macros:
- . POWER_NONE  = 0,
- . POWER_CSTATE= 1,/* C-State */
- . POWER_PSTATE= 2,/* Frequency change or DVFS */
-
-The 'state' parameter is set depending on the type:
- . Target C-state for type=POWER_CSTATE,
- . Target frequency for type=POWER_PSTATE,
-
-power_end is used to indicate the exit of a state, corresponding to the latest
-power_start event.
-
 2. Clocks events
 
 The clock events are used for clock enable/disable and for
diff --git a/arch/arm/mach-omap2/pm34xx.c b/arch/arm/mach-omap2/pm34xx.c
index 7be3622..2d93d8b 100644
--- a/arch/arm/mach-omap2/pm34xx.c
+++ b/arch/arm/mach-omap2/pm34xx.c
@@ -351,12 +351,10 @@ static void omap3_pm_idle(void)
if (omap_irq_pending())
goto out;
 
-   trace_power_start(POWER_CSTATE, 1, smp_processor_id());
trace_cpu_idle(1, smp_processor_id());
 
omap_sram_idle();
 
-   trace_power_end(smp_processor_id());
trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
 
 out:
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 2ed787f..dcfc1f4 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -375,7 +375,6 @@ void cpu_idle(void)
  */
 void default_idle(void)
 {
-   trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
trace_cpu_idle_rcuidle(1, smp_processor_id());
current_thread_info()->status &= ~TS_POLLING;
/*
@@ -389,7 +388,6 @@ void default_idle(void)
else
local_irq_enable();
current_thread_info()->status |= TS_POLLING;
-   trace_power_end_rcuidle(smp_processor_id());
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
 }
 #ifdef CONFIG_APM_MODULE
@@ -423,7 +421,6 @@ void stop_this_cpu(void *dummy)
 static void mwait_idle(void)
 {
if (!need_resched()) {
-   trace_power_start_rcuidle(POWER_CSTATE, 1, smp_processor_id());
trace_cpu_idle_rcuidle(1, smp_processor_id());
if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
@@ -434,7 +431,6 @@ static void mwait_idle(void)
__sti_mwait(0, 0);
else
local_irq_enable();
-   trace_power_end_rcuidle(smp_processor_id());
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
} else
local_irq_enable();
@@ -447,12 +443,10 @@ static void mwait_idle(void)
  */

[PATCH/RFC 0/1] Delete legacy power trace API

2013-01-04 Thread Paul Gortmaker

The actual deletion is mind-numbingly simple; and if you go by the
comments in the code, it is well overdue.  However, in discussions
with Frederic, he suggested to me that those comments might have
been overly optimistic, and that there may still be people out
there who are still unknowingly using this dead API.

So, that is the crux of the RFC component -- to check whether the
comments saying "delete by v3.1" can be taken at face value, or
whether they were overly optimistic, and hence this stuff is still
actively used even though it is overdue for deletion.

Thanks,
Paul.
---

Paul Gortmaker (1):
  tracing: remove deprecated power trace API

 Documentation/trace/events-power.txt | 27 +--
 arch/arm/mach-omap2/pm34xx.c |  2 -
 arch/x86/kernel/process.c|  6 ---
 drivers/cpufreq/cpufreq.c|  1 -
 drivers/cpuidle/cpuidle.c|  2 -
 include/trace/events/power.h | 92 
 kernel/trace/Kconfig | 15 --
 kernel/trace/power-traces.c  |  3 --
 8 files changed, 1 insertion(+), 147 deletions(-)

-- 
1.8.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/5] perf: Make EVENT_ATTR and EVENT_PTR global

2013-01-04 Thread Sukadev Bhattiprolu

Jiri Olsa [jo...@redhat.com] wrote:
| On Tue, Dec 18, 2012 at 11:28:02PM -0800, Sukadev Bhattiprolu wrote:
| > 
| > Rename EVENT_ATTR() and EVENT_PTR() PMU_EVENT_ATTR() and PMU_EVENT_PTR().
| > Make them global so they are available to all architectures.
| > 
| > Further to allow architectures flexibility, have PMU_EVENT_PTR() pass in the
| > variable name as a parameter.
| > 
| hi,
| the change looks ok apart from some nits below.
| 
| There's another version of the x86 event attributes change
| I mentioned earlier:
| 
| http://marc.info/?l=linux-kernel&m=135601815224373&w=2
| 
| I'm not sure which one will make it in first, but you
| guys need to sync ;-) CC-ing Andi and Stephane.

One change that would help powerpc (and other architectures) is to move
the 'struct perf_pmu_events_attr' to say, include/linux/perf_event.h.

Each architecture can define EVENT_VAR(), EVENT_PTR() etc as needed.

| 
| thanks,
| jirka
| 



| > +struct perf_pmu_events_attr {
| > +   struct device_attribute attr;
| > +   u64 id;
| > +};
| > +
| > +#define PMU_EVENT_PTR(_var) &_var.attr.attr
| 
| this one seems superfluous as well, could be replaced by '&'

I guess that would encode the assumption that both the 'attr' fields are 
the first in their respective structures. If so, an explicit comment beside
the fields would be useful.

Sukadev

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] mm: thp: Acquire the anon_vma rwsem for lock during split

2013-01-04 Thread Michel Lespinasse

On Fri, Jan 4, 2013 at 6:08 AM, Mel Gorman  wrote:
> Despite the reason for these commits, NUMA balancing is not the direct
> source of the problem. split_huge_page() expected the anon_vma lock to be
> exclusive to serialise the whole split operation. Ordinarily it is expected
> that the anon_vma lock would only be required when updating the avcs but
> THP also uses it. The locking requirements for THP are complex and there
> is some overlap but broadly speaking they include the following
>
> 1. mmap_sem for read or write prevents THPs being created underneath
> 2. anon_vma is taken for write if collapsing a huge page
> 3. mm->page_table_lock should be taken when checking if pmd_trans_huge as
>split_huge_page can run in parallel
> 4. wait_split_huge_page uses anon_vma taken for write mode to serialise
>against other THP operations
> 5. compound_lock is used to serialise between
>__split_huge_page_refcount() and gup
>
> split_huge_page takes anon_vma for read but that does not serialise against
> parallel split_huge_page operations on the same page (rule 2). One process
> could be modifying the ref counts while the other modifies the page tables
> leading to counters not being reliable. This patch takes the anon_vma
> lock for write to serialise against parallel split_huge_page and parallel
> collapse operations as it is the most fine-grained lock available that
> protects against both.

Your comment about this being the most fine-grained lock made me
think, couldn't we use lock_page() on the THP page here ?

Now I don't necessarily want to push you that direction, because I
haven't fully thought it through and because what you propose brings us
closer to what happened before anon_vma became an rwlock, which is
more obviously safe. But I felt I should still mention it, since we're
really only trying to protect from concurrent operations on the same
THP page, so locking at just that granularity would seem desirable.

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: 3.8-rc2: pciehp waitqueue hang...

2013-01-04 Thread Yijing Wang

On 2013/1/5 5:50, Bjorn Helgaas wrote:
> [+to Yijing, +cc Kenji]
> 
> On Fri, Jan 4, 2013 at 1:01 PM, Bjorn Helgaas  wrote:
>> On Thu, Jan 3, 2013 at 8:41 AM, Jiang Liu  wrote:
>>> Hi Daniel,
>>> It seems like an issue caused by recursive PCIe HPC.
>>> Could you please help to try the patch from:
>>> http://www.spinics.net/lists/linux-pci/msg18625.html
>>
>> Hi Gerry,
>>
>> I'm working on merging this patch.  Seems like something that might be
>> appropriate for stable as well.
>>
>> Did you look for similar problems in other hotplug drivers?
> 
> Oops, sorry, I forgot that Yijing is the author of the patch in question.
> 
> Yijing, please check for the same problem in other hotplug drivers.
> Questions I have after a quick look:
> 

OK, I will check the similar problems for other hotplug drivers, my pleasure.

Thanks!
Yijing.

>   - shpchp_wq looks like it might have the same deadlock issue.
> 
>   - pciehp_wq (and your per-slot replacement) are allocated with
> alloc_workqueue().  shpchp_wq is allocated with
> alloc_ordered_workqueue().  Why the difference?
> 
>   - The alloc/alloc_ordered difference might be related to 486b10b9f4,
> where Kenji removed alloc_ordered from pciehp.  Should a similar
> change be made to shpchp?
> 
>   - acpiphp uses the global kacpi_hotplug_wq.  We never flush or drain
> kacpi_hotplug_wq, so I doubt there's a deadlock issue, but I wonder if
> there are any ordering issues there because we *don't* ever wait for
> things in that queue to be completed.
> 
>>> Thanks!
>>> Gerry
>>> On 01/03/2013 11:11 PM, Daniel J Blueman wrote:
 When the Apple thunderbolt ethernet adapter comes loose on my Macbook
 Pro Retina (Intel DSL3510), we see pci_slot_name return
 non-deterministic data (ie varying each boot), and we see pciehp_wp
 remain armed with events causing the kthread to get stuck:

 tg3 :0a:00.0 eth0: Link is up at 1000 Mbps, full duplex
 tg3 :0a:00.0 eth0: Flow control is on for TX and on for RX
 
 pciehp :06:03.0:pcie24: Card not present on Slot(3)
 tg3 :0a:00.0: tg3_abort_hw timed out, TX_MODE_ENABLE will not
 clear MAC_TX_MODE=
 tg3 :0a:00.0 eth0: No firmware running
 tg3 :0a:00.0 eth0: Link is down
 pcieport :00:01.1: System wakeup enabled by ACPI
 pciehp :09:00.0:pcie24: unloading service driver pciehp
 pciehp :09:00.0:pcie24: Latch open on
 Slot(\xfff89\xffbbe\x02\xff88\x\x\xffe09\xffbbe\x02\xff88\x\xfbcon)
 pciehp :09:00.0:pcie24: Button pressed on
 Slot(\xfff89\xffbbe\x02\xff88\x\x\xffe09\xffbbe\x02\xff88\x\xfbcon)
 pciehp :09:00.0:pcie24: Card present on
 Slot(\xfff89\xffbbe\x02\xff88\x\x\xffe09\xffbbe\x02\xff88\x\xfbcon)
 pciehp :09:00.0:pcie24: Power fault on slot
 \xfff89\xffbbe\x02\xff88\x\x\xffe09\xffbbe\x02\xff88\x\xfbcon
 pciehp :09:00.0:pcie24: Power fault bit 0 set
 pciehp :09:00.0:pcie24: PCI slot
 #\xfff89\xffbbe\x02\xff88\x\x\xffe09\xffbbe\x02\xff88\x\xfbcon
 - powering on due to button press.
 pciehp :09:00.0:pcie24: Link Training Error occurs
 pciehp :09:00.0:pcie24: Failed to check link status
 INFO: task kworker/0:1:52 blocked for more than 120 seconds.
 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
 kworker/0:1   D 880265893090   0  52   2 0x
  8802655456f8 0046 81a21a60 880265545fd8
  4000 880265545fd8 880265892bb0 880265adc8d0
  059e 0082 880265545668 810415aa
 Call Trace:
  [] ? console_unlock+0x1fa/0x4a0
  [] ? trace_hardirqs_off+0xd/0x10
  [] ? vprintk_emit+0x1c9/0x510
  [] schedule+0x24/0x70
  [] schedule_timeout+0x19c/0x1e0
  [] wait_for_common+0xe3/0x180
  [] ? flush_workqueue+0x111/0x4d0
  [] ? try_to_wake_up+0x2d0/0x2d0
  [] wait_for_completion+0x18/0x20
  [] flush_workqueue+0x1d6/0x4d0
  [] ? flush_workqueue_prep_cwqs+0x200/0x200
  [] pciehp_release_ctrl+0x39/0x90
  [] pciehp_remove+0x25/0x30
  [] pcie_port_remove_service+0x52/0x70
  [] __device_release_driver+0x77/0xe0
  [] device_release_driver+0x29/0x40
  [] bus_remove_device+0xf1/0x140
  [] device_del+0x127/0x1c0
  [] ? resume_iter+0x40/0x40
  [] device_unregister+0x11/0x20
  [] remove_iter+0x35/0x40
  [] device_for_each_child+0x36/0x70
  [] pcie_port_device_remove+0x21/0x40
  [] pcie_portdrv_remove+0x28/0x50
  [] pci_device_remove+0x41/0xc0
  [] __device_release_driver+0x77/0xe0
  [] device_release_driver+0x29/0x40
  [] bus_remove_device+0xf1/0x140

[PATCH] block: delete super ancient PC-XT driver for 1980's hardware

2013-01-04 Thread Paul Gortmaker

This driver was for the 8 bit ISA cards that were installed in
the PC-XT machines of 1980 vintage.  They supported the dual
ribbon cable MFM drives of 10-20MB capacity, and ran at a 3:1
interleave, giving performance on the order of 128kB/s.

By the introduction of the PC-AT (286) these controllers were
already scrapped in favour of 16 bit controllers with some onboard
RAM that could support a 1:1 interleave.

The git history doesn't show any evidence of runtime fixes that
would reflect active usage; instead just the usual tree-wide API
type changes/cleanups.  Going back to in-source changelogs, the
last "runtime" fix that is evident is something I did over a
dozen years ago[1] -- and even back then, the hardware was long
since unavailable, so that ancient fix was also not runtime tested.

The time is long overdue for this to get flushed, so let's get
rid of it before anyone wastes more time doing builds and sparse
checks etc. on long since dead code.

[1] http://lkml.indiana.edu/hypermail/linux/kernel/0102.2/0027.html

Signed-off-by: Paul Gortmaker 
---
 drivers/block/Kconfig  |   13 -
 drivers/block/Makefile |1 -
 drivers/block/xd.c | 1123 
 drivers/block/xd.h |  134 --
 4 files changed, 1271 deletions(-)
 delete mode 100644 drivers/block/xd.c
 delete mode 100644 drivers/block/xd.h

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 824e09c..e29a44e 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -63,19 +63,6 @@ config AMIGA_Z2RAM
  To compile this driver as a module, choose M here: the
  module will be called z2ram.
 
-config BLK_DEV_XD
-   tristate "XT hard disk support"
-   depends on ISA && ISA_DMA_API
-   select CHECK_SIGNATURE
-   help
- Very old 8 bit hard disk controllers used in the IBM XT computer
- will be supported if you say Y here.
-
- To compile this driver as a module, choose M here: the
- module will be called xd.
-
- It's pretty unlikely that you have one of these: say N.
-
 config GDROM
tristate "SEGA Dreamcast GD-ROM drive"
depends on SH_DREAMCAST
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 17e82df..5195c1f 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -15,7 +15,6 @@ obj-$(CONFIG_ATARI_FLOPPY)+= ataflop.o
 obj-$(CONFIG_AMIGA_Z2RAM)  += z2ram.o
 obj-$(CONFIG_BLK_DEV_RAM)  += brd.o
 obj-$(CONFIG_BLK_DEV_LOOP) += loop.o
-obj-$(CONFIG_BLK_DEV_XD)   += xd.o
 obj-$(CONFIG_BLK_CPQ_DA)   += cpqarray.o
 obj-$(CONFIG_BLK_CPQ_CISS_DA)  += cciss.o
 obj-$(CONFIG_BLK_DEV_DAC960)   += DAC960.o
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
deleted file mode 100644
index ff54052..000
--- a/drivers/block/xd.c
+++ /dev/null
@@ -1,1123 +0,0 @@
-/*
- * This file contains the driver for an XT hard disk controller
- * (at least the DTC 5150X) for Linux.
- *
- * Author: Pat Mackinlay, p...@it.com.au
- * Date: 29/09/92
- * 
- * Revised: 01/01/93, ...
- *
- * Ref: DTC 5150X Controller Specification (thanks to Kevin Fowler,
- *   kev...@agora.rain.com)
- * Also thanks to: Salvador Abreu, Dave Thaler, Risto Kankkunen and
- *   Wim Van Dorst.
- *
- * Revised: 04/04/94 by Risto Kankkunen
- *   Moved the detection code from xd_init() to xd_geninit() as it needed
- *   interrupts enabled and Linus didn't want to enable them in that first
- *   phase. xd_geninit() is the place to do these kinds of things anyway,
- *   he says.
- *
- * Modularized: 04/10/96 by Todd Fries, tfr...@umr.edu
- *
- * Revised: 13/12/97 by Andrzej Krzysztofowicz, an...@mif.pg.gda.pl
- *   Fixed some problems with disk initialization and module initiation.
- *   Added support for manual geometry setting (except Seagate controllers)
- *   in form:
- *  xd_geo=<cyl_xda>,<head_xda>,<sec_xda>[,<cyl_xdb>,<head_xdb>,<sec_xdb>]
- *   Recovered DMA access. Abridged messages. Added support for DTC5051CX,
- *   WD1002-27X & XEBEC controllers. Driver uses now some jumper settings.
- *   Extended ioctl() support.
- *
- * Bugfix: 15/02/01, Paul G. - inform queue layer of tiny xd_maxsect.
- *
- */
-
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-#include 
-
-#include 
-#include 
-
-#include "xd.h"
-
-static DEFINE_MUTEX(xd_mutex);
-static void __init do_xd_setup (int *integers);
-#ifdef MODULE
-static int xd[5] = { -1,-1,-1,-1, };
-#endif
-
-#define XD_DONT_USE_DMA0  /* Initial value. may be overriden 
using
- "nodma" module option */
-#define XD_INIT_DISK_DELAY (30)  /* 30 ms delay during disk initialization 
*/
-
-/* Above may need to be increased if a problem with the 2nd drive detection
-   (ST11M controller) or resetting a controller (WD) appears */
-
-static XD_INFO xd_info[XD_MAXDRIVES];
-
-/* If you try this driver and find that your card is not

Re: ppoll() stuck on POLLIN while TCP peer is sending

2013-01-04 Thread Eric Wong

Mel Gorman  wrote:
> On Wed, Jan 02, 2013 at 08:08:48PM +, Eric Wong wrote:
> > Instead, I disabled THP+compaction under v3.7.1 and I've been unable to
> > reproduce the issue without THP+compaction.
> > 
> 
> Implying that it's stuck in compaction somewhere. It could be the case
> that compaction alters timing enough to trigger another bug. You say it
> tests differently depending on whether TCP or unix sockets are used
> which might indicate multiple problems. However, lets try and see if
> compaction is the primary problem or not.

I've only managed to encounter this issue with TCP sockets.

No luck reproducing the issue with Unix sockets, not even with 90K
buffers as suggested by Eric Dumazet.  This seems unique to TCP.

Fwiw, I also tried going back to a 16K MTU on loopback a few days ago,
but was still able to reproduce the issue, so
commit 0cf833aefaa85bbfce3ff70485e5534e09254773 doesn't seem
to be a culprit, either.

> > As I mention in http://mid.gmane.org/20121229113434.ga13...@dcvr.yhbt.net
> > I run my below test (`toosleepy') with heavy network and disk activity
> > for a long time before hitting this.
> > 
> 
> Using a 3.7.1 or 3.8-rc2 kernel, can you reproduce the problem and then
> answer the following questions please?

OK, I'm on 3.8-rc2.

> 1. What are the contents of /proc/vmstat at the time it is stuck?

nr_free_pages 1998
nr_inactive_anon 3401
nr_active_anon 3349
nr_inactive_file 94361
nr_active_file 10929
nr_unevictable 0
nr_mlock 0
nr_anon_pages 6643
nr_mapped 2255
nr_file_pages 105400
nr_dirty 44
nr_writeback 0
nr_slab_reclaimable 0
nr_slab_unreclaimable 0
nr_page_table_pages 697
nr_kernel_stack 161
nr_unstable 0
nr_bounce 0
nr_vmscan_write 0
nr_vmscan_immediate_reclaim 0
nr_writeback_temp 0
nr_isolated_anon 0
nr_isolated_file 0
nr_shmem 114
nr_dirtied 1076168
nr_written 46330
nr_anon_transparent_hugepages 0
nr_free_cma 0
nr_dirty_threshold 22495
nr_dirty_background_threshold 11247
pgpgin 4398164
pgpgout 188556
pswpin 0
pswpout 0
pgalloc_dma 369887
pgalloc_dma32 28406230
pgalloc_normal 0
pgalloc_movable 0
pgfree 28779104
pgactivate 18160
pgdeactivate 17404
pgfault 34862559
pgmajfault 358
pgrefill_dma 14076
pgrefill_dma32 3328
pgrefill_normal 0
pgrefill_movable 0
pgsteal_kswapd_dma 12708
pgsteal_kswapd_dma32 917837
pgsteal_kswapd_normal 0
pgsteal_kswapd_movable 0
pgsteal_direct_dma 73
pgsteal_direct_dma32 4085
pgsteal_direct_normal 0
pgsteal_direct_movable 0
pgscan_kswapd_dma 12708
pgscan_kswapd_dma32 918789
pgscan_kswapd_normal 0
pgscan_kswapd_movable 0
pgscan_direct_dma 73
pgscan_direct_dma32 4115
pgscan_direct_normal 0
pgscan_direct_movable 0
pgscan_direct_throttle 0
pginodesteal 0
slabs_scanned 257024
kswapd_inodesteal 69910
kswapd_low_wmark_hit_quickly 2165
kswapd_high_wmark_hit_quickly 275
kswapd_skip_congestion_wait 0
pageoutrun 13412
allocstall 73
pgrotated 3
pgmigrate_success 448
pgmigrate_fail 0
compact_migrate_scanned 14860
compact_free_scanned 219867
compact_isolated 1652
compact_stall 33
compact_fail 10
compact_success 23
unevictable_pgs_culled 1058
unevictable_pgs_scanned 0
unevictable_pgs_rescued 1671
unevictable_pgs_mlocked 1671
unevictable_pgs_munlocked 1671
unevictable_pgs_cleared 0
unevictable_pgs_stranded 0
thp_fault_alloc 0
thp_fault_fallback 0
thp_collapse_alloc 0
thp_collapse_alloc_failed 0
thp_split 0
thp_zero_page_alloc 0
thp_zero_page_alloc_failed 0

> 2. What are the contents of /proc/PID/stack for every toosleepy
>process when they are stuck?

Oops, I needed a rebuild with CONFIG_STACKTRACE=y (it took some effort
to get the right combination of options).

I probably enabled a few more debugging options than I needed and it
seems to have taken longer to reproduce the issue.  Unfortunately I was
distracted when toosleepy got stuck and missed the chance to inspect
before hitting ETIMEDOUT :x

Attempting to reproduce the issue while I'm looking.

> 3. Can you do a sysrq+m and post the resulting dmesg?

SysRq : Show Memory
Mem-Info:
DMA per-cpu:
CPU0: hi:0, btch:   1 usd:   0
CPU1: hi:0, btch:   1 usd:   0
DMA32 per-cpu:
CPU0: hi:  186, btch:  31 usd: 144
CPU1: hi:  186, btch:  31 usd: 160
active_anon:3358 inactive_anon:3379 isolated_anon:0
 active_file:10615 inactive_file:92319 isolated_file:0
 unevictable:0 dirty:3 writeback:0 unstable:0
 free:2240 slab_reclaimable:0 slab_unreclaimable:0
 mapped:2333 shmem:114 pagetables:697 bounce:0
 free_cma:0
DMA free:2408kB min:84kB low:104kB high:124kB active_anon:8kB 
inactive_anon:44kB active_file:824kB inactive_file:11512kB unevictable:0kB 
isolated(anon):0kB isolated(file):0kB present:15676kB managed:15900kB 
mlocked:0kB dirty:0kB writeback:0kB mapped:16kB shmem:0kB slab_reclaimable:0kB 
slab_unreclaimable:0kB kernel_stack:112kB pagetables:20kB unstable:0kB 
bounce:0kB free_cma:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 489 489 489
DMA32 free:6552kB min:2784kB low:3480kB high:4176kB active_anon:13424kB 
inactive_anon:13472kB active_file:41636kB

Re: PEBS (in perf) stopped working from 3.6 -> 3.7

2013-01-04 Thread Steinar H. Gunderson

On Fri, Jan 04, 2013 at 05:16:27PM -0700, David Ahern wrote:
> Known problem. Pick one of: update perf to 3.7, add H to the command
> (-e cycles:ppH) or apply this patch:
> https://lkml.org/lkml/2012/12/28/384

I spoke too soon. This works for cycles, but not for branch-misses:

  pannekake:~> sudo perf record -e branch-misses:ppH -a

Error: sys_perf_event_open() syscall returned with 95 (Operation not 
supported) for event branch-misses:ppH. /bin/dmesg may provide additional 
information.

  No hardware sampling interrupt available. No APIC? If so then you can boot 
the kernel with the "lapic" boot parameter to force-enable it.

Why would the two be different?

/* Steinar */
-- 
Homepage: http://www.sesse.net/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [Alternative 2][PATCH] ACPI / PCI: Set root bridge ACPI handle in advance

2013-01-04 Thread Rafael J. Wysocki

On Friday, January 04, 2013 05:36:55 PM Bjorn Helgaas wrote:
> On Fri, Jan 4, 2013 at 5:19 PM, Yinghai Lu  wrote:
> > On Fri, Jan 4, 2013 at 4:14 PM, Rafael J. Wysocki  wrote:
> >> On Friday, January 04, 2013 04:03:01 PM Yinghai Lu wrote:
> >>> On Fri, Jan 4, 2013 at 3:38 AM, Rafael J. Wysocki  wrote:
> >>> >> --- a/arch/x86/include/asm/pci.h
> >>> >> +++ b/arch/x86/include/asm/pci.h
> >>> >> @@ -14,6 +14,7 @@
> >>> >>  struct pci_sysdata {
> >>> >>   int domain; /* PCI domain */
> >>> >>   int node;   /* NUMA node */
> >>> >> + void*acpi_handle;
> >>> >>  #ifdef CONFIG_X86_64
> >>> >>   void*iommu; /* IOMMU private data */
> >>> >>  #endif
> >>> >>
> >>>
> >>> acpi_handle is not good name and it is confusing.
> >>
> >> Well, what would be a better name in your opinion?
> >>
> >> I was going to put that into a #ifdef CONFIG_ACPI / #endif, so what about
> >> calling it acpi_data?
> >
> > yes, with #ifdef, you can use acpi_handle type directly.
> >
> > it is acpi handle for pci_root.
> >
> > so would call int pci_root_acpi_handle ?
> 
> I just copied the name from the corresponding ia64 code.  I don't care
> if you want to change it, but I think there is *some* value in keeping
> the x86 and ia64 code as similar as possible because it would be nice
> to converge it some day.

Well, the corresponding data structure for ia64 is called struct pci_controller,
so it is quite obvious what acpi_handle in there means. :-)

Since the data structure for x86 is called struct pci_sysdata and the data
type for the field in question may be acpi_handle, perhaps we can call that
field simply "root_handle"?

Alternatively, in analogy with the iommu we could use void * as its data
type and call it simply "acpi".

That said I'm fine with using just "void *acpi_handle" as you did, but I would
do the #ifdef CONFIG_ACPI / #endif around it anyway.

I wonder what Peter thinks?

Thanks,
Rafael


-- 
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] writeback: fix writeback cache thrashing

2013-01-04 Thread Simon Jeons

On Fri, 2013-01-04 at 16:41 +0900, Namjae Jeon wrote:
> 2013/1/4, Simon Jeons :
> > On Thu, 2013-01-03 at 13:35 +0900, Namjae Jeon wrote:
> >> 2013/1/2, Jan Kara :
> >> > On Tue 01-01-13 08:51:04, Wanpeng Li wrote:
> >> >> On Mon, Dec 31, 2012 at 12:30:54PM +0100, Jan Kara wrote:
> >> >> >On Sun 30-12-12 14:59:50, Namjae Jeon wrote:
> >> >> >> From: Namjae Jeon 
> >> >> >>
> >> >> >> Consider Process A: huge I/O on sda
> >> >> >> doing heavy write operation - dirty memory becomes more
> >> >> >> than dirty_background_ratio
> >> >> >> on HDD - flusher thread flush-8:0
> >> >> >>
> >> >> >> Consider Process B: small I/O on sdb
> >> >> >> doing while [1]; read 1024K + rewrite 1024K + sleep 2sec
> >> >> >> on Flash device - flusher thread flush-8:16
> >> >> >>
> >> >> >> As Process A is a heavy dirtier, dirty memory becomes more
> >> >> >> than dirty_background_thresh. Due to this, below check becomes
> >> >> >> true(checking global_page_state in over_bground_thresh)
> >> >> >> for all bdi devices(even for very small dirtied bdi - sdb):
> >> >> >>
> >> >> >> In this case, even small cached data on 'sdb' is forced to flush
> >> >> >> and writeback cache thrashing happens.
> >> >> >>
> >> >> >> When we added debug prints inside above 'if' condition and ran
> >> >> >> above Process A(heavy dirtier on bdi with flush-8:0) and
> >> >> >> Process B(1024K frequent read/rewrite on bdi with flush-8:16)
> >> >> >> we got below prints:
> >> >> >>
> >> >> >> [Test setup: ARM dual core CPU, 512 MB RAM]
> >> >> >>
> >> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  56064
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE =  56704
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 84720
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 94720
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   384
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   960
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =64
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 92160
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   256
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   768
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =64
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   256
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   320
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE = 0
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 92032
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 91968
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   192
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =  1024
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =64
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   192
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   576
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE = 0
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 84352
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   192
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE =   512
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:16 : BDI_RECLAIMABLE = 0
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 92608
> >> >> >> KB
> >> >> >> [over_bground_thresh]: wakeup flush-8:0 : BDI_RECLAIMABLE = 92544
> >> >> >> KB
> >> >> >>
> >> >> >> As mentioned in above log, when global dirty memory > global
> >> >> >> background_thresh
> >> >> >> small cached data is also forced to flush by flush-8:16.
> >> >> >>
> >> >> >> If removing global background_thresh checking code, we can reduce
> >> >> >> cache
> >> >> >> thrashing of frequently used small data.
> >> >> >  It's not completely clear to me:
> >> >> >  Why is this a problem? Wearing of the flash? Power consumption? I'd
> >> >> > like
> >> >> >to understand this before changing the code...
> >> Hi Jan.
> >> Yes, it can reduce wearing and fragmentation of flash. And also from
> >> one scenario - we
> >> think it might reduce power consumption also.
> >>
> >> >> >
> >> >> >> And It will be great if we can reserve a portion of writeback cache
> >> >> >> using
> >> >> >> min_ratio.
> >> >> >>
> >> >> >> After applying patch:

Re: [RFC PATCH 3/5] x86,smp: auto tune spinlock backoff delay factor

2013-01-04 Thread Rik van Riel


On 01/03/2013 12:17 PM, Rik van Riel wrote:


+   if (!(head % 7) && delay < MAX_SPINLOCK_DELAY)
+   delay++;
+
+   loops = delay * waiters_ahead;


I don't like the head % 7 thing. I think using fixed point arithmetic
would be nicer:

if (delay < MAX_SPINLOCK_DELAY)
   delay += 256/7; /* Or whatever constant we choose */

loops = (delay * waiter_ahead) >> 8;


I'll do that. That could get completely rid of any artifacts
caused by incrementing sometimes, and not other times.


Also, we should probably skip the delay increment on the first loop
iteration - after all, we haven't waited yet, so we can't say that the
delay was too short.


Good point. I will do that.



I will build a kernel with the things you pointed out fixed,
and will give it a spin this afternoon.

Expect new patches soonish :)


After implementing all the ideas you came up with, which made
perfect sense to me, the code performs significantly worse
than before.

*sigh*

New patches will be coming ... later.

--
All rights reversed
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2] mm: bootmem: fix free_all_bootmem_core with odd bitmap alignment

2013-01-04 Thread Max Filippov

Currently free_all_bootmem_core ignores that node_min_pfn may not be a
multiple of BITS_PER_LONG. E.g. commit 6dccdcbe "mm: bootmem: fix
checking the bitmap when finally freeing bootmem" shifts vec by lower
bits of start instead of lower bits of idx. Also

  if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL)

assumes that vec bit 0 corresponds to start pfn, which is only true when
node_min_pfn is a multiple of BITS_PER_LONG. Also loop in the else
clause can double-free pages (e.g. with node_min_pfn == start == 1,
map[0] == ~0 on 32-bit machine page 32 will be double-freed).

This bug causes the following message during xtensa kernel boot:

[0.00] bootmem::free_all_bootmem_core nid=0 start=1 end=8000
[0.00] BUG: Bad page state in process swapper  pfn:1
[0.00] page:d04bd020 count:0 mapcount:-127 mapping:  (null) index:0x2
[0.00] page flags: 0x0()
[0.00]
[0.00] Stack:  0002 0004  d0193e44 ff81 
 0002
[0.00]90038c66 d0193e90 d04bd020 01a8   
 0020
[0.00]90039a4c d0193eb0 d04bd020 0001 d04b7b20 8ad0 
 
[0.00] Call Trace:
[0.00]  [] bad_page+0x8c/0x9c
[0.00]  [] free_pages_prepare+0x5e/0x88
[0.00]  [] free_hot_cold_page+0xc/0xa0
[0.00]  [] __free_pages+0x24/0x38
[0.00]  [] __free_pages_bootmem+0x54/0x56
[0.00]  [] free_all_bootmem_core$part$11+0xeb/0x138
[0.00]  [] free_all_bootmem+0x46/0x58
[0.00]  [] mem_init+0x25/0xa4
[0.00]  [] start_kernel+0x11e/0x25c
[0.00]  [] should_never_return+0x0/0x3be7

The fix is the following:
- always align vec so that its bit 0 corresponds to start
- provide BITS_PER_LONG bits in vec, if those bits are available in the map
- don't free pages past next start position in the else clause.

Signed-off-by: Max Filippov 
---
Sent wrong version for v1, 'while' should have been 'for'.

 mm/bootmem.c |   23 +--
 1 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/mm/bootmem.c b/mm/bootmem.c
index 1324cd7..1157be7 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -185,10 +185,23 @@ static unsigned long __init 
free_all_bootmem_core(bootmem_data_t *bdata)
 
while (start < end) {
unsigned long *map, idx, vec;
+   unsigned shift;
 
map = bdata->node_bootmem_map;
idx = start - bdata->node_min_pfn;
+   shift = idx & (BITS_PER_LONG - 1);
+   /*
+* vec holds at most BITS_PER_LONG map bits,
+* bit 0 corresponds to start.
+*/
vec = ~map[idx / BITS_PER_LONG];
+
+   if (shift) {
+   vec >>= shift;
+   if (end - start >= BITS_PER_LONG)
+   vec |= ~map[idx / BITS_PER_LONG + 1] <<
+   (BITS_PER_LONG - shift);
+   }
/*
 * If we have a properly aligned and fully unreserved
 * BITS_PER_LONG block of pages in front of us, free
@@ -201,19 +214,17 @@ static unsigned long __init 
free_all_bootmem_core(bootmem_data_t *bdata)
count += BITS_PER_LONG;
start += BITS_PER_LONG;
} else {
-   unsigned long off = 0;
+   unsigned long cur;
 
-   vec >>= start & (BITS_PER_LONG - 1);
-   while (vec) {
+   start = ALIGN(start + 1, BITS_PER_LONG);
+   for (cur = start; vec && cur != start; ++cur) {
if (vec & 1) {
-   page = pfn_to_page(start + off);
+   page = pfn_to_page(cur);
__free_pages_bootmem(page, 0);
count++;
}
vec >>= 1;
-   off++;
}
-   start = ALIGN(start + 1, BITS_PER_LONG);
}
}
 
-- 
1.7.7.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

mmotm 2013-01-04-15-43 uploaded

2013-01-04 Thread akpm

The mm-of-the-moment snapshot 2013-01-04-15-43 has been uploaded to

   http://www.ozlabs.org/~akpm/mmotm/

mmotm-readme.txt says

README for mm-of-the-moment:

http://www.ozlabs.org/~akpm/mmotm/

This is a snapshot of my -mm patch queue.  Uploaded at random hopefully
more than once a week.

You will need quilt to apply these patches to the latest Linus release (3.x
or 3.x-rcY).  The series file is in broken-out.tar.gz and is duplicated in
http://ozlabs.org/~akpm/mmotm/series

The file broken-out.tar.gz contains two datestamp files: .DATE and
.DATE-yyyy-mm-dd-hh-mm-ss.  Both contain the string yyyy-mm-dd-hh-mm-ss,
followed by the base kernel version against which this patch series is to
be applied.

This tree is partially included in linux-next.  To see which patches are
included in linux-next, consult the `series' file.  Only the patches
within the #NEXT_PATCHES_START/#NEXT_PATCHES_END markers are included in
linux-next.

A git tree which contains the memory management portion of this tree is
maintained at git://git.kernel.org/pub/scm/linux/kernel/git/mhocko/mm.git
by Michal Hocko.  It contains the patches which are between the
"#NEXT_PATCHES_START mm" and "#NEXT_PATCHES_END" markers, from the series
file, http://www.ozlabs.org/~akpm/mmotm/series.


A full copy of the full kernel tree with the linux-next and mmotm patches
already applied is available through git within an hour of the mmotm
release.  Individual mmotm releases are tagged.  The master branch always
points to the latest release, so it's constantly rebasing.

http://git.cmpxchg.org/?p=linux-mmotm.git;a=summary

To develop on top of mmotm git:

  $ git remote add mmotm 
git://git.kernel.org/pub/scm/linux/kernel/git/mhocko/mm.git
  $ git remote update mmotm
  $ git checkout -b topic mmotm/master
  <make changes, commit>
  $ git send-email mmotm/master.. [...]

To rebase a branch with older patches to a new mmotm release:

  $ git remote update mmotm
  $ git rebase --onto mmotm/master <topic base> topic




The directory http://www.ozlabs.org/~akpm/mmots/ (mm-of-the-second)
contains daily snapshots of the -mm tree.  It is updated more frequently
than mmotm, and is untested.

A git copy of this tree is available at

http://git.cmpxchg.org/?p=linux-mmots.git;a=summary

and use of this tree is similar to
http://git.cmpxchg.org/?p=linux-mmotm.git, described above.


This mmotm tree contains the following patches against 3.8-rc2:
(patches marked "*" will be included in linux-next)

  origin.patch
* drivers-rtc-rtc-tegrac-convert-to-dt-driver.patch
* ipc-remove-forced-assignment-of-selected-message.patch
* ipc-add-sysctl-to-specify-desired-next-object-id.patch
* ipc-message-queue-receive-cleanup.patch
* ipc-message-queue-copy-feature-introduced.patch
* selftests-ipc-message-queue-copy-feature-test.patch
* ipc-simplify-free_copy-call.patch
* ipc-convert-prepare_copy-from-macro-to-function.patch
* ipc-simplify-message-copying.patch
* ipc-add-more-comments-to-message-copying-related-code.patch
* documentation-sysctl-kerneltxt-document-proc-sys-shmall.patch
* mm-fix-zone_watermark_ok_safe-accounting-of-isolated-pages.patch
* mm-limit-mmu_gather-batching-to-fix-soft-lockups-on-config_preempt.patch
* maintainers-remove-drivers-platform-msm.patch
* maintainers-remove-arch-arm-common-time-acornc.patch
* maintainers-remove-arch-arm-plat-s5p.patch
* maintainers-fix-drivers-rtc-rtc-vt8500c.patch
* maintainers-fix-arch-arm-mach-at91-include-mach-at_hdmach.patch
* maintainers-fix-drivers-media-platform-atmel-isic.patch
* maintainers-adjust-for-uapi.patch
* maintainers-fix-drivers-media-usb-dvb-usb-cxusb.patch
* maintainers-remove-drivers-video-epson1355fbc.patch
* maintainers-fix-plat-mxc-include-mach-imxfbh.patch
* maintainers-fix-drivers-ieee802154.patch
* maintainers-remove-firmware-isci.patch
* maintainers-remove-arch-x86-platform-mrst-pmu.patch
* maintainers-fix-documentation-mei.patch
* maintainers-remove-drivers-mmc-host-imxmmc.patch
* maintainers-remove-arch-lib-perf_eventc.patch
* maintainers-remove-include-linux-of_pwmh.patch
* maintainers-fix-drivers-staging-sm7xx.patch
* rtc-add-rtc-driver-for-tps6586x.patch
* drivers-rtc-rtc-vt8500c-correct-handling-of-cr_24h-bitfield.patch
* drivers-rtc-rtc-vt8500c-fix-handling-of-data-passed-in-struct-rtc_time.patch
* printk-fix-incorrect-length-from-print_time-when-seconds-9.patch
  linux-next.patch
  linux-next-git-rejects.patch
  make-my-i386-build-work.patch
  arch-alpha-kernel-systblss-remove-debug-check.patch
* compiler-gcc4h-reorder-macros-based-upon-gcc-ver.patch
* compiler-gcch-add-gcc-recommended-gcc_version-macro.patch
* compiler-gcc34h-use-gcc_version-macro.patch
* compiler-gcc4h-bugh-remove-duplicate-macros.patch
* bugh-fix-build_bug_on-macro-in-__checker__.patch
* bugh-prevent-double-evaulation-of-in-build_bug_on.patch
* bugh-prevent-double-evaulation-of-in-build_bug_on-fix.patch
* bugh-make-build_bug_on-generate-compile-time-error.patch
* compilerh-bugh-prevent-double-error-messages-with-build_bug_on.patch
*

Re: [Alternative 2][PATCH] ACPI / PCI: Set root bridge ACPI handle in advance

2013-01-04 Thread Bjorn Helgaas

On Fri, Jan 4, 2013 at 5:19 PM, Yinghai Lu  wrote:
> On Fri, Jan 4, 2013 at 4:14 PM, Rafael J. Wysocki  wrote:
>> On Friday, January 04, 2013 04:03:01 PM Yinghai Lu wrote:
>>> On Fri, Jan 4, 2013 at 3:38 AM, Rafael J. Wysocki  wrote:
>>> >> --- a/arch/x86/include/asm/pci.h
>>> >> +++ b/arch/x86/include/asm/pci.h
>>> >> @@ -14,6 +14,7 @@
>>> >>  struct pci_sysdata {
>>> >>   int domain; /* PCI domain */
>>> >>   int node;   /* NUMA node */
>>> >> + void*acpi_handle;
>>> >>  #ifdef CONFIG_X86_64
>>> >>   void*iommu; /* IOMMU private data */
>>> >>  #endif
>>> >>
>>>
>>> acpi_handle is not good name and it is confusing.
>>
>> Well, what would be a better name in your opinion?
>>
>> I was going to put that into a #ifdef CONFIG_ACPI / #endif, so what about
>> calling it acpi_data?
>
> yes, with #ifdef, you can use acpi_handle type directly.
>
> it is acpi handle for pci_root.
>
> so would call int pci_root_acpi_handle ?

I just copied the name from the corresponding ia64 code.  I don't care
if you want to change it, but I think there is *some* value in keeping
the x86 and ia64 code as similar as possible because it would be nice
to converge it some day.

Bjorn
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 2/2] mm: mmap: annotate vm_lock_anon_vma locking properly for lockdep

2013-01-04 Thread Jiri Kosina

Commit 5a505085f04 ("mm/rmap: Convert the struct anon_vma::mutex to an 
rwsem") turned anon_vma mutex to rwsem.

However, the properly annotated nested locking in mm_take_all_locks() has 
been converted from

mutex_lock_nest_lock(&anon_vma->root->mutex, &mm->mmap_sem);

to

down_write(&anon_vma->root->rwsem);

which is incomplete, and causes the false positive report from lockdep below.

Annotate the fact that mmap_sem is used as an outer lock to serialize taking
of all the anon_vma rwsems at once no matter the order, using the
down_write_nest_lock() primitive.

This patch fixes this lockdep report:

 =
 [ INFO: possible recursive locking detected ]
 3.8.0-rc2-00036-g5f73896 #171 Not tainted
 -
 qemu-kvm/2315 is trying to acquire lock:
  (&anon_vma->rwsem){+.+...}, at: [] 
mm_take_all_locks+0x149/0x1b0

 but task is already holding lock:
  (&anon_vma->rwsem){+.+...}, at: [] 
mm_take_all_locks+0x149/0x1b0

 other info that might help us debug this:
  Possible unsafe locking scenario:

CPU0

   lock(&anon_vma->rwsem);
   lock(&anon_vma->rwsem);

  *** DEADLOCK ***

  May be due to missing lock nesting notation

 4 locks held by qemu-kvm/2315:
  #0:  (&mm->mmap_sem){++}, at: [] 
do_mmu_notifier_register+0xfc/0x170
  #1:  (mm_all_locks_mutex){+.+...}, at: [] 
mm_take_all_locks+0x36/0x1b0
  #2:  (&mapping->i_mmap_mutex){+.+...}, at: [] 
mm_take_all_locks+0xc9/0x1b0
  #3:  (&anon_vma->rwsem){+.+...}, at: [] 
mm_take_all_locks+0x149/0x1b0

 stack backtrace:
 Pid: 2315, comm: qemu-kvm Not tainted 3.8.0-rc2-00036-g5f73896 #171
 Call Trace:
  [] print_deadlock_bug+0xf2/0x100
  [] validate_chain+0x4f6/0x720
  [] __lock_acquire+0x359/0x580
  [] ? trace_hardirqs_on_caller+0x12d/0x1b0
  [] lock_acquire+0x121/0x190
  [] ? mm_take_all_locks+0x149/0x1b0
  [] down_write+0x3f/0x70
  [] ? mm_take_all_locks+0x149/0x1b0
  [] mm_take_all_locks+0x149/0x1b0
  [] do_mmu_notifier_register+0x68/0x170
  [] mmu_notifier_register+0xe/0x10
  [] kvm_create_vm+0x22b/0x330 [kvm]
  [] kvm_dev_ioctl+0xf8/0x1a0 [kvm]
  [] do_vfs_ioctl+0x9d/0x350
  [] ? sysret_check+0x22/0x5d
  [] sys_ioctl+0x91/0xb0
  [] system_call_fastpath+0x16/0x1b

Signed-off-by: Jiri Kosina 
---
 mm/mmap.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index f54b235..35730ee 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2886,7 +2886,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct 
anon_vma *anon_vma)
 * The LSB of head.next can't change from under us
 * because we hold the mm_all_locks_mutex.
 */
-   down_write(&anon_vma->root->rwsem);
+   down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
/*
 * We can safely modify head.next after taking the
 * anon_vma->root->rwsem. If some other vma in this mm shares
-- 
Jiri Kosina
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/2] lockdep, rwsem: provide down_write_nest_lock()

2013-01-04 Thread Jiri Kosina

down_write_nest_lock() provides a means to annotate a locking scenario where
an outer lock is guaranteed to serialize the order in which nested locks are
being acquired.

This is an analogy to already existing mutex_lock_nest_lock() and
spin_lock_nest_lock().

Signed-off-by: Jiri Kosina 
---
 include/linux/lockdep.h |3 +++
 include/linux/rwsem.h   |9 +
 kernel/rwsem.c  |   10 ++
 3 files changed, 22 insertions(+), 0 deletions(-)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 00e4637..2bca44b 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -524,14 +524,17 @@ static inline void print_irqtrace_events(struct 
task_struct *curr)
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 # ifdef CONFIG_PROVE_LOCKING
 #  define rwsem_acquire(l, s, t, i)lock_acquire(l, s, t, 0, 2, 
NULL, i)
+#  define rwsem_acquire_nest(l, s, t, n, i)lock_acquire(l, s, t, 0, 2, n, 
i)
 #  define rwsem_acquire_read(l, s, t, i)   lock_acquire(l, s, t, 1, 2, 
NULL, i)
 # else
 #  define rwsem_acquire(l, s, t, i)lock_acquire(l, s, t, 0, 1, 
NULL, i)
+#  define rwsem_acquire_nest(l, s, t, n, i)lock_acquire(l, s, t, 0, 1, n, 
i)
 #  define rwsem_acquire_read(l, s, t, i)   lock_acquire(l, s, t, 1, 1, 
NULL, i)
 # endif
 # define rwsem_release(l, n, i)lock_release(l, n, i)
 #else
 # define rwsem_acquire(l, s, t, i) do { } while (0)
+# define rwsem_acquire_nest(l, s, t, n, i) do { } while (0)
 # define rwsem_acquire_read(l, s, t, i)do { } while (0)
 # define rwsem_release(l, n, i)do { } while (0)
 #endif
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 54bd7cd..413cc11 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -125,8 +125,17 @@ extern void downgrade_write(struct rw_semaphore *sem);
  */
 extern void down_read_nested(struct rw_semaphore *sem, int subclass);
 extern void down_write_nested(struct rw_semaphore *sem, int subclass);
+extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map 
*nest_lock);
+
+# define down_write_nest_lock(sem, nest_lock)  \
+do {   \
+   typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \
+   _down_write_nest_lock(sem, &(nest_lock)->dep_map);  \
+} while (0);
+
 #else
 # define down_read_nested(sem, subclass)   down_read(sem)
+# define down_write_nest_lock(sem, nest_lock)  down_read(sem)
 # define down_write_nested(sem, subclass)  down_write(sem)
 #endif
 
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 6850f53..b3c6c3f 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -116,6 +116,16 @@ void down_read_nested(struct rw_semaphore *sem, int 
subclass)
 
 EXPORT_SYMBOL(down_read_nested);
 
+void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
+{
+   might_sleep();
+   rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
+
+   LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+}
+
+EXPORT_SYMBOL(_down_write_nest_lock);
+
 void down_write_nested(struct rw_semaphore *sem, int subclass)
 {
might_sleep();
-- 
Jiri Kosina
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v7 1/2] KSM: numa awareness sysfs knob

2013-01-04 Thread Simon Jeons

On Fri, 2013-01-04 at 15:03 -0800, Hugh Dickins wrote:
> On Thu, 3 Jan 2013, Simon Jeons wrote:
> > On Wed, 2013-01-02 at 21:10 -0800, Hugh Dickins wrote:
> > > 
> > > As you can see, remove_rmap_item_from_tree uses it to decide whether
> > > or not it should rb_erase the rmap_item from the unstable_tree.
> > > 
> > > Every full scan of all the rmap_items, we increment ksm_scan.seqnr,
> > > forget the old unstable_tree (it would just be a waste of processing
> > > to remove every node one by one), and build up the unstable_tree afresh.
> > > 
> > 
> > When the rmap_items left over from the previous scan will be removed?
> 
> Removed from the unstable rbtree?  Not at all, it's simply restarted
> afresh, and the old rblinkages ignored.  Freed back to slab?  When the
> scan passes that mm+address and realizes that rmap_item is not wanted
> any more.  (Or when ksm is shut down with KSM_RUN_UNMERGE.)
> 

Make sense. Thanks Hugh. :)

> > 
> > > That works fine until we need to remove an rmap_item: then we have to be
> > > very sure to remove it from the unstable_tree if it's already been linked
> > > there during this scan, but ignore its rblinkage if that's just left over
> > > from the previous scan.
> > > 
> > > A single bit would be enough to decide this; but we got it troublesomely
> > > wrong in the early days of KSM (didn't always visit every rmap_item each
> > > scan), so it's convenient to use 8 bits (the low unsigned char, stored
> > 
> > When the scenario didn't always visit every rmap_item each scan can
> > occur? 
> 
> You're asking me about a stage of KSM development 3.5 years ago:
> I don't remember the details.
> 
> > 
> > > below the FLAGs and below the page-aligned address in the rmap_item -
> > > there's lots of them, best keep them as small as we can) and do a
> > > BUG_ON(age > 1) if we made a mistake.
> > > 
> > > We haven't hit that BUG_ON in over three years: if we need some more
> > > bits for something, we can cut the age down to one or two bits.
> > > 
> > > Hugh


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] mm: bootmem: fix free_all_bootmem_core with odd bitmap alignment

2013-01-04 Thread Max Filippov

Currently free_all_bootmem_core ignores that node_min_pfn may be not
multiple of BITS_PER_LONG. E.g. commit 6dccdcbe "mm: bootmem: fix
checking the bitmap when finally freeing bootmem" shifts vec by lower
bits of start instead of lower bits of idx. Also

  if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL)

assumes that vec bit 0 corresponds to start pfn, which is only true when
node_min_pfn is a multiple of BITS_PER_LONG. Also loop in the else
clause can double-free pages (e.g. with node_min_pfn == start == 1,
map[0] == ~0 on 32-bit machine page 32 will be double-freed).

This bug causes the following message during xtensa kernel boot:

[0.00] bootmem::free_all_bootmem_core nid=0 start=1 end=8000
[0.00] BUG: Bad page state in process swapper  pfn:1
[0.00] page:d04bd020 count:0 mapcount:-127 mapping:  (null) index:0x2
[0.00] page flags: 0x0()
[0.00]
[0.00] Stack:  0002 0004  d0193e44 ff81 
 0002
[0.00]90038c66 d0193e90 d04bd020 01a8   
 0020
[0.00]90039a4c d0193eb0 d04bd020 0001 d04b7b20 8ad0 
 
[0.00] Call Trace:
[0.00]  [] bad_page+0x8c/0x9c
[0.00]  [] free_pages_prepare+0x5e/0x88
[0.00]  [] free_hot_cold_page+0xc/0xa0
[0.00]  [] __free_pages+0x24/0x38
[0.00]  [] __free_pages_bootmem+0x54/0x56
[0.00]  [] free_all_bootmem_core$part$11+0xeb/0x138
[0.00]  [] free_all_bootmem+0x46/0x58
[0.00]  [] mem_init+0x25/0xa4
[0.00]  [] start_kernel+0x11e/0x25c
[0.00]  [] should_never_return+0x0/0x3be7

The fix is the following:
- always align vec so that its bit 0 corresponds to start
- provide BITS_PER_LONG bits in vec, if those bits are available in the map
- don't free pages past next start position in the else clause.

Signed-off-by: Max Filippov 
---
 mm/bootmem.c |   23 +--
 1 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/mm/bootmem.c b/mm/bootmem.c
index 1324cd7..ece83ca 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -185,10 +185,23 @@ static unsigned long __init 
free_all_bootmem_core(bootmem_data_t *bdata)
 
while (start < end) {
unsigned long *map, idx, vec;
+   unsigned shift;
 
map = bdata->node_bootmem_map;
idx = start - bdata->node_min_pfn;
+   shift = idx & (BITS_PER_LONG - 1);
+   /*
+* vec holds at most BITS_PER_LONG map bits,
+* bit 0 corresponds to start.
+*/
vec = ~map[idx / BITS_PER_LONG];
+
+   if (shift) {
+   vec >>= shift;
+   if (end - start >= BITS_PER_LONG)
+   vec |= ~map[idx / BITS_PER_LONG + 1] <<
+   (BITS_PER_LONG - shift);
+   }
/*
 * If we have a properly aligned and fully unreserved
 * BITS_PER_LONG block of pages in front of us, free
@@ -201,19 +214,17 @@ static unsigned long __init 
free_all_bootmem_core(bootmem_data_t *bdata)
count += BITS_PER_LONG;
start += BITS_PER_LONG;
} else {
-   unsigned long off = 0;
+   unsigned long cur;
 
-   vec >>= start & (BITS_PER_LONG - 1);
-   while (vec) {
+   start = ALIGN(start + 1, BITS_PER_LONG);
+   while (cur = start; vec && cur != start; ++cur) {
if (vec & 1) {
-   page = pfn_to_page(start + off);
+   page = pfn_to_page(cur);
__free_pages_bootmem(page, 0);
count++;
}
vec >>= 1;
-   off++;
}
-   start = ALIGN(start + 1, BITS_PER_LONG);
}
}
 
-- 
1.7.7.6

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: PEBS (in perf) stopped working from 3.6 -> 3.7

2013-01-04 Thread Steinar H. Gunderson

On Fri, Jan 04, 2013 at 05:16:27PM -0700, David Ahern wrote:
> Known problem. Pick one of: update perf to 3.7, add H to the command
> (-e cycles:ppH) or apply this patch:
> https://lkml.org/lkml/2012/12/28/384

Oh, thinking of it, I've actually read about this flamew^Wdiscussion :-)

Thanks!

/* Steinar */
-- 
Homepage: http://www.sesse.net/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [Alternative 2][PATCH] ACPI / PCI: Set root bridge ACPI handle in advance

2013-01-04 Thread Yinghai Lu

On Fri, Jan 4, 2013 at 4:14 PM, Rafael J. Wysocki  wrote:
> On Friday, January 04, 2013 04:03:01 PM Yinghai Lu wrote:
>> On Fri, Jan 4, 2013 at 3:38 AM, Rafael J. Wysocki  wrote:
>> >> --- a/arch/x86/include/asm/pci.h
>> >> +++ b/arch/x86/include/asm/pci.h
>> >> @@ -14,6 +14,7 @@
>> >>  struct pci_sysdata {
>> >>   int domain; /* PCI domain */
>> >>   int node;   /* NUMA node */
>> >> + void*acpi_handle;
>> >>  #ifdef CONFIG_X86_64
>> >>   void*iommu; /* IOMMU private data */
>> >>  #endif
>> >>
>>
>> acpi_handle is not good name and it is confusing.
>
> Well, what would be a better name in your opinion?
>
> I was going to put that into a #ifdef CONFIG_ACPI / #endif, so what about
> calling it acpi_data?

yes, with #ifdef, you can use acpi_handle type directly.

it is acpi handle for pci_root.

so would call it pci_root_acpi_handle ?

Thanks

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: PEBS (in perf) stopped working from 3.6 -> 3.7

2013-01-04 Thread David Ahern


On 1/4/13 4:47 PM, Steinar H. Gunderson wrote:

[Please Cc me on any replies; I'm not subscribed to lkml]

Hi,

I recently upgraded from 3.6.5 to 3.7.1 to get around some MM issues that
have been bothering me. However, it appears it broke PEBS:

   pannekake:/usr/src/linux-3.7.1# perf record -a -e cycles:pp

   Error: sys_perf_event_open() syscall returned with 95 (Operation not 
supported).  /bin/dmesg may provide additional information.

   Fatal: No hardware sampling interrupt available. No APIC? If so then you can boot the 
kernel with the "lapic" boot parameter to force-enable it.


Known problem. Pick one of: update perf to 3.7, add H to the command (-e 
cycles:ppH) or apply this patch: https://lkml.org/lkml/2012/12/28/384


David
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

PEBS (in perf) stopped working from 3.6 -> 3.7

2013-01-04 Thread Steinar H. Gunderson

[Please Cc me on any replies; I'm not subscribed to lkml]

Hi,

I recently upgraded from 3.6.5 to 3.7.1 to get around some MM issues that
have been bothering me. However, it appears it broke PEBS:

  pannekake:/usr/src/linux-3.7.1# perf record -a -e cycles:pp

  Error: sys_perf_event_open() syscall returned with 95 (Operation not 
supported).  /bin/dmesg may provide additional information.

  Fatal: No hardware sampling interrupt available. No APIC? If so then you can 
boot the kernel with the "lapic" boot parameter to force-enable it.

Non-precise tracing works fine. This used to work in 3.6.5, and I used make
oldconfig (not that I can find any .config settings that would seem relevant
either). I certainly have APIC. There's nothing in dmesg about the error.
I'm using perf built from the same kernel tree (ie., 3.7.1).

This is on a dual Xeon E5520 (ie. Westmere), on a Supermicro board.

/* Steinar */
-- 
Chief, Tech:Server
http://www.sesse.net/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [Alternative 2][PATCH] ACPI / PCI: Set root bridge ACPI handle in advance

2013-01-04 Thread Rafael J. Wysocki

On Friday, January 04, 2013 04:03:01 PM Yinghai Lu wrote:
> On Fri, Jan 4, 2013 at 3:38 AM, Rafael J. Wysocki  wrote:
> >> --- a/arch/x86/include/asm/pci.h
> >> +++ b/arch/x86/include/asm/pci.h
> >> @@ -14,6 +14,7 @@
> >>  struct pci_sysdata {
> >>   int domain; /* PCI domain */
> >>   int node;   /* NUMA node */
> >> + void*acpi_handle;
> >>  #ifdef CONFIG_X86_64
> >>   void*iommu; /* IOMMU private data */
> >>  #endif
> >>
> 
> acpi_handle is not good name and it is confusing.

Well, what would be a better name in your opinion?

I was going to put that into a #ifdef CONFIG_ACPI / #endif, so what about
calling it acpi_data?

Rafael


-- 
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [Alternative 2][PATCH] ACPI / PCI: Set root bridge ACPI handle in advance

2013-01-04 Thread Yinghai Lu

On Fri, Jan 4, 2013 at 3:38 AM, Rafael J. Wysocki  wrote:
>> --- a/arch/x86/include/asm/pci.h
>> +++ b/arch/x86/include/asm/pci.h
>> @@ -14,6 +14,7 @@
>>  struct pci_sysdata {
>>   int domain; /* PCI domain */
>>   int node;   /* NUMA node */
>> + void*acpi_handle;
>>  #ifdef CONFIG_X86_64
>>   void*iommu; /* IOMMU private data */
>>  #endif
>>

acpi_handle is not good name and it is confusing.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

RE: [PATCH v3 0/2] pstore,efi_pstore: Avoid deadlock in non-blocking paths

2013-01-04 Thread Seiji Aguchi

Tony,

Can you review this patchset?

Seiji

> -Original Message-
> From: Anton Vorontsov [mailto:cbouatmai...@gmail.com]
> Sent: Friday, December 21, 2012 6:37 PM
> To: Seiji Aguchi
> Cc: Luck, Tony (tony.l...@intel.com); linux-kernel@vger.kernel.org; 
> ccr...@android.com; keesc...@chromium.org; Satoru Moriya;
> dle-deve...@lists.sourceforge.net; dzic...@redhat.com
> Subject: Re: [PATCH v3 0/2] pstore,efi_pstore: Avoid deadlock in non-blocking 
> paths
> 
> On Fri, Dec 21, 2012 at 11:27:00PM +, Seiji Aguchi wrote:
> > Tony,
> >
> > Could you please apply this patchset to your tree?
> 
> Actually, I'd prefer to take both patches via pstore tree. The EFI part is 
> isolated and small, so even if it conflicts, it would be easy to
> resolve, unlike to pstore core part.
> 
> So, would be great if Tony could give an Ack for EFI part and we'd merge this 
> via pstore tree.
> 
> Thanks,
> Anton

Re: [PATCH] cpuidle - fix lock contention in the idle path

2013-01-04 Thread Rafael J. Wysocki

On Friday, January 04, 2013 07:27:24 AM Daniel Lezcano wrote:
> On 01/02/2013 10:13 PM, Russ Anderson wrote:
> > On Wed, Dec 26, 2012 at 11:01:48AM +0100, Daniel Lezcano wrote:
> >> The commit bf4d1b5ddb78f86078ac6ae0415802d5f0c68f92 introduces
> >> a lock in the cpuidle_get_cpu_driver function. This function
> >> is used in the idle_call function.
> >>
> >> The problem is the contention with a large number of cpus because
> >> they try to access the idle routine at the same time.
> >>
> >> The lock could be safely removed because of how is used the
> >> cpuidle api. The cpuidle_register_driver is called first but
> >> until the cpuidle_register_device is not called we don't
> >> enter in the cpuidle idle call function because the device
> >> is not enabled.
> >>
> >> The cpuidle_unregister_driver function, leading to a NULL driver,
> >> is not called before the cpuidle_unregister_device.
> >>
> >> This is how is used the cpuidle api from the different drivers.
> >>
> >> However, a cleanup around the lock and a proper refcounting
> >> mechanism should be used to ensure the consistency in the api,
> >> like cpuidle_unregister_driver should failed if its refcounting
> >> is not 0.
> >>
> >> These modifications will need some code reorganization and rewrite
> >> which does not fit with a fix.
> > 
> > I agree.
> > 
> >> The following patch is a hot fix by returning to the initial behavior
> >> by removing the lock when getting the driver.
> > 
> > The patch fixes the problem.  Verified on a system with 1024 cpus.
> > Thanks.
> > 
> >> Signed-off-by: Daniel Lezcano 
> > 
> > Reported-by: Russ Anderson 
> > Acked-by: Russ Anderson 
> 
> Hi Rafael,
> 
> could you consider this patch for merging ?

Yes, I've taken it already.

I'll include it into the next pull request.

Thanks,
Rafael


-- 
I speak only for myself.
Rafael J. Wysocki, Intel Open Source Technology Center.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v7u1 26/31] x86: Don't enable swiotlb if there is not enough ram for it

2013-01-04 Thread Yinghai Lu

On Fri, Jan 4, 2013 at 3:21 PM, Shuah Khan  wrote:

> Please see attached dmesg for full log. I can do some testing on this
> system with your patch if you would like.

That would be great.

Please try
git://git.kernel.org/pub/scm/linux/kernel/git/yinghai/linux-yinghai.git
for-x86-boot
or just this patch.

Too bad, I can not access AMD systems with IOMMU support.

Thanks a lot.

Yinghai
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: fdtdump: properly handle multi-string properties

2013-01-04 Thread David Gibson

On Fri, Jan 04, 2013 at 09:12:46PM +0200, Pantelis Antoniou wrote:
> Device tree can store multiple strings in a single property.
> We didn't handle that case properly.
> 
> Signed-off-by: Pantelis Antoniou 

Acked-by: David Gibson 

-- 
David Gibson| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
| _way_ _around_!
http://www.ozlabs.org/~dgibson


signature.asc
Description: Digital signature

Re: [ANNOUNCE] 3.7-nohz1

2013-01-04 Thread Frederic Weisbecker

2012/12/30 Paul E. McKenney :
> On Mon, Dec 24, 2012 at 12:43:25AM +0100, Frederic Weisbecker wrote:
>> 2012/12/21 Steven Rostedt :
>> > On Thu, 2012-12-20 at 19:32 +0100, Frederic Weisbecker wrote:
>> >> Let's imagine you have 4 CPUs. We keep the CPU 0 to offline RCU callbacks 
>> >> there and to
>> >> handle the timekeeping. We set the rest as full dynticks. So you need the 
>> >> following kernel
>> >> parameters:
>> >>
>> >>   rcu_nocbs=1-3 full_nohz=1-3
>> >>
>> >> (Note rcu_nocbs value must always be the same as full_nohz).
>> >
>> > Why? You can't have: rcu_nocbs=1-4 full_nohz=1-3
>>
>> That should be allowed.
>>
>> >   or: rcu_nocbs=1-3 full_nohz=1-4 ?
>>
>> But that not.
>>
>> You need to have: rcu_nocbs & full_nohz == full_nohz. This is because
>> the tick is not there to maintain the local RCU callbacks anymore. So
>> this must be offloaded to the rcu_nocb threads.
>>
>> I just have a doubt with rcu_nocb. Do we still need the tick to
>> complete the grace period for local rcu callbacks? I need to discuss
>> that with Paul.
>
> The tick is only needed if rcu_needs_cpu() returns false.  Of course,
> this means that if you don't invoke rcu_needs_cpu() before returning to
> adaptive-idle usermode execution, you are correct that a full_nohz CPU
> would also have to be a rcu_nocbs CPU.
>
> That said, I am getting close to having an rcu_needs_cpu() that only
> returns false if there are callbacks immediately ready to invoke, at
> least if RCU_FAST_NO_HZ=y.

Ok. Also when a CPU enqueues a callback and starts a grace period, the
tick polls on the grace period completion. How is it handled with
rcu_nocbs CPUs? Does rcu_needs_cpu() return false until the grace
period is completed? If so I still need to restart the local tick
whenever a new callback is enqueued.

Thanks.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v4 5/5] KVM: x86: improve reexecute_instruction

2013-01-04 Thread Marcelo Tosatti

On Fri, Jan 04, 2013 at 09:56:59PM +0800, Xiao Guangrong wrote:
> The current reexecute_instruction can not well detect the failed instruction
> emulation. It allows guest to retry all the instructions except it accesses
> on error pfn
> 
> For example, some cases are nested-write-protect - if the page we want to
> write is used as PDE but it chains to itself. Under this case, we should
> stop the emulation and report the case to userspace
> 
> Signed-off-by: Xiao Guangrong 
> ---
>  arch/x86/include/asm/kvm_host.h |7 +
>  arch/x86/kvm/paging_tmpl.h  |   24 +-
>  arch/x86/kvm/x86.c  |   50 --
>  3 files changed, 55 insertions(+), 26 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index c431b33..de229e6 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -502,6 +502,13 @@ struct kvm_vcpu_arch {
>   u64 msr_val;
>   struct gfn_to_hva_cache data;
>   } pv_eoi;
> +
> + /*
> +  * Indicate whether the gfn is used as page table in guest which
> +  * is set when fix page fault and used to detect unhandeable
> +  * instruction.
> +  */
> + bool target_gfn_is_pt;
>  };
> 
>  struct kvm_lpage_info {
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 0453fa0..ca1be75 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -507,20 +507,27 @@ out_gpte_changed:
>   */
>  static bool
>  FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
> -   struct guest_walker *walker, int user_fault)
> +   struct guest_walker *walker, int user_fault,
> +   bool *target_gfn_is_pt)
>  {
>   int level;
>   gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
> + bool self_changed = false;
> +
> + *target_gfn_is_pt = false;
> 
>   if (!(walker->pte_access & ACC_WRITE_MASK ||
> (!is_write_protection(vcpu) && !user_fault)))
>   return false;
> 
> - for (level = walker->level; level <= walker->max_level; level++)
> - if (!((walker->gfn ^ walker->table_gfn[level - 1]) & mask))
> - return true;
> + for (level = walker->level; level <= walker->max_level; level++) {
> + gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
> +
> + self_changed |= !(gfn & mask);
> + *target_gfn_is_pt |= !gfn;
> + }
> 
> - return false;
> + return self_changed;
>  }
> 
>  /*
> @@ -548,7 +555,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t 
> addr, u32 error_code,
>   int level = PT_PAGE_TABLE_LEVEL;
>   int force_pt_level;
>   unsigned long mmu_seq;
> - bool map_writable;
> + bool map_writable, is_self_change_mapping;
> 
>   pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
> 
> @@ -576,9 +583,12 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, 
> gva_t addr, u32 error_code,
>   return 0;
>   }
> 
> + is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
> +   &walker, user_fault, &vcpu->arch.target_gfn_is_pt);
> +
>   if (walker.level >= PT_DIRECTORY_LEVEL)
>   force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
> -|| FNAME(is_self_change_mapping)(vcpu, &walker, user_fault);
> +|| is_self_change_mapping;
>   else
>   force_pt_level = 1;
>   if (!force_pt_level) {
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index b0a3678..44c6992 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -4756,15 +4756,8 @@ static int handle_emulation_failure(struct kvm_vcpu 
> *vcpu)
>  static bool reexecute_instruction(struct kvm_vcpu *vcpu, unsigned long cr2)
>  {
>   gpa_t gpa = cr2;
> + gfn_t gfn;
>   pfn_t pfn;
> - unsigned int indirect_shadow_pages;
> -
> - spin_lock(&vcpu->kvm->mmu_lock);
> - indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
> - spin_unlock(&vcpu->kvm->mmu_lock);
> -
> - if (!indirect_shadow_pages)
> - return false;

This renders the previous patch obsolete, pretty much (please fold).

>   if (!vcpu->arch.mmu.direct_map) {
>   /*
> @@ -4781,13 +4774,7 @@ static bool reexecute_instruction(struct kvm_vcpu 
> *vcpu, unsigned long cr2)
>   return true;
>   }
> 
> - /*
> -  * if emulation was due to access to shadowed page table
> -  * and it failed try to unshadow page and re-enter the
> -  * guest to let CPU execute the instruction.
> -  */
> - if (kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)))
> - return true;
> + gfn = gpa_to_gfn(gpa);
> 
>   /*
>* Do not retry the unhandleable instruction if it faults on the
> @@ -4795,13 +4782,38 @@ static bool

Re: [PATCH] mv643xx_eth: Fix a possible deadlock upon ifdown

2013-01-04 Thread David Miller

From: Lubomir Rintel 
Date: Fri,  4 Jan 2013 15:17:43 +0100

> @@ -943,7 +943,7 @@ static int txq_reclaim(struct tx_queue *txq, int budget, 
> int force)
>   struct netdev_queue *nq = netdev_get_tx_queue(mp->dev, txq->index);
>   int reclaimed;
>  
> - __netif_tx_lock(nq, smp_processor_id());
> + __netif_tx_lock_bh(nq);

I still don't understand why this change is necessary.

The TX reclaim function is invoked in software interrupt context in
all of the places where this lockdep warning might matter.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH net-next/master] ethernet/broadcom/tg3: Fix sparse warning: constant 0x7fffffffffffffff is so big it is long long

2013-01-04 Thread David Miller

From: Peter Huewe 
Date: Thu,  3 Jan 2013 15:23:50 +0100

> Sparse complains that:
> drivers/net/ethernet/broadcom/tg3.c:5670:55: sparse: constant
> 0x7fffffffffffffff is so big it is long long (on x86/32 bit)
> 
> so we suffix the constant with LL in the header file.
> 
> Reported-by: Fengguang Wu 
> Signed-off-by: Peter Huewe 

Applied.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v7 1/2] KSM: numa awareness sysfs knob

2013-01-04 Thread Hugh Dickins

On Thu, 3 Jan 2013, Simon Jeons wrote:
> On Wed, 2013-01-02 at 21:10 -0800, Hugh Dickins wrote:
> > 
> > As you can see, remove_rmap_item_from_tree uses it to decide whether
> > or not it should rb_erase the rmap_item from the unstable_tree.
> > 
> > Every full scan of all the rmap_items, we increment ksm_scan.seqnr,
> > forget the old unstable_tree (it would just be a waste of processing
> > to remove every node one by one), and build up the unstable_tree afresh.
> > 
> 
> When the rmap_items left over from the previous scan will be removed?

Removed from the unstable rbtree?  Not at all, it's simply restarted
afresh, and the old rblinkages ignored.  Freed back to slab?  When the
scan passes that mm+address and realizes that rmap_item is not wanted
any more.  (Or when ksm is shut down with KSM_RUN_UNMERGE.)

> 
> > That works fine until we need to remove an rmap_item: then we have to be
> > very sure to remove it from the unstable_tree if it's already been linked
> > there during this scan, but ignore its rblinkage if that's just left over
> > from the previous scan.
> > 
> > A single bit would be enough to decide this; but we got it troublesomely
> > wrong in the early days of KSM (didn't always visit every rmap_item each
> > scan), so it's convenient to use 8 bits (the low unsigned char, stored
> 
> When the scenario didn't always visit every rmap_item each scan can
> occur? 

You're asking me about a stage of KSM development 3.5 years ago:
I don't remember the details.

> 
> > below the FLAGs and below the page-aligned address in the rmap_item -
> > there's lots of them, best keep them as small as we can) and do a
> > BUG_ON(age > 1) if we made a mistake.
> > 
> > We haven't hit that BUG_ON in over three years: if we need some more
> > bits for something, we can cut the age down to one or two bits.
> > 
> > Hugh
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v7u1 26/31] x86: Don't enable swiotlb if there is not enough ram for it

2013-01-04 Thread Yinghai Lu

On Fri, Jan 4, 2013 at 2:56 PM, Shuah Khan  wrote:
>
> AMD IOMMU driver is using this lever to leave swiotlb enabled when it
> detects devices that can't be supported by iommu. My concern is that
> this change for kdump removes that handshake ability between iommu and
swiotlb.

No, it does not remove that ability.

I'd like to see the boot log on system that could be affected by this patch.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v7u1 26/31] x86: Don't enable swiotlb if there is not enough ram for it

2013-01-04 Thread Yinghai Lu

On Fri, Jan 4, 2013 at 2:47 PM, Eric W. Biederman  wrote:
> Yinghai Lu it looks like your autodetection of the problem case in this
> patch is problematic and needs a rethink.  My quick skim says you are
> trying to detect failure too early in the code.  Furthermore having
> kexec on panic sized magic comments without explanation is wrong.

current amd iommu implementation have this sequence:
1. alloc buffer for swiotlb.
2. detect and initialize intel iommu or amd iommu
3. release swiotlb if swiotlb == 0 , set by ops_init.

so we need to detect that before allocating buffer for swiotlb.

>
> Shuah Khan this is motivated by kdump.  However a correct implementation
> should be about dealing with the case when there is simply not enough
> memory available below 4G for bounce buffers.
>
> If a device needs an iommu, and swiotlb is the only iommu option, and
> there is not enough memory below 4G panic'ing is entirely reasonable.
>
> Do I read this discussion right that we are wasting 64M on systems
> that have the swiotlb code but don't use the swiotlb?

No wasting.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 0/9] Avoid populating unbounded num of ptes with mmap_sem held

2013-01-04 Thread Michel Lespinasse

On Fri, Jan 4, 2013 at 10:16 AM, Andy Lutomirski  wrote:
> I still have quite a few instances of 2-6 ms of latency due to
> "call_rwsem_down_read_failed __do_page_fault do_page_fault
> page_fault".  Any idea why?  I don't know any great way to figure out
> who is holding mmap_sem at the time.  Given what my code is doing, I
> suspect the contention is due to mmap or munmap on a file.  MCL_FUTURE
> is set, and MAP_POPULATE is not set.
>
> It could be the other thread calling mmap and getting preempted (or
> otherwise calling schedule()).  Grr.

The simplest way to find out who's holding the lock too long might be
to enable CONFIG_LOCK_STATS. This will slow things down a little, but
give you lots of useful information including which threads hold
mmap_sem the longest and the call stack for where they grab it from.
See Documentation/lockstat.txt

I think munmap is a likely culprit, as it still happens with mmap_sem
held for write (I do plan to go work on this next). But it's hard to
be sure without lockstats :)

-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v7u1 26/31] x86: Don't enable swiotlb if there is not enough ram for it

2013-01-04 Thread Shuah Khan

On Fri, Jan 4, 2013 at 3:47 PM, Eric W. Biederman  wrote:
> Shuah Khan  writes:
>
>> On Fri, Jan 4, 2013 at 3:10 PM, Yinghai Lu  wrote:
>>> On Fri, Jan 4, 2013 at 1:02 PM, Shuah Khan  wrote:
>>>> Panicking the system doesn't sound like a good option to me in this
>>>> case. This change to disable swiotlb is made for kdump. However, with
>>>> this change several system fail to boot, unless crashkernel_low=72M is
>>>> specified.
>>>
>>> this patchset is new feature to put second kdump kernel above 4G.
>>>
>> I understand this is just one of the patches to implement the new
>> kdump feature. However, I think regression on existing behavior with a
>> panic is a bit of a big hammer. This change causes panic on systems
>> even when kdump is not enabled, if I understand it correctly.
>>
>> Granted kdump gets enabled by several distros, but it is not a
>> required feature. However, expecting system to boot with devices that
>> require swiotlb fully functioning is a basic feature. So I would argue
>> that not breaking the basic functionality is a higher priority over
>> enabling kdump in this case.
>
> Yinghai Lu it looks like your autodetection of the problem case in this
> patch is problematic and needs a rethink.  My quick skim says you are
> trying to detect failure too early in the code.  Furthermore having
> kexec on panic sized magic comments without explanation is wrong.
>
> Shuah Khan this is motivated by kdump.  However a correct implementation
> should be about dealing with the case when there is simply not enough
> memory available below 4G for bounce buffers.
>
> If a device needs an iommu, and swiotlb is the only iommu option, and
> there is not enough memory below 4G panic'ing is entirely reasonable.
>
> Do I read this discussion right that we are wasting 64M on systems
> that have the swiotlb code but don't use the swiotlb?
>

No. pci_swiotlb_late_init() does free reserved swiotlb buffers on
systems that don't need swiotlb. IOMMU drivers turn off swiotlb after
iommu is initialized correctly. It is possible on some systems when
BIOS is incorrect, iommu initialization could fail and swiotlb is left
enabled.

AMD IOMMU driver is using this lever to leave swiotlb enabled when it
detects devices that can't be supported by iommu. My concern is that
this change for kdump removes that handshake ability between iommu and
swiotlb.

-- Shuah
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

RE: [PATCH 7/8] zswap: add to mm/

2013-01-04 Thread Dan Magenheimer

> From: Seth Jennings [mailto:sjenn...@linux.vnet.ibm.com]
> Subject: Re: [PATCH 7/8] zswap: add to mm/
> 
> On 01/03/2013 04:33 PM, Dan Magenheimer wrote:
> >> From: Seth Jennings [mailto:sjenn...@linux.vnet.ibm.com]
> >>
> >> However, once the flushing code was introduced and could free an entry
> >> from the zswap_fs_store() path, it became necessary to add a per-entry
> >> refcount to make sure that the entry isn't freed while another code
> >> path was operating on it.
> >
> > Hmmm... doesn't the refcount at least need to be an atomic_t?
> 
> An entry's refcount is only ever changed under the tree lock, so
> making them atomic_t would be redundantly atomic.

Maybe I'm missing something still but then I think you also
need to evaluate and act on the refcount (not just read it) while
your treelock is held.  I.e., in:

> + /* page is already in the swap cache, ignore for now */
> + spin_lock(&tree->lock);
> + refcount = zswap_entry_put(entry);
> + spin_unlock(&tree->lock);
> +
> + if (likely(refcount))
> + return 0;
> +
> + /* if the refcount is zero, invalidate must have come in */
> + /* free */
> + zs_free(tree->pool, entry->handle);
> + zswap_entry_cache_free(entry);
> + atomic_dec(&zswap_stored_pages);

the entry's refcount may be changed by another processor
immediately after the unlock, and then the "if (refcount)"
is testing a stale value and you will get (I think) a memory leak.

There is similar racy code in zswap_fs_invalidate_page which
I think could lead to a double free.  There's another
I think in zswap_fs_load...  And the refcount is dec'd
in one path inside of zswap_fs_store as well which may
race with the above.

When flushing multiple zpages to free a pageframe, you may
need to test refcounts for all the entries while within the lock.
If so, this is one place where the high-density storage will make
things messy, especially if page boundaries are crossed.

A nit: Even I, steeped in tmem terminology, was confused by
your use of "fs"... to nearly all readers it will
be translated as "filesystem" which is mystifying.
Just spell it out "frontswap", even if it causes a few
lines to be wrapped.

Have a good weekend!
Dan
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] x86: Don't print extended CMOS year when reading RTC

2013-01-04 Thread Bjorn Helgaas

We shouldn't print the current century every time we read the RTC.

Signed-off-by: Bjorn Helgaas 
---
 arch/x86/kernel/rtc.c |1 -
 1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 801602b..2e8f3d3 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -149,7 +149,6 @@ unsigned long mach_get_cmos_time(void)
if (century) {
century = bcd2bin(century);
year += century * 100;
-   printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
} else
year += CMOS_YEARS_OFFS;
 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v7u1 26/31] x86: Don't enable swiotlb if there is not enough ram for it

2013-01-04 Thread Eric W. Biederman

Shuah Khan  writes:

> On Fri, Jan 4, 2013 at 3:10 PM, Yinghai Lu  wrote:
>> On Fri, Jan 4, 2013 at 1:02 PM, Shuah Khan  wrote:
>>> Panicking the system doesn't sound like a good option to me in this
>>> case. This change to disable swiotlb is made for kdump. However, with
>>> this change several system fail to boot, unless crashkernel_low=72M is
>>> specified.
>>
>> this patchset is new feature to put second kdump kernel above 4G.
>>
> I understand this is just one of the patches to implement the new
> kdump feature. However, I think regression on existing behavior with a
> panic is a bit of a big hammer. This change causes panic on systems
> even when kdump is not enabled, if I understand it correctly.
>
> Granted kdump gets enabled by several distros, but it is not a
> required feature. However, expecting system to boot with devices that
> require swiotlb fully functioning is a basic feature. So I would argue
> that not breaking the basic functionality is a higher priority over
> enabling kdump in this case.

Yinghai Lu it looks like your autodetection of the problem case in this
patch is problematic and needs a rethink.  My quick skim says you are
trying to detect failure too early in the code.  Furthermore having
kexec on panic sized magic comments without explanation is wrong.

Shuah Khan this is motivated by kdump.  However a correct implementation
should be about dealing with the case when there is simply not enough
memory available below 4G for bounce buffers.

If a device needs an iommu, and swiotlb is the only iommu option, and 
there is not enough memory below 4G panic'ing is entirely reasonable.

Do I read this discussion right that we are wasting 64M on systems
that have the swiotlb code but don't use the swiotlb? 

Eric

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH v2 v3.7.0] Add AHCI support for Enmotus Bobcat device.

2013-01-04 Thread Hugh Daschbach

Silicon does not support standard AHCI BAR assignment.  Add
vendor/device exception to force BAR 2.

Signed-off-by: Hugh Daschbach 
---
v2: Corrected comment in response to Sergei's review.  Thanks, Sergei.

 drivers/ata/ahci.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 7862d17..4979127 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -53,6 +53,7 @@
 
 enum {
AHCI_PCI_BAR_STA2X11= 0,
+   AHCI_PCI_BAR_ENMOTUS= 2,
AHCI_PCI_BAR_STANDARD   = 5,
 };
 
@@ -410,6 +411,9 @@ static const struct pci_device_id ahci_pci_tbl[] = {
{ PCI_VDEVICE(ASMEDIA, 0x0611), board_ahci },   /* ASM1061 */
{ PCI_VDEVICE(ASMEDIA, 0x0612), board_ahci },   /* ASM1062 */
 
+   /* Enmotus */
+   { PCI_DEVICE(0x1c44, 0x8000), board_ahci },
+
/* Generic, PCI class code for AHCI */
{ PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,
  PCI_CLASS_STORAGE_SATA_AHCI, 0xff, board_ahci },
@@ -1098,9 +1102,11 @@ static int ahci_init_one(struct pci_dev *pdev, const 
struct pci_device_id *ent)
dev_info(&pdev->dev,
 "PDC42819 can only drive SATA devices with this 
driver\n");
 
-   /* The Connext uses non-standard BAR */
+   /* Both Connext and Enmotus devices use non-standard BARs */
if (pdev->vendor == PCI_VENDOR_ID_STMICRO && pdev->device == 0xCC06)
ahci_pci_bar = AHCI_PCI_BAR_STA2X11;
+   else if (pdev->vendor == 0x1c44 && pdev->device == 0x8000)
+   ahci_pci_bar = AHCI_PCI_BAR_ENMOTUS;
 
/* acquire resources */
rc = pcim_enable_device(pdev);
-- 
1.7.11.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v7u1 26/31] x86: Don't enable swiotlb if there is not enough ram for it

2013-01-04 Thread Yinghai Lu

On Fri, Jan 4, 2013 at 2:26 PM, Shuah Khan  wrote:

> However, I think regression on existing behavior with a
> panic is a bit of a big hammer. This change causes panic on systems
> even when kdump is not enabled, if I understand it correctly.

I don't think so.

+static bool __init enough_mem_for_swiotlb(void)
+{
+   /* do we have less than 1M RAM under 4G ? */
+   return memblock_mem_size(1ULL<<(32-PAGE_SHIFT)) > (1ULL<<20);
+}

enough_mem_for_swiotlb could return false for them?

and

 int __init pci_swiotlb_detect_override(void)
 {
-   int use_swiotlb = swiotlb | swiotlb_force;
-
if (swiotlb_force)
swiotlb = 1;
+   else if (!enough_mem_for_swiotlb())
+   swiotlb = 0;

-   return use_swiotlb;
+   return swiotlb;
 }

it only disables swiotlb when there is less than 1M of memory under 4G.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH v4 3/5] KVM: x86: clean up reexecute_instruction

2013-01-04 Thread Marcelo Tosatti

On Fri, Jan 04, 2013 at 09:55:40PM +0800, Xiao Guangrong wrote:
> Little cleanup for reexecute_instruction, also use gpa_to_gfn in
> retry_instruction
> 
> Signed-off-by: Xiao Guangrong 
> ---
>  arch/x86/kvm/x86.c |   13 ++---
>  1 files changed, 6 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 1c9c834..ad39018 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -4761,19 +4761,18 @@ static bool reexecute_instruction(struct kvm_vcpu 
> *vcpu, gva_t gva)
>   if (tdp_enabled)
>   return false;
> 
> + gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
> + if (gpa == UNMAPPED_GVA)
> + return true; /* let cpu generate fault */
> +

Why change from _system to _read here? Purely cleanup patch should
have no logical changes.

BTW, there is not much logic in using reexecute_instruction()
for x86_decode_insn (checks in reexecute_instruction() assume
write to the cr2, for instance).
Fault propagation for x86_decode_insn seems completely broken
(which is perhaps why reexecute_instruction() there survived).

>   /*
>* if emulation was due to access to shadowed page table
>* and it failed try to unshadow page and re-enter the
>* guest to let CPU execute the instruction.
>*/
> - if (kvm_mmu_unprotect_page_virt(vcpu, gva))
> + if (kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa)))
>   return true;
> 
> - gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
> -
> - if (gpa == UNMAPPED_GVA)
> - return true; /* let cpu generate fault */
> -
>   /*
>* Do not retry the unhandleable instruction if it faults on the
>* readonly host memory, otherwise it will goto a infinite loop:
> @@ -4828,7 +4827,7 @@ static bool retry_instruction(struct x86_emulate_ctxt 
> *ctxt,
>   if (!vcpu->arch.mmu.direct_map)
>   gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
> 
> - kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
> + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
> 
>   return true;
>  }
> -- 
> 1.7.7.6
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

1 2 3 4 5 6 7 8 >

1 - 100 of 798 matches

Mail list logo