[PATCH linux dev-4.16 v2] i2c: muxes: pca9641: new driver

2018-03-20 Thread Ken Chen
Signed-off-by: Ken Chen <chen.ke...@inventec.com>

---
v1->v2
- Merged PCA9641 code into i2c-mux-pca9541.c
- Modified title
- Added PCA9641 detect function
---
 drivers/i2c/muxes/i2c-mux-pca9541.c | 184 ++--
 1 file changed, 174 insertions(+), 10 deletions(-)

diff --git a/drivers/i2c/muxes/i2c-mux-pca9541.c b/drivers/i2c/muxes/i2c-mux-pca9541.c
index 6a39ada..493f947 100644
--- a/drivers/i2c/muxes/i2c-mux-pca9541.c
+++ b/drivers/i2c/muxes/i2c-mux-pca9541.c
@@ -1,5 +1,5 @@
 /*
- * I2C multiplexer driver for PCA9541 bus master selector
+ * I2C multiplexer driver for PCA9541/PCA9641 bus master selector
  *
  * Copyright (c) 2010 Ericsson AB.
  *
@@ -26,8 +26,8 @@
 #include 
 
 /*
- * The PCA9541 is a bus master selector. It supports two I2C masters connected
- * to a single slave bus.
+ * The PCA9541/PCA9641 is a bus master selector. It supports two I2C masters 
+ * connected to a single slave bus.
  *
  * Before each bus transaction, a master has to acquire bus ownership. After the
  * transaction is complete, bus ownership has to be released. This fits well
@@ -58,11 +58,43 @@
 #define PCA9541_ISTAT_MYTEST   (1 << 6)
 #define PCA9541_ISTAT_NMYTEST  (1 << 7)
 
+#define PCA9641_ID 0x00
+#define PCA9641_ID_MAGIC   0x38
+
+#define PCA9641_CONTROL0x01
+#define PCA9641_STATUS 0x02
+#define PCA9641_TIME   0x03
+
+#define PCA9641_CTL_LOCK_REQ   BIT(0)
+#define PCA9641_CTL_LOCK_GRANT BIT(1)
+#define PCA9641_CTL_BUS_CONNECTBIT(2)
+#define PCA9641_CTL_BUS_INIT   BIT(3)
+#define PCA9641_CTL_SMBUS_SWRSTBIT(4)
+#define PCA9641_CTL_IDLE_TIMER_DIS BIT(5)
+#define PCA9641_CTL_SMBUS_DIS  BIT(6)
+#define PCA9641_CTL_PRIORITY   BIT(7)
+
+#define PCA9641_STS_OTHER_LOCK BIT(0)
+#define PCA9641_STS_BUS_INIT_FAIL  BIT(1)
+#define PCA9641_STS_BUS_HUNG   BIT(2)
+#define PCA9641_STS_MBOX_EMPTY BIT(3)
+#define PCA9641_STS_MBOX_FULL  BIT(4)
+#define PCA9641_STS_TEST_INT   BIT(5)
+#define PCA9641_STS_SCL_IO BIT(6)
+#define PCA9641_STS_SDA_IO BIT(7)
+
+#define PCA9641_RES_TIME   0x03
+
 #define BUSON  (PCA9541_CTL_BUSON | PCA9541_CTL_NBUSON)
 #define MYBUS  (PCA9541_CTL_MYBUS | PCA9541_CTL_NMYBUS)
 #define mybus(x)   (!((x) & MYBUS) || ((x) & MYBUS) == MYBUS)
 #define busoff(x)  (!((x) & BUSON) || ((x) & BUSON) == BUSON)
 
+#define BUSOFF(x, y)   (!((x) & PCA9641_CTL_LOCK_GRANT) && \
+   !((y) & PCA9641_STS_OTHER_LOCK))
+#define other_lock(x)  ((x) & PCA9641_STS_OTHER_LOCK)
+#define lock_grant(x)  ((x) & PCA9641_CTL_LOCK_GRANT)
+
 /* arbitration timeouts, in jiffies */
 #define ARB_TIMEOUT(HZ / 8)/* 125 ms until forcing bus ownership */
 #define ARB2_TIMEOUT   (HZ / 4)/* 250 ms until acquisition failure */
@@ -79,6 +111,7 @@ struct pca9541 {
 
 static const struct i2c_device_id pca9541_id[] = {
{"pca9541", 0},
+   {"pca9641", 1},
{}
 };
 
@@ -87,6 +120,7 @@ MODULE_DEVICE_TABLE(i2c, pca9541_id);
 #ifdef CONFIG_OF
 static const struct of_device_id pca9541_of_match[] = {
{ .compatible = "nxp,pca9541" },
+   { .compatible = "nxp,pca9641" },
{}
 };
 MODULE_DEVICE_TABLE(of, pca9541_of_match);
@@ -328,6 +362,125 @@ static int pca9541_release_chan(struct i2c_mux_core *muxc, u32 chan)
 }
 
 /*
+ * Arbitration management functions
+ */
+static void pca9641_release_bus(struct i2c_client *client)
+{
+   pca9541_reg_write(client, PCA9641_CONTROL, 0);
+}
+
+/*
+ * Channel arbitration
+ *
+ * Return values:
+ *  <0: error
+ *  0 : bus not acquired
+ *  1 : bus acquired
+ */
+static int pca9641_arbitrate(struct i2c_client *client)
+{
+   struct i2c_mux_core *muxc = i2c_get_clientdata(client);
+   struct pca9541 *data = i2c_mux_priv(muxc);
+   int reg_ctl, reg_sts;
+
+   reg_ctl = pca9541_reg_read(client, PCA9641_CONTROL);
+   if (reg_ctl < 0)
+   return reg_ctl;
+   reg_sts = pca9541_reg_read(client, PCA9641_STATUS);
+
+   if (BUSOFF(reg_ctl, reg_sts)) {
+   /*
+* Bus is off. Request ownership or turn it on unless
+* other master requested ownership.
+*/
+   reg_ctl |= PCA9641_CTL_LOCK_REQ;
+   pca9541_reg_write(client, PCA9641_CONTROL, reg_ctl);
+   reg_ctl = pca9541_reg_read(client, PCA9641_CONTROL);
+
+   if (lock_grant(reg_ctl)) {
+   /*
+* Other master did not request ownership,
+* or arbitration timeout expired. Take the bus.
+*/
+   reg_ctl |= PCA9641_CTL_BUS_CONNECT
+   | PCA9641_CTL_LOCK_REQ;
+   pca9541_reg_write
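
The changelog above mentions a PCA9641 detect function that the
truncated diff does not reach. As a rough sketch only (assuming the
existing pca9541_reg_read() helper and the PCA9641_ID/PCA9641_ID_MAGIC
constants defined in the patch), such a check could look like:

    /* Sketch: identify a PCA9641 by the magic value in its ID register. */
    static int pca9641_detect_id(struct i2c_client *client)
    {
            int reg;

            reg = pca9541_reg_read(client, PCA9641_ID);
            if (reg == PCA9641_ID_MAGIC)
                    return 1;       /* PCA9641 present */

            return 0;               /* treat as PCA9541 */
    }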

[PATCH linux dev-4.16] drivers: i2c: master arbiter: create driver

2018-03-19 Thread Ken Chen
Initial PCA9641 2-channel I2C bus master arbiter support, as a
separate implementation. The original PCA9641 patch had a probe
issue and differing device behavior between the PCA9541 and the
PCA9641.

Signed-off-by: Ken Chen <chen.ke...@inventec.com>
---
 drivers/i2c/muxes/Kconfig   |   9 +
 drivers/i2c/muxes/Makefile  |   1 +
 drivers/i2c/muxes/i2c-mux-pca9641.c | 360 
 3 files changed, 370 insertions(+)
 create mode 100644 drivers/i2c/muxes/i2c-mux-pca9641.c

diff --git a/drivers/i2c/muxes/Kconfig b/drivers/i2c/muxes/Kconfig
index 52a4a92..f9c51b8 100644
--- a/drivers/i2c/muxes/Kconfig
+++ b/drivers/i2c/muxes/Kconfig
@@ -73,6 +73,15 @@ config I2C_MUX_PCA954x
  This driver can also be built as a module.  If so, the module
  will be called i2c-mux-pca954x.
 
+config I2C_MUX_PCA9641
+   tristate "NXP PCA9641 I2C Master demultiplexer"
+   help
+ If you say yes here you get support for the NXP PCA9641
+ I2C Master demultiplexer with an arbiter function.
+
+ This driver can also be built as a module.  If so, the module
+ will be called i2c-mux-pca9641.
+
 config I2C_MUX_PINCTRL
tristate "pinctrl-based I2C multiplexer"
depends on PINCTRL
diff --git a/drivers/i2c/muxes/Makefile b/drivers/i2c/muxes/Makefile
index 6d9d865..a229a1a 100644
--- a/drivers/i2c/muxes/Makefile
+++ b/drivers/i2c/muxes/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_I2C_MUX_LTC4306) += i2c-mux-ltc4306.o
 obj-$(CONFIG_I2C_MUX_MLXCPLD)  += i2c-mux-mlxcpld.o
 obj-$(CONFIG_I2C_MUX_PCA9541)  += i2c-mux-pca9541.o
 obj-$(CONFIG_I2C_MUX_PCA954x)  += i2c-mux-pca954x.o
+obj-$(CONFIG_I2C_MUX_PCA9641)  += i2c-mux-pca9641.o
 obj-$(CONFIG_I2C_MUX_PINCTRL)  += i2c-mux-pinctrl.o
 obj-$(CONFIG_I2C_MUX_REG)  += i2c-mux-reg.o
 
diff --git a/drivers/i2c/muxes/i2c-mux-pca9641.c b/drivers/i2c/muxes/i2c-mux-pca9641.c
new file mode 100644
index 000..bcf45c8
--- /dev/null
+++ b/drivers/i2c/muxes/i2c-mux-pca9641.c
@@ -0,0 +1,360 @@
+/*
+ * I2C demultiplexer driver for PCA9641 bus master demultiplexer
+ *
+ * Copyright (c) 2010 Ericsson AB.
+ *
+ * Author: Ken Chen <chen.ke...@inventec.com>
+ *
+ * Derived from:
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+/*
+ * The PCA9641 is a 2-channel I2C bus master demultiplexer. It supports two
+ * I2C masters connected to a single slave bus.
+ *
+ * It is designed for high reliability dual master I2C bus applications where
+ * correct system operation is required, even when two I2C masters issue their
+ * commands at the same time. The arbiter will select a winner and let it work
+ * uninterrupted, and the losing master will take control of the I2C bus after
+ * the winner has finished. The arbiter also allows for queued requests where
+ * a master requests the downstream bus while the other master has control.
+ *
+ */
+
+#define PCA9641_ID 0x01
+#define PCA9641_ID_MAGIC   0x38
+
+#define PCA9641_CONTROL0x01
+#define PCA9641_STATUS 0x02
+#define PCA9641_TIME   0x03
+
+#define PCA9641_CTL_LOCK_REQ   BIT(0)
+#define PCA9641_CTL_LOCK_GRANT BIT(1)
+#define PCA9641_CTL_BUS_CONNECTBIT(2)
+#define PCA9641_CTL_BUS_INIT   BIT(3)
+#define PCA9641_CTL_SMBUS_SWRSTBIT(4)
+#define PCA9641_CTL_IDLE_TIMER_DIS BIT(5)
+#define PCA9641_CTL_SMBUS_DIS  BIT(6)
+#define PCA9641_CTL_PRIORITY   BIT(7)
+
+#define PCA9641_STS_OTHER_LOCK BIT(0)
+#define PCA9641_STS_BUS_INIT_FAIL  BIT(1)
+#define PCA9641_STS_BUS_HUNG   BIT(2)
+#define PCA9641_STS_MBOX_EMPTY BIT(3)
+#define PCA9641_STS_MBOX_FULL  BIT(4)
+#define PCA9641_STS_TEST_INT   BIT(5)
+#define PCA9641_STS_SCL_IO BIT(6)
+#define PCA9641_STS_SDA_IO BIT(7)
+
+#define PCA9641_RES_TIME   0x03
+
+#define busoff(x, y)   (!((x) & PCA9641_CTL_LOCK_GRANT) && \
+   !((y) & PCA9641_STS_OTHER_LOCK))
+#define other_lock(x)  ((x) & PCA9641_STS_OTHER_LOCK)
+#define lock_grant(x)  ((x) & PCA9641_CTL_LOCK_GRANT)
+
+/* arbitration timeouts, in jiffies */
+#define ARB_TIMEOUT(HZ / 8)/* 125 ms until forcing bus ownership */
+#define ARB2_TIMEOUT   (HZ / 4)/* 250 ms until acquisition failure */
+
+/* arbitration retry delays, in us */
+#define SELECT_DELAY_SHORT 50
+#define SELECT_DELAY_LONG  1000
+
+struct pca9641 {
+   struct i2c_client *client;
+   unsigned long select_timeout;
+   unsigned long arb_timeout;
+};
+
+static const struct i2c_device_id pca9641_id[] = {
+   {"pca9641", 0},
+   {}
+};
+
+MODULE_DEVICE_TABLE(i2c, pca9641_id);
+
+#ifdef CONFIG_OF
+static const struct of_devic
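
The posting is truncated before the mux callbacks, but the
acquire/release flow described above would plug into the i2c-mux core
the same way as in the existing i2c-mux-pca9541.c driver. A sketch of
the select side, modeled on that driver (not code from this patch):

    /* Poll pca9641_arbitrate() until the bus is won or ARB2_TIMEOUT hits. */
    static int pca9641_select_chan(struct i2c_mux_core *muxc, u32 chan)
    {
            struct pca9641 *data = i2c_mux_priv(muxc);
            struct i2c_client *client = data->client;
            unsigned long timeout = jiffies + ARB2_TIMEOUT;
            int ret;

            /* force bus ownership after this time */
            data->arb_timeout = jiffies + ARB_TIMEOUT;

            do {
                    ret = pca9641_arbitrate(client);
                    if (ret)
                            return ret < 0 ? ret : 0;

                    if (data->select_timeout == SELECT_DELAY_SHORT)
                            udelay(data->select_timeout);
                    else
                            msleep(data->select_timeout / 1000);
            } while (time_is_after_eq_jiffies(timeout));

            return -ETIMEDOUT;
    }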

Re: [PATCH] hugetlbfs: document that sticky mounts are allowed

2014-09-04 Thread Ken Chen
On Thu, Sep 4, 2014 at 6:20 AM, Kirill Smelkov  wrote:
> Commit 75897d60 (hugetlb: allow sticky directory mount option) added
> support for mounting hugetlbfs with sticky option set, like /tmp is
> usually mounted, but forgot to document that.
>
> Cc: Ken Chen 
> Signed-off-by: Kirill Smelkov 
> ---
>  Documentation/vm/hugetlbpage.txt | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt
> index bdd4bb9..b64e0af 100644
> --- a/Documentation/vm/hugetlbpage.txt
> +++ b/Documentation/vm/hugetlbpage.txt
> @@ -274,7 +274,7 @@ This command mounts a (pseudo) filesystem of type hugetlbfs on the directory
>  /mnt/huge.  Any files created on /mnt/huge uses huge pages.  The uid and gid
>  options sets the owner and group of the root of the file system.  By default
>  the uid and gid of the current process are taken.  The mode option sets the
> -mode of root of file system to value & 0777.  This value is given in octal.
> +mode of root of file system to value & 01777.  This value is given in octal.
>  By default the value 0755 is picked. The size option sets the maximum value of
>  memory (huge pages) allowed for that filesystem (/mnt/huge). The size is
>  rounded down to HPAGE_SIZE.  The option nr_inodes sets the maximum number of

Acked-by: Ken Chen 
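
For reference, the behavior being documented here can be exercised like
this (illustrative only):

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
            /* A sticky, world-writable hugetlbfs root, like /tmp; possible
             * because the mode option is masked with 01777, not 0777. */
            if (mount("none", "/mnt/huge", "hugetlbfs", 0, "mode=01777"))
                    perror("mount hugetlbfs");
            return 0;
    }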

Re: [PATCH] hugetlb: follow_hugetlb_page for write access

2007-11-08 Thread Ken Chen
On Nov 7, 2007 11:51 AM, Adam Litke <[EMAIL PROTECTED]> wrote:
> When calling get_user_pages(), a write flag is passed in by the caller to
> indicate if write access is required on the faulted-in pages.  Currently,
> follow_hugetlb_page() ignores this flag and always faults pages for
> read-only access.  This can cause data corruption because a device driver
> that calls get_user_pages() with write set will not expect COW faults to
> occur on the returned pages.
>
> This patch passes the write flag down to follow_hugetlb_page() and makes
> sure hugetlb_fault() is called with the right write_access parameter.
>
> Signed-off-by: Adam Litke <[EMAIL PROTECTED]>

Adam, this looks good.

Reviewed-by: Ken Chen <[EMAIL PROTECTED]>
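
For context, the caller-side pattern this fix matters for looks roughly
like the following (2.6.23-era API; uaddr, npages and pages are
placeholders, not code from the patch):

    /* A driver pinning user pages it will write to (e.g. for device DMA)
     * must pass write = 1; before this fix, follow_hugetlb_page()
     * dropped the flag and never COW-faulted hugetlb pages. */
    down_read(&current->mm->mmap_sem);
    ret = get_user_pages(current, current->mm, uaddr, npages,
                         1 /* write */, 0 /* force */, pages, NULL);
    up_read(&current->mm->mmap_sem);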

Re: [patch] sched: schedstat needs a diet

2007-10-18 Thread Ken Chen
On 10/18/07, Mathieu Desnoyers <[EMAIL PROTECTED]> wrote:
> Good question indeed. How large is this memory footprint exactly ? If it
> is as small as you say, I suspect that the real issue could be that
> these variable are accessed by the scheduler critical paths and
> therefore trash the caches.

Maybe my wording was ambiguous; I meant to reduce cache line pollution
when accessing these schedstat fields.

With unsigned long, on x86_64, schedstat consumes 288 bytes for each
sched_domain and 128 bytes in struct rq.  On an extremely small system
that has a couple of CPU sockets with one level of numa node, there
will be 704 bytes per CPU for schedstat.  Given the sparseness of
them, we are probably talking about 11-12 cache line evictions in
several heavily used scheduler functions.  Reducing cache line
pollution is the primary goal; actual memory consumption isn't really
a concern.
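
To sanity-check those numbers against the patch below (one plausible
breakdown, assuming 8-byte unsigned long and CPU_MAX_IDLE_TYPES == 3):

    sched_domain: 8 arrays * 3 entries * 8 bytes = 192 bytes
                  + 12 scalar counters * 8 bytes =  96 bytes -> 288 bytes
    struct rq:    10 scalar counters * 8 bytes   =  80 bytes
                  + sched_info fields, 6 * 8     =  48 bytes -> 128 bytes
    per CPU:      2 domain levels * 288 + 128    = 704 bytes

Halving the counters to unsigned int recovers roughly half of that.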

- Ken

Re: [patch] sched: fix improper load balance across sched domain

2007-10-17 Thread Ken Chen
On 10/16/07, Siddha, Suresh B <[EMAIL PROTECTED]> wrote:
> On Tue, Oct 16, 2007 at 12:07:06PM -0700, Ken Chen wrote:
> > We recently discovered a nasty performance bug in the kernel CPU load
> > balancer where we were hit by 50% performance regression.
> >
> > When tasks are assigned to a subset of CPUs that span across
> > sched_domains (either ccNUMA node or the new multi-core domain) via
> > cpu affinity, kernel fails to perform proper load balance at
> > these domains, due to several logic in find_busiest_group() miss
> > identified busiest sched group within a given domain. This leads to
> > inadequate load balance and causes 50% performance hit.
> >
> > To give you a concrete example, on a dual-core, 2 socket numa system,
> > there are 4 logical cpu, organized as:
>
> oops, this issue can easily happen when cores are not sharing caches. I
> think this is what happening on your setup, right?

yes, we observed the bad behavior on quad-core systems with separate L2
caches as well.

- Ken

[patch] sched: schedstat needs a diet

2007-10-16 Thread Ken Chen
schedstat is useful in investigating CPU scheduler behavior.  Ideally,
I think it is beneficial to have it on all the time.  However, the
cost of turning it on in production systems is quite high, largely due
to the number of events it collects and also due to its large memory
footprint.

Most of the fields probably don't need to be a full 64 bits on 64-bit
arches.  Rolling over 4 billion events will most likely take a long
time, and user-space tools can be made to accommodate that.  I'm
proposing that the kernel cut back the width of most of these
variables on 64-bit systems.  (Note: the following patch doesn't
affect 32-bit systems.)


Signed-off-by: Ken Chen <[EMAIL PROTECTED]>


diff --git a/include/linux/sched.h b/include/linux/sched.h
index 592e3a5..311a8bd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -562,7 +562,7 @@ struct sched_info {
   last_queued; /* when we were last queued to run */
 #ifdef CONFIG_SCHEDSTATS
/* BKL stats */
-   unsigned long bkl_count;
+   unsigned int bkl_count;
 #endif
 };
 #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
@@ -698,34 +698,34 @@ struct sched_domain {

 #ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */
-   unsigned long lb_count[CPU_MAX_IDLE_TYPES];
-   unsigned long lb_failed[CPU_MAX_IDLE_TYPES];
-   unsigned long lb_balanced[CPU_MAX_IDLE_TYPES];
-   unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES];
-   unsigned long lb_gained[CPU_MAX_IDLE_TYPES];
-   unsigned long lb_hot_gained[CPU_MAX_IDLE_TYPES];
-   unsigned long lb_nobusyg[CPU_MAX_IDLE_TYPES];
-   unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES];
+   unsigned int lb_count[CPU_MAX_IDLE_TYPES];
+   unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
+   unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
+   unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
+   unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
+   unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
+   unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
+   unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];

/* Active load balancing */
-   unsigned long alb_count;
-   unsigned long alb_failed;
-   unsigned long alb_pushed;
+   unsigned int alb_count;
+   unsigned int alb_failed;
+   unsigned int alb_pushed;

/* SD_BALANCE_EXEC stats */
-   unsigned long sbe_count;
-   unsigned long sbe_balanced;
-   unsigned long sbe_pushed;
+   unsigned int sbe_count;
+   unsigned int sbe_balanced;
+   unsigned int sbe_pushed;

/* SD_BALANCE_FORK stats */
-   unsigned long sbf_count;
-   unsigned long sbf_balanced;
-   unsigned long sbf_pushed;
+   unsigned int sbf_count;
+   unsigned int sbf_balanced;
+   unsigned int sbf_pushed;

/* try_to_wake_up() stats */
-   unsigned long ttwu_wake_remote;
-   unsigned long ttwu_move_affine;
-   unsigned long ttwu_move_balance;
+   unsigned int ttwu_wake_remote;
+   unsigned int ttwu_move_affine;
+   unsigned int ttwu_move_balance;
 #endif
 };

diff --git a/kernel/sched.c b/kernel/sched.c
index 0da2b26..5e7fce9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -328,22 +328,22 @@ struct rq {
struct sched_info rq_sched_info;

/* sys_sched_yield() stats */
-   unsigned long yld_exp_empty;
-   unsigned long yld_act_empty;
-   unsigned long yld_both_empty;
-   unsigned long yld_count;
+   unsigned int yld_exp_empty;
+   unsigned int yld_act_empty;
+   unsigned int yld_both_empty;
+   unsigned int yld_count;

/* schedule() stats */
-   unsigned long sched_switch;
-   unsigned long sched_count;
-   unsigned long sched_goidle;
+   unsigned int sched_switch;
+   unsigned int sched_count;
+   unsigned int sched_goidle;

/* try_to_wake_up() stats */
-   unsigned long ttwu_count;
-   unsigned long ttwu_local;
+   unsigned int ttwu_count;
+   unsigned int ttwu_local;

/* BKL stats */
-   unsigned long bkl_count;
+   unsigned int bkl_count;
 #endif
struct lock_class_key rq_lock_key;
 };
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index a5e517e..e6fb392 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -137,7 +137,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, "  .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, "  .%-30s: %ld\n", "load", cfs_rq->load.weight);
 #ifdef CONFIG_SCHEDSTATS
-   SEQ_printf(m, "  .%-30s: %ld\n", "bkl_count",
+   SEQ_printf(m, "  .%-30s: %d\n", "bkl_count",
rq->bkl_count);
 #endif
SEQ_printf(m, "  .%-30s: %ld\n", "nr_spread_over",
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 1c08484..ef1a7df 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -21,7 +21,7 @@ static int show_schedstat(struct seq_file *seq, void *v

[patch] sched: fix improper load balance across sched domain

2007-10-16 Thread Ken Chen
We recently discovered a nasty performance bug in the kernel CPU load
balancer where we were hit by a 50% performance regression.

When tasks are assigned to a subset of CPUs that spans across
sched_domains (either a ccNUMA node or the new multi-core domain) via
cpu affinity, the kernel fails to perform proper load balancing at
these domains, because several pieces of logic in find_busiest_group()
misidentify the busiest sched group within a given domain. This leads
to inadequate load balancing and causes the 50% performance hit.

To give you a concrete example, on a dual-core, 2-socket numa system,
there are 4 logical CPUs, organized as:

CPU0 attaching sched-domain:
 domain 0: span 0003  groups: 0001 0002
 domain 1: span 000f  groups: 0003 000c
CPU1 attaching sched-domain:
 domain 0: span 0003  groups: 0002 0001
 domain 1: span 000f  groups: 0003 000c
CPU2 attaching sched-domain:
 domain 0: span 000c  groups: 0004 0008
 domain 1: span 000f  groups: 000c 0003
CPU3 attaching sched-domain:
 domain 0: span 000c  groups: 0008 0004
 domain 1: span 000f  groups: 000c 0003

If I run 2 tasks with CPU affinity set to 0x5, there are situations
where cpu0 has a run queue length of 2 while cpu2 is idle.  The
kernel load balancer is unable to balance these two tasks across
cpu0 and cpu2 because at least three pieces of logic in
find_busiest_group() heavily bias load balancing towards power-saving
mode.  E.g., while determining the "busiest" variable, the kernel only
sets it when "sum_nr_running > group_capacity".  This test is flawed
in that "sum_nr_running" is not necessarily the same as the number of
tasks allowed to run within the sched group.  The end result is that
the kernel "thinks" everything is balanced, but in reality we have an
imbalance, causing one CPU to be over-subscribed while the other is
left idle.  Two other pieces of logic in the same function cause a
similar effect.  The nastiness of this bug is that the kernel cannot
get unstuck from this unfortunate broken state.  From what we've seen
in our environment, the kernel stays stuck in the imbalanced state for
extended periods of time, and it is also very easy for the kernel to
get stuck in that state (it's pretty much 100% reproducible for us).
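
To make the failure concrete, here is a worked reading of the masks
above (an illustration, not code from the patch):

    domain-0 groups: 0003 = {cpu0, cpu1}, 000c = {cpu2, cpu3}
    task affinity:   0x5  = {cpu0, cpu2}

    both tasks on cpu0:  sum_nr_running = 2, group_capacity = 2
    old busiest test:    sum_nr_running > group_capacity -> 2 > 2, false
    new check:           max_cpu_load - min_cpu_load > SCHED_LOAD_SCALE
                         (cpu0 fully loaded, cpu1 idle) -> __group_imb set

So group 0003 is never marked busiest by the old test even though only
one of its CPUs is usable by these tasks, while the new per-group
imbalance check catches the loaded/idle split and forces spread mode.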

So I'm proposing the following fix: add additional logic in
find_busiest_group() to detect intrinsic imbalance within the busiest
group.  When such a condition is detected, load balancing goes into
spread mode instead of the default grouping mode.


Signed-off-by: Ken Chen <[EMAIL PROTECTED]>


--- ./kernel/sched.c.orig   2007-10-16 10:08:01.0 -0700
+++ ./kernel/sched.c2007-10-16 10:56:13.0 -0700
@@ -2339,7 +2339,7 @@
unsigned long max_pull;
unsigned long busiest_load_per_task, busiest_nr_running;
unsigned long this_load_per_task, this_nr_running;
-   int load_idx;
+   int load_idx, group_imb = 0;
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
int power_savings_balance = 1;
unsigned long leader_nr_running = 0, min_load_per_task = 0;
@@ -2358,9 +2358,10 @@
load_idx = sd->idle_idx;

do {
-   unsigned long load, group_capacity;
+   unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
int local_group;
int i;
+   int __group_imb = 0;
unsigned int balance_cpu = -1, first_idle_cpu = 0;
unsigned long sum_nr_running, sum_weighted_load;

@@ -2371,6 +2372,8 @@

/* Tally up the load of all CPUs in the group */
sum_weighted_load = sum_nr_running = avg_load = 0;
+   max_cpu_load = 0;
+   min_cpu_load = ~0UL;

for_each_cpu_mask(i, group->cpumask) {
struct rq *rq;
@@ -2391,8 +2394,13 @@
}

load = target_load(i, load_idx);
-   } else
+   } else {
load = source_load(i, load_idx);
+   if (load > max_cpu_load)
+   max_cpu_load = load;
+   if (min_cpu_load > load)
+   min_cpu_load = load;
+   }

avg_load += load;
sum_nr_running += rq->nr_running;
@@ -2418,6 +2426,9 @@
avg_load = sg_div_cpu_power(group,
avg_load * SCHED_LOAD_SCALE);

+   if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE)
+   __group_imb = 1;
+
group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;

if (local_group) {
@@ -2426,11 +2437,12 @@
this_nr_running = sum_nr_running;
this_load_per_task = sum_weighted_load;
} else if (avg_load > max_load &&
-  sum_nr_running > group_capacity) {
+  (sum_nr_running

Re: [PATCH] hugetlb: Fix pool resizing corner case

2007-10-03 Thread Ken Chen
On 10/3/07, Dave Hansen <[EMAIL PROTECTED]> wrote:
> > Not quite.  Count can never go below the number of reserved pages plus
> > pages allocated to MAP_PRIVATE mappings.  That number is computed by:
> > (resv + (total - free)).
>
> So, (total - free) equals the number of MAP_PRIVATE pages?  Does that
> imply that all reserved pages are shared and that all shared pages are
> reserved?

no, not quite.  In-use huge pages (total - free) can be either private
or shared.  resv_huge_pages counts the number of pages that are
committed to shared mappings but not yet faulted in.

What the equation essentially computes is: resv_huge_pages + nr-huge-pages-in-use.
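
A quick numeric check of that floor (numbers invented for
illustration):

    total = 10, free = 4, resv = 3
    in-use = total - free = 6
    floor  = resv + in-use = 3 + 6 = 9

so the pool cannot be shrunk below 9 pages, whatever is written to
nr_hugepages.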

- Ken

Re: [PATCH] hugetlb: Fix pool resizing corner case

2007-10-03 Thread Ken Chen
On 10/3/07, Adam Litke <[EMAIL PROTECTED]> wrote:
> The key is that we don't want to shrink the pool below the number of
> pages we are committed to keeping around.  Before this patch, we only
> accounted for the pages we plan to hand out (reserved huge pages) but
> not the ones we've already handed out (total - free).  Does that make
> sense?

Good catch, adam.

From what I can see, the statement
if (count >= nr_huge_pages)
return nr_huge_pages;

in set_max_huge_pages() is useless because (1) we recalculate the "count"
variable below it; and (2) both try_to_free_low() and the while loop
below the call to try_to_free_low() will terminate correctly.  If you
feel like it, please clean it up as well.

If not, I'm fine with that.

Acked-by: Ken Chen <[EMAIL PROTECTED]>

Re: [PATCH] shm: Fix the filename of hugetlb sysv shared memory

2007-06-11 Thread Ken Chen

On 6/11/07, Adam Litke <[EMAIL PROTECTED]> wrote:

On 6/8/07, Eric W. Biederman <[EMAIL PROTECTED]> wrote:
> -struct file *hugetlb_zero_setup(size_t size)
> +struct file *hugetlb_file_setup(const char *name, size_t size)

The bulk of this patch seems to handle renaming this function.  Is
that really necessary?


It looks OK to me though, because the argument list to that function
is changed.  Avoiding the rename isn't going to reduce the patch size
either.  So we might as well change the name to match the function
name shmem_file_setup().

Re: + loop-preallocate-eight-loop-devices.patch added to -mm tree

2007-06-01 Thread Ken Chen

On 5/31/07, Andrew Morton <[EMAIL PROTECTED]> wrote:

Could you please send a fresh, shiny, new, changelogged patch against mainline?


OK, here it is.  It would be nice to merge this before final 2.6.22
release.  Thank you.


The kernel on-demand loop device instantiation breaks several user space
tools, as the tools are not ready to cope with the "on-demand" feature.  Fix
it by instantiating 8 loop devices by default and also by reinstating the
max_loop module parameter.


Signed-off-by: Ken Chen <[EMAIL PROTECTED]>
Acked-by: Al Viro <[EMAIL PROTECTED]>


diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 5526ead..0ed5470 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1354,7 +1354,7 @@ #endif
 */
static int max_loop;
module_param(max_loop, int, 0);
-MODULE_PARM_DESC(max_loop, "obsolete, loop device is created on-demand");
+MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);

@@ -1394,16 +1394,11 @@ int loop_unregister_transfer
EXPORT_SYMBOL(loop_register_transfer);
EXPORT_SYMBOL(loop_unregister_transfer);

-static struct loop_device *loop_init_one(int i)
+static struct loop_device *loop_alloc(int i)
{
struct loop_device *lo;
struct gendisk *disk;

-   list_for_each_entry(lo, &loop_devices, lo_list) {
-   if (lo->lo_number == i)
-   return lo;
-   }
-
lo = kzalloc(sizeof(*lo), GFP_KERNEL);
if (!lo)
goto out;
@@ -1427,8 +1422,6 @@ static struct loop_device *loop_init_one
disk->private_data   = lo;
disk->queue  = lo->lo_queue;
sprintf(disk->disk_name, "loop%d", i);
-   add_disk(disk);
-   list_add_tail(&lo->lo_list, &loop_devices);
return lo;

out_free_queue:
@@ -1439,15 +1432,37 @@ out:
return NULL;
}

-static void loop_del_one(struct loop_device *lo)
+static void loop_free(struct loop_device *lo)
{
-   del_gendisk(lo->lo_disk);
blk_cleanup_queue(lo->lo_queue);
put_disk(lo->lo_disk);
list_del(&lo->lo_list);
kfree(lo);
}

+static struct loop_device *loop_init_one(int i)
+{
+   struct loop_device *lo;
+
+   list_for_each_entry(lo, &loop_devices, lo_list) {
+   if (lo->lo_number == i)
+   return lo;
+   }
+
+   lo = loop_alloc(i);
+   if (lo) {
+   add_disk(lo->lo_disk);
+   list_add_tail(&lo->lo_list, &loop_devices);
+   }
+   return lo;
+}
+
+static void loop_del_one(struct loop_device *lo)
+{
+   del_gendisk(lo->lo_disk);
+   loop_free(lo);
+}
+
static struct kobject *loop_probe(dev_t dev, int *part, void *data)
{
struct loop_device *lo;
@@ -1464,28 +1479,77 @@ static struct kobject *loop_probe

static int __init loop_init(void)
{
-   if (register_blkdev(LOOP_MAJOR, "loop"))
-   return -EIO;
-   blk_register_region(MKDEV(LOOP_MAJOR, 0), 1UL << MINORBITS,
- THIS_MODULE, loop_probe, NULL, NULL);
+   int i, nr;
+   unsigned long range;
+   struct loop_device *lo, *next;
+
+   /*
+* loop module now has a feature to instantiate underlying device
+* structure on-demand, provided that there is an access dev node.
+* However, this will not work well with user space tool that doesn't
+* know about such "feature".  In order to not break any existing
+* tool, we do the following:
+*
+* (1) if max_loop is specified, create that many upfront, and this
+* also becomes a hard limit.
+* (2) if max_loop is not specified, create 8 loop device on module
+* load, user can further extend loop device by create dev node
+* themselves and have kernel automatically instantiate actual
+* device on-demand.
+*/
+   if (max_loop > 1UL << MINORBITS)
+   return -EINVAL;

if (max_loop) {
-   printk(KERN_INFO "loop: the max_loop option is obsolete "
-"and will be removed in March 2008\n");
+   nr = max_loop;
+   range = max_loop;
+   } else {
+   nr = 8;
+   range = 1UL << MINORBITS;
+   }
+
+   if (register_blkdev(LOOP_MAJOR, "loop"))
+   return -EIO;

+   for (i = 0; i < nr; i++) {
+   lo = loop_alloc(i);
+   if (!lo)
+   goto Enomem;
+   list_add_tail(&lo->lo_list, &loop_devices);
}
+
+   /* point of no return */
+
+   list_for_each_entry(lo, &loop_devices, lo_list)
+   add_disk(lo->lo_disk);
+
+   blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
+ THIS_MODULE, loop_probe, NULL, NULL);
+
printk(KERN_INFO "loop: module loaded\n");
return 0;
+
+Enomem:
+   printk(KERN_INFO "loop: out of memory\n");
+
+   list_for_each_entry_safe(lo, next, &loop_devices, lo_list


Re: + loop-preallocate-eight-loop-devices.patch added to -mm tree

2007-05-31 Thread Ken Chen

On 5/31/07, Andrew Morton <[EMAIL PROTECTED]> wrote:

I have a note here that it needs additional work.  This discussion:

http://lkml.org/lkml/2007/5/21/602

seemed to peter out and go nowhere?


The first rev that went into -mm needed work, and the above URL is the
result of feedback from Al Viro.  He also acked the patch in the
follow-on thread:
http://lkml.org/lkml/2007/5/22/5

I'm not sure how widely that last patch was tested; I've seen a few
reports that the patch works out OK.


Re: + loop-preallocate-eight-loop-devices.patch added to -mm tree

2007-05-31 Thread Ken Chen

On 5/31/07, walt <[EMAIL PROTECTED]> wrote:

Ken Chen wrote:
> On 5/21/07, Ken Chen <[EMAIL PROTECTED]> wrote:
...
> tested, like this?

Ken, your patch below works for me.  Are you going to send this
on to Linus?


I think akpm will route this to Linus.  Andrew?




Re: + loop-preallocate-eight-loop-devices.patch added to -mm tree

2007-05-21 Thread Ken Chen

On 5/21/07, Ken Chen <[EMAIL PROTECTED]> wrote:

On 5/21/07, Al Viro <[EMAIL PROTECTED]> wrote:
> No, it doesn't.  Really.  It's easy to split; untested incremental to your
> patch follows:
>
> for (i = 0; i < nr; i++) {
> -   if (!loop_init_one(i))
> -   goto err;
> +   lo = loop_alloc(i);
> +   if (!lo)
> +   goto Enomem;
> +   list_add_tail(&lo->lo_list, &loop_devices);
> }

ah, yes, use the loop_device list_head to link all the pending devices.


> +   /* point of no return */
> +
> +   list_for_each_entry(lo, &loop_devices, lo_list)
> +   add_disk(lo->lo_disk);
> +
> +   blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
> + THIS_MODULE, loop_probe, NULL, NULL);
> +
> printk(KERN_INFO "loop: module loaded\n");
> return 0;
> -err:
> -   loop_exit();
> +
> +Enomem:
> printk(KERN_INFO "loop: out of memory\n");
> +
> +   while (!list_empty(&loop_devices)) {
> +   lo = list_entry(loop_devices.next, struct loop_device, lo_list);
> +   loop_del_one(lo);
> +   }
> +
> +   unregister_blkdev(LOOP_MAJOR, "loop");
> return -ENOMEM;
>  }

I suppose the loop_del_one call in the Enomem label needs to be split
up too, since in the error path it hasn't done add_disk() yet?



tested, like this?

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 5526ead..0ed5470 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1354,7 +1354,7 @@ #endif
 */
static int max_loop;
module_param(max_loop, int, 0);
-MODULE_PARM_DESC(max_loop, "obsolete, loop device is created on-demand");
+MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);

@@ -1394,16 +1394,11 @@ int loop_unregister_transfer
EXPORT_SYMBOL(loop_register_transfer);
EXPORT_SYMBOL(loop_unregister_transfer);

-static struct loop_device *loop_init_one(int i)
+static struct loop_device *loop_alloc(int i)
{
struct loop_device *lo;
struct gendisk *disk;

-   list_for_each_entry(lo, &loop_devices, lo_list) {
-   if (lo->lo_number == i)
-   return lo;
-   }
-
lo = kzalloc(sizeof(*lo), GFP_KERNEL);
if (!lo)
goto out;
@@ -1427,8 +1422,6 @@ static struct loop_device *loop_init_one
disk->private_data   = lo;
disk->queue  = lo->lo_queue;
sprintf(disk->disk_name, "loop%d", i);
-   add_disk(disk);
-   list_add_tail(&lo->lo_list, &loop_devices);
return lo;

out_free_queue:
@@ -1439,15 +1432,37 @@ out:
return NULL;
}

-static void loop_del_one(struct loop_device *lo)
+static void loop_free(struct loop_device *lo)
{
-   del_gendisk(lo->lo_disk);
blk_cleanup_queue(lo->lo_queue);
put_disk(lo->lo_disk);
list_del(&lo->lo_list);
kfree(lo);
}

+static struct loop_device *loop_init_one(int i)
+{
+   struct loop_device *lo;
+
+   list_for_each_entry(lo, &loop_devices, lo_list) {
+   if (lo->lo_number == i)
+   return lo;
+   }
+
+   lo = loop_alloc(i);
+   if (lo) {
+   add_disk(lo->lo_disk);
+   list_add_tail(&lo->lo_list, &loop_devices);
+   }
+   return lo;
+}
+
+static void loop_del_one(struct loop_device *lo)
+{
+   del_gendisk(lo->lo_disk);
+   loop_free(lo);
+}
+
static struct kobject *loop_probe(dev_t dev, int *part, void *data)
{
struct loop_device *lo;
@@ -1464,28 +1479,77 @@ static struct kobject *loop_probe

static int __init loop_init(void)
{
-   if (register_blkdev(LOOP_MAJOR, "loop"))
-   return -EIO;
-   blk_register_region(MKDEV(LOOP_MAJOR, 0), 1UL << MINORBITS,
- THIS_MODULE, loop_probe, NULL, NULL);
+   int i, nr;
+   unsigned long range;
+   struct loop_device *lo, *next;
+
+   /*
+    * The loop module now has a feature to instantiate underlying device
+    * structures on-demand, provided that there is an accessed dev node.
+    * However, this will not work well with user space tools that don't
+    * know about such a "feature".  In order not to break any existing
+    * tool, we do the following:
+    *
+    * (1) if max_loop is specified, create that many devices upfront;
+    *     this also becomes a hard limit.
+    * (2) if max_loop is not specified, create 8 loop devices on module
+    *     load; users can further extend loop devices by creating dev
+    *     nodes themselves and having the kernel automatically
+    *     instantiate actual devices on-demand.
+    */
+   if (max_loop > 1UL << MINORBITS)
+       return -EINVAL;

if (max_loop) {
-   printk(KERN_INFO "loop: the max_loop option is obsolete "
-"and will be removed in March 2008\n");
+   nr = max_loop;
+   range = max_loop;

Re: + loop-preallocate-eight-loop-devices.patch added to -mm tree

2007-05-21 Thread Ken Chen

On 5/21/07, Al Viro <[EMAIL PROTECTED]> wrote:

No, it doesn't.  Really.  It's easy to split; untested incremental to your
patch follows:

for (i = 0; i < nr; i++) {
-   if (!loop_init_one(i))
-   goto err;
+   lo = loop_alloc(i);
+   if (!lo)
+   goto Enomem;
+   list_add_tail(&lo->lo_list, &loop_devices);
}


ah, yes, use the loop_device list_head to link all the pending devices.



+   /* point of no return */
+
+   list_for_each_entry(lo, &loop_devices, lo_list)
+   add_disk(lo->lo_disk);
+
+   blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
+ THIS_MODULE, loop_probe, NULL, NULL);
+
printk(KERN_INFO "loop: module loaded\n");
return 0;
-err:
-   loop_exit();
+
+Enomem:
printk(KERN_INFO "loop: out of memory\n");
+
+   while (!list_empty(&loop_devices)) {
+   lo = list_entry(loop_devices.next, struct loop_device, lo_list);
+   loop_del_one(lo);
+   }
+
+   unregister_blkdev(LOOP_MAJOR, "loop");
return -ENOMEM;
 }


I suppose the loop_del_one call in the Enomem label needs to be split
up too, since in the error path it hasn't done add_disk() yet?


Re: + loop-preallocate-eight-loop-devices.patch added to -mm tree

2007-05-21 Thread Ken Chen

On 5/21/07, Al Viro <[EMAIL PROTECTED]> wrote:

On Mon, May 21, 2007 at 03:00:55PM -0700, [EMAIL PROTECTED] wrote:
> + if (register_blkdev(LOOP_MAJOR, "loop"))
> + return -EIO;
> + blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
> +   THIS_MODULE, loop_probe, NULL, NULL);
> +
> + for (i = 0; i < nr; i++) {
> + if (!loop_init_one(i))
> + goto err;
> + }
> +
> + printk(KERN_INFO "loop: module loaded\n");
> + return 0;
> +err:
> + loop_exit();

This isn't good.  You *can't* fail once a single disk has been registered.
Anyone could've opened it by now.

IOW, you need to
* register region *after* you are past the point of no return


That option is a lot harder than I thought.  It requires an array to
keep the intermediate results (the preallocated "lo" device, blk_queue,
and disk structures) before calling add_disk() or registering the
region.  And this array could potentially have 1 million entries.
Maybe I could use vmalloc for it, but that seems rather sick.


Re: bug in 2.6.22-rc2: loop mount limited to one single iso image

2007-05-21 Thread Ken Chen

On 5/21/07, Ken Chen <[EMAIL PROTECTED]> wrote:

The easiest way is to reinstate max_loop and create "max_loop" devices
up front at module load time.  However, that will lose all the "fancy
on-demand device instantiation feature".

So I propose we do the following:

1. have the module honor the "max_loop" parameter and create that many
devices upfront on module load (max_loop will also be a hard max) iff
the user specifies the parameter.
2. if max_loop is not specified, create 8 loop devices by default.
Users can extend loop devices by creating device nodes themselves and
have the kernel automatically instantiate loop devices on-demand.

Is this acceptable?  Patch in a bit.



Could people who have problems with the loop device please test this?
I tested it on my Ubuntu feisty distribution and it works fine, though
I typically don't use loop devices at all.
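
For illustration, here is a minimal sketch (not part of the patch) of
how a user space program could extend beyond the default 8 devices
under scheme (2): creating a /dev/loopN node by hand and then opening
it is what triggers the kernel's on-demand instantiation.  The device
name and the loop major number 7 are assumptions for the example.

#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/sysmacros.h>

int main(void)
{
	int fd;

	/* block device node: major 7 (loop), minor 8 -> /dev/loop8 */
	if (mknod("/dev/loop8", S_IFBLK | 0660, makedev(7, 8)) < 0 &&
	    errno != EEXIST) {
		perror("mknod");
		return 1;
	}

	/* opening the node is what instantiates loop8 on-demand */
	fd = open("/dev/loop8", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	close(fd);
	return 0;
}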

---

The kernel on-demand loop device instantiation breaks several user
space tools, as the tools are not ready to cope with the "on-demand
feature".  Fix it by instantiating 8 loop devices by default and also
reinstating the max_loop module parameter.

Signed-off-by: Ken Chen <[EMAIL PROTECTED]>

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 5526ead..0aae8d8 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1354,7 +1354,7 @@ #endif
 */
static int max_loop;
module_param(max_loop, int, 0);
-MODULE_PARM_DESC(max_loop, "obsolete, loop device is created on-demand");
+MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);

@@ -1462,34 +1462,66 @@ static struct kobject *loop_probe(dev_t
return kobj;
}

-static int __init loop_init(void)
-{
-   if (register_blkdev(LOOP_MAJOR, "loop"))
-   return -EIO;
-   blk_register_region(MKDEV(LOOP_MAJOR, 0), 1UL << MINORBITS,
- THIS_MODULE, loop_probe, NULL, NULL);
-
-   if (max_loop) {
-   printk(KERN_INFO "loop: the max_loop option is obsolete "
-"and will be removed in March 2008\n");
-
-   }
-   printk(KERN_INFO "loop: module loaded\n");
-   return 0;
-}
-
static void __exit loop_exit(void)
{
+   unsigned long range;
struct loop_device *lo, *next;

+   range = max_loop ? max_loop : 1UL << MINORBITS;
+
list_for_each_entry_safe(lo, next, &loop_devices, lo_list)
loop_del_one(lo);

-   blk_unregister_region(MKDEV(LOOP_MAJOR, 0), 1UL << MINORBITS);
+   blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
if (unregister_blkdev(LOOP_MAJOR, "loop"))
printk(KERN_WARNING "loop: cannot unregister blkdev\n");
}

+static int __init loop_init(void)
+{
+   int i, nr;
+   unsigned long range;
+
+   /*
+    * The loop module now has a feature to instantiate underlying device
+    * structures on-demand, provided that there is an accessed dev node.
+    * However, this will not work well with user space tools that don't
+    * know about such a "feature".  In order not to break any existing
+    * tool, we do the following:
+    *
+    * (1) if max_loop is specified, create that many devices upfront;
+    *     this also becomes a hard limit.
+    * (2) if max_loop is not specified, create 8 loop devices on module
+    *     load; users can further extend loop devices by creating dev
+    *     nodes themselves and having the kernel automatically
+    *     instantiate actual devices on-demand.
+    */
+   if (max_loop) {
+   nr = max_loop;
+   range = max_loop;
+   } else {
+   nr = 8;
+   range = 1UL << MINORBITS;
+   }
+
+   if (register_blkdev(LOOP_MAJOR, "loop"))
+   return -EIO;
+   blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
+ THIS_MODULE, loop_probe, NULL, NULL);
+
+   for (i = 0; i < nr; i++) {
+   if (!loop_init_one(i))
+   goto err;
+   }
+
+   printk(KERN_INFO "loop: module loaded\n");
+   return 0;
+err:
+   loop_exit();
+   printk(KERN_INFO "loop: out of memory\n");
+   return -ENOMEM;
+}
+
module_init(loop_init);
module_exit(loop_exit);

Re: bug in 2.6.22-rc2: loop mount limited to one single iso image

2007-05-21 Thread Ken Chen

On 5/21/07, Ken Chen <[EMAIL PROTECTED]> wrote:

yes and no. in that commit, I automatically create device n+1 when
loop device n is created, which was tested to be fine with casual
usage of "losetup" and "mount -o loop".  However, there was a bug in
that commit when loop.c was compiled as a module.  And when Al fixed
it, he also removed that magic "n+1" trick.

Nevertheless, yes, I'm guilty of introducing the new behavior.


The easiest way is to reinstate max_loop and create "max_loop" devices
up front at module load time.  However, that will lose all the "fancy
on-demand device instantiation feature".

So I propose we do the following:

1. have the module honor the "max_loop" parameter and create that many
devices upfront on module load (max_loop will also be a hard max) iff
the user specifies the parameter.
2. if max_loop is not specified, create 8 loop devices by default.
Users can extend loop devices by creating device nodes themselves and
have the kernel automatically instantiate loop devices on-demand.

Is this acceptable?  Patch in a bit.


Re: bug in 2.6.22-rc2: loop mount limited to one single iso image

2007-05-21 Thread Ken Chen

On 5/21/07, Linus Torvalds <[EMAIL PROTECTED]> wrote:

Yes. Can somebody who actually _uses_ loop send a tested patch, please?

We're not going to break existing user space over something idiotic like
this. Not acceptable.

The alternative is to just revert all the loop.c patches. Which I'll do
unless somebody sends me a tested fix - because as a non-loop user myself,
I don't have much choice. I assume it is

   commit 73285082 "remove artificial software max_loop limit"

that introduced the new behaviour. Ken?


yes and no. in that commit, I automatically create device n+1 when
loop device n is created, which was tested to be fine with casual
usage of "losetup" and "mount -o loop".  However, there was a bug in
that commit when loop.c was compiled as a module.  And when Al fixed
it, he also removed that magic "n+1" trick.

Nevertheless, yes, I'm guilty of introducing the new behavior.


Re: bug in 2.6.22-rc2: loop mount limited to one single iso image

2007-05-21 Thread Ken Chen

On 5/19/07, Ray Lee <[EMAIL PROTECTED]> wrote:

Yeah, that's the only one left. I was hoping it wasn't that one, as it
claimed to have been tested extensively. Guess it wasn't tested with
udev.

Ken? Ball's in your court. As the patch isn't providing a killer
feature for 2.6.22, I'd suggest just reverting it for now until the
issues are ironed out.


The real solution is to have the user space tools create these
device nodes in advance.

The original loop patch was coded such that when we open loop device
N, the immediately adjacent device "N + 1" is created.  This keeps
"mount -o loop" happy because it typically does a linear scan to find
a free device.  This might be somewhat hacky, but it is certainly
backward compatible until a user space solution is deployed.  (A
sketch of that linear scan follows below.)
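
Here is a hedged sketch of that linear scan (illustrative only, not
mount's actual code): a tool probes each /dev/loopN in order and treats
the first device whose LOOP_GET_STATUS64 ioctl fails with ENXIO as
free.  The 256-device bound is an assumption for the example.

#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/loop.h>

int main(void)
{
	struct loop_info64 info;
	char path[32];
	int i, fd;

	for (i = 0; i < 256; i++) {
		snprintf(path, sizeof(path), "/dev/loop%d", i);
		fd = open(path, O_RDWR);
		if (fd < 0)
			break;	/* no such node (or not openable): scan ends */
		/* ENXIO from LOOP_GET_STATUS64 means the device is unbound */
		if (ioctl(fd, LOOP_GET_STATUS64, &info) < 0 && errno == ENXIO) {
			printf("%s is free\n", path);
			close(fd);
			return 0;
		}
		close(fd);
	}
	fprintf(stderr, "no free loop device found\n");
	return 1;
}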

However, the code was removed by Al in this commit:

commit 07002e995638b83a6987180f43722a0eb39d4932
Author: Al Viro <[EMAIL PROTECTED]>
Date:   Sat May 12 16:23:15 2007 -0400

   fix the dynamic allocation and probe in loop.c

   Signed-off-by: Al Viro <[EMAIL PROTECTED]>
   Acked-by: Ken Chen <[EMAIL PROTECTED]>
   Signed-off-by: Linus Torvalds <[EMAIL PROTECTED]>



Re: 2.6.21-git11: BUG in loop.ko

2007-05-11 Thread Ken Chen

On 5/9/07, Andrew Morton <[EMAIL PROTECTED]> wrote:

On Wed, 09 May 2007 16:52:41 -0700
Jeremy Fitzhardinge <[EMAIL PROTECTED]> wrote:

> Seems to be getting a 0 refcount.  I don't see anything in the recent
> changes which might cause this, but this is relatively new behaviour.
> It was working for me in the 2.6.21-pre time period, but I haven't tried
> this since 2.6.21 was released.
>
> The BUG is actually triggered by the __module_get(THIS_MODULE) in
> loop_set_fd.

A few people have been playing with module refcounting lately.  Did you
work out a reproduce-it recipe?


Ah, it's a misunderstanding about what the kobj_probe_t function is
supposed to return on success.  When we open a loop device that has
not been initialized, we probe it via:

do_open
  get_gendisk
    kobj_lookup
      loop_probe

Notice that in kobj_lookup(), when p->probe() returns a non-zero value
(I presume it is an -ERRNO), it breaks out of the loop and propagates
the return value; otherwise, it loops back to the beginning of the for
loop and retries, and in there get_disk() will be called via p->lock()
to take a ref against the module.

kobj_lookup(...) {
retry:
   mutex_lock(domain->lock);
   for (p = domain->probes[MAJOR(dev) % 255]; p; p = p->next) {
   ...
   if (kobj)
   return kobj;
   goto retry;
}

So loop_probe() mistakenly returned the wrong status, which leads to a
future oops on an inconsistent module ref count.  The following patch
fixes the issue.
issue.

Signed-off-by: Ken Chen <[EMAIL PROTECTED]>


diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 18cdd8c..40f7bc2 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1460,6 +1460,7 @@ static void loop_del_one(struct loop_device *lo)
kfree(lo);
}

+/* return NULL for success, or a non-zero value if there is an error */
static struct kobject *loop_probe(dev_t dev, int *part, void *data)
{
unsigned int number = dev & MINORMASK;
@@ -1474,8 +1475,8 @@ static struct kobject *loop_probe
*part = 0;
if (IS_ERR(lo))
return (void *)lo;
-   else
-   return &lo->lo_disk->kobj;
+
+   return NULL;
}

static int __init loop_init(void)
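
As a reading aid, here is a hypothetical, self-contained toy model of
the retry contract described above.  All names are made up for
illustration; this is not the kernel's kobj_map code.

#include <stdio.h>
#include <stddef.h>

struct kobject { const char *name; };

static struct kobject loop0_kobj = { "loop0" };
static int instantiated;	/* set once the probe has registered the disk */

/* the normal lookup path; this is also where the module ref is taken */
static struct kobject *find_registered(void)
{
	return instantiated ? &loop0_kobj : NULL;
}

/* probe: registers the device as a side effect and returns NULL for
 * success, so that the lookup loops back and retries */
static struct kobject *probe(void)
{
	instantiated = 1;
	return NULL;
}

static struct kobject *lookup_toy(void)
{
	struct kobject *kobj;
retry:
	kobj = find_registered();
	if (kobj)
		return kobj;	/* found via the ref-taking path */
	kobj = probe();
	if (kobj)
		return kobj;	/* non-NULL: propagated, no ref taken */
	goto retry;		/* probe registered it; look it up again */
}

int main(void)
{
	printf("found %s\n", lookup_toy()->name);
	return 0;
}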




Re: + per-cpuset-hugetlb-accounting-and-administration.patch added to -mm tree

2007-05-04 Thread Ken Chen

On 5/3/07, Paul Jackson <[EMAIL PROTECTED]> wrote:

Note, Ken, that if we did that, the calculation of these new Total and
Free stats would be a little different from your new code.  Instead of
looping over the memory nodes in the current task's mems_allowed mask,
we would loop over the memory nodes allowed in the cpuset being queried
(the cpuset whose 'hugepages_total' or 'hugepages_free' special
file we were reading, not the current task's cpuset.)


This is even more controversial and messy.  akpm already dropped the
patch and expressed that he doesn't like it.  And I won't go down
another messy path. I will let this idea RIP.




Re: + per-cpuset-hugetlb-accounting-and-administration.patch added to -mm tree

2007-05-03 Thread Ken Chen

On 5/3/07, Paul Jackson <[EMAIL PROTECTED]> wrote:

Adding Christoph Lameter <[EMAIL PROTECTED]> to the cc list, as he knows
more about hugetlb pages than I do.

This patch strikes me as a bit odd.

Granted, it's solving what could be a touchy problem with a fairly
simple solution, which is usually a Good Thing(tm).

However, the idea that different tasks would see different values for
the following fields in /proc/meminfo:

HugePages_Total: 0
HugePages_Free:  0

strikes me as odd, and risky.  I would have thought that usually, all
tasks in the system should see the same values in the files in /proc
(as opposed to the files in particular task subdirectories /proc/<pid>.)

This patch strikes me as a bit of a hack, good for compatibility, but
hiding a booby trap that will bite some user code in the long run.

But I'm not enough of an expert to know what the right tradeoffs are
in this matter.


Would annotating the Hugepages_* fields with the name of the cpuset
help?  I originally thought that since cpusets' mems are hierarchical
in memory assignment, it is fairly straightforward to understand what's
going on: a parent cpuset's stats include its own and all of its
children's.  For example, if the root cpuset has two sub-cpusets, job1
and job2, with 20 and 30 htlb pages respectively, when queried at each
level we have:

[EMAIL PROTECTED] echo $$ > /dev/cpuset/tasks
[EMAIL PROTECTED] grep HugePages_Total /proc/meminfo
HugePages_Total:50

[EMAIL PROTECTED] echo $$ > /dev/cpuset/job1/tasks
[EMAIL PROTECTED] grep HugePages_Total /proc/meminfo
HugePages_Total:20

[EMAIL PROTECTED] echo $$ > /dev/cpuset/job2/tasks
[EMAIL PROTECTED] grep HugePages_Total /proc/meminfo
HugePages_Total:30
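
A hypothetical sketch of the node-summing that would produce those
numbers (made-up node counts and allowed-node masks, not the actual
patch code):

#include <stdio.h>

#define NR_NODES 4

/* pretend job1's pages sit on node 1 and job2's on node 2 */
static unsigned nr_huge_pages_node[NR_NODES] = { 0, 20, 30, 0 };

/* a cpuset's total is the sum over the nodes it is allowed to use */
static unsigned cpuset_hugepages_total(unsigned long mems_allowed)
{
	unsigned total = 0;
	int nid;

	for (nid = 0; nid < NR_NODES; nid++)
		if (mems_allowed & (1UL << nid))
			total += nr_huge_pages_node[nid];
	return total;
}

int main(void)
{
	printf("root: %u\n", cpuset_hugepages_total(0xFUL));	/* 50 */
	printf("job1: %u\n", cpuset_hugepages_total(1UL << 1));	/* 20 */
	printf("job2: %u\n", cpuset_hugepages_total(1UL << 2));	/* 30 */
	return 0;
}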

If this is odd, do you have any suggestions for an alternative?

- Ken




Re: cache-pipe-buf-page-address-for-non-highmem-arch.patch

2007-05-02 Thread Ken Chen

On 5/1/07, Andrew Morton <[EMAIL PROTECTED]> wrote:

Fair enough, it is a bit of an ugly thing.  And I see no measurements there
on what the overall speedup was for any workload.

Ken, which memory model was in use?  sparsemem?


discontigmem with CONFIG_NUMA on.




Re: [patch] convert aio event reap to use atomic-op instead of spin_lock

2007-04-12 Thread Ken Chen

On 4/12/07, Ken Chen <[EMAIL PROTECTED]> wrote:

On 4/12/07, Jeff Moyer <[EMAIL PROTECTED]> wrote:
> I didn't see any response to Zach's request for code that actually
> tests out the shared ring buffer.  Do you have such code?

Yes, I do.  I have been stress testing the code since last night.
After 20+ hours of stress runs with fio and aio-stress, I'm now
posting it with confidence.

I modified libaio's io_getevents to take advantage of the new user
level reap function.  The feature is exported via ring->compat_features.
btw, is compat_features supposed to be a version number or a bit mask?
I think a bitmask makes more sense and is more flexible.


Additional patch on the kernel side to export the new features.  On
top of patch posted at:
http://marc.info/?l=linux-kernel&m=117636401818057&w=2

--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -138,8 +138,11 @@ #define init_sync_kiocb(x, filp)   \
init_wait((&(x)->ki_wait)); \
} while (0)

+#define AIO_RING_BASE  1
+#define AIO_RING_USER_REAP 2
+
#define AIO_RING_MAGIC  0xa10a10a1
-#define AIO_RING_COMPAT_FEATURES   1
+#define AIO_RING_COMPAT_FEATURES   (AIO_RING_BASE | AIO_RING_USER_REAP)
#define AIO_RING_INCOMPAT_FEATURES  0
struct aio_ring {
unsignedid; /* kernel internal index number */


Re: [patch] convert aio event reap to use atomic-op instead of spin_lock

2007-04-12 Thread Ken Chen

On 4/12/07, Jeff Moyer <[EMAIL PROTECTED]> wrote:

I didn't see any response to Zach's request for code that actually
tests out the shared ring buffer.  Do you have such code?


Yes, I do.  I have been stress testing the code since last night.
After 20+ hours of stress runs with fio and aio-stress, I'm now
posting it with confidence.

I modified libaio's io_getevents to take advantage of the new user
level reap function.  The feature is exported via ring->compat_features.
btw, is compat_features supposed to be a version number or a bit mask?
I think a bitmask makes more sense and is more flexible.
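
To illustrate the difference (stand-alone snippet, not part of libaio):
a version number only supports ordered comparisons, while a bitmask
lets each feature be tested independently, so unrelated features can be
added later without breaking existing tests.

#include <stdio.h>

#define AIO_RING_BASE		1	/* bit 0 */
#define AIO_RING_USER_REAP	2	/* bit 1 */

/* bitmask reading: unaffected by any other bits that may appear later */
static int supports_user_reap(unsigned compat_features)
{
	return (compat_features & AIO_RING_USER_REAP) != 0;
}

int main(void)
{
	unsigned features = AIO_RING_BASE | AIO_RING_USER_REAP;

	printf("user reap supported: %d\n", supports_user_reap(features));
	return 0;
}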

(warning: some lines are extremely long in the patch and my email
client will probably mangle it badly).


diff -Nurp libaio-0.3.104/src/io_getevents.c
libaio-0.3.104-new/src/io_getevents.c
--- libaio-0.3.104/src/io_getevents.c   2003-06-18 12:58:21.0 -0700
+++ libaio-0.3.104-new/src/io_getevents.c   2007-04-12 17:35:06.0 -0700
@@ -21,10 +21,13 @@
#include 
#include 
#include "syscall.h"
+#include 

io_syscall5(int, __io_getevents_0_4, io_getevents, io_context_t, ctx,
long, min_nr, long, nr, struct io_event *, events, struct timespec *,
timeout)

#define AIO_RING_MAGIC  0xa10a10a1
+#define AIO_RING_BASE  1
+#define AIO_RING_USER_REAP 2

/* Ben will hate me for this */
struct aio_ring {
@@ -41,7 +44,11 @@ struct aio_ring {

int io_getevents_0_4(io_context_t ctx, long min_nr, long nr, struct
io_event * events, struct timespec * timeout)
{
+   long i = 0, ret;
+   unsigned head;
+   struct io_event *evt_base;
struct aio_ring *ring;
+
ring = (struct aio_ring*)ctx;
if (ring==NULL || ring->magic != AIO_RING_MAGIC)
goto do_syscall;
@@ -49,9 +56,35 @@ int io_getevents_0_4(io_context_t ctx, l
if (ring->head == ring->tail)
return 0;
}
-   
+
+   if (!(ring->compat_features & AIO_RING_USER_REAP))
+   goto do_syscall;
+
+   if (min_nr > nr || min_nr < 0 || nr < 0)
+   return -EINVAL;
+
+   evt_base = (struct io_event *) (ring + 1);
+   while (i < nr) {
+   head = ring->head;
+   if (head == ring->tail)
+   break;
+
+   *events = evt_base[head & (ring->nr - 1)];
+   if (head == cmpxchg(&ring->head, head, head + 1)) {
+   events++;
+   i++;
+   }
+   }
+
+   if (i >= min_nr)
+   return i;
+
do_syscall: 
-   return __io_getevents_0_4(ctx, min_nr, nr, events, timeout);
+   ret = __io_getevents_0_4(ctx, min_nr - i, nr - i, events, timeout);
+   if (ret >= 0)
+   return i + ret;
+   else
+   return i ? i : ret;
}

DEFSYMVER(io_getevents_0_4, io_getevents, 0.4)


Re: [patch] convert aio event reap to use atomic-op instead of spin_lock

2007-04-12 Thread Ken Chen

On 4/11/07, Zach Brown <[EMAIL PROTECTED]> wrote:

First, I'll NAK this and all AIO patches until the patch description
says that it's been run through the regression tests that we've started
collecting in autotest.  They're trivial to run, never fear:

 cd /usr/local
 svn checkout svn://test.kernel.org/autotest/trunk/client autotest
 cd autotest
 bin/autotest tests/aio_dio_bugs/control


I ran through the autotest (with a bug fix in the test code).  It
passes the regression tests.  I made the following changes since the
last rev:

* struct aio_ring stays unchanged.  I added a cast to atomic_cmpxchg()
* aio_ring_event() had grown too big, so I turned it into a function
* took out the whitespace change from the previous rev.

Open issue that this patch does not try to fix:
* kmap address aliasing needs a flush_dcache_page

Zach, does this make you more comfortable? (or maybe "less iffy" is the
right word?)


From: Ken Chen <[EMAIL PROTECTED]>

Resurrect an old patch that uses atomic operations to update the ring
buffer index on the AIO event queue.  This work allows further
application/libaio optimization to run the io_getevents fast path in
user space.

I've also added one more change on top of the old implementation: round
the ring buffer size up to a power of 2 to allow fast head/tail
calculation.  With the new scheme, there is no more modulo operation on
them and we simply increment either pointer index directly.  This
scheme also handles integer wrap automatically, without any additional
special treatment.
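
A small stand-alone demonstration of that indexing scheme (illustrative,
not the kernel code): the head/tail indices are free-running counters
and the slot comes from masking with size-1, so unsigned integer wrap
needs no special casing.

#include <stdio.h>

#define RING_SIZE 8	/* must be a power of two */

int main(void)
{
	unsigned tail = 4294967292u;	/* near UINT_MAX, to show the wrap */
	int i;

	for (i = 0; i < 8; i++) {
		/* slot index by masking: no modulo, wrap is automatic */
		printf("tail=%u -> slot %u\n", tail, tail & (RING_SIZE - 1));
		tail++;
	}
	return 0;
}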

The patch was stress tested and, in addition, run through
autotest:aio_dio_bugs.  The results were all passes.  Output of
autotest:aio_dio_bugs:
ran for 200 seconds without error, passing
AIO read of last block in file succeeded.
aio-free-ring-with-bogus-nr-pages: Success!
aio-io-setup-with-nonwritable-context-pointer: Success!
GOOD aio_dio_bugs completed successfully


Signed-off-by: Ken Chen <[EMAIL PROTECTED]>
diff --git a/fs/aio.c b/fs/aio.c
index e4598d6..f8b8e45 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -108,8 +108,8 @@ static int aio_setup_ring(struct kioctx *ctx)
 	unsigned long size;
 	int nr_pages;
 
-	/* Compensate for the ring buffer's head/tail overlap entry */
-	nr_events += 2;	/* 1 is required, 2 for good luck */
+	/* round nr_event to next power of 2 */
+	nr_events = roundup_pow_of_two(nr_events);
 
 	size = sizeof(struct aio_ring);
 	size += sizeof(struct io_event) * nr_events;
@@ -118,8 +118,6 @@ static int aio_setup_ring(struct kioctx *ctx)
 	if (nr_pages < 0)
 		return -EINVAL;
 
-	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
-
 	info->nr = 0;
 	info->ring_pages = info->internal_pages;
 	if (nr_pages > AIO_RING_PAGES) {
@@ -177,14 +175,16 @@ static int aio_setup_ring(struct kioctx *ctx)
 #define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
 #define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
 
-#define aio_ring_event(info, nr, km) ({	\
-	unsigned pos = (nr) + AIO_EVENTS_OFFSET;			\
-	struct io_event *__event;	\
-	__event = kmap_atomic(		\
-			(info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \
-	__event += pos % AIO_EVENTS_PER_PAGE;\
-	__event;			\
-})
+static struct io_event *aio_ring_event(struct aio_ring_info *info,
+	unsigned nr, enum km_type km)
+{
+	unsigned pos = (nr & (info->nr - 1)) + AIO_EVENTS_OFFSET;
+	struct io_event *event;
+
+	event = kmap_atomic(info->ring_pages[pos / AIO_EVENTS_PER_PAGE], km);
+	event += pos % AIO_EVENTS_PER_PAGE;
+	return event;
+}
 
 #define put_aio_ring_event(event, km) do {	\
 	struct io_event *__event = (event);	\
@@ -220,7 +220,6 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	atomic_set(&ctx->users, 1);
 	spin_lock_init(&ctx->ctx_lock);
-	spin_lock_init(&ctx->ring_info.ring_lock);
 	init_waitqueue_head(>wait);
 
 	INIT_LIST_HEAD(>active_reqs);
@@ -927,7 +926,7 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
 	struct aio_ring	*ring;
 	struct io_event	*event;
 	unsigned long	flags;
-	unsigned long	tail;
+	unsigned	tail;
 	int		ret;
 
 	/*
@@ -969,8 +968,6 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
 
 	tail = info->tail;
 	event = aio_ring_event(info, tail, KM_IRQ0);
-	if (++tail >= info->nr)
-		tail = 0;
 
 	event->obj = (u64)(unsigned long)iocb->ki_obj.user;
 	event->data = iocb->ki_user_data;
@@ -986,13 +983,14 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
 	 */
 	smp_wmb();	/* make event visible before updating tail */
 
+	tail++;
 	info->tail = tail;
 	ring->tail = tail;
 
 	put_aio_ring_event(event, KM_IRQ0);
 	kunmap_atomic(ring, KM_IRQ1);
 
-	pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+	pr_debug("added to ring %p at [%u]\n", iocb, tail);
 put_rq:
 	/* everything turned out well, dispose of the aiocb. */
 	ret = __aio_put_req(ctx, iocb);
@@ -1007,39 +1005,35 @@ put_rq:
 /* aio_read_evt
  *	Pull an event off of the ioctx's event ring.  Returns the number of
  *	events fetched (0 or 1

Re: [patch] convert aio event reap to use atomic-op instead of spin_lock

2007-04-12 Thread Ken Chen

On 4/11/07, Ken Chen <[EMAIL PROTECTED]> wrote:

On 4/11/07, Zach Brown <[EMAIL PROTECTED]> wrote:
> First, I'll NAK this and all AIO patches until the patch description
> says that it's been run through the regression tests that we've started
> collecting in autotest.  They're trivial to run, never fear:

OK.  I will run those regression tests.


Unfortunately, the aio_dio_bugs test in autotest has a bug in it :-(
We need to stress-test the "test code".

on stock 2.6.21-rc6 kernel:

[rock-me-baby]$ cd autotest/tests/aio_dio_bugs/src
[rock-me-baby]$ make
[rock-me-baby]$ ./aio-free-ring-with-bogus-nr-pages
aio-free-ring-with-bogus-nr-pages: Error: io_setup returned -22,
expected -ENOMEM

hmm???  The problem is that the test code forgot to initialize the ctx
variable, and in the kernel, sys_io_setup returns -EINVAL if the
user-supplied address contains a non-zero value.

I will submit the following patch to autotest to correct the test code.

--- aio-free-ring-with-bogus-nr-pages.c.orig	2007-04-11 23:57:45 -0700
+++ aio-free-ring-with-bogus-nr-pages.c 2007-04-11 23:57:59 -0700
@@ -39,7 +39,7 @@
int main(int __attribute__((unused)) argc, char **argv)
{
long res;
-   io_context_t ctx;
+   io_context_t ctx = 0;
void* map;

while (1) {
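
For reference, the kernel-side rule the test tripped over, as a minimal
usage sketch (error handling trimmed): sys_io_setup() refuses to write
the new context over a destination that already holds a non-zero value,
so the io_context_t must be zero-initialized before the call.

#include <libaio.h>
#include <stdio.h>

int main(void)
{
	io_context_t ctx = 0;	/* must be zeroed: the kernel returns
				 * -EINVAL if *ctxp is non-zero */
	int ret = io_setup(128, &ctx);

	if (ret < 0) {
		fprintf(stderr, "io_setup: %d\n", ret);
		return 1;
	}
	io_destroy(ctx);
	return 0;
}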

Re: [patch] convert aio event reap to use atomic-op instead of spin_lock

2007-04-12 Thread Ken Chen

On 4/12/07, Jeff Moyer <[EMAIL PROTECTED]> wrote:

I didn't see any response to Zach's request for code that actually
tests out the shared ring buffer.  Do you have such code?


Yes, I do.  I have been stress-testing the code since last night.  After 20+
hours of stress runs with fio and aio-stress, I'm now posting it with
confidence.

I modified libaio's io_getevents to take advantage of the new user-level
reap function.  The feature is exported via ring->compat_features.
btw, is compat_features supposed to be a version number or a bitmask?
I think a bitmask makes more sense and is more flexible.
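
A minimal sketch of the bitmask reading, assuming the AIO_RING_* bits
from the patch below (the struct is trimmed and the helper name is
illustrative):

#define AIO_RING_MAGIC		0xa10a10a1
#define AIO_RING_BASE		1
#define AIO_RING_USER_REAP	2

struct aio_ring_hdr {			/* trimmed stand-in for struct aio_ring */
	unsigned id;
	unsigned nr;
	unsigned head;
	unsigned tail;
	unsigned magic;
	unsigned compat_features;
};

static int ring_supports_user_reap(const struct aio_ring_hdr *ring)
{
	/* each capability is its own bit, so callers probe features
	 * independently instead of comparing a version number */
	return ring->magic == AIO_RING_MAGIC &&
	       (ring->compat_features & AIO_RING_USER_REAP);
}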

(warning: some lines are extremely long in the patch and my email
client will probably mangle it badly).


diff -Nurp libaio-0.3.104/src/io_getevents.c libaio-0.3.104-new/src/io_getevents.c
--- libaio-0.3.104/src/io_getevents.c   2003-06-18 12:58:21.0 -0700
+++ libaio-0.3.104-new/src/io_getevents.c   2007-04-12 17:35:06.0 -0700
@@ -21,10 +21,13 @@
#include <stdlib.h>
#include <time.h>
#include <syscall.h>
+#include <asm/system.h>

io_syscall5(int, __io_getevents_0_4, io_getevents, io_context_t, ctx,
long, min_nr, long, nr, struct io_event *, events, struct timespec *,
timeout)

#define AIO_RING_MAGIC  0xa10a10a1
+#define AIO_RING_BASE  1
+#define AIO_RING_USER_REAP 2

/* Ben will hate me for this */
struct aio_ring {
@@ -41,7 +44,11 @@ struct aio_ring {

int io_getevents_0_4(io_context_t ctx, long min_nr, long nr, struct
io_event * events, struct timespec * timeout)
{
+   long i = 0, ret;
+   unsigned head;
+   struct io_event *evt_base;
struct aio_ring *ring;
+
ring = (struct aio_ring*)ctx;
if (ring==NULL || ring->magic != AIO_RING_MAGIC)
goto do_syscall;
@@ -49,9 +56,35 @@ int io_getevents_0_4(io_context_t ctx, l
if (ring->head == ring->tail)
return 0;
}
-
+
+   if (!(ring->compat_features & AIO_RING_USER_REAP))
+   goto do_syscall;
+
+   if (min_nr > nr || min_nr < 0 || nr < 0)
+   return -EINVAL;
+
+   evt_base = (struct io_event *) (ring + 1);
+   while (i < nr) {
+   head = ring->head;
+   if (head == ring->tail)
+   break;
+
+   *events = evt_base[head & (ring->nr - 1)];
+   if (head == cmpxchg(&ring->head, head, head + 1)) {
+   events++;
+   i++;
+   }
+   }
+
+   if (i >= min_nr)
+   return i;
+
do_syscall:
-   return __io_getevents_0_4(ctx, min_nr, nr, events, timeout);
+   ret = __io_getevents_0_4(ctx, min_nr - i, nr - i, events, timeout);
+   if (ret >= 0)
+   return i + ret;
+   else
+   return i ? i : ret;
}

DEFSYMVER(io_getevents_0_4, io_getevents, 0.4)
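
As a usage note, the fast path pays off for polling callers; with
min_nr == 0 and a zero timeout, io_getevents() becomes a pure poll that
the patched library can serve entirely from the mmap'ed ring.  A hedged
sketch against the stock libaio API (error handling trimmed):

#include <libaio.h>

static int drain_ring(io_context_t ctx, struct io_event *ev, int cap)
{
	struct timespec zero = { 0, 0 };
	int n, total = 0;

	/* reap whatever is already completed without blocking */
	while ((n = io_getevents(ctx, 0, cap - total, ev + total, &zero)) > 0)
		total += n;
	return n < 0 ? n : total;
}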


Re: [patch] convert aio event reap to use atomic-op instead of spin_lock

2007-04-12 Thread Ken Chen

On 4/12/07, Ken Chen <[EMAIL PROTECTED]> wrote:

On 4/12/07, Jeff Moyer <[EMAIL PROTECTED]> wrote:
 I didn't see any response to Zach's request for code that actually
 tests out the shared ring buffer.  Do you have such code?

Yes, I do.  I have been stress-testing the code since last night.  After 20+
hours of stress runs with fio and aio-stress, I'm now posting it with
confidence.

I modified libaio's io_getevents to take advantage of the new user-level
reap function.  The feature is exported via ring->compat_features.
btw, is compat_features supposed to be a version number or a bitmask?
I think a bitmask makes more sense and is more flexible.


Additional patch on the kernel side to export the new features, on
top of the patch posted at:
http://marc.info/?l=linux-kernel&m=117636401818057&w=2

--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -138,8 +138,11 @@ #define init_sync_kiocb(x, filp)   \
init_wait(&((x)->ki_wait)); \
} while (0)

+#define AIO_RING_BASE  1
+#define AIO_RING_USER_REAP 2
+
#define AIO_RING_MAGIC  0xa10a10a1
-#define AIO_RING_COMPAT_FEATURES   1
+#define AIO_RING_COMPAT_FEATURES   (AIO_RING_BASE | AIO_RING_USER_REAP)
#define AIO_RING_INCOMPAT_FEATURES  0
struct aio_ring {
	unsigned	id;	/* kernel internal index number */


Re: [patch] convert aio event reap to use atomic-op instead of spin_lock

2007-04-11 Thread Ken Chen

On 4/11/07, Zach Brown <[EMAIL PROTECTED]> wrote:

First, I'll NAK this and all AIO patches until the patch description
says that it's been run through the regression tests that we've started
collecting in autotest.  They're trivial to run, never fear:


OK.  I will run those regression tests.



> Resurrect an old patch that uses atomic operation to update ring buffer
> index on AIO event queue.  This work allows further application/libaio
> optimization to run fast path io_getevents in user space.

I have mixed feelings.  I think the userspace getevents support was
poorly designed and the simple fact that we've gone this long without it
says just how desperately the feature isn't needed.


I kept on getting requests from application developers who want that
feature.  My initial patch dates back to May 2004.



We're allocating more ring elements
than userspace asked for and than were accounted for in aio_max_nr.
That cost, however teeny, will be measured against the benefit.


I noticed that while writing the patch.  We have the same bug right
now: nr_events is enlarged to consume the entire run of mmap'ed ring
buffer pages, yet we don't account for those in aio_max_nr.  I
didn't fix that up in this patch because I was going to do a separate
patch for that.



> - /* Compensate for the ring buffer's head/tail overlap entry */
> - nr_events += 2; /* 1 is required, 2 for good luck */
> + /* round nr_event to next power of 2 */
> + nr_events = roundup_pow_of_two(nr_events);

Is that buggy?  How will the code tell if head == tail means a full ring
or an empty ring?  The old code added that extra event to let it tell
the ring was full before head == tail and it would think it's empty
again, I think.  I'm guessing roundup(nr_events + 1) is needed.


I don't think it's a bug.  akpm taught me the trick: we don't wrap the
index around the ring size in this scheme, so the extra space is not
needed.
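
To make that concrete, here is a standalone sketch (illustrative names,
not the kernel code) of free-running indices over a power-of-2 ring;
full and empty stay distinguishable without a spare slot because the
indices are only masked at slot-lookup time:

#include <assert.h>

#define RING_NR 8u			/* must be a power of 2 */

struct ring {
	unsigned head;			/* next slot to consume; never masked */
	unsigned tail;			/* next slot to fill; never masked */
	int slot[RING_NR];
};

/* empty: indices equal; full: exactly RING_NR apart.  Unsigned
 * subtraction keeps this correct across 2^32 wraparound. */
static int ring_empty(const struct ring *r) { return r->head == r->tail; }
static int ring_full(const struct ring *r)  { return r->tail - r->head == RING_NR; }

static void ring_push(struct ring *r, int v)
{
	assert(!ring_full(r));
	r->slot[r->tail & (RING_NR - 1)] = v;	/* mask only here */
	r->tail++;				/* no modulo, plain increment */
}

static int ring_pop(struct ring *r)
{
	assert(!ring_empty(r));
	return r->slot[r->head++ & (RING_NR - 1)];
}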



The ring page, which is mmaped to userspace at some weird address, was
just written through a kmap.  Do we need a flush_dcache_page()?  This
isn't this patch's problem, but it'd need to be addressed if we're using
the ring from userspace.


I will look into this aside from this patch.

- Ken


Re: [patch] convert aio event reap to use atomic-op instead of spin_lock

2007-04-11 Thread Ken Chen

On 4/11/07, Zach Brown <[EMAIL PROTECTED]> wrote:

> Sorry, I wasn't thorough enough, partially because I was worried
> about changing the structure type of the user-space-facing struct aio_ring.
> Now that I have looked through all arches, it looks safe, as every
> arch's atomic_t has the same size as int.

> Here is the updated patch.

> @@ -144,7 +144,7 @@ struct kiocb {
>  struct aio_ring {
>   unsigned	id;	/* kernel internal index number */
>   unsigned	nr;	/* number of io_events */
> - unsigned	head;
> + atomic_t	head;
>   unsigned	tail;

Embedding an atomic_t in an ABI struct?  That makes everyone else
nervous too, right?


sure, it makes me nervous at least.



It may look safe on i386/x86-64 today, but this doesn't seem like wise
practice.  Is there any reason to believe that atomic_t will never
change size?  Does anything else do this already?

If nothing else, the "unsigned" (should be __u32, sigh) could be cast to
an atomic_t.


The atomic_t cast is equally iffy.  I don't know what the best solution would be.
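
For reference, the cast option might look like the sketch below
(kernel-style but illustrative: struct and helper names are made up,
and the BUILD_BUG_ON spells out the size assumption that makes the
approach fragile):

#include <linux/kernel.h>
#include <asm/atomic.h>

/* keep the ABI field a plain unsigned and cast at the use site */
struct aio_ring_abi {
	unsigned	id;
	unsigned	nr;
	unsigned	head;		/* ABI layout unchanged */
	unsigned	tail;
};

#define ring_head_atomic(ring)	((atomic_t *)&(ring)->head)

static unsigned ring_read_head(struct aio_ring_abi *ring)
{
	/* only sound while atomic_t and unsigned share size and
	 * representation; assert the first at build time */
	BUILD_BUG_ON(sizeof(atomic_t) != sizeof(unsigned));
	return atomic_read(ring_head_atomic(ring));
}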


Re: [patch] convert aio event reap to use atomic-op instead of spin_lock

2007-04-11 Thread Ken Chen

On 4/10/07, Andrew Morton <[EMAIL PROTECTED]> wrote:

On Tue, 10 Apr 2007 16:53:53 -0700 (PDT) [EMAIL PROTECTED] (Ken Chen) wrote:

> + } while (head != cmpxchg(&ring->head, head, head + 1));

A hasty grep indicates that only 14 out of 23 architectures implement
cmpxchg().


Sorry, I wasn't thorough enough, partially because I was worried
about changing the structure type of the user-space-facing struct aio_ring.
Now that I have looked through all arches, it looks safe, as every
arch's atomic_t has the same size as int.

Here is the updated patch.
diff --git a/fs/aio.c b/fs/aio.c
index e4598d6..091bcb4 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -108,8 +108,8 @@ static int aio_setup_ring(struct kioctx *ctx)
 	unsigned long size;
 	int nr_pages;
 
-	/* Compensate for the ring buffer's head/tail overlap entry */
-	nr_events += 2;	/* 1 is required, 2 for good luck */
+	/* round nr_event to next power of 2 */
+	nr_events = roundup_pow_of_two(nr_events);
 
 	size = sizeof(struct aio_ring);
 	size += sizeof(struct io_event) * nr_events;
@@ -118,8 +118,6 @@ static int aio_setup_ring(struct kioctx *ctx)
 	if (nr_pages < 0)
 		return -EINVAL;
 
-	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
-
 	info->nr = 0;
 	info->ring_pages = info->internal_pages;
 	if (nr_pages > AIO_RING_PAGES) {
@@ -159,7 +157,8 @@ static int aio_setup_ring(struct kioctx *ctx)
 	ring = kmap_atomic(info->ring_pages[0], KM_USER0);
 	ring->nr = nr_events;	/* user copy */
 	ring->id = ctx->user_id;
-	ring->head = ring->tail = 0;
+	atomic_set(&ring->head, 0);
+	ring->tail = 0;
 	ring->magic = AIO_RING_MAGIC;
 	ring->compat_features = AIO_RING_COMPAT_FEATURES;
 	ring->incompat_features = AIO_RING_INCOMPAT_FEATURES;
@@ -177,8 +176,8 @@ static int aio_setup_ring(struct kioctx *ctx)
 #define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
 #define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
 
-#define aio_ring_event(info, nr, km) ({	\
-	unsigned pos = (nr) + AIO_EVENTS_OFFSET;			\
+#define aio_ring_event(info, __nr, km) ({\
+	unsigned pos = ((__nr) & ((info)->nr - 1)) + AIO_EVENTS_OFFSET;	\
 	struct io_event *__event;	\
 	__event = kmap_atomic(		\
 			(info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \
@@ -220,7 +219,6 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	atomic_set(&ctx->users, 1);
 	spin_lock_init(&ctx->ctx_lock);
-	spin_lock_init(&ctx->ring_info.ring_lock);
 	init_waitqueue_head(>wait);
 
 	INIT_LIST_HEAD(>active_reqs);
@@ -927,7 +925,7 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
 	struct aio_ring	*ring;
 	struct io_event	*event;
 	unsigned long	flags;
-	unsigned long	tail;
+	unsigned	tail;
 	int		ret;
 
 	/*
@@ -969,8 +967,6 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
 
 	tail = info->tail;
 	event = aio_ring_event(info, tail, KM_IRQ0);
-	if (++tail >= info->nr)
-		tail = 0;
 
 	event->obj = (u64)(unsigned long)iocb->ki_obj.user;
 	event->data = iocb->ki_user_data;
@@ -986,13 +982,14 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
 	 */
 	smp_wmb();	/* make event visible before updating tail */
 
+	tail++;
 	info->tail = tail;
 	ring->tail = tail;
 
 	put_aio_ring_event(event, KM_IRQ0);
 	kunmap_atomic(ring, KM_IRQ1);
 
-	pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+	pr_debug("added to ring %p at [%u]\n", iocb, tail);
 put_rq:
 	/* everything turned out well, dispose of the aiocb. */
 	ret = __aio_put_req(ctx, iocb);
@@ -1007,42 +1004,36 @@ put_rq:
 /* aio_read_evt
  *	Pull an event off of the ioctx's event ring.  Returns the number of 
  *	events fetched (0 or 1 ;-)
- *	FIXME: make this use cmpxchg.
  *	TODO: make the ringbuffer user mmap()able (requires FIXME).
  */
 static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
 {
 	struct aio_ring_info *info = &ioctx->ring_info;
 	struct aio_ring *ring;
-	unsigned long head;
-	int ret = 0;
+	struct io_event *evp;
+	unsigned head;
+	int ret;
 
 	ring = kmap_atomic(info->ring_pages[0], KM_USER0);
-	dprintk("in aio_read_evt h%lu t%lu m%lu\n",
-		 (unsigned long)ring->head, (unsigned long)ring->tail,
-		 (unsigned long)ring->nr);
-
-	if (ring->head == ring->tail)
-		goto out;
-
-	spin_lock(&info->ring_lock);
+	dprintk("in aio_read_evt h%u t%u m%u\n",
+		 atomic_read(&ring->head), ring->tail, ring->nr);
 
-	head = ring->head % info->nr;
-	if (head != ring->tail) {
-		struct io_event *evp = aio_ring_event(info, head, KM_USER1);
+	do {
+		head = atomic_read(&ring->head);
+		if (head == ring->tail) {
+			ret = 0;
+			break;
+		}
+		evp = aio_ring_event(info, head, KM_USER1);
 		*ent = *evp;
-		head = (head + 1) % info->nr;
 		smp_mb(); /* finish reading the event before updating the head */
-		ring->head = head;
 		ret = 1;
 		put_aio_ring_event(evp, KM_USER1);
-	}
-	spin_unlock(&info->ring_lock);
+	} while (head != atomic_cmpxchg(&ring->head, head, head + 1));

-out:
 	kunmap_atomic(ring, KM_USER0);
-	dprintk("leaving aio_read_evt: %d  h%lu t%lu\n

Re: Why kmem_cache_free occupy CPU for more than 10 seconds?

2007-04-11 Thread Ken Chen

On 4/11/07, Peter Zijlstra <[EMAIL PROTECTED]> wrote:

On Wed, 2007-04-11 at 17:53 +0800, Zhao Forrest wrote:
> I got some new information:
> Before soft lockup message is out, we have:
> [EMAIL PROTECTED] home]# cat /proc/slabinfo |grep buffer_head
> buffer_head   10927942 10942560120   321 : tunables   32
> 168 : slabdata 341955 341955  6 : globalstat 37602996 11589379
> 11743736  01 6918 12166031 1013708
> : cpustat 35254590 2350698 13610965 907286
>
> Then after buffer_head is freed, we have:
> [EMAIL PROTECTED] home]# cat /proc/slabinfo |grep buffer_head
> buffer_head 9542  36384120   321 : tunables   32   16
>   8 : slabdata   1137   1137245 : globalstat 37602996 11589379
> 11743736  01 6983 20507478
> 1708818 : cpustat 35254625 2350704 16027174 1068367
>
> Does this huge number of buffer_head cause the soft lockup?

__blkdev_put() takes the BKL and bd_mutex
invalidate_mapping_pages() tries to take the PageLock

But no other locks seem held while free_buffer_head() is called

All these locks are preemptible (CONFIG_PREEMPT_BKL?=y) and should not
hog the cpu like that, what preemption mode have you got selected?
(CONFIG_PREEMPT_VOLUNTARY?=y)


also, you can try this patch:

http://groups.google.com/group/linux.kernel/browse_thread/thread/7086e4b9d5504dc9/c608bfea4614b07e?lnk=gst&q=+Big+kernel+lock+contention+in+do_open&rnum=1#c608bfea4614b07e


[patch] convert aio event reap to use atomic-op instead of spin_lock

2007-04-10 Thread Ken Chen
Resurrect an old patch that uses atomic operation to update ring buffer
index on AIO event queue.  This work allows further application/libaio
optimization to run fast path io_getevents in user space.

I've also added one more change on top of old implementation that rounds
ring buffer size to power of 2 to allow fast head/tail calculation. With
the new scheme, there is no more modulo operation on them and we simply
increment either pointer index directly.  This scheme also automatically
handles integer wrap nicely without any additional special treatment.


Signed-off-by: Ken Chen <[EMAIL PROTECTED]>


diff --git a/fs/aio.c b/fs/aio.c
index e4598d6..00ecd14 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -108,8 +108,8 @@ static int aio_setup_ring(struct kioctx 
unsigned long size;
int nr_pages;
 
-   /* Compensate for the ring buffer's head/tail overlap entry */
-   nr_events += 2; /* 1 is required, 2 for good luck */
+   /* round nr_event to next power of 2 */
+   nr_events = roundup_pow_of_two(nr_events);
 
size = sizeof(struct aio_ring);
size += sizeof(struct io_event) * nr_events;
@@ -118,8 +118,6 @@ static int aio_setup_ring(struct kioctx 
if (nr_pages < 0)
return -EINVAL;
 
-   nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / 
sizeof(struct io_event);
-
info->nr = 0;
info->ring_pages = info->internal_pages;
if (nr_pages > AIO_RING_PAGES) {
@@ -177,8 +175,8 @@ #define AIO_EVENTS_PER_PAGE (PAGE_SIZE /
 #define AIO_EVENTS_FIRST_PAGE  ((PAGE_SIZE - sizeof(struct aio_ring)) / 
sizeof(struct io_event))
 #define AIO_EVENTS_OFFSET  (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
 
-#define aio_ring_event(info, nr, km) ({
\
-   unsigned pos = (nr) + AIO_EVENTS_OFFSET;\
+#define aio_ring_event(info, __nr, km) ({  \
+   unsigned pos = ((__nr) & ((info)->nr - 1)) + AIO_EVENTS_OFFSET; \
struct io_event *__event;   \
__event = kmap_atomic(  \
(info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \
@@ -220,7 +218,6 @@ static struct kioctx *ioctx_alloc(unsign
 
atomic_set(&ctx->users, 1);
spin_lock_init(&ctx->ctx_lock);
-   spin_lock_init(&ctx->ring_info.ring_lock);
init_waitqueue_head(>wait);
 
INIT_LIST_HEAD(>active_reqs);
@@ -927,7 +924,7 @@ int fastcall aio_complete(struct kiocb *
struct aio_ring *ring;
struct io_event *event;
unsigned long   flags;
-   unsigned long   tail;
+   unsignedtail;
int ret;
 
/*
@@ -969,8 +966,6 @@ int fastcall aio_complete(struct kiocb *
 
tail = info->tail;
event = aio_ring_event(info, tail, KM_IRQ0);
-   if (++tail >= info->nr)
-   tail = 0;
 
event->obj = (u64)(unsigned long)iocb->ki_obj.user;
event->data = iocb->ki_user_data;
@@ -986,13 +981,14 @@ int fastcall aio_complete(struct kiocb *
 */
smp_wmb();  /* make event visible before updating tail */
 
+   tail++;
info->tail = tail;
ring->tail = tail;
 
put_aio_ring_event(event, KM_IRQ0);
kunmap_atomic(ring, KM_IRQ1);
 
-   pr_debug("added to ring %p at [%lu]\n", iocb, tail);
+   pr_debug("added to ring %p at [%u]\n", iocb, tail);
 put_rq:
/* everything turned out well, dispose of the aiocb. */
ret = __aio_put_req(ctx, iocb);
@@ -1007,42 +1003,36 @@ put_rq:
 /* aio_read_evt
  * Pull an event off of the ioctx's event ring.  Returns the number of 
  * events fetched (0 or 1 ;-)
- * FIXME: make this use cmpxchg.
  * TODO: make the ringbuffer user mmap()able (requires FIXME).
  */
 static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
 {
struct aio_ring_info *info = &ioctx->ring_info;
struct aio_ring *ring;
-   unsigned long head;
-   int ret = 0;
+   struct io_event *evp;
+   unsigned head;
+   int ret;
 
ring = kmap_atomic(info->ring_pages[0], KM_USER0);
-   dprintk("in aio_read_evt h%lu t%lu m%lu\n",
-(unsigned long)ring->head, (unsigned long)ring->tail,
-(unsigned long)ring->nr);
-
-   if (ring->head == ring->tail)
-   goto out;
-
-   spin_lock(&info->ring_lock);
+   dprintk("in aio_read_evt h%u t%u m%u\n",
+ring->head, ring->tail, ring->nr);
 
-   head = ring->head % info->nr;
-   if (head != ring->tail) {
-   struct io_event *evp = aio_ring_event(info, head, KM_USER1);
+   do {
+   head = ring->head;
+   if (head == ring->tail) {
+   ret = 0;
+   break;
+   }
+   evp = aio_ring_event(info, head, KM_USER1);
*ent = *evp;
-   head = (head + 1
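
For readers following the barriers: the smp_wmb() above pairs with the
smp_mb() on the reader side.  Below is a self-contained user-space
analog of that pairing, offered as a sketch only: the __sync builtins
stand in for the kernel primitives, and there is a single producer, as
in aio_complete(), which runs under the per-context lock.

struct ev { unsigned long data; };

struct evring {
	volatile unsigned head;		/* consumers advance via CAS */
	volatile unsigned tail;		/* the lone producer advances this */
	unsigned nr;			/* power of 2 */
	struct ev slot[];
};

static void publish(struct evring *r, struct ev e)
{
	r->slot[r->tail & (r->nr - 1)] = e;
	__sync_synchronize();		/* write event before new tail: smp_wmb() */
	r->tail = r->tail + 1;
}

static int consume(struct evring *r, struct ev *out)
{
	unsigned head;

	do {
		head = r->head;
		if (head == r->tail)
			return 0;	/* ring empty */
		*out = r->slot[head & (r->nr - 1)];
		__sync_synchronize();	/* copy event before moving head: smp_mb() */
	} while (__sync_val_compare_and_swap(&r->head, head, head + 1) != head);

	return 1;
}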

Re: [patch] remove artificial software max_loop limit

2007-04-01 Thread Ken Chen

On 4/1/07, Tomas M <[EMAIL PROTECTED]> wrote:

I believe that IF you _really_ need to preserve the max_loop module
parameter, then the parameter should _not_ be ignored, rather it should
have the same function like before - to limit the loop driver so if you
use max_loop=10 for example, it should not allow loop.c to create more
than 10 loops.


Blame the dual meaning that max_loop currently has: it initializes a
set of loop devices and, as a side effect, also sets the upper limit.
People are complaining about the former constraint, aren't they?  Does
anyone use the second meaning, the upper limit?

- Ken


Re: [patch] remove artificial software max_loop limit

2007-04-01 Thread Ken Chen

On 4/1/07, [EMAIL PROTECTED] <[EMAIL PROTECTED]> wrote:

not sure if this is a real issue and if it`s UML or loop related -
but how is low-memory situations being handled when
creating loop devices ?


The kernel returns -ENOMEM as an error code if there is no memory left
to initialize a loop device.



should losetup or dmesg tell "out of memory" if there is not
enough memory left ?


It should, as the kernel does pass that info back to the app.  Does
losetup check the return value of the open() syscall?


Re: [patch] remove artificial software max_loop limit

2007-03-31 Thread Ken Chen

On 3/31/07, Andrew Morton <[EMAIL PROTECTED]> wrote:

> Yes, the distros do, and they recommend it to their users a lot.

Thanks.  In that case I think we should retain the max_loop module parameter
for now.

Ken, when you respin that patch could you restore max_loop, and make its
use trigger a warning along the lines of "loop: the max_loop option is obsolete
and will be removed in March 2008"?


OK, here is a re-spin patch that I tested both as a module and linked
into vmlinux.  Both produce satisfactory results for me.  I also
enclosed the patch as an attachment just in case my email client
decides to eat whitespace in the in-line text.

--
Subject: remove artificial software max_loop limit
From: "Ken Chen" <[EMAIL PROTECTED]>

Remove the artificial maximum of 256 loop devices that can be created,
a legacy device number limit.  Searching through the lkml archive,
there are several instances where users complained about the artificial
limit that the loop driver imposes.  There is no reason to have such a
limit.

This patch removes the limit entirely and makes loop device and
associated block queue instantiation on-demand.  With on-demand
instantiation, it also gives the benefit of not wasting memory if these
devices are not in use (compared to the current implementation, which
always creates 8 loop devices), a net improvement in both areas.  This
version is tested both with creation of a large number of loop devices
and for compatibility with existing losetup/mount user-land tools.

There are a number of people who worked on this and provided valuable
suggestions, in no particular order, by:

Jens Axboe
Jan Engelhardt
Christoph Hellwig
Thomas M

Signed-off-by: Ken Chen <[EMAIL PROTECTED]>
Cc: Jan Engelhardt <[EMAIL PROTECTED]>
Cc: Christoph Hellwig <[EMAIL PROTECTED]>

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 6b5b642..605c1d3 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -77,9 +77,8 @@

#include <asm/uaccess.h>

-static int max_loop = 8;
-static struct loop_device *loop_dev;
-static struct gendisk **disks;
+static LIST_HEAD(loop_devices);
+static DEFINE_MUTEX(loop_devices_mutex);

/*
 * Transfer functions
@@ -183,7 +182,7 @@ figure_loop_size(struct loop_device *lo)
if (unlikely((loff_t)x != size))
return -EFBIG;

-   set_capacity(disks[lo->lo_number], x);
+   set_capacity(lo->lo_disk, x);
return 0;   
}

@@ -812,7 +811,7 @@ static int loop_set_fd
lo->lo_queue->queuedata = lo;
lo->lo_queue->unplug_fn = loop_unplug;

-   set_capacity(disks[lo->lo_number], size);
+   set_capacity(lo->lo_disk, size);
bd_set_size(bdev, size << 9);

set_blocksize(bdev, lo_blocksize);
@@ -832,7 +831,7 @@ out_clr:
lo->lo_device = NULL;
lo->lo_backing_file = NULL;
lo->lo_flags = 0;
-   set_capacity(disks[lo->lo_number], 0);
+   set_capacity(lo->lo_disk, 0);
invalidate_bdev(bdev, 0);
bd_set_size(bdev, 0);
mapping_set_gfp_mask(mapping, lo->old_gfp_mask);
@@ -918,7 +917,7 @@ static int loop_clr_fd
memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
memset(lo->lo_file_name, 0, LO_NAME_SIZE);
invalidate_bdev(bdev, 0);
-   set_capacity(disks[lo->lo_number], 0);
+   set_capacity(lo->lo_disk, 0);
bd_set_size(bdev, 0);
mapping_set_gfp_mask(filp->f_mapping, gfp);
lo->lo_state = Lo_unbound;
@@ -1322,6 +1321,18 @@ static long lo_compat_ioctl
}
#endif

+static struct loop_device *loop_find_dev(int number)
+{
+   struct loop_device *lo;
+
list_for_each_entry(lo, &loop_devices, lo_list) {
+   if (lo->lo_number == number)
+   return lo;
+   }
+   return NULL;
+}
+
+static struct loop_device *loop_init_one(int i);
static int lo_open(struct inode *inode, struct file *file)
{
struct loop_device *lo = inode->i_bdev->bd_disk->private_data;
@@ -1330,6 +1341,11 @@ static int lo_open
lo->lo_refcnt++;
mutex_unlock(&lo->lo_ctl_mutex);

+   mutex_lock(&loop_devices_mutex);
+   if (!loop_find_dev(lo->lo_number + 1))
+   loop_init_one(lo->lo_number + 1);
+   mutex_unlock(&loop_devices_mutex);
+
return 0;
}

@@ -1357,8 +1373,9 @@ static struct block_device_operations lo_fops = {
/*
 * And now the modules code and kernel interface.
 */
+static int max_loop;
module_param(max_loop, int, 0);
-MODULE_PARM_DESC(max_loop, "Maximum number of loop devices (1-256)");
+MODULE_PARM_DESC(max_loop, "obsolete, loop device is created on-demand");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);

@@ -1383,7 +1400,7 @@ int loop_unregister_transfer(int number)

xfer_funcs[n] = NULL;

-   for (lo = &loop_dev[0]; lo < &loop_dev[max_loop]; lo++) {
+   list_for_each_entry(lo, &loop_devices, lo_list) {
mutex_lock(&lo->lo_ctl_mutex);

if (lo->lo_encryption == xfer)
@@ -1398,91 +1415,110 @@ int
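
The on-demand behavior is visible from user space: opening /dev/loopN
makes the driver instantiate loopN+1 (the lo_open hunk above), so tools
that walk devices in order always find the next one.  A hedged demo,
assuming udev creates the nodes promptly:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char path[32];
	int i, fd;

	/* bounded walk for the demo; each successful open triggers
	 * creation of the next loop device behind the scenes */
	for (i = 0; i < 16; i++) {
		snprintf(path, sizeof(path), "/dev/loop%d", i);
		fd = open(path, O_RDONLY);
		if (fd < 0)
			break;	/* node not there (yet): stop */
		close(fd);
	}
	printf("walked %d loop devices\n", i);
	return 0;
}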


Re: [patch] remove artificial software max_loop limit

2007-03-30 Thread Ken Chen

On 3/30/07, Andrew Morton <[EMAIL PROTECTED]> wrote:

So..  this change will cause a fatal error for anyone who is presently
using max_loop, won't it?  If they're doing that within their
initramfs/initrd/etc then things could get rather ugly for them.


Probably, if they access loop devices non-sequentially.



I don't know how much of a problem this will be in practice - do people use
max_loop much?


I don't know either.



btw, did you test this change as both a module and as linked-into-vmlinux?


as linked-into-vmlinux.  why do you ask?  does it break as a module?
I made a last-minute change to a mutex name and shamefully posted without
doing a compile test.  Besides that, does something else break?

- Ken


Re: [patch] remove artificial software max_loop limit

2007-03-30 Thread Ken Chen

On 3/30/07, Jan Engelhardt <[EMAIL PROTECTED]> wrote:

> lo->lo_device = NULL;
> lo->lo_backing_file = NULL;
> lo->lo_flags = 0;
> - set_capacity(disks[lo->lo_number], 0);
> + set_capacity(lo->lo_disk, 0);
> invalidate_bdev(bdev, 0);
> bd_set_size(bdev, 0);
> mapping_set_gfp_mask(mapping, lo->old_gfp_mask);

Welcome to Google Mail, we eat your whitespace :-)


Oh, crap.  Google mail is innocent.  It was me who did some ugly
copy-paste between apps.


> + sprintf(disk->disk_name, "loop%d", i);

Let's make this %u, no?


I don't mind either way; this thing won't be bigger than 1 << 20 anyway.
Oh, which reminds me that we probably should explicitly test and cap
that limit (1 << 20, that is).
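
For illustration, one hypothetical way to apply that cap at the lo_open()
auto-instantiation site (not part of any posted patch; MINORBITS is the
20-bit dev_t minor width):

    /* hypothetical cap: never instantiate past the minor-number space */
    if (lo->lo_number + 1 < (1 << MINORBITS) &&
        !loop_find_dev(lo->lo_number + 1))
            loop_init_one(lo->lo_number + 1);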


> +static struct kobject *loop_probe(dev_t dev, int *part, void *data)
> +{
> + unsigned int number = dev & MINORMASK;
> + struct loop_device *lo;
>
> - for (i = 0; i < max_loop; i++) {
> - disks[i] = alloc_disk(1);
> - if (!disks[i])
> - goto out_mem3;
> - }

Could not we use kobject_uevent(), like I did (though perhaps without
the k.oops), to make udev create a node?


Well, I go with a version that doesn't panic the kernel, which is a
deciding factor here ;-)


Full patch attached.

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 6b5b642..1e87aea 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -77,9 +77,8 @@

#include <asm/uaccess.h>

-static int max_loop = 8;
-static struct loop_device *loop_dev;
-static struct gendisk **disks;
+static LIST_HEAD(loop_devices);
+static DEFINE_MUTEX(loop_devices_mutex);

/*
 * Transfer functions
@@ -183,7 +182,7 @@ figure_loop_size(struct loop_device *lo)
if (unlikely((loff_t)x != size))
return -EFBIG;

-   set_capacity(disks[lo->lo_number], x);
+   set_capacity(lo->lo_disk, x);
return 0;   
}

@@ -812,7 +811,7 @@ static int loop_set_fd
lo->lo_queue->queuedata = lo;
lo->lo_queue->unplug_fn = loop_unplug;

-   set_capacity(disks[lo->lo_number], size);
+   set_capacity(lo->lo_disk, size);
bd_set_size(bdev, size << 9);

set_blocksize(bdev, lo_blocksize);
@@ -832,7 +831,7 @@ out_clr:
lo->lo_device = NULL;
lo->lo_backing_file = NULL;
lo->lo_flags = 0;
-   set_capacity(disks[lo->lo_number], 0);
+   set_capacity(lo->lo_disk, 0);
invalidate_bdev(bdev, 0);
bd_set_size(bdev, 0);
mapping_set_gfp_mask(mapping, lo->old_gfp_mask);
@@ -918,7 +917,7 @@ static int loop_clr_fd
memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
memset(lo->lo_file_name, 0, LO_NAME_SIZE);
invalidate_bdev(bdev, 0);
-   set_capacity(disks[lo->lo_number], 0);
+   set_capacity(lo->lo_disk, 0);
bd_set_size(bdev, 0);
mapping_set_gfp_mask(filp->f_mapping, gfp);
lo->lo_state = Lo_unbound;
@@ -1322,6 +1321,18 @@ static long lo_compat_ioctl
}
#endif

+static struct loop_device *loop_find_dev(int number)
+{
+   struct loop_device *lo;
+
+   list_for_each_entry(lo, &loop_devices, lo_list) {
+   if (lo->lo_number == number)
+   return lo;
+   }
+   return NULL;
+}
+
+static struct loop_device *loop_init_one(int i);
static int lo_open(struct inode *inode, struct file *file)
{
struct loop_device *lo = inode->i_bdev->bd_disk->private_data;
@@ -1330,6 +1341,11 @@ static int lo_open
lo->lo_refcnt++;
mutex_unlock(&lo->lo_ctl_mutex);

+   mutex_lock(&loop_device_mutex);
+   if (!loop_find_dev(lo->lo_number + 1))
+   loop_init_one(lo->lo_number + 1);
+   mutex_unlock(&loop_device_mutex);
+
return 0;
}

@@ -1357,8 +1373,6 @@ static struct block_device_operations lo_fops
/*
 * And now the modules code and kernel interface.
 */
-module_param(max_loop, int, 0);
-MODULE_PARM_DESC(max_loop, "Maximum number of loop devices (1-256)");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);

@@ -1383,7 +1397,7 @@ int loop_unregister_transfer(int number)

xfer_funcs[n] = NULL;

-   for (lo = &loop_dev[0]; lo < &loop_dev[max_loop]; lo++) {
+   list_for_each_entry(lo, &loop_devices, lo_list) {
mutex_lock(&lo->lo_ctl_mutex);

if (lo->lo_encryption == xfer)
@@ -1398,102 +1412,106 @@ int loop_unregister_transfer(int number)
EXPORT_SYMBOL(loop_register_transfer);
EXPORT_SYMBOL(loop_unregister_transfer);

-static int __init loop_init(void)
+static struct loop_device *loop_init_one(int i)
{
-   int i;
+   struct loop_device *lo;
+   struct gendisk *disk;

-   if (max_loop < 1 || max_loop > 256) {
-   printk(KERN_WARNING "loop: invalid max_loop (must be between"
-   " 1 and 256), using default (8)\n");
-   max_loop = 8;
-   }
+   lo = kzalloc(sizeof(*lo), GFP_KERNEL);
+   if (!lo)
+   goto out;

-   if (register_blkdev(LOOP_MAJOR, "loop"))
- 

Re: [patch] remove artificial software max_loop limit

2007-03-30 Thread Ken Chen

On 3/30/07, Ken Chen <[EMAIL PROTECTED]> wrote:

Remove the artificial maximum of 256 loop devices that can be created, a
legacy device number limit.  Searching through the lkml archive, there are
several instances where users complained about the artificial limit
that the loop driver imposes.  There is no reason to have such a limit.

This patch removes the limit entirely and makes loop device and associated
block queue instantiation on-demand.  With on-demand instantiation, it
also gives the benefit of not wasting memory if these devices are not
in use (compared to the current implementation, which always creates 8 loop
devices), a net improvement in both areas.  This version is both
tested with creation of a large number of loop devices and compatible
with existing losetup/mount userland tools.

Signed-off-by: Ken Chen <[EMAIL PROTECTED]>


Spotted one bug: the sequence of loop device lookup, instantiation,
and insertion into the global loop_devices list had better be in one
critical section; otherwise an SMP race ensues.  Fix that up, and also use
a mutex instead of a spin_lock for loop_devices.

Signed-off-by: Ken Chen <[EMAIL PROTECTED]>
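
Concretely, lookup and creation now happen under one lock, roughly (a
sketch using the names defined in the patch below; error handling elided):

    mutex_lock(&loop_devices_mutex);
    lo = loop_find_dev(number);
    if (!lo)
            lo = loop_init_one(number); /* allocates and links into loop_devices */
    mutex_unlock(&loop_devices_mutex);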

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 6b5b642..1e87aea 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -77,9 +77,8 @@

#include <asm/uaccess.h>

-static int max_loop = 8;
-static struct loop_device *loop_dev;
-static struct gendisk **disks;
+static LIST_HEAD(loop_devices);
+static DEFINE_MUTEX(loop_devices_mutex);

/*
 * Transfer functions
@@ -183,7 +182,7 @@ figure_loop_size(struct loop_device *lo)
if (unlikely((loff_t)x != size))
return -EFBIG;

-   set_capacity(disks[lo->lo_number], x);
+   set_capacity(lo->lo_disk, x);
return 0;   
}

@@ -812,7 +811,7 @@ static int loop_set_fd
lo->lo_queue->queuedata = lo;
lo->lo_queue->unplug_fn = loop_unplug;

-   set_capacity(disks[lo->lo_number], size);
+   set_capacity(lo->lo_disk, size);
bd_set_size(bdev, size << 9);

set_blocksize(bdev, lo_blocksize);
@@ -832,7 +831,7 @@ out_clr:
lo->lo_device = NULL;
lo->lo_backing_file = NULL;
lo->lo_flags = 0;
-   set_capacity(disks[lo->lo_number], 0);
+   set_capacity(lo->lo_disk, 0);
invalidate_bdev(bdev, 0);
bd_set_size(bdev, 0);
mapping_set_gfp_mask(mapping, lo->old_gfp_mask);
@@ -918,7 +917,7 @@ static int loop_clr_fd
memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
memset(lo->lo_file_name, 0, LO_NAME_SIZE);
invalidate_bdev(bdev, 0);
-   set_capacity(disks[lo->lo_number], 0);
+   set_capacity(lo->lo_disk, 0);
bd_set_size(bdev, 0);
mapping_set_gfp_mask(filp->f_mapping, gfp);
lo->lo_state = Lo_unbound;
@@ -1322,6 +1321,18 @@ static long lo_compat_ioctl
}
#endif

+static struct loop_device *loop_find_dev(int number)
+{
+   struct loop_device *lo;
+
+   list_for_each_entry(lo, &loop_devices, lo_list) {
+   if (lo->lo_number == number)
+   return lo;
+   }
+   return NULL;
+}
+
+static struct loop_device *loop_init_one(int i);
static int lo_open(struct inode *inode, struct file *file)
{
struct loop_device *lo = inode->i_bdev->bd_disk->private_data;
@@ -1330,6 +1341,11 @@ static int lo_open
lo->lo_refcnt++;
mutex_unlock(&lo->lo_ctl_mutex);

+   mutex_lock(&loop_device_mutex);
+   if (!loop_find_dev(lo->lo_number + 1))
+   loop_init_one(lo->lo_number + 1);
+   mutex_unlock(&loop_device_mutex);
+
return 0;
}

@@ -1357,8 +1373,6 @@ static struct block_device_operations lo_fops
/*
 * And now the modules code and kernel interface.
 */
-module_param(max_loop, int, 0);
-MODULE_PARM_DESC(max_loop, "Maximum number of loop devices (1-256)");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);

@@ -1383,7 +1397,7 @@ int loop_unregister_transfer(int number)

xfer_funcs[n] = NULL;

-   for (lo = &loop_dev[0]; lo < &loop_dev[max_loop]; lo++) {
+   list_for_each_entry(lo, &loop_devices, lo_list) {
mutex_lock(&lo->lo_ctl_mutex);

if (lo->lo_encryption == xfer)
@@ -1398,102 +1412,106 @@ int loop_unregister_transfer(int number)
EXPORT_SYMBOL(loop_register_transfer);
EXPORT_SYMBOL(loop_unregister_transfer);

-static int __init loop_init(void)
+static struct loop_device *loop_init_one(int i)
{
-   int i;
+   struct loop_device *lo;
+   struct gendisk *disk;

-   if (max_loop < 1 || max_loop > 256) {
-   printk(KERN_WARNING "loop: invalid max_loop (must be between"
-   " 1 and 256), using default (8)\n");
-   max_loop = 8;
-   }
+   lo = kzalloc(sizeof(*lo), GFP_KERNEL);
+   if (!lo)
+   goto out;

[patch] remove artificial software max_loop limit

2007-03-30 Thread Ken Chen

Remove the artificial maximum of 256 loop devices that can be created, a
legacy device number limit.  Searching through the lkml archive, there are
several instances where users complained about the artificial limit
that the loop driver imposes.  There is no reason to have such a limit.

This patch removes the limit entirely and makes loop device and associated
block queue instantiation on-demand.  With on-demand instantiation, it
also gives the benefit of not wasting memory if these devices are not
in use (compared to the current implementation, which always creates 8 loop
devices), a net improvement in both areas.  This version is both
tested with creation of a large number of loop devices and compatible
with existing losetup/mount userland tools.

A number of people worked on this and provided valuable
suggestions, in no particular order:

Jens Axboe
Jan Engelhardt
Christoph Hellwig
Thomas M

Signed-off-by: Ken Chen <[EMAIL PROTECTED]>

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 6b5b642..7db2c38 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -77,9 +77,8 @@ #include <linux/kthread.h>

#include <asm/uaccess.h>

-static int max_loop = 8;
-static struct loop_device *loop_dev;
-static struct gendisk **disks;
+static LIST_HEAD(loop_devices);
+static DEFINE_SPINLOCK(loop_devices_lock);

/*
 * Transfer functions
@@ -183,7 +182,7 @@ figure_loop_size(struct loop_device *lo)
if (unlikely((loff_t)x != size))
return -EFBIG;

-   set_capacity(disks[lo->lo_number], x);
+   set_capacity(lo->lo_disk, x);
return 0;   
}

@@ -812,7 +811,7 @@ static int loop_set_fd(struct loop_devic
lo->lo_queue->queuedata = lo;
lo->lo_queue->unplug_fn = loop_unplug;

-   set_capacity(disks[lo->lo_number], size);
+   set_capacity(lo->lo_disk, size);
bd_set_size(bdev, size << 9);

set_blocksize(bdev, lo_blocksize);
@@ -832,7 +831,7 @@ out_clr:
lo->lo_device = NULL;
lo->lo_backing_file = NULL;
lo->lo_flags = 0;
-   set_capacity(disks[lo->lo_number], 0);
+   set_capacity(lo->lo_disk, 0);
invalidate_bdev(bdev, 0);
bd_set_size(bdev, 0);
mapping_set_gfp_mask(mapping, lo->old_gfp_mask);
@@ -918,7 +917,7 @@ static int loop_clr_fd(struct loop_devic
memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
memset(lo->lo_file_name, 0, LO_NAME_SIZE);
invalidate_bdev(bdev, 0);
-   set_capacity(disks[lo->lo_number], 0);
+   set_capacity(lo->lo_disk, 0);
bd_set_size(bdev, 0);
mapping_set_gfp_mask(filp->f_mapping, gfp);
lo->lo_state = Lo_unbound;
@@ -1322,6 +1321,23 @@ static long lo_compat_ioctl(struct file
}
#endif

+static struct loop_device *loop_find_dev(int number)
+{
+   struct loop_device *lo;
+   int found = 0;
+   
+   spin_lock(&loop_devices_lock);
+   list_for_each_entry(lo, &loop_devices, lo_list) {
+   if (lo->lo_number == number) {
+   found = 1;
+   break;
+   }
+   }
+   spin_unlock(&loop_devices_lock);
+   return found ? lo : NULL;
+}
+
+static struct loop_device *loop_init_one(int i);
static int lo_open(struct inode *inode, struct file *file)
{
struct loop_device *lo = inode->i_bdev->bd_disk->private_data;
@@ -1330,6 +1346,9 @@ static int lo_open(struct inode *inode,
lo->lo_refcnt++;
mutex_unlock(&lo->lo_ctl_mutex);

+   if (!loop_find_dev(lo->lo_number + 1))
+   loop_init_one(lo->lo_number + 1);
+
return 0;
}

@@ -1357,8 +1376,6 @@ #endif
/*
 * And now the modules code and kernel interface.
 */
-module_param(max_loop, int, 0);
-MODULE_PARM_DESC(max_loop, "Maximum number of loop devices (1-256)");
MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);

@@ -1383,7 +1400,7 @@ int loop_unregister_transfer(int number)

xfer_funcs[n] = NULL;

-   for (lo = &loop_dev[0]; lo < &loop_dev[max_loop]; lo++) {
+   list_for_each_entry(lo, &loop_devices, lo_list) {
mutex_lock(&lo->lo_ctl_mutex);

if (lo->lo_encryption == xfer)
@@ -1398,102 +1415,104 @@ int loop_unregister_transfer(int number)
EXPORT_SYMBOL(loop_register_transfer);
EXPORT_SYMBOL(loop_unregister_transfer);

-static int __init loop_init(void)
+static struct loop_device *loop_init_one(int i)
{
-   int i;
+   struct loop_device *lo;
+   struct gendisk *disk;

-   if (max_loop < 1 || max_loop > 256) {
-   printk(KERN_WARNING "loop: invalid max_loop (must be between"
-   " 1 and 256), using default (8)\n");
-   max_loop = 8;
-   }
+   lo = kzalloc(sizeof(*lo), GFP_KERNEL);
+   if (!lo)
+   goto out;

-   if (register_blkdev(LOOP_MAJOR, "loop"))
-


Re: [patch] cache pipe buf page address for non-highmem arch

2007-03-27 Thread Ken Chen

On 27 Mar 2007 17:01:01 +0200, Andi Kleen <[EMAIL PROTECTED]> wrote:

What is the problem? You have cache misses on the hash table
or are the instructions really an issue on a modern CPU?


It was page_to_pfn() on a NUMA system.  I think it's cache misses on the
per-node pgdat lookup, with node index calculation for each pipe buffer
page.
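
Conceptually (a sketch, not the exact source of that era), the per-page
work on a NUMA/DISCONTIGMEM build looks like:

    /* sketch of page_to_pfn() when mem_map is split per node */
    int nid = page_to_nid(page);                  /* decoded from page->flags */
    struct pglist_data *pgdat = NODE_DATA(nid);   /* per-node pgdat lookup */
    unsigned long pfn = (page - pgdat->node_mem_map) + pgdat->node_start_pfn;

The pgdat dereferences on every pipe buffer page are what show up as
cache misses in the profile.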



e.g. i out of lined virt_to_page to save space, but it could probably be
inlined again if it was severely time critical.


That would be a problem too, though not showing up in the pipe code path.


Re: [patch] cache pipe buf page address for non-highmem arch

2007-03-27 Thread Ken Chen

On 3/26/07, Andrew Morton <[EMAIL PROTECTED]> wrote:

LTP's vmsplice01 triggers the below:

Unable to handle kernel NULL pointer dereference at 0130 RIP:
 [] pipe_to_file+0x1f3/0x2a6


Ouch, generic_pipe_buf_map() and unmap are used by both pipe and
splice; I'd better constrain all changes to fs/pipe.c, because from the
splice path page->private is not initialized.

Double ouch for missing a test of that code path.  Here is an updated
patch, tested with LTP and FIO.  I will write more test cases to make
sure all code paths are covered.


diff --git a/fs/pipe.c b/fs/pipe.c
index ebafde7..4b55452 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -21,6 +21,21 @@
#include <asm/uaccess.h>
#include <asm/ioctls.h>

+#ifdef CONFIG_HIGHMEM
+#define pipe_kmap  kmap
+#define pipe_kmap_atomic   kmap_atomic
+#define pipe_kunmapkunmap
+#define pipe_kunmap_atomic kunmap_atomic
+#else  /* CONFIG_HIGHMEM */
+static inline void *pipe_kmap(struct page *page)
+{
+   return (void *) page->private;
+}
+#define pipe_kmap_atomic(page, type)   pipe_kmap(page)
+#define pipe_kunmap(page)  do { } while (0)
+#define pipe_kunmap_atomic(page, type) do { } while (0)
+#endif
+
/*
 * We use a start+len construction, which provides full use of the
 * allocated memory.
@@ -185,6 +200,27 @@ void generic_pipe_buf_unmap
kunmap(buf->page);
}

+static void *pipe_buf_map(struct pipe_inode_info *pipe,
+  struct pipe_buffer *buf, int atomic)
+{
+   if (atomic) {
+   buf->flags |= PIPE_BUF_FLAG_ATOMIC;
+   return pipe_kmap_atomic(buf->page, KM_USER0);
+   }
+
+   return pipe_kmap(buf->page);
+}
+
+static void pipe_buf_unmap(struct pipe_inode_info *pipe,
+   struct pipe_buffer *buf, void *map_data)
+{
+   if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
+   buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
+   pipe_kunmap_atomic(map_data, KM_USER0);
+   } else
+   pipe_kunmap(buf->page);
+}
+
int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
   struct pipe_buffer *buf)
{
@@ -210,8 +246,8 @@ int generic_pipe_buf_pin

static const struct pipe_buf_operations anon_pipe_buf_ops = {
.can_merge = 1,
-   .map = generic_pipe_buf_map,
-   .unmap = generic_pipe_buf_unmap,
+   .map = pipe_buf_map,
+   .unmap = pipe_buf_unmap,
.pin = generic_pipe_buf_pin,
.release = anon_pipe_buf_release,
.steal = generic_pipe_buf_steal,
@@ -316,6 +352,7 @@ redo:
if (do_wakeup) {
wake_up_interruptible_sync(&pipe->wait);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
+   do_wakeup = 0;
}
pipe_wait(pipe);
}
@@ -423,6 +460,8 @@ redo1:
ret = ret ? : -ENOMEM;
break;
}
+   page->private = (unsigned long)
+   page_address(page);
pipe->tmp_page = page;
}
/* Always wake up, even if the copy fails. Otherwise
@@ -438,16 +477,16 @@ redo1:
iov_fault_in_pages_read(iov, chars);
redo2:
if (atomic)
-   src = kmap_atomic(page, KM_USER0);
+   src = pipe_kmap_atomic(page, KM_USER0);
else
-   src = kmap(page);
+   src = pipe_kmap(page);

error = pipe_iov_copy_from_user(src, iov, chars,
atomic);
if (atomic)
-   kunmap_atomic(src, KM_USER0);
+   pipe_kunmap_atomic(src, KM_USER0);
else
-   kunmap(page);
+   pipe_kunmap(page);

if (unlikely(error)) {
if (atomic) {


Re: [PATCH] max_loop limit

2007-03-24 Thread Ken Chen

On 3/23/07, Jan Engelhardt <[EMAIL PROTECTED]> wrote:

@@ -1383,7 +1380,7 @@ int loop_unregister_transfer(int number)

xfer_funcs[n] = NULL;

-   for (lo = &loop_dev[0]; lo < &loop_dev[max_loop]; lo++) {
+   list_for_each_entry(lo, &loop_devices, lo_list) {
mutex_lock(&lo->lo_ctl_mutex);


Don't you need to use loop_devices_lock to protect the linked list here?



+static struct loop_device *loop_find_dev(unsigned int number)
+{
+   struct loop_device *lo;
+   list_for_each_entry(lo, &loop_devices, lo_list)
+   if (lo->lo_number == number)
+   return lo;
+   return NULL;


Here too with spin lock??


Re: [PATCH] max_loop limit

2007-03-24 Thread Ken Chen

On 3/23/07, Jan Engelhardt <[EMAIL PROTECTED]> wrote:

Sadly, it locks up the foreground process (losetup that would be), and I
have not yet figured out why. And the mpt regression elsewhere is
hindering me in finding out faster.


You need to tell the block layer that each loop device is a whole
block device, not a partition within another device. Otherwise, I
think it will cause a recursive mutex lock in block_dev.c:do_open().
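
Roughly what goes wrong without that (paraphrasing the do_open() flow of
that era; a sketch, not the exact source):

    /* fs/block_dev.c:do_open(), simplified: if the probe reported
     * *part != 0, the node is treated as a partition and the whole
     * disk is opened first -- which re-enters the loop probe path
     * and its locking. */
    struct block_device *whole = bdget_disk(disk, 0);
    mutex_lock_nested(&whole->bd_mutex, 1);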

This patch should fix the problem.

Signed-off-by: Ken Chen <[EMAIL PROTECTED]>

--- ./drivers/block/loop.c.orig 2007-03-24 17:05:51.0 -0700
+++ ./drivers/block/loop.c  2007-03-24 17:06:06.0 -0700
@@ -1464,6 +1464,7 @@

if ((lo = loop_find_dev(number)) == NULL) {
lo = loop_init_one(number);
+   *part = 0;
if (IS_ERR(lo))
return (void *)lo;
}


Re: [patch] rfc: introduce /dev/hugetlb

2007-03-24 Thread Ken Chen

On 3/23/07, Andrew Morton <[EMAIL PROTECTED]> wrote:

a) Ken observes that obtaining private hugetlb memory via hugetlbfs
   involves "fuss".

b) the libhugetlbfs maintainers then go off and implement a no-fuss way of
   doing this.


Hmm, what started this thread was the libhugetlbfs maintainer complaining
about how much "fuss" it was to create a private hugetlb mapping, and
suggesting an even bigger kernel change with a pagetable_operations API.
The new API was designed with the end goal of introducing /dev/hugetlb (as
one of the features; they might be thinking of more).  What motivated me
here is to point out that we can achieve the same goal of having a
/dev/hugetlb with the existing hugetlbfs infrastructure, and the
implementation is relatively straightforward.  What it also buys us is a
bit more flexibility for the end user who wants to use the interface
directly.


Re: [patch] cache pipe buf page address for non-highmem arch

2007-03-23 Thread Ken Chen

On 3/23/07, Christoph Hellwig <[EMAIL PROTECTED]> wrote:

I think you're fixing the symptom here and not the cause.  If calculating
the virtual address of a page is so expensive on your setup it should
define WANT_PAGE_VIRTUAL and we should always cache the virtual address
in struct page.  There's a lot more code, especially in filesystems, that's
rather upset about a slow page_address.


Adding WANT_PAGE_VIRTUAL would be too costly, I think, as it expands
sizeof(struct page).  I would rather fix these issues incrementally
than blindly increase the size of struct page.
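
For reference, what WANT_PAGE_VIRTUAL adds (paraphrasing the mm.h of that
era):

    struct page {
            ...
    #if defined(WANT_PAGE_VIRTUAL)
            void *virtual;  /* kernel virtual address, one pointer per page */
    #endif
    };

That extra pointer is paid for every struct page in the system, which is
the size cost referred to above.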


Re: [patch] cache pipe buf page address for non-highmem arch

2007-03-23 Thread Ken Chen

On 3/22/07, Andrew Morton <[EMAIL PROTECTED]> wrote:

If we're going to do this then we should also implement pipe_kunmap_atomic().
Relying upon kunmap_atomic() internals like this is weird-looking, and is 
fragile
against future changes to kunmap_atomic().


OK, I added the matching pipe_kunmap variants.  Now that we have the
matching pair, I made a further optimization to make pipe_kmap and
pipe_kmap_atomic the same, because pipe_kmap does not hold any critical
resource that would forbid sleeping.  There is no reason to make them
different.

Full patch and changelog:


It is really sad that we always call kmap and friends for every pipe
buffer page on a 64-bit arch that doesn't use HIGHMEM, or on a
configuration that doesn't turn on HIGHMEM.

The effect of calling kmap* is visible in the execution profile when
the pipe code is being stressed.  It is especially true on AMD's x86-64
platform, where kmap() has to traverse through NUMA node index
calculation in order to convert a struct page * to a kernel virtual
address.  It is fairly pointless to perform that calculation repeatedly
on a system with no highmem (i.e., a 64-bit arch like x86-64).  This
patch caches the kernel pipe buffer page's kernel vaddr to speed up the
pipe buffer mapping functions.

There is another suboptimal block in pipe_read() where wake_up is
called twice.  I think it was an oversight, since pipe_write() looks
like it is doing the right thing.

Signed-off-by: Ken Chen <[EMAIL PROTECTED]>

diff --git a/fs/pipe.c b/fs/pipe.c
index ebafde7..4cd1d68 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -21,6 +21,21 @@
#include <asm/uaccess.h>
#include <asm/ioctls.h>

+#ifdef CONFIG_HIGHMEM
+#define pipe_kmap  kmap
+#define pipe_kmap_atomic   kmap_atomic
+#define pipe_kunmapkunmap
+#define pipe_kunmap_atomic kunmap_atomic
+#else  /* CONFIG_HIGHMEM */
+static inline void *pipe_kmap(struct page *page)
+{
+   return (void *) page->private;
+}
+#define pipe_kmap_atomic(page, type)   pipe_kmap(page)
+#define pipe_kunmap(page)  do { } while (0)
+#define pipe_kunmap_atomic(page, type) do { } while (0)
+#endif
+
/*
 * We use a start+len construction, which provides full use of the
 * allocated memory.
@@ -169,10 +184,10 @@ void *generic_pipe_buf_map(struct pipe_i
{
if (atomic) {
buf->flags |= PIPE_BUF_FLAG_ATOMIC;
-   return kmap_atomic(buf->page, KM_USER0);
+   return pipe_kmap_atomic(buf->page, KM_USER0);
}

-   return kmap(buf->page);
+   return pipe_kmap(buf->page);
}

void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
@@ -180,9 +195,9 @@ void generic_pipe_buf_unmap(struct pipe_
{
if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
-   kunmap_atomic(map_data, KM_USER0);
+   pipe_kunmap_atomic(map_data, KM_USER0);
} else
-   kunmap(buf->page);
+   pipe_kunmap(buf->page);
}

int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
@@ -316,6 +331,7 @@ redo:
if (do_wakeup) {
wake_up_interruptible_sync(&pipe->wait);
kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
+   do_wakeup = 0;
}
pipe_wait(pipe);
}
@@ -423,6 +439,8 @@ redo1:
ret = ret ? : -ENOMEM;
break;
}
+   page->private = (unsigned long)
+   page_address(page);
pipe->tmp_page = page;
}
/* Always wake up, even if the copy fails. Otherwise
@@ -438,16 +456,16 @@ redo1:
iov_fault_in_pages_read(iov, chars);
redo2:
if (atomic)
-   src = kmap_atomic(page, KM_USER0);
+   src = pipe_kmap_atomic(page, KM_USER0);
else
-   src = kmap(page);
+   src = pipe_kmap(page);

error = pipe_iov_copy_from_user(src, iov, chars,
atomic);
if (atomic)
-   kunmap_atomic(src, KM_USER0);
+   pipe_kunmap_atomic(src, KM_USER0);
else
-   kunmap(page);
+   pipe_kunmap(page);

if (unlikely(error)) {
if (atomic) {


Re: [patch 1/2] hugetlb: add resv argument to hugetlb_file_setup

2007-03-23 Thread Ken Chen

On 3/23/07, Nish Aravamudan <[EMAIL PROTECTED]> wrote:

Comment needs updating too.


Thanks.  How could I miss that :-(

updated patch:


diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8c718a3..981886f 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -734,7 +734,7 @@ static int can_do_hugetlb_shm(void)
can_do_mlock());
}

-struct file *hugetlb_zero_setup(size_t size)
+struct file *hugetlb_file_setup(size_t size, int resv)
{
int error = -ENOMEM;
struct file *file;
@@ -771,7 +771,7 @@ struct file *hugetlb_zero_setup(size_t s
goto out_file;

error = -ENOMEM;
-   if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
+   if (resv && hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
goto out_inode;

d_instantiate(dentry, inode);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3f3e7a6..55cccd8 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -163,7 +163,7 @@ static inline struct hugetlbfs_sb_info *

extern const struct file_operations hugetlbfs_file_operations;
extern struct vm_operations_struct hugetlb_vm_ops;
-struct file *hugetlb_zero_setup(size_t);
+struct file *hugetlb_file_setup(size_t, int);
int hugetlb_get_quota(struct address_space *mapping);
void hugetlb_put_quota(struct address_space *mapping);

@@ -185,7 +185,7 @@ #else /* !CONFIG_HUGETLBFS */

#define is_file_hugepages(file) 0
#define set_file_hugepages(file)BUG()
-#define hugetlb_zero_setup(size)   ERR_PTR(-ENOSYS)
+#define hugetlb_file_setup(size, resv) ERR_PTR(-ENOSYS)

#endif /* !CONFIG_HUGETLBFS */

diff --git a/ipc/shm.c b/ipc/shm.c
index 4fefbad..81c8344 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -365,8 +365,8 @@ static int newseg (struct ipc_namespace
}

if (shmflg & SHM_HUGETLB) {
-   /* hugetlb_zero_setup takes care of mlock user accounting */
-   file = hugetlb_zero_setup(size);
+   /* hugetlb_file_setup takes care of mlock user accounting */
+   file = hugetlb_file_setup(size, 1);
shp->mlock_user = current->user;
} else {
int acctflag = VM_ACCOUNT;


[patch 2/2] hugetlb: add /dev/hugetlb char device

2007-03-23 Thread Ken Chen

add a char device /dev/hugetlb that behaves similarly to /dev/zero,
built on top of an internal hugetlbfs mount.

Signed-off-by: Ken Chen <[EMAIL PROTECTED]>
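
Userspace maps it like /dev/zero; a minimal sketch (the full test
program appears later in this archive):

    fd = open("/dev/hugetlb", O_RDWR);
    addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);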


diff -u b/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
--- b/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -795,6 +795,23 @@
return ERR_PTR(error);
}

+int hugetlb_zero_setup(struct file *file, struct vm_area_struct *vma)
+{
+   file = hugetlb_file_setup(vma->vm_end - vma->vm_start, 0);
+   if (IS_ERR(file))
+   return PTR_ERR(file);
+
+   if (vma->vm_file)
+   fput(vma->vm_file);
+   vma->vm_file = file;
+   return hugetlbfs_file_mmap(file, vma);
+}
+
+const struct file_operations hugetlb_dev_fops = {
+   .mmap   = hugetlb_zero_setup,
+   .get_unmapped_area  = hugetlb_get_unmapped_area,
+};
+
static int __init init_hugetlbfs_fs(void)
{
int error;
diff -u b/include/linux/hugetlb.h b/include/linux/hugetlb.h
--- b/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -162,6 +162,7 @@
}

extern const struct file_operations hugetlbfs_file_operations;
+extern const struct file_operations hugetlb_dev_fops;
extern struct vm_operations_struct hugetlb_vm_ops;
struct file *hugetlb_file_setup(size_t, int);
int hugetlb_get_quota(struct address_space *mapping);
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -27,6 +27,7 @@
#include 
#include 
#include 
+#include <linux/hugetlb.h>

#include <asm/uaccess.h>
#include <asm/io.h>
@@ -939,6 +940,12 @@ #ifdef CONFIG_CRASH_DUMP
filp->f_op = &oldmem_fops;
break;
#endif
+#ifdef CONFIG_HUGETLBFS
+   case 13:
+   printk("open hugetlb dev device\n");
filp->f_op = &hugetlb_dev_fops;
+   break;
+#endif
default:
return -ENXIO;
}
@@ -971,6 +978,9 @@ #endif
#ifdef CONFIG_CRASH_DUMP
{12,"oldmem",S_IRUSR | S_IWUSR | S_IRGRP, _fops},
#endif
+#ifdef CONFIG_HUGETLBFS
+   {13, "hugetlb",S_IRUGO | S_IWUGO, _dev_fops},
+#endif
};

static struct class *mem_class;


[patch 1/2] hugetlb: add resv argument to hugetlb_file_setup

2007-03-23 Thread Ken Chen

rename hugetlb_zero_setup() to hugetlb_file_setup() to better match the
function naming convention of the shmem implementation.  Also add an
argument to the function to indicate whether file setup should reserve
hugetlb pages upfront or not.

Signed-off-by: Ken Chen <[EMAIL PROTECTED]>
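
The call sites in this series then choose the reservation policy
explicitly:

    file = hugetlb_file_setup(size, 1);  /* SysV SHM_HUGETLB: reserve upfront */
    file = hugetlb_file_setup(size, 0);  /* /dev/hugetlb (patch 2/2): no upfront reservation */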


diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8c718a3..981886f 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -734,7 +734,7 @@ static int can_do_hugetlb_shm(void)
can_do_mlock());
}

-struct file *hugetlb_zero_setup(size_t size)
+struct file *hugetlb_file_setup(size_t size, int resv)
{
int error = -ENOMEM;
struct file *file;
@@ -771,7 +771,7 @@ struct file *hugetlb_zero_setup(size_t s
goto out_file;

error = -ENOMEM;
-   if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
+   if (resv && hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
goto out_inode;

d_instantiate(dentry, inode);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 3f3e7a6..55cccd8 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -163,7 +163,7 @@ static inline struct hugetlbfs_sb_info *

extern const struct file_operations hugetlbfs_file_operations;
extern struct vm_operations_struct hugetlb_vm_ops;
-struct file *hugetlb_zero_setup(size_t);
+struct file *hugetlb_file_setup(size_t, int);
int hugetlb_get_quota(struct address_space *mapping);
void hugetlb_put_quota(struct address_space *mapping);

@@ -185,7 +185,7 @@ #else /* !CONFIG_HUGETLBFS */

#define is_file_hugepages(file) 0
#define set_file_hugepages(file)BUG()
-#define hugetlb_zero_setup(size)   ERR_PTR(-ENOSYS)
+#define hugetlb_file_setup(size, resv) ERR_PTR(-ENOSYS)

#endif /* !CONFIG_HUGETLBFS */

diff --git a/ipc/shm.c b/ipc/shm.c
index 4fefbad..c64643f 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -366,7 +366,7 @@ static int newseg (struct ipc_namespace

if (shmflg & SHM_HUGETLB) {
/* hugetlb_zero_setup takes care of mlock user accounting */
-   file = hugetlb_zero_setup(size);
+   file = hugetlb_file_setup(size, 1);
shp->mlock_user = current->user;
} else {
int acctflag = VM_ACCOUNT;


[patch 0/2] introduce /dev/hugetlb char device

2007-03-23 Thread Ken Chen

Introduce a /dev/hugetlb device that behaves similarly to /dev/zero for
allocating anonymous hugetlb pages.  It is especially beneficial that
application developers get an easy way to create MAP_PRIVATE
hugetlb mappings without all the fuss of the hugetlbfs filesystem.

The two follow-on patches have more detailed descriptions of each changeset.

Signed-off-by: Ken Chen <[EMAIL PROTECTED]>


Re: [patch] rfc: introduce /dev/hugetlb

2007-03-23 Thread Ken Chen

On 3/23/07, William Lee Irwin III <[EMAIL PROTECTED]> wrote:

On Fri, 23 Mar 2007, William Lee Irwin III wrote:
>> Lack of compiletesting beyond x86-64 in all probability.

On Fri, Mar 23, 2007 at 03:15:55PM +, Mel Gorman wrote:
> Ok, this will go kablamo on Power then even if it compiles. I don't
> consider it a fundamental problem though. For the purposes of an RFC, it's
> grand and something that can be worked with.

He needs to un-#ifdef the prototype (which he already does), but he
needs to leave the definition under #ifdef while removing the static
qualifier. A relatively minor fixup.


Yes, sorry about that; it's for lack of access to non-x86-64 machines.  I
needed to move the function prototype to hugetlb.h and evidently
removed the #ifdef by mistake.  I'm not going to touch this in my next
clean-up patch; instead I will just declare a char-specific
file_operations struct in hugetlbfs and then have the char device
reference it.

But nevertheless, the hugetlb_get_unmapped_area function prototype had
better be in a header file somewhere.
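
A sketch of the declaration that would go there (hypothetical placement;
the signature matches the get_unmapped_area file operation):

    /* include/linux/hugetlb.h, sketch */
    unsigned long hugetlb_get_unmapped_area(struct file *file,
                    unsigned long addr, unsigned long len,
                    unsigned long pgoff, unsigned long flags);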

- Ken


Re: [patch] rfc: introduce /dev/hugetlb

2007-03-23 Thread Ken Chen

On 3/23/07, William Lee Irwin III <[EMAIL PROTECTED]> wrote:

I like this patch a lot, though I'm not likely to get around to testing
it today. If userspace testcode is available that would be great to see
posted so I can just boot into things and run that.


Here is the test code that I used:
(warning: x86 centric)

#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <sys/mman.h>

#define SIZE    (4*1024*1024UL)

int main(void)
{
int fd;
long i;
char *addr;

fd = open("/dev/hugetlb", O_RDWR);
if (fd == -1) {
perror("open failure");
exit(1);
}

addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
if (addr == MAP_FAILED) {
perror("mmap failure");
exit(2);
}

for (i = 0; i < SIZE; i+=4096)
addr[i] = 1;

printf("success!\n");
return 0;
}
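
To exercise it, reserve hugepages first (e.g. echo 20 >
/proc/sys/vm/nr_hugepages), build with gcc, and run; it prints
"success!" once the MAP_PRIVATE hugetlb mapping faults in.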
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


  1   2   >