[PATCH linux dev-4.16 v2] i2c: muxes: pca9641: new driver
Signed-off-by: Ken Chen <chen.ke...@inventec.com> --- v1->v2 - Merged PCA9641 code into i2c-mux-pca9541.c - Modified title - Add PCA9641 detect function --- drivers/i2c/muxes/i2c-mux-pca9541.c | 184 ++-- 1 file changed, 174 insertions(+), 10 deletions(-) diff --git a/drivers/i2c/muxes/i2c-mux-pca9541.c b/drivers/i2c/muxes/i2c-mux-pca9541.c index 6a39ada..493f947 100644 --- a/drivers/i2c/muxes/i2c-mux-pca9541.c +++ b/drivers/i2c/muxes/i2c-mux-pca9541.c @@ -1,5 +1,5 @@ /* - * I2C multiplexer driver for PCA9541 bus master selector + * I2C multiplexer driver for PCA9541/PCA9641 bus master selector * * Copyright (c) 2010 Ericsson AB. * @@ -26,8 +26,8 @@ #include /* - * The PCA9541 is a bus master selector. It supports two I2C masters connected - * to a single slave bus. + * The PCA9541/PCA9641 is a bus master selector. It supports two I2C masters + * connected to a single slave bus. * * Before each bus transaction, a master has to acquire bus ownership. After the * transaction is complete, bus ownership has to be released. 
This fits well @@ -58,11 +58,43 @@ #define PCA9541_ISTAT_MYTEST (1 << 6) #define PCA9541_ISTAT_NMYTEST (1 << 7) +#define PCA9641_ID 0x00 +#define PCA9641_ID_MAGIC 0x38 + +#define PCA9641_CONTROL0x01 +#define PCA9641_STATUS 0x02 +#define PCA9641_TIME 0x03 + +#define PCA9641_CTL_LOCK_REQ BIT(0) +#define PCA9641_CTL_LOCK_GRANT BIT(1) +#define PCA9641_CTL_BUS_CONNECTBIT(2) +#define PCA9641_CTL_BUS_INIT BIT(3) +#define PCA9641_CTL_SMBUS_SWRSTBIT(4) +#define PCA9641_CTL_IDLE_TIMER_DIS BIT(5) +#define PCA9641_CTL_SMBUS_DIS BIT(6) +#define PCA9641_CTL_PRIORITY BIT(7) + +#define PCA9641_STS_OTHER_LOCK BIT(0) +#define PCA9641_STS_BUS_INIT_FAIL BIT(1) +#define PCA9641_STS_BUS_HUNG BIT(2) +#define PCA9641_STS_MBOX_EMPTY BIT(3) +#define PCA9641_STS_MBOX_FULL BIT(4) +#define PCA9641_STS_TEST_INT BIT(5) +#define PCA9641_STS_SCL_IO BIT(6) +#define PCA9641_STS_SDA_IO BIT(7) + +#define PCA9641_RES_TIME 0x03 + #define BUSON (PCA9541_CTL_BUSON | PCA9541_CTL_NBUSON) #define MYBUS (PCA9541_CTL_MYBUS | PCA9541_CTL_NMYBUS) #define mybus(x) (!((x) & MYBUS) || ((x) & MYBUS) == MYBUS) #define busoff(x) (!((x) & BUSON) || ((x) & BUSON) == BUSON) +#define BUSOFF(x, y) (!((x) & PCA9641_CTL_LOCK_GRANT) && \ + !((y) & PCA9641_STS_OTHER_LOCK)) +#define other_lock(x) ((x) & PCA9641_STS_OTHER_LOCK) +#define lock_grant(x) ((x) & PCA9641_CTL_LOCK_GRANT) + /* arbitration timeouts, in jiffies */ #define ARB_TIMEOUT(HZ / 8)/* 125 ms until forcing bus ownership */ #define ARB2_TIMEOUT (HZ / 4)/* 250 ms until acquisition failure */ @@ -79,6 +111,7 @@ struct pca9541 { static const struct i2c_device_id pca9541_id[] = { {"pca9541", 0}, + {"pca9641", 1}, {} }; @@ -87,6 +120,7 @@ MODULE_DEVICE_TABLE(i2c, pca9541_id); #ifdef CONFIG_OF static const struct of_device_id pca9541_of_match[] = { { .compatible = "nxp,pca9541" }, + { .compatible = "nxp,pca9641" }, {} }; MODULE_DEVICE_TABLE(of, pca9541_of_match); @@ -328,6 +362,125 @@ static int pca9541_release_chan(struct i2c_mux_core *muxc, u32 chan) } /* + * 
Arbitration management functions + */ +static void pca9641_release_bus(struct i2c_client *client) +{ + pca9541_reg_write(client, PCA9641_CONTROL, 0); +} + +/* + * Channel arbitration + * + * Return values: + * <0: error + * 0 : bus not acquired + * 1 : bus acquired + */ +static int pca9641_arbitrate(struct i2c_client *client) +{ + struct i2c_mux_core *muxc = i2c_get_clientdata(client); + struct pca9541 *data = i2c_mux_priv(muxc); + int reg_ctl, reg_sts; + + reg_ctl = pca9541_reg_read(client, PCA9641_CONTROL); + if (reg_ctl < 0) + return reg_ctl; + reg_sts = pca9541_reg_read(client, PCA9641_STATUS); + + if (BUSOFF(reg_ctl, reg_sts)) { + /* +* Bus is off. Request ownership or turn it on unless +* other master requested ownership. +*/ + reg_ctl |= PCA9641_CTL_LOCK_REQ; + pca9541_reg_write(client, PCA9641_CONTROL, reg_ctl); + reg_ctl = pca9541_reg_read(client, PCA9641_CONTROL); + + if (lock_grant(reg_ctl)) { + /* +* Other master did not request ownership, +* or arbitration timeout expired. Take the bus. +*/ + reg_ctl |= PCA9641_CTL_BUS_CONNECT + | PCA9641_CTL_LOCK_REQ; +
[PATCH linux dev-4.16 v2] i2c: muxes: pca9641: new driver
Signed-off-by: Ken Chen --- v1->v2 - Merged PCA9641 code into i2c-mux-pca9541.c - Modified title - Add PCA9641 detect function --- drivers/i2c/muxes/i2c-mux-pca9541.c | 184 ++-- 1 file changed, 174 insertions(+), 10 deletions(-) diff --git a/drivers/i2c/muxes/i2c-mux-pca9541.c b/drivers/i2c/muxes/i2c-mux-pca9541.c index 6a39ada..493f947 100644 --- a/drivers/i2c/muxes/i2c-mux-pca9541.c +++ b/drivers/i2c/muxes/i2c-mux-pca9541.c @@ -1,5 +1,5 @@ /* - * I2C multiplexer driver for PCA9541 bus master selector + * I2C multiplexer driver for PCA9541/PCA9641 bus master selector * * Copyright (c) 2010 Ericsson AB. * @@ -26,8 +26,8 @@ #include /* - * The PCA9541 is a bus master selector. It supports two I2C masters connected - * to a single slave bus. + * The PCA9541/PCA9641 is a bus master selector. It supports two I2C masters + * connected to a single slave bus. * * Before each bus transaction, a master has to acquire bus ownership. After the * transaction is complete, bus ownership has to be released. 
This fits well @@ -58,11 +58,43 @@ #define PCA9541_ISTAT_MYTEST (1 << 6) #define PCA9541_ISTAT_NMYTEST (1 << 7) +#define PCA9641_ID 0x00 +#define PCA9641_ID_MAGIC 0x38 + +#define PCA9641_CONTROL0x01 +#define PCA9641_STATUS 0x02 +#define PCA9641_TIME 0x03 + +#define PCA9641_CTL_LOCK_REQ BIT(0) +#define PCA9641_CTL_LOCK_GRANT BIT(1) +#define PCA9641_CTL_BUS_CONNECTBIT(2) +#define PCA9641_CTL_BUS_INIT BIT(3) +#define PCA9641_CTL_SMBUS_SWRSTBIT(4) +#define PCA9641_CTL_IDLE_TIMER_DIS BIT(5) +#define PCA9641_CTL_SMBUS_DIS BIT(6) +#define PCA9641_CTL_PRIORITY BIT(7) + +#define PCA9641_STS_OTHER_LOCK BIT(0) +#define PCA9641_STS_BUS_INIT_FAIL BIT(1) +#define PCA9641_STS_BUS_HUNG BIT(2) +#define PCA9641_STS_MBOX_EMPTY BIT(3) +#define PCA9641_STS_MBOX_FULL BIT(4) +#define PCA9641_STS_TEST_INT BIT(5) +#define PCA9641_STS_SCL_IO BIT(6) +#define PCA9641_STS_SDA_IO BIT(7) + +#define PCA9641_RES_TIME 0x03 + #define BUSON (PCA9541_CTL_BUSON | PCA9541_CTL_NBUSON) #define MYBUS (PCA9541_CTL_MYBUS | PCA9541_CTL_NMYBUS) #define mybus(x) (!((x) & MYBUS) || ((x) & MYBUS) == MYBUS) #define busoff(x) (!((x) & BUSON) || ((x) & BUSON) == BUSON) +#define BUSOFF(x, y) (!((x) & PCA9641_CTL_LOCK_GRANT) && \ + !((y) & PCA9641_STS_OTHER_LOCK)) +#define other_lock(x) ((x) & PCA9641_STS_OTHER_LOCK) +#define lock_grant(x) ((x) & PCA9641_CTL_LOCK_GRANT) + /* arbitration timeouts, in jiffies */ #define ARB_TIMEOUT(HZ / 8)/* 125 ms until forcing bus ownership */ #define ARB2_TIMEOUT (HZ / 4)/* 250 ms until acquisition failure */ @@ -79,6 +111,7 @@ struct pca9541 { static const struct i2c_device_id pca9541_id[] = { {"pca9541", 0}, + {"pca9641", 1}, {} }; @@ -87,6 +120,7 @@ MODULE_DEVICE_TABLE(i2c, pca9541_id); #ifdef CONFIG_OF static const struct of_device_id pca9541_of_match[] = { { .compatible = "nxp,pca9541" }, + { .compatible = "nxp,pca9641" }, {} }; MODULE_DEVICE_TABLE(of, pca9541_of_match); @@ -328,6 +362,125 @@ static int pca9541_release_chan(struct i2c_mux_core *muxc, u32 chan) } /* + * 
Arbitration management functions + */ +static void pca9641_release_bus(struct i2c_client *client) +{ + pca9541_reg_write(client, PCA9641_CONTROL, 0); +} + +/* + * Channel arbitration + * + * Return values: + * <0: error + * 0 : bus not acquired + * 1 : bus acquired + */ +static int pca9641_arbitrate(struct i2c_client *client) +{ + struct i2c_mux_core *muxc = i2c_get_clientdata(client); + struct pca9541 *data = i2c_mux_priv(muxc); + int reg_ctl, reg_sts; + + reg_ctl = pca9541_reg_read(client, PCA9641_CONTROL); + if (reg_ctl < 0) + return reg_ctl; + reg_sts = pca9541_reg_read(client, PCA9641_STATUS); + + if (BUSOFF(reg_ctl, reg_sts)) { + /* +* Bus is off. Request ownership or turn it on unless +* other master requested ownership. +*/ + reg_ctl |= PCA9641_CTL_LOCK_REQ; + pca9541_reg_write(client, PCA9641_CONTROL, reg_ctl); + reg_ctl = pca9541_reg_read(client, PCA9641_CONTROL); + + if (lock_grant(reg_ctl)) { + /* +* Other master did not request ownership, +* or arbitration timeout expired. Take the bus. +*/ + reg_ctl |= PCA9641_CTL_BUS_CONNECT + | PCA9641_CTL_LOCK_REQ; + pca9541_reg_write
[PATCH linux dev-4.16] drivers: i2c: master arbiter: create driver
Initial PCA9641 2 channel I2C bus master arbiter with separate implementation. It has a probe issue and different device behavior between PCA9541 and PCA9641 in the original PCA9641 patch. Signed-off-by: Ken Chen <chen.ke...@inventec.com> --- drivers/i2c/muxes/Kconfig | 9 + drivers/i2c/muxes/Makefile | 1 + drivers/i2c/muxes/i2c-mux-pca9641.c | 360 3 files changed, 370 insertions(+) create mode 100644 drivers/i2c/muxes/i2c-mux-pca9641.c diff --git a/drivers/i2c/muxes/Kconfig b/drivers/i2c/muxes/Kconfig index 52a4a92..f9c51b8 100644 --- a/drivers/i2c/muxes/Kconfig +++ b/drivers/i2c/muxes/Kconfig @@ -73,6 +73,15 @@ config I2C_MUX_PCA954x This driver can also be built as a module. If so, the module will be called i2c-mux-pca954x. +config I2C_MUX_PCA9641 + tristate "NXP PCA9641 I2C Master demultiplexer" + help + If you say yes here you get support for the NXP PCA9641 + I2C Master demultiplexer with an arbiter function. + + This driver can also be built as a module. If so, the module + will be called i2c-mux-pca9641. + config I2C_MUX_PINCTRL tristate "pinctrl-based I2C multiplexer" depends on PINCTRL diff --git a/drivers/i2c/muxes/Makefile b/drivers/i2c/muxes/Makefile index 6d9d865..a229a1a 100644 --- a/drivers/i2c/muxes/Makefile +++ b/drivers/i2c/muxes/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_I2C_MUX_LTC4306) += i2c-mux-ltc4306.o obj-$(CONFIG_I2C_MUX_MLXCPLD) += i2c-mux-mlxcpld.o obj-$(CONFIG_I2C_MUX_PCA9541) += i2c-mux-pca9541.o obj-$(CONFIG_I2C_MUX_PCA954x) += i2c-mux-pca954x.o +obj-$(CONFIG_I2C_MUX_PCA9641) += i2c-mux-pca9641.o obj-$(CONFIG_I2C_MUX_PINCTRL) += i2c-mux-pinctrl.o obj-$(CONFIG_I2C_MUX_REG) += i2c-mux-reg.o diff --git a/drivers/i2c/muxes/i2c-mux-pca9641.c b/drivers/i2c/muxes/i2c-mux-pca9641.c new file mode 100644 index 000..bcf45c8 --- /dev/null +++ b/drivers/i2c/muxes/i2c-mux-pca9641.c @@ -0,0 +1,360 @@ +/* + * I2C demultiplexer driver for PCA9641 bus master demultiplexer + * + * Copyright (c) 2010 Ericsson AB. 
+ * + * Author: Ken Chen <chen.ke...@inventec.com> + * + * Derived from: + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * The PCA9641 is two I2C bus masters demultiplexer. It supports two I2C masters + * connected to a single slave bus. + * + * It is designed for high reliability dual master I2C bus applications where + * correct system operation is required, even when two I2C master issue their + * commands at the same time. The arbiter will select a winner and let it work + * uninterrupted, and the losing master will take control of the I2C bus after + * the winner has finished. The arbiter also allows for queued requests where + * a master requests the downstream bus while the other master has control. + * + */ + +#define PCA9641_ID 0x01 +#define PCA9641_ID_MAGIC 0x38 + +#define PCA9641_CONTROL0x01 +#define PCA9641_STATUS 0x02 +#define PCA9641_TIME 0x03 + +#define PCA9641_CTL_LOCK_REQ BIT(0) +#define PCA9641_CTL_LOCK_GRANT BIT(1) +#define PCA9641_CTL_BUS_CONNECTBIT(2) +#define PCA9641_CTL_BUS_INIT BIT(3) +#define PCA9641_CTL_SMBUS_SWRSTBIT(4) +#define PCA9641_CTL_IDLE_TIMER_DIS BIT(5) +#define PCA9641_CTL_SMBUS_DIS BIT(6) +#define PCA9641_CTL_PRIORITY BIT(7) + +#define PCA9641_STS_OTHER_LOCK BIT(0) +#define PCA9641_STS_BUS_INIT_FAIL BIT(1) +#define PCA9641_STS_BUS_HUNG BIT(2) +#define PCA9641_STS_MBOX_EMPTY BIT(3) +#define PCA9641_STS_MBOX_FULL BIT(4) +#define PCA9641_STS_TEST_INT BIT(5) +#define PCA9641_STS_SCL_IO BIT(6) +#define PCA9641_STS_SDA_IO BIT(7) + +#define PCA9641_RES_TIME 0x03 + +#define busoff(x, y) (!((x) & PCA9641_CTL_LOCK_GRANT) && \ + !((y) & PCA9641_STS_OTHER_LOCK)) +#define other_lock(x) ((x) & PCA9641_STS_OTHER_LOCK) +#define lock_grant(x) ((x) & PCA9641_CTL_LOCK_GRANT) + +/* arbitration timeouts, in 
jiffies */ +#define ARB_TIMEOUT(HZ / 8)/* 125 ms until forcing bus ownership */ +#define ARB2_TIMEOUT (HZ / 4)/* 250 ms until acquisition failure */ + +/* arbitration retry delays, in us */ +#define SELECT_DELAY_SHORT 50 +#define SELECT_DELAY_LONG 1000 + +struct pca9641 { + struct i2c_client *client; + unsigned long select_timeout; + unsigned long arb_timeout; +}; + +static const struct i2c_device_id pca9641_id[] = { + {"pca9641", 0}, + {} +}; + +MODULE_DEVICE_TABLE(i2c, pca9641
[PATCH linux dev-4.16] drivers: i2c: master arbiter: create driver
Initial PCA9641 2 channel I2C bus master arbiter with separate implementation. It has a probe issue and different device behavior between PCA9541 and PCA9641 in the original PCA9641 patch. Signed-off-by: Ken Chen --- drivers/i2c/muxes/Kconfig | 9 + drivers/i2c/muxes/Makefile | 1 + drivers/i2c/muxes/i2c-mux-pca9641.c | 360 3 files changed, 370 insertions(+) create mode 100644 drivers/i2c/muxes/i2c-mux-pca9641.c diff --git a/drivers/i2c/muxes/Kconfig b/drivers/i2c/muxes/Kconfig index 52a4a92..f9c51b8 100644 --- a/drivers/i2c/muxes/Kconfig +++ b/drivers/i2c/muxes/Kconfig @@ -73,6 +73,15 @@ config I2C_MUX_PCA954x This driver can also be built as a module. If so, the module will be called i2c-mux-pca954x. +config I2C_MUX_PCA9641 + tristate "NXP PCA9641 I2C Master demultiplexer" + help + If you say yes here you get support for the NXP PCA9641 + I2C Master demultiplexer with an arbiter function. + + This driver can also be built as a module. If so, the module + will be called i2c-mux-pca9641. + config I2C_MUX_PINCTRL tristate "pinctrl-based I2C multiplexer" depends on PINCTRL diff --git a/drivers/i2c/muxes/Makefile b/drivers/i2c/muxes/Makefile index 6d9d865..a229a1a 100644 --- a/drivers/i2c/muxes/Makefile +++ b/drivers/i2c/muxes/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_I2C_MUX_LTC4306) += i2c-mux-ltc4306.o obj-$(CONFIG_I2C_MUX_MLXCPLD) += i2c-mux-mlxcpld.o obj-$(CONFIG_I2C_MUX_PCA9541) += i2c-mux-pca9541.o obj-$(CONFIG_I2C_MUX_PCA954x) += i2c-mux-pca954x.o +obj-$(CONFIG_I2C_MUX_PCA9641) += i2c-mux-pca9641.o obj-$(CONFIG_I2C_MUX_PINCTRL) += i2c-mux-pinctrl.o obj-$(CONFIG_I2C_MUX_REG) += i2c-mux-reg.o diff --git a/drivers/i2c/muxes/i2c-mux-pca9641.c b/drivers/i2c/muxes/i2c-mux-pca9641.c new file mode 100644 index 000..bcf45c8 --- /dev/null +++ b/drivers/i2c/muxes/i2c-mux-pca9641.c @@ -0,0 +1,360 @@ +/* + * I2C demultiplexer driver for PCA9641 bus master demultiplexer + * + * Copyright (c) 2010 Ericsson AB. 
+ * + * Author: Ken Chen + * + * Derived from: + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * The PCA9641 is two I2C bus masters demultiplexer. It supports two I2C masters + * connected to a single slave bus. + * + * It is designed for high reliability dual master I2C bus applications where + * correct system operation is required, even when two I2C master issue their + * commands at the same time. The arbiter will select a winner and let it work + * uninterrupted, and the losing master will take control of the I2C bus after + * the winner has finished. The arbiter also allows for queued requests where + * a master requests the downstream bus while the other master has control. + * + */ + +#define PCA9641_ID 0x01 +#define PCA9641_ID_MAGIC 0x38 + +#define PCA9641_CONTROL0x01 +#define PCA9641_STATUS 0x02 +#define PCA9641_TIME 0x03 + +#define PCA9641_CTL_LOCK_REQ BIT(0) +#define PCA9641_CTL_LOCK_GRANT BIT(1) +#define PCA9641_CTL_BUS_CONNECTBIT(2) +#define PCA9641_CTL_BUS_INIT BIT(3) +#define PCA9641_CTL_SMBUS_SWRSTBIT(4) +#define PCA9641_CTL_IDLE_TIMER_DIS BIT(5) +#define PCA9641_CTL_SMBUS_DIS BIT(6) +#define PCA9641_CTL_PRIORITY BIT(7) + +#define PCA9641_STS_OTHER_LOCK BIT(0) +#define PCA9641_STS_BUS_INIT_FAIL BIT(1) +#define PCA9641_STS_BUS_HUNG BIT(2) +#define PCA9641_STS_MBOX_EMPTY BIT(3) +#define PCA9641_STS_MBOX_FULL BIT(4) +#define PCA9641_STS_TEST_INT BIT(5) +#define PCA9641_STS_SCL_IO BIT(6) +#define PCA9641_STS_SDA_IO BIT(7) + +#define PCA9641_RES_TIME 0x03 + +#define busoff(x, y) (!((x) & PCA9641_CTL_LOCK_GRANT) && \ + !((y) & PCA9641_STS_OTHER_LOCK)) +#define other_lock(x) ((x) & PCA9641_STS_OTHER_LOCK) +#define lock_grant(x) ((x) & PCA9641_CTL_LOCK_GRANT) + +/* arbitration timeouts, in jiffies */ +#define 
ARB_TIMEOUT(HZ / 8)/* 125 ms until forcing bus ownership */ +#define ARB2_TIMEOUT (HZ / 4)/* 250 ms until acquisition failure */ + +/* arbitration retry delays, in us */ +#define SELECT_DELAY_SHORT 50 +#define SELECT_DELAY_LONG 1000 + +struct pca9641 { + struct i2c_client *client; + unsigned long select_timeout; + unsigned long arb_timeout; +}; + +static const struct i2c_device_id pca9641_id[] = { + {"pca9641", 0}, + {} +}; + +MODULE_DEVICE_TABLE(i2c, pca9641_id); + +#ifdef CONFIG_OF +static const struct of_devic
Re: [PATCH] hugetlbfs: document that sticky mounts are allowed
On Thu, Sep 4, 2014 at 6:20 AM, Kirill Smelkov wrote: > Commit 75897d60 (hugetlb: allow sticky directory mount option) added > support for mounting hugetlbfs with sticky option set, like /tmp is > usually mounted, but forgot to document that. > > Cc: Ken Chen > Signed-off-by: Kirill Smelkov > --- > Documentation/vm/hugetlbpage.txt | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/Documentation/vm/hugetlbpage.txt > b/Documentation/vm/hugetlbpage.txt > index bdd4bb9..b64e0af 100644 > --- a/Documentation/vm/hugetlbpage.txt > +++ b/Documentation/vm/hugetlbpage.txt > @@ -274,7 +274,7 @@ This command mounts a (pseudo) filesystem of type > hugetlbfs on the directory > /mnt/huge. Any files created on /mnt/huge uses huge pages. The uid and gid > options sets the owner and group of the root of the file system. By default > the uid and gid of the current process are taken. The mode option sets the > -mode of root of file system to value & 0777. This value is given in octal. > +mode of root of file system to value & 01777. This value is given in octal. > By default the value 0755 is picked. The size option sets the maximum value > of > memory (huge pages) allowed for that filesystem (/mnt/huge). The size is > rounded down to HPAGE_SIZE. The option nr_inodes sets the maximum number of Acked-by: Ken Chen -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] hugetlbfs: document that sticky mounts are allowed
On Thu, Sep 4, 2014 at 6:20 AM, Kirill Smelkov k...@nexedi.com wrote: Commit 75897d60 (hugetlb: allow sticky directory mount option) added support for mounting hugetlbfs with sticky option set, like /tmp is usually mounted, but forgot to document that. Cc: Ken Chen kenc...@google.com Signed-off-by: Kirill Smelkov k...@nexedi.com --- Documentation/vm/hugetlbpage.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index bdd4bb9..b64e0af 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt @@ -274,7 +274,7 @@ This command mounts a (pseudo) filesystem of type hugetlbfs on the directory /mnt/huge. Any files created on /mnt/huge uses huge pages. The uid and gid options sets the owner and group of the root of the file system. By default the uid and gid of the current process are taken. The mode option sets the -mode of root of file system to value 0777. This value is given in octal. +mode of root of file system to value 01777. This value is given in octal. By default the value 0755 is picked. The size option sets the maximum value of memory (huge pages) allowed for that filesystem (/mnt/huge). The size is rounded down to HPAGE_SIZE. The option nr_inodes sets the maximum number of Acked-by: Ken Chen kenc...@google.com -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] hugetlb: follow_hugetlb_page for write access
On Nov 7, 2007 11:51 AM, Adam Litke <[EMAIL PROTECTED]> wrote: > When calling get_user_pages(), a write flag is passed in by the caller to > indicate if write access is required on the faulted-in pages. Currently, > follow_hugetlb_page() ignores this flag and always faults pages for > read-only access. This can cause data corruption because a device driver > that calls get_user_pages() with write set will not expect COW faults to > occur on the returned pages. > > This patch passes the write flag down to follow_hugetlb_page() and makes > sure hugetlb_fault() is called with the right write_access parameter. > > Signed-off-by: Adam Litke <[EMAIL PROTECTED]> Adam, this looks good. Reviewed-by: Ken Chen <[EMAIL PROTECTED]> - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] hugetlb: follow_hugetlb_page for write access
On Nov 7, 2007 11:51 AM, Adam Litke [EMAIL PROTECTED] wrote: When calling get_user_pages(), a write flag is passed in by the caller to indicate if write access is required on the faulted-in pages. Currently, follow_hugetlb_page() ignores this flag and always faults pages for read-only access. This can cause data corruption because a device driver that calls get_user_pages() with write set will not expect COW faults to occur on the returned pages. This patch passes the write flag down to follow_hugetlb_page() and makes sure hugetlb_fault() is called with the right write_access parameter. Signed-off-by: Adam Litke [EMAIL PROTECTED] Adam, this looks good. Reviewed-by: Ken Chen [EMAIL PROTECTED] - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] sched: schedstat needs a diet
On 10/18/07, Mathieu Desnoyers <[EMAIL PROTECTED]> wrote: > Good question indeed. How large is this memory footprint exactly ? If it > is as small as you say, I suspect that the real issue could be that > these variable are accessed by the scheduler critical paths and > therefore trash the caches. Maybe my wording was ambiguous, I meant to reduce cache line pollution when accessing these schedstat fields. With unsigned long, on x86_64, schedstat consumes 288 bytes for each sched_domain and 128 bytes in struct rq. On a extremely small system that has a couple of CPU sockets with one level of numa node, there will be 704 bytes per CPU for schedstat. Given the sparseness of them, we are probably talking about 11-12 cache line eviction on several heavily used scheduler functions. Reduce cache line pollution is the primary goal, actual memory consumption isn't really a concern. - Ken - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] sched: schedstat needs a diet
On 10/18/07, Mathieu Desnoyers [EMAIL PROTECTED] wrote: Good question indeed. How large is this memory footprint exactly ? If it is as small as you say, I suspect that the real issue could be that these variable are accessed by the scheduler critical paths and therefore trash the caches. Maybe my wording was ambiguous, I meant to reduce cache line pollution when accessing these schedstat fields. With unsigned long, on x86_64, schedstat consumes 288 bytes for each sched_domain and 128 bytes in struct rq. On a extremely small system that has a couple of CPU sockets with one level of numa node, there will be 704 bytes per CPU for schedstat. Given the sparseness of them, we are probably talking about 11-12 cache line eviction on several heavily used scheduler functions. Reduce cache line pollution is the primary goal, actual memory consumption isn't really a concern. - Ken - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] sched: fix improper load balance across sched domain
On 10/16/07, Siddha, Suresh B <[EMAIL PROTECTED]> wrote: > On Tue, Oct 16, 2007 at 12:07:06PM -0700, Ken Chen wrote: > > We recently discovered a nasty performance bug in the kernel CPU load > > balancer where we were hit by 50% performance regression. > > > > When tasks are assigned to a subset of CPUs that span across > > sched_domains (either ccNUMA node or the new multi-core domain) via > > cpu affinity, kernel fails to perform proper load balance at > > these domains, due to several logic in find_busiest_group() miss > > identified busiest sched group within a given domain. This leads to > > inadequate load balance and causes 50% performance hit. > > > > To give you a concrete example, on a dual-core, 2 socket numa system, > > there are 4 logical cpu, organized as: > > oops, this issue can easily happen when cores are not sharing caches. I > think this is what happening on your setup, right? yes, we observed the bad behavior on quad-core system with separate L2 cache as well. - Ken - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] sched: fix improper load balance across sched domain
On 10/16/07, Siddha, Suresh B [EMAIL PROTECTED] wrote: On Tue, Oct 16, 2007 at 12:07:06PM -0700, Ken Chen wrote: We recently discovered a nasty performance bug in the kernel CPU load balancer where we were hit by 50% performance regression. When tasks are assigned to a subset of CPUs that span across sched_domains (either ccNUMA node or the new multi-core domain) via cpu affinity, kernel fails to perform proper load balance at these domains, due to several logic in find_busiest_group() miss identified busiest sched group within a given domain. This leads to inadequate load balance and causes 50% performance hit. To give you a concrete example, on a dual-core, 2 socket numa system, there are 4 logical cpu, organized as: oops, this issue can easily happen when cores are not sharing caches. I think this is what happening on your setup, right? yes, we observed the bad behavior on quad-core system with separate L2 cache as well. - Ken - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[patch] sched: schedstat needs a diet
schedstat is useful in investigating CPU scheduler behavior. Ideally, I think it is beneficial to have it on all the time. However, the cost of turning it on in production system is quite high, largely due to number of events it collects and also due to its large memory footprint. Most of the fields probably don't need to be full 64-bit on 64-bit arch. Rolling over 4 billion events will most like take a long time and user space tool can be made to accommodate that. I'm proposing kernel to cut back most of variable width on 64-bit system. (note, the following patch doesn't affect 32-bit system). Signed-off-by: Ken Chen <[EMAIL PROTECTED]> diff --git a/include/linux/sched.h b/include/linux/sched.h index 592e3a5..311a8bd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -562,7 +562,7 @@ struct sched_info { last_queued; /* when we were last queued to run */ #ifdef CONFIG_SCHEDSTATS /* BKL stats */ - unsigned long bkl_count; + unsigned int bkl_count; #endif }; #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ @@ -698,34 +698,34 @@ struct sched_domain { #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ - unsigned long lb_count[CPU_MAX_IDLE_TYPES]; - unsigned long lb_failed[CPU_MAX_IDLE_TYPES]; - unsigned long lb_balanced[CPU_MAX_IDLE_TYPES]; - unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES]; - unsigned long lb_gained[CPU_MAX_IDLE_TYPES]; - unsigned long lb_hot_gained[CPU_MAX_IDLE_TYPES]; - unsigned long lb_nobusyg[CPU_MAX_IDLE_TYPES]; - unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES]; + unsigned int lb_count[CPU_MAX_IDLE_TYPES]; + unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; + unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES]; + unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; + unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; + unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; + unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES]; /* Active load balancing */ - unsigned long alb_count; - unsigned long 
alb_failed; - unsigned long alb_pushed; + unsigned int alb_count; + unsigned int alb_failed; + unsigned int alb_pushed; /* SD_BALANCE_EXEC stats */ - unsigned long sbe_count; - unsigned long sbe_balanced; - unsigned long sbe_pushed; + unsigned int sbe_count; + unsigned int sbe_balanced; + unsigned int sbe_pushed; /* SD_BALANCE_FORK stats */ - unsigned long sbf_count; - unsigned long sbf_balanced; - unsigned long sbf_pushed; + unsigned int sbf_count; + unsigned int sbf_balanced; + unsigned int sbf_pushed; /* try_to_wake_up() stats */ - unsigned long ttwu_wake_remote; - unsigned long ttwu_move_affine; - unsigned long ttwu_move_balance; + unsigned int ttwu_wake_remote; + unsigned int ttwu_move_affine; + unsigned int ttwu_move_balance; #endif }; diff --git a/kernel/sched.c b/kernel/sched.c index 0da2b26..5e7fce9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -328,22 +328,22 @@ struct rq { struct sched_info rq_sched_info; /* sys_sched_yield() stats */ - unsigned long yld_exp_empty; - unsigned long yld_act_empty; - unsigned long yld_both_empty; - unsigned long yld_count; + unsigned int yld_exp_empty; + unsigned int yld_act_empty; + unsigned int yld_both_empty; + unsigned int yld_count; /* schedule() stats */ - unsigned long sched_switch; - unsigned long sched_count; - unsigned long sched_goidle; + unsigned int sched_switch; + unsigned int sched_count; + unsigned int sched_goidle; /* try_to_wake_up() stats */ - unsigned long ttwu_count; - unsigned long ttwu_local; + unsigned int ttwu_count; + unsigned int ttwu_local; /* BKL stats */ - unsigned long bkl_count; + unsigned int bkl_count; #endif struct lock_class_key rq_lock_key; }; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index a5e517e..e6fb392 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -137,7 +137,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", 
cfs_rq->load.weight); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, " .%-30s: %ld\n", "bkl_count", + SEQ_printf(m, " .%-30s: %d\n", "bkl_count", rq->bkl_count); #endif SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 1c08484..ef1a7df 100644 ---
[patch] sched: fix improper load balance across sched domain
We recently discovered a nasty performance bug in the kernel CPU load balancer where we were hit by a 50% performance regression. When tasks are assigned to a subset of CPUs that span across sched_domains (either ccNUMA node or the new multi-core domain) via cpu affinity, the kernel fails to perform proper load balance at these domains, due to several pieces of logic in find_busiest_group() that misidentify the busiest sched group within a given domain. This leads to inadequate load balance and causes a 50% performance hit. To give you a concrete example, on a dual-core, 2 socket numa system, there are 4 logical cpu, organized as: CPU0 attaching sched-domain: domain 0: span 0003 groups: 0001 0002 domain 1: span 000f groups: 0003 000c CPU1 attaching sched-domain: domain 0: span 0003 groups: 0002 0001 domain 1: span 000f groups: 0003 000c CPU2 attaching sched-domain: domain 0: span 000c groups: 0004 0008 domain 1: span 000f groups: 000c 0003 CPU3 attaching sched-domain: domain 0: span 000c groups: 0008 0004 domain 1: span 000f groups: 000c 0003 If I run 2 tasks with CPU affinity set to 0x5, there are situations where cpu0 has a run queue length of 2, and cpu2 will be idle. The kernel load balancer is unable to balance out these two tasks over cpu0 and cpu2 due to at least three pieces of logic in find_busiest_group() that heavily bias load balance towards power saving mode. e.g. while determining the "busiest" variable, the kernel only sets it when "sum_nr_running > group_capacity". This test is flawed in that "sum_nr_running" is not necessarily the same as the sum-of-tasks-allowed-to-run-within-the sched-group. The end result is that the kernel "thinks" everything is balanced, but in reality we have an imbalance, thus causing one CPU to be over-subscribed and leaving the other idle. Two other pieces of logic in the same function also cause a similar effect. The nastiness of this bug is that the kernel is not able to get unstuck in this unfortunate broken state. 
From what we've seen in our environment, kernel will stuck in imbalanced state for extended period of time and it is also very easy for the kernel to stuck into that state (it's pretty much 100% reproducible for us). So proposing the following fix: add addition logic in find_busiest_group to detect intrinsic imbalance within the busiest group. When such condition is detected, load balance goes into spread mode instead of default grouping mode. Signed-off-by: Ken Chen <[EMAIL PROTECTED]> --- ./kernel/sched.c.orig 2007-10-16 10:08:01.0 -0700 +++ ./kernel/sched.c2007-10-16 10:56:13.0 -0700 @@ -2339,7 +2339,7 @@ unsigned long max_pull; unsigned long busiest_load_per_task, busiest_nr_running; unsigned long this_load_per_task, this_nr_running; - int load_idx; + int load_idx, group_imb = 0; #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) int power_savings_balance = 1; unsigned long leader_nr_running = 0, min_load_per_task = 0; @@ -2358,9 +2358,10 @@ load_idx = sd->idle_idx; do { - unsigned long load, group_capacity; + unsigned long load, group_capacity, max_cpu_load, min_cpu_load; int local_group; int i; + int __group_imb = 0; unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long sum_nr_running, sum_weighted_load; @@ -2371,6 +2372,8 @@ /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; + max_cpu_load = 0; + min_cpu_load = ~0UL; for_each_cpu_mask(i, group->cpumask) { struct rq *rq; @@ -2391,8 +2394,13 @@ } load = target_load(i, load_idx); - } else + } else { load = source_load(i, load_idx); + if (load > max_cpu_load) + max_cpu_load = load; + if (min_cpu_load > load) + min_cpu_load = load; + } avg_load += load; sum_nr_running += rq->nr_running; @@ -2418,6 +2426,9 @@ avg_load = sg_div_cpu_power(group, avg_load * SCHED_LOAD_SCALE); + if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) + __group_imb = 1; + group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; if (local_group) { @@ -2426,11 +2437,12 @@ 
this_nr_running = sum_nr_running; this_load_per_task = sum_weighted_load; } else if (avg_load &g
[patch] sched: fix improper load balance across sched domain
We recently discovered a nasty performance bug in the kernel CPU load balancer where we were hit by 50% performance regression. When tasks are assigned to a subset of CPUs that span across sched_domains (either ccNUMA node or the new multi-core domain) via cpu affinity, kernel fails to perform proper load balance at these domains, due to several logic in find_busiest_group() miss identified busiest sched group within a given domain. This leads to inadequate load balance and causes 50% performance hit. To give you a concrete example, on a dual-core, 2 socket numa system, there are 4 logical cpu, organized as: CPU0 attaching sched-domain: domain 0: span 0003 groups: 0001 0002 domain 1: span 000f groups: 0003 000c CPU1 attaching sched-domain: domain 0: span 0003 groups: 0002 0001 domain 1: span 000f groups: 0003 000c CPU2 attaching sched-domain: domain 0: span 000c groups: 0004 0008 domain 1: span 000f groups: 000c 0003 CPU3 attaching sched-domain: domain 0: span 000c groups: 0008 0004 domain 1: span 000f groups: 000c 0003 If I run 2 tasks with CPU affinity set to 0x5. There are situation where cpu0 has run queue length of 2, and cpu2 will be idle. The kernel load balancer is unable to balance out these two tasks over cpu0 and cpu2 due to at least three logics in find_busiest_group() that heavily bias load balance towards power saving mode. e.g. while determining busiest variable, kernel only set it when sum_nr_running group_capacity. This test is flawed that sum_nr_running is not necessary same as sum-tasks-allowed-to-run-within-the sched-group. The end result is that kernel think everything is balanced, but in reality we have an imbalance and thus causing one CPU to be over-subscribed and leaving other idle. There are two other logic in the same function will also causing similar effect. The nastiness of this bug is that kernel not be able to get unstuck in this unfortunate broken state. 
From what we've seen in our environment, kernel will stuck in imbalanced state for extended period of time and it is also very easy for the kernel to stuck into that state (it's pretty much 100% reproducible for us). So proposing the following fix: add addition logic in find_busiest_group to detect intrinsic imbalance within the busiest group. When such condition is detected, load balance goes into spread mode instead of default grouping mode. Signed-off-by: Ken Chen [EMAIL PROTECTED] --- ./kernel/sched.c.orig 2007-10-16 10:08:01.0 -0700 +++ ./kernel/sched.c2007-10-16 10:56:13.0 -0700 @@ -2339,7 +2339,7 @@ unsigned long max_pull; unsigned long busiest_load_per_task, busiest_nr_running; unsigned long this_load_per_task, this_nr_running; - int load_idx; + int load_idx, group_imb = 0; #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) int power_savings_balance = 1; unsigned long leader_nr_running = 0, min_load_per_task = 0; @@ -2358,9 +2358,10 @@ load_idx = sd-idle_idx; do { - unsigned long load, group_capacity; + unsigned long load, group_capacity, max_cpu_load, min_cpu_load; int local_group; int i; + int __group_imb = 0; unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long sum_nr_running, sum_weighted_load; @@ -2371,6 +2372,8 @@ /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; + max_cpu_load = 0; + min_cpu_load = ~0UL; for_each_cpu_mask(i, group-cpumask) { struct rq *rq; @@ -2391,8 +2394,13 @@ } load = target_load(i, load_idx); - } else + } else { load = source_load(i, load_idx); + if (load max_cpu_load) + max_cpu_load = load; + if (min_cpu_load load) + min_cpu_load = load; + } avg_load += load; sum_nr_running += rq-nr_running; @@ -2418,6 +2426,9 @@ avg_load = sg_div_cpu_power(group, avg_load * SCHED_LOAD_SCALE); + if ((max_cpu_load - min_cpu_load) SCHED_LOAD_SCALE) + __group_imb = 1; + group_capacity = group-__cpu_power / SCHED_LOAD_SCALE; if (local_group) { @@ -2426,11 +2437,12 @@ 
this_nr_running = sum_nr_running; this_load_per_task = sum_weighted_load; } else if (avg_load max_load - sum_nr_running group_capacity) { + (sum_nr_running
[patch] sched: schedstat needs a diet
schedstat is useful in investigating CPU scheduler behavior. Ideally, I think it is beneficial to have it on all the time. However, the cost of turning it on in production system is quite high, largely due to number of events it collects and also due to its large memory footprint. Most of the fields probably don't need to be full 64-bit on 64-bit arch. Rolling over 4 billion events will most like take a long time and user space tool can be made to accommodate that. I'm proposing kernel to cut back most of variable width on 64-bit system. (note, the following patch doesn't affect 32-bit system). Signed-off-by: Ken Chen [EMAIL PROTECTED] diff --git a/include/linux/sched.h b/include/linux/sched.h index 592e3a5..311a8bd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -562,7 +562,7 @@ struct sched_info { last_queued; /* when we were last queued to run */ #ifdef CONFIG_SCHEDSTATS /* BKL stats */ - unsigned long bkl_count; + unsigned int bkl_count; #endif }; #endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ @@ -698,34 +698,34 @@ struct sched_domain { #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ - unsigned long lb_count[CPU_MAX_IDLE_TYPES]; - unsigned long lb_failed[CPU_MAX_IDLE_TYPES]; - unsigned long lb_balanced[CPU_MAX_IDLE_TYPES]; - unsigned long lb_imbalance[CPU_MAX_IDLE_TYPES]; - unsigned long lb_gained[CPU_MAX_IDLE_TYPES]; - unsigned long lb_hot_gained[CPU_MAX_IDLE_TYPES]; - unsigned long lb_nobusyg[CPU_MAX_IDLE_TYPES]; - unsigned long lb_nobusyq[CPU_MAX_IDLE_TYPES]; + unsigned int lb_count[CPU_MAX_IDLE_TYPES]; + unsigned int lb_failed[CPU_MAX_IDLE_TYPES]; + unsigned int lb_balanced[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES]; + unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; + unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; + unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; + unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES]; /* Active load balancing */ - unsigned long alb_count; - unsigned long 
alb_failed; - unsigned long alb_pushed; + unsigned int alb_count; + unsigned int alb_failed; + unsigned int alb_pushed; /* SD_BALANCE_EXEC stats */ - unsigned long sbe_count; - unsigned long sbe_balanced; - unsigned long sbe_pushed; + unsigned int sbe_count; + unsigned int sbe_balanced; + unsigned int sbe_pushed; /* SD_BALANCE_FORK stats */ - unsigned long sbf_count; - unsigned long sbf_balanced; - unsigned long sbf_pushed; + unsigned int sbf_count; + unsigned int sbf_balanced; + unsigned int sbf_pushed; /* try_to_wake_up() stats */ - unsigned long ttwu_wake_remote; - unsigned long ttwu_move_affine; - unsigned long ttwu_move_balance; + unsigned int ttwu_wake_remote; + unsigned int ttwu_move_affine; + unsigned int ttwu_move_balance; #endif }; diff --git a/kernel/sched.c b/kernel/sched.c index 0da2b26..5e7fce9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -328,22 +328,22 @@ struct rq { struct sched_info rq_sched_info; /* sys_sched_yield() stats */ - unsigned long yld_exp_empty; - unsigned long yld_act_empty; - unsigned long yld_both_empty; - unsigned long yld_count; + unsigned int yld_exp_empty; + unsigned int yld_act_empty; + unsigned int yld_both_empty; + unsigned int yld_count; /* schedule() stats */ - unsigned long sched_switch; - unsigned long sched_count; - unsigned long sched_goidle; + unsigned int sched_switch; + unsigned int sched_count; + unsigned int sched_goidle; /* try_to_wake_up() stats */ - unsigned long ttwu_count; - unsigned long ttwu_local; + unsigned int ttwu_count; + unsigned int ttwu_local; /* BKL stats */ - unsigned long bkl_count; + unsigned int bkl_count; #endif struct lock_class_key rq_lock_key; }; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index a5e517e..e6fb392 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -137,7 +137,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, .%-30s: %ld\n, nr_running, cfs_rq-nr_running); SEQ_printf(m, .%-30s: %ld\n, load, 
cfs_rq-load.weight); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, .%-30s: %ld\n, bkl_count, + SEQ_printf(m, .%-30s: %d\n, bkl_count, rq-bkl_count); #endif SEQ_printf(m, .%-30s: %ld\n, nr_spread_over, diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index 1c08484..ef1a7df 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h @@ -21,7 +21,7 @@ static int show_schedstat(struct seq_file *seq, void *v
Re: [PATCH] hugetlb: Fix pool resizing corner case
On 10/3/07, Dave Hansen <[EMAIL PROTECTED]> wrote: > > Not quite. Count can never go below the number of reserved pages plus > > pages allocated to MAP_PRIVATE mappings. That number is computed by: > > (resv + (total - free)). > > So, (total - free) equals the number of MAP_PRIVATE pages? Does that > imply that all reserved pages are shared and that all shared pages are > reserved? No, not quite. In-use huge pages (total - free) can be either private or shared. resv_huge_pages counts the number of pages that are committed for shared mappings, but not yet faulted in. What the equation essentially computes is: resv_huge_pages + nr-huge-pages-in-use. - Ken - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] hugetlb: Fix pool resizing corner case
On 10/3/07, Adam Litke <[EMAIL PROTECTED]> wrote: > The key is that we don't want to shrink the pool below the number of > pages we are committed to keeping around. Before this patch, we only > accounted for the pages we plan to hand out (reserved huge pages) but > not the ones we've already handed out (total - free). Does that make > sense? Good catch, Adam. From what I can see, the statement if (count >= nr_huge_pages) return nr_huge_pages; in set_max_huge_pages() is useless because (1) we recalculate the "count" variable below it; and (2) both try_to_free_low() and the while loop below the call to try_to_free_low() will terminate correctly. If you feel like it, please clean it up as well. If not, I'm fine with that. Acked-by: Ken Chen <[EMAIL PROTECTED]> - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] hugetlb: Fix pool resizing corner case
On 10/3/07, Adam Litke [EMAIL PROTECTED] wrote: The key is that we don't want to shrink the pool below the number of pages we are committed to keeping around. Before this patch, we only accounted for the pages we plan to hand out (reserved huge pages) but not the ones we've already handed out (total - free). Does that make sense? Good catch, adam. From what I can see, the statement if (count = nr_huge_pages) return nr_huge_pages; in set_max_huge_pages() is useless because (1) we recalculate count variable below it; and (2) both try_to_free_low() and the while loop below the call to try_to_free_low() will terminate correctly. If you feel like it, please clean it up as well. If not, I'm fine with that. Acked-by: Ken Chen [EMAIL PROTECTED] - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] hugetlb: Fix pool resizing corner case
On 10/3/07, Dave Hansen [EMAIL PROTECTED] wrote: Not quite. Count can never go below the number of reserved pages plus pages allocated to MAP_PRIVATE mappings. That number is computed by: (resv + (total - free)). So, (total - free) equals the number of MAP_PRIVATE pages? Does that imply that all reserved pages are shared and that all shared pages are reserved? no, not quite. In-use huge page (total - free) can be both private or shared. resv_huge_pages counts number of pages that is committed for shared mapping, but not yet faulted in. What the equation does essentially is: resv_huge_pages + nr-huge-pages-in-use. - Ken - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] shm: Fix the filename of hugetlb sysv shared memory
On 6/11/07, Adam Litke <[EMAIL PROTECTED]> wrote: On 6/8/07, Eric W. Biederman <[EMAIL PROTECTED]> wrote: > -struct file *hugetlb_zero_setup(size_t size) > +struct file *hugetlb_file_setup(const char *name, size_t size) The bulk of this patch seems to handle renaming this function. Is that really necessary? It looks OK to me though, because the argument list to that function is changed. Avoiding changing the function name isn't going to reduce the patch size either. So we might just change the name as well to match the function name shmem_file_setup(). - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] shm: Fix the filename of hugetlb sysv shared memory
On 6/11/07, Adam Litke [EMAIL PROTECTED] wrote: On 6/8/07, Eric W. Biederman [EMAIL PROTECTED] wrote: -struct file *hugetlb_zero_setup(size_t size) +struct file *hugetlb_file_setup(const char *name, size_t size) The bulk of this patch seems to handle renaming this function. Is that really necessary? It looks OK to me though, because the argument list to that function is changed. Avoid change the function name isn't going to reduce the patch size either. So we might just change the name as well to match the function name shmem_file_setup(). - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: + loop-preallocate-eight-loop-devices.patch added to -mm tree
On 5/31/07, Andrew Morton <[EMAIL PROTECTED]> wrote: Could you please send a fresh, shiny, new, changelogged patch against mainline? OK, here it is. It would be nice to merge this before final 2.6.22 release. Thank you. The kernel on-demand loop device instantiation breaks several user space tools as the tools are not ready to cope with the "on-demand feature". Fix it by instantiate default 8 loop devices and also reinstate max_loop module parameter. Signed-off-by: Ken Chen <[EMAIL PROTECTED]> Acked-by: Al Viro <[EMAIL PROTECTED]> diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 5526ead..0ed5470 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1354,7 +1354,7 @@ #endif */ static int max_loop; module_param(max_loop, int, 0); -MODULE_PARM_DESC(max_loop, "obsolete, loop device is created on-demand"); +MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1394,16 +1394,11 @@ int loop_unregister_transfer EXPORT_SYMBOL(loop_register_transfer); EXPORT_SYMBOL(loop_unregister_transfer); -static struct loop_device *loop_init_one(int i) +static struct loop_device *loop_alloc(int i) { struct loop_device *lo; struct gendisk *disk; - list_for_each_entry(lo, _devices, lo_list) { - if (lo->lo_number == i) - return lo; - } - lo = kzalloc(sizeof(*lo), GFP_KERNEL); if (!lo) goto out; @@ -1427,8 +1422,6 @@ static struct loop_device *loop_init_one disk->private_data = lo; disk->queue = lo->lo_queue; sprintf(disk->disk_name, "loop%d", i); - add_disk(disk); - list_add_tail(>lo_list, _devices); return lo; out_free_queue: @@ -1439,15 +1432,37 @@ out: return NULL; } -static void loop_del_one(struct loop_device *lo) +static void loop_free(struct loop_device *lo) { - del_gendisk(lo->lo_disk); blk_cleanup_queue(lo->lo_queue); put_disk(lo->lo_disk); list_del(>lo_list); kfree(lo); } +static struct loop_device *loop_init_one(int i) +{ + struct loop_device *lo; + + list_for_each_entry(lo, 
_devices, lo_list) { + if (lo->lo_number == i) + return lo; + } + + lo = loop_alloc(i); + if (lo) { + add_disk(lo->lo_disk); + list_add_tail(>lo_list, _devices); + } + return lo; +} + +static void loop_del_one(struct loop_device *lo) +{ + del_gendisk(lo->lo_disk); + loop_free(lo); +} + static struct kobject *loop_probe(dev_t dev, int *part, void *data) { struct loop_device *lo; @@ -1464,28 +1479,77 @@ static struct kobject *loop_probe static int __init loop_init(void) { - if (register_blkdev(LOOP_MAJOR, "loop")) - return -EIO; - blk_register_region(MKDEV(LOOP_MAJOR, 0), 1UL << MINORBITS, - THIS_MODULE, loop_probe, NULL, NULL); + int i, nr; + unsigned long range; + struct loop_device *lo, *next; + + /* +* loop module now has a feature to instantiate underlying device +* structure on-demand, provided that there is an access dev node. +* However, this will not work well with user space tool that doesn't +* know about such "feature". In order to not break any existing +* tool, we do the following: +* +* (1) if max_loop is specified, create that many upfront, and this +* also becomes a hard limit. +* (2) if max_loop is not specified, create 8 loop device on module +* load, user can further extend loop device by create dev node +* themselves and have kernel automatically instantiate actual +* device on-demand. +*/ + if (max_loop > 1UL << MINORBITS) + return -EINVAL; if (max_loop) { - printk(KERN_INFO "loop: the max_loop option is obsolete " -"and will be removed in March 2008\n"); + nr = max_loop; + range = max_loop; + } else { + nr = 8; + range = 1UL << MINORBITS; + } + + if (register_blkdev(LOOP_MAJOR, "loop")) + return -EIO; + for (i = 0; i < nr; i++) { + lo = loop_alloc(i); + if (!lo) + goto Enomem; + list_add_tail(>lo_list, _devices); } + + /* point of no return */ + + list_for_each_entry(lo, _devices, lo_list) + add_disk(lo->lo_disk); + + blk_register_region(MKDEV(LOOP_MAJOR, 0), range, + THIS_MODULE, loop_probe, NULL, NUL
Re: + loop-preallocate-eight-loop-devices.patch added to -mm tree
On 5/31/07, Andrew Morton [EMAIL PROTECTED] wrote: Could you please send a fresh, shiny, new, changelogged patch against mainline? OK, here it is. It would be nice to merge this before final 2.6.22 release. Thank you. The kernel on-demand loop device instantiation breaks several user space tools as the tools are not ready to cope with the on-demand feature. Fix it by instantiate default 8 loop devices and also reinstate max_loop module parameter. Signed-off-by: Ken Chen [EMAIL PROTECTED] Acked-by: Al Viro [EMAIL PROTECTED] diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 5526ead..0ed5470 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1354,7 +1354,7 @@ #endif */ static int max_loop; module_param(max_loop, int, 0); -MODULE_PARM_DESC(max_loop, obsolete, loop device is created on-demand); +MODULE_PARM_DESC(max_loop, Maximum number of loop devices); MODULE_LICENSE(GPL); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1394,16 +1394,11 @@ int loop_unregister_transfer EXPORT_SYMBOL(loop_register_transfer); EXPORT_SYMBOL(loop_unregister_transfer); -static struct loop_device *loop_init_one(int i) +static struct loop_device *loop_alloc(int i) { struct loop_device *lo; struct gendisk *disk; - list_for_each_entry(lo, loop_devices, lo_list) { - if (lo-lo_number == i) - return lo; - } - lo = kzalloc(sizeof(*lo), GFP_KERNEL); if (!lo) goto out; @@ -1427,8 +1422,6 @@ static struct loop_device *loop_init_one disk-private_data = lo; disk-queue = lo-lo_queue; sprintf(disk-disk_name, loop%d, i); - add_disk(disk); - list_add_tail(lo-lo_list, loop_devices); return lo; out_free_queue: @@ -1439,15 +1432,37 @@ out: return NULL; } -static void loop_del_one(struct loop_device *lo) +static void loop_free(struct loop_device *lo) { - del_gendisk(lo-lo_disk); blk_cleanup_queue(lo-lo_queue); put_disk(lo-lo_disk); list_del(lo-lo_list); kfree(lo); } +static struct loop_device *loop_init_one(int i) +{ + struct loop_device *lo; + + list_for_each_entry(lo, loop_devices, 
lo_list) { + if (lo-lo_number == i) + return lo; + } + + lo = loop_alloc(i); + if (lo) { + add_disk(lo-lo_disk); + list_add_tail(lo-lo_list, loop_devices); + } + return lo; +} + +static void loop_del_one(struct loop_device *lo) +{ + del_gendisk(lo-lo_disk); + loop_free(lo); +} + static struct kobject *loop_probe(dev_t dev, int *part, void *data) { struct loop_device *lo; @@ -1464,28 +1479,77 @@ static struct kobject *loop_probe static int __init loop_init(void) { - if (register_blkdev(LOOP_MAJOR, loop)) - return -EIO; - blk_register_region(MKDEV(LOOP_MAJOR, 0), 1UL MINORBITS, - THIS_MODULE, loop_probe, NULL, NULL); + int i, nr; + unsigned long range; + struct loop_device *lo, *next; + + /* +* loop module now has a feature to instantiate underlying device +* structure on-demand, provided that there is an access dev node. +* However, this will not work well with user space tool that doesn't +* know about such feature. In order to not break any existing +* tool, we do the following: +* +* (1) if max_loop is specified, create that many upfront, and this +* also becomes a hard limit. +* (2) if max_loop is not specified, create 8 loop device on module +* load, user can further extend loop device by create dev node +* themselves and have kernel automatically instantiate actual +* device on-demand. 
+*/ + if (max_loop 1UL MINORBITS) + return -EINVAL; if (max_loop) { - printk(KERN_INFO loop: the max_loop option is obsolete -and will be removed in March 2008\n); + nr = max_loop; + range = max_loop; + } else { + nr = 8; + range = 1UL MINORBITS; + } + + if (register_blkdev(LOOP_MAJOR, loop)) + return -EIO; + for (i = 0; i nr; i++) { + lo = loop_alloc(i); + if (!lo) + goto Enomem; + list_add_tail(lo-lo_list, loop_devices); } + + /* point of no return */ + + list_for_each_entry(lo, loop_devices, lo_list) + add_disk(lo-lo_disk); + + blk_register_region(MKDEV(LOOP_MAJOR, 0), range, + THIS_MODULE, loop_probe, NULL, NULL); + printk(KERN_INFO loop: module loaded\n); return 0; + +Enomem: + printk(KERN_INFO loop: out of memory\n); + + list_for_each_entry_safe(lo, next, loop_devices, lo_list
Re: + loop-preallocate-eight-loop-devices.patch added to -mm tree
On 5/31/07, Andrew Morton <[EMAIL PROTECTED]> wrote: I have a note here that it needs additional work. This discussion: http://lkml.org/lkml/2007/5/21/602 seemed to peter out and go nowhere? The first rev that went into -mm needs work, and the above URL is the result of feedback from Al Viro. He also acked the patch in the follow-on thread: http://lkml.org/lkml/2007/5/22/5 Not sure how widely that last patch was tested; I see a few reports that the patch works out OK. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: + loop-preallocate-eight-loop-devices.patch added to -mm tree
On 5/31/07, walt <[EMAIL PROTECTED]> wrote: Ken Chen wrote: > On 5/21/07, Ken Chen <[EMAIL PROTECTED]> wrote: ... > tested, like this? Ken, your patch below works for me. Are you going to send this on to Linus? I think akpm will route this to Linus. andrew? - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: + loop-preallocate-eight-loop-devices.patch added to -mm tree
On 5/31/07, walt [EMAIL PROTECTED] wrote: Ken Chen wrote: On 5/21/07, Ken Chen [EMAIL PROTECTED] wrote: ... tested, like this? Ken, your patch below works for me. Are you going to send this on to Linus? I think akpm will route this to Linus. andrew? - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: + loop-preallocate-eight-loop-devices.patch added to -mm tree
On 5/31/07, Andrew Morton [EMAIL PROTECTED] wrote: I have a note here that it needs additional work. This discussion: http://lkml.org/lkml/2007/5/21/602 seemed to peter out and go nowhere? The first rev went in -mm needs work and the above url is the result of feedback from Al Viro. He also acked the patch in the follow on thread: http://lkml.org/lkml/2007/5/22/5 Not sure how widely it was tested on that last patch, I see a few reports that the patch works out OK. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: + loop-preallocate-eight-loop-devices.patch added to -mm tree
On 5/21/07, Ken Chen <[EMAIL PROTECTED]> wrote: On 5/21/07, Al Viro <[EMAIL PROTECTED]> wrote: > No, it doesn't. Really. It's easy to split; untested incremental to your > patch follows: > > for (i = 0; i < nr; i++) { > - if (!loop_init_one(i)) > - goto err; > + lo = loop_alloc(i); > + if (!lo) > + goto Enomem; > + list_add_tail(>lo_list, _devices); > } ah, yes, use the loop_device list_head to link all the pending devices. > + /* point of no return */ > + > + list_for_each_entry(lo, _devices, lo_list) > + add_disk(lo->lo_disk); > + > + blk_register_region(MKDEV(LOOP_MAJOR, 0), range, > + THIS_MODULE, loop_probe, NULL, NULL); > + > printk(KERN_INFO "loop: module loaded\n"); > return 0; > -err: > - loop_exit(); > + > +Enomem: > printk(KERN_INFO "loop: out of memory\n"); > + > + while(!list_empty(_devices)) { > + lo = list_entry(loop_devices.next, struct loop_device, lo_list); > + loop_del_one(lo); > + } > + > + unregister_blkdev(LOOP_MAJOR, "loop"); > return -ENOMEM; > } I suppose the loop_del_one call in Enomem label needs to be split up too since in the error path, it hasn't done add_disk() yet? tested, like this? 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 5526ead..0ed5470 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1354,7 +1354,7 @@ #endif */ static int max_loop; module_param(max_loop, int, 0); -MODULE_PARM_DESC(max_loop, "obsolete, loop device is created on-demand"); +MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1394,16 +1394,11 @@ int loop_unregister_transfer EXPORT_SYMBOL(loop_register_transfer); EXPORT_SYMBOL(loop_unregister_transfer); -static struct loop_device *loop_init_one(int i) +static struct loop_device *loop_alloc(int i) { struct loop_device *lo; struct gendisk *disk; - list_for_each_entry(lo, _devices, lo_list) { - if (lo->lo_number == i) - return lo; - } - lo = kzalloc(sizeof(*lo), GFP_KERNEL); if (!lo) goto out; @@ -1427,8 +1422,6 @@ static struct loop_device *loop_init_one disk->private_data = lo; disk->queue = lo->lo_queue; sprintf(disk->disk_name, "loop%d", i); - add_disk(disk); - list_add_tail(>lo_list, _devices); return lo; out_free_queue: @@ -1439,15 +1432,37 @@ out: return NULL; } -static void loop_del_one(struct loop_device *lo) +static void loop_free(struct loop_device *lo) { - del_gendisk(lo->lo_disk); blk_cleanup_queue(lo->lo_queue); put_disk(lo->lo_disk); list_del(>lo_list); kfree(lo); } +static struct loop_device *loop_init_one(int i) +{ + struct loop_device *lo; + + list_for_each_entry(lo, _devices, lo_list) { + if (lo->lo_number == i) + return lo; + } + + lo = loop_alloc(i); + if (lo) { + add_disk(lo->lo_disk); + list_add_tail(>lo_list, _devices); + } + return lo; +} + +static void loop_del_one(struct loop_device *lo) +{ + del_gendisk(lo->lo_disk); + loop_free(lo); +} + static struct kobject *loop_probe(dev_t dev, int *part, void *data) { struct loop_device *lo; @@ -1464,28 +1479,77 @@ static struct kobject *loop_probe static int __init loop_init(void) { - if (register_blkdev(LOOP_MAJOR, "loop")) - return -EIO; - 
blk_register_region(MKDEV(LOOP_MAJOR, 0), 1UL << MINORBITS, - THIS_MODULE, loop_probe, NULL, NULL); + int i, nr; + unsigned long range; + struct loop_device *lo, *next; + + /* +* loop module now has a feature to instantiate underlying device +* structure on-demand, provided that there is an access dev node. +* However, this will not work well with user space tool that doesn't +* know about such "feature". In order to not break any existing +* tool, we do the following: +* +* (1) if max_loop is specified, create that many upfront, and this +* also becomes a hard limit. +* (2) if max_loop is not specified, create 8 loop device on module +* load, user can further extend loop device by create dev node +* themselves and have kernel automatically instantiate actual +* device on-demand. +*/
Re: + loop-preallocate-eight-loop-devices.patch added to -mm tree
On 5/21/07, Al Viro <[EMAIL PROTECTED]> wrote: No, it doesn't. Really. It's easy to split; untested incremental to your patch follows: for (i = 0; i < nr; i++) { - if (!loop_init_one(i)) - goto err; + lo = loop_alloc(i); + if (!lo) + goto Enomem; + list_add_tail(>lo_list, _devices); } ah, yes, use the loop_device list_head to link all the pending devices. + /* point of no return */ + + list_for_each_entry(lo, _devices, lo_list) + add_disk(lo->lo_disk); + + blk_register_region(MKDEV(LOOP_MAJOR, 0), range, + THIS_MODULE, loop_probe, NULL, NULL); + printk(KERN_INFO "loop: module loaded\n"); return 0; -err: - loop_exit(); + +Enomem: printk(KERN_INFO "loop: out of memory\n"); + + while(!list_empty(_devices)) { + lo = list_entry(loop_devices.next, struct loop_device, lo_list); + loop_del_one(lo); + } + + unregister_blkdev(LOOP_MAJOR, "loop"); return -ENOMEM; } I suppose the loop_del_one call in Enomem label needs to be split up too since in the error path, it hasn't done add_disk() yet? - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: + loop-preallocate-eight-loop-devices.patch added to -mm tree
On 5/21/07, Al Viro <[EMAIL PROTECTED]> wrote: On Mon, May 21, 2007 at 03:00:55PM -0700, [EMAIL PROTECTED] wrote: > + if (register_blkdev(LOOP_MAJOR, "loop")) > + return -EIO; > + blk_register_region(MKDEV(LOOP_MAJOR, 0), range, > + THIS_MODULE, loop_probe, NULL, NULL); > + > + for (i = 0; i < nr; i++) { > + if (!loop_init_one(i)) > + goto err; > + } > + > + printk(KERN_INFO "loop: module loaded\n"); > + return 0; > +err: > + loop_exit(); This isn't good. You *can't* fail once a single disk has been registered. Anyone could've opened it by now. IOW, you need to * register region *after* you are past the point of no return That option is a lot harder than I thought. This requires an array to keep intermediate result of preallocated "lo" device, blk_queue, and disk structure before calling add_disk() or register region. And this array could be potentially 1 million entries. Maybe I will use vmalloc for it, but seems rather sick. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: bug in 2.6.22-rc2: loop mount limited to one single iso image
On 5/21/07, Ken Chen <[EMAIL PROTECTED]> wrote: The easiest way is to reinstate max_loop and create "max_loop" device up front at module load time. However, that will lose all the "fancy on-demand device instantiation feature". So I propose we do the following: 1. have the module honor "max_loop" parameter and create that many devices upfront on module load (max_loop will also be a hard max) iff the user specifies the parameter. 2. if max_loop is not specified, default create 8 loop devices. User can extend more loop devices by creating device nodes themselves and have kernel automatically instantiate loop devices on-demand. Is this acceptable? Patch in a bit. Could people who have problems with loop devices please test this? I tested it on my Ubuntu feisty distribution and it works fine. Though I typically don't use loop devices at all. --- The kernel on-demand loop device instantiation breaks several user space tools as the tools are not ready to cope with the "on-demand feature". Fix it by instantiating default 8 loop devices and also reinstating the max_loop module parameter. 
Signed-off-by: Ken Chen <[EMAIL PROTECTED]> diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 5526ead..0aae8d8 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1354,7 +1354,7 @@ #endif */ static int max_loop; module_param(max_loop, int, 0); -MODULE_PARM_DESC(max_loop, "obsolete, loop device is created on-demand"); +MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1462,34 +1462,66 @@ static struct kobject *loop_probe(dev_t return kobj; } -static int __init loop_init(void) -{ - if (register_blkdev(LOOP_MAJOR, "loop")) - return -EIO; - blk_register_region(MKDEV(LOOP_MAJOR, 0), 1UL << MINORBITS, - THIS_MODULE, loop_probe, NULL, NULL); - - if (max_loop) { - printk(KERN_INFO "loop: the max_loop option is obsolete " -"and will be removed in March 2008\n"); - - } - printk(KERN_INFO "loop: module loaded\n"); - return 0; -} - static void __exit loop_exit(void) { + unsigned long range; struct loop_device *lo, *next; + range = max_loop ? max_loop : 1UL << MINORBITS; + list_for_each_entry_safe(lo, next, _devices, lo_list) loop_del_one(lo); - blk_unregister_region(MKDEV(LOOP_MAJOR, 0), 1UL << MINORBITS); + blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range); if (unregister_blkdev(LOOP_MAJOR, "loop")) printk(KERN_WARNING "loop: cannot unregister blkdev\n"); } +static int __init loop_init(void) +{ + int i, nr; + unsigned long range; + + /* +* loop module now has a feature to instantiate underlying device +* structure on-demand, provided that there is an access dev node. +* However, this will not work well with user space tool that doesn't +* know about such "feature". In order to not break any existing +* tool, we do the following: +* +* (1) if max_loop is specified, create that many upfront, and this +* also becomes a hard limit. 
+* (2) if max_loop is not specified, create 8 loop device on module +* load, user can further extend loop device by create dev node +* themselves and have kernel automatically instantiate actual +* device on-demand. +*/ + if (max_loop) { + nr = max_loop; + range = max_loop; + } else { + nr = 8; + range = 1UL << MINORBITS; + } + + if (register_blkdev(LOOP_MAJOR, "loop")) + return -EIO; + blk_register_region(MKDEV(LOOP_MAJOR, 0), range, + THIS_MODULE, loop_probe, NULL, NULL); + + for (i = 0; i < nr; i++) { + if (!loop_init_one(i)) + goto err; + } + + printk(KERN_INFO "loop: module loaded\n"); + return 0; +err: + loop_exit(); + printk(KERN_INFO "loop: out of memory\n"); + return -ENOMEM; +} + module_init(loop_init); module_exit(loop_exit); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 5526ead..0aae8d8 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1354,7 +1354,7 @@ #endif */ static int max_loop; module_param(max_loop, int, 0); -MODULE_PARM_DESC(max_loop, "obsolete, loop device is created on-demand"); +MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1462,34 +1462,66 @@ static struct kobject *
Re: bug in 2.6.22-rc2: loop mount limited to one single iso image
On 5/21/07, Ken Chen <[EMAIL PROTECTED]> wrote: yes and no. in that commit, I automatically create n+1 device when loop device n is created, and it was tested to be fine with casual usage of "losetup" and "mount -o loop". However, there is a bug in that commit when loop.c was compiled as a module. And when Al fixed it, he also removed that magic "n+1" trick. Nevertheless, yes, I'm guilty of introducing the new behavior. The easiest way is to reinstate max_loop and create "max_loop" device up front at module load time. However, that will lose all the "fancy on-demand device instantiation feature". So I propose we do the following: 1. have the module honor "max_loop" parameter and create that many devices upfront on module load (max_loop will also be a hard max) iff the user specifies the parameter. 2. if max_loop is not specified, default create 8 loop devices. User can extend more loop devices by creating device nodes themselves and have kernel automatically instantiate loop devices on-demand. Is this acceptable? Patch in a bit. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: bug in 2.6.22-rc2: loop mount limited to one single iso image
On 5/21/07, Linus Torvalds <[EMAIL PROTECTED]> wrote: Yes. Can somebody who actually _uses_ loop send a tested patch, please? We're not going to break existing user space over something idiotic like this. Not acceptable. The alternative is to just revert all the loop.c patches. Which I'll do unless somebody sends me a tested fix - because as a non-loop user myself, I don't have much choice. I assume it is commit 73285082 "remove artificial software max_loop limit" that introduced the new behaviour. Ken? yes and no. in that commit, I automatically create n+1 device when loop device n is created, allergically was tested to be fine with casual usage of "losetup" and "mount -o loop". However, there is a bug in that commit when loop.c was compiled as a module. And when Al fixed it, he also removed that magic "n+1" trick. Nevertheless, yes, I'm guilty of introducing the new behavior. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: bug in 2.6.22-rc2: loop mount limited to one single iso image
On 5/19/07, Ray Lee <[EMAIL PROTECTED]> wrote: Yeah, that's the only one left. I was hoping it wasn't that one, as it claimed to have been tested extensively. Guess it wasn't tested with udev. Ken? Ball's in your court. As the patch isn't providing a killer feature for 2.6.22, I'd suggest just reverting it for now until the issues are ironed out. The real solution is to have the user space tool to create these device nodes in advance. The original loop patch was coded such that when we open a loop device N, the immediate adjacent device "N + 1" is created. This will keep "mount -o loop" happy because it typically does a linear scan to find a free device. This might be somewhat hackary, but certainly will be backward compatible before user space solution is deployed. However, the code was removed by Al in this commit: commit 07002e995638b83a6987180f43722a0eb39d4932 Author: Al Viro <[EMAIL PROTECTED]> Date: Sat May 12 16:23:15 2007 -0400 fix the dynamic allocation and probe in loop.c Signed-off-by: Al Viro <[EMAIL PROTECTED]> Acked-by: Ken Chen <[EMAIL PROTECTED]> Signed-off-by: Linus Torvalds <[EMAIL PROTECTED]> - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: bug in 2.6.22-rc2: loop mount limited to one single iso image
On 5/19/07, Ray Lee [EMAIL PROTECTED] wrote: Yeah, that's the only one left. I was hoping it wasn't that one, as it claimed to have been tested extensively. Guess it wasn't tested with udev. Ken? Ball's in your court. As the patch isn't providing a killer feature for 2.6.22, I'd suggest just reverting it for now until the issues are ironed out. The real solution is to have the user space tool to create these device nodes in advance. The original loop patch was coded such that when we open a loop device N, the immediate adjacent device N + 1 is created. This will keep mount -o loop happy because it typically does a linear scan to find a free device. This might be somewhat hackary, but certainly will be backward compatible before user space solution is deployed. However, the code was removed by Al in this commit: commit 07002e995638b83a6987180f43722a0eb39d4932 Author: Al Viro [EMAIL PROTECTED] Date: Sat May 12 16:23:15 2007 -0400 fix the dynamic allocation and probe in loop.c Signed-off-by: Al Viro [EMAIL PROTECTED] Acked-by: Ken Chen [EMAIL PROTECTED] Signed-off-by: Linus Torvalds [EMAIL PROTECTED] - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: bug in 2.6.22-rc2: loop mount limited to one single iso image
On 5/21/07, Linus Torvalds [EMAIL PROTECTED] wrote: Yes. Can somebody who actually _uses_ loop send a tested patch, please? We're not going to break existing user space over something idiotic like this. Not acceptable. The alternative is to just revert all the loop.c patches. Which I'll do unless somebody sends me a tested fix - because as a non-loop user myself, I don't have much choice. I assume it is commit 73285082 remove artificial software max_loop limit that introduced the new behaviour. Ken? yes and no. in that commit, I automatically create n+1 device when loop device n is created, allergically was tested to be fine with casual usage of losetup and mount -o loop. However, there is a bug in that commit when loop.c was compiled as a module. And when Al fixed it, he also removed that magic n+1 trick. Nevertheless, yes, I'm guilty of introducing the new behavior. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: bug in 2.6.22-rc2: loop mount limited to one single iso image
On 5/21/07, Ken Chen [EMAIL PROTECTED] wrote: yes and no. in that commit, I automatically create n+1 device when loop device n is created, allergically was tested to be fine with casual usage of losetup and mount -o loop. However, there is a bug in that commit when loop.c was compiled as a module. And when Al fixed it, he also removed that magic n+1 trick. Nevertheless, yes, I'm guilty of introducing the new behavior. The easiest way is to reinstate max_loop and create max_loop device up front at module load time. However, that will lose all the fancy on-demand device instantiation feature. So I propose we do the following: 1. have the module honor max_loop parameter and create that many device upfront on module load (max_loop will also be a hard max) iff user specify the parameter. 2. if max_loop is not specified, default create 8 loop device. User can extent more loop device by create device node themselves and have kernel automatically instantiate loop device on-demand. Is this acceptable? Patch in a bit. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: bug in 2.6.22-rc2: loop mount limited to one single iso image
On 5/21/07, Ken Chen [EMAIL PROTECTED] wrote: The easiest way is to reinstate max_loop and create max_loop device up front at module load time. However, that will lose all the fancy on-demand device instantiation feature. So I propose we do the following: 1. have the module honor max_loop parameter and create that many device upfront on module load (max_loop will also be a hard max) iff user specify the parameter. 2. if max_loop is not specified, default create 8 loop device. User can extent more loop device by create device node themselves and have kernel automatically instantiate loop device on-demand. Is this acceptable? Patch in a bit. Could people who has problem with loop device please test this? I tested it on my Ubuntu feisty distribution and it works fine. Though I typically don't use loop device at all. --- The kernel on-demand loop device instantiation breaks several user space tools as the tools are not ready to cope with the on-demand feature. Fix it by instantiate default 8 loop devices and also reinstate max_loop module parameter. 
Signed-off-by: Ken Chen [EMAIL PROTECTED] diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 5526ead..0aae8d8 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1354,7 +1354,7 @@ #endif */ static int max_loop; module_param(max_loop, int, 0); -MODULE_PARM_DESC(max_loop, obsolete, loop device is created on-demand); +MODULE_PARM_DESC(max_loop, Maximum number of loop devices); MODULE_LICENSE(GPL); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1462,34 +1462,66 @@ static struct kobject *loop_probe(dev_t return kobj; } -static int __init loop_init(void) -{ - if (register_blkdev(LOOP_MAJOR, loop)) - return -EIO; - blk_register_region(MKDEV(LOOP_MAJOR, 0), 1UL MINORBITS, - THIS_MODULE, loop_probe, NULL, NULL); - - if (max_loop) { - printk(KERN_INFO loop: the max_loop option is obsolete -and will be removed in March 2008\n); - - } - printk(KERN_INFO loop: module loaded\n); - return 0; -} - static void __exit loop_exit(void) { + unsigned long range; struct loop_device *lo, *next; + range = max_loop ? max_loop : 1UL MINORBITS; + list_for_each_entry_safe(lo, next, loop_devices, lo_list) loop_del_one(lo); - blk_unregister_region(MKDEV(LOOP_MAJOR, 0), 1UL MINORBITS); + blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range); if (unregister_blkdev(LOOP_MAJOR, loop)) printk(KERN_WARNING loop: cannot unregister blkdev\n); } +static int __init loop_init(void) +{ + int i, nr; + unsigned long range; + + /* +* loop module now has a feature to instantiate underlying device +* structure on-demand, provided that there is an access dev node. +* However, this will not work well with user space tool that doesn't +* know about such feature. In order to not break any existing +* tool, we do the following: +* +* (1) if max_loop is specified, create that many upfront, and this +* also becomes a hard limit. 
+* (2) if max_loop is not specified, create 8 loop device on module +* load, user can further extend loop device by create dev node +* themselves and have kernel automatically instantiate actual +* device on-demand. +*/ + if (max_loop) { + nr = max_loop; + range = max_loop; + } else { + nr = 8; + range = 1UL MINORBITS; + } + + if (register_blkdev(LOOP_MAJOR, loop)) + return -EIO; + blk_register_region(MKDEV(LOOP_MAJOR, 0), range, + THIS_MODULE, loop_probe, NULL, NULL); + + for (i = 0; i nr; i++) { + if (!loop_init_one(i)) + goto err; + } + + printk(KERN_INFO loop: module loaded\n); + return 0; +err: + loop_exit(); + printk(KERN_INFO loop: out of memory\n); + return -ENOMEM; +} + module_init(loop_init); module_exit(loop_exit); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 5526ead..0aae8d8 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1354,7 +1354,7 @@ #endif */ static int max_loop; module_param(max_loop, int, 0); -MODULE_PARM_DESC(max_loop, obsolete, loop device is created on-demand); +MODULE_PARM_DESC(max_loop, Maximum number of loop devices); MODULE_LICENSE(GPL); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1462,34 +1462,66 @@ static struct kobject *loop_probe(dev_t return kobj; } -static int __init loop_init(void) -{ - if (register_blkdev(LOOP_MAJOR, loop)) - return -EIO; - blk_register_region(MKDEV(LOOP_MAJOR, 0), 1UL MINORBITS, - THIS_MODULE, loop_probe, NULL, NULL); - - if (max_loop) { - printk(KERN_INFO loop: the max_loop
Re: + loop-preallocate-eight-loop-devices.patch added to -mm tree
On 5/21/07, Al Viro [EMAIL PROTECTED] wrote: On Mon, May 21, 2007 at 03:00:55PM -0700, [EMAIL PROTECTED] wrote: + if (register_blkdev(LOOP_MAJOR, loop)) + return -EIO; + blk_register_region(MKDEV(LOOP_MAJOR, 0), range, + THIS_MODULE, loop_probe, NULL, NULL); + + for (i = 0; i nr; i++) { + if (!loop_init_one(i)) + goto err; + } + + printk(KERN_INFO loop: module loaded\n); + return 0; +err: + loop_exit(); This isn't good. You *can't* fail once a single disk has been registered. Anyone could've opened it by now. IOW, you need to * register region *after* you are past the point of no return That option is a lot harder than I thought. This requires an array to keep intermediate result of preallocated lo device, blk_queue, and disk structure before calling add_disk() or register region. And this array could be potentially 1 million entries. Maybe I will use vmalloc for it, but seems rather sick. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: + loop-preallocate-eight-loop-devices.patch added to -mm tree
On 5/21/07, Al Viro [EMAIL PROTECTED] wrote: No, it doesn't. Really. It's easy to split; untested incremental to your patch follows: for (i = 0; i nr; i++) { - if (!loop_init_one(i)) - goto err; + lo = loop_alloc(i); + if (!lo) + goto Enomem; + list_add_tail(lo-lo_list, loop_devices); } ah, yes, use the loop_device list_head to link all the pending devices. + /* point of no return */ + + list_for_each_entry(lo, loop_devices, lo_list) + add_disk(lo-lo_disk); + + blk_register_region(MKDEV(LOOP_MAJOR, 0), range, + THIS_MODULE, loop_probe, NULL, NULL); + printk(KERN_INFO loop: module loaded\n); return 0; -err: - loop_exit(); + +Enomem: printk(KERN_INFO loop: out of memory\n); + + while(!list_empty(loop_devices)) { + lo = list_entry(loop_devices.next, struct loop_device, lo_list); + loop_del_one(lo); + } + + unregister_blkdev(LOOP_MAJOR, loop); return -ENOMEM; } I suppose the loop_del_one call in Enomem label needs to be split up too since in the error path, it hasn't done add_disk() yet? - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: + loop-preallocate-eight-loop-devices.patch added to -mm tree
On 5/21/07, Ken Chen [EMAIL PROTECTED] wrote: On 5/21/07, Al Viro [EMAIL PROTECTED] wrote: No, it doesn't. Really. It's easy to split; untested incremental to your patch follows: for (i = 0; i nr; i++) { - if (!loop_init_one(i)) - goto err; + lo = loop_alloc(i); + if (!lo) + goto Enomem; + list_add_tail(lo-lo_list, loop_devices); } ah, yes, use the loop_device list_head to link all the pending devices. + /* point of no return */ + + list_for_each_entry(lo, loop_devices, lo_list) + add_disk(lo-lo_disk); + + blk_register_region(MKDEV(LOOP_MAJOR, 0), range, + THIS_MODULE, loop_probe, NULL, NULL); + printk(KERN_INFO loop: module loaded\n); return 0; -err: - loop_exit(); + +Enomem: printk(KERN_INFO loop: out of memory\n); + + while(!list_empty(loop_devices)) { + lo = list_entry(loop_devices.next, struct loop_device, lo_list); + loop_del_one(lo); + } + + unregister_blkdev(LOOP_MAJOR, loop); return -ENOMEM; } I suppose the loop_del_one call in Enomem label needs to be split up too since in the error path, it hasn't done add_disk() yet? tested, like this? 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 5526ead..0ed5470 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1354,7 +1354,7 @@ #endif */ static int max_loop; module_param(max_loop, int, 0); -MODULE_PARM_DESC(max_loop, obsolete, loop device is created on-demand); +MODULE_PARM_DESC(max_loop, Maximum number of loop devices); MODULE_LICENSE(GPL); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1394,16 +1394,11 @@ int loop_unregister_transfer EXPORT_SYMBOL(loop_register_transfer); EXPORT_SYMBOL(loop_unregister_transfer); -static struct loop_device *loop_init_one(int i) +static struct loop_device *loop_alloc(int i) { struct loop_device *lo; struct gendisk *disk; - list_for_each_entry(lo, loop_devices, lo_list) { - if (lo-lo_number == i) - return lo; - } - lo = kzalloc(sizeof(*lo), GFP_KERNEL); if (!lo) goto out; @@ -1427,8 +1422,6 @@ static struct loop_device *loop_init_one disk-private_data = lo; disk-queue = lo-lo_queue; sprintf(disk-disk_name, loop%d, i); - add_disk(disk); - list_add_tail(lo-lo_list, loop_devices); return lo; out_free_queue: @@ -1439,15 +1432,37 @@ out: return NULL; } -static void loop_del_one(struct loop_device *lo) +static void loop_free(struct loop_device *lo) { - del_gendisk(lo-lo_disk); blk_cleanup_queue(lo-lo_queue); put_disk(lo-lo_disk); list_del(lo-lo_list); kfree(lo); } +static struct loop_device *loop_init_one(int i) +{ + struct loop_device *lo; + + list_for_each_entry(lo, loop_devices, lo_list) { + if (lo-lo_number == i) + return lo; + } + + lo = loop_alloc(i); + if (lo) { + add_disk(lo-lo_disk); + list_add_tail(lo-lo_list, loop_devices); + } + return lo; +} + +static void loop_del_one(struct loop_device *lo) +{ + del_gendisk(lo-lo_disk); + loop_free(lo); +} + static struct kobject *loop_probe(dev_t dev, int *part, void *data) { struct loop_device *lo; @@ -1464,28 +1479,77 @@ static struct kobject *loop_probe static int __init loop_init(void) { - if (register_blkdev(LOOP_MAJOR, loop)) - return -EIO; - 
blk_register_region(MKDEV(LOOP_MAJOR, 0), 1UL MINORBITS, - THIS_MODULE, loop_probe, NULL, NULL); + int i, nr; + unsigned long range; + struct loop_device *lo, *next; + + /* +* loop module now has a feature to instantiate underlying device +* structure on-demand, provided that there is an access dev node. +* However, this will not work well with user space tool that doesn't +* know about such feature. In order to not break any existing +* tool, we do the following: +* +* (1) if max_loop is specified, create that many upfront, and this +* also becomes a hard limit. +* (2) if max_loop is not specified, create 8 loop device on module +* load, user can further extend loop device by create dev node +* themselves and have kernel automatically instantiate actual +* device on-demand. +*/ + if (max_loop 1UL MINORBITS) + return -EINVAL; if (max_loop) { - printk(KERN_INFO loop: the max_loop option is obsolete -and will be removed in March 2008\n); + nr = max_loop; + range = max_loop
Re: 2.6.21-git11: BUG in loop.ko
On 5/9/07, Andrew Morton <[EMAIL PROTECTED]> wrote: On Wed, 09 May 2007 16:52:41 -0700 Jeremy Fitzhardinge <[EMAIL PROTECTED]> wrote: > Seems to be getting a 0 refcount. I don't see anything in the recent > changes which might cause this, but this is relatively new behaviour. > It was working for me in the 2.6.21-pre time period, but I haven't tried > this since 2.6.21 was released. > > The BUG is actually triggered by the __module_get(THIS_MODULE) in > loop_set_fd. A few people have been playing with module refcounting lately. Did you work out a reproduce-it recipe? Ah, it's a mis-understanding on what kobj_probe_t function is suppose to return on success. When we open loop device that has not been initialized, we probe it via: do_open get_gendisk kobj_lookup loop_probe Notice that in kobj_lookup(), when p->probe() returns non-zero value (I presume it is an -ERRNO), it breaks out of the loop and propagate the return value, otherwise, loops back to the beginning of the for loop and retry, and in there get_disk() will be called via p->lock() to get a ref against the module. kobj_look_up(...) { retry: mutex_lock(domain->lock); for (p = domain->probes[MAJOR(dev) % 255]; p; p = p->next) { ... if (kobj) return kobj; goto retry; } So loop_probe() mistakenly returned wrong status and leads to future oops on inconsistent module ref count. The following patch fixes the issue. 
Signed-off-by: Ken Chen <[EMAIL PROTECTED]> diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 18cdd8c..40f7bc2 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1460,6 +1460,7 @@ static void loop_del_one(struct loop_device *lo) kfree(lo); } +/* return NULL for success, or return non-zero value if there are error */ static struct kobject *loop_probe(dev_t dev, int *part, void *data) { unsigned int number = dev & MINORMASK; @@ -1474,8 +1475,8 @@ static struct kobject *loop_probe *part = 0; if (IS_ERR(lo)) return (void *)lo; - else - return >lo_disk->kobj; + + return NULL; } static int __init loop_init(void) - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: 2.6.21-git11: BUG in loop.ko
On 5/9/07, Andrew Morton [EMAIL PROTECTED] wrote: On Wed, 09 May 2007 16:52:41 -0700 Jeremy Fitzhardinge [EMAIL PROTECTED] wrote: Seems to be getting a 0 refcount. I don't see anything in the recent changes which might cause this, but this is relatively new behaviour. It was working for me in the 2.6.21-pre time period, but I haven't tried this since 2.6.21 was released. The BUG is actually triggered by the __module_get(THIS_MODULE) in loop_set_fd. A few people have been playing with module refcounting lately. Did you work out a reproduce-it recipe? Ah, it's a mis-understanding on what kobj_probe_t function is suppose to return on success. When we open loop device that has not been initialized, we probe it via: do_open get_gendisk kobj_lookup loop_probe Notice that in kobj_lookup(), when p-probe() returns non-zero value (I presume it is an -ERRNO), it breaks out of the loop and propagate the return value, otherwise, loops back to the beginning of the for loop and retry, and in there get_disk() will be called via p-lock() to get a ref against the module. kobj_look_up(...) { retry: mutex_lock(domain-lock); for (p = domain-probes[MAJOR(dev) % 255]; p; p = p-next) { ... if (kobj) return kobj; goto retry; } So loop_probe() mistakenly returned wrong status and leads to future oops on inconsistent module ref count. The following patch fixes the issue. 
Signed-off-by: Ken Chen [EMAIL PROTECTED] diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 18cdd8c..40f7bc2 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1460,6 +1460,7 @@ static void loop_del_one(struct loop_device *lo) kfree(lo); } +/* return NULL for success, or return non-zero value if there are error */ static struct kobject *loop_probe(dev_t dev, int *part, void *data) { unsigned int number = dev MINORMASK; @@ -1474,8 +1475,8 @@ static struct kobject *loop_probe *part = 0; if (IS_ERR(lo)) return (void *)lo; - else - return lo-lo_disk-kobj; + + return NULL; } static int __init loop_init(void) - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: + per-cpuset-hugetlb-accounting-and-administration.patch added to -mm tree
On 5/3/07, Paul Jackson <[EMAIL PROTECTED]> wrote: Note, Ken, that if we did that, the calculation of these new Total and Free stats would be a little different than your new code. Instead of looping over the memory nodes in the current tasks mems_allowed mask, we would loop over the memory nodes allowed in the cpuset being queried (the cpuset whose 'hugepages_total' or 'hugepages_free' special file we were reading, not the current tasks cpuset.) This is even more controversial and messy. akpm already dropped the patch and expressed that he doesn't like it. And I won't go down another messy path. I will let this idea RIP. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: + per-cpuset-hugetlb-accounting-and-administration.patch added to -mm tree
On 5/3/07, Paul Jackson [EMAIL PROTECTED] wrote: Note, Ken, that if we did that, the calculation of these new Total and Free stats would be a little different than your new code. Instead of looping over the memory nodes in the current tasks mems_allowed mask, we would loop over the memory nodes allowed in the cpuset being queried (the cpuset whose 'hugepages_total' or 'hugepages_free' special file we were reading, not the current tasks cpuset.) This is even more controversial and messy. akpm already dropped the patch and expressed that he doesn't like it. And I won't go down another messy path. I will let this idea RIP. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: + per-cpuset-hugetlb-accounting-and-administration.patch added to -mm tree
On 5/3/07, Paul Jackson <[EMAIL PROTECTED]> wrote: Adding Christoph Lameter <[EMAIL PROTECTED]> to the cc list, as he knows more about hugetlb pages than I do. This patch strikes me as a bit odd. Granted, it's solving what could be a touchy problem with a fairly simple solution, which is usually a Good Thing(tm). However, the idea that different tasks would see different values for the following fields in /proc/meminfo: HugePages_Total: 0 HugePages_Free: 0 strikes me as odd, and risky. I would have thought that usually, all tasks in the system should see the same values in the files in /proc (as opposed to the files in particular task subdirectories /proc/.) This patch strikes me as a bit of a hack, good for compatibility, but hiding a booby trap that will bite some user code in the long run. But I'm not enough of an expert to know what the right tradeoffs are in this matter. Would annotating the Hugepages_* field with name of cpuset help? I orginally thought that since cpuset's mems are hirearchical in memory assignment, it is fairly straightforward to understand what's going on: parent cpuset stats include its and all of its children. For example, if root cpuset has two sub job1 and job2 cpusets, each has 20 and 30 htlb pages, when query at each level, we have: [EMAIL PROTECTED] echo $$ > /dev/cpuset/tasks [EMAIL PROTECTED] grep HugePages_Total /proc/meminfo HugePages_Total:50 [EMAIL PROTECTED] echo $$ > /dev/cpuset/job1/tasks [EMAIL PROTECTED] grep HugePages_Total /proc/meminfo HugePages_Total:20 [EMAIL PROTECTED] echo $$ > /dev/cpuset/job2/tasks [EMAIL PROTECTED] grep HugePages_Total /proc/meminfo HugePages_Total:30 If this is odd, do you have any suggestions for alternative? - Ken - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: + per-cpuset-hugetlb-accounting-and-administration.patch added to -mm tree
On 5/3/07, Paul Jackson [EMAIL PROTECTED] wrote: Adding Christoph Lameter [EMAIL PROTECTED] to the cc list, as he knows more about hugetlb pages than I do. This patch strikes me as a bit odd. Granted, it's solving what could be a touchy problem with a fairly simple solution, which is usually a Good Thing(tm). However, the idea that different tasks would see different values for the following fields in /proc/meminfo: HugePages_Total: 0 HugePages_Free: 0 strikes me as odd, and risky. I would have thought that usually, all tasks in the system should see the same values in the files in /proc (as opposed to the files in particular task subdirectories /proc/pid.) This patch strikes me as a bit of a hack, good for compatibility, but hiding a booby trap that will bite some user code in the long run. But I'm not enough of an expert to know what the right tradeoffs are in this matter. Would annotating the Hugepages_* field with name of cpuset help? I originally thought that since cpuset's mems are hierarchical in memory assignment, it is fairly straightforward to understand what's going on: parent cpuset stats include its and all of its children. For example, if root cpuset has two sub job1 and job2 cpusets, each has 20 and 30 htlb pages, when query at each level, we have: [EMAIL PROTECTED] echo $$ /dev/cpuset/tasks [EMAIL PROTECTED] grep HugePages_Total /proc/meminfo HugePages_Total:50 [EMAIL PROTECTED] echo $$ /dev/cpuset/job1/tasks [EMAIL PROTECTED] grep HugePages_Total /proc/meminfo HugePages_Total:20 [EMAIL PROTECTED] echo $$ /dev/cpuset/job2/tasks [EMAIL PROTECTED] grep HugePages_Total /proc/meminfo HugePages_Total:30 If this is odd, do you have any suggestions for alternative? - Ken - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: cache-pipe-buf-page-address-for-non-highmem-arch.patch
On 5/1/07, Andrew Morton <[EMAIL PROTECTED]> wrote: Fair enough, it is a bit of an ugly thing. And I see no measurements there on what the overall speedup was for any workload. Ken, which memory model was in use? sparsemem? discontigmem with config_numa on. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: cache-pipe-buf-page-address-for-non-highmem-arch.patch
On 5/1/07, Andrew Morton [EMAIL PROTECTED] wrote: Fair enough, it is a bit of an ugly thing. And I see no measurements there on what the overall speedup was for any workload. Ken, which memory model was in use? sparsemem? discontigmem with config_numa on. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
On 4/12/07, Ken Chen <[EMAIL PROTECTED]> wrote: On 4/12/07, Jeff Moyer <[EMAIL PROTECTED]> wrote: > I didn't see any response to Zach's request for code that actually > tests out the shared ring buffer. Do you have such code? Yes, I do. I was stress testing the code since last night. After 20+ hours of stress run with fio and aio-stress, now I'm posting it with confidence. I modified libaio's io_getevents to take advantage of new user level reap function. The feature is exported out via ring->compat_features. btw, is compat_feature suppose to be a version number or a bit mask? I think bitmask make more sense and more flexible. Additional patch on the kernel side to export the new features. On top of patch posted at: http://marc.info/?l=linux-kernel=117636401818057=2 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -138,8 +138,11 @@ #define init_sync_kiocb(x, filp) \ init_wait((&(x)->ki_wait)); \ } while (0) +#define AIO_RING_BASE 1 +#define AIO_RING_USER_REAP 2 + #define AIO_RING_MAGIC 0xa10a10a1 -#define AIO_RING_COMPAT_FEATURES 1 +#define AIO_RING_COMPAT_FEATURES (AIO_RING_BASE | AIO_RING_USER_REAP) #define AIO_RING_INCOMPAT_FEATURES 0 struct aio_ring { unsignedid; /* kernel internal index number */ - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
On 4/12/07, Jeff Moyer <[EMAIL PROTECTED]> wrote: I didn't see any response to Zach's request for code that actually tests out the shared ring buffer. Do you have such code? Yes, I do. I was stress testing the code since last night. After 20+ hours of stress run with fio and aio-stress, now I'm posting it with confidence. I modified libaio's io_getevents to take advantage of new user level reap function. The feature is exported out via ring->compat_features. btw, is compat_feature suppose to be a version number or a bit mask? I think bitmask make more sense and more flexible. (warning: some lines are extremely long in the patch and my email client will probably mangle it badly). diff -Nurp libaio-0.3.104/src/io_getevents.c libaio-0.3.104-new/src/io_getevents.c --- libaio-0.3.104/src/io_getevents.c 2003-06-18 12:58:21.0 -0700 +++ libaio-0.3.104-new/src/io_getevents.c 2007-04-12 17:35:06.0 -0700 @@ -21,10 +21,13 @@ #include #include #include "syscall.h" +#include io_syscall5(int, __io_getevents_0_4, io_getevents, io_context_t, ctx, long, min_nr, long, nr, struct io_event *, events, struct timespec *, timeout) #define AIO_RING_MAGIC 0xa10a10a1 +#define AIO_RING_BASE 1 +#define AIO_RING_USER_REAP 2 /* Ben will hate me for this */ struct aio_ring { @@ -41,7 +44,11 @@ struct aio_ring { int io_getevents_0_4(io_context_t ctx, long min_nr, long nr, struct io_event * events, struct timespec * timeout) { + long i = 0, ret; + unsigned head; + struct io_event *evt_base; struct aio_ring *ring; + ring = (struct aio_ring*)ctx; if (ring==NULL || ring->magic != AIO_RING_MAGIC) goto do_syscall; @@ -49,9 +56,35 @@ int io_getevents_0_4(io_context_t ctx, l if (ring->head == ring->tail) return 0; } - + + if (!(ring->compat_features & AIO_RING_USER_REAP)) + goto do_syscall; + + if (min_nr > nr || min_nr < 0 || nr < 0) + return -EINVAL; + + evt_base = (struct io_event *) (ring + 1); + while (i < nr) { + head = ring->head; + if (head == ring->tail) + break; + + *events = evt_base[head & 
(ring->nr - 1)]; + if (head == cmpxchg(>head, head, head + 1)) { + events++; + i++; + } + } + + if (i >= min_nr) + return i; + do_syscall: - return __io_getevents_0_4(ctx, min_nr, nr, events, timeout); + ret = __io_getevents_0_4(ctx, min_nr - i, nr - i, events, timeout); + if (ret >= 0) + return i + ret; + else + return i ? i : ret; } DEFSYMVER(io_getevents_0_4, io_getevents, 0.4) - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
On 4/11/07, Zach Brown <[EMAIL PROTECTED]> wrote: First, I'll NAK this and all AIO patches until the patch description says that it's been run through the regression tests that we've started collecting in autotest. They're trivial to run, never fear: cd /usr/local svn checkout svn://test.kernel.org/autotest/trunk/client autotest cd autotest bin/autotest tests/aio_dio_bugs/control I ran through the autotest (with bug fix in the test code). It passes the regression tests. I made the following change since last rev: * struct aio_ring stays unchanged. I added a cast to atomic_cmpxchg() * aio_ring_event() has grown too big, I turned that into a function * take out white space change from previous rev. open issue that this patch does not try to fix: * kmap address alias needs a flush_dcache_page Zach, does this make you more comfortable? (or maybe less iffy is the right word?) From: Ken Chen <[EMAIL PROTECTED]> Resurrect an old patch that uses atomic operation to update ring buffer index on AIO event queue. This work allows further application/libaio optimization to run fast path io_getevents in user space. I've also added one more change on top of old implementation that rounds ring buffer size to power of 2 to allow fast head/tail calculation. With the new scheme, there is no more modulo operation on them and we simply increment either pointer index directly. This scheme also automatically handles integer wrap nicely without any additional special treatment. Patch was stress tested and in addition, tested on autotest:aio_dio_bugs. The results were all pass: Output of autotest:aio_dio_bugs: ran for 200 seconds without error, passing AIO read of last block in file succeeded. aio-free-ring-with-bogus-nr-pages: Success! aio-io-setup-with-nonwritable-context-pointer: Success! 
GOOD aio_dio_bugs completed successfully Signed-off-by: Ken Chen <[EMAIL PROTECTED]> diff --git a/fs/aio.c b/fs/aio.c index e4598d6..f8b8e45 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -108,8 +108,8 @@ static int aio_setup_ring(struct kioctx *ctx) unsigned long size; int nr_pages; - /* Compensate for the ring buffer's head/tail overlap entry */ - nr_events += 2; /* 1 is required, 2 for good luck */ + /* round nr_event to next power of 2 */ + nr_events = roundup_pow_of_two(nr_events); size = sizeof(struct aio_ring); size += sizeof(struct io_event) * nr_events; @@ -118,8 +118,6 @@ static int aio_setup_ring(struct kioctx *ctx) if (nr_pages < 0) return -EINVAL; - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); - info->nr = 0; info->ring_pages = info->internal_pages; if (nr_pages > AIO_RING_PAGES) { @@ -177,14 +175,16 @@ static int aio_setup_ring(struct kioctx *ctx) #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) -#define aio_ring_event(info, nr, km) ({ \ - unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ - struct io_event *__event; \ - __event = kmap_atomic( \ - (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \ - __event += pos % AIO_EVENTS_PER_PAGE;\ - __event; \ -}) +static struct io_event *aio_ring_event(struct aio_ring_info *info, + unsigned nr, enum km_type km) +{ + unsigned pos = (nr & (info->nr - 1)) + AIO_EVENTS_OFFSET; + struct io_event *event; + + event = kmap_atomic(info->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); + event += pos % AIO_EVENTS_PER_PAGE; + return event; +} #define put_aio_ring_event(event, km) do { \ struct io_event *__event = (event); \ @@ -220,7 +220,6 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) atomic_set(>users, 1); spin_lock_init(>ctx_lock); - spin_lock_init(>ring_info.ring_lock); init_waitqueue_head(>wait); INIT_LIST_HEAD(>active_reqs); @@ -927,7 +926,7 @@ int fastcall 
aio_complete(struct kiocb *iocb, long res, long res2) struct aio_ring *ring; struct io_event *event; unsigned long flags; - unsigned long tail; + unsigned tail; int ret; /* @@ -969,8 +968,6 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2) tail = info->tail; event = aio_ring_event(info, tail, KM_IRQ0); - if (++tail >= info->nr) - tail = 0; event->obj = (u64)(unsigned long)iocb->ki_obj.user; event->data = iocb->ki_user_data; @@ -986,13 +983,14 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2) */ smp_wmb(); /* make event visible before updating tail */ + tail++; info->tail = tail; ring->tail = tail; put_aio_ring_event(event, KM_IRQ0); kunmap_atomic(ring, KM_IRQ1); - pr_debug("added to ring %p at [%lu]\n", iocb, tail); + pr_debug("added to ring %p at [%u]\n", iocb, tail); put_rq: /* everything turned out well, dispose of the aiocb. */ ret = __aio_put_req(ctx, iocb); @@ -1007,39 +1005,35 @@
Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
On 4/11/07, Ken Chen <[EMAIL PROTECTED]> wrote: On 4/11/07, Zach Brown <[EMAIL PROTECTED]> wrote: > First, I'll NAK this and all AIO patches until the patch description > says that it's been run through the regression tests that we've started > collecting in autotest. They're trivial to run, never fear: OK. I will run those regression tests. Unfortunately, the aio_dio_bugs test in autotest has a bug in it :-( We need to stress test the "test code". on stock 2.6.21-rc6 kernel: [rock-me-baby]$ cd autotest/tests/aio_dio_bugs/src [rock-me-baby]$ make [rock-me-baby]$ ./aio-free-ring-with-bogus-nr-pages aio-free-ring-with-bogus-nr-pages: Error: io_setup returned -22, expected -ENOMEM hmm??? The problem is that the test code forgot to initialize ctx variable and in the kernel, sys_io_setup returns EINVAL if user address contains a non-zero value. I will submit the following patch to autotest to correct the test code. --- aio-free-ring-with-bogus-nr-pages.c.orig2007-04-11 23:57:45 -0700 +++ aio-free-ring-with-bogus-nr-pages.c 2007-04-11 23:57:59 -0700 @@ -39,7 +39,7 @@ int main(int __attribute__((unused)) argc, char **argv) { long res; - io_context_t ctx; + io_context_t ctx = 0; void* map; while (1) { - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
On 4/11/07, Ken Chen [EMAIL PROTECTED] wrote: On 4/11/07, Zach Brown [EMAIL PROTECTED] wrote: First, I'll NAK this and all AIO patches until the patch description says that it's been run through the regression tests that we've started collecting in autotest. They're trivial to run, never fear: OK. I will run those regression tests. Unfortunately, the aio_dio_bugs test in autotest has a bug in it :-( We need to stress test the test code. on stock 2.6.21-rc6 kernel: [rock-me-baby]$ cd autotest/tests/aio_dio_bugs/src [rock-me-baby]$ make [rock-me-baby]$ ./aio-free-ring-with-bogus-nr-pages aio-free-ring-with-bogus-nr-pages: Error: io_setup returned -22, expected -ENOMEM hmm??? The problem is that the test code forgot to initialize ctx variable and in the kernel, sys_io_setup returns EINVAL if user address contains a non-zero value. I will submit the following patch to autotest to correct the test code. --- aio-free-ring-with-bogus-nr-pages.c.orig2007-04-11 23:57:45 -0700 +++ aio-free-ring-with-bogus-nr-pages.c 2007-04-11 23:57:59 -0700 @@ -39,7 +39,7 @@ int main(int __attribute__((unused)) argc, char **argv) { long res; - io_context_t ctx; + io_context_t ctx = 0; void* map; while (1) { - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
On 4/11/07, Zach Brown [EMAIL PROTECTED] wrote: First, I'll NAK this and all AIO patches until the patch description says that it's been run through the regression tests that we've started collecting in autotest. They're trivial to run, never fear: cd /usr/local svn checkout svn://test.kernel.org/autotest/trunk/client autotest cd autotest bin/autotest tests/aio_dio_bugs/control I ran through the autotest (with bug fix in the test code). It passes the regression tests. I made the following change since last rev: * struct aio_ring stays unchanged. I added a cast to atomic_cmpxchg() * aio_ring_event() has grown too big, I turned that into a function * take out white space change from previous rev. open issue that this patch does not try to fix: * kmap address alias needs a flush_dcache_page Zach, does this make you more comfortable? (or maybe less iffy is the right word?) From: Ken Chen [EMAIL PROTECTED] Resurrect an old patch that uses atomic operation to update ring buffer index on AIO event queue. This work allows further application/libaio optimization to run fast path io_getevents in user space. I've also added one more change on top of old implementation that rounds ring buffer size to power of 2 to allow fast head/tail calculation. With the new scheme, there is no more modulo operation on them and we simply increment either pointer index directly. This scheme also automatically handles integer wrap nicely without any additional special treatment. Patch was stress tested and in addition, tested on autotest:aio_dio_bugs. The results were all pass: Output of autotest:aio_dio_bugs: ran for 200 seconds without error, passing AIO read of last block in file succeeded. aio-free-ring-with-bogus-nr-pages: Success! aio-io-setup-with-nonwritable-context-pointer: Success! 
GOOD aio_dio_bugs completed successfully Signed-off-by: Ken Chen [EMAIL PROTECTED] diff --git a/fs/aio.c b/fs/aio.c index e4598d6..f8b8e45 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -108,8 +108,8 @@ static int aio_setup_ring(struct kioctx *ctx) unsigned long size; int nr_pages; - /* Compensate for the ring buffer's head/tail overlap entry */ - nr_events += 2; /* 1 is required, 2 for good luck */ + /* round nr_event to next power of 2 */ + nr_events = roundup_pow_of_two(nr_events); size = sizeof(struct aio_ring); size += sizeof(struct io_event) * nr_events; @@ -118,8 +118,6 @@ static int aio_setup_ring(struct kioctx *ctx) if (nr_pages 0) return -EINVAL; - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); - info-nr = 0; info-ring_pages = info-internal_pages; if (nr_pages AIO_RING_PAGES) { @@ -177,14 +175,16 @@ static int aio_setup_ring(struct kioctx *ctx) #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) -#define aio_ring_event(info, nr, km) ({ \ - unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ - struct io_event *__event; \ - __event = kmap_atomic( \ - (info)-ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \ - __event += pos % AIO_EVENTS_PER_PAGE;\ - __event; \ -}) +static struct io_event *aio_ring_event(struct aio_ring_info *info, + unsigned nr, enum km_type km) +{ + unsigned pos = (nr (info-nr - 1)) + AIO_EVENTS_OFFSET; + struct io_event *event; + + event = kmap_atomic(info-ring_pages[pos / AIO_EVENTS_PER_PAGE], km); + event += pos % AIO_EVENTS_PER_PAGE; + return event; +} #define put_aio_ring_event(event, km) do { \ struct io_event *__event = (event); \ @@ -220,7 +220,6 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) atomic_set(ctx-users, 1); spin_lock_init(ctx-ctx_lock); - spin_lock_init(ctx-ring_info.ring_lock); init_waitqueue_head(ctx-wait); INIT_LIST_HEAD(ctx-active_reqs); @@ -927,7 +926,7 @@ int fastcall 
aio_complete(struct kiocb *iocb, long res, long res2) struct aio_ring *ring; struct io_event *event; unsigned long flags; - unsigned long tail; + unsigned tail; int ret; /* @@ -969,8 +968,6 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2) tail = info-tail; event = aio_ring_event(info, tail, KM_IRQ0); - if (++tail = info-nr) - tail = 0; event-obj = (u64)(unsigned long)iocb-ki_obj.user; event-data = iocb-ki_user_data; @@ -986,13 +983,14 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2) */ smp_wmb(); /* make event visible before updating tail */ + tail++; info-tail = tail; ring-tail = tail; put_aio_ring_event(event, KM_IRQ0); kunmap_atomic(ring, KM_IRQ1); - pr_debug(added to ring %p at [%lu]\n, iocb, tail); + pr_debug(added to ring %p at [%u]\n, iocb, tail); put_rq: /* everything turned out well, dispose of the aiocb. */ ret = __aio_put_req(ctx, iocb); @@ -1007,39 +1005,35 @@ put_rq: /* aio_read_evt * Pull an event off of the ioctx's event ring. Returns the number of * events fetched (0 or 1
Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
On 4/12/07, Jeff Moyer [EMAIL PROTECTED] wrote: I didn't see any response to Zach's request for code that actually tests out the shared ring buffer. Do you have such code? Yes, I do. I was stress testing the code since last night. After 20+ hours of stress run with fio and aio-stress, now I'm posting it with confidence. I modified libaio's io_getevents to take advantage of new user level reap function. The feature is exported out via ring-compat_features. btw, is compat_feature suppose to be a version number or a bit mask? I think bitmask make more sense and more flexible. (warning: some lines are extremely long in the patch and my email client will probably mangle it badly). diff -Nurp libaio-0.3.104/src/io_getevents.c libaio-0.3.104-new/src/io_getevents.c --- libaio-0.3.104/src/io_getevents.c 2003-06-18 12:58:21.0 -0700 +++ libaio-0.3.104-new/src/io_getevents.c 2007-04-12 17:35:06.0 -0700 @@ -21,10 +21,13 @@ #include stdlib.h #include time.h #include syscall.h +#include asm/system.h io_syscall5(int, __io_getevents_0_4, io_getevents, io_context_t, ctx, long, min_nr, long, nr, struct io_event *, events, struct timespec *, timeout) #define AIO_RING_MAGIC 0xa10a10a1 +#define AIO_RING_BASE 1 +#define AIO_RING_USER_REAP 2 /* Ben will hate me for this */ struct aio_ring { @@ -41,7 +44,11 @@ struct aio_ring { int io_getevents_0_4(io_context_t ctx, long min_nr, long nr, struct io_event * events, struct timespec * timeout) { + long i = 0, ret; + unsigned head; + struct io_event *evt_base; struct aio_ring *ring; + ring = (struct aio_ring*)ctx; if (ring==NULL || ring-magic != AIO_RING_MAGIC) goto do_syscall; @@ -49,9 +56,35 @@ int io_getevents_0_4(io_context_t ctx, l if (ring-head == ring-tail) return 0; } - + + if (!(ring-compat_features AIO_RING_USER_REAP)) + goto do_syscall; + + if (min_nr nr || min_nr 0 || nr 0) + return -EINVAL; + + evt_base = (struct io_event *) (ring + 1); + while (i nr) { + head = ring-head; + if (head == ring-tail) + break; + + *events = 
evt_base[head (ring-nr - 1)]; + if (head == cmpxchg(ring-head, head, head + 1)) { + events++; + i++; + } + } + + if (i = min_nr) + return i; + do_syscall: - return __io_getevents_0_4(ctx, min_nr, nr, events, timeout); + ret = __io_getevents_0_4(ctx, min_nr - i, nr - i, events, timeout); + if (ret = 0) + return i + ret; + else + return i ? i : ret; } DEFSYMVER(io_getevents_0_4, io_getevents, 0.4) - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
On 4/12/07, Ken Chen [EMAIL PROTECTED] wrote: On 4/12/07, Jeff Moyer [EMAIL PROTECTED] wrote: I didn't see any response to Zach's request for code that actually tests out the shared ring buffer. Do you have such code? Yes, I do. I was stress testing the code since last night. After 20+ hours of stress run with fio and aio-stress, now I'm posting it with confidence. I modified libaio's io_getevents to take advantage of new user level reap function. The feature is exported out via ring-compat_features. btw, is compat_feature suppose to be a version number or a bit mask? I think bitmask make more sense and more flexible. Additional patch on the kernel side to export the new features. On top of patch posted at: http://marc.info/?l=linux-kernelm=117636401818057w=2 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -138,8 +138,11 @@ #define init_sync_kiocb(x, filp) \ init_wait(((x)-ki_wait)); \ } while (0) +#define AIO_RING_BASE 1 +#define AIO_RING_USER_REAP 2 + #define AIO_RING_MAGIC 0xa10a10a1 -#define AIO_RING_COMPAT_FEATURES 1 +#define AIO_RING_COMPAT_FEATURES (AIO_RING_BASE | AIO_RING_USER_REAP) #define AIO_RING_INCOMPAT_FEATURES 0 struct aio_ring { unsignedid; /* kernel internal index number */ - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
On 4/11/07, Zach Brown <[EMAIL PROTECTED]> wrote: First, I'll NAK this and all AIO patches until the patch description says that it's been run through the regression tests that we've started collecting in autotest. They're trivial to run, never fear: OK. I will run those regression tests. > Resurrect an old patch that uses atomic operation to update ring buffer > index on AIO event queue. This work allows further application/libaio > optimization to run fast path io_getevents in user space. I have mixed feelings. I think the userspace getevents support was poorly designed and the simple fact that we've gone this long without it says just how desperately the feature isn't needed. I kept on getting requests from application developers who want that feature. My initial patch was dated back May 2004. We're allocating more ring elements than userspace asked for and than were accounted for in aio_max_nr. That cost, however teeny, will be measured against the benefit. I noticed that while writing the patch. We have the same bug right now that nr_events is enlarged to consume the entire mmap'ed ring buffer pages. But yet, we don't account those in aio_max_nr. I didn't fix that up in this patch because I was going to do a separate patch on that. > - /* Compensate for the ring buffer's head/tail overlap entry */ > - nr_events += 2; /* 1 is required, 2 for good luck */ > + /* round nr_event to next power of 2 */ > + nr_events = roundup_pow_of_two(nr_events); Is that buggy? How will the code tell if head == tail means a full ring or an empty ring? The old code added that extra event to let it tell the ring was full before head == tail and it would think it's empty again, I think. I'm guessing roundup(nr_events + 1) is needed. I don't think it's a bug. akpm taught me the trick: we don't wrap the index around the ring size in this scheme, so the extra space is not needed. The ring page, which is mmaped to userspace at some weird address, was just written through a kmap. 
Do we need a flush_dcache_page()? This isn't this patch's problem, but it'd need to be addressed if we're using the ring from userspace. I will look into this aside from this patch. - Ken - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
On 4/11/07, Zach Brown <[EMAIL PROTECTED]> wrote: > Sorry I wasn't thorough enough. And partially because I was worried > about changing structure type for user space facing struct aio_ring. > Now that I looked through all arches, it looks safe as all arch's > atomic_t has the same size as int. > Here is the updated patch. > @@ -144,7 +144,7 @@ struct kiocb { > struct aio_ring { > unsignedid; /* kernel internal index number */ > unsignednr; /* number of io_events */ > - unsignedhead; > + atomic_thead; > unsignedtail; Embedding an atomic_t in an ABI struct? That makes everyone else nervous too, right? sure, make me nervous at least. It may look safe on i386/x86-64 today, but this doesn't seem like wise practice. Is there any reason to believe that atomic_t will never change size? Does anything else do this already? If nothing else, the "unsigned" (should be __u32, sigh) could be cast to an atomic_t. The atomic_t cast is equally iffy. I don't know what would be a best solution. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
On 4/10/07, Andrew Morton <[EMAIL PROTECTED]> wrote: On Tue, 10 Apr 2007 16:53:53 -0700 (PDT) [EMAIL PROTECTED] (Ken Chen) wrote: > + } while (head != cmpxchg(>head, head, head + 1)); A hasty grep indicates that only 14 out of 23 architectures implement cmpxchg(). Sorry I wasn't thorough enough. And partially because I was worried about changing structure type for user space facing struct aio_ring. Now that I looked through all arches, it looks safe as all arch's atomic_t has the same size as int. Here is the updated patch. diff --git a/fs/aio.c b/fs/aio.c index e4598d6..091bcb4 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -108,8 +108,8 @@ static int aio_setup_ring(struct kioctx *ctx) unsigned long size; int nr_pages; - /* Compensate for the ring buffer's head/tail overlap entry */ - nr_events += 2; /* 1 is required, 2 for good luck */ + /* round nr_event to next power of 2 */ + nr_events = roundup_pow_of_two(nr_events); size = sizeof(struct aio_ring); size += sizeof(struct io_event) * nr_events; @@ -118,8 +118,6 @@ static int aio_setup_ring(struct kioctx *ctx) if (nr_pages < 0) return -EINVAL; - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); - info->nr = 0; info->ring_pages = info->internal_pages; if (nr_pages > AIO_RING_PAGES) { @@ -159,7 +157,8 @@ static int aio_setup_ring(struct kioctx *ctx) ring = kmap_atomic(info->ring_pages[0], KM_USER0); ring->nr = nr_events; /* user copy */ ring->id = ctx->user_id; - ring->head = ring->tail = 0; + atomic_set(>head, 0); + ring->tail = 0; ring->magic = AIO_RING_MAGIC; ring->compat_features = AIO_RING_COMPAT_FEATURES; ring->incompat_features = AIO_RING_INCOMPAT_FEATURES; @@ -177,8 +176,8 @@ static int aio_setup_ring(struct kioctx *ctx) #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) -#define aio_ring_event(info, nr, km) ({ \ - unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ 
+#define aio_ring_event(info, __nr, km) ({\ + unsigned pos = ((__nr) & ((info)->nr - 1)) + AIO_EVENTS_OFFSET; \ struct io_event *__event; \ __event = kmap_atomic( \ (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \ @@ -220,7 +219,6 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) atomic_set(>users, 1); spin_lock_init(>ctx_lock); - spin_lock_init(>ring_info.ring_lock); init_waitqueue_head(>wait); INIT_LIST_HEAD(>active_reqs); @@ -927,7 +925,7 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2) struct aio_ring *ring; struct io_event *event; unsigned long flags; - unsigned long tail; + unsigned tail; int ret; /* @@ -969,8 +967,6 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2) tail = info->tail; event = aio_ring_event(info, tail, KM_IRQ0); - if (++tail >= info->nr) - tail = 0; event->obj = (u64)(unsigned long)iocb->ki_obj.user; event->data = iocb->ki_user_data; @@ -986,13 +982,14 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2) */ smp_wmb(); /* make event visible before updating tail */ + tail++; info->tail = tail; ring->tail = tail; put_aio_ring_event(event, KM_IRQ0); kunmap_atomic(ring, KM_IRQ1); - pr_debug("added to ring %p at [%lu]\n", iocb, tail); + pr_debug("added to ring %p at [%u]\n", iocb, tail); put_rq: /* everything turned out well, dispose of the aiocb. */ ret = __aio_put_req(ctx, iocb); @@ -1007,42 +1004,36 @@ put_rq: /* aio_read_evt * Pull an event off of the ioctx's event ring. Returns the number of * events fetched (0 or 1 ;-) - * FIXME: make this use cmpxchg. * TODO: make the ringbuffer user mmap()able (requires FIXME). 
*/ static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) { struct aio_ring_info *info = >ring_info; struct aio_ring *ring; - unsigned long head; - int ret = 0; + struct io_event *evp; + unsigned head; + int ret; ring = kmap_atomic(info->ring_pages[0], KM_USER0); - dprintk("in aio_read_evt h%lu t%lu m%lu\n", - (unsigned long)ring->head, (unsigned long)ring->tail, - (unsigned long)ring->nr); - - if (ring->head == ring->tail) - goto out; - - spin_lock(>ring_lock); + dprintk("in aio_read_evt h%u t%u m%u\n", + atomic_read(>head), ring->tail, ring->nr); - head = ring->head % info->nr; - if (head != ring->tail) { - struct io_event *evp = aio_ring_event(info, head, KM_USER1); + do { + head = atomic_read(>head); + if (head == ring->tail) { + ret = 0; + break; + } + evp = aio_ring_event(info, head, KM_USER1); *ent = *evp; - head = (head + 1) % info->nr; smp_mb(); /* finish reading the event before updatng the head */ - ring->head = head; ret = 1; put
Re: Why does kmem_cache_free occupy the CPU for more than 10 seconds?
On 4/11/07, Peter Zijlstra <[EMAIL PROTECTED]> wrote: On Wed, 2007-04-11 at 17:53 +0800, Zhao Forrest wrote: > I got some new information: > Before soft lockup message is out, we have: > [EMAIL PROTECTED] home]# cat /proc/slabinfo |grep buffer_head > buffer_head 10927942 10942560120 321 : tunables 32 > 168 : slabdata 341955 341955 6 : globalstat 37602996 11589379 > 11743736 01 6918 12166031 1013708 > : cpustat 35254590 2350698 13610965 907286 > > Then after buffer_head is freed, we have: > [EMAIL PROTECTED] home]# cat /proc/slabinfo |grep buffer_head > buffer_head 9542 36384120 321 : tunables 32 16 > 8 : slabdata 1137 1137245 : globalstat 37602996 11589379 > 11743736 01 6983 20507478 > 1708818 : cpustat 35254625 2350704 16027174 1068367 > > Does this huge number of buffer_head cause the soft lockup? __blkdev_put() takes the BKL and bd_mutex invalidate_mapping_pages() tries to take the PageLock But no other looks seem held while free_buffer_head() is called All these locks are preemptible (CONFIG_PREEMPT_BKL?=y) and should not hog the cpu like that, what preemption mode have you got selected? (CONFIG_PREEMPT_VOLUNTARY?=y) also, you can try this patch: http://groups.google.com/group/linux.kernel/browse_thread/thread/7086e4b9d5504dc9/c608bfea4614b07e?lnk=gst=+Big+kernel+lock+contention+in+do_open=1#c608bfea4614b07e - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Why does kmem_cache_free occupy the CPU for more than 10 seconds?
On 4/11/07, Peter Zijlstra [EMAIL PROTECTED] wrote: On Wed, 2007-04-11 at 17:53 +0800, Zhao Forrest wrote: I got some new information: Before soft lockup message is out, we have: [EMAIL PROTECTED] home]# cat /proc/slabinfo |grep buffer_head buffer_head 10927942 10942560120 321 : tunables 32 168 : slabdata 341955 341955 6 : globalstat 37602996 11589379 11743736 01 6918 12166031 1013708 : cpustat 35254590 2350698 13610965 907286 Then after buffer_head is freed, we have: [EMAIL PROTECTED] home]# cat /proc/slabinfo |grep buffer_head buffer_head 9542 36384120 321 : tunables 32 16 8 : slabdata 1137 1137245 : globalstat 37602996 11589379 11743736 01 6983 20507478 1708818 : cpustat 35254625 2350704 16027174 1068367 Does this huge number of buffer_head cause the soft lockup? __blkdev_put() takes the BKL and bd_mutex invalidate_mapping_pages() tries to take the PageLock But no other looks seem held while free_buffer_head() is called All these locks are preemptible (CONFIG_PREEMPT_BKL?=y) and should not hog the cpu like that, what preemption mode have you got selected? (CONFIG_PREEMPT_VOLUNTARY?=y) also, you can try this patch: http://groups.google.com/group/linux.kernel/browse_thread/thread/7086e4b9d5504dc9/c608bfea4614b07e?lnk=gstq=+Big+kernel+lock+contention+in+do_openrnum=1#c608bfea4614b07e - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
On 4/10/07, Andrew Morton [EMAIL PROTECTED] wrote: On Tue, 10 Apr 2007 16:53:53 -0700 (PDT) [EMAIL PROTECTED] (Ken Chen) wrote: + } while (head != cmpxchg(ring-head, head, head + 1)); A hasty grep indicates that only 14 out of 23 architectures implement cmpxchg(). Sorry I wasn't thorough enough. And partially because I was worried about changing structure type for user space facing struct aio_ring. Now that I looked through all arches, it looks safe as all arch's atomic_t has the same size as int. Here is the updated patch. diff --git a/fs/aio.c b/fs/aio.c index e4598d6..091bcb4 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -108,8 +108,8 @@ static int aio_setup_ring(struct kioctx *ctx) unsigned long size; int nr_pages; - /* Compensate for the ring buffer's head/tail overlap entry */ - nr_events += 2; /* 1 is required, 2 for good luck */ + /* round nr_event to next power of 2 */ + nr_events = roundup_pow_of_two(nr_events); size = sizeof(struct aio_ring); size += sizeof(struct io_event) * nr_events; @@ -118,8 +118,6 @@ static int aio_setup_ring(struct kioctx *ctx) if (nr_pages 0) return -EINVAL; - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); - info-nr = 0; info-ring_pages = info-internal_pages; if (nr_pages AIO_RING_PAGES) { @@ -159,7 +157,8 @@ static int aio_setup_ring(struct kioctx *ctx) ring = kmap_atomic(info-ring_pages[0], KM_USER0); ring-nr = nr_events; /* user copy */ ring-id = ctx-user_id; - ring-head = ring-tail = 0; + atomic_set(ring-head, 0); + ring-tail = 0; ring-magic = AIO_RING_MAGIC; ring-compat_features = AIO_RING_COMPAT_FEATURES; ring-incompat_features = AIO_RING_INCOMPAT_FEATURES; @@ -177,8 +176,8 @@ static int aio_setup_ring(struct kioctx *ctx) #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) -#define aio_ring_event(info, nr, km) ({ \ - unsigned pos = (nr) + AIO_EVENTS_OFFSET; \ +#define 
aio_ring_event(info, __nr, km) ({\ + unsigned pos = ((__nr) ((info)-nr - 1)) + AIO_EVENTS_OFFSET; \ struct io_event *__event; \ __event = kmap_atomic( \ (info)-ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \ @@ -220,7 +219,6 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) atomic_set(ctx-users, 1); spin_lock_init(ctx-ctx_lock); - spin_lock_init(ctx-ring_info.ring_lock); init_waitqueue_head(ctx-wait); INIT_LIST_HEAD(ctx-active_reqs); @@ -927,7 +925,7 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2) struct aio_ring *ring; struct io_event *event; unsigned long flags; - unsigned long tail; + unsigned tail; int ret; /* @@ -969,8 +967,6 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2) tail = info-tail; event = aio_ring_event(info, tail, KM_IRQ0); - if (++tail = info-nr) - tail = 0; event-obj = (u64)(unsigned long)iocb-ki_obj.user; event-data = iocb-ki_user_data; @@ -986,13 +982,14 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2) */ smp_wmb(); /* make event visible before updating tail */ + tail++; info-tail = tail; ring-tail = tail; put_aio_ring_event(event, KM_IRQ0); kunmap_atomic(ring, KM_IRQ1); - pr_debug(added to ring %p at [%lu]\n, iocb, tail); + pr_debug(added to ring %p at [%u]\n, iocb, tail); put_rq: /* everything turned out well, dispose of the aiocb. */ ret = __aio_put_req(ctx, iocb); @@ -1007,42 +1004,36 @@ put_rq: /* aio_read_evt * Pull an event off of the ioctx's event ring. Returns the number of * events fetched (0 or 1 ;-) - * FIXME: make this use cmpxchg. * TODO: make the ringbuffer user mmap()able (requires FIXME). 
*/ static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) { struct aio_ring_info *info = ioctx-ring_info; struct aio_ring *ring; - unsigned long head; - int ret = 0; + struct io_event *evp; + unsigned head; + int ret; ring = kmap_atomic(info-ring_pages[0], KM_USER0); - dprintk(in aio_read_evt h%lu t%lu m%lu\n, - (unsigned long)ring-head, (unsigned long)ring-tail, - (unsigned long)ring-nr); - - if (ring-head == ring-tail) - goto out; - - spin_lock(info-ring_lock); + dprintk(in aio_read_evt h%u t%u m%u\n, + atomic_read(ring-head), ring-tail, ring-nr); - head = ring-head % info-nr; - if (head != ring-tail) { - struct io_event *evp = aio_ring_event(info, head, KM_USER1); + do { + head = atomic_read(ring-head); + if (head == ring-tail) { + ret = 0; + break; + } + evp = aio_ring_event(info, head, KM_USER1); *ent = *evp; - head = (head + 1) % info-nr; smp_mb(); /* finish reading the event before updatng the head */ - ring-head = head; ret = 1; put_aio_ring_event(evp, KM_USER1); - } - spin_unlock(info-ring_lock); + } while (head != atomic_cmpxchg(ring-head, head, head + 1)); -out: kunmap_atomic(ring, KM_USER0); - dprintk(leaving aio_read_evt: %d h%lu t%lu\n
Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
On 4/11/07, Zach Brown [EMAIL PROTECTED] wrote: Sorry I wasn't thorough enough. And partially because I was worried about changing structure type for user space facing struct aio_ring. Now that I looked through all arches, it looks safe as all arch's atomic_t has the same size as int. Here is the updated patch. @@ -144,7 +144,7 @@ struct kiocb { struct aio_ring { unsignedid; /* kernel internal index number */ unsignednr; /* number of io_events */ - unsignedhead; + atomic_thead; unsignedtail; Embedding an atomic_t in an ABI struct? That makes everyone else nervous too, right? sure, make me nervous at least. It may look safe on i386/x86-64 today, but this doesn't seem like wise practice. Is there any reason to believe that atomic_t will never change size? Does anything else do this already? If nothing else, the unsigned (should be __u32, sigh) could be cast to an atomic_t. The atomic_t cast is equally iffy. I don't know what would be a best solution. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] convert aio event reap to use atomic-op instead of spin_lock
On 4/11/07, Zach Brown [EMAIL PROTECTED] wrote: First, I'll NAK this and all AIO patches until the patch description says that it's been run through the regression tests that we've started collecting in autotest. They're trivial to run, never fear: OK. I will run those regression tests. Resurrect an old patch that uses atomic operation to update ring buffer index on AIO event queue. This work allows futher application/libaio optimization to run fast path io_getevents in user space. I have mixed feelings. I think the userspace getevents support was poorly designed and the simple fact that we've gone this long without it says just how desperately the feature isn't needed. I kept on getting requests from application developers who want that feature. My initial patch was dated back May 2004. We're allocating more ring elements than userspace asked for and than were accounted for in aio_max_nr. That cost, however teeny, will be measured against the benefit. I noticed that while writing the patch. We have the same bug right now that nr_events is enlarged to consume the entire mmap'ed ring buffer pages. But yet, we don't account those in aio_max_nr. I didn't fix that up in this patch because I was going to do a separate patch on that. - /* Compensate for the ring buffer's head/tail overlap entry */ - nr_events += 2; /* 1 is required, 2 for good luck */ + /* round nr_event to next power of 2 */ + nr_events = roundup_pow_of_two(nr_events); Is that buggy? How will the code tell if head == tail means a full ring or an empty ring? The old code added that extra event to let it tell the ring was full before head == tail and it would think it's empty again, I think. I'm guessing roundup(nr_events + 1) is needed. I don't think it's a bug. akpm taught me the trick: we don't wrap the index around the ring size in this scheme, so the extra space is not needed. The ring page, which is mmaped to userspace at some weird address, was just written through a kmap. 
Do we need a flush_dcache_page()? This isn't this patch's problem, but it'd need to be addressed if we're using the ring from userspace. I will look into this aside from this patch. - Ken - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[patch] convert aio event reap to use atomic-op instead of spin_lock
Resurrect an old patch that uses atomic operation to update ring buffer index on AIO event queue. This work allows futher application/libaio optimization to run fast path io_getevents in user space. I've also added one more change on top of old implementation that rounds ring buffer size to power of 2 to allow fast head/tail calculation. With the new scheme, there is no more modulo operation on them and we simply increment either pointer index directly. This scheme also automatically handles integer wrap nicely without any additional special treatment. Signed-off-by: Ken Chen <[EMAIL PROTECTED]> diff --git a/fs/aio.c b/fs/aio.c index e4598d6..00ecd14 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -108,8 +108,8 @@ static int aio_setup_ring(struct kioctx unsigned long size; int nr_pages; - /* Compensate for the ring buffer's head/tail overlap entry */ - nr_events += 2; /* 1 is required, 2 for good luck */ + /* round nr_event to next power of 2 */ + nr_events = roundup_pow_of_two(nr_events); size = sizeof(struct aio_ring); size += sizeof(struct io_event) * nr_events; @@ -118,8 +118,6 @@ static int aio_setup_ring(struct kioctx if (nr_pages < 0) return -EINVAL; - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); - info->nr = 0; info->ring_pages = info->internal_pages; if (nr_pages > AIO_RING_PAGES) { @@ -177,8 +175,8 @@ #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) -#define aio_ring_event(info, nr, km) ({ \ - unsigned pos = (nr) + AIO_EVENTS_OFFSET;\ +#define aio_ring_event(info, __nr, km) ({ \ + unsigned pos = ((__nr) & ((info)->nr - 1)) + AIO_EVENTS_OFFSET; \ struct io_event *__event; \ __event = kmap_atomic( \ (info)->ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \ @@ -220,7 +218,6 @@ static struct kioctx *ioctx_alloc(unsign atomic_set(>users, 1); spin_lock_init(>ctx_lock); - 
spin_lock_init(>ring_info.ring_lock); init_waitqueue_head(>wait); INIT_LIST_HEAD(>active_reqs); @@ -927,7 +924,7 @@ int fastcall aio_complete(struct kiocb * struct aio_ring *ring; struct io_event *event; unsigned long flags; - unsigned long tail; + unsignedtail; int ret; /* @@ -969,8 +966,6 @@ int fastcall aio_complete(struct kiocb * tail = info->tail; event = aio_ring_event(info, tail, KM_IRQ0); - if (++tail >= info->nr) - tail = 0; event->obj = (u64)(unsigned long)iocb->ki_obj.user; event->data = iocb->ki_user_data; @@ -986,13 +981,14 @@ int fastcall aio_complete(struct kiocb * */ smp_wmb(); /* make event visible before updating tail */ + tail++; info->tail = tail; ring->tail = tail; put_aio_ring_event(event, KM_IRQ0); kunmap_atomic(ring, KM_IRQ1); - pr_debug("added to ring %p at [%lu]\n", iocb, tail); + pr_debug("added to ring %p at [%u]\n", iocb, tail); put_rq: /* everything turned out well, dispose of the aiocb. */ ret = __aio_put_req(ctx, iocb); @@ -1007,42 +1003,36 @@ put_rq: /* aio_read_evt * Pull an event off of the ioctx's event ring. Returns the number of * events fetched (0 or 1 ;-) - * FIXME: make this use cmpxchg. * TODO: make the ringbuffer user mmap()able (requires FIXME). */ static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) { struct aio_ring_info *info = >ring_info; struct aio_ring *ring; - unsigned long head; - int ret = 0; + struct io_event *evp; + unsigned head; + int ret; ring = kmap_atomic(info->ring_pages[0], KM_USER0); - dprintk("in aio_read_evt h%lu t%lu m%lu\n", -(unsigned long)ring->head, (unsigned long)ring->tail, -(unsigned long)ring->nr); - - if (ring->head == ring->tail) - goto out; - - spin_lock(>ring_lock); + dprintk("in aio_read_evt h%u t%u m%u\n", +ring->head, ring->tail, ring->nr); - head = ring->head % info->nr; - if (head != ring->tail) { - struct io_event *evp = aio_ring_event(info, head, KM_USER1); + do { + head = ring->head; + if (head == ring->tail) { + ret =
[patch] convert aio event reap to use atomic-op instead of spin_lock
Resurrect an old patch that uses atomic operation to update ring buffer index on AIO event queue. This work allows futher application/libaio optimization to run fast path io_getevents in user space. I've also added one more change on top of old implementation that rounds ring buffer size to power of 2 to allow fast head/tail calculation. With the new scheme, there is no more modulo operation on them and we simply increment either pointer index directly. This scheme also automatically handles integer wrap nicely without any additional special treatment. Signed-off-by: Ken Chen [EMAIL PROTECTED] diff --git a/fs/aio.c b/fs/aio.c index e4598d6..00ecd14 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -108,8 +108,8 @@ static int aio_setup_ring(struct kioctx unsigned long size; int nr_pages; - /* Compensate for the ring buffer's head/tail overlap entry */ - nr_events += 2; /* 1 is required, 2 for good luck */ + /* round nr_event to next power of 2 */ + nr_events = roundup_pow_of_two(nr_events); size = sizeof(struct aio_ring); size += sizeof(struct io_event) * nr_events; @@ -118,8 +118,6 @@ static int aio_setup_ring(struct kioctx if (nr_pages 0) return -EINVAL; - nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); - info-nr = 0; info-ring_pages = info-internal_pages; if (nr_pages AIO_RING_PAGES) { @@ -177,8 +175,8 @@ #define AIO_EVENTS_PER_PAGE (PAGE_SIZE / #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) -#define aio_ring_event(info, nr, km) ({ \ - unsigned pos = (nr) + AIO_EVENTS_OFFSET;\ +#define aio_ring_event(info, __nr, km) ({ \ + unsigned pos = ((__nr) ((info)-nr - 1)) + AIO_EVENTS_OFFSET; \ struct io_event *__event; \ __event = kmap_atomic( \ (info)-ring_pages[pos / AIO_EVENTS_PER_PAGE], km); \ @@ -220,7 +218,6 @@ static struct kioctx *ioctx_alloc(unsign atomic_set(ctx-users, 1); spin_lock_init(ctx-ctx_lock); - 
spin_lock_init(ctx-ring_info.ring_lock); init_waitqueue_head(ctx-wait); INIT_LIST_HEAD(ctx-active_reqs); @@ -927,7 +924,7 @@ int fastcall aio_complete(struct kiocb * struct aio_ring *ring; struct io_event *event; unsigned long flags; - unsigned long tail; + unsignedtail; int ret; /* @@ -969,8 +966,6 @@ int fastcall aio_complete(struct kiocb * tail = info-tail; event = aio_ring_event(info, tail, KM_IRQ0); - if (++tail = info-nr) - tail = 0; event-obj = (u64)(unsigned long)iocb-ki_obj.user; event-data = iocb-ki_user_data; @@ -986,13 +981,14 @@ int fastcall aio_complete(struct kiocb * */ smp_wmb(); /* make event visible before updating tail */ + tail++; info-tail = tail; ring-tail = tail; put_aio_ring_event(event, KM_IRQ0); kunmap_atomic(ring, KM_IRQ1); - pr_debug(added to ring %p at [%lu]\n, iocb, tail); + pr_debug(added to ring %p at [%u]\n, iocb, tail); put_rq: /* everything turned out well, dispose of the aiocb. */ ret = __aio_put_req(ctx, iocb); @@ -1007,42 +1003,36 @@ put_rq: /* aio_read_evt * Pull an event off of the ioctx's event ring. Returns the number of * events fetched (0 or 1 ;-) - * FIXME: make this use cmpxchg. * TODO: make the ringbuffer user mmap()able (requires FIXME). 
*/ static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent) { struct aio_ring_info *info = ioctx-ring_info; struct aio_ring *ring; - unsigned long head; - int ret = 0; + struct io_event *evp; + unsigned head; + int ret; ring = kmap_atomic(info-ring_pages[0], KM_USER0); - dprintk(in aio_read_evt h%lu t%lu m%lu\n, -(unsigned long)ring-head, (unsigned long)ring-tail, -(unsigned long)ring-nr); - - if (ring-head == ring-tail) - goto out; - - spin_lock(info-ring_lock); + dprintk(in aio_read_evt h%u t%u m%u\n, +ring-head, ring-tail, ring-nr); - head = ring-head % info-nr; - if (head != ring-tail) { - struct io_event *evp = aio_ring_event(info, head, KM_USER1); + do { + head = ring-head; + if (head == ring-tail) { + ret = 0; + break; + } + evp = aio_ring_event(info, head, KM_USER1); *ent = *evp; - head = (head + 1
Re: [patch] remove artificial software max_loop limit
On 4/1/07, Tomas M <[EMAIL PROTECTED]> wrote: I believe that IF you _really_ need to preserve the max_loop module parameter, then the parameter should _not_ be ignored, rather it should have the same function like before - to limit the loop driver so if you use max_loop=10 for example, it should not allow loop.c to create more than 10 loops. Blame on the dual meaning of max_loop that it uses currently: to initialize a set of loop devices and as a side effect, it also sets the upper limit. People are complaining about the former constrain, isn't it? Does anyone uses the 2nd meaning of upper limit? - Ken - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] remove artificial software max_loop limit
On 4/1/07, [EMAIL PROTECTED] <[EMAIL PROTECTED]> wrote: not sure if this is a real issue and if it`s UML or loop related - but how is low-memory situations being handled when creating loop devices ? kernel returns -ENOMEM as an error code if there are no memory left to initialize loop device. should losetup or dmesg tell "out of memory" if there is not enough memory left ? It should, as kernel does pass that info back to app. Does losetup check return value of open() syscall? - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] remove artificial software max_loop limit
On 4/1/07, [EMAIL PROTECTED] [EMAIL PROTECTED] wrote: not sure if this is a real issue and if it`s UML or loop related - but how is low-memory situations being handled when creating loop devices ? kernel returns -ENOMEM as an error code if there are no memory left to initialize loop device. should losetup or dmesg tell out of memory if there is not enough memory left ? It should, as kernel does pass that info back to app. Does losetup check return value of open() syscall? - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] remove artificial software max_loop limit
On 4/1/07, Tomas M [EMAIL PROTECTED] wrote: I believe that IF you _really_ need to preserve the max_loop module parameter, then the parameter should _not_ be ignored, rather it should have the same function like before - to limit the loop driver so if you use max_loop=10 for example, it should not allow loop.c to create more than 10 loops. Blame on the dual meaning of max_loop that it uses currently: to initialize a set of loop devices and as a side effect, it also sets the upper limit. People are complaining about the former constrain, isn't it? Does anyone uses the 2nd meaning of upper limit? - Ken - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] remove artificial software max_loop limit
On 3/31/07, Andrew Morton <[EMAIL PROTECTED]> wrote: > Yes, the distros do, and they recommend it to their users a lot. Thanks. In that case I think we should retain the max_loop module parameter for now. Ken, when you respin that patch could you restore max_loop, and make its use trigger a warning along the lines of "loop: the max_loop option is obsolete and will be removed in March 2008"? OK, here is a re-spin patch that I tested as module or link-in-vmlinux. Both produce satisfactory result for me. I also enclosed the patch as an attachment just in case my email client decide to eat away white spaces for the in-line text. -- Subject: remove artificial software max_loop limit From: "Ken Chen" <[EMAIL PROTECTED]> Remove artificial maximum 256 loop device that can be created due to a legacy device number limit. Searching through lkml archive, there are several instances where users complained about the artificial limit that the loop driver impose. There is no reason to have such limit. This patch rid the limit entirely and make loop device and associated block queue instantiation on demand. With on-demand instantiation, it also gives the benefit of not wasting memory if these devices are not in use (compare to current implementation that always create 8 loop devices), a net improvement in both areas. This version is both tested with creation of large number of loop devices and is compatible with existing losetup/mount user land tools. 
There are a number of people who worked on this and provided valuable suggestions, in no particular order, by: Jens Axboe Jan Engelhardt Christoph Hellwig Thomas M Signed-off-by: Ken Chen <[EMAIL PROTECTED]> Cc: Jan Engelhardt <[EMAIL PROTECTED]> Cc: Christoph Hellwig <[EMAIL PROTECTED]> diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 6b5b642..605c1d3 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -77,9 +77,8 @@ #include -static int max_loop = 8; -static struct loop_device *loop_dev; -static struct gendisk **disks; +static LIST_HEAD(loop_devices); +static DEFINE_MUTEX(loop_devices_mutex); /* * Transfer functions @@ -183,7 +182,7 @@ figure_loop_size(struct loop_device *lo) if (unlikely((loff_t)x != size)) return -EFBIG; - set_capacity(disks[lo->lo_number], x); + set_capacity(lo->lo_disk, x); return 0; } @@ -812,7 +811,7 @@ static int loop_set_fd lo->lo_queue->queuedata = lo; lo->lo_queue->unplug_fn = loop_unplug; - set_capacity(disks[lo->lo_number], size); + set_capacity(lo->lo_disk, size); bd_set_size(bdev, size << 9); set_blocksize(bdev, lo_blocksize); @@ -832,7 +831,7 @@ out_clr: lo->lo_device = NULL; lo->lo_backing_file = NULL; lo->lo_flags = 0; - set_capacity(disks[lo->lo_number], 0); + set_capacity(lo->lo_disk, 0); invalidate_bdev(bdev, 0); bd_set_size(bdev, 0); mapping_set_gfp_mask(mapping, lo->old_gfp_mask); @@ -918,7 +917,7 @@ static int loop_clr_fd memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE); invalidate_bdev(bdev, 0); - set_capacity(disks[lo->lo_number], 0); + set_capacity(lo->lo_disk, 0); bd_set_size(bdev, 0); mapping_set_gfp_mask(filp->f_mapping, gfp); lo->lo_state = Lo_unbound; @@ -1322,6 +1321,18 @@ static long lo_compat_ioctl } #endif +static struct loop_device *loop_find_dev(int number) +{ + struct loop_device *lo; + + list_for_each_entry(lo, _devices, lo_list) { + if (lo->lo_number == number) + return lo; + } + return NULL; +} + +static struct loop_device *loop_init_one(int 
i); static int lo_open(struct inode *inode, struct file *file) { struct loop_device *lo = inode->i_bdev->bd_disk->private_data; @@ -1330,6 +1341,11 @@ static int lo_open lo->lo_refcnt++; mutex_unlock(>lo_ctl_mutex); + mutex_lock(_devices_mutex); + if (!loop_find_dev(lo->lo_number + 1)) + loop_init_one(lo->lo_number + 1); + mutex_unlock(_devices_mutex); + return 0; } @@ -1357,8 +1373,9 @@ static struct block_device_operations lo_fops = { /* * And now the modules code and kernel interface. */ +static int max_loop; module_param(max_loop, int, 0); -MODULE_PARM_DESC(max_loop, "Maximum number of loop devices (1-256)"); +MODULE_PARM_DESC(max_loop, "obsolete, loop device is created on-demand"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1383,7 +1400,7 @@ int loop_unregister_transfer(int number) xfer_funcs[n] = NULL; - for (lo = _dev[0]; lo < _dev[max_loop]; lo++
Re: [patch] remove artificial software max_loop limit
On 3/31/07, Andrew Morton [EMAIL PROTECTED] wrote: Yes, the distros do, and they recommend it to their users a lot. Thanks. In that case I think we should retain the max_loop module parameter for now. Ken, when you respin that patch could you restore max_loop, and make its use trigger a warning along the lines of loop: the max_loop option is obsolete and will be removed in March 2008? OK, here is a re-spin patch that I tested as module or link-in-vmlinux. Both produce satisfactory result for me. I also enclosed the patch as an attachment just in case my email client decide to eat away white spaces for the in-line text. -- Subject: remove artificial software max_loop limit From: Ken Chen [EMAIL PROTECTED] Remove artificial maximum 256 loop device that can be created due to a legacy device number limit. Searching through lkml archive, there are several instances where users complained about the artificial limit that the loop driver impose. There is no reason to have such limit. This patch rid the limit entirely and make loop device and associated block queue instantiation on demand. With on-demand instantiation, it also gives the benefit of not wasting memory if these devices are not in use (compare to current implementation that always create 8 loop devices), a net improvement in both areas. This version is both tested with creation of large number of loop devices and is compatible with existing losetup/mount user land tools. 
There are a number of people who worked on this and provided valuable suggestions, in no particular order, by: Jens Axboe Jan Engelhardt Christoph Hellwig Thomas M Signed-off-by: Ken Chen [EMAIL PROTECTED] Cc: Jan Engelhardt [EMAIL PROTECTED] Cc: Christoph Hellwig [EMAIL PROTECTED] diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 6b5b642..605c1d3 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -77,9 +77,8 @@ #include asm/uaccess.h -static int max_loop = 8; -static struct loop_device *loop_dev; -static struct gendisk **disks; +static LIST_HEAD(loop_devices); +static DEFINE_MUTEX(loop_devices_mutex); /* * Transfer functions @@ -183,7 +182,7 @@ figure_loop_size(struct loop_device *lo) if (unlikely((loff_t)x != size)) return -EFBIG; - set_capacity(disks[lo-lo_number], x); + set_capacity(lo-lo_disk, x); return 0; } @@ -812,7 +811,7 @@ static int loop_set_fd lo-lo_queue-queuedata = lo; lo-lo_queue-unplug_fn = loop_unplug; - set_capacity(disks[lo-lo_number], size); + set_capacity(lo-lo_disk, size); bd_set_size(bdev, size 9); set_blocksize(bdev, lo_blocksize); @@ -832,7 +831,7 @@ out_clr: lo-lo_device = NULL; lo-lo_backing_file = NULL; lo-lo_flags = 0; - set_capacity(disks[lo-lo_number], 0); + set_capacity(lo-lo_disk, 0); invalidate_bdev(bdev, 0); bd_set_size(bdev, 0); mapping_set_gfp_mask(mapping, lo-old_gfp_mask); @@ -918,7 +917,7 @@ static int loop_clr_fd memset(lo-lo_crypt_name, 0, LO_NAME_SIZE); memset(lo-lo_file_name, 0, LO_NAME_SIZE); invalidate_bdev(bdev, 0); - set_capacity(disks[lo-lo_number], 0); + set_capacity(lo-lo_disk, 0); bd_set_size(bdev, 0); mapping_set_gfp_mask(filp-f_mapping, gfp); lo-lo_state = Lo_unbound; @@ -1322,6 +1321,18 @@ static long lo_compat_ioctl } #endif +static struct loop_device *loop_find_dev(int number) +{ + struct loop_device *lo; + + list_for_each_entry(lo, loop_devices, lo_list) { + if (lo-lo_number == number) + return lo; + } + return NULL; +} + +static struct loop_device *loop_init_one(int i); static int 
lo_open(struct inode *inode, struct file *file) { struct loop_device *lo = inode-i_bdev-bd_disk-private_data; @@ -1330,6 +1341,11 @@ static int lo_open lo-lo_refcnt++; mutex_unlock(lo-lo_ctl_mutex); + mutex_lock(loop_devices_mutex); + if (!loop_find_dev(lo-lo_number + 1)) + loop_init_one(lo-lo_number + 1); + mutex_unlock(loop_devices_mutex); + return 0; } @@ -1357,8 +1373,9 @@ static struct block_device_operations lo_fops = { /* * And now the modules code and kernel interface. */ +static int max_loop; module_param(max_loop, int, 0); -MODULE_PARM_DESC(max_loop, Maximum number of loop devices (1-256)); +MODULE_PARM_DESC(max_loop, obsolete, loop device is created on-demand); MODULE_LICENSE(GPL); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1383,7 +1400,7 @@ int loop_unregister_transfer(int number) xfer_funcs[n] = NULL; - for (lo = loop_dev[0]; lo loop_dev[max_loop]; lo++) { + list_for_each_entry(lo, loop_devices, lo_list) { mutex_lock(lo-lo_ctl_mutex); if (lo-lo_encryption == xfer) @@ -1398,91 +1415,110 @@ int
Re: [patch] remove artificial software max_loop limit
On 3/30/07, Andrew Morton <[EMAIL PROTECTED]> wrote: So.. this change will cause a fatal error for anyone who is presently using max_loop, won't it? If they're doing that within their initramfs/initrd/etc then things could get rather ugly for them. Probably, if they access loop devices non-sequentially. I don't know how much of a problem this will be in practice - do people use max_loop much? I don't know either. btw, did you test this change as both a module and as linked-into-vmlinux? As linked-into-vmlinux. Why do you ask? Does it break if it is a module? I made a last-minute change to a mutex name and shamefully posted without doing a compile test. Besides that, is there anything else that breaks? - Ken - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] remove artificial software max_loop limit
On 3/30/07, Jan Engelhardt <[EMAIL PROTECTED]> wrote: > lo->lo_device = NULL; > lo->lo_backing_file = NULL; > lo->lo_flags = 0; > - set_capacity(disks[lo->lo_number], 0); > + set_capacity(lo->lo_disk, 0); > invalidate_bdev(bdev, 0); > bd_set_size(bdev, 0); > mapping_set_gfp_mask(mapping, lo->old_gfp_mask); Welcome to Google Mail, we eat your whitespace :-) Oh, crap. Google mail is innocent. It was me who did some ugly copy-paste between apps. > + sprintf(disk->disk_name, "loop%d", i); Let's make this %u, no? I don't mind either way, this thing won't be bigger than 1^20 anyway. Oh, which reminds me that we probably should explicitly test and cap that limit (1^20 that is). > +static struct kobject *loop_probe(dev_t dev, int *part, void *data) > +{ > + unsigned int number = dev & MINORMASK; > + struct loop_device *lo; > > - for (i = 0; i < max_loop; i++) { > - disks[i] = alloc_disk(1); > - if (!disks[i]) > - goto out_mem3; > - } Could not we use kobject_uevent(), like I did (though perhaps without the k.oops), to make udev create a node? Well, I go with a version that doesn't panic kernel, which is a deciding factor here ;-) Full patch attached. 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 6b5b642..1e87aea 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -77,9 +77,8 @@ #include -static int max_loop = 8; -static struct loop_device *loop_dev; -static struct gendisk **disks; +static LIST_HEAD(loop_devices); +static DEFINE_MUTEX(loop_devices_mutex); /* * Transfer functions @@ -183,7 +182,7 @@ figure_loop_size(struct loop_device *lo) if (unlikely((loff_t)x != size)) return -EFBIG; - set_capacity(disks[lo->lo_number], x); + set_capacity(lo->lo_disk, x); return 0; } @@ -812,7 +811,7 @@ static int loop_set_fd lo->lo_queue->queuedata = lo; lo->lo_queue->unplug_fn = loop_unplug; - set_capacity(disks[lo->lo_number], size); + set_capacity(lo->lo_disk, size); bd_set_size(bdev, size << 9); set_blocksize(bdev, lo_blocksize); @@ -832,7 +831,7 @@ out_clr: lo->lo_device = NULL; lo->lo_backing_file = NULL; lo->lo_flags = 0; - set_capacity(disks[lo->lo_number], 0); + set_capacity(lo->lo_disk, 0); invalidate_bdev(bdev, 0); bd_set_size(bdev, 0); mapping_set_gfp_mask(mapping, lo->old_gfp_mask); @@ -918,7 +917,7 @@ static int loop_clr_fd memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE); invalidate_bdev(bdev, 0); - set_capacity(disks[lo->lo_number], 0); + set_capacity(lo->lo_disk, 0); bd_set_size(bdev, 0); mapping_set_gfp_mask(filp->f_mapping, gfp); lo->lo_state = Lo_unbound; @@ -1322,6 +1321,18 @@ static long lo_compat_ioctl } #endif +static struct loop_device *loop_find_dev(int number) +{ + struct loop_device *lo; + + list_for_each_entry(lo, _devices, lo_list) { + if (lo->lo_number == number) + return lo; + } + return NULL; +} + +static struct loop_device *loop_init_one(int i); static int lo_open(struct inode *inode, struct file *file) { struct loop_device *lo = inode->i_bdev->bd_disk->private_data; @@ -1330,6 +1341,11 @@ static int lo_open lo->lo_refcnt++; mutex_unlock(>lo_ctl_mutex); + mutex_lock(_device_mutex); + if (!loop_find_dev(lo->lo_number + 1)) + 
loop_init_one(lo->lo_number + 1); + mutex_unlock(_device_mutex); + return 0; } @@ -1357,8 +1373,6 @@ static struct block_device_operations lo_fops /* * And now the modules code and kernel interface. */ -module_param(max_loop, int, 0); -MODULE_PARM_DESC(max_loop, "Maximum number of loop devices (1-256)"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1383,7 +1397,7 @@ int loop_unregister_transfer(int number) xfer_funcs[n] = NULL; - for (lo = _dev[0]; lo < _dev[max_loop]; lo++) { + list_for_each_entry(lo, _devices, lo_list) { mutex_lock(>lo_ctl_mutex); if (lo->lo_encryption == xfer) @@ -1398,102 +1412,106 @@ int loop_unregister_transfer(int number) EXPORT_SYMBOL(loop_register_transfer); EXPORT_SYMBOL(loop_unregister_transfer); -static int __init loop_init(void) +static struct loop_device *loop_init_one(int i) { - int i; + struct loop_device *lo; + struct gendisk *disk; - if (max_loop < 1 || max_loop > 256) { - printk(KERN_WARNING "loop: invalid max_loop (must be between" - " 1 and 256), using default (8)\n"); - max_loop = 8; - } + lo = kzalloc(sizeof(*lo), GFP_KERNEL); + if (!lo) + goto out; - if (register_blkdev(LOOP_MAJOR, "loop")) -
Re: [patch] remove artificial software max_loop limit
On 3/30/07, Ken Chen <[EMAIL PROTECTED]> wrote: Remove the artificial maximum of 256 loop devices that can be created due to a legacy device number limit. Searching through the lkml archive, there are several instances where users complained about the artificial limit that the loop driver imposes. There is no reason to have such a limit. This patch rids the limit entirely and makes loop device and associated block queue instantiation on-demand. With on-demand instantiation, it also gives the benefit of not wasting memory if these devices are not in use (compared to the current implementation that always creates 8 loop devices), a net improvement in both areas. This version is both tested with creation of a large number of loop devices and is compatible with existing losetup/mount user-land tools. Signed-off-by: Ken Chen <[EMAIL PROTECTED]> Spotted one bug: the sequence of loop device lookup, instantiation, and then insertion into the global loop_devices_list had better be in one critical section, otherwise an SMP race ensues. Fix that up and also use a mutex lock instead of a spin_lock for loop_devices_list. 
Signed-off-by: Ken Chen <[EMAIL PROTECTED]> diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 6b5b642..1e87aea 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -77,9 +77,8 @@ #include -static int max_loop = 8; -static struct loop_device *loop_dev; -static struct gendisk **disks; +static LIST_HEAD(loop_devices); +static DEFINE_MUTEX(loop_devices_mutex); /* * Transfer functions @@ -183,7 +182,7 @@ figure_loop_size(struct loop_device *lo) if (unlikely((loff_t)x != size)) return -EFBIG; - set_capacity(disks[lo->lo_number], x); + set_capacity(lo->lo_disk, x); return 0; } @@ -812,7 +811,7 @@ static int loop_set_fd lo->lo_queue->queuedata = lo; lo->lo_queue->unplug_fn = loop_unplug; - set_capacity(disks[lo->lo_number], size); + set_capacity(lo->lo_disk, size); bd_set_size(bdev, size << 9); set_blocksize(bdev, lo_blocksize); @@ -832,7 +831,7 @@ out_clr: lo->lo_device = NULL; lo->lo_backing_file = NULL; lo->lo_flags = 0; - set_capacity(disks[lo->lo_number], 0); + set_capacity(lo->lo_disk, 0); invalidate_bdev(bdev, 0); bd_set_size(bdev, 0); mapping_set_gfp_mask(mapping, lo->old_gfp_mask); @@ -918,7 +917,7 @@ static int loop_clr_fd memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE); invalidate_bdev(bdev, 0); - set_capacity(disks[lo->lo_number], 0); + set_capacity(lo->lo_disk, 0); bd_set_size(bdev, 0); mapping_set_gfp_mask(filp->f_mapping, gfp); lo->lo_state = Lo_unbound; @@ -1322,6 +1321,18 @@ static long lo_compat_ioctl } #endif +static struct loop_device *loop_find_dev(int number) +{ + struct loop_device *lo; + + list_for_each_entry(lo, _devices, lo_list) { + if (lo->lo_number == number) + return lo; + } + return NULL; +} + +static struct loop_device *loop_init_one(int i); static int lo_open(struct inode *inode, struct file *file) { struct loop_device *lo = inode->i_bdev->bd_disk->private_data; @@ -1330,6 +1341,11 @@ static int lo_open lo->lo_refcnt++; mutex_unlock(>lo_ctl_mutex); + mutex_lock(_device_mutex); + 
if (!loop_find_dev(lo->lo_number + 1)) + loop_init_one(lo->lo_number + 1); + mutex_unlock(_device_mutex); + return 0; } @@ -1357,8 +1373,6 @@ static struct block_device_operations lo_fops /* * And now the modules code and kernel interface. */ -module_param(max_loop, int, 0); -MODULE_PARM_DESC(max_loop, "Maximum number of loop devices (1-256)"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1383,7 +1397,7 @@ int loop_unregister_transfer(int number) xfer_funcs[n] = NULL; - for (lo = _dev[0]; lo < _dev[max_loop]; lo++) { + list_for_each_entry(lo, _devices, lo_list) { mutex_lock(>lo_ctl_mutex); if (lo->lo_encryption == xfer) @@ -1398,102 +1412,106 @@ int loop_unregister_transfer(int number) EXPORT_SYMBOL(loop_register_transfer); EXPORT_SYMBOL(loop_unregister_transfer); -static int __init loop_init(void) +static struct loop_device *loop_init_one(int i) { - int i; + struct loop_device *lo; + struct gendisk *disk; - if (max_loop < 1 || max_loop > 256) { - printk(KERN_WARNING "loop: invalid max_loop (must be between" - " 1 and 256), using default (8)\n"); - max_loop = 8; - } + lo = kzalloc(sizeof(*lo), GFP_KERNEL); + if (!lo) + got
[patch] remove artificial software max_loop limit
Remove artificial maximum 256 loop device that can be created due to a legacy device number limit. Searching through lkml archive, there are several instances where users complained about the artificial limit that the loop driver impose. There is no reason to have such limit. This patch rid the limit entirely and make loop device and associated block queue instantiation on demand. With on-demand instantiation, it also gives the benefit of not wasting memory if these devices are not in use (compare to current implementation that always create 8 loop devices), a net improvement in both areas. This version is both tested with creation of large number of loop devices and is compatible with existing losetup/mount user land tools. There are a number of people who worked on this and provided valuable suggestions, in no particular order, by: Jens Axboe Jan Engelhardt Christoph Hellwig Thomas M Signed-off-by: Ken Chen <[EMAIL PROTECTED]> diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 6b5b642..7db2c38 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -77,9 +77,8 @@ #include #include -static int max_loop = 8; -static struct loop_device *loop_dev; -static struct gendisk **disks; +static LIST_HEAD(loop_devices); +static DEFINE_SPINLOCK(loop_devices_lock); /* * Transfer functions @@ -183,7 +182,7 @@ figure_loop_size(struct loop_device *lo) if (unlikely((loff_t)x != size)) return -EFBIG; - set_capacity(disks[lo->lo_number], x); + set_capacity(lo->lo_disk, x); return 0; } @@ -812,7 +811,7 @@ static int loop_set_fd(struct loop_devic lo->lo_queue->queuedata = lo; lo->lo_queue->unplug_fn = loop_unplug; - set_capacity(disks[lo->lo_number], size); + set_capacity(lo->lo_disk, size); bd_set_size(bdev, size << 9); set_blocksize(bdev, lo_blocksize); @@ -832,7 +831,7 @@ out_clr: lo->lo_device = NULL; lo->lo_backing_file = NULL; lo->lo_flags = 0; - set_capacity(disks[lo->lo_number], 0); + set_capacity(lo->lo_disk, 0); invalidate_bdev(bdev, 0); bd_set_size(bdev, 
0); mapping_set_gfp_mask(mapping, lo->old_gfp_mask); @@ -918,7 +917,7 @@ static int loop_clr_fd(struct loop_devic memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE); invalidate_bdev(bdev, 0); - set_capacity(disks[lo->lo_number], 0); + set_capacity(lo->lo_disk, 0); bd_set_size(bdev, 0); mapping_set_gfp_mask(filp->f_mapping, gfp); lo->lo_state = Lo_unbound; @@ -1322,6 +1321,23 @@ static long lo_compat_ioctl(struct file } #endif +static struct loop_device *loop_find_dev(int number) +{ + struct loop_device *lo; + int found = 0; + + spin_lock(_devices_lock); + list_for_each_entry(lo, _devices, lo_list) { + if (lo->lo_number == number) { + found = 1; + break; + } + } + spin_unlock(_devices_lock); + return found ? lo : NULL; +} + +static struct loop_device *loop_init_one(int i); static int lo_open(struct inode *inode, struct file *file) { struct loop_device *lo = inode->i_bdev->bd_disk->private_data; @@ -1330,6 +1346,9 @@ static int lo_open(struct inode *inode, lo->lo_refcnt++; mutex_unlock(>lo_ctl_mutex); + if (!loop_find_dev(lo->lo_number + 1)) + loop_init_one(lo->lo_number + 1); + return 0; } @@ -1357,8 +1376,6 @@ #endif /* * And now the modules code and kernel interface. 
*/ -module_param(max_loop, int, 0); -MODULE_PARM_DESC(max_loop, "Maximum number of loop devices (1-256)"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1383,7 +1400,7 @@ int loop_unregister_transfer(int number) xfer_funcs[n] = NULL; - for (lo = _dev[0]; lo < _dev[max_loop]; lo++) { + list_for_each_entry(lo, _devices, lo_list) { mutex_lock(>lo_ctl_mutex); if (lo->lo_encryption == xfer) @@ -1398,102 +1415,104 @@ int loop_unregister_transfer(int number) EXPORT_SYMBOL(loop_register_transfer); EXPORT_SYMBOL(loop_unregister_transfer); -static int __init loop_init(void) +static struct loop_device *loop_init_one(int i) { - int i; + struct loop_device *lo; + struct gendisk *disk; - if (max_loop < 1 || max_loop > 256) { - printk(KERN_WARNING "loop: invalid max_loop (must be between" - " 1 and 256), using default (8)\n"); - max_loop = 8; - } + lo = kzalloc(sizeof(*lo), GFP_KERNEL); + if (!lo) + goto out; - if (register_blkdev(LOOP_MAJOR, "loop")) -
[patch] remove artificial software max_loop limit
Remove artificial maximum 256 loop device that can be created due to a legacy device number limit. Searching through lkml archive, there are several instances where users complained about the artificial limit that the loop driver impose. There is no reason to have such limit. This patch rid the limit entirely and make loop device and associated block queue instantiation on demand. With on-demand instantiation, it also gives the benefit of not wasting memory if these devices are not in use (compare to current implementation that always create 8 loop devices), a net improvement in both areas. This version is both tested with creation of large number of loop devices and is compatible with existing losetup/mount user land tools. There are a number of people who worked on this and provided valuable suggestions, in no particular order, by: Jens Axboe Jan Engelhardt Christoph Hellwig Thomas M Signed-off-by: Ken Chen [EMAIL PROTECTED] diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 6b5b642..7db2c38 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -77,9 +77,8 @@ #include linux/kthread.h #include asm/uaccess.h -static int max_loop = 8; -static struct loop_device *loop_dev; -static struct gendisk **disks; +static LIST_HEAD(loop_devices); +static DEFINE_SPINLOCK(loop_devices_lock); /* * Transfer functions @@ -183,7 +182,7 @@ figure_loop_size(struct loop_device *lo) if (unlikely((loff_t)x != size)) return -EFBIG; - set_capacity(disks[lo-lo_number], x); + set_capacity(lo-lo_disk, x); return 0; } @@ -812,7 +811,7 @@ static int loop_set_fd(struct loop_devic lo-lo_queue-queuedata = lo; lo-lo_queue-unplug_fn = loop_unplug; - set_capacity(disks[lo-lo_number], size); + set_capacity(lo-lo_disk, size); bd_set_size(bdev, size 9); set_blocksize(bdev, lo_blocksize); @@ -832,7 +831,7 @@ out_clr: lo-lo_device = NULL; lo-lo_backing_file = NULL; lo-lo_flags = 0; - set_capacity(disks[lo-lo_number], 0); + set_capacity(lo-lo_disk, 0); invalidate_bdev(bdev, 0); 
bd_set_size(bdev, 0); mapping_set_gfp_mask(mapping, lo-old_gfp_mask); @@ -918,7 +917,7 @@ static int loop_clr_fd(struct loop_devic memset(lo-lo_crypt_name, 0, LO_NAME_SIZE); memset(lo-lo_file_name, 0, LO_NAME_SIZE); invalidate_bdev(bdev, 0); - set_capacity(disks[lo-lo_number], 0); + set_capacity(lo-lo_disk, 0); bd_set_size(bdev, 0); mapping_set_gfp_mask(filp-f_mapping, gfp); lo-lo_state = Lo_unbound; @@ -1322,6 +1321,23 @@ static long lo_compat_ioctl(struct file } #endif +static struct loop_device *loop_find_dev(int number) +{ + struct loop_device *lo; + int found = 0; + + spin_lock(loop_devices_lock); + list_for_each_entry(lo, loop_devices, lo_list) { + if (lo-lo_number == number) { + found = 1; + break; + } + } + spin_unlock(loop_devices_lock); + return found ? lo : NULL; +} + +static struct loop_device *loop_init_one(int i); static int lo_open(struct inode *inode, struct file *file) { struct loop_device *lo = inode-i_bdev-bd_disk-private_data; @@ -1330,6 +1346,9 @@ static int lo_open(struct inode *inode, lo-lo_refcnt++; mutex_unlock(lo-lo_ctl_mutex); + if (!loop_find_dev(lo-lo_number + 1)) + loop_init_one(lo-lo_number + 1); + return 0; } @@ -1357,8 +1376,6 @@ #endif /* * And now the modules code and kernel interface. 
*/ -module_param(max_loop, int, 0); -MODULE_PARM_DESC(max_loop, Maximum number of loop devices (1-256)); MODULE_LICENSE(GPL); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1383,7 +1400,7 @@ int loop_unregister_transfer(int number) xfer_funcs[n] = NULL; - for (lo = loop_dev[0]; lo loop_dev[max_loop]; lo++) { + list_for_each_entry(lo, loop_devices, lo_list) { mutex_lock(lo-lo_ctl_mutex); if (lo-lo_encryption == xfer) @@ -1398,102 +1415,104 @@ int loop_unregister_transfer(int number) EXPORT_SYMBOL(loop_register_transfer); EXPORT_SYMBOL(loop_unregister_transfer); -static int __init loop_init(void) +static struct loop_device *loop_init_one(int i) { - int i; + struct loop_device *lo; + struct gendisk *disk; - if (max_loop 1 || max_loop 256) { - printk(KERN_WARNING loop: invalid max_loop (must be between -1 and 256), using default (8)\n); - max_loop = 8; - } + lo = kzalloc(sizeof(*lo), GFP_KERNEL); + if (!lo) + goto out; - if (register_blkdev(LOOP_MAJOR, loop)) - return -EIO; + lo-lo_queue = blk_alloc_queue(GFP_KERNEL); + if (!lo-lo_queue) + goto out_free_dev
Re: [patch] remove artificial software max_loop limit
On 3/30/07, Ken Chen [EMAIL PROTECTED] wrote: Remove the artificial maximum of 256 loop devices that can be created due to a legacy device number limit. Searching through the lkml archive, there are several instances where users complained about the artificial limit that the loop driver imposes. There is no reason to have such a limit. This patch rids the limit entirely and makes loop device and associated block queue instantiation on-demand. With on-demand instantiation, it also gives the benefit of not wasting memory if these devices are not in use (compared to the current implementation that always creates 8 loop devices), a net improvement in both areas. This version is both tested with creation of a large number of loop devices and is compatible with existing losetup/mount user-land tools. Signed-off-by: Ken Chen [EMAIL PROTECTED] Spotted one bug: the sequence of loop device lookup, instantiation, and then insertion into the global loop_devices_list had better be in one critical section, otherwise an SMP race ensues. Fix that up and also use a mutex lock instead of a spin_lock for loop_devices_list. 
Signed-off-by: Ken Chen [EMAIL PROTECTED] diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 6b5b642..1e87aea 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -77,9 +77,8 @@ #include asm/uaccess.h -static int max_loop = 8; -static struct loop_device *loop_dev; -static struct gendisk **disks; +static LIST_HEAD(loop_devices); +static DEFINE_MUTEX(loop_devices_mutex); /* * Transfer functions @@ -183,7 +182,7 @@ figure_loop_size(struct loop_device *lo) if (unlikely((loff_t)x != size)) return -EFBIG; - set_capacity(disks[lo-lo_number], x); + set_capacity(lo-lo_disk, x); return 0; } @@ -812,7 +811,7 @@ static int loop_set_fd lo-lo_queue-queuedata = lo; lo-lo_queue-unplug_fn = loop_unplug; - set_capacity(disks[lo-lo_number], size); + set_capacity(lo-lo_disk, size); bd_set_size(bdev, size 9); set_blocksize(bdev, lo_blocksize); @@ -832,7 +831,7 @@ out_clr: lo-lo_device = NULL; lo-lo_backing_file = NULL; lo-lo_flags = 0; - set_capacity(disks[lo-lo_number], 0); + set_capacity(lo-lo_disk, 0); invalidate_bdev(bdev, 0); bd_set_size(bdev, 0); mapping_set_gfp_mask(mapping, lo-old_gfp_mask); @@ -918,7 +917,7 @@ static int loop_clr_fd memset(lo-lo_crypt_name, 0, LO_NAME_SIZE); memset(lo-lo_file_name, 0, LO_NAME_SIZE); invalidate_bdev(bdev, 0); - set_capacity(disks[lo-lo_number], 0); + set_capacity(lo-lo_disk, 0); bd_set_size(bdev, 0); mapping_set_gfp_mask(filp-f_mapping, gfp); lo-lo_state = Lo_unbound; @@ -1322,6 +1321,18 @@ static long lo_compat_ioctl } #endif +static struct loop_device *loop_find_dev(int number) +{ + struct loop_device *lo; + + list_for_each_entry(lo, loop_devices, lo_list) { + if (lo-lo_number == number) + return lo; + } + return NULL; +} + +static struct loop_device *loop_init_one(int i); static int lo_open(struct inode *inode, struct file *file) { struct loop_device *lo = inode-i_bdev-bd_disk-private_data; @@ -1330,6 +1341,11 @@ static int lo_open lo-lo_refcnt++; mutex_unlock(lo-lo_ctl_mutex); + mutex_lock(loop_device_mutex); + if 
(!loop_find_dev(lo-lo_number + 1)) + loop_init_one(lo-lo_number + 1); + mutex_unlock(loop_device_mutex); + return 0; } @@ -1357,8 +1373,6 @@ static struct block_device_operations lo_fops /* * And now the modules code and kernel interface. */ -module_param(max_loop, int, 0); -MODULE_PARM_DESC(max_loop, Maximum number of loop devices (1-256)); MODULE_LICENSE(GPL); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1383,7 +1397,7 @@ int loop_unregister_transfer(int number) xfer_funcs[n] = NULL; - for (lo = loop_dev[0]; lo loop_dev[max_loop]; lo++) { + list_for_each_entry(lo, loop_devices, lo_list) { mutex_lock(lo-lo_ctl_mutex); if (lo-lo_encryption == xfer) @@ -1398,102 +1412,106 @@ int loop_unregister_transfer(int number) EXPORT_SYMBOL(loop_register_transfer); EXPORT_SYMBOL(loop_unregister_transfer); -static int __init loop_init(void) +static struct loop_device *loop_init_one(int i) { - int i; + struct loop_device *lo; + struct gendisk *disk; - if (max_loop 1 || max_loop 256) { - printk(KERN_WARNING loop: invalid max_loop (must be between -1 and 256), using default (8)\n); - max_loop = 8; - } + lo = kzalloc(sizeof(*lo), GFP_KERNEL); + if (!lo) + goto out; - if (register_blkdev(LOOP_MAJOR, loop)) - return -EIO; + lo-lo_queue = blk_alloc_queue(GFP_KERNEL); + if (!lo-lo_queue) + goto
Re: [patch] remove artificial software max_loop limit
On 3/30/07, Jan Engelhardt [EMAIL PROTECTED] wrote: lo-lo_device = NULL; lo-lo_backing_file = NULL; lo-lo_flags = 0; - set_capacity(disks[lo-lo_number], 0); + set_capacity(lo-lo_disk, 0); invalidate_bdev(bdev, 0); bd_set_size(bdev, 0); mapping_set_gfp_mask(mapping, lo-old_gfp_mask); Welcome to Google Mail, we eat your whitespace :-) Oh, crap. Google mail is innocent. It was me who did some ugly copy-paste between apps. + sprintf(disk-disk_name, loop%d, i); Let's make this %u, no? I don't mind either way, this thing won't be bigger than 1^20 anyway. Oh, which reminds me that we probably should explicitly test and cap that limit (1^20 that is). +static struct kobject *loop_probe(dev_t dev, int *part, void *data) +{ + unsigned int number = dev MINORMASK; + struct loop_device *lo; - for (i = 0; i max_loop; i++) { - disks[i] = alloc_disk(1); - if (!disks[i]) - goto out_mem3; - } Could not we use kobject_uevent(), like I did (though perhaps without the k.oops), to make udev create a node? Well, I go with a version that doesn't panic kernel, which is a deciding factor here ;-) Full patch attached. 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 6b5b642..1e87aea 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -77,9 +77,8 @@ #include asm/uaccess.h -static int max_loop = 8; -static struct loop_device *loop_dev; -static struct gendisk **disks; +static LIST_HEAD(loop_devices); +static DEFINE_MUTEX(loop_devices_mutex); /* * Transfer functions @@ -183,7 +182,7 @@ figure_loop_size(struct loop_device *lo) if (unlikely((loff_t)x != size)) return -EFBIG; - set_capacity(disks[lo-lo_number], x); + set_capacity(lo-lo_disk, x); return 0; } @@ -812,7 +811,7 @@ static int loop_set_fd lo-lo_queue-queuedata = lo; lo-lo_queue-unplug_fn = loop_unplug; - set_capacity(disks[lo-lo_number], size); + set_capacity(lo-lo_disk, size); bd_set_size(bdev, size 9); set_blocksize(bdev, lo_blocksize); @@ -832,7 +831,7 @@ out_clr: lo-lo_device = NULL; lo-lo_backing_file = NULL; lo-lo_flags = 0; - set_capacity(disks[lo-lo_number], 0); + set_capacity(lo-lo_disk, 0); invalidate_bdev(bdev, 0); bd_set_size(bdev, 0); mapping_set_gfp_mask(mapping, lo-old_gfp_mask); @@ -918,7 +917,7 @@ static int loop_clr_fd memset(lo-lo_crypt_name, 0, LO_NAME_SIZE); memset(lo-lo_file_name, 0, LO_NAME_SIZE); invalidate_bdev(bdev, 0); - set_capacity(disks[lo-lo_number], 0); + set_capacity(lo-lo_disk, 0); bd_set_size(bdev, 0); mapping_set_gfp_mask(filp-f_mapping, gfp); lo-lo_state = Lo_unbound; @@ -1322,6 +1321,18 @@ static long lo_compat_ioctl } #endif +static struct loop_device *loop_find_dev(int number) +{ + struct loop_device *lo; + + list_for_each_entry(lo, loop_devices, lo_list) { + if (lo-lo_number == number) + return lo; + } + return NULL; +} + +static struct loop_device *loop_init_one(int i); static int lo_open(struct inode *inode, struct file *file) { struct loop_device *lo = inode-i_bdev-bd_disk-private_data; @@ -1330,6 +1341,11 @@ static int lo_open lo-lo_refcnt++; mutex_unlock(lo-lo_ctl_mutex); + mutex_lock(loop_device_mutex); + if (!loop_find_dev(lo-lo_number + 1)) + 
loop_init_one(lo-lo_number + 1); + mutex_unlock(loop_device_mutex); + return 0; } @@ -1357,8 +1373,6 @@ static struct block_device_operations lo_fops /* * And now the modules code and kernel interface. */ -module_param(max_loop, int, 0); -MODULE_PARM_DESC(max_loop, Maximum number of loop devices (1-256)); MODULE_LICENSE(GPL); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1383,7 +1397,7 @@ int loop_unregister_transfer(int number) xfer_funcs[n] = NULL; - for (lo = loop_dev[0]; lo loop_dev[max_loop]; lo++) { + list_for_each_entry(lo, loop_devices, lo_list) { mutex_lock(lo-lo_ctl_mutex); if (lo-lo_encryption == xfer) @@ -1398,102 +1412,106 @@ int loop_unregister_transfer(int number) EXPORT_SYMBOL(loop_register_transfer); EXPORT_SYMBOL(loop_unregister_transfer); -static int __init loop_init(void) +static struct loop_device *loop_init_one(int i) { - int i; + struct loop_device *lo; + struct gendisk *disk; - if (max_loop 1 || max_loop 256) { - printk(KERN_WARNING loop: invalid max_loop (must be between -1 and 256), using default (8)\n); - max_loop = 8; - } + lo = kzalloc(sizeof(*lo), GFP_KERNEL); + if (!lo) + goto out; - if (register_blkdev(LOOP_MAJOR, loop)) - return -EIO; +
Re: [patch] remove artificial software max_loop limit
On 3/30/07, Andrew Morton [EMAIL PROTECTED] wrote: So.. this change will cause a fatal error for anyone who is presently using max_loop, won't it? If they're doing that within their initramfs/initrd/etc then things could get rather ugly for them. Probably, if they access loop devices non-sequentially. I don't know how much of a problem this will be in practice - do people use max_loop much? I don't know either. btw, did you test this change as both a module and as linked-into-vmlinux? As linked-into-vmlinux. Why do you ask? Does it break if it is a module? I made a last-minute change to a mutex name and shamefully posted without doing a compile test. Besides that, is there anything else that breaks? - Ken - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] cache pipe buf page address for non-highmem arch
On 27 Mar 2007 17:01:01 +0200, Andi Kleen <[EMAIL PROTECTED]> wrote: What is the problem? You have cache misses on the the hash table or are the instructions really an issue on a modern CPU? It was page_to_pfn() on numa system. I think it's cache misses on per node pgdat lookup with node index calculation on each pipe buffer page. e.g. i out of lined virt_to_page to save space, but it could be probably inlined again if it was severly time critical. That would be a problem too, though not showing up in the pipe code path. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] cache pipe buf page address for non-highmem arch
On 3/26/07, Andrew Morton <[EMAIL PROTECTED]> wrote: LTP's vmsplice01 triggers the below: Unable to handle kernel NULL pointer dereference at 0130 RIP: [] pipe_to_file+0x1f3/0x2a6 Ouch, generic_pipe_buf_map() and unmap is used by both pipe and splice, I better constrain all changes within fs/pipe.c because from splice path, the page->private is not initialized. Double ouch for missing test that code path. Here is an updated patch, tested with LTP and FIO. I will write more test cases to make sure all code path are covered. diff --git a/fs/pipe.c b/fs/pipe.c index ebafde7..4b55452 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -21,6 +21,21 @@ #include #include +#ifdef CONFIG_HIGHMEM +#define pipe_kmap kmap +#define pipe_kmap_atomic kmap_atomic +#define pipe_kunmapkunmap +#define pipe_kunmap_atomic kunmap_atomic +#else /* CONFIG_HIGHMEM */ +static inline void *pipe_kmap(struct page *page) +{ + return (void *) page->private; +} +#define pipe_kmap_atomic(page, type) pipe_kmap(page) +#define pipe_kunmap(page) do { } while (0) +#define pipe_kunmap_atomic(page, type) do { } while (0) +#endif + /* * We use a start+len construction, which provides full use of the * allocated memory. 
@@ -185,6 +200,27 @@ void generic_pipe_buf_unmap kunmap(buf->page); } +static void *pipe_buf_map(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, int atomic) +{ + if (atomic) { + buf->flags |= PIPE_BUF_FLAG_ATOMIC; + return pipe_kmap_atomic(buf->page, KM_USER0); + } + + return pipe_kmap(buf->page); +} + +static void pipe_buf_unmap(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, void *map_data) +{ + if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { + buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; + pipe_kunmap_atomic(map_data, KM_USER0); + } else + pipe_kunmap(buf->page); +} + int generic_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -210,8 +246,8 @@ int generic_pipe_buf_pin static const struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, + .map = pipe_buf_map, + .unmap = pipe_buf_unmap, .pin = generic_pipe_buf_pin, .release = anon_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -316,6 +352,7 @@ redo: if (do_wakeup) { wake_up_interruptible_sync(>wait); kill_fasync(>fasync_writers, SIGIO, POLL_OUT); + do_wakeup = 0; } pipe_wait(pipe); } @@ -423,6 +460,8 @@ redo1: ret = ret ? : -ENOMEM; break; } + page->private = (unsigned long) + page_address(page); pipe->tmp_page = page; } /* Always wake up, even if the copy fails. 
Otherwise @@ -438,16 +477,16 @@ redo1: iov_fault_in_pages_read(iov, chars); redo2: if (atomic) - src = kmap_atomic(page, KM_USER0); + src = pipe_kmap_atomic(page, KM_USER0); else - src = kmap(page); + src = pipe_kmap(page); error = pipe_iov_copy_from_user(src, iov, chars, atomic); if (atomic) - kunmap_atomic(src, KM_USER0); + pipe_kunmap_atomic(src, KM_USER0); else - kunmap(page); + pipe_kunmap(page); if (unlikely(error)) { if (atomic) { - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] cache pipe buf page address for non-highmem arch
On 3/26/07, Andrew Morton [EMAIL PROTECTED] wrote: LTP's vmsplice01 triggers the below: Unable to handle kernel NULL pointer dereference at 0130 RIP: [8029e1b6] pipe_to_file+0x1f3/0x2a6 Ouch, generic_pipe_buf_map() and unmap is used by both pipe and splice, I better constrain all changes within fs/pipe.c because from splice path, the page-private is not initialized. Double ouch for missing test that code path. Here is an updated patch, tested with LTP and FIO. I will write more test cases to make sure all code path are covered. diff --git a/fs/pipe.c b/fs/pipe.c index ebafde7..4b55452 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -21,6 +21,21 @@ #include asm/uaccess.h #include asm/ioctls.h +#ifdef CONFIG_HIGHMEM +#define pipe_kmap kmap +#define pipe_kmap_atomic kmap_atomic +#define pipe_kunmapkunmap +#define pipe_kunmap_atomic kunmap_atomic +#else /* CONFIG_HIGHMEM */ +static inline void *pipe_kmap(struct page *page) +{ + return (void *) page-private; +} +#define pipe_kmap_atomic(page, type) pipe_kmap(page) +#define pipe_kunmap(page) do { } while (0) +#define pipe_kunmap_atomic(page, type) do { } while (0) +#endif + /* * We use a start+len construction, which provides full use of the * allocated memory. 
@@ -185,6 +200,27 @@ void generic_pipe_buf_unmap kunmap(buf-page); } +static void *pipe_buf_map(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, int atomic) +{ + if (atomic) { + buf-flags |= PIPE_BUF_FLAG_ATOMIC; + return pipe_kmap_atomic(buf-page, KM_USER0); + } + + return pipe_kmap(buf-page); +} + +static void pipe_buf_unmap(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, void *map_data) +{ + if (buf-flags PIPE_BUF_FLAG_ATOMIC) { + buf-flags = ~PIPE_BUF_FLAG_ATOMIC; + pipe_kunmap_atomic(map_data, KM_USER0); + } else + pipe_kunmap(buf-page); +} + int generic_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { @@ -210,8 +246,8 @@ int generic_pipe_buf_pin static const struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, - .map = generic_pipe_buf_map, - .unmap = generic_pipe_buf_unmap, + .map = pipe_buf_map, + .unmap = pipe_buf_unmap, .pin = generic_pipe_buf_pin, .release = anon_pipe_buf_release, .steal = generic_pipe_buf_steal, @@ -316,6 +352,7 @@ redo: if (do_wakeup) { wake_up_interruptible_sync(pipe-wait); kill_fasync(pipe-fasync_writers, SIGIO, POLL_OUT); + do_wakeup = 0; } pipe_wait(pipe); } @@ -423,6 +460,8 @@ redo1: ret = ret ? : -ENOMEM; break; } + page-private = (unsigned long) + page_address(page); pipe-tmp_page = page; } /* Always wake up, even if the copy fails. 
Otherwise @@ -438,16 +477,16 @@ redo1: iov_fault_in_pages_read(iov, chars); redo2: if (atomic) - src = kmap_atomic(page, KM_USER0); + src = pipe_kmap_atomic(page, KM_USER0); else - src = kmap(page); + src = pipe_kmap(page); error = pipe_iov_copy_from_user(src, iov, chars, atomic); if (atomic) - kunmap_atomic(src, KM_USER0); + pipe_kunmap_atomic(src, KM_USER0); else - kunmap(page); + pipe_kunmap(page); if (unlikely(error)) { if (atomic) { - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] cache pipe buf page address for non-highmem arch
On 27 Mar 2007 17:01:01 +0200, Andi Kleen [EMAIL PROTECTED] wrote: What is the problem? You have cache misses on the hash table or are the instructions really an issue on a modern CPU? It was page_to_pfn() on numa system. I think it's cache misses on per node pgdat lookup with node index calculation on each pipe buffer page. e.g. i out of lined virt_to_page to save space, but it could be probably inlined again if it was severely time critical. That would be a problem too, though not showing up in the pipe code path. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] max_loop limit
On 3/23/07, Jan Engelhardt <[EMAIL PROTECTED]> wrote: @@ -1383,7 +1380,7 @@ int loop_unregister_transfer(int number) xfer_funcs[n] = NULL; - for (lo = _dev[0]; lo < _dev[max_loop]; lo++) { + list_for_each_entry(lo, _devices, lo_list) { mutex_lock(>lo_ctl_mutex); Don't you need to use loop_devices_lock to protect the linked list here? +static struct loop_device *loop_find_dev(unsigned int number) +{ + struct loop_device *lo; + list_for_each_entry(lo, _devices, lo_list) + if (lo->lo_number == number) + return lo; + return NULL; Here too with spin lock?? - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] max_loop limit
On 3/23/07, Jan Engelhardt <[EMAIL PROTECTED]> wrote: Sadly, it locks up the foreground process (losetup that would be), and I have not yet figured out why. And the mpt regression elsewhere is hindering me in finding out faster. You need to tell the block layer that each loop device is a whole block device, not a partition within another device. Otherwise, I think it will cause a recursive mutex lock in block_dev.c:do_open(). This patch should fix the problem. Signed-off-by: Ken Chen <[EMAIL PROTECTED]> --- ./drivers/block/loop.c.orig 2007-03-24 17:05:51.0 -0700 +++ ./drivers/block/loop.c 2007-03-24 17:06:06.0 -0700 @@ -1464,6 +1464,7 @@ if ((lo = loop_find_dev(number)) == NULL) { lo = loop_init_one(number); + *part = 0; if (IS_ERR(lo)) return (void *)lo; } - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] rfc: introduce /dev/hugetlb
On 3/23/07, Andrew Morton <[EMAIL PROTECTED]> wrote: a) Ken observes that obtaining private hugetlb memory via hugetlbfs involves "fuss". b) the libhugetlbfs maintainers then go off and implement a no-fuss way of doing this. Hmm, what started this thread was libhugetlbfs maintainer complained how "fuss" it was to create private hugetlb mapping and suggested an even bigger kernel change with pagetable_operations API. The new API was designed with an end goal of introduce /dev/hugetlb (as one of the feature, they might be thinking more). What motivated me here is to point out that we can achieve the same goal of having a /dev/hugetlb with existing hugetlbfs infrastructure and the implementation is relatively straightforward. What it also buys us is a bit more flexibility to the end user who wants to use the interface directly. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] rfc: introduce /dev/hugetlb
On 3/23/07, Andrew Morton [EMAIL PROTECTED] wrote: a) Ken observes that obtaining private hugetlb memory via hugetlbfs involves fuss. b) the libhugetlbfs maintainers then go off and implement a no-fuss way of doing this. Hmm, what started this thread was libhugetlbfs maintainer complained how fuss it was to create private hugetlb mapping and suggested an even bigger kernel change with pagetable_operations API. The new API was designed with an end goal of introduce /dev/hugetlb (as one of the feature, they might be thinking more). What motivated me here is to point out that we can achieve the same goal of having a /dev/hugetlb with existing hugetlbfs infrastructure and the implementation is relatively straightforward. What it also buys us is a bit more flexibility to the end user who wants to use the interface directly. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] max_loop limit
On 3/23/07, Jan Engelhardt [EMAIL PROTECTED] wrote: Sadly, it locks up the foreground process (losetup that would be), and I have not yet figured out why. And the mpt regression elsewhere is hindering me in finding out faster. You need to tell the block layer that each loop device is a whole block device, not a partition within another device. Otherwise, I think it will cause a recursive mutex lock in block_dev.c:do_open(). This patch should fix the problem. Signed-off-by: Ken Chen [EMAIL PROTECTED] --- ./drivers/block/loop.c.orig 2007-03-24 17:05:51.0 -0700 +++ ./drivers/block/loop.c 2007-03-24 17:06:06.0 -0700 @@ -1464,6 +1464,7 @@ if ((lo = loop_find_dev(number)) == NULL) { lo = loop_init_one(number); + *part = 0; if (IS_ERR(lo)) return (void *)lo; } - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] max_loop limit
On 3/23/07, Jan Engelhardt [EMAIL PROTECTED] wrote: @@ -1383,7 +1380,7 @@ int loop_unregister_transfer(int number) xfer_funcs[n] = NULL; - for (lo = loop_dev[0]; lo loop_dev[max_loop]; lo++) { + list_for_each_entry(lo, loop_devices, lo_list) { mutex_lock(lo-lo_ctl_mutex); Don't you need to use loop_devices_lock to protect the linked list here? +static struct loop_device *loop_find_dev(unsigned int number) +{ + struct loop_device *lo; + list_for_each_entry(lo, loop_devices, lo_list) + if (lo-lo_number == number) + return lo; + return NULL; Here too with spin lock?? - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] cache pipe buf page address for non-highmem arch
On 3/23/07, Christoph Hellwig <[EMAIL PROTECTED]> wrote: I think you're fixing the symptom here and not the cause. If calculating the virtual address of a page is so expensive on your setup it should define WANT_PAGE_VIRTUAL and we should always cache the virtual address in struct page. There's a lot more code, especially in filesystems that's rather upset about a slow page_address. Adding WANT_PAGE_VIRTUAL would be too costly I think as it will expand the sizeof struct page. I would rather go incremental fix on these issues and not blindly increase page struct size. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] cache pipe buf page address for non-highmem arch
On 3/22/07, Andrew Morton <[EMAIL PROTECTED]> wrote: If we're going to do this then we should also implement pipe_kunmap_atomic(). Relying upon kunmap_atomic() internals like this is weird-looking, and is fragile against future changes to kunmap_atomic(). OK, I added the matching pipe_kunmap variants. Now that we have the matching pair, I made further optimization to make both pipe_kmap and pipe_kmap_atomic the same because pipe_kmap does not hold any critical resource that can't sleep. There is no reason to make them different. Full patch and changelog: It is really sad that we always call kmap and friends for every pipe buffer page on 64-bit arch that doesn't use HIGHMEM, or on configuration that doesn't turn on HIGHMEM. The effect of calling kmap* is visible in the execution profile when pipe code is being stressed. It is especially true on amd's x86-64 platform where kmap() has to traverse through numa node index calculation in order to convert struct page * to kernel virtual address. It is fairly pointless to perform that calculation repeatedly on system with no highmem (i.e., 64-bit arch like x86-64). This patch caches kernel pipe buffer page's kernel vaddr to speed up pipe buffer mapping functions. There is another suboptimal block in pipe_read() where wake_up is called twice. I think it was an oversight since in pipe_write(), it looks like it is doing the right thing. 
Signed-off-by: Ken Chen <[EMAIL PROTECTED]> diff --git a/fs/pipe.c b/fs/pipe.c index ebafde7..4cd1d68 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -21,6 +21,21 @@ #include #include #include +#ifdef CONFIG_HIGHMEM +#define pipe_kmap kmap +#define pipe_kmap_atomic kmap_atomic +#define pipe_kunmapkunmap +#define pipe_kunmap_atomic kunmap_atomic +#else /* CONFIG_HIGHMEM */ +static inline void *pipe_kmap(struct page *page) +{ + return (void *) page->private; +} +#define pipe_kmap_atomic(page, type) pipe_kmap(page) +#define pipe_kunmap(page) do { } while (0) +#define pipe_kunmap_atomic(page, type) do { } while (0) +#endif + /* * We use a start+len construction, which provides full use of the * allocated memory. @@ -169,10 +184,10 @@ void *generic_pipe_buf_map(struct pipe_i { if (atomic) { buf->flags |= PIPE_BUF_FLAG_ATOMIC; - return kmap_atomic(buf->page, KM_USER0); + return pipe_kmap_atomic(buf->page, KM_USER0); } - return kmap(buf->page); + return pipe_kmap(buf->page); } void generic_pipe_buf_unmap(struct pipe_inode_info *pipe, @@ -180,9 +195,9 @@ void generic_pipe_buf_unmap(struct pipe_ { if (buf->flags & PIPE_BUF_FLAG_ATOMIC) { buf->flags &= ~PIPE_BUF_FLAG_ATOMIC; - kunmap_atomic(map_data, KM_USER0); + pipe_kunmap_atomic(map_data, KM_USER0); } else - kunmap(buf->page); + pipe_kunmap(buf->page); } int generic_pipe_buf_steal(struct pipe_inode_info *pipe, @@ -316,6 +331,7 @@ redo: if (do_wakeup) { wake_up_interruptible_sync(>wait); kill_fasync(>fasync_writers, SIGIO, POLL_OUT); + do_wakeup = 0; } pipe_wait(pipe); } @@ -423,6 +439,8 @@ redo1: ret = ret ? : -ENOMEM; break; } + page->private = (unsigned long) + page_address(page); pipe->tmp_page = page; } /* Always wake up, even if the copy fails. 
Otherwise @@ -438,16 +456,16 @@ redo1: iov_fault_in_pages_read(iov, chars); redo2: if (atomic) - src = kmap_atomic(page, KM_USER0); + src = pipe_kmap_atomic(page, KM_USER0); else - src = kmap(page); + src = pipe_kmap(page); error = pipe_iov_copy_from_user(src, iov, chars, atomic); if (atomic) - kunmap_atomic(src, KM_USER0); + pipe_kunmap_atomic(src, KM_USER0); else - kunmap(page); + pipe_kunmap(page); if (unlikely(error)) { if (atomic) { - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch 1/2] hugetlb: add resv argument to hugetlb_file_setup
On 3/23/07, Nish Aravamudan <[EMAIL PROTECTED]> wrote: Comment needs updating too. Thanks. How could I miss that :-( updated patch: diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8c718a3..981886f 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -734,7 +734,7 @@ static int can_do_hugetlb_shm(void) can_do_mlock()); } -struct file *hugetlb_zero_setup(size_t size) +struct file *hugetlb_file_setup(size_t size, int resv) { int error = -ENOMEM; struct file *file; @@ -771,7 +771,7 @@ struct file *hugetlb_zero_setup(size_t s goto out_file; error = -ENOMEM; - if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT)) + if (resv && hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT)) goto out_inode; d_instantiate(dentry, inode); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3f3e7a6..55cccd8 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -163,7 +163,7 @@ static inline struct hugetlbfs_sb_info * extern const struct file_operations hugetlbfs_file_operations; extern struct vm_operations_struct hugetlb_vm_ops; -struct file *hugetlb_zero_setup(size_t); +struct file *hugetlb_file_setup(size_t, int); int hugetlb_get_quota(struct address_space *mapping); void hugetlb_put_quota(struct address_space *mapping); @@ -185,7 +185,7 @@ #else /* !CONFIG_HUGETLBFS */ #define is_file_hugepages(file) 0 #define set_file_hugepages(file)BUG() -#define hugetlb_zero_setup(size) ERR_PTR(-ENOSYS) +#define hugetlb_file_setup(size, resv) ERR_PTR(-ENOSYS) #endif /* !CONFIG_HUGETLBFS */ diff --git a/ipc/shm.c b/ipc/shm.c index 4fefbad..81c8344 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -365,8 +365,8 @@ static int newseg (struct ipc_namespace } if (shmflg & SHM_HUGETLB) { - /* hugetlb_zero_setup takes care of mlock user accounting */ - file = hugetlb_zero_setup(size); + /* hugetlb_file_setup takes care of mlock user accounting */ + file = hugetlb_file_setup(size, 1); shp->mlock_user = current->user; } else { int acctflag = 
VM_ACCOUNT; - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[patch 2/2] hugetlb: add /dev/hugetlb char device
add a char device /dev/hugetlb that behaves similar to /dev/zero, built on top of internal hugetlbfs mount. Signed-off-by: Ken Chen <[EMAIL PROTECTED]> diff -u b/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c --- b/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -795,6 +795,23 @@ return ERR_PTR(error); } +int hugetlb_zero_setup(struct file *file, struct vm_area_struct *vma) +{ + file = hugetlb_file_setup(vma->vm_end - vma->vm_start, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + + if (vma->vm_file) + fput(vma->vm_file); + vma->vm_file = file; + return hugetlbfs_file_mmap(file, vma); +} + +const struct file_operations hugetlb_dev_fops = { + .mmap = hugetlb_zero_setup, + .get_unmapped_area = hugetlb_get_unmapped_area, +}; + static int __init init_hugetlbfs_fs(void) { int error; diff -u b/include/linux/hugetlb.h b/include/linux/hugetlb.h --- b/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -162,6 +162,7 @@ } extern const struct file_operations hugetlbfs_file_operations; +extern const struct file_operations hugetlb_dev_fops; extern struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_file_setup(size_t, int); int hugetlb_get_quota(struct address_space *mapping); --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -27,6 +27,7 @@ #include #include #include #include +#include #include #include @@ -939,6 +940,12 @@ #ifdef CONFIG_CRASH_DUMP filp->f_op = _fops; break; #endif +#ifdef CONFIG_HUGETLBFS + case 13: + printk("open hugetlb dev device\n"); + filp->f_op = _dev_fops; + break; +#endif default: return -ENXIO; } @@ -971,6 +978,9 @@ #endif #ifdef CONFIG_CRASH_DUMP {12,"oldmem",S_IRUSR | S_IWUSR | S_IRGRP, _fops}, #endif +#ifdef CONFIG_HUGETLBFS + {13, "hugetlb",S_IRUGO | S_IWUGO, _dev_fops}, +#endif }; static struct class *mem_class; - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at 
http://www.tux.org/lkml/
[patch 1/2] hugetlb: add resv argument to hugetlb_file_setup
rename hugetlb_zero_setup() to hugetlb_file_setup() to better match function name convention like shmem implementation. Also add an argument to the function to indicate whether file setup should reserve hugetlb page upfront or not. Signed-off-by: Ken Chen <[EMAIL PROTECTED]> diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8c718a3..981886f 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -734,7 +734,7 @@ static int can_do_hugetlb_shm(void) can_do_mlock()); } -struct file *hugetlb_zero_setup(size_t size) +struct file *hugetlb_file_setup(size_t size, int resv) { int error = -ENOMEM; struct file *file; @@ -771,7 +771,7 @@ struct file *hugetlb_zero_setup(size_t s goto out_file; error = -ENOMEM; - if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT)) + if (resv && hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT)) goto out_inode; d_instantiate(dentry, inode); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3f3e7a6..55cccd8 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -163,7 +163,7 @@ static inline struct hugetlbfs_sb_info * extern const struct file_operations hugetlbfs_file_operations; extern struct vm_operations_struct hugetlb_vm_ops; -struct file *hugetlb_zero_setup(size_t); +struct file *hugetlb_file_setup(size_t, int); int hugetlb_get_quota(struct address_space *mapping); void hugetlb_put_quota(struct address_space *mapping); @@ -185,7 +185,7 @@ #else /* !CONFIG_HUGETLBFS */ #define is_file_hugepages(file) 0 #define set_file_hugepages(file)BUG() -#define hugetlb_zero_setup(size) ERR_PTR(-ENOSYS) +#define hugetlb_file_setup(size, resv) ERR_PTR(-ENOSYS) #endif /* !CONFIG_HUGETLBFS */ diff --git a/ipc/shm.c b/ipc/shm.c index 4fefbad..c64643f 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -366,7 +366,7 @@ static int newseg (struct ipc_namespace if (shmflg & SHM_HUGETLB) { /* hugetlb_zero_setup takes care of mlock user accounting */ - file = hugetlb_zero_setup(size); + file = 
hugetlb_file_setup(size, 1); shp->mlock_user = current->user; } else { int acctflag = VM_ACCOUNT; - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[patch 0/2] introduce /dev/hugetlb char device
Introduce /dev/hugetlb device that behaves similar to /dev/zero for allocating anonymous hugetlb page. It is especially beneficial that application developers can have an easy way to create MAP_PRIVATE hugetlb mappings without all the fuss about the hugetlbfs filesystem. Two follow-on patches have more detailed descriptions for each changeset. Signed-off-by: Ken Chen <[EMAIL PROTECTED]> - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] rfc: introduce /dev/hugetlb
On 3/23/07, William Lee Irwin III <[EMAIL PROTECTED]> wrote: On Fri, 23 Mar 2007, William Lee Irwin III wrote: >> Lack of compiletesting beyond x86-64 in all probability. On Fri, Mar 23, 2007 at 03:15:55PM +, Mel Gorman wrote: > Ok, this will go kablamo on Power then even if it compiles. I don't > consider it a fundamental problem though. For the purposes of an RFC, it's > grand and something that can be worked with. He needs to un-#ifdef the prototype (which he already does), but he needs to leave the definition under #ifdef while removing the static qualifier. A relatively minor fixup. Yes, sorry about that for lack of access to non-x86-64 machines. I needed to move the function prototype to hugetlb.h and evidently removed the #ifdef by mistake. I'm not going to touch this in my next clean up patch, instead I will just declare char specific file_operations struct in hugetlbfs and then have char device reference it. But nevertheless, hugetlb_get_unmapped_area function prototype better be in a header file somewhere. - Ken - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [patch] rfc: introduce /dev/hugetlb
On 3/23/07, William Lee Irwin III <[EMAIL PROTECTED]> wrote: I like this patch a lot, though I'm not likely to get around to testing it today. If userspace testcode is available that would be great to see posted so I can just boot into things and run that. Here is the test code that I used: (warning: x86 centric) #include #include #include #include #define SIZE(4*1024*1024UL) int main(void) { int fd; long i; char *addr; fd = open("/dev/hugetlb", O_RDWR); if (fd == -1) { perror("open failure"); exit(1); } addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); if (addr == MAP_FAILED) { perror("mmap failure"); exit(2); } for (i = 0; i < SIZE; i+=4096) addr[i] = 1; printf("success!\n"); } - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/