Re: [PATCH 1/3] zpool: extend API to match zsmalloc

2019-10-18 Thread Dan Streetman
On Thu, Oct 10, 2019 at 4:09 PM Vitaly Wool  wrote:
>
> This patch adds the following functions to the zpool API:
> - zpool_compact()
> - zpool_get_num_compacted()
> - zpool_huge_class_size()
>
> The first one triggers compaction for the underlying allocator, the
> second retrieves the number of pages migrated due to compaction for
> the whole time of this pool's existence and the third one returns
> the huge class size.
>
> This API extension is done to align zpool API with zsmalloc API.
>
> Signed-off-by: Vitaly Wool 

Seems reasonable to me.

Reviewed-by: Dan Streetman 

> ---
>  include/linux/zpool.h | 14 +-
>  mm/zpool.c| 36 
>  2 files changed, 49 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/zpool.h b/include/linux/zpool.h
> index 51bf43076165..31f0c1360569 100644
> --- a/include/linux/zpool.h
> +++ b/include/linux/zpool.h
> @@ -61,8 +61,13 @@ void *zpool_map_handle(struct zpool *pool, unsigned long 
> handle,
>
>  void zpool_unmap_handle(struct zpool *pool, unsigned long handle);
>
> +unsigned long zpool_compact(struct zpool *pool);
> +
> +unsigned long zpool_get_num_compacted(struct zpool *pool);
> +
>  u64 zpool_get_total_size(struct zpool *pool);
>
> +size_t zpool_huge_class_size(struct zpool *zpool);
>
>  /**
>   * struct zpool_driver - driver implementation for zpool
> @@ -75,7 +80,10 @@ u64 zpool_get_total_size(struct zpool *pool);
>   * @shrink:shrink the pool.
>   * @map:   map a handle.
>   * @unmap: unmap a handle.
> - * @total_size:get total size of a pool.
> + * @compact:   try to run compaction over a pool
> + * @get_num_compacted: get amount of compacted pages for a pool
> + * @total_size:get total size of a pool
> + * @huge_class_size: huge class threshold for pool pages.
>   *
>   * This is created by a zpool implementation and registered
>   * with zpool.
> @@ -104,7 +112,11 @@ struct zpool_driver {
> enum zpool_mapmode mm);
> void (*unmap)(void *pool, unsigned long handle);
>
> +   unsigned long (*compact)(void *pool);
> +   unsigned long (*get_num_compacted)(void *pool);
> +
> u64 (*total_size)(void *pool);
> +   size_t (*huge_class_size)(void *pool);
>  };
>
>  void zpool_register_driver(struct zpool_driver *driver);
> diff --git a/mm/zpool.c b/mm/zpool.c
> index 863669212070..55e69213c2eb 100644
> --- a/mm/zpool.c
> +++ b/mm/zpool.c
> @@ -362,6 +362,30 @@ void zpool_unmap_handle(struct zpool *zpool, unsigned 
> long handle)
> zpool->driver->unmap(zpool->pool, handle);
>  }
>
> + /**
> + * zpool_compact() - try to run compaction over zpool
> + * @pool   The zpool to compact
> + *
> + * Returns: the number of migrated pages
> + */
> +unsigned long zpool_compact(struct zpool *zpool)
> +{
> +   return zpool->driver->compact ? zpool->driver->compact(zpool->pool) : 
> 0;
> +}
> +
> +
> +/**
> + * zpool_get_num_compacted() - get the number of migrated/compacted pages
> + * @pool   The zpool to get compaction statistic for
> + *
> + * Returns: the total number of migrated pages for the pool
> + */
> +unsigned long zpool_get_num_compacted(struct zpool *zpool)
> +{
> +   return zpool->driver->get_num_compacted ?
> +   zpool->driver->get_num_compacted(zpool->pool) : 0;
> +}
> +
>  /**
>   * zpool_get_total_size() - The total size of the pool
>   * @zpool: The zpool to check
> @@ -375,6 +399,18 @@ u64 zpool_get_total_size(struct zpool *zpool)
> return zpool->driver->total_size(zpool->pool);
>  }
>
> +/**
> + * zpool_huge_class_size() - get size for the "huge" class
> + * @pool   The zpool to check
> + *
> + * Returns: size of the huge class
> + */
> +size_t zpool_huge_class_size(struct zpool *zpool)
> +{
> +   return zpool->driver->huge_class_size ?
> +   zpool->driver->huge_class_size(zpool->pool) : 0;
> +}
> +
>  /**
>   * zpool_evictable() - Test if zpool is potentially evictable
>   * @zpool: The zpool to test
> --
> 2.20.1


Re: [PATCH] zswap: allow setting default status, compressor and allocator in Kconfig

2019-10-18 Thread Dan Streetman
On Fri, Oct 11, 2019 at 7:40 PM Maciej S. Szmigiero
 wrote:
>
> The compressed cache for swap pages (zswap) currently needs from 1 to 3
> extra kernel command line parameters in order to make it work: it has to be
> enabled by adding a "zswap.enabled=1" command line parameter and if one
> wants a different compressor or pool allocator than the default lzo / zbud
> combination then these choices also need to be specified on the kernel
> command line in additional parameters.
>
> Using a different compressor and allocator for zswap is actually pretty
> common as guides often recommend using the lz4 / z3fold pair instead of
> the default one.
> In such case it is also necessary to remember to enable the appropriate
> compression algorithm and pool allocator in the kernel config manually.
>
> Let's avoid the need for adding these kernel command line parameters and
> automatically pull in the dependencies for the selected compressor
> algorithm and pool allocator by adding an appropriate default switches to
> Kconfig.

Who is the target for using these kernel build-time defaults?  I don't
think any distribution would be defaulting zswap to enabled, and if
the config defaults are intended for personal kernel builds, is it
really so much harder to just configure it on the boot cmdline?

>
> The default values for these options match what the code was using
> previously as its defaults.
>
> Signed-off-by: Maciej S. Szmigiero 
> ---
>  mm/Kconfig | 103 -
>  mm/zswap.c |  26 --
>  2 files changed, 117 insertions(+), 12 deletions(-)
>
> diff --git a/mm/Kconfig b/mm/Kconfig
> index a5dae9a7eb51..4309bcaaa29d 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -525,7 +525,6 @@ config MEM_SOFT_DIRTY
>  config ZSWAP
> bool "Compressed cache for swap pages (EXPERIMENTAL)"
> depends on FRONTSWAP && CRYPTO=y
> -   select CRYPTO_LZO
> select ZPOOL
> help
>   A lightweight compressed cache for swap pages.  It takes
> @@ -541,6 +540,108 @@ config ZSWAP
>   they have not be fully explored on the large set of potential
>   configurations and workloads that exist.
>
> +choice

Using choice becomes a bit of a maintenance issue...if we add this,
wouldn't it be better to use string input so new compression algs can
be added without having to update this Kconfig?

> +   prompt "Compressed cache for swap pages default compressor"
> +   depends on ZSWAP
> +   default ZSWAP_DEFAULT_COMP_LZO
> +   help
> + Selects the default compression algorithm for the compressed cache
> + for swap pages.
> + If in doubt, select 'LZO'.
> +
> + The selection made here can be overridden by using the kernel
> + command line 'zswap.compressor=' option.
> +
> +config ZSWAP_DEFAULT_COMP_DEFLATE
> +   bool "Deflate"
> +   select CRYPTO_DEFLATE
> +   help
> + Use the Deflate algorithm as the default compression algorithm.
> +
> +config ZSWAP_DEFAULT_COMP_LZO
> +   bool "LZO"
> +   select CRYPTO_LZO
> +   help
> + Use the LZO algorithm as the default compression algorithm.
> +
> +config ZSWAP_DEFAULT_COMP_842
> +   bool "842"
> +   select CRYPTO_842
> +   help
> + Use the 842 algorithm as the default compression algorithm.
> +
> +config ZSWAP_DEFAULT_COMP_LZ4
> +   bool "LZ4"
> +   select CRYPTO_LZ4
> +   help
> + Use the LZ4 algorithm as the default compression algorithm.
> +
> +config ZSWAP_DEFAULT_COMP_LZ4HC
> +   bool "LZ4HC"
> +   select CRYPTO_LZ4HC
> +   help
> + Use the LZ4HC algorithm as the default compression algorithm.
> +
> +config ZSWAP_DEFAULT_COMP_ZSTD
> +   bool "zstd"
> +   select CRYPTO_ZSTD
> +   help
> + Use the zstd algorithm as the default compression algorithm.
> +endchoice
> +
> +config ZSWAP_DEFAULT_COMP_NAME
> +   string
> +   default "deflate" if ZSWAP_DEFAULT_COMP_DEFLATE
> +   default "lzo" if ZSWAP_DEFAULT_COMP_LZO
> +   default "842" if ZSWAP_DEFAULT_COMP_842
> +   default "lz4" if ZSWAP_DEFAULT_COMP_LZ4
> +   default "lz4hc" if ZSWAP_DEFAULT_COMP_LZ4HC
> +   default "zstd" if ZSWAP_DEFAULT_COMP_ZSTD
> +   default ""
> +
> +choice
> +   prompt "Compressed cache for swap pages default allocator"
> +   depends on ZSWAP
> +   default ZSWAP_DEFAULT_ZPOOL_ZBUD
> +   help
> + Selects the default allocator for the compressed cache for
> + swap pages.
> + The default is 'zbud' for compatibility, however please do
> + read the description of each of the allocators below before
> + making a right choice.
> +
> + The selection made here can be overridden by using the kernel
> + command line 'zswap.zpool=' option.
> +
> +config ZSWAP_DEFAULT_ZPOOL_ZBUD
> +   bool "zbud"
> +   select ZBUD
> +   help
> + Use the zbud allocator 

Re: [RFC v4] zswap: Add CONFIG_ZSWAP_IO_SWITCH to handle swap IO issue

2019-10-10 Thread Dan Streetman
On Tue, Oct 8, 2019 at 4:07 AM Hui Zhu  wrote:
>
> This is the fourth version of this patch.  The perious versions
> are in [1], [2] and [3].
>
> The parameters read_in_flight_limit and write_in_flight_limit were
> replaced by io_switch_enabled_enabled in this verion to make this
> function more clear.
>
> Currently, I use a VM that has 1 CPU, 4G memory and 4G swap file.
> I found that swap will affect the IO performance when it is running.
> So I open zswap to handle it because it just use CPU cycles but not
> disk IO.
>
> It works OK but I found that zswap is slower than normal swap in this
> VM.  zswap is about 300M/s and normal swap is about 500M/s. (The reason
> is the swap disk device config is "cache=none,aio=native".)
> So enabling zswap makes memory shrinking slower but is good for IO performance
> in this VM.
> So I just want zswap work when the disk of the swap file is under high
> IO load.

I'm still not excited about this, I feel like this will only be useful
in situations where zswap probably wasn't a good idea in the first
place.  And I'm still not clear on why you're using zswap *at all*, if
your disk I/O is faster than zswap can compress pages - you clearly
should have zswap disabled, if that's the case.

Can you run some more tests and make sure this param really, actually,
helps your workload?  Please also check that you aren't filling up
zswap as well; if your problem is you're filling up zswap and that's
when you want to divert more pages into swap, then I think that would
be much better handled by adding hysteresis logic instead of checking
the swap device io load.

>
> This commit is designed for this idea.
> When this function is enabled by the swap parameter
> io_switch_enabled_enabled, zswap will just work when the swap disk has
> outstanding I/O requests.
>
> [1] https://lkml.org/lkml/2019/9/11/935
> [2] https://lkml.org/lkml/2019/9/20/90
> [3] https://lkml.org/lkml/2019/9/22/927
>
> Signed-off-by: Hui Zhu 
> ---
>  include/linux/swap.h |  3 +++
>  mm/Kconfig   | 14 ++
>  mm/page_io.c | 16 
>  mm/zswap.c   | 25 +
>  4 files changed, 58 insertions(+)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index de2c67a..82b621f 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -389,6 +389,9 @@ extern void end_swap_bio_write(struct bio *bio);
>  extern int __swap_writepage(struct page *page, struct writeback_control *wbc,
> bio_end_io_t end_write_func);
>  extern int swap_set_page_dirty(struct page *page);
> +#ifdef CONFIG_ZSWAP_IO_SWITCH
> +extern void swap_io_in_flight(struct page *page, unsigned int inflight[2]);
> +#endif
>
>  int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
> unsigned long nr_pages, sector_t start_block);
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 56cec63..f5740e3 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -546,6 +546,20 @@ config ZSWAP
>   they have not be fully explored on the large set of potential
>   configurations and workloads that exist.
>
> +config ZSWAP_IO_SWITCH

let's drop this, if we're going to add this I don't think we need both
a build time and runtime switch.  Just defaulting the runtime switch
to off should be fine.

> +   bool "Compressed cache for swap pages according to the IO status"
> +   depends on ZSWAP
> +   help
> + This function helps the system that normal swap speed is higher
> + than zswap speed to handle the swap IO issue.
> + For example, a VM where the swap disk device with config
> + "cache=none,aio=native".
> +
> + When this function is enabled by the swap parameter
> + io_switch_enabled_enabled, zswap will just work when the swap disk
> + has outstanding I/O requests.

I think this doc should go into Documentation/vm/zswap.rst instead, please.

> + If unsure, say "n".
> +
>  config ZPOOL
> tristate "Common API for compressed memory storage"
> help
> diff --git a/mm/page_io.c b/mm/page_io.c
> index 24ee600..e66b050 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -434,3 +434,19 @@ int swap_set_page_dirty(struct page *page)
> return __set_page_dirty_no_writeback(page);
> }
>  }
> +
> +#ifdef CONFIG_ZSWAP_IO_SWITCH
> +void swap_io_in_flight(struct page *page, unsigned int inflight[2])
> +{
> +   struct swap_info_struct *sis = page_swap_info(page);
> +
> +   if (!sis->bdev) {
> +   inflight[0] = 0;
> +   inflight[1] = 0;

I'm not quite sure when a swap_info_struct won't have a bdev (looks
like if it's neither ISBLK nor ISREG, and I'm not sure what's left
after those), but if you set both to 0 that means it will effectively
disable zswap completely for this swap device, writing all pages to
it.  Is that really the right thing to do?

> +   return;
> +   }
> +
> +   

Re: [PATCH] powerpc/vio: use simple dummy struct device as bus parent

2019-10-03 Thread Dan Streetman
On Fri, Sep 27, 2019 at 2:19 PM Greg Kroah-Hartman
 wrote:
>
> On Fri, Sep 27, 2019 at 09:04:02AM -0400, Dan Streetman wrote:
> > The dummy vio_bus_device creates the /sys/devices/vio directory, which
> > contains real vio devices under it; since it represents itself as having
> > a bus = _bus_type, its /sys/devices/vio/uevent does call the bus's
> > .uevent function, vio_hotplug(), and as that function won't find a real
> > device for the dummy vio_dev, it will return -ENODEV.
> >
> > One of the main users of the uevent node is udevadm, e.g. when it is called
> > with 'udevadm trigger --devices'.  Up until recently, it would ignore any
> > errors returned when writing to devices' uevent file, but it was recently
> > changed to start returning error if it gets an error writing to any uevent
> > file:
> > https://github.com/systemd/systemd/commit/97afc0351a96e0daa83964df33937967c75c644f
> >
> > since the /sys/devices/vio/uevent file has always returned ENODEV from
> > any write to it, this now causes the udevadm trigger command to return
> > an error.  This may be fixed in udevadm to ignore ENODEV errors, but the
> > vio driver should still be fixed.
> >
> > This patch changes the arch/powerpc/platform/pseries/vio.c 'dummy'
> > parent device into a real dummy device with no .bus, so its uevent
> > file will stop returning ENODEV and simply do nothing and return 0.
> >
> > Signed-off-by: Dan Streetman 
> > ---
> >  arch/powerpc/platforms/pseries/vio.c | 11 ---
> >  1 file changed, 4 insertions(+), 7 deletions(-)
> >
> > diff --git a/arch/powerpc/platforms/pseries/vio.c 
> > b/arch/powerpc/platforms/pseries/vio.c
> > index 79e2287991db..63bc16631680 100644
> > --- a/arch/powerpc/platforms/pseries/vio.c
> > +++ b/arch/powerpc/platforms/pseries/vio.c
> > @@ -32,11 +32,8 @@
> >  #include 
> >  #include 
> >
> > -static struct vio_dev vio_bus_device  = { /* fake "parent" device */
> > - .name = "vio",
> > - .type = "",
> > - .dev.init_name = "vio",
> > - .dev.bus = _bus_type,
> > +static struct device vio_bus = {
> > + .init_name  = "vio",
>
> Eeek, no!  Why are you creating a static device that will then be
> reference counted?  Not nice :(

so, I looked again and it seems quite a few places appear to do
exactly this, is it something that should be fixed?

$ git grep 'static struct device [^*{]*{'
arch/arm/kernel/dma-isa.c:static struct device isa_dma_dev = {
arch/arm/mach-rpc/dma.c:static struct device isa_dma_dev = {
arch/arm/mach-s3c24xx/s3c2410.c:static struct device s3c2410_dev = {
arch/arm/mach-s3c24xx/s3c2412.c:static struct device s3c2412_dev = {
arch/arm/mach-s3c24xx/s3c2416.c:static struct device s3c2416_dev = {
arch/arm/mach-s3c24xx/s3c2440.c:static struct device s3c2440_dev = {
arch/arm/mach-s3c24xx/s3c2442.c:static struct device s3c2442_dev = {
arch/arm/mach-s3c24xx/s3c2443.c:static struct device s3c2443_dev = {
arch/arm/mach-s3c64xx/common.c:static struct device s3c64xx_dev = {
arch/arm/mach-s3c64xx/s3c6400.c:static struct device s3c6400_dev = {
arch/arm/mach-s3c64xx/s3c6410.c:static struct device s3c6410_dev = {
arch/mips/sgi-ip22/ip22-gio.c:static struct device gio_bus = {
arch/parisc/kernel/drivers.c:static struct device root = {
arch/powerpc/platforms/ps3/system-bus.c:static struct device ps3_system_bus = {
arch/powerpc/platforms/pseries/ibmebus.c:static struct device
ibmebus_bus_device = { /* fake "parent" device */
arch/powerpc/platforms/pseries/vio.c:static struct device vio_bus = {
arch/um/drivers/virtio_uml.c:static struct device vu_cmdline_parent = {
drivers/base/isa.c:static struct device isa_bus = {
drivers/block/rbd.c:static struct device rbd_root_dev = {
drivers/gpu/drm/ttm/ttm_module.c:static struct device ttm_drm_class_device = {
drivers/iio/dummy/iio_dummy_evgen.c:static struct device iio_evgen_dev = {
drivers/iio/trigger/iio-trig-sysfs.c:static struct device iio_sysfs_trig_dev = {
drivers/misc/sgi-gru/grumain.c:static struct device gru_device = {
drivers/nubus/bus.c:static struct device nubus_parent = {
drivers/sh/maple/maple.c:static struct device maple_bus = {
drivers/sh/superhyway/superhyway.c:static struct device
superhyway_bus_device = {
drivers/soc/fsl/qe/qe_ic.c:static struct device device_qe_ic = {
drivers/virtio/virtio_mmio.c:static struct device vm_cmdline_parent = {
kernel/time/clockevents.c:static struct device tick_bc_dev = {
kernel/time/clocksource.c:static struct device device_clocksource = {


>
> What's wrong with a simple call to device_create() for your "fake"
> device you want to make here?  That's what it is there for :)
>
> thanks,
>
> greg k-h


Re: [PATCH] powerpc/vio: use simple dummy struct device as bus parent

2019-09-28 Thread Dan Streetman
On Sat, Sep 28, 2019 at 3:41 AM Greg Kroah-Hartman
 wrote:
>
> On Fri, Sep 27, 2019 at 03:48:49PM -0400, Dan Streetman wrote:
> > On Fri, Sep 27, 2019 at 2:19 PM Greg Kroah-Hartman
> >  wrote:
> > >
> > > On Fri, Sep 27, 2019 at 09:04:02AM -0400, Dan Streetman wrote:
> > > > The dummy vio_bus_device creates the /sys/devices/vio directory, which
> > > > contains real vio devices under it; since it represents itself as having
> > > > a bus = _bus_type, its /sys/devices/vio/uevent does call the bus's
> > > > .uevent function, vio_hotplug(), and as that function won't find a real
> > > > device for the dummy vio_dev, it will return -ENODEV.
> > > >
> > > > One of the main users of the uevent node is udevadm, e.g. when it is 
> > > > called
> > > > with 'udevadm trigger --devices'.  Up until recently, it would ignore 
> > > > any
> > > > errors returned when writing to devices' uevent file, but it was 
> > > > recently
> > > > changed to start returning error if it gets an error writing to any 
> > > > uevent
> > > > file:
> > > > https://github.com/systemd/systemd/commit/97afc0351a96e0daa83964df33937967c75c644f
> > > >
> > > > since the /sys/devices/vio/uevent file has always returned ENODEV from
> > > > any write to it, this now causes the udevadm trigger command to return
> > > > an error.  This may be fixed in udevadm to ignore ENODEV errors, but the
> > > > vio driver should still be fixed.
> > > >
> > > > This patch changes the arch/powerpc/platform/pseries/vio.c 'dummy'
> > > > parent device into a real dummy device with no .bus, so its uevent
> > > > file will stop returning ENODEV and simply do nothing and return 0.
> > > >
> > > > Signed-off-by: Dan Streetman 
> > > > ---
> > > >  arch/powerpc/platforms/pseries/vio.c | 11 ---
> > > >  1 file changed, 4 insertions(+), 7 deletions(-)
> > > >
> > > > diff --git a/arch/powerpc/platforms/pseries/vio.c 
> > > > b/arch/powerpc/platforms/pseries/vio.c
> > > > index 79e2287991db..63bc16631680 100644
> > > > --- a/arch/powerpc/platforms/pseries/vio.c
> > > > +++ b/arch/powerpc/platforms/pseries/vio.c
> > > > @@ -32,11 +32,8 @@
> > > >  #include 
> > > >  #include 
> > > >
> > > > -static struct vio_dev vio_bus_device  = { /* fake "parent" device */
> > > > - .name = "vio",
> > > > - .type = "",
> > > > - .dev.init_name = "vio",
> > > > - .dev.bus = _bus_type,
> > > > +static struct device vio_bus = {
> > > > + .init_name  = "vio",
> > >
> > > Eeek, no!  Why are you creating a static device that will then be
> > > reference counted?  Not nice :(
> >
> > sorry!  I'll admit that I simply copied what drivers/base/platform.c
> > seemed to be doing.
>
> I don't see platform.c having a 'static struct device' anywhere in it,
> am I missing it in my searching?

no, you are right, what I meant was:

struct device platform_bus = {
.init_name  = "platform",
};


>
> thanks,
>
> greg k-h


Re: [PATCH] powerpc/vio: use simple dummy struct device as bus parent

2019-09-27 Thread Dan Streetman
On Fri, Sep 27, 2019 at 2:19 PM Greg Kroah-Hartman
 wrote:
>
> On Fri, Sep 27, 2019 at 09:04:02AM -0400, Dan Streetman wrote:
> > The dummy vio_bus_device creates the /sys/devices/vio directory, which
> > contains real vio devices under it; since it represents itself as having
> > a bus = _bus_type, its /sys/devices/vio/uevent does call the bus's
> > .uevent function, vio_hotplug(), and as that function won't find a real
> > device for the dummy vio_dev, it will return -ENODEV.
> >
> > One of the main users of the uevent node is udevadm, e.g. when it is called
> > with 'udevadm trigger --devices'.  Up until recently, it would ignore any
> > errors returned when writing to devices' uevent file, but it was recently
> > changed to start returning error if it gets an error writing to any uevent
> > file:
> > https://github.com/systemd/systemd/commit/97afc0351a96e0daa83964df33937967c75c644f
> >
> > since the /sys/devices/vio/uevent file has always returned ENODEV from
> > any write to it, this now causes the udevadm trigger command to return
> > an error.  This may be fixed in udevadm to ignore ENODEV errors, but the
> > vio driver should still be fixed.
> >
> > This patch changes the arch/powerpc/platform/pseries/vio.c 'dummy'
> > parent device into a real dummy device with no .bus, so its uevent
> > file will stop returning ENODEV and simply do nothing and return 0.
> >
> > Signed-off-by: Dan Streetman 
> > ---
> >  arch/powerpc/platforms/pseries/vio.c | 11 ---
> >  1 file changed, 4 insertions(+), 7 deletions(-)
> >
> > diff --git a/arch/powerpc/platforms/pseries/vio.c 
> > b/arch/powerpc/platforms/pseries/vio.c
> > index 79e2287991db..63bc16631680 100644
> > --- a/arch/powerpc/platforms/pseries/vio.c
> > +++ b/arch/powerpc/platforms/pseries/vio.c
> > @@ -32,11 +32,8 @@
> >  #include 
> >  #include 
> >
> > -static struct vio_dev vio_bus_device  = { /* fake "parent" device */
> > - .name = "vio",
> > - .type = "",
> > - .dev.init_name = "vio",
> > - .dev.bus = _bus_type,
> > +static struct device vio_bus = {
> > + .init_name  = "vio",
>
> Eeek, no!  Why are you creating a static device that will then be
> reference counted?  Not nice :(

sorry!  I'll admit that I simply copied what drivers/base/platform.c
seemed to be doing.

>
> What's wrong with a simple call to device_create() for your "fake"
> device you want to make here?  That's what it is there for :)

ack, will send a new patch using that.  thanks!

>
> thanks,
>
> greg k-h


[PATCH] powerpc/vio: use simple dummy struct device as bus parent

2019-09-27 Thread Dan Streetman
The dummy vio_bus_device creates the /sys/devices/vio directory, which
contains real vio devices under it; since it represents itself as having
a bus = _bus_type, its /sys/devices/vio/uevent does call the bus's
.uevent function, vio_hotplug(), and as that function won't find a real
device for the dummy vio_dev, it will return -ENODEV.

One of the main users of the uevent node is udevadm, e.g. when it is called
with 'udevadm trigger --devices'.  Up until recently, it would ignore any
errors returned when writing to devices' uevent file, but it was recently
changed to start returning error if it gets an error writing to any uevent
file:
https://github.com/systemd/systemd/commit/97afc0351a96e0daa83964df33937967c75c644f

since the /sys/devices/vio/uevent file has always returned ENODEV from
any write to it, this now causes the udevadm trigger command to return
an error.  This may be fixed in udevadm to ignore ENODEV errors, but the
vio driver should still be fixed.

This patch changes the arch/powerpc/platform/pseries/vio.c 'dummy'
parent device into a real dummy device with no .bus, so its uevent
file will stop returning ENODEV and simply do nothing and return 0.

Signed-off-by: Dan Streetman 
---
 arch/powerpc/platforms/pseries/vio.c | 11 ---
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/vio.c 
b/arch/powerpc/platforms/pseries/vio.c
index 79e2287991db..63bc16631680 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -32,11 +32,8 @@
 #include 
 #include 
 
-static struct vio_dev vio_bus_device  = { /* fake "parent" device */
-   .name = "vio",
-   .type = "",
-   .dev.init_name = "vio",
-   .dev.bus = _bus_type,
+static struct device vio_bus = {
+   .init_name  = "vio",
 };
 
 #ifdef CONFIG_PPC_SMLPAR
@@ -1412,7 +1409,7 @@ struct vio_dev *vio_register_device_node(struct 
device_node *of_node)
set_dev_node(>dev, of_node_to_nid(of_node));
 
/* init generic 'struct device' fields: */
-   viodev->dev.parent = _bus_device.dev;
+   viodev->dev.parent = _bus;
viodev->dev.bus = _bus_type;
viodev->dev.release = vio_dev_release;
 
@@ -1499,7 +1496,7 @@ static int __init vio_bus_init(void)
 * The fake parent of all vio devices, just to give us
 * a nice directory
 */
-   err = device_register(&vio_bus_device.dev);
+   err = device_register(&vio_bus);
if (err) {
printk(KERN_WARNING "%s: device_register returned %i\n",
__func__, err);
-- 
2.20.1



Re: [RFC v3] zswap: Add CONFIG_ZSWAP_IO_SWITCH to handle swap IO issue

2019-09-26 Thread Dan Streetman
On Mon, Sep 23, 2019 at 4:14 PM Dan Streetman  wrote:
>
> On Sun, Sep 22, 2019 at 11:32 PM Hui Zhu  wrote:
> >
> > This is the third version of this patch.  The first and second version
> > is in [1] and [2].
> > This verion is updated according to the comments from Randy Dunlap
> > in [3].
> >
> > Currently, I use a VM that has 2 CPUs, 4G memory and 4G swap file.
> > I found that swap will affect the IO performance when it is running.
> > So I open zswap to handle it because it just use CPU cycles but not
> > disk IO.
> >
> > It work OK but I found that zswap is slower than normal swap in this
> > VM.  zswap is about 300M/s and normal swap is about 500M/s. (The reason
> > is disk inside VM has fscache in host machine.)
>
> I must be missing something here - if zswap in the guest is *slower*
> than real swap, why are you using zswap?
>
> Also, I don't see why zswap is slower than normal swap, unless you
> mean that your zswap is full, since once zswap fills up any additional
> swap will absolutely be slower than not having zswap at all.
>
> > So open zswap is make memory shrinker slower but good for IO performance
> > in this VM.
> > So I just want zswap work when the disk of the swap file is under high
> > IO load.
> >
> > This commit is designed for this idea.
> > It add two parameters read_in_flight_limit and write_in_flight_limit to
> > zswap.
> > In zswap_frontswap_store, pages will be stored to zswap only when
> > the IO in flight number of swap device is bigger than
> > zswap_read_in_flight_limit or zswap_write_in_flight_limit
> > when zswap is enabled.
> > Then the zswap just work when the IO in flight number of swap device
> > is low.
>
> Ok, so maybe I understand what you mean, your disk I/O is normally
> very fast, but once your host-side cache is full it starts actually
> writing to your host physical disk, and your guest swap I/O drops way
> down (since caching pages in host memory is much faster than writing
> to a host physical disk).  Is that what's going on?  That was not
> clear at all to me from the commit description...
>
> In general I think the description of this commit, as well as the docs
> and even user interface of how to use it, is very confusing.  I can
> see how it would be beneficial in this specific situation, but I'm not
> a fan of the implementation, and I'm very concerned that nobody will
> be able to understand how to use it properly - when should they enable
> it?  What limit values should they use?  Why are there separate read
> and write limits?  None of that is clear to me, and I'm fairly
> certain it would not be clear to other normal users.
>
> Is there a better way this can be done?
>
> >
> > [1] https://lkml.org/lkml/2019/9/11/935
> > [2] https://lkml.org/lkml/2019/9/20/90
> > [3] https://lkml.org/lkml/2019/9/20/1076
> >
> > Signed-off-by: Hui Zhu 

Nacked-by: Dan Streetman 

due to my concerns that I emailed before


> > ---
> >  include/linux/swap.h |  3 +++
> >  mm/Kconfig   | 18 
> >  mm/page_io.c | 16 +++
> >  mm/zswap.c   | 58 
> > 
> >  4 files changed, 95 insertions(+)
> >
> > diff --git a/include/linux/swap.h b/include/linux/swap.h
> > index de2c67a..82b621f 100644
> > --- a/include/linux/swap.h
> > +++ b/include/linux/swap.h
> > @@ -389,6 +389,9 @@ extern void end_swap_bio_write(struct bio *bio);
> >  extern int __swap_writepage(struct page *page, struct writeback_control 
> > *wbc,
> > bio_end_io_t end_write_func);
> >  extern int swap_set_page_dirty(struct page *page);
> > +#ifdef CONFIG_ZSWAP_IO_SWITCH
> > +extern void swap_io_in_flight(struct page *page, unsigned int inflight[2]);
> > +#endif
> >
> >  int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
> > unsigned long nr_pages, sector_t start_block);
> > diff --git a/mm/Kconfig b/mm/Kconfig
> > index 56cec63..387c3b5 100644
> > --- a/mm/Kconfig
> > +++ b/mm/Kconfig
> > @@ -546,6 +546,24 @@ config ZSWAP
> >   they have not be fully explored on the large set of potential
> >   configurations and workloads that exist.
> >
> > +config ZSWAP_IO_SWITCH
> > +   bool "Compressed cache for swap pages according to the IO status"
> > +   depends on ZSWAP
> > +   help
> > + This function helps the system that normal swap speed is higher
> > + than zswap speed to handle the swap IO issue.
> >

Re: [RFC v3] zswap: Add CONFIG_ZSWAP_IO_SWITCH to handle swap IO issue

2019-09-23 Thread Dan Streetman
On Sun, Sep 22, 2019 at 11:32 PM Hui Zhu  wrote:
>
> This is the third version of this patch.  The first and second version
> is in [1] and [2].
> This version is updated according to the comments from Randy Dunlap
> in [3].
>
> Currently, I use a VM that has 2 CPUs, 4G memory and 4G swap file.
> I found that swap will affect the IO performance when it is running.
> So I open zswap to handle it because it just use CPU cycles but not
> disk IO.
>
> It work OK but I found that zswap is slower than normal swap in this
> VM.  zswap is about 300M/s and normal swap is about 500M/s. (The reason
> is disk inside VM has fscache in host machine.)

I must be missing something here - if zswap in the guest is *slower*
than real swap, why are you using zswap?

Also, I don't see why zswap is slower than normal swap, unless you
mean that your zswap is full, since once zswap fills up any additional
swap will absolutely be slower than not having zswap at all.

> So open zswap is make memory shrinker slower but good for IO performance
> in this VM.
> So I just want zswap work when the disk of the swap file is under high
> IO load.
>
> This commit is designed for this idea.
> It add two parameters read_in_flight_limit and write_in_flight_limit to
> zswap.
> In zswap_frontswap_store, pages will be stored to zswap only when
> the IO in flight number of swap device is bigger than
> zswap_read_in_flight_limit or zswap_write_in_flight_limit
> when zswap is enabled.
> Then the zswap just work when the IO in flight number of swap device
> is low.

Ok, so maybe I understand what you mean, your disk I/O is normally
very fast, but once your host-side cache is full it starts actually
writing to your host physical disk, and your guest swap I/O drops way
down (since caching pages in host memory is much faster than writing
to a host physical disk).  Is that what's going on?  That was not
clear at all to me from the commit description...

In general I think the description of this commit, as well as the docs
and even user interface of how to use it, is very confusing.  I can
see how it would be beneficial in this specific situation, but I'm not
a fan of the implementation, and I'm very concerned that nobody will
be able to understand how to use it properly - when should they enable
it?  What limit values should they use?  Why are there separate read
and write limits?  None of that is clear to me, and I'm fairly
certain it would not be clear to other normal users.

Is there a better way this can be done?

>
> [1] https://lkml.org/lkml/2019/9/11/935
> [2] https://lkml.org/lkml/2019/9/20/90
> [3] https://lkml.org/lkml/2019/9/20/1076
>
> Signed-off-by: Hui Zhu 
> ---
>  include/linux/swap.h |  3 +++
>  mm/Kconfig   | 18 
>  mm/page_io.c | 16 +++
>  mm/zswap.c   | 58 
> 
>  4 files changed, 95 insertions(+)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index de2c67a..82b621f 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -389,6 +389,9 @@ extern void end_swap_bio_write(struct bio *bio);
>  extern int __swap_writepage(struct page *page, struct writeback_control *wbc,
> bio_end_io_t end_write_func);
>  extern int swap_set_page_dirty(struct page *page);
> +#ifdef CONFIG_ZSWAP_IO_SWITCH
> +extern void swap_io_in_flight(struct page *page, unsigned int inflight[2]);
> +#endif
>
>  int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
> unsigned long nr_pages, sector_t start_block);
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 56cec63..387c3b5 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -546,6 +546,24 @@ config ZSWAP
>   they have not be fully explored on the large set of potential
>   configurations and workloads that exist.
>
> +config ZSWAP_IO_SWITCH
> +   bool "Compressed cache for swap pages according to the IO status"
> +   depends on ZSWAP
> +   help
> + This function helps the system that normal swap speed is higher
> + than zswap speed to handle the swap IO issue.
> + For example, a VM where the disk device is not set cache config or
> + set cache=writeback.
> +
> + This function makes zswap just work when the disk of the swap file
> + is under high IO load.
> + It add two parameters (read_in_flight_limit and
> + write_in_flight_limit) to zswap.  When zswap is enabled, pages will
> + be stored to zswap only when the IO in flight number of swap device
> + is bigger than zswap_read_in_flight_limit or
> + zswap_write_in_flight_limit.
> + If unsure, say "n".
> +
>  config ZPOOL
> tristate "Common API for compressed memory storage"
> help
> diff --git a/mm/page_io.c b/mm/page_io.c
> index 24ee600..e66b050 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -434,3 +434,19 @@ int swap_set_page_dirty(struct page 

Re: [PATCH/RFC] zswap: do not map same object twice

2019-09-18 Thread Dan Streetman
On Sun, Sep 15, 2019 at 5:46 PM Vitaly Wool  wrote:
>
> zswap_writeback_entry() maps a handle to read swpentry first, and
> then in the most common case it would map the same handle again.
> This is ok when zbud is the backend since its mapping callback is
> plain and simple, but it slows things down for z3fold.
>
> Since there's hardly a point in unmapping a handle _that_ fast as
> zswap_writeback_entry() does when it reads swpentry, the
> suggestion is to keep the handle mapped till the end.

LGTM

>
> Signed-off-by: Vitaly Wool 

Reviewed-by: Dan Streetman 

> ---
>  mm/zswap.c | 7 +++
>  1 file changed, 3 insertions(+), 4 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 0e22744a76cb..b35464bc7315 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -856,7 +856,6 @@ static int zswap_writeback_entry(struct zpool *pool, 
> unsigned long handle)
> /* extract swpentry from data */
> zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
> swpentry = zhdr->swpentry; /* here */
> -   zpool_unmap_handle(pool, handle);
> tree = zswap_trees[swp_type(swpentry)];
> offset = swp_offset(swpentry);
>
> @@ -866,6 +865,7 @@ static int zswap_writeback_entry(struct zpool *pool, 
> unsigned long handle)
> if (!entry) {
> /* entry was invalidated */
> spin_unlock(&tree->lock);
> +   zpool_unmap_handle(pool, handle);
> return 0;
> }
> spin_unlock(&tree->lock);
> @@ -886,15 +886,13 @@ static int zswap_writeback_entry(struct zpool *pool, 
> unsigned long handle)
> case ZSWAP_SWAPCACHE_NEW: /* page is locked */
> /* decompress */
> dlen = PAGE_SIZE;
> -   src = (u8 *)zpool_map_handle(entry->pool->zpool, 
> entry->handle,
> -   ZPOOL_MM_RO) + sizeof(struct zswap_header);
> +   src = (u8 *)zhdr + sizeof(struct zswap_header);
> dst = kmap_atomic(page);
> tfm = *get_cpu_ptr(entry->pool->tfm);
> ret = crypto_comp_decompress(tfm, src, entry->length,
>  dst, &dlen);
> put_cpu_ptr(entry->pool->tfm);
> kunmap_atomic(dst);
> -   zpool_unmap_handle(entry->pool->zpool, entry->handle);
> BUG_ON(ret);
> BUG_ON(dlen != PAGE_SIZE);
>
> @@ -940,6 +938,7 @@ static int zswap_writeback_entry(struct zpool *pool, 
> unsigned long handle)
> spin_unlock(&tree->lock);
>
>  end:
> +   zpool_unmap_handle(pool, handle);
> return ret;
>  }
>
> --
> 2.17.1


Re: [PATCH] zswap: Add CONFIG_ZSWAP_IO_SWITCH

2019-09-13 Thread Dan Streetman
On Wed, Sep 11, 2019 at 11:22 PM Hui Zhu  wrote:
>
> I use zswap to handle the swap IO issue in a VM that uses a swap file.
> This VM has 4G memory and 2 CPUs.  And I set up 4G swap in /swapfile.
> This is test script:
> cat 1.sh
> ./usemem --sleep 3600 -M -a -n 1 $((3 * 1024 * 1024 * 1024)) &
> sleep 10
> echo 1 > /proc/sys/vm/drop_caches
> ./usemem -S -f /test2 $((2 * 1024 * 1024 * 1024)) &
> while [ True ]; do ./usemem -a -n 1 $((1 * 1024 * 1024 * 1024)); done
>
> Without ZSWAP:
> echo 100 > /proc/sys/vm/swappiness
> swapon /swapfile
> sh 1.sh
> ...
> ...
> 1207959552 bytes / 2076479 usecs = 568100 KB/s
> 61088 usecs to free memory
> 1207959552 bytes / 2035439 usecs = 579554 KB/s
> 55073 usecs to free memory
> 2415919104 bytes / 24054408 usecs = 98081 KB/s
> 3741 usecs to free memory
> 1207959552 bytes / 1954371 usecs = 603594 KB/s
> 53161 usecs to free memory
> ...
> ...
>
> With ZSWAP:
> echo 100 > /proc/sys/vm/swappiness
> swapon /swapfile
> echo lz4 > /sys/module/zswap/parameters/compressor
> echo zsmalloc > /sys/module/zswap/parameters/zpool
> echo 0 > /sys/module/zswap/parameters/same_filled_pages_enabled
> echo 20 > /sys/module/zswap/parameters/max_pool_percent
> echo 1 > /sys/module/zswap/parameters/enabled
> sh 1.sh
> 1207959552 bytes / 3619283 usecs = 325934 KB/s
> 194825 usecs to free memory
> 1207959552 bytes / 3439563 usecs = 342964 KB/s
> 218419 usecs to free memory
> 2415919104 bytes / 19508762 usecs = 120935 KB/s
> 5632 usecs to free memory
> 1207959552 bytes / 3329369 usecs = 354315 KB/s
> 179764 usecs to free memory
>
> The normal io speed is increased from 98081 KB/s to 120935 KB/s.
> But I found 2 issues of zswap in this machine:
> 1. Because the disk of VM has the file cache in the host layer,
>so normal swap speed is higher than with zswap.

I don't understand what you mean, that normal swap speed is higher
than with zswap.  Anyway, if that's true for your use case, then just
disable zswap, don't try to make zswap disable itself.

>
> 2. Because zswap need allocates memory to store the compressed pages,
>it will make memory capacity worse.

well of course, this is the tradeoff with zswap; it's faster than real
swap (or at least it's supposed to be), but doesn't provide as much
memory pressure relief as real swap.

> For example:
> Command "./usemem -a -n 1 $((7 * 1024 * 1024 * 1024))" request 7G memory
> from this machine.
> It will work OK without zswap but got OOM when zswap is opened.
>
> This commit adds CONFIG_ZSWAP_IO_SWITCH that try to handle the issues
> and let zswap keep save IO.
> It add two parameters read_in_flight_limit and write_in_flight_limit to
> zswap.
> In zswap_frontswap_store, pages will be stored to zswap only when
> the IO in flight number of swap device is bigger than
> zswap_read_in_flight_limit or zswap_write_in_flight_limit
> when zswap is enabled.
> Then the zswap just work when the IO in flight number of swap device
> is low.
>
> This is the test result:
> echo 100 > /proc/sys/vm/swappiness
> swapon /swapfile
> echo lz4 > /sys/module/zswap/parameters/compressor
> echo zsmalloc > /sys/module/zswap/parameters/zpool
> echo 0 > /sys/module/zswap/parameters/same_filled_pages_enabled
> echo 20 > /sys/module/zswap/parameters/max_pool_percent
> echo 1 > /sys/module/zswap/parameters/enabled
> echo 3 > /sys/module/zswap/parameters/read_in_flight_limit
> echo 50 > /sys/module/zswap/parameters/write_in_flight_limit
> sh 1.sh
> ...
> 1207959552 bytes / 2320861 usecs = 508280 KB/s
> 106164 usecs to free memory
> 1207959552 bytes / 2343916 usecs = 503280 KB/s
> 79386 usecs to free memory
> 2415919104 bytes / 20136015 usecs = 117167 KB/s
> 4411 usecs to free memory
> 1207959552 bytes / 1833403 usecs = 643419 KB/s
> 70452 usecs to free memory
> ...
> killall usemem
> ./usemem -a -n 1 $((7 * 1024 * 1024 * 1024))
> 8455716864 bytes / 14457505 usecs = 571159 KB/s
> 365961 usecs to free memory
>
> Signed-off-by: Hui Zhu 

Unless you can significantly clarify why this is needed, i'm a nak.

Nacked-by: Dan Streetman 


> ---
>  include/linux/swap.h |  3 +++
>  mm/Kconfig   | 11 +++
>  mm/page_io.c | 16 +++
>  mm/zswap.c   | 55 
> 
>  4 files changed, 85 insertions(+)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index de2c67a..82b621f 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -389,6 +389,9 @@ extern void end_swap_bio_write(struct bio *bio);
>  extern int __swap_writepage(struct page *page, struct writeback_control *wbc,
> bio_end

Re: Follow up on hid2hci: Fix udev rules for linux-4.14+

2019-08-28 Thread Dan Streetman
On Wed, Aug 28, 2019 at 1:59 PM Ville Syrjälä
 wrote:
>
> On Wed, Aug 28, 2019 at 08:50:51PM +0300, Ville Syrjälä wrote:
> > On Wed, Aug 28, 2019 at 01:34:07PM -0400, Dan Streetman wrote:
> > > It looks like this patch got lost at some point:
> > > https://lore.kernel.org/patchwork/patch/902126/#1138115
> > >
> > > but it seems to still be a problem and I'd like to pull it into Ubuntu:
> > > https://bugs.launchpad.net/ubuntu/+source/bluez/+bug/1759836
> > >
> > > Ville, did you ever follow up with a v2 for that patch and/or do you
> > > know if it will be accepted soon?
> >
> > There's a more recent version of that somewhere on the mailing list.
> > The problem is getting someone to actually apply it. Seems much harder
> > than it should be...
>
> https://lore.kernel.org/patchwork/patch/1021109/

I added to this reply a few of the most recent commit authors to the
bluez tools/ subdir...can any of you review and/or apply Ville's
patch?

Marcel, you appear to have created the hid2hci.rules file back in
2012, can you comment on the patch?

>
> >
> > And IIRC I also posted a few other fixes for hid2hci tool which didn't
> > get any response from the crowd.
>
> https://www.spinics.net/lists/linux-bluetooth/msg79803.html
>
> --
> Ville Syrjälä
> Intel


Follow up on hid2hci: Fix udev rules for linux-4.14+

2019-08-28 Thread Dan Streetman
It looks like this patch got lost at some point:
https://lore.kernel.org/patchwork/patch/902126/#1138115

but it seems to still be a problem and I'd like to pull it into Ubuntu:
https://bugs.launchpad.net/ubuntu/+source/bluez/+bug/1759836

Ville, did you ever follow up with a v2 for that patch and/or do you
know if it will be accepted soon?


Re: [PATCH] zswap: ignore debugfs_create_dir() return value

2019-01-31 Thread Dan Streetman
On Tue, Jan 29, 2019 at 3:33 PM Greg Kroah-Hartman
 wrote:
>
> On Tue, Jan 29, 2019 at 02:46:30PM -0500, Dan Streetman wrote:
> > On Tue, Jan 22, 2019 at 10:23 AM Greg Kroah-Hartman
> >  wrote:
> > >
> > > When calling debugfs functions, there is no need to ever check the
> > > return value.  The function can work or not, but the code logic should
> > > never do something different based on this.
> > >
> > > Cc: Seth Jennings 
> > > Cc: Dan Streetman 
> > > Cc: linux...@kvack.org
> > > Signed-off-by: Greg Kroah-Hartman 
> > > ---
> > >  mm/zswap.c | 2 --
> > >  1 file changed, 2 deletions(-)
> > >
> > > diff --git a/mm/zswap.c b/mm/zswap.c
> > > index a4e4d36ec085..f583d08f6e24 100644
> > > --- a/mm/zswap.c
> > > +++ b/mm/zswap.c
> > > @@ -1262,8 +1262,6 @@ static int __init zswap_debugfs_init(void)
> > > return -ENODEV;
> > >
> > > zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
> > > -   if (!zswap_debugfs_root)
> > > -   return -ENOMEM;
> > >
> > > debugfs_create_u64("pool_limit_hit", 0444,
> > >zswap_debugfs_root, &zswap_pool_limit_hit);
> >
> > wait, so if i'm reading the code right, in the case where
> > debugfs_create_dir() returns NULL, that will then be passed along to
> > debugfs_create_u64() as its parent directory - and the debugfs nodes
> > will then get created in the root debugfs directory.  That's not what
> > we want to happen...
>
> True, but that is such a rare thing to ever happen (hint, you have to be
> out of memory), that it's not really a bad thing.  But, you are not the
> first to mention this, which is why this patch is on its way to Linus
> for 5.0-final:
> https://lore.kernel.org/lkml/20190123102814.gb17...@kroah.com/

Ah!  Great, in that case then definitely

Acked-by: Dan Streetman 

>
> thanks,
>
> greg k-h


Re: [PATCH] zswap: ignore debugfs_create_dir() return value

2019-01-29 Thread Dan Streetman
On Tue, Jan 22, 2019 at 10:23 AM Greg Kroah-Hartman
 wrote:
>
> When calling debugfs functions, there is no need to ever check the
> return value.  The function can work or not, but the code logic should
> never do something different based on this.
>
> Cc: Seth Jennings 
> Cc: Dan Streetman 
> Cc: linux...@kvack.org
> Signed-off-by: Greg Kroah-Hartman 
> ---
>  mm/zswap.c | 2 --
>  1 file changed, 2 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index a4e4d36ec085..f583d08f6e24 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1262,8 +1262,6 @@ static int __init zswap_debugfs_init(void)
> return -ENODEV;
>
> zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
> -   if (!zswap_debugfs_root)
> -   return -ENOMEM;
>
> debugfs_create_u64("pool_limit_hit", 0444,
>zswap_debugfs_root, &zswap_pool_limit_hit);

wait, so if i'm reading the code right, in the case where
debugfs_create_dir() returns NULL, that will then be passed along to
debugfs_create_u64() as its parent directory - and the debugfs nodes
will then get created in the root debugfs directory.  That's not what
we want to happen...

> --
> 2.20.1
>


Re: [PATCH v2] mm: fix z3fold warnings on CONFIG_SMP=n

2018-09-28 Thread Dan Streetman
On Thu, Sep 27, 2018 at 5:15 PM Alex Xu (Hello71)  wrote:
>
> Spinlocks are always lockable on UP systems, even if they were just
> locked.
>
> Cc: Dan Streetman 

I cc'ed Vitaly also, as this code is from him, but the change
certainly looks correct to me.

Acked-by: Dan Streetman 

> Signed-off-by: Alex Xu (Hello71) 
> ---
>  mm/z3fold.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/mm/z3fold.c b/mm/z3fold.c
> index 4b366d181..2e8d268ac 100644
> --- a/mm/z3fold.c
> +++ b/mm/z3fold.c
> @@ -277,7 +277,7 @@ static void release_z3fold_page_locked(struct kref *ref)
>  {
> struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
> refcount);
> -   WARN_ON(z3fold_page_trylock(zhdr));
> +   WARN_ON_SMP(z3fold_page_trylock(zhdr));
> __release_z3fold_page(zhdr, true);
>  }
>
> @@ -289,7 +289,7 @@ static void release_z3fold_page_locked_list(struct kref 
> *ref)
> list_del_init(&zhdr->buddy);
> spin_unlock(&zhdr->pool->lock);
>
> -   WARN_ON(z3fold_page_trylock(zhdr));
> +   WARN_ON_SMP(z3fold_page_trylock(zhdr));
> __release_z3fold_page(zhdr, true);
>  }
>
> @@ -403,7 +403,7 @@ static void do_compact_page(struct z3fold_header *zhdr, 
> bool locked)
>
> page = virt_to_page(zhdr);
> if (locked)
> -   WARN_ON(z3fold_page_trylock(zhdr));
> +   WARN_ON_SMP(z3fold_page_trylock(zhdr));
> else
> z3fold_page_lock(zhdr);
> if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) {
> --
> 2.19.0
>


Re: [PATCH v2] mm: fix z3fold warnings on CONFIG_SMP=n

2018-09-28 Thread Dan Streetman
On Thu, Sep 27, 2018 at 5:15 PM Alex Xu (Hello71)  wrote:
>
> Spinlocks are always lockable on UP systems, even if they were just
> locked.
>
> Cc: Dan Streetman 

I cc'ed Vitaly also, as this code is from him, but the change
certainly looks correct to me.

Acked-by: Dan Streetman 

> Signed-off-by: Alex Xu (Hello71) 
> ---
>  mm/z3fold.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/mm/z3fold.c b/mm/z3fold.c
> index 4b366d181..2e8d268ac 100644
> --- a/mm/z3fold.c
> +++ b/mm/z3fold.c
> @@ -277,7 +277,7 @@ static void release_z3fold_page_locked(struct kref *ref)
>  {
> struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
> refcount);
> -   WARN_ON(z3fold_page_trylock(zhdr));
> +   WARN_ON_SMP(z3fold_page_trylock(zhdr));
> __release_z3fold_page(zhdr, true);
>  }
>
> @@ -289,7 +289,7 @@ static void release_z3fold_page_locked_list(struct kref 
> *ref)
> list_del_init(&zhdr->buddy);
> spin_unlock(&zhdr->pool->lock);
>
> -   WARN_ON(z3fold_page_trylock(zhdr));
> +   WARN_ON_SMP(z3fold_page_trylock(zhdr));
> __release_z3fold_page(zhdr, true);
>  }
>
> @@ -403,7 +403,7 @@ static void do_compact_page(struct z3fold_header *zhdr, 
> bool locked)
>
> page = virt_to_page(zhdr);
> if (locked)
> -   WARN_ON(z3fold_page_trylock(zhdr));
> +   WARN_ON_SMP(z3fold_page_trylock(zhdr));
> else
> z3fold_page_lock(zhdr);
> if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) {
> --
> 2.19.0
>


Re: [PATCH] mm: fix z3fold warnings on CONFIG_SMP=n

2018-09-27 Thread Dan Streetman
On Thu, Sep 27, 2018 at 4:27 PM Alex Xu (Hello71)  wrote:
>
> Spinlocks are always lockable on UP systems, even if they were just
> locked.

i think it would be much better to just use either
assert_spin_locked() or just spin_is_locked(), instead of an #ifdef.

>
> Cc: Dan Streetman 
> Signed-off-by: Alex Xu (Hello71) 
> ---
>  mm/z3fold.c | 13 ++---
>  1 file changed, 10 insertions(+), 3 deletions(-)
>
> diff --git a/mm/z3fold.c b/mm/z3fold.c
> index 4b366d181..4e6ad2de4 100644
> --- a/mm/z3fold.c
> +++ b/mm/z3fold.c
> @@ -202,6 +202,13 @@ static inline void z3fold_page_lock(struct z3fold_header 
> *zhdr)
> spin_lock(&zhdr->page_lock);
>  }
>
> +static inline void z3fold_page_ensure_locked(struct z3fold_header *zhdr)
> +{
> +#ifdef CONFIG_SMP
> +   WARN_ON(z3fold_page_trylock(zhdr));
> +#endif
> +}
> +
>  /* Try to lock a z3fold page */
>  static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
>  {
> @@ -277,7 +284,7 @@ static void release_z3fold_page_locked(struct kref *ref)
>  {
> struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
> refcount);
> -   WARN_ON(z3fold_page_trylock(zhdr));
> +   z3fold_page_ensure_locked(zhdr);
> __release_z3fold_page(zhdr, true);
>  }
>
> @@ -289,7 +296,7 @@ static void release_z3fold_page_locked_list(struct kref 
> *ref)
> list_del_init(&zhdr->buddy);
> spin_unlock(&zhdr->pool->lock);
>
> -   WARN_ON(z3fold_page_trylock(zhdr));
> +   z3fold_page_ensure_locked(zhdr);
> __release_z3fold_page(zhdr, true);
>  }
>
> @@ -403,7 +410,7 @@ static void do_compact_page(struct z3fold_header *zhdr, 
> bool locked)
>
> page = virt_to_page(zhdr);
> if (locked)
> -   WARN_ON(z3fold_page_trylock(zhdr));
> +   z3fold_page_ensure_locked(zhdr);
> else
> z3fold_page_lock(zhdr);
> if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) {
> --
> 2.19.0
>


Re: [PATCH] mm: fix z3fold warnings on CONFIG_SMP=n

2018-09-27 Thread Dan Streetman
On Thu, Sep 27, 2018 at 4:27 PM Alex Xu (Hello71)  wrote:
>
> Spinlocks are always lockable on UP systems, even if they were just
> locked.

i think it would be much better to just use either
assert_spin_locked() or just spin_is_locked(), instead of an #ifdef.

>
> Cc: Dan Streetman 
> Signed-off-by: Alex Xu (Hello71) 
> ---
>  mm/z3fold.c | 13 ++---
>  1 file changed, 10 insertions(+), 3 deletions(-)
>
> diff --git a/mm/z3fold.c b/mm/z3fold.c
> index 4b366d181..4e6ad2de4 100644
> --- a/mm/z3fold.c
> +++ b/mm/z3fold.c
> @@ -202,6 +202,13 @@ static inline void z3fold_page_lock(struct z3fold_header 
> *zhdr)
> spin_lock(&zhdr->page_lock);
>  }
>
> +static inline void z3fold_page_ensure_locked(struct z3fold_header *zhdr)
> +{
> +#ifdef CONFIG_SMP
> +   WARN_ON(z3fold_page_trylock(zhdr));
> +#endif
> +}
> +
>  /* Try to lock a z3fold page */
>  static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
>  {
> @@ -277,7 +284,7 @@ static void release_z3fold_page_locked(struct kref *ref)
>  {
> struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
> refcount);
> -   WARN_ON(z3fold_page_trylock(zhdr));
> +   z3fold_page_ensure_locked(zhdr);
> __release_z3fold_page(zhdr, true);
>  }
>
> @@ -289,7 +296,7 @@ static void release_z3fold_page_locked_list(struct kref 
> *ref)
> list_del_init(&zhdr->buddy);
> spin_unlock(&zhdr->pool->lock);
>
> -   WARN_ON(z3fold_page_trylock(zhdr));
> +   z3fold_page_ensure_locked(zhdr);
> __release_z3fold_page(zhdr, true);
>  }
>
> @@ -403,7 +410,7 @@ static void do_compact_page(struct z3fold_header *zhdr, 
> bool locked)
>
> page = virt_to_page(zhdr);
> if (locked)
> -   WARN_ON(z3fold_page_trylock(zhdr));
> +   z3fold_page_ensure_locked(zhdr);
> else
> z3fold_page_lock(zhdr);
> if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) {
> --
> 2.19.0
>


Re: [PATCH v2] zswap: re-check zswap_is_full after do zswap_shrink

2018-07-25 Thread Dan Streetman
On Mon, Jun 25, 2018 at 4:08 AM Li Wang  wrote:
>
> On 30 May 2018 at 20:53, Dan Streetman  wrote:
> > On Wed, May 30, 2018 at 6:39 AM, Li Wang  wrote:
> >> The '/sys/../zswap/stored_pages:' keep raising in zswap test with
> >> "zswap.max_pool_percent=0" parameter. But theoretically, it should
> >> not compress or store pages any more since there is no space in
> >> compressed pool.
> >>
> >> Reproduce steps:
> >>   1. Boot kernel with "zswap.enabled=1"
> >>   2. Set the max_pool_percent to 0
> >>   # echo 0 > /sys/module/zswap/parameters/max_pool_percent
> >>   3. Do memory stress test to see if some pages have been compressed
> >>   # stress --vm 1 --vm-bytes $mem_available"M" --timeout 60s
> >>   4. Watching the 'stored_pages' number increasing or not
> >>
> >> The root cause is:
> >>   When zswap_max_pool_percent is setting to 0 via kernel parameter, the
> >>   zswap_is_full() will always return true to do zswap_shrink(). But if
> >>   the shrinking is able to reclaim a page successfully, then proceeds to
> >>   compress/store another page, so the value of stored_pages will keep
> >>   changing.
> >>
> >> To solve the issue, this patch adds zswap_is_full() check again after
> >> zswap_shrink() to make sure it's now under the max_pool_percent, and
> >> not to compress/store if it reaches its limitation.
> >>
> >> Signed-off-by: Li Wang 
> >
> > Acked-by: Dan Streetman 
>
> ping~
>
> Any possible to merge this in kernel-4.18-rcX? My zswap test always
> fails on the upstream kernel.

cc'ing Andrew as he may have missed this.

>
>
> --
> Regards,
> Li Wang
> Email: wangli.a...@gmail.com


Re: [PATCH v2] zswap: re-check zswap_is_full after do zswap_shrink

2018-07-25 Thread Dan Streetman
On Mon, Jun 25, 2018 at 4:08 AM Li Wang  wrote:
>
> On 30 May 2018 at 20:53, Dan Streetman  wrote:
> > On Wed, May 30, 2018 at 6:39 AM, Li Wang  wrote:
> >> The '/sys/../zswap/stored_pages:' keep raising in zswap test with
> >> "zswap.max_pool_percent=0" parameter. But theoretically, it should
> >> not compress or store pages any more since there is no space in
> >> compressed pool.
> >>
> >> Reproduce steps:
> >>   1. Boot kernel with "zswap.enabled=1"
> >>   2. Set the max_pool_percent to 0
> >>   # echo 0 > /sys/module/zswap/parameters/max_pool_percent
> >>   3. Do memory stress test to see if some pages have been compressed
> >>   # stress --vm 1 --vm-bytes $mem_available"M" --timeout 60s
> >>   4. Watching the 'stored_pages' number increasing or not
> >>
> >> The root cause is:
> >>   When zswap_max_pool_percent is setting to 0 via kernel parameter, the
> >>   zswap_is_full() will always return true to do zswap_shrink(). But if
> >>   the shrinking is able to reclaim a page successfully, then proceeds to
> >>   compress/store another page, so the value of stored_pages will keep
> >>   changing.
> >>
> >> To solve the issue, this patch adds zswap_is_full() check again after
> >> zswap_shrink() to make sure it's now under the max_pool_percent, and
> >> not to compress/store if it reaches its limitation.
> >>
> >> Signed-off-by: Li Wang 
> >
> > Acked-by: Dan Streetman 
>
> ping~
>
> Any possible to merge this in kernel-4.18-rcX? My zswap test always
> fails on the upstream kernel.

cc'ing Andrew as he may have missed this.

>
>
> --
> Regards,
> Li Wang
> Email: wangli.a...@gmail.com


Re: [PATCH v2] zswap: re-check zswap_is_full after do zswap_shrink

2018-05-30 Thread Dan Streetman
On Wed, May 30, 2018 at 6:39 AM, Li Wang  wrote:
> The '/sys/../zswap/stored_pages:' keep raising in zswap test with
> "zswap.max_pool_percent=0" parameter. But theoretically, it should
> not compress or store pages any more since there is no space in
> compressed pool.
>
> Reproduce steps:
>   1. Boot kernel with "zswap.enabled=1"
>   2. Set the max_pool_percent to 0
>   # echo 0 > /sys/module/zswap/parameters/max_pool_percent
>   3. Do memory stress test to see if some pages have been compressed
>   # stress --vm 1 --vm-bytes $mem_available"M" --timeout 60s
>   4. Watching the 'stored_pages' number increasing or not
>
> The root cause is:
>   When zswap_max_pool_percent is setting to 0 via kernel parameter, the
>   zswap_is_full() will always return true to do zswap_shrink(). But if
>   the shrinking is able to reclaim a page successfully, then proceeds to
>   compress/store another page, so the value of stored_pages will keep
>   changing.
>
> To solve the issue, this patch adds zswap_is_full() check again after
> zswap_shrink() to make sure it's now under the max_pool_percent, and
> not to compress/store if it reaches its limitation.
>
> Signed-off-by: Li Wang 

Acked-by: Dan Streetman 

> Cc: Seth Jennings 
> Cc: Dan Streetman 
> Cc: Huang Ying 
> Cc: Yu Zhao 
> ---
>  mm/zswap.c | 9 +
>  1 file changed, 9 insertions(+)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 61a5c41..fd320c3 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1026,6 +1026,15 @@ static int zswap_frontswap_store(unsigned type, 
> pgoff_t offset,
> ret = -ENOMEM;
> goto reject;
> }
> +
> +   /* A second zswap_is_full() check after
> +* zswap_shrink() to make sure it's now
> +* under the max_pool_percent
> +*/
> +   if (zswap_is_full()) {
> +   ret = -ENOMEM;
> +   goto reject;
> +   }
> }
>
> /* allocate entry */
> --
> 2.9.5
>


Re: [PATCH v2] zswap: re-check zswap_is_full after do zswap_shrink

2018-05-30 Thread Dan Streetman
On Wed, May 30, 2018 at 6:39 AM, Li Wang  wrote:
> The '/sys/../zswap/stored_pages:' keeps rising in zswap test with
> "zswap.max_pool_percent=0" parameter. But theoretically, it should
> not compress or store pages any more since there is no space in
> compressed pool.
>
> Reproduce steps:
>   1. Boot kernel with "zswap.enabled=1"
>   2. Set the max_pool_percent to 0
>   # echo 0 > /sys/module/zswap/parameters/max_pool_percent
>   3. Do memory stress test to see if some pages have been compressed
>   # stress --vm 1 --vm-bytes $mem_available"M" --timeout 60s
>   4. Watching the 'stored_pages' number increasing or not
>
> The root cause is:
>   When zswap_max_pool_percent is setting to 0 via kernel parameter, the
>   zswap_is_full() will always return true to do zswap_shrink(). But if
>   the shrinking is able to reclaim a page successfully, then proceeds to
>   compress/store another page, so the value of stored_pages will keep
>   changing.
>
> To solve the issue, this patch adds zswap_is_full() check again after
> zswap_shrink() to make sure it's now under the max_pool_percent, and
> not to compress/store if it reaches its limitation.
>
> Signed-off-by: Li Wang 

Acked-by: Dan Streetman 

> Cc: Seth Jennings 
> Cc: Dan Streetman 
> Cc: Huang Ying 
> Cc: Yu Zhao 
> ---
>  mm/zswap.c | 9 +
>  1 file changed, 9 insertions(+)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 61a5c41..fd320c3 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1026,6 +1026,15 @@ static int zswap_frontswap_store(unsigned type, 
> pgoff_t offset,
> ret = -ENOMEM;
> goto reject;
> }
> +
> +   /* A second zswap_is_full() check after
> +* zswap_shrink() to make sure it's now
> +* under the max_pool_percent
> +*/
> +   if (zswap_is_full()) {
> +   ret = -ENOMEM;
> +   goto reject;
> +   }
> }
>
> /* allocate entry */
> --
> 2.9.5
>


Re: [PATCH RFC] zswap: reject to compress/store page if zswap_max_pool_percent is 0

2018-05-30 Thread Dan Streetman
On Tue, May 29, 2018 at 10:57 PM, Li Wang  wrote:
> Hi Dan,
>
> On Wed, May 30, 2018 at 5:14 AM, Dan Streetman  wrote:
>>
>> On Thu, May 24, 2018 at 5:57 AM, Li Wang  wrote:
> >> > The '/sys/../zswap/stored_pages:' keeps rising in zswap test with
>> > "zswap.max_pool_percent=0" parameter. But theoretically, it should
>> > not compress or store pages any more since there is no space for
>> > compressed pool.
>> >
>> > Reproduce steps:
>> >
>> >   1. Boot kernel with "zswap.enabled=1 zswap.max_pool_percent=17"
>> >   2. Set the max_pool_percent to 0
>> >   # echo 0 > /sys/module/zswap/parameters/max_pool_percent
>> >  Confirm this parameter works fine
>> >   # cat /sys/kernel/debug/zswap/pool_total_size
>> >   0
>> >   3. Do memory stress test to see if some pages have been compressed
>> >   # stress --vm 1 --vm-bytes $mem_available"M" --timeout 60s
>> >  Watching the 'stored_pages' numbers increasing or not
>> >
>> > The root cause is:
>> >
>> >   When the zswap_max_pool_percent is set to 0 via kernel parameter, the
>> > zswap_is_full()
>> >   will always return true to shrink the pool size by zswap_shrink(). If
>> > the pool size
>> >   has been shrinked a little success, zswap will do compress/store pages
>> > again. Then we
>> >   get fails on that as above.
>>
>> special casing 0% doesn't make a lot of sense to me, and I'm not
>> entirely sure what exactly you are trying to fix here.
>
>
> Sorry for the confusion, I am pretty new to zswap.
>
> To specify 0 to max_pool_percent is purpose to verify if zswap stopping work
> when there is no space in compressed pool.
>
> Another consideration from me is:
>
> [Method A]
>
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1021,7 +1021,7 @@ static int zswap_frontswap_store(unsigned type,
> pgoff_t offset,
> /* reclaim space if needed */
> if (zswap_is_full()) {
> zswap_pool_limit_hit++;
> -   if (zswap_shrink()) {
> +   if (!zswap_max_pool_percent || zswap_shrink()) {
> zswap_reject_reclaim_fail++;
> ret = -ENOMEM;
> goto reject;
>
> This makes sure the compressed pool is large enough to do zswap_shrink().
>
>
>>
>>
>> however, zswap does currently do a zswap_is_full() check, and then if
>> it's able to reclaim a page happily proceeds to store another page,
>> without re-checking zswap_is_full().  If you're trying to fix that,
>> then I would ack a patch that adds a second zswap_is_full() check
>> after zswap_shrink() to make sure it's now under the max_pool_percent
>> (or somehow otherwise fixes that behavior).
>>
>
> Ok, it sounds like it can also fix the issue. The changes may look like:
>
> [Method B]
>
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1026,6 +1026,15 @@ static int zswap_frontswap_store(unsigned type,
> pgoff_t offset,
> ret = -ENOMEM;
> goto reject;
> }
> +
> +   /* A second zswap_is_full() check after
> +* zswap_shrink() to make sure it's now
> +* under the max_pool_percent
> +*/
> +   if (zswap_is_full()) {
> +   ret = -ENOMEM;
> +   goto reject;
> +   }
> }
>
>
> So, which one do you think is better, A or B?

this is better.

>
> --
> Regards,
> Li Wang


Re: [PATCH RFC] zswap: reject to compress/store page if zswap_max_pool_percent is 0

2018-05-30 Thread Dan Streetman
On Tue, May 29, 2018 at 10:57 PM, Li Wang  wrote:
> Hi Dan,
>
> On Wed, May 30, 2018 at 5:14 AM, Dan Streetman  wrote:
>>
>> On Thu, May 24, 2018 at 5:57 AM, Li Wang  wrote:
> >> > The '/sys/../zswap/stored_pages:' keeps rising in zswap test with
>> > "zswap.max_pool_percent=0" parameter. But theoretically, it should
>> > not compress or store pages any more since there is no space for
>> > compressed pool.
>> >
>> > Reproduce steps:
>> >
>> >   1. Boot kernel with "zswap.enabled=1 zswap.max_pool_percent=17"
>> >   2. Set the max_pool_percent to 0
>> >   # echo 0 > /sys/module/zswap/parameters/max_pool_percent
>> >  Confirm this parameter works fine
>> >   # cat /sys/kernel/debug/zswap/pool_total_size
>> >   0
>> >   3. Do memory stress test to see if some pages have been compressed
>> >   # stress --vm 1 --vm-bytes $mem_available"M" --timeout 60s
>> >  Watching the 'stored_pages' numbers increasing or not
>> >
>> > The root cause is:
>> >
>> >   When the zswap_max_pool_percent is set to 0 via kernel parameter, the
>> > zswap_is_full()
>> >   will always return true to shrink the pool size by zswap_shrink(). If
>> > the pool size
> >> >   has been shrunk a little successfully, zswap will do compress/store pages
>> > again. Then we
>> >   get fails on that as above.
>>
>> special casing 0% doesn't make a lot of sense to me, and I'm not
>> entirely sure what exactly you are trying to fix here.
>
>
> Sorry for the confusion, I am pretty new to zswap.
>
> To specify 0 to max_pool_percent is purpose to verify if zswap stopping work
> when there is no space in compressed pool.
>
> Another consideration from me is:
>
> [Method A]
>
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1021,7 +1021,7 @@ static int zswap_frontswap_store(unsigned type,
> pgoff_t offset,
> /* reclaim space if needed */
> if (zswap_is_full()) {
> zswap_pool_limit_hit++;
> -   if (zswap_shrink()) {
> +   if (!zswap_max_pool_percent || zswap_shrink()) {
> zswap_reject_reclaim_fail++;
> ret = -ENOMEM;
> goto reject;
>
> This makes sure the compressed pool is large enough to do zswap_shrink().
>
>
>>
>>
>> however, zswap does currently do a zswap_is_full() check, and then if
>> it's able to reclaim a page happily proceeds to store another page,
>> without re-checking zswap_is_full().  If you're trying to fix that,
>> then I would ack a patch that adds a second zswap_is_full() check
>> after zswap_shrink() to make sure it's now under the max_pool_percent
>> (or somehow otherwise fixes that behavior).
>>
>
> Ok, it sounds like it can also fix the issue. The changes may look like:
>
> [Method B]
>
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1026,6 +1026,15 @@ static int zswap_frontswap_store(unsigned type,
> pgoff_t offset,
> ret = -ENOMEM;
> goto reject;
> }
> +
> +   /* A second zswap_is_full() check after
> +* zswap_shrink() to make sure it's now
> +* under the max_pool_percent
> +*/
> +   if (zswap_is_full()) {
> +   ret = -ENOMEM;
> +   goto reject;
> +   }
> }
>
>
> So, which one do you think is better, A or B?

this is better.

>
> --
> Regards,
> Li Wang


Re: [PATCH RFC] zswap: reject to compress/store page if zswap_max_pool_percent is 0

2018-05-29 Thread Dan Streetman
On Thu, May 24, 2018 at 5:57 AM, Li Wang  wrote:
> The '/sys/../zswap/stored_pages:' keeps rising in zswap test with
> "zswap.max_pool_percent=0" parameter. But theoretically, it should
> not compress or store pages any more since there is no space for
> compressed pool.
>
> Reproduce steps:
>
>   1. Boot kernel with "zswap.enabled=1 zswap.max_pool_percent=17"
>   2. Set the max_pool_percent to 0
>   # echo 0 > /sys/module/zswap/parameters/max_pool_percent
>  Confirm this parameter works fine
>   # cat /sys/kernel/debug/zswap/pool_total_size
>   0
>   3. Do memory stress test to see if some pages have been compressed
>   # stress --vm 1 --vm-bytes $mem_available"M" --timeout 60s
>  Watching the 'stored_pages' numbers increasing or not
>
> The root cause is:
>
>   When the zswap_max_pool_percent is set to 0 via kernel parameter, the 
> zswap_is_full()
>   will always return true to shrink the pool size by zswap_shrink(). If the 
> pool size
>   has been shrunk a little successfully, zswap will do compress/store pages 
> again. Then we
>   get fails on that as above.

special casing 0% doesn't make a lot of sense to me, and I'm not
entirely sure what exactly you are trying to fix here.

however, zswap does currently do a zswap_is_full() check, and then if
it's able to reclaim a page happily proceeds to store another page,
without re-checking zswap_is_full().  If you're trying to fix that,
then I would ack a patch that adds a second zswap_is_full() check
after zswap_shrink() to make sure it's now under the max_pool_percent
(or somehow otherwise fixes that behavior).

>
> Signed-off-by: Li Wang 
> Cc: Seth Jennings 
> Cc: Dan Streetman 
> Cc: Huang Ying 
> Cc: Yu Zhao 
> ---
>  mm/zswap.c | 5 +
>  1 file changed, 5 insertions(+)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 61a5c41..2b537bb 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1007,6 +1007,11 @@ static int zswap_frontswap_store(unsigned type, 
> pgoff_t offset,
> u8 *src, *dst;
> struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
>
> +   if (!zswap_max_pool_percent) {
> +   ret = -ENOMEM;
> +   goto reject;
> +   }
> +
> /* THP isn't supported */
> if (PageTransHuge(page)) {
> ret = -EINVAL;
> --
> 2.9.5
>


Re: [PATCH RFC] zswap: reject to compress/store page if zswap_max_pool_percent is 0

2018-05-29 Thread Dan Streetman
On Thu, May 24, 2018 at 5:57 AM, Li Wang  wrote:
> The '/sys/../zswap/stored_pages:' keeps rising in zswap test with
> "zswap.max_pool_percent=0" parameter. But theoretically, it should
> not compress or store pages any more since there is no space for
> compressed pool.
>
> Reproduce steps:
>
>   1. Boot kernel with "zswap.enabled=1 zswap.max_pool_percent=17"
>   2. Set the max_pool_percent to 0
>   # echo 0 > /sys/module/zswap/parameters/max_pool_percent
>  Confirm this parameter works fine
>   # cat /sys/kernel/debug/zswap/pool_total_size
>   0
>   3. Do memory stress test to see if some pages have been compressed
>   # stress --vm 1 --vm-bytes $mem_available"M" --timeout 60s
>  Watching the 'stored_pages' numbers increasing or not
>
> The root cause is:
>
>   When the zswap_max_pool_percent is set to 0 via kernel parameter, the 
> zswap_is_full()
>   will always return true to shrink the pool size by zswap_shrink(). If the 
> pool size
>   has been shrinked a little success, zswap will do compress/store pages 
> again. Then we
>   get fails on that as above.

special casing 0% doesn't make a lot of sense to me, and I'm not
entirely sure what exactly you are trying to fix here.

however, zswap does currently do a zswap_is_full() check, and then if
it's able to reclaim a page happily proceeds to store another page,
without re-checking zswap_is_full().  If you're trying to fix that,
then I would ack a patch that adds a second zswap_is_full() check
after zswap_shrink() to make sure it's now under the max_pool_percent
(or somehow otherwise fixes that behavior).

>
> Signed-off-by: Li Wang 
> Cc: Seth Jennings 
> Cc: Dan Streetman 
> Cc: Huang Ying 
> Cc: Yu Zhao 
> ---
>  mm/zswap.c | 5 +
>  1 file changed, 5 insertions(+)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 61a5c41..2b537bb 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1007,6 +1007,11 @@ static int zswap_frontswap_store(unsigned type, 
> pgoff_t offset,
> u8 *src, *dst;
> struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
>
> +   if (!zswap_max_pool_percent) {
> +   ret = -ENOMEM;
> +   goto reject;
> +   }
> +
> /* THP isn't supported */
> if (PageTransHuge(page)) {
> ret = -EINVAL;
> --
> 2.9.5
>


Re: net: hang in unregister_netdevice: waiting for lo to become free

2018-05-11 Thread Dan Streetman
On Fri, May 11, 2018 at 5:19 AM, Dmitry Vyukov <dvyu...@google.com> wrote:
> On Thu, May 10, 2018 at 12:23 PM, Dan Streetman <ddstr...@ieee.org> wrote:
>>>>>>>> <tommi.t.rant...@nokia.com> wrote:
>>>>>>>>> On 20.02.2018 18:26, Neil Horman wrote:
>>>>>>>>>>
>>>>>>>>>> On Tue, Feb 20, 2018 at 09:14:41AM +0100, Dmitry Vyukov wrote:
>>>>>>>>>>>
>>>>>>>>>>> On Tue, Feb 20, 2018 at 8:56 AM, Tommi Rantala
>>>>>>>>>>> <tommi.t.rant...@nokia.com> wrote:
>>>>>>>>>>>>
>>>>>>>>>>>> On 19.02.2018 20:59, Dmitry Vyukov wrote:
>>>>>>>>>>>>>
>>>>>>>>>>>>> Is this meant to be fixed already? I am still seeing this on the
>>>>>>>>>>>>> latest upstream tree.
>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> These two commits are in v4.16-rc1:
>>>>>>>>>>>>
>>>>>>>>>>>> commit 4a31a6b19f9ddf498c81f5c9b089742b7472a6f8
>>>>>>>>>>>> Author: Tommi Rantala <tommi.t.rant...@nokia.com>
>>>>>>>>>>>> Date:   Mon Feb 5 21:48:14 2018 +0200
>>>>>>>>>>>>
>>>>>>>>>>>>  sctp: fix dst refcnt leak in sctp_v4_get_dst
>>>>>>>>>>>> ...
>>>>>>>>>>>>  Fixes: 410f03831 ("sctp: add routing output fallback")
>>>>>>>>>>>>  Fixes: 0ca50d12f ("sctp: fix src address selection if using
>>>>>>>>>>>> secondary
>>>>>>>>>>>> addresses")
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> commit 957d761cf91cdbb175ad7d8f5472336a4d54dbf2
>>>>>>>>>>>> Author: Alexey Kodanev <alexey.koda...@oracle.com>
>>>>>>>>>>>> Date:   Mon Feb 5 15:10:35 2018 +0300
>>>>>>>>>>>>
>>>>>>>>>>>>  sctp: fix dst refcnt leak in sctp_v6_get_dst()
>>>>>>>>>>>> ...
>>>>>>>>>>>>  Fixes: dbc2b5e9a09e ("sctp: fix src address selection if using
>>>>>>>>>>>> secondary
>>>>>>>>>>>> addresses for ipv6")
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> I guess we missed something if it's still reproducible.
>>>>>>>>>>>>
>>>>>>>>>>>> I can check it later this week, unless someone else beat me to it.
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Hi Tommi,
>>>>>>>>>>>
>>>>>>>>>>> Hmmm, I can't claim that it's exactly the same bug. Perhaps it's
>>>>>>>>>>> another one then. But I am still seeing these:
>>>>>>>>>>>
>>>>>>>>>>> [   58.799130] unregister_netdevice: waiting for lo to become free.
>>>>>>>>>>> Usage count = 4
>>>>>>>>>>> [   60.847138] unregister_netdevice: waiting for lo to become free.
>>>>>>>>>>> Usage count = 4
>>>>>>>>>>> [   62.895093] unregister_netdevice: waiting for lo to become free.
>>>>>>>>>>> Usage count = 4
>>>>>>>>>>> [   64.943103] unregister_netdevice: waiting for lo to become free.
>>>>>>>>>>> Usage count = 4
>>>>>>>>>>>
>>>>>>>>>>> on upstream tree pulled ~12 hours ago.
>>>>>>>>>>>
>>>>>>>>>> Can you write a systemtap script to probe dev_hold, and dev_put, 
>>>>>>>>>> printing
>>>>>>>>>> out a
>>>>>>>>>> backtrace if the device name matches "lo".  That should 

Re: net: hang in unregister_netdevice: waiting for lo to become free

2018-05-11 Thread Dan Streetman
On Fri, May 11, 2018 at 5:19 AM, Dmitry Vyukov  wrote:
> On Thu, May 10, 2018 at 12:23 PM, Dan Streetman  wrote:
>>>>>>>>  wrote:
>>>>>>>>> On 20.02.2018 18:26, Neil Horman wrote:
>>>>>>>>>>
>>>>>>>>>> On Tue, Feb 20, 2018 at 09:14:41AM +0100, Dmitry Vyukov wrote:
>>>>>>>>>>>
>>>>>>>>>>> On Tue, Feb 20, 2018 at 8:56 AM, Tommi Rantala
>>>>>>>>>>>  wrote:
>>>>>>>>>>>>
>>>>>>>>>>>> On 19.02.2018 20:59, Dmitry Vyukov wrote:
>>>>>>>>>>>>>
>>>>>>>>>>>>> Is this meant to be fixed already? I am still seeing this on the
>>>>>>>>>>>>> latest upstream tree.
>>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> These two commits are in v4.16-rc1:
>>>>>>>>>>>>
>>>>>>>>>>>> commit 4a31a6b19f9ddf498c81f5c9b089742b7472a6f8
>>>>>>>>>>>> Author: Tommi Rantala 
>>>>>>>>>>>> Date:   Mon Feb 5 21:48:14 2018 +0200
>>>>>>>>>>>>
>>>>>>>>>>>>  sctp: fix dst refcnt leak in sctp_v4_get_dst
>>>>>>>>>>>> ...
>>>>>>>>>>>>  Fixes: 410f03831 ("sctp: add routing output fallback")
>>>>>>>>>>>>  Fixes: 0ca50d12f ("sctp: fix src address selection if using
>>>>>>>>>>>> secondary
>>>>>>>>>>>> addresses")
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> commit 957d761cf91cdbb175ad7d8f5472336a4d54dbf2
>>>>>>>>>>>> Author: Alexey Kodanev 
>>>>>>>>>>>> Date:   Mon Feb 5 15:10:35 2018 +0300
>>>>>>>>>>>>
>>>>>>>>>>>>  sctp: fix dst refcnt leak in sctp_v6_get_dst()
>>>>>>>>>>>> ...
>>>>>>>>>>>>  Fixes: dbc2b5e9a09e ("sctp: fix src address selection if using
>>>>>>>>>>>> secondary
>>>>>>>>>>>> addresses for ipv6")
>>>>>>>>>>>>
>>>>>>>>>>>>
>>>>>>>>>>>> I guess we missed something if it's still reproducible.
>>>>>>>>>>>>
>>>>>>>>>>>> I can check it later this week, unless someone else beat me to it.
>>>>>>>>>>>
>>>>>>>>>>>
>>>>>>>>>>> Hi Tommi,
>>>>>>>>>>>
>>>>>>>>>>> Hmmm, I can't claim that it's exactly the same bug. Perhaps it's
>>>>>>>>>>> another one then. But I am still seeing these:
>>>>>>>>>>>
>>>>>>>>>>> [   58.799130] unregister_netdevice: waiting for lo to become free.
>>>>>>>>>>> Usage count = 4
>>>>>>>>>>> [   60.847138] unregister_netdevice: waiting for lo to become free.
>>>>>>>>>>> Usage count = 4
>>>>>>>>>>> [   62.895093] unregister_netdevice: waiting for lo to become free.
>>>>>>>>>>> Usage count = 4
>>>>>>>>>>> [   64.943103] unregister_netdevice: waiting for lo to become free.
>>>>>>>>>>> Usage count = 4
>>>>>>>>>>>
>>>>>>>>>>> on upstream tree pulled ~12 hours ago.
>>>>>>>>>>>
>>>>>>>>>> Can you write a systemtap script to probe dev_hold, and dev_put, 
>>>>>>>>>> printing
>>>>>>>>>> out a
>>>>>>>>>> backtrace if the device name matches "lo".  That should tell us
>>>>>>>>>> definitively if
>>>>>>>>>> the problem is in the same location or not
>>>>>>>>&g

Re: net: hang in unregister_netdevice: waiting for lo to become free

2018-05-10 Thread Dan Streetman
On Thu, May 10, 2018 at 2:46 AM, Dmitry Vyukov <dvyu...@google.com> wrote:
> On Mon, Apr 16, 2018 at 9:42 PM, Dan Streetman <ddstr...@ieee.org> wrote:
>>>>>> On Wed, Feb 21, 2018 at 3:53 PM, Tommi Rantala
>>>>>> <tommi.t.rant...@nokia.com> wrote:
>>>>>>> On 20.02.2018 18:26, Neil Horman wrote:
>>>>>>>>
>>>>>>>> On Tue, Feb 20, 2018 at 09:14:41AM +0100, Dmitry Vyukov wrote:
>>>>>>>>>
>>>>>>>>> On Tue, Feb 20, 2018 at 8:56 AM, Tommi Rantala
>>>>>>>>> <tommi.t.rant...@nokia.com> wrote:
>>>>>>>>>>
>>>>>>>>>> On 19.02.2018 20:59, Dmitry Vyukov wrote:
>>>>>>>>>>>
>>>>>>>>>>> Is this meant to be fixed already? I am still seeing this on the
>>>>>>>>>>> latest upstream tree.
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> These two commits are in v4.16-rc1:
>>>>>>>>>>
>>>>>>>>>> commit 4a31a6b19f9ddf498c81f5c9b089742b7472a6f8
>>>>>>>>>> Author: Tommi Rantala <tommi.t.rant...@nokia.com>
>>>>>>>>>> Date:   Mon Feb 5 21:48:14 2018 +0200
>>>>>>>>>>
>>>>>>>>>>  sctp: fix dst refcnt leak in sctp_v4_get_dst
>>>>>>>>>> ...
>>>>>>>>>>  Fixes: 410f03831 ("sctp: add routing output fallback")
>>>>>>>>>>  Fixes: 0ca50d12f ("sctp: fix src address selection if using
>>>>>>>>>> secondary
>>>>>>>>>> addresses")
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> commit 957d761cf91cdbb175ad7d8f5472336a4d54dbf2
>>>>>>>>>> Author: Alexey Kodanev <alexey.koda...@oracle.com>
>>>>>>>>>> Date:   Mon Feb 5 15:10:35 2018 +0300
>>>>>>>>>>
>>>>>>>>>>  sctp: fix dst refcnt leak in sctp_v6_get_dst()
>>>>>>>>>> ...
>>>>>>>>>>  Fixes: dbc2b5e9a09e ("sctp: fix src address selection if using
>>>>>>>>>> secondary
>>>>>>>>>> addresses for ipv6")
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> I guess we missed something if it's still reproducible.
>>>>>>>>>>
>>>>>>>>>> I can check it later this week, unless someone else beat me to it.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Hi Tommi,
>>>>>>>>>
>>>>>>>>> Hmmm, I can't claim that it's exactly the same bug. Perhaps it's
>>>>>>>>> another one then. But I am still seeing these:
>>>>>>>>>
>>>>>>>>> [   58.799130] unregister_netdevice: waiting for lo to become free.
>>>>>>>>> Usage count = 4
>>>>>>>>> [   60.847138] unregister_netdevice: waiting for lo to become free.
>>>>>>>>> Usage count = 4
>>>>>>>>> [   62.895093] unregister_netdevice: waiting for lo to become free.
>>>>>>>>> Usage count = 4
>>>>>>>>> [   64.943103] unregister_netdevice: waiting for lo to become free.
>>>>>>>>> Usage count = 4
>>>>>>>>>
>>>>>>>>> on upstream tree pulled ~12 hours ago.
>>>>>>>>>
>>>>>>>> Can you write a systemtap script to probe dev_hold, and dev_put, 
>>>>>>>> printing
>>>>>>>> out a
>>>>>>>> backtrace if the device name matches "lo".  That should tell us
>>>>>>>> definitively if
>>>>>>>> the problem is in the same location or not
>>>>>>>
>>>>>>>
>>>>>>> Hi Dmitry, I tested with the reproducer and the kernel .config file 
>>>>>>> that you
>>>>>>> sent in the first email in this thread:
>>>>>>>
>>>>

Re: net: hang in unregister_netdevice: waiting for lo to become free

2018-05-10 Thread Dan Streetman
On Thu, May 10, 2018 at 2:46 AM, Dmitry Vyukov  wrote:
> On Mon, Apr 16, 2018 at 9:42 PM, Dan Streetman  wrote:
>>>>>> On Wed, Feb 21, 2018 at 3:53 PM, Tommi Rantala
>>>>>>  wrote:
>>>>>>> On 20.02.2018 18:26, Neil Horman wrote:
>>>>>>>>
>>>>>>>> On Tue, Feb 20, 2018 at 09:14:41AM +0100, Dmitry Vyukov wrote:
>>>>>>>>>
>>>>>>>>> On Tue, Feb 20, 2018 at 8:56 AM, Tommi Rantala
>>>>>>>>>  wrote:
>>>>>>>>>>
>>>>>>>>>> On 19.02.2018 20:59, Dmitry Vyukov wrote:
>>>>>>>>>>>
>>>>>>>>>>> Is this meant to be fixed already? I am still seeing this on the
>>>>>>>>>>> latest upstream tree.
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> These two commits are in v4.16-rc1:
>>>>>>>>>>
>>>>>>>>>> commit 4a31a6b19f9ddf498c81f5c9b089742b7472a6f8
>>>>>>>>>> Author: Tommi Rantala 
>>>>>>>>>> Date:   Mon Feb 5 21:48:14 2018 +0200
>>>>>>>>>>
>>>>>>>>>>  sctp: fix dst refcnt leak in sctp_v4_get_dst
>>>>>>>>>> ...
>>>>>>>>>>  Fixes: 410f03831 ("sctp: add routing output fallback")
>>>>>>>>>>  Fixes: 0ca50d12f ("sctp: fix src address selection if using
>>>>>>>>>> secondary
>>>>>>>>>> addresses")
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> commit 957d761cf91cdbb175ad7d8f5472336a4d54dbf2
>>>>>>>>>> Author: Alexey Kodanev 
>>>>>>>>>> Date:   Mon Feb 5 15:10:35 2018 +0300
>>>>>>>>>>
>>>>>>>>>>  sctp: fix dst refcnt leak in sctp_v6_get_dst()
>>>>>>>>>> ...
>>>>>>>>>>  Fixes: dbc2b5e9a09e ("sctp: fix src address selection if using
>>>>>>>>>> secondary
>>>>>>>>>> addresses for ipv6")
>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> I guess we missed something if it's still reproducible.
>>>>>>>>>>
>>>>>>>>>> I can check it later this week, unless someone else beat me to it.
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Hi Tommi,
>>>>>>>>>
>>>>>>>>> Hmmm, I can't claim that it's exactly the same bug. Perhaps it's
>>>>>>>>> another one then. But I am still seeing these:
>>>>>>>>>
>>>>>>>>> [   58.799130] unregister_netdevice: waiting for lo to become free.
>>>>>>>>> Usage count = 4
>>>>>>>>> [   60.847138] unregister_netdevice: waiting for lo to become free.
>>>>>>>>> Usage count = 4
>>>>>>>>> [   62.895093] unregister_netdevice: waiting for lo to become free.
>>>>>>>>> Usage count = 4
>>>>>>>>> [   64.943103] unregister_netdevice: waiting for lo to become free.
>>>>>>>>> Usage count = 4
>>>>>>>>>
>>>>>>>>> on upstream tree pulled ~12 hours ago.
>>>>>>>>>
>>>>>>>> Can you write a systemtap script to probe dev_hold, and dev_put, 
>>>>>>>> printing
>>>>>>>> out a
>>>>>>>> backtrace if the device name matches "lo".  That should tell us
>>>>>>>> definitively if
>>>>>>>> the problem is in the same location or not
>>>>>>>
>>>>>>>
>>>>>>> Hi Dmitry, I tested with the reproducer and the kernel .config file 
>>>>>>> that you
>>>>>>> sent in the first email in this thread:
>>>>>>>
>>>>>>> With 4.16-rc2 unable to reproduce.
>>>>>>>
>>>>>>> With 4.15-rc9 bug reproducible, and I get "unregister_netdevice: 
>

Re: net: hang in unregister_netdevice: waiting for lo to become free

2018-04-16 Thread Dan Streetman
On Mon, Apr 16, 2018 at 3:35 AM, Dmitry Vyukov <dvyu...@google.com> wrote:
> On Fri, Apr 13, 2018 at 5:54 PM, Dmitry Vyukov <dvyu...@google.com> wrote:
>> On Fri, Apr 13, 2018 at 2:43 PM, Dan Streetman <ddstr...@ieee.org> wrote:
>>> On Thu, Apr 12, 2018 at 8:15 AM, Dmitry Vyukov <dvyu...@google.com> wrote:
>>>> On Wed, Feb 21, 2018 at 3:53 PM, Tommi Rantala
>>>> <tommi.t.rant...@nokia.com> wrote:
>>>>> On 20.02.2018 18:26, Neil Horman wrote:
>>>>>>
>>>>>> On Tue, Feb 20, 2018 at 09:14:41AM +0100, Dmitry Vyukov wrote:
>>>>>>>
>>>>>>> On Tue, Feb 20, 2018 at 8:56 AM, Tommi Rantala
>>>>>>> <tommi.t.rant...@nokia.com> wrote:
>>>>>>>>
>>>>>>>> On 19.02.2018 20:59, Dmitry Vyukov wrote:
>>>>>>>>>
>>>>>>>>> Is this meant to be fixed already? I am still seeing this on the
>>>>>>>>> latest upstream tree.
>>>>>>>>>
>>>>>>>>
>>>>>>>> These two commits are in v4.16-rc1:
>>>>>>>>
>>>>>>>> commit 4a31a6b19f9ddf498c81f5c9b089742b7472a6f8
>>>>>>>> Author: Tommi Rantala <tommi.t.rant...@nokia.com>
>>>>>>>> Date:   Mon Feb 5 21:48:14 2018 +0200
>>>>>>>>
>>>>>>>>  sctp: fix dst refcnt leak in sctp_v4_get_dst
>>>>>>>> ...
>>>>>>>>  Fixes: 410f03831 ("sctp: add routing output fallback")
>>>>>>>>  Fixes: 0ca50d12f ("sctp: fix src address selection if using
>>>>>>>> secondary
>>>>>>>> addresses")
>>>>>>>>
>>>>>>>>
>>>>>>>> commit 957d761cf91cdbb175ad7d8f5472336a4d54dbf2
>>>>>>>> Author: Alexey Kodanev <alexey.koda...@oracle.com>
>>>>>>>> Date:   Mon Feb 5 15:10:35 2018 +0300
>>>>>>>>
>>>>>>>>  sctp: fix dst refcnt leak in sctp_v6_get_dst()
>>>>>>>> ...
>>>>>>>>  Fixes: dbc2b5e9a09e ("sctp: fix src address selection if using
>>>>>>>> secondary
>>>>>>>> addresses for ipv6")
>>>>>>>>
>>>>>>>>
>>>>>>>> I guess we missed something if it's still reproducible.
>>>>>>>>
>>>>>>>> I can check it later this week, unless someone else beat me to it.
>>>>>>>
>>>>>>>
>>>>>>> Hi Tommi,
>>>>>>>
>>>>>>> Hmmm, I can't claim that it's exactly the same bug. Perhaps it's
>>>>>>> another one then. But I am still seeing these:
>>>>>>>
>>>>>>> [   58.799130] unregister_netdevice: waiting for lo to become free.
>>>>>>> Usage count = 4
>>>>>>> [   60.847138] unregister_netdevice: waiting for lo to become free.
>>>>>>> Usage count = 4
>>>>>>> [   62.895093] unregister_netdevice: waiting for lo to become free.
>>>>>>> Usage count = 4
>>>>>>> [   64.943103] unregister_netdevice: waiting for lo to become free.
>>>>>>> Usage count = 4
>>>>>>>
>>>>>>> on upstream tree pulled ~12 hours ago.
>>>>>>>
>>>>>> Can you write a systemtap script to probe dev_hold, and dev_put, printing
>>>>>> out a
>>>>>> backtrace if the device name matches "lo".  That should tell us
>>>>>> definitively if
>>>>>> the problem is in the same location or not
>>>>>
>>>>>
>>>>> Hi Dmitry, I tested with the reproducer and the kernel .config file that 
>>>>> you
>>>>> sent in the first email in this thread:
>>>>>
>>>>> With 4.16-rc2 unable to reproduce.
>>>>>
>>>>> With 4.15-rc9 bug reproducible, and I get "unregister_netdevice: waiting 
>>>>> for
>>>>> lo to become free. Usage count = 3"
>>>>>
>>>>> With 4.15-rc9 and Alexey's "sctp: fix dst refcnt leak in 
>>>>> sctp_v6_get_dst()"
>>>>> cherry

Re: net: hang in unregister_netdevice: waiting for lo to become free

2018-04-16 Thread Dan Streetman
On Mon, Apr 16, 2018 at 3:35 AM, Dmitry Vyukov  wrote:
> On Fri, Apr 13, 2018 at 5:54 PM, Dmitry Vyukov  wrote:
>> On Fri, Apr 13, 2018 at 2:43 PM, Dan Streetman  wrote:
>>> On Thu, Apr 12, 2018 at 8:15 AM, Dmitry Vyukov  wrote:
>>>> On Wed, Feb 21, 2018 at 3:53 PM, Tommi Rantala
>>>>  wrote:
>>>>> On 20.02.2018 18:26, Neil Horman wrote:
>>>>>>
>>>>>> On Tue, Feb 20, 2018 at 09:14:41AM +0100, Dmitry Vyukov wrote:
>>>>>>>
>>>>>>> On Tue, Feb 20, 2018 at 8:56 AM, Tommi Rantala
>>>>>>>  wrote:
>>>>>>>>
>>>>>>>> On 19.02.2018 20:59, Dmitry Vyukov wrote:
>>>>>>>>>
>>>>>>>>> Is this meant to be fixed already? I am still seeing this on the
>>>>>>>>> latest upstream tree.
>>>>>>>>>
>>>>>>>>
>>>>>>>> These two commits are in v4.16-rc1:
>>>>>>>>
>>>>>>>> commit 4a31a6b19f9ddf498c81f5c9b089742b7472a6f8
>>>>>>>> Author: Tommi Rantala 
>>>>>>>> Date:   Mon Feb 5 21:48:14 2018 +0200
>>>>>>>>
>>>>>>>>  sctp: fix dst refcnt leak in sctp_v4_get_dst
>>>>>>>> ...
>>>>>>>>  Fixes: 410f03831 ("sctp: add routing output fallback")
>>>>>>>>  Fixes: 0ca50d12f ("sctp: fix src address selection if using
>>>>>>>> secondary
>>>>>>>> addresses")
>>>>>>>>
>>>>>>>>
>>>>>>>> commit 957d761cf91cdbb175ad7d8f5472336a4d54dbf2
>>>>>>>> Author: Alexey Kodanev 
>>>>>>>> Date:   Mon Feb 5 15:10:35 2018 +0300
>>>>>>>>
>>>>>>>>  sctp: fix dst refcnt leak in sctp_v6_get_dst()
>>>>>>>> ...
>>>>>>>>  Fixes: dbc2b5e9a09e ("sctp: fix src address selection if using
>>>>>>>> secondary
>>>>>>>> addresses for ipv6")
>>>>>>>>
>>>>>>>>
>>>>>>>> I guess we missed something if it's still reproducible.
>>>>>>>>
>>>>>>>> I can check it later this week, unless someone else beat me to it.
>>>>>>>
>>>>>>>
>>>>>>> Hi Tommi,
>>>>>>>
>>>>>>> Hmmm, I can't claim that it's exactly the same bug. Perhaps it's
>>>>>>> another one then. But I am still seeing these:
>>>>>>>
>>>>>>> [   58.799130] unregister_netdevice: waiting for lo to become free.
>>>>>>> Usage count = 4
>>>>>>> [   60.847138] unregister_netdevice: waiting for lo to become free.
>>>>>>> Usage count = 4
>>>>>>> [   62.895093] unregister_netdevice: waiting for lo to become free.
>>>>>>> Usage count = 4
>>>>>>> [   64.943103] unregister_netdevice: waiting for lo to become free.
>>>>>>> Usage count = 4
>>>>>>>
>>>>>>> on upstream tree pulled ~12 hours ago.
>>>>>>>
>>>>>> Can you write a systemtap script to probe dev_hold, and dev_put, printing
>>>>>> out a
>>>>>> backtrace if the device name matches "lo".  That should tell us
>>>>>> definitively if
>>>>>> the problem is in the same location or not
>>>>>
>>>>>
>>>>> Hi Dmitry, I tested with the reproducer and the kernel .config file that 
>>>>> you
>>>>> sent in the first email in this thread:
>>>>>
>>>>> With 4.16-rc2 unable to reproduce.
>>>>>
>>>>> With 4.15-rc9 bug reproducible, and I get "unregister_netdevice: waiting 
>>>>> for
>>>>> lo to become free. Usage count = 3"
>>>>>
>>>>> With 4.15-rc9 and Alexey's "sctp: fix dst refcnt leak in 
>>>>> sctp_v6_get_dst()"
>>>>> cherry-picked on top, unable to reproduce.
>>>>>
>>>>>
>>>>> Is syzkaller doing something else now to trigger the bug...?
>>>>> Can you still trigger the bug with the same reproducer?

Re: net: hang in unregister_netdevice: waiting for lo to become free

2018-04-13 Thread Dan Streetman
On Thu, Apr 12, 2018 at 8:15 AM, Dmitry Vyukov  wrote:
> On Wed, Feb 21, 2018 at 3:53 PM, Tommi Rantala
>  wrote:
>> On 20.02.2018 18:26, Neil Horman wrote:
>>>
>>> On Tue, Feb 20, 2018 at 09:14:41AM +0100, Dmitry Vyukov wrote:

 On Tue, Feb 20, 2018 at 8:56 AM, Tommi Rantala
  wrote:
>
> On 19.02.2018 20:59, Dmitry Vyukov wrote:
>>
>> Is this meant to be fixed already? I am still seeing this on the
>> latest upstream tree.
>>
>
> These two commits are in v4.16-rc1:
>
> commit 4a31a6b19f9ddf498c81f5c9b089742b7472a6f8
> Author: Tommi Rantala 
> Date:   Mon Feb 5 21:48:14 2018 +0200
>
>  sctp: fix dst refcnt leak in sctp_v4_get_dst
> ...
>  Fixes: 410f03831 ("sctp: add routing output fallback")
>  Fixes: 0ca50d12f ("sctp: fix src address selection if using
> secondary
> addresses")
>
>
> commit 957d761cf91cdbb175ad7d8f5472336a4d54dbf2
> Author: Alexey Kodanev 
> Date:   Mon Feb 5 15:10:35 2018 +0300
>
>  sctp: fix dst refcnt leak in sctp_v6_get_dst()
> ...
>  Fixes: dbc2b5e9a09e ("sctp: fix src address selection if using
> secondary
> addresses for ipv6")
>
>
> I guess we missed something if it's still reproducible.
>
> I can check it later this week, unless someone else beat me to it.


 Hi Tommi,

 Hmmm, I can't claim that it's exactly the same bug. Perhaps it's
 another one then. But I am still seeing these:

 [   58.799130] unregister_netdevice: waiting for lo to become free.
 Usage count = 4
 [   60.847138] unregister_netdevice: waiting for lo to become free.
 Usage count = 4
 [   62.895093] unregister_netdevice: waiting for lo to become free.
 Usage count = 4
 [   64.943103] unregister_netdevice: waiting for lo to become free.
 Usage count = 4

 on upstream tree pulled ~12 hours ago.

>>> Can you write a systemtap script to probe dev_hold, and dev_put, printing
>>> out a
>>> backtrace if the device name matches "lo".  That should tell us
>>> definitively if
>>> the problem is in the same location or not
>>
>>
>> Hi Dmitry, I tested with the reproducer and the kernel .config file that you
>> sent in the first email in this thread:
>>
>> With 4.16-rc2 unable to reproduce.
>>
>> With 4.15-rc9 bug reproducible, and I get "unregister_netdevice: waiting for
>> lo to become free. Usage count = 3"
>>
>> With 4.15-rc9 and Alexey's "sctp: fix dst refcnt leak in sctp_v6_get_dst()"
>> cherry-picked on top, unable to reproduce.
>>
>>
>> Is syzkaller doing something else now to trigger the bug...?
>> Can you still trigger the bug with the same reproducer?
>
> Hi Neil, Tommi,
>
> Reviving this old thread about "unregister_netdevice: waiting for lo
> to become free. Usage count = 3" hangs.
> I still did not have time to deep dive into what happens there (too
> many bugs coming from syzbot). But this still actively happens and I
> suspect accounts to a significant portion of various hang reports,
> which are quite unpleasant.
>
> One idea that could make it all simpler:
>
> Is this wait loop in netdev_wait_allrefs() supposed to wait for any
> prolonged periods of time under any non-buggy conditions? E.g. more
> than 1-2 minutes?
> If it only supposed to wait briefly for things that already supposed
> to be shutting down, and we add a WARNING there after some timeout,
> then syzbot will report all info how/when it happens, hopefully
> extracting reproducers, and all the nice things.
> But this WARNING should not have any false positives under any
> realistic conditions (e.g. waiting for arrival of remote packets with
> large timeouts).
>
> Looking at some task hung reports, it seems that this code holds some
> mutexes, takes workqueue thread and prevents any progress with
> destruction of other devices (and net namespace creation/destruction),
> so I guess it should not wait for any indefinite periods of time?

I'm working on this currently:
https://bugs.launchpad.net/ubuntu/zesty/+source/linux/+bug/1711407

I added a summary of what I've found to be the cause (or at least, one
possible cause) of this:
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407/comments/72

I'm working on a patch to work around the main side-effect of this,
which is hanging while holding the global net mutex.  Hangs will still
happen (e.g. if a dst leaks) but should not affect anything else,
other than a leak of the dst and its net namespace.

Fixing the dst leaks is important too, of course, but a dst leak (or
other cause) shouldn't break the entire system.


Re: net: hang in unregister_netdevice: waiting for lo to become free

2018-04-13 Thread Dan Streetman
On Thu, Apr 12, 2018 at 8:15 AM, Dmitry Vyukov  wrote:
> On Wed, Feb 21, 2018 at 3:53 PM, Tommi Rantala
>  wrote:
>> On 20.02.2018 18:26, Neil Horman wrote:
>>>
>>> On Tue, Feb 20, 2018 at 09:14:41AM +0100, Dmitry Vyukov wrote:

 On Tue, Feb 20, 2018 at 8:56 AM, Tommi Rantala
  wrote:
>
> On 19.02.2018 20:59, Dmitry Vyukov wrote:
>>
>> Is this meant to be fixed already? I am still seeing this on the
>> latest upstream tree.
>>
>
> These two commits are in v4.16-rc1:
>
> commit 4a31a6b19f9ddf498c81f5c9b089742b7472a6f8
> Author: Tommi Rantala 
> Date:   Mon Feb 5 21:48:14 2018 +0200
>
>  sctp: fix dst refcnt leak in sctp_v4_get_dst
> ...
>  Fixes: 410f03831 ("sctp: add routing output fallback")
>  Fixes: 0ca50d12f ("sctp: fix src address selection if using
> secondary
> addresses")
>
>
> commit 957d761cf91cdbb175ad7d8f5472336a4d54dbf2
> Author: Alexey Kodanev 
> Date:   Mon Feb 5 15:10:35 2018 +0300
>
>  sctp: fix dst refcnt leak in sctp_v6_get_dst()
> ...
>  Fixes: dbc2b5e9a09e ("sctp: fix src address selection if using
> secondary
> addresses for ipv6")
>
>
> I guess we missed something if it's still reproducible.
>
> I can check it later this week, unless someone else beat me to it.


 Hi Tommi,

 Hmmm, I can't claim that it's exactly the same bug. Perhaps it's
 another one then. But I am still seeing these:

 [   58.799130] unregister_netdevice: waiting for lo to become free.
 Usage count = 4
 [   60.847138] unregister_netdevice: waiting for lo to become free.
 Usage count = 4
 [   62.895093] unregister_netdevice: waiting for lo to become free.
 Usage count = 4
 [   64.943103] unregister_netdevice: waiting for lo to become free.
 Usage count = 4

 on upstream tree pulled ~12 hours ago.

>>> Can you write a systemtap script to probe dev_hold, and dev_put, printing
>>> out a
>>> backtrace if the device name matches "lo".  That should tell us
>>> definitively if
>>> the problem is in the same location or not
>>
>>
>> Hi Dmitry, I tested with the reproducer and the kernel .config file that you
>> sent in the first email in this thread:
>>
>> With 4.16-rc2 unable to reproduce.
>>
>> With 4.15-rc9 bug reproducible, and I get "unregister_netdevice: waiting for
>> lo to become free. Usage count = 3"
>>
>> With 4.15-rc9 and Alexey's "sctp: fix dst refcnt leak in sctp_v6_get_dst()"
>> cherry-picked on top, unable to reproduce.
>>
>>
>> Is syzkaller doing something else now to trigger the bug...?
>> Can you still trigger the bug with the same reproducer?
>
> Hi Neil, Tommi,
>
> Reviving this old thread about "unregister_netdevice: waiting for lo
> to become free. Usage count = 3" hangs.
> I still did not have time to deep dive into what happens there (too
> many bugs coming from syzbot). But this still actively happens and I
> suspect accounts to a significant portion of various hang reports,
> which are quite unpleasant.
>
> One idea that could make it all simpler:
>
> Is this wait loop in netdev_wait_allrefs() supposed to wait for any
> prolonged periods of time under any non-buggy conditions? E.g. more
> than 1-2 minutes?
> If it only supposed to wait briefly for things that already supposed
> to be shutting down, and we add a WARNING there after some timeout,
> then syzbot will report all info how/when it happens, hopefully
> extracting reproducers, and all the nice things.
> But this WARNING should not have any false positives under any
> realistic conditions (e.g. waiting for arrival of remote packets with
> large timeouts).
>
> Looking at some task hung reports, it seems that this code holds some
> mutexes, takes workqueue thread and prevents any progress with
> destruction of other devices (and net namespace creation/destruction),
> so I guess it should not wait for any indefinite periods of time?

I'm working on this currently:
https://bugs.launchpad.net/ubuntu/zesty/+source/linux/+bug/1711407

I added a summary of what I've found to be the cause (or at least, one
possible cause) of this:
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407/comments/72

I'm working on a patch to work around the main side-effect of this,
which is hanging while holding the global net mutex.  Hangs will still
happen (e.g. if a dst leaks) but should not affect anything else,
other than a leak of the dst and its net namespace.

Fixing the dst leaks is important too, of course, but a dst leak (or
other cause) shouldn't break the entire system.


Re: [PATCH] crypto: nx-842: Delete an error message for a failed memory allocation in nx842_pseries_init()

2018-02-14 Thread Dan Streetman
On Wed, Feb 14, 2018 at 11:17 AM, SF Markus Elfring
<elfr...@users.sourceforge.net> wrote:
> From: Markus Elfring <elfr...@users.sourceforge.net>
> Date: Wed, 14 Feb 2018 17:05:13 +0100
>
> Omit an extra message for a memory allocation failure in this function.
>
> This issue was detected by using the Coccinelle software.
>
> Signed-off-by: Markus Elfring <elfr...@users.sourceforge.net>

Reviewed-by: Dan Streetman <ddstr...@ieee.org>

> ---
>  drivers/crypto/nx/nx-842-pseries.c | 5 ++---
>  1 file changed, 2 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/crypto/nx/nx-842-pseries.c 
> b/drivers/crypto/nx/nx-842-pseries.c
> index bf52cd1d7fca..66869976cfa2 100644
> --- a/drivers/crypto/nx/nx-842-pseries.c
> +++ b/drivers/crypto/nx/nx-842-pseries.c
> @@ -1105,10 +1105,9 @@ static int __init nx842_pseries_init(void)
>
> RCU_INIT_POINTER(devdata, NULL);
> new_devdata = kzalloc(sizeof(*new_devdata), GFP_KERNEL);
> -   if (!new_devdata) {
> -   pr_err("Could not allocate memory for device data\n");
> +   if (!new_devdata)
> return -ENOMEM;
> -   }
> +
> RCU_INIT_POINTER(devdata, new_devdata);
>
> ret = vio_register_driver(&nx842_vio_driver);
> --
> 2.16.1
>


Re: [PATCH] crypto: nx-842: Delete an error message for a failed memory allocation in nx842_pseries_init()

2018-02-14 Thread Dan Streetman
On Wed, Feb 14, 2018 at 11:17 AM, SF Markus Elfring
 wrote:
> From: Markus Elfring 
> Date: Wed, 14 Feb 2018 17:05:13 +0100
>
> Omit an extra message for a memory allocation failure in this function.
>
> This issue was detected by using the Coccinelle software.
>
> Signed-off-by: Markus Elfring 

Reviewed-by: Dan Streetman 

> ---
>  drivers/crypto/nx/nx-842-pseries.c | 5 ++---
>  1 file changed, 2 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/crypto/nx/nx-842-pseries.c 
> b/drivers/crypto/nx/nx-842-pseries.c
> index bf52cd1d7fca..66869976cfa2 100644
> --- a/drivers/crypto/nx/nx-842-pseries.c
> +++ b/drivers/crypto/nx/nx-842-pseries.c
> @@ -1105,10 +1105,9 @@ static int __init nx842_pseries_init(void)
>
> RCU_INIT_POINTER(devdata, NULL);
> new_devdata = kzalloc(sizeof(*new_devdata), GFP_KERNEL);
> -   if (!new_devdata) {
> -   pr_err("Could not allocate memory for device data\n");
> +   if (!new_devdata)
> return -ENOMEM;
> -   }
> +
> RCU_INIT_POINTER(devdata, new_devdata);
>
> ret = vio_register_driver(&nx842_vio_driver);
> --
> 2.16.1
>


Re: [PATCH -mm -v2] mm, swap, frontswap: Fix THP swap if frontswap enabled

2018-02-12 Thread Dan Streetman

On Thu, 8 Feb 2018, Minchan Kim wrote:

> Hi Huang,
> 
> On Thu, Feb 08, 2018 at 11:27:50PM +0800, huang ying wrote:
> > On Wed, Feb 7, 2018 at 3:00 PM, Huang, Ying <ying.hu...@intel.com> wrote:
> > > From: Huang Ying <huang.ying.cari...@gmail.com>
> > >
> > > It was reported by Sergey Senozhatsky that if THP (Transparent Huge
> > > Page) and frontswap (via zswap) are both enabled, when memory goes low
> > > so that swap is triggered, segfault and memory corruption will occur
> > > in random user space applications as follow,
> > >
> > > kernel: urxvt[338]: segfault at 20 ip 7fc08889ae0d sp 
> > > 7ffc73a7fc40 error 6 in libc-2.26.so[7fc08881a000+1ae000]
> > >  #0  0x7fc08889ae0d _int_malloc (libc.so.6)
> > >  #1  0x7fc08889c2f3 malloc (libc.so.6)
> > >  #2  0x560e6004bff7 _Z14rxvt_wcstoutf8PKwi (urxvt)
> > >  #3  0x560e6005e75c n/a (urxvt)
> > >  #4  0x560e6007d9f1 
> > > _ZN16rxvt_perl_interp6invokeEP9rxvt_term9hook_typez (urxvt)
> > >  #5  0x560e6003d988 _ZN9rxvt_term9cmd_parseEv (urxvt)
> > >  #6  0x560e60042804 _ZN9rxvt_term6pty_cbERN2ev2ioEi (urxvt)
> > >  #7  0x560e6005c10f _Z17ev_invoke_pendingv (urxvt)
> > >  #8  0x560e6005cb55 ev_run (urxvt)
> > >  #9  0x560e6003b9b9 main (urxvt)
> > >  #10 0x7fc08883af4a __libc_start_main (libc.so.6)
> > >  #11 0x560e6003f9da _start (urxvt)
> > >
> > > After bisection, it was found the first bad commit is
> > > bd4c82c22c367e068 ("mm, THP, swap: delay splitting THP after swapped
> > > out").
> > >
> > > The root cause is as follow.
> > >
> > > When the pages are written to swap device during swapping out in
> > > swap_writepage(), zswap (fontswap) is tried to compress the pages
> > > instead to improve the performance.  But zswap (frontswap) will treat
> > > THP as normal page, so only the head page is saved.  After swapping
> > > in, tail pages will not be restored to its original contents, so cause
> > > the memory corruption in the applications.
> > >
> > > This is fixed via splitting THP before writing the page to swap device
> > > if frontswap is enabled.  To deal with the situation where frontswap
> > > is enabled at runtime, whether the page is THP is checked before using
> > > frontswap during swapping out too.
> > >
> > > Reported-and-tested-by: Sergey Senozhatsky <sergey.senozhat...@gmail.com>
> > > Signed-off-by: "Huang, Ying" <ying.hu...@intel.com>
> > > Cc: Konrad Rzeszutek Wilk <konrad.w...@oracle.com>
> > > Cc: Dan Streetman <ddstr...@ieee.org>
> > > Cc: Seth Jennings <sjenn...@redhat.com>
> > > Cc: Minchan Kim <minc...@kernel.org>
> > > Cc: Tetsuo Handa <penguin-ker...@i-love.sakura.ne.jp>
> > > Cc: Shaohua Li <s...@kernel.org>
> > > Cc: Michal Hocko <mho...@suse.com>
> > > Cc: Johannes Weiner <han...@cmpxchg.org>
> > > Cc: Mel Gorman <mgor...@techsingularity.net>
> > > Cc: Shakeel Butt <shake...@google.com>
> > > Cc: sta...@vger.kernel.org # 4.14
> > > Fixes: bd4c82c22c367e068 ("mm, THP, swap: delay splitting THP after 
> > > swapped out")
> > >
> > > Changelog:
> > >
> > > v2:
> > >
> > > - Move frontswap check into swapfile.c to avoid to make vmscan.c
> > >   depends on frontswap.
> > > ---
> > >  mm/page_io.c  | 2 +-
> > >  mm/swapfile.c | 3 +++
> > >  2 files changed, 4 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/mm/page_io.c b/mm/page_io.c
> > > index b41cf9644585..6dca817ae7a0 100644
> > > --- a/mm/page_io.c
> > > +++ b/mm/page_io.c
> > > @@ -250,7 +250,7 @@ int swap_writepage(struct page *page, struct 
> > > writeback_control *wbc)
> > > unlock_page(page);
> > > goto out;
> > > }
> > > -   if (frontswap_store(page) == 0) {
> > > +   if (!PageTransHuge(page) && frontswap_store(page) == 0) {
> > > set_page_writeback(page);
> > > unlock_page(page);
> > > end_page_writeback(page);
> > > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > > index 006047b16814..0b7c7883ce64 100644
> > > --- a/mm/swapfile.c
> > > +++ b/mm/swapfile.c
> > > @@ -934,6 +934,9 @@ int get_swap_pages(int n_goal

Re: [PATCH -mm -v2] mm, swap, frontswap: Fix THP swap if frontswap enabled

2018-02-12 Thread Dan Streetman

On Thu, 8 Feb 2018, Minchan Kim wrote:

> Hi Huang,
> 
> On Thu, Feb 08, 2018 at 11:27:50PM +0800, huang ying wrote:
> > On Wed, Feb 7, 2018 at 3:00 PM, Huang, Ying  wrote:
> > > From: Huang Ying 
> > >
> > > It was reported by Sergey Senozhatsky that if THP (Transparent Huge
> > > Page) and frontswap (via zswap) are both enabled, when memory goes low
> > > so that swap is triggered, segfault and memory corruption will occur
> > > in random user space applications as follow,
> > >
> > > kernel: urxvt[338]: segfault at 20 ip 7fc08889ae0d sp 
> > > 7ffc73a7fc40 error 6 in libc-2.26.so[7fc08881a000+1ae000]
> > >  #0  0x7fc08889ae0d _int_malloc (libc.so.6)
> > >  #1  0x7fc08889c2f3 malloc (libc.so.6)
> > >  #2  0x560e6004bff7 _Z14rxvt_wcstoutf8PKwi (urxvt)
> > >  #3  0x560e6005e75c n/a (urxvt)
> > >  #4  0x560e6007d9f1 
> > > _ZN16rxvt_perl_interp6invokeEP9rxvt_term9hook_typez (urxvt)
> > >  #5  0x560e6003d988 _ZN9rxvt_term9cmd_parseEv (urxvt)
> > >  #6  0x560e60042804 _ZN9rxvt_term6pty_cbERN2ev2ioEi (urxvt)
> > >  #7  0x560e6005c10f _Z17ev_invoke_pendingv (urxvt)
> > >  #8  0x560e6005cb55 ev_run (urxvt)
> > >  #9  0x560e6003b9b9 main (urxvt)
> > >  #10 0x7fc08883af4a __libc_start_main (libc.so.6)
> > >  #11 0x560e6003f9da _start (urxvt)
> > >
> > > After bisection, it was found the first bad commit is
> > > bd4c82c22c367e068 ("mm, THP, swap: delay splitting THP after swapped
> > > out").
> > >
> > > The root cause is as follow.
> > >
> > > When the pages are written to swap device during swapping out in
> > > swap_writepage(), zswap (fontswap) is tried to compress the pages
> > > instead to improve the performance.  But zswap (frontswap) will treat
> > > THP as normal page, so only the head page is saved.  After swapping
> > > in, tail pages will not be restored to its original contents, so cause
> > > the memory corruption in the applications.
> > >
> > > This is fixed via splitting THP before writing the page to swap device
> > > if frontswap is enabled.  To deal with the situation where frontswap
> > > is enabled at runtime, whether the page is THP is checked before using
> > > frontswap during swapping out too.
> > >
> > > Reported-and-tested-by: Sergey Senozhatsky 
> > > Signed-off-by: "Huang, Ying" 
> > > Cc: Konrad Rzeszutek Wilk 
> > > Cc: Dan Streetman 
> > > Cc: Seth Jennings 
> > > Cc: Minchan Kim 
> > > Cc: Tetsuo Handa 
> > > Cc: Shaohua Li 
> > > Cc: Michal Hocko 
> > > Cc: Johannes Weiner 
> > > Cc: Mel Gorman 
> > > Cc: Shakeel Butt 
> > > Cc: sta...@vger.kernel.org # 4.14
> > > Fixes: bd4c82c22c367e068 ("mm, THP, swap: delay splitting THP after 
> > > swapped out")
> > >
> > > Changelog:
> > >
> > > v2:
> > >
> > > - Move frontswap check into swapfile.c to avoid to make vmscan.c
> > >   depends on frontswap.
> > > ---
> > >  mm/page_io.c  | 2 +-
> > >  mm/swapfile.c | 3 +++
> > >  2 files changed, 4 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/mm/page_io.c b/mm/page_io.c
> > > index b41cf9644585..6dca817ae7a0 100644
> > > --- a/mm/page_io.c
> > > +++ b/mm/page_io.c
> > > @@ -250,7 +250,7 @@ int swap_writepage(struct page *page, struct 
> > > writeback_control *wbc)
> > > unlock_page(page);
> > > goto out;
> > > }
> > > -   if (frontswap_store(page) == 0) {
> > > +   if (!PageTransHuge(page) && frontswap_store(page) == 0) {
> > > set_page_writeback(page);
> > > unlock_page(page);
> > > end_page_writeback(page);
> > > diff --git a/mm/swapfile.c b/mm/swapfile.c
> > > index 006047b16814..0b7c7883ce64 100644
> > > --- a/mm/swapfile.c
> > > +++ b/mm/swapfile.c
> > > @@ -934,6 +934,9 @@ int get_swap_pages(int n_goal, bool cluster, 
> > > swp_entry_t swp_entries[])
> > >
> > > /* Only single cluster request supported */
> > > WARN_ON_ONCE(n_goal > 1 && cluster);
> > > +   /* Frontswap doesn't support THP */
> > > +   if (frontswap_enabled() && cluster)
> > > +   goto noswap;
> > 
> > I foun

[PATCH] net: tcp: close sock if net namespace is exiting

2018-01-18 Thread Dan Streetman
When a tcp socket is closed, if it detects that its net namespace is
exiting, close immediately and do not wait for FIN sequence.

For normal sockets, a reference is taken to their net namespace, so it will
never exit while the socket is open.  However, kernel sockets do not take a
reference to their net namespace, so it may begin exiting while the kernel
socket is still open.  In this case if the kernel socket is a tcp socket,
it will stay open trying to complete its close sequence.  The sock's dst(s)
hold a reference to their interface, which are all transferred to the
namespace's loopback interface when the real interfaces are taken down.
When the namespace tries to take down its loopback interface, it hangs
waiting for all references to the loopback interface to release, which
results in messages like:

unregister_netdevice: waiting for lo to become free. Usage count = 1

These messages continue until the socket finally times out and closes.
Since the net namespace cleanup holds the net_mutex while calling its
registered pernet callbacks, any new net namespace initialization is
blocked until the current net namespace finishes exiting.

After this change, the tcp socket notices the exiting net namespace, and
closes immediately, releasing its dst(s) and their reference to the
loopback interface, which lets the net namespace continue exiting.

Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=97811
Signed-off-by: Dan Streetman <ddstr...@canonical.com>
---
 include/net/net_namespace.h | 10 ++
 net/ipv4/tcp.c  |  3 +++
 net/ipv4/tcp_timer.c| 15 +++
 3 files changed, 28 insertions(+)

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index f8a84a2c2341..f306b2aa15a4 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -223,6 +223,11 @@ int net_eq(const struct net *net1, const struct net *net2)
return net1 == net2;
 }
 
+static inline int check_net(const struct net *net)
+{
+   return refcount_read(&net->count) != 0;
+}
+
 void net_drop_ns(void *);
 
 #else
@@ -247,6 +252,11 @@ int net_eq(const struct net *net1, const struct net *net2)
return 1;
 }
 
+static inline int check_net(const struct net *net)
+{
+   return 1;
+}
+
 #define net_drop_ns NULL
 #endif
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d7cf861bf699..9389193e73f3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2298,6 +2298,9 @@ void tcp_close(struct sock *sk, long timeout)
tcp_send_active_reset(sk, GFP_ATOMIC);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONMEMORY);
+   } else if (!check_net(sock_net(sk))) {
+   /* Not possible to send reset; just close */
+   tcp_set_state(sk, TCP_CLOSE);
}
}
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 6db3124cdbda..41b40b805aa3 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -48,11 +48,19 @@ static void tcp_write_err(struct sock *sk)
  *  to prevent DoS attacks. It is called when a retransmission timeout
  *  or zero probe timeout occurs on orphaned socket.
  *
+ *  Also close if our net namespace is exiting; in that case there is no
+ *  hope of ever communicating again since all netns interfaces are already
+ *  down (or about to be down), and we need to release our dst references,
+ *  which have been moved to the netns loopback interface, so the namespace
+ *  can finish exiting.  This condition is only possible if we are a kernel
+ *  socket, as those do not hold references to the namespace.
+ *
  *  Criteria is still not confirmed experimentally and may change.
  *  We kill the socket, if:
  *  1. If number of orphaned sockets exceeds an administratively configured
  * limit.
  *  2. If we have strong memory pressure.
+ *  3. If our net namespace is exiting.
  */
 static int tcp_out_of_resources(struct sock *sk, bool do_reset)
 {
@@ -81,6 +89,13 @@ static int tcp_out_of_resources(struct sock *sk, bool 
do_reset)
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
return 1;
}
+
+   if (!check_net(sock_net(sk))) {
+   /* Not possible to send reset; just close */
+   tcp_done(sk);
+   return 1;
+   }
+
return 0;
 }
 
-- 
2.14.1



[PATCH] net: tcp: close sock if net namespace is exiting

2018-01-18 Thread Dan Streetman
When a tcp socket is closed, if it detects that its net namespace is
exiting, close immediately and do not wait for FIN sequence.

For normal sockets, a reference is taken to their net namespace, so it will
never exit while the socket is open.  However, kernel sockets do not take a
reference to their net namespace, so it may begin exiting while the kernel
socket is still open.  In this case if the kernel socket is a tcp socket,
it will stay open trying to complete its close sequence.  The sock's dst(s)
hold a reference to their interface, which are all transferred to the
namespace's loopback interface when the real interfaces are taken down.
When the namespace tries to take down its loopback interface, it hangs
waiting for all references to the loopback interface to release, which
results in messages like:

unregister_netdevice: waiting for lo to become free. Usage count = 1

These messages continue until the socket finally times out and closes.
Since the net namespace cleanup holds the net_mutex while calling its
registered pernet callbacks, any new net namespace initialization is
blocked until the current net namespace finishes exiting.

After this change, the tcp socket notices the exiting net namespace, and
closes immediately, releasing its dst(s) and their reference to the
loopback interface, which lets the net namespace continue exiting.

Link: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1711407
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=97811
Signed-off-by: Dan Streetman 
---
 include/net/net_namespace.h | 10 ++
 net/ipv4/tcp.c  |  3 +++
 net/ipv4/tcp_timer.c| 15 +++
 3 files changed, 28 insertions(+)

diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index f8a84a2c2341..f306b2aa15a4 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -223,6 +223,11 @@ int net_eq(const struct net *net1, const struct net *net2)
return net1 == net2;
 }
 
+static inline int check_net(const struct net *net)
+{
+   return refcount_read(&net->count) != 0;
+}
+
 void net_drop_ns(void *);
 
 #else
@@ -247,6 +252,11 @@ int net_eq(const struct net *net1, const struct net *net2)
return 1;
 }
 
+static inline int check_net(const struct net *net)
+{
+   return 1;
+}
+
 #define net_drop_ns NULL
 #endif
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d7cf861bf699..9389193e73f3 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2298,6 +2298,9 @@ void tcp_close(struct sock *sk, long timeout)
tcp_send_active_reset(sk, GFP_ATOMIC);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPABORTONMEMORY);
+   } else if (!check_net(sock_net(sk))) {
+   /* Not possible to send reset; just close */
+   tcp_set_state(sk, TCP_CLOSE);
}
}
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 6db3124cdbda..41b40b805aa3 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -48,11 +48,19 @@ static void tcp_write_err(struct sock *sk)
  *  to prevent DoS attacks. It is called when a retransmission timeout
  *  or zero probe timeout occurs on orphaned socket.
  *
+ *  Also close if our net namespace is exiting; in that case there is no
+ *  hope of ever communicating again since all netns interfaces are already
+ *  down (or about to be down), and we need to release our dst references,
+ *  which have been moved to the netns loopback interface, so the namespace
+ *  can finish exiting.  This condition is only possible if we are a kernel
+ *  socket, as those do not hold references to the namespace.
+ *
  *  Criteria is still not confirmed experimentally and may change.
  *  We kill the socket, if:
  *  1. If number of orphaned sockets exceeds an administratively configured
  * limit.
  *  2. If we have strong memory pressure.
+ *  3. If our net namespace is exiting.
  */
 static int tcp_out_of_resources(struct sock *sk, bool do_reset)
 {
@@ -81,6 +89,13 @@ static int tcp_out_of_resources(struct sock *sk, bool 
do_reset)
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY);
return 1;
}
+
+   if (!check_net(sock_net(sk))) {
+   /* Not possible to send reset; just close */
+   tcp_done(sk);
+   return 1;
+   }
+
return 0;
 }
 
-- 
2.14.1



Re: [PATCH v3] zswap: only save zswap header when necessary

2018-01-11 Thread Dan Streetman
On Wed, Jan 10, 2018 at 5:56 PM, Yu Zhao <yuz...@google.com> wrote:
> We waste sizeof(swp_entry_t) for zswap header when using zsmalloc
> as zpool driver because zsmalloc doesn't support eviction.
>
> Add zpool_evictable() to detect if zpool is potentially evictable,
> and use it in zswap to avoid waste memory for zswap header.
>
> Signed-off-by: Yu Zhao <yuz...@google.com>
> ---
>  include/linux/zpool.h |  2 ++
>  mm/zpool.c| 25 +++--
>  mm/zsmalloc.c |  7 ---
>  mm/zswap.c| 20 ++--
>  4 files changed, 35 insertions(+), 19 deletions(-)
>
> diff --git a/include/linux/zpool.h b/include/linux/zpool.h
> index 004ba807df96..7238865e75b0 100644
> --- a/include/linux/zpool.h
> +++ b/include/linux/zpool.h
> @@ -108,4 +108,6 @@ void zpool_register_driver(struct zpool_driver *driver);
>
>  int zpool_unregister_driver(struct zpool_driver *driver);
>
> +bool zpool_evictable(struct zpool *pool);
> +
>  #endif
> diff --git a/mm/zpool.c b/mm/zpool.c
> index fd3ff719c32c..e1e7aa6d1d06 100644
> --- a/mm/zpool.c
> +++ b/mm/zpool.c
> @@ -21,6 +21,7 @@ struct zpool {
> struct zpool_driver *driver;
> void *pool;
> const struct zpool_ops *ops;
> +   bool evictable;
>
> struct list_head list;
>  };
> @@ -142,7 +143,7 @@ EXPORT_SYMBOL(zpool_has_pool);
>   *
>   * This creates a new zpool of the specified type.  The gfp flags will be
>   * used when allocating memory, if the implementation supports it.  If the
> - * ops param is NULL, then the created zpool will not be shrinkable.
> + * ops param is NULL, then the created zpool will not be evictable.
>   *
>   * Implementations must guarantee this to be thread-safe.
>   *
> @@ -180,6 +181,7 @@ struct zpool *zpool_create_pool(const char *type, const 
> char *name, gfp_t gfp,
> zpool->driver = driver;
> zpool->pool = driver->create(name, gfp, ops, zpool);
> zpool->ops = ops;
> +   zpool->evictable = driver->shrink && ops && ops->evict;

Since the ops->evict comes from zswap (and is never omitted), if we do
restore zs_zpool_shrink() in the future to call the zsmalloc shrinker
(and not do eviction), we'll have to add a driver->evictable bool to
check here as well.

But for now this is good, the zpools with driver->shrink do eviction,
zsmalloc doesn't do eviction and won't have driver->shrink.

Acked-by: Dan Streetman <ddstr...@ieee.org>

Thanks!

>
> if (!zpool->pool) {
> pr_err("couldn't create %s pool\n", type);
> @@ -296,7 +298,8 @@ void zpool_free(struct zpool *zpool, unsigned long handle)
>  int zpool_shrink(struct zpool *zpool, unsigned int pages,
> unsigned int *reclaimed)
>  {
> -   return zpool->driver->shrink(zpool->pool, pages, reclaimed);
> +   return zpool->driver->shrink ?
> +  zpool->driver->shrink(zpool->pool, pages, reclaimed) : -EINVAL;
>  }
>
>  /**
> @@ -355,6 +358,24 @@ u64 zpool_get_total_size(struct zpool *zpool)
> return zpool->driver->total_size(zpool->pool);
>  }
>
> +/**
> + * zpool_evictable() - Test if zpool is potentially evictable
> + * @pool   The zpool to test
> + *
> + * Zpool is only potentially evictable when it's created with struct
> + * zpool_ops.evict and its driver implements struct zpool_driver.shrink.
> + *
> + * However, it doesn't necessarily mean driver will use zpool_ops.evict
> + * in its implementation of zpool_driver.shrink. It could do internal
> + * defragmentation instead.
> + *
> + * Returns: true if potentially evictable; false otherwise.
> + */
> +bool zpool_evictable(struct zpool *zpool)
> +{
> +   return zpool->evictable;
> +}
> +
>  MODULE_LICENSE("GPL");
>  MODULE_AUTHOR("Dan Streetman <ddstr...@ieee.org>");
>  MODULE_DESCRIPTION("Common API for compressed memory storage");
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 683c0651098c..9cc741bcdb32 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -407,12 +407,6 @@ static void zs_zpool_free(void *pool, unsigned long 
> handle)
> zs_free(pool, handle);
>  }
>
> -static int zs_zpool_shrink(void *pool, unsigned int pages,
> -   unsigned int *reclaimed)
> -{
> -   return -EINVAL;
> -}
> -
>  static void *zs_zpool_map(void *pool, unsigned long handle,
> enum zpool_mapmode mm)
>  {
> @@ -450,7 +444,6 @@ static struct zpool_driver zs_zpool_driver = {
> .destroy =  zs_zpool_destroy,
> .malloc =   z

Re: [PATCH v3] zswap: only save zswap header when necessary

2018-01-11 Thread Dan Streetman
On Wed, Jan 10, 2018 at 5:56 PM, Yu Zhao  wrote:
> We waste sizeof(swp_entry_t) for zswap header when using zsmalloc
> as zpool driver because zsmalloc doesn't support eviction.
>
> Add zpool_evictable() to detect if zpool is potentially evictable,
> and use it in zswap to avoid waste memory for zswap header.
>
> Signed-off-by: Yu Zhao 
> ---
>  include/linux/zpool.h |  2 ++
>  mm/zpool.c| 25 +++--
>  mm/zsmalloc.c |  7 ---
>  mm/zswap.c| 20 ++--
>  4 files changed, 35 insertions(+), 19 deletions(-)
>
> diff --git a/include/linux/zpool.h b/include/linux/zpool.h
> index 004ba807df96..7238865e75b0 100644
> --- a/include/linux/zpool.h
> +++ b/include/linux/zpool.h
> @@ -108,4 +108,6 @@ void zpool_register_driver(struct zpool_driver *driver);
>
>  int zpool_unregister_driver(struct zpool_driver *driver);
>
> +bool zpool_evictable(struct zpool *pool);
> +
>  #endif
> diff --git a/mm/zpool.c b/mm/zpool.c
> index fd3ff719c32c..e1e7aa6d1d06 100644
> --- a/mm/zpool.c
> +++ b/mm/zpool.c
> @@ -21,6 +21,7 @@ struct zpool {
> struct zpool_driver *driver;
> void *pool;
> const struct zpool_ops *ops;
> +   bool evictable;
>
> struct list_head list;
>  };
> @@ -142,7 +143,7 @@ EXPORT_SYMBOL(zpool_has_pool);
>   *
>   * This creates a new zpool of the specified type.  The gfp flags will be
>   * used when allocating memory, if the implementation supports it.  If the
> - * ops param is NULL, then the created zpool will not be shrinkable.
> + * ops param is NULL, then the created zpool will not be evictable.
>   *
>   * Implementations must guarantee this to be thread-safe.
>   *
> @@ -180,6 +181,7 @@ struct zpool *zpool_create_pool(const char *type, const 
> char *name, gfp_t gfp,
> zpool->driver = driver;
> zpool->pool = driver->create(name, gfp, ops, zpool);
> zpool->ops = ops;
> +   zpool->evictable = driver->shrink && ops && ops->evict;

Since the ops->evict comes from zswap (and is never omitted), if we do
restore zs_zpool_shrink() in the future to call the zsmalloc shrinker
(and not do eviction), we'll have to add a driver->evictable bool to
check here as well.

But for now this is good, the zpools with driver->shrink do eviction,
zsmalloc doesn't do eviction and won't have driver->shrink.

Acked-by: Dan Streetman 

Thanks!

>
> if (!zpool->pool) {
> pr_err("couldn't create %s pool\n", type);
> @@ -296,7 +298,8 @@ void zpool_free(struct zpool *zpool, unsigned long handle)
>  int zpool_shrink(struct zpool *zpool, unsigned int pages,
> unsigned int *reclaimed)
>  {
> -   return zpool->driver->shrink(zpool->pool, pages, reclaimed);
> +   return zpool->driver->shrink ?
> +  zpool->driver->shrink(zpool->pool, pages, reclaimed) : -EINVAL;
>  }
>
>  /**
> @@ -355,6 +358,24 @@ u64 zpool_get_total_size(struct zpool *zpool)
> return zpool->driver->total_size(zpool->pool);
>  }
>
> +/**
> + * zpool_evictable() - Test if zpool is potentially evictable
> + * @pool   The zpool to test
> + *
> + * Zpool is only potentially evictable when it's created with struct
> + * zpool_ops.evict and its driver implements struct zpool_driver.shrink.
> + *
> + * However, it doesn't necessarily mean driver will use zpool_ops.evict
> + * in its implementation of zpool_driver.shrink. It could do internal
> + * defragmentation instead.
> + *
> + * Returns: true if potentially evictable; false otherwise.
> + */
> +bool zpool_evictable(struct zpool *zpool)
> +{
> +   return zpool->evictable;
> +}
> +
>  MODULE_LICENSE("GPL");
>  MODULE_AUTHOR("Dan Streetman ");
>  MODULE_DESCRIPTION("Common API for compressed memory storage");
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 683c0651098c..9cc741bcdb32 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -407,12 +407,6 @@ static void zs_zpool_free(void *pool, unsigned long 
> handle)
> zs_free(pool, handle);
>  }
>
> -static int zs_zpool_shrink(void *pool, unsigned int pages,
> -   unsigned int *reclaimed)
> -{
> -   return -EINVAL;
> -}
> -
>  static void *zs_zpool_map(void *pool, unsigned long handle,
> enum zpool_mapmode mm)
>  {
> @@ -450,7 +444,6 @@ static struct zpool_driver zs_zpool_driver = {
> .destroy =  zs_zpool_destroy,
> .malloc =   zs_zpool_malloc,
> .free = zs_zpool_free,
> -   .shrink =   zs_zpool_shri

Re: [PATCH] zswap: only save zswap header if zpool is shrinkable

2018-01-10 Thread Dan Streetman
On Tue, Jan 9, 2018 at 5:47 PM, Yu Zhao <yuz...@google.com> wrote:
> On Tue, Jan 09, 2018 at 01:25:18PM -0500, Dan Streetman wrote:
>> On Mon, Jan 8, 2018 at 5:51 PM, Yu Zhao <yuz...@google.com> wrote:
>> > We waste sizeof(swp_entry_t) for zswap header when using zsmalloc
>> > as zpool driver because zsmalloc doesn't support eviction.
>> >
>> > Add zpool_shrinkable() to detect if zpool is shrinkable, and use
>> > it in zswap to avoid waste memory for zswap header.
>> >
>> > Signed-off-by: Yu Zhao <yuz...@google.com>
>> > ---
>> >  include/linux/zpool.h |  2 ++
>> >  mm/zpool.c| 17 -
>> >  mm/zsmalloc.c |  7 ---
>> >  mm/zswap.c| 20 ++--
>> >  4 files changed, 28 insertions(+), 18 deletions(-)
>> >
>> > diff --git a/include/linux/zpool.h b/include/linux/zpool.h
>> > index 004ba807df96..3f0ac2ab74aa 100644
>> > --- a/include/linux/zpool.h
>> > +++ b/include/linux/zpool.h
>> > @@ -108,4 +108,6 @@ void zpool_register_driver(struct zpool_driver 
>> > *driver);
>> >
>> >  int zpool_unregister_driver(struct zpool_driver *driver);
>> >
>> > +bool zpool_shrinkable(struct zpool *pool);
>> > +
>> >  #endif
>> > diff --git a/mm/zpool.c b/mm/zpool.c
>> > index fd3ff719c32c..839d4234c540 100644
>> > --- a/mm/zpool.c
>> > +++ b/mm/zpool.c
>> > @@ -296,7 +296,8 @@ void zpool_free(struct zpool *zpool, unsigned long 
>> > handle)
>> >  int zpool_shrink(struct zpool *zpool, unsigned int pages,
>> > unsigned int *reclaimed)
>> >  {
>> > -   return zpool->driver->shrink(zpool->pool, pages, reclaimed);
>> > +   return zpool_shrinkable(zpool) ?
>> > +  zpool->driver->shrink(zpool->pool, pages, reclaimed) : 
>> > -EINVAL;
>> >  }
>> >
>> >  /**
>> > @@ -355,6 +356,20 @@ u64 zpool_get_total_size(struct zpool *zpool)
>> > return zpool->driver->total_size(zpool->pool);
>> >  }
>> >
>> > +/**
>> > + * zpool_shrinkable() - Test if zpool is shrinkable
>> > + * @pool   The zpool to test
>> > + *
>> > + * Zpool is only shrinkable when it's created with struct
>> > + * zpool_ops.evict and its driver implements struct zpool_driver.shrink.
>> > + *
>> > + * Returns: true if shrinkable; false otherwise.
>> > + */
>> > +bool zpool_shrinkable(struct zpool *zpool)
>> > +{
>> > +   return zpool->ops && zpool->ops->evict && zpool->driver->shrink;
>>
>> as these things won't ever change for the life of the zpool, it would
>> probably be better to just check them at zpool creation time and set a
>> single new zpool param, like 'zpool->shrinkable'. since this function
>> will be called for every page that's swapped in or out, that may save
>> a bit of time.
>
> Ack.
>
>> also re: calling it 'shrinkable' or 'evictable', the real thing zswap
>> is interested in is if it needs to include the header info that
>> zswap_writeback_entry (i.e. ops->evict) later needs, so yeah it does
>> make more sense to call it zpool_evictable() and zpool->evictable.
>> However, I think the function should still be zpool_shrink() and
>> zpool->driver->shrink(), because it should be possible for
>> zs_pool_shrink() to call the normal zsmalloc shrinker, instead of
>> doing the zswap-style eviction, even if it doesn't do that currently.
>
> I agree we keep zpool_shrink(). It could either shrink pool if driver
> supports slab shrinker by providing zpool->driver->shrink or evict
> pages from pool if driver supports zpool->driver->evict (which in turn
> calls ops->evict provided by zswap) or both.
>
> We can't use a single zpool->driver->callback to achieve both because
> there will be no way for zswap to know if driver uses ops->evict thus
> no way to determine if zswap_header is needed.
>
> So for now, I think it'd be better if we deleted zpool->driver->shrink
> from zsmalloc and renamed it to zpool->driver->evict in zbud. Later
> if we decide zpool_shrink should also call zsmalloc slab shrinker, we
> add a new callback.

Well, I think shrink vs evict is an implementation detail, isn't it?
That is, from zswap's perspective, there should be:

zpool_evictable()
if true, zswap needs to include the header on each compressed page,
because the zpool may callback zpool->ops->evict() which calls
zswap_writeback_entry() which expects the entry to start with a zswap
header.
if false, zswap doesn't need to include the header, because the zpool
will never, ever call zpool->ops->evict

zpool_shrink()
this will try to shrink the zpool, using whatever
zpool-implementation-specific shrinking method.  If zpool_evictable()
is true for this zpool, then zpool_shrink() *might* callback to
zpool->ops->evict(), although it doesn't have to if it can shrink
without evictions.  If zpool_evictable() is false, then zpool_shrink()
will never callback to zpool->ops->evict().

There is really no need for zswap to call different functions based on
whether the pool is evictable or not...is there?


Re: [PATCH] zswap: only save zswap header if zpool is shrinkable

2018-01-10 Thread Dan Streetman
On Tue, Jan 9, 2018 at 5:47 PM, Yu Zhao  wrote:
> On Tue, Jan 09, 2018 at 01:25:18PM -0500, Dan Streetman wrote:
>> On Mon, Jan 8, 2018 at 5:51 PM, Yu Zhao  wrote:
>> > We waste sizeof(swp_entry_t) for zswap header when using zsmalloc
>> > as zpool driver because zsmalloc doesn't support eviction.
>> >
>> > Add zpool_shrinkable() to detect if zpool is shrinkable, and use
>> > it in zswap to avoid waste memory for zswap header.
>> >
>> > Signed-off-by: Yu Zhao 
>> > ---
>> >  include/linux/zpool.h |  2 ++
>> >  mm/zpool.c| 17 -
>> >  mm/zsmalloc.c |  7 ---
>> >  mm/zswap.c| 20 ++--
>> >  4 files changed, 28 insertions(+), 18 deletions(-)
>> >
>> > diff --git a/include/linux/zpool.h b/include/linux/zpool.h
>> > index 004ba807df96..3f0ac2ab74aa 100644
>> > --- a/include/linux/zpool.h
>> > +++ b/include/linux/zpool.h
>> > @@ -108,4 +108,6 @@ void zpool_register_driver(struct zpool_driver 
>> > *driver);
>> >
>> >  int zpool_unregister_driver(struct zpool_driver *driver);
>> >
>> > +bool zpool_shrinkable(struct zpool *pool);
>> > +
>> >  #endif
>> > diff --git a/mm/zpool.c b/mm/zpool.c
>> > index fd3ff719c32c..839d4234c540 100644
>> > --- a/mm/zpool.c
>> > +++ b/mm/zpool.c
>> > @@ -296,7 +296,8 @@ void zpool_free(struct zpool *zpool, unsigned long 
>> > handle)
>> >  int zpool_shrink(struct zpool *zpool, unsigned int pages,
>> > unsigned int *reclaimed)
>> >  {
>> > -   return zpool->driver->shrink(zpool->pool, pages, reclaimed);
>> > +   return zpool_shrinkable(zpool) ?
>> > +  zpool->driver->shrink(zpool->pool, pages, reclaimed) : 
>> > -EINVAL;
>> >  }
>> >
>> >  /**
>> > @@ -355,6 +356,20 @@ u64 zpool_get_total_size(struct zpool *zpool)
>> > return zpool->driver->total_size(zpool->pool);
>> >  }
>> >
>> > +/**
>> > + * zpool_shrinkable() - Test if zpool is shrinkable
>> > + * @pool   The zpool to test
>> > + *
>> > + * Zpool is only shrinkable when it's created with struct
>> > + * zpool_ops.evict and its driver implements struct zpool_driver.shrink.
>> > + *
>> > + * Returns: true if shrinkable; false otherwise.
>> > + */
>> > +bool zpool_shrinkable(struct zpool *zpool)
>> > +{
>> > +   return zpool->ops && zpool->ops->evict && zpool->driver->shrink;
>>
>> as these things won't ever change for the life of the zpool, it would
>> probably be better to just check them at zpool creation time and set a
>> single new zpool param, like 'zpool->shrinkable'. since this function
>> will be called for every page that's swapped in or out, that may save
>> a bit of time.
>
> Ack.
>
>> also re: calling it 'shrinkable' or 'evictable', the real thing zswap
>> is interested in is if it needs to include the header info that
>> zswap_writeback_entry (i.e. ops->evict) later needs, so yeah it does
>> make more sense to call it zpool_evictable() and zpool->evictable.
>> However, I think the function should still be zpool_shrink() and
>> zpool->driver->shrink(), because it should be possible for
>> zs_pool_shrink() to call the normal zsmalloc shrinker, instead of
>> doing the zswap-style eviction, even if it doesn't do that currently.
>
> I agree we keep zpool_shrink(). It could either shrink pool if driver
> supports slab shrinker by providing zpool->driver->shrink or evict
> pages from pool if driver supports zpool->driver->evict (which in turn
> calls ops->evict provided by zswap) or both.
>
> We can't use a single zpool->driver->callback to achieve both because
> there will be no way for zswap to know if driver uses ops->evict thus
> no way to determine if zswap_header is needed.
>
> So for now, I think it'd be better if we deleted zpool->driver->shrink
> from zsmalloc and renamed it to zpool->driver->evict in zbud. Later
> if we decide zpool_shrink should also call zsmalloc slab shrinker, we
> add a new callback.

Well, I think shrink vs evict is an implementation detail, isn't it?
That is, from zswap's perspective, there should be:

zpool_evictable()
if true, zswap needs to include the header on each compressed page,
because the zpool may callback zpool->ops->evict() which calls
zswap_writeback_entry() which expects the entry to start with a zswap
header.
if false, zswap doesn't need to include the header, because the zpool
will never, ever call zpool->ops->evict

zpool_shrink()
this will try to shrink the zpool, using whatever
zpool-implementation-specific shrinking method.  If zpool_evictable()
is true for this zpool, then zpool_shrink() *might* callback to
zpool->ops->evict(), although it doesn't have to if it can shrink
without evictions.  If zpool_evictable() is false, then zpool_shrink()
will never callback to zpool->ops->evict().

There is really no need for zswap to call different functions based on
whether the pool is evictable or not...is there?


Re: [PATCH] zswap: only save zswap header if zpool is shrinkable

2018-01-09 Thread Dan Streetman
On Mon, Jan 8, 2018 at 5:51 PM, Yu Zhao <yuz...@google.com> wrote:
> We waste sizeof(swp_entry_t) for zswap header when using zsmalloc
> as zpool driver because zsmalloc doesn't support eviction.
>
> Add zpool_shrinkable() to detect if zpool is shrinkable, and use
> it in zswap to avoid waste memory for zswap header.
>
> Signed-off-by: Yu Zhao <yuz...@google.com>
> ---
>  include/linux/zpool.h |  2 ++
>  mm/zpool.c| 17 -
>  mm/zsmalloc.c |  7 ---
>  mm/zswap.c| 20 ++--
>  4 files changed, 28 insertions(+), 18 deletions(-)
>
> diff --git a/include/linux/zpool.h b/include/linux/zpool.h
> index 004ba807df96..3f0ac2ab74aa 100644
> --- a/include/linux/zpool.h
> +++ b/include/linux/zpool.h
> @@ -108,4 +108,6 @@ void zpool_register_driver(struct zpool_driver *driver);
>
>  int zpool_unregister_driver(struct zpool_driver *driver);
>
> +bool zpool_shrinkable(struct zpool *pool);
> +
>  #endif
> diff --git a/mm/zpool.c b/mm/zpool.c
> index fd3ff719c32c..839d4234c540 100644
> --- a/mm/zpool.c
> +++ b/mm/zpool.c
> @@ -296,7 +296,8 @@ void zpool_free(struct zpool *zpool, unsigned long handle)
>  int zpool_shrink(struct zpool *zpool, unsigned int pages,
> unsigned int *reclaimed)
>  {
> -   return zpool->driver->shrink(zpool->pool, pages, reclaimed);
> +   return zpool_shrinkable(zpool) ?
> +  zpool->driver->shrink(zpool->pool, pages, reclaimed) : -EINVAL;
>  }
>
>  /**
> @@ -355,6 +356,20 @@ u64 zpool_get_total_size(struct zpool *zpool)
> return zpool->driver->total_size(zpool->pool);
>  }
>
> +/**
> + * zpool_shrinkable() - Test if zpool is shrinkable
> + * @pool   The zpool to test
> + *
> + * Zpool is only shrinkable when it's created with struct
> + * zpool_ops.evict and its driver implements struct zpool_driver.shrink.
> + *
> + * Returns: true if shrinkable; false otherwise.
> + */
> +bool zpool_shrinkable(struct zpool *zpool)
> +{
> +   return zpool->ops && zpool->ops->evict && zpool->driver->shrink;

as these things won't ever change for the life of the zpool, it would
probably be better to just check them at zpool creation time and set a
single new zpool param, like 'zpool->shrinkable'. since this function
will be called for every page that's swapped in or out, that may save
a bit of time.

also re: calling it 'shrinkable' or 'evictable', the real thing zswap
is interested in is if it needs to include the header info that
zswap_writeback_entry (i.e. ops->evict) later needs, so yeah it does
make more sense to call it zpool_evictable() and zpool->evictable.
However, I think the function should still be zpool_shrink() and
zpool->driver->shrink(), because it should be possible for
zs_pool_shrink() to call the normal zsmalloc shrinker, instead of
doing the zswap-style eviction, even if it doesn't do that currently.

> +}
> +
>  MODULE_LICENSE("GPL");
>  MODULE_AUTHOR("Dan Streetman <ddstr...@ieee.org>");
>  MODULE_DESCRIPTION("Common API for compressed memory storage");
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 683c0651098c..9cc741bcdb32 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -407,12 +407,6 @@ static void zs_zpool_free(void *pool, unsigned long 
> handle)
> zs_free(pool, handle);
>  }
>
> -static int zs_zpool_shrink(void *pool, unsigned int pages,
> -   unsigned int *reclaimed)
> -{
> -   return -EINVAL;
> -}
> -
>  static void *zs_zpool_map(void *pool, unsigned long handle,
> enum zpool_mapmode mm)
>  {
> @@ -450,7 +444,6 @@ static struct zpool_driver zs_zpool_driver = {
> .destroy =  zs_zpool_destroy,
> .malloc =   zs_zpool_malloc,
> .free = zs_zpool_free,
> -   .shrink =   zs_zpool_shrink,
> .map =  zs_zpool_map,
> .unmap =zs_zpool_unmap,
> .total_size =   zs_zpool_total_size,
> diff --git a/mm/zswap.c b/mm/zswap.c
> index d39581a076c3..15d2ea29a6fa 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -964,11 +964,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t 
> offset,
> struct zswap_entry *entry, *dupentry;
> struct crypto_comp *tfm;
> int ret;
> -   unsigned int dlen = PAGE_SIZE, len;
> +   unsigned int hlen, dlen = PAGE_SIZE;
> unsigned long handle;
> char *buf;
> u8 *src, *dst;
> -   struct zswap_header *zhdr;
> +   struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
>
>   

Re: [PATCH] zswap: only save zswap header if zpool is shrinkable

2018-01-09 Thread Dan Streetman
On Mon, Jan 8, 2018 at 5:51 PM, Yu Zhao  wrote:
> We waste sizeof(swp_entry_t) for zswap header when using zsmalloc
> as zpool driver because zsmalloc doesn't support eviction.
>
> Add zpool_shrinkable() to detect if zpool is shrinkable, and use
> it in zswap to avoid waste memory for zswap header.
>
> Signed-off-by: Yu Zhao 
> ---
>  include/linux/zpool.h |  2 ++
>  mm/zpool.c| 17 -
>  mm/zsmalloc.c |  7 ---
>  mm/zswap.c| 20 ++--
>  4 files changed, 28 insertions(+), 18 deletions(-)
>
> diff --git a/include/linux/zpool.h b/include/linux/zpool.h
> index 004ba807df96..3f0ac2ab74aa 100644
> --- a/include/linux/zpool.h
> +++ b/include/linux/zpool.h
> @@ -108,4 +108,6 @@ void zpool_register_driver(struct zpool_driver *driver);
>
>  int zpool_unregister_driver(struct zpool_driver *driver);
>
> +bool zpool_shrinkable(struct zpool *pool);
> +
>  #endif
> diff --git a/mm/zpool.c b/mm/zpool.c
> index fd3ff719c32c..839d4234c540 100644
> --- a/mm/zpool.c
> +++ b/mm/zpool.c
> @@ -296,7 +296,8 @@ void zpool_free(struct zpool *zpool, unsigned long handle)
>  int zpool_shrink(struct zpool *zpool, unsigned int pages,
> unsigned int *reclaimed)
>  {
> -   return zpool->driver->shrink(zpool->pool, pages, reclaimed);
> +   return zpool_shrinkable(zpool) ?
> +  zpool->driver->shrink(zpool->pool, pages, reclaimed) : -EINVAL;
>  }
>
>  /**
> @@ -355,6 +356,20 @@ u64 zpool_get_total_size(struct zpool *zpool)
> return zpool->driver->total_size(zpool->pool);
>  }
>
> +/**
> + * zpool_shrinkable() - Test if zpool is shrinkable
> + * @pool   The zpool to test
> + *
> + * Zpool is only shrinkable when it's created with struct
> + * zpool_ops.evict and its driver implements struct zpool_driver.shrink.
> + *
> + * Returns: true if shrinkable; false otherwise.
> + */
> +bool zpool_shrinkable(struct zpool *zpool)
> +{
> +   return zpool->ops && zpool->ops->evict && zpool->driver->shrink;

as these things won't ever change for the life of the zpool, it would
probably be better to just check them at zpool creation time and set a
single new zpool param, like 'zpool->shrinkable'. since this function
will be called for every page that's swapped in or out, that may save
a bit of time.

also re: calling it 'shrinkable' or 'evictable', the real thing zswap
is interested in is if it needs to include the header info that
zswap_writeback_entry (i.e. ops->evict) later needs, so yeah it does
make more sense to call it zpool_evictable() and zpool->evictable.
However, I think the function should still be zpool_shrink() and
zpool->driver->shrink(), because it should be possible for
zs_pool_shrink() to call the normal zsmalloc shrinker, instead of
doing the zswap-style eviction, even if it doesn't do that currently.

> +}
> +
>  MODULE_LICENSE("GPL");
>  MODULE_AUTHOR("Dan Streetman ");
>  MODULE_DESCRIPTION("Common API for compressed memory storage");
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 683c0651098c..9cc741bcdb32 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -407,12 +407,6 @@ static void zs_zpool_free(void *pool, unsigned long 
> handle)
> zs_free(pool, handle);
>  }
>
> -static int zs_zpool_shrink(void *pool, unsigned int pages,
> -   unsigned int *reclaimed)
> -{
> -   return -EINVAL;
> -}
> -
>  static void *zs_zpool_map(void *pool, unsigned long handle,
> enum zpool_mapmode mm)
>  {
> @@ -450,7 +444,6 @@ static struct zpool_driver zs_zpool_driver = {
> .destroy =  zs_zpool_destroy,
> .malloc =   zs_zpool_malloc,
> .free = zs_zpool_free,
> -   .shrink =   zs_zpool_shrink,
> .map =  zs_zpool_map,
> .unmap =zs_zpool_unmap,
> .total_size =   zs_zpool_total_size,
> diff --git a/mm/zswap.c b/mm/zswap.c
> index d39581a076c3..15d2ea29a6fa 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -964,11 +964,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t 
> offset,
> struct zswap_entry *entry, *dupentry;
> struct crypto_comp *tfm;
> int ret;
> -   unsigned int dlen = PAGE_SIZE, len;
> +   unsigned int hlen, dlen = PAGE_SIZE;
> unsigned long handle;
> char *buf;
> u8 *src, *dst;
> -   struct zswap_header *zhdr;
> +   struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
>
> if (!zswap_enabled || !tree) {
> ret = -ENODEV;
> @@ 

Re: [PATCH 2/2] mm/zswap: move `zswap_has_pool` to front of `if ()`

2018-01-08 Thread Dan Streetman
On Tue, Jan 2, 2018 at 5:03 AM, Joey Pabalinas  wrote:
> `zswap_has_pool` is a simple boolean, so it should be tested first
> to avoid unnecessarily calling `strcmp()`. Test `zswap_has_pool`
> first to take advantage of the short-circuiting behavior of && in
> `__zswap_param_set()`.
>
> Signed-off-by: Joey Pabalinas 
>
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index a4f2dfaf9131694265..dbf35139471f692798 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -672,7 +672,7 @@ static int __zswap_param_set(const char *val, const 
> struct kernel_param *kp,
> }
>
> /* no change required */
> -   if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
> +   if (zswap_has_pool && !strcmp(s, *(char **)kp->arg))

Nak.

This function is only called when actually changing one of the zswap
module params, which is extremely rare (typically once per boot, per
parameter, if at all).  Changing the ordering will have virtually no
noticeable impact on anything.

Additionally, !zswap_has_pool is strictly an initialization-failed
temporary situation (until the compressor/zpool params are set to
working implementation values), and in all "normal" conditions it will
be true, meaning this reordering will actually
*add* time - the normal path is for this check to *not* be true, so
keeping the strcmp first bypasses bothering with checking
zswap_has_pool.

> return 0;
>
> /* if this is load-time (pre-init) param setting,
> --
> 2.15.1
>


Re: [PATCH 2/2] mm/zswap: move `zswap_has_pool` to front of `if ()`

2018-01-08 Thread Dan Streetman
On Tue, Jan 2, 2018 at 5:03 AM, Joey Pabalinas  wrote:
> `zswap_has_pool` is a simple boolean, so it should be tested first
> to avoid unnecessarily calling `strcmp()`. Test `zswap_has_pool`
> first to take advantage of the short-circuiting behavior of && in
> `__zswap_param_set()`.
>
> Signed-off-by: Joey Pabalinas 
>
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index a4f2dfaf9131694265..dbf35139471f692798 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -672,7 +672,7 @@ static int __zswap_param_set(const char *val, const 
> struct kernel_param *kp,
> }
>
> /* no change required */
> -   if (!strcmp(s, *(char **)kp->arg) && zswap_has_pool)
> +   if (zswap_has_pool && !strcmp(s, *(char **)kp->arg))

Nak.

This function is only called when actually changing one of the zswap
module params, which is extremely rare (typically once per boot, per
parameter, if at all).  Changing the ordering will have virtually no
noticeable impact on anything.

Additionally, !zswap_has_pool is strictly an initialization-failed
temporary situation (until the compressor/zpool params are set to
working implementation values), and in all "normal" conditions it will
be true, meaning this reordering will actually
*add* time - the normal path is for this check to *not* be true, so
keeping the strcmp first bypasses bothering with checking
zswap_has_pool.

> return 0;
>
> /* if this is load-time (pre-init) param setting,
> --
> 2.15.1
>


Re: [PATCH 1/2] mm/zswap: make type and compressor const

2018-01-08 Thread Dan Streetman
On Tue, Jan 2, 2018 at 5:03 AM, Joey Pabalinas  wrote:
> The characters pointed to by `zswap_compressor`, `type`, and `compressor`
> aren't ever modified. Add const to the static variable and both parameters in
> `zswap_pool_find_get()`, `zswap_pool_create()`, and `__zswap_param_set()`
>
> Signed-off-by: Joey Pabalinas 

Nak.

Those variables are not const; they are updated in
__zswap_param_set().  They aren't modified in pool_find_get() or
pool_create(), but they certainly aren't globally const.

>
>  1 file changed, 6 insertions(+), 4 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index d39581a076c3aed1e9..a4f2dfaf9131694265 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -90,7 +90,7 @@ module_param_cb(enabled, _enabled_param_ops, 
> _enabled, 0644);
>
>  /* Crypto compressor to use */
>  #define ZSWAP_COMPRESSOR_DEFAULT "lzo"
> -static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
> +static const char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
>  static int zswap_compressor_param_set(const char *,
>   const struct kernel_param *);
>  static struct kernel_param_ops zswap_compressor_param_ops = {
> @@ -475,7 +475,8 @@ static struct zswap_pool *zswap_pool_last_get(void)
>  }
>
>  /* type and compressor must be null-terminated */
> -static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
> +static struct zswap_pool *zswap_pool_find_get(const char *type,
> + const char *compressor)
>  {
> struct zswap_pool *pool;
>
> @@ -495,7 +496,8 @@ static struct zswap_pool *zswap_pool_find_get(char *type, 
> char *compressor)
> return NULL;
>  }
>
> -static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> +static struct zswap_pool *zswap_pool_create(const char *type,
> +   const char *compressor)
>  {
> struct zswap_pool *pool;
> char name[38]; /* 'zswap' + 32 char (max) num + \0 */
> @@ -658,7 +660,7 @@ static void zswap_pool_put(struct zswap_pool *pool)
>
>  /* val must be a null-terminated string */
>  static int __zswap_param_set(const char *val, const struct kernel_param *kp,
> -char *type, char *compressor)
> +const char *type, const char *compressor)
>  {
> struct zswap_pool *pool, *put_pool = NULL;
> char *s = strstrip((char *)val);
> --
> 2.15.1
>


Re: [PATCH 1/2] mm/zswap: make type and compressor const

2018-01-08 Thread Dan Streetman
On Tue, Jan 2, 2018 at 5:03 AM, Joey Pabalinas  wrote:
> The characters pointed to by `zswap_compressor`, `type`, and `compressor`
> aren't ever modified. Add const to the static variable and both parameters in
> `zswap_pool_find_get()`, `zswap_pool_create()`, and `__zswap_param_set()`
>
> Signed-off-by: Joey Pabalinas 

Nak.

Those variables are not const; they are updated in
__zswap_param_set().  They aren't modified in pool_find_get() or
pool_create(), but they certainly aren't globally const.

>
>  1 file changed, 6 insertions(+), 4 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index d39581a076c3aed1e9..a4f2dfaf9131694265 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -90,7 +90,7 @@ module_param_cb(enabled, _enabled_param_ops, 
> _enabled, 0644);
>
>  /* Crypto compressor to use */
>  #define ZSWAP_COMPRESSOR_DEFAULT "lzo"
> -static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
> +static const char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
>  static int zswap_compressor_param_set(const char *,
>   const struct kernel_param *);
>  static struct kernel_param_ops zswap_compressor_param_ops = {
> @@ -475,7 +475,8 @@ static struct zswap_pool *zswap_pool_last_get(void)
>  }
>
>  /* type and compressor must be null-terminated */
> -static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
> +static struct zswap_pool *zswap_pool_find_get(const char *type,
> + const char *compressor)
>  {
> struct zswap_pool *pool;
>
> @@ -495,7 +496,8 @@ static struct zswap_pool *zswap_pool_find_get(char *type, 
> char *compressor)
> return NULL;
>  }
>
> -static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
> +static struct zswap_pool *zswap_pool_create(const char *type,
> +   const char *compressor)
>  {
> struct zswap_pool *pool;
> char name[38]; /* 'zswap' + 32 char (max) num + \0 */
> @@ -658,7 +660,7 @@ static void zswap_pool_put(struct zswap_pool *pool)
>
>  /* val must be a null-terminated string */
>  static int __zswap_param_set(const char *val, const struct kernel_param *kp,
> -char *type, char *compressor)
> +const char *type, const char *compressor)
>  {
> struct zswap_pool *pool, *put_pool = NULL;
> char *s = strstrip((char *)val);
> --
> 2.15.1
>


Re: [PATCH v2] zswap: Update with same-value filled page feature

2017-12-06 Thread Dan Streetman
On Wed, Dec 6, 2017 at 6:48 AM, Srividya Desireddy
<srividya...@samsung.com> wrote:
> From: Srividya Desireddy <srividya...@samsung.com>
> Date: Wed, 6 Dec 2017 16:29:50 +0530
> Subject: [PATCH v2] zswap: Update with same-value filled page feature
>
> Changes since v1:
> Updated to clarify about zswap.same_filled_pages_enabled parameter.
>
> Updated zswap document with details on same-value filled
> pages identification feature.
> The usage of zswap.same_filled_pages_enabled module parameter
> is explained.
>
> Signed-off-by: Srividya Desireddy <srividya...@samsung.com>

Acked-by: Dan Streetman <ddstr...@ieee.org>

> ---
>  Documentation/vm/zswap.txt | 22 +-
>  1 file changed, 21 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/vm/zswap.txt b/Documentation/vm/zswap.txt
> index 89fff7d..0b3a114 100644
> --- a/Documentation/vm/zswap.txt
> +++ b/Documentation/vm/zswap.txt
> @@ -98,5 +98,25 @@ request is made for a page in an old zpool, it is 
> uncompressed using its
>  original compressor.  Once all pages are removed from an old zpool, the zpool
>  and its compressor are freed.
>
> +Some of the pages in zswap are same-value filled pages (i.e. contents of the
> +page have same value or repetitive pattern). These pages include zero-filled
> +pages and they are handled differently. During store operation, a page is
> +checked if it is a same-value filled page before compressing it. If true, the
> +compressed length of the page is set to zero and the pattern or same-filled
> +value is stored.
> +
> +Same-value filled pages identification feature is enabled by default and can 
> be
> +disabled at boot time by setting the "same_filled_pages_enabled" attribute 
> to 0,
> +e.g. zswap.same_filled_pages_enabled=0. It can also be enabled and disabled 
> at
> +runtime using the sysfs "same_filled_pages_enabled" attribute, e.g.
> +
> +echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled
> +
> +When zswap same-filled page identification is disabled at runtime, it will 
> stop
> +checking for the same-value filled pages during store operation. However, the
> +existing pages which are marked as same-value filled pages remain stored
> +unchanged in zswap until they are either loaded or invalidated.
> +
>  A debugfs interface is provided for various statistic about pool size, number
> -of pages stored, and various counters for the reasons pages are rejected.
> +of pages stored, same-value filled pages and various counters for the reasons
> +pages are rejected.
> --
> 2.7.4
>


Re: [PATCH v2] zswap: Update with same-value filled page feature

2017-12-06 Thread Dan Streetman
On Wed, Dec 6, 2017 at 6:48 AM, Srividya Desireddy
 wrote:
> From: Srividya Desireddy 
> Date: Wed, 6 Dec 2017 16:29:50 +0530
> Subject: [PATCH v2] zswap: Update with same-value filled page feature
>
> Changes since v1:
> Updated to clarify about zswap.same_filled_pages_enabled parameter.
>
> Updated zswap document with details on same-value filled
> pages identification feature.
> The usage of zswap.same_filled_pages_enabled module parameter
> is explained.
>
> Signed-off-by: Srividya Desireddy 

Acked-by: Dan Streetman 

> ---
>  Documentation/vm/zswap.txt | 22 +-
>  1 file changed, 21 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/vm/zswap.txt b/Documentation/vm/zswap.txt
> index 89fff7d..0b3a114 100644
> --- a/Documentation/vm/zswap.txt
> +++ b/Documentation/vm/zswap.txt
> @@ -98,5 +98,25 @@ request is made for a page in an old zpool, it is 
> uncompressed using its
>  original compressor.  Once all pages are removed from an old zpool, the zpool
>  and its compressor are freed.
>
> +Some of the pages in zswap are same-value filled pages (i.e. contents of the
> +page have same value or repetitive pattern). These pages include zero-filled
> +pages and they are handled differently. During store operation, a page is
> +checked if it is a same-value filled page before compressing it. If true, the
> +compressed length of the page is set to zero and the pattern or same-filled
> +value is stored.
> +
> +Same-value filled pages identification feature is enabled by default and can 
> be
> +disabled at boot time by setting the "same_filled_pages_enabled" attribute 
> to 0,
> +e.g. zswap.same_filled_pages_enabled=0. It can also be enabled and disabled 
> at
> +runtime using the sysfs "same_filled_pages_enabled" attribute, e.g.
> +
> +echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled
> +
> +When zswap same-filled page identification is disabled at runtime, it will 
> stop
> +checking for the same-value filled pages during store operation. However, the
> +existing pages which are marked as same-value filled pages remain stored
> +unchanged in zswap until they are either loaded or invalidated.
> +
>  A debugfs interface is provided for various statistic about pool size, number
> -of pages stored, and various counters for the reasons pages are rejected.
> +of pages stored, same-value filled pages and various counters for the reasons
> +pages are rejected.
> --
> 2.7.4
>


Re: [PATCH] zswap: Update with same-value filled page feature

2017-11-29 Thread Dan Streetman
On Wed, Nov 29, 2017 at 10:34 AM, Srividya Desireddy
 wrote:
> From: Srividya Desireddy 
> Date: Wed, 29 Nov 2017 20:23:15 +0530
> Subject: [PATCH] zswap: Update with same-value filled page feature
>
> Updated zswap document with details on same-value filled
> pages identification feature.
> The usage of zswap.same_filled_pages_enabled module parameter
> is explained.
>
> Signed-off-by: Srividya Desireddy 
> ---
>  Documentation/vm/zswap.txt | 22 +-
>  1 file changed, 21 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/vm/zswap.txt b/Documentation/vm/zswap.txt
> index 89fff7d..cc015b5 100644
> --- a/Documentation/vm/zswap.txt
> +++ b/Documentation/vm/zswap.txt
> @@ -98,5 +98,25 @@ request is made for a page in an old zpool, it is 
> uncompressed using its
>  original compressor.  Once all pages are removed from an old zpool, the zpool
>  and its compressor are freed.
>
> +Some of the pages in zswap are same-value filled pages (i.e. contents of the
> +page have same value or repetitive pattern). These pages include zero-filled
> +pages and they are handled differently. During store operation, a page is
> +checked if it is a same-value filled page before compressing it. If true, the
> +compressed length of the page is set to zero and the pattern or same-filled
> +value is stored.
> +
> +Same-value filled pages identification feature is enabled by default and can 
> be
> +disabled at boot time by setting the "same_filled_pages_enabled" attribute 
> to 0,
> +e.g. zswap.same_filled_pages_enabled=0. It can also be enabled and disabled 
> at
> +runtime using the sysfs "same_filled_pages_enabled" attribute, e.g.
> +
> +echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled
> +
> +When zswap same-filled page identification is disabled at runtime, it will 
> stop
> +checking for the same-value filled pages during store operation. However, the
> +existing pages which are marked as same-value filled pages will be loaded or
> +invalidated.

On first read I thought you were saying existing pages were
immediately loaded or invalidated, which of course is not the case.
Can you update the sentence to clarify existing pages are not modified
by disabling the param, like:

"However, the existing pages which are marked as same-value filled
pages remain stored unchanged until they are either loaded or
invalidated."

except for that the doc update looks good.

> +
>  A debugfs interface is provided for various statistic about pool size, number
> -of pages stored, and various counters for the reasons pages are rejected.
> +of pages stored, same-value filled pages and various counters for the reasons
> +pages are rejected.
> --
> 2.7.4
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: em...@kvack.org


Re: [PATCH] zswap: Update with same-value filled page feature

2017-11-29 Thread Dan Streetman
On Wed, Nov 29, 2017 at 10:34 AM, Srividya Desireddy
 wrote:
> From: Srividya Desireddy 
> Date: Wed, 29 Nov 2017 20:23:15 +0530
> Subject: [PATCH] zswap: Update with same-value filled page feature
>
> Updated zswap document with details on same-value filled
> pages identification feature.
> The usage of zswap.same_filled_pages_enabled module parameter
> is explained.
>
> Signed-off-by: Srividya Desireddy 
> ---
>  Documentation/vm/zswap.txt | 22 +-
>  1 file changed, 21 insertions(+), 1 deletion(-)
>
> diff --git a/Documentation/vm/zswap.txt b/Documentation/vm/zswap.txt
> index 89fff7d..cc015b5 100644
> --- a/Documentation/vm/zswap.txt
> +++ b/Documentation/vm/zswap.txt
> @@ -98,5 +98,25 @@ request is made for a page in an old zpool, it is 
> uncompressed using its
>  original compressor.  Once all pages are removed from an old zpool, the zpool
>  and its compressor are freed.
>
> +Some of the pages in zswap are same-value filled pages (i.e. contents of the
> +page have same value or repetitive pattern). These pages include zero-filled
> +pages and they are handled differently. During store operation, a page is
> +checked if it is a same-value filled page before compressing it. If true, the
> +compressed length of the page is set to zero and the pattern or same-filled
> +value is stored.
> +
> +Same-value filled pages identification feature is enabled by default and can 
> be
> +disabled at boot time by setting the "same_filled_pages_enabled" attribute 
> to 0,
> +e.g. zswap.same_filled_pages_enabled=0. It can also be enabled and disabled 
> at
> +runtime using the sysfs "same_filled_pages_enabled" attribute, e.g.
> +
> +echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled
> +
> +When zswap same-filled page identification is disabled at runtime, it will 
> stop
> +checking for the same-value filled pages during store operation. However, the
> +existing pages which are marked as same-value filled pages will be loaded or
> +invalidated.

On first read I thought you were saying existing pages were
immediately loaded or invalidated, which of course is not the case.
Can you update the sentence to clarify existing pages are not modified
by disabling the param, like:

"However, the existing pages which are marked as same-value filled
pages remain stored unchanged until they are either loaded or
invalidated."

except for that the doc update looks good.

> +
>  A debugfs interface is provided for various statistic about pool size, number
> -of pages stored, and various counters for the reasons pages are rejected.
> +of pages stored, same-value filled pages and various counters for the reasons
> +pages are rejected.
> --
> 2.7.4
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: em...@kvack.org


Re: [PATCH] zswap: Same-filled pages handling

2017-11-28 Thread Dan Streetman
On Mon, Nov 20, 2017 at 6:46 PM, Andrew Morton
 wrote:
>
> On Wed, 18 Oct 2017 10:48:32 + Srividya Desireddy 
>  wrote:
>
> > +/* Enable/disable handling same-value filled pages (enabled by default) */
> > +static bool zswap_same_filled_pages_enabled = true;
> > +module_param_named(same_filled_pages_enabled, 
> > zswap_same_filled_pages_enabled,
> > +bool, 0644);
>
> Do we actually need this?  Being able to disable the new feature shows
> a certain lack of confidence ;) I guess we can remove it later as that
> confidence grows.

No, it's not absolutely needed to have the param to enable/disable the
feature, but my concern is around how many pages actually benefit from
this, since it adds some overhead to check every page - the usefulness
of the feature depends entirely on the system workload.  So having a
way to disable it is helpful, for use cases that know it won't benefit
them.

>
> Please send a patch to document this parameter in
> Documentation/vm/zswap.txt.  And if you have time, please check that
> the rest of that file is up-to-date?

Srividya, can you send a patch to doc this feature please.

I'll check the rest of the file is correct.

>
> Thanks.
>


Re: [PATCH] zswap: Same-filled pages handling

2017-11-28 Thread Dan Streetman
On Mon, Nov 20, 2017 at 6:46 PM, Andrew Morton
 wrote:
>
> On Wed, 18 Oct 2017 10:48:32 + Srividya Desireddy 
>  wrote:
>
> > +/* Enable/disable handling same-value filled pages (enabled by default) */
> > +static bool zswap_same_filled_pages_enabled = true;
> > +module_param_named(same_filled_pages_enabled, 
> > zswap_same_filled_pages_enabled,
> > +bool, 0644);
>
> Do we actually need this?  Being able to disable the new feature shows
> a certain lack of confidence ;) I guess we can remove it later as that
> confidence grows.

No, it's not absolutely needed to have the param to enable/disable the
feature, but my concern is around how many pages actually benefit from
this, since it adds some overhead to check every page - the usefulness
of the feature depends entirely on the system workload.  So having a
way to disable it is helpful, for use cases that know it won't benefit
them.

>
> Please send a patch to document this parameter in
> Documentation/vm/zswap.txt.  And if you have time, please check that
> the rest of that file is up-to-date?

Srividya, can you send a patch to doc this feature please.

I'll check the rest of the file is correct.

>
> Thanks.
>


Re: [PATCH] zswap: Same-filled pages handling

2017-11-17 Thread Dan Streetman
On Thu, Nov 2, 2017 at 11:08 AM, Srividya Desireddy
<srividya...@samsung.com> wrote:
>
> On Wed, Oct 19, 2017 at 6:38 AM, Matthew Wilcox wrote:
>> On Thu, Oct 19, 2017 at 12:31:18AM +0300, Timofey Titovets wrote:
>>> > +static void zswap_fill_page(void *ptr, unsigned long value)
>>> > +{
>>> > +   unsigned int pos;
>>> > +   unsigned long *page;
>>> > +
>>> > +   page = (unsigned long *)ptr;
>>> > +   if (value == 0)
>>> > +   memset(page, 0, PAGE_SIZE);
>>> > +   else {
>>> > +   for (pos = 0; pos < PAGE_SIZE / sizeof(*page); pos++)
>>> > +   page[pos] = value;
>>> > +   }
>>> > +}
>>>
>>> Same here, but with memcpy().
>>
>>No.  Use memset_l which is optimised for this specific job.
>
> I have tested this patch using memset_l() function in zswap_fill_page() on
> x86 64-bit system with 2GB RAM. The performance remains same.
> But, memset_l() funcion might be optimised in future.
> @Seth Jennings/Dan Streetman:  Should I use memset_l() function in this patch.

my testing also showed minimal if any difference when using
memset_l(), but it's simpler code and should never be slower than
looping.  I'll ack it if you want to send an additional patch making
this change (on top of the one I already acked).

>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: em...@kvack.org


Re: [PATCH] zswap: Same-filled pages handling

2017-11-17 Thread Dan Streetman
On Thu, Nov 2, 2017 at 11:08 AM, Srividya Desireddy
 wrote:
>
> On Wed, Oct 19, 2017 at 6:38 AM, Matthew Wilcox wrote:
>> On Thu, Oct 19, 2017 at 12:31:18AM +0300, Timofey Titovets wrote:
>>> > +static void zswap_fill_page(void *ptr, unsigned long value)
>>> > +{
>>> > +   unsigned int pos;
>>> > +   unsigned long *page;
>>> > +
>>> > +   page = (unsigned long *)ptr;
>>> > +   if (value == 0)
>>> > +   memset(page, 0, PAGE_SIZE);
>>> > +   else {
>>> > +   for (pos = 0; pos < PAGE_SIZE / sizeof(*page); pos++)
>>> > +   page[pos] = value;
>>> > +   }
>>> > +}
>>>
>>> Same here, but with memcpy().
>>
>>No.  Use memset_l which is optimised for this specific job.
>
> I have tested this patch using memset_l() function in zswap_fill_page() on
> x86 64-bit system with 2GB RAM. The performance remains same.
> But, memset_l() funcion might be optimised in future.
> @Seth Jennings/Dan Streetman:  Should I use memset_l() function in this patch.

my testing also showed minimal if any difference when using
memset_l(), but it's simpler code and should never be slower than
looping.  I'll ack it if you want to send an additional patch making
this change (on top of the one I already acked).

>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: em...@kvack.org


Re: [PATCH] zswap: Same-filled pages handling

2017-11-17 Thread Dan Streetman
On Wed, Oct 18, 2017 at 5:31 PM, Timofey Titovets  wrote:
>> +static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
>> +{
>> +   unsigned int pos;
>> +   unsigned long *page;
>> +
>> +   page = (unsigned long *)ptr;
>> +   for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) {
>> +   if (page[pos] != page[0])
>> +   return 0;
>> +   }
>> +   *value = page[0];
>> +   return 1;
>> +}
>> +
>
> In theory you can speedup that check by memcmp(),
> And do something like first:
> memcmp(ptr, ptr + PAGE_SIZE/sizeof(*page)/2, PAGE_SIZE/2);
> After compare 1/4 with 2/4
> Then 1/8 with 2/8.
> And after do you check with pattern, only on first 512 bytes.
>
> Just because memcmp() on fresh CPU are crazy fast.
> That can easy make you check less expensive.

I did check this, and it is actually significantly worse; keep in mind
that doing it that way is a smaller loop, but is actually doing more
memory comparisons.

>
>> +static void zswap_fill_page(void *ptr, unsigned long value)
>> +{
>> +   unsigned int pos;
>> +   unsigned long *page;
>> +
>> +   page = (unsigned long *)ptr;
>> +   if (value == 0)
>> +   memset(page, 0, PAGE_SIZE);
>> +   else {
>> +   for (pos = 0; pos < PAGE_SIZE / sizeof(*page); pos++)
>> +   page[pos] = value;
>> +   }
>> +}
>
> Same here, but with memcpy().
>
> P.S.
> I'm just too busy to make fast performance test in user space,
> but my recent experience with that CPU commands, show what that make a sense:
> KSM patch: https://patchwork.kernel.org/patch/9980803/
> User space tests: https://github.com/Nefelim4ag/memcmpe
> PAGE_SIZE: 65536, loop count: 1966080
> memcmp:  -28time: 3216 ms,  th: 40064.644611 MiB/s
> memcmpe: -28, offset: 62232 time: 3588 ms,  th: 35902.462390 MiB/s
> memcmpe: -28, offset: 62232 time: 71 ms,th: 1792233.164286 MiB/s
>
> IIRC, with code like our, you must see ~2.5GiB/s
>
> Thanks.
> --
> Have a nice day,
> Timofey.


Re: [PATCH] zswap: Same-filled pages handling

2017-11-17 Thread Dan Streetman
On Wed, Oct 18, 2017 at 5:31 PM, Timofey Titovets  wrote:
>> +static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
>> +{
>> +   unsigned int pos;
>> +   unsigned long *page;
>> +
>> +   page = (unsigned long *)ptr;
>> +   for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) {
>> +   if (page[pos] != page[0])
>> +   return 0;
>> +   }
>> +   *value = page[0];
>> +   return 1;
>> +}
>> +
>
> In theory you can speedup that check by memcmp(),
> And do something like first:
> memcmp(ptr, ptr + PAGE_SIZE/sizeof(*page)/2, PAGE_SIZE/2);
> After compare 1/4 with 2/4
> Then 1/8 with 2/8.
> And after do you check with pattern, only on first 512 bytes.
>
> Just because memcmp() on fresh CPU are crazy fast.
> That can easy make you check less expensive.

I did check this, and it is actually significantly worse; keep in mind
that doing it that way is a smaller loop, but is actually doing more
memory comparisons.

>
>> +static void zswap_fill_page(void *ptr, unsigned long value)
>> +{
>> +   unsigned int pos;
>> +   unsigned long *page;
>> +
>> +   page = (unsigned long *)ptr;
>> +   if (value == 0)
>> +   memset(page, 0, PAGE_SIZE);
>> +   else {
>> +   for (pos = 0; pos < PAGE_SIZE / sizeof(*page); pos++)
>> +   page[pos] = value;
>> +   }
>> +}
>
> Same here, but with memcpy().
>
> P.S.
> I'm just too busy to make fast performance test in user space,
> but my recent experience with that CPU commands, show what that make a sense:
> KSM patch: https://patchwork.kernel.org/patch/9980803/
> User space tests: https://github.com/Nefelim4ag/memcmpe
> PAGE_SIZE: 65536, loop count: 1966080
> memcmp:  -28time: 3216 ms,  th: 40064.644611 MiB/s
> memcmpe: -28, offset: 62232 time: 3588 ms,  th: 35902.462390 MiB/s
> memcmpe: -28, offset: 62232 time: 71 ms,th: 1792233.164286 MiB/s
>
> IIRC, with code like our, you must see ~2.5GiB/s
>
> Thanks.
> --
> Have a nice day,
> Timofey.


Re: [PATCH] zswap: Same-filled pages handling

2017-11-17 Thread Dan Streetman
On Wed, Oct 18, 2017 at 6:48 AM, Srividya Desireddy
<srividya...@samsung.com> wrote:
>
> From: Srividya Desireddy <srividya...@samsung.com>
> Date: Wed, 18 Oct 2017 15:39:02 +0530
> Subject: [PATCH] zswap: Same-filled pages handling
>
> Zswap is a cache which compresses the pages that are being swapped out
> and stores them into a dynamically allocated RAM-based memory pool.
> Experiments have shown that around 10-20% of pages stored in zswap
> are same-filled pages (i.e. contents of the page are all same), but
> these pages are handled as normal pages by compressing and allocating
> memory in the pool.
>
> This patch adds a check in zswap_frontswap_store() to identify same-filled
> page before compression of the page. If the page is a same-filled page, set
> zswap_entry.length to zero, save the same-filled value and skip the
> compression of the page and alloction of memory in zpool.
> In zswap_frontswap_load(), check if value of zswap_entry.length is zero
> corresponding to the page to be loaded. If zswap_entry.length is zero,
> fill the page with same-filled value. This saves the decompression time
> during load.
>
> On a ARM Quad Core 32-bit device with 1.5GB RAM by launching and
> relaunching different applications, out of ~64000 pages stored in
> zswap, ~11000 pages were same-value filled pages (including zero-filled
> pages) and ~9000 pages were zero-filled pages.
>
> An average of 17% of pages(including zero-filled pages) in zswap are
> same-value filled pages and 14% pages are zero-filled pages.
> An average of 3% of pages are same-filled non-zero pages.
>
> The below table shows the execution time profiling with the patch.
>
>   BaselineWith patch  % Improvement
> -
> *Zswap Store Time   26.5ms   18ms  32%
>  (of same value pages)
> *Zswap Load Time
>  (of same value pages)  25.5ms   13ms  49%
> -
>
> On Ubuntu PC with 2GB RAM, while executing kernel build and other test
> scripts and running multimedia applications, out of 36 pages
> stored in zswap 78000(~22%) of pages were found to be same-value filled
> pages (including zero-filled pages) and 64000(~17%) are zero-filled
> pages. So an average of %5 of pages are same-filled non-zero pages.
>
> The below table shows the execution time profiling with the patch.
>
>   BaselineWith patch  % Improvement
> -
> *Zswap Store Time   91ms74ms   19%
>  (of same value pages)
> *Zswap Load Time50ms7.5ms  85%
>  (of same value pages)
> -
>
> *The execution times may vary with test device used.

First, I'm really sorry for such a long delay in looking at this.

I did test this patch out this week, and I added some instrumentation
to check the performance impact, and tested with a small program to
try to check the best and worst cases.

When doing a lot of swap where all (or almost all) pages are
same-value, I found this patch does save both time and space,
significantly.  The exact improvement in time and space depends on
which compressor is being used, but roughly agrees with the numbers
you listed.

In the worst case situation, where all (or almost all) pages have the
same-value *except* the final long (meaning, zswap will check each
long on the entire page but then still have to pass the page to the
compressor), the same-value check is around 10-15% of the total time
spent in zswap_frontswap_store().  That's a not-insignificant amount
of time, but it's not huge.  Considering that most systems will
probably be swapping pages that aren't similar to the worst case
(although I don't have any data to know that), I'd say the improvement
is worth the possible worst-case performance impact.

>
> Signed-off-by: Srividya Desireddy <srividya...@samsung.com>

Acked-by: Dan Streetman <ddstr...@ieee.org>

> ---
>  mm/zswap.c | 77 
> ++
>  1 file changed, 72 insertions(+), 5 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index d39581a..4dd8b89 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -49,6 +49,8 @@
>  static u64 zswap_pool_total_size;
>  /* The number of compressed pages currently stored in zswap */
>  static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
> +/* The number of same-value filled pages currently stored in zswap */
> +static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);
>
>  /*
>   * The statistics below are not protected from co

Re: [PATCH] zswap: Same-filled pages handling

2017-11-17 Thread Dan Streetman
On Wed, Oct 18, 2017 at 6:48 AM, Srividya Desireddy
 wrote:
>
> From: Srividya Desireddy 
> Date: Wed, 18 Oct 2017 15:39:02 +0530
> Subject: [PATCH] zswap: Same-filled pages handling
>
> Zswap is a cache which compresses the pages that are being swapped out
> and stores them into a dynamically allocated RAM-based memory pool.
> Experiments have shown that around 10-20% of pages stored in zswap
> are same-filled pages (i.e. contents of the page are all same), but
> these pages are handled as normal pages by compressing and allocating
> memory in the pool.
>
> This patch adds a check in zswap_frontswap_store() to identify same-filled
> page before compression of the page. If the page is a same-filled page, set
> zswap_entry.length to zero, save the same-filled value and skip the
> compression of the page and alloction of memory in zpool.
> In zswap_frontswap_load(), check if value of zswap_entry.length is zero
> corresponding to the page to be loaded. If zswap_entry.length is zero,
> fill the page with same-filled value. This saves the decompression time
> during load.
>
> On a ARM Quad Core 32-bit device with 1.5GB RAM by launching and
> relaunching different applications, out of ~64000 pages stored in
> zswap, ~11000 pages were same-value filled pages (including zero-filled
> pages) and ~9000 pages were zero-filled pages.
>
> An average of 17% of pages(including zero-filled pages) in zswap are
> same-value filled pages and 14% pages are zero-filled pages.
> An average of 3% of pages are same-filled non-zero pages.
>
> The below table shows the execution time profiling with the patch.
>
>   BaselineWith patch  % Improvement
> -
> *Zswap Store Time   26.5ms   18ms  32%
>  (of same value pages)
> *Zswap Load Time
>  (of same value pages)  25.5ms   13ms  49%
> -
>
> On Ubuntu PC with 2GB RAM, while executing kernel build and other test
> scripts and running multimedia applications, out of ~360000 pages
> stored in zswap, 78000 (~22%) of pages were found to be same-value filled
> pages (including zero-filled pages) and 64000 (~17%) are zero-filled
> pages. So an average of 5% of pages are same-filled non-zero pages.
>
> The below table shows the execution time profiling with the patch.
>
>   BaselineWith patch  % Improvement
> -
> *Zswap Store Time   91ms74ms   19%
>  (of same value pages)
> *Zswap Load Time50ms7.5ms  85%
>  (of same value pages)
> -
>
> *The execution times may vary with test device used.

First, I'm really sorry for such a long delay in looking at this.

I did test this patch out this week, and I added some instrumentation
to check the performance impact, and tested with a small program to
try to check the best and worst cases.

When doing a lot of swap where all (or almost all) pages are
same-value, I found this patch does save both time and space,
significantly.  The exact improvement in time and space depends on
which compressor is being used, but roughly agrees with the numbers
you listed.

In the worst case situation, where all (or almost all) pages have the
same-value *except* the final long (meaning, zswap will check each
long on the entire page but then still have to pass the page to the
compressor), the same-value check is around 10-15% of the total time
spent in zswap_frontswap_store().  That's a not-insignificant amount
of time, but it's not huge.  Considering that most systems will
probably be swapping pages that aren't similar to the worst case
(although I don't have any data to know that), I'd say the improvement
is worth the possible worst-case performance impact.

>
> Signed-off-by: Srividya Desireddy 

Acked-by: Dan Streetman 

> ---
>  mm/zswap.c | 77 
> ++
>  1 file changed, 72 insertions(+), 5 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index d39581a..4dd8b89 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -49,6 +49,8 @@
>  static u64 zswap_pool_total_size;
>  /* The number of compressed pages currently stored in zswap */
>  static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
> +/* The number of same-value filled pages currently stored in zswap */
> +static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);
>
>  /*
>   * The statistics below are not protected from concurrent access for
> @@ -116,6 +118,11 @@ static int zswap_compressor_param_set(const char *,
>  static unsigned

Re: [PATCH] mm/zswap: constify struct kernel_param_ops uses

2017-08-28 Thread Dan Streetman
On Sat, Aug 26, 2017 at 1:41 PM, Arvind Yadav <arvind.yadav...@gmail.com> wrote:
> kernel_param_ops are not supposed to change at runtime. All functions
> working with kernel_param_ops provided by  work
> with const kernel_param_ops. So mark the non-const structs as const.
>
> Signed-off-by: Arvind Yadav <arvind.yadav...@gmail.com>

Reviewed-by: Dan Streetman <ddstr...@ieee.org>

> ---
>  mm/zswap.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index d39581a..030fbf9 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -82,7 +82,7 @@ static u64 zswap_duplicate_entry;
>  static bool zswap_enabled;
>  static int zswap_enabled_param_set(const char *,
>const struct kernel_param *);
> -static struct kernel_param_ops zswap_enabled_param_ops = {
> +static const struct kernel_param_ops zswap_enabled_param_ops = {
> .set =  zswap_enabled_param_set,
> .get =  param_get_bool,
>  };
> @@ -93,7 +93,7 @@ module_param_cb(enabled, &zswap_enabled_param_ops, 
> &zswap_enabled, 0644);
>  static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
>  static int zswap_compressor_param_set(const char *,
>   const struct kernel_param *);
> -static struct kernel_param_ops zswap_compressor_param_ops = {
> +static const struct kernel_param_ops zswap_compressor_param_ops = {
> .set =  zswap_compressor_param_set,
> .get =  param_get_charp,
> .free = param_free_charp,
> @@ -105,7 +105,7 @@ module_param_cb(compressor, &zswap_compressor_param_ops,
>  #define ZSWAP_ZPOOL_DEFAULT "zbud"
>  static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
>  static int zswap_zpool_param_set(const char *, const struct kernel_param *);
> -static struct kernel_param_ops zswap_zpool_param_ops = {
> +static const struct kernel_param_ops zswap_zpool_param_ops = {
> .set =  zswap_zpool_param_set,
> .get =  param_get_charp,
> .free = param_free_charp,
> --
> 2.7.4
>


Re: [PATCH] mm/zswap: constify struct kernel_param_ops uses

2017-08-28 Thread Dan Streetman
On Sat, Aug 26, 2017 at 1:41 PM, Arvind Yadav  wrote:
> kernel_param_ops are not supposed to change at runtime. All functions
> working with kernel_param_ops provided by  work
> with const kernel_param_ops. So mark the non-const structs as const.
>
> Signed-off-by: Arvind Yadav 

Reviewed-by: Dan Streetman 

> ---
>  mm/zswap.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index d39581a..030fbf9 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -82,7 +82,7 @@ static u64 zswap_duplicate_entry;
>  static bool zswap_enabled;
>  static int zswap_enabled_param_set(const char *,
>const struct kernel_param *);
> -static struct kernel_param_ops zswap_enabled_param_ops = {
> +static const struct kernel_param_ops zswap_enabled_param_ops = {
> .set =  zswap_enabled_param_set,
> .get =  param_get_bool,
>  };
> @@ -93,7 +93,7 @@ module_param_cb(enabled, &zswap_enabled_param_ops, 
> &zswap_enabled, 0644);
>  static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
>  static int zswap_compressor_param_set(const char *,
>   const struct kernel_param *);
> -static struct kernel_param_ops zswap_compressor_param_ops = {
> +static const struct kernel_param_ops zswap_compressor_param_ops = {
> .set =  zswap_compressor_param_set,
> .get =  param_get_charp,
> .free = param_free_charp,
> @@ -105,7 +105,7 @@ module_param_cb(compressor, &zswap_compressor_param_ops,
>  #define ZSWAP_ZPOOL_DEFAULT "zbud"
>  static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
>  static int zswap_zpool_param_set(const char *, const struct kernel_param *);
> -static struct kernel_param_ops zswap_zpool_param_ops = {
> +static const struct kernel_param_ops zswap_zpool_param_ops = {
> .set =  zswap_zpool_param_set,
> .get =  param_get_charp,
> .free = param_free_charp,
> --
> 2.7.4
>


Re: [PATCH 2/2] zpool: Use common error handling code in zpool_create_pool()

2017-08-16 Thread Dan Streetman
On Mon, Aug 14, 2017 at 7:16 AM, SF Markus Elfring
<elfr...@users.sourceforge.net> wrote:
> From: Markus Elfring <elfr...@users.sourceforge.net>
> Date: Mon, 14 Aug 2017 13:04:33 +0200
>
> Add a jump target so that a bit of exception handling can be better reused
> in this function.
>
> Signed-off-by: Markus Elfring <elfr...@users.sourceforge.net>

Acked-by: Dan Streetman <ddstr...@ieee.org>

> ---
>  mm/zpool.c | 9 -
>  1 file changed, 4 insertions(+), 5 deletions(-)
>
> diff --git a/mm/zpool.c b/mm/zpool.c
> index fe1943f7d844..e4634edef86d 100644
> --- a/mm/zpool.c
> +++ b/mm/zpool.c
> @@ -171,10 +171,8 @@ struct zpool *zpool_create_pool(const char *type, const 
> char *name, gfp_t gfp,
> }
>
> zpool = kmalloc(sizeof(*zpool), gfp);
> -   if (!zpool) {
> -   zpool_put_driver(driver);
> -   return NULL;
> -   }
> +   if (!zpool)
> +   goto put_driver;
>
> zpool->driver = driver;
> zpool->pool = driver->create(name, gfp, ops, zpool);
> @@ -182,8 +180,9 @@ struct zpool *zpool_create_pool(const char *type, const 
> char *name, gfp_t gfp,
>
> if (!zpool->pool) {
> pr_err("couldn't create %s pool\n", type);
> -   zpool_put_driver(driver);
> kfree(zpool);
> +put_driver:
> +   zpool_put_driver(driver);
> return NULL;
> }
>
> --
> 2.14.0
>


Re: [PATCH 2/2] zpool: Use common error handling code in zpool_create_pool()

2017-08-16 Thread Dan Streetman
On Mon, Aug 14, 2017 at 7:16 AM, SF Markus Elfring
 wrote:
> From: Markus Elfring 
> Date: Mon, 14 Aug 2017 13:04:33 +0200
>
> Add a jump target so that a bit of exception handling can be better reused
> in this function.
>
> Signed-off-by: Markus Elfring 

Acked-by: Dan Streetman 

> ---
>  mm/zpool.c | 9 -
>  1 file changed, 4 insertions(+), 5 deletions(-)
>
> diff --git a/mm/zpool.c b/mm/zpool.c
> index fe1943f7d844..e4634edef86d 100644
> --- a/mm/zpool.c
> +++ b/mm/zpool.c
> @@ -171,10 +171,8 @@ struct zpool *zpool_create_pool(const char *type, const 
> char *name, gfp_t gfp,
> }
>
> zpool = kmalloc(sizeof(*zpool), gfp);
> -   if (!zpool) {
> -   zpool_put_driver(driver);
> -   return NULL;
> -   }
> +   if (!zpool)
> +   goto put_driver;
>
> zpool->driver = driver;
> zpool->pool = driver->create(name, gfp, ops, zpool);
> @@ -182,8 +180,9 @@ struct zpool *zpool_create_pool(const char *type, const 
> char *name, gfp_t gfp,
>
> if (!zpool->pool) {
> pr_err("couldn't create %s pool\n", type);
> -   zpool_put_driver(driver);
> kfree(zpool);
> +put_driver:
> +   zpool_put_driver(driver);
> return NULL;
> }
>
> --
> 2.14.0
>


Re: [PATCH 1/2] zpool: Delete an error message for a failed memory allocation in zpool_create_pool()

2017-08-16 Thread Dan Streetman
On Mon, Aug 14, 2017 at 7:15 AM, SF Markus Elfring
<elfr...@users.sourceforge.net> wrote:
> From: Markus Elfring <elfr...@users.sourceforge.net>
> Date: Mon, 14 Aug 2017 12:57:16 +0200
>
> Omit an extra message for a memory allocation failure in this function.
>
> This issue was detected by using the Coccinelle software.
>
> Signed-off-by: Markus Elfring <elfr...@users.sourceforge.net>

Acked-by: Dan Streetman <ddstr...@ieee.org>

> ---
>  mm/zpool.c | 1 -
>  1 file changed, 1 deletion(-)
>
> diff --git a/mm/zpool.c b/mm/zpool.c
> index fd3ff719c32c..fe1943f7d844 100644
> --- a/mm/zpool.c
> +++ b/mm/zpool.c
> @@ -172,7 +172,6 @@ struct zpool *zpool_create_pool(const char *type, const 
> char *name, gfp_t gfp,
>
> zpool = kmalloc(sizeof(*zpool), gfp);
> if (!zpool) {
> -   pr_err("couldn't create zpool - out of memory\n");
> zpool_put_driver(driver);
> return NULL;
> }
> --
> 2.14.0
>


Re: [PATCH 1/2] zpool: Delete an error message for a failed memory allocation in zpool_create_pool()

2017-08-16 Thread Dan Streetman
On Mon, Aug 14, 2017 at 7:15 AM, SF Markus Elfring
 wrote:
> From: Markus Elfring 
> Date: Mon, 14 Aug 2017 12:57:16 +0200
>
> Omit an extra message for a memory allocation failure in this function.
>
> This issue was detected by using the Coccinelle software.
>
> Signed-off-by: Markus Elfring 

Acked-by: Dan Streetman 

> ---
>  mm/zpool.c | 1 -
>  1 file changed, 1 deletion(-)
>
> diff --git a/mm/zpool.c b/mm/zpool.c
> index fd3ff719c32c..fe1943f7d844 100644
> --- a/mm/zpool.c
> +++ b/mm/zpool.c
> @@ -172,7 +172,6 @@ struct zpool *zpool_create_pool(const char *type, const 
> char *name, gfp_t gfp,
>
> zpool = kmalloc(sizeof(*zpool), gfp);
> if (!zpool) {
> -   pr_err("couldn't create zpool - out of memory\n");
> zpool_put_driver(driver);
> return NULL;
> }
> --
> 2.14.0
>


Re: [PATCH v2] zswap: Zero-filled pages handling

2017-07-06 Thread Dan Streetman
On Thu, Jul 6, 2017 at 5:29 AM, Srividya Desireddy
 wrote:
> On Wed, Jul 6, 2017 at 10:49 AM, Sergey Senozhatsky wrote:
>> On (07/02/17 20:28), Seth Jennings wrote:
>>> On Sun, Jul 2, 2017 at 9:19 AM, Srividya Desireddy
>>> > Zswap is a cache which compresses the pages that are being swapped out
>>> > and stores them into a dynamically allocated RAM-based memory pool.
>>> > Experiments have shown that around 10-20% of pages stored in zswap
>>> > are zero-filled pages (i.e. contents of the page are all zeros), but
>>> > these pages are handled as normal pages by compressing and allocating
>>> > memory in the pool.
>>>
>>> I am somewhat surprised that this many anon pages are zero filled.
>>>
>>> If this is true, then maybe we should consider solving this at the
>>> swap level in general, as we can de-dup zero pages in all swap
>>> devices, not just zswap.
>>>
>>> That being said, this is a fairly small change and I don't see anything
>>> objectionable.  However, I do think the better solution would be to do
>> this at a higher level.
>>
>
> Thank you for your suggestion. It is a better solution to handle
> zero-filled pages before swapping-out to zswap. Since Zram already
> handles zero pages internally, I considered handling them within Zswap.
> In the long run, we can work on commonly handling zero-filled anon
> pages.
>
>> zero-filled pages are just 1 case. in general, it's better
>> to handle pages that are memset-ed with the same value (e.g.
>> memset(page, 0x01, page_size)). which includes, but not
>> limited to, 0x00. zram does it.
>>
>> -ss
>
> It is a good solution to extend zero-filled pages handling to same value
> pages. I will work on to identify the percentage of same value pages
> excluding zero-filled pages in Zswap and will get back.

Yes, this sounds like a good modification to the patch.  Also, unless
anyone else disagrees, it may be good to control this with a module
param - in case anyone has a use case that they know won't be helped
by this, and the extra overhead of checking each page is wasteful.
Probably should default to enabled.

>
> - Srividya


Re: [PATCH v2] zswap: Zero-filled pages handling

2017-07-06 Thread Dan Streetman
On Thu, Jul 6, 2017 at 5:29 AM, Srividya Desireddy
 wrote:
> On Wed, Jul 6, 2017 at 10:49 AM, Sergey Senozhatsky wrote:
>> On (07/02/17 20:28), Seth Jennings wrote:
>>> On Sun, Jul 2, 2017 at 9:19 AM, Srividya Desireddy
>>> > Zswap is a cache which compresses the pages that are being swapped out
>>> > and stores them into a dynamically allocated RAM-based memory pool.
>>> > Experiments have shown that around 10-20% of pages stored in zswap
>>> > are zero-filled pages (i.e. contents of the page are all zeros), but
>>> > these pages are handled as normal pages by compressing and allocating
>>> > memory in the pool.
>>>
>>> I am somewhat surprised that this many anon pages are zero filled.
>>>
>>> If this is true, then maybe we should consider solving this at the
>>> swap level in general, as we can de-dup zero pages in all swap
>>> devices, not just zswap.
>>>
>>> That being said, this is a fairly small change and I don't see anything
>>> objectionable.  However, I do think the better solution would be to do
>> this at a higher level.
>>
>
> Thank you for your suggestion. It is a better solution to handle
> zero-filled pages before swapping-out to zswap. Since Zram already
> handles zero pages internally, I considered handling them within Zswap.
> In the long run, we can work on commonly handling zero-filled anon
> pages.
>
>> zero-filled pages are just 1 case. in general, it's better
>> to handle pages that are memset-ed with the same value (e.g.
>> memset(page, 0x01, page_size)). which includes, but not
>> limited to, 0x00. zram does it.
>>
>> -ss
>
> It is a good solution to extend zero-filled pages handling to same value
> pages. I will work on to identify the percentage of same value pages
> excluding zero-filled pages in Zswap and will get back.

Yes, this sounds like a good modification to the patch.  Also, unless
anyone else disagrees, it may be good to control this with a module
param - in case anyone has a use case that they know won't be helped
by this, and the extra overhead of checking each page is wasteful.
Probably should default to enabled.

>
> - Srividya


Re: [PATCH 3/3] zswap: Delete an error message for a failed memory allocation in zswap_dstmem_prepare()

2017-05-30 Thread Dan Streetman
On Sun, May 21, 2017 at 4:27 AM, SF Markus Elfring
<elfr...@users.sourceforge.net> wrote:
> From: Markus Elfring <elfr...@users.sourceforge.net>
> Date: Sun, 21 May 2017 09:29:25 +0200
>
> Omit an extra message for a memory allocation failure in this function.
>
> This issue was detected by using the Coccinelle software.
>
> Link: 
> http://events.linuxfoundation.org/sites/events/files/slides/LCJ16-Refactor_Strings-WSang_0.pdf
> Signed-off-by: Markus Elfring <elfr...@users.sourceforge.net>

Acked-by: Dan Streetman <ddstr...@ieee.org>

> ---
>  mm/zswap.c | 5 ++---
>  1 file changed, 2 insertions(+), 3 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 3f0a9a1daef4..ed7312291df9 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -374,7 +374,6 @@ static int zswap_dstmem_prepare(unsigned int cpu)
> -   if (!dst) {
> -   pr_err("can't allocate compressor buffer\n");
> +   if (!dst)
> return -ENOMEM;
> -   }
> +
> per_cpu(zswap_dstmem, cpu) = dst;
> return 0;
>  }
> --
> 2.13.0
>


Re: [PATCH 3/3] zswap: Delete an error message for a failed memory allocation in zswap_dstmem_prepare()

2017-05-30 Thread Dan Streetman
On Sun, May 21, 2017 at 4:27 AM, SF Markus Elfring
 wrote:
> From: Markus Elfring 
> Date: Sun, 21 May 2017 09:29:25 +0200
>
> Omit an extra message for a memory allocation failure in this function.
>
> This issue was detected by using the Coccinelle software.
>
> Link: 
> http://events.linuxfoundation.org/sites/events/files/slides/LCJ16-Refactor_Strings-WSang_0.pdf
> Signed-off-by: Markus Elfring 

Acked-by: Dan Streetman 

> ---
>  mm/zswap.c | 5 ++---
>  1 file changed, 2 insertions(+), 3 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 3f0a9a1daef4..ed7312291df9 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -374,7 +374,6 @@ static int zswap_dstmem_prepare(unsigned int cpu)
> -   if (!dst) {
> -   pr_err("can't allocate compressor buffer\n");
> +   if (!dst)
> return -ENOMEM;
> -   }
> +
> per_cpu(zswap_dstmem, cpu) = dst;
> return 0;
>  }
> --
> 2.13.0
>


Re: [PATCH 2/3] zswap: Improve a size determination in zswap_frontswap_init()

2017-05-30 Thread Dan Streetman
On Sun, May 21, 2017 at 4:26 AM, SF Markus Elfring
<elfr...@users.sourceforge.net> wrote:
> From: Markus Elfring <elfr...@users.sourceforge.net>
> Date: Sat, 20 May 2017 22:44:03 +0200
>
> Replace the specification of a data structure by a pointer dereference
> as the parameter for the operator "sizeof" to make the corresponding size
> determination a bit safer according to the Linux coding style convention.
>
> Signed-off-by: Markus Elfring <elfr...@users.sourceforge.net>

Acked-by: Dan Streetman <ddstr...@ieee.org>

> ---
>  mm/zswap.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 18d8e87119a6..a6e67633be03 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1156,5 +1156,5 @@ static void zswap_frontswap_init(unsigned type)
>  {
> struct zswap_tree *tree;
>
> -   tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
> +   tree = kzalloc(sizeof(*tree), GFP_KERNEL);
> if (!tree) {
> --
> 2.13.0
>


Re: [PATCH 2/3] zswap: Improve a size determination in zswap_frontswap_init()

2017-05-30 Thread Dan Streetman
On Sun, May 21, 2017 at 4:26 AM, SF Markus Elfring
 wrote:
> From: Markus Elfring 
> Date: Sat, 20 May 2017 22:44:03 +0200
>
> Replace the specification of a data structure by a pointer dereference
> as the parameter for the operator "sizeof" to make the corresponding size
> determination a bit safer according to the Linux coding style convention.
>
> Signed-off-by: Markus Elfring 

Acked-by: Dan Streetman 

> ---
>  mm/zswap.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 18d8e87119a6..a6e67633be03 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -1156,5 +1156,5 @@ static void zswap_frontswap_init(unsigned type)
>  {
> struct zswap_tree *tree;
>
> -   tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
> +   tree = kzalloc(sizeof(*tree), GFP_KERNEL);
> if (!tree) {
> --
> 2.13.0
>


Re: [PATCH 1/3] zswap: Delete an error message for a failed memory allocation in zswap_pool_create()

2017-05-30 Thread Dan Streetman
On Sun, May 21, 2017 at 4:25 AM, SF Markus Elfring
<elfr...@users.sourceforge.net> wrote:
> From: Markus Elfring <elfr...@users.sourceforge.net>
> Date: Sat, 20 May 2017 22:33:21 +0200
>
> Omit an extra message for a memory allocation failure in this function.
>
> This issue was detected by using the Coccinelle software.
>
> Link: 
> http://events.linuxfoundation.org/sites/events/files/slides/LCJ16-Refactor_Strings-WSang_0.pdf
> Signed-off-by: Markus Elfring <elfr...@users.sourceforge.net>

Acked-by: Dan Streetman <ddstr...@ieee.org>

> ---
>  mm/zswap.c | 4 +---
>  1 file changed, 1 insertion(+), 3 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index eedc27894b10..18d8e87119a6 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -518,7 +518,5 @@ static struct zswap_pool *zswap_pool_create(char *type, 
> char *compressor)
> -   if (!pool) {
> -   pr_err("pool alloc failed\n");
> +   if (!pool)
> return NULL;
> -   }
>
> /* unique name for each pool specifically required by zsmalloc */
> snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
> --
> 2.13.0
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"d...@kvack.org"> em...@kvack.org </a>


Re: [PATCH 1/3] zswap: Delete an error message for a failed memory allocation in zswap_pool_create()

2017-05-30 Thread Dan Streetman
On Sun, May 21, 2017 at 4:25 AM, SF Markus Elfring
 wrote:
> From: Markus Elfring 
> Date: Sat, 20 May 2017 22:33:21 +0200
>
> Omit an extra message for a memory allocation failure in this function.
>
> This issue was detected by using the Coccinelle software.
>
> Link: 
> http://events.linuxfoundation.org/sites/events/files/slides/LCJ16-Refactor_Strings-WSang_0.pdf
> Signed-off-by: Markus Elfring 

Acked-by: Dan Streetman 

> ---
>  mm/zswap.c | 4 +---
>  1 file changed, 1 insertion(+), 3 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index eedc27894b10..18d8e87119a6 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -518,7 +518,5 @@ static struct zswap_pool *zswap_pool_create(char *type, 
> char *compressor)
> -   if (!pool) {
> -   pr_err("pool alloc failed\n");
> +   if (!pool)
> return NULL;
> -   }
>
> /* unique name for each pool specifically required by zsmalloc */
> snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
> --
> 2.13.0
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href=mailto:"d...@kvack.org"> em...@kvack.org </a>


Re: [PATCH] crypto/nx: Update MAINTAINERS entry for 842 compression

2017-04-01 Thread Dan Streetman
On Sat, Apr 1, 2017 at 1:25 PM, Haren Myneni <ha...@linux.vnet.ibm.com> wrote:
> [PATCH] crypto/nx: Update MAINTAINERS entry for 842 compression
>
> Signed-off-by: Haren Myneni <ha...@us.ibm.com>

Acked-by: Dan Streetman <ddstr...@ieee.org>

>
> ---
>  MAINTAINERS | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index c265a5f..4cfd225 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -6211,7 +6211,7 @@ F:drivers/crypto/nx/nx_csbcpb.h
>  F: drivers/crypto/nx/nx_debugfs.h
>
>  IBM Power 842 compression accelerator
> -M: Dan Streetman <ddstr...@ieee.org>
> +M: Haren Myneni <ha...@us.ibm.com>
>  S: Supported
>  F: drivers/crypto/nx/Makefile
>  F: drivers/crypto/nx/Kconfig
> --
> 1.8.3.1
>
>
>


Re: [PATCH] crypto/nx: Update MAINTAINERS entry for 842 compression

2017-04-01 Thread Dan Streetman
On Sat, Apr 1, 2017 at 1:25 PM, Haren Myneni  wrote:
> [PATCH] crypto/nx: Update MAINTAINERS entry for 842 compression
>
> Signed-off-by: Haren Myneni 

Acked-by: Dan Streetman 

>
> ---
>  MAINTAINERS | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index c265a5f..4cfd225 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -6211,7 +6211,7 @@ F:drivers/crypto/nx/nx_csbcpb.h
>  F: drivers/crypto/nx/nx_debugfs.h
>
>  IBM Power 842 compression accelerator
> -M: Dan Streetman 
> +M: Haren Myneni 
>  S: Supported
>  F: drivers/crypto/nx/Makefile
>  F: drivers/crypto/nx/Kconfig
> --
> 1.8.3.1
>
>
>


Re: maybe revert commit c275a57f5ec3 "xen/balloon: Set balloon's initial state to number of existing RAM pages"

2017-03-27 Thread Dan Streetman
On Fri, Mar 24, 2017 at 9:33 PM, Boris Ostrovsky
 wrote:
>
>>
>> I think we can all agree that the *ideal* situation would be, for the
>> balloon driver to not immediately hotplug memory so it can add 11 more
>> pages, so maybe I just need to figure out why the balloon driver
>> thinks it needs 11 more pages, and fix that.
>
>
>
> How does the new memory appear in the guest? Via online_pages()?
>
> Or is ballooning triggered from watch_target()?

yes, it's triggered from watch_target() which then calls
online_pages() with the new memory.  I added some debug (all numbers
are in hex):

[0.500080] xen:balloon: Initialising balloon driver
[0.503027] xen:balloon: balloon_init: current/target pages 1fff9d
[0.504044] xen_balloon: Initialising balloon driver
[0.508046] xen_balloon: watch_target: new target 80 kb
[0.508046] xen:balloon: balloon_set_new_target: target 20
[0.524024] xen:balloon: current_credit: target pages 20
current pages 1fff9d credit 63
[0.567055] xen:balloon: balloon_process: current_credit 63
[0.568005] xen:balloon: reserve_additional_memory: adding memory
resource for 8000 pages
[3.694443] online_pages: pfn 21 nr_pages 8000 type 0
[3.701072] xen:balloon: current_credit: target pages 20
current pages 1fff9d credit 63
[3.701074] xen:balloon: balloon_process: current_credit 63
[3.701075] xen:balloon: increase_reservation: nr_pages 63
[3.701170] xen:balloon: increase_reservation: done, current_pages 1fffa8
[3.701172] xen:balloon: current_credit: target pages 20
current pages 1fffa8 credit 58
[3.701173] xen:balloon: balloon_process: current_credit 58
[3.701173] xen:balloon: increase_reservation: nr_pages 58
[3.701180] xen:balloon: increase_reservation: XENMEM_populate_physmap err 0
[5.708085] xen:balloon: current_credit: target pages 20
current pages 1fffa8 credit 58
[5.708088] xen:balloon: balloon_process: current_credit 58
[5.708089] xen:balloon: increase_reservation: nr_pages 58
[5.708106] xen:balloon: increase_reservation: XENMEM_populate_physmap err 0
[9.716065] xen:balloon: current_credit: target pages 20
current pages 1fffa8 credit 58
[9.716068] xen:balloon: balloon_process: current_credit 58
[9.716069] xen:balloon: increase_reservation: nr_pages 58
[9.716087] xen:balloon: increase_reservation: XENMEM_populate_physmap err 0


and that continues forever at the max interval (32), since
max_retry_count is unlimited.  So I think I understand things now;
first, the current_pages is set properly based on the e820 map:

$ dmesg|grep -i e820
[0.00] e820: BIOS-provided physical RAM map:
[0.00] BIOS-e820: [mem 0x-0x0009dfff] usable
[0.00] BIOS-e820: [mem 0x0009e000-0x0009] reserved
[0.00] BIOS-e820: [mem 0x000e-0x000f] reserved
[0.00] BIOS-e820: [mem 0x0010-0xefff] usable
[0.00] BIOS-e820: [mem 0xfc00-0x] reserved
[0.00] BIOS-e820: [mem 0x0001-0x00020fff] usable
[0.00] e820: update [mem 0x-0x0fff] usable ==> reserved
[0.00] e820: remove [mem 0x000a-0x000f] usable
[0.00] e820: last_pfn = 0x21 max_arch_pfn = 0x4
[0.00] e820: last_pfn = 0xf max_arch_pfn = 0x4
[0.00] e820: [mem 0xf000-0xfbff] available for PCI devices
[0.528007] e820: reserve RAM buffer [mem 0x0009e000-0x0009]
ubuntu@ip-172-31-60-112:~$ printf "%x\n" $[ 0x21 - 0x10 +
0xf - 0x100 + 0x9e - 1 ]
1fff9d


then, the xen balloon notices its target has been set to 20 by the
hypervisor.  That target does account for the hole at 0xf to
0x10, but it doesn't account for the hole at 0xe0 to 0x100 ( 0x20
pages), nor the hole at 0x9e to 0xa0 ( 2 pages ), nor the unlisted
hole (that the kernel removes) at 0xa0 to 0xe0 ( 0x40 pages).  That's
0x62 pages, plus the 1-page hole at addr 0 that the kernel always
reserves, is 0x63 pages of holes, which aren't accounted for in the
hypervisor's target.

so the balloon driver hotplugs the memory, and tries to increase its
reservation to provide the needed pages to get the current_pages up to
the target.  However, when it calls the hypervisor to populate the
physmap, the hypervisor only allows 11 (0xb) pages to be populated;
all calls after that get back 0 from the hypervisor.

Do you think the hypervisor's balloon target should account for the
e820 holes (and for the kernel's added hole at addr 0)?
Alternately/additionally, if the hypervisor doesn't want to support
ballooning, should it just return error from the call to populate the
physmap, and not allow those 11 pages?

At this point, it doesn't seem to me like the kernel is doing anything
wrong, correct?


Re: maybe revert commit c275a57f5ec3 "xen/balloon: Set balloon's initial state to number of existing RAM pages"

2017-03-27 Thread Dan Streetman
On Fri, Mar 24, 2017 at 9:33 PM, Boris Ostrovsky
 wrote:
>
>>
>> I think we can all agree that the *ideal* situation would be, for the
>> balloon driver to not immediately hotplug memory so it can add 11 more
>> pages, so maybe I just need to figure out why the balloon driver
>> thinks it needs 11 more pages, and fix that.
>
>
>
> How does the new memory appear in the guest? Via online_pages()?
>
> Or is ballooning triggered from watch_target()?

yes, it's triggered from watch_target() which then calls
online_pages() with the new memory.  I added some debug (all numbers
are in hex):

[0.500080] xen:balloon: Initialising balloon driver
[0.503027] xen:balloon: balloon_init: current/target pages 1fff9d
[0.504044] xen_balloon: Initialising balloon driver
[0.508046] xen_balloon: watch_target: new target 80 kb
[0.508046] xen:balloon: balloon_set_new_target: target 20
[0.524024] xen:balloon: current_credit: target pages 20
current pages 1fff9d credit 63
[0.567055] xen:balloon: balloon_process: current_credit 63
[0.568005] xen:balloon: reserve_additional_memory: adding memory
resource for 8000 pages
[3.694443] online_pages: pfn 21 nr_pages 8000 type 0
[3.701072] xen:balloon: current_credit: target pages 20
current pages 1fff9d credit 63
[3.701074] xen:balloon: balloon_process: current_credit 63
[3.701075] xen:balloon: increase_reservation: nr_pages 63
[3.701170] xen:balloon: increase_reservation: done, current_pages 1fffa8
[3.701172] xen:balloon: current_credit: target pages 20
current pages 1fffa8 credit 58
[3.701173] xen:balloon: balloon_process: current_credit 58
[3.701173] xen:balloon: increase_reservation: nr_pages 58
[3.701180] xen:balloon: increase_reservation: XENMEM_populate_physmap err 0
[5.708085] xen:balloon: current_credit: target pages 20
current pages 1fffa8 credit 58
[5.708088] xen:balloon: balloon_process: current_credit 58
[5.708089] xen:balloon: increase_reservation: nr_pages 58
[5.708106] xen:balloon: increase_reservation: XENMEM_populate_physmap err 0
[9.716065] xen:balloon: current_credit: target pages 20
current pages 1fffa8 credit 58
[9.716068] xen:balloon: balloon_process: current_credit 58
[9.716069] xen:balloon: increase_reservation: nr_pages 58
[9.716087] xen:balloon: increase_reservation: XENMEM_populate_physmap err 0


and that continues forever at the max interval (32), since
max_retry_count is unlimited.  So I think I understand things now;
first, the current_pages is set properly based on the e820 map:

$ dmesg|grep -i e820
[0.000000] e820: BIOS-provided physical RAM map:
[0.000000] BIOS-e820: [mem 0x0000000000000000-0x000000000009dfff] usable
[0.000000] BIOS-e820: [mem 0x000000000009e000-0x000000000009ffff] reserved
[0.000000] BIOS-e820: [mem 0x00000000000e0000-0x00000000000fffff] reserved
[0.000000] BIOS-e820: [mem 0x0000000000100000-0x00000000efffffff] usable
[0.000000] BIOS-e820: [mem 0x00000000fc000000-0x00000000ffffffff] reserved
[0.000000] BIOS-e820: [mem 0x0000000100000000-0x000000020fffffff] usable
[0.000000] e820: update [mem 0x00000000-0x00000fff] usable ==> reserved
[0.000000] e820: remove [mem 0x000a0000-0x000fffff] usable
[0.000000] e820: last_pfn = 0x210000 max_arch_pfn = 0x400000000
[0.000000] e820: last_pfn = 0xf0000 max_arch_pfn = 0x400000000
[0.000000] e820: [mem 0xf0000000-0xfbffffff] available for PCI devices
[0.528007] e820: reserve RAM buffer [mem 0x0009e000-0x0009ffff]
ubuntu@ip-172-31-60-112:~$ printf "%x\n" $[ 0x210000 - 0x100000 + 0xf0000 - 0x100 + 0x9e - 1 ]
1fff9d


then, the xen balloon notices its target has been set to 0x200000 by the
hypervisor.  That target does account for the hole at 0xf0000 to
0x100000, but it doesn't account for the hole at 0xe0 to 0x100 ( 0x20
pages), nor the hole at 0x9e to 0xa0 ( 2 pages ), nor the unlisted
hole (that the kernel removes) at 0xa0 to 0xe0 ( 0x40 pages).  That's
0x62 pages, plus the 1-page hole at addr 0 that the kernel always
reserves, is 0x63 pages of holes, which aren't accounted for in the
hypervisor's target.

so the balloon driver hotplugs the memory, and tries to increase its
reservation to provide the needed pages to get the current_pages up to
the target.  However, when it calls the hypervisor to populate the
physmap, the hypervisor only allows 11 (0xb) pages to be populated;
all calls after that get back 0 from the hypervisor.

Do you think the hypervisor's balloon target should account for the
e820 holes (and for the kernel's added hole at addr 0)?
Alternately/additionally, if the hypervisor doesn't want to support
ballooning, should it just return error from the call to populate the
physmap, and not allow those 11 pages?

At this point, it doesn't seem to me like the kernel is doing anything
wrong, correct?


Re: maybe revert commit c275a57f5ec3 "xen/balloon: Set balloon's initial state to number of existing RAM pages"

2017-03-24 Thread Dan Streetman
On Fri, Mar 24, 2017 at 5:10 PM, Konrad Rzeszutek Wilk
<konrad.w...@oracle.com> wrote:
> On Fri, Mar 24, 2017 at 04:34:23PM -0400, Dan Streetman wrote:
>> On Wed, Mar 22, 2017 at 10:13 PM, Boris Ostrovsky
>> <boris.ostrov...@oracle.com> wrote:
>> >
>> >
>> > On 03/22/2017 05:16 PM, Dan Streetman wrote:
>> >>
>> >> I have a question about a problem introduced by this commit:
>> >> c275a57f5ec3056f732843b11659d892235faff7
>> >> "xen/balloon: Set balloon's initial state to number of existing RAM pages"
>> >>
>> >> It changed the xen balloon current_pages calculation to start with the
>> >> number of physical pages in the system, instead of max_pfn.  Since
>> >> get_num_physpages() does not include holes, it's always less than the
>> >> e820 map's max_pfn.
>> >>
>> >> However, the problem that commit introduced is, if the hypervisor sets
>> >> the balloon target to equal to the e820 map's max_pfn, then the
>> >> balloon target will *always* be higher than the initial current pages.
>> >> Even if the hypervisor sets the target to (e820 max_pfn - holes), if
>> >> the OS adds any holes, the balloon target will be higher than the
>> >> current pages.  This is the situation, for example, for Amazon AWS
>> >> instances.  The result is, the xen balloon will always immediately
>> >> hotplug some memory at boot, but then make only (max_pfn -
>> >> get_num_physpages()) available to the system.
>> >>
>> >> This balloon-hotplugged memory can cause problems, if the hypervisor
>> >> wasn't expecting it; specifically, the system's physical page
>> >> addresses now will exceed the e820 map's max_pfn, due to the
>> >> balloon-hotplugged pages; if the hypervisor isn't expecting pt-device
>> >> DMA to/from those physical pages above the e820 max_pfn, it causes
>> >> problems.  For example:
>> >> https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1668129
>> >>
>> >> The additional small amount of balloon memory can cause other problems
>> >> as well, for example:
>> >> https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1518457
>> >>
>> >> Anyway, I'd like to ask, was the original commit added because
>> >> hypervisors are supposed to set their balloon target to the guest
>> >> system's number of phys pages (max_pfn - holes)?  The mailing list
>> >> discussion and commit description seem to indicate that.
>> >
>> >
>> >
>> > IIRC the problem that this was trying to fix was that since max_pfn 
>> > includes
>> > holes, upon booting we'd immediately balloon down by the (typically, MMIO)
>> > hole size.
>> >
>> > If you boot a guest with ~4+GB memory you should see this.
>> >
>> >
>> >> However I'm
>> >> not sure how that is possible, because the kernel reserves its own
>> >> holes, regardless of any predefined holes in the e820 map; for
>> >> example, the kernel reserves 64k (by default) at phys addr 0 (the
>> >> amount of reservation is configurable via CONFIG_X86_RESERVE_LOW).  So
>> >> the hypervisor really has no way to know what the "right" target to
>> >> specify is; unless it knows the exact guest OS and kernel version, and
>> >> kernel config values, it will never be able to correctly specify its
>> >> target to be exactly (e820 max_pfn - all holes).
>> >>
>> >> Should this commit be reverted?  Should the xen balloon target be
>> >> adjusted based on kernel-added e820 holes?
>> >
>> >
>> > I think the second one but shouldn't current_pages be updated, and not the
>> > target? The latter is set by Xen (toolstack, via xenstore usually).
>> >
>> > Also, the bugs above (at least one of them) talk about NVMe and I wonder
>> > whether the memory that they add is of RAM type --- I believe it has its 
>> > own
>> > type and so perhaps that introduces additional inconsistencies. AWS may 
>> > have
>> > added their own support for that, which we don't have upstream yet.
>>
>> The type of memory doesn't have anything to do with it.
>>
>> The problem with NVMe is it's a passthrough device, so the guest talks
>> directly to the NVMe controller and does DMA with it.  But the
>> hypervisor does swiotlb translation between the guest physical memory,
>
> Um, the hypervisor does not have SWIOTLB support, only IOMMU support.

Re: maybe revert commit c275a57f5ec3 "xen/balloon: Set balloon's initial state to number of existing RAM pages"

2017-03-24 Thread Dan Streetman
On Fri, Mar 24, 2017 at 5:10 PM, Konrad Rzeszutek Wilk
 wrote:
> On Fri, Mar 24, 2017 at 04:34:23PM -0400, Dan Streetman wrote:
>> On Wed, Mar 22, 2017 at 10:13 PM, Boris Ostrovsky
>>  wrote:
>> >
>> >
>> > On 03/22/2017 05:16 PM, Dan Streetman wrote:
>> >>
>> >> I have a question about a problem introduced by this commit:
>> >> c275a57f5ec3056f732843b11659d892235faff7
>> >> "xen/balloon: Set balloon's initial state to number of existing RAM pages"
>> >>
>> >> It changed the xen balloon current_pages calculation to start with the
>> >> number of physical pages in the system, instead of max_pfn.  Since
>> >> get_num_physpages() does not include holes, it's always less than the
>> >> e820 map's max_pfn.
>> >>
>> >> However, the problem that commit introduced is, if the hypervisor sets
>> >> the balloon target to equal to the e820 map's max_pfn, then the
>> >> balloon target will *always* be higher than the initial current pages.
>> >> Even if the hypervisor sets the target to (e820 max_pfn - holes), if
>> >> the OS adds any holes, the balloon target will be higher than the
>> >> current pages.  This is the situation, for example, for Amazon AWS
>> >> instances.  The result is, the xen balloon will always immediately
>> >> hotplug some memory at boot, but then make only (max_pfn -
>> >> get_num_physpages()) available to the system.
>> >>
>> >> This balloon-hotplugged memory can cause problems, if the hypervisor
>> >> wasn't expecting it; specifically, the system's physical page
>> >> addresses now will exceed the e820 map's max_pfn, due to the
>> >> balloon-hotplugged pages; if the hypervisor isn't expecting pt-device
>> >> DMA to/from those physical pages above the e820 max_pfn, it causes
>> >> problems.  For example:
>> >> https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1668129
>> >>
>> >> The additional small amount of balloon memory can cause other problems
>> >> as well, for example:
>> >> https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1518457
>> >>
>> >> Anyway, I'd like to ask, was the original commit added because
>> >> hypervisors are supposed to set their balloon target to the guest
>> >> system's number of phys pages (max_pfn - holes)?  The mailing list
>> >> discussion and commit description seem to indicate that.
>> >
>> >
>> >
>> > IIRC the problem that this was trying to fix was that since max_pfn 
>> > includes
>> > holes, upon booting we'd immediately balloon down by the (typically, MMIO)
>> > hole size.
>> >
>> > If you boot a guest with ~4+GB memory you should see this.
>> >
>> >
>> >> However I'm
>> >> not sure how that is possible, because the kernel reserves its own
>> >> holes, regardless of any predefined holes in the e820 map; for
>> >> example, the kernel reserves 64k (by default) at phys addr 0 (the
>> >> amount of reservation is configurable via CONFIG_X86_RESERVE_LOW).  So
>> >> the hypervisor really has no way to know what the "right" target to
>> >> specify is; unless it knows the exact guest OS and kernel version, and
>> >> kernel config values, it will never be able to correctly specify its
>> >> target to be exactly (e820 max_pfn - all holes).
>> >>
>> >> Should this commit be reverted?  Should the xen balloon target be
>> >> adjusted based on kernel-added e820 holes?
>> >
>> >
>> > I think the second one but shouldn't current_pages be updated, and not the
>> > target? The latter is set by Xen (toolstack, via xenstore usually).
>> >
>> > Also, the bugs above (at least one of them) talk about NVMe and I wonder
>> > whether the memory that they add is of RAM type --- I believe it has its 
>> > own
>> > type and so perhaps that introduces additional inconsistencies. AWS may 
>> > have
>> > added their own support for that, which we don't have upstream yet.
>>
>> The type of memory doesn't have anything to do with it.
>>
>> The problem with NVMe is it's a passthrough device, so the guest talks
>> directly to the NVMe controller and does DMA with it.  But the
>> hypervisor does swiotlb translation between the guest physical memory,
>
> Um, the hypervisor does not have SWIOTLB support, only IOMMU support.

he

Re: maybe revert commit c275a57f5ec3 "xen/balloon: Set balloon's initial state to number of existing RAM pages"

2017-03-24 Thread Dan Streetman
On Wed, Mar 22, 2017 at 10:13 PM, Boris Ostrovsky
<boris.ostrov...@oracle.com> wrote:
>
>
> On 03/22/2017 05:16 PM, Dan Streetman wrote:
>>
>> I have a question about a problem introduced by this commit:
>> c275a57f5ec3056f732843b11659d892235faff7
>> "xen/balloon: Set balloon's initial state to number of existing RAM pages"
>>
>> It changed the xen balloon current_pages calculation to start with the
>> number of physical pages in the system, instead of max_pfn.  Since
>> get_num_physpages() does not include holes, it's always less than the
>> e820 map's max_pfn.
>>
>> However, the problem that commit introduced is, if the hypervisor sets
>> the balloon target to equal to the e820 map's max_pfn, then the
>> balloon target will *always* be higher than the initial current pages.
>> Even if the hypervisor sets the target to (e820 max_pfn - holes), if
>> the OS adds any holes, the balloon target will be higher than the
>> current pages.  This is the situation, for example, for Amazon AWS
>> instances.  The result is, the xen balloon will always immediately
>> hotplug some memory at boot, but then make only (max_pfn -
>> get_num_physpages()) available to the system.
>>
>> This balloon-hotplugged memory can cause problems, if the hypervisor
>> wasn't expecting it; specifically, the system's physical page
>> addresses now will exceed the e820 map's max_pfn, due to the
>> balloon-hotplugged pages; if the hypervisor isn't expecting pt-device
>> DMA to/from those physical pages above the e820 max_pfn, it causes
>> problems.  For example:
>> https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1668129
>>
>> The additional small amount of balloon memory can cause other problems
>> as well, for example:
>> https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1518457
>>
>> Anyway, I'd like to ask, was the original commit added because
>> hypervisors are supposed to set their balloon target to the guest
>> system's number of phys pages (max_pfn - holes)?  The mailing list
>> discussion and commit description seem to indicate that.
>
>
>
> IIRC the problem that this was trying to fix was that since max_pfn includes
> holes, upon booting we'd immediately balloon down by the (typically, MMIO)
> hole size.
>
> If you boot a guest with ~4+GB memory you should see this.
>
>
>> However I'm
>> not sure how that is possible, because the kernel reserves its own
>> holes, regardless of any predefined holes in the e820 map; for
>> example, the kernel reserves 64k (by default) at phys addr 0 (the
>> amount of reservation is configurable via CONFIG_X86_RESERVE_LOW).  So
>> the hypervisor really has no way to know what the "right" target to
>> specify is; unless it knows the exact guest OS and kernel version, and
>> kernel config values, it will never be able to correctly specify its
>> target to be exactly (e820 max_pfn - all holes).
>>
>> Should this commit be reverted?  Should the xen balloon target be
>> adjusted based on kernel-added e820 holes?
>
>
> I think the second one but shouldn't current_pages be updated, and not the
> target? The latter is set by Xen (toolstack, via xenstore usually).
>
> Also, the bugs above (at least one of them) talk about NVMe and I wonder
> whether the memory that they add is of RAM type --- I believe it has its own
> type and so perhaps that introduces additional inconsistencies. AWS may have
> added their own support for that, which we don't have upstream yet.

The type of memory doesn't have anything to do with it.

The problem with NVMe is it's a passthrough device, so the guest talks
directly to the NVMe controller and does DMA with it.  But the
hypervisor does swiotlb translation between the guest physical memory,
and the host physical memory, so that the NVMe device can correctly
DMA to the right memory in the host.

However, the hypervisor only has the guest's physical memory up to the
max e820 pfn mapped; it didn't expect the balloon driver to hotplug
any additional memory above the e820 max pfn, so when the NVMe driver
in the guest tries to tell the NVMe controller to DMA to that
balloon-hotplugged memory, the hypervisor fails the NVMe request,
because it can't do the guest-to-host phys mem mapping, since the
guest phys address is outside the expected max range.



>
> -boris
>
>
>
>> Should something else be
>> done?
>>
>> For context, Amazon Linux has simply disabled Xen ballooning
>> completely.  Likewise, we're planning to disable Xen ballooning in the
>> Ubuntu kernel for Amazon AWS-specific kernels (but not for non-AWS
>> Ubuntu kernels).  However, if reverting this patch makes sense in a
>> bigger context (i.e. Xen users besides AWS), that would allow more
>> Ubuntu kernels to work correctly in AWS instances.
>>
>


Re: maybe revert commit c275a57f5ec3 "xen/balloon: Set balloon's initial state to number of existing RAM pages"

2017-03-24 Thread Dan Streetman
On Wed, Mar 22, 2017 at 10:13 PM, Boris Ostrovsky
 wrote:
>
>
> On 03/22/2017 05:16 PM, Dan Streetman wrote:
>>
>> I have a question about a problem introduced by this commit:
>> c275a57f5ec3056f732843b11659d892235faff7
>> "xen/balloon: Set balloon's initial state to number of existing RAM pages"
>>
>> It changed the xen balloon current_pages calculation to start with the
>> number of physical pages in the system, instead of max_pfn.  Since
>> get_num_physpages() does not include holes, it's always less than the
>> e820 map's max_pfn.
>>
>> However, the problem that commit introduced is, if the hypervisor sets
>> the balloon target to equal to the e820 map's max_pfn, then the
>> balloon target will *always* be higher than the initial current pages.
>> Even if the hypervisor sets the target to (e820 max_pfn - holes), if
>> the OS adds any holes, the balloon target will be higher than the
>> current pages.  This is the situation, for example, for Amazon AWS
>> instances.  The result is, the xen balloon will always immediately
>> hotplug some memory at boot, but then make only (max_pfn -
>> get_num_physpages()) available to the system.
>>
>> This balloon-hotplugged memory can cause problems, if the hypervisor
>> wasn't expecting it; specifically, the system's physical page
>> addresses now will exceed the e820 map's max_pfn, due to the
>> balloon-hotplugged pages; if the hypervisor isn't expecting pt-device
>> DMA to/from those physical pages above the e820 max_pfn, it causes
>> problems.  For example:
>> https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1668129
>>
>> The additional small amount of balloon memory can cause other problems
>> as well, for example:
>> https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1518457
>>
>> Anyway, I'd like to ask, was the original commit added because
>> hypervisors are supposed to set their balloon target to the guest
>> system's number of phys pages (max_pfn - holes)?  The mailing list
>> discussion and commit description seem to indicate that.
>
>
>
> IIRC the problem that this was trying to fix was that since max_pfn includes
> holes, upon booting we'd immediately balloon down by the (typically, MMIO)
> hole size.
>
> If you boot a guest with ~4+GB memory you should see this.
>
>
>> However I'm
>> not sure how that is possible, because the kernel reserves its own
>> holes, regardless of any predefined holes in the e820 map; for
>> example, the kernel reserves 64k (by default) at phys addr 0 (the
>> amount of reservation is configurable via CONFIG_X86_RESERVE_LOW).  So
>> the hypervisor really has no way to know what the "right" target to
>> specify is; unless it knows the exact guest OS and kernel version, and
>> kernel config values, it will never be able to correctly specify its
>> target to be exactly (e820 max_pfn - all holes).
>>
>> Should this commit be reverted?  Should the xen balloon target be
>> adjusted based on kernel-added e820 holes?
>
>
> I think the second one but shouldn't current_pages be updated, and not the
> target? The latter is set by Xen (toolstack, via xenstore usually).
>
> Also, the bugs above (at least one of them) talk about NVMe and I wonder
> whether the memory that they add is of RAM type --- I believe it has its own
> type and so perhaps that introduces additional inconsistencies. AWS may have
> added their own support for that, which we don't have upstream yet.

The type of memory doesn't have anything to do with it.

The problem with NVMe is it's a passthrough device, so the guest talks
directly to the NVMe controller and does DMA with it.  But the
hypervisor does swiotlb translation between the guest physical memory,
and the host physical memory, so that the NVMe device can correctly
DMA to the right memory in the host.

However, the hypervisor only has the guest's physical memory up to the
max e820 pfn mapped; it didn't expect the balloon driver to hotplug
any additional memory above the e820 max pfn, so when the NVMe driver
in the guest tries to tell the NVMe controller to DMA to that
balloon-hotplugged memory, the hypervisor fails the NVMe request,
because it can't do the guest-to-host phys mem mapping, since the
guest phys address is outside the expected max range.



>
> -boris
>
>
>
>> Should something else be
>> done?
>>
>> For context, Amazon Linux has simply disabled Xen ballooning
>> completely.  Likewise, we're planning to disable Xen ballooning in the
>> Ubuntu kernel for Amazon AWS-specific kernels (but not for non-AWS
>> Ubuntu kernels).  However, if reverting this patch makes sense in a
>> bigger context (i.e. Xen users besides AWS), that would allow more
>> Ubuntu kernels to work correctly in AWS instances.
>>
>


Re: maybe revert commit c275a57f5ec3 "xen/balloon: Set balloon's initial state to number of existing RAM pages"

2017-03-24 Thread Dan Streetman
On Thu, Mar 23, 2017 at 3:56 AM, Juergen Gross <jgr...@suse.com> wrote:
> On 23/03/17 03:13, Boris Ostrovsky wrote:
>>
>>
>> On 03/22/2017 05:16 PM, Dan Streetman wrote:
>>> I have a question about a problem introduced by this commit:
>>> c275a57f5ec3056f732843b11659d892235faff7
>>> "xen/balloon: Set balloon's initial state to number of existing RAM
>>> pages"
>>>
>>> It changed the xen balloon current_pages calculation to start with the
>>> number of physical pages in the system, instead of max_pfn.  Since
>>> get_num_physpages() does not include holes, it's always less than the
>>> e820 map's max_pfn.
>>>
>>> However, the problem that commit introduced is, if the hypervisor sets
>>> the balloon target to equal to the e820 map's max_pfn, then the
>>> balloon target will *always* be higher than the initial current pages.
>>> Even if the hypervisor sets the target to (e820 max_pfn - holes), if
>>> the OS adds any holes, the balloon target will be higher than the
>>> current pages.  This is the situation, for example, for Amazon AWS
>>> instances.  The result is, the xen balloon will always immediately
>>> hotplug some memory at boot, but then make only (max_pfn -
>>> get_num_physpages()) available to the system.
>>>
>>> This balloon-hotplugged memory can cause problems, if the hypervisor
>>> wasn't expecting it; specifically, the system's physical page
>>> addresses now will exceed the e820 map's max_pfn, due to the
>>> balloon-hotplugged pages; if the hypervisor isn't expecting pt-device
>>> DMA to/from those physical pages above the e820 max_pfn, it causes
>>> problems.  For example:
>>> https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1668129
>>>
>>> The additional small amount of balloon memory can cause other problems
>>> as well, for example:
>>> https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1518457
>>>
>>> Anyway, I'd like to ask, was the original commit added because
>>> hypervisors are supposed to set their balloon target to the guest
>>> system's number of phys pages (max_pfn - holes)?  The mailing list
>>> discussion and commit description seem to indicate that.
>>
>>
>> IIRC the problem that this was trying to fix was that since max_pfn
>> includes holes, upon booting we'd immediately balloon down by the
>> (typically, MMIO) hole size.
>>
>> If you boot a guest with ~4+GB memory you should see this.
>>
>>
>>> However I'm
>>> not sure how that is possible, because the kernel reserves its own
>>> holes, regardless of any predefined holes in the e820 map; for
>>> example, the kernel reserves 64k (by default) at phys addr 0 (the
>>> amount of reservation is configurable via CONFIG_X86_RESERVE_LOW).  So
>>> the hypervisor really has no way to know what the "right" target to
>>> specify is; unless it knows the exact guest OS and kernel version, and
>>> kernel config values, it will never be able to correctly specify its
>>> target to be exactly (e820 max_pfn - all holes).
>>>
>>> Should this commit be reverted?  Should the xen balloon target be
>>> adjusted based on kernel-added e820 holes?
>>
>> I think the second one but shouldn't current_pages be updated, and not
>> the target? The latter is set by Xen (toolstack, via xenstore usually).
>
> Right.
>
> Looking into a HVM domU I can't see any problem related to
> CONFIG_X86_RESERVE_LOW: it is set to 64 on my system. The domU is

sorry I brought that up; I was only giving an example.  It's not
directly relevant to this and may have distracted from the actual
problem; in fact on closer inspection, the X86_RESERVE_LOW is using
memblock_reserve(), which removes it from managed memory but not the
e820 map (and thus doesn't remove it from get_num_physpages()).  Only
phys page 0 is actually reserved in the e820 map.

> configured with 2048 MB of RAM, 8MB being video RAM. Looking into
> /sys/devices/system/xen_memory/xen_memory0 I can see the current
> size and target size do match: both are 2088960 kB (2 GB - 8 MB).
>
> Ballooning down and up to 2048 MB again doesn't change the picture.
>
> So which additional holes are added by the kernel on AWS via which
> functions?

I'll use two AWS types as examples, t2.micro (1G mem) and t2.large (8G mem).

In the micro, the results of ballooning are obvious, because the
hotplugged memory always goes into the Normal zone; but since the base
memory is only 1g, it's contained entirely in the DMA32/DMA zones.  So
we get:

$ grep

Re: maybe revert commit c275a57f5ec3 "xen/balloon: Set balloon's initial state to number of existing RAM pages"

2017-03-24 Thread Dan Streetman
On Thu, Mar 23, 2017 at 3:56 AM, Juergen Gross  wrote:
> On 23/03/17 03:13, Boris Ostrovsky wrote:
>>
>>
>> On 03/22/2017 05:16 PM, Dan Streetman wrote:
>>> I have a question about a problem introduced by this commit:
>>> c275a57f5ec3056f732843b11659d892235faff7
>>> "xen/balloon: Set balloon's initial state to number of existing RAM
>>> pages"
>>>
>>> It changed the xen balloon current_pages calculation to start with the
>>> number of physical pages in the system, instead of max_pfn.  Since
>>> get_num_physpages() does not include holes, it's always less than the
>>> e820 map's max_pfn.
>>>
>>> However, the problem that commit introduced is, if the hypervisor sets
>>> the balloon target to equal to the e820 map's max_pfn, then the
>>> balloon target will *always* be higher than the initial current pages.
>>> Even if the hypervisor sets the target to (e820 max_pfn - holes), if
>>> the OS adds any holes, the balloon target will be higher than the
>>> current pages.  This is the situation, for example, for Amazon AWS
>>> instances.  The result is, the xen balloon will always immediately
>>> hotplug some memory at boot, but then make only (max_pfn -
>>> get_num_physpages()) available to the system.
>>>
>>> This balloon-hotplugged memory can cause problems, if the hypervisor
>>> wasn't expecting it; specifically, the system's physical page
>>> addresses now will exceed the e820 map's max_pfn, due to the
>>> balloon-hotplugged pages; if the hypervisor isn't expecting pt-device
>>> DMA to/from those physical pages above the e820 max_pfn, it causes
>>> problems.  For example:
>>> https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1668129
>>>
>>> The additional small amount of balloon memory can cause other problems
>>> as well, for example:
>>> https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1518457
>>>
>>> Anyway, I'd like to ask, was the original commit added because
>>> hypervisors are supposed to set their balloon target to the guest
>>> system's number of phys pages (max_pfn - holes)?  The mailing list
>>> discussion and commit description seem to indicate that.
>>
>>
>> IIRC the problem that this was trying to fix was that since max_pfn
>> includes holes, upon booting we'd immediately balloon down by the
>> (typically, MMIO) hole size.
>>
>> If you boot a guest with ~4+GB memory you should see this.
>>
>>
>>> However I'm
>>> not sure how that is possible, because the kernel reserves its own
>>> holes, regardless of any predefined holes in the e820 map; for
>>> example, the kernel reserves 64k (by default) at phys addr 0 (the
>>> amount of reservation is configurable via CONFIG_X86_RESERVE_LOW).  So
>>> the hypervisor really has no way to know what the "right" target to
>>> specify is; unless it knows the exact guest OS and kernel version, and
>>> kernel config values, it will never be able to correctly specify its
>>> target to be exactly (e820 max_pfn - all holes).
>>>
>>> Should this commit be reverted?  Should the xen balloon target be
>>> adjusted based on kernel-added e820 holes?
>>
>> I think the second one but shouldn't current_pages be updated, and not
>> the target? The latter is set by Xen (toolstack, via xenstore usually).
>
> Right.
>
> Looking into a HVM domU I can't see any problem related to
> CONFIG_X86_RESERVE_LOW: it is set to 64 on my system. The domU is

sorry I brought that up; I was only giving an example.  It's not
directly relevant to this and may have distracted from the actual
problem; in fact on closer inspection, the X86_RESERVE_LOW is using
memblock_reserve(), which removes it from managed memory but not the
e820 map (and thus doesn't remove it from get_num_physpages()).  Only
phys page 0 is actually reserved in the e820 map.

> configured with 2048 MB of RAM, 8MB being video RAM. Looking into
> /sys/devices/system/xen_memory/xen_memory0 I can see the current
> size and target size do match: both are 2088960 kB (2 GB - 8 MB).
>
> Ballooning down and up to 2048 MB again doesn't change the picture.
>
> So which additional holes are added by the kernel on AWS via which
> functions?

I'll use two AWS types as examples, t2.micro (1G mem) and t2.large (8G mem).

In the micro, the results of ballooning are obvious, because the
hotplugged memory always goes into the Normal zone; but since the base
memory is only 1g, it's contained entirely in the DMA32/DMA zones.  So
we get:

$ grep -E '(start_pfn|p

maybe revert commit c275a57f5ec3 "xen/balloon: Set balloon's initial state to number of existing RAM pages"

2017-03-22 Thread Dan Streetman
I have a question about a problem introduced by this commit:
c275a57f5ec3056f732843b11659d892235faff7
"xen/balloon: Set balloon's initial state to number of existing RAM pages"

It changed the xen balloon current_pages calculation to start with the
number of physical pages in the system, instead of max_pfn.  Since
get_num_physpages() does not include holes, it's always less than the
e820 map's max_pfn.

However, the problem that commit introduced is, if the hypervisor sets
the balloon target to equal to the e820 map's max_pfn, then the
balloon target will *always* be higher than the initial current pages.
Even if the hypervisor sets the target to (e820 max_pfn - holes), if
the OS adds any holes, the balloon target will be higher than the
current pages.  This is the situation, for example, for Amazon AWS
instances.  The result is, the xen balloon will always immediately
hotplug some memory at boot, but then make only (max_pfn -
get_num_physpages()) available to the system.

This balloon-hotplugged memory can cause problems, if the hypervisor
wasn't expecting it; specifically, the system's physical page
addresses now will exceed the e820 map's max_pfn, due to the
balloon-hotplugged pages; if the hypervisor isn't expecting pt-device
DMA to/from those physical pages above the e820 max_pfn, it causes
problems.  For example:
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1668129

The additional small amount of balloon memory can cause other problems
as well, for example:
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1518457

Anyway, I'd like to ask, was the original commit added because
hypervisors are supposed to set their balloon target to the guest
system's number of phys pages (max_pfn - holes)?  The mailing list
discussion and commit description seem to indicate that.  However I'm
not sure how that is possible, because the kernel reserves its own
holes, regardless of any predefined holes in the e820 map; for
example, the kernel reserves 64k (by default) at phys addr 0 (the
amount of reservation is configurable via CONFIG_X86_RESERVE_LOW).  So
the hypervisor really has no way to know what the "right" target to
specify is; unless it knows the exact guest OS and kernel version, and
kernel config values, it will never be able to correctly specify its
target to be exactly (e820 max_pfn - all holes).

Should this commit be reverted?  Should the xen balloon target be
adjusted based on kernel-added e820 holes?  Should something else be
done?

For context, Amazon Linux has simply disabled Xen ballooning
completely.  Likewise, we're planning to disable Xen ballooning in the
Ubuntu kernel for Amazon AWS-specific kernels (but not for non-AWS
Ubuntu kernels).  However, if reverting this patch makes sense in a
bigger context (i.e. Xen users besides AWS), that would allow more
Ubuntu kernels to work correctly in AWS instances.


maybe revert commit c275a57f5ec3 "xen/balloon: Set balloon's initial state to number of existing RAM pages"

2017-03-22 Thread Dan Streetman
I have a question about a problem introduced by this commit:
c275a57f5ec3056f732843b11659d892235faff7
"xen/balloon: Set balloon's initial state to number of existing RAM pages"

It changed the xen balloon current_pages calculation to start with the
number of physical pages in the system, instead of max_pfn.  Since
get_num_physpages() does not include holes, it's always less than the
e820 map's max_pfn.

However, the problem that commit introduced is, if the hypervisor sets
the balloon target to equal to the e820 map's max_pfn, then the
balloon target will *always* be higher than the initial current pages.
Even if the hypervisor sets the target to (e820 max_pfn - holes), if
the OS adds any holes, the balloon target will be higher than the
current pages.  This is the situation, for example, for Amazon AWS
instances.  The result is, the xen balloon will always immediately
hotplug some memory at boot, but then make only (max_pfn -
get_num_physpages()) available to the system.

This balloon-hotplugged memory can cause problems, if the hypervisor
wasn't expecting it; specifically, the system's physical page
addresses now will exceed the e820 map's max_pfn, due to the
balloon-hotplugged pages; if the hypervisor isn't expecting pt-device
DMA to/from those physical pages above the e820 max_pfn, it causes
problems.  For example:
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1668129

The additional small amount of balloon memory can cause other problems
as well, for example:
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1518457

Anyway, I'd like to ask, was the original commit added because
hypervisors are supposed to set their balloon target to the guest
system's number of phys pages (max_pfn - holes)?  The mailing list
discussion and commit description seem to indicate that.  However I'm
not sure how that is possible, because the kernel reserves its own
holes, regardless of any predefined holes in the e820 map; for
example, the kernel reserves 64k (by default) at phys addr 0 (the
amount of reservation is configurable via CONFIG_X86_RESERVE_LOW).  So
the hypervisor really has no way to know what the "right" target to
specify is; unless it knows the exact guest OS and kernel version, and
kernel config values, it will never be able to correctly specify its
target to be exactly (e820 max_pfn - all holes).

Should this commit be reverted?  Should the xen balloon target be
adjusted based on kernel-added e820 holes?  Should something else be
done?

For context, Amazon Linux has simply disabled Xen ballooning
completely.  Likewise, we're planning to disable Xen ballooning in the
Ubuntu kernel for Amazon AWS-specific kernels (but not for non-AWS
Ubuntu kernels).  However, if reverting this patch makes sense in a
bigger context (i.e. Xen users besides AWS), that would allow more
Ubuntu kernels to work correctly in AWS instances.


Re: [PATCH v2 1/1] mm: zswap - Add crypto acomp/scomp framework support

2017-03-16 Thread Dan Streetman
On Thu, Mar 16, 2017 at 12:33 PM, Herbert Xu
<herb...@gondor.apana.org.au> wrote:
> On Wed, Mar 08, 2017 at 12:38:40PM -0500, Dan Streetman wrote:
>>
>>
>> setting the ASYNC bit makes it synchronous?  that seems backwards...?
>
> You set the ASYNC bit in the mask and leave it clear in the type.
> That way only algorithms with the ASYNC bit off will match.

aha, ok i get it now.

>> zswap gets the fun of being the first crypto compression consumer to
>> switch to the new api? ;-)
>
> BTW I think we should hold off on converting zswap for now.

ok that sounds good, we can wait.  Thanks!


Re: [PATCH v2 1/1] mm: zswap - Add crypto acomp/scomp framework support

2017-03-16 Thread Dan Streetman
On Thu, Mar 16, 2017 at 12:33 PM, Herbert Xu
 wrote:
> On Wed, Mar 08, 2017 at 12:38:40PM -0500, Dan Streetman wrote:
>>
>>
>> setting the ASYNC bit makes it synchronous?  that seems backwards...?
>
> You set the ASYNC bit in the mask and leave it clear in the type.
> That way only algorithms with the ASYNC bit off will match.

aha, ok i get it now.

>> zswap gets the fun of being the first crypto compression consumer to
>> switch to the new api? ;-)
>
> BTW I think we should hold off on converting zswap for now.

ok that sounds good, we can wait.  Thanks!


Re: [PATCH v2 1/1] mm: zswap - Add crypto acomp/scomp framework support

2017-03-16 Thread Dan Streetman
On Thu, Mar 9, 2017 at 4:39 AM, Herbert Xu <herb...@gondor.apana.org.au> wrote:
> On Wed, Mar 08, 2017 at 12:38:40PM -0500, Dan Streetman wrote:
>>
>> It looks like the crypto_scomp interface is buried under
>> include/crypto/internal/scompress.h, however that's exactly what zswap
>> should be using.  We don't need to switch to an asynchronous interface
>> that's rather significantly more complicated, and then use it in a
>> synchronous way.  The crypto_scomp interface should probably be made
>> public, not an implementation internal.
>
> No scomp is not meant to be used externally.  We provide exactly
> one compression interface and it's acomp.  acomp can be used
> synchronously by setting the CRYPTO_ALG_ASYNC bit in the mask
> field when allocating the algorithm.

setting the ASYNC bit makes it synchronous?  that seems backwards...?

Anyway, I have a few concerns about moving over to using that first,
specifically:

- no docs on acomp in Documentation/crypto/ that I can see
- no place in the crypto code that I can see that parses ALG_ASYNC to
make the crypto_acomp_compress() call synchronous
- no synchronous test in crypto/testmgr.c

Maybe I'm reading the code wrong, but it looks like any compression
backend that is actually scomp, actually does the (de)compression
synchronously.  In crypto_acomp_init_tfm(), if the tfm is not
crypto_acomp_type (and I assume because all the current
implementations register as scomp, they aren't acomp_type) it calls
crypto_init_scomp_ops_async(), which then sets ->compress to
scomp_acomp_compress() and that function appears to directly call the
scomp compression function.  This is just after a very quick look, so
maybe I'm reading it wrong.  I'll look some more, and also add a
synchronous testmgr test so i can understand how it works better.

Is the acomp interface fully ready for use?

>
> The existing compression interface will be phased out.
>
> Cheers,
> --
> Email: Herbert Xu <herb...@gondor.apana.org.au>
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH v2 1/1] mm: zswap - Add crypto acomp/scomp framework support

2017-03-16 Thread Dan Streetman
On Thu, Mar 9, 2017 at 4:39 AM, Herbert Xu  wrote:
> On Wed, Mar 08, 2017 at 12:38:40PM -0500, Dan Streetman wrote:
>>
>> It looks like the crypto_scomp interface is buried under
>> include/crypto/internal/scompress.h, however that's exactly what zswap
>> should be using.  We don't need to switch to an asynchronous interface
>> that's rather significantly more complicated, and then use it in a
>> synchronous way.  The crypto_scomp interface should probably be made
>> public, not an implementation internal.
>
> No scomp is not meant to be used externally.  We provide exactly
> one compression interface and it's acomp.  acomp can be used
> synchronously by setting the CRYPTO_ALG_ASYNC bit in the mask
> field when allocating the algorithm.

setting the ASYNC bit makes it synchronous?  that seems backwards...?

Anyway, I have a few concerns about moving over to using that first,
specifically:

- no docs on acomp in Documentation/crypto/ that I can see
- no place in the crypto code that I can see that parses ALG_ASYNC to
make the crypto_acomp_compress() call synchronous
- no synchronous test in crypto/testmgr.c

Maybe I'm reading the code wrong, but it looks like any compression
backend that is actually scomp, actually does the (de)compression
synchronously.  In crypto_acomp_init_tfm(), if the tfm is not
crypto_acomp_type (and I assume because all the current
implementations register as scomp, they aren't acomp_type) it calls
crypto_init_scomp_ops_async(), which then sets ->compress to
scomp_acomp_compress() and that function appears to directly call the
scomp compression function.  This is just after a very quick look, so
maybe I'm reading it wrong.  I'll look some more, and also add a
synchronous testmgr test so i can understand how it works better.

Is the acomp interface fully ready for use?

>
> The existing compression interface will be phased out.
>
> Cheers,
> --
> Email: Herbert Xu 
> Home Page: http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH v2 1/1] mm: zswap - Add crypto acomp/scomp framework support

2017-03-08 Thread Dan Streetman
On Mon, Feb 27, 2017 at 9:40 AM, Mahipal Reddy
<mahipalreddy2...@gmail.com> wrote:
> Hi Dan,
> Thanks for your reply.
>
> On Sat, Feb 25, 2017 at 3:51 AM, Dan Streetman <ddstr...@ieee.org> wrote:
>> On Fri, Feb 24, 2017 at 11:05 AM, Mahipal Challa
>> <mahipal.cha...@cavium.com> wrote:
>>> This adds support for kernel's new crypto acomp/scomp framework
>>> to zswap.
>>
>> I don't understand the point of this, zswap can't compress pages
>> asynchronously, so what benefit do we get from using the async crypto
>> api and then immediately waiting for it to finish?  This seems like
>> it's just adding complexity for no reason?
>
> 1) The new crypto acomp/scomp framework, provides both synchronous and
> asynchronous comp/decomp
> functionality with the same async-crypto(acomp) 
> api(include/crypto/acompress.h).
>
> 2) Currently with new crypto acomp/scomp framework, the crypto
> sub-system(crypto/lzo.c, crypto/deflate.c)
> only supports synchronous mode of compression/decompression which
> meets the zswap requirement.
>
> 3) The new crypto acomp/scomp framework is introduced in the 4.10.xx kernel.
> With this new framework, according to Herbert Xu, existing crypto
> comp(CRYPTO_ALG_TYPE_COMPRESS ) api
> is going to be deprecated (which zswap uses).

zswap gets the fun of being the first crypto compression consumer to
switch to the new api? ;-)

It looks like the crypto_scomp interface is buried under
include/crypto/internal/scompress.h, however that's exactly what zswap
should be using.  We don't need to switch to an asynchronous interface
that's rather significantly more complicated, and then use it in a
synchronous way.  The crypto_scomp interface should probably be made
public, not an implementation internal.


>
> 4) Applications like zswap, which use comp/decomp of crypto subsystem,
> at some point will have to be ported to
> the new framework.
>
> Regards,
> -Mahipal
>
>>> Signed-off-by: Mahipal Challa <mahipal.cha...@cavium.com>
>>> Signed-off-by: Vishnu Nair <vishnu.n...@cavium.com>
>>> ---
>>>  mm/zswap.c | 192 
>>> +++--
>>>  1 file changed, 162 insertions(+), 30 deletions(-)
>>>
>>> diff --git a/mm/zswap.c b/mm/zswap.c
>>> index cabf09e..b29d109 100644
>>> --- a/mm/zswap.c
>>> +++ b/mm/zswap.c
>>> @@ -33,8 +33,10 @@
>>>  #include 
>>>  #include 
>>>  #include 
>>> +#include 
>>>  #include 
>>>  #include 
>>> +#include 
>>>
>>>  #include 
>>>  #include 
>>> @@ -118,9 +120,21 @@ static int zswap_compressor_param_set(const char *,
>>>  * data structures
>>>  **/
>>>
>>> +/**
>>> + * struct zswap_acomp_result - Data structure to store result of acomp 
>>> callback
>>> + * @completion: zswap will wait for completion on this entry
>>> + * @err   : return value from acomp algorithm will be stored here
>>> + */
>>> +struct zswap_acomp_result {
>>> +   struct completion completion;
>>> +   int err;
>>> +};
>>> +
>>>  struct zswap_pool {
>>> struct zpool *zpool;
>>> -   struct crypto_comp * __percpu *tfm;
>>> +   struct crypto_acomp * __percpu *acomp;
>>> +   struct acomp_req * __percpu *acomp_req;
>>> +   struct zswap_acomp_result * __percpu *result;
>>> struct kref kref;
>>> struct list_head list;
>>> struct work_struct work;
>>> @@ -388,30 +402,66 @@ static int zswap_dstmem_dead(unsigned int cpu)
>>>  static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node 
>>> *node)
>>>  {
>>> struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, 
>>> node);
>>> -   struct crypto_comp *tfm;
>>> +   struct crypto_acomp *acomp;
>>> +   struct acomp_req *acomp_req;
>>> +   struct zswap_acomp_result *result;
>>>
>>> -   if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
>>> +   if (WARN_ON(*per_cpu_ptr(pool->acomp, cpu)))
>>> return 0;
>>> +   if (WARN_ON(*per_cpu_ptr(pool->acomp_req, cpu)))
>>> +   return 0;
>>> +   if (WARN_ON(*per_cpu_ptr(pool->result, cpu)))
>>> +   return 0;
>>> +
>>> +   acomp = crypto_alloc_acomp(pool->tfm_name, 0, 0);
>>> +   if (IS_ERR_OR_NULL(acomp)) {
>>>

Re: [PATCH v2 1/1] mm: zswap - Add crypto acomp/scomp framework support

2017-03-08 Thread Dan Streetman
On Mon, Feb 27, 2017 at 9:40 AM, Mahipal Reddy
 wrote:
> Hi Dan,
> Thanks for your reply.
>
> On Sat, Feb 25, 2017 at 3:51 AM, Dan Streetman  wrote:
>> On Fri, Feb 24, 2017 at 11:05 AM, Mahipal Challa
>>  wrote:
>>> This adds support for kernel's new crypto acomp/scomp framework
>>> to zswap.
>>
>> I don't understand the point of this, zswap can't compress pages
>> asynchronously, so what benefit do we get from using the async crypto
>> api and then immediately waiting for it to finish?  This seems like
>> it's just adding complexity for no reason?
>
> 1) The new crypto acomp/scomp framework, provides both synchronous and
> asynchronous comp/decomp
> functionality with the same async-crypto(acomp) 
> api(include/crypto/acompress.h).
>
> 2) Currently with new crypto acomp/scomp framework, the crypto
> sub-system(crypto/lzo.c, crypto/deflate.c)
> only supports synchronous mode of compression/decompression which
> meets the zswap requirement.
>
> 3) The new crypto acomp/scomp framework is introduced in the 4.10.xx kernel.
> With this new framework, according to Herbert Xu, existing crypto
> comp(CRYPTO_ALG_TYPE_COMPRESS ) api
> is going to be deprecated (which zswap uses).

zswap gets the fun of being the first crypto compression consumer to
switch to the new api? ;-)

It looks like the crypto_scomp interface is buried under
include/crypto/internal/scompress.h, however that's exactly what zswap
should be using.  We don't need to switch to an asynchronous interface
that's rather significantly more complicated, and then use it in a
synchronous way.  The crypto_scomp interface should probably be made
public, not an implementation internal.


>
> 4) Applications like zswap, which use comp/decomp of crypto subsystem,
> at some point will have to be ported to
> the new framework.
>
> Regards,
> -Mahipal
>
>>> Signed-off-by: Mahipal Challa 
>>> Signed-off-by: Vishnu Nair 
>>> ---
>>>  mm/zswap.c | 192 
>>> +++--
>>>  1 file changed, 162 insertions(+), 30 deletions(-)
>>>
>>> diff --git a/mm/zswap.c b/mm/zswap.c
>>> index cabf09e..b29d109 100644
>>> --- a/mm/zswap.c
>>> +++ b/mm/zswap.c
>>> @@ -33,8 +33,10 @@
>>>  #include 
>>>  #include 
>>>  #include 
>>> +#include 
>>>  #include 
>>>  #include 
>>> +#include 
>>>
>>>  #include 
>>>  #include 
>>> @@ -118,9 +120,21 @@ static int zswap_compressor_param_set(const char *,
>>>  * data structures
>>>  **/
>>>
>>> +/**
>>> + * struct zswap_acomp_result - Data structure to store result of acomp 
>>> callback
>>> + * @completion: zswap will wait for completion on this entry
>>> + * @err   : return value from acomp algorithm will be stored here
>>> + */
>>> +struct zswap_acomp_result {
>>> +   struct completion completion;
>>> +   int err;
>>> +};
>>> +
>>>  struct zswap_pool {
>>> struct zpool *zpool;
>>> -   struct crypto_comp * __percpu *tfm;
>>> +   struct crypto_acomp * __percpu *acomp;
>>> +   struct acomp_req * __percpu *acomp_req;
>>> +   struct zswap_acomp_result * __percpu *result;
>>> struct kref kref;
>>> struct list_head list;
>>> struct work_struct work;
>>> @@ -388,30 +402,66 @@ static int zswap_dstmem_dead(unsigned int cpu)
>>>  static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node 
>>> *node)
>>>  {
>>> struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, 
>>> node);
>>> -   struct crypto_comp *tfm;
>>> +   struct crypto_acomp *acomp;
>>> +   struct acomp_req *acomp_req;
>>> +   struct zswap_acomp_result *result;
>>>
>>> -   if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
>>> +   if (WARN_ON(*per_cpu_ptr(pool->acomp, cpu)))
>>> return 0;
>>> +   if (WARN_ON(*per_cpu_ptr(pool->acomp_req, cpu)))
>>> +   return 0;
>>> +   if (WARN_ON(*per_cpu_ptr(pool->result, cpu)))
>>> +   return 0;
>>> +
>>> +   acomp = crypto_alloc_acomp(pool->tfm_name, 0, 0);
>>> +   if (IS_ERR_OR_NULL(acomp)) {
>>> +   pr_err("could not alloc crypto acomp %s : %ld\n",
>>> +  pool->tfm_name, PTR_ERR(acomp));
>>> +

Re: [PATCH] zswap: Zero-filled pages handling

2017-03-08 Thread Dan Streetman
On Wed, Mar 8, 2017 at 6:47 AM, Srividya Desireddy
<srividya...@samsung.com> wrote:
>
> On Sat, Mar 4, 2017 at 02:55 AM, Dan Streetman <ddstr...@ieee.org> wrote:
>> On Sat, Feb 25, 2017 at 12:18 PM, Sarbojit Ganguly
>> <unixman.linux...@gmail.com> wrote:
>>> On 25 February 2017 at 20:12, Srividya Desireddy
>>> <srividya...@samsung.com> wrote:
>>>> From: Srividya Desireddy <srividya...@samsung.com>
>>>> Date: Thu, 23 Feb 2017 15:04:06 +0530
>>>> Subject: [PATCH] zswap: Zero-filled pages handling
>>
>> your email is base64-encoded; please send plain text emails.
>>
>>>>
>>>> Zswap is a cache which compresses the pages that are being swapped out
>>>> and stores them into a dynamically allocated RAM-based memory pool.
>>>> Experiments have shown that around 10-20% of pages stored in zswap
>>>> are zero-filled pages (i.e. contents of the page are all zeros), but
>>
>> 20%?  that's a LOT of zero pages...which seems like applications are
>> wasting a lot of memory.  what kind of workload are you testing with?
>>
>
> I have tested this patch with different workloads on different devices.
> On Ubuntu PC with 2GB RAM, while executing kernel build and other test
> scripts ~15% of pages in zswap were zero pages. With multimedia workload
> more than 20% of zswap pages were found to be zero pages.
> On a ARM Quad Core 32-bit device with 1.5GB RAM an average 10% of zero
> pages were found on launching and relaunching 15 applications.
>
>>>> these pages are handled as normal pages by compressing and allocating
>>>> memory in the pool.
>>>>
>>>> This patch adds a check in zswap_frontswap_store() to identify zero-filled
>>>> page before compression of the page. If the page is a zero-filled page, set
>>>> zswap_entry.zeroflag and skip the compression of the page and alloction
>>>> of memory in zpool. In zswap_frontswap_load(), check if the zeroflag is
>>>> set for the page in zswap_entry. If the flag is set, memset the page with
>>>> zero. This saves the decompression time during load.
>>>>
>>>> The overall overhead caused to check for a zero-filled page is very minimal
>>>> when compared to the time saved by avoiding compression and allocation in
>>>> case of zero-filled pages. Although, compressed size of a zero-filled page
>>>> is very less, with this patch load time of a zero-filled page is reduced by
>>>> 80% when compared to baseline.
>>>
>>> Is it possible to share the benchmark details?
>>
>> Was there an answer to this?
>>
>
> This patch is tested on a ARM Quad Core 32-bit device with 1.5GB RAM by
> launching and relaunching different applications. With the patch, an
> average of 5000 zero pages found in zswap out of the ~50000 pages
> stored in zswap and application launch time improved by ~3%.
>
> Test Parameters BaselineWith patch  Improvement
> ---
> Total RAM   1343MB  1343MB
> Available RAM   451MB   445MB -6MB
> Avg. Memfree69MB70MB  1MB
> Avg. Swap Used  226MB   215MB -11MB
> Avg. App entry time 644msec 623msec   3%
>
> With patch, every page swapped to zswap is checked if it is a zero
> page or not and for all the zero pages compression and memory allocation
> operations are skipped. Overall there is an improvement of 30% in zswap
> store time.
> In case of non-zero pages there is no overhead during zswap page load. For
> zero pages there is a improvement of more than 60% in the zswap load time
> as the zero page decompression is avoided.
>
> The below table shows the execution time profiling of the patch.
>
> Zswap Store Operation BaselineWith patch  % Improvement
> --
> * Zero page check-- 22.5ms
>  (for non-zero pages)
> * Zero page check-- 24ms
>  (for zero pages)
> * Compression time  55ms --
>  (of zero pages)
> * Allocation time   14ms --
>  (to store compressed
>   zero pages)
> -
> Total   69ms46.5ms 32%
>
> Zswap Load Operation BaselineWith patch  % Improvement
> -
> * Decompression time  30.4ms--
>  (of zero pages)
> * Zero page check +--

Re: [PATCH] zswap: Zero-filled pages handling

2017-03-08 Thread Dan Streetman
On Wed, Mar 8, 2017 at 6:47 AM, Srividya Desireddy
 wrote:
>
> On Sat, Mar 4, 2017 at 02:55 AM, Dan Streetman  wrote:
>> On Sat, Feb 25, 2017 at 12:18 PM, Sarbojit Ganguly
>>  wrote:
>>> On 25 February 2017 at 20:12, Srividya Desireddy
>>>  wrote:
>>>> From: Srividya Desireddy 
>>>> Date: Thu, 23 Feb 2017 15:04:06 +0530
>>>> Subject: [PATCH] zswap: Zero-filled pages handling
>>
>> your email is base64-encoded; please send plain text emails.
>>
>>>>
>>>> Zswap is a cache which compresses the pages that are being swapped out
>>>> and stores them into a dynamically allocated RAM-based memory pool.
>>>> Experiments have shown that around 10-20% of pages stored in zswap
>>>> are zero-filled pages (i.e. contents of the page are all zeros), but
>>
>> 20%?  that's a LOT of zero pages...which seems like applications are
>> wasting a lot of memory.  what kind of workload are you testing with?
>>
>
> I have tested this patch with different workloads on different devices.
> On Ubuntu PC with 2GB RAM, while executing kernel build and other test
> scripts ~15% of pages in zswap were zero pages. With multimedia workload
> more than 20% of zswap pages were found to be zero pages.
> On a ARM Quad Core 32-bit device with 1.5GB RAM an average 10% of zero
> pages were found on launching and relaunching 15 applications.
>
>>>> these pages are handled as normal pages by compressing and allocating
>>>> memory in the pool.
>>>>
>>>> This patch adds a check in zswap_frontswap_store() to identify zero-filled
>>>> page before compression of the page. If the page is a zero-filled page, set
>>>> zswap_entry.zeroflag and skip the compression of the page and alloction
>>>> of memory in zpool. In zswap_frontswap_load(), check if the zeroflag is
>>>> set for the page in zswap_entry. If the flag is set, memset the page with
>>>> zero. This saves the decompression time during load.
>>>>
>>>> The overall overhead caused to check for a zero-filled page is very minimal
>>>> when compared to the time saved by avoiding compression and allocation in
>>>> case of zero-filled pages. Although, compressed size of a zero-filled page
>>>> is very less, with this patch load time of a zero-filled page is reduced by
>>>> 80% when compared to baseline.
>>>
>>> Is it possible to share the benchmark details?
>>
>> Was there an answer to this?
>>
>
> This patch is tested on a ARM Quad Core 32-bit device with 1.5GB RAM by
> launching and relaunching different applications. With the patch, an
> average of 5000 zero pages found in zswap out of the ~50000 pages
> stored in zswap and application launch time improved by ~3%.
>
> Test Parameters BaselineWith patch  Improvement
> ---
> Total RAM   1343MB  1343MB
> Available RAM   451MB   445MB -6MB
> Avg. Memfree69MB70MB  1MB
> Avg. Swap Used  226MB   215MB -11MB
> Avg. App entry time 644msec 623msec   3%
>
> With patch, every page swapped to zswap is checked if it is a zero
> page or not and for all the zero pages compression and memory allocation
> operations are skipped. Overall there is an improvement of 30% in zswap
> store time.
> In case of non-zero pages there is no overhead during zswap page load. For
> zero pages there is a improvement of more than 60% in the zswap load time
> as the zero page decompression is avoided.
>
> The below table shows the execution time profiling of the patch.
>
> Zswap Store Operation BaselineWith patch  % Improvement
> --
> * Zero page check-- 22.5ms
>  (for non-zero pages)
> * Zero page check-- 24ms
>  (for zero pages)
> * Compression time  55ms --
>  (of zero pages)
> * Allocation time   14ms --
>  (to store compressed
>   zero pages)
> -
> Total   69ms46.5ms 32%
>
> Zswap Load Operation BaselineWith patch  % Improvement
> -
> * Decompression time  30.4ms--
>  (of zero pages)
> * Zero page check +-- 10.04ms
>  memset operation
>  (of zero pages)
> -
> Total   

Re: [PATCH] zswap: Zero-filled pages handling

2017-03-03 Thread Dan Streetman
On Sat, Feb 25, 2017 at 12:18 PM, Sarbojit Ganguly
 wrote:
> On 25 February 2017 at 20:12, Srividya Desireddy
>  wrote:
>> From: Srividya Desireddy 
>> Date: Thu, 23 Feb 2017 15:04:06 +0530
>> Subject: [PATCH] zswap: Zero-filled pages handling

your email is base64-encoded; please send plain text emails.

>>
>> Zswap is a cache which compresses the pages that are being swapped out
>> and stores them into a dynamically allocated RAM-based memory pool.
>> Experiments have shown that around 10-20% of pages stored in zswap
>> are zero-filled pages (i.e. contents of the page are all zeros), but

20%?  that's a LOT of zero pages...which seems like applications are
wasting a lot of memory.  what kind of workload are you testing with?

>> these pages are handled as normal pages by compressing and allocating
>> memory in the pool.
>>
>> This patch adds a check in zswap_frontswap_store() to identify zero-filled
>> page before compression of the page. If the page is a zero-filled page, set
>> zswap_entry.zeroflag and skip the compression of the page and alloction
>> of memory in zpool. In zswap_frontswap_load(), check if the zeroflag is
>> set for the page in zswap_entry. If the flag is set, memset the page with
>> zero. This saves the decompression time during load.
>>
>> The overall overhead caused to check for a zero-filled page is very minimal
>> when compared to the time saved by avoiding compression and allocation in
>> case of zero-filled pages. Although, compressed size of a zero-filled page
>> is very less, with this patch load time of a zero-filled page is reduced by
>> 80% when compared to baseline.
>
> Is it possible to share the benchmark details?

Was there an answer to this?

>
>
>>
>> Signed-off-by: Srividya Desireddy 
>> ---
>>  mm/zswap.c |   48 +---
>>  1 file changed, 45 insertions(+), 3 deletions(-)
>>
>> diff --git a/mm/zswap.c b/mm/zswap.c
>> index 067a0d6..a574008 100644
>> --- a/mm/zswap.c
>> +++ b/mm/zswap.c
>> @@ -49,6 +49,8 @@
>>  static u64 zswap_pool_total_size;
>>  /* The number of compressed pages currently stored in zswap */
>>  static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
>> +/* The number of zero filled pages swapped out to zswap */
>> +static atomic_t zswap_zero_pages = ATOMIC_INIT(0);
>>
>>  /*
>>   * The statistics below are not protected from concurrent access for
>> @@ -140,6 +142,8 @@ struct zswap_pool {
>>   *  decompression
>>   * pool - the zswap_pool the entry's data is in
>>   * handle - zpool allocation handle that stores the compressed page data
>> + * zeroflag - the flag is set if the content of the page is filled with
>> + *zeros
>>   */
>>  struct zswap_entry {
>> struct rb_node rbnode;
>> @@ -148,6 +152,7 @@ struct zswap_entry {
>> unsigned int length;
>> struct zswap_pool *pool;
>> unsigned long handle;
>> +   unsigned char zeroflag;

instead of a flag, we can use length == 0; the length will never be 0
for any actually compressed page.

>>  };
>>
>>  struct zswap_header {
>> @@ -236,6 +241,7 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t 
>> gfp)
>> if (!entry)
>> return NULL;
>> entry->refcount = 1;
>> +   entry->zeroflag = 0;
>> RB_CLEAR_NODE(&entry->rbnode);
>> return entry;
>>  }
>> @@ -306,8 +312,12 @@ static void zswap_rb_erase(struct rb_root *root, struct 
>> zswap_entry *entry)
>>   */
>>  static void zswap_free_entry(struct zswap_entry *entry)
>>  {
>> -   zpool_free(entry->pool->zpool, entry->handle);
>> -   zswap_pool_put(entry->pool);
>> +   if (entry->zeroflag)
>> +   atomic_dec(&zswap_zero_pages);
>> +   else {
>> +   zpool_free(entry->pool->zpool, entry->handle);
>> +   zswap_pool_put(entry->pool);
>> +   }
>> zswap_entry_cache_free(entry);
>> atomic_dec(&zswap_stored_pages);
>> zswap_update_total_size();
>> @@ -877,6 +887,19 @@ static int zswap_shrink(void)
>> return ret;
>>  }
>>
>> +static int zswap_is_page_zero_filled(void *ptr)
>> +{
>> +   unsigned int pos;
>> +   unsigned long *page;
>> +
>> +   page = (unsigned long *)ptr;
>> +   for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
>> +   if (page[pos])
>> +   return 0;
>> +   }
>> +   return 1;
>> +}
>> +
>>  /*
>>  * frontswap hooks
>>  **/
>> @@ -917,6 +940,15 @@ static int zswap_frontswap_store(unsigned type, pgoff_t 
>> offset,
>> goto reject;
>> }
>>
>> +   src = kmap_atomic(page);
>> +   if (zswap_is_page_zero_filled(src)) {
>> +   kunmap_atomic(src);
>> +   entry->offset = offset;
>> +   entry->zeroflag = 1;
>> +   atomic_inc(&zswap_zero_pages);
>> + 

Re: [PATCH] zswap: Zero-filled pages handling

2017-03-03 Thread Dan Streetman
On Sat, Feb 25, 2017 at 12:18 PM, Sarbojit Ganguly
 wrote:
> On 25 February 2017 at 20:12, Srividya Desireddy
>  wrote:
>> From: Srividya Desireddy 
>> Date: Thu, 23 Feb 2017 15:04:06 +0530
>> Subject: [PATCH] zswap: Zero-filled pages handling

your email is base64-encoded; please send plain text emails.

>>
>> Zswap is a cache which compresses the pages that are being swapped out
>> and stores them into a dynamically allocated RAM-based memory pool.
>> Experiments have shown that around 10-20% of pages stored in zswap
>> are zero-filled pages (i.e. contents of the page are all zeros), but

20%?  that's a LOT of zero pages...which seems like applications are
wasting a lot of memory.  what kind of workload are you testing with?

>> these pages are handled as normal pages by compressing and allocating
>> memory in the pool.
>>
>> This patch adds a check in zswap_frontswap_store() to identify zero-filled
>> page before compression of the page. If the page is a zero-filled page, set
>> zswap_entry.zeroflag and skip the compression of the page and alloction
>> of memory in zpool. In zswap_frontswap_load(), check if the zeroflag is
>> set for the page in zswap_entry. If the flag is set, memset the page with
>> zero. This saves the decompression time during load.
>>
>> The overall overhead caused to check for a zero-filled page is very minimal
>> when compared to the time saved by avoiding compression and allocation in
>> case of zero-filled pages. Although, compressed size of a zero-filled page
>> is very less, with this patch load time of a zero-filled page is reduced by
>> 80% when compared to baseline.
>
> Is it possible to share the benchmark details?

Was there an answer to this?

>
>
>>
>> Signed-off-by: Srividya Desireddy 
>> ---
>>  mm/zswap.c |   48 +---
>>  1 file changed, 45 insertions(+), 3 deletions(-)
>>
>> diff --git a/mm/zswap.c b/mm/zswap.c
>> index 067a0d6..a574008 100644
>> --- a/mm/zswap.c
>> +++ b/mm/zswap.c
>> @@ -49,6 +49,8 @@
>>  static u64 zswap_pool_total_size;
>>  /* The number of compressed pages currently stored in zswap */
>>  static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
>> +/* The number of zero filled pages swapped out to zswap */
>> +static atomic_t zswap_zero_pages = ATOMIC_INIT(0);
>>
>>  /*
>>   * The statistics below are not protected from concurrent access for
>> @@ -140,6 +142,8 @@ struct zswap_pool {
>>   *  decompression
>>   * pool - the zswap_pool the entry's data is in
>>   * handle - zpool allocation handle that stores the compressed page data
>> + * zeroflag - the flag is set if the content of the page is filled with
>> + *zeros
>>   */
>>  struct zswap_entry {
>> struct rb_node rbnode;
>> @@ -148,6 +152,7 @@ struct zswap_entry {
>> unsigned int length;
>> struct zswap_pool *pool;
>> unsigned long handle;
>> +   unsigned char zeroflag;

instead of a flag, we can use length == 0; the length will never be 0
for any actually compressed page.

>>  };
>>
>>  struct zswap_header {
>> @@ -236,6 +241,7 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t 
>> gfp)
>> if (!entry)
>> return NULL;
>> entry->refcount = 1;
>> +   entry->zeroflag = 0;
>> RB_CLEAR_NODE(&entry->rbnode);
>> return entry;
>>  }
>> @@ -306,8 +312,12 @@ static void zswap_rb_erase(struct rb_root *root, struct 
>> zswap_entry *entry)
>>   */
>>  static void zswap_free_entry(struct zswap_entry *entry)
>>  {
>> -   zpool_free(entry->pool->zpool, entry->handle);
>> -   zswap_pool_put(entry->pool);
>> +   if (entry->zeroflag)
>> +   atomic_dec(&zswap_zero_pages);
>> +   else {
>> +   zpool_free(entry->pool->zpool, entry->handle);
>> +   zswap_pool_put(entry->pool);
>> +   }
>> zswap_entry_cache_free(entry);
>> atomic_dec(&zswap_stored_pages);
>> zswap_update_total_size();
>> @@ -877,6 +887,19 @@ static int zswap_shrink(void)
>> return ret;
>>  }
>>
>> +static int zswap_is_page_zero_filled(void *ptr)
>> +{
>> +   unsigned int pos;
>> +   unsigned long *page;
>> +
>> +   page = (unsigned long *)ptr;
>> +   for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
>> +   if (page[pos])
>> +   return 0;
>> +   }
>> +   return 1;
>> +}
>> +
>>  /*
>>  * frontswap hooks
>>  **/
>> @@ -917,6 +940,15 @@ static int zswap_frontswap_store(unsigned type, pgoff_t 
>> offset,
>> goto reject;
>> }
>>
>> +   src = kmap_atomic(page);
>> +   if (zswap_is_page_zero_filled(src)) {
>> +   kunmap_atomic(src);
>> +   entry->offset = offset;
>> +   entry->zeroflag = 1;
>> +   atomic_inc(&zswap_zero_pages);
>> +   goto insert_entry;
>> +   }
>> +
>> /* if entry is successfully added, it 

Re: [PATCH v2 1/1] mm: zswap - Add crypto acomp/scomp framework support

2017-02-24 Thread Dan Streetman
On Fri, Feb 24, 2017 at 11:05 AM, Mahipal Challa
 wrote:
> This adds support for kernel's new crypto acomp/scomp framework
> to zswap.

I don't understand the point of this, zswap can't compress pages
asynchronously, so what benefit do we get from using the async crypto
api and then immediately waiting for it to finish?  This seems like
it's just adding complexity for no reason?

>
> Signed-off-by: Mahipal Challa 
> Signed-off-by: Vishnu Nair 
> ---
>  mm/zswap.c | 192 
> +++--
>  1 file changed, 162 insertions(+), 30 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index cabf09e..b29d109 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -33,8 +33,10 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
> +#include 
>
>  #include 
>  #include 
> @@ -118,9 +120,21 @@ static int zswap_compressor_param_set(const char *,
>  * data structures
>  **/
>
> +/**
> + * struct zswap_acomp_result - Data structure to store result of acomp 
> callback
> + * @completion: zswap will wait for completion on this entry
> + * @err   : return value from acomp algorithm will be stored here
> + */
> +struct zswap_acomp_result {
> +   struct completion completion;
> +   int err;
> +};
> +
>  struct zswap_pool {
> struct zpool *zpool;
> -   struct crypto_comp * __percpu *tfm;
> +   struct crypto_acomp * __percpu *acomp;
> +   struct acomp_req * __percpu *acomp_req;
> +   struct zswap_acomp_result * __percpu *result;
> struct kref kref;
> struct list_head list;
> struct work_struct work;
> @@ -388,30 +402,66 @@ static int zswap_dstmem_dead(unsigned int cpu)
>  static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
>  {
> struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
> -   struct crypto_comp *tfm;
> +   struct crypto_acomp *acomp;
> +   struct acomp_req *acomp_req;
> +   struct zswap_acomp_result *result;
>
> -   if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
> +   if (WARN_ON(*per_cpu_ptr(pool->acomp, cpu)))
> return 0;
> +   if (WARN_ON(*per_cpu_ptr(pool->acomp_req, cpu)))
> +   return 0;
> +   if (WARN_ON(*per_cpu_ptr(pool->result, cpu)))
> +   return 0;
> +
> +   acomp = crypto_alloc_acomp(pool->tfm_name, 0, 0);
> +   if (IS_ERR_OR_NULL(acomp)) {
> +   pr_err("could not alloc crypto acomp %s : %ld\n",
> +  pool->tfm_name, PTR_ERR(acomp));
> +   return -ENOMEM;
> +   }
> +   *per_cpu_ptr(pool->acomp, cpu) = acomp;
> +
> +   acomp_req = acomp_request_alloc(acomp);
> +   if (IS_ERR_OR_NULL(acomp_req)) {
> +   pr_err("could not alloc crypto acomp %s : %ld\n",
> +  pool->tfm_name, PTR_ERR(acomp));
> +   return -ENOMEM;
> +   }
> +   *per_cpu_ptr(pool->acomp_req, cpu) = acomp_req;
>
> -   tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
> -   if (IS_ERR_OR_NULL(tfm)) {
> -   pr_err("could not alloc crypto comp %s : %ld\n",
> -  pool->tfm_name, PTR_ERR(tfm));
> +   result = kzalloc(sizeof(*result), GFP_KERNEL);
> +   if (IS_ERR_OR_NULL(result)) {
> +   pr_err("Could not initialize completion on result\n");
> return -ENOMEM;
> }
> -   *per_cpu_ptr(pool->tfm, cpu) = tfm;
> +   init_completion(&result->completion);
> +   *per_cpu_ptr(pool->result, cpu) = result;
> +
> return 0;
>  }
>
>  static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
>  {
> struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
> -   struct crypto_comp *tfm;
> +   struct crypto_acomp *acomp;
> +   struct acomp_req *acomp_req;
> +   struct zswap_acomp_result *result;
> +
> +   acomp_req = *per_cpu_ptr(pool->acomp_req, cpu);
> +   if (!IS_ERR_OR_NULL(acomp_req))
> +   acomp_request_free(acomp_req);
> +   *per_cpu_ptr(pool->acomp_req, cpu) = NULL;
> +
> +   acomp = *per_cpu_ptr(pool->acomp, cpu);
> +   if (!IS_ERR_OR_NULL(acomp))
> +   crypto_free_acomp(acomp);
> +   *per_cpu_ptr(pool->acomp, cpu) = NULL;
> +
> +   result = *per_cpu_ptr(pool->result, cpu);
> +   if (!IS_ERR_OR_NULL(result))
> +   kfree(result);
> +   *per_cpu_ptr(pool->result, cpu) = NULL;
>
> -   tfm = *per_cpu_ptr(pool->tfm, cpu);
> -   if (!IS_ERR_OR_NULL(tfm))
> -   crypto_free_comp(tfm);
> -   *per_cpu_ptr(pool->tfm, cpu) = NULL;
> return 0;
>  }
>
> @@ -512,8 +562,20 @@ static struct zswap_pool *zswap_pool_create(char *type, 
> char *compressor)
> pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
>
> strlcpy(pool->tfm_name, 

  1   2   3   4   5   6   7   8   9   10   >