zsmalloc size classes are known to be affected by internal fragmentation.

For example, /sys/kernel/debug/zsmalloc/zramX/classes
 class  size almost_full almost_empty obj_allocated   obj_used pages_used pages_per_zspage freeable
    54   896           1           12           117         57         26                2       12
...
   107  1744           1           23           196         76         84                3       51
   111  1808           0            0            63         63         28                4        0
   126  2048           0          160           568        408        284                1       80
   144  2336          52          620          8631       5747       4932                4     1648
   151  2448         123          406         10090       8736       6054                3      810
   168  2720           0          512         15738      14926      10492                2      540
   190  3072           0            2           136        130        102                3        3
...

demonstrates that class-896 has 12/26 = 46% of its pages freeable, class-2336
has 1648/4932 = 33% freeable, and so on. The more 'normal' classes we have
(classes with more than one object per zspage), the bigger this problem grows.
The existing compaction relies either on user space (the user can trigger
compaction via zram's `compact' sysfs attribute) or on the shrinker; it does
not happen automatically.
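
For reference, a minimal user-space sketch of that manual path (illustration
only, not part of this patch; it assumes a zram0 device and omits most error
handling):

  /*
   * Illustration: trigger zram compaction by hand from user space.
   * Assumes the device is zram0; the `compact' attribute is write-only
   * and any value written to it starts compaction.
   */
  #include <fcntl.h>
  #include <unistd.h>

  int main(void)
  {
          int fd = open("/sys/block/zram0/compact", O_WRONLY);

          if (fd < 0)
                  return 1;
          if (write(fd, "1", 1) < 0) {
                  close(fd);
                  return 1;
          }
          close(fd);
          return 0;
  }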

This patch introduces a 'watermark' for unused pages and schedules compaction
work on a per-class basis once a class's fragmentation exceeds it. Compaction
is thus not performed in the context of the current I/O operation, but later,
from workqueue workers.

The current watermark is set to 40% -- if a class has 40% or more `freeable'
pages, compaction work for it will be scheduled.
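
Taking the class-896 entry from the table above as a worked example:
12/26 ~ 46% of its pages are freeable, which is above the 40% watermark, so
freeing an object in that class would schedule compaction work for it.
Class-2720, at 540/10492 ~ 5%, stays below the watermark and is left alone.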

TEST
====

  2G zram, ext4, lzo

  iozone -t 1 -R -r 64K -s 1200M -I +Z

                        BASE       PATCHED
"  Initial write "   959670.94    966724.62
"        Rewrite "  1276167.62   1237632.88
"           Read "  3334708.25   3345357.50
"        Re-read "  3405310.75   3337137.25
"   Reverse Read "  3284499.75   3241283.50
"    Stride read "  3293417.75   3268364.00
"    Random read "  3255253.50   3241685.00
" Mixed workload "  3274398.00   3231498.00
"   Random write "  1253207.50   1216247.00
"         Pwrite "   873682.25    877045.81
"          Pread "  3173266.00   3318471.75
"         Fwrite "   881278.38    897622.81
"          Fread "  4397147.00   4501131.50

  iozone -t 3 -R -r 64K -s 60M -I +Z

                        BASE       PATCHED
"  Initial write "  1855931.62   1869576.31
"        Rewrite "  2223531.06   2221543.62
"           Read "  7958435.75   8023044.75
"        Re-read "  7912776.75   8068961.00
"   Reverse Read "  7832227.50   7788237.50
"    Stride read "  7952113.50   7919778.00
"    Random read "  7908816.00   7881792.50
" Mixed workload "  6364520.38   6332493.94
"   Random write "  2230115.69   2176777.19
"         Pwrite "  1915939.31   1929464.75
"          Pread "  3857052.91   3840517.91
"         Fwrite "  2271730.44   2272800.31
"          Fread "  9053867.00   8880966.25

Signed-off-by: Sergey Senozhatsky <[email protected]>
---
 mm/zsmalloc.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index e72efb1..a4ef7e7 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -219,6 +219,10 @@ struct size_class {
        int pages_per_zspage;
        /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
        bool huge;
+
+       bool compact_scheduled;
+       struct zs_pool *pool;
+       struct work_struct compact_work;
 };
 
 /*
@@ -1467,6 +1471,8 @@ static void obj_free(struct zs_pool *pool, struct size_class *class,
        zs_stat_dec(class, OBJ_USED, 1);
 }
 
+static bool class_watermark_ok(struct size_class *class);
+
 void zs_free(struct zs_pool *pool, unsigned long handle)
 {
        struct page *first_page, *f_page;
@@ -1495,6 +1501,11 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
                atomic_long_sub(class->pages_per_zspage,
                                &pool->pages_allocated);
                free_zspage(first_page);
+       } else {
+               if (!class_watermark_ok(class) && !class->compact_scheduled) {
+                       queue_work(system_long_wq, &class->compact_work);
+                       class->compact_scheduled = true;
+               }
        }
        spin_unlock(&class->lock);
        unpin_tag(handle);
@@ -1745,6 +1756,19 @@ static unsigned long zs_can_compact(struct size_class *class)
        return obj_wasted * class->pages_per_zspage;
 }
 
+static bool class_watermark_ok(struct size_class *class)
+{
+       unsigned long pages_used = zs_stat_get(class, OBJ_ALLOCATED);
+
+       pages_used /= get_maxobj_per_zspage(class->size,
+                       class->pages_per_zspage) * class->pages_per_zspage;
+
+       if (!pages_used)
+               return true;
+
+       return (100 * zs_can_compact(class) / pages_used) < 40;
+}
+
 static void __zs_compact(struct zs_pool *pool, struct size_class *class)
 {
        struct zs_compact_control cc;
@@ -1789,9 +1813,17 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
        if (src_page)
                putback_zspage(pool, class, src_page);
 
+       class->compact_scheduled = false;
        spin_unlock(&class->lock);
 }
 
+static void class_compaction_work(struct work_struct *work)
+{
+       struct size_class *class = container_of(work, struct size_class, compact_work);
+
+       __zs_compact(class->pool, class);
+}
+
 unsigned long zs_compact(struct zs_pool *pool)
 {
        int i;
@@ -1948,6 +1980,9 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
                if (pages_per_zspage == 1 &&
                        get_maxobj_per_zspage(size, pages_per_zspage) == 1)
                        class->huge = true;
+
+               INIT_WORK(&class->compact_work, class_compaction_work);
+               class->pool = pool;
                spin_lock_init(&class->lock);
                pool->size_class[i] = class;
 
@@ -1990,6 +2025,8 @@ void zs_destroy_pool(struct zs_pool *pool)
                if (class->index != i)
                        continue;
 
+               cancel_work_sync(&class->compact_work);
+
                for (fg = 0; fg < _ZS_NR_FULLNESS_GROUPS; fg++) {
                        if (class->fullness_list[fg]) {
                                pr_info("Freeing non-empty class with size %db, 
fullness group %d\n",
-- 
2.8.0.rc0
