This patch adds an optional choice which users can
enable in order to cache both incomplete ends of
compressed clusters, as a complement to in-place
decompression. This boosts random read performance,
but it costs more memory than in-place decompression
alone.

Signed-off-by: Gao Xiang <[email protected]>
---
 fs/erofs/Kconfig     |  38 ++++++++
 fs/erofs/internal.h  |  25 +++++
 fs/erofs/super.c     |  75 ++++++++++++++-
 fs/erofs/unzip_vle.c | 257 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/erofs/utils.c     |  17 +++-
 5 files changed, 410 insertions(+), 2 deletions(-)

diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 00e811c..d08c019 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -99,3 +99,41 @@ config EROFS_FS_CLUSTER_PAGE_LIMIT
          than 2. Otherwise, the image cannot be mounted
          correctly on this kernel.
 
+choice
+       prompt "EROFS VLE Data Decompression mode"
+       depends on EROFS_FS_ZIP
+       default EROFS_FS_ZIP_CACHE_BIPOLAR
+       help
+         EROFS supports three options for VLE decompression.
+         "In-place Decompression Only" consumes the minimum memory
+         with the lowest random read performance.
+
+         "Bipolar Cached Decompression" consumes the maximum memory
+         with the highest random read performance.
+
+         If unsure, select "Bipolar Cached Decompression".
+
+config EROFS_FS_ZIP_NO_CACHE
+       bool "In-place Decompression Only"
+       help
+         Read compressed data into page cache and do in-place
+         decompression directly.
+
+config EROFS_FS_ZIP_CACHE_UNIPOLAR
+       bool "Unipolar Cached Decompression"
+       help
+         For each request, it caches the last compressed page
+         for further reading.
+         It still decompresses the remaining compressed pages in place.
+
+config EROFS_FS_ZIP_CACHE_BIPOLAR
+       bool "Bipolar Cached Decompression"
+       help
+         For each request, it caches the compressed pages at both ends
+         for further reading.
+         It still decompresses the remaining compressed pages in place.
+
+         Recommended when performance is the priority.
+
+endchoice
+
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index fd444ec..5667f56 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -59,6 +59,18 @@ struct erofs_fault_info {
 };
 #endif
 
+#ifdef CONFIG_EROFS_FS_ZIP_CACHE_BIPOLAR
+#define EROFS_FS_ZIP_CACHE_LVL (2)     /* cache both end compressed pages */
+#elif defined(CONFIG_EROFS_FS_ZIP_CACHE_UNIPOLAR)
+#define EROFS_FS_ZIP_CACHE_LVL (1)     /* cache the last compressed page */
+#else
+#define EROFS_FS_ZIP_CACHE_LVL (0)     /* in-place decompression only */
+#endif
+
+#if (!defined(EROFS_FS_HAS_MANAGED_CACHE) && (EROFS_FS_ZIP_CACHE_LVL > 0))
+#define EROFS_FS_HAS_MANAGED_CACHE
+#endif
+
 /* EROFS_SUPER_MAGIC_V1 to represent the whole file system */
 #define EROFS_SUPER_MAGIC   EROFS_SUPER_MAGIC_V1
 
@@ -88,6 +100,11 @@ struct erofs_sb_info {
                spinlock_t lock;
 #endif
        } workstn;
+
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+       struct inode *managed_cache;
+#endif
+
 #endif
 
        u32 build_time_nsec;
@@ -251,6 +268,14 @@ static inline void erofs_workstation_cleanup_all(struct 
super_block *sb)
        erofs_shrink_workstation(EROFS_SB(sb), ~0UL, true);
 }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+#define EROFS_UNALLOCATED_CACHED_PAGE  ((void *)0x5F0EF00D)
+
+extern int try_to_free_cached_page(struct address_space *, struct page *);
+extern int try_to_free_all_cached_pages(struct erofs_sb_info *,
+       struct erofs_workgroup *);
+#endif
+
 #endif
 
 /* we strictly follow PAGE_SIZE and no buffer head */
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 7e5333c..5a940c7 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -247,6 +247,63 @@ static int parse_options(struct super_block *sb, char 
*options)
        return 0;
 }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+
+static const struct address_space_operations managed_cache_aops;
+
+static int managed_cache_releasepage(struct page *page, gfp_t gfp_mask)
+{
+       int ret = 1;    /* 0 - busy */
+       struct address_space *const mapping = page->mapping;
+
+       BUG_ON(!PageLocked(page));
+       BUG_ON(mapping->a_ops != &managed_cache_aops);
+
+       if (PagePrivate(page))
+               ret = try_to_free_cached_page(mapping, page);
+
+       return ret;
+}
+
+static void managed_cache_invalidatepage(struct page *page,
+       unsigned int offset, unsigned int length)
+{
+       const unsigned int stop = length + offset;
+
+       BUG_ON(!PageLocked(page));
+
+       /* Check for overflow */
+       BUG_ON(stop > PAGE_SIZE || stop < length);
+
+       if (offset == 0 && stop == PAGE_SIZE)
+               while(!managed_cache_releasepage(page, GFP_NOFS))
+                       cond_resched();
+}
+
+static const struct address_space_operations managed_cache_aops = {
+       .releasepage = managed_cache_releasepage,
+       .invalidatepage = managed_cache_invalidatepage,
+};
+
+struct inode *erofs_init_managed_cache(struct super_block *sb)
+{
+       struct inode *inode = new_inode(sb);
+
+       if (unlikely(inode == NULL))
+               return ERR_PTR(-ENOMEM);
+
+       set_nlink(inode, 1);
+       inode->i_size = OFFSET_MAX;
+
+       inode->i_mapping->a_ops = &managed_cache_aops;
+       mapping_set_gfp_mask(inode->i_mapping,
+                            GFP_NOFS | __GFP_HIGHMEM |
+                            __GFP_MOVABLE |  __GFP_NOFAIL);
+       return inode;
+}
+
+#endif
+
 static int erofs_read_super(struct super_block *sb,
        const char *dev_name, void *data, int silent)
 {
@@ -301,11 +358,19 @@ static int erofs_read_super(struct super_block *sb,
 #endif
 #endif
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+       sbi->managed_cache = erofs_init_managed_cache(sb);
+       if (IS_ERR(sbi->managed_cache)) {
+               err = PTR_ERR(sbi->managed_cache);
+               goto err_sbi;
+       }
+#endif
+
        /* get the root inode */
        inode = erofs_iget(sb, ROOT_NID(sbi), true);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
-               goto err_sbi;
+               goto iget_err;
        }
 
        if (!S_ISDIR(inode->i_mode)) {
@@ -348,6 +413,10 @@ static int erofs_read_super(struct super_block *sb,
 err_iput:
        if (sb->s_root == NULL)
                iput(inode);
+iget_err:
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+       iput(sbi->managed_cache);
+#endif
 err_sbi:
        sb->s_fs_info = NULL;
        kfree(sbi);
@@ -370,6 +439,10 @@ static void erofs_put_super(struct super_block *sb)
        infoln("unmounted for %s", sbi->dev_name);
        __putname(sbi->dev_name);
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+       iput(sbi->managed_cache);
+#endif
+
        mutex_lock(&sbi->umount_mutex);
 
 #ifdef CONFIG_EROFS_FS_ZIP
diff --git a/fs/erofs/unzip_vle.c b/fs/erofs/unzip_vle.c
index c113740..63e27bd 100644
--- a/fs/erofs/unzip_vle.c
+++ b/fs/erofs/unzip_vle.c
@@ -95,6 +95,111 @@ struct z_erofs_vle_work_builder {
 #define VLE_WORK_BUILDER_INIT()        \
        { .work = NULL, .role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+
+static bool grab_managed_cache_pages(struct address_space *mapping,
+                                    erofs_blk_t start,
+                                    struct page **compressed_pages,
+                                    int clusterblks,
+                                    bool reserve_allocation)
+{
+       bool noio = true;
+       unsigned int i;
+
+       /* TODO: optimize by introducing find_get_pages_range */
+       for (i = 0; i < clusterblks; ++i) {
+               struct page *page, *found;
+
+               if (READ_ONCE(compressed_pages[i]) != NULL)
+                       continue;
+
+               page = found = find_get_page(mapping, start + i);
+               if (found == NULL) {
+                       noio = false;
+                       if (!reserve_allocation)
+                               continue;
+                       page = EROFS_UNALLOCATED_CACHED_PAGE;
+               }
+
+               if (NULL == cmpxchg(compressed_pages + i, NULL, page))
+                        continue;
+
+               if (found != NULL)
+                       put_page(found);
+       }
+       return noio;
+}
+
+/* called by erofs_shrinker to get rid of all compressed_pages */
+int try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
+                                struct erofs_workgroup *egrp)
+{
+       struct z_erofs_vle_workgroup *const grp =
+               container_of(egrp, struct z_erofs_vle_workgroup, obj);
+       struct address_space *const mapping = sbi->managed_cache->i_mapping;
+       const int clusterpages = erofs_clusterpages(sbi);
+       int i;
+
+       /*
+        * the workgroup refcount is now frozen at 1,
+        * therefore no need to worry about available decompression users.
+        */
+       for (i = 0; i < clusterpages; ++i) {
+               struct page *page = grp->compressed_pages[i];
+
+               if (page == NULL || page->mapping != mapping)
+                       continue;
+
+               /* block other users from reclaiming or migrating the page */
+               if (!trylock_page(page))
+                       return -EBUSY;
+
+               /* barrier is implied in the following 'unlock_page' */
+               WRITE_ONCE(grp->compressed_pages[i], NULL);
+
+               set_page_private(page, 0);
+               ClearPagePrivate(page);
+
+               unlock_page(page);
+               put_page(page);
+       }
+       return 0;
+}
+
+int try_to_free_cached_page(struct address_space *mapping, struct page *page)
+{
+       struct erofs_sb_info *const sbi = EROFS_SB(mapping->host->i_sb);
+       const unsigned clusterpages = erofs_clusterpages(sbi);
+
+       struct z_erofs_vle_workgroup *grp;
+       int ret = 0;    /* 0 - busy */
+
+       /* prevent the workgroup from being freed */
+       rcu_read_lock();
+       grp = (void *)page_private(page);
+
+       if (erofs_workgroup_try_to_freeze(&grp->obj, 1)) {
+               unsigned i;
+
+               for (i = 0; i < clusterpages; ++i) {
+                       if (grp->compressed_pages[i] == page) {
+                               WRITE_ONCE(grp->compressed_pages[i], NULL);
+                               ret = 1;
+                               break;
+                       }
+               }
+               erofs_workgroup_unfreeze(&grp->obj, 1);
+       }
+       rcu_read_unlock();
+
+       if (ret) {
+               ClearPagePrivate(page);
+               put_page(page);
+       }
+       return ret;
+}
+#endif
+
 /* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
 static inline bool try_to_reuse_as_compressed_page(
        struct z_erofs_vle_work_builder *b,
@@ -451,6 +556,9 @@ struct z_erofs_vle_frontend {
        z_erofs_vle_owned_workgrp_t owned_head;
 
        bool initial;
+#if (EROFS_FS_ZIP_CACHE_LVL >= 2)
+       erofs_off_t cachedzone_la;
+#endif
 };
 
 #define VLE_FRONTEND_INIT(__i) { \
@@ -516,6 +624,26 @@ static int z_erofs_do_read_page(struct 
z_erofs_vle_frontend *fe,
        if (unlikely(err))
                goto err_out;
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+       else {
+               struct z_erofs_vle_workgroup *grp = fe->builder.grp;
+               struct address_space *mapping = sbi->managed_cache->i_mapping;
+
+               /* let's do out-of-order decompression for noio */
+               bool noio_outoforder = grab_managed_cache_pages(mapping,
+                       erofs_blknr(map->m_pa),
+                       grp->compressed_pages, erofs_blknr(map->m_plen),
+                       fe->initial
+#if (EROFS_FS_ZIP_CACHE_LVL >= 2)
+                       | (map->m_la <= fe->cachedzone_la)
+#endif
+               );
+
+               if (noio_outoforder && builder_is_followed(builder))
+                       builder->role = Z_EROFS_VLE_WORK_PRIMARY;
+       }
+#endif
+
        tight &= builder_is_followed(builder);
        work = builder->work;
 hitted:
@@ -613,6 +741,15 @@ static inline void z_erofs_vle_read_endio(struct bio *bio)
 
                DBG_BUGON(PageUptodate(page));
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+               if (page->mapping != NULL) {
+                       struct inode *inode = page->mapping->host;
+
+                       cachedpage = (inode ==
+                               EROFS_SB(inode->i_sb)->managed_cache);
+               }
+#endif
+
                if (unlikely(err))
                        SetPageError(page);
                else if (cachedpage)
@@ -726,6 +863,13 @@ static int z_erofs_vle_unzip(struct super_block *sb,
 
                if (page->mapping == NULL)
                        continue;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+               if (page->mapping->host == sbi->managed_cache) {
+                       BUG_ON(PageLocked(page));
+                       BUG_ON(!PageUptodate(page));
+                       continue;
+               }
+#endif
 
                pagenr = z_erofs_onlinepage_index(page);
 
@@ -807,6 +951,10 @@ static int z_erofs_vle_unzip(struct super_block *sb,
                if (page->mapping == NULL)
                        list_add(&page->lru, page_pool);
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+               else if (page->mapping->host == sbi->managed_cache)
+                       continue;
+#endif
                WRITE_ONCE(compressed_pages[i], NULL);
        }
 
@@ -898,7 +1046,32 @@ static void z_erofs_vle_unzip_wq(struct work_struct *work)
        return io;
 }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+/* true - unlocked (noio), false - locked (need submit io) */
+static inline bool recover_managed_page(
+       struct z_erofs_vle_workgroup *grp,
+       struct page *page)
+{
+       wait_on_page_locked(page);
+       if (PagePrivate(page) && PageUptodate(page))
+               return true;
+
+       lock_page(page);
+       if (unlikely(!PagePrivate(page))) {
+               set_page_private(page, (unsigned long)grp);
+               SetPagePrivate(page);
+       }
+       if (unlikely(PageUptodate(page))) {
+               unlock_page(page);
+               return true;
+       }
+       return false;
+}
+
+#define __FSIO_1 1
+#else
 #define __FSIO_1 0
+#endif
 
 static bool z_erofs_vle_submit_all(struct super_block *sb,
                                   z_erofs_vle_owned_workgrp_t owned_head,
@@ -909,6 +1082,11 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
        struct erofs_sb_info *const sbi = EROFS_SB(sb);
        const unsigned clusterpages = erofs_clusterpages(sbi);
        const gfp_t gfp = GFP_NOFS;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+       struct address_space *const managed_cache_mapping =
+               sbi->managed_cache->i_mapping;
+       struct z_erofs_vle_workgroup *lstgrp_noio = NULL, *lstgrp_io = NULL;
+#endif
        struct z_erofs_vle_unzip_io *ios[1 + __FSIO_1];
        struct bio *bio;
        tagptr1_t bi_private;
@@ -923,6 +1101,10 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
         * force_fg == 1, (io, fg_io[0]) no io, (io, fg_io[1]) need submit io
          * force_fg == 0, (io, fg_io[0]) no io; (io[1], bg_io) need submit io
         */
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+       ios[0] = prepare_io_handler(sb, fg_io + 0, false);
+#endif
+
        if (force_fg) {
                ios[__FSIO_1] = prepare_io_handler(sb, fg_io + __FSIO_1, false);
                bi_private = tagptr_fold(tagptr1_t, ios[__FSIO_1], 0);
@@ -943,6 +1125,10 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
                struct page **compressed_pages, *oldpage, *page;
                pgoff_t first_index;
                unsigned i = 0;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+               unsigned noio = 0;
+               bool cachemanaged;
+#endif
                int err;
 
                /* no possible 'owned_head' equals the following */
@@ -963,9 +1149,28 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
                /* fulfill all compressed pages */
                oldpage = page = READ_ONCE(compressed_pages[i]);
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+               cachemanaged = false;
+
+               if (page == EROFS_UNALLOCATED_CACHED_PAGE) {
+                       cachemanaged = true;
+                       goto do_allocpage;
+               } else if (page != NULL) {
+                       if (page->mapping != managed_cache_mapping)
+                               BUG_ON(PageUptodate(page));
+                       else if (recover_managed_page(grp, page)) {
+                               /* page is uptodate, skip io submission */
+                               force_submit = true;
+                               ++noio;
+                               goto skippage;
+                       }
+               } else {
+do_allocpage:
+#else
                if (page != NULL)
                        BUG_ON(PageUptodate(page));
                else {
+#endif
                        page = erofs_allocpage(pagepool, gfp);
                        page->mapping = NULL;
 
@@ -973,6 +1178,12 @@ static bool z_erofs_vle_submit_all(struct super_block *sb,
                                oldpage, page)) {
                                list_add(&page->lru, pagepool);
                                goto repeat;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+                       } else if (cachemanaged && !add_to_page_cache_lru(page,
+                               managed_cache_mapping, first_index + i, gfp)) {
+                               set_page_private(page, (unsigned long)grp);
+                               SetPagePrivate(page);
+#endif
                        }
                }
 
@@ -996,14 +1207,51 @@ static bool z_erofs_vle_submit_all(struct super_block 
*sb,
 
                force_submit = false;
                last_index = first_index + i;
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+skippage:
+#endif
                if (++i < clusterpages)
                        goto repeat;
+
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+               if (noio < clusterpages)
+                       lstgrp_io = grp;
+               else {
+                       z_erofs_vle_owned_workgrp_t iogrp_next =
+                               owned_head == Z_EROFS_VLE_WORKGRP_TAIL ?
+                               Z_EROFS_VLE_WORKGRP_TAIL_CLOSED :
+                               owned_head;
+
+                       if (lstgrp_io == NULL)
+                               ios[1]->head = iogrp_next;
+                       else
+                               WRITE_ONCE(lstgrp_io->next, iogrp_next);
+
+                       if (lstgrp_noio == NULL)
+                               ios[0]->head = grp;
+                       else
+                               WRITE_ONCE(lstgrp_noio->next, grp);
+
+                       lstgrp_noio = grp;
+               }
+#endif
        } while (owned_head != Z_EROFS_VLE_WORKGRP_TAIL);
 
        if (bio != NULL)
                __submit_bio(bio, REQ_OP_READ, 0);
 
+#ifndef EROFS_FS_HAS_MANAGED_CACHE
        BUG_ON(!nr_bios);
+#else
+       if (lstgrp_noio != NULL)
+               WRITE_ONCE(lstgrp_noio->next, Z_EROFS_VLE_WORKGRP_TAIL_CLOSED);
+
+       if (!force_fg && !nr_bios) {
+               kvfree(container_of(ios[1],
+                       struct z_erofs_vle_unzip_io_sb, io));
+               return true;
+       }
+#endif
 
        z_erofs_vle_unzip_kickoff(tagptr_cast_ptr(bi_private), nr_bios);
        return true;
@@ -1019,6 +1267,9 @@ static void z_erofs_submit_and_unzip(struct 
z_erofs_vle_frontend *f,
        if (!z_erofs_vle_submit_all(sb, f->owned_head, pagepool, io, force_fg))
                return;
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+       z_erofs_vle_unzip_all(sb, &io[0], pagepool);
+#endif
        if (!force_fg)
                return;
 
@@ -1038,6 +1289,9 @@ static int z_erofs_vle_normalaccess_readpage(struct file 
*file,
        int err;
        LIST_HEAD(pagepool);
 
+#if (EROFS_FS_ZIP_CACHE_LVL >= 2)
+       f.cachedzone_la = page->index << PAGE_SHIFT;
+#endif
        err = z_erofs_do_read_page(&f, page, &pagepool);
        (void)z_erofs_vle_work_iter_end(&f.builder);
 
@@ -1068,6 +1322,9 @@ static inline int __z_erofs_vle_normalaccess_readpages(
        struct page *head = NULL;
        LIST_HEAD(pagepool);
 
+#if (EROFS_FS_ZIP_CACHE_LVL >= 2)
+       f.cachedzone_la = lru_to_page(pages)->index << PAGE_SHIFT;
+#endif
        for (; nr_pages; --nr_pages) {
                struct page *page = lru_to_page(pages);
 
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index dd1ce5f..b669ca3 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -143,13 +143,28 @@ unsigned long erofs_shrink_workstation(struct 
erofs_sb_info *sbi,
                if (cleanup)
                        BUG_ON(cnt != 1);
 
+#ifndef EROFS_FS_HAS_MANAGED_CACHE
                else if (cnt > 1)
+#else
+               if (!erofs_workgroup_try_to_freeze(grp, 1))
+#endif
                        continue;
 
                if (radix_tree_delete(&sbi->workstn.tree,
-                       grp->index) != grp)
+                       grp->index) != grp) {
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+skip:
+                       erofs_workgroup_unfreeze(grp, 1);
+#endif
                        continue;
+               }
 
+#ifdef EROFS_FS_HAS_MANAGED_CACHE
+               if (try_to_free_all_cached_pages(sbi, grp))
+                       goto skip;
+
+               erofs_workgroup_unfreeze(grp, 1);
+#endif
                /* (rarely) grabbed again when freeing */
                erofs_workgroup_put(grp);
 
-- 
1.9.1

Reply via email to