On 2025/11/14 17:55, Hongbo Li wrote:
This patch adds inode page cache sharing functionality for unencoded
files.
I conducted experiments in the container environment. Below is the
memory usage for reading all files in two different minor versions
of container images:
+-------------------+------------------+-------------+---------------+
| Image | Page Cache Share | Memory (MB) | Memory |
| | | | Reduction (%) |
+-------------------+------------------+-------------+---------------+
| | No | 241 | - |
| redis +------------------+-------------+---------------+
| 7.2.4 & 7.2.5 | Yes | 163 | 33% |
+-------------------+------------------+-------------+---------------+
| | No | 872 | - |
| postgres +------------------+-------------+---------------+
| 16.1 & 16.2 | Yes | 630 | 28% |
+-------------------+------------------+-------------+---------------+
| | No | 2771 | - |
| tensorflow +------------------+-------------+---------------+
| 2.11.0 & 2.11.1 | Yes | 2340 | 16% |
+-------------------+------------------+-------------+---------------+
| | No | 926 | - |
| mysql +------------------+-------------+---------------+
| 8.0.11 & 8.0.12 | Yes | 735 | 21% |
+-------------------+------------------+-------------+---------------+
| | No | 390 | - |
| nginx +------------------+-------------+---------------+
| 7.2.4 & 7.2.5 | Yes | 219 | 44% |
+-------------------+------------------+-------------+---------------+
| tomcat | No | 924 | - |
| 10.1.25 & 10.1.26 +------------------+-------------+---------------+
| | Yes | 474 | 49% |
+-------------------+------------------+-------------+---------------+
Additionally, the table below shows the runtime memory usage of the
container:
+-------------------+------------------+-------------+---------------+
| Image | Page Cache Share | Memory (MB) | Memory |
| | | | Reduction (%) |
+-------------------+------------------+-------------+---------------+
| | No | 35 | - |
| redis +------------------+-------------+---------------+
| 7.2.4 & 7.2.5 | Yes | 28 | 20% |
+-------------------+------------------+-------------+---------------+
| | No | 149 | - |
| postgres +------------------+-------------+---------------+
| 16.1 & 16.2 | Yes | 95 | 37% |
+-------------------+------------------+-------------+---------------+
| | No | 1028 | - |
| tensorflow +------------------+-------------+---------------+
| 2.11.0 & 2.11.1 | Yes | 930 | 10% |
+-------------------+------------------+-------------+---------------+
| | No | 155 | - |
| mysql +------------------+-------------+---------------+
| 8.0.11 & 8.0.12 | Yes | 132 | 15% |
+-------------------+------------------+-------------+---------------+
| | No | 25 | - |
| nginx +------------------+-------------+---------------+
| 7.2.4 & 7.2.5 | Yes | 20 | 20% |
+-------------------+------------------+-------------+---------------+
| tomcat | No | 186 | - |
| 10.1.25 & 10.1.26 +------------------+-------------+---------------+
| | Yes | 98 | 48% |
+-------------------+------------------+-------------+---------------+
Co-developed-by: Hongzhen Luo <[email protected]>
Signed-off-by: Hongzhen Luo <[email protected]>
Signed-off-by: Hongbo Li <[email protected]>
---
fs/erofs/data.c | 38 +++++++++++++++---
fs/erofs/inode.c | 5 +++
fs/erofs/internal.h | 4 ++
fs/erofs/ishare.c | 98 ++++++++++++++++++++++++++++++++++++++++++++-
fs/erofs/ishare.h | 18 +++++++++
fs/erofs/super.c | 11 +++--
6 files changed, 163 insertions(+), 11 deletions(-)
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index bd3d85c61341..c459104e4734 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -5,6 +5,7 @@
* Copyright (C) 2021, Alibaba Cloud
*/
#include "internal.h"
+#include "ishare.h"
Can we just get rid of the extra "ishare.h"? These declarations can be moved
into internal.h:
#ifdef CONFIG_EROFS_FS_INODE_SHARE
int erofs_ishare_init(struct super_block *sb);
void erofs_ishare_exit(struct super_block *sb);
bool erofs_ishare_fill_inode(struct inode *inode);
void erofs_ishare_free_inode(struct inode *inode);
#else
static inline int erofs_ishare_init(struct super_block *sb) { return 0; }
static inline void erofs_ishare_exit(struct super_block *sb) {}
static inline bool erofs_ishare_fill_inode(struct inode *inode) { return false; }
static inline void erofs_ishare_free_inode(struct inode *inode) {}
#endif // CONFIG_EROFS_FS_INODE_SHARE
#include <linux/sched/mm.h>
#include <trace/events/erofs.h>
@@ -269,23 +270,27 @@ void erofs_onlinefolio_end(struct folio *folio, int err, bool dirty)
struct erofs_iomap_iter_ctx {
struct page *page;
void *base;
+ struct inode *realinode;
};
static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
{
- int ret;
struct erofs_iomap_iter_ctx *ctx;
- struct super_block *sb = inode->i_sb;
struct erofs_map_blocks map;
struct erofs_map_dev mdev;
struct iomap_iter *iter;
+ struct inode *realinode;
+ struct super_block *sb;
struct inode *realinode = ctx ? ctx->realinode : inode;
struct super_block *sb = realinode->i_sb;
+ int ret;
iter = container_of(iomap, struct iomap_iter, iomap);
ctx = iter->private;
+ realinode = ctx ? ctx->realinode : inode;
+ sb = realinode->i_sb;
map.m_la = offset;
map.m_llen = length;
- ret = erofs_map_blocks(inode, &map);
+ ret = erofs_map_blocks(realinode, &map);
if (ret < 0)
return ret;
@@ -300,7 +305,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
return 0;
}
- if (!(map.m_flags & EROFS_MAP_META) || !erofs_inode_in_metabox(inode)) {
+ if (!(map.m_flags & EROFS_MAP_META) ||
!erofs_inode_in_metabox(realinode)) {
mdev = (struct erofs_map_dev) {
.m_deviceid = map.m_deviceid,
.m_pa = map.m_pa,
@@ -326,7 +331,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
ptr = erofs_read_metabuf(&buf, sb, map.m_pa,
- erofs_inode_in_metabox(inode));
+
erofs_inode_in_metabox(realinode));
if (IS_ERR(ptr))
return PTR_ERR(ptr);
iomap->inline_data = ptr;
...
@@ -234,3 +248,83 @@ const struct file_operations erofs_ishare_fops = {
.get_unmapped_area = thp_get_unmapped_area,
.splice_read = filemap_splice_read,
};
+
+void erofs_read_begin(struct erofs_read_ctx *rdctx)
I think if backing_head and backing_link (although I don't like
the naming) are valid, erofs_read_begin() and erofs_read_end()
are unneeded here, since we maintain the backing validity using
the .open() and .release() hooks.
The odd erofs_read_{begin,end} can then be avoided...
Thanks,
Gao Xiang