On 12/15/25 09:46, Shivank Garg wrote:
When MADV_COLLAPSE is called on file-backed mappings (e.g., executable
text sections), the pages may still be dirty from recent writes.
collapse_file() will trigger async writeback and fail with
SCAN_PAGE_DIRTY_OR_WRITEBACK (-EAGAIN).
MADV_COLLAPSE is a synchronous operation where userspace expects
immediate results. If the collapse fails due to dirty pages, perform
synchronous writeback on the specific range and retry once.
This avoids spurious failures for freshly written executables while
avoiding unnecessary synchronous I/O for mappings that are already clean.
Reported-by: Branden Moore <[email protected]>
Closes: https://lore.kernel.org/all/[email protected]
Fixes: 34488399fa08 ("mm/madvise: add file and shmem support to MADV_COLLAPSE")
Suggested-by: David Hildenbrand <[email protected]>
Tested-by: Lance Yang <[email protected]>
Signed-off-by: Shivank Garg <[email protected]>
---
mm/khugepaged.c | 40 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 40 insertions(+)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 219dfa2e523c..6c8c35d3e0c9 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -22,6 +22,7 @@
#include <linux/dax.h>
#include <linux/ksm.h>
#include <linux/pgalloc.h>
+#include <linux/backing-dev.h>
#include <asm/tlb.h>
#include "internal.h"
@@ -2787,9 +2788,11 @@ int madvise_collapse(struct vm_area_struct *vma,
unsigned long start,
hend = end & HPAGE_PMD_MASK;
for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
+ bool retried = false;
int result = SCAN_FAIL;
if (!mmap_locked) {
+retry:
Jumping into an if block is nasty :)
cond_resched();
mmap_read_lock(mm);
mmap_locked = true;
@@ -2819,6 +2822,43 @@ int madvise_collapse(struct vm_area_struct *vma,
unsigned long start,
if (!mmap_locked)
*lock_dropped = true;
+ /*
+ * If the file-backed VMA has dirty pages, the scan triggers
+ * async writeback and returns SCAN_PAGE_DIRTY_OR_WRITEBACK.
+ * Since MADV_COLLAPSE is sync, we force sync writeback and
+ * retry once.
+ */
+ if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !retried) {
+ /*
+ * File scan drops the lock. We must re-acquire it to
+ * safely inspect the VMA and hold the file reference.
+ */
+ if (!mmap_locked) {
+ cond_resched();
+ mmap_read_lock(mm);
+ mmap_locked = true;
+ result = hugepage_vma_revalidate(mm, addr, false,
&vma, cc);
+ if (result != SCAN_SUCCEED)
+ goto handle_result;
+ }
+
+ if (!vma_is_anonymous(vma) && vma->vm_file &&
+ mapping_can_writeback(vma->vm_file->f_mapping)) {
+ struct file *file = get_file(vma->vm_file);
+ pgoff_t pgoff = linear_page_index(vma, addr);
+ loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
+ loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
+
+ mmap_read_unlock(mm);
+ mmap_locked = false;
+ *lock_dropped = true;
+ filemap_write_and_wait_range(file->f_mapping,
lstart, lend);
+ fput(file);
+ retried = true;
+ goto retry;
+ }
+ }
+
This looks a bit complicated. Can't we move that handling up, to where we
already have most of that information? Or am I missing something important?
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 97d1b2824386f..c7271877c5220 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -22,6 +22,7 @@
#include <linux/dax.h>
#include <linux/ksm.h>
#include <linux/pgalloc.h>
+#include <linux/backing-dev.h>
#include <asm/tlb.h>
#include "internal.h"
@@ -2786,7 +2787,9 @@ int madvise_collapse(struct vm_area_struct *vma, unsigned
long start,
for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
int result = SCAN_FAIL;
+ bool triggered_wb = false;
+retry:
if (!mmap_locked) {
cond_resched();
mmap_read_lock(mm);
@@ -2809,6 +2812,16 @@ int madvise_collapse(struct vm_area_struct *vma,
unsigned long start,
mmap_locked = false;
result = hpage_collapse_scan_file(mm, addr, file, pgoff,
cc);
+
+ if (result == SCAN_PAGE_DIRTY_OR_WRITEBACK && !triggered_wb
&&
+ mapping_can_writeback(file->f_mapping)) {
+ loff_t lstart = (loff_t)pgoff << PAGE_SHIFT;
+ loff_t lend = lstart + HPAGE_PMD_SIZE - 1;
+
+ filemap_write_and_wait_range(file->f_mapping,
lstart, lend);
+ triggered_wb = true;
+ goto retry;
+ }
fput(file);
} else {
result = hpage_collapse_scan_pmd(mm, vma, addr,
--
Cheers
David