Hi Stephen,
On Fri, Jan 23, 2026 at 2:04 PM Stephen Brennan
<[email protected]> wrote:
>
> Tao Liu <[email protected]> writes:
> > pfn and num is the data which extensions give to makedumpfile for mm page
> > filtering. Since makedumpfile will iterate the pfn in an ascending order in
> > __exclude_unnecessary_pages(), pfn and num are stored within ft_page_info
> > linked
> > lists and organized in an ascending order by pfn, so if one pfn
> > is hit by one list, the next pfn is most likely to be hit either by
> > this list again, or the one follows, so a cur variable is used for saving
> > the current list position to speedup the pfn checking process.
>
> I'm wondering about the trade-off for using a linked list versus an
> array. Using the linked list, we are forced to maintain the sorted
> order as we construct the list, which is an O(N^2) insertion sort.
>
> If instead we used an array, we could sort it with qsort() once, at the
> end. Then we could merge any overlapping ranges. Lookup could be
> implemented cheaply with bsearch(), and we could continue to use the
> optimization where we maintain a "cur" pointer. I believe the overall
> runtime complexity of the array approach would be O(N*log(N)) without
> requiring hand-implementing anything too complex, compared to O(N^2).
>
> Depending on the number of pages (and how fragmented they are), this
> may or may not be an issue.
>
> In my testing for userspace tasks, the number of pages retained can be
> on the order of ~100k. However -- my use case can't really use a list of
> PFNs, which I'll explain below. So my use case doesn't really matter too
> much here -- maybe your use case has relatively few page ranges, so the
> cost of O(N^2) is not bad.
>
> So I guess I don't have a strong preference - but it's worth
> considering.
>
> > In addition, 2 ft_page_info linked list chains are used, one for mm page
> > discarding and the other for page keeping.
> >
> > Signed-off-by: Tao Liu <[email protected]>
> > ---
> > erase_info.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++
> > erase_info.h | 12 +++++++
> > makedumpfile.c | 28 ++++++++++++---
> > 3 files changed, 134 insertions(+), 4 deletions(-)
> >
> > diff --git a/erase_info.c b/erase_info.c
> > index b67d1d0..8838bea 100644
> > --- a/erase_info.c
> > +++ b/erase_info.c
> > @@ -2466,3 +2466,101 @@ get_size_eraseinfo(void)
> > return size_eraseinfo;
> > }
> >
> > +/* Pages to be discarded */
> > +static struct ft_page_info *ft_head_discard = NULL;
> > +/* Pages to be keeped */
> > +static struct ft_page_info *ft_head_keep = NULL;
> > +
> > +/*
> > + * Insert the ft_page_info blocks into ft_head by ascending pfn.
> > + */
> > +bool
> > +update_filter_pages_info(unsigned long pfn, unsigned long num, bool
> > to_discard)
> > +{
> > + struct ft_page_info *p, **ft_head;
> > + struct ft_page_info *new_p = malloc(sizeof(struct ft_page_info));
> > +
> > + ft_head = to_discard ? &ft_head_discard : &ft_head_keep;
> > +
> > + if (!new_p) {
> > + ERRMSG("Can't allocate memory for ft_page_info at %lx\n",
> > pfn);
> > + return false;
> > + }
> > + new_p->pfn = pfn;
> > + new_p->num = num;
> > + new_p->next = NULL;
> > +
> > + if (!(*ft_head) || (*ft_head)->pfn > new_p->pfn) {
> > + new_p->next = (*ft_head);
> > + (*ft_head) = new_p;
> > + return true;
> > + }
> > +
> > + p = (*ft_head);
> > + while (p->next != NULL && p->next->pfn < new_p->pfn) {
> > + p = p->next;
> > + }
> > +
> > + new_p->next = p->next;
> > + p->next = new_p;
>
> It might be wise to defensively handle the case of overlapping
> PFN ranges by merging them.
>
> > + return true;
> > +}
> > +
> > +/*
> > + * Check if the pfn hit ft_page_info block.
> > + *
> > + * pfn and ft_head are in ascending order, so save the current ft_page_info
> > + * block into **p because it is likely to hit again next time.
> > + */
> > +bool
> > +filter_page(unsigned long pfn, struct ft_page_info **p, bool
> > handle_discard)
> > +{
> > + struct ft_page_info *ft_head;
> > +
> > + ft_head = handle_discard ? ft_head_discard : ft_head_keep;
> > +
> > + if (ft_head == NULL)
> > + return false;
> > +
> > + if (*p == NULL)
> > + *p = ft_head;
> > +
> > + /* The gap before 1st block */
> > + if (pfn >= 0 && pfn < ft_head->pfn)
> > + return false;
> > +
> > + /* Handle 1~(n-1) blocks and following gaps */
> > + while ((*p)->next) {
> > + if (pfn >= (*p)->pfn && pfn < (*p)->pfn + (*p)->num)
> > + return true; // hit the block
> > + if (pfn >= (*p)->pfn + (*p)->num && pfn < (*p)->next->pfn)
> > + return false; // the gap after the block
> > + *p = (*p)->next;
> > + }
> > +
> > + /* The last block and gap */
> > + if (pfn >= (*p)->pfn + (*p)->num)
> > + return false;
> > + else
> > + return true;
> > +}
> > +
> > +static void
> > +do_cleanup(struct ft_page_info **ft_head)
> > +{
> > + struct ft_page_info *p, *p_tmp;
> > +
> > + for (p = *ft_head; p;) {
> > + p_tmp = p;
> > + p = p->next;
> > + free(p_tmp);
> > + }
> > + *ft_head = NULL;
> > +}
> > +
> > +void
> > +cleanup_filter_pages_info(void)
> > +{
> > + do_cleanup(&ft_head_discard);
> > + do_cleanup(&ft_head_keep);
> > +}
> > diff --git a/erase_info.h b/erase_info.h
> > index b363a40..6c60706 100644
> > --- a/erase_info.h
> > +++ b/erase_info.h
> > @@ -20,6 +20,7 @@
> > #define _ERASE_INFO_H
> >
> > #define MAX_SIZE_STR_LEN (26)
> > +#include <stdbool.h>
> >
> > /*
> > * Erase information, original symbol expressions.
> > @@ -65,5 +66,16 @@ void filter_data_buffer_parallel(unsigned char *buf,
> > unsigned long long paddr,
> > unsigned long get_size_eraseinfo(void);
> > int update_filter_info_raw(unsigned long long, int, int);
> >
> > +bool update_filter_pages_info(unsigned long, unsigned long, bool);
> > +
> > +struct ft_page_info {
> > + unsigned long pfn;
> > + unsigned long num;
> > + struct ft_page_info *next;
> > +} __attribute__((packed));
> > +
> > +bool filter_page(unsigned long, struct ft_page_info **p, bool
> > handle_discard);
> > +void cleanup_filter_pages_info(void);
> > +
> > #endif /* _ERASE_INFO_H */
> >
> > diff --git a/makedumpfile.c b/makedumpfile.c
> > index ca8ed8a..ebac8da 100644
> > --- a/makedumpfile.c
> > +++ b/makedumpfile.c
> > @@ -102,6 +102,7 @@ mdf_pfn_t pfn_free;
> > mdf_pfn_t pfn_hwpoison;
> > mdf_pfn_t pfn_offline;
> > mdf_pfn_t pfn_elf_excluded;
> > +mdf_pfn_t pfn_extension;
> >
> > mdf_pfn_t num_dumped;
> >
> > @@ -6459,6 +6460,8 @@ __exclude_unnecessary_pages(unsigned long mem_map,
> > unsigned int order_offset, dtor_offset;
> > unsigned long flags, mapping, private = 0;
> > unsigned long compound_dtor, compound_head = 0;
> > + struct ft_page_info *cur_discard = NULL;
> > + struct ft_page_info *cur_keep = NULL;
> >
> > /*
> > * If a multi-page exclusion is pending, do it first
> > @@ -6495,6 +6498,13 @@ __exclude_unnecessary_pages(unsigned long mem_map,
> > if (info->flag_cyclic && !is_cyclic_region(pfn, cycle))
> > continue;
> >
> > + /*
> > + * Keep pages that specified by user via
> > + * makedumpfile extensions
> > + */
> > + if (filter_page(pfn, &cur_keep, false))
> > + continue;
> > +
>
> It makes sense to allow plugins to enumerate a list of PFNs to override
> and include. I like that - it's simple enough. But it's not flexible
> enough for my use case with userspace stacks :(
>
> The userspace stack region is an anon_vma. My plugin can enumerate the
> anon_vmas that it wants to save, but it's prohibitively expensive and
> complex to enumerate the list of pages associated with each anon_vma. We
> would need to do a page table walk for each process.
>
> There's a simpler way: from the struct page mapping and index fields,
> it's possible to determine which anon_vma the page is associated with,
> and what index it has within the VMA. And from this, we can make the
> determination of whether to include a page or not. This is what I had
> implemented in this patch:
>
> https://github.com/brenns10/makedumpfile/commit/1c0a828ef80962480f771915c2d494272721b659#diff-2593512d7ec329b34b1ca5686a7b6b073d0ca636df8ff20fea04684da2c8e063R6692-R12150
>
> So, I wonder if it makes sense to allow a plugin to register a callback
> to be called here, so the plugin can make the more complex decision?
> This would keep the logic outside of the core makedumpfile code, but
> allow the necessary flexibility.
>
> Something like:
>
> if (plugin_keep_page_callback && plugin_keep_page_callback(pfn, pcache))
> continue;
>
> And then the extension system could allow an extension to register that
> callback. It would need to keep the extension loaded for the duration of
> the execution of makedumpfile (rather than calling dlclose()
> immediately).
>
> What do you think about this? I'm happy to implement this part of it
> separate from your patch series -- you could simply drop the stuff
> related to page inclusion, and I can add the necessary pieces when I
> submit my extension patches.
Thanks a lot for your comments and suggestions! For my GPU mm
filtering case, the pages are often allocated as large contiguous
blocks, so a single list node like {discard: true, start_pfn: 100,
pfn_nums: 100000} can cover a large range of PFNs. As a result the
linked lists aren't very long, and overlap/merge handling isn't a
performance bottleneck for me. I agree that in your case, with
scattered pages, this could become a problem.
Here is my plan: in v4 I will re-design the extensions part so that
each extension declares which kallsyms/BTF symbols it is interested
in, and I will remove the hash tables for kallsyms as you suggested
(many thanks for that — I think it is a brilliant idea for saving
the kallsyms memory!). I will leave the linked-list / array page
inclusion/exclusion function to you, so we can cooperate on this.
Thanks again for your support and help :)
Thanks,
Tao Liu
>
> Thanks,
> Stephen
>
> > /*
> > * Exclude the memory hole.
> > */
> > @@ -6687,6 +6697,14 @@ check_order:
> > else if (isOffline(flags, _mapcount)) {
> > pfn_counter = &pfn_offline;
> > }
> > + /*
> > + * Exclude pages that specified by user via
> > + * makedumpfile extensions
> > + */
> > + else if (filter_page(pfn, &cur_discard, true)) {
> > + nr_pages = 1;
> > + pfn_counter = &pfn_extension;
> > + }
> > /*
> > * Unexcludable page
> > */
> > @@ -6748,6 +6766,7 @@ exclude_unnecessary_pages(struct cycle *cycle)
> > print_progress(PROGRESS_UNN_PAGES, info->num_mem_map,
> > info->num_mem_map, NULL);
> > print_execution_time(PROGRESS_UNN_PAGES, &ts_start);
> > }
> > + cleanup_filter_pages_info();
> >
> > return TRUE;
> > }
> > @@ -8234,7 +8253,7 @@ write_elf_pages_cyclic(struct cache_data *cd_header,
> > struct cache_data *cd_page)
> > */
> > if (info->flag_cyclic) {
> > pfn_zero = pfn_cache = pfn_cache_private = 0;
> > - pfn_user = pfn_free = pfn_hwpoison = pfn_offline = 0;
> > + pfn_user = pfn_free = pfn_hwpoison = pfn_offline =
> > pfn_extension = 0;
> > pfn_memhole = info->max_mapnr;
> > }
> >
> > @@ -9579,7 +9598,7 @@ write_kdump_pages_and_bitmap_cyclic(struct cache_data
> > *cd_header, struct cache_d
> > * Reset counter for debug message.
> > */
> > pfn_zero = pfn_cache = pfn_cache_private = 0;
> > - pfn_user = pfn_free = pfn_hwpoison = pfn_offline = 0;
> > + pfn_user = pfn_free = pfn_hwpoison = pfn_offline =
> > pfn_extension = 0;
> > pfn_memhole = info->max_mapnr;
> >
> > /*
> > @@ -10528,7 +10547,7 @@ print_report(void)
> > pfn_original = info->max_mapnr - pfn_memhole;
> >
> > pfn_excluded = pfn_zero + pfn_cache + pfn_cache_private
> > - + pfn_user + pfn_free + pfn_hwpoison + pfn_offline;
> > + + pfn_user + pfn_free + pfn_hwpoison + pfn_offline +
> > pfn_extension;
> >
> > REPORT_MSG("\n");
> > REPORT_MSG("Original pages : 0x%016llx\n", pfn_original);
> > @@ -10544,6 +10563,7 @@ print_report(void)
> > REPORT_MSG(" Free pages : 0x%016llx\n", pfn_free);
> > REPORT_MSG(" Hwpoison pages : 0x%016llx\n", pfn_hwpoison);
> > REPORT_MSG(" Offline pages : 0x%016llx\n", pfn_offline);
> > + REPORT_MSG(" Extension filter pages : 0x%016llx\n",
> > pfn_extension);
> > REPORT_MSG(" Remaining pages : 0x%016llx\n",
> > pfn_original - pfn_excluded);
> >
> > @@ -10584,7 +10604,7 @@ print_mem_usage(void)
> > pfn_original = info->max_mapnr - pfn_memhole;
> >
> > pfn_excluded = pfn_zero + pfn_cache + pfn_cache_private
> > - + pfn_user + pfn_free + pfn_hwpoison + pfn_offline;
> > + + pfn_user + pfn_free + pfn_hwpoison + pfn_offline +
> > pfn_extension;
> > shrinking = (pfn_original - pfn_excluded) * 100;
> > shrinking = shrinking / pfn_original;
> > total_size = info->page_size * pfn_original;
> > --
> > 2.47.0
>