Hi Stephen,

On Fri, Jan 23, 2026 at 2:04 PM Stephen Brennan
<[email protected]> wrote:
>
> Tao Liu <[email protected]> writes:
> > pfn and num are the data that extensions give to makedumpfile for mm page
> > filtering. Since makedumpfile iterates over pfns in ascending order in
> > __exclude_unnecessary_pages(), the pfn/num pairs are stored in ft_page_info
> > linked lists kept sorted in ascending pfn order. If one pfn hits a given
> > node, the next pfn is most likely to hit either the same node or the one
> > that follows it, so a cur variable saves the current list position to
> > speed up the pfn checking process.
>
> I'm wondering about the trade-off for using a linked list versus an
> array. Using the linked list, we are forced to maintain the sorted
> order as we construct the list, which is an O(N^2) insertion sort.
>
> If instead we used an array, we could sort it with qsort() once, at the
> end. Then we could merge any overlapping ranges. Lookup could be
> implemented cheaply with bsearch(), and we could continue to use the
> optimization where we maintain a "cur" pointer.  I believe the overall
> runtime complexity of the array approach would be O(N*log(N)) without
> requiring hand-implementing anything too complex, compared to O(N^2).
>
> Depending on the number of pages (and how fragmented they are), this
> may or may not be an issue.
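>
> A rough, untested sketch of what I mean (all names here are invented):
>
>         struct pfn_range { unsigned long pfn, num; };
>
>         static struct pfn_range *ranges;        /* grown with realloc() */
>         static size_t nr_ranges;
>
>         static int range_cmp(const void *a, const void *b)
>         {
>                 const struct pfn_range *ra = a, *rb = b;
>
>                 if (ra->pfn < rb->pfn)
>                         return -1;
>                 return ra->pfn > rb->pfn;
>         }
>
>         /* Called once after all insertions: sort, then merge overlaps */
>         static void finalize_ranges(void)
>         {
>                 size_t i, j = 0;
>
>                 qsort(ranges, nr_ranges, sizeof(*ranges), range_cmp);
>                 for (i = 1; i < nr_ranges; i++) {
>                         if (ranges[i].pfn <= ranges[j].pfn + ranges[j].num) {
>                                 /* Overlapping or adjacent: extend range j */
>                                 unsigned long end = ranges[i].pfn + ranges[i].num;
>
>                                 if (end > ranges[j].pfn + ranges[j].num)
>                                         ranges[j].num = end - ranges[j].pfn;
>                         } else {
>                                 ranges[++j] = ranges[i];
>                         }
>                 }
>                 if (nr_ranges)
>                         nr_ranges = j + 1;
>         }
>
> Lookup is then a bsearch() over ranges[], plus the same "cur" index
> optimization for sequential pfns.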
>
> In my testing for userspace tasks, the number of pages retained can be
> on the order of ~100k. However -- my use case can't really use a list of
> PFNs, which I'll explain below. So my use case doesn't really matter too
> much here -- maybe your use case has relatively few page ranges, so the
> cost of O(N^2) is not bad.
>
> So I guess I don't have a strong preference - but it's worth
> considering.
>
> > In addition, two ft_page_info linked lists are used: one for mm page
> > discarding and the other for page keeping.
> >
> > Signed-off-by: Tao Liu <[email protected]>
> > ---
> >  erase_info.c   | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++
> >  erase_info.h   | 12 +++++++
> >  makedumpfile.c | 28 ++++++++++++---
> >  3 files changed, 134 insertions(+), 4 deletions(-)
> >
> > diff --git a/erase_info.c b/erase_info.c
> > index b67d1d0..8838bea 100644
> > --- a/erase_info.c
> > +++ b/erase_info.c
> > @@ -2466,3 +2466,101 @@ get_size_eraseinfo(void)
> >       return size_eraseinfo;
> >  }
> >
> > +/* Pages to be discarded */
> > +static struct ft_page_info *ft_head_discard = NULL;
> > +/* Pages to be kept */
> > +static struct ft_page_info *ft_head_keep = NULL;
> > +
> > +/*
> > + * Insert the ft_page_info blocks into ft_head in ascending pfn order.
> > + */
> > +bool
> > +update_filter_pages_info(unsigned long pfn, unsigned long num, bool to_discard)
> > +{
> > +     struct ft_page_info *p, **ft_head;
> > +     struct ft_page_info *new_p = malloc(sizeof(struct ft_page_info));
> > +
> > +     ft_head = to_discard ? &ft_head_discard : &ft_head_keep;
> > +
> > +     if (!new_p) {
> > +             ERRMSG("Can't allocate memory for ft_page_info at %lx\n", pfn);
> > +             return false;
> > +     }
> > +     new_p->pfn = pfn;
> > +     new_p->num = num;
> > +     new_p->next = NULL;
> > +
> > +     if (!(*ft_head) || (*ft_head)->pfn > new_p->pfn) {
> > +             new_p->next = (*ft_head);
> > +             (*ft_head) = new_p;
> > +             return true;
> > +     }
> > +
> > +     p = (*ft_head);
> > +     while (p->next != NULL && p->next->pfn < new_p->pfn) {
> > +             p = p->next;
> > +     }
> > +
> > +     new_p->next = p->next;
> > +     p->next = new_p;
>
> It might be wise to defensively handle the case of overlapping
> PFN ranges by merging them.
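>
> For example, right before splicing in new_p, something like this
> (untested sketch, reusing the variables from the patch):
>
>         /* New range overlaps or abuts its predecessor: extend p instead */
>         if (new_p->pfn <= p->pfn + p->num) {
>                 unsigned long end = new_p->pfn + new_p->num;
>
>                 if (end > p->pfn + p->num)
>                         p->num = end - p->pfn;
>                 free(new_p);
>                 /* ...and then check whether p now swallows p->next, too */
>                 return true;
>         }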
>
> > +     return true;
> > +}
> > +
> > +/*
> > + * Check if the pfn hits an ft_page_info block.
> > + *
> > + * pfn and ft_head are in ascending order, so save the current ft_page_info
> > + * block into **p because it is likely to be hit again next time.
> > + */
> > +bool
> > +filter_page(unsigned long pfn, struct ft_page_info **p, bool handle_discard)
> > +{
> > +     struct ft_page_info *ft_head;
> > +
> > +     ft_head = handle_discard ? ft_head_discard : ft_head_keep;
> > +
> > +     if (ft_head == NULL)
> > +             return false;
> > +
> > +     if (*p == NULL)
> > +             *p = ft_head;
> > +
> > +     /* The gap before 1st block */
> > +     if (pfn < ft_head->pfn)
> > +             return false;
> > +
> > +     /* Handle 1~(n-1) blocks and following gaps */
> > +     while ((*p)->next) {
> > +             if (pfn >= (*p)->pfn && pfn < (*p)->pfn + (*p)->num)
> > +                     return true; // hit the block
> > +             if (pfn >= (*p)->pfn + (*p)->num && pfn < (*p)->next->pfn)
> > +                     return false; // the gap after the block
> > +             *p = (*p)->next;
> > +     }
> > +
> > +     /* The last block and gap */
> > +     if (pfn >= (*p)->pfn + (*p)->num)
> > +             return false;
> > +     else
> > +             return true;
> > +}
> > +
> > +static void
> > +do_cleanup(struct ft_page_info **ft_head)
> > +{
> > +     struct ft_page_info *p, *p_tmp;
> > +
> > +     for (p = *ft_head; p;) {
> > +             p_tmp = p;
> > +             p = p->next;
> > +             free(p_tmp);
> > +     }
> > +     *ft_head = NULL;
> > +}
> > +
> > +void
> > +cleanup_filter_pages_info(void)
> > +{
> > +     do_cleanup(&ft_head_discard);
> > +     do_cleanup(&ft_head_keep);
> > +}
> > diff --git a/erase_info.h b/erase_info.h
> > index b363a40..6c60706 100644
> > --- a/erase_info.h
> > +++ b/erase_info.h
> > @@ -20,6 +20,7 @@
> >  #define _ERASE_INFO_H
> >
> >  #define MAX_SIZE_STR_LEN (26)
> > +#include <stdbool.h>
> >
> >  /*
> >   * Erase information, original symbol expressions.
> > @@ -65,5 +66,16 @@ void filter_data_buffer_parallel(unsigned char *buf, unsigned long long paddr,
> >  unsigned long get_size_eraseinfo(void);
> >  int update_filter_info_raw(unsigned long long, int, int);
> >
> > +bool update_filter_pages_info(unsigned long, unsigned long, bool);
> > +
> > +struct ft_page_info {
> > +     unsigned long pfn;
> > +     unsigned long num;
> > +     struct ft_page_info *next;
> > +} __attribute__((packed));
> > +
> > +bool filter_page(unsigned long, struct ft_page_info **p, bool handle_discard);
> > +void cleanup_filter_pages_info(void);
> > +
> >  #endif /* _ERASE_INFO_H */
> >
> > diff --git a/makedumpfile.c b/makedumpfile.c
> > index ca8ed8a..ebac8da 100644
> > --- a/makedumpfile.c
> > +++ b/makedumpfile.c
> > @@ -102,6 +102,7 @@ mdf_pfn_t pfn_free;
> >  mdf_pfn_t pfn_hwpoison;
> >  mdf_pfn_t pfn_offline;
> >  mdf_pfn_t pfn_elf_excluded;
> > +mdf_pfn_t pfn_extension;
> >
> >  mdf_pfn_t num_dumped;
> >
> > @@ -6459,6 +6460,8 @@ __exclude_unnecessary_pages(unsigned long mem_map,
> >       unsigned int order_offset, dtor_offset;
> >       unsigned long flags, mapping, private = 0;
> >       unsigned long compound_dtor, compound_head = 0;
> > +     struct ft_page_info *cur_discard = NULL;
> > +     struct ft_page_info *cur_keep = NULL;
> >
> >       /*
> >        * If a multi-page exclusion is pending, do it first
> > @@ -6495,6 +6498,13 @@ __exclude_unnecessary_pages(unsigned long mem_map,
> >               if (info->flag_cyclic && !is_cyclic_region(pfn, cycle))
> >                       continue;
> >
> > +             /*
> > +              * Keep pages specified by the user via
> > +              * makedumpfile extensions
> > +              */
> > +             if (filter_page(pfn, &cur_keep, false))
> > +                     continue;
> > +
>
> It makes sense to allow plugins to enumerate a list of PFNs to override
> and include. I like that - it's simple enough. But it's not flexible
> enough for my use case with userspace stacks :(
>
> The userspace stack region is an anon_vma. My plugin can enumerate the
> anon_vmas that it wants to save, but it's prohibitively expensive and
> complex to enumerate the list of pages associated with each anon_vma. We
> would need to do a page table walk for each process.
>
> There's a simpler way: from the struct page mapping and index fields,
> it's possible to determine which anon_vma the page is associated with,
> and what index it has within the VMA. And from this, we can make the
> determination of whether to include a page or not. This is what I had
> implemented in this patch:
>
> https://github.com/brenns10/makedumpfile/commit/1c0a828ef80962480f771915c2d494272721b659#diff-2593512d7ec329b34b1ca5686a7b6b073d0ca636df8ff20fea04684da2c8e063R6692-R12150
>
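> To make that concrete, the core of the check is roughly the following
> (heavily simplified from the commit above; is_registered_anon_vma() is
> a made-up placeholder for the real lookup, and "index" stands for the
> page's index field, read alongside mapping):
>
>         /* An anonymous page has the low bit (PAGE_MAPPING_ANON) set in
>          * its mapping field; the rest is the anon_vma pointer. */
>         if (mapping & 1) {
>                 unsigned long anon_vma = mapping & ~1UL;
>
>                 if (is_registered_anon_vma(anon_vma, index))
>                         continue;       /* keep this page */
>         }
>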
> So, I wonder if it makes sense to allow a plugin to register a callback
> to be called here, so the plugin can make the more complex decision?
> This would keep the logic outside of the core makedumpfile code, but
> allow the necessary flexibility.
>
> Something like:
>
> if (plugin_keep_page_callback && plugin_keep_page_callback(pfn, pcache))
>     continue;
>
> And then the extension system could allow an extension to register that
> callback. It would need to keep the extension loaded for the duration of
> the execution of makedumpfile (rather than calling dlclose()
> immediately).
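>
> The registration side could be as simple as (hypothetical names, just
> to sketch the shape of the API):
>
>         /* In the extension API header */
>         typedef bool (*keep_page_fn)(unsigned long pfn, void *pcache);
>         void register_keep_page_callback(keep_page_fn cb);
>
>         /* In the plugin's init routine, called after dlopen(); the
>          * handle then stays open until the dump finishes */
>         void plugin_init(void)
>         {
>                 register_keep_page_callback(my_keep_page);
>         }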
>
> What do you think about this? I'm happy to implement this part of it
> separate from your patch series -- you could simply drop the stuff
> related to page inclusion, and I can add the necessary pieces when I
> submit my extension patches.

Thanks a lot for your comments and suggestions! In my GPU mm filtering
case, the pages are usually allocated in large blocks, so a single list
node like {discard: true, start_pfn: 100, pfn_nums: 100000} can cover a
large range of pfns. As a result, the lists never get very long, and
neither the O(N^2) insertion nor overlap merging is a performance
bottleneck for me. I agree that for your case, with scattered pages,
this could be a problem.
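
For example, in my case a single call (using the API in this patch)
covers a whole GPU allocation block at once:

        /* discard 100000 contiguous pages starting at pfn 100 */
        update_filter_pages_info(100, 100000, true);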

Here is my plan: in v4, I will re-design the extensions part so that
each extension declares which kallsyms/BTF symbols it is interested in,
and I will remove the hash tables for kallsyms as you suggested (many
thanks for that, I think it is a brilliant idea for saving the kallsyms
memory!). I will also leave this linked list / array page
inclusion/exclusion work to you, so we can cooperate on it. Thanks again
for your support and help :)

Thanks,
Tao Liu

>
> Thanks,
> Stephen
>
> >               /*
> >                * Exclude the memory hole.
> >                */
> > @@ -6687,6 +6697,14 @@ check_order:
> >               else if (isOffline(flags, _mapcount)) {
> >                       pfn_counter = &pfn_offline;
> >               }
> > +             /*
> > +              * Exclude pages specified by the user via
> > +              * makedumpfile extensions
> > +              */
> > +             else if (filter_page(pfn, &cur_discard, true)) {
> > +                     nr_pages = 1;
> > +                     pfn_counter = &pfn_extension;
> > +             }
> >               /*
> >                * Unexcludable page
> >                */
> > @@ -6748,6 +6766,7 @@ exclude_unnecessary_pages(struct cycle *cycle)
> >               print_progress(PROGRESS_UNN_PAGES, info->num_mem_map, info->num_mem_map, NULL);
> >               print_execution_time(PROGRESS_UNN_PAGES, &ts_start);
> >       }
> > +     cleanup_filter_pages_info();
> >
> >       return TRUE;
> >  }
> > @@ -8234,7 +8253,7 @@ write_elf_pages_cyclic(struct cache_data *cd_header, struct cache_data *cd_page)
> >        */
> >       if (info->flag_cyclic) {
> >               pfn_zero = pfn_cache = pfn_cache_private = 0;
> > -             pfn_user = pfn_free = pfn_hwpoison = pfn_offline = 0;
> > +             pfn_user = pfn_free = pfn_hwpoison = pfn_offline = pfn_extension = 0;
> >               pfn_memhole = info->max_mapnr;
> >       }
> >
> > @@ -9579,7 +9598,7 @@ write_kdump_pages_and_bitmap_cyclic(struct cache_data *cd_header, struct cache_d
> >                * Reset counter for debug message.
> >                */
> >               pfn_zero = pfn_cache = pfn_cache_private = 0;
> > -             pfn_user = pfn_free = pfn_hwpoison = pfn_offline = 0;
> > +             pfn_user = pfn_free = pfn_hwpoison = pfn_offline = pfn_extension = 0;
> >               pfn_memhole = info->max_mapnr;
> >
> >               /*
> > @@ -10528,7 +10547,7 @@ print_report(void)
> >       pfn_original = info->max_mapnr - pfn_memhole;
> >
> >       pfn_excluded = pfn_zero + pfn_cache + pfn_cache_private
> > -         + pfn_user + pfn_free + pfn_hwpoison + pfn_offline;
> > +         + pfn_user + pfn_free + pfn_hwpoison + pfn_offline + pfn_extension;
> >
> >       REPORT_MSG("\n");
> >       REPORT_MSG("Original pages  : 0x%016llx\n", pfn_original);
> > @@ -10544,6 +10563,7 @@ print_report(void)
> >       REPORT_MSG("    Free pages              : 0x%016llx\n", pfn_free);
> >       REPORT_MSG("    Hwpoison pages          : 0x%016llx\n", pfn_hwpoison);
> >       REPORT_MSG("    Offline pages           : 0x%016llx\n", pfn_offline);
> > +     REPORT_MSG("    Extension filter pages  : 0x%016llx\n", pfn_extension);
> >       REPORT_MSG("  Remaining pages  : 0x%016llx\n",
> >           pfn_original - pfn_excluded);
> >
> > @@ -10584,7 +10604,7 @@ print_mem_usage(void)
> >       pfn_original = info->max_mapnr - pfn_memhole;
> >
> >       pfn_excluded = pfn_zero + pfn_cache + pfn_cache_private
> > -         + pfn_user + pfn_free + pfn_hwpoison + pfn_offline;
> > +         + pfn_user + pfn_free + pfn_hwpoison + pfn_offline + pfn_extension;
> >       shrinking = (pfn_original - pfn_excluded) * 100;
> >       shrinking = shrinking / pfn_original;
> >       total_size = info->page_size * pfn_original;
> > --
> > 2.47.0
>

