On 12/9/25 2:47 PM, Jan Beulich wrote:
On 24.11.2025 13:33, Oleksii Kurochko wrote:
@@ -374,24 +399,107 @@ static struct page_info *p2m_alloc_page(struct p2m_domain *p2m)
return pg;
}
-static int p2m_set_type(pte_t *pte, p2m_type_t t)
+/*
+ * `pte` – PTE entry for which the type `t` will be stored.
+ *
+ * If `t` is `p2m_ext_storage`, both `ctx` and `p2m` must be provided;
+ * otherwise, only p2m may be NULL.
+ */
+static void p2m_set_type(pte_t *pte, p2m_type_t t,
+ struct p2m_pte_ctx *ctx,
+ struct p2m_domain *p2m)
I assume you having struct p2m_domain * separate from ctx here is because the
"get" counterpart wouldn't need it. The same is true for the level member,
though. I wonder therefore whether putting p2m in pte_ctx as well wouldn't
make for an overall cleaner interface. (But this is truly just a thought; I
don't mean to insist here.)
I think it really makes sense to put p2m into pte_ctx.
Besides simplifying p2m_{set,get}_type(), it will also allow us to drop the p2m
argument from p2m_pte_from_mfn().
Let’s make this change now.
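For reference, with that change the context would presumably end up looking something like the sketch below (field names are taken from this thread; the actual declaration in the patch may differ):

struct p2m_pte_ctx {
    struct page_info *pt_page;  /* page table page containing the PTE */
    unsigned int index;         /* index of the PTE within pt_page */
    unsigned int level;         /* page table level the PTE lives at */
    struct p2m_domain *p2m;     /* newly added, as suggested above */
};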
{
- int rc = 0;
+ struct page_info **md_pg;
+ struct md_t *metadata = NULL;
- if ( t > p2m_first_external )
- panic("unimplemeted\n");
- else
- pte->pte |= MASK_INSR(t, P2M_TYPE_PTE_BITS_MASK);
+ ASSERT(p2m);
- return rc;
+ /*
+ * It is sufficient to compare ctx->index with PAGETABLE_ENTRIES because,
+ * even for the p2m root page table (which is a 16 KB page allocated as
+ * four 4 KB pages), calc_offset() guarantees that the page-table index
+ * will always fall within the range [0, 511].
+ */
+ ASSERT(ctx && ctx->index < PAGETABLE_ENTRIES);
+
+ /*
+ * At the moment, p2m_get_root_pointer() returns one of four possible p2m
+ * root pages, so there is no need to search for the correct ->pt_page
+ * here.
+ * Non-root page tables are 4 KB pages, so simply using ->pt_page is
+ * sufficient.
+ */
+ md_pg = &ctx->pt_page->v.md.pg;
+
+ if ( !*md_pg && (t >= p2m_first_external) )
+ {
+ BUG_ON(ctx->level > P2M_MAX_SUPPORTED_LEVEL_MAPPING);
+
+ if ( ctx->level <= P2M_MAX_SUPPORTED_LEVEL_MAPPING )
+ {
+ /*
+ * Since p2m_alloc_page() initializes an allocated page with zeros, p2m_invalid
+ * is expected to have the value 0 as well. This ensures that if a metadata
+ * page is accessed before being properly initialized, the correct type
+ * (p2m_invalid in this case) will be returned.
+ */
Nit: Line length.
Also imo "properly initialized" is ambiguous. The clearing of the page can
already count as such. No access to the page may happen ahead of this clearing.
I will then drop this part of the comment:
+ This ensures that if a metadata
+ * page is accessed before being properly initialized, the correct type
+ * (p2m_invalid in this case) will be returned.
+ BUILD_BUG_ON(p2m_invalid);
+
+ *md_pg = p2m_alloc_page(p2m);
+ if ( !*md_pg )
+ {
+ printk("%pd: can't allocate metadata page\n", p2m->domain);
+ domain_crash(p2m->domain);
+
+ return;
+ }
+ }
+ }
+
+ if ( *md_pg )
+ metadata = __map_domain_page(*md_pg);
+
+ if ( t >= p2m_first_external )
+ {
+ metadata[ctx->index].type = t;
+
+ t = p2m_ext_storage;
+ }
+ else if ( metadata )
+ metadata[ctx->index].type = p2m_invalid;
+
+ pte->pte |= MASK_INSR(t, P2M_TYPE_PTE_BITS_MASK);
+
+ unmap_domain_page(metadata);
}
Just to mention (towards future work): Once a metadata page goes back to be
entirely zero-filled, it could as well be hooked off and returned to the pool.
Not doing so may mean detaining an unused page indefinitely.
Won’t that already happen when p2m_free_table() is called?
p2m_free_table() calls p2m_free_page(p2m, tbl_pg->v.md.pg), which in turn calls
paging_free_page(), and that returns the page to the pool by doing:
page_list_add_tail(pg, &d->arch.paging.freelist);
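For illustration, the suggestion seems to be about returning the page while the page table itself is still in use, i.e. before p2m_free_table() ever runs. A rough sketch of such a check (p2m_maybe_free_metadata() is a hypothetical helper; the md_t and v.md.pg names are taken from this patch):

/* Hypothetical helper, not part of the posted patch. */
static void p2m_maybe_free_metadata(struct p2m_domain *p2m,
                                    struct page_info *pt_page)
{
    struct md_t *metadata;
    unsigned int i;

    if ( !pt_page->v.md.pg )
        return;

    metadata = __map_domain_page(pt_page->v.md.pg);

    /* Keep the metadata page as long as any entry still holds a type. */
    for ( i = 0; i < PAGETABLE_ENTRIES; i++ )
        if ( metadata[i].type != p2m_invalid )
        {
            unmap_domain_page(metadata);
            return;
        }

    unmap_domain_page(metadata);

    /* All entries are back to p2m_invalid: return the page to the pool. */
    p2m_free_page(p2m, pt_page->v.md.pg);
    pt_page->v.md.pg = NULL;
}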
-static p2m_type_t p2m_get_type(const pte_t pte)
+/*
+ * `pte` -> PTE entry that stores the PTE's type.
+ *
+ * If the PTE's type is `p2m_ext_storage`, `ctx` should be provided;
+ * otherwise it could be NULL.
+ */
+static p2m_type_t p2m_get_type(const pte_t pte, const struct p2m_pte_ctx *ctx)
{
p2m_type_t type = MASK_EXTR(pte.pte, P2M_TYPE_PTE_BITS_MASK);
+ /*
+ * Since the PTE is initialized with all zeros by default, p2m_invalid must
+ * have the value 0. This ensures that if p2m_get_type() is called for a GFN
+ * that hasn't been initialized, the correct type (p2m_invalid in this case)
+ * will be returned. It also guarantees that metadata won't be touched when
+ * the GFN hasn't been initialized.
+ */
+ BUILD_BUG_ON(p2m_invalid);
I don't think comment and BUILD_BUG_ON() need repeating here. That's relevant
only when (zero-)initializing the page.
if ( type == p2m_ext_storage )
- panic("unimplemented\n");
+ {
+ const struct md_t *md = __map_domain_page(ctx->pt_page->v.md.pg);
+
+ type = md[ctx->index].type;
In exchange you may want to assert here that the type found is
>= p2m_first_external (as - supposedly - guaranteed by p2m_set_type()).
Makes sense; I will add the following after type = ...:
/*
* Since p2m_set_type() guarantees that the type will be greater than
* p2m_first_external, just check that we received a valid type here.
*/
ASSERT(type > p2m_first_external);
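Put together, the external-storage path of p2m_get_type() would then read roughly as follows (a sketch; the trailing unmap_domain_page() is assumed, as the quoted hunk is truncated):

    if ( type == p2m_ext_storage )
    {
        const struct md_t *md = __map_domain_page(ctx->pt_page->v.md.pg);

        type = md[ctx->index].type;

        /*
         * Since p2m_set_type() guarantees that the type will be greater than
         * p2m_first_external, just check that we received a valid type here.
         */
        ASSERT(type > p2m_first_external);

        unmap_domain_page(md);
    }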
@@ -792,6 +952,13 @@ static bool p2m_split_superpage(struct p2m_domain *p2m, pte_t *entry,
pte = *entry;
pte_set_mfn(&pte, mfn_add(mfn, i << level_order));
+ if ( MASK_EXTR(pte.pte, P2M_TYPE_PTE_BITS_MASK) == p2m_ext_storage )
+ {
+ p2m_pte_ctx.index = i;
+
+ p2m_set_type(&pte, old_type, &p2m_pte_ctx, p2m);
In order to re-use p2m_pte_ctx across multiple iterations without fully
re-initializing it, you want the respective parameter of p2m_set_type() to be
pointer-to-const.
Good point. I will update the prototype of p2m_set_type() to:
static void p2m_set_type(...
const struct p2m_pte_ctx *ctx)
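Combined with moving p2m into pte_ctx (agreed above), the setter would presumably end up with a signature along the lines of (a sketch, not the posted patch):

static void p2m_set_type(pte_t *pte, p2m_type_t t,
                         const struct p2m_pte_ctx *ctx);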
@@ -894,13 +1061,21 @@ static int p2m_set_entry(struct p2m_domain *p2m,
{
/* We need to split the original page. */
pte_t split_pte = *entry;
+ struct page_info *tbl_pg = mfn_to_page(domain_page_map_to_mfn(table));
ASSERT(pte_is_superpage(*entry, level));
- if ( !p2m_split_superpage(p2m, &split_pte, level, target, offsets) )
+ if ( !p2m_split_superpage(p2m, &split_pte, level, target, offsets,
+ tbl_pg) )
{
+ struct p2m_pte_ctx tmp_ctx = {
+ .pt_page = tbl_pg,
+ .index = offsets[level],
+ .level = level,
+ };
This, ...
@@ -938,7 +1113,13 @@ static int p2m_set_entry(struct p2m_domain *p2m,
p2m_clean_pte(entry, p2m->clean_dcache);
else
{
- pte_t pte = p2m_pte_from_mfn(mfn, t, false);
+ struct p2m_pte_ctx tmp_ctx = {
+ .pt_page = mfn_to_page(domain_page_map_to_mfn(table)),
+ .index = offsets[level],
+ .level = level,
+ };
... this, and ...
@@ -974,7 +1155,15 @@ static int p2m_set_entry(struct p2m_domain *p2m,
if ( pte_is_valid(orig_pte) &&
(!pte_is_valid(*entry) ||
!mfn_eq(pte_get_mfn(*entry), pte_get_mfn(orig_pte))) )
- p2m_free_subtree(p2m, orig_pte, level);
+ {
+ struct p2m_pte_ctx tmp_ctx = {
+ .pt_page = mfn_to_page(domain_page_map_to_mfn(table)),
+ .index = offsets[level],
+ .level = level,
+ };
... this initializer are identical. Perhaps (sorry) it wasn't a good idea
after all to move the context variable from function scope to the more
narrow ones?
Probably, but I’m not 100% sure that making it a function-scope variable would be better.
Essentially, it would only help eliminate the last two declarations of `tmp_ctx`:
struct p2m_pte_ctx tmp_ctx = {
.pt_page = mfn_to_page(domain_page_map_to_mfn(table)),
.index = offsets[level],
.level = level,
};
It would also require re-initializing (or, as an option, it could be done once after the superpage breakup has happened; please look at the diff below) all the fields (except the newly added `tmp_ctx.p2m`) inside the case where a superpage breakup is needed:
/*
* If we are here with level > target, we must be at a leaf node,
* and we need to break up the superpage.
*/
if (level > target)
In this case, `index`, `level`, and `pt_page` will be changed.
It seems slightly better to use a function-scope variable, so I can rework the code to have the following:
@@ -1049,6 +1047,8 @@ static int p2m_set_entry(struct p2m_domain *p2m,
entry = table + offsets[level];
+ tmp_ctx.p2m = p2m;
+
/*
* If we are here with level > target, we must be at a leaf node,
* and we need to break up the superpage.
@@ -1064,11 +1064,9 @@ static int p2m_set_entry(struct p2m_domain *p2m,
if ( !p2m_split_superpage(p2m, &split_pte, level, target, offsets,
tbl_pg) )
{
- struct p2m_pte_ctx tmp_ctx = {
- .pt_page = tbl_pg,
- .index = offsets[level],
- .level = level,
- };
+ tmp_ctx.pt_page = tbl_pg;
+ tmp_ctx.index = offsets[level];
+ tmp_ctx.level = level;
/* Free the allocated sub-tree */
p2m_free_subtree(p2m, split_pte, &tmp_ctx);
@@ -1097,6 +1095,10 @@ static int p2m_set_entry(struct p2m_domain *p2m,
entry = table + offsets[level];
}
+ tmp_ctx.pt_page = mfn_to_page(domain_page_map_to_mfn(table));
+ tmp_ctx.index = offsets[level];
+ tmp_ctx.level = level;
+
/*
* We should always be there with the correct level because all the
* intermediate tables have been installed if necessary.
@@ -1109,13 +1111,7 @@ static int p2m_set_entry(struct p2m_domain *p2m,
p2m_clean_pte(entry, p2m->clean_dcache);
else
{
- struct p2m_pte_ctx tmp_ctx = {
- .pt_page = mfn_to_page(domain_page_map_to_mfn(table)),
- .index = offsets[level],
- .level = level,
- };
-
- pte_t pte = p2m_pte_from_mfn(mfn, t, &tmp_ctx, p2m);
+ pte_t pte = p2m_pte_from_mfn(mfn, t, &tmp_ctx);
p2m_write_pte(entry, pte, p2m->clean_dcache);
@@ -1152,12 +1148,6 @@ static int p2m_set_entry(struct p2m_domain *p2m,
(!pte_is_valid(*entry) ||
!mfn_eq(pte_get_mfn(*entry), pte_get_mfn(orig_pte))) )
{
- struct p2m_pte_ctx tmp_ctx = {
- .pt_page = mfn_to_page(domain_page_map_to_mfn(table)),
- .index = offsets[level],
- .level = level,
- };
-
p2m_free_subtree(p2m, orig_pte, &tmp_ctx);
}
@@ -1363,6 +1353,7 @@ static mfn_t p2m_get_entry(struct p2m_domain *p2m, gfn_t gfn,
.pt_page = mfn_to_page(domain_page_map_to_mfn(table)),
.index = offsets[level],
.level = level,
+ .p2m = p2m,
};
*t = p2m_get_type(entry, &p2m_pte_ctx);
Does it make sense?
Thanks.
~ Oleksii