The existing pagetable walker has complicated return semantics, which squeeze
multiple pieces of information into a single integer.  This would be fine if the
information didn't overlap, but it does.

Specifically, _PAGE_INVALID_BITS for 3-level guests alias _PAGE_PAGED and
_PAGE_SHARED.  A guest which constructs a PTE with bits 52 or 53 set (the
start of the upper software-available range) will create a virtual address
which, when walked by Xen, tricks Xen into believing the frame is paged or
shared.  This behaviour was introduced by XSA-173 (c/s 8b17648).

It is also complicated to turn rc back into a normal pagefault error code.
Instead, change the calling semantics to return a boolean indicating success,
and have the function accumulate a real pagefault error code as it goes
(including synthetic error codes, which do not alias hardware ones).  This
requires an equivalent adjustment to map_domain_gfn().

Issues fixed:
 * 2-level PSE36 superpages now return the correct translation.
 * 2-level L2 superpages without CR0.PSE now return the correct translation.
 * SMEP now inhibits a user instruction fetch even if NX isn't active.
 * Supervisor writes without CR0.WP now set the leaf dirty bit.
 * L4e._PAGE_GLOBAL is strictly reserved on AMD.
 * 3-level l3 entries have all reserved bits checked.
 * 3-level entries can no longer alias Xen's idea of paged or shared.

Signed-off-by: Andrew Cooper <andrew.coop...@citrix.com>
Reviewed-by: Tim Deegan <t...@xen.org>
Reviewed-by: George Dunlap <george.dun...@citrix.com>
Reviewed-by: Jan Beulich <jbeul...@suse.com>
---
 xen/arch/x86/mm/guest_walk.c      | 449 ++++++++++++++++++++------------------
 xen/arch/x86/mm/hap/guest_walk.c  |  21 +-
 xen/arch/x86/mm/hap/nested_ept.c  |   2 +-
 xen/arch/x86/mm/p2m.c             |  17 +-
 xen/arch/x86/mm/shadow/multi.c    |  27 +--
 xen/include/asm-x86/guest_pt.h    |  62 ++++--
 xen/include/asm-x86/p2m.h         |   2 +-
 xen/include/asm-x86/page.h        |   3 -
 xen/include/asm-x86/processor.h   |   2 +
 xen/include/asm-x86/x86_64/page.h |   6 -
 10 files changed, 303 insertions(+), 288 deletions(-)

diff --git a/xen/arch/x86/mm/guest_walk.c b/xen/arch/x86/mm/guest_walk.c
index 11f4c5d..bdb52d9 100644
--- a/xen/arch/x86/mm/guest_walk.c
+++ b/xen/arch/x86/mm/guest_walk.c
@@ -32,44 +32,6 @@ asm(".file \"" __OBJECT_FILE__ "\"");
 #include <asm/page.h>
 #include <asm/guest_pt.h>
 
-extern const uint32_t gw_page_flags[];
-#if GUEST_PAGING_LEVELS == CONFIG_PAGING_LEVELS
-const uint32_t gw_page_flags[] = {
-    /* I/F -  Usr Wr */
-    /* 0   0   0   0 */ _PAGE_PRESENT,
-    /* 0   0   0   1 */ _PAGE_PRESENT|_PAGE_RW,
-    /* 0   0   1   0 */ _PAGE_PRESENT|_PAGE_USER,
-    /* 0   0   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
-    /* 0   1   0   0 */ _PAGE_PRESENT,
-    /* 0   1   0   1 */ _PAGE_PRESENT|_PAGE_RW,
-    /* 0   1   1   0 */ _PAGE_PRESENT|_PAGE_USER,
-    /* 0   1   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER,
-    /* 1   0   0   0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
-    /* 1   0   0   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
-    /* 1   0   1   0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
-    /* 1   0   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
-    /* 1   1   0   0 */ _PAGE_PRESENT|_PAGE_NX_BIT,
-    /* 1   1   0   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_NX_BIT,
-    /* 1   1   1   0 */ _PAGE_PRESENT|_PAGE_USER|_PAGE_NX_BIT,
-    /* 1   1   1   1 */ _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_NX_BIT,
-};
-#endif
-
-/* Flags that are needed in a pagetable entry, with the sense of NX inverted */
-static uint32_t mandatory_flags(struct vcpu *v, uint32_t pfec) 
-{
-    /* Don't demand not-NX if the CPU wouldn't enforce it. */
-    if ( !guest_supports_nx(v) )
-        pfec &= ~PFEC_insn_fetch;
-
-    /* Don't demand R/W if the CPU wouldn't enforce it. */
-    if ( is_hvm_vcpu(v) && unlikely(!hvm_wp_enabled(v)) 
-         && !(pfec & PFEC_user_mode) )
-        pfec &= ~PFEC_write_access;
-
-    return gw_page_flags[(pfec & 0x1f) >> 1] | _PAGE_INVALID_BITS;
-}
-
 /* Modify a guest pagetable entry to set the Accessed and Dirty bits.
  * Returns non-zero if it actually writes to guest memory. */
 static uint32_t set_ad_bits(void *guest_p, void *walk_p, int set_dirty)
@@ -90,62 +52,33 @@ static uint32_t set_ad_bits(void *guest_p, void *walk_p, 
int set_dirty)
     return 0;
 }
 
-#if GUEST_PAGING_LEVELS >= 4
-static bool_t pkey_fault(struct vcpu *vcpu, uint32_t pfec,
-        uint32_t pte_flags, uint32_t pte_pkey)
-{
-    uint32_t pkru;
-
-    /* When page isn't present,  PKEY isn't checked. */
-    if ( !(pfec & PFEC_page_present) || is_pv_vcpu(vcpu) )
-        return 0;
-
-    /*
-     * PKU:  additional mechanism by which the paging controls
-     * access to user-mode addresses based on the value in the
-     * PKRU register. A fault is considered as a PKU violation if all
-     * of the following conditions are true:
-     * 1.CR4_PKE=1.
-     * 2.EFER_LMA=1.
-     * 3.Page is present with no reserved bit violations.
-     * 4.The access is not an instruction fetch.
-     * 5.The access is to a user page.
-     * 6.PKRU.AD=1 or
-     *      the access is a data write and PKRU.WD=1 and
-     *          either CR0.WP=1 or it is a user access.
-     */
-    if ( !hvm_pku_enabled(vcpu) ||
-         !hvm_long_mode_enabled(vcpu) ||
-         (pfec & PFEC_reserved_bit) ||
-         (pfec & PFEC_insn_fetch) ||
-         !(pte_flags & _PAGE_USER) )
-        return 0;
-
-    pkru = read_pkru();
-    if ( unlikely(pkru) )
-    {
-        bool_t pkru_ad = read_pkru_ad(pkru, pte_pkey);
-        bool_t pkru_wd = read_pkru_wd(pkru, pte_pkey);
-
-        /* Condition 6 */
-        if ( pkru_ad ||
-             (pkru_wd && (pfec & PFEC_write_access) &&
-              (hvm_wp_enabled(vcpu) || (pfec & PFEC_user_mode))) )
-            return 1;
-    }
-
-    return 0;
-}
-#endif
-
-/* Walk the guest pagetables, after the manner of a hardware walker. */
-/* Because the walk is essentially random, it can cause a deadlock 
- * warning in the p2m locking code. Highly unlikely this is an actual
- * deadlock, because who would walk page table in the opposite order? */
-uint32_t
+/*
+ * Walk the guest pagetables, after the manner of a hardware walker.
+ *
+ * This is a condensing of the 'Paging' chapters from Intel and AMD software
+ * manuals.  Please refer closely to them.
+ *
+ * A pagetable walk consists of two parts:
+ *   1) to find whether a translation exists, and
+ *   2) if a translation does exist, to check whether the translation's access
+ *      rights permit the access.
+ *
+ * A translation is found by following the pagetable structure (starting at
+ * %cr3) to a leaf entry (an L1 PTE, or a higher level entry with PSE set)
+ * which identifies the physical destination of the access.
+ *
+ * A translation from one level to the next exists if the PTE is both present
+ * and has no reserved bits set.  If the pagewalk encounters a situation where a
+ * translation does not exist, the walk stops at that point.
+ *
+ * The access rights (NX, User, RW bits) are collected as the walk progresses.
+ * If a translation exists, the accumulated access rights are compared to the
+ * requested walk, to see whether the access is permitted.
+ */
+bool
 guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
                   unsigned long va, walk_t *gw, 
-                  uint32_t pfec, mfn_t top_mfn, void *top_map)
+                  uint32_t walk, mfn_t top_mfn, void *top_map)
 {
     struct domain *d = v->domain;
     p2m_type_t p2mt;
@@ -155,64 +88,44 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
     guest_l3e_t *l3p = NULL;
     guest_l4e_t *l4p;
 #endif
-    unsigned int pkey;
-    uint32_t gflags, mflags, iflags, rc = 0;
-    bool_t smep = 0, smap = 0;
+    uint32_t gflags, rc;
     bool_t pse1G = 0, pse2M = 0;
     p2m_query_t qt = P2M_ALLOC | P2M_UNSHARE;
 
-    /* Only implicit supervisor data accesses exist. */
-    ASSERT(!(pfec & PFEC_implicit) ||
-           !(pfec & (PFEC_insn_fetch|PFEC_user_mode)));
+#define AR_ACCUM_AND (_PAGE_USER | _PAGE_RW)
+#define AR_ACCUM_OR  (_PAGE_NX_BIT)
+    /* Start with all AND bits set, all OR bits clear. */
+    uint32_t ar, ar_and = ~0u, ar_or = 0;
 
-    perfc_incr(guest_walk);
-    memset(gw, 0, sizeof(*gw));
-    gw->va = va;
-
-    /* Mandatory bits that must be set in every entry.  We invert NX and
-     * the invalid bits, to calculate as if there were an "X" bit that
-     * allowed access.  We will accumulate, in rc, the set of flags that
-     * are missing/unwanted. */
-    mflags = mandatory_flags(v, pfec);
-    iflags = (_PAGE_NX_BIT | _PAGE_INVALID_BITS);
+    bool walk_ok = false;
 
-    if ( is_hvm_domain(d) && !(pfec & PFEC_user_mode) )
-    {
-        const struct cpu_user_regs *regs = guest_cpu_user_regs();
+    /*
+     * TODO - We should ASSERT() that only the following bits are set as
+     * inputs to a guest walk, but a whole load of code currently passes in
+     * other PFEC_ constants.
+     */
+    walk &= (PFEC_implicit | PFEC_insn_fetch | PFEC_user_mode | 
PFEC_write_access);
 
-        /* SMEP: kernel-mode instruction fetches from user-mode mappings
-         * should fault.  Unlike NX or invalid bits, we're looking for _all_
-         * entries in the walk to have _PAGE_USER set, so we need to do the
-         * whole walk as if it were a user-mode one and then invert the 
answer. */
-        smep =  hvm_smep_enabled(v) && (pfec & PFEC_insn_fetch);
+    /* Only implicit supervisor data accesses exist. */
+    ASSERT(!(walk & PFEC_implicit) ||
+           !(walk & (PFEC_insn_fetch|PFEC_user_mode)));
 
-        switch ( v->arch.smap_check_policy )
-        {
-        case SMAP_CHECK_HONOR_CPL_AC:
-            /*
-             * SMAP: kernel-mode data accesses from user-mode mappings
-             * should fault.
-             * A fault is considered as a SMAP violation if the following
-             * conditions come true:
-             *   - X86_CR4_SMAP is set in CR4
-             *   - A user page is accessed
-             *   - CPL = 3 or X86_EFLAGS_AC is clear
-             *   - Page fault in kernel mode
-             */
-            smap = hvm_smap_enabled(v) &&
-                   ((hvm_get_cpl(v) == 3) || !(regs->eflags & X86_EFLAGS_AC));
-            break;
-        case SMAP_CHECK_ENABLED:
-            smap = hvm_smap_enabled(v);
-            break;
-        default:
-            ASSERT(v->arch.smap_check_policy == SMAP_CHECK_DISABLED);
-            break;
-        }
-    }
+    /*
+     * PFEC_insn_fetch is only used as an input to pagetable walking if NX or
+     * SMEP are enabled.  Otherwise, instruction fetches are indistinguishable
+     * from data reads.
+     *
+     * This property can be demonstrated on real hardware by having NX and
+     * SMEP inactive, but SMAP active, and observing that EFLAGS.AC determines
+ * whether a pagefault occurs for supervisor execution on user mappings.
+     */
+    if ( !(guest_supports_nx(v) || guest_smep_enabled(v)) )
+        walk &= ~PFEC_insn_fetch;
 
-    if ( smep || smap )
-        mflags |= _PAGE_USER;
+    perfc_incr(guest_walk);
+    memset(gw, 0, sizeof(*gw));
+    gw->va = va;
+    gw->pfec = walk & (PFEC_insn_fetch | PFEC_user_mode | PFEC_write_access);
 
 #if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
@@ -221,17 +134,20 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
     gw->l4mfn = top_mfn;
     l4p = (guest_l4e_t *) top_map;
     gw->l4e = l4p[guest_l4_table_offset(va)];
-    gflags = guest_l4e_get_flags(gw->l4e) ^ iflags;
-    if ( !(gflags & _PAGE_PRESENT) ) {
-        rc |= _PAGE_PRESENT;
+    gflags = guest_l4e_get_flags(gw->l4e);
+    if ( !(gflags & _PAGE_PRESENT) )
         goto out;
-    }
-    if ( gflags & _PAGE_PSE )
+
+    /* Check for reserved bits. */
+    if ( guest_l4e_rsvd_bits(v, gw->l4e) )
     {
-        rc |= _PAGE_PSE | _PAGE_INVALID_BIT;
+        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
         goto out;
     }
-    rc |= ((gflags & mflags) ^ mflags);
+
+    /* Accumulate l4e access rights. */
+    ar_and &= gflags;
+    ar_or  |= gflags;
 
     /* Map the l3 table */
     l3p = map_domain_gfn(p2m, 
@@ -241,17 +157,28 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
                          qt,
                          &rc); 
     if(l3p == NULL)
+    {
+        gw->pfec |= rc & PFEC_synth_mask;
         goto out;
+    }
+
     /* Get the l3e and check its flags*/
     gw->l3e = l3p[guest_l3_table_offset(va)];
-    pkey = guest_l3e_get_pkey(gw->l3e);
-    gflags = guest_l3e_get_flags(gw->l3e) ^ iflags;
-    if ( !(gflags & _PAGE_PRESENT) ) {
-        rc |= _PAGE_PRESENT;
+    gflags = guest_l3e_get_flags(gw->l3e);
+    if ( !(gflags & _PAGE_PRESENT) )
+        goto out;
+
+    /* Check for reserved bits, including possibly _PAGE_PSE. */
+    if ( guest_l3e_rsvd_bits(v, gw->l3e) )
+    {
+        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
         goto out;
     }
-    rc |= ((gflags & mflags) ^ mflags);
     
+    /* Accumulate l3e access rights. */
+    ar_and &= gflags;
+    ar_or  |= gflags;
+
     pse1G = !!(gflags & _PAGE_PSE);
 
     if ( pse1G )
@@ -272,26 +199,25 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
             /* _PAGE_PSE_PAT not set: remove _PAGE_PAT from flags. */
             flags &= ~_PAGE_PAT;
 
-        if ( !guest_supports_l3_superpages(d) )
-            rc |= _PAGE_PSE | _PAGE_INVALID_BIT;
-        if ( gfn_x(start) & GUEST_L3_GFN_MASK & ~0x1 )
-            rc |= _PAGE_INVALID_BITS;
-
         /* Increment the pfn by the right number of 4k pages. */
         start = _gfn((gfn_x(start) & ~GUEST_L3_GFN_MASK) +
                      ((va >> PAGE_SHIFT) & GUEST_L3_GFN_MASK));
         gw->l1e = guest_l1e_from_gfn(start, flags);
         gw->l2mfn = gw->l1mfn = INVALID_MFN;
-        goto set_ad;
+        goto leaf;
     }
 
 #else /* PAE only... */
 
     /* Get the l3e and check its flag */
     gw->l3e = ((guest_l3e_t *) top_map)[guest_l3_table_offset(va)];
-    if ( !(guest_l3e_get_flags(gw->l3e) & _PAGE_PRESENT) ) 
+    gflags = guest_l3e_get_flags(gw->l3e);
+    if ( !(gflags & _PAGE_PRESENT) )
+        goto out;
+
+    if ( guest_l3e_rsvd_bits(v, gw->l3e) )
     {
-        rc |= _PAGE_PRESENT;
+        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
         goto out;
     }
 
@@ -305,7 +231,11 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
                          qt,
                          &rc); 
     if(l2p == NULL)
+    {
+        gw->pfec |= rc & PFEC_synth_mask;
         goto out;
+    }
+
     /* Get the l2e */
     gw->l2e = l2p[guest_l2_table_offset(va)];
 
@@ -318,15 +248,32 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
 
 #endif /* All levels... */
 
-    pkey = guest_l2e_get_pkey(gw->l2e);
-    gflags = guest_l2e_get_flags(gw->l2e) ^ iflags;
-    if ( !(gflags & _PAGE_PRESENT) ) {
-        rc |= _PAGE_PRESENT;
+    /* Check the l2e flags. */
+    gflags = guest_l2e_get_flags(gw->l2e);
+    if ( !(gflags & _PAGE_PRESENT) )
+        goto out;
+
+    /*
+     * In 2-level paging without CR0.PSE, there are no reserved bits, and the
+     * PAT/PSE bit is ignored.
+     */
+    if ( GUEST_PAGING_LEVELS == 2 && !guest_supports_l2_superpages(v) )
+    {
+        gw->l2e.l2 &= ~_PAGE_PSE;
+        gflags &= ~_PAGE_PSE;
+    }
+    /* else check for reserved bits, including possibly _PAGE_PSE. */
+    else if ( guest_l2e_rsvd_bits(v, gw->l2e) )
+    {
+        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
         goto out;
     }
-    rc |= ((gflags & mflags) ^ mflags);
 
-    pse2M = (gflags & _PAGE_PSE) && guest_supports_l2_superpages(v);
+    /* Accumulate l2e access rights. */
+    ar_and &= gflags;
+    ar_or  |= gflags;
+
+    pse2M = !!(gflags & _PAGE_PSE);
 
     if ( pse2M )
     {
@@ -334,7 +281,11 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
          * no guest l1e.  We make one up so that the propagation code
          * can generate a shadow l1 table.  Start with the gfn of the 
          * first 4k-page of the superpage. */
+#if GUEST_PAGING_LEVELS == 2
+        gfn_t start = _gfn(unfold_pse36(gw->l2e.l2) >> PAGE_SHIFT);
+#else
         gfn_t start = guest_l2e_get_gfn(gw->l2e);
+#endif
         /* Grant full access in the l1e, since all the guest entry's 
          * access controls are enforced in the shadow l2e. */
         int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
@@ -348,70 +299,137 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
             /* _PAGE_PSE_PAT not set: remove _PAGE_PAT from flags. */
             flags &= ~_PAGE_PAT;
 
-        if ( gfn_x(start) & GUEST_L2_GFN_MASK & ~0x1 )
-            rc |= _PAGE_INVALID_BITS;
-
-        /* Increment the pfn by the right number of 4k pages.  
-         * Mask out PAT and invalid bits. */
+        /* Increment the pfn by the right number of 4k pages. */
         start = _gfn((gfn_x(start) & ~GUEST_L2_GFN_MASK) +
                      guest_l1_table_offset(va));
+#if GUEST_PAGING_LEVELS == 2
+         /* Wider than 32 bits if PSE36 superpage. */
+        gw->el1e = (gfn_x(start) << PAGE_SHIFT) | flags;
+#else
         gw->l1e = guest_l1e_from_gfn(start, flags);
+#endif
         gw->l1mfn = INVALID_MFN;
-    } 
-    else 
+        goto leaf;
+    }
+
+    /* Map the l1 table */
+    l1p = map_domain_gfn(p2m,
+                         guest_l2e_get_gfn(gw->l2e),
+                         &gw->l1mfn,
+                         &p2mt,
+                         qt,
+                         &rc);
+    if ( l1p == NULL )
     {
-        /* Not a superpage: carry on and find the l1e. */
-        l1p = map_domain_gfn(p2m, 
-                             guest_l2e_get_gfn(gw->l2e), 
-                             &gw->l1mfn,
-                             &p2mt,
-                             qt,
-                             &rc);
-        if(l1p == NULL)
+        gw->pfec |= rc & PFEC_synth_mask;
+        goto out;
+    }
+    gw->l1e = l1p[guest_l1_table_offset(va)];
+    gflags = guest_l1e_get_flags(gw->l1e);
+    if ( !(gflags & _PAGE_PRESENT) )
+        goto out;
+
+    /* Check for reserved bits. */
+    if ( guest_l1e_rsvd_bits(v, gw->l1e) )
+    {
+        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
+        goto out;
+    }
+
+    /* Accumulate l1e access rights. */
+    ar_and &= gflags;
+    ar_or  |= gflags;
+
+ leaf:
+    gw->pfec |= PFEC_page_present;
+
+    /*
+     * The pagetable walk has returned a successful translation.  Now check
+     * access rights to see whether the access should succeed.
+     */
+    ar = (ar_and & AR_ACCUM_AND) | (ar_or & AR_ACCUM_OR);
+
+    if ( (walk & PFEC_insn_fetch) && (ar & _PAGE_NX_BIT) )
+        /* Requested an instruction fetch and found NX? Fail. */
+        goto out;
+
+    if ( walk & PFEC_user_mode ) /* Requested a user access. */
+    {
+        if ( !(ar & _PAGE_USER) )
+            /* Got a supervisor walk?  Unconditional fail. */
             goto out;
-        gw->l1e = l1p[guest_l1_table_offset(va)];
-        pkey = guest_l1e_get_pkey(gw->l1e);
-        gflags = guest_l1e_get_flags(gw->l1e) ^ iflags;
-        if ( !(gflags & _PAGE_PRESENT) ) {
-            rc |= _PAGE_PRESENT;
+
+        if ( (walk & PFEC_write_access) && !(ar & _PAGE_RW) )
+            /* Requested a write and only got a read? Fail. */
             goto out;
+    }
+    else /* Requested a supervisor access. */
+    {
+        if ( ar & _PAGE_USER ) /* Got a user walk. */
+        {
+            if ( (walk & PFEC_insn_fetch) && guest_smep_enabled(v) )
+                /* User insn fetch and smep? Fail. */
+                goto out;
+
+            if ( !(walk & PFEC_insn_fetch) && guest_smap_enabled(v) &&
+                 ((walk & PFEC_implicit) ||
+                  !(guest_cpu_user_regs()->eflags & X86_EFLAGS_AC)) )
+                /* User data access and smap? Fail. */
+                goto out;
         }
-        rc |= ((gflags & mflags) ^ mflags);
+
+        if ( (walk & PFEC_write_access) && !(ar & _PAGE_RW) &&
+             guest_wp_enabled(v) )
+            /* Requested a write, got a read, and CR0.WP is set? Fail. */
+            goto out;
     }
 
 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
-set_ad:
-    if ( pkey_fault(v, pfec, gflags, pkey) )
-        rc |= _PAGE_PKEY_BITS;
+    /*
+     * If all access checks are thus far ok, check Protection Key for 64bit
+     * user data accesses.
+     *
+     * N.B. In the case that the walk ended with a superpage, the fabricated
+     * gw->l1e contains the appropriate leaf pkey.
+     */
+    if ( (walk & PFEC_user_mode) && !(walk & PFEC_insn_fetch) &&
+         guest_pku_enabled(v) )
+    {
+        unsigned int pkey = guest_l1e_get_pkey(gw->l1e);
+        unsigned int pkru = read_pkru();
+
+        if ( read_pkru_ad(pkru, pkey) ||
+             ((walk & PFEC_write_access) && read_pkru_wd(pkru, pkey)) )
+        {
+            gw->pfec |= PFEC_prot_key;
+            goto out;
+        }
+    }
 #endif
-    /* Now re-invert the user-mode requirement for SMEP and SMAP */
-    if ( smep || smap )
-        rc ^= _PAGE_USER;
+
+    walk_ok = true;
 
     /* Go back and set accessed and dirty bits only if the walk was a
      * success.  Although the PRMs say higher-level _PAGE_ACCESSED bits
      * get set whenever a lower-level PT is used, at least some hardware
      * walkers behave this way. */
-    if ( rc == 0 ) 
-    {
 #if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
-        if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
-            paging_mark_dirty(d, gw->l4mfn);
-        if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e,
-                         (pse1G && (pfec & PFEC_write_access))) )
-            paging_mark_dirty(d, gw->l3mfn);
+    if ( set_ad_bits(l4p + guest_l4_table_offset(va), &gw->l4e, 0) )
+        paging_mark_dirty(d, gw->l4mfn);
+    if ( set_ad_bits(l3p + guest_l3_table_offset(va), &gw->l3e,
+                     (pse1G && (walk & PFEC_write_access))) )
+        paging_mark_dirty(d, gw->l3mfn);
 #endif
-        if ( !pse1G ) 
+    if ( !pse1G )
+    {
+        if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
+                         (pse2M && (walk & PFEC_write_access))) )
+            paging_mark_dirty(d, gw->l2mfn);
+        if ( !pse2M )
         {
-            if ( set_ad_bits(l2p + guest_l2_table_offset(va), &gw->l2e,
-                             (pse2M && (pfec & PFEC_write_access))) )
-                paging_mark_dirty(d, gw->l2mfn);
-            if ( !pse2M ) 
-            {
-                if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e, 
-                                 (pfec & PFEC_write_access)) )
-                    paging_mark_dirty(d, gw->l1mfn);
-            }
+            if ( set_ad_bits(l1p + guest_l1_table_offset(va), &gw->l1e,
+                             (walk & PFEC_write_access)) )
+                paging_mark_dirty(d, gw->l1mfn);
         }
     }
 
@@ -436,10 +454,5 @@ guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
         put_page(mfn_to_page(mfn_x(gw->l1mfn)));
     }
 
-    /* If this guest has a restricted physical address space then the
-     * target GFN must fit within it. */
-    if ( !(rc & _PAGE_PRESENT) && !gfn_valid(d, guest_walk_to_gfn(gw)) )
-        rc |= _PAGE_INVALID_BITS;
-
-    return rc;
+    return walk_ok;
 }
diff --git a/xen/arch/x86/mm/hap/guest_walk.c b/xen/arch/x86/mm/hap/guest_walk.c
index e202c9a..5a4896b 100644
--- a/xen/arch/x86/mm/hap/guest_walk.c
+++ b/xen/arch/x86/mm/hap/guest_walk.c
@@ -50,7 +50,7 @@ unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
     struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
     paddr_t ga, uint32_t *pfec, unsigned int *page_order)
 {
-    uint32_t missing;
+    bool walk_ok;
     mfn_t top_mfn;
     void *top_map;
     p2m_type_t p2mt;
@@ -91,12 +91,12 @@ unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
 #if GUEST_PAGING_LEVELS == 3
     top_map += (cr3 & ~(PAGE_MASK | 31));
 #endif
-    missing = guest_walk_tables(v, p2m, ga, &gw, *pfec, top_mfn, top_map);
+    walk_ok = guest_walk_tables(v, p2m, ga, &gw, *pfec, top_mfn, top_map);
     unmap_domain_page(top_map);
     put_page(top_page);
 
     /* Interpret the answer */
-    if ( missing == 0 )
+    if ( walk_ok )
     {
         gfn_t gfn = guest_walk_to_gfn(&gw);
         struct page_info *page;
@@ -123,20 +123,7 @@ unsigned long hap_p2m_ga_to_gfn(GUEST_PAGING_LEVELS)(
         return gfn_x(gfn);
     }
 
-    if ( missing & _PAGE_PRESENT )
-        *pfec &= ~PFEC_page_present;
-
-    if ( missing & _PAGE_INVALID_BITS ) 
-        *pfec |= PFEC_reserved_bit;
-
-    if ( missing & _PAGE_PKEY_BITS )
-        *pfec |= PFEC_prot_key;
-
-    if ( missing & _PAGE_PAGED )
-        *pfec = PFEC_page_paged;
-
-    if ( missing & _PAGE_SHARED )
-        *pfec = PFEC_page_shared;
+    *pfec = gw.pfec;
 
  out_tweak_pfec:
     /*
diff --git a/xen/arch/x86/mm/hap/nested_ept.c b/xen/arch/x86/mm/hap/nested_ept.c
index 02b27b1..14b1bb0 100644
--- a/xen/arch/x86/mm/hap/nested_ept.c
+++ b/xen/arch/x86/mm/hap/nested_ept.c
@@ -208,7 +208,7 @@ nept_walk_tables(struct vcpu *v, unsigned long l2ga, 
ept_walk_t *gw)
     goto out;
 
 map_err:
-    if ( rc == _PAGE_PAGED )
+    if ( rc == PFEC_page_paged )
     {
         ret = EPT_TRANSLATE_RETRY;
         goto out;
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index a5651a3..d38004c 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -1774,17 +1774,18 @@ unsigned long paging_gva_to_gfn(struct vcpu *v,
 }
 
 /*
- * If the map is non-NULL, we leave this function having
- * acquired an extra ref on mfn_to_page(*mfn).
+ * If the map is non-NULL, we leave this function having acquired an extra ref
+ * on mfn_to_page(*mfn).  In all cases, *pfec contains appropriate
+ * synthetic/structure PFEC_* bits.
  */
 void *map_domain_gfn(struct p2m_domain *p2m, gfn_t gfn, mfn_t *mfn,
-                     p2m_type_t *p2mt, p2m_query_t q, uint32_t *rc)
+                     p2m_type_t *p2mt, p2m_query_t q, uint32_t *pfec)
 {
     struct page_info *page;
 
     if ( !gfn_valid(p2m->domain, gfn) )
     {
-        *rc = _PAGE_INVALID_BIT;
+        *pfec = PFEC_reserved_bit | PFEC_page_present;
         return NULL;
     }
 
@@ -1796,21 +1797,23 @@ void *map_domain_gfn(struct p2m_domain *p2m, gfn_t gfn, 
mfn_t *mfn,
         if ( page )
             put_page(page);
         p2m_mem_paging_populate(p2m->domain, gfn_x(gfn));
-        *rc = _PAGE_PAGED;
+        *pfec = PFEC_page_paged;
         return NULL;
     }
     if ( p2m_is_shared(*p2mt) )
     {
         if ( page )
             put_page(page);
-        *rc = _PAGE_SHARED;
+        *pfec = PFEC_page_shared;
         return NULL;
     }
     if ( !page )
     {
-        *rc |= _PAGE_PRESENT;
+        *pfec = 0;
         return NULL;
     }
+
+    *pfec = PFEC_page_present;
     *mfn = page_to_mfn(page);
     ASSERT(mfn_valid(*mfn));
 
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 52cef46..4762e15 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -171,7 +171,7 @@ delete_shadow_status(struct domain *d, mfn_t gmfn, u32 
shadow_type, mfn_t smfn)
 /**************************************************************************/
 /* Functions for walking the guest page tables */
 
-static inline uint32_t
+static inline bool
 sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
                      uint32_t pfec)
 {
@@ -2858,6 +2858,7 @@ static int sh_page_fault(struct vcpu *v,
     int r;
     p2m_type_t p2mt;
     uint32_t rc, error_code;
+    bool walk_ok;
     int version;
     const struct npfec access = {
          .read_access = 1,
@@ -3075,21 +3076,20 @@ static int sh_page_fault(struct vcpu *v,
      * shadow page table. */
     version = atomic_read(&d->arch.paging.shadow.gtable_dirty_version);
     rmb();
-    rc = sh_walk_guest_tables(v, va, &gw, error_code);
+    walk_ok = sh_walk_guest_tables(v, va, &gw, error_code);
 
 #if (SHADOW_OPTIMIZATIONS & SHOPT_OUT_OF_SYNC)
     regs->error_code &= ~PFEC_page_present;
-    if ( !(rc & _PAGE_PRESENT) )
+    if ( gw.pfec & PFEC_page_present )
         regs->error_code |= PFEC_page_present;
 #endif
 
-    if ( rc != 0 )
+    if ( !walk_ok )
     {
         perfc_incr(shadow_fault_bail_real_fault);
         SHADOW_PRINTK("not a shadow fault\n");
         reset_early_unshadow(v);
-        if ( (rc & _PAGE_INVALID_BITS) )
-            regs->error_code |= PFEC_reserved_bit;
+        regs->error_code = gw.pfec & PFEC_arch_mask;
         goto propagate;
     }
 
@@ -3723,7 +3723,7 @@ sh_gva_to_gfn(struct vcpu *v, struct p2m_domain *p2m,
 {
     walk_t gw;
     gfn_t gfn;
-    uint32_t missing;
+    bool walk_ok;
 
 #if (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB)
     /* Check the vTLB cache first */
@@ -3732,18 +3732,9 @@ sh_gva_to_gfn(struct vcpu *v, struct p2m_domain *p2m,
         return vtlb_gfn;
 #endif /* (SHADOW_OPTIMIZATIONS & SHOPT_VIRTUAL_TLB) */
 
-    if ( (missing = sh_walk_guest_tables(v, va, &gw, *pfec)) != 0 )
+    if ( !(walk_ok = sh_walk_guest_tables(v, va, &gw, *pfec)) )
     {
-        if ( (missing & _PAGE_PRESENT) )
-            *pfec &= ~PFEC_page_present;
-        if ( missing & _PAGE_INVALID_BITS )
-            *pfec |= PFEC_reserved_bit;
-        /*
-         * SDM Intel 64 Volume 3, Chapter Paging, PAGE-FAULT EXCEPTIONS:
-         * The PFEC_insn_fetch flag is set only when NX or SMEP are enabled.
-         */
-        if ( is_hvm_vcpu(v) && !hvm_nx_enabled(v) && !hvm_smep_enabled(v) )
-            *pfec &= ~PFEC_insn_fetch;
+        *pfec = gw.pfec;
         return gfn_x(INVALID_GFN);
     }
     gfn = guest_walk_to_gfn(&gw);
diff --git a/xen/include/asm-x86/guest_pt.h b/xen/include/asm-x86/guest_pt.h
index 4e5018d..5fb77c6 100644
--- a/xen/include/asm-x86/guest_pt.h
+++ b/xen/include/asm-x86/guest_pt.h
@@ -253,19 +253,26 @@ static inline bool guest_supports_nx(const struct vcpu *v)
     return hvm_nx_enabled(v);
 }
 
+static inline bool guest_wp_enabled(const struct vcpu *v)
+{
+    /* PV guests can't control CR0.WP, and it is unconditionally set by Xen. */
+    return is_pv_vcpu(v) || hvm_wp_enabled(v);
+}
 
-/*
- * Some bits are invalid in any pagetable entry.
- * Normal flags values get represented in 24-bit values (see
- * get_pte_flags() and put_pte_flags()), so set bit 24 in
- * addition to be able to flag out of range frame numbers.
- */
-#if GUEST_PAGING_LEVELS == 3
-#define _PAGE_INVALID_BITS \
-    (_PAGE_INVALID_BIT | get_pte_flags(((1ull << 63) - 1) & ~(PAGE_SIZE - 1)))
-#else /* 2-level and 4-level */
-#define _PAGE_INVALID_BITS _PAGE_INVALID_BIT
-#endif
+static inline bool guest_smep_enabled(const struct vcpu *v)
+{
+    return !is_pv_vcpu(v) && hvm_smep_enabled(v);
+}
+
+static inline bool guest_smap_enabled(const struct vcpu *v)
+{
+    return !is_pv_vcpu(v) && hvm_smap_enabled(v);
+}
+
+static inline bool guest_pku_enabled(const struct vcpu *v)
+{
+    return !is_pv_vcpu(v) && hvm_pku_enabled(v);
+}
 
 /* Helpers for identifying whether guest entries have reserved bits set. */
 
@@ -331,13 +338,19 @@ struct guest_pagetable_walk
     guest_l3e_t l3e;            /* Guest's level 3 entry */
 #endif
     guest_l2e_t l2e;            /* Guest's level 2 entry */
-    guest_l1e_t l1e;            /* Guest's level 1 entry (or fabrication) */
+    union
+    {
+        guest_l1e_t l1e;        /* Guest's level 1 entry (or fabrication). */
+        uint64_t   el1e;        /* L2 PSE36 superpages wider than 32 bits. */
+    };
 #if GUEST_PAGING_LEVELS >= 4
     mfn_t l4mfn;                /* MFN that the level 4 entry was in */
     mfn_t l3mfn;                /* MFN that the level 3 entry was in */
 #endif
     mfn_t l2mfn;                /* MFN that the level 2 entry was in */
     mfn_t l1mfn;                /* MFN that the level 1 entry was in */
+
+    uint32_t pfec;              /* Accumulated PFEC_* error code from walk. */
 };
 
 /* Given a walk_t, translate the gw->va into the guest's notion of the
@@ -346,7 +359,9 @@ static inline gfn_t guest_walk_to_gfn(const walk_t *gw)
 {
     if ( !(guest_l1e_get_flags(gw->l1e) & _PAGE_PRESENT) )
         return INVALID_GFN;
-    return guest_l1e_get_gfn(gw->l1e);
+    return (GUEST_PAGING_LEVELS == 2
+            ? _gfn(gw->el1e >> PAGE_SHIFT)
+            : guest_l1e_get_gfn(gw->l1e));
 }
 
 /* Given a walk_t, translate the gw->va into the guest's notion of the
@@ -392,8 +407,8 @@ static inline unsigned int guest_walk_to_page_order(const walk_t *gw)
  * we go.  For the purposes of reading pagetables we treat all non-RAM
  * memory as contining zeroes.
  *
- * Returns 0 for success, or the set of permission bits that we failed on
- * if the walk did not complete.
+ * Returns a boolean indicating success or failure.  walk_t.pfec contains
+ * the accumulated error code on failure.
  */
 
 /* Macro-fu so you can call guest_walk_tables() and get the right one. */
@@ -401,7 +416,7 @@ static inline unsigned int guest_walk_to_page_order(const walk_t *gw)
 #define GPT_RENAME(_n, _l) GPT_RENAME2(_n, _l)
 #define guest_walk_tables GPT_RENAME(guest_walk_tables, GUEST_PAGING_LEVELS)
 
-extern uint32_t
+bool
 guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m, unsigned long va,
                   walk_t *gw, uint32_t pfec, mfn_t top_mfn, void *top_map);
 
@@ -421,8 +436,21 @@ static inline void print_gw(const walk_t *gw)
 #endif /* All levels... */
     gprintk(XENLOG_INFO, "   l2e=%" PRI_gpte " l2mfn=%" PRI_mfn "\n",
             gw->l2e.l2, mfn_x(gw->l2mfn));
+#if GUEST_PAGING_LEVELS == 2
+    gprintk(XENLOG_INFO, "  el1e=%08" PRIx64 " l1mfn=%" PRI_mfn "\n",
+            gw->el1e, mfn_x(gw->l1mfn));
+#else
     gprintk(XENLOG_INFO, "   l1e=%" PRI_gpte " l1mfn=%" PRI_mfn "\n",
             gw->l1e.l1, mfn_x(gw->l1mfn));
+#endif
+    gprintk(XENLOG_INFO, "   pfec=%02x[%c%c%c%c%c%c]\n", gw->pfec,
+            gw->pfec & PFEC_prot_key     ? 'K' : '-',
+            gw->pfec & PFEC_insn_fetch   ? 'I' : 'd',
+            gw->pfec & PFEC_reserved_bit ? 'R' : '-',
+            gw->pfec & PFEC_user_mode    ? 'U' : 's',
+            gw->pfec & PFEC_write_access ? 'W' : 'r',
+            gw->pfec & PFEC_page_present ? 'P' : '-'
+        );
 }
 
 #endif /* _XEN_ASM_GUEST_PT_H */
diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
index 470d29d..bc189d1 100644
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -679,7 +679,7 @@ int p2m_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
 extern void p2m_pt_init(struct p2m_domain *p2m);
 
 void *map_domain_gfn(struct p2m_domain *p2m, gfn_t gfn, mfn_t *mfn,
-                     p2m_type_t *p2mt, p2m_query_t q, uint32_t *rc);
+                     p2m_type_t *p2mt, p2m_query_t q, uint32_t *pfec);
 
 /* Debugging and auditing of the P2M code? */
 #ifndef NDEBUG
diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index 46faffc..bc5946b 100644
--- a/xen/include/asm-x86/page.h
+++ b/xen/include/asm-x86/page.h
@@ -314,9 +314,6 @@ void efi_update_l4_pgtable(unsigned int l4idx, l4_pgentry_t);
 #define _PAGE_PSE_PAT  _AC(0x1000,U)
 #define _PAGE_AVAIL_HIGH (_AC(0x7ff, U) << 12)
 #define _PAGE_NX       (cpu_has_nx ? _PAGE_NX_BIT : 0)
-/* non-architectural flags */
-#define _PAGE_PAGED   0x2000U
-#define _PAGE_SHARED  0x4000U
 
 /*
  * Debug option: Ensure that granted mappings are not implicitly unmapped.
diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h
index d3ba8ea..75632d9 100644
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -73,10 +73,12 @@
 #define PFEC_reserved_bit   (_AC(1,U) << 3)
 #define PFEC_insn_fetch     (_AC(1,U) << 4)
 #define PFEC_prot_key       (_AC(1,U) << 5)
+#define PFEC_arch_mask      (_AC(0xffff,U)) /* Architectural PFEC values. */
 /* Internally used only flags. */
 #define PFEC_page_paged     (1U<<16)
 #define PFEC_page_shared    (1U<<17)
+#define PFEC_implicit       (1U<<18) /* Pagewalk input for ldt/gdt/idt/tr accesses. */
+#define PFEC_synth_mask     (~PFEC_arch_mask) /* Synthetic PFEC values. */
 
 /* Other exception error code values. */
 #define X86_XEC_EXT         (_AC(1,U) << 0)
diff --git a/xen/include/asm-x86/x86_64/page.h b/xen/include/asm-x86/x86_64/page.h
index 5a613bc..1a6cae6 100644
--- a/xen/include/asm-x86/x86_64/page.h
+++ b/xen/include/asm-x86/x86_64/page.h
@@ -152,12 +152,6 @@ typedef l4_pgentry_t root_pgentry_t;
 #define _PAGE_GNTTAB (1U<<22)
 
 /*
- * Bit 24 of a 24-bit flag mask!  This is not any bit of a real pte,
- * and is only used for signalling in variables that contain flags.
- */
-#define _PAGE_INVALID_BIT (1U<<24)
-
-/*
  * Bit 12 of a 24-bit flag mask. This corresponds to bit 52 of a pte.
  * This is needed to distinguish between user and kernel PTEs since _PAGE_USER
  * is asserted for both.
-- 
2.1.4


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
https://lists.xen.org/xen-devel

Reply via email to