Gitweb:     
http://git.kernel.org/git/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=b239fb2501117bf3aeb4dd6926edd855be92333d
Commit:     b239fb2501117bf3aeb4dd6926edd855be92333d
Parent:     3dc494e86d1c93afd4c66385f270899dbfae483d
Author:     Jeremy Fitzhardinge <[EMAIL PROTECTED]>
AuthorDate: Wed May 2 19:27:13 2007 +0200
Committer:  Andi Kleen <[EMAIL PROTECTED]>
CommitDate: Wed May 2 19:27:13 2007 +0200

    [PATCH] i386: PARAVIRT: Hooks to set up initial pagetable
    
    This patch introduces paravirt_ops hooks to control how the kernel's
    initial pagetable is set up.
    
    In the case of a native boot, the very early bootstrap code creates a
    simple non-PAE pagetable to map the kernel and physical memory.  When
    the VM subsystem is initialized, it creates a proper pagetable which
    respects the PAE mode, large pages, etc.
    
    When booting under a hypervisor, there are many possibilities for what
    paging environment the hypervisor establishes for the guest kernel, so
    the constructon of the kernel's pagetable depends on the hypervisor.
    
    In the case of Xen, the hypervisor boots the kernel with a fully
    constructed pagetable, which is already using PAE if necessary.  Also,
    Xen requires particular care when constructing pagetables to make sure
    all pagetables are always mapped read-only.
    
    In order to make this easier, kernel's initial pagetable construction
    has been changed to only allocate and initialize a pagetable page if
    there's no page already present in the pagetable.  This allows the Xen
    paravirt backend to make a copy of the hypervisor-provided pagetable,
    allowing the kernel to establish any more mappings it needs while
    keeping the existing ones.
    
    A slightly subtle point which is worth highlighting here is that Xen
    requires all kernel mappings to share the same pte_t pages between all
    pagetables, so that updating a kernel page's mapping in one pagetable
    is reflected in all other pagetables.  This makes it possible to
    allocate a page and attach it to a pagetable without having to
    explicitly enumerate that page's mapping in all pagetables.
    
    And:
    
    +From: "Eric W. Biederman" <[EMAIL PROTECTED]>
    
    If we don't set the leaf page table entries it is quite possible that
    will inherit and incorrect page table entry from the initial boot
    page table setup in head.S.  So we need to redo the effort here,
    so we pick up PSE, PGE and the like.
    
    Hypervisors like Xen require that their page tables be read-only,
    which is slightly incompatible with our low identity mappings, however
    I discussed this with Jeremy he has modified the Xen early set_pte
    function to avoid problems in this area.
    
    Signed-off-by: Eric W. Biederman <[EMAIL PROTECTED]>
    Signed-off-by: Jeremy Fitzhardinge <[EMAIL PROTECTED]>
    Signed-off-by: Andi Kleen <[EMAIL PROTECTED]>
    Acked-by: William Irwin <[EMAIL PROTECTED]>
    Cc: Ingo Molnar <[EMAIL PROTECTED]>
---
 arch/i386/kernel/paravirt.c |    3 +
 arch/i386/mm/init.c         |  138 ++++++++++++++++++++++++++++---------------
 include/asm-i386/paravirt.h |   17 +++++-
 include/asm-i386/pgtable.h  |   16 +++++
 4 files changed, 126 insertions(+), 48 deletions(-)

diff --git a/arch/i386/kernel/paravirt.c b/arch/i386/kernel/paravirt.c
index cba7a15..47d075b 100644
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -193,6 +193,9 @@ struct paravirt_ops paravirt_ops = {
 #endif
        .set_lazy_mode = paravirt_nop,
 
+       .pagetable_setup_start = native_pagetable_setup_start,
+       .pagetable_setup_done = native_pagetable_setup_done,
+
        .flush_tlb_user = native_flush_tlb,
        .flush_tlb_kernel = native_flush_tlb_global,
        .flush_tlb_single = native_flush_tlb_single,
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index bd5ef37..e8545dc 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -43,6 +43,7 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
+#include <asm/paravirt.h>
 
 unsigned int __VMALLOC_RESERVE = 128 << 20;
 
@@ -62,17 +63,18 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
        pmd_t *pmd_table;
                
 #ifdef CONFIG_X86_PAE
-       pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
-       paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
-       set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
-       pud = pud_offset(pgd, 0);
-       if (pmd_table != pmd_offset(pud, 0)) 
-               BUG();
-#else
+       if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
+               pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+
+               paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT);
+               set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
+               pud = pud_offset(pgd, 0);
+               if (pmd_table != pmd_offset(pud, 0))
+                       BUG();
+       }
+#endif
        pud = pud_offset(pgd, 0);
        pmd_table = pmd_offset(pud, 0);
-#endif
-
        return pmd_table;
 }
 
@@ -82,14 +84,12 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd)
  */
 static pte_t * __init one_page_table_init(pmd_t *pmd)
 {
-       if (pmd_none(*pmd)) {
+       if (!(pmd_val(*pmd) & _PAGE_PRESENT)) {
                pte_t *page_table = (pte_t *) 
alloc_bootmem_low_pages(PAGE_SIZE);
+
                paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT);
                set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
-               if (page_table != pte_offset_kernel(pmd, 0))
-                       BUG();  
-
-               return page_table;
+               BUG_ON(page_table != pte_offset_kernel(pmd, 0));
        }
        
        return pte_offset_kernel(pmd, 0);
@@ -109,7 +109,6 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
 static void __init page_table_range_init (unsigned long start, unsigned long 
end, pgd_t *pgd_base)
 {
        pgd_t *pgd;
-       pud_t *pud;
        pmd_t *pmd;
        int pgd_idx, pmd_idx;
        unsigned long vaddr;
@@ -120,13 +119,10 @@ static void __init page_table_range_init (unsigned long 
start, unsigned long end
        pgd = pgd_base + pgd_idx;
 
        for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
-               if (pgd_none(*pgd)) 
-                       one_md_table_init(pgd);
-               pud = pud_offset(pgd, vaddr);
-               pmd = pmd_offset(pud, vaddr);
+               pmd = one_md_table_init(pgd);
+               pmd = pmd + pmd_index(vaddr);
                for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, 
pmd_idx++) {
-                       if (pmd_none(*pmd)) 
-                               one_page_table_init(pmd);
+                       one_page_table_init(pmd);
 
                        vaddr += PMD_SIZE;
                }
@@ -168,20 +164,22 @@ static void __init kernel_physical_mapping_init(pgd_t 
*pgd_base)
                        /* Map with big pages if possible, otherwise create 
normal page tables. */
                        if (cpu_has_pse) {
                                unsigned int address2 = (pfn + PTRS_PER_PTE - 
1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1;
-
                                if (is_kernel_text(address) || 
is_kernel_text(address2))
                                        set_pmd(pmd, pfn_pmd(pfn, 
PAGE_KERNEL_LARGE_EXEC));
                                else
                                        set_pmd(pmd, pfn_pmd(pfn, 
PAGE_KERNEL_LARGE));
+
                                pfn += PTRS_PER_PTE;
                        } else {
                                pte = one_page_table_init(pmd);
 
-                               for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn 
< max_low_pfn; pte++, pfn++, pte_ofs++) {
-                                               if (is_kernel_text(address))
-                                                       set_pte(pte, 
pfn_pte(pfn, PAGE_KERNEL_EXEC));
-                                               else
-                                                       set_pte(pte, 
pfn_pte(pfn, PAGE_KERNEL));
+                               for (pte_ofs = 0;
+                                    pte_ofs < PTRS_PER_PTE && pfn < 
max_low_pfn;
+                                    pte++, pfn++, pte_ofs++, address += 
PAGE_SIZE) {
+                                       if (is_kernel_text(address))
+                                               set_pte(pte, pfn_pte(pfn, 
PAGE_KERNEL_EXEC));
+                                       else
+                                               set_pte(pte, pfn_pte(pfn, 
PAGE_KERNEL));
                                }
                        }
                }
@@ -338,24 +336,78 @@ extern void __init remap_numa_kva(void);
 #define remap_numa_kva() do {} while (0)
 #endif
 
-static void __init pagetable_init (void)
+void __init native_pagetable_setup_start(pgd_t *base)
 {
-       unsigned long vaddr;
-       pgd_t *pgd_base = swapper_pg_dir;
-
 #ifdef CONFIG_X86_PAE
        int i;
-       /* Init entries of the first-level page table to the zero page */
-       for (i = 0; i < PTRS_PER_PGD; i++)
-               set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | 
_PAGE_PRESENT));
+
+       /*
+        * Init entries of the first-level page table to the
+        * zero page, if they haven't already been set up.
+        *
+        * In a normal native boot, we'll be running on a
+        * pagetable rooted in swapper_pg_dir, but not in PAE
+        * mode, so this will end up clobbering the mappings
+        * for the lower 24Mbytes of the address space,
+        * without affecting the kernel address space.
+        */
+       for (i = 0; i < USER_PTRS_PER_PGD; i++)
+               set_pgd(&base[i],
+                       __pgd(__pa(empty_zero_page) | _PAGE_PRESENT));
+
+       /* Make sure kernel address space is empty so that a pagetable
+          will be allocated for it. */
+       memset(&base[USER_PTRS_PER_PGD], 0,
+              KERNEL_PGD_PTRS * sizeof(pgd_t));
 #else
        paravirt_alloc_pd(__pa(swapper_pg_dir) >> PAGE_SHIFT);
 #endif
+}
+
+void __init native_pagetable_setup_done(pgd_t *base)
+{
+#ifdef CONFIG_X86_PAE
+       /*
+        * Add low memory identity-mappings - SMP needs it when
+        * starting up on an AP from real-mode. In the non-PAE
+        * case we already have these mappings through head.S.
+        * All user-space mappings are explicitly cleared after
+        * SMP startup.
+        */
+       set_pgd(&base[0], base[USER_PTRS_PER_PGD]);
+#endif
+}
+
+/*
+ * Build a proper pagetable for the kernel mappings.  Up until this
+ * point, we've been running on some set of pagetables constructed by
+ * the boot process.
+ *
+ * If we're booting on native hardware, this will be a pagetable
+ * constructed in arch/i386/kernel/head.S, and not running in PAE mode
+ * (even if we'll end up running in PAE).  The root of the pagetable
+ * will be swapper_pg_dir.
+ *
+ * If we're booting paravirtualized under a hypervisor, then there are
+ * more options: we may already be running PAE, and the pagetable may
+ * or may not be based in swapper_pg_dir.  In any case,
+ * paravirt_pagetable_setup_start() will set up swapper_pg_dir
+ * appropriately for the rest of the initialization to work.
+ *
+ * In general, pagetable_init() assumes that the pagetable may already
+ * be partially populated, and so it avoids stomping on any existing
+ * mappings.
+ */
+static void __init pagetable_init (void)
+{
+       unsigned long vaddr, end;
+       pgd_t *pgd_base = swapper_pg_dir;
+
+       paravirt_pagetable_setup_start(pgd_base);
 
        /* Enable PSE if available */
-       if (cpu_has_pse) {
+       if (cpu_has_pse)
                set_in_cr4(X86_CR4_PSE);
-       }
 
        /* Enable PGE if available */
        if (cpu_has_pge) {
@@ -372,20 +424,12 @@ static void __init pagetable_init (void)
         * created - mappings will be set by set_fixmap():
         */
        vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK;
-       page_table_range_init(vaddr, 0, pgd_base);
+       end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK;
+       page_table_range_init(vaddr, end, pgd_base);
 
        permanent_kmaps_init(pgd_base);
 
-#ifdef CONFIG_X86_PAE
-       /*
-        * Add low memory identity-mappings - SMP needs it when
-        * starting up on an AP from real-mode. In the non-PAE
-        * case we already have these mappings through head.S.
-        * All user-space mappings are explicitly cleared after
-        * SMP startup.
-        */
-       set_pgd(&pgd_base[0], pgd_base[USER_PTRS_PER_PGD]);
-#endif
+       paravirt_pagetable_setup_done(pgd_base);
 }
 
 #if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP)
diff --git a/include/asm-i386/paravirt.h b/include/asm-i386/paravirt.h
index 0aacb13..c49b44c 100644
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -2,10 +2,11 @@
 #define __ASM_PARAVIRT_H
 /* Various instructions on x86 need to be replaced for
  * para-virtualization: those hooks are defined here. */
+
+#ifdef CONFIG_PARAVIRT
 #include <linux/stringify.h>
 #include <asm/page.h>
 
-#ifdef CONFIG_PARAVIRT
 /* These are the most performance critical ops, so we want to be able to patch
  * callers */
 #define PARAVIRT_IRQ_DISABLE 0
@@ -50,6 +51,9 @@ struct paravirt_ops
        char *(*memory_setup)(void);
        void (*init_IRQ)(void);
 
+       void (*pagetable_setup_start)(pgd_t *pgd_base);
+       void (*pagetable_setup_done)(pgd_t *pgd_base);
+
        void (*banner)(void);
 
        unsigned long (*get_wallclock)(void);
@@ -370,6 +374,17 @@ static inline void setup_secondary_clock(void)
 }
 #endif
 
+static inline void paravirt_pagetable_setup_start(pgd_t *base)
+{
+       if (paravirt_ops.pagetable_setup_start)
+               (*paravirt_ops.pagetable_setup_start)(base);
+}
+
+static inline void paravirt_pagetable_setup_done(pgd_t *base)
+{
+       if (paravirt_ops.pagetable_setup_done)
+               (*paravirt_ops.pagetable_setup_done)(base);
+}
 
 #ifdef CONFIG_SMP
 static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index 147f255..0790ad6 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -514,6 +514,22 @@ do {                                                       
                \
  * tables contain all the necessary information.
  */
 #define update_mmu_cache(vma,address,pte) do { } while (0)
+
+void native_pagetable_setup_start(pgd_t *base);
+void native_pagetable_setup_done(pgd_t *base);
+
+#ifndef CONFIG_PARAVIRT
+static inline void paravirt_pagetable_setup_start(pgd_t *base)
+{
+       native_pagetable_setup_start(base);
+}
+
+static inline void paravirt_pagetable_setup_done(pgd_t *base)
+{
+       native_pagetable_setup_done(base);
+}
+#endif /* !CONFIG_PARAVIRT */
+
 #endif /* !__ASSEMBLY__ */
 
 #ifdef CONFIG_FLATMEM
-
To unsubscribe from this list: send the line "unsubscribe git-commits-head" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to