Re: [KERNEL PATCH v9 3/3] xen/privcmd: Add new syscall to get gsi from dev
On 23.09.24 09:58, Juergen Gross wrote: On 23.09.24 07:49, Chen, Jiqian wrote: On 2024/9/21 05:17, Stefano Stabellini wrote: On Fri, 20 Sep 2024, Chen, Jiqian wrote: On 2024/9/19 06:49, Stefano Stabellini wrote: On Thu, 12 Sep 2024, Jiqian Chen wrote: On PVH dom0, when passthrough a device to domU, QEMU and xl tools want to use gsi number to do pirq mapping, see QEMU code xen_pt_realize->xc_physdev_map_pirq, and xl code pci_add_dm_done->xc_physdev_map_pirq, but in current codes, the gsi number is got from file /sys/bus/pci/devices//irq, that is wrong, because irq is not equal with gsi, they are in different spaces, so pirq mapping fails. And in current linux codes, there is no method to get gsi for userspace. For above purpose, record gsi of pcistub devices when init pcistub and add a new syscall into privcmd to let userspace can get gsi when they have a need. Signed-off-by: Jiqian Chen Signed-off-by: Huang Rui Signed-off-by: Jiqian Chen --- v8->v9 changes: Changed the syscall name from "IOCTL_PRIVCMD_GSI_FROM_DEV" to "IOCTL_PRIVCMD_PCIDEV_GET_GSI". Also changed the other functions name. Changed the macro wrapping "pcistub_get_gsi_from_sbdf" from "CONFIG_XEN_ACPI" to "CONFIG_XEN_PCIDEV_BACKEND" to fix compile errors reported by CI robot. Changed the parameter gsi of struct privcmd_pcidev_get_gsi from int to u32. v7->v8 changes: In function privcmd_ioctl_gsi_from_dev, return -EINVAL when not confige CONFIG_XEN_ACPI. Used PCI_BUS_NUM PCI_SLOT PCI_FUNC instead of open coding. v6->v7 changes: Changed implementation to add a new parameter "gsi" to struct pcistub_device and set gsi when pcistub initialize device. Then when userspace wants to get gsi and pass sbdf, we can return that gsi. v5->v6 changes: Changed implementation to add a new syscall to translate irq to gsi, instead adding a new gsi sysfs node, because the pci Maintainer didn't allow to add that sysfs node. v3->v5 changes: No. 
v2->v3 changes: Suggested by Roger: Abandoned previous implementations that added new syscall to get gsi from irq and changed to add a new sysfs node for gsi, then userspace can get gsi number from sysfs node. --- | Reported-by: kernel test robot | Closes: https://lore.kernel.org/oe-kbuild-all/202406090826.whl6cb7r-...@intel.com/ --- | Reported-by: kernel test robot | Closes: https://lore.kernel.org/oe-kbuild-all/202405171113.t431pc8o-...@intel.com/ --- drivers/xen/privcmd.c | 30 +++ drivers/xen/xen-pciback/pci_stub.c | 38 +++--- include/uapi/xen/privcmd.h | 7 ++ include/xen/acpi.h | 9 +++ 4 files changed, 81 insertions(+), 3 deletions(-) diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 9563650dfbaf..1ed612d21543 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -46,6 +46,9 @@ #include #include #include +#ifdef CONFIG_XEN_ACPI +#include +#endif #include "privcmd.h" @@ -844,6 +847,29 @@ static long privcmd_ioctl_mmap_resource(struct file *file, return rc; } +static long privcmd_ioctl_pcidev_get_gsi(struct file *file, void __user *udata) +{ +#ifdef CONFIG_XEN_ACPI + int rc; + struct privcmd_pcidev_get_gsi kdata; + + if (copy_from_user(&kdata, udata, sizeof(kdata))) + return -EFAULT; + + rc = pcistub_get_gsi_from_sbdf(kdata.sbdf); + if (rc < 0) + return rc; + + kdata.gsi = rc; + if (copy_to_user(udata, &kdata, sizeof(kdata))) + return -EFAULT; + + return 0; +#else + return -EINVAL; +#endif +} + #ifdef CONFIG_XEN_PRIVCMD_EVENTFD /* Irqfd support */ static struct workqueue_struct *irqfd_cleanup_wq; @@ -1543,6 +1569,10 @@ static long privcmd_ioctl(struct file *file, ret = privcmd_ioctl_ioeventfd(file, udata); break; + case IOCTL_PRIVCMD_PCIDEV_GET_GSI: + ret = privcmd_ioctl_pcidev_get_gsi(file, udata); + break; + default: break; } diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c index 8ce27333f54b..2ea8e4075adc 100644 --- a/drivers/xen/xen-pciback/pci_stub.c +++ b/drivers/xen/xen-pciback/pci_stub.c @@ 
-56,6 +56,9 @@ struct pcistub_device { struct pci_dev *dev; struct xen_pcibk_device *pdev;/* non-NULL if struct pci_dev is in use */ +#ifdef CONFIG_XEN_ACPI + int gsi; +#endif }; /* Access to pcistub_devices & seized_devices lists and the initialize_devices @@ -88,6 +91,9 @@ static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev) kref_init(&psdev->kref); spin_lock_init(&psdev->lock); +#ifdef CONFIG_XEN_ACPI + psdev->gsi = -1; +#endif return psdev; } @@ -220,6 +226,25 @@ static struct pci_dev *pcistub_device_get_pci_dev(struct xen_pcibk_device *pdev, return pci_dev; } +#ifdef CONFIG_XEN_PCIDE
Re: [KERNEL PATCH v9 3/3] xen/privcmd: Add new syscall to get gsi from dev
On 23.09.24 07:49, Chen, Jiqian wrote: On 2024/9/21 05:17, Stefano Stabellini wrote: On Fri, 20 Sep 2024, Chen, Jiqian wrote: On 2024/9/19 06:49, Stefano Stabellini wrote: On Thu, 12 Sep 2024, Jiqian Chen wrote: On PVH dom0, when passthrough a device to domU, QEMU and xl tools want to use gsi number to do pirq mapping, see QEMU code xen_pt_realize->xc_physdev_map_pirq, and xl code pci_add_dm_done->xc_physdev_map_pirq, but in current codes, the gsi number is got from file /sys/bus/pci/devices//irq, that is wrong, because irq is not equal with gsi, they are in different spaces, so pirq mapping fails. And in current linux codes, there is no method to get gsi for userspace. For above purpose, record gsi of pcistub devices when init pcistub and add a new syscall into privcmd to let userspace can get gsi when they have a need. Signed-off-by: Jiqian Chen Signed-off-by: Huang Rui Signed-off-by: Jiqian Chen --- v8->v9 changes: Changed the syscall name from "IOCTL_PRIVCMD_GSI_FROM_DEV" to "IOCTL_PRIVCMD_PCIDEV_GET_GSI". Also changed the other functions name. Changed the macro wrapping "pcistub_get_gsi_from_sbdf" from "CONFIG_XEN_ACPI" to "CONFIG_XEN_PCIDEV_BACKEND" to fix compile errors reported by CI robot. Changed the parameter gsi of struct privcmd_pcidev_get_gsi from int to u32. v7->v8 changes: In function privcmd_ioctl_gsi_from_dev, return -EINVAL when not confige CONFIG_XEN_ACPI. Used PCI_BUS_NUM PCI_SLOT PCI_FUNC instead of open coding. v6->v7 changes: Changed implementation to add a new parameter "gsi" to struct pcistub_device and set gsi when pcistub initialize device. Then when userspace wants to get gsi and pass sbdf, we can return that gsi. v5->v6 changes: Changed implementation to add a new syscall to translate irq to gsi, instead adding a new gsi sysfs node, because the pci Maintainer didn't allow to add that sysfs node. v3->v5 changes: No. 
v2->v3 changes: Suggested by Roger: Abandoned previous implementations that added new syscall to get gsi from irq and changed to add a new sysfs node for gsi, then userspace can get gsi number from sysfs node. --- | Reported-by: kernel test robot | Closes: https://lore.kernel.org/oe-kbuild-all/202406090826.whl6cb7r-...@intel.com/ --- | Reported-by: kernel test robot | Closes: https://lore.kernel.org/oe-kbuild-all/202405171113.t431pc8o-...@intel.com/ --- drivers/xen/privcmd.c | 30 +++ drivers/xen/xen-pciback/pci_stub.c | 38 +++--- include/uapi/xen/privcmd.h | 7 ++ include/xen/acpi.h | 9 +++ 4 files changed, 81 insertions(+), 3 deletions(-) diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 9563650dfbaf..1ed612d21543 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -46,6 +46,9 @@ #include #include #include +#ifdef CONFIG_XEN_ACPI +#include +#endif #include "privcmd.h" @@ -844,6 +847,29 @@ static long privcmd_ioctl_mmap_resource(struct file *file, return rc; } +static long privcmd_ioctl_pcidev_get_gsi(struct file *file, void __user *udata) +{ +#ifdef CONFIG_XEN_ACPI + int rc; + struct privcmd_pcidev_get_gsi kdata; + + if (copy_from_user(&kdata, udata, sizeof(kdata))) + return -EFAULT; + + rc = pcistub_get_gsi_from_sbdf(kdata.sbdf); + if (rc < 0) + return rc; + + kdata.gsi = rc; + if (copy_to_user(udata, &kdata, sizeof(kdata))) + return -EFAULT; + + return 0; +#else + return -EINVAL; +#endif +} + #ifdef CONFIG_XEN_PRIVCMD_EVENTFD /* Irqfd support */ static struct workqueue_struct *irqfd_cleanup_wq; @@ -1543,6 +1569,10 @@ static long privcmd_ioctl(struct file *file, ret = privcmd_ioctl_ioeventfd(file, udata); break; + case IOCTL_PRIVCMD_PCIDEV_GET_GSI: + ret = privcmd_ioctl_pcidev_get_gsi(file, udata); + break; + default: break; } diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c index 8ce27333f54b..2ea8e4075adc 100644 --- a/drivers/xen/xen-pciback/pci_stub.c +++ b/drivers/xen/xen-pciback/pci_stub.c @@ 
-56,6 +56,9 @@ struct pcistub_device { struct pci_dev *dev; struct xen_pcibk_device *pdev;/* non-NULL if struct pci_dev is in use */ +#ifdef CONFIG_XEN_ACPI + int gsi; +#endif }; /* Access to pcistub_devices & seized_devices lists and the initialize_devices @@ -88,6 +91,9 @@ static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev) kref_init(&psdev->kref); spin_lock_init(&psdev->lock); +#ifdef CONFIG_XEN_ACPI + psdev->gsi = -1; +#endif return psdev; } @@ -220,6 +226,25 @@ static struct pci_dev *pcistub_device_get_pci_dev(struct xen_pcibk_device *pdev, return pci_dev; } +#ifdef CONFIG_XEN_PCIDEV_BACKEND This breaks configurations without CONFIG_ACPI and with CONFI
[GIT PULL] xen: branch for v6.12-rc1
Linus, Please git pull the following tag: git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git for-linus-6.12-rc1-tag xen: branch for v6.12-rc1 It contains the following patches: - A 7 patch series fixing a boot problem as a Xen dom0 on some AMD systems - A 3 patch series fixing Xen PVH boot problems with KASAN enabled - A fix for a build warning - 2 fixes in swiotlb-xen Thanks. Juergen arch/x86/include/asm/acpi.h| 8 ++ arch/x86/include/asm/cpuid.h | 7 +- arch/x86/kernel/acpi/boot.c| 11 ++ arch/x86/kernel/jailhouse.c| 1 + arch/x86/kernel/mmconf-fam10h_64.c | 1 + arch/x86/kernel/smpboot.c | 1 + arch/x86/kernel/x86_init.c | 1 + arch/x86/platform/pvh/Makefile | 1 + arch/x86/platform/pvh/enlighten.c | 6 +- arch/x86/xen/mmu_pv.c | 5 +- arch/x86/xen/p2m.c | 98 ++ arch/x86/xen/setup.c | 202 - arch/x86/xen/xen-ops.h | 6 +- drivers/xen/pci.c | 14 +-- drivers/xen/swiotlb-xen.c | 10 +- drivers/xen/xenbus/xenbus_xs.c | 6 +- 16 files changed, 312 insertions(+), 66 deletions(-) Alexey Dobriyan (3): xen, pvh: fix unbootable VMs (PVH + KASAN - AMD_MEM_ENCRYPT) x86/cpu: fix unbootable VMs by inlining memcmp() in hypervisor_cpuid_base() xen, pvh: fix unbootable VMs by inlining memset() in xen_prepare_pvh() Gustavo A. R. Silva (1): xen/pci: Avoid -Wflex-array-member-not-at-end warning Juergen Gross (9): xen: use correct end address of kernel for conflict checking xen: introduce generic helper checking for memory map conflicts xen: move checks for e820 conflicts further up xen: move max_pfn in xen_memory_setup() out of function scope xen: add capability to remap non-RAM pages to different PFNs xen: allow mapping ACPI data using a different physical address xen: tolerate ACPI NVS memory overlapping with Xen allocated memory xen/swiotlb: add alignment check for dma buffers xen/swiotlb: fix allocated size Shen Lichuan (1): xen/xenbus: Convert to use ERR_CAST()
Re: [PATCH v3 0/5] x86/pvh: Make 64bit PVH entry relocatable
x86 maintainers, are you going to pick this series up, or should I take it via the Xen tree? Juergen On 23.08.24 21:36, Jason Andryuk wrote: Using the PVH entry point, the uncompressed vmlinux is loaded at LOAD_PHYSICAL_ADDR, and execution starts in 32bit mode at the address in XEN_ELFNOTE_PHYS32_ENTRY, pvh_start_xen, with paging disabled. Loading at LOAD_PHYSICAL_ADDR has not been a problem in the past as virtual machines don't have conflicting memory maps. But Xen now supports a PVH dom0, which uses the host memory map, and there are Coreboot/EDK2 firmwares that have reserved regions conflicting with LOAD_PHYSICAL_ADDR. Xen recently added XEN_ELFNOTE_PHYS32_RELOC to specify an alignment, minimum and maximum load address when LOAD_PHYSICAL_ADDR cannot be used. This patch series makes the PVH entry path PIC to support relocation. Only x86-64 is converted. The 32bit entry path calling into vmlinux, which is not PIC, will not support relocation. The entry path needs pages tables to switch to 64bit mode. A new pvh_init_top_pgt is added to make the transition into the startup_64 when the regular init_top_pgt pagetables are setup. This duplication is unfortunate, but it keeps the changes simpler. __startup_64() can't be used to setup init_top_pgt for PVH entry because it is 64bit code - the 32bit entry code doesn't have page tables to use. This is the straight forward implementation to make it work. Other approaches could be pursued. checkpatch.pl gives an error: "ERROR: Macros with multiple statements should be enclosed in a do - while loop" about the moved PMDS macro. But PMDS is an assembler macro, so its not applicable. There are some false positive warnings "WARNING: space prohibited between function name and open parenthesis '('" about the macro, too. v2 addresses review feedback. It also replace LOAD_PHYSICAL_ADDR with _pa(pvh_start_xen) in some offset calculations. They happened to be equal in my original builds. 
When the two values differ, _pa(pvh_start_xen) is the correct one to use. v3: Fix build error for 32bit. Add Juergen's R-b to patch 4. Jason Andryuk (5): xen: sync elfnote.h from xen tree x86/pvh: Make PVH entrypoint PIC for x86-64 x86/pvh: Set phys_base when calling xen_prepare_pvh() x86/kernel: Move page table macros to header x86/pvh: Add 64bit relocation page tables arch/x86/include/asm/pgtable_64.h | 23 - arch/x86/kernel/head_64.S | 20 arch/x86/platform/pvh/head.S | 161 +++--- include/xen/interface/elfnote.h | 93 - 4 files changed, 259 insertions(+), 38 deletions(-) base-commit: 7c626ce4bae1ac14f60076d00eafe71af30450ba OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
Re: [PATCH v2 1/2] xen/swiotlb: add alignment check for dma buffers
On 16.09.24 08:56, Juergen Gross wrote: On 16.09.24 08:50, Jan Beulich wrote: On 16.09.2024 08:47, Juergen Gross wrote: --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -78,9 +78,15 @@ static inline int range_straddles_page_boundary(phys_addr_t p, size_t size) { unsigned long next_bfn, xen_pfn = XEN_PFN_DOWN(p); unsigned int i, nr_pages = XEN_PFN_UP(xen_offset_in_page(p) + size); + phys_addr_t algn = 1ULL << (get_order(size) + PAGE_SHIFT); next_bfn = pfn_to_bfn(xen_pfn); + /* If buffer is physically aligned, ensure DMA alignment. */ + if (IS_ALIGNED(p, algn) && + !IS_ALIGNED(next_bfn << XEN_PAGE_SHIFT, algn)) And this shift is not at risk of losing bits on Arm LPAE? For alignment check this just doesn't matter (assuming XEN_PAGE_SIZE is smaller than 4G). Wait, that was nonsense. I'll change the check to: !IS_ALIGNED((phys_addr_t)next_bfn << XEN_PAGE_SHIFT, algn) Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
Re: [PATCH v2 1/2] xen/swiotlb: add alignment check for dma buffers
On 16.09.24 08:50, Jan Beulich wrote: On 16.09.2024 08:47, Juergen Gross wrote: --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -78,9 +78,15 @@ static inline int range_straddles_page_boundary(phys_addr_t p, size_t size) { unsigned long next_bfn, xen_pfn = XEN_PFN_DOWN(p); unsigned int i, nr_pages = XEN_PFN_UP(xen_offset_in_page(p) + size); + phys_addr_t algn = 1ULL << (get_order(size) + PAGE_SHIFT); next_bfn = pfn_to_bfn(xen_pfn); + /* If buffer is physically aligned, ensure DMA alignment. */ + if (IS_ALIGNED(p, algn) && + !IS_ALIGNED(next_bfn << XEN_PAGE_SHIFT, algn)) And this shift is not at risk of losing bits on Arm LPAE? For alignment check this just doesn't matter (assuming XEN_PAGE_SIZE is smaller than 4G). Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
[PATCH v2 0/2] xen/swiotlb: fix 2 issues in xen-swiotlb
Fix 2 issues in xen-swiotlb: - buffers need to be aligned properly - wrong buffer size if XEN_PAGE_SIZE < PAGE_SIZE Changes in V2: - added patch 2 Juergen Gross (2): xen/swiotlb: add alignment check for dma buffers xen/swiotlb: fix allocated size drivers/xen/swiotlb-xen.c | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) -- 2.43.0
[PATCH v2 2/2] xen/swiotlb: fix allocated size
The allocated size in xen_swiotlb_alloc_coherent() and xen_swiotlb_free_coherent() is calculated wrong for the case of XEN_PAGE_SIZE not matching PAGE_SIZE. Fix that. Fixes: 7250f422da04 ("xen-swiotlb: use actually allocated size on check physical continuous") Reported-by: Jan Beulich Signed-off-by: Juergen Gross --- V2: - new patch --- drivers/xen/swiotlb-xen.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index ddf5b1df632e..faa2fb7c74ae 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -147,7 +147,7 @@ xen_swiotlb_alloc_coherent(struct device *dev, size_t size, void *ret; /* Align the allocation to the Xen page size */ - size = 1UL << (order + XEN_PAGE_SHIFT); + size = ALIGN(size, XEN_PAGE_SIZE); ret = (void *)__get_free_pages(flags, get_order(size)); if (!ret) @@ -179,7 +179,7 @@ xen_swiotlb_free_coherent(struct device *dev, size_t size, void *vaddr, int order = get_order(size); /* Convert the size to actually allocated. */ - size = 1UL << (order + XEN_PAGE_SHIFT); + size = ALIGN(size, XEN_PAGE_SIZE); if (WARN_ON_ONCE(dma_handle + size - 1 > dev->coherent_dma_mask) || WARN_ON_ONCE(range_straddles_page_boundary(phys, size))) -- 2.43.0
[PATCH v2 1/2] xen/swiotlb: add alignment check for dma buffers
When checking a memory buffer to be consecutive in machine memory, the alignment needs to be checked, too. Failing to do so might result in DMA memory not being aligned according to its requested size, leading to error messages like: 4xxx :2b:00.0: enabling device (0140 -> 0142) 4xxx :2b:00.0: Ring address not aligned 4xxx :2b:00.0: Failed to initialise service qat_crypto 4xxx :2b:00.0: Resetting device qat_dev0 4xxx: probe of :2b:00.0 failed with error -14 Fixes: 9435cce87950 ("xen/swiotlb: Add support for 64KB page granularity") Signed-off-by: Juergen Gross --- V2: - use 1ULL for creating align mask in order to cover ARM32 LPAE - fix case of XEN_PAGE_SIZE != PAGE_SIZE (Jan Beulich) --- drivers/xen/swiotlb-xen.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 35155258a7e2..ddf5b1df632e 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -78,9 +78,15 @@ static inline int range_straddles_page_boundary(phys_addr_t p, size_t size) { unsigned long next_bfn, xen_pfn = XEN_PFN_DOWN(p); unsigned int i, nr_pages = XEN_PFN_UP(xen_offset_in_page(p) + size); + phys_addr_t algn = 1ULL << (get_order(size) + PAGE_SHIFT); next_bfn = pfn_to_bfn(xen_pfn); + /* If buffer is physically aligned, ensure DMA alignment. */ + if (IS_ALIGNED(p, algn) && + !IS_ALIGNED(next_bfn << XEN_PAGE_SHIFT, algn)) + return 1; + for (i = 1; i < nr_pages; i++) if (pfn_to_bfn(++xen_pfn) != ++next_bfn) return 1; -- 2.43.0
Re: [PATCH v3 5/5] x86/pvh: Add 64bit relocation page tables
On 23.08.24 21:36, Jason Andryuk wrote: The PVH entry point is 32bit. For a 64bit kernel, the entry point must switch to 64bit mode, which requires a set of page tables. In the past, PVH used init_top_pgt. This works fine when the kernel is loaded at LOAD_PHYSICAL_ADDR, as the page tables are prebuilt for this address. If the kernel is loaded at a different address, they need to be adjusted. __startup_64() adjusts the prebuilt page tables for the physical load address, but it is 64bit code. The 32bit PVH entry code can't call it to adjust the page tables, so it can't readily be re-used. 64bit PVH entry needs page tables set up for identity map, the kernel high map and the direct map. pvh_start_xen() enters identity mapped. Inside xen_prepare_pvh(), it jumps through a pv_ops function pointer into the highmap. The direct map is used for __va() on the initramfs and other guest physical addresses. Add a dedicated set of prebuild page tables for PVH entry. They are adjusted in assembly before loading. Add XEN_ELFNOTE_PHYS32_RELOC to indicate support for relocation along with the kernel's loading constraints. The maximum load address, KERNEL_IMAGE_SIZE - 1, is determined by a single pvh_level2_ident_pgt page. It could be larger with more pages. Signed-off-by: Jason Andryuk Reviewed-by: Juergen Gross Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
Re: [PATCH v3 2/5] x86/pvh: Make PVH entrypoint PIC for x86-64
On 23.08.24 21:36, Jason Andryuk wrote: The PVH entrypoint is 32bit non-PIC code running the uncompressed vmlinux at its load address CONFIG_PHYSICAL_START - default 0x100 (16MB). The kernel is loaded at that physical address inside the VM by the VMM software (Xen/QEMU). When running a Xen PVH Dom0, the host reserved addresses are mapped 1-1 into the PVH container. There exist system firmwares (Coreboot/EDK2) with reserved memory at 16MB. This creates a conflict where the PVH kernel cannot be loaded at that address. Modify the PVH entrypoint to be position-indepedent to allow flexibility in load address. Only the 64bit entry path is converted. A 32bit kernel is not PIC, so calling into other parts of the kernel, like xen_prepare_pvh() and mk_pgtable_32(), don't work properly when relocated. This makes the code PIC, but the page tables need to be updated as well to handle running from the kernel high map. The UNWIND_HINT_END_OF_STACK is to silence: vmlinux.o: warning: objtool: pvh_start_xen+0x7f: unreachable instruction after the lret into 64bit code. Signed-off-by: Jason Andryuk Reviewed-by: Juergen Gross Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
[PATCH] xen/swiotlb: add alignment check for dma buffers
When checking a memory buffer to be consecutive in machine memory, the alignment needs to be checked, too. Failing to do so might result in DMA memory not being aligned according to its requested size, leading to error messages like: 4xxx :2b:00.0: enabling device (0140 -> 0142) 4xxx :2b:00.0: Ring address not aligned 4xxx :2b:00.0: Failed to initialise service qat_crypto 4xxx :2b:00.0: Resetting device qat_dev0 4xxx: probe of :2b:00.0 failed with error -14 Signed-off-by: Juergen Gross --- drivers/xen/swiotlb-xen.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 35155258a7e2..11f4b1195324 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -78,9 +78,15 @@ static inline int range_straddles_page_boundary(phys_addr_t p, size_t size) { unsigned long next_bfn, xen_pfn = XEN_PFN_DOWN(p); unsigned int i, nr_pages = XEN_PFN_UP(xen_offset_in_page(p) + size); + unsigned int order = get_order(size); next_bfn = pfn_to_bfn(xen_pfn); + /* If buffer is physically aligned, ensure DMA alignment. */ + if (IS_ALIGNED(p, 1UL << (order + PAGE_SHIFT)) && + !IS_ALIGNED(next_bfn, 1UL << order)) + return 1; + for (i = 1; i < nr_pages; i++) if (pfn_to_bfn(++xen_pfn) != ++next_bfn) return 1; -- 2.43.0
[PATCH v3 6/7] xen: allow mapping ACPI data using a different physical address
When running as a Xen PV dom0 the system needs to map ACPI data of the host using host physical addresses, while those addresses can conflict with the guest physical addresses of the loaded linux kernel. The same problem might apply in case a PV guest is configured to use the host memory map. This conflict can be solved by mapping the ACPI data to a different guest physical address, but mapping the data via acpi_os_ioremap() must still be possible using the host physical address, as this address might be generated by AML when referencing some of the ACPI data. When configured to support running as a Xen PV domain, have an implementation of acpi_os_ioremap() being aware of the possibility to need above mentioned translation of a host physical address to the guest physical address. This modification requires to fix some #include of asm/acpi.h in x86 code to use linux/acpi.h instead. Signed-off-by: Juergen Gross --- V2: - new patch (Jan Beulich) V3: - add const attribute (Jan Beulich) - guard ACPI related code with CONFIG_ACPI (Jan Beulich) - use CONFIG_XEN_PV instead of CONFIG_XEN_PV_DOM0 --- arch/x86/include/asm/acpi.h| 8 +++ arch/x86/kernel/acpi/boot.c| 10 + arch/x86/kernel/mmconf-fam10h_64.c | 2 +- arch/x86/kernel/x86_init.c | 2 +- arch/x86/xen/p2m.c | 34 ++ arch/x86/xen/setup.c | 2 +- 6 files changed, 55 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 21bc53f5ed0c..5ab1a4598d00 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -174,6 +174,14 @@ void acpi_generic_reduced_hw_init(void); void x86_default_set_root_pointer(u64 addr); u64 x86_default_get_root_pointer(void); +#ifdef CONFIG_XEN_PV +/* A Xen PV domain needs a special acpi_os_ioremap() handling. 
*/ +extern void __iomem * (*acpi_os_ioremap)(acpi_physical_address phys, +acpi_size size); +void __iomem *x86_acpi_os_ioremap(acpi_physical_address phys, acpi_size size); +#define acpi_os_ioremap acpi_os_ioremap +#endif + #else /* !CONFIG_ACPI */ #define acpi_lapic 0 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 9f4618dcd704..2de8510c56dd 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1778,3 +1778,13 @@ u64 x86_default_get_root_pointer(void) { return boot_params.acpi_rsdp_addr; } + +#ifdef CONFIG_XEN_PV +void __iomem *x86_acpi_os_ioremap(acpi_physical_address phys, acpi_size size) +{ + return ioremap_cache(phys, size); +} + +void __iomem * (*acpi_os_ioremap)(acpi_physical_address phys, acpi_size size) = + x86_acpi_os_ioremap; +#endif diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index c94dec6a1834..8347a29f9db4 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -9,12 +9,12 @@ #include #include #include +#include #include #include #include #include -#include #include #include diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 82b128d3f309..47ef8af23101 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -8,8 +8,8 @@ #include #include #include +#include -#include #include #include #include diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 5b2aeae6f9e4..a64e9562733e 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include @@ -836,6 +837,33 @@ void __init xen_do_remap_nonram(void) pr_info("Remapped %u non-RAM page(s)\n", remapped); } +#ifdef CONFIG_ACPI +/* + * Xen variant of acpi_os_ioremap() taking potentially remapped non-RAM + * regions into acount. + * Any attempt to map an area crossing a remap boundary will produce a + * WARN() splat. 
+ */ +static void __iomem *xen_acpi_os_ioremap(acpi_physical_address phys, +acpi_size size) +{ + unsigned int i; + const struct nonram_remap *remap = xen_nonram_remap; + + for (i = 0; i < nr_nonram_remap; i++) { + if (phys + size > remap->maddr && + phys < remap->maddr + remap->size) { + WARN_ON(phys < remap->maddr || + phys + size > remap->maddr + remap->size); + phys = remap->paddr + phys - remap->maddr; + break; + } + } + + return x86_acpi_os_ioremap(phys, size); +} +#endif /* CONFIG_ACPI */ + /* * Add a new non-RAM remap entry. * In case of no free entry found, just crash the system. @@ -850,6 +878,12 @@ void __init xen_add_remap_nonram(phys_addr_t maddr, phys_addr_t paddr, BUG(); } +#ifdef CONFIG_ACPI + /* S
[PATCH v3 4/7] xen: move max_pfn in xen_memory_setup() out of function scope
Instead of having max_pfn as a local variable of xen_memory_setup(), make it a static variable in setup.c instead. This avoids having to pass it to subfunctions, which will be needed in more cases in future. Rename it to ini_nr_pages, as the value denotes the currently usable number of memory pages as passed from the hypervisor at boot time. Signed-off-by: Juergen Gross Tested-by: Marek Marczykowski-Górecki Reviewed-by: Jan Beulich --- arch/x86/xen/setup.c | 52 ++-- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index dba68951ed6b..2c79bb5a9cd0 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -46,6 +46,9 @@ bool xen_pv_pci_possible; /* E820 map used during setting up memory. */ static struct e820_table xen_e820_table __initdata; +/* Number of initially usable memory pages. */ +static unsigned long ini_nr_pages __initdata; + /* * Buffer used to remap identity mapped pages. We only need the virtual space. * The physical page behind this address is remapped as needed to different @@ -212,7 +215,7 @@ static int __init xen_free_mfn(unsigned long mfn) * as a fallback if the remapping fails. */ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, - unsigned long end_pfn, unsigned long nr_pages) + unsigned long end_pfn) { unsigned long pfn, end; int ret; @@ -220,7 +223,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, WARN_ON(start_pfn > end_pfn); /* Release pages first. */ - end = min(end_pfn, nr_pages); + end = min(end_pfn, ini_nr_pages); for (pfn = start_pfn; pfn < end; pfn++) { unsigned long mfn = pfn_to_mfn(pfn); @@ -341,15 +344,14 @@ static void __init xen_do_set_identity_and_remap_chunk( * to Xen and not remapped. 
*/ static unsigned long __init xen_set_identity_and_remap_chunk( - unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, - unsigned long remap_pfn) + unsigned long start_pfn, unsigned long end_pfn, unsigned long remap_pfn) { unsigned long pfn; unsigned long i = 0; unsigned long n = end_pfn - start_pfn; if (remap_pfn == 0) - remap_pfn = nr_pages; + remap_pfn = ini_nr_pages; while (i < n) { unsigned long cur_pfn = start_pfn + i; @@ -358,19 +360,19 @@ static unsigned long __init xen_set_identity_and_remap_chunk( unsigned long remap_range_size; /* Do not remap pages beyond the current allocation */ - if (cur_pfn >= nr_pages) { + if (cur_pfn >= ini_nr_pages) { /* Identity map remaining pages */ set_phys_range_identity(cur_pfn, cur_pfn + size); break; } - if (cur_pfn + size > nr_pages) - size = nr_pages - cur_pfn; + if (cur_pfn + size > ini_nr_pages) + size = ini_nr_pages - cur_pfn; remap_range_size = xen_find_pfn_range(&remap_pfn); if (!remap_range_size) { pr_warn("Unable to find available pfn range, not remapping identity pages\n"); xen_set_identity_and_release_chunk(cur_pfn, - cur_pfn + left, nr_pages); + cur_pfn + left); break; } /* Adjust size to fit in current e820 RAM region */ @@ -397,18 +399,18 @@ static unsigned long __init xen_set_identity_and_remap_chunk( } static unsigned long __init xen_count_remap_pages( - unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, + unsigned long start_pfn, unsigned long end_pfn, unsigned long remap_pages) { - if (start_pfn >= nr_pages) + if (start_pfn >= ini_nr_pages) return remap_pages; - return remap_pages + min(end_pfn, nr_pages) - start_pfn; + return remap_pages + min(end_pfn, ini_nr_pages) - start_pfn; } -static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, +static unsigned long __init xen_foreach_remap_area( unsigned long (*func)(unsigned long start_pfn, unsigned long end_pfn, - unsigned long nr_pages, unsigned long last_val)) + unsigned long last_val)) { 
phys_addr_t start = 0; unsigned long ret_val = 0; @@ -436,8 +438,7 @@ static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, end_pfn = PFN_UP(entry->addr);
[PATCH v3 1/7] xen: use correct end address of kernel for conflict checking
When running as a Xen PV dom0 the kernel is loaded by the hypervisor using a different memory map than that of the host. In order to minimize the required changes in the kernel, the kernel adapts its memory map to that of the host. In order to do that it is checking for conflicts of its load address with the host memory map. Unfortunately the tested memory range does not include the .brk area, which might result in crashes or memory corruption when this area does conflict with the memory map of the host. Fix the test by using the _end label instead of __bss_stop. Fixes: 808fdb71936c ("xen: check for kernel memory conflicting with memory layout") Signed-off-by: Juergen Gross Tested-by: Marek Marczykowski-Górecki Reviewed-by: Jan Beulich --- arch/x86/xen/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 806ddb2391d9..4bcc70a71b7d 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -825,7 +825,7 @@ char * __init xen_memory_setup(void) * to relocating (and even reusing) pages with kernel text or data. */ if (xen_is_e820_reserved(__pa_symbol(_text), - __pa_symbol(__bss_stop) - __pa_symbol(_text))) { +__pa_symbol(_end) - __pa_symbol(_text))) { xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n"); BUG(); } -- 2.43.0
[PATCH v3 7/7] xen: tolerate ACPI NVS memory overlapping with Xen allocated memory
In order to minimize required special handling for running as Xen PV dom0, the memory layout is modified to match that of the host. This requires to have only RAM at the locations where Xen allocated memory is living. Unfortunately there seem to be some machines, where ACPI NVS is located at 64 MB, resulting in a conflict with the loaded kernel or the initial page tables built by Xen. Avoid this conflict by swapping the ACPI NVS area in the memory map with unused RAM. This is possible via modification of the dom0 P2M map. Accesses to the ACPI NVS area are done either for saving and restoring it across suspend operations (this will work the same way as before), or by ACPI code when NVS memory is referenced from other ACPI tables. The latter case is handled by a Xen specific indirection of acpi_os_ioremap(). While the E820 map can (and should) be modified right away, the P2M map can be updated only after memory allocation is working, as the P2M map might need to be extended. Fixes: 808fdb71936c ("xen: check for kernel memory conflicting with memory layout") Signed-off-by: Juergen Gross Tested-by: Marek Marczykowski-Górecki --- V2: - remap helpers split off into other patch V3: - adjust commit message (Jan Beulich) --- arch/x86/xen/setup.c | 92 +++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 1114e49937da..c3db71d96c43 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -495,6 +495,8 @@ void __init xen_remap_memory(void) set_pte_mfn(buf, mfn_save, PAGE_KERNEL); pr_info("Remapped %ld page(s)\n", remapped); + + xen_do_remap_nonram(); } static unsigned long __init xen_get_pages_limit(void) @@ -625,14 +627,102 @@ phys_addr_t __init xen_find_free_area(phys_addr_t size) return 0; } +/* + * Swap a non-RAM E820 map entry with RAM above ini_nr_pages. + * Note that the E820 map is modified accordingly, but the P2M map isn't yet. 
+ * The adaptation of the P2M must be deferred until page allocation is possible. + */ +static void __init xen_e820_swap_entry_with_ram(struct e820_entry *swap_entry) +{ + struct e820_entry *entry; + unsigned int mapcnt; + phys_addr_t mem_end = PFN_PHYS(ini_nr_pages); + phys_addr_t swap_addr, swap_size, entry_end; + + swap_addr = PAGE_ALIGN_DOWN(swap_entry->addr); + swap_size = PAGE_ALIGN(swap_entry->addr - swap_addr + swap_entry->size); + entry = xen_e820_table.entries; + + for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) { + entry_end = entry->addr + entry->size; + if (entry->type == E820_TYPE_RAM && entry->size >= swap_size && + entry_end - swap_size >= mem_end) { + /* Reduce RAM entry by needed space (whole pages). */ + entry->size -= swap_size; + + /* Add new entry at the end of E820 map. */ + entry = xen_e820_table.entries + + xen_e820_table.nr_entries; + xen_e820_table.nr_entries++; + + /* Fill new entry (keep size and page offset). */ + entry->type = swap_entry->type; + entry->addr = entry_end - swap_size + + swap_addr - swap_entry->addr; + entry->size = swap_entry->size; + + /* Convert old entry to RAM, align to pages. */ + swap_entry->type = E820_TYPE_RAM; + swap_entry->addr = swap_addr; + swap_entry->size = swap_size; + + /* Remember PFN<->MFN relation for P2M update. */ + xen_add_remap_nonram(swap_addr, entry_end - swap_size, +swap_size); + + /* Order E820 table and merge entries. */ + e820__update_table(&xen_e820_table); + + return; + } + + entry++; + } + + xen_raw_console_write("No suitable area found for required E820 entry remapping action\n"); + BUG(); +} + +/* + * Look for non-RAM memory types in a specific guest physical area and move + * those away if possible (ACPI NVS only for now). 
+ */ +static void __init xen_e820_resolve_conflicts(phys_addr_t start, + phys_addr_t size) +{ + struct e820_entry *entry; + unsigned int mapcnt; + phys_addr_t end; + + if (!size) + return; + + end = start + size; + entry = xen_e820_table.entries; + + for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) { + if (entry->addr >
[PATCH v3 5/7] xen: add capability to remap non-RAM pages to different PFNs
When running as a Xen PV dom0 it can happen that the kernel is being loaded to a guest physical address conflicting with the host memory map. In order to be able to resolve this conflict, add the capability to remap non-RAM areas to different guest PFNs. A function to use this remapping information for other purposes than doing the remap will be added when needed. As the number of conflicts should be rather low (currently only machines with max. 1 conflict are known), save the remap data in a small statically allocated array. Signed-off-by: Juergen Gross --- V2: - split off from patch 5 of V1 of the series - moved to p2m.c V3: - add const qualifier (Jan Beulich) - change remap size type to size_t (Jan Beulich) - add __ro_after_init qualifier (Jan Beulich) - log frame numbers in hex (Jan Beulich) - always issue message regarding number of remapped pages (Jan Beulich) - add sanity check (Jan Beulich) - fix remap loop (Jan Beulich) Signed-off-by: Juergen Gross --- arch/x86/xen/p2m.c | 65 ++ arch/x86/xen/xen-ops.h | 3 ++ 2 files changed, 68 insertions(+) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 7c735b730acd..5b2aeae6f9e4 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -80,6 +80,7 @@ #include #include #include +#include #include "xen-ops.h" @@ -792,6 +793,70 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, return ret; } +/* Remapped non-RAM areas */ +#define NR_NONRAM_REMAP 4 +static struct nonram_remap { + phys_addr_t maddr; + phys_addr_t paddr; + size_t size; +} xen_nonram_remap[NR_NONRAM_REMAP] __ro_after_init; +static unsigned int nr_nonram_remap __ro_after_init; + +/* + * Do the real remapping of non-RAM regions as specified in the + * xen_nonram_remap[] array. + * In case of an error just crash the system. 
+ */ +void __init xen_do_remap_nonram(void) +{ + unsigned int i; + unsigned int remapped = 0; + const struct nonram_remap *remap = xen_nonram_remap; + unsigned long pfn, mfn, end_pfn; + + for (i = 0; i < nr_nonram_remap; i++) { + end_pfn = PFN_UP(remap->paddr + remap->size); + pfn = PFN_DOWN(remap->paddr); + mfn = PFN_DOWN(remap->maddr); + while (pfn < end_pfn) { + if (!set_phys_to_machine(pfn, mfn)) { + pr_err("Failed to set p2m mapping for pfn=%lx mfn=%lx\n", + pfn, mfn); + BUG(); + } + + pfn++; + mfn++; + remapped++; + } + + remap++; + } + + pr_info("Remapped %u non-RAM page(s)\n", remapped); +} + +/* + * Add a new non-RAM remap entry. + * In case of no free entry found, just crash the system. + */ +void __init xen_add_remap_nonram(phys_addr_t maddr, phys_addr_t paddr, +unsigned long size) +{ + BUG_ON((maddr & ~PAGE_MASK) != (paddr & ~PAGE_MASK)); + + if (nr_nonram_remap == NR_NONRAM_REMAP) { + xen_raw_console_write("Number of required E820 entry remapping actions exceed maximum value\n"); + BUG(); + } + + xen_nonram_remap[nr_nonram_remap].maddr = maddr; + xen_nonram_remap[nr_nonram_remap].paddr = paddr; + xen_nonram_remap[nr_nonram_remap].size = size; + + nr_nonram_remap++; +} + #ifdef CONFIG_XEN_DEBUG_FS #include static int p2m_dump_show(struct seq_file *m, void *v) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9a27d1d653d3..e1b782e823e6 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -47,6 +47,9 @@ void xen_mm_unpin_all(void); #ifdef CONFIG_X86_64 void __init xen_relocate_p2m(void); #endif +void __init xen_do_remap_nonram(void); +void __init xen_add_remap_nonram(phys_addr_t maddr, phys_addr_t paddr, +unsigned long size); void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size, const char *component); -- 2.43.0
[PATCH v3 3/7] xen: move checks for e820 conflicts further up
Move the checks for e820 memory map conflicts using the xen_chk_is_e820_usable() helper further up in order to prepare resolving some of the possible conflicts by doing some e820 map modifications, which must happen before evaluating the RAM layout. Signed-off-by: Juergen Gross Tested-by: Marek Marczykowski-Górecki Reviewed-by: Jan Beulich --- arch/x86/xen/setup.c | 44 ++-- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 96765180514b..dba68951ed6b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -764,6 +764,28 @@ char * __init xen_memory_setup(void) /* Make sure the Xen-supplied memory map is well-ordered. */ e820__update_table(&xen_e820_table); + /* +* Check whether the kernel itself conflicts with the target E820 map. +* Failing now is better than running into weird problems later due +* to relocating (and even reusing) pages with kernel text or data. +*/ + xen_chk_is_e820_usable(__pa_symbol(_text), + __pa_symbol(_end) - __pa_symbol(_text), + "kernel"); + + /* +* Check for a conflict of the xen_start_info memory with the target +* E820 map. +*/ + xen_chk_is_e820_usable(__pa(xen_start_info), sizeof(*xen_start_info), + "xen_start_info"); + + /* +* Check for a conflict of the hypervisor supplied page tables with +* the target E820 map. +*/ + xen_pt_check_e820(); + max_pages = xen_get_max_pages(); /* How many extra pages do we need due to remapping? */ @@ -836,28 +858,6 @@ char * __init xen_memory_setup(void) e820__update_table(e820_table); - /* -* Check whether the kernel itself conflicts with the target E820 map. -* Failing now is better than running into weird problems later due -* to relocating (and even reusing) pages with kernel text or data. -*/ - xen_chk_is_e820_usable(__pa_symbol(_text), - __pa_symbol(_end) - __pa_symbol(_text), - "kernel"); - - /* -* Check for a conflict of the xen_start_info memory with the target -* E820 map. 
-*/ - xen_chk_is_e820_usable(__pa(xen_start_info), sizeof(*xen_start_info), - "xen_start_info"); - - /* -* Check for a conflict of the hypervisor supplied page tables with -* the target E820 map. -*/ - xen_pt_check_e820(); - xen_reserve_xen_mfnlist(); /* Check for a conflict of the initrd with the target E820 map. */ -- 2.43.0
[PATCH v3 2/7] xen: introduce generic helper checking for memory map conflicts
When booting as a Xen PV dom0 the memory layout of the dom0 is modified to match that of the host, as this requires less changes in the kernel for supporting Xen. There are some cases, though, which are problematic, as it is the Xen hypervisor selecting the kernel's load address plus some other data, which might conflict with the host's memory map. These conflicts are detected at boot time and result in a boot error. In order to support handling at least some of these conflicts in future, introduce a generic helper function which will later gain the ability to adapt the memory layout when possible. Add the missing check for the xen_start_info area. Note that possible p2m map and initrd memory conflicts are handled already by copying the data to memory areas not conflicting with the memory map. The initial stack allocated by Xen doesn't need to be checked, as early boot code is switching to the statically allocated initial kernel stack. Initial page tables and the kernel itself will be handled later. 
Signed-off-by: Juergen Gross Tested-by: Marek Marczykowski-Górecki Reviewed-by: Jan Beulich --- arch/x86/xen/mmu_pv.c | 5 + arch/x86/xen/setup.c | 34 -- arch/x86/xen/xen-ops.h | 3 ++- 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index f1ce39d6d32c..839e6613753d 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2018,10 +2018,7 @@ void __init xen_reserve_special_pages(void) void __init xen_pt_check_e820(void) { - if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) { - xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n"); - BUG(); - } + xen_chk_is_e820_usable(xen_pt_base, xen_pt_size, "page table"); } static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 4bcc70a71b7d..96765180514b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -567,7 +567,7 @@ static void __init xen_ignore_unusable(void) } } -bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) +static bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) { struct e820_entry *entry; unsigned mapcnt; @@ -624,6 +624,23 @@ phys_addr_t __init xen_find_free_area(phys_addr_t size) return 0; } +/* + * Check for an area in physical memory to be usable for non-movable purposes. + * An area is considered to usable if the used E820 map lists it to be RAM. + * In case the area is not usable, crash the system with an error message. + */ +void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size, + const char *component) +{ + if (!xen_is_e820_reserved(start, size)) + return; + + xen_raw_console_write("Xen hypervisor allocated "); + xen_raw_console_write(component); + xen_raw_console_write(" memory conflicts with E820 map\n"); + BUG(); +} + /* * Like memcpy, but with physical addresses for dest and src. 
*/ @@ -824,11 +841,16 @@ char * __init xen_memory_setup(void) * Failing now is better than running into weird problems later due * to relocating (and even reusing) pages with kernel text or data. */ - if (xen_is_e820_reserved(__pa_symbol(_text), -__pa_symbol(_end) - __pa_symbol(_text))) { - xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n"); - BUG(); - } + xen_chk_is_e820_usable(__pa_symbol(_text), + __pa_symbol(_end) - __pa_symbol(_text), + "kernel"); + + /* +* Check for a conflict of the xen_start_info memory with the target +* E820 map. +*/ + xen_chk_is_e820_usable(__pa(xen_start_info), sizeof(*xen_start_info), + "xen_start_info"); /* * Check for a conflict of the hypervisor supplied page tables with diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 0cf16fc79e0b..9a27d1d653d3 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -48,7 +48,8 @@ void xen_mm_unpin_all(void); void __init xen_relocate_p2m(void); #endif -bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size); +void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size, + const char *component); unsigned long __ref xen_chk_extra_mem(unsigned long pfn); void __init xen_inv_extra_mem(void); void __init xen_remap_memory(void); -- 2.43.0
[PATCH v3 0/7] xen: fix dom0 PV boot on some AMD machines
There have been reports of failed boots with Xen due to an overlap of the kernel's memory with ACPI NVS reported in the E820 map of the host. This series fixes this issue by moving the NVS area in dom0 to some higher address. Changes in V2: - split of v1 patch 5 - new patch 6 Changes in V3: - addressed comments Juergen Gross (7): xen: use correct end address of kernel for conflict checking xen: introduce generic helper checking for memory map conflicts xen: move checks for e820 conflicts further up xen: move max_pfn in xen_memory_setup() out of function scope xen: add capability to remap non-RAM pages to different PFNs xen: allow mapping ACPI data using a different physical address xen: tolerate ACPI NVS memory overlapping with Xen allocated memory arch/x86/include/asm/acpi.h| 8 ++ arch/x86/kernel/acpi/boot.c| 10 ++ arch/x86/kernel/mmconf-fam10h_64.c | 2 +- arch/x86/kernel/x86_init.c | 2 +- arch/x86/xen/mmu_pv.c | 5 +- arch/x86/xen/p2m.c | 99 ++ arch/x86/xen/setup.c | 202 ++--- arch/x86/xen/xen-ops.h | 6 +- 8 files changed, 282 insertions(+), 52 deletions(-) -- 2.43.0
Re: [PATCH v2 6/7] xen: allow mapping ACPI data using a different physical address
On 20.08.24 11:56, Jan Beulich wrote: On 20.08.2024 10:20, Juergen Gross wrote: @@ -838,6 +839,31 @@ void __init xen_do_remap_nonram(void) pr_info("Remapped %u non-RAM page(s)\n", remapped); } +/* + * Xen variant of acpi_os_ioremap() taking potentially remapped non-RAM + * regions into acount. + * Any attempt to map an area crossing a remap boundary will produce a + * WARN() splat. + */ +static void __iomem *xen_acpi_os_ioremap(acpi_physical_address phys, +acpi_size size) +{ + unsigned int i; + struct nonram_remap *remap = xen_nonram_remap; const (also in one of the functions in patch 5)? Yes. + for (i = 0; i < nr_nonram_remap; i++) { + if (phys + size > remap->maddr && + phys < remap->maddr + remap->size) { + WARN_ON(phys < remap->maddr || + phys + size > remap->maddr + remap->size); + phys = remap->paddr + phys - remap->maddr; + break; + } + } + + return x86_acpi_os_ioremap(phys, size); +} At least this, perhaps also what patch 5 adds, likely wants to be limited to the XEN_DOM0 case? Or else I wonder whether ... @@ -850,6 +876,10 @@ void __init xen_add_remap_nonram(phys_addr_t maddr, phys_addr_t paddr, BUG(); } + /* Switch to the Xen acpi_os_ioremap() variant. */ + if (nr_nonram_remap == 0) + acpi_os_ioremap = xen_acpi_os_ioremap; ... this would actually build when XEN_DOM0=n. I'm actually surprised there's no Dom0-only code section in this file, where the new code could then simply be inserted. I'd rather make this conditional on CONFIG_ACPI. Depending on how Xen tools will handle a PV-domain with "e820_host=1" this code might be important for domUs, too. Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
[PATCH v2 6/7] xen: allow mapping ACPI data using a different physical address
When running as a Xen PV dom0 the system needs to map ACPI data of the host using host physical addresses, while those addresses can conflict with the guest physical addresses of the loaded linux kernel. This conflict can be solved by mapping the ACPI data to a different guest physical address, but mapping the data via acpi_os_ioremap() must still be possible using the host physical address, as this address might be generated by AML when referencing some of the ACPI data. When configured to support running as a Xen PV dom0, have an implementation of acpi_os_ioremap() being aware of the possibility to need above mentioned translation of a host physical address to the guest physical address. This modification requires to fix some #include of asm/acpi.h in x86 code to use linux/acpi.h instead. Signed-off-by: Juergen Gross --- V2: - new patch (Jan Beulich) --- arch/x86/include/asm/acpi.h| 8 arch/x86/kernel/acpi/boot.c| 10 ++ arch/x86/kernel/mmconf-fam10h_64.c | 2 +- arch/x86/kernel/x86_init.c | 2 +- arch/x86/xen/p2m.c | 30 ++ arch/x86/xen/setup.c | 2 +- 6 files changed, 51 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 21bc53f5ed0c..42067643e615 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -174,6 +174,14 @@ void acpi_generic_reduced_hw_init(void); void x86_default_set_root_pointer(u64 addr); u64 x86_default_get_root_pointer(void); +#ifdef CONFIG_XEN_PV_DOM0 +/* A Xen PV dom0 needs a special acpi_os_ioremap() handling. 
*/ +extern void __iomem * (*acpi_os_ioremap)(acpi_physical_address phys, +acpi_size size); +void __iomem *x86_acpi_os_ioremap(acpi_physical_address phys, acpi_size size); +#define acpi_os_ioremap acpi_os_ioremap +#endif + #else /* !CONFIG_ACPI */ #define acpi_lapic 0 diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 9f4618dcd704..6dd01d15f277 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1778,3 +1778,13 @@ u64 x86_default_get_root_pointer(void) { return boot_params.acpi_rsdp_addr; } + +#ifdef CONFIG_XEN_PV_DOM0 +void __iomem *x86_acpi_os_ioremap(acpi_physical_address phys, acpi_size size) +{ + return ioremap_cache(phys, size); +} + +void __iomem * (*acpi_os_ioremap)(acpi_physical_address phys, acpi_size size) = + x86_acpi_os_ioremap; +#endif diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index c94dec6a1834..8347a29f9db4 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -9,12 +9,12 @@ #include #include #include +#include #include #include #include #include -#include #include #include diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 82b128d3f309..47ef8af23101 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -8,8 +8,8 @@ #include #include #include +#include -#include #include #include #include diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index bb55e0fe1a04..61aaf066bf3e 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include @@ -838,6 +839,31 @@ void __init xen_do_remap_nonram(void) pr_info("Remapped %u non-RAM page(s)\n", remapped); } +/* + * Xen variant of acpi_os_ioremap() taking potentially remapped non-RAM + * regions into acount. + * Any attempt to map an area crossing a remap boundary will produce a + * WARN() splat. 
+ */ +static void __iomem *xen_acpi_os_ioremap(acpi_physical_address phys, +acpi_size size) +{ + unsigned int i; + struct nonram_remap *remap = xen_nonram_remap; + + for (i = 0; i < nr_nonram_remap; i++) { + if (phys + size > remap->maddr && + phys < remap->maddr + remap->size) { + WARN_ON(phys < remap->maddr || + phys + size > remap->maddr + remap->size); + phys = remap->paddr + phys - remap->maddr; + break; + } + } + + return x86_acpi_os_ioremap(phys, size); +} + /* * Add a new non-RAM remap entry. * In case of no free entry found, just crash the system. @@ -850,6 +876,10 @@ void __init xen_add_remap_nonram(phys_addr_t maddr, phys_addr_t paddr, BUG(); } + /* Switch to the Xen acpi_os_ioremap() variant. */ + if (nr_nonram_remap == 0) + acpi_os_ioremap = xen_acpi_os_ioremap; + xen_nonram_remap[nr_nonram_remap].maddr = maddr; xen_nonram_remap[nr_nonram_remap].paddr = paddr; xen_nonram_remap[nr_nonram_remap].size = size; diff
[PATCH v2 4/7] xen: move max_pfn in xen_memory_setup() out of function scope
Instead of having max_pfn as a local variable of xen_memory_setup(), make it a static variable in setup.c instead. This avoids having to pass it to subfunctions, which will be needed in more cases in future. Rename it to ini_nr_pages, as the value denotes the currently usable number of memory pages as passed from the hypervisor at boot time. Signed-off-by: Juergen Gross Tested-by: Marek Marczykowski-Górecki --- arch/x86/xen/setup.c | 53 ++-- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index dba68951ed6b..d678c0330971 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -46,6 +46,9 @@ bool xen_pv_pci_possible; /* E820 map used during setting up memory. */ static struct e820_table xen_e820_table __initdata; +/* Number of initially usable memory pages. */ +static unsigned long ini_nr_pages __initdata; + /* * Buffer used to remap identity mapped pages. We only need the virtual space. * The physical page behind this address is remapped as needed to different @@ -212,7 +215,7 @@ static int __init xen_free_mfn(unsigned long mfn) * as a fallback if the remapping fails. */ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, - unsigned long end_pfn, unsigned long nr_pages) + unsigned long end_pfn) { unsigned long pfn, end; int ret; @@ -220,7 +223,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, WARN_ON(start_pfn > end_pfn); /* Release pages first. */ - end = min(end_pfn, nr_pages); + end = min(end_pfn, ini_nr_pages); for (pfn = start_pfn; pfn < end; pfn++) { unsigned long mfn = pfn_to_mfn(pfn); @@ -341,15 +344,14 @@ static void __init xen_do_set_identity_and_remap_chunk( * to Xen and not remapped. 
*/ static unsigned long __init xen_set_identity_and_remap_chunk( - unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, - unsigned long remap_pfn) + unsigned long start_pfn, unsigned long end_pfn, unsigned long remap_pfn) { unsigned long pfn; unsigned long i = 0; unsigned long n = end_pfn - start_pfn; if (remap_pfn == 0) - remap_pfn = nr_pages; + remap_pfn = ini_nr_pages; while (i < n) { unsigned long cur_pfn = start_pfn + i; @@ -358,19 +360,19 @@ static unsigned long __init xen_set_identity_and_remap_chunk( unsigned long remap_range_size; /* Do not remap pages beyond the current allocation */ - if (cur_pfn >= nr_pages) { + if (cur_pfn >= ini_nr_pages) { /* Identity map remaining pages */ set_phys_range_identity(cur_pfn, cur_pfn + size); break; } - if (cur_pfn + size > nr_pages) - size = nr_pages - cur_pfn; + if (cur_pfn + size > ini_nr_pages) + size = ini_nr_pages - cur_pfn; remap_range_size = xen_find_pfn_range(&remap_pfn); if (!remap_range_size) { pr_warn("Unable to find available pfn range, not remapping identity pages\n"); xen_set_identity_and_release_chunk(cur_pfn, - cur_pfn + left, nr_pages); + cur_pfn + left); break; } /* Adjust size to fit in current e820 RAM region */ @@ -397,18 +399,18 @@ static unsigned long __init xen_set_identity_and_remap_chunk( } static unsigned long __init xen_count_remap_pages( - unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, + unsigned long start_pfn, unsigned long end_pfn, unsigned long remap_pages) { - if (start_pfn >= nr_pages) + if (start_pfn >= ini_nr_pages) return remap_pages; - return remap_pages + min(end_pfn, nr_pages) - start_pfn; + return remap_pages + min(end_pfn, ini_nr_pages) - start_pfn; } -static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, +static unsigned long __init xen_foreach_remap_area( unsigned long (*func)(unsigned long start_pfn, unsigned long end_pfn, - unsigned long nr_pages, unsigned long last_val)) + unsigned long last_val)) { 
phys_addr_t start = 0; unsigned long ret_val = 0; @@ -436,8 +438,7 @@ static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, end_pfn = PFN_UP(entry->addr); if (start_pfn < end_pfn) -
[PATCH v2 7/7] xen: tolerate ACPI NVS memory overlapping with Xen allocated memory
In order to minimize required special handling for running as Xen PV dom0, the memory layout is modified to match that of the host. This requires to have only RAM at the locations where Xen allocated memory is living. Unfortunately there seem to be some machines, where ACPI NVS is located at 64 MB, resulting in a conflict with the loaded kernel or the initial page tables built by Xen. As ACPI NVS needs to be accessed by the kernel only for saving and restoring it across suspend operations, it can be relocated in the dom0's memory map by swapping it with unused RAM (this is possible via modification of the dom0 P2M map). While the E820 map can (and should) be modified right away, the P2M map can be updated only after memory allocation is working, as the P2M map might need to be extended. Fixes: 808fdb71936c ("xen: check for kernel memory conflicting with memory layout") Signed-off-by: Juergen Gross Tested-by: Marek Marczykowski-Górecki --- V2: - remap helpers split off into other patch --- arch/x86/xen/setup.c | 92 +++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 88b2ebd23da3..5697f1cdd6a0 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -495,6 +495,8 @@ void __init xen_remap_memory(void) set_pte_mfn(buf, mfn_save, PAGE_KERNEL); pr_info("Remapped %ld page(s)\n", remapped); + + xen_do_remap_nonram(); } static unsigned long __init xen_get_pages_limit(void) @@ -625,14 +627,102 @@ phys_addr_t __init xen_find_free_area(phys_addr_t size) return 0; } +/* + * Swap a non-RAM E820 map entry with RAM above ini_nr_pages. + * Note that the E820 map is modified accordingly, but the P2M map isn't yet. + * The adaption of the P2M must be deferred until page allocation is possible. 
+ */ +static void __init xen_e820_swap_entry_with_ram(struct e820_entry *swap_entry) +{ + struct e820_entry *entry; + unsigned int mapcnt; + phys_addr_t mem_end = PFN_PHYS(ini_nr_pages); + phys_addr_t swap_addr, swap_size, entry_end; + + swap_addr = PAGE_ALIGN_DOWN(swap_entry->addr); + swap_size = PAGE_ALIGN(swap_entry->addr - swap_addr + swap_entry->size); + entry = xen_e820_table.entries; + + for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) { + entry_end = entry->addr + entry->size; + if (entry->type == E820_TYPE_RAM && entry->size >= swap_size && + entry_end - swap_size >= mem_end) { + /* Reduce RAM entry by needed space (whole pages). */ + entry->size -= swap_size; + + /* Add new entry at the end of E820 map. */ + entry = xen_e820_table.entries + + xen_e820_table.nr_entries; + xen_e820_table.nr_entries++; + + /* Fill new entry (keep size and page offset). */ + entry->type = swap_entry->type; + entry->addr = entry_end - swap_size + + swap_addr - swap_entry->addr; + entry->size = swap_entry->size; + + /* Convert old entry to RAM, align to pages. */ + swap_entry->type = E820_TYPE_RAM; + swap_entry->addr = swap_addr; + swap_entry->size = swap_size; + + /* Remember PFN<->MFN relation for P2M update. */ + xen_add_remap_nonram(swap_addr, entry_end - swap_size, +swap_size); + + /* Order E820 table and merge entries. */ + e820__update_table(&xen_e820_table); + + return; + } + + entry++; + } + + xen_raw_console_write("No suitable area found for required E820 entry remapping action\n"); + BUG(); +} + +/* + * Look for non-RAM memory types in a specific guest physical area and move + * those away if possible (ACPI NVS only for now). 
+ */ +static void __init xen_e820_resolve_conflicts(phys_addr_t start, + phys_addr_t size) +{ + struct e820_entry *entry; + unsigned int mapcnt; + phys_addr_t end; + + if (!size) + return; + + end = start + size; + entry = xen_e820_table.entries; + + for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) { + if (entry->addr >= end) + return; + + if (entry->addr + entry->size > start && + entry->type == E820_TYPE_NVS) + xen_e820_swap_entry_with_ram(entry); + +
[PATCH v2 5/7] xen: add capability to remap non-RAM pages to different PFNs
When running as a Xen PV dom0 it can happen that the kernel is being loaded to a guest physical address conflicting with the host memory map. In order to be able to resolve this conflict, add the capability to remap non-RAM areas to different guest PFNs. A function to use this remapping information for other purposes than doing the remap will be added when needed. As the number of conflicts should be rather low (currently only machines with max. 1 conflict are known), save the remap data in a small statically allocated array. Signed-off-by: Juergen Gross --- V2: - split off from patch 5 of V1 of the series - moved to p2m.c --- arch/x86/xen/p2m.c | 65 ++ arch/x86/xen/xen-ops.h | 3 ++ 2 files changed, 68 insertions(+) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 7c735b730acd..bb55e0fe1a04 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -80,6 +80,7 @@ #include #include #include +#include #include "xen-ops.h" @@ -792,6 +793,70 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, return ret; } +/* Remapped non-RAM areas */ +#define NR_NONRAM_REMAP 4 +static struct nonram_remap { + phys_addr_t maddr; + phys_addr_t paddr; + unsigned long size; +} xen_nonram_remap[NR_NONRAM_REMAP]; +static unsigned int nr_nonram_remap; + +/* + * Do the real remapping of non-RAM regions as specified in the + * xen_nonram_remap[] array. + * In case of an error just crash the system. 
+ */ +void __init xen_do_remap_nonram(void) +{ + unsigned int i; + unsigned int remapped = 0; + struct nonram_remap *remap = xen_nonram_remap; + unsigned long pfn, mfn, len; + + if (!nr_nonram_remap) + return; + + for (i = 0; i < nr_nonram_remap; i++) { + pfn = PFN_DOWN(remap->paddr); + mfn = PFN_DOWN(remap->maddr); + for (len = 0; len < remap->size; len += PAGE_SIZE) { + if (!set_phys_to_machine(pfn, mfn)) { + pr_err("Failed to set p2m mapping for pfn=%ld mfn=%ld\n", + pfn, mfn); + BUG(); + } + + pfn++; + mfn++; + remapped++; + } + + remap++; + } + + pr_info("Remapped %u non-RAM page(s)\n", remapped); +} + +/* + * Add a new non-RAM remap entry. + * In case of no free entry found, just crash the system. + */ +void __init xen_add_remap_nonram(phys_addr_t maddr, phys_addr_t paddr, +unsigned long size) +{ + if (nr_nonram_remap == NR_NONRAM_REMAP) { + xen_raw_console_write("Number of required E820 entry remapping actions exceed maximum value\n"); + BUG(); + } + + xen_nonram_remap[nr_nonram_remap].maddr = maddr; + xen_nonram_remap[nr_nonram_remap].paddr = paddr; + xen_nonram_remap[nr_nonram_remap].size = size; + + nr_nonram_remap++; +} + #ifdef CONFIG_XEN_DEBUG_FS #include static int p2m_dump_show(struct seq_file *m, void *v) diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 9a27d1d653d3..e1b782e823e6 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -47,6 +47,9 @@ void xen_mm_unpin_all(void); #ifdef CONFIG_X86_64 void __init xen_relocate_p2m(void); #endif +void __init xen_do_remap_nonram(void); +void __init xen_add_remap_nonram(phys_addr_t maddr, phys_addr_t paddr, +unsigned long size); void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size, const char *component); -- 2.43.0
[PATCH v2 2/7] xen: introduce generic helper checking for memory map conflicts
When booting as a Xen PV dom0 the memory layout of the dom0 is modified to match that of the host, as this requires less changes in the kernel for supporting Xen. There are some cases, though, which are problematic, as it is the Xen hypervisor selecting the kernel's load address plus some other data, which might conflict with the host's memory map. These conflicts are detected at boot time and result in a boot error. In order to support handling at least some of these conflicts in future, introduce a generic helper function which will later gain the ability to adapt the memory layout when possible. Add the missing check for the xen_start_info area. Signed-off-by: Juergen Gross Tested-by: Marek Marczykowski-Górecki --- arch/x86/xen/mmu_pv.c | 5 + arch/x86/xen/setup.c | 34 -- arch/x86/xen/xen-ops.h | 3 ++- 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index f1ce39d6d32c..839e6613753d 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2018,10 +2018,7 @@ void __init xen_reserve_special_pages(void) void __init xen_pt_check_e820(void) { - if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) { - xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n"); - BUG(); - } + xen_chk_is_e820_usable(xen_pt_base, xen_pt_size, "page table"); } static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 4bcc70a71b7d..96765180514b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -567,7 +567,7 @@ static void __init xen_ignore_unusable(void) } } -bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) +static bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) { struct e820_entry *entry; unsigned mapcnt; @@ -624,6 +624,23 @@ phys_addr_t __init xen_find_free_area(phys_addr_t size) return 0; } +/* + * Check for an area in physical memory to be usable for 
non-movable purposes. + * An area is considered to usable if the used E820 map lists it to be RAM. + * In case the area is not usable, crash the system with an error message. + */ +void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size, + const char *component) +{ + if (!xen_is_e820_reserved(start, size)) + return; + + xen_raw_console_write("Xen hypervisor allocated "); + xen_raw_console_write(component); + xen_raw_console_write(" memory conflicts with E820 map\n"); + BUG(); +} + /* * Like memcpy, but with physical addresses for dest and src. */ @@ -824,11 +841,16 @@ char * __init xen_memory_setup(void) * Failing now is better than running into weird problems later due * to relocating (and even reusing) pages with kernel text or data. */ - if (xen_is_e820_reserved(__pa_symbol(_text), -__pa_symbol(_end) - __pa_symbol(_text))) { - xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n"); - BUG(); - } + xen_chk_is_e820_usable(__pa_symbol(_text), + __pa_symbol(_end) - __pa_symbol(_text), + "kernel"); + + /* +* Check for a conflict of the xen_start_info memory with the target +* E820 map. +*/ + xen_chk_is_e820_usable(__pa(xen_start_info), sizeof(*xen_start_info), + "xen_start_info"); /* * Check for a conflict of the hypervisor supplied page tables with diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 0cf16fc79e0b..9a27d1d653d3 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -48,7 +48,8 @@ void xen_mm_unpin_all(void); void __init xen_relocate_p2m(void); #endif -bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size); +void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size, + const char *component); unsigned long __ref xen_chk_extra_mem(unsigned long pfn); void __init xen_inv_extra_mem(void); void __init xen_remap_memory(void); -- 2.43.0
[PATCH v2 3/7] xen: move checks for e820 conflicts further up
Move the checks for e820 memory map conflicts using the xen_chk_is_e820_usable() helper further up in order to prepare resolving some of the possible conflicts by doing some e820 map modifications, which must happen before evaluating the RAM layout. Signed-off-by: Juergen Gross Tested-by: Marek Marczykowski-Górecki --- arch/x86/xen/setup.c | 44 ++-- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 96765180514b..dba68951ed6b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -764,6 +764,28 @@ char * __init xen_memory_setup(void) /* Make sure the Xen-supplied memory map is well-ordered. */ e820__update_table(&xen_e820_table); + /* +* Check whether the kernel itself conflicts with the target E820 map. +* Failing now is better than running into weird problems later due +* to relocating (and even reusing) pages with kernel text or data. +*/ + xen_chk_is_e820_usable(__pa_symbol(_text), + __pa_symbol(_end) - __pa_symbol(_text), + "kernel"); + + /* +* Check for a conflict of the xen_start_info memory with the target +* E820 map. +*/ + xen_chk_is_e820_usable(__pa(xen_start_info), sizeof(*xen_start_info), + "xen_start_info"); + + /* +* Check for a conflict of the hypervisor supplied page tables with +* the target E820 map. +*/ + xen_pt_check_e820(); + max_pages = xen_get_max_pages(); /* How many extra pages do we need due to remapping? */ @@ -836,28 +858,6 @@ char * __init xen_memory_setup(void) e820__update_table(e820_table); - /* -* Check whether the kernel itself conflicts with the target E820 map. -* Failing now is better than running into weird problems later due -* to relocating (and even reusing) pages with kernel text or data. -*/ - xen_chk_is_e820_usable(__pa_symbol(_text), - __pa_symbol(_end) - __pa_symbol(_text), - "kernel"); - - /* -* Check for a conflict of the xen_start_info memory with the target -* E820 map. 
-*/ - xen_chk_is_e820_usable(__pa(xen_start_info), sizeof(*xen_start_info), - "xen_start_info"); - - /* -* Check for a conflict of the hypervisor supplied page tables with -* the target E820 map. -*/ - xen_pt_check_e820(); - xen_reserve_xen_mfnlist(); /* Check for a conflict of the initrd with the target E820 map. */ -- 2.43.0
[PATCH v2 0/7] xen: fix dom0 PV boot on some AMD machines
There have been reports of failed boots with Xen due to an overlap of the kernel's memory with ACPI NVS reported in the E820 map of the host. This series fixes this issue by moving the NVS area in dom0 to some higher address. Changes in V2: - split off v1 patch 5 - new patch 6 Juergen Gross (7): xen: use correct end address of kernel for conflict checking xen: introduce generic helper checking for memory map conflicts xen: move checks for e820 conflicts further up xen: move max_pfn in xen_memory_setup() out of function scope xen: add capability to remap non-RAM pages to different PFNs xen: allow mapping ACPI data using a different physical address xen: tolerate ACPI NVS memory overlapping with Xen allocated memory arch/x86/include/asm/acpi.h| 8 ++ arch/x86/kernel/acpi/boot.c| 10 ++ arch/x86/kernel/mmconf-fam10h_64.c | 2 +- arch/x86/kernel/x86_init.c | 2 +- arch/x86/xen/mmu_pv.c | 5 +- arch/x86/xen/p2m.c | 95 ++ arch/x86/xen/setup.c | 203 ++--- arch/x86/xen/xen-ops.h | 6 +- 8 files changed, 279 insertions(+), 52 deletions(-) -- 2.43.0
[PATCH v2 1/7] xen: use correct end address of kernel for conflict checking
When running as a Xen PV dom0 the kernel is loaded by the hypervisor using a different memory map than that of the host. In order to minimize the required changes in the kernel, the kernel adapts its memory map to that of the host. In order to do that it is checking for conflicts of its load address with the host memory map. Unfortunately the tested memory range does not include the .brk area, which might result in crashes or memory corruption when this area does conflict with the memory map of the host. Fix the test by using the _end label instead of __bss_stop. Fixes: 808fdb71936c ("xen: check for kernel memory conflicting with memory layout") Signed-off-by: Juergen Gross Tested-by: Marek Marczykowski-Górecki --- arch/x86/xen/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 806ddb2391d9..4bcc70a71b7d 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -825,7 +825,7 @@ char * __init xen_memory_setup(void) * to relocating (and even reusing) pages with kernel text or data. */ if (xen_is_e820_reserved(__pa_symbol(_text), - __pa_symbol(__bss_stop) - __pa_symbol(_text))) { +__pa_symbol(_end) - __pa_symbol(_text))) { xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n"); BUG(); } -- 2.43.0
Re: [PATCH 0/5] xen: fix dom0 PV boot on some AMD machines
On 07.08.24 12:41, Juergen Gross wrote: There have been reports of failed boots with Xen due to an overlap of the kernel's memory with ACPI NVS reported in the E820 map of the host. This series fixes this issue by moving the NVS area in dom0 to some higher address. Juergen Gross (5): xen: use correct end address of kernel for conflict checking xen: introduce generic helper checking for memory map conflicts xen: move checks for e820 conflicts further up xen: move max_pfn in xen_memory_setup() out of function scope xen: tolerate ACPI NVS memory overlapping with Xen allocated memory arch/x86/xen/mmu_pv.c | 5 +- arch/x86/xen/setup.c | 242 + arch/x86/xen/xen-ops.h | 3 +- 3 files changed, 201 insertions(+), 49 deletions(-) While testing V2 of this series another thought came to my mind: What about PV guests on such a system with e820_host=1? I guess those will have a similar problem as a PV dom0, as the ACPI NVS entry in the memory map will be at the same position as on the host. Shouldn't we get rid of the ACPI related e820 entries in the guests? Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
Re: [PATCH v2 4/5] x86/kernel: Move page table macros to header
On 14.08.24 21:50, Jason Andryuk wrote: The PVH entry point will need an additional set of prebuild page tables. Move the macros and defines to pgtable_64.h, so they can be re-used. Signed-off-by: Jason Andryuk Reviewed-by: Juergen Gross Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
[PATCH v2 0/3] mini-os: mm: use a generic page table walker
Instead of open coding a page table walk multiple times, use a generic page table walker instead. This new page table walker will be used later for kexec support, too. Changes in V2: - addressed comments Juergen Gross (3): mini-os: mm: introduce generic page table walk function mini-os: mm: switch need_pgt() to use walk_pt() mini-os: mm: convert set_readonly() to use walk_pt() arch/x86/mm.c | 353 +- 1 file changed, 204 insertions(+), 149 deletions(-) -- 2.43.0
[PATCH v2 2/3] mini-os: mm: switch need_pgt() to use walk_pt()
Instead of open coding a page table walk, use walk_pt() in need_pgt(). Signed-off-by: Juergen Gross --- V2: - add comment and ASSERT() (Samuel Thibault) --- arch/x86/mm.c | 72 +-- 1 file changed, 30 insertions(+), 42 deletions(-) diff --git a/arch/x86/mm.c b/arch/x86/mm.c index 9849b985..84a6d7f0 100644 --- a/arch/x86/mm.c +++ b/arch/x86/mm.c @@ -523,57 +523,45 @@ static pgentry_t *get_pgt(unsigned long va) * return a valid PTE for a given virtual address. If PTE does not exist, * allocate page-table pages. */ -pgentry_t *need_pgt(unsigned long va) +static int need_pgt_func(unsigned long va, unsigned int lvl, bool is_leaf, + pgentry_t *pte, void *par) { +pgentry_t **result = par; unsigned long pt_mfn; -pgentry_t *tab; unsigned long pt_pfn; -unsigned offset; +unsigned int idx; -tab = pt_base; -pt_mfn = virt_to_mfn(pt_base); +if ( !is_leaf ) +return 0; -#if defined(__x86_64__) -offset = l4_table_offset(va); -if ( !(tab[offset] & _PAGE_PRESENT) ) -{ -pt_pfn = virt_to_pfn(alloc_page()); -if ( !pt_pfn ) -return NULL; -new_pt_frame(&pt_pfn, pt_mfn, offset, L3_FRAME); -} -ASSERT(tab[offset] & _PAGE_PRESENT); -pt_mfn = pte_to_mfn(tab[offset]); -tab = mfn_to_virt(pt_mfn); -#endif -offset = l3_table_offset(va); -if ( !(tab[offset] & _PAGE_PRESENT) ) -{ -pt_pfn = virt_to_pfn(alloc_page()); -if ( !pt_pfn ) -return NULL; -new_pt_frame(&pt_pfn, pt_mfn, offset, L2_FRAME); -} -ASSERT(tab[offset] & _PAGE_PRESENT); -pt_mfn = pte_to_mfn(tab[offset]); -tab = mfn_to_virt(pt_mfn); -offset = l2_table_offset(va); -if ( !(tab[offset] & _PAGE_PRESENT) ) +if ( lvl == L1_FRAME || (*pte & _PAGE_PRESENT) ) { -pt_pfn = virt_to_pfn(alloc_page()); -if ( !pt_pfn ) -return NULL; -new_pt_frame(&pt_pfn, pt_mfn, offset, L1_FRAME); +/* + * The PTE is not addressing a page table (is_leaf is true). If we are + * either at the lowest level or we have a valid large page, we don't + * need to allocate a page table. 
+ */ +ASSERT(lvl == L1_FRAME || (*pte & _PAGE_PSE)); +*result = pte; +return 1; } -ASSERT(tab[offset] & _PAGE_PRESENT); -if ( tab[offset] & _PAGE_PSE ) -return &tab[offset]; -pt_mfn = pte_to_mfn(tab[offset]); -tab = mfn_to_virt(pt_mfn); +pt_mfn = virt_to_mfn(pte); +pt_pfn = virt_to_pfn(alloc_page()); +if ( !pt_pfn ) +return -1; +idx = idx_from_va_lvl(va, lvl); +new_pt_frame(&pt_pfn, pt_mfn, idx, lvl - 1); -offset = l1_table_offset(va); -return &tab[offset]; +return 0; +} + +pgentry_t *need_pgt(unsigned long va) +{ +pgentry_t *tab = NULL; + +walk_pt(va, va, need_pgt_func, &tab); +return tab; } EXPORT_SYMBOL(need_pgt); -- 2.43.0
[PATCH v2 3/3] mini-os: mm: convert set_readonly() to use walk_pt()
Instead of having another copy of a page table walk in set_readonly(), just use walk_pt(). As it will be needed later anyway, split out the TLB flushing into a dedicated function. Signed-off-by: Juergen Gross --- V2: - clear count after doing an mmu_update call (Samuel Thibault) - do final mmu_update call from set_readonly() if needed (Samuel Thibault) --- arch/x86/mm.c | 124 +++--- 1 file changed, 56 insertions(+), 68 deletions(-) diff --git a/arch/x86/mm.c b/arch/x86/mm.c index 84a6d7f0..85827d93 100644 --- a/arch/x86/mm.c +++ b/arch/x86/mm.c @@ -402,92 +402,80 @@ static void build_pagetable(unsigned long *start_pfn, unsigned long *max_pfn) * Mark portion of the address space read only. */ extern struct shared_info shared_info; -static void set_readonly(void *text, void *etext) -{ -unsigned long start_address = -((unsigned long) text + PAGE_SIZE - 1) & PAGE_MASK; -unsigned long end_address = (unsigned long) etext; -pgentry_t *tab = pt_base, page; -unsigned long mfn = pfn_to_mfn(virt_to_pfn(pt_base)); -unsigned long offset; -unsigned long page_size = PAGE_SIZE; + +struct set_readonly_par { +unsigned long etext; #ifdef CONFIG_PARAVIRT -int count = 0; -int rc; +unsigned int count; #endif +}; -printk("setting %p-%p readonly\n", text, etext); +static int set_readonly_func(unsigned long va, unsigned int lvl, bool is_leaf, + pgentry_t *pte, void *par) +{ +struct set_readonly_par *ro = par; -while ( start_address + page_size <= end_address ) -{ -tab = pt_base; -mfn = pfn_to_mfn(virt_to_pfn(pt_base)); +if ( !is_leaf ) +return 0; -#if defined(__x86_64__) -offset = l4_table_offset(start_address); -page = tab[offset]; -mfn = pte_to_mfn(page); -tab = to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT); -#endif -offset = l3_table_offset(start_address); -page = tab[offset]; -mfn = pte_to_mfn(page); -tab = to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT); -offset = l2_table_offset(start_address); -if ( !(tab[offset] & _PAGE_PSE) ) -{ -page = tab[offset]; -mfn = pte_to_mfn(page); -tab = 
to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT); +if ( va + (1UL << ptdata[lvl].shift) > ro->etext ) +return 1; -offset = l1_table_offset(start_address); -} +if ( va == (unsigned long)&shared_info ) +{ +printk("skipped %lx\n", va); +return 0; +} -if ( start_address != (unsigned long)&shared_info ) -{ #ifdef CONFIG_PARAVIRT -mmu_updates[count].ptr = -((pgentry_t)mfn << PAGE_SHIFT) + sizeof(pgentry_t) * offset; -mmu_updates[count].val = tab[offset] & ~_PAGE_RW; -count++; +mmu_updates[ro->count].ptr = virt_to_mach(pte); +mmu_updates[ro->count].val = *pte & ~_PAGE_RW; +ro->count++; + +if ( ro->count == L1_PAGETABLE_ENTRIES ) +{ + ro->count = 0; + if ( HYPERVISOR_mmu_update(mmu_updates, ro->count, NULL, +DOMID_SELF) < 0 ) + BUG(); +} #else -tab[offset] &= ~_PAGE_RW; +*pte &= ~_PAGE_RW; #endif -} -else -printk("skipped %lx\n", start_address); -start_address += page_size; +return 0; +} #ifdef CONFIG_PARAVIRT -if ( count == L1_PAGETABLE_ENTRIES || - start_address + page_size > end_address ) -{ -rc = HYPERVISOR_mmu_update(mmu_updates, count, NULL, DOMID_SELF); -if ( rc < 0 ) -{ -printk("ERROR: set_readonly(): PTE could not be updated\n"); -do_exit(); -} -count = 0; -} +static void tlb_flush(void) +{ +mmuext_op_t op = { .cmd = MMUEXT_TLB_FLUSH_ALL }; +int count; + +HYPERVISOR_mmuext_op(&op, 1, &count, DOMID_SELF); +} #else -if ( start_address == (1UL << L2_PAGETABLE_SHIFT) ) -page_size = 1UL << L2_PAGETABLE_SHIFT; +static void tlb_flush(void) +{ +write_cr3((unsigned long)pt_base); +} #endif -} + +static void set_readonly(void *text, void *etext) +{ +struct set_readonly_par setro = { .etext = (unsigned long)etext }; +unsigned long start_address = PAGE_ALIGN((unsigned long)text); + +printk("setting %p-%p readonly\n", text, etext); +walk_pt(start_address, setro.etext, set_readonly_func, &setro); #ifdef CONFIG_PARAVIRT -{ -mmuext_op_t op = { -.cmd = MMUEXT_TLB_FLUSH_ALL, -}; -int count; -HYPERVISOR_mmuext_op(&op, 1, &count, DOMID_SELF); -} -#else -write_cr3((unsigned long)pt_base); +if ( 
setro.count && + HYPERVISOR_mmu_update(mmu_updates, setro.count, NULL, DOMID_SELF) < 0) +BUG(); #endif + +tlb_flush(); } /* -- 2.43.0
[PATCH v2 1/3] mini-os: mm: introduce generic page table walk function
In x86 mm code there are multiple instances of page table walks for different purposes. Introduce a generic page table walker being able to cover the current use cases. It will be used for other cases in future, too. The page table walker needs some per-level data, so add a table for that data. Merge it with the already existing pt_prot[] array. Rewrite get_pgt() to use the new walker. Signed-off-by: Juergen Gross --- V2: - add idx_from_va_lvl() helper (Samuel Thibault) --- arch/x86/mm.c | 157 +- 1 file changed, 118 insertions(+), 39 deletions(-) diff --git a/arch/x86/mm.c b/arch/x86/mm.c index 7ddf16e4..9849b985 100644 --- a/arch/x86/mm.c +++ b/arch/x86/mm.c @@ -125,20 +125,30 @@ void arch_mm_preinit(void *p) } #endif +static const struct { +unsigned int shift; +unsigned int entries; +pgentry_t prot; +} ptdata[PAGETABLE_LEVELS + 1] = { +{ 0, 0, 0 }, +{ L1_PAGETABLE_SHIFT, L1_PAGETABLE_ENTRIES, L1_PROT }, +{ L2_PAGETABLE_SHIFT, L2_PAGETABLE_ENTRIES, L2_PROT }, +{ L3_PAGETABLE_SHIFT, L3_PAGETABLE_ENTRIES, L3_PROT }, +#if defined(__x86_64__) +{ L4_PAGETABLE_SHIFT, L4_PAGETABLE_ENTRIES, L4_PROT }, +#endif +}; + +static inline unsigned int idx_from_va_lvl(unsigned long va, unsigned int lvl) +{ +return (va >> ptdata[lvl].shift) & (ptdata[lvl].entries - 1); +} + /* * Make pt_pfn a new 'level' page table frame and hook it into the page * table at offset in previous level MFN (pref_l_mfn). pt_pfn is a guest * PFN. 
*/ -static pgentry_t pt_prot[PAGETABLE_LEVELS] = { -L1_PROT, -L2_PROT, -L3_PROT, -#if defined(__x86_64__) -L4_PROT, -#endif -}; - static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn, unsigned long offset, unsigned long level) { @@ -170,7 +180,7 @@ static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn, mmu_updates[0].ptr = (tab[l2_table_offset(pt_page)] & PAGE_MASK) + sizeof(pgentry_t) * l1_table_offset(pt_page); mmu_updates[0].val = (pgentry_t)pfn_to_mfn(*pt_pfn) << PAGE_SHIFT | -(pt_prot[level - 1] & ~_PAGE_RW); +(ptdata[level].prot & ~_PAGE_RW); if ( (rc = HYPERVISOR_mmu_update(mmu_updates, 1, NULL, DOMID_SELF)) < 0 ) { @@ -183,7 +193,7 @@ static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn, mmu_updates[0].ptr = ((pgentry_t)prev_l_mfn << PAGE_SHIFT) + sizeof(pgentry_t) * offset; mmu_updates[0].val = (pgentry_t)pfn_to_mfn(*pt_pfn) << PAGE_SHIFT | -pt_prot[level]; +ptdata[level + 1].prot; if ( (rc = HYPERVISOR_mmu_update(mmu_updates, 1, NULL, DOMID_SELF)) < 0 ) { @@ -192,7 +202,7 @@ static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn, } #else tab = mfn_to_virt(prev_l_mfn); -tab[offset] = (*pt_pfn << PAGE_SHIFT) | pt_prot[level]; +tab[offset] = (*pt_pfn << PAGE_SHIFT) | ptdata[level + 1].prot; #endif *pt_pfn += 1; @@ -202,6 +212,82 @@ static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn, static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1]; #endif +/* + * Walk recursively through all PTEs calling a specified function. The function + * is allowed to change the PTE, the walker will follow the new value. + * The walk will cover the virtual address range [from_va .. to_va]. 
+ * The supplied function will be called with the following parameters: + * va: base virtual address of the area covered by the current PTE + * lvl: page table level of the PTE (1 = lowest level, PAGETABLE_LEVELS = + * PTE in page table addressed by %cr3) + * is_leaf: true if PTE doesn't address another page table (it is either at + * level 1, or invalid, or has its PSE bit set) + * pte: address of the PTE + * par: parameter, passed to walk_pt() by caller + * Return value of func() being non-zero will terminate walk_pt(), walk_pt() + * will return that value in this case, zero else. + */ +static int walk_pt(unsigned long from_va, unsigned long to_va, + int (func)(unsigned long va, unsigned int lvl, + bool is_leaf, pgentry_t *pte, void *par), + void *par) +{ +unsigned int lvl = PAGETABLE_LEVELS; +unsigned int ptindex[PAGETABLE_LEVELS + 1]; +unsigned long va = round_pgdown(from_va); +unsigned long va_lvl; +pgentry_t *tab[PAGETABLE_LEVELS + 1]; +pgentry_t *pte; +bool is_leaf; +int ret; + +/* Start at top level page table. */ +tab[lvl] = pt_base; +ptindex[lvl] = idx_from_va_lvl(va, lvl); + +while ( va < (to_va | (PAGE_SIZE - 1)) ) +{ +pte = tab[lvl] + ptindex[lvl]; +is_leaf = (lvl == L1_FRAME) || (*pte & _PAGE_PSE) || + !(*pte & _PAGE_PRESENT); +va_lvl = va & ~((1UL << ptdata[lvl].shift) - 1)
[PATCH 5/5] xen: tolerate ACPI NVS memory overlapping with Xen allocated memory
In order to minimize required special handling for running as Xen PV dom0, the memory layout is modified to match that of the host. This requires to have only RAM at the locations where Xen allocated memory is living. Unfortunately there seem to be some machines, where ACPI NVS is located at 64 MB, resulting in a conflict with the loaded kernel or the initial page tables built by Xen. As ACPI NVS needs to be accessed by the kernel only for saving and restoring it across suspend operations, it can be relocated in the dom0's memory map by swapping it with unused RAM (this is possible via modification of the dom0 P2M map). While the E820 map can (and should) be modified right away, the P2M map can be updated only after memory allocation is working, as the P2M map might need to be extended. Fixes: 808fdb71936c ("xen: check for kernel memory conflicting with memory layout") Signed-off-by: Juergen Gross Tested-by: Marek Marczykowski-Górecki --- arch/x86/xen/setup.c | 133 ++- 1 file changed, 132 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index d678c0330971..dbb5d13ca61a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -49,6 +49,15 @@ static struct e820_table xen_e820_table __initdata; /* Number of initially usable memory pages. */ static unsigned long ini_nr_pages __initdata; +/* Remapped non-RAM areas */ +#define NR_NONRAM_REMAP 4 +static struct nonram_remap { + unsigned long maddr; + unsigned long size; + unsigned long paddr; +} xen_nonram_remap[NR_NONRAM_REMAP] __initdata; +static unsigned int nr_nonram_remap __initdata; + /* * Buffer used to remap identity mapped pages. We only need the virtual space. * The physical page behind this address is remapped as needed to different @@ -452,6 +461,8 @@ static unsigned long __init xen_foreach_remap_area( * to be remapped memory itself in a linked list anchored at xen_remap_mfn. 
* This scheme allows to remap the different chunks in arbitrary order while * the resulting mapping will be independent from the order. + * In case xen_e820_resolve_conflicts() did relocate some non-RAM E820 + * entries, set the correct P2M information for the affected pages. */ void __init xen_remap_memory(void) { @@ -495,6 +506,29 @@ void __init xen_remap_memory(void) set_pte_mfn(buf, mfn_save, PAGE_KERNEL); pr_info("Remapped %ld page(s)\n", remapped); + + if (nr_nonram_remap == 0) + return; + + remapped = 0; + for (i = 0; i < nr_nonram_remap; i++) { + struct nonram_remap *remap = xen_nonram_remap + i; + + pfn = PFN_DOWN(remap->paddr); + mfn_save = PFN_DOWN(remap->maddr); + for (len = 0; len < remap->size; len += PAGE_SIZE) { + if (!set_phys_to_machine(pfn, mfn_save)) { + WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n", +pfn, mfn_save); + BUG(); + } + pfn++; + mfn_save++; + remapped++; + } + } + + pr_info("Remapped %ld non-RAM page(s)\n", remapped); } static unsigned long __init xen_get_pages_limit(void) @@ -625,14 +659,111 @@ phys_addr_t __init xen_find_free_area(phys_addr_t size) return 0; } +/* + * Swap a non-RAM E820 map entry with RAM above ini_nr_pages. + * Note that the E820 map is modified accordingly, but the P2M map isn't yet. + * The adaption of the P2M must be deferred until page allocation is possible. 
+ */ +static void __init xen_e820_swap_entry_with_ram(struct e820_entry *swap_entry) +{ + struct e820_entry *entry; + unsigned int mapcnt; + phys_addr_t mem_end = PFN_PHYS(ini_nr_pages); + struct nonram_remap *remap; + phys_addr_t swap_addr, swap_size, entry_end; + + if (nr_nonram_remap == NR_NONRAM_REMAP) { + xen_raw_console_write("Number of required E820 entry remapping actions exceed maximum value\n"); + BUG(); + } + + swap_addr = PAGE_ALIGN_DOWN(swap_entry->addr); + swap_size = PAGE_ALIGN(swap_entry->addr - swap_addr + swap_entry->size); + remap = xen_nonram_remap + nr_nonram_remap; + entry = xen_e820_table.entries; + + for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) { + entry_end = entry->addr + entry->size; + if (entry->type == E820_TYPE_RAM && entry->size >= swap_size && + entry_end - swap_size >= mem_end) { + /* Reduce RAM entry by needed space (whole pages). */ + entry->size -= swap_size; + + /* Add new entry at the end of E82
[PATCH 3/5] xen: move checks for e820 conflicts further up
Move the checks for e820 memory map conflicts using the xen_chk_is_e820_usable() helper further up in order to prepare resolving some of the possible conflicts by doing some e820 map modifications, which must happen before evaluating the RAM layout. Signed-off-by: Juergen Gross Tested-by: Marek Marczykowski-Górecki --- arch/x86/xen/setup.c | 44 ++-- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 96765180514b..dba68951ed6b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -764,6 +764,28 @@ char * __init xen_memory_setup(void) /* Make sure the Xen-supplied memory map is well-ordered. */ e820__update_table(&xen_e820_table); + /* +* Check whether the kernel itself conflicts with the target E820 map. +* Failing now is better than running into weird problems later due +* to relocating (and even reusing) pages with kernel text or data. +*/ + xen_chk_is_e820_usable(__pa_symbol(_text), + __pa_symbol(_end) - __pa_symbol(_text), + "kernel"); + + /* +* Check for a conflict of the xen_start_info memory with the target +* E820 map. +*/ + xen_chk_is_e820_usable(__pa(xen_start_info), sizeof(*xen_start_info), + "xen_start_info"); + + /* +* Check for a conflict of the hypervisor supplied page tables with +* the target E820 map. +*/ + xen_pt_check_e820(); + max_pages = xen_get_max_pages(); /* How many extra pages do we need due to remapping? */ @@ -836,28 +858,6 @@ char * __init xen_memory_setup(void) e820__update_table(e820_table); - /* -* Check whether the kernel itself conflicts with the target E820 map. -* Failing now is better than running into weird problems later due -* to relocating (and even reusing) pages with kernel text or data. -*/ - xen_chk_is_e820_usable(__pa_symbol(_text), - __pa_symbol(_end) - __pa_symbol(_text), - "kernel"); - - /* -* Check for a conflict of the xen_start_info memory with the target -* E820 map. 
-*/ - xen_chk_is_e820_usable(__pa(xen_start_info), sizeof(*xen_start_info), - "xen_start_info"); - - /* -* Check for a conflict of the hypervisor supplied page tables with -* the target E820 map. -*/ - xen_pt_check_e820(); - xen_reserve_xen_mfnlist(); /* Check for a conflict of the initrd with the target E820 map. */ -- 2.43.0
[PATCH 4/5] xen: move max_pfn in xen_memory_setup() out of function scope
Instead of having max_pfn as a local variable of xen_memory_setup(), make it a static variable in setup.c instead. This avoids having to pass it to subfunctions, which will be needed in more cases in future. Rename it to ini_nr_pages, as the value denotes the currently usable number of memory pages as passed from the hypervisor at boot time. Signed-off-by: Juergen Gross Tested-by: Marek Marczykowski-Górecki --- arch/x86/xen/setup.c | 53 ++-- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index dba68951ed6b..d678c0330971 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -46,6 +46,9 @@ bool xen_pv_pci_possible; /* E820 map used during setting up memory. */ static struct e820_table xen_e820_table __initdata; +/* Number of initially usable memory pages. */ +static unsigned long ini_nr_pages __initdata; + /* * Buffer used to remap identity mapped pages. We only need the virtual space. * The physical page behind this address is remapped as needed to different @@ -212,7 +215,7 @@ static int __init xen_free_mfn(unsigned long mfn) * as a fallback if the remapping fails. */ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, - unsigned long end_pfn, unsigned long nr_pages) + unsigned long end_pfn) { unsigned long pfn, end; int ret; @@ -220,7 +223,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, WARN_ON(start_pfn > end_pfn); /* Release pages first. */ - end = min(end_pfn, nr_pages); + end = min(end_pfn, ini_nr_pages); for (pfn = start_pfn; pfn < end; pfn++) { unsigned long mfn = pfn_to_mfn(pfn); @@ -341,15 +344,14 @@ static void __init xen_do_set_identity_and_remap_chunk( * to Xen and not remapped. 
*/ static unsigned long __init xen_set_identity_and_remap_chunk( - unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, - unsigned long remap_pfn) + unsigned long start_pfn, unsigned long end_pfn, unsigned long remap_pfn) { unsigned long pfn; unsigned long i = 0; unsigned long n = end_pfn - start_pfn; if (remap_pfn == 0) - remap_pfn = nr_pages; + remap_pfn = ini_nr_pages; while (i < n) { unsigned long cur_pfn = start_pfn + i; @@ -358,19 +360,19 @@ static unsigned long __init xen_set_identity_and_remap_chunk( unsigned long remap_range_size; /* Do not remap pages beyond the current allocation */ - if (cur_pfn >= nr_pages) { + if (cur_pfn >= ini_nr_pages) { /* Identity map remaining pages */ set_phys_range_identity(cur_pfn, cur_pfn + size); break; } - if (cur_pfn + size > nr_pages) - size = nr_pages - cur_pfn; + if (cur_pfn + size > ini_nr_pages) + size = ini_nr_pages - cur_pfn; remap_range_size = xen_find_pfn_range(&remap_pfn); if (!remap_range_size) { pr_warn("Unable to find available pfn range, not remapping identity pages\n"); xen_set_identity_and_release_chunk(cur_pfn, - cur_pfn + left, nr_pages); + cur_pfn + left); break; } /* Adjust size to fit in current e820 RAM region */ @@ -397,18 +399,18 @@ static unsigned long __init xen_set_identity_and_remap_chunk( } static unsigned long __init xen_count_remap_pages( - unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, + unsigned long start_pfn, unsigned long end_pfn, unsigned long remap_pages) { - if (start_pfn >= nr_pages) + if (start_pfn >= ini_nr_pages) return remap_pages; - return remap_pages + min(end_pfn, nr_pages) - start_pfn; + return remap_pages + min(end_pfn, ini_nr_pages) - start_pfn; } -static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, +static unsigned long __init xen_foreach_remap_area( unsigned long (*func)(unsigned long start_pfn, unsigned long end_pfn, - unsigned long nr_pages, unsigned long last_val)) + unsigned long last_val)) { 
phys_addr_t start = 0; unsigned long ret_val = 0; @@ -436,8 +438,7 @@ static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, end_pfn = PFN_UP(entry->addr); if (start_pfn < end_pfn) -
[PATCH 1/5] xen: use correct end address of kernel for conflict checking
When running as a Xen PV dom0 the kernel is loaded by the hypervisor using a different memory map than that of the host. In order to minimize the required changes in the kernel, the kernel adapts its memory map to that of the host. In order to do that it is checking for conflicts of its load address with the host memory map. Unfortunately the tested memory range does not include the .brk area, which might result in crashes or memory corruption when this area does conflict with the memory map of the host. Fix the test by using the _end label instead of __bss_stop. Fixes: 808fdb71936c ("xen: check for kernel memory conflicting with memory layout") Signed-off-by: Juergen Gross Tested-by: Marek Marczykowski-Górecki --- arch/x86/xen/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 806ddb2391d9..4bcc70a71b7d 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -825,7 +825,7 @@ char * __init xen_memory_setup(void) * to relocating (and even reusing) pages with kernel text or data. */ if (xen_is_e820_reserved(__pa_symbol(_text), - __pa_symbol(__bss_stop) - __pa_symbol(_text))) { +__pa_symbol(_end) - __pa_symbol(_text))) { xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n"); BUG(); } -- 2.43.0
[PATCH 2/5] xen: introduce generic helper checking for memory map conflicts
When booting as a Xen PV dom0 the memory layout of the dom0 is modified to match that of the host, as this requires less changes in the kernel for supporting Xen. There are some cases, though, which are problematic, as it is the Xen hypervisor selecting the kernel's load address plus some other data, which might conflict with the host's memory map. These conflicts are detected at boot time and result in a boot error. In order to support handling at least some of these conflicts in future, introduce a generic helper function which will later gain the ability to adapt the memory layout when possible. Add the missing check for the xen_start_info area. Signed-off-by: Juergen Gross Tested-by: Marek Marczykowski-Górecki --- arch/x86/xen/mmu_pv.c | 5 + arch/x86/xen/setup.c | 34 -- arch/x86/xen/xen-ops.h | 3 ++- 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index f1ce39d6d32c..839e6613753d 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2018,10 +2018,7 @@ void __init xen_reserve_special_pages(void) void __init xen_pt_check_e820(void) { - if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) { - xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n"); - BUG(); - } + xen_chk_is_e820_usable(xen_pt_base, xen_pt_size, "page table"); } static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 4bcc70a71b7d..96765180514b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -567,7 +567,7 @@ static void __init xen_ignore_unusable(void) } } -bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) +static bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) { struct e820_entry *entry; unsigned mapcnt; @@ -624,6 +624,23 @@ phys_addr_t __init xen_find_free_area(phys_addr_t size) return 0; } +/* + * Check for an area in physical memory to be usable for 
non-movable purposes. + * An area is considered to usable if the used E820 map lists it to be RAM. + * In case the area is not usable, crash the system with an error message. + */ +void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size, + const char *component) +{ + if (!xen_is_e820_reserved(start, size)) + return; + + xen_raw_console_write("Xen hypervisor allocated "); + xen_raw_console_write(component); + xen_raw_console_write(" memory conflicts with E820 map\n"); + BUG(); +} + /* * Like memcpy, but with physical addresses for dest and src. */ @@ -824,11 +841,16 @@ char * __init xen_memory_setup(void) * Failing now is better than running into weird problems later due * to relocating (and even reusing) pages with kernel text or data. */ - if (xen_is_e820_reserved(__pa_symbol(_text), -__pa_symbol(_end) - __pa_symbol(_text))) { - xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n"); - BUG(); - } + xen_chk_is_e820_usable(__pa_symbol(_text), + __pa_symbol(_end) - __pa_symbol(_text), + "kernel"); + + /* +* Check for a conflict of the xen_start_info memory with the target +* E820 map. +*/ + xen_chk_is_e820_usable(__pa(xen_start_info), sizeof(*xen_start_info), + "xen_start_info"); /* * Check for a conflict of the hypervisor supplied page tables with diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 0cf16fc79e0b..9a27d1d653d3 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -48,7 +48,8 @@ void xen_mm_unpin_all(void); void __init xen_relocate_p2m(void); #endif -bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size); +void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size, + const char *component); unsigned long __ref xen_chk_extra_mem(unsigned long pfn); void __init xen_inv_extra_mem(void); void __init xen_remap_memory(void); -- 2.43.0
[PATCH 0/5] xen: fix dom0 PV boot on some AMD machines
There have been reports of failed boots with Xen due to an overlap of the kernel's memory with ACPI NVS reported in the E820 map of the host. This series fixes this issue by moving the NVS area in dom0 to some higher address. Juergen Gross (5): xen: use correct end address of kernel for conflict checking xen: introduce generic helper checking for memory map conflicts xen: move checks for e820 conflicts further up xen: move max_pfn in xen_memory_setup() out of function scope xen: tolerate ACPI NVS memory overlapping with Xen allocated memory arch/x86/xen/mmu_pv.c | 5 +- arch/x86/xen/setup.c | 242 + arch/x86/xen/xen-ops.h | 3 +- 3 files changed, 201 insertions(+), 49 deletions(-) -- 2.43.0
[PATCH 4/5] xen: move max_pfn in xen_memory_setup() out of function scope
Instead of having max_pfn as a local variable of xen_memory_setup(), make it a static variable in setup.c instead. This avoids having to pass it to subfunctions, which will be needed in more cases in future. Rename it to ini_nr_pages, as the value denotes the currently usable number of memory pages as passed from the hypervisor at boot time. Signed-off-by: Juergen Gross Marek Marczykowski-Górecki --- arch/x86/xen/setup.c | 53 ++-- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index dba68951ed6b..d678c0330971 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -46,6 +46,9 @@ bool xen_pv_pci_possible; /* E820 map used during setting up memory. */ static struct e820_table xen_e820_table __initdata; +/* Number of initially usable memory pages. */ +static unsigned long ini_nr_pages __initdata; + /* * Buffer used to remap identity mapped pages. We only need the virtual space. * The physical page behind this address is remapped as needed to different @@ -212,7 +215,7 @@ static int __init xen_free_mfn(unsigned long mfn) * as a fallback if the remapping fails. */ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, - unsigned long end_pfn, unsigned long nr_pages) + unsigned long end_pfn) { unsigned long pfn, end; int ret; @@ -220,7 +223,7 @@ static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn, WARN_ON(start_pfn > end_pfn); /* Release pages first. */ - end = min(end_pfn, nr_pages); + end = min(end_pfn, ini_nr_pages); for (pfn = start_pfn; pfn < end; pfn++) { unsigned long mfn = pfn_to_mfn(pfn); @@ -341,15 +344,14 @@ static void __init xen_do_set_identity_and_remap_chunk( * to Xen and not remapped. 
*/ static unsigned long __init xen_set_identity_and_remap_chunk( - unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, - unsigned long remap_pfn) + unsigned long start_pfn, unsigned long end_pfn, unsigned long remap_pfn) { unsigned long pfn; unsigned long i = 0; unsigned long n = end_pfn - start_pfn; if (remap_pfn == 0) - remap_pfn = nr_pages; + remap_pfn = ini_nr_pages; while (i < n) { unsigned long cur_pfn = start_pfn + i; @@ -358,19 +360,19 @@ static unsigned long __init xen_set_identity_and_remap_chunk( unsigned long remap_range_size; /* Do not remap pages beyond the current allocation */ - if (cur_pfn >= nr_pages) { + if (cur_pfn >= ini_nr_pages) { /* Identity map remaining pages */ set_phys_range_identity(cur_pfn, cur_pfn + size); break; } - if (cur_pfn + size > nr_pages) - size = nr_pages - cur_pfn; + if (cur_pfn + size > ini_nr_pages) + size = ini_nr_pages - cur_pfn; remap_range_size = xen_find_pfn_range(&remap_pfn); if (!remap_range_size) { pr_warn("Unable to find available pfn range, not remapping identity pages\n"); xen_set_identity_and_release_chunk(cur_pfn, - cur_pfn + left, nr_pages); + cur_pfn + left); break; } /* Adjust size to fit in current e820 RAM region */ @@ -397,18 +399,18 @@ static unsigned long __init xen_set_identity_and_remap_chunk( } static unsigned long __init xen_count_remap_pages( - unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages, + unsigned long start_pfn, unsigned long end_pfn, unsigned long remap_pages) { - if (start_pfn >= nr_pages) + if (start_pfn >= ini_nr_pages) return remap_pages; - return remap_pages + min(end_pfn, nr_pages) - start_pfn; + return remap_pages + min(end_pfn, ini_nr_pages) - start_pfn; } -static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, +static unsigned long __init xen_foreach_remap_area( unsigned long (*func)(unsigned long start_pfn, unsigned long end_pfn, - unsigned long nr_pages, unsigned long last_val)) + unsigned long last_val)) { 
phys_addr_t start = 0; unsigned long ret_val = 0; @@ -436,8 +438,7 @@ static unsigned long __init xen_foreach_remap_area(unsigned long nr_pages, end_pfn = PFN_UP(entry->addr); if (start_pfn < end_pfn) -
[PATCH 5/5] xen: tolerate ACPI NVS memory overlapping with Xen allocated memory
In order to minimize required special handling for running as Xen PV dom0, the memory layout is modified to match that of the host. This requires to have only RAM at the locations where Xen allocated memory is living. Unfortunately there seem to be some machines, where ACPI NVS is located at 64 MB, resulting in a conflict with the loaded kernel or the initial page tables built by Xen. As ACPI NVS needs to be accessed by the kernel only for saving and restoring it across suspend operations, it can be relocated in the dom0's memory map by swapping it with unused RAM (this is possible via modification of the dom0 P2M map). While the E820 map can (and should) be modified right away, the P2M map can be updated only after memory allocation is working, as the P2M map might need to be extended. Fixes: 808fdb71936c ("xen: check for kernel memory conflicting with memory layout") Signed-off-by: Juergen Gross Marek Marczykowski-Górecki --- arch/x86/xen/setup.c | 133 ++- 1 file changed, 132 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index d678c0330971..dbb5d13ca61a 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -49,6 +49,15 @@ static struct e820_table xen_e820_table __initdata; /* Number of initially usable memory pages. */ static unsigned long ini_nr_pages __initdata; +/* Remapped non-RAM areas */ +#define NR_NONRAM_REMAP 4 +static struct nonram_remap { + unsigned long maddr; + unsigned long size; + unsigned long paddr; +} xen_nonram_remap[NR_NONRAM_REMAP] __initdata; +static unsigned int nr_nonram_remap __initdata; + /* * Buffer used to remap identity mapped pages. We only need the virtual space. * The physical page behind this address is remapped as needed to different @@ -452,6 +461,8 @@ static unsigned long __init xen_foreach_remap_area( * to be remapped memory itself in a linked list anchored at xen_remap_mfn. 
* This scheme allows to remap the different chunks in arbitrary order while * the resulting mapping will be independent from the order. + * In case xen_e820_resolve_conflicts() did relocate some non-RAM E820 + * entries, set the correct P2M information for the affected pages. */ void __init xen_remap_memory(void) { @@ -495,6 +506,29 @@ void __init xen_remap_memory(void) set_pte_mfn(buf, mfn_save, PAGE_KERNEL); pr_info("Remapped %ld page(s)\n", remapped); + + if (nr_nonram_remap == 0) + return; + + remapped = 0; + for (i = 0; i < nr_nonram_remap; i++) { + struct nonram_remap *remap = xen_nonram_remap + i; + + pfn = PFN_DOWN(remap->paddr); + mfn_save = PFN_DOWN(remap->maddr); + for (len = 0; len < remap->size; len += PAGE_SIZE) { + if (!set_phys_to_machine(pfn, mfn_save)) { + WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n", +pfn, mfn_save); + BUG(); + } + pfn++; + mfn_save++; + remapped++; + } + } + + pr_info("Remapped %ld non-RAM page(s)\n", remapped); } static unsigned long __init xen_get_pages_limit(void) @@ -625,14 +659,111 @@ phys_addr_t __init xen_find_free_area(phys_addr_t size) return 0; } +/* + * Swap a non-RAM E820 map entry with RAM above ini_nr_pages. + * Note that the E820 map is modified accordingly, but the P2M map isn't yet. + * The adaption of the P2M must be deferred until page allocation is possible. 
+ */ +static void __init xen_e820_swap_entry_with_ram(struct e820_entry *swap_entry) +{ + struct e820_entry *entry; + unsigned int mapcnt; + phys_addr_t mem_end = PFN_PHYS(ini_nr_pages); + struct nonram_remap *remap; + phys_addr_t swap_addr, swap_size, entry_end; + + if (nr_nonram_remap == NR_NONRAM_REMAP) { + xen_raw_console_write("Number of required E820 entry remapping actions exceed maximum value\n"); + BUG(); + } + + swap_addr = PAGE_ALIGN_DOWN(swap_entry->addr); + swap_size = PAGE_ALIGN(swap_entry->addr - swap_addr + swap_entry->size); + remap = xen_nonram_remap + nr_nonram_remap; + entry = xen_e820_table.entries; + + for (mapcnt = 0; mapcnt < xen_e820_table.nr_entries; mapcnt++) { + entry_end = entry->addr + entry->size; + if (entry->type == E820_TYPE_RAM && entry->size >= swap_size && + entry_end - swap_size >= mem_end) { + /* Reduce RAM entry by needed space (whole pages). */ + entry->size -= swap_size; + + /* Add new entry at the end of E82
[PATCH 2/5] xen: introduce generic helper checking for memory map conflicts
When booting as a Xen PV dom0 the memory layout of the dom0 is modified to match that of the host, as this requires less changes in the kernel for supporting Xen. There are some cases, though, which are problematic, as it is the Xen hypervisor selecting the kernel's load address plus some other data, which might conflict with the host's memory map. These conflicts are detected at boot time and result in a boot error. In order to support handling at least some of these conflicts in future, introduce a generic helper function which will later gain the ability to adapt the memory layout when possible. Add the missing check for the xen_start_info area. Signed-off-by: Juergen Gross Marek Marczykowski-Górecki --- arch/x86/xen/mmu_pv.c | 5 + arch/x86/xen/setup.c | 34 -- arch/x86/xen/xen-ops.h | 3 ++- 3 files changed, 31 insertions(+), 11 deletions(-) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index f1ce39d6d32c..839e6613753d 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2018,10 +2018,7 @@ void __init xen_reserve_special_pages(void) void __init xen_pt_check_e820(void) { - if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) { - xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n"); - BUG(); - } + xen_chk_is_e820_usable(xen_pt_base, xen_pt_size, "page table"); } static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 4bcc70a71b7d..96765180514b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -567,7 +567,7 @@ static void __init xen_ignore_unusable(void) } } -bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) +static bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size) { struct e820_entry *entry; unsigned mapcnt; @@ -624,6 +624,23 @@ phys_addr_t __init xen_find_free_area(phys_addr_t size) return 0; } +/* + * Check for an area in physical memory to be usable for non-movable 
purposes. + * An area is considered to usable if the used E820 map lists it to be RAM. + * In case the area is not usable, crash the system with an error message. + */ +void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size, + const char *component) +{ + if (!xen_is_e820_reserved(start, size)) + return; + + xen_raw_console_write("Xen hypervisor allocated "); + xen_raw_console_write(component); + xen_raw_console_write(" memory conflicts with E820 map\n"); + BUG(); +} + /* * Like memcpy, but with physical addresses for dest and src. */ @@ -824,11 +841,16 @@ char * __init xen_memory_setup(void) * Failing now is better than running into weird problems later due * to relocating (and even reusing) pages with kernel text or data. */ - if (xen_is_e820_reserved(__pa_symbol(_text), -__pa_symbol(_end) - __pa_symbol(_text))) { - xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n"); - BUG(); - } + xen_chk_is_e820_usable(__pa_symbol(_text), + __pa_symbol(_end) - __pa_symbol(_text), + "kernel"); + + /* +* Check for a conflict of the xen_start_info memory with the target +* E820 map. +*/ + xen_chk_is_e820_usable(__pa(xen_start_info), sizeof(*xen_start_info), + "xen_start_info"); /* * Check for a conflict of the hypervisor supplied page tables with diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 0cf16fc79e0b..9a27d1d653d3 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -48,7 +48,8 @@ void xen_mm_unpin_all(void); void __init xen_relocate_p2m(void); #endif -bool __init xen_is_e820_reserved(phys_addr_t start, phys_addr_t size); +void __init xen_chk_is_e820_usable(phys_addr_t start, phys_addr_t size, + const char *component); unsigned long __ref xen_chk_extra_mem(unsigned long pfn); void __init xen_inv_extra_mem(void); void __init xen_remap_memory(void); -- 2.43.0
[PATCH 0/5] xen: fix dom0 PV boot on some AMD machines
There have been reports of failed boots with Xen due to an overlap of the kernel's memory with ACPI NVS reported in the E820 map of the host. This series fixes this issue by moving the NVS area in dom0 to some higher address. Juergen Gross (5): xen: use correct end address of kernel for conflict checking xen: introduce generic helper checking for memory map conflicts xen: move checks for e820 conflicts further up xen: move max_pfn in xen_memory_setup() out of function scope xen: tolerate ACPI NVS memory overlapping with Xen allocated memory arch/x86/xen/mmu_pv.c | 5 +- arch/x86/xen/setup.c | 242 + arch/x86/xen/xen-ops.h | 3 +- 3 files changed, 201 insertions(+), 49 deletions(-) -- 2.43.0
[PATCH 3/5] xen: move checks for e820 conflicts further up
Move the checks for e820 memory map conflicts using the xen_chk_is_e820_usable() helper further up in order to prepare resolving some of the possible conflicts by doing some e820 map modifications, which must happen before evaluating the RAM layout. Signed-off-by: Juergen Gross Marek Marczykowski-Górecki --- arch/x86/xen/setup.c | 44 ++-- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 96765180514b..dba68951ed6b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -764,6 +764,28 @@ char * __init xen_memory_setup(void) /* Make sure the Xen-supplied memory map is well-ordered. */ e820__update_table(&xen_e820_table); + /* +* Check whether the kernel itself conflicts with the target E820 map. +* Failing now is better than running into weird problems later due +* to relocating (and even reusing) pages with kernel text or data. +*/ + xen_chk_is_e820_usable(__pa_symbol(_text), + __pa_symbol(_end) - __pa_symbol(_text), + "kernel"); + + /* +* Check for a conflict of the xen_start_info memory with the target +* E820 map. +*/ + xen_chk_is_e820_usable(__pa(xen_start_info), sizeof(*xen_start_info), + "xen_start_info"); + + /* +* Check for a conflict of the hypervisor supplied page tables with +* the target E820 map. +*/ + xen_pt_check_e820(); + max_pages = xen_get_max_pages(); /* How many extra pages do we need due to remapping? */ @@ -836,28 +858,6 @@ char * __init xen_memory_setup(void) e820__update_table(e820_table); - /* -* Check whether the kernel itself conflicts with the target E820 map. -* Failing now is better than running into weird problems later due -* to relocating (and even reusing) pages with kernel text or data. -*/ - xen_chk_is_e820_usable(__pa_symbol(_text), - __pa_symbol(_end) - __pa_symbol(_text), - "kernel"); - - /* -* Check for a conflict of the xen_start_info memory with the target -* E820 map. 
-*/ - xen_chk_is_e820_usable(__pa(xen_start_info), sizeof(*xen_start_info), - "xen_start_info"); - - /* -* Check for a conflict of the hypervisor supplied page tables with -* the target E820 map. -*/ - xen_pt_check_e820(); - xen_reserve_xen_mfnlist(); /* Check for a conflict of the initrd with the target E820 map. */ -- 2.43.0
[PATCH 1/5] xen: use correct end address of kernel for conflict checking
When running as a Xen PV dom0 the kernel is loaded by the hypervisor using a different memory map than that of the host. In order to minimize the required changes in the kernel, the kernel adapts its memory map to that of the host. In order to do that it is checking for conflicts of its load address with the host memory map. Unfortunately the tested memory range does not include the .brk area, which might result in crashes or memory corruption when this area does conflict with the memory map of the host. Fix the test by using the _end label instead of __bss_stop. Fixes: 808fdb71936c ("xen: check for kernel memory conflicting with memory layout") Signed-off-by: Juergen Gross Tested-by: Marek Marczykowski-Górecki --- arch/x86/xen/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 806ddb2391d9..4bcc70a71b7d 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -825,7 +825,7 @@ char * __init xen_memory_setup(void) * to relocating (and even reusing) pages with kernel text or data. */ if (xen_is_e820_reserved(__pa_symbol(_text), - __pa_symbol(__bss_stop) - __pa_symbol(_text))) { +__pa_symbol(_end) - __pa_symbol(_text))) { xen_raw_console_write("Xen hypervisor allocated kernel memory conflicts with E820 map\n"); BUG(); } -- 2.43.0
Re: [PATCH] Fixed incorrect output in xl's "help" command.
On 05.08.24 16:11, John E. Krokes wrote: In "xl help", the output includes this line: vsnd-list List virtual display devices for a domain This should obviously say "sound devices" instead of "display devices". Signed-off-by: John E. Krokes Reviewed-by: Juergen Gross Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
Re: [KEXEC] Lacking documentation and kexec failure with Xen 4.19-rc4 and 4.20-dev on linux host
On 30.07.24 06:02, A Kundu wrote: I am experiencing issues using kexec to load Xen 4.19-rc4 and 4.20-dev on a debian host. The current documentation at https://xenbits.xenproject.org/docs/4.19-testing/misc/kexec_and_kdump.txt appears to be missing crucial details on properly using kexec with the --vmm option for loading Xen. kexec complains the --vmm option is not present. System information: $ uname -a Linux host 6.9.10-amd64 #1 SMP PREEMPT_DYNAMIC Debian 6.9.10-1 (2024-07-19) x86_64 GNU/Linux $ kexec --version # compiled from source tarball with ./configure --with-xen kexec-tools 2.0.29 Steps to reproduce: 1. Set variables: XEN_HYPERVISOR="/boot/xen.gz" XEN_CMD="dom0_mem=6G dom0_max_vcpus=6 dom0_vcpus_pin cpufreq=xen" 2. Attempt to load Xen 4.19-rc4: # kexec -l "$XEN_HYPERVISOR" --command-line="$XEN_CMD" Could not find a free area of memory of 0x3b6001 bytes... elf_exec_build_load_relocatable: ELF exec load failed 3. Attempt to load Xen 4.20-dev: # kexec -l "$XEN_HYPERVISOR" --command-line="$XEN_CMD" Could not find a free area of memory of 0x3f8001 bytes... elf_exec_build_load_relocatable: ELF exec load failed 4. Follow the steps in the Xen manual at misc/kexec_and_kdump.txt: # Set variables XEN_IMAGE="/boot/xen-4.19-rc.gz" DOM0_IMAGE="/boot/vmlinuz-6.9.10-amd64" DOM0_INITRD="/boot/initrd.img-6.9.10-amd64" DOM0_MEMORY="6G" DOM0_CPUS="0-5" # Prepare Xen and dom0 command line arguments XEN_ARGS="no-real-mode dom0_mem=${DOM0_MEMORY}" DOM0_ARGS="max_cpus=6 dom0_max_vcpus=6 dom0_vcpus_pin" # Load Xen and dom0 kernel kexec -l --append="${XEN_ARGS} -- ${DOM0_ARGS}" --initrd=${DOM0_INITRD} \ --vmm=${XEN_IMAGE} ${DOM0_IMAGE} Result: kexec: unrecognized option '--vmm=/boot/xen-4.19-rc.gz' I have tried compiling kexec-tools 2.0.29 from source using ./configure --with-xen, as well as using debian's apt version, but both result in the same "unrecognized option '--vmm'" error. 
Please provide guidance on the correct steps to get kexec working for loading Xen 4.19-rc4 and 4.20-dev. Additionally, I kindly request that the documentation be updated with these details to assist other users who may encounter this issue. The mentioned documentation hasn't seen any updates since 2008 in the part you are interested in. I believe the most sane action would be to remove the "Kexec" chapter completely. With Xen I'm not aware of any use case nowadays other than that explained in the "Kdump" chapter. Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
Re: kexec failure with Xen 4.19-rc4 and 4.20-dev on linux host
On 05.08.24 08:01, Jan Beulich wrote: On 04.08.2024 15:17, A Kundu wrote: On 8/2/24 13:25, Jan Beulich wrote: > On 02.08.2024 09:28, A Kundu wrote: >> On 8/2/24 09:06, Baoquan He wrote: >>> On 07/31/24 at 06:34pm, A Kundu wrote: I am experiencing issues using kexec to load Xen 4.17(debian's apt version), Xen 4.19-rc4 (compiled from source) and 4.20-dev (compiled from source) on a debian host. >>> You should CC this to XEN dev list so that XEN dev knows this and may >>> provide help. Not everyone is interested in and knows XEN. >>> System information: $ uname -a Linux host 6.9.10-amd64 #1 SMP PREEMPT_DYNAMIC Debian 6.9.10-1 (2024-07-19) x86_64 GNU/Linux $ kexec --version # compiled from source tarball with ./configure --with-xen kexec-tools 2.0.29 Steps to reproduce: 1. Set variables: XEN_HYPERVISOR="/boot/xen.gz" XEN_CMD="dom0_mem=6G dom0_max_vcpus=6 dom0_vcpus_pin cpufreq=xen" 2. Attempt to load Xen 4.19-rc4: # kexec -l "$XEN_HYPERVISOR" --command-line="$XEN_CMD" Could not find a free area of memory of 0x3b6001 bytes... elf_exec_build_load_relocatable: ELF exec load failed 3. Attempt to load Xen 4.20-dev: # kexec -l "$XEN_HYPERVISOR" --command-line="$XEN_CMD" Could not find a free area of memory of 0x3f8001 bytes... elf_exec_build_load_relocatable: ELF exec load failed 4. Attempt to load Xen 4.17 (from debian rrepositories): # kexec -l /boot/xen-4.17-amd64.gz --command-line="$XEN_CMD" Could not find a free area of memory of 0x3b4001 bytes... elf_exec_build_load_relocatable: ELF exec load failed > > And with all of them saying effectively the same, did you verify you > actually have a sufficiently large area reserved? The obvious > place for you to look at is Xen's boot log (obtained via serial > console or "xl dmesg" immediately after booting the system). If you > find everything as expected there, ... > If you need any further information to investigate this problem, please let me know. > > ... please provide that boot log. 
I have also followed up on your suggestion to check the Xen boot log using "xl dmesg", but unfortunately, I received the following error: xencall: error: Could not obtain handle on privileged command interface: No such file or directory libxl: error: libxl.c:102:libxl_ctx_alloc: cannot open libxc handle: No such file or directory cannot init xl context This indicates that Xen did not boot successfully, so there are no logs available. The fact that you have Dom0 up makes clear that Xen booted okay(ish). The fact that you get "No such file or directory" from xencall suggests you either didn't load the xen-privcmd driver (normally arrangements are made by distros for this to happen automatically), or you didn't even build it. The messages seen don't indicate that Xen booted okay(ish). I get the same messages when having booted the Linux kernel on bare metal without Xen. > And with all of them saying effectively the same, did you verify you > actually have a sufficiently large area reserved? The obvious > place for you to look at is Xen's boot log (obtained via serial > console or "xl dmesg" immediately after booting the system). If you > find everything as expected there, ... > In an attempt to resolve the memory allocation issue, I have tried the following: Added a crashkernel=@ parameter to the host kernel command line to reserve a dedicated memory region for kexec, and attempted to load Xen into that area. That was a remote guess of mine. This command line option is meaningless when running under Xen. The reservation needs to be done in Xen. Just one thing to add here: what do you want to accomplish by kexec-ing into Xen? You need to specify a dom0 kernel and probably the dom0's initrd, too. Or do you have configured your dom0 to work without an initrd? Even in this case you'd need to pass on the dom0 to Xen via the kexec command. Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
[PATCH] tools/xenstored: switch stubdom live update to use file for state
With the introduction of 9pfs for Xenstore-stubdom it is now possible to use a file for saving the state when doing live update. This allows to move some environment specific actions back to the common source file lu.c. Signed-off-by: Juergen Gross --- tools/xenstored/lu.c| 75 - tools/xenstored/lu.h| 19 +- tools/xenstored/lu_daemon.c | 72 --- tools/xenstored/lu_minios.c | 49 4 files changed, 75 insertions(+), 140 deletions(-) diff --git a/tools/xenstored/lu.c b/tools/xenstored/lu.c index 2f41d10c95..bec2a84e10 100644 --- a/tools/xenstored/lu.c +++ b/tools/xenstored/lu.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include "talloc.h" #include "core.h" @@ -19,11 +21,18 @@ #include "watch.h" #ifndef NO_LIVE_UPDATE + +struct lu_dump_state { + void *buf; + unsigned int size; + int fd; + char *filename; +}; + struct live_update *lu_status; static int lu_destroy(void *data) { - lu_destroy_arch(data); lu_status = NULL; return 0; @@ -70,6 +79,48 @@ bool lu_is_pending(void) return lu_status != NULL; } +static void lu_get_dump_state(struct lu_dump_state *state) +{ + struct stat statbuf; + + state->size = 0; + + state->filename = talloc_asprintf(NULL, "%s/state_dump", + xenstore_rundir()); + if (!state->filename) + barf("Allocation failure"); + + state->fd = open(state->filename, O_RDONLY); + if (state->fd < 0) + return; + if (fstat(state->fd, &statbuf) != 0) + goto out_close; + state->size = statbuf.st_size; + + state->buf = mmap(NULL, state->size, PROT_READ, MAP_PRIVATE, + state->fd, 0); + if (state->buf == MAP_FAILED) { + state->size = 0; + goto out_close; + } + + return; + + out_close: + close(state->fd); +} + +static void lu_close_dump_state(struct lu_dump_state *state) +{ + assert(state->filename != NULL); + + munmap(state->buf, state->size); + close(state->fd); + + unlink(state->filename); + talloc_free(state->filename); +} + void lu_read_state(void) { struct lu_dump_state state = {}; @@ -197,6 +248,28 @@ static const char *lu_reject_reason(const 
void *ctx) return ret ? (const char *)ret : "Overlapping transactions"; } +static FILE *lu_dump_open(const void *ctx) +{ + char *filename; + int fd; + + filename = talloc_asprintf(ctx, "%s/state_dump", + xenstore_rundir()); + if (!filename) + return NULL; + + fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR); + if (fd < 0) + return NULL; + + return fdopen(fd, "w"); +} + +static void lu_dump_close(FILE *fp) +{ + fclose(fp); +} + static const char *lu_dump_state(const void *ctx, struct connection *conn) { FILE *fp; diff --git a/tools/xenstored/lu.h b/tools/xenstored/lu.h index ac3c572ca8..dacc9b6e42 100644 --- a/tools/xenstored/lu.h +++ b/tools/xenstored/lu.h @@ -18,12 +18,9 @@ struct live_update { unsigned int kernel_size; unsigned int kernel_off; - void *dump_state; - unsigned long dump_size; -#else - char *filename; #endif + char *filename; char *cmdline; /* Start parameters. */ @@ -32,15 +29,6 @@ struct live_update { time_t started_at; }; -struct lu_dump_state { - void *buf; - unsigned int size; -#ifndef __MINIOS__ - int fd; - char *filename; -#endif -}; - extern struct live_update *lu_status; struct connection *lu_get_connection(void); @@ -54,15 +42,10 @@ int do_control_lu(const void *ctx, struct connection *conn, const char **vec, int num); /* Live update private interfaces. 
*/ -void lu_get_dump_state(struct lu_dump_state *state); -void lu_close_dump_state(struct lu_dump_state *state); -FILE *lu_dump_open(const void *ctx); -void lu_dump_close(FILE *fp); char *lu_exec(const void *ctx, int argc, char **argv); const char *lu_arch(const void *ctx, struct connection *conn, const char **vec, int num); const char *lu_begin(struct connection *conn); -void lu_destroy_arch(void *data); #else static inline struct connection *lu_get_connection(void) { diff --git a/tools/xenstored/lu_daemon.c b/tools/xenstored/lu_daemon.c index 635ab0..6df6c80a2a 100644 --- a/tools/xenstored/lu_daemon.c +++ b/tools/xenstored/lu_daemon.c @@ -5,82 +5,14 @@ * Copyright (C) 2022 Juergen Gross, SUSE LLC */ -#include -#include #include #include -#include -#include #i
[PATCH 1/3] mini-os: mm: introduce generic page table walk function
In x86 mm code there are multiple instances of page table walks for different purposes. Introduce a generic page table walker being able to cover the current use cases. It will be used for other cases in future, too. The page table walker needs some per-level data, so add a table for that data. Merge it with the already existing pt_prot[] array. Rewrite get_pgt() to use the new walker. Signed-off-by: Juergen Gross --- arch/x86/mm.c | 152 +- 1 file changed, 113 insertions(+), 39 deletions(-) diff --git a/arch/x86/mm.c b/arch/x86/mm.c index 7ddf16e4..cc4d41e9 100644 --- a/arch/x86/mm.c +++ b/arch/x86/mm.c @@ -125,20 +125,25 @@ void arch_mm_preinit(void *p) } #endif +static const struct { +unsigned int shift; +unsigned int entries; +pgentry_t prot; +} ptdata[PAGETABLE_LEVELS + 1] = { +{ 0, 0, 0 }, +{ L1_PAGETABLE_SHIFT, L1_PAGETABLE_ENTRIES, L1_PROT }, +{ L2_PAGETABLE_SHIFT, L2_PAGETABLE_ENTRIES, L2_PROT }, +{ L3_PAGETABLE_SHIFT, L3_PAGETABLE_ENTRIES, L3_PROT }, +#if defined(__x86_64__) +{ L4_PAGETABLE_SHIFT, L4_PAGETABLE_ENTRIES, L4_PROT }, +#endif +}; + /* * Make pt_pfn a new 'level' page table frame and hook it into the page * table at offset in previous level MFN (pref_l_mfn). pt_pfn is a guest * PFN. 
*/ -static pgentry_t pt_prot[PAGETABLE_LEVELS] = { -L1_PROT, -L2_PROT, -L3_PROT, -#if defined(__x86_64__) -L4_PROT, -#endif -}; - static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn, unsigned long offset, unsigned long level) { @@ -170,7 +175,7 @@ static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn, mmu_updates[0].ptr = (tab[l2_table_offset(pt_page)] & PAGE_MASK) + sizeof(pgentry_t) * l1_table_offset(pt_page); mmu_updates[0].val = (pgentry_t)pfn_to_mfn(*pt_pfn) << PAGE_SHIFT | -(pt_prot[level - 1] & ~_PAGE_RW); +(ptdata[level].prot & ~_PAGE_RW); if ( (rc = HYPERVISOR_mmu_update(mmu_updates, 1, NULL, DOMID_SELF)) < 0 ) { @@ -183,7 +188,7 @@ static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn, mmu_updates[0].ptr = ((pgentry_t)prev_l_mfn << PAGE_SHIFT) + sizeof(pgentry_t) * offset; mmu_updates[0].val = (pgentry_t)pfn_to_mfn(*pt_pfn) << PAGE_SHIFT | -pt_prot[level]; +ptdata[level + 1].prot; if ( (rc = HYPERVISOR_mmu_update(mmu_updates, 1, NULL, DOMID_SELF)) < 0 ) { @@ -192,7 +197,7 @@ static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn, } #else tab = mfn_to_virt(prev_l_mfn); -tab[offset] = (*pt_pfn << PAGE_SHIFT) | pt_prot[level]; +tab[offset] = (*pt_pfn << PAGE_SHIFT) | ptdata[level + 1].prot; #endif *pt_pfn += 1; @@ -202,6 +207,82 @@ static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn, static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1]; #endif +/* + * Walk recursively through all PTEs calling a specified function. The function + * is allowed to change the PTE, the walker will follow the new value. + * The walk will cover the virtual address range [from_va .. to_va]. 
+ * The supplied function will be called with the following parameters: + * va: base virtual address of the area covered by the current PTE + * lvl: page table level of the PTE (1 = lowest level, PAGETABLE_LEVELS = + * PTE in page table addressed by %cr3) + * is_leaf: true if PTE doesn't address another page table + * pte: address of the PTE + * par: parameter, passed to walk_pt() by caller + * Return value of func() being non-zero will terminate walk_pt(), walk_pt() + * will return that value in this case, zero else. + */ +static int walk_pt(unsigned long from_va, unsigned long to_va, + int (func)(unsigned long va, unsigned int lvl, + bool is_leaf, pgentry_t *pte, void *par), + void *par) +{ +unsigned int lvl = PAGETABLE_LEVELS; +unsigned int ptindex[PAGETABLE_LEVELS + 1]; +unsigned long va = round_pgdown(from_va); +unsigned long va_lvl; +pgentry_t *tab[PAGETABLE_LEVELS + 1]; +pgentry_t *pte; +bool is_leaf; +int ret; + +/* Start at top level page table. */ +tab[lvl] = pt_base; +ptindex[lvl] = (va >> ptdata[lvl].shift) & (ptdata[lvl].entries - 1); + +while ( va < (to_va | (PAGE_SIZE - 1)) ) +{ +pte = tab[lvl] + ptindex[lvl]; +is_leaf = (lvl == L1_FRAME) || (*pte & _PAGE_PSE) || + !(*pte & _PAGE_PRESENT); +va_lvl = va & ~((1UL << ptdata[lvl].shift) - 1); +ret = func(va_lvl, lvl, is_leaf, pte, par); +if ( ret ) +return ret; + +/* PTE might have been modified by func(), reevaluate leaf state. */ +is_leaf = (lvl == L1_FRAME) || (*pte & _PAGE_PSE) || +
[PATCH 3/3] mini-os: mm: convert set_readonly() to use walk_pt()
Instead of having another copy of a page table walk in set_readonly(), just use walk_pt(). As it will be needed later anyway, split out the TLB flushing into a dedicated function. Signed-off-by: Juergen Gross --- arch/x86/mm.c | 119 +- 1 file changed, 50 insertions(+), 69 deletions(-) diff --git a/arch/x86/mm.c b/arch/x86/mm.c index accde291..90992068 100644 --- a/arch/x86/mm.c +++ b/arch/x86/mm.c @@ -397,92 +397,73 @@ static void build_pagetable(unsigned long *start_pfn, unsigned long *max_pfn) * Mark portion of the address space read only. */ extern struct shared_info shared_info; -static void set_readonly(void *text, void *etext) -{ -unsigned long start_address = -((unsigned long) text + PAGE_SIZE - 1) & PAGE_MASK; -unsigned long end_address = (unsigned long) etext; -pgentry_t *tab = pt_base, page; -unsigned long mfn = pfn_to_mfn(virt_to_pfn(pt_base)); -unsigned long offset; -unsigned long page_size = PAGE_SIZE; + +struct set_readonly_par { +unsigned long etext; #ifdef CONFIG_PARAVIRT -int count = 0; -int rc; +unsigned int count; #endif +}; -printk("setting %p-%p readonly\n", text, etext); +static int set_readonly_func(unsigned long va, unsigned int lvl, bool is_leaf, + pgentry_t *pte, void *par) +{ +struct set_readonly_par *ro = par; -while ( start_address + page_size <= end_address ) -{ -tab = pt_base; -mfn = pfn_to_mfn(virt_to_pfn(pt_base)); +if ( !is_leaf ) +return 0; -#if defined(__x86_64__) -offset = l4_table_offset(start_address); -page = tab[offset]; -mfn = pte_to_mfn(page); -tab = to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT); -#endif -offset = l3_table_offset(start_address); -page = tab[offset]; -mfn = pte_to_mfn(page); -tab = to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT); -offset = l2_table_offset(start_address); -if ( !(tab[offset] & _PAGE_PSE) ) -{ -page = tab[offset]; -mfn = pte_to_mfn(page); -tab = to_virt(mfn_to_pfn(mfn) << PAGE_SHIFT); +if ( va + (1UL << ptdata[lvl].shift) > ro->etext ) +return 1; -offset = l1_table_offset(start_address); -} +if ( va == 
(unsigned long)&shared_info ) +{ +printk("skipped %lx\n", va); +return 0; +} -if ( start_address != (unsigned long)&shared_info ) -{ #ifdef CONFIG_PARAVIRT -mmu_updates[count].ptr = -((pgentry_t)mfn << PAGE_SHIFT) + sizeof(pgentry_t) * offset; -mmu_updates[count].val = tab[offset] & ~_PAGE_RW; -count++; +mmu_updates[ro->count].ptr = virt_to_mach(pte); +mmu_updates[ro->count].val = *pte & ~_PAGE_RW; +ro->count++; + +if ( (ro->count == L1_PAGETABLE_ENTRIES || + va + 2 * PAGE_SIZE > ro->etext) && + HYPERVISOR_mmu_update(mmu_updates, ro->count, NULL, DOMID_SELF) < 0 ) +{ +printk("ERROR: set_readonly(): PTE could not be updated\n"); +do_exit(); +} #else -tab[offset] &= ~_PAGE_RW; +*pte &= ~_PAGE_RW; #endif -} -else -printk("skipped %lx\n", start_address); -start_address += page_size; +return 0; +} #ifdef CONFIG_PARAVIRT -if ( count == L1_PAGETABLE_ENTRIES || - start_address + page_size > end_address ) -{ -rc = HYPERVISOR_mmu_update(mmu_updates, count, NULL, DOMID_SELF); -if ( rc < 0 ) -{ -printk("ERROR: set_readonly(): PTE could not be updated\n"); -do_exit(); -} -count = 0; -} -#else -if ( start_address == (1UL << L2_PAGETABLE_SHIFT) ) -page_size = 1UL << L2_PAGETABLE_SHIFT; -#endif -} +static void tlb_flush(void) +{ +mmuext_op_t op = { .cmd = MMUEXT_TLB_FLUSH_ALL }; +int count; -#ifdef CONFIG_PARAVIRT -{ -mmuext_op_t op = { -.cmd = MMUEXT_TLB_FLUSH_ALL, -}; -int count; -HYPERVISOR_mmuext_op(&op, 1, &count, DOMID_SELF); -} +HYPERVISOR_mmuext_op(&op, 1, &count, DOMID_SELF); +} #else +static void tlb_flush(void) +{ write_cr3((unsigned long)pt_base); +} #endif + +static void set_readonly(void *text, void *etext) +{ +struct set_readonly_par setro = { .etext = (unsigned long)etext }; +unsigned long start_address = PAGE_ALIGN((unsigned long)text); + +printk("setting %p-%p readonly\n", text, etext); +walk_pt(start_address, setro.etext, set_readonly_func, &setro); +tlb_flush(); } /* -- 2.43.0
[PATCH 2/3] mini-os: mm: switch need_pgt() to use walk_pt()
Instead of open coding a page table walk, use walk_pt() in need_pgt(). Signed-off-by: Juergen Gross --- arch/x86/mm.c | 66 +++ 1 file changed, 24 insertions(+), 42 deletions(-) diff --git a/arch/x86/mm.c b/arch/x86/mm.c index cc4d41e9..accde291 100644 --- a/arch/x86/mm.c +++ b/arch/x86/mm.c @@ -518,57 +518,39 @@ static pgentry_t *get_pgt(unsigned long va) * return a valid PTE for a given virtual address. If PTE does not exist, * allocate page-table pages. */ -pgentry_t *need_pgt(unsigned long va) +static int need_pgt_func(unsigned long va, unsigned int lvl, bool is_leaf, + pgentry_t *pte, void *par) { +pgentry_t **result = par; unsigned long pt_mfn; -pgentry_t *tab; unsigned long pt_pfn; -unsigned offset; +unsigned int idx; -tab = pt_base; -pt_mfn = virt_to_mfn(pt_base); +if ( !is_leaf ) +return 0; -#if defined(__x86_64__) -offset = l4_table_offset(va); -if ( !(tab[offset] & _PAGE_PRESENT) ) -{ -pt_pfn = virt_to_pfn(alloc_page()); -if ( !pt_pfn ) -return NULL; -new_pt_frame(&pt_pfn, pt_mfn, offset, L3_FRAME); -} -ASSERT(tab[offset] & _PAGE_PRESENT); -pt_mfn = pte_to_mfn(tab[offset]); -tab = mfn_to_virt(pt_mfn); -#endif -offset = l3_table_offset(va); -if ( !(tab[offset] & _PAGE_PRESENT) ) -{ -pt_pfn = virt_to_pfn(alloc_page()); -if ( !pt_pfn ) -return NULL; -new_pt_frame(&pt_pfn, pt_mfn, offset, L2_FRAME); -} -ASSERT(tab[offset] & _PAGE_PRESENT); -pt_mfn = pte_to_mfn(tab[offset]); -tab = mfn_to_virt(pt_mfn); -offset = l2_table_offset(va); -if ( !(tab[offset] & _PAGE_PRESENT) ) +if ( lvl == L1_FRAME || (*pte & _PAGE_PRESENT) ) { -pt_pfn = virt_to_pfn(alloc_page()); -if ( !pt_pfn ) -return NULL; -new_pt_frame(&pt_pfn, pt_mfn, offset, L1_FRAME); +*result = pte; +return 1; } -ASSERT(tab[offset] & _PAGE_PRESENT); -if ( tab[offset] & _PAGE_PSE ) -return &tab[offset]; -pt_mfn = pte_to_mfn(tab[offset]); -tab = mfn_to_virt(pt_mfn); +pt_mfn = virt_to_mfn(pte); +pt_pfn = virt_to_pfn(alloc_page()); +if ( !pt_pfn ) +return -1; +idx = (va >> ptdata[lvl].shift) & 
(ptdata[lvl].entries - 1); +new_pt_frame(&pt_pfn, pt_mfn, idx, lvl - 1); -offset = l1_table_offset(va); -return &tab[offset]; +return 0; +} + +pgentry_t *need_pgt(unsigned long va) +{ +pgentry_t *tab = NULL; + +walk_pt(va, va, need_pgt_func, &tab); +return tab; } EXPORT_SYMBOL(need_pgt); -- 2.43.0
[PATCH 0/3] mini-os: mm: use a generic page table walker
Instead of open coding a page table walk multiple times, use a generic page table walker. This new page table walker will be used later for kexec support, too. Juergen Gross (3): mini-os: mm: introduce generic page table walk function mini-os: mm: switch need_pgt() to use walk_pt() mini-os: mm: convert set_readonly() to use walk_pt() arch/x86/mm.c | 337 -- 1 file changed, 187 insertions(+), 150 deletions(-) -- 2.43.0
[GIT PULL] xen: branch for v6.11-rc1 take 2
Linus, Please git pull the following tag: git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git for-linus-6.11-rc1a-tag xen: branch for v6.11-rc1 take 2 It contains 2 fixes for issues introduced in this merge window: - one for enhanced debugging in the Xen multicall handling - a series with 2 patches fixing a boot failure when running as dom0 in PVH mode Thanks. Juergen arch/x86/include/asm/xen/hypervisor.h | 5 -- arch/x86/platform/pvh/enlighten.c | 3 - arch/x86/xen/enlighten_pvh.c | 107 ++ arch/x86/xen/multicalls.c | 19 +++--- arch/x86/xen/smp_pv.c | 1 + arch/x86/xen/xen-ops.h| 3 + 6 files changed, 74 insertions(+), 64 deletions(-) Juergen Gross (1): xen: fix multicall debug data referencing Roger Pau Monne (2): x86/xen: move xen_reserve_extra_memory() x86/xen: fix memblock_reserve() usage on PVH
[PATCH v2] mini-os: put sanity_check() under CONFIG_TEST
Hide the sanity_check() function, as it is used nowhere. By putting it under #ifdef CONFIG_TEST it will stay around, but it won't be included in normal production builds. Call sanity_check() from the periodic thread of the test app, causing a sanity check every second. Since any application linked with Mini-OS can't call sanity_check() (there is no EXPORT_SYMBOL for it), there is zero chance of breaking any use case. Signed-off-by: Juergen Gross --- V2: - don't remove it, but just hide it (Samuel Thibault) --- include/lib.h | 2 ++ mm.c | 2 ++ test.c| 1 + 3 files changed, 5 insertions(+) diff --git a/include/lib.h b/include/lib.h index abd4e9ab..de67bab0 100644 --- a/include/lib.h +++ b/include/lib.h @@ -152,8 +152,10 @@ do { \ #define BUG_ON(x) ASSERT(!(x)) +#ifdef CONFIG_TEST /* Consistency check as much as possible. */ void sanity_check(void); +#endif /* Get own domid. */ domid_t get_domid(void); diff --git a/mm.c b/mm.c index 4aa0c6ca..a5d3f5e5 100644 --- a/mm.c +++ b/mm.c @@ -395,6 +395,7 @@ void fini_mm(void) { } +#ifdef CONFIG_TEST void sanity_check(void) { int x; @@ -410,3 +411,4 @@ void sanity_check(void) } } } +#endif diff --git a/test.c b/test.c index 465c54e8..4dd6e260 100644 --- a/test.c +++ b/test.c @@ -185,6 +185,7 @@ static void periodic_thread(void *p) { gettimeofday(&tv, NULL); printk("T(s=%ld us=%ld)\n", tv.tv_sec, tv.tv_usec); +sanity_check(); msleep(1000); } } -- 2.43.0
Re: [PATCH 4/4] mini-os: remove sanity_check()
On 25.07.24 08:29, Samuel Thibault wrote: Jürgen Groß, le jeu. 25 juil. 2024 08:25:18 +0200, a ecrit: On 25.07.24 00:44, Samuel Thibault wrote: Hello, Jürgen Groß, le mar. 23 juil. 2024 08:36:13 +0200, a ecrit: On 22.07.24 23:35, Samuel Thibault wrote: Juergen Gross, le lun. 22 juil. 2024 17:01:41 +0200, a ecrit: Remove the sanity_check() function, as it is used nowhere. Since any application linked with Mini-OS can't call sanity_check() either (there is no EXPORT_SYMBOL for it), there is zero chance of breaking any use case. Don't we still want to keep it around, at least as formal documentation of the expected status of the list? Hmm, is it really worth the extra code? I have already seen such kind of piece of code getting very convenient when tracking odd bugs. What about putting it under CONFIG_TEST then? Ok ! I went a little bit further by calling sanity_check() from periodic_thread() in test.c once a second. This will make real use of the function. Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
Re: [xen-tip:linux-next 12/12] WARNING: modpost: vmlinux: section mismatch in reference: mc_debug_data+0x0 (section: .data) -> mc_debug_data_early (section: .init.data)
On 24.07.24 03:08, kernel test robot wrote: tree: https://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git linux-next head: 368990a7fe30737c990f628a60d26d9854a9e690 commit: 368990a7fe30737c990f628a60d26d9854a9e690 [12/12] xen: fix multicall debug data referencing config: x86_64-randconfig-012-20240724 (https://download.01.org/0day-ci/archive/20240724/202407240907.u0njhgtu-...@intel.com/config) compiler: gcc-13 (Ubuntu 13.2.0-4ubuntu3) 13.2.0 reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240724/202407240907.u0njhgtu-...@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot | Closes: https://lore.kernel.org/oe-kbuild-all/202407240907.u0njhgtu-...@intel.com/ All warnings (new ones prefixed by >>, old ones prefixed by <<): WARNING: modpost: vmlinux: section mismatch in reference: mc_debug_data+0x0 (section: .data) -> mc_debug_data_early (section: .init.data) With current infrastructure this is not easily fixable, as there is no way to tag a percpu variable as __refdata. Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
Re: [PATCH v1.1 5/6] tools/libxs: Use writev()/sendmsg() instead of write()
On 23.07.24 14:25, Andrew Cooper wrote: On 23/07/2024 10:37 am, Jürgen Groß wrote: On 22.07.24 18:25, Andrew Cooper wrote: With the input data now conveniently arranged, use writev()/sendmsg() instead of decomposing it into write() calls. This causes all requests to be submitted with a single system call, rather than at least two. While in principle short writes can occur, the chances of it happening are slim given that most xenbus comms are only a handful of bytes. Nevertheless, provide {writev,sendmsg}_exact() wrappers which take care of resubmitting on EINTR or short write. Signed-off-by: Andrew Cooper --- CC: Anthony PERARD CC: Juergen Gross CC: Frediano Ziglio v1.1: * Fix iov overread, spotted by Frediano. Factor the common updating logic out into update_iov(). --- tools/libs/store/xs.c | 94 +-- 1 file changed, 91 insertions(+), 3 deletions(-) diff --git a/tools/libs/store/xs.c b/tools/libs/store/xs.c index e8202314..f80ac7558cbe 100644 --- a/tools/libs/store/xs.c +++ b/tools/libs/store/xs.c @@ -563,6 +563,95 @@ static void *read_reply( return body; } +/* + * Update an iov/nr pair after an incomplete writev()/sendmsg(). + * + * Awkwardly, nr has different widths and signs between writev() and + * sendmsg(), so we take it and return it by value, rather than by pointer. + */ +static size_t update_iov(struct iovec **p_iov, size_t nr, size_t res) +{ + struct iovec *iov = *p_iov; + + /* Skip fully complete elements, including empty elements. */ + while (nr && res >= iov->iov_len) { + res -= iov->iov_len; + nr--; + iov++; + } + + /* Partial element, adjust base/len. */ + if (res) { + iov->iov_len -= res; + iov->iov_base += res; + } + + *p_iov = iov; + + return nr; +} + +/* + * Wrapper around sendmsg() to resubmit on EINTR or short write. Returns + * @true if all data was transmitted, or @false with errno for an error. + * Note: May alter @iov in place on resubmit. 
+ */ +static bool sendmsg_exact(int fd, struct iovec *iov, unsigned int nr) +{ + struct msghdr hdr = { + .msg_iov = iov, + .msg_iovlen = nr, + }; + + /* Sanity check first element isn't empty */ + assert(iov->iov_len == sizeof(struct xsd_sockmsg)); Can you please move this assert() into write_request(), avoiding to have 2 copies of it? It was more relevant before update_iov() was split out. But, there's exactly the same assertion in the write_request()'s caller, so I'd prefer to simply drop it if that's ok? The writev()/sendmsg() won't malfunction if the first element is 0, and update_iov() will now cope too, so I don't think it's necessary. Fine with me. Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
[PATCH] mini-os: mm: don't have two large arrays of the same kind
In the CONFIG_PARAVIRT case there are two identical static arrays of more than 8kB size with function scope. As the two functions never nest, a single array of file scope can replace the two existing instances. Signed-off-by: Juergen Gross --- arch/x86/mm.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm.c b/arch/x86/mm.c index be1cf0cf..7ddf16e4 100644 --- a/arch/x86/mm.c +++ b/arch/x86/mm.c @@ -198,6 +198,10 @@ static void new_pt_frame(unsigned long *pt_pfn, unsigned long prev_l_mfn, *pt_pfn += 1; } +#ifdef CONFIG_PARAVIRT +static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1]; +#endif + /* * Build the initial pagetable. */ @@ -209,7 +213,6 @@ static void build_pagetable(unsigned long *start_pfn, unsigned long *max_pfn) unsigned long pt_mfn = pfn_to_mfn(virt_to_pfn(pt_base)); unsigned long offset; #ifdef CONFIG_PARAVIRT -static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1]; int count = 0; int rc; #endif @@ -323,7 +326,6 @@ static void set_readonly(void *text, void *etext) unsigned long offset; unsigned long page_size = PAGE_SIZE; #ifdef CONFIG_PARAVIRT -static mmu_update_t mmu_updates[L1_PAGETABLE_ENTRIES + 1]; int count = 0; int rc; #endif -- 2.43.0
[PATCH 4/4] mini-os: remove sanity_check()
Remove the sanity_check() function, as it is used nowhere. Since any application linked with Mini-OS can't call sanity_check() either (there is no EXPORT_SYMBOL for it), there is zero chance of breaking any use case. Signed-off-by: Juergen Gross --- include/lib.h | 3 --- mm.c | 16 2 files changed, 19 deletions(-) diff --git a/include/lib.h b/include/lib.h index abd4e9ab..acd4acc6 100644 --- a/include/lib.h +++ b/include/lib.h @@ -152,9 +152,6 @@ do { \ #define BUG_ON(x) ASSERT(!(x)) -/* Consistency check as much as possible. */ -void sanity_check(void); - /* Get own domid. */ domid_t get_domid(void); diff --git a/mm.c b/mm.c index 96686a5c..1fa7e7bf 100644 --- a/mm.c +++ b/mm.c @@ -394,19 +394,3 @@ void init_mm(void) void fini_mm(void) { } - -void sanity_check(void) -{ -int x; -chunk_head_t *head; - -for ( x = 0; x < FREELIST_SIZE; x++ ) -{ -for ( head = free_list[x].next; !FREELIST_EMPTY(head); - head = head->next ) -{ -ASSERT(!allocated_in_map(virt_to_pfn(head))); -ASSERT(head->next->prev == head); -} -} -} -- 2.43.0
[PATCH 2/4] mini-os: mm: remove not needed struct chunk_tail_st
The struct chunk_tail_st isn't really used other than writing to it. Remove it in order to simplify the code. Signed-off-by: Juergen Gross --- mm.c | 20 1 file changed, 20 deletions(-) diff --git a/mm.c b/mm.c index 1dcd954c..2cc49e94 100644 --- a/mm.c +++ b/mm.c @@ -123,7 +123,6 @@ static void map_free(unsigned long first_page, unsigned long nr_pages) /* BINARY BUDDY ALLOCATOR */ typedef struct chunk_head_st chunk_head_t; -typedef struct chunk_tail_st chunk_tail_t; struct chunk_head_st { chunk_head_t *next; @@ -131,10 +130,6 @@ struct chunk_head_st { intlevel; }; -struct chunk_tail_st { -int level; -}; - /* Linked lists of free chunks of different powers-of-two in size. */ #define FREELIST_SIZE ((sizeof(void *) << 3) - PAGE_SHIFT) static chunk_head_t *free_head[FREELIST_SIZE]; @@ -151,7 +146,6 @@ static void init_page_allocator(unsigned long min, unsigned long max) unsigned long range; unsigned long r_min, r_max; chunk_head_t *ch; -chunk_tail_t *ct; printk("MM: Initialise page allocator for %lx(%lx)-%lx(%lx)\n", (u_long)to_virt(min), min, (u_long)to_virt(max), max); @@ -215,14 +209,12 @@ static void init_page_allocator(unsigned long min, unsigned long max) ch = (chunk_head_t *)r_min; r_min += 1UL << i; range -= 1UL << i; -ct = (chunk_tail_t *)r_min - 1; i -= PAGE_SHIFT; ch->level = i; ch->next= free_head[i]; ch->pprev = &free_head[i]; ch->next->pprev = &ch->next; free_head[i]= ch; -ct->level = i; } } @@ -234,7 +226,6 @@ unsigned long alloc_pages(int order) { int i; chunk_head_t *alloc_ch, *spare_ch; -chunk_tail_t*spare_ct; if ( !chk_free_pages(1UL << order) ) goto no_memory; @@ -261,14 +252,11 @@ unsigned long alloc_pages(int order) i--; spare_ch = (chunk_head_t *)((char *)alloc_ch + (1UL << (i + PAGE_SHIFT))); -spare_ct = (chunk_tail_t *)((char *)spare_ch + -(1UL << (i + PAGE_SHIFT))) - 1; /* Create new header for spare chunk. */ spare_ch->level = i; spare_ch->next = free_head[i]; spare_ch->pprev = &free_head[i]; -spare_ct->level = i; /* Link in the spare chunk. 
*/ spare_ch->next->pprev = &spare_ch->next; @@ -289,7 +277,6 @@ EXPORT_SYMBOL(alloc_pages); void free_pages(void *pointer, int order) { chunk_head_t *freed_ch, *to_merge_ch; -chunk_tail_t *freed_ct; unsigned long mask; /* First free the chunk */ @@ -297,8 +284,6 @@ void free_pages(void *pointer, int order) /* Create free chunk */ freed_ch = (chunk_head_t *)pointer; -freed_ct = (chunk_tail_t *)((char *)pointer + -(1UL << (order + PAGE_SHIFT))) - 1; /* Now, possibly we can conseal chunks together */ while ( order < FREELIST_SIZE ) @@ -320,9 +305,6 @@ void free_pages(void *pointer, int order) if ( allocated_in_map(virt_to_pfn(to_merge_ch)) || to_merge_ch->level != order ) break; - -/* Merge with successor */ -freed_ct = (chunk_tail_t *)((char *)to_merge_ch + mask) - 1; } /* We are committed to merging, unlink the chunk */ @@ -336,8 +318,6 @@ void free_pages(void *pointer, int order) freed_ch->level = order; freed_ch->next = free_head[order]; freed_ch->pprev = &free_head[order]; -freed_ct->level = order; - freed_ch->next->pprev = &freed_ch->next; free_head[order] = freed_ch; -- 2.43.0
[PATCH 3/4] mini-os: mm: reduce buddy allocator list administration data
Today the administration data for the buddy allocator's lists consists of 2 arrays: one pointer array and one list element array for easier handling of the lists' tails. Those arrays can be combined into one by dropping the pointer array and using a different list end indicator. Add enqueue and dequeue helpers for better readability. Change the level member type to unsigned int. Signed-off-by: Juergen Gross --- mm.c | 73 1 file changed, 34 insertions(+), 39 deletions(-) diff --git a/mm.c b/mm.c index 2cc49e94..96686a5c 100644 --- a/mm.c +++ b/mm.c @@ -125,16 +125,30 @@ static void map_free(unsigned long first_page, unsigned long nr_pages) typedef struct chunk_head_st chunk_head_t; struct chunk_head_st { -chunk_head_t *next; -chunk_head_t **pprev; -intlevel; +chunk_head_t *next; +chunk_head_t *prev; +unsigned int level; }; /* Linked lists of free chunks of different powers-of-two in size. */ #define FREELIST_SIZE ((sizeof(void *) << 3) - PAGE_SHIFT) -static chunk_head_t *free_head[FREELIST_SIZE]; -static chunk_head_t free_tail[FREELIST_SIZE]; -#define FREELIST_EMPTY(_l) ((_l)->next == NULL) +static chunk_head_t free_list[FREELIST_SIZE]; +#define FREELIST_EMPTY(_l) ((_l)->level == FREELIST_SIZE) + +static void enqueue_elem(chunk_head_t *elem, unsigned int level) +{ +elem->level = level; +elem->next = free_list[level].next; +elem->prev = &free_list[level]; +elem->next->prev = elem; +free_list[level].next = elem; +} + +static void dequeue_elem(chunk_head_t *elem) +{ +elem->prev->next = elem->next; +elem->next->prev = elem->prev; +} /* * Initialise allocator, placing addresses [@min,@max] in free pool. 
@@ -151,9 +165,9 @@ static void init_page_allocator(unsigned long min, unsigned long max) (u_long)to_virt(min), min, (u_long)to_virt(max), max); for ( i = 0; i < FREELIST_SIZE; i++ ) { -free_head[i] = &free_tail[i]; -free_tail[i].pprev = &free_head[i]; -free_tail[i].next = NULL; +free_list[i].next = &free_list[i]; +free_list[i].prev = &free_list[i]; +free_list[i].level = FREELIST_SIZE; } min = round_pgup(min); @@ -209,12 +223,7 @@ static void init_page_allocator(unsigned long min, unsigned long max) ch = (chunk_head_t *)r_min; r_min += 1UL << i; range -= 1UL << i; -i -= PAGE_SHIFT; -ch->level = i; -ch->next= free_head[i]; -ch->pprev = &free_head[i]; -ch->next->pprev = &ch->next; -free_head[i]= ch; +enqueue_elem(ch, i - PAGE_SHIFT); } } @@ -233,17 +242,16 @@ unsigned long alloc_pages(int order) /* Find smallest order which can satisfy the request. */ for ( i = order; i < FREELIST_SIZE; i++ ) { -if ( !FREELIST_EMPTY(free_head[i]) ) +if ( !FREELIST_EMPTY(free_list[i].next) ) break; } -if ( i == FREELIST_SIZE ) +if ( i >= FREELIST_SIZE ) goto no_memory; /* Unlink a chunk. */ -alloc_ch = free_head[i]; -free_head[i] = alloc_ch->next; -alloc_ch->next->pprev = alloc_ch->pprev; +alloc_ch = free_list[i].next; +dequeue_elem(alloc_ch); /* We may have to break the chunk a number of times. */ while ( i != order ) @@ -254,13 +262,7 @@ unsigned long alloc_pages(int order) (1UL << (i + PAGE_SHIFT))); /* Create new header for spare chunk. */ -spare_ch->level = i; -spare_ch->next = free_head[i]; -spare_ch->pprev = &free_head[i]; - -/* Link in the spare chunk. 
*/ -spare_ch->next->pprev = &spare_ch->next; -free_head[i] = spare_ch; +enqueue_elem(spare_ch, i); } map_alloc(PHYS_PFN(to_phys(alloc_ch)), 1UL << order); @@ -308,18 +310,13 @@ void free_pages(void *pointer, int order) } /* We are committed to merging, unlink the chunk */ -*(to_merge_ch->pprev) = to_merge_ch->next; -to_merge_ch->next->pprev = to_merge_ch->pprev; +dequeue_elem(to_merge_ch); order++; } /* Link the new chunk */ -freed_ch->level = order; -freed_ch->next = free_head[order]; -freed_ch->pprev = &free_head[order]; -freed_ch->next->pprev = &freed_ch->next; -free_head[order] = freed_ch; +enqueue_elem(freed_ch, order); } EXPORT_SYMBOL(free_pages); @@ -405,13 +402,11 @@ void sanity_check(void) for ( x = 0; x < FREELIST_SIZE; x++ ) { -for ( head = free_head[x]; !FREELIST_EMPTY(head); head = head->next ) +for (
[PATCH 0/4] mini-os: cleanup of mm.c
Some cleanups in mm.c: style, removal of unused stuff, optimizations. Juergen Gross (4): mini-os: make mm.c coding style compliant mini-os: mm: remove not needed struct chunk_tail_st mini-os: mm: reduce buddy allocator list administration data mini-os: remove sanity_check() include/lib.h | 3 - mm.c | 264 +- 2 files changed, 112 insertions(+), 155 deletions(-) -- 2.43.0
[PATCH 1/4] mini-os: make mm.c coding style compliant
Apply the coding style to mm.c. No functional change. Signed-off-by: Juergen Gross --- mm.c | 191 ++- 1 file changed, 96 insertions(+), 95 deletions(-) diff --git a/mm.c b/mm.c index eb0e34de..1dcd954c 100644 --- a/mm.c +++ b/mm.c @@ -1,4 +1,4 @@ -/* +/* * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge * (C) 2005 - Grzegorz Milos - Intel Research Cambridge @@ -7,9 +7,9 @@ *File: mm.c * Author: Rolf Neugebauer (neuge...@dcs.gla.ac.uk) * Changes: Grzegorz Milos - * + * *Date: Aug 2003, chages Aug 2005 - * + * * Environment: Xen Minimal OS * Description: memory management related functions * contains buddy page allocator from Xen. @@ -21,16 +21,16 @@ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: - * + * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. 
*/ @@ -45,7 +45,7 @@ #include #include -/* +/* * ALLOCATION BITMAP * One bit per page of memory. Bit set => page is allocated. */ @@ -55,7 +55,7 @@ unsigned long mm_alloc_bitmap_size; #define PAGES_PER_MAPWORD (sizeof(unsigned long) * 8) -#define allocated_in_map(_pn) \ +#define allocated_in_map(_pn) \ (mm_alloc_bitmap[(_pn) / PAGES_PER_MAPWORD] & \ (1UL << ((_pn) & (PAGES_PER_MAPWORD - 1 @@ -63,8 +63,8 @@ unsigned long nr_free_pages; /* * Hint regarding bitwise arithmetic in map_{alloc,free}: - * -(1<= n. - * (1<= n. + * (1 << n) - 1 sets all bits < n. * Variable names in map_{alloc,free}: * *_idx == Index into `mm_alloc_bitmap' array. * *_off == Bit offset within an element of the `mm_alloc_bitmap' array. @@ -75,53 +75,52 @@ static void map_alloc(unsigned long first_page, unsigned long nr_pages) unsigned long start_off, end_off, curr_idx, end_idx; curr_idx = first_page / PAGES_PER_MAPWORD; -start_off = first_page & (PAGES_PER_MAPWORD-1); +start_off = first_page & (PAGES_PER_MAPWORD - 1); end_idx = (first_page + nr_pages) / PAGES_PER_MAPWORD; -end_off = (first_page + nr_pages) & (PAGES_PER_MAPWORD-1); +end_off = (first_page + nr_pages) & (PAGES_PER_MAPWORD - 1); if ( curr_idx == end_idx ) { -mm_alloc_bitmap[curr_idx] |= ((1UL<next == NULL) @@ -163,14 +162,14 @@ static void init_page_allocator(unsigned long min, unsigned long max) free_tail[i].next = NULL; } -min = round_pgup (min); +min = round_pgup(min); max = round_pgdown(max); /* Allocate space for the allocation bitmap. */ -mm_alloc_bitmap_size = (max + 1) >> (PAGE_SHIFT + 3); -mm_alloc_bitmap_size = round_pgup(mm_alloc_bitmap_size); +mm_alloc_bitmap_size = (max + 1) >> (PAGE_SHIFT + 3); +mm_alloc_bitmap_size = round_pgup(mm_alloc_bitmap_size); mm_alloc_bitmap = (unsigned long *)to_virt(min); -min += mm_alloc_bitmap_size; +min += mm_alloc_bitmap_size; /* All allocated by default. 
*/ memset(mm_alloc_bitmap, ~0, mm_alloc_bitmap_size); @@ -208,7 +207,10 @@ static void init_page_allocator(unsigned long min, unsigned long max) * must not be bigger than remaining range. */ for ( i = PAGE_SHIFT; (1UL << (i + 1)) <= range; i++ ) -if ( r_min & (1UL << i) ) break; +{ +if ( r_min & (1UL << i) ) +break; +} ch = (chunk_head_t *)r_min; r_min += 1UL << i; @@ -227,7 +229,6 @@ static void init_p
[PATCH 2/3] mini-os: remove some not needed stuff from arch/x86/time.c
Remove unused or not needed stuff from arch/x86/time.c. Signed-off-by: Juergen Gross --- arch/x86/time.c | 16 1 file changed, 16 deletions(-) diff --git a/arch/x86/time.c b/arch/x86/time.c index a473a9e1..7fd7abef 100644 --- a/arch/x86/time.c +++ b/arch/x86/time.c @@ -48,7 +48,6 @@ struct shadow_time_info { uint64_t tsc_timestamp; /* TSC at last update of time vals. */ uint64_t system_timestamp; /* Time, in nanosecs, since boot.*/ uint32_t tsc_to_nsec_mul; -uint32_t tsc_to_usec_mul; int tsc_shift; uint32_t version; }; @@ -57,19 +56,6 @@ static uint32_t shadow_ts_version; static struct shadow_time_info shadow; -#ifndef rmb -#define rmb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory") -#endif - -#define HANDLE_USEC_OVERFLOW(_tv) \ -do {\ -while ( (_tv)->tv_usec >= 100 ) \ -{ \ -(_tv)->tv_usec -= 100; \ -(_tv)->tv_sec++;\ -} \ -} while ( 0 ) - static inline int time_values_up_to_date(void) { struct vcpu_time_info *src = &HYPERVISOR_shared_info->vcpu_info[0].time; @@ -143,8 +129,6 @@ static void get_time_values_from_xen(void) shadow.tsc_shift = src->tsc_shift; rmb(); } while ( (src->version & 1) | (shadow.version ^ src->version) ); - -shadow.tsc_to_usec_mul = shadow.tsc_to_nsec_mul / 1000; } /* -- 2.43.0
[PATCH 3/3] mini-os: simplify monotonic_clock()
monotonic_clock() in arch/x86/time.c is more complex than needed: it has basically two nested loops making sure the time data obtained from Xen are valid. Simplify that by merging some of the used sub-functions into the main function and using only a single loop. Further simplify the code by using struct vcpu_time_info for the local instance instead of defining a similar structure in the code. Signed-off-by: Juergen Gross --- arch/x86/time.c | 58 ++--- 1 file changed, 16 insertions(+), 42 deletions(-) diff --git a/arch/x86/time.c b/arch/x86/time.c index 7fd7abef..52916e15 100644 --- a/arch/x86/time.c +++ b/arch/x86/time.c @@ -44,24 +44,10 @@ */ /* These are peridically updated in shared_info, and then copied here. */ -struct shadow_time_info { -uint64_t tsc_timestamp; /* TSC at last update of time vals. */ -uint64_t system_timestamp; /* Time, in nanosecs, since boot.*/ -uint32_t tsc_to_nsec_mul; -int tsc_shift; -uint32_t version; -}; static struct timespec shadow_ts; static uint32_t shadow_ts_version; -static struct shadow_time_info shadow; - -static inline int time_values_up_to_date(void) -{ -struct vcpu_time_info *src = &HYPERVISOR_shared_info->vcpu_info[0].time; - -return shadow.version == src->version; -} +static struct vcpu_time_info shadow; static inline int wc_values_up_to_date(void) { @@ -113,22 +99,7 @@ static unsigned long get_nsec_offset(void) rdtscll(now); delta = now - shadow.tsc_timestamp; -return scale_delta(delta, shadow.tsc_to_nsec_mul, shadow.tsc_shift); -} - -static void get_time_values_from_xen(void) -{ -struct vcpu_time_info *src = &HYPERVISOR_shared_info->vcpu_info[0].time; - -do { -shadow.version = src->version; -rmb(); -shadow.tsc_timestamp = src->tsc_timestamp; -shadow.system_timestamp = src->system_time; -shadow.tsc_to_nsec_mul = src->tsc_to_system_mul; -shadow.tsc_shift = src->tsc_shift; -rmb(); -} while ( (src->version & 1) | (shadow.version ^ src->version) ); +return scale_delta(delta, shadow.tsc_to_system_mul, shadow.tsc_shift); } /* 
@@ -138,19 +109,22 @@ static void get_time_values_from_xen(void) */ uint64_t monotonic_clock(void) { -uint64_t time; -uint32_t local_time_version; +struct vcpu_time_info *src = &HYPERVISOR_shared_info->vcpu_info[0].time; -do { -local_time_version = shadow.version; -rmb(); -time = shadow.system_timestamp + get_nsec_offset(); -if ( !time_values_up_to_date() ) -get_time_values_from_xen(); -rmb(); -} while ( local_time_version != shadow.version ); +if ( shadow.version != src->version ) +{ +do { +shadow.version = src->version; +rmb(); +shadow.tsc_timestamp = src->tsc_timestamp; +shadow.system_time = src->system_time; +shadow.tsc_to_system_mul = src->tsc_to_system_mul; +shadow.tsc_shift = src->tsc_shift; +rmb(); +} while ( (src->version & 1) || (shadow.version != src->version) ); +} -return time; +return shadow.system_time + get_nsec_offset(); } static void update_wallclock(void) -- 2.43.0
[PATCH 0/3] Mini-OS: tidy up arch/x86/time.c
Make arch/x86/time.c coding style compliant, remove some unneeded cruft and simplify the time keeping logic. Juergen Gross (3): mini-os: apply coding style to arch/x86/time.c mini-os: remove some not needed stuff from arch/x86/time.c mini-os: simplify monotonic_clock() arch/x86/time.c | 188 ++-- 1 file changed, 69 insertions(+), 119 deletions(-) -- 2.43.0
[PATCH 1/3] mini-os: apply coding style to arch/x86/time.c
Apply the preferred coding style to arch/x86/time.c. Signed-off-by: Juergen Gross --- arch/x86/time.c | 194 +++- 1 file changed, 93 insertions(+), 101 deletions(-) diff --git a/arch/x86/time.c b/arch/x86/time.c index 332c0260..a473a9e1 100644 --- a/arch/x86/time.c +++ b/arch/x86/time.c @@ -1,7 +1,7 @@ /* -*- Mode:C; c-basic-offset:4; tab-width:4 -*- * (C) 2003 - Rolf Neugebauer - Intel Research Cambridge - * (C) 2002-2003 - Keir Fraser - University of Cambridge + * (C) 2002-2003 - Keir Fraser - University of Cambridge * (C) 2005 - Grzegorz Milos - Intel Research Cambridge * (C) 2006 - Robert Kaiser - FH Wiesbaden @@ -18,20 +18,19 @@ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: - * + * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. 
*/ - #include #include #include @@ -46,44 +45,43 @@ /* These are peridically updated in shared_info, and then copied here. */ struct shadow_time_info { - uint64_t tsc_timestamp; /* TSC at last update of time vals. */ - uint64_t system_timestamp; /* Time, in nanosecs, since boot.*/ - uint32_t tsc_to_nsec_mul; - uint32_t tsc_to_usec_mul; - int tsc_shift; - uint32_t version; +uint64_t tsc_timestamp; /* TSC at last update of time vals. */ +uint64_t system_timestamp; /* Time, in nanosecs, since boot.*/ +uint32_t tsc_to_nsec_mul; +uint32_t tsc_to_usec_mul; +int tsc_shift; +uint32_t version; }; static struct timespec shadow_ts; static uint32_t shadow_ts_version; static struct shadow_time_info shadow; - #ifndef rmb -#define rmb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory") +#define rmb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory") #endif -#define HANDLE_USEC_OVERFLOW(_tv) \ -do { \ +#define HANDLE_USEC_OVERFLOW(_tv) \ +do {\ while ( (_tv)->tv_usec >= 100 ) \ -{ \ +{ \ (_tv)->tv_usec -= 100; \ (_tv)->tv_sec++;\ -} \ +} \ } while ( 0 ) static inline int time_values_up_to_date(void) { - struct vcpu_time_info *src = &HYPERVISOR_shared_info->vcpu_info[0].time; +struct vcpu_time_info *src = &HYPERVISOR_shared_info->vcpu_info[0].time; - return (shadow.version == src->version); +return shadow.version == src->version; } static inline int wc_values_up_to_date(void) { - shared_info_t *s= HYPERVISOR_shared_info; +shared_info_t *s = HYPERVISOR_shared_info; - return (shadow_ts_version == s->wc_version); +return shadow_ts_version == s->wc_version; } /* @@ -92,109 +90,104 @@ static inline int wc_values_up_to_date(void) */ static inline uint64_t scale_delta(uint64_t delta, uint32_t mul_frac, int shift) { - uint64_t product; +uint64_t product; #ifdef __i386__ - uint32_t tmp1, tmp2; +uint32_t tmp1, tmp2; #endif - if ( shift < 0 ) - delta >>= -shift; - else - delta <<= shift; +if ( shift < 0 ) +delta >>= -shift; +else +delta <<= shift; #ifdef __i386__ - 
__asm__ ( - "mul %5 ; " - "mov %4
[PATCH v2] Mini-OS: add some macros for asm statements
Instead of having #ifdefs sprinkled around in x86 code, add some macros defining constants for asm statements to address differences between 32- and 64-bit mode. Modify existing code to use those macros. While at it convert the assembler parts of run_idle_thread() to a more sane variant. Signed-off-by: Juergen Gross --- V2: - addressed comments by Andrew Cooper --- arch/x86/sched.c | 34 ++ include/x86/os.h | 18 ++ 2 files changed, 20 insertions(+), 32 deletions(-) diff --git a/arch/x86/sched.c b/arch/x86/sched.c index dabe6fd6..42805f9f 100644 --- a/arch/x86/sched.c +++ b/arch/x86/sched.c @@ -60,16 +60,10 @@ void dump_stack(struct thread *thread) unsigned long *bottom = (unsigned long *)(thread->stack + STACK_SIZE); unsigned long *pointer = (unsigned long *)thread->sp; int count; -if(thread == current) -{ -#ifdef __i386__ -asm("movl %%esp,%0" -: "=r"(pointer)); -#else -asm("movq %%rsp,%0" -: "=r"(pointer)); -#endif -} + +if ( thread == current ) +asm("mov %%"ASM_SP",%0" : "=r"(pointer)); + printk("The stack for \"%s\"\n", thread->name); for(count = 0; count < 25 && pointer < bottom; count ++) { @@ -119,20 +113,12 @@ struct thread* arch_create_thread(char *name, void (*function)(void *), void run_idle_thread(void) { -/* Switch stacks and run the thread */ -#if defined(__i386__) -__asm__ __volatile__("mov %0,%%esp\n\t" - "push %1\n\t" - "ret" - :"=m" (idle_thread->sp) - :"m" (idle_thread->ip)); -#elif defined(__x86_64__) -__asm__ __volatile__("mov %0,%%rsp\n\t" - "push %1\n\t" - "ret" - :"=m" (idle_thread->sp) - :"m" (idle_thread->ip)); -#endif +/* Switch stacks and run the thread */ +asm volatile ("mov %[sp], %%"ASM_SP"\n\t" + "jmp *%[ip]\n\t" + : + : [sp] "m" (idle_thread->sp), +[ip] "m" (idle_thread->ip)); } unsigned long __local_irq_save(void) diff --git a/include/x86/os.h b/include/x86/os.h index ee34d784..0095be13 100644 --- a/include/x86/os.h +++ b/include/x86/os.h @@ -61,6 +61,16 @@ #define TRAP_deferred_nmi 31 #define TRAP_xen_callback 32 +#if 
defined(__i386__) +#define __SZ"l" +#define __REG "e" +#else +#define __SZ"q" +#define __REG "r" +#endif + +#define ASM_SP __REG"sp" + /* Everything below this point is not included by assembler (.S) files. */ #ifndef __ASSEMBLY__ @@ -141,14 +151,6 @@ do { \ #else -#if defined(__i386__) -#define __SZ "l" -#define __REG "e" -#else -#define __SZ "q" -#define __REG "r" -#endif - #define __cli() asm volatile ( "cli" : : : "memory" ) #define __sti() asm volatile ( "sti" : : : "memory" ) -- 2.43.0
[PATCH] SUPPORT.md: update Xen version
Update the Xen version to 4.20 Signed-off-by: Juergen Gross --- SUPPORT.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SUPPORT.md b/SUPPORT.md index 77d2a7a7db..bd4316523d 100644 --- a/SUPPORT.md +++ b/SUPPORT.md @@ -9,7 +9,7 @@ for the definitions of the support status levels etc. # Release Support -Xen-Version: 4.19-rc +Xen-Version: 4.20-unstable Initial-Release: n/a Supported-Until: TBD Security-Support-Until: Unreleased - not yet security-supported -- 2.43.0
[PATCH] MAINTAINERS: drop CPU POOLS section
The CPU POOLS sections in MAINTAINERS can be dropped, as the SCHEDULING section has the same maintainers and it is covering the CPU POOLS files as well. Signed-off-by: Juergen Gross --- MAINTAINERS | 6 -- 1 file changed, 6 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 2b0c894527..37aacba2f9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -284,12 +284,6 @@ S: Supported F: .gitlab-ci.yml F: automation/ -CPU POOLS -M: Juergen Gross -M: Dario Faggioli -S: Supported -F: xen/common/sched/*cpupool.c - DEVICE TREE M: Stefano Stabellini M: Julien Grall -- 2.43.0
[PATCH] xen: fix multicall debug data referencing
The recent adding of multicall debug mixed up the referencing of the debug data. A __percpu tagged pointer can't be initialized with a plain pointer, so use another percpu variable for the pointer and set it on each new cpu via a function. Fixes: 942d917cb92a ("xen: make multicall debug boot time selectable") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202407151106.5s7mnfpz-...@intel.com/ Signed-off-by: Juergen Gross --- arch/x86/xen/multicalls.c | 19 --- arch/x86/xen/smp_pv.c | 1 + arch/x86/xen/xen-ops.h| 3 +++ 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index d4cefd8a9af4..10c660fae8b3 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -54,8 +54,9 @@ struct mc_debug_data { static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); static struct mc_debug_data mc_debug_data_early __initdata; -static struct mc_debug_data __percpu *mc_debug_data __refdata = +static DEFINE_PER_CPU(struct mc_debug_data *, mc_debug_data) = &mc_debug_data_early; +static struct mc_debug_data __percpu *mc_debug_data_ptr; DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); static struct static_key mc_debug __ro_after_init; @@ -70,16 +71,20 @@ static int __init xen_parse_mc_debug(char *arg) } early_param("xen_mc_debug", xen_parse_mc_debug); +void mc_percpu_init(unsigned int cpu) +{ + per_cpu(mc_debug_data, cpu) = per_cpu_ptr(mc_debug_data_ptr, cpu); +} + static int __init mc_debug_enable(void) { - struct mc_debug_data __percpu *mcdb; unsigned long flags; if (!mc_debug_enabled) return 0; - mcdb = alloc_percpu(struct mc_debug_data); - if (!mcdb) { + mc_debug_data_ptr = alloc_percpu(struct mc_debug_data); + if (!mc_debug_data_ptr) { pr_err("xen_mc_debug inactive\n"); static_key_slow_dec(&mc_debug); return -ENOMEM; @@ -88,7 +93,7 @@ static int __init mc_debug_enable(void) /* Be careful when switching to percpu debug data. 
*/ local_irq_save(flags); xen_mc_flush(); - mc_debug_data = mcdb; + mc_percpu_init(0); local_irq_restore(flags); pr_info("xen_mc_debug active\n"); @@ -150,7 +155,7 @@ void xen_mc_flush(void) trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx); if (static_key_false(&mc_debug)) { - mcdb = this_cpu_ptr(mc_debug_data); + mcdb = __this_cpu_read(mc_debug_data); memcpy(mcdb->entries, b->entries, b->mcidx * sizeof(struct multicall_entry)); } @@ -230,7 +235,7 @@ struct multicall_space __xen_mc_entry(size_t args) ret.mc = &b->entries[b->mcidx]; if (static_key_false(&mc_debug)) { - struct mc_debug_data *mcdb = this_cpu_ptr(mc_debug_data); + struct mc_debug_data *mcdb = __this_cpu_read(mc_debug_data); mcdb->caller[b->mcidx] = __builtin_return_address(0); mcdb->argsz[b->mcidx] = args; diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 7ea57f728b89..6863d3da7dec 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -305,6 +305,7 @@ static int xen_pv_kick_ap(unsigned int cpu, struct task_struct *idle) return rc; xen_pmu_init(cpu); + mc_percpu_init(cpu); /* * Why is this a BUG? If the hypercall fails then everything can be diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index e7775dff9452..0cf16fc79e0b 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -257,6 +257,9 @@ void xen_mc_callback(void (*fn)(void *), void *data); */ struct multicall_space xen_mc_extend_args(unsigned long op, size_t arg_size); +/* Do percpu data initialization for multicalls. */ +void mc_percpu_init(unsigned int cpu); + extern bool is_xen_pmu; irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id); -- 2.43.0
Re: [PATCH for-4.19] docs/checklist: Fix XEN_EXTRAVERSION inconsistency for release candidates
On 16.07.24 09:46, Jan Beulich wrote: On 16.07.2024 09:33, Julien Grall wrote: Hi, On 16/07/2024 08:24, Jan Beulich wrote: On 16.07.2024 09:22, Julien Grall wrote: On 16/07/2024 07:47, Jan Beulich wrote: On 15.07.2024 18:56, Julien Grall wrote: On 15/07/2024 16:50, Andrew Cooper wrote: An earlier part of the checklist states: * change xen-unstable README. The banner (generated using figlet) should say: - "Xen 4.5" in releases and on stable branches - "Xen 4.5-unstable" on unstable - "Xen 4.5-rc" for release candidate Update the notes about XEN_EXTRAVERSION to match. When this is the purpose of the patch, ... We have been tagging the tree with 4.5.0-rcX. So I think it would be better to update the wording so we use a consistent naming. I find: 4.18-rc 4.17-rc 4.16-rc 4.15-rc Hmmm... I don't think we are looking at the same thing. I was specifically looking at the tag and *not* XEN_EXTRAVERSION. ... why would we be looking at tags? As I wrote, consistency across the naming scheme we use. The tags (necessarily) have RC numbers, Right but they also *have* the .0. so are going to be different from XEN_EXTRAVERSION in any event. Sure they are not going to be 100% the same. However, they could have some similarity. As I pointed out multiple times now, to me it is odd we are tagging the tree with 4.19.0-rcX, but we use 4.19-rc. Furthermore, if you look at the history of the document. It is quite clear that the goal was consistency (the commit mentioned by Andrew happened after). Yes it wasn't respected but I can't tell exactly why. So as we try to correct the documentation, I think we should also look at consistency. If you *really* want to drop the .0, then I think it should happen for the tag as well (again for consistency). I don't see why (but I also wouldn't mind the dropping from the tag). They are going to be different. Whether they're different in one or two aspects is secondary to me. 
I rather view the consistency goal to be with what we've been doing in the last so many releases. Another aspect to look at would be version sorting. This will be interesting when e.g. having a Xen rpm package installed and upgrading it with a later version. I don't think we want to regard replacing an -rc version with the .0 version to be a downgrade, so the version numbers should be sorted by "sort -V" in the correct order. This would mean that we'd need to use: 4.19-rc 4.19.0 4.19.1 Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
[GIT PULL] xen: branch for v6.11-rc1
Linus, Please git pull the following tag: git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git for-linus-6.11-rc1-tag xen: branch for v6.11-rc1 It contains the following patches: - some trivial cleanups - a fix for the Xen timer - a patch for adding boot time selectable debug capability to the Xen multicall handling - two fixes for the recently added Xen irqfd handling Thanks. Juergen Documentation/admin-guide/kernel-parameters.txt | 11 +- arch/arm/xen/p2m.c | 2 +- arch/x86/xen/apic.c | 2 - arch/x86/xen/debugfs.c | 2 +- arch/x86/xen/debugfs.h | 7 -- arch/x86/xen/enlighten.c| 2 - arch/x86/xen/enlighten_hvm.c| 2 - arch/x86/xen/enlighten_pv.c | 4 - arch/x86/xen/mmu.c | 3 +- arch/x86/xen/mmu.h | 28 - arch/x86/xen/mmu_hvm.c | 2 +- arch/x86/xen/mmu_pv.c | 15 ++- arch/x86/xen/multicalls.c | 128 arch/x86/xen/multicalls.h | 69 --- arch/x86/xen/p2m.c | 6 +- arch/x86/xen/pmu.c | 1 - arch/x86/xen/pmu.h | 22 arch/x86/xen/setup.c| 1 - arch/x86/xen/smp.c | 1 - arch/x86/xen/smp.h | 51 arch/x86/xen/smp_hvm.c | 2 - arch/x86/xen/smp_pv.c | 3 - arch/x86/xen/spinlock.c | 20 +--- arch/x86/xen/suspend.c | 2 - arch/x86/xen/time.c | 2 +- arch/x86/xen/xen-ops.h | 148 +++- drivers/xen/evtchn.c| 1 + drivers/xen/manage.c| 2 +- drivers/xen/privcmd-buf.c | 1 + drivers/xen/privcmd.c | 36 -- drivers/xen/xen-pciback/pci_stub.c | 1 + include/xen/events.h| 2 + kernel/locking/qspinlock.c | 2 +- 33 files changed, 305 insertions(+), 276 deletions(-) Chen Ni (2): x86/xen: Convert comma to semicolon xen/arm: Convert comma to semicolon Christophe JAILLET (1): xen/manage: Constify struct shutdown_handler Frediano Ziglio (1): x86/xen/time: Reduce Xen timer tick Jeff Johnson (1): xen: add missing MODULE_DESCRIPTION() macros Juergen Gross (4): xen: make multicall debug boot time selectable x86/xen: make some functions static x86/xen: eliminate some private header files x86/xen: remove deprecated xen_nopvspin boot parameter Viresh Kumar (2): xen: privcmd: Switch from mutex to spinlock for irqfds xen: 
privcmd: Fix possible access to a freed kirqfd instance
Re: Problems in PV dom0 on recent x86 hardware
On 12.07.24 12:35, Jürgen Groß wrote: On 09.07.24 15:08, Jason Andryuk wrote: On 2024-07-09 06:56, Jürgen Groß wrote: On 09.07.24 09:01, Jan Beulich wrote: On 09.07.2024 08:36, Jürgen Groß wrote: On 09.07.24 08:24, Jan Beulich wrote: On 08.07.2024 23:30, Jason Andryuk wrote: From the backtrace, it looks like the immediate case is just trying to read a 4-byte version: [ 44.575541] ucsi_acpi_dsm+0x53/0x80 [ 44.575546] ucsi_acpi_read+0x2e/0x60 [ 44.575550] ucsi_register+0x24/0xa0 [ 44.57] ucsi_acpi_probe+0x162/0x1e3 int ucsi_register(struct ucsi *ucsi) { int ret; ret = ucsi->ops->read(ucsi, UCSI_VERSION, &ucsi->version, sizeof(ucsi->version)); ->read being ucsi_acpi_read() However, the driver also appears write to adjacent addresses. There are also corresponding write functions in the driver, yes, but ucsi_acpi_async_write() (used directly or indirectly) similarly calls ucsi_acpi_dsm(), which wires through to acpi_evaluate_dsm(). That's ACPI object evaluation, which isn't obvious without seeing the involved AML whether it might write said memory region. I guess an ACPI dump would help here? Perhaps, yes. It is available in the bug report: https://bugzilla.opensuse.org/show_bug.cgi?id=1227301 After acpixtract & iasl: $ grep -ir FEEC * dsdt.dsl: OperationRegion (ECMM, SystemMemory, 0xFEEC2000, 0x0100) ssdt16.dsl: OperationRegion (SUSC, SystemMemory, 0xFEEC2100, 0x30) from the DSDT: Scope (\_SB.PCI0.LPC0.EC0) { OperationRegion (ECMM, SystemMemory, 0xFEEC2000, 0x0100) Field (ECMM, AnyAcc, Lock, Preserve) { TWBT, 2048 } Name (BTBF, Buffer (0x0100) { 0x00 // . }) Method (BTIF, 0, NotSerialized) { BTBF = TWBT /* \_SB_.PCI0.LPC0.EC0_.TWBT */ Return (BTBF) /* \_SB_.PCI0.LPC0.EC0_.BTBF */ } } From SSDT16: DefinitionBlock ("", "SSDT", 2, "LENOVO", "UsbCTabl", 0x0001) { External (_SB_.PCI0.LPC0.EC0_, DeviceObj) Scope (\_SB) { OperationRegion (SUSC, SystemMemory, 0xFEEC2100, 0x30) Field (SUSC, ByteAcc, Lock, Preserve) { This embedded controller (?) seems to live at 0xfeec2xxx. 
What is the takeaway from that? Is this a firmware bug (if yes, pointers to a specification saying that this is an illegal configuration would be nice), or do we need a way to map this page from dom0? I've found the following in the AMD IOMMU spec [1]: Received DMA requests without PASID in the 0xFEEx_ address range are treated as MSI interrupts and are processed using interrupt remapping rather than address translation. To me this sounds as if there wouldn't be a major risk letting dom0 map physical addresses in this area, as long as "normal" I/Os to this area would result in DMA requests with a PASID. OTOH I'm not familiar with Xen IOMMU handling, so I might be completely wrong. Another question would be whether a device having resources in this area can even work through an IOMMU. Juergen [1]: https://www.amd.com/content/dam/amd/en/documents/processor-tech-docs/specifications/48882_IOMMU.pdf OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
[PATCH] x86/xen: remove deprecated xen_nopvspin boot parameter
The xen_nopvspin boot parameter is deprecated since 2019. nopvspin can be used instead. Remove the xen_nopvspin boot parameter and replace the xen_pvspin variable use cases with nopvspin. This requires to move the nopvspin variable out of the .initdata section, as it needs to be accessed for cpuhotplug, too. Signed-off-by: Juergen Gross --- .../admin-guide/kernel-parameters.txt | 5 - arch/x86/xen/spinlock.c | 20 +-- kernel/locking/qspinlock.c| 2 +- 3 files changed, 6 insertions(+), 21 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b33d048e01d8..2074ba03f2e3 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -7439,11 +7439,6 @@ access functions when running as Xen PV guest. The default value is controlled by CONFIG_XEN_PV_MSR_SAFE. - xen_nopvspin[X86,XEN,EARLY] - Disables the qspinlock slowpath using Xen PV optimizations. - This parameter is obsoleted by "nopvspin" parameter, which - has equivalent effect for XEN platform. - xen_nopv[X86] Disables the PV optimizations forcing the HVM guest to run as generic HVM guest with no PV drivers. 
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 5c6fc16e4b92..8e4efe0fb6f9 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -18,7 +18,6 @@ static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; static DEFINE_PER_CPU(char *, irq_name); static DEFINE_PER_CPU(atomic_t, xen_qlock_wait_nest); -static bool xen_pvspin = true; static void xen_qlock_kick(int cpu) { @@ -68,7 +67,7 @@ void xen_init_lock_cpu(int cpu) int irq; char *name; - if (!xen_pvspin) + if (nopvspin) return; WARN(per_cpu(lock_kicker_irq, cpu) >= 0, "spinlock on CPU%d exists on IRQ%d!\n", @@ -95,7 +94,7 @@ void xen_uninit_lock_cpu(int cpu) { int irq; - if (!xen_pvspin) + if (nopvspin) return; kfree(per_cpu(irq_name, cpu)); @@ -125,10 +124,10 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen); void __init xen_init_spinlocks(void) { /* Don't need to use pvqspinlock code if there is only 1 vCPU. */ - if (num_possible_cpus() == 1 || nopvspin) - xen_pvspin = false; + if (num_possible_cpus() == 1) + nopvspin = true; - if (!xen_pvspin) { + if (nopvspin) { printk(KERN_DEBUG "xen: PV spinlocks disabled\n"); static_branch_disable(&virt_spin_lock_key); return; @@ -143,12 +142,3 @@ void __init xen_init_spinlocks(void) pv_ops.lock.kick = xen_qlock_kick; pv_ops.lock.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen); } - -static __init int xen_parse_nopvspin(char *arg) -{ - pr_notice("\"xen_nopvspin\" is deprecated, please use \"nopvspin\" instead\n"); - xen_pvspin = false; - return 0; -} -early_param("xen_nopvspin", xen_parse_nopvspin); - diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 1df5fef8a656..7d96bed718e4 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -583,7 +583,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath); #include "qspinlock_paravirt.h" #include "qspinlock.c" -bool nopvspin __initdata; +bool nopvspin; static __init int parse_nopvspin(char *arg) { nopvspin = true; -- 2.43.0
[PATCH 1/2] x86/xen: make some functions static
Some functions and variables in arch/x86/xen are used locally only, make them static. Signed-off-by: Juergen Gross --- arch/x86/xen/mmu.h | 4 arch/x86/xen/mmu_pv.c | 11 ++- arch/x86/xen/xen-ops.h | 1 - 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 6e4c6bd62203..11fa577af6b4 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -17,10 +17,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); -void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, - pte_t *ptep, pte_t pte); - unsigned long xen_read_cr2_direct(void); extern void xen_init_mmu_ops(void); diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 54e0d311dcc9..8924129e284c 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -128,7 +128,7 @@ static DEFINE_SPINLOCK(xen_reservation_lock); * looking at another vcpu's cr3 value, it should use this variable. */ DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ -DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ +static DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ static phys_addr_t xen_pt_base, xen_pt_size __initdata; @@ -305,16 +305,17 @@ static void xen_set_pte(pte_t *ptep, pte_t pteval) __xen_set_pte(ptep, pteval); } -pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, -unsigned long addr, pte_t *ptep) +static pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) { /* Just return the pte as-is. 
We preserve the bits on commit */ trace_xen_mmu_ptep_modify_prot_start(vma->vm_mm, addr, ptep, *ptep); return *ptep; } -void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, -pte_t *ptep, pte_t pte) +static void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, + unsigned long addr, + pte_t *ptep, pte_t pte) { struct mmu_update u; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 79cf93f2c92f..9f29229e25b8 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -23,7 +23,6 @@ void xen_copy_trap_info(struct trap_info *traps); DECLARE_PER_CPU_ALIGNED(struct vcpu_info, xen_vcpu_info); DECLARE_PER_CPU(unsigned long, xen_cr3); -DECLARE_PER_CPU(unsigned long, xen_current_cr3); extern struct start_info *xen_start_info; extern struct shared_info xen_dummy_shared_info; -- 2.43.0
[PATCH 2/2] x86/xen: eliminate some private header files
Under arch/x86/xen there is one large private header file xen-ops.h containing most of the Xen-private x86 related declarations, and then there are several small headers with a handful of declarations each. Merge the small headers into xen-ops.h. While doing that, move the declaration of xen_fifo_events from xen-ops.h into include/xen/events.h where it should have been from the beginning. Signed-off-by: Juergen Gross --- arch/x86/xen/apic.c | 2 - arch/x86/xen/debugfs.c | 2 +- arch/x86/xen/debugfs.h | 7 -- arch/x86/xen/enlighten.c | 2 - arch/x86/xen/enlighten_hvm.c | 2 - arch/x86/xen/enlighten_pv.c | 4 - arch/x86/xen/mmu.c | 3 +- arch/x86/xen/mmu.h | 24 -- arch/x86/xen/mmu_hvm.c | 2 +- arch/x86/xen/mmu_pv.c| 4 +- arch/x86/xen/multicalls.c| 3 +- arch/x86/xen/multicalls.h| 69 arch/x86/xen/p2m.c | 2 - arch/x86/xen/pmu.c | 1 - arch/x86/xen/pmu.h | 22 -- arch/x86/xen/setup.c | 1 - arch/x86/xen/smp.c | 1 - arch/x86/xen/smp.h | 51 arch/x86/xen/smp_hvm.c | 2 - arch/x86/xen/smp_pv.c| 3 - arch/x86/xen/suspend.c | 2 - arch/x86/xen/xen-ops.h | 147 ++- include/xen/events.h | 2 + 23 files changed, 152 insertions(+), 206 deletions(-) delete mode 100644 arch/x86/xen/debugfs.h delete mode 100644 arch/x86/xen/mmu.h delete mode 100644 arch/x86/xen/multicalls.h delete mode 100644 arch/x86/xen/pmu.h delete mode 100644 arch/x86/xen/smp.h diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c index 8b045dd25196..bb0f3f368446 100644 --- a/arch/x86/xen/apic.c +++ b/arch/x86/xen/apic.c @@ -10,8 +10,6 @@ #include #include #include "xen-ops.h" -#include "pmu.h" -#include "smp.h" static unsigned int xen_io_apic_read(unsigned apic, unsigned reg) { diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c index 532410998684..b8c9f2a7d9b6 100644 --- a/arch/x86/xen/debugfs.c +++ b/arch/x86/xen/debugfs.c @@ -3,7 +3,7 @@ #include #include -#include "debugfs.h" +#include "xen-ops.h" static struct dentry *d_xen_debug; diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h deleted file mode 
100644 index 6b813ad1091c.. --- a/arch/x86/xen/debugfs.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _XEN_DEBUGFS_H -#define _XEN_DEBUGFS_H - -struct dentry * __init xen_init_debugfs(void); - -#endif /* _XEN_DEBUGFS_H */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0305485edcd3..84e5adbd0925 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -20,8 +20,6 @@ #include #include "xen-ops.h" -#include "smp.h" -#include "pmu.h" EXPORT_SYMBOL_GPL(hypercall_page); diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index c001a2296582..24d2957a4726 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -28,8 +28,6 @@ #include #include "xen-ops.h" -#include "mmu.h" -#include "smp.h" static unsigned long shared_info_pfn; diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c index 9ba53814ed6a..2c12ae42dc8b 100644 --- a/arch/x86/xen/enlighten_pv.c +++ b/arch/x86/xen/enlighten_pv.c @@ -85,10 +85,6 @@ #endif #include "xen-ops.h" -#include "mmu.h" -#include "smp.h" -#include "multicalls.h" -#include "pmu.h" #include "../kernel/cpu/cpu.h" /* get_cpu_cap() */ diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 60e9c37fd79f..c4c479373249 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -5,8 +5,7 @@ #include #include -#include "multicalls.h" -#include "mmu.h" +#include "xen-ops.h" unsigned long arbitrary_virt_to_mfn(void *vaddr) { diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h deleted file mode 100644 index 11fa577af6b4.. 
--- a/arch/x86/xen/mmu.h +++ /dev/null @@ -1,24 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _XEN_MMU_H - -#include -#include - -enum pt_level { - PT_PGD, - PT_P4D, - PT_PUD, - PT_PMD, - PT_PTE -}; - - -bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); - -void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); - -unsigned long xen_read_cr2_direct(void); - -extern void xen_init_mmu_ops(void); -extern void xen_hvm_init_mmu_ops(void); -#endif /* _XEN_MMU_H */ diff --git a/arch/x86/xen/mmu_hvm.c b/arch/x86/xen/mmu_hvm.c index 509bdee3ab90..337955652202 100644 --- a/arch/x86/xen/mmu_hvm.c +++ b/arch/x86/xen/mmu_hvm.c @@ -5,7 +5,7 @@ #include #include -#include "mmu.h" +#include "xen-ops.h" #ifdef CONFIG_PR
[PATCH 0/2] x86/xen: cleanup of xen private headers
Cleanup the private header files in arch/x86/xen by merging them into one file and by removing unneeded stuff. Juergen Gross (2): x86/xen: make some functions static x86/xen: eliminate some private header files arch/x86/xen/apic.c | 2 - arch/x86/xen/debugfs.c | 2 +- arch/x86/xen/debugfs.h | 7 -- arch/x86/xen/enlighten.c | 2 - arch/x86/xen/enlighten_hvm.c | 2 - arch/x86/xen/enlighten_pv.c | 4 - arch/x86/xen/mmu.c | 3 +- arch/x86/xen/mmu.h | 28 --- arch/x86/xen/mmu_hvm.c | 2 +- arch/x86/xen/mmu_pv.c| 15 ++-- arch/x86/xen/multicalls.c| 3 +- arch/x86/xen/multicalls.h| 69 arch/x86/xen/p2m.c | 2 - arch/x86/xen/pmu.c | 1 - arch/x86/xen/pmu.h | 22 -- arch/x86/xen/setup.c | 1 - arch/x86/xen/smp.c | 1 - arch/x86/xen/smp.h | 51 arch/x86/xen/smp_hvm.c | 2 - arch/x86/xen/smp_pv.c| 3 - arch/x86/xen/suspend.c | 2 - arch/x86/xen/xen-ops.h | 148 ++- include/xen/events.h | 2 + 23 files changed, 158 insertions(+), 216 deletions(-) delete mode 100644 arch/x86/xen/debugfs.h delete mode 100644 arch/x86/xen/mmu.h delete mode 100644 arch/x86/xen/multicalls.h delete mode 100644 arch/x86/xen/pmu.h delete mode 100644 arch/x86/xen/smp.h -- 2.43.0
[PATCH v2] xen: make multicall debug boot time selectable
Today Xen multicall debugging needs to be enabled via modifying a define in a source file for getting debug data of multicall errors encountered by users. Switch multicall debugging to depend on a boot parameter "xen_mc_debug" instead, enabling affected users to boot with the new parameter set in order to get better diagnostics. With debugging enabled print the following information in case at least one of the batched calls failed: - all calls of the batch with operation, result and caller - all parameters of each call - all parameters stored in the multicall data for each call Signed-off-by: Juergen Gross --- V2: - fixed argument printing, added parameter printing - in debug case print entries without error, too - rename struct member from debug to entries (Boris Ostrovsky) - get rid of get_mc_debug_ptr() (Boris Ostrovsky) --- .../admin-guide/kernel-parameters.txt | 6 + arch/x86/xen/multicalls.c | 125 ++ 2 files changed, 108 insertions(+), 23 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 27ec49af1bf2..b33d048e01d8 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -7427,6 +7427,12 @@ Crash from Xen panic notifier, without executing late panic() code such as dumping handler. + xen_mc_debug[X86,XEN,EARLY] + Enable multicall debugging when running as a Xen PV guest. + Enabling this feature will reduce performance a little + bit, so it should only be enabled for obtaining extended + debug data in case of multicall errors. 
+ xen_msr_safe= [X86,XEN,EARLY] Format: Select whether to always use non-faulting (safe) MSR diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 07054572297f..a8d699687d5c 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include @@ -31,18 +33,12 @@ #define MC_BATCH 32 -#define MC_DEBUG 0 - #define MC_ARGS(MC_BATCH * 16) struct mc_buffer { unsigned mcidx, argidx, cbidx; struct multicall_entry entries[MC_BATCH]; -#if MC_DEBUG - struct multicall_entry debug[MC_BATCH]; - void *caller[MC_BATCH]; -#endif unsigned char args[MC_ARGS]; struct callback { void (*fn)(void *); @@ -50,13 +46,98 @@ struct mc_buffer { } callbacks[MC_BATCH]; }; +struct mc_debug_data { + struct multicall_entry entries[MC_BATCH]; + void *caller[MC_BATCH]; + size_t argsz[MC_BATCH]; + unsigned long *args[MC_BATCH]; +}; + static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); +static struct mc_debug_data mc_debug_data_early __initdata; +static struct mc_debug_data __percpu *mc_debug_data __refdata = + &mc_debug_data_early; DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); +static struct static_key mc_debug __ro_after_init; +static bool mc_debug_enabled __initdata; + +static int __init xen_parse_mc_debug(char *arg) +{ + mc_debug_enabled = true; + static_key_slow_inc(&mc_debug); + + return 0; +} +early_param("xen_mc_debug", xen_parse_mc_debug); + +static int __init mc_debug_enable(void) +{ + struct mc_debug_data __percpu *mcdb; + unsigned long flags; + + if (!mc_debug_enabled) + return 0; + + mcdb = alloc_percpu(struct mc_debug_data); + if (!mcdb) { + pr_err("xen_mc_debug inactive\n"); + static_key_slow_dec(&mc_debug); + return -ENOMEM; + } + + /* Be careful when switching to percpu debug data. 
*/ + local_irq_save(flags); + xen_mc_flush(); + mc_debug_data = mcdb; + local_irq_restore(flags); + + pr_info("xen_mc_debug active\n"); + + return 0; +} +early_initcall(mc_debug_enable); + +/* Number of parameters of hypercalls used via multicalls. */ +static const uint8_t hpcpars[] = { + [__HYPERVISOR_mmu_update] = 4, + [__HYPERVISOR_stack_switch] = 2, + [__HYPERVISOR_fpu_taskswitch] = 1, + [__HYPERVISOR_update_descriptor] = 2, + [__HYPERVISOR_update_va_mapping] = 3, + [__HYPERVISOR_mmuext_op] = 4, +}; + +static void print_debug_data(struct mc_buffer *b, struct mc_debug_data *mcdb, +int idx) +{ + unsigned int arg; + unsigned int opidx = mcdb->entries[idx].op & 0xff; + unsigned int pars = 0; + + pr_err(" call %2d: op=%lu result=%ld caller=%pS ", idx + 1, + mcdb->entries[idx].op, b->entries[idx].result, + mcdb->caller[idx]); + if (
Re: GNTTABOP_setup_table yields -1 PFNs
On 08.07.24 14:50, Taylor R Campbell wrote: Date: Mon, 8 Jul 2024 11:09:21 +0200 From: Jan Beulich On 06.07.2024 04:22, Taylor R Campbell wrote: On a Xen 4.14 host (with extraversion=.0.88.g1d1d1f53), with version 1 grant tables, where GNTTABOP_query_size initially returns nr_frames=32 max_nr_frames=64, a NetBSD guest repeatedly queries GNTTABOP_setup_table for successively larger nr_frames from 1 up. First question: Is there some earlier GNTTABOP_setup_table that you invoke? I'd expect (and also observe) nr_frames=1 initially. Not that the guest OS invokes. Perhaps the bootloader, pv-grub 0.97, might invoke GNTTABOP_setup_table? I looked around but couldn't find an obvious source for pv-grub 0.97. This is based on Mini-OS, which does GNTTABOP_setup_table with nr_frames=1 Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
[PATCH] xen: make multicall debug boot time selectable
Today Xen multicall debugging needs to be enabled via modifying a define in a source file for getting debug data of multicall errors encountered by users. Switch multicall debugging to depend on a boot parameter "xen_mc_debug" instead, enabling affected users to boot with the new parameter set in order to get better diagnostics. Add printing all arguments of a single call for better diagnostics. Signed-off-by: Juergen Gross --- .../admin-guide/kernel-parameters.txt | 6 + arch/x86/xen/multicalls.c | 120 ++ 2 files changed, 99 insertions(+), 27 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 27ec49af1bf2..b33d048e01d8 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -7427,6 +7427,12 @@ Crash from Xen panic notifier, without executing late panic() code such as dumping handler. + xen_mc_debug[X86,XEN,EARLY] + Enable multicall debugging when running as a Xen PV guest. + Enabling this feature will reduce performance a little + bit, so it should only be enabled for obtaining extended + debug data in case of multicall errors. 
+ xen_msr_safe= [X86,XEN,EARLY] Format: Select whether to always use non-faulting (safe) MSR diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 07054572297f..abea216f07f4 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -23,6 +23,8 @@ #include #include #include +#include +#include #include @@ -31,18 +33,12 @@ #define MC_BATCH 32 -#define MC_DEBUG 0 - #define MC_ARGS(MC_BATCH * 16) struct mc_buffer { unsigned mcidx, argidx, cbidx; struct multicall_entry entries[MC_BATCH]; -#if MC_DEBUG - struct multicall_entry debug[MC_BATCH]; - void *caller[MC_BATCH]; -#endif unsigned char args[MC_ARGS]; struct callback { void (*fn)(void *); @@ -50,13 +46,84 @@ struct mc_buffer { } callbacks[MC_BATCH]; }; +struct mc_debug_data { + struct multicall_entry debug[MC_BATCH]; + void *caller[MC_BATCH]; + size_t argsz[MC_BATCH]; +}; + static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); +static struct mc_debug_data __percpu *mc_debug_data; +static struct mc_debug_data mc_debug_data_early __initdata; DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); +static struct static_key mc_debug __ro_after_init; +static bool mc_debug_enabled __initdata; + +static int __init xen_parse_mc_debug(char *arg) +{ + mc_debug_enabled = true; + static_key_slow_inc(&mc_debug); + + return 0; +} +early_param("xen_mc_debug", xen_parse_mc_debug); + +static int __init mc_debug_enable(void) +{ + struct mc_debug_data __percpu *mcdb; + unsigned long flags; + + if (!mc_debug_enabled) + return 0; + + mcdb = alloc_percpu(struct mc_debug_data); + if (!mcdb) { + pr_err("xen_mc_debug inactive\n"); + static_key_slow_dec(&mc_debug); + return -ENOMEM; + } + + /* Be careful when switching to percpu debug data. 
*/ + local_irq_save(flags); + xen_mc_flush(); + mc_debug_data = mcdb; + local_irq_restore(flags); + + pr_info("xen_mc_debug active\n"); + + return 0; +} +early_initcall(mc_debug_enable); + +static void print_debug_data(struct mc_buffer *b, struct mc_debug_data *mcdb, +int idx) +{ + unsigned int arg; + + pr_err(" call %2d: op=%lu result=%ld caller=%pS", idx + 1, + mcdb->debug[idx].op, b->entries[idx].result, mcdb->caller[idx]); + if (mcdb->argsz[idx]) { + pr_cont(" args="); + for (arg = 0; arg < mcdb->argsz[idx] / 8; arg++) + pr_cont("%lx ", mcdb->debug[idx].args[arg]); + } + pr_cont("\n"); +} + +static struct mc_debug_data * __ref get_mc_debug_ptr(void) +{ + if (likely(mc_debug_data)) + return this_cpu_ptr(mc_debug_data); + + return &mc_debug_data_early; +} + void xen_mc_flush(void) { struct mc_buffer *b = this_cpu_ptr(&mc_buffer); struct multicall_entry *mc; + struct mc_debug_data *mcdb = NULL; int ret = 0; unsigned long flags; int i; @@ -69,10 +136,11 @@ void xen_mc_flush(void) trace_xen_mc_flush(b->mcidx, b->argidx, b->cbidx); -#if MC_DEBUG - memcpy(b->debug, b->entries, - b->mcidx * sizeof(struct multicall_entry)); -#endif + if (static_key_
Re: [PATCH 2/2] xen: privcmd: Fix possible access to a freed kirqfd instance
On 18.06.24 11:42, Viresh Kumar wrote: Nothing prevents simultaneous ioctl calls to privcmd_irqfd_assign() and privcmd_irqfd_deassign(). If that happens, it is possible that a kirqfd created and added to the irqfds_list by privcmd_irqfd_assign() may get removed by another thread executing privcmd_irqfd_deassign(), while the former is still using it after dropping the locks. This can lead to a situation where an already freed kirqfd instance may be accessed and cause kernel oops. Use SRCU locking to prevent the same, as is done for the KVM implementation for irqfds. Reported-by: Al Viro Suggested-by: Paolo Bonzini Signed-off-by: Viresh Kumar Reviewed-by: Juergen Gross Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
Re: [PATCH 1/2] xen: privcmd: Switch from mutex to spinlock for irqfds
On 18.06.24 11:42, Viresh Kumar wrote: irqfd_wakeup() gets EPOLLHUP, when it is called by eventfd_release() by way of wake_up_poll(&ctx->wqh, EPOLLHUP), which gets called under spin_lock_irqsave(). We can't use a mutex here as it will lead to a deadlock. Fix it by switching over to a spin lock. Reported-by: Al Viro Signed-off-by: Viresh Kumar --- drivers/xen/privcmd.c | 26 +++--- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 67dfa4778864..5ceb6c56cf3e 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -13,7 +13,6 @@ #include #include #include -#include I don't think you can drop that. There is still the ioreq_lock mutex. I can fix that up while committing, with that: Reviewed-by: Juergen Gross Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
Re: [PATCH] xen-blkfront: fix sector_size propagation to the block layer
On 25.06.24 07:52, Christoph Hellwig wrote: Ensure that info->sector_size and info->physical_sector_size are set before the call to blkif_set_queue_limits by doing away with the local variables and arguments that propagate them. Thanks to Marek Marczykowski-Górecki and Jürgen Groß for root causing the issue. Fixes: ba3f67c11638 ("xen-blkfront: atomically update queue limits") Reported-by: Rusty Bird Signed-off-by: Christoph Hellwig I guess this should go through the block tree? Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
Re: [PATCH] xen: add missing MODULE_DESCRIPTION() macros
On 12.06.24 01:54, Jeff Johnson wrote: With ARCH=x86, make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/xen/xen-pciback/xen-pciback.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/xen/xen-evtchn.o WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/xen/xen-privcmd.o Add the missing invocations of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Reviewed-by: Juergen Gross Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
Re: [PATCH] MAINTAINERS: Step down as maintainer and committer
On 26.06.24 17:19, George Dunlap wrote: Remain a Reviewer on the golang bindings and scheduler for now (using a xenproject.org alias), since there may be architectural decisions I can shed light on. Remove the XENTRACE section entirely, as there's no obvious candidate to take it over; having the respective parts fall back to the tools and The Rest seems the most reasonable option. Signed-off-by: George Dunlap Acked-by: Juergen Gross Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
Re: Disaggregated (Xoar) Dom0 Building
On 26.06.24 18:47, Lonnie Cumberland wrote: Hello All, I hope that everyone is doing well today. Currently, I am investigating and researching the ideas of "Disaggregating" Dom0 and have the Xoar Xen patches ("Breaking Up is Hard to Do: Security and Functionality in a Commodity Hypervisor" 2011) available which were developed against version 22155 of xen-unstable. The Linux patches are against Linux with pvops 2.6.31.13 and developed on a standard Ubuntu 10.04 install. My effort would also be up update these patches. I have been able to locate the Xen "Dom0 Disaggregation" (https://wiki.xenproject.org/wiki/Dom0_Disaggregation) am reading up on things now but wanted to ask the developers list about any experience you may have had in this area since the research objective is to integrate Xoar with the latest Xen 4.20, if possible, and to take it further to basically eliminate Dom0 all together with individual Mini-OS or Unikernel "Service and Driver VM's" instead that are loaded at UEFI boot time. Any guidance, thoughts, or ideas would be greatly appreciated, Just some pointers, this is not an exhaustive list: - you should have a look at dom0less (see docs/features/dom0less.pandoc in the Xen source tree) and hyperlauch (see docs/designs/launch/hyperlaunch.rst in the Xen source tree) - Xenstore in a stub-domain is working fine, it is the default in openSUSE and SLE - QubesOS has a lot of the disaggregation you are looking for implemented - I'm pretty sure only very few changes should be needed for the Linux kernel, if any. Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
[PATCH v2] xen/sched: fix error handling in cpu_schedule_up()
In case cpu_schedule_up() is failing, it needs to undo all externally visible changes it has done before. Reason is that cpu_schedule_callback() won't be called with the CPU_UP_CANCELED notifier in case cpu_schedule_up() did fail. Reported-by: Jan Beulich Fixes: 207589dbacd4 ("xen/sched: move per cpu scheduler private data into struct sched_resource") Signed-off-by: Juergen Gross --- V2: - undo changes in cpu_schedule_up() in case of failure --- xen/common/sched/core.c | 63 + 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c index d84b65f197..c466711e9e 100644 --- a/xen/common/sched/core.c +++ b/xen/common/sched/core.c @@ -2755,6 +2755,36 @@ static struct sched_resource *sched_alloc_res(void) return sr; } +static void cf_check sched_res_free(struct rcu_head *head) +{ +struct sched_resource *sr = container_of(head, struct sched_resource, rcu); + +free_cpumask_var(sr->cpus); +if ( sr->sched_unit_idle ) +sched_free_unit_mem(sr->sched_unit_idle); +xfree(sr); +} + +static void cpu_schedule_down(unsigned int cpu) +{ +struct sched_resource *sr; + +rcu_read_lock(&sched_res_rculock); + +sr = get_sched_res(cpu); + +kill_timer(&sr->s_timer); + +cpumask_clear_cpu(cpu, &sched_res_mask); +set_sched_res(cpu, NULL); + +/* Keep idle unit. 
*/ +sr->sched_unit_idle = NULL; +call_rcu(&sr->rcu, sched_res_free); + +rcu_read_unlock(&sched_res_rculock); +} + static int cpu_schedule_up(unsigned int cpu) { struct sched_resource *sr; @@ -2794,7 +2824,10 @@ static int cpu_schedule_up(unsigned int cpu) idle_vcpu[cpu]->sched_unit->res = sr; if ( idle_vcpu[cpu] == NULL ) +{ +cpu_schedule_down(cpu); return -ENOMEM; +} idle_vcpu[cpu]->sched_unit->rendezvous_in_cnt = 0; @@ -2812,36 +2845,6 @@ static int cpu_schedule_up(unsigned int cpu) return 0; } -static void cf_check sched_res_free(struct rcu_head *head) -{ -struct sched_resource *sr = container_of(head, struct sched_resource, rcu); - -free_cpumask_var(sr->cpus); -if ( sr->sched_unit_idle ) -sched_free_unit_mem(sr->sched_unit_idle); -xfree(sr); -} - -static void cpu_schedule_down(unsigned int cpu) -{ -struct sched_resource *sr; - -rcu_read_lock(&sched_res_rculock); - -sr = get_sched_res(cpu); - -kill_timer(&sr->s_timer); - -cpumask_clear_cpu(cpu, &sched_res_mask); -set_sched_res(cpu, NULL); - -/* Keep idle unit. */ -sr->sched_unit_idle = NULL; -call_rcu(&sr->rcu, sched_res_free); - -rcu_read_unlock(&sched_res_rculock); -} - void sched_rm_cpu(unsigned int cpu) { int rc; -- 2.35.3
Re: [PATCH] xen/sched: fix rare error case in cpu_schedule_down()
On 25.06.24 10:33, Jan Beulich wrote: On 25.06.2024 10:27, Juergen Gross wrote: In case cpu_schedule_up() is failing to allocate memory for struct sched_resource, Or (just to mention it again) in case the function isn't called at all because some earlier CPU_UP_PREPARE handler fails. This remark made me looking into the notifier implementation. I think this patch should be reworked significantly: In cpu_up() the CPU_UP_CANCELED notifier call is using the same notifier_block as the one used by CPU_UP_PREPARE before. This means, that only the handlers which didn't fail for CPU_UP_PREPARE will be called with CPU_UP_CANCELED. So there is no such case as "in case the function isn't called at all because some earlier CPU_UP_PREPARE handler fails". And a failure of cpu_schedule_up() needs to undo all externally visible modifications instead of hoping that the CPU_UP_CANCELED notifier will do that. So: V2 of the patch will be needed. Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
[PATCH] xen/sched: fix rare error case in cpu_schedule_down()
In case cpu_schedule_up() is failing to allocate memory for struct sched_resource, cpu_schedule_down() will be called with the sched_resource pointer being NULL. This needs to be handled. Reported-by: Jan Beulich Fixes: 207589dbacd4 ("xen/sched: move per cpu scheduler private data into struct sched_resource") Signed-off-by: Juergen Gross --- xen/common/sched/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/xen/common/sched/core.c b/xen/common/sched/core.c index d84b65f197..0dc86b8f6c 100644 --- a/xen/common/sched/core.c +++ b/xen/common/sched/core.c @@ -2829,6 +2829,8 @@ static void cpu_schedule_down(unsigned int cpu) rcu_read_lock(&sched_res_rculock); sr = get_sched_res(cpu); +if ( !sr ) +goto out; kill_timer(&sr->s_timer); @@ -2839,6 +2841,7 @@ static void cpu_schedule_down(unsigned int cpu) sr->sched_unit_idle = NULL; call_rcu(&sr->rcu, sched_res_free); + out: rcu_read_unlock(&sched_res_rculock); } -- 2.35.3
[PATCH] MAINTAINERS: add me as scheduler maintainer
I've been active in the scheduling code for many years now. Add me as a maintainer. Signed-off-by: Juergen Gross --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 6ba7d2765f..cc40c0be9d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -490,6 +490,7 @@ F: xen/common/sched/rt.c SCHEDULING M: George Dunlap M: Dario Faggioli +M: Juergen Gross S: Supported F: xen/common/sched/ -- 2.35.3
[PATCH] MAINTAINERS: add an entry for tools/9pfsd
Add me as the maintainer for the tools/9pfsd directory. Signed-off-by: Juergen Gross --- MAINTAINERS | 6 ++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 076cf1e141..28fb35582b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -206,6 +206,12 @@ Maintainers List (try to look for most precise areas first) --- +9PFSD +M: Juergen Gross +M: Anthony PERARD +S: Supported +F: tools/9pfsd + ACPI M: Jan Beulich S: Supported -- 2.35.3
Re: CVE-2021-47377: kernel: xen/balloon: use a kernel thread instead a workqueue
Hi, I'd like to dispute CVE-2021-47377: the issue fixed by upstream commit 8480ed9c2bbd56fc86524998e5f2e3e22f5038f6 can in no way be triggered by an unprivileged user or by a remote attack of the system, as it requires initiation of memory ballooning of the running system. This can be done only by either a host admin or by an admin of the guest which might suffer the detection of the hanging workqueue. Please revoke this CVE. Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
Re: [PATCH v3 2/4] xen/arm: Alloc XenStore page for Dom0less DomUs from hypervisor
On 24.05.24 16:30, Jürgen Groß wrote: On 24.05.24 15:58, Julien Grall wrote: Hi Henry, + Juergen as the Xenstore maintainers. I'd like his opinion on the approach. The documentation of the new logic is in: https://lore.kernel.org/xen-devel/20240517032156.1490515-5-xin.wa...@amd.com/ FWIW I am happy in principle with the logic (this is what we discussed on the call last week). Some comments below. I'm not against this logic, but I'm wondering why it needs to be so complicated. Can't the domU itself allocate the Xenstore page from its RAM pages, write the PFN into the Xenstore grant tab entry, and then make it public via setting HVM_PARAM_STORE_PFN? The init-dom0less application could then check HVM_PARAM_STORE_PFN being set and call XS_introduce_domain. Note that at least C-xenstored does not need the PFN of the Xenstore page, as it is just using GNTTAB_RESERVED_XENSTORE for mapping the page. Hmm, seems as if O-xenstored is using the map_foreign_page mechanism with the PFN to map the Xenstore ring page. Maybe this should be changed to use the grant entry, too? Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature
[GIT PULL] xen: branch for v6.10-rc1
Linus, Please git pull the following tag: git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip.git for-linus-6.10a-rc1-tag xen: branch for v6.10-rc1 It contains the following patches: - a small cleanup in the drivers/xen/xenbus Makefile - a fix of the Xen xenstore driver to improve connecting to a late started Xenstore - an enhancement for better support of ballooning in PVH guests - a cleanup using try_cmpxchg() instead of open coding it Thanks. Juergen arch/x86/xen/enlighten.c | 33 + arch/x86/xen/p2m.c| 11 +-- drivers/xen/xenbus/Makefile | 14 ++ drivers/xen/xenbus/xenbus_probe.c | 36 +++- 4 files changed, 67 insertions(+), 27 deletions(-) Andy Shevchenko (1): xen/xenbus: Use *-y instead of *-objs in Makefile Henry Wang (1): drivers/xen: Improve the late XenStore init protocol Roger Pau Monne (1): xen/x86: add extra pages to unpopulated-alloc if available Uros Bizjak (1): locking/x86/xen: Use try_cmpxchg() in xen_alloc_p2m_entry()
Re: [PATCH 5/5] x86/pvh: Add 64bit relocation page tables
On 10.04.24 21:48, Jason Andryuk wrote: The PVH entry point is 32bit. For a 64bit kernel, the entry point must switch to 64bit mode, which requires a set of page tables. In the past, PVH used init_top_pgt. This works fine when the kernel is loaded at LOAD_PHYSICAL_ADDR, as the page tables are prebuilt for this address. If the kernel is loaded at a different address, they need to be adjusted. __startup_64() adjusts the prebuilt page tables for the physical load address, but it is 64bit code. The 32bit PVH entry code can't call it to adjust the page tables, so it can't readily be re-used. 64bit PVH entry needs page tables set up for identity map, the kernel high map and the direct map. pvh_start_xen() enters identity mapped. Inside xen_prepare_pvh(), it jumps through a pv_ops function pointer into the highmap. The direct map is used for __va() on the initramfs and other guest physical addresses. Add a dedicated set of prebuilt page tables for PVH entry. They are adjusted in assembly before loading. Add XEN_ELFNOTE_PHYS32_RELOC to indicate support for relocation along with the kernel's loading constraints. The maximum load address, KERNEL_IMAGE_SIZE - 1, is determined by a single pvh_level2_ident_pgt page. It could be larger with more pages. Signed-off-by: Jason Andryuk --- Instead of adding 5 pages of prebuilt page tables, they could be constructed dynamically in the .bss area. They are then only used for PVH entry and until transitioning to init_top_pgt. The .bss is later cleared. It's safer to add the dedicated pages, so that is done here. 
--- arch/x86/platform/pvh/head.S | 105 ++- 1 file changed, 104 insertions(+), 1 deletion(-) diff --git a/arch/x86/platform/pvh/head.S b/arch/x86/platform/pvh/head.S index c08d08d8cc92..4af3cfbcf2f8 100644 --- a/arch/x86/platform/pvh/head.S +++ b/arch/x86/platform/pvh/head.S @@ -21,6 +21,8 @@ #include #include +#include "../kernel/pgtable_64_helpers.h" + __HEAD /* @@ -102,8 +104,47 @@ SYM_CODE_START_LOCAL(pvh_start_xen) btsl $_EFER_LME, %eax wrmsr + mov %ebp, %ebx + subl $LOAD_PHYSICAL_ADDR, %ebx /* offset */ + jz .Lpagetable_done + + /* Fixup page-tables for relocation. */ + leal rva(pvh_init_top_pgt)(%ebp), %edi + movl $512, %ecx Please use PTRS_PER_PGD instead of the literal 512. Similar issue below. +2: + testl $_PAGE_PRESENT, 0x00(%edi) + jz 1f + addl %ebx, 0x00(%edi) +1: + addl $8, %edi + decl %ecx + jnz 2b + + /* L3 ident has a single entry. */ + leal rva(pvh_level3_ident_pgt)(%ebp), %edi + addl %ebx, 0x00(%edi) + + leal rva(pvh_level3_kernel_pgt)(%ebp), %edi + addl %ebx, (4096 - 16)(%edi) + addl %ebx, (4096 - 8)(%edi) PAGE_SIZE instead of 4096, please. + + /* pvh_level2_ident_pgt is fine - large pages */ + + /* pvh_level2_kernel_pgt needs adjustment - large pages */ + leal rva(pvh_level2_kernel_pgt)(%ebp), %edi + movl $512, %ecx +2: + testl $_PAGE_PRESENT, 0x00(%edi) + jz 1f + addl %ebx, 0x00(%edi) +1: + addl $8, %edi + decl %ecx + jnz 2b + +.Lpagetable_done: /* Enable pre-constructed page tables. */ - leal rva(init_top_pgt)(%ebp), %eax + leal rva(pvh_init_top_pgt)(%ebp), %eax mov %eax, %cr3 mov $(X86_CR0_PG | X86_CR0_PE), %eax mov %eax, %cr0 @@ -197,5 +238,67 @@ SYM_DATA_START_LOCAL(early_stack) .fill BOOT_STACK_SIZE, 1, 0 SYM_DATA_END_LABEL(early_stack, SYM_L_LOCAL, early_stack_end) +#ifdef CONFIG_X86_64 +/* + * Xen PVH needs a set of identity mapped and kernel high mapping + * page tables. pvh_start_xen starts running on the identity mapped + * page tables, but xen_prepare_pvh calls into the high mapping. 
+ * These page tables need to be relocatable and are only used until + * startup_64 transitions to init_top_pgt. + */ +SYM_DATA_START_PAGE_ALIGNED(pvh_init_top_pgt) + .quad pvh_level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC + .org pvh_init_top_pgt + L4_PAGE_OFFSET*8, 0 Please add a space before and after the '*'. + .quad pvh_level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC + .org pvh_init_top_pgt + L4_START_KERNEL*8, 0 + /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ + .quad pvh_level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC +SYM_DATA_END(pvh_init_top_pgt) + +SYM_DATA_START_PAGE_ALIGNED(pvh_level3_ident_pgt) + .quad pvh_level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC + .fill 511, 8, 0 +SYM_DATA_END(pvh_level3_ident_pgt) +SYM_DATA_START_PAGE_ALIGNED(pvh_level2_ident_pgt) + /* +* Since I easily can, map the first 1G. +* Don't set NX because code runs from these pages. +* +* Note: This sets _PAGE_GLOBAL despite whether +* the CPU supports it or it is enabled. But, +* the CP
Re: [PATCH 4/5] x86/kernel: Move page table macros to new header
On 10.04.24 21:48, Jason Andryuk wrote: The PVH entry point will need an additional set of prebuilt page tables. Move the macros and defines to a new header so they can be re-used. Signed-off-by: Jason Andryuk With the one nit below addressed: Reviewed-by: Juergen Gross ... diff --git a/arch/x86/kernel/pgtable_64_helpers.h b/arch/x86/kernel/pgtable_64_helpers.h new file mode 100644 index ..0ae87d768ce2 --- /dev/null +++ b/arch/x86/kernel/pgtable_64_helpers.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PGTABLES_64_H__ +#define __PGTABLES_64_H__ + +#ifdef __ASSEMBLY__ + +#define l4_index(x)(((x) >> 39) & 511) +#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) Please fix the minor style issue in this line by s/-/ - / Juergen OpenPGP_0xB0DE9DD628BF132F.asc Description: OpenPGP public key OpenPGP_signature.asc Description: OpenPGP digital signature