On Thu, Feb 17, 2022 at 12:01:08PM +0100, Jan Beulich wrote:
> In order for a to-be-introduced ERMS form of memcpy() to not regress
> boot performance on certain systems when video output is active, we
> first need to arrange for avoiding further dependency on firmware
> setting up MTRRs in a way we can actually further modify. On many
> systems, due to the continuously growing amounts of installed memory,
> MTRRs get configured with at least one huge WB range, and with MMIO
> ranges below 4Gb then forced to UC via overlapping MTRRs. mtrr_add(), as
> it is today, can't deal with such a setup. Hence on such systems we
> presently leave the frame buffer mapped UC, leading to significantly
> reduced performance when using REP STOSB / REP MOVSB.
> 
> On post-PentiumII hardware (i.e. any that's capable of running 64-bit
> code), an effective memory type of WC can be achieved without MTRRs, by
> simply referencing the respective PAT entry from the PTEs. While this
> will leave the switch to ERMS forms of memset() and memcpy() with
> largely unchanged performance, the change here on its own improves
> performance on affected systems quite significantly: Measuring just the
> individual affected memcpy() invocations yielded a speedup by a factor
> of over 250 on my initial (Skylake) test system. memset() isn't getting
> improved by as much there, but still by a factor of about 20.
> 
> While adding {__,}PAGE_HYPERVISOR_WC, also add {__,}PAGE_HYPERVISOR_WT
> to, at the very least, make clear what PTE flags this memory type uses.
> 
> Signed-off-by: Jan Beulich <[email protected]>
> ---
> REPOST (in isolation) upon Roger's request. The header location change I
> don't really consider a "re-base".
> 
> v2: Mark ioremap_wc() __init.
> ---
> TBD: If the VGA range is WC in the fixed range MTRRs, reusing the low
>      1st Mb mapping (like ioremap() does) would be an option.
> 
> --- a/xen/arch/x86/include/asm/mm.h
> +++ b/xen/arch/x86/include/asm/mm.h
> @@ -602,6 +602,8 @@ void destroy_perdomain_mapping(struct do
>                                 unsigned int nr);
>  void free_perdomain_mappings(struct domain *);
>  
> +void __iomem *ioremap_wc(paddr_t, size_t);
> +
>  extern int memory_add(unsigned long spfn, unsigned long epfn, unsigned int pxm);
>  
>  void domain_set_alloc_bitsize(struct domain *d);
> --- a/xen/arch/x86/include/asm/page.h
> +++ b/xen/arch/x86/include/asm/page.h
> @@ -349,8 +349,10 @@ void efi_update_l4_pgtable(unsigned int
>  #define __PAGE_HYPERVISOR_RX      (_PAGE_PRESENT | _PAGE_ACCESSED)
>  #define __PAGE_HYPERVISOR         (__PAGE_HYPERVISOR_RX | \
>                                     _PAGE_DIRTY | _PAGE_RW)
> +#define __PAGE_HYPERVISOR_WT      (__PAGE_HYPERVISOR | _PAGE_PWT)
>  #define __PAGE_HYPERVISOR_UCMINUS (__PAGE_HYPERVISOR | _PAGE_PCD)
>  #define __PAGE_HYPERVISOR_UC      (__PAGE_HYPERVISOR | _PAGE_PCD | _PAGE_PWT)
> +#define __PAGE_HYPERVISOR_WC      (__PAGE_HYPERVISOR | _PAGE_PAT)
>  #define __PAGE_HYPERVISOR_SHSTK   (__PAGE_HYPERVISOR_RO | _PAGE_DIRTY)
>  
>  #define MAP_SMALL_PAGES _PAGE_AVAIL0 /* don't use superpages mappings */
> --- a/xen/arch/x86/include/asm/x86_64/page.h
> +++ b/xen/arch/x86/include/asm/x86_64/page.h
> @@ -152,6 +152,10 @@ static inline intpte_t put_pte_flags(uns
>                                   _PAGE_GLOBAL | _PAGE_NX)
>  #define PAGE_HYPERVISOR_UC      (__PAGE_HYPERVISOR_UC | \
>                                   _PAGE_GLOBAL | _PAGE_NX)
> +#define PAGE_HYPERVISOR_WC      (__PAGE_HYPERVISOR_WC | \
> +                                 _PAGE_GLOBAL | _PAGE_NX)
> +#define PAGE_HYPERVISOR_WT      (__PAGE_HYPERVISOR_WT | \
> +                                 _PAGE_GLOBAL | _PAGE_NX)
>  
>  #endif /* __X86_64_PAGE_H__ */
>  
> --- a/xen/arch/x86/mm.c
> +++ b/xen/arch/x86/mm.c
> @@ -5895,6 +5895,20 @@ void __iomem *ioremap(paddr_t pa, size_t
>      return (void __force __iomem *)va;
>  }
>  
> +void __iomem *__init ioremap_wc(paddr_t pa, size_t len)
> +{
> +    mfn_t mfn = _mfn(PFN_DOWN(pa));
> +    unsigned int offs = pa & (PAGE_SIZE - 1);
> +    unsigned int nr = PFN_UP(offs + len);
> +    void *va;
> +
> +    WARN_ON(page_is_ram_type(mfn_x(mfn), RAM_TYPE_CONVENTIONAL));
> +
> +    va = __vmap(&mfn, nr, 1, 1, PAGE_HYPERVISOR_WC, VMAP_DEFAULT);
> +
> +    return (void __force __iomem *)(va + offs);
> +}
> +
>  int create_perdomain_mapping(struct domain *d, unsigned long va,
>                               unsigned int nr, l1_pgentry_t **pl1tab,
>                               struct page_info **ppg)
> --- a/xen/drivers/video/vesa.c
> +++ b/xen/drivers/video/vesa.c
> @@ -9,9 +9,9 @@
>  #include <xen/param.h>
>  #include <xen/xmalloc.h>
>  #include <xen/kernel.h>
> +#include <xen/mm.h>
>  #include <xen/vga.h>
>  #include <asm/io.h>
> -#include <asm/page.h>
>  #include "font.h"
>  #include "lfb.h"
>  
> @@ -103,7 +103,7 @@ void __init vesa_init(void)
>      lfbp.text_columns = vlfb_info.width / font->width;
>      lfbp.text_rows = vlfb_info.height / font->height;
>  
> -    lfbp.lfb = lfb = ioremap(lfb_base(), vram_remap);
> +    lfbp.lfb = lfb = ioremap_wc(lfb_base(), vram_remap);
>      if ( !lfb )
>          return;
>  
> @@ -179,8 +179,7 @@ void __init vesa_mtrr_init(void)
>  
>  static void lfb_flush(void)
>  {
> -    if ( vesa_mtrr == 3 )
> -        __asm__ __volatile__ ("sfence" : : : "memory");
> +    __asm__ __volatile__ ("sfence" : : : "memory");

Now that the cache attribute is forced to WC using PAT, don't we need
to drop vesa_mtrr_init() and vesa_mtrr? All the more so since the
option is entirely undocumented.

>  }
>  
>  void __init vesa_endboot(bool_t keep)
> --- a/xen/drivers/video/vga.c
> +++ b/xen/drivers/video/vga.c
> @@ -79,7 +79,7 @@ void __init video_init(void)
>      {
>      case XEN_VGATYPE_TEXT_MODE_3:
>          if ( page_is_ram_type(paddr_to_pfn(0xB8000), RAM_TYPE_CONVENTIONAL) ||
> -             ((video = ioremap(0xB8000, 0x8000)) == NULL) )
> +             ((video = ioremap_wc(0xB8000, 0x8000)) == NULL) )
>              return;
>          outw(0x200a, 0x3d4); /* disable cursor */
>          columns = vga_console_info.u.text_mode_3.columns;
> @@ -164,7 +164,11 @@ void __init video_endboot(void)
>      {
>      case XEN_VGATYPE_TEXT_MODE_3:
>          if ( !vgacon_keep )
> +        {
>              memset(video, 0, columns * lines * 2);
> +            iounmap(video);
> +            video = ZERO_BLOCK_PTR;
> +        }
>          break;
>      case XEN_VGATYPE_VESA_LFB:
>      case XEN_VGATYPE_EFI_LFB:

I think vesa_endboot() also needs to iounmap() the framebuffer
mapping?

I would assume this was also required before your change, yet I'm not
finding any iounmap call that would do it.

Thanks, Roger.

Reply via email to