On Sun, Jan 28, 2007 at 03:11:34PM +0100, Peter Zijlstra wrote:
> Eradicate global locks.
> 
>  - kmap_lock is removed by extensive use of atomic_t, a new flush
>    scheme and modifying set_page_address to only allow NULL<->virt
>    transitions.

What's the point for this?  Extensive atomic_t use is usually much worse
than spinlocks.  A spinlock region is just a single atomic instruction,
as soon as you do more than one atomic_t you tend to make scalability
worse.  Not to mention that atomic_t are much worse when you try to
profile scalability issues.

What benchmark shows a problem with the current locking, and from what
caller?  In doubt we just need to convert that caller to kmap_atomic.


> 
> A count of 0 is an exclusive state acting as an entry lock. This is done
> using inc_not_zero and cmpxchg. The restriction on changing the virtual
> address closes the gap with concurrent additions of the same entry.
> 
>  - pool_lock is removed by using the pkmap index for the
>    page_address_maps.
> 
> By using the pkmap index for the hash entries it is no longer needed to
> keep a free list.
> 
> This patch has been in -rt for a while but should also help regular
> highmem machines with multiple cores/cpus.
> 
> Signed-off-by: Peter Zijlstra <[EMAIL PROTECTED]>
> ---
>  include/linux/mm.h |   32 ++-
>  mm/highmem.c       |  433 
> ++++++++++++++++++++++++++++++-----------------------
>  2 files changed, 276 insertions(+), 189 deletions(-)
> 
> Index: linux/include/linux/mm.h
> ===================================================================
> --- linux.orig/include/linux/mm.h
> +++ linux/include/linux/mm.h
> @@ -543,23 +543,39 @@ static __always_inline void *lowmem_page
>  #endif
>  
>  #if defined(WANT_PAGE_VIRTUAL)
> -#define page_address(page) ((page)->virtual)
> -#define set_page_address(page, address)                      \
> -     do {                                            \
> -             (page)->virtual = (address);            \
> -     } while(0)
> -#define page_address_init()  do { } while(0)
> +/*
> + * wrap page->virtual so it is safe to set/read locklessly
> + */
> +#define page_address(page) \
> +     ({ typeof((page)->virtual) v = (page)->virtual; \
> +      smp_read_barrier_depends(); \
> +      v; })
> +
> +static inline int set_page_address(struct page *page, void *address)
> +{
> +     if (address)
> +             return cmpxchg(&page->virtual, NULL, address) == NULL;
> +     else {
> +             /*
> +              * cmpxchg is a bit abused because it is not guaranteed
> +              * safe wrt direct assignment on all platforms.
> +              */
> +             void *virt = page->virtual;
> +             return cmpxchg(&page->vitrual, virt, NULL) == virt;
> +     }
> +}
> +void page_address_init(void);
>  #endif
>  
>  #if defined(HASHED_PAGE_VIRTUAL)
>  void *page_address(struct page *page);
> -void set_page_address(struct page *page, void *virtual);
> +int set_page_address(struct page *page, void *virtual);
>  void page_address_init(void);
>  #endif
>  
>  #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL)
>  #define page_address(page) lowmem_page_address(page)
> -#define set_page_address(page, address)  do { } while(0)
> +#define set_page_address(page, address)  (0)
>  #define page_address_init()  do { } while(0)
>  #endif
>  
> Index: linux/mm/highmem.c
> ===================================================================
> --- linux.orig/mm/highmem.c
> +++ linux/mm/highmem.c
> @@ -14,6 +14,11 @@
>   * based on Linus' idea.
>   *
>   * Copyright (C) 1999 Ingo Molnar <[EMAIL PROTECTED]>
> + *
> + * Largely rewritten to get rid of all global locks
> + *
> + * Copyright (C) 2006 Red Hat, Inc., Peter Zijlstra <[EMAIL PROTECTED]>
> + *
>   */
>  
>  #include <linux/mm.h>
> @@ -27,18 +32,14 @@
>  #include <linux/hash.h>
>  #include <linux/highmem.h>
>  #include <linux/blktrace_api.h>
> +
>  #include <asm/tlbflush.h>
> +#include <asm/pgtable.h>
>  
> -/*
> - * Virtual_count is not a pure "count".
> - *  0 means that it is not mapped, and has not been mapped
> - *    since a TLB flush - it is usable.
> - *  1 means that there are no users, but it has been mapped
> - *    since the last TLB flush - so we can't use it.
> - *  n means that there are (n-1) current users of it.
> - */
>  #ifdef CONFIG_HIGHMEM
>  
> +static int __set_page_address(struct page *page, void *virtual, int pos);
> +
>  unsigned long totalhigh_pages __read_mostly;
>  
>  unsigned int nr_free_highpages (void)
> @@ -52,164 +53,208 @@ unsigned int nr_free_highpages (void)
>       return pages;
>  }
>  
> -static int pkmap_count[LAST_PKMAP];
> -static unsigned int last_pkmap_nr;
> -static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
> +/*
> + * count is not a pure "count".
> + *  0 means its owned exclusively by someone
> + *  1 means its free for use - either mapped or not.
> + *  n means that there are (n-1) current users of it.
> + */
> +static atomic_t pkmap_count[LAST_PKMAP];
> +static atomic_t pkmap_hand;
>  
>  pte_t * pkmap_page_table;
>  
>  static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
>  
> -static void flush_all_zero_pkmaps(void)
> +/*
> + * Try to free a given kmap slot.
> + *
> + * Returns:
> + *  -1 - in use
> + *   0 - free, no TLB flush needed
> + *   1 - free, needs TLB flush
> + */
> +static int pkmap_try_free(int pos)
>  {
> -     int i;
> -
> -     flush_cache_kmaps();
> +     if (atomic_cmpxchg(&pkmap_count[pos], 1, 0) != 1)
> +             return -1;
>  
> -     for (i = 0; i < LAST_PKMAP; i++) {
> -             struct page *page;
> +     /*
> +      * TODO: add a young bit to make it CLOCK
> +      */
> +     if (!pte_none(pkmap_page_table[pos])) {
> +             struct page *page = pte_page(pkmap_page_table[pos]);
> +             unsigned long addr = PKMAP_ADDR(pos);
> +             pte_t *ptep = &pkmap_page_table[pos];
> +
> +             VM_BUG_ON(addr != (unsigned long)page_address(page));
> +
> +             if (!__set_page_address(page, NULL, pos))
> +                     BUG();
> +             flush_kernel_dcache_page(page);
> +             pte_clear(&init_mm, addr, ptep);
>  
> -             /*
> -              * zero means we don't have anything to do,
> -              * >1 means that it is still in use. Only
> -              * a count of 1 means that it is free but
> -              * needs to be unmapped
> -              */
> -             if (pkmap_count[i] != 1)
> -                     continue;
> -             pkmap_count[i] = 0;
> +             return 1;
> +     }
>  
> -             /* sanity check */
> -             BUG_ON(pte_none(pkmap_page_table[i]));
> +     return 0;
> +}
>  
> -             /*
> -              * Don't need an atomic fetch-and-clear op here;
> -              * no-one has the page mapped, and cannot get at
> -              * its virtual address (and hence PTE) without first
> -              * getting the kmap_lock (which is held here).
> -              * So no dangers, even with speculative execution.
> -              */
> -             page = pte_page(pkmap_page_table[i]);
> -             pte_clear(&init_mm, (unsigned long)page_address(page),
> -                       &pkmap_page_table[i]);
> +static inline void pkmap_put(atomic_t *counter)
> +{
> +     switch (atomic_dec_return(counter)) {
> +     case 0:
> +             BUG();
>  
> -             set_page_address(page, NULL);
> +     case 1:
> +             wake_up(&pkmap_map_wait);
>       }
> -     flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
>  }
>  
> -static inline unsigned long map_new_virtual(struct page *page)
> +#define TLB_BATCH    32
> +
> +static int pkmap_get_free(void)
>  {
> -     unsigned long vaddr;
> -     int count;
> +     int i, pos, flush;
> +     DECLARE_WAITQUEUE(wait, current);
>  
> -start:
> -     count = LAST_PKMAP;
> -     /* Find an empty entry */
> -     for (;;) {
> -             last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
> -             if (!last_pkmap_nr) {
> -                     flush_all_zero_pkmaps();
> -                     count = LAST_PKMAP;
> -             }
> -             if (!pkmap_count[last_pkmap_nr])
> -                     break;  /* Found a usable entry */
> -             if (--count)
> -                     continue;
> +restart:
> +     for (i = 0; i < LAST_PKMAP; i++) {
> +             pos = atomic_inc_return(&pkmap_hand) % LAST_PKMAP;
> +             flush = pkmap_try_free(pos);
> +             if (flush >= 0)
> +                     goto got_one;
> +     }
> +
> +     /*
> +      * wait for somebody else to unmap their entries
> +      */
> +     __set_current_state(TASK_UNINTERRUPTIBLE);
> +     add_wait_queue(&pkmap_map_wait, &wait);
> +     schedule();
> +     remove_wait_queue(&pkmap_map_wait, &wait);
> +
> +     goto restart;
> +
> +got_one:
> +     if (flush) {
> +#if 0
> +             flush_tlb_kernel_range(PKMAP_ADDR(pos), PKMAP_ADDR(pos+1));
> +#else
> +             int pos2 = (pos + 1) % LAST_PKMAP;
> +             int nr;
> +             int entries[TLB_BATCH];
>  
>               /*
> -              * Sleep for somebody else to unmap their entries
> +              * For those architectures that cannot help but flush the
> +              * whole TLB, flush some more entries to make it worthwhile.
> +              * Scan ahead of the hand to minimise search distances.
>                */
> -             {
> -                     DECLARE_WAITQUEUE(wait, current);
> +             for (i = 0, nr = 0; i < LAST_PKMAP && nr < TLB_BATCH;
> +                             i++, pos2 = (pos2 + 1) % LAST_PKMAP) {
>  
> -                     __set_current_state(TASK_UNINTERRUPTIBLE);
> -                     add_wait_queue(&pkmap_map_wait, &wait);
> -                     spin_unlock(&kmap_lock);
> -                     schedule();
> -                     remove_wait_queue(&pkmap_map_wait, &wait);
> -                     spin_lock(&kmap_lock);
> -
> -                     /* Somebody else might have mapped it while we slept */
> -                     if (page_address(page))
> -                             return (unsigned long)page_address(page);
> +                     flush = pkmap_try_free(pos2);
> +                     if (flush < 0)
> +                             continue;
> +
> +                     if (!flush) {
> +                             atomic_t *counter = &pkmap_count[pos2];
> +                             VM_BUG_ON(atomic_read(counter) != 0);
> +                             atomic_set(counter, 2);
> +                             pkmap_put(counter);
> +                     } else
> +                             entries[nr++] = pos2;
> +             }
> +             flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
>  
> -                     /* Re-start */
> -                     goto start;
> +             for (i = 0; i < nr; i++) {
> +                     atomic_t *counter = &pkmap_count[entries[i]];
> +                     VM_BUG_ON(atomic_read(counter) != 0);
> +                     atomic_set(counter, 2);
> +                     pkmap_put(counter);
>               }
> +#endif
>       }
> -     vaddr = PKMAP_ADDR(last_pkmap_nr);
> -     set_pte_at(&init_mm, vaddr,
> -                &(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));
> +     return pos;
> +}
> +
> +static unsigned long pkmap_insert(struct page *page)
> +{
> +     int pos = pkmap_get_free();
> +     unsigned long vaddr = PKMAP_ADDR(pos);
> +     pte_t *ptep = &pkmap_page_table[pos];
> +     pte_t entry = mk_pte(page, kmap_prot);
> +     atomic_t *counter = &pkmap_count[pos];
> +
> +     VM_BUG_ON(atomic_read(counter) != 0);
>  
> -     pkmap_count[last_pkmap_nr] = 1;
> -     set_page_address(page, (void *)vaddr);
> +     set_pte_at(&init_mm, vaddr, ptep, entry);
> +     if (unlikely(!__set_page_address(page, (void *)vaddr, pos))) {
> +             /*
> +              * concurrent pkmap_inserts for this page -
> +              * the other won the race, release this entry.
> +              *
> +              * we can still clear the pte without a tlb flush since
> +              * it couldn't have been used yet.
> +              */
> +             pte_clear(&init_mm, vaddr, ptep);
> +             VM_BUG_ON(atomic_read(counter) != 0);
> +             atomic_set(counter, 2);
> +             pkmap_put(counter);
> +             vaddr = 0;
> +     } else
> +             atomic_set(counter, 2);
>  
>       return vaddr;
>  }
>  
> -void fastcall *kmap_high(struct page *page)
> +fastcall void *kmap_high(struct page *page)
>  {
>       unsigned long vaddr;
> -
> -     /*
> -      * For highmem pages, we can't trust "virtual" until
> -      * after we have the lock.
> -      *
> -      * We cannot call this from interrupts, as it may block
> -      */
> -     spin_lock(&kmap_lock);
> +again:
>       vaddr = (unsigned long)page_address(page);
> +     if (vaddr) {
> +             atomic_t *counter = &pkmap_count[PKMAP_NR(vaddr)];
> +             if (atomic_inc_not_zero(counter)) {
> +                     /*
> +                      * atomic_inc_not_zero implies a (memory) barrier on 
> success
> +                      * so page address will be reloaded.
> +                      */
> +                     unsigned long vaddr2 = (unsigned 
> long)page_address(page);
> +                     if (likely(vaddr == vaddr2))
> +                             return (void *)vaddr;
> +
> +                     /*
> +                      * Oops, we got someone else.
> +                      *
> +                      * This can happen if we get preempted after
> +                      * page_address() and before atomic_inc_not_zero()
> +                      * and during that preemption this slot is freed and
> +                      * reused.
> +                      */
> +                     pkmap_put(counter);
> +                     goto again;
> +             }
> +     }
> +
> +     vaddr = pkmap_insert(page);
>       if (!vaddr)
> -             vaddr = map_new_virtual(page);
> -     pkmap_count[PKMAP_NR(vaddr)]++;
> -     BUG_ON(pkmap_count[PKMAP_NR(vaddr)] < 2);
> -     spin_unlock(&kmap_lock);
> -     return (void*) vaddr;
> +             goto again;
> +
> +     return (void *)vaddr;
>  }
>  
>  EXPORT_SYMBOL(kmap_high);
>  
> -void fastcall kunmap_high(struct page *page)
> +fastcall void kunmap_high(struct page *page)
>  {
> -     unsigned long vaddr;
> -     unsigned long nr;
> -     int need_wakeup;
> -
> -     spin_lock(&kmap_lock);
> -     vaddr = (unsigned long)page_address(page);
> +     unsigned long vaddr = (unsigned long)page_address(page);
>       BUG_ON(!vaddr);
> -     nr = PKMAP_NR(vaddr);
> -
> -     /*
> -      * A count must never go down to zero
> -      * without a TLB flush!
> -      */
> -     need_wakeup = 0;
> -     switch (--pkmap_count[nr]) {
> -     case 0:
> -             BUG();
> -     case 1:
> -             /*
> -              * Avoid an unnecessary wake_up() function call.
> -              * The common case is pkmap_count[] == 1, but
> -              * no waiters.
> -              * The tasks queued in the wait-queue are guarded
> -              * by both the lock in the wait-queue-head and by
> -              * the kmap_lock.  As the kmap_lock is held here,
> -              * no need for the wait-queue-head's lock.  Simply
> -              * test if the queue is empty.
> -              */
> -             need_wakeup = waitqueue_active(&pkmap_map_wait);
> -     }
> -     spin_unlock(&kmap_lock);
> -
> -     /* do wake-up, if needed, race-free outside of the spin lock */
> -     if (need_wakeup)
> -             wake_up(&pkmap_map_wait);
> +     pkmap_put(&pkmap_count[PKMAP_NR(vaddr)]);
>  }
>  
>  EXPORT_SYMBOL(kunmap_high);
> +
>  #endif
>  
>  #if defined(HASHED_PAGE_VIRTUAL)
> @@ -217,19 +262,13 @@ EXPORT_SYMBOL(kunmap_high);
>  #define PA_HASH_ORDER        7
>  
>  /*
> - * Describes one page->virtual association
> + * Describes one page->virtual address association.
>   */
> -struct page_address_map {
> +static struct page_address_map {
>       struct page *page;
>       void *virtual;
>       struct list_head list;
> -};
> -
> -/*
> - * page_address_map freelist, allocated from page_address_maps.
> - */
> -static struct list_head page_address_pool;   /* freelist */
> -static spinlock_t pool_lock;                 /* protects page_address_pool */
> +} page_address_maps[LAST_PKMAP];
>  
>  /*
>   * Hash table bucket
> @@ -244,91 +283,123 @@ static struct page_address_slot *page_sl
>       return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
>  }
>  
> -void *page_address(struct page *page)
> +static void *__page_address(struct page_address_slot *pas, struct page *page)
>  {
> -     unsigned long flags;
> -     void *ret;
> -     struct page_address_slot *pas;
> -
> -     if (!PageHighMem(page))
> -             return lowmem_page_address(page);
> +     void *ret = NULL;
>  
> -     pas = page_slot(page);
> -     ret = NULL;
> -     spin_lock_irqsave(&pas->lock, flags);
>       if (!list_empty(&pas->lh)) {
>               struct page_address_map *pam;
>  
>               list_for_each_entry(pam, &pas->lh, list) {
>                       if (pam->page == page) {
>                               ret = pam->virtual;
> -                             goto done;
> +                             break;
>                       }
>               }
>       }
> -done:
> +
> +     return ret;
> +}
> +
> +void *page_address(struct page *page)
> +{
> +     unsigned long flags;
> +     void *ret;
> +     struct page_address_slot *pas;
> +
> +     if (!PageHighMem(page))
> +             return lowmem_page_address(page);
> +
> +     pas = page_slot(page);
> +     spin_lock_irqsave(&pas->lock, flags);
> +     ret = __page_address(pas, page);
>       spin_unlock_irqrestore(&pas->lock, flags);
>       return ret;
>  }
>  
>  EXPORT_SYMBOL(page_address);
>  
> -void set_page_address(struct page *page, void *virtual)
> +static int __set_page_address(struct page *page, void *virtual, int pos)
>  {
> +     int ret = 0;
>       unsigned long flags;
>       struct page_address_slot *pas;
>       struct page_address_map *pam;
>  
> -     BUG_ON(!PageHighMem(page));
> +     VM_BUG_ON(!PageHighMem(page));
> +     VM_BUG_ON(atomic_read(&pkmap_count[pos]) != 0);
> +     VM_BUG_ON(pos < 0 || pos >= LAST_PKMAP);
>  
>       pas = page_slot(page);
> -     if (virtual) {          /* Add */
> -             BUG_ON(list_empty(&page_address_pool));
> +     pam = &page_address_maps[pos];
>  
> -             spin_lock_irqsave(&pool_lock, flags);
> -             pam = list_entry(page_address_pool.next,
> -                             struct page_address_map, list);
> -             list_del(&pam->list);
> -             spin_unlock_irqrestore(&pool_lock, flags);
> -
> -             pam->page = page;
> -             pam->virtual = virtual;
> -
> -             spin_lock_irqsave(&pas->lock, flags);
> -             list_add_tail(&pam->list, &pas->lh);
> -             spin_unlock_irqrestore(&pas->lock, flags);
> -     } else {                /* Remove */
> -             spin_lock_irqsave(&pas->lock, flags);
> -             list_for_each_entry(pam, &pas->lh, list) {
> -                     if (pam->page == page) {
> -                             list_del(&pam->list);
> -                             spin_unlock_irqrestore(&pas->lock, flags);
> -                             spin_lock_irqsave(&pool_lock, flags);
> -                             list_add_tail(&pam->list, &page_address_pool);
> -                             spin_unlock_irqrestore(&pool_lock, flags);
> -                             goto done;
> -                     }
> +     spin_lock_irqsave(&pas->lock, flags);
> +     if (virtual) { /* add */
> +             VM_BUG_ON(!list_empty(&pam->list));
> +
> +             if (!__page_address(pas, page)) {
> +                     pam->page = page;
> +                     pam->virtual = virtual;
> +                     list_add_tail(&pam->list, &pas->lh);
> +                     ret = 1;
> +             }
> +     } else { /* remove */
> +             if (!list_empty(&pam->list)) {
> +                     list_del_init(&pam->list);
> +                     ret = 1;
>               }
> -             spin_unlock_irqrestore(&pas->lock, flags);
>       }
> -done:
> -     return;
> +     spin_unlock_irqrestore(&pas->lock, flags);
> +
> +     return ret;
>  }
>  
> -static struct page_address_map page_address_maps[LAST_PKMAP];
> +int set_page_address(struct page *page, void *virtual)
> +{
> +     /*
> +      * set_page_address is not supposed to be called when using
> +      * hashed virtual addresses.
> +      */
> +     BUG();
> +     return 0;
> +}
>  
> -void __init page_address_init(void)
> +void __init __page_address_init(void)
>  {
>       int i;
>  
> -     INIT_LIST_HEAD(&page_address_pool);
>       for (i = 0; i < ARRAY_SIZE(page_address_maps); i++)
> -             list_add(&page_address_maps[i].list, &page_address_pool);
> +             INIT_LIST_HEAD(&page_address_maps[i].list);
> +
>       for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) {
>               INIT_LIST_HEAD(&page_address_htable[i].lh);
>               spin_lock_init(&page_address_htable[i].lock);
>       }
> -     spin_lock_init(&pool_lock);
> +}
> +
> +#elif defined (CONFIG_HIGHMEM) /* HASHED_PAGE_VIRTUAL */
> +
> +static int __set_page_address(struct page *page, void *virtual, int pos)
> +{
> +     return set_page_address(page, virtual);
>  }
>  
>  #endif       /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
> +
> +#if defined(CONFIG_HIGHMEM) || defined(HASHED_PAGE_VIRTUAL)
> +
> +void __init page_address_init(void)
> +{
> +#ifdef CONFIG_HIGHMEM
> +     int i;
> +
> +     for (i = 0; i < ARRAY_SIZE(pkmap_count); i++)
> +             atomic_set(&pkmap_count[i], 1);
> +#endif
> +
> +#ifdef HASHED_PAGE_VIRTUAL
> +     __page_address_init();
> +#endif
> +}
> +
> +#endif
> 
> 
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [EMAIL PROTECTED]
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
---end quoted text---
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to