Originally get_swap_page() started iterating through the singly-linked
list of swap_info_structs using swap_list.next or highest_priority_index,
both of which were intended to point to the highest priority active swap
target that was not full.  The previous patch in this series changed the
singly-linked list to a doubly-linked list, and removed the logic to start
at the highest priority non-full entry; get_swap_page() now starts scanning
at the highest priority entry each time, even if that entry is full.

Add a new list, also priority ordered, to track only swap_info_structs
that are available, i.e. active and not full.  Use a new spinlock so that
entries can be added/removed outside of get_swap_page; that wasn't possible
previously because the main list is protected by swap_lock, which can't be
taken when holding a swap_info_struct->lock because of locking order.
The get_swap_page() logic now does not need to hold the swap_lock, and it
iterates only through swap_info_structs that are available.

Signed-off-by: Dan Streetman <[email protected]>


---
 include/linux/swap.h |   1 +
 mm/swapfile.c        | 128 ++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 87 insertions(+), 42 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 96662d8..d9263db 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -214,6 +214,7 @@ struct percpu_cluster {
 struct swap_info_struct {
        unsigned long   flags;          /* SWP_USED etc: see above */
        signed short    prio;           /* swap priority of this type */
+       struct list_head prio_list;     /* entry in priority list */
        struct list_head list;          /* entry in swap list */
        signed char     type;           /* strange name for an index */
        unsigned int    max;            /* extent of the swap_map */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b958645..3c38461 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -57,9 +57,13 @@ static const char Unused_file[] = "Unused swap file entry ";
 static const char Bad_offset[] = "Bad swap offset entry ";
 static const char Unused_offset[] = "Unused swap offset entry ";
 
-/* all active swap_info */
+/* all active swap_info; protected with swap_lock */
 LIST_HEAD(swap_list_head);
 
+/* all available (active, not full) swap_info, priority ordered */
+static LIST_HEAD(prio_head);
+static DEFINE_SPINLOCK(prio_lock);
+
 struct swap_info_struct *swap_info[MAX_SWAPFILES];
 
 static DEFINE_MUTEX(swapon_mutex);
@@ -73,6 +77,27 @@ static inline unsigned char swap_count(unsigned char ent)
        return ent & ~SWAP_HAS_CACHE;   /* may include SWAP_HAS_CONT flag */
 }
 
+/*
+ * add, in priority order, swap_info (p)->(le) list_head to list (lh)
+ * this list-generic function is needed because both swap_list_head
+ * and prio_head need to be priority ordered:
+ * swap_list_head in swapoff to adjust lower negative prio swap_infos
+ * prio_head in get_swap_page to scan highest prio swap_info first
+ */
+#define swap_info_list_add(p, lh, le) do {                     \
+       struct swap_info_struct *_si;                           \
+       BUG_ON(!list_empty(&(p)->le));                          \
+       list_for_each_entry(_si, (lh), le) {                    \
+               if ((p)->prio >= _si->prio) {                   \
+                       list_add_tail(&(p)->le, &_si->le);      \
+                       break;                                  \
+               }                                               \
+       }                                                       \
+       /* lh empty, or p lowest prio */                        \
+       if (list_empty(&(p)->le))                               \
+               list_add_tail(&(p)->le, (lh));                  \
+} while (0)
+
 /* returns 1 if swap entry is freed */
 static int
 __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
@@ -591,6 +616,9 @@ checks:
        if (si->inuse_pages == si->pages) {
                si->lowest_bit = si->max;
                si->highest_bit = 0;
+               spin_lock(&prio_lock);
+               list_del_init(&si->prio_list);
+               spin_unlock(&prio_lock);
        }
        si->swap_map[offset] = usage;
        inc_cluster_info_page(si, si->cluster_info, offset);
@@ -642,53 +670,68 @@ swp_entry_t get_swap_page(void)
 {
        struct swap_info_struct *si, *next;
        pgoff_t offset;
-       struct list_head *tmp;
 
-       spin_lock(&swap_lock);
        if (atomic_long_read(&nr_swap_pages) <= 0)
                goto noswap;
        atomic_long_dec(&nr_swap_pages);
 
-       list_for_each(tmp, &swap_list_head) {
-               si = list_entry(tmp, typeof(*si), list);
-               spin_lock(&si->lock);
-               if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
-                       spin_unlock(&si->lock);
-                       continue;
-               }
-
+       spin_lock(&prio_lock);
+start_over:
+       list_for_each_entry_safe(si, next, &prio_head, prio_list) {
                /*
-                * rotate the current swap_info that we're going to use
+                * rotate the current swap_info that we're checking
                 * to after any other swap_info that have the same prio,
                 * so that all equal-priority swap_info get used equally
                 */
-               next = si;
-               list_for_each_entry_continue(next, &swap_list_head, list) {
-                       if (si->prio != next->prio)
+               struct swap_info_struct *eq_prio = si;
+               list_for_each_entry_continue(eq_prio, &prio_head, prio_list) {
+                       if (si->prio != eq_prio->prio)
                                break;
-                       list_rotate_left(&si->list);
-                       next = si;
+                       list_rotate_left(&si->prio_list);
+                       eq_prio = si;
+               }
+               spin_unlock(&prio_lock);
+               spin_lock(&si->lock);
+               if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
+                       spin_lock(&prio_lock);
+                       if (list_empty(&si->prio_list)) {
+                               spin_unlock(&si->lock);
+                               goto nextsi;
+                       }
+                       WARN(!si->highest_bit,
+                            "swap_info %d in list but !highest_bit\n",
+                            si->type);
+                       WARN(!(si->flags & SWP_WRITEOK),
+                            "swap_info %d in list but !SWP_WRITEOK\n",
+                            si->type);
+                       list_del_init(&si->prio_list);
+                       spin_unlock(&si->lock);
+                       goto nextsi;
                }
 
-               spin_unlock(&swap_lock);
                /* This is called for allocating swap entry for cache */
                offset = scan_swap_map(si, SWAP_HAS_CACHE);
                spin_unlock(&si->lock);
                if (offset)
                        return swp_entry(si->type, offset);
-               spin_lock(&swap_lock);
+               printk(KERN_DEBUG "scan_swap_map of si %d failed to find offset\n",
+                      si->type);
+               spin_lock(&prio_lock);
+nextsi:
                /*
-                * shouldn't really have got here, but for some reason the
-                * scan_swap_map came back empty for this swap_info.
-                * Since we dropped the swap_lock, there may now be
-                * non-full higher prio swap_infos; let's start over.
+                * shouldn't really have got here.  either si was
+                * in the prio_head list but was full or !writeok, or
+                * scan_swap_map came back empty.  Since we dropped
+                * the prio_lock, the prio_head list may have been
+                * modified; so if next is still in the prio_head
+                * list then try it, otherwise start over.
                 */
-               tmp = &swap_list_head;
+               if (list_empty(&next->prio_list))
+                       goto start_over;
        }
 
        atomic_long_inc(&nr_swap_pages);
 noswap:
-       spin_unlock(&swap_lock);
        return (swp_entry_t) {0};
 }
 
@@ -791,8 +834,17 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
                dec_cluster_info_page(p, p->cluster_info, offset);
                if (offset < p->lowest_bit)
                        p->lowest_bit = offset;
-               if (offset > p->highest_bit)
+               if (offset > p->highest_bit) {
+                       bool was_full = !p->highest_bit;
                        p->highest_bit = offset;
+                       if (was_full && (p->flags & SWP_WRITEOK)) {
+                               spin_lock(&prio_lock);
+                               if (list_empty(&p->prio_list))
+                                       swap_info_list_add(p, &prio_head,
+                                                          prio_list);
+                               spin_unlock(&prio_lock);
+                       }
+               }
                atomic_long_inc(&nr_swap_pages);
                p->inuse_pages--;
                frontswap_invalidate_page(p->type, offset);
@@ -1727,8 +1779,6 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
                                unsigned char *swap_map,
                                struct swap_cluster_info *cluster_info)
 {
-       struct swap_info_struct *si;
-
        if (prio >= 0)
                p->prio = prio;
        else
@@ -1740,20 +1790,10 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
        total_swap_pages += p->pages;
 
        assert_spin_locked(&swap_lock);
-       BUG_ON(!list_empty(&p->list));
-       /* insert into swap list: */
-       list_for_each_entry(si, &swap_list_head, list) {
-               if (p->prio >= si->prio) {
-                       list_add_tail(&p->list, &si->list);
-                       return;
-               }
-       }
-       /*
-        * this covers two cases:
-        * 1) p->prio is less than all existing prio
-        * 2) the swap list is empty
-        */
-       list_add_tail(&p->list, &swap_list_head);
+       swap_info_list_add(p, &swap_list_head, list);
+       spin_lock(&prio_lock);
+       swap_info_list_add(p, &prio_head, prio_list);
+       spin_unlock(&prio_lock);
 }
 
 static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -1827,6 +1867,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
                spin_unlock(&swap_lock);
                goto out_dput;
        }
+       spin_lock(&prio_lock);
+       list_del_init(&p->prio_list);
+       spin_unlock(&prio_lock);
        spin_lock(&p->lock);
        if (p->prio < 0) {
                struct swap_info_struct *si = p;
@@ -2101,6 +2144,7 @@ static struct swap_info_struct *alloc_swap_info(void)
        }
        INIT_LIST_HEAD(&p->first_swap_extent.list);
        INIT_LIST_HEAD(&p->list);
+       INIT_LIST_HEAD(&p->prio_list);
        p->flags = SWP_USED;
        spin_unlock(&swap_lock);
        spin_lock_init(&p->lock);
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to