Hi,

On 10/02/16 19:59, Bob Peterson wrote:
Hi,

----- Original Message -----
Hi,

On 18/12/15 18:42, Bob Peterson wrote:
Hi,

This patch introduces a new glock workqueue, gfs2_glock_final. The
workqueue's only job is to call dlm's unlock.
This prevents gfs2_evict_inode from calling dlm directly, which
might block waiting for DLM to complete the unlock, which in turn may
be waiting for something like a fence operation. By moving the unlock
to its own workqueue, the final put happens later, which allows the
shrinker to continue and free memory, avoiding a livelock.
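
(For illustration only, here is a minimal sketch of the deferred-unlock idea
described above. It is not the code that was posted; the gl_final work_struct
field and the handler name are assumptions made for the sketch.)

static struct workqueue_struct *gfs2_glock_final; /* created at init, e.g. alloc_workqueue() */

static void glock_final_func(struct work_struct *work)
{
        /* Assumes a work_struct member gl_final embedded in struct gfs2_glock */
        struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_final);
        struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;

        /* The DLM unlock, which may block behind a pending fence operation,
           now runs here instead of on the evict/shrinker path. */
        sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
}

/* ... and gfs2_glock_put() would queue the work instead of calling
   lm_put_lock() directly: */
        INIT_WORK(&gl->gl_final, glock_final_func);
        queue_work(gfs2_glock_final, &gl->gl_final);

The objection below is precisely that once the unlock has been queued like
this, nothing orders it against a fresh lock request for the same lock name
if the inode is recreated in the meantime.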
I don't think this is going to work... if the DLM lock put is delayed
then it would be possible for a new glock to be created (or the existing
one to be reused) if the inode gets recreated, and then there is no
coordination between the unlock and the new lock request. If those were
to be sent in the wrong order, then things are going to get very confused.

If you add the coordination then we are back to square one, in terms of
the deadlock in this loop. The only way we can stop this is to try and
avoid inode deallocation in the eviction path. By the time we have
decided to do the eviction, it is already too late,

Steve.
Okay, I scrapped that idea and decided to do something very similar to
what GFS1 did. That is, to use a daemon to eliminate glocks that are no
longer needed. Rather than introduce a new daemon, I'm adding the new
function to the existing quota daemon. This version seems to hold up just
as well under the scenario I've been struggling with for so long.

What do you think? If we can't do this, I'm out of ideas on how to solve it.
I don't think it is a good plan to go back to that system. It caused problems which we've long since solved. Let's take a step back and see if we can find a different solution here. I know it is not easy, and this code path is one of the trickiest in the whole of GFS2, which is why we need to be very careful.

I'm going to try to block out some time tomorrow to look at the remaining patches you posted back in December that are still waiting for review, and I'll think a bit more about this issue at the same time, since they are all connected,

Steve.

Regards,

Bob Peterson
Red Hat File Systems
---
Patch description:

This patch adds a new rubbish list for glocks. When a glock's reference
count goes to zero, it gets thrown on the rubbish list rather than
being freed and unlocked in dlm immediately. If we tell dlm to unlock
immediately, we run the risk of a livelock in which GFS2 waits forever
for DLM to release the lock, DLM waits forever for a pending fence
action, and the fence action waits forever for memory, which is
itself blocked on the inode shrinker, which is trying to free inodes.

With this new method, the glock gets tossed onto the rubbish list
and the inode free may continue. Thus, the shrinker may keep
running, avoiding the livelock. The glocks on the rubbish list are
processed periodically by the quota daemon, which is very similar to
what GFS1 used to do (although GFS1 had several daemons; one was
dedicated to this job).

When the daemon processes the rubbish list, it first checks whether
the glock has been recreated in the hash table. If so, it must not
instruct DLM to unlock it; the new lock supersedes the dead one.

Signed-off-by: Bob Peterson <rpete...@redhat.com>

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 8dedece..703a4fd 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -67,6 +67,8 @@ struct workqueue_struct *gfs2_delete_workqueue;
  static LIST_HEAD(lru_list);
  static atomic_t lru_count = ATOMIC_INIT(0);
  static DEFINE_SPINLOCK(lru_lock);
+static LIST_HEAD(rubbish_list);
+static DEFINE_SPINLOCK(rubbish_lock);
#define GFS2_GL_HASH_SHIFT 15
  #define GFS2_GL_HASH_SIZE       (1 << GFS2_GL_HASH_SHIFT)
@@ -160,7 +162,6 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
void gfs2_glock_put(struct gfs2_glock *gl)
  {
-       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
        struct address_space *mapping = gfs2_glock2aspace(gl);
if (lockref_put_or_lock(&gl->gl_lockref))
@@ -173,8 +174,44 @@ void gfs2_glock_put(struct gfs2_glock *gl)
        rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms);
        GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
        GLOCK_BUG_ON(gl, mapping && mapping->nrpages);
-       trace_gfs2_glock_put(gl);
-       sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
+       spin_lock(&rubbish_lock);
+       list_add_tail(&gl->gl_lru, &rubbish_list);
+       spin_unlock(&rubbish_lock);
+}
+
+/**
+ * gfs2_rubbish_removal - destroy glocks that are no longer wanted
+ */
+long gfs2_rubbish_removal(long count)
+{
+       struct gfs2_glock *gl;
+       struct gfs2_sbd *sdp;
+       long removed = 0;
+
+       spin_lock(&rubbish_lock);
+       while(count-- && !list_empty(&rubbish_list)) {
+               gl = list_entry(rubbish_list.next, struct gfs2_glock, gl_lru);
+               BUG_ON(!__lockref_is_dead(&gl->gl_lockref));
+               sdp = gl->gl_name.ln_sbd;
+               list_del_init(&gl->gl_lru);
+               spin_unlock(&rubbish_lock);
+
+               /* If it was recreated after being sacked, don't tell dlm to
+                  unlock. The new one supersedes the one in the rubbish. */
+               if (rhashtable_lookup_fast(&gl_hash_table, &gl->gl_name,
+                                          ht_parms)) {
+                       gfs2_glock_free(gl);
+               } else { /* was not recreated */
+                       trace_gfs2_glock_put(gl);
+                       sdp->sd_lockstruct.ls_ops->lm_put_lock(gl);
+               }
+               removed++;
+               if (count && !list_empty(&rubbish_list))
+                       cond_resched();
+               spin_lock(&rubbish_lock);
+       }
+       spin_unlock(&rubbish_lock);
+       return removed;
  }
/**
@@ -1503,13 +1540,21 @@ static void dump_glock_func(struct gfs2_glock *gl)
void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
  {
+       long removed;
+       int i;
+
        set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags);
        flush_workqueue(glock_workqueue);
        glock_hash_walk(clear_glock, sdp);
        flush_workqueue(glock_workqueue);
-       wait_event_timeout(sdp->sd_glock_wait,
-                          atomic_read(&sdp->sd_glock_disposal) == 0,
-                          HZ * 600);
+       for (i = 0; i < 600; i++) {
+               removed = gfs2_rubbish_removal(LONG_MAX);
+               wait_event_timeout(sdp->sd_glock_wait,
+                                  atomic_read(&sdp->sd_glock_disposal) == 0,
+                                  HZ);
+               if (atomic_read(&sdp->sd_glock_disposal) == 0)
+                       break;
+       }
        glock_hash_walk(dump_glock_func, sdp);
  }
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 46ab67f..5fe645f 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -203,6 +203,7 @@ extern void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
  #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0)
  extern __printf(2, 3)
  void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
+extern long gfs2_rubbish_removal(long count);
/**
   * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index be6d9c4..c343c2e 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1534,6 +1534,7 @@ int gfs2_quotad(void *data)
        unsigned long t = 0;
        DEFINE_WAIT(wait);
        int empty;
+       long removed;
        while (!kthread_should_stop()) {
@@ -1555,6 +1556,9 @@ int gfs2_quotad(void *data)
                /* Check for & recover partially truncated inodes */
                quotad_check_trunc_list(sdp);

+               /* Destroy any glocks that need destroying */
+               removed = gfs2_rubbish_removal(2500);
+
                try_to_freeze();
t = min(quotad_timeo, statfs_timeo);
@@ -1563,7 +1567,7 @@ int gfs2_quotad(void *data)
                spin_lock(&sdp->sd_trunc_lock);
                empty = list_empty(&sdp->sd_trunc_list);
                spin_unlock(&sdp->sd_trunc_lock);
-               if (empty && !sdp->sd_statfs_force_sync)
+               if (empty && removed == 0 && !sdp->sd_statfs_force_sync)
                        t -= schedule_timeout(t);
                else
                        t = 0;
