Re: [Cluster-devel] [GFS2 PATCH 1/2][TRY #2] GFS2: Move glock superblock pointer to field gl_name

2015-07-13 Thread Steven Whitehouse

Hi,

On 13/07/15 14:39, Bob Peterson wrote:

Hi,

- Original Message -

Hi,

On 09/07/15 19:25, Bob Peterson wrote:

What uniquely identifies a glock in the glock hash table is not
gl_name, but gl_name and its superblock pointer. This patch makes
the gl_name field correspond to a unique glock identifier. That will
allow us to simplify hashing with a future patch, since the hash
algorithm can then take the gl_name and hash its components in one
operation.
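
For context, a minimal sketch (not from the posted patch; it assumes the hash is taken with jhash() over the whole structure) of what hashing the gl_name components in one operation could look like once ln_sbd is a member of lm_lockname:

    #include <linux/jhash.h>

    /* Sketch only: one hash call now covers number, type and superblock. */
    static unsigned int gl_hash(const struct lm_lockname *name)
    {
            return jhash(name, sizeof(*name), 0);
    }

Hashing (or memcmp-ing) the raw struct bytes like this is also why the padding hole discussed below matters.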
---

[snip]

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index a1ec7c2..4de7853 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -205,11 +205,13 @@ enum {
   struct lm_lockname {
u64 ln_number;
unsigned int ln_type;
+   struct gfs2_sbd *ln_sbd;
   };
   

This looks like its adding a hole on 64 bit arches... can we swap the
order of ln_type and ln_sbd, or even make the sbd the first element of
this? That way a memcmp of two of lm_locknames doesn't have to include
the hole in the comparison, which I assume is what you want here?
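
For reference, a hypothetical 64-bit layout with ln_sbd moved up (illustrative only, not necessarily the final patch):

    struct lm_lockname {
            u64 ln_number;                  /* bytes  0..7  */
            struct gfs2_sbd *ln_sbd;        /* bytes  8..15 */
            unsigned int ln_type;           /* bytes 16..19; 4 bytes of tail padding */
    };

With this ordering the three identifying members are contiguous, so a memcmp() or hash limited to offsetof(ln_type) + sizeof(ln_type) bytes never touches the padding.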

Steve.

Good catch. This replacement for patch #1 is identical except for
the placement of ln_sbd in the lm_lockname structure.

Regards,

Bob Peterson
Red Hat File Systems
---


That looks good now. Acked-by: Steven Whitehouse swhit...@redhat.com

Steve.


commit 0ed2b820a4f3d44e447b3e724f5355cd2f7ee4ee
Author: Bob Peterson rpete...@redhat.com
Date:   Mon Mar 16 11:52:05 2015 -0500

GFS2: Move glock superblock pointer to field gl_name

What uniquely identifies a glock in the glock hash table is not
gl_name, but gl_name and its superblock pointer. This patch makes
the gl_name field correspond to a unique glock identifier. That will
allow us to simplify hashing with a future patch, since the hash
algorithm can then take the gl_name and hash its components in one
operation.

Signed-off-by: Bob Peterson rpete...@redhat.com
---
  fs/gfs2/glock.c  | 32 +++-
  fs/gfs2/glops.c  | 38 --
  fs/gfs2/incore.h |  9 +
  fs/gfs2/lock_dlm.c   | 10 +-
  fs/gfs2/lops.c   |  6 +++---
  fs/gfs2/meta_io.c|  6 +++---
  fs/gfs2/meta_io.h|  2 +-
  fs/gfs2/quota.c  | 22 +++---
  fs/gfs2/rgrp.c   |  2 +-
  fs/gfs2/trace_gfs2.h | 18 +-
  fs/gfs2/trans.c  |  4 ++--
  11 files changed, 75 insertions(+), 74 deletions(-)

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a38e38f..25e0389 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -119,7 +119,7 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu)
  
  void gfs2_glock_free(struct gfs2_glock *gl)

  {
-   struct gfs2_sbd *sdp = gl->gl_sbd;
+   struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
   
	call_rcu(&gl->gl_rcu, gfs2_glock_dealloc);

	if (atomic_dec_and_test(&sdp->sd_glock_disposal))
@@ -192,7 +192,7 @@ static void gfs2_glock_remove_from_lru(struct gfs2_glock 
*gl)
  
  void gfs2_glock_put(struct gfs2_glock *gl)

  {
-   struct gfs2_sbd *sdp = gl->gl_sbd;
+   struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
	struct address_space *mapping = gfs2_glock2aspace(gl);
   
   	if (lockref_put_or_lock(&gl->gl_lockref))

@@ -220,7 +220,6 @@ void gfs2_glock_put(struct gfs2_glock *gl)
   */
  
  static struct gfs2_glock *search_bucket(unsigned int hash,

-   const struct gfs2_sbd *sdp,
const struct lm_lockname *name)
  {
struct gfs2_glock *gl;
@@ -229,8 +228,6 @@ static struct gfs2_glock *search_bucket(unsigned int hash,
	hlist_bl_for_each_entry_rcu(gl, h, &gl_hash_table[hash], gl_list) {
		if (!lm_name_equal(&gl->gl_name, name))
			continue;
-   if (gl->gl_sbd != sdp)
-   continue;
		if (lockref_get_not_dead(&gl->gl_lockref))
			return gl;
}
@@ -506,7 +503,7 @@ __releases(&gl->gl_spin)
  __acquires(&gl->gl_spin)
  {
	const struct gfs2_glock_operations *glops = gl->gl_ops;
-   struct gfs2_sbd *sdp = gl->gl_sbd;
+   struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
	unsigned int lck_flags = gh ? gh->gh_flags : 0;
int ret;
  
@@ -628,7 +625,7 @@ out_unlock:

  static void delete_work_func(struct work_struct *work)
  {
struct gfs2_glock *gl = container_of(work, struct gfs2_glock, 
gl_delete);
-   struct gfs2_sbd *sdp = gl->gl_sbd;
+   struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
	struct gfs2_inode *ip;
	struct inode *inode;
	u64 no_addr = gl->gl_name.ln_number;
@@ -704,14 +701,16 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
   struct gfs2_glock **glp)
  {
	struct super_block *s = sdp->sd_vfs;
-   struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type };
+   struct lm_lockname name = { .ln_number = number,
+   .ln_type = glops->go_type,
+   .ln_sbd = sdp

Re: [Cluster-devel] [GFS2 PATCH] GFS2: Don't add all glocks to the lru

2015-06-18 Thread Steven Whitehouse

Acked-by: Steven Whitehouse swhit...@redhat.com

Steve.

On 16/06/15 19:31, Bob Peterson wrote:

Hi,

Regarding my previously posted patch:
I decided it makes more sense not to single out rgrp glocks for
exclusion from the lru list. It makes sense to also exclude the
transaction glock and non-disk glocks from the lru list as well.
Therefore, I changed this to a generic glops flag so we could
specify them on an individual basis.

Patch description:

The glocks used for resource groups often come and go hundreds of
thousands of times per second. Adding them to the lru list just
adds unnecessary contention for the lru_lock spin_lock, especially
considering we're almost certainly going to re-use the glock and
take it back off the lru microseconds later. We never want the
glock shrinker to cull them anyway. This patch adds a new bit in
the glops that determines which glock types get put onto the lru
list and which ones don't.

Regards,

Bob Peterson

Signed-off-by: Bob Peterson rpete...@redhat.com
---
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 0fa8062..a38e38f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1076,7 +1076,8 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
		fast_path = 1;
	}
-   if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
+   if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl) &&
+   (glops->go_flags & GLOF_LRU))
		gfs2_glock_add_to_lru(gl);
  
  	trace_gfs2_glock_queue(gh, 0);

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index fe91951..1249b2b 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -561,7 +561,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = {
.go_lock = inode_go_lock,
.go_dump = inode_go_dump,
.go_type = LM_TYPE_INODE,
-   .go_flags = GLOF_ASPACE,
+   .go_flags = GLOF_ASPACE | GLOF_LRU,
  };
  
  const struct gfs2_glock_operations gfs2_rgrp_glops = {

@@ -584,10 +584,12 @@ const struct gfs2_glock_operations gfs2_freeze_glops = {
  const struct gfs2_glock_operations gfs2_iopen_glops = {
.go_type = LM_TYPE_IOPEN,
.go_callback = iopen_go_callback,
+   .go_flags = GLOF_LRU,
  };
  
  const struct gfs2_glock_operations gfs2_flock_glops = {

.go_type = LM_TYPE_FLOCK,
+   .go_flags = GLOF_LRU,
  };
  
  const struct gfs2_glock_operations gfs2_nondisk_glops = {

@@ -596,7 +598,7 @@ const struct gfs2_glock_operations gfs2_nondisk_glops = {
  
  const struct gfs2_glock_operations gfs2_quota_glops = {

.go_type = LM_TYPE_QUOTA,
-   .go_flags = GLOF_LVB,
+   .go_flags = GLOF_LVB | GLOF_LRU,
  };
  
  const struct gfs2_glock_operations gfs2_journal_glops = {

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 304a223..a1ec7c2 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -225,6 +225,7 @@ struct gfs2_glock_operations {
const unsigned long go_flags;
  #define GLOF_ASPACE 1
  #define GLOF_LVB	2
+#define GLOF_LRU	4
  };
  
  enum {






Re: [Cluster-devel] [GFS2 PATCH] GFS2: Don't brelse rgrp buffer_heads every allocation

2015-06-16 Thread Steven Whitehouse

Hi,

On 15/06/15 15:43, Bob Peterson wrote:

- Original Message -

I'm assuming that these figures are bandwidth rather than times, since
that appears to show that the patch makes quite a large difference.
However the reclen is rather small. In the 32 bytes case, that's 128
writes for each new block that's being allocated, unless of course that
is 32k?

Steve.

Hi,

To do this test, I'm executing this command:
numactl --cpunodebind=0 --membind=0 /home/bob/iozone/iozone3_429/src/current/iozone 
-az -f /mnt/gfs2/iozone-gfs2 -n 2048m -g 2048m -y 32k -q 1m -e -i 0 -+n > 
/home/bob/iozone.out

According to iozone -h, specifying -y this way is 32K, not 32 bytes.
The -q is maximum write size, in KB, so -q 1m is 1MB writes.
The -g is maximum file size (in KB unless a suffix is given), so -g 2048m is a 2048MB (2GB) file.

So the test varies the writes from 32K to 1MB, adjusting the number
of writes to get the file to 2GB.

Regards,

Bob Peterson
Red Hat File Systems


Ok, that makes sense to me. I guess that there is not a lot of searching 
for the rgrps going on, which is why we are not seeing a big gain in the 
rgrplvb case. If the fs was nearly full perhaps we'd see more of a 
difference in that case.


Either way, provided the rgrplvb code can continue to work, that is 
really the important thing, so I think we should be ok with this. I'd 
still very much like to see what can be done to reduce the page look up 
cost more directly though, since that seems to be the real issue here,


Steve.



Re: [Cluster-devel] [GFS2 PATCH] GFS2: Don't brelse rgrp buffer_heads every allocation

2015-06-15 Thread Steven Whitehouse

Hi,


On 12/06/15 20:50, Bob Peterson wrote:

- Original Message -

Hi,


On 09/06/15 15:45, Bob Peterson wrote:

- Original Message -

Hi,


On 05/06/15 15:49, Bob Peterson wrote:

Hi,

This patch allows the block allocation code to retain the buffers
for the resource groups so they don't need to be re-read from buffer
cache with every request. This is a performance improvement that's
especially noticeable when resource groups are very large. For
example, with 2GB resource groups and 4K blocks, there can be 33
blocks for every resource group. This patch allows those 33 buffers
to be kept around and not read in and thrown away with every
operation. The buffers are released when the resource group is
either synced or invalidated.

The blocks should be cached between operations, so this should only be
resulting in a skip of the look up of the cached block, and no changes
to the actual I/O. Does that mean that grab_cache_page() is slow I
wonder? Or is this an issue of going around the retry loop due to lack
of memory at some stage?

How does this interact with the rgrplvb support? I'd guess that with
that turned on, this is no longer an issue, because we'd only read in
the blocks for the rgrps that we are actually going to use?



Steve.

Hi,

If you compare the two vmstat outputs in the bugzilla #1154782, you'll
see no significant difference in memory usage nor cpu usage. So I assume
the page lookup is the slow part; not because it's such a slow thing
but because it's done 33 times per read-reference-invalidate (33 pages
to look up per rgrp).

Regards,

Bob Peterson
Red Hat File Systems

That's true, however, as I understand the problem here, the issue is not
reading in the blocks for the rgrp that is eventually selected to use,
but the reading in of those blocks for the rgrps that we reject, for
whatever reason (full, or congested, or whatever). So with rgrplvb
enabled, we don't then read those rgrps in off disk at all in most cases
- so I was wondering whether that solves the problem without needing
this change?

Ideally I'd like to make the rgrplvb setting the default, since it is
much more efficient. The question is how we can do that and still remain
backward compatible? Not an easy one to answer :(

Also, if the page lookup is the slow thing, then we should look at using
pagevec_lookup() to get the pages in chunks rather than doing it
individually (and indeed, multiple times per page, in case of block size
less than page size). We know that the blocks will always be contiguous
on disk, so we should be able to send down large I/Os, rather than
relying on the block stack to merge them as we do at the moment, which
should be a further improvement too,

Steve.

Hi,

The rgrplvb mount option only helps if the file system is using lock_dlm.
For lock_nolock, it's still just as slow because lock_nolock has no knowledge
of lvbs. Now, granted, that's an unusual case because GFS2 is normally used
with lock_dlm.
That sounds like a bug... it should work in the same way, even with 
lock_nolock.



I like the idea of making rgrplvb the default mount option, and I don't
see a problem doing that.
The issue is that it will not be backwards compatible. We really need a 
way to at least easily detect if someone has a mixed cluster (some nodes 
with rgrplvb enabled, and some without) otherwise it might be confusing 
when we get odd reports of allocations failing, even when there appears 
to be free space.


We need to treat what we put into LVBs in the same way as we treat the 
on-disk format in terms of backwards compatibility.



I think the rgrplvb option should be compatible with this patch, but
I'll set up a test environment in order to test that they work together
harmoniously.

I also like the idea of using a pagevec for reading in multiple pages for
the rgrps, but that's another improvement for another day. If there's
not a bugzilla record open for that, perhaps we should open one.

Regards,

Bob Peterson
Red Hat File Systems


If we have rgrplvb, then we may not need this patch, since we will not be 
looking up the rgrp's blocks as often. So we should see the benefit just 
by turning that on I think... at least it would be good to see whether 
there is any performance difference there. In cases where we have nodes 
competing for the rgrps, then the blocks will not be cached anyway, so 
we will gain no benefit from this patch, since we'll have to read the 
blocks anyway, hence my thought that speeding up the lookup is the way 
to go, since it will give the benefit for more different cases - both 
when the rgrps blocks are cached and uncached,



Steve.



Re: [Cluster-devel] [GFS2 PATCH] GFS2: Don't add rgrp glocks to the lru

2015-06-15 Thread Steven Whitehouse

Hi,

On 12/06/15 19:29, Bob Peterson wrote:

Hi,

The glocks used for resource groups often come and go hundreds of
thousands of times per second. Adding them to the lru list just
adds unnecessary contention for the lru_lock spin_lock, especially
considering we're almost certainly going to re-use the glock and
take it back off the lru microseconds later. We never want the
glock shrinker to cull them anyway. This patch bypasses adding
them to the lru.

Regards,

Bob Peterson
Red Hat File Systems

Signed-off-by: Bob Peterson rpete...@redhat.com

Acked-by: Steven Whitehouse swhit...@redhat.com

Steve.

---
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 0fa8062..1db06a5 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1076,7 +1076,8 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
		fast_path = 1;
	}
-   if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
+   if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl) &&
+   gl->gl_name.ln_type != LM_TYPE_RGRP)
gfs2_glock_add_to_lru(gl);
  
  	trace_gfs2_glock_queue(gh, 0);






Re: [Cluster-devel] [GFS2 PATCH] GFS2: Don't brelse rgrp buffer_heads every allocation

2015-06-10 Thread Steven Whitehouse

Hi,


On 09/06/15 15:45, Bob Peterson wrote:

- Original Message -

Hi,


On 05/06/15 15:49, Bob Peterson wrote:

Hi,

This patch allows the block allocation code to retain the buffers
for the resource groups so they don't need to be re-read from buffer
cache with every request. This is a performance improvement that's
especially noticeable when resource groups are very large. For
example, with 2GB resource groups and 4K blocks, there can be 33
blocks for every resource group. This patch allows those 33 buffers
to be kept around and not read in and thrown away with every
operation. The buffers are released when the resource group is
either synced or invalidated.

The blocks should be cached between operations, so this should only be
resulting in a skip of the look up of the cached block, and no changes
to the actual I/O. Does that mean that grab_cache_page() is slow I
wonder? Or is this an issue of going around the retry loop due to lack
of memory at some stage?

How does this interact with the rgrplvb support? I'd guess that with
that turned on, this is no longer an issue, because we'd only read in
the blocks for the rgrps that we are actually going to use?



Steve.

Hi,

If you compare the two vmstat outputs in the bugzilla #1154782, you'll
see no significant difference in memory usage nor cpu usage. So I assume
the page lookup is the slow part; not because it's such a slow thing
but because it's done 33 times per read-reference-invalidate (33 pages
to look up per rgrp).

Regards,

Bob Peterson
Red Hat File Systems


That's true, however, as I understand the problem here, the issue is not 
reading in the blocks for the rgrp that is eventually selected to use, 
but the reading in of those blocks for the rgrps that we reject, for 
whatever reason (full, or congested, or whatever). So with rgrplvb 
enabled, we don't then read those rgrps in off disk at all in most cases 
- so I was wondering whether that solves the problem without needing 
this change?


Ideally I'd like to make the rgrplvb setting the default, since it is 
much more efficient. The question is how we can do that and still remain 
backward compatible? Not an easy one to answer :(


Also, if the page lookup is the slow thing, then we should look at using 
pagevec_lookup() to get the pages in chunks rather than doing it 
individually (and indeed, multiple times per page, in case of block size 
less than page size). We know that the blocks will always be contiguous 
on disk, so we should be able to send down large I/Os, rather than 
relying on the block stack to merge them as we do at the moment, which 
should be a further improvement too,
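
A rough sketch (not a posted patch; rgrp_first_index/rgrp_last_index and mapping are placeholder names) of what batching the lookups with pagevec_lookup() might look like:

    struct pagevec pvec;
    pgoff_t index = rgrp_first_index;
    unsigned int i, got;

    pagevec_init(&pvec, 0);
    while (index <= rgrp_last_index) {
            got = pagevec_lookup(&pvec, mapping, index,
                                 min_t(pgoff_t, rgrp_last_index - index + 1,
                                       PAGEVEC_SIZE));
            if (!got)
                    break;
            for (i = 0; i < got; i++) {
                    /* map the buffers backing pvec.pages[i] */
            }
            index = pvec.pages[got - 1]->index + 1;
            pagevec_release(&pvec);
    }

One lookup then returns up to PAGEVEC_SIZE pages instead of one grab_cache_page() call per block.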


Steve.



Re: [Cluster-devel] [GFS2 PATCH] GFS2: Don't brelse rgrp buffer_heads every allocation

2015-06-08 Thread Steven Whitehouse

Hi,


On 05/06/15 15:49, Bob Peterson wrote:

Hi,

This patch allows the block allocation code to retain the buffers
for the resource groups so they don't need to be re-read from buffer
cache with every request. This is a performance improvement that's
especially noticeable when resource groups are very large. For
example, with 2GB resource groups and 4K blocks, there can be 33
blocks for every resource group. This patch allows those 33 buffers
to be kept around and not read in and thrown away with every
operation. The buffers are released when the resource group is
either synced or invalidated.
The blocks should be cached between operations, so this should only be 
resulting in a skip of the look up of the cached block, and no changes 
to the actual I/O. Does that mean that grab_cache_page() is slow I 
wonder? Or is this an issue of going around the retry loop due to lack 
of memory at some stage?


How does this interact with the rgrplvb support? I'd guess that with 
that turned on, this is no longer an issue, because we'd only read in 
the blocks for the rgrps that we are actually going to use?




Steve.


Signed-off-by: Bob Peterson rpete...@redhat.com
---
  fs/gfs2/glops.c | 14 +++---
  fs/gfs2/rgrp.c  | 23 +++
  fs/gfs2/rgrp.h  |  1 +
  3 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index fe91951..c23377f 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -144,6 +144,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
struct gfs2_rgrpd *rgd;
int error;
  
+	spin_lock(&gl->gl_spin);

+   rgd = gl->gl_object;
+   if (rgd)
+   gfs2_rgrp_brelse(rgd);
+   spin_unlock(&gl->gl_spin);
+
	if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
		return;
	GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
@@ -175,15 +181,17 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int 
flags)
  {
	struct gfs2_sbd *sdp = gl->gl_sbd;
	struct address_space *mapping = sdp->sd_aspace;
+   struct gfs2_rgrpd *rgd = gl->gl_object;
+
+   if (rgd)
+   gfs2_rgrp_brelse(rgd);
  
	WARN_ON_ONCE(!(flags & DIO_METADATA));

	gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
	truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
  
-	if (gl->gl_object) {

-   struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
+   if (rgd)
		rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
-   }
  }
  
  /**

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index cd53d6e..c6c6232 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1244,14 +1244,13 @@ int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
  }
  
  /**

- * gfs2_rgrp_go_unlock - Release RG bitmaps read in with gfs2_rgrp_bh_get()
- * @gh: The glock holder for the resource group
+ * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get()
+ * @rgd: The resource group
   *
   */
  
-void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)

+void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd)
  {
-   struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
	int x, length = rgd->rd_length;
  
  	for (x = 0; x < length; x++) {

@@ -1264,6 +1263,22 @@ void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
  
  }
  
+/**

+ * gfs2_rgrp_go_unlock - Unlock a rgrp glock
+ * @gh: The glock holder for the resource group
+ *
+ */
+
+void gfs2_rgrp_go_unlock(struct gfs2_holder *gh)
+{
+   struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object;
+   int demote_requested = test_bit(GLF_DEMOTE, &gh->gh_gl->gl_flags) |
+   test_bit(GLF_PENDING_DEMOTE, &gh->gh_gl->gl_flags);
+
+   if (rgd && demote_requested)
+   gfs2_rgrp_brelse(rgd);
+}
+
  int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 struct buffer_head *bh,
 const struct gfs2_bitmap *bi, unsigned minlen, u64 
*ptrimmed)
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 68972ec..c0ab33f 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -36,6 +36,7 @@ extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
  extern int gfs2_rindex_update(struct gfs2_sbd *sdp);
  extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
  extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
+extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd);
  extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
  
  extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);




Re: [Cluster-devel] [GFS2 PATCH] gfs2: s64 cast for negative quota value

2015-06-08 Thread Steven Whitehouse

Hi,

Looks good to me:
Acked-by: Steven Whitehouse swhit...@redhat.com

Steve.

On 08/06/15 16:36, Abhi Das wrote:

One-line fix to cast quota value to s64 before comparison.
By default the quantity is treated as u64.
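
To illustrate the failure mode (userspace sketch only, not the kernel patch): after be64_to_cpu() the value is an unsigned 64-bit quantity, so a counter that has gone below zero looks like a huge positive number unless it is cast to s64 first.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t qu_value = (uint64_t)-5;  /* usage dipped below zero */

            printf("unsigned: %d\n", qu_value < 0);           /* 0 - never true */
            printf("signed:   %d\n", (int64_t)qu_value < 0);  /* 1 - clamps to 0 */
            return 0;
    }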

Signed-off-by: Abhi Das a...@redhat.com
---
  fs/gfs2/quota.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index dcd598a..c2607a2 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -798,7 +798,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t 
loc,
loc -= sizeof(q); /* gfs2_internal_read would've advanced the loc ptr */
err = -EIO;
	be64_add_cpu(&q.qu_value, change);
-   if (be64_to_cpu(q.qu_value) < 0)
+   if (((s64)be64_to_cpu(q.qu_value)) < 0)
		q.qu_value = 0; /* Never go negative on quota usage */
	qd->qd_qb.qb_value = q.qu_value;
if (fdq) {




Re: [Cluster-devel] [GFS2 PATCH] gfs2: Don't support fallocate on jdata files

2015-06-04 Thread Steven Whitehouse

Hi,


On 04/06/15 11:27, Andrew Price wrote:

On 04/06/15 09:42, Steven Whitehouse wrote:

Hi,

Will glibc do the fallback path, or just return this as an error? I
think that's worth checking as it would be nice if it would transparently
fall back in this case,


You only get the fallback with posix_fallocate() but many applications 
will use fallocate() directly for various reasons¹.



On 03/06/15 22:30, Abhi Das wrote:

We cannot provide an efficient implementation due to the headers
on the data blocks, so there doesn't seem much point in having it.


I'm not sure I like the idea that fallocate() could work in one 
directory and fail in another... What exactly is the issue here? Is it 
just the journal space required or that writing the data block headers 
would be too slow?


Andy

The issue is that data blocks for journaled data have metadata headers 
on them. Although I guess that would not affect the normal journaled 
data, it would affect the journaled data used by the various kernel 
internal files (i.e. on the meta fs).


The idea of fallocate is to be able to allocate and zero the blocks 
efficiently, but if they have to go through the journal anyway, then it 
is going to be really slow, so not a lot of point in doing it. We should 
definitely try and encourage applications to use posix_fallocate where 
possible, so that the fallback will work, although I know that we cannot 
cover all situations.
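
A hedged userspace sketch of that approach (illustrative only; the preallocate() helper is hypothetical and not part of any patch here): try fallocate() first and fall back to posix_fallocate(), which glibc emulates by writing zeroes when the filesystem returns EOPNOTSUPP.

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <errno.h>

    /* Returns 0 on success, an errno value on failure. */
    static int preallocate(int fd, off_t offset, off_t len)
    {
            if (fallocate(fd, 0, offset, len) == 0)
                    return 0;
            if (errno != EOPNOTSUPP)
                    return errno;
            /* e.g. a gfs2 jdata file: use the slow but portable path */
            return posix_fallocate(fd, offset, len);
    }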


The reality is that virtually nobody uses jdata files anyway, so this is 
not likely to affect anybody, otherwise they'd have reported this as not 
working before now,


Steve.


¹ https://sourceware.org/bugzilla/show_bug.cgi?id=15661


Resolves: rhbz#1221331
Signed-off-by: Abhi Das a...@redhat.com
---
  fs/gfs2/file.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index c706c6d..8252115 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -917,7 +917,7 @@ static long gfs2_fallocate(struct file *file, int
mode, loff_t offset, loff_t le
  struct gfs2_holder gh;
  int ret;
-if (mode & ~FALLOC_FL_KEEP_SIZE)
+if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip))
  return -EOPNOTSUPP;
  mutex_lock(&inode->i_mutex);








Re: [Cluster-devel] [GFS2 PATCH] gfs2: Don't support fallocate on jdata files

2015-06-04 Thread Steven Whitehouse

Hi,

Will glibc do the fallback path, or just return this as an error? I 
think that's worth checking as it would be nice if it would transparently 
fall back in this case,


Steve.

On 03/06/15 22:30, Abhi Das wrote:

We cannot provide an efficient implementation due to the headers
on the data blocks, so there doesn't seem much point in having it.

Resolves: rhbz#1221331
Signed-off-by: Abhi Das a...@redhat.com
---
  fs/gfs2/file.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index c706c6d..8252115 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -917,7 +917,7 @@ static long gfs2_fallocate(struct file *file, int mode, 
loff_t offset, loff_t le
struct gfs2_holder gh;
int ret;
  
-	if (mode & ~FALLOC_FL_KEEP_SIZE)

+   if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip))
	return -EOPNOTSUPP;
  
  	mutex_lock(&inode->i_mutex);




Re: [Cluster-devel] [RFC 1/1 linux-next] gfs2: convert simple_str to kstr

2015-04-30 Thread Steven Whitehouse

Hi,

On 29/04/15 19:54, Fabian Frederick wrote:

-Remove obsolete simple_str functions.
-Return error code when kstr failed.
-This patch also calls functions corresponding to destination type.

Basically it's an RFC because of the type mismatch all over the place.
ie code was doing simple_strtol to integer...
Newer kstr functions detect such problems.
By default I used destination type as a reference. Maybe it's wrong and we
really want to read long, unsigned long from source ?

Signed-off-by: Fabian Frederick f...@skynet.be

Looks correct to me. Most of those are simple 1 or 0 interfaces anyway.
Acked-by: Steven Whitehouse swhit...@redhat.com

Steve.


---
  fs/gfs2/sys.c | 56 
  1 file changed, 44 insertions(+), 12 deletions(-)

diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index ae8e881..436bdeb 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -101,8 +101,11 @@ static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
  
  static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)

  {
-   int error;
-   int n = simple_strtol(buf, NULL, 0);
+   int error, n;
+
+   error = kstrtoint(buf, 0, &n);
+   if (error)
+   return error;
  
  	if (!capable(CAP_SYS_ADMIN))

return -EPERM;
@@ -134,10 +137,16 @@ static ssize_t withdraw_show(struct gfs2_sbd *sdp, char 
*buf)
  
  static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len)

  {
+   int error, val;
+
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
  
-	if (simple_strtol(buf, NULL, 0) != 1)

+   error = kstrtoint(buf, 0, &val);
+   if (error)
+   return error;
+
+   if (val != -1)
return -EINVAL;
  
  	gfs2_lm_withdraw(sdp, "withdrawing from cluster at user's request\n");

@@ -148,10 +157,16 @@ static ssize_t withdraw_store(struct gfs2_sbd *sdp, const 
char *buf, size_t len)
  static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf,
 size_t len)
  {
+   int error, val;
+
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
  
-	if (simple_strtol(buf, NULL, 0) != 1)

+   error = kstrtoint(buf, 0, &val);
+   if (error)
+   return error;
+
+   if (val != -1)
return -EINVAL;
  
  	gfs2_statfs_sync(sdp->sd_vfs, 0);

@@ -161,10 +176,16 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, 
const char *buf,
  static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf,
size_t len)
  {
+   int error, val;
+
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
  
-	if (simple_strtol(buf, NULL, 0) != 1)

+   error = kstrtoint(buf, 0, &val);
+   if (error)
+   return error;
+
+   if (val != -1)
return -EINVAL;
  
  	gfs2_quota_sync(sdp->sd_vfs, 0);

@@ -181,7 +202,9 @@ static ssize_t quota_refresh_user_store(struct gfs2_sbd 
*sdp, const char *buf,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
  
-	id = simple_strtoul(buf, NULL, 0);

+   error = kstrtou32(buf, 0, &id);
+   if (error)
+   return error;
  
  	qid = make_kqid(current_user_ns(), USRQUOTA, id);

if (!qid_valid(qid))
@@ -201,7 +224,9 @@ static ssize_t quota_refresh_group_store(struct gfs2_sbd 
*sdp, const char *buf,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
  
-	id = simple_strtoul(buf, NULL, 0);

+   error = kstrtou32(buf, 0, &id);
+   if (error)
+   return error;
  
  	qid = make_kqid(current_user_ns(), GRPQUOTA, id);

if (!qid_valid(qid))
@@ -325,9 +350,11 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const 
char *buf, size_t len)
  {
	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
ssize_t ret = len;
-   int val;
+   int val, error;
  
-	val = simple_strtol(buf, NULL, 0);

+   error = kstrtoint(buf, 0, &val);
+   if (error)
+   return error;
  
  	if (val == 1)

		set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
@@ -351,9 +378,11 @@ static ssize_t wdack_show(struct gfs2_sbd *sdp, char *buf)
  static ssize_t wdack_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
  {
ssize_t ret = len;
-   int val;
+   int val, error;
  
-	val = simple_strtol(buf, NULL, 0);

+   error = kstrtoint(buf, 0, &val);
+   if (error)
+   return error;
  
  	if ((val == 1) &&

	    !strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm"))
@@ -553,11 +582,14 @@ static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned 
int *field,
  {
	struct gfs2_tune *gt = &sdp->sd_tune;
unsigned int x;
+   int error;
  
  	if (!capable(CAP_SYS_ADMIN))

return -EPERM;
  
-	x = simple_strtoul(buf, NULL, 0);

+   error = kstrtouint(buf, 0, &x);
+   if (error)
+   return error

Re: [Cluster-devel] [PATCH] GFS2: mark the journal idle to fix ro mounts

2015-04-28 Thread Steven Whitehouse

Hi,

On 27/04/15 18:31, Benjamin Marzinski wrote:

On Mon, Apr 27, 2015 at 11:01:42AM +0100, Steven Whitehouse wrote:

Hi,

On 24/04/15 16:13, Benjamin Marzinski wrote:

When gfs2 was mounted read-only and then unmounted, it was writing a
header block to the journal in the syncing gfs2_log_flush() call from
kill_sb(). This is because the journal was not being marked as idle
until the first log header was written out, and on a read-only mount
there never was a log header written out. Since the journal was not
marked idle, gfs2_log_flush() was writing out a header lock to make
sure it was empty during the sync.  Not only did this cause IO to a
read-only filesystem, but the journalling isn't completely initialized
on read-only mounts, and so gfs2 was writing out the wrong sequence
number in the log header.

Now, the journal is marked idle on mount, and gfs2_log_flush() won't
write out anything until there starts being transactions to flush.

Does that mean that we should be doing more to initialize the log in the r/o
mount case? It should know enough to recover the journals in the case that
it is the first mounter, so did this perhaps only apply to subsequent
mounters of the filesystem?

gfs2 currently has enough information to do recovery.  Both
gfs2_recover_func() and gfs2_make_fs_rw() call gfs2_find_jhead() to
get information about the head of the journal and the sequence
numbers.

gfs2_make_fs_rw() saves this information with these lines

 /*  Initialize some head of the log stuff  */
 sdp->sd_log_sequence = head.lh_sequence + 1;
 gfs2_log_pointers_init(sdp, head.lh_blkno);

This is what's not getting called on the read-only mounts that was
causing the fsck error. But the read only mounts should never be writing
anything in gfs2_log_flush(), which is where these values are used. So
we could make sure these values are always initialized, but the fact
that they weren't was what allowed us to catch this bug (and it would
still be a bug to write that header, even if it didn't mess with fsck).

-Ben

That makes sense to me - thanks,

Steve.



Re: [Cluster-devel] [PATCH] GFS2: mark the journal idle to fix ro mounts

2015-04-27 Thread Steven Whitehouse

Hi,

On 24/04/15 16:13, Benjamin Marzinski wrote:

When gfs2 was mounted read-only and then unmounted, it was writing a
header block to the journal in the syncing gfs2_log_flush() call from
kill_sb(). This is because the journal was not being marked as idle
until the first log header was written out, and on a read-only mount
there never was a log header written out. Since the journal was not
marked idle, gfs2_log_flush() was writing out a header lock to make
sure it was empty during the sync.  Not only did this cause IO to a
read-only filesystem, but the journalling isn't completely initialized
on read-only mounts, and so gfs2 was writing out the wrong sequence
number in the log header.

Now, the journal is marked idle on mount, and gfs2_log_flush() won't
write out anything until there starts being transactions to flush.
Does that mean that we should be doing more to initialize the log in the 
r/o mount case? It should know enough to recover the journals in the 
case that it is the first mounter, so did this perhaps only apply to 
subsequent mounters of the filesystem?


Still it is a good catch!

Acked-by: Steven Whitehouse swhit...@redhat.com

Steve.


Signed-off-by: Benjamin Marzinski bmarz...@redhat.com
---
  fs/gfs2/ops_fstype.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 8633ad3..fd984f6 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -757,6 +757,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
}
}
  
+	sdp->sd_log_idle = 1;

	set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags);
	gfs2_glock_dq_uninit(&ji_gh);
jindex = 0;




Re: [Cluster-devel] [GFS2 PATCH] GFS2: Average in only non-zero round-trip times for congestion stats

2015-04-23 Thread Steven Whitehouse

Hi,

On 23/04/15 17:45, Bob Peterson wrote:

- Original Message -

Hi,

That looks better. Do you get better results with it compared with the
previous version?

Steve.

On 22/04/15 18:00, Bob Peterson wrote:

Hi,

This patch changes function gfs2_rgrp_congested so that it only factors
in non-zero values into its average round trip time. If the round-trip
time is zero for a particular cpu, that cpu has obviously never dealt
with bouncing the resource group in question, so factoring in a zero
value will only skew the numbers. It also fixes a compile error on
some arches related to division.

Regards,

Bob Peterson
Signed-off-by: Bob Peterson rpete...@redhat.com

It's not straightforward because the preferred rgrps patch skews the
performance results greatly. However, based on my performance tests, yes,
the numbers do look better. I temporarily commented out that preferred
rgrps patch to get a clearer picture. Here are numbers from my testing:
This test consists of 5 nodes all simultaneously doing this 'dd' command:
dd if=/dev/zero of=/mnt/gfs2/`hostname` bs=1M count=100
(after drop_caches)

Stock (nothing disabled):
 Run 1
  
pats 716 MB/s
jets 615 MB/s
bills669 MB/s
dolphins 605 MB/s
ravens   735 MB/s
  
Average: 668 MB/s

Stock (plus preferred rgrps disabled): AKA 'c3'
 Run 1 Run 2
    
pats 766 MB/s  675 MB/s
jets 649 MB/s  716 MB/s
bills790 MB/s  735 MB/s
dolphins 761 MB/s  727 MB/s
ravens   712 MB/s  728 MB/s
    
Average: 736 MB/s  716 MB/s

Without latest patch (plus preferred rgrps disabled): AKA 'c2'
(In other words, this is the previous patch which was called
GFS2: Use average srttb value in congestion calculations)
 Run 1 Run 2 Run 3
      -
pats 830 MB/s  688 MB/s  697 MB/s
jets 833 MB/s  622 MB/s  645 MB/s
bills831 MB/s  796 MB/s  637 MB/s
dolphins 834 MB/s  597 MB/s  690 MB/s
ravens   815 MB/s  731 MB/s  734 MB/s
      -
Average: 829 MB/s  687 MB/s  681 MB/s

Latest patch (plus preferred rgrps disabled):
 Run 1 Run 2 Run 3
      -
pats 811 MB/s  829 MB/s  652 MB/s
jets 825 MB/s  863 MB/s  702 MB/s
bills846 MB/s  825 MB/s  710 MB/s
dolphins 845 MB/s  845 MB/s  683 MB/s
ravens   820 MB/s  818 MB/s  682 MB/s
      -
Average: 829 MB/s  836 MB/s  686 MB/s

Latest patch (nothing disabled):
 Run 1 Run 2 Run 3
      -
pats 834 MB/s  817 MB/s  819 MB/s
jets 837 MB/s  835 MB/s  836 MB/s
bills841 MB/s  837 MB/s  834 MB/s
dolphins 838 MB/s  851 MB/s  842 MB/s
ravens   795 MB/s  808 MB/s  815 MB/s
      -
Average: 829 MB/s  830 MB/s  829 MB/s

This test (simultaneous dd) is known to be a worst case scenario,
so I expect it to show the most improvement. For ordinary block
allocations, I don't expect that big of an improvement.

Regards,

Bob Peterson
Red Hat File Systems


That looks very encouraging I think, definitely a good step forward.
Acked-by: Steven Whitehouse swhit...@redhat.com

Steve.



Re: [Cluster-devel] [GFS2 PATCH] GFS2: Average in only non-zero round-trip times for congestion stats

2015-04-23 Thread Steven Whitehouse

Hi,

That looks better. Do you get better results with it compared with the 
previous version?


Steve.

On 22/04/15 18:00, Bob Peterson wrote:

Hi,

This patch changes function gfs2_rgrp_congested so that it only factors
in non-zero values into its average round trip time. If the round-trip
time is zero for a particular cpu, that cpu has obviously never dealt
with bouncing the resource group in question, so factoring in a zero
value will only skew the numbers. It also fixes a compile error on
some arches related to division.
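
The division fix refers to the fact that a plain 64-bit divide on a 32-bit arch makes gcc emit a call to __divdi3, which the kernel does not provide (see the linux-next __divdi3 report elsewhere in this thread list). A minimal sketch of the idiom, with total/nonzero as placeholder names:

    #include <asm/div64.h>

    u64 a_srttb = total;            /* sum of the non-zero per-CPU samples */

    if (nonzero)
            do_div(a_srttb, nonzero);   /* u64 /= u32 in place; no __divdi3 */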

Regards,

Bob Peterson
Signed-off-by: Bob Peterson rpete...@redhat.com
---
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index f39eedc..cb27065 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1854,15 +1854,19 @@ static bool gfs2_rgrp_congested(const struct gfs2_rgrpd 
*rgd, int loops)
s64 srttb_diff;
s64 sqr_diff;
s64 var;
-   int cpu;
+   int cpu, nonzero = 0;
  
  	preempt_disable();

for_each_present_cpu(cpu) {
		st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
-   a_srttb += st->stats[GFS2_LKS_SRTTB];
+   if (st->stats[GFS2_LKS_SRTTB]) {
+   a_srttb += st->stats[GFS2_LKS_SRTTB];
+   nonzero++;
+   }
	}
	st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
-   a_srttb /= num_present_cpus();
+   if (nonzero)
+   do_div(a_srttb, nonzero);
	r_dcount = st->stats[GFS2_LKS_DCOUNT];
	var = st->stats[GFS2_LKS_SRTTVARB] +
		  gl->gl_stats.stats[GFS2_LKS_SRTTVARB];





Re: [Cluster-devel] linux-next: Tree for Apr 20 (gfs2)

2015-04-20 Thread Steven Whitehouse

Hi,

On 20/04/15 16:40, Randy Dunlap wrote:

On 04/19/15 22:45, Stephen Rothwell wrote:

Hi all,

Please do not add any v4.2 material to your linux-next included trees
until after v4.1-rc1 is released.

Changes since 20150415:


on i386:

ERROR: __divdi3 [fs/gfs2/gfs2.ko] undefined!





These notices should go to Bob as well now, since he is looking after 
the tree that you are pulling from. I suspect that might be from one of 
his patches too,


Steve.



Re: [Cluster-devel] [GFS2 PATCH] GFS2: Use average srttb value in congestion calculations

2015-04-15 Thread Steven Whitehouse

Hi,

On 15/04/15 15:39, Bob Peterson wrote:

Hi,

This patch changes function gfs2_rgrp_congested so that it uses an
average srttb (smoothed round trip time for blocking rgrp glocks)
rather than the CPU-specific value. If we use the CPU-specific value
it can incorrectly report no contention when there really is contention
due to the glock processing occurring on a different CPU.

Signed-off-by: Bob Peterson rpete...@redhat.com

Looks like a good catch... Acked-by: Steven Whitehouse swhit...@redhat.com

Steve.


---
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 6af2396..f39eedc 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1850,14 +1850,19 @@ static bool gfs2_rgrp_congested(const struct gfs2_rgrpd 
*rgd, int loops)
	const struct gfs2_sbd *sdp = gl->gl_sbd;
struct gfs2_lkstats *st;
s64 r_dcount, l_dcount;
-   s64 r_srttb, l_srttb;
+   s64 l_srttb, a_srttb = 0;
s64 srttb_diff;
s64 sqr_diff;
s64 var;
+   int cpu;
  
  	preempt_disable();

+   for_each_present_cpu(cpu) {
+   st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP];
+   a_srttb += st->stats[GFS2_LKS_SRTTB];
+   }
	st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP];
-   r_srttb = st->stats[GFS2_LKS_SRTTB];
+   a_srttb /= num_present_cpus();
	r_dcount = st->stats[GFS2_LKS_DCOUNT];
	var = st->stats[GFS2_LKS_SRTTVARB] +
	  gl->gl_stats.stats[GFS2_LKS_SRTTVARB];
@@ -1866,10 +1871,10 @@ static bool gfs2_rgrp_congested(const struct gfs2_rgrpd 
*rgd, int loops)
	l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB];
	l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT];
  
-	if ((l_dcount < 1) || (r_dcount < 1) || (r_srttb == 0))

+   if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0))
return false;
  
-	srttb_diff = r_srttb - l_srttb;

+   srttb_diff = a_srttb - l_srttb;
sqr_diff = srttb_diff * srttb_diff;
  
  	var *= 2;






Re: [Cluster-devel] GFS2: Pull request (merge window)

2015-04-15 Thread Steven Whitehouse

Hi,

On 15/04/15 00:16, Linus Torvalds wrote:

On Tue, Apr 14, 2015 at 10:47 AM, Bob Peterson rpete...@redhat.com wrote:

There's another that adds me as a GFS2 co-maintainer [...]

So generally, when I start getting pull requests from different
people, I'd like to see a previous separate heads-up or confirmation
from the previous person just so that I'm not taken by surprise.
Rather than this kind of as part of the pull request, make the thing
official.

Yes, yes, I can see that you use the same script that Steven did, and
I can see that you've been the main developer lately, but still..

Anyway. Pulled.

  Linus


Thanks - I'm very much still here, keeping an eye on everything, looking 
at all the patches and so forth, but not really writing any myself any 
more, due to taking on a wider role recently. That doesn't mean that 
GFS2 will have fewer developers though, and Bob has been working on it 
pretty much right from the start too, so this should be a smooth transition,


Steve.



Re: [Cluster-devel] [GFS2] gfs2: fix quota refresh race in do_glock()

2015-04-08 Thread Steven Whitehouse

Hi,

That makes sense to me. Acked-by: Steven Whitehouse swhit...@redhat.com

Steve.

On 07/04/15 18:48, Abhi Das wrote:

quotad periodically syncs in-memory quotas to the ondisk quota file
and sets the QDF_REFRESH flag so that a subsequent read of a synced
quota is re-read from disk.

gfs2_quota_lock() checks for this flag and sets a 'force' bit to
force re-read from disk if requested. However, there is a race
condition here. It is possible for gfs2_quota_lock() to find the
QDF_REFRESH flag unset (i.e force=0) and quotad comes in immediately
after and syncs the relevant quota and sets the QDF_REFRESH flag.
gfs2_quota_lock() resumes with force=0 and uses the stale in-memory
quota usage values that result in miscalculations.

This patch fixes this race by moving the check for the QDF_REFRESH
flag check further out into the gfs2_quota_lock() process, i.e, in
do_glock(), under the protection of the quota glock.

Resolves: rhbz#1174295
Signed-off-by: Abhi Das a...@redhat.com
---
  fs/gfs2/quota.c | 8 
  1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 5561468..5c27e48 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -923,6 +923,9 @@ restart:
if (error)
return error;
  
+	if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags))

+   force_refresh = FORCE;
+
	qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
  
  	if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) {

@@ -974,11 +977,8 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, 
kgid_t gid)
 sizeof(struct gfs2_quota_data *), sort_qd, NULL);
  
  	for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {

-   int force = NO_FORCE;
		qd = ip->i_res->rs_qa_qd[x];
-   if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags))
-   force = FORCE;
-   error = do_glock(qd, force, &ip->i_res->rs_qa_qd_ghs[x]);
+   error = do_glock(qd, NO_FORCE, &ip->i_res->rs_qa_qd_ghs[x]);
if (error)
break;
}




Re: [Cluster-devel] [PATCH 08/16] gfs2: use is_xxx_kiocb instead of filp-fl_flags

2015-04-07 Thread Steven Whitehouse

Hi,

On 04/04/15 20:13, Dmitry Monakhov wrote:

Cc: cluster-devel@redhat.com
Signed-off-by: Dmitry Monakhov dmonak...@openvz.org

Acked-by: Steven Whitehouse swhit...@redhat.com

Steve.


---
  fs/gfs2/file.c |2 +-
  1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index f6fc412..25da110 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -709,7 +709,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, 
struct iov_iter *from)
  
  	gfs2_size_hint(file, iocb->ki_pos, iov_iter_count(from));
  
-	if (file->f_flags & O_APPEND) {

+   if (is_append_kiocb(iocb)) {
		struct gfs2_holder gh;
  
  		ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);




Re: [Cluster-devel] [PATCH 1/1] gfs2: incorrect check for debugfs returns

2015-03-24 Thread Steven Whitehouse

Hi,

On 24/03/15 02:36, Chengyu Song wrote:

debugfs_create_dir and debugfs_create_file may return -ENODEV when debugfs
is not configured, so the return value should be checked against ERROR_VALUE
as well, otherwise the later dereference of the dentry pointer would crash
the kernel.

Looks reasonable to me:

Acked-by: Steven Whitehouse swhit...@redhat.com

Steve.


Signed-off-by: Chengyu Song cson...@gatech.edu
---
  fs/gfs2/glock.c | 47 ---
  1 file changed, 28 insertions(+), 19 deletions(-)

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index f42dffb..0fa8062 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -2047,34 +2047,41 @@ static const struct file_operations gfs2_sbstats_fops = 
{
  
  int gfs2_create_debugfs_file(struct gfs2_sbd *sdp)

  {
-   sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
-   if (!sdp->debugfs_dir)
-   return -ENOMEM;
-   sdp->debugfs_dentry_glocks = debugfs_create_file("glocks",
-S_IFREG | S_IRUGO,
-sdp->debugfs_dir, sdp,
-&gfs2_glocks_fops);
-   if (!sdp->debugfs_dentry_glocks)
+   struct dentry *dent;
+
+   dent = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
+   if (IS_ERR_OR_NULL(dent))
+   goto fail;
+   sdp->debugfs_dir = dent;
+
+   dent = debugfs_create_file("glocks",
+  S_IFREG | S_IRUGO,
+  sdp->debugfs_dir, sdp,
+  &gfs2_glocks_fops);
+   if (IS_ERR_OR_NULL(dent))
		goto fail;
+   sdp->debugfs_dentry_glocks = dent;
  
-	sdp->debugfs_dentry_glstats = debugfs_create_file("glstats",

-   S_IFREG | S_IRUGO,
-   sdp->debugfs_dir, sdp,
-   &gfs2_glstats_fops);
-   if (!sdp->debugfs_dentry_glstats)
+   dent = debugfs_create_file("glstats",
+  S_IFREG | S_IRUGO,
+  sdp->debugfs_dir, sdp,
+  &gfs2_glstats_fops);
+   if (IS_ERR_OR_NULL(dent))
		goto fail;
+   sdp->debugfs_dentry_glstats = dent;
  
-	sdp->debugfs_dentry_sbstats = debugfs_create_file("sbstats",

-   S_IFREG | S_IRUGO,
-   sdp->debugfs_dir, sdp,
-   &gfs2_sbstats_fops);
-   if (!sdp->debugfs_dentry_sbstats)
+   dent = debugfs_create_file("sbstats",
+  S_IFREG | S_IRUGO,
+  sdp->debugfs_dir, sdp,
+  &gfs2_sbstats_fops);
+   if (IS_ERR_OR_NULL(dent))
		goto fail;
+   sdp->debugfs_dentry_sbstats = dent;
  
  	return 0;

  fail:
gfs2_delete_debugfs_file(sdp);
-   return -ENOMEM;
+   return dent ? PTR_ERR(dent) : -ENOMEM;
  }
  
  void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)

@@ -2100,6 +2107,8 @@ void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)
  int gfs2_register_debugfs(void)
  {
	gfs2_root = debugfs_create_dir("gfs2", NULL);
+   if (IS_ERR(gfs2_root))
+   return PTR_ERR(gfs2_root);
return gfs2_root ? 0 : -ENOMEM;
  }
  




Re: [Cluster-devel] [GFS2 0/3] fallocate and quota fixes

2015-03-18 Thread Steven Whitehouse

Hi,

Acked-by: Steven Whitehouse swhit...@redhat.com

Steve.

On 18/03/15 07:36, Abhi Das wrote:

This is a revised version of the patches required to properly fix the
fallocate quota issue described in bz1174295

patch1: This patch supplies gfs2_quota_check() with the number of blocks
the caller intends to allocate in the current operation, resulting
in a more accurate quota check.

patch2: gfs2_quota_check() and gfs2_inplace_reserve() return the number
of blocks available subject to quota limits and rgrp size
respectively. These functions don't return error if atleast
ap.min_target (if set) blocks are allocatable.

patch3: The fallocate() function uses the features of patch2 to determine
how many total blocks are available for allocation and uses them
right away, instead of guessing/retrying inefficiently.

The behavior of quota enforcement is altered by this patchset.
i.   Quotas are exceeded (a warning message is also issued to syslog) before
  the actual usage blocks exceed the imposed limit. In fact, the actual
  usage can never exceed the limit. Whenever it is determined that the
  completion of an operation will cause a quota to exceed its limit, such
  an operation is aborted with -EDQUOT and a 'quota exceeded' message is
  dispatched. Note: When min_target is set and allowed blocks are =
  min_target, we don't issue an error. It is assumed that the caller will
  only allocate the allowed blocks.
ii.  The gfs2_write_calc_reserv()/calc_max_reserv() functions are used to
  map between available blocks and the data bytes that can be written
  using them. Typically, for large files, some blocks are used up for
  metadata and only the remaining blocks can be used for data. Example:
  To write only a handful of bytes that would easily fit in one block, we
  might have to allocate an extra bunch of intermediate metadata blocks.
  If we had only 1 block left in our allotted quota, this operation would
  likely fail.

The functions mentioned in ii. are not very efficient. They always compute
the worst case number of extra blocks required and it is often the case that
not all those extra blocks are used. We need to find a better algorithm to
get a tighter estimate on the blocks needed for a given number of bytes.

I've run some basic tests and things seem to be holding up. The failing case
in bz1174295 is fixed using this patchset. I'll do test build and pass it on
to Nate to test with.

Abhi Das (3):
   gfs2: perform quota checks against allocation parameters
   gfs2: allow quota_check and inplace_reserve to return available blocks
   gfs2: allow fallocate to max out quotas/fs efficiently

  fs/gfs2/aops.c   |  6 ++---
  fs/gfs2/bmap.c   |  2 +-
  fs/gfs2/file.c   | 81 
  fs/gfs2/incore.h |  4 ++-
  fs/gfs2/inode.c  | 18 +++--
  fs/gfs2/quota.c  | 54 +++--
  fs/gfs2/quota.h  |  8 +++---
  fs/gfs2/rgrp.c   | 20 ++
  fs/gfs2/rgrp.h   |  3 ++-
  fs/gfs2/xattr.c  |  2 +-
  10 files changed, 133 insertions(+), 65 deletions(-)





Re: [Cluster-devel] [PATCH] GFS2: Fix potential NULL dereference in gfs2_alloc_inode

2015-03-02 Thread Steven Whitehouse

Hi,

On 02/03/15 16:15, Andrew Price wrote:

Return NULL when ip is NULL instead of dereferencing it.

Signed-off-by: Andrew Price anpr...@redhat.com
---
  fs/gfs2/super.c | 13 +++--
  1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 1666382..37c59ee 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1628,12 +1628,13 @@ static struct inode *gfs2_alloc_inode(struct 
super_block *sb)
struct gfs2_inode *ip;
  
  	ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);

-   if (ip) {
-   ip->i_flags = 0;
-   ip->i_gl = NULL;
-   ip->i_rgd = NULL;
-   ip->i_res = NULL;
-   }
+   if (!ip)
+   return NULL;
+
+   ip->i_flags = 0;
+   ip->i_gl = NULL;
+   ip->i_rgd = NULL;
+   ip->i_res = NULL;
	return &ip->i_inode;
  }
  


I'm not sure that I see the problem here... it should just return NULL 
if ip is NULL, since ip->i_inode is the first element of ip,
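
That observation relies on i_inode being the first member of struct gfs2_inode: &ip->i_inode is then ip plus an offset of zero, so with ip == NULL the expression evaluates back to NULL without any dereference. A hedged one-line way to document that dependency (illustrative, not part of the patch under review):

    BUILD_BUG_ON(offsetof(struct gfs2_inode, i_inode) != 0);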


Steve.



Re: [Cluster-devel] [GFS2 0/3] fallocate and quota fixes

2015-02-25 Thread Steven Whitehouse

Hi,

On 25/02/15 05:17, Abhi Das wrote:

This is a revised version of the patches required to properly fix the
fallocate quota issue described in bz1174295

patch1: This patch supplies gfs2_quota_check() with the number of blocks
the caller intends to allocate in the current operation, resulting
in a more accurate quota check.

patch2: gfs2_quota_check() and gfs2_inplace_reserve() return the number
of blocks available subject to quota limits and rgrp size
respectively. The difference from previous versions of this patch
is that we return the available blocks even on success.

patch3: The fallocate() function uses the features of patch2 to determine
how many total blocks are available for allocation and uses them
right away, instead of guessing inefficiently.

The behavior of quota enforcement is altered by this patchset.
i.   Quotas are exceeded (a warning message is also issued to syslog) before
  the actual usage blocks exceed the imposed limit. In fact, the actual
  usage can never exceed the limit. Whenever it is determined that the
  completion of an operation will cause a quota to exceed its limit, such
  an operation is aborted with -EDQUOT and a 'quota exceeded' message is
  dispatched.
ii.  The gfs2_write_calc_reserv()/calc_max_reserv() functions are used to
  map between available blocks and the data bytes that can be written
  using them. Typically, for large files, some blocks are used up for
  metadata and only the remaining blocks can be used for data. Example:
  To write only a handful of bytes that would easily fit in one block, we
  might have to allocate an extra bunch of intermediate metadata blocks.
  If we had only 1 block left in our allotted quota, this operation would
  likely fail.

The functions mentioned in ii. are not very efficient. They always compute
the worst case number of extra blocks required and it is often the case that
not all those extra blocks are used. We need to find a better algorithm to
get a tighter estimate on the blocks needed for a given number of bytes.

I've run some basic tests and things seem to be holding up. The failing case
in bz1174295 is fixed using this patchset. I'll do test build and pass it on
to Nate to test with.

Abhi Das (3):
   gfs2: perform quota checks against allocation parameters
   gfs2: allow quota_check and inplace_reserve to return available blocks
   gfs2: allow fallocate to max out quotas/fs efficiently

  fs/gfs2/aops.c   |  6 ++---
  fs/gfs2/bmap.c   |  2 +-
  fs/gfs2/file.c   | 75 ++--
  fs/gfs2/incore.h |  3 ++-
  fs/gfs2/inode.c  | 18 --
  fs/gfs2/quota.c  | 40 ++
  fs/gfs2/quota.h  |  8 +++---
  fs/gfs2/rgrp.c   | 12 +++--
  fs/gfs2/rgrp.h   |  6 +++--
  fs/gfs2/xattr.c  |  2 +-
  10 files changed, 117 insertions(+), 55 deletions(-)



This looks good, but definitely needs careful testing,

Steve.



Re: [Cluster-devel] [GFS2 PATCH] GFS2: Allocate reservation during splice_write

2015-02-19 Thread Steven Whitehouse

Hi,

On 19/02/15 18:24, Bob Peterson wrote:

Hi,

This patch adds a GFS2-specific function for splice_write which
first calls function gfs2_rs_alloc to make sure a reservation
structure has been allocated before attempting to reserve blocks.

Regards,

Bob Peterson
Red Hat File Systems

Signed-off-by: Bob Peterson rpete...@redhat.com

Acked-by: Steven Whitehouse swhit...@redhat.com

Steve.


---
  fs/gfs2/file.c | 20 ++--
  1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index ec9c2d3..bd86e79 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1063,6 +1063,22 @@ static int gfs2_flock(struct file *file, int cmd, struct 
file_lock *fl)
}
  }
  
+static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe,

+ struct file *out, loff_t *ppos,
+ size_t len, unsigned int flags)
+{
+   int error;
+   struct gfs2_inode *ip = GFS2_I(out->f_mapping->host);
+
+   error = gfs2_rs_alloc(ip);
+   if (error)
+   return (ssize_t)error;
+
+   gfs2_size_hint(out, *ppos, len);
+
+   return iter_file_splice_write(pipe, out, ppos, len, flags);
+}
+
  const struct file_operations gfs2_file_fops = {
.llseek = gfs2_llseek,
.read   = new_sync_read,
@@ -1077,7 +1093,7 @@ const struct file_operations gfs2_file_fops = {
.lock   = gfs2_lock,
.flock  = gfs2_flock,
.splice_read= generic_file_splice_read,
-   .splice_write   = iter_file_splice_write,
+   .splice_write   = gfs2_file_splice_write,
.setlease   = simple_nosetlease,
.fallocate  = gfs2_fallocate,
  };
@@ -1107,7 +1123,7 @@ const struct file_operations gfs2_file_fops_nolock = {
.release= gfs2_release,
.fsync  = gfs2_fsync,
.splice_read= generic_file_splice_read,
-   .splice_write   = iter_file_splice_write,
+   .splice_write   = gfs2_file_splice_write,
.setlease   = generic_setlease,
.fallocate  = gfs2_fallocate,
  };




Re: [Cluster-devel] [PATCH] fs: record task name which froze superblock

2015-02-18 Thread Steven Whitehouse

Hi,

On 18/02/15 09:13, Jan Kara wrote:

On Wed 18-02-15 10:34:55, Alexey Dobriyan wrote:

On Mon, Feb 16, 2015 at 10:38:52AM +0100, Jan Kara wrote:

On Sat 14-02-15 21:55:24, Alexey Dobriyan wrote:

Freezing and thawing are separate system calls; the task which is supposed
to thaw the filesystem/superblock can disappear due to a crash, or fail to
thaw due to a bug. Record at least the task name (we can't take a task_struct
reference) to make a support engineer's life easier.

Hopefully 16 bytes per superblock isn't much.

P.S.: Cc'ing GFS2 people just in case they want to correct
my understanding of GFS2 having async freeze code.

Signed-off-by: Alexey Dobriyan adobri...@gmail.com

   Hum, and when do you show the task name? Or do you expect that customer
takes a crashdump and support just finds it in memory?

Yeah, having at least something in crashdump is fine.

   OK, then add a comment about this at the freeze_comm[] definition so that
it's clear it isn't just a set-but-never-read field.
  

--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -518,6 +518,7 @@ static int ioctl_fioasync(unsigned int fd, struct file 
*filp,
  static int ioctl_fsfreeze(struct file *filp)
  {
	struct super_block *sb = file_inode(filp)->i_sb;
+   int rv;
  
  	if (!capable(CAP_SYS_ADMIN))

return -EPERM;
@@ -527,22 +528,31 @@ static int ioctl_fsfreeze(struct file *filp)
return -EOPNOTSUPP;
  
  	/* Freeze */

-   if (sb->s_op->freeze_super)
-   return sb->s_op->freeze_super(sb);
-   return freeze_super(sb);
+   if (sb->s_op->freeze_super) {
+   rv = sb->s_op->freeze_super(sb);
+   if (rv == 0)
+   get_task_comm(sb->s_writers.freeze_comm, current);
+   } else
+   rv = freeze_super(sb);
+   return rv;

   Why don't you just set the name in ioctl_fsfreeze() in both cases?

There are users of freeze_super() in GFS2 unless I'm misreading code.

   Yes, there are. The call in fs/gfs2/glops.c is in a call path from the
->freeze_super() handler for GFS2, so that one is handled in
ioctl_fsfreeze() anyway. The call in fs/gfs2/sys.c is a way to freeze the
filesystem via sysfs (dunno why GFS2 has to invent its own thing and the
ioctl isn't enough). Steven? So having the logic in ioctl_fsfreeze(),
freeze_bdev() and freeze_store() in gfs2 seems to be enough.

The sysfs freeze thing is historical and strongly deprecated - I hope 
that we may be able to remove it one day,


Steve.



Re: [Cluster-devel] [GFS2 PATCH] GFS2: Allocate reservation during write_begin if needed

2015-02-17 Thread Steven Whitehouse

Hi,


Since we set the allocation structure when the write call begins, and it 
is not deallocated until there are no writers left with the file open, 
how does this happen?


Steve.

On 17/02/15 17:09, Bob Peterson wrote:

Hi,

This patch adds a call to function gfs2_rs_alloc to make sure a
reservation structure has been allocated before attempting to
reserve blocks.

Regards,

Bob Peterson
Red Hat File Systems

Signed-off-by: Bob Peterson rpete...@redhat.com
---
  fs/gfs2/aops.c | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 805b37f..6453e23 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -675,6 +675,9 @@ static int gfs2_write_begin(struct file *file, struct 
address_space *mapping,
if (error)
goto out_unlock;
  
+		error = gfs2_rs_alloc(ip);

+   if (error)
+   goto out_qunlock;
requested = data_blocks + ind_blocks;
ap.target = requested;
		error = gfs2_inplace_reserve(ip, &ap);





Re: [Cluster-devel] [GFS2 PATCH] GFS2: Allocate reservation during write_begin if needed

2015-02-17 Thread Steven Whitehouse

Hi,

On 17/02/15 19:09, Bob Peterson wrote:

- Original Message -

Hi,


Since we set the allocation structure when the write call begins, and it
is not deallocated until there are no writers left with the file open,
how does this happen?

Steve.

Hi,

In a normal write, the code goes through gfs2_page_mkwrite or
gfs2_file_aio_write. In the failing scenario, it's going through
sendfile. I suppose I could patch sendfile as an alternative, but
the advantage here is that this patch will do it only if a block
allocation is needed.

Regards,

Bob Peterson
Red Hat File Systems


Ah, I see. In which case that code path should be patched. So it should 
be part of the splice code I think, since it should be done at the 
higher level, and not at the write_begin level, since that is too late. 
We should have a call to the reservation code too at that point, to 
ensure that we don't have fragmentation issues. So we need a wrapper for 
iter_file_splice_write along the lines of gfs2_file_write_iter I think,


Steve.



Re: [Cluster-devel] [GFS2 PATCH 1/3] gfs2: perform quota checks against allocation parameters

2015-02-17 Thread Steven Whitehouse

Hi,

On 16/02/15 17:59, Abhi Das wrote:

Use struct gfs2_alloc_parms as an argument to gfs2_quota_check()
and gfs2_quota_lock_check() to check for quota violations while
accounting for the new blocks requested by the current operation
in ap-target.

Previously, the number of new blocks requested during an operation
was not accounted for during quota_check, which allowed these
operations to exceed quota. This was not very apparent since most
operations allocated only 1 block at a time and quotas would get
violated in the next operation, i.e. the quota excess would only be by
1 block or so. With fallocate (where we allocate a bunch of blocks
at once) the quota excess is non-trivial and is addressed by this
patch.

Resolves: rhbz#1174295
Signed-off-by: Abhi Das a...@redhat.com
---
  fs/gfs2/aops.c   |  6 +++---
  fs/gfs2/bmap.c   |  2 +-
  fs/gfs2/file.c   |  9 +
  fs/gfs2/incore.h |  2 +-
  fs/gfs2/inode.c  | 18 ++
  fs/gfs2/quota.c  |  6 +++---
  fs/gfs2/quota.h  |  8 +---
  fs/gfs2/xattr.c  |  2 +-
  8 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 805b37f..0261126 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -671,12 +671,12 @@ static int gfs2_write_begin(struct file *file, struct 
address_space *mapping,
  
  	if (alloc_required) {

struct gfs2_alloc_parms ap = { .aflags = 0, };
-   error = gfs2_quota_lock_check(ip);
+   requested = data_blocks + ind_blocks;
+   ap.target = requested;
+   error = gfs2_quota_lock_check(ip, &ap);
if (error)
goto out_unlock;
  
-		requested = data_blocks + ind_blocks;

-   ap.target = requested;
		error = gfs2_inplace_reserve(ip, &ap);
if (error)
goto out_qunlock;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index f0b945a..61296ec 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1224,7 +1224,7 @@ static int do_grow(struct inode *inode, u64 size)
  
  	if (gfs2_is_stuffed(ip) &&
	    (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
-   error = gfs2_quota_lock_check(ip);
+   error = gfs2_quota_lock_check(ip, &ap);
if (error)
return error;
  
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c

index 6e600ab..2ea420a 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -429,11 +429,11 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, 
struct vm_fault *vmf)
if (ret)
goto out_unlock;
  
-	ret = gfs2_quota_lock_check(ip);

-   if (ret)
-   goto out_unlock;
	gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks);
	ap.target = data_blocks + ind_blocks;
+   ret = gfs2_quota_lock_check(ip, &ap);
+   if (ret)
+   goto out_unlock;
	ret = gfs2_inplace_reserve(ip, &ap);
if (ret)
goto out_quota_unlock;
@@ -828,7 +828,8 @@ static long __gfs2_fallocate(struct file *file, int mode, 
loff_t offset, loff_t
offset += bytes;
continue;
}
-   error = gfs2_quota_lock_check(ip);
+   ap.target = bytes >> sdp->sd_sb.sb_bsize_shift;
+   error = gfs2_quota_lock_check(ip, &ap);
if (error)
return error;
  retry:
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 7a2dbbc..3a4ea50 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -301,7 +301,7 @@ struct gfs2_blkreserv {
   * to the allocation code.
   */
  struct gfs2_alloc_parms {
-   u32 target;
+   u64 target;
Why does this need to be 64 bits in size? The max size of an rgrp is 
2^32, so there should be no need to represent an extent larger than that,


Steve.


u32 aflags;
  };
  
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c

index 73c72253..08bc84d 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -382,7 +382,7 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags, 
unsigned *dblocks)
struct gfs2_alloc_parms ap = { .target = *dblocks, .aflags = flags, };
int error;
  
-	error = gfs2_quota_lock_check(ip);

+   error = gfs2_quota_lock_check(ip, &ap);
if (error)
goto out;
  
@@ -525,7 +525,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,

int error;
  
  	if (da->nr_blocks) {

-   error = gfs2_quota_lock_check(dip);
+   error = gfs2_quota_lock_check(dip, &ap);
if (error)
goto fail_quota_locks;
  
@@ -953,7 +953,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
  
  	if (da.nr_blocks) {

struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
-   error = gfs2_quota_lock_check(dip);
+   error = gfs2_quota_lock_check(dip, &ap);

Re: [Cluster-devel] [GFS2 PATCH 3/3] gfs2: allow fallocate to max out quotas/fs efficiently

2015-02-17 Thread Steven Whitehouse

Hi,

On 16/02/15 17:59, Abhi Das wrote:

We can quickly get an estimate of how many more blocks are
available for allocation restricted by quota and fs size
respectively using the ap->allowed field in the gfs2_alloc_parms
structure. gfs2_quota_check() and gfs2_inplace_reserve() provide
these values.

By re-trying to allocate what's available instead of guessing, we
can max out quotas or the filesystem efficiently.

Bear in mind that this applies only when the requested fallocate
operation would otherwise error out with -EDQUOT or -ENOSPC without
utilizing all the blocks that might still be available.

Signed-off-by: Abhi Das a...@redhat.com
---
  fs/gfs2/file.c | 18 +++---
  1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 2ea420a..57129fa 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -829,20 +829,24 @@ static long __gfs2_fallocate(struct file *file, int mode, 
loff_t offset, loff_t
continue;
}
		ap.target = bytes >> sdp->sd_sb.sb_bsize_shift;
+   quota_retry:
		error = gfs2_quota_lock_check(ip, &ap);
-   if (error)
+   if (error) {
+   if (error == -EDQUOT && ap.allowed) {
+   bytes = ap.allowed << sdp->sd_sb.sb_bsize_shift;
+   ap.target = ap.allowed;
+   goto quota_retry;
+   }
			return error;
Do you really need to loop here if -EDQUOT is returned and ap.allowed is 
set? Why not just continue after having updated the target value?


Otherwise I think these patches look good,

Steve.


-retry:
+   }
+   retry:
		gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
  
  		ap.target = data_blocks + ind_blocks;

		error = gfs2_inplace_reserve(ip, &ap);
		if (error) {
-   if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
-   bytes >>= 1;
-   bytes &= bsize_mask;
-   if (bytes == 0)
-   bytes = sdp->sd_sb.sb_bsize;
+   if (error == -ENOSPC && ap.allowed) {
+   bytes = ap.allowed << sdp->sd_sb.sb_bsize_shift;
goto retry;
}
goto out_qunlock;




Re: [Cluster-devel] [PATCH] Add myself (Bob Peterson) as a maintainer of GFS2

2015-02-16 Thread Steven Whitehouse

Hi,

On 16/02/15 14:20, Bob Peterson wrote:

Hi,

This patch adds Bob Peterson as a maintainer of the GFS2 file system.
It also changes the development repository to a shared location rather
than Steve Whitehouse's private location.

Regards,

Bob Peterson
Red Hat File Systems

Signed-off-by: Bob Peterson rpete...@redhat.com

Acked-by: Steven Whitehouse swhit...@redhat.com

Steve.


---
  MAINTAINERS | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 249e8dd..eee1680 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4239,10 +4239,10 @@ F:  scripts/get_maintainer.pl
  
  GFS2 FILE SYSTEM

  M:Steven Whitehouse swhit...@redhat.com
+M: Bob Peterson rpete...@redhat.com
  L:cluster-devel@redhat.com
  W:http://sources.redhat.com/cluster/
-T: git 
git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-3.0-fixes.git
-T: git git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-3.0-nmw.git
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/gfs2/linux-gfs2.git
  S:Supported
  F:Documentation/filesystems/gfs2*.txt
  F:fs/gfs2/





Re: [Cluster-devel] [GFS2 PATCH 3/4] gfs2: add new function gfs2_inpl_rsrv_ret_max_avl

2015-02-13 Thread Steven Whitehouse

Hi,

Likewise here, if the available blocks was added to the allocation 
parms, then this would help to neaten up the code a bit. Otherwise all 
four patches look good to me,


Steve.

On 12/02/15 16:54, Abhi Das wrote:

This is a variant of the existing gfs2_inplace_reserve() function.

If the requested number of blocks is not available to be reserved
from any of the rgrps, gfs2_inplace_reserve() returns -ENOSPC.
gfs2_inpl_rsrv_ret_max_avl() will also return the maximum blocks
available in an extra parameter 'max_avail'.

If acceptable to the caller logic, either of these in-place reserve
functions may be called again requesting 'max_avail' blocks to
avoid the -ENOSPC error.

Signed-off-by: Abhi Das a...@redhat.com
---
  fs/gfs2/rgrp.c | 13 +++--
  fs/gfs2/rgrp.h | 10 +-
  2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 9150207..0fa9ae3 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1942,14 +1942,17 @@ static inline int fast_to_acquire(struct gfs2_rgrpd 
*rgd)
  }
  
  /**

- * gfs2_inplace_reserve - Reserve space in the filesystem
+ * gfs2_inpl_rsrv_ret_max_avl - Reserve space in the filesystem
   * @ip: the inode to reserve space for
   * @ap: the allocation parameters
+ * @max_avail: If non-NULL, return the max available extent if -ENOSPC
   *
   * Returns: errno
   */
  
-int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap)

+int gfs2_inpl_rsrv_ret_max_avl(struct gfs2_inode *ip,
+  const struct gfs2_alloc_parms *ap,
+  u32 *max_avail)
  {
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_rgrpd *begin = NULL;
@@ -1958,6 +1961,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const 
struct gfs2_alloc_parms *a
u64 last_unlinked = NO_BLOCK;
int loops = 0;
u32 skip = 0;
+   u32 max_avlbl = 0;
  
  	if (sdp->sd_args.ar_rgrplvb)

flags |= GL_SKIP;
@@ -2030,6 +2034,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const 
struct gfs2_alloc_parms *a
		if (rs->rs_rbm.rgd->rd_free_clone >= ap->target) {
			ip->i_rgd = rs->rs_rbm.rgd;
			return 0;
+   } else if (rs->rs_rbm.rgd->rd_free_clone > max_avlbl) {
+   max_avlbl = rs->rs_rbm.rgd->rd_free_clone;
}
  
  check_rgrp:

@@ -2068,6 +2074,9 @@ next_rgrp:
gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
}
  
+	if (max_avail)

+   *max_avail = max_avlbl;
+
return -ENOSPC;
  }
  
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h

index b104f4a..2adffca 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -41,7 +41,15 @@ extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
  extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
  
  #define GFS2_AF_ORLOV 1

-extern int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct 
gfs2_alloc_parms *ap);
+extern int gfs2_inpl_rsrv_ret_max_avl(struct gfs2_inode *ip,
+ const struct gfs2_alloc_parms *ap,
+ u32 *max_avail);
+static inline int gfs2_inplace_reserve(struct gfs2_inode *ip,
+  const struct gfs2_alloc_parms *ap)
+{
+   return gfs2_inpl_rsrv_ret_max_avl(ip, ap, NULL);
+}
+
  extern void gfs2_inplace_release(struct gfs2_inode *ip);
  
  extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,




[Cluster-devel] [PATCH 4/5] GFS2: use __vmalloc GFP_NOFS for fs-related allocations.

2015-02-10 Thread Steven Whitehouse
From: Oleg Drokin gr...@linuxhacker.ru

leaf_dealloc uses vzalloc as a fallback to kzalloc(GFP_NOFS), so
it clearly does not want any shrinker activity within the fs itself.
Convert vzalloc into __vmalloc(GFP_NOFS|__GFP_ZERO) to better achieve
this goal.

Signed-off-by: Oleg Drokin gr...@linuxhacker.ru
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c5a34f0..6371192 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1896,7 +1896,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 
index, u32 len,
 
ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN);
if (ht == NULL)
-   ht = vzalloc(size);
+   ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO,
+  PAGE_KERNEL);
if (!ht)
return -ENOMEM;
 
-- 
1.8.3.1



[Cluster-devel] [PATCH 5/5] GFS2: Fix crash during ACL deletion in acl max entry check in gfs2_set_acl()

2015-02-10 Thread Steven Whitehouse
From: Andrew Elble awe...@rit.edu

Fixes: e01580bf9e (gfs2: use generic posix ACL infrastructure)
Reported-by: Eric Meddaugh etm...@rit.edu
Tested-by: Eric Meddaugh etm...@rit.edu
Signed-off-by: Andrew Elble awe...@rit.edu
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3088e2a..7b31430 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -73,7 +73,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, 
int type)
 
BUG_ON(name == NULL);
 
-   if (acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode)))
+   if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode)))
return -E2BIG;
 
if (type == ACL_TYPE_ACCESS) {
-- 
1.8.3.1



[Cluster-devel] [PATCH 3/5] GFS2: Eliminate a nonsense goto

2015-02-10 Thread Steven Whitehouse
From: Bob Peterson rpete...@redhat.com

This patch just removes a goto that did nothing.

Signed-off-by: Bob Peterson rpete...@redhat.com
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 9054002..73c72253 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -543,10 +543,7 @@ static int link_dinode(struct gfs2_inode *dip, const 
struct qstr *name,
}
 
	error = gfs2_dir_add(&dip->i_inode, name, ip, da);
-   if (error)
-   goto fail_end_trans;
 
-fail_end_trans:
gfs2_trans_end(sdp);
 fail_ipreserv:
gfs2_inplace_release(dip);
-- 
1.8.3.1



[Cluster-devel] [PATCH 2/5] GFS2: fix sprintf format specifier

2015-02-10 Thread Steven Whitehouse
From: alex chen alex.c...@huawei.com

The sprintf format specifiers %d and %u are mixed up in
gfs2_recovery_done() and freeze_show(), so correct them.

Signed-off-by: Alex Chen alex.c...@huawei.com
Reviewed-by: Joseph Qi joseph...@huawei.com
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 573bd3b..1b64577 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -439,7 +439,7 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, 
unsigned int jid,
 
 ls->ls_recover_jid_done = jid;
 ls->ls_recover_jid_status = message;
-   sprintf(env_jid, "JID=%d", jid);
+   sprintf(env_jid, "JID=%u", jid);
	sprintf(env_status, "RECOVERY=%s",
		message == LM_RD_SUCCESS ? "Done" : "Failed");
 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 3ab566b..ae8e881 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -96,7 +96,7 @@ static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
	struct super_block *sb = sdp->sd_vfs;
	int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1;
 
-   return snprintf(buf, PAGE_SIZE, "%u\n", frozen);
+   return snprintf(buf, PAGE_SIZE, "%d\n", frozen);
 }
 
 static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
-- 
1.8.3.1



[Cluster-devel] GFS2: Pre-pull patch posting

2015-02-10 Thread Steven Whitehouse
This time we have mostly clean ups. There is a bug fix for a NULL dereference
relating to ACLs, and another which improves (but does not fix entirely) an
allocation fall-back code path. The other three patches are small clean ups.

Steve.



[Cluster-devel] GFS2: Pull request (merge window)

2015-02-10 Thread Steven Whitehouse
Hi,

Please consider pulling the following changes,

Steve.

--

This time we have mostly clean ups. There is a bug fix for a NULL dereference
relating to ACLs, and another which improves (but does not fix entirely) an 
allocation fall-back code path. The other three patches are small clean ups.

--
The following changes since commit 11c8f01b423b2d9742ce21e44cb7da7b55429ad5:

  Merge branch 'rc-fixes' of 
git://git.kernel.org/pub/scm/linux/kernel/git/mmarek/kbuild (2015-01-08 
14:35:00 -0800)

are available in the git repository at:


  git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-3.0-nmw.git 
tags/gfs2-merge-window

for you to fetch changes up to 278702074ff77b1a3fa2061267997095959f5e2c:

  GFS2: Fix crash during ACL deletion in acl max entry check in gfs2_set_acl() 
(2015-02-10 10:14:56 +)


This time we have mostly clean ups. There is a bug fix for a NULL dereference
relating to ACLs, and another which improves (but does not fix entirely) an
allocation fall-back code path. The other three patches are small clean
ups.


Andrew Elble (1):
  GFS2: Fix crash during ACL deletion in acl max entry check in 
gfs2_set_acl()

Bob Peterson (2):
  GFS2: Eliminate __gfs2_glock_remove_from_lru
  GFS2: Eliminate a nonsense goto

Oleg Drokin (1):
  GFS2: use __vmalloc GFP_NOFS for fs-related allocations.

alex chen (1):
  GFS2: fix sprintf format specifier

 fs/gfs2/acl.c  |  2 +-
 fs/gfs2/dir.c  |  3 ++-
 fs/gfs2/glock.c| 13 +++--
 fs/gfs2/inode.c|  3 ---
 fs/gfs2/recovery.c |  2 +-
 fs/gfs2/sys.c  |  2 +-
 6 files changed, 8 insertions(+), 17 deletions(-)



signature.asc
Description: This is a digitally signed message part


Re: [Cluster-devel] [PATCH] gfs2: use __vmalloc GFP_NOFS for fs-related allocations.

2015-02-04 Thread Steven Whitehouse

Hi,

On 04/02/15 07:13, Oleg Drokin wrote:

Hello!

On Feb 3, 2015, at 5:33 PM, Dave Chinner wrote:

I also wonder if vmalloc is still very slow? That was the case some
time ago when I noticed a problem in directory access times in gfs2,
which made us change to use kmalloc with a vmalloc fallback in the
first place,

Another of the myths about vmalloc. The speed and scalability of
vmap/vmalloc is a long solved problem - Nick Piggin fixed the worst
of those problems 5-6 years ago - see the rewrite from 2008 that
started with commit db64fe0 (mm: rewrite vmap layer)

This actually might be less true than one would hope. At least somewhat
recent studies by LLNL (https://jira.hpdd.intel.com/browse/LU-4008)
show that there's huge contention on vmlist_lock, so if you have vmalloc
intense workloads, you get penalized heavily. Granted, this is a rhel6 kernel,
but that is still (albeit heavily modified) 2.6.32, which was released at
the end of 2009, way after 2008.
I see that vmlist_lock is gone now, but e.g. vmap_area_lock that is heavily
used is still in place.

So of course with that in place there's every incentive to not use vmalloc
if at all possible. But if used, one would still hopes it would be at least
safe to do even if somewhat slow.

Bye,
 Oleg


I was thinking back to this thread:
https://lkml.org/lkml/2010/4/12/207

More recent than 2008, and although it resulted in a patch that 
apparently fixed the problem, I don't think it was ever applied, on the 
basis that it was too risky and kmalloc was the proper solution 
anyway. I've not tested recently, so it may have been fixed in the 
meantime,


Steve.



Re: [Cluster-devel] [PATCH] gfs2: use __vmalloc GFP_NOFS for fs-related allocations.

2015-02-02 Thread Steven Whitehouse

Hi,

On 02/02/15 08:11, Dave Chinner wrote:

On Mon, Feb 02, 2015 at 01:57:23AM -0500, Oleg Drokin wrote:

Hello!

On Feb 2, 2015, at 12:37 AM, Dave Chinner wrote:


On Sun, Feb 01, 2015 at 10:59:54PM -0500, gr...@linuxhacker.ru wrote:

From: Oleg Drokin gr...@linuxhacker.ru

leaf_dealloc uses vzalloc as a fallback to kzalloc(GFP_NOFS), so
it clearly does not want any shrinker activity within the fs itself.
convert vzalloc into __vmalloc(GFP_NOFS|__GFP_ZERO) to better achieve
this goal.

Signed-off-by: Oleg Drokin gr...@linuxhacker.ru
---
fs/gfs2/dir.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c5a34f0..6371192 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1896,7 +1896,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 
index, u32 len,

ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN);
if (ht == NULL)
-   ht = vzalloc(size);
+   ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO,
+  PAGE_KERNEL);

That, in the end, won't help as vmalloc still uses GFP_KERNEL
allocations deep down in the PTE allocation code. See the hacks in
the DM and XFS code to work around this. i.e. go look for callers of
memalloc_noio_save().  It's ugly and grotesque, but we've got no
other way to limit reclaim context because the MM devs won't pass
the vmalloc gfp context down the stack to the PTE allocations

Hm, interesting.
So all the other code in the kernel that does this sort of thing (and there's 
quite a bit
outside of xfs and ocfs2) would not get the desired effect?

No. I expect, however, that very few people would ever see a
deadlock as a result - it's a pretty rare sort of kernel case to hit
in most cases. XFS does make extensive use of vm_map_ram() in
GFP_NOFS context, however, when large directory block sizes are in
use, and we also have a history of lockdep throwing warnings under
memory pressure. In the end, the memalloc_noio_save() changes were
made to stop the frequent lockdep reports rather than actual
deadlocks.
Indeed, I think the patch is still an improvement however, so I'm happy 
to apply it while a better solution is found.



So, I did some digging in archives and found this thread from 2010 onward with 
various
patches and rants.
Not sure how I missed that before.

Should we have another run at this I wonder?

By all means, but I don't think you'll have any more luck than
anyone else in the past. We've still got the problem of attitude
(vmalloc is not for general use) and making it actually work is
seen as encouraging undesirable behaviour. If you can change
attitudes towards vmalloc first, then you'll be much more likely to
make progress in getting these problems solved



Well I don't know whether it has to be vmalloc that provides the 
solution here... if memory fragmentation could be controlled then 
kmalloc of larger contiguous chunks of memory could be done using that, 
which might be a better solution overall. But I do agree that we need to 
try and come to some kind of solution to this problem as it is one of 
those things that has been rumbling on for a long time without a proper 
solution.


I also wonder if vmalloc is still very slow? That was the case some time 
ago when I noticed a problem in directory access times in gfs2, which 
made us change to use kmalloc with a vmalloc fallback in the first place,


Steve.




Re: [Cluster-devel] [GFS2 PATCH] GFS2: Eliminate a nonsense goto

2015-01-26 Thread Steven Whitehouse

Hi,

Now in the -nmw git tree. Thanks,

Steve.

On 22/01/15 16:11, Bob Peterson wrote:

Hi,

This patch just removes a goto that did nothing.

Regards,

Bob Peterson
Red Hat File Systems

Signed-off-by: Bob Peterson rpete...@redhat.com
---
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 9054002..73c72253 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -543,10 +543,7 @@ static int link_dinode(struct gfs2_inode *dip, const 
struct qstr *name,
}
  
  	error = gfs2_dir_add(&dip->i_inode, name, ip, da);

-   if (error)
-   goto fail_end_trans;
  
-fail_end_trans:

gfs2_trans_end(sdp);
  fail_ipreserv:
gfs2_inplace_release(dip);





Re: [Cluster-devel] GFS2: Move most of the remaining inode.c into ops_inode.c

2015-01-24 Thread Steven Whitehouse

Hi,

On 24/01/15 19:45, Dan Carpenter wrote:

Hello Steven Whitehouse,

The [some really old patch], leads to the following static checker
warning:

fs/gfs2/inode.c:203 gfs2_inode_lookup()
error: passing non negative 13 to ERR_PTR

fs/gfs2/inode.c
167  set_bit(GIF_INVALID, &ip->i_flags);
168  error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, 
&ip->i_iopen_gh);
 
^^
It looks like this function can return GLR_TRYFAILED.  The caller is
only expecting normal ERR_PTRs so it could cause an oops.
This does not request a try lock, so it should never return the 
GLR_TRYFAILED value. I think the checker is perhaps not following the 
code well enough to figure that out?


Steve.


169  if (unlikely(error))
170  goto fail_iopen;
171
172  ip->i_iopen_gh.gh_gl->gl_object = ip;
173  gfs2_glock_put(io_gl);
174  io_gl = NULL;
175
176  if (type == DT_UNKNOWN) {
177  /* Inode glock must be locked already */
178  error = gfs2_inode_refresh(GFS2_I(inode));
179  if (error)
180  goto fail_refresh;
181  } else {
182  inode->i_mode = DT2IF(type);
183  }
184
185  gfs2_set_iop(inode);
186  unlock_new_inode(inode);
187  }
188
189  return inode;
190
191  fail_refresh:
192  ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
193  ip->i_iopen_gh.gh_gl->gl_object = NULL;
194  gfs2_glock_dq_uninit(&ip->i_iopen_gh);
195  fail_iopen:
196  if (io_gl)
197  gfs2_glock_put(io_gl);
198  fail_put:
199  ip->i_gl->gl_object = NULL;
200  gfs2_glock_put(ip->i_gl);
201  fail:
202  iget_failed(inode);
203  return ERR_PTR(error);
204  }

Related:
fs/gfs2/inode.c:203 gfs2_inode_lookup() error: passing non negative 13 to 
ERR_PTR
fs/gfs2/inode.c:218 gfs2_lookup_by_inum() error: passing non negative 13 to 
ERR_PTR
fs/gfs2/inode.c:243 gfs2_lookup_by_inum() error: passing non negative 13 to 
ERR_PTR
fs/gfs2/inode.c:306 gfs2_lookupi() error: passing non negative 13 to ERR_PTR
fs/gfs2/inode.c:324 gfs2_lookupi() error: passing non negative 13 to ERR_PTR
fs/gfs2/inode.c:852 __gfs2_lookup() error: passing non negative 13 to ERR_PTR
fs/gfs2/inode.c:1567 gfs2_follow_link() error: passing non negative 13 to 
ERR_PTR

regards,
dan carpenter




Re: [Cluster-devel] [PATCH] gfs2: fix sprintf format specifier

2015-01-13 Thread Steven Whitehouse

Hi,

On 12/01/15 11:01, alex chen wrote:

The sprintf format specifiers %d and %u are mixed up in
gfs2_recovery_done() and freeze_show(), so correct them.

Signed-off-by: Alex Chen alex.c...@huawei.com
Reviewed-by: Joseph Qi joseph...@huawei.com


Now in the GFS2 -nmw tree. Thanks,

Steve.


---
  fs/gfs2/recovery.c | 2 +-
  fs/gfs2/sys.c  | 2 +-
  2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 573bd3b..1b64577 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -439,7 +439,7 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, 
unsigned int jid,

  ls->ls_recover_jid_done = jid;
  ls->ls_recover_jid_status = message;
-   sprintf(env_jid, "JID=%d", jid);
+   sprintf(env_jid, "JID=%u", jid);
	sprintf(env_status, "RECOVERY=%s",
		message == LM_RD_SUCCESS ? "Done" : "Failed");
  kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 3ab566b..ae8e881 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -96,7 +96,7 @@ static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
	struct super_block *sb = sdp->sd_vfs;
	int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1;

-   return snprintf(buf, PAGE_SIZE, "%u\n", frozen);
+   return snprintf(buf, PAGE_SIZE, "%d\n", frozen);
  }

  static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)




Re: [Cluster-devel] [GFS2 PATCH] GFS2: Eliminate __gfs2_glock_remove_from_lru

2015-01-09 Thread Steven Whitehouse

Hi,

Now in the -nmw tree. Thanks,

Steve.

On 05/01/15 18:25, Bob Peterson wrote:

Hi,

Since the only caller of function __gfs2_glock_remove_from_lru locks the
same spin_lock as gfs2_glock_remove_from_lru, the functions can be combined.

Regards,

Bob Peterson
Red Hat File Systems

Signed-off-by: Bob Peterson rpete...@redhat.com
---
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a23524a..aeb7bc9 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -173,19 +173,14 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
	spin_unlock(&lru_lock);
  }
  
-static void __gfs2_glock_remove_from_lru(struct gfs2_glock *gl)

+static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
  {
+   spin_lock(&lru_lock);
	if (!list_empty(&gl->gl_lru)) {
		list_del_init(&gl->gl_lru);
		atomic_dec(&lru_count);
		clear_bit(GLF_LRU, &gl->gl_flags);
	}
-}
-
-static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
-{
-   spin_lock(&lru_lock);
-   __gfs2_glock_remove_from_lru(gl);
	spin_unlock(&lru_lock);
  }
  
@@ -205,9 +200,7 @@ void gfs2_glock_put(struct gfs2_glock *gl)
  
  	lockref_mark_dead(&gl->gl_lockref);
  
-	spin_lock(&lru_lock);

-   __gfs2_glock_remove_from_lru(gl);
-   spin_unlock(&lru_lock);
+   gfs2_glock_remove_from_lru(gl);
	spin_unlock(&gl->gl_lockref.lock);
	spin_lock_bucket(gl->gl_hash);
	hlist_bl_del_rcu(&gl->gl_list);





Re: [Cluster-devel] [Lsf-pc] [LSF/MM ATTEND] [TOPIC] fs/block interface discussions

2014-12-12 Thread Steven Whitehouse

Hi,

On 11/12/14 00:52, Alasdair G Kergon wrote:

On Wed, Dec 10, 2014 at 07:46:51PM +0100, Jan Kara wrote:

   But still you first need to stop all writes to the filesystem, then do a
sync, and then allow writing again - which is exactly what freeze does.

And with device-mapper, we were asked to support the taking of snapshots
of multiple volumes simultaneously (e.g. where the application data is
stored across more than one filesystem). Thin dm snapshots can handle
this (the original non-thin ones can't).

Alasdair



That's good to know, and a useful feature. One of the issues I can see is 
that because there are a number of different layers involved 
(application/fs/storage), coordination of requirements between those is 
not easy. To try to answer Jan's question earlier in the thread, no, I 
don't know any specific application developers, but I can certainly help 
to propose some kind of solution and then get some feedback. I think it 
is probably going to be easier to start with a specific proposal, albeit 
tentative, and then ask for feedback than to just ask "how should we do 
this?", which is a lot more open ended.


Going back to the other point above regarding freeze, it is not 
necessarily a requirement to stop all writes in order to take a snapshot; 
what is needed is, in effect, a barrier between operations which should be 
represented in the snapshot and those which should not, because they 
happen after the snapshot has been taken. Not that I'm particularly 
attached to that proposal as it stands, but I hope it demonstrates the 
kind of thing I had in mind for discussion. I hope also that it will be 
possible to come up with a better solution during and/or following the 
discussion.


The goal  would really be to figure out which bits we already have, 
which bits are missing, where the problems are, what can be done better, 
and so forth, while we have at least two of the three layers represented 
and in the same room. This is very much something for the long term 
rather than a quick discussion followed by a few patches kind of thing, 
I think,


Steve.



[Cluster-devel] [LSF/MM ATTEND] [TOPIC] fs/block interface discussions

2014-12-10 Thread Steven Whitehouse

Hi,

Here is my request to attend LSF/MM 2015. Since I last attended LSF/MM, 
I now have rather wider interests in fs and storage than just GFS2, 
although I'm still interested in that too. My interests now include many 
other filesystems including NFS, CIFS, ext*, XFS, btrfs and overlayfs, 
as well as the common VFS and block interfaces, among other things.


Here are a few thoughts on topics which might be interesting to discuss:

I'm interested generally in topics related to integration between 
components, one example being snapshots. We have snapshots at various 
different layers (can be done at array level or dm/lvm level and also we 
have filesystem support in the form of fs freezing). There are a few 
thoughts that spring to mind - one being how this should integrate with 
applications - in order to make it easier to use, and another being 
whether we could introduce snapshots which do not require freezing the 
fs (as per btrfs) for other filesystems too - possibly by passing down a 
special kind of flush from the filesystem layer.


A more general topic is proposed changes to the fs/block interface, of 
which the above may possibly be one example. There are a number of 
proposals for new classes of block device, and new features which will 
potentially require a different (or extended) interface at the fs/block 
layer. These have largely been discussed to date as individual features, 
and I wonder whether it might be useful to try and bring together the 
various proposals to see if there is commonality between at least some 
of them at the fs/block interface level. I know that there have been 
discussions going on relating to the individual proposals, so the idea I 
had was to try and look at them from a slightly different angle by 
bringing as many of them as possible together and concentrating on how 
they would be used from a filesystem perspective,


Steve.



Re: [Cluster-devel] [Lsf-pc] [LSF/MM ATTEND] [TOPIC] fs/block interface discussions

2014-12-10 Thread Steven Whitehouse

Hi,

On 10/12/14 12:48, Jan Kara wrote:

   Hi,

On Wed 10-12-14 11:49:48, Steven Whitehouse wrote:

I'm interested generally in topics related to integration between
components, one example being snapshots. We have snapshots at
various different layers (can be done at array level or dm/lvm level
and also we have filesystem support in the form of fs freezing).

   Well, usually snapshots at LVM layer are using fs freezing to get a
consistent image of a filesystem. So these two are integrated AFAICS.


There are a few thoughts that spring to mind - one being how this
should integrate with applications - in order to make it easier to
use, and another being whether we could introduce snapshots which do
not require freezing the fs (as per btrfs) for other filesystems too
- possibly by passing down a special kind of flush from the
filesystem layer.

   So btrfs is special in its COW nature. For filesystems which do updates
in place you can do COW in the block layer (after all, that's what
dm snapshotting does) but you still have to get the fs into a consistent
state (that's fsfreeze), then take a snapshot of the device (by setting up
proper COW structures), and only then can you allow further modifications
of the filesystem by unfreezing it. I don't see a way around that...
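
For reference, the freeze/snapshot/unfreeze sequence Jan describes is what
tools drive from userspace via the FIFREEZE/FITHAW ioctls; a minimal sketch
(error handling trimmed, and the snapshot step left as a placeholder):

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>		/* FIFREEZE, FITHAW */
	#include <unistd.h>

	static int snapshot_one_fs(const char *mntpoint)
	{
		int fd = open(mntpoint, O_RDONLY);
		if (fd < 0)
			return -1;

		ioctl(fd, FIFREEZE, 0);	/* flush and block further writes */
		/* ... take the dm/LVM snapshot of the backing device here ... */
		ioctl(fd, FITHAW, 0);	/* allow writes again */

		close(fd);
		return 0;
	}

The window between FIFREEZE and FITHAW is exactly what the alternative
approach discussed below tries to avoid.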
Well I think it should be possible to get the fs into a consistent state 
without needing to do the freeze/snapshot/unfreeze procedure. Instead we 
might have (there are no doubt other solutions too, so this is just an 
example to get discussion started) an extra flag on a bio, which would 
only be valid with some combination of flush flags. Then it is just a 
case of telling the block layer that we want to do a snapshot, and it 
would then spot the marked bio when the fs sends it down, and know that 
everything before and including that bio should be in the snapshot, and 
everything after that is not. So the fs would do basically a special 
form of sync, setting the flag on the bio when it is consistent - the 
question being how should that then be triggered? It means that there is 
no longer any possibility of having a problem if the unfreeze does not 
happen for any reason.


Perhaps the more important question though, is how it would/could be 
integrated with applications? The ultimate goal that I had in mind is 
that we could have a tool which is run to create a snapshot which will 
with a single command deal with all three (application/fs/block) layers, 
and it should not matter whether the snapshot is done via any particular 
fs or block device, it should work in the same way. So how could we send 
a message to a process to say that a snapshot is about to be taken, and 
to get a message back when the app has produced a consistent set of 
data, and to coordinate between multiple applications using the same 
block device, or even across multiple block devices, being used by a 
single app?




A more general topic is proposed changes to the fs/block interface,
of which the above may possibly be one example. There are a number
of proposals for new classes of block device, and new features which
will potentially require a different (or extended) interface at the
fs/block layer. These have largely been discussed to date as
individual features, and I wonder whether it might be useful to try
and bring together the various proposals to see if there is
commonality between at least some of them at the fs/block interface
level. I know that there have been discussions going on relating to
the individual proposals, so the idea I had was to try and look at
them from a slightly different angle by bringing as many of them as
possible together and concentrating on how they would be used from a
filesystem perspective,

   Could you elaborate on which combination of features you'd like to
discuss?
Honza
Well there are a number that I'm aware of that are currently in 
development, but I suspect that this list is not complete:

 - SMR drives
 - persistent memory (various different types)
 - Hinting from fs to block layer for various different reasons 
(layout, compression, snapshots, anything else?)

 - Better i/o error reporting/recovery
 - copy offload
 - anything I forgot?

Steve.



[Cluster-devel] [PATCH 01/15] GFS2: directly return gfs2_dir_check()

2014-12-08 Thread Steven Whitehouse
From: Fabian Frederick f...@skynet.be

No need to store gfs2_dir_check result and test it before returning.

Signed-off-by: Fabian Frederick f...@skynet.be
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index c4ed823..b41b5c7 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1045,11 +1045,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const 
struct qstr *name,
if (error)
return error;
 
-   error = gfs2_dir_check(&dip->i_inode, name, ip);
-   if (error)
-   return error;
-
-   return 0;
+   return gfs2_dir_check(&dip->i_inode, name, ip);
 }
 
 /**
-- 
1.8.3.1



[Cluster-devel] [PATCH 07/15] GFS2: Update timestamps on fallocate

2014-12-08 Thread Steven Whitehouse
From: Andrew Price anpr...@redhat.com

gfs2_fallocate() wasn't updating ctime and mtime when modifying the
inode. Add a call to file_update_time() to do that.

Signed-off-by: Andrew Price anpr...@redhat.com
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 1182164..6e600ab 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -874,7 +874,8 @@ retry:
 
	if (!(mode & FALLOC_FL_KEEP_SIZE) && (pos + count) > inode->i_size) {
i_size_write(inode, pos + count);
-   mark_inode_dirty(inode);
+   /* Marks the inode as dirty */
+   file_update_time(file);
}
 
return generic_write_sync(file, pos, count);
-- 
1.8.3.1



[Cluster-devel] GFS2: Pre-pull patch posting (merge window)

2014-12-08 Thread Steven Whitehouse
Hi,

In contrast to recent merge windows, there are a number of interesting features
this time. There is a set of patches to improve performance in relation to
block reservations. Some correctness fixes for fallocate, and an update
to the freeze/thaw code which greatly simplifies this code path. In
addition there is a set of clean ups from Al Viro too,

Steve.



[Cluster-devel] [PATCH 10/15] GFS2: Deletion of unnecessary checks before two function calls

2014-12-08 Thread Steven Whitehouse
From: Markus Elfring elfr...@users.sourceforge.net

The functions iput() and put_pid() test whether their argument is NULL
and then return immediately. Thus the test around the call is not needed.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring elfr...@users.sourceforge.net
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 8f0c19d..a23524a 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -836,8 +836,7 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, 
struct gfs2_holder *
	gh->gh_flags = flags;
	gh->gh_iflags = 0;
	gh->gh_ip = _RET_IP_;
-   if (gh->gh_owner_pid)
-   put_pid(gh->gh_owner_pid);
+   put_pid(gh->gh_owner_pid);
	gh->gh_owner_pid = get_pid(task_pid(current));
 }
 
-- 
1.8.3.1



[Cluster-devel] [PATCH 04/15] GFS2: If we use up our block reservation, request more next time

2014-12-08 Thread Steven Whitehouse
From: Bob Peterson rpete...@redhat.com

If we run out of blocks for a given multi-block allocation, we obviously
did not reserve enough. We should reserve more blocks for the next
reservation to reduce fragmentation. This patch increases the size hint
for reservations when they run out.

Signed-off-by: Bob Peterson rpete...@redhat.com
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index f4e4a0c..9150207 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -2251,6 +2251,9 @@ static void gfs2_adjust_reservation(struct gfs2_inode *ip,
trace_gfs2_rs(rs, TRACE_RS_CLAIM);
	if (rs->rs_free && !ret)
		goto out;
+   /* We used up our block reservation, so we should
+  reserve more blocks next time. */
+   atomic_add(RGRP_RSRV_ADDBLKS, &rs->rs_sizehint);
}
__rs_deltree(rs);
}
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 5d8f085..b104f4a 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -20,6 +20,7 @@
  */
 #define RGRP_RSRV_MINBYTES 8
 #define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY))
+#define RGRP_RSRV_ADDBLKS 64
 
 struct gfs2_rgrpd;
 struct gfs2_sbd;
-- 
1.8.3.1



[Cluster-devel] [PATCH 06/15] GFS2: Update i_size properly on fallocate

2014-12-08 Thread Steven Whitehouse
From: Andrew Price anpr...@redhat.com

This addresses an issue caught by fsx where the inode size was not being
updated to the expected value after fallocate(2) with mode 0.

The problem was caused by the offset and len parameters being converted
to multiples of the file system's block size, so i_size would be rounded
up to the nearest block size multiple instead of the requested size.

This replaces the per-chunk i_size updates with a single i_size_write on
successful completion of the operation.  With this patch gfs2 gets
through a complete run of fsx.

For clarity, the check for (error == 0) following the loop is removed as
all failures before that point jump to out_* labels or return.

Signed-off-by: Andrew Price anpr...@redhat.com
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 3786579..1182164 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -729,7 +729,6 @@ static int fallocate_chunk(struct inode *inode, loff_t 
offset, loff_t len,
struct gfs2_inode *ip = GFS2_I(inode);
struct buffer_head *dibh;
int error;
-   loff_t size = len;
unsigned int nr_blks;
	sector_t lblock = offset >> inode->i_blkbits;
 
@@ -763,11 +762,6 @@ static int fallocate_chunk(struct inode *inode, loff_t 
offset, loff_t len,
goto out;
}
}
-   if (offset + size > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE))
-   i_size_write(inode, offset + size);
-
-   mark_inode_dirty(inode);
-
 out:
brelse(dibh);
return error;
@@ -878,9 +872,12 @@ retry:
gfs2_quota_unlock(ip);
}
 
-   if (error == 0)
-   error = generic_write_sync(file, pos, count);
-   return error;
+   if (!(mode & FALLOC_FL_KEEP_SIZE) && (pos + count) > inode->i_size) {
+   i_size_write(inode, pos + count);
+   mark_inode_dirty(inode);
+   }
+
+   return generic_write_sync(file, pos, count);
 
 out_trans_fail:
gfs2_inplace_release(ip);
-- 
1.8.3.1



[Cluster-devel] [PATCH 05/15] GFS2: Use inode_newsize_ok and get_write_access in fallocate

2014-12-08 Thread Steven Whitehouse
From: Andrew Price anpr...@redhat.com

gfs2_fallocate wasn't checking inode_newsize_ok nor get_write_access.
Split out the context setup and inode locking pieces into a separate
function to make it more clear and add these missing calls.

inode_newsize_ok is called conditional on FALLOC_FL_KEEP_SIZE as there
is no need to enforce a file size limit if it isn't going to change.

Signed-off-by: Andrew Price anpr...@redhat.com
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 5ebe568..3786579 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -797,8 +797,7 @@ static void calc_max_reserv(struct gfs2_inode *ip, loff_t 
max, loff_t *len,
}
 }
 
-static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
-  loff_t len)
+static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, 
loff_t len)
 {
struct inode *inode = file_inode(file);
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -812,14 +811,9 @@ static long gfs2_fallocate(struct file *file, int mode, 
loff_t offset,
	loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
	loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
	loff_t max_chunk_size = UINT_MAX & bsize_mask;
-   struct gfs2_holder gh;
  
	next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
  
-   /* We only support the FALLOC_FL_KEEP_SIZE mode */
-   if (mode & ~FALLOC_FL_KEEP_SIZE)
-   return -EOPNOTSUPP;
-
	offset &= bsize_mask;
 
len = next - offset;
@@ -830,17 +824,6 @@ static long gfs2_fallocate(struct file *file, int mode, 
loff_t offset,
if (bytes == 0)
		bytes = sdp->sd_sb.sb_bsize;
 
-   error = gfs2_rs_alloc(ip);
-   if (error)
-   return error;
-
-   mutex_lock(inode-i_mutex);
-
-   gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
-   error = gfs2_glock_nq(&gh);
-   if (unlikely(error))
-   goto out_uninit;
-
	gfs2_size_hint(file, offset, len);
  
	while (len > 0) {
@@ -853,8 +836,7 @@ static long gfs2_fallocate(struct file *file, int mode, 
loff_t offset,
}
error = gfs2_quota_lock_check(ip);
if (error)
-   goto out_unlock;
-
+   return error;
 retry:
gfs2_write_calc_reserv(ip, bytes, data_blocks, ind_blocks);
 
@@ -898,18 +880,58 @@ retry:
 
if (error == 0)
error = generic_write_sync(file, pos, count);
-   goto out_unlock;
+   return error;
 
 out_trans_fail:
gfs2_inplace_release(ip);
 out_qunlock:
gfs2_quota_unlock(ip);
+   return error;
+}
+
+static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t 
len)
+{
+   struct inode *inode = file_inode(file);
+   struct gfs2_inode *ip = GFS2_I(inode);
+   struct gfs2_holder gh;
+   int ret;
+
+   if (mode & ~FALLOC_FL_KEEP_SIZE)
+   return -EOPNOTSUPP;
+
+   mutex_lock(&inode->i_mutex);
+
+   gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+   ret = gfs2_glock_nq(&gh);
+   if (ret)
+   goto out_uninit;
+
+   if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+   (offset + len) > inode->i_size) {
+   ret = inode_newsize_ok(inode, offset + len);
+   if (ret)
+   goto out_unlock;
+   }
+
+   ret = get_write_access(inode);
+   if (ret)
+   goto out_unlock;
+
+   ret = gfs2_rs_alloc(ip);
+   if (ret)
+   goto out_putw;
+
+   ret = __gfs2_fallocate(file, mode, offset, len);
+   if (ret)
+   gfs2_rs_deltree(ip->i_res);
+out_putw:
+   put_write_access(inode);
 out_unlock:
	gfs2_glock_dq(&gh);
 out_uninit:
	gfs2_holder_uninit(&gh);
	mutex_unlock(&inode->i_mutex);
-   return error;
+   return ret;
 }
 
 #ifdef CONFIG_GFS2_FS_LOCKING_DLM
-- 
1.8.3.1



[Cluster-devel] [PATCH 03/15] GFS2: Only increase rs_sizehint

2014-12-08 Thread Steven Whitehouse
From: Bob Peterson rpete...@redhat.com

If an application does a sequence of (1) big write, (2) little write
we don't necessarily want to reset the size hint based on the smaller
size. The fact that they did any big writes implies they may do more,
and therefore we should try to allocate bigger block reservations, even
if the last few were small writes. Therefore this patch changes function
gfs2_size_hint so that the size hint can only grow; it cannot shrink.
This is especially important where there are multiple writers.

Signed-off-by: Bob Peterson rpete...@redhat.com
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 80dd44d..5ebe568 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -337,7 +337,8 @@ static void gfs2_size_hint(struct file *filep, loff_t 
offset, size_t size)
	size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> 
sdp->sd_sb.sb_bsize_shift;
	int hint = min_t(size_t, INT_MAX, blks);
  
-   atomic_set(&ip->i_res->rs_sizehint, hint);
+   if (hint > atomic_read(&ip->i_res->rs_sizehint))
+   atomic_set(&ip->i_res->rs_sizehint, hint);
 }
 
 /**
-- 
1.8.3.1



[Cluster-devel] [PATCH 11/15] GFS2: bugger off early if O_CREAT open finds a directory

2014-12-08 Thread Steven Whitehouse
From: Al Viro v...@zeniv.linux.org.uk

Signed-off-by: Al Viro v...@zeniv.linux.org.uk
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 04065e5..f41b2fd 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -624,6 +624,11 @@ static int gfs2_create_inode(struct inode *dir, struct 
dentry *dentry,
	inode = gfs2_dir_search(dir, &dentry->d_name, !S_ISREG(mode) || excl);
	error = PTR_ERR(inode);
	if (!IS_ERR(inode)) {
+   if (S_ISDIR(inode->i_mode)) {
+   iput(inode);
+   inode = ERR_PTR(-EISDIR);
+   goto fail_gunlock;
+   }
d = d_splice_alias(inode, dentry);
error = PTR_ERR(d);
if (IS_ERR(d)) {
-- 
1.8.3.1



[Cluster-devel] [PATCH 02/15] GFS2: Set of distributed preferences for rgrps

2014-12-08 Thread Steven Whitehouse
From: Bob Peterson rpete...@redhat.com

This patch tries to use the journal numbers to evenly distribute
which node prefers which resource group for block allocations. This
is to help performance.

Signed-off-by: Bob Peterson rpete...@redhat.com
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 39e7e99..1b89918 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -97,6 +97,7 @@ struct gfs2_rgrpd {
 #define GFS2_RDF_CHECK 0x1000 /* check for unlinked inodes */
 #define GFS2_RDF_UPTODATE  0x2000 /* rg is up to date */
 #define GFS2_RDF_ERROR 0x4000 /* error in rg */
+#define GFS2_RDF_PREFERRED 0x8000 /* This rgrp is preferred */
 #define GFS2_RDF_MASK  0xf000 /* mask for internal flags */
spinlock_t rd_rsspin;   /* protects reservation related vars */
struct rb_root rd_rstree;   /* multi-block reservation tree */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 7474c41..f4e4a0c 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -936,7 +936,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
	rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize;
	rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * 
bsize) - 1;
	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
-   rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
+   rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
	if (rgd->rd_data > sdp->sd_max_rg_data)
		sdp->sd_max_rg_data = rgd->rd_data;
	spin_lock(&sdp->sd_rindex_spin);
@@ -955,6 +955,36 @@ fail:
 }
 
 /**
+ * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
+ * @sdp: the GFS2 superblock
+ *
+ * The purpose of this function is to select a subset of the resource groups
+ * and mark them as PREFERRED. We do it in such a way that each node prefers
+ * to use a unique set of rgrps to minimize glock contention.
+ */
+static void set_rgrp_preferences(struct gfs2_sbd *sdp)
+{
+   struct gfs2_rgrpd *rgd, *first;
+   int i;
+
+   /* Skip an initial number of rgrps, based on this node's journal ID.
+  That should start each node out on its own set. */
+   rgd = gfs2_rgrpd_get_first(sdp);
+   for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
+   rgd = gfs2_rgrpd_get_next(rgd);
+   first = rgd;
+
+   do {
+   rgd->rd_flags |= GFS2_RDF_PREFERRED;
+   for (i = 0; i < sdp->sd_journals; i++) {
+   rgd = gfs2_rgrpd_get_next(rgd);
+   if (rgd == first)
+   break;
+   }
+   } while (rgd != first);
+}
+
+/**
  * gfs2_ri_update - Pull in a new resource index from the disk
  * @ip: pointer to the rindex inode
  *
@@ -973,6 +1003,8 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
if (error  0)
return error;
 
+   set_rgrp_preferences(sdp);
+
sdp-sd_rindex_uptodate = 1;
return 0;
 }
@@ -1891,6 +1923,25 @@ static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, 
const struct gfs2_rgrpd *b
 }
 
 /**
+ * fast_to_acquire - determine if a resource group will be fast to acquire
+ *
+ * If this is one of our preferred rgrps, it should be quicker to acquire,
+ * because we tried to set ourselves up as dlm lock master.
+ */
+static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
+{
+   struct gfs2_glock *gl = rgd-rd_gl;
+
+   if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
+   !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
+   !test_bit(GLF_DEMOTE, &gl->gl_flags))
+   return 1;
+   if (rgd->rd_flags & GFS2_RDF_PREFERRED)
+   return 1;
+   return 0;
+}
+
+/**
  * gfs2_inplace_reserve - Reserve space in the filesystem
  * @ip: the inode to reserve space for
  * @ap: the allocation parameters
@@ -1932,10 +1983,15 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const 
struct gfs2_alloc_parms *a
		rg_locked = 0;
		if (skip && skip--)
			goto next_rgrp;
-		if (!gfs2_rs_active(rs) && (loops < 2) &&
-		    gfs2_rgrp_used_recently(rs, 1000) &&
-		    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
-			goto next_rgrp;
+		if (!gfs2_rs_active(rs)) {
+			if (loops == 0 &&
+			    !fast_to_acquire(rs->rs_rbm.rgd))
+				goto next_rgrp;
+			if ((loops < 2) &&
+			    gfs2_rgrp_used_recently(rs, 1000) &&
+			    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
+				goto next_rgrp;
+		}
		error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl

[Cluster-devel] [PATCH 14/15] GFS2: gfs2_dir_get_hash_table(): avoiding deferred vfree() is easy here...

2014-12-08 Thread Steven Whitehouse
From: Al Viro v...@zeniv.linux.org.uk

vfree() is allowed under spinlock these days, but it's cheaper when
it doesn't step into deferred case and here it's very easy to avoid.

Signed-off-by: Al Viro v...@zeniv.linux.org.uk
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c247fed..c5a34f0 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -370,11 +370,12 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode 
*ip)
}
 
	spin_lock(&inode->i_lock);
-	if (ip->i_hash_cache)
-		kvfree(hc);
-	else
+	if (likely(!ip->i_hash_cache)) {
		ip->i_hash_cache = hc;
+		hc = NULL;
+	}
	spin_unlock(&inode->i_lock);
+	kvfree(hc);

	return ip->i_hash_cache;
 }
-- 
1.8.3.1



[Cluster-devel] [PATCH 08/15] fs: add freeze_super/thaw_super fs hooks

2014-12-08 Thread Steven Whitehouse
From: Benjamin Marzinski bmarz...@redhat.com

Currently, freezing a filesystem involves calling freeze_super, which locks
sb->s_umount and then calls the fs-specific freeze_fs hook. This makes it
hard for gfs2 (and potentially other cluster filesystems) to use the vfs
freezing code to do freezes on all the cluster nodes.

In order to communicate that a freeze has been requested, and to make sure
that only one node is trying to freeze at a time, gfs2 uses a glock
(sd_freeze_gl). The problem is that there is no hook for gfs2 to acquire
this lock before calling freeze_super. This means that two nodes can
attempt to freeze the filesystem by both calling freeze_super, acquiring
the sb->s_umount lock, and then attempting to grab the cluster glock
sd_freeze_gl. Only one will succeed, and the other will be stuck in
freeze_super, making it impossible to finish freezing the node.

To solve this problem, this patch adds the freeze_super and thaw_super
hooks.  If a filesystem implements these hooks, they are called instead of
the vfs freeze_super and thaw_super functions. This means that every
filesystem that implements these hooks must call the vfs freeze_super and
thaw_super functions itself within the hook function to make use of the vfs
freezing code.
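A minimal sketch of what such a hook could look like in a cluster filesystem (the my_fs_* names below are hypothetical and only illustrate the rule above; they are not part of this patch):

static int my_fs_freeze_super(struct super_block *sb)
{
	int error;

	error = my_fs_grab_cluster_freeze_lock(sb);	/* hypothetical helper */
	if (error)
		return error;
	error = freeze_super(sb);	/* the vfs function must still be called */
	if (error)
		my_fs_drop_cluster_freeze_lock(sb);	/* hypothetical helper */
	return error;
}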

Reviewed-by: Jan Kara j...@suse.cz
Signed-off-by: Benjamin Marzinski bmarz...@redhat.com
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1d9c9f3..b48c41b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -235,7 +235,10 @@ struct super_block *freeze_bdev(struct block_device *bdev)
sb = get_active_super(bdev);
if (!sb)
goto out;
-   error = freeze_super(sb);
+	if (sb->s_op->freeze_super)
+		error = sb->s_op->freeze_super(sb);
+	else
+		error = freeze_super(sb);
	if (error) {
		deactivate_super(sb);
		bdev->bd_fsfreeze_count--;
@@ -272,7 +275,10 @@ int thaw_bdev(struct block_device *bdev, struct 
super_block *sb)
if (!sb)
goto out;
 
-   error = thaw_super(sb);
+	if (sb->s_op->thaw_super)
+		error = sb->s_op->thaw_super(sb);
+	else
+		error = thaw_super(sb);
	if (error) {
		bdev->bd_fsfreeze_count++;
		mutex_unlock(&bdev->bd_fsfreeze_mutex);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 8ac3fad..77c9a78 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -518,10 +518,12 @@ static int ioctl_fsfreeze(struct file *filp)
return -EPERM;
 
/* If filesystem doesn't support freeze feature, return. */
-	if (sb->s_op->freeze_fs == NULL)
+	if (sb->s_op->freeze_fs == NULL && sb->s_op->freeze_super == NULL)
		return -EOPNOTSUPP;

	/* Freeze */
+	if (sb->s_op->freeze_super)
+		return sb->s_op->freeze_super(sb);
return freeze_super(sb);
 }
 
@@ -533,6 +535,8 @@ static int ioctl_fsthaw(struct file *filp)
return -EPERM;
 
/* Thaw */
+	if (sb->s_op->thaw_super)
+		return sb->s_op->thaw_super(sb);
return thaw_super(sb);
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ab779e..b4a1d73c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1577,7 +1577,9 @@ struct super_operations {
void (*evict_inode) (struct inode *);
void (*put_super) (struct super_block *);
int (*sync_fs)(struct super_block *sb, int wait);
+   int (*freeze_super) (struct super_block *);
int (*freeze_fs) (struct super_block *);
+   int (*thaw_super) (struct super_block *);
int (*unfreeze_fs) (struct super_block *);
int (*statfs) (struct dentry *, struct kstatfs *);
int (*remount_fs) (struct super_block *, int *, char *);
-- 
1.8.3.1



[Cluster-devel] [PATCH 15/15] GFS2: gfs2_atomic_open(): simplify the use of finish_no_open()

2014-12-08 Thread Steven Whitehouse
From: Al Viro v...@zeniv.linux.org.uk

In ->atomic_open(inode, dentry, file, opened) calling finish_no_open(file, NULL)
is equivalent to dget(dentry); return finish_no_open(file, dentry);

No need to open-code that...

Signed-off-by: Al Viro v...@zeniv.linux.org.uk
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 9e8545b..9054002 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1245,11 +1245,8 @@ static int gfs2_atomic_open(struct inode *dir, struct 
dentry *dentry,
if (d != NULL)
dentry = d;
	if (dentry->d_inode) {
-		if (!(*opened & FILE_OPENED)) {
-			if (d == NULL)
-				dget(dentry);
-			return finish_no_open(file, dentry);
-		}
+		if (!(*opened & FILE_OPENED))
+			return finish_no_open(file, d);
dput(d);
return 0;
}
-- 
1.8.3.1



[Cluster-devel] [PATCH 12/15] GFS2: gfs2_create_inode(): don't bother with d_splice_alias()

2014-12-08 Thread Steven Whitehouse
From: Al Viro v...@zeniv.linux.org.uk

Here dentry is always hashed and negative, and inode is non-error, non-NULL
and non-directory.  Under those conditions d_splice_alias() is equivalent to
d_instantiate(dentry, inode) followed by returning NULL, which simplifies the
downstream code and is consistent with the "have to create a new object"
case.

Signed-off-by: Al Viro v...@zeniv.linux.org.uk
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index f41b2fd..9e8545b 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -596,7 +596,6 @@ static int gfs2_create_inode(struct inode *dir, struct 
dentry *dentry,
struct gfs2_inode *dip = GFS2_I(dir), *ip;
	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
struct gfs2_glock *io_gl;
-   struct dentry *d;
int error, free_vfs_inode = 0;
u32 aflags = 0;
unsigned blocks = 1;
@@ -629,22 +628,13 @@ static int gfs2_create_inode(struct inode *dir, struct 
dentry *dentry,
inode = ERR_PTR(-EISDIR);
goto fail_gunlock;
}
-   d = d_splice_alias(inode, dentry);
-   error = PTR_ERR(d);
-   if (IS_ERR(d)) {
-   inode = ERR_CAST(d);
-   goto fail_gunlock;
-   }
+   d_instantiate(dentry, inode);
error = 0;
if (file) {
-		if (S_ISREG(inode->i_mode)) {
-			WARN_ON(d != NULL);
+		if (S_ISREG(inode->i_mode))
error = finish_open(file, dentry, 
gfs2_open_common, opened);
-   } else {
-   error = finish_no_open(file, d);
-   }
-   } else {
-   dput(d);
+   else
+   error = finish_no_open(file, NULL);
}
gfs2_glock_dq_uninit(ghs);
return error;
-- 
1.8.3.1



[Cluster-devel] [PATCH 13/15] GFS2: use kvfree() instead of open-coding it

2014-12-08 Thread Steven Whitehouse
From: Al Viro v...@zeniv.linux.org.uk

Signed-off-by: Al Viro v...@zeniv.linux.org.uk
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 5d4261f..c247fed 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -365,22 +365,15 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode 
*ip)
 
ret = gfs2_dir_read_data(ip, hc, hsize);
	if (ret < 0) {
-   if (is_vmalloc_addr(hc))
-   vfree(hc);
-   else
-   kfree(hc);
+   kvfree(hc);
return ERR_PTR(ret);
}
 
	spin_lock(&inode->i_lock);
-	if (ip->i_hash_cache) {
-		if (is_vmalloc_addr(hc))
-			vfree(hc);
-		else
-			kfree(hc);
-	} else {
+	if (ip->i_hash_cache)
+		kvfree(hc);
+	else
		ip->i_hash_cache = hc;
-	}
	spin_unlock(&inode->i_lock);

	return ip->i_hash_cache;
@@ -396,10 +389,7 @@ void gfs2_dir_hash_inval(struct gfs2_inode *ip)
 {
	__be64 *hc = ip->i_hash_cache;
	ip->i_hash_cache = NULL;
-   if (is_vmalloc_addr(hc))
-   vfree(hc);
-   else
-   kfree(hc);
+   kvfree(hc);
 }
 
 static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent)
@@ -1168,10 +1158,7 @@ fail:
	gfs2_dinode_out(dip, dibh->b_data);
brelse(dibh);
 out_kfree:
-   if (is_vmalloc_addr(hc2))
-   vfree(hc2);
-   else
-   kfree(hc2);
+   kvfree(hc2);
return error;
 }
 
@@ -1302,14 +1289,6 @@ static void *gfs2_alloc_sort_buffer(unsigned size)
return ptr;
 }
 
-static void gfs2_free_sort_buffer(void *ptr)
-{
-   if (is_vmalloc_addr(ptr))
-   vfree(ptr);
-   else
-   kfree(ptr);
-}
-
 static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
  int *copied, unsigned *depth,
  u64 leaf_no)
@@ -1393,7 +1372,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct 
dir_context *ctx,
 out_free:
	for (i = 0; i < leaf; i++)
brelse(larr[i]);
-   gfs2_free_sort_buffer(larr);
+   kvfree(larr);
 out:
return error;
 }
@@ -2004,10 +1983,7 @@ out_rlist:
gfs2_rlist_free(rlist);
gfs2_quota_unhold(dip);
 out:
-   if (is_vmalloc_addr(ht))
-   vfree(ht);
-   else
-   kfree(ht);
+   kvfree(ht);
return error;
 }
 
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 64b29f7..c8b148b 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1360,13 +1360,8 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
 
	gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));

-	if (sdp->sd_quota_bitmap) {
-		if (is_vmalloc_addr(sdp->sd_quota_bitmap))
-			vfree(sdp->sd_quota_bitmap);
-		else
-			kfree(sdp->sd_quota_bitmap);
-		sdp->sd_quota_bitmap = NULL;
-	}
+	kvfree(sdp->sd_quota_bitmap);
+	sdp->sd_quota_bitmap = NULL;
 }
 
 static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
-- 
1.8.3.1



[Cluster-devel] [PATCH 09/15] GFS2: update freeze code to use freeze/thaw_super on all nodes

2014-12-08 Thread Steven Whitehouse
From: Benjamin Marzinski bmarz...@redhat.com

The current gfs2 freezing code is considerably more complicated than it
should be because it doesn't use the vfs freezing code on any node except
the one that begins the freeze.  This is because it needs to acquire a
cluster glock before calling the vfs code to prevent a deadlock, and
without the new freeze_super and thaw_super hooks, that was impossible. To
deal with the issue, gfs2 had to do some hacky locking tricks to make sure
that a frozen node couldn't be holding on to a lock it needed to do the
unfreeze ioctl.

This patch makes use of the new hooks to simplify the gfs2 locking code. Now,
all the nodes in the cluster freeze and thaw in exactly the same way. Every
node in the cluster caches the freeze glock in the shared state.  The new
freeze_super hook allows the freezing node to grab this freeze glock in
the exclusive state without first calling the vfs freeze_super function.
All the nodes in the cluster see this lock change, and call the vfs
freeze_super function. The vfs locking code guarantees that the nodes can't
get stuck holding the glocks necessary to unfreeze the system.  To
unfreeze, the freezing node uses the new thaw_super hook to drop the freeze
glock. Again, all the nodes notice this, reacquire the glock in shared mode
and call the vfs thaw_super function.

Signed-off-by: Benjamin Marzinski bmarz...@redhat.com
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 1cc0bba..fe91951 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -28,6 +28,8 @@
 #include "trans.h"
 #include "dir.h"
 
+struct workqueue_struct *gfs2_freeze_wq;
+
 static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh)
 {
	fs_err(gl->gl_sbd, "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page state 0x%lx\n",
@@ -94,11 +96,8 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
  * on the stack */
tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
tr.tr_ip = _RET_IP_;
-	sb_start_intwrite(sdp->sd_vfs);
-	if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) {
-		sb_end_intwrite(sdp->sd_vfs);
+	if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0)
		return;
-	}
	WARN_ON_ONCE(current->journal_info);
	current->journal_info = &tr;
 
@@ -469,20 +468,19 @@ static void inode_go_dump(struct seq_file *seq, const 
struct gfs2_glock *gl)
 
 static void freeze_go_sync(struct gfs2_glock *gl)
 {
+   int error = 0;
	struct gfs2_sbd *sdp = gl->gl_sbd;
-	DEFINE_WAIT(wait);

	if (gl->gl_state == LM_ST_SHARED &&
	    test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
-		atomic_set(&sdp->sd_log_freeze, 1);
-		wake_up(&sdp->sd_logd_waitq);
-		do {
-			prepare_to_wait(&sdp->sd_log_frozen_wait, &wait,
-					TASK_UNINTERRUPTIBLE);
-			if (atomic_read(&sdp->sd_log_freeze))
-				io_schedule();
-		} while(atomic_read(&sdp->sd_log_freeze));
-		finish_wait(&sdp->sd_log_frozen_wait, &wait);
+		atomic_set(&sdp->sd_freeze_state, SFS_STARTING_FREEZE);
+		error = freeze_super(sdp->sd_vfs);
+		if (error) {
+			printk(KERN_INFO "GFS2: couldn't freeze filesystem: %d\n", error);
+			gfs2_assert_withdraw(sdp, 0);
+		}
+		queue_work(gfs2_freeze_wq, &sdp->sd_freeze_work);
+		gfs2_log_flush(sdp, NULL, FREEZE_FLUSH);
}
 }
 
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index 7455d26..8ed1857 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -12,6 +12,8 @@
 
 #include "incore.h"
 
+extern struct workqueue_struct *gfs2_freeze_wq;
+
 extern const struct gfs2_glock_operations gfs2_meta_glops;
 extern const struct gfs2_glock_operations gfs2_inode_glops;
 extern const struct gfs2_glock_operations gfs2_rgrp_glops;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 1b89918..7a2dbbc 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -588,6 +588,12 @@ enum {
SDF_SKIP_DLM_UNLOCK = 8,
 };
 
+enum gfs2_freeze_state {
+   SFS_UNFROZEN= 0,
+   SFS_STARTING_FREEZE = 1,
+   SFS_FROZEN  = 2,
+};
+
 #define GFS2_FSNAME_LEN256
 
 struct gfs2_inum_host {
@@ -685,6 +691,7 @@ struct gfs2_sbd {
struct gfs2_holder sd_live_gh;
struct gfs2_glock *sd_rename_gl;
struct gfs2_glock *sd_freeze_gl;
+   struct work_struct sd_freeze_work;
wait_queue_head_t sd_glock_wait;
atomic_t sd_glock_disposal;
struct completion sd_locking_init;
@@ -789,6 +796,9 @@ struct gfs2_sbd {
wait_queue_head_t sd_log_flush_wait;
int sd_log_error;
 
+   atomic_t sd_reserving_log;
+   wait_queue_head_t sd_reserving_log_wait;
+
unsigned int sd_log_flush_head

[Cluster-devel] GFS2: Pull request (merge window)

2014-12-08 Thread Steven Whitehouse
Hi,

Please consider pulling the following changes,

Steve.

--

The following changes since commit 0df1f2487d2f0d04703f142813d53615d62a1da4:

  Linux 3.18-rc3 (2014-11-02 15:01:51 -0800)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-3.0-nmw.git 
tags/gfs2-merge-window

for you to fetch changes up to ec7d879c457611e540cb465c25f3040facbd1185:

  GFS2: gfs2_atomic_open(): simplify the use of finish_no_open() (2014-11-20 
11:18:08 +)


In contrast to recent merge windows, there are a number of interesting features
this time. There is a set of patches to improve performance in relation to
block reservations. Some correctness fixes for fallocate, and an update
to the freeze/thaw code which greatly simplifies this code path. In
addition there is a set of clean ups from Al Viro too.


Al Viro (5):
  GFS2: bugger off early if O_CREAT open finds a directory
  GFS2: gfs2_create_inode(): don't bother with d_splice_alias()
  GFS2: use kvfree() instead of open-coding it
  GFS2: gfs2_dir_get_hash_table(): avoiding deferred vfree() is easy here...
  GFS2: gfs2_atomic_open(): simplify the use of finish_no_open()

Andrew Price (3):
  GFS2: Use inode_newsize_ok and get_write_access in fallocate
  GFS2: Update i_size properly on fallocate
  GFS2: Update timestamps on fallocate

Benjamin Marzinski (2):
  fs: add freeze_super/thaw_super fs hooks
  GFS2: update freeze code to use freeze/thaw_super on all nodes

Bob Peterson (3):
  GFS2: Set of distributed preferences for rgrps
  GFS2: Only increase rs_sizehint
  GFS2: If we use up our block reservation, request more next time

Fabian Frederick (1):
  GFS2: directly return gfs2_dir_check()

Markus Elfring (1):
  GFS2: Deletion of unnecessary checks before two function calls

 fs/block_dev.c   |  10 -
 fs/gfs2/dir.c|  39 --
 fs/gfs2/file.c   |  83 --
 fs/gfs2/glock.c  |   3 +-
 fs/gfs2/glops.c  |  26 ++--
 fs/gfs2/glops.h  |   2 +
 fs/gfs2/incore.h |  19 ++---
 fs/gfs2/inode.c  |  72 +
 fs/gfs2/log.c|  42 +--
 fs/gfs2/main.c   |  11 -
 fs/gfs2/ops_fstype.c |  18 +++--
 fs/gfs2/quota.c  |   9 +
 fs/gfs2/rgrp.c   |  69 ---
 fs/gfs2/rgrp.h   |   1 +
 fs/gfs2/super.c  | 112 ++-
 fs/gfs2/super.h  |   1 +
 fs/gfs2/trans.c  |  17 ++--
 fs/ioctl.c   |   6 ++-
 include/linux/fs.h   |   2 +
 19 files changed, 315 insertions(+), 227 deletions(-)





Re: [Cluster-devel] [GFS2 PATCH] GFS2: Write the log descriptor entries in the correct order

2014-11-26 Thread Steven Whitehouse

Hi,

On 26/11/14 14:09, Bob Peterson wrote:

Hi,

I recently discovered GFS2 was writing the log descriptor items in
the journal in reverse order. For example, if I do a bunch of file
creates, the result was a log header that looked something like this:

0x298 (j+ 242): Log descriptor, type 300 (Metadata) len:504, data1: 503
0xa6e9ce di0xa6e9a7 di0xa6e97b di0xa6e942 di
0xa6e911 di0xa6e8d2 di0xa6e898 di0xa6e86e di
0xa6e840 di0xa6e809 di0xa6e7d7 di0xa6e77e di
0xa6e747 di0xa6e721 di0xa6e6f8 di0xa6e6ca di
0xa6e680 di0xa6e63d di0xa6e619 di0xa6e5f4 di
0xa6e5d1 di0xa6e598 di0xa6e520 di0xa6e4f4 di
0xa6e4c1 di0xa6e489 di0xa6e45c di0xa6e427 di

As you can see, the blocks are reverse-sorted. This patch changes the
order so that they're written in sorted order. Log headers written with
the patch look more like this:

0x36e (j+ 318): Log descriptor, type 300 (Metadata) len:504, data1: 503
0x4d5a80 di0x4d5aa0 lf0x4d5aa1 di0x4d5aa2 di
0x4d5aa3 di0x4d5aa4 di0x4d5aa5 di0x4d5aa6 di
0x4d5aa7 di0x4d5aa8 di0x4d5aa9 di0x4d5aaa di
0x4d5aab di0x4d5aac di0x4d5aad di0x4d5aae di
0x4d5aaf di0x4d5ab0 di0x4d5ab1 di0x4d5ab2 di
0x4d5ab3 di0x4d5ab4 di0x4d5ab5 di0x4d5ab6 di
0x4d5ab7 di0x4d5ab8 di0x4d5ab9 di0x4d5aba di

The blocks following the log descriptor are written sequentially, so the
sort order won't affect performance at all, but it makes the journal
easier to decipher for debugging purposes.

Patch description:

This patch changes the order in which the blocks are processed by function
gfs2_before_commit. Before, it was processing them in the wrong order such
that the blocks would be laid out and written in reverse order. This
changes the order to the correct order.

Regards,

Bob Peterson
Red Hat File Systems

Does that mean that this patch got the ordering the wrong way around then?

https://git.kernel.org/cgit/linux/kernel/git/steve/gfs2-3.0-nmw.git/commit/fs/gfs2/lops.c?id=7f63257da1aeea9b2ee68f469b0f9f4a39e5dff8

Steve.



Signed-off-by: Bob Peterson rpete...@redhat.com
---
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 2c1ae86..d644470 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -444,7 +444,7 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, 
unsigned int limit,
ptr = (__be64 *)(ld + 1);
  
  		n = 0;

-   list_for_each_entry_continue(bd1, blist, bd_list) {
+   list_for_each_entry_continue_reverse(bd1, blist, bd_list) {
			*ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
			if (is_databuf) {
				gfs2_check_magic(bd1->bd_bh);
@@ -459,7 +459,7 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, 
unsigned int limit,
gfs2_log_lock(sdp);
  
  		n = 0;

-   list_for_each_entry_continue(bd2, blist, bd_list) {
+   list_for_each_entry_continue_reverse(bd2, blist, bd_list) {
			get_bh(bd2->bd_bh);
			gfs2_log_unlock(sdp);
			lock_buffer(bd2->bd_bh);





Re: [Cluster-devel] [GFS2 PATCH] GFS2: Simplify directory filling, hashing and unhashing functions

2014-11-26 Thread Steven Whitehouse

Hi,

On 19/11/14 13:59, Bob Peterson wrote:

Hi,

This patch implements a new cookie scheme for encoding, decoding and
filling directory entries. The new scheme prevents duplicate cookies,
so it eliminates the need to sort the entries to prevent duplicates.
Thus, it simplifies the code greatly.

Regards,

Bob Peterson
Red Hat File Systems


The bit that I don't follow is how you've managed to remove the sorting 
by hash value. What prevents this from giving the wrong answer if 
entries have been added or removed between readdir calls?


The other issue I can see is if we want to extend the maximum directory 
hash table size at some point, since it appears that we'd have to change 
the encoding in order to do that,


Steve.



Signed-off-by: Bob Peterson rpete...@redhat.com
---
  fs/gfs2/dir.c | 165 +++---
  1 file changed, 66 insertions(+), 99 deletions(-)

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 5d4261f..039479f 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -58,7 +58,6 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/buffer_head.h>
-#include <linux/sort.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
 #include <linux/vmalloc.h>
@@ -80,8 +79,48 @@
  
  #define MAX_RA_BLOCKS 32 /* max read-ahead blocks */
  
-#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
-#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
+/**
+ * Notes about the directory hash table:
+ *
+ * Hash table index is hash >> (32 - dip->i_depth). Max directory depth is 17.
+ * 32 - 17 = 15. The hash value is 32-bits, so 0x1  15 = 0x2.
+ * So the maximum index to the hash table is 0x1.
+ *
+ * At the same time, pages are at most 4K, and GFS2 dirents are 96 bytes.
+ * According to other comments, 99 is the maximum number of entries that can
+ * fit in a single leaf block. However, if you do the math: Take one 4K block,
+ * subtract out the leaf header, so 0x1000 - 0x68 = 0xf98 bytes available.
+ * The minimum dirent, including a 1-character file name is 0x30 bytes long.
+ * So 0xf98 / 0x30 = 0x53, which is 83 decimal. So the theoretical limit is
+ * 83 index entries per leaf block.
+ *
+ * So we can represent the index number with 17 bits, plus the index in 7.
+ * Therefore, we can represent the offset to any dirent by a 32-bit number:
+ * 18-bits for hash table index, 7 for page index, 8 for leaf block extension
+ *
+ *0 1
+ *01234567 89abcdef 01234567 89abcdef
+ *   iooo
+ *leafblk  Hash table index   Offset
+ * Leaf blk:  (0xff)
+ * HTI Max Value:    1(0x1)
+ * Pg offset Max: 111 (0x7f)
+ **/
+
+#define gfs2_dir_offset2pge(p) (((u32)p  24)  0xff) /* leaf extension blk */
+#define gfs2_dir_pge2offset(p) (p  24)
+
+#define gfs2_dir_offset2hti(p) (((u32)p  7)  0x01) /* hash tbl index */
+#define gfs2_dir_hti2offset(p) (p  7)
+
+#define gfs2_dir_offset2pgo(p) ((u32)p  0x7f)   /* leaf blk page offset */
+#define gfs2_dir_pgo2offset(p) (p)
+
+static inline u32 dirloc2offset(u32 page, u32 hti, u32 off)
+{
+   return (gfs2_dir_pge2offset(page) | gfs2_dir_hti2offset(hti) |
+   gfs2_dir_pgo2offset(off));
+}
  
  struct qstr gfs2_qdot __read_mostly;

  struct qstr gfs2_qdotdot __read_mostly;
@@ -1176,48 +1215,6 @@ out_kfree:
  }
  
  /**

- * compare_dents - compare directory entries by hash value
- * @a: first dent
- * @b: second dent
- *
- * When comparing the hash entries of @a to @b:
- *   gt: returns 1
- *   lt: returns -1
- *   eq: returns 0
- */
-
-static int compare_dents(const void *a, const void *b)
-{
-   const struct gfs2_dirent *dent_a, *dent_b;
-   u32 hash_a, hash_b;
-   int ret = 0;
-
-	dent_a = *(const struct gfs2_dirent **)a;
-	hash_a = be32_to_cpu(dent_a->de_hash);
-
-	dent_b = *(const struct gfs2_dirent **)b;
-	hash_b = be32_to_cpu(dent_b->de_hash);
-
-	if (hash_a > hash_b)
-		ret = 1;
-	else if (hash_a < hash_b)
-		ret = -1;
-	else {
-		unsigned int len_a = be16_to_cpu(dent_a->de_name_len);
-		unsigned int len_b = be16_to_cpu(dent_b->de_name_len);
-
-		if (len_a > len_b)
-			ret = 1;
-		else if (len_a < len_b)
-			ret = -1;
-		else
-			ret = memcmp(dent_a + 1, dent_b + 1, len_a);
-   }
-
-   return ret;
-}
-
-/**
   * do_filldir_main - read out directory entries
   * @dip: The GFS2 inode
   * @ctx: what to feed the entries to
@@ -1225,11 +1222,6 @@ static int compare_dents(const void *a, const void *b)
   * @entries: the number of entries in darr
   * @copied: pointer to int that's non-zero if a entry has been copied out
   *
- * Jump through some hoops to make sure that if there 

Re: [Cluster-devel] gfs2: gfs2_dir_get_hash_table(): avoiding deferred vfree() is easy here...

2014-11-20 Thread Steven Whitehouse

Hi,

All five patches now in the GFS2 -nmw tree. Thanks,

Steve.

On 20/11/14 05:19, Al Viro wrote:

vfree() is allowed under spinlock these days, but it's cheaper when
it doesn't step into deferred case and here it's very easy to avoid.

Signed-off-by: Al Viro v...@zeniv.linux.org.uk
---
  fs/gfs2/dir.c |7 ---
  1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c247fed..c5a34f0 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -370,11 +370,12 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode 
*ip)
}
  
	spin_lock(&inode->i_lock);

-	if (ip->i_hash_cache)
-		kvfree(hc);
-	else
+	if (likely(!ip->i_hash_cache)) {
		ip->i_hash_cache = hc;
+		hc = NULL;
+	}
	spin_unlock(&inode->i_lock);
+	kvfree(hc);

	return ip->i_hash_cache;

  }




Re: [Cluster-devel] [PATCH 1/1] GFS2: Deletion of unnecessary checks before two function calls

2014-11-18 Thread Steven Whitehouse

Hi,

Now in the GFS2 -nmw git tree. Thanks,

Steve.

On 18/11/14 10:35, SF Markus Elfring wrote:

From: Markus Elfring elfr...@users.sourceforge.net
Date: Tue, 18 Nov 2014 11:31:23 +0100

The functions iput() and put_pid() test whether their argument is NULL
and then return immediately. Thus the test around the call is not needed.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring elfr...@users.sourceforge.net
---
  fs/gfs2/glock.c  | 3 +--
  fs/gfs2/ops_fstype.c | 3 +--
  2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 7f513b1..f4aa085 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -836,8 +836,7 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, 
struct gfs2_holder *
	gh->gh_flags = flags;
	gh->gh_iflags = 0;
	gh->gh_ip = (unsigned long)__builtin_return_address(0);
-	if (gh->gh_owner_pid)
-		put_pid(gh->gh_owner_pid);
+	put_pid(gh->gh_owner_pid);
	gh->gh_owner_pid = get_pid(task_pid(current));
  }
  
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c

index d3eae24..272ff81 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -918,8 +918,7 @@ fail_qc_i:
  fail_ut_i:
	iput(sdp->sd_sc_inode);
  fail:
-   if (pn)
-   iput(pn);
+   iput(pn);
return error;
  }
  




Re: [Cluster-devel] [PATCH 1/3] gfs2: Use inode_newsize_ok and get_write_access in fallocate

2014-11-17 Thread Steven Whitehouse

Hi,

All three patches now in the -nmw tree. Thanks,

Steve.

On 12/11/14 17:24, Andrew Price wrote:

gfs2_fallocate wasn't checking inode_newsize_ok nor get_write_access.
Split out the context setup and inode locking pieces into a separate
function to make it more clear and add these missing calls.

inode_newsize_ok is called conditional on FALLOC_FL_KEEP_SIZE as there
is no need to enforce a file size limit if it isn't going to change.

Signed-off-by: Andrew Price anpr...@redhat.com
---
  fs/gfs2/file.c | 66 ++
  1 file changed, 44 insertions(+), 22 deletions(-)

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 80dd44d..08ee6ce 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -796,8 +796,7 @@ static void calc_max_reserv(struct gfs2_inode *ip, loff_t 
max, loff_t *len,
}
  }
  
-static long gfs2_fallocate(struct file *file, int mode, loff_t offset,

-  loff_t len)
+static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, 
loff_t len)
  {
struct inode *inode = file_inode(file);
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -811,14 +810,9 @@ static long gfs2_fallocate(struct file *file, int mode, 
loff_t offset,
	loff_t bsize_mask = ~((loff_t)sdp->sd_sb.sb_bsize - 1);
	loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
	loff_t max_chunk_size = UINT_MAX & bsize_mask;
-   struct gfs2_holder gh;
  
	next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
  
-	/* We only support the FALLOC_FL_KEEP_SIZE mode */

-	if (mode & ~FALLOC_FL_KEEP_SIZE)
-		return -EOPNOTSUPP;
-
	offset &= bsize_mask;
  
  	len = next - offset;

@@ -829,17 +823,6 @@ static long gfs2_fallocate(struct file *file, int mode, 
loff_t offset,
if (bytes == 0)
bytes = sdp-sd_sb.sb_bsize;
  
-	error = gfs2_rs_alloc(ip);

-   if (error)
-   return error;
-
-	mutex_lock(&inode->i_mutex);
-
-	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
-	error = gfs2_glock_nq(&gh);
-   if (unlikely(error))
-   goto out_uninit;
-
gfs2_size_hint(file, offset, len);
  
  	while (len  0) {

@@ -852,8 +835,7 @@ static long gfs2_fallocate(struct file *file, int mode, 
loff_t offset,
}
error = gfs2_quota_lock_check(ip);
if (error)
-   goto out_unlock;
-
+   return error;
  retry:
gfs2_write_calc_reserv(ip, bytes, data_blocks, ind_blocks);
  
@@ -897,18 +879,58 @@ retry:
  
  	if (error == 0)

error = generic_write_sync(file, pos, count);
-   goto out_unlock;
+   return error;
  
  out_trans_fail:

gfs2_inplace_release(ip);
  out_qunlock:
gfs2_quota_unlock(ip);
+   return error;
+}
+
+static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t 
len)
+{
+   struct inode *inode = file_inode(file);
+   struct gfs2_inode *ip = GFS2_I(inode);
+   struct gfs2_holder gh;
+   int ret;
+
+	if (mode & ~FALLOC_FL_KEEP_SIZE)
+		return -EOPNOTSUPP;
+
+	mutex_lock(&inode->i_mutex);
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	ret = gfs2_glock_nq(&gh);
+	if (ret)
+		goto out_uninit;
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+	    (offset + len) > inode->i_size) {
+		ret = inode_newsize_ok(inode, offset + len);
+		if (ret)
+			goto out_unlock;
+	}
+
+	ret = get_write_access(inode);
+	if (ret)
+		goto out_unlock;
+
+	ret = gfs2_rs_alloc(ip);
+	if (ret)
+		goto out_putw;
+
+	ret = __gfs2_fallocate(file, mode, offset, len);
+	if (ret)
+		gfs2_rs_deltree(ip->i_res);
+out_putw:
+	put_write_access(inode);
 out_unlock:
	gfs2_glock_dq(&gh);
 out_uninit:
	gfs2_holder_uninit(&gh);
	mutex_unlock(&inode->i_mutex);
-   return error;
+   return ret;
  }
  
  #ifdef CONFIG_GFS2_FS_LOCKING_DLM




Re: [Cluster-devel] [PATCH 0/2 v2] new vfs freeze hooks and gfs2 freeze rewrite

2014-11-17 Thread Steven Whitehouse

Hi,

Both patches are now in the -nmw tree. Thanks,

Steve.

On 14/11/14 02:42, Benjamin Marzinski wrote:

The existing gfs2 freeze code can't properly make use of the kernel vfs
freezing code on all the nodes of a cluster. This is because the only hook
into the gfs2 freeze code comes inside of freeze_super, which holds the
sb->s_umount lock. In order to notify the other nodes in the cluster to
freeze the filesystem, gfs2 must grab a glock. Thus, if two nodes tried to
freeze the filesystem at the same time and all cluster nodes used
freeze_super to enforce the freeze, both nodes would first grab the
sb->s_umount lock in freeze_super, and then they would race to grab the
gfs2 freeze glock. The loser would be stuck in freeze_super, waiting on the
freeze glock. This means that gfs2 can't rely on freezing all the nodes in
the cluster by calling freeze_super, because one node would already be
stuck in that function with sb->s_umount held. So, currently gfs2 only
calls freeze_super on the node that initiates the freeze and deals with
blocking the writes entirely within the gfs2 code. This involves some
pretty hacky code.

To fix this, the first patch of this patchset adds two new vfs hooks,
freeze_super and thaw_super.  If a filesystem implements these hooks, they
will be called instead of the vfs freeze_super and thaw_super functions.
The second patch of this set makes use of these hooks to simplify the gfs2
freezing code. This allows gfs2 to grab the freeze glock first and call the
actual vfs freeze functions in response to a node grabbing the freeze glock.
Thus, all nodes can use the vfs freeze_super and thaw_super code to freeze
the filesystem. This version also fixes some issues with a race between
reserving log space and making the filesystem read-only.

The current hook names were suggested by Jan Kara, over my original
prepare_freeze and prepare_thaw names, which were more confusing.

Benjamin Marzinski (2):
   fs: add freeze_super/thaw_super fs hooks
   gfs2: update freeze code to use freeze/thaw_super on all nodes

  fs/block_dev.c   |  10 -
  fs/gfs2/glops.c  |  26 ++--
  fs/gfs2/glops.h  |   2 +
  fs/gfs2/incore.h |  18 ++---
  fs/gfs2/inode.c  |  40 +-
  fs/gfs2/log.c|  42 +--
  fs/gfs2/main.c   |  11 -
  fs/gfs2/ops_fstype.c |  18 +++--
  fs/gfs2/super.c  | 112 ++-
  fs/gfs2/super.h  |   1 +
  fs/gfs2/trans.c  |  17 ++--
  fs/ioctl.c   |   6 ++-
  include/linux/fs.h   |   2 +
  13 files changed, 176 insertions(+), 129 deletions(-)





Re: [Cluster-devel] [PATCH v5 6/7] fs: pass iocb to generic_write_sync

2014-11-06 Thread Steven Whitehouse

Hi,

On 05/11/14 21:14, Milosz Tanski wrote:

From: Christoph Hellwig h...@lst.de

Clean up the generic_write_sync by just passing an iocb and a bytes
written / negative errno argument.  In addition to simplifying the
callers this also prepares for passing a per-operation O_DSYNC
flag.  Two callers didn't quite fit that scheme:

  - dio_complete didn't bother to update ki_pos as we don't need it
on an iocb that is about to be freed, so we had to add it. Additionally
it also synced out written data in the error case, which has been
changed to operate like the other callers.
  - gfs2 also used generic_write_sync to implement a crude version
of fallocate.  It has been switched to use an open coded variant
instead.
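With that, a typical caller in the write path reduces to roughly the following (sketched from the block_dev.c hunk below, not an additional API):

	ssize_t ret;

	ret = __generic_file_write_iter(iocb, from);
	ret = generic_write_sync(iocb, ret);	/* passes a negative errno straight through */
	return ret;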

Signed-off-by: Christoph Hellwig h...@lst.de

GFS2 bits:

Acked-by: Steven Whitehouse swhit...@redhat.com

I know that Andy Price has some work in this area too, so in due course 
we'll have to be careful not to create a merge conflict here. Copying in 
Andy so he can see the changes,


Steve.


---
  fs/block_dev.c |  8 +---
  fs/btrfs/file.c|  7 ++-
  fs/cifs/file.c |  8 +---
  fs/direct-io.c |  8 ++--
  fs/ext4/file.c |  8 +---
  fs/gfs2/file.c |  9 +++--
  fs/ntfs/file.c |  8 ++--
  fs/udf/file.c  | 11 ++-
  fs/xfs/xfs_file.c  |  8 +---
  include/linux/fs.h |  8 +---
  mm/filemap.c   | 30 --
  11 files changed, 40 insertions(+), 73 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index cc9d411..c529b1c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1568,18 +1568,12 @@ static long block_ioctl(struct file *file, unsigned 
cmd, unsigned long arg)
   */
  ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
  {
-	struct file *file = iocb->ki_filp;
	struct blk_plug plug;
	ssize_t ret;

	blk_start_plug(&plug);
	ret = __generic_file_write_iter(iocb, from);
-	if (ret > 0) {
-		ssize_t err;
-		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
-		if (err < 0)
-			ret = err;
-	}
+	ret = generic_write_sync(iocb, ret);
blk_finish_plug(plug);
return ret;
  }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a18ceab..4f4a6f7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1820,11 +1820,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 */
BTRFS_I(inode)-last_trans = root-fs_info-generation + 1;
BTRFS_I(inode)-last_sub_trans = root-log_transid;
-   if (num_written  0) {
-   err = generic_write_sync(file, pos, num_written);
-   if (err  0)
-   num_written = err;
-   }
+
+   num_written = generic_write_sync(iocb, num_written);
  
  	if (sync)

atomic_dec(BTRFS_I(inode)-sync_writers);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c485afa..32359de 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2706,13 +2706,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
rc = __generic_file_write_iter(iocb, from);
mutex_unlock(inode-i_mutex);
  
-		if (rc  0) {

-   ssize_t err;
-
-   err = generic_write_sync(file, iocb-ki_pos - rc, rc);
-   if (err  0)
-   rc = err;
-   }
+   rc = generic_write_sync(iocb, rc);
} else {
mutex_unlock(inode-i_mutex);
}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e181b6b..b72ac83 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -257,12 +257,8 @@ static ssize_t dio_complete(struct dio *dio, loff_t 
offset, ssize_t ret,
inode_dio_done(dio-inode);
if (is_async) {
		if (dio->rw & WRITE) {
-			int err;
-
-			err = generic_write_sync(dio->iocb->ki_filp, offset,
-						 transferred);
-			if (err < 0 && ret > 0)
-				ret = err;
+			dio->iocb->ki_pos = offset + transferred;
+			ret = generic_write_sync(dio->iocb, ret);
}
  
  		aio_complete(dio-iocb, ret, 0);

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index aca7b24..79b000c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -175,13 +175,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter 
*from)
ret = __generic_file_write_iter(iocb, from);
mutex_unlock(inode-i_mutex);
  
-	if (ret  0) {

-   ssize_t err;
-
-   err = generic_write_sync(file, iocb-ki_pos - ret, ret);
-   if (err  0)
-   ret = err;
-   }
+   ret = generic_write_sync(iocb, ret);
if (o_direct)
blk_finish_plug(plug);
  
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c

index 80dd44d

Re: [Cluster-devel] [GFS2 PATCH 1/3] GFS2: Set of distributed preferences for rgrps

2014-10-28 Thread Steven Whitehouse

Hi,

On 27/10/14 14:07, Bob Peterson wrote:

- Original Message -

+	if ((loops < 3) &&
+	    gfs2_rgrp_used_recently(rs, 1000) &&
+	    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
+		goto next_rgrp;
+   }

This makes no sense: we end the loop when loops == 3, so these
conditions would be applied in every case, which is not what we want. We
must always end up doing a search of every rgrp in the worst case, so
that if there is some space left somewhere, we will eventually
find it.

Definitely better wrt figuring out which rgrps to prefer, but I'm not
yet convinced about this logic. The whole point of the congestion logic
is to figure out ahead of time, whether it will take a long time to
access that rgrp, so it seems that this is not quite right, otherwise
there should be no need to bypass it like this. The fast_to_acquire
logic should at least by merged into the rgrp_congested logic, possibly
by just reducing the threshold at which congestion is measured.

It might be useful to introduce a tracepoint for when we reject and rgrp
during allocation, with a reason as to why it was rejected, so that it
is easier to see whats going on here,

Steve.

Hi,

Sigh. You're right: Good catch. The problem is that I've done more than 30
attempts at trying to get this right in my git tree, each of which has
up to 15 patches for various things. Somewhere around iteration 20, I
dropped an important change. My intent was always to add another layer
of rgrp criteria, so loops would be 4 rather than 3. I had done this
with a different patch, but it got dropped by accident. The 3 original
layers are as follows:

loop 0: Reject rgrps that are likely congested (based on past history)
 and rgrps where we just found congestion.
 Only accept rgrps for which we can get a multi-block reservation.
loop 1: Reject rgrps that are likely congested (based on past history)
 and rgrps where we just found congestion. Accept rgrps that have
 enough free space, even if we can't get a reservation.
loop 2: Don't ever reject rgrps because we're out of ideal conditions.

That is not how it is supposed to work. Loop 0 is the one where we try 
to avoid rgrps which are congested. Loops 1 and 2 are the same in that 
both are supposed to do a full scan of the rgrps. The only reason for 
loop 2 is that we flush out any unlinked inodes that may have 
accumulated between loop 1 and loop 2, but otherwise they should be 
identical.



The new scheme was intended to add a new layer 0 which only accepts rgrps
within a preferred subset of rgrps. In other words:

loop 0: Reject rgrps that aren't in our preferred subset of rgrps.
 Reject rgrps that are likely congested (based on past history)
 and rgrps where we just found congestion.
 Only accept rgrps for which we can get a multi-block reservation.
loop 1: Reject rgrps that are likely congested (based on past history)
 and rgrps where we just found congestion.
 Only accept rgrps for which we can get a multi-block reservation.
loop 2: Reject rgrps that are likely congested (based on past history)
 and rgrps where we just found congestion. Accept any rgrp that has
 enough free space, even if we can't get a reservation.
loop 3: Don't ever reject rgrps because we're out of ideal conditions.
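A rough sketch of that layering (the helper names here are placeholders, not the actual gfs2 functions):

static bool rgrp_acceptable(struct gfs2_rgrpd *rgd, int loops)
{
	if (loops == 0 && !rgrp_is_preferred(rgd))
		return false;	/* loop 0: preferred subset only */
	if (loops <= 2 && rgrp_looks_congested(rgd, loops))
		return false;	/* loops 0-2: skip congested rgrps */
	if (loops <= 1 && !rgrp_can_reserve(rgd))
		return false;	/* loops 0-1: need a multi-block reservation */
	if (loops == 2 && !rgrp_has_free_space(rgd))
		return false;	/* loop 2: enough free space is sufficient */
	return true;		/* loop 3: never reject */
}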

But is 4 loops too many? I could combine 0 and 1, and in fact, today's code
accidentally does just that. The mistake was probably that I had been
experimenting with 3 versus 4 layers and had switched them back and forth
a few times for various tests.

Regards,

Bob Peterson
Red Hat File Systems
Yes, definitely too many. If we are looping that many times, it suggests 
that something is wrong with the way in which we are searching for 
rgrps. It would be better to use fewer loops if at all possible, rather 
than more, since this looping will be very slow,


Steve.



Re: [Cluster-devel] [GFS2 PATCH 1/3] GFS2: Set of distributed preferences for rgrps

2014-10-27 Thread Steven Whitehouse

Hi,

On 24/10/14 18:49, Bob Peterson wrote:

This patch tries to use the journal numbers to evenly distribute
which node prefers which resource group for block allocations. This
is to help performance.
---
  fs/gfs2/incore.h |  1 +
  fs/gfs2/rgrp.c   | 66 +++-
  2 files changed, 62 insertions(+), 5 deletions(-)

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 39e7e99..1b89918 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -97,6 +97,7 @@ struct gfs2_rgrpd {
  #define GFS2_RDF_CHECK0x1000 /* check for unlinked inodes 
*/
  #define GFS2_RDF_UPTODATE 0x2000 /* rg is up to date */
  #define GFS2_RDF_ERROR0x4000 /* error in rg */
+#define GFS2_RDF_PREFERRED 0x8000 /* This rgrp is preferred */
  #define GFS2_RDF_MASK 0xf000 /* mask for internal flags */
spinlock_t rd_rsspin;   /* protects reservation related vars */
struct rb_root rd_rstree;   /* multi-block reservation tree */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 7474c41..f65e56b 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -936,7 +936,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
	rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize;
	rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1;
	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
-	rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
+	rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
	if (rgd->rd_data > sdp->sd_max_rg_data)
		sdp->sd_max_rg_data = rgd->rd_data;
	spin_lock(&sdp->sd_rindex_spin);
@@ -955,6 +955,36 @@ fail:
  }
  
  /**

+ * set_rgrp_preferences - Run all the rgrps, selecting some we prefer to use
+ * @sdp: the GFS2 superblock
+ *
+ * The purpose of this function is to select a subset of the resource groups
+ * and mark them as PREFERRED. We do it in such a way that each node prefers
+ * to use a unique set of rgrps to minimize glock contention.
+ */
+static void set_rgrp_preferences(struct gfs2_sbd *sdp)
+{
+   struct gfs2_rgrpd *rgd, *first;
+   int i;
+
+   /* Skip an initial number of rgrps, based on this node's journal ID.
+  That should start each node out on its own set. */
+   rgd = gfs2_rgrpd_get_first(sdp);
+	for (i = 0; i < sdp->sd_lockstruct.ls_jid; i++)
+		rgd = gfs2_rgrpd_get_next(rgd);
+	first = rgd;
+
+	do {
+		rgd->rd_flags |= GFS2_RDF_PREFERRED;
+		for (i = 0; i < sdp->sd_journals; i++) {
+   rgd = gfs2_rgrpd_get_next(rgd);
+   if (rgd == first)
+   break;
+   }
+   } while (rgd != first);
+}
+
+/**
   * gfs2_ri_update - Pull in a new resource index from the disk
   * @ip: pointer to the rindex inode
   *
@@ -973,6 +1003,8 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
if (error  0)
return error;
  
+	set_rgrp_preferences(sdp);

+
sdp-sd_rindex_uptodate = 1;
return 0;
  }
@@ -1891,6 +1923,25 @@ static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, 
const struct gfs2_rgrpd *b
  }
  
  /**

+ * fast_to_acquire - determine if a resource group will be fast to acquire
+ *
+ * If this is one of our preferred rgrps, it should be quicker to acquire,
+ * because we tried to set ourselves up as dlm lock master.
+ */
+static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
+{
+	struct gfs2_glock *gl = rgd->rd_gl;
+
+	if (gl->gl_state != LM_ST_UNLOCKED && list_empty(&gl->gl_holders) &&
+	    !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) &&
+	    !test_bit(GLF_DEMOTE, &gl->gl_flags))
+		return 1;
+	if (rgd->rd_flags & GFS2_RDF_PREFERRED)
+   return 1;
+   return 0;
+}
+
+/**
   * gfs2_inplace_reserve - Reserve space in the filesystem
   * @ip: the inode to reserve space for
   * @ap: the allocation parameters
@@ -1932,10 +1983,15 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const 
struct gfs2_alloc_parms *a
rg_locked = 0;
		if (skip && skip--)
			goto next_rgrp;
-		if (!gfs2_rs_active(rs) && (loops < 2) &&
-		    gfs2_rgrp_used_recently(rs, 1000) &&
-		    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
-			goto next_rgrp;
+		if (!gfs2_rs_active(rs)) {
+			if (loops == 0 &&
+			    !fast_to_acquire(rs->rs_rbm.rgd))
+				goto next_rgrp;
+			if ((loops < 3) &&
+			    gfs2_rgrp_used_recently(rs, 1000) &&
+			    gfs2_rgrp_congested(rs->rs_rbm.rgd, loops))
+				goto next_rgrp;
+  

Re: [Cluster-devel] [PATCH][try6] VFS: new want_holesize and got_holesize buffer_head flags for fiemap

2014-10-22 Thread Steven Whitehouse

Hi,

On 22/10/14 07:04, Christoph Hellwig wrote:

This looks like a big indicator that get_blocks is the wrong
interface for fiemap.



The question then is what a better interface would look like. We could 
have get_hole as an extra operation, I suppose. I'm not sure that I really 
see why that's better or worse than the extra flag at the moment - or did 
you have something else in mind?


Steve.



Re: [Cluster-devel] [GFS2 PATCH 2/4] GFS2: Make block reservations more persistent

2014-10-21 Thread Steven Whitehouse

Hi,

On 20/10/14 17:37, Bob Peterson wrote:

Before this patch, whenever a struct file (opened to allow writes) was
closed, the multi-block reservation structure associated with the inode
was deleted. That's a problem, especially when there are multiple writers.
Applications that do open-write-close will suffer from greater levels
of fragmentation and need to re-do work to perform write operations.
This patch removes the reservation delete from the file close code so
that they're more persistent until the inode is deleted.
---
  fs/gfs2/file.c | 7 ---
  1 file changed, 7 deletions(-)
This doesn't seem like a good plan. If you run something like untar, 
does that now leave gaps in the allocations? If there are applications 
which do open/write/close in a loop, then it seems like it is the 
application that needs to be changed, rather than the filesystem,


Steve.


diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 7f4ed3d..2976019 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -616,15 +616,8 @@ static int gfs2_open(struct inode *inode, struct file 
*file)
  
  static int gfs2_release(struct inode *inode, struct file *file)

  {
-   struct gfs2_inode *ip = GFS2_I(inode);
-
	kfree(file->private_data);
	file->private_data = NULL;
-
-	if (!(file->f_mode & FMODE_WRITE))
-		return 0;
-
-	gfs2_rs_delete(ip, &inode->i_writecount);
return 0;
  }
  




Re: [Cluster-devel] [GFS2 PATCH] GFS2: Block reservation doubling scheme

2014-10-10 Thread Steven Whitehouse

Hi,

On 10/10/14 04:39, Bob Peterson wrote:

- Original Message -

- Original Message -

This patch introduces a new block reservation doubling scheme. If we

Maybe I sent this patch out prematurely. Instead of doubling the
reservation, maybe I should experiment with making it grow additively.
IOW, Instead of 32-64-128-256-512, I should use:
32-64-96-128-160-192-224-etc...
I know other file systems using doubling schemes, but I'm concerned
about it being too aggressive.

I tried an additive reservations algorithm. I basically changed the
previous patch from doubling the reservation to adding 32 blocks.
In other words, I replaced:

+   ip->i_rsrv_minblks <<= 1;
with this:
+   ip->i_rsrv_minblks += RGRP_RSRV_MINBLKS;
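A standalone illustration of the two progressions being compared (the start value and step count are assumed):

#include <stdio.h>

int main(void)
{
	unsigned int doubling = 32, additive = 32, i;

	for (i = 0; i < 5; i++, doubling <<= 1, additive += 32)
		printf("step %u: doubling %u, additive %u\n",
		       i, doubling, additive);
	return 0;
}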

The results were not as good, but still very impressive, and maybe
acceptable:

Reservation doubling scheme:
EXTENT COUNT FOR OUTPUT FILES =  310103
EXTENT COUNT FOR OUTPUT FILES =  343990
EXTENT COUNT FOR OUTPUT FILES =  332818
EXTENT COUNT FOR OUTPUT FILES =  336852
EXTENT COUNT FOR OUTPUT FILES =  334820

Reservation additive scheme (32 blocks):
EXTENT COUNT FOR OUTPUT FILES =  322406
EXTENT COUNT FOR OUTPUT FILES =  341665
EXTENT COUNT FOR OUTPUT FILES =  341769
EXTENT COUNT FOR OUTPUT FILES =  348676
EXTENT COUNT FOR OUTPUT FILES =  348079

So I'm looking for opinions:
(a) Stick with the original reservation doubling patch, or
(b) Go with the additive version.
(c) Any other ideas?

Regards,

Bob Peterson
Red Hat File Systems


I think you are very much along the right lines. The issue is to ensure 
that all the evidence that is available is taken into account in 
figuring out how large a reservation to make. There are various clues, 
such as the time between writes, the size of the writes, whether the 
file gets closed between writes, whether the writes are contiguous and 
so forth.
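A sketch of the kind of per-writer context that could carry those clues (the struct and field names here are hypothetical, not existing gfs2 code):

struct write_hint_ctx {				/* per open file */
	loff_t		prev_write_end;		/* detects contiguous writes */
	ktime_t		prev_write_time;	/* time between writes */
	size_t		prev_write_size;	/* size of the writes */
	unsigned int	reservation_blocks;	/* current reservation size hint */
};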


Some of those things are taken into account already, however we can 
probably do better. We may be able to also take some hints from things 
like calls to fsync (should we drop reservations that are small at this 
point, since it likely signifies a significant point in the file, if 
fsync is called?) or even detect well known non-linear write patterns, 
e.g. backwards stride patterns or large matrix access patterns (by row 
or column).


The struct file is really the best place to store this context 
information, since if there are multiple writers to the same inode, then 
there is a fair chance that they'll have separate struct files. Does 
this happen in your test workload?


The readahead code can already detect some common read patterns, and it 
also turns itself off if the reads are random. The readahead problem is 
actually very much the same problem in that it tries to estimate which 
reads are coming next based on the context that has been seen already, 
so there may well be some lessons to be learned from that too.


I think its important to look at the statistics of lots of different 
workloads, and to check them off against your candidate algorithm(s), to 
ensure that the widest range of potential access patterns are taken into 
account,


Steve.



Re: [Cluster-devel] [PATCH 1/1] GFS2: use _RET_IP_ instead of (unsigned long)__builtin_return_address(0)

2014-10-08 Thread Steven Whitehouse

Hi,

Apologies for taking so long - I've added this into the -nmw tree now. 
Thanks,


Steve.

On 03/10/14 19:15, Fabian Frederick wrote:

use macro definition

Signed-off-by: Fabian Frederick f...@skynet.be
---
  fs/gfs2/glock.c | 4 ++--
  fs/gfs2/glops.c | 2 +-
  fs/gfs2/trans.c | 2 +-
  3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 7f513b1..8f0c19d 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -811,7 +811,7 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int 
state, unsigned flags,
  {
	INIT_LIST_HEAD(&gh->gh_list);
	gh->gh_gl = gl;
-	gh->gh_ip = (unsigned long)__builtin_return_address(0);
+	gh->gh_ip = _RET_IP_;
	gh->gh_owner_pid = get_pid(task_pid(current));
	gh->gh_state = state;
	gh->gh_flags = flags;
@@ -835,7 +835,7 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *
	gh->gh_state = state;
	gh->gh_flags = flags;
	gh->gh_iflags = 0;
-	gh->gh_ip = (unsigned long)__builtin_return_address(0);
+	gh->gh_ip = _RET_IP_;
	if (gh->gh_owner_pid)
		put_pid(gh->gh_owner_pid);
	gh->gh_owner_pid = get_pid(task_pid(current));
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 2ffc67d..1cc0bba 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -93,7 +93,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
   * tr-alloced is not set since the transaction structure is
   * on the stack */
tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
-   tr.tr_ip = (unsigned long)__builtin_return_address(0);
+   tr.tr_ip = _RET_IP_;
	sb_start_intwrite(sdp->sd_vfs);
	if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) {
		sb_end_intwrite(sdp->sd_vfs);
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 0546ab4..42bfd336 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -44,7 +44,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int 
blocks,
if (!tr)
return -ENOMEM;
  
-	tr->tr_ip = (unsigned long)__builtin_return_address(0);
+	tr->tr_ip = _RET_IP_;
	tr->tr_blocks = blocks;
	tr->tr_revokes = revokes;
	tr->tr_reserved = 1;




[Cluster-devel] [PATCH 2/4] GFS2: Make rename not save dirent location

2014-10-08 Thread Steven Whitehouse
From: Bob Peterson rpete...@redhat.com

This patch fixes a regression in the patch GFS2: Remember directory
insert point, commit 2b47dad866d04f14c328f888ba5406057b8c7d33.
The problem had to do with the rename function: The function found
space for the new dirent, and remembered that location. But then the
old dirent was removed, which often moved the eligible location for
the renamed dirent. Putting the new dirent at the saved location
caused file system corruption.

This patch adds a new save_loc variable to struct gfs2_diradd.
If 1, the dirent location is saved. If 0, the dirent location is not
saved and the buffer_head is released as per previous behavior.

Signed-off-by: Bob Peterson rpete...@redhat.com
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 1a349f9..5d4261f 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -2100,8 +2100,13 @@ int gfs2_diradd_alloc_required(struct inode *inode, 
const struct qstr *name,
}
if (IS_ERR(dent))
return PTR_ERR(dent);
-	da->bh = bh;
-	da->dent = dent;
+
+	if (da->save_loc) {
+		da->bh = bh;
+		da->dent = dent;
+   } else {
+   brelse(bh);
+   }
return 0;
 }
 
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 126c65d..e1b309c 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -23,6 +23,7 @@ struct gfs2_diradd {
unsigned nr_blocks;
struct gfs2_dirent *dent;
struct buffer_head *bh;
+   int save_loc;
 };
 
 extern struct inode *gfs2_dir_search(struct inode *dir,
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 9516f5c..fcf42ea 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -600,7 +600,7 @@ static int gfs2_create_inode(struct inode *dir, struct 
dentry *dentry,
int error, free_vfs_inode = 0;
u32 aflags = 0;
unsigned blocks = 1;
-   struct gfs2_diradd da = { .bh = NULL, };
+   struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };
 
if (!name-len || name-len  GFS2_FNAMESIZE)
return -ENAMETOOLONG;
@@ -900,7 +900,7 @@ static int gfs2_link(struct dentry *old_dentry, struct 
inode *dir,
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_holder ghs[2];
struct buffer_head *dibh;
-   struct gfs2_diradd da = { .bh = NULL, };
+   struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };
int error;
 
if (S_ISDIR(inode-i_mode))
@@ -1338,7 +1338,7 @@ static int gfs2_rename(struct inode *odir, struct dentry 
*odentry,
struct gfs2_rgrpd *nrgd;
unsigned int num_gh;
int dir_rename = 0;
-   struct gfs2_diradd da = { .nr_blocks = 0, };
+   struct gfs2_diradd da = { .nr_blocks = 0, .save_loc = 0, };
unsigned int x;
int error;
 
-- 
1.8.3.1



[Cluster-devel] GFS2: Pre-pull patch posting (merge window)

2014-10-08 Thread Steven Whitehouse
Hi,

Not a huge amount this time... just four patches. This time we have a couple
of bug fixes, one relating to bad i_goal values which are now ignored (i_goal
is basically a hint so it is safe to do this) and another relating to the
saving of the dirent location during rename. There is one performance
improvement, which is an optimisation in rgblk_free so that multiple block
deallocations will now be more efficient, and one clean up patch to use
_RET_IP_ rather than writing it out longhand,

Steve.




[Cluster-devel] [PATCH 3/4] GFS2: Use gfs2_rbm_incr in rgblk_free

2014-10-08 Thread Steven Whitehouse
From: Bob Peterson rpete...@redhat.com

This patch speeds up GFS2 unlink operations by using function
gfs2_rbm_incr rather than continuously calculating the rbm.

Signed-off-by: Bob Peterson rpete...@redhat.com
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 55ef72d..7474c41 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -2097,7 +2097,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
				     u32 blen, unsigned char new_state)
 {
	struct gfs2_rbm rbm;
-	struct gfs2_bitmap *bi;
+	struct gfs2_bitmap *bi, *bi_prev = NULL;
 
	rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1);
	if (!rbm.rgd) {
@@ -2106,18 +2106,22 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
		return NULL;
	}
 
+	gfs2_rbm_from_block(&rbm, bstart);
	while (blen--) {
-		gfs2_rbm_from_block(&rbm, bstart);
		bi = rbm_bi(&rbm);
-		bstart++;
-		if (!bi->bi_clone) {
-			bi->bi_clone = kmalloc(bi->bi_bh->b_size,
-					       GFP_NOFS | __GFP_NOFAIL);
-			memcpy(bi->bi_clone + bi->bi_offset,
-			       bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
+		if (bi != bi_prev) {
+			if (!bi->bi_clone) {
+				bi->bi_clone = kmalloc(bi->bi_bh->b_size,
+						       GFP_NOFS | __GFP_NOFAIL);
+				memcpy(bi->bi_clone + bi->bi_offset,
+				       bi->bi_bh->b_data + bi->bi_offset,
+				       bi->bi_len);
+			}
+			gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
+			bi_prev = bi;
		}
-		gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
		gfs2_setbit(&rbm, false, new_state);
+		gfs2_rbm_incr(&rbm);
	}
 
	return rbm.rgd;
-- 
1.8.3.1



[Cluster-devel] GFS2: Pull request (merge window)

2014-10-08 Thread Steven Whitehouse
Hi,

Please consider pulling the following changes,

Steve.

-
The following changes since commit 37504a3be90b69438426d74ccf467a9fe192932b:

  Merge tag 'gfs2-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-3.0-fixes (2014-09-16 07:47:04 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-3.0-nmw.git tags/gfs2-merge-window

for you to fetch changes up to d29c0afe4db72ceb72149c3894a6079674e9751f:

  GFS2: use _RET_IP_ instead of (unsigned long)__builtin_return_address(0) (2014-10-08 09:57:07 +0100)


This time we have a couple of bug fixes, one relating to bad i_goal values
which are now ignored (i_goal is basically a hint so it is safe to do this)
and another relating to the saving of the dirent location during rename.

There is one performance improvement, which is an optimisation in rgblk_free
so that multiple block deallocations will now be more efficient,
and one clean-up patch to use _RET_IP_ rather than writing it out longhand.


Abhi Das (1):
  GFS2: fix bad inode i_goal values during block allocation

Bob Peterson (2):
  GFS2: Make rename not save dirent location
  GFS2: Use gfs2_rbm_incr in rgblk_free

Fabian Frederick (1):
  GFS2: use _RET_IP_ instead of (unsigned long)__builtin_return_address(0)

 fs/gfs2/dir.c   |  9 +++--
 fs/gfs2/dir.h   |  1 +
 fs/gfs2/glock.c |  4 ++--
 fs/gfs2/glops.c |  2 +-
 fs/gfs2/inode.c |  7 ---
 fs/gfs2/rgrp.c  | 30 +-
 fs/gfs2/rgrp.h  |  1 +
 fs/gfs2/trans.c |  2 +-
 8 files changed, 38 insertions(+), 18 deletions(-)





Re: [Cluster-devel] [GFS2 PATCH] GFS2: Change minimum reservation size to 64 blocks

2014-10-06 Thread Steven Whitehouse

Hi,

On 04/10/14 15:33, Bob Peterson wrote:

Hi,

I've been working on the problems of both slow performance and excessive
file fragmentation, especially with regard to the RHEL7 kernel. I've
created some patches (still forthcoming) to improve these things. For a
long time, I thought the fragmentation issue was related to GFS2 not
using its reservations efficiently. I wrote a patch that revises the
calculations in function gfs2_write_calc_reserv, but it didn't seem to
help. I also thought it might be releasing the reservations too early,
but that turned out not to be the case. I've done extensive performance
testing and determined that, even under the best circumstances (IOW by
minimizing both inter and intra-node resource group contention), file
fragmentation can be improved significantly by doubling the minimum
block reservation size from 32 to 64 blocks. To test this, I did
several two+ hour tests (with a real-life application) that use lots of
files in GFS2. After each test completed, I totaled all the file
extents (fragments) using filefrag.

Here are the results of five two-hour runs performed with new, revised
calculations, and today's 32-block reservations:

EXTENT COUNT FOR OUTPUT FILES =  444352
EXTENT COUNT FOR OUTPUT FILES =  468425
EXTENT COUNT FOR OUTPUT FILES =  472464
EXTENT COUNT FOR OUTPUT FILES =  482224
EXTENT COUNT FOR OUTPUT FILES =  481174

Here are the results of five runs performed with the old calculations
and 64-block reservations:

EXTENT COUNT FOR OUTPUT FILES =  365743
EXTENT COUNT FOR OUTPUT FILES =  398628
EXTENT COUNT FOR OUTPUT FILES =  404614
EXTENT COUNT FOR OUTPUT FILES =  401355
EXTENT COUNT FOR OUTPUT FILES =  384599

As you can see, by doubling the minimum reservation size, the files
have about 20 percent fewer extents.

This patch, therefore, doubles the minimum block reservations.

Incidentally, if we don't take measures to minimize resource group
contention (both inter and intra-node) the results are much worse.
Here is the same test, done on a stock RHEL7 kernel:

EXTENT COUNT FOR OUTPUT FILES =  826314
EXTENT COUNT FOR OUTPUT FILES =  791406
EXTENT COUNT FOR OUTPUT FILES =  735760

Patch description:

The minimum block reservation size was 32 blocks. This patch doubles
it to 64 blocks.
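
For reference, the macro arithmetic behind those numbers (GFS2 tracks each
block with two bits, so GFS2_NBBY is 4 blocks per bitmap byte):

	RGRP_RSRV_MINBLKS = RGRP_RSRV_MINBYTES * GFS2_NBBY
	  before:  8 bytes * 4 blocks/byte = 32 blocks
	  after:  16 bytes * 4 blocks/byte = 64 blocks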


Whether that makes sense to do or not depends upon the distribution of 
the file sizes in the workload, and also the way in which the files are 
written. I don't think there is any guarantee that a different workload 
would not produce a different result here. Just out of interest, what is 
the distribution of file sizes in this workload?


I'd much prefer to see an algorithm that is adaptive, rather than simply 
bumping up the default here. We do need to be able to cope with cases 
where the files are much smaller, and without adaptive sizing, we might 
land up creating small holes in the allocations, which would then cause 
problems for later allocations,


Steve.


Regards,

Bob Peterson
Red Hat File Systems

Signed-off-by: Bob Peterson rpete...@redhat.com
---
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 5d8f085..e2058a7 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -14,11 +14,9 @@
   #include <linux/uaccess.h>
   
   /* Since each block in the file system is represented by two bits in the
- * bitmap, one 64-bit word in the bitmap will represent 32 blocks.
- * By reserving 32 blocks at a time, we can optimize / shortcut how we search
- * through the bitmaps by looking a word at a time.
+ * bitmap, each 64-bit word in the bitmap represents 32 blocks.
   */
-#define RGRP_RSRV_MINBYTES 8
+#define RGRP_RSRV_MINBYTES 16 /* Reserve 64 blocks */
  #define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY))
  
  struct gfs2_rgrpd;
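
As a rough sketch of the adaptive sizing suggested above (illustrative only;
none of these names exist in GFS2, and it assumes the reservation code can see
the size of recent writes to the inode):

	static unsigned int rsrv_size_hint(unsigned int recent_write_blocks)
	{
		unsigned int want = 32;	/* current RGRP_RSRV_MINBLKS floor */

		/* Grow towards the size of recent writes, capped at 64 blocks,
		 * so small files do not leave holes in the bitmaps while
		 * streaming writers get the larger reservation. */
		while (want < recent_write_blocks && want < 64)
			want *= 2;
		return want;
	}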




Re: [Cluster-devel] [GFS2 PATCH] GFS2: Use gfs2_rbm_incr in rgblk_free

2014-10-03 Thread Steven Whitehouse

Hi,

Looks like a really nice optimisation. Now in the -nmw tree. Thanks,

Steve.

On 03/10/14 13:38, Bob Peterson wrote:

Hi,

This patch speeds up GFS2 unlink operations by using function
gfs2_rbm_incr rather than continuously calculating the rbm.

Regards,

Bob Peterson
Red Hat File Systems

Signed-off-by: Bob Peterson rpete...@redhat.com
---
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 55ef72d..7474c41 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -2097,7 +2097,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
				     u32 blen, unsigned char new_state)
 {
	struct gfs2_rbm rbm;
-	struct gfs2_bitmap *bi;
+	struct gfs2_bitmap *bi, *bi_prev = NULL;
 
	rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1);
	if (!rbm.rgd) {
@@ -2106,18 +2106,22 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
		return NULL;
	}
 
+	gfs2_rbm_from_block(&rbm, bstart);
	while (blen--) {
-		gfs2_rbm_from_block(&rbm, bstart);
		bi = rbm_bi(&rbm);
-		bstart++;
-		if (!bi->bi_clone) {
-			bi->bi_clone = kmalloc(bi->bi_bh->b_size,
-					       GFP_NOFS | __GFP_NOFAIL);
-			memcpy(bi->bi_clone + bi->bi_offset,
-			       bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
+		if (bi != bi_prev) {
+			if (!bi->bi_clone) {
+				bi->bi_clone = kmalloc(bi->bi_bh->b_size,
+						       GFP_NOFS | __GFP_NOFAIL);
+				memcpy(bi->bi_clone + bi->bi_offset,
+				       bi->bi_bh->b_data + bi->bi_offset,
+				       bi->bi_len);
+			}
+			gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
+			bi_prev = bi;
		}
-		gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
		gfs2_setbit(&rbm, false, new_state);
+		gfs2_rbm_incr(&rbm);
	}
 
	return rbm.rgd;




Re: [Cluster-devel] [PATCH 02/12] gfs2: Set allowed quota types

2014-10-02 Thread Steven Whitehouse

Hi,

On 01/10/14 20:31, Jan Kara wrote:

We support user and group quotas. Tell vfs about it.

CC: Steven Whitehouse swhit...@redhat.com
CC: cluster-devel@redhat.com
Signed-off-by: Jan Kara j...@suse.cz
---
  fs/gfs2/ops_fstype.c | 1 +
  1 file changed, 1 insertion(+)

Acked-by: Steven Whitehouse swhit...@redhat.com

Steve.


diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index d3eae244076e..3032e6d069b5 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1083,6 +1083,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
	sb->s_xattr = gfs2_xattr_handlers;
	sb->s_qcop = gfs2_quotactl_ops;
	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
+	sb_dqopt(sb)->allowed_types = (1 << USRQUOTA) | (1 << GRPQUOTA);
	sb->s_time_gran = 1;
	sb->s_maxbytes = MAX_LFS_FILESIZE;
  




Re: [Cluster-devel] [GFS2 PATCH] GFS2: Make rename not save dirent location

2014-10-01 Thread Steven Whitehouse

Hi,

On 29/09/14 13:52, Bob Peterson wrote:

Hi,

This patch fixes a regression in the patch GFS2: Remember directory
insert point, commit 2b47dad866d04f14c328f888ba5406057b8c7d33.
The problem had to do with the rename function: The function found
space for the new dirent, and remembered that location. But then the
old dirent was removed, which often moved the eligible location for
the renamed dirent. Putting the new dirent at the saved location
caused file system corruption.

This patch adds a new save_loc variable to struct gfs2_diradd.
If 1, the dirent location is saved. If 0, the dirent location is not
saved and the buffer_head is released as per previous behavior.

Regards,

Bob Peterson
Red Hat File Systems

Now in the -nmw git tree. Thanks,

Steve.


Signed-off-by: Bob Peterson rpete...@redhat.com
---
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 1a349f9..5d4261f 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -2100,8 +2100,13 @@ int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name,
	}
	if (IS_ERR(dent))
		return PTR_ERR(dent);
-	da->bh = bh;
-	da->dent = dent;
+
+	if (da->save_loc) {
+		da->bh = bh;
+		da->dent = dent;
+	} else {
+		brelse(bh);
+	}
	return 0;
  }
  
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h

index 126c65d..e1b309c 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -23,6 +23,7 @@ struct gfs2_diradd {
unsigned nr_blocks;
struct gfs2_dirent *dent;
struct buffer_head *bh;
+   int save_loc;
  };
  
  extern struct inode *gfs2_dir_search(struct inode *dir,

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 9516f5c..fcf42ea 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -600,7 +600,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
	int error, free_vfs_inode = 0;
	u32 aflags = 0;
	unsigned blocks = 1;
-	struct gfs2_diradd da = { .bh = NULL, };
+	struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };
 
	if (!name->len || name->len > GFS2_FNAMESIZE)
		return -ENAMETOOLONG;
@@ -900,7 +900,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_holder ghs[2];
	struct buffer_head *dibh;
-	struct gfs2_diradd da = { .bh = NULL, };
+	struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };
	int error;
 
	if (S_ISDIR(inode->i_mode))
@@ -1338,7 +1338,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
	struct gfs2_rgrpd *nrgd;
	unsigned int num_gh;
	int dir_rename = 0;
-	struct gfs2_diradd da = { .nr_blocks = 0, };
+	struct gfs2_diradd da = { .nr_blocks = 0, .save_loc = 0, };
	unsigned int x;
	int error;
  




Re: [Cluster-devel] [DLM PATCH] DLM: Don't wait for resource library lookups if NOLOOKUP is specified

2014-10-01 Thread Steven Whitehouse

Hi,

On 01/10/14 18:21, Bob Peterson wrote:

Hi,

This patch adds a new lock flag, DLM_LKF_NOLOOKUP, which instructs DLM
to refrain from sending lookup requests in cases where the lock library
node is not the current node. This is similar to the DLM_LKF_NOQUEUE
flag, except it fails locks that would require a lookup, with -EAGAIN.

This is not just about saving a network operation. It allows callers
like GFS2 to master locks for which they are the directory node. Each
node can then prefer local locks, especially in the case of GFS2
selecting resource groups for block allocations (implemented with a
separate patch). This mastering of local locks distributes the locks
between the nodes (at least until nodes enter or leave the cluster),
which tends to make each node keep to itself when doing allocations.
Thus, dlm communications are kept to a minimum, which results in
significantly faster block allocations.

I think we need to do some more investigation here... how long do the
lookups take? If the issue is just to create a list of preferred rgrps
for each node, then there are various ways in which we might do that.
That is not to say that this isn't a good way to do it, but I think we 
should try to understand the timings here first and make sure that we 
are solving the right problem,


Steve.


Regards,

Bob Peterson
Red Hat File Systems

Signed-off-by: Bob Peterson rpete...@redhat.com
---
  fs/dlm/lock.c | 16 ++--
  include/uapi/linux/dlmconstants.h |  7 +++
  2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 83f3d55..f1e5b04 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -222,6 +222,11 @@ static inline int can_be_queued(struct dlm_lkb *lkb)
	return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE);
 }
 
+static inline int can_be_looked_up(struct dlm_lkb *lkb)
+{
+	return !(lkb->lkb_exflags & DLM_LKF_NOLOOKUP);
+}
+
 static inline int force_blocking_asts(struct dlm_lkb *lkb)
 {
	return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
@@ -2745,6 +2750,11 @@ static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
		return 0;
	}
 
+	if (!can_be_looked_up(lkb)) {
+		queue_cast(r, lkb, -EAGAIN);
+		return -EAGAIN;
+	}
+
	wait_pending_remove(r);
 
	r->res_first_lkid = lkb->lkb_id;
@@ -2828,7 +2838,8 @@ static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
	if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
		goto out;
 
-	if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
+	if (flags & DLM_LKF_CONVDEADLK && (flags & (DLM_LKF_NOQUEUE |
+						    DLM_LKF_NOLOOKUP)))
		goto out;
 
	if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
@@ -2837,7 +2848,8 @@ static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
	if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
		goto out;
 
-	if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
+	if (flags & DLM_LKF_EXPEDITE && (flags & (DLM_LKF_NOQUEUE |
+						  DLM_LKF_NOLOOKUP)))
		goto out;
 
	if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)

diff --git a/include/uapi/linux/dlmconstants.h b/include/uapi/linux/dlmconstants.h
index 47bf08d..4b9ba15 100644
--- a/include/uapi/linux/dlmconstants.h
+++ b/include/uapi/linux/dlmconstants.h
@@ -131,6 +131,12 @@
   * Unlock the lock even if it is converting or waiting or has sublocks.
   * Only really for use by the userland device.c code.
   *
+ * DLM_LKF_NOLOOKUP
+ *
+ * Don't take any network time/bandwidth to do directory owner lookups.
+ * This is a lock for which we only care whether it's completely under
+ * local jurisdiction.
+ *
   */
  
  #define DLM_LKF_NOQUEUE		0x0001

@@ -152,6 +158,7 @@
  #define DLM_LKF_ALTCW 0x0001
  #define DLM_LKF_FORCEUNLOCK   0x0002
  #define DLM_LKF_TIMEOUT   0x0004
+#define DLM_LKF_NOLOOKUP   0x0008
  
  /*

   * Some return codes that are not in errno.h
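
As a caller-side sketch of how the new flag might be used (illustrative only,
not taken from GFS2; a NOQUEUE/NOLOOKUP failure is reported through the lock
status block in the completion AST):

	static void local_only_ast(void *arg)
	{
		struct dlm_lksb *lksb = arg;

		if (lksb->sb_status == -EAGAIN) {
			/* mastering this resource would have needed a remote
			   directory lookup - fall back to another rgrp */
		}
	}

	static int request_local_only(dlm_lockspace_t *ls, struct dlm_lksb *lksb,
				      void *name, unsigned int namelen)
	{
		return dlm_lock(ls, DLM_LOCK_EX, lksb,
				DLM_LKF_NOQUEUE | DLM_LKF_NOLOOKUP,
				name, namelen, 0, local_only_ast, lksb, NULL);
	}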





Re: [Cluster-devel] [GFS2 PATCH] gfs2: fix bad inode i_goal values during block allocation

2014-09-19 Thread Steven Whitehouse

Hi,

Now in the -nmw git tree. Thanks,

Steve.

On 19/09/14 03:40, Abhi Das wrote:

This patch checks if i_goal is either zero or if it doesn't exist
within any rgrp (i.e. gfs2_blk2rgrpd() returns NULL). If so, it
assigns the ip->i_no_addr block as the i_goal.

There are two scenarios where a bad i_goal can result in a
-EBADSLT error.

1. Attempting to allocate to an existing inode:
Control reaches gfs2_inplace_reserve() and ip->i_goal is bad.
We need to fix i_goal here.

2. A new inode is created in a directory whose i_goal is hosed:
In this case, the parent dir's i_goal is copied onto the new
inode. Since the new inode is not yet created, the ip->i_no_addr
field is invalid and so, the fix in gfs2_inplace_reserve() as per
1) won't work in this scenario. We need to catch and fix it sooner
in the parent dir itself (gfs2_create_inode()), before it is
copied to the new inode.

Signed-off-by: Abhi Das a...@redhat.com
---
  fs/gfs2/inode.c | 1 +
  fs/gfs2/rgrp.c  | 8 
  fs/gfs2/rgrp.h  | 1 +
  3 files changed, 10 insertions(+)

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index fc8ac2e..9516f5c 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -672,6 +672,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
	gfs2_set_inode_blocks(inode, 1);
	munge_mode_uid_gid(dip, inode);
+	check_and_update_goal(dip);
	ip->i_goal = dip->i_goal;
	ip->i_diskflags = 0;
	ip->i_eattr = 0;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index f4cb9c0..55ef72d 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -577,6 +577,13 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd)
	return rgd;
 }
 
+void check_and_update_goal(struct gfs2_inode *ip)
+{
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL)
+		ip->i_goal = ip->i_no_addr;
+}
+
 void gfs2_free_clones(struct gfs2_rgrpd *rgd)
 {
	int x;
@@ -1910,6 +1917,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
	} else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
		rs->rs_rbm.rgd = begin = ip->i_rgd;
	} else {
+		check_and_update_goal(ip);
		rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
	}
	if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV))
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 463ab2e..5d8f085 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -80,4 +80,5 @@ static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs)
	return rs && !RB_EMPTY_NODE(&rs->rs_node);
  }
  
+extern void check_and_update_goal(struct gfs2_inode *ip);

  #endif /* __RGRP_DOT_H__ */




[Cluster-devel] GFS2 -nmw git tree

2014-09-17 Thread Steven Whitehouse

Hi,

Linus pulled the fixes that I sent recently, so since there were no 
other patches in the -nmw tree, its now updated and empty, pending 
further patches being sent,


Steve.



[Cluster-devel] GFS2: Pull request (fixes)

2014-09-16 Thread Steven Whitehouse
Hi,

Please consider pulling the following fixes for GFS2,

Steve.


The following changes since commit 372b1dbdd1fb5697890b937228d93cfd9c734c90:

  Merge branch 'for-linus' of git://git.samba.org/sfrench/cifs-2.6 (2014-08-20 18:33:21 -0500)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/steve/gfs2-3.0-fixes.git tags/gfs2-fixes

for you to fetch changes up to cfb2f9d5c921e38b0f12bb26fed10b87766d:

  GFS2: fix d_splice_alias() misuses (2014-09-12 20:58:55 +0100)


Here are a number of small fixes for GFS2. There is a fix for FIEMAP
on large sparse files, a negative dentry hashing fix, a fix for
flock, and a bug fix relating to d_splice_alias usage. There are
also (patches 1 and 5) a couple of updates which are less
critical, but small and low risk.


Al Viro (1):
  GFS2: fix d_splice_alias() misuses

Benjamin Coddington (1):
  GFS2: Hash the negative dentry during inode lookup

Bob Peterson (2):
  GFS2: Change maxlen variables to size_t
  GFS2: Request demote when a try flock fails

Fabian Frederick (1):
  GFS2: fs/gfs2/super.c: replace seq_printf by seq_puts

Jan Kara (1):
  GFS2: Don't use MAXQUOTAS value

 fs/gfs2/bmap.c   |  9 +
 fs/gfs2/file.c   | 15 ---
 fs/gfs2/incore.h |  7 +--
 fs/gfs2/inode.c  |  9 ++---
 fs/gfs2/super.c  | 20 ++--
 5 files changed, 38 insertions(+), 22 deletions(-)





[Cluster-devel] GFS2: Pre-pull patch posting (fixes)

2014-09-15 Thread Steven Whitehouse
Hi,

Here are a number of small fixes for GFS2. There is a fix for FIEMAP
on large sparse files, a negative dentry hashing fix, a fix for
flock, and a bug fix relating to d_splice_alias usage. There are
also (patches 1 and 5) a couple of updates which are less
critical, but small and low risk.

Steve.



[Cluster-devel] [PATCH 1/6] GFS2: fs/gfs2/super.c: replace seq_printf by seq_puts

2014-09-15 Thread Steven Whitehouse
From: Fabian Frederick f...@skynet.be

fix checkpatch warnings:
WARNING: Prefer seq_puts to seq_printf

Cc: cluster-devel@redhat.com
Signed-off-by: Fabian Frederick f...@skynet.be
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2607ff1..a346f56 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1294,7 +1294,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
	int val;
 
	if (is_ancestor(root, sdp->sd_master_dir))
-		seq_printf(s, ",meta");
+		seq_puts(s, ",meta");
	if (args->ar_lockproto[0])
		seq_printf(s, ",lockproto=%s", args->ar_lockproto);
	if (args->ar_locktable[0])
@@ -1302,13 +1302,13 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
	if (args->ar_hostdata[0])
		seq_printf(s, ",hostdata=%s", args->ar_hostdata);
	if (args->ar_spectator)
-		seq_printf(s, ",spectator");
+		seq_puts(s, ",spectator");
	if (args->ar_localflocks)
-		seq_printf(s, ",localflocks");
+		seq_puts(s, ",localflocks");
	if (args->ar_debug)
-		seq_printf(s, ",debug");
+		seq_puts(s, ",debug");
	if (args->ar_posix_acl)
-		seq_printf(s, ",acl");
+		seq_puts(s, ",acl");
	if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
		char *state;
		switch (args->ar_quota) {
@@ -1328,7 +1328,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
		seq_printf(s, ",quota=%s", state);
	}
	if (args->ar_suiddir)
-		seq_printf(s, ",suiddir");
+		seq_puts(s, ",suiddir");
	if (args->ar_data != GFS2_DATA_DEFAULT) {
		char *state;
		switch (args->ar_data) {
@@ -1345,7 +1345,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
		seq_printf(s, ",data=%s", state);
	}
	if (args->ar_discard)
-		seq_printf(s, ",discard");
+		seq_puts(s, ",discard");
	val = sdp->sd_tune.gt_logd_secs;
	if (val != 30)
		seq_printf(s, ",commit=%d", val);
@@ -1376,11 +1376,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
		seq_printf(s, ",errors=%s", state);
	}
	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
-		seq_printf(s, ",nobarrier");
+		seq_puts(s, ",nobarrier");
	if (test_bit(SDF_DEMOTE, &sdp->sd_flags))
-		seq_printf(s, ",demote_interface_used");
+		seq_puts(s, ",demote_interface_used");
	if (args->ar_rgrplvb)
-		seq_printf(s, ",rgrplvb");
+		seq_puts(s, ",rgrplvb");
	return 0;
 }
 
-- 
1.8.3.1



[Cluster-devel] [PATCH 4/6] GFS2: Hash the negative dentry during inode lookup

2014-09-15 Thread Steven Whitehouse
From: Benjamin Coddington bcodd...@redhat.com

Fix a regression introduced by:
6d4ade986f9c8df31e68 GFS2: Add atomic_open support
where an early return misses d_splice_alias() which had been
adding the negative dentry.

Signed-off-by: Benjamin Coddington bcodd...@redhat.com
Signed-off-by: Bob Peterson rpete...@redhat.com
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e62e594..9317ddc 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -840,8 +840,10 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
	int error;
 
	inode = gfs2_lookupi(dir, &dentry->d_name, 0);
-	if (!inode)
+	if (inode == NULL) {
+		d_add(dentry, NULL);
		return NULL;
+	}
	if (IS_ERR(inode))
		return ERR_CAST(inode);
 
-- 
1.8.3.1



[Cluster-devel] [PATCH 5/6] GFS2: Don't use MAXQUOTAS value

2014-09-15 Thread Steven Whitehouse
From: Jan Kara j...@suse.cz

MAXQUOTAS value defines maximum number of quota types VFS supports.
This isn't necessarily the number of types gfs2 supports and with
addition of project quotas these two numbers stop matching. So make gfs2
use its private definition.

CC: cluster-devel@redhat.com
Signed-off-by: Jan Kara j...@suse.cz
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 67d310c..39e7e99 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -262,6 +262,9 @@ struct gfs2_holder {
unsigned long gh_ip;
 };
 
+/* Number of quota types we support */
+#define GFS2_MAXQUOTAS 2
+
 /* Resource group multi-block reservation, in order of appearance:
 
Step 1. Function prepares to write, allocates a mb, sets the size hint.
@@ -282,8 +285,8 @@ struct gfs2_blkreserv {
u64 rs_inum;  /* Inode number for reservation */
 
/* ancillary quota stuff */
-   struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
-   struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
+   struct gfs2_quota_data *rs_qa_qd[2 * GFS2_MAXQUOTAS];
+   struct gfs2_holder rs_qa_qd_ghs[2 * GFS2_MAXQUOTAS];
unsigned int rs_qa_qd_num;
 };
 
-- 
1.8.3.1



[Cluster-devel] [PATCH 6/6] GFS2: fix d_splice_alias() misuses

2014-09-15 Thread Steven Whitehouse
From: Al Viro v...@zeniv.linux.org.uk

Callers of d_splice_alias(inode, dentry) don't need iput(), neither
on success nor on failure.  Either the reference to inode is stored
in a previously negative dentry, or it's dropped.  In either case
inode reference the caller used to hold is consumed.

__gfs2_lookup() does iput() in case when d_splice_alias() has failed.
Double iput() if we ever hit that.  And gfs2_create_inode() ends up
not only with double iput(), but with link count dropped to zero - on
an inode it has just found in directory.
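
In other words, the correct calling pattern is simply (an illustration of the
rule, not code from this patch):

	d = d_splice_alias(inode, dentry);
	if (IS_ERR(d))
		return d;	/* the inode reference was already consumed;
				   no iput(inode) here */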

Cc: sta...@vger.kernel.org # v3.14+
Signed-off-by: Al Viro v...@zeniv.linux.org.uk
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 9317ddc..fc8ac2e 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -626,8 +626,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
	if (!IS_ERR(inode)) {
		d = d_splice_alias(inode, dentry);
		error = PTR_ERR(d);
-		if (IS_ERR(d))
+		if (IS_ERR(d)) {
+			inode = ERR_CAST(d);
			goto fail_gunlock;
+		}
		error = 0;
		if (file) {
			if (S_ISREG(inode->i_mode)) {
@@ -856,7 +858,6 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
 
	d = d_splice_alias(inode, dentry);
	if (IS_ERR(d)) {
-		iput(inode);
		gfs2_glock_dq_uninit(&gh);
		return d;
	}
-- 
1.8.3.1



[Cluster-devel] [PATCH 2/6] GFS2: Change maxlen variables to size_t

2014-09-15 Thread Steven Whitehouse
From: Bob Peterson rpete...@redhat.com

This patch changes some variables (especially maxlen in function
gfs2_block_map) from unsigned int to size_t. We need 64-bit arithmetic
for very large files (e.g. 1PB) where the variables otherwise get
shifted to all 0's.

Signed-off-by: Bob Peterson rpete...@redhat.com
Signed-off-by: Steven Whitehouse swhit...@redhat.com

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index e6ee5b6..f0b945a 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -359,7 +359,7 @@ static inline void release_metapath(struct metapath *mp)
  * Returns: The length of the extent (minimum of one block)
  */
 
-static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob)
+static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob)
 {
	const __be64 *end = (start + len);
	const __be64 *first = ptr;
@@ -449,7 +449,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
			   struct buffer_head *bh_map, struct metapath *mp,
			   const unsigned int sheight,
			   const unsigned int height,
-			   const unsigned int maxlen)
+			   const size_t maxlen)
 {
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -483,7 +483,8 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
	} else {
		/* Need to allocate indirect blocks */
		ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs;
-		dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]);
+		dblks = min(maxlen, (size_t)(ptrs_per_blk -
+			    mp->mp_list[end_of_metadata]));
		if (height == ip->i_height) {
			/* Writing into existing tree, extend tree down */
			iblks = height - sheight;
@@ -605,7 +606,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
	struct gfs2_inode *ip = GFS2_I(inode);
	struct gfs2_sbd *sdp = GFS2_SB(inode);
	unsigned int bsize = sdp->sd_sb.sb_bsize;
-	const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
+	const size_t maxlen = bh_map->b_size >> inode->i_blkbits;
	const u64 *arr = sdp->sd_heightsize;
	__be64 *ptr;
	u64 size;
-- 
1.8.3.1



Re: [Cluster-devel] [GFS2 PATCH] GFS2: check and correct zero i_goal

2014-09-11 Thread Steven Whitehouse

Hi,

On 11/09/14 12:01, Abhi Das wrote:

A GFS1-GFS2 converted filesystem can have the ip->i_goal field
set to zero for inodes. This incorrect value results in -EBADSLT
when the user attempts to allocate blocks to such inodes. This
patch assigns the goal block to be the block address of the inode
itself, which serves as a reasonable starting point for the
allocation logic to find the next available block.

Resolves: rhbz#1130684
Signed-off-by: Abhi Das a...@redhat.com
---
  fs/gfs2/glops.c | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 2ffc67d..799427b 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -349,6 +349,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
	ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
 
	ip->i_goal = be64_to_cpu(str->di_goal_meta);
+	if (!ip->i_goal) /* From a previous gfs2_convert, perhaps */
+		ip->i_goal = ip->i_no_addr;
+
	ip->i_generation = be64_to_cpu(str->di_generation);
 
	ip->i_diskflags = be32_to_cpu(str->di_flags);


I don't think that is the right place to make the change, since if the 
fs is read only then the in-kernel copy will be different to the on-disk 
copy of the inode. Better to catch the problem at block allocation time 
and use i_no_addr if i_goal is invalid then (and that doesn't just mean 
0 - it might be pointing at another impossible block)


Steve.



Re: [Cluster-devel] [PATCH] GFS2: Hash the negative dentry during inode lookup

2014-09-11 Thread Steven Whitehouse

Hi,

On 10/09/14 19:09, Benjamin Coddington wrote:

Fix a regression introduced by:
6d4ade986f9c8df31e68 GFS2: Add atomic_open support
where an early return misses d_splice_alias() which had been
adding the negative dentry.

Signed-off-by: Benjamin Coddington bcodd...@redhat.com
---
  fs/gfs2/inode.c |4 +++-
  1 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e62e594..9317ddc 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -840,8 +840,10 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
	int error;
 
	inode = gfs2_lookupi(dir, &dentry->d_name, 0);
-	if (!inode)
+	if (inode == NULL) {
+		d_add(dentry, NULL);
		return NULL;
+	}
	if (IS_ERR(inode))
		return ERR_CAST(inode);
  


Now in the GFS2 -nmw tree, and I've added Bob's signed-off-by too. Thanks,

Steve.



Re: [Cluster-devel] [PATCH 06/19] libgfs2: Add functions for finding free extents

2014-09-03 Thread Steven Whitehouse

Hi,

You shouldn't need this for allocation in mkfs since you already know 
where the free extents are, so no need to be reading the bitmaps to find 
that out,


Steve.

On 02/09/14 13:07, Andrew Price wrote:

Port gfs2_rbm_find and some functions which it depends on from the gfs2
kernel code. This will set the base for allocation of single-extent
files. The functions have been simplified where possible as libgfs2
doesn't have a concept of reservations for the time being.

Signed-off-by: Andrew Price anpr...@redhat.com
---
  gfs2/libgfs2/rgrp.c | 197 
  gfs2/libgfs2/rgrp.h |   2 +
  2 files changed, 199 insertions(+)

diff --git a/gfs2/libgfs2/rgrp.c b/gfs2/libgfs2/rgrp.c
index 0f36b86..7063288 100644
--- a/gfs2/libgfs2/rgrp.c
+++ b/gfs2/libgfs2/rgrp.c
@@ -638,3 +638,200 @@ static int lgfs2_rbm_incr(struct lgfs2_rbm *rbm)
	rbm->bii++;
return 0;
  }
+
+/**
+ * lgfs2_testbit - test a bit in the bitmaps
+ * @rbm: The bit to test
+ *
+ * Returns: The two bit block state of the requested bit
+ */
+static inline uint8_t lgfs2_testbit(const struct lgfs2_rbm *rbm)
+{
+	struct gfs2_bitmap *bi = rbm_bi(rbm);
+	const uint8_t *buffer = (uint8_t *)bi->bi_bh->b_data + bi->bi_offset;
+	const uint8_t *byte;
+	unsigned int bit;
+
+	byte = buffer + (rbm->offset / GFS2_NBBY);
+	bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
+
+	return (*byte >> bit) & GFS2_BIT_MASK;
+}
+
+/**
+ * lgfs2_unaligned_extlen - Look for free blocks which are not byte aligned
+ * @rbm: Position to search (value/result)
+ * @n_unaligned: Number of unaligned blocks to check
+ * @len: Decremented for each block found (terminate on zero)
+ *
+ * Returns: true if a non-free block is encountered
+ */
+static int lgfs2_unaligned_extlen(struct lgfs2_rbm *rbm, uint32_t n_unaligned, uint32_t *len)
+{
+	uint32_t n;
+	uint8_t res;
+
+	for (n = 0; n < n_unaligned; n++) {
+		res = lgfs2_testbit(rbm);
+		if (res != GFS2_BLKST_FREE)
+			return 1;
+		(*len)--;
+		if (*len == 0)
+			return 1;
+		if (lgfs2_rbm_incr(rbm))
+			return 1;
+	}
+
+	return 0;
+}
+
+static uint8_t *check_bytes8(const uint8_t *start, uint8_t value, unsigned bytes)
+{
+	while (bytes) {
+		if (*start != value)
+			return (void *)start;
+		start++;
+		bytes--;
+	}
+	return NULL;
+}
+
+/**
+ * lgfs2_free_extlen - Return extent length of free blocks
+ * @rbm: Starting position
+ * @len: Max length to check
+ *
+ * Starting at the block specified by the rbm, see how many free blocks
+ * there are, not reading more than len blocks ahead. This can be done
+ * using check_bytes8 when the blocks are byte aligned, but has to be done
+ * on a block by block basis in case of unaligned blocks. Also this
+ * function can cope with bitmap boundaries (although it must stop on
+ * a resource group boundary)
+ *
+ * Returns: Number of free blocks in the extent
+ */
+static uint32_t lgfs2_free_extlen(const struct lgfs2_rbm *rrbm, uint32_t len)
+{
+	struct lgfs2_rbm rbm = *rrbm;
+	uint32_t n_unaligned = rbm.offset & 3;
+	uint32_t size = len;
+	uint32_t bytes;
+	uint32_t chunk_size;
+	uint8_t *ptr, *start, *end;
+	uint64_t block;
+	struct gfs2_bitmap *bi;
+
+	if (n_unaligned &&
+	    lgfs2_unaligned_extlen(&rbm, 4 - n_unaligned, &len))
+		goto out;
+
+	n_unaligned = len & 3;
+	/* Start is now byte aligned */
+	while (len > 3) {
+		bi = rbm_bi(&rbm);
+		start = (uint8_t *)bi->bi_bh->b_data;
+		end = start + bi->bi_bh->sdp->bsize;
+		start += bi->bi_offset;
+		start += (rbm.offset / GFS2_NBBY);
+		bytes = (len / GFS2_NBBY) < (end - start) ? (len / GFS2_NBBY) : (end - start);
+		ptr = check_bytes8(start, 0, bytes);
+		chunk_size = ((ptr == NULL) ? bytes : (ptr - start));
+		chunk_size *= GFS2_NBBY;
+		len -= chunk_size;
+		block = lgfs2_rbm_to_block(&rbm);
+		if (lgfs2_rbm_from_block(&rbm, block + chunk_size)) {
+			n_unaligned = 0;
+			break;
+		}
+		if (ptr) {
+			n_unaligned = 3;
+			break;
+		}
+		n_unaligned = len & 3;
+	}
+
+	/* Deal with any bits left over at the end */
+	if (n_unaligned)
+		lgfs2_unaligned_extlen(&rbm, n_unaligned, &len);
+out:
+	return size - len;
+}
+
+/**
+ * gfs2_rbm_find - Look for blocks of a particular state
+ * @rbm: Value/result starting position and final position
+ * @state: The state which we want to find
+ * @minext: Pointer to the requested extent length (NULL 
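
(For readers unfamiliar with the bitmap layout used by lgfs2_testbit() above:
each block takes two bits, so GFS2_NBBY is 4 blocks per byte, GFS2_BIT_SIZE is
2 and GFS2_BIT_MASK is 0x3. A worked example, editorial illustration only, for
rbm->offset == 5:

	byte index = 5 / GFS2_NBBY                   = 1
	bit shift  = (5 % GFS2_NBBY) * GFS2_BIT_SIZE = 2
	state      = (buffer[1] >> 2) & GFS2_BIT_MASK

giving one of GFS2_BLKST_FREE, _USED, _UNLINKED or _DINODE, i.e. 0..3.)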

Re: [Cluster-devel] [PATCH 00/19] gfs2-utils: Introduce extent allocation and speed up journal creation

2014-09-03 Thread Steven Whitehouse

Hi,

Aside from the question on patch 6, the other patches all look good,

Steve.

On 02/09/14 13:07, Andrew Price wrote:

Hi,

This patch set introduces extent allocation to libgfs2 and adds functions which
decouple file creation, allocation and writing so that mkfs.gfs2 can be
re-worked to write journals and resource groups sequentially.

With these patches, mkfs.gfs2 typically takes around 20% of the time that it
did before in my tests.  The main speed-up has been from the journal data
allocation functions not having to re-read and write a resource group for each
block allocated as it did before (this was a performance regression introduced
by previous memory footprint improvement patches, hence the significant perf
improvement).  Journals now each occupy an extent spanning an entire resource
group specifically sized for the journal, and the resource group headers are
written only once, after the journal blocks have been allocated in the
in-memory bitmaps. Resource groups are still only kept in memory for as long as
they are needed so peak memory usage should be largely unchanged.

One thing to note is that, with these patches, the root and master inodes are
no longer the first objects in the first resource group. The master inode is
written in the first free block after the journals and then the other metafs
structures are placed. The root directory inode is then finally created. This
is not a format change but it may cause some confusion after years of expecting
the root and master inodes to be at certain addresses so I thought it worth
mentioning.

Coverity and valgrind are happy about these patches and I've encountered no
problems after various tests which mount the fs.

Cheers,
Andy

Andrew Price (19):
   libgfs2: Keep a pointer to the sbd in lgfs2_rgrps_t
   libgfs2: Move bitmap buffers inside struct gfs2_bitmap
   libgfs2: Fix an impossible loop condition in gfs2_rgrp_read
   libgfs2: Introduce struct lgfs2_rbm
   libgfs2: Move struct _lgfs2_rgrps into rgrp.h
   libgfs2: Add functions for finding free extents
   tests: Add unit tests for the new extent search functions
   libgfs2: Ignore an empty rgrp plan if a length is specified
   libgfs2: Add back-pointer to rgrps in lgfs2_rgrp_t
   libgfs2: Const-ify the parameters of print functions
   libgfs2: Allow init_dinode to accept a preallocated bh
   libgfs2: Add extent allocation functions
   libgfs2: Add support for allocating entire rgrp headers
   libgfs2: Write file metadata sequentially
   libgfs2: Fix alignment in lgfs2_rgsize_for_data
   libgfs2: Handle non-zero bitmaps in lgfs2_rgrp_write
   libgfs2: Add a speedier journal data block writing function
   libgfs2: Create jindex directory separately from journals
   mkfs.gfs2: Improve journal creation performance

  .gitignore  |   3 +-
  gfs2/convert/gfs2_convert.c |  49 +++--
  gfs2/edit/journal.c |   6 +-
  gfs2/fsck/fs_recovery.c |   2 +-
  gfs2/fsck/initialize.c  |  27 +--
  gfs2/fsck/metawalk.c|  10 +-
  gfs2/fsck/pass5.c   |   9 +-
  gfs2/fsck/rgrepair.c|  14 +-
  gfs2/fsck/util.c|   2 +-
  gfs2/libgfs2/Makefile.am|   2 +-
  gfs2/libgfs2/fs_bits.c  |  10 +-
  gfs2/libgfs2/fs_geometry.c  |   6 +-
  gfs2/libgfs2/fs_ops.c   | 184 ++---
  gfs2/libgfs2/libgfs2.h  |  50 +++--
  gfs2/libgfs2/ondisk.c   |  26 +--
  gfs2/libgfs2/rgrp.c | 491 
  gfs2/libgfs2/rgrp.h |  50 +
  gfs2/libgfs2/structures.c   | 103 +-
  gfs2/mkfs/main_grow.c   |   4 +-
  gfs2/mkfs/main_mkfs.c   | 155 ++
  tests/Makefile.am   |  33 ++-
  tests/check_rgrp.c  | 143 +
  tests/libgfs2.at|   8 +-
  23 files changed, 1113 insertions(+), 274 deletions(-)
  create mode 100644 gfs2/libgfs2/rgrp.h
  create mode 100644 tests/check_rgrp.c





Re: [Cluster-devel] [PATCH 06/19] libgfs2: Add functions for finding free extents

2014-09-03 Thread Steven Whitehouse

Hi,

On 03/09/14 13:13, Andrew Price wrote:

On 03/09/14 11:17, Steven Whitehouse wrote:

Hi,

You shouldn't need this for allocation in mkfs since you already know
where the free extents are, so no need to be reading the bitmaps to find
that out,

Steve.


Well, that's true, but I'd like to use generic file allocation 
functions where possible and I wanted to make sure the new functions 
were able to allocate extents. I'd like to keep these changes in 
libgfs2 anyway because they'll be useful for future work in other 
tools, so how about an additional patch like the one below?


Andy

That may be ok, depending on the context. I was expecting to see a two 
stage process of calculating how many blocks are required, and then 
figuring out how to divide the blocks between the rgrps (i.e. fixing the 
location) as a second stage,


Steve.


diff --git a/gfs2/libgfs2/fs_ops.c b/gfs2/libgfs2/fs_ops.c
index 9c9cc82..b84b7f4 100644
--- a/gfs2/libgfs2/fs_ops.c
+++ b/gfs2/libgfs2/fs_ops.c
@@ -296,7 +296,9 @@ uint64_t lgfs2_space_for_data(const struct gfs2_sbd *sdp, const unsigned bsize,

  * Allocate an extent for a file in a resource group's bitmaps.
  * rg: The resource group in which to allocate the extent
  * di_size: The size of the file in bytes
- * ip: A pointer to the inode structure, whose fields will be set appropriately
+ * ip: A pointer to the inode structure, whose fields will be set appropriately.
+ * If ip->i_di.di_num.no_addr is not 0, the extent search will be skipped and
+ * the file allocated from that address.
  * flags: GFS2_DIF_* flags
  * mode: File mode flags, see creat(2)
  * Returns 0 on success with the contents of ip set accordingly, or 
non-zero
@@ -310,11 +312,13 @@ int lgfs2_file_alloc(lgfs2_rgrp_t rg, uint64_t di_size, struct gfs2_inode *ip, u
	struct gfs2_sbd *sdp = rg->rgrps->sdp;
	struct lgfs2_rbm rbm = { .rgd = rg, .offset = 0, .bii = 0 };
	uint32_t blocks = lgfs2_space_for_data(sdp, sdp->bsize, di_size);
-	int err;

-	err = lgfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &blocks);
-	if (err != 0)
-		return err;
+	if (ip->i_di.di_num.no_addr != 0) {
+		if (lgfs2_rbm_from_block(&rbm, ip->i_di.di_num.no_addr) != 0)
+			return 1;
+	} else if (lgfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &blocks) != 0) {
+		return 1;
+	}

	extlen = lgfs2_alloc_extent(&rbm, GFS2_BLKST_DINODE, blocks);
-	if (extlen < blocks) {
diff --git a/gfs2/libgfs2/rgrp.c b/gfs2/libgfs2/rgrp.c
index f57ae3a..0d0f000 100644
--- a/gfs2/libgfs2/rgrp.c
+++ b/gfs2/libgfs2/rgrp.c
@@ -643,7 +643,7 @@ lgfs2_rgrp_t lgfs2_rgrp_last(lgfs2_rgrps_t rgs)
  *
  * Returns: 0 on success, or non-zero with errno set
  */
-static int lgfs2_rbm_from_block(struct lgfs2_rbm *rbm, uint64_t block)
+int lgfs2_rbm_from_block(struct lgfs2_rbm *rbm, uint64_t block)
 {
	uint64_t rblock = block - rbm->rgd->ri.ri_data0;
	struct gfs2_sbd *sdp = rbm_bi(rbm)->bi_bh->sdp;
diff --git a/gfs2/libgfs2/rgrp.h b/gfs2/libgfs2/rgrp.h
index bd89289..fd442b1 100644
--- a/gfs2/libgfs2/rgrp.h
+++ b/gfs2/libgfs2/rgrp.h
@@ -44,6 +44,7 @@ static inline int lgfs2_rbm_eq(const struct lgfs2_rbm *rbm1, const struct lgfs2_
		(rbm1->offset == rbm2->offset);
 }
 
+extern int lgfs2_rbm_from_block(struct lgfs2_rbm *rbm, uint64_t block);
 extern int lgfs2_rbm_find(struct lgfs2_rbm *rbm, uint8_t state, uint32_t *minext);
 extern unsigned lgfs2_alloc_extent(const struct lgfs2_rbm *rbm, int state, const unsigned elen);


diff --git a/gfs2/mkfs/main_mkfs.c b/gfs2/mkfs/main_mkfs.c
index 530383d..e927d82 100644
--- a/gfs2/mkfs/main_mkfs.c
+++ b/gfs2/mkfs/main_mkfs.c
@@ -694,6 +694,8 @@ static int place_journals(struct gfs2_sbd *sdp, lgfs2_rgrps_t rgs, struct mkfs_o
			perror(_("Failed to allocate space for bitmap buffer"));
			return result;
		}
+		/* Allocate at the beginning of the rgrp, bypassing extent search */
+		in.i_di.di_num.no_addr = lgfs2_rgrp_index(rg)->ri_data0;
		/* In order to keep writes sequential here, we have to allocate
		   the journal, then write the rgrp header (which is now in its
		   final form) and then write the journal out */




