Re: [PATCH REPOST] rbd: be picky about osd request status type

2013-01-04 Thread Dan Mick
I personally dislike "spaces after cast", but I haven't checked the 
kernel style guide.  Otherwise:


Reviewed-by: Dan Mick 

On 01/03/2013 02:40 PM, Alex Elder wrote:

The result field in a ceph osd reply header is a signed 32-bit type,
but rbd code often casually uses int to represent it.

The following changes the types of variables that handle this result
value to be "s32" instead of "int" to be completely explicit about
it.  Only at the point we pass that result to __blk_end_request()
does the type get converted to the plain old int defined for that
interface.

There is almost certainly no binary impact of this change, but I
prefer to show the exact size and signedness of the value since we
know it.

Signed-off-by: Alex Elder 
---
  drivers/block/rbd.c |   23 ---
  1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 85131de..8b79a5b 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -171,7 +171,7 @@ struct rbd_client {
   */
  struct rbd_req_status {
int done;
-   int rc;
+   s32 rc;
u64 bytes;
  };

@@ -1053,13 +1053,13 @@ static void rbd_destroy_ops(struct
ceph_osd_req_op *ops)
  static void rbd_coll_end_req_index(struct request *rq,
   struct rbd_req_coll *coll,
   int index,
-  int ret, u64 len)
+  s32 ret, u64 len)
  {
struct request_queue *q;
int min, max, i;

dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
-coll, index, ret, (unsigned long long) len);
+coll, index, (int) ret, (unsigned long long) len);

if (!rq)
return;
@@ -1080,7 +1080,7 @@ static void rbd_coll_end_req_index(struct request *rq,
max++;

for (i = min; i<max; i++) {
-   __blk_end_request(rq, coll->status[i].rc,
+   __blk_end_request(rq, (int) coll->status[i].rc,
  coll->status[i].bytes);
coll->num_done++;
kref_put(&coll->kref, rbd_coll_release);
@@ -1089,7 +1089,7 @@ static void rbd_coll_end_req_index(struct request *rq,
  }

  static void rbd_coll_end_req(struct rbd_request *rbd_req,
-int ret, u64 len)
+s32 ret, u64 len)
  {
rbd_coll_end_req_index(rbd_req->rq,
rbd_req->coll, rbd_req->coll_index,
@@ -1129,7 +1129,7 @@ static int rbd_do_request(struct request *rq,
if (!rbd_req) {
if (coll)
rbd_coll_end_req_index(rq, coll, coll_index,
-  -ENOMEM, len);
+  (s32) -ENOMEM, len);
return -ENOMEM;
}

@@ -1206,7 +1206,7 @@ done_err:
bio_chain_put(rbd_req->bio);
ceph_osdc_put_request(osd_req);
  done_pages:
-   rbd_coll_end_req(rbd_req, ret, len);
+   rbd_coll_end_req(rbd_req, (s32) ret, len);
kfree(rbd_req);
return ret;
  }
@@ -1219,7 +1219,7 @@ static void rbd_req_cb(struct ceph_osd_request
*osd_req, struct ceph_msg *msg)
struct rbd_request *rbd_req = osd_req->r_priv;
struct ceph_osd_reply_head *replyhead;
struct ceph_osd_op *op;
-   __s32 rc;
+   s32 rc;
u64 bytes;
int read_op;

@@ -1227,14 +1227,14 @@ static void rbd_req_cb(struct ceph_osd_request
*osd_req, struct ceph_msg *msg)
replyhead = msg->front.iov_base;
WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
op = (void *)(replyhead + 1);
-   rc = le32_to_cpu(replyhead->result);
+   rc = (s32) le32_to_cpu(replyhead->result);
bytes = le64_to_cpu(op->extent.length);
read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
(unsigned long long) bytes, read_op, (int) rc);

-   if (rc == -ENOENT && read_op) {
+   if (rc == (s32) -ENOENT && read_op) {
zero_bio_chain(rbd_req->bio, 0);
rc = 0;
} else if (rc == 0 && read_op && bytes < rbd_req->len) {
@@ -1679,7 +1679,8 @@ static void rbd_rq_fn(struct request_queue *q)
bio_chain, coll, cur_seg);
else
rbd_coll_end_req_index(rq, coll, cur_seg,
-  -ENOMEM, chain_size);
+  (s32) -ENOMEM,
+  chain_size);
size -= chain_size;
ofs += chain_size;




Re: [PATCH REPOST] ceph: define ceph_encode_8_safe()

2013-01-04 Thread Dan Mick

Reviewed-by: Dan Mick 

On 01/03/2013 11:07 AM, Alex Elder wrote:

It's kind of a silly macro, but ceph_encode_8_safe() is the only one
missing from an otherwise pretty complete set.  It's not used, but
neither are a couple of the others in this set.

While in there, insert some whitespace to tidy up the alignment of
the line-terminating backslashes in some of the macro definitions.

Signed-off-by: Alex Elder 
---
  include/linux/ceph/decode.h |   29 +
  1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index 4bbf2db..cd679f2 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -52,10 +52,10 @@ static inline int ceph_has_room(void **p, void *end,
size_t n)
return end >= *p && n <= end - *p;
  }

-#define ceph_decode_need(p, end, n, bad)   \
-   do {\
-   if (!likely(ceph_has_room(p, end, n)))  \
-   goto bad;   \
+#define ceph_decode_need(p, end, n, bad)   \
+   do {\
+   if (!likely(ceph_has_room(p, end, n)))  \
+   goto bad;   \
} while (0)

  #define ceph_decode_64_safe(p, end, v, bad)   \
@@ -99,8 +99,8 @@ static inline int ceph_has_room(void **p, void *end,
size_t n)
   *
   * There are two possible failures:
   *   - converting the string would require accessing memory at or
- * beyond the "end" pointer provided (-E
- *   - memory could not be allocated for the result
+ * beyond the "end" pointer provided (-ERANGE)
+ *   - memory could not be allocated for the result (-ENOMEM)
   */
  static inline char *ceph_extract_encoded_string(void **p, void *end,
size_t *lenp, gfp_t gfp)
@@ -217,10 +217,10 @@ static inline void ceph_encode_string(void **p,
void *end,
*p += len;
  }

-#define ceph_encode_need(p, end, n, bad)   \
-   do {\
-   if (!likely(ceph_has_room(p, end, n)))  \
-   goto bad;   \
+#define ceph_encode_need(p, end, n, bad)   \
+   do {\
+   if (!likely(ceph_has_room(p, end, n)))  \
+   goto bad;   \
} while (0)

  #define ceph_encode_64_safe(p, end, v, bad)   \
@@ -231,12 +231,17 @@ static inline void ceph_encode_string(void **p,
void *end,
  #define ceph_encode_32_safe(p, end, v, bad)   \
do {\
ceph_encode_need(p, end, sizeof(u32), bad); \
-   ceph_encode_32(p, v);   \
+   ceph_encode_32(p, v);   \
} while (0)
  #define ceph_encode_16_safe(p, end, v, bad)   \
do {\
ceph_encode_need(p, end, sizeof(u16), bad); \
-   ceph_encode_16(p, v);   \
+   ceph_encode_16(p, v);   \
+   } while (0)
+#define ceph_encode_8_safe(p, end, v, bad) \
+   do {\
+   ceph_encode_need(p, end, sizeof(u8), bad);  \
+   ceph_encode_8(p, v);\
} while (0)

  #define ceph_encode_copy_safe(p, end, pv, n, bad) \




librados/librbd compatibility issue with v0.56

2013-01-04 Thread Sage Weil
We identified a problem with the version of librbd/librados in v0.56.  
There will be a v0.56.1 update in a few days that fixes it.  In the 
meantime, be aware that v0.56 ceph-osds may not interact properly with 
non-v0.56 radosgw or librbd clients, and v0.56 radosgw and librbd clients 
will not interact properly with non-v0.56 OSDs.

If you are using v0.56, we recommend moving to the current testing branch, 
which includes fixes queued up for v0.56.1.  You can get debs from e.g.

  http://gitbuilder.ceph.com/ceph-deb-precise-x86_64-basic/ref/testing/

or for many other distros; see 

  http://ceph.com/docs/master/install/debian/#add-development-testing-packages

The fix is commit d3abd0fe0bb402ff403259d4b1a718a56331fc39 
(921e06decebccc913c0e4f61916d00e62e7e1635 in testing branch).

Thanks!
sage


argonaut stable update coming shortly

2013-01-04 Thread Sage Weil
We recently identified a bug in the way the OSD is marking its commits 
on XFS or ext4 (well, anything not btrfs) that could lead to data loss in the 
event of power loss or a kernel panic.  We're doing some final testing and 
will have a v0.48.3argonaut package available by Monday.  If you like, 
you can get the fix before that from the autobuilders by grabbing the 
'stable' branch; see 

 http://ceph.com/docs/master/install/debian/#add-development-testing-packages

sage





ceph caps (Ganesha + Ceph pnfs)

2013-01-04 Thread Matt W. Benjamin
Hi Ceph folks,

Summarizing from Ceph IRC discussion by request, I'm one of the developers of a 
pNFS (parallel nfs) implementation that is built atop the Ceph system.

I'm working on code that wants to use the Ceph caps system to control and 
sequence i/o operations and file metadata, for example, so that ordinary Ceph 
clients see a coherent view of the objects being exported via pNFS.

The basic pNFS model (sorry for those who know all this, RFC 5661, etc) is to 
extend NFSv4 with a distributed/parallel access model.  To do parallel access 
in pNFS, the NFS client gets a `layout` from an NFS metadata (MDS) server.  A 
layout is a recallable object, a bit like an oplock/delegation/DCE token (see the 
spec); it basically presents a list of subordinate data servers (DSes) on which 
to read and/or write regions of a specific file.

Ok, so in our implementation, we would typically expect to have a DS server 
collocated with each Ceph OSD.  When an NFS client has a layout on a given 
inode, its i/o requests will be performed "directly" by the appropriate OSD.  
When an MDS is asked to issue a layout on a file, it should hold a cap or caps 
which ensure the layout will not conflict with other Ceph clients and ensure 
the MDS will be notified when it must recall the layout later if other clients 
attempt conflicting operations.  In turn, involved DS servers need the correct 
caps to read and/or write the data, plus, they need to update file metadata 
periodically.  (This can be upon a final commit of the client's layout, or 
inline with a write operation, if the client specifies the write be 'sync' 
stability.)

The current set of behaviors we're modeling are:

a) allow the MDS to hold Ceph caps, tracking issued pNFS layouts, such that it 
will be able to handle events which should trigger layout recalls at its pNFS 
clients (e.g., on conflicts)--currently it holds 
CEPH_CAP_FILE_WR|CEPH_CAP_FILE_RD

b) on a given DS, we currently get CEPH_CAP_FILE_WR|CEPH_CAP_FILE_RD caps when 
asked to perform i/o on behalf of a valid layout--but we also need to update 
metadata (size, mtime), and my question on IRC was to cross-check that these 
are the correct capabilities to hold when sending such an update message

In the current pass I'm trying to clean up/refine the model implementation, 
leaving some room for adjustment.

Thanks!

Matt

-- 
Matt Benjamin
The Linux Box
206 South Fifth Ave. Suite 150
Ann Arbor, MI  48104

http://linuxbox.com

tel. 734-761-4689
fax. 734-769-8938
cel. 734-216-5309


Re: Any idea about doing deduplication in ceph?

2013-01-04 Thread Gregory Farnum
On Wed, Dec 26, 2012 at 6:16 PM, lollipop  wrote:
> Nowadays I am wondering about doing offline deduplication in Ceph.
> My idea is:
> First, in the ceph-client, I try to get the locations of the chunks in one file.
> The information includes how many chunks the file has and which OSD each
> chunk (object group) is stored on.
> Then the ceph-client tries to communicate with that exact OSD to ask it to
> return the chunk hash.
> After that, we compare the returned hash with the already-stored hash table.
> If the chunk is a duplicate, we try to change the file metadata.
> Can it work?
> Can you give some ideas? Thank you
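
(As a rough illustration of the flow described above: the only step that exists
out of the box today is looking up where an object lives; the chunk-hash query
and the metadata rewrite would be new code. Pool and object names here are
made up.)

    # ask the cluster which placement group / OSDs hold a given object
    ceph osd map data chunk_obj.0001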


Any off-line deduplication support in Ceph is going to have the
important parts be in the OSD code, not in the clients. :) We've
discussed dedup a little bit internally and on the mailing list; you
can do an archive search if you're interested in what the current
thoughts are. :)
-Greg


Re: ceph stability

2013-01-04 Thread Gregory Farnum
On Fri, Dec 21, 2012 at 2:07 AM, Amon Ott  wrote:
> On 20.12.2012 15:31, Mark Nelson wrote:
>> On 12/20/2012 01:08 AM, Roman Hlynovskiy wrote:
>>> Hello Mark,
>>>
>>> for multi-mds solutions do you refer to multi-active arch or 1 active
>>> and many standby arch?
>>
>> That's a good question!  I know we don't really recommend multi-active
>> right now for production use.  Not sure what our current recommendations
>> are for multi-standby.  As far as I know it's considered to be more
>> stable.  I'm sure Greg or Sage can chime in with a more accurate
>> assessment.
>
> We have been testing a lot with multi-standby, because a single MDS does
> not make a lot of sense in a cluster. Maybe the clue is to have only one
> standby, making SPOF a DPOF?

The number of standbys should not have any impact on stability — they
are read-only until the active MDS gets declared down, at which point
one of them becomes active. I suppose having a standby could reduce
stability if your active MDS is overloaded enough to miss check-ins
without actually going down — without a standby it would eventually
recover, but with a standby a transition happens. That's the only
difference, though.
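
(For anyone setting this up: a standby is just another ceph-mds daemon. A
minimal ceph.conf sketch, with made-up host/daemon names and the option
spellings from roughly this era:)

    [mds.b]
        host = cephnode2
        # follow the active MDS's journal so takeover is faster
        mds standby replay = true
        # only stand in for rank 0
        mds standby for rank = 0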
-Greg


Re: which Linux kernel version corresponds to 0.48argonaut?

2013-01-04 Thread Sage Weil
On Fri, 4 Jan 2013, Gregory Farnum wrote:
> I think they might be different just as a consequence of being updated
> less recently; that's where all of the lines whose origin I recognize
> differ (not certain about the calc_parents stuff though). Sage can
> confirm.

In general, the crush files in mainline should track the same files in 
ceph.git master, modulo the #include lines at the top.  0.48argonaut is 
sufficiently old that the linux versions are actually newer.

> The specific issue you encountered previously was of course because
> you changed the layout algorithm and the client needs to be able to
> process that layout itself.

Yep!
sage


> -Greg
> 
> On Thu, Dec 20, 2012 at 5:37 PM, Xing Lin  wrote:
> > This may be useful for other Ceph newbies just like me.
> >
> > I have ported my changes to 0.48argonaut to related Ceph files included in
> > Linux, though files with the same name are not exactly the same. Then I
> > recompiled and installed the kernel. After that, everything seems to be
> > working again now: Ceph is working with my new simple replica placement
> > algorithm. :)
> > So, it seems that Ceph files included in the Linux kernel are supposed to be
> > different from those in 0.48argonaut. Presumably, the Linux kernel contains
> > the client-side implementation while 0.48argonaut contains the server-side
> > implementation. It would be appreciated if someone can confirm it. Thank
> > you!
> >
> > Xing
> >
> >
> > On 12/20/2012 11:54 AM, Xing Lin wrote:
> >>
> >> Hi,
> >>
> >> I was trying to add a simple replica placement algorithm in Ceph. This
> >> algorithm simply returns r_th item in a bucket for the r_th replica. I have
> >> made that change in Ceph source code (including files such as crush.h,
> >> crush.c, mapper.c, ...) and I can run Ceph monitor and osd daemons. 
> >> However,
> >> I am not able to map rbd block devices at client machines. 'rbd map image0'
> >> reported "input/output error" and 'dmesg' at the client machine showed
> >> message like "libceph: handle_map corrupt msg". I believe that is because I
> >> have not ported my changes to Ceph client side programs and it does not
> >> recognize the new placement algorithm. I probably need to recompile the rbd
> >> block device driver. When I was trying to replace Ceph related files in
> >> Linux with my own version, I noticed that files in Linux-3.2.16 are
> >> different from these included in Ceph source code. For example, the
> >> following is the diff of crush.h in Linux-3.2.16 and 0.48argonaut. So, my
> >> question is that is there any version of Linux that contains the exact Ceph
> >> files as included in 0.48argonaut? Thanks.
> >>
> >> ---
> >>  $ diff -uNrp ceph-0.48argonaut/src/crush/crush.h
> >> linux-3.2.16/include/linux/crush/crush.h
> >> --- ceph-0.48argonaut/src/crush/crush.h2012-06-26 11:56:36.0
> >> -0600
> >> +++ linux-3.2.16/include/linux/crush/crush.h2012-04-22
> >> 16:31:32.0 -0600
> >> @@ -1,12 +1,7 @@
> >>  #ifndef CEPH_CRUSH_CRUSH_H
> >>  #define CEPH_CRUSH_CRUSH_H
> >>
> >> -#if defined(__linux__)
> >>  #include 
> >> -#elif defined(__FreeBSD__)
> >> -#include 
> >> -#include "include/inttypes.h"
> >> -#endif
> >>
> >>  /*
> >>   * CRUSH is a pseudo-random data distribution algorithm that
> >> @@ -156,24 +151,25 @@ struct crush_map {
> >>  struct crush_bucket **buckets;
> >>  struct crush_rule **rules;
> >>
> >> +/*
> >> + * Parent pointers to identify the parent bucket a device or
> >> + * bucket in the hierarchy.  If an item appears more than
> >> + * once, this is the _last_ time it appeared (where buckets
> >> + * are processed in bucket id order, from -1 on down to
> >> + * -max_buckets.
> >> + */
> >> +__u32 *bucket_parents;
> >> +__u32 *device_parents;
> >> +
> >>  __s32 max_buckets;
> >>  __u32 max_rules;
> >>  __s32 max_devices;
> >> -
> >> -/* choose local retries before re-descent */
> >> -__u32 choose_local_tries;
> >> -/* choose local attempts using a fallback permutation before
> >> - * re-descent */
> >> -__u32 choose_local_fallback_tries;
> >> -/* choose attempts before giving up */
> >> -__u32 choose_total_tries;
> >> -
> >> -__u32 *choose_tries;
> >>  };
> >>
> >>
> >>  /* crush.c */
> >> -extern int crush_get_bucket_item_weight(const struct crush_bucket *b, int
> >> pos);
> >> +extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
> >> +extern void crush_calc_parents(struct crush_map *map);
> >>  extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
> >>  extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
> >>  extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
> >> @@ -181,9 +177,4 @@ extern void crush_destroy_bucket_straw(s
> >>  extern void crush_destroy_bucket(struct crush_bucket *b);
> >>  extern void crush_destroy(struct crush_map *map);
> >>
> >> -static inline int crush_

Re: which Linux kernel version corresponds to 0.48argonaut?

2013-01-04 Thread Gregory Farnum
I think they might be different just as a consequence of being updated
less recently; that's where all of the lines whose origin I recognize
differ (not certain about the calc_parents stuff though). Sage can
confirm.
The specific issue you encountered previously was of course because
you changed the layout algorithm and the client needs to be able to
process that layout itself.
-Greg

On Thu, Dec 20, 2012 at 5:37 PM, Xing Lin  wrote:
> This may be useful for other Ceph newbies just like me.
>
> I have ported my changes to 0.48argonaut to related Ceph files included in
> Linux, though files with the same name are not exactly the same. Then I
> recompiled and installed the kernel. After that, everything seems to be
> working again now: Ceph is working with my new simple replica placement
> algorithm. :)
> So, it seems that Ceph files included in the Linux kernel are supposed to be
> different from those in 0.48argonaut. Presumably, the Linux kernel contains
> the client-side implementation while 0.48argonaut contains the server-side
> implementation. It would be appreciated if someone can confirm it. Thank
> you!
>
> Xing
>
>
> On 12/20/2012 11:54 AM, Xing Lin wrote:
>>
>> Hi,
>>
>> I was trying to add a simple replica placement algorithm in Ceph. This
>> algorithm simply returns r_th item in a bucket for the r_th replica. I have
>> made that change in Ceph source code (including files such as crush.h,
>> crush.c, mapper.c, ...) and I can run Ceph monitor and osd daemons. However,
>> I am not able to map rbd block devices at client machines. 'rbd map image0'
>> reported "input/output error" and 'dmesg' at the client machine showed
>> message like "libceph: handle_map corrupt msg". I believe that is because I
>> have not ported my changes to Ceph client side programs and it does not
>> recognize the new placement algorithm. I probably need to recompile the rbd
>> block device driver. When I was trying to replace Ceph related files in
>> Linux with my own version, I noticed that files in Linux-3.2.16 are
>> different from these included in Ceph source code. For example, the
>> following is the diff of crush.h in Linux-3.2.16 and 0.48argonaut. So, my
>> question is that is there any version of Linux that contains the exact Ceph
>> files as included in 0.48argonaut? Thanks.
>>
>> ---
>>  $ diff -uNrp ceph-0.48argonaut/src/crush/crush.h
>> linux-3.2.16/include/linux/crush/crush.h
>> --- ceph-0.48argonaut/src/crush/crush.h2012-06-26 11:56:36.0
>> -0600
>> +++ linux-3.2.16/include/linux/crush/crush.h2012-04-22
>> 16:31:32.0 -0600
>> @@ -1,12 +1,7 @@
>>  #ifndef CEPH_CRUSH_CRUSH_H
>>  #define CEPH_CRUSH_CRUSH_H
>>
>> -#if defined(__linux__)
>>  #include 
>> -#elif defined(__FreeBSD__)
>> -#include 
>> -#include "include/inttypes.h"
>> -#endif
>>
>>  /*
>>   * CRUSH is a pseudo-random data distribution algorithm that
>> @@ -156,24 +151,25 @@ struct crush_map {
>>  struct crush_bucket **buckets;
>>  struct crush_rule **rules;
>>
>> +/*
>> + * Parent pointers to identify the parent bucket a device or
>> + * bucket in the hierarchy.  If an item appears more than
>> + * once, this is the _last_ time it appeared (where buckets
>> + * are processed in bucket id order, from -1 on down to
>> + * -max_buckets.
>> + */
>> +__u32 *bucket_parents;
>> +__u32 *device_parents;
>> +
>>  __s32 max_buckets;
>>  __u32 max_rules;
>>  __s32 max_devices;
>> -
>> -/* choose local retries before re-descent */
>> -__u32 choose_local_tries;
>> -/* choose local attempts using a fallback permutation before
>> - * re-descent */
>> -__u32 choose_local_fallback_tries;
>> -/* choose attempts before giving up */
>> -__u32 choose_total_tries;
>> -
>> -__u32 *choose_tries;
>>  };
>>
>>
>>  /* crush.c */
>> -extern int crush_get_bucket_item_weight(const struct crush_bucket *b, int
>> pos);
>> +extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
>> +extern void crush_calc_parents(struct crush_map *map);
>>  extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
>>  extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
>>  extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
>> @@ -181,9 +177,4 @@ extern void crush_destroy_bucket_straw(s
>>  extern void crush_destroy_bucket(struct crush_bucket *b);
>>  extern void crush_destroy(struct crush_map *map);
>>
>> -static inline int crush_calc_tree_node(int i)
>> -{
>> -return ((i+1) << 1)-1;
>> -}
>> -
>>  #endif
>>
>> 
>> Xing
>> --
>> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
>> the body of a message to majord...@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel

Re: Usage of CEPH FS versa HDFS for Hadoop: TeraSort benchmark performance comparison issue

2013-01-04 Thread Gregory Farnum
Sorry for the delay; I've been out on vacation...

On Fri, Dec 14, 2012 at 6:09 AM, Lachfeld, Jutta
 wrote:
> I do not have the full output of "ceph pg dump" for that specific TeraSort 
> run, but here is a typical output after automatically preparing CEPH for a 
> benchmark run
>  (removed almost all lines in the long pg_stat table hoping that you do not 
> need them):

Actually those were exactly what I was after; they include output on
the total PG size and the number of objects so we can check on average
size. :) If you'd like to do it yourself, look at some of the PGs
which correspond to your data pool (the PG ids are all of the form
0.123a, and the number before the decimal point is the pool ID; by
default you'll be looking for 0).
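
(Concretely, something along these lines; pool 0 is the default "data" pool,
and the exact column layout of "ceph pg dump" varies a bit between versions:)

    ceph osd lspools              # confirm which pool id holds your data
    ceph pg dump | grep '^0\.'    # rows whose pg id starts with "0." belong to pool 0

Dividing the byte count by the object count for a few of those PGs gives the
average object size.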


On Fri, Dec 14, 2012 at 6:53 AM, Mark Nelson  wrote:
> The large block size may be an issue (at least with some of our default
> tunable settings).  You might want to try 4 or 16MB and see if it's any
> better or worse.

Unless you've got a specific reason to think this is busted, I am
pretty confident it's not a problem. :)


Jutta, do you have any finer-grained numbers than total run time
(specifically, how much time is spent on data generation versus the
read-and-sort for each FS)? HDFS doesn't do any journaling like Ceph
does and the fact that the Ceph journal is in-memory might not be
helping much since it's so small compared to the amount of data being
written.
-Greg


Re: [PATCH 0/2] Librados aio stat

2013-01-04 Thread Josh Durgin

On 01/04/2013 05:01 AM, Filippos Giannakos wrote:

Hi Team,

Is there any progress or any comments regarding the librados aio stat
patch ?


They look good to me. I put them in the wip-librados-aio-stat branch.
Can we add your signed-off-by to them?

Thanks,
Josh


Best regards

On 12/20/2012 10:05 PM, Filippos Giannakos wrote:

Hi Team,

Here is the patch with the changes, plus the tests you requested.

Best regards,
Filippos







Re: [PATCH v2] rbd: Support plain/json/xml output formatting

2013-01-04 Thread Josh Durgin

On 12/19/2012 04:17 AM, Stratos Psomadakis wrote:

This patch renames the --format option to --image-format, for specifying the RBD
image format, and uses --format to specify the output formatting (to be
consistent with the other ceph tools). To avoid breaking backwards compatibility
with existing scripts, rbd will still accept --format [1|2] for the image
format, but will print a warning message noting that its use is deprecated.

The rbd subcommands that support the new --format option are : ls, info, snap
list, children, showmapped, lock list.

Signed-off-by: Stratos Psomadakis 
---
Hi,

this is the updated version of the patch. I renamed --format option to
  --image-format, and used --format to specify the output formatting, as you
suggested.

I also implemented some basic error checking on the --format input, and modified
the rbd subcommands you mentioned to support plain/json/xml output formatting.
I'm not sure, though, whether the json and xml output of those commands is what
you'd want.

The style issues should also be resolved now.

Let me know what you think.


There were still some style issues (missing braces, long lines, and
spaces instead of tabs), but I squashed fixes to those into the
wip-rbd-formatted-output branch.

I changed the output a bit to use ints, arrays, etc. where it
seemed appropriate. The tests aren't quite done yet, but does
this new json output look good to you?
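
(For reference, usage with the patch applied would look roughly like the
following; pool and image names are made up, and the old spelling still works
but prints the deprecation warning:)

    rbd create --size 1024 --image-format 2 mypool/myimage   # new flag name
    rbd create --size 1024 --format 2 mypool/myimage          # deprecated, warns
    rbd ls mypool --format json                               # formatted output
    rbd info mypool/myimage --format xml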

Josh



Re: [PATCH 0/6] fix build and packaging issues

2013-01-04 Thread Gary Lowell
No Problem.

Cheers,
Gary

On Jan 4, 2013, at 10:00 AM, Gregory Farnum wrote:

> Thanks!
> 
> Gary, can you pull these into a branch and do some before-and-after
> package comparisons on our systems (for the different distros in
> gitbuilder) and then merge into master?
> -Greg
> 
> On Fri, Jan 4, 2013 at 9:51 AM, Danny Al-Gaaf  wrote:
>> This set of patches contains fixes for some build and packaging issues.
>> 
>> Danny Al-Gaaf (6):
>>  src/java/Makefile.am: fix default java dir
>>  ceph.spec.in: fix handling of java files
>>  ceph.spec.in: rename libcephfs-java package to cephfs-java
>>  ceph.spec.in: fix libcephfs-jni package name
>>  configure.ac: remove AC_PROG_RANLIB
>>  configure.ac: change junit4 handling
>> 
>> ceph.spec.in | 21 +
>> configure.ac |  8 +---
>> src/java/Makefile.am |  6 +++---
>> 3 files changed, 17 insertions(+), 18 deletions(-)
>> 
>> --
>> 1.8.0.2
>> 
>> --
>> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
>> the body of a message to majord...@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html



Re: [PATCH 0/6] fix build and packaging issues

2013-01-04 Thread Gregory Farnum
Thanks!

Gary, can you pull these into a branch and do some before-and-after
package comparisons on our systems (for the different distros in
gitbuilder) and then merge into master?
-Greg

On Fri, Jan 4, 2013 at 9:51 AM, Danny Al-Gaaf  wrote:
> This set of patches contains fixes for some build and packaging issues.
>
> Danny Al-Gaaf (6):
>   src/java/Makefile.am: fix default java dir
>   ceph.spec.in: fix handling of java files
>   ceph.spec.in: rename libcephfs-java package to cephfs-java
>   ceph.spec.in: fix libcephfs-jni package name
>   configure.ac: remove AC_PROG_RANLIB
>   configure.ac: change junit4 handling
>
>  ceph.spec.in | 21 +
>  configure.ac |  8 +---
>  src/java/Makefile.am |  6 +++---
>  3 files changed, 17 insertions(+), 18 deletions(-)
>
> --
> 1.8.0.2
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 0/2] Librados aio stat

2013-01-04 Thread Sage Weil
Sorry, I missed this second set of patches when they came through over the 
break.  We'll try to get them merged into master shortly.

Thanks!
sage


On Fri, 4 Jan 2013, Filippos Giannakos wrote:

> Hi Team,
> 
> Is there any progress or any comments regarding the librados aio stat
> patch ?
> 
> Best regards
> 
> On 12/20/2012 10:05 PM, Filippos Giannakos wrote:
> > Hi Team,
> > 
> > Here is the patch with the changes, plus the tests you requested.
> > 
> > Best regards,
> > Filippos
> 
> 
> -- 
> Filippos.
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 


[PATCH 0/6] fix build and packaging issues

2013-01-04 Thread Danny Al-Gaaf
This set of patches contains fixes for some build and packaging issues.

Danny Al-Gaaf (6):
  src/java/Makefile.am: fix default java dir
  ceph.spec.in: fix handling of java files
  ceph.spec.in: rename libcephfs-java package to cephfs-java
  ceph.spec.in: fix libcephfs-jni package name
  configure.ac: remove AC_PROG_RANLIB
  configure.ac: change junit4 handling

 ceph.spec.in | 21 +
 configure.ac |  8 +---
 src/java/Makefile.am |  6 +++---
 3 files changed, 17 insertions(+), 18 deletions(-)

-- 
1.8.0.2



[PATCH 5/6] configure.ac: remove AC_PROG_RANLIB

2013-01-04 Thread Danny Al-Gaaf
Remove the already commented-out AC_PROG_RANLIB to get rid of the warning:

libtoolize: `AC_PROG_RANLIB' is rendered obsolete by `LT_INIT'

Signed-off-by: Danny Al-Gaaf 
---
 configure.ac | 1 -
 1 file changed, 1 deletion(-)

diff --git a/configure.ac b/configure.ac
index 07bfaa9..15430dd 100644
--- a/configure.ac
+++ b/configure.ac
@@ -51,7 +51,6 @@ AM_CONDITIONAL(FREEBSD, test x"$freebsd" = x"yes")
 # Checks for programs.
 AC_PROG_CXX
 #AC_PROG_CC
-#AC_PROG_RANLIB
 AC_PROG_MAKE_SET
 AC_PROG_LIBTOOL
 
-- 
1.8.0.2



[PATCH 1/6] src/java/Makefile.am: fix default java dir

2013-01-04 Thread Danny Al-Gaaf
Fix default javadir in src/java/Makefile.am to $(datadir)/java
since this is the common data dir for java files.

Signed-off-by: Danny Al-Gaaf 
---
 src/java/Makefile.am | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/java/Makefile.am b/src/java/Makefile.am
index 55ff6c1..1f87885 100644
--- a/src/java/Makefile.am
+++ b/src/java/Makefile.am
@@ -43,7 +43,7 @@ $(JAVA_H): $(CEPH_PROXY)
 libcephfs.jar: $(CEPH_PROXY)
$(JAR) cf $@ $(JAVA_CLASSES:%=-C java %) 
 
-javadir = $(libdir)
+javadir = $(datadir)/java
 java_DATA = libcephfs.jar
 
 CLEANFILES = -rf java/com/ceph/fs/*.class $(JAVA_H) libcephfs.jar 
-- 
1.8.0.2



[PATCH 2/6] ceph.spec.in: fix handling of java files

2013-01-04 Thread Danny Al-Gaaf
Fix handling of JAVA (jar) files. Don't move the files around in the install
section since the related Makefile is fixed instead.

Use %{_javadir} instead of /usr/share/java/

Signed-off-by: Danny Al-Gaaf 
---
 ceph.spec.in | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/ceph.spec.in b/ceph.spec.in
index 7aae6fe..cd93573 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -281,9 +281,6 @@ mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/ceph/
 #  Relocate java packages to expected locations in buildroot.
 mkdir -p $RPM_BUILD_ROOT/usr/lib/jni
 mv $RPM_BUILD_ROOT/usr/lib64/libcephfs_jni.so* $RPM_BUILD_ROOT/usr/lib/jni/.
-mkdir -p $RPM_BUILD_ROOT/usr/share/java
-mv $RPM_BUILD_ROOT/usr/lib64/libcephfs.jar  $RPM_BUILD_ROOT/usr/share/java/.
-mv $RPM_BUILD_ROOT/usr/lib64/libcephfs-test.jar 
$RPM_BUILD_ROOT/usr/share/java/.
 
 %clean
 rm -rf $RPM_BUILD_ROOT
@@ -561,7 +558,7 @@ fi
 
 %files -n libcephfs-java
 %defattr(-,root,root,-)
-/usr/share/java/libcephfs.jar
-/usr/share/java/libcephfs-test.jar
+%{_javadir}/libcephfs.jar
+%{_javadir}/libcephfs-test.jar
 
 %changelog
-- 
1.8.0.2



[PATCH 6/6] configure.ac: change junit4 handling

2013-01-04 Thread Danny Al-Gaaf
Change the handling of --with-debug and junit4. Add a new conditional HAVE_JUNIT4
so that the ceph-test package can be built even if junit4 isn't available. In
that case simply don't build libcephfs-test.jar, but do build the rest of the tools.

Signed-off-by: Danny Al-Gaaf 
---
 configure.ac | 7 +--
 src/java/Makefile.am | 4 ++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/configure.ac b/configure.ac
index 15430dd..832054b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -309,8 +309,10 @@ AS_IF([test "x$enable_cephfs_java" = "xyes"], [
   junit4_jar=`find $dir -name junit4.jar | head -n 1`
  AS_IF([test -r "$junit4_jar"], [
  EXTRA_CLASSPATH_JAR=`dirname $junit4_jar`/junit4.jar
-  AC_SUBST(EXTRA_CLASSPATH_JAR)], [
-  AC_MSG_ERROR([Cannot find junit4.jar (apt-get install junit4)])])])
+  AC_SUBST(EXTRA_CLASSPATH_JAR)
+  [have_junit4=1]], [
+  AC_MSG_NOTICE([Cannot find junit4.jar (apt-get install junit4)])
+  [have_junit4=0]])])
 
# Check for Java programs: javac, javah, jar
 PATH_save=$PATH
@@ -341,6 +343,7 @@ AS_IF([test "x$enable_cephfs_java" = "xyes"], [
# Setup output var
AC_SUBST(JDK_CPPFLAGS)
 ])
+AM_CONDITIONAL(HAVE_JUNIT4, [test "$have_junit4" = "1"])
 
 # jni?
 # clear cache (from java above) -- this whole thing will get
diff --git a/src/java/Makefile.am b/src/java/Makefile.am
index 1f87885..788bea2 100644
--- a/src/java/Makefile.am
+++ b/src/java/Makefile.am
@@ -51,7 +51,7 @@ CLEANFILES = -rf java/com/ceph/fs/*.class $(JAVA_H) 
libcephfs.jar
 BUILT_SOURCES = $(JAVA_H)
 
 # build the tests if *both* --enable-cephfs-java and --with-debug were specifed
-if WITH_DEBUG 
+if HAVE_JUNIT4
 
 JAVA_TEST_CLASSES = $(JAVA_TEST_SRC:test/%.java=%.class)
 ESCAPED_JAVA_TEST_CLASSES = com/ceph/fs/CephAllTests\$$1.class
@@ -68,5 +68,5 @@ libcephfs-test.jar: $(CEPH_TEST_PROXY)
 java_DATA += libcephfs-test.jar
 
 CLEANFILES += test/com/ceph/fs/*.class libcephfs-test.jar
-endif # WITH_DEBUG
+endif # HAVE_JUNIT4 (includes WITH_DEBUG)
 endif #ENABLE_CEPHFS_JAVA
-- 
1.8.0.2



[PATCH 3/6] ceph.spec.in: rename libcephfs-java package to cephfs-java

2013-01-04 Thread Danny Al-Gaaf
Rename the libcephfs-java package to cephfs-java since the package
contains no (classic) library and RPMLINT complains about the name.

Signed-off-by: Danny Al-Gaaf 
---
 ceph.spec.in | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ceph.spec.in b/ceph.spec.in
index cd93573..bfd030b 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -200,14 +200,14 @@ BuildRequires:  java-devel
 This package contains the Java Native Interface library for CephFS Java
 bindings.
 
-%package -n libcephfs-java
+%package -n cephfs-java
 Summary:   Java libraries for the Ceph File System.
 Group: System Environment/Libraries
 License:   LGPL-2.0
 Requires:  java
 Requires:  libcephfs-jni = %{version}-%{release}-
 BuildRequires:  java-devel
-%description -n libcephfs-java
+%description -n cephfs-java
 This package contains the Java libraries for the Ceph File System.
 
 %if (0%{?centos} || 0%{?opensuse} || 0%{?suse_version})
@@ -556,7 +556,7 @@ fi
 %defattr(-,root,root,-)
 /usr/lib/jni/libcephfs_jni.so*
 
-%files -n libcephfs-java
+%files -n cephfs-java
 %defattr(-,root,root,-)
 %{_javadir}/libcephfs.jar
 %{_javadir}/libcephfs-test.jar
-- 
1.8.0.2



[PATCH 4/6] ceph.spec.in: fix libcephfs-jni package name

2013-01-04 Thread Danny Al-Gaaf
Rename libcephfs-jni to libcephfs_jni1 to reflect the SO name/version of
the library and to prevent RPMLINT from complaining about the naming.

Signed-off-by: Danny Al-Gaaf 
---
 ceph.spec.in | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ceph.spec.in b/ceph.spec.in
index bfd030b..8eb4be8 100644
--- a/ceph.spec.in
+++ b/ceph.spec.in
@@ -189,14 +189,14 @@ Requires: libcephfs1 = %{version}-%{release}
 %description -n ceph-test
 This package contains Ceph benchmarks and test tools.
 
-%package -n libcephfs-jni
+%package -n libcephfs_jni1
 Summary:   Java Native Interface library for CephFS Java bindings.
 Group: System Environment/Libraries
 License:   LGPL-2.0
 Requires:  java
 Requires:  libcephfs1 = %{version}-%{release}
 BuildRequires:  java-devel
-%description -n libcephfs-jni
+%description -n libcephfs_jni1
 This package contains the Java Native Interface library for CephFS Java
 bindings.
 
@@ -205,7 +205,7 @@ Summary:Java libraries for the Ceph File System.
 Group: System Environment/Libraries
 License:   LGPL-2.0
 Requires:  java
-Requires:  libcephfs-jni = %{version}-%{release}-
+Requires:  libcephfs_jni1 = %{version}-%{release}
 BuildRequires:  java-devel
 %description -n cephfs-java
 This package contains the Java libraries for the Ceph File System.
@@ -552,7 +552,7 @@ fi
 %{_bindir}/tpbench
 %{_bindir}/xattr_bench
 
-%files -n libcephfs-jni
+%files -n libcephfs_jni1
 %defattr(-,root,root,-)
 /usr/lib/jni/libcephfs_jni.so*
 
-- 
1.8.0.2



[PATCH, v2] rbd: define and use rbd_warn()

2013-01-04 Thread Alex Elder
Define a new function rbd_warn() that produces a boilerplate warning
message, identifying in the resulting message the affected rbd
device in the best way available.  Use it in a few places that now
use pr_warning().

Signed-off-by: Alex Elder 
Reviewed-by: Dan Mick 
---
v2: Added function name to warning message, as suggested by Dan.

 drivers/block/rbd.c |   46 --
 1 file changed, 36 insertions(+), 10 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 414052d..86e60eb 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -294,6 +294,36 @@ static struct device rbd_root_dev = {
.release =  rbd_root_dev_release,
 };

+#define rbd_warn(rbd_dev, fmt, ...) \
+   _rbd_warn(__func__, (rbd_dev), (fmt), ##__VA_ARGS__)
+
+static __printf(3, 4) void
+_rbd_warn(const char *func, struct rbd_device *rbd_dev, const char
*fmt, ...)
+{
+   struct va_format vaf;
+   va_list args;
+
+   va_start(args, fmt);
+   vaf.fmt = fmt;
+   vaf.va = &args;
+
+   if (!rbd_dev)
+   printk(KERN_WARNING RBD_DRV_NAME ": %s: %pV\n", func, &vaf);
+   else if (rbd_dev->disk)
+   printk(KERN_WARNING RBD_DRV_NAME ": %s: %s: %pV\n",
+   func, rbd_dev->disk->disk_name, &vaf);
+   else if (rbd_dev->spec && rbd_dev->spec->image_name)
+   printk(KERN_WARNING RBD_DRV_NAME ": %s: image %s: %pV\n",
+   func, rbd_dev->spec->image_name, &vaf);
+   else if (rbd_dev->spec && rbd_dev->spec->image_id)
+   printk(KERN_WARNING RBD_DRV_NAME ": %s: id %s: %pV\n",
+   func, rbd_dev->spec->image_id, &vaf);
+   else/* punt */
+   printk(KERN_WARNING RBD_DRV_NAME ": %s: rbd_dev %p: %pV\n",
+   func, rbd_dev, &vaf);
+   va_end(args);
+}
+
 #ifdef RBD_DEBUG
 #define rbd_assert(expr)   \
if (unlikely(!(expr))) {\
@@ -1407,8 +1437,8 @@ static void rbd_watch_cb(u64 ver, u64 notify_id,
u8 opcode, void *data)
(unsigned int) opcode);
rc = rbd_dev_refresh(rbd_dev, &hver);
if (rc)
-   pr_warning(RBD_DRV_NAME "%d got notification but failed to "
-  " update snaps: %d\n", rbd_dev->major, rc);
+   rbd_warn(rbd_dev, "got notification but failed to "
+  " update snaps: %d", rc);

rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
 }
@@ -1771,15 +1801,13 @@ rbd_dev_v1_header_read(struct rbd_device
*rbd_dev, u64 *version)
goto out_err;
if (WARN_ON((size_t) ret < size)) {
ret = -ENXIO;
-   pr_warning("short header read for image %s"
-   " (want %zd got %d)\n",
-   rbd_dev->spec->image_name, size, ret);
+   rbd_warn(rbd_dev, "short header read (want %zd got %d)",
+   size, ret);
goto out_err;
}
if (!rbd_dev_ondisk_valid(ondisk)) {
ret = -ENXIO;
-   pr_warning("invalid header for image %s\n",
-   rbd_dev->spec->image_name);
+   rbd_warn(rbd_dev, "invalid header");
goto out_err;
}

@@ -2634,9 +2662,7 @@ static int rbd_dev_probe_update_spec(struct
rbd_device *rbd_dev)
if (name)
rbd_dev->spec->image_name = (char *) name;
else
-   pr_warning(RBD_DRV_NAME "%d "
-   "unable to get image name for image id %s\n",
-   rbd_dev->major, rbd_dev->spec->image_id);
+   rbd_warn(rbd_dev, "unable to get image name");

/* Look up the snapshot name. */

-- 
1.7.9.5



Re: [PATCH REPOST 2/4] rbd: add warning messages for missing arguments

2013-01-04 Thread Alex Elder
On 01/03/2013 07:10 PM, Dan Mick wrote:
> Do you want to include in the message some kind of indication which
> operation/function is involved?  (this is definitely better, but even
> better might be to add "rbd add" or "rbd_add_parse_args" to the msgs)

Sure.  This comment really applies to the previous patch.  I'm
sure I thought of doing that and I'm not sure any more why I did
not.  Maybe worried about lines getting too long?  Or maybe
I bumped into varargs problems?  I don't know.

I'll rename the rbd_warn() function _rbd_warn(), and will add a new
first argument which is the function name.  Then I'll define a new
macro rbd_warn() that just calls _rbd_warn(), inserting __func__ as
a first argument.

-Alex

> On 01/03/2013 11:11 AM, Alex Elder wrote:
>> Tell the user (via dmesg) what was wrong with the arguments provided
>> via /sys/bus/rbd/add.
>>
>> Signed-off-by: Alex Elder 
>> ---
>>   drivers/block/rbd.c |   24 
>>   1 file changed, 16 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
>> index 635b81d..31da8c5 100644
>> --- a/drivers/block/rbd.c
>> +++ b/drivers/block/rbd.c
>> @@ -3244,8 +3244,10 @@ static int rbd_add_parse_args(const char *buf,
>>   /* The first four tokens are required */
>>
>>   len = next_token(&buf);
>> -if (!len)
>> -return -EINVAL;/* Missing monitor address(es) */
>> +if (!len) {
>> +rbd_warn(NULL, "no monitor address(es) provided");
>> +return -EINVAL;
>> +}
>>   mon_addrs = buf;
>>   mon_addrs_size = len + 1;
>>   buf += len;
>> @@ -3254,8 +3256,10 @@ static int rbd_add_parse_args(const char *buf,
>>   options = dup_token(&buf, NULL);
>>   if (!options)
>>   return -ENOMEM;
>> -if (!*options)
>> -goto out_err;/* Missing options */
>> +if (!*options) {
>> +rbd_warn(NULL, "no options provided");
>> +goto out_err;
>> +}
>>
>>   spec = rbd_spec_alloc();
>>   if (!spec)
>> @@ -3264,14 +3268,18 @@ static int rbd_add_parse_args(const char *buf,
>>   spec->pool_name = dup_token(&buf, NULL);
>>   if (!spec->pool_name)
>>   goto out_mem;
>> -if (!*spec->pool_name)
>> -goto out_err;/* Missing pool name */
>> +if (!*spec->pool_name) {
>> +rbd_warn(NULL, "no pool name provided");
>> +goto out_err;
>> +}
>>
>>   spec->image_name = dup_token(&buf, NULL);
>>   if (!spec->image_name)
>>   goto out_mem;
>> -if (!*spec->image_name)
>> -goto out_err;/* Missing image name */
>> +if (!*spec->image_name) {
>> +rbd_warn(NULL, "no image name provided");
>> +goto out_err;
>> +}
>>
>>   /*
>>* Snapshot name is optional; default is to use "-"
>>



Re: OSD memory leaks?

2013-01-04 Thread Sébastien Han
Hi Sam,

Thanks for your answer and sorry for the late reply.

Unfortunately I can't get anything useful out of the profiler; actually I do
get output, but I guess it doesn't show what it is supposed to show... I will
keep trying. Anyway, yesterday I thought that the problem might be due to
over-usage of some OSDs. I was thinking that the distribution of primary OSDs
might be uneven, which could have explained why the memory leaks are more
significant on some servers. In the end the distribution seems even, but while
looking at the pg dump I found something interesting in the scrub column: the
timestamps of the last scrubbing operations matched the times shown on the
graph.

After this I did some calculations: I compared the total number of scrubbing
operations with the time range in which the memory leaks occurred.
First of all, here is my setup:

root@c2-ceph-01 ~ # ceph osd tree
dumped osdmap tree epoch 859
# id weight type name up/down reweight
-1 12 pool default
-3 12 rack lc2_rack33
-2 3 host c2-ceph-01
0 1 osd.0 up 1
1 1 osd.1 up 1
2 1 osd.2 up 1
-4 3 host c2-ceph-04
10 1 osd.10 up 1
11 1 osd.11 up 1
9 1 osd.9 up 1
-5 3 host c2-ceph-02
3 1 osd.3 up 1
4 1 osd.4 up 1
5 1 osd.5 up 1
-6 3 host c2-ceph-03
6 1 osd.6 up 1
7 1 osd.7 up 1
8 1 osd.8 up 1


And here are the results:

* Ceph node 1, which has the largest memory leak, performed 1608 scrubs
in total and 1059 during the time range where the memory leaks occurred
* Ceph node 2, 1168 in total and 776 during the time range where the
memory leaks occurred
* Ceph node 3, 940 in total and 94 during the time range where the memory
leaks occurred
* Ceph node 4, 899 in total and 191 during the time range where the
memory leaks occurred

I'm still not entirely sure that the scrub operation causes the leak,
but it's the only relevant correlation that I have found...

Could it be that the scrubbing process doesn't release memory? Btw I was
wondering how Ceph decides at what time it should run the scrubbing
operation. I know that it's once a day and controlled by the following
options:

OPTION(osd_scrub_min_interval, OPT_FLOAT, 300)
OPTION(osd_scrub_max_interval, OPT_FLOAT, 60*60*24)

But how does Ceph determine the time at which the operation starts? During
cluster creation, probably?

I just checked the options that control OSD scrubbing and found that by default:

OPTION(osd_max_scrubs, OPT_INT, 1)

So that might explain why only one OSD uses a lot of memory.
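
(For reference, those defaults correspond to ceph.conf options that can be
tuned per cluster; an illustrative [osd] section, not a recommendation:)

    [osd]
        osd max scrubs = 1              # parallel scrubs per OSD
        osd scrub min interval = 300    # seconds
        osd scrub max interval = 86400  # 60*60*24 seconds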

My dirty workaround at the moment is to check the memory use of every OSD
and restart it if it uses more than 25% of the total memory. Also note that
on ceph 1, 3 and 4 it's always a single OSD that uses a lot of memory; on
ceph 2 the memory usage is high but almost the same for all the OSD
processes.

Thank you in advance.

--
Regards,
Sébastien Han.


On Wed, Dec 19, 2012 at 10:43 PM, Samuel Just  wrote:
>
> Sorry, it's been very busy.  The next step would be to try to get a heap
> dump.  You can start a heap profile on osd N by:
>
> ceph osd tell N heap start_profiler
>
> and you can get it to dump the collected profile using
>
> ceph osd tell N heap dump.
>
> The dumps should show up in the osd log directory.
>
> Assuming the heap profiler is working correctly, you can look at the
> dump using pprof in google-perftools.
>
> On Wed, Dec 19, 2012 at 8:37 AM, Sébastien Han  
> wrote:
> > No more suggestions? :(
> > --
> > Regards,
> > Sébastien Han.
> >
> >
> > On Tue, Dec 18, 2012 at 6:21 PM, Sébastien Han  
> > wrote:
> >> Nothing terrific...
> >>
> >> Kernel logs from my clients are full of "libceph: osd4
> >> 172.20.11.32:6801 socket closed"
> >>
> >> I saw this somewhere on the tracker.
> >>
> >> Does this harm?
> >>
> >> Thanks.
> >>
> >> --
> >> Regards,
> >> Sébastien Han.
> >>
> >>
> >>
> >> On Mon, Dec 17, 2012 at 11:55 PM, Samuel Just  wrote:
> >>>
> >>> What is the workload like?
> >>> -Sam
> >>>
> >>> On Mon, Dec 17, 2012 at 2:41 PM, Sébastien Han  
> >>> wrote:
> >>> > Hi,
> >>> >
> >>> > No, I don't see nothing abnormal in the network stats. I don't see
> >>> > anything in the logs... :(
> >>> > The weird thing is that one node over 4 seems to take way more memory
> >>> > than the others...
> >>> >
> >>> > --
> >>> > Regards,
> >>> > Sébastien Han.
> >>> >
> >>> >
> >>> > On Mon, Dec 17, 2012 at 11:31 PM, Sébastien Han 
> >>> >  wrote:
> >>> >>
> >>> >> Hi,
> >>> >>
> >>> >> No, I don't see nothing abnormal in the network stats. I don't see 
> >>> >> anything in the logs... :(
> >>> >> The weird thing is that one node over 4 seems to take way more memory 
> >>> >> than the others...
> >>> >>
> >>> >> --
> >>> >> Regards,
> >>> >> Sébastien Han.
> >>> >>
> >>> >>
> >>> >>
> >>> >> On Mon, Dec 17, 2012 at 7:12 PM, Samuel Just  
> >>> >> wrote:
> >>> >>>
> >>> >>> Are you having network hiccups?  There was a bug noticed recently that
> >>> >>> could cause a memory leak if nodes are being marked up and down.
> >>> >>> -Sam
> >>> >>>
> >>> >>> On Mon, Dec 17, 2012 at 12:28 AM, Sébastien Han 
> >>> >>>  wrote:
> >>> >>> > Hi guys,
> >>> >>> 

THE END, for now

2013-01-04 Thread Alex Elder
That concludes my reposting of un-reviewed patches.  I see I'm
getting some reviews now, so I'll be building up a replacement
testing branch with annotations to the commits indicating that.

-Alex


[PATCH REPOST] rbd: assign watch request more directly

2013-01-04 Thread Alex Elder
Both rbd_req_sync_op() and rbd_do_request() have a "linger"
parameter, which is the address of a pointer that should refer to
the osd request structure used to issue a request to an osd.

Only one case ever supplies a non-null "linger" argument: a
CEPH_OSD_OP_WATCH start.  And in that one case it is assigned
&rbd_dev->watch_request.

Within rbd_do_request() (where the assignment ultimately gets made)
we know the rbd_dev and therefore its watch_request field.  We
also know whether the op being sent is CEPH_OSD_OP_WATCH start.

Stop opaquely passing down the "linger" pointer, and instead just
assign the value directly inside rbd_do_request() when it's needed.

This makes it unnecessary for rbd_req_sync_watch() to make
arrangements to hold a value that's not available until a
bit later.  This more clearly separates setting up a watch
request from submitting it.

Signed-off-by: Alex Elder 
---
 drivers/block/rbd.c |   20 
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 21fbf82..02002b1 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1158,7 +1158,6 @@ static int rbd_do_request(struct request *rq,
  int coll_index,
  void (*rbd_cb)(struct ceph_osd_request *,
 struct ceph_msg *),
- struct ceph_osd_request **linger_req,
  u64 *ver)
 {
struct ceph_osd_client *osdc;
@@ -1210,9 +1209,9 @@ static int rbd_do_request(struct request *rq,
ceph_osdc_build_request(osd_req, ofs, len, 1, op,
snapc, snapid, &mtime);

-   if (linger_req) {
+   if (op->op == CEPH_OSD_OP_WATCH && op->watch.flag) {
ceph_osdc_set_request_linger(osdc, osd_req);
-   *linger_req = osd_req;
+   rbd_dev->watch_request = osd_req;
}

ret = ceph_osdc_start_request(osdc, osd_req, false);
@@ -1296,7 +1295,6 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
   const char *object_name,
   u64 ofs, u64 inbound_size,
   char *inbound,
-  struct ceph_osd_request **linger_req,
   u64 *ver)
 {
int ret;
@@ -1317,7 +1315,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
  op,
  NULL, 0,
  NULL,
- linger_req, ver);
+ ver);
if (ret < 0)
goto done;

@@ -1383,7 +1381,7 @@ static int rbd_do_op(struct request *rq,
 flags,
 op,
 coll, coll_index,
-rbd_req_cb, 0, NULL);
+rbd_req_cb, NULL);
if (ret < 0)
rbd_coll_end_req_index(rq, coll, coll_index,
(s32) ret, seg_len);
@@ -1410,7 +1408,7 @@ static int rbd_req_sync_read(struct rbd_device
*rbd_dev,
return -ENOMEM;

ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
-  op, object_name, ofs, len, buf, NULL, ver);
+  op, object_name, ofs, len, buf, ver);
rbd_osd_req_op_destroy(op);

return ret;
@@ -1436,7 +1434,7 @@ static int rbd_req_sync_notify_ack(struct
rbd_device *rbd_dev,
  CEPH_OSD_FLAG_READ,
  op,
  NULL, 0,
- rbd_simple_req_cb, 0, NULL);
+ rbd_simple_req_cb, NULL);

rbd_osd_req_op_destroy(op);

@@ -1469,7 +1467,6 @@ static void rbd_watch_cb(u64 ver, u64 notify_id,
u8 opcode, void *data)
  */
 static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
 {
-   struct ceph_osd_request **linger_req = NULL;
struct ceph_osd_req_op *op;
int ret = 0;

@@ -1481,7 +1478,6 @@ static int rbd_req_sync_watch(struct rbd_device
*rbd_dev, int start)
&rbd_dev->watch_event);
if (ret < 0)
return ret;
-   linger_req = &rbd_dev->watch_request;
} else {
rbd_assert(rbd_dev->watch_request != NULL);
}
@@ -1493,7 +1489,7 @@ static int rbd_req_sync_watch(struct rbd_device
*rbd_dev, int start)
ret = rbd_req_sync_op(rbd_dev,
  CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
  op, rbd_dev->header_name,
- 0, 0, NULL, linger_req, NULL);
+ 0, 0, NULL, NULL);

/* Cancel the event if we're tearing down, or on error */

@@ -1537,7 +1533,7 @@ static int rbd_req_sync_exec(struct rbd_device
*rbd_dev,

ret = rbd_req_sync_o

[PATCH REPOST 6/6] rbd: move remaining osd op setup into rbd_osd_req_op_create()

2013-01-04 Thread Alex Elder
The two remaining osd ops used by rbd are CEPH_OSD_OP_WATCH and
CEPH_OSD_OP_NOTIFY_ACK.  Move the setup of those operations into
rbd_osd_req_op_create(), and get rid of rbd_create_rw_op() and
rbd_destroy_op().
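
The resulting calling convention, as used later in this patch, looks
roughly like this:

        /* NOTIFY_ACK: cookie (the notify id) and version */
        op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);

        /* WATCH: cookie, version, plus a start/stop flag */
        op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
                        rbd_dev->watch_event->cookie,
                        rbd_dev->header.obj_version, start);
        ...
        rbd_osd_req_op_destroy(op);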

Signed-off-by: Alex Elder 
---
 drivers/block/rbd.c |   68
---
 1 file changed, 27 insertions(+), 41 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 9f41c32..21fbf82 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1027,24 +1027,6 @@ out_err:
return NULL;
 }

-static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u64 ofs,
u64 len)
-{
-   struct ceph_osd_req_op *op;
-
-   op = kzalloc(sizeof (*op), GFP_NOIO);
-   if (!op)
-   return NULL;
-
-   op->op = opcode;
-
-   return op;
-}
-
-static void rbd_destroy_op(struct ceph_osd_req_op *op)
-{
-   kfree(op);
-}
-
 struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
 {
struct ceph_osd_req_op *op;
@@ -1087,6 +1069,16 @@ struct ceph_osd_req_op *rbd_osd_req_op_create(u16
opcode, ...)
op->cls.indata_len = (u32) size;
op->payload_len += size;
break;
+   case CEPH_OSD_OP_NOTIFY_ACK:
+   case CEPH_OSD_OP_WATCH:
+   /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
+   /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
+   op->watch.cookie = va_arg(args, u64);
+   op->watch.ver = va_arg(args, u64);
+   op->watch.ver = cpu_to_le64(op->watch.ver); /* XXX */
+   if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
+   op->watch.flag = (u8) 1;
+   break;
default:
rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
kfree(op);
@@ -1434,14 +1426,10 @@ static int rbd_req_sync_notify_ack(struct
rbd_device *rbd_dev,
struct ceph_osd_req_op *op;
int ret;

-   op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0, 0);
+   op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
if (!op)
return -ENOMEM;

-   op->watch.ver = cpu_to_le64(ver);
-   op->watch.cookie = notify_id;
-   op->watch.flag = 0;
-
ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
  rbd_dev->header_name, 0, 0, NULL,
  NULL, 0,
@@ -1450,7 +1438,8 @@ static int rbd_req_sync_notify_ack(struct
rbd_device *rbd_dev,
  NULL, 0,
  rbd_simple_req_cb, 0, NULL);

-   rbd_destroy_op(op);
+   rbd_osd_req_op_destroy(op);
+
return ret;
 }

@@ -1480,14 +1469,9 @@ static void rbd_watch_cb(u64 ver, u64 notify_id,
u8 opcode, void *data)
  */
 static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
 {
-   struct ceph_osd_req_op *op;
struct ceph_osd_request **linger_req = NULL;
-   __le64 version = 0;
-   int ret;
-
-   op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0, 0);
-   if (!op)
-   return -ENOMEM;
+   struct ceph_osd_req_op *op;
+   int ret = 0;

if (start) {
struct ceph_osd_client *osdc;
@@ -1496,26 +1480,28 @@ static int rbd_req_sync_watch(struct rbd_device
*rbd_dev, int start)
ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
&rbd_dev->watch_event);
if (ret < 0)
-   goto done;
-   version = cpu_to_le64(rbd_dev->header.obj_version);
+   return ret;
linger_req = &rbd_dev->watch_request;
+   } else {
+   rbd_assert(rbd_dev->watch_request != NULL);
}

-   op->watch.ver = version;
-   op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
-   op->watch.flag = (u8) start ? 1 : 0;
-
-   ret = rbd_req_sync_op(rbd_dev,
+   op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
+   rbd_dev->watch_event->cookie,
+   rbd_dev->header.obj_version, start);
+   if (op)
+   ret = rbd_req_sync_op(rbd_dev,
  CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
  op, rbd_dev->header_name,
  0, 0, NULL, linger_req, NULL);

-   if (!start || ret < 0) {
+   /* Cancel the event if we're tearing down, or on error */
+
+   if (!start || !op || ret < 0) {
ceph_osdc_cancel_event(rbd_dev->watch_event);
rbd_dev->watch_event = NULL;
}
-done:
-   rbd_destroy_op(op);
+   rbd_osd_req_op_destroy(op);

return ret;
 }
-- 
1.7.9.5


[PATCH REPOST 5/6] rbd: move call osd op setup into rbd_osd_req_op_create()

2013-01-04 Thread Alex Elder
Move the initialization of the CEPH_OSD_OP_CALL operation into
rbd_osd_req_op_create().

Signed-off-by: Alex Elder 
---
 drivers/block/rbd.c |   48 ++--
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 6fa6ba7..9f41c32 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -52,10 +52,12 @@
 #define SECTOR_SHIFT 9
 #define SECTOR_SIZE (1ULL << SECTOR_SHIFT)

-/* It might be useful to have this defined elsewhere too */
+/* It might be useful to have these defined elsewhere */

-#define U32_MAX ((u32) (~0U))
-#define U64_MAX ((u64) (~0ULL))
+#define U8_MAX  ((u8)   (~0U))
+#define U16_MAX ((u16)  (~0U))
+#define U32_MAX ((u32)  (~0U))
+#define U64_MAX ((u64)  (~0ULL))

 #define RBD_DRV_NAME "rbd"
 #define RBD_DRV_NAME_LONG "rbd (rados block device)"
@@ -1047,6 +1049,7 @@ struct ceph_osd_req_op *rbd_osd_req_op_create(u16
opcode, ...)
 {
struct ceph_osd_req_op *op;
va_list args;
+   size_t size;

op = kzalloc(sizeof (*op), GFP_NOIO);
if (!op)
@@ -1063,6 +1066,27 @@ struct ceph_osd_req_op *rbd_osd_req_op_create(u16
opcode, ...)
if (opcode == CEPH_OSD_OP_WRITE)
op->payload_len = op->extent.length;
break;
+   case CEPH_OSD_OP_CALL:
+   /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
+   op->cls.class_name = va_arg(args, char *);
+   size = strlen(op->cls.class_name);
+   rbd_assert(size <= (size_t) U8_MAX);
+   op->cls.class_len = size;
+   op->payload_len = size;
+
+   op->cls.method_name = va_arg(args, char *);
+   size = strlen(op->cls.method_name);
+   rbd_assert(size <= (size_t) U8_MAX);
+   op->cls.method_len = size;
+   op->payload_len += size;
+
+   op->cls.argc = 0;
+   op->cls.indata = va_arg(args, void *);
+   size = va_arg(args, size_t);
+   rbd_assert(size <= (size_t) U32_MAX);
+   op->cls.indata_len = (u32) size;
+   op->payload_len += size;
+   break;
default:
rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
kfree(op);
@@ -1510,9 +1534,6 @@ static int rbd_req_sync_exec(struct rbd_device
*rbd_dev,
 u64 *ver)
 {
struct ceph_osd_req_op *op;
-   int class_name_len = strlen(class_name);
-   int method_name_len = strlen(method_name);
-   int payload_size;
int ret;

/*
@@ -1523,25 +1544,16 @@ static int rbd_req_sync_exec(struct rbd_device
*rbd_dev,
 * the perspective of the server side) in the OSD request
 * operation.
 */
-   payload_size = class_name_len + method_name_len + outbound_size;
-   op = rbd_create_rw_op(CEPH_OSD_OP_CALL, 0, 0);
+   op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
+   method_name, outbound, outbound_size);
if (!op)
return -ENOMEM;
-   op->payload_len = payload_size;
-
-   op->cls.class_name = class_name;
-   op->cls.class_len = (__u8) class_name_len;
-   op->cls.method_name = method_name;
-   op->cls.method_len = (__u8) method_name_len;
-   op->cls.argc = 0;
-   op->cls.indata = outbound;
-   op->cls.indata_len = outbound_size;

ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op,
   object_name, 0, inbound_size, inbound,
   NULL, ver);

-   rbd_destroy_op(op);
+   rbd_osd_req_op_destroy(op);

dout("cls_exec returned %d\n", ret);
return ret;
-- 
1.7.9.5



[PATCH REPOST 4/6] rbd: define generalized osd request op routines

2013-01-04 Thread Alex Elder
Create a baseline function to encapsulate the creation of osd
requests, along with a matching function to destroy them.  For now
this just duplicates what rbd_create_rw_op() does for read and write
operations, but the next patches will expand on this.

Since rbd_create_rw_op() is no longer used for read or write
requests, the special handling in that function for those types can
be removed.
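
A minimal usage sketch for the new pair (covering just the read/write
case handled so far):

        op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, ofs, len);
        if (!op)
                return -ENOMEM;
        /* ... issue the request ... */
        rbd_osd_req_op_destroy(op);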

Signed-off-by: Alex Elder 
---
 drivers/block/rbd.c |   52
+++
 1 file changed, 40 insertions(+), 12 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 685d049..6fa6ba7 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1034,14 +1034,6 @@ static struct ceph_osd_req_op
*rbd_create_rw_op(int opcode, u64 ofs, u64 len)
return NULL;

op->op = opcode;
-   if (opcode == CEPH_OSD_OP_READ || opcode == CEPH_OSD_OP_WRITE) {
-   op->extent.offset = ofs;
-   op->extent.length = len;
-   if (opcode == CEPH_OSD_OP_WRITE) {
-   rbd_assert(len <= (u64) U32_MAX);
-   op->payload_len = len;
-   }
-   }

return op;
 }
@@ -1051,6 +1043,42 @@ static void rbd_destroy_op(struct ceph_osd_req_op
*op)
kfree(op);
 }

+struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
+{
+   struct ceph_osd_req_op *op;
+   va_list args;
+
+   op = kzalloc(sizeof (*op), GFP_NOIO);
+   if (!op)
+   return NULL;
+   op->op = opcode;
+   va_start(args, opcode);
+   switch (opcode) {
+   case CEPH_OSD_OP_READ:
+   case CEPH_OSD_OP_WRITE:
+   /* rbd_osd_req_op_create(READ, offset, length) */
+   /* rbd_osd_req_op_create(WRITE, offset, length) */
+   op->extent.offset = va_arg(args, u64);
+   op->extent.length = va_arg(args, u64);
+   if (opcode == CEPH_OSD_OP_WRITE)
+   op->payload_len = op->extent.length;
+   break;
+   default:
+   rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
+   kfree(op);
+   op = NULL;
+   break;
+   }
+   va_end(args);
+
+   return op;
+}
+
+static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
+{
+   kfree(op);
+}
+
 static void rbd_coll_end_req_index(struct request *rq,
   struct rbd_req_coll *coll,
   int index,
@@ -1323,7 +1351,7 @@ static int rbd_do_op(struct request *rq,
}

ret = -ENOMEM;
-   op = rbd_create_rw_op(opcode, seg_ofs, seg_len);
+   op = rbd_osd_req_op_create(opcode, seg_ofs, seg_len);
if (!op)
goto done;

@@ -1343,7 +1371,7 @@ static int rbd_do_op(struct request *rq,
if (ret < 0)
rbd_coll_end_req_index(rq, coll, coll_index,
(s32) ret, seg_len);
-   rbd_destroy_op(op);
+   rbd_osd_req_op_destroy(op);
 done:
kfree(seg_name);
return ret;
@@ -1361,13 +1389,13 @@ static int rbd_req_sync_read(struct rbd_device
*rbd_dev,
struct ceph_osd_req_op *op;
int ret;

-   op = rbd_create_rw_op(CEPH_OSD_OP_READ, ofs, len);
+   op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, ofs, len);
if (!op)
return -ENOMEM;

ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
   op, object_name, ofs, len, buf, NULL, ver);
-   rbd_destroy_op(op);
+   rbd_osd_req_op_destroy(op);

return ret;
 }
-- 
1.7.9.5



[PATCH REPOST 3/6] rbd: initialize off and len in rbd_create_rw_op()

2013-01-04 Thread Alex Elder
Move the initialization of a read or write operation's offset,
length, and payload length fields into rbd_create_rw_op().

This will actually get removed in the next patch, but it finishes
the consolidation of setting these fields at osd op creation time.

Signed-off-by: Alex Elder 
---
 drivers/block/rbd.c |   36 
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 49e500f..685d049 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1025,19 +1025,23 @@ out_err:
return NULL;
 }

-static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u32
payload_len)
+static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u64 ofs,
u64 len)
 {
struct ceph_osd_req_op *op;

op = kzalloc(sizeof (*op), GFP_NOIO);
if (!op)
return NULL;
-   /*
-* op extent offset and length will be set later on
-* after ceph_calc_file_object_mapping().
-*/
+
op->op = opcode;
-   op->payload_len = payload_len;
+   if (opcode == CEPH_OSD_OP_READ || opcode == CEPH_OSD_OP_WRITE) {
+   op->extent.offset = ofs;
+   op->extent.length = len;
+   if (opcode == CEPH_OSD_OP_WRITE) {
+   rbd_assert(len <= (u64) U32_MAX);
+   op->payload_len = len;
+   }
+   }

return op;
 }
@@ -1297,7 +1301,6 @@ static int rbd_do_op(struct request *rq,
u64 seg_len;
int ret;
struct ceph_osd_req_op *op;
-   u32 payload_len;
int opcode;
int flags;
u64 snapid;
@@ -1312,22 +1315,17 @@ static int rbd_do_op(struct request *rq,
opcode = CEPH_OSD_OP_WRITE;
flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
snapid = CEPH_NOSNAP;
-   payload_len = seg_len;
} else {
opcode = CEPH_OSD_OP_READ;
flags = CEPH_OSD_FLAG_READ;
rbd_assert(!snapc);
snapid = rbd_dev->spec->snap_id;
-   payload_len = 0;
}

ret = -ENOMEM;
-   op = rbd_create_rw_op(opcode, payload_len);
+   op = rbd_create_rw_op(opcode, seg_ofs, seg_len);
if (!op)
goto done;
-   op->extent.offset = seg_ofs;
-   op->extent.length = seg_len;
-   op->payload_len = payload_len;

/* we've taken care of segment sizes earlier when we
   cloned the bios. We should never have a segment
@@ -1363,12 +1361,9 @@ static int rbd_req_sync_read(struct rbd_device
*rbd_dev,
struct ceph_osd_req_op *op;
int ret;

-   op = rbd_create_rw_op(CEPH_OSD_OP_READ, 0);
+   op = rbd_create_rw_op(CEPH_OSD_OP_READ, ofs, len);
if (!op)
return -ENOMEM;
-   op->extent.offset = ofs;
-   op->extent.length = len;
-   op->payload_len = 0;

ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
   op, object_name, ofs, len, buf, NULL, ver);
@@ -1387,7 +1382,7 @@ static int rbd_req_sync_notify_ack(struct
rbd_device *rbd_dev,
struct ceph_osd_req_op *op;
int ret;

-   op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0);
+   op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0, 0);
if (!op)
return -ENOMEM;

@@ -1438,7 +1433,7 @@ static int rbd_req_sync_watch(struct rbd_device
*rbd_dev, int start)
__le64 version = 0;
int ret;

-   op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0);
+   op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0, 0);
if (!op)
return -ENOMEM;

@@ -1501,9 +1496,10 @@ static int rbd_req_sync_exec(struct rbd_device
*rbd_dev,
 * operation.
 */
payload_size = class_name_len + method_name_len + outbound_size;
-   op = rbd_create_rw_op(CEPH_OSD_OP_CALL, payload_size);
+   op = rbd_create_rw_op(CEPH_OSD_OP_CALL, 0, 0);
if (!op)
return -ENOMEM;
+   op->payload_len = payload_size;

op->cls.class_name = class_name;
op->cls.class_len = (__u8) class_name_len;
-- 
1.7.9.5



[PATCH REPOST 2/6] rbd: don't assign extent info in rbd_req_sync_op()

2013-01-04 Thread Alex Elder
Move the assignment of the extent offset and length and payload
length out of rbd_req_sync_op() and into its caller in the one spot
where a read (and note--no write) operation might be initiated.

Signed-off-by: Alex Elder 
---
 drivers/block/rbd.c |   10 +++---
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 4ce9981..49e500f 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1262,13 +1262,6 @@ static int rbd_req_sync_op(struct rbd_device
*rbd_dev,
if (IS_ERR(pages))
return PTR_ERR(pages);

-   if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
-   op->extent.offset = ofs;
-   op->extent.length = inbound_size;
-   if (op->op == CEPH_OSD_OP_WRITE)
-   op->payload_len = inbound_size;
-   }
-
ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
  object_name, ofs, inbound_size, NULL,
  pages, num_pages,
@@ -1373,6 +1366,9 @@ static int rbd_req_sync_read(struct rbd_device
*rbd_dev,
op = rbd_create_rw_op(CEPH_OSD_OP_READ, 0);
if (!op)
return -ENOMEM;
+   op->extent.offset = ofs;
+   op->extent.length = len;
+   op->payload_len = 0;

ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
   op, object_name, ofs, len, buf, NULL, ver);
-- 
1.7.9.5



[PATCH REPOST 1/6] rbd: don't assign extent info in rbd_do_request()

2013-01-04 Thread Alex Elder
In rbd_do_request() there's a sort of last-minute assignment of the
extent offset and length and payload length for read and write
operations.  Move those assignments into the caller (in those spots
that might initiate read or write operations).

Signed-off-by: Alex Elder 
---
 drivers/block/rbd.c |   17 ++---
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index f0b30b2..4ce9981 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1156,13 +1156,6 @@ static int rbd_do_request(struct request *rq,
osd_req->r_oid_len = strlen(osd_req->r_oid);

osd_req->r_file_layout = rbd_dev->layout;   /* struct */
-
-   if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
-   op->extent.offset = ofs;
-   op->extent.length = len;
-   if (op->op == CEPH_OSD_OP_WRITE)
-   op->payload_len = len;
-   }
osd_req->r_num_pages = calc_pages_for(ofs, len);
osd_req->r_page_alignment = ofs & ~PAGE_MASK;

@@ -1269,6 +1262,13 @@ static int rbd_req_sync_op(struct rbd_device
*rbd_dev,
if (IS_ERR(pages))
return PTR_ERR(pages);

+   if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
+   op->extent.offset = ofs;
+   op->extent.length = inbound_size;
+   if (op->op == CEPH_OSD_OP_WRITE)
+   op->payload_len = inbound_size;
+   }
+
ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
  object_name, ofs, inbound_size, NULL,
  pages, num_pages,
@@ -1332,6 +1332,9 @@ static int rbd_do_op(struct request *rq,
op = rbd_create_rw_op(opcode, payload_len);
if (!op)
goto done;
+   op->extent.offset = seg_ofs;
+   op->extent.length = seg_len;
+   op->payload_len = payload_len;

/* we've taken care of segment sizes earlier when we
   cloned the bios. We should never have a segment
-- 
1.7.9.5



[PATCH REPOST 0/6] rbd: consolidate osd request setup

2013-01-04 Thread Alex Elder
This series consolidates and encapsulates the setup of all
osd requests into a single function which takes variable
arguments appropriate for the type of request.  The result
groups together common code idioms and I think makes the
spots that build these messages a little easier to read.
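
After the series, every op is built through one variadic helper; the
call forms end up looking roughly like this:

        op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, ofs, len);
        op = rbd_osd_req_op_create(CEPH_OSD_OP_WRITE, ofs, len);
        op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class, method,
                                   data, datalen);
        op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, cookie, ver);
        op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH, cookie, ver, flag);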

-Alex

[PATCH REPOST 1/6] rbd: don't assign extent info in rbd_do_request()
[PATCH REPOST 2/6] rbd: don't assign extent info in rbd_req_sync_op()
[PATCH REPOST 3/6] rbd: initialize off and len in rbd_create_rw_op()
[PATCH REPOST 4/6] rbd: define generalized osd request op routines
[PATCH REPOST 5/6] rbd: move call osd op setup into rbd_osd_req_op_create()
[PATCH REPOST 6/6] rbd: move remaining osd op setup into
rbd_osd_req_op_create()


[PATCH REPOST 2/2] rbd: don't leak rbd_req for rbd_req_sync_notify_ack()

2013-01-04 Thread Alex Elder
When rbd_req_sync_notify_ack() calls rbd_do_request() it supplies
rbd_simple_req_cb() as its callback function.  Because the callback
is supplied, an rbd_req structure gets allocated and populated so it
can be used by the callback.  However rbd_simple_req_cb() is not
freeing (or even using) the rbd_req structure, so it's getting
leaked.

Since rbd_simple_req_cb() has no need for the rbd_req structure,
just avoid allocating one for this case.  Of the three calls to
rbd_do_request(), only the one from rbd_do_op() needs the rbd_req
structure, and that call can be distinguished from the other two
because it supplies a non-null rbd_collection pointer.

So fix this leak by only allocating the rbd_req structure if a
non-null "coll" value is provided to rbd_do_request().

Signed-off-by: Alex Elder 
Reviewed-by: Josh Durgin 
---
 drivers/block/rbd.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 17bf0b4..f0b30b2 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1135,7 +1135,7 @@ static int rbd_do_request(struct request *rq,
bio_get(osd_req->r_bio);
}

-   if (rbd_cb) {
+   if (coll) {
ret = -ENOMEM;
rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO);
if (!rbd_req)
@@ -1146,7 +1146,7 @@ static int rbd_do_request(struct request *rq,
rbd_req->pages = pages;
rbd_req->len = len;
rbd_req->coll = coll;
-   rbd_req->coll_index = coll ? coll_index : 0;
+   rbd_req->coll_index = coll_index;
}

osd_req->r_callback = rbd_cb;
-- 
1.7.9.5



[PATCH REPOST 1/2] rbd: don't leak rbd_req on synchronous requests

2013-01-04 Thread Alex Elder
When rbd_do_request() is called it allocates and populates an
rbd_req structure to hold information about the osd request to be
sent.  This is done for the benefit of the callback function (in
particular, rbd_req_cb()), which uses this in processing when
the request completes.

Synchronous requests provide no callback function, in which case
rbd_do_request() waits for the request to complete before returning.
This case does not free the rbd_req structure as it should, so the
structure is getting leaked.

Note however that the synchronous case has no need for the rbd_req
structure at all.  So rather than simply freeing this structure for
synchronous requests, just don't allocate it to begin with.

Signed-off-by: Alex Elder 
Reviewed-by: Josh Durgin 
---
 drivers/block/rbd.c |   48 
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index c1e5f24..17bf0b4 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1113,20 +1113,11 @@ static int rbd_do_request(struct request *rq,
  struct ceph_osd_request **linger_req,
  u64 *ver)
 {
+   struct ceph_osd_client *osdc;
struct ceph_osd_request *osd_req;
-   int ret;
+   struct rbd_request *rbd_req = NULL;
struct timespec mtime = CURRENT_TIME;
-   struct rbd_request *rbd_req;
-   struct ceph_osd_client *osdc;
-
-   rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO);
-   if (!rbd_req)
-   return -ENOMEM;
-
-   if (coll) {
-   rbd_req->coll = coll;
-   rbd_req->coll_index = coll_index;
-   }
+   int ret;

dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
object_name, (unsigned long long) ofs,
@@ -1134,10 +1125,8 @@ static int rbd_do_request(struct request *rq,

osdc = &rbd_dev->rbd_client->client->osdc;
osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO);
-   if (!osd_req) {
-   ret = -ENOMEM;
-   goto done_pages;
-   }
+   if (!osd_req)
+   return -ENOMEM;

osd_req->r_flags = flags;
osd_req->r_pages = pages;
@@ -1145,13 +1134,22 @@ static int rbd_do_request(struct request *rq,
osd_req->r_bio = bio;
bio_get(osd_req->r_bio);
}
-   osd_req->r_callback = rbd_cb;

-   rbd_req->rq = rq;
-   rbd_req->bio = bio;
-   rbd_req->pages = pages;
-   rbd_req->len = len;
+   if (rbd_cb) {
+   ret = -ENOMEM;
+   rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO);
+   if (!rbd_req)
+   goto done_osd_req;
+
+   rbd_req->rq = rq;
+   rbd_req->bio = bio;
+   rbd_req->pages = pages;
+   rbd_req->len = len;
+   rbd_req->coll = coll;
+   rbd_req->coll_index = coll ? coll_index : 0;
+   }

+   osd_req->r_callback = rbd_cb;
osd_req->r_priv = rbd_req;

strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
@@ -1193,10 +1191,12 @@ static int rbd_do_request(struct request *rq,
return ret;

 done_err:
-   bio_chain_put(rbd_req->bio);
-   ceph_osdc_put_request(osd_req);
-done_pages:
+   if (bio)
+   bio_chain_put(osd_req->r_bio);
kfree(rbd_req);
+done_osd_req:
+   ceph_osdc_put_request(osd_req);
+
return ret;
 }

-- 
1.7.9.5



[PATCH REPOST 0/2] rbd: fix two leaks

2013-01-04 Thread Alex Elder
When certain special osd requests are processed, they have data
structures allocated that are not properly freed when the request
is completed.  This series fixes that.

-Alex

[PATCH REPOST 1/2] rbd: don't leak rbd_req on synchronous requests
[PATCH REPOST 2/2] rbd: don't leak rbd_req for rbd_req_sync_notify_ack()


[PATCH REPOST] rbd: combine rbd sync watch/unwatch functions

2013-01-04 Thread Alex Elder
The rbd_req_sync_watch() and rbd_req_sync_unwatch() functions are
nearly identical.  Combine them into a single function with a flag
indicating whether a watch is to be initiated or torn down.
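
The two former entry points then collapse into a single call site
convention, e.g.:

        ret = rbd_req_sync_watch(rbd_dev, 1);   /* set up the watch */
        ...
        rbd_req_sync_watch(rbd_dev, 0);         /* tear it down */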

Signed-off-by: Alex Elder 
---
 drivers/block/rbd.c |   81
+--
 1 file changed, 27 insertions(+), 54 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 7c35608..c1e5f24 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1429,74 +1429,48 @@ static void rbd_watch_cb(u64 ver, u64 notify_id,
u8 opcode, void *data)
 }

 /*
- * Request sync osd watch
+ * Request sync osd watch/unwatch.  The value of "start" determines
+ * whether a watch request is being initiated or torn down.
  */
-static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
+static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
 {
struct ceph_osd_req_op *op;
-   struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+   struct ceph_osd_request **linger_req = NULL;
+   __le64 version = 0;
int ret;

op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0);
if (!op)
return -ENOMEM;

-   ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
-(void *)rbd_dev, &rbd_dev->watch_event);
-   if (ret < 0)
-   goto fail;
-
-   op->watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
-   op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
-   op->watch.flag = 1;
-
-   ret = rbd_req_sync_op(rbd_dev,
- CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
- op,
- rbd_dev->header_name,
- 0, 0, NULL,
- &rbd_dev->watch_request, NULL);
-
-   if (ret < 0)
-   goto fail_event;
-
-   rbd_destroy_op(op);
-   return 0;
-
-fail_event:
-   ceph_osdc_cancel_event(rbd_dev->watch_event);
-   rbd_dev->watch_event = NULL;
-fail:
-   rbd_destroy_op(op);
-   return ret;
-}
-
-/*
- * Request sync osd unwatch
- */
-static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
-{
-   struct ceph_osd_req_op *op;
-   int ret;
+   if (start) {
+   struct ceph_osd_client *osdc;

-   op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0);
-   if (!op)
-   return -ENOMEM;
+   osdc = &rbd_dev->rbd_client->client->osdc;
+   ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
+   &rbd_dev->watch_event);
+   if (ret < 0)
+   goto done;
+   version = cpu_to_le64(rbd_dev->header.obj_version);
+   linger_req = &rbd_dev->watch_request;
+   }

-   op->watch.ver = 0;
+   op->watch.ver = version;
op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
-   op->watch.flag = 0;
+   op->watch.flag = (u8) start ? 1 : 0;

ret = rbd_req_sync_op(rbd_dev,
  CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
- op,
- rbd_dev->header_name,
- 0, 0, NULL, NULL, NULL);
-
+ op, rbd_dev->header_name,
+ 0, 0, NULL, linger_req, NULL);

+   if (!start || ret < 0) {
+   ceph_osdc_cancel_event(rbd_dev->watch_event);
+   rbd_dev->watch_event = NULL;
+   }
+done:
rbd_destroy_op(op);
-   ceph_osdc_cancel_event(rbd_dev->watch_event);
-   rbd_dev->watch_event = NULL;
+
return ret;
 }

@@ -3031,7 +3005,7 @@ static int rbd_init_watch_dev(struct rbd_device
*rbd_dev)
int ret, rc;

do {
-   ret = rbd_req_sync_watch(rbd_dev);
+   ret = rbd_req_sync_watch(rbd_dev, 1);
if (ret == -ERANGE) {
rc = rbd_dev_refresh(rbd_dev, NULL);
if (rc < 0)
@@ -3750,8 +3724,7 @@ static void rbd_dev_release(struct device *dev)
rbd_dev->watch_request);
}
if (rbd_dev->watch_event)
-   rbd_req_sync_unwatch(rbd_dev);
-
+   rbd_req_sync_watch(rbd_dev, 0);

/* clean up and free blkdev */
rbd_free_disk(rbd_dev);
-- 
1.7.9.5



[PATCH REPOST] rbd: use a common layout for each device

2013-01-04 Thread Alex Elder
Each osd message includes a layout structure, and for rbd it is
always the same (at least for osd's in a given pool).

Initialize a layout structure when an rbd_dev gets created and just
copy that into osd requests for the rbd image.
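
Sketched from the hunks below: the layout is filled in once when the
rbd_dev is created, then copied by struct assignment per request:

        /* once, in rbd_dev_create() */
        rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
        rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
        rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
        rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);

        /* per request, in rbd_do_request() */
        osd_req->r_file_layout = rbd_dev->layout;       /* struct */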

Replace an assertion that was done when initializing the layout
structures with code that catches and handles anything that would
trigger the assertion as soon as it is identified.  This precludes
that (bad) condition from ever occurring.

Signed-off-by: Alex Elder 
---
 drivers/block/rbd.c |   34 +++---
 1 file changed, 23 insertions(+), 11 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 072608e..7c35608 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -235,6 +235,8 @@ struct rbd_device {

char*header_name;

+   struct ceph_file_layout layout;
+
struct ceph_osd_event   *watch_event;
struct ceph_osd_request *watch_request;

@@ -1091,16 +1093,6 @@ static void rbd_coll_end_req(struct rbd_request
*rbd_req,
ret, len);
 }

-static void rbd_layout_init(struct ceph_file_layout *layout, u64 pool_id)
-{
-   memset(layout, 0, sizeof (*layout));
-   layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-   layout->fl_stripe_count = cpu_to_le32(1);
-   layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
-   rbd_assert(pool_id <= (u64) U32_MAX);
-   layout->fl_pg_pool = cpu_to_le32((u32) pool_id);
-}
-
 /*
  * Send ceph osd request
  */
@@ -1165,7 +1157,7 @@ static int rbd_do_request(struct request *rq,
strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
osd_req->r_oid_len = strlen(osd_req->r_oid);

-   rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
+   osd_req->r_file_layout = rbd_dev->layout;   /* struct */

if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
op->extent.offset = ofs;
@@ -2295,6 +2287,13 @@ struct rbd_device *rbd_dev_create(struct
rbd_client *rbdc,
rbd_dev->spec = spec;
rbd_dev->rbd_client = rbdc;

+   /* Initialize the layout used for all rbd requests */
+
+   rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+   rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
+   rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+   rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
+
return rbd_dev;
 }

@@ -2549,6 +2548,12 @@ static int rbd_dev_v2_parent_info(struct
rbd_device *rbd_dev)
if (parent_spec->pool_id == CEPH_NOPOOL)
goto out;   /* No parent?  No problem. */

+   /* The ceph file layout needs to fit pool id in 32 bits */
+
+   ret = -EIO;
+   if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
+   goto out;
+
image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
if (IS_ERR(image_id)) {
ret = PTR_ERR(image_id);
@@ -3678,6 +3683,13 @@ static ssize_t rbd_add(struct bus_type *bus,
goto err_out_client;
spec->pool_id = (u64) rc;

+   /* The ceph file layout needs to fit pool id in 32 bits */
+
+   if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
+   rc = -EIO;
+   goto err_out_client;
+   }
+
rbd_dev = rbd_dev_create(rbdc, spec);
if (!rbd_dev)
goto err_out_client;
-- 
1.7.9.5



[PATCH REPOST 3/3] rbd: don't bother calculating file mapping

2013-01-04 Thread Alex Elder
When rbd_do_request() has a request to process it initializes a ceph
file layout structure and uses it to compute offsets and limits for
the range of the request using ceph_calc_file_object_mapping().

The layout used is fixed, and is based on RBD_MAX_OBJ_ORDER (30).
It sets the layout's object size and stripe unit to be 1 GB (2^30),
and sets the stripe count to be 1.

The job of ceph_calc_file_object_mapping() is to determine which
of a sequence of objects will contain data covered by range, and
within that object, at what offset the range starts.  It also
truncates the length of the range at the end of the selected object
if necessary.

This is needed for ceph fs, but for rbd it really serves no purpose.
It does its own blocking of images into objects, each of which is
(1 << obj_order) in size, and as a result it ignores the "bno"
value returned by ceph_calc_file_object_mapping().  In addition,
by the point a request has reached this function, it is already
destined for a single rbd object, and its length will not exceed
that object's extent.  Because of this, and because the mapping will
result in blocking up the range using an integer multiple of the
image's object order, ceph_calc_file_object_mapping() will never
change the offset or length values defined by the request.

In other words, this call is a big no-op for rbd data requests.
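
A back-of-the-envelope check (assuming the fixed rbd layout described
above, not quoting the mapping function itself):

        object_size = stripe_unit = 1 << RBD_MAX_OBJ_ORDER   (1 GB)
        stripe_count = 1

        bno    = ofs / object_size              -> 0 for an in-object offset
        objoff = ofs % object_size              -> ofs
        objlen = min(len, object_size - objoff) -> len

so the mapping hands back exactly the offset and length it was given.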

There is one exception.  We read the header object using this
function, and in that case we will not have already limited the
request size.  However, the header is a single object (not a file or
rbd image), and should not be broken into pieces anyway.  So in fact
we should *not* be calling ceph_calc_file_object_mapping() when
operating on the header object.

So...

Don't call ceph_calc_file_object_mapping() in rbd_do_request(),
because doing so is useless for image data and incorrect for the
image header.

Signed-off-by: Alex Elder 
---
 drivers/block/rbd.c |   18 --
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index e6db737..072608e 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1123,9 +1123,6 @@ static int rbd_do_request(struct request *rq,
 {
struct ceph_osd_request *osd_req;
int ret;
-   u64 bno;
-   u64 obj_off = 0;
-   u64 obj_len = 0;
struct timespec mtime = CURRENT_TIME;
struct rbd_request *rbd_req;
struct ceph_osd_client *osdc;
@@ -1169,19 +1166,12 @@ static int rbd_do_request(struct request *rq,
osd_req->r_oid_len = strlen(osd_req->r_oid);

rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
-   ret = ceph_calc_file_object_mapping(&osd_req->r_file_layout, ofs, len,
-   &bno, &obj_off, &obj_len);
-   rbd_assert(ret == 0);
-   if (obj_len < len) {
-   dout(" skipping last %llu, final file extent %llu~%llu\n",
-len - obj_len, ofs, obj_len);
-   len = obj_len;
-   }
+
if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
-   op->extent.offset = obj_off;
-   op->extent.length = obj_len;
+   op->extent.offset = ofs;
+   op->extent.length = len;
if (op->op == CEPH_OSD_OP_WRITE)
-   op->payload_len = obj_len;
+   op->payload_len = len;
}
osd_req->r_num_pages = calc_pages_for(ofs, len);
osd_req->r_page_alignment = ofs & ~PAGE_MASK;
-- 
1.7.9.5



[PATCH REPOST 2/3] rbd: open code rbd_calc_raw_layout()

2013-01-04 Thread Alex Elder
This patch gets rid of rbd_calc_raw_layout() by simply open coding
it in its one caller.

Signed-off-by: Alex Elder 
---
 drivers/block/rbd.c |   55
+--
 1 file changed, 18 insertions(+), 37 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 46b52a4..e6db737 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1032,7 +1032,7 @@ static struct ceph_osd_req_op
*rbd_create_rw_op(int opcode, u32 payload_len)
return NULL;
/*
 * op extent offset and length will be set later on
-* in calc_raw_layout()
+* after ceph_calc_file_object_mapping().
 */
op->op = opcode;
op->payload_len = payload_len;
@@ -1101,40 +1101,6 @@ static void rbd_layout_init(struct
ceph_file_layout *layout, u64 pool_id)
layout->fl_pg_pool = cpu_to_le32((u32) pool_id);
 }

-int rbd_calc_raw_layout(struct ceph_file_layout *layout,
-   u64 off, u64 *plen, u64 *bno,
-   struct ceph_osd_request *req,
-   struct ceph_osd_req_op *op)
-{
-   u64 orig_len = *plen;
-   u64 objoff, objlen;/* extent in object */
-   int r;
-
-   /* object extent? */
-   r = ceph_calc_file_object_mapping(layout, off, orig_len, bno,
- &objoff, &objlen);
-   if (r < 0)
-   return r;
-   if (objlen < orig_len) {
-   *plen = objlen;
-   dout(" skipping last %llu, final file extent %llu~%llu\n",
-orig_len - *plen, off, *plen);
-   }
-
-   if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
-   op->extent.offset = objoff;
-   op->extent.length = objlen;
-   }
-   req->r_num_pages = calc_pages_for(off, *plen);
-   req->r_page_alignment = off & ~PAGE_MASK;
-   if (op->op == CEPH_OSD_OP_WRITE)
-   op->payload_len = *plen;
-
-   dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
-*bno, objoff, objlen, req->r_num_pages);
-   return 0;
-}
-
 /*
  * Send ceph osd request
  */
@@ -1158,6 +1124,8 @@ static int rbd_do_request(struct request *rq,
struct ceph_osd_request *osd_req;
int ret;
u64 bno;
+   u64 obj_off = 0;
+   u64 obj_len = 0;
struct timespec mtime = CURRENT_TIME;
struct rbd_request *rbd_req;
struct ceph_osd_client *osdc;
@@ -1201,9 +1169,22 @@ static int rbd_do_request(struct request *rq,
osd_req->r_oid_len = strlen(osd_req->r_oid);

rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
-   ret = rbd_calc_raw_layout(&osd_req->r_file_layout,
-   ofs, &len, &bno, osd_req, op);
+   ret = ceph_calc_file_object_mapping(&osd_req->r_file_layout, ofs, len,
+   &bno, &obj_off, &obj_len);
rbd_assert(ret == 0);
+   if (obj_len < len) {
+   dout(" skipping last %llu, final file extent %llu~%llu\n",
+len - obj_len, ofs, obj_len);
+   len = obj_len;
+   }
+   if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
+   op->extent.offset = obj_off;
+   op->extent.length = obj_len;
+   if (op->op == CEPH_OSD_OP_WRITE)
+   op->payload_len = obj_len;
+   }
+   osd_req->r_num_pages = calc_pages_for(ofs, len);
+   osd_req->r_page_alignment = ofs & ~PAGE_MASK;

ceph_osdc_build_request(osd_req, ofs, len, 1, op,
snapc, snapid, &mtime);
-- 
1.7.9.5



[PATCH REPOST 1/3] rbd: pull in ceph_calc_raw_layout()

2013-01-04 Thread Alex Elder
This is the first in a series of patches aimed at eliminating
the use of ceph_calc_raw_layout() by rbd.

It simply pulls in a copy of that function and renames it
rbd_calc_raw_layout().

Signed-off-by: Alex Elder 
---
 drivers/block/rbd.c |   36 +++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index a57d6e9..46b52a4 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1101,6 +1101,40 @@ static void rbd_layout_init(struct
ceph_file_layout *layout, u64 pool_id)
layout->fl_pg_pool = cpu_to_le32((u32) pool_id);
 }

+int rbd_calc_raw_layout(struct ceph_file_layout *layout,
+   u64 off, u64 *plen, u64 *bno,
+   struct ceph_osd_request *req,
+   struct ceph_osd_req_op *op)
+{
+   u64 orig_len = *plen;
+   u64 objoff, objlen;/* extent in object */
+   int r;
+
+   /* object extent? */
+   r = ceph_calc_file_object_mapping(layout, off, orig_len, bno,
+ &objoff, &objlen);
+   if (r < 0)
+   return r;
+   if (objlen < orig_len) {
+   *plen = objlen;
+   dout(" skipping last %llu, final file extent %llu~%llu\n",
+orig_len - *plen, off, *plen);
+   }
+
+   if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) {
+   op->extent.offset = objoff;
+   op->extent.length = objlen;
+   }
+   req->r_num_pages = calc_pages_for(off, *plen);
+   req->r_page_alignment = off & ~PAGE_MASK;
+   if (op->op == CEPH_OSD_OP_WRITE)
+   op->payload_len = *plen;
+
+   dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
+*bno, objoff, objlen, req->r_num_pages);
+   return 0;
+}
+
 /*
  * Send ceph osd request
  */
@@ -1167,7 +1201,7 @@ static int rbd_do_request(struct request *rq,
osd_req->r_oid_len = strlen(osd_req->r_oid);

rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
-   ret = ceph_calc_raw_layout(&osd_req->r_file_layout,
+   ret = rbd_calc_raw_layout(&osd_req->r_file_layout,
ofs, &len, &bno, osd_req, op);
rbd_assert(ret == 0);

-- 
1.7.9.5



[PATCH REPOST 0/3] rbd: no need for file mapping calculation

2013-01-04 Thread Alex Elder
Currently every osd request submitted by the rbd code undergoes a
file mapping operation, which is common with what the ceph file system
uses.  But some analysis shows that there is no need to do this for
rbd, because it already takes care of its own blocking of image data
into distinct objects.  Removing this simplifies things.  I especially
think removing this improves things conceptually, removing a complex
mapping operation from the I/O path.

-Alex

[PATCH REPOST 1/3] rbd: pull in ceph_calc_raw_layout()
[PATCH REPOST 2/3] rbd: open code rbd_calc_raw_layout()
[PATCH REPOST 3/3] rbd: don't bother calculating file mapping


[PATCH REPOST] rbd: kill ceph_osd_req_op->flags

2013-01-04 Thread Alex Elder
The flags field of struct ceph_osd_req_op is never used, so just get
rid of it.

Signed-off-by: Alex Elder 
---
 include/linux/ceph/osd_client.h |1 -
 1 file changed, 1 deletion(-)

diff --git a/include/linux/ceph/osd_client.h
b/include/linux/ceph/osd_client.h
index 2b04d05..69287cc 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -157,7 +157,6 @@ struct ceph_osd_client {

 struct ceph_osd_req_op {
u16 op;   /* CEPH_OSD_OP_* */
-   u32 flags;/* CEPH_OSD_FLAG_* */
union {
struct {
u64 offset, length;
-- 
1.7.9.5



[PATCH REPOST 4/4] rbd: assume single op in a request

2013-01-04 Thread Alex Elder
We now know that every caller of rbd_req_sync_op() passes an array of
exactly one operation, as evidenced by all callers passing 1 as its
num_op argument.  So get rid of that argument, assuming a single op.

Similarly, we now know that all callers of rbd_do_request() pass 1
as the num_op value, so that parameter can be eliminated as well.

Signed-off-by: Alex Elder 
---
 drivers/block/rbd.c |   28 +---
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index d3472db..a57d6e9 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1113,8 +1113,7 @@ static int rbd_do_request(struct request *rq,
  struct page **pages,
  int num_pages,
  int flags,
- unsigned int num_op,
- struct ceph_osd_req_op *ops,
+ struct ceph_osd_req_op *op,
  struct rbd_req_coll *coll,
  int coll_index,
  void (*rbd_cb)(struct ceph_osd_request *,
@@ -1143,7 +1142,7 @@ static int rbd_do_request(struct request *rq,
(unsigned long long) len, coll, coll_index);

osdc = &rbd_dev->rbd_client->client->osdc;
-   osd_req = ceph_osdc_alloc_request(osdc, snapc, num_op, false, GFP_NOIO);
+   osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO);
if (!osd_req) {
ret = -ENOMEM;
goto done_pages;
@@ -1169,10 +1168,10 @@ static int rbd_do_request(struct request *rq,

rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
ret = ceph_calc_raw_layout(&osd_req->r_file_layout,
-   ofs, &len, &bno, osd_req, ops);
+   ofs, &len, &bno, osd_req, op);
rbd_assert(ret == 0);

-   ceph_osdc_build_request(osd_req, ofs, len, num_op, ops,
+   ceph_osdc_build_request(osd_req, ofs, len, 1, op,
snapc, snapid, &mtime);

if (linger_req) {
@@ -1255,8 +1254,7 @@ static void rbd_simple_req_cb(struct
ceph_osd_request *osd_req,
  */
 static int rbd_req_sync_op(struct rbd_device *rbd_dev,
   int flags,
-  unsigned int num_op,
-  struct ceph_osd_req_op *ops,
+  struct ceph_osd_req_op *op,
   const char *object_name,
   u64 ofs, u64 inbound_size,
   char *inbound,
@@ -1267,7 +1265,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
struct page **pages;
int num_pages;

-   rbd_assert(ops != NULL);
+   rbd_assert(op != NULL);

num_pages = calc_pages_for(ofs, inbound_size);
pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
@@ -1278,7 +1276,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
  object_name, ofs, inbound_size, NULL,
  pages, num_pages,
  flags,
- num_op, ops,
+ op,
  NULL, 0,
  NULL,
  linger_req, ver);
@@ -1348,7 +1346,7 @@ static int rbd_do_op(struct request *rq,
 bio,
 NULL, 0,
 flags,
-1, op,
+op,
 coll, coll_index,
 rbd_req_cb, 0, NULL);
if (ret < 0)
@@ -1377,7 +1375,7 @@ static int rbd_req_sync_read(struct rbd_device
*rbd_dev,
return -ENOMEM;

ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
-  1, op, object_name, ofs, len, buf, NULL, ver);
+  op, object_name, ofs, len, buf, NULL, ver);
rbd_destroy_op(op);

return ret;
@@ -1405,7 +1403,7 @@ static int rbd_req_sync_notify_ack(struct
rbd_device *rbd_dev,
  rbd_dev->header_name, 0, 0, NULL,
  NULL, 0,
  CEPH_OSD_FLAG_READ,
- 1, op,
+ op,
  NULL, 0,
  rbd_simple_req_cb, 0, NULL);

@@ -1457,7 +1455,7 @@ static int rbd_req_sync_watch(struct rbd_device
*rbd_dev)

ret = rbd_req_sync_op(rbd_dev,
  CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
- 1, op,
+ op,
  rbd_dev->header_name,
  0, 0, NULL,
  &rbd_dev->watch_request, NULL);
@@ -1494,7 +1492,7 @@ static int rbd_req_sync_unwatch(struct rbd_device
*rbd_dev)

ret = rbd_req_sync_op(rbd_dev,
  

[PATCH REPOST 3/4] rbd: there is really only one op

2013-01-04 Thread Alex Elder
Throughout the rbd code there are spots where it appears we can
handle an osd request containing more than one osd request op.

But that is only the way it appears.  In fact, currently only one
operation at a time can be supported, and supporting more than
one will require much more than fleshing out the support that's
there now.

This patch changes names to make it perfectly clear that anywhere
we're dealing with a block of ops, we're in fact dealing with
exactly one of them.  We'll be able to simplify some things as
a result.

When multiple op support is implemented, we can update things again
accordingly.

Signed-off-by: Alex Elder 
---
 drivers/block/rbd.c |  118
---
 1 file changed, 56 insertions(+), 62 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 0e6cc67..d3472db 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1023,32 +1023,26 @@ out_err:
return NULL;
 }

-/*
- * helpers for osd request op vectors.
- */
-static struct ceph_osd_req_op *rbd_create_rw_ops(int num_op,
-   int opcode, u32 payload_len)
+static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u32
payload_len)
 {
-   struct ceph_osd_req_op *ops;
+   struct ceph_osd_req_op *op;

-   ops = kzalloc(num_op * sizeof (*ops), GFP_NOIO);
-   if (!ops)
+   op = kzalloc(sizeof (*op), GFP_NOIO);
+   if (!op)
return NULL;
-
-   ops[0].op = opcode;
-
/*
 * op extent offset and length will be set later on
 * in calc_raw_layout()
 */
-   ops[0].payload_len = payload_len;
+   op->op = opcode;
+   op->payload_len = payload_len;

-   return ops;
+   return op;
 }

-static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
+static void rbd_destroy_op(struct ceph_osd_req_op *op)
 {
-   kfree(ops);
+   kfree(op);
 }

 static void rbd_coll_end_req_index(struct request *rq,
@@ -1314,7 +1308,7 @@ static int rbd_do_op(struct request *rq,
u64 seg_ofs;
u64 seg_len;
int ret;
-   struct ceph_osd_req_op *ops;
+   struct ceph_osd_req_op *op;
u32 payload_len;
int opcode;
int flags;
@@ -1340,8 +1334,8 @@ static int rbd_do_op(struct request *rq,
}

ret = -ENOMEM;
-   ops = rbd_create_rw_ops(1, opcode, payload_len);
-   if (!ops)
+   op = rbd_create_rw_op(opcode, payload_len);
+   if (!op)
goto done;

/* we've taken care of segment sizes earlier when we
@@ -1354,13 +1348,13 @@ static int rbd_do_op(struct request *rq,
 bio,
 NULL, 0,
 flags,
-1, ops,
+1, op,
 coll, coll_index,
 rbd_req_cb, 0, NULL);
if (ret < 0)
rbd_coll_end_req_index(rq, coll, coll_index,
(s32) ret, seg_len);
-   rbd_destroy_ops(ops);
+   rbd_destroy_op(op);
 done:
kfree(seg_name);
return ret;
@@ -1375,16 +1369,16 @@ static int rbd_req_sync_read(struct rbd_device
*rbd_dev,
  char *buf,
  u64 *ver)
 {
-   struct ceph_osd_req_op *ops;
+   struct ceph_osd_req_op *op;
int ret;

-   ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
-   if (!ops)
+   op = rbd_create_rw_op(CEPH_OSD_OP_READ, 0);
+   if (!op)
return -ENOMEM;

ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
-  1, ops, object_name, ofs, len, buf, NULL, ver);
-   rbd_destroy_ops(ops);
+  1, op, object_name, ofs, len, buf, NULL, ver);
+   rbd_destroy_op(op);

return ret;
 }
@@ -1396,26 +1390,26 @@ static int rbd_req_sync_notify_ack(struct
rbd_device *rbd_dev,
   u64 ver,
   u64 notify_id)
 {
-   struct ceph_osd_req_op *ops;
+   struct ceph_osd_req_op *op;
int ret;

-   ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
-   if (!ops)
+   op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0);
+   if (!op)
return -ENOMEM;

-   ops[0].watch.ver = cpu_to_le64(ver);
-   ops[0].watch.cookie = notify_id;
-   ops[0].watch.flag = 0;
+   op->watch.ver = cpu_to_le64(ver);
+   op->watch.cookie = notify_id;
+   op->watch.flag = 0;

ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
  rbd_dev->header_name, 0, 0, NULL,
  NULL, 0,
  CEPH_OSD_FLAG_READ,
- 1, ops,
+ 1, op,
  NULL, 0,
  rbd_simple_req_cb, 0, NULL);

-   rbd_destroy_ops(ops);
+   rbd_destro

[PATCH REPOST 2/4] libceph: pass num_op with ops

2013-01-04 Thread Alex Elder
Both ceph_osdc_alloc_request() and ceph_osdc_build_request() are
provided an array of ceph osd request operations.  Rather than just
passing the number of operations in the array, the caller is
required append an additional zeroed operation structure to signal
the end of the array.

All callers know the number of operations at the time these
functions are called, so drop the silly zero entry and supply that
number directly.  As a result, get_num_ops() is no longer needed.
This also means that ceph_osdc_alloc_request() never uses its ops
argument, so that can be dropped.

Also rbd_create_rw_ops() no longer needs to add one to reserve room
for the additional op.
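
In rough terms, the calling convention changes from a sentinel-terminated
array to an explicit count:

        /* before: an extra zeroed op marks the end of the array */
        ops = kzalloc((num_ops + 1) * sizeof (*ops), GFP_NOIO);
        osd_req = ceph_osdc_alloc_request(osdc, snapc, ops, false, GFP_NOIO);

        /* after: the count travels with the array */
        osd_req = ceph_osdc_alloc_request(osdc, snapc, num_op, false, GFP_NOIO);
        ceph_osdc_build_request(osd_req, ofs, len, num_op, ops,
                                snapc, snapid, &mtime);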

Signed-off-by: Alex Elder 
---
 drivers/block/rbd.c |9 
 include/linux/ceph/osd_client.h |3 ++-
 net/ceph/osd_client.c   |   43
++-
 3 files changed, 22 insertions(+), 33 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index cee1334..0e6cc67 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1026,12 +1026,12 @@ out_err:
 /*
  * helpers for osd request op vectors.
  */
-static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
+static struct ceph_osd_req_op *rbd_create_rw_ops(int num_op,
int opcode, u32 payload_len)
 {
struct ceph_osd_req_op *ops;

-   ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
+   ops = kzalloc(num_op * sizeof (*ops), GFP_NOIO);
if (!ops)
return NULL;

@@ -1149,7 +1149,7 @@ static int rbd_do_request(struct request *rq,
(unsigned long long) len, coll, coll_index);

osdc = &rbd_dev->rbd_client->client->osdc;
-   osd_req = ceph_osdc_alloc_request(osdc, snapc, ops, false, GFP_NOIO);
+   osd_req = ceph_osdc_alloc_request(osdc, snapc, num_op, false, GFP_NOIO);
if (!osd_req) {
ret = -ENOMEM;
goto done_pages;
@@ -1178,7 +1178,8 @@ static int rbd_do_request(struct request *rq,
ofs, &len, &bno, osd_req, ops);
rbd_assert(ret == 0);

-   ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, snapid, &mtime);
+   ceph_osdc_build_request(osd_req, ofs, len, num_op, ops,
+   snapc, snapid, &mtime);

if (linger_req) {
ceph_osdc_set_request_linger(osdc, osd_req);
diff --git a/include/linux/ceph/osd_client.h
b/include/linux/ceph/osd_client.h
index 75f56d3..2b04d05 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -214,12 +214,13 @@ extern int ceph_calc_raw_layout(struct
ceph_file_layout *layout,

 extern struct ceph_osd_request *ceph_osdc_alloc_request(struct
ceph_osd_client *osdc,
   struct ceph_snap_context *snapc,
-  struct ceph_osd_req_op *ops,
+  unsigned int num_op,
   bool use_mempool,
   gfp_t gfp_flags);

 extern void ceph_osdc_build_request(struct ceph_osd_request *req,
u64 off, u64 len,
+   unsigned int num_op,
struct ceph_osd_req_op *src_ops,
struct ceph_snap_context *snapc,
u64 snap_id,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 150166b..06625fa 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -152,25 +152,14 @@ void ceph_osdc_release_request(struct kref *kref)
 }
 EXPORT_SYMBOL(ceph_osdc_release_request);

-static int get_num_ops(struct ceph_osd_req_op *ops)
-{
-   int i = 0;
-
-   while (ops[i].op)
-   i++;
-
-   return i;
-}
-
 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client
*osdc,
   struct ceph_snap_context *snapc,
-  struct ceph_osd_req_op *ops,
+  unsigned int num_op,
   bool use_mempool,
   gfp_t gfp_flags)
 {
struct ceph_osd_request *req;
struct ceph_msg *msg;
-   int num_op = get_num_ops(ops);
size_t msg_size = sizeof(struct ceph_osd_request_head);

msg_size += num_op*sizeof(struct ceph_osd_op);
@@ -309,7 +298,7 @@ static void osd_req_encode_op(struct
ceph_osd_request *req,
  *
  */
 void ceph_osdc_build_request(struct ceph_osd_request *req,
-u64 off, u64 len,
+u64 off, u64 len, unsigned int num_op,
 struct ceph_osd_req_op *src_ops,
 struct ceph_snap_context *snapc, u64 snap_id,
 

[PATCH REPOST 1/4] rbd: pass num_op with ops array

2013-01-04 Thread Alex Elder
Add a num_op parameter to rbd_do_request() and rbd_req_sync_op() to
indicate the number of entries in the array.  The callers of these
functions always know how many entries are in the array, so just
pass that information down.

This is in anticipation of eliminating the extra zero-filled entry
in these ops arrays.

Signed-off-by: Alex Elder 
---
 drivers/block/rbd.c |   16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 6b9643a..cee1334 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1119,6 +1119,7 @@ static int rbd_do_request(struct request *rq,
  struct page **pages,
  int num_pages,
  int flags,
+ unsigned int num_op,
  struct ceph_osd_req_op *ops,
  struct rbd_req_coll *coll,
  int coll_index,
@@ -1259,6 +1260,7 @@ static void rbd_simple_req_cb(struct
ceph_osd_request *osd_req,
  */
 static int rbd_req_sync_op(struct rbd_device *rbd_dev,
   int flags,
+  unsigned int num_op,
   struct ceph_osd_req_op *ops,
   const char *object_name,
   u64 ofs, u64 inbound_size,
@@ -1281,7 +1283,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev,
  object_name, ofs, inbound_size, NULL,
  pages, num_pages,
  flags,
- ops,
+ num_op, ops,
  NULL, 0,
  NULL,
  linger_req, ver);
@@ -1351,7 +1353,7 @@ static int rbd_do_op(struct request *rq,
 bio,
 NULL, 0,
 flags,
-ops,
+1, ops,
 coll, coll_index,
 rbd_req_cb, 0, NULL);
if (ret < 0)
@@ -1380,7 +1382,7 @@ static int rbd_req_sync_read(struct rbd_device
*rbd_dev,
return -ENOMEM;

ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
-  ops, object_name, ofs, len, buf, NULL, ver);
+  1, ops, object_name, ofs, len, buf, NULL, ver);
rbd_destroy_ops(ops);

return ret;
@@ -1408,7 +1410,7 @@ static int rbd_req_sync_notify_ack(struct
rbd_device *rbd_dev,
  rbd_dev->header_name, 0, 0, NULL,
  NULL, 0,
  CEPH_OSD_FLAG_READ,
- ops,
+ 1, ops,
  NULL, 0,
  rbd_simple_req_cb, 0, NULL);

@@ -1460,7 +1462,7 @@ static int rbd_req_sync_watch(struct rbd_device
*rbd_dev)

ret = rbd_req_sync_op(rbd_dev,
  CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
- ops,
+ 1, ops,
  rbd_dev->header_name,
  0, 0, NULL,
  &rbd_dev->watch_request, NULL);
@@ -1497,7 +1499,7 @@ static int rbd_req_sync_unwatch(struct rbd_device
*rbd_dev)

ret = rbd_req_sync_op(rbd_dev,
  CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
- ops,
+ 1, ops,
  rbd_dev->header_name,
  0, 0, NULL, NULL, NULL);

@@ -1548,7 +1550,7 @@ static int rbd_req_sync_exec(struct rbd_device
*rbd_dev,
ops[0].cls.indata = outbound;
ops[0].cls.indata_len = outbound_size;

-   ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, ops,
+   ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, 1, ops,
   object_name, 0, inbound_size, inbound,
   NULL, ver);

-- 
1.7.9.5



[PATCH REPOST 0/4] rbd: explicitly support only one osd op

2013-01-04 Thread Alex Elder
An osd request can be made up of multiple "ops", all of which are
completed (or not) transactionally.  There is partial support for
multiple ops in an rbd request in the rbd code, but it's incomplete
and not even supported by the osd client or the messenger right now.

I see three problems with this partial implementation:  it gives a
false impression of how things work; it complicates some code in
some cases where it's not necessary; and it may constrain how one
might pursue fully implementing multiple ops in a request to ways
that don't fit well with how we want to do things.

So this series just simplifies things, making it explicit that there
is only one op in a kernel osd client request right now.

-Alex

[PATCH REPOST 1/4] rbd: pass num_op with ops array
[PATCH REPOST 2/4] libceph: pass num_op with ops
[PATCH REPOST 3/4] rbd: there is really only one op
[PATCH REPOST 4/4] rbd: assume single op in a request


[PATCH REPOST 6/6] libceph: don't set pages or bio in ceph_osdc_alloc_request()

2013-01-04 Thread Alex Elder
Only one of the two callers of ceph_osdc_alloc_request() provides
page or bio data for its payload.  And essentially all that function
was doing with those arguments was assigning them to fields in the
osd request structure.

Simplify ceph_osdc_alloc_request() by having the caller take care of
making those assignments.
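
As a sketch of the resulting division of labour (assembled from the rbd
hunk below; surrounding code is elided):

    osd_req = ceph_osdc_alloc_request(osdc, snapc, ops, false, GFP_NOIO);
    ...
    /* the caller now attaches the data payload itself */
    osd_req->r_pages = pages;
    if (bio) {
            osd_req->r_bio = bio;
            bio_get(osd_req->r_bio);        /* and takes the bio reference */
    }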

Signed-off-by: Alex Elder 
---
 drivers/block/rbd.c |8 ++--
 include/linux/ceph/osd_client.h |4 +---
 net/ceph/osd_client.c   |   15 ++-
 3 files changed, 9 insertions(+), 18 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index fc8bfe6..6b9643a 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1148,14 +1148,18 @@ static int rbd_do_request(struct request *rq,
(unsigned long long) len, coll, coll_index);

osdc = &rbd_dev->rbd_client->client->osdc;
-   osd_req = ceph_osdc_alloc_request(osdc, snapc, ops,
-   false, GFP_NOIO, pages, bio);
+   osd_req = ceph_osdc_alloc_request(osdc, snapc, ops, false, GFP_NOIO);
if (!osd_req) {
ret = -ENOMEM;
goto done_pages;
}

osd_req->r_flags = flags;
+   osd_req->r_pages = pages;
+   if (bio) {
+   osd_req->r_bio = bio;
+   bio_get(osd_req->r_bio);
+   }
osd_req->r_callback = rbd_cb;

rbd_req->rq = rq;
diff --git a/include/linux/ceph/osd_client.h
b/include/linux/ceph/osd_client.h
index 6ddda5b..75f56d3 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -216,9 +216,7 @@ extern struct ceph_osd_request
*ceph_osdc_alloc_request(struct ceph_osd_client *
   struct ceph_snap_context *snapc,
   struct ceph_osd_req_op *ops,
   bool use_mempool,
-  gfp_t gfp_flags,
-  struct page **pages,
-  struct bio *bio);
+  gfp_t gfp_flags);

 extern void ceph_osdc_build_request(struct ceph_osd_request *req,
u64 off, u64 len,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 3fe2960..150166b 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -166,9 +166,7 @@ struct ceph_osd_request
*ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
   struct ceph_snap_context *snapc,
   struct ceph_osd_req_op *ops,
   bool use_mempool,
-  gfp_t gfp_flags,
-  struct page **pages,
-  struct bio *bio)
+  gfp_t gfp_flags)
 {
struct ceph_osd_request *req;
struct ceph_msg *msg;
@@ -229,13 +227,6 @@ struct ceph_osd_request
*ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
memset(msg->front.iov_base, 0, msg->front.iov_len);

req->r_request = msg;
-   req->r_pages = pages;
-#ifdef CONFIG_BLOCK
-   if (bio) {
-   req->r_bio = bio;
-   bio_get(req->r_bio);
-   }
-#endif

return req;
 }
@@ -431,9 +422,7 @@ struct ceph_osd_request
*ceph_osdc_new_request(struct ceph_osd_client *osdc,
} else
ops[1].op = 0;

-   req = ceph_osdc_alloc_request(osdc, snapc, ops,
-use_mempool,
-GFP_NOFS, NULL, NULL);
+   req = ceph_osdc_alloc_request(osdc, snapc, ops, use_mempool, GFP_NOFS);
if (!req)
return ERR_PTR(-ENOMEM);
req->r_flags = flags;
-- 
1.7.9.5



[PATCH REPOST 5/6] libceph: don't set flags in ceph_osdc_alloc_request()

2013-01-04 Thread Alex Elder
The only thing ceph_osdc_alloc_request() really does with the
flags value it is passed is assign it to the newly-created
osd request structure.  Do that in the caller instead.

Both callers subsequently call ceph_osdc_build_request(), so have
that function (instead of ceph_osdc_alloc_request()) issue a warning
if a request comes through with neither the read nor write flags set.
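
Condensed, the flow after this patch looks like the sketch below
(assembled from the hunks that follow; argument lists abbreviated):

    osd_req = ceph_osdc_alloc_request(osdc, snapc, ops,
                                      false, GFP_NOIO, pages, bio);
    osd_req->r_flags = flags;       /* caller assigns the flags */

    /* ... while the sanity check moves into ceph_osdc_build_request(): */
    WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);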

Signed-off-by: Alex Elder 
---
 drivers/block/rbd.c |3 ++-
 include/linux/ceph/osd_client.h |1 -
 net/ceph/osd_client.c   |   11 ---
 3 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 09bcf08..fc8bfe6 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1148,13 +1148,14 @@ static int rbd_do_request(struct request *rq,
(unsigned long long) len, coll, coll_index);

osdc = &rbd_dev->rbd_client->client->osdc;
-   osd_req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
+   osd_req = ceph_osdc_alloc_request(osdc, snapc, ops,
false, GFP_NOIO, pages, bio);
if (!osd_req) {
ret = -ENOMEM;
goto done_pages;
}

+   osd_req->r_flags = flags;
osd_req->r_callback = rbd_cb;

rbd_req->rq = rq;
diff --git a/include/linux/ceph/osd_client.h
b/include/linux/ceph/osd_client.h
index fe3a6e8..6ddda5b 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -213,7 +213,6 @@ extern int ceph_calc_raw_layout(struct
ceph_file_layout *layout,
struct ceph_osd_req_op *op);

 extern struct ceph_osd_request *ceph_osdc_alloc_request(struct
ceph_osd_client *osdc,
-  int flags,
   struct ceph_snap_context *snapc,
   struct ceph_osd_req_op *ops,
   bool use_mempool,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 1b55fbe..3fe2960 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -163,7 +163,6 @@ static int get_num_ops(struct ceph_osd_req_op *ops)
 }

 struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client
*osdc,
-  int flags,
   struct ceph_snap_context *snapc,
   struct ceph_osd_req_op *ops,
   bool use_mempool,
@@ -200,10 +199,6 @@ struct ceph_osd_request
*ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
INIT_LIST_HEAD(&req->r_req_lru_item);
INIT_LIST_HEAD(&req->r_osd_item);

-   req->r_flags = flags;
-
-   WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
-
/* create reply message */
if (use_mempool)
msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
@@ -339,6 +334,8 @@ void ceph_osdc_build_request(struct ceph_osd_request
*req,
u64 data_len = 0;
int i;

+   WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
+
head = msg->front.iov_base;
head->snapid = cpu_to_le64(snap_id);
op = (void *)(head + 1);
@@ -434,12 +431,12 @@ struct ceph_osd_request
*ceph_osdc_new_request(struct ceph_osd_client *osdc,
} else
ops[1].op = 0;

-   req = ceph_osdc_alloc_request(osdc, flags,
-snapc, ops,
+   req = ceph_osdc_alloc_request(osdc, snapc, ops,
 use_mempool,
 GFP_NOFS, NULL, NULL);
if (!req)
return ERR_PTR(-ENOMEM);
+   req->r_flags = flags;

/* calculate max write size */
r = calc_layout(vino, layout, off, plen, req, ops);
-- 
1.7.9.5



[PATCH REPOST 4/6] libceph: drop osdc from ceph_calc_raw_layout()

2013-01-04 Thread Alex Elder
The osdc parameter to ceph_calc_raw_layout() is not used, so get rid
of it.  Consequently, the corresponding parameter in calc_layout()
becomes unused, so get rid of that as well.

Signed-off-by: Alex Elder 
---
 drivers/block/rbd.c |2 +-
 include/linux/ceph/osd_client.h |3 +--
 net/ceph/osd_client.c   |   10 --
 3 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 82a1460..09bcf08 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1168,7 +1168,7 @@ static int rbd_do_request(struct request *rq,
osd_req->r_oid_len = strlen(osd_req->r_oid);

rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
-   ret = ceph_calc_raw_layout(osdc, &osd_req->r_file_layout,
+   ret = ceph_calc_raw_layout(&osd_req->r_file_layout,
ofs, &len, &bno, osd_req, ops);
rbd_assert(ret == 0);

diff --git a/include/linux/ceph/osd_client.h
b/include/linux/ceph/osd_client.h
index 0e82a0a..fe3a6e8 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -207,8 +207,7 @@ extern void ceph_osdc_handle_reply(struct
ceph_osd_client *osdc,
 extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
 struct ceph_msg *msg);

-extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
-   struct ceph_file_layout *layout,
+extern int ceph_calc_raw_layout(struct ceph_file_layout *layout,
u64 off, u64 *plen, u64 *bno,
struct ceph_osd_request *req,
struct ceph_osd_req_op *op);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 75e57de..1b55fbe 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -38,8 +38,7 @@ static int op_has_extent(int op)
op == CEPH_OSD_OP_WRITE);
 }

-int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
-   struct ceph_file_layout *layout,
+int ceph_calc_raw_layout(struct ceph_file_layout *layout,
u64 off, u64 *plen, u64 *bno,
struct ceph_osd_request *req,
struct ceph_osd_req_op *op)
@@ -99,8 +98,7 @@ EXPORT_SYMBOL(ceph_calc_raw_layout);
  *
  * fill osd op in request message.
  */
-static int calc_layout(struct ceph_osd_client *osdc,
-  struct ceph_vino vino,
+static int calc_layout(struct ceph_vino vino,
   struct ceph_file_layout *layout,
   u64 off, u64 *plen,
   struct ceph_osd_request *req,
@@ -109,7 +107,7 @@ static int calc_layout(struct ceph_osd_client *osdc,
u64 bno;
int r;

-   r = ceph_calc_raw_layout(osdc, layout, off, plen, &bno, req, op);
+   r = ceph_calc_raw_layout(layout, off, plen, &bno, req, op);
if (r < 0)
return r;

@@ -444,7 +442,7 @@ struct ceph_osd_request
*ceph_osdc_new_request(struct ceph_osd_client *osdc,
return ERR_PTR(-ENOMEM);

/* calculate max write size */
-   r = calc_layout(osdc, vino, layout, off, plen, req, ops);
+   r = calc_layout(vino, layout, off, plen, req, ops);
if (r < 0)
return ERR_PTR(r);
req->r_file_layout = *layout;  /* keep a copy */
-- 
1.7.9.5



[PATCH REPOST 3/6] libceph: drop snapid in ceph_calc_raw_layout()

2013-01-04 Thread Alex Elder
A snapshot id must be provided to ceph_calc_raw_layout() even though
it is not needed at all for calculating the layout.

Where the snapshot id *is* needed is when building the request
message for an osd operation.

Drop the snapid parameter from ceph_calc_raw_layout() and pass
that value instead in ceph_osdc_build_request().
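
In call-shape terms (lifted from the rbd hunk below; the other arguments
are unchanged):

    ret = ceph_calc_raw_layout(osdc, &osd_req->r_file_layout,
                               ofs, &len, &bno, osd_req, ops);  /* no snapid */
    ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, snapid, &mtime);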

Signed-off-by: Alex Elder 
---
 drivers/block/rbd.c |4 ++--
 include/linux/ceph/osd_client.h |2 +-
 net/ceph/osd_client.c   |   14 --
 3 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 1a241a7..82a1460 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1169,10 +1169,10 @@ static int rbd_do_request(struct request *rq,

rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id);
ret = ceph_calc_raw_layout(osdc, &osd_req->r_file_layout,
-   snapid, ofs, &len, &bno, osd_req, ops);
+   ofs, &len, &bno, osd_req, ops);
rbd_assert(ret == 0);

-   ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, &mtime);
+   ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, snapid, &mtime);

if (linger_req) {
ceph_osdc_set_request_linger(osdc, osd_req);
diff --git a/include/linux/ceph/osd_client.h
b/include/linux/ceph/osd_client.h
index 4bfb458..0e82a0a 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -209,7 +209,6 @@ extern void ceph_osdc_handle_map(struct
ceph_osd_client *osdc,

 extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
struct ceph_file_layout *layout,
-   u64 snapid,
u64 off, u64 *plen, u64 *bno,
struct ceph_osd_request *req,
struct ceph_osd_req_op *op);
@@ -227,6 +226,7 @@ extern void ceph_osdc_build_request(struct
ceph_osd_request *req,
u64 off, u64 len,
struct ceph_osd_req_op *src_ops,
struct ceph_snap_context *snapc,
+   u64 snap_id,
struct timespec *mtime);

 extern struct ceph_osd_request *ceph_osdc_new_request(struct
ceph_osd_client *,
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 65b7d11..75e57de 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -40,18 +40,14 @@ static int op_has_extent(int op)

 int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
struct ceph_file_layout *layout,
-   u64 snapid,
u64 off, u64 *plen, u64 *bno,
struct ceph_osd_request *req,
struct ceph_osd_req_op *op)
 {
-   struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
u64 orig_len = *plen;
u64 objoff, objlen;/* extent in object */
int r;

-   reqhead->snapid = cpu_to_le64(snapid);
-
/* object extent? */
r = ceph_calc_file_object_mapping(layout, off, orig_len, bno,
  &objoff, &objlen);
@@ -113,8 +109,7 @@ static int calc_layout(struct ceph_osd_client *osdc,
u64 bno;
int r;

-   r = ceph_calc_raw_layout(osdc, layout, vino.snap, off,
-plen, &bno, req, op);
+   r = ceph_calc_raw_layout(osdc, layout, off, plen, &bno, req, op);
if (r < 0)
return r;

@@ -332,7 +327,7 @@ static void osd_req_encode_op(struct
ceph_osd_request *req,
 void ceph_osdc_build_request(struct ceph_osd_request *req,
 u64 off, u64 len,
 struct ceph_osd_req_op *src_ops,
-struct ceph_snap_context *snapc,
+struct ceph_snap_context *snapc, u64 snap_id,
 struct timespec *mtime)
 {
struct ceph_msg *msg = req->r_request;
@@ -347,6 +342,7 @@ void ceph_osdc_build_request(struct ceph_osd_request
*req,
int i;

head = msg->front.iov_base;
+   head->snapid = cpu_to_le64(snap_id);
op = (void *)(head + 1);
p = (void *)(op + num_op);

@@ -458,9 +454,7 @@ struct ceph_osd_request
*ceph_osdc_new_request(struct ceph_osd_client *osdc,
req->r_num_pages = calc_pages_for(page_align, *plen);
req->r_page_alignment = page_align;

-   ceph_osdc_build_request(req, off, *plen, ops,
-   snapc,
-   mtime);
+   ceph_osdc_build_request(req, off, *plen, ops, snapc, vino.snap, mtime);

return req;
 }
-- 
1.7.9.5



[PATCH REPOST 2/6] libceph: pass length to ceph_calc_file_object_mapping()

2013-01-04 Thread Alex Elder
ceph_calc_file_object_mapping() takes (among other things) a "file"
offset and length, and based on the layout, determines the object
number ("bno") backing the affected portion of the file's data and
the offset into that object where the desired range begins.  It also
computes the size that should be used for the request--either the
amount requested or something less if that would exceed the end of
the object.

This patch changes the input length parameter in this function so it
is used only for input.  That is, the argument will be passed by
value rather than by address, so the value provided won't get
updated by the function.

The value would only get updated if the length would surpass the
current object, and in that case the value it got updated to would
be exactly that returned in *oxlen.

Only one of the two callers is affected by this change.  Update
ceph_calc_raw_layout() so it records any updated value.
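
A toy, self-contained model of the new calling pattern may help; this is
not the kernel code, and it assumes a trivial layout in which file
offsets map linearly into fixed-size objects with no striping:

    #include <stdio.h>

    #define OBJECT_SIZE (4ULL << 20)            /* assumed 4 MB objects */

    /* the length is taken by value; only *oxoff and *oxlen are outputs */
    static void map_extent(unsigned long long off, unsigned long long len,
                           unsigned long long *oxoff, unsigned long long *oxlen)
    {
            *oxoff = off % OBJECT_SIZE;
            *oxlen = len;
            if (*oxoff + *oxlen > OBJECT_SIZE)
                    *oxlen = OBJECT_SIZE - *oxoff;  /* clip at object end */
    }

    int main(void)
    {
            unsigned long long oxoff, oxlen;
            unsigned long long plen = 3ULL << 20;       /* request 3 MB */

            map_extent(3ULL << 20, plen, &oxoff, &oxlen);
            if (oxlen < plen)
                    plen = oxlen;   /* caller records the clipped length,
                                     * as ceph_calc_raw_layout() now does */
            printf("oxoff=%llu oxlen=%llu plen=%llu\n", oxoff, oxlen, plen);
            return 0;
    }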

Signed-off-by: Alex Elder 
---
 fs/ceph/ioctl.c |2 +-
 include/linux/ceph/osdmap.h |2 +-
 net/ceph/osd_client.c   |6 --
 net/ceph/osdmap.c   |9 -
 4 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 36549a4..3b22150 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -194,7 +194,7 @@ static long ceph_ioctl_get_dataloc(struct file
*file, void __user *arg)
return -EFAULT;

down_read(&osdc->map_sem);
-   r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
+   r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
  &dl.object_no, &dl.object_offset,
  &olen);
if (r < 0)
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 5ea57ba..1f653e2 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -110,7 +110,7 @@ extern void ceph_osdmap_destroy(struct ceph_osdmap
*map);

 /* calculate mapping of a file extent to an object */
 extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
-u64 off, u64 *plen,
+u64 off, u64 len,
 u64 *bno, u64 *oxoff, u64 *oxlen);

 /* calculate mapping of object to a placement group */
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index cdf40c6..65b7d11 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -53,13 +53,15 @@ int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
reqhead->snapid = cpu_to_le64(snapid);

/* object extent? */
-   r = ceph_calc_file_object_mapping(layout, off, plen, bno,
+   r = ceph_calc_file_object_mapping(layout, off, orig_len, bno,
  &objoff, &objlen);
if (r < 0)
return r;
-   if (*plen < orig_len)
+   if (objlen < orig_len) {
+   *plen = objlen;
dout(" skipping last %llu, final file extent %llu~%llu\n",
 orig_len - *plen, off, *plen);
+   }

if (op_has_extent(op->op)) {
op->extent.offset = objoff;
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index de73214..9dded22 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1010,7 +1010,7 @@ bad:
  * pass a stride back to the caller.
  */
 int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
-  u64 off, u64 *plen,
+  u64 off, u64 len,
   u64 *ono,
   u64 *oxoff, u64 *oxlen)
 {
@@ -1021,7 +1021,7 @@ int ceph_calc_file_object_mapping(struct
ceph_file_layout *layout,
u32 su_per_object;
u64 t, su_offset;

-   dout("mapping %llu~%llu  osize %u fl_su %u\n", off, *plen,
+   dout("mapping %llu~%llu  osize %u fl_su %u\n", off, len,
 osize, su);
if (su == 0 || sc == 0)
goto invalid;
@@ -1054,11 +1054,10 @@ int ceph_calc_file_object_mapping(struct
ceph_file_layout *layout,

/*
 * Calculate the length of the extent being written to the selected
-* object. This is the minimum of the full length requested (plen) or
+* object. This is the minimum of the full length requested (len) or
 * the remainder of the current stripe being written to.
 */
-   *oxlen = min_t(u64, *plen, su - su_offset);
-   *plen = *oxlen;
+   *oxlen = min_t(u64, len, su - su_offset);

dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
return 0;
-- 
1.7.9.5



[PATCH REPOST 1/6] libceph: pass length to ceph_osdc_build_request()

2013-01-04 Thread Alex Elder
The len argument to ceph_osdc_build_request() is set up to be
passed by address, but that function never updates its value
so there's no need to do this.  Tighten up the interface by
passing the length directly.
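
The before/after prototypes make the point concrete (taken from the
osd_client.h hunk below; the remaining parameters are elided):

    /* before: length passed by address, implying it might be written back */
    void ceph_osdc_build_request(struct ceph_osd_request *req,
                                 u64 off, u64 *plen, ...);

    /* after: passed by value, since the function only ever reads it */
    void ceph_osdc_build_request(struct ceph_osd_request *req,
                                 u64 off, u64 len, ...);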

Signed-off-by: Alex Elder 
---
 drivers/block/rbd.c |2 +-
 include/linux/ceph/osd_client.h |2 +-
 net/ceph/osd_client.c   |6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index cc12154..1a241a7 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1172,7 +1172,7 @@ static int rbd_do_request(struct request *rq,
snapid, ofs, &len, &bno, osd_req, ops);
rbd_assert(ret == 0);

-   ceph_osdc_build_request(osd_req, ofs, &len, ops, snapc, &mtime);
+   ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, &mtime);

if (linger_req) {
ceph_osdc_set_request_linger(osdc, osd_req);
diff --git a/include/linux/ceph/osd_client.h
b/include/linux/ceph/osd_client.h
index 61562c7..4bfb458 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -224,7 +224,7 @@ extern struct ceph_osd_request
*ceph_osdc_alloc_request(struct ceph_osd_client *
   struct bio *bio);

 extern void ceph_osdc_build_request(struct ceph_osd_request *req,
-   u64 off, u64 *plen,
+   u64 off, u64 len,
struct ceph_osd_req_op *src_ops,
struct ceph_snap_context *snapc,
struct timespec *mtime);
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 2adbfcc..cdf40c6 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -328,7 +328,7 @@ static void osd_req_encode_op(struct
ceph_osd_request *req,
  *
  */
 void ceph_osdc_build_request(struct ceph_osd_request *req,
-u64 off, u64 *plen,
+u64 off, u64 len,
 struct ceph_osd_req_op *src_ops,
 struct ceph_snap_context *snapc,
 struct timespec *mtime)
@@ -382,7 +382,7 @@ void ceph_osdc_build_request(struct ceph_osd_request
*req,

if (flags & CEPH_OSD_FLAG_WRITE) {
req->r_request->hdr.data_off = cpu_to_le16(off);
-   req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len);
+   req->r_request->hdr.data_len = cpu_to_le32(len + data_len);
} else if (data_len) {
req->r_request->hdr.data_off = 0;
req->r_request->hdr.data_len = cpu_to_le32(data_len);
@@ -456,7 +456,7 @@ struct ceph_osd_request
*ceph_osdc_new_request(struct ceph_osd_client *osdc,
req->r_num_pages = calc_pages_for(page_align, *plen);
req->r_page_alignment = page_align;

-   ceph_osdc_build_request(req, off, plen, ops,
+   ceph_osdc_build_request(req, off, *plen, ops,
snapc,
mtime);

-- 
1.7.9.5



[PATCH REPOST 0/6] libceph: parameter cleanup

2013-01-04 Thread Alex Elder
This series mostly cleans up parameters used by functions
in libceph, in the osd client code.

-Alex

[PATCH REPOST 1/6] libceph: pass length to ceph_osdc_build_request()
[PATCH REPOST 2/6] libceph: pass length to ceph_calc_file_object_mapping()
[PATCH REPOST 3/6] libceph: drop snapid in ceph_calc_raw_layout()
[PATCH REPOST 4/6] libceph: drop osdc from ceph_calc_raw_layout()
[PATCH REPOST 5/6] libceph: don't set flags in ceph_osdc_alloc_request()
[PATCH REPOST 6/6] libceph: don't set pages or bio in
ceph_osdc_alloc_request()


Re: [PATCH 0/2] Librados aio stat

2013-01-04 Thread Filippos Giannakos

Hi Team,

Is there any progress or any comments regarding the librados aio stat
patch?

Best regards

On 12/20/2012 10:05 PM, Filippos Giannakos wrote:

Hi Team,

Here is the patch with the changes, plus the tests you requested.

Best regards,
Filippos



--
Filippos.


[PATCH 5/6] ceph: check mds_wanted for imported cap

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

The MDS may have incorrect wanted caps after importing caps, so the
client should check the value the MDS has and send a cap update if necessary.

Signed-off-by: Yan, Zheng 
---
 fs/ceph/caps.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7e90299..16c10f8 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2425,10 +2425,12 @@ static void handle_cap_grant(struct inode *inode, 
struct ceph_mds_caps *grant,
 ceph_cap_string(used),
 ceph_cap_string(dirty));
if (wanted != le32_to_cpu(grant->wanted)) {
-   dout("mds wanted %s -> %s\n",
-ceph_cap_string(le32_to_cpu(grant->wanted)),
-ceph_cap_string(wanted));
-   grant->wanted = cpu_to_le32(wanted);
+   dout("mds wanted = %s\n",
+ceph_cap_string(le32_to_cpu(grant->wanted)));
+   /* imported cap may not have correct mds_wanted */
+   if (cap == ci->i_auth_cap &&
+   (wanted & ~(cap->mds_wanted | cap->issued)))
+   check_caps = 1;
}
 
cap->seq = seq;
-- 
1.7.11.7



[PATCH 6/6] ceph: don't acquire i_mutex ceph_vmtruncate_work

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

In commit 22cddde104, ceph_get_caps() was moved into ceph_write_begin().
So ceph_get_caps() can be called while i_mutex is locked. If there is
pending vmtruncate, ceph_get_caps() will wait for it to complete, but
ceph_vmtruncate_work() is blocked by the locked i_mutex.

There are several places that call __ceph_do_pending_vmtruncate()
without holding the i_mutex, so I think it's OK not to acquire the i_mutex
in ceph_vmtruncate_work().
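
Sketched as a lock/wait graph (reconstructed from the description above,
not traced through the source, so treat it as illustrative):

    /*
     * write path:            ceph_write_begin()
     *                          holds i_mutex
     *                          -> ceph_get_caps()
     *                             waits for the pending vmtruncate
     *
     * workqueue (trunc_wq):  ceph_vmtruncate_work()
     *                          mutex_lock(&inode->i_mutex)   <-- blocked
     *                          -> __ceph_do_pending_vmtruncate()
     *
     * Neither side can proceed; dropping the i_mutex acquisition in
     * ceph_vmtruncate_work() breaks the cycle.
     */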

Signed-off-by: Yan, Zheng 
---
 fs/ceph/inode.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 81613bc..c895c7f 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1420,7 +1420,7 @@ out:
 
 
 /*
- * called by trunc_wq; take i_mutex ourselves
+ * called by trunc_wq;
  *
  * We also truncate in a separate thread as well.
  */
@@ -1431,9 +1431,7 @@ static void ceph_vmtruncate_work(struct work_struct *work)
struct inode *inode = &ci->vfs_inode;
 
dout("vmtruncate_work %p\n", inode);
-   mutex_lock(&inode->i_mutex);
__ceph_do_pending_vmtruncate(inode);
-   mutex_unlock(&inode->i_mutex);
iput(inode);
 }
 
-- 
1.7.11.7



[PATCH 3/6] ceph: allow revoking duplicated caps issued by non-auth MDS

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

Allow revoking duplicated caps issued by non-auth MDS if these caps
are also issued by auth MDS.

Signed-off-by: Yan, Zheng 
---
 fs/ceph/caps.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a9fe2d5..c90b245 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1468,7 +1468,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int 
flags,
struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
-   int file_wanted, used;
+   int file_wanted, used, cap_used;
int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
int issued, implemented, want, retain, revoking, flushing = 0;
int mds = -1;   /* keep track of how far we've gone through i_caps list
@@ -1577,6 +1577,10 @@ retry_locked:
 ceph_cap_string(cap->implemented),
 ceph_cap_string(revoking));
 
+   cap_used = used;
+   if (ci->i_auth_cap && cap != ci->i_auth_cap)
+   cap_used &= ~ci->i_auth_cap->issued;
+
if (cap == ci->i_auth_cap &&
(cap->issued & CEPH_CAP_FILE_WR)) {
/* request larger max_size from MDS? */
@@ -1601,7 +1605,7 @@ retry_locked:
}
 
/* completed revocation? going down and there are no caps? */
-   if (revoking && (revoking & used) == 0) {
+   if (revoking && (revoking & cap_used) == 0) {
dout("completed revocation of %s\n",
 ceph_cap_string(cap->implemented & ~cap->issued));
goto ack;
@@ -1678,8 +1682,8 @@ ack:
sent++;
 
/* __send_cap drops i_ceph_lock */
-   delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
- retain, flushing, NULL);
+   delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
+ want, retain, flushing, NULL);
goto retry; /* retake i_ceph_lock and restart our cap scan. */
}
 
-- 
1.7.11.7



[PATCH 4/6] ceph: allocate cap_release message when receiving cap import

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

When a client wants to release an imported cap, it's possible there
is no reserved cap_release message in the corresponding MDS session,
so __queue_cap_release causes a kernel panic.

Signed-off-by: Yan, Zheng 
---
 fs/ceph/caps.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index c90b245..7e90299 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2832,6 +2832,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
 (unsigned)seq);
 
+   if (op == CEPH_CAP_OP_IMPORT)
+   ceph_add_cap_releases(mdsc, session);
+
/* lookup ino */
inode = ceph_find_inode(sb, vino);
ci = ceph_inode(inode);
-- 
1.7.11.7



[PATCH 1/6] ceph: re-calculate truncate_size for strip object

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

Otherwise the OSD may truncate the object to a larger size.
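
A toy worked example of the adjustment being added (assumed numbers; it
also assumes the stripe unit equals the object size, so off - objoff is
the file offset at which this object's data begins):

    #include <stdio.h>

    int main(void)
    {
            unsigned long long osize = 4ULL << 20;          /* object size  */
            unsigned long long off = 10ULL << 20;           /* file offset  */
            unsigned long long objoff = off % osize;        /* 2 MB into obj */
            unsigned long long truncate_size = 11ULL << 20; /* file-relative */

            if (truncate_size <= off - objoff)
                    truncate_size = 0;              /* object entirely cut */
            else {
                    truncate_size -= off - objoff;  /* make it object-relative */
                    if (truncate_size > osize)
                            truncate_size = osize;
            }
            /* prints 3145728, i.e. truncate 3 MB into this 4 MB object */
            printf("object-relative truncate_size = %llu\n", truncate_size);
            return 0;
    }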

Signed-off-by: Yan, Zheng 
---
 net/ceph/osd_client.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index eb9a444..267f183 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -76,8 +76,16 @@ int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
 orig_len - *plen, off, *plen);
 
if (op_has_extent(op->op)) {
+   u32 osize = le32_to_cpu(layout->fl_object_size);
op->extent.offset = objoff;
op->extent.length = objlen;
+   if (op->extent.truncate_size <= off - objoff) {
+   op->extent.truncate_size = 0;
+   } else {
+   op->extent.truncate_size -= off - objoff;
+   if (op->extent.truncate_size > osize)
+   op->extent.truncate_size = osize;
+   }
}
req->r_num_pages = calc_pages_for(off, *plen);
req->r_page_alignment = off & ~PAGE_MASK;
-- 
1.7.11.7



[PATCH 2/6] ceph: move dirty inode to migrating list when clearing auth caps

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

Signed-off-by: Yan, Zheng 
---
 fs/ceph/caps.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index a1d9bb3..a9fe2d5 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -611,8 +611,16 @@ retry:
 
if (flags & CEPH_CAP_FLAG_AUTH)
ci->i_auth_cap = cap;
-   else if (ci->i_auth_cap == cap)
+   else if (ci->i_auth_cap == cap) {
ci->i_auth_cap = NULL;
+   spin_lock(&mdsc->cap_dirty_lock);
+   if (!list_empty(&ci->i_dirty_item)) {
+   dout(" moving %p to cap_dirty_migrating\n", inode);
+   list_move(&ci->i_dirty_item,
+ &mdsc->cap_dirty_migrating);
+   }
+   spin_unlock(&mdsc->cap_dirty_lock);
+   }
 
dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
 inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
-- 
1.7.11.7



[PATCH 0/6] fixes for cephfs kernel client

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

These patches are client-side modifications for multiple-MDS setups.
I'm not quite sure if patch 6 is correct; please review it.

These patches are also in:
  git://github.com/ukernel/ceph-client.git wip-ceph


[PATCH 27/29] mds: check if stray dentry is needed

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

The necessity of a stray dentry can change before the request acquires
all locks.

Signed-off-by: Yan, Zheng 
---
 src/mds/Server.cc | 26 --
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 2531005..1557b30 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -4531,11 +4531,14 @@ void Server::handle_client_unlink(MDRequest *mdr)
 
   // -- create stray dentry? --
   CDentry *straydn = mdr->straydn;
-  if (dnl->is_primary() && !straydn) {
-straydn = mdcache->get_or_create_stray_dentry(dnl->get_inode());
-mdr->pin(straydn);
-mdr->straydn = straydn;
-  }
+  if (dnl->is_primary()) {
+if (!straydn) {
+  straydn = mdcache->get_or_create_stray_dentry(dnl->get_inode());
+  mdr->pin(straydn);
+  mdr->straydn = straydn;
+}
+  } else if (straydn)
+straydn = NULL;
   if (straydn)
 dout(10) << " straydn is " << *straydn << dendl;
 
@@ -5274,11 +5277,14 @@ void Server::handle_client_rename(MDRequest *mdr)
 
   // -- create stray dentry? --
   CDentry *straydn = mdr->straydn;
-  if (destdnl->is_primary() && !linkmerge && !straydn) {
-straydn = mdcache->get_or_create_stray_dentry(destdnl->get_inode());
-mdr->pin(straydn);
-mdr->straydn = straydn;
-  }
+  if (destdnl->is_primary() && !linkmerge) {
+if (!straydn) {
+  straydn = mdcache->get_or_create_stray_dentry(destdnl->get_inode());
+  mdr->pin(straydn);
+  mdr->straydn = straydn;
+}
+  } else if (straydn)
+straydn = NULL;
   if (straydn)
 dout(10) << " straydn is " << *straydn << dendl;
 
-- 
1.7.11.7



[PATCH 28/29] mds: don't issue caps while inode is exporting caps

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

If we issue caps while an inode is exporting its caps, the client will
soon drop those caps when it receives the CAP_OP_EXPORT message, but it
will never receive the corresponding CAP_OP_IMPORT message.

Except for the open file request, it's OK not to issue caps for client
requests. If a non-auth MDS receives an open file request but can't
issue caps, it forwards the request to the auth MDS.

Signed-off-by: Yan, Zheng 
---
 src/mds/CInode.cc | 10 ++
 src/mds/Server.cc | 22 +++---
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index 4314ab5..3009f0c 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -2604,17 +2604,19 @@ int CInode::encode_inodestat(bufferlist& bl, Session 
*session,
 {
   int client = session->inst.name.num();
   assert(snapid);
-
   assert(session->connection);
   
   bool valid = true;
 
   // do not issue caps if inode differs from readdir snaprealm
   SnapRealm *realm = find_snaprealm();
-  bool no_caps = (realm && dir_realm && realm != dir_realm);
+  bool no_caps = (realm && dir_realm && realm != dir_realm) ||
+is_frozen() || state_test(CInode::STATE_EXPORTINGCAPS);
   if (no_caps)
-dout(20) << "encode_inodestat realm=" << realm << " snaprealm " << 
snaprealm
-<< " no_caps=" << no_caps << dendl;
+dout(20) << "encode_inodestat no caps"
+<< ((realm && dir_realm && realm != dir_realm)?", snaprealm 
differs ":"")
+<< (state_test(CInode::STATE_EXPORTINGCAPS)?", exporting caps":"")
+<< (is_frozen()?", frozen inode":"") << dendl;
 
   // pick a version!
   inode_t *oi = &inode;
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 1557b30..b70445e 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -1936,14 +1936,14 @@ CInode* Server::rdlock_path_pin_ref(MDRequest *mdr, int 
n,
 want_auth = true;
 
   if (want_auth) {
+if (ref->is_ambiguous_auth()) {
+  dout(10) << "waiting for single auth on " << *ref << dendl;
+  ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, 
mdr));
+  return 0;
+}
 if (!ref->is_auth()) {
-  if (ref->is_ambiguous_auth()) {
-   dout(10) << "waiting for single auth on " << *ref << dendl;
-   ref->add_waiter(CInode::WAIT_SINGLEAUTH, new 
C_MDS_RetryRequest(mdcache, mdr));
-  } else {
-   dout(10) << "fw to auth for " << *ref << dendl;
-   mdcache->request_forward(mdr, ref->authority().first);
-  }
+  dout(10) << "fw to auth for " << *ref << dendl;
+  mdcache->request_forward(mdr, ref->authority().first);
   return 0;
 }
 
@@ -2435,6 +2435,14 @@ void Server::handle_client_open(MDRequest *mdr)
   if (!cur)
 return;
 
+  if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
+assert(!need_auth);
+mdr->done_locking = false;
+CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true);
+if (!cur)
+  return;
+  }
+
   if (mdr->snapid != CEPH_NOSNAP && mdr->client_request->may_write()) {
 reply_request(mdr, -EROFS);
 return;
-- 
1.7.11.7



[PATCH 29/29] mds: optimize C_MDC_RetryOpenRemoteIno

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

When opening a remote inode, C_MDC_RetryOpenRemoteIno is used as the
onfinish context for discovering the remote inode. By the time it is
called, the MDS may already have the inode.

Signed-off-by: Yan, Zheng 
---
 src/mds/MDCache.cc | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 236a0b9..9557436 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -7049,7 +7049,11 @@ public:
   C_MDC_RetryOpenRemoteIno(MDCache *mdc, inodeno_t i, Context *c, bool wx) :
 mdcache(mdc), ino(i), want_xlocked(wx), onfinish(c) {}
   void finish(int r) {
-mdcache->open_remote_ino(ino, onfinish, want_xlocked);
+if (mdcache->get_inode(ino)) {
+  onfinish->finish(0);
+  delete onfinish;
+} else
+  mdcache->open_remote_ino(ino, onfinish, want_xlocked);
   }
 };
 
-- 
1.7.11.7



[PATCH 25/29] mds: check null context in CDir::fetch()

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

Signed-off-by: Yan, Zheng 
---
 src/mds/CDir.cc | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index 91636cc..22cdf48 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -1351,8 +1351,11 @@ void CDir::fetch(Context *c, const string& want_dn, bool 
ignore_authpinnability)
   assert(!is_complete());
 
   if (!can_auth_pin() && !ignore_authpinnability) {
-dout(7) << "fetch waiting for authpinnable" << dendl;
-add_waiter(WAIT_UNFREEZE, c);
+if (c) {
+  dout(7) << "fetch waiting for authpinnable" << dendl;
+  add_waiter(WAIT_UNFREEZE, c);
+} else
+  dout(7) << "fetch not authpinnable and no context" << dendl;
 return;
   }
 
-- 
1.7.11.7



[PATCH 26/29] mds: drop locks when opening remote dentry

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

Opening a remote dentry while holding locks may cause a deadlock. For
example, a 'discover' is blocked by an xlocked dentry, while the request
holding that xlock is blocked by the locks held by the readdir request.

Signed-off-by: Yan, Zheng 
---
 src/mds/Server.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 119af98..2531005 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -2901,6 +2901,8 @@ void Server::handle_client_readdir(MDRequest *mdr)
  break;
}
 
+   mds->locker->drop_locks(mdr);
+   mdr->drop_local_auth_pins();
mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, 
mdr));
return;
   }
-- 
1.7.11.7



[PATCH 23/29] mds: fix cap mask for ifile lock

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

The ifile lock has 8 cap bits, so its cap mask should be 0xff.

Signed-off-by: Yan, Zheng 
---
 src/mds/SimpleLock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h
index 5ea2d4b..9531a01 100644
--- a/src/mds/SimpleLock.h
+++ b/src/mds/SimpleLock.h
@@ -273,7 +273,7 @@ public:
   }
   int get_cap_mask() const {
 switch (get_type()) {
-case CEPH_LOCK_IFILE: return 0xf;
+case CEPH_LOCK_IFILE: return 0xff;
 default: return 0x3;
 }
   }
-- 
1.7.11.7



[PATCH 24/29] mds: rdlock prepended dest trace when handling rename

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

Take rdlocks on the prepended dest trace to prevent those dentries from
being xlocked by someone else.

Signed-off-by: Yan, Zheng 
---
 src/mds/Server.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 8d05a37..119af98 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -5228,6 +5228,7 @@ void Server::handle_client_rename(MDRequest *mdr)
   while (destbase != srcbase) {
desttrace.insert(desttrace.begin(),
 destbase->get_projected_parent_dn());
+   rdlocks.insert(&desttrace[0]->lock);
dout(10) << "rename prepending desttrace with " << *desttrace[0] << 
dendl;
destbase = destbase->get_projected_parent_dn()->get_dir()->get_inode();
   }
-- 
1.7.11.7



[PATCH 18/29] mds: fix rename inode exportor check

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

Use "srcdn->is_auth() && destdnl->is_primary()" to check if the MDS is
inode exportor of rename operation is not reliable, This is because
OP_FINISH slave request may race with subtree import. The fix is use
a variable in MDRequest to indicate if the MDS is inode exportor.

Signed-off-by: Yan, Zheng 
---
 src/mds/Mutation.h | 4 ++--
 src/mds/Server.cc  | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h
index bf59dba..d0d3eca 100644
--- a/src/mds/Mutation.h
+++ b/src/mds/Mutation.h
@@ -213,7 +213,7 @@ struct MDRequest : public Mutation {
 bufferlist inode_import;
 version_t inode_import_v;
 CInode* destdn_was_remote_inode;
-bool was_link_merge;
+bool was_inode_exportor;
 int prepared_inode_exporter; // has asked auth of srci to mark srci as 
ambiguous auth
 
 map imported_client_map;
@@ -233,7 +233,7 @@ struct MDRequest : public Mutation {
 
 More() : 
   src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0),
-  destdn_was_remote_inode(0), was_link_merge(false),
+  destdn_was_remote_inode(0), was_inode_exportor(false),
   prepared_inode_exporter(-1), flock_was_waiting(false),
   stid(0),
   slave_commit(0) { }
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 6097552..c5793e5 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -6261,6 +6261,7 @@ void Server::_logged_slave_rename(MDRequest *mdr,
 
 // remove mdr auth pin
 mdr->auth_unpin(srcdnl->get_inode());
+mdr->more()->was_inode_exportor = true;
 
 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
   }
@@ -6299,7 +6300,7 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r,
 
 // unfreeze+singleauth inode
 //  hmm, do i really need to delay this?
-if (srcdn->is_auth() && destdnl->is_primary()) {
+if (mdr->more()->was_inode_exportor) {
 
   CInode *in = destdnl->get_inode();
 
-- 
1.7.11.7



[PATCH 21/29] mds: keep dentry lock in sync state as much as possible

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

Unlike locks of other types, a dentry lock in an unreadable state can block
path traversal, so it should be kept in the sync state as much as possible.
There are two rare cases where the dentry lock is not set to the sync state:
the dentry becomes replicated; or an xlock finishes while the dentry is
freezing.

In commit efbca31d, I tried fixing the issue that an unreadable replica
dentry blocks path traversal by modifying MDCache::path_traverse(), but it
does not work. This patch also reverts that change.

Signed-off-by: Yan, Zheng 
---
 src/mds/CDentry.h   | 3 +++
 src/mds/Locker.cc   | 5 -
 src/mds/MDCache.cc  | 6 +-
 src/mds/Migrator.cc | 3 +++
 4 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h
index 480e562..aa10bf9 100644
--- a/src/mds/CDentry.h
+++ b/src/mds/CDentry.h
@@ -301,6 +301,9 @@ public:
   
   // -- replication
   void encode_replica(int mds, bufferlist& bl) {
+if (!is_replicated())
+  lock.replicate_relax();
+
 __u32 nonce = add_replica(mds);
 ::encode(nonce, bl);
 ::encode(first, bl);
diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index cba894b..9b3bfe3 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -1425,13 +1425,16 @@ bool Locker::xlock_start(SimpleLock *lock, MDRequest 
*mut)
 void Locker::_finish_xlock(SimpleLock *lock, bool *pneed_issue)
 {
   assert(!lock->is_stable());
-  lock->get_parent()->auth_unpin(lock);
   if (lock->get_type() != CEPH_LOCK_DN && 
((CInode*)lock->get_parent())->get_loner() >= 0)
 lock->set_state(LOCK_EXCL);
   else
 lock->set_state(LOCK_LOCK);
+  if (lock->get_type() == CEPH_LOCK_DN && lock->get_parent()->is_replicated() 
&&
+  !lock->is_waiter_for(SimpleLock::WAIT_WR))
+simple_sync(lock, pneed_issue);
   if (lock->get_cap_shift())
 *pneed_issue = true;
+  lock->get_parent()->auth_unpin(lock);
 }
 
 void Locker::xlock_finish(SimpleLock *lock, Mutation *mut, bool *pneed_issue)
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 5adf4b7..236a0b9 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -6735,14 +6735,10 @@ int MDCache::path_traverse(MDRequest *mdr, Message 
*req, Context *fin, // wh
  (dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
 dout(10) << "traverse: miss on null+readable dentry " << path[depth] 
<< " " << *dn << dendl;
 return -ENOENT;
-  } else if (curdir->is_auth()) {
+  } else {
 dout(10) << "miss on dentry " << *dn << ", can't read due to lock" << 
dendl;
 dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin));
 return 1;
-  } else {
-   // non-auth and can not read, treat this as no dentry
-   dn = NULL;
-   dnl = NULL;
   }
 }
 
diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index 8686c86..bda8035 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -1135,6 +1135,9 @@ int Migrator::encode_export_dir(bufferlist& exportbl,
 CDentry *dn = it->second;
 CInode *in = dn->get_linkage()->get_inode();
 
+if (!dn->is_replicated())
+  dn->lock.replicate_relax();
+
 num_exported++;
 
 // -- dentry
-- 
1.7.11.7



[PATCH 22/29] mds: fix replica state for LOCK_MIX_LOCK

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

The LOCK_MIX_LOCK state is for gathering local locks and caps, so the
replica state should be LOCK_MIX.

Signed-off-by: Yan, Zheng 
---
 src/mds/locks.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mds/locks.c b/src/mds/locks.c
index ce1a6dd..73b99fb 100644
--- a/src/mds/locks.c
+++ b/src/mds/locks.c
@@ -64,7 +64,7 @@ static const struct sm_state_t scatterlock[LOCK_MAX] = {

 [LOCK_LOCK]  = { 0, false, LOCK_LOCK, AUTH, 0,   REQ, AUTH,0,  
 0,   ANY, 0,0,0,0 },
 [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0,   0,   0,   0,  
 0,   0,   0,0,0,0 },
-[LOCK_MIX_LOCK]  = { LOCK_LOCK, false, LOCK_LOCK, 0,0,   0,   0,   0,  
 0,   0,   0,0,0,0 },
+[LOCK_MIX_LOCK]  = { LOCK_LOCK, false, LOCK_MIX,  0,0,   0,   0,   0,  
 0,   0,   0,0,0,0 },
 [LOCK_MIX_LOCK2] = { LOCK_LOCK, false, LOCK_LOCK, 0,0,   0,   0,   0,  
 0,   0,   0,0,0,0 },
 [LOCK_TSYN_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0,0,   0,   0,   0,  
 0,   0,   0,0,0,0 },
 
@@ -98,7 +98,7 @@ const struct sm_state_t filelock[LOCK_MAX] = {
 [LOCK_LOCK]  = { 0, false, LOCK_LOCK, AUTH, 0,   REQ, AUTH,0,  
 0,   0,   CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
 [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0,   REQ, 0,   0,  
 0,   0,   CEPH_CAP_GCACHE,0,0,CEPH_CAP_GCACHE },
 [LOCK_EXCL_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0,0,   0,   0,   
XCL, 0,   0,   CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,CEPH_CAP_GCACHE },
-[LOCK_MIX_LOCK]  = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0,   REQ, 0,   0,  
 0,   0,   0,0,0,0 },
+[LOCK_MIX_LOCK]  = { LOCK_LOCK, false, LOCK_MIX,  AUTH, 0,   REQ, 0,   0,  
 0,   0,   0,0,0,0 },
 [LOCK_MIX_LOCK2] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0,   REQ, 0,   0,  
 0,   0,   0,0,0,0 },
 
 [LOCK_PREXLOCK]  = { LOCK_LOCK, false, LOCK_LOCK, 0,XCL, 0,   0,   0,  
 0,   ANY, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 },
-- 
1.7.11.7



[PATCH 20/29] mds: forbid creating file in deleted directory

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

Signed-off-by: Yan, Zheng 
---
 src/mds/CInode.cc |  5 -
 src/mds/CInode.h  |  2 ++
 src/mds/Server.cc | 13 -
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index 9d29b46..4314ab5 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -245,7 +245,10 @@ void CInode::print(ostream& out)
   out << *this;
 }
 
-
+bool CInode::is_in_stray()
+{
+  return !is_base() && get_projected_parent_dir()->inode->is_stray();
+}
 
 void CInode::add_need_snapflush(CInode *snapin, snapid_t snapid, client_t 
client)
 {
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
index a627a4f..6eb396d 100644
--- a/src/mds/CInode.h
+++ b/src/mds/CInode.h
@@ -520,6 +520,8 @@ private:
 
   bool is_head() { return last == CEPH_NOSNAP; }
 
+  bool is_in_stray();
+
   // note: this overloads MDSCacheObject
   bool is_ambiguous_auth() {
 return state_test(STATE_AMBIGUOUSAUTH) ||
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 517339f..8d05a37 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -2633,6 +2633,13 @@ void Server::handle_client_openc(MDRequest *mdr)
 reply_request(mdr, -EROFS);
 return;
   }
+
+  CInode *diri = dn->get_dir()->get_inode();
+  if (diri->is_in_stray()) {
+reply_request(mdr, -ENOENT);
+return;
+  }
+
   // set layout
   ceph_file_layout layout;
   if (dir_layout)
@@ -2669,7 +2676,6 @@ void Server::handle_client_openc(MDRequest *mdr)
 return;
   }
 
-  CInode *diri = dn->get_dir()->get_inode();
   rdlocks.insert(&diri->authlock);
   if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
 return;
@@ -5123,6 +5129,11 @@ void Server::handle_client_rename(MDRequest *mdr)
   CDir *destdir = destdn->get_dir();
   assert(destdir->is_auth());
 
+  if (destdir->get_inode()->is_in_stray()) {
+reply_request(mdr, -ENOENT);
+return;
+  }
+
   int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &srctrace, NULL, 
MDS_TRAVERSE_DISCOVER);
   if (r > 0)
 return; // delayed
-- 
1.7.11.7

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 19/29] mds: disable concurrent remote locking

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

The current code allows multiple MDRequests to acquire a remote lock
concurrently. But a lock ACK message wakes all of the requests because
they were all put on the same waiting queue. One request gets the lock;
the remaining requests re-send their OP_WRLOCK/OP_XLOCK slave requests
and trigger an assertion on the remote MDS. The fix is to disable
concurrent acquisition of a remote lock: send the OP_WRLOCK/OP_XLOCK
slave request only if there is no on-going slave request.

Signed-off-by: Yan, Zheng 
---
 src/mds/Locker.cc | 54 --
 src/mds/Server.cc |  2 ++
 2 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index dbf4452..cba894b 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -532,14 +532,18 @@ void Locker::cancel_locking(Mutation *mut, set 
*pneed_issue)
   assert(lock);
   dout(10) << "cancel_locking " << *lock << " on " << *mut << dendl;
 
-  if (lock->get_type() != CEPH_LOCK_DN) {
-bool need_issue = false;
-if (lock->get_state() == LOCK_PREXLOCK)
-  _finish_xlock(lock, &need_issue);
-if (lock->is_stable())
-  eval(lock, &need_issue);
-if (need_issue)
-  pneed_issue->insert((CInode *)lock->get_parent());
+  if (lock->get_parent()->is_auth()) {
+if (lock->get_type() != CEPH_LOCK_DN) {
+  bool need_issue = false;
+  if (lock->get_state() == LOCK_PREXLOCK)
+   _finish_xlock(lock, &need_issue);
+  if (lock->is_stable())
+   eval(lock, &need_issue);
+  if (need_issue)
+   pneed_issue->insert((CInode *)lock->get_parent());
+}
+  } else {
+lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK);
   }
   mut->finish_locking(lock);
 }
@@ -1314,14 +1318,17 @@ void Locker::remote_wrlock_start(SimpleLock *lock, int 
target, MDRequest *mut)
   new C_MDS_RetryRequest(mdcache, mut));
 return;
   }
-
+
   // send lock request
-  mut->more()->slaves.insert(target);
-  MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt,
-MMDSSlaveRequest::OP_WRLOCK);
-  r->set_lock_type(lock->get_type());
-  lock->get_parent()->set_object_info(r->get_object_info());
-  mds->send_message_mds(r, target);
+  if (!lock->is_waiter_for(SimpleLock::WAIT_REMOTEXLOCK)) {
+mut->start_locking(lock);
+mut->more()->slaves.insert(target);
+MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt,
+  MMDSSlaveRequest::OP_WRLOCK);
+r->set_lock_type(lock->get_type());
+lock->get_parent()->set_object_info(r->get_object_info());
+mds->send_message_mds(r, target);
+  }
   
   // wait
   lock->add_waiter(SimpleLock::WAIT_REMOTEXLOCK, new 
C_MDS_RetryRequest(mdcache, mut));
@@ -1398,13 +1405,16 @@ bool Locker::xlock_start(SimpleLock *lock, MDRequest 
*mut)
 }
 
 // send lock request
-int auth = lock->get_parent()->authority().first;
-mut->more()->slaves.insert(auth);
-MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt,
-  MMDSSlaveRequest::OP_XLOCK);
-r->set_lock_type(lock->get_type());
-lock->get_parent()->set_object_info(r->get_object_info());
-mds->send_message_mds(r, auth);
+if (!lock->is_waiter_for(SimpleLock::WAIT_REMOTEXLOCK)) {
+  mut->start_locking(lock);
+  int auth = lock->get_parent()->authority().first;
+  mut->more()->slaves.insert(auth);
+  MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt,
+MMDSSlaveRequest::OP_XLOCK);
+  r->set_lock_type(lock->get_type());
+  lock->get_parent()->set_object_info(r->get_object_info());
+  mds->send_message_mds(r, auth);
+}
 
 // wait
 lock->add_waiter(SimpleLock::WAIT_REMOTEXLOCK, new 
C_MDS_RetryRequest(mdcache, mut));
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index c5793e5..517339f 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -1324,6 +1324,7 @@ void Server::handle_slave_request_reply(MMDSSlaveRequest 
*m)
   dout(10) << "got remote xlock on " << *lock << " on " << 
*lock->get_parent() << dendl;
   mdr->xlocks.insert(lock);
   mdr->locks.insert(lock);
+  mdr->finish_locking(lock);
   lock->get_xlock(mdr, mdr->get_client());
   lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK);
 }
@@ -1338,6 +1339,7 @@ void Server::handle_slave_request_reply(MMDSSlaveRequest 
*m)
   dout(10) << "got remote wrlock on " << *lock << " on " << 
*lock->get_parent() << dendl;
   mdr->remote_wrlocks[lock] = from;
   mdr->locks.insert(lock);
+  mdr->finish_locking(lock);
   lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK);
 }
 break;
-- 
1.7.11.7


[PATCH 17/29] mds: call maybe_eval_stray after removing a replica dentry

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

MDCache::handle_cache_expire() processes dentries after inodes, so the
MDCache::maybe_eval_stray() call in MDCache::inode_remove_replica() always
fails to remove the stray inode, because MDCache::eval_stray() checks
whether the stray inode's dentry is replicated.

Signed-off-by: Yan, Zheng 
---
 src/mds/MDCache.cc | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 3410b0f..5adf4b7 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -6120,9 +6120,11 @@ void MDCache::dentry_remove_replica(CDentry *dn, int 
from)
   // fix lock
   if (dn->lock.remove_replica(from))
 mds->locker->eval_gather(&dn->lock);
-}
-
 
+  CDentry::linkage_t *dnl = dn->get_projected_linkage();
+  if (dnl->is_primary())
+maybe_eval_stray(dnl->get_inode());
+}
 
 void MDCache::trim_client_leases()
 {
-- 
1.7.11.7



[PATCH 16/29] mds: don't defer processing caps if inode is auth pinned

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

We should not defer processing caps if the inode is auth pinned by an
MDRequest, because the MDRequest may change the inode's lock state later
and then wait for the deferred caps.
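For illustration, here is a minimal stand-alone version of the new predicate;
the struct and field names below are hypothetical stand-ins for the CInode
accessors, not the real API:

#include <cassert>

struct InodeState {
  bool freezing;
  bool frozen;
  int num_auth_pins;   // unstable locks and client requests both take auth pins
};

// Defer only if the inode is frozen, or is freezing with no auth pins at all.
// If a request has already auth-pinned the inode, keep processing caps: that
// request may later change the inode's lock state and wait on these very caps.
bool should_defer_client_cap_frozen(const InodeState& in) {
  return (in.freezing && in.num_auth_pins == 0) || in.frozen;
}

int main() {
  assert(should_defer_client_cap_frozen({true, false, 0}));   // freezing, unpinned
  assert(!should_defer_client_cap_frozen({true, false, 1}));  // freezing, auth-pinned
  assert(should_defer_client_cap_frozen({false, true, 0}));   // frozen
}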

Signed-off-by: Yan, Zheng 
---
 src/mds/Locker.cc | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index ee2d4cc..dbf4452 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -2199,12 +2199,11 @@ bool Locker::should_defer_client_cap_frozen(CInode *in)
* Currently, a request wait if anything locked is freezing (can't
* auth_pin), which would avoid any deadlock with cap release.  Thus @in
* _MUST_ be in the lock/auth_pin set.
+   *
+   * auth_pins==0 implies no unstable lock and not auth pinned by a
+   * client request; otherwise continue even if it's freezing.
*/
-  return (in->is_freezing() && (in->filelock.is_stable() &&
-   in->authlock.is_stable() &&
-   in->xattrlock.is_stable() &&
-   in->linklock.is_stable())) ||  // continue if 
freezing and lock is unstable
-in->is_frozen();
+  return (in->is_freezing() && in->get_num_auth_pins() == 0) || 
in->is_frozen();
 }
 
 /*
-- 
1.7.11.7



[PATCH 15/29] mds: remove unnecessary is_xlocked check

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

Locker::foo_eval() is always called for stable locks, so no need to
check if the lock is xlocked.

Signed-off-by: Yan, Zheng 
---
 src/mds/Locker.cc | 21 -
 1 file changed, 4 insertions(+), 17 deletions(-)

diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index e826321..ee2d4cc 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -3291,7 +3291,6 @@ void Locker::simple_eval(SimpleLock *lock, bool 
*need_issue)
 
   // stable -> sync?
   else if (lock->get_state() != LOCK_SYNC &&
-  !lock->is_xlocked() &&
   !lock->is_wrlocked() &&
   ((!(wanted & CEPH_CAP_GEXCL) && 
!lock->is_waiter_for(SimpleLock::WAIT_WR)) ||
(lock->get_state() == LOCK_EXCL && in && in->get_target_loner() < 
0))) {
@@ -3334,8 +,6 @@ bool Locker::simple_sync(SimpleLock *lock, bool 
*need_issue)
 int gather = 0;
 if (lock->is_wrlocked())
   gather++;
-if (lock->is_xlocked())
-  gather++;
 
 if (lock->get_parent()->is_replicated() && old_state == LOCK_MIX) {
   send_lock_message(lock, LOCK_AC_SYNC);
@@ -3412,9 +3409,7 @@ void Locker::simple_excl(SimpleLock *lock, bool 
*need_issue)
 gather++;
   if (lock->is_wrlocked())
 gather++;
-  if (lock->is_xlocked())
-gather++;
-  
+
   if (lock->get_parent()->is_replicated() && 
   lock->get_state() != LOCK_LOCK_EXCL &&
   lock->get_state() != LOCK_XSYN_EXCL) {
@@ -3695,7 +3690,6 @@ void Locker::scatter_eval(ScatterLock *lock, bool 
*need_issue)
   }
   
   if (!lock->is_rdlocked() &&
-  !lock->is_xlocked() &&
   lock->get_state() != LOCK_MIX &&
   lock->get_scatter_wanted()) {
 dout(10) << "scatter_eval scatter_wanted, bump to mix " << *lock
@@ -3706,8 +3700,7 @@ void Locker::scatter_eval(ScatterLock *lock, bool 
*need_issue)
 
   if (lock->get_type() == CEPH_LOCK_INEST) {
 // in general, we want to keep INEST writable at all times.
-if (!lock->is_rdlocked() &&
-   !lock->is_xlocked()) {
+if (!lock->is_rdlocked()) {
   if (lock->get_parent()->is_replicated()) {
if (lock->get_state() != LOCK_MIX)
  scatter_mix(lock, need_issue);
@@ -3723,7 +3716,6 @@ void Locker::scatter_eval(ScatterLock *lock, bool 
*need_issue)
   if (!in->has_subtree_root_dirfrag() || in->is_base()) {
 // i _should_ be sync.
 if (!lock->is_wrlocked() &&
-   !lock->is_xlocked() &&
lock->get_state() != LOCK_SYNC) {
   dout(10) << "scatter_eval no wrlocks|xlocks, not subtree root inode, 
syncing" << dendl;
   simple_sync(lock, need_issue);
@@ -3909,8 +3901,6 @@ void Locker::scatter_tempsync(ScatterLock *lock, bool 
*need_issue)
   int gather = 0;
   if (lock->is_wrlocked())
 gather++;
-  if (lock->is_xlocked())
-gather++;
 
   if (lock->get_cap_shift() &&
   in->is_head() &&
@@ -4044,8 +4034,7 @@ void Locker::file_eval(ScatterLock *lock, bool 
*need_issue)
   assert(lock->get_parent()->is_auth());
   assert(lock->is_stable());
 
-  if (lock->is_xlocked() || 
-  lock->get_parent()->is_freezing_or_frozen())
+  if (lock->get_parent()->is_freezing_or_frozen())
 return;
 
   // excl -> *?
@@ -4297,9 +4286,7 @@ void Locker::file_xsyn(SimpleLock *lock, bool *need_issue)
   int gather = 0;
   if (lock->is_wrlocked())
 gather++;
-  if (lock->is_xlocked())
-gather++;
-  
+
   if (in->is_head() &&
   in->issued_caps_need_gather(lock)) {
 if (need_issue)
-- 
1.7.11.7



[PATCH 14/29] mds: fix lock state transition check

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

Locker::simple_excl() and Locker::scatter_mix() miss the is_rdlocked
check; Locker::file_excl() misses both the is_rdlocked and is_wrlocked
checks.

Signed-off-by: Yan, Zheng 
---
 src/mds/Locker.cc | 9 +
 1 file changed, 9 insertions(+)

diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index dec0a94..e826321 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -3408,6 +3408,8 @@ void Locker::simple_excl(SimpleLock *lock, bool 
*need_issue)
   }
   
   int gather = 0;
+  if (lock->is_rdlocked())
+gather++;
   if (lock->is_wrlocked())
 gather++;
   if (lock->is_xlocked())
@@ -4167,6 +4169,8 @@ void Locker::scatter_mix(ScatterLock *lock, bool 
*need_issue)
 }
 
 int gather = 0;
+if (lock->is_rdlocked())
+  gather++;
 if (in->is_replicated()) {
   if (lock->get_state() != LOCK_EXCL_MIX &&   // EXCL replica is already 
LOCK
  lock->get_state() != LOCK_XSYN_EXCL) {  // XSYN replica is already 
LOCK;  ** FIXME here too!
@@ -4237,6 +4241,11 @@ void Locker::file_excl(ScatterLock *lock, bool 
*need_issue)
   }
   int gather = 0;
   
+  if (lock->is_rdlocked())
+gather++;
+  if (lock->is_wrlocked())
+gather++;
+
   if (in->is_replicated() &&
   lock->get_state() != LOCK_LOCK_EXCL &&
   lock->get_state() != LOCK_XSYN_EXCL) {  // if we were lock, replicas are 
already lock.
-- 
1.7.11.7



[PATCH 11/29] mds: fix anchor table commit race

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

Anchor table updates for a given inode are fully serialized on the client
side. But due to network latency, two commit requests from different clients
can arrive at the anchor server out of order. The anchor table gets corrupted
if updates are committed in the wrong order.

The fix is to track on-going anchor updates for each individual inode and
delay processing commit requests that arrive out of order.
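As a rough, self-contained model of that ordering rule (the types and names
below are illustrative stand-ins, not the actual AnchorServer interfaces):
each inode keeps a FIFO of pending transaction ids, a commit may only proceed
when its tid is at the head of that queue, and otherwise its retry callback
is parked until the earlier tids have been committed or rolled back.

#include <cassert>
#include <cstdint>
#include <functional>
#include <list>
#include <map>
#include <utility>

using inodeno_t = uint64_t;
using version_t = uint64_t;
using Waiter = std::function<void()>;      // stand-in for a parked Context

struct PendingQueue {
  // per-inode FIFO of (tid, parked waiter), oldest update first
  std::map<inodeno_t, std::list<std::pair<version_t, Waiter>>> pending;

  void prepare(inodeno_t ino, version_t tid) {
    pending[ino].push_back({tid, nullptr});
  }

  // True if `tid` is the oldest pending update for its inode and may commit
  // now; otherwise `retry` is parked and re-run once the earlier tids finish.
  bool try_commit(inodeno_t ino, version_t tid, Waiter retry,
                  std::list<Waiter>& finished) {
    auto& q = pending.at(ino);
    auto p = q.begin();
    while (p != q.end() && p->first != tid)
      ++p;
    assert(p != q.end());
    if (p != q.begin()) {                  // an earlier update is still pending
      p->second = std::move(retry);
      return false;
    }
    q.erase(p);                            // head of the queue: safe to commit
    if (q.empty()) {
      pending.erase(ino);
    } else {
      // wake the commits that were parked directly behind us, in order
      for (auto& e : q) {
        if (!e.second)
          break;
        finished.push_back(std::move(e.second));
        e.second = nullptr;
      }
    }
    return true;
  }
};

int main() {
  PendingQueue pq;
  std::list<Waiter> finished;
  pq.prepare(42, 1);
  pq.prepare(42, 2);
  assert(!pq.try_commit(42, 2, [] {}, finished));  // tid 2 arrived first: wait
  assert(pq.try_commit(42, 1, [] {}, finished));   // tid 1 commits
  assert(finished.size() == 1);                    // tid 2's retry is runnable
}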

Signed-off-by: Yan, Zheng 
---
 src/mds/AnchorServer.cc   | 61 +--
 src/mds/AnchorServer.h| 13 +-
 src/mds/MDSTableServer.cc |  4 +++-
 src/mds/MDSTableServer.h  |  2 +-
 src/mds/SnapServer.cc |  3 ++-
 src/mds/SnapServer.h  |  2 +-
 6 files changed, 78 insertions(+), 7 deletions(-)

diff --git a/src/mds/AnchorServer.cc b/src/mds/AnchorServer.cc
index cd2e625..4b1bf64 100644
--- a/src/mds/AnchorServer.cc
+++ b/src/mds/AnchorServer.cc
@@ -142,12 +142,13 @@ void AnchorServer::_prepare(bufferlist &bl, uint64_t 
reqid, int bymds)
 }
 inc(ino);
 pending_create[version] = ino;  // so we can undo
+pending_ops[ino].push_back(pair<version_t, Context*>(version, NULL));
 break;
 
-
   case TABLE_OP_DESTROY:
 version++;
 pending_destroy[version] = ino;
+pending_ops[ino].push_back(pair<version_t, Context*>(version, NULL));
 break;
 
   case TABLE_OP_UPDATE:
@@ -155,6 +156,7 @@ void AnchorServer::_prepare(bufferlist &bl, uint64_t reqid, 
int bymds)
 version++;
 pending_update[version].first = ino;
 pending_update[version].second = trace;
+pending_ops[ino].push_back(pair<version_t, Context*>(version, NULL));
 break;
 
   default:
@@ -163,8 +165,56 @@ void AnchorServer::_prepare(bufferlist &bl, uint64_t 
reqid, int bymds)
   //dump();
 }
 
-void AnchorServer::_commit(version_t tid)
+bool AnchorServer::check_pending(version_t tid, MMDSTableRequest *req, 
list<Context*>& finished)
+{
+  inodeno_t ino;
+  if (pending_create.count(tid))
+ino = pending_create[tid];
+  else if (pending_destroy.count(tid))
+ino = pending_destroy[tid];
+  else if (pending_update.count(tid))
+ino = pending_update[tid].first;
+  else
+assert(0);
+
+  assert(pending_ops.count(ino));
+  list< pair<version_t, Context*> >& pending = pending_ops[ino];
+  list< pair<version_t, Context*> >::iterator p = pending.begin();
+  if (p->first == tid) {
+assert(p->second == NULL);
+  } else {
+while (p != pending.end()) {
+  if (p->first == tid)
+   break;
+  p++;
+}
+assert(p != pending.end());
+assert(p->second == NULL);
+// not the earliest pending operation, wait if it's a commit
+if (req) {
+  p->second = new C_MDS_RetryMessage(mds, req);
+  return false;
+}
+  }
+
+  pending.erase(p);
+  if (pending.empty()) {
+pending_ops.erase(ino);
+  } else {
+for (p = pending.begin(); p != pending.end() && p->second; p++) {
+  finished.push_back(p->second);
+  p->second = NULL;
+}
+  }
+  return true;
+}
+
+bool AnchorServer::_commit(version_t tid, MMDSTableRequest *req)
 {
+  list<Context*> finished;
+  if (!check_pending(tid, req, finished))
+return false;
+
   if (pending_create.count(tid)) {
 dout(7) << "commit " << tid << " create " << pending_create[tid] << dendl;
 pending_create.erase(tid);
@@ -206,10 +256,16 @@ void AnchorServer::_commit(version_t tid)
   // bump version.
   version++;
   //dump();
+
+  mds->queue_waiters(finished);
+  return true;
 }
 
 void AnchorServer::_rollback(version_t tid) 
 {
+  list<Context*> finished;
+  check_pending(tid, NULL, finished);
+
   if (pending_create.count(tid)) {
 inodeno_t ino = pending_create[tid];
 dout(7) << "rollback " << tid << " create " << ino << dendl;
@@ -234,6 +290,7 @@ void AnchorServer::_rollback(version_t tid)
   // bump version.
   version++;
   //dump();
+  mds->queue_waiters(finished);
 }
 
 /* This function DOES put the passed message before returning */
diff --git a/src/mds/AnchorServer.h b/src/mds/AnchorServer.h
index aa5588a..50a848e 100644
--- a/src/mds/AnchorServer.h
+++ b/src/mds/AnchorServer.h
@@ -30,6 +30,7 @@ class AnchorServer : public MDSTableServer {
   map<version_t, inodeno_t> pending_create;
   map<version_t, inodeno_t> pending_destroy;
   map<version_t, pair<inodeno_t, vector<Anchor> > > pending_update;
+  map<inodeno_t, list<pair<version_t, Context*> > > pending_ops;
 
   void reset_state();
   void encode_server_state(bufferlist& bl) {
@@ -47,17 +48,27 @@ class AnchorServer : public MDSTableServer {
 ::decode(pending_create, p);
 ::decode(pending_destroy, p);
 ::decode(pending_update, p);
+
+map<version_t, inodeno_t> sort;
+sort.insert(pending_create.begin(), pending_create.end());
+sort.insert(pending_destroy.begin(), pending_destroy.end());
+for (map<version_t, pair<inodeno_t, vector<Anchor> > >::iterator p = 
pending_update.begin();
+p != pending_update.end(); p++)
+  sort[p->first] = p->second.first;
+for (map<version_t, inodeno_t>::iterator p = sort.begin(); p != 
sort.end(); p++)
+  pending_ops[p->second].push_back(pair<version_t, Context*>(p->first, 
NULL));
   }
 
   bool add(inodeno_t ino, inodeno_t dirino, __u32 dn_hash, bool replace);
   void inc(inodeno_t ino, int ref=1);
   void dec(inodeno_t ino, int ref=1);
+  bool check_pending(version_t tid, MMDSTableRequ

[PATCH 13/29] mds: introduce DROPLOCKS slave request

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

In some rare cases, Locker::acquire_locks() drops all acquired locks
in order to auth pin new objects. But Locker::drop_locks() only drops
explicitly acquired remote locks; it does not drop objects' version
locks that were implicitly acquired on the remote MDS. These leftover
locks break the locking order when the locks are re-acquired and may
cause deadlock.

The fix is to introduce a DROPLOCKS slave request which drops all
locks acquired on the remote MDS.
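As a rough, self-contained illustration of the cleanup path (the Mutation
fields and helpers below are simplified stand-ins, not the real Locker or
MMDSSlaveRequest interfaces): release every remotely held lock locally,
remember which MDSes were involved, and send each of them one DROPLOCKS so
that any locks taken implicitly on that MDS are dropped as well.

#include <iostream>
#include <map>
#include <set>
#include <string>

struct Mutation {
  std::map<std::string, int> remote_xlocks;   // lock name -> MDS holding it for us
  std::map<std::string, int> remote_wrlocks;  // lock name -> MDS holding it for us
};

static void send_droplocks(int mds) {
  std::cout << "send OP_DROPLOCKS to mds." << mds << "\n";
}

void drop_remote_locks(Mutation& mut) {
  std::set<int> slaves;
  for (const auto& l : mut.remote_xlocks)
    slaves.insert(l.second);
  for (const auto& l : mut.remote_wrlocks)
    slaves.insert(l.second);
  mut.remote_xlocks.clear();
  mut.remote_wrlocks.clear();
  // one DROPLOCKS per slave releases everything it still holds for this
  // request, including version locks that were acquired implicitly there
  for (int mds : slaves)
    send_droplocks(mds);
}

int main() {
  Mutation mut;
  mut.remote_xlocks["dn:foo"] = 1;
  mut.remote_wrlocks["ino:0x100.nestlock"] = 2;
  drop_remote_locks(mut);
}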

Signed-off-by: Yan, Zheng 
---
 src/mds/Locker.cc   | 30 +++---
 src/mds/Server.cc   |  6 ++
 src/messages/MMDSSlaveRequest.h |  4 
 3 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 98eaba8..dec0a94 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -483,16 +483,31 @@ void Locker::_drop_rdlocks(Mutation *mut, set<CInode*> 
*pneed_issue)
 
 void Locker::_drop_non_rdlocks(Mutation *mut, set<CInode*> *pneed_issue)
 {
+  set<int> slaves;
+
   while (!mut->xlocks.empty()) {
+SimpleLock *lock = *mut->xlocks.begin();
+MDSCacheObject *p = lock->get_parent();
+if (!p->is_auth()) {
+  assert(lock->get_sm()->can_remote_xlock);
+  slaves.insert(p->authority().first);
+  lock->put_xlock();
+  mut->locks.erase(lock);
+  mut->xlocks.erase(lock);
+  continue;
+}
 bool ni = false;
-MDSCacheObject *p = (*mut->xlocks.begin())->get_parent();
-xlock_finish(*mut->xlocks.begin(), mut, &ni);
+xlock_finish(lock, mut, &ni);
 if (ni)
   pneed_issue->insert((CInode*)p);
   }
+
   while (!mut->remote_wrlocks.empty()) {
-remote_wrlock_finish(mut->remote_wrlocks.begin()->first, 
mut->remote_wrlocks.begin()->second, mut);
+slaves.insert(mut->remote_wrlocks.begin()->second);
+mut->locks.erase(mut->remote_wrlocks.begin()->first);
+mut->remote_wrlocks.erase(mut->remote_wrlocks.begin());
   }
+
   while (!mut->wrlocks.empty()) {
 bool ni = false;
 MDSCacheObject *p = (*mut->wrlocks.begin())->get_parent();
@@ -500,6 +515,15 @@ void Locker::_drop_non_rdlocks(Mutation *mut, set<CInode*> 
*pneed_issue)
 if (ni)
   pneed_issue->insert((CInode*)p);
   }
+
+  for (set<int>::iterator p = slaves.begin(); p != slaves.end(); p++) {
+if (mds->mdsmap->get_state(*p) >= MDSMap::STATE_REJOIN) {
+  dout(10) << "_drop_non_rdlocks dropping remote locks on mds." << *p << 
dendl;
+  MMDSSlaveRequest *slavereq = new MMDSSlaveRequest(mut->reqid, 
mut->attempt,
+   
MMDSSlaveRequest::OP_DROPLOCKS);
+  mds->send_message_mds(slavereq, *p);
+}
+  }
 }
 
 void Locker::cancel_locking(Mutation *mut, set<CInode*> *pneed_issue)
diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 6700eda..6097552 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -1455,6 +1455,12 @@ void Server::dispatch_slave_request(MDRequest *mdr)
 }
 break;
 
+  case MMDSSlaveRequest::OP_DROPLOCKS:
+mds->locker->drop_locks(mdr);
+mdr->slave_request->put();
+mdr->slave_request = 0;
+break;
+
   case MMDSSlaveRequest::OP_AUTHPIN:
 handle_slave_auth_pin(mdr);
 break;
diff --git a/src/messages/MMDSSlaveRequest.h b/src/messages/MMDSSlaveRequest.h
index 03ec582..35af81d 100644
--- a/src/messages/MMDSSlaveRequest.h
+++ b/src/messages/MMDSSlaveRequest.h
@@ -41,6 +41,8 @@ class MMDSSlaveRequest : public Message {
   static const int OP_RMDIRPREP = 10;
   static const int OP_RMDIRPREPACK = -10;
 
+  static const int OP_DROPLOCKS= 11;
+
   static const int OP_FINISH = 17;  
   static const int OP_COMMITTED = -18;  
 
@@ -73,6 +75,8 @@ class MMDSSlaveRequest : public Message {
 case OP_RMDIRPREP: return "rmdir_prep";
 case OP_RMDIRPREPACK: return "rmdir_prep_ack";
 
+case OP_DROPLOCKS: return "drop_locks";
+
 case OP_ABORT: return "abort";
   //case OP_COMMIT: return "commit";
 
-- 
1.7.11.7



[PATCH 09/29] mds: mark rename inode as ambiguous auth on all involved MDS

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

When handling a cross-authority rename, the master first sends OP_RENAMEPREP
slave requests to the witness MDSes, then sends the OP_RENAMEPREP slave
request to the rename inode's auth MDS after getting all witness MDSes'
acknowledgments. Before receiving the OP_RENAMEPREP slave request, the rename
inode's auth MDS may change the lock state of the rename inode and send lock
messages to the witness MDSes. But a witness MDS may have already received
the OP_RENAMEPREP slave request and changed the source inode's authority. So
the witness MDS sends the lock acknowledgment message to the wrong MDS and
triggers an assertion.

The fix is: first the master marks the rename inode as ambiguous and sends a
message asking the rename inode's auth MDS to mark the inode as ambiguous as
well, then it sends OP_RENAMEPREP slave requests to the witness MDSes, and
finally it sends the OP_RENAMEPREP slave request to the rename inode's auth
MDS after getting all witness MDSes' acknowledgments.
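For illustration, a tiny runnable sketch of that master-side ordering; the
helper names and MDS ranks below are hypothetical placeholders, not the real
Server.cc code:

#include <iostream>
#include <set>

static void send_slave(int mds, const char* what) {
  std::cout << "send " << what << " to mds." << mds << "\n";
}

// Master-side steps for a cross-authority rename whose source inode is
// authoritative on `srci_auth` and witnessed by `witnesses`.
void master_prepare_rename(int srci_auth, const std::set<int>& witnesses) {
  // 1. mark the rename inode ambiguous locally and ask its auth MDS to do the
  //    same, before any lock state changes for the inode can be generated
  send_slave(srci_auth, "mark rename inode ambiguous");

  // 2. only then send OP_RENAMEPREP to every witness MDS
  for (int w : witnesses)
    if (w != srci_auth)
      send_slave(w, "OP_RENAMEPREP (witness)");

  // 3. after all witness acks are in, send OP_RENAMEPREP to the auth MDS
  send_slave(srci_auth, "OP_RENAMEPREP (auth of rename inode)");
}

int main() {
  master_prepare_rename(2, {0, 1, 2});
}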

Signed-off-by: Yan, Zheng 
---
 src/mds/CInode.cc   | 14 
 src/mds/CInode.h|  6 +++-
 src/mds/MDCache.cc  |  6 
 src/mds/Mutation.cc | 16 +
 src/mds/Mutation.h  |  8 -
 src/mds/Server.cc   | 98 +
 src/mds/Server.h|  2 +-
 7 files changed, 110 insertions(+), 40 deletions(-)

diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc
index 595892e..9d29b46 100644
--- a/src/mds/CInode.cc
+++ b/src/mds/CInode.cc
@@ -1932,6 +1932,20 @@ void CInode::unfreeze_auth_pin()
   }
 }
 
+void CInode::clear_ambiguous_auth(list<Context*>& finished)
+{
+  assert(state_test(CInode::STATE_AMBIGUOUSAUTH));
+  state_clear(CInode::STATE_AMBIGUOUSAUTH);
+  take_waiting(CInode::WAIT_SINGLEAUTH, finished);
+}
+
+void CInode::clear_ambiguous_auth()
+{
+  list<Context*> finished;
+  clear_ambiguous_auth(finished);
+  mdcache->mds->queue_waiters(finished);
+}
+
 // auth_pins
 bool CInode::can_auth_pin() {
   if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin())
diff --git a/src/mds/CInode.h b/src/mds/CInode.h
index e43ecf5..a627a4f 100644
--- a/src/mds/CInode.h
+++ b/src/mds/CInode.h
@@ -525,7 +525,11 @@ private:
 return state_test(STATE_AMBIGUOUSAUTH) ||
   MDSCacheObject::is_ambiguous_auth();
   }
-
+  void set_ambiguous_auth() {
+state_set(STATE_AMBIGUOUSAUTH);
+  }
+  void clear_ambiguous_auth(list<Context*>& finished);
+  void clear_ambiguous_auth();
 
   inodeno_t ino() const { return inode.ino; }
   vinodeno_t vino() const { return vinodeno_t(inode.ino, last); }
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index da0c7f1..3410b0f 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -2616,6 +2616,9 @@ void MDCache::handle_mds_failure(int who)
p->second->more()->waiting_on_slave.erase(who);
mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second));
   }
+
+  if (p->second->more()->prepared_inode_exporter == who)
+   p->second->more()->prepared_inode_exporter = -1;
 }
   }
 
@@ -7607,6 +7610,9 @@ void MDCache::request_cleanup(MDRequest *mdr)
   // drop (local) auth pins
   mdr->drop_local_auth_pins();
 
+  if (mdr->ambiguous_auth_inode)
+mdr->clear_ambiguous_auth(mdr->ambiguous_auth_inode);
+
   // drop stickydirs
   for (set<CInode*>::iterator p = mdr->stickydirs.begin();
p != mdr->stickydirs.end();
diff --git a/src/mds/Mutation.cc b/src/mds/Mutation.cc
index a9c3513..1c4cd13 100644
--- a/src/mds/Mutation.cc
+++ b/src/mds/Mutation.cc
@@ -106,6 +106,22 @@ void Mutation::unfreeze_auth_pin(CInode *inode)
   auth_pin_freeze = NULL;
 }
 
+void Mutation::set_ambiguous_auth(CInode *inode)
+{
+  if (!ambiguous_auth_inode) {
+inode->set_ambiguous_auth();
+ambiguous_auth_inode = inode;
+  } else
+assert(ambiguous_auth_inode == inode);
+}
+
+void Mutation::clear_ambiguous_auth(CInode *inode)
+{
+  assert(ambiguous_auth_inode == inode);
+  ambiguous_auth_inode->clear_ambiguous_auth();
+  ambiguous_auth_inode = NULL;
+}
+
 bool Mutation::can_auth_pin(MDSCacheObject *object)
 {
   return object->can_auth_pin() || (is_auth_pinned(object) && object == 
auth_pin_freeze);
diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h
index 83a1196..bf59dba 100644
--- a/src/mds/Mutation.h
+++ b/src/mds/Mutation.h
@@ -51,6 +51,7 @@ struct Mutation {
   set< MDSCacheObject* > remote_auth_pins;
   set< MDSCacheObject* > auth_pins;
   CInode *auth_pin_freeze;
+  CInode* ambiguous_auth_inode;
   
   // held locks
   set< SimpleLock* > rdlocks;  // always local.
@@ -83,6 +84,7 @@ struct Mutation {
   ls(0),
   slave_to_mds(-1),
   auth_pin_freeze(NULL),
+  ambiguous_auth_inode(NULL),
   locking(NULL),
   done_locking(false), committing(false), aborted(false), killed(false) { }
   Mutation(metareqid_t ri, __u32 att=0, int slave_to=-1)
@@ -90,6 +92,7 @@ struct Mutation {
   ls(0),
   slave_to_mds(slave_to), 
   auth_pin_freeze(NULL),
+  ambiguous_auth_inode(NULL),
   locking(NULL),
   done_locking(false), committing(false), aborted(false), killed(false)

[PATCH 12/29] mds: fix on-going two-phase commit tracking

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

The slaves for a two-phase commit should be mdr->more()->witnessed
instead of mdr->more()->slaves; mdr->more()->slaves also includes MDSes
contacted only for remote auth pins and locks.

Signed-off-by: Yan, Zheng 
---
 src/mds/Server.cc | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index e17f826..6700eda 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -4076,11 +4076,11 @@ void Server::_link_remote(MDRequest *mdr, bool inc, 
CDentry *dn, CInode *targeti
   EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
   mdlog->start_entry(le);
   le->metablob.add_client_req(mdr->reqid, 
mdr->client_request->get_oldest_client_tid());
-  if (!mdr->more()->slaves.empty()) {
-dout(20) << " noting uncommitted_slaves " << mdr->more()->slaves << dendl;
+  if (!mdr->more()->witnessed.empty()) {
+dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << 
dendl;
 le->reqid = mdr->reqid;
 le->had_slaves = true;
-mds->mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, 
mdr->more()->slaves);
+mds->mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, 
mdr->more()->witnessed);
   }
 
   if (inc) {
@@ -4655,11 +4655,11 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn, 
CDentry *straydn)
   EUpdate *le = new EUpdate(mdlog, "unlink_local");
   mdlog->start_entry(le);
   le->metablob.add_client_req(mdr->reqid, 
mdr->client_request->get_oldest_client_tid());
-  if (!mdr->more()->slaves.empty()) {
-dout(20) << " noting uncommitted_slaves " << mdr->more()->slaves << dendl;
+  if (!mdr->more()->witnessed.empty()) {
+dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << 
dendl;
 le->reqid = mdr->reqid;
 le->had_slaves = true;
-mds->mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, 
mdr->more()->slaves);
+mds->mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, 
mdr->more()->witnessed);
   }
 
   if (straydn) {
@@ -5493,13 +5493,13 @@ void Server::handle_client_rename(MDRequest *mdr)
   EUpdate *le = new EUpdate(mdlog, "rename");
   mdlog->start_entry(le);
   le->metablob.add_client_req(mdr->reqid, 
mdr->client_request->get_oldest_client_tid());
-  if (!mdr->more()->slaves.empty()) {
-dout(20) << " noting uncommitted_slaves " << mdr->more()->slaves << dendl;
+  if (!mdr->more()->witnessed.empty()) {
+dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << 
dendl;
 
 le->reqid = mdr->reqid;
 le->had_slaves = true;
 
-mds->mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, 
mdr->more()->slaves);
+mds->mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, 
mdr->more()->witnessed);
   }
   
   _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn);
-- 
1.7.11.7



[PATCH 10/29] mds: skip frozen inode when assimilating dirty inodes' rstat

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

CDir::assimilate_dirty_rstat_inodes() may encounter frozen inodes that
are being renamed. Skip these frozen inodes, because assimilating an
inode's rstat requires auth pinning the inode.

Signed-off-by: Yan, Zheng 
---
 src/mds/CDir.cc | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc
index 20ff469..91636cc 100644
--- a/src/mds/CDir.cc
+++ b/src/mds/CDir.cc
@@ -1022,6 +1022,9 @@ void CDir::assimilate_dirty_rstat_inodes()
   for (elist<CInode*>::iterator p = dirty_rstat_inodes.begin_use_current();
!p.end(); ++p) {
 CInode *in = *p;
+if (in->is_frozen())
+  continue;
+
 inode_t *pi = in->project_inode();
 pi->version = in->pre_dirty();
 
@@ -1040,16 +1043,22 @@ void 
CDir::assimilate_dirty_rstat_inodes_finish(Mutation *mut, EMetaBlob *blob)
   elist<CInode*>::iterator p = dirty_rstat_inodes.begin_use_current();
   while (!p.end()) {
 CInode *in = *p;
-CDentry *dn = in->get_projected_parent_dn();
 ++p;
 
+if (in->is_frozen())
+  continue;
+
+CDentry *dn = in->get_projected_parent_dn();
+
 mut->auth_pin(in);
 mut->add_projected_inode(in);
 
 in->clear_dirty_rstat();
 blob->add_primary_dentry(dn, true, in);
   }
-  assert(dirty_rstat_inodes.empty());
+
+  if (!dirty_rstat_inodes.empty())
+inode->mdcache->mds->locker->mark_updated_scatterlock(&inode->nestlock);
 }
 
 
-- 
1.7.11.7



[PATCH 06/29] mds: use null dentry to find old parent of renamed directory

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

When replaying a directory rename operation, the MDS needs to find the old
parent of the renamed directory in order to adjust the auth subtree. The
current code searches the cache to find the old parent, which does not work
if the renamed directory's inode is not in the cache. The EMetaBlob for a
directory rename contains at most one null dentry, so the MDS can use the
null dentry to find the old parent of the renamed directory. If there is no
null dentry in the EMetaBlob, the MDS was only a witness of the rename
operation and there is no auth subtree underneath the renamed directory.
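As a small self-contained sketch of that replay-side rule, using hypothetical
stand-in types rather than the real EMetaBlob structures: the single dirlump
that carries a null dentry identifies the renamed directory's old parent, and
no null dentry means this MDS only witnessed the rename.

#include <cassert>
#include <optional>
#include <string>
#include <vector>

struct DirLump {
  std::string dirfrag;   // which directory fragment this lump journals
  int nnull = 0;         // number of null dentries journaled in it
};

std::optional<std::string> find_old_parent(const std::vector<DirLump>& lumps) {
  std::optional<std::string> olddir;
  int nnull = 0;
  for (const auto& lump : lumps) {
    if (lump.nnull) {
      nnull += lump.nnull;
      olddir = lump.dirfrag;
    }
  }
  assert(nnull <= 1);    // a directory rename journals at most one null dentry
  return olddir;         // empty: we were just a witness, nothing to adjust
}

int main() {
  assert(find_old_parent({{"dir.A", 1}, {"dir.B", 0}}) == "dir.A");
  assert(!find_old_parent({{"dir.B", 0}}));
}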

Signed-off-by: Yan, Zheng 
---
 src/mds/Server.cc  | 11 ++-
 src/mds/journal.cc | 38 --
 2 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 23739ea..d1984a7 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -5616,10 +5616,10 @@ void Server::_rename_prepare(MDRequest *mdr,
   // nested beneath.
   bool force_journal = false;
   while (srci->is_dir()) {
-// if we are auth for srci and exporting it, have any _any_ open dirfrags, 
we
-// will (soon) have auth subtrees here.
-if (srci->is_auth() && !destdn->is_auth() && srci->has_dirfrags()) {
-  dout(10) << " we are exporting srci, and have open dirfrags, will force 
journal" << dendl;
+// if we are auth for srci and exporting it, force journal because we need 
create
+// auth subtrees here during journal replay.
+if (srci->is_auth() && !destdn->is_auth()) {
+  dout(10) << " we are exporting srci, will force journal" << dendl;
   force_journal = true;
   break;
 }
@@ -5720,7 +5720,8 @@ void Server::_rename_prepare(MDRequest *mdr,
  srci->get_dirfrags(ls);
  for (list::iterator p = ls.begin(); p != ls.end(); ++p) {
CDir *dir = *p;
-   metablob->renamed_dir_frags.push_back(dir->get_frag());
+   if (!dir->is_auth())
+ metablob->renamed_dir_frags.push_back(dir->get_frag());
  }
  dout(10) << " noting renamed dir open frags " << 
metablob->renamed_dir_frags << dendl;
}
diff --git a/src/mds/journal.cc b/src/mds/journal.cc
index 20bc755..e73b8e7 100644
--- a/src/mds/journal.cc
+++ b/src/mds/journal.cc
@@ -442,6 +442,16 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg)
   dout(10) << "EMetaBlob.replay renamed inode is " << *renamed_diri << 
dendl;
 else
   dout(10) << "EMetaBlob.replay don't have renamed ino " << renamed_dirino 
<< dendl;
+
+int nnull = 0;
+for (list<dirfrag_t>::iterator lp = lump_order.begin(); lp != 
lump_order.end(); ++lp) {
+  dirlump &lump = lump_map[*lp];
+  if (lump.nnull) {
+   dout(10) << "EMetaBlob.replay found null dentry in dir " << *lp << 
dendl;
+   nnull += lump.nnull;
+  }
+}
+assert(nnull <= 1);
   }
 
   // keep track of any inodes we unlink and don't relink elsewhere
@@ -622,8 +632,6 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg)
  dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl;
  if (dn->get_linkage()->is_primary())
unlinked.insert(dn->get_linkage()->get_inode());
- if (dn->get_linkage()->get_inode() == renamed_diri)
-   olddir = dir;
  dir->unlink_inode(dn);
}
dn->set_version(p->dnv);
@@ -631,24 +639,34 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg)
dout(10) << "EMetaBlob.replay had " << *dn << dendl;
assert(dn->last == p->dnlast);
   }
+  olddir = dir;
 }
   }
 
   if (renamed_dirino) {
-if (olddir) {
-  assert(renamed_diri);
+if (renamed_diri) {
+  assert(olddir);
+} else {
+  // we imported a diri we haven't seen before
+  renamed_diri = mds->mdcache->get_inode(renamed_dirino);
+  assert(renamed_diri);  // it was in the metablob
+}
+
+if (renamed_diri->authority().first != mds->whoami &&
+   olddir && olddir->authority().first == mds->whoami) {
+  list<frag_t> leaves;
+  renamed_diri->dirfragtree.get_leaves(leaves);
+  for (list<frag_t>::iterator p = leaves.begin(); p != leaves.end(); ++p)
+   renamed_diri->get_or_open_dirfrag(mds->mdcache, *p);
+}
+
+if (renamed_diri && olddir) {
   mds->mdcache->adjust_subtree_after_rename(renamed_diri, olddir, false);
   
   // see if we can discard the subtree we renamed out of
   CDir *root = mds->mdcache->get_subtree_root(olddir);
   if (root->get_dir_auth() == CDIR_AUTH_UNDEF)
mds->mdcache->try_trim_non_auth_subtree(root);
-
-} else {
-  // we imported a diri we haven't seen before
-  assert(!renamed_diri);
-  renamed_diri = mds->mdcache->get_inode(renamed_dirino);
-  assert(renamed_diri);  // it was in the metablob
 }
 
 // if we are the srci importer, we'll also have some dirfrags we have to 
open up...
-- 
1.7.11.7


[PATCH 07/29] mds: don't trim ambiguous imports in MDCache::trim_non_auth_subtree

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

Trimming ambiguous imports in MDCache::trim_non_auth_subtree() confuses
MDCache::disambiguate_imports() and causes an infinite loop.

Signed-off-by: Yan, Zheng 
---
 src/mds/MDCache.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 5014b5d..da0c7f1 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -5802,7 +5802,7 @@ bool MDCache::trim_non_auth_subtree(CDir *dir)
 for (list<CDir*>::iterator subdir = subdirs.begin();
 subdir != subdirs.end();
 ++subdir) {
-  if ((*subdir)->is_subtree_root()) {
+  if ((*subdir)->is_subtree_root() || 
my_ambiguous_imports.count((*subdir)->dirfrag())) {
 keep_inode = true;
 dout(10) << "trim_non_auth_subtree(" << dir << ") subdir " << 
*subdir << "is kept!" << dendl;
   }
-- 
1.7.11.7



[PATCH 08/29] mds: only export directory fragments in stray to their auth MDS

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

Signed-off-by: Yan, Zheng 
---
 src/mds/Migrator.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc
index 5db21cd..8686c86 100644
--- a/src/mds/Migrator.cc
+++ b/src/mds/Migrator.cc
@@ -642,12 +642,11 @@ void Migrator::export_dir(CDir *dir, int dest)
   }
 
   if (!dir->inode->is_base() && dir->get_parent_dir()->get_inode()->is_stray() 
&&
-  dir->get_parent_dir()->get_parent_dir()->ino() == 
MDS_INO_MDSDIR(mds->get_nodeid())) {
+  dir->get_parent_dir()->get_parent_dir()->ino() != MDS_INO_MDSDIR(dest)) {
 dout(7) << "i won't export anything in stray" << dendl;
 return;
   }
 
-
   if (dir->is_frozen() ||
   dir->is_freezing()) {
 dout(7) << " can't export, freezing|frozen.  wait for other exports to 
finish first." << dendl;
-- 
1.7.11.7



[PATCH 04/29] mds: xlock stray dentry when handling rename or unlink

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

This prevents the MDS from reintegrating the stray dentry before the
rename/unlink finishes.

Signed-off-by: Yan, Zheng 
---
 src/mds/Server.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index dc14bbe..064733c 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -4536,6 +4536,7 @@ void Server::handle_client_unlink(MDRequest *mdr)
   if (straydn) {
 wrlocks.insert(&straydn->get_dir()->inode->filelock);
 wrlocks.insert(&straydn->get_dir()->inode->nestlock);
+xlocks.insert(&straydn->lock);
   }
   if (in->is_dir())
 rdlocks.insert(&in->filelock);   // to verify it's empty
@@ -5300,6 +5301,7 @@ void Server::handle_client_rename(MDRequest *mdr)
   if (straydn) {
 wrlocks.insert(&straydn->get_dir()->inode->filelock);
 wrlocks.insert(&straydn->get_dir()->inode->nestlock);
+xlocks.insert(&straydn->lock);
   }
 
   // xlock versionlock on dentries if there are witnesses.
-- 
1.7.11.7



[PATCH 05/29] mds: don't journal null dentry for overwrited remote linkage

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

Server::_rename_prepare() adds a null dest dentry to the EMetaBlob if
the rename operation overwrites a remote linkage. This is incorrect
because null dentries are processed after primary and remote dentries
during journal replay. The erroneous null dentry makes the rename
destination's dentry disappear.

Signed-off-by: Yan, Zheng 
---
 src/mds/Server.cc | 4 
 1 file changed, 4 deletions(-)

diff --git a/src/mds/Server.cc b/src/mds/Server.cc
index 064733c..23739ea 100644
--- a/src/mds/Server.cc
+++ b/src/mds/Server.cc
@@ -5794,10 +5794,6 @@ void Server::_rename_prepare(MDRequest *mdr,
CEPH_NOSNAP, 0, destdnl);
metablob->add_primary_dentry(oldin->get_projected_parent_dn(), true, 
oldin);
   }
-  if (destdn->is_auth()) {
-   // auth for dn, not targeti
-   metablob->add_null_dentry(destdn, true);
-  }
 }
   }
 
-- 
1.7.11.7



[PATCH 03/29] mds: don't trigger assertion when discover races with rename

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

A discover reply that adds a replica dentry and inode can race with a
rename: the slave request for the rename sends a discover and waits, but
is woken up by the reply for a different discover.

Signed-off-by: Yan, Zheng 
---
 src/mds/CDentry.cc | 17 ++---
 src/mds/MDCache.cc |  5 ++---
 2 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/src/mds/CDentry.cc b/src/mds/CDentry.cc
index 7dd88ac..5ff6e61 100644
--- a/src/mds/CDentry.cc
+++ b/src/mds/CDentry.cc
@@ -439,23 +439,18 @@ void CDentry::decode_replica(bufferlist::iterator& p, 
bool is_new)
 
   inodeno_t rino;
   unsigned char rdtype;
+  __s32 ls;
   ::decode(rino, p);
   ::decode(rdtype, p);
-  if (rino) {
-if (linkage.is_null())
-  dir->link_remote_inode(this, rino, rdtype);
-else
-  assert(linkage.is_remote() && linkage.remote_ino == rino);
-  }
-  
-  __s32 ls;
   ::decode(ls, p);
-  if (is_new)
+
+  if (is_new) {
+if (rino)
+  dir->link_remote_inode(this, rino, rdtype);
 lock.set_state(ls);
+  }
 }
 
-
-
 // 
 // locking
 
diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc
index 0a88714..5014b5d 100644
--- a/src/mds/MDCache.cc
+++ b/src/mds/MDCache.cc
@@ -9150,12 +9150,11 @@ CInode 
*MDCache::add_replica_inode(bufferlist::iterator& p, CDentry *dn, list<Context*>& finished)
 in->decode_replica(p, false);
 dout(10) << "add_replica_inode had " << *in << dendl;
-assert(!dn || dn->get_linkage()->get_inode() == in);
   }
 
   if (dn) {
-assert(dn->get_linkage()->is_primary());
-assert(dn->get_linkage()->get_inode() == in);
+if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != 
in)
+  dout(10) << "add_replica_inode different linkage in dentry " << *dn << 
dendl;
 
 dn->get_dir()->take_ino_waiting(in->ino(), finished);
   }
-- 
1.7.11.7



[PATCH 01/29] mds: don't renew revoking lease

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

The MDS may receive a lease renew request while the lease is being
revoked; just ignore the renew request in that case.

Signed-off-by: Yan, Zheng 
---
 src/mds/Locker.cc | 26 ++
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index 76708ec..c8cd236 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -2914,18 +2914,20 @@ void Locker::handle_client_lease(MClientLease *m)
 
   case CEPH_MDS_LEASE_RENEW:
 {
-  dout(7) << "handle_client_lease client." << client
- << " renew on " << *dn << dendl;
-  int pool = 1;   // fixme.. do something smart!
-  m->h.duration_ms = (int)(1000 * mdcache->client_lease_durations[pool]);
-  m->h.seq = ++l->seq;
-  m->clear_payload();
-  
-  utime_t now = ceph_clock_now(g_ceph_context);
-  now += mdcache->client_lease_durations[pool];
-  mdcache->touch_client_lease(l, pool, now);
-  
-  mds->send_message_client_counted(m, m->get_connection());
+  dout(7) << "handle_client_lease client." << client << " renew on " << *dn
+ << (!dn->lock.can_lease(client)?", revoking lease":"") << dendl;
+  if (dn->lock.can_lease(client)) {
+   int pool = 1;   // fixme.. do something smart!
+   m->h.duration_ms = (int)(1000 * mdcache->client_lease_durations[pool]);
+   m->h.seq = ++l->seq;
+   m->clear_payload();
+
+   utime_t now = ceph_clock_now(g_ceph_context);
+   now += mdcache->client_lease_durations[pool];
+   mdcache->touch_client_lease(l, pool, now);
+
+   mds->send_message_client_counted(m, m->get_connection());
+  }
 }
 break;
 
-- 
1.7.11.7



[PATCH 02/29] mds: fix Locker::simple_eval()

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

Locker::simple_eval() checks if the loner wants CEPH_CAP_GEXCL to
decide if it should change the lock to EXCL state, but it checks
if CEPH_CAP_GEXCL is issued to the loner to decide if it should
change the lock to SYNC state. So if the loner wants CEPH_CAP_GEXCL,
but doesn't have CEPH_CAP_GEXCL, Locker::simple_eval() will keep
switching the lock state.
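For illustration, a minimal stand-alone model of the flapping and of the fix;
the enum and flags below are a hypothetical simplification, not the real
Locker state machine:

#include <cassert>

enum class State { SYNC, EXCL };

// Old behaviour: the EXCL decision looked at the *wanted* GEXCL bit, but the
// SYNC decision looked at the *issued* GEXCL bit.
State old_eval(State cur, bool wants_gexcl, bool issued_gexcl) {
  if (cur != State::EXCL && wants_gexcl)
    return State::EXCL;
  if (cur != State::SYNC && !issued_gexcl)
    return State::SYNC;
  return cur;
}

// New behaviour: both decisions key off what the loner wants.
State new_eval(State cur, bool wants_gexcl) {
  if (cur != State::EXCL && wants_gexcl)
    return State::EXCL;
  if (cur != State::SYNC && !wants_gexcl)
    return State::SYNC;
  return cur;
}

int main() {
  // loner wants CEPH_CAP_GEXCL but has not been issued it yet
  State s = old_eval(State::SYNC, true, false);        // SYNC -> EXCL
  assert(old_eval(s, true, false) == State::SYNC);     // ...and straight back
  s = new_eval(State::SYNC, true);                     // SYNC -> EXCL
  assert(new_eval(s, true) == State::EXCL);            // stays put
}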

Signed-off-by: Yan, Zheng 
---
 src/mds/Locker.cc | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc
index c8cd236..98eaba8 100644
--- a/src/mds/Locker.cc
+++ b/src/mds/Locker.cc
@@ -3250,11 +3250,10 @@ void Locker::simple_eval(SimpleLock *lock, bool 
*need_issue)
 return;
 
   CInode *in = 0;
-  int wanted = 0, issued = 0;
+  int wanted = 0;
   if (lock->get_type() != CEPH_LOCK_DN) {
 in = (CInode*)lock->get_parent();
-wanted = in->get_caps_wanted(NULL, NULL, lock->get_cap_shift());
-issued = in->get_caps_issued(NULL, NULL, NULL, lock->get_cap_shift());
+in->get_caps_wanted(&wanted, NULL, lock->get_cap_shift());
   }
   
   // -> excl?
@@ -3270,8 +3269,7 @@ void Locker::simple_eval(SimpleLock *lock, bool 
*need_issue)
   else if (lock->get_state() != LOCK_SYNC &&
   !lock->is_xlocked() &&
   !lock->is_wrlocked() &&
-  ((!(issued & CEPH_CAP_GEXCL) &&
-!lock->is_waiter_for(SimpleLock::WAIT_WR)) ||
+  ((!(wanted & CEPH_CAP_GEXCL) && 
!lock->is_waiter_for(SimpleLock::WAIT_WR)) ||
(lock->get_state() == LOCK_EXCL && in && in->get_target_loner() < 
0))) {
 dout(7) << "simple_eval stable, syncing " << *lock 
<< " on " << *lock->get_parent() << dendl;
-- 
1.7.11.7



[PATCH 00/29] Various fixes for MDS

2013-01-04 Thread Yan, Zheng
From: "Yan, Zheng" 

This patch series fixes various issues I encountered when running 3 MDSes.
I tested the series by running fsstress on two clients that use the same
test directory. The MDSes and clients could sometimes survive the overnight
test.

This patch series are also in:
  git://github.com/ukernel/ceph.git wip-mds