Re: [PATCH REPOST] rbd: be picky about osd request status type
I personally dislike "spaces after cast", but I haven't checked the kernel style guide. Otherwise: Reviewed-by: Dan Mick On 01/03/2013 02:40 PM, Alex Elder wrote: The result field in a ceph osd reply header is a signed 32-bit type, but rbd code often casually uses int to represent it. The following changes the types of variables that handle this result value to be "s32" instead of "int" to be completely explicit about it. Only at the point we pass that result to __blk_end_request() does the type get converted to the plain old int defined for that interface. There is almost certainly no binary impact of this change, but I prefer to show the exact size and signedness of the value since we know it. Signed-off-by: Alex Elder --- drivers/block/rbd.c | 23 --- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 85131de..8b79a5b 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -171,7 +171,7 @@ struct rbd_client { */ struct rbd_req_status { int done; - int rc; + s32 rc; u64 bytes; }; @@ -1053,13 +1053,13 @@ static void rbd_destroy_ops(struct ceph_osd_req_op *ops) static void rbd_coll_end_req_index(struct request *rq, struct rbd_req_coll *coll, int index, - int ret, u64 len) + s32 ret, u64 len) { struct request_queue *q; int min, max, i; dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n", -coll, index, ret, (unsigned long long) len); +coll, index, (int) ret, (unsigned long long) len); if (!rq) return; @@ -1080,7 +1080,7 @@ static void rbd_coll_end_req_index(struct request *rq, max++; for (i = min; istatus[i].rc, + __blk_end_request(rq, (int) coll->status[i].rc, coll->status[i].bytes); coll->num_done++; kref_put(&coll->kref, rbd_coll_release); @@ -1089,7 +1089,7 @@ static void rbd_coll_end_req_index(struct request *rq, } static void rbd_coll_end_req(struct rbd_request *rbd_req, -int ret, u64 len) +s32 ret, u64 len) { rbd_coll_end_req_index(rbd_req->rq, rbd_req->coll, rbd_req->coll_index, @@ 
-1129,7 +1129,7 @@ static int rbd_do_request(struct request *rq, if (!rbd_req) { if (coll) rbd_coll_end_req_index(rq, coll, coll_index, - -ENOMEM, len); + (s32) -ENOMEM, len); return -ENOMEM; } @@ -1206,7 +1206,7 @@ done_err: bio_chain_put(rbd_req->bio); ceph_osdc_put_request(osd_req); done_pages: - rbd_coll_end_req(rbd_req, ret, len); + rbd_coll_end_req(rbd_req, (s32) ret, len); kfree(rbd_req); return ret; } @@ -1219,7 +1219,7 @@ static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg) struct rbd_request *rbd_req = osd_req->r_priv; struct ceph_osd_reply_head *replyhead; struct ceph_osd_op *op; - __s32 rc; + s32 rc; u64 bytes; int read_op; @@ -1227,14 +1227,14 @@ static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg) replyhead = msg->front.iov_base; WARN_ON(le32_to_cpu(replyhead->num_ops) == 0); op = (void *)(replyhead + 1); - rc = le32_to_cpu(replyhead->result); + rc = (s32) le32_to_cpu(replyhead->result); bytes = le64_to_cpu(op->extent.length); read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ); dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n", (unsigned long long) bytes, read_op, (int) rc); - if (rc == -ENOENT && read_op) { + if (rc == (s32) -ENOENT && read_op) { zero_bio_chain(rbd_req->bio, 0); rc = 0; } else if (rc == 0 && read_op && bytes < rbd_req->len) { @@ -1679,7 +1679,8 @@ static void rbd_rq_fn(struct request_queue *q) bio_chain, coll, cur_seg); else rbd_coll_end_req_index(rq, coll, cur_seg, - -ENOMEM, chain_size); + (s32) -ENOMEM, + chain_size); size -= chain_size; ofs += chain_size; -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH REPOST] ceph: define ceph_encode_8_safe()
Reviewed-by: Dan Mick On 01/03/2013 11:07 AM, Alex Elder wrote: It's kind of a silly macro, but ceph_encode_8_safe() is the only one missing from an otherwise pretty complete set. It's not used, but neither are a couple of the others in this set. While in there, insert some whitespace to tidy up the alignment of the line-terminating backslashes in some of the macro definitions. Signed-off-by: Alex Elder --- include/linux/ceph/decode.h | 29 + 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index 4bbf2db..cd679f2 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -52,10 +52,10 @@ static inline int ceph_has_room(void **p, void *end, size_t n) return end >= *p && n <= end - *p; } -#define ceph_decode_need(p, end, n, bad) \ - do {\ - if (!likely(ceph_has_room(p, end, n))) \ - goto bad; \ +#define ceph_decode_need(p, end, n, bad) \ + do {\ + if (!likely(ceph_has_room(p, end, n))) \ + goto bad; \ } while (0) #define ceph_decode_64_safe(p, end, v, bad) \ @@ -99,8 +99,8 @@ static inline int ceph_has_room(void **p, void *end, size_t n) * * There are two possible failures: * - converting the string would require accessing memory at or - * beyond the "end" pointer provided (-E - * - memory could not be allocated for the result + * beyond the "end" pointer provided (-ERANGE) + * - memory could not be allocated for the result (-ENOMEM) */ static inline char *ceph_extract_encoded_string(void **p, void *end, size_t *lenp, gfp_t gfp) @@ -217,10 +217,10 @@ static inline void ceph_encode_string(void **p, void *end, *p += len; } -#define ceph_encode_need(p, end, n, bad) \ - do {\ - if (!likely(ceph_has_room(p, end, n))) \ - goto bad; \ +#define ceph_encode_need(p, end, n, bad) \ + do {\ + if (!likely(ceph_has_room(p, end, n))) \ + goto bad; \ } while (0) #define ceph_encode_64_safe(p, end, v, bad) \ @@ -231,12 +231,17 @@ static inline void ceph_encode_string(void **p, void *end, #define 
ceph_encode_32_safe(p, end, v, bad) \ do {\ ceph_encode_need(p, end, sizeof(u32), bad); \ - ceph_encode_32(p, v); \ + ceph_encode_32(p, v); \ } while (0) #define ceph_encode_16_safe(p, end, v, bad) \ do {\ ceph_encode_need(p, end, sizeof(u16), bad); \ - ceph_encode_16(p, v); \ + ceph_encode_16(p, v); \ + } while (0) +#define ceph_encode_8_safe(p, end, v, bad) \ + do {\ + ceph_encode_need(p, end, sizeof(u8), bad); \ + ceph_encode_8(p, v);\ } while (0) #define ceph_encode_copy_safe(p, end, pv, n, bad) \ -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
librados/librbd compatibility issue with v0.56
We identified a problem with the version of librbd/librados in v0.56. There will be a v0.56.1 update in a few days that fixes it. In the meantime, be aware that v0.56 ceph-osds may not interact properly with non-v0.56 radosgw or librbd clients, and v0.56 radosgw and librbd clients will not interact properly with non-v0.56 OSDs. If you are using v0.56, we recommend moving to the current testing branch, which includes fixes queued up for v0.56.1. You can get debs from e.g. http://gitbuilder.ceph.com/ceph-deb-precise-x86_64-basic/ref/testing/ or for many other distros; see http://ceph.com/docs/master/install/debian/#add-development-testing-packages The fix is commit d3abd0fe0bb402ff403259d4b1a718a56331fc39 (921e06decebccc913c0e4f61916d00e62e7e1635 in testing branch). Thanks! sage -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
argonaut stable update coming shortly
We recently identified a bug in the way the OSD is marking its commits on XFS or ext4 (well, anything not btrfs) that could lead to data loss in the event of power loss or a kernel panic. We're doing some final testing and will have a v0.48.3argonaut package available by Monday. If you like, you can get the fix before that from the autobuilders by grabbing the 'stable' branch; see http://ceph.com/docs/master/install/debian/#add-development-testing-packages sage -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
ceph caps (Ganesha + Ceph pnfs)
Hi Ceph folks, Summarizing from Ceph IRC discussion by request, I'm one of the developers of a pNFS (parallel nfs) implementation that is built atop the Ceph system. I'm working on code that wants to use the Ceph caps system to control and sequence i/o operations and file metadata, for example, so that ordinary Ceph clients see a coherent view of the objects being exported via pNFS. The basic pNFS model (sorry for those who know all this, RFC 5661, etc) is to extend NFSv4 with a distributed/parallel access model. To do parallel access in pNFS, the NFS client gets a `layout` from an NFS metadata (MDS) server. A layout is a recallable object, a bit like an oplock/delegation/DCE token, see spec, it basically presents a list of subordinate data servers (DSes) on which to read and/or write regions of a specific file. Ok, so in our implementation, we would typically expect to have a DS server collocated with each Ceph OSD. When an NFS client has a layout on a given inode, its i/o requests will be performed "directly" by the appropriate OSD. When an MDS is asked to issue a layout on a file, it should hold a cap or caps which ensure the layout will not conflict with other Ceph clients and ensure the MDS will be notified when it must recall the layout later if other clients attempt conflicting operations. In turn, involved DS servers need the correct caps to read and/or write the data, plus, they need to update file metadata periodically. (This can be upon a final commit of the client's layout, or inline with a write operation, if the client specifies the write be 'sync' stability.) 
The current set of behaviors we're modeling are: a) allow MDS to hold Ceph caps, tracking issued pNFS layouts, such that it will be able to handle events which should trigger layout recalls at its pNFS clients (e.g., on conflicts)--currently it holds CEPH_CAP_FILE_WR|CEPH_CAP_FILE_RD b) on a given DS, we currently get CEPH_CAP_FILE_WR|CEPH_CAP_FILE_RD caps when asked to perform i/o on behalf of a valid layout--but we need to update metadata (size, mtime) and my question in IRC was cross checking these capabilities as correct to send an update message In the current pass I'm trying to clean up/refine the model implementation, leaving some room for adjustment. Thanks! Matt -- Matt Benjamin The Linux Box 206 South Fifth Ave. Suite 150 Ann Arbor, MI 48104 http://linuxbox.com tel. 734-761-4689 fax. 734-769-8938 cel. 734-216-5309 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Any idea about doing deduplication in ceph?
On Wed, Dec 26, 2012 at 6:16 PM, lollipop wrote: > Nowadays, I am wondering doing offline deduplication in ceph? > My idea is: > First in the ceph-client, I try to get the locations of chunks in one file. > The information includes > how many chunks the file has and which osd the chunk(object group) has been > stored. > Then the ceph-client try to communicate with the exact osd to ask the osd to > return the chunk hash. > After that, we compare the returned hash with the already stored hash table, > If the chunk is duplicated, we try to change the file meta-data. > Can it work? > Can you give some ideas? Thank you Any off-line deduplication support in Ceph is going to have the important parts be in the OSD code, not in the clients. :) We've discussed dedup a little bit internally and on the mailing list; you can do an archive search if you're interested in what the current thoughts are. :) -Greg -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: ceph stability
On Fri, Dec 21, 2012 at 2:07 AM, Amon Ott wrote: > Am 20.12.2012 15:31, schrieb Mark Nelson: >> On 12/20/2012 01:08 AM, Roman Hlynovskiy wrote: >>> Hello Mark, >>> >>> for multi-mds solutions do you refer to multi-active arch or 1 active >>> and many standby arch? >> >> That's a good question! I know we don't really recommend multi-active >> right now for production use. Not sure what our current recommendations >> are for multi-standby. As far as I know it's considered to be more >> stable. I'm sure Greg or Sage can chime in with a more accurate >> assessment. > > We have been testing a lot with multi-standby, because a single MDS does > not make a lot of sense in a cluster. Maybe the clue is to have only one > standby, making SPOF a DPOF? The number of standbys should not have any impact on stability — they are read-only until the active MDS gets declared down, at which point one of them becomes active. I suppose having a standby could reduce stability if your active MDS is overloaded enough to miss check-ins without actually going down — without a standby it would eventually recover, but with a standby a transition happens. That's the only difference, though. -Greg -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: which Linux kernel version corresponds to 0.48argonaut?
On Fri, 4 Jan 2013, Gregory Farnum wrote: > I think they might be different just as a consequence of being updated > less recently; that's where all of the lines whose origin I recognize > differ (not certain about the calc_parents stuff though). Sage can > confirm. In general, the crush files in mainline should track the same files in ceph.git master, modulo the #include lines at the top. 0.48argonaut is sufficiently old that the linux versions are actually newer. > The specific issue you encountered previously was of course because > you changed the layout algorithm and the client needs to be able to > process that layout itself. Yep! sage > -Greg > > On Thu, Dec 20, 2012 at 5:37 PM, Xing Lin wrote: > > This may be useful for other Ceph newbies just like me. > > > > I have ported my changes to 0.48argonaut to related Ceph files included in > > Linux, though files with the same name are not exactly the same. Then I > > recompiled and installed the kernel. After that, everything seems to be > > working again now: Ceph is working with my new simple replica placement > > algorithm. :) > > So, it seems that Ceph files included in the Linux kernel are supposed to be > > different from those in 0.48argonaut. Presumably, the Linux kernel contains > > the client-side implementation while 0.48argonaut contains the server-side > > implementation. It would be appreciated if someone can confirm it. Thank > > you! > > > > Xing > > > > > > On 12/20/2012 11:54 AM, Xing Lin wrote: > >> > >> Hi, > >> > >> I was trying to add a simple replica placement algorithm in Ceph. This > >> algorithm simply returns r_th item in a bucket for the r_th replica. I have > >> made that change in Ceph source code (including files such as crush.h, > >> crush.c, mapper.c, ...) and I can run Ceph monitor and osd daemons. > >> However, > >> I am not able to map rbd block devices at client machines. 
'rbd map image0' > >> reported "input/output error" and 'dmesg' at the client machine showed > >> message like "libceph: handle_map corrupt msg". I believe that is because I > >> have not ported my changes to Ceph client side programs and it does not > >> recognize the new placement algorithm. I probably need to recompile the rbd > >> block device driver. When I was trying to replace Ceph related files in > >> Linux with my own version, I noticed that files in Linux-3.2.16 are > >> different from these included in Ceph source code. For example, the > >> following is the diff of crush.h in Linux-3.2.16 and 0.48argonaut. So, my > >> question is that is there any version of Linux that contains the exact Ceph > >> files as included in 0.48argonaut? Thanks. > >> > >> --- > >> $ diff -uNrp ceph-0.48argonaut/src/crush/crush.h > >> linux-3.2.16/include/linux/crush/crush.h > >> --- ceph-0.48argonaut/src/crush/crush.h2012-06-26 11:56:36.0 > >> -0600 > >> +++ linux-3.2.16/include/linux/crush/crush.h2012-04-22 > >> 16:31:32.0 -0600 > >> @@ -1,12 +1,7 @@ > >> #ifndef CEPH_CRUSH_CRUSH_H > >> #define CEPH_CRUSH_CRUSH_H > >> > >> -#if defined(__linux__) > >> #include > >> -#elif defined(__FreeBSD__) > >> -#include > >> -#include "include/inttypes.h" > >> -#endif > >> > >> /* > >> * CRUSH is a pseudo-random data distribution algorithm that > >> @@ -156,24 +151,25 @@ struct crush_map { > >> struct crush_bucket **buckets; > >> struct crush_rule **rules; > >> > >> +/* > >> + * Parent pointers to identify the parent bucket a device or > >> + * bucket in the hierarchy. If an item appears more than > >> + * once, this is the _last_ time it appeared (where buckets > >> + * are processed in bucket id order, from -1 on down to > >> + * -max_buckets. 
> >> + */ > >> +__u32 *bucket_parents; > >> +__u32 *device_parents; > >> + > >> __s32 max_buckets; > >> __u32 max_rules; > >> __s32 max_devices; > >> - > >> -/* choose local retries before re-descent */ > >> -__u32 choose_local_tries; > >> -/* choose local attempts using a fallback permutation before > >> - * re-descent */ > >> -__u32 choose_local_fallback_tries; > >> -/* choose attempts before giving up */ > >> -__u32 choose_total_tries; > >> - > >> -__u32 *choose_tries; > >> }; > >> > >> > >> /* crush.c */ > >> -extern int crush_get_bucket_item_weight(const struct crush_bucket *b, int > >> pos); > >> +extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos); > >> +extern void crush_calc_parents(struct crush_map *map); > >> extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b); > >> extern void crush_destroy_bucket_list(struct crush_bucket_list *b); > >> extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b); > >> @@ -181,9 +177,4 @@ extern void crush_destroy_bucket_straw(s > >> extern void crush_destroy_bucket(struct crush_bucket *b); > >> extern void crush_destroy(struct crush_map *map); > >> > >> -static inline int crush_
Re: which Linux kernel version corresponds to 0.48argonaut?
I think they might be different just as a consequence of being updated less recently; that's where all of the lines whose origin I recognize differ (not certain about the calc_parents stuff though). Sage can confirm. The specific issue you encountered previously was of course because you changed the layout algorithm and the client needs to be able to process that layout itself. -Greg On Thu, Dec 20, 2012 at 5:37 PM, Xing Lin wrote: > This may be useful for other Ceph newbies just like me. > > I have ported my changes to 0.48argonaut to related Ceph files included in > Linux, though files with the same name are not exactly the same. Then I > recompiled and installed the kernel. After that, everything seems to be > working again now: Ceph is working with my new simple replica placement > algorithm. :) > So, it seems that Ceph files included in the Linux kernel are supposed to be > different from those in 0.48argonaut. Presumably, the Linux kernel contains > the client-side implementation while 0.48argonaut contains the server-side > implementation. It would be appreciated if someone can confirm it. Thank > you! > > Xing > > > On 12/20/2012 11:54 AM, Xing Lin wrote: >> >> Hi, >> >> I was trying to add a simple replica placement algorithm in Ceph. This >> algorithm simply returns r_th item in a bucket for the r_th replica. I have >> made that change in Ceph source code (including files such as crush.h, >> crush.c, mapper.c, ...) and I can run Ceph monitor and osd daemons. However, >> I am not able to map rbd block devices at client machines. 'rbd map image0' >> reported "input/output error" and 'dmesg' at the client machine showed >> message like "libceph: handle_map corrupt msg". I believe that is because I >> have not ported my changes to Ceph client side programs and it does not >> recognize the new placement algorithm. I probably need to recompile the rbd >> block device driver. 
When I was trying to replace Ceph related files in >> Linux with my own version, I noticed that files in Linux-3.2.16 are >> different from these included in Ceph source code. For example, the >> following is the diff of crush.h in Linux-3.2.16 and 0.48argonaut. So, my >> question is that is there any version of Linux that contains the exact Ceph >> files as included in 0.48argonaut? Thanks. >> >> --- >> $ diff -uNrp ceph-0.48argonaut/src/crush/crush.h >> linux-3.2.16/include/linux/crush/crush.h >> --- ceph-0.48argonaut/src/crush/crush.h2012-06-26 11:56:36.0 >> -0600 >> +++ linux-3.2.16/include/linux/crush/crush.h2012-04-22 >> 16:31:32.0 -0600 >> @@ -1,12 +1,7 @@ >> #ifndef CEPH_CRUSH_CRUSH_H >> #define CEPH_CRUSH_CRUSH_H >> >> -#if defined(__linux__) >> #include >> -#elif defined(__FreeBSD__) >> -#include >> -#include "include/inttypes.h" >> -#endif >> >> /* >> * CRUSH is a pseudo-random data distribution algorithm that >> @@ -156,24 +151,25 @@ struct crush_map { >> struct crush_bucket **buckets; >> struct crush_rule **rules; >> >> +/* >> + * Parent pointers to identify the parent bucket a device or >> + * bucket in the hierarchy. If an item appears more than >> + * once, this is the _last_ time it appeared (where buckets >> + * are processed in bucket id order, from -1 on down to >> + * -max_buckets. 
>> + */ >> +__u32 *bucket_parents; >> +__u32 *device_parents; >> + >> __s32 max_buckets; >> __u32 max_rules; >> __s32 max_devices; >> - >> -/* choose local retries before re-descent */ >> -__u32 choose_local_tries; >> -/* choose local attempts using a fallback permutation before >> - * re-descent */ >> -__u32 choose_local_fallback_tries; >> -/* choose attempts before giving up */ >> -__u32 choose_total_tries; >> - >> -__u32 *choose_tries; >> }; >> >> >> /* crush.c */ >> -extern int crush_get_bucket_item_weight(const struct crush_bucket *b, int >> pos); >> +extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos); >> +extern void crush_calc_parents(struct crush_map *map); >> extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b); >> extern void crush_destroy_bucket_list(struct crush_bucket_list *b); >> extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b); >> @@ -181,9 +177,4 @@ extern void crush_destroy_bucket_straw(s >> extern void crush_destroy_bucket(struct crush_bucket *b); >> extern void crush_destroy(struct crush_map *map); >> >> -static inline int crush_calc_tree_node(int i) >> -{ >> -return ((i+1) << 1)-1; >> -} >> - >> #endif >> >> >> Xing >> -- >> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in >> the body of a message to majord...@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html > > > -- > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel
Re: Usage of CEPH FS versa HDFS for Hadoop: TeraSort benchmark performance comparison issue
Sorry for the delay; I've been out on vacation... On Fri, Dec 14, 2012 at 6:09 AM, Lachfeld, Jutta wrote: > I do not have the full output of "ceph pg dump" for that specific TeraSort > run, but here is a typical output after automatically preparing CEPH for a > benchmark run > (removed almost all lines in the long pg_stat table hoping that you do not > need them): Actually those were exactly what I was after; they include output on the total PG size and the number of objects so we can check on average size. :) If you'd like to do it yourself, look at some of the PGs which correspond to your data pool (the PG ids are all of the form 0.123a, and the number before the decimal point is the pool ID; by default you'll be looking for 0). On Fri, Dec 14, 2012 at 6:53 AM, Mark Nelson wrote: > The large block size may be an issue (at least with some of our default > tunable settings). You might want to try 4 or 16MB and see if it's any > better or worse. Unless you've got a specific reason to think this is busted, I am pretty confident it's not a problem. :) Jutta, do you have any finer-grained numbers than total run time (specifically, how much time is spent on data generation versus the read-and-sort for each FS)? HDFS doesn't do any journaling like Ceph does and the fact that the Ceph journal is in-memory might not be helping much since it's so small compared to the amount of data being written. -Greg -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/2] Librados aio stat
On 01/04/2013 05:01 AM, Filippos Giannakos wrote: Hi Team, Is there any progress or any comments regarding the librados aio stat patch ? They look good to me. I put them in the wip-librados-aio-stat branch. Can we add your signed-off-by to them? Thanks, Josh Best regards On 12/20/2012 10:05 PM, Filippos Giannakos wrote: Hi Team, Here is the patch with the changes, plus the tests you requested. Best regards, Filippos -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2] rbd: Support plain/json/xml output formatting
On 12/19/2012 04:17 AM, Stratos Psomadakis wrote: This patch renames the --format option to --image-format, for specifying the RBD image format, and uses --format to specify the output formatting (to be consistent with the other ceph tools). To avoid breaking backwards compatibility with existing scripts, rbd will still accept --format [1|2] for the image format, but will print a warning message, noting its use is deprecated. The rbd subcommands that support the new --format option are : ls, info, snap list, children, showmapped, lock list. Signed-off-by: Stratos Psomadakis --- Hi, this is the updated version of the patch. I renamed --format option to --image-format, and used --format to specify the output formatting, as you suggested. I also implemented some basic error checking on the --format input, and modified the rbd subcommands you mentioned, to support plain/json/xml output formatting. Although, I'm not sure if the json and xml output of those commands is what you'd want. The style issues should also be resolved now. Let me know what you think. There were still some style issues (missing braces, long lines, and spaces instead of tabs), but I squashed fixes to those into the wip-rbd-formatted-output branch. I changed the output a bit to use ints, arrays, etc. where it seemed appropriate. The tests aren't quite done yet, but does this new json output look good to you? Josh -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/6] fix build and packaging issues
No Problem. Cheers, Gary On Jan 4, 2013, at 10:00 AM, Gregory Farnum wrote: > Thanks! > > Gary, can you pull these into a branch and do some before-and-after > package comparisons on our systems (for the different distros in > gitbuilder) and then merge into master? > -Greg > > On Fri, Jan 4, 2013 at 9:51 AM, Danny Al-Gaaf wrote: >> This set of patches contains fixes for some build and packaging issues. >> >> Danny Al-Gaaf (6): >> src/java/Makefile.am: fix default java dir >> ceph.spec.in: fix handling of java files >> ceph.spec.in: rename libcephfs-java package to cephfs-java >> ceph.spec.in: fix libcephfs-jni package name >> configure.ac: remove AC_PROG_RANLIB >> configure.ac: change junit4 handling >> >> ceph.spec.in | 21 + >> configure.ac | 8 +--- >> src/java/Makefile.am | 6 +++--- >> 3 files changed, 17 insertions(+), 18 deletions(-) >> >> -- >> 1.8.0.2 >> >> -- >> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in >> the body of a message to majord...@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/6] fix build and packaging issues
Thanks! Gary, can you pull these into a branch and do some before-and-after package comparisons on our systems (for the different distros in gitbuilder) and then merge into master? -Greg On Fri, Jan 4, 2013 at 9:51 AM, Danny Al-Gaaf wrote: > This set of patches contains fixes for some build and packaging issues. > > Danny Al-Gaaf (6): > src/java/Makefile.am: fix default java dir > ceph.spec.in: fix handling of java files > ceph.spec.in: rename libcephfs-java package to cephfs-java > ceph.spec.in: fix libcephfs-jni package name > configure.ac: remove AC_PROG_RANLIB > configure.ac: change junit4 handling > > ceph.spec.in | 21 + > configure.ac | 8 +--- > src/java/Makefile.am | 6 +++--- > 3 files changed, 17 insertions(+), 18 deletions(-) > > -- > 1.8.0.2 > > -- > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/2] Librados aio stat
Sorry, I missed this second set of patches when they came through over the break. We'll try to get them merged into master shortly. Thanks! sage On Fri, 4 Jan 2013, Filippos Giannakos wrote: > Hi Team, > > Is there any progress or any comments regarding the librados aio stat > patch ? > > Best regards > > On 12/20/2012 10:05 PM, Filippos Giannakos wrote: > > Hi Team, > > > > Here is the patch with the changes, plus the tests you requested. > > > > Best regards, > > Filippos > > > -- > Filippos. > -- > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > > -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/6] fix build and packaging issues
This set of patches contains fixes for some build and packaging issues. Danny Al-Gaaf (6): src/java/Makefile.am: fix default java dir ceph.spec.in: fix handling of java files ceph.spec.in: rename libcephfs-java package to cephfs-java ceph.spec.in: fix libcephfs-jni package name configure.ac: remove AC_PROG_RANLIB configure.ac: change junit4 handling ceph.spec.in | 21 + configure.ac | 8 +--- src/java/Makefile.am | 6 +++--- 3 files changed, 17 insertions(+), 18 deletions(-) -- 1.8.0.2 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/6] configure.ac: remove AC_PROG_RANLIB
Remove already comment out AC_PROG_RANLIB to get rid of warning: libtoolize: `AC_PROG_RANLIB' is rendered obsolete by `LT_INIT' Signed-off-by: Danny Al-Gaaf --- configure.ac | 1 - 1 file changed, 1 deletion(-) diff --git a/configure.ac b/configure.ac index 07bfaa9..15430dd 100644 --- a/configure.ac +++ b/configure.ac @@ -51,7 +51,6 @@ AM_CONDITIONAL(FREEBSD, test x"$freebsd" = x"yes") # Checks for programs. AC_PROG_CXX #AC_PROG_CC -#AC_PROG_RANLIB AC_PROG_MAKE_SET AC_PROG_LIBTOOL -- 1.8.0.2 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/6] src/java/Makefile.am: fix default java dir
Fix default javadir in src/java/Makefile.am to $(datadir)/java since this is the common data dir for java files. Signed-off-by: Danny Al-Gaaf --- src/java/Makefile.am | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/java/Makefile.am b/src/java/Makefile.am index 55ff6c1..1f87885 100644 --- a/src/java/Makefile.am +++ b/src/java/Makefile.am @@ -43,7 +43,7 @@ $(JAVA_H): $(CEPH_PROXY) libcephfs.jar: $(CEPH_PROXY) $(JAR) cf $@ $(JAVA_CLASSES:%=-C java %) -javadir = $(libdir) +javadir = $(datadir)/java java_DATA = libcephfs.jar CLEANFILES = -rf java/com/ceph/fs/*.class $(JAVA_H) libcephfs.jar -- 1.8.0.2 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/6] ceph.spec.in: fix handling of java files
Fix handling of JAVA (jar) files. Don't move the files around in the install section since the related Makefile is fixed instead. Use %{_javadir} instead of /usr/share/java/ Signed-off-by: Danny Al-Gaaf --- ceph.spec.in | 7 ++- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/ceph.spec.in b/ceph.spec.in index 7aae6fe..cd93573 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -281,9 +281,6 @@ mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/ceph/ # Relocate java packages to expected locations in buildroot. mkdir -p $RPM_BUILD_ROOT/usr/lib/jni mv $RPM_BUILD_ROOT/usr/lib64/libcephfs_jni.so* $RPM_BUILD_ROOT/usr/lib/jni/. -mkdir -p $RPM_BUILD_ROOT/usr/share/java -mv $RPM_BUILD_ROOT/usr/lib64/libcephfs.jar $RPM_BUILD_ROOT/usr/share/java/. -mv $RPM_BUILD_ROOT/usr/lib64/libcephfs-test.jar $RPM_BUILD_ROOT/usr/share/java/. %clean rm -rf $RPM_BUILD_ROOT @@ -561,7 +558,7 @@ fi %files -n libcephfs-java %defattr(-,root,root,-) -/usr/share/java/libcephfs.jar -/usr/share/java/libcephfs-test.jar +%{_javadir}/libcephfs.jar +%{_javadir}/libcephfs-test.jar %changelog -- 1.8.0.2 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 6/6] configure.ac: change junit4 handling
Change handling of --with-debug and junit4. Add a new conditional HAVE_JUNIT4 to be able to build ceph-test package also if junit4 isn't available. In this case simply don't build libcephfs-test.jar, but the rest of the tools. Signed-off-by: Danny Al-Gaaf --- configure.ac | 7 +-- src/java/Makefile.am | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/configure.ac b/configure.ac index 15430dd..832054b 100644 --- a/configure.ac +++ b/configure.ac @@ -309,8 +309,10 @@ AS_IF([test "x$enable_cephfs_java" = "xyes"], [ junit4_jar=`find $dir -name junit4.jar | head -n 1` AS_IF([test -r "$junit4_jar"], [ EXTRA_CLASSPATH_JAR=`dirname $junit4_jar`/junit4.jar - AC_SUBST(EXTRA_CLASSPATH_JAR)], [ - AC_MSG_ERROR([Cannot find junit4.jar (apt-get install junit4)])])]) + AC_SUBST(EXTRA_CLASSPATH_JAR) + [have_junit4=1]], [ + AC_MSG_NOTICE([Cannot find junit4.jar (apt-get install junit4)]) + [have_junit4=0]])]) # Check for Java programs: javac, javah, jar PATH_save=$PATH @@ -341,6 +343,7 @@ AS_IF([test "x$enable_cephfs_java" = "xyes"], [ # Setup output var AC_SUBST(JDK_CPPFLAGS) ]) +AM_CONDITIONAL(HAVE_JUNIT4, [test "$have_junit4" = "1"]) # jni? 
# clear cache (from java above) -- this whole thing will get diff --git a/src/java/Makefile.am b/src/java/Makefile.am index 1f87885..788bea2 100644 --- a/src/java/Makefile.am +++ b/src/java/Makefile.am @@ -51,7 +51,7 @@ CLEANFILES = -rf java/com/ceph/fs/*.class $(JAVA_H) libcephfs.jar BUILT_SOURCES = $(JAVA_H) # build the tests if *both* --enable-cephfs-java and --with-debug were specifed -if WITH_DEBUG +if HAVE_JUNIT4 JAVA_TEST_CLASSES = $(JAVA_TEST_SRC:test/%.java=%.class) ESCAPED_JAVA_TEST_CLASSES = com/ceph/fs/CephAllTests\$$1.class @@ -68,5 +68,5 @@ libcephfs-test.jar: $(CEPH_TEST_PROXY) java_DATA += libcephfs-test.jar CLEANFILES += test/com/ceph/fs/*.class libcephfs-test.jar -endif # WITH_DEBUG +endif # HAVE_JUNIT4 (includes WITH_DEBUG) endif #ENABLE_CEPHFS_JAVA -- 1.8.0.2 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/6] ceph.spec.in: rename libcephfs-java package to cephfs-java
Rename the libcephfs-java package to cephfs-java since the package contains no (classic) library and RPMLINT complains about the name. Signed-off-by: Danny Al-Gaaf --- ceph.spec.in | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ceph.spec.in b/ceph.spec.in index cd93573..bfd030b 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -200,14 +200,14 @@ BuildRequires: java-devel This package contains the Java Native Interface library for CephFS Java bindings. -%package -n libcephfs-java +%package -n cephfs-java Summary: Java libraries for the Ceph File System. Group: System Environment/Libraries License: LGPL-2.0 Requires: java Requires: libcephfs-jni = %{version}-%{release}- BuildRequires: java-devel -%description -n libcephfs-java +%description -n cephfs-java This package contains the Java libraries for the Ceph File System. %if (0%{?centos} || 0%{?opensuse} || 0%{?suse_version}) @@ -556,7 +556,7 @@ fi %defattr(-,root,root,-) /usr/lib/jni/libcephfs_jni.so* -%files -n libcephfs-java +%files -n cephfs-java %defattr(-,root,root,-) %{_javadir}/libcephfs.jar %{_javadir}/libcephfs-test.jar -- 1.8.0.2 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/6] ceph.spec.in: fix libcephfs-jni package name
Rename libcephfs-jni to libcephfs_jni1 to reflect the SO name/version of the library and to prevent RPMLINT to complain about the naming. Signed-off-by: Danny Al-Gaaf --- ceph.spec.in | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ceph.spec.in b/ceph.spec.in index bfd030b..8eb4be8 100644 --- a/ceph.spec.in +++ b/ceph.spec.in @@ -189,14 +189,14 @@ Requires: libcephfs1 = %{version}-%{release} %description -n ceph-test This package contains Ceph benchmarks and test tools. -%package -n libcephfs-jni +%package -n libcephfs_jni1 Summary: Java Native Interface library for CephFS Java bindings. Group: System Environment/Libraries License: LGPL-2.0 Requires: java Requires: libcephfs1 = %{version}-%{release} BuildRequires: java-devel -%description -n libcephfs-jni +%description -n libcephfs_jni1 This package contains the Java Native Interface library for CephFS Java bindings. @@ -205,7 +205,7 @@ Summary:Java libraries for the Ceph File System. Group: System Environment/Libraries License: LGPL-2.0 Requires: java -Requires: libcephfs-jni = %{version}-%{release}- +Requires: libcephfs_jni1 = %{version}-%{release} BuildRequires: java-devel %description -n cephfs-java This package contains the Java libraries for the Ceph File System. @@ -552,7 +552,7 @@ fi %{_bindir}/tpbench %{_bindir}/xattr_bench -%files -n libcephfs-jni +%files -n libcephfs_jni1 %defattr(-,root,root,-) /usr/lib/jni/libcephfs_jni.so* -- 1.8.0.2 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH, v2] rbd: define and use rbd_warn()
Define a new function rbd_warn() that produces a boilerplate warning message, identifying in the resulting message the affected rbd device in the best way available. Use it in a few places that now use pr_warning(). Signed-off-by: Alex Elder Reviewed-by: Dan Mick --- v2: Added function name to warning message, as suggested by Dan. drivers/block/rbd.c | 46 -- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 414052d..86e60eb 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -294,6 +294,36 @@ static struct device rbd_root_dev = { .release = rbd_root_dev_release, }; +#define rbd_warn(rbd_dev, fmt, ...) \ + _rbd_warn(__func__, (rbd_dev), (fmt), ##__VA_ARGS__) + +static __printf(3, 4) void +_rbd_warn(const char *func, struct rbd_device *rbd_dev, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + if (!rbd_dev) + printk(KERN_WARNING RBD_DRV_NAME ": %s: %pV\n", func, &vaf); + else if (rbd_dev->disk) + printk(KERN_WARNING RBD_DRV_NAME ": %s: %s: %pV\n", + func, rbd_dev->disk->disk_name, &vaf); + else if (rbd_dev->spec && rbd_dev->spec->image_name) + printk(KERN_WARNING RBD_DRV_NAME ": %s: image %s: %pV\n", + func, rbd_dev->spec->image_name, &vaf); + else if (rbd_dev->spec && rbd_dev->spec->image_id) + printk(KERN_WARNING RBD_DRV_NAME ": %s: id %s: %pV\n", + func, rbd_dev->spec->image_id, &vaf); + else/* punt */ + printk(KERN_WARNING RBD_DRV_NAME ": %s: rbd_dev %p: %pV\n", + func, rbd_dev, &vaf); + va_end(args); +} + #ifdef RBD_DEBUG #define rbd_assert(expr) \ if (unlikely(!(expr))) {\ @@ -1407,8 +1437,8 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) (unsigned int) opcode); rc = rbd_dev_refresh(rbd_dev, &hver); if (rc) - pr_warning(RBD_DRV_NAME "%d got notification but failed to " - " update snaps: %d\n", rbd_dev->major, rc); + rbd_warn(rbd_dev, "got notification but failed to " + " update snaps: 
%d", rc); rbd_req_sync_notify_ack(rbd_dev, hver, notify_id); } @@ -1771,15 +1801,13 @@ rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version) goto out_err; if (WARN_ON((size_t) ret < size)) { ret = -ENXIO; - pr_warning("short header read for image %s" - " (want %zd got %d)\n", - rbd_dev->spec->image_name, size, ret); + rbd_warn(rbd_dev, "short header read (want %zd got %d)", + size, ret); goto out_err; } if (!rbd_dev_ondisk_valid(ondisk)) { ret = -ENXIO; - pr_warning("invalid header for image %s\n", - rbd_dev->spec->image_name); + rbd_warn(rbd_dev, "invalid header"); goto out_err; } @@ -2634,9 +2662,7 @@ static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev) if (name) rbd_dev->spec->image_name = (char *) name; else - pr_warning(RBD_DRV_NAME "%d " - "unable to get image name for image id %s\n", - rbd_dev->major, rbd_dev->spec->image_id); + rbd_warn(rbd_dev, "unable to get image name"); /* Look up the snapshot name. */ -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH REPOST 2/4] rbd: add warning messages for missing arguments
On 01/03/2013 07:10 PM, Dan Mick wrote: > Do you want to include in the message some kind of indication which > operation/function is involved? (this is definitely better, but even > better might be to add "rbd add" or "rbd_add_parse_args" to the msgs) Sure. This comment really applies to the previous patch. I'm sure I thought of doing that and I'm not sure any more why I did not. Maybe worried about lines getting too long? Or maybe I bumped into varargs problems? I don't know. I'll rename the rbd_warn() function _rbd_warn(), and will add a new first argument which is the function name. Then I'll define a new macro rbd_warn() that just calls _rbd_warn(), inserting __func__ as a first argument. -Alex > On 01/03/2013 11:11 AM, Alex Elder wrote: >> Tell the user (via dmesg) what was wrong with the arguments provided >> via /sys/bus/rbd/add. >> >> Signed-off-by: Alex Elder >> --- >> drivers/block/rbd.c | 24 >> 1 file changed, 16 insertions(+), 8 deletions(-) >> >> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c >> index 635b81d..31da8c5 100644 >> --- a/drivers/block/rbd.c >> +++ b/drivers/block/rbd.c >> @@ -3244,8 +3244,10 @@ static int rbd_add_parse_args(const char *buf, >> /* The first four tokens are required */ >> >> len = next_token(&buf); >> -if (!len) >> -return -EINVAL;/* Missing monitor address(es) */ >> +if (!len) { >> +rbd_warn(NULL, "no monitor address(es) provided"); >> +return -EINVAL; >> +} >> mon_addrs = buf; >> mon_addrs_size = len + 1; >> buf += len; >> @@ -3254,8 +3256,10 @@ static int rbd_add_parse_args(const char *buf, >> options = dup_token(&buf, NULL); >> if (!options) >> return -ENOMEM; >> -if (!*options) >> -goto out_err;/* Missing options */ >> +if (!*options) { >> +rbd_warn(NULL, "no options provided"); >> +goto out_err; >> +} >> >> spec = rbd_spec_alloc(); >> if (!spec) >> @@ -3264,14 +3268,18 @@ static int rbd_add_parse_args(const char *buf, >> spec->pool_name = dup_token(&buf, NULL); >> if (!spec->pool_name) >> goto out_mem; >> -if 
(!*spec->pool_name) >> -goto out_err;/* Missing pool name */ >> +if (!*spec->pool_name) { >> +rbd_warn(NULL, "no pool name provided"); >> +goto out_err; >> +} >> >> spec->image_name = dup_token(&buf, NULL); >> if (!spec->image_name) >> goto out_mem; >> -if (!*spec->image_name) >> -goto out_err;/* Missing image name */ >> +if (!*spec->image_name) { >> +rbd_warn(NULL, "no image name provided"); >> +goto out_err; >> +} >> >> /* >>* Snapshot name is optional; default is to use "-" >> -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: OSD memory leaks?
Hi Sam, Thanks for your answer and sorry the late reply. Unfortunately I can't get something out from the profiler, actually I do but I guess it doesn't show what is supposed to show... I will keep on trying this. Anyway yesterday I just thought that the problem might be due to some over usage of some OSDs. I was thinking that the distribution of the primary OSD might be uneven, this could have explained that some memory leaks are more important with some servers. At the end, the repartition seems even but while looking at the pg dump I found something interesting in the scrub column, timestamps from the last scrubbing operation matched with times showed on the graph. After this, I made some calculation, I compared the total number of scrubbing operation with the time range where memory leaks occurred. First of all check my setup: root@c2-ceph-01 ~ # ceph osd tree dumped osdmap tree epoch 859 # id weight type name up/down reweight -1 12 pool default -3 12 rack lc2_rack33 -2 3 host c2-ceph-01 0 1 osd.0 up 1 1 1 osd.1 up 1 2 1 osd.2 up 1 -4 3 host c2-ceph-04 10 1 osd.10 up 1 11 1 osd.11 up 1 9 1 osd.9 up 1 -5 3 host c2-ceph-02 3 1 osd.3 up 1 4 1 osd.4 up 1 5 1 osd.5 up 1 -6 3 host c2-ceph-03 6 1 osd.6 up 1 7 1 osd.7 up 1 8 1 osd.8 up 1 And there are the results: * Ceph node 1 which has the most important memory leak performed 1608 in total and 1059 during the time range where memory leaks occured * Ceph node 2, 1168 in total and 776 during the time range where memory leaks occured * Ceph node 3, 940 in total and 94 during the time range where memory leaks occurred * Ceph node 4, 899 in total and 191 during the time range where memory leaks occurred I'm still not entirely sure that the scrub operation causes the leak but the only relevant relation that I found... Could it be that the scrubbing process doesn't release memory? Btw I was wondering, how ceph decides at what time it should run the scrubbing operation? 
I know that it's once a day and controlled by the following options OPTION(osd_scrub_min_interval, OPT_FLOAT, 300) OPTION(osd_scrub_max_interval, OPT_FLOAT, 60*60*24) But how does ceph determine the time when the operation starts — probably during cluster creation? I just checked the options that control OSD scrubbing and found that by default: OPTION(osd_max_scrubs, OPT_INT, 1) So that might explain why only one OSD uses a lot of memory. My dirty workaround at the moment is to perform a check of memory use by every OSD and restart it if it uses more than 25% of the total memory. Also note that on ceph 1, 3 and 4 it's always one OSD that uses a lot of memory, for ceph 2 only the mem usage is high but almost the same for all the OSD process. Thank you in advance. -- Regards, Sébastien Han. On Wed, Dec 19, 2012 at 10:43 PM, Samuel Just wrote: > > Sorry, it's been very busy. The next step would to try to get a heap > dump. You can start a heap profile on osd N by: > > ceph osd tell N heap start_profiler > > and you can get it to dump the collected profile using > > ceph osd tell N heap dump. > > The dumps should show up in the osd log directory. > > Assuming the heap profiler is working correctly, you can look at the > dump using pprof in google-perftools. > > On Wed, Dec 19, 2012 at 8:37 AM, Sébastien Han > wrote: > > No more suggestions? :( > > -- > > Regards, > > Sébastien Han. > > > > > > On Tue, Dec 18, 2012 at 6:21 PM, Sébastien Han > > wrote: > >> Nothing terrific... > >> > >> Kernel logs from my clients are full of "libceph: osd4 > >> 172.20.11.32:6801 socket closed" > >> > >> I saw this somewhere on the tracker. > >> > >> Does this harm? > >> > >> Thanks. > >> > >> -- > >> Regards, > >> Sébastien Han. > >> > >> > >> > >> On Mon, Dec 17, 2012 at 11:55 PM, Samuel Just wrote: > >>> > >>> What is the workload like? 
> >>> -Sam > >>> > >>> On Mon, Dec 17, 2012 at 2:41 PM, Sébastien Han > >>> wrote: > >>> > Hi, > >>> > > >>> > No, I don't see nothing abnormal in the network stats. I don't see > >>> > anything in the logs... :( > >>> > The weird thing is that one node over 4 seems to take way more memory > >>> > than the others... > >>> > > >>> > -- > >>> > Regards, > >>> > Sébastien Han. > >>> > > >>> > > >>> > On Mon, Dec 17, 2012 at 11:31 PM, Sébastien Han > >>> > wrote: > >>> >> > >>> >> Hi, > >>> >> > >>> >> No, I don't see nothing abnormal in the network stats. I don't see > >>> >> anything in the logs... :( > >>> >> The weird thing is that one node over 4 seems to take way more memory > >>> >> than the others... > >>> >> > >>> >> -- > >>> >> Regards, > >>> >> Sébastien Han. > >>> >> > >>> >> > >>> >> > >>> >> On Mon, Dec 17, 2012 at 7:12 PM, Samuel Just > >>> >> wrote: > >>> >>> > >>> >>> Are you having network hiccups? There was a bug noticed recently that > >>> >>> could cause a memory leak if nodes are being marked up and down. > >>> >>> -Sam > >>> >>> > >>> >>> On Mon, Dec 17, 2012 at 12:28 AM, Sébastien Han > >>> >>> wrote: > >>> >>> > Hi guys, > >>> >>>
THE END, for now
That concludes my reposting of un-reviewed patches. I see I'm getting some reviews now, so I'll be building up a replacement testing branch with annotations to the commits indicating that. -Alex -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST] rbd: assign watch request more directly
Both rbd_req_sync_op() and rbd_do_request() have a "linger" parameter, which is the address of a pointer that should refer to the osd request structure used to issue a request to an osd. Only one case ever supplies a non-null "linger" argument: an CEPH_OSD_OP_WATCH start. And in that one case it is assigned &rbd_dev->watch_request. Within rbd_do_request() (where the assignment ultimately gets made) we know the rbd_dev and therefore its watch_request field. We also know whether the op being sent is CEPH_OSD_OP_WATCH start. Stop opaquely passing down the "linger" pointer, and instead just assign the value directly inside rbd_do_request() when it's needed. This makes it unnecessary for rbd_req_sync_watch() to make arrangements to hold a value that's not available until a bit later. This more clearly separates setting up a watch request from submitting it. Signed-off-by: Alex Elder --- drivers/block/rbd.c | 20 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 21fbf82..02002b1 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1158,7 +1158,6 @@ static int rbd_do_request(struct request *rq, int coll_index, void (*rbd_cb)(struct ceph_osd_request *, struct ceph_msg *), - struct ceph_osd_request **linger_req, u64 *ver) { struct ceph_osd_client *osdc; @@ -1210,9 +1209,9 @@ static int rbd_do_request(struct request *rq, ceph_osdc_build_request(osd_req, ofs, len, 1, op, snapc, snapid, &mtime); - if (linger_req) { + if (op->op == CEPH_OSD_OP_WATCH && op->watch.flag) { ceph_osdc_set_request_linger(osdc, osd_req); - *linger_req = osd_req; + rbd_dev->watch_request = osd_req; } ret = ceph_osdc_start_request(osdc, osd_req, false); @@ -1296,7 +1295,6 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, const char *object_name, u64 ofs, u64 inbound_size, char *inbound, - struct ceph_osd_request **linger_req, u64 *ver) { int ret; @@ -1317,7 +1315,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, op, 
NULL, 0, NULL, - linger_req, ver); + ver); if (ret < 0) goto done; @@ -1383,7 +1381,7 @@ static int rbd_do_op(struct request *rq, flags, op, coll, coll_index, -rbd_req_cb, 0, NULL); +rbd_req_cb, NULL); if (ret < 0) rbd_coll_end_req_index(rq, coll, coll_index, (s32) ret, seg_len); @@ -1410,7 +1408,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev, return -ENOMEM; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, - op, object_name, ofs, len, buf, NULL, ver); + op, object_name, ofs, len, buf, ver); rbd_osd_req_op_destroy(op); return ret; @@ -1436,7 +1434,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, CEPH_OSD_FLAG_READ, op, NULL, 0, - rbd_simple_req_cb, 0, NULL); + rbd_simple_req_cb, NULL); rbd_osd_req_op_destroy(op); @@ -1469,7 +1467,6 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) */ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) { - struct ceph_osd_request **linger_req = NULL; struct ceph_osd_req_op *op; int ret = 0; @@ -1481,7 +1478,6 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) &rbd_dev->watch_event); if (ret < 0) return ret; - linger_req = &rbd_dev->watch_request; } else { rbd_assert(rbd_dev->watch_request != NULL); } @@ -1493,7 +1489,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, op, rbd_dev->header_name, - 0, 0, NULL, linger_req, NULL); + 0, 0, NULL, NULL); /* Cancel the event if we're tearing down, or on error */ @@ -1537,7 +1533,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, ret = rbd_req_sync_o
[PATCH REPOST 6/6] rbd: move remaining osd op setup into rbd_osd_req_op_create()
The two remaining osd ops used by rbd are CEPH_OSD_OP_WATCH and CEPH_OSD_OP_NOTIFY_ACK. Move the setup of those operations into rbd_osd_req_op_create(), and get rid of rbd_create_rw_op() and rbd_destroy_op(). Signed-off-by: Alex Elder --- drivers/block/rbd.c | 68 --- 1 file changed, 27 insertions(+), 41 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 9f41c32..21fbf82 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1027,24 +1027,6 @@ out_err: return NULL; } -static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u64 ofs, u64 len) -{ - struct ceph_osd_req_op *op; - - op = kzalloc(sizeof (*op), GFP_NOIO); - if (!op) - return NULL; - - op->op = opcode; - - return op; -} - -static void rbd_destroy_op(struct ceph_osd_req_op *op) -{ - kfree(op); -} - struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) { struct ceph_osd_req_op *op; @@ -1087,6 +1069,16 @@ struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) op->cls.indata_len = (u32) size; op->payload_len += size; break; + case CEPH_OSD_OP_NOTIFY_ACK: + case CEPH_OSD_OP_WATCH: + /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */ + /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */ + op->watch.cookie = va_arg(args, u64); + op->watch.ver = va_arg(args, u64); + op->watch.ver = cpu_to_le64(op->watch.ver); /* XXX */ + if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int)) + op->watch.flag = (u8) 1; + break; default: rbd_warn(NULL, "unsupported opcode %hu\n", opcode); kfree(op); @@ -1434,14 +1426,10 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, struct ceph_osd_req_op *op; int ret; - op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0, 0); + op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver); if (!op) return -ENOMEM; - op->watch.ver = cpu_to_le64(ver); - op->watch.cookie = notify_id; - op->watch.flag = 0; - ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, rbd_dev->header_name, 0, 0, NULL, NULL, 0, @@ -1450,7 +1438,8 
@@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, NULL, 0, rbd_simple_req_cb, 0, NULL); - rbd_destroy_op(op); + rbd_osd_req_op_destroy(op); + return ret; } @@ -1480,14 +1469,9 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) */ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) { - struct ceph_osd_req_op *op; struct ceph_osd_request **linger_req = NULL; - __le64 version = 0; - int ret; - - op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0, 0); - if (!op) - return -ENOMEM; + struct ceph_osd_req_op *op; + int ret = 0; if (start) { struct ceph_osd_client *osdc; @@ -1496,26 +1480,28 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev, &rbd_dev->watch_event); if (ret < 0) - goto done; - version = cpu_to_le64(rbd_dev->header.obj_version); + return ret; linger_req = &rbd_dev->watch_request; + } else { + rbd_assert(rbd_dev->watch_request != NULL); } - op->watch.ver = version; - op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); - op->watch.flag = (u8) start ? 1 : 0; - - ret = rbd_req_sync_op(rbd_dev, + op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH, + rbd_dev->watch_event->cookie, + rbd_dev->header.obj_version, start); + if (op) + ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, op, rbd_dev->header_name, 0, 0, NULL, linger_req, NULL); - if (!start || ret < 0) { + /* Cancel the event if we're tearing down, or on error */ + + if (!start || !op || ret < 0) { ceph_osdc_cancel_event(rbd_dev->watch_event); rbd_dev->watch_event = NULL; } -done: - rbd_destroy_op(op); + rbd_osd_req_op_destroy(op); return ret; } -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-
[PATCH REPOST 5/6] rbd: move call osd op setup into rbd_osd_req_op_create()
Move the initialization of the CEPH_OSD_OP_CALL operation into rbd_osd_req_op_create(). Signed-off-by: Alex Elder --- drivers/block/rbd.c | 48 ++-- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6fa6ba7..9f41c32 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -52,10 +52,12 @@ #defineSECTOR_SHIFT9 #defineSECTOR_SIZE (1ULL << SECTOR_SHIFT) -/* It might be useful to have this defined elsewhere too */ +/* It might be useful to have these defined elsewhere */ -#defineU32_MAX ((u32) (~0U)) -#defineU64_MAX ((u64) (~0ULL)) +#defineU8_MAX ((u8) (~0U)) +#defineU16_MAX ((u16) (~0U)) +#defineU32_MAX ((u32) (~0U)) +#defineU64_MAX ((u64) (~0ULL)) #define RBD_DRV_NAME "rbd" #define RBD_DRV_NAME_LONG "rbd (rados block device)" @@ -1047,6 +1049,7 @@ struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) { struct ceph_osd_req_op *op; va_list args; + size_t size; op = kzalloc(sizeof (*op), GFP_NOIO); if (!op) @@ -1063,6 +1066,27 @@ struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) 
if (opcode == CEPH_OSD_OP_WRITE) op->payload_len = op->extent.length; break; + case CEPH_OSD_OP_CALL: + /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */ + op->cls.class_name = va_arg(args, char *); + size = strlen(op->cls.class_name); + rbd_assert(size <= (size_t) U8_MAX); + op->cls.class_len = size; + op->payload_len = size; + + op->cls.method_name = va_arg(args, char *); + size = strlen(op->cls.method_name); + rbd_assert(size <= (size_t) U8_MAX); + op->cls.method_len = size; + op->payload_len += size; + + op->cls.argc = 0; + op->cls.indata = va_arg(args, void *); + size = va_arg(args, size_t); + rbd_assert(size <= (size_t) U32_MAX); + op->cls.indata_len = (u32) size; + op->payload_len += size; + break; default: rbd_warn(NULL, "unsupported opcode %hu\n", opcode); kfree(op); @@ -1510,9 +1534,6 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, u64 *ver) { struct ceph_osd_req_op *op; - int class_name_len = strlen(class_name); - int method_name_len = strlen(method_name); - int payload_size; int ret; /* @@ -1523,25 +1544,16 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, * the perspective of the server side) in the OSD request * operation. 
*/ - payload_size = class_name_len + method_name_len + outbound_size; - op = rbd_create_rw_op(CEPH_OSD_OP_CALL, 0, 0); + op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name, + method_name, outbound, outbound_size); if (!op) return -ENOMEM; - op->payload_len = payload_size; - - op->cls.class_name = class_name; - op->cls.class_len = (__u8) class_name_len; - op->cls.method_name = method_name; - op->cls.method_len = (__u8) method_name_len; - op->cls.argc = 0; - op->cls.indata = outbound; - op->cls.indata_len = outbound_size; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op, object_name, 0, inbound_size, inbound, NULL, ver); - rbd_destroy_op(op); + rbd_osd_req_op_destroy(op); dout("cls_exec returned %d\n", ret); return ret; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST 4/6] rbd: define generalized osd request op routines
Create a baseline function to encapsulate the creation of osd requests, along with a matching function to destroy them. For now this just duplicates what rbd_create_rw_op() does for read and write operations, but the next patches will expand on this. Since rbd_create_rw_op() is no longer used for read or write requests, the special handling in that function for those types can be removed. Signed-off-by: Alex Elder --- drivers/block/rbd.c | 52 +++ 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 685d049..6fa6ba7 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1034,14 +1034,6 @@ static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u64 ofs, u64 len) return NULL; op->op = opcode; - if (opcode == CEPH_OSD_OP_READ || opcode == CEPH_OSD_OP_WRITE) { - op->extent.offset = ofs; - op->extent.length = len; - if (opcode == CEPH_OSD_OP_WRITE) { - rbd_assert(len <= (u64) U32_MAX); - op->payload_len = len; - } - } return op; } @@ -1051,6 +1043,42 @@ static void rbd_destroy_op(struct ceph_osd_req_op *op) kfree(op); } +struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...) 
+{ + struct ceph_osd_req_op *op; + va_list args; + + op = kzalloc(sizeof (*op), GFP_NOIO); + if (!op) + return NULL; + op->op = opcode; + va_start(args, opcode); + switch (opcode) { + case CEPH_OSD_OP_READ: + case CEPH_OSD_OP_WRITE: + /* rbd_osd_req_op_create(READ, offset, length) */ + /* rbd_osd_req_op_create(WRITE, offset, length) */ + op->extent.offset = va_arg(args, u64); + op->extent.length = va_arg(args, u64); + if (opcode == CEPH_OSD_OP_WRITE) + op->payload_len = op->extent.length; + break; + default: + rbd_warn(NULL, "unsupported opcode %hu\n", opcode); + kfree(op); + op = NULL; + break; + } + va_end(args); + + return op; +} + +static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op) +{ + kfree(op); +} + static void rbd_coll_end_req_index(struct request *rq, struct rbd_req_coll *coll, int index, @@ -1323,7 +1351,7 @@ static int rbd_do_op(struct request *rq, } ret = -ENOMEM; - op = rbd_create_rw_op(opcode, seg_ofs, seg_len); + op = rbd_osd_req_op_create(opcode, seg_ofs, seg_len); if (!op) goto done; @@ -1343,7 +1371,7 @@ static int rbd_do_op(struct request *rq, if (ret < 0) rbd_coll_end_req_index(rq, coll, coll_index, (s32) ret, seg_len); - rbd_destroy_op(op); + rbd_osd_req_op_destroy(op); done: kfree(seg_name); return ret; @@ -1361,13 +1389,13 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev, struct ceph_osd_req_op *op; int ret; - op = rbd_create_rw_op(CEPH_OSD_OP_READ, ofs, len); + op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, ofs, len); if (!op) return -ENOMEM; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op, object_name, ofs, len, buf, NULL, ver); - rbd_destroy_op(op); + rbd_osd_req_op_destroy(op); return ret; } -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST 3/6] rbd: initialize off and len in rbd_create_rw_op()
Move the initialization of a read or write operation's offset, length, and payload length fields into rbd_create_rw_op(). This will actually get removed in the next patch, but it finishes the consolidation of setting these fields at osd op creation time. Signed-off-by: Alex Elder --- drivers/block/rbd.c | 36 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 49e500f..685d049 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1025,19 +1025,23 @@ out_err: return NULL; } -static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u32 payload_len) +static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u64 ofs, u64 len) { struct ceph_osd_req_op *op; op = kzalloc(sizeof (*op), GFP_NOIO); if (!op) return NULL; - /* -* op extent offset and length will be set later on -* after ceph_calc_file_object_mapping(). -*/ + op->op = opcode; - op->payload_len = payload_len; + if (opcode == CEPH_OSD_OP_READ || opcode == CEPH_OSD_OP_WRITE) { + op->extent.offset = ofs; + op->extent.length = len; + if (opcode == CEPH_OSD_OP_WRITE) { + rbd_assert(len <= (u64) U32_MAX); + op->payload_len = len; + } + } return op; } @@ -1297,7 +1301,6 @@ static int rbd_do_op(struct request *rq, u64 seg_len; int ret; struct ceph_osd_req_op *op; - u32 payload_len; int opcode; int flags; u64 snapid; @@ -1312,22 +1315,17 @@ static int rbd_do_op(struct request *rq, opcode = CEPH_OSD_OP_WRITE; flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK; snapid = CEPH_NOSNAP; - payload_len = seg_len; } else { opcode = CEPH_OSD_OP_READ; flags = CEPH_OSD_FLAG_READ; rbd_assert(!snapc); snapid = rbd_dev->spec->snap_id; - payload_len = 0; } ret = -ENOMEM; - op = rbd_create_rw_op(opcode, payload_len); + op = rbd_create_rw_op(opcode, seg_ofs, seg_len); if (!op) goto done; - op->extent.offset = seg_ofs; - op->extent.length = seg_len; - op->payload_len = payload_len; /* we've taken care of segment sizes earlier when we cloned the bios. 
We should never have a segment @@ -1363,12 +1361,9 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev, struct ceph_osd_req_op *op; int ret; - op = rbd_create_rw_op(CEPH_OSD_OP_READ, 0); + op = rbd_create_rw_op(CEPH_OSD_OP_READ, ofs, len); if (!op) return -ENOMEM; - op->extent.offset = ofs; - op->extent.length = len; - op->payload_len = 0; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op, object_name, ofs, len, buf, NULL, ver); @@ -1387,7 +1382,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, struct ceph_osd_req_op *op; int ret; - op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0); + op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0, 0); if (!op) return -ENOMEM; @@ -1438,7 +1433,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) __le64 version = 0; int ret; - op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0); + op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0, 0); if (!op) return -ENOMEM; @@ -1501,9 +1496,10 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, * operation. */ payload_size = class_name_len + method_name_len + outbound_size; - op = rbd_create_rw_op(CEPH_OSD_OP_CALL, payload_size); + op = rbd_create_rw_op(CEPH_OSD_OP_CALL, 0, 0); if (!op) return -ENOMEM; + op->payload_len = payload_size; op->cls.class_name = class_name; op->cls.class_len = (__u8) class_name_len; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST 2/6] rbd: don't assign extent info in rbd_req_sync_op()
Move the assignment of the extent offset and length and payload length out of rbd_req_sync_op() and into its caller in the one spot where a read (and note--no write) operation might be initiated. Signed-off-by: Alex Elder --- drivers/block/rbd.c | 10 +++--- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4ce9981..49e500f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1262,13 +1262,6 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, if (IS_ERR(pages)) return PTR_ERR(pages); - if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { - op->extent.offset = ofs; - op->extent.length = inbound_size; - if (op->op == CEPH_OSD_OP_WRITE) - op->payload_len = inbound_size; - } - ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, object_name, ofs, inbound_size, NULL, pages, num_pages, @@ -1373,6 +1366,9 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev, op = rbd_create_rw_op(CEPH_OSD_OP_READ, 0); if (!op) return -ENOMEM; + op->extent.offset = ofs; + op->extent.length = len; + op->payload_len = 0; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op, object_name, ofs, len, buf, NULL, ver); -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST 1/6] rbd: don't assign extent info in rbd_do_request()
In rbd_do_request() there's a sort of last-minute assignment of the extent offset and length and payload length for read and write operations. Move those assignments into the caller (in those spots that might initiate read or write operations) Signed-off-by: Alex Elder --- drivers/block/rbd.c | 17 ++--- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index f0b30b2..4ce9981 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1156,13 +1156,6 @@ static int rbd_do_request(struct request *rq, osd_req->r_oid_len = strlen(osd_req->r_oid); osd_req->r_file_layout = rbd_dev->layout; /* struct */ - - if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { - op->extent.offset = ofs; - op->extent.length = len; - if (op->op == CEPH_OSD_OP_WRITE) - op->payload_len = len; - } osd_req->r_num_pages = calc_pages_for(ofs, len); osd_req->r_page_alignment = ofs & ~PAGE_MASK; @@ -1269,6 +1262,13 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, if (IS_ERR(pages)) return PTR_ERR(pages); + if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { + op->extent.offset = ofs; + op->extent.length = inbound_size; + if (op->op == CEPH_OSD_OP_WRITE) + op->payload_len = inbound_size; + } + ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, object_name, ofs, inbound_size, NULL, pages, num_pages, @@ -1332,6 +1332,9 @@ static int rbd_do_op(struct request *rq, op = rbd_create_rw_op(opcode, payload_len); if (!op) goto done; + op->extent.offset = seg_ofs; + op->extent.length = seg_len; + op->payload_len = payload_len; /* we've taken care of segment sizes earlier when we cloned the bios. We should never have a segment -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST 0/6] rbd: consolidate osd request setup
This series consolidates and encapsulates the setup of all osd requests into a single function which takes variable arguments appropriate for the type of request. The result groups together common code idioms and I think makes the spots that build these messages a little easier to read. -Alex [PATCH REPOST 1/6] rbd: don't assign extent info in rbd_do_request() [PATCH REPOST 2/6] rbd: don't assign extent info in rbd_req_sync_op() [PATCH REPOST 3/6] rbd: initialize off and len in rbd_create_rw_op() [PATCH REPOST 4/6] rbd: define generalized osd request op routines [PATCH REPOST 5/6] rbd: move call osd op setup into rbd_osd_req_op_create() [PATCH REPOST 6/6] rbd: move remaining osd op setup into rbd_osd_req_op_create() -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST 2/2] rbd: don't leak rbd_req for rbd_req_sync_notify_ack()
When rbd_req_sync_notify_ack() calls rbd_do_request() it supplies rbd_simple_req_cb() as its callback function. Because the callback is supplied, an rbd_req structure gets allocated and populated so it can be used by the callback. However rbd_simple_req_cb() is not freeing (or even using) the rbd_req structure, so it's getting leaked. Since rbd_simple_req_cb() has no need for the rbd_req structure, just avoid allocating one for this case. Of the three calls to rbd_do_request(), only the one from rbd_do_op() needs the rbd_req structure, and that call can be distinguished from the other two because it supplies a non-null rbd_collection pointer. So fix this leak by only allocating the rbd_req structure if a non-null "coll" value is provided to rbd_do_request(). Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 17bf0b4..f0b30b2 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1135,7 +1135,7 @@ static int rbd_do_request(struct request *rq, bio_get(osd_req->r_bio); } - if (rbd_cb) { + if (coll) { ret = -ENOMEM; rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO); if (!rbd_req) @@ -1146,7 +1146,7 @@ static int rbd_do_request(struct request *rq, rbd_req->pages = pages; rbd_req->len = len; rbd_req->coll = coll; - rbd_req->coll_index = coll ? coll_index : 0; + rbd_req->coll_index = coll_index; } osd_req->r_callback = rbd_cb; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST 1/2] rbd: don't leak rbd_req on synchronous requests
When rbd_do_request() is called it allocates and populates an rbd_req structure to hold information about the osd request to be sent. This is done for the benefit of the callback function (in particular, rbd_req_cb()), which uses this in processing when the request completes. Synchronous requests provide no callback function, in which case rbd_do_request() waits for the request to complete before returning. This case is not handling the needed free of the rbd_req structure like it should, so it is getting leaked. Note however that the synchronous case has no need for the rbd_req structure at all. So rather than simply freeing this structure for synchronous requests, just don't allocate it to begin with. Signed-off-by: Alex Elder Reviewed-by: Josh Durgin --- drivers/block/rbd.c | 48 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c1e5f24..17bf0b4 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1113,20 +1113,11 @@ static int rbd_do_request(struct request *rq, struct ceph_osd_request **linger_req, u64 *ver) { + struct ceph_osd_client *osdc; struct ceph_osd_request *osd_req; - int ret; + struct rbd_request *rbd_req = NULL; struct timespec mtime = CURRENT_TIME; - struct rbd_request *rbd_req; - struct ceph_osd_client *osdc; - - rbd_req = kzalloc(sizeof(*rbd_req), GFP_NOIO); - if (!rbd_req) - return -ENOMEM; - - if (coll) { - rbd_req->coll = coll; - rbd_req->coll_index = coll_index; - } + int ret; dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n", object_name, (unsigned long long) ofs, @@ -1134,10 +1125,8 @@ static int rbd_do_request(struct request *rq, osdc = &rbd_dev->rbd_client->client->osdc; osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO); - if (!osd_req) { - ret = -ENOMEM; - goto done_pages; - } + if (!osd_req) + return -ENOMEM; osd_req->r_flags = flags; osd_req->r_pages = pages; @@ -1145,13 +1134,22 @@ static int rbd_do_request(struct request *rq, 
osd_req->r_bio = bio; bio_get(osd_req->r_bio); } - osd_req->r_callback = rbd_cb; - rbd_req->rq = rq; - rbd_req->bio = bio; - rbd_req->pages = pages; - rbd_req->len = len; + if (rbd_cb) { + ret = -ENOMEM; + rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO); + if (!rbd_req) + goto done_osd_req; + + rbd_req->rq = rq; + rbd_req->bio = bio; + rbd_req->pages = pages; + rbd_req->len = len; + rbd_req->coll = coll; + rbd_req->coll_index = coll ? coll_index : 0; + } + osd_req->r_callback = rbd_cb; osd_req->r_priv = rbd_req; strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid)); @@ -1193,10 +1191,12 @@ static int rbd_do_request(struct request *rq, return ret; done_err: - bio_chain_put(rbd_req->bio); - ceph_osdc_put_request(osd_req); -done_pages: + if (bio) + bio_chain_put(osd_req->r_bio); kfree(rbd_req); +done_osd_req: + ceph_osdc_put_request(osd_req); + return ret; } -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST 0/2] rbd: fix two leaks
When certain special osd requests are processed, they have data structures allocated that are not properly freed when the request is completed. This series fixes that. -Alex [PATCH REPOST 1/2] rbd: don't leak rbd_req on synchronous requests [PATCH REPOST 2/2] rbd: don't leak rbd_req for rbd_req_sync_notify_ack() -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST] rbd: combine rbd sync watch/unwatch functions
The rbd_req_sync_watch() and rbd_req_sync_unwatch() functions are nearly identical. Combine them into a single function with a flag indicating whether a watch is to be initiated or torn down. Signed-off-by: Alex Elder --- drivers/block/rbd.c | 81 +-- 1 file changed, 27 insertions(+), 54 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 7c35608..c1e5f24 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1429,74 +1429,48 @@ static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) } /* - * Request sync osd watch + * Request sync osd watch/unwatch. The value of "start" determines + * whether a watch request is being initiated or torn down. */ -static int rbd_req_sync_watch(struct rbd_device *rbd_dev) +static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start) { struct ceph_osd_req_op *op; - struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + struct ceph_osd_request **linger_req = NULL; + __le64 version = 0; int ret; op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0); if (!op) return -ENOMEM; - ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, -(void *)rbd_dev, &rbd_dev->watch_event); - if (ret < 0) - goto fail; - - op->watch.ver = cpu_to_le64(rbd_dev->header.obj_version); - op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); - op->watch.flag = 1; - - ret = rbd_req_sync_op(rbd_dev, - CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - op, - rbd_dev->header_name, - 0, 0, NULL, - &rbd_dev->watch_request, NULL); - - if (ret < 0) - goto fail_event; - - rbd_destroy_op(op); - return 0; - -fail_event: - ceph_osdc_cancel_event(rbd_dev->watch_event); - rbd_dev->watch_event = NULL; -fail: - rbd_destroy_op(op); - return ret; -} - -/* - * Request sync osd unwatch - */ -static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) -{ - struct ceph_osd_req_op *op; - int ret; + if (start) { + struct ceph_osd_client *osdc; - op = rbd_create_rw_op(CEPH_OSD_OP_WATCH, 0); - if (!op) - return -ENOMEM; + osdc = 
&rbd_dev->rbd_client->client->osdc; + ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev, + &rbd_dev->watch_event); + if (ret < 0) + goto done; + version = cpu_to_le64(rbd_dev->header.obj_version); + linger_req = &rbd_dev->watch_request; + } - op->watch.ver = 0; + op->watch.ver = version; op->watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); - op->watch.flag = 0; + op->watch.flag = (u8) start ? 1 : 0; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - op, - rbd_dev->header_name, - 0, 0, NULL, NULL, NULL); - + op, rbd_dev->header_name, + 0, 0, NULL, linger_req, NULL); + if (!start || ret < 0) { + ceph_osdc_cancel_event(rbd_dev->watch_event); + rbd_dev->watch_event = NULL; + } +done: rbd_destroy_op(op); - ceph_osdc_cancel_event(rbd_dev->watch_event); - rbd_dev->watch_event = NULL; + return ret; } @@ -3031,7 +3005,7 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev) int ret, rc; do { - ret = rbd_req_sync_watch(rbd_dev); + ret = rbd_req_sync_watch(rbd_dev, 1); if (ret == -ERANGE) { rc = rbd_dev_refresh(rbd_dev, NULL); if (rc < 0) @@ -3750,8 +3724,7 @@ static void rbd_dev_release(struct device *dev) rbd_dev->watch_request); } if (rbd_dev->watch_event) - rbd_req_sync_unwatch(rbd_dev); - + rbd_req_sync_watch(rbd_dev, 0); /* clean up and free blkdev */ rbd_free_disk(rbd_dev); -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST] rbd: use a common layout for each device
Each osd message includes a layout structure, and for rbd it is always the same (at least for osd's in a given pool). Initialize a layout structure when an rbd_dev gets created and just copy that into osd requests for the rbd image. Replace an assertion that was done when initializing the layout structures with code that catches and handles anything that would trigger the assertion as soon as it is identified. This precludes that (bad) condition from ever occurring. Signed-off-by: Alex Elder --- drivers/block/rbd.c | 34 +++--- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 072608e..7c35608 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -235,6 +235,8 @@ struct rbd_device { char*header_name; + struct ceph_file_layout layout; + struct ceph_osd_event *watch_event; struct ceph_osd_request *watch_request; @@ -1091,16 +1093,6 @@ static void rbd_coll_end_req(struct rbd_request *rbd_req, ret, len); } -static void rbd_layout_init(struct ceph_file_layout *layout, u64 pool_id) -{ - memset(layout, 0, sizeof (*layout)); - layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - layout->fl_stripe_count = cpu_to_le32(1); - layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - rbd_assert(pool_id <= (u64) U32_MAX); - layout->fl_pg_pool = cpu_to_le32((u32) pool_id); -} - /* * Send ceph osd request */ @@ -1165,7 +1157,7 @@ static int rbd_do_request(struct request *rq, strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid)); osd_req->r_oid_len = strlen(osd_req->r_oid); - rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); + osd_req->r_file_layout = rbd_dev->layout; /* struct */ if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { op->extent.offset = ofs; @@ -2295,6 +2287,13 @@ struct rbd_device *rbd_dev_create(struct rbd_client *rbdc, rbd_dev->spec = spec; rbd_dev->rbd_client = rbdc; + /* Initialize the layout used for all rbd requests */ + + 
rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); + rbd_dev->layout.fl_stripe_count = cpu_to_le32(1); + rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); + rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id); + return rbd_dev; } @@ -2549,6 +2548,12 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) if (parent_spec->pool_id == CEPH_NOPOOL) goto out; /* No parent? No problem. */ + /* The ceph file layout needs to fit pool id in 32 bits */ + + ret = -EIO; + if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX)) + goto out; + image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL); if (IS_ERR(image_id)) { ret = PTR_ERR(image_id); @@ -3678,6 +3683,13 @@ static ssize_t rbd_add(struct bus_type *bus, goto err_out_client; spec->pool_id = (u64) rc; + /* The ceph file layout needs to fit pool id in 32 bits */ + + if (WARN_ON(spec->pool_id > (u64) U32_MAX)) { + rc = -EIO; + goto err_out_client; + } + rbd_dev = rbd_dev_create(rbdc, spec); if (!rbd_dev) goto err_out_client; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST 3/3] rbd: don't bother calculating file mapping
When rbd_do_request() has a request to process it initializes a ceph file layout structure and uses it to compute offsets and limits for the range of the request using ceph_calc_file_object_mapping(). The layout used is fixed, and is based on RBD_MAX_OBJ_ORDER (30). It sets the layout's object size and stripe unit to be 1 GB (2^30), and sets the stripe count to be 1. The job of ceph_calc_file_object_mapping() is to determine which of a sequence of objects will contain data covered by the range, and within that object, at what offset the range starts. It also truncates the length of the range at the end of the selected object if necessary. This is needed for ceph fs, but for rbd it really serves no purpose. It does its own blocking of images into objects, each of which is (1 << obj_order) in size, and as a result it ignores the "bno" value returned by ceph_calc_file_object_mapping(). In addition, by the time a request has reached this function, it is already destined for a single rbd object, and its length will not exceed that object's extent. Because of this, and because the mapping will result in blocking up the range using an integer multiple of the image's object order, ceph_calc_file_object_mapping() will never change the offset or length values defined by the request. In other words, this call is a big no-op for rbd data requests. There is one exception. We read the header object using this function, and in that case we will not have already limited the request size. However, the header is a single object (not a file or rbd image), and should not be broken into pieces anyway. So in fact we should *not* be calling ceph_calc_file_object_mapping() when operating on the header object. So... Don't call ceph_calc_file_object_mapping() in rbd_do_request(), because it is useless for image data and incorrect to do so for the image header.
Signed-off-by: Alex Elder --- drivers/block/rbd.c | 18 -- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e6db737..072608e 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1123,9 +1123,6 @@ static int rbd_do_request(struct request *rq, { struct ceph_osd_request *osd_req; int ret; - u64 bno; - u64 obj_off = 0; - u64 obj_len = 0; struct timespec mtime = CURRENT_TIME; struct rbd_request *rbd_req; struct ceph_osd_client *osdc; @@ -1169,19 +1166,12 @@ static int rbd_do_request(struct request *rq, osd_req->r_oid_len = strlen(osd_req->r_oid); rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); - ret = ceph_calc_file_object_mapping(&osd_req->r_file_layout, ofs, len, - &bno, &obj_off, &obj_len); - rbd_assert(ret == 0); - if (obj_len < len) { - dout(" skipping last %llu, final file extent %llu~%llu\n", -len - obj_len, ofs, obj_len); - len = obj_len; - } + if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { - op->extent.offset = obj_off; - op->extent.length = obj_len; + op->extent.offset = ofs; + op->extent.length = len; if (op->op == CEPH_OSD_OP_WRITE) - op->payload_len = obj_len; + op->payload_len = len; } osd_req->r_num_pages = calc_pages_for(ofs, len); osd_req->r_page_alignment = ofs & ~PAGE_MASK; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST 2/3] rbd: open code rbd_calc_raw_layout()
This patch gets rid of rbd_calc_raw_layout() by simply open coding it in its one caller. Signed-off-by: Alex Elder --- drivers/block/rbd.c | 55 +-- 1 file changed, 18 insertions(+), 37 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 46b52a4..e6db737 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1032,7 +1032,7 @@ static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u32 payload_len) return NULL; /* * op extent offset and length will be set later on -* in calc_raw_layout() +* after ceph_calc_file_object_mapping(). */ op->op = opcode; op->payload_len = payload_len; @@ -1101,40 +1101,6 @@ static void rbd_layout_init(struct ceph_file_layout *layout, u64 pool_id) layout->fl_pg_pool = cpu_to_le32((u32) pool_id); } -int rbd_calc_raw_layout(struct ceph_file_layout *layout, - u64 off, u64 *plen, u64 *bno, - struct ceph_osd_request *req, - struct ceph_osd_req_op *op) -{ - u64 orig_len = *plen; - u64 objoff, objlen;/* extent in object */ - int r; - - /* object extent? 
*/ - r = ceph_calc_file_object_mapping(layout, off, orig_len, bno, - &objoff, &objlen); - if (r < 0) - return r; - if (objlen < orig_len) { - *plen = objlen; - dout(" skipping last %llu, final file extent %llu~%llu\n", -orig_len - *plen, off, *plen); - } - - if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { - op->extent.offset = objoff; - op->extent.length = objlen; - } - req->r_num_pages = calc_pages_for(off, *plen); - req->r_page_alignment = off & ~PAGE_MASK; - if (op->op == CEPH_OSD_OP_WRITE) - op->payload_len = *plen; - - dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", -*bno, objoff, objlen, req->r_num_pages); - return 0; -} - /* * Send ceph osd request */ @@ -1158,6 +1124,8 @@ static int rbd_do_request(struct request *rq, struct ceph_osd_request *osd_req; int ret; u64 bno; + u64 obj_off = 0; + u64 obj_len = 0; struct timespec mtime = CURRENT_TIME; struct rbd_request *rbd_req; struct ceph_osd_client *osdc; @@ -1201,9 +1169,22 @@ static int rbd_do_request(struct request *rq, osd_req->r_oid_len = strlen(osd_req->r_oid); rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); - ret = rbd_calc_raw_layout(&osd_req->r_file_layout, - ofs, &len, &bno, osd_req, op); + ret = ceph_calc_file_object_mapping(&osd_req->r_file_layout, ofs, len, + &bno, &obj_off, &obj_len); rbd_assert(ret == 0); + if (obj_len < len) { + dout(" skipping last %llu, final file extent %llu~%llu\n", +len - obj_len, ofs, obj_len); + len = obj_len; + } + if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { + op->extent.offset = obj_off; + op->extent.length = obj_len; + if (op->op == CEPH_OSD_OP_WRITE) + op->payload_len = obj_len; + } + osd_req->r_num_pages = calc_pages_for(ofs, len); + osd_req->r_page_alignment = ofs & ~PAGE_MASK; ceph_osdc_build_request(osd_req, ofs, len, 1, op, snapc, snapid, &mtime); -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html
[PATCH REPOST 1/3] rbd: pull in ceph_calc_raw_layout()
This is the first in a series of patches aimed at eliminating the use of ceph_calc_raw_layout() by rbd. It simply pulls in a copy of that function and renames it rbd_calc_raw_layout(). Signed-off-by: Alex Elder --- drivers/block/rbd.c | 36 +++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index a57d6e9..46b52a4 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1101,6 +1101,40 @@ static void rbd_layout_init(struct ceph_file_layout *layout, u64 pool_id) layout->fl_pg_pool = cpu_to_le32((u32) pool_id); } +int rbd_calc_raw_layout(struct ceph_file_layout *layout, + u64 off, u64 *plen, u64 *bno, + struct ceph_osd_request *req, + struct ceph_osd_req_op *op) +{ + u64 orig_len = *plen; + u64 objoff, objlen;/* extent in object */ + int r; + + /* object extent? */ + r = ceph_calc_file_object_mapping(layout, off, orig_len, bno, + &objoff, &objlen); + if (r < 0) + return r; + if (objlen < orig_len) { + *plen = objlen; + dout(" skipping last %llu, final file extent %llu~%llu\n", +orig_len - *plen, off, *plen); + } + + if (op->op == CEPH_OSD_OP_READ || op->op == CEPH_OSD_OP_WRITE) { + op->extent.offset = objoff; + op->extent.length = objlen; + } + req->r_num_pages = calc_pages_for(off, *plen); + req->r_page_alignment = off & ~PAGE_MASK; + if (op->op == CEPH_OSD_OP_WRITE) + op->payload_len = *plen; + + dout("calc_layout bno=%llx %llu~%llu (%d pages)\n", +*bno, objoff, objlen, req->r_num_pages); + return 0; +} + /* * Send ceph osd request */ @@ -1167,7 +1201,7 @@ static int rbd_do_request(struct request *rq, osd_req->r_oid_len = strlen(osd_req->r_oid); rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); - ret = ceph_calc_raw_layout(&osd_req->r_file_layout, + ret = rbd_calc_raw_layout(&osd_req->r_file_layout, ofs, &len, &bno, osd_req, op); rbd_assert(ret == 0); -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org 
More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST 0/3] rbd: no need for file mapping calculation
Currently every osd request submitted by the rbd code undergoes a file mapping operation, which is common with what the ceph file system uses. But some analysis shows that there is no need to do this for rbd, because it already takes care of its own blocking of image data into distinct objects. Removing this simplifies things. I especially think removing this improves things conceptually, removing a complex mapping operation from the I/O path. -Alex [PATCH REPOST 1/3] rbd: pull in ceph_calc_raw_layout() [PATCH REPOST 2/3] rbd: open code rbd_calc_raw_layout() [PATCH REPOST 3/3] rbd: don't bother calculating file mapping -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST] rbd: kill ceph_osd_req_op->flags
The flags field of struct ceph_osd_req_op is never used, so just get rid of it. Signed-off-by: Alex Elder --- include/linux/ceph/osd_client.h |1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 2b04d05..69287cc 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -157,7 +157,6 @@ struct ceph_osd_client { struct ceph_osd_req_op { u16 op; /* CEPH_OSD_OP_* */ - u32 flags;/* CEPH_OSD_FLAG_* */ union { struct { u64 offset, length; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST 4/4] rbd: assume single op in a request
We now know that every caller of rbd_req_sync_op() passes an array of exactly one operation, as evidenced by all callers passing 1 as its num_op argument. So get rid of that argument, assuming a single op. Similarly, we now know that all callers of rbd_do_request() pass 1 as the num_op value, so that parameter can be eliminated as well. Signed-off-by: Alex Elder --- drivers/block/rbd.c | 28 +--- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index d3472db..a57d6e9 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1113,8 +1113,7 @@ static int rbd_do_request(struct request *rq, struct page **pages, int num_pages, int flags, - unsigned int num_op, - struct ceph_osd_req_op *ops, + struct ceph_osd_req_op *op, struct rbd_req_coll *coll, int coll_index, void (*rbd_cb)(struct ceph_osd_request *, @@ -1143,7 +1142,7 @@ static int rbd_do_request(struct request *rq, (unsigned long long) len, coll, coll_index); osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, snapc, num_op, false, GFP_NOIO); + osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO); if (!osd_req) { ret = -ENOMEM; goto done_pages; } @@ -1169,10 +1168,10 @@ static int rbd_do_request(struct request *rq, rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); ret = ceph_calc_raw_layout(&osd_req->r_file_layout, - ofs, &len, &bno, osd_req, ops); + ofs, &len, &bno, osd_req, op); rbd_assert(ret == 0); - ceph_osdc_build_request(osd_req, ofs, len, num_op, ops, + ceph_osdc_build_request(osd_req, ofs, len, 1, op, snapc, snapid, &mtime); if (linger_req) { @@ -1255,8 +1254,7 @@ static void rbd_simple_req_cb(struct ceph_osd_request *osd_req, */ static int rbd_req_sync_op(struct rbd_device *rbd_dev, int flags, - unsigned int num_op, - struct ceph_osd_req_op *ops, + struct ceph_osd_req_op *op, const char *object_name, u64 ofs, u64 inbound_size, char *inbound, @@ -1267,7 +1265,7 @@ static int
rbd_req_sync_op(struct rbd_device *rbd_dev, struct page **pages; int num_pages; - rbd_assert(ops != NULL); + rbd_assert(op != NULL); num_pages = calc_pages_for(ofs, inbound_size); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); @@ -1278,7 +1276,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, object_name, ofs, inbound_size, NULL, pages, num_pages, flags, - num_op, ops, + op, NULL, 0, NULL, linger_req, ver); @@ -1348,7 +1346,7 @@ static int rbd_do_op(struct request *rq, bio, NULL, 0, flags, -1, op, +op, coll, coll_index, rbd_req_cb, 0, NULL); if (ret < 0) @@ -1377,7 +1375,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev, return -ENOMEM; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, - 1, op, object_name, ofs, len, buf, NULL, ver); + op, object_name, ofs, len, buf, NULL, ver); rbd_destroy_op(op); return ret; @@ -1405,7 +1403,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, rbd_dev->header_name, 0, 0, NULL, NULL, 0, CEPH_OSD_FLAG_READ, - 1, op, + op, NULL, 0, rbd_simple_req_cb, 0, NULL); @@ -1457,7 +1455,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev) ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - 1, op, + op, rbd_dev->header_name, 0, 0, NULL, &rbd_dev->watch_request, NULL); @@ -1494,7 +1492,7 @@ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) ret = rbd_req_sync_op(rbd_dev,
[PATCH REPOST 3/4] rbd: there is really only one op
Throughout the rbd code there are spots where it appears we can handle an osd request containing more than one osd request op. But that is only the way it appears. In fact, currently only one operation at a time can be supported, and supporting more than one will require much more than fleshing out the support that's there now. This patch changes names to make it perfectly clear that anywhere we're dealing with a block of ops, we're in fact dealing with exactly one of them. We'll be able to simplify some things as a result. When multiple op support is implemented, we can update things again accordingly. Signed-off-by: Alex Elder --- drivers/block/rbd.c | 118 --- 1 file changed, 56 insertions(+), 62 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 0e6cc67..d3472db 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1023,32 +1023,26 @@ out_err: return NULL; } -/* - * helpers for osd request op vectors. - */ -static struct ceph_osd_req_op *rbd_create_rw_ops(int num_op, - int opcode, u32 payload_len) +static struct ceph_osd_req_op *rbd_create_rw_op(int opcode, u32 payload_len) { - struct ceph_osd_req_op *ops; + struct ceph_osd_req_op *op; - ops = kzalloc(num_op * sizeof (*ops), GFP_NOIO); - if (!ops) + op = kzalloc(sizeof (*op), GFP_NOIO); + if (!op) return NULL; - - ops[0].op = opcode; - /* * op extent offset and length will be set later on * in calc_raw_layout() */ - ops[0].payload_len = payload_len; + op->op = opcode; + op->payload_len = payload_len; - return ops; + return op; } -static void rbd_destroy_ops(struct ceph_osd_req_op *ops) +static void rbd_destroy_op(struct ceph_osd_req_op *op) { - kfree(ops); + kfree(op); } static void rbd_coll_end_req_index(struct request *rq, @@ -1314,7 +1308,7 @@ static int rbd_do_op(struct request *rq, u64 seg_ofs; u64 seg_len; int ret; - struct ceph_osd_req_op *ops; + struct ceph_osd_req_op *op; u32 payload_len; int opcode; int flags; @@ -1340,8 +1334,8 @@ static int rbd_do_op(struct request 
*rq, } ret = -ENOMEM; - ops = rbd_create_rw_ops(1, opcode, payload_len); - if (!ops) + op = rbd_create_rw_op(opcode, payload_len); + if (!op) goto done; /* we've taken care of segment sizes earlier when we @@ -1354,13 +1348,13 @@ static int rbd_do_op(struct request *rq, bio, NULL, 0, flags, -1, ops, +1, op, coll, coll_index, rbd_req_cb, 0, NULL); if (ret < 0) rbd_coll_end_req_index(rq, coll, coll_index, (s32) ret, seg_len); - rbd_destroy_ops(ops); + rbd_destroy_op(op); done: kfree(seg_name); return ret; @@ -1375,16 +1369,16 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev, char *buf, u64 *ver) { - struct ceph_osd_req_op *ops; + struct ceph_osd_req_op *op; int ret; - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0); - if (!ops) + op = rbd_create_rw_op(CEPH_OSD_OP_READ, 0); + if (!op) return -ENOMEM; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, - 1, ops, object_name, ofs, len, buf, NULL, ver); - rbd_destroy_ops(ops); + 1, op, object_name, ofs, len, buf, NULL, ver); + rbd_destroy_op(op); return ret; } @@ -1396,26 +1390,26 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, u64 ver, u64 notify_id) { - struct ceph_osd_req_op *ops; + struct ceph_osd_req_op *op; int ret; - ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0); - if (!ops) + op = rbd_create_rw_op(CEPH_OSD_OP_NOTIFY_ACK, 0); + if (!op) return -ENOMEM; - ops[0].watch.ver = cpu_to_le64(ver); - ops[0].watch.cookie = notify_id; - ops[0].watch.flag = 0; + op->watch.ver = cpu_to_le64(ver); + op->watch.cookie = notify_id; + op->watch.flag = 0; ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, rbd_dev->header_name, 0, 0, NULL, NULL, 0, CEPH_OSD_FLAG_READ, - 1, ops, + 1, op, NULL, 0, rbd_simple_req_cb, 0, NULL); - rbd_destroy_ops(ops); + rbd_destro
[PATCH REPOST 2/4] libceph: pass num_op with ops
Both ceph_osdc_alloc_request() and ceph_osdc_build_request() are provided an array of ceph osd request operations. Rather than just passing the number of operations in the array, the caller is required to append an additional zeroed operation structure to signal the end of the array. All callers know the number of operations at the time these functions are called, so drop the silly zero entry and supply that number directly. As a result, get_num_ops() is no longer needed. This also means that ceph_osdc_alloc_request() never uses its ops argument, so that can be dropped. Also rbd_create_rw_ops() no longer needs to add one to reserve room for the additional op. Signed-off-by: Alex Elder --- drivers/block/rbd.c |9 include/linux/ceph/osd_client.h |3 ++- net/ceph/osd_client.c | 43 ++- 3 files changed, 22 insertions(+), 33 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index cee1334..0e6cc67 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1026,12 +1026,12 @@ out_err: /* * helpers for osd request op vectors. 
*/ -static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops, +static struct ceph_osd_req_op *rbd_create_rw_ops(int num_op, int opcode, u32 payload_len) { struct ceph_osd_req_op *ops; - ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO); + ops = kzalloc(num_op * sizeof (*ops), GFP_NOIO); if (!ops) return NULL; @@ -1149,7 +1149,7 @@ static int rbd_do_request(struct request *rq, (unsigned long long) len, coll, coll_index); osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, snapc, ops, false, GFP_NOIO); + osd_req = ceph_osdc_alloc_request(osdc, snapc, num_op, false, GFP_NOIO); if (!osd_req) { ret = -ENOMEM; goto done_pages; @@ -1178,7 +1178,8 @@ static int rbd_do_request(struct request *rq, ofs, &len, &bno, osd_req, ops); rbd_assert(ret == 0); - ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, snapid, &mtime); + ceph_osdc_build_request(osd_req, ofs, len, num_op, ops, + snapc, snapid, &mtime); if (linger_req) { ceph_osdc_set_request_linger(osdc, osd_req); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 75f56d3..2b04d05 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -214,12 +214,13 @@ extern int ceph_calc_raw_layout(struct ceph_file_layout *layout, extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, - struct ceph_osd_req_op *ops, + unsigned int num_op, bool use_mempool, gfp_t gfp_flags); extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, u64 len, + unsigned int num_op, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, u64 snap_id, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 150166b..06625fa 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -152,25 +152,14 @@ void ceph_osdc_release_request(struct kref *kref) } EXPORT_SYMBOL(ceph_osdc_release_request); -static int get_num_ops(struct ceph_osd_req_op *ops) -{ - 
int i = 0; - - while (ops[i].op) - i++; - - return i; -} - struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, - struct ceph_osd_req_op *ops, + unsigned int num_op, bool use_mempool, gfp_t gfp_flags) { struct ceph_osd_request *req; struct ceph_msg *msg; - int num_op = get_num_ops(ops); size_t msg_size = sizeof(struct ceph_osd_request_head); msg_size += num_op*sizeof(struct ceph_osd_op); @@ -309,7 +298,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req, * */ void ceph_osdc_build_request(struct ceph_osd_request *req, -u64 off, u64 len, +u64 off, u64 len, unsigned int num_op, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, u64 snap_id,
[PATCH REPOST 1/4] rbd: pass num_op with ops array
Add a num_op parameter to rbd_do_request() and rbd_req_sync_op() to indicate the number of entries in the array. The callers of these functions always know how many entries are in the array, so just pass that information down. This is in anticipation of eliminating the extra zero-filled entry in these ops arrays. Signed-off-by: Alex Elder --- drivers/block/rbd.c | 16 +--- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6b9643a..cee1334 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1119,6 +1119,7 @@ static int rbd_do_request(struct request *rq, struct page **pages, int num_pages, int flags, + unsigned int num_op, struct ceph_osd_req_op *ops, struct rbd_req_coll *coll, int coll_index, @@ -1259,6 +1260,7 @@ static void rbd_simple_req_cb(struct ceph_osd_request *osd_req, */ static int rbd_req_sync_op(struct rbd_device *rbd_dev, int flags, + unsigned int num_op, struct ceph_osd_req_op *ops, const char *object_name, u64 ofs, u64 inbound_size, @@ -1281,7 +1283,7 @@ static int rbd_req_sync_op(struct rbd_device *rbd_dev, object_name, ofs, inbound_size, NULL, pages, num_pages, flags, - ops, + num_op, ops, NULL, 0, NULL, linger_req, ver); @@ -1351,7 +1353,7 @@ static int rbd_do_op(struct request *rq, bio, NULL, 0, flags, -ops, +1, ops, coll, coll_index, rbd_req_cb, 0, NULL); if (ret < 0) @@ -1380,7 +1382,7 @@ static int rbd_req_sync_read(struct rbd_device *rbd_dev, return -ENOMEM; ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, - ops, object_name, ofs, len, buf, NULL, ver); + 1, ops, object_name, ofs, len, buf, NULL, ver); rbd_destroy_ops(ops); return ret; @@ -1408,7 +1410,7 @@ static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, rbd_dev->header_name, 0, 0, NULL, NULL, 0, CEPH_OSD_FLAG_READ, - ops, + 1, ops, NULL, 0, rbd_simple_req_cb, 0, NULL); @@ -1460,7 +1462,7 @@ static int rbd_req_sync_watch(struct rbd_device *rbd_dev) ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | 
CEPH_OSD_FLAG_ONDISK, - ops, + 1, ops, rbd_dev->header_name, 0, 0, NULL, &rbd_dev->watch_request, NULL); @@ -1497,7 +1499,7 @@ static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - ops, + 1, ops, rbd_dev->header_name, 0, 0, NULL, NULL, NULL); @@ -1548,7 +1550,7 @@ static int rbd_req_sync_exec(struct rbd_device *rbd_dev, ops[0].cls.indata = outbound; ops[0].cls.indata_len = outbound_size; - ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, ops, + ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, 1, ops, object_name, 0, inbound_size, inbound, NULL, ver); -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST 0/4] rbd: explicitly support only one osd op
An osd request can be made up of multiple "ops", all of which are completed (or not) transactionally. There is partial support for multiple ops in an rbd request in the rbd code, but it's incomplete and not even supported by the osd client or the messenger right now. I see three problems with this partial implementation: it gives a false impression of how things work; it complicates some code in some cases where it's not necessary; and it may constrain how one might pursue fully implementing multiple ops in a request to ways that don't fit well with how we want to do things. So this series just simplifies things, making it explicit that there is only one op in an kernel osd client request right now. -Alex [PATCH REPOST 1/4] rbd: pass num_op with ops array [PATCH REPOST 2/4] libceph: pass num_op with ops [PATCH REPOST 3/4] rbd: there is really only one op [PATCH REPOST 4/4] rbd: assume single op in a request -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST 6/6] libceph: don't set pages or bio in ceph_osdc_alloc_request()
Only one of the two callers of ceph_osdc_alloc_request() provides page or bio data for its payload. And essentially all that function was doing with those arguments was assigning them to fields in the osd request structure. Simplify ceph_osdc_alloc_request() by having the caller take care of making those assignments Signed-off-by: Alex Elder --- drivers/block/rbd.c |8 ++-- include/linux/ceph/osd_client.h |4 +--- net/ceph/osd_client.c | 15 ++- 3 files changed, 9 insertions(+), 18 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index fc8bfe6..6b9643a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1148,14 +1148,18 @@ static int rbd_do_request(struct request *rq, (unsigned long long) len, coll, coll_index); osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, snapc, ops, - false, GFP_NOIO, pages, bio); + osd_req = ceph_osdc_alloc_request(osdc, snapc, ops, false, GFP_NOIO); if (!osd_req) { ret = -ENOMEM; goto done_pages; } osd_req->r_flags = flags; + osd_req->r_pages = pages; + if (bio) { + osd_req->r_bio = bio; + bio_get(osd_req->r_bio); + } osd_req->r_callback = rbd_cb; rbd_req->rq = rq; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 6ddda5b..75f56d3 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -216,9 +216,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * struct ceph_snap_context *snapc, struct ceph_osd_req_op *ops, bool use_mempool, - gfp_t gfp_flags, - struct page **pages, - struct bio *bio); + gfp_t gfp_flags); extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, u64 len, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 3fe2960..150166b 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -166,9 +166,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, struct 
ceph_osd_req_op *ops, bool use_mempool, - gfp_t gfp_flags, - struct page **pages, - struct bio *bio) + gfp_t gfp_flags) { struct ceph_osd_request *req; struct ceph_msg *msg; @@ -229,13 +227,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, memset(msg->front.iov_base, 0, msg->front.iov_len); req->r_request = msg; - req->r_pages = pages; -#ifdef CONFIG_BLOCK - if (bio) { - req->r_bio = bio; - bio_get(req->r_bio); - } -#endif return req; } @@ -431,9 +422,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, } else ops[1].op = 0; - req = ceph_osdc_alloc_request(osdc, snapc, ops, -use_mempool, -GFP_NOFS, NULL, NULL); + req = ceph_osdc_alloc_request(osdc, snapc, ops, use_mempool, GFP_NOFS); if (!req) return ERR_PTR(-ENOMEM); req->r_flags = flags; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST 5/6] libceph: don't set flags in ceph_osdc_alloc_request()
The only thing ceph_osdc_alloc_request() really does with the flags value it is passed is assign it to the newly-created osd request structure. Do that in the caller instead. Both callers subsequently call ceph_osdc_build_request(), so have that function (instead of ceph_osdc_alloc_request()) issue a warning if a request comes through with neither the read nor write flags set. Signed-off-by: Alex Elder --- drivers/block/rbd.c |3 ++- include/linux/ceph/osd_client.h |1 - net/ceph/osd_client.c | 11 --- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 09bcf08..fc8bfe6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1148,13 +1148,14 @@ static int rbd_do_request(struct request *rq, (unsigned long long) len, coll, coll_index); osdc = &rbd_dev->rbd_client->client->osdc; - osd_req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, + osd_req = ceph_osdc_alloc_request(osdc, snapc, ops, false, GFP_NOIO, pages, bio); if (!osd_req) { ret = -ENOMEM; goto done_pages; } + osd_req->r_flags = flags; osd_req->r_callback = rbd_cb; rbd_req->rq = rq; diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index fe3a6e8..6ddda5b 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -213,7 +213,6 @@ extern int ceph_calc_raw_layout(struct ceph_file_layout *layout, struct ceph_osd_req_op *op); extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, - int flags, struct ceph_snap_context *snapc, struct ceph_osd_req_op *ops, bool use_mempool, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 1b55fbe..3fe2960 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -163,7 +163,6 @@ static int get_num_ops(struct ceph_osd_req_op *ops) } struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, - int flags, struct ceph_snap_context *snapc, struct ceph_osd_req_op *ops, bool use_mempool, @@ -200,10 
+199,6 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, INIT_LIST_HEAD(&req->r_req_lru_item); INIT_LIST_HEAD(&req->r_osd_item); - req->r_flags = flags; - - WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); - /* create reply message */ if (use_mempool) msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); @@ -339,6 +334,8 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, u64 data_len = 0; int i; + WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0); + head = msg->front.iov_base; head->snapid = cpu_to_le64(snap_id); op = (void *)(head + 1); @@ -434,12 +431,12 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, } else ops[1].op = 0; - req = ceph_osdc_alloc_request(osdc, flags, -snapc, ops, + req = ceph_osdc_alloc_request(osdc, snapc, ops, use_mempool, GFP_NOFS, NULL, NULL); if (!req) return ERR_PTR(-ENOMEM); + req->r_flags = flags; /* calculate max write size */ r = calc_layout(vino, layout, off, plen, req, ops); -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST 4/6] libceph: drop osdc from ceph_calc_raw_layout()
The osdc parameter to ceph_calc_raw_layout() is not used, so get rid of it. Consequently, the corresponding parameter in calc_layout() becomes unused, so get rid of that as well. Signed-off-by: Alex Elder --- drivers/block/rbd.c |2 +- include/linux/ceph/osd_client.h |3 +-- net/ceph/osd_client.c | 10 -- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 82a1460..09bcf08 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1168,7 +1168,7 @@ static int rbd_do_request(struct request *rq, osd_req->r_oid_len = strlen(osd_req->r_oid); rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); - ret = ceph_calc_raw_layout(osdc, &osd_req->r_file_layout, + ret = ceph_calc_raw_layout(&osd_req->r_file_layout, ofs, &len, &bno, osd_req, ops); rbd_assert(ret == 0); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 0e82a0a..fe3a6e8 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -207,8 +207,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); -extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc, - struct ceph_file_layout *layout, +extern int ceph_calc_raw_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, u64 *bno, struct ceph_osd_request *req, struct ceph_osd_req_op *op); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 75e57de..1b55fbe 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -38,8 +38,7 @@ static int op_has_extent(int op) op == CEPH_OSD_OP_WRITE); } -int ceph_calc_raw_layout(struct ceph_osd_client *osdc, - struct ceph_file_layout *layout, +int ceph_calc_raw_layout(struct ceph_file_layout *layout, u64 off, u64 *plen, u64 *bno, struct ceph_osd_request *req, struct ceph_osd_req_op *op) @@ -99,8 +98,7 @@ EXPORT_SYMBOL(ceph_calc_raw_layout); * * fill osd op in request message. 
*/ -static int calc_layout(struct ceph_osd_client *osdc, - struct ceph_vino vino, +static int calc_layout(struct ceph_vino vino, struct ceph_file_layout *layout, u64 off, u64 *plen, struct ceph_osd_request *req, @@ -109,7 +107,7 @@ static int calc_layout(struct ceph_osd_client *osdc, u64 bno; int r; - r = ceph_calc_raw_layout(osdc, layout, off, plen, &bno, req, op); + r = ceph_calc_raw_layout(layout, off, plen, &bno, req, op); if (r < 0) return r; @@ -444,7 +442,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, return ERR_PTR(-ENOMEM); /* calculate max write size */ - r = calc_layout(osdc, vino, layout, off, plen, req, ops); + r = calc_layout(vino, layout, off, plen, req, ops); if (r < 0) return ERR_PTR(r); req->r_file_layout = *layout; /* keep a copy */ -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST 3/6] libceph: drop snapid in ceph_calc_raw_layout()
A snapshot id must be provided to ceph_calc_raw_layout() even though it is not needed at all for calculating the layout. Where the snapshot id *is* needed is when building the request message for an osd operation. Drop the snapid parameter from ceph_calc_raw_layout() and pass that value instead in ceph_osdc_build_request(). Signed-off-by: Alex Elder --- drivers/block/rbd.c |4 ++-- include/linux/ceph/osd_client.h |2 +- net/ceph/osd_client.c | 14 -- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 1a241a7..82a1460 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1169,10 +1169,10 @@ static int rbd_do_request(struct request *rq, rbd_layout_init(&osd_req->r_file_layout, rbd_dev->spec->pool_id); ret = ceph_calc_raw_layout(osdc, &osd_req->r_file_layout, - snapid, ofs, &len, &bno, osd_req, ops); + ofs, &len, &bno, osd_req, ops); rbd_assert(ret == 0); - ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, &mtime); + ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, snapid, &mtime); if (linger_req) { ceph_osdc_set_request_linger(osdc, osd_req); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 4bfb458..0e82a0a 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -209,7 +209,6 @@ extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, - u64 snapid, u64 off, u64 *plen, u64 *bno, struct ceph_osd_request *req, struct ceph_osd_req_op *op); @@ -227,6 +226,7 @@ extern void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, u64 len, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, + u64 snap_id, struct timespec *mtime); extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 65b7d11..75e57de 100644 --- 
a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -40,18 +40,14 @@ static int op_has_extent(int op) int ceph_calc_raw_layout(struct ceph_osd_client *osdc, struct ceph_file_layout *layout, - u64 snapid, u64 off, u64 *plen, u64 *bno, struct ceph_osd_request *req, struct ceph_osd_req_op *op) { - struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; u64 orig_len = *plen; u64 objoff, objlen;/* extent in object */ int r; - reqhead->snapid = cpu_to_le64(snapid); - /* object extent? */ r = ceph_calc_file_object_mapping(layout, off, orig_len, bno, &objoff, &objlen); @@ -113,8 +109,7 @@ static int calc_layout(struct ceph_osd_client *osdc, u64 bno; int r; - r = ceph_calc_raw_layout(osdc, layout, vino.snap, off, -plen, &bno, req, op); + r = ceph_calc_raw_layout(osdc, layout, off, plen, &bno, req, op); if (r < 0) return r; @@ -332,7 +327,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req, void ceph_osdc_build_request(struct ceph_osd_request *req, u64 off, u64 len, struct ceph_osd_req_op *src_ops, -struct ceph_snap_context *snapc, +struct ceph_snap_context *snapc, u64 snap_id, struct timespec *mtime) { struct ceph_msg *msg = req->r_request; @@ -347,6 +342,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, int i; head = msg->front.iov_base; + head->snapid = cpu_to_le64(snap_id); op = (void *)(head + 1); p = (void *)(op + num_op); @@ -458,9 +454,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req->r_num_pages = calc_pages_for(page_align, *plen); req->r_page_alignment = page_align; - ceph_osdc_build_request(req, off, *plen, ops, - snapc, - mtime); + ceph_osdc_build_request(req, off, *plen, ops, snapc, vino.snap, mtime); return req; } -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST 2/6] libceph: pass length to ceph_calc_file_object_mapping()
ceph_calc_file_object_mapping() takes (among other things) a "file" offset and length, and based on the layout, determines the object number ("bno") backing the affected portion of the file's data and the offset into that object where the desired range begins. It also computes the size that should be used for the request--either the amount requested or something less if that would exceed the end of the object. This patch changes the input length parameter in this function so it is used only for input. That is, the argument will be passed by value rather than by address, so the value provided won't get updated by the function. The value would only get updated if the length would surpass the current object, and in that case the value it got updated to would be exactly that returned in *oxlen. Only one of the two callers is affected by this change. Update ceph_calc_raw_layout() so it records any updated value. Signed-off-by: Alex Elder --- fs/ceph/ioctl.c |2 +- include/linux/ceph/osdmap.h |2 +- net/ceph/osd_client.c |6 -- net/ceph/osdmap.c |9 - 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index 36549a4..3b22150 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -194,7 +194,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) return -EFAULT; down_read(&osdc->map_sem); - r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, + r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len, &dl.object_no, &dl.object_offset, &olen); if (r < 0) diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 5ea57ba..1f653e2 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -110,7 +110,7 @@ extern void ceph_osdmap_destroy(struct ceph_osdmap *map); /* calculate mapping of a file extent to an object */ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, -u64 off, u64 *plen, +u64 off, u64 len, u64 *bno, u64 *oxoff, 
u64 *oxlen); /* calculate mapping of object to a placement group */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index cdf40c6..65b7d11 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -53,13 +53,15 @@ int ceph_calc_raw_layout(struct ceph_osd_client *osdc, reqhead->snapid = cpu_to_le64(snapid); /* object extent? */ - r = ceph_calc_file_object_mapping(layout, off, plen, bno, + r = ceph_calc_file_object_mapping(layout, off, orig_len, bno, &objoff, &objlen); if (r < 0) return r; - if (*plen < orig_len) + if (objlen < orig_len) { + *plen = objlen; dout(" skipping last %llu, final file extent %llu~%llu\n", orig_len - *plen, off, *plen); + } if (op_has_extent(op->op)) { op->extent.offset = objoff; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index de73214..9dded22 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1010,7 +1010,7 @@ bad: * pass a stride back to the caller. */ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, - u64 off, u64 *plen, + u64 off, u64 len, u64 *ono, u64 *oxoff, u64 *oxlen) { @@ -1021,7 +1021,7 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, u32 su_per_object; u64 t, su_offset; - dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen, + dout("mapping %llu~%llu osize %u fl_su %u\n", off, len, osize, su); if (su == 0 || sc == 0) goto invalid; @@ -1054,11 +1054,10 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, /* * Calculate the length of the extent being written to the selected -* object. This is the minimum of the full length requested (plen) or +* object. This is the minimum of the full length requested (len) or * the remainder of the current stripe being written to. 
*/ - *oxlen = min_t(u64, *plen, su - su_offset); - *plen = *oxlen; + *oxlen = min_t(u64, len, su - su_offset); dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); return 0; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST 1/6] libceph: pass length to ceph_osdc_build_request()
The len argument to ceph_osdc_build_request() is set up to be passed by address, but that function never updates its value so there's no need to do this. Tighten up the interface by passing the length directly. Signed-off-by: Alex Elder --- drivers/block/rbd.c |2 +- include/linux/ceph/osd_client.h |2 +- net/ceph/osd_client.c |6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index cc12154..1a241a7 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1172,7 +1172,7 @@ static int rbd_do_request(struct request *rq, snapid, ofs, &len, &bno, osd_req, ops); rbd_assert(ret == 0); - ceph_osdc_build_request(osd_req, ofs, &len, ops, snapc, &mtime); + ceph_osdc_build_request(osd_req, ofs, len, ops, snapc, &mtime); if (linger_req) { ceph_osdc_set_request_linger(osdc, osd_req); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 61562c7..4bfb458 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -224,7 +224,7 @@ extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client * struct bio *bio); extern void ceph_osdc_build_request(struct ceph_osd_request *req, - u64 off, u64 *plen, + u64 off, u64 len, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, struct timespec *mtime); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 2adbfcc..cdf40c6 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -328,7 +328,7 @@ static void osd_req_encode_op(struct ceph_osd_request *req, * */ void ceph_osdc_build_request(struct ceph_osd_request *req, -u64 off, u64 *plen, +u64 off, u64 len, struct ceph_osd_req_op *src_ops, struct ceph_snap_context *snapc, struct timespec *mtime) @@ -382,7 +382,7 @@ void ceph_osdc_build_request(struct ceph_osd_request *req, if (flags & CEPH_OSD_FLAG_WRITE) { req->r_request->hdr.data_off = cpu_to_le16(off); - req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len); + 
req->r_request->hdr.data_len = cpu_to_le32(len + data_len); } else if (data_len) { req->r_request->hdr.data_off = 0; req->r_request->hdr.data_len = cpu_to_le32(data_len); @@ -456,7 +456,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, req->r_num_pages = calc_pages_for(page_align, *plen); req->r_page_alignment = page_align; - ceph_osdc_build_request(req, off, plen, ops, + ceph_osdc_build_request(req, off, *plen, ops, snapc, mtime); -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH REPOST 0/6] libceph: parameter cleanup
This series mostly cleans up parameters used by functions in libceph, in the osd client code. -Alex [PATCH REPOST 1/6] libceph: pass length to ceph_osdc_build_request() [PATCH REPOST 2/6] libceph: pass length to ceph_calc_file_object_mapping() [PATCH REPOST 3/6] libceph: drop snapid in ceph_calc_raw_layout() [PATCH REPOST 4/6] libceph: drop osdc from ceph_calc_raw_layout() [PATCH REPOST 5/6] libceph: don't set flags in ceph_osdc_alloc_request() [PATCH REPOST 6/6] libceph: don't set pages or bio in ceph_osdc_alloc_request() -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/2] Librados aio stat
Hi Team, Is there any progress or any comments regarding the librados aio stat patch? Best regards On 12/20/2012 10:05 PM, Filippos Giannakos wrote: Hi Team, Here is the patch with the changes, plus the tests you requested. Best regards, Filippos -- Filippos. -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/6] ceph: check mds_wanted for imported cap
From: "Yan, Zheng" The MDS may have incorrect wanted caps after importing caps. So the client should check the value mds has and send cap update if necessary. Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 10 ++ 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7e90299..16c10f8 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2425,10 +2425,12 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ceph_cap_string(used), ceph_cap_string(dirty)); if (wanted != le32_to_cpu(grant->wanted)) { - dout("mds wanted %s -> %s\n", -ceph_cap_string(le32_to_cpu(grant->wanted)), -ceph_cap_string(wanted)); - grant->wanted = cpu_to_le32(wanted); + dout("mds wanted = %s\n", +ceph_cap_string(le32_to_cpu(grant->wanted))); + /* imported cap may not have correct mds_wanted */ + if (cap == ci->i_auth_cap && + (wanted & ~(cap->mds_wanted | cap->issued))) + check_caps = 1; } cap->seq = seq; -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 6/6] ceph: don't acquire i_mutex ceph_vmtruncate_work
From: "Yan, Zheng" In commit 22cddde104, ceph_get_caps() was moved into ceph_write_begin(). So ceph_get_caps() can be called while i_mutex is locked. If there is pending vmtruncate, ceph_get_caps() will wait for it to complete, but ceph_vmtruncate_work() is blocked by the locked i_mutex. There are several places that call __ceph_do_pending_vmtruncate() without holding the i_mutex, I think it's OK to not acquire the i_mutex in ceph_vmtruncate_work() Signed-off-by: Yan, Zheng --- fs/ceph/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 81613bc..c895c7f 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1420,7 +1420,7 @@ out: /* - * called by trunc_wq; take i_mutex ourselves + * called by trunc_wq; * * We also truncate in a separate thread as well. */ @@ -1431,9 +1431,7 @@ static void ceph_vmtruncate_work(struct work_struct *work) struct inode *inode = &ci->vfs_inode; dout("vmtruncate_work %p\n", inode); - mutex_lock(&inode->i_mutex); __ceph_do_pending_vmtruncate(inode); - mutex_unlock(&inode->i_mutex); iput(inode); } -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/6] ceph: allow revoking duplicated caps issued by non-auth MDS
From: "Yan, Zheng" Allow revoking duplicated caps issued by non-auth MDS if these caps are also issued by auth MDS. Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 12 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index a9fe2d5..c90b245 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1468,7 +1468,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; - int file_wanted, used; + int file_wanted, used, cap_used; int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ int issued, implemented, want, retain, revoking, flushing = 0; int mds = -1; /* keep track of how far we've gone through i_caps list @@ -1577,6 +1577,10 @@ retry_locked: ceph_cap_string(cap->implemented), ceph_cap_string(revoking)); + cap_used = used; + if (ci->i_auth_cap && cap != ci->i_auth_cap) + cap_used &= ~ci->i_auth_cap->issued; + if (cap == ci->i_auth_cap && (cap->issued & CEPH_CAP_FILE_WR)) { /* request larger max_size from MDS? */ @@ -1601,7 +1605,7 @@ retry_locked: } /* completed revocation? going down and there are no caps? */ - if (revoking && (revoking & used) == 0) { + if (revoking && (revoking & cap_used) == 0) { dout("completed revocation of %s\n", ceph_cap_string(cap->implemented & ~cap->issued)); goto ack; @@ -1678,8 +1682,8 @@ ack: sent++; /* __send_cap drops i_ceph_lock */ - delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, - retain, flushing, NULL); + delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, + want, retain, flushing, NULL); goto retry; /* retake i_ceph_lock and restart our cap scan. */ } -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/6] ceph: allocate cap_release message when receiving cap import
From: "Yan, Zheng" When a client wants to release an imported cap, it's possible there is no reserved cap_release message in the corresponding mds session, so __queue_cap_release causes a kernel panic. Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index c90b245..7e90299 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2832,6 +2832,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, (unsigned)seq); + if (op == CEPH_CAP_OP_IMPORT) + ceph_add_cap_releases(mdsc, session); + /* lookup ino */ inode = ceph_find_inode(sb, vino); ci = ceph_inode(inode); -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/6] ceph: re-calculate truncate_size for strip object
From: "Yan, Zheng" Otherwise osd may truncate the object to larger size. Signed-off-by: Yan, Zheng --- net/ceph/osd_client.c | 8 1 file changed, 8 insertions(+) diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index eb9a444..267f183 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -76,8 +76,16 @@ int ceph_calc_raw_layout(struct ceph_osd_client *osdc, orig_len - *plen, off, *plen); if (op_has_extent(op->op)) { + u32 osize = le32_to_cpu(layout->fl_object_size); op->extent.offset = objoff; op->extent.length = objlen; + if (op->extent.truncate_size <= off - objoff) { + op->extent.truncate_size = 0; + } else { + op->extent.truncate_size -= off - objoff; + if (op->extent.truncate_size > osize) + op->extent.truncate_size = osize; + } } req->r_num_pages = calc_pages_for(off, *plen); req->r_page_alignment = off & ~PAGE_MASK; -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/6] ceph: move dirty inode to migrating list when clearing auth caps
From: "Yan, Zheng" Signed-off-by: Yan, Zheng --- fs/ceph/caps.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index a1d9bb3..a9fe2d5 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -611,8 +611,16 @@ retry: if (flags & CEPH_CAP_FLAG_AUTH) ci->i_auth_cap = cap; - else if (ci->i_auth_cap == cap) + else if (ci->i_auth_cap == cap) { ci->i_auth_cap = NULL; + spin_lock(&mdsc->cap_dirty_lock); + if (!list_empty(&ci->i_dirty_item)) { + dout(" moving %p to cap_dirty_migrating\n", inode); + list_move(&ci->i_dirty_item, + &mdsc->cap_dirty_migrating); + } + spin_unlock(&mdsc->cap_dirty_lock); + } dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", inode, ceph_vinop(inode), cap, ceph_cap_string(issued), -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/6] fixes for cephfs kernel client
From: "Yan, Zheng" These patches are client side modifications for multiple MDS setups. I'm not quite sure if patch 6 is correct, please review it. These patches are also in: git://github.com/ukernel/ceph-client.git wip-ceph -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 27/29] mds: check if stray dentry is needed
From: "Yan, Zheng" The necessity of stray dentry can change before the request acquires all locks. Signed-off-by: Yan, Zheng --- src/mds/Server.cc | 26 -- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 2531005..1557b30 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -4531,11 +4531,14 @@ void Server::handle_client_unlink(MDRequest *mdr) // -- create stray dentry? -- CDentry *straydn = mdr->straydn; - if (dnl->is_primary() && !straydn) { -straydn = mdcache->get_or_create_stray_dentry(dnl->get_inode()); -mdr->pin(straydn); -mdr->straydn = straydn; - } + if (dnl->is_primary()) { +if (!straydn) { + straydn = mdcache->get_or_create_stray_dentry(dnl->get_inode()); + mdr->pin(straydn); + mdr->straydn = straydn; +} + } else if (straydn) +straydn = NULL; if (straydn) dout(10) << " straydn is " << *straydn << dendl; @@ -5274,11 +5277,14 @@ void Server::handle_client_rename(MDRequest *mdr) // -- create stray dentry? -- CDentry *straydn = mdr->straydn; - if (destdnl->is_primary() && !linkmerge && !straydn) { -straydn = mdcache->get_or_create_stray_dentry(destdnl->get_inode()); -mdr->pin(straydn); -mdr->straydn = straydn; - } + if (destdnl->is_primary() && !linkmerge) { +if (!straydn) { + straydn = mdcache->get_or_create_stray_dentry(destdnl->get_inode()); + mdr->pin(straydn); + mdr->straydn = straydn; +} + } else if (straydn) +straydn = NULL; if (straydn) dout(10) << " straydn is " << *straydn << dendl; -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 28/29] mds: don't issue caps while inode is exporting caps
From: "Yan, Zheng" If issue caps while inode is exporting caps, the client will drop the caps soon when it receives the CAP_OP_EXPORT message, but the client will not receive corresponding CAP_OP_IMPORT message. Except open file request, it's OK to not issue caps for client requests. If an non-auth MDS receives open file request but it can't issue caps, forward the request to auth MDS. Signed-off-by: Yan, Zheng --- src/mds/CInode.cc | 10 ++ src/mds/Server.cc | 22 +++--- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 4314ab5..3009f0c 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -2604,17 +2604,19 @@ int CInode::encode_inodestat(bufferlist& bl, Session *session, { int client = session->inst.name.num(); assert(snapid); - assert(session->connection); bool valid = true; // do not issue caps if inode differs from readdir snaprealm SnapRealm *realm = find_snaprealm(); - bool no_caps = (realm && dir_realm && realm != dir_realm); + bool no_caps = (realm && dir_realm && realm != dir_realm) || +is_frozen() || state_test(CInode::STATE_EXPORTINGCAPS); if (no_caps) -dout(20) << "encode_inodestat realm=" << realm << " snaprealm " << snaprealm -<< " no_caps=" << no_caps << dendl; +dout(20) << "encode_inodestat no caps" +<< ((realm && dir_realm && realm != dir_realm)?", snaprealm differs ":"") +<< (state_test(CInode::STATE_EXPORTINGCAPS)?", exporting caps":"") +<< (is_frozen()?", frozen inode":"") << dendl; // pick a version! 
inode_t *oi = &inode; diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 1557b30..b70445e 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -1936,14 +1936,14 @@ CInode* Server::rdlock_path_pin_ref(MDRequest *mdr, int n, want_auth = true; if (want_auth) { +if (ref->is_ambiguous_auth()) { + dout(10) << "waiting for single auth on " << *ref << dendl; + ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr)); + return 0; +} if (!ref->is_auth()) { - if (ref->is_ambiguous_auth()) { - dout(10) << "waiting for single auth on " << *ref << dendl; - ref->add_waiter(CInode::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr)); - } else { - dout(10) << "fw to auth for " << *ref << dendl; - mdcache->request_forward(mdr, ref->authority().first); - } + dout(10) << "fw to auth for " << *ref << dendl; + mdcache->request_forward(mdr, ref->authority().first); return 0; } @@ -2435,6 +2435,14 @@ void Server::handle_client_open(MDRequest *mdr) if (!cur) return; + if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) { +assert(!need_auth); +mdr->done_locking = false; +CInode *cur = rdlock_path_pin_ref(mdr, 0, rdlocks, true); +if (!cur) + return; + } + if (mdr->snapid != CEPH_NOSNAP && mdr->client_request->may_write()) { reply_request(mdr, -EROFS); return; -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 29/29] mds: optimize C_MDC_RetryOpenRemoteIno
From: "Yan, Zheng" When opening remote inode, C_MDC_RetryOpenRemoteIno is used as onfinish context for discovering remote inode. When it is called, the MDS may already have the inode. Signed-off-by: Yan, Zheng --- src/mds/MDCache.cc | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 236a0b9..9557436 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -7049,7 +7049,11 @@ public: C_MDC_RetryOpenRemoteIno(MDCache *mdc, inodeno_t i, Context *c, bool wx) : mdcache(mdc), ino(i), want_xlocked(wx), onfinish(c) {} void finish(int r) { -mdcache->open_remote_ino(ino, onfinish, want_xlocked); +if (mdcache->get_inode(ino)) { + onfinish->finish(0); + delete onfinish; +} else + mdcache->open_remote_ino(ino, onfinish, want_xlocked); } }; -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 25/29] mds: check null context in CDir::fetch()
From: "Yan, Zheng" Signed-off-by: Yan, Zheng --- src/mds/CDir.cc | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 91636cc..22cdf48 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1351,8 +1351,11 @@ void CDir::fetch(Context *c, const string& want_dn, bool ignore_authpinnability) assert(!is_complete()); if (!can_auth_pin() && !ignore_authpinnability) { -dout(7) << "fetch waiting for authpinnable" << dendl; -add_waiter(WAIT_UNFREEZE, c); +if (c) { + dout(7) << "fetch waiting for authpinnable" << dendl; + add_waiter(WAIT_UNFREEZE, c); +} else + dout(7) << "fetch not authpinnable and no context" << dendl; return; } -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 26/29] mds: drop locks when opening remote dentry
From: "Yan, Zheng" Opening a remote dentry while holding locks may cause deadlock. For example, 'discover' is blocked by an xlocked dentry, the request holding the xlock is blocked by the locks held by the readdir request. Signed-off-by: Yan, Zheng --- src/mds/Server.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 119af98..2531005 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -2901,6 +2901,8 @@ void Server::handle_client_readdir(MDRequest *mdr) break; } + mds->locker->drop_locks(mdr); + mdr->drop_local_auth_pins(); mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr)); return; } -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 23/29] mds: fix cap mask for ifile lock
From: "Yan, Zheng" ifile lock has 8 cap bits, so its cap mask should be 0xff Signed-off-by: Yan, Zheng --- src/mds/SimpleLock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mds/SimpleLock.h b/src/mds/SimpleLock.h index 5ea2d4b..9531a01 100644 --- a/src/mds/SimpleLock.h +++ b/src/mds/SimpleLock.h @@ -273,7 +273,7 @@ public: } int get_cap_mask() const { switch (get_type()) { -case CEPH_LOCK_IFILE: return 0xf; +case CEPH_LOCK_IFILE: return 0xff; default: return 0x3; } } -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 24/29] mds: rdlock prepended dest trace when handling rename
From: "Yan, Zheng" rdlock prepended dest trace to prevent them from being xlocked by someone else. Signed-off-by: Yan, Zheng --- src/mds/Server.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 8d05a37..119af98 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -5228,6 +5228,7 @@ void Server::handle_client_rename(MDRequest *mdr) while (destbase != srcbase) { desttrace.insert(desttrace.begin(), destbase->get_projected_parent_dn()); + rdlocks.insert(&desttrace[0]->lock); dout(10) << "rename prepending desttrace with " << *desttrace[0] << dendl; destbase = destbase->get_projected_parent_dn()->get_dir()->get_inode(); } -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 18/29] mds: fix rename inode exportor check
From: "Yan, Zheng" Use "srcdn->is_auth() && destdnl->is_primary()" to check if the MDS is inode exportor of rename operation is not reliable, This is because OP_FINISH slave request may race with subtree import. The fix is use a variable in MDRequest to indicate if the MDS is inode exportor. Signed-off-by: Yan, Zheng --- src/mds/Mutation.h | 4 ++-- src/mds/Server.cc | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h index bf59dba..d0d3eca 100644 --- a/src/mds/Mutation.h +++ b/src/mds/Mutation.h @@ -213,7 +213,7 @@ struct MDRequest : public Mutation { bufferlist inode_import; version_t inode_import_v; CInode* destdn_was_remote_inode; -bool was_link_merge; +bool was_inode_exportor; int prepared_inode_exporter; // has asked auth of srci to mark srci as ambiguous auth map imported_client_map; @@ -233,7 +233,7 @@ struct MDRequest : public Mutation { More() : src_reanchor_atid(0), dst_reanchor_atid(0), inode_import_v(0), - destdn_was_remote_inode(0), was_link_merge(false), + destdn_was_remote_inode(0), was_inode_exportor(false), prepared_inode_exporter(-1), flock_was_waiting(false), stid(0), slave_commit(0) { } diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 6097552..c5793e5 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -6261,6 +6261,7 @@ void Server::_logged_slave_rename(MDRequest *mdr, // remove mdr auth pin mdr->auth_unpin(srcdnl->get_inode()); +mdr->more()->was_inode_exportor = true; dout(10) << " exported srci " << *srcdnl->get_inode() << dendl; } @@ -6299,7 +6300,7 @@ void Server::_commit_slave_rename(MDRequest *mdr, int r, // unfreeze+singleauth inode // hmm, do i really need to delay this? 
-if (srcdn->is_auth() && destdnl->is_primary()) { +if (mdr->more()->was_inode_exportor) { CInode *in = destdnl->get_inode(); -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 21/29] mds: keep dentry lock in sync state as much as possible
From: "Yan, Zheng" Unlike locks of other types, dentry lock in unreadable state can block path traverse, so it should be in sync state as much as possible. there are two rare cases that dentry lock is not set to sync state: the dentry becomes replicated; finishing xlock but the dentry is freezing. In commit efbca31d, I tried fixing the issue that unreadable replica dentry blocks path traverse by modifying MDCache::path_traverse(), but it does not work. This patch also reverts that change. Signed-off-by: Yan, Zheng --- src/mds/CDentry.h | 3 +++ src/mds/Locker.cc | 5 - src/mds/MDCache.cc | 6 +- src/mds/Migrator.cc | 3 +++ 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/mds/CDentry.h b/src/mds/CDentry.h index 480e562..aa10bf9 100644 --- a/src/mds/CDentry.h +++ b/src/mds/CDentry.h @@ -301,6 +301,9 @@ public: // -- replication void encode_replica(int mds, bufferlist& bl) { +if (!is_replicated()) + lock.replicate_relax(); + __u32 nonce = add_replica(mds); ::encode(nonce, bl); ::encode(first, bl); diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index cba894b..9b3bfe3 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -1425,13 +1425,16 @@ bool Locker::xlock_start(SimpleLock *lock, MDRequest *mut) void Locker::_finish_xlock(SimpleLock *lock, bool *pneed_issue) { assert(!lock->is_stable()); - lock->get_parent()->auth_unpin(lock); if (lock->get_type() != CEPH_LOCK_DN && ((CInode*)lock->get_parent())->get_loner() >= 0) lock->set_state(LOCK_EXCL); else lock->set_state(LOCK_LOCK); + if (lock->get_type() == CEPH_LOCK_DN && lock->get_parent()->is_replicated() && + !lock->is_waiter_for(SimpleLock::WAIT_WR)) +simple_sync(lock, pneed_issue); if (lock->get_cap_shift()) *pneed_issue = true; + lock->get_parent()->auth_unpin(lock); } void Locker::xlock_finish(SimpleLock *lock, Mutation *mut, bool *pneed_issue) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 5adf4b7..236a0b9 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -6735,14 
+6735,10 @@ int MDCache::path_traverse(MDRequest *mdr, Message *req, Context *fin, // wh (dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) { dout(10) << "traverse: miss on null+readable dentry " << path[depth] << " " << *dn << dendl; return -ENOENT; - } else if (curdir->is_auth()) { + } else { dout(10) << "miss on dentry " << *dn << ", can't read due to lock" << dendl; dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin)); return 1; - } else { - // non-auth and can not read, treat this as no dentry - dn = NULL; - dnl = NULL; } } diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index 8686c86..bda8035 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -1135,6 +1135,9 @@ int Migrator::encode_export_dir(bufferlist& exportbl, CDentry *dn = it->second; CInode *in = dn->get_linkage()->get_inode(); +if (!dn->is_replicated()) + dn->lock.replicate_relax(); + num_exported++; // -- dentry -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 22/29] mds: fix replica state for LOCK_MIX_LOCK
From: "Yan, Zheng" LOCK_MIX_LOCK state is for gathering local locks and caps, so replica state should be LOCK_MIX. Signed-off-by: Yan, Zheng --- src/mds/locks.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mds/locks.c b/src/mds/locks.c index ce1a6dd..73b99fb 100644 --- a/src/mds/locks.c +++ b/src/mds/locks.c @@ -64,7 +64,7 @@ static const struct sm_state_t scatterlock[LOCK_MAX] = { [LOCK_LOCK] = { 0, false, LOCK_LOCK, AUTH, 0, REQ, AUTH,0, 0, ANY, 0,0,0,0 }, [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, 0, 0, 0, 0, 0, 0,0,0,0 }, -[LOCK_MIX_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0,0, 0, 0, 0, 0, 0, 0,0,0,0 }, +[LOCK_MIX_LOCK] = { LOCK_LOCK, false, LOCK_MIX, 0,0, 0, 0, 0, 0, 0, 0,0,0,0 }, [LOCK_MIX_LOCK2] = { LOCK_LOCK, false, LOCK_LOCK, 0,0, 0, 0, 0, 0, 0, 0,0,0,0 }, [LOCK_TSYN_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0,0, 0, 0, 0, 0, 0, 0,0,0,0 }, @@ -98,7 +98,7 @@ const struct sm_state_t filelock[LOCK_MAX] = { [LOCK_LOCK] = { 0, false, LOCK_LOCK, AUTH, 0, REQ, AUTH,0, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 }, [LOCK_SYNC_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, REQ, 0, 0, 0, 0, CEPH_CAP_GCACHE,0,0,CEPH_CAP_GCACHE }, [LOCK_EXCL_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0,0, 0, 0, XCL, 0, 0, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,CEPH_CAP_GCACHE }, -[LOCK_MIX_LOCK] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, REQ, 0, 0, 0, 0, 0,0,0,0 }, +[LOCK_MIX_LOCK] = { LOCK_LOCK, false, LOCK_MIX, AUTH, 0, REQ, 0, 0, 0, 0, 0,0,0,0 }, [LOCK_MIX_LOCK2] = { LOCK_LOCK, false, LOCK_LOCK, AUTH, 0, REQ, 0, 0, 0, 0, 0,0,0,0 }, [LOCK_PREXLOCK] = { LOCK_LOCK, false, LOCK_LOCK, 0,XCL, 0, 0, 0, 0, ANY, CEPH_CAP_GCACHE|CEPH_CAP_GBUFFER,0,0,0 }, -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 20/29] mds: forbid creating file in deleted directory
From: "Yan, Zheng" Signed-off-by: Yan, Zheng --- src/mds/CInode.cc | 5 - src/mds/CInode.h | 2 ++ src/mds/Server.cc | 13 - 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 9d29b46..4314ab5 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -245,7 +245,10 @@ void CInode::print(ostream& out) out << *this; } - +bool CInode::is_in_stray() +{ + return !is_base() && get_projected_parent_dir()->inode->is_stray(); +} void CInode::add_need_snapflush(CInode *snapin, snapid_t snapid, client_t client) { diff --git a/src/mds/CInode.h b/src/mds/CInode.h index a627a4f..6eb396d 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -520,6 +520,8 @@ private: bool is_head() { return last == CEPH_NOSNAP; } + bool is_in_stray(); + // note: this overloads MDSCacheObject bool is_ambiguous_auth() { return state_test(STATE_AMBIGUOUSAUTH) || diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 517339f..8d05a37 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -2633,6 +2633,13 @@ void Server::handle_client_openc(MDRequest *mdr) reply_request(mdr, -EROFS); return; } + + CInode *diri = dn->get_dir()->get_inode(); + if (diri->is_in_stray()) { +reply_request(mdr, -ENOENT); +return; + } + // set layout ceph_file_layout layout; if (dir_layout) @@ -2669,7 +2676,6 @@ void Server::handle_client_openc(MDRequest *mdr) return; } - CInode *diri = dn->get_dir()->get_inode(); rdlocks.insert(&diri->authlock); if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) return; @@ -5123,6 +5129,11 @@ void Server::handle_client_rename(MDRequest *mdr) CDir *destdir = destdn->get_dir(); assert(destdir->is_auth()); + if (destdir->get_inode()->is_in_stray()) { +reply_request(mdr, -ENOENT); +return; + } + int r = mdcache->path_traverse(mdr, NULL, NULL, srcpath, &srctrace, NULL, MDS_TRAVERSE_DISCOVER); if (r > 0) return; // delayed -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a 
message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 19/29] mds: disable concurrent remote locking
From: "Yan, Zheng" Current code allows multiple MDRequests to concurrently acquire a remote lock. But a lock ACK message wakes all requests because they were all put to the same waiting queue. One request gets the lock, the rest requests will re-send the OP_WRLOCK/OPWRLOCK slave requests and trigger assertion on remote MDS. The fix is disable concurrently acquiring remote lock, send OP_WRLOCK/OPWRLOCK slave request only if there is no on-going slave request. Signed-off-by: Yan, Zheng --- src/mds/Locker.cc | 54 -- src/mds/Server.cc | 2 ++ 2 files changed, 34 insertions(+), 22 deletions(-) diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index dbf4452..cba894b 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -532,14 +532,18 @@ void Locker::cancel_locking(Mutation *mut, set *pneed_issue) assert(lock); dout(10) << "cancel_locking " << *lock << " on " << *mut << dendl; - if (lock->get_type() != CEPH_LOCK_DN) { -bool need_issue = false; -if (lock->get_state() == LOCK_PREXLOCK) - _finish_xlock(lock, &need_issue); -if (lock->is_stable()) - eval(lock, &need_issue); -if (need_issue) - pneed_issue->insert((CInode *)lock->get_parent()); + if (lock->get_parent()->is_auth()) { +if (lock->get_type() != CEPH_LOCK_DN) { + bool need_issue = false; + if (lock->get_state() == LOCK_PREXLOCK) + _finish_xlock(lock, &need_issue); + if (lock->is_stable()) + eval(lock, &need_issue); + if (need_issue) + pneed_issue->insert((CInode *)lock->get_parent()); +} + } else { +lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK); } mut->finish_locking(lock); } @@ -1314,14 +1318,17 @@ void Locker::remote_wrlock_start(SimpleLock *lock, int target, MDRequest *mut) new C_MDS_RetryRequest(mdcache, mut)); return; } - + // send lock request - mut->more()->slaves.insert(target); - MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt, -MMDSSlaveRequest::OP_WRLOCK); - r->set_lock_type(lock->get_type()); - lock->get_parent()->set_object_info(r->get_object_info()); - 
mds->send_message_mds(r, target); + if (!lock->is_waiter_for(SimpleLock::WAIT_REMOTEXLOCK)) { +mut->start_locking(lock); +mut->more()->slaves.insert(target); +MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt, + MMDSSlaveRequest::OP_WRLOCK); +r->set_lock_type(lock->get_type()); +lock->get_parent()->set_object_info(r->get_object_info()); +mds->send_message_mds(r, target); + } // wait lock->add_waiter(SimpleLock::WAIT_REMOTEXLOCK, new C_MDS_RetryRequest(mdcache, mut)); @@ -1398,13 +1405,16 @@ bool Locker::xlock_start(SimpleLock *lock, MDRequest *mut) } // send lock request -int auth = lock->get_parent()->authority().first; -mut->more()->slaves.insert(auth); -MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt, - MMDSSlaveRequest::OP_XLOCK); -r->set_lock_type(lock->get_type()); -lock->get_parent()->set_object_info(r->get_object_info()); -mds->send_message_mds(r, auth); +if (!lock->is_waiter_for(SimpleLock::WAIT_REMOTEXLOCK)) { + mut->start_locking(lock); + int auth = lock->get_parent()->authority().first; + mut->more()->slaves.insert(auth); + MMDSSlaveRequest *r = new MMDSSlaveRequest(mut->reqid, mut->attempt, +MMDSSlaveRequest::OP_XLOCK); + r->set_lock_type(lock->get_type()); + lock->get_parent()->set_object_info(r->get_object_info()); + mds->send_message_mds(r, auth); +} // wait lock->add_waiter(SimpleLock::WAIT_REMOTEXLOCK, new C_MDS_RetryRequest(mdcache, mut)); diff --git a/src/mds/Server.cc b/src/mds/Server.cc index c5793e5..517339f 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -1324,6 +1324,7 @@ void Server::handle_slave_request_reply(MMDSSlaveRequest *m) dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl; mdr->xlocks.insert(lock); mdr->locks.insert(lock); + mdr->finish_locking(lock); lock->get_xlock(mdr, mdr->get_client()); lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK); } @@ -1338,6 +1339,7 @@ void Server::handle_slave_request_reply(MMDSSlaveRequest *m) dout(10) << "got remote 
wrlock on " << *lock << " on " << *lock->get_parent() << dendl; mdr->remote_wrlocks[lock] = from; mdr->locks.insert(lock); + mdr->finish_locking(lock); lock->finish_waiters(SimpleLock::WAIT_REMOTEXLOCK); } break; -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.ke
[PATCH 17/29] mds: call maybe_eval_stray after removing a replica dentry
From: "Yan, Zheng" MDCache::handle_cache_expire() processes dentries after inodes, so the MDCache::maybe_eval_stray() in MDCache::inode_remove_replica() always fails to remove stray inode because MDCache::eval_stray() checks if the stray inode's dentry is replicated. Signed-off-by: Yan, Zheng --- src/mds/MDCache.cc | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 3410b0f..5adf4b7 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -6120,9 +6120,11 @@ void MDCache::dentry_remove_replica(CDentry *dn, int from) // fix lock if (dn->lock.remove_replica(from)) mds->locker->eval_gather(&dn->lock); -} - + CDentry::linkage_t *dnl = dn->get_projected_linkage(); + if (dnl->is_primary()) +maybe_eval_stray(dnl->get_inode()); +} void MDCache::trim_client_leases() { -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 16/29] mds: don't defer processing caps if inode is auth pinned
From: "Yan, Zheng" We should not defer processing caps if the inode is auth pinned by MDRequest, because the MDRequest may change lock state of the inode later and wait for the deferred caps. Signed-off-by: Yan, Zheng --- src/mds/Locker.cc | 9 - 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index ee2d4cc..dbf4452 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -2199,12 +2199,11 @@ bool Locker::should_defer_client_cap_frozen(CInode *in) * Currently, a request wait if anything locked is freezing (can't * auth_pin), which would avoid any deadlock with cap release. Thus @in * _MUST_ be in the lock/auth_pin set. + * + * auth_pins==0 implies no unstable lock and not auth pinnned by + * client request, otherwise continue even it's freezing. */ - return (in->is_freezing() && (in->filelock.is_stable() && - in->authlock.is_stable() && - in->xattrlock.is_stable() && - in->linklock.is_stable())) || // continue if freezing and lock is unstable -in->is_frozen(); + return (in->is_freezing() && in->get_num_auth_pins() == 0) || in->is_frozen(); } /* -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 15/29] mds: remove unnecessary is_xlocked check
From: "Yan, Zheng" Locker::foo_eval() is always called for stable locks, so no need to check if the lock is xlocked. Signed-off-by: Yan, Zheng --- src/mds/Locker.cc | 21 - 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index e826321..ee2d4cc 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -3291,7 +3291,6 @@ void Locker::simple_eval(SimpleLock *lock, bool *need_issue) // stable -> sync? else if (lock->get_state() != LOCK_SYNC && - !lock->is_xlocked() && !lock->is_wrlocked() && ((!(wanted & CEPH_CAP_GEXCL) && !lock->is_waiter_for(SimpleLock::WAIT_WR)) || (lock->get_state() == LOCK_EXCL && in && in->get_target_loner() < 0))) { @@ -3334,8 +,6 @@ bool Locker::simple_sync(SimpleLock *lock, bool *need_issue) int gather = 0; if (lock->is_wrlocked()) gather++; -if (lock->is_xlocked()) - gather++; if (lock->get_parent()->is_replicated() && old_state == LOCK_MIX) { send_lock_message(lock, LOCK_AC_SYNC); @@ -3412,9 +3409,7 @@ void Locker::simple_excl(SimpleLock *lock, bool *need_issue) gather++; if (lock->is_wrlocked()) gather++; - if (lock->is_xlocked()) -gather++; - + if (lock->get_parent()->is_replicated() && lock->get_state() != LOCK_LOCK_EXCL && lock->get_state() != LOCK_XSYN_EXCL) { @@ -3695,7 +3690,6 @@ void Locker::scatter_eval(ScatterLock *lock, bool *need_issue) } if (!lock->is_rdlocked() && - !lock->is_xlocked() && lock->get_state() != LOCK_MIX && lock->get_scatter_wanted()) { dout(10) << "scatter_eval scatter_wanted, bump to mix " << *lock @@ -3706,8 +3700,7 @@ void Locker::scatter_eval(ScatterLock *lock, bool *need_issue) if (lock->get_type() == CEPH_LOCK_INEST) { // in general, we want to keep INEST writable at all times. 
-if (!lock->is_rdlocked() && - !lock->is_xlocked()) { +if (!lock->is_rdlocked()) { if (lock->get_parent()->is_replicated()) { if (lock->get_state() != LOCK_MIX) scatter_mix(lock, need_issue); @@ -3723,7 +3716,6 @@ void Locker::scatter_eval(ScatterLock *lock, bool *need_issue) if (!in->has_subtree_root_dirfrag() || in->is_base()) { // i _should_ be sync. if (!lock->is_wrlocked() && - !lock->is_xlocked() && lock->get_state() != LOCK_SYNC) { dout(10) << "scatter_eval no wrlocks|xlocks, not subtree root inode, syncing" << dendl; simple_sync(lock, need_issue); @@ -3909,8 +3901,6 @@ void Locker::scatter_tempsync(ScatterLock *lock, bool *need_issue) int gather = 0; if (lock->is_wrlocked()) gather++; - if (lock->is_xlocked()) -gather++; if (lock->get_cap_shift() && in->is_head() && @@ -4044,8 +4034,7 @@ void Locker::file_eval(ScatterLock *lock, bool *need_issue) assert(lock->get_parent()->is_auth()); assert(lock->is_stable()); - if (lock->is_xlocked() || - lock->get_parent()->is_freezing_or_frozen()) + if (lock->get_parent()->is_freezing_or_frozen()) return; // excl -> *? @@ -4297,9 +4286,7 @@ void Locker::file_xsyn(SimpleLock *lock, bool *need_issue) int gather = 0; if (lock->is_wrlocked()) gather++; - if (lock->is_xlocked()) -gather++; - + if (in->is_head() && in->issued_caps_need_gather(lock)) { if (need_issue) -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 14/29] mds: fix lock state transition check
From: "Yan, Zheng" Locker::simple_excl() and Locker::scatter_mix() miss is_rdlocked check; Locker::file_excl() miss is_rdlocked check and is_wrlocked check. Signed-off-by: Yan, Zheng --- src/mds/Locker.cc | 9 + 1 file changed, 9 insertions(+) diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index dec0a94..e826321 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -3408,6 +3408,8 @@ void Locker::simple_excl(SimpleLock *lock, bool *need_issue) } int gather = 0; + if (lock->is_rdlocked()) +gather++; if (lock->is_wrlocked()) gather++; if (lock->is_xlocked()) @@ -4167,6 +4169,8 @@ void Locker::scatter_mix(ScatterLock *lock, bool *need_issue) } int gather = 0; +if (lock->is_rdlocked()) + gather++; if (in->is_replicated()) { if (lock->get_state() != LOCK_EXCL_MIX && // EXCL replica is already LOCK lock->get_state() != LOCK_XSYN_EXCL) { // XSYN replica is already LOCK; ** FIXME here too! @@ -4237,6 +4241,11 @@ void Locker::file_excl(ScatterLock *lock, bool *need_issue) } int gather = 0; + if (lock->is_rdlocked()) +gather++; + if (lock->is_wrlocked()) +gather++; + if (in->is_replicated() && lock->get_state() != LOCK_LOCK_EXCL && lock->get_state() != LOCK_XSYN_EXCL) { // if we were lock, replicas are already lock. -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 11/29] mds: fix anchor table commit race
From: "Yan, Zheng" Anchor table updates for a given inode is fully serialized on client side. But due to network latency, two commit requests from different clients can arrive to anchor server out of order. The anchor table gets corrupted if updates are committed in wrong order. The fix is track on-going anchor updates for individual inode and delay processing commit requests that arrive out of order. Signed-off-by: Yan, Zheng --- src/mds/AnchorServer.cc | 61 +-- src/mds/AnchorServer.h| 13 +- src/mds/MDSTableServer.cc | 4 +++- src/mds/MDSTableServer.h | 2 +- src/mds/SnapServer.cc | 3 ++- src/mds/SnapServer.h | 2 +- 6 files changed, 78 insertions(+), 7 deletions(-) diff --git a/src/mds/AnchorServer.cc b/src/mds/AnchorServer.cc index cd2e625..4b1bf64 100644 --- a/src/mds/AnchorServer.cc +++ b/src/mds/AnchorServer.cc @@ -142,12 +142,13 @@ void AnchorServer::_prepare(bufferlist &bl, uint64_t reqid, int bymds) } inc(ino); pending_create[version] = ino; // so we can undo +pending_ops[ino].push_back(pair(version, NULL)); break; - case TABLE_OP_DESTROY: version++; pending_destroy[version] = ino; +pending_ops[ino].push_back(pair(version, NULL)); break; case TABLE_OP_UPDATE: @@ -155,6 +156,7 @@ void AnchorServer::_prepare(bufferlist &bl, uint64_t reqid, int bymds) version++; pending_update[version].first = ino; pending_update[version].second = trace; +pending_ops[ino].push_back(pair(version, NULL)); break; default: @@ -163,8 +165,56 @@ void AnchorServer::_prepare(bufferlist &bl, uint64_t reqid, int bymds) //dump(); } -void AnchorServer::_commit(version_t tid) +bool AnchorServer::check_pending(version_t tid, MMDSTableRequest *req, list& finished) +{ + inodeno_t ino; + if (pending_create.count(tid)) +ino = pending_create[tid]; + else if (pending_destroy.count(tid)) +ino = pending_destroy[tid]; + else if (pending_update.count(tid)) +ino = pending_update[tid].first; + else +assert(0); + + assert(pending_ops.count(ino)); + list< pair >& pending = pending_ops[ino]; + list< pair 
>::iterator p = pending.begin(); + if (p->first == tid) { +assert(p->second == NULL); + } else { +while (p != pending.end()) { + if (p->first == tid) + break; + p++; +} +assert(p != pending.end()); +assert(p->second == NULL); +// not the earliest pending operation, wait if it's a commit +if (req) { + p->second = new C_MDS_RetryMessage(mds, req); + return false; +} + } + + pending.erase(p); + if (pending.empty()) { +pending_ops.erase(ino); + } else { +for (p = pending.begin(); p != pending.end() && p->second; p++) { + finished.push_back(p->second); + p->second = NULL; +} + } + return true; +} + +bool AnchorServer::_commit(version_t tid, MMDSTableRequest *req) { + list finished; + if (!check_pending(tid, req, finished)) +return false; + if (pending_create.count(tid)) { dout(7) << "commit " << tid << " create " << pending_create[tid] << dendl; pending_create.erase(tid); @@ -206,10 +256,16 @@ void AnchorServer::_commit(version_t tid) // bump version. version++; //dump(); + + mds->queue_waiters(finished); + return true; } void AnchorServer::_rollback(version_t tid) { + list finished; + check_pending(tid, NULL, finished); + if (pending_create.count(tid)) { inodeno_t ino = pending_create[tid]; dout(7) << "rollback " << tid << " create " << ino << dendl; @@ -234,6 +290,7 @@ void AnchorServer::_rollback(version_t tid) // bump version. 
version++; //dump(); + mds->queue_waiters(finished); } /* This function DOES put the passed message before returning */ diff --git a/src/mds/AnchorServer.h b/src/mds/AnchorServer.h index aa5588a..50a848e 100644 --- a/src/mds/AnchorServer.h +++ b/src/mds/AnchorServer.h @@ -30,6 +30,7 @@ class AnchorServer : public MDSTableServer { map pending_create; map pending_destroy; map > > pending_update; + map > > pending_ops; void reset_state(); void encode_server_state(bufferlist& bl) { @@ -47,17 +48,27 @@ class AnchorServer : public MDSTableServer { ::decode(pending_create, p); ::decode(pending_destroy, p); ::decode(pending_update, p); + +map sort; +sort.insert(pending_create.begin(), pending_create.end()); +sort.insert(pending_destroy.begin(), pending_destroy.end()); +for (map > >::iterator p = pending_update.begin(); +p != pending_update.end(); p++) + sort[p->first] = p->second.first; +for (map::iterator p = sort.begin(); p != sort.end(); p++) + pending_ops[p->second].push_back(pair(p->first, NULL)); } bool add(inodeno_t ino, inodeno_t dirino, __u32 dn_hash, bool replace); void inc(inodeno_t ino, int ref=1); void dec(inodeno_t ino, int ref=1); + bool check_pending(version_t tid, MMDSTableRequ
[PATCH 13/29] mds: introduce DROPLOCKS slave request
From: "Yan, Zheng" In some rare case, Locker::acquire_locks() drops all acquired locks in order to auth pin new objects. But Locker::drop_locks only drops explicitly acquired remote locks, does not drop objects' version locks that were implicitly acquired on remote MDS. These leftover locks break locking order when re-acquiring _locks and may cause dead lock. The fix is indroduce DROPLOCKS slave request which drops all acquired lock on remote MDS. Signed-off-by: Yan, Zheng --- src/mds/Locker.cc | 30 +++--- src/mds/Server.cc | 6 ++ src/messages/MMDSSlaveRequest.h | 4 3 files changed, 37 insertions(+), 3 deletions(-) diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 98eaba8..dec0a94 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -483,16 +483,31 @@ void Locker::_drop_rdlocks(Mutation *mut, set *pneed_issue) void Locker::_drop_non_rdlocks(Mutation *mut, set *pneed_issue) { + set slaves; + while (!mut->xlocks.empty()) { +SimpleLock *lock = *mut->xlocks.begin(); +MDSCacheObject *p = lock->get_parent(); +if (!p->is_auth()) { + assert(lock->get_sm()->can_remote_xlock); + slaves.insert(p->authority().first); + lock->put_xlock(); + mut->locks.erase(lock); + mut->xlocks.erase(lock); + continue; +} bool ni = false; -MDSCacheObject *p = (*mut->xlocks.begin())->get_parent(); -xlock_finish(*mut->xlocks.begin(), mut, &ni); +xlock_finish(lock, mut, &ni); if (ni) pneed_issue->insert((CInode*)p); } + while (!mut->remote_wrlocks.empty()) { -remote_wrlock_finish(mut->remote_wrlocks.begin()->first, mut->remote_wrlocks.begin()->second, mut); +slaves.insert(mut->remote_wrlocks.begin()->second); +mut->locks.erase(mut->remote_wrlocks.begin()->first); +mut->remote_wrlocks.erase(mut->remote_wrlocks.begin()); } + while (!mut->wrlocks.empty()) { bool ni = false; MDSCacheObject *p = (*mut->wrlocks.begin())->get_parent(); @@ -500,6 +515,15 @@ void Locker::_drop_non_rdlocks(Mutation *mut, set *pneed_issue) if (ni) pneed_issue->insert((CInode*)p); } + + for (set::iterator p = 
slaves.begin(); p != slaves.end(); p++) { +if (mds->mdsmap->get_state(*p) >= MDSMap::STATE_REJOIN) { + dout(10) << "_drop_non_rdlocks dropping remote locks on mds." << *p << dendl; + MMDSSlaveRequest *slavereq = new MMDSSlaveRequest(mut->reqid, mut->attempt, + MMDSSlaveRequest::OP_DROPLOCKS); + mds->send_message_mds(slavereq, *p); +} + } } void Locker::cancel_locking(Mutation *mut, set *pneed_issue) diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 6700eda..6097552 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -1455,6 +1455,12 @@ void Server::dispatch_slave_request(MDRequest *mdr) } break; + case MMDSSlaveRequest::OP_DROPLOCKS: +mds->locker->drop_locks(mdr); +mdr->slave_request->put(); +mdr->slave_request = 0; +break; + case MMDSSlaveRequest::OP_AUTHPIN: handle_slave_auth_pin(mdr); break; diff --git a/src/messages/MMDSSlaveRequest.h b/src/messages/MMDSSlaveRequest.h index 03ec582..35af81d 100644 --- a/src/messages/MMDSSlaveRequest.h +++ b/src/messages/MMDSSlaveRequest.h @@ -41,6 +41,8 @@ class MMDSSlaveRequest : public Message { static const int OP_RMDIRPREP = 10; static const int OP_RMDIRPREPACK = -10; + static const int OP_DROPLOCKS= 11; + static const int OP_FINISH = 17; static const int OP_COMMITTED = -18; @@ -73,6 +75,8 @@ class MMDSSlaveRequest : public Message { case OP_RMDIRPREP: return "rmdir_prep"; case OP_RMDIRPREPACK: return "rmdir_prep_ack"; +case OP_DROPLOCKS: return "drop_locks"; + case OP_ABORT: return "abort"; //case OP_COMMIT: return "commit"; -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 09/29] mds: mark rename inode as ambiguous auth on all involved MDS
From: "Yan, Zheng" When handling cross authority rename, the master first sends OP_RENAMEPREP slave requests to witness MDS, then sends OP_RENAMEPREP slave request to the rename inode's auth MDS after getting all witness MDS' acknowledgments. Before receiving the OP_RENAMEPREP slave request, the rename inode's auth MDS may change lock state of the rename inode and send lock messages to witness MDS. But the witness MDS may already received the OP_RENAMEPREP slave request and changed the source inode's authority. So the witness MDS send lock acknowledgment message to wrong MDS and trigger assertion. The fix is, firstly the master marks rename inode as ambiguous and send a message to ask the rename inode's auth MDS to mark the inode as ambiguous, then send OP_RENAMEPREP slave requests to the witness MDS, finally send OP_RENAMEPREP slave request to the rename inode's auth MDS after getting all witness MDS' acknowledgments. Signed-off-by: Yan, Zheng --- src/mds/CInode.cc | 14 src/mds/CInode.h| 6 +++- src/mds/MDCache.cc | 6 src/mds/Mutation.cc | 16 + src/mds/Mutation.h | 8 - src/mds/Server.cc | 98 + src/mds/Server.h| 2 +- 7 files changed, 110 insertions(+), 40 deletions(-) diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 595892e..9d29b46 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -1932,6 +1932,20 @@ void CInode::unfreeze_auth_pin() } } +void CInode::clear_ambiguous_auth(list& finished) +{ + assert(state_test(CInode::STATE_AMBIGUOUSAUTH)); + state_clear(CInode::STATE_AMBIGUOUSAUTH); + take_waiting(CInode::WAIT_SINGLEAUTH, finished); +} + +void CInode::clear_ambiguous_auth() +{ + list finished; + clear_ambiguous_auth(finished); + mdcache->mds->queue_waiters(finished); +} + // auth_pins bool CInode::can_auth_pin() { if (is_freezing_inode() || is_frozen_inode() || is_frozen_auth_pin()) diff --git a/src/mds/CInode.h b/src/mds/CInode.h index e43ecf5..a627a4f 100644 --- a/src/mds/CInode.h +++ b/src/mds/CInode.h @@ -525,7 +525,11 @@ private: return 
state_test(STATE_AMBIGUOUSAUTH) || MDSCacheObject::is_ambiguous_auth(); } - + void set_ambiguous_auth() { +state_set(STATE_AMBIGUOUSAUTH); + } + void clear_ambiguous_auth(list& finished); + void clear_ambiguous_auth(); inodeno_t ino() const { return inode.ino; } vinodeno_t vino() const { return vinodeno_t(inode.ino, last); } diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index da0c7f1..3410b0f 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -2616,6 +2616,9 @@ void MDCache::handle_mds_failure(int who) p->second->more()->waiting_on_slave.erase(who); mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, p->second)); } + + if (p->second->more()->prepared_inode_exporter == who) + p->second->more()->prepared_inode_exporter = -1; } } @@ -7607,6 +7610,9 @@ void MDCache::request_cleanup(MDRequest *mdr) // drop (local) auth pins mdr->drop_local_auth_pins(); + if (mdr->ambiguous_auth_inode) +mdr->clear_ambiguous_auth(mdr->ambiguous_auth_inode); + // drop stickydirs for (set::iterator p = mdr->stickydirs.begin(); p != mdr->stickydirs.end(); diff --git a/src/mds/Mutation.cc b/src/mds/Mutation.cc index a9c3513..1c4cd13 100644 --- a/src/mds/Mutation.cc +++ b/src/mds/Mutation.cc @@ -106,6 +106,22 @@ void Mutation::unfreeze_auth_pin(CInode *inode) auth_pin_freeze = NULL; } +void Mutation::set_ambiguous_auth(CInode *inode) +{ + if (!ambiguous_auth_inode) { +inode->set_ambiguous_auth(); +ambiguous_auth_inode = inode; + } else +assert(ambiguous_auth_inode == inode); +} + +void Mutation::clear_ambiguous_auth(CInode *inode) +{ + assert(ambiguous_auth_inode == inode); + ambiguous_auth_inode->clear_ambiguous_auth(); + ambiguous_auth_inode = NULL; +} + bool Mutation::can_auth_pin(MDSCacheObject *object) { return object->can_auth_pin() || (is_auth_pinned(object) && object == auth_pin_freeze); diff --git a/src/mds/Mutation.h b/src/mds/Mutation.h index 83a1196..bf59dba 100644 --- a/src/mds/Mutation.h +++ b/src/mds/Mutation.h @@ -51,6 +51,7 @@ struct Mutation { set< 
MDSCacheObject* > remote_auth_pins; set< MDSCacheObject* > auth_pins; CInode *auth_pin_freeze; + CInode* ambiguous_auth_inode; // held locks set< SimpleLock* > rdlocks; // always local. @@ -83,6 +84,7 @@ struct Mutation { ls(0), slave_to_mds(-1), auth_pin_freeze(NULL), + ambiguous_auth_inode(NULL), locking(NULL), done_locking(false), committing(false), aborted(false), killed(false) { } Mutation(metareqid_t ri, __u32 att=0, int slave_to=-1) @@ -90,6 +92,7 @@ struct Mutation { ls(0), slave_to_mds(slave_to), auth_pin_freeze(NULL), + ambiguous_auth_inode(NULL), locking(NULL), done_locking(false), committing(false), aborted(false), killed(false)
[PATCH 12/29] mds: fix on-going two-phase commit tracking
From: "Yan, Zheng" The slaves for two phrase commit should be mdr->more()->witnessed instead of mdr->more()->slaves. mdr->more()->slaves includes MDS for remote auth pin and lock Signed-off-by: Yan, Zheng --- src/mds/Server.cc | 18 +- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/mds/Server.cc b/src/mds/Server.cc index e17f826..6700eda 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -4076,11 +4076,11 @@ void Server::_link_remote(MDRequest *mdr, bool inc, CDentry *dn, CInode *targeti EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote"); mdlog->start_entry(le); le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid()); - if (!mdr->more()->slaves.empty()) { -dout(20) << " noting uncommitted_slaves " << mdr->more()->slaves << dendl; + if (!mdr->more()->witnessed.empty()) { +dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl; le->reqid = mdr->reqid; le->had_slaves = true; -mds->mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->slaves); +mds->mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed); } if (inc) { @@ -4655,11 +4655,11 @@ void Server::_unlink_local(MDRequest *mdr, CDentry *dn, CDentry *straydn) EUpdate *le = new EUpdate(mdlog, "unlink_local"); mdlog->start_entry(le); le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid()); - if (!mdr->more()->slaves.empty()) { -dout(20) << " noting uncommitted_slaves " << mdr->more()->slaves << dendl; + if (!mdr->more()->witnessed.empty()) { +dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl; le->reqid = mdr->reqid; le->had_slaves = true; -mds->mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->slaves); +mds->mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed); } if (straydn) { @@ -5493,13 +5493,13 @@ void Server::handle_client_rename(MDRequest *mdr) EUpdate *le = new EUpdate(mdlog, "rename"); 
mdlog->start_entry(le); le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid()); - if (!mdr->more()->slaves.empty()) { -dout(20) << " noting uncommitted_slaves " << mdr->more()->slaves << dendl; + if (!mdr->more()->witnessed.empty()) { +dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl; le->reqid = mdr->reqid; le->had_slaves = true; -mds->mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->slaves); +mds->mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed); } _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn); -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 10/29] mds: skip frozen inode when assimilating dirty inodes' rstat
From: "Yan, Zheng" CDir::assimilate_dirty_rstat_inodes() may encounter frozen inodes that are being renamed. Skip these frozen inodes because assimilating inode's rstat require auth pinning the inode. Signed-off-by: Yan, Zheng --- src/mds/CDir.cc | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index 20ff469..91636cc 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1022,6 +1022,9 @@ void CDir::assimilate_dirty_rstat_inodes() for (elist::iterator p = dirty_rstat_inodes.begin_use_current(); !p.end(); ++p) { CInode *in = *p; +if (in->is_frozen()) + continue; + inode_t *pi = in->project_inode(); pi->version = in->pre_dirty(); @@ -1040,16 +1043,22 @@ void CDir::assimilate_dirty_rstat_inodes_finish(Mutation *mut, EMetaBlob *blob) elist::iterator p = dirty_rstat_inodes.begin_use_current(); while (!p.end()) { CInode *in = *p; -CDentry *dn = in->get_projected_parent_dn(); ++p; +if (in->is_frozen()) + continue; + +CDentry *dn = in->get_projected_parent_dn(); + mut->auth_pin(in); mut->add_projected_inode(in); in->clear_dirty_rstat(); blob->add_primary_dentry(dn, true, in); } - assert(dirty_rstat_inodes.empty()); + + if (!dirty_rstat_inodes.empty()) +inode->mdcache->mds->locker->mark_updated_scatterlock(&inode->nestlock); } -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 06/29] mds: use null dentry to find old parent of renamed directory
From: "Yan, Zheng" When replaying an directory rename operation, MDS need to find old parent of the renamed directory to adjust auth subtree. Current code searchs the cache to find the old parent, it does not work if the renamed directory inode is not in the cache. EMetaBlob for directory rename contains at most one null dentry, so MDS can use null dentry to find old parent of the renamed directory. If there is no null dentry in the EMetaBlob, the MDS was witness of the rename operation and there is not auth subtree underneath the renamed directory. Signed-off-by: Yan, Zheng --- src/mds/Server.cc | 11 ++- src/mds/journal.cc | 38 -- 2 files changed, 34 insertions(+), 15 deletions(-) diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 23739ea..d1984a7 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -5616,10 +5616,10 @@ void Server::_rename_prepare(MDRequest *mdr, // nested beneath. bool force_journal = false; while (srci->is_dir()) { -// if we are auth for srci and exporting it, have any _any_ open dirfrags, we -// will (soon) have auth subtrees here. -if (srci->is_auth() && !destdn->is_auth() && srci->has_dirfrags()) { - dout(10) << " we are exporting srci, and have open dirfrags, will force journal" << dendl; +// if we are auth for srci and exporting it, force journal because we need create +// auth subtrees here during journal replay. 
+if (srci->is_auth() && !destdn->is_auth()) { + dout(10) << " we are exporting srci, will force journal" << dendl; force_journal = true; break; } @@ -5720,7 +5720,8 @@ void Server::_rename_prepare(MDRequest *mdr, srci->get_dirfrags(ls); for (list::iterator p = ls.begin(); p != ls.end(); ++p) { CDir *dir = *p; - metablob->renamed_dir_frags.push_back(dir->get_frag()); + if (!dir->is_auth()) + metablob->renamed_dir_frags.push_back(dir->get_frag()); } dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl; } diff --git a/src/mds/journal.cc b/src/mds/journal.cc index 20bc755..e73b8e7 100644 --- a/src/mds/journal.cc +++ b/src/mds/journal.cc @@ -442,6 +442,16 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg) dout(10) << "EMetaBlob.replay renamed inode is " << *renamed_diri << dendl; else dout(10) << "EMetaBlob.replay don't have renamed ino " << renamed_dirino << dendl; + +int nnull = 0; +for (list::iterator lp = lump_order.begin(); lp != lump_order.end(); ++lp) { + dirlump &lump = lump_map[*lp]; + if (lump.nnull) { + dout(10) << "EMetaBlob.replay found null dentry in dir " << *lp << dendl; + nnull += lump.nnull; + } +} +assert(nnull <= 1); } // keep track of any inodes we unlink and don't relink elsewhere @@ -622,8 +632,6 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg) dout(10) << "EMetaBlob.replay unlinking " << *dn << dendl; if (dn->get_linkage()->is_primary()) unlinked.insert(dn->get_linkage()->get_inode()); - if (dn->get_linkage()->get_inode() == renamed_diri) - olddir = dir; dir->unlink_inode(dn); } dn->set_version(p->dnv); @@ -631,24 +639,34 @@ void EMetaBlob::replay(MDS *mds, LogSegment *logseg) dout(10) << "EMetaBlob.replay had " << *dn << dendl; assert(dn->last == p->dnlast); } + olddir = dir; } } if (renamed_dirino) { -if (olddir) { - assert(renamed_diri); +if (renamed_diri) { + assert(olddir); +} else { + // we imported a diri we haven't seen before + renamed_diri = mds->mdcache->get_inode(renamed_dirino); + 
assert(renamed_diri); // it was in the metablob +} + +if (renamed_diri->authority().first != mds->whoami && + olddir && olddir->authority().first == mds->whoami) { + list leaves; + renamed_diri->dirfragtree.get_leaves(leaves); + for (list::iterator p = leaves.begin(); p != leaves.end(); ++p) + renamed_diri->get_or_open_dirfrag(mds->mdcache, *p); +} + +if (renamed_diri && olddir) { mds->mdcache->adjust_subtree_after_rename(renamed_diri, olddir, false); // see if we can discard the subtree we renamed out of CDir *root = mds->mdcache->get_subtree_root(olddir); if (root->get_dir_auth() == CDIR_AUTH_UNDEF) mds->mdcache->try_trim_non_auth_subtree(root); - -} else { - // we imported a diri we haven't seen before - assert(!renamed_diri); - renamed_diri = mds->mdcache->get_inode(renamed_dirino); - assert(renamed_diri); // it was in the metablob } // if we are the srci importer, we'll also have some dirfrags we have to open up... -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel
[PATCH 07/29] mds: don't trim ambiguous imports in MDCache::trim_non_auth_subtree
From: "Yan, Zheng" Trimming ambiguous imports in MDCache::trim_non_auth_subtree() confuses MDCache::disambiguate_imports() and causes infinite loop. Signed-off-by: Yan, Zheng --- src/mds/MDCache.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 5014b5d..da0c7f1 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -5802,7 +5802,7 @@ bool MDCache::trim_non_auth_subtree(CDir *dir) for (list::iterator subdir = subdirs.begin(); subdir != subdirs.end(); ++subdir) { - if ((*subdir)->is_subtree_root()) { + if ((*subdir)->is_subtree_root() || my_ambiguous_imports.count((*subdir)->dirfrag())) { keep_inode = true; dout(10) << "trim_non_auth_subtree(" << dir << ") subdir " << *subdir << "is kept!" << dendl; } -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 08/29] mds: only export directory fragments in stray to their auth MDS
From: "Yan, Zheng" Signed-off-by: Yan, Zheng --- src/mds/Migrator.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/mds/Migrator.cc b/src/mds/Migrator.cc index 5db21cd..8686c86 100644 --- a/src/mds/Migrator.cc +++ b/src/mds/Migrator.cc @@ -642,12 +642,11 @@ void Migrator::export_dir(CDir *dir, int dest) } if (!dir->inode->is_base() && dir->get_parent_dir()->get_inode()->is_stray() && - dir->get_parent_dir()->get_parent_dir()->ino() == MDS_INO_MDSDIR(mds->get_nodeid())) { + dir->get_parent_dir()->get_parent_dir()->ino() != MDS_INO_MDSDIR(dest)) { dout(7) << "i won't export anything in stray" << dendl; return; } - if (dir->is_frozen() || dir->is_freezing()) { dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << dendl; -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 04/29] mds: xlock stray dentry when handling rename or unlink
From: "Yan, Zheng" This prevents MDS from reintegrating stray before rename/unlink finishes Signed-off-by: Yan, Zheng --- src/mds/Server.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mds/Server.cc b/src/mds/Server.cc index dc14bbe..064733c 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -4536,6 +4536,7 @@ void Server::handle_client_unlink(MDRequest *mdr) if (straydn) { wrlocks.insert(&straydn->get_dir()->inode->filelock); wrlocks.insert(&straydn->get_dir()->inode->nestlock); +xlocks.insert(&straydn->lock); } if (in->is_dir()) rdlocks.insert(&in->filelock); // to verify it's empty @@ -5300,6 +5301,7 @@ void Server::handle_client_rename(MDRequest *mdr) if (straydn) { wrlocks.insert(&straydn->get_dir()->inode->filelock); wrlocks.insert(&straydn->get_dir()->inode->nestlock); +xlocks.insert(&straydn->lock); } // xlock versionlock on dentries if there are witnesses. -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 05/29] mds: don't journal null dentry for overwritten remote linkage
From: "Yan, Zheng" Server::_rename_prepare() adds null dest dentry to the EMetaBlob if the rename operation overwrites a remote linkage. This is incorrect because null dentries are processed after primary and remote dentries during journal replay. The erroneous null dentry makes the dentry of rename destination disappear. Signed-off-by: Yan, Zheng --- src/mds/Server.cc | 4 1 file changed, 4 deletions(-) diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 064733c..23739ea 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -5794,10 +5794,6 @@ void Server::_rename_prepare(MDRequest *mdr, CEPH_NOSNAP, 0, destdnl); metablob->add_primary_dentry(oldin->get_projected_parent_dn(), true, oldin); } - if (destdn->is_auth()) { - // auth for dn, not targeti - metablob->add_null_dentry(destdn, true); - } } } -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 03/29] mds: don't trigger assertion when discover races with rename
From: "Yan, Zheng" Discover reply that adds replica dentry and inode can race with rename if slave request for rename sends discover and waits, but is woken up by a reply for a different discover. Signed-off-by: Yan, Zheng --- src/mds/CDentry.cc | 17 ++--- src/mds/MDCache.cc | 5 ++--- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/src/mds/CDentry.cc b/src/mds/CDentry.cc index 7dd88ac..5ff6e61 100644 --- a/src/mds/CDentry.cc +++ b/src/mds/CDentry.cc @@ -439,23 +439,18 @@ void CDentry::decode_replica(bufferlist::iterator& p, bool is_new) inodeno_t rino; unsigned char rdtype; + __s32 ls; ::decode(rino, p); ::decode(rdtype, p); - if (rino) { -if (linkage.is_null()) - dir->link_remote_inode(this, rino, rdtype); -else - assert(linkage.is_remote() && linkage.remote_ino == rino); - } - - __s32 ls; ::decode(ls, p); - if (is_new) + + if (is_new) { +if (rino) + dir->link_remote_inode(this, rino, rdtype); lock.set_state(ls); + } } - - // // locking diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 0a88714..5014b5d 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -9150,12 +9150,11 @@ CInode *MDCache::add_replica_inode(bufferlist::iterator& p, CDentry *dn, listdecode_replica(p, false); dout(10) << "add_replica_inode had " << *in << dendl; -assert(!dn || dn->get_linkage()->get_inode() == in); } if (dn) { -assert(dn->get_linkage()->is_primary()); -assert(dn->get_linkage()->get_inode() == in); +if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in) + dout(10) << "add_replica_inode different linkage in dentry " << *dn << dendl; dn->get_dir()->take_ino_waiting(in->ino(), finished); } -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 01/29] mds: don't renew revoking lease
From: "Yan, Zheng" MDS may receive a lease renew request while the lease is being revoked; just ignore the renew request. Signed-off-by: Yan, Zheng --- src/mds/Locker.cc | 26 ++ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index 76708ec..c8cd236 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -2914,18 +2914,20 @@ void Locker::handle_client_lease(MClientLease *m) case CEPH_MDS_LEASE_RENEW: { - dout(7) << "handle_client_lease client." << client - << " renew on " << *dn << dendl; - int pool = 1; // fixme.. do something smart! - m->h.duration_ms = (int)(1000 * mdcache->client_lease_durations[pool]); - m->h.seq = ++l->seq; - m->clear_payload(); - - utime_t now = ceph_clock_now(g_ceph_context); - now += mdcache->client_lease_durations[pool]; - mdcache->touch_client_lease(l, pool, now); - - mds->send_message_client_counted(m, m->get_connection()); + dout(7) << "handle_client_lease client." << client << " renew on " << *dn + << (!dn->lock.can_lease(client)?", revoking lease":"") << dendl; + if (dn->lock.can_lease(client)) { + int pool = 1; // fixme.. do something smart! + m->h.duration_ms = (int)(1000 * mdcache->client_lease_durations[pool]); + m->h.seq = ++l->seq; + m->clear_payload(); + + utime_t now = ceph_clock_now(g_ceph_context); + now += mdcache->client_lease_durations[pool]; + mdcache->touch_client_lease(l, pool, now); + + mds->send_message_client_counted(m, m->get_connection()); + } } break; -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 02/29] mds: fix Locker::simple_eval()
From: "Yan, Zheng" Locker::simple_eval() checks if the loner wants CEPH_CAP_GEXCL to decide if it should change the lock to EXCL state, but it checks if CEPH_CAP_GEXCL is issued to the loner to decide if it should change the lock to SYNC state. So if the loner wants CEPH_CAP_GEXCL, but doesn't have CEPH_CAP_GEXCL, Locker::simple_eval() will keep switching the lock state. Signed-off-by: Yan, Zheng --- src/mds/Locker.cc | 8 +++- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/mds/Locker.cc b/src/mds/Locker.cc index c8cd236..98eaba8 100644 --- a/src/mds/Locker.cc +++ b/src/mds/Locker.cc @@ -3250,11 +3250,10 @@ void Locker::simple_eval(SimpleLock *lock, bool *need_issue) return; CInode *in = 0; - int wanted = 0, issued = 0; + int wanted = 0; if (lock->get_type() != CEPH_LOCK_DN) { in = (CInode*)lock->get_parent(); -wanted = in->get_caps_wanted(NULL, NULL, lock->get_cap_shift()); -issued = in->get_caps_issued(NULL, NULL, NULL, lock->get_cap_shift()); +in->get_caps_wanted(&wanted, NULL, lock->get_cap_shift()); } // -> excl? @@ -3270,8 +3269,7 @@ void Locker::simple_eval(SimpleLock *lock, bool *need_issue) else if (lock->get_state() != LOCK_SYNC && !lock->is_xlocked() && !lock->is_wrlocked() && - ((!(issued & CEPH_CAP_GEXCL) && -!lock->is_waiter_for(SimpleLock::WAIT_WR)) || + ((!(wanted & CEPH_CAP_GEXCL) && !lock->is_waiter_for(SimpleLock::WAIT_WR)) || (lock->get_state() == LOCK_EXCL && in && in->get_target_loner() < 0))) { dout(7) << "simple_eval stable, syncing " << *lock << " on " << *lock->get_parent() << dendl; -- 1.7.11.7 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 00/29] Various fixes for MDS
From: "Yan, Zheng" This patch series fixes various issues I encountered when running 3 MDS. I tested this patch series by running fsstress on two clients, using the same test directory. The MDS and clients could survive the overnight test at times. This patch series is also in: git://github.com/ukernel/ceph.git wip-mds -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html