On Tue, 24 Jul 2012, Yehuda Sadeh wrote:
> On Fri, Jul 20, 2012 at 5:41 PM, Sage Weil <[email protected]> wrote:
> > From: caleb miles <[email protected]>
> >
> > The server side recently added support for tuning some magic
> > crush variables. Decode these variables if they are present, or use the
> > default values if they are not present.
> >
> > Corresponds to ceph.git commit 89af369c25f274fe62ef730e5e8aad0c54f1e5a5.
> >
> > Signed-off-by: caleb miles <[email protected]>
> > Reviewed-by: Sage Weil <[email protected]>
> > ---
> > include/linux/ceph/ceph_features.h | 4 ++-
> > include/linux/crush/crush.h | 8 +++++++
> > net/ceph/crush/mapper.c | 13 ++++++-----
> > net/ceph/osdmap.c | 39
> > ++++++++++++++++++++++++++++++++++++
> > 4 files changed, 57 insertions(+), 7 deletions(-)
> >
> > diff --git a/include/linux/ceph/ceph_features.h
> > b/include/linux/ceph/ceph_features.h
> > index 342f93d..df25dcf 100644
> > --- a/include/linux/ceph/ceph_features.h
> > +++ b/include/linux/ceph/ceph_features.h
> > @@ -12,12 +12,14 @@
> > #define CEPH_FEATURE_MONNAMES (1<<5)
> > #define CEPH_FEATURE_RECONNECT_SEQ (1<<6)
> > #define CEPH_FEATURE_DIRLAYOUTHASH (1<<7)
> > +#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
>
> Any reason why this is 18 and not 8?
Bits 9-17 are already taken; they are just not implemented/used by the kernel code.
>
> >
> > /*
> > * Features supported.
> > */
> > #define CEPH_FEATURES_SUPPORTED_DEFAULT \
> > - (CEPH_FEATURE_NOSRCADDR)
> > + (CEPH_FEATURE_NOSRCADDR | \
> > + CEPH_FEATURE_CRUSH_TUNABLES)
> >
> > #define CEPH_FEATURES_REQUIRED_DEFAULT \
> > (CEPH_FEATURE_NOSRCADDR)
> > diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h
> > index 7c47508..25baa28 100644
> > --- a/include/linux/crush/crush.h
> > +++ b/include/linux/crush/crush.h
> > @@ -154,6 +154,14 @@ struct crush_map {
> > __s32 max_buckets;
> > __u32 max_rules;
> > __s32 max_devices;
> > +
> > + /* choose local retries before re-descent */
> > + __u32 choose_local_tries;
> > + /* choose local attempts using a fallback permutation before
> > + * re-descent */
> > + __u32 choose_local_fallback_tries;
> > + /* choose attempts before giving up */
> > + __u32 choose_total_tries;
> > };
> >
> >
> > diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
> > index d7edc24..35fce75 100644
> > --- a/net/ceph/crush/mapper.c
> > +++ b/net/ceph/crush/mapper.c
> > @@ -306,7 +306,6 @@ static int crush_choose(const struct crush_map *map,
> > int item = 0;
> > int itemtype;
> > int collide, reject;
> > - const unsigned int orig_tries = 5; /* attempts before we fall back
> > to search */
> >
> > dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n",
> > recurse_to_leaf ? "_LEAF" : "",
> > bucket->id, x, outpos, numrep);
> > @@ -351,8 +350,9 @@ static int crush_choose(const struct crush_map *map,
> > reject = 1;
> > goto reject;
> > }
> > - if (flocal >= (in->size>>1) &&
> > - flocal > orig_tries)
> > + if (map->choose_local_fallback_tries > 0 &&
> > + flocal >= (in->size>>1) &&
> > + flocal >
> > map->choose_local_fallback_tries)
>
> Is flocal right here, or should it be ftotal?
>
> > item = bucket_perm_choose(in, x, r);
> > else
> > item = crush_bucket_choose(in, x,
> > r);
> > @@ -422,13 +422,14 @@ reject:
> > ftotal++;
> > flocal++;
> >
> > - if (collide && flocal < 3)
> > + if (collide && flocal <=
> > map->choose_local_tries)
> > /* retry locally a few
> > times */
> > retry_bucket = 1;
> > - else if (flocal <= in->size +
> > orig_tries)
> > + else if
> > (map->choose_local_fallback_tries > 0 &&
> > + flocal <= in->size +
> > map->choose_local_fallback_tries)
> > /* exhaustive bucket search
> > */
> > retry_bucket = 1;
> > - else if (ftotal < 20)
> > + else if (ftotal <=
> > map->choose_total_tries)
> > /* then retry descent */
> > retry_descent = 1;
> > else
> > diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
> > index 9600674..3124b71 100644
> > --- a/net/ceph/osdmap.c
> > +++ b/net/ceph/osdmap.c
> > @@ -135,6 +135,21 @@ bad:
> > return -EINVAL;
> > }
> >
> > +static int skip_name_map(void **p, void *end)
> > +{
> > + int len;
> > + ceph_decode_32_safe(p, end, len ,bad);
> > + while (len--) {
> > + int strlen;
> Use u32 for strlen.
>
> > + *p += sizeof(u32);
> > + ceph_decode_32_safe(p, end, strlen, bad);
> > + *p += strlen;
> > +}
> > + return 0;
> > +bad:
> > + return -EINVAL;
> > +}
> > +
> > static struct crush_map *crush_decode(void *pbyval, void *end)
> > {
> > struct crush_map *c;
> > @@ -143,6 +158,7 @@ static struct crush_map *crush_decode(void *pbyval,
> > void *end)
> > void **p = &pbyval;
> > void *start = pbyval;
> > u32 magic;
> > + u32 num_name_maps;
> >
> > dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
> >
> > @@ -150,6 +166,11 @@ static struct crush_map *crush_decode(void *pbyval,
> > void *end)
> > if (c == NULL)
> > return ERR_PTR(-ENOMEM);
> >
> > + /* set tunables to default values */
> > + c->choose_local_tries = 2;
> > + c->choose_local_fallback_tries = 5;
> > + c->choose_total_tries = 19;
> > +
> > ceph_decode_need(p, end, 4*sizeof(u32), bad);
> > magic = ceph_decode_32(p);
> > if (magic != CRUSH_MAGIC) {
> > @@ -297,7 +318,25 @@ static struct crush_map *crush_decode(void *pbyval,
> > void *end)
> > }
> >
> > /* ignore trailing name maps. */
> > + for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) {
> > + err = skip_name_map(p, end);
> > + if (err < 0)
> > + goto done;
> > + }
> > +
> > + /* tunables */
> > + ceph_decode_need(p, end, 3*sizeof(u32), done);
> > + c->choose_local_tries = ceph_decode_32(p);
> > + c->choose_local_fallback_tries = ceph_decode_32(p);
> > + c->choose_total_tries = ceph_decode_32(p);
> > + dout("crush decode tunable choose_local_tries = %d",
> > + c->choose_local_tries);
> > + dout("crush decode tunable choose_local_fallback_tries = %d",
> > + c->choose_local_fallback_tries);
> > + dout("crush decode tunable choose_total_tries = %d",
> > + c->choose_total_tries);
> >
> > +done:
> > dout("crush_decode success\n");
> > return c;
> >
> > --
> > 1.7.9
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> > the body of a message to [email protected]
> > More majordomo info at http://vger.kernel.org/majordomo-info.html
>
>
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html