Hi Brad,
the cluster recovered to about 0.012% degraded after switching to
firefly tunables (it got stuck again, with 1 PG remapped), and after
that I increased pg_num/pgp_num from 128 to 256 and then to 512. The
status is getting worse: more PGs are getting stuck in a remapped
state, so I don't see a reason to keep increasing the PG count of the
3-replica pool right now. Do you see any problem in the pool
configuration and crushmap rules below that could lead to this
situation?
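
For reference, I grew the pool in steps with commands along these
lines (a sketch of what I ran, using the pool from the dump below):

# ceph osd pool set scbench pg_num 256
# ceph osd pool set scbench pgp_num 256
# ceph osd pool set scbench pg_num 512
# ceph osd pool set scbench pgp_num 512
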
root@staging-rd0-00:~# ceph -s
cluster 2c91375c-6926-4a96-a2b6-f154fbbe70d4
health HEALTH_WARN
9 pgs stuck unclean
recovery 712/4870783 objects degraded (0.015%)
recovery 1291/4870783 objects misplaced (0.027%)
monmap e17: 3 mons at
{staging-rd0-00=62.217.119.10:6789/0,staging-rd0-01=62.217.119.11:6789/0,staging-rd0-03=62.217.119.13:6789/0}
election epoch 416, quorum 0,1,2
staging-rd0-00,staging-rd0-01,staging-rd0-03
osdmap e159764: 16 osds: 16 up, 16 in; 9 remapped pgs
pgmap v38867868: 10752 pgs, 6 pools, 2529 GB data, 2342 kobjects
5371 GB used, 35594 GB / 40965 GB avail
712/4870783 objects degraded (0.015%)
1291/4870783 objects misplaced (0.027%)
10743 active+clean
9 active+remapped
client io 6087 B/s rd, 566 kB/s wr, 126 op/s
root@staging-rd0-00:~# ceph osd dump | grep pool
pool 0 'data' replicated size 2 min_size 1 crush_ruleset 3 object_hash
rjenkins pg_num 2048 pgp_num 2048 last_change 119047
crash_replay_interval 45 stripe_width 0
pool 1 'metadata' replicated size 2 min_size 1 crush_ruleset 3
object_hash rjenkins pg_num 2048 pgp_num 2048 last_change 119048
stripe_width 0
pool 2 'rbd' replicated size 2 min_size 1 crush_ruleset 3 object_hash
rjenkins pg_num 2048 pgp_num 2048 last_change 119049 stripe_width 0
pool 3 'blocks' replicated size 2 min_size 1 crush_ruleset 4
object_hash rjenkins pg_num 2048 pgp_num 2048 last_change 119050
stripe_width 0
pool 4 'maps' replicated size 2 min_size 1 crush_ruleset 3 object_hash
rjenkins pg_num 2048 pgp_num 2048 last_change 119051 stripe_width 0
pool 179 'scbench' replicated size 3 min_size 1 crush_ruleset 0
object_hash rjenkins pg_num 512 pgp_num 512 last_change 159762 flags
hashpspool stripe_width 0
root@staging-rd0-00:~# ceph osd crush dump
{
"devices": [
{
"id": 0,
"name": "osd.0"
},
{
"id": 1,
"name": "osd.1"
},
{
"id": 2,
"name": "osd.2"
},
{
"id": 3,
"name": "osd.3"
},
{
"id": 4,
"name": "osd.4"
},
{
"id": 5,
"name": "osd.5"
},
{
"id": 6,
"name": "osd.6"
},
{
"id": 7,
"name": "osd.7"
},
{
"id": 8,
"name": "osd.8"
},
{
"id": 9,
"name": "osd.9"
},
{
"id": 10,
"name": "osd.10"
},
{
"id": 11,
"name": "osd.11"
},
{
"id": 12,
"name": "osd.12"
},
{
"id": 13,
"name": "osd.13"
},
{
"id": 14,
"name": "osd.14"
},
{
"id": 15,
"name": "osd.15"
}
],
"types": [
{
"type_id": 0,
"name": "osd"
},
{
"type_id": 1,
"name": "host"
},
{
"type_id": 2,
"name": "rack"
},
{
"type_id": 3,
"name": "row"
},
{
"type_id": 4,
"name": "room"
},
{
"type_id": 5,
"name": "datacenter"
},
{
"type_id": 6,
"name": "root"
}
],
"buckets": [
{
"id": -1,
"name": "default",
"type_id": 6,
"type_name": "root",
"weight": 734000,
"alg": "straw",
"hash": "rjenkins1",
"items": [
{
"id": -3,
"weight": 734000,
"pos": 0
}
]
},
{
"id": -2,
"name": "staging-rd0-03",
"type_id": 1,
"type_name": "host",
"weight": 26214,
"alg": "straw",
"hash": "rjenkins1",
"items": [
{
"id": 14,
"weight": 13107,
"pos": 0
},
{
"id": 15,
"weight": 13107,
"pos": 1
}
]
},
{
"id": -3,
"name": "unknownrack",
"type_id": 2,
"type_name": "rack",
"weight": 734000,
"alg": "straw",
"hash": "rjenkins1",
"items": [
{
"id": -2,
"weight": 26214,
"pos": 0
},
{
"id": -8,
"weight": 340786,
"pos": 1
},
{
"id": -7,
"weight": 340786,
"pos": 2
},
{
"id": -4,
"weight": 26214,
"pos": 3
}
]
},
{
"id": -4,
"name": "staging-rd0-02",
"type_id": 1,
"type_name": "host",
"weight": 26214,
"alg": "straw",
"hash": "rjenkins1",
"items": [
{
"id": 12,
"weight": 13107,
"pos": 0
},
{
"id": 13,
"weight": 13107,
"pos": 1
}
]
},
{
"id": -7,
"name": "staging-rd0-00",
"type_id": 1,
"type_name": "host",
"weight": 340786,
"alg": "straw",
"hash": "rjenkins1",
"items": [
{
"id": 0,
"weight": 39321,
"pos": 0
},
{
"id": 1,
"weight": 39321,
"pos": 1
},
{
"id": 2,
"weight": 65536,
"pos": 2
},
{
"id": 3,
"weight": 65536,
"pos": 3
},
{
"id": 4,
"weight": 65536,
"pos": 4
},
{
"id": 5,
"weight": 65536,
"pos": 5
}
]
},
{
"id": -8,
"name": "staging-rd0-01",
"type_id": 1,
"type_name": "host",
"weight": 340786,
"alg": "straw",
"hash": "rjenkins1",
"items": [
{
"id": 6,
"weight": 39321,
"pos": 0
},
{
"id": 7,
"weight": 39321,
"pos": 1
},
{
"id": 8,
"weight": 65536,
"pos": 2
},
{
"id": 9,
"weight": 65536,
"pos": 3
},
{
"id": 10,
"weight": 65536,
"pos": 4
},
{
"id": 11,
"weight": 65536,
"pos": 5
}
]
}
],
"rules": [
{
"rule_id": 0,
"rule_name": "data",
"ruleset": 0,
"type": 1,
"min_size": 1,
"max_size": 10,
"steps": [
{
"op": "take",
"item": -1,
"item_name": "default"
},
{
"op": "chooseleaf_firstn",
"num": 0,
"type": "host"
},
{
"op": "emit"
}
]
},
{
"rule_id": 1,
"rule_name": "metadata",
"ruleset": 1,
"type": 1,
"min_size": 1,
"max_size": 10,
"steps": [
{
"op": "take",
"item": -1,
"item_name": "default"
},
{
"op": "chooseleaf_firstn",
"num": 0,
"type": "host"
},
{
"op": "emit"
}
]
},
{
"rule_id": 2,
"rule_name": "rbd",
"ruleset": 2,
"type": 1,
"min_size": 1,
"max_size": 10,
"steps": [
{
"op": "take",
"item": -1,
"item_name": "default"
},
{
"op": "chooseleaf_firstn",
"num": 0,
"type": "host"
},
{
"op": "emit"
}
]
},
{
"rule_id": 3,
"rule_name": "sas",
"ruleset": 3,
"type": 1,
"min_size": 2,
"max_size": 10,
"steps": [
{
"op": "take",
"item": -1,
"item_name": "default"
},
{
"op": "chooseleaf_firstn",
"num": 0,
"type": "host"
},
{
"op": "emit"
}
]
},
{
"rule_id": 4,
"rule_name": "sata",
"ruleset": 4,
"type": 1,
"min_size": 2,
"max_size": 10,
"steps": [
{
"op": "take",
"item": -1,
"item_name": "default"
},
{
"op": "chooseleaf_firstn",
"num": 0,
"type": "host"
},
{
"op": "emit"
}
]
}
],
"tunables": {
"choose_local_tries": 0,
"choose_local_fallback_tries": 0,
"choose_total_tries": 50,
"chooseleaf_descend_once": 1,
"chooseleaf_vary_r": 1,
"straw_calc_version": 1,
"allowed_bucket_algs": 22,
"profile": "unknown",
"optimal_tunables": 0,
"legacy_tunables": 0,
"require_feature_tunables": 1,
"require_feature_tunables2": 1,
"require_feature_tunables3": 1,
"has_v2_rules": 0,
"has_v3_rules": 0,
"has_v4_buckets": 0
}
}
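
In case it helps, I can also test these rules offline with crushtool,
e.g. something like this (file paths are just examples):

# ceph osd getcrushmap -o /tmp/crushmap.bin
# crushtool -i /tmp/crushmap.bin --test --rule 0 --num-rep 3 --show-bad-mappings

to check whether ruleset 0 can actually map 3 replicas across hosts
for every input.
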
On 26 July 2016 at 02:07, Brad Hubbard <[email protected]> wrote:
> On Tue, Jul 26, 2016 at 6:08 AM, Kostis Fardelas <[email protected]> wrote:
>> Following up, I increased pg_num/pgp_num for my 3-replica pool to 128
>
> These pg numbers seem low.
>
> Can you take a look at http://ceph.com/pgcalc/ and verify these values
> are appropriate for your environment and use case?
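>
> As a rough rule of thumb (my reading of what pgcalc suggests):
>
>   PGs per pool ~= (number of OSDs * 100) / replica count
>
> e.g. 16 OSDs with size 3 gives 16 * 100 / 3 ~= 533, which rounds to
> the nearest power of two as 512.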
>
> I'd also take a good look at your crush rules to determine if they are
> contributing to the problem.
>
>> (while still on argonaut tunables), and after the small recovery that
>> followed, I switched to bobtail tunables. Remapping started and got
>> stuck (!) again, this time without any OSD down, with 1 PG
>> active+remapped. I tried restarting the PG's OSDs, with no luck.
>>
>> One thing to notice is that stuck PGs are always on this 3-replicated pool.
>>
>> Finally, I decided to take the hit and switch to firefly tunables
>> (with chooseleaf_vary_r=1), just for the sake of it. Misplaced objects
>> stand at 51% of the cluster right now, so I am going to wait and
>> update this thread with the outcome when the dust settles.
>>
>> All in all, even if the firefly tunables lead to a healthy PG
>> distribution, I am afraid I am going to stick with argonaut tunables
>> from now on: the experience was far from encouraging, and there is
>> little documentation on the pros and cons of tunables profile changes
>> and their impact on a production cluster.
>>
>> Kostis
>>
>> On 24 July 2016 at 14:29, Kostis Fardelas <[email protected]> wrote:
>>> Nice to hear from you, Goncalo.
>>> What you propose sounds like an interesting theory; I will test it
>>> tomorrow and let you know. In the meantime, I ran the same test with
>>> the bobtail and argonaut tunables:
>>> - with argonaut tunables, the recovery completes to the end
>>> - with bobtail tunables, the situation is worse than with firefly: I
>>> got even more degraded and misplaced objects, and recovery got stuck
>>> across 6 PGs
>>>
>>> I also came across a thread with an almost identical case [1], where
>>> Sage recommends switching to hammer tunables and the straw2
>>> algorithm, but that is not an option for a lot of people due to
>>> kernel requirements.
>>>
>>> [1] https://www.spinics.net/lists/ceph-devel/msg30381.html
>>>
>>>
>>> On 24 July 2016 at 03:44, Goncalo Borges <[email protected]>
>>> wrote:
>>>> Hi Kostis
>>>> This is a wild guess, but one thing I notice is that your pool 179
>>>> has a very low pg_num (100).
>>>>
>>>> Maybe the algorithm behind the new tunables needs a higher pg number
>>>> to actually proceed with the recovery?
>>>>
>>>> You could try increasing the pg count to 128 (it is always better to
>>>> use powers of 2) and see if the recovery completes.
>>>>
>>>> Cheers
>>>> G.
>>>> ________________________________________
>>>> From: ceph-users [[email protected]] on behalf of Kostis
>>>> Fardelas [[email protected]]
>>>> Sent: 23 July 2016 16:32
>>>> To: Brad Hubbard
>>>> Cc: ceph-users
>>>> Subject: Re: [ceph-users] Recovery stuck after adjusting to recent tunables
>>>>
>>>> Hi Brad,
>>>>
>>>> pool 0 'data' replicated size 2 min_size 1 crush_ruleset 3 object_hash
>>>> rjenkins pg_num 2048 pgp_num 2048 last_change 119047
>>>> crash_replay_interval 45 stripe_width 0
>>>> pool 1 'metadata' replicated size 2 min_size 1 crush_ruleset 3
>>>> object_hash rjenkins pg_num 2048 pgp_num 2048 last_change 119048
>>>> stripe_width 0
>>>> pool 2 'rbd' replicated size 2 min_size 1 crush_ruleset 3 object_hash
>>>> rjenkins pg_num 2048 pgp_num 2048 last_change 119049 stripe_width 0
>>>> pool 3 'blocks' replicated size 2 min_size 1 crush_ruleset 4
>>>> object_hash rjenkins pg_num 2048 pgp_num 2048 last_change 119050
>>>> stripe_width 0
>>>> pool 4 'maps' replicated size 2 min_size 1 crush_ruleset 3 object_hash
>>>> rjenkins pg_num 2048 pgp_num 2048 last_change 119051 stripe_width 0
>>>> pool 179 'scbench' replicated size 3 min_size 1 crush_ruleset 0
>>>> object_hash rjenkins pg_num 100 pgp_num 100 last_change 154034 flags
>>>> hashpspool stripe_width 0
>>>>
>>>> This is the status of 179.38 when the cluster is healthy:
>>>> http://pastebin.ca/3663600
>>>>
>>>> and this is when recovery is stuck:
>>>> http://pastebin.ca/3663601
>>>>
>>>>
>>>> It seems that the PG is replicated with size 3, but the cluster
>>>> cannot create the third replica for some objects whose third OSD
>>>> (osd.14) is down. That was not the case with argonaut tunables, as
>>>> far as I remember.
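>>>>
>>>> (A quick way to look at the up/acting sets for that PG, in case it
>>>> helps, is something like:
>>>>
>>>> # ceph pg map 179.38
>>>>
>>>> which prints the current up and acting OSD lists.)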
>>>>
>>>> Regards
>>>>
>>>>
>>>> On 23 July 2016 at 06:16, Brad Hubbard <[email protected]> wrote:
>>>>> On Sat, Jul 23, 2016 at 12:17 AM, Kostis Fardelas <[email protected]>
>>>>> wrote:
>>>>>> Hello,
>>>>>> being on the latest Hammer, I think I hit a bug with tunables more
>>>>>> recent than the legacy ones.
>>>>>>
>>>>>> Having been on legacy tunables for a while, I decided to experiment
>>>>>> with "better" tunables. So first I went from the argonaut profile to
>>>>>> bobtail, and then to firefly. However, I decided to make the changes
>>>>>> to chooseleaf_vary_r incrementally (because the remapping from 0 to 5
>>>>>> was huge), going from 5 down to the best value (1). When I reached
>>>>>> chooseleaf_vary_r = 2, I decided to run a simple test before going to
>>>>>> chooseleaf_vary_r = 1: stop an OSD (osd.14) and let the cluster
>>>>>> recover. But the recovery never completes, and a PG remains stuck,
>>>>>> reported as undersized+degraded. No OSD is near full, and all pools
>>>>>> have min_size=1.
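>>>>>>
>>>>>> For the record, the way I changed chooseleaf_vary_r each time was
>>>>>> the usual decompile/edit/recompile cycle, roughly like this (file
>>>>>> names are just examples):
>>>>>>
>>>>>> # ceph osd getcrushmap -o cm.bin
>>>>>> # crushtool -d cm.bin -o cm.txt
>>>>>> (edit cm.txt, e.g. set "tunable chooseleaf_vary_r 2")
>>>>>> # crushtool -c cm.txt -o cm.new.bin
>>>>>> # ceph osd setcrushmap -i cm.new.bin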
>>>>>>
>>>>>> ceph osd crush show-tunables -f json-pretty
>>>>>>
>>>>>> {
>>>>>> "choose_local_tries": 0,
>>>>>> "choose_local_fallback_tries": 0,
>>>>>> "choose_total_tries": 50,
>>>>>> "chooseleaf_descend_once": 1,
>>>>>> "chooseleaf_vary_r": 2,
>>>>>> "straw_calc_version": 1,
>>>>>> "allowed_bucket_algs": 22,
>>>>>> "profile": "unknown",
>>>>>> "optimal_tunables": 0,
>>>>>> "legacy_tunables": 0,
>>>>>> "require_feature_tunables": 1,
>>>>>> "require_feature_tunables2": 1,
>>>>>> "require_feature_tunables3": 1,
>>>>>> "has_v2_rules": 0,
>>>>>> "has_v3_rules": 0,
>>>>>> "has_v4_buckets": 0
>>>>>> }
>>>>>>
>>>>>> The really strange thing is that the OSDs of the stuck PG belong to
>>>>>> nodes other than the one hosting the OSD I stopped (osd.14).
>>>>>>
>>>>>> # ceph pg dump_stuck
>>>>>> ok
>>>>>> pg_stat state up up_primary acting acting_primary
>>>>>> 179.38 active+undersized+degraded [2,8] 2 [2,8] 2
>>>>>
>>>>> Can you share a query of this pg?
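>>>>> i.e. the output of something like:
>>>>>
>>>>> # ceph pg 179.38 query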
>>>>>
>>>>> What size (not min size) is this pool (assuming it's 2)?
>>>>>
>>>>>>
>>>>>>
>>>>>> ID WEIGHT TYPE NAME UP/DOWN REWEIGHT PRIMARY-AFFINITY
>>>>>> -1 11.19995 root default
>>>>>> -3 11.19995 rack unknownrack
>>>>>> -2 0.39999 host staging-rd0-03
>>>>>> 14 0.20000 osd.14 up 1.00000 1.00000
>>>>>> 15 0.20000 osd.15 up 1.00000 1.00000
>>>>>> -8 5.19998 host staging-rd0-01
>>>>>> 6 0.59999 osd.6 up 1.00000 1.00000
>>>>>> 7 0.59999 osd.7 up 1.00000 1.00000
>>>>>> 8 1.00000 osd.8 up 1.00000 1.00000
>>>>>> 9 1.00000 osd.9 up 1.00000 1.00000
>>>>>> 10 1.00000 osd.10 up 1.00000 1.00000
>>>>>> 11 1.00000 osd.11 up 1.00000 1.00000
>>>>>> -7 5.19998 host staging-rd0-00
>>>>>> 0 0.59999 osd.0 up 1.00000 1.00000
>>>>>> 1 0.59999 osd.1 up 1.00000 1.00000
>>>>>> 2 1.00000 osd.2 up 1.00000 1.00000
>>>>>> 3 1.00000 osd.3 up 1.00000 1.00000
>>>>>> 4 1.00000 osd.4 up 1.00000 1.00000
>>>>>> 5 1.00000 osd.5 up 1.00000 1.00000
>>>>>> -4 0.39999 host staging-rd0-02
>>>>>> 12 0.20000 osd.12 up 1.00000 1.00000
>>>>>> 13 0.20000 osd.13 up 1.00000 1.00000
>>>>>>
>>>>>>
>>>>>> Have you experienced something similar?
>>>>>>
>>>>>> Regards,
>>>>>> Kostis
>>>>>
>>>>>
>>>>>
>>>>> --
>>>>> Cheers,
>>>>> Brad
>
>
>
> --
> Cheers,
> Brad
_______________________________________________
ceph-users mailing list
[email protected]
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com