Hi Vivien,

what I noticed immediately are your pool settings:
You have set size 3 *min_size 3* on the pools, but it should be size 3 *min_size 2*. The consequence of this is that every time an OSD fails or is stopped, the PGs of those pools drop below min_size and go inactive (offline), which is exactly what 'ceph osd ok-to-stop' is warning you about.
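Something along these lines should fix it (an untested sketch; the pool list is taken from the 'ceph osd pool ls detail' output you pasted below, so double-check it against your cluster before running it):

    # Set min_size back to 2 on every replicated size-3 pool that still has min_size 3.
    # Pools 4 (cephfs.toto-fs.data) and 18 (testzone.rgw.buckets.index) already have min_size 2.
    for pool in .mgr rbd cephfs.toto-fs.meta .nfs prbd .rgw.root \
                testzone.rgw.log testzone.rgw.control testzone.rgw.meta \
                pool_VM k8s cephfs.testfs.meta cephfs.testfs.data
    do
        ceph osd pool set "$pool" min_size 2
    done

    # Verify the result.
    ceph osd pool ls detail | grep min_size

After that, 'ceph osd ok-to-stop <OSD_ID>' should only report PGs that would become degraded (not inactive), and the upgrade should be able to stop the OSDs one by one.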
Regards, Joachim

joachim.kraftma...@clyso.com
www.clyso.com
Hohenzollernstr. 27, 80801 Munich
Utting | HR: Augsburg | HRB: 25866 | USt. ID-Nr.: DE275430677


On Mon, 18 Aug 2025 at 14:13, GLE, Vivien <vivien....@inist.fr> wrote:
>
> > 'ceph osd ok-to-stop' is a safety check, nothing more.
>
> Oh ok, I thought you could add true to ignore PGs becoming inactive.
> This is the output for one OSD; all of my OSDs are unsafe to stop.
>
> root@ceph-monitor-1:/# ceph osd ok-to-stop 1
> {"ok_to_stop":false,"osds":[1],"num_ok_pgs":30,"num_not_ok_pgs":170,"bad_become_inactive":["2.0","2.2","2.3","2.4","2.7","2.f","2.11","2.14","2.17","2.18","2.19","2.1a","2.1d","3.2","3.5","3.a","3.d","7.0","7.6","7.7","7.8","7.b","7.d","7.11","7.16","7.17","7.19","7.1f","10.2","10.9","10.b","10.c","10.d","10.e","10.10","10.11","10.14","10.1a","10.1b","10.1d","10.1f","11.1","11.4","11.5","11.7","11.9","11.a","11.c","11.d","11.e","11.11","11.13","11.14","11.15","11.16","11.1a","11.1b","11.1e","15.1","15.2","15.9","15.a","15.b","15.d","15.f","15.10","15.11","15.13","15.14","15.16","15.17","15.19","15.1c","15.1d","15.1f","16.2","16.3","16.5","16.6","16.9","16.c","16.d","16.f","16.18","16.19","16.1b","16.1c","16.1d","16.1e","17.6","17.a","17.b","17.c","17.d","17.10","17.11","17.18","17.1f","25.2","25.3","25.5","25.7","25.8","25.b","25.10","25.11","25.13","25.14","25.17","25.19","25.1a","25.1b","25.21","25.22","25.23","25.25","25.27","25.2a","25.2b","25.2e","25.31","25.35","25.37","25.3d","26.1","26.8","26.9","26.a","26.f","26.10","26.14","26.16","26.1a","26.1e","27.0","27.1","27.2","27.4","27.5","27.6","27.c","27.d","27.f","28.2","28.3","28.6","28.7","28.a","28.10","28.14","28.18","28.1a","28.1d","28.1e","28.1f","28.20","28.21","28.25","28.26","28.2a","28.2b","28.2c","28.2d","28.2f","28.33","28.34","28.37","28.3a","28.3d","28.3f"],"ok_become_degraded":["4.0","4.3","4.6","4.8","4.a","4.b","4.d","4.e","4.11","4.13","4.14","4.15","4.17","4.18","4.1b","4.1d","4.1f","18.0","18.2","18.6","18.c","18.d","18.e","18.14","18.16","18.17","18.19","18.1a","18.1d","18.1e"]}
> Error EBUSY: unsafe to stop osd(s) at this time (170 PGs are or would become offline)
>
> root@ceph-monitor-1:/# ceph osd pool ls detail
> pool 1 '.mgr' replicated size 3 min_size 3 crush_rule 0 object_hash rjenkins pg_num 1 pgp_num 1 autoscale_mode on last_change 886201 flags hashpspool stripe_width 0 pg_num_max 32 pg_num_min 1 application mgr read_balance_score 6.98
> pool 2 'rbd' replicated size 3 min_size 3 crush_rule 0 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 670923 lfor 0/1645/1643 flags hashpspool,selfmanaged_snaps stripe_width 0 application rbd read_balance_score 2.19
> pool 3 'cephfs.toto-fs.meta' replicated size 3 min_size 3 crush_rule 0 object_hash rjenkins pg_num 16 pgp_num 16 autoscale_mode on last_change 886203 lfor 0/0/64 flags hashpspool stripe_width 0 pg_autoscale_bias 4 pg_num_min 16 recovery_priority 5 application cephfs read_balance_score 2.19
> pool 4 'cephfs.toto-fs.data' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 3008 lfor 0/3008/3006 flags hashpspool,bulk max_bytes 32212254720 stripe_width 0 application cephfs read_balance_score 1.53
> pool 7 '.nfs' replicated size 3 min_size 3 crush_rule 0 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 886199 lfor 0/0/140 flags hashpspool stripe_width 0 application nfs read_balance_score 1.53
> pool 10 'prbd' replicated size 3 min_size 3 crush_rule 0 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 670919 lfor 0/0/833 flags hashpspool,selfmanaged_snaps stripe_width 0 application rbd read_balance_score 1.53
> pool 11 '.rgw.root' replicated size 3 min_size 3 crush_rule 0 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 886205 lfor 0/0/833 flags hashpspool stripe_width 0 application rgw read_balance_score 1.53
> pool 15 'testzone.rgw.log' replicated size 3 min_size 3 crush_rule 0 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 886225 lfor 0/0/1153 flags hashpspool stripe_width 0 application rgw read_balance_score 1.97
> pool 16 'testzone.rgw.control' replicated size 3 min_size 3 crush_rule 0 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 886213 lfor 0/0/1153 flags hashpspool stripe_width 0 application rgw read_balance_score 1.53
> pool 17 'testzone.rgw.meta' replicated size 3 min_size 3 crush_rule 0 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 886215 lfor 0/0/1875 flags hashpspool stripe_width 0 pg_autoscale_bias 4 application rgw read_balance_score 1.53
> pool 18 'testzone.rgw.buckets.index' replicated size 3 min_size 2 crush_rule 0 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 1965 lfor 0/0/1877 flags hashpspool stripe_width 0 pg_autoscale_bias 4 application rgw read_balance_score 1.31
> pool 25 'pool_VM' replicated size 3 min_size 3 crush_rule 0 object_hash rjenkins pg_num 64 pgp_num 64 autoscale_mode on last_change 886217 lfor 0/0/885938 flags hashpspool,selfmanaged_snaps max_bytes 107374182400 stripe_width 0 target_size_bytes 536870912000 application rbd read_balance_score 1.64
> pool 26 'k8s' replicated size 3 min_size 3 crush_rule 0 object_hash rjenkins pg_num 32 pgp_num 32 autoscale_mode on last_change 886233 lfor 0/0/886231 flags hashpspool,selfmanaged_snaps stripe_width 0 application rbd read_balance_score 1.53
> pool 27 'cephfs.testfs.meta' replicated size 3 min_size 3 crush_rule 0 object_hash rjenkins pg_num 16 pgp_num 16 autoscale_mode on last_change 886221 lfor 0/0/885939 flags hashpspool stripe_width 0 pg_autoscale_bias 4 pg_num_min 16 recovery_priority 5 application cephfs read_balance_score 1.75
> pool 28 'cephfs.testfs.data' replicated size 3 min_size 3 crush_rule 0 object_hash rjenkins pg_num 64 pgp_num 64 autoscale_mode on last_change 886242 lfor 0/0/886240 flags hashpspool,bulk stripe_width 0 application cephfs read_balance_score 1.31
>
> root@ceph-monitor-1:/# ceph osd df tree
> ID   CLASS  WEIGHT   REWEIGHT  SIZE     RAW USE  DATA    OMAP     META     AVAIL    %USE  VAR   PGS  STATUS  TYPE NAME
> -1          6.63879         -  6.4 TiB  104 GiB  95 GiB  164 KiB  9.6 GiB  6.3 TiB  1.60  1.00    -          root inist
> -15         0.90970         -  932 GiB   12 GiB  11 GiB   35 KiB  889 MiB  920 GiB  1.29  0.80    -            host ceph-monitor-1
> 6    hdd    0.90970   1.00000  932 GiB   12 GiB  11 GiB   35 KiB  889 MiB  920 GiB  1.29  0.80  197  up          osd.6
> -21         5.72910         -  5.5 TiB   92 GiB  84 GiB  129 KiB  8.7 GiB  5.4 TiB  1.65  1.03    -            datacenter dcml
> -20         2.72910         -  2.7 TiB   46 GiB  42 GiB   62 KiB  4.0 GiB  2.7 TiB  1.64  1.02    -              room it02
> -19         2.72910         -  2.7 TiB   46 GiB  42 GiB   62 KiB  4.0 GiB  2.7 TiB  1.64  1.02    -                row left
> -18         2.72910         -  2.7 TiB   46 GiB  42 GiB   62 KiB  4.0 GiB  2.7 TiB  1.64  1.02    -                  rack 10
> -3          0.90970         -  932 GiB   14 GiB  13 GiB   17 KiB  1.2 GiB  918 GiB  1.50  0.94    -                    host ceph-node-1
> 2    hdd    0.90970   1.00000  932 GiB   14 GiB  13 GiB   17 KiB  1.2 GiB  918 GiB  1.50  0.94  201  up                  osd.2
> -5          0.90970         -  932 GiB   16 GiB  14 GiB   28 KiB  1.6 GiB  916 GiB  1.70  1.06    -                    host ceph-node-2
> 1    hdd    0.90970   1.00000  932 GiB   16 GiB  14 GiB   28 KiB  1.6 GiB  916 GiB  1.70  1.06  200  up                  osd.1
> -9          0.90970         -  932 GiB   16 GiB  15 GiB   17 KiB  1.2 GiB  915 GiB  1.72  1.07    -                    host ceph-node-3
> 5    hdd    0.90970   1.00000  932 GiB   16 GiB  15 GiB   17 KiB  1.2 GiB  915 GiB  1.72  1.07  200  up                  osd.5
> -36         3.00000         -  2.7 TiB   47 GiB  42 GiB   67 KiB  4.8 GiB  2.7 TiB  1.67  1.04    -              room it06
> -35         3.00000         -  2.7 TiB   47 GiB  42 GiB   67 KiB  4.8 GiB  2.7 TiB  1.67  1.04    -                row left06
> -34         3.00000         -  2.7 TiB   47 GiB  42 GiB   67 KiB  4.8 GiB  2.7 TiB  1.67  1.04    -                  rack 08
> -7          1.00000         -  932 GiB   15 GiB  13 GiB   21 KiB  1.6 GiB  917 GiB  1.57  0.98    -                    host ceph-node-4
> 0    hdd    1.00000   1.00000  932 GiB   15 GiB  13 GiB   21 KiB  1.6 GiB  917 GiB  1.57  0.98  207  up                  osd.0
> -13         1.00000         -  932 GiB   15 GiB  14 GiB   31 KiB  1.6 GiB  916 GiB  1.63  1.02    -                    host ceph-node-5
> 3    hdd    1.00000   1.00000  932 GiB   15 GiB  14 GiB   31 KiB  1.6 GiB  916 GiB  1.63  1.02  217  up                  osd.3
> -11         1.00000         -  932 GiB   17 GiB  15 GiB   15 KiB  1.5 GiB  915 GiB  1.80  1.12    -                    host ceph-node-6
> 4    hdd    1.00000   1.00000  932 GiB   17 GiB  15 GiB   15 KiB  1.5 GiB  915 GiB  1.80  1.12  221  up                  osd.4
>                                TOTAL     6.4 TiB  104 GiB  95 GiB  168 KiB  9.6 GiB  6.3 TiB  1.60
> MIN/MAX VAR: 0.80/1.12  STDDEV: 0.16
>
> Vivien
>
> ________________________________
> From: Eugen Block <ebl...@nde.ag>
> Sent: Monday, 18 August 2025 12:37:14
> To: ceph-users@ceph.io
> Subject: [ceph-users] Re: Ceph upgrade OSD unsafe to stop
>
> Hi,
>
> 'ceph osd ok-to-stop' is a safety check, nothing more. It basically
> checks if PGs would become inactive if you stopped said OSD, or if
> those PGs would only become degraded. Which OSD reports that it's
> unsafe to stop? Can you paste the output of 'ceph osd ok-to-stop
> <OSD_ID>'? And with that also 'ceph osd pool ls detail' to see which
> pool(s) is/are affected. And 'ceph osd df tree' can also be useful here.
>
> Regards,
> Eugen
>
> Quoting "GLE, Vivien" <vivien....@inist.fr>:
>
> > Hi,
> >
> > I'm trying to upgrade my cluster (19.2.2 -> 19.2.3). The mon and mgr
> > upgrades went well, but I had some issues with the OSDs:
> >
> > Upgrade: unsafe to stop osd(s) at this time (165 PGs are or would
> > become offline)
> >
> > The cluster is in HEALTH_OK.
> > All pools are replica 3 and all PGs are active+clean.
> > The autoscaler is off, following the Ceph docs.
> >
> > Does ceph osd ok-to-stop lead to lost data?
> >
> > The only rule used in the cluster is replicated_rule:
> >
> > root@ceph-monitor-1:/# ceph osd crush rule dump replicated_rule
> > {
> >     "rule_id": 0,
> >     "rule_name": "replicated_rule",
> >     "type": 1,
> >     "steps": [
> >         {
> >             "op": "take",
> >             "item": -1,
> >             "item_name": "inist"
> >         },
> >         {
> >             "op": "chooseleaf_firstn",
> >             "num": 0,
> >             "type": "host"
> >         },
> >         {
> >             "op": "emit"
> >         }
> >     ]
> > }
> >
> > root@ceph-monitor-1:/# ceph osd tree
> > ID   CLASS  WEIGHT   TYPE NAME                      STATUS  REWEIGHT  PRI-AFF
> > -1          6.63879  root inist
> > -15         0.90970    host ceph-monitor-1
> > 6    hdd    0.90970      osd.6                      up       1.00000  1.00000
> > -21         5.72910    datacenter bat1
> > -20         2.72910      room room01
> > -19         2.72910        row left
> > -18         2.72910          rack 10
> > -3          0.90970            host ceph-node-1
> > 2    hdd    0.90970              osd.2              up       1.00000  1.00000
> > -5          0.90970            host ceph-node-2
> > 1    hdd    0.90970              osd.1              up       1.00000  1.00000
> > -9          0.90970            host ceph-node-3
> > 5    hdd    0.90970              osd.5              up       1.00000  1.00000
> > -36         3.00000      room room03
> > -35         3.00000        row left06
> > -34         3.00000          rack 08
> > -7          1.00000            host ceph-node-4
> > 0    hdd    1.00000              osd.0              up       1.00000  1.00000
> > -13         1.00000            host ceph-node-5
> > 3    hdd    1.00000              osd.3              up       1.00000  1.00000
> > -11         1.00000            host ceph-node-6
> > 4    hdd    1.00000              osd.4              up       1.00000  1.00000
> >
> > Thanks!
> >
> > Vivien

_______________________________________________
ceph-users mailing list -- ceph-users@ceph.io
To unsubscribe send an email to ceph-users-le...@ceph.io