kas wrote:
: Marc,
:
: Marc Roos wrote:
: : Are you sure your osd's are up and reachable? (run ceph osd tree on
: : another node)
:
: They are up, because all three mons see them as up.
: However, ceph osd tree provided the hint (thanks!): The OSD host came back up
: with hostname "localhost" instead of the correct one for some reason.
: So the OSDs moved themselves to a new HOST=localhost CRUSH node directly
: under the CRUSH root. I rebooted the OSD host once again, and it came up
: again with the correct hostname, and the "ceph osd tree" output looks sane
: now. So I guess we have the reason for such a huge rebalance.
:
: However, even though the OSD tree is back in the normal state,
: the rebalance is still going on, and there are even inactive PGs,
: with some Ceph clients being stuck seemingly forever:
:
: health: HEALTH_ERR
: 1964645/3977451 objects misplaced (49.395%)
: Reduced data availability: 11 pgs inactive
Taking a wild guess at what to do, I went to the rebooted OSD host and ran
systemctl restart ceph-osd.target
- restarting all OSD processes. The previously inactive (activating) pgs
went to the active state, and Ceph clients got unstuck. Now I see
HEALTH_ERR with backfill_toofull only, which I consider a normal state
during Ceph Mimic rebalance.
It would be interesting to know why some of the PGs got stuck,
and why the restart helped. FWIW, I have a "ceph pg query" output for
one of the 11 inactive PGs.
-Yenya
-------------------------------------------
# ceph pg 23.4f5 query
{
"state": "activating+remapped",
"snap_trimq": "[]",
"snap_trimq_len": 0,
"epoch": 104015,
"up": [
70,
72,
27
],
"acting": [
25,
27,
79
],
"backfill_targets": [
"70",
"72"
],
"acting_recovery_backfill": [
"25",
"27",
"70",
"72",
"79"
],
"info": {
"pgid": "23.4f5",
"last_update": "103035'4667973",
"last_complete": "103035'4667973",
"log_tail": "102489'4664889",
"last_user_version": 4667973,
"last_backfill": "MAX",
"last_backfill_bitwise": 1,
"purged_snaps": [],
"history": {
"epoch_created": 406,
"epoch_pool_created": 406,
"last_epoch_started": 103086,
"last_interval_started": 103085,
"last_epoch_clean": 96881,
"last_interval_clean": 96880,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 103095,
"same_interval_since": 103095,
"same_primary_since": 95398,
"last_scrub": "102517'4667556",
"last_scrub_stamp": "2019-05-15 01:07:28.978979",
"last_deep_scrub": "102491'4666011",
"last_deep_scrub_stamp": "2019-05-08 07:20:08.253942",
"last_clean_scrub_stamp": "2019-05-15 01:07:28.978979"
},
"stats": {
"version": "103035'4667973",
"reported_seq": "2116838",
"reported_epoch": "104015",
"state": "activating+remapped",
"last_fresh": "2019-05-15 16:19:44.530005",
"last_change": "2019-05-15 14:56:04.248887",
"last_active": "2019-05-15 14:56:02.579506",
"last_peered": "2019-05-15 14:56:01.401941",
"last_clean": "2019-05-15 14:53:39.291350",
"last_became_active": "2019-05-15 14:55:54.163102",
"last_became_peered": "2019-05-15 14:55:54.163102",
"last_unstale": "2019-05-15 16:19:44.530005",
"last_undegraded": "2019-05-15 16:19:44.530005",
"last_fullsized": "2019-05-15 16:19:44.530005",
"mapping_epoch": 103095,
"log_start": "102489'4664889",
"ondisk_log_start": "102489'4664889",
"created": 406,
"last_epoch_clean": 96881,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "102517'4667556",
"last_scrub_stamp": "2019-05-15 01:07:28.978979",
"last_deep_scrub": "102491'4666011",
"last_deep_scrub_stamp": "2019-05-08 07:20:08.253942",
"last_clean_scrub_stamp": "2019-05-15 01:07:28.978979",
"log_size": 3084,
"ondisk_log_size": 3084,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": true,
"manifest_stats_invalid": true,
"snaptrimq_len": 0,
"stat_sum": {
"num_bytes": 2641321984,
"num_objects": 633,
"num_object_clones": 49,
"num_object_copies": 1899,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 1266,
"num_objects_unfound": 0,
"num_objects_dirty": 633,
"num_whiteouts": 0,
"num_read": 1263624,
"num_read_kb": 49804648,
"num_write": 5054985,
"num_write_kb": 76175293,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 6507,
"num_bytes_recovered": 27291253248,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0,
"num_legacy_snapsets": 0,
"num_large_omap_objects": 0,
"num_objects_manifest": 0
},
"up": [
70,
72,
27
],
"acting": [
25,
27,
79
],
"blocked_by": [],
"up_primary": 70,
"acting_primary": 25,
"purged_snaps": []
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 103096,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
},
"peer_info": [
{
"peer": "27",
"pgid": "23.4f5",
"last_update": "103035'4667973",
"last_complete": "103035'4667973",
"log_tail": "102489'4664889",
"last_user_version": 4667973,
"last_backfill": "MAX",
"last_backfill_bitwise": 1,
"purged_snaps": [],
"history": {
"epoch_created": 406,
"epoch_pool_created": 406,
"last_epoch_started": 103086,
"last_interval_started": 103085,
"last_epoch_clean": 96881,
"last_interval_clean": 96880,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 103095,
"same_interval_since": 103095,
"same_primary_since": 95398,
"last_scrub": "102517'4667556",
"last_scrub_stamp": "2019-05-15 01:07:28.978979",
"last_deep_scrub": "102491'4666011",
"last_deep_scrub_stamp": "2019-05-08 07:20:08.253942",
"last_clean_scrub_stamp": "2019-05-15 01:07:28.978979"
},
"stats": {
"version": "102814'4667972",
"reported_seq": "2115836",
"reported_epoch": "103035",
"state": "active+clean",
"last_fresh": "2019-05-15 14:52:36.025409",
"last_change": "2019-05-15 01:07:28.979033",
"last_active": "2019-05-15 14:52:36.025409",
"last_peered": "2019-05-15 14:52:36.025409",
"last_clean": "2019-05-15 14:52:36.025409",
"last_became_active": "2019-04-26 06:30:50.855477",
"last_became_peered": "2019-04-26 06:30:50.855477",
"last_unstale": "2019-05-15 14:52:36.025409",
"last_undegraded": "2019-05-15 14:52:36.025409",
"last_fullsized": "2019-05-15 14:52:36.025409",
"mapping_epoch": 103095,
"log_start": "102489'4664889",
"ondisk_log_start": "102489'4664889",
"created": 406,
"last_epoch_clean": 96881,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "102517'4667556",
"last_scrub_stamp": "2019-05-15 01:07:28.978979",
"last_deep_scrub": "102491'4666011",
"last_deep_scrub_stamp": "2019-05-08 07:20:08.253942",
"last_clean_scrub_stamp": "2019-05-15 01:07:28.978979",
"log_size": 3083,
"ondisk_log_size": 3083,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": true,
"manifest_stats_invalid": true,
"snaptrimq_len": 0,
"stat_sum": {
"num_bytes": 2641321984,
"num_objects": 633,
"num_object_clones": 49,
"num_object_copies": 1899,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 633,
"num_whiteouts": 0,
"num_read": 1263624,
"num_read_kb": 49804648,
"num_write": 5054985,
"num_write_kb": 76175293,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 6507,
"num_bytes_recovered": 27291253248,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0,
"num_legacy_snapsets": 0,
"num_large_omap_objects": 0,
"num_objects_manifest": 0
},
"up": [
70,
72,
27
],
"acting": [
25,
27,
79
],
"blocked_by": [],
"up_primary": 70,
"acting_primary": 25,
"purged_snaps": []
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 103086,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
},
{
"peer": "70",
"pgid": "23.4f5",
"last_update": "103035'4667973",
"last_complete": "103035'4667973",
"log_tail": "102489'4664973",
"last_user_version": 0,
"last_backfill": "MIN",
"last_backfill_bitwise": 1,
"purged_snaps": [],
"history": {
"epoch_created": 406,
"epoch_pool_created": 406,
"last_epoch_started": 103086,
"last_interval_started": 103085,
"last_epoch_clean": 96881,
"last_interval_clean": 96880,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 103095,
"same_interval_since": 103095,
"same_primary_since": 95398,
"last_scrub": "102517'4667556",
"last_scrub_stamp": "2019-05-15 01:07:28.978979",
"last_deep_scrub": "102491'4666011",
"last_deep_scrub_stamp": "2019-05-08 07:20:08.253942",
"last_clean_scrub_stamp": "2019-05-15 01:07:28.978979"
},
"stats": {
"version": "0'0",
"reported_seq": "0",
"reported_epoch": "0",
"state": "unknown",
"last_fresh": "0.000000",
"last_change": "0.000000",
"last_active": "0.000000",
"last_peered": "0.000000",
"last_clean": "0.000000",
"last_became_active": "0.000000",
"last_became_peered": "0.000000",
"last_unstale": "0.000000",
"last_undegraded": "0.000000",
"last_fullsized": "0.000000",
"mapping_epoch": 0,
"log_start": "0'0",
"ondisk_log_start": "0'0",
"created": 0,
"last_epoch_clean": 0,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "0'0",
"last_scrub_stamp": "0.000000",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "0.000000",
"last_clean_scrub_stamp": "0.000000",
"log_size": 0,
"ondisk_log_size": 0,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": false,
"manifest_stats_invalid": false,
"snaptrimq_len": 0,
"stat_sum": {
"num_bytes": 0,
"num_objects": 0,
"num_object_clones": 0,
"num_object_copies": 0,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 633,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 0,
"num_whiteouts": 0,
"num_read": 0,
"num_read_kb": 0,
"num_write": 0,
"num_write_kb": 0,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 0,
"num_bytes_recovered": 0,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0,
"num_legacy_snapsets": 0,
"num_large_omap_objects": 0,
"num_objects_manifest": 0
},
"up": [],
"acting": [],
"blocked_by": [],
"up_primary": -1,
"acting_primary": -1,
"purged_snaps": []
},
"empty": 0,
"dne": 0,
"incomplete": 1,
"last_epoch_started": 103096,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
},
{
"peer": "72",
"pgid": "23.4f5",
"last_update": "103035'4667973",
"last_complete": "103035'4667973",
"log_tail": "102489'4664973",
"last_user_version": 0,
"last_backfill": "MIN",
"last_backfill_bitwise": 1,
"purged_snaps": [],
"history": {
"epoch_created": 406,
"epoch_pool_created": 406,
"last_epoch_started": 103086,
"last_interval_started": 103085,
"last_epoch_clean": 96881,
"last_interval_clean": 96880,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 103095,
"same_interval_since": 103095,
"same_primary_since": 95398,
"last_scrub": "102517'4667556",
"last_scrub_stamp": "2019-05-15 01:07:28.978979",
"last_deep_scrub": "102491'4666011",
"last_deep_scrub_stamp": "2019-05-08 07:20:08.253942",
"last_clean_scrub_stamp": "2019-05-15 01:07:28.978979"
},
"stats": {
"version": "0'0",
"reported_seq": "0",
"reported_epoch": "0",
"state": "unknown",
"last_fresh": "0.000000",
"last_change": "0.000000",
"last_active": "0.000000",
"last_peered": "0.000000",
"last_clean": "0.000000",
"last_became_active": "0.000000",
"last_became_peered": "0.000000",
"last_unstale": "0.000000",
"last_undegraded": "0.000000",
"last_fullsized": "0.000000",
"mapping_epoch": 103095,
"log_start": "0'0",
"ondisk_log_start": "0'0",
"created": 0,
"last_epoch_clean": 0,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "0'0",
"last_scrub_stamp": "0.000000",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "0.000000",
"last_clean_scrub_stamp": "0.000000",
"log_size": 0,
"ondisk_log_size": 0,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": false,
"manifest_stats_invalid": false,
"snaptrimq_len": 0,
"stat_sum": {
"num_bytes": 0,
"num_objects": 0,
"num_object_clones": 0,
"num_object_copies": 0,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 633,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 0,
"num_whiteouts": 0,
"num_read": 0,
"num_read_kb": 0,
"num_write": 0,
"num_write_kb": 0,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 0,
"num_bytes_recovered": 0,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0,
"num_legacy_snapsets": 0,
"num_large_omap_objects": 0,
"num_objects_manifest": 0
},
"up": [
70,
72,
27
],
"acting": [
25,
27,
79
],
"blocked_by": [],
"up_primary": 70,
"acting_primary": 25,
"purged_snaps": []
},
"empty": 0,
"dne": 0,
"incomplete": 1,
"last_epoch_started": 103086,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
},
{
"peer": "79",
"pgid": "23.4f5",
"last_update": "103035'4667973",
"last_complete": "103035'4667973",
"log_tail": "102489'4664889",
"last_user_version": 4667973,
"last_backfill": "MAX",
"last_backfill_bitwise": 1,
"purged_snaps": [],
"history": {
"epoch_created": 406,
"epoch_pool_created": 406,
"last_epoch_started": 103086,
"last_interval_started": 103085,
"last_epoch_clean": 96881,
"last_interval_clean": 96880,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 103095,
"same_interval_since": 103095,
"same_primary_since": 95398,
"last_scrub": "102517'4667556",
"last_scrub_stamp": "2019-05-15 01:07:28.978979",
"last_deep_scrub": "102491'4666011",
"last_deep_scrub_stamp": "2019-05-08 07:20:08.253942",
"last_clean_scrub_stamp": "2019-05-15 01:07:28.978979"
},
"stats": {
"version": "102814'4667972",
"reported_seq": "2115836",
"reported_epoch": "103035",
"state": "active+clean",
"last_fresh": "2019-05-15 14:52:36.025409",
"last_change": "2019-05-15 01:07:28.979033",
"last_active": "2019-05-15 14:52:36.025409",
"last_peered": "2019-05-15 14:52:36.025409",
"last_clean": "2019-05-15 14:52:36.025409",
"last_became_active": "2019-04-26 06:30:50.855477",
"last_became_peered": "2019-04-26 06:30:50.855477",
"last_unstale": "2019-05-15 14:52:36.025409",
"last_undegraded": "2019-05-15 14:52:36.025409",
"last_fullsized": "2019-05-15 14:52:36.025409",
"mapping_epoch": 103095,
"log_start": "102489'4664889",
"ondisk_log_start": "102489'4664889",
"created": 406,
"last_epoch_clean": 96881,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "102517'4667556",
"last_scrub_stamp": "2019-05-15 01:07:28.978979",
"last_deep_scrub": "102491'4666011",
"last_deep_scrub_stamp": "2019-05-08 07:20:08.253942",
"last_clean_scrub_stamp": "2019-05-15 01:07:28.978979",
"log_size": 3083,
"ondisk_log_size": 3083,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": false,
"pin_stats_invalid": true,
"manifest_stats_invalid": true,
"snaptrimq_len": 0,
"stat_sum": {
"num_bytes": 2641321984,
"num_objects": 633,
"num_object_clones": 49,
"num_object_copies": 1899,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 633,
"num_whiteouts": 0,
"num_read": 1263624,
"num_read_kb": 49804648,
"num_write": 5054985,
"num_write_kb": 76175293,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 6507,
"num_bytes_recovered": 27291253248,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0,
"num_legacy_snapsets": 0,
"num_large_omap_objects": 0,
"num_objects_manifest": 0
},
"up": [
70,
72,
27
],
"acting": [
25,
27,
79
],
"blocked_by": [],
"up_primary": 70,
"acting_primary": 25,
"purged_snaps": []
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 96881,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
}
],
"recovery_state": [
{
"name": "Started/Primary/Active",
"enter_time": "2019-05-15 14:56:04.242725",
"might_have_unfound": [],
"recovery_progress": {
"backfill_targets": [
"70",
"72"
],
"waiting_on_backfill": [],
"last_backfill_started": "MIN",
"backfill_info": {
"begin": "MIN",
"end": "MIN",
"objects": []
},
"peer_backfill_info": [],
"backfills_in_flight": [],
"recovering": [],
"pg_backend": {
"pull_from_peer": [],
"pushing": []
}
},
"scrub": {
"scrubber.epoch_start": "96880",
"scrubber.active": false,
"scrubber.state": "INACTIVE",
"scrubber.start": "MIN",
"scrubber.end": "MIN",
"scrubber.max_end": "MIN",
"scrubber.subset_last_update": "0'0",
"scrubber.deep": false,
"scrubber.waiting_on_whom": []
}
},
{
"name": "Started",
"enter_time": "2019-05-15 14:56:03.673622"
}
],
"agent_state": {}
}
-------------------------------------------
--
| Jan "Yenya" Kasprzak <kas at {fi.muni.cz - work | yenya.net - private}> |
| http://www.fi.muni.cz/~kas/ GPG: 4096R/A45477D5 |
sir_clive> I hope you don't mind if I steal some of your ideas?
laryross> As far as stealing... we call it sharing here. --from rcgroups
_______________________________________________
ceph-users mailing list
[email protected]
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com