Hello,
We have a cluster in HEALTH_ERR due to an inconsistent PG:
HEALTH_ERR 1 pgs inconsistent; 1 scrub errors
pg 2.ae is active+clean+inconsistent, acting [11,4]
1 scrub errors
We ran ceph pg repair on the problematic PG and the health went back to OK.
I checked the two OSDs acting for that PG (we have 2 replicas here) and
one of them had I/O errors, which we assume caused the inconsistent PG
in the first place. So, to avoid further problems, we want to remove
that disk from the cluster. However, as soon as we stop the OSD, the PG
becomes inconsistent again and recovery won't start.
Any idea what could be happening? Why does the PG go back to being
inconsistent? How can we remove the failing disk?
I can't find any ERR entries in the OSD logs, only in the monitor logs,
so I can't tell whether a specific object is causing the inconsistent
state (that doesn't seem to be the case).
I attach the output of ceph pg query, taken while the cluster was in HEALTH_ERR.
Any help would be much appreciated. Thanks!
--
Ana Avilés
Greenhost - sustainable hosting & digital security
E: [email protected]
T: +31 20 4890444
W: https://greenhost.nl
{
"state": "active+clean+inconsistent",
"snap_trimq": "[]",
"epoch": 198938,
"up": [
11,
4
],
"acting": [
11,
4
],
"actingbackfill": [
"4",
"11"
],
"info": {
"pgid": "2.ae",
"last_update": "198925'737155",
"last_complete": "198925'737155",
"log_tail": "198005'733960",
"last_user_version": 737036,
"last_backfill": "MAX",
"last_backfill_bitwise": 0,
"purged_snaps":
"[1~12d,131~2538,266a~19,2686~2,2689~7,2691~4b0,2b43~8,2b4c~3,2b50~c,2b61~24,2b88~c,2b9a~11,2bb2~5,2bb8~2,2bbb~4,2bc0~1416,3fd7~1730,5708~9,5712~88b,5f9e~1373,7312~9c0,7cd3~5,7cd9~e36,8b10~935,9446~c0b,a053~29,a07d~1b7c,bbfa~1d8d,d988~c83,e60c~299c,10fa9~a7c,11a26~2719,14140~1,14144~8,1414d~1197,152e5~2098,1737e~264a,199c9~11,199db~57e,19f5a~1c2e,1bb89~2b3,1bed9~b3,1bf91~1,1bfa2~4e0,1c483~187,1c60b~113e,1d74a~15c,1d8a7~31,1d8d9~4,1d8de~6d1,1e04c~b2,1e102~1,1e111~422,1e534~2a,1e55f~140,1e739~14e,1e899~15d,1ea8d~b4,1eb51~59d,1f0ef~1d6,1f35f~b0,1f41e~24,1f443~179,1f5bd~666,1fcbc~b4,1fd80~1aa,1ff2b~3a9,20369~ab,204bb~aa,2060d~b1,20764~aa,208b5~aa,20a09~ac,20b5f~ab]",
"history": {
"epoch_created": 1,
"last_epoch_started": 198937,
"last_epoch_clean": 198937,
"last_epoch_split": 0,
"last_epoch_marked_full": 154843,
"same_up_since": 198936,
"same_interval_since": 198936,
"same_primary_since": 198936,
"last_scrub": "198925'737155",
"last_scrub_stamp": "2016-07-29 19:07:48.694564",
"last_deep_scrub": "198925'737155",
"last_deep_scrub_stamp": "2016-07-29 19:07:48.694564",
"last_clean_scrub_stamp": "2016-07-29 19:07:48.694564"
},
"stats": {
"version": "198925'737155",
"reported_seq": "1226184",
"reported_epoch": "198937",
"state": "active+clean+inconsistent",
"last_fresh": "2016-07-29 19:14:30.876365",
"last_change": "2016-07-29 19:14:30.876365",
"last_active": "2016-07-29 19:14:30.876365",
"last_peered": "2016-07-29 19:14:30.876365",
"last_clean": "2016-07-29 19:14:30.876365",
"last_became_active": "0.000000",
"last_became_peered": "0.000000",
"last_unstale": "2016-07-29 19:14:30.876365",
"last_undegraded": "2016-07-29 19:14:30.876365",
"last_fullsized": "2016-07-29 19:14:30.876365",
"mapping_epoch": 198933,
"log_start": "198005'733960",
"ondisk_log_start": "198005'733960",
"created": 1,
"last_epoch_clean": 198937,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "198925'737155",
"last_scrub_stamp": "2016-07-29 19:07:48.694564",
"last_deep_scrub": "198925'737155",
"last_deep_scrub_stamp": "2016-07-29 19:07:48.694564",
"last_clean_scrub_stamp": "2016-07-29 19:07:48.694564",
"log_size": 3195,
"ondisk_log_size": 3195,
"stats_invalid": "0",
"stat_sum": {
"num_bytes": 15776131072,
"num_objects": 5373,
"num_object_clones": 2080,
"num_object_copies": 10746,
"num_objects_missing_on_primary": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 5373,
"num_whiteouts": 0,
"num_read": 78438,
"num_read_kb": 3068887,
"num_write": 104414,
"num_write_kb": 32078753,
"num_scrub_errors": 1,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 1,
"num_objects_recovered": 4258,
"num_bytes_recovered": 15611940864,
"num_keys_recovered": 0,
"num_objects_omap": 1,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0
},
"up": [
11,
4
],
"acting": [
11,
4
],
"blocked_by": [],
"up_primary": 11,
"acting_primary": 11
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 198937,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
},
"peer_info": [
{
"peer": "4",
"pgid": "2.ae",
"last_update": "198925'737155",
"last_complete": "198925'737155",
"log_tail": "198005'733960",
"last_user_version": 737036,
"last_backfill": "MAX",
"last_backfill_bitwise": 0,
"purged_snaps":
"[1~12d,131~2538,266a~19,2686~2,2689~7,2691~4b0,2b43~8,2b4c~3,2b50~c,2b61~24,2b88~c,2b9a~11,2bb2~5,2bb8~2,2bbb~4,2bc0~1416,3fd7~1730,5708~9,5712~88b,5f9e~1373,7312~9c0,7cd3~5,7cd9~e36,8b10~935,9446~c0b,a053~29,a07d~1b7c,bbfa~1d8d,d988~c83,e60c~299c,10fa9~a7c,11a26~2719,14140~1,14144~8,1414d~1197,152e5~2098,1737e~264a,199c9~11,199db~57e,19f5a~1c2e,1bb89~2b3,1bed9~b3,1bf91~1,1bfa2~4e0,1c483~187,1c60b~113e,1d74a~15c,1d8a7~31,1d8d9~4,1d8de~6d1,1e04c~b2,1e102~1,1e111~422,1e534~2a,1e55f~140,1e739~14e,1e899~15d,1ea8d~b4,1eb51~59d,1f0ef~1d6,1f35f~b0,1f41e~24,1f443~179,1f5bd~666,1fcbc~b4,1fd80~1aa,1ff2b~3a9,20369~ab,204bb~aa,2060d~b1,20764~aa,208b5~aa,20a09~ac,20b5f~ab]",
"history": {
"epoch_created": 1,
"last_epoch_started": 198937,
"last_epoch_clean": 198937,
"last_epoch_split": 0,
"last_epoch_marked_full": 154843,
"same_up_since": 198936,
"same_interval_since": 198936,
"same_primary_since": 198936,
"last_scrub": "198925'737155",
"last_scrub_stamp": "2016-07-29 19:07:48.694564",
"last_deep_scrub": "198925'737155",
"last_deep_scrub_stamp": "2016-07-29 19:07:48.694564",
"last_clean_scrub_stamp": "2016-07-29 19:07:48.694564"
},
"stats": {
"version": "198925'737155",
"reported_seq": "1226180",
"reported_epoch": "198936",
"state": "active+undersized+degraded+inconsistent",
"last_fresh": "2016-07-29 19:13:58.096548",
"last_change": "2016-07-29 19:13:58.095929",
"last_active": "2016-07-29 19:13:58.096548",
"last_peered": "2016-07-29 19:13:58.096548",
"last_clean": "2016-07-29 18:58:57.531797",
"last_became_active": "0.000000",
"last_became_peered": "0.000000",
"last_unstale": "2016-07-29 19:13:58.096548",
"last_undegraded": "2016-07-29 19:13:57.932878",
"last_fullsized": "2016-07-29 19:13:57.932878",
"mapping_epoch": 198933,
"log_start": "198005'733960",
"ondisk_log_start": "198005'733960",
"created": 1,
"last_epoch_clean": 198934,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "198925'737155",
"last_scrub_stamp": "2016-07-29 19:07:48.694564",
"last_deep_scrub": "198925'737155",
"last_deep_scrub_stamp": "2016-07-29 19:07:48.694564",
"last_clean_scrub_stamp": "2016-07-29 19:07:48.694564",
"log_size": 3195,
"ondisk_log_size": 3195,
"stats_invalid": "0",
"stat_sum": {
"num_bytes": 15776131072,
"num_objects": 5373,
"num_object_clones": 2080,
"num_object_copies": 10746,
"num_objects_missing_on_primary": 0,
"num_objects_degraded": 5373,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 5373,
"num_whiteouts": 0,
"num_read": 78438,
"num_read_kb": 3068887,
"num_write": 104414,
"num_write_kb": 32078753,
"num_scrub_errors": 1,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 1,
"num_objects_recovered": 4258,
"num_bytes_recovered": 15611940864,
"num_keys_recovered": 0,
"num_objects_omap": 1,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0
},
"up": [
11,
4
],
"acting": [
11,
4
],
"blocked_by": [],
"up_primary": 11,
"acting_primary": 11
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 198937,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
}
],
"recovery_state": [
{
"name": "Started\/Primary\/Active",
"enter_time": "2016-07-29 19:14:30.841075",
"might_have_unfound": [],
"recovery_progress": {
"backfill_targets": [],
"waiting_on_backfill": [],
"last_backfill_started": "MIN",
"backfill_info": {
"begin": "MIN",
"end": "MIN",
"objects": []
},
"peer_backfill_info": [],
"backfills_in_flight": [],
"recovering": [],
"pg_backend": {
"pull_from_peer": [],
"pushing": []
}
},
"scrub": {
"scrubber.epoch_start": "0",
"scrubber.active": 0,
"scrubber.waiting_on": 0,
"scrubber.waiting_on_whom": []
}
},
{
"name": "Started",
"enter_time": "2016-07-29 19:14:29.818707"
}
],
"agent_state": {}
}
_______________________________________________
ceph-users mailing list
[email protected]
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com