Hello,
I am running into an "interesting" issue with a PG that is being flagged
as inconsistent during scrub (causing the cluster to go to HEALTH_ERR),
but doesn't actually appear to contain any inconsistent objects.
$ ceph health detail
HEALTH_ERR 1 scrub errors; Possible data damage: 1 pg inconsistent
OSD_SCRUB_ERRORS 1 scrub errors
PG_DAMAGED Possible data damage: 1 pg inconsistent
pg 10.10d is active+clean+inconsistent, acting [15,13]
$ rados list-inconsistent-obj 10.10d
{"epoch":12138,"inconsistents":[]}
"ceph pg query" (see below) on that PG does report num_scrub_errors=1,
num_shallow_scrub_errors=1, and num_objects_dirty=1. "osd scrub auto
repair = true" is set on all OSDs, but the PG never auto-repairs. (This
is a test cluster, the pool size is 2 — this may preclude auto repair
from ever kicking in; I'm not sure on that one.)
"ceph pg repair" does repair, but the issue reappears on the next
scheduled scrub.
This issue was first discovered while the cluster was on
Jewel/FileStore. In a case like this I would normally suspect either a
problem with an individual OSD, or a bug in the FileStore code. But the
cluster has had *all* of its OSDs replaced since, as part of a full
Jewel→Luminous→Nautilus upgrade and a FileStore→BlueStore conversion.
The issue still persists.
A full "ceph pg 10.10d query" result is below. If anyone has ideas on
how to permanently fix this issue, I'd be most grateful.
Thanks!
Cheers,
Florian
{
"state": "active+clean+inconsistent",
"snap_trimq": "[]",
"snap_trimq_len": 0,
"epoch": 12143,
"up": [
15,
13
],
"acting": [
15,
13
],
"acting_recovery_backfill": [
"13",
"15"
],
"info": {
"pgid": "10.10d",
"last_update": "100'11",
"last_complete": "100'11",
"log_tail": "0'0",
"last_user_version": 11,
"last_backfill": "MAX",
"last_backfill_bitwise": 0,
"purged_snaps": [],
"history": {
"epoch_created": 45,
"epoch_pool_created": 45,
"last_epoch_started": 12139,
"last_interval_started": 12138,
"last_epoch_clean": 12139,
"last_interval_clean": 12138,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 12138,
"same_interval_since": 12138,
"same_primary_since": 12114,
"last_scrub": "100'11",
"last_scrub_stamp": "2019-10-14 08:33:57.347097",
"last_deep_scrub": "100'11",
"last_deep_scrub_stamp": "2019-10-11 14:09:29.016946",
"last_clean_scrub_stamp": "2019-10-11 14:09:29.016946"
},
"stats": {
"version": "100'11",
"reported_seq": "4927",
"reported_epoch": "12143",
"state": "active+clean+inconsistent",
"last_fresh": "2019-10-14 08:33:57.347147",
"last_change": "2019-10-14 08:33:57.347147",
"last_active": "2019-10-14 08:33:57.347147",
"last_peered": "2019-10-14 08:33:57.347147",
"last_clean": "2019-10-14 08:33:57.347147",
"last_became_active": "2019-10-11 14:44:09.312226",
"last_became_peered": "2019-10-11 14:44:09.312226",
"last_unstale": "2019-10-14 08:33:57.347147",
"last_undegraded": "2019-10-14 08:33:57.347147",
"last_fullsized": "2019-10-14 08:33:57.347147",
"mapping_epoch": 12138,
"log_start": "0'0",
"ondisk_log_start": "0'0",
"created": 45,
"last_epoch_clean": 12139,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "100'11",
"last_scrub_stamp": "2019-10-14 08:33:57.347097",
"last_deep_scrub": "100'11",
"last_deep_scrub_stamp": "2019-10-11 14:09:29.016946",
"last_clean_scrub_stamp": "2019-10-11 14:09:29.016946",
"log_size": 11,
"ondisk_log_size": 11,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": true,
"pin_stats_invalid": true,
"manifest_stats_invalid": true,
"snaptrimq_len": 0,
"stat_sum": {
"num_bytes": 11,
"num_objects": 1,
"num_object_clones": 0,
"num_object_copies": 2,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 0,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 1,
"num_whiteouts": 0,
"num_read": 33,
"num_read_kb": 22,
"num_write": 11,
"num_write_kb": 6,
"num_scrub_errors": 1,
"num_shallow_scrub_errors": 1,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 2,
"num_bytes_recovered": 22,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0,
"num_legacy_snapsets": 0,
"num_large_omap_objects": 0,
"num_objects_manifest": 0,
"num_omap_bytes": 0,
"num_omap_keys": 0,
"num_objects_repaired": 0
},
"up": [
15,
13
],
"acting": [
15,
13
],
"avail_no_missing": [],
"object_location_counts": [],
"blocked_by": [],
"up_primary": 15,
"acting_primary": 15,
"purged_snaps": []
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 12139,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
},
"peer_info": [
{
"peer": "13",
"pgid": "10.10d",
"last_update": "100'11",
"last_complete": "100'11",
"log_tail": "0'0",
"last_user_version": 11,
"last_backfill": "MAX",
"last_backfill_bitwise": 0,
"purged_snaps": [],
"history": {
"epoch_created": 45,
"epoch_pool_created": 45,
"last_epoch_started": 12139,
"last_interval_started": 12138,
"last_epoch_clean": 12139,
"last_interval_clean": 12138,
"last_epoch_split": 0,
"last_epoch_marked_full": 0,
"same_up_since": 12138,
"same_interval_since": 12138,
"same_primary_since": 12114,
"last_scrub": "100'11",
"last_scrub_stamp": "2019-10-14 08:33:57.347097",
"last_deep_scrub": "100'11",
"last_deep_scrub_stamp": "2019-10-11 14:09:29.016946",
"last_clean_scrub_stamp": "2019-10-11 14:09:29.016946"
},
"stats": {
"version": "100'11",
"reported_seq": "36",
"reported_epoch": "12113",
"state": "active+undersized+degraded",
"last_fresh": "2019-10-11 14:39:58.946532",
"last_change": "2019-10-11 14:39:58.924989",
"last_active": "2019-10-11 14:39:58.946532",
"last_peered": "2019-10-11 14:39:58.946532",
"last_clean": "2014-11-05 15:48:35.131248",
"last_became_active": "2019-10-11 14:39:58.924989",
"last_became_peered": "2019-10-11 14:39:58.924989",
"last_unstale": "2019-10-11 14:39:58.946532",
"last_undegraded": "2019-10-11 14:39:58.892352",
"last_fullsized": "2019-10-11 14:39:58.892331",
"mapping_epoch": 12138,
"log_start": "0'0",
"ondisk_log_start": "0'0",
"created": 45,
"last_epoch_clean": 12103,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "100'11",
"last_scrub_stamp": "2019-10-11 14:09:29.016946",
"last_deep_scrub": "100'11",
"last_deep_scrub_stamp": "2019-10-11 14:09:29.016946",
"last_clean_scrub_stamp": "2019-10-11 14:09:29.016946",
"log_size": 11,
"ondisk_log_size": 11,
"stats_invalid": false,
"dirty_stats_invalid": false,
"omap_stats_invalid": false,
"hitset_stats_invalid": false,
"hitset_bytes_stats_invalid": true,
"pin_stats_invalid": true,
"manifest_stats_invalid": true,
"snaptrimq_len": 0,
"stat_sum": {
"num_bytes": 11,
"num_objects": 1,
"num_object_clones": 0,
"num_object_copies": 2,
"num_objects_missing_on_primary": 0,
"num_objects_missing": 0,
"num_objects_degraded": 1,
"num_objects_misplaced": 0,
"num_objects_unfound": 0,
"num_objects_dirty": 1,
"num_whiteouts": 0,
"num_read": 33,
"num_read_kb": 22,
"num_write": 11,
"num_write_kb": 6,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 2,
"num_bytes_recovered": 22,
"num_keys_recovered": 0,
"num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0,
"num_legacy_snapsets": 0,
"num_large_omap_objects": 0,
"num_objects_manifest": 0,
"num_omap_bytes": 0,
"num_omap_keys": 0,
"num_objects_repaired": 0
},
"up": [
15,
13
],
"acting": [
15,
13
],
"avail_no_missing": [
"13"
],
"object_location_counts": [
{
"shards": "13",
"objects": 1
}
],
"blocked_by": [],
"up_primary": 15,
"acting_primary": 15,
"purged_snaps": []
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 12139,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
}
],
"recovery_state": [
{
"name": "Started/Primary/Active",
"enter_time": "2019-10-11 14:44:09.175574",
"might_have_unfound": [],
"recovery_progress": {
"backfill_targets": [],
"waiting_on_backfill": [],
"last_backfill_started": "MIN",
"backfill_info": {
"begin": "MIN",
"end": "MIN",
"objects": []
},
"peer_backfill_info": [],
"backfills_in_flight": [],
"recovering": [],
"pg_backend": {
"pull_from_peer": [],
"pushing": []
}
},
"scrub": {
"scrubber.epoch_start": "12138",
"scrubber.active": false,
"scrubber.state": "INACTIVE",
"scrubber.start": "MIN",
"scrubber.end": "MIN",
"scrubber.max_end": "MIN",
"scrubber.subset_last_update": "0'0",
"scrubber.deep": false,
"scrubber.waiting_on_whom": []
}
},
{
"name": "Started",
"enter_time": "2019-10-11 14:44:08.833757"
}
],
"agent_state": {}
}
_______________________________________________
ceph-users mailing list -- [email protected]
To unsubscribe send an email to [email protected]