Hi Greg,
This is the output of the two commands (at 8:10 I stopped and started
osd.42; the restart commands themselves are sketched after the query output):
root@ceph-04:~# ceph pg dump_stuck inactive
ok
pg_stat objects mip degr unf bytes log disklog state state_stamp v reported up acting last_scrub scrub_stamp last_deep_scrub deep_scrub_stamp
6.289 0 0 0 0 0 0 0 incomplete 2014-02-16 08:10:54.623856 0'0 22149:342 [42,31] [42,31] 21088'14 2014-02-14 15:08:25.628985 0'0 2014-02-09 14:22:18.130409
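For reference, the mip/degr/unf columns in the dump header count objects
missing on the primary / degraded / unfound - all zero here, since the copy
on the primary osd.42 is empty. The other stuck states can be listed the
same way as a cross-check:

ceph pg dump_stuck unclean
ceph pg dump_stuck stale
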
root@ceph-04:~# ceph pg 6.289 query
{ "state": "incomplete",
"epoch": 22149,
"up": [
42,
31],
"acting": [
42,
31],
"info": { "pgid": "6.289",
"last_update": "0'0",
"last_complete": "0'0",
"log_tail": "0'0",
"last_user_version": 0,
"last_backfill": "MAX",
"purged_snaps": "[]",
"history": { "epoch_created": 21017,
"last_epoch_started": 21206,
"last_epoch_clean": 21206,
"last_epoch_split": 0,
"same_up_since": 22006,
"same_interval_since": 22006,
"same_primary_since": 22006,
"last_scrub": "21088'14",
"last_scrub_stamp": "2014-02-14 15:08:25.628985",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "2014-02-09 14:22:18.130409",
"last_clean_scrub_stamp": "2014-02-14 15:08:25.628985"},
"stats": { "version": "0'0",
"reported_seq": "342",
"reported_epoch": "22149",
"state": "incomplete",
"last_fresh": "2014-02-16 18:58:12.707355",
"last_change": "2014-02-16 08:10:54.623856",
"last_active": "0.000000",
"last_clean": "0.000000",
"last_became_active": "0.000000",
"last_unstale": "2014-02-16 18:58:12.707355",
"mapping_epoch": 22003,
"log_start": "0'0",
"ondisk_log_start": "0'0",
"created": 21017,
"last_epoch_clean": 21206,
"parent": "0.0",
"parent_split_bits": 0,
"last_scrub": "21088'14",
"last_scrub_stamp": "2014-02-14 15:08:25.628985",
"last_deep_scrub": "0'0",
"last_deep_scrub_stamp": "2014-02-09 14:22:18.130409",
"last_clean_scrub_stamp": "2014-02-14 15:08:25.628985",
"log_size": 0,
"ondisk_log_size": 0,
"stats_invalid": "0",
"stat_sum": { "num_bytes": 0,
"num_objects": 0,
"num_object_clones": 0,
"num_object_copies": 0,
"num_objects_missing_on_primary": 0,
"num_objects_degraded": 0,
"num_objects_unfound": 0,
"num_read": 0,
"num_read_kb": 0,
"num_write": 0,
"num_write_kb": 0,
"num_scrub_errors": 0,
"num_shallow_scrub_errors": 0,
"num_deep_scrub_errors": 0,
"num_objects_recovered": 0,
"num_bytes_recovered": 0,
"num_keys_recovered": 0},
"stat_cat_sum": {},
"up": [
42,
31],
"acting": [
42,
31]},
"empty": 1,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 0},
"recovery_state": [
{ "name": "Started\/Primary\/Peering",
"enter_time": "2014-02-16 08:10:54.580099",
"past_intervals": [
{ "first": 21017,
"last": 21120,
"maybe_went_rw": 1,
"up": [
42,
31],
"acting": [
42,
31]},
{ "first": 21121,
"last": 21137,
"maybe_went_rw": 1,
"up": [
42],
"acting": [
42]},
{ "first": 21138,
"last": 21185,
"maybe_went_rw": 1,
"up": [
42,
31],
"acting": [
42,
31]},
{ "first": 21186,
"last": 21203,
"maybe_went_rw": 1,
"up": [
42,
25],
"acting": [
42,
25]},
{ "first": 21204,
"last": 21582,
"maybe_went_rw": 1,
"up": [
42,
31],
"acting": [
42,
31]},
{ "first": 21583,
"last": 21587,
"maybe_went_rw": 1,
"up": [
31],
"acting": [
31]},
{ "first": 21588,
"last": 21598,
"maybe_went_rw": 1,
"up": [
47,
31],
"acting": [
47,
31]},
{ "first": 21599,
"last": 21608,
"maybe_went_rw": 1,
"up": [
31],
"acting": [
31]},
{ "first": 21609,
"last": 22002,
"maybe_went_rw": 1,
"up": [
42,
31],
"acting": [
42,
31]},
{ "first": 22003,
"last": 22005,
"maybe_went_rw": 1,
"up": [
31],
"acting": [
31]}],
"probing_osds": [
31,
42,
47],
"down_osds_we_would_probe": [],
"peering_blocked_by": []},
{ "name": "Started",
"enter_time": "2014-02-16 08:10:54.580031"}]}
Regards
Udo
On 16.02.2014 18:48, Gregory Farnum wrote:
> Check out
> http://ceph.com/docs/master/rados/operations/placement-groups/#get-statistics-for-stuck-pgs
> and http://ceph.com/docs/master/rados/troubleshooting/troubleshooting-pg/.
> What does the dump of the PG say is going on?
> -Greg
> Software Engineer #42 @ http://inktank.com | http://ceph.com
>
>
> On Sun, Feb 16, 2014 at 12:32 AM, Udo Lembke <[email protected]> wrote:
>> Hi,
>> I switched some disks from manual formatting to ceph-deploy (because of
>> slightly different xfs parameters) - all disks are on a single node of a
>> 4-node cluster.
>> After rebuilding the OSD disks, one PG is incomplete:
>> ceph -s
>> cluster 591db070-15c1-4c7a-b107-67717bdb87d9
>> health HEALTH_WARN 1 pgs incomplete; 1 pgs stuck inactive; 1 pgs
>> stuck unclean
>> monmap e7: 3 mons at
>> {a=172.20.2.11:6789/0,b=172.20.2.64:6789/0,c=172.20.2.65:6789/0},
>> election epoch 1178, quorum 0,1,2 a,b,c
>> mdsmap e409: 1/1/1 up {0=b=up:active}, 2 up:standby
>> osdmap e22002: 52 osds: 52 up, 52 in
>> pgmap v10177038: 7408 pgs, 5 pools, 58618 GB data, 14662 kobjects
>> 114 TB used, 76319 GB / 189 TB avail
>> 7405 active+clean
>> 1 incomplete
>> 2 active+clean+scrubbing+deep
>>
>> The PG is on one of the rebuilt disks (osd.42):
>> ceph pg map 6.289
>> osdmap e22002 pg 6.289 (6.289) -> up [42,31] acting [42,31]
>>
>> ls -lsa /var/lib/ceph/osd/ceph-42/current/6.289_head/
>> total 16
>> 0 drwxr-xr-x 2 root root 6 Feb 15 20:11 .
>> 16 drwxr-xr-x 411 root root 12288 Feb 16 03:09 ..
>>
>> ls -lsa /var/lib/ceph/osd/ceph-31/current/6.289*/
>>
>> /var/lib/ceph/osd/ceph-31/current/6.289_head/:
>> total 20520
>> 8 drwxr-xr-x 2 root root 4096 Feb 15 10:24 .
>> 12 drwxr-xr-x 320 root root 8192 Feb 15 21:11 ..
>> 4100 -rw-r--r-- 1 root root 4194304 Feb 15 10:24 benchmark\udata\uproxmox4\u638085\uobject2844__head_4F14E289__6
>> 4100 -rw-r--r-- 1 root root 4194304 Feb 15 10:24 benchmark\udata\uproxmox4\u638085\uobject3975__head_A7EBCA89__6
>> 4100 -rw-r--r-- 1 root root 4194304 Feb 15 10:24 benchmark\udata\uproxmox4\u638085\uobject4003__head_537FE289__6
>> 4100 -rw-r--r-- 1 root root 4194304 Feb 15 10:24 benchmark\udata\uproxmox4\u673679\uobject344__head_FF4A1289__6
>> 4100 -rw-r--r-- 1 root root 4194304 Feb 15 10:24 benchmark\udata\uproxmox4\u673679\uobject474__head_5FC3EA89__6
>>
>> /var/lib/ceph/osd/ceph-31/current/6.289_TEMP/:
>> total 16
>> 4 drwxr-xr-x 2 root root 6 Feb 15 10:24 .
>> 12 drwxr-xr-x 320 root root 8192 Feb 15 21:11 ..
>>
>> How do I tell ceph that the content on osd.31 is the right one?
>> I have tried "ceph osd repair osd.42", without luck.
>>
>> In the manual I only saw "ceph osd lost NN", but then I guess all other
>> data on that OSD would also be rebuilt onto other disks.
>> If "osd lost" is the only option, how do I reuse osd.42? By waiting for a
>> healthy cluster and then recreating the disk?
>>
>> Hoping for a hint.
>>
>>
>> Best regards
>>
>> Udo
_______________________________________________
ceph-users mailing list
[email protected]
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com