I would be grateful for any pointers on how to fix incomplete PGs.
I tried the following with no success.
pg scrub
pg deep scrub
pg repair
osd out , down , rm , in
osd lost
# ceph -s
cluster 2bd3283d-67ef-4316-8b7e-d8f4747eae33
health HEALTH_WARN 7 pgs down; 20 pgs incomplete; 1 pgs recovering; 20 pgs
stuck inactive; 21 pgs stuck unclean; 4 requests are blocked > 32 sec; recovery
201/986658 objects degraded (0.020%); 133/328886 unfound (0.040%)
monmap e3: 3 mons at
{pouta-s01=xx.xx.xx.1:6789/0,pouta-s02=xx.xx.xx.2:6789/0,pouta-s03=xx.xx.xx.3:6789/0},
election epoch 1920, quorum 0,1,2 pouta-s01,pouta-s02,pouta-s03
osdmap e262813: 239 osds: 239 up, 239 in
pgmap v588073: 18432 pgs, 13 pools, 2338 GB data, 321 kobjects
19094 GB used, 849 TB / 868 TB avail
201/986658 objects degraded (0.020%); 133/328886 unfound (0.040%)
7 down+incomplete
18411 active+clean
13 incomplete
1 active+recovering
# ceph pg dump_stuck inactive
ok
pg_stat objects mip degr unf bytes log disklog state
state_stamp v reported up up_primary acting
acting_primar last_scrub scrub_stamp last_deep_scrub deep_scrub_stamp
10.70 0 0 0 0 0 0 0 incomplete
2015-04-01 21:21:16.152179 0'0 262813:163 [213,88,80] 213
[213,88,80] 213 0'0 2015-03-12 17:59:43.275049 0'0
2015-03-09 17:55:58.745662
3.dde 68 66 0 66 552861709 297 297
down+incomplete 2015-04-01 21:21:16.161066 33547'297 262813:230683
[174,5,179] 174 [174,5,179] 174 33547'297 2015-03-12
14:19:15.261595 28522'43 2015-03-11 14:19:13.894538
5.a2 0 0 0 0 0 0 0 incomplete
2015-04-01 21:21:16.145329 0'0 262813:150 [168,182,201] 168
[168,182,201] 168 0'0 2015-03-12 17:58:29.257085 0'0
2015-03-09 17:55:07.684377
13.1b6 0 0 0 0 0 0 0 incomplete
2015-04-01 21:21:16.139062 0'0 262813:2974 [0,176,131] 0
[0,176,131] 0 0'0 2015-03-12 18:00:13.286920 0'0
2015-03-09 17:56:18.715208
7.25b 0 0 0 0 0 0 0 incomplete
2015-04-01 21:21:16.113876 0'0 262813:167 [111,26,108] 111
[111,26,108] 111 27666'16 2015-03-12 17:59:06.357864 2330'3
2015-03-09 17:55:30.754522
5.19 0 0 0 0 0 0 0 down+incomplete
2015-04-01 21:21:16.199712 0'0 262813:27605 [212,43,131] 212
[212,43,131] 212 0'0 2015-03-12 13:51:37.777026 0'0
2015-03-11 13:51:35.406246
3.a2f 68 0 0 0 543686693 302 302
incomplete 2015-04-01 21:21:16.141368 33531'302 262813:3731
[149,224,33] 149 [149,224,33] 149 33531'302 2015-03-12
14:17:43.045627 28564'54 2015-03-11 14:17:40.314189
7.298 0 0 0 0 0 0 0 incomplete
2015-04-01 21:21:16.108523 0'0 262813:166 [221,154,225] 221
[221,154,225] 221 27666'13 2015-03-12 17:59:10.308423 2330'4
2015-03-09 17:55:35.750109
1.1e7 0 0 0 0 0 0 0 incomplete
2015-04-01 21:21:16.192711 0'0 262813:162 [215,232] 215
[215,232] 215 0'0 2015-03-12 17:55:45.203232 0'0
2015-03-09 17:53:49.694822
3.774 79 0 0 0 645136397 339 339
down+incomplete 2015-04-01 21:21:16.207131 33570'339 262813:168986
[162,39,161] 162 [162,39,161] 162 33570'339 2015-03-12
14:49:03.869447 2226'2 2015-03-09 13:46:49.783950
3.7d0 78 0 0 0 609222686 376 376
down+incomplete 2015-04-01 21:21:16.135599 33538'376 262813:185045
[117,118,177] 117 [117,118,177] 117 33538'376 2015-03-12
13:51:03.984454 28394'62 2015-03-11 13:50:58.196288
3.d60 0 0 0 0 0 0 0 incomplete
2015-04-01 21:21:16.158179 0'0 262813:169 [60,56,220] 60
[60,56,220] 60 33552'321 2015-03-12 13:44:43.502907
28356'39 2015-03-11 13:44:41.663482
4.1fc 0 0 0 0 0 0 0 incomplete
2015-04-01 21:21:16.217291 0'0 262813:163 [144,58,153] 144
[144,58,153] 144 0'0 2015-03-12 17:58:19.254170 0'0
2015-03-09 17:54:55.720479
3.e02 72 0 0 0 585105425 304 304
down+incomplete 2015-04-01 21:21:16.099150 33568'304 262813:169744
[15,102,147] 15 [15,102,147] 15 33568'304 2015-03-16
10:04:19.894789 2246'4 2015-03-09 11:43:44.176331
8.1d4 0 0 0 0 0 0 0 down+incomplete
2015-04-01 21:21:16.218644 0'0 262813:21867 [126,43,174] 126
[126,43,174] 126 0'0 2015-03-12 14:34:35.258338 0'0
2015-03-12 14:34:35.258338
4.2f4 0 0 0 0 0 0 0 down+incomplete
2015-04-01 21:21:16.117515 0'0 262813:116150 [181,186,13] 181
[181,186,13] 181 0'0 2015-03-12 14:59:03.529264 0'0
2015-03-09 13:46:40.601301
3.e5a 76 70 0 0 623902741 325 325
incomplete 2015-04-01 21:21:16.043300 33569'325 262813:73426
[97,22,62] 97 [97,22,62] 97 33569'325 2015-03-12
13:58:05.813966 28433'44 2015-03-11 13:57:53.909795
8.3a0 0 0 0 0 0 0 0 incomplete
2015-04-01 21:21:16.056437 0'0 262813:175168 [62,14,224] 62
[62,14,224] 62 0'0 2015-03-12 13:52:44.546418 0'0
2015-03-12 13:52:44.546418
3.24e 0 0 0 0 0 0 0 incomplete
2015-04-01 21:21:16.130831 0'0 262813:165 [39,202,90] 39
[39,202,90] 39 33556'272 2015-03-13 11:44:41.263725 2327'4
2015-03-09 17:54:43.675552
5.f7 0 0 0 0 0 0 0 incomplete
2015-04-01 21:21:16.145298 0'0 262813:153 [54,193,123] 54
[54,193,123] 54 0'0 2015-03-12 17:58:30.257371 0'0
2015-03-09 17:55:11.725629
[root@pouta-s01 ceph]#
########## Example 1 : PG 10.70 ###########
10.70 0 0 0 0 0 0 0 incomplete
2015-04-01 21:21:16.152179 0'0 262813:163 [213,88,80] 213
[213,88,80] 213 0'0 2015-03-12 17:59:43.275049 0'0
2015-03-09 17:55:58.745662
This is how I found the location of each OSD:
[root@pouta-s01 ceph]# ceph osd find 88
{ "osd": 88,
"ip": "10.100.50.3:7079\/916853",
"crush_location": { "host": "pouta-s03",
"root": "default”}}
[root@pouta-s01 ceph]#
When I manually check the current/<pg>_head directory, the data is not present
(i.e. the data is lost from all the copies).
[root@pouta-s04 current]# ls -l /var/lib/ceph/osd/ceph-80/current/10.70_head
total 0
[root@pouta-s04 current]#
On some of the OSDs the HEAD directory does not even exist:
[root@pouta-s03 ~]# ls -l /var/lib/ceph/osd/ceph-88/current/10.70_head
ls: cannot access /var/lib/ceph/osd/ceph-88/current/10.70_head: No such file or
directory
[root@pouta-s03 ~]#
[root@pouta-s02 ~]# ls -l /var/lib/ceph/osd/ceph-213/current/10.70_head
total 0
[root@pouta-s02 ~]#
# ceph pg 10.70 query ---> http://paste.ubuntu.com/10719840/
########## Example 2 : PG 3.7d0 ###########
3.7d0 78 0 0 0 609222686 376 376
down+incomplete 2015-04-01 21:21:16.135599 33538'376 262813:185045
[117,118,177] 117 [117,118,177] 117 33538'376 2015-03-12
13:51:03.984454 28394'62 2015-03-11 13:50:58.196288
[root@pouta-s04 current]# ceph pg map 3.7d0
osdmap e262813 pg 3.7d0 (3.7d0) -> up [117,118,177] acting [117,118,177]
[root@pouta-s04 current]#
Data is present here , so 1 copy is present out of 3
[root@pouta-s04 current]# ls -l /var/lib/ceph/osd/ceph-117/current/3.7d0_head/
| wc -l
63
[root@pouta-s04 current]#
[root@pouta-s03 ~]# ls -l /var/lib/ceph/osd/ceph-118/current/3.7d0_head/
total 0
[root@pouta-s03 ~]#
[root@pouta-s01 ceph]# ceph osd find 177
{ "osd": 177,
"ip": "10.100.50.2:7062\/777799",
"crush_location": { "host": "pouta-s02",
"root": "default”}}
[root@pouta-s01 ceph]#
Here, even the directory is not present:
[root@pouta-s02 ~]# ls -l /var/lib/ceph/osd/ceph-177/current/3.7d0_head/
ls: cannot access /var/lib/ceph/osd/ceph-177/current/3.7d0_head/: No such file
or directory
[root@pouta-s02 ~]#
# ceph pg 3.7d0 query http://paste.ubuntu.com/10720107/
<http://paste.ubuntu.com/10720107/>
- Karan -
> On 20 Mar 2015, at 22:43, Craig Lewis <[email protected]> wrote:
>
> > osdmap e261536: 239 osds: 239 up, 238 in
>
> Why is that last OSD not IN? The history you need is probably there.
>
> Run ceph pg <pgid> query on some of the stuck PGs. Look for the
> recovery_state section. That should tell you what Ceph needs to complete the
> recovery.
>
>
> If you need more help, post the output of a couple pg queries.
>
>
>
> On Fri, Mar 20, 2015 at 4:22 AM, Karan Singh <[email protected]
> <mailto:[email protected]>> wrote:
> Hello Guys
>
> My CEPH cluster lost data and not its not recovering. This problem occurred
> when Ceph performed recovery when one of the node was down.
> Now all the nodes are up but Ceph is showing PG as incomplete , unclean ,
> recovering.
>
>
> I have tried several things to recover them like , scrub , deep-scrub , pg
> repair , try changing primary affinity and then scrubbing ,
> osd_pool_default_size etc. BUT NO LUCK
>
> Could yo please advice , how to recover PG and achieve HEALTH_OK
>
> # ceph -s
> cluster 2bd3283d-67ef-4316-8b7e-d8f4747eae33
> health HEALTH_WARN 19 pgs incomplete; 3 pgs recovering; 20 pgs stuck
> inactive; 23 pgs stuck unclean; 2 requests are blocked > 32 sec; recovery
> 531/980676 objects degraded (0.054%); 243/326892 unfound (0.074%)
> monmap e3: 3 mons at
> {xxx=xxxx:6789/0,xxx=xxxx:6789:6789/0,xxx=xxxx:6789:6789/0}, election epoch
> 1474, quorum 0,1,2 xx,xx,xx
> osdmap e261536: 239 osds: 239 up, 238 in
> pgmap v415790: 18432 pgs, 13 pools, 2330 GB data, 319 kobjects
> 20316 GB used, 844 TB / 864 TB avail
> 531/980676 objects degraded (0.054%); 243/326892 unfound (0.074%)
> 1 creating
> 18409 active+clean
> 3 active+recovering
> 19 incomplete
>
>
>
>
> # ceph pg dump_stuck unclean
> ok
> pg_stat objects mip degr unf bytes log disklog state
> state_stamp v reported up up_primary acting
> acting_primary last_scrub scrub_stamp last_deep_scrub
> deep_scrub_stamp
> 10.70 0 0 0 0 0 0 0 incomplete
> 2015-03-20 12:19:49.534911 0'0 261536:1015 [153,140,80] 153
> [153,140,80] 153 0'0 2015-03-12 17:59:43.275049 0'0
> 2015-03-09 17:55:58.745662
> 3.dde 68 66 0 66 552861709 297 297
> incomplete 2015-03-20 12:19:49.584839 33547'297 261536:228352
> [174,5,179] 174 [174,5,179] 174 33547'297 2015-03-12
> 14:19:15.261595 28522'43 2015-03-11 14:19:13.894538
> 5.a2 0 0 0 0 0 0 0 incomplete
> 2015-03-20 12:19:49.560756 0'0 261536:897 [214,191,170] 214
> [214,191,170] 214 0'0 2015-03-12 17:58:29.257085 0'0
> 2015-03-09 17:55:07.684377
> 13.1b6 0 0 0 0 0 0 0
> incomplete 2015-03-20 12:19:49.846253 0'0 261536:1050
> [0,176,131] 0 [0,176,131] 0 0'0 2015-03-12
> 18:00:13.286920 0'0 2015-03-09 17:56:18.715208
> 7.25b 16 0 0 0 67108864 16 16
> incomplete 2015-03-20 12:19:49.639102 27666'16 261536:4777
> [194,145,45] 194 [194,145,45] 194 27666'16 2015-03-12
> 17:59:06.357864 2330'3 2015-03-09 17:55:30.754522
> 5.19 0 0 0 0 0 0 0 incomplete
> 2015-03-20 12:19:49.742698 0'0 261536:25410 [212,43,131] 212
> [212,43,131] 212 0'0 2015-03-12 13:51:37.777026 0'0
> 2015-03-11 13:51:35.406246
> 3.a2f 0 0 0 0 0 0 0 creating
> 2015-03-20 12:42:15.586372 0'0 0:0 [] -1 [] -1
> 0'0 0.000000 0'0 0.000000
> 7.298 0 0 0 0 0 0 0 incomplete
> 2015-03-20 12:19:49.566966 0'0 261536:900 [187,95,225] 187
> [187,95,225] 187 27666'13 2015-03-12 17:59:10.308423
> 2330'4 2015-03-09 17:55:35.750109
> 3.a5a 77 87 261 87 623902741 325 325
> active+recovering 2015-03-20 10:54:57.443670 33569'325
> 261536:182464 [150,149,181] 150 [150,149,181] 150 33569'325
> 2015-03-12 13:58:05.813966 28433'44 2015-03-11 13:57:53.909795
> 1.1e7 0 0 0 0 0 0 0 incomplete
> 2015-03-20 12:19:49.610547 0'0 261536:772 [175,182] 175
> [175,182] 175 0'0 2015-03-12 17:55:45.203232 0'0
> 2015-03-09 17:53:49.694822
> 3.774 79 0 0 0 645136397 339 339
> incomplete 2015-03-20 12:19:49.821708 33570'339 261536:166857
> [162,39,161] 162 [162,39,161] 162 33570'339 2015-03-12
> 14:49:03.869447 2226'2 2015-03-09 13:46:49.783950
> 3.7d0 78 0 0 0 609222686 376 376
> incomplete 2015-03-20 12:19:49.534004 33538'376 261536:182810
> [117,118,177] 117 [117,118,177] 117 33538'376 2015-03-12
> 13:51:03.984454 28394'62 2015-03-11 13:50:58.196288
> 3.d60 0 0 0 0 0 0 0 incomplete
> 2015-03-20 12:19:49.647196 0'0 261536:833 [154,172,1] 154
> [154,172,1] 154 33552'321 2015-03-12 13:44:43.502907
> 28356'39 2015-03-11 13:44:41.663482
> 4.1fc 0 0 0 0 0 0 0 incomplete
> 2015-03-20 12:19:49.610103 0'0 261536:1069 [70,179,58] 70
> [70,179,58] 70 0'0 2015-03-12 17:58:19.254170 0'0
> 2015-03-09 17:54:55.720479
> 3.e02 72 0 0 0 585105425 304 304
> incomplete 2015-03-20 12:19:49.564768 33568'304 261536:167428
> [15,102,147] 15 [15,102,147] 15 33568'304 2015-03-16
> 10:04:19.894789 2246'4 2015-03-09 11:43:44.176331
> 8.1d4 0 0 0 0 0 0 0 incomplete
> 2015-03-20 12:19:49.614727 0'0 261536:19611 [126,43,174] 126
> [126,43,174] 126 0'0 2015-03-12 14:34:35.258338 0'0
> 2015-03-12 14:34:35.258338
> 4.2f4 0 0 0 0 0 0 0 incomplete
> 2015-03-20 12:19:49.595109 0'0 261536:113791 [181,186,13] 181
> [181,186,13] 181 0'0 2015-03-12 14:59:03.529264 0'0
> 2015-03-09 13:46:40.601301
> 3.52c 65 23 69 23 543162368 290 290
> active+recovering 2015-03-20 10:51:43.664734 33553'290
> 261536:8431 [212,100,219] 212 [212,100,219] 212 33553'290
> 2015-03-13 11:44:26.396514 29686'103 2015-03-11 17:18:33.452616
> 3.e5a 76 70 0 0 623902741 325 325
> incomplete 2015-03-20 12:19:49.552071 33569'325 261536:71248
> [97,22,62] 97 [97,22,62] 97 33569'325 2015-03-12
> 13:58:05.813966 28433'44 2015-03-11 13:57:53.909795
> 8.3a0 0 0 0 0 0 0 0 incomplete
> 2015-03-20 12:19:49.615728 0'0 261536:173184 [62,14,178] 62
> [62,14,178] 62 0'0 2015-03-12 13:52:44.546418 0'0
> 2015-03-12 13:52:44.546418
> 3.24e 0 0 0 0 0 0 0 incomplete
> 2015-03-20 12:19:49.591282 0'0 261536:1026 [103,14,90] 103
> [103,14,90] 103 33556'272 2015-03-13 11:44:41.263725
> 2327'4 2015-03-09 17:54:43.675552
> 5.f7 0 0 0 0 0 0 0 incomplete
> 2015-03-20 12:19:49.667823 0'0 261536:853 [73,44,123] 73
> [73,44,123] 73 0'0 2015-03-12 17:58:30.257371 0'0
> 2015-03-09 17:55:11.725629
> 3.ae8 77 67 201 67 624427024 342 342
> active+recovering 2015-03-20 10:50:01.693979 33516'342
> 261536:149258 [122,144,218] 122 [122,144,218] 122 33516'342
> 2015-03-12 17:11:01.899062 29638'134 2015-03-11 17:10:59.966372
> #
>
>
> PG data is there on multiple OSD’s but Ceph is not recovering the PG , For
> Example
>
> # ceph pg map 7.25b
> osdmap e261536 pg 7.25b (7.25b) -> up [194,145,45] acting [194,145,45]
>
>
> # ls -l /var/lib/ceph/osd/ceph-194/current/7.25b_head | wc -l
> 17
>
> # ls -l /var/lib/ceph/osd/ceph-145/current/7.25b_head | wc -l
> 0
> #
>
> # ls -l /var/lib/ceph/osd/ceph-45/current/7.25b_head | wc -l
> 17
>
>
>
>
>
> Some of the PG are completely lost , i.e they don’t have any data . For
> example
>
> # ceph pg map 10.70
> osdmap e261536 pg 10.70 (10.70) -> up [153,140,80] acting [153,140,80]
>
>
> # ls -l /var/lib/ceph/osd/ceph-140/current/10.70_head | wc -l
> 0
>
> # ls -l /var/lib/ceph/osd/ceph-153/current/10.70_head | wc -l
> 0
>
> # ls -l /var/lib/ceph/osd/ceph-80/current/10.70_head | wc -l
> 0
>
>
>
> - Karan -
>
>
>
> _______________________________________________
> ceph-users mailing list
> [email protected] <mailto:[email protected]>
> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
> <http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com>
>
>
smime.p7s
Description: S/MIME cryptographic signature
_______________________________________________ ceph-users mailing list [email protected] http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
