Re: [ceph-users] object size changing after a pg repair

2016-06-29 Thread Goncalo Borges
 "num_objects_omap": 0,
"num_objects_hit_set_archive": 0,
"num_bytes_hit_set_archive": 0,
"num_flush": 0,
"num_flush_kb": 0,
"num_evict": 0,
"num_evict_kb": 0,
"num_promote": 0,
"num_flush_mode_high": 0,
"num_flush_mode_low": 0,
"num_evict_mode_some": 0,
"num_evict_mode_full": 0,
"num_objects_pinned": 0
},
"up": [
56,
39,
6
],
"acting": [
56,
39,
6
],
"blocked_by": [],
"up_primary": 56,
"acting_primary": 56
},
"empty": 0,
"dne": 0,
"incomplete": 0,
"last_epoch_started": 996,
"hit_set_history": {
"current_last_update": "0'0",
"history": []
}
}
],
"recovery_state": [
{
"name": "Started\/Primary\/Active",
"enter_time": "2016-06-27 04:57:36.876639",
"might_have_unfound": [],
"recovery_progress": {
"backfill_targets": [],
"waiting_on_backfill": [],
"last_backfill_started": "MIN",
"backfill_info": {
"begin": "MIN",
"end": "MIN",
"objects": []
},
"peer_backfill_info": [],
"backfills_in_flight": [],
"recovering": [],
"pg_backend": {
"pull_from_peer": [],
"pushing": []
}
},
"scrub": {
"scrubber.epoch_start": "995",
"scrubber.active": 0,
"scrubber.state": "INACTIVE",
"scrubber.start": "MIN",
"scrubber.end": "MIN",
"scrubber.subset_last_update": "0'0",
"scrubber.deep": false,
"scrubber.seed": 0,
"scrubber.waiting_on": 0,
"scrubber.waiting_on_whom": []
}
},
{
"name": "Started",
"enter_time": "2016-06-27 04:57:35.828117"
}
],
"agent_state": {}
}

--- * ---

# diff -Nua 6.263query1.txt 6.263query2.txt
--- 6.263query1.txt 2016-06-30 04:38:13.290371200 +
+++ 6.263query2.txt 2016-06-30 04:38:43.412642932 +
@@ -19,10 +19,10 @@
 ],
 "info": {
 "pgid": "6.263",
-"last_update": "1005'2273061",
-"last_complete": "1005'2273061",
-"log_tail": "1005'227",
-"last_user_version": 2273061,
+"last_update": "1005'2273745",
+"last_complete": "1005'2273745",
+"log_tail": "1005'2270700",
+"last_user_version": 2273745,
 "last_backfill": "MAX",
 "last_backfill_bitwise": 0,
 "purged_snaps": "[]",
@@ -42,23 +42,23 @@
 "last_clean_scrub_stamp": "2016-06-30 02:13:00.455256"
 },
 "stats": {
-"version": "1005'2273061",
-"reported_seq": "2937682",
+"version": "1005'2273745",
+"reported_seq": "2938345",
 "reported_epoch": "1005",
 "state": "active+clean",
-"last_fresh": "2016-06-30 04:38:13.270047",
+"last_fresh": "2016-06-30 04:38:43.320788",
 "last_change": "2016-06-30 02:13:00.455293",
-"last_active": "2016-06-30 04:38:13.270047",
-"last_peered": "2016-06-30 04:38:13.270047&qu

Re: [ceph-users] object size changing after a pg repair

2016-06-29 Thread Shinobu Kinjo
shallow_scrub_errors": 0,
> "num_deep_scrub_errors": 0,
> "num_objects_recovered": 0,
> "num_bytes_recovered": 0,
> "num_keys_recovered": 0,
> "num_objects_omap": 0,
> "num_objects_hit_set_archive": 0,
> "num_bytes_hit_set_archive": 0,
> "num_flush": 0,
> "num_flush_kb": 0,
> "num_evict": 0,
> "num_evict_kb": 0,
> "num_promote": 0,
> "num_flush_mode_high": 0,
> "num_flush_mode_low": 0,
> "num_evict_mode_some": 0,
> "num_evict_mode_full": 0,
> "num_objects_pinned": 0
> },
> "up": [
> 56,
> 39,
> 6
> ],
> "acting": [
> 56,
> 39,
> 6
> ],
> "blocked_by": [],
> "up_primary": 56,
> "acting_primary": 56
> },
> "empty": 0,
> "dne": 0,
> "incomplete": 0,
> "last_epoch_started": 996,
> "hit_set_history": {
> "current_last_update": "0'0",
> "history": []
> }
> }
> ],
> "recovery_state": [
> {
> "name": "Started\/Primary\/Active",
> "enter_time": "2016-06-27 04:57:36.876639",
> "might_have_unfound": [],
> "recovery_progress": {
> "backfill_targets": [],
> "waiting_on_backfill": [],
> "last_backfill_started": "MIN",
> "backfill_info": {
> "begin": "MIN",
> "end": "MIN",
> "objects": []
> },
> "peer_backfill_info": [],
> "backfills_in_flight": [],
> "recovering": [],
> "pg_backend": {
> "pull_from_peer": [],
> "pushing": []
> }
> },
> "scrub": {
> "scrubber.epoch_start": "995",
> "scrubber.active": 0,
> "scrubber.state": "INACTIVE",
> "scrubber.start": "MIN",
> "scrubber.end": "MIN",
> "scrubber.subset_last_update": "0'0",
> "scrubber.deep": false,
> "scrubber.seed": 0,
> "scrubber.waiting_on": 0,
> "scrubber.waiting_on_whom": []
> }
> },
> {
> "name": "Started",
> "enter_time": "2016-06-27 04:57:35.828117"
> }
> ],
> "agent_state": {}
> }
>
> --- * ---
>
> # diff -Nua 6.263query1.txt 6.263query2.txt
> --- 6.263query1.txt 2016-06-30 04:38:13.290371200 +
> +++ 6.263query2.txt 2016-06-30 04:38:43.412642932 +
> @@ -19,10 +19,10 @@
>  ],
>  "info": {
>  "pgid": "6.263",
> -"last_update": "1005'2273061",
> -"last_complete": "1005'2273061",
> -"log_tail": "1005'227",
> -"last_user_version": 2273061,
> +"last_update": "1005'2273745",
> +"last_complete": "1005'2273745",
> +"log_tail": "1005'2270700",
> +"last_user_version": 2273745,
>  "last_backfill": "MAX",
>  "last_backfill_bitwise": 0,
>  "purged_snaps": "[]",
> @@ -42,23 +42,23 @@
>  "last_clean_scrub_stamp"

Re: [ceph-users] object size changing after a pg repair

2016-06-29 Thread Goncalo Borges
 "waiting_on_backfill": [],
"last_backfill_started": "MIN",
"backfill_info": {
"begin": "MIN",
"end": "MIN",
"objects": []
},
"peer_backfill_info": [],
"backfills_in_flight": [],
"recovering": [],
"pg_backend": {
"pull_from_peer": [],
"pushing": []
}
},
"scrub": {
"scrubber.epoch_start": "995",
"scrubber.active": 0,
"scrubber.state": "INACTIVE",
"scrubber.start": "MIN",
"scrubber.end": "MIN",
"scrubber.subset_last_update": "0'0",
"scrubber.deep": false,
"scrubber.seed": 0,
"scrubber.waiting_on": 0,
"scrubber.waiting_on_whom": []
}
},
{
"name": "Started",
"enter_time": "2016-06-27 04:57:35.828117"
}
],
"agent_state": {}
}

--- * ---

# diff -Nua 6.263query1.txt 6.263query2.txt
--- 6.263query1.txt 2016-06-30 04:38:13.290371200 +
+++ 6.263query2.txt 2016-06-30 04:38:43.412642932 +
@@ -19,10 +19,10 @@
 ],
 "info": {
 "pgid": "6.263",
-"last_update": "1005'2273061",
-"last_complete": "1005'2273061",
-"log_tail": "1005'227",
-"last_user_version": 2273061,
+"last_update": "1005'2273745",
+"last_complete": "1005'2273745",
+"log_tail": "1005'2270700",
+"last_user_version": 2273745,
 "last_backfill": "MAX",
 "last_backfill_bitwise": 0,
 "purged_snaps": "[]",
@@ -42,23 +42,23 @@
 "last_clean_scrub_stamp": "2016-06-30 02:13:00.455256"
 },
 "stats": {
-"version": "1005'2273061",
-"reported_seq": "2937682",
+"version": "1005'2273745",
+"reported_seq": "2938345",
 "reported_epoch": "1005",
 "state": "active+clean",
-"last_fresh": "2016-06-30 04:38:13.270047",
+"last_fresh": "2016-06-30 04:38:43.320788",
 "last_change": "2016-06-30 02:13:00.455293",
-"last_active": "2016-06-30 04:38:13.270047",
-"last_peered": "2016-06-30 04:38:13.270047",
-"last_clean": "2016-06-30 04:38:13.270047",
+"last_active": "2016-06-30 04:38:43.320788",
+"last_peered": "2016-06-30 04:38:43.320788",
+"last_clean": "2016-06-30 04:38:43.320788",
 "last_became_active": "2016-06-27 04:57:36.949798",
 "last_became_peered": "2016-06-27 04:57:36.949798",
-"last_unstale": "2016-06-30 04:38:13.270047",
-"last_undegraded": "2016-06-30 04:38:13.270047",
-"last_fullsized": "2016-06-30 04:38:13.270047",
+"last_unstale": "2016-06-30 04:38:43.320788",
+"last_undegraded": "2016-06-30 04:38:43.320788",
+"last_fullsized": "2016-06-30 04:38:43.320788",
 "mapping_epoch": 994,
-"log_start": "1005'227",
-"ondisk_log_start": "1005'227",
+"log_start": "1005'2270700",
+"ondisk_log_start": "1005'2270700",
 "created": 341,
 "last_epoch_clean": 996,
 "parent": "0.0",
@@ -68,8 +68,8 @@
 "last_deep_scrub": "1005'2076134",
 "last_deep_scrub_stamp": "2016-06-30 02:13:00.455256",
 "last_clean_scrub_stamp": "2016-06-30 02:13:00.455256",
-"log_size": 3061,
-"ondisk_log_size"

Re: [ceph-users] object size changing after a pg repair

2016-06-29 Thread Shinobu Kinjo
What does `ceph pg 6.263 query` show you?
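Capturing two snapshots a little apart and diffing them makes any churn in the
pg metadata obvious; something along these lines (the file names are just
examples):

# ceph pg 6.263 query > 6.263query1.txt
# sleep 30
# ceph pg 6.263 query > 6.263query2.txt
# diff -u 6.263query1.txt 6.263query2.txt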


On Thu, Jun 30, 2016 at 12:02 PM, Goncalo Borges <
goncalo.bor...@sydney.edu.au> wrote:

> Dear Cephers...
>
> Today our ceph cluster gave us a couple of scrub errors regarding
> inconsistent pgs. We just upgraded from 9.2.0 to 10.2.2 two days ago.
>
> # ceph health detail
> HEALTH_ERR 2 pgs inconsistent; 2 scrub errors; crush map has legacy
> tunables (require bobtail, min is firefly)
> pg 6.39c is active+clean+inconsistent, acting [2,60,32]
> pg 6.263 is active+clean+inconsistent, acting [56,39,6]
> 2 scrub errors
> crush map has legacy tunables (require bobtail, min is firefly); see
> http://ceph.com/docs/master/rados/operations/crush-map/#tunables
>
> We have started by looking to pg 6.263. Errors were only appearing in
> osd.56 logs but not in others.
>
> # cat  ceph-osd.56.log-20160629 | grep -Hn 'ERR'
> (standard input):8569:2016-06-29 08:09:50.952397 7fd023322700 -1
> log_channel(cluster) log [ERR] : scrub 6.263
> 6:c645f18e:::12a343d.:head on disk size (1836) does not match
> object info size (41242) adjusted for ondisk to (41242)
> (standard input):8602:2016-06-29 08:11:11.227865 7fd023322700 -1
> log_channel(cluster) log [ERR] : 6.263 scrub 1 errors
>
> So, we did a 'ceph pg repair  6.263'.
>
> Eventually, that pg went back to 'active+clean'
>
> # ceph pg dump | grep ^6.263
> dumped all in format plain
> 6.263   10845   0   0   0   0   39592671010 3037
> 3037active+clean2016-06-30 02:13:00.455293  1005'2126237
> 1005:2795768[56,39,6]   56  [56,39,6]   56
> 1005'20761342016-06-30 02:13:00.455256  1005'20761342016-06-30
> 02:13:00.455256
>
> However, in the logs i found
>
> 2016-06-30 02:03:03.992240 osd.56 192.231.127.226:6801/21569 278 :
> cluster [INF] 6.263 repair starts
> 2016-06-30 02:13:00.455237 osd.56 192.231.127.226:6801/21569 279 :
> cluster [INF] 6.263 repair ok, 0 fixed
>
> I did not like the '0 fixed'.
>
> Inspecting a bit more, I found that the object inside the pg in all
> involved osds are changing size. For example in osd.56 (but the same thing
> is true in 39 and 6) I found in consecutive 'ls -l' commands:
>
> # ls -l
> /var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
> -rw-r--r-- 1 ceph ceph 8602 Jun 30 02:53
> /var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
> [root@rccephosd8 ceph]# ls -l
> /var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
> -rw-r--r-- 1 ceph ceph 170 Jun 30 02:53
> /var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
>
> # ls -l
> /var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
> -rw-r--r-- 1 ceph ceph 15436 Jun 30 02:53
> /var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
>
> # ls -l
> /var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
> -rw-r--r-- 1 ceph ceph 26044 Jun 30 02:53
> /var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
>
> # ls -l
> /var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
> -rw-r--r-- 1 ceph ceph 0 Jun 30 02:53
> /var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
>
> # ls -l
> /var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
> -rw-r--r-- 1 ceph ceph 14076 Jun 30 02:53
> /var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
>
> # ls -l
> /var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
> -rw-r--r-- 1 ceph ceph 31110 Jun 30 02:53
> /var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
>
> # ls -l
> /var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
> -rw-r--r-- 1 ceph ceph 0 Jun 30 02:53
> /var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
>
> # ls -l
> /var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
> -rw-r--r-- 1 ceph ceph 20230 Jun 30 02:53
> /var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
>
> # ls -l
> /var/li

[ceph-users] object size changing after a pg repair

2016-06-29 Thread Goncalo Borges
Dear Cephers...

Today our ceph cluster gave us a couple of scrub errors regarding inconsistent 
pgs. We just upgraded from 9.2.0 to 10.2.2 two days ago.

# ceph health detail 
HEALTH_ERR 2 pgs inconsistent; 2 scrub errors; crush map has legacy tunables 
(require bobtail, min is firefly)
pg 6.39c is active+clean+inconsistent, acting [2,60,32]
pg 6.263 is active+clean+inconsistent, acting [56,39,6]
2 scrub errors
crush map has legacy tunables (require bobtail, min is firefly); see 
http://ceph.com/docs/master/rados/operations/crush-map/#tunables

We started by looking at pg 6.263. Errors were only appearing in the osd.56
logs but not in the others.

# cat  ceph-osd.56.log-20160629 | grep -Hn 'ERR' 
(standard input):8569:2016-06-29 08:09:50.952397 7fd023322700 -1 
log_channel(cluster) log [ERR] : scrub 6.263 
6:c645f18e:::12a343d.:head on disk size (1836) does not match 
object info size (41242) adjusted for ondisk to (41242)
(standard input):8602:2016-06-29 08:11:11.227865 7fd023322700 -1 
log_channel(cluster) log [ERR] : 6.263 scrub 1 errors
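As an aside, on Jewel the findings of the last scrub can apparently also be
listed directly (which object and which shard were flagged), with something
like the following, where <pool-name> is just a placeholder:

# rados list-inconsistent-pg <pool-name>
# rados list-inconsistent-obj 6.263 --format=json-pretty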

So, we did a 'ceph pg repair 6.263'.

Eventually, that pg went back to 'active+clean'.

# ceph pg dump | grep ^6.263
dumped all in format plain
6.263   10845   0   0   0   0   39592671010   3037   3037
active+clean   2016-06-30 02:13:00.455293   1005'2126237   1005:2795768
[56,39,6]   56   [56,39,6]   56
1005'2076134   2016-06-30 02:13:00.455256   1005'2076134   2016-06-30 02:13:00.455256

However, in the logs I found:

2016-06-30 02:03:03.992240 osd.56 192.231.127.226:6801/21569 278 : cluster 
[INF] 6.263 repair starts
2016-06-30 02:13:00.455237 osd.56 192.231.127.226:6801/21569 279 : cluster 
[INF] 6.263 repair ok, 0 fixed

I did not like the '0 fixed'. 

Inspecting a bit more, I found that the object inside the pg is changing size
on all the involved OSDs. For example, on osd.56 (but the same is true on 39
and 6) consecutive 'ls -l' commands gave:

# ls -l 
/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
-rw-r--r-- 1 ceph ceph 8602 Jun 30 02:53 
/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
[root@rccephosd8 ceph]# ls -l 
/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
-rw-r--r-- 1 ceph ceph 170 Jun 30 02:53 
/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6

# ls -l 
/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
-rw-r--r-- 1 ceph ceph 15436 Jun 30 02:53 
/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6

# ls -l 
/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
-rw-r--r-- 1 ceph ceph 26044 Jun 30 02:53 
/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6

# ls -l 
/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
-rw-r--r-- 1 ceph ceph 0 Jun 30 02:53 
/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6

# ls -l 
/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
-rw-r--r-- 1 ceph ceph 14076 Jun 30 02:53 
/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6

# ls -l 
/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
-rw-r--r-- 1 ceph ceph 31110 Jun 30 02:53 
/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6

# ls -l 
/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
-rw-r--r-- 1 ceph ceph 0 Jun 30 02:53 
/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6

# ls -l 
/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
-rw-r--r-- 1 ceph ceph 20230 Jun 30 02:53 
/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6

# ls -l 
/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
-rw-r--r-- 1 ceph ceph 23392 Jun 30 02:53 
/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6

# ls -l 
/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
-rw-r--r-- 1 ceph ceph 0 Jun 30 02:53 
/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6

# ls -l 
/var/lib/ceph/osd/ceph-56
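For repeated checks like the ones above, a small loop saves some typing (OBJ
is just shorthand for the full object path shown earlier):

# OBJ=/var/lib/ceph/osd/ceph-56/current/6.263_head/DIR_3/DIR_6/DIR_2/DIR_A/12a343d.__head_718FA263__6
# while sleep 2; do echo "$(date +%T) $(stat -c %s "$OBJ")"; done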

[ceph-users] Can I modify ak/sk?

2016-06-29 Thread yang
Hello, everyone
When I want to modify the access_key using the following command:
radosgw-admin user modify --uid=user --access_key="userak"

I got:

{
"user_id": "user",
"display_name": "User name",
"email": "",
"suspended": 0,
"max_buckets": 1000,
"auid": 0,
"subusers": [],
"keys": [
{
"user": "user",
"access_key": "0JXAO52QZ44R6WTF7CH6",
"secret_key": "wD2jBcwIu2OHjTBHcOzjU6tDePGGjIJolMBg4IbT"
},
{
"user": "user",
"access_key": "userak",
"secret_key": ""
}
],
"swift_keys": [],
"caps": [],
"op_mask": "read, write, delete",
"default_placement": "",
"placement_tags": [],
"bucket_quota": {
"enabled": false,
"max_size_kb": -1,
"max_objects": -1
},
"user_quota": {
"enabled": false,
"max_size_kb": -1,
"max_objects": -1
},
"temp_url_keys": []
}


The access_key does not change; instead a new key item is created, and its
secret key is empty.
Is this right?
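Or should I instead be using the key subcommands to replace the whole key
pair, something like this (the secret below is just an example value)?

radosgw-admin key create --uid=user --key-type=s3 --access-key="userak" --secret-key="usersk"
radosgw-admin key rm --uid=user --key-type=s3 --access-key="0JXAO52QZ44R6WTF7CH6"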
___
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com


Re: [ceph-users] Hammer: PGs stuck creating

2016-06-29 Thread Brad Hubbard
On Thu, Jun 30, 2016 at 3:22 AM, Brian Felton  wrote:
> Greetings,
>
> I have a lab cluster running Hammer 0.94.6 and being used exclusively for
> object storage.  The cluster consists of four servers running 60 6TB OSDs
> each.  The main .rgw.buckets pool is using k=3 m=1 erasure coding and
> contains 8192 placement groups.
>
> Last week, one of our guys out-ed and removed one OSD from each of three of
> the four servers in the cluster, which resulted in some general badness (the
> disks were wiped post-removal, so the data are gone).  After a proper
> education in why this is a Bad Thing, we got the OSDs added back.  When all
> was said and done, we had 30 pgs that were stuck incomplete, and no amount
> of magic has been able to get them to recover.  From reviewing the data, we
> knew that all of these pgs contained at least 2 of the removed OSDs; I
> understand and accept that the data are gone, and that's not a concern (yay
> lab).
>
> Here are the things I've tried:
>
> - Restarted all OSDs
> - Stopped all OSDs, removed all OSDs from the crush map, and started
> everything back up
> - Executed a 'ceph pg force_create_pg ' for each of the 30 stuck pgs
> - Executed a 'ceph pg send_pg_creates' to get the ball rolling on creates
> - Executed several 'ceph pg  query' commands to ensure we were
> referencing valid OSDs after the 'force_create_pg'
> - Ensured those OSDs were really removed (e.g. 'ceph auth del', 'ceph osd
> crush remove', and 'ceph osd rm')

Can you share some of the pg query output?
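For example, something along these lines should collect it for all of the
stuck pgs (assuming they show up as 'creating' in the dump_stuck output):

# ceph pg dump_stuck inactive 2>/dev/null | awk '$2 ~ /creating/ {print $1}' > stuck_pgs.txt
# for pg in $(cat stuck_pgs.txt); do ceph pg "$pg" query > "pg-${pg}-query.json"; done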

>
> At this point, I've got the same 30 pgs that are stuck creating.  I've run
> out of ideas for getting this back to a healthy state.  In reviewing the
> other posts on the mailing list, the overwhelming solution was a bad OSD in
> the crush map, but I'm all but certain that isn't what's hitting us here.
> Normally, being the lab, I'd consider nuking the .rgw.buckets pool and
> starting from scratch, but we've recently spent a lot of time pulling 140TB
> of data into this cluster for some performance and recovery tests, and I'd
> prefer not to have to start that process again.  I am willing to entertain
> most any other idea irrespective to how destructive it is to these PGs, so
> long as I don't have to lose the rest of the data in the pool.
>
> Many thanks in advance for any assistance here.
>
> Brian Felton
>
>
>
>
> ___
> ceph-users mailing list
> ceph-users@lists.ceph.com
> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
>



-- 
Cheers,
Brad
___
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com


Re: [ceph-users] Mounting Ceph RBD image to XenServer 7 as SR

2016-06-29 Thread Jake Young
On Wednesday, June 29, 2016, Mike Jacobacci  wrote:

> Hi all,
>
> Is there anyone using rbd for xenserver vm storage?  I have XenServer 7
> and the latest Ceph, I am looking for the the best way to mount the rbd
> volume under XenServer.  There is not much recent info out there I have
> found except for this:
>
> http://www.mad-hacking.net/documentation/linux/ha-cluster/storage-area-network/ceph-xen-domu.xml
>
> and this plugin (which looks nice):
> https://github.com/mstarikov/rbdsr
>
> I am looking for a way that doesn’t involve too much command line so other
> admins that don’t know Ceph or XenServer very well can work with it.  I am
> just curious what others are doing… Any help is greatly appreciated!
>
> Cheers,
> Mike
>

I'm not a XenServer user, so I can't help you there; but I feel your pain
using Ceph for VMware storage.

I'm surprised that no major Linux distribution has considered
enabling the rbd module in the initrd.
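On Debian or Ubuntu based systems, for instance, it should just be a matter of
something like this (untested):

# echo rbd >> /etc/initramfs-tools/modules
# update-initramfs -u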

I can see having a tiny OS image containing not much more than grub and the
boot kernel. The trick would be to find a way to manage the boot string in
the grub conf on a large scale.

Jake
___
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com


[ceph-users] Improving metadata throughput

2016-06-29 Thread Daniel Davidson
I am starting to work with and benchmark our ceph cluster.  While 
throughput looks good so far, metadata performance appears to 
be suffering.  Is there anything that can be done to speed up the 
response time of looking through a lot of small files and folders?  
Right now, I am running four metadata servers and the filesystem is 
mounted via fuse.


We use the 'module' tool to manage environment variables for the applications on 
our cluster.  When I type "module avail" it takes about 30 minutes to 
get a response the first time, with a pair of my monitors running at 100% 
during this time.  Subsequent runs are near instantaneous.


Dan

___
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com


[ceph-users] Mounting Ceph RBD image to XenServer 7 as SR

2016-06-29 Thread Mike Jacobacci
Hi all,

Is there anyone using rbd for XenServer VM storage?  I have XenServer 7 and the 
latest Ceph, and I am looking for the best way to mount the rbd volume under 
XenServer.  There is not much recent info out there that I have found except for 
this:
http://www.mad-hacking.net/documentation/linux/ha-cluster/storage-area-network/ceph-xen-domu.xml
 


and this plugin (which looks nice):
https://github.com/mstarikov/rbdsr 

I am looking for a way that doesn’t involve too much command line so other 
admins that don’t know Ceph or XenServer very well can work with it.  I am just 
curious what others are doing… Any help is greatly appreciated!

Cheers,
Mike
___
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com


Re: [ceph-users] pg scrub and auto repair in hammer

2016-06-29 Thread Lionel Bouton
Hi,

Le 29/06/2016 18:33, Stefan Priebe - Profihost AG a écrit :
>> Am 28.06.2016 um 09:43 schrieb Lionel Bouton 
>> :
>>
>> Hi,
>>
>> Le 28/06/2016 08:34, Stefan Priebe - Profihost AG a écrit :
>>> [...]
>>> Yes but at least BTRFS is still not working for ceph due to
>>> fragmentation. I've even tested a 4.6 kernel a few weeks ago. But it
>>> doubles it's I/O after a few days.
>> BTRFS autodefrag is not working over the long term. That said BTRFS
>> itself is working far better than XFS on our cluster (noticeably better
>> latencies). As not having checksums wasn't an option we coded and are
>> using this:
>>
>> https://github.com/jtek/ceph-utils/blob/master/btrfs-defrag-scheduler.rb
>>
>> This actually saved us from 2 faulty disk controllers which were
>> infrequently corrupting data in our cluster.
>>
>> Mandatory too for performance :
>> filestore btrfs snap = false
> This sounds interesting. For how long you use this method?

More than a year now. Since the beginning almost two years ago we always
had at least one or two BTRFS OSDs to test and compare to the XFS ones.
At the very beginning we had to recycle them regularly because their
performance degraded over time. This was not a problem as Ceph makes it
easy to move data around safely.
We only switched after both finding out that "filestore btrfs snap =
false" was mandatory (when true it creates large write spikes every
filestore sync interval) and that a custom defragmentation process was
needed to maintain performance over the long run.
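For reference, the ceph.conf side of this boils down to a single option (the
defragmentation scheduler linked above runs as a separate process, outside of
ceph):

[osd]
    filestore btrfs snap = false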

>  What kind of workload do you have?

A dozen VMs using rbd through KVM built-in support. There are different
kinds of access patterns : a large PostgreSQL instance (75+GB on disk,
300+ tx/s with peaks of ~2000 with a mean of 50+ IO/s and peaks to 1000,
mostly writes), a small MySQL instance (hard to say : was very large but
we moved most of its content to PostgreSQL which left only a small
database for a proprietary tool and large ibdata* files with mostly
holes), a very large NFS server (~10 TB), lots of Ruby on Rails
applications and background workers.

On the whole storage system Ceph reports an average of 170 op/s with
peaks that can reach 3000.

>  How did you measure the performance and latency?

Every useful metric we can get is fed to a Zabbix server. Latency is
measured both by the kernel on each disk, as the average time a request
stays in queue (accumulated wait time / number of IOs over a given
period: you can find these values in /sys/block/<dev>/stat), and at the Ceph
level by monitoring the apply latency (we now have journals on SSD so
our commit latency is mostly limited by the available CPU).
The most interesting metric is the apply latency; block device latency
is useful to monitor to see how much the device itself is pushed and how
well reads perform (apply latency only gives us the write side of the
story).
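As a rough illustration (sda is just a placeholder device, and for real
monitoring you would take deltas between two samples rather than the lifetime
totals below), the average per-request time can be approximated with:

# awk '{ if ($1 + $5) printf "%.2f ms per request\n", ($4 + $8) / ($1 + $5) }' /sys/block/sda/stat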

The behavior during backfills confirmed the latency benefits too: BTRFS
OSDs were less frequently involved in slow requests than the XFS ones.

>  What kernel do you use with btrfs?

4.4.6 currently (we just finished migrating all servers last week-end).
But the switch from XFS to BTRFS occurred with late 3.9 kernels IIRC.

I don't have measurements for this but when we switched from 4.1.15-r1
("-r1" is for Gentoo patches) to 4.4.6 we saw faster OSD startups
(including the initial filesystem mount). The only drawback with BTRFS
(if you don't count having to develop and run a custom defragmentation
scheduler) was the OSD startup times vs XFS. It was very slow when
starting from an unmounted filesystem at least until 4.1.x. This was not
really a problem as we don't restart OSDs often.

Best regards,

Lionel
___
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com


[ceph-users] Hammer: PGs stuck creating

2016-06-29 Thread Brian Felton
Greetings,

I have a lab cluster running Hammer 0.94.6 and being used exclusively for
object storage.  The cluster consists of four servers running 60 6TB OSDs
each.  The main .rgw.buckets pool is using k=3 m=1 erasure coding and
contains 8192 placement groups.

Last week, one of our guys marked out and removed one OSD from each of three of
the four servers in the cluster, which resulted in some general badness
(the disks were wiped post-removal, so the data are gone).  After a proper
education in why this is a Bad Thing, we got the OSDs added back.  When all
was said and done, we had 30 pgs that were stuck incomplete, and no amount
of magic has been able to get them to recover.  From reviewing the data, we
knew that all of these pgs contained at least 2 of the removed OSDs; I
understand and accept that the data are gone, and that's not a concern (yay
lab).

Here are the things I've tried:

- Restarted all OSDs
- Stopped all OSDs, removed all OSDs from the crush map, and started
everything back up
- Executed a 'ceph pg force_create_pg ' for each of the 30 stuck pgs
- Executed a 'ceph pg send_pg_creates' to get the ball rolling on creates
- Executed several 'ceph pg  query' commands to ensure we were
referencing valid OSDs after the 'force_create_pg'
- Ensured those OSDs were really removed (e.g. 'ceph auth del', 'ceph osd
crush remove', and 'ceph osd rm')

At this point, I've got the same 30 pgs that are stuck creating.  I've run
out of ideas for getting this back to a healthy state.  In reviewing the
other posts on the mailing list, the overwhelming solution was a bad OSD in
the crush map, but I'm all but certain that isn't what's hitting us here.
Normally, being the lab, I'd consider nuking the .rgw.buckets pool and
starting from scratch, but we've recently spent a lot of time pulling 140TB
of data into this cluster for some performance and recovery tests, and I'd
prefer not to have to start that process again.  I am willing to entertain
most any other idea irrespective to how destructive it is to these PGs, so
long as I don't have to lose the rest of the data in the pool.

Many thanks in advance for any assistance here.

Brian Felton
___
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com


Re: [ceph-users] pg scrub and auto repair in hammer

2016-06-29 Thread Stefan Priebe - Profihost AG

> Am 28.06.2016 um 09:43 schrieb Lionel Bouton 
> :
> 
> Hi,
> 
> Le 28/06/2016 08:34, Stefan Priebe - Profihost AG a écrit :
>> [...]
>> Yes but at least BTRFS is still not working for ceph due to
>> fragmentation. I've even tested a 4.6 kernel a few weeks ago. But it
>> doubles it's I/O after a few days.
> 
> BTRFS autodefrag is not working over the long term. That said BTRFS
> itself is working far better than XFS on our cluster (noticeably better
> latencies). As not having checksums wasn't an option we coded and are
> using this:
> 
> https://github.com/jtek/ceph-utils/blob/master/btrfs-defrag-scheduler.rb
> 
> This actually saved us from 2 faulty disk controllers which were
> infrequently corrupting data in our cluster.
> 
> Mandatory too for performance :
> filestore btrfs snap = false

This sounds interesting. How long have you been using this method? What kind of 
workload do you have? How did you measure the performance and latency? What 
kernel do you use with btrfs?

Greets,
Stefan
> 
> Lionel

___
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com


Re: [ceph-users] Another cluster completely hang

2016-06-29 Thread Mario Giammarco
This time, at the end of the recovery procedure you described, the result was
that most pgs were active+clean and 20 pgs were incomplete.
After that, when trying to use the cluster, I got "request blocked more than"
warnings and no VM can start.
I know that something happened after the broken disk, probably a server
reboot. I am investigating.
But even if I find the origin of the problem, it will not help in finding a
solution now.
So I am spending my time repairing the pool only to save the production
data, and I will throw away the rest.
Now, after marking all pgs as complete with ceph-objectstore-tool, I see that:

1) ceph has put out three hdds (I suppose due to scrub, but that is only my
idea, I will check the logs) BAD
2) it is recovering the degraded and misplaced objects GOOD
3) VMs are not usable yet BAD
4) I see some pgs in state down+peering (I hope that is not BAD)

Regarding 1), how can I put those three hdds back in the cluster? Should I
remove them from crush and start again?
Can I tell ceph that they are not bad?
Mario

Il giorno mer 29 giu 2016 alle ore 15:34 Lionel Bouton <
lionel+c...@bouton.name> ha scritto:

> Hi,
>
> Le 29/06/2016 12:00, Mario Giammarco a écrit :
> > Now the problem is that ceph has put out two disks because scrub  has
> > failed (I think it is not a disk fault but due to mark-complete)
>
> There is something odd going on. I've only seen deep-scrub failing (ie
> detect one inconsistency and marking the pg so) so I'm not sure what
> happens in the case of a "simple" scrub failure but what should not
> happen is the whole OSD going down on scrub of deepscrub fairure which
> you seem to imply did happen.
> Do you have logs for these two failures giving a hint at what happened
> (probably /var/log/ceph/ceph-osd..log) ? Any kernel log pointing to
> hardware failure(s) around the time these events happened ?
>
> Another point : you said that you had one disk "broken". Usually ceph
> handles this case in the following manner :
> - the OSD detects the problem and commit suicide (unless it's configured
> to ignore IO errors which is not the default),
> - your cluster is then in degraded state with one OSD down/in,
> - after a timeout (several minutes), Ceph decides that the OSD won't
> come up again soon and marks the OSD "out" (so one OSD down/out),
> - as the OSD is out, crush adapts pg positions based on the remaining
> available OSDs and bring back all degraded pg to clean state by creating
> missing replicas while moving pgs around. You see a lot of IO, many pg
> in wait_backfill/backfilling states at this point,
> - when all is done the cluster is back to HEALTH_OK
>
> When your disk was broken and you waited 24 hours how far along this
> process was your cluster ?
>
> Best regards,
>
> Lionel
>
___
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com


Re: [ceph-users] Another cluster completely hang

2016-06-29 Thread Lionel Bouton
Hi,

Le 29/06/2016 12:00, Mario Giammarco a écrit :
> Now the problem is that ceph has put out two disks because scrub  has
> failed (I think it is not a disk fault but due to mark-complete)

There is something odd going on. I've only seen deep-scrub failing (ie
detect one inconsistency and marking the pg so) so I'm not sure what
happens in the case of a "simple" scrub failure but what should not
happen is the whole OSD going down on scrub of deepscrub fairure which
you seem to imply did happen.
Do you have logs for these two failures giving a hint at what happened
(probably /var/log/ceph/ceph-osd..log) ? Any kernel log pointing to
hardware failure(s) around the time these events happened ?

Another point : you said that you had one disk "broken". Usually ceph
handles this case in the following manner :
- the OSD detects the problem and commit suicide (unless it's configured
to ignore IO errors which is not the default),
- your cluster is then in degraded state with one OSD down/in,
- after a timeout (several minutes), Ceph decides that the OSD won't
come up again soon and marks the OSD "out" (so one OSD down/out),
- as the OSD is out, crush adapts pg positions based on the remaining
available OSDs and bring back all degraded pg to clean state by creating
missing replicas while moving pgs around. You see a lot of IO, many pg
in wait_backfill/backfilling states at this point,
- when all is done the cluster is back to HEALTH_OK
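While that is going on, the progress is easy to follow with something like:

# ceph -w                       # streaming cluster log
# ceph health detail            # which pgs are degraded / backfilling
# ceph osd tree | grep -w down  # which OSDs are currently down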

When your disk was broken and you waited 24 hours how far along this
process was your cluster ?

Best regards,

Lionel
___
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com


Re: [ceph-users] Another cluster completely hang

2016-06-29 Thread Oliver Dzombic
Hi,

it does not.

But in your case, you have 10 OSDs, and 7 of them have incomplete PGs.

Since your Proxmox VPSs are not on a single PG but spread across
many PGs, there is a good chance that at least some data of every VPS is
on one of the defective PGs.

-- 
Mit freundlichen Gruessen / Best regards

Oliver Dzombic
IP-Interactive

mailto:i...@ip-interactive.de

Anschrift:

IP Interactive UG ( haftungsbeschraenkt )
Zum Sonnenberg 1-3
63571 Gelnhausen

HRB 93402 beim Amtsgericht Hanau
Geschäftsführung: Oliver Dzombic

Steuer Nr.: 35 236 3622 1
UST ID: DE274086107


Am 29.06.2016 um 13:09 schrieb Mario Giammarco:
> Just one question: why when ceph has some incomplete pgs it refuses to
> do I/o on good pgs?
> 
> Il giorno mer 29 giu 2016 alle ore 12:55 Oliver Dzombic
> > ha scritto:
> 
> Hi,
> 
> again:
> 
> You >must< check all your logs ( as fucky as it is for sure ).
> 
> Means on the ceph nodes in /var/log/ceph/*
> 
> And go back to the time where things went down the hill.
> 
> There must be something else going on, beyond normal osd crash.
> 
> And your manual pg repair/pg remove/pg set complete is, most probably,
> just getting your situation worst.
> 
> So really, if you want to have a chance to find out whats going on, you
> must check all the logs. Especially the OSD logs, especially the OSD log
> of the OSD you removed, and then the OSD logs of those pg, which are
> incomplete/stuck/what_ever_not_good.
> 
> --
> Mit freundlichen Gruessen / Best regards
> 
> Oliver Dzombic
> IP-Interactive
> 
> mailto:i...@ip-interactive.de 
> 
> Anschrift:
> 
> IP Interactive UG ( haftungsbeschraenkt )
> Zum Sonnenberg 1-3
> 63571 Gelnhausen
> 
> HRB 93402 beim Amtsgericht Hanau
> Geschäftsführung: Oliver Dzombic
> 
> Steuer Nr.: 35 236 3622 1
> UST ID: DE274086107
> 
> 
> Am 29.06.2016 um 12:33 schrieb Mario Giammarco:
> > Thanks,
> > I can put in osds but the do not stay in, and I am pretty sure
> that are
> > not broken.
> >
> > Il giorno mer 29 giu 2016 alle ore 12:07 Oliver Dzombic
> > 
> >> ha
> scritto:
> >
> > hi,
> >
> > ceph osd set noscrub
> > ceph osd set nodeep-scrub
> >
> > ceph osd in 
> >
> >
> > --
> > Mit freundlichen Gruessen / Best regards
> >
> > Oliver Dzombic
> > IP-Interactive
> >
> > mailto:i...@ip-interactive.de 
> >
> >
> > Anschrift:
> >
> > IP Interactive UG ( haftungsbeschraenkt )
> > Zum Sonnenberg 1-3
> > 63571 Gelnhausen
> >
> > HRB 93402 beim Amtsgericht Hanau
> > Geschäftsführung: Oliver Dzombic
> >
> > Steuer Nr.: 35 236 3622 1
> > UST ID: DE274086107
> >
> >
> > Am 29.06.2016 um 12:00 schrieb Mario Giammarco:
> > > Now the problem is that ceph has put out two disks because
> scrub  has
> > > failed (I think it is not a disk fault but due to mark-complete)
> > > How can I:
> > > - disable scrub
> > > - put in again the two disks
> > >
> > > I will wait anyway the end of recovery to be sure it really
> works
> > again
> > >
> > > Il giorno mer 29 giu 2016 alle ore 11:16 Mario Giammarco
> > > 
> >
> > 
>  scritto:
> > >
> > > Infact I am worried because:
> > >
> > > 1) ceph is under proxmox, and proxmox may decide to reboot a
> > server
> > > if it is not responding
> > > 2) probably a server was rebooted while ceph was
> reconstructing
> > > 3) even using max=3 do not help
> > >
> > > Anyway this is the "unofficial" procedure that I am
> using, much
> > > simpler than blog post:
> > >
> > > 1) find host where is pg
> > > 2) stop ceph in that host
> > > 3) ceph-objectstore-tool --pgid 1.98 --op mark-complete
> > --data-path
> > > /var/lib/ceph/osd/ceph-9 --journal-path
> > > /var/lib/ceph/osd/ceph-9/journal
> > > 4) start ceph
> > > 5) look finally it reconstructing
> > >
> > > Il giorno mer 29 giu 2016 alle ore 11:11 Oliver Dzombic
> > > 

Re: [ceph-users] Another cluster completely hang

2016-06-29 Thread Mario Giammarco
Just one question: why does ceph refuse to do I/O on good pgs when it has
some incomplete pgs?

Il giorno mer 29 giu 2016 alle ore 12:55 Oliver Dzombic <
i...@ip-interactive.de> ha scritto:

> Hi,
>
> again:
>
> You >must< check all your logs ( as fucky as it is for sure ).
>
> Means on the ceph nodes in /var/log/ceph/*
>
> And go back to the time where things went down the hill.
>
> There must be something else going on, beyond normal osd crash.
>
> And your manual pg repair/pg remove/pg set complete is, most probably,
> just getting your situation worst.
>
> So really, if you want to have a chance to find out whats going on, you
> must check all the logs. Especially the OSD logs, especially the OSD log
> of the OSD you removed, and then the OSD logs of those pg, which are
> incomplete/stuck/what_ever_not_good.
>
> --
> Mit freundlichen Gruessen / Best regards
>
> Oliver Dzombic
> IP-Interactive
>
> mailto:i...@ip-interactive.de
>
> Anschrift:
>
> IP Interactive UG ( haftungsbeschraenkt )
> Zum Sonnenberg 1-3
> 63571 Gelnhausen
>
> HRB 93402 beim Amtsgericht Hanau
> Geschäftsführung: Oliver Dzombic
>
> Steuer Nr.: 35 236 3622 1
> UST ID: DE274086107
>
>
> Am 29.06.2016 um 12:33 schrieb Mario Giammarco:
> > Thanks,
> > I can put in osds but the do not stay in, and I am pretty sure that are
> > not broken.
> >
> > Il giorno mer 29 giu 2016 alle ore 12:07 Oliver Dzombic
> > > ha scritto:
> >
> > hi,
> >
> > ceph osd set noscrub
> > ceph osd set nodeep-scrub
> >
> > ceph osd in 
> >
> >
> > --
> > Mit freundlichen Gruessen / Best regards
> >
> > Oliver Dzombic
> > IP-Interactive
> >
> > mailto:i...@ip-interactive.de 
> >
> > Anschrift:
> >
> > IP Interactive UG ( haftungsbeschraenkt )
> > Zum Sonnenberg 1-3
> > 63571 Gelnhausen
> >
> > HRB 93402 beim Amtsgericht Hanau
> > Geschäftsführung: Oliver Dzombic
> >
> > Steuer Nr.: 35 236 3622 1
> > UST ID: DE274086107
> >
> >
> > Am 29.06.2016 um 12:00 schrieb Mario Giammarco:
> > > Now the problem is that ceph has put out two disks because scrub
> has
> > > failed (I think it is not a disk fault but due to mark-complete)
> > > How can I:
> > > - disable scrub
> > > - put in again the two disks
> > >
> > > I will wait anyway the end of recovery to be sure it really works
> > again
> > >
> > > Il giorno mer 29 giu 2016 alle ore 11:16 Mario Giammarco
> > > 
> > >> ha
> scritto:
> > >
> > > Infact I am worried because:
> > >
> > > 1) ceph is under proxmox, and proxmox may decide to reboot a
> > server
> > > if it is not responding
> > > 2) probably a server was rebooted while ceph was reconstructing
> > > 3) even using max=3 do not help
> > >
> > > Anyway this is the "unofficial" procedure that I am using, much
> > > simpler than blog post:
> > >
> > > 1) find host where is pg
> > > 2) stop ceph in that host
> > > 3) ceph-objectstore-tool --pgid 1.98 --op mark-complete
> > --data-path
> > > /var/lib/ceph/osd/ceph-9 --journal-path
> > > /var/lib/ceph/osd/ceph-9/journal
> > > 4) start ceph
> > > 5) look finally it reconstructing
> > >
> > > Il giorno mer 29 giu 2016 alle ore 11:11 Oliver Dzombic
> > > 
> > >> ha
> > scritto:
> > >
> > > Hi,
> > >
> > > removing ONE disk while your replication is 2, is no
> problem.
> > >
> > > You dont need to wait a single second to replace of remove
> > it. Its
> > > anyway not used and out/down. So from ceph's point of view
> its
> > > not existent.
> > >
> > > 
> > >
> > > But as christian told you already, what we see now fits to
> a
> > > szenario
> > > where you lost the osd and eighter you did something, or
> > > something else
> > > happens, but the data were not recovered again.
> > >
> > > Eighter because another OSD was broken, or because you did
> > > something.
> > >
> > > Maybe, because of the "too many PGs per OSD (307 > max
> 300)"
> > > ceph never
> > > recovered.
> > >
> > > What i can see from http://pastebin.com/VZD7j2vN is that
> > >
> > > OSD 5,13,9,0,6,2,3 and maybe others, are the OSD's holding
> the
> > > incomplete data.
> > >
> > > This are 7 OSD's from 10. So something happend to that
> > OSD's or
> > > the data
> >   

Re: [ceph-users] Another cluster completely hang

2016-06-29 Thread Oliver Dzombic
Hi,

again:

You >must< check all your logs ( as fucky as it is for sure ).

Means on the ceph nodes in /var/log/ceph/*

And go back to the time where things went down the hill.

There must be something else going on, beyond normal osd crash.

And your manual pg repair/pg remove/pg set complete is, most probably,
just making your situation worse.

So really, if you want to have a chance to find out what's going on, you
must check all the logs. Especially the OSD logs, especially the OSD log
of the OSD you removed, and then the OSD logs of those pgs which are
incomplete/stuck/what_ever_not_good.

-- 
Mit freundlichen Gruessen / Best regards

Oliver Dzombic
IP-Interactive

mailto:i...@ip-interactive.de

Anschrift:

IP Interactive UG ( haftungsbeschraenkt )
Zum Sonnenberg 1-3
63571 Gelnhausen

HRB 93402 beim Amtsgericht Hanau
Geschäftsführung: Oliver Dzombic

Steuer Nr.: 35 236 3622 1
UST ID: DE274086107


Am 29.06.2016 um 12:33 schrieb Mario Giammarco:
> Thanks,
> I can put in osds but the do not stay in, and I am pretty sure that are
> not broken.
> 
> Il giorno mer 29 giu 2016 alle ore 12:07 Oliver Dzombic
> > ha scritto:
> 
> hi,
> 
> ceph osd set noscrub
> ceph osd set nodeep-scrub
> 
> ceph osd in 
> 
> 
> --
> Mit freundlichen Gruessen / Best regards
> 
> Oliver Dzombic
> IP-Interactive
> 
> mailto:i...@ip-interactive.de 
> 
> Anschrift:
> 
> IP Interactive UG ( haftungsbeschraenkt )
> Zum Sonnenberg 1-3
> 63571 Gelnhausen
> 
> HRB 93402 beim Amtsgericht Hanau
> Geschäftsführung: Oliver Dzombic
> 
> Steuer Nr.: 35 236 3622 1
> UST ID: DE274086107
> 
> 
> Am 29.06.2016 um 12:00 schrieb Mario Giammarco:
> > Now the problem is that ceph has put out two disks because scrub  has
> > failed (I think it is not a disk fault but due to mark-complete)
> > How can I:
> > - disable scrub
> > - put in again the two disks
> >
> > I will wait anyway the end of recovery to be sure it really works
> again
> >
> > Il giorno mer 29 giu 2016 alle ore 11:16 Mario Giammarco
> > 
> >> ha scritto:
> >
> > Infact I am worried because:
> >
> > 1) ceph is under proxmox, and proxmox may decide to reboot a
> server
> > if it is not responding
> > 2) probably a server was rebooted while ceph was reconstructing
> > 3) even using max=3 do not help
> >
> > Anyway this is the "unofficial" procedure that I am using, much
> > simpler than blog post:
> >
> > 1) find host where is pg
> > 2) stop ceph in that host
> > 3) ceph-objectstore-tool --pgid 1.98 --op mark-complete
> --data-path
> > /var/lib/ceph/osd/ceph-9 --journal-path
> > /var/lib/ceph/osd/ceph-9/journal
> > 4) start ceph
> > 5) look finally it reconstructing
> >
> > Il giorno mer 29 giu 2016 alle ore 11:11 Oliver Dzombic
> > 
> >> ha
> scritto:
> >
> > Hi,
> >
> > removing ONE disk while your replication is 2, is no problem.
> >
> > You dont need to wait a single second to replace of remove
> it. Its
> > anyway not used and out/down. So from ceph's point of view its
> > not existent.
> >
> > 
> >
> > But as christian told you already, what we see now fits to a
> > szenario
> > where you lost the osd and eighter you did something, or
> > something else
> > happens, but the data were not recovered again.
> >
> > Eighter because another OSD was broken, or because you did
> > something.
> >
> > Maybe, because of the "too many PGs per OSD (307 > max 300)"
> > ceph never
> > recovered.
> >
> > What i can see from http://pastebin.com/VZD7j2vN is that
> >
> > OSD 5,13,9,0,6,2,3 and maybe others, are the OSD's holding the
> > incomplete data.
> >
> > This are 7 OSD's from 10. So something happend to that
> OSD's or
> > the data
> > in them. And that had nothing to do with a single disk
> failing.
> >
> > Something else must have been happend.
> >
> > And as christian already wrote: you will have to go
> through your
> > logs
> > back until the point were things going down.
> >
> > Because a fail of a single OSD, no matter what your
> replication
> > size is,
> > can ( 

Re: [ceph-users] Another cluster completely hang

2016-06-29 Thread Mario Giammarco
Thanks,
I can put the osds in, but they do not stay in, and I am pretty sure they are
not broken.

Il giorno mer 29 giu 2016 alle ore 12:07 Oliver Dzombic <
i...@ip-interactive.de> ha scritto:

> hi,
>
> ceph osd set noscrub
> ceph osd set nodeep-scrub
>
> ceph osd in 
>
>
> --
> Mit freundlichen Gruessen / Best regards
>
> Oliver Dzombic
> IP-Interactive
>
> mailto:i...@ip-interactive.de
>
> Anschrift:
>
> IP Interactive UG ( haftungsbeschraenkt )
> Zum Sonnenberg 1-3
> 63571 Gelnhausen
>
> HRB 93402 beim Amtsgericht Hanau
> Geschäftsführung: Oliver Dzombic
>
> Steuer Nr.: 35 236 3622 1
> UST ID: DE274086107
>
>
> Am 29.06.2016 um 12:00 schrieb Mario Giammarco:
> > Now the problem is that ceph has put out two disks because scrub  has
> > failed (I think it is not a disk fault but due to mark-complete)
> > How can I:
> > - disable scrub
> > - put in again the two disks
> >
> > I will wait anyway the end of recovery to be sure it really works again
> >
> > Il giorno mer 29 giu 2016 alle ore 11:16 Mario Giammarco
> > > ha scritto:
> >
> > Infact I am worried because:
> >
> > 1) ceph is under proxmox, and proxmox may decide to reboot a server
> > if it is not responding
> > 2) probably a server was rebooted while ceph was reconstructing
> > 3) even using max=3 do not help
> >
> > Anyway this is the "unofficial" procedure that I am using, much
> > simpler than blog post:
> >
> > 1) find host where is pg
> > 2) stop ceph in that host
> > 3) ceph-objectstore-tool --pgid 1.98 --op mark-complete --data-path
> > /var/lib/ceph/osd/ceph-9 --journal-path
> > /var/lib/ceph/osd/ceph-9/journal
> > 4) start ceph
> > 5) look finally it reconstructing
> >
> > Il giorno mer 29 giu 2016 alle ore 11:11 Oliver Dzombic
> > > ha scritto:
> >
> > Hi,
> >
> > removing ONE disk while your replication is 2, is no problem.
> >
> > You dont need to wait a single second to replace of remove it.
> Its
> > anyway not used and out/down. So from ceph's point of view its
> > not existent.
> >
> > 
> >
> > But as christian told you already, what we see now fits to a
> > szenario
> > where you lost the osd and eighter you did something, or
> > something else
> > happens, but the data were not recovered again.
> >
> > Eighter because another OSD was broken, or because you did
> > something.
> >
> > Maybe, because of the "too many PGs per OSD (307 > max 300)"
> > ceph never
> > recovered.
> >
> > What i can see from http://pastebin.com/VZD7j2vN is that
> >
> > OSD 5,13,9,0,6,2,3 and maybe others, are the OSD's holding the
> > incomplete data.
> >
> > This are 7 OSD's from 10. So something happend to that OSD's or
> > the data
> > in them. And that had nothing to do with a single disk failing.
> >
> > Something else must have been happend.
> >
> > And as christian already wrote: you will have to go through your
> > logs
> > back until the point were things going down.
> >
> > Because a fail of a single OSD, no matter what your replication
> > size is,
> > can ( normally ) not harm the consistency of 7 other OSD's,
> > means 70% of
> > your total cluster.
> >
> > --
> > Mit freundlichen Gruessen / Best regards
> >
> > Oliver Dzombic
> > IP-Interactive
> >
> > mailto:i...@ip-interactive.de 
> >
> > Anschrift:
> >
> > IP Interactive UG ( haftungsbeschraenkt )
> > Zum Sonnenberg 1-3
> > 63571 Gelnhausen
> >
> > HRB 93402 beim Amtsgericht Hanau
> > Geschäftsführung: Oliver Dzombic
> >
> > Steuer Nr.: 35 236 3622 1
> > UST ID: DE274086107
> >
> >
> > Am 29.06.2016 um 10:56 schrieb Mario Giammarco:
> > > Yes I have removed it from crush because it was broken. I have
> > waited 24
> > > hours to see if cephs would like to heals itself. Then I
> > removed the
> > > disk completely (it was broken...) and I waited 24 hours
> > again. Then I
> > > start getting worried.
> > > Are you saying to me that I should not remove a broken disk
> from
> > > cluster? 24 hours were not enough?
> > >
> > > Il giorno mer 29 giu 2016 alle ore 10:53 Zoltan Arnold Nagy
> > > 
> >  > >> ha scritto:
> > >
> > > Just loosing one disk doesn’t automagically delete it from
> > CRUSH,
> > > but in the output you had 10 disks listed, so there 

Re: [ceph-users] Another cluster completely hang

2016-06-29 Thread Oliver Dzombic
hi,

ceph osd set noscrub
ceph osd set nodeep-scrub

ceph osd in 
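and later, once recovery has settled, to re-enable scrubbing:

ceph osd unset noscrub
ceph osd unset nodeep-scrub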


-- 
Mit freundlichen Gruessen / Best regards

Oliver Dzombic
IP-Interactive

mailto:i...@ip-interactive.de

Anschrift:

IP Interactive UG ( haftungsbeschraenkt )
Zum Sonnenberg 1-3
63571 Gelnhausen

HRB 93402 beim Amtsgericht Hanau
Geschäftsführung: Oliver Dzombic

Steuer Nr.: 35 236 3622 1
UST ID: DE274086107


Am 29.06.2016 um 12:00 schrieb Mario Giammarco:
> Now the problem is that ceph has put out two disks because scrub  has
> failed (I think it is not a disk fault but due to mark-complete)
> How can I:
> - disable scrub
> - put in again the two disks
> 
> I will wait anyway the end of recovery to be sure it really works again
> 
> Il giorno mer 29 giu 2016 alle ore 11:16 Mario Giammarco
> > ha scritto:
> 
> Infact I am worried because:
> 
> 1) ceph is under proxmox, and proxmox may decide to reboot a server
> if it is not responding
> 2) probably a server was rebooted while ceph was reconstructing
> 3) even using max=3 do not help
> 
> Anyway this is the "unofficial" procedure that I am using, much
> simpler than blog post:
> 
> 1) find host where is pg
> 2) stop ceph in that host
> 3) ceph-objectstore-tool --pgid 1.98 --op mark-complete --data-path
> /var/lib/ceph/osd/ceph-9 --journal-path
> /var/lib/ceph/osd/ceph-9/journal 
> 4) start ceph
> 5) look finally it reconstructing
> 
> Il giorno mer 29 giu 2016 alle ore 11:11 Oliver Dzombic
> > ha scritto:
> 
> Hi,
> 
> removing ONE disk while your replication is 2, is no problem.
> 
> You dont need to wait a single second to replace of remove it. Its
> anyway not used and out/down. So from ceph's point of view its
> not existent.
> 
> 
> 
> But as christian told you already, what we see now fits to a
> szenario
> where you lost the osd and eighter you did something, or
> something else
> happens, but the data were not recovered again.
> 
> Eighter because another OSD was broken, or because you did
> something.
> 
> Maybe, because of the "too many PGs per OSD (307 > max 300)"
> ceph never
> recovered.
> 
> What i can see from http://pastebin.com/VZD7j2vN is that
> 
> OSD 5,13,9,0,6,2,3 and maybe others, are the OSD's holding the
> incomplete data.
> 
> This are 7 OSD's from 10. So something happend to that OSD's or
> the data
> in them. And that had nothing to do with a single disk failing.
> 
> Something else must have been happend.
> 
> And as christian already wrote: you will have to go through your
> logs
> back until the point were things going down.
> 
> Because a fail of a single OSD, no matter what your replication
> size is,
> can ( normally ) not harm the consistency of 7 other OSD's,
> means 70% of
> your total cluster.
> 
> --
> Mit freundlichen Gruessen / Best regards
> 
> Oliver Dzombic
> IP-Interactive
> 
> mailto:i...@ip-interactive.de 
> 
> Anschrift:
> 
> IP Interactive UG ( haftungsbeschraenkt )
> Zum Sonnenberg 1-3
> 63571 Gelnhausen
> 
> HRB 93402 beim Amtsgericht Hanau
> Geschäftsführung: Oliver Dzombic
> 
> Steuer Nr.: 35 236 3622 1
> UST ID: DE274086107
> 
> 
> Am 29.06.2016 um 10:56 schrieb Mario Giammarco:
> > Yes I have removed it from crush because it was broken. I have
> waited 24
> > hours to see if cephs would like to heals itself. Then I
> removed the
> > disk completely (it was broken...) and I waited 24 hours
> again. Then I
> > start getting worried.
> > Are you saying to me that I should not remove a broken disk from
> > cluster? 24 hours were not enough?
> >
> > Il giorno mer 29 giu 2016 alle ore 10:53 Zoltan Arnold Nagy
> > 
>  >> ha scritto:
> >
> > Just loosing one disk doesn’t automagically delete it from
> CRUSH,
> > but in the output you had 10 disks listed, so there must be
> > something else going - did you delete the disk from the
> crush map as
> > well?
> >
> > Ceph waits by default 300 secs AFAIK to mark an OSD out
> after it
> > will start to recover.
> >
> >
> >> On 29 Jun 2016, at 10:42, Mario Giammarco
> 
> >> 

Re: [ceph-users] Is anyone seeing iissues with task_numa_find_cpu?

2016-06-29 Thread Stefan Priebe - Profihost AG
Hi,

to be precise, I have far more patches applied to the sched part of the
kernel (around 20). So maybe that's the reason why it helps for me.

Could you please post a complete stack trace? Qemu / KVM also triggers this.

Stefan

Am 29.06.2016 um 11:41 schrieb Campbell Steven:
> Hi Alex/Stefan,
> 
> I'm in the middle of testing 4.7rc5 on our test cluster to confirm
> once and for all this particular issue has been completely resolved by
> Peter's recent patch to sched/fair.c refereed to by Stefan above. For
> us anyway the patches that Stefan applied did not solve the issue and
> neither did any 4.5.x or 4.6.x released kernel thus far, hopefully it
> does the trick for you. We could get about 4 hours uptime before
> things went haywire for us.
> 
> It's interesting how it seems the CEPH workload triggers this bug so
> well as it's quite a long standing issue that's only just been
> resolved, another user chimed in on the lkml thread a couple of days
> ago as well and again his trace had ceph-osd in it as well.
> 
> https://lkml.org/lkml/headers/2016/6/21/491
> 
> Campbell
> 
> On 29 June 2016 at 18:29, Stefan Priebe - Profihost AG
>  wrote:
>>
>> Am 29.06.2016 um 04:30 schrieb Alex Gorbachev:
>>> Hi Stefan,
>>>
>>> On Tue, Jun 28, 2016 at 1:46 PM, Stefan Priebe - Profihost AG
>>>  wrote:
 Please be aware that you may need even more patches. Overall this needs 3
 patches. Where the first two try to fix a bug and the 3rd one fixes the
 fixes + even more bugs related to the scheduler. I've no idea on which 
 patch
 level Ubuntu is.
>>>
>>> Stefan, would you be able to please point to the other two patches
>>> beside https://lkml.org/lkml/diff/2016/6/22/102/1 ?
>>
>> Sorry sure yes:
>>
>> 1. 2b8c41daba32 ("sched/fair: Initiate a new task's util avg to a
>> bounded value")
>>
>> 2.) 40ed9cba24bb7e01cc380a02d3f04065b8afae1d ("sched/fair: Fix
>> post_init_entity_util_avg() serialization")
>>
>> 3.) the one listed at lkml.
>>
>> Stefan
>>
>>>
>>> Thank you,
>>> Alex
>>>

 Stefan

 Excuse my typo sent from my mobile phone.

 Am 28.06.2016 um 17:59 schrieb Tim Bishop :

 Yes - I noticed this today on Ubuntu 16.04 with the default kernel. No
 useful information to add other than it's not just you.

 Tim.

 On Tue, Jun 28, 2016 at 11:05:40AM -0400, Alex Gorbachev wrote:

 After upgrading to kernel 4.4.13 on Ubuntu, we are seeing a few of

 these issues where an OSD would fail with the stack below.  I logged a

 bug at https://bugzilla.kernel.org/show_bug.cgi?id=121101 and there is

 a similar description at https://lkml.org/lkml/2016/6/22/102, but the

 odd part is we have turned off CFQ and blk-mq/scsi-mq and are using

 just the noop scheduler.


 Does the ceph kernel code somehow use the fair scheduler code block?


 Thanks

 --

 Alex Gorbachev

 Storcium


 Jun 28 09:46:41 roc04r-sca090 kernel: [137912.684974] CPU: 30 PID:

 10403 Comm: ceph-osd Not tainted 4.4.13-040413-generic #201606072354

 Jun 28 09:46:41 roc04r-sca090 kernel: [137912.684991] Hardware name:

 Supermicro X9DRi-LN4+/X9DR3-LN4+/X9DRi-LN4+/X9DR3-LN4+, BIOS 3.2

 03/04/2015

 Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685009] task:

 880f79df8000 ti: 880f79fb8000 task.ti: 880f79fb8000

 Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685024] RIP:

 0010:[]  []

 task_numa_find_cpu+0x22e/0x6f0

 Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685051] RSP:

 0018:880f79fbb818  EFLAGS: 00010206

 Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685063] RAX:

  RBX: 880f79fbb8b8 RCX: 

 Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685076] RDX:

  RSI:  RDI: 8810352d4800

 Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685107] RBP:

 880f79fbb880 R08: 0001020cf87c R09: 00ff00ff

 Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685150] R10:

 0009 R11: 0006 R12: 8807c3adc4c0

 Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685194] R13:

 0006 R14: 033e R15: fec7

 Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685238] FS:

 7f30e46b8700() GS:88105f58()

 knlGS:

 Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685283] CS:  0010 DS:

  ES:  CR0: 80050033

 Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685310] CR2:

 1321a000 CR3: 000853598000 CR4: 000406e0

 Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685354] Stack:

 Jun 28 09:46:41 

Re: [ceph-users] Another cluster completely hang

2016-06-29 Thread Mario Giammarco
Now the problem is that ceph has marked two disks out because scrubbing has
failed (I think it is not a disk fault but a consequence of the mark-complete).
How can I:
- disable scrubbing
- put the two disks back in

I will wait for the end of recovery anyway, to be sure it really works again.
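
For reference, a minimal sketch of the commands this would involve (the flag
names are the standard ones; the OSD ids are placeholders, replace 4 and 9
with the ids that were actually marked out):

# stop all scrubbing until recovery has finished
ceph osd set noscrub
ceph osd set nodeep-scrub

# restart the OSD daemons with your init system if they crashed, e.g.
# "service ceph start osd.4" (sysvinit) or "systemctl start ceph-osd@4",
# then mark them back in
ceph osd in 4
ceph osd in 9

# once everything is active+clean again
ceph osd unset noscrub
ceph osd unset nodeep-scrub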

Il giorno mer 29 giu 2016 alle ore 11:16 Mario Giammarco <
mgiamma...@gmail.com> ha scritto:

> Infact I am worried because:
>
> 1) ceph is under proxmox, and proxmox may decide to reboot a server if it
> is not responding
> 2) probably a server was rebooted while ceph was reconstructing
> 3) even using max=3 do not help
>
> Anyway this is the "unofficial" procedure that I am using, much simpler
> than blog post:
>
> 1) find host where is pg
> 2) stop ceph in that host
> 3) ceph-objectstore-tool --pgid 1.98 --op mark-complete --data-path
> /var/lib/ceph/osd/ceph-9 --journal-path /var/lib/ceph/osd/ceph-9/journal
> 4) start ceph
> 5) look finally it reconstructing
>
> Il giorno mer 29 giu 2016 alle ore 11:11 Oliver Dzombic <
> i...@ip-interactive.de> ha scritto:
>
>> Hi,
>>
>> removing ONE disk while your replication is 2, is no problem.
>>
>> You dont need to wait a single second to replace of remove it. Its
>> anyway not used and out/down. So from ceph's point of view its not
>> existent.
>>
>> 
>>
>> But as christian told you already, what we see now fits to a szenario
>> where you lost the osd and eighter you did something, or something else
>> happens, but the data were not recovered again.
>>
>> Eighter because another OSD was broken, or because you did something.
>>
>> Maybe, because of the "too many PGs per OSD (307 > max 300)" ceph never
>> recovered.
>>
>> What i can see from http://pastebin.com/VZD7j2vN is that
>>
>> OSD 5,13,9,0,6,2,3 and maybe others, are the OSD's holding the
>> incomplete data.
>>
>> This are 7 OSD's from 10. So something happend to that OSD's or the data
>> in them. And that had nothing to do with a single disk failing.
>>
>> Something else must have been happend.
>>
>> And as christian already wrote: you will have to go through your logs
>> back until the point were things going down.
>>
>> Because a fail of a single OSD, no matter what your replication size is,
>> can ( normally ) not harm the consistency of 7 other OSD's, means 70% of
>> your total cluster.
>>
>> --
>> Mit freundlichen Gruessen / Best regards
>>
>> Oliver Dzombic
>> IP-Interactive
>>
>> mailto:i...@ip-interactive.de
>>
>> Anschrift:
>>
>> IP Interactive UG ( haftungsbeschraenkt )
>> Zum Sonnenberg 1-3
>> 63571 Gelnhausen
>>
>> HRB 93402 beim Amtsgericht Hanau
>> Geschäftsführung: Oliver Dzombic
>>
>> Steuer Nr.: 35 236 3622 1
>> UST ID: DE274086107
>>
>>
>> Am 29.06.2016 um 10:56 schrieb Mario Giammarco:
>> > Yes I have removed it from crush because it was broken. I have waited 24
>> > hours to see if cephs would like to heals itself. Then I removed the
>> > disk completely (it was broken...) and I waited 24 hours again. Then I
>> > start getting worried.
>> > Are you saying to me that I should not remove a broken disk from
>> > cluster? 24 hours were not enough?
>> >
>> > Il giorno mer 29 giu 2016 alle ore 10:53 Zoltan Arnold Nagy
>> > > ha
>> scritto:
>> >
>> > Just loosing one disk doesn’t automagically delete it from CRUSH,
>> > but in the output you had 10 disks listed, so there must be
>> > something else going - did you delete the disk from the crush map as
>> > well?
>> >
>> > Ceph waits by default 300 secs AFAIK to mark an OSD out after it
>> > will start to recover.
>> >
>> >
>> >> On 29 Jun 2016, at 10:42, Mario Giammarco > >> > wrote:
>> >>
>> >> I thank you for your reply so I can add my experience:
>> >>
>> >> 1) the other time this thing happened to me I had a cluster with
>> >> min_size=2 and size=3 and the problem was the same. That time I
>> >> put min_size=1 to recover the pool but it did not help. So I do
>> >> not understand where is the advantage to put three copies when
>> >> ceph can decide to discard all three.
>> >> 2) I started with 11 hdds. The hard disk failed. Ceph waited
>> >> forever for hard disk coming back. But hard disk is really
>> >> completelly broken so I have followed the procedure to really
>> >> delete from cluster. Anyway ceph did not recover.
>> >> 3) I have 307 pgs more than 300 but it is due to the fact that I
>> >> had 11 hdds now only 10. I will add more hdds after I repair the
>> pool
>> >> 4) I have reduced the monitors to 3
>> >>
>> >>
>> >>
>> >> Il giorno mer 29 giu 2016 alle ore 10:25 Christian Balzer
>> >> > ha scritto:
>> >>
>> >>
>> >> Hello,
>> >>
>> >> On Wed, 29 Jun 2016 06:02:59 + Mario Giammarco wrote:
>> >>
>> >> > pool 0 'rbd' replicated size 2 min_size 1 crush_ruleset 0
>> >> object_hash
>> 

Re: [ceph-users] Is anyone seeing iissues with task_numa_find_cpu?

2016-06-29 Thread Campbell Steven
Hi Alex/Stefan,

I'm in the middle of testing 4.7-rc5 on our test cluster to confirm once
and for all that this particular issue has been completely resolved by
Peter's recent patch to sched/fair.c referred to by Stefan above. For us,
anyway, the patches that Stefan applied did not solve the issue, and
neither did any 4.5.x or 4.6.x released kernel so far; hopefully it does
the trick for you. We could get about 4 hours of uptime before things
went haywire for us.
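
As an aside, a quick way to double-check that a kernel tree you are about
to build already contains the first two commits Stefan lists below (a
sketch; run it inside a kernel git checkout):

git tag --contains 2b8c41daba32
git tag --contains 40ed9cba24bb7e01cc380a02d3f04065b8afae1d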

It's interesting how readily the Ceph workload triggers this bug, given
that it's quite a long-standing issue that has only just been resolved;
another user chimed in on the lkml thread a couple of days ago, and his
trace had ceph-osd in it as well.

https://lkml.org/lkml/headers/2016/6/21/491

Campbell

On 29 June 2016 at 18:29, Stefan Priebe - Profihost AG
 wrote:
>
> Am 29.06.2016 um 04:30 schrieb Alex Gorbachev:
>> Hi Stefan,
>>
>> On Tue, Jun 28, 2016 at 1:46 PM, Stefan Priebe - Profihost AG
>>  wrote:
>>> Please be aware that you may need even more patches. Overall this needs 3
>>> patches. Where the first two try to fix a bug and the 3rd one fixes the
>>> fixes + even more bugs related to the scheduler. I've no idea on which patch
>>> level Ubuntu is.
>>
>> Stefan, would you be able to please point to the other two patches
>> beside https://lkml.org/lkml/diff/2016/6/22/102/1 ?
>
> Sorry sure yes:
>
> 1. 2b8c41daba32 ("sched/fair: Initiate a new task's util avg to a
> bounded value")
>
> 2.) 40ed9cba24bb7e01cc380a02d3f04065b8afae1d ("sched/fair: Fix
> post_init_entity_util_avg() serialization")
>
> 3.) the one listed at lkml.
>
> Stefan
>
>>
>> Thank you,
>> Alex
>>
>>>
>>> Stefan
>>>
>>> Excuse my typo sent from my mobile phone.
>>>
>>> Am 28.06.2016 um 17:59 schrieb Tim Bishop :
>>>
>>> Yes - I noticed this today on Ubuntu 16.04 with the default kernel. No
>>> useful information to add other than it's not just you.
>>>
>>> Tim.
>>>
>>> On Tue, Jun 28, 2016 at 11:05:40AM -0400, Alex Gorbachev wrote:
>>>
>>> After upgrading to kernel 4.4.13 on Ubuntu, we are seeing a few of
>>>
>>> these issues where an OSD would fail with the stack below.  I logged a
>>>
>>> bug at https://bugzilla.kernel.org/show_bug.cgi?id=121101 and there is
>>>
>>> a similar description at https://lkml.org/lkml/2016/6/22/102, but the
>>>
>>> odd part is we have turned off CFQ and blk-mq/scsi-mq and are using
>>>
>>> just the noop scheduler.
>>>
>>>
>>> Does the ceph kernel code somehow use the fair scheduler code block?
>>>
>>>
>>> Thanks
>>>
>>> --
>>>
>>> Alex Gorbachev
>>>
>>> Storcium
>>>
>>>
>>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.684974] CPU: 30 PID:
>>>
>>> 10403 Comm: ceph-osd Not tainted 4.4.13-040413-generic #201606072354
>>>
>>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.684991] Hardware name:
>>>
>>> Supermicro X9DRi-LN4+/X9DR3-LN4+/X9DRi-LN4+/X9DR3-LN4+, BIOS 3.2
>>>
>>> 03/04/2015
>>>
>>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685009] task:
>>>
>>> 880f79df8000 ti: 880f79fb8000 task.ti: 880f79fb8000
>>>
>>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685024] RIP:
>>>
>>> 0010:[]  []
>>>
>>> task_numa_find_cpu+0x22e/0x6f0
>>>
>>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685051] RSP:
>>>
>>> 0018:880f79fbb818  EFLAGS: 00010206
>>>
>>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685063] RAX:
>>>
>>>  RBX: 880f79fbb8b8 RCX: 
>>>
>>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685076] RDX:
>>>
>>>  RSI:  RDI: 8810352d4800
>>>
>>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685107] RBP:
>>>
>>> 880f79fbb880 R08: 0001020cf87c R09: 00ff00ff
>>>
>>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685150] R10:
>>>
>>> 0009 R11: 0006 R12: 8807c3adc4c0
>>>
>>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685194] R13:
>>>
>>> 0006 R14: 033e R15: fec7
>>>
>>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685238] FS:
>>>
>>> 7f30e46b8700() GS:88105f58()
>>>
>>> knlGS:
>>>
>>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685283] CS:  0010 DS:
>>>
>>>  ES:  CR0: 80050033
>>>
>>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685310] CR2:
>>>
>>> 1321a000 CR3: 000853598000 CR4: 000406e0
>>>
>>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685354] Stack:
>>>
>>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685374]
>>>
>>> 813d050f 000d 0045 880f79df8000
>>>
>>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685426]
>>>
>>> 033f  00016b00 033f
>>>
>>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685477]
>>>
>>> 880f79df8000 880f79fbb8b8 01f4 0054
>>>
>>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685528] Call Trace:
>>>

[ceph-users] Maximum possible IOPS for the given configuration

2016-06-29 Thread Mykola Dvornik
Dear ceph-users,

Are there any expressions / calculators available to calculate the
maximum expected random write IOPS of the ceph cluster?

To my understanding of the ceph IO, this should be something like

MAXIOPS = (1-OVERHEAD) * OSD_BACKENDSTORAGE_IOPS * NUM_OSD /
REPLICA_COUNT

So the question is what OSD_BACKENDSTORAGE_IOPS should stand for: 4K
random or sequential write IOPS?
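
As a purely illustrative calculation (all numbers below are assumptions,
not measurements): with NUM_OSD = 10 spinning disks at roughly 150 random
4K write IOPS each, REPLICA_COUNT = 3 and an assumed OVERHEAD of 0.5 for
the FileStore journal plus metadata, the formula gives

(1 - 0.5) * 150 * 10 / 3 = 250 client write IOPS

Random write IOPS seems the more meaningful figure for the backend term
here; sequential numbers would grossly overestimate the result.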

-Mykola


___
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com


Re: [ceph-users] CephFS mds cache pressure

2016-06-29 Thread João Castro
xiaoxi chen  writes:

> 
> Hmm, I asked in the ML some days before,:) likely you hit the kernel bug 
which fixed by commit 5e804ac482 "ceph: don't invalidate page cache when 
inode is no longer used”.  This fix is in 4.4 but not in 4.2. I haven't got a 
chance to play with 4.4 , it would be great if you can have a try.
> For MDS OOM issue, we did a MDS RSS vs #Inodes scaling test, the result 
showing around 4MB per 1000 Inodes, so your MDS likely can hold up to 2~3 
Million inodes. But yes, even with the fix if the client misbehavior (open 
and hold a lot of inodes, doesn't respond to cache pressure message), MDS can 
go over the throttling and then killed by OOM
> 

Hello!
I will install a newer kernel version and I will increase the ram a bit just 
to see how it handles it.
Thanks!
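
As a rough sanity check of that scaling (assuming the ~4 MB per 1000 inodes
figure quoted above, and that mds_cache_size is still the relevant knob on
10.2.x): letting the MDS cache 2 million inodes would need on the order of
2,000,000 / 1000 * 4 MB = 8 GB of RSS, plus headroom, e.g. in ceph.conf:

[mds]
mds cache size = 2000000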

> > To: ceph-users@lists.ceph.com
> > Subject: Re: [ceph-users] CephFS mds cache pressure
> >
> > Hey John,
> >
> > ceph version 10.2.2 (45107e21c568dd033c2f0a3107dec8f0b0e58374)
> > 4.2.0-36-generic
> >
> > Thanks!
> 
> 
> 
> ___
> ceph-users mailing list
> ceph-users@...
> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
> 


___
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com


Re: [ceph-users] Another cluster completely hang

2016-06-29 Thread Mario Giammarco
In fact I am worried because:

1) ceph is under proxmox, and proxmox may decide to reboot a server if it
is not responding
2) probably a server was rebooted while ceph was reconstructing
3) even using max=3 does not help

Anyway, this is the "unofficial" procedure that I am using, much simpler
than the blog post:

1) find the host where the pg is (see the commands sketched below)
2) stop ceph on that host
3) ceph-objectstore-tool --pgid 1.98 --op mark-complete --data-path
/var/lib/ceph/osd/ceph-9 --journal-path /var/lib/ceph/osd/ceph-9/journal
4) start ceph
5) finally watch it reconstructing
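
For step 1, a sketch of how to locate it (1.98 is just the pgid from the
example above; the OSD id comes from the map output):

ceph pg map 1.98        # shows the up/acting OSD ids for this pg
ceph osd find 9         # shows the host and address that OSD lives on
ceph pg 1.98 query      # optional: inspect the peering state first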

Il giorno mer 29 giu 2016 alle ore 11:11 Oliver Dzombic <
i...@ip-interactive.de> ha scritto:

> Hi,
>
> removing ONE disk while your replication is 2, is no problem.
>
> You dont need to wait a single second to replace of remove it. Its
> anyway not used and out/down. So from ceph's point of view its not
> existent.
>
> 
>
> But as christian told you already, what we see now fits to a szenario
> where you lost the osd and eighter you did something, or something else
> happens, but the data were not recovered again.
>
> Eighter because another OSD was broken, or because you did something.
>
> Maybe, because of the "too many PGs per OSD (307 > max 300)" ceph never
> recovered.
>
> What i can see from http://pastebin.com/VZD7j2vN is that
>
> OSD 5,13,9,0,6,2,3 and maybe others, are the OSD's holding the
> incomplete data.
>
> This are 7 OSD's from 10. So something happend to that OSD's or the data
> in them. And that had nothing to do with a single disk failing.
>
> Something else must have been happend.
>
> And as christian already wrote: you will have to go through your logs
> back until the point were things going down.
>
> Because a fail of a single OSD, no matter what your replication size is,
> can ( normally ) not harm the consistency of 7 other OSD's, means 70% of
> your total cluster.
>
> --
> Mit freundlichen Gruessen / Best regards
>
> Oliver Dzombic
> IP-Interactive
>
> mailto:i...@ip-interactive.de
>
> Anschrift:
>
> IP Interactive UG ( haftungsbeschraenkt )
> Zum Sonnenberg 1-3
> 63571 Gelnhausen
>
> HRB 93402 beim Amtsgericht Hanau
> Geschäftsführung: Oliver Dzombic
>
> Steuer Nr.: 35 236 3622 1
> UST ID: DE274086107
>
>
> Am 29.06.2016 um 10:56 schrieb Mario Giammarco:
> > Yes I have removed it from crush because it was broken. I have waited 24
> > hours to see if cephs would like to heals itself. Then I removed the
> > disk completely (it was broken...) and I waited 24 hours again. Then I
> > start getting worried.
> > Are you saying to me that I should not remove a broken disk from
> > cluster? 24 hours were not enough?
> >
> > Il giorno mer 29 giu 2016 alle ore 10:53 Zoltan Arnold Nagy
> > > ha
> scritto:
> >
> > Just loosing one disk doesn’t automagically delete it from CRUSH,
> > but in the output you had 10 disks listed, so there must be
> > something else going - did you delete the disk from the crush map as
> > well?
> >
> > Ceph waits by default 300 secs AFAIK to mark an OSD out after it
> > will start to recover.
> >
> >
> >> On 29 Jun 2016, at 10:42, Mario Giammarco  >> > wrote:
> >>
> >> I thank you for your reply so I can add my experience:
> >>
> >> 1) the other time this thing happened to me I had a cluster with
> >> min_size=2 and size=3 and the problem was the same. That time I
> >> put min_size=1 to recover the pool but it did not help. So I do
> >> not understand where is the advantage to put three copies when
> >> ceph can decide to discard all three.
> >> 2) I started with 11 hdds. The hard disk failed. Ceph waited
> >> forever for hard disk coming back. But hard disk is really
> >> completelly broken so I have followed the procedure to really
> >> delete from cluster. Anyway ceph did not recover.
> >> 3) I have 307 pgs more than 300 but it is due to the fact that I
> >> had 11 hdds now only 10. I will add more hdds after I repair the
> pool
> >> 4) I have reduced the monitors to 3
> >>
> >>
> >>
> >> Il giorno mer 29 giu 2016 alle ore 10:25 Christian Balzer
> >> > ha scritto:
> >>
> >>
> >> Hello,
> >>
> >> On Wed, 29 Jun 2016 06:02:59 + Mario Giammarco wrote:
> >>
> >> > pool 0 'rbd' replicated size 2 min_size 1 crush_ruleset 0
> >> object_hash
> >>^
> >> And that's the root cause of all your woes.
> >> The default replication size is 3 for a reason and while I do
> >> run pools
> >> with replication of 2 they are either HDD RAIDs or extremely
> >> trustworthy
> >> and well monitored SSD.
> >>
> >> That said, something more than a single HDD failure must have
> >> happened
> >> here, you should check the logs and backtrace all the 

Re: [ceph-users] Another cluster completely hang

2016-06-29 Thread Oliver Dzombic
Hi,

removing ONE disk while your replication is 2 is not a problem.

You don't need to wait a single second to replace or remove it. It is
anyway not used and out/down, so from ceph's point of view it does not exist.

But as Christian already told you, what we see now fits a scenario
where you lost the OSD and either you did something, or something else
happened, but the data was not recovered again.

Either because another OSD was broken, or because you did something.

Maybe, because of the "too many PGs per OSD (307 > max 300)", ceph never
recovered.

What I can see from http://pastebin.com/VZD7j2vN is that

OSD 5,13,9,0,6,2,3, and maybe others, are the OSDs holding the
incomplete data.

These are 7 OSDs out of 10. So something happened to those OSDs or the data
on them, and that had nothing to do with a single disk failing.

Something else must have happened.

And as Christian already wrote: you will have to go through your logs
back to the point where things went down.

Because the failure of a single OSD, no matter what your replication size
is, can (normally) not harm the consistency of 7 other OSDs, i.e. 70% of
your total cluster.

-- 
Mit freundlichen Gruessen / Best regards

Oliver Dzombic
IP-Interactive

mailto:i...@ip-interactive.de

Anschrift:

IP Interactive UG ( haftungsbeschraenkt )
Zum Sonnenberg 1-3
63571 Gelnhausen

HRB 93402 beim Amtsgericht Hanau
Geschäftsführung: Oliver Dzombic

Steuer Nr.: 35 236 3622 1
UST ID: DE274086107


Am 29.06.2016 um 10:56 schrieb Mario Giammarco:
> Yes I have removed it from crush because it was broken. I have waited 24
> hours to see if cephs would like to heals itself. Then I removed the
> disk completely (it was broken...) and I waited 24 hours again. Then I
> start getting worried.
> Are you saying to me that I should not remove a broken disk from
> cluster? 24 hours were not enough?
> 
> Il giorno mer 29 giu 2016 alle ore 10:53 Zoltan Arnold Nagy
> > ha scritto:
> 
> Just loosing one disk doesn’t automagically delete it from CRUSH,
> but in the output you had 10 disks listed, so there must be
> something else going - did you delete the disk from the crush map as
> well?
> 
> Ceph waits by default 300 secs AFAIK to mark an OSD out after it
> will start to recover.
> 
> 
>> On 29 Jun 2016, at 10:42, Mario Giammarco > > wrote:
>>
>> I thank you for your reply so I can add my experience:
>>
>> 1) the other time this thing happened to me I had a cluster with
>> min_size=2 and size=3 and the problem was the same. That time I
>> put min_size=1 to recover the pool but it did not help. So I do
>> not understand where is the advantage to put three copies when
>> ceph can decide to discard all three.
>> 2) I started with 11 hdds. The hard disk failed. Ceph waited
>> forever for hard disk coming back. But hard disk is really
>> completelly broken so I have followed the procedure to really
>> delete from cluster. Anyway ceph did not recover.
>> 3) I have 307 pgs more than 300 but it is due to the fact that I
>> had 11 hdds now only 10. I will add more hdds after I repair the pool
>> 4) I have reduced the monitors to 3
>>
>>
>>
>> Il giorno mer 29 giu 2016 alle ore 10:25 Christian Balzer
>> > ha scritto:
>>
>>
>> Hello,
>>
>> On Wed, 29 Jun 2016 06:02:59 + Mario Giammarco wrote:
>>
>> > pool 0 'rbd' replicated size 2 min_size 1 crush_ruleset 0
>> object_hash
>>^
>> And that's the root cause of all your woes.
>> The default replication size is 3 for a reason and while I do
>> run pools
>> with replication of 2 they are either HDD RAIDs or extremely
>> trustworthy
>> and well monitored SSD.
>>
>> That said, something more than a single HDD failure must have
>> happened
>> here, you should check the logs and backtrace all the step you
>> did after
>> that OSD failed.
>>
>> You said there were 11 HDDs and your first ceph -s output showed:
>> ---
>>  osdmap e10182: 10 osds: 10 up, 10 in
>> 
>> And your crush map states the same.
>>
>> So how and WHEN did you remove that OSD?
>> My suspicion would be it was removed before recovery was complete.
>>
>> Also, as I think was mentioned before, 7 mons are overkill 3-5
>> would be a
>> saner number.
>>
>> Christian
>>
>> > rjenkins pg_num 512 pgp_num 512 last_change 9313 flags
>> hashpspool
>> > stripe_width 0
>> >removed_snaps [1~3]
>> > pool 1 'rbd2' replicated size 2 min_size 1 crush_ruleset 0
>> object_hash
>> > rjenkins pg_num 512 pgp_num 512 

Re: [ceph-users] Another cluster completely hang

2016-06-29 Thread Mario Giammarco
Yes, I have removed it from CRUSH because it was broken. I waited 24
hours to see if ceph would heal itself. Then I removed the disk
completely (it was broken...) and I waited 24 hours again. Then I started
getting worried.
Are you saying that I should not remove a broken disk from the cluster?
Were 24 hours not enough?

Il giorno mer 29 giu 2016 alle ore 10:53 Zoltan Arnold Nagy <
zol...@linux.vnet.ibm.com> ha scritto:

> Just loosing one disk doesn’t automagically delete it from CRUSH, but in
> the output you had 10 disks listed, so there must be something else going -
> did you delete the disk from the crush map as well?
>
> Ceph waits by default 300 secs AFAIK to mark an OSD out after it will
> start to recover.
>
>
> On 29 Jun 2016, at 10:42, Mario Giammarco  wrote:
>
> I thank you for your reply so I can add my experience:
>
> 1) the other time this thing happened to me I had a cluster with
> min_size=2 and size=3 and the problem was the same. That time I put
> min_size=1 to recover the pool but it did not help. So I do not understand
> where is the advantage to put three copies when ceph can decide to discard
> all three.
> 2) I started with 11 hdds. The hard disk failed. Ceph waited forever for
> hard disk coming back. But hard disk is really completelly broken so I have
> followed the procedure to really delete from cluster. Anyway ceph did not
> recover.
> 3) I have 307 pgs more than 300 but it is due to the fact that I had 11
> hdds now only 10. I will add more hdds after I repair the pool
> 4) I have reduced the monitors to 3
>
>
>
> Il giorno mer 29 giu 2016 alle ore 10:25 Christian Balzer 
> ha scritto:
>
>>
>> Hello,
>>
>> On Wed, 29 Jun 2016 06:02:59 + Mario Giammarco wrote:
>>
>> > pool 0 'rbd' replicated size 2 min_size 1 crush_ruleset 0 object_hash
>>^
>> And that's the root cause of all your woes.
>> The default replication size is 3 for a reason and while I do run pools
>> with replication of 2 they are either HDD RAIDs or extremely trustworthy
>> and well monitored SSD.
>>
>> That said, something more than a single HDD failure must have happened
>> here, you should check the logs and backtrace all the step you did after
>> that OSD failed.
>>
>> You said there were 11 HDDs and your first ceph -s output showed:
>> ---
>>  osdmap e10182: 10 osds: 10 up, 10 in
>> 
>> And your crush map states the same.
>>
>> So how and WHEN did you remove that OSD?
>> My suspicion would be it was removed before recovery was complete.
>>
>> Also, as I think was mentioned before, 7 mons are overkill 3-5 would be a
>> saner number.
>>
>> Christian
>>
>> > rjenkins pg_num 512 pgp_num 512 last_change 9313 flags hashpspool
>> > stripe_width 0
>> >removed_snaps [1~3]
>> > pool 1 'rbd2' replicated size 2 min_size 1 crush_ruleset 0 object_hash
>> > rjenkins pg_num 512 pgp_num 512 last_change 9314 flags hashpspool
>> > stripe_width 0
>> >removed_snaps [1~3]
>> > pool 2 'rbd3' replicated size 2 min_size 1 crush_ruleset 0 object_hash
>> > rjenkins pg_num 512 pgp_num 512 last_change 10537 flags hashpspool
>> > stripe_width 0
>> >removed_snaps [1~3]
>> >
>> >
>> > ID WEIGHT  REWEIGHT SIZE   USE   AVAIL %USE  VAR
>> > 5 1.81000  1.0  1857G  984G  872G 53.00 0.86
>> > 6 1.81000  1.0  1857G 1202G  655G 64.73 1.05
>> > 2 1.81000  1.0  1857G 1158G  698G 62.38 1.01
>> > 3 1.35999  1.0  1391G  906G  485G 65.12 1.06
>> > 4 0.8  1.0   926G  702G  223G 75.88 1.23
>> > 7 1.81000  1.0  1857G 1063G  793G 57.27 0.93
>> > 8 1.81000  1.0  1857G 1011G  846G 54.44 0.88
>> > 9 0.8  1.0   926G  573G  352G 61.91 1.01
>> > 0 1.81000  1.0  1857G 1227G  629G 66.10 1.07
>> > 13 0.45000  1.0   460G  307G  153G 66.74 1.08
>> >  TOTAL 14846G 9136G 5710G 61.54
>> > MIN/MAX VAR: 0.86/1.23  STDDEV: 6.47
>> >
>> >
>> >
>> > ceph version 0.94.7 (d56bdf93ced6b80b07397d57e3fa68fe68304432)
>> >
>> > http://pastebin.com/SvGfcSHb
>> > http://pastebin.com/gYFatsNS
>> > http://pastebin.com/VZD7j2vN
>> >
>> > I do not understand why I/O on ENTIRE cluster is blocked when only few
>> > pgs are incomplete.
>> >
>> > Many thanks,
>> > Mario
>> >
>> >
>> > Il giorno mar 28 giu 2016 alle ore 19:34 Stefan Priebe - Profihost AG <
>> > s.pri...@profihost.ag> ha scritto:
>> >
>> > > And ceph health detail
>> > >
>> > > Stefan
>> > >
>> > > Excuse my typo sent from my mobile phone.
>> > >
>> > > Am 28.06.2016 um 19:28 schrieb Oliver Dzombic > >:
>> > >
>> > > Hi Mario,
>> > >
>> > > please give some more details:
>> > >
>> > > Please the output of:
>> > >
>> > > ceph osd pool ls detail
>> > > ceph osd df
>> > > ceph --version
>> > >
>> > > ceph -w for 10 seconds ( use http://pastebin.com/ please )
>> > >
>> > > ceph osd crush dump ( also pastebin pls )
>> > >
>> > > --
>> > > Mit freundlichen Gruessen / Best regards
>> > >
>> > > Oliver Dzombic
>> > > IP-Interactive

Re: [ceph-users] Another cluster completely hang

2016-06-29 Thread Zoltan Arnold Nagy
Just losing one disk doesn’t automagically delete it from CRUSH, but in the
output you had 10 disks listed, so there must be something else going on - did
you delete the disk from the crush map as well?

Ceph waits 300 secs by default AFAIK to mark an OSD out, after which it will
start to recover.
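
That interval is the mon_osd_down_out_interval option (300 seconds by
default on this release, as far as I know); a sketch of how it would be
tuned in ceph.conf if you wanted a longer grace period:

[mon]
mon osd down out interval = 600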


> On 29 Jun 2016, at 10:42, Mario Giammarco  wrote:
> 
> I thank you for your reply so I can add my experience:
> 
> 1) the other time this thing happened to me I had a cluster with min_size=2 
> and size=3 and the problem was the same. That time I put min_size=1 to 
> recover the pool but it did not help. So I do not understand where is the 
> advantage to put three copies when ceph can decide to discard all three.
> 2) I started with 11 hdds. The hard disk failed. Ceph waited forever for hard 
> disk coming back. But hard disk is really completelly broken so I have 
> followed the procedure to really delete from cluster. Anyway ceph did not 
> recover.
> 3) I have 307 pgs more than 300 but it is due to the fact that I had 11 hdds 
> now only 10. I will add more hdds after I repair the pool
> 4) I have reduced the monitors to 3
> 
> 
> 
> Il giorno mer 29 giu 2016 alle ore 10:25 Christian Balzer  > ha scritto:
> 
> Hello,
> 
> On Wed, 29 Jun 2016 06:02:59 + Mario Giammarco wrote:
> 
> > pool 0 'rbd' replicated size 2 min_size 1 crush_ruleset 0 object_hash
>^
> And that's the root cause of all your woes.
> The default replication size is 3 for a reason and while I do run pools
> with replication of 2 they are either HDD RAIDs or extremely trustworthy
> and well monitored SSD.
> 
> That said, something more than a single HDD failure must have happened
> here, you should check the logs and backtrace all the step you did after
> that OSD failed.
> 
> You said there were 11 HDDs and your first ceph -s output showed:
> ---
>  osdmap e10182: 10 osds: 10 up, 10 in
> 
> And your crush map states the same.
> 
> So how and WHEN did you remove that OSD?
> My suspicion would be it was removed before recovery was complete.
> 
> Also, as I think was mentioned before, 7 mons are overkill 3-5 would be a
> saner number.
> 
> Christian
> 
> > rjenkins pg_num 512 pgp_num 512 last_change 9313 flags hashpspool
> > stripe_width 0
> >removed_snaps [1~3]
> > pool 1 'rbd2' replicated size 2 min_size 1 crush_ruleset 0 object_hash
> > rjenkins pg_num 512 pgp_num 512 last_change 9314 flags hashpspool
> > stripe_width 0
> >removed_snaps [1~3]
> > pool 2 'rbd3' replicated size 2 min_size 1 crush_ruleset 0 object_hash
> > rjenkins pg_num 512 pgp_num 512 last_change 10537 flags hashpspool
> > stripe_width 0
> >removed_snaps [1~3]
> >
> >
> > ID WEIGHT  REWEIGHT SIZE   USE   AVAIL %USE  VAR
> > 5 1.81000  1.0  1857G  984G  872G 53.00 0.86
> > 6 1.81000  1.0  1857G 1202G  655G 64.73 1.05
> > 2 1.81000  1.0  1857G 1158G  698G 62.38 1.01
> > 3 1.35999  1.0  1391G  906G  485G 65.12 1.06
> > 4 0.8  1.0   926G  702G  223G 75.88 1.23
> > 7 1.81000  1.0  1857G 1063G  793G 57.27 0.93
> > 8 1.81000  1.0  1857G 1011G  846G 54.44 0.88
> > 9 0.8  1.0   926G  573G  352G 61.91 1.01
> > 0 1.81000  1.0  1857G 1227G  629G 66.10 1.07
> > 13 0.45000  1.0   460G  307G  153G 66.74 1.08
> >  TOTAL 14846G 9136G 5710G 61.54
> > MIN/MAX VAR: 0.86/1.23  STDDEV: 6.47
> >
> >
> >
> > ceph version 0.94.7 (d56bdf93ced6b80b07397d57e3fa68fe68304432)
> >
> > http://pastebin.com/SvGfcSHb 
> > http://pastebin.com/gYFatsNS 
> > http://pastebin.com/VZD7j2vN 
> >
> > I do not understand why I/O on ENTIRE cluster is blocked when only few
> > pgs are incomplete.
> >
> > Many thanks,
> > Mario
> >
> >
> > Il giorno mar 28 giu 2016 alle ore 19:34 Stefan Priebe - Profihost AG <
> > s.pri...@profihost.ag > ha scritto:
> >
> > > And ceph health detail
> > >
> > > Stefan
> > >
> > > Excuse my typo sent from my mobile phone.
> > >
> > > Am 28.06.2016 um 19:28 schrieb Oliver Dzombic  > > >:
> > >
> > > Hi Mario,
> > >
> > > please give some more details:
> > >
> > > Please the output of:
> > >
> > > ceph osd pool ls detail
> > > ceph osd df
> > > ceph --version
> > >
> > > ceph -w for 10 seconds ( use http://pastebin.com/  
> > > please )
> > >
> > > ceph osd crush dump ( also pastebin pls )
> > >
> > > --
> > > Mit freundlichen Gruessen / Best regards
> > >
> > > Oliver Dzombic
> > > IP-Interactive
> > >
> > > mailto:i...@ip-interactive.de  
> > > >
> > >
> > > Anschrift:
> > >
> > > IP Interactive UG ( haftungsbeschraenkt )
> > > Zum Sonnenberg 1-3
> > > 63571 Gelnhausen
> > >
> > > HRB 93402 beim Amtsgericht Hanau
> > > 

Re: [ceph-users] Another cluster completely hang

2016-06-29 Thread Mario Giammarco
Thank you for your reply; let me add my experience:

1) the other time this happened to me I had a cluster with min_size=2
and size=3 and the problem was the same. That time I put min_size=1 to
recover the pool but it did not help. So I do not understand what the
advantage of keeping three copies is when ceph can decide to discard all three.
2) I started with 11 HDDs. One hard disk failed. Ceph waited forever for
the hard disk to come back, but the disk is really completely broken, so I
followed the procedure to actually delete it from the cluster. Even so, ceph
did not recover.
3) I have 307 PGs per OSD, more than 300, but that is because I had 11
HDDs and now only 10. I will add more HDDs after I repair the pool.
4) I have reduced the monitors to 3.



Il giorno mer 29 giu 2016 alle ore 10:25 Christian Balzer 
ha scritto:

>
> Hello,
>
> On Wed, 29 Jun 2016 06:02:59 + Mario Giammarco wrote:
>
> > pool 0 'rbd' replicated size 2 min_size 1 crush_ruleset 0 object_hash
>^
> And that's the root cause of all your woes.
> The default replication size is 3 for a reason and while I do run pools
> with replication of 2 they are either HDD RAIDs or extremely trustworthy
> and well monitored SSD.
>
> That said, something more than a single HDD failure must have happened
> here, you should check the logs and backtrace all the step you did after
> that OSD failed.
>
> You said there were 11 HDDs and your first ceph -s output showed:
> ---
>  osdmap e10182: 10 osds: 10 up, 10 in
> 
> And your crush map states the same.
>
> So how and WHEN did you remove that OSD?
> My suspicion would be it was removed before recovery was complete.
>
> Also, as I think was mentioned before, 7 mons are overkill 3-5 would be a
> saner number.
>
> Christian
>
> > rjenkins pg_num 512 pgp_num 512 last_change 9313 flags hashpspool
> > stripe_width 0
> >removed_snaps [1~3]
> > pool 1 'rbd2' replicated size 2 min_size 1 crush_ruleset 0 object_hash
> > rjenkins pg_num 512 pgp_num 512 last_change 9314 flags hashpspool
> > stripe_width 0
> >removed_snaps [1~3]
> > pool 2 'rbd3' replicated size 2 min_size 1 crush_ruleset 0 object_hash
> > rjenkins pg_num 512 pgp_num 512 last_change 10537 flags hashpspool
> > stripe_width 0
> >removed_snaps [1~3]
> >
> >
> > ID WEIGHT  REWEIGHT SIZE   USE   AVAIL %USE  VAR
> > 5 1.81000  1.0  1857G  984G  872G 53.00 0.86
> > 6 1.81000  1.0  1857G 1202G  655G 64.73 1.05
> > 2 1.81000  1.0  1857G 1158G  698G 62.38 1.01
> > 3 1.35999  1.0  1391G  906G  485G 65.12 1.06
> > 4 0.8  1.0   926G  702G  223G 75.88 1.23
> > 7 1.81000  1.0  1857G 1063G  793G 57.27 0.93
> > 8 1.81000  1.0  1857G 1011G  846G 54.44 0.88
> > 9 0.8  1.0   926G  573G  352G 61.91 1.01
> > 0 1.81000  1.0  1857G 1227G  629G 66.10 1.07
> > 13 0.45000  1.0   460G  307G  153G 66.74 1.08
> >  TOTAL 14846G 9136G 5710G 61.54
> > MIN/MAX VAR: 0.86/1.23  STDDEV: 6.47
> >
> >
> >
> > ceph version 0.94.7 (d56bdf93ced6b80b07397d57e3fa68fe68304432)
> >
> > http://pastebin.com/SvGfcSHb
> > http://pastebin.com/gYFatsNS
> > http://pastebin.com/VZD7j2vN
> >
> > I do not understand why I/O on ENTIRE cluster is blocked when only few
> > pgs are incomplete.
> >
> > Many thanks,
> > Mario
> >
> >
> > Il giorno mar 28 giu 2016 alle ore 19:34 Stefan Priebe - Profihost AG <
> > s.pri...@profihost.ag> ha scritto:
> >
> > > And ceph health detail
> > >
> > > Stefan
> > >
> > > Excuse my typo sent from my mobile phone.
> > >
> > > Am 28.06.2016 um 19:28 schrieb Oliver Dzombic  >:
> > >
> > > Hi Mario,
> > >
> > > please give some more details:
> > >
> > > Please the output of:
> > >
> > > ceph osd pool ls detail
> > > ceph osd df
> > > ceph --version
> > >
> > > ceph -w for 10 seconds ( use http://pastebin.com/ please )
> > >
> > > ceph osd crush dump ( also pastebin pls )
> > >
> > > --
> > > Mit freundlichen Gruessen / Best regards
> > >
> > > Oliver Dzombic
> > > IP-Interactive
> > >
> > > mailto:i...@ip-interactive.de 
> > >
> > > Anschrift:
> > >
> > > IP Interactive UG ( haftungsbeschraenkt )
> > > Zum Sonnenberg 1-3
> > > 63571 Gelnhausen
> > >
> > > HRB 93402 beim Amtsgericht Hanau
> > > Geschäftsführung: Oliver Dzombic
> > >
> > > Steuer Nr.: 35 236 3622 1
> > > UST ID: DE274086107
> > >
> > >
> > > Am 28.06.2016 um 18:59 schrieb Mario Giammarco:
> > >
> > > Hello,
> > >
> > > this is the second time that happens to me, I hope that someone can
> > >
> > > explain what I can do.
> > >
> > > Proxmox ceph cluster with 8 servers, 11 hdd. Min_size=1, size=2.
> > >
> > >
> > > One hdd goes down due to bad sectors.
> > >
> > > Ceph recovers but it ends with:
> > >
> > >
> > > cluster f2a8dd7d-949a-4a29-acab-11d4900249f4
> > >
> > > health HEALTH_WARN
> > >
> > >3 pgs down
> > >
> > >19 pgs incomplete
> > >
> > >19 pgs stuck inactive
> > >
> > >

Re: [ceph-users] Another cluster completely hang

2016-06-29 Thread Christian Balzer

Hello,

On Wed, 29 Jun 2016 06:02:59 + Mario Giammarco wrote:

> pool 0 'rbd' replicated size 2 min_size 1 crush_ruleset 0 object_hash
   ^
And that's the root cause of all your woes.
The default replication size is 3 for a reason and while I do run pools
with replication of 2, they are either HDD RAIDs or extremely trustworthy
and well-monitored SSDs.
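
For the record, raising an existing pool is one command per pool (a sketch
using the rbd pool as an example; the cluster needs the spare capacity and
will backfill the third copies, so expect recovery traffic):

ceph osd pool set rbd size 3
ceph osd pool set rbd min_size 2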

That said, something more than a single HDD failure must have happened
here; you should check the logs and retrace all the steps you took after
that OSD failed.

You said there were 11 HDDs and your first ceph -s output showed:
---
 osdmap e10182: 10 osds: 10 up, 10 in

And your crush map states the same.

So how and WHEN did you remove that OSD?
My suspicion would be it was removed before recovery was complete.

Also, as I think was mentioned before, 7 mons are overkill; 3-5 would be a
saner number.

Christian

> rjenkins pg_num 512 pgp_num 512 last_change 9313 flags hashpspool
> stripe_width 0
>removed_snaps [1~3]
> pool 1 'rbd2' replicated size 2 min_size 1 crush_ruleset 0 object_hash
> rjenkins pg_num 512 pgp_num 512 last_change 9314 flags hashpspool
> stripe_width 0
>removed_snaps [1~3]
> pool 2 'rbd3' replicated size 2 min_size 1 crush_ruleset 0 object_hash
> rjenkins pg_num 512 pgp_num 512 last_change 10537 flags hashpspool
> stripe_width 0
>removed_snaps [1~3]
> 
> 
> ID WEIGHT  REWEIGHT SIZE   USE   AVAIL %USE  VAR
> 5 1.81000  1.0  1857G  984G  872G 53.00 0.86
> 6 1.81000  1.0  1857G 1202G  655G 64.73 1.05
> 2 1.81000  1.0  1857G 1158G  698G 62.38 1.01
> 3 1.35999  1.0  1391G  906G  485G 65.12 1.06
> 4 0.8  1.0   926G  702G  223G 75.88 1.23
> 7 1.81000  1.0  1857G 1063G  793G 57.27 0.93
> 8 1.81000  1.0  1857G 1011G  846G 54.44 0.88
> 9 0.8  1.0   926G  573G  352G 61.91 1.01
> 0 1.81000  1.0  1857G 1227G  629G 66.10 1.07
> 13 0.45000  1.0   460G  307G  153G 66.74 1.08
>  TOTAL 14846G 9136G 5710G 61.54
> MIN/MAX VAR: 0.86/1.23  STDDEV: 6.47
> 
> 
> 
> ceph version 0.94.7 (d56bdf93ced6b80b07397d57e3fa68fe68304432)
> 
> http://pastebin.com/SvGfcSHb
> http://pastebin.com/gYFatsNS
> http://pastebin.com/VZD7j2vN
> 
> I do not understand why I/O on ENTIRE cluster is blocked when only few
> pgs are incomplete.
> 
> Many thanks,
> Mario
> 
> 
> Il giorno mar 28 giu 2016 alle ore 19:34 Stefan Priebe - Profihost AG <
> s.pri...@profihost.ag> ha scritto:
> 
> > And ceph health detail
> >
> > Stefan
> >
> > Excuse my typo sent from my mobile phone.
> >
> > Am 28.06.2016 um 19:28 schrieb Oliver Dzombic :
> >
> > Hi Mario,
> >
> > please give some more details:
> >
> > Please the output of:
> >
> > ceph osd pool ls detail
> > ceph osd df
> > ceph --version
> >
> > ceph -w for 10 seconds ( use http://pastebin.com/ please )
> >
> > ceph osd crush dump ( also pastebin pls )
> >
> > --
> > Mit freundlichen Gruessen / Best regards
> >
> > Oliver Dzombic
> > IP-Interactive
> >
> > mailto:i...@ip-interactive.de 
> >
> > Anschrift:
> >
> > IP Interactive UG ( haftungsbeschraenkt )
> > Zum Sonnenberg 1-3
> > 63571 Gelnhausen
> >
> > HRB 93402 beim Amtsgericht Hanau
> > Geschäftsführung: Oliver Dzombic
> >
> > Steuer Nr.: 35 236 3622 1
> > UST ID: DE274086107
> >
> >
> > Am 28.06.2016 um 18:59 schrieb Mario Giammarco:
> >
> > Hello,
> >
> > this is the second time that happens to me, I hope that someone can
> >
> > explain what I can do.
> >
> > Proxmox ceph cluster with 8 servers, 11 hdd. Min_size=1, size=2.
> >
> >
> > One hdd goes down due to bad sectors.
> >
> > Ceph recovers but it ends with:
> >
> >
> > cluster f2a8dd7d-949a-4a29-acab-11d4900249f4
> >
> > health HEALTH_WARN
> >
> >3 pgs down
> >
> >19 pgs incomplete
> >
> >19 pgs stuck inactive
> >
> >19 pgs stuck unclean
> >
> >7 requests are blocked > 32 sec
> >
> > monmap e11: 7 mons at
> >
> > {0=192.168.0.204:6789/0,1=192.168.0.201:6789/0,
> >
> > 2=192.168.0.203:6789/0,3=192.168.0.205:6789/0,4=192.168.0.202:
> >
> > 6789/0,5=192.168.0.206:6789/0,6=192.168.0.207:6789/0}
> >
> >election epoch 722, quorum
> >
> > 0,1,2,3,4,5,6 1,4,2,0,3,5,6
> >
> > osdmap e10182: 10 osds: 10 up, 10 in
> >
> >  pgmap v3295880: 1024 pgs, 2 pools, 4563 GB data, 1143 kobjects
> >
> >9136 GB used, 5710 GB / 14846 GB avail
> >
> >1005 active+clean
> >
> >  16 incomplete
> >
> >   3 down+incomplete
> >
> >
> > Unfortunately "7 requests blocked" means no virtual machine can boot
> >
> > because ceph has stopped i/o.
> >
> >
> > I can accept to lose some data, but not ALL data!
> >
> > Can you help me please?
> >
> > Thanks,
> >
> > Mario
> >
> >
> > ___
> >
> > ceph-users mailing list
> >
> > ceph-users@lists.ceph.com
> >
> > http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
> 

Re: [ceph-users] Another cluster completely hang

2016-06-29 Thread Oliver Dzombic
Hi Mario,

in my opinion you should

1. fix

 too many PGs per OSD (307 > max 300)

 2. stop scrubbing / deep scrubbing

--

What does your current

ceph osd tree

look like?
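
A sketch of what point 1 would look like (the option name below is the one
that drives that warning on Hammer-era monitors, to the best of my
knowledge; alternatively, add OSDs rather than raising the threshold):

# ceph.conf on the monitor hosts, then restart the mons
[mon]
mon pg warn max per osd = 400

For point 2, the cluster-wide noscrub / nodeep-scrub flags apply
(ceph osd set noscrub, ceph osd set nodeep-scrub).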



-- 
Mit freundlichen Gruessen / Best regards

Oliver Dzombic
IP-Interactive

mailto:i...@ip-interactive.de

Anschrift:

IP Interactive UG ( haftungsbeschraenkt )
Zum Sonnenberg 1-3
63571 Gelnhausen

HRB 93402 beim Amtsgericht Hanau
Geschäftsführung: Oliver Dzombic

Steuer Nr.: 35 236 3622 1
UST ID: DE274086107


Am 29.06.2016 um 09:50 schrieb Mario Giammarco:
> I have searched google and I see that there is no official procedure.
> 
> Il giorno mer 29 giu 2016 alle ore 09:43 Mario Giammarco
> > ha scritto:
> 
> I have read many times the post "incomplete pgs, oh my"
> I think my case is different. 
> The broken disk is completely broken.
> So how can I simply mark incomplete pgs as complete? 
> Should I stop ceph before?
> 
> 
> Il giorno mer 29 giu 2016 alle ore 09:36 Tomasz Kuzemko
> >
> ha scritto:
> 
> Hi,
> if you need fast access to your remaining data you can use
> ceph-objectstore-tool to mark those PGs as complete, however
> this will
> irreversibly lose the missing data.
> 
> If you understand the risks, this procedure is pretty good
> explained here:
> http://ceph.com/community/incomplete-pgs-oh-my/
> 
> Since this article was written, ceph-objectstore-tool gained a
> feature
> that was not available at that time, that is "--op mark-complete". I
> think it will be necessary in your case to call --op
> mark-complete after
> you import the PG to temporary OSD (between steps 12 and 13).
> 
> On 29.06.2016 09:09, Mario Giammarco wrote:
> > Now I have also discovered that, by mistake, someone has put
> production
> > data on a virtual machine of the cluster. I need that ceph
> starts I/O so
> > I can boot that virtual machine.
> > Can I mark the incomplete pgs as valid?
> > If needed, where can I buy some paid support?
> > Thanks again,
> > Mario
> >
> > Il giorno mer 29 giu 2016 alle ore 08:02 Mario Giammarco
> > 
> >> ha
> scritto:
> >
> > pool 0 'rbd' replicated size 2 min_size 1 crush_ruleset 0
> > object_hash rjenkins pg_num 512 pgp_num 512 last_change
> 9313 flags
> > hashpspool stripe_width 0
> >removed_snaps [1~3]
> > pool 1 'rbd2' replicated size 2 min_size 1 crush_ruleset 0
> > object_hash rjenkins pg_num 512 pgp_num 512 last_change
> 9314 flags
> > hashpspool stripe_width 0
> >removed_snaps [1~3]
> > pool 2 'rbd3' replicated size 2 min_size 1 crush_ruleset 0
> > object_hash rjenkins pg_num 512 pgp_num 512 last_change
> 10537 flags
> > hashpspool stripe_width 0
> >removed_snaps [1~3]
> >
> >
> > ID WEIGHT  REWEIGHT SIZE   USE   AVAIL %USE  VAR
> > 5 1.81000  1.0  1857G  984G  872G 53.00 0.86
> > 6 1.81000  1.0  1857G 1202G  655G 64.73 1.05
> > 2 1.81000  1.0  1857G 1158G  698G 62.38 1.01
> > 3 1.35999  1.0  1391G  906G  485G 65.12 1.06
> > 4 0.8  1.0   926G  702G  223G 75.88 1.23
> > 7 1.81000  1.0  1857G 1063G  793G 57.27 0.93
> > 8 1.81000  1.0  1857G 1011G  846G 54.44 0.88
> > 9 0.8  1.0   926G  573G  352G 61.91 1.01
> > 0 1.81000  1.0  1857G 1227G  629G 66.10 1.07
> > 13 0.45000  1.0   460G  307G  153G 66.74 1.08
> >  TOTAL 14846G 9136G 5710G 61.54
> > MIN/MAX VAR: 0.86/1.23  STDDEV: 6.47
> >
> >
> >
> > ceph version 0.94.7 (d56bdf93ced6b80b07397d57e3fa68fe68304432)
> >
> > http://pastebin.com/SvGfcSHb
> > http://pastebin.com/gYFatsNS
> > http://pastebin.com/VZD7j2vN
> >
> > I do not understand why I/O on ENTIRE cluster is blocked
> when only
> > few pgs are incomplete.
> >
> > Many thanks,
> > Mario
> >
> >
> > Il giorno mar 28 giu 2016 alle ore 19:34 Stefan Priebe -
> Profihost
> > AG 
> >>
> ha scritto:
> >
> > And ceph health detail
> >
> > 

Re: [ceph-users] Another cluster completely hang

2016-06-29 Thread Tomasz Kuzemko
As far as I know there isn't one, which is a shame. We have covered a
situation like this in our dev environment to be ready for it in
production and it worked; however, be aware that the data that Ceph
believes is missing will be lost after you mark a PG complete.

In your situation I would find the OSD which has the most complete copy of
the incomplete PG by looking at the files in /var/lib/ceph/osd/*/current
(based on size or maybe the mtime of the files) and export it using
ceph-objectstore-tool. After that you can follow the procedure described in
"incomplete pgs, oh my" with the addition of "--op mark-complete"
between steps 12 and 13.
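
A sketch of that comparison and export, with placeholder ids and paths (the
pgid 1.98 and osd.9 are only examples, and the OSD daemon has to be stopped
before you touch its store):

# on each host that held a copy of the pg, compare what is on disk
du -sh /var/lib/ceph/osd/ceph-*/current/1.98_head

# stop the chosen OSD, then export its copy
ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-9 \
    --journal-path /var/lib/ceph/osd/ceph-9/journal \
    --pgid 1.98 --op export --file /root/pg-1.98.export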

On 29.06.2016 09:50, Mario Giammarco wrote:
> I have searched google and I see that there is no official procedure.
> 
> Il giorno mer 29 giu 2016 alle ore 09:43 Mario Giammarco
> > ha scritto:
> 
> I have read many times the post "incomplete pgs, oh my"
> I think my case is different. 
> The broken disk is completely broken.
> So how can I simply mark incomplete pgs as complete? 
> Should I stop ceph before?
> 
> 
> Il giorno mer 29 giu 2016 alle ore 09:36 Tomasz Kuzemko
> >
> ha scritto:
> 
> Hi,
> if you need fast access to your remaining data you can use
> ceph-objectstore-tool to mark those PGs as complete, however
> this will
> irreversibly lose the missing data.
> 
> If you understand the risks, this procedure is pretty good
> explained here:
> http://ceph.com/community/incomplete-pgs-oh-my/
> 
> Since this article was written, ceph-objectstore-tool gained a
> feature
> that was not available at that time, that is "--op mark-complete". I
> think it will be necessary in your case to call --op
> mark-complete after
> you import the PG to temporary OSD (between steps 12 and 13).
> 
> On 29.06.2016 09:09, Mario Giammarco wrote:
> > Now I have also discovered that, by mistake, someone has put
> production
> > data on a virtual machine of the cluster. I need that ceph
> starts I/O so
> > I can boot that virtual machine.
> > Can I mark the incomplete pgs as valid?
> > If needed, where can I buy some paid support?
> > Thanks again,
> > Mario
> >
> > Il giorno mer 29 giu 2016 alle ore 08:02 Mario Giammarco
> > 
> >> ha
> scritto:
> >
> > pool 0 'rbd' replicated size 2 min_size 1 crush_ruleset 0
> > object_hash rjenkins pg_num 512 pgp_num 512 last_change
> 9313 flags
> > hashpspool stripe_width 0
> >removed_snaps [1~3]
> > pool 1 'rbd2' replicated size 2 min_size 1 crush_ruleset 0
> > object_hash rjenkins pg_num 512 pgp_num 512 last_change
> 9314 flags
> > hashpspool stripe_width 0
> >removed_snaps [1~3]
> > pool 2 'rbd3' replicated size 2 min_size 1 crush_ruleset 0
> > object_hash rjenkins pg_num 512 pgp_num 512 last_change
> 10537 flags
> > hashpspool stripe_width 0
> >removed_snaps [1~3]
> >
> >
> > ID WEIGHT  REWEIGHT SIZE   USE   AVAIL %USE  VAR
> > 5 1.81000  1.0  1857G  984G  872G 53.00 0.86
> > 6 1.81000  1.0  1857G 1202G  655G 64.73 1.05
> > 2 1.81000  1.0  1857G 1158G  698G 62.38 1.01
> > 3 1.35999  1.0  1391G  906G  485G 65.12 1.06
> > 4 0.8  1.0   926G  702G  223G 75.88 1.23
> > 7 1.81000  1.0  1857G 1063G  793G 57.27 0.93
> > 8 1.81000  1.0  1857G 1011G  846G 54.44 0.88
> > 9 0.8  1.0   926G  573G  352G 61.91 1.01
> > 0 1.81000  1.0  1857G 1227G  629G 66.10 1.07
> > 13 0.45000  1.0   460G  307G  153G 66.74 1.08
> >  TOTAL 14846G 9136G 5710G 61.54
> > MIN/MAX VAR: 0.86/1.23  STDDEV: 6.47
> >
> >
> >
> > ceph version 0.94.7 (d56bdf93ced6b80b07397d57e3fa68fe68304432)
> >
> > http://pastebin.com/SvGfcSHb
> > http://pastebin.com/gYFatsNS
> > http://pastebin.com/VZD7j2vN
> >
> > I do not understand why I/O on ENTIRE cluster is blocked
> when only
> > few pgs are incomplete.
> >
> > Many thanks,
> > Mario
> >
> >
> > Il giorno mar 28 giu 2016 alle ore 19:34 Stefan Priebe -
> Profihost
> > AG 
> 

Re: [ceph-users] Another cluster completely hang

2016-06-29 Thread Mario Giammarco
I have searched google and I see that there is no official procedure.

Il giorno mer 29 giu 2016 alle ore 09:43 Mario Giammarco <
mgiamma...@gmail.com> ha scritto:

> I have read many times the post "incomplete pgs, oh my"
> I think my case is different.
> The broken disk is completely broken.
> So how can I simply mark incomplete pgs as complete?
> Should I stop ceph before?
>
>
> Il giorno mer 29 giu 2016 alle ore 09:36 Tomasz Kuzemko <
> tomasz.kuze...@corp.ovh.com> ha scritto:
>
>> Hi,
>> if you need fast access to your remaining data you can use
>> ceph-objectstore-tool to mark those PGs as complete, however this will
>> irreversibly lose the missing data.
>>
>> If you understand the risks, this procedure is pretty good explained here:
>> http://ceph.com/community/incomplete-pgs-oh-my/
>>
>> Since this article was written, ceph-objectstore-tool gained a feature
>> that was not available at that time, that is "--op mark-complete". I
>> think it will be necessary in your case to call --op mark-complete after
>> you import the PG to temporary OSD (between steps 12 and 13).
>>
>> On 29.06.2016 09:09, Mario Giammarco wrote:
>> > Now I have also discovered that, by mistake, someone has put production
>> > data on a virtual machine of the cluster. I need that ceph starts I/O so
>> > I can boot that virtual machine.
>> > Can I mark the incomplete pgs as valid?
>> > If needed, where can I buy some paid support?
>> > Thanks again,
>> > Mario
>> >
>> > Il giorno mer 29 giu 2016 alle ore 08:02 Mario Giammarco
>> > > ha scritto:
>> >
>> > pool 0 'rbd' replicated size 2 min_size 1 crush_ruleset 0
>> > object_hash rjenkins pg_num 512 pgp_num 512 last_change 9313 flags
>> > hashpspool stripe_width 0
>> >removed_snaps [1~3]
>> > pool 1 'rbd2' replicated size 2 min_size 1 crush_ruleset 0
>> > object_hash rjenkins pg_num 512 pgp_num 512 last_change 9314 flags
>> > hashpspool stripe_width 0
>> >removed_snaps [1~3]
>> > pool 2 'rbd3' replicated size 2 min_size 1 crush_ruleset 0
>> > object_hash rjenkins pg_num 512 pgp_num 512 last_change 10537 flags
>> > hashpspool stripe_width 0
>> >removed_snaps [1~3]
>> >
>> >
>> > ID WEIGHT  REWEIGHT SIZE   USE   AVAIL %USE  VAR
>> > 5 1.81000  1.0  1857G  984G  872G 53.00 0.86
>> > 6 1.81000  1.0  1857G 1202G  655G 64.73 1.05
>> > 2 1.81000  1.0  1857G 1158G  698G 62.38 1.01
>> > 3 1.35999  1.0  1391G  906G  485G 65.12 1.06
>> > 4 0.8  1.0   926G  702G  223G 75.88 1.23
>> > 7 1.81000  1.0  1857G 1063G  793G 57.27 0.93
>> > 8 1.81000  1.0  1857G 1011G  846G 54.44 0.88
>> > 9 0.8  1.0   926G  573G  352G 61.91 1.01
>> > 0 1.81000  1.0  1857G 1227G  629G 66.10 1.07
>> > 13 0.45000  1.0   460G  307G  153G 66.74 1.08
>> >  TOTAL 14846G 9136G 5710G 61.54
>> > MIN/MAX VAR: 0.86/1.23  STDDEV: 6.47
>> >
>> >
>> >
>> > ceph version 0.94.7 (d56bdf93ced6b80b07397d57e3fa68fe68304432)
>> >
>> > http://pastebin.com/SvGfcSHb
>> > http://pastebin.com/gYFatsNS
>> > http://pastebin.com/VZD7j2vN
>> >
>> > I do not understand why I/O on ENTIRE cluster is blocked when only
>> > few pgs are incomplete.
>> >
>> > Many thanks,
>> > Mario
>> >
>> >
>> > Il giorno mar 28 giu 2016 alle ore 19:34 Stefan Priebe - Profihost
>> > AG > ha
>> scritto:
>> >
>> > And ceph health detail
>> >
>> > Stefan
>> >
>> > Excuse my typo sent from my mobile phone.
>> >
>> > Am 28.06.2016 um 19:28 schrieb Oliver Dzombic
>> > >:
>> >
>> >> Hi Mario,
>> >>
>> >> please give some more details:
>> >>
>> >> Please the output of:
>> >>
>> >> ceph osd pool ls detail
>> >> ceph osd df
>> >> ceph --version
>> >>
>> >> ceph -w for 10 seconds ( use http://pastebin.com/ please )
>> >>
>> >> ceph osd crush dump ( also pastebin pls )
>> >>
>> >> --
>> >> Mit freundlichen Gruessen / Best regards
>> >>
>> >> Oliver Dzombic
>> >> IP-Interactive
>> >>
>> >> mailto:i...@ip-interactive.de
>> >>
>> >> Anschrift:
>> >>
>> >> IP Interactive UG ( haftungsbeschraenkt )
>> >> Zum Sonnenberg 1-3
>> >> 63571 Gelnhausen
>> >>
>> >> HRB 93402 beim Amtsgericht Hanau
>> >> Geschäftsführung: Oliver Dzombic
>> >>
>> >> Steuer Nr.: 35 236 3622 1
>> >> UST ID: DE274086107
>> >>
>> >>
>> >> Am 28.06.2016 um 18:59 schrieb Mario Giammarco:
>> >>> Hello,
>> >>> this is the second time that happens to me, I hope that
>> >>> someone can
>> >>> explain what I can do.
>> >>> Proxmox ceph cluster with 8 servers, 11 hdd. Min_size=1,
>> size=2.
>> 

Re: [ceph-users] Another cluster completely hang

2016-06-29 Thread Tomasz Kuzemko
Hi,
if you need fast access to your remaining data you can use
ceph-objectstore-tool to mark those PGs as complete; however, this will
irreversibly lose the missing data.

If you understand the risks, this procedure is explained pretty well here:
http://ceph.com/community/incomplete-pgs-oh-my/

Since that article was written, ceph-objectstore-tool gained a feature
that was not available at the time, namely "--op mark-complete". I
think it will be necessary in your case to call --op mark-complete after
you import the PG into the temporary OSD (between steps 12 and 13).
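
Roughly, that extra step would look like this (a sketch; the paths, the pgid
and the temporary OSD id are placeholders, and the temporary OSD must be
stopped while you run it):

ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-20 \
    --journal-path /var/lib/ceph/osd/ceph-20/journal \
    --pgid 1.98 --op import --file /root/pg-1.98.export
ceph-objectstore-tool --data-path /var/lib/ceph/osd/ceph-20 \
    --journal-path /var/lib/ceph/osd/ceph-20/journal \
    --pgid 1.98 --op mark-complete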

On 29.06.2016 09:09, Mario Giammarco wrote:
> Now I have also discovered that, by mistake, someone has put production
> data on a virtual machine of the cluster. I need that ceph starts I/O so
> I can boot that virtual machine.
> Can I mark the incomplete pgs as valid?
> If needed, where can I buy some paid support?
> Thanks again,
> Mario
> 
> Il giorno mer 29 giu 2016 alle ore 08:02 Mario Giammarco
> > ha scritto:
> 
> pool 0 'rbd' replicated size 2 min_size 1 crush_ruleset 0
> object_hash rjenkins pg_num 512 pgp_num 512 last_change 9313 flags
> hashpspool stripe_width 0
>removed_snaps [1~3]
> pool 1 'rbd2' replicated size 2 min_size 1 crush_ruleset 0
> object_hash rjenkins pg_num 512 pgp_num 512 last_change 9314 flags
> hashpspool stripe_width 0
>removed_snaps [1~3]
> pool 2 'rbd3' replicated size 2 min_size 1 crush_ruleset 0
> object_hash rjenkins pg_num 512 pgp_num 512 last_change 10537 flags
> hashpspool stripe_width 0
>removed_snaps [1~3]
> 
> 
> ID WEIGHT  REWEIGHT SIZE   USE   AVAIL %USE  VAR   
> 5 1.81000  1.0  1857G  984G  872G 53.00 0.86  
> 6 1.81000  1.0  1857G 1202G  655G 64.73 1.05  
> 2 1.81000  1.0  1857G 1158G  698G 62.38 1.01  
> 3 1.35999  1.0  1391G  906G  485G 65.12 1.06  
> 4 0.8  1.0   926G  702G  223G 75.88 1.23  
> 7 1.81000  1.0  1857G 1063G  793G 57.27 0.93  
> 8 1.81000  1.0  1857G 1011G  846G 54.44 0.88  
> 9 0.8  1.0   926G  573G  352G 61.91 1.01  
> 0 1.81000  1.0  1857G 1227G  629G 66.10 1.07  
> 13 0.45000  1.0   460G  307G  153G 66.74 1.08  
>  TOTAL 14846G 9136G 5710G 61.54   
> MIN/MAX VAR: 0.86/1.23  STDDEV: 6.47
> 
> 
> 
> ceph version 0.94.7 (d56bdf93ced6b80b07397d57e3fa68fe68304432)
> 
> http://pastebin.com/SvGfcSHb
> http://pastebin.com/gYFatsNS
> http://pastebin.com/VZD7j2vN
> 
> I do not understand why I/O on the ENTIRE cluster is blocked when only
> a few pgs are incomplete.
> 
> Many thanks,
> Mario
> 
> 
> On Tue, 28 Jun 2016 at 19:34, Stefan Priebe - Profihost AG wrote:
> 
> And ceph health detail
> 
> Stefan
> 
> Excuse my typo sent from my mobile phone.
> 
> On 28.06.2016 at 19:28, Oliver Dzombic wrote:
> 
>> Hi Mario,
>>
>> please give some more details:
>>
>> Please provide the output of:
>>
>> ceph osd pool ls detail
>> ceph osd df
>> ceph --version
>>
>> ceph -w for 10 seconds ( use http://pastebin.com/ please )
>>
>> ceph osd crush dump ( also pastebin pls )
>>
>> -- 
>> Mit freundlichen Gruessen / Best regards
>>
>> Oliver Dzombic
>> IP-Interactive
>>
>> mailto:i...@ip-interactive.de
>>
>> Anschrift:
>>
>> IP Interactive UG ( haftungsbeschraenkt )
>> Zum Sonnenberg 1-3
>> 63571 Gelnhausen
>>
>> HRB 93402 beim Amtsgericht Hanau
>> Geschäftsführung: Oliver Dzombic
>>
>> Steuer Nr.: 35 236 3622 1
>> UST ID: DE274086107
>>
>>
>> On 28.06.2016 at 18:59, Mario Giammarco wrote:
>>> Hello,
>>> this is the second time this has happened to me; I hope that
>>> someone can explain what I can do.
>>> Proxmox ceph cluster with 8 servers, 11 hdd. Min_size=1, size=2.
>>>
>>> One hdd goes down due to bad sectors.
>>> Ceph recovers but it ends with:
>>>
>>> cluster f2a8dd7d-949a-4a29-acab-11d4900249f4
>>> health HEALTH_WARN
>>>3 pgs down
>>>19 pgs incomplete
>>>19 pgs stuck inactive
>>>19 pgs stuck unclean
>>>7 requests are blocked > 32 sec
>>> monmap e11: 7 mons at
>>> {0=192.168.0.204:6789/0,1=192.168.0.201:6789/0,
>>> 2=192.168.0.203:6789/0,3=192.168.0.205:6789/0,4=192.168.0.202:
>>> 6789/0,5=192.168.0.206:6789/0,6=192.168.0.207:6789/0}

Re: [ceph-users] Ceph deployment

2016-06-29 Thread Fran Barrera
Hi Oliver,

This is my problem:

I have deployed Ceph AIO with two interfaces, 192.168.1.67 and 10.0.0.67, but
at the moment of installation I used 192.168.1.67. I also have an OpenStack
installed with two interfaces, 192.168.1.65 and 10.0.0.65.

OpenStack has its storage in Ceph, but it is working on the 192.168.1.0/24 net
and I need it to work on the 10.0.0.0/24 net.

If I do a tcpdump on the Ceph node and launch everything from OpenStack:

# tcpdump -i eth2 tcp and src host 10.0.0.65
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth2, link-type EN10MB (Ethernet), capture size 262144 bytes

Nothing appears, but if I do the same on the other interface you can see it's
working:

# tcpdump -i eth1 tcp and src host 192.168.1.65
tcpdump: verbose output suppressed, use -v or -vv for full protocol decode
listening on eth1, link-type EN10MB (Ethernet), capture size 262144 bytes
09:29:56.577198 IP 192.168.1.65.50338 > ceph-node.6800: Flags [P.], seq
3504392283:3504392292, ack 1662624736, win 1444, options [nop,nop,TS val
276566611 ecr 223200], length 9
09:29:56.711320 IP 192.168.1.65.53642 > ceph-node.6808: Flags [P.], seq
2208322028:2208326424, ack 2367465877, win 1444, options [nop,nop,TS val
276566644 ecr 223227], length 4396

This is my ceph.conf:

[global]
...
mon_initial_members = ceph-node
mon_host = 192.168.1.67
public_network = 192.168.1.0/24, 10.0.0.0/24
...

How can I change this to work with the 10.0.0.0/24 net?
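
I guess the end result in ceph.conf would have to look something like this
(only a sketch from my side; as far as I understand, the monitor address is
also recorded in the monmap, so editing ceph.conf alone will not move the mon,
and the OpenStack side has to point at the new mon_host too):

[global]
...
mon_initial_members = ceph-node
mon_host = 10.0.0.67
public_network = 10.0.0.0/24
# optionally, a separate replication network:
# cluster_network = 10.0.0.0/24
...

From what I have read, the documented ways to change a monitor's IP are either
adding a monitor on the new address and then removing the old one, or
extracting the monmap, editing it with monmaptool and injecting it back.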

Thanks,
Fran.

2016-06-22 12:06 GMT+02:00 Oliver Dzombic:

> Hi Fran,
>
> public_network = the network of the clients to access ceph ressources
>
> cluster_network = the network Ceph uses to keep the OSDs synchronizing
> with each other
>
> 
>
> So if you want that your ceph cluster is available to public internet
> addresses, you will have to assign IPs from a real public network.
>
> That means not 10.0.0.0 / 192.168.0.0 and so on. But that's a logical
> network design question and has nothing to do with Ceph.
>
> Of course you could, via iptables or whatever, create rules to
> masquerade/forward public Ceph traffic to an internal, private network.
>
> --
> Mit freundlichen Gruessen / Best regards
>
> Oliver Dzombic
> IP-Interactive
>
> mailto:i...@ip-interactive.de
>
> Anschrift:
>
> IP Interactive UG ( haftungsbeschraenkt )
> Zum Sonnenberg 1-3
> 63571 Gelnhausen
>
> HRB 93402 beim Amtsgericht Hanau
> Geschäftsführung: Oliver Dzombic
>
> Steuer Nr.: 35 236 3622 1
> UST ID: DE274086107
>
>
> Am 22.06.2016 um 11:33 schrieb Fran Barrera:
> > Hi all,
> >
> > I have a couple of question about the deployment of Ceph.
> >
> >
> > This is what I plan:
> >
> > Private Net - 10.0.0.0/24 
> > Public Net - 192.168.1.0/24 
> >
> > Ceph server:
> >  - eth1: 192.168.1.67
> >  - eth2: 10.0.0.67
> >
> > Openstack server:
> >  - eth1: 192.168.1.65
> >  - eth2: 10.0.0.65
> >
> >
> >  ceph.conf
> >   - mon_host: 10.0.0.67
> >   - cluster_network - 10.0.0.0/24 
> >   - public_network - 192.168.1.0/24 
> >
> > Now, I have some doubts:
> >  - If I configure Ceph with this configuration, could I connect to
> > Ceph from a client in the Public Net? I ask this because mon_host is
> > 10.0.0.67 in ceph.conf.
> >  - The private net was created for OpenStack, but I wonder if I can use
> > this net for the Ceph cluster network or if I need to create another one.
> >
> > I want to connect Ceph with OpenStack through a private net and also have
> > the possibility to connect to Ceph from the public net.
> >
> >
> > Any suggestions?
> >
> > Thanks,
> > Fran.
> >
> >
> > ___
> > ceph-users mailing list
> > ceph-users@lists.ceph.com
> > http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
> >
> ___
> ceph-users mailing list
> ceph-users@lists.ceph.com
> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
>
___
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com


Re: [ceph-users] Another cluster completely hang

2016-06-29 Thread Mario Giammarco
Now I have also discovered that, by mistake, someone has put production
data on a virtual machine of the cluster. I need Ceph to resume I/O so I
can boot that virtual machine.
Can I mark the incomplete pgs as valid?
If needed, where can I buy some paid support?
Thanks again,
Mario

On Wed, 29 Jun 2016 at 08:02, Mario Giammarco <mgiamma...@gmail.com> wrote:

> pool 0 'rbd' replicated size 2 min_size 1 crush_ruleset 0 object_hash
> rjenkins pg_num 512 pgp_num 512 last_change 9313 flags hashpspool
> stripe_width 0
>removed_snaps [1~3]
> pool 1 'rbd2' replicated size 2 min_size 1 crush_ruleset 0 object_hash
> rjenkins pg_num 512 pgp_num 512 last_change 9314 flags hashpspool
> stripe_width 0
>removed_snaps [1~3]
> pool 2 'rbd3' replicated size 2 min_size 1 crush_ruleset 0 object_hash
> rjenkins pg_num 512 pgp_num 512 last_change 10537 flags hashpspool
> stripe_width 0
>removed_snaps [1~3]
>
>
> ID WEIGHT  REWEIGHT SIZE   USE   AVAIL %USE  VAR
> 5 1.81000  1.0  1857G  984G  872G 53.00 0.86
> 6 1.81000  1.0  1857G 1202G  655G 64.73 1.05
> 2 1.81000  1.0  1857G 1158G  698G 62.38 1.01
> 3 1.35999  1.0  1391G  906G  485G 65.12 1.06
> 4 0.8  1.0   926G  702G  223G 75.88 1.23
> 7 1.81000  1.0  1857G 1063G  793G 57.27 0.93
> 8 1.81000  1.0  1857G 1011G  846G 54.44 0.88
> 9 0.8  1.0   926G  573G  352G 61.91 1.01
> 0 1.81000  1.0  1857G 1227G  629G 66.10 1.07
> 13 0.45000  1.0   460G  307G  153G 66.74 1.08
>  TOTAL 14846G 9136G 5710G 61.54
> MIN/MAX VAR: 0.86/1.23  STDDEV: 6.47
>
>
>
> ceph version 0.94.7 (d56bdf93ced6b80b07397d57e3fa68fe68304432)
>
> http://pastebin.com/SvGfcSHb
> http://pastebin.com/gYFatsNS
> http://pastebin.com/VZD7j2vN
>
> I do not understand why I/O on the ENTIRE cluster is blocked when only a few
> pgs are incomplete.
>
> Many thanks,
> Mario
>
>
> On Tue, 28 Jun 2016 at 19:34, Stefan Priebe - Profihost AG <s.pri...@profihost.ag> wrote:
>
>> And ceph health detail
>>
>> Stefan
>>
>> Excuse my typo sent from my mobile phone.
>>
>> On 28.06.2016 at 19:28, Oliver Dzombic wrote:
>>
>> Hi Mario,
>>
>> please give some more details:
>>
>> Please provide the output of:
>>
>> ceph osd pool ls detail
>> ceph osd df
>> ceph --version
>>
>> ceph -w for 10 seconds ( use http://pastebin.com/ please )
>>
>> ceph osd crush dump ( also pastebin pls )
>>
>> --
>> Mit freundlichen Gruessen / Best regards
>>
>> Oliver Dzombic
>> IP-Interactive
>>
>> mailto:i...@ip-interactive.de 
>>
>> Anschrift:
>>
>> IP Interactive UG ( haftungsbeschraenkt )
>> Zum Sonnenberg 1-3
>> 63571 Gelnhausen
>>
>> HRB 93402 beim Amtsgericht Hanau
>> Geschäftsführung: Oliver Dzombic
>>
>> Steuer Nr.: 35 236 3622 1
>> UST ID: DE274086107
>>
>>
>> On 28.06.2016 at 18:59, Mario Giammarco wrote:
>>
>> Hello,
>>
>> this is the second time this has happened to me; I hope that someone can
>>
>> explain what I can do.
>>
>> Proxmox ceph cluster with 8 servers, 11 hdd. Min_size=1, size=2.
>>
>>
>> One hdd goes down due to bad sectors.
>>
>> Ceph recovers but it ends with:
>>
>>
>> cluster f2a8dd7d-949a-4a29-acab-11d4900249f4
>>
>> health HEALTH_WARN
>>
>>3 pgs down
>>
>>19 pgs incomplete
>>
>>19 pgs stuck inactive
>>
>>19 pgs stuck unclean
>>
>>7 requests are blocked > 32 sec
>>
>> monmap e11: 7 mons at
>>
>> {0=192.168.0.204:6789/0,1=192.168.0.201:6789/0,
>>
>> 2=192.168.0.203:6789/0,3=192.168.0.205:6789/0,4=192.168.0.202:
>>
>> 6789/0,5=192.168.0.206:6789/0,6=192.168.0.207:6789/0}
>>
>>election epoch 722, quorum
>>
>> 0,1,2,3,4,5,6 1,4,2,0,3,5,6
>>
>> osdmap e10182: 10 osds: 10 up, 10 in
>>
>>  pgmap v3295880: 1024 pgs, 2 pools, 4563 GB data, 1143 kobjects
>>
>>9136 GB used, 5710 GB / 14846 GB avail
>>
>>1005 active+clean
>>
>>  16 incomplete
>>
>>   3 down+incomplete
>>
>>
>> Unfortunately "7 requests blocked" means no virtual machine can boot
>>
>> because ceph has stopped i/o.
>>
>>
>> I can accept to lose some data, but not ALL data!
>>
>> Can you help me please?
>>
>> Thanks,
>>
>> Mario
>>
>>
>> ___
>>
>> ceph-users mailing list
>>
>> ceph-users@lists.ceph.com
>>
>> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
>>
>>
>> ___
>> ceph-users mailing list
>> ceph-users@lists.ceph.com
>> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
>>
>> ___
>> ceph-users mailing list
>> ceph-users@lists.ceph.com
>> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
>>
>
___
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com


Re: [ceph-users] Is anyone seeing issues with task_numa_find_cpu?

2016-06-29 Thread Stefan Priebe - Profihost AG

On 29.06.2016 at 04:30, Alex Gorbachev wrote:
> Hi Stefan,
> 
> On Tue, Jun 28, 2016 at 1:46 PM, Stefan Priebe - Profihost AG
>  wrote:
>> Please be aware that you may need even more patches. Overall this needs 3
>> patches: the first two try to fix a bug, and the 3rd one fixes the fixes
>> plus even more bugs related to the scheduler. I've no idea what patch level
>> Ubuntu is at.
> 
> Stefan, would you be able to please point to the other two patches
> besides https://lkml.org/lkml/diff/2016/6/22/102/1 ?

Sorry, sure, yes:

1. 2b8c41daba32 ("sched/fair: Initiate a new task's util avg to a
bounded value")

2. 40ed9cba24bb7e01cc380a02d3f04065b8afae1d ("sched/fair: Fix
post_init_entity_util_avg() serialization")

3. the one listed at lkml.
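
If you build your own kernel from a git checkout, a rough sketch of pulling
them in (mainline commit IDs; on a 4.4 stable tree the cherry-picks may need
small fixups, and the patch file name below is only a placeholder):

# inside the kernel source tree you build from
# check whether the fixes are already there
git log --oneline --grep="post_init_entity_util_avg"
git cherry-pick 2b8c41daba32
git cherry-pick 40ed9cba24bb7e01cc380a02d3f04065b8afae1d
# the third fix is the patch from the lkml thread; apply it by hand, e.g.:
patch -p1 < sched-fair-numa-fix.patch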

Stefan

> 
> Thank you,
> Alex
> 
>>
>> Stefan
>>
>> Excuse my typo sent from my mobile phone.
>>
>> On 28.06.2016 at 17:59, Tim Bishop wrote:
>>
>> Yes - I noticed this today on Ubuntu 16.04 with the default kernel. No
>> useful information to add other than it's not just you.
>>
>> Tim.
>>
>> On Tue, Jun 28, 2016 at 11:05:40AM -0400, Alex Gorbachev wrote:
>>
>> After upgrading to kernel 4.4.13 on Ubuntu, we are seeing a few of
>>
>> these issues where an OSD would fail with the stack below.  I logged a
>>
>> bug at https://bugzilla.kernel.org/show_bug.cgi?id=121101 and there is
>>
>> a similar description at https://lkml.org/lkml/2016/6/22/102, but the
>>
>> odd part is we have turned off CFQ and blk-mq/scsi-mq and are using
>>
>> just the noop scheduler.
>>
>>
>> Does the ceph kernel code somehow use the fair scheduler code block?
>>
>>
>> Thanks
>>
>> --
>>
>> Alex Gorbachev
>>
>> Storcium
>>
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.684974] CPU: 30 PID:
>>
>> 10403 Comm: ceph-osd Not tainted 4.4.13-040413-generic #201606072354
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.684991] Hardware name:
>>
>> Supermicro X9DRi-LN4+/X9DR3-LN4+/X9DRi-LN4+/X9DR3-LN4+, BIOS 3.2
>>
>> 03/04/2015
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685009] task:
>>
>> 880f79df8000 ti: 880f79fb8000 task.ti: 880f79fb8000
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685024] RIP:
>>
>> 0010:[]  []
>>
>> task_numa_find_cpu+0x22e/0x6f0
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685051] RSP:
>>
>> 0018:880f79fbb818  EFLAGS: 00010206
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685063] RAX:
>>
>>  RBX: 880f79fbb8b8 RCX: 
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685076] RDX:
>>
>>  RSI:  RDI: 8810352d4800
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685107] RBP:
>>
>> 880f79fbb880 R08: 0001020cf87c R09: 00ff00ff
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685150] R10:
>>
>> 0009 R11: 0006 R12: 8807c3adc4c0
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685194] R13:
>>
>> 0006 R14: 033e R15: fec7
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685238] FS:
>>
>> 7f30e46b8700() GS:88105f58()
>>
>> knlGS:
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685283] CS:  0010 DS:
>>
>>  ES:  CR0: 80050033
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685310] CR2:
>>
>> 1321a000 CR3: 000853598000 CR4: 000406e0
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685354] Stack:
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685374]
>>
>> 813d050f 000d 0045 880f79df8000
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685426]
>>
>> 033f  00016b00 033f
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685477]
>>
>> 880f79df8000 880f79fbb8b8 01f4 0054
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685528] Call Trace:
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.68]
>>
>> [] ? cpumask_next_and+0x2f/0x40
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685584]
>>
>> [] task_numa_migrate+0x43e/0x9b0
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685613]
>>
>> [] ? update_cfs_shares+0xbc/0x100
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685642]
>>
>> [] numa_migrate_preferred+0x79/0x80
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685672]
>>
>> [] task_numa_fault+0x7f4/0xd40
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685700]
>>
>> [] ? timerqueue_del+0x24/0x70
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685729]
>>
>> [] ? should_numa_migrate_memory+0x55/0x130
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685762]
>>
>> [] handle_mm_fault+0xbc0/0x1820
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685793]
>>
>> [] ? __hrtimer_init+0x90/0x90
>>
>> Jun 28 09:46:41 roc04r-sca090 kernel: [137912.685822]
>>
>> [] ? remove_wait_queue+0x4d/0x60
>>
>> Jun 28 

Re: [ceph-users] Another cluster completely hang

2016-06-29 Thread Mario Giammarco
pool 0 'rbd' replicated size 2 min_size 1 crush_ruleset 0 object_hash
rjenkins pg_num 512 pgp_num 512 last_change 9313 flags hashpspool
stripe_width 0
   removed_snaps [1~3]
pool 1 'rbd2' replicated size 2 min_size 1 crush_ruleset 0 object_hash
rjenkins pg_num 512 pgp_num 512 last_change 9314 flags hashpspool
stripe_width 0
   removed_snaps [1~3]
pool 2 'rbd3' replicated size 2 min_size 1 crush_ruleset 0 object_hash
rjenkins pg_num 512 pgp_num 512 last_change 10537 flags hashpspool
stripe_width 0
   removed_snaps [1~3]


ID WEIGHT  REWEIGHT SIZE   USE   AVAIL %USE  VAR
5 1.81000  1.0  1857G  984G  872G 53.00 0.86
6 1.81000  1.0  1857G 1202G  655G 64.73 1.05
2 1.81000  1.0  1857G 1158G  698G 62.38 1.01
3 1.35999  1.0  1391G  906G  485G 65.12 1.06
4 0.8  1.0   926G  702G  223G 75.88 1.23
7 1.81000  1.0  1857G 1063G  793G 57.27 0.93
8 1.81000  1.0  1857G 1011G  846G 54.44 0.88
9 0.8  1.0   926G  573G  352G 61.91 1.01
0 1.81000  1.0  1857G 1227G  629G 66.10 1.07
13 0.45000  1.0   460G  307G  153G 66.74 1.08
 TOTAL 14846G 9136G 5710G 61.54
MIN/MAX VAR: 0.86/1.23  STDDEV: 6.47



ceph version 0.94.7 (d56bdf93ced6b80b07397d57e3fa68fe68304432)

http://pastebin.com/SvGfcSHb
http://pastebin.com/gYFatsNS
http://pastebin.com/VZD7j2vN

I do not understand why I/O on the ENTIRE cluster is blocked when only a few
pgs are incomplete.
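
For reference, the standard commands to see exactly which PGs are stuck and
which OSDs they map to (<pgid> is a placeholder for one of the incomplete pgs
reported by ceph health detail):

# list the incomplete pgs and the blocked requests
ceph health detail
# stuck pgs together with their up/acting OSD sets
ceph pg dump_stuck inactive
# per-pg state, including what the pg is waiting for
ceph pg <pgid> query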

Many thanks,
Mario


On Tue, 28 Jun 2016 at 19:34, Stefan Priebe - Profihost AG <s.pri...@profihost.ag> wrote:

> And ceph health detail
>
> Stefan
>
> Excuse my typo sent from my mobile phone.
>
> On 28.06.2016 at 19:28, Oliver Dzombic wrote:
>
> Hi Mario,
>
> please give some more details:
>
> Please provide the output of:
>
> ceph osd pool ls detail
> ceph osd df
> ceph --version
>
> ceph -w for 10 seconds ( use http://pastebin.com/ please )
>
> ceph osd crush dump ( also pastebin pls )
>
> --
> Mit freundlichen Gruessen / Best regards
>
> Oliver Dzombic
> IP-Interactive
>
> mailto:i...@ip-interactive.de 
>
> Anschrift:
>
> IP Interactive UG ( haftungsbeschraenkt )
> Zum Sonnenberg 1-3
> 63571 Gelnhausen
>
> HRB 93402 beim Amtsgericht Hanau
> Geschäftsführung: Oliver Dzombic
>
> Steuer Nr.: 35 236 3622 1
> UST ID: DE274086107
>
>
> On 28.06.2016 at 18:59, Mario Giammarco wrote:
>
> Hello,
>
> this is the second time this has happened to me; I hope that someone can
>
> explain what I can do.
>
> Proxmox ceph cluster with 8 servers, 11 hdd. Min_size=1, size=2.
>
>
> One hdd goes down due to bad sectors.
>
> Ceph recovers but it ends with:
>
>
> cluster f2a8dd7d-949a-4a29-acab-11d4900249f4
>
> health HEALTH_WARN
>
>3 pgs down
>
>19 pgs incomplete
>
>19 pgs stuck inactive
>
>19 pgs stuck unclean
>
>7 requests are blocked > 32 sec
>
> monmap e11: 7 mons at
>
> {0=192.168.0.204:6789/0,1=192.168.0.201:6789/0,
>
> 2=192.168.0.203:6789/0,3=192.168.0.205:6789/0,4=192.168.0.202:
>
> 6789/0,5=192.168.0.206:6789/0,6=192.168.0.207:6789/0}
>
>election epoch 722, quorum
>
> 0,1,2,3,4,5,6 1,4,2,0,3,5,6
>
> osdmap e10182: 10 osds: 10 up, 10 in
>
>  pgmap v3295880: 1024 pgs, 2 pools, 4563 GB data, 1143 kobjects
>
>9136 GB used, 5710 GB / 14846 GB avail
>
>1005 active+clean
>
>  16 incomplete
>
>   3 down+incomplete
>
>
> Unfortunately "7 requests blocked" means no virtual machine can boot
>
> because ceph has stopped i/o.
>
>
> I can accept to lose some data, but not ALL data!
>
> Can you help me please?
>
> Thanks,
>
> Mario
>
>
> ___
>
> ceph-users mailing list
>
> ceph-users@lists.ceph.com
>
> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
>
>
> ___
> ceph-users mailing list
> ceph-users@lists.ceph.com
> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
>
> ___
> ceph-users mailing list
> ceph-users@lists.ceph.com
> http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com
>
___
ceph-users mailing list
ceph-users@lists.ceph.com
http://lists.ceph.com/listinfo.cgi/ceph-users-ceph.com