On 11/07/12 13:32, Mark Kirkwood wrote:
I have attached the dump of stuck stale pgs, and the crushmap in use.
...of course I left off the crushmap, so here it is, plus my ceph.conf
for good measure.
Mark
# begin crush map
# devices
device 0 osd0
device 1 osd1
device 2 osd2
device 3 osd3
# types
type 0 device
type 1 host
type 2 datacenter
type 3 root
# buckets
host ved1 {
	id -1 # do not change unnecessarily
	# weight 1.000
	alg straw
	hash 0 # rjenkins1
	item osd0 weight 1.000
}
host ved2 {
	id -2 # do not change unnecessarily
	# weight 1.000
	alg straw
	hash 0 # rjenkins1
	item osd1 weight 1.000
}
host ved3 {
	id -3 # do not change unnecessarily
	# weight 1.000
	alg straw
	hash 0 # rjenkins1
	item osd2 weight 1.000
}
host ved4 {
	id -4 # do not change unnecessarily
	# weight 1.000
	alg straw
	hash 0 # rjenkins1
	item osd3 weight 1.000
}
datacenter datacenter0 {
	id -5 # do not change unnecessarily
	# weight 2.000
	alg straw
	hash 0 # rjenkins1
	item ved1 weight 1.000
	item ved2 weight 1.000
}
datacenter datacenter1 {
	id -6 # do not change unnecessarily
	# weight 2.000
	alg straw
	hash 0 # rjenkins1
	item ved3 weight 1.000
	item ved4 weight 1.000
}
root root {
	id -7 # do not change unnecessarily
	# weight 4.000
	alg straw
	hash 0 # rjenkins1
	item datacenter0 weight 2.000
	item datacenter1 weight 2.000
}
# rules
rule data {
	ruleset 1
	type replicated
	min_size 2
	max_size 10
	step take datacenter0
	step chooseleaf firstn -2 type host
	step emit
	step take datacenter1
	step chooseleaf firstn 2 type host
	step emit
}
rule rbd {
	ruleset 2
	type replicated
	min_size 2
	max_size 10
	step take datacenter0
	step chooseleaf firstn -2 type host
	step emit
	step take datacenter1
	step chooseleaf firstn 2 type host
	step emit
}
# end crush map
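
For reference, this is roughly how I compile and sanity-check the map before injecting it (the crushmap.txt / crushmap.bin filenames are just what I use locally):

  # compile the decompiled text map back to binary
  crushtool -c crushmap.txt -o crushmap.bin

  # dry-run the 'data' rule (ruleset 1) and show placement statistics
  crushtool -i crushmap.bin --test --rule 1 --num-rep 4 --show-statistics

  # inject the compiled map into the running cluster
  ceph osd setcrushmap -i crushmap.bin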
;
; Sample ceph.conf file.
;
; This file defines cluster membership, the various locations
; that Ceph stores data, and any other runtime options.
;
; Experimental setup for 4 osd, 3 mon and 0 mds.
; Will experiment with crush rules later...
;
; If a 'host' is defined for a daemon, the start/stop script will
; verify that it matches the hostname (or else ignore it). If it is
; not defined, it is assumed that the daemon is intended to start on
; the current host (e.g., in a setup with a startup.conf on each
; node).
; The variables $type, $id and $name are available to use in paths
; $type = The type of daemon, possible values: mon, mds and osd
; $id = The ID of the daemon, for mon.alpha, $id will be alpha
; $name = $type.$id
; For example:
; osd.0
; $type = osd
; $id = 0
; $name = osd.0
; mon.beta
; $type = mon
; $id = beta
; $name = mon.beta
; global
[global]
; enable secure authentication
;auth supported = cephx
;keyring = /etc/ceph/ceph.keyring
; allow ourselves to open a lot of files
max open files = 131072
; set log file
log file = /var/log/ceph/$name.log
; log_to_syslog = true ; uncomment this line to log to syslog
; set up pid files
pid file = /var/run/ceph/$name.pid
; If you want to run an IPv6 cluster, set this to true. Dual-stack isn't
; possible.
;ms bind ipv6 = true
; monitors
; You need at least one. You need at least three if you want to
; tolerate any node failures. Always create an odd number.
[mon]
mon data = /var/data/$name
;keyring = /var/data/keyring.$name
; If you are using, for example, the RADOS Gateway and want your newly
; created pools to have a higher replication level, you can set a default
;osd pool default size = 3
; You can also specify a CRUSH rule for new pools
; Wiki: http://ceph.newdream.net/wiki/Custom_data_placement_with_CRUSH
;osd pool default crush rule = 0
; Timing is critical for monitors, but if you want to allow the clocks to
; drift a bit more, you can specify the max drift.
;mon clock drift allowed = 1
; Tell the monitor to back off from this warning for 30 seconds
;mon clock drift warn backoff = 30
; logging, for debugging monitor crashes, in order of
; their likelihood of being helpful :)
;debug ms = 1
;debug mon = 20
;debug paxos = 20
;debug auth = 20
[mon.ved1]
host = ved1
mon addr = 192.168.122.11:6789
[mon.ved2]
host = ved2
mon addr = 192.168.122.12:6789
[mon.ved3]
host = ved3
mon addr = 192.168.122.13:6789
; osd
; You need at least one. Two if you want data to be replicated.
; Define as many as you like.
[osd]
; This is where the btrfs volume will be mounted.
osd data = /var/data/$name
;keyring = /var/data/keyring.$name
; Ideally, make this a separate disk or partition. A few
; hundred MB should be enough; more if you have fast or many
; disks. You can use a file under the osd data dir if need be
; (e.g. /data/$name/journal), but it will be slower than a
; separate disk or partition.
; This is an example of a file-based journal.
osd journal = /var/data/$name/journal
osd journal size = 1000 ; journal size, in megabytes
; If you want to run the journal on a tmpfs, disable DirectIO
;journal dio = false
; You can change the number of recovery operations to speed up recovery
; or slow it down if your machines can't handle it
; osd recovery max active = 3
; osd logging to debug osd issues, in order of likelihood of being
; helpful
;debug ms = 1
;debug osd = 20
;debug filestore = 20
;debug journal = 20
[osd.0]
host = ved1
; if 'btrfs devs' is not specified, you're responsible for
; setting up the 'osd data' dir. If it is not btrfs, things
; will behave up until you try to recover from a crash (which
; is usually fine for basic testing).
;btrfs devs = /dev/sdx
; If you want to specify some other mount options, you can do so.
; The default values are rw,noatime
;btrfs options = rw,noatime
[osd.1]
host = ved2
[osd.2]
host = ved3
[osd.3]
host = ved4
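
In case it helps, the stuck pg list mentioned above was pulled with something like:

  ceph pg dump_stuck stale
  ceph pg dump_stuck unclean

and I've been checking where individual pgs map (and the tree layout) with:

  ceph pg map <pgid>
  ceph osd tree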