On 11/07/12 13:32, Mark Kirkwood wrote:

I have attached the dump of stuck stale pgs, and the crushmap in use.


...of course I left off the crushmap, so here it is, plus my ceph.conf for good measure.


Mark
# begin crush map

# devices
device 0 osd0
device 1 osd1
device 2 osd2
device 3 osd3

# types
type 0 device
type 1 host
type 2 datacenter
type 3 root

# buckets
host ved1 {
        id -1           # do not change unnecessarily
        # weight 1.000
        alg straw
        hash 0  # rjenkins1
        item osd0 weight 1.000
}
host ved2 {
        id -2           # do not change unnecessarily
        # weight 1.000
        alg straw
        hash 0  # rjenkins1
        item osd1 weight 1.000
}
host ved3 {
        id -3           # do not change unnecessarily
        # weight 1.000
        alg straw
        hash 0  # rjenkins1
        item osd2 weight 1.000
}
host ved4 {
        id -4           # do not change unnecessarily
        # weight 1.000
        alg straw
        hash 0  # rjenkins1
        item osd3 weight 1.000
}
datacenter datacenter0 {
        id -5           # do not change unnecessarily
        # weight 2.000
        alg straw
        hash 0  # rjenkins1
        item ved1 weight 1.000
        item ved2 weight 1.000
}
datacenter datacenter1 {
        id -6           # do not change unnecessarily
        # weight 2.000
        alg straw
        hash 0  # rjenkins1
        item ved3 weight 1.000
        item ved4 weight 1.000
}
root root {
        id -7           # do not change unnecessarily
        # weight 4.000
        alg straw
        hash 0  # rjenkins1
        item datacenter0 weight 2.000
        item datacenter1 weight 2.000
}

# rules
rule data {
        ruleset 1
        type replicated
        min_size 2
        max_size 10
        step take datacenter0
        step chooseleaf firstn -2 type host
        step emit
        step take datacenter1
        step chooseleaf firstn 2 type host
        step emit
}
rule rbd {
        ruleset 2
        type replicated
        min_size 2
        max_size 10
        step take datacenter0
        step chooseleaf firstn -2 type host
        step emit
        step take datacenter1
        step chooseleaf firstn 2 type host
        step emit
}

# end crush map
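
For reference, a text map like the one above can be round-tripped and sanity-checked with crushtool before injecting it into the cluster. A rough sketch (the filenames are placeholders, and the exact test flags vary a bit between crushtool releases):

    crushtool -c crush.txt -o crush.bin        # compile the text map
    crushtool -i crush.bin --test --rule 1 --num-rep 2 --show-mappings
                                               # see which OSDs the 'data' rule maps PGs to
    ceph osd setcrushmap -i crush.bin          # inject into the running cluster
    ceph osd getcrushmap -o crush.bin          # ...and pull it back out
    crushtool -d crush.bin -o crush.txt        # decompile to text again
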
;
; Sample ceph.conf file.
;
; This file defines cluster membership, the various locations
; that Ceph stores data, and any other runtime options.

;
; Experimental setup for 4 osd, 3 mon and 0 mds. 
; Will experiment with crush rules later...
;

; If a 'host' is defined for a daemon, the start/stop script will
; verify that it matches the hostname (or else ignore it).  If it is
; not defined, it is assumed that the daemon is intended to start on
; the current host (e.g., in a setup with a startup.conf on each
; node).

; The variables $type, $id and $name are available to use in paths
; $type = The type of daemon, possible values: mon, mds and osd
; $id = The ID of the daemon, for mon.alpha, $id will be alpha
; $name = $type.$id

; For example:
; osd.0
;  $type = osd
;  $id = 0
;  $name = osd.0

; mon.beta
;  $type = mon
;  $id = beta
;  $name = mon.beta

; global
[global]
        ; enable secure authentication
        ;auth supported = cephx
        ;keyring = /etc/ceph/ceph.keyring

        ; allow ourselves to open a lot of files
        max open files = 131072

        ; set log file
        log file = /var/log/ceph/$name.log
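        ; (e.g. with the $name substitution described above, osd.0 will log to
        ; /var/log/ceph/osd.0.log and mon.ved1 to /var/log/ceph/mon.ved1.log)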
        ; log_to_syslog = true        ; uncomment this line to log to syslog

        ; set up pid files
        pid file = /var/run/ceph/$name.pid

        ; If you want to run an IPv6 cluster, set this to true.
        ; Dual-stack isn't possible.
        ;ms bind ipv6 = true

; monitors
;  You need at least one.  You need at least three if you want to
;  tolerate any node failures.  Always create an odd number.
[mon]
        mon data = /var/data/$name
        ;keyring = /var/data/keyring.$name

        ; If you are using, for example, the RADOS Gateway and want your newly
        ; created pools to have a higher replication level, you can set a default.
        ;osd pool default size = 3

        ; You can also specify a CRUSH rule for new pools
        ; Wiki: http://ceph.newdream.net/wiki/Custom_data_placement_with_CRUSH
        ;osd pool default crush rule = 0

        ; Timing is critical for monitors, but if you want to allow the clocks
        ; to drift a bit more, you can specify the max drift.
        ;mon clock drift allowed = 1

        ; Tell the monitor to back off from this warning for 30 seconds
        ;mon clock drift warn backoff = 30

        ; logging, for debugging monitor crashes, in order of
        ; their likelihood of being helpful :)
        ;debug ms = 1
        ;debug mon = 20
        ;debug paxos = 20
        ;debug auth = 20

[mon.ved1]
        host = ved1
        mon addr = 192.168.122.11:6789

[mon.ved2]
        host = ved2
        mon addr = 192.168.122.12:6789

[mon.ved3]
        host = ved3
        mon addr = 192.168.122.13:6789
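        ; With three monitors defined here, the cluster stays available if any
        ; one of them fails; once things are up, 'ceph mon stat' (or just
        ; 'ceph -s') is a quick way to check that all three have joined the quorum.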

; osd
;  You need at least one.  Two if you want data to be replicated.
;  Define as many as you like.
[osd]
        ; This is where the btrfs volume will be mounted.
        osd data = /var/data/$name
        ;keyring = /var/data/keyring.$name

        ; Ideally, make this a separate disk or partition.  A few
        ; hundred MB should be enough; more if you have fast or many
        ; disks.  You can use a file under the osd data dir if need be
        ; (e.g. /data/$name/journal), but it will be slower than a
        ; separate disk or partition.

        ; This is an example of a file-based journal.
        osd journal = /var/data/$name/journal
        osd journal size = 1000 ; journal size, in megabytes
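        ; (A raw partition can be given here instead of a file, e.g. something
        ; like 'osd journal = /dev/sdb1'; the device name is purely illustrative.)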

        ; If you want to run the journal on a tmpfs, disable DirectIO
        ;journal dio = false

        ; You can change the number of recovery operations to speed up recovery
        ; or slow it down if your machines can't handle it
        ; osd recovery max active = 3

        ; osd logging to debug osd issues, in order of likelihood of being
        ; helpful
        ;debug ms = 1
        ;debug osd = 20
        ;debug filestore = 20
        ;debug journal = 20

[osd.0]
        host = ved1

        ; if 'btrfs devs' is not specified, you're responsible for
        ; setting up the 'osd data' dir.  if it is not btrfs, things
        ; will behave up until you try to recover from a crash (which
        ; is usually fine for basic testing).
        ;btrfs devs = /dev/sdx

        ; If you want to specify some other mount options, you can do so.
        ; The default values are rw,noatime
        ;btrfs options = rw,noatime

[osd.1]
        host = ved2

[osd.2]
        host = ved3

[osd.3]
        host = ved4
