Re: Dell R310 - H200 Raid performance problem
On Sun 2011.02.20 at 10:30 -0500, Okan Demirmen wrote: On Sun 2011.02.20 at 13:28 +0100, Mark Kettenis wrote: Date: Sun, 20 Feb 2011 07:03:25 -0500 From: Kenneth R Westerback kwesterb...@rogers.com On Sun, Feb 20, 2011 at 12:39:06PM +0100, Mark Kettenis wrote: Date: Sun, 20 Feb 2011 19:54:21 +1000 From: David Gwynne l...@animata.net how to manipulate write cache policy? the lsi firmwares dont implement handling of the mod page changes unfortunately. you could call the ioctl this implements yourself though from userland. David, while I think that implementing the cache manipulation ioctls for mpii(4) is a good idea, there is a problem here. We don't have a tool in base that actually issues those ioctls. And unless I'm misreading the diff, this still leaves the cache disabled on the stupid Dell. DIOCSCACHE is called in sdattach() to enable write cache for all disks that DIOCGCACHE reports as having write cache disabled. Or are you concerned that we have no way to manipulate it from userland if/when the default needs to be modified? Ah, that's the bit I was missing. A userland tool to display and manipulate the cache settings would still be good though. Functionality should probably be added to bioctl(8). A bit unfortunate that both the -c and -C options are already taken. Ah, I had a diff for bioctl (enable/disable WCE/RCD) based on dlg's sample, but I think marco wanted more of a policy of when to do WCE/RCD rather than a switch - I'll send it along when I get home later this week. I'm not certain this is wanted, but I said I would forward along this very simplisitc patch, so here it is. If something like this is wanted, it can be re-worked to take multiple args to -e and such, but again, only if this is deemed necessary in a userland tool outside of scsi(8). 
Index: bioctl.8 === RCS file: /cvs/src/sbin/bioctl/bioctl.8,v retrieving revision 1.84 diff -u -p -r1.84 bioctl.8 --- bioctl.822 Dec 2010 16:25:32 - 1.84 +++ bioctl.82 Mar 2011 10:44:23 - @@ -35,6 +35,7 @@ .Op Fl hiqv .Op Fl a Ar alarm-function .Op Fl b Ar channel:target[.lun] +.Op Fl e Ar flag .Op Fl H Ar channel:target[.lun] .Op Fl R Ar device \*(Ba channel:target[.lun] .Op Fl u Ar channel:target[.lun] @@ -128,6 +129,24 @@ digits to four or less. .It Fl i Enumerate the selected RAID devices. This is the default if no other option is given. +.It Fl e Ar flag +Pass +.Ar flag +to +.Nm . +May be one of: +.Bl -tag -width disable -compact +.It Ar q +Query the read/write cache status. +.It Ar R +Enable the read cache. +.It Ar r +Disable the read cache. +.It Ar W +Enable the write cache. +.It Ar w +Disable the write cache. +.El .It Fl q Show vendor, product, revision, and serial number for the given disk. .It Fl R Ar device \*(Ba channel:target[.lun] Index: bioctl.c === RCS file: /cvs/src/sbin/bioctl/bioctl.c,v retrieving revision 1.98 diff -u -p -r1.98 bioctl.c --- bioctl.c1 Dec 2010 19:40:18 - 1.98 +++ bioctl.c2 Mar 2011 10:44:23 - @@ -77,6 +77,7 @@ void bio_changepass(char *); u_int32_t bio_createflags(char *); char *bio_vis(char *); void bio_diskinq(char *); +void bio_cache(char *, char *); intdevh = -1; inthuman; @@ -97,17 +98,17 @@ main(int argc, char *argv[]) char*devicename = NULL; char*realname = NULL, *al_arg = NULL; char*bl_arg = NULL, *dev_list = NULL; - char*key_disk = NULL; + char*key_disk = NULL, *ca_arg = NULL; const char *errstr; int ch, rv, blink = 0, changepass = 0, diskinq = 0; - int ss_func = 0; + int ss_func = 0, diskcache = 0; u_int16_t cr_level = 0; int biodev = 0; if (argc 2) usage(); - while ((ch = getopt(argc, argv, a:b:C:c:dH:hik:l:Pp:qr:R:svu:)) != + while ((ch = getopt(argc, argv, a:b:C:c:de:H:hik:l:Pp:qr:R:svu:)) != -1) { switch (ch) { case 'a': /* alarm */ @@ -133,6 +134,10 @@ main(int argc, char *argv[]) /* delete volume */ func |= 
BIOC_DELETERAID; break; + case 'e': /* cache */ + diskcache = 1; + ca_arg = optarg; + break; case 'u': /* unblink */ func |= BIOC_BLINK; blink = BIOC_SBUNBLINK; @@ -219,6 +224,8 @@ main(int
Re: Dell R310 - H200 Raid performance problem
On Wed, Mar 2, 2011 at 11:54 AM, Okan Demirmen o...@demirmen.com wrote: I'm not certain this is wanted, but I said I would forward along this very simplistic patch, so here it is. If something like this is wanted, it can be re-worked to take multiple args to -e and such, but again, only if this is deemed necessary in a userland tool outside of scsi(8). i think this is pointless. if you have an ioctl implemented in the driver that enables cache, then sd(4) itself will enable it for you. if your driver doesn't implement those ioctls it gives you a false idea that you can turn it on which is not true obviously. Index: bioctl.8 === RCS file: /cvs/src/sbin/bioctl/bioctl.8,v retrieving revision 1.84 diff -u -p -r1.84 bioctl.8 --- bioctl.8 22 Dec 2010 16:25:32 - 1.84 +++ bioctl.8 2 Mar 2011 10:44:23 - @@ -35,6 +35,7 @@ .Op Fl hiqv .Op Fl a Ar alarm-function .Op Fl b Ar channel:target[.lun] +.Op Fl e Ar flag .Op Fl H Ar channel:target[.lun] .Op Fl R Ar device \*(Ba channel:target[.lun] .Op Fl u Ar channel:target[.lun] @@ -128,6 +129,24 @@ digits to four or less. .It Fl i Enumerate the selected RAID devices. This is the default if no other option is given. +.It Fl e Ar flag +Pass +.Ar flag +to +.Nm . +May be one of: +.Bl -tag -width disable -compact +.It Ar q +Query the read/write cache status. +.It Ar R +Enable the read cache. +.It Ar r +Disable the read cache. +.It Ar W +Enable the write cache. +.It Ar w +Disable the write cache. +.El .It Fl q Show vendor, product, revision, and serial number for the given disk. .It Fl R Ar device \*(Ba channel:target[.lun]
Re: Dell R310 - H200 Raid performance problem
On 2011/03/02 12:09, Mike Belopuhov wrote: On Wed, Mar 2, 2011 at 11:54 AM, Okan Demirmen o...@demirmen.com wrote: I'm not certain this is wanted, but I said I would forward along this very simplistic patch, so here it is. If something like this is wanted, it can be re-worked to take multiple args to -e and such, but again, only if this is deemed necessary in a userland tool outside of scsi(8). i think this is pointless. if you have an ioctl implemented in the driver that enables cache, then sd(4) itself will enable it for you. if your driver doesn't implement those ioctls it gives you a false idea that you can turn it on which is not true obviously. I guess some people might be thinking users may want to disable this cache for safety or something. Those people might reconsider if they actually try one of these systems - it isn't just a bit slower, the system really is unusable without it.
Re: Dell R310 - H200 Raid performance problem
I really think this heuristic belongs in the kernel. I think there is a desire to make the policy a knob (the old, I prefer slow and safe over fast and dangerous; well use a ups! they don't! debate). So instead of bioctl I think we need a sysctl, for example hw.diskcache, that by default is enabled which is the drive manufacturers suggested setting. Then if so desired one can turn it off. Or do people think this would be too large a hammer and would like to have a more granular control? On Wed, Mar 02, 2011 at 05:54:23AM -0500, Okan Demirmen wrote: On Sun 2011.02.20 at 10:30 -0500, Okan Demirmen wrote: On Sun 2011.02.20 at 13:28 +0100, Mark Kettenis wrote: Date: Sun, 20 Feb 2011 07:03:25 -0500 From: Kenneth R Westerback kwesterb...@rogers.com On Sun, Feb 20, 2011 at 12:39:06PM +0100, Mark Kettenis wrote: Date: Sun, 20 Feb 2011 19:54:21 +1000 From: David Gwynne l...@animata.net how to manipulate write cache policy? the lsi firmwares dont implement handling of the mod page changes unfortunately. you could call the ioctl this implements yourself though from userland. David, while I think that implementing the cache manipulation ioctls for mpii(4) is a good idea, there is a problem here. We don't have a tool in base that actually issues those ioctls. And unless I'm misreading the diff, this still leaves the cache disabled on the stupid Dell. DIOCSCACHE is called in sdattach() to enable write cache for all disks that DIOCGCACHE reports as having write cache disabled. Or are you concerned that we have no way to manipulate it from userland if/when the default needs to be modified? Ah, that's the bit I was missing. A userland tool to display and manipulate the cache settings would still be good though. Functionality should probably be added to bioctl(8). A bit unfortunate that both the -c and -C options are already taken. 
Ah, I had a diff for bioctl (enable/disable WCE/RCD) based on dlg's sample, but I think marco wanted more of a policy of when to do WCE/RCD rather than a switch - I'll send it along when I get home later this week. I'm not certain this is wanted, but I said I would forward along this very simplisitc patch, so here it is. If something like this is wanted, it can be re-worked to take multiple args to -e and such, but again, only if this is deemed necessary in a userland tool outside of scsi(8). Index: bioctl.8 === RCS file: /cvs/src/sbin/bioctl/bioctl.8,v retrieving revision 1.84 diff -u -p -r1.84 bioctl.8 --- bioctl.8 22 Dec 2010 16:25:32 - 1.84 +++ bioctl.8 2 Mar 2011 10:44:23 - @@ -35,6 +35,7 @@ .Op Fl hiqv .Op Fl a Ar alarm-function .Op Fl b Ar channel:target[.lun] +.Op Fl e Ar flag .Op Fl H Ar channel:target[.lun] .Op Fl R Ar device \*(Ba channel:target[.lun] .Op Fl u Ar channel:target[.lun] @@ -128,6 +129,24 @@ digits to four or less. .It Fl i Enumerate the selected RAID devices. This is the default if no other option is given. +.It Fl e Ar flag +Pass +.Ar flag +to +.Nm . +May be one of: +.Bl -tag -width disable -compact +.It Ar q +Query the read/write cache status. +.It Ar R +Enable the read cache. +.It Ar r +Disable the read cache. +.It Ar W +Enable the write cache. +.It Ar w +Disable the write cache. +.El .It Fl q Show vendor, product, revision, and serial number for the given disk. 
.It Fl R Ar device \*(Ba channel:target[.lun] Index: bioctl.c === RCS file: /cvs/src/sbin/bioctl/bioctl.c,v retrieving revision 1.98 diff -u -p -r1.98 bioctl.c --- bioctl.c 1 Dec 2010 19:40:18 - 1.98 +++ bioctl.c 2 Mar 2011 10:44:23 - @@ -77,6 +77,7 @@ voidbio_changepass(char *); u_int32_tbio_createflags(char *); char *bio_vis(char *); void bio_diskinq(char *); +void bio_cache(char *, char *); int devh = -1; int human; @@ -97,17 +98,17 @@ main(int argc, char *argv[]) char*devicename = NULL; char*realname = NULL, *al_arg = NULL; char*bl_arg = NULL, *dev_list = NULL; - char*key_disk = NULL; + char*key_disk = NULL, *ca_arg = NULL; const char *errstr; int ch, rv, blink = 0, changepass = 0, diskinq = 0; - int ss_func = 0; + int ss_func = 0, diskcache = 0; u_int16_t cr_level = 0; int biodev = 0; if (argc 2) usage(); - while ((ch = getopt(argc, argv, a:b:C:c:dH:hik:l:Pp:qr:R:svu:)) != + while ((ch =
Re: Dell R310 - H200 Raid performance problem
Date: Wed, 2 Mar 2011 12:09:01 +0100 From: Mike Belopuhov m...@crypt.org.ru On Wed, Mar 2, 2011 at 11:54 AM, Okan Demirmen o...@demirmen.com wrote: I'm not certain this is wanted, but I said I would forward along this very simplisitc patch, so here it is. If something like this is wanted, it can be re-worked to take multiple args to -e and such, but again, only if this is deemed necessary in a userland tool outside of scsi(8). i think this is pointless. if you have an ioctl implemented in the driver that enables cache, then sd(4) itself will enable it for you. if your driver doesn't implement those ioctls it gives you a false idea that you can turn it on which is not true obviously. Well, if sd(4) enables the cache by default, people actually might want to disable the cache. There are valid reasons for running with write caches disabled, especially with RAID controllers that don't have a battery backup. And what is the point of having an ioctl if its only supposed to be used internally by the kernel? Also, if the ioctls aren't implemented they will fail, so bioctl(8) presumably prints an error message in that case. Index: bioctl.8 === RCS file: /cvs/src/sbin/bioctl/bioctl.8,v retrieving revision 1.84 diff -u -p -r1.84 bioctl.8 --- bioctl.822 Dec 2010 16:25:32 - 1.84 +++ bioctl.82 Mar 2011 10:44:23 - @@ -35,6 +35,7 @@ .Op Fl hiqv .Op Fl a Ar alarm-function .Op Fl b Ar channel:target[.lun] +.Op Fl e Ar flag .Op Fl H Ar channel:target[.lun] .Op Fl R Ar device \*(Ba channel:target[.lun] .Op Fl u Ar channel:target[.lun] @@ -128,6 +129,24 @@ digits to four or less. .It Fl i Enumerate the selected RAID devices. This is the default if no other option is given. +.It Fl e Ar flag +Pass +.Ar flag +to +.Nm . +May be one of: +.Bl -tag -width disable -compact +.It Ar q +Query the read/write cache status. +.It Ar R +Enable the read cache. +.It Ar r +Disable the read cache. +.It Ar W +Enable the write cache. +.It Ar w +Disable the write cache. 
+.El .It Fl q Show vendor, product, revision, and serial number for the given disk. .It Fl R Ar device \*(Ba channel:target[.lun]
Re: Dell R310 - H200 Raid performance problem
Where will this bioctl call be done from? From /etc/rc.local after fsck has run and spent ages because the disks are not cached? I don't understand what the purpose is of giving an option that makes disks slow. What's the point? Why does the kernel just always try to make it best?
Re: Dell R310 - H200 Raid performance problem
i believe the diff below should work out of the box. it pulls in all mikeb's fixes. On Fri, Feb 18, 2011 at 07:54:09PM +0100, Łukasz Czarniecki wrote: With following Mike's suggestions it worked. # scsi -f /dev/rsd0c -m 8 IC: 0 ABPF: 0 CAP: 0 DISC: 0 SIZE: 0 WCE: 1 MF: 0 RCD: 0 Demand Retention Priority: 0 Write Retention Priority: 0 Disable Pre-fetch Transfer Length: 65535 Minimum Pre-fetch: 0 Maximum Pre-fetch: 65280 Maximum Pre-fetch Ceiling: 65535 FSW: 0 LBCSS: 0 DRA: 0 Vendor-specific: 0 NV_DIS: 0 Number of Cache Segments: 15 Cache Segment Size: 0 how to manipulate write cache policy? the lsi firmwares don't implement handling of the mode page changes unfortunately. you could call the ioctl this implements yourself though from userland. Index: mpii.c === RCS file: /cvs/src/sys/dev/pci/mpii.c,v retrieving revision 1.37 diff -u -p -r1.37 mpii.c --- mpii.c 29 Dec 2010 03:55:09 - 1.37 +++ mpii.c 20 Feb 2011 09:18:58 - @@ -29,6 +29,7 @@ #include sys/kernel.h #include sys/rwlock.h #include sys/sensors.h +#include sys/dkio.h #include sys/tree.h #include machine/bus.h @@ -981,6 +982,51 @@ struct mpii_msg_sas_oper_reply { u_int32_t ioc_loginfo; } __packed; +struct mpii_msg_raid_action_request { + u_int8_taction; +#define MPII_RAID_ACTION_CHANGE_VOL_WRITE_CACHE(0x17) + u_int8_treserved1; + u_int8_tchain_offset; + u_int8_tfunction; + + u_int16_t vol_dev_handle; + u_int8_tphys_disk_num; + u_int8_tmsg_flags; + + u_int8_tvp_id; + u_int8_tvf_if; + u_int16_t reserved2; + + u_int32_t reserved3; + + u_int32_t action_data; +#define MPII_RAID_VOL_WRITE_CACHE_MASK (0x03) +#define MPII_RAID_VOL_WRITE_CACHE_DISABLE (0x01) +#define MPII_RAID_VOL_WRITE_CACHE_ENABLE (0x02) + + struct mpii_sge action_sge; +} __packed; + +struct mpii_msg_raid_action_reply { + u_int8_taction; + u_int8_treserved1; + u_int8_tchain_offset; + u_int8_tfunction; + + u_int16_t vol_dev_handle; + u_int8_tphys_disk_num; + u_int8_tmsg_flags; + + u_int8_tvp_id; + u_int8_tvf_if; + u_int16_t reserved2; + + u_int16_t 
reserved3; + u_int16_t ioc_status; + + u_int32_t action_data[5]; +} __packed; + struct mpii_cfg_hdr { u_int8_tpage_version; u_int8_tpage_length; @@ -1256,6 +1302,11 @@ struct mpii_cfg_raid_vol_pg0 { #define MPII_CFG_RAID_VOL_0_STATUS_RESYNC (116) u_int16_t volume_settings; +#define MPII_CFG_RAID_VOL_0_SETTINGS_CACHE_MASK(0x30) +#define MPII_CFG_RAID_VOL_0_SETTINGS_CACHE_UNCHANGED (0x00) +#define MPII_CFG_RAID_VOL_0_SETTINGS_CACHE_DISABLED(0x10) +#define MPII_CFG_RAID_VOL_0_SETTINGS_CACHE_ENABLED (0x20) + u_int8_thot_spare_pool; u_int8_treserved1; @@ -1972,6 +2023,8 @@ int mpii_req_cfg_page(struct mpii_softc intmpii_get_ioc_pg8(struct mpii_softc *); +intmpii_ioctl_cache(struct scsi_link *, u_long, struct dk_cache *); + #if NBIO 0 intmpii_ioctl(struct device *, u_long, caddr_t); intmpii_ioctl_inq(struct mpii_softc *, struct bioc_inq *); @@ -4650,19 +4703,123 @@ mpii_scsi_cmd_done(struct mpii_ccb *ccb) mpii_push_reply(sc, ccb-ccb_rcb); scsi_done(xs); -} +} int mpii_scsi_ioctl(struct scsi_link *link, u_long cmd, caddr_t addr, int flag) { struct mpii_softc *sc = (struct mpii_softc *)link-adapter_softc; + struct mpii_device *dev = sc-sc_devs[link-target]; DNPRINTF(MPII_D_IOCTL, %s: mpii_scsi_ioctl\n, DEVNAME(sc)); - if (sc-sc_ioctl) - return (sc-sc_ioctl(link-adapter_softc, cmd, addr)); - else - return (ENOTTY); + switch (cmd) { + case DIOCGCACHE: + case DIOCSCACHE: + if (dev != NULL ISSET(dev-flags, MPII_DF_VOLUME)) { + return (mpii_ioctl_cache(link, cmd, + (struct dk_cache *)addr)); + } + break; + + default: + if (sc-sc_ioctl) + return (sc-sc_ioctl(link-adapter_softc, cmd, addr)); + + break; + } + + return (ENOTTY); +} + +int +mpii_ioctl_cache(struct scsi_link *link, u_long cmd, struct dk_cache *dc) +{ + struct mpii_softc *sc = (struct mpii_softc *)link-adapter_softc; + struct mpii_device *dev = sc-sc_devs[link-target]; + struct mpii_cfg_raid_vol_pg0 *vpg; + struct
Re: Dell R310 - H200 Raid performance problem
Date: Sun, 20 Feb 2011 19:54:21 +1000 From: David Gwynne l...@animata.net how to manipulate write cache policy? the lsi firmwares dont implement handling of the mod page changes unfortunately. you could call the ioctl this implements yourself though from userland. David, while I think that implementing the cache manipulation ioctls for mpii(4) is a good idea, there is a problem here. We don't have a tool in base that actually issues those ioctls. And unless I'm misreading the diff, this still leaves the cache disabled on the stupid Dell. Index: mpii.c === RCS file: /cvs/src/sys/dev/pci/mpii.c,v retrieving revision 1.37 diff -u -p -r1.37 mpii.c --- mpii.c29 Dec 2010 03:55:09 - 1.37 +++ mpii.c20 Feb 2011 09:18:58 - @@ -29,6 +29,7 @@ #include sys/kernel.h #include sys/rwlock.h #include sys/sensors.h +#include sys/dkio.h #include sys/tree.h #include machine/bus.h @@ -981,6 +982,51 @@ struct mpii_msg_sas_oper_reply { u_int32_t ioc_loginfo; } __packed; +struct mpii_msg_raid_action_request { + u_int8_taction; +#define MPII_RAID_ACTION_CHANGE_VOL_WRITE_CACHE (0x17) + u_int8_treserved1; + u_int8_tchain_offset; + u_int8_tfunction; + + u_int16_t vol_dev_handle; + u_int8_tphys_disk_num; + u_int8_tmsg_flags; + + u_int8_tvp_id; + u_int8_tvf_if; + u_int16_t reserved2; + + u_int32_t reserved3; + + u_int32_t action_data; +#define MPII_RAID_VOL_WRITE_CACHE_MASK (0x03) +#define MPII_RAID_VOL_WRITE_CACHE_DISABLE(0x01) +#define MPII_RAID_VOL_WRITE_CACHE_ENABLE (0x02) + + struct mpii_sge action_sge; +} __packed; + +struct mpii_msg_raid_action_reply { + u_int8_taction; + u_int8_treserved1; + u_int8_tchain_offset; + u_int8_tfunction; + + u_int16_t vol_dev_handle; + u_int8_tphys_disk_num; + u_int8_tmsg_flags; + + u_int8_tvp_id; + u_int8_tvf_if; + u_int16_t reserved2; + + u_int16_t reserved3; + u_int16_t ioc_status; + + u_int32_t action_data[5]; +} __packed; + struct mpii_cfg_hdr { u_int8_tpage_version; u_int8_tpage_length; @@ -1256,6 +1302,11 @@ struct mpii_cfg_raid_vol_pg0 { #define 
MPII_CFG_RAID_VOL_0_STATUS_RESYNC(116) u_int16_t volume_settings; +#define MPII_CFG_RAID_VOL_0_SETTINGS_CACHE_MASK (0x30) +#define MPII_CFG_RAID_VOL_0_SETTINGS_CACHE_UNCHANGED (0x00) +#define MPII_CFG_RAID_VOL_0_SETTINGS_CACHE_DISABLED (0x10) +#define MPII_CFG_RAID_VOL_0_SETTINGS_CACHE_ENABLED (0x20) + u_int8_thot_spare_pool; u_int8_treserved1; @@ -1972,6 +2023,8 @@ int mpii_req_cfg_page(struct mpii_softc int mpii_get_ioc_pg8(struct mpii_softc *); +int mpii_ioctl_cache(struct scsi_link *, u_long, struct dk_cache *); + #if NBIO 0 int mpii_ioctl(struct device *, u_long, caddr_t); int mpii_ioctl_inq(struct mpii_softc *, struct bioc_inq *); @@ -4650,19 +4703,123 @@ mpii_scsi_cmd_done(struct mpii_ccb *ccb) mpii_push_reply(sc, ccb-ccb_rcb); scsi_done(xs); -} +} Looks like you're introducing spurious whitespace here. int mpii_scsi_ioctl(struct scsi_link *link, u_long cmd, caddr_t addr, int flag) { struct mpii_softc *sc = (struct mpii_softc *)link-adapter_softc; + struct mpii_device *dev = sc-sc_devs[link-target]; DNPRINTF(MPII_D_IOCTL, %s: mpii_scsi_ioctl\n, DEVNAME(sc)); - if (sc-sc_ioctl) - return (sc-sc_ioctl(link-adapter_softc, cmd, addr)); - else - return (ENOTTY); + switch (cmd) { + case DIOCGCACHE: + case DIOCSCACHE: + if (dev != NULL ISSET(dev-flags, MPII_DF_VOLUME)) { + return (mpii_ioctl_cache(link, cmd, + (struct dk_cache *)addr)); + } + break; + + default: + if (sc-sc_ioctl) + return (sc-sc_ioctl(link-adapter_softc, cmd, addr)); + + break; + } + + return (ENOTTY); +} + +int +mpii_ioctl_cache(struct scsi_link *link, u_long cmd, struct dk_cache *dc) +{ + struct mpii_softc *sc = (struct mpii_softc *)link-adapter_softc; + struct mpii_device *dev = sc-sc_devs[link-target]; + struct mpii_cfg_raid_vol_pg0 *vpg; + struct mpii_msg_raid_action_request *req; + struct mpii_msg_raid_action_reply *rep; + struct mpii_cfg_hdr hdr; + struct mpii_ccb *ccb; + u_int32_t addr = MPII_CFG_RAID_VOL_ADDR_HANDLE |
Re: Dell R310 - H200 Raid performance problem
Date: Sun, 20 Feb 2011 07:03:25 -0500 From: Kenneth R Westerback kwesterb...@rogers.com On Sun, Feb 20, 2011 at 12:39:06PM +0100, Mark Kettenis wrote: Date: Sun, 20 Feb 2011 19:54:21 +1000 From: David Gwynne l...@animata.net how to manipulate write cache policy? the lsi firmwares dont implement handling of the mod page changes unfortunately. you could call the ioctl this implements yourself though from userland. David, while I think that implementing the cache manipulation ioctls for mpii(4) is a good idea, there is a problem here. We don't have a tool in base that actually issues those ioctls. And unless I'm misreading the diff, this still leaves the cache disabled on the stupid Dell. DIOCSCACHE is called in sdattach() to enable write cache for all disks that DIOCGCACHE reports as having write cache disabled. Or are you concerned that we have no way to manipulate it from userland if/when the default needs to be modified? Ah, that's the bit I was missing. A userland tool to display and manipulate the cache settings would still be good though. Functionality should probably be added to bioctl(8). A bit unfortunate that both the -c and -C options are already taken.
Re: Dell R310 - H200 Raid performance problem
On Sun 2011.02.20 at 13:28 +0100, Mark Kettenis wrote: Date: Sun, 20 Feb 2011 07:03:25 -0500 From: Kenneth R Westerback kwesterb...@rogers.com On Sun, Feb 20, 2011 at 12:39:06PM +0100, Mark Kettenis wrote: Date: Sun, 20 Feb 2011 19:54:21 +1000 From: David Gwynne l...@animata.net how to manipulate write cache policy? the lsi firmwares dont implement handling of the mod page changes unfortunately. you could call the ioctl this implements yourself though from userland. David, while I think that implementing the cache manipulation ioctls for mpii(4) is a good idea, there is a problem here. We don't have a tool in base that actually issues those ioctls. And unless I'm misreading the diff, this still leaves the cache disabled on the stupid Dell. DIOCSCACHE is called in sdattach() to enable write cache for all disks that DIOCGCACHE reports as having write cache disabled. Or are you concerned that we have no way to manipulate it from userland if/when the default needs to be modified? Ah, that's the bit I was missing. A userland tool to display and manipulate the cache settings would still be good though. Functionality should probably be added to bioctl(8). A bit unfortunate that both the -c and -C options are already taken. Ah, I had a diff for bioctl (enable/disable WCE/RCD) based on dlg's sample, but I think marco wanted more of a policy of when to do WCE/RCD rather than a switch - I'll send it along when I get home later this week.
Re: Dell R310 - H200 Raid performance problem
bah! On Sun, Feb 20, 2011 at 07:20:19PM +, Stuart Henderson wrote: On 2011/02/20 11:59, Ted Unangst wrote: On Sun, Feb 20, 2011 at 7:28 AM, Mark Kettenis mark.kette...@xs4all.nl wrote: Ah, that's the bit I was missing. A userland tool to display and manipulate the cache settings would still be good though. Functionality should probably be added to bioctl(8). A bit unfortunate that both the -c and -C options are already taken. -w or -W wouldn't be too bad an alternative (_w_rite cache). We also have a scsi(8) tool that seems more analogous to atactl (which can manipulate cache behavior). scsi(8) can manipulate write cache on some drives too. But in this case we're talking about a setting for the volume rather than for drives, so bioctl(8) wouldn't be a bad choice. (I don't know about mpii, but for mpi the vendor management tool in some OS allows you to set this, and bioctl is the closest analogue to this).
Re: Dell R310 - H200 Raid performance problem
With following Mike's suggestions it worked. could you please change this line if (mpii_req_cfg_page(sc, addr, 0, hdr, 1, vpg, pagelen) != 0) { to if (mpii_req_cfg_page(sc, addr, MPII_PG_POLL, hdr, 1, vpg, pagelen) != 0) { and one more: this: if (mpii_req_cfg_header(sc, MPII_CONFIG_REQ_PAGE_TYPE_RAID_VOL, 0, addr, 0, hdr) != 0) to: if (mpii_req_cfg_header(sc, MPII_CONFIG_REQ_PAGE_TYPE_RAID_VOL, 0, addr, MPII_PG_POLL, hdr) != 0) mpii0 at pci2 dev 0 function 0 Symbios Logic SAS2008 rev 0x02: apic 0 int 16 (irq 15) scsibus0 at mpii0: 42 targets sd0 at scsibus0 targ 1 lun 0: Dell, Virtual Disk, 1028 SCSI4 0/direct fixed sd0: 237824MB, 512 bytes/sec, 487063552 sec total ses0 at scsibus0 targ 10 lun 0: DP, BACKPLANE, 1.07 SCSI3 13/enclosure services fixed ses0: unable to read enclosure configuration # scsi -f /dev/rsd0c -m 8 IC: 0 ABPF: 0 CAP: 0 DISC: 0 SIZE: 0 WCE: 1 MF: 0 RCD: 0 Demand Retention Priority: 0 Write Retention Priority: 0 Disable Pre-fetch Transfer Length: 65535 Minimum Pre-fetch: 0 Maximum Pre-fetch: 65280 Maximum Pre-fetch Ceiling: 65535 FSW: 0 LBCSS: 0 DRA: 0 Vendor-specific: 0 NV_DIS: 0 Number of Cache Segments: 15 Cache Segment Size: 0 how to manipulate write cache policy? Lukasz
Re: Dell R310 - H200 Raid performance problem
On 17.02.2011 16:22, Mike Belopuhov wrote: Lukasz has tested the patch below and it works fine for him. I don't have the hardware myself, so I'm not going to push it for the release, but if someone thinks it's worth it, please speak up. Here are some numbers: 4.8 # time tar xzf ./sys.tar.gz 0m11.06s real 0m0.80s user 0m0.86s system w/softdeps 0m4.97s real 0m0.68s user 0m0.58s system Current 0m7.13s real 0m0.75s user 0m0.83s system w/softdeps 0m3.72s real 0m0.60s user 0m0.37s system It seems that 4.9 has a lot of improvements. Big thanks to Mike and all developers. Lukasz
Re: Dell R310 - H200 Raid performance problem
On Thu, Feb 10, 2011 at 14:25 +0100, Lukasz Czarniecki wrote: Hi I've bought a Dell R310 with H200 raid controller reported in dmesg as: Symbios Logic SAS2008. It uses mpii driver and has two hard drives configured in RAID 1. Now it seems to work fine but I still have a problem with its performance. Raid is fully initialized. How can I help to resolve this problem? I'm doing simple benchmark: wget ftp.spline.de/pub/OpenBSD/4.8/sys.tar.gz time tar xzf ./sys.tar.gz On the same hardware Linux unpacks it in less than two seconds. Numbers for OpenBSD: 4.8 amd64 sp: 3m40.95s real 0m0.65s user 0m0.71s system 4.8 amd64 mp-stable: 3m43.36s real 0m0.48s user 0m0.98s system 4.9 amd64 sp: 3m47.72s real 0m0.51s user 0m0.69s system 4.9 i386 rd : 3m45.11s real 0m1.03s user 0m1.19s system Lukasz and I have figured out that disk write cache gets turned off by the Dell firmware when you create a volume (it doesn't get disabled if you use single drives): http://support.dell.com/support/edocs/storage/storlink/h200/en/ug/html/features.htm#wp1062398 The H200 has no onboard memory or battery, and there's no possibility of installing them, so the device becomes pretty much useless unless the operating system takes care of it. Apparently Linux does. Should OpenBSD do the same? In my opinion yes. Lukasz has tested the patch below and it works fine for him. I don't have the hardware myself, so I'm not going to push it for the release, but if someone thinks it's worth it, please speak up. 
Index: mpii.c === RCS file: /home/cvs/src/sys/dev/pci/mpii.c,v retrieving revision 1.37 diff -u -p -r1.37 mpii.c --- mpii.c 29 Dec 2010 03:55:09 - 1.37 +++ mpii.c 17 Feb 2011 15:15:25 - @@ -981,6 +981,52 @@ struct mpii_msg_sas_oper_reply { u_int32_t ioc_loginfo; } __packed; +struct mpii_msg_raid_action_request { + u_int8_taction; +#define MPII_RAID_ACTION_CHANGE_VOL_WRITE_CACHE(0x17) + u_int8_treserved1; + u_int8_tchain_offset; + u_int8_tfunction; + + u_int16_t vol_dev_handle; + u_int8_tphys_disk_num; + u_int8_tmsg_flags; + + u_int8_tvp_id; + u_int8_tvf_if; + u_int16_t reserved2; + + u_int32_t reserved3; + + u_int32_t action_data; +#define MPII_RAID_VOL_WRITE_CACHE_DISABLE (0x01) +#define MPII_RAID_VOL_WRITE_CACHE_ENABLE (0x02) + + struct mpii_sge action_sge; +} __packed; + +struct mpii_msg_raid_action_reply { + u_int8_taction; + u_int8_treserved1; + u_int8_tchain_offset; + u_int8_tfunction; + + u_int16_t vol_dev_handle; + u_int8_tphys_disk_num; + u_int8_tmsg_flags; + + u_int8_tvp_id; + u_int8_tvf_if; + u_int16_t reserved2; + + u_int16_t reserved3; + u_int16_t ioc_status; + + u_int32_t action_data[5]; + + struct mpii_sge action_sge; +} __packed; + struct mpii_cfg_hdr { u_int8_tpage_version; u_int8_tpage_length; @@ -1972,6 +2018,8 @@ int mpii_req_cfg_page(struct mpii_softc intmpii_get_ioc_pg8(struct mpii_softc *); +void mpii_cache_enable(struct mpii_softc *); + #if NBIO 0 intmpii_ioctl(struct device *, u_long, caddr_t); intmpii_ioctl_inq(struct mpii_softc *, struct bioc_inq *); @@ -2175,6 +2223,9 @@ mpii_attach(struct device *parent, struc goto free_dev; } + /* enable write cache */ + mpii_cache_enable(sc); + /* we should be good to go now, attach scsibus */ sc-sc_link.adapter = mpii_switch; sc-sc_link.adapter_softc = sc; @@ -3206,6 +3257,45 @@ mpii_cfg_coalescing(struct mpii_softc *s } return (0); +} + +void +mpii_cache_enable(struct mpii_softc *sc) +{ + struct mpii_msg_raid_action_request *req; + struct mpii_device *dev; + struct mpii_ccb *ccb; + int i; + + ccb = 
scsi_io_get(sc-sc_iopool, 0); + if (ccb == NULL) + return; + + for (i = 0; i sc-sc_max_devices; i++) { + if (sc-sc_devs[i] == NULL || + !ISSET(sc-sc_devs[i]-flags, MPII_DF_VOLUME)) + continue; + + dev = sc-sc_devs[i]; + + ccb-ccb_state = MPII_CCB_READY; + ccb-ccb_rcb = NULL; + ccb-ccb_done = mpii_empty_done; + + req = ccb-ccb_cmd; + bzero(req, sizeof(*req)); + req-function = MPII_FUNCTION_RAID_ACTION; + req-action = MPII_RAID_ACTION_CHANGE_VOL_WRITE_CACHE; +
Re: Dell R310 - H200 Raid performance problem
On Thu, Feb 17, 2011 at 04:22:54PM +0100, Mike Belopuhov wrote: On Thu, Feb 10, 2011 at 14:25 +0100, Lukasz Czarniecki wrote: Hi I've bought a Dell R310 with H200 raid controller reported in dmesg as: Symbios Logic SAS2008. It uses mpii driver and has two hard drives configured in RAID 1. Now it seems to work fine but I still have a problem with its performance. Raid is fully initialized. How can I help to resolve this problem? I'm doing simple benchmark: wget ftp.spline.de/pub/OpenBSD/4.8/sys.tar.gz time tar xzf ./sys.tar.gz On the same hardware Linux unpacks it in less than two seconds. Numbers for OpenBSD: 4.8 amd64 sp: 3m40.95s real 0m0.65s user 0m0.71s system 4.8 amd64 mp-stable: 3m43.36s real 0m0.48s user 0m0.98s system 4.9 amd64 sp: 3m47.72s real 0m0.51s user 0m0.69s system 4.9 i386 rd : 3m45.11s real 0m1.03s user 0m1.19s system Lukasz and I have figured out that disk write cache gets turned off by the Dell firmware when you create a volume (it doesn't get disabled if you use single drives): http://support.dell.com/support/edocs/storage/storlink/h200/en/ug/html/features.htm#wp1062398 The H200 has no onboard memory or battery, and there's no possibility of installing them, so the device becomes pretty much useless unless the operating system takes care of it. Apparently Linux does. Should OpenBSD do the same? In my opinion yes. Linux does this and we should too. All SATA manufacturers recommend (read: recommend very very strongly and call you names when you don't listen) enabling write cache. Lukasz has tested the patch below and it works fine for him. I don't have the hardware myself, so I'm not going to push it for the release, but if someone thinks it's worth it, please speak up. I am ok with this making release and think it should. I did not realize WB was being disabled. 
Index: mpii.c === RCS file: /home/cvs/src/sys/dev/pci/mpii.c,v retrieving revision 1.37 diff -u -p -r1.37 mpii.c --- mpii.c29 Dec 2010 03:55:09 - 1.37 +++ mpii.c17 Feb 2011 15:15:25 - @@ -981,6 +981,52 @@ struct mpii_msg_sas_oper_reply { u_int32_t ioc_loginfo; } __packed; +struct mpii_msg_raid_action_request { + u_int8_taction; +#define MPII_RAID_ACTION_CHANGE_VOL_WRITE_CACHE (0x17) + u_int8_treserved1; + u_int8_tchain_offset; + u_int8_tfunction; + + u_int16_t vol_dev_handle; + u_int8_tphys_disk_num; + u_int8_tmsg_flags; + + u_int8_tvp_id; + u_int8_tvf_if; + u_int16_t reserved2; + + u_int32_t reserved3; + + u_int32_t action_data; +#define MPII_RAID_VOL_WRITE_CACHE_DISABLE(0x01) +#define MPII_RAID_VOL_WRITE_CACHE_ENABLE (0x02) + + struct mpii_sge action_sge; +} __packed; + +struct mpii_msg_raid_action_reply { + u_int8_taction; + u_int8_treserved1; + u_int8_tchain_offset; + u_int8_tfunction; + + u_int16_t vol_dev_handle; + u_int8_tphys_disk_num; + u_int8_tmsg_flags; + + u_int8_tvp_id; + u_int8_tvf_if; + u_int16_t reserved2; + + u_int16_t reserved3; + u_int16_t ioc_status; + + u_int32_t action_data[5]; + + struct mpii_sge action_sge; +} __packed; + struct mpii_cfg_hdr { u_int8_tpage_version; u_int8_tpage_length; @@ -1972,6 +2018,8 @@ int mpii_req_cfg_page(struct mpii_softc int mpii_get_ioc_pg8(struct mpii_softc *); +void mpii_cache_enable(struct mpii_softc *); + #if NBIO 0 int mpii_ioctl(struct device *, u_long, caddr_t); int mpii_ioctl_inq(struct mpii_softc *, struct bioc_inq *); @@ -2175,6 +2223,9 @@ mpii_attach(struct device *parent, struc goto free_dev; } + /* enable write cache */ + mpii_cache_enable(sc); + /* we should be good to go now, attach scsibus */ sc-sc_link.adapter = mpii_switch; sc-sc_link.adapter_softc = sc; @@ -3206,6 +3257,45 @@ mpii_cfg_coalescing(struct mpii_softc *s } return (0); +} + +void +mpii_cache_enable(struct mpii_softc *sc) +{ + struct mpii_msg_raid_action_request *req; + struct mpii_device *dev; + struct mpii_ccb *ccb; + int i; + + ccb = 
scsi_io_get(sc-sc_iopool, 0); + if (ccb == NULL) + return; + + for (i = 0; i sc-sc_max_devices; i++) { + if (sc-sc_devs[i] == NULL || + !ISSET(sc-sc_devs[i]-flags, MPII_DF_VOLUME)) + continue; + + dev = sc-sc_devs[i]; + +
Re: Dell R310 - H200 Raid performance problem
this diff implements the disk cache ioctl handling in mpii so sd(4) can drive the change rather than have mpii(4) whack everything. modelled on the same functionality in mpi(4) and mikeb's code... could someone test this please? Index: mpii.c === RCS file: /cvs/src/sys/dev/pci/mpii.c,v retrieving revision 1.37 diff -u -p -r1.37 mpii.c --- mpii.c 29 Dec 2010 03:55:09 - 1.37 +++ mpii.c 18 Feb 2011 06:54:58 - @@ -29,6 +29,7 @@ #include sys/kernel.h #include sys/rwlock.h #include sys/sensors.h +#include sys/dkio.h #include sys/tree.h #include machine/bus.h @@ -981,6 +982,52 @@ struct mpii_msg_sas_oper_reply { u_int32_t ioc_loginfo; } __packed; +struct mpii_msg_raid_action_request { + u_int8_taction; +#define MPII_RAID_ACTION_CHANGE_VOL_WRITE_CACHE(0x17) + u_int8_treserved1; + u_int8_tchain_offset; + u_int8_tfunction; + + u_int16_t vol_dev_handle; + u_int8_tphys_disk_num; + u_int8_tmsg_flags; + + u_int8_tvp_id; + u_int8_tvf_if; + u_int16_t reserved2; + + u_int32_t reserved3; + + u_int32_t action_data; +#define MPII_RAID_VOL_WRITE_CACHE_DISABLE (0x01) +#define MPII_RAID_VOL_WRITE_CACHE_ENABLE (0x02) + + struct mpii_sge action_sge; +} __packed; + +struct mpii_msg_raid_action_reply { + u_int8_taction; + u_int8_treserved1; + u_int8_tchain_offset; + u_int8_tfunction; + + u_int16_t vol_dev_handle; + u_int8_tphys_disk_num; + u_int8_tmsg_flags; + + u_int8_tvp_id; + u_int8_tvf_if; + u_int16_t reserved2; + + u_int16_t reserved3; + u_int16_t ioc_status; + + u_int32_t action_data[5]; + + struct mpii_sge action_sge; +} __packed; + struct mpii_cfg_hdr { u_int8_tpage_version; u_int8_tpage_length; @@ -1256,6 +1303,11 @@ struct mpii_cfg_raid_vol_pg0 { #define MPII_CFG_RAID_VOL_0_STATUS_RESYNC (116) u_int16_t volume_settings; +#define MPII_CFG_RAID_VOL_0_SETTINGS_CACHE_MASK(0x30) +#define MPII_CFG_RAID_VOL_0_SETTINGS_CACHE_UNCHANGED (0x00) +#define MPII_CFG_RAID_VOL_0_SETTINGS_CACHE_DISABLED(0x10) +#define MPII_CFG_RAID_VOL_0_SETTINGS_CACHE_ENABLED (0x20) + u_int8_thot_spare_pool; 
u_int8_treserved1; @@ -1972,6 +2024,8 @@ int mpii_req_cfg_page(struct mpii_softc intmpii_get_ioc_pg8(struct mpii_softc *); +intmpii_ioctl_cache(struct scsi_link *, u_long, struct dk_cache *); + #if NBIO 0 intmpii_ioctl(struct device *, u_long, caddr_t); intmpii_ioctl_inq(struct mpii_softc *, struct bioc_inq *); @@ -4650,19 +4704,113 @@ mpii_scsi_cmd_done(struct mpii_ccb *ccb) mpii_push_reply(sc, ccb-ccb_rcb); scsi_done(xs); -} +} int mpii_scsi_ioctl(struct scsi_link *link, u_long cmd, caddr_t addr, int flag) { struct mpii_softc *sc = (struct mpii_softc *)link-adapter_softc; + struct mpii_device *dev = sc-sc_devs[link-target]; DNPRINTF(MPII_D_IOCTL, %s: mpii_scsi_ioctl\n, DEVNAME(sc)); - if (sc-sc_ioctl) - return (sc-sc_ioctl(link-adapter_softc, cmd, addr)); - else - return (ENOTTY); + switch (cmd) { + case DIOCGCACHE: + case DIOCSCACHE: + if (dev != NULL ISSET(dev-flags, MPII_DF_VOLUME)) { + return (mpii_ioctl_cache(link, cmd, + (struct dk_cache *)addr)); + } + break; + + default: + if (sc-sc_ioctl) + return (sc-sc_ioctl(link-adapter_softc, cmd, addr)); + + break; + } + + return (ENOTTY); +} + +int +mpii_ioctl_cache(struct scsi_link *link, u_long cmd, struct dk_cache *dc) +{ + struct mpii_softc *sc = (struct mpii_softc *)link-adapter_softc; + struct mpii_device *dev = sc-sc_devs[link-target]; + struct mpii_cfg_raid_vol_pg0 *vpg; + struct mpii_msg_raid_action_request *req; + struct mpii_cfg_hdr hdr; + struct mpii_ccb *ccb; + u_int32_t addr = MPII_CFG_RAID_VOL_ADDR_HANDLE | dev-dev_handle; + size_t pagelen; + int rv = 0; + int enabled; + + if (mpii_req_cfg_header(sc, MPII_CONFIG_REQ_PAGE_TYPE_RAID_VOL, 0, + addr, 0, hdr) != 0) + return (EINVAL); + + pagelen = hdr.page_length * 4; + vpg = malloc(pagelen, M_TEMP, M_WAITOK | M_CANFAIL | M_ZERO); + if (vpg == NULL) + return (ENOMEM); + + if (mpii_req_cfg_page(sc, addr, 0, hdr, 1,