[PATCH] virtio_scsi: don't check for failure from mempool_alloc()
mempool_alloc() cannot fail when passed GFP_NOIO or any other gfp setting that is permitted to sleep. So remove this pointless code. Signed-off-by: NeilBrown <ne...@suse.com> --- drivers/scsi/virtio_scsi.c | 6 -- 1 file changed, 6 deletions(-) diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 939c47df73fa..7c2a5f7c5fb7 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -688,9 +688,6 @@ static int virtscsi_device_reset(struct scsi_cmnd *sc) sdev_printk(KERN_INFO, sc->device, "device reset\n"); cmd = mempool_alloc(virtscsi_cmd_pool, GFP_NOIO); - if (!cmd) - return FAILED; - memset(cmd, 0, sizeof(*cmd)); cmd->sc = sc; cmd->req.tmf = (struct virtio_scsi_ctrl_tmf_req){ @@ -725,9 +722,6 @@ static int virtscsi_abort(struct scsi_cmnd *sc) scmd_printk(KERN_INFO, sc, "abort\n"); cmd = mempool_alloc(virtscsi_cmd_pool, GFP_NOIO); - if (!cmd) - return FAILED; - memset(cmd, 0, sizeof(*cmd)); cmd->sc = sc; cmd->req.tmf = (struct virtio_scsi_ctrl_tmf_req){ -- 2.12.2 signature.asc Description: PGP signature
[PATCH] scsi: ibmvfc: don't check for failure from mempool_alloc()
mempool_alloc() cannot fail when passed GFP_NOIO or any other gfp setting that is permitted to sleep. So remove this pointless code. Signed-off-by: NeilBrown <ne...@suse.com> --- drivers/scsi/ibmvscsi/ibmvfc.c | 6 -- 1 file changed, 6 deletions(-) diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index 2c92dabb55f6..26cd3c28186a 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -3910,12 +3910,6 @@ static int ibmvfc_alloc_target(struct ibmvfc_host *vhost, u64 scsi_id) spin_unlock_irqrestore(vhost->host->host_lock, flags); tgt = mempool_alloc(vhost->tgt_pool, GFP_NOIO); - if (!tgt) { - dev_err(vhost->dev, "Target allocation failure for scsi id %08llx\n", - scsi_id); - return -ENOMEM; - } - memset(tgt, 0, sizeof(*tgt)); tgt->scsi_id = scsi_id; tgt->new_scsi_id = scsi_id; -- 2.12.2 signature.asc Description: PGP signature
[PATCH] scsi: lpfc: don't check for failure of mempool_alloc()
mempool_alloc() cannot fail if the gfp flags allow sleeping, specifically if __GFP_DIRECT_RECLAIM is included. GFP_KERNEL includes this flag and allows sleeping, so none of this error checking provides any value. Signed-off-by: NeilBrown <ne...@suse.com> --- drivers/scsi/lpfc/lpfc_attr.c | 9 -- drivers/scsi/lpfc/lpfc_bsg.c | 32 - drivers/scsi/lpfc/lpfc_els.c | 118 drivers/scsi/lpfc/lpfc_hbadisc.c | 270 +++-- drivers/scsi/lpfc/lpfc_init.c | 89 drivers/scsi/lpfc/lpfc_mbox.c | 34 +++-- drivers/scsi/lpfc/lpfc_nportdisc.c | 38 +- drivers/scsi/lpfc/lpfc_nvme.c | 6 - drivers/scsi/lpfc/lpfc_sli.c | 131 -- drivers/scsi/lpfc/lpfc_vport.c | 5 - 10 files changed, 153 insertions(+), 579 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index 22819afbaef5..33f45d3d7487 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -856,9 +856,6 @@ lpfc_issue_lip(struct Scsi_Host *shost) pmboxq = mempool_alloc(phba->mbox_mem_pool,GFP_KERNEL); - if (!pmboxq) - return -ENOMEM; - memset((void *)pmboxq, 0, sizeof (LPFC_MBOXQ_t)); pmboxq->u.mb.mbxCommand = MBX_DOWN_LINK; pmboxq->u.mb.mbxOwner = OWN_HOST; @@ -1411,8 +1408,6 @@ lpfc_get_hba_info(struct lpfc_hba *phba, return 0; pmboxq = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL); - if (!pmboxq) - return 0; memset(pmboxq, 0, sizeof (LPFC_MBOXQ_t)); pmb = >u.mb; @@ -5670,8 +5665,6 @@ lpfc_get_stats(struct Scsi_Host *shost) return NULL; pmboxq = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL); - if (!pmboxq) - return NULL; memset(pmboxq, 0, sizeof (LPFC_MBOXQ_t)); pmb = >u.mb; @@ -5786,8 +5779,6 @@ lpfc_reset_stats(struct Scsi_Host *shost) return; pmboxq = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL); - if (!pmboxq) - return; memset(pmboxq, 0, sizeof(LPFC_MBOXQ_t)); pmb = >u.mb; diff --git a/drivers/scsi/lpfc/lpfc_bsg.c b/drivers/scsi/lpfc/lpfc_bsg.c index 18157d2840a3..7b3fcfabede1 100644 --- a/drivers/scsi/lpfc/lpfc_bsg.c +++ b/drivers/scsi/lpfc/lpfc_bsg.c @@ -1828,10 +1828,6 
@@ lpfc_sli3_bsg_diag_loopback_mode(struct lpfc_hba *phba, struct bsg_job *job) timeout = loopback_mode->timeout * 100; pmboxq = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL); - if (!pmboxq) { - rc = -ENOMEM; - goto loopback_mode_exit; - } memset((void *)pmboxq, 0, sizeof(LPFC_MBOXQ_t)); pmboxq->u.mb.mbxCommand = MBX_DOWN_LINK; pmboxq->u.mb.mbxOwner = OWN_HOST; @@ -1921,8 +1917,6 @@ lpfc_sli4_bsg_set_link_diag_state(struct lpfc_hba *phba, uint32_t diag) int mbxstatus = MBX_SUCCESS, rc; pmboxq = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL); - if (!pmboxq) - return -ENOMEM; req_len = (sizeof(struct lpfc_mbx_set_link_diag_state) - sizeof(struct lpfc_sli4_cfg_mhdr)); @@ -1982,8 +1976,6 @@ lpfc_sli4_bsg_set_internal_loopback(struct lpfc_hba *phba) int mbxstatus = MBX_SUCCESS, rc = 0; pmboxq = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL); - if (!pmboxq) - return -ENOMEM; req_len = (sizeof(struct lpfc_mbx_set_link_diag_loopback) - sizeof(struct lpfc_sli4_cfg_mhdr)); alloc_len = lpfc_sli4_config(phba, pmboxq, LPFC_MBOX_SUBSYSTEM_FCOE, @@ -2384,10 +2376,6 @@ lpfc_sli4_bsg_link_diag_test(struct bsg_job *job) goto job_error; pmboxq = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL); - if (!pmboxq) { - rc = -ENOMEM; - goto link_diag_test_exit; - } req_len = (sizeof(struct lpfc_mbx_set_link_diag_state) - sizeof(struct lpfc_sli4_cfg_mhdr)); @@ -2477,8 +2465,6 @@ static int lpfcdiag_loop_self_reg(struct lpfc_hba *phba, uint16_t *rpi) int status; mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL); - if (!mbox) - return -ENOMEM; if (phba->sli_rev < LPFC_SLI_REV4) status = lpfc_reg_rpi(phba, 0, phba->pport->fc_myDID, @@ -2537,8 +2523,6 @@ static int lpfcdiag_loop_self_unreg(struct lpfc_hba *phba, uint16_t rpi) /* Allocate mboxq structure */ mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL); - if (mbox == NULL) - return -ENOMEM; if (phba->sli_rev < LPFC_SLI_REV4) lpfc_unreg_login(phba, 0, rpi, mbox); @@ -3917,10 +3901,6 @@ lpfc_bsg_sli_cfg_read_cmd_ext(struct lpfc_hba 
*phba, struct bsg_job *job, /* mailbox command structure for
Re: [Lsf-pc] [LSF/MM TOPIC] do we really need PG_error at all?
On Mon, Feb 27 2017, Jeff Layton wrote: > On Tue, 2017-02-28 at 10:32 +1100, NeilBrown wrote: >> On Mon, Feb 27 2017, Andreas Dilger wrote: >> >> > >> > My thought is that PG_error is definitely useful for applications to get >> > correct errors back when doing write()/sync_file_range() so that they know >> > there is an error in the data that _they_ wrote, rather than receiving an >> > error for data that may have been written by another thread, and in turn >> > clearing the error from another thread so it *doesn't* know it had a write >> > error. >> >> It might be useful in that way, but it is not currently used that way. >> Such usage would be a change in visible behaviour. >> >> sync_file_range() calls filemap_fdatawait_range(), which calls >> filemap_check_errors(). >> If there have been any errors in the file recently, inside or outside >> the range, the latter will return an error which will propagate up. >> >> > >> > As for stray sync() clearing PG_error from underneath an application, that >> > shouldn't happen since filemap_fdatawait_keep_errors() doesn't clear errors >> > and is used by device flushing code (fdatawait_one_bdev(), >> > wait_sb_inodes()). >> >> filemap_fdatawait_keep_errors() calls __filemap_fdatawait_range() which >> clears PG_error on every page. >> What it doesn't do is call filemap_check_errors(), and so doesn't clear >> AS_ENOSPC or AS_EIO. 
>> >> > > I think it's helpful to get a clear idea of what happens now in the face > of errors and what we expect to happen, and I don't quite have that yet: > > --8<- > void page_endio(struct page *page, bool is_write, int err) > { > if (!is_write) { > if (!err) { > SetPageUptodate(page); > } else { > ClearPageUptodate(page); > SetPageError(page); > } > unlock_page(page); > } else { > if (err) { > SetPageError(page); > if (page->mapping) > mapping_set_error(page->mapping, err); > } > end_page_writeback(page); > } > } > --8<- > > ...not everything uses page_endio, but it's a good place to look since > we have both flavors of error handling in one place. > > On a write error, we SetPageError and set the error in the mapping. > > What I'm not clear on is: > > 1) what happens to the page at that point when we get a writeback error? > Does it just remain in-core and is allowed to service reads (assuming > that it was uptodate before)? Yes, it remains in core and can service reads. It is no different from a page on which a write recent succeeded, except that the write didn't succeed so the contents of backing store might be different from the contents of the page. > > Can I redirty it and have it retry the write? Is there standard behavior > for this or is it just up to the whim of the filesystem? Everything is at the whim of the filesystem, but I doubt if many differ from the above. NeilBrown > > I'll probably have questions about the read side as well, but for now it > looks like it's mostly used in an ad-hoc way to communicate errors > across subsystems (block to fs layer, for instance). > -- > Jeff Layton <jlay...@redhat.com> signature.asc Description: PGP signature
Re: [LSF/MM TOPIC] do we really need PG_error at all?
On Mon, Feb 27 2017, Andreas Dilger wrote: > > My thought is that PG_error is definitely useful for applications to get > correct errors back when doing write()/sync_file_range() so that they know > there is an error in the data that _they_ wrote, rather than receiving an > error for data that may have been written by another thread, and in turn > clearing the error from another thread so it *doesn't* know it had a write > error. It might be useful in that way, but it is not currently used that way. Such usage would be a change in visible behaviour. sync_file_range() calls filemap_fdatawait_range(), which calls filemap_check_errors(). If there have been any errors in the file recently, inside or outside the range, the latter will return an error which will propagate up. > > As for stray sync() clearing PG_error from underneath an application, that > shouldn't happen since filemap_fdatawait_keep_errors() doesn't clear errors > and is used by device flushing code (fdatawait_one_bdev(), wait_sb_inodes()). filemap_fdatawait_keep_errors() calls __filemap_fdatawait_range() which clears PG_error on every page. What it doesn't do is call filemap_check_errors(), and so doesn't clear AS_ENOSPC or AS_EIO. NeilBrown signature.asc Description: PGP signature
Re: [LSF/MM TOPIC] do we really need PG_error at all?
On Sun, Feb 26 2017, James Bottomley wrote: > On Mon, 2017-02-27 at 08:03 +1100, NeilBrown wrote: >> On Sun, Feb 26 2017, James Bottomley wrote: >> >> > [added linux-scsi and linux-block because this is part of our error >> > handling as well] >> > On Sun, 2017-02-26 at 09:42 -0500, Jeff Layton wrote: >> > > Proposing this as a LSF/MM TOPIC, but it may turn out to be me >> > > just not understanding the semantics here. >> > > >> > > As I was looking into -ENOSPC handling in cephfs, I noticed that >> > > PG_error is only ever tested in one place [1] >> > > __filemap_fdatawait_range, which does this: >> > > >> > > if (TestClearPageError(page)) >> > > ret = -EIO; >> > > >> > > This error code will override any AS_* error that was set in the >> > > mapping. Which makes me wonder...why don't we just set this error >> > > in the mapping and not bother with a per-page flag? Could we >> > > potentially free up a page flag by eliminating this? >> > >> > Note that currently the AS_* codes are only set for write errors >> > not for reads and we have no mapping error handling at all for swap >> > pages, but I'm sure this is fixable. >> >> How is a read error different from a failure to set PG_uptodate? >> Does PG_error suppress retries? > > We don't do any retries in the code above the block layer (or at least > we shouldn't). I was wondering about what would/should happen if a read request was re-issued for some reason. Should the error flag on the page cause an immediate failure, or should it try again. If read-ahead sees a read-error on some future page, is it necessary to record the error so subsequent read-aheads don't notice the page is missing and repeatedly try to re-load it? When the application eventually gets to the faulty page, should a read be tried then, or is the read-ahead failure permanent? > >> > >> > From the I/O layer point of view we take great pains to try to >> > pinpoint the error exactly to the sector. 
We reflect this up by >> > setting the PG_error flag on the page where the error occurred. If >> > we only set the error on the mapping, we lose that granularity, >> > because the mapping is mostly at the file level (or VMA level for >> > anon pages). >> >> Are you saying that the IO layer finds the page in the bi_io_vec and >> explicitly sets PG_error, > > I didn't say anything about the mechanism. I think the function you're > looking for is fs/mpage.c:mpage_end_io(). layers below block indicate > the position in the request. Block maps the position to bio and the > bio completion maps to page. So the actual granularity seen in the > upper layer depends on how the page to bio mapping is done. If the block layer is just returning the status at a per-bio level (which makes perfect sense), then this has nothing directly to do with the PG_error flag. The page cache needs to do something with bi_error, but it isn't immediately clear that it needs to set PG_error. > >> rather than just passing an error indication to bi_end_io ?? That >> would seem to be wrong as the page may not be in the page cache. > > Usually pages in the mpage_end_io path are pinned, I think. > >> So I guess I misunderstand you. >> >> > >> > So I think the question for filesystem people from us would be do >> > you care about this accuracy? If it's OK just to know an error >> > occurred somewhere in this file, then perhaps we don't need it. >> >> I had always assumed that a bio would either succeed or fail, and >> that no finer granularity could be available. > > It does ... but a bio can be as small as a single page. > >> I think the question here is: Do filesystems need the pagecache to >> record which pages have seen an IO error? > > It's not just filesystems. The partition code uses PageError() ... the > metadata code might as well (those are things with no mapping). I'm > not saying we can't remove PG_error; I am saying it's not going to be > quite as simple as using the AS_ flags. 
The partition code could use PageUptodate(). mpage_end_io() calls page_endio() on each page, and on read error that calls: ClearPageUptodate(page); SetPageError(page); are both of these necessary? fs/buffer.c can use several bios to read a single page. If any one returns an error, PG_error is set. When all of them have completed, if PG_error is clear, PG_uptodate is then set. This is an opportunistic use of PG_error, rather than an essent
Re: [LSF/MM TOPIC] do we really need PG_error at all?
On Sun, Feb 26 2017, James Bottomley wrote: > [added linux-scsi and linux-block because this is part of our error > handling as well] > On Sun, 2017-02-26 at 09:42 -0500, Jeff Layton wrote: >> Proposing this as a LSF/MM TOPIC, but it may turn out to be me just >> not understanding the semantics here. >> >> As I was looking into -ENOSPC handling in cephfs, I noticed that >> PG_error is only ever tested in one place [1] >> __filemap_fdatawait_range, which does this: >> >> if (TestClearPageError(page)) >> ret = -EIO; >> >> This error code will override any AS_* error that was set in the >> mapping. Which makes me wonder...why don't we just set this error in >> the mapping and not bother with a per-page flag? Could we potentially >> free up a page flag by eliminating this? > > Note that currently the AS_* codes are only set for write errors not > for reads and we have no mapping error handling at all for swap pages, > but I'm sure this is fixable. How is a read error different from a failure to set PG_uptodate? Does PG_error suppress retries? > > From the I/O layer point of view we take great pains to try to pinpoint > the error exactly to the sector. We reflect this up by setting the > PG_error flag on the page where the error occurred. If we only set the > error on the mapping, we lose that granularity, because the mapping is > mostly at the file level (or VMA level for anon pages). Are you saying that the IO layer finds the page in the bi_io_vec and explicitly sets PG_error, rather than just passing an error indication to bi_end_io ?? That would seem to be wrong as the page may not be in the page cache. So I guess I misunderstand you. > > So I think the question for filesystem people from us would be do you > care about this accuracy? If it's OK just to know an error occurred > somewhere in this file, then perhaps we don't need it. I had always assumed that a bio would either succeed or fail, and that no finer granularity could be available. 
I think the question here is: Do filesystems need the pagecache to record which pages have seen an IO error? I think that for write errors, there is no value in recording block-oriented error status - only file-oriented status. For read errors, it might if help to avoid indefinite read retries, but I don't know the code well enough to be sure if this is an issue. NeilBrown > > James > >> The main argument I could see for keeping it is that removing it >> might subtly change the behavior of sync_file_range if you have tasks >> syncing different ranges in a file concurrently. I'm not sure if that >> would break any guarantees though. >> >> Even if we do need it, I think we might need some cleanup here >> anyway. A lot of readpage operations end up setting that flag when >> they hit an error. Isn't it wrong to return an error on fsync, just >> because we had a read error somewhere in the file in a range that was >> never dirtied? >> >> -- >> [1]: there is another place in f2fs, but it's more or less equivalent >> to the call site in __filemap_fdatawait_range. >> signature.asc Description: PGP signature
Re: [PATCH v2 1/3] badblocks: Add core badblock management code
On Wed, Dec 23 2015, Verma, Vishal L wrote: > On Tue, 2015-12-22 at 16:34 +1100, NeilBrown wrote: >> On Sat, Dec 05 2015, Verma, Vishal L wrote: >> >> > On Fri, 2015-12-04 at 15:30 -0800, James Bottomley wrote: >> > [...] >> > > > +ssize_t badblocks_store(struct badblocks *bb, const char *page, >> > > > size_t len, >> > > > + int unack) >> > > [...] >> > > > +int badblocks_init(struct badblocks *bb, int enable) >> > > > +{ >> > > > + bb->count = 0; >> > > > + if (enable) >> > > > + bb->shift = 0; >> > > > + else >> > > > + bb->shift = -1; >> > > > + bb->page = kmalloc(PAGE_SIZE, GFP_KERNEL); >> > > >> > > Why not __get_free_page(GFP_KERNEL)? The problem with kmalloc of >> > > an >> > > exactly known page sized quantity is that the slab tracker for >> > > this >> > > requires two contiguous pages for each page because of the >> > > overhead. >> > >> > Cool, I didn't know about __get_free_page - I can fix this up too. >> > >> >> I was reminded of this just recently I thought I should clear up the >> misunderstanding. >> >> kmalloc(PAGE_SIZE) does *not* incur significant overhead and certainly >> does not require two contiguous free pages. >> If you "grep kmalloc-4096 /proc/slabinfo" you will note that both >> objperslab and pagesperslab are 1. So one page is used to store each >> 4096 byte allocation. >> >> To quote the email from Linus which reminded me about this >> >> > If you >> > want to allocate a page, and get a pointer, just use "kmalloc()". >> > Boom, done! >> >> https://lkml.org/lkml/2015/12/21/605 >> >> There probably is a small CPU overhead from using kmalloc, but no >> memory >> overhead. > > Thanks Neil. > I just read the rest of that thread - and I'm wondering if we should > change back to kzalloc here. > > The one thing __get_free_page gets us is PAGE_SIZE-aligned memory. Do > you think that would be better for this use? (I can't think of any). If > not, I can send out a new version reverting back to kzalloc. 
kzalloc(PAGE_SIZE) will also always return page-aligned memory. kzalloc returns a void*, __get_free_page returns unsigned long. For that reason alone I would prefer kzalloc. But I'm not necessarily suggesting you change the code. I just wanted to clarify a misunderstanding. You should produce the code that you are most happy with. NeilBrown signature.asc Description: PGP signature
Re: [PATCH v2 1/3] badblocks: Add core badblock management code
On Sat, Dec 05 2015, Verma, Vishal L wrote: > On Fri, 2015-12-04 at 15:30 -0800, James Bottomley wrote: > [...] >> > +ssize_t badblocks_store(struct badblocks *bb, const char *page, >> > size_t len, >> > + int unack) >> [...] >> > +int badblocks_init(struct badblocks *bb, int enable) >> > +{ >> > + bb->count = 0; >> > + if (enable) >> > + bb->shift = 0; >> > + else >> > + bb->shift = -1; >> > + bb->page = kmalloc(PAGE_SIZE, GFP_KERNEL); >> >> Why not __get_free_page(GFP_KERNEL)? The problem with kmalloc of an >> exactly known page sized quantity is that the slab tracker for this >> requires two contiguous pages for each page because of the overhead. > > Cool, I didn't know about __get_free_page - I can fix this up too. > I was reminded of this just recently I thought I should clear up the misunderstanding. kmalloc(PAGE_SIZE) does *not* incur significant overhead and certainly does not require two contiguous free pages. If you "grep kmalloc-4096 /proc/slabinfo" you will note that both objperslab and pagesperslab are 1. So one page is used to store each 4096 byte allocation. To quote the email from Linus which reminded me about this > If you > want to allocate a page, and get a pointer, just use "kmalloc()". > Boom, done! https://lkml.org/lkml/2015/12/21/605 There probably is a small CPU overhead from using kmalloc, but no memory overhead. NeilBrown signature.asc Description: PGP signature
Re: [PATCH v4 0/3] Badblock tracking for gendisks
On Wed, Dec 09 2015, Vishal Verma <vishal.l.ve...@intel.com> wrote: > > Patch 3 converts md over to use the new badblocks 'library'. I have > done some pretty simple testing on this - created a raid 1 device, > made sure the sysfs entries show up, and can be used to add and view > badblocks. A closer look by the md folks would be nice here. Hi, this all looks reasonably sensible. I haven't gone over it with a fine toothed comb (I wonder where that analogy comes from) but nothing is obviously wrong. Acked-by: NeilBrown <ne...@suse.com> Thanks, NeilBrown signature.asc Description: PGP signature
Re: [PATCH v2 1/3] badblocks: Add core badblock management code
On Sat, Dec 05 2015, Verma, Vishal L wrote: >> >> > +int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) >> > +{ >> [...] >> > +#define DO_DEBUG 1 >> >> Why have this at all if it's unconditionally defined and always set. > > Neil - any reason or anything you had in mind for this? Or is it just an > artifact and can be removed. Like the comment says: /* Allow clearing via sysfs *only* for testing/debugging. * Normally only a successful write may clear a badblock */ The DO_DEBUG define and ifdefs are documentation identifying bits of code that should be removed when it all seems to be working. Maybe now is a good time to remove that code. NeilBrown signature.asc Description: PGP signature
Re: [dm-devel] kernel BUG at drivers/scsi/scsi_lib.c:1101! observed during md5sum for one file on (RAID4-RAID0) device
On Thu, 30 Jul 2015 06:28:06 -0700 James Bottomley james.bottom...@hansenpartnership.com wrote: On Thu, 2015-07-30 at 05:03 -0400, Yi Zhang wrote: Hi SCSI/RAID maintainer During raid test with 4.2.0-rc3, I observed below kernel BUG, pls check below info for the test log/environment/test steps. Log: [ 306.741662] md: bindsdb1 [ 306.750865] md: bindsdc1 [ 306.753993] md: bindsdd1 [ 306.764475] md: bindsde1 [ 306.786156] md: bindsdf1 [ 306.789362] md: bindsdh1 [ 306.792555] md: bindsdg1 [ 306.868166] raid6: sse2x1 gen() 10589 MB/s [ 306.889143] raid6: sse2x1 xor() 8218 MB/s [ 306.910121] raid6: sse2x2 gen() 13453 MB/s [ 306.931102] raid6: sse2x2 xor() 8990 MB/s [ 306.952079] raid6: sse2x4 gen() 15539 MB/s [ 306.973063] raid6: sse2x4 xor() 10771 MB/s [ 306.994039] raid6: avx2x1 gen() 20582 MB/s [ 307.015017] raid6: avx2x2 gen() 24019 MB/s [ 307.035998] raid6: avx2x4 gen() 27824 MB/s [ 307.040755] raid6: using algorithm avx2x4 gen() 27824 MB/s [ 307.046869] raid6: using avx2x2 recovery algorithm [ 307.058793] async_tx: api initialized (async) [ 307.075428] xor: automatically using best checksumming function: [ 307.091942]avx : 32008.000 MB/sec [ 307.147662] md: raid6 personality registered for level 6 [ 307.153584] md: raid5 personality registered for level 5 [ 307.159505] md: raid4 personality registered for level 4 [ 307.165698] md/raid:md0: device sdf1 operational as raid disk 4 [ 307.172300] md/raid:md0: device sde1 operational as raid disk 3 [ 307.178899] md/raid:md0: device sdd1 operational as raid disk 2 [ 307.185497] md/raid:md0: device sdc1 operational as raid disk 1 [ 307.192093] md/raid:md0: device sdb1 operational as raid disk 0 [ 307.199052] md/raid:md0: allocated 6482kB [ 307.203573] md/raid:md0: raid level 4 active with 5 out of 6 devices, algorithm 0 [ 307.211958] md0: detected capacity change from 0 to 53645148160 [ 307.218658] md: recovery of RAID array md0 [ 307.223226] md: minimum _guaranteed_ speed: 1000 KB/sec/disk. 
[ 307.229729] md: using maximum available idle IO bandwidth (but not more than 20 KB/sec) for recovery. [ 307.240427] md: using 128k window, over a total of 10477568k. [ 374.670951] md: md0: recovery done. [ 375.722806] EXT4-fs (md0): mounted filesystem with ordered data mode. Opts: (null) [ 447.553364] md: unbindsdh1 [ 447.559905] md: export_rdev(sdh1) [ 447.572684] md: cannot remove active disk sdg1 from md0 ... [ 447.578909] md/raid:md0: Disk failure on sdg1, disabling device. [ 447.578909] md/raid:md0: Operation continuing on 5 devices. [ 447.594850] md: unbindsdg1 [ 447.601834] md: export_rdev(sdg1) [ 447.615446] md: raid0 personality registered for level 0 [ 447.629275] md/raid0:md0: md_size is 104775680 sectors. [ 447.635094] md: RAID0 configuration for md0 - 1 zone [ 447.640627] md: zone0=[sdb1/sdc1/sdd1/sde1/sdf1] [ 447.645833] zone-offset= 0KB, device-offset= 0KB, size= 52387840KB [ 447.654949] [ 447.739443] EXT4-fs (md0): mounted filesystem with ordered data mode. Opts: (null) [ 447.749258] bio too big device sde1 (768 512) This is the actual error. It looks like an md problem (md list copied). Thanks. It certainly does look like an md problem ah, found it. level_store in drivers/md/md.c calls blk_set_stacking_limits after calling -takeover and before calling -run. -run should impose the limits from the underlying device, but for RAID0, -takeover is doing that. I can fix that... hopefully it will become irrelevant soon when the immutable-bio patches go in. This patch isn't quite right, but it should be pretty close. Can you test and confirm? 
Thanks, NeilBrown diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index efb654eb5399..17804f374709 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -83,7 +83,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) char b[BDEVNAME_SIZE]; char b2[BDEVNAME_SIZE]; struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL); - bool discard_supported = false; if (!conf) return -ENOMEM; @@ -188,19 +187,12 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) } dev[j] = rdev1; - if (mddev->queue) - disk_stack_limits(mddev->gendisk, rdev1->bdev, - rdev1->data_offset << 9); - if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) conf->has_merge_bvec = 1; if (!smallest || (rdev1->sectors < smallest->sectors)) smallest = rdev1; cnt++; - - if (blk_queue_discard(bdev_get_queue(rdev1->bdev))) - discard_supported = true; } if (cnt != mddev
Re: raid1 narrow_write_error with 4K disks, sd bad block number requested messages
On Thu, 12 Feb 2015 11:46:21 -0500 Nate Dailey nate.dai...@stratus.com wrote: On 02/04/2015 11:59 PM, NeilBrown wrote: On Wed, 28 Jan 2015 10:29:46 -0500 Nate Dailey nate.dai...@stratus.com wrote: I'm writing about something that appears to be an issue with raid1's narrow_write_error, particular to non-512-byte-sector disks. Here's what I'm doing: - 2 disk raid1, 4K disks, each connected to a different SAS HBA - mount a filesystem on the raid1, run a test that writes to it - remove one of the SAS HBAs (echo 1 /sys/bus/pci/devices/\:45\:00.0/remove) At this point, writes fail and narrow_write_error breaks them up and retries, one sector at a time. But these are 512-byte sectors, and sd doesn't like it: [ 2645.310517] sd 3:0:1:0: [sde] Bad block number requested [ 2645.310610] sd 3:0:1:0: [sde] Bad block number requested [ 2645.310690] sd 3:0:1:0: [sde] Bad block number requested ... There appears to be no real harm done, but there can be a huge number of these messages in the log. I can avoid this by disabling bad block tracking, but it looks like maybe the superblock's bblog_shift is intended to address this exact issue. However, I don't see a way to change it. Presumably this is something mdadm should be setting up? I don't see bblog_shift ever set to anything other than 0. This is on a RHEL 7.1 kernel, version 3.10.0-221.el7. I took a look at upstream sd and md changes and nothing jumps out at me that would have affected this (but I have not tested to see if the bad block messages do or do not happen on an upstream kernel). I'd appreciate any advice re: how to handle this. Thanks! Thanks for the report. narrow_write_error() should use bdev_logical_block_size() and round up to that. Possibly mdadm should get the same information and set bblog_shift accordingly when creating a bad block log. I've made a note to fix that, but I'm happy to review patches too :-) thanks, NeilBrown I will post a narrow_write_error patch shortly. 
I did some experimentation with setting the bblog_shift in mdadm, but it didn't work out the way I expected. It turns out that the value is only loaded from the superblock if: 1453 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && 1454 rdev->badblocks.count == 0) { ... 1473 rdev->badblocks.shift = sb->bblog_shift; And this feature bit is only set if any bad blocks have actually been recorded. It also appears to me that the shift is used when loading the bad blocks from the superblock, but not when storing the bad block list in the superblock. Seems like these are bugs, but I'm not certain how the code is supposed to work (and am getting in a bit over my head with this). Yes, that's probably a bug. The } else if (sb->bblog_offset != 0) rdev->badblocks.shift = 0; should be } else if (sb->bblog_offset != 0) rdev->badblocks.shift = sb->bblog_shift; In any case, it doesn't appear to me that there's any harm in having the bblog_shift not match the disk's block size (right?). Having the bblog_shift larger than the disk's block size certainly should not be a problem. Having it small only causes the problem that you have already discovered. NeilBrown Nate Dailey -- To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html pgpThbUqdUDTs.pgp Description: OpenPGP digital signature
Re: raid1 narrow_write_error with 4K disks, sd bad block number requested messages
On Wed, 28 Jan 2015 10:29:46 -0500 Nate Dailey nate.dai...@stratus.com wrote: I'm writing about something that appears to be an issue with raid1's narrow_write_error, particular to non-512-byte-sector disks. Here's what I'm doing: - 2 disk raid1, 4K disks, each connected to a different SAS HBA - mount a filesystem on the raid1, run a test that writes to it - remove one of the SAS HBAs (echo 1 /sys/bus/pci/devices/\:45\:00.0/remove) At this point, writes fail and narrow_write_error breaks them up and retries, one sector at a time. But these are 512-byte sectors, and sd doesn't like it: [ 2645.310517] sd 3:0:1:0: [sde] Bad block number requested [ 2645.310610] sd 3:0:1:0: [sde] Bad block number requested [ 2645.310690] sd 3:0:1:0: [sde] Bad block number requested ... There appears to be no real harm done, but there can be a huge number of these messages in the log. I can avoid this by disabling bad block tracking, but it looks like maybe the superblock's bblog_shift is intended to address this exact issue. However, I don't see a way to change it. Presumably this is something mdadm should be setting up? I don't see bblog_shift ever set to anything other than 0. This is on a RHEL 7.1 kernel, version 3.10.0-221.el7. I took a look at upstream sd and md changes and nothing jumps out at me that would have affected this (but I have not tested to see if the bad block messages do or do not happen on an upstream kernel). I'd appreciate any advice re: how to handle this. Thanks! Thanks for the report. narrow_write_error() should use bdev_logical_block_size() and round up to that. Possibly mdadm should get the same information and set bblog_shift accordingly when creating a bad block log. I've made a note to fix that, but I'm happy to review patches too :-) thanks, NeilBrown pgpB_wSd1jJTA.pgp Description: OpenPGP digital signature
Re: mpt2sas + raid10 goes boom
On Tue, 9 Apr 2013 12:22:29 +1000 Chris Dunlop ch...@onthe.net.au wrote: Neil -- should c8dc9c6 go to stable? I think it definitely should. Without it you can't create a raid10 and it looks like you have a controller issue! It is too late for 3.7, but it probably makes sense for 3.8. I suggest you ask get confirmation from the author, Joe Lawrence joe.lawre...@stratus.com, (oh wait, you already did!) and when Joe confirms, email sta...@vger.kernel.org and ask them to add it. NeilBrown signature.asc Description: PGP signature
Re: 'Device not ready' issue on mpt2sas since 3.1.10
On Mon, 09 Jul 2012 16:40:15 +0200 Matthias Prager li...@matthiasprager.de wrote: Hello linux-scsi and linux-raid, I did some further research regarding my problem. It appears to me the fault does not lie with the mpt2sas driver (not that I can definitely exclude it), but with the md implementation. I reproduced what I think is the same issue on a different machine (also running Vmware ESXi 5 and an LSI 9211-8i in IR mode) with a different set of hard-drives of the same model. Using systemrescuecd (2.8.1-beta003) and booting the 64bit 3.4.4 kernel, I issued the following commands: 1) 'hdparm -y /dev/sda' (to put the hard-drive to sleep) 2) 'mdadm --create /dev/md1 --metadata 1.2 --level=mirror --raid-devices=2 --name=test1 /dev/sda missing' 3) 'fdisk -l /dev/md127' (for some reason /proc/mdstat indicates the md is being created as md127) 2) gave me this feedback: -- mdadm: super1.x cannot open /dev/sda: Device or resource busy mdadm: /dev/sda is not suitable for this array. mdadm: create aborted --- Even though it says creating aborted it still created md127. One of my pet peeves is when people interpret the observations wrongly and then report their interpretation instead of their observation. However sometimes it is very hard to separate the two. Your comment above looks perfectly reasonable and looks like a clean observation and not an interpretation. Yet it is an interpretation :-) The observation would be Even though it says creating aborted, md127 was still created. You see, it wasn't this mdadm that created md127 - it certainly shouldn't have as you asked it to create md1. I don't know the exact sequence of events, but something - possibly relating to the error messages reported below - caused udev to notice /dev/sda. udev then ran mdadm -I /dev/sda and as it had some metadata on it, it created an array with it. 
As the name information in that metadata was probably test1 or similar, rather than 1, mdadm didn't know what number was wanted for the array, so it chose a free high number - 127. This metadata must have been left over from an earlier experiment. So it might have been something like. - you run mdadm (call this mdadm-1). - mdadm tries to open sda - driver notices that device is asleep, and wakes it up - the waking up of the device causes a CHANGE uevent to udev - this cause udev to run a new mdadm - mdadm-2 - mdadm-2 reads the metadata, sees old metadata, assembled sda in a new md127 - mdadm-1 gets scheduled again, tries to get O_EXCL access to sda and fails, because sda is now part of md127 Clearly undesirable behaviour. I'm not sure which bit is wrong. NeilBrown And 3) lead to these lines in dmesg: --- [ 604.838640] sd 2:0:0:0: [sda] Device not ready [ 604.838645] sd 2:0:0:0: [sda] Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE [ 604.838655] sd 2:0:0:0: [sda] Sense Key : Not Ready [current] [ 604.838663] sd 2:0:0:0: [sda] Add. Sense: Logical unit not ready, initializing command required [ 604.838668] sd 2:0:0:0: [sda] CDB: Read(10): 28 00 00 00 08 00 00 00 20 00 [ 604.838680] end_request: I/O error, dev sda, sector 2048 [ 604.838688] Buffer I/O error on device md127, logical block 0 [ 604.838695] Buffer I/O error on device md127, logical block 1 [ 604.838699] Buffer I/O error on device md127, logical block 2 [ 604.838702] Buffer I/O error on device md127, logical block 3 [ 604.838783] sd 2:0:0:0: [sda] Device not ready [ 604.838785] sd 2:0:0:0: [sda] Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE [ 604.838789] sd 2:0:0:0: [sda] Sense Key : Not Ready [current] [ 604.838793] sd 2:0:0:0: [sda] Add. 
Sense: Logical unit not ready, initializing command required [ 604.838797] sd 2:0:0:0: [sda] CDB: Read(10): 28 00 00 00 08 00 00 00 08 00 [ 604.838805] end_request: I/O error, dev sda, sector 2048 [ 604.838808] Buffer I/O error on device md127, logical block 0 [ 604.838983] sd 2:0:0:0: [sda] Device not ready [ 604.838986] sd 2:0:0:0: [sda] Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE [ 604.838989] sd 2:0:0:0: [sda] Sense Key : Not Ready [current] [ 604.838993] sd 2:0:0:0: [sda] Add. Sense: Logical unit not ready, initializing command required [ 604.838998] sd 2:0:0:0: [sda] CDB: Read(10): 28 00 57 54 65 d8 00 00 08 00 [ 604.839006] end_request: I/O error, dev sda, sector 146514 [ 604.839009] Buffer I/O error on device md127, logical block 183143355 [ 604.839087] sd 2:0:0:0: [sda] Device not ready [ 604.839090] sd 2:0:0:0: [sda] Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE [ 604.839093] sd 2:0:0:0: [sda] Sense Key : Not Ready [current] [ 604.839097] sd 2:0:0:0: [sda] Add. Sense: Logical unit not ready, initializing command required [ 604.839102] sd 2:0:0:0: [sda] CDB: Read(10): 28 00 57 54 65 d8 00 00 08 00 [ 604.839110] end_request: I/O error, dev sda, sector 146514 [ 604.839113] Buffer I/O error on device md127, logical block