[PATCH] virtio_scsi: don't check for failure from mempool_alloc()

2017-04-09 Thread NeilBrown

mempool_alloc() cannot fail when passed GFP_NOIO or any
other gfp setting that is permitted to sleep.
So remove this pointless code.

Signed-off-by: NeilBrown <ne...@suse.com>
---
 drivers/scsi/virtio_scsi.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 939c47df73fa..7c2a5f7c5fb7 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -688,9 +688,6 @@ static int virtscsi_device_reset(struct scsi_cmnd *sc)
 
sdev_printk(KERN_INFO, sc->device, "device reset\n");
cmd = mempool_alloc(virtscsi_cmd_pool, GFP_NOIO);
-   if (!cmd)
-   return FAILED;
-
memset(cmd, 0, sizeof(*cmd));
cmd->sc = sc;
cmd->req.tmf = (struct virtio_scsi_ctrl_tmf_req){
@@ -725,9 +722,6 @@ static int virtscsi_abort(struct scsi_cmnd *sc)
 
scmd_printk(KERN_INFO, sc, "abort\n");
cmd = mempool_alloc(virtscsi_cmd_pool, GFP_NOIO);
-   if (!cmd)
-   return FAILED;
-
memset(cmd, 0, sizeof(*cmd));
cmd->sc = sc;
cmd->req.tmf = (struct virtio_scsi_ctrl_tmf_req){
-- 
2.12.2
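
The guarantee being relied on above: for a gfp mask that allows direct reclaim
(GFP_NOIO, GFP_KERNEL, ...), mempool_alloc() keeps falling back to the pool's
reserved elements and sleeping until one becomes available.  A simplified
sketch of that loop (illustrative only: locking omitted and the name invented;
not the exact mm/mempool.c code):

	void *mempool_alloc_sketch(mempool_t *pool, gfp_t gfp_mask)
	{
		void *element;

		for (;;) {
			/* try the underlying allocator first, without blocking */
			element = pool->alloc(gfp_mask & ~__GFP_DIRECT_RECLAIM,
					      pool->pool_data);
			if (element)
				return element;

			/* fall back to an element reserved in the pool */
			if (pool->curr_nr > 0)
				return pool->elements[--pool->curr_nr];

			/* only a caller that may not sleep can ever see NULL */
			if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
				return NULL;

			/* otherwise wait for mempool_free() to refill the pool */
			wait_event(pool->wait, pool->curr_nr > 0);
		}
	}

So with GFP_NOIO the "if (!cmd) return FAILED;" branches removed above can
never be taken.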





[PATCH] scsi: ibmvfc: don't check for failure from mempool_alloc()

2017-04-09 Thread NeilBrown

mempool_alloc() cannot fail when passed GFP_NOIO or any
other gfp setting that is permitted to sleep.
So remove this pointless code.

Signed-off-by: NeilBrown <ne...@suse.com>
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 2c92dabb55f6..26cd3c28186a 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -3910,12 +3910,6 @@ static int ibmvfc_alloc_target(struct ibmvfc_host *vhost, u64 scsi_id)
spin_unlock_irqrestore(vhost->host->host_lock, flags);
 
tgt = mempool_alloc(vhost->tgt_pool, GFP_NOIO);
-   if (!tgt) {
-   dev_err(vhost->dev, "Target allocation failure for scsi id %08llx\n",
-   scsi_id);
-   return -ENOMEM;
-   }
-
memset(tgt, 0, sizeof(*tgt));
tgt->scsi_id = scsi_id;
tgt->new_scsi_id = scsi_id;
-- 
2.12.2





[PATCH] scsi: lpfc: don't check for failure of mempool_alloc()

2017-04-09 Thread NeilBrown

mempool_alloc() cannot fail if the gfp flags allow sleeping,
specifically if __GFP_DIRECT_RECLAIM is included.
GFP_KERNEL includes this flag and allows sleeping, so
none of this error checking provides any value.
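
GFP_KERNEL expands to __GFP_RECLAIM | __GFP_IO | __GFP_FS, and __GFP_RECLAIM
includes __GFP_DIRECT_RECLAIM, which is exactly the bit gfpflags_allow_blocking()
tests.  A minimal illustration (assuming the current flag names; the helper
below is only an example):

	#include <linux/gfp.h>

	static bool may_sleep_for_alloc(gfp_t gfp)
	{
		/* the same test gfpflags_allow_blocking() performs */
		return !!(gfp & __GFP_DIRECT_RECLAIM);
	}

	/* may_sleep_for_alloc(GFP_KERNEL) and may_sleep_for_alloc(GFP_NOIO)
	 * return true; may_sleep_for_alloc(GFP_ATOMIC) returns false. */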

Signed-off-by: NeilBrown <ne...@suse.com>
---
 drivers/scsi/lpfc/lpfc_attr.c  |   9 --
 drivers/scsi/lpfc/lpfc_bsg.c   |  32 -
 drivers/scsi/lpfc/lpfc_els.c   | 118 
 drivers/scsi/lpfc/lpfc_hbadisc.c   | 270 +++--
 drivers/scsi/lpfc/lpfc_init.c  |  89 
 drivers/scsi/lpfc/lpfc_mbox.c  |  34 +++--
 drivers/scsi/lpfc/lpfc_nportdisc.c |  38 +-
 drivers/scsi/lpfc/lpfc_nvme.c  |   6 -
 drivers/scsi/lpfc/lpfc_sli.c   | 131 --
 drivers/scsi/lpfc/lpfc_vport.c |   5 -
 10 files changed, 153 insertions(+), 579 deletions(-)

diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c
index 22819afbaef5..33f45d3d7487 100644
--- a/drivers/scsi/lpfc/lpfc_attr.c
+++ b/drivers/scsi/lpfc/lpfc_attr.c
@@ -856,9 +856,6 @@ lpfc_issue_lip(struct Scsi_Host *shost)
 
pmboxq = mempool_alloc(phba->mbox_mem_pool,GFP_KERNEL);
 
-   if (!pmboxq)
-   return -ENOMEM;
-
memset((void *)pmboxq, 0, sizeof (LPFC_MBOXQ_t));
pmboxq->u.mb.mbxCommand = MBX_DOWN_LINK;
pmboxq->u.mb.mbxOwner = OWN_HOST;
@@ -1411,8 +1408,6 @@ lpfc_get_hba_info(struct lpfc_hba *phba,
return 0;
 
pmboxq = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL);
-   if (!pmboxq)
-   return 0;
memset(pmboxq, 0, sizeof (LPFC_MBOXQ_t));
 
	pmb = &pmboxq->u.mb;
@@ -5670,8 +5665,6 @@ lpfc_get_stats(struct Scsi_Host *shost)
return NULL;
 
pmboxq = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL);
-   if (!pmboxq)
-   return NULL;
memset(pmboxq, 0, sizeof (LPFC_MBOXQ_t));
 
	pmb = &pmboxq->u.mb;
@@ -5786,8 +5779,6 @@ lpfc_reset_stats(struct Scsi_Host *shost)
return;
 
pmboxq = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL);
-   if (!pmboxq)
-   return;
memset(pmboxq, 0, sizeof(LPFC_MBOXQ_t));
 
	pmb = &pmboxq->u.mb;
diff --git a/drivers/scsi/lpfc/lpfc_bsg.c b/drivers/scsi/lpfc/lpfc_bsg.c
index 18157d2840a3..7b3fcfabede1 100644
--- a/drivers/scsi/lpfc/lpfc_bsg.c
+++ b/drivers/scsi/lpfc/lpfc_bsg.c
@@ -1828,10 +1828,6 @@ lpfc_sli3_bsg_diag_loopback_mode(struct lpfc_hba *phba, struct bsg_job *job)
timeout = loopback_mode->timeout * 100;
 
pmboxq = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL);
-   if (!pmboxq) {
-   rc = -ENOMEM;
-   goto loopback_mode_exit;
-   }
memset((void *)pmboxq, 0, sizeof(LPFC_MBOXQ_t));
pmboxq->u.mb.mbxCommand = MBX_DOWN_LINK;
pmboxq->u.mb.mbxOwner = OWN_HOST;
@@ -1921,8 +1917,6 @@ lpfc_sli4_bsg_set_link_diag_state(struct lpfc_hba *phba, uint32_t diag)
int mbxstatus = MBX_SUCCESS, rc;
 
pmboxq = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL);
-   if (!pmboxq)
-   return -ENOMEM;
 
req_len = (sizeof(struct lpfc_mbx_set_link_diag_state) -
   sizeof(struct lpfc_sli4_cfg_mhdr));
@@ -1982,8 +1976,6 @@ lpfc_sli4_bsg_set_internal_loopback(struct lpfc_hba *phba)
int mbxstatus = MBX_SUCCESS, rc = 0;
 
pmboxq = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL);
-   if (!pmboxq)
-   return -ENOMEM;
req_len = (sizeof(struct lpfc_mbx_set_link_diag_loopback) -
   sizeof(struct lpfc_sli4_cfg_mhdr));
alloc_len = lpfc_sli4_config(phba, pmboxq, LPFC_MBOX_SUBSYSTEM_FCOE,
@@ -2384,10 +2376,6 @@ lpfc_sli4_bsg_link_diag_test(struct bsg_job *job)
goto job_error;
 
pmboxq = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL);
-   if (!pmboxq) {
-   rc = -ENOMEM;
-   goto link_diag_test_exit;
-   }
 
req_len = (sizeof(struct lpfc_mbx_set_link_diag_state) -
   sizeof(struct lpfc_sli4_cfg_mhdr));
@@ -2477,8 +2465,6 @@ static int lpfcdiag_loop_self_reg(struct lpfc_hba *phba, uint16_t *rpi)
int status;
 
mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL);
-   if (!mbox)
-   return -ENOMEM;
 
if (phba->sli_rev < LPFC_SLI_REV4)
status = lpfc_reg_rpi(phba, 0, phba->pport->fc_myDID,
@@ -2537,8 +2523,6 @@ static int lpfcdiag_loop_self_unreg(struct lpfc_hba *phba, uint16_t rpi)
 
/* Allocate mboxq structure */
mbox = mempool_alloc(phba->mbox_mem_pool, GFP_KERNEL);
-   if (mbox == NULL)
-   return -ENOMEM;
 
if (phba->sli_rev < LPFC_SLI_REV4)
lpfc_unreg_login(phba, 0, rpi, mbox);
@@ -3917,10 +3901,6 @@ lpfc_bsg_sli_cfg_read_cmd_ext(struct lpfc_hba *phba, struct bsg_job *job,
 
/* mailbox command structure for

Re: [Lsf-pc] [LSF/MM TOPIC] do we really need PG_error at all?

2017-02-28 Thread NeilBrown
On Mon, Feb 27 2017, Jeff Layton wrote:

> On Tue, 2017-02-28 at 10:32 +1100, NeilBrown wrote:
>> On Mon, Feb 27 2017, Andreas Dilger wrote:
>> 
>> > 
>> > My thought is that PG_error is definitely useful for applications to get
>> > correct errors back when doing write()/sync_file_range() so that they know
>> > there is an error in the data that _they_ wrote, rather than receiving an
>> > error for data that may have been written by another thread, and in turn
>> > clearing the error from another thread so it *doesn't* know it had a write
>> > error.
>> 
>> It might be useful in that way, but it is not currently used that way.
>> Such usage would be a change in visible behaviour.
>> 
>> sync_file_range() calls filemap_fdatawait_range(), which calls
>> filemap_check_errors().
>> If there have been any errors in the file recently, inside or outside
>> the range, the latter will return an error which will propagate up.
>> 
>> > 
>> > As for stray sync() clearing PG_error from underneath an application, that
>> > shouldn't happen since filemap_fdatawait_keep_errors() doesn't clear errors
>> > and is used by device flushing code (fdatawait_one_bdev(), 
>> > wait_sb_inodes()).
>> 
>> filemap_fdatawait_keep_errors() calls __filemap_fdatawait_range() which
>> clears PG_error on every page.
>> What it doesn't do is call filemap_check_errors(), and so doesn't clear
>> AS_ENOSPC or AS_EIO.
>> 
>> 
>
> I think it's helpful to get a clear idea of what happens now in the face
> of errors and what we expect to happen, and I don't quite have that yet:
>
> --8<-
> void page_endio(struct page *page, bool is_write, int err)
> {
> if (!is_write) {
> if (!err) {
> SetPageUptodate(page);
> } else {
> ClearPageUptodate(page);
> SetPageError(page);
> }
> unlock_page(page);
> } else {
> if (err) {
> SetPageError(page);
> if (page->mapping)
> mapping_set_error(page->mapping, err);
> }
> end_page_writeback(page);
> }
> }
> --8<-
>
> ...not everything uses page_endio, but it's a good place to look since
> we have both flavors of error handling in one place.
>
> On a write error, we SetPageError and set the error in the mapping.
>
> What I'm not clear on is:
>
> 1) what happens to the page at that point when we get a writeback error?
> Does it just remain in-core and is allowed to service reads (assuming
> that it was uptodate before)?

Yes, it remains in core and can service reads.  It is no different from
a page on which a write recently succeeded, except that the write didn't
succeed, so the contents of backing store might be different from the
contents of the page.

>
> Can I redirty it and have it retry the write? Is there standard behavior
> for this or is it just up to the whim of the filesystem?

Everything is at the whim of the filesystem, but I doubt if many differ
from the above.
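
For instance, a filesystem that wants the write retried can redirty the page
from its writepage error path instead of (or as well as) setting PG_error; a
sketch of that pattern (the surrounding error test is hypothetical):

	/* in a ->writepage error path, for an error believed to be transient */
	if (err == -ENOMEM) {
		redirty_page_for_writepage(wbc, page);
		unlock_page(page);
		return 0;	/* writeback will revisit the page later */
	}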

NeilBrown

>
> I'll probably have questions about the read side as well, but for now it
> looks like it's mostly used in an ad-hoc way to communicate errors
> across subsystems (block to fs layer, for instance).
> --
> Jeff Layton <jlay...@redhat.com>




Re: [LSF/MM TOPIC] do we really need PG_error at all?

2017-02-27 Thread NeilBrown
On Mon, Feb 27 2017, Andreas Dilger wrote:

>
> My thought is that PG_error is definitely useful for applications to get
> correct errors back when doing write()/sync_file_range() so that they know
> there is an error in the data that _they_ wrote, rather than receiving an
> error for data that may have been written by another thread, and in turn
> clearing the error from another thread so it *doesn't* know it had a write
> error.

It might be useful in that way, but it is not currently used that way.
Such usage would be a change in visible behaviour.

sync_file_range() calls filemap_fdatawait_range(), which calls
filemap_check_errors().
If there have been any errors in the file recently, inside or outside
the range, the latter will return an error which will propagate up.

>
> As for stray sync() clearing PG_error from underneath an application, that
> shouldn't happen since filemap_fdatawait_keep_errors() doesn't clear errors
> and is used by device flushing code (fdatawait_one_bdev(), wait_sb_inodes()).

filemap_fdatawait_keep_errors() calls __filemap_fdatawait_range() which
clears PG_error on every page.
What it doesn't do is call filemap_check_errors(), and so doesn't clear
AS_ENOSPC or AS_EIO.
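
The mapping-level state in question is just two flag bits, set by
mapping_set_error() on writeback failure; a simplified rendering of the check
(details vary between kernel versions):

	static int filemap_check_errors_sketch(struct address_space *mapping)
	{
		int ret = 0;

		if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
			ret = -ENOSPC;
		if (test_and_clear_bit(AS_EIO, &mapping->flags))
			ret = -EIO;
		return ret;
	}

__filemap_fdatawait_range() clears the per-page PG_error bits as it waits, but
only callers that also run a check like this see and clear AS_EIO/AS_ENOSPC.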

NeilBrown




Re: [LSF/MM TOPIC] do we really need PG_error at all?

2017-02-26 Thread NeilBrown
On Sun, Feb 26 2017, James Bottomley wrote:

> On Mon, 2017-02-27 at 08:03 +1100, NeilBrown wrote:
>> On Sun, Feb 26 2017, James Bottomley wrote:
>> 
>> > [added linux-scsi and linux-block because this is part of our error
>> > handling as well]
>> > On Sun, 2017-02-26 at 09:42 -0500, Jeff Layton wrote:
>> > > Proposing this as a LSF/MM TOPIC, but it may turn out to be me 
>> > > just not understanding the semantics here.
>> > > 
>> > > As I was looking into -ENOSPC handling in cephfs, I noticed that
>> > > PG_error is only ever tested in one place [1] 
>> > > __filemap_fdatawait_range, which does this:
>> > > 
>> > >  if (TestClearPageError(page))
>> > >  ret = -EIO;
>> > > 
>> > > This error code will override any AS_* error that was set in the
>> > > mapping. Which makes me wonder...why don't we just set this error 
>> > > in the mapping and not bother with a per-page flag? Could we
>> > > potentially free up a page flag by eliminating this?
>> > 
>> > Note that currently the AS_* codes are only set for write errors 
>> > not for reads and we have no mapping error handling at all for swap
>> > pages, but I'm sure this is fixable.
>> 
>> How is a read error different from a failure to set PG_uptodate?
>> Does PG_error suppress retries?
>
> We don't do any retries in the code above the block layer (or at least
> we shouldn't).

I was wondering about what would/should happen if a read request was
re-issued for some reason.  Should the error flag on the page cause an
immediate failure, or should it try again.
If read-ahead sees a read-error on some future page, is it necessary to
record the error so subsequent read-aheads don't notice the page is
missing and repeatedly try to re-load it?
When the application eventually gets to the faulty page, should a read
be tried then, or is the read-ahead failure permanent?



>
>> > 
>> > From the I/O layer point of view we take great pains to try to 
>> > pinpoint the error exactly to the sector.  We reflect this up by 
>> > setting the PG_error flag on the page where the error occurred.  If 
>> > we only set the error on the mapping, we lose that granularity, 
>> > because the mapping is mostly at the file level (or VMA level for
>> > anon pages).
>> 
>> Are you saying that the IO layer finds the page in the bi_io_vec and
>> explicitly sets PG_error,
>
> I didn't say anything about the mechanism.  I think the function you're
> looking for is fs/mpage.c:mpage_end_io().  layers below block indicate
> the position in the request.  Block maps the position to bio and the
> bio completion maps to page.  So the actual granularity seen in the
> upper layer depends on how the page to bio mapping is done.

If the block layer is just returning the status at a per-bio level (which
makes perfect sense), then this has nothing directly to do with the
PG_error flag.

The page cache needs to do something with bi_error, but it isn't
immediately clear that it needs to set PG_error.

>
>>  rather than just passing an error indication to bi_end_io ??  That
>> would seem to be wrong as the page may not be in the page cache.
>
> Usually pages in the mpage_end_io path are pinned, I think.
>
>>  So I guess I misunderstand you.
>> 
>> > 
>> > So I think the question for filesystem people from us would be do 
>> > you care about this accuracy?  If it's OK just to know an error
>> > occurred somewhere in this file, then perhaps we don't need it.
>> 
>> I had always assumed that a bio would either succeed or fail, and 
>> that no finer granularity could be available.
>
> It does ... but a bio can be as small as a single page.
>
>> I think the question here is: Do filesystems need the pagecache to
>> record which pages have seen an IO error?
>
> It's not just filesystems.  The partition code uses PageError() ... the
> metadata code might as well (those are things with no mapping).  I'm
> not saying we can't remove PG_error; I am saying it's not going to be
> quite as simple as using the AS_ flags.

The partition code could use PageUptodate().
mpage_end_io() calls page_endio() on each page, and on read error that
calls:

ClearPageUptodate(page);
SetPageError(page);

are both of these necessary?

fs/buffer.c can use several bios to read a single page.
If any one returns an error, PG_error is set.  When all of them have
completed, if PG_error is clear, PG_uptodate is then set.
This is an opportunistic use of PG_error, rather than an essent

Re: [LSF/MM TOPIC] do we really need PG_error at all?

2017-02-26 Thread NeilBrown
On Sun, Feb 26 2017, James Bottomley wrote:

> [added linux-scsi and linux-block because this is part of our error
> handling as well]
> On Sun, 2017-02-26 at 09:42 -0500, Jeff Layton wrote:
>> Proposing this as a LSF/MM TOPIC, but it may turn out to be me just 
>> not understanding the semantics here.
>> 
>> As I was looking into -ENOSPC handling in cephfs, I noticed that
>> PG_error is only ever tested in one place [1] 
>> __filemap_fdatawait_range, which does this:
>> 
>>  if (TestClearPageError(page))
>>  ret = -EIO;
>> 
>> This error code will override any AS_* error that was set in the
>> mapping. Which makes me wonder...why don't we just set this error in 
>> the mapping and not bother with a per-page flag? Could we potentially
>> free up a page flag by eliminating this?
>
> Note that currently the AS_* codes are only set for write errors not
> for reads and we have no mapping error handling at all for swap pages,
> but I'm sure this is fixable.

How is a read error different from a failure to set PG_uptodate?
Does PG_error suppress retries?

>
> From the I/O layer point of view we take great pains to try to pinpoint
> the error exactly to the sector.  We reflect this up by setting the
> PG_error flag on the page where the error occurred.  If we only set the
> error on the mapping, we lose that granularity, because the mapping is
> mostly at the file level (or VMA level for anon pages).

Are you saying that the IO layer finds the page in the bi_io_vec and
explicitly sets PG_error, rather than just passing an error indication
to bi_end_io ??  That would seem to be wrong as the page may not be in
the page cache. So I guess I misunderstand you.

>
> So I think the question for filesystem people from us would be do you
> care about this accuracy?  If it's OK just to know an error occurred
> somewhere in this file, then perhaps we don't need it.

I had always assumed that a bio would either succeed or fail, and that
no finer granularity could be available.

I think the question here is: Do filesystems need the pagecache to
record which pages have seen an IO error?
I think that for write errors, there is no value in recording
block-oriented error status - only file-oriented status.
For read errors, it might help to avoid indefinite read retries, but
I don't know the code well enough to be sure if this is an issue.

NeilBrown


>
> James
>
>> The main argument I could see for keeping it is that removing it 
>> might subtly change the behavior of sync_file_range if you have tasks
>> syncing different ranges in a file concurrently. I'm not sure if that 
>> would break any guarantees though.
>> 
>> Even if we do need it, I think we might need some cleanup here 
>> anyway. A lot of readpage operations end up setting that flag when 
>> they hit an error. Isn't it wrong to return an error on fsync, just 
>> because we had a read error somewhere in the file in a range that was
>> never dirtied?
>> 
>> --
>> [1]: there is another place in f2fs, but it's more or less equivalent 
>> to the call site in __filemap_fdatawait_range.
>> 




Re: [PATCH v2 1/3] badblocks: Add core badblock management code

2015-12-22 Thread NeilBrown
On Wed, Dec 23 2015, Verma, Vishal L wrote:

> On Tue, 2015-12-22 at 16:34 +1100, NeilBrown wrote:
>> On Sat, Dec 05 2015, Verma, Vishal L wrote:
>> 
>> > On Fri, 2015-12-04 at 15:30 -0800, James Bottomley wrote:
>> > [...]
>> > > > +ssize_t badblocks_store(struct badblocks *bb, const char *page,
>> > > > size_t len,
>> > > > +  int unack)
>> > > [...]
>> > > > +int badblocks_init(struct badblocks *bb, int enable)
>> > > > +{
>> > > > +  bb->count = 0;
>> > > > +  if (enable)
>> > > > +  bb->shift = 0;
>> > > > +  else
>> > > > +  bb->shift = -1;
>> > > > +  bb->page = kmalloc(PAGE_SIZE, GFP_KERNEL);
>> > > 
>> > > Why not __get_free_page(GFP_KERNEL)?  The problem with kmalloc of
>> > > an
>> > > exactly known page sized quantity is that the slab tracker for
>> > > this
>> > > requires two contiguous pages for each page because of the
>> > > overhead.
>> > 
>> > Cool, I didn't know about __get_free_page - I can fix this up too.
>> > 
>> 
>> I was reminded of this just recently, so I thought I should clear up the
>> misunderstanding.
>> 
>> kmalloc(PAGE_SIZE) does *not* incur significant overhead and certainly
>> does not require two contiguous free pages.
>> If you "grep kmalloc-4096 /proc/slabinfo" you will note that both
>> objperslab and pagesperslab are 1.  So one page is used to store each
>> 4096 byte allocation.
>> 
>> To quote the email from Linus which reminded me about this
>> 
>> > If you
>> > want to allocate a page, and get a pointer, just use "kmalloc()".
>> > Boom, done!
>> 
>> https://lkml.org/lkml/2015/12/21/605
>> 
>> There probably is a small CPU overhead from using kmalloc, but no
>> memory
>> overhead.
>
> Thanks Neil.
> I just read the rest of that thread - and I'm wondering if we should
> change back to kzalloc here.
>
> The one thing __get_free_page gets us is PAGE_SIZE-aligned memory. Do
> you think that would be better for this use? (I can't think of any). If
> not, I can send out a new version reverting back to kzalloc.

kzalloc(PAGE_SIZE) will also always return page-aligned memory.
kzalloc returns a void*, __get_free_page returns unsigned long.  For
that reason alone I would prefer kzalloc.
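
For comparison, a minimal sketch of the two call patterns (NULL checks omitted):

	#include <linux/slab.h>
	#include <linux/gfp.h>

	static void compare_page_allocs(void)
	{
		/* typed pointer, zeroed, page-aligned for a PAGE_SIZE allocation */
		void *p = kzalloc(PAGE_SIZE, GFP_KERNEL);

		/* unsigned long, usually needs a cast before use */
		unsigned long q = __get_free_page(GFP_KERNEL | __GFP_ZERO);

		kfree(p);
		free_page(q);
	}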

But I'm not necessarily suggesting you change the code.  I just wanted
to clarify a misunderstanding.  You should produce the code that you are
most happy with.

NeilBrown




Re: [PATCH v2 1/3] badblocks: Add core badblock management code

2015-12-21 Thread NeilBrown
On Sat, Dec 05 2015, Verma, Vishal L wrote:

> On Fri, 2015-12-04 at 15:30 -0800, James Bottomley wrote:
> [...]
>> > +ssize_t badblocks_store(struct badblocks *bb, const char *page,
>> > size_t len,
>> > +  int unack)
>> [...]
>> > +int badblocks_init(struct badblocks *bb, int enable)
>> > +{
>> > +  bb->count = 0;
>> > +  if (enable)
>> > +  bb->shift = 0;
>> > +  else
>> > +  bb->shift = -1;
>> > +  bb->page = kmalloc(PAGE_SIZE, GFP_KERNEL);
>> 
>> Why not __get_free_page(GFP_KERNEL)?  The problem with kmalloc of an
>> exactly known page sized quantity is that the slab tracker for this
>> requires two contiguous pages for each page because of the overhead.
>
> Cool, I didn't know about __get_free_page - I can fix this up too.
>

I was reminded of this just recently, so I thought I should clear up the
misunderstanding.

kmalloc(PAGE_SIZE) does *not* incur significant overhead and certainly
does not require two contiguous free pages.
If you "grep kmalloc-4096 /proc/slabinfo" you will note that both
objperslab and pagesperslab are 1.  So one page is used to store each
4096 byte allocation.

To quote the email from Linus which reminded me about this

> If you
> want to allocate a page, and get a pointer, just use "kmalloc()".
> Boom, done!

https://lkml.org/lkml/2015/12/21/605

There probably is a small CPU overhead from using kmalloc, but no memory
overhead.

NeilBrown




Re: [PATCH v4 0/3] Badblock tracking for gendisks

2015-12-08 Thread NeilBrown
On Wed, Dec 09 2015, Vishal Verma <vishal.l.ve...@intel.com> wrote:

>
> Patch 3 converts md over to use the new badblocks 'library'. I have
> done some pretty simple testing on this - created a raid 1 device,
> made sure the sysfs entries show up, and can be used to add and view
> badblocks. A closer look by the md folks would be nice here.

Hi,
 this all looks reasonably sensible.  I haven't gone over it with a
 fine toothed comb (I wonder where that analogy comes from) but
 nothing is obviously wrong.

 Acked-by: NeilBrown <ne...@suse.com>

Thanks,
NeilBrown




Re: [PATCH v2 1/3] badblocks: Add core badblock management code

2015-12-08 Thread NeilBrown
On Sat, Dec 05 2015, Verma, Vishal L wrote:
>> 
>> > +int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
>> > +{
>> [...]
>> > +#define DO_DEBUG 1
>> 
>> Why have this at all if it's unconditionally defined and always set.
>
> Neil - any reason or anything you had in mind for this? Or is it just an
> artifact and can be removed.

Like the comment says:

/* Allow clearing via sysfs *only* for testing/debugging.
 * Normally only a successful write may clear a badblock
 */

The DO_DEBUG define and ifdefs are documentation identifying bits of
code that should be removed when it all seems to be working.
Maybe now is a good time to remove that code.

NeilBrown




Re: [dm-devel] kernel BUG at drivers/scsi/scsi_lib.c:1101! observed during md5sum for one file on (RAID4-RAID0) device

2015-07-30 Thread NeilBrown
On Thu, 30 Jul 2015 06:28:06 -0700 James Bottomley
james.bottom...@hansenpartnership.com wrote:

 On Thu, 2015-07-30 at 05:03 -0400, Yi Zhang wrote:
  Hi SCSI/RAID maintainer
  
  During raid test with 4.2.0-rc3, I observed below kernel BUG, pls check 
  below info for the test log/environment/test steps.
  
  Log:
  [  306.741662] md: bind<sdb1>
  [  306.750865] md: bind<sdc1>
  [  306.753993] md: bind<sdd1>
  [  306.764475] md: bind<sde1>
  [  306.786156] md: bind<sdf1>
  [  306.789362] md: bind<sdh1>
  [  306.792555] md: bind<sdg1>
  [  306.868166] raid6: sse2x1   gen() 10589 MB/s
  [  306.889143] raid6: sse2x1   xor()  8218 MB/s
  [  306.910121] raid6: sse2x2   gen() 13453 MB/s
  [  306.931102] raid6: sse2x2   xor()  8990 MB/s
  [  306.952079] raid6: sse2x4   gen() 15539 MB/s
  [  306.973063] raid6: sse2x4   xor() 10771 MB/s
  [  306.994039] raid6: avx2x1   gen() 20582 MB/s
  [  307.015017] raid6: avx2x2   gen() 24019 MB/s
  [  307.035998] raid6: avx2x4   gen() 27824 MB/s
  [  307.040755] raid6: using algorithm avx2x4 gen() 27824 MB/s
  [  307.046869] raid6: using avx2x2 recovery algorithm
  [  307.058793] async_tx: api initialized (async)
  [  307.075428] xor: automatically using best checksumming function:
  [  307.091942]avx   : 32008.000 MB/sec
  [  307.147662] md: raid6 personality registered for level 6
  [  307.153584] md: raid5 personality registered for level 5
  [  307.159505] md: raid4 personality registered for level 4
  [  307.165698] md/raid:md0: device sdf1 operational as raid disk 4
  [  307.172300] md/raid:md0: device sde1 operational as raid disk 3
  [  307.178899] md/raid:md0: device sdd1 operational as raid disk 2
  [  307.185497] md/raid:md0: device sdc1 operational as raid disk 1
  [  307.192093] md/raid:md0: device sdb1 operational as raid disk 0
  [  307.199052] md/raid:md0: allocated 6482kB
  [  307.203573] md/raid:md0: raid level 4 active with 5 out of 6 devices, 
  algorithm 0
  [  307.211958] md0: detected capacity change from 0 to 53645148160
  [  307.218658] md: recovery of RAID array md0
  [  307.223226] md: minimum _guaranteed_  speed: 1000 KB/sec/disk.
  [  307.229729] md: using maximum available idle IO bandwidth (but not more 
  than 20 KB/sec) for recovery.
  [  307.240427] md: using 128k window, over a total of 10477568k.
  [  374.670951] md: md0: recovery done.
  [  375.722806] EXT4-fs (md0): mounted filesystem with ordered data mode. 
  Opts: (null)
  [  447.553364] md: unbind<sdh1>
  [  447.559905] md: export_rdev(sdh1)
  [  447.572684] md: cannot remove active disk sdg1 from md0 ...
  [  447.578909] md/raid:md0: Disk failure on sdg1, disabling device.
  [  447.578909] md/raid:md0: Operation continuing on 5 devices.
  [  447.594850] md: unbind<sdg1>
  [  447.601834] md: export_rdev(sdg1)
  [  447.615446] md: raid0 personality registered for level 0
  [  447.629275] md/raid0:md0: md_size is 104775680 sectors.
  [  447.635094] md: RAID0 configuration for md0 - 1 zone
  [  447.640627] md: zone0=[sdb1/sdc1/sdd1/sde1/sdf1]
  [  447.645833]   zone-offset= 0KB, device-offset= 0KB, 
  size=  52387840KB
  [  447.654949] 
  [  447.739443] EXT4-fs (md0): mounted filesystem with ordered data mode. 
  Opts: (null)
  [  447.749258] bio too big device sde1 (768 > 512)
 
 This is the actual error.  It looks like an md problem (md list copied).

Thanks.  It certainly does look like an md problem ... ah, found it.

level_store in drivers/md/md.c calls blk_set_stacking_limits after
calling ->takeover and before calling ->run.
->run should impose the limits from the underlying device, but for
RAID0, ->takeover is doing that.

I can fix that... hopefully it will become irrelevant soon when the
immutable-bio patches go in.


This patch isn't quite right, but it should be pretty close.
Can you test and confirm?
Thanks,
NeilBrown

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index efb654eb5399..17804f374709 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -83,7 +83,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
	char b[BDEVNAME_SIZE];
char b2[BDEVNAME_SIZE];
struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
-   bool discard_supported = false;
 
if (!conf)
return -ENOMEM;
@@ -188,19 +187,12 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
	}
dev[j] = rdev1;
 
-	if (mddev->queue)
-		disk_stack_limits(mddev->gendisk, rdev1->bdev,
-				  rdev1->data_offset << 9);
-
	if (rdev1->bdev->bd_disk->queue->merge_bvec_fn)
		conf->has_merge_bvec = 1;

	if (!smallest || (rdev1->sectors < smallest->sectors))
		smallest = rdev1;
	cnt++;
-
-	if (blk_queue_discard(bdev_get_queue(rdev1->bdev)))
-		discard_supported = true;
}
if (cnt != mddev

Re: raid1 narrow_write_error with 4K disks, sd bad block number requested messages

2015-02-12 Thread NeilBrown
On Thu, 12 Feb 2015 11:46:21 -0500 Nate Dailey nate.dai...@stratus.com
wrote:

 On 02/04/2015 11:59 PM, NeilBrown wrote:
  On Wed, 28 Jan 2015 10:29:46 -0500 Nate Dailey nate.dai...@stratus.com
  wrote:
 
  I'm writing about something that appears to be an issue with raid1's
  narrow_write_error, particular to non-512-byte-sector disks. Here's what
  I'm doing:
 
  - 2 disk raid1, 4K disks, each connected to a different SAS HBA
  - mount a filesystem on the raid1, run a test that writes to it
  - remove one of the SAS HBAs (echo 1 > /sys/bus/pci/devices/\:45\:00.0/remove)
 
  At this point, writes fail and narrow_write_error breaks them up and
  retries, one sector at a time. But these are 512-byte sectors, and sd
  doesn't like it:
 
  [ 2645.310517] sd 3:0:1:0: [sde] Bad block number requested
  [ 2645.310610] sd 3:0:1:0: [sde] Bad block number requested
  [ 2645.310690] sd 3:0:1:0: [sde] Bad block number requested
  ...
 
  There appears to be no real harm done, but there can be a huge number of
  these messages in the log.
 
  I can avoid this by disabling bad block tracking, but it looks like
  maybe the superblock's bblog_shift is intended to address this exact
  issue. However, I don't see a way to change it. Presumably this is
  something mdadm should be setting up? I don't see bblog_shift ever set
  to anything other than 0.
 
  This is on a RHEL 7.1 kernel, version 3.10.0-221.el7. I took a look at
  upstream sd and md changes and nothing jumps out at me that would have
  affected this (but I have not tested to see if the bad block messages do
  or do not happen on an upstream kernel).
 
  I'd appreciate any advice re: how to handle this. Thanks!
 
  Thanks for the report.
 
  narrow_write_error() should use bdev_logical_block_size() and round up to
  that.
  Possibly mdadm should get the same information and set bblog_shift
  accordingly when creating a bad block log.
 
  I've made a note to fix that, but I'm happy to review  patches too :-)
 
  thanks,
  NeilBrown
 
 
 I will post a narrow_write_error patch shortly.
 
 I did some experimentation with setting the bblog_shift in mdadm, but it 
 didn't work out the way I expected. It turns out that the value is only 
 loaded from the superblock if:
 
 1453	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
 1454	    rdev->badblocks.count == 0) {
 ...
 1473		rdev->badblocks.shift = sb->bblog_shift;
 
 And this feature bit is only set if any bad blocks have actually been 
 recorded.
 
 It also appears to me that the shift is used when loading the bad blocks 
 from the superblock, but not when storing the bad block list in the 
 superblock.
 
 Seems like these are bugs, but I'm not certain how the code is supposed 
 to work (and am getting in a bit over my head with this).

Yes, that's probably a bug.
The

	} else if (sb->bblog_offset != 0)
		rdev->badblocks.shift = 0;

should be

	} else if (sb->bblog_offset != 0)
		rdev->badblocks.shift = sb->bblog_shift;

 
 In any case, it doesn't appear to me that there's any harm in having the 
 bblog_shift not match the disk's block size (right?).

Having the bblog_shift larger than the disk's block size certainly should not
be a problem.  Having it small only causes the problem that you have already
discovered.

NeilBrown


 
 Nate Dailey
 
 --
 To unsubscribe from this list: send the line unsubscribe linux-raid in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html





Re: raid1 narrow_write_error with 4K disks, sd bad block number requested messages

2015-02-04 Thread NeilBrown
On Wed, 28 Jan 2015 10:29:46 -0500 Nate Dailey nate.dai...@stratus.com
wrote:

 I'm writing about something that appears to be an issue with raid1's 
 narrow_write_error, particular to non-512-byte-sector disks. Here's what 
 I'm doing:
 
 - 2 disk raid1, 4K disks, each connected to a different SAS HBA
 - mount a filesystem on the raid1, run a test that writes to it
 - remove one of the SAS HBAs (echo 1 > /sys/bus/pci/devices/\:45\:00.0/remove)
 
 At this point, writes fail and narrow_write_error breaks them up and 
 retries, one sector at a time. But these are 512-byte sectors, and sd 
 doesn't like it:
 
 [ 2645.310517] sd 3:0:1:0: [sde] Bad block number requested
 [ 2645.310610] sd 3:0:1:0: [sde] Bad block number requested
 [ 2645.310690] sd 3:0:1:0: [sde] Bad block number requested
 ...
 
 There appears to be no real harm done, but there can be a huge number of 
 these messages in the log.
 
 I can avoid this by disabling bad block tracking, but it looks like 
 maybe the superblock's bblog_shift is intended to address this exact 
 issue. However, I don't see a way to change it. Presumably this is 
 something mdadm should be setting up? I don't see bblog_shift ever set 
 to anything other than 0.
 
 This is on a RHEL 7.1 kernel, version 3.10.0-221.el7. I took a look at 
 upstream sd and md changes and nothing jumps out at me that would have 
 affected this (but I have not tested to see if the bad block messages do 
 or do not happen on an upstream kernel).
 
 I'd appreciate any advice re: how to handle this. Thanks!

Thanks for the report.

narrow_write_error() should use bdev_logical_block_size() and round up to
that.
Possibly mdadm should get the same information and set bblog_shift
accordingly when creating a bad block log.
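
For illustration, the rounding could look something like this (a sketch only,
with a hypothetical helper name, not the eventual fix):

	/* widen a (sector, nr_sectors) range, given in 512-byte units, so that
	 * it starts and ends on the device's logical block size */
	static void align_to_logical_block(struct block_device *bdev,
					   sector_t *sector, int *nr_sectors)
	{
		int bs = bdev_logical_block_size(bdev) >> 9;
		sector_t start = *sector & ~(sector_t)(bs - 1);
		sector_t end = roundup(*sector + *nr_sectors, bs);

		*sector = start;
		*nr_sectors = end - start;
	}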

I've made a note to fix that, but I'm happy to review  patches too :-)

thanks,
NeilBrown





Re: mpt2sas + raid10 goes boom

2013-04-08 Thread NeilBrown
On Tue, 9 Apr 2013 12:22:29 +1000 Chris Dunlop ch...@onthe.net.au wrote:


  Neil -- should c8dc9c6 go to stable?
 
 I think it definitely should. Without it you can't create a
 raid10 and it looks like you have a controller issue!

It is too late for 3.7, but it probably makes sense for 3.8.
I suggest you get confirmation from the author,
Joe Lawrence joe.lawre...@stratus.com, (oh wait, you already did!)
and when Joe confirms, email sta...@vger.kernel.org and ask them to add
it.

NeilBrown





Re: 'Device not ready' issue on mpt2sas since 3.1.10

2012-07-09 Thread NeilBrown
On Mon, 09 Jul 2012 16:40:15 +0200 Matthias Prager li...@matthiasprager.de
wrote:

 Hello linux-scsi and linux-raid,
 
 I did some further research regarding my problem.
 It appears to me the fault does not lie with the mpt2sas driver (not
 that I can definitely exclude it), but with the md implementation.
 
 I reproduced what I think is the same issue on a different machine (also
 running Vmware ESXi 5 and an LSI 9211-8i in IR mode) with a different
 set of hard-drives of the same model. Using systemrescuecd
 (2.8.1-beta003) and booting the 64bit 3.4.4 kernel, I issued the
 following commands:
 
 1) 'hdparm -y /dev/sda' (to put the hard-drive to sleep)
 2) 'mdadm --create /dev/md1 --metadata 1.2 --level=mirror
 --raid-devices=2 --name=test1 /dev/sda missing'
 3) 'fdisk -l /dev/md127' (for some reason /proc/mdstat indicates the md
 is being created as md127)
 
 2) gave me this feedback:
 --
 mdadm: super1.x cannot open /dev/sda: Device or resource busy
 mdadm: /dev/sda is not suitable for this array.
 mdadm: create aborted
 ---
 Even though it says creating aborted it still created md127.

One of my pet peeves is when people interpret the observations wrongly and
then report their interpretation instead of their observation.  However
sometimes it is very hard to separate the two.  Your comment above looks
perfectly reasonable and looks like a clean observation and not an
interpretation.  Yet it is an interpretation :-)

The observation would be
   Even though it says creating aborted, md127 was still created.

You see, it wasn't this mdadm that created md127 - it certainly shouldn't
have as you asked it to create md1.

I don't know the exact sequence of events, but something - possibly relating
to the error messages reported below - caused udev to notice /dev/sda.
udev then ran mdadm -I /dev/sda and as it had some metadata on it, it
created an array with it.  As the name information in that metadata was
probably test1 or similar, rather than 1, mdadm didn't know what number
was wanted for the array, so it chose a free high number - 127.

This metadata must have been left over from an earlier experiment.

So it might have been something like.

- you run mdadm (call this mdadm-1).
- mdadm tries to open sda
- driver notices that device is asleep, and wakes it up
- the waking up of the device causes a CHANGE uevent to udev
- this cause udev to run a new mdadm - mdadm-2
- mdadm-2 reads the metadata, sees old metadata, assembled sda in a new md127
- mdadm-1 gets scheduled again, tries to get O_EXCL access to sda and fails, 
  because sda is now part of md127

Clearly undesirable behaviour.  I'm not sure which bit is wrong.

NeilBrown


 
 And 3) lead to these lines in dmesg:
 ---
 [  604.838640] sd 2:0:0:0: [sda] Device not ready
 [  604.838645] sd 2:0:0:0: [sda]  Result: hostbyte=DID_OK
 driverbyte=DRIVER_SENSE
 [  604.838655] sd 2:0:0:0: [sda]  Sense Key : Not Ready [current]
 [  604.838663] sd 2:0:0:0: [sda]  Add. Sense: Logical unit not ready,
 initializing command required
 [  604.838668] sd 2:0:0:0: [sda] CDB: Read(10): 28 00 00 00 08 00 00 00
 20 00
 [  604.838680] end_request: I/O error, dev sda, sector 2048
 [  604.838688] Buffer I/O error on device md127, logical block 0
 [  604.838695] Buffer I/O error on device md127, logical block 1
 [  604.838699] Buffer I/O error on device md127, logical block 2
 [  604.838702] Buffer I/O error on device md127, logical block 3
 [  604.838783] sd 2:0:0:0: [sda] Device not ready
 [  604.838785] sd 2:0:0:0: [sda]  Result: hostbyte=DID_OK
 driverbyte=DRIVER_SENSE
 [  604.838789] sd 2:0:0:0: [sda]  Sense Key : Not Ready [current]
 [  604.838793] sd 2:0:0:0: [sda]  Add. Sense: Logical unit not ready,
 initializing command required
 [  604.838797] sd 2:0:0:0: [sda] CDB: Read(10): 28 00 00 00 08 00 00 00
 08 00
 [  604.838805] end_request: I/O error, dev sda, sector 2048
 [  604.838808] Buffer I/O error on device md127, logical block 0
 [  604.838983] sd 2:0:0:0: [sda] Device not ready
 [  604.838986] sd 2:0:0:0: [sda]  Result: hostbyte=DID_OK
 driverbyte=DRIVER_SENSE
 [  604.838989] sd 2:0:0:0: [sda]  Sense Key : Not Ready [current]
 [  604.838993] sd 2:0:0:0: [sda]  Add. Sense: Logical unit not ready,
 initializing command required
 [  604.838998] sd 2:0:0:0: [sda] CDB: Read(10): 28 00 57 54 65 d8 00 00
 08 00
 [  604.839006] end_request: I/O error, dev sda, sector 146514
 [  604.839009] Buffer I/O error on device md127, logical block 183143355
 [  604.839087] sd 2:0:0:0: [sda] Device not ready
 [  604.839090] sd 2:0:0:0: [sda]  Result: hostbyte=DID_OK
 driverbyte=DRIVER_SENSE
 [  604.839093] sd 2:0:0:0: [sda]  Sense Key : Not Ready [current]
 [  604.839097] sd 2:0:0:0: [sda]  Add. Sense: Logical unit not ready,
 initializing command required
 [  604.839102] sd 2:0:0:0: [sda] CDB: Read(10): 28 00 57 54 65 d8 00 00
 08 00
 [  604.839110] end_request: I/O error, dev sda, sector 146514
 [  604.839113] Buffer I/O error on device md127, logical block