RE: [RFC][PATCH] fix short copy handling in copy_mc_pipe_to_iter()

2022-06-16 Thread Dan Williams
[ add Tony and Boris ]

Al Viro wrote:
> [commit in question sits in vfs.git#fixes]
> 
> Unlike other copying operations on ITER_PIPE, copy_mc_to_iter() can
> result in a short copy.  In that case we need to trim the unused
> buffers, as well as the length of partially filled one - it's not
> enough to set ->head, ->iov_offset and ->count to reflect how
> much had we copied.  Not hard to fix, fortunately...
> 
> I'd put a helper (pipe_discard_from(pipe, head)) into pipe_fs_i.h,
> rather than iov_iter.c - it has nothing to do with iov_iter and
> having it will allow us to avoid an ugly kludge in fs/splice.c.
> We could put it into lib/iov_iter.c for now and move it later,
> but I don't see the point going that way...

Apologies for the delay in responding (reworking my email workflow after
a loss of Gmail access for my intel.com address). This looks good to me:

Acked-by: Dan Williams 

...and I also share the concern from Linus about the lack of testing
this gets outside of systems with the necessary hardware/firmware to do
error injection testing.

Boris and I had agreed to remove some software error injection machinery
for copy_mc_* in commit 3adb776384f2 ("x86, libnvdimm/test: Remove
COPY_MC_TEST"). Is there an appetite to see some of that return and
write a regression test for this bug?

> 
> Fixes: ca146f6f091e "lib/iov_iter: Fix pipe handling in 
> _copy_to_iter_mcsafe()"
> Signed-off-by: Al Viro 
> ---
> diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
> index cb0fd633a610..4ea496924106 100644
> --- a/include/linux/pipe_fs_i.h
> +++ b/include/linux/pipe_fs_i.h
> @@ -229,6 +229,15 @@ static inline bool pipe_buf_try_steal(struct 
> pipe_inode_info *pipe,
>   return buf->ops->try_steal(pipe, buf);
>  }
>  
> +static inline void pipe_discard_from(struct pipe_inode_info *pipe,
> + unsigned int old_head)
> +{
> + unsigned int mask = pipe->ring_size - 1;
> +
> + while (pipe->head > old_head)
> + pipe_buf_release(pipe, &pipe->bufs[--pipe->head & mask]);
> +}
> +
>  /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual
> memory allocation, whereas PIPE_BUF makes atomicity guarantees.  */
>  #define PIPE_SIZE PAGE_SIZE
> diff --git a/lib/iov_iter.c b/lib/iov_iter.c
> index 0b64695ab632..2bf20b48a04a 100644
> --- a/lib/iov_iter.c
> +++ b/lib/iov_iter.c
> @@ -689,6 +689,7 @@ static size_t copy_mc_pipe_to_iter(const void *addr, 
> size_t bytes,
>   struct pipe_inode_info *pipe = i->pipe;
>   unsigned int p_mask = pipe->ring_size - 1;
>   unsigned int i_head;
> + unsigned int valid = pipe->head;
>   size_t n, off, xfer = 0;
>  
>   if (!sanity(i))
> @@ -702,11 +703,17 @@ static size_t copy_mc_pipe_to_iter(const void *addr, 
> size_t bytes,
>   rem = copy_mc_to_kernel(p + off, addr + xfer, chunk);
>   chunk -= rem;
>   kunmap_local(p);
> - i->head = i_head;
> - i->iov_offset = off + chunk;
> - xfer += chunk;
> - if (rem)
> + if (chunk) {
> + i->head = i_head;
> + i->iov_offset = off + chunk;
> + xfer += chunk;
> + valid = i_head + 1;
> + }
> + if (rem) {
> + pipe->bufs[i_head & p_mask].len -= rem;
> + pipe_discard_from(pipe, valid);
>   break;
> + }
>   n -= chunk;
>   off = 0;
>   i_head++;





Re: [RFC][PATCH] fix short copy handling in copy_mc_pipe_to_iter()

2022-06-14 Thread Al Viro
On Tue, Jun 14, 2022 at 01:53:50AM +0100, Al Viro wrote:

> FWIW, I've got quite a bit of cleanups in the local tree; reordering and
> cleaning that queue up at the moment, will post tonight or tomorrow.
> 
> I've looked into doing allocations page-by-page (instead of single
> push_pipe(), followed by copying into those).  Doable, but it ends
> up being much messier.

Hmm...  Maybe not - a possible interface would be
append_pipe(iter, size, &off)

that would either do kmap_local_page() on the last buffer (if it's
anonymous and has space in it) or allocate and map a page and
add a new buffer.  Returning the mapped address and offset from it.
Then these loops would look like this:

while (left) {
p = append_pipe(iter, left, &off);
if (!p)
break;
chunk = min(left, PAGE_SIZE - off);
rem = copy(p + off, whatever, chunk);
chunk -= rem;
kunmap_local(p);

copied += chunk;
left -= chunk;

if (unlikely(rem)) {
pipe_revert(i, rem);
break;
}
}
return copied;

with no push_pipe() used at all.  For operations that can't fail,
the things are simplified in an obvious way (rem is always 0).

Or we could have append_pipe() return a struct page * and leave
kmap_local_page() to the caller...

struct page *append_pipe(struct iov_iter *i, size_t size, unsigned *off)
{
struct pipe_inode_info *pipe = i->pipe;
unsigned offset = i->iov_offset;
struct pipe_buffer *buf;
struct page *page;

if (offset && offset < PAGE_SIZE) {
// some space in the last buffer; can we add to it?
buf = pipe_buf(pipe, pipe->head - 1);
if (allocated(buf)) {
size = min(size, PAGE_SIZE - offset);
buf->len += size;
i->iov_offset += size;
i->count -= size;
*off = offset;
return buf->page;   // or kmap_local_page(...)
}
}
// OK, we need a new buffer
size = min(size, PAGE_SIZE);
if (pipe_full(.))
return NULL;
page = alloc_page(GFP_USER);
if (!page)
return NULL;
// got it...
buf = pipe_buf(pipe, pipe->head++);
*buf = (struct pipe_buffer){.ops = &default_pipe_buf_ops,
.page = page, .len = size };
i->head = pipe->head - 1;
i->iov_offset = size;
i->count -= size;
*off = 0;
return page; // or kmap_local_page(...)
}

(matter of fact, the last part could use another helper in my tree - there
the tail would be
// OK, we need a new buffer
size = min(size, PAGE_SIZE);
page = push_anon(pipe, size);
if (!page)
return NULL;
i->head = pipe->head - 1;
i->iov_offset = size;
i->count -= size;
*off = 0;
return page;
)

Would that be readable enough from your POV?  That way push_pipe()
loses almost all callers and after the "make iov_iter_get_pages()
advancing" part of the series it simply goes away...

It's obviously too intrusive for backports, though - there I'd very much
prefer the variant I posted.

Comments?

PS: re local helpers:

static inline struct pipe_buffer *pipe_buf(const struct pipe_inode_info *pipe,
   unsigned int slot)
{
return &pipe->bufs[slot & (pipe->ring_size - 1)];
}

pretty much all places where we cache pipe->ring_size - 1 had been
absolutely pointless; there are several exceptions, but back in 2019
"pipe: Use head and tail pointers for the ring, not cursor and length"
went overboard with microoptimizations...



Re: [RFC][PATCH] fix short copy handling in copy_mc_pipe_to_iter()

2022-06-14 Thread Al Viro
On Tue, Jun 14, 2022 at 07:36:19AM +0100, David Howells wrote:
> Al Viro  wrote:
> 
> > What's wrong with
> > p_occupancy = pipe_occupancy(head, tail);
> > if (p_occupancy >= pipe->max_usage)
> > return 0;
> > else
> > return pipe->max_usage - p_occupancy;
> 
> Because "pipe->max_usage - p_occupancy" can be negative.

Sure can.  And in that case you return 0; no problem with that.
It's what happens when occupancy is below max_usage that is weird.

> post_one_notification() is limited by pipe->ring_size, not pipe->max_usage.
> 
> The idea is to allow some slack in a watch pipe for the watch_queue code to
> use that userspace can't.

Sure.  And if this function is supposed to report how many times would
userspace be able to grab a slot, it's returning the wrong value.

Look: 32-slot ring.  max_usage is 16.  14 slots are already occupied.
Userland (sure as hell, anything in iov_iter.c) will be able to occupy
two more before it runs into the pipe_full().  And your function returns
min(32 - 14, 16), i.e. 16.

What am I missing here?



Re: [RFC][PATCH] fix short copy handling in copy_mc_pipe_to_iter()

2022-06-14 Thread David Howells
Al Viro  wrote:

> What's wrong with
> p_occupancy = pipe_occupancy(head, tail);
> if (p_occupancy >= pipe->max_usage)
> return 0;
>   else
>   return pipe->max_usage - p_occupancy;

Because "pipe->max_usage - p_occupancy" can be negative.

post_one_notification() is limited by pipe->ring_size, not pipe->max_usage.

The idea is to allow some slack in a watch pipe for the watch_queue code to
use that userspace can't.

David




Re: [RFC][PATCH] fix short copy handling in copy_mc_pipe_to_iter()

2022-06-13 Thread Al Viro
On Mon, Jun 13, 2022 at 11:28:34PM +0100, Al Viro wrote:
> On Mon, Jun 13, 2022 at 10:54:36AM -0700, Linus Torvalds wrote:
> > On Sun, Jun 12, 2022 at 5:10 PM Al Viro  wrote:
> > >
> > > Unlike other copying operations on ITER_PIPE, copy_mc_to_iter() can
> > > result in a short copy.  In that case we need to trim the unused
> > > buffers, as well as the length of partially filled one - it's not
> > > enough to set ->head, ->iov_offset and ->count to reflect how
> > > much had we copied.  Not hard to fix, fortunately...
> > >
> > > I'd put a helper (pipe_discard_from(pipe, head)) into pipe_fs_i.h,
> > > rather than iov_iter.c -
> > 
> > Actually, since this "copy_mc_xyz()" stuff is going to be entirely
> > impossible to debug and replicate for any normal situation, I would
> > suggest we take the approach that we (long ago) used to take with
> > copy_from_user(): zero out the destination buffer, so that developers
> > that can't test the faulting behavior don't have to worry about it.
> > 
> > And then the existing code is fine: it will break out of the loop, but
> > it won't do the odd revert games and the "randomnoise.len -= rem"
> > thing that I can't wrap my head around.
> > 
> > Hmm?
> 
> Not really - we would need to zero the rest of those pages somehow.
> They are already allocated and linked into pipe; leaving them
> there (and subsequent ones hadn't seen any stores whatsoever - they
> are fresh out of alloc_page(GFP_USER)) is a non-starter.
> 
> We could do allocation as we go, but that's a much more intrusive
> change...

FWIW, I've got quite a bit of cleanups in the local tree; reordering and
cleaning that queue up at the moment, will post tonight or tomorrow.

I've looked into doing allocations page-by-page (instead of single
push_pipe(), followed by copying into those).  Doable, but it ends
up being much messier.

IMO this "truncate on failure" approach is saner.



Re: [RFC][PATCH] fix short copy handling in copy_mc_pipe_to_iter()

2022-06-13 Thread Al Viro
On Tue, Jun 14, 2022 at 12:25:03AM +0100, Al Viro wrote:

> The more I'm looking at that thing, the more it smells like a bug;
> it had the same 3 callers since the time it had been introduced.
> 
> 1) pipe_get_pages().  We are about to try and allocate up to that
> many pipe buffers.  Allocation (done in push_pipe()) is done only
> if we have !pipe_full(pipe->head, pipe->tail, pipe->max_usage).
> 
> It simply won't give you more than max_usage - occupancy.
> Your function returns min(ring_size - occupancy, max_usage), which
> is always greater than or equal to that (ring_size >= max_usage).
> 
> 2) pipe_get_pages_alloc().  Same story, same push_pipe() being
> called, same "we'll never get that much - it'll hit the limit
> first".
> 
> 3) iov_iter_npages() in case of ITER_PIPE.  Again, the value
> is bogus - it should not be greater than the amount of pages
> we would be able to write there.
> 
> AFAICS, 6718b6f855a0 "pipe: Allow pipes to have kernel-reserved slots"
> broke it for cases when ring_size != max_usage...

Unless I'm missing something, the following would do the right thing.
Dave?

diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index 4ea496924106..c22173d6e500 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -165,15 +165,10 @@ static inline bool pipe_full(unsigned int head, unsigned 
int tail,
 static inline unsigned int pipe_space_for_user(unsigned int head, unsigned int 
tail,
   struct pipe_inode_info *pipe)
 {
-   unsigned int p_occupancy, p_space;
-
-   p_occupancy = pipe_occupancy(head, tail);
+   unsigned int p_occupancy = pipe_occupancy(head, tail);
if (p_occupancy >= pipe->max_usage)
return 0;
-   p_space = pipe->ring_size - p_occupancy;
-   if (p_space > pipe->max_usage)
-   p_space = pipe->max_usage;
-   return p_space;
+   return pipe->max_usage - p_occupancy;
 }
 
 /**



Re: [RFC][PATCH] fix short copy handling in copy_mc_pipe_to_iter()

2022-06-13 Thread Al Viro
On Mon, Jun 13, 2022 at 11:28:34PM +0100, Al Viro wrote:

> Dave, could you explain what's going on there?  Note that pipe_write()
> does *not* use that thing at all; it's only splice (i.e. ITER_PIPE
> stuff) that is using it.
> 
> What's wrong with
> p_occupancy = pipe_occupancy(head, tail);
> if (p_occupancy >= pipe->max_usage)
> return 0;
>   else
>   return pipe->max_usage - p_occupancy;
> 
> which would match the way you are using ->max_usage in pipe_write()
> et.al.  Including the use in copy_page_to_iter_pipe(), BTW...

The more I'm looking at that thing, the more it smells like a bug;
it had the same 3 callers since the time it had been introduced.

1) pipe_get_pages().  We are about to try and allocate up to that
many pipe buffers.  Allocation (done in push_pipe()) is done only
if we have !pipe_full(pipe->head, pipe->tail, pipe->max_usage).

It simply won't give you more than max_usage - occupancy.
Your function returns min(ring_size - occupancy, max_usage), which
is always greater than or equal to that (ring_size >= max_usage).

2) pipe_get_pages_alloc().  Same story, same push_pipe() being
called, same "we'll never get that much - it'll hit the limit
first".

3) iov_iter_npages() in case of ITER_PIPE.  Again, the value
is bogus - it should not be greater than the amount of pages
we would be able to write there.

AFAICS, 6718b6f855a0 "pipe: Allow pipes to have kernel-reserved slots"
broke it for cases when ring_size != max_usage...



Re: [RFC][PATCH] fix short copy handling in copy_mc_pipe_to_iter()

2022-06-13 Thread Al Viro
On Mon, Jun 13, 2022 at 10:54:36AM -0700, Linus Torvalds wrote:
> On Sun, Jun 12, 2022 at 5:10 PM Al Viro  wrote:
> >
> > Unlike other copying operations on ITER_PIPE, copy_mc_to_iter() can
> > result in a short copy.  In that case we need to trim the unused
> > buffers, as well as the length of partially filled one - it's not
> > enough to set ->head, ->iov_offset and ->count to reflect how
> > much had we copied.  Not hard to fix, fortunately...
> >
> > I'd put a helper (pipe_discard_from(pipe, head)) into pipe_fs_i.h,
> > rather than iov_iter.c -
> 
> Actually, since this "copy_mc_xyz()" stuff is going to be entirely
> impossible to debug and replicate for any normal situation, I would
> suggest we take the approach that we (long ago) used to take with
> copy_from_user(): zero out the destination buffer, so that developers
> that can't test the faulting behavior don't have to worry about it.
> 
> And then the existing code is fine: it will break out of the loop, but
> it won't do the odd revert games and the "randomnoise.len -= rem"
> thing that I can't wrap my head around.
> 
> Hmm?

Not really - we would need to zero the rest of those pages somehow.
They are already allocated and linked into pipe; leaving them
there (and subsequent ones hadn't seen any stores whatsoever - they
are fresh out of alloc_page(GFP_USER)) is a non-starter.

We could do allocation as we go, but that's a much more intrusive
change...

BTW, speaking of pipes:
static inline unsigned int pipe_space_for_user(unsigned int head, unsigned int 
tail,
   struct pipe_inode_info *pipe)
{
unsigned int p_occupancy, p_space;

p_occupancy = pipe_occupancy(head, tail);
if (p_occupancy >= pipe->max_usage)
return 0;
p_space = pipe->ring_size - p_occupancy;
if (p_space > pipe->max_usage)
p_space = pipe->max_usage;
return p_space;
}

OK, if head - tail >= max_usage, we get 0.  Fair enough, since
pipe_full() callers will get "it's full, sod off" in that situation.
But...  what the hell is the rest doing?  p_space is the amount of
slots not in use.  So we return the lesser of it and max_usage?

Suppose we have 128 slots in the ring, with max_usage being below
that (e.g. 64).  63 slots are in use; you can add at most one.
And p_space is 65, so this sucker will return 64.

Dave, could you explain what's going on there?  Note that pipe_write()
does *not* use that thing at all; it's only splice (i.e. ITER_PIPE
stuff) that is using it.

What's wrong with
p_occupancy = pipe_occupancy(head, tail);
if (p_occupancy >= pipe->max_usage)
return 0;
else
return pipe->max_usage - p_occupancy;

which would match the way you are using ->max_usage in pipe_write()
et.al.  Including the use in copy_page_to_iter_pipe(), BTW...



Re: [RFC][PATCH] fix short copy handling in copy_mc_pipe_to_iter()

2022-06-13 Thread Linus Torvalds
On Sun, Jun 12, 2022 at 5:10 PM Al Viro  wrote:
>
> Unlike other copying operations on ITER_PIPE, copy_mc_to_iter() can
> result in a short copy.  In that case we need to trim the unused
> buffers, as well as the length of partially filled one - it's not
> enough to set ->head, ->iov_offset and ->count to reflect how
> much had we copied.  Not hard to fix, fortunately...
>
> I'd put a helper (pipe_discard_from(pipe, head)) into pipe_fs_i.h,
> rather than iov_iter.c -

Actually, since this "copy_mc_xyz()" stuff is going to be entirely
impossible to debug and replicate for any normal situation, I would
suggest we take the approach that we (long ago) used to take with
copy_from_user(): zero out the destination buffer, so that developers
that can't test the faulting behavior don't have to worry about it.

And then the existing code is fine: it will break out of the loop, but
it won't do the odd revert games and the "randomnoise.len -= rem"
thing that I can't wrap my head around.

Hmm?

Linus



[RFC][PATCH] fix short copy handling in copy_mc_pipe_to_iter()

2022-06-12 Thread Al Viro
[commit in question sits in vfs.git#fixes]

Unlike other copying operations on ITER_PIPE, copy_mc_to_iter() can
result in a short copy.  In that case we need to trim the unused
buffers, as well as the length of partially filled one - it's not
enough to set ->head, ->iov_offset and ->count to reflect how
much had we copied.  Not hard to fix, fortunately...

I'd put a helper (pipe_discard_from(pipe, head)) into pipe_fs_i.h,
rather than iov_iter.c - it has nothing to do with iov_iter and
having it will allow us to avoid an ugly kludge in fs/splice.c.
We could put it into lib/iov_iter.c for now and move it later,
but I don't see the point going that way...

Fixes: ca146f6f091e "lib/iov_iter: Fix pipe handling in _copy_to_iter_mcsafe()"
Signed-off-by: Al Viro 
---
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index cb0fd633a610..4ea496924106 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -229,6 +229,15 @@ static inline bool pipe_buf_try_steal(struct 
pipe_inode_info *pipe,
return buf->ops->try_steal(pipe, buf);
 }
 
+static inline void pipe_discard_from(struct pipe_inode_info *pipe,
+   unsigned int old_head)
+{
+   unsigned int mask = pipe->ring_size - 1;
+
+   while (pipe->head > old_head)
+   pipe_buf_release(pipe, &pipe->bufs[--pipe->head & mask]);
+}
+
 /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual
memory allocation, whereas PIPE_BUF makes atomicity guarantees.  */
 #define PIPE_SIZE  PAGE_SIZE
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 0b64695ab632..2bf20b48a04a 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -689,6 +689,7 @@ static size_t copy_mc_pipe_to_iter(const void *addr, size_t 
bytes,
struct pipe_inode_info *pipe = i->pipe;
unsigned int p_mask = pipe->ring_size - 1;
unsigned int i_head;
+   unsigned int valid = pipe->head;
size_t n, off, xfer = 0;
 
if (!sanity(i))
@@ -702,11 +703,17 @@ static size_t copy_mc_pipe_to_iter(const void *addr, 
size_t bytes,
rem = copy_mc_to_kernel(p + off, addr + xfer, chunk);
chunk -= rem;
kunmap_local(p);
-   i->head = i_head;
-   i->iov_offset = off + chunk;
-   xfer += chunk;
-   if (rem)
+   if (chunk) {
+   i->head = i_head;
+   i->iov_offset = off + chunk;
+   xfer += chunk;
+   valid = i_head + 1;
+   }
+   if (rem) {
+   pipe->bufs[i_head & p_mask].len -= rem;
+   pipe_discard_from(pipe, valid);
break;
+   }
n -= chunk;
off = 0;
i_head++;