Re: UBSAN: shift-out-of-bounds in validate_sb_layout

2024-05-20 Thread Kent Overstreet
On Mon, May 20, 2024 at 10:31:53AM -0400, Steven Rostedt wrote:
> On Mon, 20 May 2024 15:02:26 +0800
> "Ubisectech Sirius"  wrote:
> 
> > Hello.
> > We are Ubisectech Sirius Team, the vulnerability lab of China ValiantSec. 
> > Recently, our team has discovered an issue in Linux kernel 6.7. Attached to 
> > the email was a PoC file of the issue.
> > 
> > Stack dump:
> > UBSAN: shift-out-of-bounds in fs/bcachefs/super-io.c:310:18
> > shift exponent 127 is too large for 32-bit type 'int'
> > CPU: 0 PID: 14408 Comm: syz-executor.3 Not tainted 6.7.0 #2
> > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 
> > 04/01/2014
> > Call Trace:
> >  
> >  __dump_stack lib/dump_stack.c:88 [inline]
> >  dump_stack_lvl+0x136/0x150 lib/dump_stack.c:106
> >  ubsan_epilogue lib/ubsan.c:217 [inline]
> >  __ubsan_handle_shift_out_of_bounds+0x24b/0x430 lib/ubsan.c:387
> >  validate_sb_layout.cold+0x1a/0x51 fs/bcachefs/super-io.c:310
> >  bch2_read_super+0x980/0x1000 fs/bcachefs/super-io.c:786
> >  bch2_fs_open+0x471/0x3890 fs/bcachefs/super.c:1922
> >  bch2_mount+0x538/0x13c0 fs/bcachefs/fs.c:1863
> >  legacy_get_tree+0x109/0x220 fs/fs_context.c:662
> >  vfs_get_tree+0x93/0x380 fs/super.c:1771
> >  do_new_mount fs/namespace.c:3337 [inline]
> >  path_mount+0x679/0x1e40 fs/namespace.c:3664
> >  do_mount fs/namespace.c:3677 [inline]
> >  __do_sys_mount fs/namespace.c:3886 [inline]
> >  __se_sys_mount fs/namespace.c:3863 [inline]
> >  __x64_sys_mount+0x287/0x310 fs/namespace.c:3863
> >  do_syscall_x64 arch/x86/entry/common.c:52 [inline]
> >  do_syscall_64+0x43/0x120 arch/x86/entry/common.c:83
> >  entry_SYSCALL_64_after_hwframe+0x6f/0x77
> > RIP: 0033:0x7f41e1091b3e
> > Code: 48 c7 c0 ff ff ff ff eb aa e8 be 0d 00 00 66 2e 0f 1f 84 00 00 00 00 
> > 00 0f 1f 40 00 f3 0f 1e fa 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff 
> > ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48
> > RSP: 002b:7f41e1d22e38 EFLAGS: 0202 ORIG_RAX: 00a5
> > RAX: ffda RBX: 5d82 RCX: 7f41e1091b3e
> > RDX: 20005d80 RSI: 2100 RDI: 7f41e1d22e90
> > RBP: 7f41e1d22ed0 R08: 7f41e1d22ed0 R09: 0080
> > R10: 0080 R11: 0202 R12: 20005d80
> > R13: 2100 R14: 7f41e1d22e90 R15: 20005e00
> >  
> > 
> > Thank you for taking the time to read this email and we look forward to 
> > working with you further.
> 
> I'm not sure why this is getting Cc'd to linux-trace-kernel. That's for
> anything to do with the tracing code (trace events, tracepoints, kprobes,
> uprobes, function tracer etc).
> 
> What part of tracing is this for?

Everything I've seen from Ubisectech has been duplicates of stuff syzbot
found a while ago that is already fixed.



Re: kernel BUG in ptr_stale

2024-05-09 Thread Kent Overstreet
On Thu, May 09, 2024 at 02:26:24PM +0800, Ubisectech Sirius wrote:
> Hello.
> We are Ubisectech Sirius Team, the vulnerability lab of China ValiantSec. 
> Recently, our team has discovered an issue in Linux kernel 6.7. Attached to 
> the email was a PoC file of the issue.

This (and several of your others) are fixed in Linus's tree.

> 
> Stack dump:
> 
> bcachefs (loop1): mounting version 1.7: (unknown version) 
> opts=metadata_checksum=none,data_checksum=none,nojournal_transaction_names
> [ cut here ]
> kernel BUG at fs/bcachefs/buckets.h:114!
> invalid opcode:  [#1] PREEMPT SMP KASAN NOPTI
> CPU: 1 PID: 9472 Comm: syz-executor.1 Not tainted 6.7.0 #2
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 
> 04/01/2014
> RIP: 0010:bucket_gen fs/bcachefs/buckets.h:114 [inline]
> RIP: 0010:ptr_stale+0x474/0x4e0 fs/bcachefs/buckets.h:188
> Code: 48 c7 c2 80 8c 1b 8b be 67 00 00 00 48 c7 c7 e0 8c 1b 8b c6 05 ea a6 72 
> 0b 01 e8 57 55 9c fd e9 fb fc ff ff e8 9d 02 bd fd 90 <0f> 0b 48 89 04 24 e8 
> 31 bb 13 fe 48 8b 04 24 e9 35 fc ff ff e8 23
> RSP: 0018:c90007c4ec38 EFLAGS: 00010246
> RAX: 0004 RBX: 0080 RCX: c90002679000
> RDX: 0004 RSI: 83ccf3b3 RDI: 0006
> RBP:  R08: 0006 R09: 1028
> R10: 0080 R11:  R12: 1028
> R13: 88804dee5100 R14:  R15: 88805b1a4110
> FS:  7f79ba8ab640() GS:88807ec0() knlGS:
> CS:  0010 DS:  ES:  CR0: 80050033
> CR2: 7f0bbda3f000 CR3: 5f37a000 CR4: 00750ef0
> DR0:  DR1:  DR2: 
> DR3:  DR6: fffe0ff0 DR7: 0400
> PKRU: 5554
> Call Trace:
>  
>  bch2_bkey_ptrs_to_text+0xb4e/0x1760 fs/bcachefs/extents.c:1012
>  bch2_btree_ptr_v2_to_text+0x288/0x330 fs/bcachefs/extents.c:215
>  bch2_val_to_text fs/bcachefs/bkey_methods.c:287 [inline]
>  bch2_bkey_val_to_text+0x1c8/0x210 fs/bcachefs/bkey_methods.c:297
>  journal_validate_key+0x7ab/0xb50 fs/bcachefs/journal_io.c:322
>  journal_entry_btree_root_validate+0x31c/0x380 fs/bcachefs/journal_io.c:411
>  bch2_journal_entry_validate+0xc7/0x130 fs/bcachefs/journal_io.c:752
>  bch2_sb_clean_validate_late+0x14b/0x1e0 fs/bcachefs/sb-clean.c:32
>  bch2_read_superblock_clean+0xbb/0x250 fs/bcachefs/sb-clean.c:160
>  bch2_fs_recovery+0x113/0x52d0 fs/bcachefs/recovery.c:691
>  bch2_fs_start+0x365/0x5e0 fs/bcachefs/super.c:978
>  bch2_fs_open+0x1ac9/0x3890 fs/bcachefs/super.c:1968
>  bch2_mount+0x538/0x13c0 fs/bcachefs/fs.c:1863
>  legacy_get_tree+0x109/0x220 fs/fs_context.c:662
>  vfs_get_tree+0x93/0x380 fs/super.c:1771
>  do_new_mount fs/namespace.c:3337 [inline]
>  path_mount+0x679/0x1e40 fs/namespace.c:3664
>  do_mount fs/namespace.c:3677 [inline]
>  __do_sys_mount fs/namespace.c:3886 [inline]
>  __se_sys_mount fs/namespace.c:3863 [inline]
>  __x64_sys_mount+0x287/0x310 fs/namespace.c:3863
>  do_syscall_x64 arch/x86/entry/common.c:52 [inline]
>  do_syscall_64+0x43/0x120 arch/x86/entry/common.c:83
>  entry_SYSCALL_64_after_hwframe+0x6f/0x77
> RIP: 0033:0x7f79b9a91b3e
> Code: 48 c7 c0 ff ff ff ff eb aa e8 be 0d 00 00 66 2e 0f 1f 84 00 00 00 00 00 
> 0f 1f 40 00 f3 0f 1e fa 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 
> 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48
> RSP: 002b:7f79ba8aae38 EFLAGS: 0202 ORIG_RAX: 00a5
> RAX: ffda RBX: 000119f4 RCX: 7f79b9a91b3e
> RDX: 20011a00 RSI: 20011a40 RDI: 7f79ba8aae90
> RBP: 7f79ba8aaed0 R08: 7f79ba8aaed0 R09: 0181c050
> R10: 0181c050 R11: 0202 R12: 20011a00
> R13: 20011a40 R14: 7f79ba8aae90 R15: 21c0
>  
> Modules linked in:
> ---[ end trace  ]---
> 
> 
> Thank you for taking the time to read this email and we look forward to 
> working with you further.
> 
> 
> 
> 
> 
> 





Re: [PATCH v4 00/15] mm: jit/text allocator

2024-04-11 Thread Kent Overstreet
On Thu, Apr 11, 2024 at 07:00:36PM +0300, Mike Rapoport wrote:
> From: "Mike Rapoport (IBM)" 
> 
> Hi,
> 
> Since v3 I looked into making execmem more of a utility toolbox, as we
> discussed at LPC with Mark Rutland, but it was getting hairier than
> having a struct describing architecture constraints and a type identifying
> the consumer of execmem.
> 
> And I do think that having the description of architecture constraints for
> allocations of executable memory in a single place is better than having it
> spread all over the place.
> 
> The patches available via git:
> https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git/log/?h=execmem/v4
> 
> v4 changes:
> * rebase on v6.9-rc2
> * rename execmem_params to execmem_info and execmem_arch_params() to
>   execmem_arch_setup()
> * use a single execmem_alloc() API instead of execmem_{text,data}_alloc() (Song)
> * avoid extra copy of execmem parameters (Rick)
> * run execmem_init() as core_initcall() except for the architectures that
>   may allocate text really early (currently only x86) (Will)
> * add acks for some of arm64 and riscv changes, thanks Will and Alexandre
> * new commits:
>   - drop call to kasan_alloc_module_shadow() on arm64 because it's not
> needed anymore
>   - rename MODULE_START to MODULES_VADDR on MIPS
>   - use CONFIG_EXECMEM instead of CONFIG_MODULES on powerpc as per Christophe:
> 
> https://lore.kernel.org/all/79062fa3-3402-47b3-8920-9231ad05e...@csgroup.eu/
> 
> v3: https://lore.kernel.org/all/20230918072955.2507221-1-r...@kernel.org
> * add type parameter to execmem allocation APIs
> * remove BPF dependency on modules
> 
> v2: https://lore.kernel.org/all/20230616085038.4121892-1-r...@kernel.org
> * Separate "module" and "others" allocations with execmem_text_alloc()
> and jit_text_alloc()
> * Drop ROX entailment on x86
> * Add ack for nios2 changes, thanks Dinh Nguyen
> 
> v1: https://lore.kernel.org/all/20230601101257.530867-1-r...@kernel.org
> 
> = Cover letter from v1 (slightly updated) =
> 
> module_alloc() is used everywhere as a means to allocate memory for code.
> 
> Besides being semantically wrong, this unnecessarily ties all subsystems
> that need to allocate code, such as ftrace, kprobes and BPF, to modules and
> puts the burden of code allocation on the modules code.
> 
> Several architectures override module_alloc() because of various
> constraints where the executable memory can be located and this causes
> additional obstacles for improvements of code allocation.
> 
> A centralized infrastructure for code allocation allows allocations of
> executable memory as ROX, and future optimizations such as caching large
> pages for better iTLB performance and providing sub-page allocations for
> users that only need small jit code snippets.
> 
> Rick Edgecombe proposed perm_alloc extension to vmalloc [1] and Song Liu
> proposed execmem_alloc [2], but both these approaches were targeting BPF
> allocations and lacked the ground work to abstract executable allocations
> and split them from the modules core.
> 
> Thomas Gleixner suggested to express module allocation restrictions and
> requirements as struct mod_alloc_type_params [3] that would define ranges,
> protections and other parameters for different types of allocations used by
> modules and following that suggestion Song separated allocations of
> different types in modules (commit ac3b43283923 ("module: replace
> module_layout with module_memory")) and posted "Type aware module
> allocator" set [4].
> 
> I liked the idea of parametrising code allocation requirements as a
> structure, but I believe the original proposal and Song's module allocator
> was too module centric, so I came up with these patches.
> 
> This set splits code allocation from modules by introducing execmem_alloc()
> and execmem_free() APIs, replaces call sites of module_alloc() and
> module_memfree() with the new APIs and implements core text and related
> allocations in a central place.
> 
> Instead of architecture specific overrides for module_alloc(), the
> architectures that require non-default behaviour for text allocation must
> fill execmem_info structure and implement execmem_arch_setup() that returns
> a pointer to that structure. If an architecture does not implement
> execmem_arch_setup(), the defaults compatible with the current
> modules::module_alloc() are used.
> 
> Since architectures define different restrictions on placement,
> permissions, alignment and other parameters for memory that can be used by
> different subsystems that allocate executable memory, execmem APIs
> take a type argument, that will be used to identify the calling subsystem
> and to allow architectures to define parameters for ranges suitable for that
> subsystem.
> 
> The new infrastructure allows decoupling of BPF, kprobes and ftrace from
> modules, and most importantly it paves the way for ROX allocations for
> executable memory.

It looks like you're just doing API cleanup first, then 
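For readers skimming the cover letter, a minimal sketch of what a converted
call site looks like under the API described above. The header name and the
EXECMEM_KPROBES type value are assumptions based on the series description,
not something verified against v4:

#include <linux/execmem.h>      /* assumed header introduced by the series */

/*
 * Hypothetical kprobes-style call site after conversion: instead of
 * module_alloc(), the caller passes a type identifying the subsystem so
 * the architecture's execmem_info can supply the right range and
 * permissions for that kind of allocation.
 */
static void *alloc_insn_page_sketch(void)
{
        return execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
}

static void free_insn_page_sketch(void *page)
{
        execmem_free(page);
}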

Re: [FYI][PATCH] tracing/treewide: Remove second parameter of __assign_str()

2024-02-23 Thread Kent Overstreet
On Fri, Feb 23, 2024 at 01:46:53PM -0500, Steven Rostedt wrote:
> On Fri, 23 Feb 2024 10:30:45 -0800
> Jeff Johnson  wrote:
> 
> > On 2/23/2024 9:56 AM, Steven Rostedt wrote:
> > > From: "Steven Rostedt (Google)" 
> > > 
> > > [
> > >This is a treewide change. I will likely re-create this patch again in
> > >the second week of the merge window of v6.9 and submit it then. Hoping
> > >to keep the conflicts that it will cause to a minimum.
> > > ]
> > > 
> > > With the rework of how the __string() handles dynamic strings where it
> > > saves off the source string in a field in the helper structure[1], the
> > > assignment of that value to the trace event field is stored in the helper
> > > value and does not need to be passed in again.  
> > 
> > Just curious if this could be done piecemeal by first changing the
> > macros to be variadic macros which allows you to ignore the extra
> > argument. The callers could then be modified in their separate trees.
> > And then once all the callers have been merged, the macros could be
> > changed to no longer be variadic.
> 
> I weighed doing that, but I think ripping off the band-aid is a better
> approach. One thing I found is that leaving unused parameters in the macros
> can cause bugs itself. I found one case doing my clean up, where an unused
> parameter in one of the macros was bogus, and when I made it a used
> parameter, it broke the build.
> 
> I think for tree-wide changes, the preferred approach is to do one big
> patch at once. And since this only affects TRACE_EVENT() macros, it
> hopefully would not be too much of a burden (although out of tree users may
> suffer from this, but do we care?)

Agreed on doing it all at once, it'll be way less spam for people to
deal with.

Tangentially related though, what would make me really happy is if we
could create the string within the TP_fast_assign() section. I have to
have a bunch of annoying wrappers right now because the string length
has to be known when we invoke the tracepoint.
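To make the treewide change concrete, here is a hedged before/after sketch of
a single TRACE_EVENT(). The event and field names are made up, and the usual
trace header boilerplate (TRACE_SYSTEM, CREATE_TRACE_POINTS, etc.) is omitted:

#include <linux/tracepoint.h>

TRACE_EVENT(foo_bar,
        TP_PROTO(const char *name),
        TP_ARGS(name),
        TP_STRUCT__entry(
                __string(name, name)    /* __string() records the source string */
        ),
        TP_fast_assign(
                /* after the rework; previously: __assign_str(name, name); */
                __assign_str(name);
        ),
        TP_printk("%s", __get_str(name))
);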



Re: linux-next: Tree for Sep 12 (bcachefs)

2023-09-14 Thread Kent Overstreet
On Wed, Sep 13, 2023 at 06:17:00PM -0700, Kees Cook wrote:
> On Tue, Sep 12, 2023 at 03:26:45PM +1000, Stephen Rothwell wrote:
> > New tree: bcachefs
> 
> Thanks for going through and fixing all the fake flexible array members.
> It looks much nicer. :)
> 
> I have some questions about the remaining "markers", for example:
> 
> $ git grep -A8 '\bkey_start\b' -- fs/bcachefs
> fs/bcachefs/bcachefs_format.h:  __u8key_start[0];
> ...
> fs/bcachefs/bcachefs_format.h-  __u8pad[sizeof(struct bkey) - 3];
> --
> fs/bcachefs/bkey.c: u8 *l = k->key_start;
> 
> Why isn't this just:
> 
>   u8 *l = k->pad
> 
> and you can drop the marker?

In this case, it's documentation. k->pad tells us nothing; why is pad
significant? k->key_start documents the intent better.

> And some seem entirely unused, like all of "struct bch_reflink_v".

No, those aren't unused :)

bcachefs does the "list of variable size items" a lot - see vstructs.h.
start[] is the type of the item being stored, _data is what we use for
pointer arithmetic - because we always store sizes in units of u64s, for
alignment.
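As a rough illustration of that pattern (simplified and made up, not the
actual bcachefs definitions in vstructs.h):

#include <linux/types.h>

/*
 * "List of variable size items": start[] gives typed access to the items
 * and doubles as a marker, _data[] exists only for pointer arithmetic,
 * because sizes are stored in units of u64s for alignment.
 */
struct example_entry {
        __u64   v[2];
};

struct example_field {
        __u32   u64s;                           /* payload size, in u64 units */
        __u32   pad;
        union {
                struct example_entry    start[0];
                __u64                   _data[0];
        };
};

/* the next field begins u64s 64-bit words past the start of the payload: */
static inline void *example_field_next(struct example_field *f)
{
        return &f->_data[f->u64s];
}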

> 
> And some are going to fail at runtime, since they're still zero-sized
> and being used as an actual array:
> 
> struct bch_sb_field_journal_seq_blacklist {
> struct bch_sb_field field;
> 
> struct journal_seq_blacklist_entry start[0];
> __u64   _data[];
> };
> ...
> > memmove(&bl->start[i],
> > &bl->start[i + 1],
> sizeof(bl->start[0]) * (nr - i));
> 
> It looks like you just want a type union for the flexible array.
> This can be done like this:
> 
> struct bch_sb_field_journal_seq_blacklist {
> struct bch_sb_field field;
> 
>   union {
>   DECLARE_FLEX_ARRAY(struct journal_seq_blacklist_entry, start);
>   DECLARE_FLEX_ARRAY(__u64, _data);
>   };
> };

Eesh, why though?

Honestly, I'm not a fan of the change to get rid of zero size arrays,
this seems to be adding a whole lot of macro layering and indirection
for nothing.

The only thing a zero size array could possibly be is a flexible array
member or a marker, why couldn't we have just kept treating zero size
arrays like flexible array members?


Re: [GIT PULL] bcachefs

2023-09-09 Thread Kent Overstreet
On Wed, Sep 06, 2023 at 01:20:59PM -0700, Linus Torvalds wrote:
> On Wed, 6 Sept 2023 at 13:02, Linus Torvalds
>  wrote:
> >
> > And guess what happens when you have (unsigned char)-1? It does *not*
> > cast back to -1.
> 
> Side note: again, this may be one of those "it works in practice",
> because if we have -fshort-enums, I think 'enum
> btree_node_locked_type' in turn ends up being represented as a 'signed
> char', because that's the smallest simple type that can fit all those
> values.
> 
> I don't think gcc ever uses less than that (ie while a six_lock_type
> could fit in two bits, it's still going to be considered at least a
> 8-bit value in practice).
> 
> So we may have 'enum six_lock_type' essentially being 'unsigned char',
> and when the code does
> 
> mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED);
> 
> that BTREE_NODE_UNLOCKED value might actually be 255.
> 
> And then when it's cast to 'enum btree_node_locked_type' in the inline
> function, the 255 will be cast to 'signed char', and we'll end up
> compatible with '(enum btree_node_locked_type)-1' again.
> 
> So it's one of those things that are seriously wrong to do, but might
> generate the expected code anyway.
> 
> Unless the compiler adds any other sanity checks, like UBSAN or
> something, that actually uses the exact range of the enums.
> 
> It could happen even without UBSAN, if the compiler ends up going "I
> can see that the original value came from a 'enum six_lock_type', so I
> know the original value can't be signed, so any comparison with
> BTREE_NODE_UNLOCKED can never be true.
> 
> But again, I suspect that in practice this all just happens to work.
> That doesn't make it right.

No, this was just broken - it should have been
mark_btree_node_unlocked(), we never should've been passing that enum
val there.
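As a standalone illustration of the pitfall Linus describes (plain C, nothing
bcachefs-specific):

#include <stdio.h>

/*
 * Once -1 has been narrowed to an unsigned char it is 255, and comparing it
 * against -1 again goes through integer promotion, so the comparison is
 * false; squeezing it back through a signed char recovers -1 on the usual
 * two's-complement ABIs (strictly, that conversion is implementation-defined).
 */
int main(void)
{
        unsigned char u = (unsigned char)-1;                    /* 255 */
        signed char   s = (signed char)(unsigned char)-1;       /* typically -1 */

        printf("u == -1? %s\n", u == -1 ? "yes" : "no");        /* prints "no" */
        printf("s == -1? %s\n", s == -1 ? "yes" : "no");        /* prints "yes" */
        return 0;
}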


[PATCH 03/11] tools/testing/nvdimm: Convert to printbuf

2022-08-15 Thread Kent Overstreet
From: Kent Overstreet 

This converts from seq_buf to printbuf. Here we're using printbuf with
an external buffer, meaning it's a direct conversion.

Signed-off-by: Kent Overstreet 
Cc: Dan Williams 
Cc: Dave Hansen 
Cc: nvd...@lists.linux.dev
Acked-by: Dan Williams 
Tested-By: Shivaprasad G Bhat 
---
 tools/testing/nvdimm/test/ndtest.c | 22 ++
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/tools/testing/nvdimm/test/ndtest.c 
b/tools/testing/nvdimm/test/ndtest.c
index 4d1a947367..a2097955da 100644
--- a/tools/testing/nvdimm/test/ndtest.c
+++ b/tools/testing/nvdimm/test/ndtest.c
@@ -12,7 +12,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 
 #include "../watermark.h"
 #include "nfit_test.h"
@@ -740,32 +740,30 @@ static ssize_t flags_show(struct device *dev,
 {
struct nvdimm *nvdimm = to_nvdimm(dev);
struct ndtest_dimm *dimm = nvdimm_provider_data(nvdimm);
-   struct seq_buf s;
+   struct printbuf s = PRINTBUF_EXTERN(buf, PAGE_SIZE);
u64 flags;
 
flags = dimm->flags;
 
-   seq_buf_init(&s, buf, PAGE_SIZE);
if (flags & PAPR_PMEM_UNARMED_MASK)
-   seq_buf_printf(&s, "not_armed ");
+   prt_printf(&s, "not_armed ");

if (flags & PAPR_PMEM_BAD_SHUTDOWN_MASK)
-   seq_buf_printf(&s, "flush_fail ");
+   prt_printf(&s, "flush_fail ");

if (flags & PAPR_PMEM_BAD_RESTORE_MASK)
-   seq_buf_printf(&s, "restore_fail ");
+   prt_printf(&s, "restore_fail ");

if (flags & PAPR_PMEM_SAVE_MASK)
-   seq_buf_printf(&s, "save_fail ");
+   prt_printf(&s, "save_fail ");

if (flags & PAPR_PMEM_SMART_EVENT_MASK)
-   seq_buf_printf(&s, "smart_notify ");
+   prt_printf(&s, "smart_notify ");

+   if (printbuf_written(&s))
+   prt_printf(&s, "\n");

-   if (seq_buf_used(&s))
-   seq_buf_printf(&s, "\n");
-
-   return seq_buf_used(&s);
+   return printbuf_written(&s);
 }
 static DEVICE_ATTR_RO(flags);
 
-- 
2.36.1




Re: [PATCH v5 24/32] tools/testing/nvdimm: Convert to printbuf

2022-08-08 Thread Kent Overstreet

On 8/8/22 14:30, Dan Williams wrote:

Matthew Wilcox (Oracle) wrote:

From: Kent Overstreet 

This converts from seq_buf to printbuf. Here we're using printbuf with
an external buffer, meaning it's a direct conversion.

Signed-off-by: Kent Overstreet 
Cc: Dan Williams 
Cc: Dave Hansen 
Cc: nvd...@lists.linux.dev


My Acked-by still applies:

https://lore.kernel.org/all/62b61165348f4_a7a2f294d0@dwillia2-xfh.notmuch/

...and Shivaprasad's Tested-by should still apply:

https://lore.kernel.org/all/b299ebe2-88e5-c2bd-bad0-bef62d4ac...@linux.ibm.com/


Whoops - got them now, thanks!



[PATCH v4 26/34] tools/testing/nvdimm: Convert to printbuf

2022-06-19 Thread Kent Overstreet
This converts from seq_buf to printbuf. Here we're using printbuf with
an external buffer, meaning it's a direct conversion.

Signed-off-by: Kent Overstreet 
Cc: Dan Williams 
Cc: Dave Hansen 
Cc: nvd...@lists.linux.dev
---
 tools/testing/nvdimm/test/ndtest.c | 22 ++
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/tools/testing/nvdimm/test/ndtest.c 
b/tools/testing/nvdimm/test/ndtest.c
index 4d1a947367..a2097955da 100644
--- a/tools/testing/nvdimm/test/ndtest.c
+++ b/tools/testing/nvdimm/test/ndtest.c
@@ -12,7 +12,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 
 #include "../watermark.h"
 #include "nfit_test.h"
@@ -740,32 +740,30 @@ static ssize_t flags_show(struct device *dev,
 {
struct nvdimm *nvdimm = to_nvdimm(dev);
struct ndtest_dimm *dimm = nvdimm_provider_data(nvdimm);
-   struct seq_buf s;
+   struct printbuf s = PRINTBUF_EXTERN(buf, PAGE_SIZE);
u64 flags;
 
flags = dimm->flags;
 
-   seq_buf_init(&s, buf, PAGE_SIZE);
if (flags & PAPR_PMEM_UNARMED_MASK)
-   seq_buf_printf(&s, "not_armed ");
+   prt_printf(&s, "not_armed ");

if (flags & PAPR_PMEM_BAD_SHUTDOWN_MASK)
-   seq_buf_printf(&s, "flush_fail ");
+   prt_printf(&s, "flush_fail ");

if (flags & PAPR_PMEM_BAD_RESTORE_MASK)
-   seq_buf_printf(&s, "restore_fail ");
+   prt_printf(&s, "restore_fail ");

if (flags & PAPR_PMEM_SAVE_MASK)
-   seq_buf_printf(&s, "save_fail ");
+   prt_printf(&s, "save_fail ");

if (flags & PAPR_PMEM_SMART_EVENT_MASK)
-   seq_buf_printf(&s, "smart_notify ");
+   prt_printf(&s, "smart_notify ");

+   if (printbuf_written(&s))
+   prt_printf(&s, "\n");

-   if (seq_buf_used(&s))
-   seq_buf_printf(&s, "\n");
-
-   return seq_buf_used(&s);
+   return printbuf_written(&s);
 }
 static DEVICE_ATTR_RO(flags);
 
-- 
2.36.1




[PATCH v3 26/33] tools/testing/nvdimm: Convert to printbuf

2022-06-04 Thread Kent Overstreet
This converts from seq_buf to printbuf. Here we're using printbuf with
an external buffer, meaning it's a direct conversion.

Signed-off-by: Kent Overstreet 
Cc: Dan Williams 
Cc: Dave Hansen 
Cc: nvd...@lists.linux.dev
---
 tools/testing/nvdimm/test/ndtest.c | 22 ++
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/tools/testing/nvdimm/test/ndtest.c 
b/tools/testing/nvdimm/test/ndtest.c
index 4d1a947367..a2097955da 100644
--- a/tools/testing/nvdimm/test/ndtest.c
+++ b/tools/testing/nvdimm/test/ndtest.c
@@ -12,7 +12,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 
 #include "../watermark.h"
 #include "nfit_test.h"
@@ -740,32 +740,30 @@ static ssize_t flags_show(struct device *dev,
 {
struct nvdimm *nvdimm = to_nvdimm(dev);
struct ndtest_dimm *dimm = nvdimm_provider_data(nvdimm);
-   struct seq_buf s;
+   struct printbuf s = PRINTBUF_EXTERN(buf, PAGE_SIZE);
u64 flags;
 
flags = dimm->flags;
 
-   seq_buf_init(&s, buf, PAGE_SIZE);
if (flags & PAPR_PMEM_UNARMED_MASK)
-   seq_buf_printf(&s, "not_armed ");
+   prt_printf(&s, "not_armed ");

if (flags & PAPR_PMEM_BAD_SHUTDOWN_MASK)
-   seq_buf_printf(&s, "flush_fail ");
+   prt_printf(&s, "flush_fail ");

if (flags & PAPR_PMEM_BAD_RESTORE_MASK)
-   seq_buf_printf(&s, "restore_fail ");
+   prt_printf(&s, "restore_fail ");

if (flags & PAPR_PMEM_SAVE_MASK)
-   seq_buf_printf(&s, "save_fail ");
+   prt_printf(&s, "save_fail ");

if (flags & PAPR_PMEM_SMART_EVENT_MASK)
-   seq_buf_printf(&s, "smart_notify ");
+   prt_printf(&s, "smart_notify ");

+   if (printbuf_written(&s))
+   prt_printf(&s, "\n");

-   if (seq_buf_used(&s))
-   seq_buf_printf(&s, "\n");
-
-   return seq_buf_used(&s);
+   return printbuf_written(&s);
 }
 static DEVICE_ATTR_RO(flags);
 
-- 
2.36.0




[PATCH v2 25/28] tools/testing/nvdimm: Convert to printbuf

2022-05-19 Thread Kent Overstreet
This converts from seq_buf to printbuf. Here we're using printbuf with
an external buffer, meaning it's a direct conversion.

Signed-off-by: Kent Overstreet 
Cc: Dan Williams 
Cc: Dave Hansen 
Cc: nvd...@lists.linux.dev
---
 tools/testing/nvdimm/test/ndtest.c | 22 ++
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/tools/testing/nvdimm/test/ndtest.c 
b/tools/testing/nvdimm/test/ndtest.c
index 3ca7c32e93..e9b642f7f8 100644
--- a/tools/testing/nvdimm/test/ndtest.c
+++ b/tools/testing/nvdimm/test/ndtest.c
@@ -12,7 +12,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 
 #include "../watermark.h"
 #include "nfit_test.h"
@@ -797,32 +797,30 @@ static ssize_t flags_show(struct device *dev,
 {
struct nvdimm *nvdimm = to_nvdimm(dev);
struct ndtest_dimm *dimm = nvdimm_provider_data(nvdimm);
-   struct seq_buf s;
+   struct printbuf s = PRINTBUF_EXTERN(buf, PAGE_SIZE);
u64 flags;
 
flags = dimm->flags;
 
-   seq_buf_init(&s, buf, PAGE_SIZE);
if (flags & PAPR_PMEM_UNARMED_MASK)
-   seq_buf_printf(&s, "not_armed ");
+   pr_buf(&s, "not_armed ");

if (flags & PAPR_PMEM_BAD_SHUTDOWN_MASK)
-   seq_buf_printf(&s, "flush_fail ");
+   pr_buf(&s, "flush_fail ");

if (flags & PAPR_PMEM_BAD_RESTORE_MASK)
-   seq_buf_printf(&s, "restore_fail ");
+   pr_buf(&s, "restore_fail ");

if (flags & PAPR_PMEM_SAVE_MASK)
-   seq_buf_printf(&s, "save_fail ");
+   pr_buf(&s, "save_fail ");

if (flags & PAPR_PMEM_SMART_EVENT_MASK)
-   seq_buf_printf(&s, "smart_notify ");
+   pr_buf(&s, "smart_notify ");

+   if (printbuf_written(&s))
+   pr_buf(&s, "\n");

-   if (seq_buf_used(&s))
-   seq_buf_printf(&s, "\n");
-
-   return seq_buf_used(&s);
+   return printbuf_written(&s);
 }
 static DEVICE_ATTR_RO(flags);
 
-- 
2.36.0




Re: [RFC PATCH] percpu_ref: Make percpu_ref_tryget*() ACQUIRE operations

2021-04-15 Thread Kent Overstreet
On Thu, Apr 15, 2021 at 09:42:56PM -0700, Paul E. McKenney wrote:
> On Tue, Apr 13, 2021 at 10:47:03AM +0800, Huang Ying wrote:
> > One typical use case of percpu_ref_tryget() family functions is as
> > follows,
> > 
> >   if (percpu_ref_tryget(&p->ref)) {
> >   /* Operate on the other fields of *p */
> >   }
> > 
> > The refcount needs to be checked before operating on the other fields
> > of the data structure (*p), otherwise, the values gotten from the
> > other fields may be invalid or inconsistent.  To guarantee the correct
> > memory ordering, percpu_ref_tryget*() needs to be the ACQUIRE
> > operations.
> 
> I am not seeing the need for this.
> 
> If __ref_is_percpu() returns true, then the overall count must be non-zero
> and there will be an RCU grace period between now and the time that this
> count becomes zero.  For the calls to __ref_is_percpu() enclosed within
> rcu_read_lock() and rcu_read_unlock(), the grace period will provide
> the needed ordering.  (See the comment header for the synchronize_rcu()
> function.)
> 
> Otherwise, when __ref_is_percpu() returns false, its caller does a
> value-returning atomic read-modify-write operation, which provides
> full ordering.
> 
> Either way, the required acquire semantics (and more) are already
> provided, and in particular, this analysis covers the percpu_ref_tryget()
> you call out above.
> 
> Or am I missing something subtle here?

I think you're right, but some details about the race we're concerned about
would be helpful. Are we concerned about seeing values from after the ref has
hit 0? In that case I agree with Paul. Or is the concern about seeing values
from before a transition from 0 to nonzero? That wasn't a concern when I wrote
the code for the patterns of use I had in mind, but Tejun's done some stuff with
the code since.

Huang, can you elaborate?
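For reference, a sketch of the usage pattern under discussion, annotated with
where the ordering comes from per Paul's explanation (struct and field names
are illustrative):

#include <linux/percpu-refcount.h>

struct obj {
        struct percpu_ref       ref;
        int                     payload;
};

/*
 * No extra ACQUIRE is needed inside percpu_ref_tryget(): in the percpu
 * path the ref cannot reach zero without an intervening RCU grace period,
 * and in the atomic fallback path the value-returning atomic RMW already
 * provides full ordering.
 */
static int use_obj(struct obj *p)
{
        int val = -1;

        if (percpu_ref_tryget(&p->ref)) {
                val = p->payload;       /* guarded by the successful tryget */
                percpu_ref_put(&p->ref);
        }
        return val;
}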


Re: [PATCH 3/3] vfs: inode cache conversion to hash-bl

2021-04-06 Thread Kent Overstreet
On Tue, Apr 06, 2021 at 10:33:43PM +1000, Dave Chinner wrote:
> From: Dave Chinner 
> 
> Because scalability of the global inode_hash_lock really, really
> sucks and prevents me from doing scalability characterisation and
> analysis of bcachefs algorithms.
> 
> Profiles of a 32-way concurrent create of 51.2m inodes with fsmark
> on a couple of different filesystems on a 5.10 kernel:
> 
> -   52.13% 0.04%  [kernel][k] ext4_create
>- 52.09% ext4_create
>   - 41.03% __ext4_new_inode
>  - 29.92% insert_inode_locked
> - 25.35% _raw_spin_lock
>- do_raw_spin_lock
>   - 24.97% __pv_queued_spin_lock_slowpath
> 
> 
> -   72.33% 0.02%  [kernel][k] do_filp_open
>- 72.31% do_filp_open
>   - 72.28% path_openat
>  - 57.03% bch2_create
> - 56.46% __bch2_create
>- 40.43% inode_insert5
>   - 36.07% _raw_spin_lock
>  - do_raw_spin_lock
>   35.86% __pv_queued_spin_lock_slowpath
> 4.02% find_inode
> 
> btrfs was tested but it is limited by internal lock contention at
> >=2 threads on this workload, so never hammers the inode cache lock
> hard enough for this change to matter to its performance.
> 
> However, both bcachefs and ext4 demonstrate poor scaling at >=8
> threads on concurrent lookup or create workloads.
> 
> Hence convert the inode hash table to an RCU-aware hash-bl table just
> like the dentry cache. Note that we need to store a pointer to the
> hlist_bl_head the inode has been added to in the inode so that when
> it comes to unhash the inode we know what list to lock. We need to
> do this because, unlike the dentry cache, the hash value that is
> used to hash the inode is not generated from the inode itself. i.e.
> filesystems can provide this themselves so we have to either store
> the hashval or the hlist head pointer in the inode to be able to
> find the right list head for removal...
> 
> Concurrent create with varying thread count (files/s):
> 
>               ext4                    bcachefs
> threads       vanilla  patched        vanilla  patched
> 2             117k     112k            80k      85k
> 4             185k     190k           133k     145k
> 8             303k     346k           185k     255k
> 16            389k     465k           190k     420k
> 32            360k     437k           142k     481k
> 
> CPU usage for both bcachefs and ext4 at 16 and 32 threads has been
> halved on the patched kernel, while performance has increased
> marginally on ext4 and massively on bcachefs. Internal filesystem
> algorithms now limit performance on these workloads, not the global
> inode_hash_lock.
> 
> Profile of the workloads on the patched kernels:
> 
> -   35.94% 0.07%  [kernel]  [k] ext4_create
>- 35.87% ext4_create
>   - 20.45% __ext4_new_inode
> ...
>3.36% insert_inode_locked
> 
>- 78.43% do_filp_open
>   - 78.36% path_openat
>  - 53.95% bch2_create
> - 47.99% __bch2_create
> 
>   - 7.57% inode_insert5
>     6.94% find_inode
> 
> Spinlock contention is largely gone from the inode hash operations
> and the filesystems are limited by contention in their internal
> algorithms.
> 
> Signed-off-by: Dave Chinner 

Reviewed-and-tested-by: Kent Overstreet 

> ---
>  fs/inode.c | 200 -
>  include/linux/fs.h |   9 +-
>  2 files changed, 132 insertions(+), 77 deletions(-)
> 
> diff --git a/fs/inode.c b/fs/inode.c
> index b8d9eb3454dc..867af386177b 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -57,8 +57,7 @@
>  
>  static unsigned int i_hash_mask __read_mostly;
>  static unsigned int i_hash_shift __read_mostly;
> -static struct hlist_head *inode_hashtable __read_mostly;
> -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
> +static struct hlist_bl_head *inode_hashtable __read_mostly;
>  
>  static unsigned long hash(struct super_block *sb, unsigned long hashval)
>  {
> @@ -70,7 +69,7 @@ static unsigned long hash(struct super_block *sb, unsigned 
> long hashval)
>   return tmp & i_hash_mask;

Re: [PATCH 2/3] hlist-bl: add hlist_bl_fake()

2021-04-06 Thread Kent Overstreet
On Tue, Apr 06, 2021 at 10:33:42PM +1000, Dave Chinner wrote:
> From: Dave Chinner 
> 
> in preparation for switching the VFS inode cache over the hlist_bl
> lists, we nee dto be able to fake a list node that looks like it is
> hased for correct operation of filesystems that don't directly use
> the VFS indoe cache.
> 
> Signed-off-by: Dave Chinner 

Reviewed-by: Kent Overstreet 

> ---
>  include/linux/list_bl.h | 22 ++
>  1 file changed, 22 insertions(+)
> 
> diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h
> index ae1b541446c9..8ee2bf5af131 100644
> --- a/include/linux/list_bl.h
> +++ b/include/linux/list_bl.h
> @@ -143,6 +143,28 @@ static inline void hlist_bl_del_init(struct 
> hlist_bl_node *n)
>   }
>  }
>  
> +/**
> + * hlist_bl_add_fake - create a fake list consisting of a single headless 
> node
> + * @n: Node to make a fake list out of
> + *
> + * This makes @n appear to be its own predecessor on a headless hlist.
> + * The point of this is to allow things like hlist_bl_del() to work correctly
> + * in cases where there is no list.
> + */
> +static inline void hlist_bl_add_fake(struct hlist_bl_node *n)
> +{
> + n->pprev = &n->next;
> +}
> +
> +/**
> + * hlist_bl_fake - Is this node a fake hlist_bl?
> + * @n: Node to check for being a self-referential fake hlist.
> + */
> +static inline bool hlist_bl_fake(struct hlist_bl_node *n)
> +{
> + return n->pprev == &n->next;
> +}
> +
>  static inline void hlist_bl_lock(struct hlist_bl_head *b)
>  {
>   bit_spin_lock(0, (unsigned long *)b);
> -- 
> 2.31.0
> 


Re: [PATCH 1/3] vfs: factor out inode hash head calculation

2021-04-06 Thread Kent Overstreet
On Tue, Apr 06, 2021 at 10:33:41PM +1000, Dave Chinner wrote:
> From: Dave Chinner 
> 
> In preparation for changing the inode hash table implementation.
> 
> Signed-off-by: Dave Chinner 

Reviewed-by: Kent Overstreet 

> ---
>  fs/inode.c | 44 +---
>  1 file changed, 25 insertions(+), 19 deletions(-)
> 
> diff --git a/fs/inode.c b/fs/inode.c
> index a047ab306f9a..b8d9eb3454dc 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -60,6 +60,22 @@ static unsigned int i_hash_shift __read_mostly;
>  static struct hlist_head *inode_hashtable __read_mostly;
>  static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
>  
> +static unsigned long hash(struct super_block *sb, unsigned long hashval)
> +{
> + unsigned long tmp;
> +
> + tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
> + L1_CACHE_BYTES;
> + tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
> + return tmp & i_hash_mask;
> +}
> +
> +static inline struct hlist_head *i_hash_head(struct super_block *sb,
> + unsigned int hashval)
> +{
> + return inode_hashtable + hash(sb, hashval);
> +}
> +
>  /*
>   * Empty aops. Can be used for the cases where the user does not
>   * define any of the address_space operations.
> @@ -475,16 +491,6 @@ static inline void inode_sb_list_del(struct inode *inode)
>   }
>  }
>  
> -static unsigned long hash(struct super_block *sb, unsigned long hashval)
> -{
> - unsigned long tmp;
> -
> - tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
> - L1_CACHE_BYTES;
> - tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift);
> - return tmp & i_hash_mask;
> -}
> -
>  /**
>   *   __insert_inode_hash - hash an inode
>   *   @inode: unhashed inode
> @@ -1073,7 +1079,7 @@ struct inode *inode_insert5(struct inode *inode, 
> unsigned long hashval,
>   int (*test)(struct inode *, void *),
>   int (*set)(struct inode *, void *), void *data)
>  {
> - struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval);
> + struct hlist_head *head = i_hash_head(inode->i_sb, hashval);
>   struct inode *old;
>   bool creating = inode->i_state & I_CREATING;
>  
> @@ -1173,7 +1179,7 @@ EXPORT_SYMBOL(iget5_locked);
>   */
>  struct inode *iget_locked(struct super_block *sb, unsigned long ino)
>  {
> - struct hlist_head *head = inode_hashtable + hash(sb, ino);
> + struct hlist_head *head = i_hash_head(sb, ino);
>   struct inode *inode;
>  again:
>   spin_lock(&inode_hash_lock);
> @@ -1241,7 +1247,7 @@ EXPORT_SYMBOL(iget_locked);
>   */
>  static int test_inode_iunique(struct super_block *sb, unsigned long ino)
>  {
> - struct hlist_head *b = inode_hashtable + hash(sb, ino);
> + struct hlist_head *b = i_hash_head(sb, ino);
>   struct inode *inode;
>  
>   hlist_for_each_entry_rcu(inode, b, i_hash) {
> @@ -1328,7 +1334,7 @@ EXPORT_SYMBOL(igrab);
>  struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
>   int (*test)(struct inode *, void *), void *data)
>  {
> - struct hlist_head *head = inode_hashtable + hash(sb, hashval);
> + struct hlist_head *head = i_hash_head(sb, hashval);
>   struct inode *inode;
>  
>   spin_lock(&inode_hash_lock);
> @@ -1383,7 +1389,7 @@ EXPORT_SYMBOL(ilookup5);
>   */
>  struct inode *ilookup(struct super_block *sb, unsigned long ino)
>  {
> - struct hlist_head *head = inode_hashtable + hash(sb, ino);
> + struct hlist_head *head = i_hash_head(sb, ino);
>   struct inode *inode;
>  again:
>   spin_lock(&inode_hash_lock);
> @@ -1432,7 +1438,7 @@ struct inode *find_inode_nowait(struct super_block *sb,
>void *),
>   void *data)
>  {
> - struct hlist_head *head = inode_hashtable + hash(sb, hashval);
> + struct hlist_head *head = i_hash_head(sb, hashval);
>   struct inode *inode, *ret_inode = NULL;
>   int mval;
>  
> @@ -1477,7 +1483,7 @@ EXPORT_SYMBOL(find_inode_nowait);
>  struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval,
>int (*test)(struct inode *, void *), void *data)
>  {
> - struct hlist_head *head = inode_hashtable + hash(sb, hashval);
> + struct hlist_head *head = i_hash_head(sb, hashval);
>   struct inode *inode;
>  
>   RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
> @@ -1515,7 +1521,7 @@ EXPORT_SYMBOL(find_inode_rcu);
>  struct inode *find_inode_by_ino_rcu(st

Re: [RFC PATCH 0/3] vfs: convert inode cache to hlist-bl

2021-04-06 Thread Kent Overstreet
On Tue, Apr 06, 2021 at 10:33:40PM +1000, Dave Chinner wrote:
> Hi folks,
> 
> Recently I've been doing some scalability characterisation of
> various filesystems, and one of the limiting factors that has
> prevented me from exploring filesystem characteristics is the
> inode hash table. namely, the global inode_hash_lock that protects
> it.
> 
> This has long been a problem, but I personally haven't cared about
> it because, well, XFS doesn't use it and so it's not a limiting
> factor for most of my work. However, in trying to characterise the
> scalability boundaries of bcachefs, I kept hitting against VFS
> limitations first. bcachefs hits the inode hash table pretty hard
> and it becomes a contention point a lot sooner than it does for
> ext4. Btrfs also uses the inode hash, but its namespace doesn't
> have the capability to stress the inode hash lock due to it hitting
> internal contention first.
> 
> Long story short, I did what should have been done a decade or more
> ago - I converted the inode hash table to use hlist-bl to split up
> the global lock. This is modelled on the dentry cache, with one
> minor tweak. That is, the inode hash value cannot be calculated from
> the inode, so we have to keep a record of either the hash value or a
> pointer to the hlist-bl list head that the inode is hashed into so
> that we can lock the correct list on removal.
> 
> Other than that, this is mostly just a mechanical conversion from
> one list and lock type to another. None of the algorithms have
> changed and none of the RCU behaviours have changed. But it removes
> the inode_hash_lock from the picture and so performance for bcachefs
> goes way up and CPU usage for ext4 halves at 16 and 32 threads. At
> higher thread counts, we start to hit filesystem and other VFS locks
> as the limiting factors. Profiles and performance numbers are in
> patch 3 for those that are curious.
> 
> I've been running this in benchmarks and perf testing across
> bcachefs, btrfs and ext4 for a couple of weeks, and it passes
> fstests on ext4 and btrfs without regressions. So now it needs more
> eyes and testing and hopefully merging

These patches have been in the bcachefs repo for a bit with no issues, and they
definitely do help with performance - thanks, Dave!


Re: [PATCH v6 00/27] Memory Folios

2021-04-02 Thread Kent Overstreet
On Wed, Mar 31, 2021 at 07:47:01PM +0100, Matthew Wilcox (Oracle) wrote:
> The medium-term goal is to convert all filesystems and some device
> drivers to work in terms of folios.  This series contains a lot of
> explicit conversions, but it's important to realise it's removing a lot
> of implicit conversions in some relatively hot paths.  There will be very
> few conversions from folios when this work is completed; filesystems,
> the page cache, the LRU and so on will generally only deal with folios.

I'm pretty excited for this to land - 4k page overhead has been a pain point for
me for quite some time. I know this is going to be a lot of churn but I think
leveraging the type system is exactly the right way to go about this, and I
can't wait to start converting bcachefs.


bcachefs snapshots design doc - RFC

2021-03-17 Thread Kent Overstreet
Snapshots for bcachefs are well under way, and I've written a design doc for
them. I'd love to get feedback on anything I might have missed, especially from
the btrfs people.

The current version of this document lives at
  https://bcachefs.org/Snapshots/

and the in-progress code lives at
  https://evilpiepirate.org/git/bcachefs.git/log/?h=snapshots

thanks for reading!

Snapshots & subvolumes:
===

The short version:

bcachefs snapshots are a different approach - we're not using COW btrees.
Instead, they're based on extending the key filesystem items (inodes, dirents,
xattrs, extents) with a version number - a snapshot id - for the low bits.

Snapshot IDs form a tree, which will be recorded in the snapshots btree. The
root snapshot ID is `U32_MAX`, and we allocate new snapshot IDs growing down so
that the parent of a given snapshot ID is always larger than that ID.

To create a new writeable snapshot, we allocate two new snapshot IDs. We update
the existing subvolume with one of the new snapshot IDs, and assign the other
snapshot ID to the new snapshot.

When we do a lookup for a filesystem item, we have to check if the snapshot ID
of the key we found is an ancestor of the snapshot ID we're searching for, and
filter out items that aren't.
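A rough sketch of that ancestor check (illustrative only - the real code works
against the snapshots btree and looks different):

#include <linux/types.h>

struct snapshot_entry {
        u32     parent;         /* 0 if this ID is a tree root */
};

/*
 * Parent IDs are always larger than their children, so "is ancestor" is a
 * walk up the parent links until we reach or pass the candidate.  The lookup
 * table here stands in for the snapshots btree.
 */
static bool snapshot_is_ancestor(const struct snapshot_entry *table,
                                 u32 id, u32 ancestor)
{
        while (id && id < ancestor)
                id = table[id].parent;

        return id == ancestor;
}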

Subvolumes:
===

Subvolumes are needed for two reasons:

* They're the mechanism for accessing snapshots

* Also, they're a way of isolating different parts of the filesystem hierarchy
  from snapshots, or taking snapshots that aren't global. I.e. you'd create a
  subvolume for your database so that filesystem snapshots don't COW it, or
  create subvolumes for each home directory so that users can snapshot their
  own home directories.

The functionality and userspace interface for snapshots and subvolumes are
roughly modelled after btrfs, but simplified.

Subvolumes in bcachefs are just fancy directories. We introduce internally a new
dirent type that can point to subvolumes, instead of inodes, and the subvolume
has a pointer to the root inode. Subvolumes get their own table (btree), and
subvolume keys have fields for root inode number and snapshot ID.

Subvolumes have no name outside of the filesystem hierarchy. This means that, in
order to enumerate and list subvolumes, we need to be able to reconstruct their
path.

To reconstruct paths, we're adding inode backpointers - two new inode fields for
the inode number of the directory they're in, and the dirent offset. We're only
adding fields for a single backpointer, i.e. we're not handling hardlinks yet -
we set an inode flag indicating the backpointer fields are untrusted whenever we
create a hardlink. If we do ever want to add multiple backpointers for files
with hardlinks, we'll need to add another btree where each backpointer gets its
own key. This should be good enough for now.

Subvolume root inodes have two more fields: one for the subvolume ID, and
another for the parent subvolume ID. The subvolume ID field is only set for
subvolume roots because otherwise taking a snapshot would require updating every
inode in that subvolume. With these fields and inode backpointers, we'll be able
to reconstruct a path to any directory, or any file that hasn't been hardlinked.
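An illustrative layout of the fields described above (field names are made up
and do not reflect the actual on-disk format):

#include <linux/types.h>

/* subvolume table entry: root inode number plus current snapshot ID */
struct subvolume_sketch {
        __le32  flags;
        __le32  snapshot;       /* snapshot ID to use for lookups/updates */
        __le64  root_inode;
};

/* extra inode fields: a single backpointer, plus subvolume IDs on roots */
struct inode_extra_fields_sketch {
        __le64  dir_inum;       /* directory this inode lives in */
        __le64  dir_offset;     /* offset of the dirent pointing at it */
        __le32  subvol;         /* only set on subvolume root inodes */
        __le32  parent_subvol;  /* likewise */
};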

Snapshots:
==

We're also adding another table (btree) for snapshot keys. Snapshot keys form a
tree where each node is just a u32. The btree iterator code that filters by
snapshot ID assumes that parent IDs are always larger than child IDs, so the
root starts at `U32_MAX`. And, there will be multiple trees - creating a new
empty subvolume will allocate a new snapshot ID that has no parent node.

Any filesystem operation that's within a subvolume starts by looking up the key
for that subvolume to get the current snapshot ID, to be used for both lookups
and updates. This means we have to remember what subvolume we're in, in the in
memory `bch_inode_info` - as mentioned previously only subvolume root inodes
have this field in the btree.

The btree iterator code is getting two new flags - `BTREE_ITER_ALL_SNAPSHOTS`
and `BTREE_ITER_FILTER_SNAPSHOTS`, that controls how iteration handles the
snapshot field of the key. One of these flags should be specified for iterators
for a btree that uses the snapshots field. `BTREE_ITER_ALL_SNAPSHOTS` means
don't handle the snapshot field specially: it returns every key, and
advancing/rewinding the iterator position increments/decrements the snapshot
field. `BTREE_ITER_FILTER_SNAPSHOTS` means incrementing/decrementing the
iterator position does not include the snapshot field - there's only one
iterator position for each inode number:offset - and we return the key that
matches the specified snapshot ID, or the first ancestor if not found.

The update path, `bch2_trans_commit()` now has more work to do:

* When deleting, we have to check if there's another key at the same position
  in an ancestor snapshot that would become visible - if so, we need to insert
  a whiteout instead.

* When 

bcachefs-for-review

2020-12-13 Thread Kent Overstreet
Since last posting: The main change for upstreaming is that I've added deadlock
avoidance code for the page cache coherency lock, and moved all of that code
into fs/bcachefs/ for now - besides adding faults_disabled_mapping to task
struct. See
https://lore.kernel.org/linux-fsdevel/2020191011.ge3365...@moria.home.lan/

This addresses the last known blocker.

Bcachefs status - not a lot new to report. I've been avoiding making major
changes for a while and focusing on stabilizing and torture testing, and outside
of erasure coding almost everything is looking pretty solid.

There've been some recent performance improvements - the main one is I changed
the journalling code so that journal writes no longer have to be marked flush +
fua, so now we only issue flush/fua writes when an fsync is done, or when needed
to free up space in the journal, or after a timeout when there's dirty data in
the journal (default 1 second).
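An illustrative decision helper for when a journal write still needs to be
issued as flush+fua under that scheme (made-up structure, not the actual
bcachefs journal code):

#include <linux/jiffies.h>
#include <linux/types.h>

struct journal_sketch {
        bool            fsync_pending;  /* an fsync is waiting on this write */
        bool            space_low;      /* need to reclaim journal space */
        unsigned long   last_flush;     /* jiffies of the last flush write */
        unsigned long   flush_delay;    /* default: one second of jiffies */
};

static bool journal_write_needs_flush(struct journal_sketch *j)
{
        return j->fsync_pending ||
               j->space_low ||
               time_after(jiffies, j->last_flush + j->flush_delay);
}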

Known bugs:
 - we see oopses with zstd compression. It looks like if you pass
   ZSTD_compressCCtx() a buffer that's not guaranteed to be big enough to fit
   the output, it sometimes writes past the end of that buffer.

 - erasure coding is not quite stable, something's funky with the management of
   our stripes heap, and my erasure coding + device removal test is hanging.

 - xfstests generic/547 just started failing after the 5.10 rebase - this is an
   fsync issue

 - some timestamp bugs - generic/258 started failing after the 5.10 rebase

Aside from that multiple devices, replication, compression, checksumming,
encryption etc. should all be pretty solid - from polling my user base they say
things have stabilized nicely over the past year.

Non fs/bcachefs/ prep patches - bcachefs now depends on some SRCU patches from
Paul McKenney that aren't in yet. Other than that, the list of non fs/bcachefs/
patches has been getting steadily smaller since last posting.



And here's the pull request. It's been rebased and tested on top of 5.10-rc7,
and this is now the master branch that all the bcachefs users have been testing,
not a separate for-review branch:

The following changes since commit 0477e92881850d44910a7e94fc2c46f96faa131f:

  Linux 5.10-rc7 (2020-12-06 14:25:12 -0800)

are available in the Git repository at:

  https://evilpiepirate.org/git/bcachefs.git 

for you to fetch changes up to b7ddbb0e20d2add011d8f5a035b04ca3ac8b34fa:

  bcachefs: Add some cond_rescheds() in shutdown path (2020-12-13 16:14:10 
-0500)


Justin Husted (2):
  bcachefs: Set lost+found mode to 0700
  bcachefs: Update directory timestamps during link

Kent Overstreet (408):
  Compiler Attributes: add __flatten
  locking: SIX locks (shared/intent/exclusive)
  mm: export find_get_pages_range()
  sched: Add task_struct->faults_disabled_mapping
  mm: Bring back vmalloc_exec
  fs: factor out d_mark_tmpfile()
  block: Add some exports for bcachefs
  block: Add blk_status_to_str()
  bcache: move closures to lib/
  closures: closure_wait_event()
  bcachefs: Initial commit
  bcachefs: Fix setting of attributes mask in getattr
  bcachefs: Some reflink fixes
  bcachefs: Don't BUG_ON() sector count overflow
  bcachefs: Add an option for fsck error ratelimiting
  bcachefs: Avoid calling bch2_btree_iter_relock() in 
bch2_btree_iter_traverse()
  bcachefs: Inline fast path of bch2_increment_clock()
  bcachefs: Make __bch2_bkey_cmp_packed() smaller
  bcachefs: Pipeline binary searches and linear searches
  bcachefs: bch2_read_extent() microoptimizations
  bcachefs: kill BFLOAT_FAILED_PREV
  bcachefs: Fall back to slowpath on exact comparison
  bcachefs: Go back to 16 bit mantissa bkey floats
  bcachefs: Remove some BKEY_PADDED uses
  bcachefs: Be slightly less tricky with union usage
  bcachefs: Fix erorr path in bch2_write()
  bcachefs: Use wbc_to_write_flags()
  bcachefs: Make memcpy_to_bio() param const
  bcachefs: bkey_on_stack
  bcachefs: kill bch2_extent_has_device()
  bcachefs: bkey noops
  bcachefs: Rework of cut_front & cut_back
  bcachefs: Split out extent_update.c
  bcachefs: Inline data extents
  bcachefs: Reorganize extents.c
  bcachefs: splice_write is currently busted
  bcachefs: kill ca->freelist_lock
  bcachefs: bkey_on_stack_reassemble()
  bcachefs: Switch to macro for bkey_ops
  bcachefs: bch2_check_set_feature()
  bcachefs: Put inline data behind a mount option for now
  bcachefs: Fix bch2_verify_insert_pos()
  bcachefs: Always emit new extents on partial overwrite
  bcachefs: Whiteout changes
  bcachefs: Refactor whiteouts compaction
  bcachefs: Use one buffer for sorting whiteouts
  bcachefs: Kill btree_node_iter_large
  bcachefs: Fix a null ptr deref in btree_iter_traverse_one()
  bcachefs: Fix for an assertion on filesy

bcachefs-for-review

2020-10-27 Thread Kent Overstreet
Here's where bcachefs is at and what I'd like to get merged:

https://evilpiepirate.org/git/bcachefs.git/log/?h=bcachefs-for-review

Non bcachefs prep patches:

  Compiler Attributes: add __flatten
  locking: SIX locks (shared/intent/exclusive)
  mm: export find_get_pages_range()
  mm: Add a mechanism to disable faults for a specific mapping
  mm: Bring back vmalloc_exec
  fs: insert_inode_locked2()
  fs: factor out d_mark_tmpfile()
  block: Add some exports for bcachefs
  block: Add blk_status_to_str()
  bcache: move closures to lib/
  closures: closure_wait_event()

 block/bio.c|   2 +
 block/blk-core.c   |  13 +-
 drivers/md/bcache/Kconfig  |  10 +-
 drivers/md/bcache/Makefile |   4 +-
 drivers/md/bcache/bcache.h |   2 +-
 drivers/md/bcache/super.c  |   1 -
 drivers/md/bcache/util.h   |   3 +-
 fs/dcache.c|  10 +-
 fs/inode.c |  40 ++
 include/linux/blkdev.h |   1 +
 {drivers/md/bcache => include/linux}/closure.h |  39 +-
 include/linux/compiler_attributes.h|   5 +
 include/linux/dcache.h |   1 +
 include/linux/fs.h |   1 +
 include/linux/sched.h  |   1 +
 include/linux/six.h| 197 +
 include/linux/vmalloc.h|   1 +
 init/init_task.c   |   1 +
 kernel/Kconfig.locks   |   3 +
 kernel/locking/Makefile|   1 +
 kernel/locking/six.c   | 553 +
 kernel/module.c|   4 +-
 lib/Kconfig|   3 +
 lib/Kconfig.debug  |   9 +
 lib/Makefile   |   2 +
 {drivers/md/bcache => lib}/closure.c   |  35 +-
 mm/filemap.c   |   1 +
 mm/gup.c   |   7 +
 mm/nommu.c |  18 +
 mm/vmalloc.c   |  21 +
 30 files changed, 937 insertions(+), 52 deletions(-)
 rename {drivers/md/bcache => include/linux}/closure.h (94%)
 create mode 100644 include/linux/six.h
 create mode 100644 kernel/locking/six.c
 rename {drivers/md/bcache => lib}/closure.c (89%)

New since last posting that's relevant to the rest of the kernel:
 - Re: the DIO cache coherency issue, we finally have a solution that hopefully
   everyone will find palatable. We no longer try to do any fancy recursive
   locking stuff: if userspace issues a DIO read/write where the buffer points
   to the same address space as the file being read/written to, we just return
   an error.

   This requires a small change to gup.c, to add the check after the VMA lookup.
   My patch passes the mapping to check against via a new task_struct member,
   which is ugly because plumbing a new argument all the way to __get_user_pages
   is also going to be ugly and if I have to do that I'm likely to go on a
   refactoring binge, which gup.c looks like it needs anyways (a rough sketch of
   the check follows this list).

 - vmalloc_exec() is needed because bcachefs dynamically generates x86 machine
   code - per btree node unpack functions.
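Referring back to the DIO cache coherency item above, a rough sketch of the
shape of the gup.c check (illustrative; the actual patch differs in detail):

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/sched.h>

/*
 * The DIO path records the mapping it holds locked in
 * current->faults_disabled_mapping; the VMA lookup in gup then refuses to
 * fault in pages backed by that same mapping instead of deadlocking, and
 * the caller turns that into an error returned to userspace.
 */
static bool gup_would_deadlock_sketch(struct vm_area_struct *vma)
{
        struct address_space *m = current->faults_disabled_mapping;

        return m && vma->vm_file && vma->vm_file->f_mapping == m;
}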

Bcachefs changes since last posting:
 - lots
 - reflink is done
 - erasure coding (reed solomon raid5/6) is maturing; I have declared it ready
   for beta testers and gotten some _very_ positive feedback on its performance.
 - btree key cache code is done and merged, big improvements to multithreaded
   write workloads
 - inline data extents
 - major improvements to how the btree code handles extents (still todo:
   re-implement extent merging)
 - huge improvements to mount/unmount times on huge filesystems
 - many, many bugfixes; bug reports are slowing and the bugs that are being
   reported look less and less concerning. In particular repair code is getting
   better and more polished.

TODO:
 - scrub, repair of replicated data when one of the replicas fail the checksum
   check
 - erasure coding needs repair code (it'll do reconstruct reads, but we don't
   have code to rewrite bad blocks in a stripe yet. this is going to be a hassle
   until we get backpointers)
 - fsck isn't checking refcounts of reflinked extents yet
 - bcachefs tests in ktest need to be moved to xfstests
 - user docs are still very minimal

So that's roughly where things are at. I think erasure coding is going to be
bcachefs's killer feature (or at least one of them), and I'm pretty excited
about it: it's a completely new approach unlike ZFS and btrfs, no write hole (we
don't update existing stripes in place) and we don't have to fragment writes
either like ZFS does. Add to that the caching that we already do 

[PATCH v2 1/2] fs: Break generic_file_buffered_read up into multiple functions

2020-10-25 Thread Kent Overstreet
This is prep work for changing generic_file_buffered_read() to use
find_get_pages_contig() to batch up all the pagecache lookups.

This patch should be functionally identical to the existing code and
changes as little of the flow control as possible. More refactoring
could be done, this patch is intended to be relatively minimal.

Signed-off-by: Kent Overstreet 
Cc: Matthew Wilcox (Oracle) 
Cc: Jens Axboe 
---
 mm/filemap.c | 473 ---
 1 file changed, 261 insertions(+), 212 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index d5e7c2029d..cc0f58a249 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2158,6 +2158,234 @@ static void shrink_readahead_size_eio(struct 
file_ra_state *ra)
ra->ra_pages /= 4;
 }
 
+static int lock_page_for_iocb(struct kiocb *iocb, struct page *page)
+{
+   if (iocb->ki_flags & IOCB_WAITQ)
+   return lock_page_async(page, iocb->ki_waitq);
+   else if (iocb->ki_flags & IOCB_NOWAIT)
+   return trylock_page(page) ? 0 : -EAGAIN;
+   else
+   return lock_page_killable(page);
+}
+
+static int generic_file_buffered_read_page_ok(struct kiocb *iocb,
+   struct iov_iter *iter,
+   struct page *page)
+{
+   struct address_space *mapping = iocb->ki_filp->f_mapping;
+   struct inode *inode = mapping->host;
+   struct file_ra_state *ra = &iocb->ki_filp->f_ra;
+   unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
+   unsigned int bytes, copied;
+   loff_t isize, end_offset;
+
+   BUG_ON(iocb->ki_pos >> PAGE_SHIFT != page->index);
+
+   /*
+* i_size must be checked after we know the page is Uptodate.
+*
+* Checking i_size after the check allows us to calculate
+* the correct value for "bytes", which means the zero-filled
+* part of the page is not copied back to userspace (unless
+* another truncate extends the file - this is desired though).
+*/
+
+   isize = i_size_read(inode);
+   if (unlikely(iocb->ki_pos >= isize))
+   return 1;
+
+   end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
+
+   bytes = min_t(loff_t, end_offset - iocb->ki_pos, PAGE_SIZE - offset);
+
+   /* If users can be writing to this page using arbitrary
+* virtual addresses, take care about potential aliasing
+* before reading the page on the kernel side.
+*/
+   if (mapping_writably_mapped(mapping))
+   flush_dcache_page(page);
+
+   /*
+* Ok, we have the page, and it's up-to-date, so
+* now we can copy it to user space...
+*/
+
+   copied = copy_page_to_iter(page, offset, bytes, iter);
+
+   iocb->ki_pos += copied;
+
+   /*
+* When a sequential read accesses a page several times,
+* only mark it as accessed the first time.
+*/
+   if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT)
+   mark_page_accessed(page);
+
+   ra->prev_pos = iocb->ki_pos;
+
+   if (copied < bytes)
+   return -EFAULT;
+
+   return !iov_iter_count(iter) || iocb->ki_pos == isize;
+}
+
+static struct page *
+generic_file_buffered_read_readpage(struct kiocb *iocb,
+   struct file *filp,
+   struct address_space *mapping,
+   struct page *page)
+{
+   struct file_ra_state *ra = &filp->f_ra;
+   int error;
+
+   if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
+   unlock_page(page);
+   put_page(page);
+   return ERR_PTR(-EAGAIN);
+   }
+
+   /*
+* A previous I/O error may have been due to temporary
+* failures, eg. multipath errors.
+* PG_error will be set again if readpage fails.
+*/
+   ClearPageError(page);
+   /* Start the actual read. The read will unlock the page. */
+   error = mapping->a_ops->readpage(filp, page);
+
+   if (unlikely(error)) {
+   put_page(page);
+   return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL;
+   }
+
+   if (!PageUptodate(page)) {
+   error = lock_page_for_iocb(iocb, page);
+   if (unlikely(error)) {
+   put_page(page);
+   return ERR_PTR(error);
+   }
+   if (!PageUptodate(page)) {
+   if (page->mapping == NULL) {
+   /*
+* invalidate_mapping_pages got it
+*/
+   unlock_page(page);
+   put_page(page);
+   return NULL;
+   }
+   

[PATCH v2 2/2] fs: generic_file_buffered_read() now uses find_get_pages_contig

2020-10-25 Thread Kent Overstreet
Convert generic_file_buffered_read() to get pages to read from in
batches, and then copy data to userspace from many pages at once - in
particular, we now don't touch any cachelines that might be contended
while we're in the loop to copy data to userspace.

This is a performance improvement on workloads that do buffered reads
with large blocksizes, and a very large performance improvement if that
file is also being accessed concurrently by different threads.

On smaller reads (512 bytes), there's a very small performance
improvement (1%, within the margin of error).

Signed-off-by: Kent Overstreet 
---
 mm/filemap.c | 313 ---
 1 file changed, 175 insertions(+), 138 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index cc0f58a249..1bf1f424cb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2168,67 +2168,6 @@ static int lock_page_for_iocb(struct kiocb *iocb, struct 
page *page)
return lock_page_killable(page);
 }
 
-static int generic_file_buffered_read_page_ok(struct kiocb *iocb,
-   struct iov_iter *iter,
-   struct page *page)
-{
-   struct address_space *mapping = iocb->ki_filp->f_mapping;
-   struct inode *inode = mapping->host;
-   struct file_ra_state *ra = &iocb->ki_filp->f_ra;
-   unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
-   unsigned int bytes, copied;
-   loff_t isize, end_offset;
-
-   BUG_ON(iocb->ki_pos >> PAGE_SHIFT != page->index);
-
-   /*
-* i_size must be checked after we know the page is Uptodate.
-*
-* Checking i_size after the check allows us to calculate
-* the correct value for "bytes", which means the zero-filled
-* part of the page is not copied back to userspace (unless
-* another truncate extends the file - this is desired though).
-*/
-
-   isize = i_size_read(inode);
-   if (unlikely(iocb->ki_pos >= isize))
-   return 1;
-
-   end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
-
-   bytes = min_t(loff_t, end_offset - iocb->ki_pos, PAGE_SIZE - offset);
-
-   /* If users can be writing to this page using arbitrary
-* virtual addresses, take care about potential aliasing
-* before reading the page on the kernel side.
-*/
-   if (mapping_writably_mapped(mapping))
-   flush_dcache_page(page);
-
-   /*
-* Ok, we have the page, and it's up-to-date, so
-* now we can copy it to user space...
-*/
-
-   copied = copy_page_to_iter(page, offset, bytes, iter);
-
-   iocb->ki_pos += copied;
-
-   /*
-* When a sequential read accesses a page several times,
-* only mark it as accessed the first time.
-*/
-   if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT)
-   mark_page_accessed(page);
-
-   ra->prev_pos = iocb->ki_pos;
-
-   if (copied < bytes)
-   return -EFAULT;
-
-   return !iov_iter_count(iter) || iocb->ki_pos == isize;
-}
-
 static struct page *
 generic_file_buffered_read_readpage(struct kiocb *iocb,
struct file *filp,
@@ -2386,6 +2325,92 @@ generic_file_buffered_read_no_cached_page(struct kiocb 
*iocb,
return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
 }
 
+static int generic_file_buffered_read_get_pages(struct kiocb *iocb,
+   struct iov_iter *iter,
+   struct page **pages,
+   unsigned int nr)
+{
+   struct file *filp = iocb->ki_filp;
+   struct address_space *mapping = filp->f_mapping;
+   struct file_ra_state *ra = &filp->f_ra;
+   pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
+   pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> 
PAGE_SHIFT;
+   int i, j, nr_got, err = 0;
+
+   nr = min_t(unsigned long, last_index - index, nr);
+find_page:
+   if (fatal_signal_pending(current))
+   return -EINTR;
+
+   nr_got = find_get_pages_contig(mapping, index, nr, pages);
+   if (nr_got)
+   goto got_pages;
+
+   if (iocb->ki_flags & IOCB_NOIO)
+   return -EAGAIN;
+
+   page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);
+
+   nr_got = find_get_pages_contig(mapping, index, nr, pages);
+   if (nr_got)
+   goto got_pages;
+
+   pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter);
+   err = PTR_ERR_OR_ZERO(pages[0]);
+   if (!IS_ERR_OR_NULL(pages[0]))
+   nr_got = 1;
+got_pages:
+   for (i = 0; i < nr_got; i++) {
+   struct page *page = pages[i];
+   pgoff_t pg_inde

[PATCH v2 0/2] generic_file_buffered_read() improvements

2020-10-25 Thread Kent Overstreet
Rebased onto current mainline - this series already included Jens' patch for
IOCB_WAITQ behaviour so nothing changed, but Jens might want to glance at it.

Kent Overstreet (2):
  fs: Break generic_file_buffered_read up into multiple functions
  fs: generic_file_buffered_read() now uses find_get_pages_contig

 mm/filemap.c | 572 +--
 1 file changed, 329 insertions(+), 243 deletions(-)

-- 
2.28.0



[PATCH v2 2/2] fs: kill add_to_page_cache_locked()

2020-10-21 Thread Kent Overstreet
No longer has any users, so remove it.

Signed-off-by: Kent Overstreet 
---
 include/linux/pagemap.h | 20 ++---
 mm/filemap.c| 64 -
 2 files changed, 33 insertions(+), 51 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 434c9c34ae..aceaebfaab 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -689,8 +689,8 @@ static inline int fault_in_pages_readable(const char __user 
*uaddr, int size)
return 0;
 }
 
-int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
-   pgoff_t index, gfp_t gfp_mask);
+int add_to_page_cache(struct page *page, struct address_space *mapping,
+ pgoff_t index, gfp_t gfp_mask);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
 extern void delete_from_page_cache(struct page *page);
@@ -710,22 +710,6 @@ void page_cache_readahead_unbounded(struct address_space 
*, struct file *,
pgoff_t index, unsigned long nr_to_read,
unsigned long lookahead_count);
 
-/*
- * Like add_to_page_cache_locked, but used to add newly allocated pages:
- * the page is new, so we can just run __SetPageLocked() against it.
- */
-static inline int add_to_page_cache(struct page *page,
-   struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
-{
-   int error;
-
-   __SetPageLocked(page);
-   error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
-   if (unlikely(error))
-   __ClearPageLocked(page);
-   return error;
-}
-
 /**
  * struct readahead_control - Describes a readahead request.
  *
diff --git a/mm/filemap.c b/mm/filemap.c
index bb71334fdd..b92ca48b90 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -827,20 +827,20 @@ int replace_page_cache_page(struct page *old, struct page 
*new, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL_GPL(replace_page_cache_page);
 
-static int __add_to_page_cache_locked(struct page *page,
- struct address_space *mapping,
- pgoff_t offset, gfp_t gfp_mask,
- void **shadowp)
+static int __add_to_page_cache(struct page *page,
+  struct address_space *mapping,
+  pgoff_t offset, gfp_t gfp_mask,
+  void **shadowp)
 {
XA_STATE(xas, &mapping->i_pages, offset);
int huge = PageHuge(page);
int error;
void *old;
 
-   VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
mapping_set_update(&xas, mapping);
 
+   __SetPageLocked(page);
get_page(page);
page->mapping = mapping;
page->index = offset;
@@ -885,29 +885,31 @@ static int __add_to_page_cache_locked(struct page *page,
page->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
put_page(page);
+   __ClearPageLocked(page);
return error;
 }
-ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
+ALLOW_ERROR_INJECTION(__add_to_page_cache, ERRNO);
 
 /**
- * add_to_page_cache_locked - add a locked page to the pagecache
+ * add_to_page_cache - add a newly allocated page to the pagecache
  * @page:  page to add
  * @mapping:   the page's address_space
  * @offset:page index
  * @gfp_mask:  page allocation mode
  *
- * This function is used to add a page to the pagecache. It must be locked.
- * This function does not add the page to the LRU.  The caller must do that.
+ * This function is used to add a page to the pagecache. It must be newly
+ * allocated.  This function does not add the page to the LRU.  The caller must
+ * do that.
  *
  * Return: %0 on success, negative error code otherwise.
  */
-int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
-   pgoff_t offset, gfp_t gfp_mask)
+int add_to_page_cache(struct page *page, struct address_space *mapping,
+ pgoff_t offset, gfp_t gfp_mask)
 {
-   return __add_to_page_cache_locked(page, mapping, offset,
- gfp_mask, NULL);
+   return __add_to_page_cache(page, mapping, offset, gfp_mask, NULL);
 }
-EXPORT_SYMBOL(add_to_page_cache_locked);
+EXPORT_SYMBOL(add_to_page_cache);
+ALLOW_ERROR_INJECTION(add_to_page_cache, ERRNO);
 
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t offset, gfp_t gfp_mask)
@@ -915,26 +917,22 @@ int add_to_page_cache_lru(struct page *page, struct 
address_space *mapping,
void *shadow = NULL;
int ret;
 
-   __SetPageLocked(page);
-   ret = __add_to_page_cache_locked(page, mapping, offset,
-gfp_mask, &shadow);
+   ret = __ad

[PATCH v2 1/2] cifs: convert to add_to_page_cache()

2020-10-21 Thread Kent Overstreet
This is just open coding add_to_page_cache(), and the next patch will
delete add_to_page_cache_locked().

Signed-off-by: Kent Overstreet 
---
 fs/cifs/file.c | 20 
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index be46fab4c9..b3ee790532 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -4296,20 +4296,11 @@ readpages_get_pages(struct address_space *mapping, 
struct list_head *page_list,
 
page = lru_to_page(page_list);
 
-   /*
-* Lock the page and put it in the cache. Since no one else
-* should have access to this page, we're safe to simply set
-* PG_locked without checking it first.
-*/
-   __SetPageLocked(page);
-   rc = add_to_page_cache_locked(page, mapping,
- page->index, gfp);
+   rc = add_to_page_cache(page, mapping, page->index, gfp);
 
/* give up if we can't stick it in the cache */
-   if (rc) {
-   __ClearPageLocked(page);
+   if (rc)
return rc;
-   }
 
/* move first page to the tmplist */
*offset = (loff_t)page->index << PAGE_SHIFT;
@@ -4328,12 +4319,9 @@ readpages_get_pages(struct address_space *mapping, 
struct list_head *page_list,
if (*bytes + PAGE_SIZE > rsize)
break;
 
-   __SetPageLocked(page);
-   rc = add_to_page_cache_locked(page, mapping, page->index, gfp);
-   if (rc) {
-   __ClearPageLocked(page);
+   rc = add_to_page_cache(page, mapping, page->index, gfp);
+   if (rc)
break;
-   }
list_move_tail(&page->lru, tmplist);
(*bytes) += PAGE_SIZE;
expected_index++;
-- 
2.28.0



[PATCH v2 0/2] kill add_to_page_cache_locked()

2020-10-21 Thread Kent Overstreet
since v1
 - kill a faulty assertion found by kernel test robot
 - drop an unneeded line break

Andrew, can this go through your tree?

Kent Overstreet (2):
  cifs: convert to add_to_page_cache()
  fs: kill add_to_page_cache_locked()

 fs/cifs/file.c  | 20 +++--
 include/linux/pagemap.h | 20 ++---
 mm/filemap.c| 64 -
 3 files changed, 37 insertions(+), 67 deletions(-)

-- 
2.28.0
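
For readers following the series, the caller-visible change is small; a minimal
sketch of the before/after calling convention (hypothetical helper, mirroring
the cifs conversion in patch 1/2 — not code from the series itself):

#include <linux/pagemap.h>

/*
 * Hypothetical helper illustrating the caller-side change; only
 * add_to_page_cache()'s new semantics come from the series.
 */
static int example_add_new_page(struct page *page,
				struct address_space *mapping,
				pgoff_t index, gfp_t gfp)
{
	/*
	 * Old convention: the caller locked the freshly allocated page
	 * and had to unlock it again if the insert failed:
	 *
	 *	__SetPageLocked(page);
	 *	rc = add_to_page_cache_locked(page, mapping, index, gfp);
	 *	if (rc)
	 *		__ClearPageLocked(page);
	 *
	 * New convention: add_to_page_cache() takes a newly allocated,
	 * unlocked page and manages PG_locked internally.
	 */
	return add_to_page_cache(page, mapping, index, gfp);
}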



[PATCH 2/2] block: Add blk_status_to_str()

2020-10-19 Thread Kent Overstreet
If we're going to the trouble of having these nice error strings, let's
make them available.

Signed-off-by: Kent Overstreet 
---
 block/blk-core.c   | 13 +
 include/linux/blkdev.h |  1 +
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 10c08ac506..d68f24a7ee 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -213,18 +213,23 @@ int blk_status_to_errno(blk_status_t status)
 }
 EXPORT_SYMBOL_GPL(blk_status_to_errno);
 
-static void print_req_error(struct request *req, blk_status_t status,
-   const char *caller)
+const char *blk_status_to_str(blk_status_t status)
 {
int idx = (__force int)status;
 
if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
-   return;
+   return "(invalid error)";
+   return blk_errors[idx].name;
+}
+EXPORT_SYMBOL_GPL(blk_status_to_str);
 
+static void print_req_error(struct request *req, blk_status_t status,
+   const char *caller)
+{
printk_ratelimited(KERN_ERR
"%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
"phys_seg %u prio class %u\n",
-   caller, blk_errors[idx].name,
+   caller, blk_status_to_str(status),
req->rq_disk ? req->rq_disk->disk_name : "?",
blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
req->cmd_flags & ~REQ_OP_MASK,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 868e11face..d9e3b7b017 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -936,6 +936,7 @@ extern const char *blk_op_str(unsigned int op);
 
 int blk_status_to_errno(blk_status_t status);
 blk_status_t errno_to_blk_status(int errno);
+const char *blk_status_to_str(blk_status_t status);
 
 int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin);
 
-- 
2.28.0
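
A caller might use the newly exported helper roughly like this (hypothetical
completion handler and message text; only blk_status_to_str() itself comes
from the patch):

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/printk.h>

/*
 * Hypothetical bio completion handler showing the intended use of
 * blk_status_to_str() instead of printing a raw status/errno value.
 */
static void example_read_endio(struct bio *bio)
{
	if (bio->bi_status)
		pr_err("example: read error at sector %llu: %s\n",
		       (unsigned long long)bio->bi_iter.bi_sector,
		       blk_status_to_str(bio->bi_status));

	bio_put(bio);
}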



[PATCH 1/2] block: Add some exports for bcachefs

2020-10-19 Thread Kent Overstreet
bcachefs has its own direct IO code.

Signed-off-by: Kent Overstreet 
---
 block/bio.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/block/bio.c b/block/bio.c
index e865ea55b9..72a65c4113 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1320,6 +1320,7 @@ void bio_set_pages_dirty(struct bio *bio)
set_page_dirty_lock(bvec->bv_page);
}
 }
+EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
 
 /*
  * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
@@ -1379,6 +1380,7 @@ void bio_check_pages_dirty(struct bio *bio)
spin_unlock_irqrestore(&bio_dirty_lock, flags);
schedule_work(&bio_dirty_work);
 }
+EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
 
 static inline bool bio_remaining_done(struct bio *bio)
 {
-- 
2.28.0
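
For context, these are the helpers a filesystem's own direct-IO read path
wraps around reads into user pages; a simplified sketch of the usage pattern
(not bcachefs's actual dio code):

#include <linux/bio.h>

/*
 * Simplified sketch of why a filesystem with its own direct-IO read
 * path needs these exports: mark the user pages dirty before the
 * device writes into them, and re-check/release them on completion.
 * This shows only the usage pattern.
 */
static void example_dio_read_endio(struct bio *bio)
{
	/* redirties pages that were cleaned while the read was in flight,
	 * then releases the pages and the bio (possibly via a workqueue) */
	bio_check_pages_dirty(bio);
}

static void example_submit_dio_read(struct bio *bio)
{
	bio->bi_end_io = example_dio_read_endio;
	bio_set_pages_dirty(bio);
	submit_bio(bio);
}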



[PATCH 1/2] cifs: convert to add_to_page_cache()

2020-10-19 Thread Kent Overstreet
This is just open coding add_to_page_cache(), and the next patch will
delete add_to_page_cache_locked().

Signed-off-by: Kent Overstreet 
---
 fs/cifs/file.c | 21 +
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index be46fab4c9..a17a21181e 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -4296,20 +4296,12 @@ readpages_get_pages(struct address_space *mapping, 
struct list_head *page_list,
 
page = lru_to_page(page_list);
 
-   /*
-* Lock the page and put it in the cache. Since no one else
-* should have access to this page, we're safe to simply set
-* PG_locked without checking it first.
-*/
-   __SetPageLocked(page);
-   rc = add_to_page_cache_locked(page, mapping,
- page->index, gfp);
+   rc = add_to_page_cache(page, mapping,
+  page->index, gfp);
 
/* give up if we can't stick it in the cache */
-   if (rc) {
-   __ClearPageLocked(page);
+   if (rc)
return rc;
-   }
 
/* move first page to the tmplist */
*offset = (loff_t)page->index << PAGE_SHIFT;
@@ -4328,12 +4320,9 @@ readpages_get_pages(struct address_space *mapping, 
struct list_head *page_list,
if (*bytes + PAGE_SIZE > rsize)
break;
 
-   __SetPageLocked(page);
-   rc = add_to_page_cache_locked(page, mapping, page->index, gfp);
-   if (rc) {
-   __ClearPageLocked(page);
+   rc = add_to_page_cache(page, mapping, page->index, gfp);
+   if (rc)
break;
-   }
list_move_tail(&page->lru, tmplist);
(*bytes) += PAGE_SIZE;
expected_index++;
-- 
2.28.0



[PATCH 2/2] fs: kill add_to_page_cache_locked()

2020-10-19 Thread Kent Overstreet
No longer has any users, so remove it.

Signed-off-by: Kent Overstreet 
---
 include/linux/pagemap.h | 20 ++---
 mm/filemap.c| 62 -
 2 files changed, 32 insertions(+), 50 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 434c9c34ae..aceaebfaab 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -689,8 +689,8 @@ static inline int fault_in_pages_readable(const char __user 
*uaddr, int size)
return 0;
 }
 
-int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
-   pgoff_t index, gfp_t gfp_mask);
+int add_to_page_cache(struct page *page, struct address_space *mapping,
+ pgoff_t index, gfp_t gfp_mask);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t index, gfp_t gfp_mask);
 extern void delete_from_page_cache(struct page *page);
@@ -710,22 +710,6 @@ void page_cache_readahead_unbounded(struct address_space 
*, struct file *,
pgoff_t index, unsigned long nr_to_read,
unsigned long lookahead_count);
 
-/*
- * Like add_to_page_cache_locked, but used to add newly allocated pages:
- * the page is new, so we can just run __SetPageLocked() against it.
- */
-static inline int add_to_page_cache(struct page *page,
-   struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask)
-{
-   int error;
-
-   __SetPageLocked(page);
-   error = add_to_page_cache_locked(page, mapping, offset, gfp_mask);
-   if (unlikely(error))
-   __ClearPageLocked(page);
-   return error;
-}
-
 /**
  * struct readahead_control - Describes a readahead request.
  *
diff --git a/mm/filemap.c b/mm/filemap.c
index 82e5e0ba24..c562ad7e05 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -827,10 +827,10 @@ int replace_page_cache_page(struct page *old, struct page 
*new, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL_GPL(replace_page_cache_page);
 
-static int __add_to_page_cache_locked(struct page *page,
- struct address_space *mapping,
- pgoff_t offset, gfp_t gfp_mask,
- void **shadowp)
+static int __add_to_page_cache(struct page *page,
+  struct address_space *mapping,
+  pgoff_t offset, gfp_t gfp_mask,
+  void **shadowp)
 {
XA_STATE(xas, &mapping->i_pages, offset);
int huge = PageHuge(page);
@@ -841,6 +841,7 @@ static int __add_to_page_cache_locked(struct page *page,
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
mapping_set_update(&xas, mapping);
 
+   __SetPageLocked(page);
get_page(page);
page->mapping = mapping;
page->index = offset;
@@ -885,29 +886,30 @@ static int __add_to_page_cache_locked(struct page *page,
page->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
put_page(page);
+   __ClearPageLocked(page);
return error;
 }
-ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
 
 /**
- * add_to_page_cache_locked - add a locked page to the pagecache
+ * add_to_page_cache - add a newly allocated page to the pagecache
  * @page:  page to add
  * @mapping:   the page's address_space
  * @offset:page index
  * @gfp_mask:  page allocation mode
  *
- * This function is used to add a page to the pagecache. It must be locked.
- * This function does not add the page to the LRU.  The caller must do that.
+ * This function is used to add a page to the pagecache. It must be newly
+ * allocated.  This function does not add the page to the LRU.  The caller must
+ * do that.
  *
  * Return: %0 on success, negative error code otherwise.
  */
-int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
-   pgoff_t offset, gfp_t gfp_mask)
+int add_to_page_cache(struct page *page, struct address_space *mapping,
+ pgoff_t offset, gfp_t gfp_mask)
 {
-   return __add_to_page_cache_locked(page, mapping, offset,
- gfp_mask, NULL);
+   return __add_to_page_cache(page, mapping, offset, gfp_mask, NULL);
 }
-EXPORT_SYMBOL(add_to_page_cache_locked);
+EXPORT_SYMBOL(add_to_page_cache);
+ALLOW_ERROR_INJECTION(add_to_page_cache, ERRNO);
 
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t offset, gfp_t gfp_mask)
@@ -915,26 +917,22 @@ int add_to_page_cache_lru(struct page *page, struct 
address_space *mapping,
void *shadow = NULL;
int ret;
 
-   __SetPageLocked(page);
-   ret = __add_to_page_cache_locked(page, mapping, offset,
-gfp_mask, &shadow);
+   ret = __add_to_page_cache(page, mapping, offset, gfp_mask, &shadow);
i

[PATCH 2/2] fs: generic_file_buffered_read() now uses find_get_pages_contig

2020-10-17 Thread Kent Overstreet
Convert generic_file_buffered_read() to get pages to read from in
batches, and then copy data to userspace from many pages at once - in
particular, we now don't touch any cachelines that might be contended
while we're in the loop to copy data to userspace.

This is a performance improvement on workloads that do buffered reads
with large blocksizes, and a very large performance improvement if that
file is also being accessed concurrently by different threads.

On smaller reads (512 bytes), there's a very small performance
improvement (1%, within the margin of error).

Signed-off-by: Kent Overstreet 
---
 mm/filemap.c | 300 +--
 1 file changed, 172 insertions(+), 128 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 482fd75d66..8cf5b5d9e6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2148,67 +2148,6 @@ static int lock_page_for_iocb(struct kiocb *iocb, struct 
page *page)
return lock_page_killable(page);
 }
 
-static int generic_file_buffered_read_page_ok(struct kiocb *iocb,
-   struct iov_iter *iter,
-   struct page *page)
-{
-   struct address_space *mapping = iocb->ki_filp->f_mapping;
-   struct inode *inode = mapping->host;
-   struct file_ra_state *ra = &iocb->ki_filp->f_ra;
-   unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
-   unsigned int bytes, copied;
-   loff_t isize, end_offset;
-
-   BUG_ON(iocb->ki_pos >> PAGE_SHIFT != page->index);
-
-   /*
-* i_size must be checked after we know the page is Uptodate.
-*
-* Checking i_size after the check allows us to calculate
-* the correct value for "bytes", which means the zero-filled
-* part of the page is not copied back to userspace (unless
-* another truncate extends the file - this is desired though).
-*/
-
-   isize = i_size_read(inode);
-   if (unlikely(iocb->ki_pos >= isize))
-   return 1;
-
-   end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
-
-   bytes = min_t(loff_t, end_offset - iocb->ki_pos, PAGE_SIZE - offset);
-
-   /* If users can be writing to this page using arbitrary
-* virtual addresses, take care about potential aliasing
-* before reading the page on the kernel side.
-*/
-   if (mapping_writably_mapped(mapping))
-   flush_dcache_page(page);
-
-   /*
-* Ok, we have the page, and it's up-to-date, so
-* now we can copy it to user space...
-*/
-
-   copied = copy_page_to_iter(page, offset, bytes, iter);
-
-   iocb->ki_pos += copied;
-
-   /*
-* When a sequential read accesses a page several times,
-* only mark it as accessed the first time.
-*/
-   if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT)
-   mark_page_accessed(page);
-
-   ra->prev_pos = iocb->ki_pos;
-
-   if (copied < bytes)
-   return -EFAULT;
-
-   return !iov_iter_count(iter) || iocb->ki_pos == isize;
-}
-
 static struct page *
 generic_file_buffered_read_readpage(struct kiocb *iocb,
struct file *filp,
@@ -2366,6 +2305,92 @@ generic_file_buffered_read_no_cached_page(struct kiocb 
*iocb,
return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
 }
 
+static int generic_file_buffered_read_get_pages(struct kiocb *iocb,
+   struct iov_iter *iter,
+   struct page **pages,
+   unsigned int nr)
+{
+   struct file *filp = iocb->ki_filp;
+   struct address_space *mapping = filp->f_mapping;
+   struct file_ra_state *ra = &filp->f_ra;
+   pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
+   pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> 
PAGE_SHIFT;
+   int i, j, nr_got, err = 0;
+
+   nr = min_t(unsigned long, last_index - index, nr);
+find_page:
+   if (fatal_signal_pending(current))
+   return -EINTR;
+
+   nr_got = find_get_pages_contig(mapping, index, nr, pages);
+   if (nr_got)
+   goto got_pages;
+
+   if (iocb->ki_flags & IOCB_NOIO)
+   return -EAGAIN;
+
+   page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);
+
+   nr_got = find_get_pages_contig(mapping, index, nr, pages);
+   if (nr_got)
+   goto got_pages;
+
+   pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter);
+   err = PTR_ERR_OR_ZERO(pages[0]);
+   if (!IS_ERR_OR_NULL(pages[0]))
+   nr_got = 1;
+got_pages:
+   for (i = 0; i < nr_got; i++) {
+   struct page *page = pages[i];
+   pgoff_t pg_inde

[PATCH 0/2] generic_file_buffered_read() refactoring, perf improvements

2020-10-17 Thread Kent Overstreet
Rebased this patchset onto 5.9. I'd like to finally get this in, because
generic_file_buffered_read() has turned into a real monstrosity to work with.
And it's a major performance improvement, for both small random and large
sequential reads. On my test box, 4k buffered random reads go from ~150k to
~250k iops, and the improvements to big sequential reads are even bigger.

This incorporates the fix for IOCB_WAITQ handling that Jens just posted as well;
it also factors out lock_page_for_iocb() to improve handling of the various iocb
flags.

Kent Overstreet (2):
  fs: Break generic_file_buffered_read up into multiple functions
  fs: generic_file_buffered_read() now uses find_get_pages_contig

 mm/filemap.c | 563 ++-
 1 file changed, 328 insertions(+), 235 deletions(-)

-- 
2.28.0
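
For readers who don't want to wade through the diff, the core idea of patch 2/2
can be sketched like this (a much-simplified illustration that ignores i_size,
readahead, locking and !Uptodate pages — not the actual mm/filemap.c code):

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/pagemap.h>
#include <linux/uio.h>

/*
 * Much-simplified sketch of the batching idea: look up a batch of
 * contiguous pagecache pages in one go, then copy out of all of them
 * before touching the (possibly contended) pagecache again.  i_size
 * checks, readahead, locking and !Uptodate handling are all omitted.
 */
static ssize_t example_batched_read(struct kiocb *iocb, struct iov_iter *iter)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct page *pages[16];
	ssize_t copied = 0;
	int err = 0;

	while (iov_iter_count(iter) && !err) {
		pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
		unsigned int i, nr;

		nr = find_get_pages_contig(mapping, index,
					   ARRAY_SIZE(pages), pages);
		if (!nr)
			break;	/* real code falls back to readahead/readpage */

		for (i = 0; i < nr; i++) {
			unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
			size_t bytes = min_t(size_t, iov_iter_count(iter),
					     PAGE_SIZE - offset);

			if (bytes && !err) {
				size_t n = copy_page_to_iter(pages[i], offset,
							     bytes, iter);

				iocb->ki_pos += n;
				copied += n;
				if (n < bytes)
					err = -EFAULT;
			}
			put_page(pages[i]);
		}
	}
	return copied ?: err;
}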



[PATCH 1/2] fs: Break generic_file_buffered_read up into multiple functions

2020-10-17 Thread Kent Overstreet
This is prep work for changing generic_file_buffered_read() to use
find_get_pages_contig() to batch up all the pagecache lookups.

This patch should be functionally identical to the existing code and
changes as little of the flow control as possible. More refactoring
could be done; this patch is intended to be relatively minimal.

Signed-off-by: Kent Overstreet 
Cc: Matthew Wilcox (Oracle) 
Cc: Jens Axboe 
---
 mm/filemap.c | 473 ---
 1 file changed, 261 insertions(+), 212 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 99c49eeae7..482fd75d66 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2138,6 +2138,234 @@ static void shrink_readahead_size_eio(struct 
file_ra_state *ra)
ra->ra_pages /= 4;
 }
 
+static int lock_page_for_iocb(struct kiocb *iocb, struct page *page)
+{
+   if (iocb->ki_flags & IOCB_WAITQ)
+   return lock_page_async(page, iocb->ki_waitq);
+   else if (iocb->ki_flags & IOCB_NOWAIT)
+   return trylock_page(page) ? 0 : -EAGAIN;
+   else
+   return lock_page_killable(page);
+}
+
+static int generic_file_buffered_read_page_ok(struct kiocb *iocb,
+   struct iov_iter *iter,
+   struct page *page)
+{
+   struct address_space *mapping = iocb->ki_filp->f_mapping;
+   struct inode *inode = mapping->host;
+   struct file_ra_state *ra = &iocb->ki_filp->f_ra;
+   unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
+   unsigned int bytes, copied;
+   loff_t isize, end_offset;
+
+   BUG_ON(iocb->ki_pos >> PAGE_SHIFT != page->index);
+
+   /*
+* i_size must be checked after we know the page is Uptodate.
+*
+* Checking i_size after the check allows us to calculate
+* the correct value for "bytes", which means the zero-filled
+* part of the page is not copied back to userspace (unless
+* another truncate extends the file - this is desired though).
+*/
+
+   isize = i_size_read(inode);
+   if (unlikely(iocb->ki_pos >= isize))
+   return 1;
+
+   end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
+
+   bytes = min_t(loff_t, end_offset - iocb->ki_pos, PAGE_SIZE - offset);
+
+   /* If users can be writing to this page using arbitrary
+* virtual addresses, take care about potential aliasing
+* before reading the page on the kernel side.
+*/
+   if (mapping_writably_mapped(mapping))
+   flush_dcache_page(page);
+
+   /*
+* Ok, we have the page, and it's up-to-date, so
+* now we can copy it to user space...
+*/
+
+   copied = copy_page_to_iter(page, offset, bytes, iter);
+
+   iocb->ki_pos += copied;
+
+   /*
+* When a sequential read accesses a page several times,
+* only mark it as accessed the first time.
+*/
+   if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT)
+   mark_page_accessed(page);
+
+   ra->prev_pos = iocb->ki_pos;
+
+   if (copied < bytes)
+   return -EFAULT;
+
+   return !iov_iter_count(iter) || iocb->ki_pos == isize;
+}
+
+static struct page *
+generic_file_buffered_read_readpage(struct kiocb *iocb,
+   struct file *filp,
+   struct address_space *mapping,
+   struct page *page)
+{
+   struct file_ra_state *ra = &filp->f_ra;
+   int error;
+
+   if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
+   unlock_page(page);
+   put_page(page);
+   return ERR_PTR(-EAGAIN);
+   }
+
+   /*
+* A previous I/O error may have been due to temporary
+* failures, eg. multipath errors.
+* PG_error will be set again if readpage fails.
+*/
+   ClearPageError(page);
+   /* Start the actual read. The read will unlock the page. */
+   error = mapping->a_ops->readpage(filp, page);
+
+   if (unlikely(error)) {
+   put_page(page);
+   return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL;
+   }
+
+   if (!PageUptodate(page)) {
+   error = lock_page_for_iocb(iocb, page);
+   if (unlikely(error)) {
+   put_page(page);
+   return ERR_PTR(error);
+   }
+   if (!PageUptodate(page)) {
+   if (page->mapping == NULL) {
+   /*
+* invalidate_mapping_pages got it
+*/
+   unlock_page(page);
+   put_page(page);
+   return NULL;
+   }
+   

Re: [PATCH 00/13] lib/generic-radix-tree: genradix bug fix and optimisations.

2020-08-25 Thread Kent Overstreet
On Tue, Aug 25, 2020 at 04:00:35PM +, David Laight wrote:
> From: 'Marcelo Ricardo Leitner'
> > Sent: 25 August 2020 16:41
> > 
> > On Tue, Aug 25, 2020 at 02:52:34PM +, David Laight wrote:
> > > The genradix code is used by SCTP for accessing per-stream data.
> > > This means there are quite a lot of lookups but the code wasn't
> > > really optimised at all.
> > 
> > My test box is down for the moment and will bring it on later today or
> > tomorrow, so I can't test it yet. What should we expect as performance
> > gains here?
> 
> Not sure, probably not much, but it ought to show up :-)
> There'll be bigger gains on a cpu that has software ilog2().
> 
> I've only checked SCTP still works.
> I've requested 32k streams on a listener - to force a level-2 tree.
> I've also done at least one check with a massive pad in the sctp
> stream structure.

Have you benchmarked at all? Or were you looking at the generated assembly?


Fixup patch for [PATCH 0/2] generic_file_buffered_read() refactoring & optimization

2020-06-29 Thread Kent Overstreet
Andrew - fixup patch because I got a bug report where we were trying to do an
order 7 allocation here:

-- >8 --
Subject: [PATCH] fixup! fs: generic_file_buffered_read() now uses
 find_get_pages_contig

We shouldn't try to pin too many pages at once, reads can be almost
arbitrarily big.

Signed-off-by: Kent Overstreet 
---
 mm/filemap.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index d8bd5e9647..b3a2aad1b7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2220,8 +2220,9 @@ static ssize_t generic_file_buffered_read(struct kiocb 
*iocb,
struct inode *inode = mapping->host;
size_t orig_count = iov_iter_count(iter);
struct page *pages_onstack[8], **pages = NULL;
-   unsigned int nr_pages = ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) 
>> PAGE_SHIFT) -
-   (iocb->ki_pos >> PAGE_SHIFT);
+   unsigned int nr_pages = min_t(unsigned int, 512,
+   ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> 
PAGE_SHIFT) -
+   (iocb->ki_pos >> PAGE_SHIFT));
int i, pg_nr, error = 0;
bool writably_mapped;
loff_t isize, end_offset;
-- 
2.27.0




Re: [PATCH v3 0/2] generic_file_buffered_read() refactoring & optimization

2020-06-19 Thread Kent Overstreet
On Fri, Jun 19, 2020 at 05:59:20AM -0700, Christoph Hellwig wrote:
> After looking at v2 yesterday I noticed I few things in the structure
> that I really didn't like:
> 
>  - using a struct page * return value just to communicate status codes
>  - the extremely long function names
>  - a generally somewhat non-intuitive split of the helpers
> 
> I then hacked on top of it for a bit while sitting in a telephone
> conference.  Below is my result, which passes a quick xfstests run.
> Note that this includes the refactoring and the batch lookup changes
> as I did it on top of your series:

I like it - I can't get your patch to apply to anything though, do you have it
up in a git repo anywhere?

> 
> diff --git a/mm/filemap.c b/mm/filemap.c
> index f0ae9a6308cb4d..9e0aecd99950f4 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -1972,273 +1972,360 @@ static void shrink_readahead_size_eio(struct 
> file_ra_state *ra)
>   ra->ra_pages /= 4;
>  }
>  
> -/**
> - * generic_file_buffered_read - generic file read routine
> - * @iocb:the iocb to read
> - * @iter:data destination
> - * @written: already copied
> - *
> - * This is a generic file read routine, and uses the
> - * mapping->a_ops->readpage() function for the actual low-level stuff.
> - *
> - * This is really ugly. But the goto's actually try to clarify some
> - * of the logic when it comes to error handling etc.
> - *
> - * Return:
> - * * total number of bytes copied, including those the were already @written
> - * * negative error code if nothing was copied
> - */
> -ssize_t generic_file_buffered_read(struct kiocb *iocb,
> - struct iov_iter *iter, ssize_t written)
> +static inline pgoff_t filemap_last_index(struct kiocb *iocb,
> + struct iov_iter *iter)
>  {
> - struct file *filp = iocb->ki_filp;
> - struct address_space *mapping = filp->f_mapping;
> - struct inode *inode = mapping->host;
> - struct file_ra_state *ra = &filp->f_ra;
> - loff_t *ppos = &iocb->ki_pos;
> - pgoff_t index;
> - pgoff_t last_index;
> - pgoff_t prev_index;
> - unsigned long offset;  /* offset into pagecache page */
> - unsigned int prev_offset;
> - int error = 0;
> -
> - if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
> - return 0;
> - iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
> -
> - index = *ppos >> PAGE_SHIFT;
> - prev_index = ra->prev_pos >> PAGE_SHIFT;
> - prev_offset = ra->prev_pos & (PAGE_SIZE-1);
> - last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
> - offset = *ppos & ~PAGE_MASK;
> + return (iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT;
> +}
>  
> - for (;;) {
> - struct page *page;
> - pgoff_t end_index;
> - loff_t isize;
> - unsigned long nr, ret;
> +static inline unsigned long filemap_nr_pages(struct kiocb *iocb,
> + struct iov_iter *iter)
> +{
> + return filemap_last_index(iocb, iter) - (iocb->ki_pos >> PAGE_SHIFT);
> +}
>  
> - cond_resched();
> -find_page:
> - if (fatal_signal_pending(current)) {
> - error = -EINTR;
> - goto out;
> - }
> +static int __filemap_read_not_uptodate(struct file *file, struct page *page)
> +{
> + int error;
>  
> - page = find_get_page(mapping, index);
> - if (!page) {
> - if (iocb->ki_flags & IOCB_NOWAIT)
> - goto would_block;
> - page_cache_sync_readahead(mapping,
> - ra, filp,
> - index, last_index - index);
> - page = find_get_page(mapping, index);
> - if (unlikely(page == NULL))
> - goto no_cached_page;
> - }
> - if (PageReadahead(page)) {
> - page_cache_async_readahead(mapping,
> - ra, filp, page,
> - index, last_index - index);
> - }
> - if (!PageUptodate(page)) {
> - if (iocb->ki_flags & IOCB_NOWAIT) {
> - put_page(page);
> - goto would_block;
> - }
> + error = lock_page_killable(page);
> + if (error)
> + return error;
>  
> + if (!PageUptodate(page)) {
> + if (!page->mapping) {
>   /*
> -  * See comment in do_read_cache_page on why
> -  * wait_on_page_locked is used to avoid unnecessarily
> -  * serialisations and why it's safe.
> +  * invalidate_mapping_pages got it
>*/
> - error = wait_on_page_locked_killable(page);
> - if (unlikely(error))
> - 

[PATCH v3 1/2] fs: Break generic_file_buffered_read up into multiple functions

2020-06-18 Thread Kent Overstreet
This is prep work for changing generic_file_buffered_read() to use
find_get_pages_contig() to batch up all the pagecache lookups.

This patch should be functionally identical to the existing code and
changes as little of the flow control as possible. More refactoring
could be done; this patch is intended to be relatively minimal.

Signed-off-by: Kent Overstreet 
---
 mm/filemap.c | 418 ---
 1 file changed, 228 insertions(+), 190 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 23a051a7ef..dc4a72042e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1975,6 +1975,210 @@ static void shrink_readahead_size_eio(struct 
file_ra_state *ra)
ra->ra_pages /= 4;
 }
 
+static int generic_file_buffered_read_page_ok(struct kiocb *iocb,
+   struct iov_iter *iter,
+   struct page *page)
+{
+   struct address_space *mapping = iocb->ki_filp->f_mapping;
+   struct inode *inode = mapping->host;
+   struct file_ra_state *ra = &iocb->ki_filp->f_ra;
+   unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
+   unsigned int bytes, copied;
+   loff_t isize, end_offset;
+
+   BUG_ON(iocb->ki_pos >> PAGE_SHIFT != page->index);
+
+   /*
+* i_size must be checked after we know the page is Uptodate.
+*
+* Checking i_size after the check allows us to calculate
+* the correct value for "bytes", which means the zero-filled
+* part of the page is not copied back to userspace (unless
+* another truncate extends the file - this is desired though).
+*/
+
+   isize = i_size_read(inode);
+   if (unlikely(iocb->ki_pos >= isize))
+   return 1;
+
+   end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
+
+   bytes = min_t(loff_t, end_offset - iocb->ki_pos, PAGE_SIZE - offset);
+
+   /* If users can be writing to this page using arbitrary
+* virtual addresses, take care about potential aliasing
+* before reading the page on the kernel side.
+*/
+   if (mapping_writably_mapped(mapping))
+   flush_dcache_page(page);
+
+   /*
+* Ok, we have the page, and it's up-to-date, so
+* now we can copy it to user space...
+*/
+
+   copied = copy_page_to_iter(page, offset, bytes, iter);
+
+   iocb->ki_pos += copied;
+
+   /*
+* When a sequential read accesses a page several times,
+* only mark it as accessed the first time.
+*/
+   if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT)
+   mark_page_accessed(page);
+
+   ra->prev_pos = iocb->ki_pos;
+
+   if (copied < bytes)
+   return -EFAULT;
+
+   return !iov_iter_count(iter) || iocb->ki_pos == isize;
+}
+
+static struct page *
+generic_file_buffered_read_readpage(struct file *filp,
+   struct address_space *mapping,
+   struct page *page)
+{
+   struct file_ra_state *ra = &filp->f_ra;
+   int error;
+
+   /*
+* A previous I/O error may have been due to temporary
+* failures, eg. multipath errors.
+* PG_error will be set again if readpage fails.
+*/
+   ClearPageError(page);
+   /* Start the actual read. The read will unlock the page. */
+   error = mapping->a_ops->readpage(filp, page);
+
+   if (unlikely(error)) {
+   put_page(page);
+   return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL;
+   }
+
+   if (!PageUptodate(page)) {
+   error = lock_page_killable(page);
+   if (unlikely(error)) {
+   put_page(page);
+   return ERR_PTR(error);
+   }
+   if (!PageUptodate(page)) {
+   if (page->mapping == NULL) {
+   /*
+* invalidate_mapping_pages got it
+*/
+   unlock_page(page);
+   put_page(page);
+   return NULL;
+   }
+   unlock_page(page);
+   shrink_readahead_size_eio(ra);
+   put_page(page);
+   return ERR_PTR(-EIO);
+   }
+   unlock_page(page);
+   }
+
+   return page;
+}
+
+static struct page *
+generic_file_buffered_read_pagenotuptodate(struct file *filp,
+  struct iov_iter *iter,
+  struct page *page,
+  loff_t pos, loff_t count)
+{
+   struct address_space *mapping = filp->f_mapping;
+   struct inode *inode = mapping->host;
+

[PATCH v3 2/2] fs: generic_file_buffered_read() now uses find_get_pages_contig

2020-06-18 Thread Kent Overstreet
Convert generic_file_buffered_read() to get pages to read from in
batches, and then copy data to userspace from many pages at once - in
particular, we now don't touch any cachelines that might be contended
while we're in the loop to copy data to userspace.

This is a performance improvement on workloads that do buffered reads
with large blocksizes, and a very large performance improvement if that
file is also being accessed concurrently by different threads.

On smaller reads (512 bytes), there's a very small performance
improvement (1%, within the margin of error).

Signed-off-by: Kent Overstreet 
---
 mm/filemap.c | 279 +--
 1 file changed, 159 insertions(+), 120 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index dc4a72042e..d8bd5e9647 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1975,67 +1975,6 @@ static void shrink_readahead_size_eio(struct 
file_ra_state *ra)
ra->ra_pages /= 4;
 }
 
-static int generic_file_buffered_read_page_ok(struct kiocb *iocb,
-   struct iov_iter *iter,
-   struct page *page)
-{
-   struct address_space *mapping = iocb->ki_filp->f_mapping;
-   struct inode *inode = mapping->host;
-   struct file_ra_state *ra = &iocb->ki_filp->f_ra;
-   unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
-   unsigned int bytes, copied;
-   loff_t isize, end_offset;
-
-   BUG_ON(iocb->ki_pos >> PAGE_SHIFT != page->index);
-
-   /*
-* i_size must be checked after we know the page is Uptodate.
-*
-* Checking i_size after the check allows us to calculate
-* the correct value for "bytes", which means the zero-filled
-* part of the page is not copied back to userspace (unless
-* another truncate extends the file - this is desired though).
-*/
-
-   isize = i_size_read(inode);
-   if (unlikely(iocb->ki_pos >= isize))
-   return 1;
-
-   end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
-
-   bytes = min_t(loff_t, end_offset - iocb->ki_pos, PAGE_SIZE - offset);
-
-   /* If users can be writing to this page using arbitrary
-* virtual addresses, take care about potential aliasing
-* before reading the page on the kernel side.
-*/
-   if (mapping_writably_mapped(mapping))
-   flush_dcache_page(page);
-
-   /*
-* Ok, we have the page, and it's up-to-date, so
-* now we can copy it to user space...
-*/
-
-   copied = copy_page_to_iter(page, offset, bytes, iter);
-
-   iocb->ki_pos += copied;
-
-   /*
-* When a sequential read accesses a page several times,
-* only mark it as accessed the first time.
-*/
-   if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT)
-   mark_page_accessed(page);
-
-   ra->prev_pos = iocb->ki_pos;
-
-   if (copied < bytes)
-   return -EFAULT;
-
-   return !iov_iter_count(iter) || iocb->ki_pos == isize;
-}
-
 static struct page *
 generic_file_buffered_read_readpage(struct file *filp,
struct address_space *mapping,
@@ -2179,6 +2118,83 @@ generic_file_buffered_read_no_cached_page(struct kiocb 
*iocb,
return generic_file_buffered_read_readpage(filp, mapping, page);
 }
 
+static int generic_file_buffered_read_get_pages(struct kiocb *iocb,
+   struct iov_iter *iter,
+   struct page **pages,
+   unsigned int nr)
+{
+   struct file *filp = iocb->ki_filp;
+   struct address_space *mapping = filp->f_mapping;
+   struct file_ra_state *ra = &filp->f_ra;
+   pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
+   pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> 
PAGE_SHIFT;
+   int i, j, nr_got, err = 0;
+
+   nr = min_t(unsigned long, last_index - index, nr);
+find_page:
+   if (fatal_signal_pending(current))
+   return -EINTR;
+
+   nr_got = find_get_pages_contig(mapping, index, nr, pages);
+   if (nr_got)
+   goto got_pages;
+
+   if (iocb->ki_flags & IOCB_NOWAIT)
+   return -EAGAIN;
+
+   page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);
+
+   nr_got = find_get_pages_contig(mapping, index, nr, pages);
+   if (nr_got)
+   goto got_pages;
+
+   pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter);
+   err = PTR_ERR_OR_ZERO(pages[0]);
+   if (!IS_ERR_OR_NULL(pages[0]))
+   nr_got = 1;
+got_pages:
+   for (i = 0; i < nr_got; i++) {
+   struct page *page = pages[i];
+   pgoff_t pg_index = index + i;
+   

[PATCH v3 0/2] generic_file_buffered_read() refactoring & optimization

2020-06-18 Thread Kent Overstreet
Ok - here's a new version, I fixed the checkpatch stuff and the thing with ret
should be more readable now:

Kent Overstreet (2):
  fs: Break generic_file_buffered_read up into multiple functions
  fs: generic_file_buffered_read() now uses find_get_pages_contig

 mm/filemap.c | 497 +--
 1 file changed, 287 insertions(+), 210 deletions(-)

-- 
2.26.2



Re: [PATCH 2/2] fs: generic_file_buffered_read() now uses find_get_pages_contig

2020-06-09 Thread Kent Overstreet
On Tue, Jun 09, 2020 at 06:38:08PM -0700, Matthew Wilcox wrote:
> On Tue, Jun 09, 2020 at 08:10:36PM -0400, Kent Overstreet wrote:
> > Convert generic_file_buffered_read() to get pages to read from in
> > batches, and then copy data to userspace from many pages at once - in
> > particular, we now don't touch any cachelines that might be contended
> > while we're in the loop to copy data to userspace.
> > 
> > This is is a performance improvement on workloads that do buffered reads
> > with large blocksizes, and a very large performance improvement if that
> > file is also being accessed concurrently by different threads.
> 
> Hey, you're stealing my performance improvements!

:)

> Granted, I haven't got to doing performance optimisations (certainly
> not in this function), but this is one of the places where THP in the
> page cache will have a useful performance improvement.
> 
> I'm not opposed to putting this in, but I may back it out as part of
> the THP work because the THPs will get the same performance improvements
> that you're seeing here with less code.

I'm an _enthusiastic_ supporter of the THP stuff (as you know), but my feeling
is that it's going to be a long time before hugepages are everywhere - and I
think even with the pagevec stuff generic_file_buffered_read() is somewhat
easier to read and deal with after this series than before it.

Though I could see the pagevec stuff making hugepage support a pain, so there is
that. Eh.


[PATCH v2 1/2] fs: Break generic_file_buffered_read up into multiple functions

2020-06-09 Thread Kent Overstreet
This is prep work for changing generic_file_buffered_read() to use
find_get_pages_contig() to batch up all the pagecache lookups.

This patch should be functionally identical to the existing code and
changes as little of the flow control as possible. More refactoring
could be done; this patch is intended to be relatively minimal.

Signed-off-by: Kent Overstreet 
---
 mm/filemap.c | 418 ---
 1 file changed, 228 insertions(+), 190 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index e67fa8ab48..206d51a1c9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2051,6 +2051,210 @@ static void shrink_readahead_size_eio(struct 
file_ra_state *ra)
ra->ra_pages /= 4;
 }
 
+static int generic_file_buffered_read_page_ok(struct kiocb *iocb,
+   struct iov_iter *iter,
+   struct page *page)
+{
+   struct address_space *mapping = iocb->ki_filp->f_mapping;
+   struct inode *inode = mapping->host;
+   struct file_ra_state *ra = &iocb->ki_filp->f_ra;
+   unsigned offset = iocb->ki_pos & ~PAGE_MASK;
+   unsigned bytes, copied;
+   loff_t isize, end_offset;
+
+   BUG_ON(iocb->ki_pos >> PAGE_SHIFT != page->index);
+
+   /*
+* i_size must be checked after we know the page is Uptodate.
+*
+* Checking i_size after the check allows us to calculate
+* the correct value for "bytes", which means the zero-filled
+* part of the page is not copied back to userspace (unless
+* another truncate extends the file - this is desired though).
+*/
+
+   isize = i_size_read(inode);
+   if (unlikely(iocb->ki_pos >= isize))
+   return 1;
+
+   end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
+
+   bytes = min_t(loff_t, end_offset - iocb->ki_pos, PAGE_SIZE - offset);
+
+   /* If users can be writing to this page using arbitrary
+* virtual addresses, take care about potential aliasing
+* before reading the page on the kernel side.
+*/
+   if (mapping_writably_mapped(mapping))
+   flush_dcache_page(page);
+
+   /*
+* Ok, we have the page, and it's up-to-date, so
+* now we can copy it to user space...
+*/
+
+   copied = copy_page_to_iter(page, offset, bytes, iter);
+
+   iocb->ki_pos += copied;
+
+   /*
+* When a sequential read accesses a page several times,
+* only mark it as accessed the first time.
+*/
+   if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT)
+   mark_page_accessed(page);
+
+   ra->prev_pos = iocb->ki_pos;
+
+   if (copied < bytes)
+   return -EFAULT;
+
+   return !iov_iter_count(iter) || iocb->ki_pos == isize;
+}
+
+static struct page *
+generic_file_buffered_read_readpage(struct file *filp,
+   struct address_space *mapping,
+   struct page *page)
+{
+   struct file_ra_state *ra = &filp->f_ra;
+   int error;
+
+   /*
+* A previous I/O error may have been due to temporary
+* failures, eg. multipath errors.
+* PG_error will be set again if readpage fails.
+*/
+   ClearPageError(page);
+   /* Start the actual read. The read will unlock the page. */
+   error = mapping->a_ops->readpage(filp, page);
+
+   if (unlikely(error)) {
+   put_page(page);
+   return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL;
+   }
+
+   if (!PageUptodate(page)) {
+   error = lock_page_killable(page);
+   if (unlikely(error)) {
+   put_page(page);
+   return ERR_PTR(error);
+   }
+   if (!PageUptodate(page)) {
+   if (page->mapping == NULL) {
+   /*
+* invalidate_mapping_pages got it
+*/
+   unlock_page(page);
+   put_page(page);
+   return NULL;
+   }
+   unlock_page(page);
+   shrink_readahead_size_eio(ra);
+   put_page(page);
+   return ERR_PTR(-EIO);
+   }
+   unlock_page(page);
+   }
+
+   return page;
+}
+
+static struct page *
+generic_file_buffered_read_pagenotuptodate(struct file *filp,
+  struct iov_iter *iter,
+  struct page *page,
+  loff_t pos, loff_t count)
+{
+   struct address_space *mapping = filp->f_mapping;
+   struct inode *inode = mapping->host;
+   i

[PATCH v2 2/2] fs: generic_file_buffered_read() now uses find_get_pages_contig

2020-06-09 Thread Kent Overstreet
Convert generic_file_buffered_read() to get pages to read from in
batches, and then copy data to userspace from many pages at once - in
particular, we now don't touch any cachelines that might be contended
while we're in the loop to copy data to userspace.

This is a performance improvement on workloads that do buffered reads
with large blocksizes, and a very large performance improvement if that
file is also being accessed concurrently by different threads.

On smaller reads (512 bytes), there's a very small performance
improvement (1%, within the margin of error).

Signed-off-by: Kent Overstreet 
---
 mm/filemap.c | 276 +--
 1 file changed, 155 insertions(+), 121 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 206d51a1c9..4fb0e5a238 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2051,67 +2051,6 @@ static void shrink_readahead_size_eio(struct 
file_ra_state *ra)
ra->ra_pages /= 4;
 }
 
-static int generic_file_buffered_read_page_ok(struct kiocb *iocb,
-   struct iov_iter *iter,
-   struct page *page)
-{
-   struct address_space *mapping = iocb->ki_filp->f_mapping;
-   struct inode *inode = mapping->host;
-   struct file_ra_state *ra = &iocb->ki_filp->f_ra;
-   unsigned offset = iocb->ki_pos & ~PAGE_MASK;
-   unsigned bytes, copied;
-   loff_t isize, end_offset;
-
-   BUG_ON(iocb->ki_pos >> PAGE_SHIFT != page->index);
-
-   /*
-* i_size must be checked after we know the page is Uptodate.
-*
-* Checking i_size after the check allows us to calculate
-* the correct value for "bytes", which means the zero-filled
-* part of the page is not copied back to userspace (unless
-* another truncate extends the file - this is desired though).
-*/
-
-   isize = i_size_read(inode);
-   if (unlikely(iocb->ki_pos >= isize))
-   return 1;
-
-   end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
-
-   bytes = min_t(loff_t, end_offset - iocb->ki_pos, PAGE_SIZE - offset);
-
-   /* If users can be writing to this page using arbitrary
-* virtual addresses, take care about potential aliasing
-* before reading the page on the kernel side.
-*/
-   if (mapping_writably_mapped(mapping))
-   flush_dcache_page(page);
-
-   /*
-* Ok, we have the page, and it's up-to-date, so
-* now we can copy it to user space...
-*/
-
-   copied = copy_page_to_iter(page, offset, bytes, iter);
-
-   iocb->ki_pos += copied;
-
-   /*
-* When a sequential read accesses a page several times,
-* only mark it as accessed the first time.
-*/
-   if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT)
-   mark_page_accessed(page);
-
-   ra->prev_pos = iocb->ki_pos;
-
-   if (copied < bytes)
-   return -EFAULT;
-
-   return !iov_iter_count(iter) || iocb->ki_pos == isize;
-}
-
 static struct page *
 generic_file_buffered_read_readpage(struct file *filp,
struct address_space *mapping,
@@ -2255,6 +2194,79 @@ generic_file_buffered_read_no_cached_page(struct kiocb 
*iocb,
return generic_file_buffered_read_readpage(filp, mapping, page);
 }
 
+static int generic_file_buffered_read_get_pages(struct kiocb *iocb,
+   struct iov_iter *iter,
+   struct page **pages,
+   unsigned nr)
+{
+   struct file *filp = iocb->ki_filp;
+   struct address_space *mapping = filp->f_mapping;
+   struct file_ra_state *ra = &filp->f_ra;
+   pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
+   pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> 
PAGE_SHIFT;
+   int i, j, ret, err = 0;
+
+   nr = min_t(unsigned long, last_index - index, nr);
+find_page:
+   if (fatal_signal_pending(current))
+   return -EINTR;
+
+   ret = find_get_pages_contig(mapping, index, nr, pages);
+   if (ret)
+   goto got_pages;
+
+   if (iocb->ki_flags & IOCB_NOWAIT)
+   return -EAGAIN;
+
+   page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);
+
+   ret = find_get_pages_contig(mapping, index, nr, pages);
+   if (ret)
+   goto got_pages;
+
+   pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter);
+   err = PTR_ERR_OR_ZERO(pages[0]);
+   ret = !IS_ERR_OR_NULL(pages[0]);
+got_pages:
+   for (i = 0; i < ret; i++) {
+   struct page *page = pages[i];
+   pgoff_t pg_index = index + i;
+   loff_t pg_pos = max(iocb->ki_pos,
+

Re: [PATCH 2/2] fs: generic_file_buffered_read() now uses find_get_pages_contig

2020-06-09 Thread Kent Overstreet
On Tue, Jun 09, 2020 at 05:47:53PM -0700, Matthew Wilcox wrote:
> On Tue, Jun 09, 2020 at 08:10:36PM -0400, Kent Overstreet wrote:
> > @@ -2275,83 +2287,93 @@ static ssize_t generic_file_buffered_read(struct 
> > kiocb *iocb,
> > struct iov_iter *iter, ssize_t written)
> >  {
> > struct file *filp = iocb->ki_filp;
> > +   struct file_ra_state *ra = &filp->f_ra;
> > struct address_space *mapping = filp->f_mapping;
> > struct inode *inode = mapping->host;
> > -   struct file_ra_state *ra = &filp->f_ra;
> > size_t orig_count = iov_iter_count(iter);
> > -   pgoff_t last_index;
> > -   int error = 0;
> > +   struct page *pages[64];
> 
> That's 512 bytes which seems like a lot of stack space.  Would 16 be
> enough to see a significant fraction of the benefit?

Ah right, we do call into fs code for readahead from here. I'll switch it to
kmalloc the page array if it's more than 16.


[PATCH 2/2] fs: generic_file_buffered_read() now uses find_get_pages_contig

2020-06-09 Thread Kent Overstreet
Convert generic_file_buffered_read() to get pages to read from in
batches, and then copy data to userspace from many pages at once - in
particular, we now don't touch any cachelines that might be contended
while we're in the loop to copy data to userspace.

This is a performance improvement on workloads that do buffered reads
with large blocksizes, and a very large performance improvement if that
file is also being accessed concurrently by different threads.

On smaller reads (512 bytes), there's a very small performance
improvement (1%, within the margin of error).

Signed-off-by: Kent Overstreet 
---
 mm/filemap.c | 266 ---
 1 file changed, 144 insertions(+), 122 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 206d51a1c9..0d1836081c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2051,67 +2051,6 @@ static void shrink_readahead_size_eio(struct 
file_ra_state *ra)
ra->ra_pages /= 4;
 }
 
-static int generic_file_buffered_read_page_ok(struct kiocb *iocb,
-   struct iov_iter *iter,
-   struct page *page)
-{
-   struct address_space *mapping = iocb->ki_filp->f_mapping;
-   struct inode *inode = mapping->host;
-   struct file_ra_state *ra = &iocb->ki_filp->f_ra;
-   unsigned offset = iocb->ki_pos & ~PAGE_MASK;
-   unsigned bytes, copied;
-   loff_t isize, end_offset;
-
-   BUG_ON(iocb->ki_pos >> PAGE_SHIFT != page->index);
-
-   /*
-* i_size must be checked after we know the page is Uptodate.
-*
-* Checking i_size after the check allows us to calculate
-* the correct value for "bytes", which means the zero-filled
-* part of the page is not copied back to userspace (unless
-* another truncate extends the file - this is desired though).
-*/
-
-   isize = i_size_read(inode);
-   if (unlikely(iocb->ki_pos >= isize))
-   return 1;
-
-   end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
-
-   bytes = min_t(loff_t, end_offset - iocb->ki_pos, PAGE_SIZE - offset);
-
-   /* If users can be writing to this page using arbitrary
-* virtual addresses, take care about potential aliasing
-* before reading the page on the kernel side.
-*/
-   if (mapping_writably_mapped(mapping))
-   flush_dcache_page(page);
-
-   /*
-* Ok, we have the page, and it's up-to-date, so
-* now we can copy it to user space...
-*/
-
-   copied = copy_page_to_iter(page, offset, bytes, iter);
-
-   iocb->ki_pos += copied;
-
-   /*
-* When a sequential read accesses a page several times,
-* only mark it as accessed the first time.
-*/
-   if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT)
-   mark_page_accessed(page);
-
-   ra->prev_pos = iocb->ki_pos;
-
-   if (copied < bytes)
-   return -EFAULT;
-
-   return !iov_iter_count(iter) || iocb->ki_pos == isize;
-}
-
 static struct page *
 generic_file_buffered_read_readpage(struct file *filp,
struct address_space *mapping,
@@ -2255,6 +2194,79 @@ generic_file_buffered_read_no_cached_page(struct kiocb 
*iocb,
return generic_file_buffered_read_readpage(filp, mapping, page);
 }
 
+static int generic_file_buffered_read_get_pages(struct kiocb *iocb,
+   struct iov_iter *iter,
+   struct page **pages,
+   unsigned nr)
+{
+   struct file *filp = iocb->ki_filp;
+   struct address_space *mapping = filp->f_mapping;
+   struct file_ra_state *ra = &filp->f_ra;
+   pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
+   pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> 
PAGE_SHIFT;
+   int i, j, ret, err = 0;
+
+   nr = min_t(unsigned long, last_index - index, nr);
+find_page:
+   if (fatal_signal_pending(current))
+   return -EINTR;
+
+   ret = find_get_pages_contig(mapping, index, nr, pages);
+   if (ret)
+   goto got_pages;
+
+   if (iocb->ki_flags & IOCB_NOWAIT)
+   return -EAGAIN;
+
+   page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);
+
+   ret = find_get_pages_contig(mapping, index, nr, pages);
+   if (ret)
+   goto got_pages;
+
+   pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter);
+   err = PTR_ERR_OR_ZERO(pages[0]);
+   ret = !IS_ERR_OR_NULL(pages[0]);
+got_pages:
+   for (i = 0; i < ret; i++) {
+   struct page *page = pages[i];
+   pgoff_t pg_index = index +i;
+   loff_t pg_pos = max(iocb->ki_pos,
+

[PATCH 0/2] generic_file_buffered_read() refactoring & optimization

2020-06-09 Thread Kent Overstreet
This is a small patch series that's been in the bcachefs tree for a while.

In the buffered read path, we look up a page in the page cache, then copy from
that page in a loop - i.e. mixing the data copies in between looking up each
individual page. When we're doing large reads from the page cache, this is some
pretty major overhead.

This just reworks generic_file_buffered_read() to use find_get_pages_contig()
and work on an array of pages. It's a pretty significant performance
improvement for large buffered reads, and doesn't regress performance on single
page reads.

As a bonus, generic_file_buffered_read() gets broken up into multiple functions
that are _somewhat_ easier to follow.

Kent Overstreet (2):
  fs: Break generic_file_buffered_read up into multiple functions
  fs: generic_file_buffered_read() now uses find_get_pages_contig

 mm/filemap.c | 486 +--
 1 file changed, 273 insertions(+), 213 deletions(-)

-- 
2.27.0



[PATCH 1/2] fs: Break generic_file_buffered_read up into multiple functions

2020-06-09 Thread Kent Overstreet
This is prep work for changing generic_file_buffered_read() to use
find_get_pages_contig() to batch up all the pagecache lookups.

This patch should be functionally identical to the existing code and
changes as little of the flow control as possible. More refactoring
could be done; this patch is intended to be relatively minimal.

Signed-off-by: Kent Overstreet 
---
 mm/filemap.c | 418 ---
 1 file changed, 228 insertions(+), 190 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index e67fa8ab48..206d51a1c9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2051,6 +2051,210 @@ static void shrink_readahead_size_eio(struct 
file_ra_state *ra)
ra->ra_pages /= 4;
 }
 
+static int generic_file_buffered_read_page_ok(struct kiocb *iocb,
+   struct iov_iter *iter,
+   struct page *page)
+{
+   struct address_space *mapping = iocb->ki_filp->f_mapping;
+   struct inode *inode = mapping->host;
+   struct file_ra_state *ra = &iocb->ki_filp->f_ra;
+   unsigned offset = iocb->ki_pos & ~PAGE_MASK;
+   unsigned bytes, copied;
+   loff_t isize, end_offset;
+
+   BUG_ON(iocb->ki_pos >> PAGE_SHIFT != page->index);
+
+   /*
+* i_size must be checked after we know the page is Uptodate.
+*
+* Checking i_size after the check allows us to calculate
+* the correct value for "bytes", which means the zero-filled
+* part of the page is not copied back to userspace (unless
+* another truncate extends the file - this is desired though).
+*/
+
+   isize = i_size_read(inode);
+   if (unlikely(iocb->ki_pos >= isize))
+   return 1;
+
+   end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
+
+   bytes = min_t(loff_t, end_offset - iocb->ki_pos, PAGE_SIZE - offset);
+
+   /* If users can be writing to this page using arbitrary
+* virtual addresses, take care about potential aliasing
+* before reading the page on the kernel side.
+*/
+   if (mapping_writably_mapped(mapping))
+   flush_dcache_page(page);
+
+   /*
+* Ok, we have the page, and it's up-to-date, so
+* now we can copy it to user space...
+*/
+
+   copied = copy_page_to_iter(page, offset, bytes, iter);
+
+   iocb->ki_pos += copied;
+
+   /*
+* When a sequential read accesses a page several times,
+* only mark it as accessed the first time.
+*/
+   if (iocb->ki_pos >> PAGE_SHIFT != ra->prev_pos >> PAGE_SHIFT)
+   mark_page_accessed(page);
+
+   ra->prev_pos = iocb->ki_pos;
+
+   if (copied < bytes)
+   return -EFAULT;
+
+   return !iov_iter_count(iter) || iocb->ki_pos == isize;
+}
+
+static struct page *
+generic_file_buffered_read_readpage(struct file *filp,
+   struct address_space *mapping,
+   struct page *page)
+{
+   struct file_ra_state *ra = &filp->f_ra;
+   int error;
+
+   /*
+* A previous I/O error may have been due to temporary
+* failures, eg. multipath errors.
+* PG_error will be set again if readpage fails.
+*/
+   ClearPageError(page);
+   /* Start the actual read. The read will unlock the page. */
+   error = mapping->a_ops->readpage(filp, page);
+
+   if (unlikely(error)) {
+   put_page(page);
+   return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL;
+   }
+
+   if (!PageUptodate(page)) {
+   error = lock_page_killable(page);
+   if (unlikely(error)) {
+   put_page(page);
+   return ERR_PTR(error);
+   }
+   if (!PageUptodate(page)) {
+   if (page->mapping == NULL) {
+   /*
+* invalidate_mapping_pages got it
+*/
+   unlock_page(page);
+   put_page(page);
+   return NULL;
+   }
+   unlock_page(page);
+   shrink_readahead_size_eio(ra);
+   put_page(page);
+   return ERR_PTR(-EIO);
+   }
+   unlock_page(page);
+   }
+
+   return page;
+}
+
+static struct page *
+generic_file_buffered_read_pagenotuptodate(struct file *filp,
+  struct iov_iter *iter,
+  struct page *page,
+  loff_t pos, loff_t count)
+{
+   struct address_space *mapping = filp->f_mapping;
+   struct inode *inode = mapping->host;
+   i

Re: [PATCH 12/12] closures: fix a race on wakeup from closure_sync

2019-07-22 Thread Kent Overstreet
On Thu, Jul 18, 2019 at 03:46:46PM +0800, Coly Li wrote:
> On 2019/7/16 6:47 下午, Coly Li wrote:
> > Hi Kent,
> > 
> > On 2019/6/11 3:14 上午, Kent Overstreet wrote:
> >> Signed-off-by: Kent Overstreet 
> > Acked-by: Coly Li 
> > 
> > And also I receive report for suspicious closure race condition in
> > bcache, and people ask for having this patch into Linux v5.3.
> > 
> > So before this patch gets merged into upstream, I plan to rebase it to
> > drivers/md/bcache/closure.c at this moment. Of cause the author is you.
> > 
> > When lib/closure.c merged into upstream, I will rebase all closure usage
> > from bcache to use lib/closure.{c,h}.
> 
> Hi Kent,
> 
> The race bug reporter replies me that the closure race bug is very rare
> to reproduce, after applying the patch and testing, they are not sure
> whether their closure race problem is fixed or not.
> 
> And I notice rcu_read_lock()/rcu_read_unlock() is used here, but it is
> not clear to me what is the functionality of the rcu read lock in
> closure_sync_fn(). I believe you have reason to use the rcu stuffs here,
> could you please provide some hints to help me to understand the change
> better ?

The race was when a thread using closure_sync() notices cl->s->done == 1 before
the thread calling closure_put() calls wake_up_process(). Then, it's possible
for that thread to return and exit just before wake_up_process() is called - so
we're trying to wake up a process that no longer exists.

rcu_read_lock() is sufficient to protect against this, as there's an rcu barrier
somewhere in the process teardown path.
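
For context, the waiter side looks roughly like this (paraphrased from the
lib/closure.c in this series): the closure_syncer lives on the waiter's stack,
so the instant it observes s.done it may return and the task may exit - which
is why closure_sync_fn() now reads s->task once and holds rcu_read_lock()
across the wakeup.

void __sched __closure_sync(struct closure *cl)
{
	struct closure_syncer s = { .task = current };

	cl->s = &s;
	continue_at(cl, closure_sync_fn, NULL);

	while (1) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		/* may race with the waker's wake_up_process() */
		if (s.done)
			break;
		schedule();
	}

	__set_current_state(TASK_RUNNING);
}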


Re: pagecache locking (was: bcachefs status update) merged)

2019-06-14 Thread Kent Overstreet
On Fri, Jun 14, 2019 at 09:55:24AM +1000, Dave Chinner wrote:
> On Thu, Jun 13, 2019 at 02:36:25PM -0400, Kent Overstreet wrote:
> > On Thu, Jun 13, 2019 at 09:02:24AM +1000, Dave Chinner wrote:
> > > On Wed, Jun 12, 2019 at 12:21:44PM -0400, Kent Overstreet wrote:
> > > > Ok, I'm totally on board with returning EDEADLOCK.
> > > > 
> > > > Question: Would we be ok with returning EDEADLOCK for any IO where the 
> > > > buffer is
> > > > in the same address space as the file being read/written to, even if 
> > > > the buffer
> > > > and the IO don't technically overlap?
> > > 
> > > I'd say that depends on the lock granularity. For a range lock,
> > > we'd be able to do the IO for non-overlapping ranges. For a normal
> > > mutex or rwsem, then we risk deadlock if the page fault triggers on
> > > the same address space host as we already have locked for IO. That's
> > > the case we currently handle with the second IO lock in XFS, ext4,
> > > btrfs, etc (XFS_MMAPLOCK_* in XFS).
> > > 
> > > One of the reasons I'm looking at range locks for XFS is to get rid
> > > of the need for this second mmap lock, as there is no reason for it
> > > existing if we can lock ranges and EDEADLOCK inside page faults and
> > > return errors.
> > 
> > My concern is that range locks are going to turn out to be both more 
> > complicated
> > and heavier weight, performance wise, than the approach I've taken of just a
> > single lock per address space.
> 
> That's the battle I'm fighting at the moment with them for direct
> IO(*), but range locks are something I'm doing for XFS and I don't
> really care if anyone else wants to use them or not.

I'm not saying I won't use them :)

I just want to do the simple thing first, and then if range locks turn out well
I don't think it'll be hard to switch bcachefs to them. Or if the simple thing
turns out to be good enough, even better :)

> (*)Direct IO on XFS is a pure shared lock workload, so the rwsem
> scales until single atomic update cache line bouncing limits
> throughput. That means I can max out my hardware at 1.6 million
> random 4k read/write IOPS (a bit over 6GB/s)(**) to a single file
> with a rwsem at 32 AIO+DIO dispatch threads. I've only got range
> locks to about 1.1M IOPS on the same workload, though it's within a
> couple of percent of a rwsem up to 16 threads...
> 
> (**) A small handful of nvme SSDs fed by AIO+DIO are /way faster/
> than pmem that is emulated with RAM, let alone real pmem which is
> much slower at random writes than RAM.

So something I should mention is that I've been 

> 
> > Reason being range locks only help when you've got multiple operations 
> > going on
> > simultaneously that don't conflict - i.e. it's really only going to be 
> > useful
> > for applications that are doing buffered IO and direct IO simultaneously to 
> > the
> > same file.
> 
> Yes, they do that, but that's not why I'm looking at this.  Range
> locks are primarily for applications that mix multiple different
> types of operations to the same file concurrently. e.g:
> 
> - fallocate and read/write() can be run concurrently if they
> don't overlap, but right now we serialise them because we have no
> visibility into what other operations require.

True true. Have you ever seen this be an issue for real applications?

> - buffered read and buffered write can run concurrently if they
> don't overlap, but right now they are serialised because that's the
> only way to provide POSIX atomic write vs read semantics (only XFS
> provides userspace with that guarantee).

We already talked about this on IRC, but it's not the _only_ way - page locks
suffice if we lock all the pages for the read/write at once, and that's actually
a really good thing to do for performance.

bcachefs doesn't currently provide any guarantees here, but since we already are
batching up the page operations for buffered writes (and I have patches to do so
for the buffered read path in filemap.c) I will tweak that slightly to provide
some sort of guarantee.

This is something I care about in bcachefs because I'm pretty sure I can make
both the buffered read and write paths work without taking any per inode locks -
so unrelated IOs to the same file won't be modifying at shared cachelines at
all. Haven't gotten around to it yet for the buffered write path, but it's on
the todo list.

> - Sub-block direct IO is serialised against all other direct IO
> because we can't tell if it overlaps with some other direct IO and
> so we have to take the slow but safe option - range locks solve that
> problem, too.

This feels like an internal filesystem implem

Re: pagecache locking (was: bcachefs status update) merged)

2019-06-13 Thread Kent Overstreet
On Thu, Jun 13, 2019 at 03:13:40PM -0600, Andreas Dilger wrote:
> There are definitely workloads that require multiple threads doing 
> non-overlapping
> writes to a single file in HPC.  This is becoming an increasingly common 
> problem
> as the number of cores on a single client increase, since there is typically 
> one
> thread per core trying to write to a shared file.  Using multiple files (one 
> per
> core) is possible, but that has file management issues for users when there 
> are a
> million cores running on the same job/file (obviously not on the same client 
> node)
> dumping data every hour.

Mixed buffered and O_DIRECT though? That profile looks like just buffered IO to
me.

> We were just looking at this exact problem last week, and most of the threads 
> are
> spinning in grab_cache_page_nowait->add_to_page_cache_lru() and 
> set_page_dirty()
> when writing at 1.9GB/s when they could be writing at 5.8GB/s (when threads 
> are
> writing O_DIRECT instead of buffered).  Flame graph is attached for 16-thread 
> case,
> but high-end systems today easily have 2-4x that many cores.

Yeah I've been spending some time on buffered IO performance too - 4k page
overhead is a killer.

bcachefs has a buffered write path that looks up multiple pages at a time and
locks them, and then copies the data to all the pages at once (I stole the idea
from btrfs). It was a very significant performance increase.

https://evilpiepirate.org/git/bcachefs.git/tree/fs/bcachefs/fs-io.c#n1498
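
The copy step of that batched write path looks, in very rough outline, like
this - a sketch only, assuming the pages have already been grabbed and locked
(e.g. via grab_cache_page_write_begin()); the helper name is illustrative, not
the actual fs-io.c code:

static size_t copy_to_locked_pages(struct iov_iter *iter, loff_t pos,
				   struct page **pages, unsigned nr_pages)
{
	size_t copied = 0;
	unsigned i;

	for (i = 0; i < nr_pages; i++) {
		unsigned pg_offset = (pos + copied) & (PAGE_SIZE - 1);
		unsigned pg_bytes = min_t(size_t, PAGE_SIZE - pg_offset,
					  iov_iter_count(iter));
		size_t pg_copied;

		if (!pg_bytes)
			break;

		pg_copied = copy_page_from_iter(pages[i], pg_offset,
						pg_bytes, iter);
		copied += pg_copied;
		if (pg_copied != pg_bytes)
			break;		/* partial copy: caller handles it */
	}

	return copied;
}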


pagecache locking (was: bcachefs status update) merged)

2019-06-13 Thread Kent Overstreet
On Thu, Jun 13, 2019 at 09:02:24AM +1000, Dave Chinner wrote:
> On Wed, Jun 12, 2019 at 12:21:44PM -0400, Kent Overstreet wrote:
> > Ok, I'm totally on board with returning EDEADLOCK.
> > 
> > Question: Would we be ok with returning EDEADLOCK for any IO where the 
> > buffer is
> > in the same address space as the file being read/written to, even if the 
> > buffer
> > and the IO don't technically overlap?
> 
> I'd say that depends on the lock granularity. For a range lock,
> we'd be able to do the IO for non-overlapping ranges. For a normal
> mutex or rwsem, then we risk deadlock if the page fault triggers on
> the same address space host as we already have locked for IO. That's
> the case we currently handle with the second IO lock in XFS, ext4,
> btrfs, etc (XFS_MMAPLOCK_* in XFS).
> 
> One of the reasons I'm looking at range locks for XFS is to get rid
> of the need for this second mmap lock, as there is no reason for it
> existing if we can lock ranges and EDEADLOCK inside page faults and
> return errors.

My concern is that range locks are going to turn out to be both more complicated
and heavier weight, performance wise, than the approach I've taken of just a
single lock per address space.

Reason being range locks only help when you've got multiple operations going on
simultaneously that don't conflict - i.e. it's really only going to be useful
for applications that are doing buffered IO and direct IO simultaneously to the
same file. Personally, I think that would be a pretty gross thing to do and I'm
not particularly interested in optimizing for that case myself... but, if you
know of applications that do depend on that I might change my opinion. If not, I
want to try and get the simpler, one-lock-per-address space approach to work.

That said though - range locks on the page cache can be viewed as just a
performance optimization over my approach, they've got the same semantics
(locking a subset of the page cache vs. the entire thing). So that's a bit of a
digression.

> > This would simplify things a lot and eliminate a really nasty corner case - 
> > page
> > faults trigger readahead. Even if the buffer and the direct IO don't 
> > overlap,
> > readahead can pull in pages that do overlap with the dio.
> 
> Page cache readahead needs to be moved under the filesystem IO
> locks. There was a recent thread about how readahead can race with
> hole punching and other fallocate() operations because page cache
> readahead bypasses the filesystem IO locks used to serialise page
> cache invalidation.
> 
> e.g. Readahead can be directed by userspace via fadvise, so we now
> have file->f_op->fadvise() so that filesystems can lock the inode
> before calling generic_fadvise() such that page cache instantiation
> and readahead dispatch can be serialised against page cache
> invalidation. I have a patch for XFS sitting around somewhere that
> implements the ->fadvise method.

I just puked a little in my mouth.

> I think there are some other patches floating around to address the
> other readahead mechanisms to only be done under filesytem IO locks,
> but I haven't had time to dig into it any further. Readahead from
> page faults most definitely needs to be under the MMAPLOCK at
> least so it serialises against fallocate()...

So I think there's two different approaches we should distinguish between. We
can either add the locking to all the top level IO paths - what you just
described - or, the locking can be pushed down to protect _only_ adding pages to
the page cache, which is the approach I've been taking.

I think both approaches are workable, but I do think that pushing the locking
down to __add_to_page_cache_locked is fundamentally the better, more correct
approach.

 - It better matches the semantics of what we're trying to do. All these
   operations we're trying to protect - dio, fallocate, truncate - they all have
   in common that they just want to shoot down a range of the page cache and
   keep it from being readded. And in general, it's better to have locks that
   protect specific data structures ("adding to this radix tree"), vs. large
   critical sections ("the io path").

   In bcachefs, at least for buffered IO I don't currently need any per-inode IO
   locks, page granularity locks suffice, so I'd like to keep that - under the
   theory that buffered IO to pages already in cache is more of a fast path than
   faulting pages in.

 - If we go with the approach of using the filesystem IO locks, we need to be
   really careful about auditing and adding assertions to make sure we've found
   and fixed all the code paths that can potentially add pages to the page
   cache. I didn't even know about the fadvise case, eesh.

 - We still need to do something about the case where we're recursively f

Re: [PATCH 10/12] bcache: move closures to lib/

2019-06-13 Thread Kent Overstreet
On Thu, Jun 13, 2019 at 12:28:41AM -0700, Christoph Hellwig wrote:
> On Mon, Jun 10, 2019 at 03:14:18PM -0400, Kent Overstreet wrote:
> > Prep work for bcachefs - being a fork of bcache it also uses closures
> 
> NAK.  This obsfucation needs to go away from bcache and not actually be
> spread further, especially not as an API with multiple users which will
> make it even harder to get rid of it.

Christoph, you've made it plenty clear how much you dislike closures in the past
but "I don't like it" is not remotely the kind of objection that is appropriate
or useful for technical discussions, and that's pretty much all you've ever
given.

If you really think that code should be gotten rid of, then maybe you should
actually _look at what they do that's not covered by other kernel
infrastructure_ and figure out something better. Otherwise... no one else seems
to care all that much about closures, and you're not actually giving any
technical feedback, so I'm not sure what you expect.


Re: bcachefs status update (it's done cooking; let's get this sucker merged)

2019-06-12 Thread Kent Overstreet
On Tue, Jun 11, 2019 at 02:33:36PM +1000, Dave Chinner wrote:
> I just recently said this with reference to the range lock stuff I'm
> working on in the background:
> 
>   FWIW, it's to avoid problems with stupid userspace stuff
>   that nobody really should be doing that I want range locks
>   for the XFS inode locks.  If userspace overlaps the ranges
>   and deadlocks in that case, they they get to keep all the
>   broken bits because, IMO, they are doing something
>   monumentally stupid. I'd probably be making it return
>   EDEADLOCK back out to userspace in the case rather than
>   deadlocking but, fundamentally, I think it's broken
>   behaviour that we should be rejecting with an error rather
>   than adding complexity trying to handle it.
> 
> So I think this recusive locking across a page fault case should
> just fail, not add yet more complexity to try to handle a rare
> corner case that exists more in theory than in reality. i.e put the
> lock context in the current task, then if the page fault requires a
> conflicting lock context to be taken, we terminate the page fault,
> back out of the IO and return EDEADLOCK out to userspace. This works
> for all types of lock contexts - only the filesystem itself needs to
> know what the lock context pointer contains

Ok, I'm totally on board with returning EDEADLOCK.

Question: Would we be ok with returning EDEADLOCK for any IO where the buffer is
in the same address space as the file being read/written to, even if the buffer
and the IO don't technically overlap?

This would simplify things a lot and eliminate a really nasty corner case - page
faults trigger readahead. Even if the buffer and the direct IO don't overlap,
readahead can pull in pages that do overlap with the dio.

And on getting EDEADLOCK we could fall back to buffered IO, so userspace would
never know...
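
Concretely, the write path could look something like this (a sketch;
fs_direct_write() and fs_buffered_write() are stand-ins for a filesystem's own
helpers, not real functions):

static ssize_t fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	ssize_t ret;

	if (iocb->ki_flags & IOCB_DIRECT) {
		ret = fs_direct_write(iocb, from);
		if (ret != -EDEADLK)
			return ret;
		/*
		 * The user buffer aliases the file being written (or a page
		 * fault would have to recurse on our IO lock): silently fall
		 * back to buffered IO.
		 */
		iocb->ki_flags &= ~IOCB_DIRECT;
	}

	return fs_buffered_write(iocb, from);
}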


Re: bcachefs status update (it's done cooking; let's get this sucker merged)

2019-06-10 Thread Kent Overstreet
On Mon, Jun 10, 2019 at 10:46:35AM -1000, Linus Torvalds wrote:
> On Mon, Jun 10, 2019 at 9:14 AM Kent Overstreet
>  wrote:
> >
> > So. Here's my bcachefs-for-review branch - this has the minimal set of 
> > patches
> > outside of fs/bcachefs/. My master branch has some performance 
> > optimizations for
> > the core buffered IO paths, but those are fairly tricky and invasive so I 
> > want
> > to hold off on those for now - this branch is intended to be more or less
> > suitable for merging as is.
> 
> Honestly, it really isn't.

Heh, I suppose that's what review is for :)

> There are obvious things wrong with it - like the fact that you've
> rebased it so that the original history is gone, yet you've not
> actually *fixed* the history, so you find things like reverts of
> commits that should simply have been removed, and fixes for things
> that should just have been fixed in the original commit the fix is
> for.

Yeah, I suppose I have dropped the ball on that lately. 
 
> But note that the cleanup should go further than just fix those kinds
> of technical issues. If you rebase, and you have fixes in your tree
> for things you rebase, just fix things as you rewrite history anyway
> (there are cases where the fix may be informative in itself and it's
> worth leaving around, but that's rare).

Yeah that has historically been my practice, I've just been moving away from
that kind of history editing as bcachefs has been getting more users. Hence the
in-between, worst of both workflows state of the current tree.

But, I can certainly go through and clean things up like that one last time and
make everything bisectable again - I'll go through and write proper commit
messages too. Unless you'd be ok with just squashing most of the history down to
one commit - which would you prefer?

> Anyway, aside from that, I only looked at the non-bcachefs parts. Some
> of those are not acceptable either, like
> 
> struct pagecache_lock add_lock
> cacheline_aligned_in_smp; /* protects adding new pages */
> 
> in 'struct address_space', which is completely bogus, since that
> forces not only a potentially huge amount of padding, it also requires
> alignment that that struct simply fundamentally does not have, and
> _will_ not have.

Oh, good point.

> You can only use cacheline_aligned_in_smp for top-level objects,
> and honestly, it's almost never a win. That lock shouldn't be so hot.
> 
> That lock is somewhat questionable in the first place, and no, we
> don't do those hacky recursive things anyway. A recursive lock is
> almost always a buggy and mis-designed one.

You're preaching to the choir there, I still feel dirty about that code and I'd
love nothing more than for someone else to come along and point out how stupid
I've been with a much better way of doing it. 

> Why does the regular page lock (at a finer granularity) not suffice?

Because the lock needs to prevent pages from being _added_ to the page cache -
to do it with a page granularity lock it'd have to be part of the radix tree, 

> And no, nobody has ever cared. The dio people just don't care about
> page cache anyway. They have their own thing going.

It's not just dio, it's even worse with the various fallocate operations. And
the xfs people care, but IIRC even they don't have locking for pages being
faulted in. This is an issue I've talked to other filesystem people quite a bit
about - especially Dave Chinner, maybe we can get him to weigh in here.

And this inconsistency does result in _real_ bugs. It goes something like this:
 - dio write shoots down the range of the page cache for the file it's writing
   to, using invalidate_inode_pages_range2
 - After the page cache shoot down, but before the write actually happens,
   another process pulls those pages back in to the page cache
 - Now the write happens: if that write was e.g. an allocating write, you're
   going to have page cache state (buffer heads) that say that page doesn't have
   anything on disk backing it, but it actually does because of the dio write.

xfs has additional locking (that the vfs does _not_ do) around both the buffered
and dio IO paths to prevent this happening because of a buffered read pulling
the pages back in, but no one has a solution for pages getting _faulted_ back in
- either because of mmap or gup().

And there are some filesystem people who do know about this race, because at
some point the dio code has been changed to shoot down the page cache _again_
after the write completes. But that doesn't eliminate the race, it just makes it
harder to trigger.

And dio writes actually aren't the worst of it, it's even worse with fallocate
FALLOC_FL_INSERT_RANGE/COLLAPSE_RANGE. Last time I looked at the ext4 fallocate
code, it looked _completely_ broken to me - the code seemed to think it was
using the

bcachefs status update (it's done cooking; let's get this sucker merged)

2019-06-10 Thread Kent Overstreet
Last status update: https://lkml.org/lkml/2018/12/2/46

Current status - I'm pretty much running out of things to polish and excuses to
keep tinkering. The core featureset is _done_ and the list of known outstanding
bugs is getting to be short and unexciting. The next big things on my todo list
are finishing erasure coding and reflink, but there's no reason for merging to
wait on those.

So. Here's my bcachefs-for-review branch - this has the minimal set of patches
outside of fs/bcachefs/. My master branch has some performance optimizations for
the core buffered IO paths, but those are fairly tricky and invasive so I want
to hold off on those for now - this branch is intended to be more or less
suitable for merging as is.

https://evilpiepirate.org/git/bcachefs.git/log/?h=bcachefs-for-review

The list of non bcachefs patches is:

closures: fix a race on wakeup from closure_sync
closures: closure_wait_event()
bcache: move closures to lib/
bcache: optimize continue_at_nobarrier()
block: Add some exports for bcachefs
Propagate gfp_t when allocating pte entries from __vmalloc
fs: factor out d_mark_tmpfile()
fs: insert_inode_locked2()
mm: export find_get_pages()
mm: pagecache add lock
locking: SIX locks (shared/intent/exclusive)
Compiler Attributes: add __flatten

Most of the patches are pretty small, of the ones that aren't:

 - SIX locks have already been discussed, and seem to be pretty uncontroversial.

 - pagecache add lock: it's kind of ugly, but necessary to rigorously prevent
   page cache inconsistencies with dio and other operations, in particular
   racing vs. page faults - honestly, it's criminal that we still don't have a
   mechanism in the kernel to address this, other filesystems are susceptible to
   these kinds of bugs too.

   My patch is intentionally ugly in the hopes that someone else will come up
   with a magical elegant solution, but in the meantime it's an "it's ugly but
   it works" sort of thing, and I suspect in real world scenarios it's going to
   beat any kind of range locking performance wise, which is the only
   alternative I've heard discussed.
   
 - Propagate gfp_t from __vmalloc() - bcachefs needs __vmalloc() to respect
   GFP_NOFS, that's all that is.

 - and, moving closures out of drivers/md/bcache to lib/. 

The rest of the tree is 62k lines of code in fs/bcachefs. So, I obviously won't
be mailing out all of that as patches, but if any code reviewers have
suggestions on what would make that go easier go ahead and speak up. The last
time I was mailing things out for review the main thing that came up was ioctls,
but the ioctl interface hasn't really changed since then. I'm pretty confident
in the on disk format stuff, which was the other thing that was mentioned.

--

This has been a monumental effort over a lot of years, and I'm _really_ happy
with how it's turned out. I'm excited to finally unleash this upon the world.


[PATCH 01/12] Compiler Attributes: add __flatten

2019-06-10 Thread Kent Overstreet
Signed-off-by: Kent Overstreet 
---
 include/linux/compiler_attributes.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/include/linux/compiler_attributes.h 
b/include/linux/compiler_attributes.h
index 6b318efd8a..48b2c6ae6f 100644
--- a/include/linux/compiler_attributes.h
+++ b/include/linux/compiler_attributes.h
@@ -253,4 +253,9 @@
  */
 #define __weak  __attribute__((__weak__))
 
+/*
+ *   gcc: 
https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-flatten-function-attribute
+ */
+#define __flatten __attribute__((flatten))
+
 #endif /* __LINUX_COMPILER_ATTRIBUTES_H */
-- 
2.20.1
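
To make the effect of the attribute concrete, a toy example (not from the
patch): marking the caller __flatten asks gcc to inline every call made inside
it, where possible.

static int add(int a, int b)
{
	return a + b;
}

/* gcc will try to inline both add() calls into sum3()'s body */
static int __flatten sum3(int a, int b, int c)
{
	return add(add(a, b), c);
}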



[PATCH 05/12] fs: insert_inode_locked2()

2019-06-10 Thread Kent Overstreet
New helper for bcachefs, so that when we race inserting an inode we can
atomically grab a ref to the inode already in the inode cache.

Signed-off-by: Kent Overstreet 
---
 fs/inode.c | 40 
 include/linux/fs.h |  1 +
 2 files changed, 41 insertions(+)

diff --git a/fs/inode.c b/fs/inode.c
index 8881dc551f..cc44f345e0 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1479,6 +1479,46 @@ int insert_inode_locked(struct inode *inode)
 }
 EXPORT_SYMBOL(insert_inode_locked);
 
+struct inode *insert_inode_locked2(struct inode *inode)
+{
+   struct super_block *sb = inode->i_sb;
+   ino_t ino = inode->i_ino;
+   struct hlist_head *head = inode_hashtable + hash(sb, ino);
+
+   while (1) {
+   struct inode *old = NULL;
+   spin_lock(&inode_hash_lock);
+   hlist_for_each_entry(old, head, i_hash) {
+   if (old->i_ino != ino)
+   continue;
+   if (old->i_sb != sb)
+   continue;
+   spin_lock(&old->i_lock);
+   if (old->i_state & (I_FREEING|I_WILL_FREE)) {
+   spin_unlock(&old->i_lock);
+   continue;
+   }
+   break;
+   }
+   if (likely(!old)) {
+   spin_lock(&inode->i_lock);
+   inode->i_state |= I_NEW | I_CREATING;
+   hlist_add_head(&inode->i_hash, head);
+   spin_unlock(&inode->i_lock);
+   spin_unlock(&inode_hash_lock);
+   return NULL;
+   }
+   __iget(old);
+   spin_unlock(&old->i_lock);
+   spin_unlock(&inode_hash_lock);
+   wait_on_inode(old);
+   if (unlikely(!inode_unhashed(old)))
+   return old;
+   iput(old);
+   }
+}
+EXPORT_SYMBOL(insert_inode_locked2);
+
 int insert_inode_locked4(struct inode *inode, unsigned long hashval,
int (*test)(struct inode *, void *), void *data)
 {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a88d994751..d5d12d6981 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3010,6 +3010,7 @@ extern struct inode *find_inode_nowait(struct super_block 
*,
   void *data);
 extern int insert_inode_locked4(struct inode *, unsigned long, int 
(*test)(struct inode *, void *), void *);
 extern int insert_inode_locked(struct inode *);
+extern struct inode *insert_inode_locked2(struct inode *);
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 extern void lockdep_annotate_inode_mutex_key(struct inode *inode);
 #else
-- 
2.20.1



[PATCH 02/12] locking: SIX locks (shared/intent/exclusive)

2019-06-10 Thread Kent Overstreet
New lock for bcachefs, like read/write locks but with a third state,
intent.

Intent locks conflict with each other, but not with read locks; taking a
write lock requires first holding an intent lock.

Signed-off-by: Kent Overstreet 
---
 include/linux/six.h | 192 +++
 kernel/Kconfig.locks|   3 +
 kernel/locking/Makefile |   1 +
 kernel/locking/six.c| 512 
 4 files changed, 708 insertions(+)
 create mode 100644 include/linux/six.h
 create mode 100644 kernel/locking/six.c

diff --git a/include/linux/six.h b/include/linux/six.h
new file mode 100644
index 00..0fb1b2f493
--- /dev/null
+++ b/include/linux/six.h
@@ -0,0 +1,192 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_SIX_H
+#define _LINUX_SIX_H
+
+/*
+ * Shared/intent/exclusive locks: sleepable read/write locks, much like rw
+ * semaphores, except with a third intermediate state, intent. Basic operations
+ * are:
+ *
+ * six_lock_read(&foo->lock);
+ * six_unlock_read(&foo->lock);
+ *
+ * six_lock_intent(&foo->lock);
+ * six_unlock_intent(&foo->lock);
+ *
+ * six_lock_write(&foo->lock);
+ * six_unlock_write(&foo->lock);
+ *
+ * Intent locks block other intent locks, but do not block read locks, and you
+ * must have an intent lock held before taking a write lock, like so:
+ *
+ * six_lock_intent(&foo->lock);
+ * six_lock_write(&foo->lock);
+ * six_unlock_write(&foo->lock);
+ * six_unlock_intent(&foo->lock);
+ *
+ * Other operations:
+ *
+ *   six_trylock_read()
+ *   six_trylock_intent()
+ *   six_trylock_write()
+ *
+ *   six_lock_downgrade(): convert from intent to read
+ *   six_lock_tryupgrade():attempt to convert from read to intent
+ *
+ * Locks also embed a sequence number, which is incremented when the lock is
+ * locked or unlocked for write. The current sequence number can be grabbed
+ * while a lock is held from lock->state.seq; then, if you drop the lock you 
can
+ * use six_relock_(read|intent_write)(lock, seq) to attempt to retake the lock
+ * iff it hasn't been locked for write in the meantime.
+ *
+ * There are also operations that take the lock type as a parameter, where the
+ * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write:
+ *
+ *   six_lock_type(lock, type)
+ *   six_unlock_type(lock, type)
+ *   six_relock(lock, type, seq)
+ *   six_trylock_type(lock, type)
+ *   six_trylock_convert(lock, from, to)
+ *
+ * A lock may be held multiple times by the same thread (for read or intent,
+ * not write). However, the six locks code does _not_ implement the actual
+ * recursive checks itself though - rather, if your code (e.g. btree iterator
+ * code) knows that the current thread already has a lock held, and for the
+ * correct type, six_lock_increment() may be used to bump up the counter for
+ * that type - the only effect is that one more call to unlock will be required
+ * before the lock is unlocked.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#define SIX_LOCK_SEPARATE_LOCKFNS
+
+union six_lock_state {
+   struct {
+   atomic64_t  counter;
+   };
+
+   struct {
+   u64 v;
+   };
+
+   struct {
+   /* for waitlist_bitnr() */
+   unsigned long   l;
+   };
+
+   struct {
+   unsignedread_lock:28;
+   unsignedintent_lock:1;
+   unsignedwaiters:3;
+   /*
+* seq works much like in seqlocks: it's incremented every time
+* we lock and unlock for write.
+*
+* If it's odd write lock is held, even unlocked.
+*
+* Thus readers can unlock, and then lock again later iff it
+* hasn't been modified in the meantime.
+*/
+   u32 seq;
+   };
+};
+
+enum six_lock_type {
+   SIX_LOCK_read,
+   SIX_LOCK_intent,
+   SIX_LOCK_write,
+};
+
+struct six_lock {
+   union six_lock_statestate;
+   unsignedintent_lock_recurse;
+   struct task_struct  *owner;
+   struct optimistic_spin_queue osq;
+
+   raw_spinlock_t  wait_lock;
+   struct list_headwait_list[2];
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+   struct lockdep_map  dep_map;
+#endif
+};
+
+static __always_inline void __six_lock_init(struct six_lock *lock,
+   const char *name,
+   struct lock_class_key *key)
+{
+   atomic64_set(&lock->state.counter, 0);
+   raw_spin_lock_init(&lock->wait_lock);
+   INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]);
+   INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+   debug_check_no_locks_freed((void *) lock, sizeof(*lock));
+   lockdep_init_map(&lock->dep_map, name, key, 0);
+#endif
+}
+
+#define six_lock_init(lock) 
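
A usage sketch based on the header comment above (illustrative only - struct
foo and foo_update() are made up):

struct foo {
	struct six_lock	lock;
	u64		val;
};

static void foo_update(struct foo *foo, u64 v)
{
	/* intent: other intent/write lockers are excluded, readers are not */
	six_lock_intent(&foo->lock);

	/* ... prepare the update while readers continue ... */

	six_lock_write(&foo->lock);	/* now readers are excluded too */
	foo->val = v;
	six_unlock_write(&foo->lock);

	six_unlock_intent(&foo->lock);
}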

[PATCH 11/12] closures: closure_wait_event()

2019-06-10 Thread Kent Overstreet
Signed-off-by: Kent Overstreet 
---
 include/linux/closure.h | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/include/linux/closure.h b/include/linux/closure.h
index 308e38028c..abacb91c35 100644
--- a/include/linux/closure.h
+++ b/include/linux/closure.h
@@ -379,4 +379,26 @@ static inline void closure_call(struct closure *cl, 
closure_fn fn,
continue_at_nobarrier(cl, fn, wq);
 }
 
+#define __closure_wait_event(waitlist, _cond)  \
+do {   \
+   struct closure cl;  \
+   \
+   closure_init_stack(&cl);\
+   \
+   while (1) { \
+   closure_wait(waitlist, &cl);\
+   if (_cond)  \
+   break;  \
+   closure_sync(&cl);  \
+   }   \
+   closure_wake_up(waitlist);  \
+   closure_sync(&cl);  \
+} while (0)
+
+#define closure_wait_event(waitlist, _cond)\
+do {   \
+   if (!(_cond))   \
+   __closure_wait_event(waitlist, _cond);  \
+} while (0)
+
 #endif /* _LINUX_CLOSURE_H */
-- 
2.20.1
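
A sketch of how this gets used (illustrative names - a counter plus a
waitlist; the waker bumps the counter and then wakes the list):

static atomic_t			nr_free;
static struct closure_waitlist	free_wait;

static void get_free_slot(void)
{
	/* sleeps on free_wait until the condition becomes true */
	closure_wait_event(&free_wait, atomic_read(&nr_free) > 0);
}

static void put_free_slot(void)
{
	atomic_inc(&nr_free);
	closure_wake_up(&free_wait);
}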



[PATCH 12/12] closures: fix a race on wakeup from closure_sync

2019-06-10 Thread Kent Overstreet
Signed-off-by: Kent Overstreet 
---
 lib/closure.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/lib/closure.c b/lib/closure.c
index 46cfe4c382..3e6366c262 100644
--- a/lib/closure.c
+++ b/lib/closure.c
@@ -104,8 +104,14 @@ struct closure_syncer {
 
 static void closure_sync_fn(struct closure *cl)
 {
-   cl->s->done = 1;
-   wake_up_process(cl->s->task);
+   struct closure_syncer *s = cl->s;
+   struct task_struct *p;
+
+   rcu_read_lock();
+   p = READ_ONCE(s->task);
+   s->done = 1;
+   wake_up_process(p);
+   rcu_read_unlock();
 }
 
 void __sched __closure_sync(struct closure *cl)
-- 
2.20.1



[PATCH 07/12] Propagate gfp_t when allocating pte entries from __vmalloc

2019-06-10 Thread Kent Overstreet
This fixes a lockdep recursion when using __vmalloc from places that
aren't GFP_KERNEL safe.

Signed-off-by: Kent Overstreet 
---
 arch/alpha/include/asm/pgalloc.h | 11 ++---
 arch/arc/include/asm/pgalloc.h   |  9 +---
 arch/arm/include/asm/pgalloc.h   | 11 +++--
 arch/arm/mm/idmap.c  |  2 +-
 arch/arm/mm/mmu.c|  5 +-
 arch/arm/mm/pgd.c|  8 +--
 arch/arm64/include/asm/pgalloc.h | 17 ---
 arch/arm64/mm/hugetlbpage.c  |  8 +--
 arch/csky/include/asm/pgalloc.h  |  4 +-
 arch/hexagon/include/asm/pgalloc.h   |  5 +-
 arch/ia64/include/asm/pgalloc.h  | 14 +++---
 arch/ia64/mm/hugetlbpage.c   |  4 +-
 arch/ia64/mm/init.c  |  6 +--
 arch/m68k/include/asm/mcf_pgalloc.h  | 12 ++---
 arch/m68k/include/asm/motorola_pgalloc.h |  7 +--
 arch/m68k/include/asm/sun3_pgalloc.h | 12 ++---
 arch/m68k/mm/kmap.c  |  5 +-
 arch/m68k/sun3x/dvma.c   |  6 ++-
 arch/microblaze/include/asm/pgalloc.h|  6 +--
 arch/microblaze/mm/pgtable.c |  6 +--
 arch/mips/include/asm/pgalloc.h  | 14 +++---
 arch/mips/mm/hugetlbpage.c   |  4 +-
 arch/mips/mm/ioremap.c   |  6 +--
 arch/nds32/include/asm/pgalloc.h | 14 ++
 arch/nds32/kernel/dma.c  |  4 +-
 arch/nios2/include/asm/pgalloc.h |  8 +--
 arch/nios2/mm/ioremap.c  |  6 +--
 arch/openrisc/include/asm/pgalloc.h  |  2 +-
 arch/openrisc/mm/ioremap.c   |  4 +-
 arch/parisc/include/asm/pgalloc.h| 16 +++---
 arch/parisc/kernel/pci-dma.c |  6 +--
 arch/parisc/mm/hugetlbpage.c |  4 +-
 arch/powerpc/include/asm/book3s/32/pgalloc.h |  4 +-
 arch/powerpc/include/asm/book3s/64/pgalloc.h | 20 
 arch/powerpc/include/asm/nohash/32/pgalloc.h |  6 +--
 arch/powerpc/include/asm/nohash/64/pgalloc.h | 14 +++---
 arch/powerpc/kvm/book3s_64_mmu_radix.c   |  2 +-
 arch/powerpc/mm/hugetlbpage.c|  8 +--
 arch/powerpc/mm/pgtable-book3e.c |  6 +--
 arch/powerpc/mm/pgtable-book3s64.c   | 14 +++---
 arch/powerpc/mm/pgtable-hash64.c |  6 +--
 arch/powerpc/mm/pgtable-radix.c  | 12 ++---
 arch/powerpc/mm/pgtable_32.c |  6 +--
 arch/riscv/include/asm/pgalloc.h | 11 ++---
 arch/s390/include/asm/pgalloc.h  | 25 +-
 arch/s390/mm/hugetlbpage.c   |  6 +--
 arch/s390/mm/pgalloc.c   | 10 ++--
 arch/s390/mm/pgtable.c   |  6 +--
 arch/s390/mm/vmem.c  |  2 +-
 arch/sh/include/asm/pgalloc.h|  7 +--
 arch/sh/mm/hugetlbpage.c |  4 +-
 arch/sh/mm/init.c|  4 +-
 arch/sh/mm/pgtable.c |  8 ++-
 arch/sparc/include/asm/pgalloc_32.h  |  6 +--
 arch/sparc/include/asm/pgalloc_64.h  | 12 +++--
 arch/sparc/mm/hugetlbpage.c  |  4 +-
 arch/sparc/mm/init_64.c  | 10 +---
 arch/sparc/mm/srmmu.c|  2 +-
 arch/um/include/asm/pgalloc.h|  2 +-
 arch/um/include/asm/pgtable-3level.h |  3 +-
 arch/um/kernel/mem.c | 17 ++-
 arch/um/kernel/skas/mmu.c|  4 +-
 arch/unicore32/include/asm/pgalloc.h |  8 ++-
 arch/unicore32/mm/pgd.c  |  2 +-
 arch/x86/include/asm/pgalloc.h   | 30 ++--
 arch/x86/kernel/espfix_64.c  |  2 +-
 arch/x86/kernel/tboot.c  |  6 +--
 arch/x86/mm/pgtable.c|  4 +-
 arch/x86/platform/efi/efi_64.c   |  9 ++--
 arch/xtensa/include/asm/pgalloc.h|  4 +-
 drivers/staging/media/ipu3/ipu3-dmamap.c |  2 +-
 include/asm-generic/4level-fixup.h   |  6 +--
 include/asm-generic/5level-fixup.h   |  6 +--
 include/asm-generic/pgtable-nop4d-hack.h |  2 +-
 include/asm-generic/pgtable-nop4d.h  |  2 +-
 include/asm-generic/pgtable-nopmd.h  |  2 +-
 include/asm-generic/pgtable-nopud.h  |  2 +-
 include/linux/mm.h   | 40 ---
 include/linux/vmalloc.h  |  2 +-
 lib/ioremap.c|  8 +--
 mm/hugetlb.c | 11 +++--
 mm/kasan/init.c  |  8 +--
 mm/memory.c  | 51 +++-
 mm/migrate.c |  6 +--
 mm/mremap.c  |  6 +--
 mm/userfaultfd.c |  6 +--
 mm/vmalloc.c | 49
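
What this enables, roughly (a sketch - note that __vmalloc() still took a
gfp/pgprot pair at this point; the function name is illustrative, not actual
bcachefs code):

/* called with btree node locks held: reclaim must not recurse into the fs */
static void *btree_bounce_buf_alloc(size_t size)
{
	/*
	 * With this series, GFP_NOFS is also honoured by the page table
	 * allocations __vmalloc() does internally, not just by the page
	 * allocations themselves.
	 */
	return __vmalloc(size, GFP_NOFS, PAGE_KERNEL);
}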

[PATCH 09/12] bcache: optimize continue_at_nobarrier()

2019-06-10 Thread Kent Overstreet
Signed-off-by: Kent Overstreet 
---
 drivers/md/bcache/closure.h | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index c88cdc4ae4..376c5e659c 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -245,7 +245,7 @@ static inline void closure_queue(struct closure *cl)
 != offsetof(struct work_struct, func));
if (wq) {
INIT_WORK(&cl->work, cl->work.func);
-   BUG_ON(!queue_work(wq, &cl->work));
+   queue_work(wq, &cl->work);
} else
cl->fn(cl);
 }
@@ -340,8 +340,13 @@ do {   
\
  */
 #define continue_at_nobarrier(_cl, _fn, _wq)   \
 do {   \
-   set_closure_fn(_cl, _fn, _wq);  \
-   closure_queue(_cl); \
+   closure_set_ip(_cl);\
+   if (_wq) {  \
+   INIT_WORK(&(_cl)->work, (void *) _fn);  \
+   queue_work((_wq), &(_cl)->work);\
+   } else {\
+   (_fn)(_cl); \
+   }   \
 } while (0)
 
 /**
-- 
2.20.1



[PATCH 06/12] fs: factor out d_mark_tmpfile()

2019-06-10 Thread Kent Overstreet
New helper for bcachefs - bcachefs doesn't want the
inode_dec_link_count() call that d_tmpfile does; it handles i_nlink on
its own, atomically with other btree updates.

Signed-off-by: Kent Overstreet 
---
 fs/dcache.c| 10 --
 include/linux/dcache.h |  1 +
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index aac41adf47..18edb4e5bc 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3042,9 +3042,8 @@ void d_genocide(struct dentry *parent)
 
 EXPORT_SYMBOL(d_genocide);
 
-void d_tmpfile(struct dentry *dentry, struct inode *inode)
+void d_mark_tmpfile(struct dentry *dentry, struct inode *inode)
 {
-   inode_dec_link_count(inode);
BUG_ON(dentry->d_name.name != dentry->d_iname ||
!hlist_unhashed(&dentry->d_u.d_alias) ||
!d_unlinked(dentry));
@@ -3054,6 +3053,13 @@ void d_tmpfile(struct dentry *dentry, struct inode 
*inode)
(unsigned long long)inode->i_ino);
spin_unlock(&dentry->d_lock);
spin_unlock(&dentry->d_parent->d_lock);
+}
+EXPORT_SYMBOL(d_mark_tmpfile);
+
+void d_tmpfile(struct dentry *dentry, struct inode *inode)
+{
+   inode_dec_link_count(inode);
+   d_mark_tmpfile(dentry, inode);
d_instantiate(dentry, inode);
 }
 EXPORT_SYMBOL(d_tmpfile);
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 60996e64c5..e0fe330162 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -255,6 +255,7 @@ extern struct dentry * d_make_root(struct inode *);
 /* - the ramfs-type tree */
 extern void d_genocide(struct dentry *);
 
+extern void d_mark_tmpfile(struct dentry *, struct inode *);
 extern void d_tmpfile(struct dentry *, struct inode *);
 
 extern struct dentry *d_find_alias(struct inode *);
-- 
2.20.1
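
The intended caller then looks roughly like this (a sketch, not the actual
bcachefs code; fs_create_unlinked_inode() is a stand-in):

static int fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	/* created with i_nlink already 0, atomically with the fs's own update */
	struct inode *inode = fs_create_unlinked_inode(dir, mode);

	if (IS_ERR(inode))
		return PTR_ERR(inode);

	/* skip d_tmpfile()'s inode_dec_link_count(); just mark + instantiate */
	d_mark_tmpfile(dentry, inode);
	d_instantiate(dentry, inode);
	return 0;
}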



[PATCH 03/12] mm: pagecache add lock

2019-06-10 Thread Kent Overstreet
Add a per address space lock around adding pages to the pagecache - making it
possible for fallocate INSERT_RANGE/COLLAPSE_RANGE to work correctly, and also
hopefully making truncate and dio a bit saner.

Signed-off-by: Kent Overstreet 
---
 fs/inode.c|  1 +
 include/linux/fs.h| 24 +
 include/linux/sched.h |  4 +++
 init/init_task.c  |  1 +
 mm/filemap.c  | 81 +--
 5 files changed, 108 insertions(+), 3 deletions(-)

diff --git a/fs/inode.c b/fs/inode.c
index 9a453f3637..8881dc551f 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -350,6 +350,7 @@ EXPORT_SYMBOL(inc_nlink);
 static void __address_space_init_once(struct address_space *mapping)
 {
xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ);
+   pagecache_lock_init(&mapping->add_lock);
init_rwsem(&mapping->i_mmap_rwsem);
INIT_LIST_HEAD(&mapping->private_list);
spin_lock_init(&mapping->private_lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dd28e76790..a88d994751 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -418,6 +418,28 @@ int pagecache_write_end(struct file *, struct 
address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata);
 
+/*
+ * Two-state lock - can be taken for add or block - both states are shared,
+ * like read side of rwsem, but conflict with other state:
+ */
+struct pagecache_lock {
+   atomic_long_t   v;
+   wait_queue_head_t   wait;
+};
+
+static inline void pagecache_lock_init(struct pagecache_lock *lock)
+{
+   atomic_long_set(&lock->v, 0);
+   init_waitqueue_head(&lock->wait);
+}
+
+void pagecache_add_put(struct pagecache_lock *);
+void pagecache_add_get(struct pagecache_lock *);
+void __pagecache_block_put(struct pagecache_lock *);
+void __pagecache_block_get(struct pagecache_lock *);
+void pagecache_block_put(struct pagecache_lock *);
+void pagecache_block_get(struct pagecache_lock *);
+
 /**
  * struct address_space - Contents of a cacheable, mappable object.
  * @host: Owner, either the inode or the block_device.
@@ -452,6 +474,8 @@ struct address_space {
spinlock_t  private_lock;
struct list_headprivate_list;
void*private_data;
+   struct pagecache_lock   add_lock
+   cacheline_aligned_in_smp;   /* protects adding new pages */
} __attribute__((aligned(sizeof(long)))) __randomize_layout;
/*
 * On most architectures that alignment is already the case; but
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1549584a15..a46baade99 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -43,6 +43,7 @@ struct io_context;
 struct mempolicy;
 struct nameidata;
 struct nsproxy;
+struct pagecache_lock;
 struct perf_event_context;
 struct pid_namespace;
 struct pipe_inode_info;
@@ -935,6 +936,9 @@ struct task_struct {
unsigned intin_ubsan;
 #endif
 
+   /* currently held lock, for avoiding recursing in fault path: */
+   struct pagecache_lock *pagecache_lock;
+
/* Journalling filesystem info: */
void*journal_info;
 
diff --git a/init/init_task.c b/init/init_task.c
index c70ef656d0..92bbb6e909 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -115,6 +115,7 @@ struct task_struct init_task
},
.blocked= {{0}},
.alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock),
+   .pagecache_lock = NULL,
.journal_info   = NULL,
INIT_CPU_TIMERS(init_task)
.pi_lock= __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock),
diff --git a/mm/filemap.c b/mm/filemap.c
index d78f577bae..93d7e0e686 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -113,6 +113,73 @@
  *   ->tasklist_lock(memory_failure, collect_procs_ao)
  */
 
+static void __pagecache_lock_put(struct pagecache_lock *lock, long i)
+{
+   BUG_ON(atomic_long_read(&lock->v) == 0);
+
+   if (atomic_long_sub_return_release(i, &lock->v) == 0)
+   wake_up_all(&lock->wait);
+}
+
+static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i)
+{
+   long v = atomic_long_read(&lock->v), old;
+
+   do {
+   old = v;
+
+   if (i > 0 ? v < 0 : v > 0)
+   return false;
+   } while ((v = atomic_long_cmpxchg_acquire(&lock->v,
+   old, old + i)) != old);
+   return true;
+}
+
+static void __pagecache_lock_get(struct pagecache_lock *lock, long i)
+{
+   wait_event(lock->wait, __pagecache_lock_tryget(lock, i));
+}
+
+void pagecache_add_put(struct pagecache_lock *lock)
+{
+   __pagecache_lock_put(lock, 1);
+}
+EXPORT_SYMBOL(pagecache_add_put);
+
+void pagecache_add_get(struct pagecache_lock *lock)
+{
+   __pagecache_lock_get(lock, 1);
+}
+EXPORT_SYMBOL(pagec
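
For a sense of how the two states are meant to be used (a sketch;
fs_collapse_extents() is a stand-in): page faults and buffered reads take the
shared "add" side around add_to_page_cache_lru(), while dio and
fallocate-style operations take the "block" side so nothing can be re-added to
the range while they run.

static int fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
{
	struct address_space *mapping = inode->i_mapping;
	int ret;

	pagecache_block_get(&mapping->add_lock);

	ret = invalidate_inode_pages2_range(mapping, offset >> PAGE_SHIFT,
					    (offset + len - 1) >> PAGE_SHIFT);
	if (!ret)
		ret = fs_collapse_extents(inode, offset, len);

	pagecache_block_put(&mapping->add_lock);
	return ret;
}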

[PATCH 10/12] bcache: move closures to lib/

2019-06-10 Thread Kent Overstreet
Prep work for bcachefs - being a fork of bcache it also uses closures

Signed-off-by: Kent Overstreet 
---
 drivers/md/bcache/Kconfig | 10 +--
 drivers/md/bcache/Makefile|  6 ++--
 drivers/md/bcache/bcache.h|  2 +-
 drivers/md/bcache/super.c |  1 -
 drivers/md/bcache/util.h  |  3 +-
 .../md/bcache => include/linux}/closure.h | 17 ++-
 lib/Kconfig   |  3 ++
 lib/Kconfig.debug |  9 ++
 lib/Makefile  |  2 ++
 {drivers/md/bcache => lib}/closure.c  | 28 ++-
 10 files changed, 37 insertions(+), 44 deletions(-)
 rename {drivers/md/bcache => include/linux}/closure.h (97%)
 rename {drivers/md/bcache => lib}/closure.c (89%)

diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index f6e0a8b3a6..3dd1d48987 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -2,6 +2,7 @@
 config BCACHE
tristate "Block device as cache"
select CRC64
+   select CLOSURES
help
Allows a block device to be used as cache for other devices; uses
a btree for indexing and the layout is optimized for SSDs.
@@ -16,12 +17,3 @@ config BCACHE_DEBUG
 
Enables extra debugging tools, allows expensive runtime checks to be
turned on.
-
-config BCACHE_CLOSURES_DEBUG
-   bool "Debug closures"
-   depends on BCACHE
-   select DEBUG_FS
-   help
-   Keeps all active closures in a linked list and provides a debugfs
-   interface to list them, which makes it possible to see asynchronous
-   operations that get stuck.
diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
index d26b351958..2b790fb813 100644
--- a/drivers/md/bcache/Makefile
+++ b/drivers/md/bcache/Makefile
@@ -2,8 +2,8 @@
 
 obj-$(CONFIG_BCACHE)   += bcache.o
 
-bcache-y   := alloc.o bset.o btree.o closure.o debug.o extents.o\
-   io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
-   util.o writeback.o
+bcache-y   := alloc.o bset.o btree.o debug.o extents.o io.o\
+   journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o util.o\
+   writeback.o
 
 CFLAGS_request.o   += -Iblock
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index fdf75352e1..ced9f1526c 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -180,6 +180,7 @@
 
 #include 
 #include 
+#include <linux/closure.h>
 #include 
 #include 
 #include 
@@ -192,7 +193,6 @@
 
 #include "bset.h"
 #include "util.h"
-#include "closure.h"
 
 struct bucket {
atomic_tpin;
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index a697a3a923..da6803f280 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -2487,7 +2487,6 @@ static int __init bcache_init(void)
goto err;
 
bch_debug_init();
-   closure_debug_init();
 
return 0;
 err:
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 00aab6abcf..8a75100c0b 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -4,6 +4,7 @@
 #define _BCACHE_UTIL_H
 
 #include 
+#include <linux/closure.h>
 #include 
 #include 
 #include 
@@ -13,8 +14,6 @@
 #include 
 #include 
 
-#include "closure.h"
-
 #define PAGE_SECTORS   (PAGE_SIZE / 512)
 
 struct closure;
diff --git a/drivers/md/bcache/closure.h b/include/linux/closure.h
similarity index 97%
rename from drivers/md/bcache/closure.h
rename to include/linux/closure.h
index 376c5e659c..308e38028c 100644
--- a/drivers/md/bcache/closure.h
+++ b/include/linux/closure.h
@@ -155,7 +155,7 @@ struct closure {
 
atomic_tremaining;
 
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+#ifdef CONFIG_DEBUG_CLOSURES
 #define CLOSURE_MAGIC_DEAD 0xc054dead
 #define CLOSURE_MAGIC_ALIVE0xc054a11e
 
@@ -184,15 +184,13 @@ static inline void closure_sync(struct closure *cl)
__closure_sync(cl);
 }
 
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+#ifdef CONFIG_DEBUG_CLOSURES
 
-void closure_debug_init(void);
 void closure_debug_create(struct closure *cl);
 void closure_debug_destroy(struct closure *cl);
 
 #else
 
-static inline void closure_debug_init(void) {}
 static inline void closure_debug_create(struct closure *cl) {}
 static inline void closure_debug_destroy(struct closure *cl) {}
 
@@ -200,21 +198,21 @@ static inline void closure_debug_destroy(struct closure 
*cl) {}
 
 static inline void closure_set_ip(struct closure *cl)
 {
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+#ifdef CONFIG_DEBUG_CLOSURES
cl->ip = _THIS_IP_;
 #endif
 }
 
 static inline void closure_set_ret_ip(struct closure *cl)
 {
-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+#ifdef CONFIG_DEBUG_CLOSURES
cl->ip = _RET_IP_;
 #endif
 }
 
 static i

[PATCH 04/12] mm: export find_get_pages()

2019-06-10 Thread Kent Overstreet
Needed for bcachefs

Signed-off-by: Kent Overstreet 
---
 mm/filemap.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/filemap.c b/mm/filemap.c
index 93d7e0e686..617168474e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1899,6 +1899,7 @@ unsigned find_get_pages_range(struct address_space 
*mapping, pgoff_t *start,
 
return ret;
 }
+EXPORT_SYMBOL(find_get_pages_range);
 
 /**
  * find_get_pages_contig - gang contiguous pagecache lookup
-- 
2.20.1



[PATCH 08/12] block: Add some exports for bcachefs

2019-06-10 Thread Kent Overstreet
Signed-off-by: Kent Overstreet 
---
 block/bio.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/bio.c b/block/bio.c
index 716510ecd7..a67aa6e0de 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -958,6 +958,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter 
*iter)
 
return 0;
 }
+EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
 
 static void submit_bio_wait_endio(struct bio *bio)
 {
@@ -1658,6 +1659,7 @@ void bio_set_pages_dirty(struct bio *bio)
set_page_dirty_lock(bvec->bv_page);
}
 }
+EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
 
 static void bio_release_pages(struct bio *bio)
 {
@@ -1731,6 +1733,7 @@ void bio_check_pages_dirty(struct bio *bio)
spin_unlock_irqrestore(_dirty_lock, flags);
schedule_work(_dirty_work);
 }
+EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
 
 void update_io_ticks(struct hd_struct *part, unsigned long now)
 {
-- 
2.20.1



Re: general protection fault in ebitmap_destroy (2)

2019-03-21 Thread Kent Overstreet
#syz test: http://evilpiepirate.org/git/bcachefs.git syzbot-fix


Re: [PATCH -next] bcache: Fix potential NULL pointer dereference

2019-01-30 Thread Kent Overstreet
On Wed, Jan 30, 2019 at 06:21:12PM +0800, YueHaibing wrote:
> There is a potential NULL pointer dereference in case
> kzalloc() fails and returns NULL.
> 
> Fixes: bc082a55d25c ("bcache: fix inaccurate io state for detached bcache 
> devices")
> Signed-off-by: YueHaibing 
> ---
>  drivers/md/bcache/request.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
> index 1507041..a50afa4 100644
> --- a/drivers/md/bcache/request.c
> +++ b/drivers/md/bcache/request.c
> @@ -1094,6 +1094,8 @@ static void detached_dev_do_request(struct 
> bcache_device *d, struct bio *bio)
>* which would call closure_get(>disk.cl)
>*/
>   ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO);
> + if (!ddip)
> + return;
>   ddip->d = d;
>   ddip->start_time = jiffies;
>   ddip->bi_end_io = bio->bi_end_io;

This should be using a mempool/bioset... just returning from a make_request
function is not correct, that's a serious bug - you're just dropping an IO on
the floor, which is going to cause whatever submitted that IO to hang.
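
For illustration, a rough sketch of the mempool approach (not the actual fix;
the ddip_pool field is made up here):

	/* at device init - the pool guarantees a minimum number of objects: */
	d->ddip_pool = mempool_create_kmalloc_pool(16,
				sizeof(struct detached_dev_io_private));
	if (!d->ddip_pool)
		return -ENOMEM;

	/* in detached_dev_do_request() - mempool_alloc() with GFP_NOIO waits
	 * for a free object instead of returning NULL, so the bio is never
	 * silently dropped: */
	ddip = mempool_alloc(d->ddip_pool, GFP_NOIO);
	ddip->d = d;
	ddip->start_time = jiffies;
	ddip->bi_end_io = bio->bi_end_io;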


Re: [PATCH 6/7] sctp: Convert to genradix

2018-12-17 Thread Kent Overstreet
On Mon, Dec 17, 2018 at 12:50:01PM -0800, Andrew Morton wrote:
> On Mon, 17 Dec 2018 08:19:28 -0500 Kent Overstreet 
>  wrote:
> 
> > @@ -535,9 +470,6 @@ int sctp_send_add_streams(struct sctp_association *asoc,
> > goto out;
> > }
> >  
> > -   stream->incnt = incnt;
> > -   stream->outcnt = outcnt;
> > -
> > asoc->strreset_outstanding = !!out + !!in;
> >  
> 
> I'm seeing a reject here for some reason.  Using todays's linux-next,
> but there are no changes against net/sctp/stream.c in -next.  The
> assignment to stream->incnt has disappeared.  I did this:
> 
> @@ -535,8 +470,6 @@ int sctp_send_add_streams(struct sctp_as
>   goto out;
>   }
>  
> - stream->outcnt = outcnt;
> -
>   asoc->strreset_outstanding = !!out + !!in;
>  
>  out:
> 
> 
> We're at 4.20-rc7 and this series is rather large.  I'll merge them all
> to see what happens, but I don't think it's 4.21-rc1 material?

Yeah, agreed. Thanks!


[PATCH 1/7] openvswitch: convert to kvmalloc

2018-12-17 Thread Kent Overstreet
There was no real need for this code to be using flexarrays, it's just
implementing a hash table - ideally it would be using rhashtables, but
that conversion would be significantly more complicated.

Signed-off-by: Kent Overstreet 
Cc: Pravin B Shelar 
Cc: d...@openvswitch.org
Reviewed-by: Matthew Wilcox 
---
 net/openvswitch/flow.h |  1 -
 net/openvswitch/flow_netlink.h |  1 -
 net/openvswitch/flow_table.c   | 51 --
 net/openvswitch/flow_table.h   |  3 +-
 4 files changed, 13 insertions(+), 43 deletions(-)

diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index c670dd24b8..4f06278166 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -30,7 +30,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 6657606b2b..66f9553758 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -30,7 +30,6 @@
 #include 
 #include 
 #include 
-#include 
 
 #include 
 #include 
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index 80ea2a7185..cfb0098c9a 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -111,29 +111,6 @@ int ovs_flow_tbl_count(const struct flow_table *table)
return table->count;
 }
 
-static struct flex_array *alloc_buckets(unsigned int n_buckets)
-{
-   struct flex_array *buckets;
-   int i, err;
-
-   buckets = flex_array_alloc(sizeof(struct hlist_head),
-  n_buckets, GFP_KERNEL);
-   if (!buckets)
-   return NULL;
-
-   err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL);
-   if (err) {
-   flex_array_free(buckets);
-   return NULL;
-   }
-
-   for (i = 0; i < n_buckets; i++)
-   INIT_HLIST_HEAD((struct hlist_head *)
-   flex_array_get(buckets, i));
-
-   return buckets;
-}
-
 static void flow_free(struct sw_flow *flow)
 {
int cpu;
@@ -168,31 +145,30 @@ void ovs_flow_free(struct sw_flow *flow, bool deferred)
flow_free(flow);
 }
 
-static void free_buckets(struct flex_array *buckets)
-{
-   flex_array_free(buckets);
-}
-
-
 static void __table_instance_destroy(struct table_instance *ti)
 {
-   free_buckets(ti->buckets);
+   kvfree(ti->buckets);
kfree(ti);
 }
 
 static struct table_instance *table_instance_alloc(int new_size)
 {
struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
+   int i;
 
if (!ti)
return NULL;
 
-   ti->buckets = alloc_buckets(new_size);
-
+   ti->buckets = kvmalloc_array(new_size, sizeof(struct hlist_head),
+GFP_KERNEL);
if (!ti->buckets) {
kfree(ti);
return NULL;
}
+
+   for (i = 0; i < new_size; i++)
+   INIT_HLIST_HEAD(>buckets[i]);
+
ti->n_buckets = new_size;
ti->node_ver = 0;
ti->keep_flows = false;
@@ -249,7 +225,7 @@ static void table_instance_destroy(struct table_instance 
*ti,
 
for (i = 0; i < ti->n_buckets; i++) {
struct sw_flow *flow;
-   struct hlist_head *head = flex_array_get(ti->buckets, i);
+   struct hlist_head *head = >buckets[i];
struct hlist_node *n;
int ver = ti->node_ver;
int ufid_ver = ufid_ti->node_ver;
@@ -294,7 +270,7 @@ struct sw_flow *ovs_flow_tbl_dump_next(struct 
table_instance *ti,
ver = ti->node_ver;
while (*bucket < ti->n_buckets) {
i = 0;
-   head = flex_array_get(ti->buckets, *bucket);
+   head = >buckets[*bucket];
hlist_for_each_entry_rcu(flow, head, flow_table.node[ver]) {
if (i < *last) {
i++;
@@ -313,8 +289,7 @@ struct sw_flow *ovs_flow_tbl_dump_next(struct 
table_instance *ti,
 static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash)
 {
hash = jhash_1word(hash, ti->hash_seed);
-   return flex_array_get(ti->buckets,
-   (hash & (ti->n_buckets - 1)));
+   return >buckets[hash & (ti->n_buckets - 1)];
 }
 
 static void table_instance_insert(struct table_instance *ti,
@@ -347,9 +322,7 @@ static void flow_table_copy_flows(struct table_instance 
*old,
/* Insert in new table. */
for (i = 0; i < old->n_buckets; i++) {
struct sw_flow *flow;
-   struct hlist_head *head;
-
-   head = flex_array_get(old->buckets, i);
+   struct hlist_head *head = >buckets[i];
 
if (ufid)
hlist_for_each_entry(flow, head,
diff --git a/net/openvs

[PATCH 2/7] md: convert to kvmalloc

2018-12-17 Thread Kent Overstreet
The code really just wants a big flat buffer, so just do that.

Signed-off-by: Kent Overstreet 
Cc: Shaohua Li 
Cc: linux-r...@vger.kernel.org
Reviewed-by: Matthew Wilcox 
---
 drivers/md/raid5-ppl.c |  6 +--
 drivers/md/raid5.c | 87 +++---
 drivers/md/raid5.h |  9 +++--
 3 files changed, 46 insertions(+), 56 deletions(-)

diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 3a7c363265..0b096ddc9c 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -16,7 +16,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include "md.h"
@@ -165,7 +164,7 @@ ops_run_partial_parity(struct stripe_head *sh, struct 
raid5_percpu *percpu,
   struct dma_async_tx_descriptor *tx)
 {
int disks = sh->disks;
-   struct page **srcs = flex_array_get(percpu->scribble, 0);
+   struct page **srcs = percpu->scribble;
int count = 0, pd_idx = sh->pd_idx, i;
struct async_submit_ctl submit;
 
@@ -196,8 +195,7 @@ ops_run_partial_parity(struct stripe_head *sh, struct 
raid5_percpu *percpu,
}
 
init_async_submit(, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx,
- NULL, sh, flex_array_get(percpu->scribble, 0)
- + sizeof(struct page *) * (sh->disks + 2));
+ NULL, sh, (void *) (srcs + sh->disks + 2));
 
if (count == 1)
tx = async_memcpy(sh->ppl_page, srcs[0], 0, 0, PAGE_SIZE,
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4990f0319f..c92e26fbcd 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -54,7 +54,6 @@
 #include 
 #include 
 #include 
-#include 
 
 #include 
 #include 
@@ -1394,22 +1393,16 @@ static void ops_complete_compute(void *stripe_head_ref)
 }
 
 /* return a pointer to the address conversion region of the scribble buffer */
-static addr_conv_t *to_addr_conv(struct stripe_head *sh,
-struct raid5_percpu *percpu, int i)
+static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
 {
-   void *addr;
-
-   addr = flex_array_get(percpu->scribble, i);
-   return addr + sizeof(struct page *) * (sh->disks + 2);
+   return percpu->scribble + i * percpu->scribble_obj_size;
 }
 
 /* return a pointer to the address conversion region of the scribble buffer */
-static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
+static addr_conv_t *to_addr_conv(struct stripe_head *sh,
+struct raid5_percpu *percpu, int i)
 {
-   void *addr;
-
-   addr = flex_array_get(percpu->scribble, i);
-   return addr;
+   return (void *) (to_addr_page(percpu, i) + sh->disks + 2);
 }
 
 static struct dma_async_tx_descriptor *
@@ -2238,21 +2231,23 @@ static int grow_stripes(struct r5conf *conf, int num)
  * calculate over all devices (not just the data blocks), using zeros in place
  * of the P and Q blocks.
  */
-static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
+static int scribble_alloc(struct raid5_percpu *percpu,
+ int num, int cnt, gfp_t flags)
 {
-   struct flex_array *ret;
-   size_t len;
+   size_t obj_size =
+   sizeof(struct page *) * (num+2) +
+   sizeof(addr_conv_t) * (num+2);
+   void *scribble;
 
-   len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
-   ret = flex_array_alloc(len, cnt, flags);
-   if (!ret)
-   return NULL;
-   /* always prealloc all elements, so no locking is required */
-   if (flex_array_prealloc(ret, 0, cnt, flags)) {
-   flex_array_free(ret);
-   return NULL;
-   }
-   return ret;
+   scribble = kvmalloc_array(cnt, obj_size, flags);
+   if (!scribble)
+   return -ENOMEM;
+
+   kvfree(percpu->scribble);
+
+   percpu->scribble = scribble;
+   percpu->scribble_obj_size = obj_size;
+   return 0;
 }
 
 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
@@ -2270,23 +2265,18 @@ static int resize_chunks(struct r5conf *conf, int 
new_disks, int new_sectors)
return 0;
mddev_suspend(conf->mddev);
get_online_cpus();
+
for_each_present_cpu(cpu) {
struct raid5_percpu *percpu;
-   struct flex_array *scribble;
 
percpu = per_cpu_ptr(conf->percpu, cpu);
-   scribble = scribble_alloc(new_disks,
- new_sectors / STRIPE_SECTORS,
- GFP_NOIO);
-
-   if (scribble) {
-   flex_array_free(percpu->scribble);
-   percpu->scribble = scribble;
-   } else {
-   err = -ENOMEM;
+   err = scribble_alloc(percpu, new

[PATCH 7/7] Drop flex_arrays

2018-12-17 Thread Kent Overstreet
All existing users have been converted to generic radix trees

Signed-off-by: Kent Overstreet 
Acked-by: Dave Hansen 
---
 Documentation/core-api/flexible-arrays.rst | 130 ---
 Documentation/flexible-arrays.txt  | 123 ---
 include/linux/flex_array.h | 149 
 include/linux/poison.h |   3 -
 lib/Makefile   |   2 +-
 lib/flex_array.c   | 398 -
 tools/include/linux/poison.h   |   3 -
 7 files changed, 1 insertion(+), 807 deletions(-)
 delete mode 100644 Documentation/core-api/flexible-arrays.rst
 delete mode 100644 Documentation/flexible-arrays.txt
 delete mode 100644 include/linux/flex_array.h
 delete mode 100644 lib/flex_array.c

diff --git a/Documentation/core-api/flexible-arrays.rst 
b/Documentation/core-api/flexible-arrays.rst
deleted file mode 100644
index b6b85a1b51..00
--- a/Documentation/core-api/flexible-arrays.rst
+++ /dev/null
@@ -1,130 +0,0 @@
-
-===
-Using flexible arrays in the kernel
-===
-
-Large contiguous memory allocations can be unreliable in the Linux kernel.
-Kernel programmers will sometimes respond to this problem by allocating
-pages with :c:func:`vmalloc()`.  This solution not ideal, though.  On 32-bit
-systems, memory from vmalloc() must be mapped into a relatively small address
-space; it's easy to run out.  On SMP systems, the page table changes required
-by vmalloc() allocations can require expensive cross-processor interrupts on
-all CPUs.  And, on all systems, use of space in the vmalloc() range increases
-pressure on the translation lookaside buffer (TLB), reducing the performance
-of the system.
-
-In many cases, the need for memory from vmalloc() can be eliminated by piecing
-together an array from smaller parts; the flexible array library exists to make
-this task easier.
-
-A flexible array holds an arbitrary (within limits) number of fixed-sized
-objects, accessed via an integer index.  Sparse arrays are handled
-reasonably well.  Only single-page allocations are made, so memory
-allocation failures should be relatively rare.  The down sides are that the
-arrays cannot be indexed directly, individual object size cannot exceed the
-system page size, and putting data into a flexible array requires a copy
-operation.  It's also worth noting that flexible arrays do no internal
-locking at all; if concurrent access to an array is possible, then the
-caller must arrange for appropriate mutual exclusion.
-
-The creation of a flexible array is done with :c:func:`flex_array_alloc()`::
-
-#include 
-
-struct flex_array *flex_array_alloc(int element_size,
-   unsigned int total,
-   gfp_t flags);
-
-The individual object size is provided by ``element_size``, while total is the
-maximum number of objects which can be stored in the array.  The flags
-argument is passed directly to the internal memory allocation calls.  With
-the current code, using flags to ask for high memory is likely to lead to
-notably unpleasant side effects.
-
-It is also possible to define flexible arrays at compile time with::
-
-DEFINE_FLEX_ARRAY(name, element_size, total);
-
-This macro will result in a definition of an array with the given name; the
-element size and total will be checked for validity at compile time.
-
-Storing data into a flexible array is accomplished with a call to
-:c:func:`flex_array_put()`::
-
-int flex_array_put(struct flex_array *array, unsigned int element_nr,
-  void *src, gfp_t flags);
-
-This call will copy the data from src into the array, in the position
-indicated by ``element_nr`` (which must be less than the maximum specified when
-the array was created).  If any memory allocations must be performed, flags
-will be used.  The return value is zero on success, a negative error code
-otherwise.
-
-There might possibly be a need to store data into a flexible array while
-running in some sort of atomic context; in this situation, sleeping in the
-memory allocator would be a bad thing.  That can be avoided by using
-``GFP_ATOMIC`` for the flags value, but, often, there is a better way.  The
-trick is to ensure that any needed memory allocations are done before
-entering atomic context, using :c:func:`flex_array_prealloc()`::
-
-int flex_array_prealloc(struct flex_array *array, unsigned int start,
-   unsigned int nr_elements, gfp_t flags);
-
-This function will ensure that memory for the elements indexed in the range
-defined by ``start`` and ``nr_elements`` has been allocated.  Thereafter, a
-``flex_array_put()`` call on an element in that range is guaranteed not to
-block.
-
-Getting data back out of the array is done with :c:func:`flex_array_get()`::
-
-void *flex_array_get(struct flex_array *fa, unsigned int element_nr

[PATCH 5/7] proc: commit to genradix

2018-12-17 Thread Kent Overstreet
the new generic radix trees have a simpler API and implementation, and
no limitations on number of elements, so all flex_array users are being
converted

Signed-off-by: Kent Overstreet 
Cc: Al Viro 
---
 fs/proc/base.c | 43 +++
 1 file changed, 15 insertions(+), 28 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index ce34654794..2edf386ed3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -59,6 +59,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -92,7 +93,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include "internal.h"
@@ -2140,11 +2140,12 @@ proc_map_files_readdir(struct file *file, struct 
dir_context *ctx)
struct task_struct *task;
struct mm_struct *mm;
unsigned long nr_files, pos, i;
-   struct flex_array *fa = NULL;
-   struct map_files_info info;
+   GENRADIX(struct map_files_info) fa;
struct map_files_info *p;
int ret;
 
+   genradix_init();
+
ret = -ENOENT;
task = get_proc_task(file_inode(file));
if (!task)
@@ -2176,35 +2177,22 @@ proc_map_files_readdir(struct file *file, struct 
dir_context *ctx)
 */
 
for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
-   if (vma->vm_file && ++pos > ctx->pos)
-   nr_files++;
-   }
+   if (!vma->vm_file)
+   continue;
+   if (++pos <= ctx->pos)
+   continue;
 
-   if (nr_files) {
-   fa = flex_array_alloc(sizeof(info), nr_files,
-   GFP_KERNEL);
-   if (!fa || flex_array_prealloc(fa, 0, nr_files,
-   GFP_KERNEL)) {
+   p = genradix_ptr_alloc(, nr_files++, GFP_KERNEL);
+   if (!p) {
ret = -ENOMEM;
-   if (fa)
-   flex_array_free(fa);
up_read(>mmap_sem);
mmput(mm);
goto out_put_task;
}
-   for (i = 0, vma = mm->mmap, pos = 2; vma;
-   vma = vma->vm_next) {
-   if (!vma->vm_file)
-   continue;
-   if (++pos <= ctx->pos)
-   continue;
 
-   info.start = vma->vm_start;
-   info.end = vma->vm_end;
-   info.mode = vma->vm_file->f_mode;
-   if (flex_array_put(fa, i++, , GFP_KERNEL))
-   BUG();
-   }
+   p->start = vma->vm_start;
+   p->end = vma->vm_end;
+   p->mode = vma->vm_file->f_mode;
}
up_read(>mmap_sem);
mmput(mm);
@@ -2213,7 +2201,7 @@ proc_map_files_readdir(struct file *file, struct 
dir_context *ctx)
char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */
unsigned int len;
 
-   p = flex_array_get(fa, i);
+   p = genradix_ptr(, i);
len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end);
if (!proc_fill_cache(file, ctx,
  buf, len,
@@ -2223,12 +2211,11 @@ proc_map_files_readdir(struct file *file, struct 
dir_context *ctx)
break;
ctx->pos++;
}
-   if (fa)
-   flex_array_free(fa);
 
 out_put_task:
put_task_struct(task);
 out:
+   genradix_free();
return ret;
 }
 
-- 
2.20.1



[PATCH 6/7] sctp: Convert to genradix

2018-12-17 Thread Kent Overstreet
This also makes sctp_stream_alloc_(out|in) saner, in that they no longer
allocate new flex_arrays/genradixes, they just preallocate more
elements.

This code does however have a suspicious lack of locking.

Signed-off-by: Kent Overstreet 
Cc: Vlad Yasevich 
Cc: Neil Horman 
Cc: Marcelo Ricardo Leitner 
Cc: linux-s...@vger.kernel.org
---
 include/net/sctp/structs.h   |  15 ++---
 net/sctp/stream.c| 106 +++
 net/sctp/stream_interleave.c |   2 +-
 3 files changed, 28 insertions(+), 95 deletions(-)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index a11f937904..ee606e0fff 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -48,6 +48,7 @@
 #define __sctp_structs_h__
 
 #include 
+#include 
 #include 
 #include   /* linux/in.h needs this!!*/
 #include   /* We get struct sockaddr_in. */
@@ -57,7 +58,6 @@
 #include   /* This gets us atomic counters.  */
 #include   /* We need sk_buff_head. */
 #include/* We need tq_struct.*/
-#include   /* We need flex_array.   */
 #include /* We need sctp* header structs.  */
 #include  /* We need auth specific structs */
 #include /* For inet_skb_parm */
@@ -1441,8 +1441,9 @@ struct sctp_stream_in {
 };
 
 struct sctp_stream {
-   struct flex_array *out;
-   struct flex_array *in;
+   GENRADIX(struct sctp_stream_out) out;
+   GENRADIX(struct sctp_stream_in) in;
+
__u16 outcnt;
__u16 incnt;
/* Current stream being sent, if any */
@@ -1465,17 +1466,17 @@ struct sctp_stream {
 };
 
 static inline struct sctp_stream_out *sctp_stream_out(
-   const struct sctp_stream *stream,
+   struct sctp_stream *stream,
__u16 sid)
 {
-   return flex_array_get(stream->out, sid);
+   return genradix_ptr(>out, sid);
 }
 
 static inline struct sctp_stream_in *sctp_stream_in(
-   const struct sctp_stream *stream,
+   struct sctp_stream *stream,
__u16 sid)
 {
-   return flex_array_get(stream->in, sid);
+   return genradix_ptr(>in, sid);
 }
 
 #define SCTP_SO(s, i) sctp_stream_out((s), (i))
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index ffb940d3b5..09b9c7dc59 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -37,53 +37,6 @@
 #include 
 #include 
 
-static struct flex_array *fa_alloc(size_t elem_size, size_t elem_count,
-  gfp_t gfp)
-{
-   struct flex_array *result;
-   int err;
-
-   result = flex_array_alloc(elem_size, elem_count, gfp);
-   if (result) {
-   err = flex_array_prealloc(result, 0, elem_count, gfp);
-   if (err) {
-   flex_array_free(result);
-   result = NULL;
-   }
-   }
-
-   return result;
-}
-
-static void fa_free(struct flex_array *fa)
-{
-   if (fa)
-   flex_array_free(fa);
-}
-
-static void fa_copy(struct flex_array *fa, struct flex_array *from,
-   size_t index, size_t count)
-{
-   void *elem;
-
-   while (count--) {
-   elem = flex_array_get(from, index);
-   flex_array_put(fa, index, elem, 0);
-   index++;
-   }
-}
-
-static void fa_zero(struct flex_array *fa, size_t index, size_t count)
-{
-   void *elem;
-
-   while (count--) {
-   elem = flex_array_get(fa, index);
-   memset(elem, 0, fa->element_size);
-   index++;
-   }
-}
-
 /* Migrates chunks from stream queues to new stream queues if needed,
  * but not across associations. Also, removes those chunks to streams
  * higher than the new max.
@@ -138,46 +91,32 @@ static void sctp_stream_outq_migrate(struct sctp_stream 
*stream,
 static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt,
 gfp_t gfp)
 {
-   struct flex_array *out;
-   size_t elem_size = sizeof(struct sctp_stream_out);
-
-   out = fa_alloc(elem_size, outcnt, gfp);
-   if (!out)
-   return -ENOMEM;
-
-   if (stream->out) {
-   fa_copy(out, stream->out, 0, min(outcnt, stream->outcnt));
-   fa_free(stream->out);
-   }
+   int ret;
 
-   if (outcnt > stream->outcnt)
-   fa_zero(out, stream->outcnt, (outcnt - stream->outcnt));
+   if (outcnt <= stream->outcnt)
+   return 0;
 
-   stream->out = out;
+   ret = genradix_prealloc(>out, outcnt, gfp);
+   if (ret)
+   return ret;
 
+   stream->outcnt = outcnt;
return 0;
 }
 
 static int sctp_stream_alloc_in(struct sctp_stream *stream, __u16 incnt,
gfp_t gfp)
 {
-   struct flex_array *in;
-   size_t elem_size = sizeof(struct sctp_stream_in);
-
-   in = fa_alloc(elem_size, incnt, gfp);
-   if (!in

[PATCH 0/7] generic radix trees; drop flex arrays

2018-12-17 Thread Kent Overstreet
this has been sitting on my todo list for far too long; let's try and get it
merged before flex arrays grow any new users.

The only significant change since the last time I mailed this out is that it's
now safe to use a genradix from multiple threads, including when new nodes are
being allocated.

Since we never _free_ nodes on a genradix while it's in use (only when the
entire tree is being freed), this adds very little code - we just use cmpxchg
for adding pointers to new nodes, there's no need to use rcu_read_lock() or
anything like that. And for the types of things we seem to want to use it for
it's quite useful.
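
For anyone who hasn't seen the earlier postings, the API from patch 4 boils
down to roughly the following - an illustrative fragment only, where struct
foo and do_something() are placeholders:

	static GENRADIX(struct foo) foos;
	struct genradix_iter iter;
	struct foo *p;

	genradix_init(&foos);

	/* allocates backing pages as needed; entries that were never written
	 * read back as zeroes */
	p = genradix_ptr_alloc(&foos, 273, GFP_KERNEL);
	if (!p)
		return -ENOMEM;
	p->count++;

	genradix_for_each(&foos, iter, p)
		do_something(p);

	genradix_free(&foos);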

Andrew, would you mind picking up this series, or would someone else be more
suitable? The patch series has been sent out a few times and seen a fair amount
of review.

Kent Overstreet (7):
  openvswitch: convert to kvmalloc
  md: convert to kvmalloc
  selinux: convert to kvmalloc
  Generic radix trees
  proc: commit to genradix
  sctp: Convert to genradix
  Drop flex_arrays

 Documentation/core-api/flexible-arrays.rst| 130 --
 Documentation/core-api/generic-radix-tree.rst |  12 +
 Documentation/core-api/index.rst  |   1 +
 Documentation/flexible-arrays.txt | 123 --
 drivers/md/raid5-ppl.c|   6 +-
 drivers/md/raid5.c|  87 ++--
 drivers/md/raid5.h|   9 +-
 fs/proc/base.c|  43 +-
 include/linux/flex_array.h| 149 ---
 include/linux/generic-radix-tree.h| 231 ++
 include/linux/poison.h|   3 -
 include/net/sctp/structs.h|  15 +-
 lib/Makefile  |   5 +-
 lib/flex_array.c  | 398 --
 lib/generic-radix-tree.c  | 217 ++
 net/openvswitch/flow.h|   1 -
 net/openvswitch/flow_netlink.h|   1 -
 net/openvswitch/flow_table.c  |  51 +--
 net/openvswitch/flow_table.h  |   3 +-
 net/sctp/stream.c | 106 +
 net/sctp/stream_interleave.c  |   2 +-
 security/selinux/ss/avtab.c   |  40 +-
 security/selinux/ss/avtab.h   |   4 +-
 security/selinux/ss/conditional.c |   6 +-
 security/selinux/ss/policydb.c| 122 ++
 security/selinux/ss/policydb.h|  12 +-
 security/selinux/ss/services.c|  22 +-
 tools/include/linux/poison.h  |   3 -
 28 files changed, 628 insertions(+), 1174 deletions(-)
 delete mode 100644 Documentation/core-api/flexible-arrays.rst
 create mode 100644 Documentation/core-api/generic-radix-tree.rst
 delete mode 100644 Documentation/flexible-arrays.txt
 delete mode 100644 include/linux/flex_array.h
 create mode 100644 include/linux/generic-radix-tree.h
 delete mode 100644 lib/flex_array.c
 create mode 100644 lib/generic-radix-tree.c

-- 
2.20.1



[PATCH 4/7] Generic radix trees

2018-12-17 Thread Kent Overstreet
Very simple radix tree implementation that supports storing arbitrary
size entries, up to PAGE_SIZE - upcoming patches will convert existing
flex_array users to genradixes. The new genradix code has a much simpler
API and implementation, and doesn't have a hard limit on the number of
elements like flex_array does.

Signed-off-by: Kent Overstreet 
---
 Documentation/core-api/generic-radix-tree.rst |  12 +
 Documentation/core-api/index.rst  |   1 +
 include/linux/generic-radix-tree.h| 231 ++
 lib/Makefile  |   3 +-
 lib/generic-radix-tree.c  | 217 
 5 files changed, 463 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/core-api/generic-radix-tree.rst
 create mode 100644 include/linux/generic-radix-tree.h
 create mode 100644 lib/generic-radix-tree.c

diff --git a/Documentation/core-api/generic-radix-tree.rst 
b/Documentation/core-api/generic-radix-tree.rst
new file mode 100644
index 00..80fd27440f
--- /dev/null
+++ b/Documentation/core-api/generic-radix-tree.rst
@@ -0,0 +1,12 @@
+=================================
+Generic radix trees/sparse arrays
+=================================
+
+.. kernel-doc:: include/linux/generic-radix-tree.h
+   :doc: Generic radix trees/sparse arrays
+
+generic radix tree functions
+----------------------------
+
+.. kernel-doc:: include/linux/generic-radix-tree.h
+   :functions:
diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst
index 3adee82be3..6870baffef 100644
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -28,6 +28,7 @@ Core utilities
errseq
printk-formats
circular-buffers
+   generic-radix-tree
memory-allocation
mm-api
gfp_mask-from-fs-io
diff --git a/include/linux/generic-radix-tree.h 
b/include/linux/generic-radix-tree.h
new file mode 100644
index 00..3a91130a4f
--- /dev/null
+++ b/include/linux/generic-radix-tree.h
@@ -0,0 +1,231 @@
+#ifndef _LINUX_GENERIC_RADIX_TREE_H
+#define _LINUX_GENERIC_RADIX_TREE_H
+
+/**
+ * DOC: Generic radix trees/sparse arrays:
+ *
+ * Very simple and minimalistic, supporting arbitrary size entries up to
+ * PAGE_SIZE.
+ *
+ * A genradix is defined with the type it will store, like so:
+ *
+ * static GENRADIX(struct foo) foo_genradix;
+ *
+ * The main operations are:
+ *
+ * - genradix_init(radix) - initialize an empty genradix
+ *
+ * - genradix_free(radix) - free all memory owned by the genradix and
+ *   reinitialize it
+ *
+ * - genradix_ptr(radix, idx) - gets a pointer to the entry at idx, returning
+ *   NULL if that entry does not exist
+ *
+ * - genradix_ptr_alloc(radix, idx, gfp) - gets a pointer to an entry,
+ *   allocating it if necessary
+ *
+ * - genradix_for_each(radix, iter, p) - iterate over each entry in a genradix
+ *
+ * The radix tree allocates one page of entries at a time, so entries may exist
+ * that were never explicitly allocated - they will be initialized to all
+ * zeroes.
+ *
+ * Internally, a genradix is just a radix tree of pages, and indexing works in
+ * terms of byte offsets. The wrappers in this header file use sizeof on the
+ * type the radix contains to calculate a byte offset from the index - see
+ * __idx_to_offset.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+struct genradix_root;
+
+struct __genradix {
+   struct genradix_root __rcu  *root;
+};
+
+/*
+ * NOTE: currently, sizeof(_type) must not be larger than PAGE_SIZE:
+ */
+
+#define __GENRADIX_INITIALIZER \
+   {   \
+   .tree = {   \
+   .root = NULL

[PATCH 3/7] selinux: convert to kvmalloc

2018-12-17 Thread Kent Overstreet
The flex arrays were being used for constant sized arrays, so there's no
benefit to using flex_arrays over something simpler.

Signed-off-by: Kent Overstreet 
Cc: linux-security-mod...@vger.kernel.org
---
 security/selinux/ss/avtab.c   |  40 +-
 security/selinux/ss/avtab.h   |   4 +-
 security/selinux/ss/conditional.c |   6 +-
 security/selinux/ss/policydb.c| 122 --
 security/selinux/ss/policydb.h|  12 +--
 security/selinux/ss/services.c|  22 ++
 6 files changed, 62 insertions(+), 144 deletions(-)

diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c
index c0417cf17f..8c5800750f 100644
--- a/security/selinux/ss/avtab.c
+++ b/security/selinux/ss/avtab.c
@@ -93,12 +93,10 @@ avtab_insert_node(struct avtab *h, int hvalue,
newnode->next = prev->next;
prev->next = newnode;
} else {
-   newnode->next = flex_array_get_ptr(h->htable, hvalue);
-   if (flex_array_put_ptr(h->htable, hvalue, newnode,
-  GFP_KERNEL|__GFP_ZERO)) {
-   kmem_cache_free(avtab_node_cachep, newnode);
-   return NULL;
-   }
+   struct avtab_node **n = >htable[hvalue];
+
+   newnode->next = *n;
+   *n = newnode;
}
 
h->nel++;
@@ -111,11 +109,11 @@ static int avtab_insert(struct avtab *h, struct avtab_key 
*key, struct avtab_dat
struct avtab_node *prev, *cur, *newnode;
u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD);
 
-   if (!h || !h->htable)
+   if (!h)
return -EINVAL;
 
hvalue = avtab_hash(key, h->mask);
-   for (prev = NULL, cur = flex_array_get_ptr(h->htable, hvalue);
+   for (prev = NULL, cur = h->htable[hvalue];
 cur;
 prev = cur, cur = cur->next) {
if (key->source_type == cur->key.source_type &&
@@ -156,10 +154,10 @@ avtab_insert_nonunique(struct avtab *h, struct avtab_key 
*key, struct avtab_datu
struct avtab_node *prev, *cur;
u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD);
 
-   if (!h || !h->htable)
+   if (!h)
return NULL;
hvalue = avtab_hash(key, h->mask);
-   for (prev = NULL, cur = flex_array_get_ptr(h->htable, hvalue);
+   for (prev = NULL, cur = h->htable[hvalue];
 cur;
 prev = cur, cur = cur->next) {
if (key->source_type == cur->key.source_type &&
@@ -186,11 +184,11 @@ struct avtab_datum *avtab_search(struct avtab *h, struct 
avtab_key *key)
struct avtab_node *cur;
u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD);
 
-   if (!h || !h->htable)
+   if (!h)
return NULL;
 
hvalue = avtab_hash(key, h->mask);
-   for (cur = flex_array_get_ptr(h->htable, hvalue); cur;
+   for (cur = h->htable[hvalue]; cur;
 cur = cur->next) {
if (key->source_type == cur->key.source_type &&
key->target_type == cur->key.target_type &&
@@ -222,11 +220,11 @@ avtab_search_node(struct avtab *h, struct avtab_key *key)
struct avtab_node *cur;
u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD);
 
-   if (!h || !h->htable)
+   if (!h)
return NULL;
 
hvalue = avtab_hash(key, h->mask);
-   for (cur = flex_array_get_ptr(h->htable, hvalue); cur;
+   for (cur = h->htable[hvalue]; cur;
 cur = cur->next) {
if (key->source_type == cur->key.source_type &&
key->target_type == cur->key.target_type &&
@@ -281,11 +279,11 @@ void avtab_destroy(struct avtab *h)
int i;
struct avtab_node *cur, *temp;
 
-   if (!h || !h->htable)
+   if (!h)
return;
 
for (i = 0; i < h->nslot; i++) {
-   cur = flex_array_get_ptr(h->htable, i);
+   cur = h->htable[i];
while (cur) {
temp = cur;
cur = cur->next;
@@ -295,7 +293,7 @@ void avtab_destroy(struct avtab *h)
kmem_cache_free(avtab_node_cachep, temp);
}
}
-   flex_array_free(h->htable);
+   kvfree(h->htable);
h->htable = NULL;
h->nslot = 0;
h->mask = 0;
@@ -303,6 +301,7 @@ void avtab_destroy(struct avtab *h)
 
 int avtab_init(struct avtab *h)
 {
+   kvfree(h->htable);
h->htable = NULL;
h->nel = 0;
return 0;
@@ -329,8 +328,7 @@ int avtab_alloc(struct avtab *h, u32 nrules)
nslot = MAX_AVTAB_HASH_BUCKETS;
  

Re: [PATCH 6/6] Drop flex_arrays

2018-12-17 Thread Kent Overstreet
On Thu, Dec 13, 2018 at 01:09:17PM -0500, Neil Horman wrote:
> On Thu, Dec 13, 2018 at 08:45:33AM -0800, Matthew Wilcox wrote:
> > On Thu, Dec 13, 2018 at 10:51:49AM -0500, Neil Horman wrote:
> > > On Thu, Dec 13, 2018 at 06:41:11AM -0800, Matthew Wilcox wrote:
> > > > On Thu, Dec 13, 2018 at 09:30:47PM +0900, Xin Long wrote:
> > > > > On Sat, Sep 8, 2018 at 1:57 AM Kent Overstreet
> > > > >  wrote:
> > > > > >
> > > > > > All existing users have been converted to generic radix trees
> > > > > NAK, SCTP is still using flex_arrays,
> > > > > # grep flex_array net/sctp/*
> > > > > 
> > > > > This patch will break the build.
> > > > 
> > > > sctp added that user after this patch was sent.  Please stop adding
> > > > flexarray users!
> > > > 
> > > > This particular user should probably have just used kvmalloc.
> > > > 
> > > 
> > > No, I don't think thats right.
> > > 
> > > This appears to have been sent on September 7th.  Commit
> > > 0d493b4d0be352b5e361e4fa0bc3efe952d8b10e, which added the use of 
> > > flex_arrays to
> > > sctp, seems to have been merged on August 10th, a month prior.
> > 
> > Are you seriously suggesting anybody sending cleanups needs to be
> > monitoring every single email list to see if anybody has added a new user?
> > Removing the flexarray has been advertised since May.
> > https://lkml.org/lkml/2018/5/22/1142
> > 
> I don't see how thats any more egregious than everyone else having to monitor
> for removals of code thats in the tree at some indeterminate future.  The 
> long and the short of it
> is that a new flex_array user was added in the intervening 7 months that this
> patch has been waiting to go in, and it will now break if merged.  I'm sorry 
> we
> started using it during that time, but it got missed by everyone in the chain
> that merged it, and hasn't been noticed in the 4 months since.  It is what it
> is, and now it needs to be undone. 
> 
> > > regardless, however, sctp has a current in-tree use of flex_arrays, and 
> > > merging
> > > this patch will break the build without a respin.
> > 
> > Great.  I await your patch to replace the flexarray usage.
> Sure, we'll get to it as soon as we can, or, if you are in a hurry, you can
> replace the same usage, like you've done for all the other users in this 
> series.

This is really my fault for slacking on getting generic-radix-trees in, and
given that the sctp code has been merged I'll do the conversion.

However.

Looking at the sctp code, honestly, wtf is going on here.

sctp_send_add_streams() calls sctp_stream_alloc_out() when it wants to make the
out flex_array bigger - ok, this makes sense, you're using a flex_array because
you want something resizable.

But wait, look what it actually does - it unconditionally frees the old flex
array and preallocates a new one and copies the contents of the old one over.

Without, as far as I can tell, any locking whatsoever.
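
To spell out the window (a sketch, assuming a concurrent reader on another
CPU, which nothing here appears to prevent):

	/* CPU 0: sctp_stream_alloc_out()        CPU 1: e.g. sctp_stream_out() */
	out = fa_alloc(elem_size, outcnt, gfp);
	fa_copy(out, stream->out, 0,
		min(outcnt, stream->outcnt));
	                                         p = flex_array_get(stream->out, sid);
	fa_free(stream->out);
	                                         /* p now points into freed memory */
	stream->out = out;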

Was this code tested? Reviewed?


Re: [RFC v3 11/19] kunit: add Python libraries for handing KUnit config and kernel

2018-12-07 Thread Kent Overstreet
On Thu, Dec 06, 2018 at 12:32:47PM +, Kieran Bingham wrote:
> Oh - although, yes - there are some good concepts there - but I'm a bit
> weary of how easy it would be to 'run' the said test against multiple
> kernel version libraries... there would be a lot of possible ABI
> conflicts perhaps.
> 
> My main initial idea for a libumlinux is to provide infrastructure such
> as our linked-lists and other kernel formatting so that we can take
> kernel code directly to userspace for test and debug (assuming that
> there are no hardware dependencies or things that we can't mock out)

I think this would be really wonderful to make happen, and could potentially
be much more widely useful than for just running tests, by making it easier to
share code between both kernel and userspace.

For bcachefs I've got a shim layer that lets me build almost everything in
fs/bcachefs and use it as a library in the userspace bcachefs-tools - e.g. for
fsck and migrate. Mine was a quick and dirty hack, but even so it's been
_extremely_ useful and a major success - I think if this became something more
official a lot of uses would be found for it.

I'm not sure if you've actually started on this (haven't seen most of the thread
yet), but if any of the bcachefs-tools shim code is useful feel free to steal it
- I've got dirt-simple, minimum viable shims for the kthread api, workqueus,
timers, the block layer, and assorted other stuff:

https://evilpiepirate.org/git/bcachefs-tools.git/
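
To give a flavour of how dirt-simple these are - a spinlock shim onto pthreads
is roughly the following (a simplified sketch, not the verbatim bcachefs-tools
code):

	#include <pthread.h>

	typedef struct {
		pthread_mutex_t	lock;
	} spinlock_t;

	static inline void spin_lock_init(spinlock_t *l)
	{
		pthread_mutex_init(&l->lock, NULL);
	}

	static inline void spin_lock(spinlock_t *l)
	{
		pthread_mutex_lock(&l->lock);
	}

	static inline void spin_unlock(spinlock_t *l)
	{
		pthread_mutex_unlock(&l->lock);
	}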

Going forward, one issue is going to be that a libumlinux is going to want to
shim some interfaces, and for other things it'll just want to pull in the kernel
implementation - e.g. rhashtables. It might be nice if we could refactor things
a bit so that things like rhashtables could be built as a standalone library, as
is.


Bcachefs status update, current work

2018-12-01 Thread Kent Overstreet
So, since I've been pretty quiet since LSF I thought I ought to give an update
on where bcachefs is at - and in particular talk about what sorts of problems
and improvements are currently being worked on.

As of last LSF, there was still a lot of work to be done before we had fast
mount times that don't require walking all metadata. There were two main work
items:
 - atomicity of filesystem operations. Any filesystem operation that had
   anything to do with i_nlink wasn't atomic (but they were ordered so that
   filesystem consistency wasn't an issue) - on startup we'd have to scan and
   recalculate i_nlink and also delete no longer referenced inodes.
 - allocation information wasn't persisted (per bucket sector counts) - so on
   startup we have to walk all the extents and recalculate all the disk space
   accounting.

#1 is done. For those curious about the details, if you've seen how bcachefs
implements rename (with multiple linked btree iterators), it's based off of
that. Basically, there's a new btree transaction context widget for allocating
btree iterators out of, and queuing up updates to be done at transaction commit
- so that different code paths (e.g. inode create, dirent create, xattr create)
can be used together without having to manually write code to keep track of all
the iterators that need to be used and kept locked, etc. I think it's pretty
neat how clean it turned out.
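
In rough pseudocode (all names illustrative, not the actual bcachefs API), a
create path now looks something like:

	struct btree_trans trans;
	int ret;

	trans_init(&trans);
retry:
	/* each helper takes its iterators from the transaction context and
	 * queues its updates; nothing is written until commit: */
	ret =   create_inode(&trans, inode)		?:
		create_dirent(&trans, dir, name, inode)	?:
		create_xattrs(&trans, inode)		?:
		trans_commit(&trans);
	if (ret == -EINTR)	/* transaction restart */
		goto retry;
	trans_exit(&trans);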

So basically, everything's fully atomic now except for fallocate/fcollapse/etc. 
-
and after unclean shutdown we do have to scan just the inodes btree for inodes
that have been deleted. Eventually we'll have to implement a linked list of
deleted inodes like xfs does (or perhaps fake hidden directory), but inodes are
small in bcachefs, < 100 bytes, so it's a low priority.

Erasure coding is about 80% done now. I'm quite happy with how erasure coding
turned out - there's no write hole (we never update existing stripes in place),
and we also don't fragment writes like zfs does. Instead, foreground writes are
replicated (raid10 style), and as soon as we have a stripe of new data we write
out p/q blocks and then update the extents with a pointer to the stripe and drop
the now unneeded replicas. Right now it's just reed solomon (raid5/6), but
weaver codes or something else could be added in the future if anyone wants to.
The part that still needs to be implemented before it'll be useful is stripe
level compaction - when we have stripes with some empty blocks (all the data in
them was overwritten), we need to use the remaining data blocks when creating
new stripes so that we can drop the old stripe (and stop pinning the empty
blocks). I'm leaving that off until later though because that won't impact the
on disk format at all, and there's other stuff I want to get done first.

My current priority is reflink - as that will be highly useful to the company
that's funding bcachefs development. That's more or less requiring me to do
persistent allocation information first though, so that's become my current
project (the reflinked extent refcounts will be much too big to keep in memory
like I am now for bucket sector counts, so they'll have to be kept in a btree
and updated whenever doing extent updates - and the infrastructure I need to
make that happen is also what I need for making all the other disk space
accounting persistent).

So, bcachefs will have fast mounts (including after unclean shutdown) soon.

At the very moment what I'm working on (leading up to fast mounts after clean
shutdowns, first) is some improvements to disk space accounting for multi device
filesystems.

The background to this is that in order to know whether you can safely mount in
degraded mode, you have to store a list of all the combinations of disks that
have data replicated across them (or are in an erasure coded stripe) - this is
assuming you don't have any kind of fixed layout, like regular RAID does. That
is, if you've got 8 disks in your filesystem, and you're running with
replicas=2, and two of your disks are offline, you need to know whether you have
any data that's replicated across those two particular disks.

bcachefs has such a table kept in the superblock, but entries in it aren't
refcounted - we create new entries if necessary when inserting new extents into
the extents btree, but we need a gc pass to delete them, generally triggered by
device removal. That's kind of lame, since it means we might fail mounts that
are actually safe.

So, before writing the code to persist the filesystem level sector counts I'm
changing it to track them broken out by replicas entry - i.e. per unique
combination of disks the data lies on. Which also means you'll be able to see in
a multi device filesystem how your data is laid out in a really fine grained
way.

Re: upstreaming - my current thinking is that since so much of the current
development involves on disk format changes/additions it probably makes sense to
hold off until reflink is done, which I'm 


Re: [PATCH] bug fix in function check_should_bypass

2018-09-27 Thread Kent Overstreet
On Thu, Sep 27, 2018 at 04:27:49PM +0800, Dongbo Cao wrote:
> bio->bi_iter.bi_sector is the sector index of current request, no need to be 
> aligned.
> instead, bio->bi_iter.bi_size should be aligned to block_bytes-1, not 
> block_size-1.
> and bio_sectors is the number of sectors of current request, also no need to 
> be aligned, just remove it.

this isn't a bug fix, please don't label things as bug fixes that aren't.

also, it's wrong. an unaligned IO that overlaps with data already in the cache
will result in an extent in the cache with misaligned size, which will result in
misaligned IOs when reading from it.
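
to make it concrete - say block_size is 8 sectors (4k blocks):

	bio->bi_iter.bi_sector	= 1;	/* starts 512 bytes into a cache block */
	bio->bi_iter.bi_size	= 4096;	/* one block's worth of data */

bi_size & (block_bytes(c) - 1) is 0, so the proposed check lets this through,
but the IO still straddles a block boundary - which is exactly what the
existing bi_sector/bio_sectors checks catch.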

> 
> Signed-off-by: Dongbo Cao 
> ---
>  drivers/md/bcache/request.c | 3 +--
>  1 file changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
> index 13d3355a..fb3502da 100644
> --- a/drivers/md/bcache/request.c
> +++ b/drivers/md/bcache/request.c
> @@ -398,8 +398,7 @@ static bool check_should_bypass(struct cached_dev *dc, 
> struct bio *bio)
>   !(bio->bi_opf & REQ_PRIO))
>   goto skip;
>  
> - if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) ||
> - bio_sectors(bio) & (c->sb.block_size - 1)) {
> + if (bio->bi_iter.bi_size & (block_bytes(c) - 1)) {
>   pr_debug("skipping unaligned io");
>   goto skip;
>   }
> -- 
> 2.17.1
> 
> 


Re: [PATCH 2/6] md: convert to kvmalloc

2018-09-07 Thread Kent Overstreet
On Fri, Sep 07, 2018 at 10:49:42AM -0700, Matthew Wilcox wrote:
> On Fri, Sep 07, 2018 at 12:56:31PM -0400, Kent Overstreet wrote:
> > @@ -165,7 +164,7 @@ ops_run_partial_parity(struct stripe_head *sh, struct 
> > raid5_percpu *percpu,
> >struct dma_async_tx_descriptor *tx)
> >  {
> > int disks = sh->disks;
> > -   struct page **srcs = flex_array_get(percpu->scribble, 0);
> > +   struct page **srcs = percpu->scribble;
> > int count = 0, pd_idx = sh->pd_idx, i;
> > struct async_submit_ctl submit;
> >  
> > @@ -196,8 +195,8 @@ ops_run_partial_parity(struct stripe_head *sh, struct 
> > raid5_percpu *percpu,
> > }
> >  
> > init_async_submit(, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx,
> > - NULL, sh, flex_array_get(percpu->scribble, 0)
> > - + sizeof(struct page *) * (sh->disks + 2));
> > + NULL, sh, percpu->scribble +
> > + sizeof(struct page *) * (sh->disks + 2));
> 
> I think this would read better written as:
> 
>   init_async_submit(, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx,
> NULL, sh, srcs + sh->disks + 2);
> 
> >  static addr_conv_t *to_addr_conv(struct stripe_head *sh,
> >  struct raid5_percpu *percpu, int i)
> >  {
> > -   void *addr;
> > -
> > -   addr = flex_array_get(percpu->scribble, i);
> > -   return addr + sizeof(struct page *) * (sh->disks + 2);
> > +   return percpu->scribble + i * percpu->scribble_obj_size +
> > +   sizeof(struct page *) * (sh->disks + 2);
> >  }
> >  
> >  /* return a pointer to the address conversion region of the scribble 
> > buffer */
> >  static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
> >  {
> > -   void *addr;
> > -
> > -   addr = flex_array_get(percpu->scribble, i);
> > -   return addr;
> > +   return percpu->scribble + i * percpu->scribble_obj_size;
> >  }
> 
> Perhaps this would be better as ...
> 
>  static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
>  {
> - void *addr;
> -
> - addr = flex_array_get(percpu->scribble, i);
> - return addr;
> + return percpu->scribble + i * percpu->scribble_obj_size;
>  }
> 
>  static addr_conv_t *to_addr_conv(struct stripe_head *sh,
>struct raid5_percpu *percpu, int i)
>  {
> - void *addr;
> -
> - addr = flex_array_get(percpu->scribble, i);
> - return addr + sizeof(struct page *) * (sh->disks + 2);
> + return to_addr_page(percpu, i) + sh->disks + 2;
>  }
> 
> 
> The rest looks good.

Need some casts (to void * or addr_conv_t *) but yeah, I suppose that's a bit
cleaner.
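
i.e. what it ends up looking like (this is the form in the later posting of
the series):

	static addr_conv_t *to_addr_conv(struct stripe_head *sh,
					 struct raid5_percpu *percpu, int i)
	{
		return (void *) (to_addr_page(percpu, i) + sh->disks + 2);
	}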


Re: [PATCH 3/6] selinux: convert to kvmalloc

2018-09-07 Thread Kent Overstreet
On Sat, Sep 08, 2018 at 02:08:03AM +0900, Tetsuo Handa wrote:
> On 2018/09/08 1:56, Kent Overstreet wrote:
> > @@ -329,8 +328,7 @@ int avtab_alloc(struct avtab *h, u32 nrules)
> > nslot = MAX_AVTAB_HASH_BUCKETS;
> > mask = nslot - 1;
> >  
> > -   h->htable = flex_array_alloc(sizeof(struct avtab_node *), nslot,
> > -GFP_KERNEL | __GFP_ZERO);
> > +   h->htable = kvmalloc_array(nslot, sizeof(void *), GFP_KERNEL);
> > if (!h->htable)
> > return -ENOMEM;
> >  
> 
> kvmalloc_array() does not imply __GFP_ZERO.

Thanks, fixed
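
For readers following the thread: the fix presumably just restores the
zeroing, along these lines (an illustration, not the exact respin):

	/* The avtab bucket array must start out zeroed; kvmalloc_array()
	 * does not zero, so ask for it explicitly (kvcalloc() is the other
	 * obvious spelling). */
	h->htable = kvmalloc_array(nslot, sizeof(void *),
				   GFP_KERNEL | __GFP_ZERO);
	if (!h->htable)
		return -ENOMEM;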


[PATCH 5/6] proc: commit to genradix

2018-09-07 Thread Kent Overstreet
the new generic radix trees have a simpler API and implementation, and
no limitations on number of elements, so all flex_array users are being
converted

Signed-off-by: Kent Overstreet 
Cc: Al Viro 
---
 fs/proc/base.c | 43 +++
 1 file changed, 15 insertions(+), 28 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index aaffc0c302..e11fbb390a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -59,6 +59,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -92,7 +93,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include "internal.h"
@@ -2128,11 +2128,12 @@ proc_map_files_readdir(struct file *file, struct 
dir_context *ctx)
struct task_struct *task;
struct mm_struct *mm;
unsigned long nr_files, pos, i;
-   struct flex_array *fa = NULL;
-   struct map_files_info info;
+   GENRADIX(struct map_files_info) fa;
struct map_files_info *p;
int ret;
 
+   genradix_init(&fa);
+
ret = -ENOENT;
task = get_proc_task(file_inode(file));
if (!task)
@@ -2164,35 +2165,22 @@ proc_map_files_readdir(struct file *file, struct 
dir_context *ctx)
 */
 
for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
-   if (vma->vm_file && ++pos > ctx->pos)
-   nr_files++;
-   }
+   if (!vma->vm_file)
+   continue;
+   if (++pos <= ctx->pos)
+   continue;
 
-   if (nr_files) {
-   fa = flex_array_alloc(sizeof(info), nr_files,
-   GFP_KERNEL);
-   if (!fa || flex_array_prealloc(fa, 0, nr_files,
-   GFP_KERNEL)) {
+   p = genradix_ptr_alloc(&fa, nr_files++, GFP_KERNEL);
+   if (!p) {
ret = -ENOMEM;
-   if (fa)
-   flex_array_free(fa);
up_read(&mm->mmap_sem);
mmput(mm);
goto out_put_task;
}
-   for (i = 0, vma = mm->mmap, pos = 2; vma;
-   vma = vma->vm_next) {
-   if (!vma->vm_file)
-   continue;
-   if (++pos <= ctx->pos)
-   continue;
 
-   info.start = vma->vm_start;
-   info.end = vma->vm_end;
-   info.mode = vma->vm_file->f_mode;
-   if (flex_array_put(fa, i++, &info, GFP_KERNEL))
-   BUG();
-   }
+   p->start = vma->vm_start;
+   p->end = vma->vm_end;
+   p->mode = vma->vm_file->f_mode;
}
up_read(&mm->mmap_sem);
mmput(mm);
@@ -2201,7 +2189,7 @@ proc_map_files_readdir(struct file *file, struct 
dir_context *ctx)
char buf[4 * sizeof(long) + 2]; /* max: %lx-%lx\0 */
unsigned int len;
 
-   p = flex_array_get(fa, i);
+   p = genradix_ptr(&fa, i);
len = snprintf(buf, sizeof(buf), "%lx-%lx", p->start, p->end);
if (!proc_fill_cache(file, ctx,
  buf, len,
@@ -2211,12 +2199,11 @@ proc_map_files_readdir(struct file *file, struct 
dir_context *ctx)
break;
ctx->pos++;
}
-   if (fa)
-   flex_array_free(fa);
 
 out_put_task:
put_task_struct(task);
 out:
+   genradix_free(&fa);
return ret;
 }
 
-- 
2.19.0.rc2



[PATCH 4/6] Generic radix trees

2018-09-07 Thread Kent Overstreet
Very simple radix tree implementation that supports storing arbitrary
size entries, up to PAGE_SIZE - upcoming patches will convert existing
flex_array users to genradixes. The new genradix code has a much simpler
API and implementation, and doesn't have a hard limit on the number of
elements like flex_array does.

Signed-off-by: Kent Overstreet 
---
 include/linux/generic-radix-tree.h | 222 +
 lib/Makefile   |   3 +-
 lib/generic-radix-tree.c   | 180 +++
 3 files changed, 404 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/generic-radix-tree.h
 create mode 100644 lib/generic-radix-tree.c

diff --git a/include/linux/generic-radix-tree.h 
b/include/linux/generic-radix-tree.h
new file mode 100644
index 00..3328813322
--- /dev/null
+++ b/include/linux/generic-radix-tree.h
@@ -0,0 +1,222 @@
+#ifndef _LINUX_GENERIC_RADIX_TREE_H
+#define _LINUX_GENERIC_RADIX_TREE_H
+
+/*
+ * Generic radix trees/sparse arrays:
+ *
+ * Very simple and minimalistic, supporting arbitrary size entries up to
+ * PAGE_SIZE.
+ *
+ * A genradix is defined with the type it will store, like so:
+ * static GENRADIX(struct foo) foo_genradix;
+ *
+ * The main operations are:
+ * - genradix_init(radix) - initialize an empty genradix
+ *
+ * - genradix_free(radix) - free all memory owned by the genradix and
+ *   reinitialize it
+ *
+ * - genradix_ptr(radix, idx) - gets a pointer to the entry at idx, returning
+ *   NULL if that entry does not exist
+ *
+ * - genradix_ptr_alloc(radix, idx, gfp) - gets a pointer to an entry,
+ *   allocating it if necessary
+ *
+ * - genradix_for_each(radix, iter, p) - iterate over each entry in a genradix
+ *
+ * The radix tree allocates one page of entries at a time, so entries may exist
+ * that were never explicitly allocated - they will be initialized to all
+ * zeroes.
+ *
+ * Internally, a genradix is just a radix tree of pages, and indexing works in
+ * terms of byte offsets. The wrappers in this header file use sizeof on the
+ * type the radix contains to calculate a byte offset from the index - see
+ * __idx_to_offset.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+struct genradix_node;
+
+struct __genradix {
+   struct genradix_node*root;
+   size_t  depth;
+};
+
+#define __GENRADIX_INITIALIZER \
+   {   \
+   .tree = {   \
+   .root = NULL,   \
+   .depth = 0, \
+   }   \
+   }
+
+/*
+ * We use a 0 size array to stash the type we're storing without taking any
+ * space at runtime - then the various accessor macros can use typeof() to get
+ * to it for casts/sizeof - we also force the alignment so that storing a type
+ * with a ridiculous alignment doesn't blow up the alignment or size of the
+ * genradix.
+ */
+
+#define GENRADIX(_type)\
+struct {   \
+   struct __genradix   tree;   \
+   _type   type[0] __aligned(1);   \
+}
+
+#define DEFINE_GENRADIX(_name, _type)  \
+   GENRADIX(_type) _name = __GENRADIX_INITIALIZER
+
+/**
+ * genradix_init - initialize a genradix
+ * @_radix:genradix to initialize
+ *
+ * Does not fail
+ */
+#define genradix_init(_radix)  \
+do {   \
+   *(_radix) = (typeof(*_radix)) __GENRADIX_INITIALIZER;   \
+} while (0)
+
+void __genradix_free(struct __genradix *);
+
+/**
+ * genradix_free: free all memory owned by a genradix
+ *
+ * After freeing, @_radix will be reinitialized and empty
+ */
+#define genradix_free(_radix)  __genradix_free(&(_radix)->tree)
+
+static inline size_t __idx_to_offset(size_t idx, size_t obj_size)
+{
+   if (__builtin_constant_p(obj_size))
+   BUILD_BUG_ON(obj_size > PAGE_SIZE);
+   else
+   BUG_ON(obj_size > PAGE_SIZE);
+
+   if (!is_power_of_2(obj_size)) {
+   size_t objs_per_page = PAGE_SIZE / obj_size;
+
+   return (idx / objs_per_page) * PAGE_SIZE +
+   (idx % objs_per_page) * obj_size;
+   } else {
+   return idx * obj_size;
+   }
+}
+
+#define __genradix_cast(_radix)(typeof((_radix)->type[0]) *)
+#define __genradix_obj_size(_radix)sizeof((_radix)->type[0])
+#define __genradix_idx_to_offset(_radix, _idx) \
+   __idx_to_offset(_idx, __genradix_obj_size(_radix))
+
+void *__genradix_ptr(struct __genradix *, size_t);
+
+/**
+ * genradix_ptr - get a pointer to a genradix entry
+ * @
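
As a quick illustration of the API documented in the header comment above, a
hypothetical caller (struct foo and the index are made up; only operations
named in this patch are used):

static GENRADIX(struct foo) foo_genradix;

static int foo_example(void)
{
	struct foo *p;

	genradix_init(&foo_genradix);

	/* Allocates the page backing entry 273 if needed; new entries are
	 * zeroed. */
	p = genradix_ptr_alloc(&foo_genradix, 273, GFP_KERNEL);
	if (!p)
		return -ENOMEM;
	p->bar = 1;

	/* Lookup only: returns NULL if the entry was never allocated. */
	p = genradix_ptr(&foo_genradix, 273);

	genradix_free(&foo_genradix);	/* frees everything, reinitializes */
	return 0;
}

For a hypothetical 40-byte struct foo with 4K pages, __idx_to_offset() packs
102 entries per page, so index 273 resolves to the third page at byte offset
2 * 4096 + 69 * 40.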

[PATCH 6/6] Drop flex_arrays

2018-09-07 Thread Kent Overstreet
All existing users have been converted to generic radix trees

Signed-off-by: Kent Overstreet 
Acked-by: Dave Hansen 
---
 Documentation/core-api/flexible-arrays.rst | 130 ---
 Documentation/flexible-arrays.txt  | 123 ---
 include/linux/flex_array.h | 149 
 include/linux/poison.h |   3 -
 lib/Makefile   |   2 +-
 lib/flex_array.c   | 398 -
 tools/include/linux/poison.h   |   3 -
 7 files changed, 1 insertion(+), 807 deletions(-)
 delete mode 100644 Documentation/core-api/flexible-arrays.rst
 delete mode 100644 Documentation/flexible-arrays.txt
 delete mode 100644 include/linux/flex_array.h
 delete mode 100644 lib/flex_array.c

diff --git a/Documentation/core-api/flexible-arrays.rst 
b/Documentation/core-api/flexible-arrays.rst
deleted file mode 100644
index b6b85a1b51..00
--- a/Documentation/core-api/flexible-arrays.rst
+++ /dev/null
@@ -1,130 +0,0 @@
-
-===
-Using flexible arrays in the kernel
-===
-
-Large contiguous memory allocations can be unreliable in the Linux kernel.
-Kernel programmers will sometimes respond to this problem by allocating
-pages with :c:func:`vmalloc()`.  This solution not ideal, though.  On 32-bit
-systems, memory from vmalloc() must be mapped into a relatively small address
-space; it's easy to run out.  On SMP systems, the page table changes required
-by vmalloc() allocations can require expensive cross-processor interrupts on
-all CPUs.  And, on all systems, use of space in the vmalloc() range increases
-pressure on the translation lookaside buffer (TLB), reducing the performance
-of the system.
-
-In many cases, the need for memory from vmalloc() can be eliminated by piecing
-together an array from smaller parts; the flexible array library exists to make
-this task easier.
-
-A flexible array holds an arbitrary (within limits) number of fixed-sized
-objects, accessed via an integer index.  Sparse arrays are handled
-reasonably well.  Only single-page allocations are made, so memory
-allocation failures should be relatively rare.  The down sides are that the
-arrays cannot be indexed directly, individual object size cannot exceed the
-system page size, and putting data into a flexible array requires a copy
-operation.  It's also worth noting that flexible arrays do no internal
-locking at all; if concurrent access to an array is possible, then the
-caller must arrange for appropriate mutual exclusion.
-
-The creation of a flexible array is done with :c:func:`flex_array_alloc()`::
-
-#include 
-
-struct flex_array *flex_array_alloc(int element_size,
-   unsigned int total,
-   gfp_t flags);
-
-The individual object size is provided by ``element_size``, while total is the
-maximum number of objects which can be stored in the array.  The flags
-argument is passed directly to the internal memory allocation calls.  With
-the current code, using flags to ask for high memory is likely to lead to
-notably unpleasant side effects.
-
-It is also possible to define flexible arrays at compile time with::
-
-DEFINE_FLEX_ARRAY(name, element_size, total);
-
-This macro will result in a definition of an array with the given name; the
-element size and total will be checked for validity at compile time.
-
-Storing data into a flexible array is accomplished with a call to
-:c:func:`flex_array_put()`::
-
-int flex_array_put(struct flex_array *array, unsigned int element_nr,
-  void *src, gfp_t flags);
-
-This call will copy the data from src into the array, in the position
-indicated by ``element_nr`` (which must be less than the maximum specified when
-the array was created).  If any memory allocations must be performed, flags
-will be used.  The return value is zero on success, a negative error code
-otherwise.
-
-There might possibly be a need to store data into a flexible array while
-running in some sort of atomic context; in this situation, sleeping in the
-memory allocator would be a bad thing.  That can be avoided by using
-``GFP_ATOMIC`` for the flags value, but, often, there is a better way.  The
-trick is to ensure that any needed memory allocations are done before
-entering atomic context, using :c:func:`flex_array_prealloc()`::
-
-int flex_array_prealloc(struct flex_array *array, unsigned int start,
-   unsigned int nr_elements, gfp_t flags);
-
-This function will ensure that memory for the elements indexed in the range
-defined by ``start`` and ``nr_elements`` has been allocated.  Thereafter, a
-``flex_array_put()`` call on an element in that range is guaranteed not to
-block.
-
-Getting data back out of the array is done with :c:func:`flex_array_get()`::
-
-void *flex_array_get(struct flex_array *fa, unsigned int element_nr
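
(For context, the usage pattern the deleted document describes, condensed
into one hypothetical snippet with error handling trimmed:)

	struct flex_array *fa;
	struct item tmp, *p;	/* `struct item' is illustrative only */

	fa = flex_array_alloc(sizeof(struct item), nr_items, GFP_KERNEL);
	flex_array_prealloc(fa, 0, nr_items, GFP_KERNEL); /* puts won't sleep */
	flex_array_put(fa, 0, &tmp, GFP_KERNEL);	/* copies tmp in */
	p = flex_array_get(fa, 0);			/* pointer back out */
	flex_array_free(fa);

The copy-in/copy-out flex_array_put()/flex_array_get() pair is exactly what
the kvmalloc and genradix conversions elsewhere in the series replace with
plain pointers.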

[PATCH 2/6] md: convert to kvmalloc

2018-09-07 Thread Kent Overstreet
The code really just wants a big flat buffer, so just do that.

Signed-off-by: Kent Overstreet 
Cc: Shaohua Li 
Cc: linux-r...@vger.kernel.org
---
 drivers/md/raid5-ppl.c |  7 ++--
 drivers/md/raid5.c | 82 +++---
 drivers/md/raid5.h |  9 ++---
 3 files changed, 45 insertions(+), 53 deletions(-)

diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 3a7c363265..5911810101 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -16,7 +16,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include "md.h"
@@ -165,7 +164,7 @@ ops_run_partial_parity(struct stripe_head *sh, struct 
raid5_percpu *percpu,
   struct dma_async_tx_descriptor *tx)
 {
int disks = sh->disks;
-   struct page **srcs = flex_array_get(percpu->scribble, 0);
+   struct page **srcs = percpu->scribble;
int count = 0, pd_idx = sh->pd_idx, i;
struct async_submit_ctl submit;
 
@@ -196,8 +195,8 @@ ops_run_partial_parity(struct stripe_head *sh, struct 
raid5_percpu *percpu,
}
 
init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, tx,
- NULL, sh, flex_array_get(percpu->scribble, 0)
- + sizeof(struct page *) * (sh->disks + 2));
+ NULL, sh, percpu->scribble +
+ sizeof(struct page *) * (sh->disks + 2));
 
if (count == 1)
tx = async_memcpy(sh->ppl_page, srcs[0], 0, 0, PAGE_SIZE,
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2031506a0e..d5603946dc 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -54,7 +54,6 @@
 #include 
 #include 
 #include 
-#include 
 
 #include 
 #include 
@@ -1399,19 +1398,14 @@ static void ops_complete_compute(void *stripe_head_ref)
 static addr_conv_t *to_addr_conv(struct stripe_head *sh,
 struct raid5_percpu *percpu, int i)
 {
-   void *addr;
-
-   addr = flex_array_get(percpu->scribble, i);
-   return addr + sizeof(struct page *) * (sh->disks + 2);
+   return percpu->scribble + i * percpu->scribble_obj_size +
+   sizeof(struct page *) * (sh->disks + 2);
 }
 
 /* return a pointer to the address conversion region of the scribble buffer */
 static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
 {
-   void *addr;
-
-   addr = flex_array_get(percpu->scribble, i);
-   return addr;
+   return percpu->scribble + i * percpu->scribble_obj_size;
 }
 
 static struct dma_async_tx_descriptor *
@@ -2240,21 +2234,23 @@ static int grow_stripes(struct r5conf *conf, int num)
  * calculate over all devices (not just the data blocks), using zeros in place
  * of the P and Q blocks.
  */
-static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
+static int scribble_alloc(struct raid5_percpu *percpu,
+ int num, int cnt, gfp_t flags)
 {
-   struct flex_array *ret;
-   size_t len;
+   size_t obj_size =
+   sizeof(struct page *) * (num+2) +
+   sizeof(addr_conv_t) * (num+2);
+   void *scribble;
 
-   len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
-   ret = flex_array_alloc(len, cnt, flags);
-   if (!ret)
-   return NULL;
-   /* always prealloc all elements, so no locking is required */
-   if (flex_array_prealloc(ret, 0, cnt, flags)) {
-   flex_array_free(ret);
-   return NULL;
-   }
-   return ret;
+   scribble = kvmalloc_array(cnt, obj_size, flags);
+   if (!scribble)
+   return -ENOMEM;
+
+   kvfree(percpu->scribble);
+
+   percpu->scribble = scribble;
+   percpu->scribble_obj_size = obj_size;
+   return 0;
 }
 
 static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
@@ -2272,23 +2268,18 @@ static int resize_chunks(struct r5conf *conf, int 
new_disks, int new_sectors)
return 0;
mddev_suspend(conf->mddev);
get_online_cpus();
+
for_each_present_cpu(cpu) {
struct raid5_percpu *percpu;
-   struct flex_array *scribble;
 
percpu = per_cpu_ptr(conf->percpu, cpu);
-   scribble = scribble_alloc(new_disks,
- new_sectors / STRIPE_SECTORS,
- GFP_NOIO);
-
-   if (scribble) {
-   flex_array_free(percpu->scribble);
-   percpu->scribble = scribble;
-   } else {
-   err = -ENOMEM;
+   err = scribble_alloc(percpu, new_disks,
+new_sectors / STRIPE_SECTORS,
+GFP_NOIO);
+   if (err)
break;
-  

[PATCH 3/6] selinux: convert to kvmalloc

2018-09-07 Thread Kent Overstreet
The flex arrays were being used for constant sized arrays, so there's no
benefit to using flex_arrays over something simpler.

Signed-off-by: Kent Overstreet 
Cc: linux-security-mod...@vger.kernel.org
---
 security/selinux/ss/avtab.c   |  40 +-
 security/selinux/ss/avtab.h   |   4 +-
 security/selinux/ss/conditional.c |   6 +-
 security/selinux/ss/policydb.c| 122 --
 security/selinux/ss/policydb.h|  12 +--
 security/selinux/ss/services.c|  22 ++
 6 files changed, 62 insertions(+), 144 deletions(-)

diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c
index a2c9148b06..5a7fd5f0b7 100644
--- a/security/selinux/ss/avtab.c
+++ b/security/selinux/ss/avtab.c
@@ -93,12 +93,10 @@ avtab_insert_node(struct avtab *h, int hvalue,
newnode->next = prev->next;
prev->next = newnode;
} else {
-   newnode->next = flex_array_get_ptr(h->htable, hvalue);
-   if (flex_array_put_ptr(h->htable, hvalue, newnode,
-  GFP_KERNEL|__GFP_ZERO)) {
-   kmem_cache_free(avtab_node_cachep, newnode);
-   return NULL;
-   }
+   struct avtab_node **n = &h->htable[hvalue];
+
+   newnode->next = *n;
+   *n = newnode;
}
 
h->nel++;
@@ -111,11 +109,11 @@ static int avtab_insert(struct avtab *h, struct avtab_key 
*key, struct avtab_dat
struct avtab_node *prev, *cur, *newnode;
u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD);
 
-   if (!h || !h->htable)
+   if (!h)
return -EINVAL;
 
hvalue = avtab_hash(key, h->mask);
-   for (prev = NULL, cur = flex_array_get_ptr(h->htable, hvalue);
+   for (prev = NULL, cur = h->htable[hvalue];
 cur;
 prev = cur, cur = cur->next) {
if (key->source_type == cur->key.source_type &&
@@ -156,10 +154,10 @@ avtab_insert_nonunique(struct avtab *h, struct avtab_key 
*key, struct avtab_datu
struct avtab_node *prev, *cur;
u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD);
 
-   if (!h || !h->htable)
+   if (!h)
return NULL;
hvalue = avtab_hash(key, h->mask);
-   for (prev = NULL, cur = flex_array_get_ptr(h->htable, hvalue);
+   for (prev = NULL, cur = h->htable[hvalue];
 cur;
 prev = cur, cur = cur->next) {
if (key->source_type == cur->key.source_type &&
@@ -186,11 +184,11 @@ struct avtab_datum *avtab_search(struct avtab *h, struct 
avtab_key *key)
struct avtab_node *cur;
u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD);
 
-   if (!h || !h->htable)
+   if (!h)
return NULL;
 
hvalue = avtab_hash(key, h->mask);
-   for (cur = flex_array_get_ptr(h->htable, hvalue); cur;
+   for (cur = h->htable[hvalue]; cur;
 cur = cur->next) {
if (key->source_type == cur->key.source_type &&
key->target_type == cur->key.target_type &&
@@ -222,11 +220,11 @@ avtab_search_node(struct avtab *h, struct avtab_key *key)
struct avtab_node *cur;
u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD);
 
-   if (!h || !h->htable)
+   if (!h)
return NULL;
 
hvalue = avtab_hash(key, h->mask);
-   for (cur = flex_array_get_ptr(h->htable, hvalue); cur;
+   for (cur = h->htable[hvalue]; cur;
 cur = cur->next) {
if (key->source_type == cur->key.source_type &&
key->target_type == cur->key.target_type &&
@@ -281,11 +279,11 @@ void avtab_destroy(struct avtab *h)
int i;
struct avtab_node *cur, *temp;
 
-   if (!h || !h->htable)
+   if (!h)
return;
 
for (i = 0; i < h->nslot; i++) {
-   cur = flex_array_get_ptr(h->htable, i);
+   cur = h->htable[i];
while (cur) {
temp = cur;
cur = cur->next;
@@ -295,7 +293,7 @@ void avtab_destroy(struct avtab *h)
kmem_cache_free(avtab_node_cachep, temp);
}
}
-   flex_array_free(h->htable);
+   kvfree(h->htable);
h->htable = NULL;
h->nslot = 0;
h->mask = 0;
@@ -303,6 +301,7 @@ void avtab_destroy(struct avtab *h)
 
 int avtab_init(struct avtab *h)
 {
+   kvfree(h->htable);
h->htable = NULL;
h->nel = 0;
return 0;
@@ -329,8 +328,7 @@ int avtab_alloc(struct avtab *h, u32 nrules)
nslot = MAX_AVTAB_HASH_

[PATCH 1/6] openvswitch: convert to kvmalloc

2018-09-07 Thread Kent Overstreet
There was no real need for this code to be using flexarrays, it's just
implementing a hash table - ideally it would be using rhashtables, but
that conversion would be significantly more complicated.

Signed-off-by: Kent Overstreet 
Cc: Pravin B Shelar 
Cc: d...@openvswitch.org
---
 net/openvswitch/flow.h |  1 -
 net/openvswitch/flow_netlink.h |  1 -
 net/openvswitch/flow_table.c   | 51 --
 net/openvswitch/flow_table.h   |  3 +-
 4 files changed, 13 insertions(+), 43 deletions(-)

diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index c670dd24b8..4f06278166 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -30,7 +30,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 6657606b2b..66f9553758 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -30,7 +30,6 @@
 #include 
 #include 
 #include 
-#include 
 
 #include 
 #include 
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index 80ea2a7185..cfb0098c9a 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -111,29 +111,6 @@ int ovs_flow_tbl_count(const struct flow_table *table)
return table->count;
 }
 
-static struct flex_array *alloc_buckets(unsigned int n_buckets)
-{
-   struct flex_array *buckets;
-   int i, err;
-
-   buckets = flex_array_alloc(sizeof(struct hlist_head),
-  n_buckets, GFP_KERNEL);
-   if (!buckets)
-   return NULL;
-
-   err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL);
-   if (err) {
-   flex_array_free(buckets);
-   return NULL;
-   }
-
-   for (i = 0; i < n_buckets; i++)
-   INIT_HLIST_HEAD((struct hlist_head *)
-   flex_array_get(buckets, i));
-
-   return buckets;
-}
-
 static void flow_free(struct sw_flow *flow)
 {
int cpu;
@@ -168,31 +145,30 @@ void ovs_flow_free(struct sw_flow *flow, bool deferred)
flow_free(flow);
 }
 
-static void free_buckets(struct flex_array *buckets)
-{
-   flex_array_free(buckets);
-}
-
-
 static void __table_instance_destroy(struct table_instance *ti)
 {
-   free_buckets(ti->buckets);
+   kvfree(ti->buckets);
kfree(ti);
 }
 
 static struct table_instance *table_instance_alloc(int new_size)
 {
struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
+   int i;
 
if (!ti)
return NULL;
 
-   ti->buckets = alloc_buckets(new_size);
-
+   ti->buckets = kvmalloc_array(new_size, sizeof(struct hlist_head),
+GFP_KERNEL);
if (!ti->buckets) {
kfree(ti);
return NULL;
}
+
+   for (i = 0; i < new_size; i++)
+   INIT_HLIST_HEAD(&ti->buckets[i]);
+
ti->n_buckets = new_size;
ti->node_ver = 0;
ti->keep_flows = false;
@@ -249,7 +225,7 @@ static void table_instance_destroy(struct table_instance 
*ti,
 
for (i = 0; i < ti->n_buckets; i++) {
struct sw_flow *flow;
-   struct hlist_head *head = flex_array_get(ti->buckets, i);
+   struct hlist_head *head = &ti->buckets[i];
struct hlist_node *n;
int ver = ti->node_ver;
int ufid_ver = ufid_ti->node_ver;
@@ -294,7 +270,7 @@ struct sw_flow *ovs_flow_tbl_dump_next(struct 
table_instance *ti,
ver = ti->node_ver;
while (*bucket < ti->n_buckets) {
i = 0;
-   head = flex_array_get(ti->buckets, *bucket);
+   head = &ti->buckets[*bucket];
hlist_for_each_entry_rcu(flow, head, flow_table.node[ver]) {
if (i < *last) {
i++;
@@ -313,8 +289,7 @@ struct sw_flow *ovs_flow_tbl_dump_next(struct 
table_instance *ti,
 static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash)
 {
hash = jhash_1word(hash, ti->hash_seed);
-   return flex_array_get(ti->buckets,
-   (hash & (ti->n_buckets - 1)));
+   return &ti->buckets[hash & (ti->n_buckets - 1)];
 }
 
 static void table_instance_insert(struct table_instance *ti,
@@ -347,9 +322,7 @@ static void flow_table_copy_flows(struct table_instance 
*old,
/* Insert in new table. */
for (i = 0; i < old->n_buckets; i++) {
struct sw_flow *flow;
-   struct hlist_head *head;
-
-   head = flex_array_get(old->buckets, i);
+   struct hlist_head *head = &old->buckets[i];
 
if (ufid)
hlist_for_each_entry(flow, head,
diff --git a/net/openvswitch/flow_table.h b/ne

[PATCH 0/6] flex_arrays -> genradix; prep work for bcachefs

2018-09-07 Thread Kent Overstreet
Generic radix trees are a dead simple radix tree implementation that can store
types of different sizes, needed for bcachefs.

The patch series was sent out previously and was pretty uncontroversial - this
is a respin that converts most users to just use kvmalloc.

Kent Overstreet (6):
  openvswitch: convert to kvmalloc
  md: convert to kvmalloc
  selinux: convert to kvmalloc
  Generic radix trees
  proc: commit to genradix
  Drop flex_arrays

 Documentation/core-api/flexible-arrays.rst | 130 ---
 Documentation/flexible-arrays.txt  | 123 ---
 drivers/md/raid5-ppl.c |   7 +-
 drivers/md/raid5.c |  82 ++---
 drivers/md/raid5.h |   9 +-
 fs/proc/base.c |  43 +--
 include/linux/flex_array.h | 149 
 include/linux/generic-radix-tree.h | 222 
 include/linux/poison.h |   3 -
 lib/Makefile   |   5 +-
 lib/flex_array.c   | 398 -
 lib/generic-radix-tree.c   | 180 ++
 net/openvswitch/flow.h |   1 -
 net/openvswitch/flow_netlink.h |   1 -
 net/openvswitch/flow_table.c   |  51 +--
 net/openvswitch/flow_table.h   |   3 +-
 security/selinux/ss/avtab.c|  40 +--
 security/selinux/ss/avtab.h|   4 +-
 security/selinux/ss/conditional.c  |   6 +-
 security/selinux/ss/policydb.c | 122 ++-
 security/selinux/ss/policydb.h |  12 +-
 security/selinux/ss/services.c |  22 +-
 tools/include/linux/poison.h   |   3 -
 23 files changed, 540 insertions(+), 1076 deletions(-)
 delete mode 100644 Documentation/core-api/flexible-arrays.rst
 delete mode 100644 Documentation/flexible-arrays.txt
 delete mode 100644 include/linux/flex_array.h
 create mode 100644 include/linux/generic-radix-tree.h
 delete mode 100644 lib/flex_array.c
 create mode 100644 lib/generic-radix-tree.c

-- 
2.19.0.rc2


