Re: [PATCH 09/19] zonefs: remove duplicate cleanup in zonefs_fill_super

2023-09-13 Thread Damien Le Moal
On 9/13/23 20:10, Christoph Hellwig wrote:
> When ->fill_super fails, ->kill_sb is called which already cleans up
> the inodes and zgroups.
> 
> Drop the extra cleanup code in zonefs_fill_super.
> 
> Signed-off-by: Christoph Hellwig 

Looks good to me.

Acked-by: Damien Le Moal 

> ---
>  fs/zonefs/super.c | 21 +
>  1 file changed, 5 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
> index 9d1a9808fbbba6..35b2554ce2ac2e 100644
> --- a/fs/zonefs/super.c
> +++ b/fs/zonefs/super.c
> @@ -1309,13 +1309,12 @@ static int zonefs_fill_super(struct super_block *sb, 
> void *data, int silent)
>   /* Initialize the zone groups */
>   ret = zonefs_init_zgroups(sb);
>   if (ret)
> - goto cleanup;
> + return ret;
>  
>   /* Create the root directory inode */
> - ret = -ENOMEM;
>   inode = new_inode(sb);
>   if (!inode)
> - goto cleanup;
> + return -ENOMEM;
>  
>   inode->i_ino = bdev_nr_zones(sb->s_bdev);
>   inode->i_mode = S_IFDIR | 0555;
> @@ -1333,7 +1332,7 @@ static int zonefs_fill_super(struct super_block *sb, 
> void *data, int silent)
>  
>   sb->s_root = d_make_root(inode);
>   if (!sb->s_root)
> - goto cleanup;
> + return -ENOMEM;
>  
>   /*
>* Take a reference on the zone groups directory inodes
> @@ -1341,19 +1340,9 @@ static int zonefs_fill_super(struct super_block *sb, 
> void *data, int silent)
>*/
>   ret = zonefs_get_zgroup_inodes(sb);
>   if (ret)
> - goto cleanup;
> -
> - ret = zonefs_sysfs_register(sb);
> - if (ret)
> - goto cleanup;
> -
> - return 0;
> -
> -cleanup:
> - zonefs_release_zgroup_inodes(sb);
> - zonefs_free_zgroups(sb);
> + return ret;
>  
> - return ret;
> + return zonefs_sysfs_register(sb);
>  }
>  
>  static struct dentry *zonefs_mount(struct file_system_type *fs_type,

-- 
Damien Le Moal
Western Digital Research



Re: [PATCH v4 1/2] binfmt_flat: allow not offsetting data start

2021-04-16 Thread Damien Le Moal
On 2021/04/17 13:52, Greg Ungerer wrote:
> 
> On 17/4/21 11:10 am, Damien Le Moal wrote:
>> Commit 2217b9826246 ("binfmt_flat: revert "binfmt_flat: don't offset
>> the data start"") restored offsetting the start of the data section by
>> a number of words defined by MAX_SHARED_LIBS. As a result, since
>> MAX_SHARED_LIBS is never 0, a gap between the text and data sections
>> always exists. For architectures which cannot support such a gap
>> between the text and data sections (e.g. riscv nommu), flat binary
>> programs cannot be executed.
>>
>> To allow an architecture to request no data start offset to allow for
>> contiguous text and data sections for binaries flagged with
>> FLAT_FLAG_RAM, introduce the new config option
>> CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET. Using this new option, the
>> macro DATA_START_OFFSET_WORDS is conditionally defined in binfmt_flat.c
>> to MAX_SHARED_LIBS for architectures tolerating or needing the data
>> start offset (CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET disabled case)
>> and to 0 when CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET is enabled.
>> DATA_START_OFFSET_WORDS is used in load_flat_file() to calculate the
>> data section length and start position.
>>
>> Signed-off-by: Damien Le Moal 
>> ---
>>   fs/Kconfig.binfmt |  3 +++
>>   fs/binfmt_flat.c  | 19 ++-
>>   2 files changed, 17 insertions(+), 5 deletions(-)
>>
>> diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
>> index c6f1c8c1934e..06fb7a93a1bd 100644
>> --- a/fs/Kconfig.binfmt
>> +++ b/fs/Kconfig.binfmt
>> @@ -112,6 +112,9 @@ config BINFMT_FLAT_ARGVP_ENVP_ON_STACK
>>   config BINFMT_FLAT_OLD_ALWAYS_RAM
>>  bool
>>   
>> +config BINFMT_FLAT_NO_DATA_START_OFFSET
>> +bool
>> +
>>   config BINFMT_FLAT_OLD
>>  bool "Enable support for very old legacy flat binaries"
>>  depends on BINFMT_FLAT
>> diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
>> index b9c658e0548e..1dc68dfba3e0 100644
>> --- a/fs/binfmt_flat.c
>> +++ b/fs/binfmt_flat.c
>> @@ -74,6 +74,12 @@
>>   #defineMAX_SHARED_LIBS (1)
>>   #endif
>>   
>> +#ifdef CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET
>> +#define DATA_START_OFFSET_WORDS (0)
>> +#else
>> +#define DATA_START_OFFSET_WORDS (MAX_SHARED_LIBS)
>> +#endif
>> +
>>   struct lib_info {
>>  struct {
>>  unsigned long start_code;   /* Start of text 
>> segment */
>> @@ -560,6 +566,7 @@ static int load_flat_file(struct linux_binprm *bprm,
>>   * it all together.
>>   */
>>  if (!IS_ENABLED(CONFIG_MMU) && !(flags & 
>> (FLAT_FLAG_RAM|FLAT_FLAG_GZIP))) {
>> +
> 
> Random white space change...
> Don't worry about re-spinning though, I will just edit this chunk out.

Oops. Sorry about that. I should have better checked :)

> 
> 
>>  /*
>>   * this should give us a ROM ptr,  but if it doesn't we don't
>>   * really care
>> @@ -576,7 +583,8 @@ static int load_flat_file(struct linux_binprm *bprm,
>>  goto err;
>>  }
>>   
>> -len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned 
>> long);
>> +len = data_len + extra +
>> +DATA_START_OFFSET_WORDS * sizeof(unsigned long);
>>  len = PAGE_ALIGN(len);
>>  realdatastart = vm_mmap(NULL, 0, len,
>>  PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
>> @@ -591,7 +599,7 @@ static int load_flat_file(struct linux_binprm *bprm,
>>  goto err;
>>  }
>>  datapos = ALIGN(realdatastart +
>> -MAX_SHARED_LIBS * sizeof(unsigned long),
>> +DATA_START_OFFSET_WORDS * sizeof(unsigned long),
>>  FLAT_DATA_ALIGN);
>>   
>>  pr_debug("Allocated data+bss+stack (%u bytes): %lx\n",
>> @@ -622,7 +630,8 @@ static int load_flat_file(struct linux_binprm *bprm,
>>  memp_size = len;
>>  } else {
>>   
>> -len = text_len + data_len + extra + MAX_SHARED_LIBS * 
>> sizeof(u32);
>> +len = text_len + data_len + extra +
>> +DATA_START_OFFSET_WORDS * sizeof(u32);
>>  len = PAGE_ALIGN(len);
>>  textpos = vm_mmap(NULL, 0, len,
>>  PROT_READ | PROT_E

[PATCH v4 1/2] binfmt_flat: allow not offsetting data start

2021-04-16 Thread Damien Le Moal
Commit 2217b9826246 ("binfmt_flat: revert "binfmt_flat: don't offset
the data start"") restored offsetting the start of the data section by
a number of words defined by MAX_SHARED_LIBS. As a result, since
MAX_SHARED_LIBS is never 0, a gap between the text and data sections
always exists. For architectures which cannot support such a gap
between the text and data sections (e.g. riscv nommu), flat binary
programs cannot be executed.

To allow an architecture to request no data start offset to allow for
contiguous text and data sections for binaries flagged with
FLAT_FLAG_RAM, introduce the new config option
CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET. Using this new option, the
macro DATA_START_OFFSET_WORDS is conditionally defined in binfmt_flat.c
to MAX_SHARED_LIBS for architectures tolerating or needing the data
start offset (CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET disabled case)
and to 0 when CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET is enabled.
DATA_START_OFFSET_WORDS is used in load_flat_file() to calculate the
data section length and start position.

Signed-off-by: Damien Le Moal 
---
 fs/Kconfig.binfmt |  3 +++
 fs/binfmt_flat.c  | 19 ++-
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index c6f1c8c1934e..06fb7a93a1bd 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -112,6 +112,9 @@ config BINFMT_FLAT_ARGVP_ENVP_ON_STACK
 config BINFMT_FLAT_OLD_ALWAYS_RAM
bool
 
+config BINFMT_FLAT_NO_DATA_START_OFFSET
+   bool
+
 config BINFMT_FLAT_OLD
bool "Enable support for very old legacy flat binaries"
depends on BINFMT_FLAT
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index b9c658e0548e..1dc68dfba3e0 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -74,6 +74,12 @@
 #defineMAX_SHARED_LIBS (1)
 #endif
 
+#ifdef CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET
+#define DATA_START_OFFSET_WORDS(0)
+#else
+#define DATA_START_OFFSET_WORDS(MAX_SHARED_LIBS)
+#endif
+
 struct lib_info {
struct {
unsigned long start_code;   /* Start of text 
segment */
@@ -560,6 +566,7 @@ static int load_flat_file(struct linux_binprm *bprm,
 * it all together.
 */
if (!IS_ENABLED(CONFIG_MMU) && !(flags & 
(FLAT_FLAG_RAM|FLAT_FLAG_GZIP))) {
+
/*
 * this should give us a ROM ptr,  but if it doesn't we don't
 * really care
@@ -576,7 +583,8 @@ static int load_flat_file(struct linux_binprm *bprm,
goto err;
}
 
-   len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned 
long);
+   len = data_len + extra +
+   DATA_START_OFFSET_WORDS * sizeof(unsigned long);
len = PAGE_ALIGN(len);
realdatastart = vm_mmap(NULL, 0, len,
PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
@@ -591,7 +599,7 @@ static int load_flat_file(struct linux_binprm *bprm,
goto err;
}
datapos = ALIGN(realdatastart +
-   MAX_SHARED_LIBS * sizeof(unsigned long),
+   DATA_START_OFFSET_WORDS * sizeof(unsigned long),
FLAT_DATA_ALIGN);
 
pr_debug("Allocated data+bss+stack (%u bytes): %lx\n",
@@ -622,7 +630,8 @@ static int load_flat_file(struct linux_binprm *bprm,
memp_size = len;
} else {
 
-   len = text_len + data_len + extra + MAX_SHARED_LIBS * 
sizeof(u32);
+   len = text_len + data_len + extra +
+   DATA_START_OFFSET_WORDS * sizeof(u32);
len = PAGE_ALIGN(len);
textpos = vm_mmap(NULL, 0, len,
PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
@@ -638,7 +647,7 @@ static int load_flat_file(struct linux_binprm *bprm,
 
realdatastart = textpos + ntohl(hdr->data_start);
datapos = ALIGN(realdatastart +
-   MAX_SHARED_LIBS * sizeof(u32),
+   DATA_START_OFFSET_WORDS * sizeof(u32),
FLAT_DATA_ALIGN);
 
reloc = (__be32 __user *)
@@ -714,7 +723,7 @@ static int load_flat_file(struct linux_binprm *bprm,
ret = result;
pr_err("Unable to read code+data+bss, errno %d\n", ret);
vm_munmap(textpos, text_len + data_len + extra +
-   MAX_SHARED_LIBS * sizeof(u32));
+ DATA_START_OFFSET_WORDS * sizeof(u32));
goto err;
}
}
-- 
2.30.2



[PATCH v4 2/2] riscv: Disable data start offset in flat binaries

2021-04-16 Thread Damien Le Moal
uclibc/gcc combined with elf2flt riscv linker file fully resolve the
PC relative __global_pointer$ value at compile time and do not generate
a relocation entry to set a correct value of the gp register at runtime.
As a result, if the flatbin loader offsets the start of the data
section, the relative position change between the text and data sections
compared to the compile time positions results in an incorrect gp value
being used. This causes flatbin executables to crash.

Avoid this problem by enabling CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET
automatically when CONFIG_RISCV is enabled and CONFIG_MMU is disabled.

Signed-off-by: Damien Le Moal 
Acked-by: Palmer Dabbelt 
---
 arch/riscv/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 4515a10c5d22..add528eb9235 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -33,6 +33,7 @@ config RISCV
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
+   select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU
select CLONE_BACKWARDS
select CLINT_TIMER if !MMU
select COMMON_CLK
-- 
2.30.2



[PATCH v4 0/2] Fix binfmt_flat loader for RISC-V

2021-04-16 Thread Damien Le Moal
RISC-V NOMMU flat binaries cannot tolerate a gap between the text and
data section as the toolchain fully resolves at compile time the PC
relative global pointer (__global_pointer$ value loaded in the gp
register). Without a relocation entry provided, the flat bin loader
cannot fix the value if a gap is introduced and user executables fail
to run.

This series fixes this problem by allowing an architecture to request
the flat loader to suppress the offset of the data start section.
Combined with the use of elf2flt "-r" option to mark the flat
executables with the FLAT_FLAG_RAM flag, the text and data sections are
loaded contiguously in memory, without a change in their relative
position from compile time.

The first patch fixes binfmt_flat flat_load_file() using the new
configuration option CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET. The
second patch enables this new option for RISCV NOMMU builds.

These patches do not change the binfmt_flat loader behavior for other
architectures.

Changes from v3:
* Renamed the configuration option from
  CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP to
  CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET to clarify that only the
  offsetting of the data section start is suppressed.
* Do not force loading to RAM (contiguously) if the flat binary does not
  have the FLAT_FLAG_RAM flag set.
* Updated commit messages to reflect above changes.

Changes from v2:
* Updated distribution list
* Added Palmer ack-by tag

Changes from v1:
* Replace FLAT_TEXT_DATA_NO_GAP macro with
  CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP config option (patch 1).
* Remove the addition of riscv/include/asm/flat.h and set
  CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP for RISCV and !MMU

Damien Le Moal (2):
  binfmt_flat: allow not offsetting data start
  riscv: Disable data start offset in flat binaries

 arch/riscv/Kconfig |  1 +
 fs/Kconfig.binfmt  |  3 +++
 fs/binfmt_flat.c   | 19 ++-
 3 files changed, 18 insertions(+), 5 deletions(-)

-- 
2.30.2



Re: [PATCH v3 1/2] binfmt_flat: allow not offsetting data start

2021-04-16 Thread Damien Le Moal
On 2021/04/16 16:24, Greg Ungerer wrote:
> 
> On 16/4/21 9:22 am, Damien Le Moal wrote:
>> On 2021/04/15 23:04, Greg Ungerer wrote:
>>> Hi Damien,
>>>
>>> On 15/4/21 4:15 pm, Damien Le Moal wrote:
>>>> Commit 2217b9826246 ("binfmt_flat: revert "binfmt_flat: don't offset
>>>> the data start"") restored offsetting the start of the data section by
>>>> a number of words defined by MAX_SHARED_LIBS. As a result, since
>>>> MAX_SHARED_LIBS is never 0, a gap between the text and data sections
>>>> always exists. For architectures which cannot support such a gap
>>>> between the text and data sections (e.g. riscv nommu), flat binary
>>>> programs cannot be executed.
>>>>
>>>> To allow an architecture to request contiguous text and data sections,
>>>> introduce the config option CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP.
>>>> Using this new option, the macro DATA_GAP_WORDS is conditionally
>>>> defined in binfmt_flat.c to MAX_SHARED_LIBS for architectures
>>>> tolerating the text-to-data gap (CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP
>>>> disabled case) and to 0 when CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP is
>>>> enabled. DATA_GAP_WORDS is used in load_flat_file() to calculate the
>>>> data section length and start position.
>>>>
>>>> An architecture enabling CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP also
>>>> prevents the use of the separate text/data load case (when the flat file
>>>> header flags FLAT_FLAG_RAM and FLAT_FLAG_GZIP are not set with NOMMU
>>>> kernels) and forces the use of a single RAM region for loading
>>>> (equivalent to FLAT_FLAG_RAM being set).
>>>
>>> So is it the case that a flat format file on RISC-V will never have
>>> relocations?
>>
>> No, it does have relocations. But there is no entry for the global pointer
>> (__global_pointer$) location. This is because the loading of that value in 
>> the
>> gp register in the C-library crt1.S is done using a PC-relative instruction. 
>> The
>> value for it is resolved at compile time and does not get a relocation table
>> entry. Other functions calls and symbol references do have relocation table
>> entries, so the binary can be loaded anywhere. The missing relocation for the
>> global pointer mandates that text and data be loaded at the same positions
>> relative to each other that the linker file defines. Otherwise, loading of
>> __global_pointer$ into the gp register (first thing that C libraries crt1.S 
>> do)
>> result in a garbage value being loaded.
>>
>> I tried some tricks with the linker file and changing uclibc crt1.S to have 
>> the
>> gp loading done using a symbol address instead of a PC-relative offset. I 
>> could
>> then see a relocation table entry for that symbol. That still did not work 
>> as I
>> was probably doing something wrong. Anyway, such solution requires changing a
>> lot of things in C libraries loading assembler that is common between NOMMU 
>> and
>> MMU code. Changing it would break MMU enabled programs.
>>
>>
>>>> Signed-off-by: Damien Le Moal 
>>>> Acked-by: Palmer Dabbelt 
>>>> ---
>>>>fs/Kconfig.binfmt |  3 +++
>>>>fs/binfmt_flat.c  | 21 +++--
>>>>2 files changed, 18 insertions(+), 6 deletions(-)
>>>>
>>>> diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
>>>> index c6f1c8c1934e..c6df931d5d45 100644
>>>> --- a/fs/Kconfig.binfmt
>>>> +++ b/fs/Kconfig.binfmt
>>>> @@ -112,6 +112,9 @@ config BINFMT_FLAT_ARGVP_ENVP_ON_STACK
>>>>config BINFMT_FLAT_OLD_ALWAYS_RAM
>>>>bool
>>>>
>>>> +config BINFMT_FLAT_NO_TEXT_DATA_GAP
>>>> +  bool
>>>> +
>>>>config BINFMT_FLAT_OLD
>>>>bool "Enable support for very old legacy flat binaries"
>>>>depends on BINFMT_FLAT
>>>> diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
>>>> index b9c658e0548e..2be29bb964b8 100644
>>>> --- a/fs/binfmt_flat.c
>>>> +++ b/fs/binfmt_flat.c
>>>> @@ -74,6 +74,12 @@
>>>>#define MAX_SHARED_LIBS (1)
>>>>#endif
>>>>
>>>> +#ifdef CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP
>>>> +#define DATA_GAP_WORDS(0)
>>>> +#else
>>>> +#define DATA_GAP_WORDS(MA

Re: [PATCH v2 0/2] Fix binfmt_flat loader for RISC-V

2021-04-15 Thread Damien Le Moal
On 2021/04/16 9:22, Al Viro wrote:
> On Thu, Apr 15, 2021 at 07:56:05AM +0200, Christoph Hellwig wrote:
>> binfmt_flat tends to go through Greg's uclinux tree, adding him and
>> the list.
> 
>   FWIW, my involvement with binfmt_flat had been pretty much nil -
> the least trivial had been "binfmt_flat: flat_{get,put}_addr_from_rp()
> should be able to fail" about 4 years ago and that fell out of hunting
> for places where __get_user() had been used without checking error values.
> 
>   It's in fs/*, but I've no way to test it and I have pretty much
> zero familiarity with the guts of that one, so I can't give any useful
> feedback on that series.  So consider the Christoph's comment seconded -
> you want it reviewed by gerg et.al., and it probably ought to go via
> gerg/uclinux.git tree.
> 
>   I'm reasonably familiar with binfmt_{elf,misc,script}; anything
> else gets touched as part of larger series and only with sanity checks
> from other folks, if the changes are not entirely trivial.

Al,

Thanks for the clarification. Would it make sense to have an entry in
MAINTAINERS file pointing to Greg and the uclinux tree for binfmt_flat.c ?
Greg ?


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH v3 1/2] binfmt_flat: allow not offsetting data start

2021-04-15 Thread Damien Le Moal
On 2021/04/15 23:04, Greg Ungerer wrote:
> Hi Damien,
> 
> On 15/4/21 4:15 pm, Damien Le Moal wrote:
>> Commit 2217b9826246 ("binfmt_flat: revert "binfmt_flat: don't offset
>> the data start"") restored offsetting the start of the data section by
>> a number of words defined by MAX_SHARED_LIBS. As a result, since
>> MAX_SHARED_LIBS is never 0, a gap between the text and data sections
>> always exists. For architectures which cannot support such a gap
>> between the text and data sections (e.g. riscv nommu), flat binary
>> programs cannot be executed.
>>
>> To allow an architecture to request contiguous text and data sections,
>> introduce the config option CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP.
>> Using this new option, the macro DATA_GAP_WORDS is conditionally
>> defined in binfmt_flat.c to MAX_SHARED_LIBS for architectures
>> tolerating the text-to-data gap (CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP
>> disabled case) and to 0 when CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP is
>> enabled. DATA_GAP_WORDS is used in load_flat_file() to calculate the
>> data section length and start position.
>>
>> An architecture enabling CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP also
>> prevents the use of the separate text/data load case (when the flat file
>> header flags FLAT_FLAG_RAM and FLAT_FLAG_GZIP are not set with NOMMU
>> kernels) and forces the use of a single RAM region for loading
>> (equivalent to FLAT_FLAG_RAM being set).
> 
> So is it the case that a flat format file on RISC-V will never have
> relocations?

No, it does have relocations. But there is no entry for the global pointer
(__global_pointer$) location. This is because the loading of that value in the
gp register in the C-library crt1.S is done using a PC-relative instruction. The
value for it is resolved at compile time and does not get a relocation table
entry. Other functions calls and symbol references do have relocation table
entries, so the binary can be loaded anywhere. The missing relocation for the
global pointer mandates that text and data be loaded at the same positions
relative to each other that the linker file defines. Otherwise, loading of
__global_pointer$ into the gp register (first thing that C libraries crt1.S do)
result in a garbage value being loaded.

I tried some tricks with the linker file and changing uclibc crt1.S to have the
gp loading done using a symbol address instead of a PC-relative offset. I could
then see a relocation table entry for that symbol. That still did not work as I
was probably doing something wrong. Anyway, such solution requires changing a
lot of things in C libraries loading assembler that is common between NOMMU and
MMU code. Changing it would break MMU enabled programs.


>> Signed-off-by: Damien Le Moal 
>> Acked-by: Palmer Dabbelt 
>> ---
>>   fs/Kconfig.binfmt |  3 +++
>>   fs/binfmt_flat.c  | 21 +++--
>>   2 files changed, 18 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
>> index c6f1c8c1934e..c6df931d5d45 100644
>> --- a/fs/Kconfig.binfmt
>> +++ b/fs/Kconfig.binfmt
>> @@ -112,6 +112,9 @@ config BINFMT_FLAT_ARGVP_ENVP_ON_STACK
>>   config BINFMT_FLAT_OLD_ALWAYS_RAM
>>  bool
>>   
>> +config BINFMT_FLAT_NO_TEXT_DATA_GAP
>> +bool
>> +
>>   config BINFMT_FLAT_OLD
>>  bool "Enable support for very old legacy flat binaries"
>>  depends on BINFMT_FLAT
>> diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
>> index b9c658e0548e..2be29bb964b8 100644
>> --- a/fs/binfmt_flat.c
>> +++ b/fs/binfmt_flat.c
>> @@ -74,6 +74,12 @@
>>   #defineMAX_SHARED_LIBS (1)
>>   #endif
>>   
>> +#ifdef CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP
>> +#define DATA_GAP_WORDS  (0)
>> +#else
>> +#define DATA_GAP_WORDS  (MAX_SHARED_LIBS)
>> +#endif
>> +>   struct lib_info {
>>  struct {
>>  unsigned long start_code;   /* Start of text 
>> segment */
>> @@ -559,7 +565,10 @@ static int load_flat_file(struct linux_binprm *bprm,
>>   * case,  and then the fully copied to RAM case which lumps
>>   * it all together.
>>   */
>> -if (!IS_ENABLED(CONFIG_MMU) && !(flags & 
>> (FLAT_FLAG_RAM|FLAT_FLAG_GZIP))) {
>> +if (!IS_ENABLED(CONFIG_MMU) &&
>> +!IS_ENABLED(CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP) &&
> 
> If RISC-V flat format files must always be loaded to RAM then why don't
> they set the FLAT_FLAG_RAM when compiled/generated?

That is done. The patch I have for elf2flt sets it. Coding it l

Re: [PATCH v2 0/2] Fix binfmt_flat loader for RISC-V

2021-04-15 Thread Damien Le Moal
On 2021/04/15 14:56, Christoph Hellwig wrote:
> binfmt_flat tends to go through Greg's uclinux tree, adding him and
> the list.

Thanks Christoph. I resent the series adding Gerg and uclinux-dev.
MAINTAINERS file needs an update, maybe?

> 
> On Wed, Apr 14, 2021 at 10:46:36PM -0700, Palmer Dabbelt wrote:
>> On Wed, 14 Apr 2021 17:32:10 PDT (-0700), Damien Le Moal wrote:
>>>> On 2021/04/08 0:49, Damien Le Moal wrote:
>>>> RISC-V NOMMU flat binaries cannot tolerate a gap between the text and
>>>> data section as the toolchain fully resolves at compile time the PC
>>>> relative global pointer (__global_pointer$ value loaded in gp register).
>>>> Without a relocation entry provided, the flat bin loader cannot fix the
>>>> value if a gap is introduced and executables fail to run.
>>>>
>>>> This series fixes this problem by allowing an architecture to request
>>>> the flat loader to suppress the gap between the text and data sections.
>>>> The first patch fixes binfmt_flat flat_load_file() using the new
>>>> configuration option CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP. The second
>>>> patch enables this option for RISCV NOMMU builds.
>>>>
>>>> These patches do not change the binfmt_flat loader behavior for other
>>>> architectures.
>>>>
>>>> Changes from v1:
>>>> * Replace FLAT_TEXT_DATA_NO_GAP macro with
>>>>   CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP config option (patch 1).
>>>> * Remove the addition of riscv/include/asm/flat.h and set
>>>>   CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP for RISCV and !MMU
>>>>
>>>> Damien Le Moal (2):
>>>>   binfmt_flat: allow not offsetting data start
>>>>   riscv: Disable text-data gap in flat binaries
>>>>
>>>>  arch/riscv/Kconfig |  1 +
>>>>  fs/Kconfig.binfmt  |  3 +++
>>>>  fs/binfmt_flat.c   | 21 +++--
>>>>  3 files changed, 19 insertions(+), 6 deletions(-)
>>>>
>>>
>>> Ping ?
>>>
>>> Any comment on these patches ?
>>>
>>> Without them, RISC-V NOMMU user space does not run... I would really like 
>>> to get
>>> these in this cycle if possible.
>>
>> This LGTM, but it's pretty far out of my area of expertise.  I'm happy to 
>> take them via my tree, but I'd prefer to get an Ack from someone.
>>
>> Al, get_maintainer suggests you?
>>
>> Acked-by: Palmer Dabbelt 
> ---end quoted text---
> 


-- 
Damien Le Moal
Western Digital Research


[PATCH v3 2/2] riscv: Disable text-data gap in flat binaries

2021-04-15 Thread Damien Le Moal
uclibc/gcc combined with elf2flt riscv linker file fully resolve the
PC relative __global_pointer$ value at compile time and do not generate
a relocation entry to set a runtime gp value. As a result, if the
flatbin loader introduces a gap between the text and data sections, the
gp value becomes incorrect and prevent correct execution of a flatbin
executable.

Avoid this problem by enabling CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP
automatically when CONFIG_RISCV is enabled and CONFIG_MMU disabled.

Signed-off-by: Damien Le Moal 
Acked-by: Palmer Dabbelt 
---
 arch/riscv/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 0d0cf67359cb..6a85fbbd056e 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -33,6 +33,7 @@ config RISCV
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
+   select BINFMT_FLAT_NO_TEXT_DATA_GAP if !MMU
select CLONE_BACKWARDS
select CLINT_TIMER if !MMU
select COMMON_CLK
-- 
2.30.2



[PATCH v3 0/2] Fix binfmt_flat loader for RISC-V

2021-04-15 Thread Damien Le Moal
RISC-V NOMMU flat binaries cannot tolerate a gap between the text and
data section as the toolchain fully resolves at compile time the PC
relative global pointer (__global_pointer$ value loaded in gp register).
Without a relocation entry provided, the flat bin loader cannot fix the
value if a gap is introduced and executables fail to run.

This series fixes this problem by allowing an architecture to request
the flat loader to suppress the gap between the text and data sections.
The first patch fixes binfmt_flat flat_load_file() using the new
configuration option CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP. The second
patch enables this option for RISCV NOMMU builds.

These patches do not change the binfmt_flat loader behavior for other
architectures.

Changes from v2:
* Updated distribution list
* Added Palmer ack-by tag

Changes from v1:
* Replace FLAT_TEXT_DATA_NO_GAP macro with
  CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP config option (patch 1).
* Remove the addition of riscv/include/asm/flat.h and set
  CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP for RISCV and !MMU

Damien Le Moal (2):
  binfmt_flat: allow not offsetting data start
  riscv: Disable text-data gap in flat binaries

 arch/riscv/Kconfig |  1 +
 fs/Kconfig.binfmt  |  3 +++
 fs/binfmt_flat.c   | 21 +++--
 3 files changed, 19 insertions(+), 6 deletions(-)

-- 
2.30.2



[PATCH v3 1/2] binfmt_flat: allow not offsetting data start

2021-04-15 Thread Damien Le Moal
Commit 2217b9826246 ("binfmt_flat: revert "binfmt_flat: don't offset
the data start"") restored offsetting the start of the data section by
a number of words defined by MAX_SHARED_LIBS. As a result, since
MAX_SHARED_LIBS is never 0, a gap between the text and data sections
always exists. For architectures which cannot support such a gap
between the text and data sections (e.g. riscv nommu), flat binary
programs cannot be executed.

To allow an architecture to request contiguous text and data sections,
introduce the config option CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP.
Using this new option, the macro DATA_GAP_WORDS is conditionally
defined in binfmt_flat.c to MAX_SHARED_LIBS for architectures
tolerating the text-to-data gap (CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP
disabled case) and to 0 when CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP is
enabled. DATA_GAP_WORDS is used in load_flat_file() to calculate the
data section length and start position.

An architecture enabling CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP also
prevents the use of the separate text/data load case (when the flat file
header flags FLAT_FLAG_RAM and FLAT_FLAG_GZIP are not set with NOMMU
kernels) and forces the use of a single RAM region for loading
(equivalent to FLAT_FLAG_RAM being set).

Signed-off-by: Damien Le Moal 
Acked-by: Palmer Dabbelt 
---
 fs/Kconfig.binfmt |  3 +++
 fs/binfmt_flat.c  | 21 +++--
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index c6f1c8c1934e..c6df931d5d45 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -112,6 +112,9 @@ config BINFMT_FLAT_ARGVP_ENVP_ON_STACK
 config BINFMT_FLAT_OLD_ALWAYS_RAM
bool
 
+config BINFMT_FLAT_NO_TEXT_DATA_GAP
+   bool
+
 config BINFMT_FLAT_OLD
bool "Enable support for very old legacy flat binaries"
depends on BINFMT_FLAT
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index b9c658e0548e..2be29bb964b8 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -74,6 +74,12 @@
 #defineMAX_SHARED_LIBS (1)
 #endif
 
+#ifdef CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP
+#define DATA_GAP_WORDS (0)
+#else
+#define DATA_GAP_WORDS (MAX_SHARED_LIBS)
+#endif
+
 struct lib_info {
struct {
unsigned long start_code;   /* Start of text 
segment */
@@ -559,7 +565,10 @@ static int load_flat_file(struct linux_binprm *bprm,
 * case,  and then the fully copied to RAM case which lumps
 * it all together.
 */
-   if (!IS_ENABLED(CONFIG_MMU) && !(flags & 
(FLAT_FLAG_RAM|FLAT_FLAG_GZIP))) {
+   if (!IS_ENABLED(CONFIG_MMU) &&
+   !IS_ENABLED(CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP) &&
+   !(flags & (FLAT_FLAG_RAM|FLAT_FLAG_GZIP))) {
+
/*
 * this should give us a ROM ptr,  but if it doesn't we don't
 * really care
@@ -576,7 +585,7 @@ static int load_flat_file(struct linux_binprm *bprm,
goto err;
}
 
-   len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned 
long);
+   len = data_len + extra + DATA_GAP_WORDS * sizeof(unsigned long);
len = PAGE_ALIGN(len);
realdatastart = vm_mmap(NULL, 0, len,
PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
@@ -591,7 +600,7 @@ static int load_flat_file(struct linux_binprm *bprm,
goto err;
}
datapos = ALIGN(realdatastart +
-   MAX_SHARED_LIBS * sizeof(unsigned long),
+   DATA_GAP_WORDS * sizeof(unsigned long),
FLAT_DATA_ALIGN);
 
pr_debug("Allocated data+bss+stack (%u bytes): %lx\n",
@@ -622,7 +631,7 @@ static int load_flat_file(struct linux_binprm *bprm,
memp_size = len;
} else {
 
-   len = text_len + data_len + extra + MAX_SHARED_LIBS * 
sizeof(u32);
+   len = text_len + data_len + extra + DATA_GAP_WORDS * 
sizeof(u32);
len = PAGE_ALIGN(len);
textpos = vm_mmap(NULL, 0, len,
PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
@@ -638,7 +647,7 @@ static int load_flat_file(struct linux_binprm *bprm,
 
realdatastart = textpos + ntohl(hdr->data_start);
datapos = ALIGN(realdatastart +
-   MAX_SHARED_LIBS * sizeof(u32),
+   DATA_GAP_WORDS * sizeof(u32),
FLAT_DATA_ALIGN);
 
reloc = (__be32 __user *)
@@ -714,7 +723,7 @@ static int load_flat_file(struct linux_binprm *bprm,
ret = result;
pr_err("Unable to read code+data+bss, errno %d\n", ret);
vm_munmap(t

Re: [PATCH v2 0/2] Fix binfmt_flat loader for RISC-V

2021-04-14 Thread Damien Le Moal
On 2021/04/08 0:49, Damien Le Moal wrote:
> RISC-V NOMMU flat binaries cannot tolerate a gap between the text and
> data section as the toolchain fully resolves at compile time the PC
> relative global pointer (__global_pointer$ value loaded in gp register).
> Without a relocation entry provided, the flat bin loader cannot fix the
> value if a gap is introduced and executables fail to run.
> 
> This series fixes this problem by allowing an architecture to request
> the flat loader to suppress the gap between the text and data sections.
> The first patch fixes binfmt_flat flat_load_file() using the new
> configuration option CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP. The second
> patch enables this option for RISCV NOMMU builds.
> 
> These patches do not change the binfmt_flat loader behavior for other
> architectures.
> 
> Changes from v1:
> * Replace FLAT_TEXT_DATA_NO_GAP macro with
>   CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP config option (patch 1).
> * Remove the addition of riscv/include/asm/flat.h and set
>   CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP for RISCV and !MMU
> 
> Damien Le Moal (2):
>   binfmt_flat: allow not offsetting data start
>   riscv: Disable text-data gap in flat binaries
> 
>  arch/riscv/Kconfig |  1 +
>  fs/Kconfig.binfmt  |  3 +++
>  fs/binfmt_flat.c   | 21 +++--
>  3 files changed, 19 insertions(+), 6 deletions(-)
> 

Ping ?

Any comment on these patches ?

Without them, RISC-V NOMMU user space does not run... I would really like to get
these in this cycle if possible.


-- 
Damien Le Moal
Western Digital Research


Re: [RFC PATCH v5 2/4] block: add simple copy support

2021-04-12 Thread Damien Le Moal
On 2021/04/12 23:35, Selva Jove wrote:
> On Mon, Apr 12, 2021 at 5:55 AM Damien Le Moal  wrote:
>>
>> On 2021/04/07 20:33, Selva Jove wrote:
>>> Initially I started moving the dm-kcopyd interface to the block layer
>>> as a generic interface.
>>> Once I dig deeper in dm-kcopyd code, I figured that dm-kcopyd is
>>> tightly coupled with dm_io()
>>>
>>> To move dm-kcopyd to block layer, it would also require dm_io code to
>>> be moved to block layer.
>>> It would cause havoc in dm layer, as it is the backbone of the
>>> dm-layer and needs complete
>>> rewriting of dm-layer. Do you see any other way of doing this without
>>> having to move dm_io code
>>> or to have redundant code ?
>>
>> Right. Missed that. So reusing dm-kcopyd and making it a common interface 
>> will
>> take some more efforts. OK, then. For the first round of commits, let's 
>> forget
>> about this. But I still think that your emulation could be a lot better than 
>> a
>> loop doing blocking writes after blocking reads.
>>
> 
> Current implementation issues read asynchronously and once all the reads are
> completed, then the write is issued as whole to reduce the IO traffic
> in the queue.
> I agree that things can be better. Will explore another approach of
> sending writes
> immediately once reads are completed and with  plugging to increase the 
> chances
> of merging.
> 
>> [...]
>>>>> +int blkdev_issue_copy(struct block_device *src_bdev, int nr_srcs,
>>>>> + struct range_entry *src_rlist, struct block_device 
>>>>> *dest_bdev,
>>>>> + sector_t dest, gfp_t gfp_mask, int flags)
>>>>> +{
>>>>> + struct request_queue *q = bdev_get_queue(src_bdev);
>>>>> + struct request_queue *dest_q = bdev_get_queue(dest_bdev);
>>>>> + struct blk_copy_payload *payload;
>>>>> + sector_t bs_mask, copy_size;
>>>>> + int ret;
>>>>> +
>>>>> + ret = blk_prepare_payload(src_bdev, nr_srcs, src_rlist, gfp_mask,
>>>>> + , _size);
>>>>> + if (ret)
>>>>> + return ret;
>>>>> +
>>>>> + bs_mask = (bdev_logical_block_size(dest_bdev) >> 9) - 1;
>>>>> + if (dest & bs_mask) {
>>>>> + return -EINVAL;
>>>>> + goto out;
>>>>> + }
>>>>> +
>>>>> + if (q == dest_q && q->limits.copy_offload) {
>>>>> + ret = blk_copy_offload(src_bdev, payload, dest, gfp_mask);
>>>>> + if (ret)
>>>>> + goto out;
>>>>> + } else if (flags & BLKDEV_COPY_NOEMULATION) {
>>>>
>>>> Why ? whoever calls blkdev_issue_copy() wants a copy to be done. Why would 
>>>> that
>>>> user say "Fail on me if the device does not support copy" ??? This is a 
>>>> weird
>>>> interface in my opinion.
>>>>
>>>
>>> BLKDEV_COPY_NOEMULATION flag was introduced to allow blkdev_issue_copy() 
>>> callers
>>> to use their native copying method instead of the emulated copy that I
>>> added. This way we
>>> ensure that dm uses the hw-assisted copy and if that is not present,
>>> it falls back to existing
>>> copy method.
>>>
>>> The other users who don't have their native emulation can use this
>>> emulated-copy implementation.
>>
>> I do not understand. Emulation or not should be entirely driven by the device
>> reporting support for simple copy (or not). It does not matter which 
>> component
>> is issuing the simple copy call: an FS to a real device, and FS to a DM 
>> device
>> or a DM target driver. If the underlying device reported support for simple
>> copy, use that. Otherwise, emulate with read/write. What am I missing here ?
>>
> 
> blkdev_issue_copy() api will generally complete the copy-operation,
> either by using
> offloaded-copy or by using emulated-copy. The caller of the api is not
> required to
> figure the type of support. However, it can opt out of emulated-copy
> by specifying
> the flag BLKDEV_NOEMULATION. This is helpful for the case when the
> caller already
> has got a sophisticated emulation (e.g. dm-kcopyd users).

This does not make any sense to me. If the user has already another mean of
doing copies, then that user will not call blkdev_issue_copy(). So I really do
not understand what the "opting out of emulated copy" would be useful for. That
user can check the simple copy support glag in the device request queue and act
accordingly: use its own block copy code when simple copy is not supported or
use blkdev_issue_copy() when the device has simple copy. Adding that
BLKDEV_COPY_NOEMULATION does not serve any purpose at all.



-- 
Damien Le Moal
Western Digital Research


Re: [RFC PATCH v5 2/4] block: add simple copy support

2021-04-11 Thread Damien Le Moal
On 2021/04/07 20:33, Selva Jove wrote:
> Initially I started moving the dm-kcopyd interface to the block layer
> as a generic interface.
> Once I dig deeper in dm-kcopyd code, I figured that dm-kcopyd is
> tightly coupled with dm_io()
> 
> To move dm-kcopyd to block layer, it would also require dm_io code to
> be moved to block layer.
> It would cause havoc in dm layer, as it is the backbone of the
> dm-layer and needs complete
> rewriting of dm-layer. Do you see any other way of doing this without
> having to move dm_io code
> or to have redundant code ?

Right. Missed that. So reusing dm-kcopyd and making it a common interface will
take some more efforts. OK, then. For the first round of commits, let's forget
about this. But I still think that your emulation could be a lot better than a
loop doing blocking writes after blocking reads.

[...]
>>> +int blkdev_issue_copy(struct block_device *src_bdev, int nr_srcs,
>>> + struct range_entry *src_rlist, struct block_device *dest_bdev,
>>> + sector_t dest, gfp_t gfp_mask, int flags)
>>> +{
>>> + struct request_queue *q = bdev_get_queue(src_bdev);
>>> + struct request_queue *dest_q = bdev_get_queue(dest_bdev);
>>> + struct blk_copy_payload *payload;
>>> + sector_t bs_mask, copy_size;
>>> + int ret;
>>> +
>>> + ret = blk_prepare_payload(src_bdev, nr_srcs, src_rlist, gfp_mask,
>>> + , _size);
>>> + if (ret)
>>> + return ret;
>>> +
>>> + bs_mask = (bdev_logical_block_size(dest_bdev) >> 9) - 1;
>>> + if (dest & bs_mask) {
>>> + return -EINVAL;
>>> + goto out;
>>> + }
>>> +
>>> + if (q == dest_q && q->limits.copy_offload) {
>>> + ret = blk_copy_offload(src_bdev, payload, dest, gfp_mask);
>>> + if (ret)
>>> + goto out;
>>> + } else if (flags & BLKDEV_COPY_NOEMULATION) {
>>
>> Why ? whoever calls blkdev_issue_copy() wants a copy to be done. Why would 
>> that
>> user say "Fail on me if the device does not support copy" ??? This is a weird
>> interface in my opinion.
>>
> 
> BLKDEV_COPY_NOEMULATION flag was introduced to allow blkdev_issue_copy() 
> callers
> to use their native copying method instead of the emulated copy that I
> added. This way we
> ensure that dm uses the hw-assisted copy and if that is not present,
> it falls back to existing
> copy method.
> 
> The other users who don't have their native emulation can use this
> emulated-copy implementation.

I do not understand. Emulation or not should be entirely driven by the device
reporting support for simple copy (or not). It does not matter which component
is issuing the simple copy call: an FS to a real device, and FS to a DM device
or a DM target driver. If the underlying device reported support for simple
copy, use that. Otherwise, emulate with read/write. What am I missing here ?

[...]
>>> @@ -565,6 +569,12 @@ int blk_stack_limits(struct queue_limits *t, struct 
>>> queue_limits *b,
>>>   if (b->chunk_sectors)
>>>   t->chunk_sectors = gcd(t->chunk_sectors, b->chunk_sectors);
>>>
>>> + /* simple copy not supported in stacked devices */
>>> + t->copy_offload = 0;
>>> + t->max_copy_sectors = 0;
>>> + t->max_copy_range_sectors = 0;
>>> + t->max_copy_nr_ranges = 0;
>>
>> You do not need this. Limits not explicitely initialized are 0 already.
>> But I do not see why you can't support copy on stacked devices. That should 
>> be
>> feasible taking the min() for each of the above limit.
>>
> 
> Disabling stacked device support was feedback from v2.
> 
> https://patchwork.kernel.org/project/linux-block/patch/20201204094659.12732-2-selvakuma...@samsung.com/

Right. But the initialization to 0 is still not needed. The fields are already
initialized to 0.


-- 
Damien Le Moal
Western Digital Research


Re: [RESEND,v5,1/2] bio: limit bio max size

2021-04-11 Thread Damien Le Moal
On 2021/04/09 23:47, Bart Van Assche wrote:
> On 4/7/21 3:27 AM, Damien Le Moal wrote:
>> On 2021/04/07 18:46, Changheun Lee wrote:
>>> I'll prepare new patch as you recommand. It will be added setting of
>>> limit_bio_size automatically when queue max sectors is determined.
>>
>> Please do that in the driver for the HW that benefits from it. Do not do this
>> for all block devices.
> 
> Hmm ... is it ever useful to build a bio with a size that exceeds 
> max_hw_sectors when submitting a bio directly to a block device, or in 
> other words, if no stacked block driver sits between the submitter and 
> the block device? Am I perhaps missing something?

Device performance wise, the benefits are certainly not obvious to me either.
But for very fast block devices, I think the CPU overhead of building more
smaller BIOs may be significant compared to splitting a large BIO into multiple
requests. Though it may be good to revisit this with some benchmark numbers.

> 
> Thanks,
> 
> Bart.
> 


-- 
Damien Le Moal
Western Digital Research


[PATCH v2 0/2] Fix binfmt_flat loader for RISC-V

2021-04-07 Thread Damien Le Moal
RISC-V NOMMU flat binaries cannot tolerate a gap between the text and
data section as the toolchain fully resolves at compile time the PC
relative global pointer (__global_pointer$ value loaded in gp register).
Without a relocation entry provided, the flat bin loader cannot fix the
value if a gap is introduced and executables fail to run.

This series fixes this problem by allowing an architecture to request
the flat loader to suppress the gap between the text and data sections.
The first patch fixes binfmt_flat flat_load_file() using the new
configuration option CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP. The second
patch enables this option for RISCV NOMMU builds.

These patches do not change the binfmt_flat loader behavior for other
architectures.

Changes from v1:
* Replace FLAT_TEXT_DATA_NO_GAP macro with
  CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP config option (patch 1).
* Remove the addition of riscv/include/asm/flat.h and set
  CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP for RISCV and !MMU

Damien Le Moal (2):
  binfmt_flat: allow not offsetting data start
  riscv: Disable text-data gap in flat binaries

 arch/riscv/Kconfig |  1 +
 fs/Kconfig.binfmt  |  3 +++
 fs/binfmt_flat.c   | 21 +++--
 3 files changed, 19 insertions(+), 6 deletions(-)

-- 
2.30.2



[PATCH v2 1/2] binfmt_flat: allow not offsetting data start

2021-04-07 Thread Damien Le Moal
Commit 2217b9826246 ("binfmt_flat: revert "binfmt_flat: don't offset
the data start"") restored offsetting the start of the data section by
a number of words defined by MAX_SHARED_LIBS. As a result, since
MAX_SHARED_LIBS is never 0, a gap between the text and data sections
always exists. For architectures which cannot support such a gap
between the text and data sections (e.g. riscv nommu), flat binary
programs cannot be executed.

To allow an architecture to request contiguous text and data sections,
introduce the config option CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP.
Using this new option, the macro DATA_GAP_WORDS is conditionally
defined in binfmt_flat.c to MAX_SHARED_LIBS for architectures
tolerating the text-to-data gap (CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP
disabled case) and to 0 when CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP is
enabled. DATA_GAP_WORDS is used in load_flat_file() to calculate the
data section length and start position.

An architecture enabling CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP also
prevents the use of the separate text/data load case (when the flat file
header flags FLAT_FLAG_RAM and FLAT_FLAG_GZIP are not set with NOMMU
kernels) and forces the use of a single RAM region for loading
(equivalent to FLAT_FLAG_RAM being set).

Signed-off-by: Damien Le Moal 
---
 fs/Kconfig.binfmt |  3 +++
 fs/binfmt_flat.c  | 21 +++--
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index c6f1c8c1934e..c6df931d5d45 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -112,6 +112,9 @@ config BINFMT_FLAT_ARGVP_ENVP_ON_STACK
 config BINFMT_FLAT_OLD_ALWAYS_RAM
bool
 
+config BINFMT_FLAT_NO_TEXT_DATA_GAP
+   bool
+
 config BINFMT_FLAT_OLD
bool "Enable support for very old legacy flat binaries"
depends on BINFMT_FLAT
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index b9c658e0548e..2be29bb964b8 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -74,6 +74,12 @@
 #defineMAX_SHARED_LIBS (1)
 #endif
 
+#ifdef CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP
+#define DATA_GAP_WORDS (0)
+#else
+#define DATA_GAP_WORDS (MAX_SHARED_LIBS)
+#endif
+
 struct lib_info {
struct {
unsigned long start_code;   /* Start of text 
segment */
@@ -559,7 +565,10 @@ static int load_flat_file(struct linux_binprm *bprm,
 * case,  and then the fully copied to RAM case which lumps
 * it all together.
 */
-   if (!IS_ENABLED(CONFIG_MMU) && !(flags & 
(FLAT_FLAG_RAM|FLAT_FLAG_GZIP))) {
+   if (!IS_ENABLED(CONFIG_MMU) &&
+   !IS_ENABLED(CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP) &&
+   !(flags & (FLAT_FLAG_RAM|FLAT_FLAG_GZIP))) {
+
/*
 * this should give us a ROM ptr,  but if it doesn't we don't
 * really care
@@ -576,7 +585,7 @@ static int load_flat_file(struct linux_binprm *bprm,
goto err;
}
 
-   len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned 
long);
+   len = data_len + extra + DATA_GAP_WORDS * sizeof(unsigned long);
len = PAGE_ALIGN(len);
realdatastart = vm_mmap(NULL, 0, len,
PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
@@ -591,7 +600,7 @@ static int load_flat_file(struct linux_binprm *bprm,
goto err;
}
datapos = ALIGN(realdatastart +
-   MAX_SHARED_LIBS * sizeof(unsigned long),
+   DATA_GAP_WORDS * sizeof(unsigned long),
FLAT_DATA_ALIGN);
 
pr_debug("Allocated data+bss+stack (%u bytes): %lx\n",
@@ -622,7 +631,7 @@ static int load_flat_file(struct linux_binprm *bprm,
memp_size = len;
} else {
 
-   len = text_len + data_len + extra + MAX_SHARED_LIBS * 
sizeof(u32);
+   len = text_len + data_len + extra + DATA_GAP_WORDS * 
sizeof(u32);
len = PAGE_ALIGN(len);
textpos = vm_mmap(NULL, 0, len,
PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
@@ -638,7 +647,7 @@ static int load_flat_file(struct linux_binprm *bprm,
 
realdatastart = textpos + ntohl(hdr->data_start);
datapos = ALIGN(realdatastart +
-   MAX_SHARED_LIBS * sizeof(u32),
+   DATA_GAP_WORDS * sizeof(u32),
FLAT_DATA_ALIGN);
 
reloc = (__be32 __user *)
@@ -714,7 +723,7 @@ static int load_flat_file(struct linux_binprm *bprm,
ret = result;
pr_err("Unable to read code+data+bss, errno %d\n", ret);
vm_munmap(textpos, text_

[PATCH v2 2/2] riscv: Disable text-data gap in flat binaries

2021-04-07 Thread Damien Le Moal
uclibc/gcc combined with elf2flt riscv linker file fully resolve the
PC relative __global_pointer$ value at compile time and do not generate
a relocation entry to set a runtime gp value. As a result, if the
flatbin loader introduces a gap between the text and data sections, the
gp value becomes incorrect and prevents correct execution of a flatbin
executable.

Avoid this problem by enabling CONFIG_BINFMT_FLAT_NO_TEXT_DATA_GAP
automatically when CONFIG_RISCV is enabled and CONFIG_MMU disabled.

Signed-off-by: Damien Le Moal 
---
 arch/riscv/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 0d0cf67359cb..6a85fbbd056e 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -33,6 +33,7 @@ config RISCV
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANT_HUGE_PMD_SHARE if 64BIT
+   select BINFMT_FLAT_NO_TEXT_DATA_GAP if !MMU
select CLONE_BACKWARDS
select CLINT_TIMER if !MMU
select COMMON_CLK
-- 
2.30.2



Re: [null_blk] de3510e52b: blktests.block.014.fail

2021-04-07 Thread Damien Le Moal
On 2021/04/07 18:02, kernel test robot wrote:
> 
> 
> Greeting,
> 
> FYI, we noticed the following commit (built with gcc-9):
> 
> commit: de3510e52b0a398261271455562458003b8eea62 ("null_blk: fix command 
> timeout completion handling")
> https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git master
> 
> 
> in testcase: blktests
> version: blktests-x86_64-a210761-1_20210124
> with following parameters:
> 
>   disk: 1SSD
>   test: nvme-group-00
>   ucode: 0x11
> 
> 
> 
> on test machine: 288 threads Intel(R) Xeon Phi(TM) CPU 7295 @ 1.50GHz with 
> 80G memory
> 
> caused below changes (please refer to attached dmesg/kmsg for entire 
> log/backtrace):
> 
> 
> 
> If you fix the issue, kindly add following tag
> Reported-by: kernel test robot 
> 
> 
> block/014 (run null-blk with blk-mq and timeout injection configured)
> block/014 (run null-blk with blk-mq and timeout injection configured) [failed]
> runtime  ...  71.624s
> --- tests/block/014.out 2021-01-24 06:04:08.0 +
> +++ /mnt/nvme-group-00/nodev/block/014.out.bad  2021-04-06 
> 09:21:25.133971868 +
> @@ -1,2 +1,377 @@
>  Running block/014
> +dd: error reading '/dev/nullb0': Connection timed out
> +dd: error reading '/dev/nullb0': Connection timed out
> +dd: error reading '/dev/nullb0': Connection timed out
> +dd: error reading '/dev/nullb0': Connection timed out
> +dd: error reading '/dev/nullb0': Connection timed out
> +dd: error reading '/dev/nullb0': Connection timed out
> ...
> (Run 'diff -u tests/block/014.out 
> /mnt/nvme-group-00/nodev/block/014.out.bad' to see the entire diff)

This is not a kernel bug. It is a problem with blktest. Before my patch, the
timeout error was not propagated back to the user. It is now and causes dd to
fail. blktest seeing dd failing reports the test as failed. On the kernel side,
all is good, the reqs are completed as expected.

Note that the timeout error is reported back as is, using BLK_STS_TIMEOUT which
becomes ETIMEDOUT, hence the "Connection timed out" error message. May be we
should use the more traditional EIO ? Jens ?

In any case, I will send a patch to fix blktest block/014.


> 
> 
> 
> To reproduce:
> 
> git clone https://github.com/intel/lkp-tests.git
> cd lkp-tests
> bin/lkp installjob.yaml  # job file is attached in 
> this email
> bin/lkp split-job --compatible job.yaml
> bin/lkp runcompatible-job.yaml
> 
> 
> 
> ---
> 0DAY/LKP+ Test Infrastructure   Open Source Technology Center
> https://lists.01.org/hyperkitty/list/l...@lists.01.org   Intel Corporation
> 
> Thanks,
> Oliver Sang
> 


-- 
Damien Le Moal
Western Digital Research


[PATCH 1/2] binfmt_flat: allow not offsetting data start

2021-04-07 Thread Damien Le Moal
Commit 2217b9826246 ("binfmt_flat: revert "binfmt_flat: don't offset
the data start"") restored offsetting the start of the data section by
a number of words defined by MAX_SHARED_LIBS. As a result, since
MAX_SHARED_LIBS is never 0, a gap between the text and data sections
always exists. For architectures which cannot support such a gap between
the text and data sections (e.g. riscv nommu), flat binary programs
cannot be executed.

To allow an architecture to request contiguous text and data sections,
introduce the macro FLAT_TEXT_DATA_NO_GAP which can be defined by the
architecture in its asm/flat.h file. With this change, the macro
DATA_GAP_WORDS is conditionally defined in binfmt_flat.c to
MAX_SHARED_LIBS for architectures tolerating the gap
(FLAT_TEXT_DATA_NO_GAP undefined case) and to 0 when
FLAT_TEXT_DATA_NO_GAP is defined. DATA_GAP_WORDS is used in
load_flat_file() to calculate the data section length and start
position.

The definition of FLAT_TEXT_DATA_NO_GAP by an architecture also
prevents the use of the separate text/data load case (when
FLAT_FLAG_RAM and FLAT_FLAG_GZIP are not set with NOMMU kernels).

Signed-off-by: Damien Le Moal 
---
 fs/binfmt_flat.c | 25 +++--
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index b9c658e0548e..2bfa05ac5cb4 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -74,6 +74,12 @@
 #defineMAX_SHARED_LIBS (1)
 #endif
 
+#ifdef FLAT_TEXT_DATA_NO_GAP
+#define DATA_GAP_WORDS (0)
+#else
+#define DATA_GAP_WORDS (MAX_SHARED_LIBS)
+#endif
+
 struct lib_info {
struct {
unsigned long start_code;   /* Start of text 
segment */
@@ -437,7 +443,6 @@ static int load_flat_file(struct linux_binprm *bprm,
__be32 __user *reloc;
u32 __user *rp;
int i, rev, relocs;
-   loff_t fpos;
unsigned long start_code, end_code;
ssize_t result;
int ret;
@@ -560,6 +565,9 @@ static int load_flat_file(struct linux_binprm *bprm,
 * it all together.
 */
if (!IS_ENABLED(CONFIG_MMU) && !(flags & 
(FLAT_FLAG_RAM|FLAT_FLAG_GZIP))) {
+#ifndef FLAT_TEXT_DATA_NO_GAP
+   loff_t fpos;
+
/*
 * this should give us a ROM ptr,  but if it doesn't we don't
 * really care
@@ -576,7 +584,7 @@ static int load_flat_file(struct linux_binprm *bprm,
goto err;
}
 
-   len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned 
long);
+   len = data_len + extra + DATA_GAP_WORDS * sizeof(unsigned long);
len = PAGE_ALIGN(len);
realdatastart = vm_mmap(NULL, 0, len,
PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
@@ -591,7 +599,7 @@ static int load_flat_file(struct linux_binprm *bprm,
goto err;
}
datapos = ALIGN(realdatastart +
-   MAX_SHARED_LIBS * sizeof(unsigned long),
+   DATA_GAP_WORDS * sizeof(unsigned long),
FLAT_DATA_ALIGN);
 
pr_debug("Allocated data+bss+stack (%u bytes): %lx\n",
@@ -620,9 +628,14 @@ static int load_flat_file(struct linux_binprm *bprm,
(datapos + (ntohl(hdr->reloc_start) - text_len));
memp = realdatastart;
memp_size = len;
+#else
+   pr_err("Separate text/data loading not supported\n");
+   ret = -ENOEXEC;
+   goto err;
+#endif /* FLAT_TEXT_DATA_NO_GAP */
} else {
 
-   len = text_len + data_len + extra + MAX_SHARED_LIBS * 
sizeof(u32);
+   len = text_len + data_len + extra + DATA_GAP_WORDS * 
sizeof(u32);
len = PAGE_ALIGN(len);
textpos = vm_mmap(NULL, 0, len,
PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
@@ -638,7 +651,7 @@ static int load_flat_file(struct linux_binprm *bprm,
 
realdatastart = textpos + ntohl(hdr->data_start);
datapos = ALIGN(realdatastart +
-   MAX_SHARED_LIBS * sizeof(u32),
+   DATA_GAP_WORDS * sizeof(u32),
FLAT_DATA_ALIGN);
 
reloc = (__be32 __user *)
@@ -714,7 +727,7 @@ static int load_flat_file(struct linux_binprm *bprm,
ret = result;
pr_err("Unable to read code+data+bss, errno %d\n", ret);
vm_munmap(textpos, text_len + data_len + extra +
-   MAX_SHARED_LIBS * sizeof(u32));
+ DATA_GAP_WORDS * sizeof(u32));
goto err;
}
}
-- 
2.30.2



[PATCH 0/2] Fix binfmt_flat loader for RISC-V

2021-04-07 Thread Damien Le Moal
RISC-V NOMMU flat binaries cannot tolerate a gap between the text and
data section as the toolchain fully resolves at compile time the PC
relative global pointer (__global_pointer$ value loaded in gp register).
Without a relocation entry provided, the flat bin loader cannot fix the
value if a gap is introduced and executables fail to run.

This series fixes this problem by allowing an architecture to request
the flat loader to suppress the gap between the text and data sections.
The first patch fixes binfmt_flat flat_load_file(). The second patch
adds the asm/flat.h file to riscv arch to request the gap suppression
using the newly introduced macro FLAT_TEXT_DATA_NO_GAP.

These patches do not change the binfmt_flat loader behavior for other
architectures.

Damien Le Moal (2):
  binfmt_flat: allow not offsetting data start
  riscv: introduce asm/flat.h

 arch/riscv/include/asm/Kbuild |  1 -
 arch/riscv/include/asm/flat.h | 29 +
 fs/binfmt_flat.c  | 25 +++--
 3 files changed, 48 insertions(+), 7 deletions(-)
 create mode 100644 arch/riscv/include/asm/flat.h

-- 
2.30.2



[PATCH 2/2] riscv: introduce asm/flat.h

2021-04-07 Thread Damien Le Moal
uclibc/gcc combined with elf2flt riscv linker file fully resolve the
PC relative __global_pointer$ value at compile time and do not generate
a relocation entry to set a runtime gp value. As a result, if the
flatbin loader introduces a gap between the text and data sections, the
gp value becomes incorrect and prevents correct execution of a flatbin
executable. Avoid this problem by introducing the file asm/flat.h
and defining the macro FLAT_TEXT_DATA_NO_GAP to indicate that the text
and data sections must be loaded at contiguous addresses.

Signed-off-by: Damien Le Moal 
---
 arch/riscv/include/asm/Kbuild |  1 -
 arch/riscv/include/asm/flat.h | 29 +
 2 files changed, 29 insertions(+), 1 deletion(-)
 create mode 100644 arch/riscv/include/asm/flat.h

diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
index 445ccc97305a..a8b54a3f4c2b 100644
--- a/arch/riscv/include/asm/Kbuild
+++ b/arch/riscv/include/asm/Kbuild
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
 generic-y += early_ioremap.h
 generic-y += extable.h
-generic-y += flat.h
 generic-y += kvm_para.h
 generic-y += user.h
 generic-y += vmlinux.lds.h
diff --git a/arch/riscv/include/asm/flat.h b/arch/riscv/include/asm/flat.h
new file mode 100644
index ..43bccf090fd1
--- /dev/null
+++ b/arch/riscv/include/asm/flat.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_RISCV_FLAT_H
+#define _ASM_RISCV_FLAT_H
+
+#include 
+
+static inline int flat_get_addr_from_rp(u32 __user *rp, u32 relval, u32 flags,
+   u32 *addr)
+{
+   *addr = get_unaligned((__force u32 *)rp);
+   return 0;
+}
+
+static inline int flat_put_addr_at_rp(u32 __user *rp, u32 addr, u32 rel)
+{
+   put_unaligned(addr, (__force u32 *)rp);
+   return 0;
+}
+
+/*
+ * uclibc/gcc fully resolve the PC relative __global_pointer value
+ * at compile time and do not generate a relocation entry to set a
+ * runtime gp value. As a result, the flatbin loader must not introduce
+ * a gap between the text and data sections and keep them contiguous to
+ * avoid invalid address accesses.
+ */
+#define FLAT_TEXT_DATA_NO_GAP  (1)
+
+#endif /* _ASM_RISCV_FLAT_H */
-- 
2.30.2



Re: [RESEND,v5,1/2] bio: limit bio max size

2021-04-07 Thread Damien Le Moal
d.
>>>>>>>>>> And it lead to delay first I/O request issue.
>>>>>>>>>>
>>>>>>>>>> Signed-off-by: Changheun Lee 
>>>>>>>>>> ---
>>>>>>>>>>  block/bio.c| 13 -
>>>>>>>>>>  include/linux/bio.h|  2 +-
>>>>>>>>>>  include/linux/blkdev.h |  3 +++
>>>>>>>>>>  3 files changed, 16 insertions(+), 2 deletions(-)
>>>>>>>>>>
>>>>>>>>>> diff --git a/block/bio.c b/block/bio.c
>>>>>>>>>> index 1f2cc1fbe283..c528e1f944c7 100644
>>>>>>>>>> --- a/block/bio.c
>>>>>>>>>> +++ b/block/bio.c
>>>>>>>>>> @@ -287,6 +287,17 @@ void bio_init(struct bio *bio, struct bio_vec 
>>>>>>>>>> *table,
>>>>>>>>>>  }
>>>>>>>>>>  EXPORT_SYMBOL(bio_init);
>>>>>>>>>>  
>>>>>>>>>> +unsigned int bio_max_size(struct bio *bio)
>>>>>>>>>> +{
>>>>>>>>>> +struct request_queue *q = bio->bi_disk->queue;
>>>>>>>>>> +
>>>>>>>>>> +if (blk_queue_limit_bio_size(q))
>>>>>>>>>> +return blk_queue_get_max_sectors(q, bio_op(bio))
>>>>>>>>>> +<< SECTOR_SHIFT;
>>>>>>>>>> +
>>>>>>>>>> +return UINT_MAX;
>>>>>>>>>> +}
>>>>>>>>>> +
>>>>>>>>>>  /**
>>>>>>>>>>   * bio_reset - reinitialize a bio
>>>>>>>>>>   * @bio:bio to reset
>>>>>>>>>> @@ -877,7 +888,7 @@ bool __bio_try_merge_page(struct bio *bio, 
>>>>>>>>>> struct page *page,
>>>>>>>>>>  struct bio_vec *bv = >bi_io_vec[bio->bi_vcnt - 1];
>>>>>>>>>>  
>>>>>>>>>>  if (page_is_mergeable(bv, page, len, off, same_page)) {
>>>>>>>>>> -if (bio->bi_iter.bi_size > UINT_MAX - len) {
>>>>>>>>>> +if (bio->bi_iter.bi_size > bio_max_size(bio) - 
>>>>>>>>>> len) {
>>>>>>>>>>  *same_page = false;
>>>>>>>>>>  return false;
>>>>>>>>>>  }
>>>>>>>>>> diff --git a/include/linux/bio.h b/include/linux/bio.h
>>>>>>>>>> index 1edda614f7ce..13b6f6562a5b 100644
>>>>>>>>>> --- a/include/linux/bio.h
>>>>>>>>>> +++ b/include/linux/bio.h
>>>>>>>>>> @@ -113,7 +113,7 @@ static inline bool bio_full(struct bio *bio, 
>>>>>>>>>> unsigned len)
>>>>>>>>>>  if (bio->bi_vcnt >= bio->bi_max_vecs)
>>>>>>>>>>  return true;
>>>>>>>>>>  
>>>>>>>>>> -if (bio->bi_iter.bi_size > UINT_MAX - len)
>>>>>>>>>> +if (bio->bi_iter.bi_size > bio_max_size(bio) - len)
>>>>>>>>>>  return true;
>>>>>>>>>>  
>>>>>>>>>>  return false;
>>>>>>>>>> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
>>>>>>>>>> index f94ee3089e01..3aeab9e7e97b 100644
>>>>>>>>>> --- a/include/linux/blkdev.h
>>>>>>>>>> +++ b/include/linux/blkdev.h
>>>>>>>>>> @@ -621,6 +621,7 @@ struct request_queue {
>>>>>>>>>>  #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */
>>>>>>>>>>  #define QUEUE_FLAG_HCTX_ACTIVE  28  /* at least one blk-mq 
>>>>>>>>>> hctx is active */
>>>>>>>>>>  #define QUEUE_FLAG_NOWAIT   29  /* device supports NOWAIT */
>>>>>>>>>> +#define QUEUE_FLAG_LIMIT_BIO_SIZE 30/* limit bio size */
>>>>>>>>>>  
>>>>>>>>>>  #define QUEUE_FLAG_MQ_DEFAULT   ((1 << QUEUE_FLAG_IO_STAT) |
>>>>>>>>>> \
>>>>>>>>>>   (1 << QUEUE_FLAG_SAME_COMP) |  
>>>>>>>>>> \
>>>>>>>>>> @@ -667,6 +668,8 @@ bool blk_queue_flag_test_and_set(unsigned int 
>>>>>>>>>> flag, struct request_queue *q);
>>>>>>>>>>  #define blk_queue_fua(q)test_bit(QUEUE_FLAG_FUA, 
>>>>>>>>>> &(q)->queue_flags)
>>>>>>>>>>  #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, 
>>>>>>>>>> &(q)->queue_flags)
>>>>>>>>>>  #define blk_queue_nowait(q) test_bit(QUEUE_FLAG_NOWAIT, 
>>>>>>>>>> &(q)->queue_flags)
>>>>>>>>>> +#define blk_queue_limit_bio_size(q) \
>>>>>>>>>> +test_bit(QUEUE_FLAG_LIMIT_BIO_SIZE, &(q)->queue_flags)
>>>>>>>>>>  
>>>>>>>>>>  extern void blk_set_pm_only(struct request_queue *q);
>>>>>>>>>>  extern void blk_clear_pm_only(struct request_queue *q);
>>>>>>>>>> -- 
>>>>>>>>>> 2.28.0
>>>>>>>>>>
>>>>>>>>>
>>>>>>>>> Please feedback to me if more modification is needed to apply. :)
>>>>>>>>
>>>>>>>> You are adding code that tests for a value to be set, yet you never set
>>>>>>>> it in this code so why is it needed at all?
>>>>>>>
>>>>>>> This patch is a solution for some inefficient case of multipage bvec 
>>>>>>> like
>>>>>>> as current DIO scenario. So it's not set as a default.
>>>>>>> It will be set when bio size limitation is needed in runtime.
>>>>>>
>>>>>> Set where?
>>>>>
>>>>> In my environment, set it on init.rc file like as below.
>>>>> "echo 1 > /sys/block/sda/queue/limit_bio_size"
>>>>
>>>> I do not see any sysfs file in this patch, and why would you ever want
>>>> to be forced to manually do this?  The hardware should know the limits
>>>> itself, and should automatically tune things like this, do not force a
>>>> user to do it as that's just not going to go well at all.
>>>
>>> Patch for sysfs is sent "[RESEND,v5,2/2] bio: add limit_bio_size sysfs".
>>> Actually I just suggested constant - 1MB - value to limit bio size at first.
>>> But I got a feedback that patch will be better if it's optional, and
>>> getting meaningful value from device queue on patchwork.
>>> There are some differences for each system environment I think.
>>>
>>> But there are inefficient logic obviously by applying of multipage bvec.
>>> So it will be shown in several system environment.
>>> Currently providing this patch as a option would be better to select
>>> according to each system environment, and policy I think.
>>>
>>> Please, revisit applying this patch.
>>>
>>>>
>>>> So if this patch series is forcing a new option to be configured by
>>>> sysfs only, that's not acceptable, sorry.
>>>
>>> If it is not acceptable ever with current, may I progress review again
>>> with default enabled?
>>
>> I am sorry, I can not parse this, can you rephrase this?
>>
>> thanks,
>>
>> greg k-h
>>
> 
> I'll prepare new patch as you recommand. It will be added setting of
> limit_bio_size automatically when queue max sectors is determined.

Please do that in the driver for the HW that benefits from it. Do not do this
for all block devices.

> 
> 
> Thanks,
> 
> Changheun Lee
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH v4 5/6] PCI: fu740: Add SiFive FU740 PCIe host controller driver

2021-04-01 Thread Damien Le Moal
   fu740_phyregwrite(1, PCIEX8MGMT_PHY_LANE1_BASE, 
> PCIEX8MGMT_PHY_INIT_VAL, afp);
> + fu740_phyregwrite(1, PCIEX8MGMT_PHY_LANE2_BASE, 
> PCIEX8MGMT_PHY_INIT_VAL, afp);
> + fu740_phyregwrite(1, PCIEX8MGMT_PHY_LANE3_BASE, 
> PCIEX8MGMT_PHY_INIT_VAL, afp);
> +}
> +
> +static void fu740_pcie_ltssm_enable(struct device *dev)
> +{
> + struct fu740_pcie *afp = dev_get_drvdata(dev);
> +
> + /* Enable LTSSM */
> + writel_relaxed(0x1, afp->mgmt_base + PCIEX8MGMT_APP_LTSSM_ENABLE);
> +}
> +
> +static int fu740_pcie_start_link(struct dw_pcie *pci)
> +{
> + struct device *dev = pci->dev;

No need for this variable.

> +
> + /* Start LTSSM. */
> + fu740_pcie_ltssm_enable(dev);
> + return 0;
> +}
> +
> +static int fu740_pcie_host_init(struct pcie_port *pp)
> +{
> + struct dw_pcie *pci = to_dw_pcie_from_pp(pp);
> + struct fu740_pcie *afp = to_fu740_pcie(pci);
> + struct device *dev = pci->dev;
> + int ret;
> +
> + /* Power on reset */
> + fu740_pcie_drive_reset(afp);
> +
> + /* Enable pcieauxclk */
> + ret = clk_prepare_enable(afp->pcie_aux);
> + if (ret)
> + dev_err(dev, "unable to enable pcie_aux clock\n");

No bailing out ? Without a clock, is this going to work ?
If that is not a problem, then I would suggest a dev_warn() here.

> +
> + /*
> +  * Assert hold_phy_rst (hold the controller LTSSM in reset after
> +  * power_up_rst_n for register programming with cr_para)
> +  */
> + writel_relaxed(0x1, afp->mgmt_base + PCIEX8MGMT_APP_HOLD_PHY_RST);
> +
> + /* Deassert power_up_rst_n */
> + ret = reset_control_deassert(afp->rst);
> + if (ret)
> + dev_err(dev, "unable to deassert pcie_power_up_rst_n\n");

Same as above.

> +
> + fu740_pcie_init_phy(afp);
> +
> + /* Disable pcieauxclk */
> + clk_disable_unprepare(afp->pcie_aux);
> + /* Clear hold_phy_rst */
> + writel_relaxed(0x0, afp->mgmt_base + PCIEX8MGMT_APP_HOLD_PHY_RST);
> + /* Enable pcieauxclk */
> + ret = clk_prepare_enable(afp->pcie_aux);
> + /* Set RC mode */
> + writel_relaxed(0x4, afp->mgmt_base + PCIEX8MGMT_DEVICE_TYPE);
> +
> + return 0;
> +}
> +
> +static const struct dw_pcie_host_ops fu740_pcie_host_ops = {
> + .host_init = fu740_pcie_host_init,
> +};
> +
> +static const struct dw_pcie_ops dw_pcie_ops = {
> + .start_link = fu740_pcie_start_link,
> +};
> +
> +static int fu740_pcie_probe(struct platform_device *pdev)
> +{
> + struct device *dev = >dev;
> + struct dw_pcie *pci;
> + struct fu740_pcie *afp;
> + int ret;
> +
> + afp = devm_kzalloc(dev, sizeof(*afp), GFP_KERNEL);
> + if (!afp)
> + return -ENOMEM;
> + pci = >pci;
> + pci->dev = dev;
> + pci->ops = _pcie_ops;
> + pci->pp.ops = _pcie_host_ops;
> +
> + /* SiFive specific region: mgmt */
> + afp->mgmt_base = devm_platform_ioremap_resource_byname(pdev, "mgmt");
> + if (IS_ERR(afp->mgmt_base))
> + return PTR_ERR(afp->mgmt_base);
> +
> + /* Fetch GPIOs */
> + afp->reset = devm_gpiod_get_optional(dev, "reset-gpios", GPIOD_OUT_LOW);
> + if (IS_ERR(afp->reset)) {
> + dev_err(dev, "unable to get reset-gpios\n");
> + return ret;
> + }
> + afp->pwren = devm_gpiod_get_optional(dev, "pwren-gpios", GPIOD_OUT_LOW);
> + if (IS_ERR(afp->pwren)) {
> + dev_err(dev, "unable to get pwren-gpios\n");
> + return ret;

Why not return dev_err_probe(...); ? Same for the returns above.

> + }
> +
> + /* Fetch clocks */
> + afp->pcie_aux = devm_clk_get(dev, "pcie_aux");
> + if (IS_ERR(afp->pcie_aux))
> + return dev_err_probe(dev, PTR_ERR(afp->pcie_aux),
> +  "pcie_aux clock source missing or 
> invalid\n");
> +
> + /* Fetch reset */
> + afp->rst = devm_reset_control_get_exclusive(dev, NULL);
> + if (IS_ERR(afp->rst))
> + return dev_err_probe(dev, PTR_ERR(afp->rst), "unable to get 
> reset\n");
> +
> + platform_set_drvdata(pdev, afp);
> +
> + ret = dw_pcie_host_init(>pp);
> + if (ret < 0)
> + return ret;

You can simplify this with a simple:

return dw_pcie_host_init(>pp);

> +
> + return 0;
> +}
> +
> +static void fu740_pcie_shutdown(struct platform_device *pdev)
> +{
> + struct fu740_pcie *afp = platform_get_drvdata(pdev);
> +
> + /* Bring down link, so bootloader gets clean state in case of reboot */
> + fu740_pcie_assert_reset(afp);
> +}
> +
> +static const struct of_device_id fu740_pcie_of_match[] = {
> + { .compatible = "sifive,fu740-pcie", },
> + {},
> +};
> +
> +static struct platform_driver fu740_pcie_driver = {
> + .driver = {
> +.name = "fu740-pcie",
> +.of_match_table = fu740_pcie_of_match,
> +.suppress_bind_attrs = true,
> + },
> + .probe = fu740_pcie_probe,
> + .shutdown = fu740_pcie_shutdown,
> +};
> +
> +builtin_platform_driver(fu740_pcie_driver);
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH] btrfs: Fix a typo

2021-03-25 Thread Damien Le Moal
On 2021/03/26 10:02, Bhaskar Chowdhury wrote:
> 
> s/reponsible/responsible/
> 
> Signed-off-by: Bhaskar Chowdhury 
> ---
>  fs/btrfs/scrub.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
> index 3d9088eab2fc..14de898967bf 100644
> --- a/fs/btrfs/scrub.c
> +++ b/fs/btrfs/scrub.c
> @@ -2426,7 +2426,7 @@ static void drop_csum_range(struct scrub_ctx *sctx, 
> struct btrfs_ordered_sum *su
>   * the csum into @csum.
>   *
>   * The search source is sctx->csum_list, which is a pre-populated list
> - * storing bytenr ordered csum ranges.  We're reponsible to cleanup any range
> + * storing bytenr ordered csum ranges.  We're responsible to cleanup any 
> range

If you are at fixing typos, you may as well fix the grammar at the same time :)

We're responsible to cleanup... -> We're responsible for cleaning up...

>   * that is before @logical.
>   *
>   * Return 0 if there is no csum for the range.
> --
> 2.26.2
> 
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH] drivers: pinctrl: Remove duplicate include of io.h

2021-03-22 Thread Damien Le Moal
On 2021/03/23 10:38, Wan Jiabing wrote:
> linux/io.h has been included at line 6, so remove the 
> duplicate include at line 18.
> 
> Signed-off-by: Wan Jiabing 
> ---
>  drivers/pinctrl/pinctrl-k210.c | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/drivers/pinctrl/pinctrl-k210.c b/drivers/pinctrl/pinctrl-k210.c
> index 8a733cf77ba0..f831526d06ff 100644
> --- a/drivers/pinctrl/pinctrl-k210.c
> +++ b/drivers/pinctrl/pinctrl-k210.c
> @@ -15,7 +15,6 @@
>  #include 
>  #include 
>  #include 
> -#include 
>  
>  #include 
>  
> 

Good catch !

Reviewed-by: Damien Le Moal 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH] zonefs: fix to update .i_wr_refcnt correctly in zonefs_open_zone()

2021-03-16 Thread Damien Le Moal
On 2021/03/16 21:30, Chao Yu wrote:
> In zonefs_open_zone(), if opened zone count is larger than
> .s_max_open_zones threshold, we missed to recover .i_wr_refcnt,
> fix this.
> 
> Fixes: b5c00e975779 ("zonefs: open/close zone on file open/close")
> Signed-off-by: Chao Yu 
> ---
>  fs/zonefs/super.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
> index 0fe76f376dee..be6b99f7de74 100644
> --- a/fs/zonefs/super.c
> +++ b/fs/zonefs/super.c
> @@ -966,8 +966,7 @@ static int zonefs_open_zone(struct inode *inode)
>  
>   mutex_lock(>i_truncate_mutex);
>  
> - zi->i_wr_refcnt++;
> - if (zi->i_wr_refcnt == 1) {
> + if (zi->i_wr_refcnt == 0) {

Nit: if (!zi->i_wr_refcnt) ? I can change that when applying.

>  
>   if (atomic_inc_return(>s_open_zones) > 
> sbi->s_max_open_zones) {
>   atomic_dec(>s_open_zones);
> @@ -978,7 +977,6 @@ static int zonefs_open_zone(struct inode *inode)
>   if (i_size_read(inode) < zi->i_max_size) {
>   ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN);
>   if (ret) {
> - zi->i_wr_refcnt--;
>   atomic_dec(>s_open_zones);
>   goto unlock;
>   }
> @@ -986,6 +984,8 @@ static int zonefs_open_zone(struct inode *inode)
>   }
>   }
>  
> + zi->i_wr_refcnt++;
> +
>  unlock:
>   mutex_unlock(>i_truncate_mutex);
>  
> 

Good catch ! Will apply this and check zonefs test suite as this bug went
undetected.

Thanks.

-- 
Damien Le Moal
Western Digital Research


Re: [PATCH -next 2/5] block: add ioctl to read the disk sequence number

2021-03-15 Thread Damien Le Moal
On 2021/03/16 5:14, Matthew Wilcox wrote:
> On Mon, Mar 15, 2021 at 09:02:39PM +0100, Matteo Croce wrote:
>> +++ b/include/uapi/linux/fs.h
>> @@ -184,6 +184,7 @@ struct fsxattr {
>>  #define BLKSECDISCARD _IO(0x12,125)
>>  #define BLKROTATIONAL _IO(0x12,126)
>>  #define BLKZEROOUT _IO(0x12,127)
>> +#define BLKGETDISKSEQ _IOR(0x12,128,__u64)
>>  /*
>>   * A jump here: 130-131 are reserved for zoned block devices
>>   * (see uapi/linux/blkzoned.h)
> 
> Not your bug, but this is now 130-136.
> 
> +cc all the people who signed off on the commits that added those ioctl
> numbers without updating this comment.  Perhaps one of them will figure
> out how to stop this happening in future.
> 

Indeed. Will be more careful :)
And send a patch to fix this.

Thanks !

-- 
Damien Le Moal
Western Digital Research


Re: [PATCH] block: fix possible bd_size_lock deadlock

2021-03-12 Thread Damien Le Moal
On 2021/03/13 4:37, Jens Axboe wrote:
> On 3/11/21 5:11 AM, yanfei...@windriver.com wrote:
>> From: Yanfei Xu 
>>
>> bd_size_lock spinlock could be taken in block softirq, thus we should
>> disable the softirq before taking the lock.
>>
>> WARNING: inconsistent lock state
>> 5.12.0-rc2-syzkaller #0 Not tainted
>> 
>> inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-R} usage.
>> kworker/u4:0/7 [HC0[0]:SC1[1]:HE0:SE0] takes:
>> 8f87826c (>i_size_seqcount){+.+-}-{0:0}, at:
>> end_bio_bh_io_sync+0x38/0x54 fs/buffer.c:3006
>> {SOFTIRQ-ON-W} state was registered at:
>>   lock_acquire.part.0+0xf0/0x41c kernel/locking/lockdep.c:5510
>>   lock_acquire+0x6c/0x74 kernel/locking/lockdep.c:5483
>>   do_write_seqcount_begin_nested include/linux/seqlock.h:520 [inline]
>>   do_write_seqcount_begin include/linux/seqlock.h:545 [inline]
>>   i_size_write include/linux/fs.h:863 [inline]
>>   set_capacity+0x13c/0x1f8 block/genhd.c:50
>>   brd_alloc+0x130/0x180 drivers/block/brd.c:401
>>   brd_init+0xcc/0x1e0 drivers/block/brd.c:500
>>   do_one_initcall+0x8c/0x59c init/main.c:1226
>>   do_initcall_level init/main.c:1299 [inline]
>>   do_initcalls init/main.c:1315 [inline]
>>   do_basic_setup init/main.c:1335 [inline]
>>   kernel_init_freeable+0x2cc/0x330 init/main.c:1537
>>   kernel_init+0x10/0x120 init/main.c:1424
>>   ret_from_fork+0x14/0x20 arch/arm/kernel/entry-common.S:158
>>   0x0
>> irq event stamp: 2783413
>> hardirqs last  enabled at (2783412): [<802011ec>]
>> __do_softirq+0xf4/0x7ac kernel/softirq.c:329
>> hardirqs last disabled at (2783413): [<8277d260>]
>> __raw_read_lock_irqsave include/linux/rwlock_api_smp.h:157 [inline]
>> hardirqs last disabled at (2783413): [<8277d260>]
>> _raw_read_lock_irqsave+0x84/0x88 kernel/locking/spinlock.c:231
>> softirqs last  enabled at (2783410): [<826b5050>] spin_unlock_bh
>> include/linux/spinlock.h:399 [inline]
>> softirqs last  enabled at (2783410): [<826b5050>]
>> batadv_nc_purge_paths+0x10c/0x148 net/batman-adv/network-coding.c:467
>> softirqs last disabled at (2783411): [<8024ddfc>] do_softirq_own_stack
>> include/asm-generic/softirq_stack.h:10 [inline]
>> softirqs last disabled at (2783411): [<8024ddfc>] do_softirq
>> kernel/softirq.c:248 [inline]
>> softirqs last disabled at (2783411): [<8024ddfc>] do_softirq+0xd8/0xe4
>> kernel/softirq.c:235
>>
>> other info that might help us debug this:
>>  Possible unsafe locking scenario:
>>
>>CPU0
>>
>>   lock(>i_size_seqcount);
>>   
>> lock(>i_size_seqcount);
>>
>>  *** DEADLOCK ***
>>
>> 3 locks held by kworker/u4:0/7:
>>  #0: 88c622a8 ((wq_completion)bat_events){+.+.}-{0:0}, at: set_work_data
>> kernel/workqueue.c:615 [inline]
>>  #0: 88c622a8 ((wq_completion)bat_events){+.+.}-{0:0}, at:
>> set_work_pool_and_clear_pending kernel/workqueue.c:643 [inline]
>>  #0: 88c622a8 ((wq_completion)bat_events){+.+.}-{0:0}, at:
>> process_one_work+0x214/0x998 kernel/workqueue.c:2246
>>  #1: 85147ef8
>> ((work_completion)(&(_priv->nc.work)->work)){+.+.}-{0:0}, at:
>> set_work_data kernel/workqueue.c:615 [inline]
>>  #1: 85147ef8
>> ((work_completion)(&(_priv->nc.work)->work)){+.+.}-{0:0}, at:
>> set_work_pool_and_clear_pending kernel/workqueue.c:643 [inline]
>>  #1: 85147ef8
>> ((work_completion)(&(_priv->nc.work)->work)){+.+.}-{0:0}, at:
>> process_one_work+0x214/0x998 kernel/workqueue.c:2246
>>  #2: 8f878010 (>size_lock){...-}-{2:2}, at:
>> ntfs_end_buffer_async_read+0x6c/0x558 fs/ntfs/aops.c:66
> 
> Damien? We have that revert queued up for this for 5.12, but looking
> at that, the state before that was kind of messy too.

Indeed... I was thinking about this and I think I am with Christoph on this:
drivers should not call set_capacity() from command completion context. I think
the best thing to do would be to fix the drivers that do that, but that may not be RC
material?

Looking into more details of this case, it is slightly different though.
set_capacity() is here not called from soft IRQ context. It looks like a regular
initialization, but one that seems way too early in the boot process when a
secondary core is being initialized with IRQ not yet enabled... I think. And the
warnings come from i_size_write() calling preempt_disable() rather than
set_capacity() use of spin_lock(>bd_size_lock).

I wonder how it is possible to have brd initialized so early.
I am not sure how to fix that. It looks like arm arch code territory.

For now, we could revert the revert as I do not think that Yanfei's patch is
enough since completions may be from hard IRQ context too, which is not covered
with the spin_lock_bh() variants (c.f. a similar problem we are facing with that
in scsi completion [1])
I do not have any good idea how to proceed though.

[1]
https://lore.kernel.org/linux-scsi/ph0pr04mb7416c8330459e92d8aa21a889b...@ph0pr04mb7416.namprd04.prod.outlook.com/T/#t

-- 
Damien Le Moal
Western Digital Research


Re: [PATCH] soc: canaan: Sort the Makefile alphabetically

2021-02-22 Thread Damien Le Moal
On 2021/02/23 11:19, Palmer Dabbelt wrote:
> From: Palmer Dabbelt 
> 
> The rest of these are alphabetically sorted, and leaving it this way
> causes a merge conflict.
> 
> Signed-off-by: Palmer Dabbelt 
> 
> ---
> 
> I missed this when reviewing these patches, but happened across it when
> test merging from Linus' tree.  It goes back a way so I'm hesitant to
> rebase this one out just for cleanliness, but if I have to go back that
> far before sending the merge window PR I'll squash it in.
> ---
>  drivers/soc/Makefile | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/soc/Makefile b/drivers/soc/Makefile
> index fa7071246546..34b23645be14 100644
> --- a/drivers/soc/Makefile
> +++ b/drivers/soc/Makefile
> @@ -7,6 +7,7 @@ obj-$(CONFIG_ARCH_ACTIONS)+= actions/
>  obj-y+= aspeed/
>  obj-$(CONFIG_ARCH_AT91)  += atmel/
>  obj-y+= bcm/
> +obj-$(CONFIG_SOC_CANAAN) += canaan/
>  obj-$(CONFIG_ARCH_DOVE)  += dove/
>  obj-$(CONFIG_MACH_DOVE)  += dove/
>  obj-y+= fsl/
> @@ -29,4 +30,3 @@ obj-$(CONFIG_ARCH_U8500)+= ux500/
>  obj-$(CONFIG_PLAT_VERSATILE) += versatile/
>  obj-y+= xilinx/
>  obj-$(CONFIG_ARCH_ZX)+= zte/
> -obj-$(CONFIG_SOC_CANAAN) += canaan/
> 

Yes. Should have sent that... Thanks.

Reviewed-by: Damien Le Moal 

-- 
Damien Le Moal
Western Digital Research


Re: [RFC PATCH v5 2/4] block: add simple copy support

2021-02-19 Thread Damien Le Moal
) * crange.nr_range)) {
> + ret = -EFAULT;
> + goto out;
> + }
> +
> + ret = blkdev_issue_copy(bdev, crange.nr_range, rlist, bdev, crange.dest,
> + GFP_KERNEL, flags);
> +out:
> + kfree(rlist);
> + return ret;
> +}
> +
>  static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
>   unsigned long arg)
>  {
> @@ -458,6 +489,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, 
> fmode_t mode,
>   case BLKSECDISCARD:
>   return blk_ioctl_discard(bdev, mode, arg,
>   BLKDEV_DISCARD_SECURE);
> + case BLKCOPY:
> + return blk_ioctl_copy(bdev, mode, arg, 0);
>   case BLKZEROOUT:
>   return blk_ioctl_zeroout(bdev, mode, arg);
>   case BLKREPORTZONE:
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index 1edda614f7ce..164313bdfb35 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -71,6 +71,7 @@ static inline bool bio_has_data(struct bio *bio)
>  static inline bool bio_no_advance_iter(const struct bio *bio)
>  {
>   return bio_op(bio) == REQ_OP_DISCARD ||
> +bio_op(bio) == REQ_OP_COPY ||
>  bio_op(bio) == REQ_OP_SECURE_ERASE ||
>  bio_op(bio) == REQ_OP_WRITE_SAME ||
>  bio_op(bio) == REQ_OP_WRITE_ZEROES;
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 866f74261b3b..5a35c02ac0a8 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -380,6 +380,8 @@ enum req_opf {
>   REQ_OP_ZONE_RESET   = 15,
>   /* reset all the zone present on the device */
>   REQ_OP_ZONE_RESET_ALL   = 17,
> + /* copy ranges within device */
> + REQ_OP_COPY = 19,
>  
>   /* SCSI passthrough using struct scsi_request */
>   REQ_OP_SCSI_IN  = 32,
> @@ -506,6 +508,11 @@ static inline bool op_is_discard(unsigned int op)
>   return (op & REQ_OP_MASK) == REQ_OP_DISCARD;
>  }
>  
> +static inline bool op_is_copy(unsigned int op)
> +{
> + return (op & REQ_OP_MASK) == REQ_OP_COPY;
> +}
> +
>  /*
>   * Check if a bio or request operation is a zone management operation, with
>   * the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special case
> @@ -565,4 +572,11 @@ struct blk_rq_stat {
>   u64 batch;
>  };
>  
> +struct blk_copy_payload {
> + sector_tdest;
> + int copy_nr_ranges;
> + struct block_device *src_bdev;
> + struct  range_entry range[];
> +};
> +
>  #endif /* __LINUX_BLK_TYPES_H */
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 699ace6b25ff..2bb4513d4bb8 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -337,10 +337,14 @@ struct queue_limits {
>   unsigned intmax_zone_append_sectors;
>   unsigned intdiscard_granularity;
>   unsigned intdiscard_alignment;
> + unsigned intcopy_offload;
> + unsigned intmax_copy_sectors;
>  
>   unsigned short  max_segments;
>   unsigned short  max_integrity_segments;
>   unsigned short  max_discard_segments;
> + unsigned short  max_copy_range_sectors;
> + unsigned short  max_copy_nr_ranges;
>  
>   unsigned char   misaligned;
>   unsigned char   discard_misaligned;
> @@ -621,6 +625,7 @@ struct request_queue {
>  #define QUEUE_FLAG_RQ_ALLOC_TIME 27  /* record rq->alloc_time_ns */
>  #define QUEUE_FLAG_HCTX_ACTIVE   28  /* at least one blk-mq hctx is 
> active */
>  #define QUEUE_FLAG_NOWAIT   29   /* device supports NOWAIT */
> +#define QUEUE_FLAG_SIMPLE_COPY   30  /* supports simple copy */
>  
>  #define QUEUE_FLAG_MQ_DEFAULT((1 << QUEUE_FLAG_IO_STAT) |
> \
>(1 << QUEUE_FLAG_SAME_COMP) |  \
> @@ -643,6 +648,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, 
> struct request_queue *q);
>  #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
>  #define blk_queue_add_random(q)  test_bit(QUEUE_FLAG_ADD_RANDOM, 
> &(q)->queue_flags)
>  #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
> +#define blk_queue_copy(q)test_bit(QUEUE_FLAG_SIMPLE_COPY, 
> &(q)->queue_flags)
>  #define blk_queue_zone_resetall(q)   \
>   test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags)
>  #define blk_queue_secure_erase(q) \
> @@ -1069,6 +1075,9 @@ static inline unsigned int 
> blk_queue_get_max_sectors(struct request_queue *q,
>   return min(q->limits.max_discard_sectors,
>  UINT_MAX >> SECTOR_SHIFT);
>  
> + if (unlikely(op == REQ_OP_COPY))
> + return q->limits.max_copy_sectors;
> +

I would agreee with this if a copy BIO was always a single range, but that is
not the case. So I am not sure this makes sense at all.

>   if (unlikely(op == REQ_OP_WRITE_SAME))
>   return q->limits.max_write_same_sectors;
>  
> @@ -1343,6 +1352,12 @@ extern int __blkdev_issue_discard(struct block_device 
> *bdev, sector_t sector,
>   sector_t nr_sects, gfp_t gfp_mask, int flags,
>   struct bio **biop);
>  
> +#define BLKDEV_COPY_NOEMULATION  (1 << 0)/* do not emulate if 
> copy offload not supported */
> +
> +extern int blkdev_issue_copy(struct block_device *src_bdev, int nr_srcs,
> + struct range_entry *src_rlist, struct block_device *dest_bdev,
> + sector_t dest, gfp_t gfp_mask, int flags);

No need for extern.

> +
>  #define BLKDEV_ZERO_NOUNMAP  (1 << 0)  /* do not free blocks */
>  #define BLKDEV_ZERO_NOFALLBACK   (1 << 1)  /* don't write explicit 
> zeroes */
>  
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index f44eb0a04afd..5cadb176317a 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -64,6 +64,18 @@ struct fstrim_range {
>   __u64 minlen;
>  };
>  
> +struct range_entry {
> + __u64 src;
> + __u64 len;
> +};
> +
> +struct copy_range {
> + __u64 dest;
> + __u64 nr_range;
> + __u64 range_list;
> + __u64 rsvd;
> +};
> +
>  /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions 
> */
>  #define FILE_DEDUPE_RANGE_SAME   0
>  #define FILE_DEDUPE_RANGE_DIFFERS1
> @@ -184,6 +196,7 @@ struct fsxattr {
>  #define BLKSECDISCARD _IO(0x12,125)
>  #define BLKROTATIONAL _IO(0x12,126)
>  #define BLKZEROOUT _IO(0x12,127)
> +#define BLKCOPY _IOWR(0x12, 128, struct copy_range)
>  /*
>   * A jump here: 130-131 are reserved for zoned block devices
>   * (see uapi/linux/blkzoned.h)
> 

Please test your code more thoroughly. It is full of problems that you should
have detected with better testing, including RO devices, partitions, and error
path coverage.

-- 
Damien Le Moal
Western Digital Research


Re: [RFC PATCH 02/34] block: introduce and use bio_new

2021-01-27 Thread Damien Le Moal
On 2021/01/28 16:21, Damien Le Moal wrote:
> On 2021/01/28 16:12, Chaitanya Kulkarni wrote:
>> Introduce bio_new() helper and use it in blk-lib.c to allocate and
>> initialize various non-optional or semi-optional members of the bio
>> along with bio allocation done with bio_alloc(). Here we also calmp the
>> max_bvecs for bio with BIO_MAX_PAGES before we pass to bio_alloc().
>>
>> Signed-off-by: Chaitanya Kulkarni 
>> ---
>>  block/blk-lib.c |  6 +-
>>  include/linux/bio.h | 25 +
>>  2 files changed, 26 insertions(+), 5 deletions(-)
>>
>> diff --git a/block/blk-lib.c b/block/blk-lib.c
>> index fb486a0bdb58..ec29415f00dd 100644
>> --- a/block/blk-lib.c
>> +++ b/block/blk-lib.c
>> @@ -14,17 +14,13 @@ struct bio *blk_next_bio(struct bio *bio, struct 
>> block_device *bdev,
>>  sector_t sect, unsigned op, unsigned opf,
>>  unsigned int nr_pages, gfp_t gfp)
>>  {
>> -struct bio *new = bio_alloc(gfp, nr_pages);
>> +struct bio *new = bio_new(bdev, sect, op, opf, gfp, nr_pages);
>>  
>>  if (bio) {
>>  bio_chain(bio, new);
>>  submit_bio(bio);
>>  }
>>  
>> -new->bi_iter.bi_sector = sect;
>> -bio_set_dev(new, bdev);
>> -bio_set_op_attrs(new, op, opf);
>> -
>>  return new;
>>  }
>>  
>> diff --git a/include/linux/bio.h b/include/linux/bio.h
>> index c74857cf1252..2a09ba100546 100644
>> --- a/include/linux/bio.h
>> +++ b/include/linux/bio.h
>> @@ -826,5 +826,30 @@ static inline void bio_set_polled(struct bio *bio, 
>> struct kiocb *kiocb)
>>  if (!is_sync_kiocb(kiocb))
>>  bio->bi_opf |= REQ_NOWAIT;
>>  }
>> +/**
>> + * bio_new -allcate and initialize new bio
>> + * @bdev:   blockdev to issue discard for
>> + * @sector: start sector
>> + * @op: REQ_OP_XXX from enum req_opf
>> + * @op_flags:   REQ_XXX from enum req_flag_bits
>> + * @max_bvecs:  maximum bvec to be allocated for this bio
>> + * @gfp_mask:   memory allocation flags (for bio_alloc)
>> + *
>> + * Description:
>> + *Allocates, initializes common members, and returns a new bio.
>> + */
>> +static inline struct bio *bio_new(struct block_device *bdev, sector_t 
>> sector,
>> +  unsigned int op, unsigned int op_flags,
>> +  unsigned int max_bvecs, gfp_t gfp_mask)
>> +{
>> +unsigned nr_bvec = clamp_t(unsigned int, max_bvecs, 0, BIO_MAX_PAGES);
>> +struct bio *bio = bio_alloc(gfp_mask, nr_bvec);
> 
> I think that depending on the gfp_mask passed, bio can be NULL. So this should
> be checked.
> 
>> +
>> +bio_set_dev(bio, bdev);
>> +bio->bi_iter.bi_sector = sector;
>> +bio_set_op_attrs(bio, op, op_flags);
> 
> This function is obsolete. Open code this.

And that also mean that you could remove one argument to bio_new(): combine op
and op_flags into "unsigned int opf"

> 
>> +
>> +return bio;
>> +}
>>  
>>  #endif /* __LINUX_BIO_H */
>>
> 
> 


-- 
Damien Le Moal
Western Digital Research


Re: [RFC PATCH 28/34] zonefs: use bio_new

2021-01-27 Thread Damien Le Moal
On 2021/01/28 16:15, Chaitanya Kulkarni wrote:
> Signed-off-by: Chaitanya Kulkarni 
> ---
>  fs/zonefs/super.c | 6 ++
>  1 file changed, 2 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
> index ab68e27bb322..620d67965a22 100644
> --- a/fs/zonefs/super.c
> +++ b/fs/zonefs/super.c
> @@ -661,6 +661,7 @@ static const struct iomap_dio_ops zonefs_write_dio_ops = {
>  
>  static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter 
> *from)
>  {
> + unsigned int op = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;

I do not see the point of adding this variable since it is used only for the
bio_new() call. Pass the op value directly.

>   struct inode *inode = file_inode(iocb->ki_filp);
>   struct zonefs_inode_info *zi = ZONEFS_I(inode);
>   struct block_device *bdev = inode->i_sb->s_bdev;
> @@ -678,15 +679,12 @@ static ssize_t zonefs_file_dio_append(struct kiocb 
> *iocb, struct iov_iter *from)
>   if (!nr_pages)
>   return 0;
>  
> - bio = bio_alloc(GFP_NOFS, nr_pages);
> + bio = bio_new(bdev, zi->i_zsector, op, 0, GFP_NOFS, nr_pages);
>   if (!bio)
>   return -ENOMEM;
>  
> - bio_set_dev(bio, bdev);
> - bio->bi_iter.bi_sector = zi->i_zsector;
>   bio->bi_write_hint = iocb->ki_hint;
>   bio->bi_ioprio = iocb->ki_ioprio;
> - bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
>   if (iocb->ki_flags & IOCB_DSYNC)
>   bio->bi_opf |= REQ_FUA;
>  
> 


-- 
Damien Le Moal
Western Digital Research


Re: [RFC PATCH 02/34] block: introduce and use bio_new

2021-01-27 Thread Damien Le Moal
On 2021/01/28 16:12, Chaitanya Kulkarni wrote:
> Introduce bio_new() helper and use it in blk-lib.c to allocate and
> initialize various non-optional or semi-optional members of the bio
> along with bio allocation done with bio_alloc(). Here we also calmp the
> max_bvecs for bio with BIO_MAX_PAGES before we pass to bio_alloc().
> 
> Signed-off-by: Chaitanya Kulkarni 
> ---
>  block/blk-lib.c |  6 +-
>  include/linux/bio.h | 25 +
>  2 files changed, 26 insertions(+), 5 deletions(-)
> 
> diff --git a/block/blk-lib.c b/block/blk-lib.c
> index fb486a0bdb58..ec29415f00dd 100644
> --- a/block/blk-lib.c
> +++ b/block/blk-lib.c
> @@ -14,17 +14,13 @@ struct bio *blk_next_bio(struct bio *bio, struct 
> block_device *bdev,
>   sector_t sect, unsigned op, unsigned opf,
>   unsigned int nr_pages, gfp_t gfp)
>  {
> - struct bio *new = bio_alloc(gfp, nr_pages);
> + struct bio *new = bio_new(bdev, sect, op, opf, gfp, nr_pages);
>  
>   if (bio) {
>   bio_chain(bio, new);
>   submit_bio(bio);
>   }
>  
> - new->bi_iter.bi_sector = sect;
> - bio_set_dev(new, bdev);
> - bio_set_op_attrs(new, op, opf);
> -
>   return new;
>  }
>  
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index c74857cf1252..2a09ba100546 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -826,5 +826,30 @@ static inline void bio_set_polled(struct bio *bio, 
> struct kiocb *kiocb)
>   if (!is_sync_kiocb(kiocb))
>   bio->bi_opf |= REQ_NOWAIT;
>  }
> +/**
> + * bio_new - allocate and initialize new bio
> + * @bdev:blockdev to issue discard for
> + * @sector:  start sector
> + * @op:  REQ_OP_XXX from enum req_opf
> + * @op_flags:REQ_XXX from enum req_flag_bits
> + * @max_bvecs:   maximum bvec to be allocated for this bio
> + * @gfp_mask:memory allocation flags (for bio_alloc)
> + *
> + * Description:
> + *Allocates, initializes common members, and returns a new bio.
> + */
> +static inline struct bio *bio_new(struct block_device *bdev, sector_t sector,
> +   unsigned int op, unsigned int op_flags,
> +   unsigned int max_bvecs, gfp_t gfp_mask)
> +{
> + unsigned nr_bvec = clamp_t(unsigned int, max_bvecs, 0, BIO_MAX_PAGES);
> + struct bio *bio = bio_alloc(gfp_mask, nr_bvec);

I think that depending on the gfp_mask passed, bio can be NULL. So this should
be checked.

> +
> + bio_set_dev(bio, bdev);
> + bio->bi_iter.bi_sector = sector;
> + bio_set_op_attrs(bio, op, op_flags);

This function is obsolete. Open code this.

> +
> + return bio;
> +}
>  
>  #endif /* __LINUX_BIO_H */
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH v3 1/2] bio: limit bio max size

2021-01-26 Thread Damien Le Moal
On 2021/01/27 9:36, Changheun Lee wrote:
>>> +
>>>  /**
>>>   * bio_reset - reinitialize a bio
>>>   * @bio:   bio to reset
>>> @@ -877,7 +892,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page 
>>> *page,
>>> struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
>>>  
>>> if (page_is_mergeable(bv, page, len, off, same_page)) {
>>> -   if (bio->bi_iter.bi_size > UINT_MAX - len) {
>>> +   if (bio->bi_iter.bi_size > bio_max_size(bio) - len) {
>>> *same_page = false;
>>> return false;
>>> }
>>> diff --git a/include/linux/bio.h b/include/linux/bio.h
>>> index 1edda614f7ce..cdb134ca7bf5 100644
>>> --- a/include/linux/bio.h
>>> +++ b/include/linux/bio.h
>>> @@ -100,6 +100,8 @@ static inline void *bio_data(struct bio *bio)
>>> return NULL;
>>>  }
>>>  
>>> +extern unsigned int bio_max_size(struct bio *);
>>
>> No need for extern.
> 
> It's just for compile warning in my test environment.
> I'll remove it too. But I think compile warning could be in the other
> .c file which includes bio.h. Is it OK?

Hmmm... not having extern should not generate a compilation warning. There are
tons of functions declared without extern in header files in the kernel. What
compiler are you using ?


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH v3 1/2] bio: limit bio max size

2021-01-26 Thread Damien Le Moal
x_vecs)
>   return true;
>  
> - if (bio->bi_iter.bi_size > UINT_MAX - len)
> + if (bio->bi_iter.bi_size > bio_max_size(bio) - len)
>   return true;
>  
>   return false;
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index f94ee3089e01..3aeab9e7e97b 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -621,6 +621,7 @@ struct request_queue {
>  #define QUEUE_FLAG_RQ_ALLOC_TIME 27  /* record rq->alloc_time_ns */
>  #define QUEUE_FLAG_HCTX_ACTIVE   28  /* at least one blk-mq hctx is 
> active */
>  #define QUEUE_FLAG_NOWAIT   29   /* device supports NOWAIT */
> +#define QUEUE_FLAG_LIMIT_BIO_SIZE 30 /* limit bio size */
>  
>  #define QUEUE_FLAG_MQ_DEFAULT((1 << QUEUE_FLAG_IO_STAT) |
> \
>(1 << QUEUE_FLAG_SAME_COMP) |  \
> @@ -667,6 +668,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, 
> struct request_queue *q);
>  #define blk_queue_fua(q) test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags)
>  #define blk_queue_registered(q)  test_bit(QUEUE_FLAG_REGISTERED, 
> &(q)->queue_flags)
>  #define blk_queue_nowait(q)  test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags)
> +#define blk_queue_limit_bio_size(q)  \
> + test_bit(QUEUE_FLAG_LIMIT_BIO_SIZE, &(q)->queue_flags)
>  
>  extern void blk_set_pm_only(struct request_queue *q);
>  extern void blk_clear_pm_only(struct request_queue *q);
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH v3 1/2] bio: limit bio max size

2021-01-26 Thread Damien Le Moal
On 2021/01/26 15:07, Ming Lei wrote:
> On Tue, Jan 26, 2021 at 04:06:06AM +0000, Damien Le Moal wrote:
>> On 2021/01/26 12:58, Ming Lei wrote:
>>> On Tue, Jan 26, 2021 at 10:32:34AM +0900, Changheun Lee wrote:
>>>> bio size can grow up to 4GB when multi-page bvec is enabled.
>>>> but sometimes it would lead to inefficient behaviors.
>>>> in case of large chunk direct I/O, - 32MB chunk read in user space -
>>>> all pages for 32MB would be merged to a bio structure if the pages
>>>> physical addresses are contiguous. it makes some delay to submit
>>>> until merge complete. bio max size should be limited to a proper size.
>>>>
>>>> When 32MB chunk read with direct I/O option is coming from userspace,
>>>> kernel behavior is below now. it's timeline.
>>>>
>>>>  | bio merge for 32MB. total 8,192 pages are merged.
>>>>  | total elapsed time is over 2ms.
>>>>  |-- ... --->|
>>>>  | 8,192 pages merged a 
>>>> bio.
>>>>  | at this time, first bio 
>>>> submit is done.
>>>>  | 1 bio is split to 32 
>>>> read request and issue.
>>>>  |--->
>>>>   |--->
>>>>|--->
>>>>   ..
>>>>
>>>> |--->
>>>> 
>>>> |--->|
>>>>   total 19ms elapsed to complete 32MB read done 
>>>> from device. |
>>>>
>>>> If bio max size is limited with 1MB, behavior is changed below.
>>>>
>>>>  | bio merge for 1MB. 256 pages are merged for each bio.
>>>>  | total 32 bio will be made.
>>>>  | total elapsed time is over 2ms. it's same.
>>>>  | but, first bio submit timing is fast. about 100us.
>>>>  |--->|--->|--->|---> ... -->|--->|--->|--->|--->|
>>>>   | 256 pages merged a bio.
>>>>   | at this time, first bio submit is done.
>>>>   | and 1 read request is issued for 1 bio.
>>>>   |--->
>>>>|--->
>>>> |--->
>>>>   ..
>>>>  |--->
>>>>   |--->|
>>>> total 17ms elapsed to complete 32MB read done from device. |
>>>>
>>>> As a result, read request issue timing is faster if bio max size is 
>>>> limited.
>>>> Current kernel behavior with multipage bvec, super large bio can be 
>>>> created.
>>>> And it lead to delay first I/O request issue.
>>>>
>>>> Signed-off-by: Changheun Lee 
>>>> ---
>>>>  block/bio.c| 17 -
>>>>  include/linux/bio.h|  4 +++-
>>>>  include/linux/blkdev.h |  3 +++
>>>>  3 files changed, 22 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/block/bio.c b/block/bio.c
>>>> index 1f2cc1fbe283..ec0281889045 100644
>>>> --- a/block/bio.c
>>>> +++ b/block/bio.c
>>>> @@ -287,6 +287,21 @@ void bio_init(struct bio *bio, struct bio_vec *table,
>>>>  }
>>>>  EXPORT_SYMBOL(bio_init);
>>>>  
>>>> +unsigned int bio_max_size(struct bio *bio)
>>>> +{
>>>> +  struct request_queue *q;
>>>> +
>>>> +  if (!bio->bi_disk)
>>>> +  return UINT_MAX;
>>>> +
>>>> +  q = bio->bi_disk->queue;
>>>> +  if (!blk_queue_limit_bio_size(q))
>>>> +  return UINT_MAX;
>>>> +
>>>> +  return blk_queue_get_max_sectors(q, bio_op(bio)) << SECTOR_SHIFT;
>>>> +}
>>>> +EXPORT_SYMBOL(bio_max_size);
>>>> +
>>>>  /**
>>>>   * bio_reset - reinitialize a bio
>>>>   * @bio: 

Re: [PATCH v3 1/2] bio: limit bio max size

2021-01-26 Thread Damien Le Moal
t;  
> - ret = bio_iov_iter_get_pages(&bio, iter);
> + ret = bio_iov_iter_get_pages(&bio, iter, true);
>   if (unlikely(ret))
>   goto out;
>   ret = bio.bi_iter.bi_size;
> @@ -397,7 +397,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter 
> *iter, int nr_pages)
>   bio->bi_end_io = blkdev_bio_end_io;
>   bio->bi_ioprio = iocb->ki_ioprio;
>  
> - ret = bio_iov_iter_get_pages(bio, iter);
> + ret = bio_iov_iter_get_pages(bio, iter, is_sync);
>   if (unlikely(ret)) {
>   bio->bi_status = BLK_STS_IOERR;
>   bio_endio(bio);
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index ea1e8f696076..5105982a9bf8 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -277,7 +277,8 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, 
> loff_t length,
>   bio->bi_private = dio;
>   bio->bi_end_io = iomap_dio_bio_end_io;
>  
> - ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
> + ret = bio_iov_iter_get_pages(bio, dio->submit.iter,
> + is_sync_kiocb(dio->iocb));
>   if (unlikely(ret)) {
>   /*
>* We have to stop part way through an IO. We must fall
> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
> index bec47f2d074b..c95ac37f9305 100644
> --- a/fs/zonefs/super.c
> +++ b/fs/zonefs/super.c
> @@ -690,7 +690,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, 
> struct iov_iter *from)
>   if (iocb->ki_flags & IOCB_DSYNC)
>   bio->bi_opf |= REQ_FUA;
>  
> - ret = bio_iov_iter_get_pages(bio, from);
> + ret = bio_iov_iter_get_pages(bio, from, is_sync_kiocb(iocb));
>   if (unlikely(ret))
>   goto out_release;
>  
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index 676870b2c88d..fa3a503b955c 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -472,7 +472,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page 
> *page,
>   unsigned int len, unsigned int off, bool *same_page);
>  void __bio_add_page(struct bio *bio, struct page *page,
>   unsigned int len, unsigned int off);
> -int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
> +int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter, bool 
> sync);
>  void bio_release_pages(struct bio *bio, bool mark_dirty);
>  extern void bio_set_pages_dirty(struct bio *bio);
>  extern void bio_check_pages_dirty(struct bio *bio);
> 
> 
> Thanks,
> Ming
> 
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH 1/1] scsi: sd: use max_xfer_blocks for set rw_max if max_xfer_blocks is available

2021-01-21 Thread Damien Le Moal
validate_max_xfer_size(struct scsi_disk *sdkp,
>> const char *name,
>> unsigned int xfer_blocks,
>> unsigned int dev_max)
>>
>> To allow checking both opt_xfer_blocks and max_xfer_blocks ?
>>
>>>> +
>>>> /*
>>>> * Determine the device's preferred I/O size for reads and writes
>>>> * unless the reported value is unreasonably small, large, not a
>>>> @@ -3233,12 +3280,13 @@ static int sd_revalidate_disk(struct gendisk *disk)
>>>>
>>>> /* Initial block count limit based on CDB TRANSFER LENGTH field size. */
>>>> dev_max = sdp->use_16_for_rw ? SD_MAX_XFER_BLOCKS : SD_DEF_XFER_BLOCKS;
>>
>> This looks weird: no indentation. Care to resend ?
>>
>>>> -
>>>> -  /* Some devices report a maximum block count for READ/WRITE requests. */
>>>> -  dev_max = min_not_zero(dev_max, sdkp->max_xfer_blocks);
>>>> q->limits.max_dev_sectors = logical_to_sectors(sdp, dev_max);
>>>>
>>>> -  if (sd_validate_opt_xfer_size(sdkp, dev_max)) {
>>>> +  if (sd_validate_max_xfer_size(sdkp, dev_max)) {
>>>> +  q->limits.io_opt = 0;
>>>> +  rw_max = logical_to_sectors(sdp, sdkp->max_xfer_blocks);
>>>> +  q->limits.max_dev_sectors = rw_max;
>>>> +  } else if (sd_validate_opt_xfer_size(sdkp, dev_max)) {
>>
>> This does not look correct to me. This renders the device reported
>> opt_xfer_blocks useless.
>>
>> The unmodified code sets dev_max to the min of SD_MAX_XFER_BLOCKS or
>> SD_DEF_XFER_BLOCKS and of the device reported max_xfer_blocks. The result of
>> this is used as the device max_dev_sectors queue limit, which in turn is 
>> used to
>> set the max_hw_sectors queue limit accounting for the adapter limits too.
>>
>> opt_xfer_blocks, if it is valid, will be used to set the io_opt queue limit,
>> which is a hint. This hint is used to optimize the "soft" max_sectors command
>> limit used by the block layer to limit command size if the value of
>> opt_xfer_blocks is smaller than the limit initially set with max_xfer_blocks.
>>
>> So if for your device max_sectors end up being too small, it is likely 
>> because
>> the device itself is reporting an opt_xfer_blocks value that is too small for
>> its own good. The max_sectors limit can be manually increased with "echo xxx 
>> >
>> /sys/block/sdX/queue/max_sectors_kb". A udev rule can be used to handle this
>> autmatically if needed.
>>
>> But to get a saner default for that device, I do not think that this patch is
>> the right solution. Ideally, the device peculiarity should be handled with a
>> quirk, but that is not used in scsi. So beside the udev rule trick, I am not
>> sure what the right approach is here.
>>
> 
> This approach is for using sdkp->max_xfer_blocks as a rw_max.
> There are no way to use it now when sdkp->opt_xfer_blocks is valid.
> In my case, scsi device reports both of sdkp->max_xfer_blocks, and
> sdkp->opt_xfer_blocks.
> 
> How about set larger valid value between sdkp->max_xfer_blocks,
> and sdkp->opt_xfer_blocks to rw_max?

Again, if your device reports an opt_xfer_blocks value that is too small for its
own good, that is a problem with this device. The solution for that is not to
change something that will affect *all* other storage devices, including those
with a perfectly valid opt_xfer_blocks value.

I think that the solution should be at the LLD level, for that device only. But
I am not sure how to communicate a quirk for opt_xfer_blocks back to the generic
sd driver. You should explore a solution like that. Others may have ideas about
this too. Wait for more comments.

> 
>>>> q->limits.io_opt = logical_to_bytes(sdp, sdkp->opt_xfer_blocks);
>>>> rw_max = logical_to_sectors(sdp, sdkp->opt_xfer_blocks);
>>>> } else {
>>>> -- 
>>>> 2.29.0
>>>>
>>>>
>>>
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH v2] bio: limit bio max size

2021-01-21 Thread Damien Le Moal
se of devices that do not care about limiting the bio size
(like now), this will add one boolean evaluation (queue flag test). That's it.
For your case, sure you now have 2 boolean evals instead of one. But that must
be put in perspective with the cost of increasing the bio size.

> 
> bool __bio_try_merge_page(struct bio *bio, struct page *page,
>   unsigned int len, unsigned int off, bool *same_page)
> {
>   ...
>   if (page_is_mergeable(bv, page, len, off, same_page)) {
>   if (bio->bi_iter.bi_size > UINT_MAX - len) {
>   *same_page = false;
>   return false;
>   }
> 
> + if (blk_queue_limit_bio_max_size(bio) &&
> +(bio->bi_iter.bi_size >
> blk_queue_get_bio_max_size(bio) - len)) {
> + *same_page = false;
> + return false;
> + }
> 
>   bv->bv_len += len;
>   bio->bi_iter.bi_size += len;
>   return true;
>   }
>   ...
> }
> 
> 
> static inline bool bio_full(struct bio *bio, unsigned len)
> {
>   ...
>   if (bio->bi_iter.bi_size > UINT_MAX - len)
>   return true;
> 
> + if (blk_queue_limit_bio_max_size(bio) &&
> +(bio->bi_iter.bi_size > blk_queue_get_bio_max_size(bio) - len))
> + return true;
>   ...
> }
> 
> 
> Page merge is CPU-bound job as you said.
> How about below with adding of bi_max_size in bio?

I am not a fan of adding a bio field for using it only in one place.
This is only my opinion. I will let others comment about this, but personnally
I would rather do something like this:

#define blk_queue_limit_bio_merge_size(q) \
test_bit(QUEUE_FLAG_LIMIT_MERGE, &(q)->queue_flags)

static inline unsigned int bio_max_merge_size(struct bio *bio)
{
struct request_queue *q = bio->bi_disk->queue;

if (blk_queue_limit_bio_merge_size(q))
    return blk_queue_get_max_sectors(q, bio_op(bio))
<< SECTOR_SHIFT;
return UINT_MAX;
}

and use that helper in __bio_try_merge_page(), e.g.:

if (bio->bi_iter.bi_size > bio_max_merge_size(bio) - len) {
*same_page = false;
return false;
}

No need to change the bio struct.

If you measure performance with and without this change on nullblk, you can
verify if it has any impact for regular devices. And for your use case, that
should give you the same performance.

> 
> bool __bio_try_merge_page(struct bio *bio, struct page *page,
>   unsigned int len, unsigned int off, bool *same_page)
> {
>   ...
>   if (page_is_mergeable(bv, page, len, off, same_page)) {
> - if (bio->bi_iter.bi_size > UINT_MAX - len) {
> + if (bio->bi_iter.bi_size > bio->bi_max_size - len) {
>   *same_page = false;
>   return false;
>   }
> 
>   bv->bv_len += len;
>   bio->bi_iter.bi_size += len;
>   return true;
>   }
>   ...
> }
> 
> 
> static inline bool bio_full(struct bio *bio, unsigned len)
> {
>   ...
> - if (bio->bi_iter.bi_size > UINT_MAX - len)
> + if (bio->bi_iter.bi_size > bio->bi_max_size - len)
>   return true;
>   ...
> }
> 
> +void bio_set_dev(struct bio *bio, struct block_device *bdev)
> +{
> + if (bio->bi_disk != bdev->bd_disk)
> + bio_clear_flag(bio, BIO_THROTTLED);
> +
> + bio->bi_disk = bdev->bd_disk;
> + bio->bi_partno = bdev->bd_partno;
> + if (blk_queue_limit_bio_max_size(bio))
> + bio->bi_max_size = blk_queue_get_bio_max_size(bio);
> +
> + bio_associate_blkg(bio);
> +}
> +EXPORT_SYMBOL(bio_set_dev);
> 
> > -- 
> > Damien Le Moal
> > Western Digital Research
> 
> ---
> Changheun Lee
> Samsung Electronics

-- 
Damien Le Moal
Western Digital


Re: [PATCH v2] bio: limit bio max size.

2021-01-20 Thread Damien Le Moal
>   if (bio->bi_vcnt >= bio->bi_max_vecs)
>   return true;
>  
> - if (bio->bi_iter.bi_size > UINT_MAX - len)
> + if (bio->bi_iter.bi_size > bio->bi_max_size - len)
>   return true;
>  
>   return false;
> @@ -482,20 +482,13 @@ extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned 
> long *, mempool_t *);
>  extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
>  extern unsigned int bvec_nr_vecs(unsigned short idx);
>  extern const char *bio_devname(struct bio *bio, char *buffer);
> -
> -#define bio_set_dev(bio, bdev)   \
> -do { \
> - if ((bio)->bi_disk != (bdev)->bd_disk)  \
> - bio_clear_flag(bio, BIO_THROTTLED);\
> - (bio)->bi_disk = (bdev)->bd_disk;   \
> - (bio)->bi_partno = (bdev)->bd_partno;   \
> - bio_associate_blkg(bio);\
> -} while (0)
> +extern void bio_set_dev(struct bio *bio, struct block_device *bdev);
>  
>  #define bio_copy_dev(dst, src)   \
>  do { \
>   (dst)->bi_disk = (src)->bi_disk;\
>   (dst)->bi_partno = (src)->bi_partno;\
> + (dst)->bi_max_size = (src)->bi_max_size;\
>   bio_clone_blkg_association(dst, src);   \
>  } while (0)
>  
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 866f74261b3b..e5dd5b7d8fc1 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -270,6 +270,7 @@ struct bio {
>*/
>  
>   unsigned short  bi_max_vecs;/* max bvl_vecs we can hold */
> + unsigned intbi_max_size;/* max data size we can hold */
>  
>   atomic_t__bi_cnt;   /* pin count */

This modification comes at the cost of increasing the bio structure size to
simply tell the block layer "do not delay BIO splitting"...

I think there is a much simpler approach. What about:

1) Use a request queue flag to indicate "limit BIO size"
2) modify __bio_try_merge_page() to look at that flag to disallow page merging
if the bio size exceeds blk_queue_get_max_sectors(), or more ideally a version
of it that takes into account the bio start sector.
3) Set the "limit bio size" queue flag in the driver of the device that benefit
from this change. Eventually, that could also be controlled through sysfs.

With such change, you will get the same result without having to increase the
BIO structure size.


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH 1/1] scsi: sd: use max_xfer_blocks for set rw_max if max_xfer_blocks is available

2021-01-20 Thread Damien Le Moal
 Some devices report a maximum block count for READ/WRITE requests. */
>> -dev_max = min_not_zero(dev_max, sdkp->max_xfer_blocks);
>> q->limits.max_dev_sectors = logical_to_sectors(sdp, dev_max);
>>
>> -if (sd_validate_opt_xfer_size(sdkp, dev_max)) {
>> +if (sd_validate_max_xfer_size(sdkp, dev_max)) {
>> +q->limits.io_opt = 0;
>> +rw_max = logical_to_sectors(sdp, sdkp->max_xfer_blocks);
>> +q->limits.max_dev_sectors = rw_max;
>> +} else if (sd_validate_opt_xfer_size(sdkp, dev_max)) {

This does not look correct to me. This renders the device reported
opt_xfer_blocks useless.

The unmodified code sets dev_max to the min of SD_MAX_XFER_BLOCKS or
SD_DEF_XFER_BLOCKS and of the device reported max_xfer_blocks. The result of
this is used as the device max_dev_sectors queue limit, which in turn is used to
set the max_hw_sectors queue limit accounting for the adapter limits too.

opt_xfer_blocks, if it is valid, will be used to set the io_opt queue limit,
which is a hint. This hint is used to optimize the "soft" max_sectors command
limit used by the block layer to limit command size if the value of
opt_xfer_blocks is smaller than the limit initially set with max_xfer_blocks.

So if for your device max_sectors end up being too small, it is likely because
the device itself is reporting an opt_xfer_blocks value that is too small for
its own good. The max_sectors limit can be manually increased with "echo xxx >
/sys/block/sdX/queue/max_sectors_kb". A udev rule can be used to handle this
autmatically if needed.

But to get a saner default for that device, I do not think that this patch is
the right solution. Ideally, the device peculiarity should be handled with a
quirk, but that is not used in scsi. So beside the udev rule trick, I am not
sure what the right approach is here.

>> q->limits.io_opt = logical_to_bytes(sdp, sdkp->opt_xfer_blocks);
>> rw_max = logical_to_sectors(sdp, sdkp->opt_xfer_blocks);
>> } else {
>> -- 
>> 2.29.0
>>
>>
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH] bio: limit bio max size.

2021-01-13 Thread Damien Le Moal
On 2021/01/14 12:53, Ming Lei wrote:
> On Wed, Jan 13, 2021 at 12:02:44PM +0000, Damien Le Moal wrote:
>> On 2021/01/13 20:48, Ming Lei wrote:
>>> On Wed, Jan 13, 2021 at 11:16:11AM +0000, Damien Le Moal wrote:
>>>> On 2021/01/13 19:25, Ming Lei wrote:
>>>>> On Wed, Jan 13, 2021 at 09:28:02AM +, Damien Le Moal wrote:
>>>>>> On 2021/01/13 18:19, Ming Lei wrote:
>>>>>>> On Wed, Jan 13, 2021 at 12:09 PM Changheun Lee  
>>>>>>> wrote:
>>>>>>>>
>>>>>>>>> On 2021/01/12 21:14, Changheun Lee wrote:
>>>>>>>>>>> On 2021/01/12 17:52, Changheun Lee wrote:
>>>>>>>>>>>> From: "Changheun Lee" 
>>>>>>>>>>>>
>>>>>>>>>>>> bio size can grow up to 4GB when multi-page bvec is enabled.
>>>>>>>>>>>> but sometimes it would lead to inefficient behaviors.
>>>>>>>>>>>> in case of large chunk direct I/O, - 64MB chunk read in user space 
>>>>>>>>>>>> -
>>>>>>>>>>>> all pages for 64MB would be merged to a bio structure if memory 
>>>>>>>>>>>> address is
>>>>>>>>>>>> continued physically. it makes some delay to submit until merge 
>>>>>>>>>>>> complete.
>>>>>>>>>>>> bio max size should be limited as a proper size.
>>>>>>>>>>>
>>>>>>>>>>> But merging physically contiguous pages into the same bvec + later 
>>>>>>>>>>> automatic bio
>>>>>>>>>>> split on submit should give you better throughput for large IOs 
>>>>>>>>>>> compared to
>>>>>>>>>>> having to issue a bio chain of smaller BIOs that are arbitrarily 
>>>>>>>>>>> sized and will
>>>>>>>>>>> likely need splitting anyway (because of DMA boundaries etc).
>>>>>>>>>>>
>>>>>>>>>>> Do you have a specific case where you see higher performance with 
>>>>>>>>>>> this patch
>>>>>>>>>>> applied ? On Intel, BIO_MAX_SIZE would be 1MB... That is arbitrary 
>>>>>>>>>>> and too small
>>>>>>>>>>> considering that many hardware can execute larger IOs than that.
>>>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> When I tested 32MB chunk read with O_DIRECT in android, all pages of 
>>>>>>>>>> 32MB
>>>>>>>>>> is merged into a bio structure.
>>>>>>>>>> And elapsed time to merge complete was about 2ms.
>>>>>>>>>> It means first bio-submit is after 2ms.
>>>>>>>>>> If bio size is limited with 1MB with this patch, first bio-submit is 
>>>>>>>>>> about
>>>>>>>>>> 100us by bio_full operation.
>>>>>>>>>
>>>>>>>>> bio_submit() will split the large BIO case into multiple requests 
>>>>>>>>> while the
>>>>>>>>> small BIO case will likely result one or two requests only. That 
>>>>>>>>> likely explain
>>>>>>>>> the time difference here. However, for the large case, the 2ms will 
>>>>>>>>> issue ALL
>>>>>>>>> requests needed for processing the entire 32MB user IO while the 1MB 
>>>>>>>>> bio case
>>>>>>>>> will need 32 different bio_submit() calls. So what is the actual 
>>>>>>>>> total latency
>>>>>>>>> difference for the entire 32MB user IO ? That is I think what needs 
>>>>>>>>> to be
>>>>>>>>> compared here.
>>>>>>>>>
>>>>>>>>> Also, what is your device max_sectors_kb and max queue depth ?
>>>>>>>>>
>>>>>>>>
>>>>>>>> 32MB total latency is about 19ms including merge time without this 
>>>>>>>> patch.
>>>>>>>> But with this patch, total latenc

Re: [PATCH] bio: limit bio max size.

2021-01-13 Thread Damien Le Moal
On 2021/01/13 20:48, Ming Lei wrote:
> On Wed, Jan 13, 2021 at 11:16:11AM +0000, Damien Le Moal wrote:
>> On 2021/01/13 19:25, Ming Lei wrote:
>>> On Wed, Jan 13, 2021 at 09:28:02AM +0000, Damien Le Moal wrote:
>>>> On 2021/01/13 18:19, Ming Lei wrote:
>>>>> On Wed, Jan 13, 2021 at 12:09 PM Changheun Lee  
>>>>> wrote:
>>>>>>
>>>>>>> On 2021/01/12 21:14, Changheun Lee wrote:
>>>>>>>>> On 2021/01/12 17:52, Changheun Lee wrote:
>>>>>>>>>> From: "Changheun Lee" 
>>>>>>>>>>
>>>>>>>>>> bio size can grow up to 4GB when multi-page bvec is enabled.
>>>>>>>>>> but sometimes it would lead to inefficient behaviors.
>>>>>>>>>> in case of large chunk direct I/O, - 64MB chunk read in user space -
>>>>>>>>>> all pages for 64MB would be merged to a bio structure if memory 
>>>>>>>>>> address is
>>>>>>>>>> continued physically. it makes some delay to submit until merge 
>>>>>>>>>> complete.
>>>>>>>>>> bio max size should be limited as a proper size.
>>>>>>>>>
>>>>>>>>> But merging physically contiguous pages into the same bvec + later 
>>>>>>>>> automatic bio
>>>>>>>>> split on submit should give you better throughput for large IOs 
>>>>>>>>> compared to
>>>>>>>>> having to issue a bio chain of smaller BIOs that are arbitrarily 
>>>>>>>>> sized and will
>>>>>>>>> likely need splitting anyway (because of DMA boundaries etc).
>>>>>>>>>
>>>>>>>>> Do you have a specific case where you see higher performance with 
>>>>>>>>> this patch
>>>>>>>>> applied ? On Intel, BIO_MAX_SIZE would be 1MB... That is arbitrary 
>>>>>>>>> and too small
>>>>>>>>> considering that many hardware can execute larger IOs than that.
>>>>>>>>>
>>>>>>>>
>>>>>>>> When I tested 32MB chunk read with O_DIRECT in android, all pages of 
>>>>>>>> 32MB
>>>>>>>> is merged into a bio structure.
>>>>>>>> And elapsed time to merge complete was about 2ms.
>>>>>>>> It means first bio-submit is after 2ms.
>>>>>>>> If bio size is limited with 1MB with this patch, first bio-submit is 
>>>>>>>> about
>>>>>>>> 100us by bio_full operation.
>>>>>>>
>>>>>>> bio_submit() will split the large BIO case into multiple requests while 
>>>>>>> the
>>>>>>> small BIO case will likely result one or two requests only. That likely 
>>>>>>> explain
>>>>>>> the time difference here. However, for the large case, the 2ms will 
>>>>>>> issue ALL
>>>>>>> requests needed for processing the entire 32MB user IO while the 1MB 
>>>>>>> bio case
>>>>>>> will need 32 different bio_submit() calls. So what is the actual total 
>>>>>>> latency
>>>>>>> difference for the entire 32MB user IO ? That is I think what needs to 
>>>>>>> be
>>>>>>> compared here.
>>>>>>>
>>>>>>> Also, what is your device max_sectors_kb and max queue depth ?
>>>>>>>
>>>>>>
>>>>>> 32MB total latency is about 19ms including merge time without this patch.
>>>>>> But with this patch, total latency is about 17ms including merge time 
>>>>>> too.
>>>>>
>>>>> 19ms looks too big just for preparing one 32MB sized bio, which isn't
>>>>> supposed to
>>>>> take so long.  Can you investigate where the 19ms is taken just for
>>>>> preparing one
>>>>> 32MB sized bio?
>>>>
>>>> Changheun mentioned that the device side IO latency is 16.7ms out of the 
>>>> 19ms
>>>> total. So the BIO handling, submission+completion takes about 2.3ms, and
>>>> Changheun points above to 2ms for the submission part.
>>>
>>> OK, looks I m

Re: [PATCH] bio: limit bio max size.

2021-01-13 Thread Damien Le Moal
On 2021/01/13 19:25, Ming Lei wrote:
> On Wed, Jan 13, 2021 at 09:28:02AM +0000, Damien Le Moal wrote:
>> On 2021/01/13 18:19, Ming Lei wrote:
>>> On Wed, Jan 13, 2021 at 12:09 PM Changheun Lee  
>>> wrote:
>>>>
>>>>> On 2021/01/12 21:14, Changheun Lee wrote:
>>>>>>> On 2021/01/12 17:52, Changheun Lee wrote:
>>>>>>>> From: "Changheun Lee" 
>>>>>>>>
>>>>>>>> bio size can grow up to 4GB when multi-page bvec is enabled.
>>>>>>>> but sometimes it would lead to inefficient behaviors.
>>>>>>>> in case of large chunk direct I/O, - 64MB chunk read in user space -
>>>>>>>> all pages for 64MB would be merged to a bio structure if memory 
>>>>>>>> address is
>>>>>>>> continued physically. it makes some delay to submit until merge 
>>>>>>>> complete.
>>>>>>>> bio max size should be limited as a proper size.
>>>>>>>
>>>>>>> But merging physically contiguous pages into the same bvec + later 
>>>>>>> automatic bio
>>>>>>> split on submit should give you better throughput for large IOs 
>>>>>>> compared to
>>>>>>> having to issue a bio chain of smaller BIOs that are arbitrarily sized 
>>>>>>> and will
>>>>>>> likely need splitting anyway (because of DMA boundaries etc).
>>>>>>>
>>>>>>> Do you have a specific case where you see higher performance with this 
>>>>>>> patch
>>>>>>> applied ? On Intel, BIO_MAX_SIZE would be 1MB... That is arbitrary and 
>>>>>>> too small
>>>>>>> considering that many hardware can execute larger IOs than that.
>>>>>>>
>>>>>>
>>>>>> When I tested 32MB chunk read with O_DIRECT in android, all pages of 32MB
>>>>>> is merged into a bio structure.
>>>>>> And elapsed time to merge complete was about 2ms.
>>>>>> It means first bio-submit is after 2ms.
>>>>>> If bio size is limited with 1MB with this patch, first bio-submit is 
>>>>>> about
>>>>>> 100us by bio_full operation.
>>>>>
>>>>> bio_submit() will split the large BIO case into multiple requests while 
>>>>> the
>>>>> small BIO case will likely result one or two requests only. That likely 
>>>>> explain
>>>>> the time difference here. However, for the large case, the 2ms will issue 
>>>>> ALL
>>>>> requests needed for processing the entire 32MB user IO while the 1MB bio 
>>>>> case
>>>>> will need 32 different bio_submit() calls. So what is the actual total 
>>>>> latency
>>>>> difference for the entire 32MB user IO ? That is I think what needs to be
>>>>> compared here.
>>>>>
>>>>> Also, what is your device max_sectors_kb and max queue depth ?
>>>>>
>>>>
>>>> 32MB total latency is about 19ms including merge time without this patch.
>>>> But with this patch, total latency is about 17ms including merge time too.
>>>
>>> 19ms looks too big just for preparing one 32MB sized bio, which isn't
>>> supposed to
>>> take so long.  Can you investigate where the 19ms is taken just for
>>> preparing one
>>> 32MB sized bio?
>>
>> Changheun mentioned that the device side IO latency is 16.7ms out of the 19ms
>> total. So the BIO handling, submission+completion takes about 2.3ms, and
>> Changheun points above to 2ms for the submission part.
> 
> OK, looks I misunderstood the data.
> 
>>
>>>
>>> It might be iov_iter_get_pages() for handling page fault. If yes, one 
>>> suggestion
>>> is to enable THP(Transparent HugePage Support) in your application.
>>
>> But if that was due to page faults, the same large-ish time would be taken 
>> for
>> the preparing the size-limited BIOs too, no ? No matter how the BIOs are 
>> diced,
>> all 32MB of pages of the user IO are referenced...
> 
> If bio size is reduced to 1MB, just 256 pages need to be faulted before 
> submitting this
> bio, instead of 256*32 pages, that is why the following words are mentioned:
> 
>   It means first bio-submit is after 2ms.
>   If bio size is limited with 1MB with this patch, first bio-submit is 
> about
>   100us by bio_full operation.

Yes, but eventually, all pages for the 32MB IO will be faulted in, just not in
one go. Overall number of page faults is likely the same as with the large BIO
preparation. So I think we are back to my previous point, that is, reducing the
device idle time by starting a BIO more quickly, even a small one, leads to
overlap between CPU time needed for the next BIO preparation and previous BIO
execution, reducing overall the latency for the entire 32MB user IO. I don't
think that the reason is page faulting in itself.

> 
> 
> Thanks,
> Ming
> 
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH] bio: limit bio max size.

2021-01-13 Thread Damien Le Moal
On 2021/01/13 18:19, Ming Lei wrote:
> On Wed, Jan 13, 2021 at 12:09 PM Changheun Lee  wrote:
>>
>>> On 2021/01/12 21:14, Changheun Lee wrote:
>>>>> On 2021/01/12 17:52, Changheun Lee wrote:
>>>>>> From: "Changheun Lee" 
>>>>>>
>>>>>> bio size can grow up to 4GB when multi-page bvec is enabled.
>>>>>> but sometimes it would lead to inefficient behaviors.
>>>>>> in case of large chunk direct I/O, - 64MB chunk read in user space -
>>>>>> all pages for 64MB would be merged to a bio structure if memory address 
>>>>>> is
>>>>>> continued physically. it makes some delay to submit until merge complete.
>>>>>> bio max size should be limited as a proper size.
>>>>>
>>>>> But merging physically contiguous pages into the same bvec + later 
>>>>> automatic bio
>>>>> split on submit should give you better throughput for large IOs compared 
>>>>> to
>>>>> having to issue a bio chain of smaller BIOs that are arbitrarily sized 
>>>>> and will
>>>>> likely need splitting anyway (because of DMA boundaries etc).
>>>>>
>>>>> Do you have a specific case where you see higher performance with this 
>>>>> patch
>>>>> applied ? On Intel, BIO_MAX_SIZE would be 1MB... That is arbitrary and 
>>>>> too small
>>>>> considering that many hardware can execute larger IOs than that.
>>>>>
>>>>
>>>> When I tested 32MB chunk read with O_DIRECT in android, all pages of 32MB
>>>> is merged into a bio structure.
>>>> And elapsed time to merge complete was about 2ms.
>>>> It means first bio-submit is after 2ms.
>>>> If bio size is limited with 1MB with this patch, first bio-submit is about
>>>> 100us by bio_full operation.
>>>
>>> bio_submit() will split the large BIO case into multiple requests while the
>>> small BIO case will likely result one or two requests only. That likely 
>>> explain
>>> the time difference here. However, for the large case, the 2ms will issue 
>>> ALL
>>> requests needed for processing the entire 32MB user IO while the 1MB bio 
>>> case
>>> will need 32 different bio_submit() calls. So what is the actual total 
>>> latency
>>> difference for the entire 32MB user IO ? That is I think what needs to be
>>> compared here.
>>>
>>> Also, what is your device max_sectors_kb and max queue depth ?
>>>
>>
>> 32MB total latency is about 19ms including merge time without this patch.
>> But with this patch, total latency is about 17ms including merge time too.
> 
> 19ms looks too big just for preparing one 32MB sized bio, which isn't
> supposed to
> take so long.  Can you investigate where the 19ms is taken just for
> preparing one
> 32MB sized bio?

Changheun mentioned that the device side IO latency is 16.7ms out of the 19ms
total. So the BIO handling, submission+completion takes about 2.3ms, and
Changheun points above to 2ms for the submission part.

> 
> It might be iov_iter_get_pages() for handling page fault. If yes, one 
> suggestion
> is to enable THP(Transparent HugePage Support) in your application.

But if that was due to page faults, the same large-ish time would be taken for
the preparing the size-limited BIOs too, no ? No matter how the BIOs are diced,
all 32MB of pages of the user IO are referenced...

> 
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH] bio: limit bio max size.

2021-01-12 Thread Damien Le Moal
large BIO is a lot more
efficient, CPU wise, than building and issuing a lot of small BIOs. That gives a
lot of benefits on high-end desktops and servers with fast CPUs, but is counter
productive in your case with a slower CPU.

I wonder: what is the user IO size when you start seeing a performance drop
without the patch ? It is clear that limiting the BIO size does improve things
for the 32MB IO size you tested, but what about more realistic workloads with
128K or so IO sizes (typical IO size for an FS using the page cache) ?

> 
>>>
>>>>> It's not large delay and can't be observed with low speed device.
>>>>> But it's needed to reduce merge delay for high speed device.
>>>>> I improved 512MB sequential read performance from 1900MB/s to 2000MB/s
>>>>> with this patch on android platform.
>>>>> As you said, 1MB might be small for some device.
>>>>> But method is needed to re-size, or select the bio max size.
>>>>
>>>> At the very least, I think that such limit should not be arbitrary as your 
>>>> patch
>>>> proposes but rely on the device characteristics (e.g.
>>>> max_hw_sectors_kb/max_sectors_kb and queue depth).
>>>>
>>>
>>> I agree with your opinion, I thought same as your idea. For that, deep 
>>> research
>>> is needed, proper timing to set and bio structure modification, etc ...
>>
>> Why would you need any BIO structure modifications ? Your patch is on the 
>> right
>> track if limiting the BIO size is the right solution (I am not still 
>> completely
>> convinced). E.g., the code:
>>
>> if (page_is_mergeable(bv, page, len, off, same_page)) {
>> if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len) {
>> *same_page = false;
>> return false;
>> }
>>
>> could just become:
>>
>> if (page_is_mergeable(bv, page, len, off, same_page)) {
>> if (bio->bi_iter.bi_size > bio_max_size(bio) - len) {
>> *same_page = false;
>> return false;
>> }
>>
>> With bio_max_size() being something like:
>>
>> static inline size_t bio_max_size(struct bio *bio)
>> {
>> sector_t max_sectors = blk_queue_get_max_sectors(bio->bi_disk->queue,
>> bio_op(bio));
>>
>> return max_sectors << SECTOR_SHIFT;
>> }
>>
>> Note that this is not super efficient as a BIO maximum size depends on the 
>> BIO
>> offset too (its start sector). So writing something similar to
>> blk_rq_get_max_sectors() would probably be better.
> 
> Good suggestion. :)
> 
>>
>>> Current is simple patch for default bio max size.
>>> Before applying of multipage bvec, bio max size was 1MB in kernel 4.x by 
>>> BIO_MAX_PAGES.
>>> So I think 1MB bio max size is reasonable as a default.
>>
>> max_sectors_kb is always defined for any block device so I do not think 
>> there is
>> a need for any arbitrary default value.
>>
>> Since such optimization likely very much depend on the speed of the system 
>> CPU
>> and of the storage device used, it may be a good idea to have this 
>> configurable
>> through sysfs. That is, bio_max_size() simply returns UINT_MAX leading to no
>> change from the current behavior if the optimization is disabled (default) 
>> and
>> max_sectors_kb if it is enabled.
>>
> 
> OK, I agree with you. It will be best for all now.
> I'll try to make this.
> 
>>>
>>>>>
>>>>>>
>>>>>>>
>>>>>>> Signed-off-by: Changheun Lee 
>>>>>>> ---
>>>>>>>  block/bio.c | 2 +-
>>>>>>>  include/linux/bio.h | 3 ++-
>>>>>>>  2 files changed, 3 insertions(+), 2 deletions(-)
>>>>>>>
>>>>>>> diff --git a/block/bio.c b/block/bio.c
>>>>>>> index 1f2cc1fbe283..dbe14d675f28 100644
>>>>>>> --- a/block/bio.c
>>>>>>> +++ b/block/bio.c
>>>>>>> @@ -877,7 +877,7 @@ bool __bio_try_merge_page(struct bio *bio, struct 
>>>>>>> page *page,
>>>>>>> struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
>>>>>>>  
>>>>>>> if (page_is_mergeable(bv, page, len, off, same_page)) {
>>>>>>> -   if (bio->bi_iter.bi_size > UINT_MAX - len) {
>>>>>>> +   if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len) {
>>>>>>> *same_page = false;
>>>>>>> return false;
>>>>>>> }
>>>>>>> diff --git a/include/linux/bio.h b/include/linux/bio.h
>>>>>>> index 1edda614f7ce..0f49b354b1f6 100644
>>>>>>> --- a/include/linux/bio.h
>>>>>>> +++ b/include/linux/bio.h
>>>>>>> @@ -20,6 +20,7 @@
>>>>>>>  #endif
>>>>>>>  
>>>>>>>  #define BIO_MAX_PAGES  256
>>>>>>> +#define BIO_MAX_SIZE   (BIO_MAX_PAGES * PAGE_SIZE)
>>>>>>>  
>>>>>>>  #define bio_prio(bio)  (bio)->bi_ioprio
>>>>>>>  #define bio_set_prio(bio, prio)((bio)->bi_ioprio = 
>>>>>>> prio)
>>>>>>> @@ -113,7 +114,7 @@ static inline bool bio_full(struct bio *bio, 
>>>>>>> unsigned len)
>>>>>>> if (bio->bi_vcnt >= bio->bi_max_vecs)
>>>>>>> return true;
>>>>>>>  
>>>>>>> -   if (bio->bi_iter.bi_size > UINT_MAX - len)
>>>>>>> +   if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len)
>>>>>>> return true;
>>>>>>>  
>>>>>>> return false;
>>>>>>>
>>>>>>
>>>>>>
>>>>>> -- 
>>>>>> Damien Le Moal
>>>>>> Western Digital Research
>>>>>
>>>>
>>>>
>>>> -- 
>>>> Damien Le Moal
>>>> Western Digital Research
>>>>
>>>
>>> ---
>>> Changheun Lee
>>> Samsung Electronics
>>>
>>>
>>
>>
>> -- 
>> Damien Le Moal
>> Western Digital Research
>>
> 
> ---
> Changheun Lee
> Samsung Electronics
> 
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH] bio: limit bio max size.

2021-01-12 Thread Damien Le Moal
   *same_page = false;
return false;
}

With bio_max_size() being something like:

static inline size_t bio_max_size(struct bio *bio)
{
sector_t max_sectors = blk_queue_get_max_sectors(bio->bi_disk->queue,
 bio_op(bio));

return max_sectors << SECTOR_SHIFT;
}

Note that this is not super efficient as a BIO maximum size depends on the BIO
offset too (its start sector). So writing something similar to
blk_rq_get_max_sectors() would probably be better.

> Current is simple patch for default bio max size.
> Before applying of multipage bvec, bio max size was 1MB in kernel 4.x by 
> BIO_MAX_PAGES.
> So I think 1MB bio max size is reasonable as a default.

max_sectors_kb is always defined for any block device so I do not think there is
a need for any arbitrary default value.

Since such optimization likely very much depend on the speed of the system CPU
and of the storage device used, it may be a good idea to have this configurable
through sysfs. That is, bio_max_size() simply returns UINT_MAX leading to no
change from the current behavior if the optimization is disabled (default) and
max_sectors_kb if it is enabled.

> 
>>>
>>>>
>>>>>
>>>>> Signed-off-by: Changheun Lee 
>>>>> ---
>>>>>  block/bio.c | 2 +-
>>>>>  include/linux/bio.h | 3 ++-
>>>>>  2 files changed, 3 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/block/bio.c b/block/bio.c
>>>>> index 1f2cc1fbe283..dbe14d675f28 100644
>>>>> --- a/block/bio.c
>>>>> +++ b/block/bio.c
>>>>> @@ -877,7 +877,7 @@ bool __bio_try_merge_page(struct bio *bio, struct 
>>>>> page *page,
>>>>>   struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
>>>>>  
>>>>>   if (page_is_mergeable(bv, page, len, off, same_page)) {
>>>>> - if (bio->bi_iter.bi_size > UINT_MAX - len) {
>>>>> + if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len) {
>>>>>   *same_page = false;
>>>>>   return false;
>>>>>   }
>>>>> diff --git a/include/linux/bio.h b/include/linux/bio.h
>>>>> index 1edda614f7ce..0f49b354b1f6 100644
>>>>> --- a/include/linux/bio.h
>>>>> +++ b/include/linux/bio.h
>>>>> @@ -20,6 +20,7 @@
>>>>>  #endif
>>>>>  
>>>>>  #define BIO_MAX_PAGES256
>>>>> +#define BIO_MAX_SIZE (BIO_MAX_PAGES * PAGE_SIZE)
>>>>>  
>>>>>  #define bio_prio(bio)(bio)->bi_ioprio
>>>>>  #define bio_set_prio(bio, prio)  ((bio)->bi_ioprio = prio)
>>>>> @@ -113,7 +114,7 @@ static inline bool bio_full(struct bio *bio, unsigned 
>>>>> len)
>>>>>   if (bio->bi_vcnt >= bio->bi_max_vecs)
>>>>>   return true;
>>>>>  
>>>>> - if (bio->bi_iter.bi_size > UINT_MAX - len)
>>>>> + if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len)
>>>>>   return true;
>>>>>  
>>>>>   return false;
>>>>>
>>>>
>>>>
>>>> -- 
>>>> Damien Le Moal
>>>> Western Digital Research
>>>
>>
>>
>> -- 
>> Damien Le Moal
>> Western Digital Research
>>
> 
> ---
> Changheun Lee
> Samsung Electronics
> 
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH] bio: limit bio max size.

2021-01-12 Thread Damien Le Moal
On 2021/01/12 21:14, Changheun Lee wrote:
>> On 2021/01/12 17:52, Changheun Lee wrote:
>>> From: "Changheun Lee" 
>>>
>>> bio size can grow up to 4GB when multi-page bvec is enabled.
>>> but sometimes it would lead to inefficient behaviors.
>>> in case of large chunk direct I/O, - 64MB chunk read in user space -
>>> all pages for 64MB would be merged to a bio structure if memory address is
>>> continued physically. it makes some delay to submit until merge complete.
>>> bio max size should be limited as a proper size.
>>
>> But merging physically contiguous pages into the same bvec + later automatic 
>> bio
>> split on submit should give you better throughput for large IOs compared to
>> having to issue a bio chain of smaller BIOs that are arbitrarily sized and 
>> will
>> likely need splitting anyway (because of DMA boundaries etc).
>>
>> Do you have a specific case where you see higher performance with this patch
>> applied ? On Intel, BIO_MAX_SIZE would be 1MB... That is arbitrary and too 
>> small
>> considering that many hardware can execute larger IOs than that.
>>
> 
> When I tested 32MB chunk read with O_DIRECT in android, all pages of 32MB
> is merged into a bio structure.
> And elapsed time to merge complete was about 2ms.
> It means first bio-submit is after 2ms.
> If bio size is limited with 1MB with this patch, first bio-submit is about
> 100us by bio_full operation.

bio_submit() will split the large BIO case into multiple requests while the
small BIO case will likely result one or two requests only. That likely explain
the time difference here. However, for the large case, the 2ms will issue ALL
requests needed for processing the entire 32MB user IO while the 1MB bio case
will need 32 different bio_submit() calls. So what is the actual total latency
difference for the entire 32MB user IO ? That is I think what needs to be
compared here.

Also, what is your device max_sectors_kb and max queue depth ?

> It's not large delay and can't be observed with low speed device.
> But it's needed to reduce merge delay for high speed device.
> I improved 512MB sequential read performance from 1900MB/s to 2000MB/s
> with this patch on android platform.
> As you said, 1MB might be small for some device.
> But method is needed to re-size, or select the bio max size.

At the very least, I think that such limit should not be arbitrary as your patch
proposes but rely on the device characteristics (e.g.
max_hw_sectors_kb/max_sectors_kb and queue depth).

> 
>>
>>>
>>> Signed-off-by: Changheun Lee 
>>> ---
>>>  block/bio.c | 2 +-
>>>  include/linux/bio.h | 3 ++-
>>>  2 files changed, 3 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/block/bio.c b/block/bio.c
>>> index 1f2cc1fbe283..dbe14d675f28 100644
>>> --- a/block/bio.c
>>> +++ b/block/bio.c
>>> @@ -877,7 +877,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page 
>>> *page,
>>> struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
>>>  
>>> if (page_is_mergeable(bv, page, len, off, same_page)) {
>>> -   if (bio->bi_iter.bi_size > UINT_MAX - len) {
>>> +   if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len) {
>>> *same_page = false;
>>> return false;
>>> }
>>> diff --git a/include/linux/bio.h b/include/linux/bio.h
>>> index 1edda614f7ce..0f49b354b1f6 100644
>>> --- a/include/linux/bio.h
>>> +++ b/include/linux/bio.h
>>> @@ -20,6 +20,7 @@
>>>  #endif
>>>  
>>>  #define BIO_MAX_PAGES  256
>>> +#define BIO_MAX_SIZE   (BIO_MAX_PAGES * PAGE_SIZE)
>>>  
>>>  #define bio_prio(bio)          (bio)->bi_ioprio
>>>  #define bio_set_prio(bio, prio)((bio)->bi_ioprio = prio)
>>> @@ -113,7 +114,7 @@ static inline bool bio_full(struct bio *bio, unsigned 
>>> len)
>>> if (bio->bi_vcnt >= bio->bi_max_vecs)
>>> return true;
>>>  
>>> -   if (bio->bi_iter.bi_size > UINT_MAX - len)
>>> +   if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len)
>>> return true;
>>>  
>>> return false;
>>>
>>
>>
>> -- 
>> Damien Le Moal
>> Western Digital Research
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH] bio: limit bio max size.

2021-01-12 Thread Damien Le Moal
On 2021/01/12 17:52, Changheun Lee wrote:
> From: "Changheun Lee" 
> 
> bio size can grow up to 4GB when multi-page bvec is enabled.
> but sometimes it would lead to inefficient behaviors.
> in case of large chunk direct I/O, - 64MB chunk read in user space -
> all pages for 64MB would be merged to a bio structure if memory address is
> continued physically. it makes some delay to submit until merge complete.
> bio max size should be limited as a proper size.

But merging physically contiguous pages into the same bvec + later automatic bio
split on submit should give you better throughput for large IOs compared to
having to issue a bio chain of smaller BIOs that are arbitrarily sized and will
likely need splitting anyway (because of DMA boundaries etc).

Do you have a specific case where you see higher performance with this patch
applied ? On Intel, BIO_MAX_SIZE would be 1MB... That is arbitrary and too small
considering that many hardware can execute larger IOs than that.


> 
> Signed-off-by: Changheun Lee 
> ---
>  block/bio.c | 2 +-
>  include/linux/bio.h | 3 ++-
>  2 files changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 1f2cc1fbe283..dbe14d675f28 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -877,7 +877,7 @@ bool __bio_try_merge_page(struct bio *bio, struct page 
> *page,
>   struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
>  
>   if (page_is_mergeable(bv, page, len, off, same_page)) {
> - if (bio->bi_iter.bi_size > UINT_MAX - len) {
> + if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len) {
>   *same_page = false;
>   return false;
>   }
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index 1edda614f7ce..0f49b354b1f6 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -20,6 +20,7 @@
>  #endif
>  
>  #define BIO_MAX_PAGES256
> +#define BIO_MAX_SIZE (BIO_MAX_PAGES * PAGE_SIZE)
>  
>  #define bio_prio(bio)(bio)->bi_ioprio
>  #define bio_set_prio(bio, prio)  ((bio)->bi_ioprio = prio)
> @@ -113,7 +114,7 @@ static inline bool bio_full(struct bio *bio, unsigned len)
>   if (bio->bi_vcnt >= bio->bi_max_vecs)
>   return true;
>  
> - if (bio->bi_iter.bi_size > UINT_MAX - len)
> + if (bio->bi_iter.bi_size > BIO_MAX_SIZE - len)
>   return true;
>  
>   return false;
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH] RISC-V: simplify BUILTIN_DTB processing

2021-01-11 Thread Damien Le Moal
On 2021/01/12 6:21, Vitaly Wool wrote:
> Provide __dtb_start as a parameter to setup_vm() in case
> CONFIG_BUILTIN_DTB is true, so we don't have to duplicate
> BUILTIN_DTB specific processing in MMU-enabled and MMU-disabled
> versions of setup_vm().
> 
> Signed-off-by: Vitaly Wool 
> ---
>  arch/riscv/kernel/head.S | 4 
>  arch/riscv/mm/init.c | 4 
>  2 files changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
> index 16e9941900c4..f5a9bad86e58 100644
> --- a/arch/riscv/kernel/head.S
> +++ b/arch/riscv/kernel/head.S
> @@ -260,7 +260,11 @@ clear_bss_done:
>  
>   /* Initialize page tables and relocate to virtual addresses */
>   la sp, init_thread_union + THREAD_SIZE
> +#ifdef CONFIG_BUILTIN_DTB
> + la a0, __dtb_start
> +#else
>   mv a0, s1
> +#endif /* CONFIG_BUILTIN_DTB */
>   call setup_vm
>  #ifdef CONFIG_MMU
>   la a0, early_pg_dir
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index 5b17f8d22f91..45faad7c4291 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -615,11 +615,7 @@ static void __init setup_vm_final(void)
>  #else
>  asmlinkage void __init setup_vm(uintptr_t dtb_pa)
>  {
> -#ifdef CONFIG_BUILTIN_DTB
> - dtb_early_va = (void *) __dtb_start;
> -#else
>   dtb_early_va = (void *)dtb_pa;
> -#endif
>   dtb_early_pa = dtb_pa;
>  }
>  
> 

Tested this with a nommu kernel on a MAIX bit board (K210 SoC). No problems
detected.

Tested-by: Damien Le Moal 

-- 
Damien Le Moal
Western Digital Research


Re: [PATCH 4.19 13/35] null_blk: Fix zone size initialization

2021-01-10 Thread Damien Le Moal
On 2021/01/06 21:55, Pavel Machek wrote:
> Hi!
> 
>> commit 0ebcdd702f49aeb0ad2e2d894f8c124a0acc6e23 upstream.
>>
>> For a null_blk device with zoned mode enabled is currently initialized
>> with a number of zones equal to the device capacity divided by the zone
>> size, without considering if the device capacity is a multiple of the
>> zone size. If the zone size is not a divisor of the capacity, the zones
>> end up not covering the entire capacity, potentially resulting is out
>> of bounds accesses to the zone array.
>>
>> Fix this by adding one last smaller zone with a size equal to the
>> remainder of the disk capacity divided by the zone size if the capacity
>> is not a multiple of the zone size. For such smaller last zone, the zone
>> capacity is also checked so that it does not exceed the smaller zone
>> size.
> 
>> --- a/drivers/block/null_blk_zoned.c
>> +++ b/drivers/block/null_blk_zoned.c
>> @@ -1,9 +1,9 @@
>>  // SPDX-License-Identifier: GPL-2.0
>>  #include 
>> +#include 
>>  #include "null_blk.h"
>>  
>> -/* zone_size in MBs to sectors. */
>> -#define ZONE_SIZE_SHIFT 11
>> +#define MB_TO_SECTS(mb) (((sector_t)mb * SZ_1M) >> SECTOR_SHIFT)
> 
> This macro is quite dangerous. (mb) would help, but inline function
> would be better.

Indeed.

> 
> 
>> +dev->nr_zones = dev_capacity_sects >> ilog2(dev->zone_size_sects);
>> +if (dev_capacity_sects & (dev->zone_size_sects - 1))
>> +dev->nr_zones++;
> 
> Is this same as nr_zones = DIV_ROUND_UP(dev_capacity_sects,
> dev->zone_size_sects)? Would that be faster, more readable and robust
> against weird dev->zone_size_sects sizes?

Yes, we can change to this to be more readable.
Will send a cleanup patch. Thanks !

> 
> Best regards,
>   Pavel
> 


-- 
Damien Le Moal
Western Digital Research


Re: [RFC PATCH v4 2/3] block: add simple copy support

2021-01-05 Thread Damien Le Moal
On 2021/01/05 21:24, Selva Jove wrote:
> Thanks for the review, Damien.
> 
> On Mon, Jan 4, 2021 at 6:17 PM Damien Le Moal  wrote:
>>
>> On 2021/01/04 19:48, SelvaKumar S wrote:
>>> Add new BLKCOPY ioctl that offloads copying of one or more sources
>>> ranges to a destination in the device. Accepts copy_ranges that contains
>>> destination, no of sources and pointer to the array of source
>>> ranges. Each range_entry contains start and length of source
>>> ranges (in bytes).
>>>
>>> Introduce REQ_OP_COPY, a no-merge copy offload operation. Create
>>> bio with control information as payload and submit to the device.
>>> REQ_OP_COPY(19) is a write op and takes zone_write_lock when submitted
>>> to zoned device.
>>>
>>> If the device doesn't support copy or copy offload is disabled, then
>>> copy is emulated by allocating memory of total copy size. The source
>>> ranges are read into memory by chaining bio for each source ranges and
>>> submitting them async and the last bio waits for completion. After data
>>> is read, it is written to the destination.
>>>
>>> bio_map_kern() is used to allocate bio and add pages of copy buffer to
>>> bio. As bio->bi_private and bio->bi_end_io is needed for chaining the
>>> bio and over written, invalidate_kernel_vmap_range() for read is called
>>> in the caller.
>>>
>>> Introduce queue limits for simple copy and other helper functions.
>>> Add device limits as sysfs entries.
>>>   - copy_offload
>>>   - max_copy_sectors
>>>   - max_copy_ranges_sectors
>>>   - max_copy_nr_ranges
>>>
>>> copy_offload(= 0) is disabled by default.
>>> max_copy_sectors = 0 indicates the device doesn't support native copy.
>>>
>>> Native copy offload is not supported for stacked devices and is done via
>>> copy emulation.
>>>
>>> Signed-off-by: SelvaKumar S 
>>> Signed-off-by: Kanchan Joshi 
>>> Signed-off-by: Nitesh Shetty 
>>> Signed-off-by: Javier González 
>>> ---
>>>  block/blk-core.c  |  94 ++--
>>>  block/blk-lib.c   | 223 ++
>>>  block/blk-merge.c |   2 +
>>>  block/blk-settings.c  |  10 ++
>>>  block/blk-sysfs.c |  50 +
>>>  block/blk-zoned.c |   1 +
>>>  block/bounce.c|   1 +
>>>  block/ioctl.c |  43 
>>>  include/linux/bio.h   |   1 +
>>>  include/linux/blk_types.h |  15 +++
>>>  include/linux/blkdev.h|  13 +++
>>>  include/uapi/linux/fs.h   |  13 +++
>>>  12 files changed, 458 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>> index 96e5fcd7f071..4a5cd3f53cd2 100644
>>> --- a/block/blk-core.c
>>> +++ b/block/blk-core.c
>>> @@ -719,6 +719,17 @@ static noinline int should_fail_bio(struct bio *bio)
>>>  }
>>>  ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
>>>
>>> +static inline int bio_check_copy_eod(struct bio *bio, sector_t start,
>>> + sector_t nr_sectors, sector_t maxsector)
>>> +{
>>> + if (nr_sectors && maxsector &&
>>> + (nr_sectors > maxsector || start > maxsector - nr_sectors)) {
>>> + handle_bad_sector(bio, maxsector);
>>> + return -EIO;
>>> + }
>>> + return 0;
>>> +}
>>> +
>>>  /*
>>>   * Check whether this bio extends beyond the end of the device or 
>>> partition.
>>>   * This may well happen - the kernel calls bread() without checking the 
>>> size of
>>> @@ -737,6 +748,65 @@ static inline int bio_check_eod(struct bio *bio, 
>>> sector_t maxsector)
>>>   return 0;
>>>  }
>>>
>>> +/*
>>> + * Check for copy limits and remap source ranges if needed.
>>> + */
>>> +static int blk_check_copy(struct bio *bio)
>>> +{
>>> + struct block_device *p = NULL;
>>> + struct request_queue *q = bio->bi_disk->queue;
>>> + struct blk_copy_payload *payload;
>>> + int i, maxsector, start_sect = 0, ret = -EIO;
>>> + unsigned short nr_range;
>>> +
>>> + rcu_read_lock();
>>> +
>>> + p = __disk_get_part(bio->bi_disk, bio->bi_partno);
>>> + if (unlikely(!p))
>>>

Re: [RFC PATCH v4 2/3] block: add simple copy support

2021-01-04 Thread Damien Le Moal
t; +
>  static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
>   unsigned long arg)
>  {
> @@ -458,6 +499,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, 
> fmode_t mode,
>   case BLKSECDISCARD:
>   return blk_ioctl_discard(bdev, mode, arg,
>   BLKDEV_DISCARD_SECURE);
> + case BLKCOPY:
> + return blk_ioctl_copy(bdev, mode, arg);
>   case BLKZEROOUT:
>   return blk_ioctl_zeroout(bdev, mode, arg);
>   case BLKREPORTZONE:
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index 1edda614f7ce..164313bdfb35 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -71,6 +71,7 @@ static inline bool bio_has_data(struct bio *bio)
>  static inline bool bio_no_advance_iter(const struct bio *bio)
>  {
>   return bio_op(bio) == REQ_OP_DISCARD ||
> +bio_op(bio) == REQ_OP_COPY ||
>  bio_op(bio) == REQ_OP_SECURE_ERASE ||
>  bio_op(bio) == REQ_OP_WRITE_SAME ||
>  bio_op(bio) == REQ_OP_WRITE_ZEROES;
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 866f74261b3b..d4d11e9ff814 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -380,6 +380,8 @@ enum req_opf {
>   REQ_OP_ZONE_RESET   = 15,
>   /* reset all the zone present on the device */
>   REQ_OP_ZONE_RESET_ALL   = 17,
> + /* copy ranges within device */
> + REQ_OP_COPY = 19,
>  
>   /* SCSI passthrough using struct scsi_request */
>   REQ_OP_SCSI_IN  = 32,
> @@ -506,6 +508,11 @@ static inline bool op_is_discard(unsigned int op)
>   return (op & REQ_OP_MASK) == REQ_OP_DISCARD;
>  }
>  
> +static inline bool op_is_copy(unsigned int op)
> +{
> + return (op & REQ_OP_MASK) == REQ_OP_COPY;
> +}
> +
>  /*
>   * Check if a bio or request operation is a zone management operation, with
>   * the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special case
> @@ -565,4 +572,12 @@ struct blk_rq_stat {
>   u64 batch;
>  };
>  
> +struct blk_copy_payload {
> + sector_tdest;
> + int copy_range;
> + int copy_size;
> + int err;
> + struct  range_entry range[];
> +};
> +
>  #endif /* __LINUX_BLK_TYPES_H */
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 81f9e7bec16c..4c7e861e57e4 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -340,10 +340,14 @@ struct queue_limits {
>   unsigned intmax_zone_append_sectors;
>   unsigned intdiscard_granularity;
>   unsigned intdiscard_alignment;
> + unsigned intcopy_offload;
> + unsigned intmax_copy_sectors;
>  
>   unsigned short  max_segments;
>   unsigned short  max_integrity_segments;
>   unsigned short  max_discard_segments;
> + unsigned short  max_copy_range_sectors;
> + unsigned short  max_copy_nr_ranges;
>  
>   unsigned char   misaligned;
>   unsigned char   discard_misaligned;
> @@ -625,6 +629,7 @@ struct request_queue {
>  #define QUEUE_FLAG_RQ_ALLOC_TIME 27  /* record rq->alloc_time_ns */
>  #define QUEUE_FLAG_HCTX_ACTIVE   28  /* at least one blk-mq hctx is 
> active */
>  #define QUEUE_FLAG_NOWAIT   29   /* device supports NOWAIT */
> +#define QUEUE_FLAG_COPY  30  /* supports copy */

I think this should be called QUEUE_FLAG_SIMPLE_COPY to indicate more precisely
the type of copy supported. SCSI XCOPY is more advanced...

>  
>  #define QUEUE_FLAG_MQ_DEFAULT    ((1 << QUEUE_FLAG_IO_STAT) |
> \
>(1 << QUEUE_FLAG_SAME_COMP) |  \
> @@ -647,6 +652,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, 
> struct request_queue *q);
>  #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
>  #define blk_queue_add_random(q)  test_bit(QUEUE_FLAG_ADD_RANDOM, 
> &(q)->queue_flags)
>  #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
> +#define blk_queue_copy(q)test_bit(QUEUE_FLAG_COPY, &(q)->queue_flags)
>  #define blk_queue_zone_resetall(q)   \
>   test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags)
>  #define blk_queue_secure_erase(q) \
> @@ -1061,6 +1067,9 @@ static inline unsigned int 
> blk_queue_get_max_sectors(struct request_queue *q,
>   return min(q->limits.max_discard_sectors,
>  UINT_MAX >> SECTOR_SHIFT);
>  
> + if (unlikely(op == REQ_OP_COPY))
> + return q->limits.max_copy_sectors;
> +
>   if (unlikely(op == REQ_OP_WRITE_SAME))
>   return q->limits.max_write_same_sectors;
>  
> @@ -1335,6 +1344,10 @@ extern int __blkdev_issue_discard(struct block_device 
> *bdev, sector_t sector,
>   sector_t nr_sects, gfp_t gfp_mask, int flags,
>   struct bio **biop);
>  
> +extern int blkdev_issue_copy(struct block_device *bdev, int nr_srcs,
> + struct range_entry *src_rlist, struct block_device *dest_bdev,
> + sector_t dest, gfp_t gfp_mask);
> +
>  #define BLKDEV_ZERO_NOUNMAP  (1 << 0)  /* do not free blocks */
>  #define BLKDEV_ZERO_NOFALLBACK   (1 << 1)  /* don't write explicit 
> zeroes */
>  
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index f44eb0a04afd..5cadb176317a 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -64,6 +64,18 @@ struct fstrim_range {
>   __u64 minlen;
>  };
>  
> +struct range_entry {
> + __u64 src;
> + __u64 len;
> +};
> +
> +struct copy_range {
> + __u64 dest;
> + __u64 nr_range;
> + __u64 range_list;
> + __u64 rsvd;
> +};
> +
>  /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions 
> */
>  #define FILE_DEDUPE_RANGE_SAME   0
>  #define FILE_DEDUPE_RANGE_DIFFERS1
> @@ -184,6 +196,7 @@ struct fsxattr {
>  #define BLKSECDISCARD _IO(0x12,125)
>  #define BLKROTATIONAL _IO(0x12,126)
>  #define BLKZEROOUT _IO(0x12,127)
> +#define BLKCOPY _IOWR(0x12, 128, struct copy_range)
>  /*
>   * A jump here: 130-131 are reserved for zoned block devices
>   * (see uapi/linux/blkzoned.h)
> 


-- 
Damien Le Moal
Western Digital Research


Re: [RFC PATCH v4 1/3] block: export bio_map_kern()

2021-01-04 Thread Damien Le Moal
On 2021/01/04 19:48, SelvaKumar S wrote:
> Export bio_map_kern() so that copy offload emulation can use
> it to add vmalloced memory to bio.
> 
> Signed-off-by: SelvaKumar S 
> ---
>  block/blk-map.c| 3 ++-
>  include/linux/blkdev.h | 2 ++
>  2 files changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/block/blk-map.c b/block/blk-map.c
> index 21630dccac62..50d61475bb68 100644
> --- a/block/blk-map.c
> +++ b/block/blk-map.c
> @@ -378,7 +378,7 @@ static void bio_map_kern_endio(struct bio *bio)
>   *   Map the kernel address into a bio suitable for io to a block
>   *   device. Returns an error pointer in case of error.
>   */
> -static struct bio *bio_map_kern(struct request_queue *q, void *data,
> +struct bio *bio_map_kern(struct request_queue *q, void *data,
>   unsigned int len, gfp_t gfp_mask)
>  {
>   unsigned long kaddr = (unsigned long)data;
> @@ -428,6 +428,7 @@ static struct bio *bio_map_kern(struct request_queue *q, 
> void *data,
>   bio->bi_end_io = bio_map_kern_endio;
>   return bio;
>  }
> +EXPORT_SYMBOL(bio_map_kern);

Simple copy support is a block layer code, so you I do not think you need this.
You only need to remove the static declaration of the function.

>  
>  static void bio_copy_kern_endio(struct bio *bio)
>  {
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 070de09425ad..81f9e7bec16c 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -936,6 +936,8 @@ extern int blk_rq_map_user(struct request_queue *, struct 
> request *,
>  struct rq_map_data *, void __user *, unsigned long,
>  gfp_t);
>  extern int blk_rq_unmap_user(struct bio *);
> +extern struct bio *bio_map_kern(struct request_queue *q, void *data,
> + unsigned int len, gfp_t gfp_mask);
>  extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, 
> unsigned int, gfp_t);
>  extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
>  struct rq_map_data *, const struct iov_iter *,
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH] zonefs: select CONFIG_CRC32

2021-01-03 Thread Damien Le Moal
On 2021/01/04 6:44, Arnd Bergmann wrote:
> From: Arnd Bergmann 
> 
> When CRC32 is disabled, zonefs cannot be linked:
> 
> ld: fs/zonefs/super.o: in function `zonefs_fill_super':
> 
> Add a Kconfig 'select' statement for it.
> 
> Fixes: 8dcc1a9d90c1 ("fs: New zonefs file system")
> Signed-off-by: Arnd Bergmann 
> ---
>  fs/zonefs/Kconfig | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/fs/zonefs/Kconfig b/fs/zonefs/Kconfig
> index ef2697b78820..827278f937fe 100644
> --- a/fs/zonefs/Kconfig
> +++ b/fs/zonefs/Kconfig
> @@ -3,6 +3,7 @@ config ZONEFS_FS
>   depends on BLOCK
>   depends on BLK_DEV_ZONED
>   select FS_IOMAP
> + select CRC32
>   help
> zonefs is a simple file system which exposes zones of a zoned block
> device (e.g. host-managed or host-aware SMR disk drives) as files.
> 

Applied. Thanks Arnd !

-- 
Damien Le Moal
Western Digital Research


Re: [PATCH] dm zoned: select CONFIG_CRC32

2021-01-03 Thread Damien Le Moal
On 2021/01/04 6:41, Arnd Bergmann wrote:
> From: Arnd Bergmann 
> 
> Without crc32 support, this driver fails to link:
> 
> arm-linux-gnueabi-ld: drivers/md/dm-zoned-metadata.o: in function 
> `dmz_write_sb':
> dm-zoned-metadata.c:(.text+0xe98): undefined reference to `crc32_le'
> arm-linux-gnueabi-ld: drivers/md/dm-zoned-metadata.o: in function 
> `dmz_check_sb':
> dm-zoned-metadata.c:(.text+0x7978): undefined reference to `crc32_le'
> 
> Fixes: 3b1a94c88b79 ("dm zoned: drive-managed zoned block device target")
> Signed-off-by: Arnd Bergmann 
> ---
>  drivers/md/Kconfig | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
> index b7e2d914..a67b9ed3ca89 100644
> --- a/drivers/md/Kconfig
> +++ b/drivers/md/Kconfig
> @@ -622,6 +622,7 @@ config DM_ZONED
>   tristate "Drive-managed zoned block device target support"
>   depends on BLK_DEV_DM
>   depends on BLK_DEV_ZONED
> + select CRC32
>   help
> This device-mapper target takes a host-managed or host-aware zoned
>     block device and exposes most of its capacity as a regular block
> 

Reviewed-by: Damien Le Moal 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH v1] drivers: block: skd: remove skd_pci_info()

2020-12-14 Thread Damien Le Moal
On 2020/12/15 0:27, Puranjay Mohan wrote:
> Change the call to skd_pci_info() to pcie_print_link_status().
> pcie_print_link_status() can be used to print the link speed and
> the link width, skd_pci_info() does the same and hence it is removed.
> 
> Signed-off-by: Puranjay Mohan 
> ---
> v1 - Add call to pcie_print_link_status()
> ---
>  drivers/block/skd_main.c | 33 +
>  1 file changed, 1 insertion(+), 32 deletions(-)
> 
> diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
> index a962b4551bed..efd69f349043 100644
> --- a/drivers/block/skd_main.c
> +++ b/drivers/block/skd_main.c
> @@ -3134,40 +3134,10 @@ static const struct pci_device_id skd_pci_tbl[] = {
>  
>  MODULE_DEVICE_TABLE(pci, skd_pci_tbl);
>  
> -static char *skd_pci_info(struct skd_device *skdev, char *str)
> -{
> - int pcie_reg;
> -
> - strcpy(str, "PCIe (");
> - pcie_reg = pci_find_capability(skdev->pdev, PCI_CAP_ID_EXP);
> -
> - if (pcie_reg) {
> -
> - char lwstr[6];
> - uint16_t pcie_lstat, lspeed, lwidth;
> -
> - pcie_reg += 0x12;
> - pci_read_config_word(skdev->pdev, pcie_reg, &pcie_lstat);
> - lspeed = pcie_lstat & (0xF);
> - lwidth = (pcie_lstat & 0x3F0) >> 4;
> -
> - if (lspeed == 1)
> - strcat(str, "2.5GT/s ");
> - else if (lspeed == 2)
> - strcat(str, "5.0GT/s ");
> - else
> - strcat(str, " ");
> - snprintf(lwstr, sizeof(lwstr), "%dX)", lwidth);
> - strcat(str, lwstr);
> - }
> - return str;
> -}
> -
>  static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id 
> *ent)
>  {
>   int i;
>   int rc = 0;
> - char pci_str[32];
>   struct skd_device *skdev;
>  
>   dev_dbg(>dev, "vendor=%04X device=%04x\n", pdev->vendor,
> @@ -3201,8 +3171,7 @@ static int skd_pci_probe(struct pci_dev *pdev, const 
> struct pci_device_id *ent)
>   goto err_out_regions;
>   }
>  
> - skd_pci_info(skdev, pci_str);
> - dev_info(>dev, "%s 64bit\n", pci_str);
> + pcie_print_link_status(pdev);
>  
>   pci_set_master(pdev);
>   rc = pci_enable_pcie_error_reporting(pdev);
> 

Note: V1 of this patch was the one I commented on. This one should thus be V2.

In any case, this looks OK to me.

Acked-by: Damien Le Moal 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH] drivers: block: skd: remove skd_pci_info()

2020-12-13 Thread Damien Le Moal
On Fri, 2020-12-11 at 22:11 +0530, Puranjay Mohan wrote:
> PCI core calls __pcie_print_link_status() for every device, it prints
> both the link width and the link speed. skd_pci_info() does the same
> thing again, hence it can be removed.

Hmmm... On my box, I see this for the skd card:

[8.509243] pci :d8:00.0: [1b39:0001] type 00 class 0x018000
[8.515933] pci :d8:00.0: reg 0x10: [mem 0xfbe0-0xfbe0]
[8.521924] pci :d8:00.0: reg 0x14: [mem 0xfbe1-0xfbe10fff]
[8.527957] pci :d8:00.0: reg 0x30: [mem 0xfbd0-0xfbdf
pref]
[8.534999] pci :d8:00.0: supports D1 D2

No link speed. Checking the code, I think you need to actually call
pcie_print_link_status() (which calls __pcie_print_link_status() with
verbose = true) from the driver to see anything. Otherwise, the PCIe
core will not print anything if the driver is just probing and getting
resources for the card.

> 
> Signed-off-by: Puranjay Mohan 
> ---
>  drivers/block/skd_main.c | 31 ---
>  1 file changed, 31 deletions(-)
> 
> diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c
> index a962b4551bed..da7aac5335d9 100644
> --- a/drivers/block/skd_main.c
> +++ b/drivers/block/skd_main.c

[...]
>  static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id 
> *ent)
>  {
>   int i;
>   int rc = 0;
> - char pci_str[32];
>   struct skd_device *skdev;
> 
>   dev_dbg(>dev, "vendor=%04X device=%04x\n", pdev->vendor,
> @@ -3201,8 +3172,6 @@ static int skd_pci_probe(struct pci_dev *pdev, const 
> struct pci_device_id *ent)
>   goto err_out_regions;
>   }
> 
> - skd_pci_info(skdev, pci_str);
> - dev_info(>dev, "%s 64bit\n", pci_str);

Replace these 2 lines with:

pcie_print_link_status(pdev);

And the link speed information will be printed.



-- 
Damien Le Moal
Western Digital Research


Re: [dm-devel] [RFC PATCH v2 1/2] block: add simple copy support

2020-12-08 Thread Damien Le Moal
On 2020/12/09 13:20, Martin K. Petersen wrote:
> 
> SelvaKumar,
> 
>> Add new BLKCOPY ioctl that offloads copying of multiple sources
>> to a destination to the device.
> 
> Your patches are limited in scope to what is currently possible with
> NVMe. I.e. multiple source ranges to a single destination within the
> same device. That's fine, I think the garbage collection use case is
> valid and worth pursuing.
> 
> I just wanted to go over what the pain points were for the various
> attempts in SCSI over the years.
> 
> The main headache was due the stacking situation with DM and MD.
> Restricting offload to raw SCSI disks would have been simple but not
> really a good fit for most real world developments that often use DM or
> MD to provision the storage.
> 
> Things are simple for DM/MD with reads and writes because you have one
> bio as parent that may get split into many clones that complete
> individually prior to the parent being marked as completed.
> 
> In the copy offload scenario things quickly become complex once both
> source and destination ranges have to be split into multiple commands
> for potentially multiple devices. And these clones then need to be
> correctly paired at the bottom of the stack. There's also no guarantee
> that a 1MB source range maps to a single 1MB destination range. So you
> could end up with an M:N relationship to resolve.
> 
> After a few failed attempts we focused on single source range/single
> destination range. Just to simplify the slicing and dicing. That worked
> reasonably well. However, then came along the token-based commands in
> SCSI and those threw a wrench in the gears. Now the block layer plumbing
> had to support two completely different semantic approaches.
> 
> Inspired by a combination of Mikulas' efforts with pointer matching and
> the token-based approach in SCSI I switched the block layer
> implementation from a single operation (REQ_COPY) to something similar
> to the SCSI token approach with a REQ_COPY_IN and a REQ_COPY_OUT.
> 
> The premise being that you would send a command to the source device and
> "get" the data. In the EXTENDED COPY scenario, the data wasn't really
> anything but a confirmation from the SCSI disk driver that the I/O had
> reached the bottom of the stack without being split by DM/MD. And once
> completion of the REQ_COPY_IN reached blk-lib, a REQ_COPY_OUT would be
> issued and, if that arrived unchanged in the disk driver, get turned
> into an EXTENDED COPY sent to the destination.
> 
> In the token-based scenario the same thing happened except POPULATE
> TOKEN was sent all the way out to the device to receive a cookie
> representing the source block ranges. Upon completion, that cookie was
> used by blk-lib to issue a REQ_COPY_OUT command which was then sent to
> the destination device. Again only if the REQ_COPY_OUT I/O hadn't been
> split traversing the stack.
> 
> The idea was to subsequently leverage the separation of REQ_COPY_IN and
> REQ_COPY_OUT to permit a DM/MD iterative approach to both stages of the
> operation. That seemed to me like the only reasonable way to approach
> the M:N splitting problem (if at all)...

Another simple approach, at least initially for the first drop, would be to
disable any sort of native hardware-based copy for stacked devices. These
devices would simply not advertise copy support in their request queue flags,
forcing the block layer generic copy API to do read-writes, very similar to
dm-kcopyd. Use cases where a drive with native copy support is used directly
would still be able to benefit from the hardware native function, dependent
eventually on a sysfs switch (which by default would be off maybe).

Integrating nvme simple copy in such initial support would I think be quite
simple and scsi xcopy can follow. From there, adding stack device support can be
worked on with little, if any, impact on the existing users of the block copy
API (mostly FSes such as f2fs and btrfs).


-- 
Damien Le Moal
Western Digital Research


Re: [RFC PATCH v2 0/2] add simple copy support

2020-12-07 Thread Damien Le Moal
On 2020/12/07 17:16, javier.g...@samsung.com wrote:
> On 07.12.2020 08:06, Damien Le Moal wrote:
>> On 2020/12/07 16:46, javier.g...@samsung.com wrote:
>>> On 04.12.2020 23:40, Keith Busch wrote:
>>>> On Fri, Dec 04, 2020 at 11:25:12AM +, Damien Le Moal wrote:
>>>>> On 2020/12/04 20:02, SelvaKumar S wrote:
>>>>>> This patchset tries to add support for TP4065a ("Simple Copy Command"),
>>>>>> v2020.05.04 ("Ratified")
>>>>>>
>>>>>> The Specification can be found in following link.
>>>>>> https://nvmexpress.org/wp-content/uploads/NVM-Express-1.4-Ratified-TPs-1.zip
>>>>>>
>>>>>> This is an RFC. Looking forward for any feedbacks or other alternate
>>>>>> designs for plumbing simple copy to IO stack.
>>>>>>
>>>>>> Simple copy command is a copy offloading operation and is  used to copy
>>>>>> multiple contiguous ranges (source_ranges) of LBA's to a single 
>>>>>> destination
>>>>>> LBA within the device reducing traffic between host and device.
>>>>>>
>>>>>> This implementation accepts destination, no of sources and arrays of
>>>>>> source ranges from application and attach it as payload to the bio and
>>>>>> submits to the device.
>>>>>>
>>>>>> Following limits are added to queue limits and are exposed in sysfs
>>>>>> to userspace
>>>>>>  - *max_copy_sectors* limits the sum of all source_range length
>>>>>>  - *max_copy_nr_ranges* limits the number of source ranges
>>>>>>  - *max_copy_range_sectors* limit the maximum number of sectors
>>>>>>  that can constitute a single source range.
>>>>>
>>>>> Same comment as before. I think this is a good start, but for this to be 
>>>>> really
>>>>> useful to users and kernel components alike, this really needs copy 
>>>>> emulation
>>>>> for drives that do not have a native copy feature, similarly to what 
>>>>> write zeros
>>>>> handling for instance: if the drive does not have a copy command (simple 
>>>>> copy
>>>>> for NVMe or XCOPY for scsi), then the block layer should issue read/write
>>>>> commands to seamlessly execute the copy. Otherwise, this will only serve 
>>>>> a small
>>>>> niche for users and will not be optimal for FS and DM drivers that could 
>>>>> be
>>>>> simplified with a generic block layer copy functionality.
>>>>>
>>>>> This is my 10 cents though, others may differ about this.
>>>>
>>>> Yes, I agree that copy emulation support should be included with the
>>>> hardware enabled solution.
>>>
>>> Keith, Damien,
>>>
>>> Can we do the block layer emulation with this patchset and then work in
>>> follow-up patchses on (i) the FS interface with F2FS as a first user and
>>> (ii) other HW accelerations such as XCOPY?
>>
>> The initial patchset supporting NVMe simple copy and emulation copy, all 
>> under
>> an API that probably will be similar that of dm-kcopyd will cover all block
>> devices. Other hardware native support for copy functions such as scsi 
>> extended
>> copy can be added later under the hood without any API changes (or minimal 
>> changes).
> 
> Sounds good. That we can do. We will add a new patch for this.
> 
>>
>> I am not sure what you mean by "FS interface for F2FS": the block layer API 
>> for
>> this copy functionality will be what F2FS (and other FSes) will call. That is
>> the interface, no ?
> 
> Essentially yes.. I mean adding the F2FS logic and potentially some
> helpers to the block layer to aid GC.

GC is very much special to each FS. SO I do not think adding helpers to the
block layer will have value. We should stick to a pure block copy API for that
layer.

> 
>>
>>> For XCOPY, I believe we need to have a separate discussion as much works
>>> is already done that we should align to.
>>
>> I think Martin (added to this thread) and others have looked into it but I do
>> not think that anything made it into the kernel yet.
> 
> Exactly. Looking at some of the code posted through time and recalling
> the discussions at LSF/MM, seems like there are a number of things we
> are not addressing here that could be incorporated down the road, such
> as dedicated syscalls / extensions, multi namespace / device support,
> etc.

dm-kcopyd interface supports copy between multiple devices. That of course would
not enable NVMe simple copy use, but that makes the interface generic enough so
that we should not have any problem with other hardware copy functions.

>>
> 


-- 
Damien Le Moal
Western Digital Research


Re: [RFC PATCH v2 0/2] add simple copy support

2020-12-07 Thread Damien Le Moal
On 2020/12/07 16:46, javier.g...@samsung.com wrote:
> On 04.12.2020 23:40, Keith Busch wrote:
>> On Fri, Dec 04, 2020 at 11:25:12AM +0000, Damien Le Moal wrote:
>>> On 2020/12/04 20:02, SelvaKumar S wrote:
>>>> This patchset tries to add support for TP4065a ("Simple Copy Command"),
>>>> v2020.05.04 ("Ratified")
>>>>
>>>> The Specification can be found in following link.
>>>> https://nvmexpress.org/wp-content/uploads/NVM-Express-1.4-Ratified-TPs-1.zip
>>>>
>>>> This is an RFC. Looking forward for any feedbacks or other alternate
>>>> designs for plumbing simple copy to IO stack.
>>>>
>>>> Simple copy command is a copy offloading operation and is  used to copy
>>>> multiple contiguous ranges (source_ranges) of LBA's to a single destination
>>>> LBA within the device reducing traffic between host and device.
>>>>
>>>> This implementation accepts destination, no of sources and arrays of
>>>> source ranges from application and attach it as payload to the bio and
>>>> submits to the device.
>>>>
>>>> Following limits are added to queue limits and are exposed in sysfs
>>>> to userspace
>>>>- *max_copy_sectors* limits the sum of all source_range length
>>>>- *max_copy_nr_ranges* limits the number of source ranges
>>>>- *max_copy_range_sectors* limit the maximum number of sectors
>>>>that can constitute a single source range.
>>>
>>> Same comment as before. I think this is a good start, but for this to be 
>>> really
>>> useful to users and kernel components alike, this really needs copy 
>>> emulation
>>> for drives that do not have a native copy feature, similarly to what write 
>>> zeros
>>> handling for instance: if the drive does not have a copy command (simple 
>>> copy
>>> for NVMe or XCOPY for scsi), then the block layer should issue read/write
>>> commands to seamlessly execute the copy. Otherwise, this will only serve a 
>>> small
>>> niche for users and will not be optimal for FS and DM drivers that could be
>>> simplified with a generic block layer copy functionality.
>>>
>>> This is my 10 cents though, others may differ about this.
>>
>> Yes, I agree that copy emulation support should be included with the
>> hardware enabled solution.
> 
> Keith, Damien,
> 
> Can we do the block layer emulation with this patchset and then work in
> follow-up patchses on (i) the FS interface with F2FS as a first user and
> (ii) other HW accelerations such as XCOPY?

The initial patchset supporting NVMe simple copy and emulation copy, all under
an API that probably will be similar that of dm-kcopyd will cover all block
devices. Other hardware native support for copy functions such as scsi extended
copy can be added later under the hood without any API changes (or minimal 
changes).

I am not sure what you mean by "FS interface for F2FS": the block layer API for
this copy functionality will be what F2FS (and other FSes) will call. That is
the interface, no ?

> For XCOPY, I believe we need to have a separate discussion as much works
> is already done that we should align to.

I think Martin (added to this thread) and others have looked into it but I do
not think that anything made it into the kernel yet.


-- 
Damien Le Moal
Western Digital Research


Re: [RFC PATCH v2 0/2] add simple copy support

2020-12-04 Thread Damien Le Moal
On 2020/12/04 20:02, SelvaKumar S wrote:
> This patchset tries to add support for TP4065a ("Simple Copy Command"),
> v2020.05.04 ("Ratified")
> 
> The Specification can be found in following link.
> https://nvmexpress.org/wp-content/uploads/NVM-Express-1.4-Ratified-TPs-1.zip
> 
> This is an RFC. Looking forward for any feedbacks or other alternate
> designs for plumbing simple copy to IO stack.
> 
> Simple copy command is a copy offloading operation and is  used to copy
> multiple contiguous ranges (source_ranges) of LBA's to a single destination
> LBA within the device reducing traffic between host and device.
> 
> This implementation accepts destination, no of sources and arrays of
> source ranges from application and attach it as payload to the bio and
> submits to the device.
> 
> Following limits are added to queue limits and are exposed in sysfs
> to userspace
>   - *max_copy_sectors* limits the sum of all source_range length
>   - *max_copy_nr_ranges* limits the number of source ranges
>   - *max_copy_range_sectors* limit the maximum number of sectors
>   that can constitute a single source range.

Same comment as before. I think this is a good start, but for this to be really
useful to users and kernel components alike, this really needs copy emulation
for drives that do not have a native copy feature, similarly to what write zeros
handling for instance: if the drive does not have a copy command (simple copy
for NVMe or XCOPY for scsi), then the block layer should issue read/write
commands to seamlessly execute the copy. Otherwise, this will only serve a small
niche for users and will not be optimal for FS and DM drivers that could be
simplified with a generic block layer copy functionality.

This is my 10 cents though, others may differ about this.

> 
> Changes from v1:
> 
> 1. Fix memory leak in __blkdev_issue_copy
> 2. Unmark blk_check_copy inline
> 3. Fix line break in blk_check_copy_eod
> 4. Remove p checks and made code more readable
> 5. Don't use bio_set_op_attrs and remove op and set
>bi_opf directly
> 6. Use struct_size to calculate total_size
> 7. Fix partition remap of copy destination
> 8. Remove mcl,mssrl,msrc from nvme_ns
> 9. Initialize copy queue limits to 0 in nvme_config_copy
> 10. Remove return in QUEUE_FLAG_COPY check
> 11. Remove unused OCFS
> 
> SelvaKumar S (2):
>   block: add simple copy support
>   nvme: add simple copy support
> 
>  block/blk-core.c  |  94 ++---
>  block/blk-lib.c   | 123 ++
>  block/blk-merge.c |   2 +
>  block/blk-settings.c  |  11 
>  block/blk-sysfs.c |  23 +++
>  block/blk-zoned.c |   1 +
>  block/bounce.c|   1 +
>  block/ioctl.c |  43 +
>  drivers/nvme/host/core.c  |  87 +++
>  include/linux/bio.h   |   1 +
>  include/linux/blk_types.h |  15 +
>  include/linux/blkdev.h|  15 +
>  include/linux/nvme.h  |  43 -
>  include/uapi/linux/fs.h   |  13 
>  14 files changed, 461 insertions(+), 11 deletions(-)
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH] riscv: defconfig: k210: Disable CONFIG_VT

2020-11-25 Thread Damien Le Moal
On Wed, 2020-11-25 at 13:47 +0100, Geert Uytterhoeven wrote:
> Hi Damien,
> 
> On Wed, Nov 25, 2020 at 12:00 PM Damien Le Moal  wrote:
> > On 2020/11/25 18:26, Geert Uytterhoeven wrote:
> > > On Wed, Nov 25, 2020 at 10:02 AM Damien Le Moal  
> > > wrote:
> > > > On 2020/11/25 17:51, Geert Uytterhoeven wrote:
> > > > I was just fiddling with CONFIG_UNIX98_PTYS. Disabling it is OK with 
> > > > the simple
> > > > busybox userspace (no telnet/xterm like app running). But it saves only 
> > > > about
> > > > 1KB with my toolchain (gcc 9.3). So I left that one enabled. I am 
> > > > surprised that
> > > > you see 16K size impact... How big is your image ?
> > > > 
> > > > For me, it is 1.768 MB right now for the sdcard defconfig, with 
> > > > CONFIG_VT
> > > > disabled and ext2 enabled.
> > > 
> > > It might depend on how you measure.  "size" says 15 KiB impact for UNIX98
> > > ptys, while bloat-o-meter reported less than 7 (my script uses "size").
> > 
> > I look at the size of the arch/riscv/boot/loader.bin file since that is what
> > gets loaded in RAM and booted. It is significantly smaller than vmlinux file
> > size. E.g. for the sd card defconfig, I have:
> > 
> > vmlinux: 2369920 B
> > loader.bin: 1751250 B
> 
> Doesn't loader.bin lack bss?
> Bss does consume RAM, so you do want to take that into account, too.

Indeed. Good point. Thanks !

> 
> Gr{oetje,eeting}s,
> 
> Geert
> 

-- 
Damien Le Moal
Western Digital


Re: [PATCH] riscv: defconfig: k210: Disable CONFIG_VT

2020-11-25 Thread Damien Le Moal
On 2020/11/25 20:00, Damien Le Moal wrote:
> On 2020/11/25 18:26, Geert Uytterhoeven wrote:
>> Hi Damien,
>>
>> On Wed, Nov 25, 2020 at 10:02 AM Damien Le Moal  
>> wrote:
>>> On 2020/11/25 17:51, Geert Uytterhoeven wrote:
>>>> On Wed, Nov 25, 2020 at 7:14 AM Damien Le Moal  
>>>> wrote:
>>>>> On 2020/11/25 3:57, Geert Uytterhoeven wrote:
>>>>>> There is no need to enable Virtual Terminal support in the Canaan
>>>>>> Kendryte K210 defconfigs, as no terminal devices are supported and
>>>>>> enabled.  Hence disable CONFIG_VT, and remove the no longer needed
>>>>>> override for CONFIG_VGA_CONSOLE.
>>>>>>
>>>>>> This reduces kernel size by ca. 65 KiB.
>>>>>
>>>>> Indeed, nice saving. Just tested, and all is good.
>>>>
>>>> I used my old script[1] to check the impact of disabling config options.
>>
>>>> I haven't done enough riscv kernel development yet to assess if I need
>>>> CONFIG_FRAME_POINTER or not.
>>>
>>> Disabling it significantly reduced code size for me. Since the series is 
>>> more
>>> stable now, it is not really needed, so I disabled it in the defconfig.
>>>
>>> I was just fiddling with CONFIG_UNIX98_PTYS. Disabling it is OK with the 
>>> simple
>>> busybox userspace (no telnet/xterm like app running). But it saves only 
>>> about
>>> 1KB with my toolchain (gcc 9.3). So I left that one enabled. I am surprised 
>>> that
>>> you see 16K size impact... How big is your image ?
>>>
>>> For me, it is 1.768 MB right now for the sdcard defconfig, with CONFIG_VT
>>> disabled and ext2 enabled.
>>
>> It might depend on how you measure.  "size" says 15 KiB impact for UNIX98
>> ptys, while bloat-o-meter reported less than 7 (my script uses "size").
> 
> I look at the size of the arch/riscv/boot/loader.bin file since that is what
> gets loaded in RAM and booted. It is significantly smaller than vmlinux file
> size. E.g. for the sd card defconfig, I have:
> 
> vmlinux: 2369920 B
> loader.bin: 1751250 B
> 
>> I'm at 1.88 MiB, with ext4 and without frame pointers.
>> I also got rid of the EFI partition support, and a few I/O schedulers:
>>
>> +CONFIG_PARTITION_ADVANCED=y
>> +# CONFIG_EFI_PARTITION is not set
>> +# CONFIG_MQ_IOSCHED_DEADLINE is not set
>> +# CONFIG_MQ_IOSCHED_KYBER is not set
> 
> I have all of these disabled. The schedulers are forced disabled in the sdcard
> defconfig.
> 
> I also noticed that it hugely depend on the compiler. Using the buildroot
> generated rv64 gcc 10, the kernel image goes up to almost 2 MB. So for the
> kernel, I keep using the bootlin precompiled gcc 9.3:
> 
> https://toolchains.bootlin.com/
> 
> Just noticed that they now have a 10.2 version available. Will try it out.

Correction: my PATH was actually pointing to the Fedora riscv64 gcc from the
distro rpm package which is version 10.2, not 9.3.

> 
>>
>> Gr{oetje,eeting}s,
>>
>> Geert
>>
> 
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH] riscv: defconfig: k210: Disable CONFIG_VT

2020-11-25 Thread Damien Le Moal
On 2020/11/25 18:26, Geert Uytterhoeven wrote:
> Hi Damien,
> 
> On Wed, Nov 25, 2020 at 10:02 AM Damien Le Moal  wrote:
>> On 2020/11/25 17:51, Geert Uytterhoeven wrote:
>>> On Wed, Nov 25, 2020 at 7:14 AM Damien Le Moal  
>>> wrote:
>>>> On 2020/11/25 3:57, Geert Uytterhoeven wrote:
>>>>> There is no need to enable Virtual Terminal support in the Canaan
>>>>> Kendryte K210 defconfigs, as no terminal devices are supported and
>>>>> enabled.  Hence disable CONFIG_VT, and remove the no longer needed
>>>>> override for CONFIG_VGA_CONSOLE.
>>>>>
>>>>> This reduces kernel size by ca. 65 KiB.
>>>>
>>>> Indeed, nice saving. Just tested, and all is good.
>>>
>>> I used my old script[1] to check the impact of disabling config options.
> 
>>> I haven't done enough riscv kernel development yet to assess if I need
>>> CONFIG_FRAME_POINTER or not.
>>
>> Disabling it significantly reduced code size for me. Since the series is more
>> stable now, it is not really needed, so I disabled it in the defconfig.
>>
>> I was just fiddling with CONFIG_UNIX98_PTYS. Disabling it is OK with the 
>> simple
>> busybox userspace (no telnet/xterm like app running). But it saves only about
>> 1KB with my toolchain (gcc 9.3). So I left that one enabled. I am surprised 
>> that
>> you see 16K size impact... How big is your image ?
>>
>> For me, it is 1.768 MB right now for the sdcard defconfig, with CONFIG_VT
>> disabled and ext2 enabled.
> 
> It might depend on how you measure.  "size" says 15 KiB impact for UNIX98
> ptys, while bloat-o-meter reported less than 7 (my script uses "size").

I look at the size of the arch/riscv/boot/loader.bin file since that is what
gets loaded in RAM and booted. It is significantly smaller than vmlinux file
size. E.g. for the sd card defconfig, I have:

vmlinux: 2369920 B
loader.bin: 1751250 B

> I'm at 1.88 MiB, with ext4 and without frame pointers.
> I also got rid of the EFI partition support, and a few I/O schedulers:
> 
> +CONFIG_PARTITION_ADVANCED=y
> +# CONFIG_EFI_PARTITION is not set
> +# CONFIG_MQ_IOSCHED_DEADLINE is not set
> +# CONFIG_MQ_IOSCHED_KYBER is not set

I have all of these disabled. The schedulers are forced disabled in the sdcard
defconfig.

I also noticed that it hugely depend on the compiler. Using the buildroot
generated rv64 gcc 10, the kernel image goes up to almost 2 MB. So for the
kernel, I keep using the bootlin precompiled gcc 9.3:

https://toolchains.bootlin.com/

Just noticed that they now have a 10.2 version available. Will try it out.

> 
> Gr{oetje,eeting}s,
> 
> Geert
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH] riscv: defconfig: k210: Disable CONFIG_VT

2020-11-25 Thread Damien Le Moal
On 2020/11/25 17:51, Geert Uytterhoeven wrote:
> Hi Damien,
> 
> On Wed, Nov 25, 2020 at 7:14 AM Damien Le Moal  wrote:
>> On 2020/11/25 3:57, Geert Uytterhoeven wrote:
>>> There is no need to enable Virtual Terminal support in the Canaan
>>> Kendryte K210 defconfigs, as no terminal devices are supported and
>>> enabled.  Hence disable CONFIG_VT, and remove the no longer needed
>>> override for CONFIG_VGA_CONSOLE.
>>>
>>> This reduces kernel size by ca. 65 KiB.
>>
>> Indeed, nice saving. Just tested, and all is good.
> 
> I used my old script[1] to check the impact of disabling config options.
> 
> I don't see any other low-hanging fruits:
> 
> Disabling CONFIG_BLOCK saves 492890 bytes
> Disabling CONFIG_EXT4_FS saves 322528 bytes
> Disabling CONFIG_PRINTK saves 214612 bytes
> Disabling CONFIG_SMP saves 214486 bytes
> Disabling CONFIG_FRAME_POINTER saves 166368 bytes
> Disabling CONFIG_TTY saves 156618 bytes
> Disabling CONFIG_PROC_FS saves 110274 bytes
> Disabling CONFIG_MMC saves 87656 bytes
> Disabling CONFIG_VT saves 70350 bytes
> Disabling CONFIG_SYSFS saves 62298 bytes
> Disabling CONFIG_BUG saves 50882 bytes
> Disabling CONFIG_SPI saves 34420 bytes
> Disabling CONFIG_SOC_CANAAN saves 34286 bytes
> Disabling CONFIG_I2C saves 34086 bytes
> Disabling CONFIG_PROC_SYSCTL saves 23890 bytes
> Disabling CONFIG_POSIX_TIMERS saves 18388 bytes
> Disabling CONFIG_I2C_DESIGNWARE_PLATFORM saves 17530 bytes
> Disabling CONFIG_MMC_BLOCK saves 17200 bytes
> Disabling CONFIG_UNIX98_PTYS saves 16360 bytes
> Disabling CONFIG_MULTIUSER saves 16148 bytes
> Disabling CONFIG_NEW_LEDS saves 15964 bytes
> Disabling CONFIG_SPI_DESIGNWARE saves 15434 bytes
> Disabling CONFIG_GPIO_CDEV saves 15352 bytes
> Disabling CONFIG_MMC_SPI saves 14754 bytes
> Disabling CONFIG_SOC_CANAAN_K210_DTB_BUILTIN saves 13864 bytes
> 
> (Yes, I have ext4 enabled ;-)
> 
> I haven't done enough riscv kernel development yet to assess if I need
> CONFIG_FRAME_POINTER or not.

Disabling it significantly reduced code size for me. Since the series is more
stable now, it is not really needed, so I disabled it in the defconfig.

I was just fiddling with CONFIG_UNIX98_PTYS. Disabling it is OK with the simple
busybox userspace (no telnet/xterm like app running). But it saves only about
1KB with my toolchain (gcc 9.3). So I left that one enabled. I am surprised that
you see 16K size impact... How big is your image ?

For me, it is 1.768 MB right now for the sdcard defconfig, with CONFIG_VT
disabled and ext2 enabled.

Disabling the block layer, ext2 and mmc driver, the size goes down to 1.511 MB
without any initramfs cpio file embedded.

> 
> [1] 
> https://github.com/geertu/linux-scripts/blob/master/linux-analyze-marginal-sizes

Thanks ! I will try to run this on my end.

> 
> Gr{oetje,eeting}s,
> 
> Geert
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH] riscv: defconfig: k210: Disable CONFIG_VT

2020-11-25 Thread Damien Le Moal
On Wed, 2020-11-25 at 09:20 +0100, Geert Uytterhoeven wrote:
> Hi Damien,
> 
> On Wed, Nov 25, 2020 at 7:14 AM Damien Le Moal  wrote:
> > On 2020/11/25 3:57, Geert Uytterhoeven wrote:
> > > There is no need to enable Virtual Terminal support in the Canaan
> > > Kendryte K210 defconfigs, as no terminal devices are supported and
> > > enabled.  Hence disable CONFIG_VT, and remove the no longer needed
> > > override for CONFIG_VGA_CONSOLE.
> > > 
> > > This reduces kernel size by ca. 65 KiB.
> > 
> > Indeed, nice saving. Just tested, and all is good.
> > 
> > Can I squash this patch into the 2 defconfig update patches of the series,
> > adding your signed-off-by ? Or would you prefer that I keep it as a 
> > separate patch ?
> 
> Feel free to squash it into your queued updates.
> No need to add my SoB, as the full updates don't pass through me.

Done. Thanks !

> 
> Thanks!
> 
> Gr{oetje,eeting}s,
> 
> Geert
> 

-- 
Damien Le Moal
Western Digital


Re: [PATCH] efi: EFI_EARLYCON should depend on EFI

2020-11-25 Thread Damien Le Moal
On Tue, 2020-11-24 at 20:16 +0100, Geert Uytterhoeven wrote:
> CONFIG_EFI_EARLYCON defaults to yes, and thus is enabled on systems that
> do not support EFI, or do not have EFI support enabled, but do satisfy
> the symbol's other dependencies.
> 
> While drivers/firmware/efi/ won't be entered during the build phase if
> CONFIG_EFI=n, and drivers/firmware/efi/earlycon.c itself thus won't be
> built, enabling EFI_EARLYCON does force-enable CONFIG_FONT_SUPPORT and
> CONFIG_ARCH_USE_MEMREMAP_PROT, and CONFIG_FONT_8x16, which is
> undesirable.
> 
> Fix this by making CONFIG_EFI_EARLYCON depend on CONFIG_EFI.
> 
> This reduces kernel size on headless systems by more than 4 KiB.
> 
> Fixes: 69c1f396f25b805a ("efi/x86: Convert x86 EFI earlyprintk into generic 
> earlycon implementation")
> Signed-off-by: Geert Uytterhoeven 
> ---
>  drivers/firmware/efi/Kconfig | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/firmware/efi/Kconfig b/drivers/firmware/efi/Kconfig
> index b452cfa2100b401c..1dd1f7784f0888ff 100644
> --- a/drivers/firmware/efi/Kconfig
> +++ b/drivers/firmware/efi/Kconfig
> @@ -270,7 +270,7 @@ config EFI_DEV_PATH_PARSER
>  
> 
>  config EFI_EARLYCON
>   def_bool y
> - depends on SERIAL_EARLYCON && !ARM && !IA64
> + depends on EFI && SERIAL_EARLYCON && !ARM && !IA64
>   select FONT_SUPPORT
>   select ARCH_USE_MEMREMAP_PROT
>  
> 

Looks good to me.
Reviewed-by: Damien Le Moal 

-- 
Damien Le Moal
Western Digital


Re: [PATCH] riscv: defconfig: k210: Disable CONFIG_VT

2020-11-24 Thread Damien Le Moal
On 2020/11/25 3:57, Geert Uytterhoeven wrote:
> There is no need to enable Virtual Terminal support in the Canaan
> Kendryte K210 defconfigs, as no terminal devices are supported and
> enabled.  Hence disable CONFIG_VT, and remove the no longer needed
> override for CONFIG_VGA_CONSOLE.
> 
> This reduces kernel size by ca. 65 KiB.

Indeed, nice saving. Just tested, and all is good.

Can I squash this patch into the 2 defconfig update patches of the series,
adding your signed-off-by ? Or would you prefer that I keep it as a separate 
patch ?

> 
> Signed-off-by: Geert Uytterhoeven 
> ---
> Against k210-sysctl-v15
> ---
>  arch/riscv/configs/nommu_k210_defconfig| 2 +-
>  arch/riscv/configs/nommu_k210_sdcard_defconfig | 2 +-
>  2 files changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/riscv/configs/nommu_k210_defconfig 
> b/arch/riscv/configs/nommu_k210_defconfig
> index df89d53bd125679b..9262223037e43479 100644
> --- a/arch/riscv/configs/nommu_k210_defconfig
> +++ b/arch/riscv/configs/nommu_k210_defconfig
> @@ -48,6 +48,7 @@ CONFIG_DEVTMPFS_MOUNT=y
>  # CONFIG_INPUT_KEYBOARD is not set
>  # CONFIG_INPUT_MOUSE is not set
>  # CONFIG_SERIO is not set
> +# CONFIG_VT is not set
>  # CONFIG_LEGACY_PTYS is not set
>  # CONFIG_LDISC_AUTOLOAD is not set
>  # CONFIG_HW_RANDOM is not set
> @@ -67,7 +68,6 @@ CONFIG_GPIO_SIFIVE=y
>  CONFIG_POWER_RESET=y
>  CONFIG_POWER_RESET_SYSCON=y
>  # CONFIG_HWMON is not set
> -# CONFIG_VGA_CONSOLE is not set
>  # CONFIG_HID is not set
>  # CONFIG_USB_SUPPORT is not set
>  CONFIG_NEW_LEDS=y
> diff --git a/arch/riscv/configs/nommu_k210_sdcard_defconfig 
> b/arch/riscv/configs/nommu_k210_sdcard_defconfig
> index 3d2cb4747e7f85b7..4cd1715dd0cf3747 100644
> --- a/arch/riscv/configs/nommu_k210_sdcard_defconfig
> +++ b/arch/riscv/configs/nommu_k210_sdcard_defconfig
> @@ -41,6 +41,7 @@ CONFIG_DEVTMPFS_MOUNT=y
>  # CONFIG_INPUT_KEYBOARD is not set
>  # CONFIG_INPUT_MOUSE is not set
>  # CONFIG_SERIO is not set
> +# CONFIG_VT is not set
>  # CONFIG_LEGACY_PTYS is not set
>  # CONFIG_LDISC_AUTOLOAD is not set
>  # CONFIG_HW_RANDOM is not set
> @@ -60,7 +61,6 @@ CONFIG_GPIO_SIFIVE=y
>  CONFIG_POWER_RESET=y
>  CONFIG_POWER_RESET_SYSCON=y
>  # CONFIG_HWMON is not set
> -# CONFIG_VGA_CONSOLE is not set
>  # CONFIG_HID is not set
>  # CONFIG_USB_SUPPORT is not set
>  CONFIG_MMC=y
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH 5.9 374/391] null_blk: synchronization fix for zoned device

2020-11-03 Thread Damien Le Moal
On 2020/11/04 5:52, Greg Kroah-Hartman wrote:
> From: Kanchan Joshi 
> 
> commit 35bc10b2eafbb701064b94f283b77c54d3304842 upstream.
> 
> Parallel write,read,zone-mgmt operations accessing/altering zone state
> and write-pointer may get into race. Avoid the situation by using a new
> spinlock for zoned device.
> Concurrent zone-appends (on a zone) returning same write-pointer issue
> is also avoided using this lock.
> 
> Cc: sta...@vger.kernel.org
> Fixes: e0489ed5daeb ("null_blk: Support REQ_OP_ZONE_APPEND")
> Signed-off-by: Kanchan Joshi 
> Reviewed-by: Damien Le Moal 
> Signed-off-by: Jens Axboe 
> Signed-off-by: Greg Kroah-Hartman 

Greg,

I sent a followup patch fixing a bug introduced by this patch, but I forgot to
mark it for stable. The patch is

commit aa1c09cb65e2 "null_blk: Fix locking in zoned mode"

Could you pickup that one too please ?

Best regards.

>
> ---
>  drivers/block/null_blk.h   |1 +
>  drivers/block/null_blk_zoned.c |   22 ++
>  2 files changed, 19 insertions(+), 4 deletions(-)
> 
> --- a/drivers/block/null_blk.h
> +++ b/drivers/block/null_blk.h
> @@ -44,6 +44,7 @@ struct nullb_device {
>   unsigned int nr_zones;
>   struct blk_zone *zones;
>   sector_t zone_size_sects;
> + spinlock_t zone_lock;
>  
>   unsigned long size; /* device size in MB */
>   unsigned long completion_nsec; /* time in ns to complete a request */
> --- a/drivers/block/null_blk_zoned.c
> +++ b/drivers/block/null_blk_zoned.c
> @@ -45,6 +45,7 @@ int null_init_zoned_dev(struct nullb_dev
>   if (!dev->zones)
>   return -ENOMEM;
>  
> + spin_lock_init(&dev->zone_lock);
>   if (dev->zone_nr_conv >= dev->nr_zones) {
>   dev->zone_nr_conv = dev->nr_zones - 1;
>   pr_info("changed the number of conventional zones to %u",
> @@ -131,8 +132,11 @@ int null_report_zones(struct gendisk *di
>* So use a local copy to avoid corruption of the device zone
>* array.
>*/
> + spin_lock_irq(&dev->zone_lock);
>   memcpy(&zone, &dev->zones[first_zone + i],
>  sizeof(struct blk_zone));
> + spin_unlock_irq(&dev->zone_lock);
> +
>   error = cb(&zone, i, data);
>   if (error)
>   return error;
> @@ -277,18 +281,28 @@ static blk_status_t null_zone_mgmt(struc
>  blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op,
>   sector_t sector, sector_t nr_sectors)
>  {
> + blk_status_t sts;
> + struct nullb_device *dev = cmd->nq->dev;
> +
> + spin_lock_irq(&dev->zone_lock);
>   switch (op) {
>   case REQ_OP_WRITE:
> - return null_zone_write(cmd, sector, nr_sectors, false);
> + sts = null_zone_write(cmd, sector, nr_sectors, false);
> + break;
>   case REQ_OP_ZONE_APPEND:
> - return null_zone_write(cmd, sector, nr_sectors, true);
> + sts = null_zone_write(cmd, sector, nr_sectors, true);
> + break;
>   case REQ_OP_ZONE_RESET:
>   case REQ_OP_ZONE_RESET_ALL:
>   case REQ_OP_ZONE_OPEN:
>   case REQ_OP_ZONE_CLOSE:
>   case REQ_OP_ZONE_FINISH:
> - return null_zone_mgmt(cmd, op, sector);
> + sts = null_zone_mgmt(cmd, op, sector);
> + break;
>   default:
> - return null_process_cmd(cmd, op, sector, nr_sectors);
> + sts = null_process_cmd(cmd, op, sector, nr_sectors);
>   }
> + spin_unlock_irq(&dev->zone_lock);
> +
> + return sts;
>  }
> 
> 
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH 0/2] block layer filter and block device snapshot module

2020-10-22 Thread Damien Le Moal
On 2020/10/22 18:43, Sergei Shtepa wrote:
> The 10/22/2020 08:58, Hannes Reinecke wrote:
>> On 10/21/20 4:10 PM, Sergei Shtepa wrote:
>>> The 10/21/2020 16:31, Hannes Reinecke wrote:
>>>> I do understand where you are coming from, but then we already have a
>>>> dm-snap which does exactly what you want to achieve.
>>>> Of course, that would require a reconfiguration of the storage stack on
>>>> the machine, which is not always possible (or desired).
>>>
>>> Yes, reconfiguring the storage stack on a machine is almost impossible.
>>>
>>>>
>>>> What I _could_ imagine would be a 'dm-intercept' thingie, which
>>>> redirects the current submit_bio() function for any block device, and
>>>> re-routes that to a linear device-mapper device pointing back to the
>>>> original block device.
>>>>
>>>> That way you could attach it to basically any block device, _and_ can
>>>> use the existing device-mapper functionality to do fancy stuff once the
>>>> submit_io() callback has been re-routed.
>>>>
>>>> And it also would help in other scenarios, too; with such a
>>>> functionality we could seamlessly clone devices without having to move
>>>> the whole setup to device-mapper first.
>>>
>>> Hm...
>>> Did I understand correctly that the filter itself can be left approximately
>>> as it is, but the blk-snap module can be replaced with 'dm-intercept',
>>> which would use the re-route mechanism from the dm?
>>> I think I may be able to implement it, if you describe your idea in more
>>> detail.
>>>
>>>
>> Actually, once we have an dm-intercept, why do you need the block-layer 
>> filter at all?
>>  From you initial description the block-layer filter was implemented 
>> such that blk-snap could work; but if we have dm-intercept (and with it 
>> the ability to use device-mapper functionality even for normal block 
>> devices) there wouldn't be any need for the block-layer filter, no?
> 
> Maybe, but the problem is that I can't imagine how to implement
> dm-intercept yet. 
> How to use dm to implement interception without changing the stack
> of block devices. We'll have to make a hook somewhere, isn't it?

Once your dm-intercept target driver is inserted with "dmsetup" or any user land
tool you implement using libdevicemapper, the "hooks" will naturally be in place
since the dm infrastructure already does that: all submitted BIOs will be passed
to dm-intercept through the "map" operation defined in the target_type
descriptor. It is then that driver job to execute the BIOs as it sees fit.

Look at simple device mappers like dm-linear or dm-flakey for hints of how
things work (driver/md/dm-linear.c). More complex dm drivers like dm-crypt,
dm-writecache or dm-thin can give you hints about more features of device 
mapper.
Functions such as __map_bio() in drivers/md/dm.c are the core of DM and show
what happens to BIOs depending on the the return value of the map operation.
dm_submit_bio() and __split_and_process_bio() is the entry points for BIO
processing in DM.

> 
>>
>> Cheers,
>>
>> Hannes
>> -- 
>> Dr. Hannes ReineckeKernel Storage Architect
>> h...@suse.de  +49 911 74053 688
>> SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
>> HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH 1/2] Block layer filter - second version

2020-10-21 Thread Damien Le Moal
 is kept as
the direct version (as today) and you use a "submit_bio_filtered()" where 
needed.

>  
>   sdio->bio = NULL;
>   sdio->boundary = 0;
> diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
> index c1aafb2ab990..e05f20ce8b5f 100644
> --- a/fs/iomap/direct-io.c
> +++ b/fs/iomap/direct-io.c
> @@ -73,7 +73,7 @@ static void iomap_dio_submit_bio(struct iomap_dio *dio, 
> struct iomap *iomap,
>   file_inode(dio->iocb->ki_filp),
>   iomap, bio, pos);
>   else
> - dio->submit.cookie = submit_bio(bio);
> + dio->submit.cookie = submit_bio_direct(bio);
>  }
>  
>  static ssize_t iomap_dio_complete(struct iomap_dio *dio)
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index c6d765382926..5b0a32697207 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -10,6 +10,7 @@
>  #include 
>  /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */
>  #include 
> +#include 
>  
>  #define BIO_DEBUG
>  
> @@ -411,7 +412,8 @@ static inline struct bio *bio_kmalloc(gfp_t gfp_mask, 
> unsigned int nr_iovecs)
>   return bio_alloc_bioset(gfp_mask, nr_iovecs, NULL);
>  }
>  
> -extern blk_qc_t submit_bio(struct bio *);
> +extern blk_qc_t submit_bio_direct(struct bio *bio);
> +extern void submit_bio(struct bio *bio);
>  
>  extern void bio_endio(struct bio *);
>  
> diff --git a/include/linux/blk-filter.h b/include/linux/blk-filter.h
> new file mode 100644
> index ..f3e79e5b4586
> --- /dev/null
> +++ b/include/linux/blk-filter.h
> @@ -0,0 +1,76 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +/*
> + * API declarations for kernel modules utilizing block device filters
> + */
> +
> +#ifndef BLK_FILTER_H
> +#define BLK_FILTER_H
> +
> +#ifdef CONFIG_BLK_FILTER
> +#include 
> +
> +struct blk_filter_ops {
> + /*
> +  * Intercept bio callback.
> +  *
> +  * Returns true if the request was intercepted and placed in the
> +  * queue for processing. Otherwise submit_bio_direct() calling
> +  * needed.
> +  */
> + bool (*filter_bio)(struct bio *bio, void *filter_data);
> +
> + /*
> +  * Callback to a request to add block device to the filter.
> +  *
> +  * Returns true if the block device will be filtered.
> +  * p_filter_data gets a pointer to data that is unique to
> +  * this device.
> +  */
> + bool (*part_add)(dev_t devt, void **p_filter_data);
> +
> + /*
> +  * Callback to remove block device from the filter.
> +  */
> + void (*part_del)(void *filter_data);
> +};
> +
> +struct blk_filter {
> + struct list_head link;
> + struct kref kref;
> + struct blk_filter_ops *ops;
> +};
> +
> +/*
> + * Register/unregister device to filter
> + */
> +void *blk_filter_register(struct blk_filter_ops *ops);
> +
> +void blk_filter_unregister(void *filter);
> +
> +/*
> + * Attach/detach device to filter
> + */
> +int blk_filter_attach(dev_t devt, void *filter, void *filter_data);
> +
> +void blk_filter_detach(dev_t devt);
> +
> +/*
> + * For a consistent state of the file system use the freeze_bdev/thaw_bdev.
> + * But in addition, to ensure that the filter is not in the state of
> + * intercepting the next BIO, you need to call 
> blk_filter_freeze/blk_filter_thaw.
> + * This is especially actual if there is no file system on the disk.
> + */
> +
> +void blk_filter_freeze(struct block_device *bdev);
> +
> +void blk_filter_thaw(struct block_device *bdev);
> +
> +/*
> + * Filters intercept function
> + */
> +void blk_filter_submit_bio(struct bio *bio);
> +
> +#endif /* CONFIG_BLK_FILTER */
> +
> +#endif
> diff --git a/include/linux/genhd.h b/include/linux/genhd.h
> index 4ab853461dff..514fab6b947e 100644
> --- a/include/linux/genhd.h
> +++ b/include/linux/genhd.h
> @@ -4,7 +4,7 @@
>  
>  /*
>   *   genhd.h Copyright (C) 1992 Drew Eckhardt
> - *   Generic hard disk header file by  
> + *   Generic hard disk header file by
>   *   Drew Eckhardt
>   *
>   *   
> @@ -75,6 +75,12 @@ struct hd_struct {
>   int make_it_fail;
>  #endif
>   struct rcu_work rcu_work;
> +
> +#ifdef CONFIG_BLK_FILTER
> + struct rw_semaphore filter_rw_lockup; /* for freezing block device*/
> + struct blk_filter *filter; /* block layer filter*/
> + void *filter_data; /*specific for each block device filters data*/
> +#endif
>  };
>  
>  /**
> diff --git a/kernel/power/swap.c b/kernel/power/swap.c
> index 01e2858b5fe3..5287346b87a1 100644
> --- a/kernel/power/swap.c
> +++ b/kernel/power/swap.c
> @@ -283,7 +283,7 @@ static int hib_submit_io(int op, int op_flags, pgoff_t 
> page_off, void *addr,
>   bio->bi_end_io = hib_end_io;
>   bio->bi_private = hb;
>   atomic_inc(&hb->count);
> - submit_bio(bio);
> + submit_bio_direct(bio);
>   } else {
>   error = submit_bio_wait(bio);
>   bio_put(bio);
> diff --git a/mm/page_io.c b/mm/page_io.c
> index e485a6e8a6cd..4540426400b3 100644
> --- a/mm/page_io.c
> +++ b/mm/page_io.c
> @@ -362,7 +362,7 @@ int __swap_writepage(struct page *page, struct 
> writeback_control *wbc,
>   count_swpout_vm_event(page);
>   set_page_writeback(page);
>   unlock_page(page);
> - submit_bio(bio);
> + submit_bio_direct(bio);
>  out:
>   return ret;
>  }
> @@ -434,7 +434,7 @@ int swap_readpage(struct page *page, bool synchronous)
>   }
>   count_vm_event(PSWPIN);
>   bio_get(bio);
> - qc = submit_bio(bio);
> + qc = submit_bio_direct(bio);
>   while (synchronous) {
>   set_current_state(TASK_UNINTERRUPTIBLE);
>   if (!READ_ONCE(bio->bi_private))
> 

Separate into multiple patches: one that introduces the filter functions/ops
code and another that changes the block layer where needed.


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH RFC PKS/PMEM 26/58] fs/zonefs: Utilize new kmap_thread()

2020-10-11 Thread Damien Le Moal
On 2020/10/10 4:52, ira.we...@intel.com wrote:
> From: Ira Weiny 
> 
> The kmap() calls in this FS are localized to a single thread.  To avoid
> the over head of global PKRS updates use the new kmap_thread() call.
> 
> Cc: Damien Le Moal 
> Cc: Naohiro Aota 
> Signed-off-by: Ira Weiny 
> ---
>  fs/zonefs/super.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
> index 8ec7c8f109d7..2fd6c86beee1 100644
> --- a/fs/zonefs/super.c
> +++ b/fs/zonefs/super.c
> @@ -1297,7 +1297,7 @@ static int zonefs_read_super(struct super_block *sb)
>   if (ret)
>   goto free_page;
>  
> - super = kmap(page);
> + super = kmap_thread(page);
>  
>   ret = -EINVAL;
>   if (le32_to_cpu(super->s_magic) != ZONEFS_MAGIC)
> @@ -1349,7 +1349,7 @@ static int zonefs_read_super(struct super_block *sb)
>   ret = 0;
>  
>  unmap:
> - kunmap(page);
> +     kunmap_thread(page);
>  free_page:
>   __free_page(page);
>  
> 

acked-by: Damien Le Moal 

-- 
Damien Le Moal
Western Digital Research


Re: [PATCH v4 6/6] io_uring: add support for zone-append

2020-09-28 Thread Damien Le Moal
stination zone).

nvme passthrough ? If that does not fit your use case, then think of an
interface, its definition/semantic and propose it. But again, use a different
thread. This is mixing up zone-append and simple copy, which I do not think are
directly related.

> Not sure if I am clear, perhaps sending RFC would be better for
> discussion on simple-copy.

Separate this discussion from zone append please. Mixing up 2 problems in one
thread is not helpful to make progress.


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH v2 1/1] null_blk: synchronization fix for zoned device

2020-09-28 Thread Damien Le Moal
On 2020/09/28 18:59, Kanchan Joshi wrote:
> Parallel write,read,zone-mgmt operations accessing/altering zone state
> and write-pointer may get into race. Avoid the situation by using a new
> spinlock for zoned device.
> Concurrent zone-appends (on a zone) returning same write-pointer issue
> is also avoided using this lock.
> 
> Fixes: e0489ed5daeb ("null_blk: Support REQ_OP_ZONE_APPEND")
> Signed-off-by: Kanchan Joshi 
> ---
>  drivers/block/null_blk.h   |  1 +
>  drivers/block/null_blk_zoned.c | 22 ++
>  2 files changed, 19 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
> index daed4a9c3436..28099be50395 100644
> --- a/drivers/block/null_blk.h
> +++ b/drivers/block/null_blk.h
> @@ -44,6 +44,7 @@ struct nullb_device {
>   unsigned int nr_zones;
>   struct blk_zone *zones;
>   sector_t zone_size_sects;
> + spinlock_t zone_lock;
>  
>   unsigned long size; /* device size in MB */
>   unsigned long completion_nsec; /* time in ns to complete a request */
> diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c
> index 3d25c9ad2383..e8d8b13aaa5a 100644
> --- a/drivers/block/null_blk_zoned.c
> +++ b/drivers/block/null_blk_zoned.c
> @@ -45,6 +45,7 @@ int null_init_zoned_dev(struct nullb_device *dev, struct 
> request_queue *q)
>   if (!dev->zones)
>   return -ENOMEM;
>  
> + spin_lock_init(&dev->zone_lock);
>   if (dev->zone_nr_conv >= dev->nr_zones) {
>   dev->zone_nr_conv = dev->nr_zones - 1;
>   pr_info("changed the number of conventional zones to %u",
> @@ -131,8 +132,11 @@ int null_report_zones(struct gendisk *disk, sector_t 
> sector,
>* So use a local copy to avoid corruption of the device zone
>* array.
>*/
> + spin_lock_irq(&dev->zone_lock);
>   memcpy(&zone, &dev->zones[first_zone + i],
>  sizeof(struct blk_zone));
> + spin_unlock_irq(&dev->zone_lock);
> +
>   error = cb(&zone, i, data);
>   if (error)
>   return error;
> @@ -277,18 +281,28 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd 
> *cmd, enum req_opf op,
>  blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op,
>   sector_t sector, sector_t nr_sectors)
>  {
> + blk_status_t sts;
> + struct nullb_device *dev = cmd->nq->dev;
> +
> + spin_lock_irq(&dev->zone_lock);
>   switch (op) {
>   case REQ_OP_WRITE:
> - return null_zone_write(cmd, sector, nr_sectors, false);
> + sts = null_zone_write(cmd, sector, nr_sectors, false);
> + break;
>   case REQ_OP_ZONE_APPEND:
> - return null_zone_write(cmd, sector, nr_sectors, true);
> + sts = null_zone_write(cmd, sector, nr_sectors, true);
> + break;
>   case REQ_OP_ZONE_RESET:
>   case REQ_OP_ZONE_RESET_ALL:
>   case REQ_OP_ZONE_OPEN:
>   case REQ_OP_ZONE_CLOSE:
>   case REQ_OP_ZONE_FINISH:
> - return null_zone_mgmt(cmd, op, sector);
> + sts = null_zone_mgmt(cmd, op, sector);
> +     break;
>   default:
> - return null_process_cmd(cmd, op, sector, nr_sectors);
> + sts = null_process_cmd(cmd, op, sector, nr_sectors);
>   }
> + spin_unlock_irq(&dev->zone_lock);
> +
> + return sts;
>  }
> 

Looks good.

Reviewed-by: Damien Le Moal 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH v2 0/1] concurrency handling for zoned null-blk

2020-09-28 Thread Damien Le Moal
On 2020/09/28 18:59, Kanchan Joshi wrote:
> Changes since v1:
> - applied the refactoring suggested by Damien
> 
> Kanchan Joshi (1):
>   null_blk: synchronization fix for zoned device
> 
>  drivers/block/null_blk.h   |  1 +
>  drivers/block/null_blk_zoned.c | 22 ++
>  2 files changed, 19 insertions(+), 4 deletions(-)
> 

For single patches, you should add this after the "---" in the patch file, above
the patch stats. This is ignored by git when the patch is applied (the patch
starts at the first "diff" entry).


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH v3] RISC-V: Check clint_time_val before use

2020-09-26 Thread Damien Le Moal
On Sun, 2020-09-27 at 11:09 +0530, Anup Patel wrote:
> The NoMMU kernel is broken for QEMU virt machine from Linux-5.9-rc6
> because clint_time_val is used even before CLINT driver is probed
> at following places:
> 1. rand_initialize() calls get_cycles() which in-turn uses
>clint_time_val
> 2. boot_init_stack_canary() calls get_cycles() which in-turn
>uses clint_time_val
> 
> The issue#1 (above) is fixed by providing custom random_get_entropy()
> for RISC-V NoMMU kernel. For issue#2 (above), we remove dependency of
> boot_init_stack_canary() on get_cycles() and this is aligned with the
> boot_init_stack_canary() implementations of ARM, ARM64 and MIPS kernel.
> 
> Fixes: d5be89a8d118 ("RISC-V: Resurrect the MMIO timer implementation
> for M-mode systems")
> Signed-off-by: Palmer Dabbelt 
> Signed-off-by: Anup Patel 
> ---
> Changes since v2:
>  - Take different approach and provide custom random_get_entropy() for
>RISC-V NoMMU kernel
>  - Remove dependency of boot_init_stack_canary() on get_cycles()
>  - Hopefully we don't require to set clint_time_val = NULL in CLINT
>driver with a different approach to fix.
> Changes since v1:
>  - Explicitly initialize clint_time_val to NULL in CLINT driver to
>avoid hang on Kendryte K210
> ---
>  arch/riscv/include/asm/stackprotector.h |  4 
>  arch/riscv/include/asm/timex.h  | 13 +
>  2 files changed, 13 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/riscv/include/asm/stackprotector.h 
> b/arch/riscv/include/asm/stackprotector.h
> index d95f7b2a7f37..5962f8891f06 100644
> --- a/arch/riscv/include/asm/stackprotector.h
> +++ b/arch/riscv/include/asm/stackprotector.h
> @@ -5,7 +5,6 @@
>  
>  #include 
>  #include 
> -#include 
>  
>  extern unsigned long __stack_chk_guard;
>  
> @@ -18,12 +17,9 @@ extern unsigned long __stack_chk_guard;
>  static __always_inline void boot_init_stack_canary(void)
>  {
>   unsigned long canary;
> - unsigned long tsc;
>  
>   /* Try to get a semi random initial value. */
>   get_random_bytes(&canary, sizeof(canary));
> - tsc = get_cycles();
> - canary += tsc + (tsc << BITS_PER_LONG/2);
>   canary ^= LINUX_VERSION_CODE;
>   canary &= CANARY_MASK;
>  
> diff --git a/arch/riscv/include/asm/timex.h b/arch/riscv/include/asm/timex.h
> index 7f659dda0032..ab104905d4db 100644
> --- a/arch/riscv/include/asm/timex.h
> +++ b/arch/riscv/include/asm/timex.h
> @@ -33,6 +33,19 @@ static inline u32 get_cycles_hi(void)
>  #define get_cycles_hi get_cycles_hi
>  #endif /* CONFIG_64BIT */
>  
> +/*
> + * Much like MIPS, we may not have a viable counter to use at an early point
> + * in the boot process. Unfortunately we don't have a fallback, so instead
> + * we just return 0.
> + */
> +static inline unsigned long random_get_entropy(void)
> +{
> + if (unlikely(clint_time_val == NULL))
> + return 0;
> +     return get_cycles();
> +}
> +#define random_get_entropy() random_get_entropy()
> +
>  #else /* CONFIG_RISCV_M_MODE */
>  
>  static inline cycles_t get_cycles(void)

Did not reply to the patch... So again for Kendryte:

Tested-by: Damien Le Moal 


-- 
Damien Le Moal
Western Digital


Re: [PATCH v3] RISC-V: Check clint_time_val before use

2020-09-26 Thread Damien Le Moal
On Sat, 2020-09-26 at 22:46 -0700, Palmer Dabbelt wrote:
> On Sat, 26 Sep 2020 22:39:16 PDT (-0700), Anup Patel wrote:
> > The NoMMU kernel is broken for QEMU virt machine from Linux-5.9-rc6
> > because clint_time_val is used even before CLINT driver is probed
> > at following places:
> > 1. rand_initialize() calls get_cycles() which in-turn uses
> >clint_time_val
> > 2. boot_init_stack_canary() calls get_cycles() which in-turn
> >uses clint_time_val
> > 
> > The issue#1 (above) is fixed by providing custom random_get_entropy()
> > for RISC-V NoMMU kernel. For issue#2 (above), we remove dependency of
> > boot_init_stack_canary() on get_cycles() and this is aligned with the
> > boot_init_stack_canary() implementations of ARM, ARM64 and MIPS kernel.
> > 
> > Fixes: d5be89a8d118 ("RISC-V: Resurrect the MMIO timer implementation
> > for M-mode systems")
> > Signed-off-by: Palmer Dabbelt 
> > Signed-off-by: Anup Patel 
> > ---
> > Changes since v2:
> >  - Take different approach and provide custom random_get_entropy() for
> >RISC-V NoMMU kernel
> >  - Remove dependency of boot_init_stack_canary() on get_cycles()
> >  - Hopefully we don't require to set clint_time_val = NULL in CLINT
> >driver with a different approach to fix.
> > Changes since v1:
> >  - Explicitly initialize clint_time_val to NULL in CLINT driver to
> >avoid hang on Kendryte K210
> > ---
> >  arch/riscv/include/asm/stackprotector.h |  4 
> >  arch/riscv/include/asm/timex.h  | 13 +
> >  2 files changed, 13 insertions(+), 4 deletions(-)
> > 
> > diff --git a/arch/riscv/include/asm/stackprotector.h 
> > b/arch/riscv/include/asm/stackprotector.h
> > index d95f7b2a7f37..5962f8891f06 100644
> > --- a/arch/riscv/include/asm/stackprotector.h
> > +++ b/arch/riscv/include/asm/stackprotector.h
> > @@ -5,7 +5,6 @@
> > 
> >  #include 
> >  #include 
> > -#include 
> > 
> >  extern unsigned long __stack_chk_guard;
> > 
> > @@ -18,12 +17,9 @@ extern unsigned long __stack_chk_guard;
> >  static __always_inline void boot_init_stack_canary(void)
> >  {
> > unsigned long canary;
> > -   unsigned long tsc;
> > 
> > /* Try to get a semi random initial value. */
> > get_random_bytes(&canary, sizeof(canary));
> > -   tsc = get_cycles();
> > -   canary += tsc + (tsc << BITS_PER_LONG/2);
> > canary ^= LINUX_VERSION_CODE;
> > canary &= CANARY_MASK;
> > 
> > diff --git a/arch/riscv/include/asm/timex.h b/arch/riscv/include/asm/timex.h
> > index 7f659dda0032..ab104905d4db 100644
> > --- a/arch/riscv/include/asm/timex.h
> > +++ b/arch/riscv/include/asm/timex.h
> > @@ -33,6 +33,19 @@ static inline u32 get_cycles_hi(void)
> >  #define get_cycles_hi get_cycles_hi
> >  #endif /* CONFIG_64BIT */
> > 
> > +/*
> > + * Much like MIPS, we may not have a viable counter to use at an early 
> > point
> > + * in the boot process. Unfortunately we don't have a fallback, so instead
> > + * we just return 0.
> > + */
> > +static inline unsigned long random_get_entropy(void)
> > +{
> > +   if (unlikely(clint_time_val == NULL))
> > +   return 0;
> > +   return get_cycles();
> > +}
> > +#define random_get_entropy()   random_get_entropy()
> > +
> >  #else /* CONFIG_RISCV_M_MODE */
> > 
> >  static inline cycles_t get_cycles(void)
> 
> Reviewed-by: Palmer Dabbelt 
> 
> I'm going to wait for Damien to chime in about the NULL initialization boot
> failure, though, as I'm a bit worried something else was going on.
> 
> Thanks!

For Kendryte, no problems. Boots correctly.

Tested-by: Damien Le Moal 



-- 
Damien Le Moal
Western Digital


Re: [PATCH v2] RISC-V: Check clint_time_val before use

2020-09-26 Thread Damien Le Moal
On Sat, 2020-09-26 at 15:51 +0530, Anup Patel wrote:
> The NoMMU kernel is broken for QEMU virt machine from Linux-5.9-rc6
> because the get_cycles() and friends are called very early from
> rand_initialize() before CLINT driver is probed. To fix this, we
> should check clint_time_val before use in get_cycles() and friends.
> 
> Fixes: d5be89a8d118 ("RISC-V: Resurrect the MMIO timer implementation
> for M-mode systems")
> Signed-off-by: Anup Patel 
> ---
> Changes since v1:
>  - Explicitly initialize clint_time_val to NULL in CLINT driver to
>avoid hang on Kendryte K210
> ---
>  arch/riscv/include/asm/timex.h| 12 +---
>  drivers/clocksource/timer-clint.c |  2 +-
>  2 files changed, 10 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/riscv/include/asm/timex.h b/arch/riscv/include/asm/timex.h
> index 7f659dda0032..6e7b04874755 100644
> --- a/arch/riscv/include/asm/timex.h
> +++ b/arch/riscv/include/asm/timex.h
> @@ -17,18 +17,24 @@ typedef unsigned long cycles_t;
>  #ifdef CONFIG_64BIT
>  static inline cycles_t get_cycles(void)
>  {
> - return readq_relaxed(clint_time_val);
> + if (clint_time_val)
> + return readq_relaxed(clint_time_val);
> + return 0;
>  }
>  #else /* !CONFIG_64BIT */
>  static inline u32 get_cycles(void)
>  {
> - return readl_relaxed(((u32 *)clint_time_val));
> + if (clint_time_val)
> + return readl_relaxed(((u32 *)clint_time_val));
> + return 0;
>  }
>  #define get_cycles get_cycles
>  
>  static inline u32 get_cycles_hi(void)
>  {
> - return readl_relaxed(((u32 *)clint_time_val) + 1);
> + if (clint_time_val)
> + return readl_relaxed(((u32 *)clint_time_val) + 1);
> + return 0;
>  }
>  #define get_cycles_hi get_cycles_hi
>  #endif /* CONFIG_64BIT */
> diff --git a/drivers/clocksource/timer-clint.c 
> b/drivers/clocksource/timer-clint.c
> index d17367dee02c..8dbec85979fd 100644
> --- a/drivers/clocksource/timer-clint.c
> +++ b/drivers/clocksource/timer-clint.c
> @@ -37,7 +37,7 @@ static unsigned long clint_timer_freq;
>  static unsigned int clint_timer_irq;
>  
>  #ifdef CONFIG_RISCV_M_MODE
> -u64 __iomem *clint_time_val;
> +u64 __iomem *clint_time_val = NULL;
>  #endif
>  
>  static void clint_send_ipi(const struct cpumask *target)

For Kendryte:

Tested-by: Damien Le Moal 

-- 
Damien Le Moal
Western Digital


Re: [PATCH] RISC-V: Check clint_time_val before use

2020-09-26 Thread Damien Le Moal
On Sat, 2020-09-26 at 11:09 +0100, Maciej W. Rozycki wrote:
> On Sat, 26 Sep 2020, Damien Le Moal wrote:
> 
> > > > Applying this on top of rc6, I now get a hang on Kendryte boot...
> > > > No problems without the patch on the other hand.
> > > 
> > > Not sure about the issue with Kendryte but I get a crash on
> > > QEMU virt machine without this patch.
> > 
> > With this applied in addition to your patch, it works.
> > 
> > diff --git a/drivers/clocksource/timer-clint.c b/drivers/clocksource/timer-
> > clint.c
> > index d17367dee02c..8dbec85979fd 100644
> > --- a/drivers/clocksource/timer-clint.c
> > +++ b/drivers/clocksource/timer-clint.c
> > @@ -37,7 +37,7 @@ static unsigned long clint_timer_freq;
> >  static unsigned int clint_timer_irq;
> >  
> >  #ifdef CONFIG_RISCV_M_MODE
> > -u64 __iomem *clint_time_val;
> > +u64 __iomem *clint_time_val = NULL;
> >  #endif
> 
>  Hmm, BSS initialisation issue?

Not a static variable, so it is not in BSS, no ?

> 
>   Maciej

-- 
Damien Le Moal
Western Digital


Re: [PATCH] RISC-V: Check clint_time_val before use

2020-09-26 Thread Damien Le Moal
On Sat, 2020-09-26 at 15:27 +0530, Anup Patel wrote:
> On Sat, Sep 26, 2020 at 3:16 PM Damien Le Moal  wrote:
> > On Sat, 2020-09-26 at 09:31 +, Anup Patel wrote:
> > > > -Original Message-
> > > > From: Damien Le Moal 
> > > > Sent: 26 September 2020 14:55
> > > > To: paul.walms...@sifive.com; pal...@dabbelt.com;
> > > > palmerdabb...@google.com; Anup Patel ;
> > > > a...@eecs.berkeley.edu
> > > > Cc: a...@brainfault.org; linux-ri...@lists.infradead.org; Atish Patra
> > > > ; Alistair Francis ; 
> > > > linux-
> > > > ker...@vger.kernel.org
> > > > Subject: Re: [PATCH] RISC-V: Check clint_time_val before use
> > > > 
> > > > On Sat, 2020-09-26 at 12:57 +0530, Anup Patel wrote:
> > > > > The NoMMU kernel is broken for QEMU virt machine from Linux-5.9-rc6
> > > > > because the get_cycles() and friends are called very early from
> > > > > rand_initialize() before CLINT driver is probed. To fix this, we
> > > > > should check clint_time_val before use in get_cycles() and friends.
> > > > > 
> > > > > Fixes: d5be89a8d118 ("RISC-V: Resurrect the MMIO timer implementation
> > > > > for M-mode systems")
> > > > > Signed-off-by: Anup Patel 
> > > > > ---
> > > > >  arch/riscv/include/asm/timex.h | 12 +---
> > > > >  1 file changed, 9 insertions(+), 3 deletions(-)
> > > > > 
> > > > > diff --git a/arch/riscv/include/asm/timex.h
> > > > > b/arch/riscv/include/asm/timex.h index 7f659dda0032..52b42bb1602c
> > > > > 100644
> > > > > --- a/arch/riscv/include/asm/timex.h
> > > > > +++ b/arch/riscv/include/asm/timex.h
> > > > > @@ -17,18 +17,24 @@ typedef unsigned long cycles_t;  #ifdef
> > > > > CONFIG_64BIT  static inline cycles_t get_cycles(void)  {
> > > > > - return readq_relaxed(clint_time_val);
> > > > > + if (clint_time_val)
> > > > > + return readq_relaxed(clint_time_val);
> > > > > + return 0;
> > > > >  }
> > > > >  #else /* !CONFIG_64BIT */
> > > > >  static inline u32 get_cycles(void)
> > > > >  {
> > > > > - return readl_relaxed(((u32 *)clint_time_val));
> > > > > + if (clint_time_val)
> > > > > + return readl_relaxed(((u32 *)clint_time_val));
> > > > > + return 0;
> > > > >  }
> > > > >  #define get_cycles get_cycles
> > > > > 
> > > > >  static inline u32 get_cycles_hi(void)  {
> > > > > - return readl_relaxed(((u32 *)clint_time_val) + 1);
> > > > > + if (clint_time_val)
> > > > > + return readl_relaxed(((u32 *)clint_time_val) + 1);
> > > > > + return 0
> > > > >  }
> > > > >  #define get_cycles_hi get_cycles_hi
> > > > >  #endif /* CONFIG_64BIT */
> > > > 
> > > > Applying this on top of rc6, I now get a hang on Kendryte boot...
> > > > No problems without the patch on the other hand.
> > > 
> > > Not sure about the issue with Kendryte but I get a crash on
> > > QEMU virt machine without this patch.
> > 
> > With this applied in addition to your patch, it works.
> > 
> > diff --git a/drivers/clocksource/timer-clint.c b/drivers/clocksource/timer-
> > clint.c
> > index d17367dee02c..8dbec85979fd 100644
> > --- a/drivers/clocksource/timer-clint.c
> > +++ b/drivers/clocksource/timer-clint.c
> > @@ -37,7 +37,7 @@ static unsigned long clint_timer_freq;
> >  static unsigned int clint_timer_irq;
> > 
> >  #ifdef CONFIG_RISCV_M_MODE
> > -u64 __iomem *clint_time_val;
> > +u64 __iomem *clint_time_val = NULL;
> >  #endif
> > 
> >  static void clint_send_ipi(const struct cpumask *target)
> 
> Ahh, clint_time_val is an uninitialized variable.
> 
> I will send a v2 with your SoB.

No need for my sob. Just merge that in your patch.

> 
> Regards,
> Anup
> 
> > --
> > Damien Le Moal
> > Western Digital

-- 
Damien Le Moal
Western Digital


Re: [PATCH] RISC-V: Check clint_time_val before use

2020-09-26 Thread Damien Le Moal
On Sat, 2020-09-26 at 09:31 +, Anup Patel wrote:
> > -Original Message-
> > From: Damien Le Moal 
> > Sent: 26 September 2020 14:55
> > To: paul.walms...@sifive.com; pal...@dabbelt.com;
> > palmerdabb...@google.com; Anup Patel ;
> > a...@eecs.berkeley.edu
> > Cc: a...@brainfault.org; linux-ri...@lists.infradead.org; Atish Patra
> > ; Alistair Francis ; linux-
> > ker...@vger.kernel.org
> > Subject: Re: [PATCH] RISC-V: Check clint_time_val before use
> > 
> > On Sat, 2020-09-26 at 12:57 +0530, Anup Patel wrote:
> > > The NoMMU kernel is broken for QEMU virt machine from Linux-5.9-rc6
> > > because the get_cycles() and friends are called very early from
> > > rand_initialize() before CLINT driver is probed. To fix this, we
> > > should check clint_time_val before use in get_cycles() and friends.
> > > 
> > > Fixes: d5be89a8d118 ("RISC-V: Resurrect the MMIO timer implementation
> > > for M-mode systems")
> > > Signed-off-by: Anup Patel 
> > > ---
> > >  arch/riscv/include/asm/timex.h | 12 +---
> > >  1 file changed, 9 insertions(+), 3 deletions(-)
> > > 
> > > diff --git a/arch/riscv/include/asm/timex.h
> > > b/arch/riscv/include/asm/timex.h index 7f659dda0032..52b42bb1602c
> > > 100644
> > > --- a/arch/riscv/include/asm/timex.h
> > > +++ b/arch/riscv/include/asm/timex.h
> > > @@ -17,18 +17,24 @@ typedef unsigned long cycles_t;  #ifdef
> > > CONFIG_64BIT  static inline cycles_t get_cycles(void)  {
> > > - return readq_relaxed(clint_time_val);
> > > + if (clint_time_val)
> > > + return readq_relaxed(clint_time_val);
> > > + return 0;
> > >  }
> > >  #else /* !CONFIG_64BIT */
> > >  static inline u32 get_cycles(void)
> > >  {
> > > - return readl_relaxed(((u32 *)clint_time_val));
> > > + if (clint_time_val)
> > > + return readl_relaxed(((u32 *)clint_time_val));
> > > + return 0;
> > >  }
> > >  #define get_cycles get_cycles
> > > 
> > >  static inline u32 get_cycles_hi(void)  {
> > > - return readl_relaxed(((u32 *)clint_time_val) + 1);
> > > + if (clint_time_val)
> > > + return readl_relaxed(((u32 *)clint_time_val) + 1);
> > > + return 0
> > >  }
> > >  #define get_cycles_hi get_cycles_hi
> > >  #endif /* CONFIG_64BIT */
> > 
> > Applying this on top of rc6, I now get a hang on Kendryte boot...
> > No problems without the patch on the other hand.
> 
> Not sure about the issue with Kendryte but I get a crash on
> QEMU virt machine without this patch.

With this applied in addition to your patch, it works.

diff --git a/drivers/clocksource/timer-clint.c b/drivers/clocksource/timer-
clint.c
index d17367dee02c..8dbec85979fd 100644
--- a/drivers/clocksource/timer-clint.c
+++ b/drivers/clocksource/timer-clint.c
@@ -37,7 +37,7 @@ static unsigned long clint_timer_freq;
 static unsigned int clint_timer_irq;
 
 #ifdef CONFIG_RISCV_M_MODE
-u64 __iomem *clint_time_val;
+u64 __iomem *clint_time_val = NULL;
 #endif
 
 static void clint_send_ipi(const struct cpumask *target)

-- 
Damien Le Moal
Western Digital


Re: [PATCH] RISC-V: Check clint_time_val before use

2020-09-26 Thread Damien Le Moal
On Sat, 2020-09-26 at 12:57 +0530, Anup Patel wrote:
> The NoMMU kernel is broken for QEMU virt machine from Linux-5.9-rc6
> because the get_cycles() and friends are called very early from
> rand_initialize() before CLINT driver is probed. To fix this, we
> should check clint_time_val before use in get_cycles() and friends.
> 
> Fixes: d5be89a8d118 ("RISC-V: Resurrect the MMIO timer implementation
> for M-mode systems")
> Signed-off-by: Anup Patel 
> ---
>  arch/riscv/include/asm/timex.h | 12 +---
>  1 file changed, 9 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/riscv/include/asm/timex.h b/arch/riscv/include/asm/timex.h
> index 7f659dda0032..52b42bb1602c 100644
> --- a/arch/riscv/include/asm/timex.h
> +++ b/arch/riscv/include/asm/timex.h
> @@ -17,18 +17,24 @@ typedef unsigned long cycles_t;
>  #ifdef CONFIG_64BIT
>  static inline cycles_t get_cycles(void)
>  {
> - return readq_relaxed(clint_time_val);
> + if (clint_time_val)
> + return readq_relaxed(clint_time_val);
> + return 0;
>  }
>  #else /* !CONFIG_64BIT */
>  static inline u32 get_cycles(void)
>  {
> - return readl_relaxed(((u32 *)clint_time_val));
> + if (clint_time_val)
> + return readl_relaxed(((u32 *)clint_time_val));
> + return 0;
>  }
>  #define get_cycles get_cycles
>  
>  static inline u32 get_cycles_hi(void)
>  {
> - return readl_relaxed(((u32 *)clint_time_val) + 1);
> + if (clint_time_val)
> + return readl_relaxed(((u32 *)clint_time_val) + 1);
> + return 0
>  }
>  #define get_cycles_hi get_cycles_hi
>  #endif /* CONFIG_64BIT */

Applying this on top of rc6, I now get a hang on Kendryte boot...
No problems without the patch on the other hand.


-- 
Damien Le Moal
Western Digital


Re: [PATCH v4 6/6] io_uring: add support for zone-append

2020-09-24 Thread Damien Le Moal
e there is no metadata and all inodes always exist in-memory.
And zonefs() now supports MAR/MOR limits for O_WRONLY open(). That can simplify
things for the user.


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH] null_blk: synchronization fix for zoned device

2020-09-24 Thread Damien Le Moal
   case BLK_ZONE_COND_IMP_OPEN:
>   case BLK_ZONE_COND_EXP_OPEN:
> @@ -193,27 +208,33 @@ static blk_status_t null_zone_write(struct nullb_cmd 
> *cmd, sector_t sector,
>   else
>   cmd->rq->__sector = sector;
>   } else if (sector != zone->wp) {
> - return BLK_STS_IOERR;
> + ret = BLK_STS_IOERR;
> + break;
>   }
>  
> - if (zone->wp + nr_sectors > zone->start + zone->capacity)
> - return BLK_STS_IOERR;
> + if (zone->wp + nr_sectors > zone->start + zone->capacity) {
> + ret = BLK_STS_IOERR;
> + break;
> + }
>  
>   if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
>   zone->cond = BLK_ZONE_COND_IMP_OPEN;
>  
>   ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
>   if (ret != BLK_STS_OK)
> - return ret;
> + break;
>  
>   zone->wp += nr_sectors;
>   if (zone->wp == zone->start + zone->capacity)
>   zone->cond = BLK_ZONE_COND_FULL;
> - return BLK_STS_OK;
> + break;
>   default:
>   /* Invalid zone condition */
> - return BLK_STS_IOERR;
> + ret = BLK_STS_IOERR;
>   }
> +
> + spin_unlock_irq(&dev->zlock);
> + return ret;
>  }
>  
>  static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
> @@ -223,7 +244,9 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, 
> enum req_opf op,
>   unsigned int zone_no = null_zone_no(dev, sector);
>   struct blk_zone *zone = &dev->zones[zone_no];
>   size_t i;
> + blk_status_t ret = BLK_STS_OK;
>  
> + spin_lock_irq(&dev->zlock);
>   switch (op) {
>   case REQ_OP_ZONE_RESET_ALL:
>   for (i = 0; i < dev->nr_zones; i++) {
> @@ -234,25 +257,29 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd 
> *cmd, enum req_opf op,
>   }
>   break;
>   case REQ_OP_ZONE_RESET:
> - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
> - return BLK_STS_IOERR;
> + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) {
> + ret = BLK_STS_IOERR;
> + break;
> + }
>  
>   zone->cond = BLK_ZONE_COND_EMPTY;
>   zone->wp = zone->start;
>   break;
>   case REQ_OP_ZONE_OPEN:
> - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
> - return BLK_STS_IOERR;
> - if (zone->cond == BLK_ZONE_COND_FULL)
> - return BLK_STS_IOERR;
> + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL ||
> + zone->cond == BLK_ZONE_COND_FULL) {
> + ret = BLK_STS_IOERR;
> + break;
> + }
>  
>   zone->cond = BLK_ZONE_COND_EXP_OPEN;
>   break;
>   case REQ_OP_ZONE_CLOSE:
> - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
> - return BLK_STS_IOERR;
> - if (zone->cond == BLK_ZONE_COND_FULL)
> - return BLK_STS_IOERR;
> + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL ||
> + zone->cond == BLK_ZONE_COND_FULL) {
> + ret = BLK_STS_IOERR;
> + break;
> + }
>  
>   if (zone->wp == zone->start)
>   zone->cond = BLK_ZONE_COND_EMPTY;
> @@ -260,18 +287,21 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd 
> *cmd, enum req_opf op,
>   zone->cond = BLK_ZONE_COND_CLOSED;
>   break;
>   case REQ_OP_ZONE_FINISH:
> - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
> - return BLK_STS_IOERR;
> + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) {
> + ret = BLK_STS_IOERR;
> + break;
> + }
>  
>   zone->cond = BLK_ZONE_COND_FULL;
>   zone->wp = zone->start + zone->len;
>   break;
>   default:
> - return BLK_STS_NOTSUPP;
> + ret = BLK_STS_NOTSUPP;
>   }
>  
> + spin_unlock_irq(&dev->zlock);
>   trace_nullb_zone_op(cmd, zone_no, zone->cond);
> - return BLK_STS_OK;
> + return ret;
>  }

I think you can avoid all of these changes by taking the lock around the calls
to null_zone_mgmt() and null_zone_write() in null_process_zoned_cmd(). That will
make the patch a lot smaller and simplify maintenance. And even, I think that
taking the lock on entry to null_process_zoned_cmd() and unlocking on return
should even be simpler since that would cover reads too (valid read len). Only
report zones would need special treatment.

>  
>  blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op,
> 

I think we also need this to have a Cc: stable and a "Fixes" tag too.

-- 
Damien Le Moal
Western Digital Research


Re: [PATCH AUTOSEL 5.8 20/20] riscv: Fix Kendryte K210 device tree

2020-09-22 Thread Damien Le Moal
On 2020/09/21 23:41, Sasha Levin wrote:
> From: Damien Le Moal 
> 
> [ Upstream commit f025d9d9934b84cd03b7796072d10686029c408e ]
> 
> The Kendryte K210 SoC CLINT is compatible with Sifive clint v0
> (sifive,clint0). Fix the Kendryte K210 device tree clint entry to be
> inline with the sifive timer definition documented in
> Documentation/devicetree/bindings/timer/sifive,clint.yaml.
> The device tree clint entry is renamed similarly to u-boot device tree
> definition to improve compatibility with u-boot defined device tree.
> To ensure correct initialization, the interrup-cells attribute is added
> and the interrupt-extended attribute definition fixed.
> 
> This fixes boot failures with Kendryte K210 SoC boards.
> 
> Note that the clock referenced is kept as K210_CLK_ACLK, which does not
> necessarilly match the clint MTIME increment rate. This however does not
> seem to cause any problem for now.
> 
> Signed-off-by: Damien Le Moal 
> Signed-off-by: Palmer Dabbelt 
> Signed-off-by: Sasha Levin 
> ---
>  arch/riscv/boot/dts/kendryte/k210.dtsi | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/riscv/boot/dts/kendryte/k210.dtsi 
> b/arch/riscv/boot/dts/kendryte/k210.dtsi
> index c1df56ccb8d55..d2d0ff6456325 100644
> --- a/arch/riscv/boot/dts/kendryte/k210.dtsi
> +++ b/arch/riscv/boot/dts/kendryte/k210.dtsi
> @@ -95,10 +95,12 @@ sysctl: sysctl@5044 {
>   #clock-cells = <1>;
>   };
>  
> - clint0: interrupt-controller@200 {
> + clint0: clint@200 {
> + #interrupt-cells = <1>;
>   compatible = "riscv,clint0";
>   reg = <0x200 0xC000>;
> - interrupts-extended = <&cpu0_intc 3>, <&cpu1_intc 3>;
> + interrupts-extended = <&cpu0_intc 3 &cpu0_intc 7
> +                        &cpu1_intc 3 &cpu1_intc 7>;
>   clocks = <&sysctl K210_CLK_ACLK>;
>   };
>  
> 

Sasha,

This is a fix for a problem in 5.9 tree. 5.8 kernel is fine without this patch.
And I think applying it to 5.8 might actually break things since the proper
clint driver was added to kernel 5.9 and does not exist in 5.8.

Best regards.

-- 
Damien Le Moal
Western Digital Research


Re: linux-next: Signed-off-by missing for commits in the zonefs tree

2020-09-15 Thread Damien Le Moal
On 2020/09/15 18:26, Stephen Rothwell wrote:
> Hi all,
> 
> Commits
> 
>   7de0d8dc5dea ("zonefs: document the explicit-open mount option")
>   7e7dda2cb1b6 ("zonefs: open/close zone on file open/close")
>   c282d13f6ceb ("zonefs: provide zonefs_io_error variant that can be called 
> with i_truncate_mutex held")
>   16ef4f7638ac ("zonefs: introduce helper for zone management")
> 
> are missing a Signed-off-by from their committer.
> 

Fixed. Sorry about that !

-- 
Damien Le Moal
Western Digital Research


Re: first bad commit: [5795eb443060148796beeba106e4366d7f1458a6] scsi: sd_zbc: emulate ZONE_APPEND commands

2020-09-12 Thread Damien Le Moal
On 2020/09/12 18:09, Johannes Thumshirn wrote:
> On 12/09/2020 04:31, Damien Le Moal wrote:
>> On 2020/09/12 8:07, Borislav Petkov wrote:
>>> On Sat, Sep 12, 2020 at 12:17:59AM +0200, Borislav Petkov wrote:
>>>> Enabling it, fixes the issue.
>>>
>>> Btw, I just hit the below warn with 5.8, while booting with the above
>>> config option enabled. Looks familiar and I didn't trigger it with
>>> 5.9-rc4+ so you guys either fixed it or something changed in-between:
>>>
>>> [5.124321] ata4.00: NCQ Send/Recv Log not supported
>>> [5.131484] ata4.00: configured for UDMA/133
>>> [5.135847] scsi 3:0:0:0: Direct-Access ATA  ST8000AS0022-1WL 
>>> SN01 PQ: 0 ANSI: 5
>>> [5.143972] sd 3:0:0:0: Attached scsi generic sg1 type 0
>>> [5.144033] sd 3:0:0:0: [sdb] Host-aware zoned block device
>>> [5.177105] sd 3:0:0:0: [sdb] 15628053168 512-byte logical blocks: (8.00 
>>> TB/7.28 TiB)
>>> [5.184880] sd 3:0:0:0: [sdb] 4096-byte physical blocks
>>> [5.190084] sd 3:0:0:0: [sdb] 29808 zones of 524288 logical blocks + 1 
>>> runt zone
>>> [5.197439] sd 3:0:0:0: [sdb] Write Protect is off
>>> [5.202220] sd 3:0:0:0: [sdb] Mode Sense: 00 3a 00 00
>>> [5.207260] sd 3:0:0:0: [sdb] Write cache: enabled, read cache: enabled, 
>>> doesn't support DPO or FUA
>>> [5.356631]  sdb: sdb1
>>> [5.359014] sdb: disabling host aware zoned block device support due to 
>>> partitions
>>> [5.389941] [ cut here ]
>>> [5.394557] WARNING: CPU: 8 PID: 164 at block/blk-settings.c:236 
>>> blk_queue_max_zone_append_sectors+0x12/0x40
>>> [5.404300] Modules linked in:
>>> [5.407365] CPU: 8 PID: 164 Comm: kworker/u32:6 Not tainted 5.8.0 #7
>>> [5.413682] Hardware name: Micro-Star International Co., Ltd. 
>>> MS-7B79/X470 GAMING PRO (MS-7B79), BIOS 1.70 01/23/2019
>>> [5.424191] Workqueue: events_unbound async_run_entry_fn
>>> [5.429482] RIP: 0010:blk_queue_max_zone_append_sectors+0x12/0x40
>>> [5.435543] Code: fe 0f 00 00 53 48 89 fb 0f 86 3d 07 00 00 48 89 b3 e0 
>>> 03 00 00 5b c3 90 0f 1f 44 00 00 8b 87 40 04 00 00 ff c8 83 f8 01 76 03 
>>> <0f> 0b c3 8b 87 f8 03 00 00 39 87 f0 03 00 00 0f 46 87 f0 03 00 00
>>> [5.454099] RSP: 0018:c9697c60 EFLAGS: 00010282
>>> [5.459306] RAX:  RBX: 8887fa0a9400 RCX: 
>>> 
>>> [5.466390] RDX: 8887faf0d400 RSI: 0540 RDI: 
>>> 8887f0dde6c8
>>> [5.473474] RBP: 7471 R08: 001d1c40 R09: 
>>> 8887fee29ad0
>>> [5.480559] R10: 0001434bac00 R11: 00358275 R12: 
>>> 0008
>>> [5.487643] R13: 8887f0dde6c8 R14: 8887fa0a9738 R15: 
>>> 
>>> [5.494726] FS:  () GS:8887fee0() 
>>> knlGS:
>>> [5.502757] CS:  0010 DS:  ES:  CR0: 80050033
>>> [5.508474] CR2:  CR3: 02209000 CR4: 
>>> 003406e0
>>> [5.515558] Call Trace:
>>> [5.518026]  sd_zbc_read_zones+0x323/0x480
>>> [5.522122]  sd_revalidate_disk+0x122b/0x2000
>>> [5.526472]  ? __device_add_disk+0x2f7/0x4e0
>>> [5.530738]  sd_probe+0x347/0x44b
>>> [5.534058]  really_probe+0x2c4/0x3f0
>>> [5.537720]  driver_probe_device+0xe1/0x150
>>> [5.541902]  ? driver_allows_async_probing+0x50/0x50
>>> [5.546852]  bus_for_each_drv+0x6a/0xa0
>>> [5.550683]  __device_attach_async_helper+0x8c/0xd0
>>> [5.47]  async_run_entry_fn+0x4a/0x180
>>> [5.559636]  process_one_work+0x1a5/0x3a0
>>> [5.563637]  worker_thread+0x50/0x3a0
>>> [5.567300]  ? process_one_work+0x3a0/0x3a0
>>> [5.571480]  kthread+0x117/0x160
>>> [5.574715]  ? kthread_park+0x90/0x90
>>> [5.578377]  ret_from_fork+0x22/0x30
>>> [5.581960] ---[ end trace 94141003236730cf ]---
>>> [5.586578] sd 3:0:0:0: [sdb] Attached SCSI disk
>>> [6.186783] ata5: failed to resume link (SControl 0)
>>> [6.191818] ata5: SATA link down (SStatus 0 SControl 0)
>>>
> 
> 
> This looks like we're trying to configure zone append max sectors 
> on a device that doesn't have the zoned flag set.

Yep. That's because sd_zbc_revalidate_zones() entry test uses sd_is_zoned() and
does not look at queue->l

Re: first bad commit: [5795eb443060148796beeba106e4366d7f1458a6] scsi: sd_zbc: emulate ZONE_APPEND commands

2020-09-12 Thread Damien Le Moal
On 2020/09/12 17:37, Borislav Petkov wrote:
> Hi Damien,
> 
> On Sat, Sep 12, 2020 at 02:31:55AM +0000, Damien Le Moal wrote:
>> Can you try this:
> 
> sure, but it is white-space damaged:
> 
> checking file drivers/scsi/sd.c
> patch:  malformed patch at line 86: scsi_disk *sdkp)
> 
> Welcome to the world of outlook and how sending patches with it never
> works. You guys might need linux.wdc.com now :-)))

Working on it :)
But it was Thunderbird, getting real plain text emails with outlook is
impossible. Corruption I think came from the copy-paste from the Mac bash
terminal... Tabs get replaced by spacers.

>> That should fix the above as well as the hang on boot with 
>> CONFIG_BLK_DEV_ZONED
>> disabled (for that one I do not totally understand what is going on...).
>>
>> We do not have any host-aware disk for testing (as far as I know, nobody is
>> selling these anymore),
> 
> Yeah, so Johannes said. I love it how a (relatively) brand new
> technology gets immediately deprecated :-\

Host-managed is still a thing, getting bigger. But host-aware never really
gained a lot of traction due to, I think, the potentially very weird performance
profile they can get into (Hmmm... similar to recent drive-managed noise...)

>> so our test setup is a bit lame in this area. We'll rig something up
>> with tcmu-runner emulation to add tests for these devices to avoid
>> a repeat of such problem. And we'll make sure to add a test for
>> host-aware+partitions, since we at least know for sure there is one
>> user :)
> 
> Bah, I use it as a big data dump so if you say, I should make use of it
> as a proper zoned device (I've skimmed through http://zonedstorage.io/ a
> bit last night), I can try to find some time...

No worries, we will fix the mess (sorry we hit you again !).
Also, Naohiro just posted btrfs zone support v7 !! Luckily, we can get that into
5.11.

The patch was space corrupted, but could you still try it ? Did it solve your
problem ? I can recend it (minus space corruption) if needed.

Cheers.


-- 
Damien Le Moal
Western Digital Research


Re: first bad commit: [5795eb443060148796beeba106e4366d7f1458a6] scsi: sd_zbc: emulate ZONE_APPEND commands

2020-09-11 Thread Damien Le Moal
access_timeouts = SD_MAX_MEDIUM_TIMEOUTS;

+   sd_revalidate_disk(gd);
+
error = sd_zbc_init_disk(sdkp);
if (error)
goto out_free_index;

-   sd_revalidate_disk(gd);
-
gd->flags = GENHD_FL_EXT_DEVT;
if (sdp->removable) {
gd->flags |= GENHD_FL_REMOVABLE;
diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h
index 4933e7daf17d..f4dc81d48a01 100644
--- a/drivers/scsi/sd.h
+++ b/drivers/scsi/sd.h
@@ -241,6 +241,8 @@ static inline void sd_zbc_release_disk(struct scsi_disk
*sdkp) {}
 static inline int sd_zbc_read_zones(struct scsi_disk *sdkp,
unsigned char *buf)
 {
+   if (sd_is_zoned(sdkp))
+   sdkp->capacity = 0;
return 0;
 }

That should fix the above as well as the hang on boot with CONFIG_BLK_DEV_ZONED
disabled (for that one I do not totally understand what is going on...).

We do not have any host-aware disk for testing (as far as I know, nobody is
selling these anymore), so our test setup is a bit lame in this area. We'll rig
something up with tcmu-runner emulation to add tests for these devices to avoid
a repeat of such problem. And we'll make sure to add a test for
host-aware+partitions, since we at least know for sure there is one user :)

Johannes: The "goto out_free_index;" on sd_zbc_init_disk() failure is wrong I
think: the disk is already added and a ref taken on the dev, but out_free_index
does not seem to do cleanup for that. Need to revisit this.

Cheers.

-- 
Damien Le Moal
Western Digital Research


Re: [PATCH 1/2] nvme: set io-scheduler requirement for ZNS

2020-09-07 Thread Damien Le Moal
On 2020/09/07 20:54, Kanchan Joshi wrote:
> On Mon, Sep 7, 2020 at 5:07 PM Damien Le Moal  wrote:
>>
>> On 2020/09/07 20:24, Kanchan Joshi wrote:
>>> On Mon, Sep 7, 2020 at 1:52 PM Damien Le Moal  wrote:
>>>>
>>>> On 2020/09/07 16:01, Kanchan Joshi wrote:
>>>>>> Even for SMR, the user is free to set the elevator to none, which 
>>>>>> disables zone
>>>>>> write locking. Issuing writes correctly then becomes the responsibility 
>>>>>> of the
>>>>>> application. This can be useful for settings that for instance use NCQ 
>>>>>> I/O
>>>>>> priorities, which give better results when "none" is used.
>>>>>
>>>>> Was it not a problem that even if the application is sending writes
>>>>> correctly, scheduler may not preserve the order.
>>>>> And even when none is being used, re-queue can happen which may lead
>>>>> to different ordering.
>>>>
>>>> "Issuing writes correctly" means doing small writes, one per zone at most. 
>>>> In
>>>> that case, it does not matter if the block layer reorders writes. Per 
>>>> zone, they
>>>> will still be sequential.
>>>>
>>>>>> As far as I know, zoned drives are always used in tightly controlled
>>>>>> environments. Problems like "does not know what other applications would 
>>>>>> be
>>>>>> doing" are non-existent. Setting up the drive correctly for the use case 
>>>>>> at hand
>>>>>> is a sysadmin/server setup problem, based on *the* application (singular)
>>>>>> requirements.
>>>>>
>>>>> Fine.
>>>>> But what about the null-block-zone which sets MQ-deadline but does not
>>>>> actually use write-lock to avoid race among multiple appends on a
>>>>> zone.
>>>>> Does that deserve a fix?
>>>>
>>>> In nullblk, commands are executed under a spinlock. So there is no 
>>>> concurrency
>>>> problem. The spinlock serializes the execution of all commands. null_blk 
>>>> zone
>>>> append emulation thus does not need to take the scheduler level zone write 
>>>> lock
>>>> like scsi does.
>>>
>>> I do not see spinlock for that. There is one "nullb->lock", but its
>>> scope is limited to memory-backed handling.
>>> For concurrent zone-appends on a zone, multiple threads may set the
>>> "same" write-pointer into incoming request(s).
>>> Are you referring to any other spinlock that can avoid "same wp being
>>> returned to multiple threads".
>>
>> Checking again, it looks like you are correct. nullb->lock is indeed only 
>> used
>> for processing read/write with memory backing turned on.
>> We either need to extend that spinlock use, or add one to protect the zone 
>> array
>> when doing zoned commands and checks of read/write against a zone wp.
>> Care to send a patch ? I can send one too.
> 
> Sure, I can send.
> Do you think it is not OK to use zone write-lock (same like SCSI
> emulation) instead of introducing a new spinlock?

zone write lock will not protect against read or zone management commands
executed concurrently with writes. Only concurrent writes to the same zone will
be serialized with the scheduler zone write locking, which may not be used at
all also if the user set the scheduler to none. A lock for exclusive access and
changes to the zone array is needed.


> 
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH 1/2] nvme: set io-scheduler requirement for ZNS

2020-09-07 Thread Damien Le Moal
On 2020/09/07 20:24, Kanchan Joshi wrote:
> On Mon, Sep 7, 2020 at 1:52 PM Damien Le Moal  wrote:
>>
>> On 2020/09/07 16:01, Kanchan Joshi wrote:
>>>> Even for SMR, the user is free to set the elevator to none, which disables 
>>>> zone
>>>> write locking. Issuing writes correctly then becomes the responsibility of 
>>>> the
>>>> application. This can be useful for settings that for instance use NCQ I/O
>>>> priorities, which give better results when "none" is used.
>>>
>>> Was it not a problem that even if the application is sending writes
>>> correctly, scheduler may not preserve the order.
>>> And even when none is being used, re-queue can happen which may lead
>>> to different ordering.
>>
>> "Issuing writes correctly" means doing small writes, one per zone at most. In
>> that case, it does not matter if the block layer reorders writes. Per zone, 
>> they
>> will still be sequential.
>>
>>>> As far as I know, zoned drives are always used in tightly controlled
>>>> environments. Problems like "does not know what other applications would be
>>>> doing" are non-existent. Setting up the drive correctly for the use case 
>>>> at hand
>>>> is a sysadmin/server setup problem, based on *the* application (singular)
>>>> requirements.
>>>
>>> Fine.
>>> But what about the null-block-zone which sets MQ-deadline but does not
>>> actually use write-lock to avoid race among multiple appends on a
>>> zone.
>>> Does that deserve a fix?
>>
>> In nullblk, commands are executed under a spinlock. So there is no 
>> concurrency
>> problem. The spinlock serializes the execution of all commands. null_blk zone
>> append emulation thus does not need to take the scheduler level zone write 
>> lock
>> like scsi does.
> 
> I do not see spinlock for that. There is one "nullb->lock", but its
> scope is limited to memory-backed handling.
> For concurrent zone-appends on a zone, multiple threads may set the
> "same" write-pointer into incoming request(s).
> Are you referring to any other spinlock that can avoid "same wp being
> returned to multiple threads".

Checking again, it looks like you are correct. nullb->lock is indeed only used
for processing read/write with memory backing turned on.
We either need to extend that spinlock use, or add one to protect the zone array
when doing zoned commands and checks of read/write against a zone wp.
Care to send a patch ? I can send one too.


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH 1/2] nvme: set io-scheduler requirement for ZNS

2020-09-07 Thread Damien Le Moal
On 2020/09/07 16:01, Kanchan Joshi wrote:
>> Even for SMR, the user is free to set the elevator to none, which disables 
>> zone
>> write locking. Issuing writes correctly then becomes the responsibility of 
>> the
>> application. This can be useful for settings that for instance use NCQ I/O
>> priorities, which give better results when "none" is used.
> 
> Was it not a problem that even if the application is sending writes
> correctly, scheduler may not preserve the order.
> And even when none is being used, re-queue can happen which may lead
> to different ordering.

"Issuing writes correctly" means doing small writes, one per zone at most. In
that case, it does not matter if the block layer reorders writes. Per zone, they
will still be sequential.

>> As far as I know, zoned drives are always used in tightly controlled
>> environments. Problems like "does not know what other applications would be
>> doing" are non-existent. Setting up the drive correctly for the use case at 
>> hand
>> is a sysadmin/server setup problem, based on *the* application (singular)
>> requirements.
> 
> Fine.
> But what about the null-block-zone which sets MQ-deadline but does not
> actually use write-lock to avoid race among multiple appends on a
> zone.
> Does that deserve a fix?

In nullblk, commands are executed under a spinlock. So there is no concurrency
problem. The spinlock serializes the execution of all commands. null_blk zone
append emulation thus does not need to take the scheduler level zone write lock
like scsi does.



-- 
Damien Le Moal
Western Digital Research


Re: [PATCH v2] null_blk: add support for max open/active zone limit for zoned devices

2020-08-28 Thread Damien Le Moal
On 2020/08/28 19:06, Niklas Cassel wrote:
> On Fri, Aug 28, 2020 at 07:06:26AM +0000, Damien Le Moal wrote:
>> On 2020/08/27 22:50, Niklas Cassel wrote:
>>> Add support for user space to set a max open zone and a max active zone
>>> limit via configfs. By default, the default values are 0 == no limit.
>>>
>>> Call the block layer API functions used for exposing the configured
>>> limits to sysfs.
>>>
>>> Add accounting in null_blk_zoned so that these new limits are respected.
>>> Performing an operating that would exceed these limits results in a
>>
>> Performing a write operation that would result in exceeding these...
>>
>>> standard I/O error.
>>>
> 
> It is not only a write operation, also e.g. open zone operation.
> However I will s/Performing an operating/Performing an operation/
> 
>>> +/*
>>> + * This function matches the manage open zone resources function in the 
>>> ZBC standard,
>>> + * with the addition of max active zones support (added in the ZNS 
>>> standard).
>>> + *
>>> + * The function determines if a zone can transition to implicit open or 
>>> explicit open,
>>> + * while maintaining the max open zone (and max active zone) limit(s). It 
>>> may close an
>>> + * implicit open zone in order to make additional zone resources available.
>>> + *
>>> + * ZBC states that an implicit open zone shall be closed only if there is 
>>> not
>>> + * room within the open limit. However, with the addition of an active 
>>> limit,
>>> + * it is not certain that closing an implicit open zone will allow a new 
>>> zone
>>> + * to be opened, since we might already be at the active limit capacity.
>>> + */
>>> +static bool null_manage_zone_resources(struct nullb_device *dev, struct 
>>> blk_zone *zone)
>>
>> I still do not like the name. Since this return a bool, what about
>> null_has_zone_resources() ?
> 
> I also don't like the name :)
> 
> However, since the ZBC spec, in the descriptions of "Write operation, Finish
> operation, and Open operation", says that the "manage open zone resources"
> function must be called before each of these operations are performed,
> and that there is a section that defines how the "manage open zone resources"
> is defined, I was thinking that having a similar name would be of value.
> 
> And I agree that it is weird that it returns a bool, but that is how it is
> defined in the standard.
> 
> Perhaps it should have exactly the same name as the standard, i.e.
> null_manage_open_zone_resources() ?
> 
> However, if you don't think that there is any point of trying to have
> a similar name to the function in ZBC, then I will happily rename it :)

Well, I prefer to prioritize code readability over following a not-so-good name
that the standard chose. The function description makes it clear that it is zone
management a-la-ZBC, so a function name clarifying what is being checked is
better in my opinion. Not a blocker though. Feel free to chose what to do here.

Cheers.


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH v2] null_blk: add support for max open/active zone limit for zoned devices

2020-08-28 Thread Damien Le Moal
On 2020/08/28 16:47, Klaus Jensen wrote:
> On Aug 28 07:36, Damien Le Moal wrote:
>> On 2020/08/28 16:23, Klaus Jensen wrote:
>>> On Aug 28 07:06, Damien Le Moal wrote:
>>>> On 2020/08/27 22:50, Niklas Cassel wrote:
>>>>> +static blk_status_t null_finish_zone(struct nullb_device *dev, struct 
>>>>> blk_zone *zone)
>>>>> +{
>>>>> + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>>>>> + return BLK_STS_IOERR;
>>>>> +
>>>>> + switch (zone->cond) {
>>>>> + case BLK_ZONE_COND_FULL:
>>>>> + /* finish operation on full is not an error */
>>>>> + return BLK_STS_OK;
>>>>> + case BLK_ZONE_COND_EMPTY:
>>>>> + if (!null_manage_zone_resources(dev, zone))
>>>>
>>>> OK. So you are hitting a fuzzy case here that is not actually well 
>>>> described in
>>>> the standards. That is, does finishing an empty zone necessarilly imply a
>>>> temporary transition through imp open ? Which you are assuming is a yes 
>>>> here.
>>>> Personally, I would say that is not necessary, but no strong feeling 
>>>> either way.
>>>>
>>>
>>> For ZNS, the spec is pretty clear that ZSE to ZSF is a legal direct
>>> transition. So I don't think the transition should be allowed to fail
>>> due to a lack of resources.
>>
>> I had a doubt and checked again ZBC & ZAC. In section 4.4.3.2.4 it says:
>>
>> The Zone Condition state machine (see 4.4.3.5) requires the specified zone to
>> have a Zone Condition of EXPLICITLY OPENED or IMPLICITLY OPENED before a 
>> finish
>> zone operation is performed. If a zone with a Zone Condition of EMPTY or 
>> CLOSED
>> is specified for a finish zone operation, prior to processing the finish zone
>> operation, then the Zone Condition state machine requires that:
>> a) a manage open zone resources operation (see 4.4.3.2.6) be performed; and
>> b) the Zone Condition becomes IMPLICITLY OPENED.
>>
>> And section 5.3 describing the zone finish command points to this section.
>> So this is not the same as ZNS.
>>
>> As Niklas mentioned, nullblk tends to follow more ZBC than ZNS, so the code 
>> is
>> correct in this respect. We could also lean toward ZNS on this one. I 
>> personally
>>  have no strong opinion either way since there is not real good reasons for
>> finishing an empty zone that I can think of.
>>
>>
> 
> Alrighty then; thanks for looking it up! I won't fight for the ZNS
> behavior then :)

If it becomes a problem, we could add another option to force one way or the
other, something like "zone_specs=zbc" or "zone_specs=zns". That actually may
prove useful for testing file systems etc.


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH v2] null_blk: add support for max open/active zone limit for zoned devices

2020-08-28 Thread Damien Le Moal
On 2020/08/28 16:23, Klaus Jensen wrote:
> On Aug 28 07:06, Damien Le Moal wrote:
>> On 2020/08/27 22:50, Niklas Cassel wrote:
>>> +static blk_status_t null_finish_zone(struct nullb_device *dev, struct 
>>> blk_zone *zone)
>>> +{
>>> +   if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>>> +   return BLK_STS_IOERR;
>>> +
>>> +   switch (zone->cond) {
>>> +   case BLK_ZONE_COND_FULL:
>>> +   /* finish operation on full is not an error */
>>> +   return BLK_STS_OK;
>>> +   case BLK_ZONE_COND_EMPTY:
>>> +   if (!null_manage_zone_resources(dev, zone))
>>
>> OK. So you are hitting a fuzzy case here that is not actually well described 
>> in
>> the standards. That is, does finishing an empty zone necessarilly imply a
>> temporary transition through imp open ? Which you are assuming is a yes here.
>> Personally, I would say that is not necessary, but no strong feeling either 
>> way.
>>
> 
> For ZNS, the spec is pretty clear that ZSE to ZSF is a legal direct
> transition. So I don't think the transition should be allowed to fail
> due to a lack of resources.

I had a doubt and checked again ZBC & ZAC. In section 4.4.3.2.4 it says:

The Zone Condition state machine (see 4.4.3.5) requires the specified zone to
have a Zone Condition of EXPLICITLY OPENED or IMPLICITLY OPENED before a finish
zone operation is performed. If a zone with a Zone Condition of EMPTY or CLOSED
is specified for a finish zone operation, prior to processing the finish zone
operation, then the Zone Condition state machine requires that:
a) a manage open zone resources operation (see 4.4.3.2.6) be performed; and
b) the Zone Condition becomes IMPLICITLY OPENED.

And section 5.3 describing the zone finish command points to this section.
So this is not the same as ZNS.

As Niklas mentioned, nullblk tends to follow more ZBC than ZNS, so the code is
correct in this respect. We could also lean toward ZNS on this one. I personally
 have no strong opinion either way since there is not real good reasons for
finishing an empty zone that I can think of.


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH 0/1] block io layer filters api

2020-08-28 Thread Damien Le Moal
On 2020/08/28 4:14, Sergei Shtepa wrote:
> Hello everyone! Requesting for your comments and suggestions.
> 
> We propose new kernel API that should be beneficial for out-of-tree
> kernel modules of multiple backup vendors: block layer filter API.
> 
> Functionality:
> * Provide callback to intercept bio requests, the main purpose is to
> allow block level snapshots for the devices that do not support it,
> for example, non-LVM block devices and implementation of changed block
> tracking for faster incremental backups without system reconfiguration
> or reboot, but there could be other use cases that we have not thought of.
> * Allow multiple filters to work at the same time. The order in which the
> request is intercepted is determined by their altitude.
> * When new block devices appear, send a synchronous request to the
> registered filter to add it for filtering.
> * If a block device is permanently deleted or disappears, send a
> synchronous request to remove the device from filtering.
> 
> Why dm-snap and dm-era is not the solution:
> Device mapper must be set up in advance, usually backup vendors have very
> little ability to change or convince users to modify the existing setup
> at the time of software installation.
> One of the most common setups is still a block device without LVM and
> formatted with ext4.
> Convincing users to redeploy or reconfigure machine, just to make block
> level snapshots/backup software work, is a challenging task.

And convincing said users to change their kernel is not challenging ? In my
experience, that is even harder than trying to get them to change their
configuration.

> As of now, commit c62b37d96b6e removed make_request_fn from
> struct request_queue and our out-of-tree module [1] can no longer
> hook/replace it to intercept bio requests. And fops in struct gendisk
> is declared as const and cannot be hooked as well.
> 
> We would appreciate your feedback!

Upstream your out-of-tree module ?

> [1] https://github.com/veeam/veeamsnap
> 
> Sergei Shtepa (1):
>   block io layer filters api
> 
>  block/Kconfig   |  11 ++
>  block/Makefile  |   1 +
>  block/blk-core.c|  11 +-
>  block/blk-filter-internal.h |  34 +
>  block/blk-filter.c  | 288 
>  block/genhd.c   |  24 +++
>  include/linux/blk-filter.h  |  41 +
>  include/linux/genhd.h   |   2 +
>  8 files changed, 410 insertions(+), 2 deletions(-)
>  create mode 100644 block/blk-filter-internal.h
>  create mode 100644 block/blk-filter.c
>  create mode 100644 include/linux/blk-filter.h
> 


-- 
Damien Le Moal
Western Digital Research


Re: [PATCH v2] null_blk: add support for max open/active zone limit for zoned devices

2020-08-28 Thread Damien Le Moal
On 2020/08/27 22:50, Niklas Cassel wrote:
> Add support for user space to set a max open zone and a max active zone
> limit via configfs. By default, the default values are 0 == no limit.
> 
> Call the block layer API functions used for exposing the configured
> limits to sysfs.
> 
> Add accounting in null_blk_zoned so that these new limits are respected.
> Performing an operating that would exceed these limits results in a

Performing a write operation that would result in exceeding these...

> standard I/O error.
> 
> A max open zone limit exists in the ZBC standard.
> While null_blk_zoned is used to test the Zoned Block Device model in
> Linux, when it comes to differences between ZBC and ZNS, null_blk_zoned
> mostly follows ZBC.
> 
> Therefore, implement the manage open zone resources function from ZBC,
> but additionally add support for max active zones.
> This enables user space not only to test against a device with an open
> zone limit, but also to test against a device with an active zone limit.
> 
> Signed-off-by: Niklas Cassel 
> ---
> Changes since v1:
> -Fixed review comments by Damien Le Moal.
> 
>  drivers/block/null_blk.h   |   5 +
>  drivers/block/null_blk_main.c  |  16 +-
>  drivers/block/null_blk_zoned.c | 319 +++--
>  3 files changed, 282 insertions(+), 58 deletions(-)
> 
> diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
> index daed4a9c34367..d2e7db43a52a7 100644
> --- a/drivers/block/null_blk.h
> +++ b/drivers/block/null_blk.h
> @@ -42,6 +42,9 @@ struct nullb_device {
>   struct badblocks badblocks;
>  
>   unsigned int nr_zones;
> + unsigned int nr_zones_imp_open;
> + unsigned int nr_zones_exp_open;
> + unsigned int nr_zones_closed;
>   struct blk_zone *zones;
>   sector_t zone_size_sects;
>  
> @@ -51,6 +54,8 @@ struct nullb_device {
>   unsigned long zone_size; /* zone size in MB if device is zoned */
>   unsigned long zone_capacity; /* zone capacity in MB if device is zoned 
> */
>   unsigned int zone_nr_conv; /* number of conventional zones */
> + unsigned int zone_max_open; /* max number of open zones */
> + unsigned int zone_max_active; /* max number of active zones */
>   unsigned int submit_queues; /* number of submission queues */
>   unsigned int home_node; /* home node for the device */
>   unsigned int queue_mode; /* block interface */
> diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
> index d74443a9c8fa2..53161a418611b 100644
> --- a/drivers/block/null_blk_main.c
> +++ b/drivers/block/null_blk_main.c
> @@ -208,6 +208,14 @@ static unsigned int g_zone_nr_conv;
>  module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444);
>  MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block 
> device is zoned. Default: 0");
>  
> +static unsigned int g_zone_max_open;
> +module_param_named(zone_max_open, g_zone_max_open, uint, 0444);
> +MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block 
> device is zoned. Default: 0 (no limit)");
> +
> +static unsigned int g_zone_max_active;
> +module_param_named(zone_max_active, g_zone_max_active, uint, 0444);
> +MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block 
> device is zoned. Default: 0 (no limit)");
> +
>  static struct nullb_device *null_alloc_dev(void);
>  static void null_free_dev(struct nullb_device *dev);
>  static void null_del_dev(struct nullb *nullb);
> @@ -347,6 +355,8 @@ NULLB_DEVICE_ATTR(zoned, bool, NULL);
>  NULLB_DEVICE_ATTR(zone_size, ulong, NULL);
>  NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL);
>  NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
> +NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
> +NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
>  
>  static ssize_t nullb_device_power_show(struct config_item *item, char *page)
>  {
> @@ -464,6 +474,8 @@ static struct configfs_attribute *nullb_device_attrs[] = {
>  	&nullb_device_attr_zone_size,
>  	&nullb_device_attr_zone_capacity,
>  	&nullb_device_attr_zone_nr_conv,
> +	&nullb_device_attr_zone_max_open,
> +	&nullb_device_attr_zone_max_active,
>   NULL,
>  };
>  
> @@ -517,7 +529,7 @@ nullb_group_drop_item(struct config_group *group, struct 
> config_item *item)
>  static ssize_t memb_group_features_show(struct config_item *item, char *page)
>  {
>   return snprintf(page, PAGE_SIZE,
> - 
> "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv\n");
> + 
> "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_ma

  1   2   3   >