[PATCH v10.6 3/5] btrfs-progs: dedupe: Add disable support for inband deduplication

2018-11-05 Thread Lu Fengqi
From: Qu Wenruo 

Add disable subcommand for dedupe command group.

Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 Documentation/btrfs-dedupe-inband.asciidoc |  5 +++
 btrfs-completion   |  2 +-
 cmds-dedupe-ib.c   | 41 ++
 3 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/Documentation/btrfs-dedupe-inband.asciidoc 
b/Documentation/btrfs-dedupe-inband.asciidoc
index d895aafbcf45..3452f690e3e5 100644
--- a/Documentation/btrfs-dedupe-inband.asciidoc
+++ b/Documentation/btrfs-dedupe-inband.asciidoc
@@ -22,6 +22,11 @@ use with caution.
 
 SUBCOMMAND
 --
+*disable* ::
+Disable in-band de-duplication for a filesystem.
++
+This will discard all stored dedupe hashes.
++
 *enable* [options] ::
 Enable in-band de-duplication for a filesystem.
 +
diff --git a/btrfs-completion b/btrfs-completion
index 621801cf12fb..e6ec785bf849 100644
--- a/btrfs-completion
+++ b/btrfs-completion
@@ -34,7 +34,7 @@ _btrfs()
commands_quota='enable disable rescan'
commands_qgroup='assign remove create destroy show limit'
commands_replace='start status cancel'
-   commands_dedupe_inband='enable'
+   commands_dedupe_inband='enable disable'
 
if [[ "$cur" == -* && $cword -le 3 && "$cmd" != "help" ]]; then
COMPREPLY=( $( compgen -W '--help' -- "$cur" ) )
diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c
index 4d499677d9ae..91b6fe234043 100644
--- a/cmds-dedupe-ib.c
+++ b/cmds-dedupe-ib.c
@@ -259,10 +259,51 @@ out:
return ret;
 }
 
+static const char * const cmd_dedupe_ib_disable_usage[] = {
+	"btrfs dedupe-inband disable <path>",
+	"Disable in-band(write time) de-duplication of a btrfs.",
+	NULL
+};
+
+/*
+ * Handler for "btrfs dedupe-inband disable <path>".
+ *
+ * Opens argv[1] and issues BTRFS_IOC_DEDUPE_CTL with
+ * cmd = BTRFS_DEDUPE_CTL_DISABLE on it.
+ *
+ * Returns 0 on success, 1 if the path cannot be opened or the ioctl fails.
+ */
+static int cmd_dedupe_ib_disable(int argc, char **argv)
+{
+	struct btrfs_ioctl_dedupe_args dargs;
+	DIR *dirstream;
+	char *path;
+	int fd;
+	int ret;
+
+	if (check_argc_exact(argc, 2))
+		usage(cmd_dedupe_ib_disable_usage);
+
+	path = argv[1];
+	fd = open_file_or_dir(path, &dirstream);
+	if (fd < 0) {
+		error("failed to open file or directory: %s", path);
+		return 1;
+	}
+	memset(&dargs, 0, sizeof(dargs));
+	dargs.cmd = BTRFS_DEDUPE_CTL_DISABLE;
+
+	ret = ioctl(fd, BTRFS_IOC_DEDUPE_CTL, &dargs);
+	if (ret < 0) {
+		error("failed to disable inband deduplication: %m");
+		ret = 1;
+		goto out;
+	}
+	ret = 0;
+
+out:
+	close_file_or_dir(fd, dirstream);
+	/* Propagate the status so a failed ioctl yields a non-zero exit code. */
+	return ret;
+}
+
 const struct cmd_group dedupe_ib_cmd_group = {
dedupe_ib_cmd_group_usage, dedupe_ib_cmd_group_info, {
{ "enable", cmd_dedupe_ib_enable, cmd_dedupe_ib_enable_usage,
  NULL, 0},
+   { "disable", cmd_dedupe_ib_disable, cmd_dedupe_ib_disable_usage,
+ NULL, 0},
NULL_CMD_STRUCT
}
 };
-- 
2.19.1





[PATCH v10.6 1/5] btrfs-progs: Basic framework for dedupe-inband command group

2018-11-05 Thread Lu Fengqi
From: Qu Wenruo 

Add basic ioctl header and command group framework for later use.
Along with basic man page doc.

Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 Documentation/Makefile.in  |  1 +
 Documentation/btrfs-dedupe-inband.asciidoc | 40 ++
 Documentation/btrfs.asciidoc   |  4 +++
 Makefile   |  3 +-
 btrfs.c|  2 ++
 cmds-dedupe-ib.c   | 35 +++
 commands.h |  2 ++
 dedupe-ib.h| 28 +++
 ioctl.h| 36 +++
 9 files changed, 150 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/btrfs-dedupe-inband.asciidoc
 create mode 100644 cmds-dedupe-ib.c
 create mode 100644 dedupe-ib.h

diff --git a/Documentation/Makefile.in b/Documentation/Makefile.in
index afc16980c6d9..c0d797324c25 100644
--- a/Documentation/Makefile.in
+++ b/Documentation/Makefile.in
@@ -28,6 +28,7 @@ MAN8_TXT += btrfs-qgroup.asciidoc
 MAN8_TXT += btrfs-replace.asciidoc
 MAN8_TXT += btrfs-restore.asciidoc
 MAN8_TXT += btrfs-property.asciidoc
+MAN8_TXT += btrfs-dedupe-inband.asciidoc
 
 # Category 5 manual page
 MAN5_TXT += btrfs-man5.asciidoc
diff --git a/Documentation/btrfs-dedupe-inband.asciidoc 
b/Documentation/btrfs-dedupe-inband.asciidoc
new file mode 100644
index ..83113f5487e2
--- /dev/null
+++ b/Documentation/btrfs-dedupe-inband.asciidoc
@@ -0,0 +1,40 @@
+btrfs-dedupe-inband(8)
+==
+
+NAME
+
+btrfs-dedupe-inband - manage in-band (write time) de-duplication of a btrfs
+filesystem
+
+SYNOPSIS
+
+*btrfs dedupe-inband*  
+
+DESCRIPTION
+---
+*btrfs dedupe-inband* is used to enable/disable or show current in-band 
de-duplication
+status of a btrfs filesystem.
+
+Kernel support for in-band de-duplication starts from 4.19.
+
+WARNING: In-band de-duplication is still an experimental feature of btrfs,
+use with caution.
+
+SUBCOMMAND
+--
+Nothing yet
+
+EXIT STATUS
+---
+*btrfs dedupe-inband* returns a zero exit status if it succeeds. Non zero is
+returned in case of failure.
+
+AVAILABILITY
+
+*btrfs* is part of btrfs-progs.
+Please refer to the btrfs wiki http://btrfs.wiki.kernel.org for
+further details.
+
+SEE ALSO
+
+`mkfs.btrfs`(8),
diff --git a/Documentation/btrfs.asciidoc b/Documentation/btrfs.asciidoc
index 7316ac094413..1cf5bddec335 100644
--- a/Documentation/btrfs.asciidoc
+++ b/Documentation/btrfs.asciidoc
@@ -50,6 +50,10 @@ COMMANDS
Do off-line check on a btrfs filesystem. +
See `btrfs-check`(8) for details.
 
+*dedupe-inband*::
+   Control btrfs in-band(write time) de-duplication. +
+   See `btrfs-dedupe-inband`(8) for details.
+
 *device*::
Manage devices managed by btrfs, including add/delete/scan and so
on. +
diff --git a/Makefile b/Makefile
index f4ab14ea74c8..f155252c91f1 100644
--- a/Makefile
+++ b/Makefile
@@ -124,7 +124,8 @@ cmds_objects = cmds-subvolume.o cmds-filesystem.o 
cmds-device.o cmds-scrub.o \
   cmds-restore.o cmds-rescue.o chunk-recover.o super-recover.o \
   cmds-property.o cmds-fi-usage.o cmds-inspect-dump-tree.o \
   cmds-inspect-dump-super.o cmds-inspect-tree-stats.o cmds-fi-du.o 
\
-  mkfs/common.o check/mode-common.o check/mode-lowmem.o
+  mkfs/common.o check/mode-common.o check/mode-lowmem.o \
+  cmds-dedupe-ib.o
 libbtrfs_objects = send-stream.o send-utils.o kernel-lib/rbtree.o btrfs-list.o 
\
   kernel-lib/crc32c.o messages.o \
   uuid-tree.o utils-lib.o rbtree-utils.o
diff --git a/btrfs.c b/btrfs.c
index 2d39f2ced3e8..2168f5a8bc7f 100644
--- a/btrfs.c
+++ b/btrfs.c
@@ -255,6 +255,8 @@ static const struct cmd_group btrfs_cmd_group = {
{ "quota", cmd_quota, NULL, &quota_cmd_group, 0 },
{ "qgroup", cmd_qgroup, NULL, &qgroup_cmd_group, 0 },
{ "replace", cmd_replace, NULL, &replace_cmd_group, 0 },
+   { "dedupe-inband", cmd_dedupe_ib, NULL, &dedupe_ib_cmd_group,
+   0 },
{ "help", cmd_help, cmd_help_usage, NULL, 0 },
{ "version", cmd_version, cmd_version_usage, NULL, 0 },
NULL_CMD_STRUCT
diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c
new file mode 100644
index ..73c923a797da
--- /dev/null
+++ b/cmds-dedupe-ib.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2017 Fujitsu.  All rights reserved.
+ */
+
+#include 
+#include 
+#include 
+
+#include "ctree.h"
+#include "ioctl.h"
+
+#include "commands.h"
+#include "utils.h"
+#include "kerncompat.h"
+#include "dedupe-ib.h"
+
+static const char * const dedupe_ib_cmd_group_usage[] = {
+   "btrfs dedupe-inband  [options] ",
+   NULL
+};
+
+static const char dedupe_ib_cmd_group_info[] =

[PATCH v10.6 5/5] btrfs-progs: dedupe: introduce reconfigure subcommand

2018-11-05 Thread Lu Fengqi
From: Qu Wenruo 

Introduce reconfigure subcommand to co-operate with new kernel ioctl
modification.

Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 Documentation/btrfs-dedupe-inband.asciidoc |  7 +++
 btrfs-completion   |  2 +-
 cmds-dedupe-ib.c   | 73 +-
 3 files changed, 66 insertions(+), 16 deletions(-)

diff --git a/Documentation/btrfs-dedupe-inband.asciidoc 
b/Documentation/btrfs-dedupe-inband.asciidoc
index 6096389cb0b4..78c806f772d6 100644
--- a/Documentation/btrfs-dedupe-inband.asciidoc
+++ b/Documentation/btrfs-dedupe-inband.asciidoc
@@ -86,6 +86,13 @@ And compression has higher priority than in-band 
de-duplication, means if
 compression and de-duplication is enabled at the same time, only compression
 will work.
 
+*reconfigure* [options] ::
+Re-configure in-band de-duplication parameters of a filesystem.
++
+In-band de-duplication must be enabled before re-configuration.
++
+[Options] are the same as for 'btrfs dedupe-inband enable'.
+
 *status* ::
 Show current in-band de-duplication status of a filesystem.
 
diff --git a/btrfs-completion b/btrfs-completion
index 0808f9a14df9..a3e05b238eda 100644
--- a/btrfs-completion
+++ b/btrfs-completion
@@ -34,7 +34,7 @@ _btrfs()
commands_quota='enable disable rescan'
commands_qgroup='assign remove create destroy show limit'
commands_replace='start status cancel'
-   commands_dedupe_inband='enable disable status'
+   commands_dedupe_inband='enable disable status reconfigure'
 
if [[ "$cur" == -* && $cword -le 3 && "$cmd" != "help" ]]; then
COMPREPLY=( $( compgen -W '--help' -- "$cur" ) )
diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c
index e778457e25a8..e52f939c9ced 100644
--- a/cmds-dedupe-ib.c
+++ b/cmds-dedupe-ib.c
@@ -56,7 +56,6 @@ static const char * const cmd_dedupe_ib_enable_usage[] = {
NULL
 };
 
-
 #define report_fatal_parameter(dargs, old, member, type, err_val, fmt) \
 ({ \
if (dargs->member != old->member && \
@@ -88,6 +87,12 @@ static void report_parameter_error(struct 
btrfs_ioctl_dedupe_args *dargs,
}
report_option_parameter(dargs, old, flags, u8, -1, x);
}
+
+   if (dargs->status == 0 && old->cmd == BTRFS_DEDUPE_CTL_RECONF) {
+   error("must enable dedupe before reconfiguration");
+   return;
+   }
+
if (report_fatal_parameter(dargs, old, cmd, u16, -1, u) ||
report_fatal_parameter(dargs, old, blocksize, u64, -1, llu) ||
report_fatal_parameter(dargs, old, backend, u16, -1, u) ||
@@ -100,14 +105,17 @@ static void report_parameter_error(struct 
btrfs_ioctl_dedupe_args *dargs,
old->limit_nr, old->limit_mem);
 }
 
-static int cmd_dedupe_ib_enable(int argc, char **argv)
+static int enable_reconfig_dedupe(int argc, char **argv, int reconf)
 {
int ret;
int fd = -1;
char *path;
u64 blocksize = BTRFS_DEDUPE_BLOCKSIZE_DEFAULT;
+   int blocksize_set = 0;
u16 hash_algo = BTRFS_DEDUPE_HASH_SHA256;
+   int hash_algo_set = 0;
u16 backend = BTRFS_DEDUPE_BACKEND_INMEMORY;
+   int backend_set = 0;
u64 limit_nr = 0;
u64 limit_mem = 0;
u64 sys_mem = 0;
@@ -134,15 +142,17 @@ static int cmd_dedupe_ib_enable(int argc, char **argv)
break;
switch (c) {
case 's':
-   if (!strcasecmp("inmemory", optarg))
+   if (!strcasecmp("inmemory", optarg)) {
backend = BTRFS_DEDUPE_BACKEND_INMEMORY;
-   else {
+   backend_set = 1;
+   } else {
error("unsupported dedupe backend: %s", optarg);
exit(1);
}
break;
case 'b':
blocksize = parse_size(optarg);
+   blocksize_set = 1;
break;
case 'a':
if (strcmp("sha256", optarg)) {
@@ -224,26 +234,40 @@ static int cmd_dedupe_ib_enable(int argc, char **argv)
return 1;
}
memset(&dargs, -1, sizeof(dargs));
-   dargs.cmd = BTRFS_DEDUPE_CTL_ENABLE;
-   dargs.blocksize = blocksize;
-   dargs.hash_algo = hash_algo;
-   dargs.limit_nr = limit_nr;
-   dargs.limit_mem = limit_mem;
-   dargs.backend = backend;
-   if (force)
-   dargs.flags |= BTRFS_DEDUPE_FLAG_FORCE;
-   else
-   dargs.flags = 0;
+   if (reconf) {
+   dargs.cmd = BTRFS_DEDUPE_CTL_RECONF;
+   if (blocksize_set)
+   dargs.blocksize = blocksize;
+   if (hash_algo_set)
+  

[PATCH v10.6 2/5] btrfs-progs: dedupe: Add enable command for dedupe command group

2018-11-05 Thread Lu Fengqi
From: Qu Wenruo 

Add enable subcommand for dedupe command group.

Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 Documentation/btrfs-dedupe-inband.asciidoc | 114 +-
 btrfs-completion   |   6 +-
 cmds-dedupe-ib.c   | 238 +
 ioctl.h|   2 +
 4 files changed, 358 insertions(+), 2 deletions(-)

diff --git a/Documentation/btrfs-dedupe-inband.asciidoc 
b/Documentation/btrfs-dedupe-inband.asciidoc
index 83113f5487e2..d895aafbcf45 100644
--- a/Documentation/btrfs-dedupe-inband.asciidoc
+++ b/Documentation/btrfs-dedupe-inband.asciidoc
@@ -22,7 +22,119 @@ use with caution.
 
 SUBCOMMAND
 --
-Nothing yet
+*enable* [options] ::
+Enable in-band de-duplication for a filesystem.
++
+`Options`
++
+-f|--force
+Force 'enable' command to be executed.
+Will skip memory limit check and allow 'enable' to be executed even if in-band
+de-duplication is already enabled.
++
+NOTE: If re-enable dedupe with '-f' option, any unspecified parameter will be
+reset to its default value.
+
+-s|--storage-backend 
+Specify de-duplication hash storage backend.
+Only 'inmemory' backend is supported yet.
+If not specified, default value is 'inmemory'.
++
+Refer to *BACKENDS* section for more information.
+
+-b|--blocksize 
+Specify dedupe block size.
+Supported values are power of 2 from '16K' to '8M'.
+Default value is '128K'.
++
+Refer to *DEDUPE BLOCK SIZE* section for more information.
+
+-a|--hash-algorithm 
+Specify hash algorithm.
+Only 'sha256' is supported yet.
+
+-l|--limit-hash 
+Specify maximum number of hashes stored in memory.
+Only works for 'inmemory' backend.
+Conflicts with '-m' option.
++
+Only positive values are valid.
+Default value is '32K'.
+
+-m|--limit-memory 
+Specify maximum memory used for hashes.
+Only works for 'inmemory' backend.
+Conflicts with '-l' option.
++
+Only value larger than or equal to '1024' is valid.
+No default value.
++
+NOTE: Memory limit will be rounded down to kernel internal hash size,
+so the memory limit shown in 'btrfs dedupe-inband status' may be different
+from the .
+
+WARNING: Too large value for '-l' or '-m' will easily trigger OOM.
+Please use with caution according to system memory.
+
+NOTE: In-band de-duplication is not compatible with compression yet.
+And compression has higher priority than in-band de-duplication, means if
+compression and de-duplication is enabled at the same time, only compression
+will work.
+
+BACKENDS
+
+Btrfs in-band de-duplication will support different storage backends, with
+different use case and features.
+
+In-memory backend::
+This backend provides backward-compatibility, and more fine-tuning options.
+But hash pool is non-persistent and may exhaust kernel memory if not setup
+properly.
++
+This backend can be used on old btrfs(without '-O dedupe' mkfs option).
+When used on old btrfs, this backend needs to be enabled manually after mount.
++
+Designed for fast hash search speed, in-memory backend will keep all dedupe
+hashes in memory. (Although overall performance is still much the same with
+'ondisk' backend if all 'ondisk' hash can be cached in memory)
++
+And only keeps limited number of hash in memory to avoid exhausting memory.
+Hashes over the limit will be dropped following Least-Recently-Used (LRU) behavior.
+So this backend has a consistent overhead for given limit but can\'t ensure
+all duplicated blocks will be de-duplicated.
++
+After umount and mount, in-memory backend need to refill its hash pool.
+
+On-disk backend::
+This backend provides persistent hash pool, with more smart memory management
+for hash pool.
+But it\'s not backward-compatible, meaning it must be used with '-O dedupe' 
mkfs
+option and older kernel can\'t mount it read-write.
++
+Designed for de-duplication rate, hash pool is stored as btrfs B+ tree on disk.
+This behavior may cause extra disk IO for hash search under high memory
+pressure.
++
+After umount and mount, on-disk backend still has its hash on disk, no need to
+refill its dedupe hash pool.
+
+Currently, only 'inmemory' backend is supported in btrfs-progs.
+
+DEDUPE BLOCK SIZE
+
+In-band de-duplication is done at dedupe block size.
+Any data smaller than dedupe block size won\'t go through in-band
+de-duplication.
+
+And dedupe block size affects dedupe rate and fragmentation heavily.
+
+Smaller block size will cause more fragments, but higher dedupe rate.
+
+Larger block size will cause less fragments, but lower dedupe rate.
+
+In-band de-duplication rate is highly related to the workload pattern.
+So it\'s highly recommended to align dedupe block size to the workload
+block size to make full use of de-duplication.
 
 EXIT STATUS
 ---
diff --git a/btrfs-completion b/btrfs-completion
index 6ae57d1b752b..621801cf12fb 100644
--- a/btrfs-completion
+++ b/btrfs-completion
@@ -22,7 +22,7 @@ _btrfs()
 
local cmd=${words[1]}
 
-   

[PATCH v10.6 0/5] In-band de-duplication for btrfs-progs

2018-11-05 Thread Lu Fengqi
Patchset can be fetched from github:
https://github.com/littleroad/btrfs-progs.git dedupe_latest

Inband dedupe(in-memory backend only) ioctl support for btrfs-progs.

v7 changes:
   Update ctree.h to follow kernel structure change
   Update print-tree to follow kernel structure change
V8 changes:
   Move dedup props and on-disk backend support out of the patchset
   Change command group name to "dedupe-inband", to avoid confusion with
   possible out-of-band dedupe. Suggested by Mark.
   Rebase to latest devel branch.
V9 changes:
   Follow kernels ioctl change to support FORCE flag, new reconf ioctl,
   and more precise error reporting.
v10 changes:
   Rebase to v4.10.
   Add BUILD_ASSERT for btrfs_ioctl_dedupe_args
v10.1 changes:
   Rebase to v4.14.
v10.2 changes:
   Rebase to v4.16.1.
v10.3 changes:
   Rebase to v4.17.
v10.4 changes:
   Deal with offline reviews from Misono Tomohiro.
   1. s/btrfs-dedupe/btrfs-dedupe-inband
   2. Replace strerror(errno) with %m
   3. Use SZ_* instead of immediate numbers
   4. update btrfs-completion for reconfigure subcommand
v10.5 changes:
   Rebase to v4.17.1.
v10.6 changes:
   Rebase to v4.19.

Qu Wenruo (5):
  btrfs-progs: Basic framework for dedupe-inband command group
  btrfs-progs: dedupe: Add enable command for dedupe command group
  btrfs-progs: dedupe: Add disable support for inband deduplication
  btrfs-progs: dedupe: Add status subcommand
  btrfs-progs: dedupe: introduce reconfigure subcommand

 Documentation/Makefile.in  |   1 +
 Documentation/btrfs-dedupe-inband.asciidoc | 167 
 Documentation/btrfs.asciidoc   |   4 +
 Makefile   |   3 +-
 btrfs-completion   |   6 +-
 btrfs.c|   2 +
 cmds-dedupe-ib.c   | 437 +
 commands.h |   2 +
 dedupe-ib.h|  28 ++
 ioctl.h|  38 ++
 10 files changed, 686 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/btrfs-dedupe-inband.asciidoc
 create mode 100644 cmds-dedupe-ib.c
 create mode 100644 dedupe-ib.h

-- 
2.19.1





[PATCH v10.6 4/5] btrfs-progs: dedupe: Add status subcommand

2018-11-05 Thread Lu Fengqi
From: Qu Wenruo 

Add status subcommand for dedupe command group.

Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 Documentation/btrfs-dedupe-inband.asciidoc |  3 +
 btrfs-completion   |  2 +-
 cmds-dedupe-ib.c   | 80 ++
 3 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/Documentation/btrfs-dedupe-inband.asciidoc 
b/Documentation/btrfs-dedupe-inband.asciidoc
index 3452f690e3e5..6096389cb0b4 100644
--- a/Documentation/btrfs-dedupe-inband.asciidoc
+++ b/Documentation/btrfs-dedupe-inband.asciidoc
@@ -86,6 +86,9 @@ And compression has higher priority than in-band 
de-duplication, means if
 compression and de-duplication is enabled at the same time, only compression
 will work.
 
+*status* ::
+Show current in-band de-duplication status of a filesystem.
+
 BACKENDS
 
 Btrfs in-band de-duplication will support different storage backends, with
diff --git a/btrfs-completion b/btrfs-completion
index e6ec785bf849..0808f9a14df9 100644
--- a/btrfs-completion
+++ b/btrfs-completion
@@ -34,7 +34,7 @@ _btrfs()
commands_quota='enable disable rescan'
commands_qgroup='assign remove create destroy show limit'
commands_replace='start status cancel'
-   commands_dedupe_inband='enable disable'
+   commands_dedupe_inband='enable disable status'
 
if [[ "$cur" == -* && $cword -le 3 && "$cmd" != "help" ]]; then
COMPREPLY=( $( compgen -W '--help' -- "$cur" ) )
diff --git a/cmds-dedupe-ib.c b/cmds-dedupe-ib.c
index 91b6fe234043..e778457e25a8 100644
--- a/cmds-dedupe-ib.c
+++ b/cmds-dedupe-ib.c
@@ -298,12 +298,92 @@ out:
return 0;
 }
 
+static const char * const cmd_dedupe_ib_status_usage[] = {
+	"btrfs dedupe-inband status <path>",
+	"Show current in-band(write time) de-duplication status of a btrfs.",
+	NULL
+};
+
+/*
+ * Handler for "btrfs dedupe-inband status <path>".
+ *
+ * Queries BTRFS_IOC_DEDUPE_CTL with cmd = BTRFS_DEDUPE_CTL_STATUS and prints
+ * whether dedupe is enabled and, if so, the hash algorithm, backend and
+ * block size. Hash-count and memory limits are printed only for the
+ * in-memory backend.
+ *
+ * Returns 0 on success, 1 if the path cannot be opened or the ioctl fails.
+ */
+static int cmd_dedupe_ib_status(int argc, char **argv)
+{
+	struct btrfs_ioctl_dedupe_args dargs;
+	DIR *dirstream;
+	char *path;
+	int fd;
+	int ret;
+	/* Set only by a backend whose limits are meaningful to print. */
+	int print_limit = 0;
+
+	if (check_argc_exact(argc, 2))
+		usage(cmd_dedupe_ib_status_usage);
+
+	path = argv[1];
+	fd = open_file_or_dir(path, &dirstream);
+	if (fd < 0) {
+		error("failed to open file or directory: %s", path);
+		/*
+		 * Nothing was opened: do not go through the close path with an
+		 * invalid fd and a dirstream that may never have been set.
+		 */
+		return 1;
+	}
+	memset(&dargs, 0, sizeof(dargs));
+	dargs.cmd = BTRFS_DEDUPE_CTL_STATUS;
+
+	ret = ioctl(fd, BTRFS_IOC_DEDUPE_CTL, &dargs);
+	if (ret < 0) {
+		error("failed to get inband deduplication status: %m");
+		ret = 1;
+		goto out;
+	}
+	ret = 0;
+	if (dargs.status == 0) {
+		printf("Status: \t\t\tDisabled\n");
+		goto out;
+	}
+	printf("Status:\t\t\tEnabled\n");
+
+	if (dargs.hash_algo == BTRFS_DEDUPE_HASH_SHA256)
+		printf("Hash algorithm:\t\tSHA-256\n");
+	else
+		printf("Hash algorithm:\t\tUnrecognized(%x)\n",
+			dargs.hash_algo);
+
+	if (dargs.backend == BTRFS_DEDUPE_BACKEND_INMEMORY) {
+		printf("Backend:\t\tIn-memory\n");
+		print_limit = 1;
+	} else  {
+		printf("Backend:\t\tUnrecognized(%x)\n",
+			dargs.backend);
+	}
+
+	printf("Dedup Blocksize:\t%llu\n", dargs.blocksize);
+
+	if (print_limit) {
+		u64 cur_mem;
+
+		/* Limit nr may be 0; avoid dividing by it. */
+		if (dargs.limit_nr)
+			cur_mem = dargs.current_nr * (dargs.limit_mem /
+					dargs.limit_nr);
+		else
+			cur_mem = 0;
+
+		printf("Number of hash: \t[%llu/%llu]\n", dargs.current_nr,
+			dargs.limit_nr);
+		printf("Memory usage: \t\t[%s/%s]\n",
+			pretty_size(cur_mem),
+			pretty_size(dargs.limit_mem));
+	}
+out:
+	close_file_or_dir(fd, dirstream);
+	return ret;
+}
+
 const struct cmd_group dedupe_ib_cmd_group = {
dedupe_ib_cmd_group_usage, dedupe_ib_cmd_group_info, {
{ "enable", cmd_dedupe_ib_enable, cmd_dedupe_ib_enable_usage,
  NULL, 0},
{ "disable", cmd_dedupe_ib_disable, cmd_dedupe_ib_disable_usage,
  NULL, 0},
+   { "status", cmd_dedupe_ib_status, cmd_dedupe_ib_status_usage,
+ NULL, 0},
NULL_CMD_STRUCT
}
 };
-- 
2.19.1





[PATCH v15.1 09/13] btrfs: introduce type based delalloc metadata reserve

2018-11-05 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce type based metadata reserve parameter for delalloc space
reservation/freeing function.

The problem we are going to solve is, btrfs uses different max extent
sizes for different mount options.

For de-duplication, the max extent size can be set by the dedupe ioctl,
while for normal write it's 128M.
And furthermore, the split/merge extent hooks depend heavily on that max
extent size.

Such situation contributes to quite a lot of false ENOSPC.

So this patch introduces the facility to help solve these false ENOSPC
related to different max extent size.

Currently, only normal 128M extent size is supported. More types will
follow soon.

Signed-off-by: Wang Xiaoguang 
Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ctree.h |  43 ++---
 fs/btrfs/extent-tree.c   |  48 ---
 fs/btrfs/file.c  |  30 +
 fs/btrfs/free-space-cache.c  |   6 +-
 fs/btrfs/inode-map.c |   9 ++-
 fs/btrfs/inode.c | 115 +--
 fs/btrfs/ioctl.c |  23 +++
 fs/btrfs/ordered-data.c  |   6 +-
 fs/btrfs/ordered-data.h  |   3 +-
 fs/btrfs/relocation.c|  22 ---
 fs/btrfs/tests/inode-tests.c |  15 +++--
 11 files changed, 223 insertions(+), 97 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 910050d904ef..b119a19cbeaf 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -92,11 +92,24 @@ static const int btrfs_csum_sizes[] = { 4 };
 /*
  * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size
  */
-static inline u32 count_max_extents(u64 size)
+static inline u32 count_max_extents(u64 size, u64 max_extent_size)
 {
-   return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
+   return div_u64(size + max_extent_size - 1, max_extent_size);
 }
 
+/*
+ * Type based metadata reserve type
+ * This affects how btrfs reserve metadata space for buffered write.
+ *
+ * This is caused by the different max extent size for normal COW
+ * and further in-band dedupe
+ */
+enum btrfs_metadata_reserve_type {
+   BTRFS_RESERVE_NORMAL,
+};
+
+u64 btrfs_max_extent_size(enum btrfs_metadata_reserve_type reserve_type);
+
 struct btrfs_mapping_tree {
struct extent_map_tree map_tree;
 };
@@ -2732,8 +2745,9 @@ int btrfs_check_data_free_space(struct inode *inode,
 void btrfs_free_reserved_data_space(struct inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
 void btrfs_delalloc_release_space(struct inode *inode,
- struct extent_changeset *reserved,
- u64 start, u64 len, bool qgroup_free);
+   struct extent_changeset *reserved,
+   u64 start, u64 len, bool qgroup_free,
+   enum btrfs_metadata_reserve_type reserve_type);
 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
u64 len);
 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
@@ -2743,13 +2757,17 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root 
*root,
 void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
  struct btrfs_block_rsv *rsv);
 void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
-   bool qgroup_free);
+   bool qgroup_free,
+   enum btrfs_metadata_reserve_type reserve_type);
 
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
+   enum btrfs_metadata_reserve_type reserve_type);
 void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
-bool qgroup_free);
+   bool qgroup_free,
+   enum btrfs_metadata_reserve_type reserve_type);
 int btrfs_delalloc_reserve_space(struct inode *inode,
-   struct extent_changeset **reserved, u64 start, u64 len);
+   struct extent_changeset **reserved, u64 start, u64 len,
+   enum btrfs_metadata_reserve_type reserve_type);
 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
  unsigned short type);
@@ -3152,7 +3170,11 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root);
 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
  unsigned int extra_bits,
- struct extent_state **cached_state, int dedupe);
+   

[PATCH v15.1 10/13] btrfs: dedupe: Inband in-memory only de-duplication implement

2018-11-05 Thread Lu Fengqi
From: Qu Wenruo 

Core implementation of inband de-duplication.
It reuses the async_cow_start() facility to calculate the dedupe hash.
The dedupe hash is then used to do inband de-duplication at extent level.

The workflow is as below:
1) Run delalloc range for an inode
2) Calculate hash for the delalloc range at the unit of dedupe_bs
3) For hash match(duplicated) case, just increase source extent ref
   and insert file extent.
   For hash mismatch case, go through the normal cow_file_range()
   fallback, and add hash into dedupe_tree.
   Compress for hash miss case is not supported yet.

The current implementation stores all dedupe hashes in an in-memory rb-tree,
with LRU behavior to enforce the limit.

Signed-off-by: Wang Xiaoguang 
Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ctree.h   |   4 +-
 fs/btrfs/dedupe.h  |  15 ++
 fs/btrfs/extent-tree.c |  31 +++-
 fs/btrfs/extent_io.c   |   7 +-
 fs/btrfs/extent_io.h   |   1 +
 fs/btrfs/file.c|   4 +
 fs/btrfs/inode.c   | 319 ++---
 fs/btrfs/ioctl.c   |   1 +
 fs/btrfs/relocation.c  |  18 +++
 9 files changed, 343 insertions(+), 57 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b119a19cbeaf..3a8e35b5328a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -106,9 +106,11 @@ static inline u32 count_max_extents(u64 size, u64 
max_extent_size)
  */
 enum btrfs_metadata_reserve_type {
BTRFS_RESERVE_NORMAL,
+   BTRFS_RESERVE_DEDUPE,
 };
 
-u64 btrfs_max_extent_size(enum btrfs_metadata_reserve_type reserve_type);
+u64 btrfs_max_extent_size(struct btrfs_inode *inode,
+ enum btrfs_metadata_reserve_type reserve_type);
 
 struct btrfs_mapping_tree {
struct extent_map_tree map_tree;
diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h
index 87f5b7ce7766..8157b17c4d11 100644
--- a/fs/btrfs/dedupe.h
+++ b/fs/btrfs/dedupe.h
@@ -7,6 +7,7 @@
 #define BTRFS_DEDUPE_H
 
 #include 
+#include "btrfs_inode.h"
 
 /* 32 bytes for SHA256 */
 static const int btrfs_hash_sizes[] = { 32 };
@@ -47,6 +48,20 @@ struct btrfs_dedupe_info {
u64 current_nr;
 };
 
+static inline u64 btrfs_dedupe_blocksize(struct btrfs_inode *inode)
+{
+   struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+   return fs_info->dedupe_info->blocksize;
+}
+
+static inline int inode_need_dedupe(struct inode *inode)
+{
+   struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+
+   return fs_info->dedupe_enabled;
+}
+
 static inline int btrfs_dedupe_hash_hit(struct btrfs_dedupe_hash *hash)
 {
return (hash && hash->bytenr);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2c8992b919ae..fa3654045ba8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -28,6 +28,7 @@
 #include "sysfs.h"
 #include "qgroup.h"
 #include "ref-verify.h"
+#include "dedupe.h"
 
 #undef SCRAMBLE_DELAYED_REFS
 
@@ -2492,6 +2493,17 @@ static int cleanup_ref_head(struct btrfs_trans_handle 
*trans,
btrfs_pin_extent(fs_info, head->bytenr,
 head->num_bytes, 1);
if (head->is_data) {
+   /*
+* If insert_reserved is given, it means
+* a new extent is reserved, then deleted
+* in one transaction, and inc/dec get merged to 0.
+*
+* In this case, we need to remove its dedupe
+* hash.
+*/
+   ret = btrfs_dedupe_del(fs_info, head->bytenr);
+   if (ret < 0)
+   return ret;
ret = btrfs_del_csums(trans, fs_info, head->bytenr,
  head->num_bytes);
}
@@ -5913,13 +5925,15 @@ static void btrfs_calculate_inode_block_rsv_size(struct 
btrfs_fs_info *fs_info,
spin_unlock(&block_rsv->lock);
 }
 
-u64 btrfs_max_extent_size(enum btrfs_metadata_reserve_type reserve_type)
+u64 btrfs_max_extent_size(struct btrfs_inode *inode,
+ enum btrfs_metadata_reserve_type reserve_type)
 {
if (reserve_type == BTRFS_RESERVE_NORMAL)
return BTRFS_MAX_EXTENT_SIZE;
-
-   ASSERT(0);
-   return BTRFS_MAX_EXTENT_SIZE;
+   else if (reserve_type == BTRFS_RESERVE_DEDUPE)
+   return btrfs_dedupe_blocksize(inode);
+   else
+   return BTRFS_MAX_EXTENT_SIZE;
 }
 
 int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
@@ -5930,7 +5944,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode 
*inode, u64 num_bytes,
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
int ret = 0;
bool delalloc_lock = true;
-   u64 max_extent_size = btrfs_max_extent_size(reserve_type);
+   u64 max_extent_size = btrfs_max_extent_size(inode, reserve_type);
 
/* If we are a 

[PATCH v15.1 02/13] btrfs: dedupe: Introduce function to initialize dedupe info

2018-11-05 Thread Lu Fengqi
From: Wang Xiaoguang 

Add generic function to initialize dedupe info.

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/Makefile  |   2 +-
 fs/btrfs/dedupe.c  | 169 +
 fs/btrfs/dedupe.h  |  12 +++
 include/uapi/linux/btrfs.h |   3 +
 4 files changed, 185 insertions(+), 1 deletion(-)
 create mode 100644 fs/btrfs/dedupe.c

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index ca693dd554e9..78fdc87dba39 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -10,7 +10,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o 
root-tree.o dir-item.o \
   export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
-  uuid-tree.o props.o free-space-tree.o tree-checker.o
+  uuid-tree.o props.o free-space-tree.o tree-checker.o dedupe.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
new file mode 100644
index ..06523162753d
--- /dev/null
+++ b/fs/btrfs/dedupe.c
@@ -0,0 +1,169 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2016 Fujitsu.  All rights reserved.
+ */
+
+#include "ctree.h"
+#include "dedupe.h"
+#include "btrfs_inode.h"
+#include "delayed-ref.h"
+
+struct inmem_hash {	/* one hash entry of the in-memory dedupe backend */
+   struct rb_node hash_node;	/* linkage in the rb-tree ordered by hash bytes */
+   struct rb_node bytenr_node;	/* linkage in the rb-tree ordered by bytenr */
+   struct list_head lru_list;	/* linkage in dedupe_info->lru_list */
+
+   u64 bytenr;
+   u32 num_bytes;
+
+   u8 hash[];	/* flexible array; sized by btrfs_hash_sizes[algo] at allocation */
+};
+
+/*
+ * Allocate and initialize a btrfs_dedupe_info from the ioctl arguments.
+ *
+ * hash_algo, backend, blocksize and limit_nr are copied from @dargs, the
+ * "sha256" shash transform is allocated, and the rb-trees, the LRU list and
+ * the mutex are set up empty.
+ *
+ * Return the new dedupe_info on success.
+ * Return ERR_PTR(-ENOMEM) if the structure cannot be allocated, or the error
+ * reported by crypto_alloc_shash() converted to an error pointer.
+ */
+static struct btrfs_dedupe_info *
+init_dedupe_info(struct btrfs_ioctl_dedupe_args *dargs)
+{
+	struct btrfs_dedupe_info *dedupe_info;
+
+	dedupe_info = kzalloc(sizeof(*dedupe_info), GFP_NOFS);
+	if (!dedupe_info)
+		return ERR_PTR(-ENOMEM);
+
+	dedupe_info->hash_algo = dargs->hash_algo;
+	dedupe_info->backend = dargs->backend;
+	dedupe_info->blocksize = dargs->blocksize;
+	dedupe_info->limit_nr = dargs->limit_nr;
+
+	/* only support SHA256 yet */
+	dedupe_info->dedupe_driver = crypto_alloc_shash("sha256", 0, 0);
+	if (IS_ERR(dedupe_info->dedupe_driver)) {
+		/*
+		 * The error code is stored inside dedupe_info, so it must be
+		 * read out before the structure is freed.
+		 */
+		long err = PTR_ERR(dedupe_info->dedupe_driver);
+
+		kfree(dedupe_info);
+		return ERR_PTR(err);
+	}
+
+	dedupe_info->hash_root = RB_ROOT;
+	dedupe_info->bytenr_root = RB_ROOT;
+	dedupe_info->current_nr = 0;
+	INIT_LIST_HEAD(&dedupe_info->lru_list);
+	mutex_init(&dedupe_info->lock);
+
+	return dedupe_info;
+}
+
+/*
+ * Helper to check if parameters are valid.
+ * The first invalid field will be set to (-1), to inform the user which
+ * parameter is invalid.
+ * The exception is dargs->limit_nr and dargs->limit_mem: for those, 0 is
+ * returned instead, since the user can specify any limit value except 0.
+ */
+static int check_dedupe_parameter(struct btrfs_fs_info *fs_info,
+ struct btrfs_ioctl_dedupe_args *dargs)
+{
+   u64 blocksize = dargs->blocksize;
+   u64 limit_nr = dargs->limit_nr;
+   u64 limit_mem = dargs->limit_mem;
+   u16 hash_algo = dargs->hash_algo;
+   u8 backend = dargs->backend;
+
+   /*
+* Set all reserved fields to -1, allow user to detect
+* unsupported optional parameters.
+*/
+   memset(dargs->__unused, -1, sizeof(dargs->__unused));
+   if (blocksize > BTRFS_DEDUPE_BLOCKSIZE_MAX ||
+   blocksize < BTRFS_DEDUPE_BLOCKSIZE_MIN ||
+   blocksize < fs_info->sectorsize ||
+   !is_power_of_2(blocksize) ||
+   blocksize < PAGE_SIZE) {
+   dargs->blocksize = (u64)-1;
+   return -EINVAL;
+   }
+   if (hash_algo >= ARRAY_SIZE(btrfs_hash_sizes)) {
+   dargs->hash_algo = (u16)-1;
+   return -EINVAL;
+   }
+   if (backend >= BTRFS_DEDUPE_BACKEND_COUNT) {
+   dargs->backend = (u8)-1;
+   return -EINVAL;
+   }
+
+   /* Backend specific check */
+   if (backend == BTRFS_DEDUPE_BACKEND_INMEMORY) {
+   /* only one limit is accepted for enable*/
+   if (dargs->limit_nr && dargs->limit_mem) {
+   dargs->limit_nr = 0;
+   dargs->limit_mem = 0;
+   return -EINVAL;
+   }
+
+   if (!limit_nr && !limit_mem)
+   dargs->limit_nr = BTRFS_DEDUPE_LIMIT_NR_DEFAULT;
+   else {
+   u64 tmp = (u64)-1;
+
+   if (limit_mem) {
+   tmp = div_u64(limit_mem,
+   (sizeof(struct inmem_hash)) +
+   btrfs_hash_sizes[hash_algo]);
+   /* Too small limit_mem to fill a hash 

[PATCH v15.1 11/13] btrfs: dedupe: Add ioctl for inband deduplication

2018-11-05 Thread Lu Fengqi
From: Wang Xiaoguang 

Add ioctl interface for inband deduplication, which includes:
1) enable
2) disable
3) status

And a pseudo RO compat flag, to imply that btrfs now supports inband
dedup.
However we don't add any ondisk format change, it's just a pseudo RO
compat flag.

All these ioctl interfaces are stateless, which means the caller doesn't need
to care about the previous dedupe state before calling them, and only needs
to care about the final desired state.

For example, if a user wants to enable dedupe with a specified block size and
limit, just fill the ioctl structure and call the enable ioctl.
There is no need to check whether dedupe is already running.

These ioctls will handle things like re-configure or disable quite well.

Also, for invalid parameters, enable ioctl interface will set the field
of the first encountered invalid parameter to (-1) to inform caller.
While for limit_nr/limit_mem, the value will be (0).

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c  | 50 ++
 fs/btrfs/dedupe.h  | 17 +---
 fs/btrfs/disk-io.c |  3 ++
 fs/btrfs/ioctl.c   | 85 ++
 fs/btrfs/sysfs.c   |  2 +
 include/uapi/linux/btrfs.h | 12 +-
 6 files changed, 163 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 6199215022e6..76a967cca68e 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -29,6 +29,35 @@ static inline struct inmem_hash *inmem_alloc_hash(u16 algo)
GFP_NOFS);
 }
 
+/*
+ * Fill @dargs with the current in-band dedupe status of @fs_info.
+ *
+ * If dedupe is not enabled (or no dedupe_info exists), status and all
+ * configuration fields are reported as 0.
+ * Otherwise status is 1 and the fields are copied from dedupe_info under
+ * dedupe_info->lock; limit_mem is derived from limit_nr and the per-hash
+ * memory footprint.
+ * In both cases the reserved __unused area is filled with 0xff bytes.
+ */
+void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
+			 struct btrfs_ioctl_dedupe_args *dargs)
+{
+	struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info;
+
+	if (!fs_info->dedupe_enabled || !dedupe_info) {
+		dargs->status = 0;
+		dargs->blocksize = 0;
+		dargs->backend = 0;
+		dargs->hash_algo = 0;
+		dargs->limit_nr = 0;
+		/* keep limit_mem consistent with limit_nr in the disabled case */
+		dargs->limit_mem = 0;
+		dargs->current_nr = 0;
+		memset(dargs->__unused, -1, sizeof(dargs->__unused));
+		return;
+	}
+	mutex_lock(&dedupe_info->lock);
+	dargs->status = 1;
+	dargs->blocksize = dedupe_info->blocksize;
+	dargs->backend = dedupe_info->backend;
+	dargs->hash_algo = dedupe_info->hash_algo;
+	dargs->limit_nr = dedupe_info->limit_nr;
+	dargs->limit_mem = dedupe_info->limit_nr *
+		(sizeof(struct inmem_hash) +
+		 btrfs_hash_sizes[dedupe_info->hash_algo]);
+	dargs->current_nr = dedupe_info->current_nr;
+	mutex_unlock(&dedupe_info->lock);
+	memset(dargs->__unused, -1, sizeof(dargs->__unused));
+}
+
 static struct btrfs_dedupe_info *
 init_dedupe_info(struct btrfs_ioctl_dedupe_args *dargs)
 {
@@ -402,6 +431,27 @@ static void unblock_all_writers(struct btrfs_fs_info 
*fs_info)
percpu_up_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1);
 }
 
+int btrfs_dedupe_cleanup(struct btrfs_fs_info *fs_info)	/* always returns 0 */
+{
+   struct btrfs_dedupe_info *dedupe_info;
+
+   fs_info->dedupe_enabled = 0;
+   /*
+    * same as disable
+    * NOTE(review): presumably this orders the dedupe_enabled store before
+    * dedupe_info is cleared below - confirm which read side pairs with it.
+    */
+   smp_wmb();
+   dedupe_info = fs_info->dedupe_info;
+   fs_info->dedupe_info = NULL;	/* detach before tearing the structure down */
+
+   if (!dedupe_info)
+   return 0;	/* dedupe was never set up, nothing to free */
+
+   if (dedupe_info->backend == BTRFS_DEDUPE_BACKEND_INMEMORY)
+   inmem_destroy(dedupe_info);	/* drop every cached hash entry */
+
+   crypto_free_shash(dedupe_info->dedupe_driver);
+   kfree(dedupe_info);
+   return 0;
+}
+
 int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
 {
struct btrfs_dedupe_info *dedupe_info;
diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h
index 8157b17c4d11..fdd00355d6b5 100644
--- a/fs/btrfs/dedupe.h
+++ b/fs/btrfs/dedupe.h
@@ -90,6 +90,15 @@ static inline struct btrfs_dedupe_hash 
*btrfs_dedupe_alloc_hash(u16 algo)
 int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dedupe_args *dargs);
 
+
+/*
+ * Get inband dedupe info
+ * Since it needs to access different backends' hash size, which
+ * is not exported, we need such simple function.
+ */
+void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
+struct btrfs_ioctl_dedupe_args *dargs);
+
 /*
  * Disable dedupe and invalidate all its dedupe data.
  * Called at dedupe disable time.
@@ -101,12 +110,10 @@ int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info,
 int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info);
 
 /*
- * Get current dedupe status.
- * Return 0 for success
- * No possible error yet
+ * Cleanup current btrfs_dedupe_info
+ * Called in umount time
  */
-void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
-struct btrfs_ioctl_dedupe_args *dargs);
+int btrfs_dedupe_cleanup(struct btrfs_fs_info *fs_info);
 
 /*
  * Calculate hash for dedupe.
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d1fa9d90cc8f..f15e89d9d26a 100644
--- a/fs/btrfs/disk-io.c
+++ 

[PATCH v15.1 06/13] btrfs: dedupe: Introduce function to search for an existing hash

2018-11-05 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce static function inmem_search() to handle the job for in-memory
hash tree.

The trick is, we must ensure the delayed ref head is not being run at
the time we search for the hash.

With inmem_search(), we can implement the btrfs_dedupe_search()
interface.

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c | 210 +-
 1 file changed, 209 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 951fefd19fde..03ad41423c01 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -7,6 +7,8 @@
 #include "dedupe.h"
 #include "btrfs_inode.h"
 #include "delayed-ref.h"
+#include "qgroup.h"
+#include "transaction.h"
 
 struct inmem_hash {
struct rb_node hash_node;
@@ -242,7 +244,6 @@ static int inmem_add(struct btrfs_dedupe_info *dedupe_info,
struct inmem_hash *ihash;
 
ihash = inmem_alloc_hash(algo);
-
if (!ihash)
return -ENOMEM;
 
@@ -436,3 +437,210 @@ int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
kfree(dedupe_info);
return 0;
 }
+
+/*
+ * Look up @hash in the hash-keyed rb-tree of the in-memory backend.
+ *
+ * Entries are compared with memcmp() over the hash length of the configured
+ * algorithm. On a hit the entry is moved to the head of the LRU list.
+ *
+ * Caller must ensure the corresponding ref head is not being run.
+ * inmem_search() calls this with dedupe_info->lock held.
+ *
+ * Return the matching entry, or NULL if the hash is not in the tree.
+ */
+static struct inmem_hash *
+inmem_search_hash(struct btrfs_dedupe_info *dedupe_info, u8 *hash)
+{
+	struct rb_node **p = &dedupe_info->hash_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct inmem_hash *entry = NULL;
+	u16 hash_algo = dedupe_info->hash_algo;
+	int hash_len = btrfs_hash_sizes[hash_algo];
+
+	while (*p) {
+		int cmp;
+
+		parent = *p;
+		entry = rb_entry(parent, struct inmem_hash, hash_node);
+
+		/* compare once per node instead of once per branch */
+		cmp = memcmp(hash, entry->hash, hash_len);
+		if (cmp < 0) {
+			p = &(*p)->rb_left;
+		} else if (cmp > 0) {
+			p = &(*p)->rb_right;
+		} else {
+			/* Found, need to re-add it to LRU list head */
+			list_move(&entry->lru_list, &dedupe_info->lru_list);
+			return entry;
+		}
+	}
+	return NULL;
+}
+
+static int inmem_search(struct btrfs_dedupe_info *dedupe_info,
+   struct inode *inode, u64 file_pos,
+   struct btrfs_dedupe_hash *hash)
+{
+   int ret;
+   struct btrfs_root *root = BTRFS_I(inode)->root;
+   struct btrfs_trans_handle *trans;
+   struct btrfs_delayed_ref_root *delayed_refs;
+   struct btrfs_delayed_ref_head *head;
+   struct btrfs_delayed_ref_head *insert_head;
+   struct btrfs_delayed_data_ref *insert_dref;
+   struct btrfs_qgroup_extent_record *insert_qrecord = NULL;
+   struct inmem_hash *found_hash;
+   int free_insert = 1;
+   int qrecord_inserted = 0;
+   u64 ref_root = root->root_key.objectid;
+   u64 bytenr;
+   u32 num_bytes;
+
+   insert_head = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
+   if (!insert_head)
+   return -ENOMEM;
+   insert_head->extent_op = NULL;
+
+   insert_dref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
+   if (!insert_dref) {
+   kmem_cache_free(btrfs_delayed_ref_head_cachep, insert_head);
+   return -ENOMEM;
+   }
+   if (test_bit(BTRFS_FS_QUOTA_ENABLED, >fs_info->flags) &&
+   is_fstree(ref_root)) {
+   insert_qrecord = kmalloc(sizeof(*insert_qrecord), GFP_NOFS);
+   if (!insert_qrecord) {
+   kmem_cache_free(btrfs_delayed_ref_head_cachep,
+   insert_head);
+   kmem_cache_free(btrfs_delayed_data_ref_cachep,
+   insert_dref);
+   return -ENOMEM;
+   }
+   }
+
+   trans = btrfs_join_transaction(root);
+   if (IS_ERR(trans)) {
+   ret = PTR_ERR(trans);
+   goto free_mem;
+   }
+
+again:
+   mutex_lock(_info->lock);
+   found_hash = inmem_search_hash(dedupe_info, hash->hash);
+   /* If we don't find a duplicated extent, just return. */
+   if (!found_hash) {
+   ret = 0;
+   goto out;
+   }
+   bytenr = found_hash->bytenr;
+   num_bytes = found_hash->num_bytes;
+
+   btrfs_init_delayed_ref_head(insert_head, insert_qrecord, bytenr,
+   num_bytes, ref_root, 0, BTRFS_ADD_DELAYED_REF, true,
+   false);
+
+   btrfs_init_delayed_ref_common(trans->fs_info, _dref->node,
+   bytenr, num_bytes, ref_root, BTRFS_ADD_DELAYED_REF,
+   BTRFS_EXTENT_DATA_REF_KEY);
+   insert_dref->root = ref_root;
+   insert_dref->parent = 0;
+   insert_dref->objectid = btrfs_ino(BTRFS_I(inode));
+   insert_dref->offset = 

[PATCH v15.1 01/13] btrfs: dedupe: Introduce dedupe framework and its header

2018-11-05 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce the header for btrfs in-band(write time) de-duplication
framework and needed header.

The new de-duplication framework is going to support 2 different dedupe
methods and 1 dedupe hash.

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/ctree.h   |   7 ++
 fs/btrfs/dedupe.h  | 128 -
 fs/btrfs/disk-io.c |   1 +
 include/uapi/linux/btrfs.h |  34 ++
 4 files changed, 168 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 80953528572d..910050d904ef 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1118,6 +1118,13 @@ struct btrfs_fs_info {
spinlock_t ref_verify_lock;
struct rb_root block_tree;
 #endif
+
+   /*
+* Inband de-duplication related structures
+*/
+   unsigned long dedupe_enabled:1;
+   struct btrfs_dedupe_info *dedupe_info;
+   struct mutex dedupe_ioctl_lock;
 };
 
 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h
index 90281a7a35a8..222ce7b4d827 100644
--- a/fs/btrfs/dedupe.h
+++ b/fs/btrfs/dedupe.h
@@ -6,7 +6,131 @@
 #ifndef BTRFS_DEDUPE_H
 #define BTRFS_DEDUPE_H
 
-/* later in-band dedupe will expand this struct */
-struct btrfs_dedupe_hash;
+#include <crypto/hash.h>
 
+/* 32 bytes for SHA256 */
+static const int btrfs_hash_sizes[] = { 32 };
+
+/*
+ * For callers outside of dedupe.c
+ *
+ * Different dedupe backends should have their own hash structure
+ */
+struct btrfs_dedupe_hash {
+   u64 bytenr;	/* non-zero means a hash hit, see btrfs_dedupe_hash_hit() */
+   u32 num_bytes;
+
+   /* last field is a variable length array of dedupe hash */
+   u8 hash[];
+};
+
+struct btrfs_dedupe_info {	/* per-filesystem state of in-band dedupe */
+   /* dedupe blocksize */
+   u64 blocksize;
+   u16 backend;	/* compared against BTRFS_DEDUPE_BACKEND_* values */
+   u16 hash_algo;	/* index into btrfs_hash_sizes[] */
+
+   struct crypto_shash *dedupe_driver;	/* shash transform used for hashing */
+
+   /*
+    * Use mutex to protect both backends
+    * Even for in-memory backends, the rb-tree can be quite large,
+    * so mutex is better for such use case.
+    */
+   struct mutex lock;
+
+   /* following members are only used in in-memory backend */
+   struct rb_root hash_root;	/* inmem_hash entries ordered by hash */
+   struct rb_root bytenr_root;	/* inmem_hash entries ordered by bytenr */
+   struct list_head lru_list;	/* a found hash is moved to the list head */
+   u64 limit_nr;	/* presumably the maximum number of cached hashes - confirm */
+   u64 current_nr;	/* decremented whenever an entry is deleted */
+};
+
+/* Return 1 if @hash is non-NULL and records a non-zero bytenr, else 0. */
+static inline int btrfs_dedupe_hash_hit(struct btrfs_dedupe_hash *hash)
+{
+	if (!hash)
+		return 0;
+	return hash->bytenr != 0;
+}
+
+/*
+ * Initialize inband dedupe info
+ * Called at dedupe enable time.
+ *
+ * Return 0 for success
+ * Return <0 for any error
+ * (from unsupported param to tree creation error for some backends)
+ */
+int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info,
+   struct btrfs_ioctl_dedupe_args *dargs);
+
+/*
+ * Disable dedupe and invalidate all its dedupe data.
+ * Called at dedupe disable time.
+ *
+ * Return 0 for success
+ * Return <0 for any error
+ * (tree operation error for some backends)
+ */
+int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info);
+
+/*
+ * Get current dedupe status.
+ * Return 0 for success
+ * No possible error yet
+ */
+void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
+struct btrfs_ioctl_dedupe_args *dargs);
+
+/*
+ * Calculate hash for dedupe.
+ * Caller must ensure [start, start + dedupe_bs) has valid data.
+ *
+ * Return 0 for success
+ * Return <0 for any error
+ * (error from hash codes)
+ */
+int btrfs_dedupe_calc_hash(struct btrfs_fs_info *fs_info,
+  struct inode *inode, u64 start,
+  struct btrfs_dedupe_hash *hash);
+
+/*
+ * Search for duplicated extents by calculated hash
+ * Caller must call btrfs_dedupe_calc_hash() first to get the hash.
+ *
+ * @inode: the inode we are writing to
+ * @file_pos: offset inside the inode
+ * As we will increase extent ref immediately after a hash match,
+ * we need @file_pos and @inode in this case.
+ *
+ * Return > 0 for a hash match, and the extent ref will be
+ * *INCREASED*, and hash->bytenr/num_bytes will record the existing
+ * extent data.
+ * Return 0 for a hash miss. Nothing is done
+ * Return <0 for any error
+ * (tree operation error for some backends)
+ */
+int btrfs_dedupe_search(struct btrfs_fs_info *fs_info,
+   struct inode *inode, u64 file_pos,
+   struct btrfs_dedupe_hash *hash);
+
+/*
+ * Add a dedupe hash into dedupe info
+ * Return 0 for success
+ * Return <0 for any error
+ * (tree operation error for some backends)
+ */
+int btrfs_dedupe_add(struct btrfs_fs_info *fs_info,
+struct btrfs_dedupe_hash *hash);
+
+/*
+ * Remove a dedupe hash from dedupe info
+ * Return 0 for success
+ * Return <0 for any error
+ * (tree operation error for some backends)
+ *
+ * NOTE: if a hash deletion error is not handled well, it will lead
+ * to a corrupted fs, as a later dedupe write can point to a non-existent
+ * or even wrong extent.
+ */
+int 

[PATCH v15.1 04/13] btrfs: dedupe: Introduce function to remove hash from in-memory tree

2018-11-05 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce static function inmem_del() to remove hash from in-memory
dedupe tree.
And implement btrfs_dedupe_del() and btrfs_dedupe_disable() interfaces.

Also for btrfs_dedupe_disable(), add new functions to wait for existing
writers and block incoming writers to eliminate all possible races.

Cc: Mark Fasheh 
Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c | 131 +++---
 1 file changed, 125 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 784bb3a8a5ab..951fefd19fde 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -170,12 +170,6 @@ int btrfs_dedupe_enable(struct btrfs_fs_info *fs_info,
return ret;
 }
 
-int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
-{
-   /* Place holder for bisect, will be implemented in later patches */
-   return 0;
-}
-
 static int inmem_insert_hash(struct rb_root *root,
 struct inmem_hash *hash, int hash_len)
 {
@@ -317,3 +311,128 @@ int btrfs_dedupe_add(struct btrfs_fs_info *fs_info,
return inmem_add(dedupe_info, hash);
return -EINVAL;
 }
+
+/*
+ * Look up the entry recorded for @bytenr in the bytenr-keyed rb-tree.
+ *
+ * Return the matching entry, or NULL if no entry has that bytenr.
+ * inmem_del() calls this with dedupe_info->lock held.
+ */
+static struct inmem_hash *
+inmem_search_bytenr(struct btrfs_dedupe_info *dedupe_info, u64 bytenr)
+{
+	struct rb_node **p = &dedupe_info->bytenr_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct inmem_hash *entry = NULL;
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct inmem_hash, bytenr_node);
+
+		if (bytenr < entry->bytenr)
+			p = &(*p)->rb_left;
+		else if (bytenr > entry->bytenr)
+			p = &(*p)->rb_right;
+		else
+			return entry;
+	}
+
+	return NULL;
+}
+
+/*
+ * Delete the hash recorded for @bytenr from the in-memory dedupe tree.
+ *
+ * Takes dedupe_info->lock for the lookup and the removal. A missing entry
+ * is not an error. Always returns 0.
+ */
+static int inmem_del(struct btrfs_dedupe_info *dedupe_info, u64 bytenr)
+{
+	struct inmem_hash *hash;
+
+	mutex_lock(&dedupe_info->lock);
+	hash = inmem_search_bytenr(dedupe_info, bytenr);
+	if (hash)
+		__inmem_del(dedupe_info, hash);
+	mutex_unlock(&dedupe_info->lock);
+	return 0;
+}
+
+/*
+ * Remove the dedupe hash recorded for @bytenr.
+ *
+ * Return 0 if dedupe is disabled or the backend removed the hash.
+ * Return -EINVAL (with a warning) if dedupe is enabled but no dedupe_info
+ * exists, or if the backend is not the in-memory one.
+ */
+int btrfs_dedupe_del(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+	struct btrfs_dedupe_info *info = fs_info->dedupe_info;
+
+	if (!fs_info->dedupe_enabled)
+		return 0;
+
+	if (WARN_ON(!info))
+		return -EINVAL;
+
+	switch (info->backend) {
+	case BTRFS_DEDUPE_BACKEND_INMEMORY:
+		return inmem_del(info, bytenr);
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * Delete every entry of the in-memory backend.
+ *
+ * Walks the LRU list under dedupe_info->lock and removes each entry with
+ * __inmem_del(); the _safe iterator is needed because entries are freed
+ * while iterating.
+ */
+static void inmem_destroy(struct btrfs_dedupe_info *dedupe_info)
+{
+	struct inmem_hash *entry, *tmp;
+
+	mutex_lock(&dedupe_info->lock);
+	list_for_each_entry_safe(entry, tmp, &dedupe_info->lru_list, lru_list)
+		__inmem_del(dedupe_info, entry);
+	mutex_unlock(&dedupe_info->lock);
+}
+
+/*
+ * Helper function to wait for and block all incoming writers
+ *
+ * Use the rw_sem introduced for freeze to wait for/block writers, then take
+ * sb->s_umount for writing.
+ * During the block time no new write will happen, so we can do something
+ * quite safely. This is especially helpful for dedupe disable, as it
+ * affects buffered write.
+ *
+ * Must be paired with unblock_all_writers().
+ */
+static void block_all_writers(struct btrfs_fs_info *fs_info)
+{
+	struct super_block *sb = fs_info->sb;
+
+	percpu_down_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1);
+	down_write(&sb->s_umount);
+}
+
+/*
+ * Undo block_all_writers(): release sb->s_umount and the freeze write
+ * rw_sem, in the reverse order of acquisition.
+ */
+static void unblock_all_writers(struct btrfs_fs_info *fs_info)
+{
+	struct super_block *sb = fs_info->sb;
+
+	up_write(&sb->s_umount);
+	percpu_up_write(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1);
+}
+
+int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
+{
+   struct btrfs_dedupe_info *dedupe_info;
+   int ret;
+
+   dedupe_info = fs_info->dedupe_info;
+
+   if (!dedupe_info)
+   return 0;	/* nothing set up: disabling is a no-op */
+
+   /* Don't allow disable status change in RO mount */
+   if (fs_info->sb->s_flags & MS_RDONLY)
+   return -EROFS;
+
+   /*
+    * Wait for all unfinished writers and block further writers.
+    * Then sync the whole fs so all current write will go through
+    * dedupe, and all later write won't go through dedupe.
+    *
+    * dedupe_enabled and dedupe_info are cleared while writers are still
+    * blocked, whether or not the sync succeeded.
+    */
+   block_all_writers(fs_info);
+   ret = sync_filesystem(fs_info->sb);
+   fs_info->dedupe_enabled = 0;
+   fs_info->dedupe_info = NULL;
+   unblock_all_writers(fs_info);
+   if (ret < 0)
+   return ret;	/* NOTE(review): dedupe_info was detached above but is not freed on this path - looks like a leak; confirm */
+
+   /* now we are OK to clean up everything */
+   if (dedupe_info->backend == BTRFS_DEDUPE_BACKEND_INMEMORY)
+   inmem_destroy(dedupe_info);
+
+   crypto_free_shash(dedupe_info->dedupe_driver);
+   kfree(dedupe_info);
+   return 0;
+}
-- 
2.19.1





[PATCH v15.1 13/13] btrfs: dedupe: Introduce new reconfigure ioctl

2018-11-05 Thread Lu Fengqi
From: Qu Wenruo 

Introduce new reconfigure ioctl and new FORCE flag for in-band dedupe
ioctls.

Now dedupe enable and reconfigure ioctl are stateful.


| Current state |   Ioctl| Next state  |

| Disabled  |  enable| Enabled |
| Enabled   |  enable| Not allowed |
| Enabled   |  reconf| Enabled |
| Enabled   |  disable   | Disabled|
| Disabled  |  disable   | Disabled|
| Disabled  |  reconf| Not allowed |

(While disable is always stateless)

For those who prefer stateless ioctls (myself for example), a new FORCE
flag is introduced.

In FORCE mode, enable/disable is completely stateless.

| Current state |   Ioctl| Next state  |

| Disabled  |  enable| Enabled |
| Enabled   |  enable| Enabled |
| Enabled   |  disable   | Disabled|
| Disabled  |  disable   | Disabled|


Also, the re-configure ioctl will only modify the specified fields.
This is unlike enable, where unspecified fields are filled with default values.

For example:
 # btrfs dedupe enable --block-size 64k /mnt
 # btrfs dedupe reconfigure --limit-hash 1m /mnt
Will lead to:
 dedupe blocksize: 64K
 dedupe hash limit nr: 1m

While for enable:
 # btrfs dedupe enable --force --block-size 64k /mnt
 # btrfs dedupe enable --force --limit-hash 1m /mnt
Will reset blocksize to default value:
 dedupe blocksize: 128K << reset
 dedupe hash limit nr: 1m

Suggested-by: David Sterba 
Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c  | 132 ++---
 fs/btrfs/dedupe.h  |  13 
 fs/btrfs/ioctl.c   |  13 
 include/uapi/linux/btrfs.h |  11 +++-
 4 files changed, 143 insertions(+), 26 deletions(-)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 76a967cca68e..92152134d3c0 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -29,6 +29,40 @@ static inline struct inmem_hash *inmem_alloc_hash(u16 algo)
GFP_NOFS);
 }
 
+/*
+ * Fill @dargs from the current dedupe info.
+ *
+ * For any command other than BTRFS_DEDUPE_CTL_RECONF every field is copied.
+ * For the reconf command only the fields the caller left uninitialized are
+ * filled in: (-1) marks an unset blocksize/backend/hash_algo, while the
+ * limits count as unset when both limit_nr and limit_mem are 0.
+ */
+static void get_dedupe_status(struct btrfs_dedupe_info *dedupe_info,
+			      struct btrfs_ioctl_dedupe_args *dargs)
+{
+	const bool reconf = (dargs->cmd == BTRFS_DEDUPE_CTL_RECONF);
+
+	dargs->status = 1;
+
+	if (!reconf || dargs->blocksize == (u64)-1)
+		dargs->blocksize = dedupe_info->blocksize;
+
+	if (!reconf || dargs->backend == (u16)-1)
+		dargs->backend = dedupe_info->backend;
+
+	if (!reconf || dargs->hash_algo == (u16)-1)
+		dargs->hash_algo = dedupe_info->hash_algo;
+
+	/* Unlike the fields above, an untouched limit is 0, not (-1) */
+	if (!reconf || (!dargs->limit_nr && !dargs->limit_mem)) {
+		dargs->limit_nr = dedupe_info->limit_nr;
+		dargs->limit_mem = dedupe_info->limit_nr *
+			(sizeof(struct inmem_hash) +
+			 btrfs_hash_sizes[dedupe_info->hash_algo]);
+	}
+
+	/* current_nr is meaningless when reconfiguring, so leave it alone */
+	if (reconf)
+		return;
+	dargs->current_nr = dedupe_info->current_nr;
+}
+
 void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
 struct btrfs_ioctl_dedupe_args *dargs)
 {
@@ -45,15 +79,7 @@ void btrfs_dedupe_status(struct btrfs_fs_info *fs_info,
return;
}
mutex_lock(_info->lock);
-   dargs->status = 1;
-   dargs->blocksize = dedupe_info->blocksize;
-   dargs->backend = dedupe_info->backend;
-   dargs->hash_algo = dedupe_info->hash_algo;
-   dargs->limit_nr = dedupe_info->limit_nr;
-   dargs->limit_mem = dedupe_info->limit_nr *
-   (sizeof(struct inmem_hash) +
-btrfs_hash_sizes[dedupe_info->hash_algo]);
-   dargs->current_nr = dedupe_info->current_nr;
+   get_dedupe_status(dedupe_info, dargs);
mutex_unlock(_info->lock);
memset(dargs->__unused, -1, sizeof(dargs->__unused));
 }
@@ -98,17 +124,50 @@ init_dedupe_info(struct btrfs_ioctl_dedupe_args *dargs)
 static int check_dedupe_parameter(struct btrfs_fs_info *fs_info,
  struct btrfs_ioctl_dedupe_args *dargs)
 {
-   u64 blocksize = dargs->blocksize;
-   u64 limit_nr = dargs->limit_nr;
-   u64 limit_mem = dargs->limit_mem;
-   u16 hash_algo = dargs->hash_algo;
-   u8 backend = dargs->backend;
+   struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info;
+
+   u64 blocksize;
+   u64 limit_nr;
+   u64 limit_mem;
+ 

[PATCH v15.1 12/13] btrfs: relocation: Enhance error handling to avoid BUG_ON

2018-11-05 Thread Lu Fengqi
From: Qu Wenruo 

Since the introduction of btrfs dedupe tree, it's possible that balance can
race with dedupe disabling.

When this happens, dedupe_enabled will make btrfs_get_fs_root() return
ERR_PTR(-ENOENT).
But due to a bug in the error handling branch, when this happens
backref_cache->nr_nodes is increased, yet the node is neither added to
backref_cache nor is nr_nodes decreased again.
This causes the BUG_ON() in backref_cache_cleanup():

[ 2611.668810] [ cut here ]
[ 2611.669946] kernel BUG at
/home/sat/ktest/linux/fs/btrfs/relocation.c:243!
[ 2611.670572] invalid opcode:  [#1] SMP
[ 2611.686797] Call Trace:
[ 2611.687034]  []
btrfs_relocate_block_group+0x1b3/0x290 [btrfs]
[ 2611.687706]  []
btrfs_relocate_chunk.isra.40+0x47/0xd0 [btrfs]
[ 2611.688385]  [] btrfs_balance+0xb22/0x11e0 [btrfs]
[ 2611.688966]  [] btrfs_ioctl_balance+0x391/0x3a0
[btrfs]
[ 2611.689587]  [] btrfs_ioctl+0x1650/0x2290 [btrfs]
[ 2611.690145]  [] ? lru_cache_add+0x3a/0x80
[ 2611.690647]  [] ?
lru_cache_add_active_or_unevictable+0x4c/0xc0
[ 2611.691310]  [] ? handle_mm_fault+0xcd4/0x17f0
[ 2611.691842]  [] ? cp_new_stat+0x153/0x180
[ 2611.692342]  [] ? __vma_link_rb+0xfd/0x110
[ 2611.692842]  [] ? vma_link+0xb9/0xc0
[ 2611.693303]  [] do_vfs_ioctl+0xa1/0x5a0
[ 2611.693781]  [] ? __do_page_fault+0x1b4/0x400
[ 2611.694310]  [] SyS_ioctl+0x41/0x70
[ 2611.694758]  [] entry_SYSCALL_64_fastpath+0x12/0x71
[ 2611.695331] Code: ff 48 8b 45 bf 49 83 af a8 05 00 00 01 49 89 87 a0
05 00 00 e9 2e fd ff ff b8 f4 ff ff ff e9 e4 fb ff ff 0f 0b 0f 0b 0f 0b
0f 0b <0f> 0b 0f 0b 41 89 c6 e9 b8 fb ff ff e8 9e a6 e8 e0 4c 89 e7 44
[ 2611.697870] RIP  []
relocate_block_group+0x741/0x7a0 [btrfs]
[ 2611.698818]  RSP 

This patch calls remove_backref_node() in the error handling branch, and
catches the returned -ENOENT in relocate_tree_blocks() to continue
balancing.

Reported-by: Satoru Takeuchi 
Signed-off-by: Qu Wenruo 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/relocation.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b7c304c6e741..ee96390d1e42 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -854,6 +854,13 @@ struct backref_node *build_backref_tree(struct 
reloc_control *rc,
root = read_fs_root(rc->extent_root->fs_info, key.offset);
if (IS_ERR(root)) {
err = PTR_ERR(root);
+   /*
+* Don't forget to cleanup current node.
+* As it may not be added to backref_cache but nr_node
+* increased.
+* This will cause BUG_ON() in backref_cache_cleanup().
+*/
+   remove_backref_node(>backref_cache, cur);
goto out;
}
 
@@ -3021,8 +3028,15 @@ int relocate_tree_blocks(struct btrfs_trans_handle 
*trans,
node = build_backref_tree(rc, >key,
  block->level, block->bytenr);
if (IS_ERR(node)) {
+   /*
+* The root(dedupe tree yet) of the tree block is
+* going to be freed and can't be reached.
+* Just skip it and continue balancing.
+*/
+   if (PTR_ERR(node) == -ENOENT)
+   continue;
err = PTR_ERR(node);
-   goto out;
+   break;
}
 
ret = relocate_tree_block(trans, rc, node, >key,
@@ -3030,10 +3044,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle 
*trans,
if (ret < 0) {
if (ret != -EAGAIN || >rb_node == 
rb_first(blocks))
err = ret;
-   goto out;
+   break;
}
}
-out:
err = finish_pending_nodes(trans, rc, path, err);
 
 out_free_path:
-- 
2.19.1





[PATCH v15.1 00/13] Btrfs In-band De-duplication

2018-11-05 Thread Lu Fengqi
This patchset can be fetched from github:
https://github.com/littleroad/linux.git dedupe_latest

Now the new base is v4.20-rc1.

Normal test cases from auto group exposes no regression, and ib-dedupe
group can pass without problem.

xfstests ib-dedupe group can be fetched from github:
https://github.com/littleroad/xfstests-dev.git btrfs_dedupe_latest

Changelog:
v2:
  Totally reworked to handle multiple backends
v3:
  Fix a stupid but deadly on-disk backend bug
  Add handle for multiple hash on same bytenr corner case to fix abort
  trans error
  Increase dedup rate by enhancing delayed ref handler for both backend.
  Move dedup_add() to run_delayed_ref() time, to fix abort trans error.
  Increase dedup block size up limit to 8M.
v4:
  Add dedup prop for disabling dedup for given files/dirs.
  Merge inmem_search() and ondisk_search() into generic_search() to save
  some code
  Fix another delayed_ref related bug.
  Use the same mutex for both inmem and ondisk backend.
  Move dedup_add() back to btrfs_finish_ordered_io() to increase dedup
  rate.
v5:
  Reuse compress routine for much simpler dedup function.
  Slightly improved performance due to above modification.
  Fix race between dedup enable/disable
  Fix for false ENOSPC report
v6:
  Further enable/disable race window fix.
  Minor format change according to checkpatch.
v7:
  Fix one concurrency bug with balance.
  Slightly modify return value from -EINVAL to -EOPNOTSUPP for
  btrfs_dedup_ioctl() to allow progs to distinguish unsupported commands
  and wrong parameter.
  Rebased to integration-4.6.
v8:
  Rename 'dedup' to 'dedupe'.
  Add support to allow dedupe and compression work at the same time.
  Fix several balance related bugs. Special thanks to Satoru Takeuchi,
  who exposed most of them.
  Small dedupe hit case performance improvement.
v9:
  Re-order the patchset to completely separate pure in-memory and any
  on-disk format change.
  Fold bug fixes into its original patch.
v10:
  Adding back missing bug fix patch.
  Reduce on-disk item size.
  Hide dedupe ioctl under CONFIG_BTRFS_DEBUG.
v11:
  Remove other backend and props support to focus on the framework and
  in-memory backend. Suggested by David.
  Better disable and buffered write race protection.
  Comprehensive fix to dedupe metadata ENOSPC problem.
v12:
  Stateful 'enable' ioctl and new 'reconf' ioctl
  New FORCE flag for enable ioctl to allow stateless ioctl
  Precise error report and extendable ioctl structure.
v12.1
  Rebase to David's for-next-20160704 branch
  Add co-ordinate patch for subpage and dedupe patchset.
v12.2
  Rebase to David's for-next-20160715 branch
  Add co-ordinate patch for other patchset.
v13
  Rebase to David's for-next-20160906 branch
  Fix a reserved space leak bug, which only frees quota reserved space
  but not space_info->byte_may_use.
v13.1
  Rebase to Chris' for-linux-4.9 branch
v14
  Use generic ENOSPC fix for both compression and dedupe.
v14.1
  Further split ENOSPC fix.
v14.2
  Rebase to v4.11-rc2.
  Co-operate with count_max_extent() to calculate num_extents.
  No longer rely on qgroup fixes.
v14.3
  Rebase to v4.12-rc1.
v14.4
  Rebase to kdave/for-4.13-part1.
v14.5
  Rebase to v4.15-rc3.
v14.6
  Rebase to v4.17-rc5.
v14.7
  Replace SHASH_DESC_ON_STACK with kmalloc to remove VLA.
  Fixed the following errors by switching to div_u64.
  ├── arm-allmodconfig
  │   └── ERROR:__aeabi_uldivmod-fs-btrfs-btrfs.ko-undefined
  └── i386-allmodconfig
  └── ERROR:__udivdi3-fs-btrfs-btrfs.ko-undefined
v14.8
  Rebase to v4.18-rc4.
v15
  Rebase to v4.19-rc2.
  Drop "btrfs: Introduce COMPRESS reserve type to fix false enospc for 
compression".
  Remove the ifdef around btrfs inband dedupe ioctl.
v15.1
  Rebase to v4.20-rc1.

Qu Wenruo (4):
  btrfs: delayed-ref: Add support for increasing data ref under spinlock
  btrfs: dedupe: Inband in-memory only de-duplication implement
  btrfs: relocation: Enhance error handling to avoid BUG_ON
  btrfs: dedupe: Introduce new reconfigure ioctl

Wang Xiaoguang (9):
  btrfs: dedupe: Introduce dedupe framework and its header
  btrfs: dedupe: Introduce function to initialize dedupe info
  btrfs: dedupe: Introduce function to add hash into in-memory tree
  btrfs: dedupe: Introduce function to remove hash from in-memory tree
  btrfs: dedupe: Introduce function to search for an existing hash
  btrfs: dedupe: Implement btrfs_dedupe_calc_hash interface
  btrfs: ordered-extent: Add support for dedupe
  btrfs: introduce type based delalloc metadata reserve
  btrfs: dedupe: Add ioctl for inband deduplication

 fs/btrfs/Makefile|   2 +-
 fs/btrfs/ctree.h |  52 ++-
 fs/btrfs/dedupe.c| 828 +++
 fs/btrfs/dedupe.h| 175 +++-
 fs/btrfs/delayed-ref.c   |  53 ++-
 fs/btrfs/delayed-ref.h   |  15 +
 fs/btrfs/disk-io.c   |   4 +
 fs/btrfs/extent-tree.c   |  67 ++-
 fs/btrfs/extent_io.c |   7 +-
 fs/btrfs/extent_io.h |   1 +
 

[PATCH v15.1 03/13] btrfs: dedupe: Introduce function to add hash into in-memory tree

2018-11-05 Thread Lu Fengqi
From: Wang Xiaoguang 

Introduce static function inmem_add() to add hash into in-memory tree.
And now we can implement the btrfs_dedupe_add() interface.

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c | 150 ++
 1 file changed, 150 insertions(+)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 06523162753d..784bb3a8a5ab 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -19,6 +19,14 @@ struct inmem_hash {
u8 hash[];
 };
 
+static inline struct inmem_hash *inmem_alloc_hash(u16 algo)
+{
+   if (WARN_ON(algo >= ARRAY_SIZE(btrfs_hash_sizes)))
+   return NULL;
+   return kzalloc(sizeof(struct inmem_hash) + btrfs_hash_sizes[algo],
+   GFP_NOFS);
+}
+
 static struct btrfs_dedupe_info *
 init_dedupe_info(struct btrfs_ioctl_dedupe_args *dargs)
 {
@@ -167,3 +175,145 @@ int btrfs_dedupe_disable(struct btrfs_fs_info *fs_info)
/* Place holder for bisect, will be implemented in later patches */
return 0;
 }
+
+static int inmem_insert_hash(struct rb_root *root,
+struct inmem_hash *hash, int hash_len)
+{
+   struct rb_node **p = &root->rb_node;
+   struct rb_node *parent = NULL;
+   struct inmem_hash *entry = NULL;
+
+   while (*p) {
+   parent = *p;
+   entry = rb_entry(parent, struct inmem_hash, hash_node);
+   if (memcmp(hash->hash, entry->hash, hash_len) < 0)
+   p = &(*p)->rb_left;
+   else if (memcmp(hash->hash, entry->hash, hash_len) > 0)
+   p = &(*p)->rb_right;
+   else
+   return 1;
+   }
+   rb_link_node(&hash->hash_node, parent, p);
+   rb_insert_color(&hash->hash_node, root);
+   return 0;
+}
+
+static int inmem_insert_bytenr(struct rb_root *root,
+  struct inmem_hash *hash)
+{
+   struct rb_node **p = &root->rb_node;
+   struct rb_node *parent = NULL;
+   struct inmem_hash *entry = NULL;
+
+   while (*p) {
+   parent = *p;
+   entry = rb_entry(parent, struct inmem_hash, bytenr_node);
+   if (hash->bytenr < entry->bytenr)
+   p = &(*p)->rb_left;
+   else if (hash->bytenr > entry->bytenr)
+   p = &(*p)->rb_right;
+   else
+   return 1;
+   }
+   rb_link_node(&hash->bytenr_node, parent, p);
+   rb_insert_color(&hash->bytenr_node, root);
+   return 0;
+}
+
+static void __inmem_del(struct btrfs_dedupe_info *dedupe_info,
+   struct inmem_hash *hash)
+{
+   list_del(&hash->lru_list);
+   rb_erase(&hash->hash_node, &dedupe_info->hash_root);
+   rb_erase(&hash->bytenr_node, &dedupe_info->bytenr_root);
+
+   if (!WARN_ON(dedupe_info->current_nr == 0))
+   dedupe_info->current_nr--;
+
+   kfree(hash);
+}
+
+/*
+ * Insert a hash into in-memory dedupe tree
+ * Will remove the least recently used hashes when the limit is exceeded.
+ *
+ * If the hash matches an existing one, we won't insert it, to
+ * save memory
+ */
+static int inmem_add(struct btrfs_dedupe_info *dedupe_info,
+struct btrfs_dedupe_hash *hash)
+{
+   int ret = 0;
+   u16 algo = dedupe_info->hash_algo;
+   struct inmem_hash *ihash;
+
+   ihash = inmem_alloc_hash(algo);
+
+   if (!ihash)
+   return -ENOMEM;
+
+   /* Copy the data out */
+   ihash->bytenr = hash->bytenr;
+   ihash->num_bytes = hash->num_bytes;
+   memcpy(ihash->hash, hash->hash, btrfs_hash_sizes[algo]);
+
+   mutex_lock(&dedupe_info->lock);
+
+   ret = inmem_insert_bytenr(&dedupe_info->bytenr_root, ihash);
+   if (ret > 0) {
+   kfree(ihash);
+   ret = 0;
+   goto out;
+   }
+
+   ret = inmem_insert_hash(&dedupe_info->hash_root, ihash,
+   btrfs_hash_sizes[algo]);
+   if (ret > 0) {
+   /*
+* We only keep one hash in tree to save memory, so if
+* hash conflicts, free the one to insert.
+*/
+   rb_erase(&ihash->bytenr_node, &dedupe_info->bytenr_root);
+   kfree(ihash);
+   ret = 0;
+   goto out;
+   }
+
+   list_add(&ihash->lru_list, &dedupe_info->lru_list);
+   dedupe_info->current_nr++;
+
+   /* Remove the last dedupe hash if we exceed limit */
+   while (dedupe_info->current_nr > dedupe_info->limit_nr) {
+   struct inmem_hash *last;
+
+   last = list_entry(dedupe_info->lru_list.prev,
+ struct inmem_hash, lru_list);
+   __inmem_del(dedupe_info, last);
+   }
+out:
+   mutex_unlock(&dedupe_info->lock);
+   return 0;
+}
+
+int btrfs_dedupe_add(struct btrfs_fs_info *fs_info,
+struct btrfs_dedupe_hash *hash)
+{
+   struct btrfs_dedupe_info *dedupe_info 

[PATCH v15.1 05/13] btrfs: delayed-ref: Add support for increasing data ref under spinlock

2018-11-05 Thread Lu Fengqi
From: Qu Wenruo 

For in-band dedupe, btrfs needs to increase data ref with delayed_ref
locked, so add a new function btrfs_add_delayed_data_ref_lock() to
increase extent ref with delayed_refs already locked. Export
init_delayed_ref_head and init_delayed_ref_common for inband dedupe.

Signed-off-by: Qu Wenruo 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/delayed-ref.c | 53 +-
 fs/btrfs/delayed-ref.h | 15 
 2 files changed, 52 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 9301b3ad9217..ae8968f10ce0 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -533,7 +533,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_root 
*delayed_refs,
spin_unlock(&existing->lock);
 }
 
-static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
+void btrfs_init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
  struct btrfs_qgroup_extent_record *qrecord,
  u64 bytenr, u64 num_bytes, u64 ref_root,
  u64 reserved, int action, bool is_data,
@@ -661,7 +661,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
 }
 
 /*
- * init_delayed_ref_common - Initialize the structure which represents a
+ * btrfs_init_delayed_ref_common - Initialize the structure which represents a
  *  modification to a an extent.
  *
  * @fs_info:Internal to the mounted filesystem mount structure.
@@ -685,7 +685,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
  * when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/
  * BTRFS_EXTENT_DATA_REF_KEY when recording data extent
  */
-static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
+void btrfs_init_delayed_ref_common(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_node *ref,
u64 bytenr, u64 num_bytes, u64 ref_root,
int action, u8 ref_type)
@@ -758,14 +758,14 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle 
*trans,
else
ref_type = BTRFS_TREE_BLOCK_REF_KEY;
 
-   init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
-   ref_root, action, ref_type);
+   btrfs_init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
+ ref_root, action, ref_type);
ref->root = ref_root;
ref->parent = parent;
ref->level = level;
 
-   init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
- ref_root, 0, action, false, is_system);
+   btrfs_init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
+   ref_root, 0, action, false, is_system);
head_ref->extent_op = extent_op;
 
delayed_refs = &trans->transaction->delayed_refs;
@@ -794,6 +794,29 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle 
*trans,
return 0;
 }
 
+/*
+ * Do real delayed data ref insert.
+ * Caller must hold delayed_refs->lock and allocate memory
+ * for dref, head_ref and record.
+ */
+int btrfs_add_delayed_data_ref_locked(struct btrfs_trans_handle *trans,
+   struct btrfs_delayed_ref_head *head_ref,
+   struct btrfs_qgroup_extent_record *qrecord,
+   struct btrfs_delayed_data_ref *ref, int action,
+   int *qrecord_inserted_ret, int *old_ref_mod,
+   int *new_ref_mod)
+{
+   struct btrfs_delayed_ref_root *delayed_refs;
+
+   head_ref = add_delayed_ref_head(trans, head_ref, qrecord,
+   action, qrecord_inserted_ret,
+   old_ref_mod, new_ref_mod);
+
+   delayed_refs = &trans->transaction->delayed_refs;
+
+   return insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
+}
+
 /*
  * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
  */
@@ -820,7 +843,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle 
*trans,
ref_type = BTRFS_SHARED_DATA_REF_KEY;
else
ref_type = BTRFS_EXTENT_DATA_REF_KEY;
-   init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
+   btrfs_init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
ref_root, action, ref_type);
ref->root = ref_root;
ref->parent = parent;
@@ -845,8 +868,8 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle 
*trans,
}
}
 
-   init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
- reserved, action, true, false);
+   btrfs_init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
+ ref_root, reserved, 

[PATCH v15.1 08/13] btrfs: ordered-extent: Add support for dedupe

2018-11-05 Thread Lu Fengqi
From: Wang Xiaoguang 

Add ordered-extent support for dedupe.

Note, current ordered-extent support only supports non-compressed source
extent.
Support for compressed source extent will be added later.

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
---
 fs/btrfs/ordered-data.c | 46 +
 fs/btrfs/ordered-data.h | 13 
 2 files changed, 55 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 0c4ef208b8b9..4b112258a79b 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -12,6 +12,7 @@
 #include "extent_io.h"
 #include "disk-io.h"
 #include "compression.h"
+#include "dedupe.h"
 
 static struct kmem_cache *btrfs_ordered_extent_cache;
 
@@ -170,7 +171,8 @@ static inline struct rb_node *tree_search(struct 
btrfs_ordered_inode_tree *tree,
  */
 static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
  u64 start, u64 len, u64 disk_len,
- int type, int dio, int compress_type)
+ int type, int dio, int compress_type,
+ struct btrfs_dedupe_hash *hash)
 {
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -191,6 +193,33 @@ static int __btrfs_add_ordered_extent(struct inode *inode, 
u64 file_offset,
entry->inode = igrab(inode);
entry->compress_type = compress_type;
entry->truncated_len = (u64)-1;
+   entry->hash = NULL;
+   /*
+* A hash hit means we have already incremented the extents delayed
+* ref.
+* We must handle this even if another process is trying to
+* turn off dedupe, otherwise we will leak a reference.
+*/
+   if (hash && (hash->bytenr || root->fs_info->dedupe_enabled)) {
+   struct btrfs_dedupe_info *dedupe_info;
+
+   dedupe_info = root->fs_info->dedupe_info;
+   if (WARN_ON(dedupe_info == NULL)) {
+   kmem_cache_free(btrfs_ordered_extent_cache,
+   entry);
+   return -EINVAL;
+   }
+   entry->hash = btrfs_dedupe_alloc_hash(dedupe_info->hash_algo);
+   if (!entry->hash) {
+   kmem_cache_free(btrfs_ordered_extent_cache, entry);
+   return -ENOMEM;
+   }
+   entry->hash->bytenr = hash->bytenr;
+   entry->hash->num_bytes = hash->num_bytes;
+   memcpy(entry->hash->hash, hash->hash,
+  btrfs_hash_sizes[dedupe_info->hash_algo]);
+   }
+
if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
set_bit(type, &entry->flags);
 
@@ -245,15 +274,23 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 
file_offset,
 {
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
  disk_len, type, 0,
- BTRFS_COMPRESS_NONE);
+ BTRFS_COMPRESS_NONE, NULL);
 }
 
+int btrfs_add_ordered_extent_dedupe(struct inode *inode, u64 file_offset,
+  u64 start, u64 len, u64 disk_len, int type,
+  struct btrfs_dedupe_hash *hash)
+{
+   return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+ disk_len, type, 0,
+ BTRFS_COMPRESS_NONE, hash);
+}
 int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
 u64 start, u64 len, u64 disk_len, int type)
 {
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
  disk_len, type, 1,
- BTRFS_COMPRESS_NONE);
+ BTRFS_COMPRESS_NONE, NULL);
 }
 
 int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
@@ -262,7 +299,7 @@ int btrfs_add_ordered_extent_compress(struct inode *inode, 
u64 file_offset,
 {
return __btrfs_add_ordered_extent(inode, file_offset, start, len,
  disk_len, type, 0,
- compress_type);
+ compress_type, NULL);
 }
 
 /*
@@ -444,6 +481,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent 
*entry)
list_del(&sum->list);
kfree(sum);
}
+   kfree(entry->hash);
kmem_cache_free(btrfs_ordered_extent_cache, entry);
}
 }
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 02d813aaa261..08c7ee986bb9 100644
--- 

[PATCH v15.1 07/13] btrfs: dedupe: Implement btrfs_dedupe_calc_hash interface

2018-11-05 Thread Lu Fengqi
From: Wang Xiaoguang 

Unlike the in-memory and on-disk dedupe methods, only the SHA256 hash method
is supported so far, so implement the btrfs_dedupe_calc_hash() interface using
SHA256.

Signed-off-by: Qu Wenruo 
Signed-off-by: Wang Xiaoguang 
Reviewed-by: Josef Bacik 
Signed-off-by: Lu Fengqi 
---
 fs/btrfs/dedupe.c | 50 +++
 1 file changed, 50 insertions(+)

diff --git a/fs/btrfs/dedupe.c b/fs/btrfs/dedupe.c
index 03ad41423c01..6199215022e6 100644
--- a/fs/btrfs/dedupe.c
+++ b/fs/btrfs/dedupe.c
@@ -644,3 +644,53 @@ int btrfs_dedupe_search(struct btrfs_fs_info *fs_info,
}
return ret;
 }
+
+int btrfs_dedupe_calc_hash(struct btrfs_fs_info *fs_info,
+  struct inode *inode, u64 start,
+  struct btrfs_dedupe_hash *hash)
+{
+   int i;
+   int ret;
+   struct page *p;
+   struct shash_desc *shash;
+   struct btrfs_dedupe_info *dedupe_info = fs_info->dedupe_info;
+   struct crypto_shash *tfm = dedupe_info->dedupe_driver;
+   u64 dedupe_bs;
+   u64 sectorsize = fs_info->sectorsize;
+
+   shash = kmalloc(sizeof(*shash) + crypto_shash_descsize(tfm), GFP_NOFS);
+   if (!shash)
+   return -ENOMEM;
+
+   if (!fs_info->dedupe_enabled || !hash)
+   return 0;
+
+   if (WARN_ON(dedupe_info == NULL))
+   return -EINVAL;
+
+   WARN_ON(!IS_ALIGNED(start, sectorsize));
+
+   dedupe_bs = dedupe_info->blocksize;
+
+   shash->tfm = tfm;
+   shash->flags = 0;
+   ret = crypto_shash_init(shash);
+   if (ret)
+   return ret;
+   for (i = 0; sectorsize * i < dedupe_bs; i++) {
+   char *d;
+
+   p = find_get_page(inode->i_mapping,
+ (start >> PAGE_SHIFT) + i);
+   if (WARN_ON(!p))
+   return -ENOENT;
+   d = kmap(p);
+   ret = crypto_shash_update(shash, d, sectorsize);
+   kunmap(p);
+   put_page(p);
+   if (ret)
+   return ret;
+   }
+   ret = crypto_shash_final(shash, hash->hash);
+   return ret;
+}
-- 
2.19.1





Re: Filesystem mounts fine but hangs on access

2018-11-05 Thread Nikolay Borisov



On 5.11.18 г. 2:23 ч., Qu Wenruo wrote:
> Great, then it's completely free space cache causing the problem.
> 
> You could use -o nospace_cache mount option to avoid the problem as a
> workaround.
> 
> Free space cache only speed up free extent search, it doesn't has extra
> effect on the fs (except the bug).
> So you could disable free space cache without problem.

I'd rather recommend him use -o space_cache=v2


Re: [PATCH 1/3] bitops: Fix big endian compilation

2018-11-05 Thread Nikolay Borisov



On 5.11.18 г. 23:42 ч., Rosen Penev wrote:
> On Mon, Nov 5, 2018 at 1:31 PM Nikolay Borisov  wrote:
>>
>>
>>
>> On 5.11.18 г. 21:06 ч., Rosen Penev wrote:
>>> Replaced bswap with _ variants. While it's a glibc extension, all of the
>>> popular libc implementations (glibc, uClibc, musl, BIONIC) seem to support
>>> it.
>>>
>>> Added static inline to two functions to match little endian variants. This
>>> fixes a linking error experienced when compiling.
>>
>> On what platform did you experience the linking error?
> MIPS 24kc. OpenWrt specifically. Here's a link with the compile log
> (near the end):
> https://circleci.com/gh/openwrt/packages/77?utm_campaign=workflow-failed&utm_medium=email&utm_source=notification
> 
> The LTO errors I believe are due to GCC 7.3.0 being broken (fixed in 7.3.1).
> 
> This patch fixes both issues. I'm still unsure why bswap32 is not
> being included but the _ variant works.
>>
>>>
>>> Signed-off-by: Rosen Penev 

(Some explanation below but the patch is ok)

Reviewed-by: Nikolay Borisov 

>>> ---
>>>  kernel-lib/bitops.h | 8 
>>>  1 file changed, 4 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/kernel-lib/bitops.h b/kernel-lib/bitops.h
>>> index b1fd6f5..2c51a26 100644
>>> --- a/kernel-lib/bitops.h
>>> +++ b/kernel-lib/bitops.h
>>> @@ -178,9 +178,9 @@ static inline unsigned long find_next_zero_bit(const 
>>> unsigned long *addr,
>>>  static inline unsigned long ext2_swab(const unsigned long y)
>>>  {
>>>  #if BITS_PER_LONG == 64
>>> - return (unsigned long) bswap64((u64) y);
>>> + return (unsigned long) bswap_64((u64) y);
>>>  #elif BITS_PER_LONG == 32
>>> - return (unsigned long) bswap32((u32) y);
>>> + return (unsigned long) bswap_32((u32) y);

Alternatively those bswaps* could be replaced by __builtin_bswap32/64
which come directly from the compiler.

Looking at: https://git.musl-libc.org/cgit/musl/tree/include/byteswap.h

It seems musl only defines bswap_* variants. glibc also seems to define
only _ variants as per:
https://sourceware.org/git/?p=glibc.git;a=blob;f=string/byteswap.h;h=a45b3e20ed5d0849bc3939b80272bd3d6d43dc31;hb=HEAD

And indeed http://man7.org/linux/man-pages/man3/bswap.3.html documents
the functions as having _ suffix so this was a mistake on my  part.

Your solution is correct (and now I have no explanation where did
bswap64 definition come from so that it didn't fail for me).


>>>  #else
>>>  #error BITS_PER_LONG not defined
>>>  #endif
>>> @@ -218,14 +218,14 @@ static inline unsigned long _find_next_bit_le(const 
>>> unsigned long *addr1,
>>>   return min(start + __ffs(ext2_swab(tmp)), nbits);
>>>  }
>>>
>>> -unsigned long find_next_zero_bit_le(const void *addr, unsigned long size,
>>> +static inline unsigned long find_next_zero_bit_le(const void *addr, 
>>> unsigned long size,
>>>   unsigned long offset)
>>>  {
>>>   return _find_next_bit_le(addr, NULL, size, offset, ~0UL);
>>>  }
>>>
>>>
>>> -unsigned long find_next_bit_le(const void *addr, unsigned long size,
>>> +static inline unsigned long find_next_bit_le(const void *addr, unsigned 
>>> long size,
>>>   unsigned long offset)
>>>  {
>>>   return _find_next_bit_le(addr, NULL, size, offset, 0UL);
>>>
> 


Re: btrfs partition is broken, cannot restore anything

2018-11-05 Thread Qu Wenruo


On 2018/11/6 上午2:01, Attila Vangel wrote:
> Hi,
> 
> TL;DR: I want to save data from my unmountable btrfs partition.
> I saw some commands in another thread "Salvage files from broken btrfs".
> I use the most recent Manjaro live (kernel: 4.19.0-3-MANJARO,
> btrfs-progs 4.17.1-1) to execute these commands.
> 
> $ sudo mount -o ro,nologreplay /dev/nvme0n1p2 /mnt
> mount: /mnt: wrong fs type, bad option, bad superblock on
> /dev/nvme0n1p2, missing codepage or helper program, or other error.
> 
> Corresponding lines from dmesg:
> 
> [ 1517.772302] BTRFS info (device nvme0n1p2): disabling log replay at mount 
> time
> [ 1517.772307] BTRFS info (device nvme0n1p2): disk space caching is enabled
> [ 1517.772310] BTRFS info (device nvme0n1p2): has skinny extents
> [ 1517.793414] BTRFS error (device nvme0n1p2): bad tree block start,
> want 18811453440 have 0
> [ 1517.793430] BTRFS error (device nvme0n1p2): failed to read block groups: -5
> [ 1517.808619] BTRFS error (device nvme0n1p2): open_ctree failed

Extent tree corrupted.

If it's the only problem, btrfs-restore should be able to salvage data.

> 
> $ sudo btrfs-find-root /dev/nvme0n1p2

No, that's not what you need.

> Superblock thinks the generation is 220524
> Superblock thinks the level is 1
> Found tree root at 25018368 gen 220524 level 1
> Well block 4243456(gen: 220520 level: 1) seems good, but
> generation/level doesn't match, want gen: 220524 level: 1
> Well block 5259264(gen: 220519 level: 1) seems good, but
> generation/level doesn't match, want gen: 220524 level: 1
> Well block 4866048(gen: 220518 level: 0) seems good, but
> generation/level doesn't match, want gen: 220524 level: 1
> 
> $ sudo btrfs ins dump-super -Ffa /dev/nvme0n1p2
> superblock: bytenr=65536, device=/dev/nvme0n1p2
[snip]
> 
> If I understood correctly, somehow it is possible to use this data to
> parametrize btrfs restore to save the files from the partition.

None of the output is really helpful.

In your case, your extent tree is corrupted, thus kernel will refuse to
mount (even RO).

You should run "btrfs check" on the fs to see if btrfs can check fs tree.
If not, then go directly to "btrfs restore".

Thanks,
Qu

> Could you please help how to do it in this case? I am not familiar
> with these technical terms in the outputs.
> Thanks in advance!
> 
> Cheers,
> Attila
> 
> On Thu, Nov 1, 2018 at 8:40 PM Attila Vangel  wrote:
>>
>> Hi,
>>
>> Somehow my btrfs partition got broken. I use Arch, so my kernel is
>> quite new (4.18.x).
>> I don't remember exactly the sequence of events. At some point it was
>> accessible in read-only, but unfortunately I did not take backup
>> immediately. dmesg log from that time:
>>
>> [ 62.602388] BTRFS warning (device nvme0n1p2): block group
>> 103923318784 has wrong amount of free space
>> [ 62.602390] BTRFS warning (device nvme0n1p2): failed to load free
>> space cache for block group 103923318784, rebuilding it now
>> [ 108.039188] BTRFS error (device nvme0n1p2): bad tree block start 0 
>> 18812026880
>> [ 108.039227] BTRFS: error (device nvme0n1p2) in
>> __btrfs_free_extent:7010: errno=-5 IO failure
>> [ 108.039241] BTRFS info (device nvme0n1p2): forced readonly
>> [ 108.039250] BTRFS: error (device nvme0n1p2) in
>> btrfs_run_delayed_refs:3076: errno=-5 IO failure
>>
>> At the next reboot it failed to mount. Problem may have been that at
>> some point I booted to another distro with older kernel (4.15.x,
>> 4.14.52) and unfortunately attempted some checks/repairs (?) e.g. from
>> gparted, and at that time I did not know it could be destructive.
>>
>> Anyway, currently it fails to mount (even with ro and/or recovery),
>> btrfs check results in "checksum verify failed" and "bad tree block"
>> errors, btrfs restore resulted in "We have looped trying to restore
>> files in" errors for a dozen of paths then exit.
>>
>> Is there some hope to save data from the filesystem, and if so, how?
>>
>> BTW I checked some diagnostics commands regarding my SSD with the nvme
>> client and from that it seems there are no hardware problems.
>>
>> Your help is highly appreciated.
>>
>> Cheers,
>> Attila



signature.asc
Description: OpenPGP digital signature


Re: [PATCH] btrfs: tree-checker: Fix misleading group system information

2018-11-05 Thread Zhangshaokun
Hi Qu,

On 2018/11/5 22:42, Qu Wenruo wrote:
> 
> 
> On 2018/11/5 下午7:33, Zhangshaokun wrote:
>> Hi Qu,
>>
>> On 2018/11/5 19:03, Qu Wenruo wrote:
>>>
>>>
>>> On 2018/11/5 下午6:49, Shaokun Zhang wrote:
>>>> block_group_err shows the group system as a decimal value with a '0x'
>>>> prefix, which is somewhat misleading.
>>>>
>>>> Fix it to print hexadecimal, as was intended.
>>>>
>>>> Cc: David Sterba  
>>>> Cc: Chris Mason 
>>>> Cc: Josef Bacik  
>>>> Signed-off-by: Shaokun Zhang 
>>>
>>> Reviewed-by: Qu Wenruo 
>>>
>>> BTW, did you catch it with some real world case or just by looking into
>>> the code?
>>
>> I made a mistake (0x%d) when debugged my code, so I grep the similar format
>> for the kernel code and came across this typo, a trivial patch.
> 
> Ok, that's fine.
> 
> Just a small tip for your further involvement in kernel, for such small
> fix, there is really no need to bother all the maintainers.
> 

My apologies for the noise to all the maintainers. I shall pay more attention
on it.

> You could just use "git blame" to find who is causing the problem, in
> this case it's me unfortunately :( , and Cc that guy.
> 

In fact, I really used the "git blame" and saw it from your patch. While I
use the get_maintainer.pl and Cc them directly, forgot to Cc you. I will
do what you said for the further work.

> Furthermore, you could add a "fixes:" tag.
> About these common tags, you could refer to 'Describe your changes'
> section of 'Documentation/process/submitting-patches.rst'.
> 

Sure, thanks for your nice guidance.

Shaokun

> Thanks,
> Qu
> 
>>
>> Thanks,
>> Shaokun
>>
>>>
>>> Thanks,
>>> Qu
>>>
 ---
  fs/btrfs/tree-checker.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

 diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
 index cab0b1f..efcf89a 100644
 --- a/fs/btrfs/tree-checker.c
 +++ b/fs/btrfs/tree-checker.c
 @@ -440,7 +440,7 @@ static int check_block_group_item(struct btrfs_fs_info 
 *fs_info,
type != (BTRFS_BLOCK_GROUP_METADATA |
   BTRFS_BLOCK_GROUP_DATA)) {
block_group_err(fs_info, leaf, slot,
 -"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 
 0x%llu or 0x%llx",
 +"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 
 0x%llx or 0x%llx",
type, hweight64(type),
BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA,
BTRFS_BLOCK_GROUP_SYSTEM,

>>>
>>
> 



Re: [PATCH 1/3] bitops: Fix big endian compilation

2018-11-05 Thread Rosen Penev
On Mon, Nov 5, 2018 at 1:31 PM Nikolay Borisov  wrote:
>
>
>
> On 5.11.18 г. 21:06 ч., Rosen Penev wrote:
> > Replaced bswap with _ variants. While it's a glibc extension, all of the
> > popular libc implementations (glibc, uClibc, musl, BIONIC) seem to support
> > it.
> >
> > Added static inline to two functions to match little endian variants. This
> > fixes a linking error experienced when compiling.
>
> On what platform did you experience the linking error?
MIPS 24kc. OpenWrt specifically. Here's a link with the compile log
(near the end):
https://circleci.com/gh/openwrt/packages/77?utm_campaign=workflow-failed&utm_medium=email&utm_source=notification

The LTO errors I believe are due to GCC 7.3.0 being broken (fixed in 7.3.1).

This patch fixes both issues. I'm still unsure why bswap32 is not
being included but the _ variant works.
>
> >
> > Signed-off-by: Rosen Penev 
> > ---
> >  kernel-lib/bitops.h | 8 
> >  1 file changed, 4 insertions(+), 4 deletions(-)
> >
> > diff --git a/kernel-lib/bitops.h b/kernel-lib/bitops.h
> > index b1fd6f5..2c51a26 100644
> > --- a/kernel-lib/bitops.h
> > +++ b/kernel-lib/bitops.h
> > @@ -178,9 +178,9 @@ static inline unsigned long find_next_zero_bit(const 
> > unsigned long *addr,
> >  static inline unsigned long ext2_swab(const unsigned long y)
> >  {
> >  #if BITS_PER_LONG == 64
> > - return (unsigned long) bswap64((u64) y);
> > + return (unsigned long) bswap_64((u64) y);
> >  #elif BITS_PER_LONG == 32
> > - return (unsigned long) bswap32((u32) y);
> > + return (unsigned long) bswap_32((u32) y);
> >  #else
> >  #error BITS_PER_LONG not defined
> >  #endif
> > @@ -218,14 +218,14 @@ static inline unsigned long _find_next_bit_le(const 
> > unsigned long *addr1,
> >   return min(start + __ffs(ext2_swab(tmp)), nbits);
> >  }
> >
> > -unsigned long find_next_zero_bit_le(const void *addr, unsigned long size,
> > +static inline unsigned long find_next_zero_bit_le(const void *addr, 
> > unsigned long size,
> >   unsigned long offset)
> >  {
> >   return _find_next_bit_le(addr, NULL, size, offset, ~0UL);
> >  }
> >
> >
> > -unsigned long find_next_bit_le(const void *addr, unsigned long size,
> > +static inline unsigned long find_next_bit_le(const void *addr, unsigned 
> > long size,
> >   unsigned long offset)
> >  {
> >   return _find_next_bit_le(addr, NULL, size, offset, 0UL);
> >


Re: [PATCH 2/3] task-utils: Fix comparison between pointer and integer

2018-11-05 Thread Nikolay Borisov



On 5.11.18 г. 21:06 ч., Rosen Penev wrote:
> pthread_t is a pointer type, not an integer one. The > 0 makes no sense
> and throws a warning.

Code-wise the patch is ok, however, technically pthread_t is an opaque
type. I guess David could fix it on the way in so no need to resend.

Reviewed-by: Nikolay Borisov 

> 
> Signed-off-by: Rosen Penev 
> ---
>  task-utils.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/task-utils.c b/task-utils.c
> index a9bee8f..e4dcd36 100644
> --- a/task-utils.c
> +++ b/task-utils.c
> @@ -67,7 +67,7 @@ void task_stop(struct task_info *info)
>   if (!info)
>   return;
>  
> - if (info->id > 0) {
> + if (info->id) {
>   pthread_cancel(info->id);
>   pthread_join(info->id, NULL);
>   info->id = 0;
> 


Re: [PATCH 1/3] bitops: Fix big endian compilation

2018-11-05 Thread Nikolay Borisov



On 5.11.18 г. 21:06 ч., Rosen Penev wrote:
> Replaced bswap with _ variants. While it's a glibc extension, all of the
> popular libc implementations (glibc, uClibc, musl, BIONIC) seem to support
> it.
> 
> Added static inline to two functions to match little endian variants. This
> fixes a linking error experienced when compiling.

On what platform did you experience the linking error?

> 
> Signed-off-by: Rosen Penev 
> ---
>  kernel-lib/bitops.h | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/kernel-lib/bitops.h b/kernel-lib/bitops.h
> index b1fd6f5..2c51a26 100644
> --- a/kernel-lib/bitops.h
> +++ b/kernel-lib/bitops.h
> @@ -178,9 +178,9 @@ static inline unsigned long find_next_zero_bit(const 
> unsigned long *addr,
>  static inline unsigned long ext2_swab(const unsigned long y)
>  {
>  #if BITS_PER_LONG == 64
> - return (unsigned long) bswap64((u64) y);
> + return (unsigned long) bswap_64((u64) y);
>  #elif BITS_PER_LONG == 32
> - return (unsigned long) bswap32((u32) y);
> + return (unsigned long) bswap_32((u32) y);
>  #else
>  #error BITS_PER_LONG not defined
>  #endif
> @@ -218,14 +218,14 @@ static inline unsigned long _find_next_bit_le(const 
> unsigned long *addr1,
>   return min(start + __ffs(ext2_swab(tmp)), nbits);
>  }
>  
> -unsigned long find_next_zero_bit_le(const void *addr, unsigned long size,
> +static inline unsigned long find_next_zero_bit_le(const void *addr, unsigned 
> long size,
>   unsigned long offset)
>  {
>   return _find_next_bit_le(addr, NULL, size, offset, ~0UL);
>  }
>  
>  
> -unsigned long find_next_bit_le(const void *addr, unsigned long size,
> +static inline unsigned long find_next_bit_le(const void *addr, unsigned long 
> size,
>   unsigned long offset)
>  {
>   return _find_next_bit_le(addr, NULL, size, offset, 0UL);
> 


Re: [PATCH 3/3] treewide: Fix missing declarations

2018-11-05 Thread Nikolay Borisov



On 5.11.18 г. 21:06 ч., Rosen Penev wrote:
> Found using -Wmissing-prototypes in GCC.
> 
> This should improve LTO behavior.
> 
> Note that set_free_space_tree_thresholds is an unused function. Adding
> inline seems to remove the unused function warning.
> 
> Signed-off-by: Rosen Penev 


I had a series that did exactly this but since you came in first:

Reviewed-by: Nikolay Borisov 

> ---
>  btrfs.c  |  2 +-
>  check/mode-lowmem.c  |  2 +-
>  extent-tree.c|  2 +-
>  free-space-tree.c| 12 ++--
>  libbtrfsutil/stubs.c |  1 +
>  utils-lib.c  |  2 ++
>  utils.h  |  1 +
>  7 files changed, 13 insertions(+), 9 deletions(-)
> 
> diff --git a/btrfs.c b/btrfs.c
> index 2d39f2c..78c468d 100644
> --- a/btrfs.c
> +++ b/btrfs.c
> @@ -210,7 +210,7 @@ static int handle_global_options(int argc, char **argv)
>   return shift;
>  }
>  
> -void handle_special_globals(int shift, int argc, char **argv)
> +static void handle_special_globals(int shift, int argc, char **argv)
>  {
>   int has_help = 0;
>   int has_full = 0;
> diff --git a/check/mode-lowmem.c b/check/mode-lowmem.c
> index 14bbc9e..94123c1 100644
> --- a/check/mode-lowmem.c
> +++ b/check/mode-lowmem.c
> @@ -953,7 +953,7 @@ out:
>   * returns 0 means success.
>   * returns not 0 means on error;
>   */
> -int repair_ternary_lowmem(struct btrfs_root *root, u64 dir_ino, u64 ino,
> +static int repair_ternary_lowmem(struct btrfs_root *root, u64 dir_ino, u64 
> ino,
> u64 index, char *name, int name_len, u8 filetype,
> int err)
>  {
> diff --git a/extent-tree.c b/extent-tree.c
> index cd98633..8c9cdef 100644
> --- a/extent-tree.c
> +++ b/extent-tree.c
> @@ -3749,7 +3749,7 @@ static void __get_extent_size(struct btrfs_root *root, 
> struct btrfs_path *path,
>   * Return >0 for not found.
>   * Return <0 for err
>   */
> -int btrfs_search_overlap_extent(struct btrfs_root *root,
> +static int btrfs_search_overlap_extent(struct btrfs_root *root,
>   struct btrfs_path *path, u64 bytenr, u64 len)
>  {
>   struct btrfs_key key;
> diff --git a/free-space-tree.c b/free-space-tree.c
> index 6641cdf..6ef5792 100644
> --- a/free-space-tree.c
> +++ b/free-space-tree.c
> @@ -24,7 +24,7 @@
>  #include "bitops.h"
>  #include "internal.h"
>  
> -void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache,
> +static inline void set_free_space_tree_thresholds(struct 
> btrfs_block_group_cache *cache,
>   u64 sectorsize)
>  {
>   u32 bitmap_range;
> @@ -202,7 +202,7 @@ static void le_bitmap_set(unsigned long *map, unsigned 
> int start, int len)
>   }
>  }
>  
> -int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
> +static int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
> struct btrfs_block_group_cache *block_group,
> struct btrfs_path *path)
>  {
> @@ -341,7 +341,7 @@ out:
>   return ret;
>  }
>  
> -int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
> +static int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
> struct btrfs_block_group_cache *block_group,
> struct btrfs_path *path)
>  {
> @@ -780,7 +780,7 @@ out:
>   return ret;
>  }
>  
> -int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
> +static int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
> struct btrfs_block_group_cache *block_group,
> struct btrfs_path *path, u64 start, u64 size)
>  {
> @@ -960,7 +960,7 @@ out:
>   return ret;
>  }
>  
> -int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
> +static int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
>struct btrfs_block_group_cache *block_group,
>struct btrfs_path *path, u64 start, u64 size)
>  {
> @@ -1420,7 +1420,7 @@ out:
>   return ret;
>  }
>  
> -struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
> +static struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
>struct btrfs_fs_info *fs_info,
>u64 objectid)
>  {
> diff --git a/libbtrfsutil/stubs.c b/libbtrfsutil/stubs.c
> index 9b9e037..c530e40 100644
> --- a/libbtrfsutil/stubs.c
> +++ b/libbtrfsutil/stubs.c
> @@ -19,6 +19,7 @@
>  
>  #include 
>  #include 
> +#include "stubs.h"
>  
>  void *reallocarray(void *ptr, size_t nmemb, size_t size)
>  {
> diff --git a/utils-lib.c b/utils-lib.c
> index 044f93f..2ac421b 100644
> --- a/utils-lib.c
> +++ b/utils-lib.c
> @@ -5,6 +5,8 @@
>  #include 
>  #include 
>  
> +#include "utils.h"
> +
>  #if BTRFS_FLAT_INCLUDES
>  #include "ctree.h"
>  #else
> diff --git a/utils.h b/utils.h
> index 

[PATCH 1/3] bitops: Fix big endian compilation

2018-11-05 Thread Rosen Penev
Replaced bswap with _ variants. While it's a glibc extension, all of the
popular libc implementations (glibc, uClibc, musl, BIONIC) seem to support
it.

Added static inline to two functions to match little endian variants. This
fixes a linking error experienced when compiling.

Signed-off-by: Rosen Penev 
---
 kernel-lib/bitops.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel-lib/bitops.h b/kernel-lib/bitops.h
index b1fd6f5..2c51a26 100644
--- a/kernel-lib/bitops.h
+++ b/kernel-lib/bitops.h
@@ -178,9 +178,9 @@ static inline unsigned long find_next_zero_bit(const 
unsigned long *addr,
 static inline unsigned long ext2_swab(const unsigned long y)
 {
 #if BITS_PER_LONG == 64
-   return (unsigned long) bswap64((u64) y);
+   return (unsigned long) bswap_64((u64) y);
 #elif BITS_PER_LONG == 32
-   return (unsigned long) bswap32((u32) y);
+   return (unsigned long) bswap_32((u32) y);
 #else
 #error BITS_PER_LONG not defined
 #endif
@@ -218,14 +218,14 @@ static inline unsigned long _find_next_bit_le(const 
unsigned long *addr1,
return min(start + __ffs(ext2_swab(tmp)), nbits);
 }
 
-unsigned long find_next_zero_bit_le(const void *addr, unsigned long size,
+static inline unsigned long find_next_zero_bit_le(const void *addr, unsigned 
long size,
unsigned long offset)
 {
return _find_next_bit_le(addr, NULL, size, offset, ~0UL);
 }
 
 
-unsigned long find_next_bit_le(const void *addr, unsigned long size,
+static inline unsigned long find_next_bit_le(const void *addr, unsigned long 
size,
unsigned long offset)
 {
return _find_next_bit_le(addr, NULL, size, offset, 0UL);
-- 
2.19.1



[PATCH 2/3] task-utils: Fix comparison between pointer and integer

2018-11-05 Thread Rosen Penev
pthread_t is a pointer type, not an integer one. The > 0 makes no sense
and throws a warning.

Signed-off-by: Rosen Penev 
---
 task-utils.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/task-utils.c b/task-utils.c
index a9bee8f..e4dcd36 100644
--- a/task-utils.c
+++ b/task-utils.c
@@ -67,7 +67,7 @@ void task_stop(struct task_info *info)
if (!info)
return;
 
-   if (info->id > 0) {
+   if (info->id) {
pthread_cancel(info->id);
pthread_join(info->id, NULL);
info->id = 0;
-- 
2.19.1



[PATCH 3/3] treewide: Fix missing declarations

2018-11-05 Thread Rosen Penev
Found using -Wmissing-prototypes in GCC.

This should improve LTO behavior.

Note that set_free_space_tree_thresholds is an unused function. Adding
inline seems to remove the unused function warning.

Signed-off-by: Rosen Penev 
---
 btrfs.c  |  2 +-
 check/mode-lowmem.c  |  2 +-
 extent-tree.c|  2 +-
 free-space-tree.c| 12 ++--
 libbtrfsutil/stubs.c |  1 +
 utils-lib.c  |  2 ++
 utils.h  |  1 +
 7 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/btrfs.c b/btrfs.c
index 2d39f2c..78c468d 100644
--- a/btrfs.c
+++ b/btrfs.c
@@ -210,7 +210,7 @@ static int handle_global_options(int argc, char **argv)
return shift;
 }
 
-void handle_special_globals(int shift, int argc, char **argv)
+static void handle_special_globals(int shift, int argc, char **argv)
 {
int has_help = 0;
int has_full = 0;
diff --git a/check/mode-lowmem.c b/check/mode-lowmem.c
index 14bbc9e..94123c1 100644
--- a/check/mode-lowmem.c
+++ b/check/mode-lowmem.c
@@ -953,7 +953,7 @@ out:
  * returns 0 means success.
  * returns not 0 means on error;
  */
-int repair_ternary_lowmem(struct btrfs_root *root, u64 dir_ino, u64 ino,
+static int repair_ternary_lowmem(struct btrfs_root *root, u64 dir_ino, u64 ino,
  u64 index, char *name, int name_len, u8 filetype,
  int err)
 {
diff --git a/extent-tree.c b/extent-tree.c
index cd98633..8c9cdef 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -3749,7 +3749,7 @@ static void __get_extent_size(struct btrfs_root *root, 
struct btrfs_path *path,
  * Return >0 for not found.
  * Return <0 for err
  */
-int btrfs_search_overlap_extent(struct btrfs_root *root,
+static int btrfs_search_overlap_extent(struct btrfs_root *root,
struct btrfs_path *path, u64 bytenr, u64 len)
 {
struct btrfs_key key;
diff --git a/free-space-tree.c b/free-space-tree.c
index 6641cdf..6ef5792 100644
--- a/free-space-tree.c
+++ b/free-space-tree.c
@@ -24,7 +24,7 @@
 #include "bitops.h"
 #include "internal.h"
 
-void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache,
+static inline void set_free_space_tree_thresholds(struct 
btrfs_block_group_cache *cache,
u64 sectorsize)
 {
u32 bitmap_range;
@@ -202,7 +202,7 @@ static void le_bitmap_set(unsigned long *map, unsigned int 
start, int len)
}
 }
 
-int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
+static int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
  struct btrfs_block_group_cache *block_group,
  struct btrfs_path *path)
 {
@@ -341,7 +341,7 @@ out:
return ret;
 }
 
-int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
+static int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
  struct btrfs_block_group_cache *block_group,
  struct btrfs_path *path)
 {
@@ -780,7 +780,7 @@ out:
return ret;
 }
 
-int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+static int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
  struct btrfs_block_group_cache *block_group,
  struct btrfs_path *path, u64 start, u64 size)
 {
@@ -960,7 +960,7 @@ out:
return ret;
 }
 
-int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
+static int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
 struct btrfs_block_group_cache *block_group,
 struct btrfs_path *path, u64 start, u64 size)
 {
@@ -1420,7 +1420,7 @@ out:
return ret;
 }
 
-struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
+static struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 struct btrfs_fs_info *fs_info,
 u64 objectid)
 {
diff --git a/libbtrfsutil/stubs.c b/libbtrfsutil/stubs.c
index 9b9e037..c530e40 100644
--- a/libbtrfsutil/stubs.c
+++ b/libbtrfsutil/stubs.c
@@ -19,6 +19,7 @@
 
 #include 
 #include 
+#include "stubs.h"
 
 void *reallocarray(void *ptr, size_t nmemb, size_t size)
 {
diff --git a/utils-lib.c b/utils-lib.c
index 044f93f..2ac421b 100644
--- a/utils-lib.c
+++ b/utils-lib.c
@@ -5,6 +5,8 @@
 #include 
 #include 
 
+#include "utils.h"
+
 #if BTRFS_FLAT_INCLUDES
 #include "ctree.h"
 #else
diff --git a/utils.h b/utils.h
index b6c00cf..7c5eb79 100644
--- a/utils.h
+++ b/utils.h
@@ -29,6 +29,7 @@
 #include "sizes.h"
 #include "messages.h"
 #include "ioctl.h"
+#include "fsfeatures.h"
 
 #define BTRFS_SCAN_MOUNTED (1ULL << 0)
 #define BTRFS_SCAN_LBLKID  (1ULL << 1)
-- 
2.19.1



Re: [PATCH] btrfs: tests: Use BTRFS_MAX_EXTENT_SIZE to replace the intermediate number

2018-11-05 Thread David Sterba
On Sat, Nov 03, 2018 at 05:24:52PM +0800, Qu Wenruo wrote:
> In extent-io self test, we need 2 ordered extents at its maximum size to
> do the test.
> 
> Instead of using the intermediate numbers, use BTRFS_MAX_EXTENT_SIZE for
> @max_bytes, and twice @max_bytes for @total_dirty.
> This should explain why we need all these magic numbers and prevent
> people from modifying them by accident.
> 
> Signed-off-by: Qu Wenruo 

Added to misc-next, thanks.


Re: BTRFS did it's job nicely (thanks!)

2018-11-05 Thread Chris Murphy
On Mon, Nov 5, 2018 at 6:27 AM, Austin S. Hemmelgarn
 wrote:
> On 11/4/2018 11:44 AM, waxhead wrote:
>>
>> Sterling Windmill wrote:
>>>
>>> Out of curiosity, what led to you choosing RAID1 for data but RAID10
>>> for metadata?
>>>
>>> I've flip-flopped between these two modes myself after finding out
>>> that BTRFS RAID10 doesn't work how I would've expected.
>>>
>>> Wondering what made you choose your configuration.
>>>
>>> Thanks!
>>> Sure,
>>
>>
>> The "RAID"1 profile for data was chosen to maximize disk space utilization
>> since I got a lot of mixed size devices.
>>
>> The "RAID"10 profile for metadata was chosen simply because it *feels* a
>> bit faster for some of my (previous) workload which was reading a lot of
>> small files (which I guess was embedded in the metadata). While I never
>> remembered that I got any measurable performance increase the system simply
>> felt smoother (which is strange since "RAID"10 should hog more disks at
>> once).
>>
>> I would love to try "RAID"10 for both data and metadata, but I have to
>> delete some files first (or add yet another drive).
>>
>> Would you like to elaborate a bit more yourself about how BTRFS "RAID"10
>> does not work as you expected?
>>
>> As far as I know BTRFS' version of "RAID"10 means it ensures 2 copies (1
>> replica) are striped over as many disks as it can (as long as there is free
>> space).
>>
>> So if I am not terribly mistaken, a "RAID"10 with 20 devices will stripe
>> over (20/2) x 2 and if you run out of space on 10 of the devices it will
>> continue to stripe over (5/2) x 2. So your stripe width varies with the
>> available space essentially... I may be terribly wrong about this (until
>> someone corrects me that is...)
>
> He's probably referring to the fact that instead of there being a roughly
> 50% chance of it surviving the failure of at least 2 devices like classical
> RAID10 is technically able to do, it's currently functionally 100% certain
> it won't survive more than one device failing.

Right. Classic RAID10 is *two block device* copies, where you have
mirror1 drives and mirror2 drives, and each mirror pair becomes a
single virtual block device that are then striped across. If you lose
a single mirror1 drive, its mirror2 data is available and
statistically unlikely to also go away.

Whereas with Btrfs raid10, it's *two block group* copies. And it is
the block group that's striped. That means block group copy 1 is
striped across 1/2 the available drives (at the time the bg is
allocated), and block group copy 2 is striped across the other drives.
When a drive dies, there is no single remaining drive that contains
all the missing copies, they're distributed. Which means you've got a
very good chance in a 2 drive failure of losing two copies of either
metadata or data or both. While I'm not certain it's 100% not
survivable, the real gotcha is it's possible maybe even likely that
it'll mount and seem to work fine but as soon as it runs into two
missing bg's, it'll face plant.


-- 
Chris Murphy


Re: btrfs partition is broken, cannot restore anything

2018-11-05 Thread Attila Vangel
Hi,

Stupid gmail has put my email (or Qu's reply? ) to spam, so I just saw
the reply after I sent my reply (gmail asked me whether to remove it
from spam).

Anyway here is the requested output. Thanks for the help!

$ sudo btrfs check /dev/nvme0n1p2
Opening filesystem to check...
checksum verify failed on 18811453440 found E4E3BDB6 wanted 
checksum verify failed on 18811453440 found E4E3BDB6 wanted 
bad tree block 18811453440, bytenr mismatch, want=18811453440, have=0
ERROR: cannot open file system

$ sudo btrfs check --mode=lowmem /dev/nvme0n1p2
Opening filesystem to check...
checksum verify failed on 18811453440 found E4E3BDB6 wanted 
checksum verify failed on 18811453440 found E4E3BDB6 wanted 
bad tree block 18811453440, bytenr mismatch, want=18811453440, have=0
ERROR: cannot open file system

Regards,
Attila

On Mon, Nov 5, 2018 at 6:01 PM Attila Vangel  wrote:
>
> Hi,
>
> TL;DR: I want to save data from my unmountable btrfs partition.
> I saw some commands in another thread "Salvage files from broken btrfs".
> I use the most recent Manjaro live (kernel: 4.19.0-3-MANJARO,
> btrfs-progs 4.17.1-1) to execute these commands.
>
> $ sudo mount -o ro,nologreplay /dev/nvme0n1p2 /mnt
> mount: /mnt: wrong fs type, bad option, bad superblock on
> /dev/nvme0n1p2, missing codepage or helper program, or other error.
>
> Corresponding lines from dmesg:
>
> [ 1517.772302] BTRFS info (device nvme0n1p2): disabling log replay at mount 
> time
> [ 1517.772307] BTRFS info (device nvme0n1p2): disk space caching is enabled
> [ 1517.772310] BTRFS info (device nvme0n1p2): has skinny extents
> [ 1517.793414] BTRFS error (device nvme0n1p2): bad tree block start,
> want 18811453440 have 0
> [ 1517.793430] BTRFS error (device nvme0n1p2): failed to read block groups: -5
> [ 1517.808619] BTRFS error (device nvme0n1p2): open_ctree failed
>
> $ sudo btrfs-find-root /dev/nvme0n1p2
> Superblock thinks the generation is 220524
> Superblock thinks the level is 1
> Found tree root at 25018368 gen 220524 level 1
> Well block 4243456(gen: 220520 level: 1) seems good, but
> generation/level doesn't match, want gen: 220524 level: 1
> Well block 5259264(gen: 220519 level: 1) seems good, but
> generation/level doesn't match, want gen: 220524 level: 1
> Well block 4866048(gen: 220518 level: 0) seems good, but
> generation/level doesn't match, want gen: 220524 level: 1
>
> $ sudo btrfs ins dump-super -Ffa /dev/nvme0n1p2
> superblock: bytenr=65536, device=/dev/nvme0n1p2
> -
> csum_type0 (crc32c)
> csum_size4
> csum0x7956a931 [match]
> bytenr65536
> flags0x1
> ( WRITTEN )
> magic_BHRfS_M [match]
> fsid014c9d24-339c-482e-8f06-9284e4a7bc40
> labelnewhome
> generation220524
> root25018368
> sys_array_size97
> chunk_root_generation219209
> root_level1
> chunk_root131072
> chunk_root_level1
> log_root86818816
> log_root_transid0
> log_root_level0
> total_bytes355938074624
> bytes_used344504737792
> sectorsize4096
> nodesize16384
> leafsize (deprecated)16384
> stripesize4096
> root_dir6
> num_devices1
> compat_flags0x0
> compat_ro_flags0x0
> incompat_flags0x161
> ( MIXED_BACKREF |
>   BIG_METADATA |
>   EXTENDED_IREF |
>   SKINNY_METADATA )
> cache_generation220524
> uuid_tree_generation220524
> dev_item.uuid05fe6ce8-1f2d-41ba-a367-cbdb8f06ffd3
> dev_item.fsid014c9d24-339c-482e-8f06-9284e4a7bc40 [match]
> dev_item.type0
> dev_item.total_bytes355938074624
> dev_item.bytes_used355792322560
> dev_item.io_align4096
> dev_item.io_width4096
> dev_item.sector_size4096
> dev_item.devid1
> dev_item.dev_group0
> dev_item.seek_speed0
> dev_item.bandwidth0
> dev_item.generation0
> sys_chunk_array[2048]:
> item 0 key (FIRST_CHUNK_TREE CHUNK_ITEM 0)
> length 4194304 owner 2 stripe_len 65536 type SYSTEM
> io_align 4096 io_width 4096 sector_size 4096
> num_stripes 1 sub_stripes 0
> stripe 0 devid 1 offset 0
> dev_uuid 05fe6ce8-1f2d-41ba-a367-cbdb8f06ffd3
> backup_roots[4]:
> backup 0:
> backup_tree_root:42598400gen: 220522level: 1
> backup_chunk_root:131072gen: 219209level: 1
> backup_extent_root:26460160gen: 220522level: 2
> backup_fs_root:51347456gen: 220523level: 2
> backup_dev_root:4472832gen: 220520level: 1
> backup_csum_root:26558464gen: 220522level: 2
> backup_total_bytes:355938074624
> backup_bytes_used:344504741888
> backup_num_devices:1
>
> backup 1:
> backup_tree_root:   

Re: [PATCH v9 0/6] Btrfs: implement swap file support

2018-11-05 Thread David Sterba
On Mon, Oct 22, 2018 at 02:13:27PM -0700, Omar Sandoval wrote:
> > > Omar Sandoval (6):
> > >   mm: split SWP_FILE into SWP_ACTIVATED and SWP_FS
> > >   mm: export add_swap_extent()
> > >   vfs: update swap_{,de}activate documentation

> > >   Btrfs: prevent ioctls from interfering with a swap file
> > >   Btrfs: rename get_chunk_map() and make it non-static
> > >   Btrfs: support swap files
> > 
> > Patches 1 and 2 now going through Andrew's tree, the btrfs part will be
> > delayed and not merged to 4.20. This is a bit unfortuante, I was busy
> > with the non-feature patches and other things, sorry.
> 
> That's perfectly fine with me, than

The 3 btrfs patches are now in misc-next.

Setting up the swap file needs the extra steps to make sure it's a NOCOW
file and preallocated, that's just 2 more commands using common tools.
The surprise may come on a multi-device filesystem when the chunks
get spread over more devices and the user has no control over that.
Reducing the file size until it fits is a workaround, though not totally
reliable.

I've explored the error cases (balance, dev delete, adding more
swapfiles). Also a stress test (make -j on kernel). The OOM killer was
able to get the system back after each round after which I added one
more swapfile, until the system was effectively dead.

So the stability seems to be ok, we will need to document the usecase,
constraints and how to properly set up the swapfile, and that's about
it. Thanks.


Re: btrfs partition is broken, cannot restore anything

2018-11-05 Thread Attila Vangel
Hi,

TL;DR: I want to save data from my unmountable btrfs partition.
I saw some commands in another thread "Salvage files from broken btrfs".
I use the most recent Manjaro live (kernel: 4.19.0-3-MANJARO,
btrfs-progs 4.17.1-1) to execute these commands.

$ sudo mount -o ro,nologreplay /dev/nvme0n1p2 /mnt
mount: /mnt: wrong fs type, bad option, bad superblock on
/dev/nvme0n1p2, missing codepage or helper program, or other error.

Corresponding lines from dmesg:

[ 1517.772302] BTRFS info (device nvme0n1p2): disabling log replay at mount time
[ 1517.772307] BTRFS info (device nvme0n1p2): disk space caching is enabled
[ 1517.772310] BTRFS info (device nvme0n1p2): has skinny extents
[ 1517.793414] BTRFS error (device nvme0n1p2): bad tree block start,
want 18811453440 have 0
[ 1517.793430] BTRFS error (device nvme0n1p2): failed to read block groups: -5
[ 1517.808619] BTRFS error (device nvme0n1p2): open_ctree failed

$ sudo btrfs-find-root /dev/nvme0n1p2
Superblock thinks the generation is 220524
Superblock thinks the level is 1
Found tree root at 25018368 gen 220524 level 1
Well block 4243456(gen: 220520 level: 1) seems good, but
generation/level doesn't match, want gen: 220524 level: 1
Well block 5259264(gen: 220519 level: 1) seems good, but
generation/level doesn't match, want gen: 220524 level: 1
Well block 4866048(gen: 220518 level: 0) seems good, but
generation/level doesn't match, want gen: 220524 level: 1

$ sudo btrfs ins dump-super -Ffa /dev/nvme0n1p2
superblock: bytenr=65536, device=/dev/nvme0n1p2
-
csum_type0 (crc32c)
csum_size4
csum0x7956a931 [match]
bytenr65536
flags0x1
( WRITTEN )
magic_BHRfS_M [match]
fsid014c9d24-339c-482e-8f06-9284e4a7bc40
labelnewhome
generation220524
root25018368
sys_array_size97
chunk_root_generation219209
root_level1
chunk_root131072
chunk_root_level1
log_root86818816
log_root_transid0
log_root_level0
total_bytes355938074624
bytes_used344504737792
sectorsize4096
nodesize16384
leafsize (deprecated)16384
stripesize4096
root_dir6
num_devices1
compat_flags0x0
compat_ro_flags0x0
incompat_flags0x161
( MIXED_BACKREF |
  BIG_METADATA |
  EXTENDED_IREF |
  SKINNY_METADATA )
cache_generation220524
uuid_tree_generation220524
dev_item.uuid05fe6ce8-1f2d-41ba-a367-cbdb8f06ffd3
dev_item.fsid014c9d24-339c-482e-8f06-9284e4a7bc40 [match]
dev_item.type0
dev_item.total_bytes355938074624
dev_item.bytes_used355792322560
dev_item.io_align4096
dev_item.io_width4096
dev_item.sector_size4096
dev_item.devid1
dev_item.dev_group0
dev_item.seek_speed0
dev_item.bandwidth0
dev_item.generation0
sys_chunk_array[2048]:
item 0 key (FIRST_CHUNK_TREE CHUNK_ITEM 0)
length 4194304 owner 2 stripe_len 65536 type SYSTEM
io_align 4096 io_width 4096 sector_size 4096
num_stripes 1 sub_stripes 0
stripe 0 devid 1 offset 0
dev_uuid 05fe6ce8-1f2d-41ba-a367-cbdb8f06ffd3
backup_roots[4]:
backup 0:
backup_tree_root:42598400gen: 220522level: 1
backup_chunk_root:131072gen: 219209level: 1
backup_extent_root:26460160gen: 220522level: 2
backup_fs_root:51347456gen: 220523level: 2
backup_dev_root:4472832gen: 220520level: 1
backup_csum_root:26558464gen: 220522level: 2
backup_total_bytes:355938074624
backup_bytes_used:344504741888
backup_num_devices:1

backup 1:
backup_tree_root:52363264gen: 220523level: 1
backup_chunk_root:131072gen: 219209level: 1
backup_extent_root:51806208gen: 220523level: 2
backup_fs_root:51347456gen: 220523level: 2
backup_dev_root:4472832gen: 220520level: 1
backup_csum_root:52461568gen: 220523level: 2
backup_total_bytes:355938074624
backup_bytes_used:344504729600
backup_num_devices:1

backup 2:
backup_tree_root:25018368gen: 220524level: 1
backup_chunk_root:131072gen: 219209level: 1
backup_extent_root:21479424gen: 220524level: 2
backup_fs_root:53084160gen: 220524level: 2
backup_dev_root:4472832gen: 220520level: 1
backup_csum_root:53379072gen: 220524level: 2
backup_total_bytes:355938074624
backup_bytes_used:344504737792
backup_num_devices:1

backup 3:
backup_tree_root:21921792gen: 220521level: 1
backup_chunk_root:

Re: [PATCH] btrfs: tree-checker: Fix misleading group system information

2018-11-05 Thread David Sterba
On Mon, Nov 05, 2018 at 06:49:09PM +0800, Shaokun Zhang wrote:
> block_group_err shows the group system as a decimal value with a '0x'
> prefix, which is somewhat misleading.

Thanks. As this is user visible I'll add stable tag and get it to 4.20
in the next rc.


Re: [PATCH v4] Btrfs: fix deadlock on tree root leaf when finding free extent

2018-11-05 Thread Filipe Manana
On Mon, Nov 5, 2018 at 4:34 PM David Sterba  wrote:
>
> On Mon, Nov 05, 2018 at 04:30:35PM +, Filipe Manana wrote:
> > On Mon, Nov 5, 2018 at 4:29 PM David Sterba  wrote:
> > >
> > > On Wed, Oct 24, 2018 at 01:48:40PM +0100, Filipe Manana wrote:
> > > > > Ah ok makes sense.  Well in that case lets just make 
> > > > > btrfs_read_locked_inode()
> > > > > take a path, and allocate it in btrfs_iget, that'll remove the ugly
> > > > >
> > > > > if (path != in_path)
> > > >
> > > > You mean the following on top of v4:
> > > >
> > > > https://friendpaste.com/6XrGXb5p0RSJGixUFYouHg
> > > >
> > > > Not much different, just saves one such if statement. I'm ok with that.
> > >
> > > Now in misc-next with v4 and the friendpaste incremental as
> > >
> > > https://github.com/kdave/btrfs-devel/commit/efcfd6c87d28b3aa9bcba52d7c1e1fc79a2dad69
> >
> > Please don't add the incremental. It's buggy. It was meant to figure
> > out what Josef was saying. That's why I haven't sent a V5.
>
> Ok dropped, I'll wait for a proper patch.

It's V4, the last sent version. Just forget the incremental.
Thanks.


Re: [PATCH v4] Btrfs: fix deadlock on tree root leaf when finding free extent

2018-11-05 Thread David Sterba
On Mon, Nov 05, 2018 at 04:30:35PM +, Filipe Manana wrote:
> On Mon, Nov 5, 2018 at 4:29 PM David Sterba  wrote:
> >
> > On Wed, Oct 24, 2018 at 01:48:40PM +0100, Filipe Manana wrote:
> > > > Ah ok makes sense.  Well in that case lets just make 
> > > > btrfs_read_locked_inode()
> > > > take a path, and allocate it in btrfs_iget, that'll remove the ugly
> > > >
> > > > if (path != in_path)
> > >
> > > You mean the following on top of v4:
> > >
> > > https://friendpaste.com/6XrGXb5p0RSJGixUFYouHg
> > >
> > > Not much different, just saves one such if statement. I'm ok with that.
> >
> > Now in misc-next with v4 and the friendpaste incremental as
> >
> > https://github.com/kdave/btrfs-devel/commit/efcfd6c87d28b3aa9bcba52d7c1e1fc79a2dad69
> 
> Please don't add the incremental. It's buggy. It was meant to figure
> out what Josef was saying. That's why I haven't sent a V5.

Ok dropped, I'll wait for a proper patch.


Re: [PATCH v4] Btrfs: fix deadlock on tree root leaf when finding free extent

2018-11-05 Thread Filipe Manana
On Mon, Nov 5, 2018 at 4:29 PM David Sterba  wrote:
>
> On Wed, Oct 24, 2018 at 01:48:40PM +0100, Filipe Manana wrote:
> > > Ah ok makes sense.  Well in that case lets just make 
> > > btrfs_read_locked_inode()
> > > take a path, and allocate it in btrfs_iget, that'll remove the ugly
> > >
> > > if (path != in_path)
> >
> > You mean the following on top of v4:
> >
> > https://friendpaste.com/6XrGXb5p0RSJGixUFYouHg
> >
> > Not much different, just saves one such if statement. I'm ok with that.
>
> Now in misc-next with v4 and the friendpaste incremental as
>
> https://github.com/kdave/btrfs-devel/commit/efcfd6c87d28b3aa9bcba52d7c1e1fc79a2dad69

Please don't add the incremental. It's buggy. It was meant to figure
out what Josef was saying. That's why I haven't sent a V5.


Re: [PATCH v4] Btrfs: fix deadlock on tree root leaf when finding free extent

2018-11-05 Thread David Sterba
On Wed, Oct 24, 2018 at 01:48:40PM +0100, Filipe Manana wrote:
> > Ah ok makes sense.  Well in that case lets just make 
> > btrfs_read_locked_inode()
> > take a path, and allocate it in btrfs_iget, that'll remove the ugly
> >
> > if (path != in_path)
> 
> You mean the following on top of v4:
> 
> https://friendpaste.com/6XrGXb5p0RSJGixUFYouHg
> 
> Not much different, just saves one such if statement. I'm ok with that.

Now in misc-next with v4 and the friendpaste incremental as

https://github.com/kdave/btrfs-devel/commit/efcfd6c87d28b3aa9bcba52d7c1e1fc79a2dad69


Re: [PATCH 1/8] btrfs: Remove extent_io_ops::fill_delalloc

2018-11-05 Thread David Sterba
On Thu, Nov 01, 2018 at 02:09:46PM +0200, Nikolay Borisov wrote:
> This callback is called only from writepage_delalloc which in turn
> is guaranteed to be called from the data page writeout path. In the end
> there is no reason to have the call to this function to be indirected
> via the extent_io_ops structure. This patch removes the callback
> definition, exports the function and calls it directly. No functional
> changes.
> 
> Signed-off-by: Nikolay Borisov 
> ---
>  fs/btrfs/ctree.h |  3 +++
>  fs/btrfs/extent_io.c | 14 ++
>  fs/btrfs/extent_io.h |  5 -
>  fs/btrfs/inode.c | 10 +-
>  4 files changed, 14 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 68ca41dbbef3..dbeb5b2486d5 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -3186,6 +3186,9 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
>   struct btrfs_trans_handle *trans, int mode,
>   u64 start, u64 num_bytes, u64 min_size,
>   loff_t actual_len, u64 *alloc_hint);
> +int run_delalloc_range(void *private_data, struct page *locked_page, u64 
> start,

Functions exported in .h should have the btrfs_ prefix, updated in the
patch.

> +u64 end, int *page_started, unsigned long *nr_written,
> +struct writeback_control *wbc);
>  extern const struct dentry_operations btrfs_dentry_operations;
>  
>  /* ioctl.c */
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 6877a74c7469..2e6191aa25f3 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -3205,7 +3205,7 @@ static void update_nr_written(struct writeback_control 
> *wbc,
>  /*
>   * helper for __extent_writepage, doing all of the delayed allocation setup.
>   *
> - * This returns 1 if our fill_delalloc function did all the work required
> + * This returns 1 if run_delalloc_range function did all the work required
>   * to write the page (copy into inline extent).  In this case the IO has
>   * been started and the page is already unlocked.
>   *
> @@ -3226,7 +3226,7 @@ static noinline_for_stack int writepage_delalloc(struct 
> inode *inode,
>   int ret;
>   int page_started = 0;
>  
> - if (epd->extent_locked || !tree->ops || !tree->ops->fill_delalloc)
> + if (epd->extent_locked)
>   return 0;
>  
>   while (delalloc_end < page_end) {
> @@ -3239,15 +3239,13 @@ static noinline_for_stack int 
> writepage_delalloc(struct inode *inode,
>   delalloc_start = delalloc_end + 1;
>   continue;
>   }
> - ret = tree->ops->fill_delalloc(inode, page,
> -delalloc_start,
> -delalloc_end,
> -_started,
> -nr_written, wbc);
> + ret = run_delalloc_range(inode, page, delalloc_start,
> +  delalloc_end, _started,
> +  nr_written, wbc);
>   /* File system has been set read-only */
>   if (ret) {
>   SetPageError(page);
> - /* fill_delalloc should be return < 0 for error
> + /* run_delalloc_range should return < 0 for error

Please don't use this style of comments, fixed.


Re: [PATCH 0/8] Removal of optional hooks from struct extent_io_ops

2018-11-05 Thread David Sterba
On Thu, Nov 01, 2018 at 02:09:45PM +0200, Nikolay Borisov wrote:
> extent_io_ops has a set of 8 optional hooks which are set only for data and 
> freespace inodes. The majority of them actually deal with delallocs in one 
> way 
> or another. Inspecting the code it transpired that there is actually no need 
> to
> have them as function pointers in a structure. Data/freespace inodes can 
> easily
> be distinguished from the btree_inode (which is pending removal anyway) by 
> inspecting extent_io_tree::private_data. This member is set by all 
> data/freespace
> inodes. This series exploits this fact to remove the majority of them. 
> Others, 
> such as fill_delalloc, writepage_start_hook and writepage_end_io_hook are 
> always
> called from the data writeout path and can be directly called without having 
> to
> check whether the respective pointers are set. 
> 
> This series has undergone multiple xfstest runs and no regressions were 
> identified. Additionally all but run_delalloc_range functions are given more 
> descriptive names, related to their actual intent. 
> 
> Nikolay Borisov (8):
>   btrfs: Remove extent_io_ops::fill_delalloc
>   btrfs: Remove extent_io_ops::writepage_start_hook
>   btrfs: Remove extent_io_ops::writepage_end_io_hook
>   btrfs: Remove extent_io_ops::check_extent_io_range callback
>   btrfs: Remove extent_io_ops::set_bit_hook extent_io callback
>   btrfs: Remove extent_io_ops::clear_bit_hook callback
>   btrfs: Remove extent_io_ops::merge_extent_hook callback
>   btrfs: Remove extent_io_ops::split_extent_hook callback

Added to misc-next, thanks.


Re: [PATCH v2 0/2] Enhance btrfs_verify_dev_extents() to do more checks on dev extents

2018-11-05 Thread David Sterba
On Fri, Oct 05, 2018 at 05:45:53PM +0800, Qu Wenruo wrote:
> Inspired by Hans' possibly flawed DUP chunk allocator, add the following
> dev extents checker:
> 
> 1) Dev extent overlap check
>Dev extents don't use extent_cache so it can't report dev extents
>overlap.
>So manually check dev extents overlap.
>This check is pretty simple since we're already iterating dev extents
>by its physical offset, we only need to remember previous checked dev
>extents to do such check.
> 
> 2) Dev extent end check
>No dev extent should go beyond device boundary.
> 
> These two checks are pretty cheap so it shouldn't bring any performance
> overhead.
> 
> Changelog:
> v2:
>   Add "Link:" tag for the first patch.
>   Move the actual check into verify_one_dev_extent() for the 2nd patch.

Moved from for-next topic branch to misc-next, with some changelog
updates. Thanks.


Re: [PATCH v5 0/4] btrfs: Refactor find_free_extent()

2018-11-05 Thread David Sterba
On Fri, Nov 02, 2018 at 09:39:46AM +0800, Qu Wenruo wrote:
> Can be fetched from github:
> https://github.com/adam900710/linux/tree/refactor_find_free_extent
> 
> Which is based on david's misc-4.20 branch.

> v5:

Now moved from for-next topic branch to misc-next, from now on please
send updates as separate patches. Thanks.


Re: [PATCH] btrfs: tree-checker: Fix misleading group system information

2018-11-05 Thread Qu Wenruo


On 2018/11/5 下午7:33, Zhangshaokun wrote:
> Hi Qu,
> 
> On 2018/11/5 19:03, Qu Wenruo wrote:
>>
>>
>> On 2018/11/5 下午6:49, Shaokun Zhang wrote:
>>> block_group_err shows the group system as a decimal value with a '0x'
>>> prefix, which is somewhat misleading.
>>>
>>> Fix it to print hexadecimal, as was intended.
>>>
>>> Cc: David Sterba  
>>> Cc: Chris Mason 
>>> Cc: Josef Bacik  
>>> Signed-off-by: Shaokun Zhang 
>>
>> Reviewed-by: Qu Wenruo 
>>
>> BTW, did you catch it with some real world case or just by looking into
>> the code?
> 
> I made a mistake (0x%d) when debugging my code, so I grepped for the similar
> format in the kernel code and came across this typo, a trivial patch.

Ok, that's fine.

Just a small tip for your further involvement in the kernel: for such a small
fix, there is really no need to bother all the maintainers.

You could just use "git blame" to find who is causing the problem, in
this case it's me unfortunately :( , and Cc that guy.

Furthermore, you could add a "fixes:" tag.
About these common tags, you could refer to 'Describe your changes'
section of 'Documentation/process/submitting-patches.rst'.

Thanks,
Qu

> 
> Thanks,
> Shaokun
> 
>>
>> Thanks,
>> Qu
>>
>>> ---
>>>  fs/btrfs/tree-checker.c | 2 +-
>>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>>
>>> diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
>>> index cab0b1f..efcf89a 100644
>>> --- a/fs/btrfs/tree-checker.c
>>> +++ b/fs/btrfs/tree-checker.c
>>> @@ -440,7 +440,7 @@ static int check_block_group_item(struct btrfs_fs_info 
>>> *fs_info,
>>> type != (BTRFS_BLOCK_GROUP_METADATA |
>>>BTRFS_BLOCK_GROUP_DATA)) {
>>> block_group_err(fs_info, leaf, slot,
>>> -"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 
>>> 0x%llu or 0x%llx",
>>> +"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 
>>> 0x%llx or 0x%llx",
>>> type, hweight64(type),
>>> BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA,
>>> BTRFS_BLOCK_GROUP_SYSTEM,
>>>
>>
> 



signature.asc
Description: OpenPGP digital signature


Re: [PATCH] Btrfs: incremental send, fix infinite loop when apply children dir moves

2018-11-05 Thread Qu Wenruo


On 2018/11/5 下午7:11, Filipe Manana wrote:
> On Mon, Nov 5, 2018 at 4:10 AM robbieko  wrote:
>>
>> Filipe Manana 於 2018-10-30 19:36 寫到:
>>> On Tue, Oct 30, 2018 at 7:00 AM robbieko  wrote:

 From: Robbie Ko 

 In apply_children_dir_moves, we first create an empty list (stack),
 then we get an entry from pending_dir_moves and add it to the stack,
 but we didn't delete the entry from rb_tree.

 So, in add_pending_dir_move, we create a new entry and then use the
 parent_ino in the current rb_tree to find the corresponding entry,
 and if so, add the new entry to the corresponding list.

 However, the entry may have been added to the stack, causing new
 entries to be added to the stack as well.

I'm not a send guy, so I can totally be wrong, but that 'may' word seems
to hide the demon.


 Finally, each time we take the first entry from the stack and start
 processing, it ends up with an infinite loop.

 Fix this problem by remove node from pending_dir_moves,
 avoid add new pending_dir_move to error list.
>>>
>>> I can't parse that explanation.
>>> Can you give a concrete example (reproducer) or did this come out of
>>> thin air?
>>>
>>> Thanks.
>>>
>>
>> I am sorry that I replied so late.
>>
>> I have no way to give a simple example.
>> But I can provide a btrfs image file
>> You can restore the Image via btrfs-image
>> Then directly command "btrfs send -e -p parent send -f dump_file"

According to the name, it doesn't look like a real world case, but some
more or less manually crafted image.
It shouldn't be that hard to describe the root cause in details if it's
crafted.

Or, if it's an image caused by some stress test, then I really hope you
could locate the direct and root cause, or at least minimize the image.
The extra noise will really take a lot of time from reviewer.

IMHO, it shouldn't be that hard to locate the key/key range that send
loops, with that located it should provide some clue to further pin down
the root cause.

I totally understand that everyone has their own work, if you can't
really spare time for this, would you please upload the image to public
for anyone (me for example) to look into the case?

Thanks,
Qu

>> Infinite loop will occur.
>> I use ubuntu 16.04, kernel 4.15.0.36-generic can be stable reproduce
> 
> You have been occasionally submitting fixes for send/receive for a few
> years now, and you know already
> that I always ask for a changelog that describes well the problem and
> an example/reproducer.
> 
> Why did you do this?
> 
> What I can read from your answer is that you were too lazy to extract
> a reproducer from that image.
> Just made some change that fixes the infinite loop and because it
> apparently works you're done with it,
> Without an example at least, I don't think you or anyone can fully
> understand the problem, and if what
> you have (despite somewhat making theoretical sense) is really a good
> solution or just a workaround for
> the cause of the problem - after all if you can't give an example, you
> can't explain how in practice such loop
> of dependencies between directories happens. This, as with most
> send/receive problems, is a pure sequential
> and deterministic problem so there's really no excuse for not getting
> a reproducer.
> 
> Without an example, an explanation how it happens in the real world,
> does one know that your change is
> fixing the problem in the right place and not introducing other
> problems? Like the receiver not getting some
> changes (missing directories, files, or renames, etc).
> 
> Tests are not just to prove a change is correct, they exist to catch
> and prevent regressions in the future too.
> 
> You can do better than that.
> 
>>
>> Image file, please refer to the attachment.
>>
>> Thanks.
>>
>>

 Signed-off-by: Robbie Ko 
 ---
  fs/btrfs/send.c | 11 ---
  1 file changed, 8 insertions(+), 3 deletions(-)

 diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
 index 094cc144..5be83b5 100644
 --- a/fs/btrfs/send.c
 +++ b/fs/btrfs/send.c
 @@ -3340,7 +3340,8 @@ static void free_pending_move(struct send_ctx
 *sctx, struct pending_dir_move *m)
 kfree(m);
  }

 -static void tail_append_pending_moves(struct pending_dir_move *moves,
 +static void tail_append_pending_moves(struct send_ctx *sctx,
 + struct pending_dir_move *moves,
   struct list_head *stack)
  {
 if (list_empty(&moves->list)) {
 @@ -3351,6 +3352,10 @@ static void tail_append_pending_moves(struct
 pending_dir_move *moves,
 list_add_tail(&moves->list, stack);
 list_splice_tail(&list, stack);
 }
 +   if (!RB_EMPTY_NODE(&moves->node)) {
 +   rb_erase(&moves->node, &sctx->pending_dir_moves);
 +   RB_CLEAR_NODE(&moves->node);
 +   }
  }

  static int 

Re: BTRFS did it's job nicely (thanks!)

2018-11-05 Thread Austin S. Hemmelgarn

On 11/4/2018 11:44 AM, waxhead wrote:

Sterling Windmill wrote:

Out of curiosity, what led to you choosing RAID1 for data but RAID10
for metadata?

I've flip-flopped between these two modes myself after finding out
that BTRFS RAID10 doesn't work how I would've expected.

Wondering what made you choose your configuration.

Thanks!
Sure,


The "RAID"1 profile for data was chosen to maximize disk space 
utilization since I got a lot of mixed size devices.


The "RAID"10 profile for metadata was chosen simply because it *feels* a 
bit faster for some of my (previous) workload which was reading a lot of 
small files (which I guess was embedded in the metadata). While I never 
remembered that I got any measurable performance increase the system 
simply felt smoother (which is strange since "RAID"10 should hog more 
disks at once).


I would love to try "RAID"10 for both data and metadata, but I have to 
delete some files first (or add yet another drive).


Would you like to elaborate a bit more yourself about how BTRFS "RAID"10 
does not work as you expected?


As far as I know BTRFS' version of "RAID"10 means it ensures 2 copies (1 
replica) are striped over as many disks as it can (as long as there is free 
space).


So if I am not terribly mistaken, a "RAID"10 with 20 devices will stripe 
over (20/2) x 2 and if you run out of space on 10 of the devices it will 
continue to stripe over (5/2) x 2. So your stripe width varies with the 
available space essentially... I may be terribly wrong about this (until 
someone corrects me that is...)
He's probably referring to the fact that instead of there being a 
roughly 50% chance of it surviving the failure of at least 2 devices 
like classical RAID10 is technically able to do, it's currently 
functionally 100% certain it won't survive more than one device failing.




Salvage files from broken btrfs

2018-11-05 Thread M. Klingmann
On 03.11.2018 at 02:05 Qu Wenruo wrote:
> On 2018/11/3 上午1:18, M. Klingmann wrote:
>> On 02.11.2018 at 15:45 Qu Wenruo wrote:
>>> On 2018/11/2 下午10:30, M. Klingmann wrote:
 On 31.10.2018 at 01:03 Qu Wenruo wrote:
> My plan for such recovery is:
>
> 1) btrfs ins dump-super to make sure system chunk array is valid
> 2) btrfs-find-root to find any valid chunk tree blocks
> 3) pass that chunk tree bytenr to btrfs-restore
>Unfortunately, btrfs-restore doesn't support specifying chunk root
>yet. But it's pretty easy to add such support.
>
> So, please provide the "btrfs ins dump-super -Ffa" output to start with.
 Following your plan, I did 1) and 2).
 As 2) failed (see below), is there anything I can do to find the tree
 bytenr to supply btrfs-restore with it?

 1) Here's the output given by "btrfs-show-super -Ffa":

 superblock: bytenr=65536, device=sdcard.iso
 -
 csum            0xb8e15dd7 [match]
>> [snip]
 2) "btrfs-find-root" yields "Couldn't read chunk root; Open ctree failed".
>>> It's not plain "btrfs-find-root" but "btrfs-find-root -o 5".
>>>
>>> And you should use btrfs-progs v4.17.1, not the old v4.4.
>>> The ability to continue search even if chunk tree get corrupted is added
>>> in v4.5, and I strongly recommend to use latest (v4.17.1) for a lot of
>>> fixes and extra debug output.
>>>
>>> If you can't find any handy way to update btrfs-progs, you could use
>>> Archlinux iso as a rescue OS to use the latest btrfs-progs.
>> Using Archlinux in fact is the easiest way to use version 4.17.1
>> (Archlinux for 2018-11-01).
>>
>> Here's the output from "btrfs-find-root sdcard.iso":
>>
>> WARNING: cannot read chunk root, continue anyway
>> Superblock thinks the generation is 1757933
>> Superblock thinks the level is 0
>>
>> Here's the output using "btrfs-find-root -o 5 sdcard.iso":
>>
>> WARNING: cannot read chunk root, continue anyway
>> Superblock doesn't contain generation info for root 5
>> Superblock doesn't contain the level info for root 5
> No other output at all?
>
> That means the whole 8M range of system chunk get corrupted.
> Thus really no way to get any meaningful data out of the filesystem,
> unfortunately.
>
> Thanks,
> Qu
That's a pity. So I'm back to the hex editor.
I hope to find another angle before searching for file content.
Thank you for your efforts anyway.
Cheers,
Mirko


Btrfs progs release 4.19

2018-11-05 Thread David Sterba
Hi,

btrfs-progs version 4.19 has been released.

The version 4.18 was skipped to keep the time of release close to kernel. The
sort-of promise that 'progs version X supports features from kernel X' does not
hold for the user accessible ioctls to list subvolumes. As this is not a
critical feature that's missing, hopefully this is bearable.

On the downside this blocked the whole 4.18 release as this is a user interface
change that must be done right on the first try. I don't want to repeat this in
future releases so the kernel/userspace feature parity will be more relaxed.

Changes since 4.19-rc1: fix test failure for check --mode=lowmem

Changes:

* check: support repair of fs with free-space-tree feature
* core:
  * port delayed ref infrastructure from kernel
  * support write to free space tree
* dump-tree: new options for BFS and DFS enumeration of b-trees
* quota: rescan is now done automatically after 'assign'
* btrfstune: incomplete fix to uuid change
* subvol: fix 255 char limit checks
* completion: complete block devices and now regular files too
* docs:
  * ship uncompressed manual pages
  * btrfsck uses a manual page link instead of symlink
* other
  * improved error handling
  * docs
  * new tests

Tarballs: https://www.kernel.org/pub/linux/kernel/people/kdave/btrfs-progs/
Git: git://git.kernel.org/pub/scm/linux/kernel/git/kdave/btrfs-progs.git

Shortlog:

David Sterba (9):
  btrfs-progs: btrfstune: allow to continue uuid change
  btrfs-progs: tests: renumber last fsck test to 036-rescan-not-kicked-in
  btrfs-progs: docs: use manual page link instead of symlink
  btrfs-progs: build: remove gzip dependency
  btrfs-progs: docs: update clean target file masks
  btrfs-progs: clean up .gitignore
  btrfs-progs: tests: add runtime check for free-space-tree
  btrfs-progs: convert strerror to implicit %m
  btrfs-progs: update CHANGES for v4.19

Mike Gilbert (1):
  btrfs-progs: docs: install uncompressed manual pages

Misono Tomohiro (3):
  btrfs-progs: doc: update manual page of btrfs subvolume
  btrfs-progs: ioctl/libbtrfsutil: add 3 definitions of new unprivileged 
ioctl
  libbtrfsutil: factor out btrfs_util_subvolume_info_fd

Nikolay Borisov (23):
  btrfs-progs: tests: add test for missing device delete error value
  btrfs-progs: add __free_extent2 function
  btrfs-progs: add alloc_reserved_tree_block2 function
  btrfs-progs: Add delayed refs infrastructure
  btrfs-progs: Make btrfs_write_dirty_block_groups take only trans argument
  btrfs-progs: Wire up delayed refs
  btrfs-progs: Remove old delayed refs infrastructure
  btrfs-progs: Remove __free_extent2, now unused
  btrfs-progs: Merge alloc_reserved_tree_block2 and 
alloc_reserved_tree_block
  btrfs-progs: Add support for freespace tree in btrfs_read_fs_root
  btrfs-progs: Add extent buffer bitmap manipulation infrastructure
  btrfs-progs: Replace homegrown bitops related functions with kernel 
counterparts
  btrfs-progs: Implement find_*_bit_le operations
  btrfs-progs: Pull free space tree related code from kernel
  btrfs-progs: Hook FST code in extent (de)alloc
  btrfs-progs: Add freespace tree as compat_ro supported feature
  btrfs-progs: check: Add support for freespace tree fixing
  btrfs-progs: tests: Test for FST corruption detection/repair
  btrfs-progs: check: lowmem: Factor out inline extent checking code in its 
own function
  btrfs-progs: check: lowmem: Refactor extent len test in 
check_file_extent_inline
  btrfs-progs: check: lowmem: Refactor extent type checks in 
check_file_extent
  btrfs-progs: btrfstune: Remove fs_info arg from change_device_uuid
  btrfs-progs: btrfstune: Rename change_header_uuid to 
change_buffer_header_uuid

Qu Wenruo (22):
  btrfs-progs: transaction: do proper error handling in transaction commit
  btrfs-progs: completion: use _filedir to replace _btrfs_devs
  btrfs-progs: completion: let dump-tree/dump-super/inode-resolve accept 
any file
  btrfs-progs: print-tree: skip deprecated blockptr / nodesize output
  btrfs-progs: exit gracefully if we hit ENOSPC when allocating tree block
  btrfs-progs: exit gracefully when root dir item repair fails
  btrfs-progs: only warn if there are leaked extent buffers after 
transaction abort
  btrfs-progs: fix infinite loop when bad key order repair fails
  btrfs-progs: exit gracefully when device extent allocation fails
  btrfs-progs: rescue-super: don't double free fs_devices
  btrfs-progs: qgroup: don't return 1 if qgroup is marked inconsistent 
during relationship assignment
  btrfs-progs: convert: Make read_disk_extent return more -EIO instead of -1
  btrfs-progs: convert: Output meaningful error messages for create_image
  btrfs-progs: image: Warn about log tree generation mismatch when restoring
  btrfs-progs: Replace root parameter using fs_info for 

Re: [PATCH 3/3] btrfs: add new filter for file cloning error translation

2018-11-05 Thread Nikolay Borisov



On 5.11.18 г. 13:15 ч., fdman...@kernel.org wrote:
> From: Filipe Manana 
> 
> A bug in file cloning/reflinking was recently found that affected both
> Btrfs and XFS, which was caused by allowing the cloning of an eof block
> into the middle of a file when the eof is not aligned to the filesystem's
> block size.
> 
> The fix consists of returning the errno -EINVAL to user space when the
> arguments passed to the system call lead to the scenario of data
> corruption. However this overlaps with some cases where the system call,
> in Btrfs, returned -EOPNOTSUPP, which means we are trying to reflink
> inline extents. That is unsupported in Btrfs due to the huge complexity
> of supporting it (due to copying and trimming inline extents, deal with
> eventual compression, etc).
> 
> We have a few btrfs test cases that verify that attempts to clone inline
> extents result in a failure, and are currently expecting an -EOPNOTSUPP error
> message from the output of the cloner program. So create a filter that
> converts error messages related to the -EOPNOTSUPP error to messages
> related to the -EINVAL error, so that the test can run both on patched
> and non-patched linux kernels.
> 
> The corresponding btrfs patch for the linux kernel is titled:
> 
>  "Btrfs: fix data corruption due to cloning of eof block"
> 
> And the VFS change that introduces the -EINVAL error return was introduced
> by the following linux kernel commit (landed in 4.20-rc1):
> 
>  07d19dc9fbe9 ("vfs: avoid problematic remapping requests into partial EOF 
> block")
> 
> The btrfs patch is not yet in Linus' tree (it was submitted around the
> same time as this change) and the VFS change was introduced in 4.20-rc1.
> 
> Signed-off-by: Filipe Manana 

Reviewed-by: Nikolay Borisov 

> ---
>  common/filter.btrfs | 17 +
>  tests/btrfs/035 |  3 ++-
>  tests/btrfs/035.out |  2 +-
>  tests/btrfs/096 |  7 ---
>  tests/btrfs/096.out |  2 +-
>  tests/btrfs/112 | 25 +
>  tests/btrfs/112.out | 48 
>  tests/btrfs/113 |  4 +++-
>  tests/btrfs/113.out |  2 +-
>  9 files changed, 70 insertions(+), 40 deletions(-)
> 
> diff --git a/common/filter.btrfs b/common/filter.btrfs
> index dda85776..d4169cc6 100644
> --- a/common/filter.btrfs
> +++ b/common/filter.btrfs
> @@ -97,5 +97,22 @@ _filter_btrfs_qgroup_assign_warnings()
>   -e "/quotas may be inconsistent, rescan needed/d"
>  }
>  
> +# Long ago we found that attempting to clone inline extents resulted in 
> hitting
> +# a BUG_ON() and then decided to not support such use cases by returning 
> errno
> +# -EOPNOTSUPP to user space. Later on, clone/reflink became a VFS API too, 
> since
> +# other filesystems (such as XFS) implemented this feature. After that we 
> found
> +# one scenario of data corruption due to allowing cloning an EOF block into 
> the
> +# middle of a file, and started to reject such scenario by returning the 
> errno
> +# -EINVAL to user space (this affected both Btrfs and XFS). Such scenario 
> often
> +# overlaps the detection of attempts to clone inline extents, since it is 
> done
> +# early on based only on the arguments passed to the clone system call (and
> +# btrfs' specific ioctl) before processing the source file extents.
> +# So replace error messages related to errno -EOPNOTSUPP to be the same as 
> the
> +# one we get from a -EINVAL errno.
> +_filter_btrfs_cloner_error()
> +{
> + sed -e "s/\(clone failed:\) Operation not supported/\1 Invalid 
> argument/g"
> +}
> +
>  # make sure this script returns success
>  /bin/true
> diff --git a/tests/btrfs/035 b/tests/btrfs/035
> index c9c09e16..a6f67d4f 100755
> --- a/tests/btrfs/035
> +++ b/tests/btrfs/035
> @@ -24,6 +24,7 @@ trap "_cleanup ; exit \$status" 0 1 2 3 15
>  # get standard environment, filters and checks
>  . ./common/rc
>  . ./common/filter
> +. ./common/filter.btrfs
>  
>  # real QA test starts here
>  _supported_fs btrfs
> @@ -49,7 +50,7 @@ $CLONER_PROG $SCRATCH_MNT/src $SCRATCH_MNT/src.clone2
>  snap_src_sz=`ls -lah $SCRATCH_MNT/src.clone1 | awk '{print $5}'`
>  echo "attempting ioctl (src.clone1 src)"
>  $CLONER_PROG -s 0 -d 0 -l ${snap_src_sz} \
> - $SCRATCH_MNT/src.clone1 $SCRATCH_MNT/src
> + $SCRATCH_MNT/src.clone1 $SCRATCH_MNT/src | _filter_btrfs_cloner_error
>  
>  # The clone operation should have failed. If it did not it meant we had data
>  # loss, because file "src.clone1" has an inline extent which is 10 bytes long
> diff --git a/tests/btrfs/035.out b/tests/btrfs/035.out
> index 3ea7d779..d810bb2b 100644
> --- a/tests/btrfs/035.out
> +++ b/tests/btrfs/035.out
> @@ -1,6 +1,6 @@
>  QA output created by 035
>  attempting ioctl (src.clone1 src)
> -clone failed: Operation not supported
> +clone failed: Invalid argument
>  File src data after attempt to clone from src.clone1 into src:
>  000 62 62 62 62 62 62 62 62 62 62 63 63 63 63 63 63
>  020 63 63 63 63
> diff --git 

Re: [PATCH] Btrfs: fix infinite loop on inode eviction after deduplication of eof block

2018-11-05 Thread Nikolay Borisov



On 5.11.18 г. 13:14 ч., fdman...@kernel.org wrote:
> From: Filipe Manana 
> 
> If we attempt to deduplicate the last block of a file A into the middle of
> a file B, and file A's size is not a multiple of the block size, we end
> rounding the deduplication length to 0 bytes, to avoid the data corruption
> issue fixed by commit de02b9f6bb65 ("Btrfs: fix data corruption when
> deduplicating between different files"). However a length of zero will
> cause the insertion of an extent state with a start value greater (by 1)
> than the end value, leading to a corrupt extent state that will trigger a
> warning and cause chaos such as an infinite loop during inode eviction.
> Example trace:
> 
>  [96049.833585] [ cut here ]
>  [96049.833714] WARNING: CPU: 0 PID: 24448 at fs/btrfs/extent_io.c:436 
> insert_state+0x101/0x120 [btrfs]
>  [96049.833767] CPU: 0 PID: 24448 Comm: xfs_io Not tainted 
> 4.19.0-rc7-btrfs-next-39 #1
>  [96049.833768] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
> rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
>  [96049.833780] RIP: 0010:insert_state+0x101/0x120 [btrfs]
>  [96049.833783] RSP: 0018:afd2c3707af0 EFLAGS: 00010282
>  [96049.833785] RAX:  RBX: 0004dfff RCX: 
> 0006
>  [96049.833786] RDX: 0007 RSI: 99045c143230 RDI: 
> 99047b2168a0
>  [96049.833787] RBP: 990457851cd0 R08: 0001 R09: 
> 
>  [96049.833787] R10: afd2c3707ab8 R11:  R12: 
> 9903b93b12c8
>  [96049.833788] R13: 0004e000 R14: afd2c3707b80 R15: 
> afd2c3707b78
>  [96049.833790] FS:  7f5c14e7d700() GS:99047b20() 
> knlGS:
>  [96049.833791] CS:  0010 DS:  ES:  CR0: 80050033
>  [96049.833792] CR2: 7f5c146abff8 CR3: 000115f4c004 CR4: 
> 003606f0
>  [96049.833795] DR0:  DR1:  DR2: 
> 
>  [96049.833796] DR3:  DR6: fffe0ff0 DR7: 
> 0400
>  [96049.833796] Call Trace:
>  [96049.833809]  __set_extent_bit+0x46c/0x6a0 [btrfs]
>  [96049.833823]  lock_extent_bits+0x6b/0x210 [btrfs]
>  [96049.833831]  ? _raw_spin_unlock+0x24/0x30
>  [96049.833841]  ? test_range_bit+0xdf/0x130 [btrfs]
>  [96049.833853]  lock_extent_range+0x8e/0x150 [btrfs]
>  [96049.833864]  btrfs_double_extent_lock+0x78/0xb0 [btrfs]
>  [96049.833875]  btrfs_extent_same_range+0x14e/0x550 [btrfs]
>  [96049.833885]  ? rcu_read_lock_sched_held+0x3f/0x70
>  [96049.833890]  ? __kmalloc_node+0x2b0/0x2f0
>  [96049.833899]  ? btrfs_dedupe_file_range+0x19a/0x280 [btrfs]
>  [96049.833909]  btrfs_dedupe_file_range+0x270/0x280 [btrfs]
>  [96049.833916]  vfs_dedupe_file_range_one+0xd9/0xe0
>  [96049.833919]  vfs_dedupe_file_range+0x131/0x1b0
>  [96049.833924]  do_vfs_ioctl+0x272/0x6e0
>  [96049.833927]  ? __fget+0x113/0x200
>  [96049.833931]  ksys_ioctl+0x70/0x80
>  [96049.833933]  __x64_sys_ioctl+0x16/0x20
>  [96049.833937]  do_syscall_64+0x60/0x1b0
>  [96049.833939]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
>  [96049.833941] RIP: 0033:0x7f5c1478ddd7
>  [96049.833943] RSP: 002b:7ffe15b196a8 EFLAGS: 0202 ORIG_RAX: 
> 0010
>  [96049.833945] RAX: ffda RBX:  RCX: 
> 7f5c1478ddd7
>  [96049.833946] RDX: 5625ece322d0 RSI: c0189436 RDI: 
> 0004
>  [96049.833947] RBP:  R08: 7f5c14a46f48 R09: 
> 0040
>  [96049.833948] R10: 0541 R11: 0202 R12: 
> 
>  [96049.833949] R13:  R14: 0004 R15: 
> 5625ece322d0
>  [96049.833954] irq event stamp: 6196
>  [96049.833956] hardirqs last  enabled at (6195): [] 
> console_unlock+0x503/0x640
>  [96049.833958] hardirqs last disabled at (6196): [] 
> trace_hardirqs_off_thunk+0x1a/0x1c
>  [96049.833959] softirqs last  enabled at (6114): [] 
> __do_softirq+0x370/0x421
>  [96049.833964] softirqs last disabled at (6095): [] 
> irq_exit+0xcd/0xe0
>  [96049.833965] ---[ end trace db7b05f01b7fa10c ]---
>  [96049.935816] R13:  R14: 5562e5259240 R15: 
> 7092b910
>  [96049.935822] irq event stamp: 6584
>  [96049.935823] hardirqs last  enabled at (6583): [] 
> console_unlock+0x503/0x640
>  [96049.935825] hardirqs last disabled at (6584): [] 
> trace_hardirqs_off_thunk+0x1a/0x1c
>  [96049.935827] softirqs last  enabled at (6328): [] 
> __do_softirq+0x370/0x421
>  [96049.935828] softirqs last disabled at (6313): [] 
> irq_exit+0xcd/0xe0
>  [96049.935829] ---[ end trace db7b05f01b7fa123 ]---
>  [96049.935840] [ cut here ]
>  [96049.936065] WARNING: CPU: 1 PID: 24463 at fs/btrfs/extent_io.c:436 
> insert_state+0x101/0x120 [btrfs]
>  [96049.936107] CPU: 1 PID: 24463 Comm: umount Tainted: GW 
> 4.19.0-rc7-btrfs-next-39 #1
>  [96049.936108] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
> 

Re: [PATCH] btrfs: tree-checker: Fix misleading group system information

2018-11-05 Thread Zhangshaokun
Hi Qu,

On 2018/11/5 19:03, Qu Wenruo wrote:
> 
> 
> On 2018/11/5 下午6:49, Shaokun Zhang wrote:
>> block_group_err shows the group system as a decimal value with a '0x'
>> prefix, which is somewhat misleading.
>>
>> Fix it to print hexadecimal, as was intended.
>>
>> Cc: David Sterba  
>> Cc: Chris Mason 
>> Cc: Josef Bacik  
>> Signed-off-by: Shaokun Zhang 
> 
> Reviewed-by: Qu Wenruo 
> 
> BTW, did you catch it with some real world case or just by looking into
> the code?

I made a mistake (0x%d) when debugged my code, so I grep the similar format
for the kernel code and came across this typo, a trivial patch.

Thanks,
Shaokun

> 
> Thanks,
> Qu
> 
>> ---
>>  fs/btrfs/tree-checker.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
>> index cab0b1f..efcf89a 100644
>> --- a/fs/btrfs/tree-checker.c
>> +++ b/fs/btrfs/tree-checker.c
>> @@ -440,7 +440,7 @@ static int check_block_group_item(struct btrfs_fs_info 
>> *fs_info,
>>  type != (BTRFS_BLOCK_GROUP_METADATA |
>> BTRFS_BLOCK_GROUP_DATA)) {
>>  block_group_err(fs_info, leaf, slot,
>> -"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 
>> 0x%llu or 0x%llx",
>> +"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 
>> 0x%llx or 0x%llx",
>>  type, hweight64(type),
>>  BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA,
>>  BTRFS_BLOCK_GROUP_SYSTEM,
>>
> 



[PATCH 3/3] btrfs: add new filter for file cloning error translation

2018-11-05 Thread fdmanana
From: Filipe Manana 

A bug in file cloning/reflinking was recently found that affected both
Btrfs and XFS, which was caused by allowing the cloning of an eof block
into the middle of a file when the eof is not aligned to the filesystem's
block size.

The fix consists of returning the errno -EINVAL to user space when the
arguments passed to the system call lead to the scenario of data
corruption. However this overlaps with some cases where the system call,
in Btrfs, returned -EOPNOTSUPP, which means we are trying to reflink
inline extents. That is unsupported in Btrfs due to the huge complexity
of supporting it (due to copying and trimming inline extents, deal with
eventual compression, etc).

We have a few btrfs test cases that verify that attempts to clone inline
extents result in a failure, and are currently expecting an -EOPNOTSUPP error
message from the output of the cloner program. So create a filter that
converts error messages related to the -EOPNOTSUPP error to messages
related to the -EINVAL error, so that the test can run both on patched
and non-patched linux kernels.

The corresponding btrfs patch for the linux kernel is titled:

 "Btrfs: fix data corruption due to cloning of eof block"

And the VFS change that introduces the -EINVAL error return was introduced
by the following linux kernel commit (landed in 4.20-rc1):

 07d19dc9fbe9 ("vfs: avoid problematic remapping requests into partial EOF 
block")

The btrfs patch is not yet in Linus' tree (it was submitted around the
same time as this change) and the VFS change was introduced in 4.20-rc1.

Signed-off-by: Filipe Manana 
---
 common/filter.btrfs | 17 +
 tests/btrfs/035 |  3 ++-
 tests/btrfs/035.out |  2 +-
 tests/btrfs/096 |  7 ---
 tests/btrfs/096.out |  2 +-
 tests/btrfs/112 | 25 +
 tests/btrfs/112.out | 48 
 tests/btrfs/113 |  4 +++-
 tests/btrfs/113.out |  2 +-
 9 files changed, 70 insertions(+), 40 deletions(-)

diff --git a/common/filter.btrfs b/common/filter.btrfs
index dda85776..d4169cc6 100644
--- a/common/filter.btrfs
+++ b/common/filter.btrfs
@@ -97,5 +97,22 @@ _filter_btrfs_qgroup_assign_warnings()
-e "/quotas may be inconsistent, rescan needed/d"
 }
 
+# Long ago we found that attempting to clone inline extents resulted in hitting
+# a BUG_ON() and then decided to not support such use cases by returning errno
+# -EOPNOTSUPP to user space. Later on, clone/reflink became a VFS API too, 
since
+# other filesystems (such as XFS) implemented this feature. After that we found
+# one scenario of data corruption due to allowing cloning an EOF block into the
+# middle of a file, and started to reject such scenario by returning the errno
+# -EINVAL to user space (this affected both Btrfs and XFS). Such scenario often
+# overlaps the detection of attempts to clone inline extents, since it is done
+# early on based only on the arguments passed to the clone system call (and
+# btrfs' specific ioctl) before processing the source file extents.
+# So replace error messages related to errno -EOPNOTSUPP to be the same as the
+# one we get from a -EINVAL errno.
+_filter_btrfs_cloner_error()
+{
+   sed -e "s/\(clone failed:\) Operation not supported/\1 Invalid 
argument/g"
+}
+
 # make sure this script returns success
 /bin/true
diff --git a/tests/btrfs/035 b/tests/btrfs/035
index c9c09e16..a6f67d4f 100755
--- a/tests/btrfs/035
+++ b/tests/btrfs/035
@@ -24,6 +24,7 @@ trap "_cleanup ; exit \$status" 0 1 2 3 15
 # get standard environment, filters and checks
 . ./common/rc
 . ./common/filter
+. ./common/filter.btrfs
 
 # real QA test starts here
 _supported_fs btrfs
@@ -49,7 +50,7 @@ $CLONER_PROG $SCRATCH_MNT/src $SCRATCH_MNT/src.clone2
 snap_src_sz=`ls -lah $SCRATCH_MNT/src.clone1 | awk '{print $5}'`
 echo "attempting ioctl (src.clone1 src)"
 $CLONER_PROG -s 0 -d 0 -l ${snap_src_sz} \
-   $SCRATCH_MNT/src.clone1 $SCRATCH_MNT/src
+   $SCRATCH_MNT/src.clone1 $SCRATCH_MNT/src | _filter_btrfs_cloner_error
 
 # The clone operation should have failed. If it did not it meant we had data
 # loss, because file "src.clone1" has an inline extent which is 10 bytes long
diff --git a/tests/btrfs/035.out b/tests/btrfs/035.out
index 3ea7d779..d810bb2b 100644
--- a/tests/btrfs/035.out
+++ b/tests/btrfs/035.out
@@ -1,6 +1,6 @@
 QA output created by 035
 attempting ioctl (src.clone1 src)
-clone failed: Operation not supported
+clone failed: Invalid argument
 File src data after attempt to clone from src.clone1 into src:
 000 62 62 62 62 62 62 62 62 62 62 63 63 63 63 63 63
 020 63 63 63 63
diff --git a/tests/btrfs/096 b/tests/btrfs/096
index e8552947..b9188e6e 100755
--- a/tests/btrfs/096
+++ b/tests/btrfs/096
@@ -21,6 +21,7 @@ _cleanup()
 # get standard environment, filters and checks
 . ./common/rc
 . ./common/filter
+. ./common/filter.btrfs
 
 # real QA test starts here
 _supported_fs btrfs
@@ -52,11 +53,11 @@ $XFS_IO_PROG -f -s 

[PATCH 2/3] generic: test attempt to reflink eof block into the middle of a file

2018-11-05 Thread fdmanana
From: Filipe Manana 

Test that we can not clone a range from a file A into the middle of a file B
when the range includes the last block of file A and file A's size is not
aligned with the filesystem's block size. Allowing such case would lead to
data corruption since the data between EOF and the end of its block is
undefined.

This is motivated by a bug recently found that affects both Btrfs and XFS
and is fixed by the following commits/patches for the linux kernel:

 07d19dc9fbe9 ("vfs: avoid problematic remapping requests into partial EOF 
block")
 b39989009bdb ("xfs: fix data corruption w/ unaligned reflink ranges")
 Btrfs: fix data corruption due to cloning of eof block

The VFS patch landed in kernel 4.20-rc1 and the XFS patch landed in 4.19.
The Btrfs fix is very recent and it is not yet in Linus' tree.

Signed-off-by: Filipe Manana 
---
 tests/generic/518 | 60 +++
 tests/generic/518.out | 10 +
 tests/generic/group   |  1 +
 3 files changed, 71 insertions(+)
 create mode 100755 tests/generic/518
 create mode 100644 tests/generic/518.out

diff --git a/tests/generic/518 b/tests/generic/518
new file mode 100755
index ..c75110d1
--- /dev/null
+++ b/tests/generic/518
@@ -0,0 +1,60 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# FS QA Test No. 518
+#
+# Test that we can not clone a range from a file A into the middle of a file B
+# when the range includes the last block of file A and file A's size is not
+# aligned with the filesystem's block size. Allowing such case would lead to
+# data corruption since the data between EOF and the end of its block is
+# undefined.
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/reflink
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_require_scratch_reflink
+
+rm -f $seqres.full
+
+_scratch_mkfs >>$seqres.full 2>&1
+_scratch_mount
+
+foo_size=$((256 * 1024 + 100)) # 256Kb + 100 bytes
+bar_size="1M"
+
+$XFS_IO_PROG -f -c "pwrite -S 0x3c 0 $foo_size" $SCRATCH_MNT/foo | 
_filter_xfs_io
+$XFS_IO_PROG -f -c "pwrite -S 0xb5 0 $bar_size" $SCRATCH_MNT/bar | 
_filter_xfs_io
+
+# Cloning the EOF block of a file into the middle of another file should fail
+# with an invalid argument error.
+$XFS_IO_PROG -c "reflink $SCRATCH_MNT/foo 0 512K $foo_size" $SCRATCH_MNT/bar
+
+# Unmount the filesystem and mount it again. This guarantees any file data in
+# the page cache is dropped.
+_scratch_cycle_mount
+
+# Verify no changes were made to the file.
+echo "File content after failed reflink:"
+od -A d -t x1 $SCRATCH_MNT/bar
+
+status=0
+exit
diff --git a/tests/generic/518.out b/tests/generic/518.out
new file mode 100644
index ..726c2073
--- /dev/null
+++ b/tests/generic/518.out
@@ -0,0 +1,10 @@
+QA output created by 518
+wrote 262244/262244 bytes at offset 0
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1048576/1048576 bytes at offset 0
+XXX Bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+XFS_IOC_CLONE_RANGE: Invalid argument
+File content after failed reflink:
+000 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5
+*
+1048576
diff --git a/tests/generic/group b/tests/generic/group
index 326d3a1d..ef24f578 100644
--- a/tests/generic/group
+++ b/tests/generic/group
@@ -520,3 +520,4 @@
 515 auto quick clone
 516 auto quick dedupe clone
 517 auto quick dedupe clone
+518 auto quick clone
-- 
2.11.0



[PATCH 1/3] generic: test attempt to dedup eof block into the middle of a file

2018-11-05 Thread fdmanana
From: Filipe Manana 

Test that deduplication of an entire file that has a size that is not
aligned to the filesystem's block size into the middle of a different
file does not corrupt the destination's file data by reflinking the last
(eof) block.

This test is motivated by a bug recently found that affects both Btrfs
and XFS, and is fixed by the following commits/patches for the linux
kernel:

 07d19dc9fbe9 ("vfs: avoid problematic remapping requests into partial EOF 
block")
 dceeb47b0ed6 ("xfs: fix data corruption w/ unaligned dedupe ranges")
 de02b9f6bb65 ("Btrfs: fix data corruption when deduplicating between different 
files")
 Btrfs: fix infinite loop on inode eviction after deduplication of eof block

The VFS patch was added to kernel 4.20-rc1 and the XFS and first Btrfs
patches were added to kernel 4.19. The second patch for Btrfs is very
recent and it is not yet in Linus' tree.

Signed-off-by: Filipe Manana 
---
 tests/generic/517 | 98 +++
 tests/generic/517.out | 45 +++
 tests/generic/group   |  1 +
 3 files changed, 144 insertions(+)
 create mode 100755 tests/generic/517
 create mode 100644 tests/generic/517.out

diff --git a/tests/generic/517 b/tests/generic/517
new file mode 100755
index ..601bb24e
--- /dev/null
+++ b/tests/generic/517
@@ -0,0 +1,98 @@
+#! /bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2018 SUSE Linux Products GmbH. All Rights Reserved.
+#
+# FS QA Test No. 517
+#
+# Test that deduplication of an entire file that has a size that is not aligned
+# to the filesystem's block size into the middle of a different file does not
+# corrupt the destination's file data by reflinking the last (eof) block.
+#
+seq=`basename $0`
+seqres=$RESULT_DIR/$seq
+echo "QA output created by $seq"
+tmp=/tmp/$$
+status=1   # failure is the default!
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+_cleanup()
+{
+   cd /
+   rm -f $tmp.*
+}
+
+# get standard environment, filters and checks
+. ./common/rc
+. ./common/filter
+. ./common/reflink
+
+# real QA test starts here
+_supported_fs generic
+_supported_os Linux
+_require_scratch_dedupe
+
+rm -f $seqres.full
+
+_scratch_mkfs >>$seqres.full 2>&1
+_scratch_mount
+
+# The first byte with a value of 0xae starts at an offset (2518890) which is not
+# a multiple of the block size.
+$XFS_IO_PROG -f \
+   -c "pwrite -S 0x6b 0 2518890" \
+   -c "pwrite -S 0xae 2518890 102398" \
+   $SCRATCH_MNT/foo | _filter_xfs_io
+
+# Create a second file with a length not aligned to the block size, whose bytes
+# all have the value 0x6b, so that its extent(s) can be deduplicated with the
+# first file.
+$XFS_IO_PROG -f -c "pwrite -S 0x6b 0 557771" $SCRATCH_MNT/bar | _filter_xfs_io
+
+# The file is filled with bytes having the value 0x6b from offset 0 to offset
+# 2518889 and with the value 0xae from offset 2518890 to offset 2621287.
+echo "File content before first deduplication:"
+od -t x1 $SCRATCH_MNT/foo
+
+# Now deduplicate the entire second file into a range of the first file that
+# also has all bytes with the value 0x6b. The destination range's end offset
+# must not be aligned to the block size and must be less than the offset of
+# the first byte with the value 0xae (byte at offset 2518890).
+$XFS_IO_PROG -c "dedupe $SCRATCH_MNT/bar 0 1957888 557771" $SCRATCH_MNT/foo \
+   | _filter_xfs_io
+
+# We should have exactly the same data we had before we asked for deduplication.
+echo "File content after first deduplication and before unmounting:"
+od -A d -t x1 $SCRATCH_MNT/foo
+
+# Unmount the filesystem and mount it again. This guarantees any file data in
+# the page cache is dropped.
+_scratch_cycle_mount
+
+# We should have exactly the same data we had before we asked for deduplication.
+echo "File content after first unmount:"
+od -A d -t x1 $SCRATCH_MNT/foo
+
+# Now do a similar test when trying to dedup just the last (eof) block of a file
+# into the middle of another file. This triggered a different bug on btrfs.
+$XFS_IO_PROG -f -c "pwrite -S 0xae 0 100" $SCRATCH_MNT/baz | _filter_xfs_io
+
+# Unmount the filesystem and mount it again before attempting to dedupe baz's
+# last block into foo. This is necessary to trigger that btrfs bug mentioned
+# before.
+_scratch_cycle_mount
+
+# Now attempt to dedupe the single block of baz into foo.
+$XFS_IO_PROG -c "dedupe $SCRATCH_MNT/baz 0 2519040 100" $SCRATCH_MNT/foo \
+| _filter_xfs_io
+
+# Now attempt to unmount the filesystem before reading from the file. This is
+# meant to trigger the btrfs bug which caused an infinite loop during inode
+# eviction.
+_scratch_cycle_mount
+
+# We should have exactly the same data we had before we asked for deduplication.
+echo "File content after second deduplication:"
+od -A d -t x1 $SCRATCH_MNT/foo
+
+status=0
+exit
diff --git a/tests/generic/517.out b/tests/generic/517.out
new file mode 100644
index ..137a9719
--- /dev/null
+++ 

[PATCH] Btrfs: fix data corruption due to cloning of eof block

2018-11-05 Thread fdmanana
From: Filipe Manana 

We currently allow cloning a range from a file which includes the last
block of the file even if the file's size is not aligned to the block
size. This is fine and useful when the destination file has the same size,
but when it does not and the range ends somewhere in the middle of the
destination file, it leads to corruption because the bytes between the EOF
and the end of the block have undefined data (when there is support for
discard/trimming they have a value of 0x00).

Example:

 $ mkfs.btrfs -f /dev/sdb
 $ mount /dev/sdb /mnt

 $ export foo_size=$((256 * 1024 + 100))
 $ xfs_io -f -c "pwrite -S 0x3c 0 $foo_size" /mnt/foo
 $ xfs_io -f -c "pwrite -S 0xb5 0 1M" /mnt/bar

 $ xfs_io -c "reflink /mnt/foo 0 512K $foo_size" /mnt/bar

 $ od -A d -t x1 /mnt/bar
 000 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5
 *
 0524288 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c
 *
 0786528 3c 3c 3c 3c 00 00 00 00 00 00 00 00 00 00 00 00
 0786544 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 *
 0790528 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5
 *
 1048576

The bytes in the range from 786532 (512Kb + 256Kb + 100 bytes) to 790527
(512Kb + 256Kb + 4Kb - 1) got corrupted, having now a value of 0x00 instead
of 0xb5.

This is similar to the problem we had for deduplication that got recently
fixed by commit de02b9f6bb65 ("Btrfs: fix data corruption when
deduplicating between different files").

Fix this by not allowing such operations to be performed and return the
errno -EINVAL to user space. This is what XFS is doing as well at the VFS
level. This change however now makes us return -EINVAL instead of
-EOPNOTSUPP for cases where the source range maps to an inline extent and
the destination range's end is smaller than the destination file's size,
since the detection of inline extents is done during the actual process of
dropping file extent items (at __btrfs_drop_extents()). Returning the
-EINVAL error is done early on and solely based on the input parameters
(offsets and length) and destination file's size. This makes us consistent
with XFS and anyone else supporting cloning since this case is now checked
at a higher level in the VFS and is where the -EINVAL will be returned
from starting with kernel 4.20 (the VFS change was introduced in 4.20-rc1
by commit 07d19dc9fbe9 ("vfs: avoid problematic remapping requests into
partial EOF block")). So this change is more geared towards stable kernels,
as it's unlikely the new VFS checks get removed intentionally.

A test case for fstests follows soon, as well as an update to filter
existing tests that expect -EOPNOTSUPP to accept -EINVAL as well.

CC:  # 4.4+
Signed-off-by: Filipe Manana 
---
 fs/btrfs/ioctl.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f3134fc69880..30e098970063 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4277,9 +4277,17 @@ static noinline int btrfs_clone_files(struct file *file, 
struct file *file_src,
goto out_unlock;
if (len == 0)
olen = len = src->i_size - off;
-   /* if we extend to eof, continue to block boundary */
-   if (off + len == src->i_size)
+   /*
+* If we extend to eof, continue to block boundary if and only if the
+* destination end offset matches the destination file's size, otherwise
+* we would be corrupting data by placing the eof block into the middle
+* of a file.
+*/
+   if (off + len == src->i_size) {
+   if (!IS_ALIGNED(len, bs) && destoff + len < inode->i_size)
+   goto out_unlock;
len = ALIGN(src->i_size, bs) - off;
+   }
 
if (len == 0) {
ret = 0;
-- 
2.11.0



[PATCH] Btrfs: fix infinite loop on inode eviction after deduplication of eof block

2018-11-05 Thread fdmanana
From: Filipe Manana 

If we attempt to deduplicate the last block of a file A into the middle of
a file B, and file A's size is not a multiple of the block size, we end up
rounding the deduplication length to 0 bytes, to avoid the data corruption
issue fixed by commit de02b9f6bb65 ("Btrfs: fix data corruption when
deduplicating between different files"). However a length of zero will
cause the insertion of an extent state with a start value greater (by 1)
than the end value, leading to a corrupt extent state that will trigger a
warning and cause chaos such as an infinite loop during inode eviction.
Example trace:

 [96049.833585] [ cut here ]
 [96049.833714] WARNING: CPU: 0 PID: 24448 at fs/btrfs/extent_io.c:436 
insert_state+0x101/0x120 [btrfs]
 [96049.833767] CPU: 0 PID: 24448 Comm: xfs_io Not tainted 
4.19.0-rc7-btrfs-next-39 #1
 [96049.833768] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
 [96049.833780] RIP: 0010:insert_state+0x101/0x120 [btrfs]
 [96049.833783] RSP: 0018:afd2c3707af0 EFLAGS: 00010282
 [96049.833785] RAX:  RBX: 0004dfff RCX: 
0006
 [96049.833786] RDX: 0007 RSI: 99045c143230 RDI: 
99047b2168a0
 [96049.833787] RBP: 990457851cd0 R08: 0001 R09: 

 [96049.833787] R10: afd2c3707ab8 R11:  R12: 
9903b93b12c8
 [96049.833788] R13: 0004e000 R14: afd2c3707b80 R15: 
afd2c3707b78
 [96049.833790] FS:  7f5c14e7d700() GS:99047b20() 
knlGS:
 [96049.833791] CS:  0010 DS:  ES:  CR0: 80050033
 [96049.833792] CR2: 7f5c146abff8 CR3: 000115f4c004 CR4: 
003606f0
 [96049.833795] DR0:  DR1:  DR2: 

 [96049.833796] DR3:  DR6: fffe0ff0 DR7: 
0400
 [96049.833796] Call Trace:
 [96049.833809]  __set_extent_bit+0x46c/0x6a0 [btrfs]
 [96049.833823]  lock_extent_bits+0x6b/0x210 [btrfs]
 [96049.833831]  ? _raw_spin_unlock+0x24/0x30
 [96049.833841]  ? test_range_bit+0xdf/0x130 [btrfs]
 [96049.833853]  lock_extent_range+0x8e/0x150 [btrfs]
 [96049.833864]  btrfs_double_extent_lock+0x78/0xb0 [btrfs]
 [96049.833875]  btrfs_extent_same_range+0x14e/0x550 [btrfs]
 [96049.833885]  ? rcu_read_lock_sched_held+0x3f/0x70
 [96049.833890]  ? __kmalloc_node+0x2b0/0x2f0
 [96049.833899]  ? btrfs_dedupe_file_range+0x19a/0x280 [btrfs]
 [96049.833909]  btrfs_dedupe_file_range+0x270/0x280 [btrfs]
 [96049.833916]  vfs_dedupe_file_range_one+0xd9/0xe0
 [96049.833919]  vfs_dedupe_file_range+0x131/0x1b0
 [96049.833924]  do_vfs_ioctl+0x272/0x6e0
 [96049.833927]  ? __fget+0x113/0x200
 [96049.833931]  ksys_ioctl+0x70/0x80
 [96049.833933]  __x64_sys_ioctl+0x16/0x20
 [96049.833937]  do_syscall_64+0x60/0x1b0
 [96049.833939]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
 [96049.833941] RIP: 0033:0x7f5c1478ddd7
 [96049.833943] RSP: 002b:7ffe15b196a8 EFLAGS: 0202 ORIG_RAX: 
0010
 [96049.833945] RAX: ffda RBX:  RCX: 
7f5c1478ddd7
 [96049.833946] RDX: 5625ece322d0 RSI: c0189436 RDI: 
0004
 [96049.833947] RBP:  R08: 7f5c14a46f48 R09: 
0040
 [96049.833948] R10: 0541 R11: 0202 R12: 

 [96049.833949] R13:  R14: 0004 R15: 
5625ece322d0
 [96049.833954] irq event stamp: 6196
 [96049.833956] hardirqs last  enabled at (6195): [] 
console_unlock+0x503/0x640
 [96049.833958] hardirqs last disabled at (6196): [] 
trace_hardirqs_off_thunk+0x1a/0x1c
 [96049.833959] softirqs last  enabled at (6114): [] 
__do_softirq+0x370/0x421
 [96049.833964] softirqs last disabled at (6095): [] 
irq_exit+0xcd/0xe0
 [96049.833965] ---[ end trace db7b05f01b7fa10c ]---
 [96049.935816] R13:  R14: 5562e5259240 R15: 
7092b910
 [96049.935822] irq event stamp: 6584
 [96049.935823] hardirqs last  enabled at (6583): [] 
console_unlock+0x503/0x640
 [96049.935825] hardirqs last disabled at (6584): [] 
trace_hardirqs_off_thunk+0x1a/0x1c
 [96049.935827] softirqs last  enabled at (6328): [] 
__do_softirq+0x370/0x421
 [96049.935828] softirqs last disabled at (6313): [] 
irq_exit+0xcd/0xe0
 [96049.935829] ---[ end trace db7b05f01b7fa123 ]---
 [96049.935840] [ cut here ]
 [96049.936065] WARNING: CPU: 1 PID: 24463 at fs/btrfs/extent_io.c:436 
insert_state+0x101/0x120 [btrfs]
 [96049.936107] CPU: 1 PID: 24463 Comm: umount Tainted: GW 
4.19.0-rc7-btrfs-next-39 #1
 [96049.936108] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014
 [96049.936117] RIP: 0010:insert_state+0x101/0x120 [btrfs]
 [96049.936119] RSP: 0018:afd2c3637bc0 EFLAGS: 00010282
 [96049.936120] RAX:  RBX: 0004dfff RCX: 

Re: [PATCH] Btrfs: incremental send, fix infinite loop when apply children dir moves

2018-11-05 Thread Filipe Manana
On Mon, Nov 5, 2018 at 4:10 AM robbieko  wrote:
>
> Filipe Manana 於 2018-10-30 19:36 寫到:
> > On Tue, Oct 30, 2018 at 7:00 AM robbieko  wrote:
> >>
> >> From: Robbie Ko 
> >>
> >> In apply_children_dir_moves, we first create an empty list (stack),
> >> then we get an entry from pending_dir_moves and add it to the stack,
> >> but we didn't delete the entry from rb_tree.
> >>
> >> So, in add_pending_dir_move, we create a new entry and then use the
> >> parent_ino in the current rb_tree to find the corresponding entry,
> >> and if so, add the new entry to the corresponding list.
> >>
> >> However, the entry may have been added to the stack, causing new
> >> entries to be added to the stack as well.
> >>
> >> Finally, each time we take the first entry from the stack and start
> >> processing, it ends up with an infinite loop.
> >>
> >> Fix this problem by remove node from pending_dir_moves,
> >> avoid add new pending_dir_move to error list.
> >
> > I can't parse that explanation.
> > Can you give a concrete example (reproducer) or did this came out of
> > thin air?
> >
> > Thanks.
> >
>
> I am sorry that I replied so late.
>
> I have no way to give a simple example.
> But I can provide a btrfs image file
> You can restore the Image via btrfs-image
> Then directly command "btrfs send -e -p parent send -f dump_file"
> Infinite loop will occur.
> I use ubuntu 16.04, kernel 4.15.0.36-generic can be stable reproduce

You have been occasionally submitting fixes for send/receive for a few
years now, and you know already
that I always ask for a changelog that describes well the problem and
an example/reproducer.

Why did you do this?

What I can read from your answer is that you were too lazy to extract
a reproducer from that image.
Just made some change that fixes the infinite loop and because it
apparently works you're done with it,
Without an example at least, I don't think you or anyone can fully
understand the problem, and if what
you have (despite somewhat making theoretical sense) is really a good
solution or just a workaround for
the cause of the problem - after all if you can't give an example, you
can't explain how in practice such loop
of dependencies between directories happens. This, as with most
send/receive problems, is a pure sequential
and deterministic problem so there's really no excuse for not getting
a reproducer.

Without an example and an explanation of how it happens in the real world,
how does one know that your change is
fixing the problem in the right place and not introducing other
problems? Like the receiver not getting some
changes (missing directories, files, or renames, etc).

Tests are not just to prove a change is correct, they exist to catch
and prevent regressions in the future too.

You can do better than that.

>
> Image file, please refer to the attachment.
>
> Thanks.
>
>
> >>
> >> Signed-off-by: Robbie Ko 
> >> ---
> >>  fs/btrfs/send.c | 11 ---
> >>  1 file changed, 8 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
> >> index 094cc144..5be83b5 100644
> >> --- a/fs/btrfs/send.c
> >> +++ b/fs/btrfs/send.c
> >> @@ -3340,7 +3340,8 @@ static void free_pending_move(struct send_ctx
> >> *sctx, struct pending_dir_move *m)
> >> kfree(m);
> >>  }
> >>
> >> -static void tail_append_pending_moves(struct pending_dir_move *moves,
> >> +static void tail_append_pending_moves(struct send_ctx *sctx,
> >> + struct pending_dir_move *moves,
> >>   struct list_head *stack)
> >>  {
> >> if (list_empty(>list)) {
> >> @@ -3351,6 +3352,10 @@ static void tail_append_pending_moves(struct
> >> pending_dir_move *moves,
> >> list_add_tail(>list, stack);
> >> list_splice_tail(, stack);
> >> }
> >> +   if (!RB_EMPTY_NODE(&moves->node)) {
> >> +   rb_erase(&moves->node, &sctx->pending_dir_moves);
> >> +   RB_CLEAR_NODE(&moves->node);
> >> +   }
> >>  }
> >>
> >>  static int apply_children_dir_moves(struct send_ctx *sctx)
> >> @@ -3365,7 +3370,7 @@ static int apply_children_dir_moves(struct
> >> send_ctx *sctx)
> >> return 0;
> >>
> >> INIT_LIST_HEAD();
> >> -   tail_append_pending_moves(pm, );
> >> +   tail_append_pending_moves(sctx, pm, );
> >>
> >> while (!list_empty()) {
> >> pm = list_first_entry(, struct pending_dir_move,
> >> list);
> >> @@ -3376,7 +3381,7 @@ static int apply_children_dir_moves(struct
> >> send_ctx *sctx)
> >> goto out;
> >> pm = get_pending_dir_moves(sctx, parent_ino);
> >> if (pm)
> >> -   tail_append_pending_moves(pm, );
> >> +   tail_append_pending_moves(sctx, pm, );
> >> }
> >> return 0;
> >>
> >> --
> >> 1.9.1
> >>



-- 
Filipe David Manana,

“Whether you think you can, or you think you can't — you're right.”


Re: [PATCH] btrfs: tree-checker: Fix misleading group system information

2018-11-05 Thread Qu Wenruo


On 2018/11/5 下午6:49, Shaokun Zhang wrote:
> block_group_err shows the group system as a decimal value with a '0x'
> prefix, which is somewhat misleading.
> 
> Fix it to print hexadecimal, as was intended.
> 
> Cc: David Sterba  
> Cc: Chris Mason 
> Cc: Josef Bacik  
> Signed-off-by: Shaokun Zhang 

Reviewed-by: Qu Wenruo 

BTW, did you catch it with some real world case or just by looking into
the code?

Thanks,
Qu

> ---
>  fs/btrfs/tree-checker.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
> index cab0b1f..efcf89a 100644
> --- a/fs/btrfs/tree-checker.c
> +++ b/fs/btrfs/tree-checker.c
> @@ -440,7 +440,7 @@ static int check_block_group_item(struct btrfs_fs_info 
> *fs_info,
>   type != (BTRFS_BLOCK_GROUP_METADATA |
>  BTRFS_BLOCK_GROUP_DATA)) {
>   block_group_err(fs_info, leaf, slot,
> -"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 
> 0x%llu or 0x%llx",
> +"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 
> 0x%llx or 0x%llx",
>   type, hweight64(type),
>   BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA,
>   BTRFS_BLOCK_GROUP_SYSTEM,
> 



signature.asc
Description: OpenPGP digital signature


Re: [PATCH] btrfs: tree-checker: Fix misleading group system information

2018-11-05 Thread Nikolay Borisov



On 5.11.18 г. 12:49 ч., Shaokun Zhang wrote:
> block_group_err shows the group system as a decimal value with a '0x'
> prefix, which is somewhat misleading.
> 
> Fix it to print hexadecimal, as was intended.
> 
> Cc: David Sterba  
> Cc: Chris Mason 
> Cc: Josef Bacik  
> Signed-off-by: Shaokun Zhang 

Good catch !

Reviewed-by: Nikolay Borisov 

> ---
>  fs/btrfs/tree-checker.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
> index cab0b1f..efcf89a 100644
> --- a/fs/btrfs/tree-checker.c
> +++ b/fs/btrfs/tree-checker.c
> @@ -440,7 +440,7 @@ static int check_block_group_item(struct btrfs_fs_info 
> *fs_info,
>   type != (BTRFS_BLOCK_GROUP_METADATA |
>  BTRFS_BLOCK_GROUP_DATA)) {
>   block_group_err(fs_info, leaf, slot,
> -"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 
> 0x%llu or 0x%llx",
> +"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 
> 0x%llx or 0x%llx",
>   type, hweight64(type),
>   BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA,
>   BTRFS_BLOCK_GROUP_SYSTEM,
> 


[PATCH] btrfs: tree-checker: Fix misleading group system information

2018-11-05 Thread Shaokun Zhang
block_group_err shows the group system as a decimal value with a '0x'
prefix, which is somewhat misleading.

Fix it to print hexadecimal, as was intended.

Cc: David Sterba  
Cc: Chris Mason 
Cc: Josef Bacik  
Signed-off-by: Shaokun Zhang 
---
 fs/btrfs/tree-checker.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index cab0b1f..efcf89a 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -440,7 +440,7 @@ static int check_block_group_item(struct btrfs_fs_info 
*fs_info,
type != (BTRFS_BLOCK_GROUP_METADATA |
   BTRFS_BLOCK_GROUP_DATA)) {
block_group_err(fs_info, leaf, slot,
-"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llu 
or 0x%llx",
+"invalid type, have 0x%llx (%lu bits set) expect either 0x%llx, 0x%llx, 0x%llx 
or 0x%llx",
type, hweight64(type),
BTRFS_BLOCK_GROUP_DATA, BTRFS_BLOCK_GROUP_METADATA,
BTRFS_BLOCK_GROUP_SYSTEM,
-- 
2.7.4