commit:     4a98cf83c330d33c7109041b84409437c9142fb6
Author:     Mike Pagano <mpagano <AT> gentoo <DOT> org>
AuthorDate: Wed Sep 20 10:09:02 2017 +0000
Commit:     Mike Pagano <mpagano <AT> gentoo <DOT> org>
CommitDate: Wed Sep 20 10:09:02 2017 +0000
URL:        https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=4a98cf83

Linux patch 4.13.3

 0000_README             |    4 +
 1002_linux-4.13.3.patch | 2829 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 2833 insertions(+)

diff --git a/0000_README b/0000_README
index cd98c21..70f03ff 100644
--- a/0000_README
+++ b/0000_README
@@ -51,6 +51,10 @@ Patch:  1001_linux-4.13.2.patch
 From:   http://www.kernel.org
 Desc:   Linux 4.13.2
 
+Patch:  1002_linux-4.13.3.patch
+From:   http://www.kernel.org
+Desc:   Linux 4.13.3
+
 Patch:  1500_XATTR_USER_PREFIX.patch
 From:   https://bugs.gentoo.org/show_bug.cgi?id=470644
 Desc:   Support for namespace user.pax.* on tmpfs.

diff --git a/1002_linux-4.13.3.patch b/1002_linux-4.13.3.patch
new file mode 100644
index 0000000..fe7ad47
--- /dev/null
+++ b/1002_linux-4.13.3.patch
@@ -0,0 +1,2829 @@
+diff --git a/Documentation/ABI/testing/sysfs-bus-thunderbolt b/Documentation/ABI/testing/sysfs-bus-thunderbolt
+index 2a98149943ea..392bef5bd399 100644
+--- a/Documentation/ABI/testing/sysfs-bus-thunderbolt
++++ b/Documentation/ABI/testing/sysfs-bus-thunderbolt
+@@ -45,6 +45,8 @@ Contact:     [email protected]
+ Description:  When a devices supports Thunderbolt secure connect it will
+               have this attribute. Writing 32 byte hex string changes
+               authorization to use the secure connection method instead.
++              Writing an empty string clears the key and regular connection
++              method can be used again.
+ 
+ What:         /sys/bus/thunderbolt/devices/.../device
+ Date:         Sep 2017
+diff --git a/Makefile b/Makefile
+index 8aad6bc50d52..0f31ef4aea7b 100644
+--- a/Makefile
++++ b/Makefile
+@@ -1,6 +1,6 @@
+ VERSION = 4
+ PATCHLEVEL = 13
+-SUBLEVEL = 2
++SUBLEVEL = 3
+ EXTRAVERSION =
+ NAME = Fearless Coyote
+ 
+diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
+index 9aeb91935ce0..e2c4dd051ef8 100644
+--- a/arch/x86/include/asm/elf.h
++++ b/arch/x86/include/asm/elf.h
+@@ -204,6 +204,7 @@ void set_personality_ia32(bool);
+ 
+ #define ELF_CORE_COPY_REGS(pr_reg, regs)                      \
+ do {                                                          \
++      unsigned long base;                                     \
+       unsigned v;                                             \
+       (pr_reg)[0] = (regs)->r15;                              \
+       (pr_reg)[1] = (regs)->r14;                              \
+@@ -226,8 +227,8 @@ do {                                                          \
+       (pr_reg)[18] = (regs)->flags;                           \
+       (pr_reg)[19] = (regs)->sp;                              \
+       (pr_reg)[20] = (regs)->ss;                              \
+-      (pr_reg)[21] = current->thread.fsbase;                  \
+-      (pr_reg)[22] = current->thread.gsbase;                  \
++      rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base;         \
++      rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base;  \
+       asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v;       \
+       asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v;       \
+       asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v;       \
+diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
+index b4a0d43248cf..b50df06ad251 100644
+--- a/arch/x86/include/asm/page_64.h
++++ b/arch/x86/include/asm/page_64.h
+@@ -51,6 +51,10 @@ static inline void clear_page(void *page)
+ 
+ void copy_page(void *to, void *from);
+ 
++#ifdef CONFIG_X86_MCE
++#define arch_unmap_kpfn arch_unmap_kpfn
++#endif
++
+ #endif        /* !__ASSEMBLY__ */
+ 
+ #ifdef CONFIG_X86_VSYSCALL_EMULATION
+diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
+index 6dde0497efc7..3b413065c613 100644
+--- a/arch/x86/kernel/cpu/mcheck/mce.c
++++ b/arch/x86/kernel/cpu/mcheck/mce.c
+@@ -51,6 +51,7 @@
+ #include <asm/mce.h>
+ #include <asm/msr.h>
+ #include <asm/reboot.h>
++#include <asm/set_memory.h>
+ 
+ #include "mce-internal.h"
+ 
+@@ -1051,6 +1052,48 @@ static int do_memory_failure(struct mce *m)
+       return ret;
+ }
+ 
++#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE)
++
++void arch_unmap_kpfn(unsigned long pfn)
++{
++      unsigned long decoy_addr;
++
++      /*
++       * Unmap this page from the kernel 1:1 mappings to make sure
++       * we don't log more errors because of speculative access to
++       * the page.
++       * We would like to just call:
++       *      set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1);
++       * but doing that would radically increase the odds of a
++       * speculative access to the poison page because we'd have
++       * the virtual address of the kernel 1:1 mapping sitting
++       * around in registers.
++       * Instead we get tricky.  We create a non-canonical address
++       * that looks just like the one we want, but has bit 63 flipped.
++       * This relies on set_memory_np() not checking whether we passed
++       * a legal address.
++       */
++
++/*
++ * Build time check to see if we have a spare virtual bit. Don't want
++ * to leave this until run time because most developers don't have a
++ * system that can exercise this code path. This will only become a
++ * problem if/when we move beyond 5-level page tables.
++ *
++ * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD)
++ */
++#if PGDIR_SHIFT + 9 < 63
++      decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63));
++#else
++#error "no unused virtual bit available"
++#endif
++
++      if (set_memory_np(decoy_addr, 1))
++              pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn);
++
++}
++#endif
++
+ /*
+  * The actual machine check handler. This only handles real
+  * exceptions when something got corrupted coming in through int 18.
+diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
+index c3169be4c596..8c44e0cb2912 100644
+--- a/arch/x86/kernel/process_64.c
++++ b/arch/x86/kernel/process_64.c
+@@ -149,6 +149,123 @@ void release_thread(struct task_struct *dead_task)
+       }
+ }
+ 
++enum which_selector {
++      FS,
++      GS
++};
++
++/*
++ * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
++ * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
++ * It's forcibly inlined because it'll generate better code and this function
++ * is hot.
++ */
++static __always_inline void save_base_legacy(struct task_struct *prev_p,
++                                           unsigned short selector,
++                                           enum which_selector which)
++{
++      if (likely(selector == 0)) {
++              /*
++               * On Intel (without X86_BUG_NULL_SEG), the segment base could
++               * be the pre-existing saved base or it could be zero.  On AMD
++               * (with X86_BUG_NULL_SEG), the segment base could be almost
++               * anything.
++               *
++               * This branch is very hot (it's hit twice on almost every
++               * context switch between 64-bit programs), and avoiding
++               * the RDMSR helps a lot, so we just assume that whatever
++               * value is already saved is correct.  This matches historical
++               * Linux behavior, so it won't break existing applications.
++               *
++               * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
++               * report that the base is zero, it needs to actually be zero:
++               * see the corresponding logic in load_seg_legacy.
++               */
++      } else {
++              /*
++               * If the selector is 1, 2, or 3, then the base is zero on
++               * !X86_BUG_NULL_SEG CPUs and could be anything on
++               * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
++               * has never attempted to preserve the base across context
++               * switches.
++               *
++               * If selector > 3, then it refers to a real segment, and
++               * saving the base isn't necessary.
++               */
++              if (which == FS)
++                      prev_p->thread.fsbase = 0;
++              else
++                      prev_p->thread.gsbase = 0;
++      }
++}
++
++static __always_inline void save_fsgs(struct task_struct *task)
++{
++      savesegment(fs, task->thread.fsindex);
++      savesegment(gs, task->thread.gsindex);
++      save_base_legacy(task, task->thread.fsindex, FS);
++      save_base_legacy(task, task->thread.gsindex, GS);
++}
++
++static __always_inline void loadseg(enum which_selector which,
++                                  unsigned short sel)
++{
++      if (which == FS)
++              loadsegment(fs, sel);
++      else
++              load_gs_index(sel);
++}
++
++static __always_inline void load_seg_legacy(unsigned short prev_index,
++                                          unsigned long prev_base,
++                                          unsigned short next_index,
++                                          unsigned long next_base,
++                                          enum which_selector which)
++{
++      if (likely(next_index <= 3)) {
++              /*
++               * The next task is using 64-bit TLS, is not using this
++               * segment at all, or is having fun with arcane CPU features.
++               */
++              if (next_base == 0) {
++                      /*
++                       * Nasty case: on AMD CPUs, we need to forcibly zero
++                       * the base.
++                       */
++                      if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
++                              loadseg(which, __USER_DS);
++                              loadseg(which, next_index);
++                      } else {
++                              /*
++                               * We could try to exhaustively detect cases
++                               * under which we can skip the segment load,
++                               * but there's really only one case that matters
++                               * for performance: if both the previous and
++                               * next states are fully zeroed, we can skip
++                               * the load.
++                               *
++                               * (This assumes that prev_base == 0 has no
++                               * false positives.  This is the case on
++                               * Intel-style CPUs.)
++                               */
++                              if (likely(prev_index | next_index | prev_base))
++                                      loadseg(which, next_index);
++                      }
++              } else {
++                      if (prev_index != next_index)
++                              loadseg(which, next_index);
++                      wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
++                             next_base);
++              }
++      } else {
++              /*
++               * The next task is using a real segment.  Loading the selector
++               * is sufficient.
++               */
++              loadseg(which, next_index);
++      }
++}
++
+ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
+               unsigned long arg, struct task_struct *p, unsigned long tls)
+ {
+@@ -229,10 +346,19 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
+                   unsigned long new_sp,
+                   unsigned int _cs, unsigned int _ss, unsigned int _ds)
+ {
++      WARN_ON_ONCE(regs != current_pt_regs());
++
++      if (static_cpu_has(X86_BUG_NULL_SEG)) {
++              /* Loading zero below won't clear the base. */
++              loadsegment(fs, __USER_DS);
++              load_gs_index(__USER_DS);
++      }
++
+       loadsegment(fs, 0);
+       loadsegment(es, _ds);
+       loadsegment(ds, _ds);
+       load_gs_index(0);
++
+       regs->ip                = new_ip;
+       regs->sp                = new_sp;
+       regs->cs                = _cs;
+@@ -277,7 +403,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+       struct fpu *next_fpu = &next->fpu;
+       int cpu = smp_processor_id();
+       struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+-      unsigned prev_fsindex, prev_gsindex;
+ 
+       switch_fpu_prepare(prev_fpu, cpu);
+ 
+@@ -286,8 +411,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+        *
+        * (e.g. xen_load_tls())
+        */
+-      savesegment(fs, prev_fsindex);
+-      savesegment(gs, prev_gsindex);
++      save_fsgs(prev_p);
+ 
+       /*
+        * Load TLS before restoring any segments so that segment loads
+@@ -326,108 +450,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+       if (unlikely(next->ds | prev->ds))
+               loadsegment(ds, next->ds);
+ 
+-      /*
+-       * Switch FS and GS.
+-       *
+-       * These are even more complicated than DS and ES: they have
+-       * 64-bit bases are that controlled by arch_prctl.  The bases
+-       * don't necessarily match the selectors, as user code can do
+-       * any number of things to cause them to be inconsistent.
+-       *
+-       * We don't promise to preserve the bases if the selectors are
+-       * nonzero.  We also don't promise to preserve the base if the
+-       * selector is zero and the base doesn't match whatever was
+-       * most recently passed to ARCH_SET_FS/GS.  (If/when the
+-       * FSGSBASE instructions are enabled, we'll need to offer
+-       * stronger guarantees.)
+-       *
+-       * As an invariant,
+-       * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is
+-       * impossible.
+-       */
+-      if (next->fsindex) {
+-              /* Loading a nonzero value into FS sets the index and base. */
+-              loadsegment(fs, next->fsindex);
+-      } else {
+-              if (next->fsbase) {
+-                      /* Next index is zero but next base is nonzero. */
+-                      if (prev_fsindex)
+-                              loadsegment(fs, 0);
+-                      wrmsrl(MSR_FS_BASE, next->fsbase);
+-              } else {
+-                      /* Next base and index are both zero. */
+-                      if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
+-                              /*
+-                               * We don't know the previous base and can't
+-                               * find out without RDMSR.  Forcibly clear it.
+-                               */
+-                              loadsegment(fs, __USER_DS);
+-                              loadsegment(fs, 0);
+-                      } else {
+-                              /*
+-                               * If the previous index is zero and ARCH_SET_FS
+-                               * didn't change the base, then the base is
+-                               * also zero and we don't need to do anything.
+-                               */
+-                              if (prev->fsbase || prev_fsindex)
+-                                      loadsegment(fs, 0);
+-                      }
+-              }
+-      }
+-      /*
+-       * Save the old state and preserve the invariant.
+-       * NB: if prev_fsindex == 0, then we can't reliably learn the base
+-       * without RDMSR because Intel user code can zero it without telling
+-       * us and AMD user code can program any 32-bit value without telling
+-       * us.
+-       */
+-      if (prev_fsindex)
+-              prev->fsbase = 0;
+-      prev->fsindex = prev_fsindex;
+-
+-      if (next->gsindex) {
+-              /* Loading a nonzero value into GS sets the index and base. */
+-              load_gs_index(next->gsindex);
+-      } else {
+-              if (next->gsbase) {
+-                      /* Next index is zero but next base is nonzero. */
+-                      if (prev_gsindex)
+-                              load_gs_index(0);
+-                      wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase);
+-              } else {
+-                      /* Next base and index are both zero. */
+-                      if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
+-                              /*
+-                               * We don't know the previous base and can't
+-                               * find out without RDMSR.  Forcibly clear it.
+-                               *
+-                               * This contains a pointless SWAPGS pair.
+-                               * Fixing it would involve an explicit check
+-                               * for Xen or a new pvop.
+-                               */
+-                              load_gs_index(__USER_DS);
+-                              load_gs_index(0);
+-                      } else {
+-                              /*
+-                               * If the previous index is zero and ARCH_SET_GS
+-                               * didn't change the base, then the base is
+-                               * also zero and we don't need to do anything.
+-                               */
+-                              if (prev->gsbase || prev_gsindex)
+-                                      load_gs_index(0);
+-                      }
+-              }
+-      }
+-      /*
+-       * Save the old state and preserve the invariant.
+-       * NB: if prev_gsindex == 0, then we can't reliably learn the base
+-       * without RDMSR because Intel user code can zero it without telling
+-       * us and AMD user code can program any 32-bit value without telling
+-       * us.
+-       */
+-      if (prev_gsindex)
+-              prev->gsbase = 0;
+-      prev->gsindex = prev_gsindex;
++      load_seg_legacy(prev->fsindex, prev->fsbase,
++                      next->fsindex, next->fsbase, FS);
++      load_seg_legacy(prev->gsindex, prev->gsbase,
++                      next->gsindex, next->gsbase, GS);
+ 
+       switch_fpu_finish(next_fpu, cpu);
+ 
+diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
+index f50958ded9f0..79474f47eeef 100644
+--- a/drivers/md/raid1.c
++++ b/drivers/md/raid1.c
+@@ -2564,6 +2564,23 @@ static int init_resync(struct r1conf *conf)
+       return 0;
+ }
+ 
++static struct r1bio *raid1_alloc_init_r1buf(struct r1conf *conf)
++{
++      struct r1bio *r1bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
++      struct resync_pages *rps;
++      struct bio *bio;
++      int i;
++
++      for (i = conf->poolinfo->raid_disks; i--; ) {
++              bio = r1bio->bios[i];
++              rps = bio->bi_private;
++              bio_reset(bio);
++              bio->bi_private = rps;
++      }
++      r1bio->master_bio = NULL;
++      return r1bio;
++}
++
+ /*
+  * perform a "sync" on one "block"
+  *
+@@ -2649,7 +2666,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
+ 
+       bitmap_cond_end_sync(mddev->bitmap, sector_nr,
+               mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
+-      r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
++      r1_bio = raid1_alloc_init_r1buf(conf);
+ 
+       raise_barrier(conf, sector_nr);
+ 
+diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
+index f55d4cc085f6..d51ac02e98ef 100644
+--- a/drivers/md/raid10.c
++++ b/drivers/md/raid10.c
+@@ -2798,6 +2798,35 @@ static int init_resync(struct r10conf *conf)
+       return 0;
+ }
+ 
++static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
++{
++      struct r10bio *r10bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
++      struct rsync_pages *rp;
++      struct bio *bio;
++      int nalloc;
++      int i;
++
++      if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
++          test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
++              nalloc = conf->copies; /* resync */
++      else
++              nalloc = 2; /* recovery */
++
++      for (i = 0; i < nalloc; i++) {
++              bio = r10bio->devs[i].bio;
++              rp = bio->bi_private;
++              bio_reset(bio);
++              bio->bi_private = rp;
++              bio = r10bio->devs[i].repl_bio;
++              if (bio) {
++                      rp = bio->bi_private;
++                      bio_reset(bio);
++                      bio->bi_private = rp;
++              }
++      }
++      return r10bio;
++}
++
+ /*
+  * perform a "sync" on one "block"
+  *
+@@ -3027,7 +3056,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
+                               atomic_inc(&mreplace->nr_pending);
+                       rcu_read_unlock();
+ 
+-                      r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
++                      r10_bio = raid10_alloc_init_r10buf(conf);
+                       r10_bio->state = 0;
+                       raise_barrier(conf, rb2 != NULL);
+                       atomic_set(&r10_bio->remaining, 0);
+@@ -3236,7 +3265,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
+               }
+               if (sync_blocks < max_sync)
+                       max_sync = sync_blocks;
+-              r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
++              r10_bio = raid10_alloc_init_r10buf(conf);
+               r10_bio->state = 0;
+ 
+               r10_bio->mddev = mddev;
+@@ -4360,7 +4389,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
+ 
+ read_more:
+       /* Now schedule reads for blocks from sector_nr to last */
+-      r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
++      r10_bio = raid10_alloc_init_r10buf(conf);
+       r10_bio->state = 0;
+       raise_barrier(conf, sectors_done != 0);
+       atomic_set(&r10_bio->remaining, 0);
+diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
+index 0fc2748aaf95..e13a8ce7f589 100644
+--- a/drivers/md/raid5.c
++++ b/drivers/md/raid5.c
+@@ -6235,6 +6235,10 @@ static void raid5_do_work(struct work_struct *work)
+ 
+       spin_unlock_irq(&conf->device_lock);
+ 
++      flush_deferred_bios(conf);
++
++      r5l_flush_stripe_to_raid(conf->log);
++
+       async_tx_issue_pending_all();
+       blk_finish_plug(&plug);
+ 
+diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c
+index c4b4b0a1bbf0..5be52d89b182 100644
+--- a/drivers/net/ethernet/freescale/gianfar.c
++++ b/drivers/net/ethernet/freescale/gianfar.c
+@@ -3687,7 +3687,7 @@ static noinline void gfar_update_link_state(struct gfar_private *priv)
+               u32 tempval1 = gfar_read(&regs->maccfg1);
+               u32 tempval = gfar_read(&regs->maccfg2);
+               u32 ecntrl = gfar_read(&regs->ecntrl);
+-              u32 tx_flow_oldval = (tempval & MACCFG1_TX_FLOW);
++              u32 tx_flow_oldval = (tempval1 & MACCFG1_TX_FLOW);
+ 
+               if (phydev->duplex != priv->oldduplex) {
+                       if (!(phydev->duplex))
+diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
+index 14323faf8bd9..7ec6393b6ba1 100644
+--- a/drivers/nvdimm/btt.c
++++ b/drivers/nvdimm/btt.c
+@@ -1429,6 +1429,8 @@ int nvdimm_namespace_attach_btt(struct nd_namespace_common *ndns)
+       }
+ 
+       btt_sb = devm_kzalloc(&nd_btt->dev, sizeof(*btt_sb), GFP_KERNEL);
++      if (!btt_sb)
++              return -ENOMEM;
+ 
+       /*
+        * If this returns < 0, that is ok as it just means there wasn't
+diff --git a/drivers/nvdimm/bus.c b/drivers/nvdimm/bus.c
+index 937fafa1886a..54eb14c7ef90 100644
+--- a/drivers/nvdimm/bus.c
++++ b/drivers/nvdimm/bus.c
+@@ -905,19 +905,20 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
+               int read_only, unsigned int ioctl_cmd, unsigned long arg)
+ {
+       struct nvdimm_bus_descriptor *nd_desc = nvdimm_bus->nd_desc;
+-      size_t buf_len = 0, in_len = 0, out_len = 0;
+       static char out_env[ND_CMD_MAX_ENVELOPE];
+       static char in_env[ND_CMD_MAX_ENVELOPE];
+       const struct nd_cmd_desc *desc = NULL;
+       unsigned int cmd = _IOC_NR(ioctl_cmd);
+-      unsigned int func = cmd;
+-      void __user *p = (void __user *) arg;
+       struct device *dev = &nvdimm_bus->dev;
+-      struct nd_cmd_pkg pkg;
++      void __user *p = (void __user *) arg;
+       const char *cmd_name, *dimm_name;
++      u32 in_len = 0, out_len = 0;
++      unsigned int func = cmd;
+       unsigned long cmd_mask;
+-      void *buf;
++      struct nd_cmd_pkg pkg;
+       int rc, i, cmd_rc;
++      u64 buf_len = 0;
++      void *buf;
+ 
+       if (nvdimm) {
+               desc = nd_cmd_dimm_desc(cmd);
+@@ -977,7 +978,7 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
+ 
+       if (cmd == ND_CMD_CALL) {
+               func = pkg.nd_command;
+-              dev_dbg(dev, "%s:%s, idx: %llu, in: %zu, out: %zu, len %zu\n",
++              dev_dbg(dev, "%s:%s, idx: %llu, in: %u, out: %u, len %llu\n",
+                               __func__, dimm_name, pkg.nd_command,
+                               in_len, out_len, buf_len);
+ 
+@@ -1007,9 +1008,9 @@ static int __nd_ioctl(struct nvdimm_bus *nvdimm_bus, struct nvdimm *nvdimm,
+               out_len += out_size;
+       }
+ 
+-      buf_len = out_len + in_len;
++      buf_len = (u64) out_len + (u64) in_len;
+       if (buf_len > ND_IOCTL_MAX_BUFLEN) {
+-              dev_dbg(dev, "%s:%s cmd: %s buf_len: %zu > %d\n", __func__,
++              dev_dbg(dev, "%s:%s cmd: %s buf_len: %llu > %d\n", __func__,
+                               dimm_name, cmd_name, buf_len,
+                               ND_IOCTL_MAX_BUFLEN);
+               return -EINVAL;
+diff --git a/drivers/thunderbolt/switch.c b/drivers/thunderbolt/switch.c
+index e9391bbd4036..53f40c57df59 100644
+--- a/drivers/thunderbolt/switch.c
++++ b/drivers/thunderbolt/switch.c
+@@ -807,11 +807,11 @@ static ssize_t key_store(struct device *dev, struct device_attribute *attr,
+       struct tb_switch *sw = tb_to_switch(dev);
+       u8 key[TB_SWITCH_KEY_SIZE];
+       ssize_t ret = count;
++      bool clear = false;
+ 
+-      if (count < 64)
+-              return -EINVAL;
+-
+-      if (hex2bin(key, buf, sizeof(key)))
++      if (!strcmp(buf, "\n"))
++              clear = true;
++      else if (hex2bin(key, buf, sizeof(key)))
+               return -EINVAL;
+ 
+       if (mutex_lock_interruptible(&switch_lock))
+@@ -821,15 +821,19 @@ static ssize_t key_store(struct device *dev, struct device_attribute *attr,
+               ret = -EBUSY;
+       } else {
+               kfree(sw->key);
+-              sw->key = kmemdup(key, sizeof(key), GFP_KERNEL);
+-              if (!sw->key)
+-                      ret = -ENOMEM;
++              if (clear) {
++                      sw->key = NULL;
++              } else {
++                      sw->key = kmemdup(key, sizeof(key), GFP_KERNEL);
++                      if (!sw->key)
++                              ret = -ENOMEM;
++              }
+       }
+ 
+       mutex_unlock(&switch_lock);
+       return ret;
+ }
+-static DEVICE_ATTR_RW(key);
++static DEVICE_ATTR(key, 0600, key_show, key_store);
+ 
+ static ssize_t nvm_authenticate_show(struct device *dev,
+       struct device_attribute *attr, char *buf)
+diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
+index 06d044862e58..1c75572f5a3f 100644
+--- a/drivers/vhost/net.c
++++ b/drivers/vhost/net.c
+@@ -634,8 +634,13 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
+ 
+               preempt_enable();
+ 
+-              if (vhost_enable_notify(&net->dev, vq))
++              if (!vhost_vq_avail_empty(&net->dev, vq))
+                       vhost_poll_queue(&vq->poll);
++              else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
++                      vhost_disable_notify(&net->dev, vq);
++                      vhost_poll_queue(&vq->poll);
++              }
++
+               mutex_unlock(&vq->mutex);
+ 
+               len = peek_head_len(rvq, sk);
+diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
+index 907d6b7dde6a..86d813a3f5d1 100644
+--- a/fs/f2fs/recovery.c
++++ b/fs/f2fs/recovery.c
+@@ -291,7 +291,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
+               return 0;
+ 
+       /* Get the previous summary */
+-      for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) {
++      for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
+               struct curseg_info *curseg = CURSEG_I(sbi, i);
+               if (curseg->segno == segno) {
+                       sum = curseg->sum_blk->entries[blkoff];
+@@ -599,8 +599,6 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
+       }
+ 
+       clear_sbi_flag(sbi, SBI_POR_DOING);
+-      if (err)
+-              set_ckpt_flags(sbi, CP_ERROR_FLAG);
+       mutex_unlock(&sbi->cp_mutex);
+ 
+       /* let's drop all the directory inodes for clean checkpoint */
+diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
+index c16d00e53264..13c65dd2d37d 100644
+--- a/fs/fuse/dev.c
++++ b/fs/fuse/dev.c
+@@ -1222,9 +1222,6 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
+       struct fuse_in *in;
+       unsigned reqsize;
+ 
+-      if (task_active_pid_ns(current) != fc->pid_ns)
+-              return -EIO;
+-
+  restart:
+       spin_lock(&fiq->waitq.lock);
+       err = -EAGAIN;
+@@ -1262,6 +1259,13 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file,
+ 
+       in = &req->in;
+       reqsize = in->h.len;
++
++      if (task_active_pid_ns(current) != fc->pid_ns) {
++              rcu_read_lock();
++              in->h.pid = pid_vnr(find_pid_ns(in->h.pid, fc->pid_ns));
++              rcu_read_unlock();
++      }
++
+       /* If request is too large, reply with an error and restart the read */
+       if (nbytes < reqsize) {
+               req->out.h.error = -EIO;
+@@ -1823,9 +1827,6 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
+       struct fuse_req *req;
+       struct fuse_out_header oh;
+ 
+-      if (task_active_pid_ns(current) != fc->pid_ns)
+-              return -EIO;
+-
+       if (nbytes < sizeof(struct fuse_out_header))
+               return -EINVAL;
+ 
+diff --git a/fs/fuse/file.c b/fs/fuse/file.c
+index ab60051be6e5..6d8e65cec01a 100644
+--- a/fs/fuse/file.c
++++ b/fs/fuse/file.c
+@@ -2181,9 +2181,6 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
+       if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
+               return 0;
+ 
+-      if (pid && pid_nr == 0)
+-              return -EOVERFLOW;
+-
+       fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
+       err = fuse_simple_request(fc, &args);
+ 
+diff --git a/fs/inode.c b/fs/inode.c
+index 50370599e371..6a1626e0edaf 100644
+--- a/fs/inode.c
++++ b/fs/inode.c
+@@ -637,6 +637,7 @@ void evict_inodes(struct super_block *sb)
+ 
+       dispose_list(&dispose);
+ }
++EXPORT_SYMBOL_GPL(evict_inodes);
+ 
+ /**
+  * invalidate_inodes  - attempt to free all inodes on a superblock
+diff --git a/fs/internal.h b/fs/internal.h
+index 9676fe11c093..fedfe94d84ba 100644
+--- a/fs/internal.h
++++ b/fs/internal.h
+@@ -132,7 +132,6 @@ static inline bool atime_needs_update_rcu(const struct path *path,
+ extern void inode_io_list_del(struct inode *inode);
+ 
+ extern long get_nr_dirty_inodes(void);
+-extern void evict_inodes(struct super_block *);
+ extern int invalidate_inodes(struct super_block *, bool);
+ 
+ /*
+diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
+index 5bc71642b226..ef55c926463c 100644
+--- a/fs/overlayfs/inode.c
++++ b/fs/overlayfs/inode.c
+@@ -576,10 +576,13 @@ static int ovl_inode_set(struct inode *inode, void *data)
+ static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry,
+                            struct dentry *upperdentry)
+ {
+-      struct inode *lowerinode = lowerdentry ? d_inode(lowerdentry) : NULL;
+-
+-      /* Lower (origin) inode must match, even if NULL */
+-      if (ovl_inode_lower(inode) != lowerinode)
++      /*
++       * Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL.
++       * This happens when finding a copied up overlay inode for a renamed
++       * or hardlinked overlay dentry and lower dentry cannot be followed
++       * by origin because lower fs does not support file handles.
++       */
++      if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry))
+               return false;
+ 
+       /*
+diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
+index c09c16b1ad3b..6f2a5baded76 100644
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -579,7 +579,7 @@ xfs_bmap_validate_ret(
+ 
+ #else
+ #define xfs_bmap_check_leaf_extents(cur, ip, whichfork)               do { } while (0)
+-#define       xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)
++#define       xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap)    do { } while (0)
+ #endif /* DEBUG */
+ 
+ /*
+diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
+index 85de22513014..a6331ffa51e3 100644
+--- a/fs/xfs/libxfs/xfs_bmap_btree.c
++++ b/fs/xfs/libxfs/xfs_bmap_btree.c
+@@ -858,6 +858,7 @@ xfs_bmbt_change_owner(
+       cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
+       if (!cur)
+               return -ENOMEM;
++      cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER;
+ 
+       error = xfs_btree_change_owner(cur, new_owner, buffer_list);
+       xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
+index e0bcc4a59efd..5bfb88261c7e 100644
+--- a/fs/xfs/libxfs/xfs_btree.c
++++ b/fs/xfs/libxfs/xfs_btree.c
+@@ -1791,6 +1791,7 @@ xfs_btree_lookup_get_block(
+ 
+       /* Check the inode owner since the verifiers don't. */
+       if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) &&
++          !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) &&
+           (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
+           be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
+                       cur->bc_private.b.ip->i_ino)
+@@ -4451,10 +4452,15 @@ xfs_btree_block_change_owner(
+ 
+       /* modify the owner */
+       block = xfs_btree_get_block(cur, level, &bp);
+-      if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
++      if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
++              if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
++                      return 0;
+               block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
+-      else
++      } else {
++              if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
++                      return 0;
+               block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
++      }
+ 
+       /*
+        * If the block is a root block hosted in an inode, we might not have a
+@@ -4463,16 +4469,19 @@ xfs_btree_block_change_owner(
+        * block is formatted into the on-disk inode fork. We still change it,
+        * though, so everything is consistent in memory.
+        */
+-      if (bp) {
+-              if (cur->bc_tp) {
+-                      xfs_trans_ordered_buf(cur->bc_tp, bp);
++      if (!bp) {
++              ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
++              ASSERT(level == cur->bc_nlevels - 1);
++              return 0;
++      }
++
++      if (cur->bc_tp) {
++              if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
+                       xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
+-              } else {
+-                      xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
++                      return -EAGAIN;
+               }
+       } else {
+-              ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+-              ASSERT(level == cur->bc_nlevels - 1);
++              xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
+       }
+ 
+       return 0;
+diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
+index 9c95e965cfe5..f2a88c3b1159 100644
+--- a/fs/xfs/libxfs/xfs_btree.h
++++ b/fs/xfs/libxfs/xfs_btree.h
+@@ -233,7 +233,8 @@ typedef struct xfs_btree_cur
+                       short           forksize;       /* fork's inode space */
+                       char            whichfork;      /* data or attr fork */
+                       char            flags;          /* flags */
+-#define       XFS_BTCUR_BPRV_WASDEL   1                       /* was delayed */
++#define       XFS_BTCUR_BPRV_WASDEL           (1<<0)          /* was delayed */
++#define       XFS_BTCUR_BPRV_INVALID_OWNER    (1<<1)          /* for ext swap */
+               } b;
+       }               bc_private;     /* per-btree type data */
+ } xfs_btree_cur_t;
+diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
+index abf5beaae907..988bb3f31446 100644
+--- a/fs/xfs/libxfs/xfs_ialloc.c
++++ b/fs/xfs/libxfs/xfs_ialloc.c
+@@ -378,8 +378,6 @@ xfs_ialloc_inode_init(
+                                * transaction and pin the log appropriately.
+                                */
+                               xfs_trans_ordered_buf(tp, fbuf);
+-                              xfs_trans_log_buf(tp, fbuf, 0,
+-                                                BBTOB(fbuf->b_length) - 1);
+                       }
+               } else {
+                       fbuf->b_flags |= XBF_DONE;
+@@ -1133,6 +1131,7 @@ xfs_dialloc_ag_inobt(
+       int                     error;
+       int                     offset;
+       int                     i, j;
++      int                     searchdistance = 10;
+ 
+       pag = xfs_perag_get(mp, agno);
+ 
+@@ -1159,7 +1158,6 @@ xfs_dialloc_ag_inobt(
+       if (pagno == agno) {
+               int             doneleft;       /* done, to the left */
+               int             doneright;      /* done, to the right */
+-              int             searchdistance = 10;
+ 
+               error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
+               if (error)
+@@ -1220,21 +1218,9 @@ xfs_dialloc_ag_inobt(
+               /*
+                * Loop until we find an inode chunk with a free inode.
+                */
+-              while (!doneleft || !doneright) {
++              while (--searchdistance > 0 && (!doneleft || !doneright)) {
+                       int     useleft;  /* using left inode chunk this time */
+ 
+-                      if (!--searchdistance) {
+-                              /*
+-                               * Not in range - save last search
+-                               * location and allocate a new inode
+-                               */
+-                              xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+-                              pag->pagl_leftrec = trec.ir_startino;
+-                              pag->pagl_rightrec = rec.ir_startino;
+-                              pag->pagl_pagino = pagino;
+-                              goto newino;
+-                      }
+-
+                       /* figure out the closer block if both are valid. */
+                       if (!doneleft && !doneright) {
+                               useleft = pagino -
+@@ -1278,26 +1264,37 @@ xfs_dialloc_ag_inobt(
+                               goto error1;
+               }
+ 
+-              /*
+-               * We've reached the end of the btree. because
+-               * we are only searching a small chunk of the
+-               * btree each search, there is obviously free
+-               * inodes closer to the parent inode than we
+-               * are now. restart the search again.
+-               */
+-              pag->pagl_pagino = NULLAGINO;
+-              pag->pagl_leftrec = NULLAGINO;
+-              pag->pagl_rightrec = NULLAGINO;
+-              xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+-              xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+-              goto restart_pagno;
++              if (searchdistance <= 0) {
++                      /*
++                       * Not in range - save last search
++                       * location and allocate a new inode
++                       */
++                      xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
++                      pag->pagl_leftrec = trec.ir_startino;
++                      pag->pagl_rightrec = rec.ir_startino;
++                      pag->pagl_pagino = pagino;
++
++              } else {
++                      /*
++                       * We've reached the end of the btree. because
++                       * we are only searching a small chunk of the
++                       * btree each search, there is obviously free
++                       * inodes closer to the parent inode than we
++                       * are now. restart the search again.
++                       */
++                      pag->pagl_pagino = NULLAGINO;
++                      pag->pagl_leftrec = NULLAGINO;
++                      pag->pagl_rightrec = NULLAGINO;
++                      xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
++                      xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
++                      goto restart_pagno;
++              }
+       }
+ 
+       /*
+        * In a different AG from the parent.
+        * See if the most recently allocated block has any free.
+        */
+-newino:
+       if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
+               error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
+                                        XFS_LOOKUP_EQ, &i);
+diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
+index 0e80f34fe97c..5eb165555934 100644
+--- a/fs/xfs/libxfs/xfs_inode_fork.c
++++ b/fs/xfs/libxfs/xfs_inode_fork.c
+@@ -1499,14 +1499,11 @@ xfs_iext_realloc_indirect(
+       xfs_ifork_t     *ifp,           /* inode fork pointer */
+       int             new_size)       /* new indirection array size */
+ {
+-      int             nlists;         /* number of irec's (ex lists) */
+-      int             size;           /* current indirection array size */
+-
+       ASSERT(ifp->if_flags & XFS_IFEXTIREC);
+-      nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
+-      size = nlists * sizeof(xfs_ext_irec_t);
+       ASSERT(ifp->if_real_bytes);
+-      ASSERT((new_size >= 0) && (new_size != size));
++      ASSERT((new_size >= 0) &&
++             (new_size != ((ifp->if_real_bytes / XFS_IEXT_BUFSZ) *
++                           sizeof(xfs_ext_irec_t))));
+       if (new_size == 0) {
+               xfs_iext_destroy(ifp);
+       } else {
+diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
+index 6bf120bb1a17..f9efd67f6fa1 100644
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -85,11 +85,11 @@ xfs_find_bdev_for_inode(
+  * associated buffer_heads, paying attention to the start and end offsets that
+  * we need to process on the page.
+  *
+- * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last
+- * buffer in the IO. Once it does this, it is unsafe to access the bufferhead or
+- * the page at all, as we may be racing with memory reclaim and it can free both
+- * the bufferhead chain and the page as it will see the page as clean and
+- * unused.
++ * Note that we open code the action in end_buffer_async_write here so that we
++ * only have to iterate over the buffers attached to the page once.  This is not
++ * only more efficient, but also ensures that we only call end_page_writeback
++ * at the end of the iteration, and thus avoids the pitfall of having the page
++ * and buffers potentially freed after every call to end_buffer_async_write.
+  */
+ static void
+ xfs_finish_page_writeback(
+@@ -97,29 +97,44 @@ xfs_finish_page_writeback(
+       struct bio_vec          *bvec,
+       int                     error)
+ {
+-      unsigned int            end = bvec->bv_offset + bvec->bv_len - 1;
+-      struct buffer_head      *head, *bh, *next;
++      struct buffer_head      *head = page_buffers(bvec->bv_page), *bh = head;
++      bool                    busy = false;
+       unsigned int            off = 0;
+-      unsigned int            bsize;
++      unsigned long           flags;
+ 
+       ASSERT(bvec->bv_offset < PAGE_SIZE);
+       ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
+-      ASSERT(end < PAGE_SIZE);
++      ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
+       ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
+ 
+-      bh = head = page_buffers(bvec->bv_page);
+-
+-      bsize = bh->b_size;
++      local_irq_save(flags);
++      bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
+       do {
+-              if (off > end)
+-                      break;
+-              next = bh->b_this_page;
+-              if (off < bvec->bv_offset)
+-                      goto next_bh;
+-              bh->b_end_io(bh, !error);
+-next_bh:
+-              off += bsize;
+-      } while ((bh = next) != head);
++              if (off >= bvec->bv_offset &&
++                  off < bvec->bv_offset + bvec->bv_len) {
++                      ASSERT(buffer_async_write(bh));
++                      ASSERT(bh->b_end_io == NULL);
++
++                      if (error) {
++                              mark_buffer_write_io_error(bh);
++                              clear_buffer_uptodate(bh);
++                              SetPageError(bvec->bv_page);
++                      } else {
++                              set_buffer_uptodate(bh);
++                      }
++                      clear_buffer_async_write(bh);
++                      unlock_buffer(bh);
++              } else if (buffer_async_write(bh)) {
++                      ASSERT(buffer_locked(bh));
++                      busy = true;
++              }
++              off += bh->b_size;
++      } while ((bh = bh->b_this_page) != head);
++      bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
++      local_irq_restore(flags);
++
++      if (!busy)
++              end_page_writeback(bvec->bv_page);
+ }
+ 
+ /*
+@@ -133,8 +148,10 @@ xfs_destroy_ioend(
+       int                     error)
+ {
+       struct inode            *inode = ioend->io_inode;
+-      struct bio              *last = ioend->io_bio;
+-      struct bio              *bio, *next;
++      struct bio              *bio = &ioend->io_inline_bio;
++      struct bio              *last = ioend->io_bio, *next;
++      u64                     start = bio->bi_iter.bi_sector;
++      bool                    quiet = bio_flagged(bio, BIO_QUIET);
+ 
+       for (bio = &ioend->io_inline_bio; bio; bio = next) {
+               struct bio_vec  *bvec;
+@@ -155,6 +172,11 @@ xfs_destroy_ioend(
+ 
+               bio_put(bio);
+       }
++
++      if (unlikely(error && !quiet)) {
++              xfs_err_ratelimited(XFS_I(inode)->i_mount,
++                      "writeback error on sector %llu", start);
++      }
+ }
+ 
+ /*
+@@ -423,7 +445,8 @@ xfs_start_buffer_writeback(
+       ASSERT(!buffer_delay(bh));
+       ASSERT(!buffer_unwritten(bh));
+ 
+-      mark_buffer_async_write(bh);
++      bh->b_end_io = NULL;
++      set_buffer_async_write(bh);
+       set_buffer_uptodate(bh);
+       clear_buffer_dirty(bh);
+ }
+diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
+index 93e955262d07..3e9b7a4fb8fd 100644
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -1840,29 +1840,18 @@ xfs_swap_extent_forks(
+       }
+ 
+       /*
+-       * Before we've swapped the forks, lets set the owners of the forks
+-       * appropriately. We have to do this as we are demand paging the btree
+-       * buffers, and so the validation done on read will expect the owner
+-       * field to be correctly set. Once we change the owners, we can swap the
+-       * inode forks.
++       * Btree format (v3) inodes have the inode number stamped in the bmbt
++       * block headers. We can't start changing the bmbt blocks until the
++       * inode owner change is logged so recovery does the right thing in the
++       * event of a crash. Set the owner change log flags now and leave the
++       * bmbt scan as the last step.
+        */
+       if (ip->i_d.di_version == 3 &&
+-          ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
++          ip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+               (*target_log_flags) |= XFS_ILOG_DOWNER;
+-              error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
+-                                            tip->i_ino, NULL);
+-              if (error)
+-                      return error;
+-      }
+-
+       if (tip->i_d.di_version == 3 &&
+-          tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
++          tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+               (*src_log_flags) |= XFS_ILOG_DOWNER;
+-              error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
+-                                            ip->i_ino, NULL);
+-              if (error)
+-                      return error;
+-      }
+ 
+       /*
+        * Swap the data forks of the inodes
+@@ -1940,6 +1929,48 @@ xfs_swap_extent_forks(
+       return 0;
+ }
+ 
++/*
++ * Fix up the owners of the bmbt blocks to refer to the current inode. The
++ * change owner scan attempts to order all modified buffers in the current
++ * transaction. In the event of ordered buffer failure, the offending buffer is
++ * physically logged as a fallback and the scan returns -EAGAIN. We must roll
++ * the transaction in this case to replenish the fallback log reservation and
++ * restart the scan. This process repeats until the scan completes.
++ */
++static int
++xfs_swap_change_owner(
++      struct xfs_trans        **tpp,
++      struct xfs_inode        *ip,
++      struct xfs_inode        *tmpip)
++{
++      int                     error;
++      struct xfs_trans        *tp = *tpp;
++
++      do {
++              error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino,
++                                            NULL);
++              /* success or fatal error */
++              if (error != -EAGAIN)
++                      break;
++
++              error = xfs_trans_roll(tpp, NULL);
++              if (error)
++                      break;
++              tp = *tpp;
++
++              /*
++               * Redirty both inodes so they can relog and keep the log tail
++               * moving forward.
++               */
++              xfs_trans_ijoin(tp, ip, 0);
++              xfs_trans_ijoin(tp, tmpip, 0);
++              xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
++              xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE);
++      } while (true);
++
++      return error;
++}
++
+ int
+ xfs_swap_extents(
+       struct xfs_inode        *ip,    /* target inode */
+@@ -1954,7 +1985,7 @@ xfs_swap_extents(
+       int                     lock_flags;
+       struct xfs_ifork        *cowfp;
+       uint64_t                f;
+-      int                     resblks;
++      int                     resblks = 0;
+ 
+       /*
+        * Lock the inodes against other IO, page faults and truncate to
+@@ -2002,11 +2033,8 @@ xfs_swap_extents(
+                         XFS_SWAP_RMAP_SPACE_RES(mp,
+                               XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK),
+                               XFS_DATA_FORK);
+-              error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
+-                              0, 0, &tp);
+-      } else
+-              error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0,
+-                              0, 0, &tp);
++      }
++      error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+       if (error)
+               goto out_unlock;
+ 
+@@ -2091,6 +2119,23 @@ xfs_swap_extents(
+       xfs_trans_log_inode(tp, ip,  src_log_flags);
+       xfs_trans_log_inode(tp, tip, target_log_flags);
+ 
++      /*
++       * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems
++       * have inode number owner values in the bmbt blocks that still refer to
++       * the old inode. Scan each bmbt to fix up the owner values with the
++       * inode number of the current inode.
++       */
++      if (src_log_flags & XFS_ILOG_DOWNER) {
++              error = xfs_swap_change_owner(&tp, ip, tip);
++              if (error)
++                      goto out_trans_cancel;
++      }
++      if (target_log_flags & XFS_ILOG_DOWNER) {
++              error = xfs_swap_change_owner(&tp, tip, ip);
++              if (error)
++                      goto out_trans_cancel;
++      }
++
+       /*
+        * If this is a synchronous mount, make sure that the
+        * transaction goes to disk before returning to the user.
+diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
+index f6a8422e9562..e0a0af0946f2 100644
+--- a/fs/xfs/xfs_buf_item.c
++++ b/fs/xfs/xfs_buf_item.c
+@@ -29,6 +29,7 @@
+ #include "xfs_error.h"
+ #include "xfs_trace.h"
+ #include "xfs_log.h"
++#include "xfs_inode.h"
+ 
+ 
+ kmem_zone_t   *xfs_buf_item_zone;
+@@ -322,6 +323,8 @@ xfs_buf_item_format(
+       ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
+              (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
+               && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
++      ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) ||
++             (bip->bli_flags & XFS_BLI_STALE));
+ 
+ 
+       /*
+@@ -346,16 +349,6 @@ xfs_buf_item_format(
+               bip->bli_flags &= ~XFS_BLI_INODE_BUF;
+       }
+ 
+-      if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) ==
+-                                                      XFS_BLI_ORDERED) {
+-              /*
+-               * The buffer has been logged just to order it.  It is not being
+-               * included in the transaction commit, so don't format it.
+-               */
+-              trace_xfs_buf_item_format_ordered(bip);
+-              return;
+-      }
+-
+       for (i = 0; i < bip->bli_format_count; i++) {
+               xfs_buf_item_format_segment(bip, lv, &vecp, offset,
+                                           &bip->bli_formats[i]);
+@@ -574,26 +567,20 @@ xfs_buf_item_unlock(
+ {
+       struct xfs_buf_log_item *bip = BUF_ITEM(lip);
+       struct xfs_buf          *bp = bip->bli_buf;
+-      bool                    clean;
+-      bool                    aborted;
+-      int                     flags;
++      bool                    aborted = !!(lip->li_flags & XFS_LI_ABORTED);
++      bool                    hold = !!(bip->bli_flags & XFS_BLI_HOLD);
++      bool                    dirty = !!(bip->bli_flags & XFS_BLI_DIRTY);
++#if defined(DEBUG) || defined(XFS_WARN)
++      bool                    ordered = !!(bip->bli_flags & XFS_BLI_ORDERED);
++#endif
+ 
+       /* Clear the buffer's association with this transaction. */
+       bp->b_transp = NULL;
+ 
+       /*
+-       * If this is a transaction abort, don't return early.  Instead, allow
+-       * the brelse to happen.  Normally it would be done for stale
+-       * (cancelled) buffers at unpin time, but we'll never go through the
+-       * pin/unpin cycle if we abort inside commit.
++       * The per-transaction state has been copied above so clear it from the
++       * bli.
+        */
+-      aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false;
+-      /*
+-       * Before possibly freeing the buf item, copy the per-transaction state
+-       * so we can reference it safely later after clearing it from the
+-       * buffer log item.
+-       */
+-      flags = bip->bli_flags;
+       bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
+ 
+       /*
+@@ -601,7 +588,7 @@ xfs_buf_item_unlock(
+        * unlock the buffer and free the buf item when the buffer is unpinned
+        * for the last time.
+        */
+-      if (flags & XFS_BLI_STALE) {
++      if (bip->bli_flags & XFS_BLI_STALE) {
+               trace_xfs_buf_item_unlock_stale(bip);
+               ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
+               if (!aborted) {
+@@ -619,20 +606,11 @@ xfs_buf_item_unlock(
+        * regardless of whether it is dirty or not. A dirty abort implies a
+        * shutdown, anyway.
+        *
+-       * Ordered buffers are dirty but may have no recorded changes, so ensure
+-       * we only release clean items here.
++       * The bli dirty state should match whether the blf has logged segments
++       * except for ordered buffers, where only the bli should be dirty.
+        */
+-      clean = (flags & XFS_BLI_DIRTY) ? false : true;
+-      if (clean) {
+-              int i;
+-              for (i = 0; i < bip->bli_format_count; i++) {
+-                      if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
+-                                   bip->bli_formats[i].blf_map_size)) {
+-                              clean = false;
+-                              break;
+-                      }
+-              }
+-      }
++      ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
++             (ordered && dirty && !xfs_buf_item_dirty_format(bip)));
+ 
+       /*
+        * Clean buffers, by definition, cannot be in the AIL. However, aborted
+@@ -651,11 +629,11 @@ xfs_buf_item_unlock(
+                       ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
+                       xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
+                       xfs_buf_item_relse(bp);
+-              } else if (clean)
++              } else if (!dirty)
+                       xfs_buf_item_relse(bp);
+       }
+ 
+-      if (!(flags & XFS_BLI_HOLD))
++      if (!hold)
+               xfs_buf_relse(bp);
+ }
+ 
+@@ -945,14 +923,22 @@ xfs_buf_item_log(
+ 
+ 
+ /*
+- * Return 1 if the buffer has been logged or ordered in a transaction (at any
+- * point, not just the current transaction) and 0 if not.
++ * Return true if the buffer has any ranges logged/dirtied by a transaction,
++ * false otherwise.
+  */
+-uint
+-xfs_buf_item_dirty(
+-      xfs_buf_log_item_t      *bip)
++bool
++xfs_buf_item_dirty_format(
++      struct xfs_buf_log_item *bip)
+ {
+-      return (bip->bli_flags & XFS_BLI_DIRTY);
++      int                     i;
++
++      for (i = 0; i < bip->bli_format_count; i++) {
++              if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map,
++                           bip->bli_formats[i].blf_map_size))
++                      return true;
++      }
++
++      return false;
+ }
+ 
+ STATIC void
+@@ -1054,6 +1040,31 @@ xfs_buf_do_callbacks(
+       }
+ }
+ 
++/*
++ * Invoke the error state callback for each log item affected by the failed I/O.
++ *
++ * If a metadata buffer write fails with a non-permanent error, the buffer is
++ * eventually resubmitted and so the completion callbacks are not run. The error
++ * state may need to be propagated to the log items attached to the buffer,
++ * however, so the next AIL push of the item knows how to handle it correctly.
++ */
++STATIC void
++xfs_buf_do_callbacks_fail(
++      struct xfs_buf          *bp)
++{
++      struct xfs_log_item     *next;
++      struct xfs_log_item     *lip = bp->b_fspriv;
++      struct xfs_ail          *ailp = lip->li_ailp;
++
++      spin_lock(&ailp->xa_lock);
++      for (; lip; lip = next) {
++              next = lip->li_bio_list;
++              if (lip->li_ops->iop_error)
++                      lip->li_ops->iop_error(lip, bp);
++      }
++      spin_unlock(&ailp->xa_lock);
++}
++
+ static bool
+ xfs_buf_iodone_callback_error(
+       struct xfs_buf          *bp)
+@@ -1123,7 +1134,11 @@ xfs_buf_iodone_callback_error(
+       if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount)
+               goto permanent_error;
+ 
+-      /* still a transient error, higher layers will retry */
++      /*
++       * Still a transient error, run IO completion failure callbacks and let
++       * the higher layers retry the buffer.
++       */
++      xfs_buf_do_callbacks_fail(bp);
+       xfs_buf_ioerror(bp, 0);
+       xfs_buf_relse(bp);
+       return true;
+@@ -1204,3 +1219,31 @@ xfs_buf_iodone(
+       xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE);
+       xfs_buf_item_free(BUF_ITEM(lip));
+ }
++
++/*
++ * Requeue a failed buffer for writeback
++ *
++ * Return true if the buffer has been re-queued properly, false otherwise
++ */
++bool
++xfs_buf_resubmit_failed_buffers(
++      struct xfs_buf          *bp,
++      struct xfs_log_item     *lip,
++      struct list_head        *buffer_list)
++{
++      struct xfs_log_item     *next;
++
++      /*
++       * Clear XFS_LI_FAILED flag from all items before resubmit
++       *
++       * XFS_LI_FAILED set/clear is protected by xa_lock; the caller of this
++       * function already has it acquired
++       */
++      for (; lip; lip = next) {
++              next = lip->li_bio_list;
++              xfs_clear_li_failed(lip);
++      }
++
++      /* Add this buffer back to the delayed write list */
++      return xfs_buf_delwri_queue(bp, buffer_list);
++}
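
The xfs_buf_do_callbacks_fail()/xfs_buf_resubmit_failed_buffers() pair above walks
the buffer's singly linked list of attached log items, invoking an optional
per-item error hook when a write fails and clearing the failed state again before
the buffer is requeued. A minimal, self-contained userspace sketch of that
callback-over-intrusive-list pattern follows; the names (log_item, item_ops,
notify_write_failure) are illustrative stand-ins, not kernel APIs.

#include <stdio.h>

/* Illustrative stand-ins for xfs_log_item/xfs_item_ops: an intrusive, singly
 * linked list of items attached to one buffer, each with an optional error hook. */
struct log_item;

struct item_ops {
	void (*iop_error)(struct log_item *item);	/* may be NULL */
};

struct log_item {
	const char		*name;
	const struct item_ops	*ops;
	struct log_item		*next;			/* plays the role of li_bio_list */
};

static void mark_item_failed(struct log_item *item)
{
	printf("item %s: marked failed, will be retried on the next push\n", item->name);
}

static const struct item_ops inode_like_ops = { .iop_error = mark_item_failed };
static const struct item_ops quiet_ops      = { .iop_error = NULL };

/* Walk the attached items and invoke the error hook where one exists. */
static void notify_write_failure(struct log_item *head)
{
	struct log_item *next;

	for (; head; head = next) {
		next = head->next;
		if (head->ops->iop_error)
			head->ops->iop_error(head);
	}
}

int main(void)
{
	struct log_item b = { "second item", &quiet_ops,      NULL };
	struct log_item a = { "first item",  &inode_like_ops, &b   };

	notify_write_failure(&a);
	return 0;
}
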
+diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
+index f7eba99d19dd..9690ce62c9a7 100644
+--- a/fs/xfs/xfs_buf_item.h
++++ b/fs/xfs/xfs_buf_item.h
+@@ -64,12 +64,15 @@ typedef struct xfs_buf_log_item {
+ int   xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
+ void  xfs_buf_item_relse(struct xfs_buf *);
+ void  xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint);
+-uint  xfs_buf_item_dirty(xfs_buf_log_item_t *);
++bool  xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
+ void  xfs_buf_attach_iodone(struct xfs_buf *,
+                             void(*)(struct xfs_buf *, xfs_log_item_t *),
+                             xfs_log_item_t *);
+ void  xfs_buf_iodone_callbacks(struct xfs_buf *);
+ void  xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *);
++bool  xfs_buf_resubmit_failed_buffers(struct xfs_buf *,
++                                      struct xfs_log_item *,
++                                      struct list_head *);
+ 
+ extern kmem_zone_t    *xfs_buf_item_zone;
+ 
+diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
+index 0a9e6985a0d0..34227115a5d6 100644
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -1124,11 +1124,11 @@ xfs_reclaim_inode(
+        * Because we use RCU freeing we need to ensure the inode always appears
+        * to be reclaimed with an invalid inode number when in the free state.
+        * We do this as early as possible under the ILOCK so that
+-       * xfs_iflush_cluster() can be guaranteed to detect races with us here.
+-       * By doing this, we guarantee that once xfs_iflush_cluster has locked
+-       * XFS_ILOCK that it will see either a valid, flushable inode that will
+-       * serialise correctly, or it will see a clean (and invalid) inode that
+-       * it can skip.
++       * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
++       * detect races with us here. By doing this, we guarantee that once
++       * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
++       * it will see either a valid inode that will serialise correctly, or it
++       * will see an invalid inode that it can skip.
+        */
+       spin_lock(&ip->i_flags_lock);
+       ip->i_flags = XFS_IRECLAIM;
+diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
+index ff48f0096810..97045e8dfed5 100644
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -2359,11 +2359,24 @@ xfs_ifree_cluster(
+                        * already marked stale. If we can't lock it, back off
+                        * and retry.
+                        */
+-                      if (ip != free_ip &&
+-                          !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+-                              rcu_read_unlock();
+-                              delay(1);
+-                              goto retry;
++                      if (ip != free_ip) {
++                              if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
++                                      rcu_read_unlock();
++                                      delay(1);
++                                      goto retry;
++                              }
++
++                              /*
++                               * Check the inode number again in case we're
++                               * racing with freeing in xfs_reclaim_inode().
++                               * See the comments in that function for more
++                               * information as to why the initial check is
++                               * not sufficient.
++                               */
++                              if (ip->i_ino != inum + i) {
++                                      xfs_iunlock(ip, XFS_ILOCK_EXCL);
++                                      continue;
++                              }
+                       }
+                       rcu_read_unlock();
+ 
+diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
+index 013cc78d7daf..6d0f74ec31e8 100644
+--- a/fs/xfs/xfs_inode_item.c
++++ b/fs/xfs/xfs_inode_item.c
+@@ -27,6 +27,7 @@
+ #include "xfs_error.h"
+ #include "xfs_trace.h"
+ #include "xfs_trans_priv.h"
++#include "xfs_buf_item.h"
+ #include "xfs_log.h"
+ 
+ 
+@@ -475,6 +476,23 @@ xfs_inode_item_unpin(
+               wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
+ }
+ 
++/*
++ * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer
++ * have been failed during writeback
++ *
++ * This informs the AIL that the inode is already flush locked on the next push,
++ * and acquires a hold on the buffer to ensure that it isn't reclaimed before
++ * dirty data makes it to disk.
++ */
++STATIC void
++xfs_inode_item_error(
++      struct xfs_log_item     *lip,
++      struct xfs_buf          *bp)
++{
++      ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode));
++      xfs_set_li_failed(lip, bp);
++}
++
+ STATIC uint
+ xfs_inode_item_push(
+       struct xfs_log_item     *lip,
+@@ -484,13 +502,28 @@ xfs_inode_item_push(
+ {
+       struct xfs_inode_log_item *iip = INODE_ITEM(lip);
+       struct xfs_inode        *ip = iip->ili_inode;
+-      struct xfs_buf          *bp = NULL;
++      struct xfs_buf          *bp = lip->li_buf;
+       uint                    rval = XFS_ITEM_SUCCESS;
+       int                     error;
+ 
+       if (xfs_ipincount(ip) > 0)
+               return XFS_ITEM_PINNED;
+ 
++      /*
++       * The buffer containing this item failed to be written back
++       * previously. Resubmit the buffer for IO.
++       */
++      if (lip->li_flags & XFS_LI_FAILED) {
++              if (!xfs_buf_trylock(bp))
++                      return XFS_ITEM_LOCKED;
++
++              if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list))
++                      rval = XFS_ITEM_FLUSHING;
++
++              xfs_buf_unlock(bp);
++              return rval;
++      }
++
+       if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
+               return XFS_ITEM_LOCKED;
+ 
+@@ -622,7 +655,8 @@ static const struct xfs_item_ops xfs_inode_item_ops = {
+       .iop_unlock     = xfs_inode_item_unlock,
+       .iop_committed  = xfs_inode_item_committed,
+       .iop_push       = xfs_inode_item_push,
+-      .iop_committing = xfs_inode_item_committing
++      .iop_committing = xfs_inode_item_committing,
++      .iop_error      = xfs_inode_item_error
+ };
+ 
+ 
+@@ -710,7 +744,8 @@ xfs_iflush_done(
+                * the AIL lock.
+                */
+               iip = INODE_ITEM(blip);
+-              if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn)
++              if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) ||
++                  lip->li_flags & XFS_LI_FAILED)
+                       need_ail++;
+ 
+               blip = next;
+@@ -718,7 +753,8 @@ xfs_iflush_done(
+ 
+       /* make sure we capture the state of the initial inode. */
+       iip = INODE_ITEM(lip);
+-      if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn)
++      if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) ||
++          lip->li_flags & XFS_LI_FAILED)
+               need_ail++;
+ 
+       /*
+@@ -739,6 +775,9 @@ xfs_iflush_done(
+                       if (INODE_ITEM(blip)->ili_logged &&
+                           blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
+                               mlip_changed |= xfs_ail_delete_one(ailp, blip);
++                      else {
++                              xfs_clear_li_failed(blip);
++                      }
+               }
+ 
+               if (mlip_changed) {
+diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
+index 9c0c7a920304..5049e8ab6e30 100644
+--- a/fs/xfs/xfs_ioctl.c
++++ b/fs/xfs/xfs_ioctl.c
+@@ -931,16 +931,15 @@ xfs_ioc_fsgetxattr(
+       return 0;
+ }
+ 
+-STATIC void
+-xfs_set_diflags(
++STATIC uint16_t
++xfs_flags2diflags(
+       struct xfs_inode        *ip,
+       unsigned int            xflags)
+ {
+-      unsigned int            di_flags;
+-      uint64_t                di_flags2;
+-
+       /* can't set PREALLOC this way, just preserve it */
+-      di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
++      uint16_t                di_flags =
++              (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
++
+       if (xflags & FS_XFLAG_IMMUTABLE)
+               di_flags |= XFS_DIFLAG_IMMUTABLE;
+       if (xflags & FS_XFLAG_APPEND)
+@@ -970,19 +969,24 @@ xfs_set_diflags(
+               if (xflags & FS_XFLAG_EXTSIZE)
+                       di_flags |= XFS_DIFLAG_EXTSIZE;
+       }
+-      ip->i_d.di_flags = di_flags;
+ 
+-      /* diflags2 only valid for v3 inodes. */
+-      if (ip->i_d.di_version < 3)
+-              return;
++      return di_flags;
++}
++
++STATIC uint64_t
++xfs_flags2diflags2(
++      struct xfs_inode        *ip,
++      unsigned int            xflags)
++{
++      uint64_t                di_flags2 =
++              (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
+ 
+-      di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
+       if (xflags & FS_XFLAG_DAX)
+               di_flags2 |= XFS_DIFLAG2_DAX;
+       if (xflags & FS_XFLAG_COWEXTSIZE)
+               di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+ 
+-      ip->i_d.di_flags2 = di_flags2;
++      return di_flags2;
+ }
+ 
+ STATIC void
+@@ -1008,11 +1012,12 @@ xfs_diflags_to_linux(
+               inode->i_flags |= S_NOATIME;
+       else
+               inode->i_flags &= ~S_NOATIME;
++#if 0 /* disabled until the flag switching races are sorted out */
+       if (xflags & FS_XFLAG_DAX)
+               inode->i_flags |= S_DAX;
+       else
+               inode->i_flags &= ~S_DAX;
+-
++#endif
+ }
+ 
+ static int
+@@ -1022,6 +1027,7 @@ xfs_ioctl_setattr_xflags(
+       struct fsxattr          *fa)
+ {
+       struct xfs_mount        *mp = ip->i_mount;
++      uint64_t                di_flags2;
+ 
+       /* Can't change realtime flag if any extents are allocated. */
+       if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
+@@ -1052,7 +1058,14 @@ xfs_ioctl_setattr_xflags(
+           !capable(CAP_LINUX_IMMUTABLE))
+               return -EPERM;
+ 
+-      xfs_set_diflags(ip, fa->fsx_xflags);
++      /* diflags2 only valid for v3 inodes. */
++      di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
++      if (di_flags2 && ip->i_d.di_version < 3)
++              return -EINVAL;
++
++      ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags);
++      ip->i_d.di_flags2 = di_flags2;
++
+       xfs_diflags_to_linux(ip);
+       xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
+index 469c9fa4c178..17081c77ef86 100644
+--- a/fs/xfs/xfs_iops.c
++++ b/fs/xfs/xfs_iops.c
+@@ -817,7 +817,7 @@ xfs_vn_setattr_nonsize(
+  * Caution: The caller of this function is responsible for calling
+  * setattr_prepare() or otherwise verifying the change is fine.
+  */
+-int
++STATIC int
+ xfs_setattr_size(
+       struct xfs_inode        *ip,
+       struct iattr            *iattr)
+diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
+index 4ebd0bafc914..c5107c7bc4bf 100644
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -743,10 +743,14 @@ xfs_log_mount_finish(
+       struct xfs_mount        *mp)
+ {
+       int     error = 0;
++      bool    readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
+ 
+       if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
+               ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+               return 0;
++      } else if (readonly) {
++              /* Allow unlinked processing to proceed */
++              mp->m_flags &= ~XFS_MOUNT_RDONLY;
+       }
+ 
+       /*
+@@ -757,12 +761,27 @@ xfs_log_mount_finish(
+        * inodes.  Turn it off immediately after recovery finishes
+        * so that we don't leak the quota inodes if subsequent mount
+        * activities fail.
++       *
++       * We let all inodes involved in redo item processing end up on
++       * the LRU instead of being evicted immediately so that if we do
++       * something to an unlinked inode, the irele won't cause
++       * premature truncation and freeing of the inode, which results
++       * in log recovery failure.  We have to evict the unreferenced
++       * lru inodes after clearing MS_ACTIVE because we don't
++       * otherwise clean up the lru if there's a subsequent failure in
++       * xfs_mountfs, which leads to us leaking the inodes if nothing
++       * else (e.g. quotacheck) references the inodes before the
++       * mount failure occurs.
+        */
+       mp->m_super->s_flags |= MS_ACTIVE;
+       error = xlog_recover_finish(mp->m_log);
+       if (!error)
+               xfs_log_work_queue(mp);
+       mp->m_super->s_flags &= ~MS_ACTIVE;
++      evict_inodes(mp->m_super);
++
++      if (readonly)
++              mp->m_flags |= XFS_MOUNT_RDONLY;
+ 
+       return error;
+ }
+@@ -812,11 +831,14 @@ xfs_log_unmount_write(xfs_mount_t *mp)
+       int              error;
+ 
+       /*
+-       * Don't write out unmount record on read-only mounts.
++       * Don't write out unmount record on norecovery mounts or ro devices.
+        * Or, if we are doing a forced umount (typically because of IO errors).
+        */
+-      if (mp->m_flags & XFS_MOUNT_RDONLY)
++      if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
++          xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) {
++              ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+               return 0;
++      }
+ 
+       error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL);
+       ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
+@@ -3353,8 +3375,6 @@ _xfs_log_force(
+                */
+               if (iclog->ic_state & XLOG_STATE_IOERROR)
+                       return -EIO;
+-              if (log_flushed)
+-                      *log_flushed = 1;
+       } else {
+ 
+ no_sleep:
+@@ -3458,8 +3478,6 @@ _xfs_log_force_lsn(
+ 
+                               xlog_wait(&iclog->ic_prev->ic_write_wait,
+                                                       &log->l_icloglock);
+-                              if (log_flushed)
+-                                      *log_flushed = 1;
+                               already_slept = 1;
+                               goto try_again;
+                       }
+@@ -3493,9 +3511,6 @@ _xfs_log_force_lsn(
+                        */
+                       if (iclog->ic_state & XLOG_STATE_IOERROR)
+                               return -EIO;
+-
+-                      if (log_flushed)
+-                              *log_flushed = 1;
+               } else {                /* just return */
+                       spin_unlock(&log->l_icloglock);
+               }
+diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
+index 9549188f5a36..093ee8289057 100644
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -1029,61 +1029,106 @@ xlog_seek_logrec_hdr(
+ }
+ 
+ /*
+- * Check the log tail for torn writes. This is required when torn writes are
+- * detected at the head and the head had to be walked back to a previous record.
+- * The tail of the previous record must now be verified to ensure the torn
+- * writes didn't corrupt the previous tail.
++ * Calculate distance from head to tail (i.e., unused space in the log).
++ */
++static inline int
++xlog_tail_distance(
++      struct xlog     *log,
++      xfs_daddr_t     head_blk,
++      xfs_daddr_t     tail_blk)
++{
++      if (head_blk < tail_blk)
++              return tail_blk - head_blk;
++
++      return tail_blk + (log->l_logBBsize - head_blk);
++}
++
++/*
++ * Verify the log tail. This is particularly important when torn or incomplete
++ * writes have been detected near the front of the log and the head has been
++ * walked back accordingly.
++ *
++ * We also have to handle the case where the tail was pinned and the head
++ * blocked behind the tail right before a crash. If the tail had been pushed
++ * immediately prior to the crash and the subsequent checkpoint was only
++ * partially written, it's possible it overwrote the last referenced tail in the
++ * log with garbage. This is not a coherency problem because the tail must have
++ * been pushed before it can be overwritten, but appears as log corruption to
++ * recovery because we have no way to know the tail was updated if the
++ * subsequent checkpoint didn't write successfully.
+  *
+- * Return an error if CRC verification fails as recovery cannot proceed.
++ * Therefore, CRC check the log from tail to head. If a failure occurs and the
++ * offending record is within max iclog bufs from the head, walk the tail
++ * forward and retry until a valid tail is found or corruption is detected out
++ * of the range of a possible overwrite.
+  */
+ STATIC int
+ xlog_verify_tail(
+       struct xlog             *log,
+       xfs_daddr_t             head_blk,
+-      xfs_daddr_t             tail_blk)
++      xfs_daddr_t             *tail_blk,
++      int                     hsize)
+ {
+       struct xlog_rec_header  *thead;
+       struct xfs_buf          *bp;
+       xfs_daddr_t             first_bad;
+-      int                     count;
+       int                     error = 0;
+       bool                    wrapped;
+-      xfs_daddr_t             tmp_head;
++      xfs_daddr_t             tmp_tail;
++      xfs_daddr_t             orig_tail = *tail_blk;
+ 
+       bp = xlog_get_bp(log, 1);
+       if (!bp)
+               return -ENOMEM;
+ 
+       /*
+-       * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get
+-       * a temporary head block that points after the last possible
+-       * concurrently written record of the tail.
++       * Make sure the tail points to a record (returns positive count on
++       * success).
+        */
+-      count = xlog_seek_logrec_hdr(log, head_blk, tail_blk,
+-                                   XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead,
+-                                   &wrapped);
+-      if (count < 0) {
+-              error = count;
++      error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp,
++                      &tmp_tail, &thead, &wrapped);
++      if (error < 0)
+               goto out;
+-      }
+-
+-      /*
+-       * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran
+-       * into the actual log head. tmp_head points to the start of the record
+-       * so update it to the actual head block.
+-       */
+-      if (count < XLOG_MAX_ICLOGS + 1)
+-              tmp_head = head_blk;
++      if (*tail_blk != tmp_tail)
++              *tail_blk = tmp_tail;
+ 
+       /*
+-       * We now have a tail and temporary head block that covers at least
+-       * XLOG_MAX_ICLOGS records from the tail. We need to verify that these
+-       * records were completely written. Run a CRC verification pass from
+-       * tail to head and return the result.
++       * Run a CRC check from the tail to the head. We can't just check
++       * MAX_ICLOGS records past the tail because the tail may point to stale
++       * blocks cleared during the search for the head/tail. These blocks are
++       * overwritten with zero-length records and thus record count is not a
++       * reliable indicator of the iclog state before a crash.
+        */
+-      error = xlog_do_recovery_pass(log, tmp_head, tail_blk,
++      first_bad = 0;
++      error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
+                                     XLOG_RECOVER_CRCPASS, &first_bad);
++      while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
++              int     tail_distance;
++
++              /*
++               * Is corruption within range of the head? If so, retry from
++               * the next record. Otherwise return an error.
++               */
++              tail_distance = xlog_tail_distance(log, head_blk, first_bad);
++              if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize))
++                      break;
+ 
++              /* skip to the next record; returns positive count on success */
++              error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp,
++                              &tmp_tail, &thead, &wrapped);
++              if (error < 0)
++                      goto out;
++
++              *tail_blk = tmp_tail;
++              first_bad = 0;
++              error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
++                                            XLOG_RECOVER_CRCPASS, &first_bad);
++      }
++
++      if (!error && *tail_blk != orig_tail)
++              xfs_warn(log->l_mp,
++              "Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
++                       orig_tail, *tail_blk);
+ out:
+       xlog_put_bp(bp);
+       return error;
+@@ -1143,7 +1188,7 @@ xlog_verify_head(
+        */
+       error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
+                                     XLOG_RECOVER_CRCPASS, &first_bad);
+-      if (error == -EFSBADCRC) {
++      if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
+               /*
+                * We've hit a potential torn write. Reset the error and warn
+                * about it.
+@@ -1183,31 +1228,12 @@ xlog_verify_head(
+                       ASSERT(0);
+                       return 0;
+               }
+-
+-              /*
+-               * Now verify the tail based on the updated head. This is
+-               * required because the torn writes trimmed from the head could
+-               * have been written over the tail of a previous record. Return
+-               * any errors since recovery cannot proceed if the tail is
+-               * corrupt.
+-               *
+-               * XXX: This leaves a gap in truly robust protection from torn
+-               * writes in the log. If the head is behind the tail, the tail
+-               * pushes forward to create some space and then a crash occurs
+-               * causing the writes into the previous record's tail region to
+-               * tear, log recovery isn't able to recover.
+-               *
+-               * How likely is this to occur? If possible, can we do something
+-               * more intelligent here? Is it safe to push the tail forward if
+-               * we can determine that the tail is within the range of the
+-               * torn write (e.g., the kernel can only overwrite the tail if
+-               * it has actually been pushed forward)? Alternatively, could we
+-               * somehow prevent this condition at runtime?
+-               */
+-              error = xlog_verify_tail(log, *head_blk, *tail_blk);
+       }
++      if (error)
++              return error;
+ 
+-      return error;
++      return xlog_verify_tail(log, *head_blk, tail_blk,
++                              be32_to_cpu((*rhead)->h_size));
+ }
+ 
+ /*
+@@ -4801,12 +4827,16 @@ xlog_recover_process_intents(
+       int                     error = 0;
+       struct xfs_ail_cursor   cur;
+       struct xfs_ail          *ailp;
++#if defined(DEBUG) || defined(XFS_WARN)
+       xfs_lsn_t               last_lsn;
++#endif
+ 
+       ailp = log->l_ailp;
+       spin_lock(&ailp->xa_lock);
+       lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
++#if defined(DEBUG) || defined(XFS_WARN)
+       last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
++#endif
+       while (lip != NULL) {
+               /*
+                * We're done when we see something other than an intent.
+@@ -5218,7 +5248,7 @@ xlog_do_recovery_pass(
+       xfs_daddr_t             *first_bad)     /* out: first bad log rec */
+ {
+       xlog_rec_header_t       *rhead;
+-      xfs_daddr_t             blk_no;
++      xfs_daddr_t             blk_no, rblk_no;
+       xfs_daddr_t             rhead_blk;
+       char                    *offset;
+       xfs_buf_t               *hbp, *dbp;
+@@ -5231,7 +5261,7 @@ xlog_do_recovery_pass(
+       LIST_HEAD               (buffer_list);
+ 
+       ASSERT(head_blk != tail_blk);
+-      rhead_blk = 0;
++      blk_no = rhead_blk = tail_blk;
+ 
+       for (i = 0; i < XLOG_RHASH_SIZE; i++)
+               INIT_HLIST_HEAD(&rhash[i]);
+@@ -5309,7 +5339,6 @@ xlog_do_recovery_pass(
+       }
+ 
+       memset(rhash, 0, sizeof(rhash));
+-      blk_no = rhead_blk = tail_blk;
+       if (tail_blk > head_blk) {
+               /*
+                * Perform recovery around the end of the physical log.
+@@ -5371,9 +5400,19 @@ xlog_do_recovery_pass(
+                       bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
+                       blk_no += hblks;
+ 
+-                      /* Read in data for log record */
+-                      if (blk_no + bblks <= log->l_logBBsize) {
+-                              error = xlog_bread(log, blk_no, bblks, dbp,
++                      /*
++                       * Read the log record data in multiple reads if it
++                       * wraps around the end of the log. Note that if the
++                       * header already wrapped, blk_no could point past the
++                       * end of the log. The record data is contiguous in
++                       * that case.
++                       */
++                      if (blk_no + bblks <= log->l_logBBsize ||
++                          blk_no >= log->l_logBBsize) {
++                              /* mod blk_no in case the header wrapped and
++                               * pushed it beyond the end of the log */
++                              rblk_no = do_mod(blk_no, log->l_logBBsize);
++                              error = xlog_bread(log, rblk_no, bblks, dbp,
+                                                  &offset);
+                               if (error)
+                                       goto bread_err2;
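
The xlog_tail_distance() helper above treats the log as a circular buffer of
l_logBBsize blocks and returns the unused space between head and tail; the retry
loop in xlog_verify_tail() only walks the tail forward while the failing record
sits within XLOG_MAX_ICLOGS iclogs' worth of blocks of the head. A small sketch of
the same wrap-around arithmetic, with a hypothetical log_size parameter standing
in for log->l_logBBsize:

#include <assert.h>
#include <stdio.h>

/* Free space from head to tail on a circular log of log_size blocks. */
static long tail_distance(long log_size, long head_blk, long tail_blk)
{
	if (head_blk < tail_blk)
		return tail_blk - head_blk;

	return tail_blk + (log_size - head_blk);
}

int main(void)
{
	/* Head behind tail: plain difference. */
	assert(tail_distance(1000, 100, 400) == 300);

	/* Head ahead of tail: the free space wraps past block 0. */
	assert(tail_distance(1000, 900, 50) == 150);

	printf("tail distance checks passed\n");
	return 0;
}
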
+diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
+index 38aaacdbb8b3..c1c4c2ea1014 100644
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1220,7 +1220,7 @@ xfs_test_remount_options(
+       tmp_mp->m_super = sb;
+       error = xfs_parseargs(tmp_mp, options);
+       xfs_free_fsname(tmp_mp);
+-      kfree(tmp_mp);
++      kmem_free(tmp_mp);
+ 
+       return error;
+ }
+diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
+index bcc3cdf8e1c5..bb0099708827 100644
+--- a/fs/xfs/xfs_trace.h
++++ b/fs/xfs/xfs_trace.h
+@@ -517,7 +517,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
+ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered);
+ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
+ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
+-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered);
+ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
+ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered);
+ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
+diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
+index 6bdad6f58934..4709823e04b9 100644
+--- a/fs/xfs/xfs_trans.h
++++ b/fs/xfs/xfs_trans.h
+@@ -49,6 +49,7 @@ typedef struct xfs_log_item {
+       struct xfs_ail                  *li_ailp;       /* ptr to AIL */
+       uint                            li_type;        /* item type */
+       uint                            li_flags;       /* misc flags */
++      struct xfs_buf                  *li_buf;        /* real buffer pointer */
+       struct xfs_log_item             *li_bio_list;   /* buffer item list */
+       void                            (*li_cb)(struct xfs_buf *,
+                                                struct xfs_log_item *);
+@@ -64,11 +65,13 @@ typedef struct xfs_log_item {
+ } xfs_log_item_t;
+ 
+ #define       XFS_LI_IN_AIL   0x1
+-#define XFS_LI_ABORTED        0x2
++#define       XFS_LI_ABORTED  0x2
++#define       XFS_LI_FAILED   0x4
+ 
+ #define XFS_LI_FLAGS \
+       { XFS_LI_IN_AIL,        "IN_AIL" }, \
+-      { XFS_LI_ABORTED,       "ABORTED" }
++      { XFS_LI_ABORTED,       "ABORTED" }, \
++      { XFS_LI_FAILED,        "FAILED" }
+ 
+ struct xfs_item_ops {
+       void (*iop_size)(xfs_log_item_t *, int *, int *);
+@@ -79,6 +82,7 @@ struct xfs_item_ops {
+       void (*iop_unlock)(xfs_log_item_t *);
+       xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
+       void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
++      void (*iop_error)(xfs_log_item_t *, xfs_buf_t *);
+ };
+ 
+ void  xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item,
+@@ -208,12 +212,14 @@ void             xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
+ void          xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
+ void          xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *);
+ void          xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *);
+-void          xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
++bool          xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *);
+ void          xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint);
+ void          xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *);
+ void          xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int);
+ void          xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
+-void          xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint);
++void          xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint,
++                                uint);
++void          xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *);
+ void          xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
+ 
+ void          xfs_extent_free_init_defer_op(void);
+diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
+index 9056c0f34a3c..70f5ab017323 100644
+--- a/fs/xfs/xfs_trans_ail.c
++++ b/fs/xfs/xfs_trans_ail.c
+@@ -687,12 +687,13 @@ xfs_trans_ail_update_bulk(
+ bool
+ xfs_ail_delete_one(
+       struct xfs_ail          *ailp,
+-      struct xfs_log_item     *lip)
++      struct xfs_log_item     *lip)
+ {
+       struct xfs_log_item     *mlip = xfs_ail_min(ailp);
+ 
+       trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
+       xfs_ail_delete(ailp, lip);
++      xfs_clear_li_failed(lip);
+       lip->li_flags &= ~XFS_LI_IN_AIL;
+       lip->li_lsn = 0;
+ 
+diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
+index 86987d823d76..3ba7a96a8abd 100644
+--- a/fs/xfs/xfs_trans_buf.c
++++ b/fs/xfs/xfs_trans_buf.c
+@@ -435,7 +435,7 @@ xfs_trans_brelse(xfs_trans_t       *tp,
+       if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) {
+               xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR);
+               xfs_buf_item_relse(bp);
+-      } else if (!xfs_buf_item_dirty(bip)) {
++      } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) {
+ /***
+               ASSERT(bp->b_pincount == 0);
+ ***/
+@@ -493,25 +493,17 @@ xfs_trans_bhold_release(xfs_trans_t      *tp,
+ }
+ 
+ /*
+- * This is called to mark bytes first through last inclusive of the given
+- * buffer as needing to be logged when the transaction is committed.
+- * The buffer must already be associated with the given transaction.
+- *
+- * First and last are numbers relative to the beginning of this buffer,
+- * so the first byte in the buffer is numbered 0 regardless of the
+- * value of b_blkno.
++ * Mark a buffer dirty in the transaction.
+  */
+ void
+-xfs_trans_log_buf(xfs_trans_t *tp,
+-                xfs_buf_t     *bp,
+-                uint          first,
+-                uint          last)
++xfs_trans_dirty_buf(
++      struct xfs_trans        *tp,
++      struct xfs_buf          *bp)
+ {
+-      xfs_buf_log_item_t      *bip = bp->b_fspriv;
++      struct xfs_buf_log_item *bip = bp->b_fspriv;
+ 
+       ASSERT(bp->b_transp == tp);
+       ASSERT(bip != NULL);
+-      ASSERT(first <= last && last < BBTOB(bp->b_length));
+       ASSERT(bp->b_iodone == NULL ||
+              bp->b_iodone == xfs_buf_iodone_callbacks);
+ 
+@@ -531,8 +523,6 @@ xfs_trans_log_buf(xfs_trans_t      *tp,
+       bp->b_iodone = xfs_buf_iodone_callbacks;
+       bip->bli_item.li_cb = xfs_buf_iodone;
+ 
+-      trace_xfs_trans_log_buf(bip);
+-
+       /*
+        * If we invalidated the buffer within this transaction, then
+        * cancel the invalidation now that we're dirtying the buffer
+@@ -545,17 +535,37 @@ xfs_trans_log_buf(xfs_trans_t    *tp,
+               bp->b_flags &= ~XBF_STALE;
+               bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL;
+       }
++      bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
+ 
+       tp->t_flags |= XFS_TRANS_DIRTY;
+       bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
++}
+ 
+-      /*
+-       * If we have an ordered buffer we are not logging any dirty range but
+-       * it still needs to be marked dirty and that it has been logged.
+-       */
+-      bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED;
+-      if (!(bip->bli_flags & XFS_BLI_ORDERED))
+-              xfs_buf_item_log(bip, first, last);
++/*
++ * This is called to mark bytes first through last inclusive of the given
++ * buffer as needing to be logged when the transaction is committed.
++ * The buffer must already be associated with the given transaction.
++ *
++ * First and last are numbers relative to the beginning of this buffer,
++ * so the first byte in the buffer is numbered 0 regardless of the
++ * value of b_blkno.
++ */
++void
++xfs_trans_log_buf(
++      struct xfs_trans        *tp,
++      struct xfs_buf          *bp,
++      uint                    first,
++      uint                    last)
++{
++      struct xfs_buf_log_item *bip = bp->b_fspriv;
++
++      ASSERT(first <= last && last < BBTOB(bp->b_length));
++      ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED));
++
++      xfs_trans_dirty_buf(tp, bp);
++
++      trace_xfs_trans_log_buf(bip);
++      xfs_buf_item_log(bip, first, last);
+ }
+ 
+ 
+@@ -708,14 +718,13 @@ xfs_trans_inode_alloc_buf(
+ }
+ 
+ /*
+- * Mark the buffer as ordered for this transaction. This means
+- * that the contents of the buffer are not recorded in the transaction
+- * but it is tracked in the AIL as though it was. This allows us
+- * to record logical changes in transactions rather than the physical
+- * changes we make to the buffer without changing writeback ordering
+- * constraints of metadata buffers.
++ * Mark the buffer as ordered for this transaction. This means that the contents
++ * of the buffer are not recorded in the transaction but it is tracked in the
++ * AIL as though it was. This allows us to record logical changes in
++ * transactions rather than the physical changes we make to the buffer without
++ * changing writeback ordering constraints of metadata buffers.
+  */
+-void
++bool
+ xfs_trans_ordered_buf(
+       struct xfs_trans        *tp,
+       struct xfs_buf          *bp)
+@@ -726,8 +735,18 @@ xfs_trans_ordered_buf(
+       ASSERT(bip != NULL);
+       ASSERT(atomic_read(&bip->bli_refcount) > 0);
+ 
++      if (xfs_buf_item_dirty_format(bip))
++              return false;
++
+       bip->bli_flags |= XFS_BLI_ORDERED;
+       trace_xfs_buf_item_ordered(bip);
++
++      /*
++       * We don't log a dirty range of an ordered buffer but it still needs
++       * to be marked dirty and that it has been logged.
++       */
++      xfs_trans_dirty_buf(tp, bp);
++      return true;
+ }
+ 
+ /*
+diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
+index d91706c56c63..b317a3644c00 100644
+--- a/fs/xfs/xfs_trans_priv.h
++++ b/fs/xfs/xfs_trans_priv.h
+@@ -164,4 +164,35 @@ xfs_trans_ail_copy_lsn(
+       *dst = *src;
+ }
+ #endif
++
++static inline void
++xfs_clear_li_failed(
++      struct xfs_log_item     *lip)
++{
++      struct xfs_buf  *bp = lip->li_buf;
++
++      ASSERT(lip->li_flags & XFS_LI_IN_AIL);
++      lockdep_assert_held(&lip->li_ailp->xa_lock);
++
++      if (lip->li_flags & XFS_LI_FAILED) {
++              lip->li_flags &= ~XFS_LI_FAILED;
++              lip->li_buf = NULL;
++              xfs_buf_rele(bp);
++      }
++}
++
++static inline void
++xfs_set_li_failed(
++      struct xfs_log_item     *lip,
++      struct xfs_buf          *bp)
++{
++      lockdep_assert_held(&lip->li_ailp->xa_lock);
++
++      if (!(lip->li_flags & XFS_LI_FAILED)) {
++              xfs_buf_hold(bp);
++              lip->li_flags |= XFS_LI_FAILED;
++              lip->li_buf = bp;
++      }
++}
++
+ #endif        /* __XFS_TRANS_PRIV_H__ */
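
The xfs_set_li_failed()/xfs_clear_li_failed() helpers above pair the XFS_LI_FAILED
flag change with a hold/release of the backing buffer, so a failed item is never
left pointing at a buffer that has already gone away and repeated calls stay
balanced. A self-contained sketch of that pairing, with a plain reference count
standing in for xfs_buf_hold()/xfs_buf_rele() and purely illustrative type names:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct buf {
	int refcount;
};

struct item {
	bool		failed;		/* plays the role of XFS_LI_FAILED */
	struct buf	*buf;		/* plays the role of li_buf */
};

static void buf_hold(struct buf *bp) { bp->refcount++; }
static void buf_rele(struct buf *bp) { bp->refcount--; }

/* Take a buffer reference the first time the item is marked failed. */
static void set_item_failed(struct item *ip, struct buf *bp)
{
	if (!ip->failed) {
		buf_hold(bp);
		ip->failed = true;
		ip->buf = bp;
	}
}

/* Drop the reference when the failed state is cleared. */
static void clear_item_failed(struct item *ip)
{
	struct buf *bp = ip->buf;

	if (ip->failed) {
		ip->failed = false;
		ip->buf = NULL;
		buf_rele(bp);
	}
}

int main(void)
{
	struct buf bp = { .refcount = 1 };
	struct item ip = { false, NULL };

	set_item_failed(&ip, &bp);
	set_item_failed(&ip, &bp);	/* second call is a no-op: no extra hold */
	assert(bp.refcount == 2);

	clear_item_failed(&ip);
	clear_item_failed(&ip);		/* second call is a no-op: no extra release */
	assert(bp.refcount == 1);

	printf("hold/release pairing checks passed\n");
	return 0;
}
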
+diff --git a/include/linux/fs.h b/include/linux/fs.h
+index cbfe127bccf8..d0c0ca8ea8c1 100644
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -2831,6 +2831,7 @@ static inline void lockdep_annotate_inode_mutex_key(struct inode *inode) { };
+ #endif
+ extern void unlock_new_inode(struct inode *);
+ extern unsigned int get_next_ino(void);
++extern void evict_inodes(struct super_block *sb);
+ 
+ extern void __iget(struct inode * inode);
+ extern void iget_failed(struct inode *);
+diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
+index e030a68ead7e..25438b2b6f22 100644
+--- a/include/linux/mm_inline.h
++++ b/include/linux/mm_inline.h
+@@ -126,4 +126,10 @@ static __always_inline enum lru_list page_lru(struct page *page)
+ 
+ #define lru_to_page(head) (list_entry((head)->prev, struct page, lru))
+ 
++#ifdef arch_unmap_kpfn
++extern void arch_unmap_kpfn(unsigned long pfn);
++#else
++static __always_inline void arch_unmap_kpfn(unsigned long pfn) { }
++#endif
++
+ #endif
+diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
+index d67a8182e5eb..63df75ae70ee 100644
+--- a/include/linux/skbuff.h
++++ b/include/linux/skbuff.h
+@@ -885,7 +885,7 @@ void kfree_skb(struct sk_buff *skb);
+ void kfree_skb_list(struct sk_buff *segs);
+ void skb_tx_error(struct sk_buff *skb);
+ void consume_skb(struct sk_buff *skb);
+-void consume_stateless_skb(struct sk_buff *skb);
++void __consume_stateless_skb(struct sk_buff *skb);
+ void  __kfree_skb(struct sk_buff *skb);
+ extern struct kmem_cache *skbuff_head_cache;
+ 
+diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
+index 6fdcd2427776..fc59e0775e00 100644
+--- a/include/net/inet_frag.h
++++ b/include/net/inet_frag.h
+@@ -1,14 +1,9 @@
+ #ifndef __NET_FRAG_H__
+ #define __NET_FRAG_H__
+ 
+-#include <linux/percpu_counter.h>
+-
+ struct netns_frags {
+-      /* The percpu_counter "mem" need to be cacheline aligned.
+-       *  mem.count must not share cacheline with other writers
+-       */
+-      struct percpu_counter   mem ____cacheline_aligned_in_smp;
+-
++      /* Keep atomic mem on separate cachelines in structs that include it */
++      atomic_t                mem ____cacheline_aligned_in_smp;
+       /* sysctls */
+       int                     timeout;
+       int                     high_thresh;
+@@ -108,15 +103,10 @@ struct inet_frags {
+ int inet_frags_init(struct inet_frags *);
+ void inet_frags_fini(struct inet_frags *);
+ 
+-static inline int inet_frags_init_net(struct netns_frags *nf)
+-{
+-      return percpu_counter_init(&nf->mem, 0, GFP_KERNEL);
+-}
+-static inline void inet_frags_uninit_net(struct netns_frags *nf)
++static inline void inet_frags_init_net(struct netns_frags *nf)
+ {
+-      percpu_counter_destroy(&nf->mem);
++      atomic_set(&nf->mem, 0);
+ }
+-
+ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
+ 
+ void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
+@@ -140,31 +130,24 @@ static inline bool inet_frag_evicting(struct inet_frag_queue *q)
+ 
+ /* Memory Tracking Functions. */
+ 
+-/* The default percpu_counter batch size is not big enough to scale to
+- * fragmentation mem acct sizes.
+- * The mem size of a 64K fragment is approx:
+- *  (44 fragments * 2944 truesize) + frag_queue struct(200) = 129736 bytes
+- */
+-static unsigned int frag_percpu_counter_batch = 130000;
+-
+ static inline int frag_mem_limit(struct netns_frags *nf)
+ {
+-      return percpu_counter_read(&nf->mem);
++      return atomic_read(&nf->mem);
+ }
+ 
+ static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
+ {
+-      percpu_counter_add_batch(&nf->mem, -i, frag_percpu_counter_batch);
++      atomic_sub(i, &nf->mem);
+ }
+ 
+ static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
+ {
+-      percpu_counter_add_batch(&nf->mem, i, frag_percpu_counter_batch);
++      atomic_add(i, &nf->mem);
+ }
+ 
+-static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf)
++static inline int sum_frag_mem_limit(struct netns_frags *nf)
+ {
+-      return percpu_counter_sum_positive(&nf->mem);
++      return atomic_read(&nf->mem);
+ }
+ 
+ /* RFC 3168 support :
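
The inet_frag.h hunk above drops the batched percpu_counter in favour of a single
atomic_t, so frag_mem_limit() now reads an exact value at the cost of a shared
cacheline (hence the ____cacheline_aligned_in_smp annotation that is kept). A
minimal C11 sketch of the resulting add/sub/read accounting interface; the
frag_mem struct and helper names are illustrative, not the kernel's:

#include <assert.h>
#include <stdatomic.h>
#include <stdio.h>

/* Illustrative counterpart of netns_frags after the change: one shared atomic
 * counter instead of a batched per-CPU counter. */
struct frag_mem {
	atomic_int mem;
};

static int  frag_mem_read(struct frag_mem *nf)        { return atomic_load(&nf->mem); }
static void frag_mem_add(struct frag_mem *nf, int i)  { atomic_fetch_add(&nf->mem, i); }
static void frag_mem_sub(struct frag_mem *nf, int i)  { atomic_fetch_sub(&nf->mem, i); }

int main(void)
{
	struct frag_mem nf;

	atomic_init(&nf.mem, 0);
	frag_mem_add(&nf, 2944);	/* e.g. truesize of one queued fragment */
	frag_mem_add(&nf, 2944);
	frag_mem_sub(&nf, 2944);
	assert(frag_mem_read(&nf) == 2944);

	printf("accounted fragment memory: %d bytes\n", frag_mem_read(&nf));
	return 0;
}
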
+diff --git a/lib/idr.c b/lib/idr.c
+index b13682bb0a1c..20c2779e8d12 100644
+--- a/lib/idr.c
++++ b/lib/idr.c
+@@ -154,7 +154,7 @@ void *idr_replace(struct idr *idr, void *ptr, int id)
+       void __rcu **slot = NULL;
+       void *entry;
+ 
+-      if (WARN_ON_ONCE(id < 0))
++      if (id < 0)
+               return ERR_PTR(-EINVAL);
+       if (WARN_ON_ONCE(radix_tree_is_internal_node(ptr)))
+               return ERR_PTR(-EINVAL);
+diff --git a/mm/memory-failure.c b/mm/memory-failure.c
+index 1cd3b3569af8..88366626c0b7 100644
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -1146,6 +1146,8 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
+               return 0;
+       }
+ 
++      arch_unmap_kpfn(pfn);
++
+       orig_head = hpage = compound_head(p);
+       num_poisoned_pages_inc();
+ 
+diff --git a/net/core/skbuff.c b/net/core/skbuff.c
+index e07556606284..72eb23d2426f 100644
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -753,14 +753,11 @@ EXPORT_SYMBOL(consume_skb);
+  *    consume_stateless_skb - free an skbuff, assuming it is stateless
+  *    @skb: buffer to free
+  *
+- *    Works like consume_skb(), but this variant assumes that all the head
+- *    states have been already dropped.
++ *    Alike consume_skb(), but this variant assumes that this is the last
++ *    skb reference and all the head states have been already dropped
+  */
+-void consume_stateless_skb(struct sk_buff *skb)
++void __consume_stateless_skb(struct sk_buff *skb)
+ {
+-      if (!skb_unref(skb))
+-              return;
+-
+       trace_consume_skb(skb);
+       if (likely(skb->head))
+               skb_release_data(skb);
+diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
+index 30d875dff6b5..f85b08baff16 100644
+--- a/net/ieee802154/6lowpan/reassembly.c
++++ b/net/ieee802154/6lowpan/reassembly.c
+@@ -580,19 +580,14 @@ static int __net_init lowpan_frags_init_net(struct net *net)
+ {
+       struct netns_ieee802154_lowpan *ieee802154_lowpan =
+               net_ieee802154_lowpan(net);
+-      int res;
+ 
+       ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+       ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
+       ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
+ 
+-      res = inet_frags_init_net(&ieee802154_lowpan->frags);
+-      if (res)
+-              return res;
+-      res = lowpan_frags_ns_sysctl_register(net);
+-      if (res)
+-              inet_frags_uninit_net(&ieee802154_lowpan->frags);
+-      return res;
++      inet_frags_init_net(&ieee802154_lowpan->frags);
++
++      return lowpan_frags_ns_sysctl_register(net);
+ }
+ 
+ static void __net_exit lowpan_frags_exit_net(struct net *net)
+diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
+index 96e95e83cc61..af74d0433453 100644
+--- a/net/ipv4/inet_fragment.c
++++ b/net/ipv4/inet_fragment.c
+@@ -234,10 +234,8 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
+       cond_resched();
+ 
+       if (read_seqretry(&f->rnd_seqlock, seq) ||
+-          percpu_counter_sum(&nf->mem))
++          sum_frag_mem_limit(nf))
+               goto evict_again;
+-
+-      percpu_counter_destroy(&nf->mem);
+ }
+ EXPORT_SYMBOL(inet_frags_exit_net);
+ 
+diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
+index 9a8cfac503dc..46408c220d9d 100644
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -844,8 +844,6 @@ static void __init ip4_frags_ctl_register(void)
+ 
+ static int __net_init ipv4_frags_init_net(struct net *net)
+ {
+-      int res;
+-
+       /* Fragment cache limits.
+        *
+        * The fragment memory accounting code, (tries to) account for
+@@ -871,13 +869,9 @@ static int __net_init ipv4_frags_init_net(struct net *net)
+ 
+       net->ipv4.frags.max_dist = 64;
+ 
+-      res = inet_frags_init_net(&net->ipv4.frags);
+-      if (res)
+-              return res;
+-      res = ip4_frags_ns_ctl_register(net);
+-      if (res)
+-              inet_frags_uninit_net(&net->ipv4.frags);
+-      return res;
++      inet_frags_init_net(&net->ipv4.frags);
++
++      return ip4_frags_ns_ctl_register(net);
+ }
+ 
+ static void __net_exit ipv4_frags_exit_net(struct net *net)
+diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
+index 129d1a3616f8..e1856bfa753d 100644
+--- a/net/ipv4/ip_tunnel.c
++++ b/net/ipv4/ip_tunnel.c
+@@ -618,8 +618,8 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto)
+               ip_rt_put(rt);
+               goto tx_dropped;
+       }
+-      iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos,
+-                    key->ttl, df, !net_eq(tunnel->net, dev_net(dev)));
++      iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
++                    df, !net_eq(tunnel->net, dev_net(dev)));
+       return;
+ tx_error:
+       dev->stats.tx_errors++;
+diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
+index e9252c7df809..21022db7a2a6 100644
+--- a/net/ipv4/tcp_ipv4.c
++++ b/net/ipv4/tcp_ipv4.c
+@@ -1722,9 +1722,9 @@ int tcp_v4_rcv(struct sk_buff *skb)
+                */
+               sock_hold(sk);
+               refcounted = true;
+-              if (tcp_filter(sk, skb))
+-                      goto discard_and_relse;
+-              nsk = tcp_check_req(sk, skb, req, false);
++              nsk = NULL;
++              if (!tcp_filter(sk, skb))
++                      nsk = tcp_check_req(sk, skb, req, false);
+               if (!nsk) {
+                       reqsk_put(req);
+                       goto discard_and_relse;
+diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
+index 62344804baae..979e4d8526ba 100644
+--- a/net/ipv4/udp.c
++++ b/net/ipv4/udp.c
+@@ -1386,12 +1386,15 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
+               unlock_sock_fast(sk, slow);
+       }
+ 
++      if (!skb_unref(skb))
++              return;
++
+       /* In the more common cases we cleared the head states previously,
+        * see __udp_queue_rcv_skb().
+        */
+       if (unlikely(udp_skb_has_head_state(skb)))
+               skb_release_head_state(skb);
+-      consume_stateless_skb(skb);
++      __consume_stateless_skb(skb);
+ }
+ EXPORT_SYMBOL_GPL(skb_consume_udp);
+ 
+diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
+index e1c85bb4eac0..1792bbfd80e1 100644
+--- a/net/ipv6/ip6_fib.c
++++ b/net/ipv6/ip6_fib.c
+@@ -198,6 +198,12 @@ static void rt6_release(struct rt6_info *rt)
+       }
+ }
+ 
++static void fib6_free_table(struct fib6_table *table)
++{
++      inetpeer_invalidate_tree(&table->tb6_peers);
++      kfree(table);
++}
++
+ static void fib6_link_table(struct net *net, struct fib6_table *tb)
+ {
+       unsigned int h;
+@@ -1915,15 +1921,22 @@ static int __net_init fib6_net_init(struct net *net)
+ 
+ static void fib6_net_exit(struct net *net)
+ {
++      unsigned int i;
++
+       rt6_ifdown(net, NULL);
+       del_timer_sync(&net->ipv6.ip6_fib_timer);
+ 
+-#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+-      inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers);
+-      kfree(net->ipv6.fib6_local_tbl);
+-#endif
+-      inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers);
+-      kfree(net->ipv6.fib6_main_tbl);
++      for (i = 0; i < FIB6_TABLE_HASHSZ; i++) {
++              struct hlist_head *head = &net->ipv6.fib_table_hash[i];
++              struct hlist_node *tmp;
++              struct fib6_table *tb;
++
++              hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) {
++                      hlist_del(&tb->tb6_hlist);
++                      fib6_free_table(tb);
++              }
++      }
++
+       kfree(net->ipv6.fib_table_hash);
+       kfree(net->ipv6.rt6_stats);
+ }
+diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
+index 67ff2aaf5dcb..b7a72d409334 100644
+--- a/net/ipv6/ip6_gre.c
++++ b/net/ipv6/ip6_gre.c
+@@ -432,7 +432,9 @@ static void ip6gre_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+               }
+               break;
+       case ICMPV6_PKT_TOOBIG:
+-              mtu = be32_to_cpu(info) - offset;
++              mtu = be32_to_cpu(info) - offset - t->tun_hlen;
++              if (t->dev->type == ARPHRD_ETHER)
++                      mtu -= ETH_HLEN;
+               if (mtu < IPV6_MIN_MTU)
+                       mtu = IPV6_MIN_MTU;
+               t->dev->mtu = mtu;
+diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
+index 986d4ca38832..b263bf3a19f7 100644
+--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
+@@ -622,18 +622,12 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
+ 
+ static int nf_ct_net_init(struct net *net)
+ {
+-      int res;
+-
+       net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+       net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
+       net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
+-      res = inet_frags_init_net(&net->nf_frag.frags);
+-      if (res)
+-              return res;
+-      res = nf_ct_frag6_sysctl_register(net);
+-      if (res)
+-              inet_frags_uninit_net(&net->nf_frag.frags);
+-      return res;
++      inet_frags_init_net(&net->nf_frag.frags);
++
++      return nf_ct_frag6_sysctl_register(net);
+ }
+ 
+ static void nf_ct_net_exit(struct net *net)
+diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
+index e1da5b888cc4..846012eae526 100644
+--- a/net/ipv6/reassembly.c
++++ b/net/ipv6/reassembly.c
+@@ -714,19 +714,13 @@ static void ip6_frags_sysctl_unregister(void)
+ 
+ static int __net_init ipv6_frags_init_net(struct net *net)
+ {
+-      int res;
+-
+       net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+       net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
+       net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
+ 
+-      res = inet_frags_init_net(&net->ipv6.frags);
+-      if (res)
+-              return res;
+-      res = ip6_frags_ns_sysctl_register(net);
+-      if (res)
+-              inet_frags_uninit_net(&net->ipv6.frags);
+-      return res;
++      inet_frags_init_net(&net->ipv6.frags);
++
++      return ip6_frags_ns_sysctl_register(net);
+ }
+ 
+ static void __net_exit ipv6_frags_exit_net(struct net *net)
+diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
+index 206210125fd7..660b9b2a8a25 100644
+--- a/net/ipv6/tcp_ipv6.c
++++ b/net/ipv6/tcp_ipv6.c
+@@ -1456,9 +1456,9 @@ static int tcp_v6_rcv(struct sk_buff *skb)
+               }
+               sock_hold(sk);
+               refcounted = true;
+-              if (tcp_filter(sk, skb))
+-                      goto discard_and_relse;
+-              nsk = tcp_check_req(sk, skb, req, false);
++              nsk = NULL;
++              if (!tcp_filter(sk, skb))
++                      nsk = tcp_check_req(sk, skb, req, false);
+               if (!nsk) {
+                       reqsk_put(req);
+                       goto discard_and_relse;
+diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
+index 0225d62a869f..a71be33f3afe 100644
+--- a/net/sctp/ulpqueue.c
++++ b/net/sctp/ulpqueue.c
+@@ -265,7 +265,8 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event)
+               sctp_ulpq_clear_pd(ulpq);
+ 
+       if (queue == &sk->sk_receive_queue && !sp->data_ready_signalled) {
+-              sp->data_ready_signalled = 1;
++              if (!sock_owned_by_user(sk))
++                      sp->data_ready_signalled = 1;
+               sk->sk_data_ready(sk);
+       }
+       return 1;
