commit:     aa3aade4f155b96481a44b6733e806c8181271cc
Author:     Mike Pagano <mpagano <AT> gentoo <DOT> org>
AuthorDate: Sun May  1 17:02:58 2022 +0000
Commit:     Mike Pagano <mpagano <AT> gentoo <DOT> org>
CommitDate: Sun May  1 17:02:58 2022 +0000
URL:        https://gitweb.gentoo.org/proj/linux-patches.git/commit/?id=aa3aade4

Linux patch 5.15.37

Signed-off-by: Mike Pagano <mpagano <AT> gentoo.org>

 0000_README              |    4 +
 1036_linux-5.15.37.patch | 4223 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 4227 insertions(+)

diff --git a/0000_README b/0000_README
index 0f44e39b..cb4266b1 100644
--- a/0000_README
+++ b/0000_README
@@ -187,6 +187,10 @@ Patch:  1035_linux-5.15.36.patch
 From:   http://www.kernel.org
 Desc:   Linux 5.15.36
 
+Patch:  1036_linux-5.15.37.patch
+From:   http://www.kernel.org
+Desc:   Linux 5.15.37
+
 Patch:  1500_XATTR_USER_PREFIX.patch
 From:   https://bugs.gentoo.org/show_bug.cgi?id=470644
 Desc:   Support for namespace user.pax.* on tmpfs.

diff --git a/1036_linux-5.15.37.patch b/1036_linux-5.15.37.patch
new file mode 100644
index 00000000..b9d4c0ea
--- /dev/null
+++ b/1036_linux-5.15.37.patch
@@ -0,0 +1,4223 @@
+diff --git a/Makefile b/Makefile
+index e0710f9837847..50b1688a4ca2c 100644
+--- a/Makefile
++++ b/Makefile
+@@ -1,7 +1,7 @@
+ # SPDX-License-Identifier: GPL-2.0
+ VERSION = 5
+ PATCHLEVEL = 15
+-SUBLEVEL = 36
++SUBLEVEL = 37
+ EXTRAVERSION =
+ NAME = Trick or Treat
+ 
+diff --git a/arch/arm/boot/dts/socfpga.dtsi b/arch/arm/boot/dts/socfpga.dtsi
+index 0b021eef0b538..7c1d6423d7f8c 100644
+--- a/arch/arm/boot/dts/socfpga.dtsi
++++ b/arch/arm/boot/dts/socfpga.dtsi
+@@ -782,7 +782,7 @@
+               };
+ 
+               qspi: spi@ff705000 {
+-                      compatible = "cdns,qspi-nor";
++                      compatible = "intel,socfpga-qspi", "cdns,qspi-nor";
+                       #address-cells = <1>;
+                       #size-cells = <0>;
+                       reg = <0xff705000 0x1000>,
+diff --git a/arch/arm/boot/dts/socfpga_arria10.dtsi b/arch/arm/boot/dts/socfpga_arria10.dtsi
+index a574ea91d9d3f..3ba431dfa8c94 100644
+--- a/arch/arm/boot/dts/socfpga_arria10.dtsi
++++ b/arch/arm/boot/dts/socfpga_arria10.dtsi
+@@ -756,7 +756,7 @@
+               };
+ 
+               qspi: spi@ff809000 {
+-                      compatible = "cdns,qspi-nor";
++                      compatible = "intel,socfpga-qspi", "cdns,qspi-nor";
+                       #address-cells = <1>;
+                       #size-cells = <0>;
+                       reg = <0xff809000 0x100>,
+diff --git a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi
+index d301ac0d406bf..3ec301bd08a91 100644
+--- a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi
++++ b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi
+@@ -594,7 +594,7 @@
+               };
+ 
+               qspi: spi@ff8d2000 {
+-                      compatible = "cdns,qspi-nor";
++                      compatible =  "intel,socfpga-qspi", "cdns,qspi-nor";
+                       #address-cells = <1>;
+                       #size-cells = <0>;
+                       reg = <0xff8d2000 0x100>,
+diff --git a/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi b/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi
+index de1e98c99ec5b..f4270cf189962 100644
+--- a/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi
++++ b/arch/arm64/boot/dts/intel/socfpga_agilex.dtsi
+@@ -628,7 +628,7 @@
+               };
+ 
+               qspi: spi@ff8d2000 {
+-                      compatible = "cdns,qspi-nor";
++                      compatible = "intel,socfpga-qspi", "cdns,qspi-nor";
+                       #address-cells = <1>;
+                       #size-cells = <0>;
+                       reg = <0xff8d2000 0x100>,
+diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
+index d89cf802d9aa7..6568823cf3063 100644
+--- a/arch/powerpc/kernel/kvm.c
++++ b/arch/powerpc/kernel/kvm.c
+@@ -669,7 +669,8 @@ static void __init kvm_use_magic_page(void)
+       on_each_cpu(kvm_map_magic_page, &features, 1);
+ 
+       /* Quick self-test to see if the mapping works */
+-      if (fault_in_pages_readable((const char *)KVM_MAGIC_PAGE, sizeof(u32))) {
++      if (fault_in_readable((const char __user *)KVM_MAGIC_PAGE,
++                            sizeof(u32))) {
+               kvm_patching_worked = false;
+               return;
+       }
+diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c
+index f2da879264bcd..3e053e2fd6b69 100644
+--- a/arch/powerpc/kernel/signal_32.c
++++ b/arch/powerpc/kernel/signal_32.c
+@@ -1048,7 +1048,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
+       if (new_ctx == NULL)
+               return 0;
+       if (!access_ok(new_ctx, ctx_size) ||
+-          fault_in_pages_readable((u8 __user *)new_ctx, ctx_size))
++          fault_in_readable((char __user *)new_ctx, ctx_size))
+               return -EFAULT;
+ 
+       /*
+@@ -1239,7 +1239,7 @@ SYSCALL_DEFINE3(debug_setcontext, struct ucontext __user *, ctx,
+ #endif
+ 
+       if (!access_ok(ctx, sizeof(*ctx)) ||
+-          fault_in_pages_readable((u8 __user *)ctx, sizeof(*ctx)))
++          fault_in_readable((char __user *)ctx, sizeof(*ctx)))
+               return -EFAULT;
+ 
+       /*
+diff --git a/arch/powerpc/kernel/signal_64.c b/arch/powerpc/kernel/signal_64.c
+index bb9c077ac1322..d1e1fc0acbea3 100644
+--- a/arch/powerpc/kernel/signal_64.c
++++ b/arch/powerpc/kernel/signal_64.c
+@@ -688,7 +688,7 @@ SYSCALL_DEFINE3(swapcontext, struct ucontext __user *, old_ctx,
+       if (new_ctx == NULL)
+               return 0;
+       if (!access_ok(new_ctx, ctx_size) ||
+-          fault_in_pages_readable((u8 __user *)new_ctx, ctx_size))
++          fault_in_readable((char __user *)new_ctx, ctx_size))
+               return -EFAULT;
+ 
+       /*
+diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
+index 831b25c5e7058..7f71bd4dcd0d6 100644
+--- a/arch/x86/kernel/fpu/signal.c
++++ b/arch/x86/kernel/fpu/signal.c
+@@ -205,7 +205,7 @@ retry:
+       fpregs_unlock();
+ 
+       if (ret) {
+-              if (!fault_in_pages_writeable(buf_fx, fpu_user_xstate_size))
++              if (!fault_in_writeable(buf_fx, fpu_user_xstate_size))
+                       goto retry;
+               return -EFAULT;
+       }
+@@ -278,10 +278,9 @@ retry:
+               if (ret != -EFAULT)
+                       return -EINVAL;
+ 
+-              ret = fault_in_pages_readable(buf, size);
+-              if (!ret)
++              if (!fault_in_readable(buf, size))
+                       goto retry;
+-              return ret;
++              return -EFAULT;
+       }
+ 
+       /*
+diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
+index ab3e37aa1830c..f93cb989241cc 100644
+--- a/drivers/block/Kconfig
++++ b/drivers/block/Kconfig
+@@ -33,6 +33,22 @@ config BLK_DEV_FD
+         To compile this driver as a module, choose M here: the
+         module will be called floppy.
+ 
++config BLK_DEV_FD_RAWCMD
++      bool "Support for raw floppy disk commands (DEPRECATED)"
++      depends on BLK_DEV_FD
++      help
++        If you want to use actual physical floppies and expect to do
++        special low-level hardware accesses to them (access and use
++        non-standard formats, for example), then enable this.
++
++        Note that the code enabled by this option is rarely used and
++        might be unstable or insecure, and distros should not enable it.
++
++        Note: FDRAWCMD is deprecated and will be removed from the kernel
++        in the near future.
++
++        If unsure, say N.
++
+ config AMIGA_FLOPPY
+       tristate "Amiga floppy support"
+       depends on AMIGA
+diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
+index 0f58594c5a4d6..1c152b542a52d 100644
+--- a/drivers/block/floppy.c
++++ b/drivers/block/floppy.c
+@@ -2984,6 +2984,8 @@ static const char *drive_name(int type, int drive)
+               return "(null)";
+ }
+ 
++#ifdef CONFIG_BLK_DEV_FD_RAWCMD
++
+ /* raw commands */
+ static void raw_cmd_done(int flag)
+ {
+@@ -3183,6 +3185,35 @@ static int raw_cmd_ioctl(int cmd, void __user *param)
+       return ret;
+ }
+ 
++static int floppy_raw_cmd_ioctl(int type, int drive, int cmd,
++                              void __user *param)
++{
++      int ret;
++
++      pr_warn_once("Note: FDRAWCMD is deprecated and will be removed from the 
kernel in the near future.\n");
++
++      if (type)
++              return -EINVAL;
++      if (lock_fdc(drive))
++              return -EINTR;
++      set_floppy(drive);
++      ret = raw_cmd_ioctl(cmd, param);
++      if (ret == -EINTR)
++              return -EINTR;
++      process_fd_request();
++      return ret;
++}
++
++#else /* CONFIG_BLK_DEV_FD_RAWCMD */
++
++static int floppy_raw_cmd_ioctl(int type, int drive, int cmd,
++                              void __user *param)
++{
++      return -EOPNOTSUPP;
++}
++
++#endif
++
+ static int invalidate_drive(struct block_device *bdev)
+ {
+       /* invalidate the buffer track to force a reread */
+@@ -3371,7 +3402,6 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
+ {
+       int drive = (long)bdev->bd_disk->private_data;
+       int type = ITYPE(drive_state[drive].fd_device);
+-      int i;
+       int ret;
+       int size;
+       union inparam {
+@@ -3522,16 +3552,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
+               outparam = &write_errors[drive];
+               break;
+       case FDRAWCMD:
+-              if (type)
+-                      return -EINVAL;
+-              if (lock_fdc(drive))
+-                      return -EINTR;
+-              set_floppy(drive);
+-              i = raw_cmd_ioctl(cmd, (void __user *)param);
+-              if (i == -EINTR)
+-                      return -EINTR;
+-              process_fd_request();
+-              return i;
++              return floppy_raw_cmd_ioctl(type, drive, cmd, (void __user *)param);
+       case FDTWADDLE:
+               if (lock_fdc(drive))
+                       return -EINTR;
+diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c
+index 21909642ee4ca..8fbb25913327c 100644
+--- a/drivers/gpu/drm/armada/armada_gem.c
++++ b/drivers/gpu/drm/armada/armada_gem.c
+@@ -336,7 +336,7 @@ int armada_gem_pwrite_ioctl(struct drm_device *dev, void *data,
+       struct drm_armada_gem_pwrite *args = data;
+       struct armada_gem_object *dobj;
+       char __user *ptr;
+-      int ret;
++      int ret = 0;
+ 
+       DRM_DEBUG_DRIVER("handle %u off %u size %u ptr 0x%llx\n",
+               args->handle, args->offset, args->size, args->ptr);
+@@ -349,9 +349,8 @@ int armada_gem_pwrite_ioctl(struct drm_device *dev, void *data,
+       if (!access_ok(ptr, args->size))
+               return -EFAULT;
+ 
+-      ret = fault_in_pages_readable(ptr, args->size);
+-      if (ret)
+-              return ret;
++      if (fault_in_readable(ptr, args->size))
++              return -EFAULT;
+ 
+       dobj = armada_gem_object_lookup(file, args->handle);
+       if (dobj == NULL)
+diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c
+index 75680eecd2f7d..2714ba02b176b 100644
+--- a/drivers/spi/spi-cadence-quadspi.c
++++ b/drivers/spi/spi-cadence-quadspi.c
+@@ -36,6 +36,7 @@
+ /* Quirks */
+ #define CQSPI_NEEDS_WR_DELAY          BIT(0)
+ #define CQSPI_DISABLE_DAC_MODE                BIT(1)
++#define CQSPI_NO_SUPPORT_WR_COMPLETION        BIT(3)
+ 
+ /* Capabilities */
+ #define CQSPI_SUPPORTS_OCTAL          BIT(0)
+@@ -83,6 +84,7 @@ struct cqspi_st {
+       u32                     wr_delay;
+       bool                    use_direct_mode;
+       struct cqspi_flash_pdata f_pdata[CQSPI_MAX_CHIPSELECT];
++      bool                    wr_completion;
+ };
+ 
+ struct cqspi_driver_platdata {
+@@ -797,9 +799,11 @@ static int cqspi_write_setup(struct cqspi_flash_pdata *f_pdata,
+        * polling on the controller's side. spinand and spi-nor will take
+        * care of polling the status register.
+        */
+-      reg = readl(reg_base + CQSPI_REG_WR_COMPLETION_CTRL);
+-      reg |= CQSPI_REG_WR_DISABLE_AUTO_POLL;
+-      writel(reg, reg_base + CQSPI_REG_WR_COMPLETION_CTRL);
++      if (cqspi->wr_completion) {
++              reg = readl(reg_base + CQSPI_REG_WR_COMPLETION_CTRL);
++              reg |= CQSPI_REG_WR_DISABLE_AUTO_POLL;
++              writel(reg, reg_base + CQSPI_REG_WR_COMPLETION_CTRL);
++      }
+ 
+       reg = readl(reg_base + CQSPI_REG_SIZE);
+       reg &= ~CQSPI_REG_SIZE_ADDRESS_MASK;
+@@ -1532,6 +1536,10 @@ static int cqspi_probe(struct platform_device *pdev)
+ 
+       cqspi->master_ref_clk_hz = clk_get_rate(cqspi->clk);
+       master->max_speed_hz = cqspi->master_ref_clk_hz;
++
++      /* write completion is supported by default */
++      cqspi->wr_completion = true;
++
+       ddata  = of_device_get_match_data(dev);
+       if (ddata) {
+               if (ddata->quirks & CQSPI_NEEDS_WR_DELAY)
+@@ -1541,6 +1549,8 @@ static int cqspi_probe(struct platform_device *pdev)
+                       master->mode_bits |= SPI_RX_OCTAL | SPI_TX_OCTAL;
+               if (!(ddata->quirks & CQSPI_DISABLE_DAC_MODE))
+                       cqspi->use_direct_mode = true;
++              if (ddata->quirks & CQSPI_NO_SUPPORT_WR_COMPLETION)
++                      cqspi->wr_completion = false;
+       }
+ 
+       ret = devm_request_irq(dev, irq, cqspi_irq_handler, 0,
+@@ -1649,6 +1659,10 @@ static const struct cqspi_driver_platdata intel_lgm_qspi = {
+       .quirks = CQSPI_DISABLE_DAC_MODE,
+ };
+ 
++static const struct cqspi_driver_platdata socfpga_qspi = {
++      .quirks = CQSPI_NO_SUPPORT_WR_COMPLETION,
++};
++
+ static const struct of_device_id cqspi_dt_ids[] = {
+       {
+               .compatible = "cdns,qspi-nor",
+@@ -1666,6 +1680,10 @@ static const struct of_device_id cqspi_dt_ids[] = {
+               .compatible = "intel,lgm-qspi",
+               .data = &intel_lgm_qspi,
+       },
++      {
++              .compatible = "intel,socfpga-qspi",
++              .data = (void *)&socfpga_qspi,
++      },
+       { /* end of table */ }
+ };
+ 
+diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
+index dc1e4d1b72914..ff578c934bbcf 100644
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -1709,7 +1709,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
+                * Fault pages before locking them in prepare_pages
+                * to avoid recursive lock
+                */
+-              if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
++              if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
+                       ret = -EFAULT;
+                       break;
+               }
+@@ -1903,16 +1903,17 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
+ 
+ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
+ {
++      const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC);
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file_inode(file);
+       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+       loff_t pos;
+       ssize_t written = 0;
+       ssize_t written_buffered;
++      size_t prev_left = 0;
+       loff_t endbyte;
+       ssize_t err;
+       unsigned int ilock_flags = 0;
+-      struct iomap_dio *dio = NULL;
+ 
+       if (iocb->ki_flags & IOCB_NOWAIT)
+               ilock_flags |= BTRFS_ILOCK_TRY;
+@@ -1955,23 +1956,80 @@ relock:
+               goto buffered;
+       }
+ 
+-      dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
+-                           0);
++      /*
++       * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw()
++       * calls generic_write_sync() (through iomap_dio_complete()), because
++       * that results in calling fsync (btrfs_sync_file()) which will try to
++       * lock the inode in exclusive/write mode.
++       */
++      if (is_sync_write)
++              iocb->ki_flags &= ~IOCB_DSYNC;
+ 
+-      btrfs_inode_unlock(inode, ilock_flags);
++      /*
++       * The iov_iter can be mapped to the same file range we are writing to.
++       * If that's the case, then we will deadlock in the iomap code, because
++       * it first calls our callback btrfs_dio_iomap_begin(), which will create
++       * an ordered extent, and after that it will fault in the pages that the
++       * iov_iter refers to. During the fault in we end up in the readahead
++       * pages code (starting at btrfs_readahead()), which will lock the range,
++       * find that ordered extent and then wait for it to complete (at
++       * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
++       * obviously the ordered extent can never complete as we didn't submit
++       * yet the respective bio(s). This always happens when the buffer is
++       * memory mapped to the same file range, since the iomap DIO code always
++       * invalidates pages in the target file range (after starting and waiting
++       * for any writeback).
++       *
++       * So here we disable page faults in the iov_iter and then retry if we
++       * got -EFAULT, faulting in the pages before the retry.
++       */
++again:
++      from->nofault = true;
++      err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
++                         IOMAP_DIO_PARTIAL, written);
++      from->nofault = false;
+ 
+-      if (IS_ERR_OR_NULL(dio)) {
+-              err = PTR_ERR_OR_ZERO(dio);
+-              if (err < 0 && err != -ENOTBLK)
+-                      goto out;
+-      } else {
+-              written = iomap_dio_complete(dio);
++      /* No increment (+=) because iomap returns a cumulative value. */
++      if (err > 0)
++              written = err;
++
++      if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
++              const size_t left = iov_iter_count(from);
++              /*
++               * We have more data left to write. Try to fault in as many as
++               * possible of the remainder pages and retry. We do this without
++               * releasing and locking again the inode, to prevent races with
++               * truncate.
++               *
++               * Also, in case the iov refers to pages in the file range of the
++               * file we want to write to (due to a mmap), we could enter an
++               * infinite loop if we retry after faulting the pages in, since
++               * iomap will invalidate any pages in the range early on, before
++               * it tries to fault in the pages of the iov. So we keep track of
++               * how much was left of iov in the previous EFAULT and fallback
++               * to buffered IO in case we haven't made any progress.
++               */
++              if (left == prev_left) {
++                      err = -ENOTBLK;
++              } else {
++                      fault_in_iov_iter_readable(from, left);
++                      prev_left = left;
++                      goto again;
++              }
+       }
+ 
+-      if (written < 0 || !iov_iter_count(from)) {
+-              err = written;
++      btrfs_inode_unlock(inode, ilock_flags);
++
++      /*
++       * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do
++       * the fsync (call generic_write_sync()).
++       */
++      if (is_sync_write)
++              iocb->ki_flags |= IOCB_DSYNC;
++
++      /* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */
++      if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
+               goto out;
+-      }
+ 
+ buffered:
+       pos = iocb->ki_pos;
+@@ -1996,7 +2054,7 @@ buffered:
+       invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
+                                endbyte >> PAGE_SHIFT);
+ out:
+-      return written ? written : err;
++      return err < 0 ? err : written;
+ }
+ 
+ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
+@@ -3659,6 +3717,8 @@ static int check_direct_read(struct btrfs_fs_info *fs_info,
+ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
+ {
+       struct inode *inode = file_inode(iocb->ki_filp);
++      size_t prev_left = 0;
++      ssize_t read = 0;
+       ssize_t ret;
+ 
+       if (fsverity_active(inode))
+@@ -3668,9 +3728,57 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
+               return 0;
+ 
+       btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
+-      ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0);
++again:
++      /*
++       * This is similar to what we do for direct IO writes, see the comment
++       * at btrfs_direct_write(), but we also disable page faults in addition
++       * to disabling them only at the iov_iter level. This is because when
++       * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
++       * which can still trigger page fault ins despite having set ->nofault
++       * to true of our 'to' iov_iter.
++       *
++       * The difference to direct IO writes is that we deadlock when trying
++       * to lock the extent range in the inode's tree during he page reads
++       * triggered by the fault in (while for writes it is due to waiting for
++       * our own ordered extent). This is because for direct IO reads,
++       * btrfs_dio_iomap_begin() returns with the extent range locked, which
++       * is only unlocked in the endio callback (end_bio_extent_readpage()).
++       */
++      pagefault_disable();
++      to->nofault = true;
++      ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
++                         IOMAP_DIO_PARTIAL, read);
++      to->nofault = false;
++      pagefault_enable();
++
++      /* No increment (+=) because iomap returns a cumulative value. */
++      if (ret > 0)
++              read = ret;
++
++      if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
++              const size_t left = iov_iter_count(to);
++
++              if (left == prev_left) {
++                      /*
++                       * We didn't make any progress since the last attempt,
++                       * fallback to a buffered read for the remainder of the
++                       * range. This is just to avoid any possibility of looping
++                       * for too long.
++                       */
++                      ret = read;
++              } else {
++                      /*
++                       * We made some progress since the last retry or this is
++                       * the first time we are retrying. Fault in as many pages
++                       * as possible and retry.
++                       */
++                      fault_in_iov_iter_writeable(to, left);
++                      prev_left = left;
++                      goto again;
++              }
++      }
+       btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+-      return ret;
++      return ret < 0 ? ret : read;
+ }
+ 
+ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
+index 6266a706bff7d..044d584c3467c 100644
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -7961,6 +7961,34 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
+       }
+ 
+       len = min(len, em->len - (start - em->start));
++
++      /*
++       * If we have a NOWAIT request and the range contains multiple extents
++       * (or a mix of extents and holes), then we return -EAGAIN to make the
++       * caller fallback to a context where it can do a blocking (without
++       * NOWAIT) request. This way we avoid doing partial IO and returning
++       * success to the caller, which is not optimal for writes and for reads
++       * it can result in unexpected behaviour for an application.
++       *
++       * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
++       * iomap_dio_rw(), we can end up returning less data then what the caller
++       * asked for, resulting in an unexpected, and incorrect, short read.
++       * That is, the caller asked to read N bytes and we return less than that,
++       * which is wrong unless we are crossing EOF. This happens if we get a
++       * page fault error when trying to fault in pages for the buffer that is
++       * associated to the struct iov_iter passed to iomap_dio_rw(), and we
++       * have previously submitted bios for other extents in the range, in
++       * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
++       * those bios have completed by the time we get the page fault error,
++       * which we return back to our caller - we should only return EIOCBQUEUED
++       * after we have submitted bios for all the extents in the range.
++       */
++      if ((flags & IOMAP_NOWAIT) && len < length) {
++              free_extent_map(em);
++              ret = -EAGAIN;
++              goto unlock_err;
++      }
++
+       if (write) {
+               ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
+                                                   start, len);
+diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
+index 6a863b3f6de03..bf53af8694f8e 100644
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -2258,9 +2258,8 @@ static noinline int search_ioctl(struct inode *inode,
+       key.offset = sk->min_offset;
+ 
+       while (1) {
+-              ret = fault_in_pages_writeable(ubuf + sk_offset,
+-                                             *buf_size - sk_offset);
+-              if (ret)
++              ret = -EFAULT;
++              if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset))
+                       break;
+ 
+               ret = btrfs_search_forward(root, &key, path, sk->min_transid);
+diff --git a/fs/erofs/data.c b/fs/erofs/data.c
+index 9db8297156527..16a41d0db55a3 100644
+--- a/fs/erofs/data.c
++++ b/fs/erofs/data.c
+@@ -287,7 +287,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+ 
+               if (!err)
+                       return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
+-                                          NULL, 0);
++                                          NULL, 0, 0);
+               if (err < 0)
+                       return err;
+       }
+diff --git a/fs/ext4/file.c b/fs/ext4/file.c
+index ac0e11bbb4450..b25c1f8f7c4f1 100644
+--- a/fs/ext4/file.c
++++ b/fs/ext4/file.c
+@@ -74,7 +74,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
+               return generic_file_read_iter(iocb, to);
+       }
+ 
+-      ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0);
++      ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, 0);
+       inode_unlock_shared(inode);
+ 
+       file_accessed(iocb->ki_filp);
+@@ -566,7 +566,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
+       if (ilock_shared)
+               iomap_ops = &ext4_iomap_overwrite_ops;
+       ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
+-                         (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0);
++                         (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
++                         0);
+       if (ret == -ENOTBLK)
+               ret = 0;
+ 
+diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
+index 0e14dc41ed4e6..8ef92719c6799 100644
+--- a/fs/f2fs/file.c
++++ b/fs/f2fs/file.c
+@@ -4279,7 +4279,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+               size_t target_size = 0;
+               int err;
+ 
+-              if (iov_iter_fault_in_readable(from, iov_iter_count(from)))
++              if (fault_in_iov_iter_readable(from, iov_iter_count(from)))
+                       set_inode_flag(inode, FI_NO_PREALLOC);
+ 
+               if ((iocb->ki_flags & IOCB_NOWAIT)) {
+diff --git a/fs/fuse/file.c b/fs/fuse/file.c
+index bc50a9fa84a0c..71e9e301e569d 100644
+--- a/fs/fuse/file.c
++++ b/fs/fuse/file.c
+@@ -1164,7 +1164,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
+ 
+  again:
+               err = -EFAULT;
+-              if (iov_iter_fault_in_readable(ii, bytes))
++              if (fault_in_iov_iter_readable(ii, bytes))
+                       break;
+ 
+               err = -ENOMEM;
+diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
+index bb9014ced702a..fbdb7a30470a3 100644
+--- a/fs/gfs2/bmap.c
++++ b/fs/gfs2/bmap.c
+@@ -961,46 +961,6 @@ hole_found:
+       goto out;
+ }
+ 
+-static int gfs2_write_lock(struct inode *inode)
+-{
+-      struct gfs2_inode *ip = GFS2_I(inode);
+-      struct gfs2_sbd *sdp = GFS2_SB(inode);
+-      int error;
+-
+-      gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+-      error = gfs2_glock_nq(&ip->i_gh);
+-      if (error)
+-              goto out_uninit;
+-      if (&ip->i_inode == sdp->sd_rindex) {
+-              struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+-
+-              error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
+-                                         GL_NOCACHE, &m_ip->i_gh);
+-              if (error)
+-                      goto out_unlock;
+-      }
+-      return 0;
+-
+-out_unlock:
+-      gfs2_glock_dq(&ip->i_gh);
+-out_uninit:
+-      gfs2_holder_uninit(&ip->i_gh);
+-      return error;
+-}
+-
+-static void gfs2_write_unlock(struct inode *inode)
+-{
+-      struct gfs2_inode *ip = GFS2_I(inode);
+-      struct gfs2_sbd *sdp = GFS2_SB(inode);
+-
+-      if (&ip->i_inode == sdp->sd_rindex) {
+-              struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+-
+-              gfs2_glock_dq_uninit(&m_ip->i_gh);
+-      }
+-      gfs2_glock_dq_uninit(&ip->i_gh);
+-}
+-
+ static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
+                                  unsigned len)
+ {
+@@ -1118,11 +1078,6 @@ out_qunlock:
+       return ret;
+ }
+ 
+-static inline bool gfs2_iomap_need_write_lock(unsigned flags)
+-{
+-      return (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT);
+-}
+-
+ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+                           unsigned flags, struct iomap *iomap,
+                           struct iomap *srcmap)
+@@ -1135,12 +1090,6 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+               iomap->flags |= IOMAP_F_BUFFER_HEAD;
+ 
+       trace_gfs2_iomap_start(ip, pos, length, flags);
+-      if (gfs2_iomap_need_write_lock(flags)) {
+-              ret = gfs2_write_lock(inode);
+-              if (ret)
+-                      goto out;
+-      }
+-
+       ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
+       if (ret)
+               goto out_unlock;
+@@ -1168,10 +1117,7 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
+       ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
+ 
+ out_unlock:
+-      if (ret && gfs2_iomap_need_write_lock(flags))
+-              gfs2_write_unlock(inode);
+       release_metapath(&mp);
+-out:
+       trace_gfs2_iomap_end(ip, iomap, ret);
+       return ret;
+ }
+@@ -1219,15 +1165,11 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+       }
+ 
+       if (unlikely(!written))
+-              goto out_unlock;
++              return 0;
+ 
+       if (iomap->flags & IOMAP_F_SIZE_CHANGED)
+               mark_inode_dirty(inode);
+       set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
+-
+-out_unlock:
+-      if (gfs2_iomap_need_write_lock(flags))
+-              gfs2_write_unlock(inode);
+       return 0;
+ }
+ 
+diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
+index 1c8b747072cba..247b8d95b5ef4 100644
+--- a/fs/gfs2/file.c
++++ b/fs/gfs2/file.c
+@@ -777,27 +777,99 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
+       return ret ? ret : ret1;
+ }
+ 
++static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i,
++                                       size_t *prev_count,
++                                       size_t *window_size)
++{
++      char __user *p = i->iov[0].iov_base + i->iov_offset;
++      size_t count = iov_iter_count(i);
++      int pages = 1;
++
++      if (likely(!count))
++              return false;
++      if (ret <= 0 && ret != -EFAULT)
++              return false;
++      if (!iter_is_iovec(i))
++              return false;
++
++      if (*prev_count != count || !*window_size) {
++              int pages, nr_dirtied;
++
++              pages = min_t(int, BIO_MAX_VECS,
++                            DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE));
++              nr_dirtied = max(current->nr_dirtied_pause -
++                               current->nr_dirtied, 1);
++              pages = min(pages, nr_dirtied);
++      }
++
++      *prev_count = count;
++      *window_size = (size_t)PAGE_SIZE * pages - offset_in_page(p);
++      return true;
++}
++
+ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to,
+                                    struct gfs2_holder *gh)
+ {
+       struct file *file = iocb->ki_filp;
+       struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
+-      size_t count = iov_iter_count(to);
++      size_t prev_count = 0, window_size = 0;
++      size_t written = 0;
+       ssize_t ret;
+ 
+-      if (!count)
++      /*
++       * In this function, we disable page faults when we're holding the
++       * inode glock while doing I/O.  If a page fault occurs, we indicate
++       * that the inode glock may be dropped, fault in the pages manually,
++       * and retry.
++       *
++       * Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger
++       * physical as well as manual page faults, and we need to disable both
++       * kinds.
++       *
++       * For direct I/O, gfs2 takes the inode glock in deferred mode.  This
++       * locking mode is compatible with other deferred holders, so multiple
++       * processes and nodes can do direct I/O to a file at the same time.
++       * There's no guarantee that reads or writes will be atomic.  Any
++       * coordination among readers and writers needs to happen externally.
++       */
++
++      if (!iov_iter_count(to))
+               return 0; /* skip atime */
+ 
+       gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
++retry:
+       ret = gfs2_glock_nq(gh);
+       if (ret)
+               goto out_uninit;
++retry_under_glock:
++      pagefault_disable();
++      to->nofault = true;
++      ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
++                         IOMAP_DIO_PARTIAL, written);
++      to->nofault = false;
++      pagefault_enable();
++      if (ret > 0)
++              written = ret;
++
++      if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
++              size_t leftover;
+ 
+-      ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0);
+-      gfs2_glock_dq(gh);
++              gfs2_holder_allow_demote(gh);
++              leftover = fault_in_iov_iter_writeable(to, window_size);
++              gfs2_holder_disallow_demote(gh);
++              if (leftover != window_size) {
++                      if (!gfs2_holder_queued(gh))
++                              goto retry;
++                      goto retry_under_glock;
++              }
++      }
++      if (gfs2_holder_queued(gh))
++              gfs2_glock_dq(gh);
+ out_uninit:
+       gfs2_holder_uninit(gh);
+-      return ret;
++      if (ret < 0)
++              return ret;
++      return written;
+ }
+ 
+ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
+@@ -806,10 +878,20 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
+       struct file *file = iocb->ki_filp;
+       struct inode *inode = file->f_mapping->host;
+       struct gfs2_inode *ip = GFS2_I(inode);
+-      size_t len = iov_iter_count(from);
+-      loff_t offset = iocb->ki_pos;
++      size_t prev_count = 0, window_size = 0;
++      size_t read = 0;
+       ssize_t ret;
+ 
++      /*
++       * In this function, we disable page faults when we're holding the
++       * inode glock while doing I/O.  If a page fault occurs, we indicate
++       * that the inode glock may be dropped, fault in the pages manually,
++       * and retry.
++       *
++       * For writes, iomap_dio_rw only triggers manual page faults, so we
++       * don't need to disable physical ones.
++       */
++
+       /*
+        * Deferred lock, even if its a write, since we do no allocation on
+        * this path. All we need to change is the atime, and this lock mode
+@@ -819,31 +901,62 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from,
+        * VFS does.
+        */
+       gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh);
++retry:
+       ret = gfs2_glock_nq(gh);
+       if (ret)
+               goto out_uninit;
+-
++retry_under_glock:
+       /* Silently fall back to buffered I/O when writing beyond EOF */
+-      if (offset + len > i_size_read(&ip->i_inode))
++      if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode))
+               goto out;
+ 
+-      ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0);
++      from->nofault = true;
++      ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
++                         IOMAP_DIO_PARTIAL, read);
++      from->nofault = false;
++
+       if (ret == -ENOTBLK)
+               ret = 0;
++      if (ret > 0)
++              read = ret;
++
++      if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
++              size_t leftover;
++
++              gfs2_holder_allow_demote(gh);
++              leftover = fault_in_iov_iter_readable(from, window_size);
++              gfs2_holder_disallow_demote(gh);
++              if (leftover != window_size) {
++                      if (!gfs2_holder_queued(gh))
++                              goto retry;
++                      goto retry_under_glock;
++              }
++      }
+ out:
+-      gfs2_glock_dq(gh);
++      if (gfs2_holder_queued(gh))
++              gfs2_glock_dq(gh);
+ out_uninit:
+       gfs2_holder_uninit(gh);
+-      return ret;
++      if (ret < 0)
++              return ret;
++      return read;
+ }
+ 
+ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+ {
+       struct gfs2_inode *ip;
+       struct gfs2_holder gh;
++      size_t prev_count = 0, window_size = 0;
+       size_t written = 0;
+       ssize_t ret;
+ 
++      /*
++       * In this function, we disable page faults when we're holding the
++       * inode glock while doing I/O.  If a page fault occurs, we indicate
++       * that the inode glock may be dropped, fault in the pages manually,
++       * and retry.
++       */
++
+       if (iocb->ki_flags & IOCB_DIRECT) {
+               ret = gfs2_file_direct_read(iocb, to, &gh);
+               if (likely(ret != -ENOTBLK))
+@@ -865,18 +978,118 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+       }
+       ip = GFS2_I(iocb->ki_filp->f_mapping->host);
+       gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
++retry:
+       ret = gfs2_glock_nq(&gh);
+       if (ret)
+               goto out_uninit;
++retry_under_glock:
++      pagefault_disable();
+       ret = generic_file_read_iter(iocb, to);
++      pagefault_enable();
+       if (ret > 0)
+               written += ret;
+-      gfs2_glock_dq(&gh);
++
++      if (should_fault_in_pages(ret, to, &prev_count, &window_size)) {
++              size_t leftover;
++
++              gfs2_holder_allow_demote(&gh);
++              leftover = fault_in_iov_iter_writeable(to, window_size);
++              gfs2_holder_disallow_demote(&gh);
++              if (leftover != window_size) {
++                      if (!gfs2_holder_queued(&gh)) {
++                              if (written)
++                                      goto out_uninit;
++                              goto retry;
++                      }
++                      goto retry_under_glock;
++              }
++      }
++      if (gfs2_holder_queued(&gh))
++              gfs2_glock_dq(&gh);
+ out_uninit:
+       gfs2_holder_uninit(&gh);
+       return written ? written : ret;
+ }
+ 
++static ssize_t gfs2_file_buffered_write(struct kiocb *iocb,
++                                      struct iov_iter *from,
++                                      struct gfs2_holder *gh)
++{
++      struct file *file = iocb->ki_filp;
++      struct inode *inode = file_inode(file);
++      struct gfs2_inode *ip = GFS2_I(inode);
++      struct gfs2_sbd *sdp = GFS2_SB(inode);
++      struct gfs2_holder *statfs_gh = NULL;
++      size_t prev_count = 0, window_size = 0;
++      size_t read = 0;
++      ssize_t ret;
++
++      /*
++       * In this function, we disable page faults when we're holding the
++       * inode glock while doing I/O.  If a page fault occurs, we indicate
++       * that the inode glock may be dropped, fault in the pages manually,
++       * and retry.
++       */
++
++      if (inode == sdp->sd_rindex) {
++              statfs_gh = kmalloc(sizeof(*statfs_gh), GFP_NOFS);
++              if (!statfs_gh)
++                      return -ENOMEM;
++      }
++
++      gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh);
++retry:
++      ret = gfs2_glock_nq(gh);
++      if (ret)
++              goto out_uninit;
++retry_under_glock:
++      if (inode == sdp->sd_rindex) {
++              struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
++
++              ret = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
++                                       GL_NOCACHE, statfs_gh);
++              if (ret)
++                      goto out_unlock;
++      }
++
++      current->backing_dev_info = inode_to_bdi(inode);
++      pagefault_disable();
++      ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
++      pagefault_enable();
++      current->backing_dev_info = NULL;
++      if (ret > 0) {
++              iocb->ki_pos += ret;
++              read += ret;
++      }
++
++      if (inode == sdp->sd_rindex)
++              gfs2_glock_dq_uninit(statfs_gh);
++
++      if (should_fault_in_pages(ret, from, &prev_count, &window_size)) {
++              size_t leftover;
++
++              gfs2_holder_allow_demote(gh);
++              leftover = fault_in_iov_iter_readable(from, window_size);
++              gfs2_holder_disallow_demote(gh);
++              if (leftover != window_size) {
++                      if (!gfs2_holder_queued(gh)) {
++                              if (read)
++                                      goto out_uninit;
++                              goto retry;
++                      }
++                      goto retry_under_glock;
++              }
++      }
++out_unlock:
++      if (gfs2_holder_queued(gh))
++              gfs2_glock_dq(gh);
++out_uninit:
++      gfs2_holder_uninit(gh);
++      if (statfs_gh)
++              kfree(statfs_gh);
++      return read ? read : ret;
++}
++
+ /**
+  * gfs2_file_write_iter - Perform a write to a file
+  * @iocb: The io context
+@@ -928,9 +1141,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+                       goto out_unlock;
+ 
+               iocb->ki_flags |= IOCB_DSYNC;
+-              current->backing_dev_info = inode_to_bdi(inode);
+-              buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+-              current->backing_dev_info = NULL;
++              buffered = gfs2_file_buffered_write(iocb, from, &gh);
+               if (unlikely(buffered <= 0)) {
+                       if (!ret)
+                               ret = buffered;
+@@ -944,7 +1155,6 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+                * the direct I/O range as we don't know if the buffered pages
+                * made it to disk.
+                */
+-              iocb->ki_pos += buffered;
+               ret2 = generic_write_sync(iocb, buffered);
+               invalidate_mapping_pages(mapping,
+                               (iocb->ki_pos - buffered) >> PAGE_SHIFT,
+@@ -952,13 +1162,9 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+               if (!ret || ret2 > 0)
+                       ret += ret2;
+       } else {
+-              current->backing_dev_info = inode_to_bdi(inode);
+-              ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+-              current->backing_dev_info = NULL;
+-              if (likely(ret > 0)) {
+-                      iocb->ki_pos += ret;
++              ret = gfs2_file_buffered_write(iocb, from, &gh);
++              if (likely(ret > 0))
+                       ret = generic_write_sync(iocb, ret);
+-              }
+       }
+ 
+ out_unlock:
+diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
+index 02cd0ae98208d..e85ef6b14777d 100644
+--- a/fs/gfs2/glock.c
++++ b/fs/gfs2/glock.c
+@@ -58,6 +58,7 @@ struct gfs2_glock_iter {
+ typedef void (*glock_examiner) (struct gfs2_glock * gl);
+ 
+ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target);
++static void __gfs2_glock_dq(struct gfs2_holder *gh);
+ 
+ static struct dentry *gfs2_root;
+ static struct workqueue_struct *glock_workqueue;
+@@ -197,6 +198,12 @@ static int demote_ok(const struct gfs2_glock *gl)
+ 
+       if (gl->gl_state == LM_ST_UNLOCKED)
+               return 0;
++      /*
++       * Note that demote_ok is used for the lru process of disposing of
++       * glocks. For this purpose, we don't care if the glock's holders
++       * have the HIF_MAY_DEMOTE flag set or not. If someone is using
++       * them, don't demote.
++       */
+       if (!list_empty(&gl->gl_holders))
+               return 0;
+       if (glops->go_demote_ok)
+@@ -301,46 +308,59 @@ void gfs2_glock_put(struct gfs2_glock *gl)
+ }
+ 
+ /**
+- * may_grant - check if its ok to grant a new lock
++ * may_grant - check if it's ok to grant a new lock
+  * @gl: The glock
++ * @current_gh: One of the current holders of @gl
+  * @gh: The lock request which we wish to grant
+  *
+- * Returns: true if its ok to grant the lock
++ * With our current compatibility rules, if a glock has one or more active
++ * holders (HIF_HOLDER flag set), any of those holders can be passed in as
++ * @current_gh; they are all the same as far as compatibility with the new @gh
++ * goes.
++ *
++ * Returns true if it's ok to grant the lock.
+  */
+ 
+-static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh)
+-{
+-      const struct gfs2_holder *gh_head = list_first_entry(&gl->gl_holders, const struct gfs2_holder, gh_list);
++static inline bool may_grant(struct gfs2_glock *gl,
++                           struct gfs2_holder *current_gh,
++                           struct gfs2_holder *gh)
++{
++      if (current_gh) {
++              GLOCK_BUG_ON(gl, !test_bit(HIF_HOLDER, &current_gh->gh_iflags));
++
++              switch(current_gh->gh_state) {
++              case LM_ST_EXCLUSIVE:
++                      /*
++                       * Here we make a special exception to grant holders
++                       * who agree to share the EX lock with other holders
++                       * who also have the bit set. If the original holder
++                       * has the LM_FLAG_NODE_SCOPE bit set, we grant more
++                       * holders with the bit set.
++                       */
++                      return gh->gh_state == LM_ST_EXCLUSIVE &&
++                             (current_gh->gh_flags & LM_FLAG_NODE_SCOPE) &&
++                             (gh->gh_flags & LM_FLAG_NODE_SCOPE);
+ 
+-      if (gh != gh_head) {
+-              /**
+-               * Here we make a special exception to grant holders who agree
+-               * to share the EX lock with other holders who also have the
+-               * bit set. If the original holder has the LM_FLAG_NODE_SCOPE bit
+-               * is set, we grant more holders with the bit set.
+-               */
+-              if (gh_head->gh_state == LM_ST_EXCLUSIVE &&
+-                  (gh_head->gh_flags & LM_FLAG_NODE_SCOPE) &&
+-                  gh->gh_state == LM_ST_EXCLUSIVE &&
+-                  (gh->gh_flags & LM_FLAG_NODE_SCOPE))
+-                      return 1;
+-              if ((gh->gh_state == LM_ST_EXCLUSIVE ||
+-                   gh_head->gh_state == LM_ST_EXCLUSIVE))
+-                      return 0;
++              case LM_ST_SHARED:
++              case LM_ST_DEFERRED:
++                      return gh->gh_state == current_gh->gh_state;
++
++              default:
++                      return false;
++              }
+       }
++
+       if (gl->gl_state == gh->gh_state)
+-              return 1;
++              return true;
+       if (gh->gh_flags & GL_EXACT)
+-              return 0;
++              return false;
+       if (gl->gl_state == LM_ST_EXCLUSIVE) {
+-              if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED)
+-                      return 1;
+-              if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED)
+-                      return 1;
++              return gh->gh_state == LM_ST_SHARED ||
++                     gh->gh_state == LM_ST_DEFERRED;
+       }
+-      if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY))
+-              return 1;
+-      return 0;
++      if (gh->gh_flags & LM_FLAG_ANY)
++              return gl->gl_state != LM_ST_UNLOCKED;
++      return false;
+ }
+ 
+ static void gfs2_holder_wake(struct gfs2_holder *gh)
+@@ -366,7 +386,7 @@ static void do_error(struct gfs2_glock *gl, const int ret)
+       struct gfs2_holder *gh, *tmp;
+ 
+       list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+-              if (test_bit(HIF_HOLDER, &gh->gh_iflags))
++              if (!test_bit(HIF_WAIT, &gh->gh_iflags))
+                       continue;
+               if (ret & LM_OUT_ERROR)
+                       gh->gh_error = -EIO;
+@@ -380,6 +400,78 @@ static void do_error(struct gfs2_glock *gl, const int ret)
+       }
+ }
+ 
++/**
++ * demote_incompat_holders - demote incompatible demoteable holders
++ * @gl: the glock we want to promote
++ * @new_gh: the new holder to be promoted
++ */
++static void demote_incompat_holders(struct gfs2_glock *gl,
++                                  struct gfs2_holder *new_gh)
++{
++      struct gfs2_holder *gh;
++
++      /*
++       * Demote incompatible holders before we make ourselves eligible.
++       * (This holder may or may not allow auto-demoting, but we don't want
++       * to demote the new holder before it's even granted.)
++       */
++      list_for_each_entry(gh, &gl->gl_holders, gh_list) {
++              /*
++               * Since holders are at the front of the list, we stop when we
++               * find the first non-holder.
++               */
++              if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
++                      return;
++              if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) &&
++                  !may_grant(gl, new_gh, gh)) {
++                      /*
++                       * We should not recurse into do_promote because
++                       * __gfs2_glock_dq only calls handle_callback,
++                       * gfs2_glock_add_to_lru and __gfs2_glock_queue_work.
++                       */
++                      __gfs2_glock_dq(gh);
++              }
++      }
++}
++
++/**
++ * find_first_holder - find the first "holder" gh
++ * @gl: the glock
++ */
++
++static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
++{
++      struct gfs2_holder *gh;
++
++      if (!list_empty(&gl->gl_holders)) {
++              gh = list_first_entry(&gl->gl_holders, struct gfs2_holder,
++                                    gh_list);
++              if (test_bit(HIF_HOLDER, &gh->gh_iflags))
++                      return gh;
++      }
++      return NULL;
++}
++
++/**
++ * find_first_strong_holder - find the first non-demoteable holder
++ * @gl: the glock
++ *
++ * Find the first holder that doesn't have the HIF_MAY_DEMOTE flag set.
++ */
++static inline struct gfs2_holder *
++find_first_strong_holder(struct gfs2_glock *gl)
++{
++      struct gfs2_holder *gh;
++
++      list_for_each_entry(gh, &gl->gl_holders, gh_list) {
++              if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
++                      return NULL;
++              if (!test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
++                      return gh;
++      }
++      return NULL;
++}
++
+ /**
+  * do_promote - promote as many requests as possible on the current queue
+  * @gl: The glock
+@@ -393,14 +485,21 @@ __releases(&gl->gl_lockref.lock)
+ __acquires(&gl->gl_lockref.lock)
+ {
+       const struct gfs2_glock_operations *glops = gl->gl_ops;
+-      struct gfs2_holder *gh, *tmp;
++      struct gfs2_holder *gh, *tmp, *first_gh;
++      bool incompat_holders_demoted = false;
+       int ret;
+ 
+ restart:
++      first_gh = find_first_strong_holder(gl);
+       list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) {
+-              if (test_bit(HIF_HOLDER, &gh->gh_iflags))
++              if (!test_bit(HIF_WAIT, &gh->gh_iflags))
+                       continue;
+-              if (may_grant(gl, gh)) {
++              if (may_grant(gl, first_gh, gh)) {
++                      if (!incompat_holders_demoted) {
++                              demote_incompat_holders(gl, first_gh);
++                              incompat_holders_demoted = true;
++                              first_gh = gh;
++                      }
+                       if (gh->gh_list.prev == &gl->gl_holders &&
+                           glops->go_lock) {
+                               spin_unlock(&gl->gl_lockref.lock);
+@@ -426,6 +525,11 @@ restart:
+                       gfs2_holder_wake(gh);
+                       continue;
+               }
++              /*
++               * If we get here, it means we may not grant this holder for
++               * some reason. If this holder is the head of the list, it
++               * means we have a blocked holder at the head, so return 1.
++               */
+               if (gh->gh_list.prev == &gl->gl_holders)
+                       return 1;
+               do_error(gl, 0);
+@@ -722,23 +826,6 @@ out:
+       spin_lock(&gl->gl_lockref.lock);
+ }
+ 
+-/**
+- * find_first_holder - find the first "holder" gh
+- * @gl: the glock
+- */
+-
+-static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
+-{
+-      struct gfs2_holder *gh;
+-
+-      if (!list_empty(&gl->gl_holders)) {
+-              gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list);
+-              if (test_bit(HIF_HOLDER, &gh->gh_iflags))
+-                      return gh;
+-      }
+-      return NULL;
+-}
+-
+ /**
+  * run_queue - do all outstanding tasks related to a glock
+  * @gl: The glock in question
+@@ -1354,15 +1441,20 @@ __acquires(&gl->gl_lockref.lock)
+               GLOCK_BUG_ON(gl, true);
+ 
+       if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) {
+-              if (test_bit(GLF_LOCK, &gl->gl_flags))
+-                      try_futile = !may_grant(gl, gh);
++              if (test_bit(GLF_LOCK, &gl->gl_flags)) {
++                      struct gfs2_holder *first_gh;
++
++                      first_gh = find_first_strong_holder(gl);
++                      try_futile = !may_grant(gl, first_gh, gh);
++              }
+               if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
+                       goto fail;
+       }
+ 
+       list_for_each_entry(gh2, &gl->gl_holders, gh_list) {
+               if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid &&
+-                  (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK)))
++                  (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK) &&
++                  !test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags)))
+                       goto trap_recursive;
+               if (try_futile &&
+                   !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) {
+@@ -1458,51 +1550,83 @@ int gfs2_glock_poll(struct gfs2_holder *gh)
+       return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1;
+ }
+ 
+-/**
+- * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
+- * @gh: the glock holder
+- *
+- */
++static inline bool needs_demote(struct gfs2_glock *gl)
++{
++      return (test_bit(GLF_DEMOTE, &gl->gl_flags) ||
++              test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags));
++}
+ 
+-void gfs2_glock_dq(struct gfs2_holder *gh)
++static void __gfs2_glock_dq(struct gfs2_holder *gh)
+ {
+       struct gfs2_glock *gl = gh->gh_gl;
+       struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
+       unsigned delay = 0;
+       int fast_path = 0;
+ 
+-      spin_lock(&gl->gl_lockref.lock);
+       /*
+-       * If we're in the process of file system withdraw, we cannot just
+-       * dequeue any glocks until our journal is recovered, lest we
+-       * introduce file system corruption. We need two exceptions to this
+-       * rule: We need to allow unlocking of nondisk glocks and the glock
+-       * for our own journal that needs recovery.
++       * This while loop is similar to function demote_incompat_holders:
++       * If the glock is due to be demoted (which may be from another node
++       * or even if this holder is GL_NOCACHE), the weak holders are
++       * demoted as well, allowing the glock to be demoted.
+        */
+-      if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
+-          glock_blocked_by_withdraw(gl) &&
+-          gh->gh_gl != sdp->sd_jinode_gl) {
+-              sdp->sd_glock_dqs_held++;
+-              spin_unlock(&gl->gl_lockref.lock);
+-              might_sleep();
+-              wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
+-                          TASK_UNINTERRUPTIBLE);
+-              spin_lock(&gl->gl_lockref.lock);
+-      }
+-      if (gh->gh_flags & GL_NOCACHE)
+-              handle_callback(gl, LM_ST_UNLOCKED, 0, false);
++      while (gh) {
++              /*
++               * If we're in the process of file system withdraw, we cannot
++               * just dequeue any glocks until our journal is recovered, lest
++               * we introduce file system corruption. We need two exceptions
++               * to this rule: We need to allow unlocking of nondisk glocks
++               * and the glock for our own journal that needs recovery.
++               */
++              if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) &&
++                  glock_blocked_by_withdraw(gl) &&
++                  gh->gh_gl != sdp->sd_jinode_gl) {
++                      sdp->sd_glock_dqs_held++;
++                      spin_unlock(&gl->gl_lockref.lock);
++                      might_sleep();
++                      wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY,
++                                  TASK_UNINTERRUPTIBLE);
++                      spin_lock(&gl->gl_lockref.lock);
++              }
+ 
+-      list_del_init(&gh->gh_list);
+-      clear_bit(HIF_HOLDER, &gh->gh_iflags);
+-      if (list_empty(&gl->gl_holders) &&
+-          !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
+-          !test_bit(GLF_DEMOTE, &gl->gl_flags))
+-              fast_path = 1;
++              /*
++               * This holder should not be cached, so mark it for demote.
++               * Note: this should be done before the check for needs_demote
++               * below.
++               */
++              if (gh->gh_flags & GL_NOCACHE)
++                      handle_callback(gl, LM_ST_UNLOCKED, 0, false);
++
++              list_del_init(&gh->gh_list);
++              clear_bit(HIF_HOLDER, &gh->gh_iflags);
++              trace_gfs2_glock_queue(gh, 0);
++
++              /*
++               * If there hasn't been a demote request we are done.
++               * (Let the remaining holders, if any, keep holding it.)
++               */
++              if (!needs_demote(gl)) {
++                      if (list_empty(&gl->gl_holders))
++                              fast_path = 1;
++                      break;
++              }
++              /*
++               * If we have another strong holder (we cannot auto-demote)
++               * we are done. It keeps holding it until it is done.
++               */
++              if (find_first_strong_holder(gl))
++                      break;
++
++              /*
++               * If we have a weak holder at the head of the list, it
++               * (and all others like it) must be auto-demoted. If there
++               * are no more weak holders, we exit the while loop.
++               */
++              gh = find_first_holder(gl);
++      }
+ 
+       if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl))
+               gfs2_glock_add_to_lru(gl);
+ 
+-      trace_gfs2_glock_queue(gh, 0);
+       if (unlikely(!fast_path)) {
+               gl->gl_lockref.count++;
+               if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
+@@ -1511,6 +1635,19 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
+                       delay = gl->gl_hold_time;
+               __gfs2_glock_queue_work(gl, delay);
+       }
++}
++
++/**
++ * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock)
++ * @gh: the glock holder
++ *
++ */
++void gfs2_glock_dq(struct gfs2_holder *gh)
++{
++      struct gfs2_glock *gl = gh->gh_gl;
++
++      spin_lock(&gl->gl_lockref.lock);
++      __gfs2_glock_dq(gh);
+       spin_unlock(&gl->gl_lockref.lock);
+ }
+ 
+@@ -1673,6 +1810,7 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs)
+ 
+ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
+ {
++      struct gfs2_holder mock_gh = { .gh_gl = gl, .gh_state = state, };
+       unsigned long delay = 0;
+       unsigned long holdtime;
+       unsigned long now = jiffies;
+@@ -1687,6 +1825,28 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
+               if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
+                       delay = gl->gl_hold_time;
+       }
++      /*
++       * Note 1: We cannot call demote_incompat_holders from handle_callback
++       * or gfs2_set_demote due to recursion problems like: gfs2_glock_dq ->
++       * handle_callback -> demote_incompat_holders -> gfs2_glock_dq
++       * Plus, we only want to demote the holders if the request comes from
++       * a remote cluster node because local holder conflicts are resolved
++       * elsewhere.
++       *
++       * Note 2: if a remote node wants this glock in EX mode, lock_dlm will
++       * request that we set our state to UNLOCKED. Here we mock up a holder
++       * to make it look like someone wants the lock EX locally. Any SH
++       * and DF requests should be able to share the lock without demoting.
++       *
++       * Note 3: We only want to demote the demoteable holders when there
++       * are no more strong holders. The demoteable holders might as well
++       * keep the glock until the last strong holder is done with it.
++       */
++      if (!find_first_strong_holder(gl)) {
++              if (state == LM_ST_UNLOCKED)
++                      mock_gh.gh_state = LM_ST_EXCLUSIVE;
++              demote_incompat_holders(gl, &mock_gh);
++      }
+       handle_callback(gl, state, delay, true);
+       __gfs2_glock_queue_work(gl, delay);
+       spin_unlock(&gl->gl_lockref.lock);
+@@ -2078,6 +2238,8 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags)
+               *p++ = 'H';
+       if (test_bit(HIF_WAIT, &iflags))
+               *p++ = 'W';
++      if (test_bit(HIF_MAY_DEMOTE, &iflags))
++              *p++ = 'D';
+       *p = 0;
+       return buf;
+ }
+diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
+index 31a8f2f649b52..9012487da4c69 100644
+--- a/fs/gfs2/glock.h
++++ b/fs/gfs2/glock.h
+@@ -150,6 +150,8 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
+       list_for_each_entry(gh, &gl->gl_holders, gh_list) {
+               if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
+                       break;
++              if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags))
++                      continue;
+               if (gh->gh_owner_pid == pid)
+                       goto out;
+       }
+@@ -325,6 +327,24 @@ static inline void glock_clear_object(struct gfs2_glock *gl, void *object)
+       spin_unlock(&gl->gl_lockref.lock);
+ }
+ 
++static inline void gfs2_holder_allow_demote(struct gfs2_holder *gh)
++{
++      struct gfs2_glock *gl = gh->gh_gl;
++
++      spin_lock(&gl->gl_lockref.lock);
++      set_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
++      spin_unlock(&gl->gl_lockref.lock);
++}
++
++static inline void gfs2_holder_disallow_demote(struct gfs2_holder *gh)
++{
++      struct gfs2_glock *gl = gh->gh_gl;
++
++      spin_lock(&gl->gl_lockref.lock);
++      clear_bit(HIF_MAY_DEMOTE, &gh->gh_iflags);
++      spin_unlock(&gl->gl_lockref.lock);
++}
++
+ extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation);
+ extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation);
+ 
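Editor's illustration (not part of the patch): the gfs2_holder_allow_demote()/gfs2_holder_disallow_demote() helpers above are only useful when callers bracket sections that may block on page faults. A minimal sketch of such a caller follows; do_copy_that_may_fault() and the -ESTALE retry policy are placeholder assumptions, not code from this series.

/*
 * Illustrative sketch only.  While the holder is marked demotable,
 * gfs2_glock_cb() may take the glock away to satisfy a conflicting remote
 * request, so the caller must re-check the holder after disallowing
 * demotion.  do_copy_that_may_fault() is a hypothetical helper standing in
 * for a copy that can fault in user pages.
 */
static ssize_t example_read_under_glock(struct gfs2_holder *gh,
					struct iov_iter *to)
{
	ssize_t ret;

	gfs2_holder_allow_demote(gh);
	ret = do_copy_that_may_fault(to);
	gfs2_holder_disallow_demote(gh);
	if (!gfs2_holder_queued(gh))
		ret = -ESTALE;	/* assumed: caller re-acquires the glock and retries */
	return ret;
}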
+diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
+index 0fe49770166ea..ca42d310fd4d6 100644
+--- a/fs/gfs2/incore.h
++++ b/fs/gfs2/incore.h
+@@ -252,6 +252,7 @@ struct gfs2_lkstats {
+ 
+ enum {
+       /* States */
++      HIF_MAY_DEMOTE          = 1,
+       HIF_HOLDER              = 6,  /* Set for gh that "holds" the glock */
+       HIF_WAIT                = 10,
+ };
+@@ -386,9 +387,8 @@ struct gfs2_inode {
+       u64 i_generation;
+       u64 i_eattr;
+       unsigned long i_flags;          /* GIF_... */
+-      struct gfs2_glock *i_gl; /* Move into i_gh? */
++      struct gfs2_glock *i_gl;
+       struct gfs2_holder i_iopen_gh;
+-      struct gfs2_holder i_gh; /* for prepare/commit_write only */
+       struct gfs2_qadata *i_qadata; /* quota allocation data */
+       struct gfs2_holder i_rgd_gh;
+       struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */
+diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
+index 97119ec3b8503..fe10d8a30f6bd 100644
+--- a/fs/iomap/buffered-io.c
++++ b/fs/iomap/buffered-io.c
+@@ -757,7 +757,7 @@ again:
+                * same page as we're writing to, without it being marked
+                * up-to-date.
+                */
+-              if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
++              if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
+                       status = -EFAULT;
+                       break;
+               }
+diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
+index 4ecd255e0511c..468dcbba45bcb 100644
+--- a/fs/iomap/direct-io.c
++++ b/fs/iomap/direct-io.c
+@@ -31,6 +31,7 @@ struct iomap_dio {
+       atomic_t                ref;
+       unsigned                flags;
+       int                     error;
++      size_t                  done_before;
+       bool                    wait_for_completion;
+ 
+       union {
+@@ -124,6 +125,9 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
+       if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
+               ret = generic_write_sync(iocb, ret);
+ 
++      if (ret > 0)
++              ret += dio->done_before;
++
+       kfree(dio);
+ 
+       return ret;
+@@ -371,6 +375,8 @@ static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter,
+       loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter);
+ 
+       dio->size += length;
++      if (!length)
++              return -EFAULT;
+       return length;
+ }
+ 
+@@ -402,6 +408,8 @@ static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
+               copied = copy_to_iter(inline_data, length, iter);
+       }
+       dio->size += copied;
++      if (!copied)
++              return -EFAULT;
+       return copied;
+ }
+ 
+@@ -446,13 +454,21 @@ static loff_t iomap_dio_iter(const struct iomap_iter *iter,
+  * may be pure data writes. In that case, we still need to do a full data sync
+  * completion.
+  *
++ * When page faults are disabled and @dio_flags includes IOMAP_DIO_PARTIAL,
++ * __iomap_dio_rw can return a partial result if it encounters a non-resident
++ * page in @iter after preparing a transfer.  In that case, the non-resident
++ * pages can be faulted in and the request resumed with @done_before set to the
++ * number of bytes previously transferred.  The request will then complete with
++ * the correct total number of bytes transferred; this is essential for
++ * completing partial requests asynchronously.
++ *
+  * Returns -ENOTBLK In case of a page invalidation invalidation failure for
+  * writes.  The callers needs to fall back to buffered I/O in this case.
+  */
+ struct iomap_dio *
+ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+               const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
+-              unsigned int dio_flags)
++              unsigned int dio_flags, size_t done_before)
+ {
+       struct address_space *mapping = iocb->ki_filp->f_mapping;
+       struct inode *inode = file_inode(iocb->ki_filp);
+@@ -482,6 +498,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+       dio->dops = dops;
+       dio->error = 0;
+       dio->flags = 0;
++      dio->done_before = done_before;
+ 
+       dio->submit.iter = iter;
+       dio->submit.waiter = current;
+@@ -577,6 +594,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+       if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size)
+               iov_iter_revert(iter, iomi.pos - dio->i_size);
+ 
++      if (ret == -EFAULT && dio->size && (dio_flags & IOMAP_DIO_PARTIAL)) {
++              if (!(iocb->ki_flags & IOCB_NOWAIT))
++                      wait_for_completion = true;
++              ret = 0;
++      }
++
+       /* magic error code to fall back to buffered I/O */
+       if (ret == -ENOTBLK) {
+               wait_for_completion = true;
+@@ -642,11 +665,11 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw);
+ ssize_t
+ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+               const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
+-              unsigned int dio_flags)
++              unsigned int dio_flags, size_t done_before)
+ {
+       struct iomap_dio *dio;
+ 
+-      dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags);
++      dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, done_before);
+       if (IS_ERR_OR_NULL(dio))
+               return PTR_ERR_OR_ZERO(dio);
+       return iomap_dio_complete(dio);
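Editor's illustration (not part of the patch): the @done_before comment above implies a specific caller pattern. The sketch below shows that pattern under simplifying assumptions; the ops pointer, the nofault toggle placement, and the retry policy are illustrative, not lifted from this series.

/*
 * Illustrative sketch.  A direct read using IOMAP_DIO_PARTIAL: on -EFAULT
 * (or a short transfer with bytes left in the iterator) the caller faults
 * the pages in outside of the transfer and resumes, passing the running
 * total as @done_before so the final return value covers all passes.
 */
static ssize_t example_dio_read(struct kiocb *iocb, struct iov_iter *to,
				const struct iomap_ops *ops)
{
	size_t written = 0;
	ssize_t ret;

retry:
	to->nofault = true;	/* new iov_iter field: do not fault inside the dio */
	ret = iomap_dio_rw(iocb, to, ops, NULL, IOMAP_DIO_PARTIAL, written);
	to->nofault = false;
	if (ret > 0)
		written = ret;	/* already includes done_before from earlier passes */
	if ((ret == -EFAULT || (ret > 0 && iov_iter_count(to))) &&
	    fault_in_iov_iter_writeable(to, iov_iter_count(to)) !=
	    iov_iter_count(to))
		goto retry;	/* at least some pages were faulted in */
	return written ? written : ret;
}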
+diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
+index ab4f3362466d0..a43adeacd930c 100644
+--- a/fs/ntfs/file.c
++++ b/fs/ntfs/file.c
+@@ -1829,7 +1829,7 @@ again:
+                * pages being swapped out between us bringing them into memory
+                * and doing the actual copying.
+                */
+-              if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
++              if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
+                       status = -EFAULT;
+                       break;
+               }
+diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
+index 43b1451bff539..54b9599640ef4 100644
+--- a/fs/ntfs3/file.c
++++ b/fs/ntfs3/file.c
+@@ -989,7 +989,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
+               frame_vbo = pos & ~(frame_size - 1);
+               index = frame_vbo >> PAGE_SHIFT;
+ 
+-              if (unlikely(iov_iter_fault_in_readable(from, bytes))) {
++              if (unlikely(fault_in_iov_iter_readable(from, bytes))) {
+                       err = -EFAULT;
+                       goto out;
+               }
+diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
+index 7aa943edfc02f..240eb932c014b 100644
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -259,7 +259,7 @@ xfs_file_dio_read(
+       ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
+       if (ret)
+               return ret;
+-      ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0);
++      ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, 0);
+       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+ 
+       return ret;
+@@ -569,7 +569,7 @@ xfs_file_dio_write_aligned(
+       }
+       trace_xfs_file_direct_write(iocb, from);
+       ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
+-                         &xfs_dio_write_ops, 0);
++                         &xfs_dio_write_ops, 0, 0);
+ out_unlock:
+       if (iolock)
+               xfs_iunlock(ip, iolock);
+@@ -647,7 +647,7 @@ retry_exclusive:
+ 
+       trace_xfs_file_direct_write(iocb, from);
+       ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
+-                         &xfs_dio_write_ops, flags);
++                         &xfs_dio_write_ops, flags, 0);
+ 
+       /*
+        * Retry unaligned I/O with exclusive blocking semantics if the DIO
+diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
+index 807f33553a8eb..bced33b76beac 100644
+--- a/fs/zonefs/super.c
++++ b/fs/zonefs/super.c
+@@ -852,7 +852,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
+               ret = zonefs_file_dio_append(iocb, from);
+       else
+               ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
+-                                 &zonefs_write_dio_ops, 0);
++                                 &zonefs_write_dio_ops, 0, 0);
+       if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
+           (ret > 0 || ret == -EIOCBQUEUED)) {
+               if (ret > 0)
+@@ -987,7 +987,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+               }
+               file_accessed(iocb->ki_filp);
+               ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
+-                                 &zonefs_read_dio_ops, 0);
++                                 &zonefs_read_dio_ops, 0, 0);
+       } else {
+               ret = generic_file_read_iter(iocb, to);
+               if (ret == -EIO)
+diff --git a/include/linux/bpf.h b/include/linux/bpf.h
+index 15b690a0cecb0..c5c4b6f09e230 100644
+--- a/include/linux/bpf.h
++++ b/include/linux/bpf.h
+@@ -293,6 +293,34 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
+ 
+ extern const struct bpf_map_ops bpf_map_offload_ops;
+ 
++/* bpf_type_flag contains a set of flags that are applicable to the values of
++ * arg_type, ret_type and reg_type. For example, a pointer value may be null,
++ * or a memory is read-only. We classify types into two categories: base types
++ * and extended types. Extended types are base types combined with a type flag.
++ *
++ * Currently there are no more than 32 base types in arg_type, ret_type and
++ * reg_types.
++ */
++#define BPF_BASE_TYPE_BITS    8
++
++enum bpf_type_flag {
++      /* PTR may be NULL. */
++      PTR_MAYBE_NULL          = BIT(0 + BPF_BASE_TYPE_BITS),
++
++      /* MEM is read-only. When applied on bpf_arg, it indicates the arg is
++       * compatible with both mutable and immutable memory.
++       */
++      MEM_RDONLY              = BIT(1 + BPF_BASE_TYPE_BITS),
++
++      __BPF_TYPE_LAST_FLAG    = MEM_RDONLY,
++};
++
++/* Max number of base types. */
++#define BPF_BASE_TYPE_LIMIT   (1UL << BPF_BASE_TYPE_BITS)
++
++/* Max number of all types. */
++#define BPF_TYPE_LIMIT                (__BPF_TYPE_LAST_FLAG | (__BPF_TYPE_LAST_FLAG - 1))
++
+ /* function argument constraints */
+ enum bpf_arg_type {
+       ARG_DONTCARE = 0,       /* unused argument in helper function */
+@@ -304,13 +332,11 @@ enum bpf_arg_type {
+       ARG_PTR_TO_MAP_KEY,     /* pointer to stack used as map key */
+       ARG_PTR_TO_MAP_VALUE,   /* pointer to stack used as map value */
+       ARG_PTR_TO_UNINIT_MAP_VALUE,    /* pointer to valid memory used to store a map value */
+-      ARG_PTR_TO_MAP_VALUE_OR_NULL,   /* pointer to stack used as map value or NULL */
+ 
+       /* the following constraints used to prototype bpf_memcmp() and other
+        * functions that access data on eBPF program stack
+        */
+       ARG_PTR_TO_MEM,         /* pointer to valid memory (stack, packet, map value) */
+-      ARG_PTR_TO_MEM_OR_NULL, /* pointer to valid memory or NULL */
+       ARG_PTR_TO_UNINIT_MEM,  /* pointer to memory does not need to be initialized,
+                                * helper function must fill all bytes or clear
+                                * them in error case.
+@@ -320,42 +346,65 @@ enum bpf_arg_type {
+       ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */
+ 
+       ARG_PTR_TO_CTX,         /* pointer to context */
+-      ARG_PTR_TO_CTX_OR_NULL, /* pointer to context or NULL */
+       ARG_ANYTHING,           /* any (initialized) argument is ok */
+       ARG_PTR_TO_SPIN_LOCK,   /* pointer to bpf_spin_lock */
+       ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */
+       ARG_PTR_TO_INT,         /* pointer to int */
+       ARG_PTR_TO_LONG,        /* pointer to long */
+       ARG_PTR_TO_SOCKET,      /* pointer to bpf_sock (fullsock) */
+-      ARG_PTR_TO_SOCKET_OR_NULL,      /* pointer to bpf_sock (fullsock) or NULL */
+       ARG_PTR_TO_BTF_ID,      /* pointer to in-kernel struct */
+       ARG_PTR_TO_ALLOC_MEM,   /* pointer to dynamically allocated memory */
+-      ARG_PTR_TO_ALLOC_MEM_OR_NULL,   /* pointer to dynamically allocated memory or NULL */
+       ARG_CONST_ALLOC_SIZE_OR_ZERO,   /* number of allocated bytes requested */
+       ARG_PTR_TO_BTF_ID_SOCK_COMMON,  /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */
+       ARG_PTR_TO_PERCPU_BTF_ID,       /* pointer to in-kernel percpu type */
+       ARG_PTR_TO_FUNC,        /* pointer to a bpf program function */
+-      ARG_PTR_TO_STACK_OR_NULL,       /* pointer to stack or NULL */
++      ARG_PTR_TO_STACK,       /* pointer to stack */
+       ARG_PTR_TO_CONST_STR,   /* pointer to a null terminated read-only string */
+       ARG_PTR_TO_TIMER,       /* pointer to bpf_timer */
+       __BPF_ARG_TYPE_MAX,
++
++      /* Extended arg_types. */
++      ARG_PTR_TO_MAP_VALUE_OR_NULL    = PTR_MAYBE_NULL | ARG_PTR_TO_MAP_VALUE,
++      ARG_PTR_TO_MEM_OR_NULL          = PTR_MAYBE_NULL | ARG_PTR_TO_MEM,
++      ARG_PTR_TO_CTX_OR_NULL          = PTR_MAYBE_NULL | ARG_PTR_TO_CTX,
++      ARG_PTR_TO_SOCKET_OR_NULL       = PTR_MAYBE_NULL | ARG_PTR_TO_SOCKET,
++      ARG_PTR_TO_ALLOC_MEM_OR_NULL    = PTR_MAYBE_NULL | ARG_PTR_TO_ALLOC_MEM,
++      ARG_PTR_TO_STACK_OR_NULL        = PTR_MAYBE_NULL | ARG_PTR_TO_STACK,
++
++      /* This must be the last entry. Its purpose is to ensure the enum is
++       * wide enough to hold the higher bits reserved for bpf_type_flag.
++       */
++      __BPF_ARG_TYPE_LIMIT    = BPF_TYPE_LIMIT,
+ };
++static_assert(__BPF_ARG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
+ 
+ /* type of values returned from helper functions */
+ enum bpf_return_type {
+       RET_INTEGER,                    /* function returns integer */
+       RET_VOID,                       /* function doesn't return anything */
+       RET_PTR_TO_MAP_VALUE,           /* returns a pointer to map elem value */
+-      RET_PTR_TO_MAP_VALUE_OR_NULL,   /* returns a pointer to map elem value or NULL */
+-      RET_PTR_TO_SOCKET_OR_NULL,      /* returns a pointer to a socket or NULL */
+-      RET_PTR_TO_TCP_SOCK_OR_NULL,    /* returns a pointer to a tcp_sock or NULL */
+-      RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */
+-      RET_PTR_TO_ALLOC_MEM_OR_NULL,   /* returns a pointer to dynamically allocated memory or NULL */
+-      RET_PTR_TO_BTF_ID_OR_NULL,      /* returns a pointer to a btf_id or NULL */
+-      RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */
++      RET_PTR_TO_SOCKET,              /* returns a pointer to a socket */
++      RET_PTR_TO_TCP_SOCK,            /* returns a pointer to a tcp_sock */
++      RET_PTR_TO_SOCK_COMMON,         /* returns a pointer to a sock_common */
++      RET_PTR_TO_ALLOC_MEM,           /* returns a pointer to dynamically allocated memory */
+       RET_PTR_TO_MEM_OR_BTF_ID,       /* returns a pointer to a valid memory or a btf_id */
+       RET_PTR_TO_BTF_ID,              /* returns a pointer to a btf_id */
++      __BPF_RET_TYPE_MAX,
++
++      /* Extended ret_types. */
++      RET_PTR_TO_MAP_VALUE_OR_NULL    = PTR_MAYBE_NULL | RET_PTR_TO_MAP_VALUE,
++      RET_PTR_TO_SOCKET_OR_NULL       = PTR_MAYBE_NULL | RET_PTR_TO_SOCKET,
++      RET_PTR_TO_TCP_SOCK_OR_NULL     = PTR_MAYBE_NULL | RET_PTR_TO_TCP_SOCK,
++      RET_PTR_TO_SOCK_COMMON_OR_NULL  = PTR_MAYBE_NULL | RET_PTR_TO_SOCK_COMMON,
++      RET_PTR_TO_ALLOC_MEM_OR_NULL    = PTR_MAYBE_NULL | RET_PTR_TO_ALLOC_MEM,
++      RET_PTR_TO_BTF_ID_OR_NULL       = PTR_MAYBE_NULL | RET_PTR_TO_BTF_ID,
++
++      /* This must be the last entry. Its purpose is to ensure the enum is
++       * wide enough to hold the higher bits reserved for bpf_type_flag.
++       */
++      __BPF_RET_TYPE_LIMIT    = BPF_TYPE_LIMIT,
+ };
++static_assert(__BPF_RET_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
+ 
+ /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
+  * to in-kernel helper functions and for adjusting imm32 field in BPF_CALL
+@@ -417,18 +466,15 @@ enum bpf_reg_type {
+       PTR_TO_CTX,              /* reg points to bpf_context */
+       CONST_PTR_TO_MAP,        /* reg points to struct bpf_map */
+       PTR_TO_MAP_VALUE,        /* reg points to map element value */
+-      PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
++      PTR_TO_MAP_KEY,          /* reg points to a map element key */
+       PTR_TO_STACK,            /* reg == frame_pointer + offset */
+       PTR_TO_PACKET_META,      /* skb->data - meta_len */
+       PTR_TO_PACKET,           /* reg points to skb->data */
+       PTR_TO_PACKET_END,       /* skb->data + headlen */
+       PTR_TO_FLOW_KEYS,        /* reg points to bpf_flow_keys */
+       PTR_TO_SOCKET,           /* reg points to struct bpf_sock */
+-      PTR_TO_SOCKET_OR_NULL,   /* reg points to struct bpf_sock or NULL */
+       PTR_TO_SOCK_COMMON,      /* reg points to sock_common */
+-      PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */
+       PTR_TO_TCP_SOCK,         /* reg points to struct tcp_sock */
+-      PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */
+       PTR_TO_TP_BUFFER,        /* reg points to a writable raw tp's buffer */
+       PTR_TO_XDP_SOCK,         /* reg points to struct xdp_sock */
+       /* PTR_TO_BTF_ID points to a kernel struct that does not need
+@@ -446,18 +492,25 @@ enum bpf_reg_type {
+        * been checked for null. Used primarily to inform the verifier
+        * an explicit null check is required for this struct.
+        */
+-      PTR_TO_BTF_ID_OR_NULL,
+       PTR_TO_MEM,              /* reg points to valid memory region */
+-      PTR_TO_MEM_OR_NULL,      /* reg points to valid memory region or NULL */
+-      PTR_TO_RDONLY_BUF,       /* reg points to a readonly buffer */
+-      PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */
+-      PTR_TO_RDWR_BUF,         /* reg points to a read/write buffer */
+-      PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */
++      PTR_TO_BUF,              /* reg points to a read/write buffer */
+       PTR_TO_PERCPU_BTF_ID,    /* reg points to a percpu kernel variable */
+       PTR_TO_FUNC,             /* reg points to a bpf program function */
+-      PTR_TO_MAP_KEY,          /* reg points to a map element key */
+       __BPF_REG_TYPE_MAX,
++
++      /* Extended reg_types. */
++      PTR_TO_MAP_VALUE_OR_NULL        = PTR_MAYBE_NULL | PTR_TO_MAP_VALUE,
++      PTR_TO_SOCKET_OR_NULL           = PTR_MAYBE_NULL | PTR_TO_SOCKET,
++      PTR_TO_SOCK_COMMON_OR_NULL      = PTR_MAYBE_NULL | PTR_TO_SOCK_COMMON,
++      PTR_TO_TCP_SOCK_OR_NULL         = PTR_MAYBE_NULL | PTR_TO_TCP_SOCK,
++      PTR_TO_BTF_ID_OR_NULL           = PTR_MAYBE_NULL | PTR_TO_BTF_ID,
++
++      /* This must be the last entry. Its purpose is to ensure the enum is
++       * wide enough to hold the higher bits reserved for bpf_type_flag.
++       */
++      __BPF_REG_TYPE_LIMIT    = BPF_TYPE_LIMIT,
+ };
++static_assert(__BPF_REG_TYPE_MAX <= BPF_BASE_TYPE_LIMIT);
+ 
+ /* The information passed from prog-specific *_is_valid_access
+  * back to the verifier.
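Editor's note (illustrative arithmetic, not new patch content): a quick sanity check of the limits defined in this header.

/*
 * Worked example of the encoding introduced above:
 *   BPF_BASE_TYPE_BITS = 8, so BPF_BASE_TYPE_LIMIT = 1 << 8 = 256 base types;
 *   PTR_MAYBE_NULL = BIT(8) = 0x100 and MEM_RDONLY = BIT(9) = 0x200;
 *   BPF_TYPE_LIMIT = 0x200 | (0x200 - 1) = 0x3ff.
 * Every extended arg/ret/reg type therefore fits in the low 10 bits, and the
 * static_assert()s confirm the base enums never spill into the flag bits.
 */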
+diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
+index 364550dd19c4a..bb1cc3fbc4bab 100644
+--- a/include/linux/bpf_verifier.h
++++ b/include/linux/bpf_verifier.h
+@@ -18,6 +18,8 @@
+  * that converting umax_value to int cannot overflow.
+  */
+ #define BPF_MAX_VAR_SIZ       (1 << 29)
++/* size of type_str_buf in bpf_verifier. */
++#define TYPE_STR_BUF_LEN 64
+ 
+ /* Liveness marks, used for registers and spilled-regs (in stack slots).
+  * Read marks propagate upwards until they find a write mark; they record that
+@@ -474,6 +476,8 @@ struct bpf_verifier_env {
+       /* longest register parentage chain walked for liveness marking */
+       u32 longest_mark_read_walk;
+       bpfptr_t fd_array;
++      /* buffer used in reg_type_str() to generate reg_type string */
++      char type_str_buf[TYPE_STR_BUF_LEN];
+ };
+ 
+ __printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log,
+@@ -535,4 +539,18 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
+                           u32 btf_id,
+                           struct bpf_attach_target_info *tgt_info);
+ 
++#define BPF_BASE_TYPE_MASK    GENMASK(BPF_BASE_TYPE_BITS - 1, 0)
++
++/* extract base type from bpf_{arg, return, reg}_type. */
++static inline u32 base_type(u32 type)
++{
++      return type & BPF_BASE_TYPE_MASK;
++}
++
++/* extract flags from an extended type. See bpf_type_flag in bpf.h. */
++static inline u32 type_flag(u32 type)
++{
++      return type & ~BPF_BASE_TYPE_MASK;
++}
++
+ #endif /* _LINUX_BPF_VERIFIER_H */
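Editor's illustration (not part of the patch): how the base_type()/type_flag() helpers above decompose a composite type built from the flags in bpf.h. The function is a hypothetical sketch.

/* Illustrative only: compose an extended type and take it apart again. */
static void example_type_encoding(void)
{
	u32 t = PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY;

	WARN_ON(base_type(t) != PTR_TO_MEM);		/* low BPF_BASE_TYPE_BITS */
	WARN_ON(!(type_flag(t) & PTR_MAYBE_NULL));	/* flag bits */
	WARN_ON(!(type_flag(t) & MEM_RDONLY));

	t &= ~PTR_MAYBE_NULL;	/* as mark_ptr_not_null_reg() does after a NULL check */
	WARN_ON(type_flag(t) != MEM_RDONLY);
}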
+diff --git a/include/linux/iomap.h b/include/linux/iomap.h
+index 24f8489583ca7..829f2325ecbab 100644
+--- a/include/linux/iomap.h
++++ b/include/linux/iomap.h
+@@ -330,12 +330,19 @@ struct iomap_dio_ops {
+   */
+ #define IOMAP_DIO_OVERWRITE_ONLY      (1 << 1)
+ 
++/*
++ * When a page fault occurs, return a partial synchronous result and allow
++ * the caller to retry the rest of the operation after dealing with the page
++ * fault.
++ */
++#define IOMAP_DIO_PARTIAL             (1 << 2)
++
+ ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+               const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
+-              unsigned int dio_flags);
++              unsigned int dio_flags, size_t done_before);
+ struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
+               const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
+-              unsigned int dio_flags);
++              unsigned int dio_flags, size_t done_before);
+ ssize_t iomap_dio_complete(struct iomap_dio *dio);
+ int iomap_dio_iopoll(struct kiocb *kiocb, bool spin);
+ 
+diff --git a/include/linux/mm.h b/include/linux/mm.h
+index 90c2d7f3c7a88..04345ff97f8ca 100644
+--- a/include/linux/mm.h
++++ b/include/linux/mm.h
+@@ -2858,7 +2858,8 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+ #define FOLL_FORCE    0x10    /* get_user_pages read/write w/o permission */
+ #define FOLL_NOWAIT   0x20    /* if a disk transfer is needed, start the IO
+                                * and return without waiting upon it */
+-#define FOLL_POPULATE 0x40    /* fault in page */
++#define FOLL_POPULATE 0x40    /* fault in pages (with FOLL_MLOCK) */
++#define FOLL_NOFAULT  0x80    /* do not fault in pages */
+ #define FOLL_HWPOISON 0x100   /* check page is hwpoisoned */
+ #define FOLL_NUMA     0x200   /* force NUMA hinting page fault */
+ #define FOLL_MIGRATION        0x400   /* wait for page to replace migration entry */
+diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
+index 62db6b0176b95..2f7dd14083d94 100644
+--- a/include/linux/pagemap.h
++++ b/include/linux/pagemap.h
+@@ -733,61 +733,11 @@ int wait_on_page_private_2_killable(struct page *page);
+ extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter);
+ 
+ /*
+- * Fault everything in given userspace address range in.
++ * Fault in userspace address range.
+  */
+-static inline int fault_in_pages_writeable(char __user *uaddr, size_t size)
+-{
+-      char __user *end = uaddr + size - 1;
+-
+-      if (unlikely(size == 0))
+-              return 0;
+-
+-      if (unlikely(uaddr > end))
+-              return -EFAULT;
+-      /*
+-       * Writing zeroes into userspace here is OK, because we know that if
+-       * the zero gets there, we'll be overwriting it.
+-       */
+-      do {
+-              if (unlikely(__put_user(0, uaddr) != 0))
+-                      return -EFAULT;
+-              uaddr += PAGE_SIZE;
+-      } while (uaddr <= end);
+-
+-      /* Check whether the range spilled into the next page. */
+-      if (((unsigned long)uaddr & PAGE_MASK) ==
+-                      ((unsigned long)end & PAGE_MASK))
+-              return __put_user(0, end);
+-
+-      return 0;
+-}
+-
+-static inline int fault_in_pages_readable(const char __user *uaddr, size_t size)
+-{
+-      volatile char c;
+-      const char __user *end = uaddr + size - 1;
+-
+-      if (unlikely(size == 0))
+-              return 0;
+-
+-      if (unlikely(uaddr > end))
+-              return -EFAULT;
+-
+-      do {
+-              if (unlikely(__get_user(c, uaddr) != 0))
+-                      return -EFAULT;
+-              uaddr += PAGE_SIZE;
+-      } while (uaddr <= end);
+-
+-      /* Check whether the range spilled into the next page. */
+-      if (((unsigned long)uaddr & PAGE_MASK) ==
+-                      ((unsigned long)end & PAGE_MASK)) {
+-              return __get_user(c, end);
+-      }
+-
+-      (void)c;
+-      return 0;
+-}
++size_t fault_in_writeable(char __user *uaddr, size_t size);
++size_t fault_in_safe_writeable(const char __user *uaddr, size_t size);
++size_t fault_in_readable(const char __user *uaddr, size_t size);
+ 
+ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
+                               pgoff_t index, gfp_t gfp_mask);
+diff --git a/include/linux/uio.h b/include/linux/uio.h
+index 207101a9c5c32..6350354f97e90 100644
+--- a/include/linux/uio.h
++++ b/include/linux/uio.h
+@@ -35,6 +35,7 @@ struct iov_iter_state {
+ 
+ struct iov_iter {
+       u8 iter_type;
++      bool nofault;
+       bool data_source;
+       size_t iov_offset;
+       size_t count;
+@@ -133,7 +134,8 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset,
+                                 size_t bytes, struct iov_iter *i);
+ void iov_iter_advance(struct iov_iter *i, size_t bytes);
+ void iov_iter_revert(struct iov_iter *i, size_t bytes);
+-int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes);
++size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes);
++size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes);
+ size_t iov_iter_single_seg_count(const struct iov_iter *i);
+ size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
+                        struct iov_iter *i);
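Editor's illustration (not part of the patch): the converted call sites earlier in this patch (iomap buffered-io, ntfs, ntfs3) all follow one shape; a condensed sketch of that pattern, with hypothetical names, for reference.

/*
 * Illustrative sketch.  Mirrors the buffered-write call sites converted
 * above: make sure the source pages are resident before copying with page
 * faults disabled.  fault_in_iov_iter_readable() now returns the number of
 * bytes it could not fault in (0 on full success), replacing the
 * int-returning iov_iter_fault_in_readable().
 */
static ssize_t example_copy_step(struct page *page, unsigned offset,
				 size_t bytes, struct iov_iter *i)
{
	size_t copied;

	/* As at the converted call sites, any shortfall is treated as failure. */
	if (unlikely(fault_in_iov_iter_readable(i, bytes)))
		return -EFAULT;

	copied = copy_page_from_iter_atomic(page, offset, bytes, i);
	return copied ? copied : -EFAULT;
}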
+diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
+index 09406b0e215e1..40df35088cdbd 100644
+--- a/kernel/bpf/btf.c
++++ b/kernel/bpf/btf.c
+@@ -4800,10 +4800,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
+       /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */
+       for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
+               const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
++              u32 type, flag;
+ 
+-              if (ctx_arg_info->offset == off &&
+-                  (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL ||
+-                   ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) {
++              type = base_type(ctx_arg_info->reg_type);
++              flag = type_flag(ctx_arg_info->reg_type);
++              if (ctx_arg_info->offset == off && type == PTR_TO_BUF &&
++                  (flag & PTR_MAYBE_NULL)) {
+                       info->reg_type = ctx_arg_info->reg_type;
+                       return true;
+               }
+@@ -5508,9 +5510,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
+                       if (reg->type == PTR_TO_BTF_ID) {
+                               reg_btf = reg->btf;
+                               reg_ref_id = reg->btf_id;
+-                      } else if (reg2btf_ids[reg->type]) {
++                      } else if (reg2btf_ids[base_type(reg->type)]) {
+                               reg_btf = btf_vmlinux;
+-                              reg_ref_id = *reg2btf_ids[reg->type];
++                              reg_ref_id = *reg2btf_ids[base_type(reg->type)];
+                       } else {
+                               bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n",
+                                       func_name, i,
+@@ -5717,7 +5719,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
+                               return -EINVAL;
+                       }
+ 
+-                      reg->type = PTR_TO_MEM_OR_NULL;
++                      reg->type = PTR_TO_MEM | PTR_MAYBE_NULL;
+                       reg->id = ++env->id_gen;
+ 
+                       continue;
+@@ -6229,7 +6231,7 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = {
+       .func           = bpf_btf_find_by_name_kind,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+-      .arg1_type      = ARG_PTR_TO_MEM,
++      .arg1_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg2_type      = ARG_CONST_SIZE,
+       .arg3_type      = ARG_ANYTHING,
+       .arg4_type      = ARG_ANYTHING,
+diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
+index 7dbd68195a2b0..fe053ffd89329 100644
+--- a/kernel/bpf/cgroup.c
++++ b/kernel/bpf/cgroup.c
+@@ -1753,7 +1753,7 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE,
+ };
+ 
+diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
+index 6f600cc95ccda..a711ffe238932 100644
+--- a/kernel/bpf/helpers.c
++++ b/kernel/bpf/helpers.c
+@@ -530,7 +530,7 @@ const struct bpf_func_proto bpf_strtol_proto = {
+       .func           = bpf_strtol,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+-      .arg1_type      = ARG_PTR_TO_MEM,
++      .arg1_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg2_type      = ARG_CONST_SIZE,
+       .arg3_type      = ARG_ANYTHING,
+       .arg4_type      = ARG_PTR_TO_LONG,
+@@ -558,7 +558,7 @@ const struct bpf_func_proto bpf_strtoul_proto = {
+       .func           = bpf_strtoul,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+-      .arg1_type      = ARG_PTR_TO_MEM,
++      .arg1_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg2_type      = ARG_CONST_SIZE,
+       .arg3_type      = ARG_ANYTHING,
+       .arg4_type      = ARG_PTR_TO_LONG,
+@@ -630,7 +630,7 @@ const struct bpf_func_proto bpf_event_output_data_proto =  {
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_ANYTHING,
+-      .arg4_type      = ARG_PTR_TO_MEM,
++      .arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
+ };
+ 
+@@ -667,7 +667,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
+ const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
+       .func           = bpf_per_cpu_ptr,
+       .gpl_only       = false,
+-      .ret_type       = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL,
++      .ret_type       = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY,
+       .arg1_type      = ARG_PTR_TO_PERCPU_BTF_ID,
+       .arg2_type      = ARG_ANYTHING,
+ };
+@@ -680,7 +680,7 @@ BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
+ const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
+       .func           = bpf_this_cpu_ptr,
+       .gpl_only       = false,
+-      .ret_type       = RET_PTR_TO_MEM_OR_BTF_ID,
++      .ret_type       = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY,
+       .arg1_type      = ARG_PTR_TO_PERCPU_BTF_ID,
+ };
+ 
+@@ -1013,7 +1013,7 @@ const struct bpf_func_proto bpf_snprintf_proto = {
+       .arg1_type      = ARG_PTR_TO_MEM_OR_NULL,
+       .arg2_type      = ARG_CONST_SIZE_OR_ZERO,
+       .arg3_type      = ARG_PTR_TO_CONST_STR,
+-      .arg4_type      = ARG_PTR_TO_MEM_OR_NULL,
++      .arg4_type      = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
+       .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
+ };
+ 
+diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
+index 6a9542af4212a..b0fa190b09790 100644
+--- a/kernel/bpf/map_iter.c
++++ b/kernel/bpf/map_iter.c
+@@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = {
+       .ctx_arg_info_size      = 2,
+       .ctx_arg_info           = {
+               { offsetof(struct bpf_iter__bpf_map_elem, key),
+-                PTR_TO_RDONLY_BUF_OR_NULL },
++                PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY },
+               { offsetof(struct bpf_iter__bpf_map_elem, value),
+-                PTR_TO_RDWR_BUF_OR_NULL },
++                PTR_TO_BUF | PTR_MAYBE_NULL },
+       },
+ };
+ 
+diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c
+index f1c51c45667d3..710ba9de12ce4 100644
+--- a/kernel/bpf/ringbuf.c
++++ b/kernel/bpf/ringbuf.c
+@@ -444,7 +444,7 @@ const struct bpf_func_proto bpf_ringbuf_output_proto = {
+       .func           = bpf_ringbuf_output,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_CONST_MAP_PTR,
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
+       .arg4_type      = ARG_ANYTHING,
+ };
+diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
+index 42490c39dfbf5..48e02a725563f 100644
+--- a/kernel/bpf/syscall.c
++++ b/kernel/bpf/syscall.c
+@@ -4753,7 +4753,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = {
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_ANYTHING,
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE,
+ };
+ 
+diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
+index 670721e39c0e8..d2b119b4fbe74 100644
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -445,18 +445,6 @@ static bool reg_type_not_null(enum bpf_reg_type type)
+               type == PTR_TO_SOCK_COMMON;
+ }
+ 
+-static bool reg_type_may_be_null(enum bpf_reg_type type)
+-{
+-      return type == PTR_TO_MAP_VALUE_OR_NULL ||
+-             type == PTR_TO_SOCKET_OR_NULL ||
+-             type == PTR_TO_SOCK_COMMON_OR_NULL ||
+-             type == PTR_TO_TCP_SOCK_OR_NULL ||
+-             type == PTR_TO_BTF_ID_OR_NULL ||
+-             type == PTR_TO_MEM_OR_NULL ||
+-             type == PTR_TO_RDONLY_BUF_OR_NULL ||
+-             type == PTR_TO_RDWR_BUF_OR_NULL;
+-}
+-
+ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
+ {
+       return reg->type == PTR_TO_MAP_VALUE &&
+@@ -465,12 +453,14 @@ static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
+ 
+ static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type)
+ {
+-      return type == PTR_TO_SOCKET ||
+-              type == PTR_TO_SOCKET_OR_NULL ||
+-              type == PTR_TO_TCP_SOCK ||
+-              type == PTR_TO_TCP_SOCK_OR_NULL ||
+-              type == PTR_TO_MEM ||
+-              type == PTR_TO_MEM_OR_NULL;
++      return base_type(type) == PTR_TO_SOCKET ||
++              base_type(type) == PTR_TO_TCP_SOCK ||
++              base_type(type) == PTR_TO_MEM;
++}
++
++static bool type_is_rdonly_mem(u32 type)
++{
++      return type & MEM_RDONLY;
+ }
+ 
+ static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
+@@ -478,14 +468,9 @@ static bool arg_type_may_be_refcounted(enum bpf_arg_type type)
+       return type == ARG_PTR_TO_SOCK_COMMON;
+ }
+ 
+-static bool arg_type_may_be_null(enum bpf_arg_type type)
++static bool type_may_be_null(u32 type)
+ {
+-      return type == ARG_PTR_TO_MAP_VALUE_OR_NULL ||
+-             type == ARG_PTR_TO_MEM_OR_NULL ||
+-             type == ARG_PTR_TO_CTX_OR_NULL ||
+-             type == ARG_PTR_TO_SOCKET_OR_NULL ||
+-             type == ARG_PTR_TO_ALLOC_MEM_OR_NULL ||
+-             type == ARG_PTR_TO_STACK_OR_NULL;
++      return type & PTR_MAYBE_NULL;
+ }
+ 
+ /* Determine whether the function releases some resources allocated by another
+@@ -545,39 +530,54 @@ static bool is_cmpxchg_insn(const struct bpf_insn *insn)
+              insn->imm == BPF_CMPXCHG;
+ }
+ 
+-/* string representation of 'enum bpf_reg_type' */
+-static const char * const reg_type_str[] = {
+-      [NOT_INIT]              = "?",
+-      [SCALAR_VALUE]          = "inv",
+-      [PTR_TO_CTX]            = "ctx",
+-      [CONST_PTR_TO_MAP]      = "map_ptr",
+-      [PTR_TO_MAP_VALUE]      = "map_value",
+-      [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
+-      [PTR_TO_STACK]          = "fp",
+-      [PTR_TO_PACKET]         = "pkt",
+-      [PTR_TO_PACKET_META]    = "pkt_meta",
+-      [PTR_TO_PACKET_END]     = "pkt_end",
+-      [PTR_TO_FLOW_KEYS]      = "flow_keys",
+-      [PTR_TO_SOCKET]         = "sock",
+-      [PTR_TO_SOCKET_OR_NULL] = "sock_or_null",
+-      [PTR_TO_SOCK_COMMON]    = "sock_common",
+-      [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null",
+-      [PTR_TO_TCP_SOCK]       = "tcp_sock",
+-      [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null",
+-      [PTR_TO_TP_BUFFER]      = "tp_buffer",
+-      [PTR_TO_XDP_SOCK]       = "xdp_sock",
+-      [PTR_TO_BTF_ID]         = "ptr_",
+-      [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_",
+-      [PTR_TO_PERCPU_BTF_ID]  = "percpu_ptr_",
+-      [PTR_TO_MEM]            = "mem",
+-      [PTR_TO_MEM_OR_NULL]    = "mem_or_null",
+-      [PTR_TO_RDONLY_BUF]     = "rdonly_buf",
+-      [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null",
+-      [PTR_TO_RDWR_BUF]       = "rdwr_buf",
+-      [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
+-      [PTR_TO_FUNC]           = "func",
+-      [PTR_TO_MAP_KEY]        = "map_key",
+-};
++/* string representation of 'enum bpf_reg_type'
++ *
++ * Note that reg_type_str() can not appear more than once in a single verbose()
++ * statement.
++ */
++static const char *reg_type_str(struct bpf_verifier_env *env,
++                              enum bpf_reg_type type)
++{
++      char postfix[16] = {0}, prefix[16] = {0};
++      static const char * const str[] = {
++              [NOT_INIT]              = "?",
++              [SCALAR_VALUE]          = "inv",
++              [PTR_TO_CTX]            = "ctx",
++              [CONST_PTR_TO_MAP]      = "map_ptr",
++              [PTR_TO_MAP_VALUE]      = "map_value",
++              [PTR_TO_STACK]          = "fp",
++              [PTR_TO_PACKET]         = "pkt",
++              [PTR_TO_PACKET_META]    = "pkt_meta",
++              [PTR_TO_PACKET_END]     = "pkt_end",
++              [PTR_TO_FLOW_KEYS]      = "flow_keys",
++              [PTR_TO_SOCKET]         = "sock",
++              [PTR_TO_SOCK_COMMON]    = "sock_common",
++              [PTR_TO_TCP_SOCK]       = "tcp_sock",
++              [PTR_TO_TP_BUFFER]      = "tp_buffer",
++              [PTR_TO_XDP_SOCK]       = "xdp_sock",
++              [PTR_TO_BTF_ID]         = "ptr_",
++              [PTR_TO_PERCPU_BTF_ID]  = "percpu_ptr_",
++              [PTR_TO_MEM]            = "mem",
++              [PTR_TO_BUF]            = "buf",
++              [PTR_TO_FUNC]           = "func",
++              [PTR_TO_MAP_KEY]        = "map_key",
++      };
++
++      if (type & PTR_MAYBE_NULL) {
++              if (base_type(type) == PTR_TO_BTF_ID ||
++                  base_type(type) == PTR_TO_PERCPU_BTF_ID)
++                      strncpy(postfix, "or_null_", 16);
++              else
++                      strncpy(postfix, "_or_null", 16);
++      }
++
++      if (type & MEM_RDONLY)
++              strncpy(prefix, "rdonly_", 16);
++
++      snprintf(env->type_str_buf, TYPE_STR_BUF_LEN, "%s%s%s",
++               prefix, str[base_type(type)], postfix);
++      return env->type_str_buf;
++}
+ 
+ static char slot_type_char[] = {
+       [STACK_INVALID] = '?',
+@@ -628,7 +628,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
+                       continue;
+               verbose(env, " R%d", i);
+               print_liveness(env, reg->live);
+-              verbose(env, "=%s", reg_type_str[t]);
++              verbose(env, "=%s", reg_type_str(env, t));
+               if (t == SCALAR_VALUE && reg->precise)
+                       verbose(env, "P");
+               if ((t == SCALAR_VALUE || t == PTR_TO_STACK) &&
+@@ -636,9 +636,8 @@ static void print_verifier_state(struct bpf_verifier_env *env,
+                       /* reg->off should be 0 for SCALAR_VALUE */
+                       verbose(env, "%lld", reg->var_off.value + reg->off);
+               } else {
+-                      if (t == PTR_TO_BTF_ID ||
+-                          t == PTR_TO_BTF_ID_OR_NULL ||
+-                          t == PTR_TO_PERCPU_BTF_ID)
++                      if (base_type(t) == PTR_TO_BTF_ID ||
++                          base_type(t) == PTR_TO_PERCPU_BTF_ID)
+                               verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id));
+                       verbose(env, "(id=%d", reg->id);
+                       if (reg_type_may_be_refcounted_or_null(t))
+@@ -647,10 +646,9 @@ static void print_verifier_state(struct bpf_verifier_env *env,
+                               verbose(env, ",off=%d", reg->off);
+                       if (type_is_pkt_pointer(t))
+                               verbose(env, ",r=%d", reg->range);
+-                      else if (t == CONST_PTR_TO_MAP ||
+-                               t == PTR_TO_MAP_KEY ||
+-                               t == PTR_TO_MAP_VALUE ||
+-                               t == PTR_TO_MAP_VALUE_OR_NULL)
++                      else if (base_type(t) == CONST_PTR_TO_MAP ||
++                               base_type(t) == PTR_TO_MAP_KEY ||
++                               base_type(t) == PTR_TO_MAP_VALUE)
+                               verbose(env, ",ks=%d,vs=%d",
+                                       reg->map_ptr->key_size,
+                                       reg->map_ptr->value_size);
+@@ -720,7 +718,7 @@ static void print_verifier_state(struct bpf_verifier_env *env,
+               if (state->stack[i].slot_type[0] == STACK_SPILL) {
+                       reg = &state->stack[i].spilled_ptr;
+                       t = reg->type;
+-                      verbose(env, "=%s", reg_type_str[t]);
++                      verbose(env, "=%s", reg_type_str(env, t));
+                       if (t == SCALAR_VALUE && reg->precise)
+                               verbose(env, "P");
+                       if (t == SCALAR_VALUE && tnum_is_const(reg->var_off))
+@@ -1133,8 +1131,7 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env,
+ 
+ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
+ {
+-      switch (reg->type) {
+-      case PTR_TO_MAP_VALUE_OR_NULL: {
++      if (base_type(reg->type) == PTR_TO_MAP_VALUE) {
+               const struct bpf_map *map = reg->map_ptr;
+ 
+               if (map->inner_map_meta) {
+@@ -1153,32 +1150,10 @@ static void mark_ptr_not_null_reg(struct bpf_reg_state *reg)
+               } else {
+                       reg->type = PTR_TO_MAP_VALUE;
+               }
+-              break;
+-      }
+-      case PTR_TO_SOCKET_OR_NULL:
+-              reg->type = PTR_TO_SOCKET;
+-              break;
+-      case PTR_TO_SOCK_COMMON_OR_NULL:
+-              reg->type = PTR_TO_SOCK_COMMON;
+-              break;
+-      case PTR_TO_TCP_SOCK_OR_NULL:
+-              reg->type = PTR_TO_TCP_SOCK;
+-              break;
+-      case PTR_TO_BTF_ID_OR_NULL:
+-              reg->type = PTR_TO_BTF_ID;
+-              break;
+-      case PTR_TO_MEM_OR_NULL:
+-              reg->type = PTR_TO_MEM;
+-              break;
+-      case PTR_TO_RDONLY_BUF_OR_NULL:
+-              reg->type = PTR_TO_RDONLY_BUF;
+-              break;
+-      case PTR_TO_RDWR_BUF_OR_NULL:
+-              reg->type = PTR_TO_RDWR_BUF;
+-              break;
+-      default:
+-              WARN_ONCE(1, "unknown nullable register type");
++              return;
+       }
++
++      reg->type &= ~PTR_MAYBE_NULL;
+ }
+ 
+ static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg)
+@@ -1906,7 +1881,7 @@ static int mark_reg_read(struct bpf_verifier_env *env,
+                       break;
+               if (parent->live & REG_LIVE_DONE) {
+                       verbose(env, "verifier BUG type %s var_off %lld off %d\n",
+-                              reg_type_str[parent->type],
++                              reg_type_str(env, parent->type),
+                               parent->var_off.value, parent->off);
+                       return -EFAULT;
+               }
+@@ -2564,9 +2539,8 @@ static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi)
+ 
+ static bool is_spillable_regtype(enum bpf_reg_type type)
+ {
+-      switch (type) {
++      switch (base_type(type)) {
+       case PTR_TO_MAP_VALUE:
+-      case PTR_TO_MAP_VALUE_OR_NULL:
+       case PTR_TO_STACK:
+       case PTR_TO_CTX:
+       case PTR_TO_PACKET:
+@@ -2575,21 +2549,13 @@ static bool is_spillable_regtype(enum bpf_reg_type type)
+       case PTR_TO_FLOW_KEYS:
+       case CONST_PTR_TO_MAP:
+       case PTR_TO_SOCKET:
+-      case PTR_TO_SOCKET_OR_NULL:
+       case PTR_TO_SOCK_COMMON:
+-      case PTR_TO_SOCK_COMMON_OR_NULL:
+       case PTR_TO_TCP_SOCK:
+-      case PTR_TO_TCP_SOCK_OR_NULL:
+       case PTR_TO_XDP_SOCK:
+       case PTR_TO_BTF_ID:
+-      case PTR_TO_BTF_ID_OR_NULL:
+-      case PTR_TO_RDONLY_BUF:
+-      case PTR_TO_RDONLY_BUF_OR_NULL:
+-      case PTR_TO_RDWR_BUF:
+-      case PTR_TO_RDWR_BUF_OR_NULL:
++      case PTR_TO_BUF:
+       case PTR_TO_PERCPU_BTF_ID:
+       case PTR_TO_MEM:
+-      case PTR_TO_MEM_OR_NULL:
+       case PTR_TO_FUNC:
+       case PTR_TO_MAP_KEY:
+               return true;
+@@ -3405,7 +3371,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off,
+                */
+               *reg_type = info.reg_type;
+ 
+-              if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) {
++              if (base_type(*reg_type) == PTR_TO_BTF_ID) {
+                       *btf = info.btf;
+                       *btf_id = info.btf_id;
+               } else {
+@@ -3473,7 +3439,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx,
+       }
+ 
+       verbose(env, "R%d invalid %s access off=%d size=%d\n",
+-              regno, reg_type_str[reg->type], off, size);
++              regno, reg_type_str(env, reg->type), off, size);
+ 
+       return -EACCES;
+ }
+@@ -4200,15 +4166,30 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
+                               mark_reg_unknown(env, regs, value_regno);
+                       }
+               }
+-      } else if (reg->type == PTR_TO_MEM) {
++      } else if (base_type(reg->type) == PTR_TO_MEM) {
++              bool rdonly_mem = type_is_rdonly_mem(reg->type);
++
++              if (type_may_be_null(reg->type)) {
++                      verbose(env, "R%d invalid mem access '%s'\n", regno,
++                              reg_type_str(env, reg->type));
++                      return -EACCES;
++              }
++
++              if (t == BPF_WRITE && rdonly_mem) {
++                      verbose(env, "R%d cannot write into %s\n",
++                              regno, reg_type_str(env, reg->type));
++                      return -EACCES;
++              }
++
+               if (t == BPF_WRITE && value_regno >= 0 &&
+                   is_pointer_value(env, value_regno)) {
+                       verbose(env, "R%d leaks addr into mem\n", value_regno);
+                       return -EACCES;
+               }
++
+               err = check_mem_region_access(env, regno, off, size,
+                                             reg->mem_size, false);
+-              if (!err && t == BPF_READ && value_regno >= 0)
++              if (!err && value_regno >= 0 && (t == BPF_READ || rdonly_mem))
+                       mark_reg_unknown(env, regs, value_regno);
+       } else if (reg->type == PTR_TO_CTX) {
+               enum bpf_reg_type reg_type = SCALAR_VALUE;
+@@ -4238,7 +4219,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
+                       } else {
+                               mark_reg_known_zero(env, regs,
+                                                   value_regno);
+-                              if (reg_type_may_be_null(reg_type))
++                              if (type_may_be_null(reg_type))
+                                       regs[value_regno].id = ++env->id_gen;
+                               /* A load of ctx field could have different
+                                * actual load size with the one encoded in the
+@@ -4246,8 +4227,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
+                                * a sub-register.
+                                */
+                               regs[value_regno].subreg_def = DEF_NOT_SUBREG;
+-                              if (reg_type == PTR_TO_BTF_ID ||
+-                                  reg_type == PTR_TO_BTF_ID_OR_NULL) {
++                              if (base_type(reg_type) == PTR_TO_BTF_ID) {
+                                       regs[value_regno].btf = btf;
+                                       regs[value_regno].btf_id = btf_id;
+                               }
+@@ -4300,7 +4280,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
+       } else if (type_is_sk_pointer(reg->type)) {
+               if (t == BPF_WRITE) {
+                       verbose(env, "R%d cannot write into %s\n",
+-                              regno, reg_type_str[reg->type]);
++                              regno, reg_type_str(env, reg->type));
+                       return -EACCES;
+               }
+               err = check_sock_access(env, insn_idx, regno, off, size, t);
+@@ -4316,26 +4296,32 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
+       } else if (reg->type == CONST_PTR_TO_MAP) {
+               err = check_ptr_to_map_access(env, regs, regno, off, size, t,
+                                             value_regno);
+-      } else if (reg->type == PTR_TO_RDONLY_BUF) {
+-              if (t == BPF_WRITE) {
+-                      verbose(env, "R%d cannot write into %s\n",
+-                              regno, reg_type_str[reg->type]);
+-                      return -EACCES;
++      } else if (base_type(reg->type) == PTR_TO_BUF) {
++              bool rdonly_mem = type_is_rdonly_mem(reg->type);
++              const char *buf_info;
++              u32 *max_access;
++
++              if (rdonly_mem) {
++                      if (t == BPF_WRITE) {
++                              verbose(env, "R%d cannot write into %s\n",
++                                      regno, reg_type_str(env, reg->type));
++                              return -EACCES;
++                      }
++                      buf_info = "rdonly";
++                      max_access = &env->prog->aux->max_rdonly_access;
++              } else {
++                      buf_info = "rdwr";
++                      max_access = &env->prog->aux->max_rdwr_access;
+               }
++
+               err = check_buffer_access(env, reg, regno, off, size, false,
+-                                        "rdonly",
+-                                        &env->prog->aux->max_rdonly_access);
+-              if (!err && value_regno >= 0)
+-                      mark_reg_unknown(env, regs, value_regno);
+-      } else if (reg->type == PTR_TO_RDWR_BUF) {
+-              err = check_buffer_access(env, reg, regno, off, size, false,
+-                                        "rdwr",
+-                                        &env->prog->aux->max_rdwr_access);
+-              if (!err && t == BPF_READ && value_regno >= 0)
++                                        buf_info, max_access);
++
++              if (!err && value_regno >= 0 && (rdonly_mem || t == BPF_READ))
+                       mark_reg_unknown(env, regs, value_regno);
+       } else {
+               verbose(env, "R%d invalid mem access '%s'\n", regno,
+-                      reg_type_str[reg->type]);
++                      reg_type_str(env, reg->type));
+               return -EACCES;
+       }
+ 
+@@ -4409,7 +4395,7 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i
+           is_sk_reg(env, insn->dst_reg)) {
+               verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n",
+                       insn->dst_reg,
+-                      reg_type_str[reg_state(env, insn->dst_reg)->type]);
++                      reg_type_str(env, reg_state(env, insn->dst_reg)->type));
+               return -EACCES;
+       }
+ 
+@@ -4592,8 +4578,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
+                                  struct bpf_call_arg_meta *meta)
+ {
+       struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
++      const char *buf_info;
++      u32 *max_access;
+ 
+-      switch (reg->type) {
++      switch (base_type(reg->type)) {
+       case PTR_TO_PACKET:
+       case PTR_TO_PACKET_META:
+               return check_packet_access(env, regno, reg->off, access_size,
+@@ -4612,18 +4600,20 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
+               return check_mem_region_access(env, regno, reg->off,
+                                              access_size, reg->mem_size,
+                                              zero_size_allowed);
+-      case PTR_TO_RDONLY_BUF:
+-              if (meta && meta->raw_mode)
+-                      return -EACCES;
+-              return check_buffer_access(env, reg, regno, reg->off,
+-                                         access_size, zero_size_allowed,
+-                                         "rdonly",
+-                                         &env->prog->aux->max_rdonly_access);
+-      case PTR_TO_RDWR_BUF:
++      case PTR_TO_BUF:
++              if (type_is_rdonly_mem(reg->type)) {
++                      if (meta && meta->raw_mode)
++                              return -EACCES;
++
++                      buf_info = "rdonly";
++                      max_access = &env->prog->aux->max_rdonly_access;
++              } else {
++                      buf_info = "rdwr";
++                      max_access = &env->prog->aux->max_rdwr_access;
++              }
+               return check_buffer_access(env, reg, regno, reg->off,
+                                          access_size, zero_size_allowed,
+-                                         "rdwr",
+-                                         &env->prog->aux->max_rdwr_access);
++                                         buf_info, max_access);
+       case PTR_TO_STACK:
+               return check_stack_range_initialized(
+                               env,
+@@ -4635,9 +4625,9 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
+                   register_is_null(reg))
+                       return 0;
+ 
+-              verbose(env, "R%d type=%s expected=%s\n", regno,
+-                      reg_type_str[reg->type],
+-                      reg_type_str[PTR_TO_STACK]);
++              verbose(env, "R%d type=%s ", regno,
++                      reg_type_str(env, reg->type));
++              verbose(env, "expected=%s\n", reg_type_str(env, PTR_TO_STACK));
+               return -EACCES;
+       }
+ }
+@@ -4648,7 +4638,7 @@ int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg,
+       if (register_is_null(reg))
+               return 0;
+ 
+-      if (reg_type_may_be_null(reg->type)) {
++      if (type_may_be_null(reg->type)) {
+              /* Assuming that the register contains a value check if the memory
+               * access is safe. Temporarily save and restore the register's state as
+                * the conversion shouldn't be visible to a caller.
+@@ -4796,9 +4786,8 @@ static int process_timer_func(struct bpf_verifier_env *env, int regno,
+ 
+ static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
+ {
+-      return type == ARG_PTR_TO_MEM ||
+-             type == ARG_PTR_TO_MEM_OR_NULL ||
+-             type == ARG_PTR_TO_UNINIT_MEM;
++      return base_type(type) == ARG_PTR_TO_MEM ||
++             base_type(type) == ARG_PTR_TO_UNINIT_MEM;
+ }
+ 
+ static bool arg_type_is_mem_size(enum bpf_arg_type type)
+@@ -4900,8 +4889,7 @@ static const struct bpf_reg_types mem_types = {
+               PTR_TO_MAP_KEY,
+               PTR_TO_MAP_VALUE,
+               PTR_TO_MEM,
+-              PTR_TO_RDONLY_BUF,
+-              PTR_TO_RDWR_BUF,
++              PTR_TO_BUF,
+       },
+ };
+ 
+@@ -4932,31 +4920,26 @@ static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
+       [ARG_PTR_TO_MAP_KEY]            = &map_key_value_types,
+       [ARG_PTR_TO_MAP_VALUE]          = &map_key_value_types,
+       [ARG_PTR_TO_UNINIT_MAP_VALUE]   = &map_key_value_types,
+-      [ARG_PTR_TO_MAP_VALUE_OR_NULL]  = &map_key_value_types,
+       [ARG_CONST_SIZE]                = &scalar_types,
+       [ARG_CONST_SIZE_OR_ZERO]        = &scalar_types,
+       [ARG_CONST_ALLOC_SIZE_OR_ZERO]  = &scalar_types,
+       [ARG_CONST_MAP_PTR]             = &const_map_ptr_types,
+       [ARG_PTR_TO_CTX]                = &context_types,
+-      [ARG_PTR_TO_CTX_OR_NULL]        = &context_types,
+       [ARG_PTR_TO_SOCK_COMMON]        = &sock_types,
+ #ifdef CONFIG_NET
+       [ARG_PTR_TO_BTF_ID_SOCK_COMMON] = &btf_id_sock_common_types,
+ #endif
+       [ARG_PTR_TO_SOCKET]             = &fullsock_types,
+-      [ARG_PTR_TO_SOCKET_OR_NULL]     = &fullsock_types,
+       [ARG_PTR_TO_BTF_ID]             = &btf_ptr_types,
+       [ARG_PTR_TO_SPIN_LOCK]          = &spin_lock_types,
+       [ARG_PTR_TO_MEM]                = &mem_types,
+-      [ARG_PTR_TO_MEM_OR_NULL]        = &mem_types,
+       [ARG_PTR_TO_UNINIT_MEM]         = &mem_types,
+       [ARG_PTR_TO_ALLOC_MEM]          = &alloc_mem_types,
+-      [ARG_PTR_TO_ALLOC_MEM_OR_NULL]  = &alloc_mem_types,
+       [ARG_PTR_TO_INT]                = &int_ptr_types,
+       [ARG_PTR_TO_LONG]               = &int_ptr_types,
+       [ARG_PTR_TO_PERCPU_BTF_ID]      = &percpu_btf_ptr_types,
+       [ARG_PTR_TO_FUNC]               = &func_ptr_types,
+-      [ARG_PTR_TO_STACK_OR_NULL]      = &stack_ptr_types,
++      [ARG_PTR_TO_STACK]              = &stack_ptr_types,
+       [ARG_PTR_TO_CONST_STR]          = &const_str_ptr_types,
+       [ARG_PTR_TO_TIMER]              = &timer_types,
+ };
+@@ -4970,12 +4953,27 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
+       const struct bpf_reg_types *compatible;
+       int i, j;
+ 
+-      compatible = compatible_reg_types[arg_type];
++      compatible = compatible_reg_types[base_type(arg_type)];
+       if (!compatible) {
+              verbose(env, "verifier internal error: unsupported arg type %d\n", arg_type);
+               return -EFAULT;
+       }
+ 
++      /* ARG_PTR_TO_MEM + RDONLY is compatible with PTR_TO_MEM and PTR_TO_MEM + RDONLY,
++       * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM and NOT with PTR_TO_MEM + RDONLY
++       *
++       * Same for MAYBE_NULL:
++       *
++       * ARG_PTR_TO_MEM + MAYBE_NULL is compatible with PTR_TO_MEM and PTR_TO_MEM + MAYBE_NULL,
++       * but ARG_PTR_TO_MEM is compatible only with PTR_TO_MEM but NOT with PTR_TO_MEM + MAYBE_NULL
++       *
++       * Therefore we fold these flags depending on the arg_type before comparison.
++       */
++      if (arg_type & MEM_RDONLY)
++              type &= ~MEM_RDONLY;
++      if (arg_type & PTR_MAYBE_NULL)
++              type &= ~PTR_MAYBE_NULL;
++
+       for (i = 0; i < ARRAY_SIZE(compatible->types); i++) {
+               expected = compatible->types[i];
+               if (expected == NOT_INIT)
+@@ -4985,14 +4983,14 @@ static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
+                       goto found;
+       }
+ 
+-      verbose(env, "R%d type=%s expected=", regno, reg_type_str[type]);
++      verbose(env, "R%d type=%s expected=", regno, reg_type_str(env, reg->type));
+       for (j = 0; j + 1 < i; j++)
+-              verbose(env, "%s, ", reg_type_str[compatible->types[j]]);
+-      verbose(env, "%s\n", reg_type_str[compatible->types[j]]);
++              verbose(env, "%s, ", reg_type_str(env, compatible->types[j]));
++      verbose(env, "%s\n", reg_type_str(env, compatible->types[j]));
+       return -EACCES;
+ 
+ found:
+-      if (type == PTR_TO_BTF_ID) {
++      if (reg->type == PTR_TO_BTF_ID) {
+               if (!arg_btf_id) {
+                       if (!compatible->btf_id) {
+                              verbose(env, "verifier internal error: missing arg compatible BTF ID\n");
+@@ -5051,15 +5049,14 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
+               return -EACCES;
+       }
+ 
+-      if (arg_type == ARG_PTR_TO_MAP_VALUE ||
+-          arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE ||
+-          arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) {
++      if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE ||
++          base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) {
+               err = resolve_map_arg_type(env, meta, &arg_type);
+               if (err)
+                       return err;
+       }
+ 
+-      if (register_is_null(reg) && arg_type_may_be_null(arg_type))
++      if (register_is_null(reg) && type_may_be_null(arg_type))
+               /* A NULL register has a SCALAR_VALUE type, so skip
+                * type checking.
+                */
+@@ -5128,10 +5125,11 @@ skip_type_check:
+               err = check_helper_mem_access(env, regno,
+                                             meta->map_ptr->key_size, false,
+                                             NULL);
+-      } else if (arg_type == ARG_PTR_TO_MAP_VALUE ||
+-                 (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL &&
+-                  !register_is_null(reg)) ||
+-                 arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) {
++      } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE ||
++                 base_type(arg_type) == ARG_PTR_TO_UNINIT_MAP_VALUE) {
++              if (type_may_be_null(arg_type) && register_is_null(reg))
++                      return 0;
++
+               /* bpf_map_xxx(..., map_ptr, ..., value) call:
+                * check [value, value + map->value_size) validity
+                */
+@@ -6206,6 +6204,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
+                            int *insn_idx_p)
+ {
+       const struct bpf_func_proto *fn = NULL;
++      enum bpf_return_type ret_type;
++      enum bpf_type_flag ret_flag;
+       struct bpf_reg_state *regs;
+       struct bpf_call_arg_meta meta;
+       int insn_idx = *insn_idx_p;
+@@ -6339,13 +6339,14 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
+       regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+ 
+       /* update return register (already marked as written above) */
+-      if (fn->ret_type == RET_INTEGER) {
++      ret_type = fn->ret_type;
++      ret_flag = type_flag(fn->ret_type);
++      if (ret_type == RET_INTEGER) {
+               /* sets type to SCALAR_VALUE */
+               mark_reg_unknown(env, regs, BPF_REG_0);
+-      } else if (fn->ret_type == RET_VOID) {
++      } else if (ret_type == RET_VOID) {
+               regs[BPF_REG_0].type = NOT_INIT;
+-      } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL ||
+-                 fn->ret_type == RET_PTR_TO_MAP_VALUE) {
++      } else if (base_type(ret_type) == RET_PTR_TO_MAP_VALUE) {
+               /* There is no offset yet applied, variable or fixed */
+               mark_reg_known_zero(env, regs, BPF_REG_0);
+               /* remember map_ptr, so that check_map_access()
+@@ -6359,28 +6360,25 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
+               }
+               regs[BPF_REG_0].map_ptr = meta.map_ptr;
+               regs[BPF_REG_0].map_uid = meta.map_uid;
+-              if (fn->ret_type == RET_PTR_TO_MAP_VALUE) {
+-                      regs[BPF_REG_0].type = PTR_TO_MAP_VALUE;
+-                      if (map_value_has_spin_lock(meta.map_ptr))
+-                              regs[BPF_REG_0].id = ++env->id_gen;
+-              } else {
+-                      regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
++              regs[BPF_REG_0].type = PTR_TO_MAP_VALUE | ret_flag;
++              if (!type_may_be_null(ret_type) &&
++                  map_value_has_spin_lock(meta.map_ptr)) {
++                      regs[BPF_REG_0].id = ++env->id_gen;
+               }
+-      } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) {
++      } else if (base_type(ret_type) == RET_PTR_TO_SOCKET) {
+               mark_reg_known_zero(env, regs, BPF_REG_0);
+-              regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL;
+-      } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) {
++              regs[BPF_REG_0].type = PTR_TO_SOCKET | ret_flag;
++      } else if (base_type(ret_type) == RET_PTR_TO_SOCK_COMMON) {
+               mark_reg_known_zero(env, regs, BPF_REG_0);
+-              regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL;
+-      } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) {
++              regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON | ret_flag;
++      } else if (base_type(ret_type) == RET_PTR_TO_TCP_SOCK) {
+               mark_reg_known_zero(env, regs, BPF_REG_0);
+-              regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL;
+-      } else if (fn->ret_type == RET_PTR_TO_ALLOC_MEM_OR_NULL) {
++              regs[BPF_REG_0].type = PTR_TO_TCP_SOCK | ret_flag;
++      } else if (base_type(ret_type) == RET_PTR_TO_ALLOC_MEM) {
+               mark_reg_known_zero(env, regs, BPF_REG_0);
+-              regs[BPF_REG_0].type = PTR_TO_MEM_OR_NULL;
++              regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
+               regs[BPF_REG_0].mem_size = meta.mem_size;
+-      } else if (fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL ||
+-                 fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID) {
++      } else if (base_type(ret_type) == RET_PTR_TO_MEM_OR_BTF_ID) {
+               const struct btf_type *t;
+ 
+               mark_reg_known_zero(env, regs, BPF_REG_0);
+@@ -6398,29 +6396,30 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
+                                       tname, PTR_ERR(ret));
+                               return -EINVAL;
+                       }
+-                      regs[BPF_REG_0].type =
+-                              fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
+-                              PTR_TO_MEM : PTR_TO_MEM_OR_NULL;
++                      regs[BPF_REG_0].type = PTR_TO_MEM | ret_flag;
+                       regs[BPF_REG_0].mem_size = tsize;
+               } else {
+-                      regs[BPF_REG_0].type =
+-                              fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ?
+-                              PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL;
++                      /* MEM_RDONLY may be carried from ret_flag, but it
++                       * doesn't apply on PTR_TO_BTF_ID. Fold it, otherwise
++                       * it will confuse the check of PTR_TO_BTF_ID in
++                       * check_mem_access().
++                       */
++                      ret_flag &= ~MEM_RDONLY;
++
++                      regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
+                       regs[BPF_REG_0].btf = meta.ret_btf;
+                       regs[BPF_REG_0].btf_id = meta.ret_btf_id;
+               }
+-      } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL ||
+-                 fn->ret_type == RET_PTR_TO_BTF_ID) {
++      } else if (base_type(ret_type) == RET_PTR_TO_BTF_ID) {
+               int ret_btf_id;
+ 
+               mark_reg_known_zero(env, regs, BPF_REG_0);
+-              regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_BTF_ID ?
+-                                                   PTR_TO_BTF_ID :
+-                                                   PTR_TO_BTF_ID_OR_NULL;
++              regs[BPF_REG_0].type = PTR_TO_BTF_ID | ret_flag;
+               ret_btf_id = *fn->ret_btf_id;
+               if (ret_btf_id == 0) {
+-                      verbose(env, "invalid return type %d of func %s#%d\n",
+-                              fn->ret_type, func_id_name(func_id), func_id);
++                      verbose(env, "invalid return type %u of func %s#%d\n",
++                              base_type(ret_type), func_id_name(func_id),
++                              func_id);
+                       return -EINVAL;
+               }
+               /* current BPF helper definitions are only coming from
+@@ -6429,12 +6428,12 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
+               regs[BPF_REG_0].btf = btf_vmlinux;
+               regs[BPF_REG_0].btf_id = ret_btf_id;
+       } else {
+-              verbose(env, "unknown return type %d of func %s#%d\n",
+-                      fn->ret_type, func_id_name(func_id), func_id);
++              verbose(env, "unknown return type %u of func %s#%d\n",
++                      base_type(ret_type), func_id_name(func_id), func_id);
+               return -EINVAL;
+       }
+ 
+-      if (reg_type_may_be_null(regs[BPF_REG_0].type))
++      if (type_may_be_null(regs[BPF_REG_0].type))
+               regs[BPF_REG_0].id = ++env->id_gen;
+ 
+       if (is_ptr_cast_function(func_id)) {
+@@ -6633,25 +6632,25 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
+ 
+       if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) {
+              verbose(env, "math between %s pointer and %lld is not allowed\n",
+-                      reg_type_str[type], val);
++                      reg_type_str(env, type), val);
+               return false;
+       }
+ 
+       if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) {
+               verbose(env, "%s pointer offset %d is not allowed\n",
+-                      reg_type_str[type], reg->off);
++                      reg_type_str(env, type), reg->off);
+               return false;
+       }
+ 
+       if (smin == S64_MIN) {
+              verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n",
+-                      reg_type_str[type]);
++                      reg_type_str(env, type));
+               return false;
+       }
+ 
+       if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) {
+               verbose(env, "value %lld makes %s pointer be out of bounds\n",
+-                      smin, reg_type_str[type]);
++                      smin, reg_type_str(env, type));
+               return false;
+       }
+ 
+@@ -7028,11 +7027,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
+               return -EACCES;
+       }
+ 
+-      switch (ptr_reg->type) {
+-      case PTR_TO_MAP_VALUE_OR_NULL:
++      if (ptr_reg->type & PTR_MAYBE_NULL) {
+              verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n",
+-                      dst, reg_type_str[ptr_reg->type]);
++                      dst, reg_type_str(env, ptr_reg->type));
+               return -EACCES;
++      }
++
++      switch (base_type(ptr_reg->type)) {
+       case CONST_PTR_TO_MAP:
+               /* smin_val represents the known value */
+               if (known && smin_val == 0 && opcode == BPF_ADD)
+@@ -7045,10 +7046,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
+       case PTR_TO_XDP_SOCK:
+ reject:
+               verbose(env, "R%d pointer arithmetic on %s prohibited\n",
+-                      dst, reg_type_str[ptr_reg->type]);
++                      dst, reg_type_str(env, ptr_reg->type));
+               return -EACCES;
+       default:
+-              if (reg_type_may_be_null(ptr_reg->type))
++              if (type_may_be_null(ptr_reg->type))
+                       goto reject;
+               break;
+       }
+@@ -8770,7 +8771,7 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
+                                struct bpf_reg_state *reg, u32 id,
+                                bool is_null)
+ {
+-      if (reg_type_may_be_null(reg->type) && reg->id == id &&
++      if (type_may_be_null(reg->type) && reg->id == id &&
+           !WARN_ON_ONCE(!reg->id)) {
+               if (WARN_ON_ONCE(reg->smin_value || reg->smax_value ||
+                                !tnum_equals_const(reg->var_off, 0) ||
+@@ -9148,7 +9149,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
+        */
+       if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K &&
+           insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
+-          reg_type_may_be_null(dst_reg->type)) {
++          type_may_be_null(dst_reg->type)) {
+               /* Mark all identical registers in each branch as either
+                * safe or unknown depending R == 0 or R != 0 conditional.
+                */
+@@ -9207,7 +9208,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
+ 
+       if (insn->src_reg == BPF_PSEUDO_BTF_ID) {
+               dst_reg->type = aux->btf_var.reg_type;
+-              switch (dst_reg->type) {
++              switch (base_type(dst_reg->type)) {
+               case PTR_TO_MEM:
+                       dst_reg->mem_size = aux->btf_var.mem_size;
+                       break;
+@@ -9404,7 +9405,7 @@ static int check_return_code(struct bpf_verifier_env *env)
+               /* enforce return zero from async callbacks like timer */
+               if (reg->type != SCALAR_VALUE) {
+                      verbose(env, "In async callback the register R0 is not a known value (%s)\n",
+-                              reg_type_str[reg->type]);
++                              reg_type_str(env, reg->type));
+                       return -EINVAL;
+               }
+ 
+@@ -9418,7 +9419,7 @@ static int check_return_code(struct bpf_verifier_env *env)
+       if (is_subprog) {
+               if (reg->type != SCALAR_VALUE) {
+                      verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n",
+-                              reg_type_str[reg->type]);
++                              reg_type_str(env, reg->type));
+                       return -EINVAL;
+               }
+               return 0;
+@@ -9482,7 +9483,7 @@ static int check_return_code(struct bpf_verifier_env *env)
+ 
+       if (reg->type != SCALAR_VALUE) {
+              verbose(env, "At program exit the register R0 is not a known value (%s)\n",
+-                      reg_type_str[reg->type]);
++                      reg_type_str(env, reg->type));
+               return -EINVAL;
+       }
+ 
+@@ -10263,7 +10264,7 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
+               return true;
+       if (rcur->type == NOT_INIT)
+               return false;
+-      switch (rold->type) {
++      switch (base_type(rold->type)) {
+       case SCALAR_VALUE:
+               if (env->explore_alu_limits)
+                       return false;
+@@ -10285,6 +10286,22 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
+               }
+       case PTR_TO_MAP_KEY:
+       case PTR_TO_MAP_VALUE:
++              /* a PTR_TO_MAP_VALUE could be safe to use as a
++               * PTR_TO_MAP_VALUE_OR_NULL into the same map.
++               * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
++               * checked, doing so could have affected others with the same
++               * id, and we can't check for that because we lost the id when
++               * we converted to a PTR_TO_MAP_VALUE.
++               */
++              if (type_may_be_null(rold->type)) {
++                      if (!type_may_be_null(rcur->type))
++                              return false;
++                      if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
++                              return false;
++                      /* Check our ids match any regs they're supposed to */
++                      return check_ids(rold->id, rcur->id, idmap);
++              }
++
+               /* If the new min/max/var_off satisfy the old ones and
+                * everything else matches, we are OK.
+                * 'id' is not compared, since it's only used for maps with
+@@ -10296,20 +10313,6 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
+              return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
+                      range_within(rold, rcur) &&
+                      tnum_in(rold->var_off, rcur->var_off);
+-      case PTR_TO_MAP_VALUE_OR_NULL:
+-              /* a PTR_TO_MAP_VALUE could be safe to use as a
+-               * PTR_TO_MAP_VALUE_OR_NULL into the same map.
+-               * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL-
+-               * checked, doing so could have affected others with the same
+-               * id, and we can't check for that because we lost the id when
+-               * we converted to a PTR_TO_MAP_VALUE.
+-               */
+-              if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL)
+-                      return false;
+-              if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)))
+-                      return false;
+-              /* Check our ids match any regs they're supposed to */
+-              return check_ids(rold->id, rcur->id, idmap);
+       case PTR_TO_PACKET_META:
+       case PTR_TO_PACKET:
+               if (rcur->type != rold->type)
+@@ -10338,11 +10341,8 @@ static bool regsafe(struct bpf_verifier_env *env, struct bpf_reg_state *rold,
+       case PTR_TO_PACKET_END:
+       case PTR_TO_FLOW_KEYS:
+       case PTR_TO_SOCKET:
+-      case PTR_TO_SOCKET_OR_NULL:
+       case PTR_TO_SOCK_COMMON:
+-      case PTR_TO_SOCK_COMMON_OR_NULL:
+       case PTR_TO_TCP_SOCK:
+-      case PTR_TO_TCP_SOCK_OR_NULL:
+       case PTR_TO_XDP_SOCK:
+               /* Only valid matches are exact, which memcmp() above
+                * would have accepted
+@@ -10868,17 +10868,13 @@ next:
+ /* Return true if it's OK to have the same insn return a different type. */
+ static bool reg_type_mismatch_ok(enum bpf_reg_type type)
+ {
+-      switch (type) {
++      switch (base_type(type)) {
+       case PTR_TO_CTX:
+       case PTR_TO_SOCKET:
+-      case PTR_TO_SOCKET_OR_NULL:
+       case PTR_TO_SOCK_COMMON:
+-      case PTR_TO_SOCK_COMMON_OR_NULL:
+       case PTR_TO_TCP_SOCK:
+-      case PTR_TO_TCP_SOCK_OR_NULL:
+       case PTR_TO_XDP_SOCK:
+       case PTR_TO_BTF_ID:
+-      case PTR_TO_BTF_ID_OR_NULL:
+               return false;
+       default:
+               return true;
+@@ -11102,7 +11098,7 @@ static int do_check(struct bpf_verifier_env *env)
+                       if (is_ctx_reg(env, insn->dst_reg)) {
+                              verbose(env, "BPF_ST stores into R%d %s is not allowed\n",
+                                       insn->dst_reg,
+-                                      reg_type_str[reg_state(env, insn->dst_reg)->type]);
++                                      reg_type_str(env, reg_state(env, insn->dst_reg)->type));
+                               return -EACCES;
+                       }
+ 
+@@ -11353,7 +11349,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env,
+                       err = -EINVAL;
+                       goto err_put;
+               }
+-              aux->btf_var.reg_type = PTR_TO_MEM;
++              aux->btf_var.reg_type = PTR_TO_MEM | MEM_RDONLY;
+               aux->btf_var.mem_size = tsize;
+       } else {
+               aux->btf_var.reg_type = PTR_TO_BTF_ID;
+@@ -13175,7 +13171,7 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog)
+                               mark_reg_known_zero(env, regs, i);
+                       else if (regs[i].type == SCALAR_VALUE)
+                               mark_reg_unknown(env, regs, i);
+-                      else if (regs[i].type == PTR_TO_MEM_OR_NULL) {
++                      else if (base_type(regs[i].type) == PTR_TO_MEM) {
+                               const u32 mem_size = regs[i].mem_size;
+ 
+                               mark_reg_known_zero(env, regs, i);
+diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
+index 5a18b861fcf75..c289010b0964e 100644
+--- a/kernel/trace/bpf_trace.c
++++ b/kernel/trace/bpf_trace.c
+@@ -345,7 +345,7 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = {
+       .gpl_only       = true,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_ANYTHING,
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE,
+ };
+ 
+@@ -394,7 +394,7 @@ static const struct bpf_func_proto bpf_trace_printk_proto = {
+       .func           = bpf_trace_printk,
+       .gpl_only       = true,
+       .ret_type       = RET_INTEGER,
+-      .arg1_type      = ARG_PTR_TO_MEM,
++      .arg1_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg2_type      = ARG_CONST_SIZE,
+ };
+ 
+@@ -446,9 +446,9 @@ static const struct bpf_func_proto bpf_seq_printf_proto = {
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_BTF_ID,
+       .arg1_btf_id    = &btf_seq_file_ids[0],
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE,
+-      .arg4_type      = ARG_PTR_TO_MEM_OR_NULL,
++      .arg4_type      = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
+       .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
+ };
+ 
+@@ -463,7 +463,7 @@ static const struct bpf_func_proto bpf_seq_write_proto = {
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_BTF_ID,
+       .arg1_btf_id    = &btf_seq_file_ids[0],
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
+ };
+ 
+@@ -487,7 +487,7 @@ static const struct bpf_func_proto bpf_seq_printf_btf_proto = {
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_BTF_ID,
+       .arg1_btf_id    = &btf_seq_file_ids[0],
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
+       .arg4_type      = ARG_ANYTHING,
+ };
+@@ -648,7 +648,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_ANYTHING,
+-      .arg4_type      = ARG_PTR_TO_MEM,
++      .arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
+ };
+ 
+@@ -958,7 +958,7 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = {
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_MEM,
+       .arg2_type      = ARG_CONST_SIZE,
+-      .arg3_type      = ARG_PTR_TO_MEM,
++      .arg3_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg4_type      = ARG_CONST_SIZE,
+       .arg5_type      = ARG_ANYTHING,
+ };
+@@ -1207,7 +1207,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_tp = {
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_ANYTHING,
+-      .arg4_type      = ARG_PTR_TO_MEM,
++      .arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
+ };
+ 
+@@ -1429,7 +1429,7 @@ static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = {
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_ANYTHING,
+-      .arg4_type      = ARG_PTR_TO_MEM,
++      .arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
+ };
+ 
+@@ -1483,7 +1483,7 @@ static const struct bpf_func_proto bpf_get_stack_proto_raw_tp = {
+       .gpl_only       = true,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
+       .arg4_type      = ARG_ANYTHING,
+ };
+diff --git a/lib/iov_iter.c b/lib/iov_iter.c
+index c5b2f0f4b8a84..6d146f77601d7 100644
+--- a/lib/iov_iter.c
++++ b/lib/iov_iter.c
+@@ -191,7 +191,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t b
+       buf = iov->iov_base + skip;
+       copy = min(bytes, iov->iov_len - skip);
+ 
+-      if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_writeable(buf, copy)) {
++      if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_writeable(buf, copy)) {
+               kaddr = kmap_atomic(page);
+               from = kaddr + offset;
+ 
+@@ -275,7 +275,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t
+       buf = iov->iov_base + skip;
+       copy = min(bytes, iov->iov_len - skip);
+ 
+-      if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_pages_readable(buf, copy)) {
++      if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_readable(buf, copy)) {
+               kaddr = kmap_atomic(page);
+               to = kaddr + offset;
+ 
+@@ -431,35 +431,81 @@ out:
+ }
+ 
+ /*
++ * fault_in_iov_iter_readable - fault in iov iterator for reading
++ * @i: iterator
++ * @size: maximum length
++ *
+  * Fault in one or more iovecs of the given iov_iter, to a maximum length of
+- * bytes.  For each iovec, fault in each page that constitutes the iovec.
++ * @size.  For each iovec, fault in each page that constitutes the iovec.
++ *
++ * Returns the number of bytes not faulted in (like copy_to_user() and
++ * copy_from_user()).
+  *
+- * Return 0 on success, or non-zero if the memory could not be accessed (i.e.
+- * because it is an invalid address).
++ * Always returns 0 for non-userspace iterators.
+  */
+-int iov_iter_fault_in_readable(const struct iov_iter *i, size_t bytes)
++size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size)
+ {
+       if (iter_is_iovec(i)) {
++              size_t count = min(size, iov_iter_count(i));
+               const struct iovec *p;
+               size_t skip;
+ 
+-              if (bytes > i->count)
+-                      bytes = i->count;
+-              for (p = i->iov, skip = i->iov_offset; bytes; p++, skip = 0) {
+-                      size_t len = min(bytes, p->iov_len - skip);
+-                      int err;
++              size -= count;
++              for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
++                      size_t len = min(count, p->iov_len - skip);
++                      size_t ret;
+ 
+                       if (unlikely(!len))
+                               continue;
+-                      err = fault_in_pages_readable(p->iov_base + skip, len);
+-                      if (unlikely(err))
+-                              return err;
+-                      bytes -= len;
++                      ret = fault_in_readable(p->iov_base + skip, len);
++                      count -= len - ret;
++                      if (ret)
++                              break;
+               }
++              return count + size;
+       }
+       return 0;
+ }
+-EXPORT_SYMBOL(iov_iter_fault_in_readable);
++EXPORT_SYMBOL(fault_in_iov_iter_readable);
++
++/*
++ * fault_in_iov_iter_writeable - fault in iov iterator for writing
++ * @i: iterator
++ * @size: maximum length
++ *
++ * Faults in the iterator using get_user_pages(), i.e., without triggering
++ * hardware page faults.  This is primarily useful when we already know that
++ * some or all of the pages in @i aren't in memory.
++ *
++ * Returns the number of bytes not faulted in, like copy_to_user() and
++ * copy_from_user().
++ *
++ * Always returns 0 for non-user-space iterators.
++ */
++size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size)
++{
++      if (iter_is_iovec(i)) {
++              size_t count = min(size, iov_iter_count(i));
++              const struct iovec *p;
++              size_t skip;
++
++              size -= count;
++              for (p = i->iov, skip = i->iov_offset; count; p++, skip = 0) {
++                      size_t len = min(count, p->iov_len - skip);
++                      size_t ret;
++
++                      if (unlikely(!len))
++                              continue;
++                      ret = fault_in_safe_writeable(p->iov_base + skip, len);
++                      count -= len - ret;
++                      if (ret)
++                              break;
++              }
++              return count + size;
++      }
++      return 0;
++}
++EXPORT_SYMBOL(fault_in_iov_iter_writeable);
+ 
+ void iov_iter_init(struct iov_iter *i, unsigned int direction,
+                       const struct iovec *iov, unsigned long nr_segs,
+@@ -468,6 +514,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
+       WARN_ON(direction & ~(READ | WRITE));
+       *i = (struct iov_iter) {
+               .iter_type = ITER_IOVEC,
++              .nofault = false,
+               .data_source = direction,
+               .iov = iov,
+               .nr_segs = nr_segs,
+@@ -1483,13 +1530,17 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
+               return 0;
+ 
+       if (likely(iter_is_iovec(i))) {
++              unsigned int gup_flags = 0;
+               unsigned long addr;
+ 
++              if (iov_iter_rw(i) != WRITE)
++                      gup_flags |= FOLL_WRITE;
++              if (i->nofault)
++                      gup_flags |= FOLL_NOFAULT;
++
+               addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
+               n = DIV_ROUND_UP(len, PAGE_SIZE);
+-              res = get_user_pages_fast(addr, n,
+-                              iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0,
+-                              pages);
++              res = get_user_pages_fast(addr, n, gup_flags, pages);
+               if (unlikely(res <= 0))
+                       return res;
+               return (res == n ? len : res * PAGE_SIZE) - *start;
+@@ -1605,15 +1656,20 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
+               return 0;
+ 
+       if (likely(iter_is_iovec(i))) {
++              unsigned int gup_flags = 0;
+               unsigned long addr;
+ 
++              if (iov_iter_rw(i) != WRITE)
++                      gup_flags |= FOLL_WRITE;
++              if (i->nofault)
++                      gup_flags |= FOLL_NOFAULT;
++
+               addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
+               n = DIV_ROUND_UP(len, PAGE_SIZE);
+               p = get_pages_array(n);
+               if (!p)
+                       return -ENOMEM;
+-              res = get_user_pages_fast(addr, n,
+-                              iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0, p);
++              res = get_user_pages_fast(addr, n, gup_flags, p);
+               if (unlikely(res <= 0)) {
+                       kvfree(p);
+                       *pages = NULL;
+diff --git a/mm/filemap.c b/mm/filemap.c
+index 1293c3409e429..00e391e758801 100644
+--- a/mm/filemap.c
++++ b/mm/filemap.c
+@@ -90,7 +90,7 @@
+  *      ->lock_page           (filemap_fault, access_process_vm)
+  *
+  *  ->i_rwsem                 (generic_perform_write)
+- *    ->mmap_lock             (fault_in_pages_readable->do_page_fault)
++ *    ->mmap_lock             (fault_in_readable->do_page_fault)
+  *
+  *  bdi->wb.list_lock
+  *    sb_lock                 (fs/fs-writeback.c)
+@@ -3760,7 +3760,7 @@ again:
+                * same page as we're writing to, without it being marked
+                * up-to-date.
+                */
+-              if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
++              if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
+                       status = -EFAULT;
+                       break;
+               }
+diff --git a/mm/gup.c b/mm/gup.c
+index 52f08e3177e9f..ba2ab7a223f8e 100644
+--- a/mm/gup.c
++++ b/mm/gup.c
+@@ -943,6 +943,8 @@ static int faultin_page(struct vm_area_struct *vma,
+       /* mlock all present pages, but do not fault in new pages */
+       if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
+               return -ENOENT;
++      if (*flags & FOLL_NOFAULT)
++              return -EFAULT;
+       if (*flags & FOLL_WRITE)
+               fault_flags |= FAULT_FLAG_WRITE;
+       if (*flags & FOLL_REMOTE)
+@@ -1681,6 +1683,122 @@ finish_or_fault:
+ }
+ #endif /* !CONFIG_MMU */
+ 
++/**
++ * fault_in_writeable - fault in userspace address range for writing
++ * @uaddr: start of address range
++ * @size: size of address range
++ *
++ * Returns the number of bytes not faulted in (like copy_to_user() and
++ * copy_from_user()).
++ */
++size_t fault_in_writeable(char __user *uaddr, size_t size)
++{
++      char __user *start = uaddr, *end;
++
++      if (unlikely(size == 0))
++              return 0;
++      if (!PAGE_ALIGNED(uaddr)) {
++              if (unlikely(__put_user(0, uaddr) != 0))
++                      return size;
++              uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
++      }
++      end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
++      if (unlikely(end < start))
++              end = NULL;
++      while (uaddr != end) {
++              if (unlikely(__put_user(0, uaddr) != 0))
++                      goto out;
++              uaddr += PAGE_SIZE;
++      }
++
++out:
++      if (size > uaddr - start)
++              return size - (uaddr - start);
++      return 0;
++}
++EXPORT_SYMBOL(fault_in_writeable);
++
++/*
++ * fault_in_safe_writeable - fault in an address range for writing
++ * @uaddr: start of address range
++ * @size: length of address range
++ *
++ * Faults in an address range for writing.  This is primarily useful when we
++ * already know that some or all of the pages in the address range aren't in
++ * memory.
++ *
++ * Unlike fault_in_writeable(), this function is non-destructive.
++ *
++ * Note that we don't pin or otherwise hold the pages referenced that we fault
++ * in.  There's no guarantee that they'll stay in memory for any duration of
++ * time.
++ *
++ * Returns the number of bytes not faulted in, like copy_to_user() and
++ * copy_from_user().
++ */
++size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
++{
++      unsigned long start = (unsigned long)uaddr, end;
++      struct mm_struct *mm = current->mm;
++      bool unlocked = false;
++
++      if (unlikely(size == 0))
++              return 0;
++      end = PAGE_ALIGN(start + size);
++      if (end < start)
++              end = 0;
++
++      mmap_read_lock(mm);
++      do {
++              if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
++                      break;
++              start = (start + PAGE_SIZE) & PAGE_MASK;
++      } while (start != end);
++      mmap_read_unlock(mm);
++
++      if (size > (unsigned long)uaddr - start)
++              return size - ((unsigned long)uaddr - start);
++      return 0;
++}
++EXPORT_SYMBOL(fault_in_safe_writeable);
++
++/**
++ * fault_in_readable - fault in userspace address range for reading
++ * @uaddr: start of user address range
++ * @size: size of user address range
++ *
++ * Returns the number of bytes not faulted in (like copy_to_user() and
++ * copy_from_user()).
++ */
++size_t fault_in_readable(const char __user *uaddr, size_t size)
++{
++      const char __user *start = uaddr, *end;
++      volatile char c;
++
++      if (unlikely(size == 0))
++              return 0;
++      if (!PAGE_ALIGNED(uaddr)) {
++              if (unlikely(__get_user(c, uaddr) != 0))
++                      return size;
++              uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
++      }
++      end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
++      if (unlikely(end < start))
++              end = NULL;
++      while (uaddr != end) {
++              if (unlikely(__get_user(c, uaddr) != 0))
++                      goto out;
++              uaddr += PAGE_SIZE;
++      }
++
++out:
++      (void)c;
++      if (size > uaddr - start)
++              return size - (uaddr - start);
++      return 0;
++}
++EXPORT_SYMBOL(fault_in_readable);
++
+ /**
+  * get_dump_page() - pin user page in memory while writing it to core dump
+  * @addr: user address
+@@ -2733,7 +2851,7 @@ static int internal_get_user_pages_fast(unsigned long start,
+ 
+       if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
+                                      FOLL_FORCE | FOLL_PIN | FOLL_GET |
+-                                     FOLL_FAST_ONLY)))
++                                     FOLL_FAST_ONLY | FOLL_NOFAULT)))
+               return -EINVAL;
+ 
+       if (gup_flags & FOLL_PIN)
+diff --git a/mm/kfence/core.c b/mm/kfence/core.c
+index 86260e8f28302..66076d8742b78 100644
+--- a/mm/kfence/core.c
++++ b/mm/kfence/core.c
+@@ -528,6 +528,8 @@ static bool __init kfence_init_pool(void)
+        * enters __slab_free() slow-path.
+        */
+       for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
++              struct page *page = &pages[i];
++
+               if (!i || (i % 2))
+                       continue;
+ 
+@@ -535,7 +537,11 @@ static bool __init kfence_init_pool(void)
+               if (WARN_ON(compound_head(&pages[i]) != &pages[i]))
+                       goto err;
+ 
+-              __SetPageSlab(&pages[i]);
++              __SetPageSlab(page);
++#ifdef CONFIG_MEMCG
++              page->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
++                                 MEMCG_DATA_OBJCGS;
++#endif
+       }
+ 
+       /*
+@@ -911,6 +917,9 @@ void __kfence_free(void *addr)
+ {
+       struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
+ 
++#ifdef CONFIG_MEMCG
++      KFENCE_WARN_ON(meta->objcg);
++#endif
+       /*
+        * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing
+        * the object, as the object page may be recycled for other-typed
+diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
+index 92bf6eff6060d..600f2e2431d6d 100644
+--- a/mm/kfence/kfence.h
++++ b/mm/kfence/kfence.h
+@@ -89,6 +89,9 @@ struct kfence_metadata {
+       struct kfence_track free_track;
+       /* For updating alloc_covered on frees. */
+       u32 alloc_stack_hash;
++#ifdef CONFIG_MEMCG
++      struct obj_cgroup *objcg;
++#endif
+ };
+ 
+ extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
+index 68d2cbf8331ac..ea61dfe19c869 100644
+--- a/net/core/bpf_sk_storage.c
++++ b/net/core/bpf_sk_storage.c
+@@ -929,7 +929,7 @@ static struct bpf_iter_reg bpf_sk_storage_map_reg_info = {
+               { offsetof(struct bpf_iter__bpf_sk_storage_map, sk),
+                 PTR_TO_BTF_ID_OR_NULL },
+               { offsetof(struct bpf_iter__bpf_sk_storage_map, value),
+-                PTR_TO_RDWR_BUF_OR_NULL },
++                PTR_TO_BUF | PTR_MAYBE_NULL },
+       },
+       .seq_info               = &iter_seq_info,
+ };
+diff --git a/net/core/filter.c b/net/core/filter.c
+index cdd7e92db3030..821278b906b71 100644
+--- a/net/core/filter.c
++++ b/net/core/filter.c
+@@ -1713,7 +1713,7 @@ static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_ANYTHING,
+-      .arg3_type      = ARG_PTR_TO_MEM,
++      .arg3_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg4_type      = ARG_CONST_SIZE,
+       .arg5_type      = ARG_ANYTHING,
+ };
+@@ -2018,9 +2018,9 @@ static const struct bpf_func_proto bpf_csum_diff_proto = {
+       .gpl_only       = false,
+       .pkt_access     = true,
+       .ret_type       = RET_INTEGER,
+-      .arg1_type      = ARG_PTR_TO_MEM_OR_NULL,
++      .arg1_type      = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
+       .arg2_type      = ARG_CONST_SIZE_OR_ZERO,
+-      .arg3_type      = ARG_PTR_TO_MEM_OR_NULL,
++      .arg3_type      = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
+       .arg4_type      = ARG_CONST_SIZE_OR_ZERO,
+       .arg5_type      = ARG_ANYTHING,
+ };
+@@ -2541,7 +2541,7 @@ static const struct bpf_func_proto bpf_redirect_neigh_proto = {
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_ANYTHING,
+-      .arg2_type      = ARG_PTR_TO_MEM_OR_NULL,
++      .arg2_type      = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE_OR_ZERO,
+       .arg4_type      = ARG_ANYTHING,
+ };
+@@ -4177,7 +4177,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = {
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_ANYTHING,
+-      .arg4_type      = ARG_PTR_TO_MEM,
++      .arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
+ };
+ 
+@@ -4191,7 +4191,7 @@ const struct bpf_func_proto bpf_skb_output_proto = {
+       .arg1_btf_id    = &bpf_skb_output_btf_ids[0],
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_ANYTHING,
+-      .arg4_type      = ARG_PTR_TO_MEM,
++      .arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
+ };
+ 
+@@ -4374,7 +4374,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE,
+       .arg4_type      = ARG_ANYTHING,
+ };
+@@ -4400,7 +4400,7 @@ static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE,
+ };
+ 
+@@ -4570,7 +4570,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = {
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_ANYTHING,
+-      .arg4_type      = ARG_PTR_TO_MEM,
++      .arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
+ };
+ 
+@@ -4584,7 +4584,7 @@ const struct bpf_func_proto bpf_xdp_output_proto = {
+       .arg1_btf_id    = &bpf_xdp_output_btf_ids[0],
+       .arg2_type      = ARG_CONST_MAP_PTR,
+       .arg3_type      = ARG_ANYTHING,
+-      .arg4_type      = ARG_PTR_TO_MEM,
++      .arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg5_type      = ARG_CONST_SIZE_OR_ZERO,
+ };
+ 
+@@ -5072,7 +5072,7 @@ const struct bpf_func_proto bpf_sk_setsockopt_proto = {
+       .arg1_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+       .arg2_type      = ARG_ANYTHING,
+       .arg3_type      = ARG_ANYTHING,
+-      .arg4_type      = ARG_PTR_TO_MEM,
++      .arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg5_type      = ARG_CONST_SIZE,
+ };
+ 
+@@ -5106,7 +5106,7 @@ static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = {
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_ANYTHING,
+       .arg3_type      = ARG_ANYTHING,
+-      .arg4_type      = ARG_PTR_TO_MEM,
++      .arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg5_type      = ARG_CONST_SIZE,
+ };
+ 
+@@ -5140,7 +5140,7 @@ static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = {
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_ANYTHING,
+       .arg3_type      = ARG_ANYTHING,
+-      .arg4_type      = ARG_PTR_TO_MEM,
++      .arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg5_type      = ARG_CONST_SIZE,
+ };
+ 
+@@ -5315,7 +5315,7 @@ static const struct bpf_func_proto bpf_bind_proto = {
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE,
+ };
+ 
+@@ -5903,7 +5903,7 @@ static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_ANYTHING,
+-      .arg3_type      = ARG_PTR_TO_MEM,
++      .arg3_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg4_type      = ARG_CONST_SIZE
+ };
+ 
+@@ -5913,7 +5913,7 @@ static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_ANYTHING,
+-      .arg3_type      = ARG_PTR_TO_MEM,
++      .arg3_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg4_type      = ARG_CONST_SIZE
+ };
+ 
+@@ -5956,7 +5956,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_ANYTHING,
+-      .arg3_type      = ARG_PTR_TO_MEM,
++      .arg3_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg4_type      = ARG_CONST_SIZE
+ };
+ 
+@@ -6044,7 +6044,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+       .arg2_type      = ARG_ANYTHING,
+-      .arg3_type      = ARG_PTR_TO_MEM,
++      .arg3_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg4_type      = ARG_CONST_SIZE
+ };
+ 
+@@ -6269,7 +6269,7 @@ static const struct bpf_func_proto bpf_skc_lookup_tcp_proto = {
+       .pkt_access     = true,
+       .ret_type       = RET_PTR_TO_SOCK_COMMON_OR_NULL,
+       .arg1_type      = ARG_PTR_TO_CTX,
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE,
+       .arg4_type      = ARG_ANYTHING,
+       .arg5_type      = ARG_ANYTHING,
+@@ -6288,7 +6288,7 @@ static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = {
+       .pkt_access     = true,
+       .ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
+       .arg1_type      = ARG_PTR_TO_CTX,
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE,
+       .arg4_type      = ARG_ANYTHING,
+       .arg5_type      = ARG_ANYTHING,
+@@ -6307,7 +6307,7 @@ static const struct bpf_func_proto bpf_sk_lookup_udp_proto = {
+       .pkt_access     = true,
+       .ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
+       .arg1_type      = ARG_PTR_TO_CTX,
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE,
+       .arg4_type      = ARG_ANYTHING,
+       .arg5_type      = ARG_ANYTHING,
+@@ -6344,7 +6344,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_udp_proto = {
+       .pkt_access     = true,
+       .ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
+       .arg1_type      = ARG_PTR_TO_CTX,
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE,
+       .arg4_type      = ARG_ANYTHING,
+       .arg5_type      = ARG_ANYTHING,
+@@ -6367,7 +6367,7 @@ static const struct bpf_func_proto bpf_xdp_skc_lookup_tcp_proto = {
+       .pkt_access     = true,
+       .ret_type       = RET_PTR_TO_SOCK_COMMON_OR_NULL,
+       .arg1_type      = ARG_PTR_TO_CTX,
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE,
+       .arg4_type      = ARG_ANYTHING,
+       .arg5_type      = ARG_ANYTHING,
+@@ -6390,7 +6390,7 @@ static const struct bpf_func_proto bpf_xdp_sk_lookup_tcp_proto = {
+       .pkt_access     = true,
+       .ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
+       .arg1_type      = ARG_PTR_TO_CTX,
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE,
+       .arg4_type      = ARG_ANYTHING,
+       .arg5_type      = ARG_ANYTHING,
+@@ -6409,7 +6409,7 @@ static const struct bpf_func_proto bpf_sock_addr_skc_lookup_tcp_proto = {
+       .gpl_only       = false,
+       .ret_type       = RET_PTR_TO_SOCK_COMMON_OR_NULL,
+       .arg1_type      = ARG_PTR_TO_CTX,
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE,
+       .arg4_type      = ARG_ANYTHING,
+       .arg5_type      = ARG_ANYTHING,
+@@ -6428,7 +6428,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_tcp_proto = {
+       .gpl_only       = false,
+       .ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
+       .arg1_type      = ARG_PTR_TO_CTX,
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE,
+       .arg4_type      = ARG_ANYTHING,
+       .arg5_type      = ARG_ANYTHING,
+@@ -6447,7 +6447,7 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
+       .gpl_only       = false,
+       .ret_type       = RET_PTR_TO_SOCKET_OR_NULL,
+       .arg1_type      = ARG_PTR_TO_CTX,
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE,
+       .arg4_type      = ARG_ANYTHING,
+       .arg5_type      = ARG_ANYTHING,
+@@ -6769,9 +6769,9 @@ static const struct bpf_func_proto bpf_tcp_check_syncookie_proto = {
+       .pkt_access     = true,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE,
+-      .arg4_type      = ARG_PTR_TO_MEM,
++      .arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg5_type      = ARG_CONST_SIZE,
+ };
+ 
+@@ -6838,9 +6838,9 @@ static const struct bpf_func_proto bpf_tcp_gen_syncookie_proto = {
+       .pkt_access     = true,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE,
+-      .arg4_type      = ARG_PTR_TO_MEM,
++      .arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg5_type      = ARG_CONST_SIZE,
+ };
+ 
+@@ -7069,7 +7069,7 @@ static const struct bpf_func_proto bpf_sock_ops_store_hdr_opt_proto = {
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_CTX,
+-      .arg2_type      = ARG_PTR_TO_MEM,
++      .arg2_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg3_type      = ARG_CONST_SIZE,
+       .arg4_type      = ARG_ANYTHING,
+ };
+diff --git a/net/core/sock_map.c b/net/core/sock_map.c
+index 8288b5382f08d..6351b6af7aca9 100644
+--- a/net/core/sock_map.c
++++ b/net/core/sock_map.c
+@@ -1575,7 +1575,7 @@ static struct bpf_iter_reg sock_map_iter_reg = {
+       .ctx_arg_info_size      = 2,
+       .ctx_arg_info           = {
+               { offsetof(struct bpf_iter__sockmap, key),
+-                PTR_TO_RDONLY_BUF_OR_NULL },
++                PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY },
+               { offsetof(struct bpf_iter__sockmap, sk),
+                 PTR_TO_BTF_ID_OR_NULL },
+       },
+diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c
+index cf3acfa5a91d5..69455fe90ac3e 100644
+--- a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c
++++ b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c
+@@ -7,6 +7,7 @@
+ #include "test_ksyms_btf.skel.h"
+ #include "test_ksyms_btf_null_check.skel.h"
+ #include "test_ksyms_weak.skel.h"
++#include "test_ksyms_btf_write_check.skel.h"
+ 
+ static int duration;
+ 
+@@ -109,6 +110,16 @@ cleanup:
+       test_ksyms_weak__destroy(skel);
+ }
+ 
++static void test_write_check(void)
++{
++      struct test_ksyms_btf_write_check *skel;
++
++      skel = test_ksyms_btf_write_check__open_and_load();
++      ASSERT_ERR_PTR(skel, "unexpected load of a prog writing to ksym memory\n");
++
++      test_ksyms_btf_write_check__destroy(skel);
++}
++
+ void test_ksyms_btf(void)
+ {
+       int percpu_datasec;
+@@ -136,4 +147,7 @@ void test_ksyms_btf(void)
+ 
+       if (test__start_subtest("weak_ksyms"))
+               test_weak_syms();
++
++      if (test__start_subtest("write_check"))
++              test_write_check();
+ }
+diff --git a/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c b/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c
+new file mode 100644
+index 0000000000000..2180c41cd890f
+--- /dev/null
++++ b/tools/testing/selftests/bpf/progs/test_ksyms_btf_write_check.c
+@@ -0,0 +1,29 @@
++// SPDX-License-Identifier: GPL-2.0
++/* Copyright (c) 2021 Google */
++
++#include "vmlinux.h"
++
++#include <bpf/bpf_helpers.h>
++
++extern const int bpf_prog_active __ksym; /* int type global var. */
++
++SEC("raw_tp/sys_enter")
++int handler(const void *ctx)
++{
++      int *active;
++      __u32 cpu;
++
++      cpu = bpf_get_smp_processor_id();
++      active = (int *)bpf_per_cpu_ptr(&bpf_prog_active, cpu);
++      if (active) {
++              /* Kernel memory obtained from bpf_{per,this}_cpu_ptr
++               * is read-only, should _not_ pass verification.
++               */
++              /* WRITE_ONCE */
++              *(volatile int *)active = -1;
++      }
++
++      return 0;
++}
++
++char _license[] SEC("license") = "GPL";
+diff --git a/tools/testing/selftests/bpf/verifier/calls.c b/tools/testing/selftests/bpf/verifier/calls.c
+index 336a749673d19..2e701e7f69680 100644
+--- a/tools/testing/selftests/bpf/verifier/calls.c
++++ b/tools/testing/selftests/bpf/verifier/calls.c
+@@ -107,6 +107,25 @@
+       .result = REJECT,
+       .errstr = "R0 min value is outside of the allowed memory range",
+ },
++{
++      "calls: trigger reg2btf_ids[reg->type] for reg->type > __BPF_REG_TYPE_MAX",
++      .insns = {
++      BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
++      BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8),
++      BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 0),
++      BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
++      BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
++      BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, BPF_PSEUDO_KFUNC_CALL, 0, 0),
++      BPF_EXIT_INSN(),
++      },
++      .prog_type = BPF_PROG_TYPE_SCHED_CLS,
++      .result = REJECT,
++      .errstr = "arg#0 pointer type STRUCT prog_test_ref_kfunc must point",
++      .fixup_kfunc_btf_id = {
++              { "bpf_kfunc_call_test_acquire", 3 },
++              { "bpf_kfunc_call_test_release", 5 },
++      },
++},
+ {
+       "calls: overlapping caller/callee",
+       .insns = {
