[PATCH v4 17/28] afs: Print the operation debug_id when logging an unexpected data version

2021-03-10 Thread David Howells
Print the afs_operation debug_id when logging an unexpected change in the
data version.  This allows the logged message to be matched against
tracelines.
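
For illustration, the amended warning comes out along these lines (field
values hypothetical; the trailing op=%x is the afs_operation debug_id that
also shows up in the trace buffer):

    kAFS: vnode modified {100058:23860} 4fd26->4fd27 FS.FetchStatus (op=2f)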

Signed-off-by: David Howells 
cc: linux-...@lists.infradead.org
cc: linux-cach...@redhat.com
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/160588528377.3465195.2206051235095182302.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161118146111.1232039.11398082422487058312.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161042180.2537118.2471333561661033316.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340405772.1303470.3877167548944248214.st...@warthog.procyon.org.uk/
 # v3
---

 fs/afs/inode.c |5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 48519ee00eef..af6556bb3d6a 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -215,11 +215,12 @@ static void afs_apply_status(struct afs_operation *op,
 
if (vp->dv_before + vp->dv_delta != status->data_version) {
		if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
-			pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s\n",
+			pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s (op=%x)\n",
			vnode->fid.vid, vnode->fid.vnode,
			(unsigned long long)vp->dv_before + vp->dv_delta,
			(unsigned long long)status->data_version,
-   op->type ? op->type->name : "???");
+   op->type ? op->type->name : "???",
+   op->debug_id);
 
vnode->invalid_before = status->data_version;
if (vnode->status.type == AFS_FTYPE_DIR) {




[PATCH v4 15/28] afs: Disable use of the fscache I/O routines

2021-03-10 Thread David Howells
Disable use of the fscache I/O routines by the AFS filesystem.  It's about
to transition to passing iov_iters down, and fscache is about to have its
I/O path converted to use iov_iter, so all of that needs to change.

Signed-off-by: David Howells 
cc: linux-...@lists.infradead.org
cc: linux-cach...@redhat.com
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/158861209824.340223.1864211542341758994.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/159465768717.1376105.2229314852486665807.st...@warthog.procyon.org.uk/
Link: 
https://lore.kernel.org/r/160588457929.3465195.1730097418904945578.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161118143744.1232039.2727898205333669064.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161039077.2537118.7986870854927176905.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340403323.1303470.8159439948319423431.st...@warthog.procyon.org.uk/
 # v3
---

 fs/afs/file.c  |  199 ++--
 fs/afs/inode.c |2 -
 fs/afs/write.c |   10 ---
 3 files changed, 36 insertions(+), 175 deletions(-)

diff --git a/fs/afs/file.c b/fs/afs/file.c
index 85f5adf21aa0..6d43713fde01 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -203,24 +203,6 @@ void afs_put_read(struct afs_read *req)
}
 }
 
-#ifdef CONFIG_AFS_FSCACHE
-/*
- * deal with notification that a page was read from the cache
- */
-static void afs_file_readpage_read_complete(struct page *page,
-   void *data,
-   int error)
-{
-   _enter("%p,%p,%d", page, data, error);
-
-   /* if the read completes with an error, we just unlock the page and let
-* the VM reissue the readpage */
-   if (!error)
-   SetPageUptodate(page);
-   unlock_page(page);
-}
-#endif
-
 static void afs_fetch_data_success(struct afs_operation *op)
 {
struct afs_vnode *vnode = op->file[0].vnode;
@@ -288,89 +270,46 @@ int afs_page_filler(void *data, struct page *page)
	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
goto error;
 
-   /* is it cached? */
-#ifdef CONFIG_AFS_FSCACHE
-   ret = fscache_read_or_alloc_page(vnode->cache,
-page,
-afs_file_readpage_read_complete,
-NULL,
-GFP_KERNEL);
-#else
-   ret = -ENOBUFS;
-#endif
-   switch (ret) {
-   /* read BIO submitted (page in cache) */
-   case 0:
-   break;
-
-   /* page not yet cached */
-   case -ENODATA:
-   _debug("cache said ENODATA");
-   goto go_on;
-
-   /* page will not be cached */
-   case -ENOBUFS:
-   _debug("cache said ENOBUFS");
-
-   fallthrough;
-   default:
-   go_on:
-   req = kzalloc(struct_size(req, array, 1), GFP_KERNEL);
-   if (!req)
-   goto enomem;
-
-   /* We request a full page.  If the page is a partial one at the
-* end of the file, the server will return a short read and the
-* unmarshalling code will clear the unfilled space.
-*/
-	refcount_set(&req->usage, 1);
-   req->pos = (loff_t)page->index << PAGE_SHIFT;
-   req->len = PAGE_SIZE;
-   req->nr_pages = 1;
-   req->pages = req->array;
-   req->pages[0] = page;
-   get_page(page);
-
-   /* read the contents of the file from the server into the
-* page */
-   ret = afs_fetch_data(vnode, key, req);
-   afs_put_read(req);
-
-   if (ret < 0) {
-   if (ret == -ENOENT) {
-   _debug("got NOENT from server"
-  " - marking file deleted and stale");
-	set_bit(AFS_VNODE_DELETED, &vnode->flags);
-   ret = -ESTALE;
-   }
-
-#ifdef CONFIG_AFS_FSCACHE
-   fscache_uncache_page(vnode->cache, page);
-#endif
-   BUG_ON(PageFsCache(page));
-
-   if (ret == -EINTR ||
-   ret == -ENOMEM ||
-   ret == -ERESTARTSYS ||
-   ret == -EAGAIN)
-   goto error;
-   goto io_error;
-   }
+   req = kzalloc(struct_size(req, array, 1), GFP_KERNEL);
+   if (!req)
+   goto enomem;
 
-   SetPageUptodate(page);
+   /* We request a full page.  If the page is a partial one at the
+* end of the file, the server will return a short read and the
+	 * unmarshalling code will clear the unfilled space.

[PATCH v4 16/28] afs: Pass page into dirty region helpers to provide THP size

2021-03-10 Thread David Howells
Pass a pointer to the page being accessed into the dirty region helpers so
that the size of the page can be determined in case it's a transparent huge
page.

This also required the page to be passed into the afs_page_dirty trace
point - so there's no need to specifically pass in the index or private
data as these can be retrieved directly from the page struct.
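
To see what the page-size-aware encoding does, here is a minimal userspace
sketch of the same arithmetic.  It is an illustration under stated
assumptions, not kernel code: it hardcodes the 32-bit values of
__AFS_PAGE_PRIV_SHIFT (16) and __AFS_PAGE_PRIV_MASK (0x7fff) visible in the
hunks below, assumes PAGE_SHIFT == 12, and supplies a hypothetical encoder
consistent with the decode helpers in the diff (the real afs_page_dirty()
encoder is not shown in this excerpt):

    #include <stdio.h>

    #define PRIV_SHIFT 16       /* __AFS_PAGE_PRIV_SHIFT on 32-bit */
    #define PRIV_MASK  0x7fffUL /* __AFS_PAGE_PRIV_MASK on 32-bit */
    #define PAGE_SHIFT 12       /* assumed 4KiB base pages */

    /* Mirrors afs_page_dirty_resolution(): larger pages get coarser
     * dirty-region bounds so both bounds still fit in one long.
     */
    static unsigned int resolution(unsigned int thp_order)
    {
            int shift = thp_order + PAGE_SHIFT - (PRIV_SHIFT - 1);
            return shift > 0 ? shift : 0;
    }

    /* Hypothetical encoder, consistent with the decoders in the patch. */
    static unsigned long encode(unsigned int order, size_t from, size_t to)
    {
            unsigned int res = resolution(order);
            return (((to - 1) >> res) << PRIV_SHIFT) | (from >> res);
    }

    int main(void)
    {
            unsigned int order = 9;  /* 2MiB THP made of 4KiB pages */
            unsigned int res = resolution(order);
            unsigned long priv = encode(order, 100, 5000);
            size_t from = (priv & PRIV_MASK) << res;
            size_t to = (((priv >> PRIV_SHIFT) & PRIV_MASK) + 1) << res;

            /* Bounds widen to the 64-byte resolution: prints [64,5056). */
            printf("dirty [100,5000) stored as [%zu,%zu)\n", from, to);
            return 0;
    }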

Signed-off-by: David Howells 
cc: linux-...@lists.infradead.org
cc: linux-cach...@redhat.com
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/160588527183.3465195.16107942526481976308.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161118144921.1232039.11377711180492625929.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161040747.2537118.11435394902674511430.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340404553.1303470.11414163641767769882.st...@warthog.procyon.org.uk/
 # v3
---

 fs/afs/file.c  |   20 +++
 fs/afs/internal.h  |   16 ++--
 fs/afs/write.c |   60 ++--
 include/trace/events/afs.h |   23 ++---
 4 files changed, 55 insertions(+), 64 deletions(-)

diff --git a/fs/afs/file.c b/fs/afs/file.c
index 6d43713fde01..21868bfc3a44 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -515,8 +515,8 @@ static void afs_invalidate_dirty(struct page *page, unsigned int offset,
return;
 
/* We may need to shorten the dirty region */
-   f = afs_page_dirty_from(priv);
-   t = afs_page_dirty_to(priv);
+   f = afs_page_dirty_from(page, priv);
+   t = afs_page_dirty_to(page, priv);
 
if (t <= offset || f >= end)
return; /* Doesn't overlap */
@@ -534,17 +534,17 @@ static void afs_invalidate_dirty(struct page *page, unsigned int offset,
if (f == t)
goto undirty;
 
-   priv = afs_page_dirty(f, t);
+   priv = afs_page_dirty(page, f, t);
set_page_private(page, priv);
-   trace_afs_page_dirty(vnode, tracepoint_string("trunc"), page->index, 
priv);
+   trace_afs_page_dirty(vnode, tracepoint_string("trunc"), page);
return;
 
 undirty:
-   trace_afs_page_dirty(vnode, tracepoint_string("undirty"), page->index, 
priv);
+   trace_afs_page_dirty(vnode, tracepoint_string("undirty"), page);
clear_page_dirty_for_io(page);
 full_invalidate:
-   priv = (unsigned long)detach_page_private(page);
-   trace_afs_page_dirty(vnode, tracepoint_string("inval"), page->index, 
priv);
+   detach_page_private(page);
+   trace_afs_page_dirty(vnode, tracepoint_string("inval"), page);
 }
 
 /*
@@ -572,7 +572,6 @@ static void afs_invalidatepage(struct page *page, unsigned int offset,
 static int afs_releasepage(struct page *page, gfp_t gfp_flags)
 {
struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
-   unsigned long priv;
 
_enter("{{%llx:%llu}[%lu],%lx},%x",
   vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
@@ -581,9 +580,8 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags)
/* deny if page is being written to the cache and the caller hasn't
 * elected to wait */
if (PagePrivate(page)) {
-   priv = (unsigned long)detach_page_private(page);
-   trace_afs_page_dirty(vnode, tracepoint_string("rel"),
-page->index, priv);
+   detach_page_private(page);
+   trace_afs_page_dirty(vnode, tracepoint_string("rel"), page);
}
 
/* indicate that the page can be released */
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index b626e38e9ab5..180eae8134da 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -875,31 +875,31 @@ struct afs_vnode_cache_aux {
 #define __AFS_PAGE_PRIV_MMAPPED	0x8000UL
 #endif
 
-static inline unsigned int afs_page_dirty_resolution(void)
+static inline unsigned int afs_page_dirty_resolution(struct page *page)
 {
-   int shift = PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1);
+   int shift = thp_order(page) + PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1);
return (shift > 0) ? shift : 0;
 }
 
-static inline size_t afs_page_dirty_from(unsigned long priv)
+static inline size_t afs_page_dirty_from(struct page *page, unsigned long priv)
 {
unsigned long x = priv & __AFS_PAGE_PRIV_MASK;
 
/* The lower bound is inclusive */
-   return x << afs_page_dirty_resolution();
+   return x << afs_page_dirty_resolution(page);
 }
 
-static inline size_t afs_page_dirty_to(unsigned long priv)
+static inline size_t afs_page_dirty_to(struct page *page, unsigned long priv)
 {
	unsigned long x = (priv >> __AFS_PAGE_PRIV_SHIFT) & __AFS_PAGE_PRIV_MASK;
 
/* The upper bound is immediately beyond the region */
-   return (x + 1) << afs_page_dirty_resolution();
+   return (x + 1) << afs_page_dirty_resolution(page);

[PATCH v4 14/28] fscache, cachefiles: Add alternate API to use kiocb for read/write to cache

2021-03-10 Thread David Howells
Add an alternate API by which the cache can be accessed through a kiocb,
doing async DIO, rather than using the current API that tells the cache
where all the pages are.

The new API is intended to be used in conjunction with the netfs helper
library.  A filesystem must pick one or the other and not mix them.

Filesystems wanting to use the new API must #define FSCACHE_USE_NEW_IO_API
before #including the header.  This prevents them from continuing to use
the old API at the same time as there are incompatibilities in how the
PG_fscache page bit is used.
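
For illustration, a filesystem opting in does something like the following
in its own code (a sketch; the define must come before the include, and is
intended to stop the old page-based API being used at the same time):

    /* Select the kiocb-based cache I/O API before pulling in fscache.h. */
    #define FSCACHE_USE_NEW_IO_API
    #include <linux/fscache.h>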

Changes:
 - Use the vfs_iocb_iter_read/write() helpers[1]
 - Move initial definition of fscache_begin_read_operation() here.
 - Remove a commented-out line[2]
 - Combine ki->term_func calls in cachefiles_read_complete()[2].
 - Remove explicit NULL initialiser[2].
 - Remove extern on func decl[2].
 - Put in param names on func decl[2].
 - Remove redundant else[2].
 - Fill out the kdoc comment for fscache_begin_read_operation().
 - Rename fs/fscache/page2.c to io.c to match later patches.

Signed-off-by: David Howells 
Reviewed-by: Jeff Layton 
cc: Christoph Hellwig 
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/161118142558.1232039.17993829899588971439.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161037850.2537118.8819808229350326503.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340402057.1303470.8038373593844486698.st...@warthog.procyon.org.uk/
 # v3
Link: https://lore.kernel.org/r/20210216102614.ga27...@lst.de/ [1]
Link: https://lore.kernel.org/r/20210216084230.ga23...@lst.de/ [2]
---

 fs/cachefiles/Makefile|1 
 fs/cachefiles/interface.c |5 -
 fs/cachefiles/internal.h  |9 +
 fs/cachefiles/rdwr2.c |  403 +
 fs/fscache/Kconfig|1 
 fs/fscache/Makefile   |1 
 fs/fscache/internal.h |4 
 fs/fscache/io.c   |  116 
 fs/fscache/page.c |2 
 fs/fscache/stats.c|1 
 include/linux/fscache-cache.h |4 
 include/linux/fscache.h   |   39 
 12 files changed, 583 insertions(+), 3 deletions(-)
 create mode 100644 fs/cachefiles/rdwr2.c
 create mode 100644 fs/fscache/io.c

diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile
index 891dedda5905..ea17b169ea5e 100644
--- a/fs/cachefiles/Makefile
+++ b/fs/cachefiles/Makefile
@@ -11,6 +11,7 @@ cachefiles-y := \
main.o \
namei.o \
rdwr.o \
+   rdwr2.o \
security.o \
xattr.o
 
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 5efa6a3702c0..da3948fdb615 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -319,8 +319,8 @@ static void cachefiles_drop_object(struct fscache_object *_object)
 /*
  * dispose of a reference to an object
  */
-static void cachefiles_put_object(struct fscache_object *_object,
- enum fscache_obj_ref_trace why)
+void cachefiles_put_object(struct fscache_object *_object,
+  enum fscache_obj_ref_trace why)
 {
struct cachefiles_object *object;
struct fscache_cache *cache;
@@ -568,4 +568,5 @@ const struct fscache_cache_ops cachefiles_cache_ops = {
.uncache_page   = cachefiles_uncache_page,
.dissociate_pages   = cachefiles_dissociate_pages,
.check_consistency  = cachefiles_check_consistency,
+   .begin_read_operation   = cachefiles_begin_read_operation,
 };
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index cf9bd6401c2d..4ed83aa5253b 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -150,6 +150,9 @@ extern int cachefiles_has_space(struct cachefiles_cache *cache,
  */
 extern const struct fscache_cache_ops cachefiles_cache_ops;
 
+void cachefiles_put_object(struct fscache_object *_object,
+  enum fscache_obj_ref_trace why);
+
 /*
  * key.c
  */
@@ -217,6 +220,12 @@ extern int cachefiles_allocate_pages(struct fscache_retrieval *,
 extern int cachefiles_write_page(struct fscache_storage *, struct page *);
 extern void cachefiles_uncache_page(struct fscache_object *, struct page *);
 
+/*
+ * rdwr2.c
+ */
+extern int cachefiles_begin_read_operation(struct netfs_read_request *,
+  struct fscache_retrieval *);
+
 /*
  * security.c
  */
diff --git a/fs/cachefiles/rdwr2.c b/fs/cachefiles/rdwr2.c
new file mode 100644
index ..620959d1e95b
--- /dev/null
+++ b/fs/cachefiles/rdwr2.c
@@ -0,0 +1,403 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* kiocb-using read/write
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowe...@redhat.com)

[PATCH v4 13/28] netfs: Hold a ref on a page when PG_private_2 is set

2021-03-10 Thread David Howells
Take a reference on a page when PG_private_2 is set and drop it once the
bit is unlocked[1].
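
In outline, the two sides of the pairing look like this (a condensed
sketch of the hunks below, not a verbatim extract):

    /* Marking side (netfs_rreq_unlock): pin the page for as long as
     * PG_fscache (PG_private_2) is set on it.
     */
    get_page(page);
    SetPageFsCache(page);

    /* Unmarking side (netfs_rreq_unmark_after_write): unlock the bit,
     * then drop the pinning refs in batches via a pagevec.
     */
    unlock_page_fscache(page);
    if (pagevec_add(&pvec, page) == 0)
            pagevec_release(&pvec);  /* puts the accumulated page refs */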

Reported-by: Linus Torvalds 
Signed-off-by: David Howells 
cc: Matthew Wilcox 
cc: Linus Torvalds 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/CAHk-=wh+2gbF7XEjYc=HV9w_2uVzVf7vs60BPz0gFA=+pum...@mail.gmail.com/
 [1]
Link: https://lore.kernel.org/r/1331025.1612974...@warthog.procyon.org.uk/
Link: 
https://lore.kernel.org/r/161340400833.1303470.8070750156044982282.st...@warthog.procyon.org.uk/
 # v3
---

 fs/netfs/read_helper.c |   10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
index 0a7b76c11c98..ce11ca4c32e4 100644
--- a/fs/netfs/read_helper.c
+++ b/fs/netfs/read_helper.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include <linux/pagevec.h>
 #include 
 #include 
 #include 
@@ -238,10 +239,13 @@ static void netfs_rreq_unmark_after_write(struct netfs_read_request *rreq,
  bool was_async)
 {
struct netfs_read_subrequest *subreq;
+   struct pagevec pvec;
struct page *page;
pgoff_t unlocked = 0;
bool have_unlocked = false;
 
+	pagevec_init(&pvec);
+
rcu_read_lock();
 
	list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
@@ -255,6 +259,8 @@ static void netfs_rreq_unmark_after_write(struct netfs_read_request *rreq,
continue;
unlocked = page->index;
unlock_page_fscache(page);
+			if (pagevec_add(&pvec, page) == 0)
+				pagevec_release(&pvec);
have_unlocked = true;
}
}
@@ -413,8 +419,10 @@ static void netfs_rreq_unlock(struct netfs_read_request *rreq)
pg_failed = true;
break;
}
-			if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags))
+			if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags)) {
+   get_page(page);
SetPageFsCache(page);
+   }
pg_failed |= subreq_failed;
if (pgend < iopos + subreq->len)
break;




[PATCH v4 12/28] netfs: Define an interface to talk to a cache

2021-03-10 Thread David Howells
Add an interface to the netfs helper library for reading data from the
cache instead of downloading it from the server and support for writing
data just downloaded or cleared to the cache.

The API passes an iov_iter to the cache read/write routines to indicate the
data/buffer to be used.  This is done using the ITER_XARRAY type to provide
direct access to the netfs inode's pagecache.

When the netfs's ->begin_cache_operation() method is called, this must fill
in the cache_resources in the netfs_read_request struct, including the
netfs_cache_ops used by the helper lib to talk to the cache.  The helper
lib does not directly access the cache.
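
For a netfs that uses fscache, the hook can be little more than a
pass-through (a sketch; the my_netfs_* names are hypothetical, while
fscache_begin_read_operation() is the entry point a later patch in this
series provides for exactly this purpose):

    /* Sketch: hook the local cache into the read helpers.  The helper
     * library never calls fscache itself; it only uses the
     * netfs_cache_ops that this fills into rreq->cache_resources.
     */
    static int my_netfs_begin_cache_operation(struct netfs_read_request *rreq)
    {
            struct my_netfs_inode *ni = MY_NETFS_I(rreq->inode); /* hypothetical */

            return fscache_begin_read_operation(rreq, ni->cache);
    }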

Changes
- Added flag to netfs_subreq_terminated() to indicate that the caller may
  have been running async and stuff that might sleep needs punting to a
  workqueue (can't use in_softirq()[1]).
- Add missing inc of netfs_n_rh_read stat.
- Move initial definition of fscache_begin_read_operation() elsewhere.
- Need to call op->begin_cache_operation() from netfs_write_begin().

Signed-off-by: David Howells 
Reviewed-by: Jeff Layton 
cc: Matthew Wilcox 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/161118141321.1232039.8296910406755622458.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161036700.2537118.11170748455436854978.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340399569.1303470.1138884774643385730.st...@warthog.procyon.org.uk/
 # v3
Link: https://lore.kernel.org/r/20210216084230.ga23...@lst.de/ [1]
---

 fs/netfs/read_helper.c |  241 
 include/linux/netfs.h  |   49 ++
 2 files changed, 289 insertions(+), 1 deletion(-)

diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
index ce3a2f5844d1..0a7b76c11c98 100644
--- a/fs/netfs/read_helper.c
+++ b/fs/netfs/read_helper.c
@@ -88,6 +88,8 @@ static void netfs_free_read_request(struct work_struct *work)
if (rreq->netfs_priv)
rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv);
trace_netfs_rreq(rreq, netfs_rreq_trace_free);
+	if (rreq->cache_resources.ops)
+		rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
kfree(rreq);
netfs_stat_d(_n_rh_rreq);
 }
@@ -154,6 +156,34 @@ static void netfs_clear_unread(struct netfs_read_subrequest *subreq)
	iov_iter_zero(iov_iter_count(&iter), &iter);
 }
 
+static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
+   bool was_async)
+{
+   struct netfs_read_subrequest *subreq = priv;
+
+   netfs_subreq_terminated(subreq, transferred_or_error, was_async);
+}
+
+/*
+ * Issue a read against the cache.
+ * - Eats the caller's ref on subreq.
+ */
+static void netfs_read_from_cache(struct netfs_read_request *rreq,
+ struct netfs_read_subrequest *subreq,
+ bool seek_data)
+{
+	struct netfs_cache_resources *cres = &rreq->cache_resources;
+   struct iov_iter iter;
+
+   netfs_stat(_n_rh_read);
+	iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages,
+   subreq->start + subreq->transferred,
+   subreq->len   - subreq->transferred);
+
+	cres->ops->read(cres, subreq->start, &iter, seek_data,
+   netfs_cache_read_terminated, subreq);
+}
+
 /*
  * Fill a subrequest region with zeroes.
  */
@@ -198,6 +228,144 @@ static void netfs_rreq_completed(struct netfs_read_request *rreq, bool was_async
netfs_put_read_request(rreq, was_async);
 }
 
+/*
+ * Deal with the completion of writing the data to the cache.  We have to clear
+ * the PG_fscache bits on the pages involved and release the caller's ref.
+ *
+ * May be called in softirq mode and we inherit a ref from the caller.
+ */
+static void netfs_rreq_unmark_after_write(struct netfs_read_request *rreq,
+ bool was_async)
+{
+   struct netfs_read_subrequest *subreq;
+   struct page *page;
+   pgoff_t unlocked = 0;
+   bool have_unlocked = false;
+
+   rcu_read_lock();
+
+	list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+		XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE);
+
+		xas_for_each(&xas, page, (subreq->start + subreq->len - 1) / PAGE_SIZE) {
+   /* We might have multiple writes from the same huge
+* page, but we mustn't unlock a page more than once.
+*/
+   if (have_unlocked && page->index <= unlocked)
+   continue;
+   unlocked = page->index;
+   

Re: MaxLinear, please maintain your drivers was Re: [PATCH] leds: lgm: fix gpiolib dependency

2021-03-10 Thread Arnd Bergmann
On Wed, Mar 10, 2021 at 8:30 AM Pavel Machek  wrote:
>
> Hi!
>
> > > > I'd like people from Intel to contact me. There's more to fix there,
> > > > and AFAICT original author went away.
> > >
> > > The following message to  was
> > > undeliverable.
> >
> > > : Recipient
> > > +address rejected: User unknown in virtual mailbox table'
> >
> > > commit c3987cd2bca34ddfec69027acedb2fae5ffcf7a0
> > > Author: Amireddy Mallikarjuna reddy 
> >
> > I asked around, and got told Mallikarjuna has been "sold" to MaxLinear,
> > together with the rest of the Connected Home Division.  So he most likely
> > still works on this stuff, just under a different banner.
> >
> > > If someone knows how to contact the author, that would be welcome.
> >
> > Alas, no idea about his MaxLinear address.
>
> Thanks for the effort. Anyway, I suspect I'll just do this:

Maybe Hauke or John  (added both to cc) know who at MaxLinear is
responsible for maintaining the Lightning Mountain drivers now.

   Arnd

> diff --git a/drivers/leds/blink/Kconfig b/drivers/leds/blink/Kconfig
> index 6dedc58c47b3..79493f21d365 100644
> --- a/drivers/leds/blink/Kconfig
> +++ b/drivers/leds/blink/Kconfig
> @@ -1,14 +1,6 @@
> -menuconfig LEDS_BLINK
> -   bool "LED Blink support"
> -   depends on LEDS_CLASS
> -   help
> - This option enables blink support for the leds class.
> - If unsure, say Y.
> -
> -if LEDS_BLINK
> -
>  config LEDS_BLINK_LGM
> tristate "LED support for Intel LGM SoC series"
> +   depends on BROKEN
> depends on GPIOLIB
> depends on LEDS_CLASS
> depends on MFD_SYSCON
> @@ -17,5 +9,3 @@ config LEDS_BLINK_LGM
>   Parallel to serial conversion, which is also called SSO controller,
>   can drive external shift register for LED outputs.
>   This enables LED support for Serial Shift Output controller(SSO).
> -
> -endif # LEDS_BLINK
>
>
> --
> http://www.livejournal.com/~pavelmachek


[PATCH v4 11/28] netfs: Add write_begin helper

2021-03-10 Thread David Howells
Add a helper to do the pre-reading work for the netfs write_begin address
space op.

Changes
- Added flag to netfs_subreq_terminated() to indicate that the caller may
  have been running async and stuff that might sleep needs punting to a
  workqueue (can't use in_softirq()[1]).
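
A filesystem would typically wrap the helper in its ->write_begin()
address_space operation, roughly as below (a sketch; my_netfs_req_ops
stands for the netfs_read_request_ops table described in an earlier patch):

    static int my_netfs_write_begin(struct file *file,
                                    struct address_space *mapping,
                                    loff_t pos, unsigned int len,
                                    unsigned int flags,
                                    struct page **pagep, void **fsdata)
    {
            /* netfs_write_begin() pre-reads from the cache or the server
             * as needed and returns the grabbed, locked page in *pagep.
             */
            return netfs_write_begin(file, mapping, pos, len, flags,
                                     pagep, fsdata, &my_netfs_req_ops, NULL);
    }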

Signed-off-by: David Howells 
Reviewed-by: Jeff Layton 
cc: Matthew Wilcox 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/160588543960.3465195.2792938973035886168.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161118140165.1232039.16418853874312234477.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161035539.2537118.15674887534950908530.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340398368.1303470.11242918276563276090.st...@warthog.procyon.org.uk/
 # v3
Link: https://lore.kernel.org/r/20210216084230.ga23...@lst.de/ [1]
---

 fs/netfs/internal.h  |2 +
 fs/netfs/read_helper.c   |  165 ++
 fs/netfs/stats.c |   11 ++-
 include/linux/netfs.h|8 ++
 include/trace/events/netfs.h |4 +
 5 files changed, 186 insertions(+), 4 deletions(-)

diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 98b6f4516da1..b7f2c4459f33 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -34,8 +34,10 @@ extern atomic_t netfs_n_rh_read_failed;
 extern atomic_t netfs_n_rh_zero;
 extern atomic_t netfs_n_rh_short_read;
 extern atomic_t netfs_n_rh_write;
+extern atomic_t netfs_n_rh_write_begin;
 extern atomic_t netfs_n_rh_write_done;
 extern atomic_t netfs_n_rh_write_failed;
+extern atomic_t netfs_n_rh_write_zskip;
 
 
 static inline void netfs_stat(atomic_t *stat)
diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
index 8b03fcc2f8f1..ce3a2f5844d1 100644
--- a/fs/netfs/read_helper.c
+++ b/fs/netfs/read_helper.c
@@ -766,3 +766,168 @@ int netfs_readpage(struct file *file,
return ret;
 }
 EXPORT_SYMBOL(netfs_readpage);
+
+static void netfs_clear_thp(struct page *page)
+{
+   unsigned int i;
+
+   for (i = 0; i < thp_nr_pages(page); i++)
+   clear_highpage(page + i);
+}
+
+/**
+ * netfs_write_begin - Helper to prepare for writing
+ * @file: The file to read from
+ * @mapping: The mapping to read from
+ * @pos: File position at which the write will begin
+ * @len: The length of the write in this page
+ * @flags: AOP_* flags
+ * @_page: Where to put the resultant page
+ * @_fsdata: Place for the netfs to store a cookie
+ * @ops: The network filesystem's operations for the helper to use
+ * @netfs_priv: Private netfs data to be retained in the request
+ *
+ * Pre-read data for a write-begin request by drawing data from the cache if
+ * possible, or the netfs if not.  Space beyond the EOF is zero-filled.
+ * Multiple I/O requests from different sources will get munged together.  If
+ * necessary, the readahead window can be expanded in either direction to a
+ * more convenient alignment for RPC efficiency or to make storage in the cache
+ * feasible.
+ *
+ * The calling netfs must provide a table of operations, only one of which,
+ * issue_op, is mandatory.
+ *
+ * The check_write_begin() operation can be provided to check for and flush
+ * conflicting writes once the page is grabbed and locked.  It is passed a
+ * pointer to the fsdata cookie that gets returned to the VM to be passed to
+ * write_end.  It is permitted to sleep.  It should return 0 if the request
+ * should go ahead; unlock the page and return -EAGAIN to cause the page to be
+ * regot; or return an error.
+ *
+ * This is usable whether or not caching is enabled.
+ */
+int netfs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned int len, unsigned int flags,
+ struct page **_page, void **_fsdata,
+ const struct netfs_read_request_ops *ops,
+ void *netfs_priv)
+{
+   struct netfs_read_request *rreq;
+   struct page *page, *xpage;
+   struct inode *inode = file_inode(file);
+   unsigned int debug_index = 0;
+   pgoff_t index = pos >> PAGE_SHIFT;
+   int pos_in_page = pos & ~PAGE_MASK;
+   loff_t size;
+   int ret;
+
+   struct readahead_control ractl = {
+   .file   = file,
+   .mapping= mapping,
+   ._index = index,
+   ._nr_pages  = 0,
+   };
+
+retry:
+   page = grab_cache_page_write_begin(mapping, index, 0);
+   if (!page)
+   return -ENOMEM;
+
+   if (ops->check_write_begin) {
+   /* Allow the netfs (eg. ceph) to flush conflicts. */
+   ret = ops->check_write_begin(file, pos, len, 

Re: [PATCH v4 2/4] hugetlb/userfaultfd: Forbid huge pmd sharing when uffd enabled

2021-03-10 Thread Peter Xu
On Wed, Mar 10, 2021 at 01:18:42PM +0530, Naresh Kamboju wrote:
> Hi Peter,

Hi, Naresh,

> 
> On Fri, 19 Feb 2021 at 04:43, Peter Xu  wrote:
> >
> > Huge pmd sharing could bring problem to userfaultfd.  The thing is that
> > userfaultfd is running its logic based on the special bits on page table
> > entries, however the huge pmd sharing could potentially share page table
> > entries for different address ranges.  That could cause issues on either:
> >
> >   - When sharing huge pmd page tables for an uffd write protected range, the
> > newly mapped huge pmd range will also be write protected unexpectedly, 
> > or,
> >
> >   - When we try to write protect a range of huge pmd shared range, we'll 
> > first
> > do huge_pmd_unshare() in hugetlb_change_protection(), however that also
> > means the UFFDIO_WRITEPROTECT could be silently skipped for the shared
> > region, which could lead to data loss.
> >
> > Since at it, a few other things are done altogether:
> >
> >   - Move want_pmd_share() from mm/hugetlb.c into linux/hugetlb.h, because
> > that's definitely something that arch code would like to use too
> >
> >   - ARM64 currently directly check against CONFIG_ARCH_WANT_HUGE_PMD_SHARE 
> > when
> > trying to share huge pmd.  Switch to the want_pmd_share() helper.
> >
> > Since at it, move vma_shareable() from huge_pmd_share() into 
> > want_pmd_share().
> >
> > Reviewed-by: Mike Kravetz 
> > Reviewed-by: Axel Rasmussen 
> > Signed-off-by: Peter Xu 
> > ---
> >  arch/arm64/mm/hugetlbpage.c   |  3 +--
> >  include/linux/hugetlb.h   |  2 ++
> >  include/linux/userfaultfd_k.h |  9 +
> >  mm/hugetlb.c  | 20 ++--
> >  4 files changed, 26 insertions(+), 8 deletions(-)
> >
> > diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
> > index 6e3bcffe2837..58987a98e179 100644
> > --- a/arch/arm64/mm/hugetlbpage.c
> > +++ b/arch/arm64/mm/hugetlbpage.c
> > @@ -284,8 +284,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
> >  */
> > ptep = pte_alloc_map(mm, pmdp, addr);
> > } else if (sz == PMD_SIZE) {
> > -   if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
> > -   pud_none(READ_ONCE(*pudp)))
> > +   if (want_pmd_share(vma, addr) && pud_none(READ_ONCE(*pudp)))
> 
> While building Linux next 20210310 tag for arm64 architecture with
> 
>   - CONFIG_ARM64_64K_PAGES=y
> 
> enabled the build failed due to below errors / warnings
> 
> make --silent --keep-going --jobs=8
> O=/home/tuxbuild/.cache/tuxmake/builds/1/tmp ARCH=arm64
> CROSS_COMPILE=aarch64-linux-gnu- 'CC=sccache aarch64-linux-gnu-gcc'
> 'HOSTCC=sccache gcc'
> aarch64-linux-gnu-ld: Unexpected GOT/PLT entries detected!
> aarch64-linux-gnu-ld: Unexpected run-time procedure linkages detected!
> aarch64-linux-gnu-ld: arch/arm64/mm/hugetlbpage.o: in function `huge_pte_alloc':
> hugetlbpage.c:(.text+0x7d8): undefined reference to `want_pmd_share'
> 
> Reported-by: Naresh Kamboju 

Sorry for the issue & thanks for the report.  Would you please check whether
the patch attached could fix the issue?

-- 
Peter Xu
From 4072042b415d1f78ac1ab017671e345b414ca6ab Mon Sep 17 00:00:00 2001
From: Peter Xu 
Date: Wed, 10 Mar 2021 11:48:56 -0500
Subject: [PATCH] mm/hugetlb: Fix build with !ARCH_WANT_HUGE_PMD_SHARE

want_pmd_share() is undefined with !ARCH_WANT_HUGE_PMD_SHARE since it's put
by accident into a "#ifdef ARCH_WANT_HUGE_PMD_SHARE" block.  Moving it out
won't work either since vma_shareable() is only defined within the block.
Define it for !ARCH_WANT_HUGE_PMD_SHARE instead.
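
In outline, the resulting structure of mm/hugetlb.c is (a sketch, with the
real function bodies elided):

    #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
    static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
    {
            return true;  /* stand-in for the real alignment/VMA checks */
    }

    bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
    {
            return vma_shareable(vma, addr);  /* plus the uffd checks */
    }
    #else  /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
    /* The fix: a stub here gives callers such as arm64's huge_pte_alloc()
     * something to link against when the arch doesn't opt in.
     */
    bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
    {
            return false;
    }
    #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */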

Fixes: 5b109cc1cdcc ("hugetlb/userfaultfd: forbid huge pmd sharing when uffd enabled")
Cc: Andrew Morton 
Cc: Mike Kravetz 
Cc: Axel Rasmussen 
Reported-by: Naresh Kamboju 
Signed-off-by: Peter Xu 
---
 mm/hugetlb.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d58f1456fe27..8dda7e034477 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5469,9 +5469,6 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
 
 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
 {
-#ifndef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
-   return false;
-#endif
 #ifdef CONFIG_USERFAULTFD
if (uffd_disable_huge_pmd_share(vma))
return false;
@@ -5616,6 +5613,11 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
 {
 }
+
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
+{
+   return false;
+}
 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
 
 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
-- 
2.26.2



[PATCH v4 10/28] netfs: Gather stats

2021-03-10 Thread David Howells
Gather statistics from the netfs interface that can be exported through a
seqfile.  This is intended to be called by a later patch when viewing
/proc/fs/fscache/stats.
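
The body of fs/netfs/stats.c is not shown in this excerpt; a sketch of what
the show side might look like, using a few of the counters declared below
(the real netfs_stats_show() and its output format may differ):

    #include <linux/seq_file.h>
    #include "internal.h"  /* declares the netfs_n_rh_* counters */

    int netfs_stats_show(struct seq_file *m, void *v)
    {
            seq_printf(m, "RdHelp : rr=%u sr=%u RA=%u RP=%u\n",
                       atomic_read(&netfs_n_rh_rreq),
                       atomic_read(&netfs_n_rh_sreq),
                       atomic_read(&netfs_n_rh_readahead),
                       atomic_read(&netfs_n_rh_readpage));
            return 0;
    }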

Signed-off-by: David Howells 
Reviewed-by: Jeff Layton 
cc: Matthew Wilcox 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/161118139247.1232039.10556850937548511068.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161034669.2537118.2761232524997091480.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340397101.1303470.17581910581108378458.st...@warthog.procyon.org.uk/
 # v3
---

 fs/netfs/Kconfig   |   15 +
 fs/netfs/Makefile  |3 +--
 fs/netfs/internal.h|   34 ++
 fs/netfs/read_helper.c |   23 
 fs/netfs/stats.c   |   54 
 include/linux/netfs.h  |1 +
 6 files changed, 128 insertions(+), 2 deletions(-)
 create mode 100644 fs/netfs/stats.c

diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig
index 2ebf90e6ca95..578112713703 100644
--- a/fs/netfs/Kconfig
+++ b/fs/netfs/Kconfig
@@ -6,3 +6,18 @@ config NETFS_SUPPORT
  This option enables support for network filesystems, including
  helpers for high-level buffered I/O, abstracting out read
  segmentation, local caching and transparent huge page support.
+
+config NETFS_STATS
+   bool "Gather statistical information on local caching"
+   depends on NETFS_SUPPORT && PROC_FS
+   help
+ This option causes statistical information to be gathered on local
+ caching and exported through file:
+
+   /proc/fs/fscache/stats
+
+ The gathering of statistics adds a certain amount of overhead to
+ execution as there are quite a few stats gathered, and on a
+ multi-CPU system these may be on cachelines that keep bouncing
+ between CPUs.  On the other hand, the stats are very useful for
+ debugging purposes.  Saying 'Y' here is recommended.
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index 4b4eff2ba369..c15bfc966d96 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
 
-netfs-y := \
-   read_helper.o
+netfs-y := read_helper.o stats.o
 
 obj-$(CONFIG_NETFS_SUPPORT) := netfs.o
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index ee665c0e7dc8..98b6f4516da1 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -16,8 +16,42 @@
  */
 extern unsigned int netfs_debug;
 
+/*
+ * stats.c
+ */
+#ifdef CONFIG_NETFS_STATS
+extern atomic_t netfs_n_rh_readahead;
+extern atomic_t netfs_n_rh_readpage;
+extern atomic_t netfs_n_rh_rreq;
+extern atomic_t netfs_n_rh_sreq;
+extern atomic_t netfs_n_rh_download;
+extern atomic_t netfs_n_rh_download_done;
+extern atomic_t netfs_n_rh_download_failed;
+extern atomic_t netfs_n_rh_download_instead;
+extern atomic_t netfs_n_rh_read;
+extern atomic_t netfs_n_rh_read_done;
+extern atomic_t netfs_n_rh_read_failed;
+extern atomic_t netfs_n_rh_zero;
+extern atomic_t netfs_n_rh_short_read;
+extern atomic_t netfs_n_rh_write;
+extern atomic_t netfs_n_rh_write_done;
+extern atomic_t netfs_n_rh_write_failed;
+
+
+static inline void netfs_stat(atomic_t *stat)
+{
+   atomic_inc(stat);
+}
+
+static inline void netfs_stat_d(atomic_t *stat)
+{
+   atomic_dec(stat);
+}
+
+#else
 #define netfs_stat(x) do {} while(0)
 #define netfs_stat_d(x) do {} while(0)
+#endif
 
 /*****************************************************************************/
 /*
diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
index e64d9dad0c36..8b03fcc2f8f1 100644
--- a/fs/netfs/read_helper.c
+++ b/fs/netfs/read_helper.c
@@ -56,6 +56,7 @@ static struct netfs_read_request *netfs_alloc_read_request(
	refcount_set(&rreq->usage, 1);
	__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
	ops->init_rreq(rreq, file);
+	netfs_stat(&netfs_n_rh_rreq);
}
 
return rreq;
@@ -88,6 +89,7 @@ static void netfs_free_read_request(struct work_struct *work)
rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv);
trace_netfs_rreq(rreq, netfs_rreq_trace_free);
kfree(rreq);
+	netfs_stat_d(&netfs_n_rh_rreq);
 }
 
 static void netfs_put_read_request(struct netfs_read_request *rreq, bool was_async)
@@ -117,6 +119,7 @@ static struct netfs_read_subrequest *netfs_alloc_subrequest(
	refcount_set(&subreq->usage, 2);
	subreq->rreq = rreq;
	netfs_get_read_request(rreq);
+	netfs_stat(&netfs_n_rh_sreq);
}
 
return subreq;
@@ -134,6 +137,7 @@ static void __netfs_put_subrequest(struct netfs_read_subrequest *subreq,
 

[PATCH v4 09/28] netfs: Add tracepoints

2021-03-10 Thread David Howells
Add three tracepoints to track the activity of the read helpers:

 (1) netfs/netfs_read

 This logs entry to the read helpers and also expansion of the range in
 a readahead request.

 (2) netfs/netfs_rreq

 This logs the progress of netfs_read_request objects which track
 read requests.  A read request may be a compound of multiple
 subrequests.

 (3) netfs/netfs_sreq

 This logs the progress of netfs_read_subrequest objects, which track
 the contributions from various sources to a read request.

Signed-off-by: David Howells 
Reviewed-by: Jeff Layton 
cc: Matthew Wilcox 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/161118138060.1232039.5353374588021776217.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161033468.2537118.14021843889844001905.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340395843.1303470.7355519662919639648.st...@warthog.procyon.org.uk/
 # v3
---

 fs/netfs/read_helper.c   |   28 ++
 include/linux/netfs.h|2 
 include/trace/events/netfs.h |  199 ++
 3 files changed, 229 insertions(+)
 create mode 100644 include/trace/events/netfs.h

diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
index c5ed0dcc9571..e64d9dad0c36 100644
--- a/fs/netfs/read_helper.c
+++ b/fs/netfs/read_helper.c
@@ -16,6 +16,8 @@
 #include 
 #include 
 #include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/netfs.h>
 
 MODULE_DESCRIPTION("Network fs support");
 MODULE_AUTHOR("Red Hat, Inc.");
@@ -39,6 +41,7 @@ static struct netfs_read_request *netfs_alloc_read_request(
const struct netfs_read_request_ops *ops, void *netfs_priv,
struct file *file)
 {
+   static atomic_t debug_ids;
struct netfs_read_request *rreq;
 
rreq = kzalloc(sizeof(struct netfs_read_request), GFP_KERNEL);
@@ -47,6 +50,7 @@ static struct netfs_read_request *netfs_alloc_read_request(
rreq->netfs_priv = netfs_priv;
rreq->inode = file_inode(file);
rreq->i_size= i_size_read(rreq->inode);
	rreq->debug_id	= atomic_inc_return(&debug_ids);
	INIT_LIST_HEAD(&rreq->subrequests);
	INIT_WORK(&rreq->work, netfs_rreq_work);
	refcount_set(&rreq->usage, 1);
@@ -82,6 +86,7 @@ static void netfs_free_read_request(struct work_struct *work)
netfs_rreq_clear_subreqs(rreq, false);
if (rreq->netfs_priv)
rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv);
+   trace_netfs_rreq(rreq, netfs_rreq_trace_free);
kfree(rreq);
 }
 
@@ -127,6 +132,7 @@ static void __netfs_put_subrequest(struct netfs_read_subrequest *subreq,
 {
struct netfs_read_request *rreq = subreq->rreq;
 
+   trace_netfs_sreq(subreq, netfs_sreq_trace_free);
kfree(subreq);
netfs_put_read_request(rreq, was_async);
 }
@@ -181,6 +187,7 @@ static void netfs_read_from_server(struct netfs_read_request *rreq,
  */
 static void netfs_rreq_completed(struct netfs_read_request *rreq, bool was_async)
 {
+   trace_netfs_rreq(rreq, netfs_rreq_trace_done);
netfs_rreq_clear_subreqs(rreq, was_async);
netfs_put_read_request(rreq, was_async);
 }
@@ -219,6 +226,8 @@ static void netfs_rreq_unlock(struct netfs_read_request *rreq)
iopos = 0;
subreq_failed = (subreq->error < 0);
 
+   trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);
+
rcu_read_lock();
	xas_for_each(&xas, page, last_page) {
unsigned int pgpos = (page->index - start_page) * PAGE_SIZE;
@@ -279,6 +288,8 @@ static void netfs_rreq_short_read(struct netfs_read_request *rreq,
	__clear_bit(NETFS_SREQ_SHORT_READ, &subreq->flags);
	__set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags);
 
+   trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short);
+
netfs_get_read_subrequest(subreq);
	atomic_inc(&rreq->nr_rd_ops);
netfs_read_from_server(rreq, subreq);
@@ -294,6 +305,8 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_read_request *rreq)
 
WARN_ON(in_interrupt());
 
+   trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit);
+
/* We don't want terminating submissions trying to wake us up whilst
 * we're still going through the list.
 */
@@ -306,6 +319,7 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_read_request *rreq)
break;
subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
subreq->error = 0;
+			trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead);
			netfs_get_read_subrequest(subreq);
			atomic_inc(&rreq->nr_rd_ops);
 

[PATCH v4 08/28] netfs: Provide readahead and readpage netfs helpers

2021-03-10 Thread David Howells
Add a pair of helper functions:

 (*) netfs_readahead()
 (*) netfs_readpage()

to do the work of handling a readahead or a readpage, where the page(s)
that form part of the request may be split between the local cache, the
server or just require clearing, and may be single pages and transparent
huge pages.  This is all handled within the helper.

Note that while both will read from the cache if there is data present,
only netfs_readahead() will expand the request beyond what it was asked to
do, and only netfs_readahead() will write back to the cache.

netfs_readpage(), on the other hand, is synchronous and only fetches the
page (which might be a THP) it is asked for.

The netfs gives the helper parameters from the VM, the cache cookie it
wants to use (or NULL) and a table of operations (only one of which is
mandatory):

 (*) expand_readahead() [optional]

 Called to allow the netfs to request an expansion of a readahead
 request to meet its own alignment requirements.  This is done by
 changing rreq->start and rreq->len.

 (*) clamp_length() [optional]

 Called to allow the netfs to cut down a subrequest to meet its own
 boundary requirements.  If it does this, the helper will generate
 additional subrequests until the full request is satisfied.

 (*) is_still_valid() [optional]

 Called to find out if the data just read from the cache has been
 invalidated and must be reread from the server.

 (*) issue_op() [required]

 Called to ask the netfs to issue a read to the server.  The subrequest
 describes the read.  The read request holds information about the file
 being accessed.

 The netfs can cache information in rreq->netfs_priv.

 Upon completion, the netfs should set the error and transferred count,
 may also set FSCACHE_SREQ_CLEAR_TAIL, and then call
 fscache_subreq_terminated().

 (*) done() [optional]

 Called after the pages have been unlocked.  The read request is still
 pinning the file and mapping and may still be pinning pages with
 PG_fscache.  rreq->error indicates any error that has been
 accumulated.

 (*) cleanup() [optional]

 Called when the helper is disposing of a finished read request.  This
 allows the netfs to clear rreq->netfs_priv.

Netfs support is enabled with CONFIG_NETFS_SUPPORT=y.  It will be built
even if CONFIG_FSCACHE=n and in this case much of it should be optimised
away, allowing the filesystem to use it even when caching is disabled.
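
Putting that together, a minimal user of the helpers might look like the
sketch below (my_netfs_* names are hypothetical; as described above, only
issue_op is mandatory):

    /* Smallest useful operations table plus a ->readpage() hook. */
    static void my_netfs_issue_op(struct netfs_read_subrequest *subreq)
    {
            /* Kick off an RPC for subreq->len bytes at subreq->start;
             * the completion path finishes with something like
             * netfs_subreq_terminated(subreq, bytes_or_error, was_async).
             */
    }

    static const struct netfs_read_request_ops my_netfs_req_ops = {
            .issue_op = my_netfs_issue_op,
    };

    static int my_netfs_readpage(struct file *file, struct page *page)
    {
            return netfs_readpage(file, page, &my_netfs_req_ops, NULL);
    }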

Changes:
 - Folded in a kerneldoc comment fix.
 - Folded in a fix for the error handling in the case that ENOMEM occurs.
 - Added flag to netfs_subreq_terminated() to indicate that the caller may
   have been running async and stuff that might sleep needs punting to a
   workqueue (can't use in_softirq()[1]).

Signed-off-by: David Howells 
Reviewed-by: Jeff Layton 
cc: Matthew Wilcox 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/160588497406.3465195.18003475695899726222.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161118136849.1232039.8923686136144228724.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161032290.2537118.13400578415247339173.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340394873.1303470.6237319335883242536.st...@warthog.procyon.org.uk/
 # v3
Link: https://lore.kernel.org/r/20210216084230.ga23...@lst.de/ [1]
---

 fs/Kconfig |1 
 fs/Makefile|1 
 fs/netfs/Makefile  |6 
 fs/netfs/internal.h|   61 
 fs/netfs/read_helper.c |  717 
 include/linux/netfs.h  |   82 +
 6 files changed, 868 insertions(+)
 create mode 100644 fs/netfs/Makefile
 create mode 100644 fs/netfs/internal.h
 create mode 100644 fs/netfs/read_helper.c

diff --git a/fs/Kconfig b/fs/Kconfig
index 462253ae483a..eccbcf1e3f2e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -125,6 +125,7 @@ source "fs/overlayfs/Kconfig"
 
 menu "Caches"
 
+source "fs/netfs/Kconfig"
 source "fs/fscache/Kconfig"
 source "fs/cachefiles/Kconfig"
 
diff --git a/fs/Makefile b/fs/Makefile
index 3215fe205256..9c708e1fbe8f 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -67,6 +67,7 @@ obj-y += devpts/
 obj-$(CONFIG_DLM)  += dlm/
  
 # Do not add any filesystems before this line
+obj-$(CONFIG_NETFS_SUPPORT)+= netfs/
 obj-$(CONFIG_FSCACHE)  += fscache/
 obj-$(CONFIG_REISERFS_FS)  += reiserfs/
 obj-$(CONFIG_EXT4_FS)  += ext4/
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
new file mode 100644
index ..4b4eff2ba369
--- /dev/null
+++ b/fs/netfs/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+
+netfs-y := \
+   read_helper.o
+

[PATCH v4 07/28] netfs, mm: Add unlock_page_fscache() and wait_on_page_fscache()

2021-03-10 Thread David Howells
Add unlock_page_fscache() as an alias of unlock_page_private_2().  This
allows a page 'locked' with PG_fscache to be unlocked.

Add wait_on_page_fscache() to wait for PG_fscache to be unlocked.
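
Typical use, in sketch form (page here is whichever page the netfs is
working on):

    /* Cache-write completion side: clear the PG_fscache 'lock' and wake
     * anyone sleeping in wait_on_page_fscache().
     */
    unlock_page_fscache(page);

    /* Modification side: don't touch the page while it is still being
     * written out to the local cache.  May sleep.
     */
    wait_on_page_fscache(page);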

[Linus suggested putting the fscache-themed functions into the
 caching-specific headers rather than pagemap.h[1]]

Signed-off-by: David Howells 
cc: Linus Torvalds 
cc: Matthew Wilcox 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: https://lore.kernel.org/r/1330473.1612974...@warthog.procyon.org.uk/
Link: 
https://lore.kernel.org/r/CAHk-=wjgA-74ddehziVk=xaemtkswpu1yw4uaro1r3ibs27...@mail.gmail.com/
 [1]
Link: 
https://lore.kernel.org/r/161340393568.1303470.4997526899111310530.st...@warthog.procyon.org.uk/
 # v3
---

 include/linux/netfs.h |   29 +
 1 file changed, 29 insertions(+)

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index cc1102040488..5ed8c6dc7453 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -26,4 +26,33 @@
 #define TestSetPageFsCache(page)   TestSetPagePrivate2((page))
 #define TestClearPageFsCache(page) TestClearPagePrivate2((page))
 
+/**
+ * unlock_page_fscache - Unlock a page that's locked with PG_fscache
+ * @page: The page
+ *
+ * Unlocks a page that's locked with PG_fscache and wakes up sleepers in
+ * wait_on_page_fscache().  This page bit is used by the netfs helpers when a
+ * netfs page is being written to a local disk cache, thereby allowing writes
+ * to the cache for the same page to be serialised.
+ */
+static inline void unlock_page_fscache(struct page *page)
+{
+   unlock_page_private_2(page);
+}
+
+/**
+ * wait_on_page_fscache - Wait for PG_fscache to be cleared on a page
+ * @page: The page
+ *
+ * Wait for the PG_fscache (PG_private_2) page bit to be removed from a page.
+ * This is, for example, used to handle a netfs page being written to a local
+ * disk cache, thereby allowing writes to the cache for the same page to be
+ * serialised.
+ */
+static inline void wait_on_page_fscache(struct page *page)
+{
+   if (PageFsCache(page))
+   wait_on_page_bit(compound_head(page), PG_fscache);
+}
+
 #endif /* _LINUX_NETFS_H */




Re: [PATCH v1 1/1] fs: Allow no_new_privs tasks to call chroot(2)

2021-03-10 Thread Eric W. Biederman
Mickaël Salaün  writes:

> From: Mickaël Salaün 
>
> Being able to easily change root directories enables some development
> workflows and can be used as a tool to strengthen
> unprivileged security sandboxes.  chroot(2) is not an access-control
> mechanism per se, but it can be used to limit the absolute view of the
> filesystem, and then limit ways to access data and kernel interfaces
> (e.g. /proc, /sys, /dev, etc.).

Actually chroot does not so limit the view of things.  It only limits
the default view.

A process that is chrooted can always escape by something like
chroot("../../../../../../../../..").

So I don't see the point of allowing chroot once you are in your locked
down sandbox.

> Users may not wish to expose namespace complexity to potentially
> malicious processes, or limit their use because of limited resources.
> The chroot feature is much more simple (and limited) than the mount
> namespace, but can still be useful.  As for containers, users of
> chroot(2) should take care of file descriptors or data accessible by
> other means (e.g. current working directory, leaked FDs, passed FDs,
> devices, mount points, etc.).  There is a lot of literature that discuss
> the limitations of chroot, and users of this feature should be aware of
> the multiple ways to bypass it.  Using chroot(2) for security purposes
> can make sense if it is combined with other features (e.g. dedicated
> user, seccomp, LSM access-controls, etc.).
>
> One could argue that chroot(2) is useless without a properly populated
> root hierarchy (i.e. without /dev and /proc).  However, there are
> multiple use cases that don't require the chrooting process to create
> file hierarchies with special files nor mount points, e.g.:
> * A process sandboxing itself, once all its libraries are loaded, may
>   not need files other than regular files, or even no file at all.
> * Some pre-populated root hierarchies could be used to chroot into,
>   provided for instance by development environments or tailored
>   distributions.
> * Processes executed in a chroot may not require access to these special
>   files (e.g. with minimal runtimes, or by emulating some special files
>   with a LD_PRELOADed library or seccomp).
>
> Allowing a task to change its own root directory is not a threat to the
> system if we can prevent confused deputy attacks, which could be
> performed through execution of SUID-like binaries.  This can be
> prevented if the calling task sets PR_SET_NO_NEW_PRIVS on itself with
> prctl(2).  To only affect this task, its filesystem information must not
> be shared with other tasks, which can be achieved by not passing
> CLONE_FS to clone(2).  A similar no_new_privs check is already used by
> seccomp to avoid the same kind of security issues.  Furthermore, because
> of its security use and to avoid giving a new way for attackers to get
> out of a chroot (e.g. using /proc//root), an unprivileged chroot is
> only allowed if the new root directory is the same or beneath the
> current one.  This still allows a process to use a subset of its
> legitimate filesystem to chroot into and then further reduce its view of
> the filesystem.
>
> This change may not impact systems relying on other permission models
> than POSIX capabilities (e.g. Tomoyo).  Being able to use chroot(2) on
> such systems may require to update their security policies.
>
> Only the chroot system call is relaxed with this no_new_privs check; the
> init_chroot() helper doesn't require such change.
>
> Allowing unprivileged users to use chroot(2) is one of the initial
> objectives of no_new_privs:
> https://www.kernel.org/doc/html/latest/userspace-api/no_new_privs.html
> This patch is a follow-up of a previous one sent by Andy Lutomirski, but
> with less limitations:
> https://lore.kernel.org/lkml/0e2f0f54e19bff53a3739ecfddb4ffa9a6dbde4d.1327858005.git.l...@amacapital.net/

Last time I remember talking architecture we agreed that user namespaces
would be used for enabling features and that no_new_privs would just be
used to lock-down userspace.  That way no_new_privs could be kept simple
and trivial to audit and understand.

You can build your sandbox and use chroot if you use a user namespace at
the start.  A mount namespace would also help lock things down.  Still
allowing chroot after the sandbox has been built, a seccomp filter has
been installed and no_new_privs has been enabled seems like it is asking
for trouble and may weaken existing sandboxes.

So I think we need a pretty compelling use case to consider allowing
chroot(2).  You haven't even mentioned what your usecase is at this
point so I don't know why we would tackle that complexity.

Eric



[PATCH v4 06/28] netfs, mm: Move PG_fscache helper funcs to linux/netfs.h

2021-03-10 Thread David Howells
Move the PG_fscache related helper funcs (such as SetPageFsCache()) to
linux/netfs.h rather than linux/fscache.h as the intention is to move to a
model where they're used by the network filesystem and the helper library,
but not by fscache/cachefiles itself.

Signed-off-by: David Howells 
cc: Matthew Wilcox 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/161340392347.1303470.18065131603507621762.st...@warthog.procyon.org.uk/
 # v3
---

 include/linux/fscache.h |   11 +--
 include/linux/netfs.h   |   29 +
 2 files changed, 30 insertions(+), 10 deletions(-)
 create mode 100644 include/linux/netfs.h

diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index a1c928fe98e7..1f8dc72369ee 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include <linux/netfs.h>
 
 #if defined(CONFIG_FSCACHE) || defined(CONFIG_FSCACHE_MODULE)
 #define fscache_available() (1)
@@ -29,16 +30,6 @@
 #endif
 
 
-/*
- * overload PG_private_2 to give us PG_fscache - this is used to indicate that
- * a page is currently backed by a local disk cache
- */
-#define PageFsCache(page)  PagePrivate2((page))
-#define SetPageFsCache(page)   SetPagePrivate2((page))
-#define ClearPageFsCache(page) ClearPagePrivate2((page))
-#define TestSetPageFsCache(page)   TestSetPagePrivate2((page))
-#define TestClearPageFsCache(page) TestClearPagePrivate2((page))
-
 /* pattern used to fill dead space in an index entry */
 #define FSCACHE_INDEX_DEADFILL_PATTERN 0x79
 
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
new file mode 100644
index ..cc1102040488
--- /dev/null
+++ b/include/linux/netfs.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Network filesystem support services.
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowe...@redhat.com)
+ *
+ * See:
+ *
+ * Documentation/filesystems/netfs_library.rst
+ *
+ * for a description of the network filesystem interface declared here.
+ */
+
+#ifndef _LINUX_NETFS_H
+#define _LINUX_NETFS_H
+
+#include <linux/pagemap.h>
+
+/*
+ * Overload PG_private_2 to give us PG_fscache - this is used to indicate that
+ * a page is currently backed by a local disk cache
+ */
+#define PageFsCache(page)  PagePrivate2((page))
+#define SetPageFsCache(page)   SetPagePrivate2((page))
+#define ClearPageFsCache(page) ClearPagePrivate2((page))
+#define TestSetPageFsCache(page)   TestSetPagePrivate2((page))
+#define TestClearPageFsCache(page) TestClearPagePrivate2((page))
+
+#endif /* _LINUX_NETFS_H */




[PATCH v4 04/28] netfs: Make a netfs helper module

2021-03-10 Thread David Howells
Make a netfs helper module to manage read request segmentation, caching
support and transparent huge page support on behalf of a network
filesystem.

Signed-off-by: David Howells 
Reviewed-by: Jeff Layton 
cc: Matthew Wilcox 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/160588496284.3465195.10102643717770106661.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161118135638.1232039.1622182202673126285.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161031028.2537118.1213974428943508753.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340391427.1303470.14884950716721956560.st...@warthog.procyon.org.uk/
 # v3
---

 fs/netfs/Kconfig |8 
 1 file changed, 8 insertions(+)
 create mode 100644 fs/netfs/Kconfig

diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig
new file mode 100644
index ..2ebf90e6ca95
--- /dev/null
+++ b/fs/netfs/Kconfig
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config NETFS_SUPPORT
+   tristate "Support for network filesystem high-level I/O"
+   help
+ This option enables support for network filesystems, including
+ helpers for high-level buffered I/O, abstracting out read
+ segmentation, local caching and transparent huge page support.




[PATCH v4 05/28] netfs: Documentation for helper library

2021-03-10 Thread David Howells
Add interface documentation for the netfs helper library.

Signed-off-by: David Howells 
---

 Documentation/filesystems/index.rst |1 
 Documentation/filesystems/netfs_library.rst |  526 +++
 2 files changed, 527 insertions(+)
 create mode 100644 Documentation/filesystems/netfs_library.rst

diff --git a/Documentation/filesystems/index.rst 
b/Documentation/filesystems/index.rst
index 1f76b1cb3348..d4853cb919d2 100644
--- a/Documentation/filesystems/index.rst
+++ b/Documentation/filesystems/index.rst
@@ -53,6 +53,7 @@ filesystem implementations.
journalling
fscrypt
fsverity
+   netfs_library
 
 Filesystems
 ===
diff --git a/Documentation/filesystems/netfs_library.rst 
b/Documentation/filesystems/netfs_library.rst
new file mode 100644
index ..57a641847818
--- /dev/null
+++ b/Documentation/filesystems/netfs_library.rst
@@ -0,0 +1,526 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=
+NETWORK FILESYSTEM HELPER LIBRARY
+=
+
+.. Contents:
+
+ - Overview.
+ - Buffered read helpers.
+   - Read helper functions.
+   - Read helper structures.
+   - Read helper operations.
+   - Read helper procedure.
+   - Read helper cache API.
+
+
+Overview
+
+
+The network filesystem helper library is a set of functions designed to aid a
+network filesystem in implementing VM/VFS operations.  For the moment, that
+just includes turning various VM buffered read operations into requests to read
+from the server.  The helper library, however, can also interpose other
+services, such as local caching or local data encryption.
+
+Note that the library module doesn't link against local caching directly, so
+access must be provided by the netfs.
+
+
+Buffered Read Helpers
+=
+
+The library provides a set of read helpers that handle the ->readpage(),
+->readahead() and much of the ->write_begin() VM operations and translate them
+into a common call framework.
+
+The following services are provided:
+
+ * Handles transparent huge pages (THPs).
+
+ * Insulates the netfs from VM interface changes.
+
+ * Allows the netfs to arbitrarily split reads up into pieces, even ones that
+   don't match page sizes or page alignments and that may cross pages.
+
+ * Allows the netfs to expand a readahead request in both directions to meet
+   its needs.
+
+ * Allows the netfs to partially fulfil a read, which will then be resubmitted.
+
+ * Handles local caching, allowing cached data and server-read data to be
+   interleaved for a single request.
+
+ * Handles clearing of buffer regions that aren't present on the server.
+
+ * Handles retrying of reads that failed, switching reads from the cache to the
+   server as necessary.
+
+ * In the future, this is a place where other services can be performed, such as
+   local encryption of data to be stored remotely or in the cache.
+
+From the network filesystem, the helpers require a table of operations.  This
+includes a mandatory method to issue a read operation along with a number of
+optional methods.
+
+
+Read Helper Functions
+-
+
+Three read helpers are provided::
+
+ * void netfs_readahead(struct readahead_control *ractl,
+   const struct netfs_read_request_ops *ops,
+   void *netfs_priv);
+ * int netfs_readpage(struct file *file,
+ struct page *page,
+ const struct netfs_read_request_ops *ops,
+ void *netfs_priv);
+ * int netfs_write_begin(struct file *file,
+struct address_space *mapping,
+loff_t pos,
+unsigned int len,
+unsigned int flags,
+struct page **_page,
+void **_fsdata,
+const struct netfs_read_request_ops *ops,
+void *netfs_priv);
+
+Each corresponds to a VM operation, with the addition of a couple of parameters
+for the use of the read helpers:
+
+ * ``ops``
+
+   A table of operations through which the helpers can talk to the filesystem.
+
+ * ``netfs_priv``
+
+   Filesystem private data (can be NULL).
+
+Both of these values will be stored into the read request structure.
+
+For ->readahead() and ->readpage(), the network filesystem should just jump
+into the corresponding read helper; whereas for ->write_begin(), it may be a
+little more complicated as the network filesystem might want to flush
+conflicting writes or track dirty data and needs to put the acquired page if an
+error occurs after calling the helper.
+
+The helpers manage the read request, calling back into the network filesystem
+through the supplied table of operations.  Waits will be performed as
+necessary before returning for helpers that are meant to be synchronous.
+
+If an error occurs and netfs_priv is non-NULL, ops->cleanup() will be called to
+deal with it.  If 

[PATCH v4 03/28] mm: Implement readahead_control pageset expansion

2021-03-10 Thread David Howells
Provide a function, readahead_expand(), that expands the set of pages
specified by a readahead_control object to encompass a revised area with a
proposed start and length.

The proposed area must include all of the old area and may be expanded yet
more by this function so that the edges align on (transparent huge) page
boundaries as allocated.

The expansion will be cut short if a page already exists in either of the
areas being expanded into.  Note that any expansion made in such a case is
not rolled back.

This will be used by fscache so that reads can be expanded to cache granule
boundaries, thereby allowing whole granules to be stored in the cache, but
there are other potential users also.
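
As a minimal sketch of the intended use (illustrative only; the 256KiB
granule size and the rounding policy are assumptions of the example):

    static void expand_to_granule(struct readahead_control *ractl)
    {
            /* Round the window out to whole 256KiB cache granules. */
            loff_t start = round_down(readahead_pos(ractl), 256 * 1024);
            size_t len = round_up(readahead_pos(ractl) +
                                  readahead_length(ractl),
                                  256 * 1024) - start;

            readahead_expand(ractl, start, len);
            /* Re-check ractl here; the expansion may have been cut short. */
    }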

Changes:
- Moved the declaration of readahead_expand() to a better place[1].

Suggested-by: Matthew Wilcox (Oracle) 
Signed-off-by: David Howells 
cc: Matthew Wilcox (Oracle) 
cc: Alexander Viro 
cc: Christoph Hellwig 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: https://lore.kernel.org/r/20210217161358.gm2858...@casper.infradead.org/ 
[1]
Link: 
https://lore.kernel.org/r/159974633888.2094769.8326206446358128373.st...@warthog.procyon.org.uk/
Link: 
https://lore.kernel.org/r/160588479816.3465195.553952688795241765.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161118131787.1232039.4863969952441067985.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161028670.2537118.13831420617039766044.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340389201.1303470.14353807284546854878.st...@warthog.procyon.org.uk/
 # v3
---

 include/linux/pagemap.h |2 +
 mm/readahead.c  |   70 +++
 2 files changed, 72 insertions(+)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index ac91b33f3873..bf05e99ce588 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -818,6 +818,8 @@ void page_cache_sync_ra(struct readahead_control *, struct 
file_ra_state *,
unsigned long req_count);
 void page_cache_async_ra(struct readahead_control *, struct file_ra_state *,
struct page *, unsigned long req_count);
+void readahead_expand(struct readahead_control *ractl,
+ loff_t new_start, size_t new_len);
 
 /**
  * page_cache_sync_readahead - generic file readahead
diff --git a/mm/readahead.c b/mm/readahead.c
index c5b0457415be..4446dada0bc2 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -638,3 +638,73 @@ SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, 
size_t, count)
 {
return ksys_readahead(fd, offset, count);
 }
+
+/**
+ * readahead_expand - Expand a readahead request
+ * @ractl: The request to be expanded
+ * @new_start: The revised start
+ * @new_len: The revised size of the request
+ *
+ * Attempt to expand a readahead request outwards from the current size to the
+ * specified size by inserting locked pages before and after the current window
+ * to increase the size to the new window.  This may involve the insertion of
+ * THPs, in which case the window may get expanded even beyond what was
+ * requested.
+ *
+ * The algorithm will stop if it encounters a conflicting page already in the
+ * pagecache and leave a smaller expansion than requested.
+ *
+ * The caller must check for this by examining the revised @ractl object for a
+ * different expansion than was requested.
+ */
+void readahead_expand(struct readahead_control *ractl,
+ loff_t new_start, size_t new_len)
+{
+   struct address_space *mapping = ractl->mapping;
+   pgoff_t new_index, new_nr_pages;
+   gfp_t gfp_mask = readahead_gfp_mask(mapping);
+
+   new_index = new_start / PAGE_SIZE;
+
+   /* Expand the leading edge downwards */
+   while (ractl->_index > new_index) {
+   unsigned long index = ractl->_index - 1;
+   struct page *page = xa_load(&mapping->i_pages, index);
+
+   if (page && !xa_is_value(page))
+   return; /* Page apparently present */
+
+   page = __page_cache_alloc(gfp_mask);
+   if (!page)
+   return;
+   if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) {
+   put_page(page);
+   return;
+   }
+
+   ractl->_nr_pages++;
+   ractl->_index = page->index;
+   }
+
+   new_len += new_start - readahead_pos(ractl);
+   new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE);
+
+   /* Expand the trailing edge upwards */
+   while (ractl->_nr_pages < new_nr_pages) {
+   unsigned long index = ractl->_index + ractl->_nr_pages;
+   struct page *page = xa_load(&mapping->i_pages, index);
+
+

Re: [PATCH] net: bonding: fix error return code of bond_neigh_init()

2021-03-10 Thread Eric Dumazet



On 3/10/21 10:24 AM, Roi Dayan wrote:
> 
> 
> On 2021-03-08 5:11 AM, Jia-Ju Bai wrote:
>> When slave is NULL or slave_ops->ndo_neigh_setup is NULL, no error
>> return code of bond_neigh_init() is assigned.
>> To fix this bug, ret is assigned with -EINVAL in these cases.
>>
>> Fixes: 9e99bfefdbce ("bonding: fix bond_neigh_init()")
>> Reported-by: TOTE Robot 
>> Signed-off-by: Jia-Ju Bai 
>> ---
>>   drivers/net/bonding/bond_main.c | 8 ++--
>>   1 file changed, 6 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/net/bonding/bond_main.c 
>> b/drivers/net/bonding/bond_main.c
>> index 74cbbb22470b..456315bef3a8 100644
>> --- a/drivers/net/bonding/bond_main.c
>> +++ b/drivers/net/bonding/bond_main.c
>> @@ -3978,11 +3978,15 @@ static int bond_neigh_init(struct neighbour *n)
>>     rcu_read_lock();
>>   slave = bond_first_slave_rcu(bond);
>> -    if (!slave)
>> +    if (!slave) {
>> +    ret = -EINVAL;
>>   goto out;
>> +    }
>>   slave_ops = slave->dev->netdev_ops;
>> -    if (!slave_ops->ndo_neigh_setup)
>> +    if (!slave_ops->ndo_neigh_setup) {
>> +    ret = -EINVAL;
>>   goto out;
>> +    }
>>     /* TODO: find another way [1] to implement this.
>>    * Passing a zeroed structure is fragile,
>>
> 
> 
> Hi,
> 
> This breaks basic functionality that always worked. A slave doesn't need
> to exist nor to implement ndo_neigh_setup.
> Now trying to add a neigh entry fails because of that.
> This commit needs to be reverted.
> 
> Thanks,
> Roi

Agreed, this commit made no sense, please revert.


[PATCH v4 02/28] mm: Add an unlock function for PG_private_2/PG_fscache

2021-03-10 Thread David Howells
Add a function, unlock_page_private_2(), to unlock PG_private_2 analogous
to unlock_page() for PG_locked.  Add a kerneldoc banner indicating the
example usage case.

A wrapper will need to be placed in the netfs header in the patch that adds
that.
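
Presumably a trivial static inline, something like this sketch:

    /* In the netfs header: */
    static inline void unlock_page_fscache(struct page *page)
    {
            unlock_page_private_2(page);
    }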

[This implements a suggestion by Linus[1] to not mix the terminology of
 PG_private_2 and PG_fscache in the mm core function]

Changes:
- Remove extern from the declaration[2].

Suggested-by: Linus Torvalds 
Signed-off-by: David Howells 
Reviewed-by: Christoph Hellwig 
cc: Matthew Wilcox (Oracle) 
cc: Alexander Viro 
cc: Christoph Hellwig 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: https://lore.kernel.org/r/1330473.1612974...@warthog.procyon.org.uk/ # v1
Link: 
https://lore.kernel.org/r/CAHk-=wjgA-74ddehziVk=xaemtkswpu1yw4uaro1r3ibs27...@mail.gmail.com/
 [1]
Link: https://lore.kernel.org/r/20210216102659.ga27...@lst.de/ [2]
Link: 
https://lore.kernel.org/r/161340387944.1303470.7944159520278177652.st...@warthog.procyon.org.uk/
 # v3
---

 include/linux/pagemap.h |1 +
 mm/filemap.c|   20 
 2 files changed, 21 insertions(+)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 20225b067583..ac91b33f3873 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -591,6 +591,7 @@ extern int __lock_page_async(struct page *page, struct 
wait_page_queue *wait);
 extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
unsigned int flags);
 extern void unlock_page(struct page *page);
+void unlock_page_private_2(struct page *page);
 
 /*
  * Return true if the page was successfully locked
diff --git a/mm/filemap.c b/mm/filemap.c
index 43700480d897..925964b67583 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1432,6 +1432,26 @@ void unlock_page(struct page *page)
 }
 EXPORT_SYMBOL(unlock_page);
 
+/**
+ * unlock_page_private_2 - Unlock a page that's locked with PG_private_2
+ * @page: The page
+ *
+ * Unlocks a page that's locked with PG_private_2 and wakes up sleepers in
+ * wait_on_page_private_2().
+ *
+ * This is, for example, used when a netfs page is being written to a local
+ * disk cache, thereby allowing writes to the cache for the same page to be
+ * serialised.
+ */
+void unlock_page_private_2(struct page *page)
+{
+   page = compound_head(page);
+   VM_BUG_ON_PAGE(!PagePrivate2(page), page);
+   clear_bit_unlock(PG_private_2, >flags);
+   wake_up_page_bit(page, PG_private_2);
+}
+EXPORT_SYMBOL(unlock_page_private_2);
+
 /**
  * end_page_writeback - end writeback against a page
  * @page: the page




Re: [PATCH] kbuild: remove unneeded -O option to dtc

2021-03-10 Thread Rob Herring
On Wed, Mar 10, 2021 at 4:09 AM Masahiro Yamada  wrote:
>
> This piece of code converts the target suffix to the dtc -O option:
>
> *.dtb  ->  -O dtb
> *.dt.yaml  ->  -O yaml
>
> Commit ce88c9c79455 ("kbuild: Add support to build overlays (%.dtbo)")
> added the third case:
>
> *.dtbo ->  -O dtbo
>
> This works thanks to commit 163f0469bf2e ("dtc: Allow overlays to have
> .dtbo extension") in the upstream DTC, which has already been pulled in
> the kernel.
>
> However, I think it is a bit odd because "dtbo" is not a format name.
> At least, it does not show up in the help message of dtc.
>
> $ scripts/dtc/dtc --help
>   [ snip ]
>   -O, --out-format 
> Output formats are:
> dts - device tree source text
> dtb - device tree blob
> yaml - device tree encoded as YAML
> asm - assembler source
>
> So, I am not a big fan of the second hunk of that change:
>
> } else if (streq(outform, "dtbo")) {
> dt_to_blob(outf, dti, outversion);
>
> Anyway, we did not need to do this in Makefile in the first place.
>
> guess_type_by_name() had already understood ".yaml" before commit
> 4f0e3a57d6eb ("kbuild: Add support for DT binding schema checks"),
> and now does ".dtbo" as well.
>
> Makefile does not need to duplicate the same logic. Let's leave it
> to dtc.
>
> Signed-off-by: Masahiro Yamada 
> ---
>
>  scripts/Makefile.lib | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
> index eee59184de64..90a4e04cd8f5 100644
> --- a/scripts/Makefile.lib
> +++ b/scripts/Makefile.lib
> @@ -327,7 +327,7 @@ $(obj)/%.dtb.S: $(obj)/%.dtb FORCE
>
>  quiet_cmd_dtc = DTC $@
>  cmd_dtc = $(HOSTCC) -E $(dtc_cpp_flags) -x assembler-with-cpp -o $(dtc-tmp) 
> $< ; \
> -   $(DTC) -O $(patsubst .%,%,$(suffix $@)) -o $@ -b 0 \
> +   $(DTC) -o $@ -b 0 \
> $(addprefix -i,$(dir $<) $(DTC_INCLUDE)) $(DTC_FLAGS) \
> -d $(depfile).dtc.tmp $(dtc-tmp) ; \
> cat $(depfile).pre.tmp $(depfile).dtc.tmp > $(depfile)

I think you should also drop 'yaml' from:

$(call if_changed_rule,dtc,yaml)

Though that's somewhat separate, so either way:

Acked-by: Rob Herring 

Rob


Re: [PATCH] fb_defio: Remove custom address_space_operations

2021-03-10 Thread Christoph Hellwig
On Wed, Mar 10, 2021 at 01:51:28PM +, Matthew Wilcox (Oracle) wrote:
> There's no need to give the page an address_space.  Leaving the
> page->mapping as NULL will cause the VM to handle set_page_dirty()
> the same way that it's set now, and that was the only reason to
> set the address_space in the first place.
> 
> Signed-off-by: Matthew Wilcox (Oracle) 


Looks good:

Reviewed-by: Christoph Hellwig 


[PATCH v4 01/28] iov_iter: Add ITER_XARRAY

2021-03-10 Thread David Howells
Add an iterator, ITER_XARRAY, that walks through a set of pages attached to
an xarray, starting at a given page and offset and walking for the
specified amount of bytes.  The iterator supports transparent huge pages.

The iterate_xarray() macro calls the helper function with the RCU read
lock held.  I think that this is only a problem for iov_iter_for_each_range()
- and that returns an error for ITER_XARRAY (also, this function does not
appear to be called).

The caller must guarantee that the pages are all present and they must be
locked using PG_locked, PG_writeback or PG_fscache to prevent them from
going away or being migrated whilst they're being accessed.

This is useful for copying data from socket buffers to inodes in network
filesystems and for transferring data between those inodes and the cache
using direct I/O.

Whilst it is true that ITER_BVEC could be used instead, that would require
a bio_vec array to be allocated to refer to all the pages - which should be
redundant if inode->i_pages also points to all these pages.

Note that older versions of this patch implemented an ITER_MAPPING instead,
which was almost the same.
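
For illustration (not code from this patch; start and len are placeholders),
a netfs might describe a span of an inode's pagecache like so:

    struct iov_iter iter;

    /*
     * Cover len bytes at file position start, backed by the pages in
     * inode->i_mapping->i_pages; the pages must already be present and
     * locked as described above.
     */
    iov_iter_xarray(&iter, READ, &inode->i_mapping->i_pages, start, len);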

Signed-off-by: David Howells 
cc: Alexander Viro 
cc: Matthew Wilcox (Oracle) 
cc: Christoph Hellwig 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: https://lore.kernel.org/r/3577430.1579705...@warthog.procyon.org.uk/ # rfc
Link: 
https://lore.kernel.org/r/158861205740.340223.16592990225607814022.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/159465785214.1376674.6062549291411362531.st...@warthog.procyon.org.uk/
Link: 
https://lore.kernel.org/r/160588477334.3465195.3608963255682568730.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161118129703.1232039.17141248432017826976.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161026313.2537118.14676007075365418649.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340386671.1303470.10752208972482479840.st...@warthog.procyon.org.uk/
 # v3
---

 include/linux/uio.h |   11 ++
 lib/iov_iter.c  |  313 +++
 2 files changed, 301 insertions(+), 23 deletions(-)

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 27ff8eb786dc..5f5ffc45d4aa 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -10,6 +10,7 @@
 #include 
 
 struct page;
+struct address_space;
 struct pipe_inode_info;
 
 struct kvec {
@@ -24,6 +25,7 @@ enum iter_type {
ITER_BVEC = 16,
ITER_PIPE = 32,
ITER_DISCARD = 64,
+   ITER_XARRAY = 128,
 };
 
 struct iov_iter {
@@ -39,6 +41,7 @@ struct iov_iter {
const struct iovec *iov;
const struct kvec *kvec;
const struct bio_vec *bvec;
+   struct xarray *xarray;
struct pipe_inode_info *pipe;
};
union {
@@ -47,6 +50,7 @@ struct iov_iter {
unsigned int head;
unsigned int start_head;
};
+   loff_t xarray_start;
};
 };
 
@@ -80,6 +84,11 @@ static inline bool iov_iter_is_discard(const struct iov_iter 
*i)
return iov_iter_type(i) == ITER_DISCARD;
 }
 
+static inline bool iov_iter_is_xarray(const struct iov_iter *i)
+{
+   return iov_iter_type(i) == ITER_XARRAY;
+}
+
 static inline unsigned char iov_iter_rw(const struct iov_iter *i)
 {
return i->type & (READ | WRITE);
@@ -221,6 +230,8 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int 
direction, const struct bio_
 void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct 
pipe_inode_info *pipe,
size_t count);
 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t 
count);
+void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray 
*xarray,
+loff_t start, size_t count);
 ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
size_t maxsize, unsigned maxpages, size_t *start);
 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f66c62aa7154..f808c625c11e 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -76,7 +76,44 @@
}   \
 }
 
-#define iterate_all_kinds(i, n, v, I, B, K) {  \
+#define iterate_xarray(i, n, __v, skip, STEP) {\
+   struct page *head = NULL;   \
+   size_t wanted = n, seg, offset; \
+   loff_t start = i->xarray_start + skip;  \
+   pgoff_t index = start >> PAGE_SHIFT;\
+   int j; 

[PATCH v4 00/28] Network fs helper library & fscache kiocb API

2021-03-10 Thread David Howells


Here's a set of patches to do two things:

 (1) Add a helper library to handle the new VM readahead interface.  This
 is intended to be used unconditionally by the filesystem (whether or
 not caching is enabled) and provides a common framework for doing
 caching, transparent huge pages and, in the future, possibly fscrypt
 and read bandwidth maximisation.  It also allows the netfs and the
 cache to align, expand and slice up a read request from the VM in
 various ways; the netfs need only provide a function to read a stretch
 of data to the pagecache and the helper takes care of the rest.

 (2) Add an alternative fscache/cachefiles I/O API that uses the kiocb
 facility to do async DIO to transfer data to/from the netfs's pages,
 rather than using readpage with wait queue snooping on one side and
 vfs_write() on the other.  It also uses less memory, since it doesn't
 do buffered I/O on the backing file.

 Note that this uses SEEK_HOLE/SEEK_DATA to locate the data available
 to be read from the cache.  Whilst this is an improvement from the
 bmap interface, it still has a problem with regard to a modern
 extent-based filesystem inserting or removing bridging blocks of
 zeros.  Fixing that requires a much greater overhaul.
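
     Purely as an illustration (with backing_file standing in for the
     cache's open backing file), the probing in (2) amounts to roughly:

	loff_t start, end;

	start = vfs_llseek(backing_file, pos, SEEK_DATA);
	if (start < 0)
		return start;	/* error, or no data at or after pos */
	end = vfs_llseek(backing_file, start, SEEK_HOLE);
	/* [start, end) can come from the cache; the rest needs the server. */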

This is a step towards overhauling the fscache API.  The change is opt-in
on the part of the network filesystem.  A netfs should not try to mix the
old and the new API because of conflicting ways of handling pages and the
PG_fscache page flag and because it would be mixing DIO with buffered I/O.
Further, the helper library can't be used with the old API.

This does not change any of the fscache cookie handling APIs or the way
invalidation is done.

In the near term, I intend to deprecate and remove the old I/O API
(fscache_allocate_page{,s}(), fscache_read_or_alloc_page{,s}(),
fscache_write_page() and fscache_uncache_page()) and eventually replace
most of fscache/cachefiles with something simpler and easier to follow.

The patchset contains the following parts:

 (1) Some helper patches, including provision of an ITER_XARRAY iov
 iterator and a function to do readahead expansion.

 (2) Patches to add the netfs helper library.

 (3) A patch to add the fscache/cachefiles kiocb API.

 (4) Patches to add support in AFS for this.

Jeff Layton has patches to add support in Ceph for this.

Dave Wysochanski also has patches for NFS for this, though they're not
included on this branch as there's an issue with PNFS.

With this, AFS without a cache passes all expected xfstests; with a cache,
there's an extra failure, but that's also there before these patches.
Fixing that probably requires a greater overhaul.  Ceph and NFS also pass
the expected tests.

These patches can be found also on:


https://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git/log/?h=fscache-netfs-lib


Changes
===

ver #4:
  Fixed some review comments from Christoph Hellwig, including dropping
  the export of rw_verify_area()[3] and some minor stuff[4].

  Moved the declaration of readahead_expand() to a better location[5].

  Rebased to v5.12-rc2 and added a bunch of references into individual
  commits.

  Dropped Ceph support - that will go through the maintainer's tree.

  Added interface documentation for the netfs helper library.

ver #3:
  Rolled in the bug fixes.

  Adjusted the functions that unlock and wait for PG_fscache according
  to Linus's suggestion[1].

  Hold a ref on a page when PG_fscache is set as per Linus's
  suggestion[2].

  Dropped NFS support and added Ceph support.

ver #2:
  Fixed some bugs and added NFS support.

Link: 
https://lore.kernel.org/r/CAHk-=wh+2gbF7XEjYc=HV9w_2uVzVf7vs60BPz0gFA=+pum...@mail.gmail.com/
 [1]
Link: 
https://lore.kernel.org/r/CAHk-=wjgA-74ddehziVk=xaemtkswpu1yw4uaro1r3ibs27...@mail.gmail.com/
 [2]
Link: https://lore.kernel.org/r/20210216102614.ga27...@lst.de/ [3]
Link: https://lore.kernel.org/r/20210216084230.ga23...@lst.de/ [4]
Link: https://lore.kernel.org/r/20210217161358.gm2858...@casper.infradead.org/ 
[5]


References
==

These patches have been published for review before, firstly as part of a
larger set:

Link: 
https://lore.kernel.org/r/158861203563.340223.7585359869938129395.st...@warthog.procyon.org.uk/

Link: 
https://lore.kernel.org/r/159465766378.1376105.11619976251039287525.st...@warthog.procyon.org.uk/
Link: 
https://lore.kernel.org/r/159465784033.1376674.18106463693989811037.st...@warthog.procyon.org.uk/
Link: 
https://lore.kernel.org/r/159465821598.1377938.2046362270225008168.st...@warthog.procyon.org.uk/

Link: 
https://lore.kernel.org/r/160588455242.3465195.3214733858273019178.st...@warthog.procyon.org.uk/

Then as a cut-down set:

Link: 
https://lore.kernel.org/r/161118128472.1232039.11746799833066425131.st...@warthog.procyon.org.uk/
 # v1

Link: 

Re: [PATCH V2 1/25] x86/cpufeatures: Enumerate Intel Hybrid Technology feature bit

2021-03-10 Thread Borislav Petkov
On Wed, Mar 10, 2021 at 08:37:37AM -0800, kan.li...@linux.intel.com wrote:
> From: Ricardo Neri 
> 
> Add feature enumeration to identify a processor with Intel Hybrid
> Technology: one in which CPUs of more than one type are in the same package.
> On a hybrid processor, all CPUs support the same homogeneous (i.e.,
> symmetric) instruction set. All CPUs enumerate the same features in CPUID.
> Thus, software (user space and kernel) can run and migrate to any CPU in
> the system as well as utilize any of the enumerated features without any
> change or special provisions. The main differences among CPUs in a hybrid
> processor are power and performance properties.
> 
> Cc: Andi Kleen 
> Cc: Kan Liang 
> Cc: "Peter Zijlstra (Intel)" 
> Cc: "Rafael J. Wysocki" 
> Cc: "Ravi V. Shankar" 
> Cc: Srinivas Pandruvada 
> Cc: linux-kernel@vger.kernel.org
> Reviewed-by: Len Brown 
> Reviewed-by: Tony Luck 
> Signed-off-by: Ricardo Neri 
> ---
> Changes since v1 (as part of patchset for perf change for Alderlake)
>  * None
> 
> Changes since v1 (in a separate posting):
>  * Reworded commit message to clearly state what is Intel Hybrid
>Technology. Stress that all CPUs can run the same instruction
>set and support the same features.
> ---
>  arch/x86/include/asm/cpufeatures.h | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/arch/x86/include/asm/cpufeatures.h 
> b/arch/x86/include/asm/cpufeatures.h
> index cc96e26d69f7..e7cfc9eedf8d 100644
> --- a/arch/x86/include/asm/cpufeatures.h
> +++ b/arch/x86/include/asm/cpufeatures.h
> @@ -374,6 +374,7 @@
>  #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */
>  #define X86_FEATURE_TSX_FORCE_ABORT  (18*32+13) /* "" TSX_FORCE_ABORT */
>  #define X86_FEATURE_SERIALIZE(18*32+14) /* SERIALIZE 
> instruction */
> +#define X86_FEATURE_HYBRID_CPU   (18*32+15) /* This part has 
> CPUs of more than one type */

  /* "" This ...

unless you have a valid use case for "hybrid_cpu" being present there.

Thx.

-- 
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette


Re: [PATCH V2] ASoC: soc-core: Prevent warning if no DMI table is present

2021-03-10 Thread Mark Brown
On Wed, Mar 10, 2021 at 10:41:18AM -0600, Pierre-Louis Bossart wrote:

> would this work?

> if (!IS_ENABLED(CONFIG_DMI))
> return 0;

Build time dependencies aren't going to help anything, arm64 (and to my
understanding some future x86 systems, LynxPoint IIRC) supports both DT
and ACPI and so you have kernels built with support for both.
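
A runtime test would look more like this sketch (dmi_available is set by
the DMI scanning code when a table is found; assumes CONFIG_DMI):

    #include <linux/dmi.h>

	if (!dmi_available)
		return 0;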




[RESEND] dt-bindings: spi: Convert NXP flexspi to json schema

2021-03-10 Thread Kuldeep Singh
Convert the NXP FlexSPI binding to DT schema format using json-schema.

Signed-off-by: Kuldeep Singh 
---
 .../bindings/spi/nxp,spi-nxp-fspi.yaml| 85 +++
 .../devicetree/bindings/spi/spi-nxp-fspi.txt  | 43 --
 MAINTAINERS   |  2 +-
 3 files changed, 86 insertions(+), 44 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/spi/nxp,spi-nxp-fspi.yaml
 delete mode 100644 Documentation/devicetree/bindings/spi/spi-nxp-fspi.txt

diff --git a/Documentation/devicetree/bindings/spi/nxp,spi-nxp-fspi.yaml 
b/Documentation/devicetree/bindings/spi/nxp,spi-nxp-fspi.yaml
new file mode 100644
index ..e3f2c5aae847
--- /dev/null
+++ b/Documentation/devicetree/bindings/spi/nxp,spi-nxp-fspi.yaml
@@ -0,0 +1,85 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/spi/nxp,spi-nxp-fspi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NXP Flex Serial Peripheral Interface (FSPI)
+
+maintainers:
+  - Ashish Kumar 
+
+allOf:
+  - $ref: "spi-controller.yaml#"
+
+properties:
+  compatible:
+enum:
+  - nxp,lx2160a-fspi
+  - nxp,imx8qxp-fspi
+  - nxp,imx8mm-fspi
+  - nxp,imx8dxl-fspi
+
+  reg:
+items:
+  - description: registers
+  - description: memory mapping
+
+  reg-names:
+items:
+  - const: fspi_base
+  - const: fspi_mmap
+
+  interrupts:
+maxItems: 1
+
+  clocks:
+items:
+  - description: SoC SPI fspi_en clock
+  - description: SoC SPI fspi clock
+
+  clock-names:
+items:
+  - const: fspi_en
+  - const: fspi
+
+required:
+  - compatible
+  - reg
+  - reg-names
+  - interrupts
+  - clocks
+  - clock-names
+
+unevaluatedProperties: false
+
+examples:
+  - |
+#include <dt-bindings/clock/fsl,qoriq-clockgen.h>
+#include <dt-bindings/interrupt-controller/arm-gic.h>
+
+soc {
+#address-cells = <2>;
+#size-cells = <2>;
+
+spi@20c {
+compatible = "nxp,lx2160a-fspi";
+reg = <0x0 0x20c 0x0 0x10>,
+  <0x0 0x2000 0x0 0x1000>;
+reg-names = "fspi_base", "fspi_mmap";
+interrupts = <GIC_SPI 25 IRQ_TYPE_LEVEL_HIGH>;
+clocks = <&clockgen QORIQ_CLK_PLATFORM_PLL QORIQ_CLK_PLL_DIV(4)>,
+ <&clockgen QORIQ_CLK_PLATFORM_PLL QORIQ_CLK_PLL_DIV(4)>;
+clock-names = "fspi_en", "fspi";
+#address-cells = <1>;
+#size-cells = <0>;
+
+flash@0 {
+compatible = "jedec,spi-nor";
+spi-max-frequency = <5000>;
+reg = <0>;
+spi-rx-bus-width = <8>;
+spi-tx-bus-width = <8>;
+};
+};
+};
diff --git a/Documentation/devicetree/bindings/spi/spi-nxp-fspi.txt 
b/Documentation/devicetree/bindings/spi/spi-nxp-fspi.txt
deleted file mode 100644
index df178d1b62e6..
--- a/Documentation/devicetree/bindings/spi/spi-nxp-fspi.txt
+++ /dev/null
@@ -1,43 +0,0 @@
-* NXP Flex Serial Peripheral Interface (FSPI)
-
-Required properties:
-  - compatible : Should be "nxp,lx2160a-fspi"
-   "nxp,imx8qxp-fspi"
-   "nxp,imx8mm-fspi"
-   "nxp,imx8dxl-fspi"
-
-  - reg :First contains the register location and length,
- Second contains the memory mapping address and length
-  - reg-names :  Should contain the resource reg names:
-- fspi_base: configuration register address space
- - fspi_mmap: memory mapped address space
-  - interrupts : Should contain the interrupt for the device
-
-Required SPI slave node properties:
-  - reg :There are two buses (A and B) with two chip selects each.
- This encodes to which bus and CS the flash is connected:
- - <0>: Bus A, CS 0
- - <1>: Bus A, CS 1
- - <2>: Bus B, CS 0
- - <3>: Bus B, CS 1
-
-Example showing the usage of two SPI NOR slave devices on bus A:
-
-fspi0: spi@20c {
-   compatible = "nxp,lx2160a-fspi";
-   reg = <0x0 0x20c 0x0 0x1>, <0x0 0x2000 0x0 0x1000>;
-   reg-names = "fspi_base", "fspi_mmap";
-   interrupts = <0 25 0x4>; /* Level high type */
-   clocks = <&clockgen 4 3>, <&clockgen 4 3>;
-   clock-names = "fspi_en", "fspi";
-
-   mt35xu512aba0: flash@0 {
-   reg = <0>;
-   
-   };
-
-   mt35xu512aba1: flash@1 {
-   reg = <1>;
-   
-   };
-};
diff --git a/MAINTAINERS b/MAINTAINERS
index d92f85ca831d..8729f7b50945 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12832,7 +12832,7 @@ M:  Ashish Kumar 
 R: Yogesh Gaur 
 L: linux-...@vger.kernel.org
 S: Maintained
-F: Documentation/devicetree/bindings/spi/spi-nxp-fspi.txt
+F: Documentation/devicetree/bindings/spi/nxp,spi-nxp-fspi.yaml
 F: drivers/spi/spi-nxp-fspi.c
 
 NXP FXAS21002C DRIVER
-- 
2.25.1



Re: [PATCH V2] ASoC: soc-core: Prevent warning if no DMI table is present

2021-03-10 Thread Mark Brown
On Wed, Mar 10, 2021 at 05:37:25PM +0100, Takashi Iwai wrote:
> Mark Brown wrote:

> > > did you mean if (!IS_ENABLED(CONFIG_ACPI)) ?

> > Is there a runtime check?

> Well, basically both DMI and ACPI are completely different things, so
> I don't think it's right to check the availability of ACPI as a signal
> of the availability of DMI.

In theory they are only somewhat related, but in practice they're both
part of a holistic system model - ACPI users complain if their system
does not also provide DMI information.




Re: [PATCH] i2c: i2c-scmi: Drop unused ACPI_MODULE_NAME definition

2021-03-10 Thread Rafael J. Wysocki
On Wed, Mar 10, 2021 at 5:08 PM Wolfram Sang  wrote:
>
> On Wed, Mar 10, 2021 at 03:47:10PM +0100, Rafael J. Wysocki wrote:
> > On Fri, Mar 5, 2021 at 7:29 PM Rafael J. Wysocki  wrote:
> > >
> > > From: Rafael J. Wysocki 
> > >
> > > The ACPI_MODULE_NAME() definition is only used by the message
> > > printing macros from ACPICA that are not used by the code in
> > > question, so it is redundant.  Drop it.
> > >
> > > No functional impact.
> > >
> > > Signed-off-by: Rafael J. Wysocki 
> >
> > If there are no concerns regarding this, I'll queue it up for 5.13 in
> > the ACPI tree, thanks!
>
> I'd prefer the I2C tree a tad to avoid conflicts. Any reason for the
> ACPI tree?

There are some patches doing this type of a cleanup in the ACPI tree,
but this is the only reason, so please route it through the i2c tree
if that is preferred.


Re: [PATCH V2 16/25] perf/x86: Register hybrid PMUs

2021-03-10 Thread Dave Hansen
On 3/10/21 8:37 AM, kan.li...@linux.intel.com wrote:
> - err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
> - if (err)
> - goto out2;
> + if (!is_hybrid()) {
> + err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
> + if (err)
> + goto out2;
> + } else {
> + u8 cpu_type = get_hybrid_cpu_type(smp_processor_id());
> + struct x86_hybrid_pmu *hybrid_pmu;
> + int i;

Where's the preempt_disable()?

> +static void init_hybrid_pmu(int cpu)
> +{
> + unsigned int fixed_mask, unused_eax, unused_ebx, unused_edx;
> + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
> + u8 cpu_type = get_hybrid_cpu_type(cpu);
> + struct x86_hybrid_pmu *pmu = NULL;
> + struct perf_cpu_context *cpuctx;
> + int i;

Ditto.

Are we really sure the IPIs are worth the trouble?  Why don't we just
cache the leaf when we bring the CPU up like just about every other
thing we read from CPUID?
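
Something like this sketch, say (names illustrative; the helper would read
CPUID leaf 0x1A on the local CPU from a hotplug callback):

    static DEFINE_PER_CPU(u8, hybrid_cpu_type);

    static int hybrid_cpu_starting(unsigned int cpu)
    {
            /* Runs on the incoming CPU, so CPUID reports its own type. */
            this_cpu_write(hybrid_cpu_type, get_this_hybrid_cpu_type());
            return 0;
    }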


Re: [PATCH] arm64: dts: qcom: sc7280: Add WPSS remoteproc node

2021-03-10 Thread Bjorn Andersson
On Wed 10 Mar 01:37 CST 2021, Rakesh Pillai wrote:

> Add the WPSS remoteproc node in dts for
> PIL loading.
> 
> Signed-off-by: Rakesh Pillai 
> ---
> - This change is dependent on the below patch series
> 1) https://lore.kernel.org/patchwork/project/lkml/list/?series=487403
> 2) https://lore.kernel.org/patchwork/project/lkml/list/?series=488365
> ---
>  arch/arm64/boot/dts/qcom/sc7280-idp.dts |  4 +++
>  arch/arm64/boot/dts/qcom/sc7280.dtsi| 47 
> +
>  2 files changed, 51 insertions(+)
> 
> diff --git a/arch/arm64/boot/dts/qcom/sc7280-idp.dts 
> b/arch/arm64/boot/dts/qcom/sc7280-idp.dts
> index 950ecb2..603f56b 100644
> --- a/arch/arm64/boot/dts/qcom/sc7280-idp.dts
> +++ b/arch/arm64/boot/dts/qcom/sc7280-idp.dts
> @@ -26,6 +26,10 @@
>   status = "okay";
>  };
>  
> +&remoteproc_wpss {
> + status = "okay";
> +};
> +
>   {
>   status = "okay";
>  };
> diff --git a/arch/arm64/boot/dts/qcom/sc7280.dtsi 
> b/arch/arm64/boot/dts/qcom/sc7280.dtsi
> index 8af6d77..26dd466 100644
> --- a/arch/arm64/boot/dts/qcom/sc7280.dtsi
> +++ b/arch/arm64/boot/dts/qcom/sc7280.dtsi
> @@ -53,6 +53,16 @@
>   no-map;
>   reg = <0x0 0x80b0 0x0 0x10>;
>   };
> +
> + wlan_fw_mem: memory@80c0 {
> + no-map;
> + reg = <0x0 0x80c0 0x0 0xc0>;
> + };
> +
> + wpss_mem: memory@9ae0 {
> + no-map;
> + reg = <0x0 0x9ae0 0x0 0x190>;
> + };
>   };
>  
>   cpus {
> @@ -305,6 +315,43 @@
>   };
>   };
>  
> + remoteproc_wpss: remoteproc@8a0 {
> + compatible = "qcom,sc7280-wpss-pil";
> + reg = <0 0x08a0 0 0x1>;
> +
> + interrupts-extended = <&intc GIC_SPI 587 
> IRQ_TYPE_EDGE_RISING>,
> +   <&wpss_smp2p_in 0 0>,

IRQ_TYPE_NONE?

Apart from that this looks good.

Regards,
Bjorn

> +   <&wpss_smp2p_in 1 0>,
> +   <&wpss_smp2p_in 2 0>,
> +   <&wpss_smp2p_in 3 0>,
> +   <&wpss_smp2p_in 7 0>;
> + interrupt-names = "wdog", "fatal", "ready", "handover",
> +   "stop-ack", "shutdown-ack";
> +
> + memory-region = <&wpss_mem>;
> +
> + qcom,smem-states = <&wpss_smp2p_out 0>;
> + qcom,smem-state-names = "stop";
> +
> + resets = <&aoss_reset AOSS_CC_WCSS_RESTART>;
> + reset-names = "restart";
> +
> + qcom,halt-regs = <&tcsr_mutex_regs 0x37000>;
> +
> + status = "disabled";
> +
> + glink-edge {
> + interrupts-extended = <&ipcc IPCC_CLIENT_WPSS
> +  
> IPCC_MPROC_SIGNAL_GLINK_QMP
> +  
> IRQ_TYPE_EDGE_RISING>;
> + mboxes = <&ipcc IPCC_CLIENT_WPSS
> + IPCC_MPROC_SIGNAL_GLINK_QMP>;
> +
> + label = "wpss";
> + qcom,remote-pid = <13>;
> + };
> + };
> +
>   pdc: interrupt-controller@b22 {
>   compatible = "qcom,sc7280-pdc", "qcom,pdc";
>   reg = <0 0x0b22 0 0x3>;
> -- 
> 2.7.4
> 


Re: [PATCH] net: add net namespace inode for all net_dev events

2021-03-10 Thread Steven Rostedt
On Wed, 10 Mar 2021 17:03:40 +0800
Tony Lu  wrote:

> On Tue, Mar 09, 2021 at 12:40:11PM -0500, Steven Rostedt wrote:


> > The above shows 10 bytes wasted for this event.
> > 


> 
> I use pahole to read vmlinux.o directly with defconfig and
> CONFIG_DEBUG_INFO enabled; the result shows 22 structs prefixed with
> trace_event_raw_ that have at least one hole.
> 

> 
> pahole shows there are 5 holes with 10 bytes in net_dev_start_xmit event.
> 

Oh, an I'm glad that my analysis matched with pahole ;-)

-- Steve


Re: [RFC v2 5/5] clk: socfpga: allow compile testing of Stratix 10 / Agilex clocks

2021-03-10 Thread Arnd Bergmann
On Wed, Mar 10, 2021 at 9:38 AM Krzysztof Kozlowski
 wrote:
> --- a/drivers/clk/socfpga/Kconfig
> +++ b/drivers/clk/socfpga/Kconfig
> @@ -1,6 +1,17 @@
>  # SPDX-License-Identifier: GPL-2.0
> +config COMMON_CLK_SOCFPGA
> +   bool "Intel SoCFPGA family clock support" if COMPILE_TEST && 
> !ARCH_SOCFPGA && !ARCH_SOCFPGA64
> +   depends on ARCH_SOCFPGA || ARCH_SOCFPGA64 || COMPILE_TEST
> +   default y if ARCH_SOCFPGA || ARCH_SOCFPGA64

I think the 'depends on' line here is redundant if you also have the
'if' line and the default.

Arnd


[PATCH V1 6/6] arm64: dts: qcom: sm8150: Add Data Capture and Compare(DCC) support node

2021-03-10 Thread Souradeep Chowdhury
Add the DCC (Data Capture and Compare) device tree node entry along with
the addresses of its register regions.

Signed-off-by: Souradeep Chowdhury 
---
 arch/arm64/boot/dts/qcom/sm8150.dtsi | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/sm8150.dtsi 
b/arch/arm64/boot/dts/qcom/sm8150.dtsi
index e5bb17b..5c564b1 100644
--- a/arch/arm64/boot/dts/qcom/sm8150.dtsi
+++ b/arch/arm64/boot/dts/qcom/sm8150.dtsi
@@ -654,6 +654,13 @@
interrupts = ;
};
 
+   dcc@10a2000 {
+   compatible = "qcom,sm8150-dcc", "qcom,dcc";
+   reg = <0x0 0x010a2000 0x0 0x1000>,
+ <0x0 0x010ad000 0x0 0x3000>;
+   reg-names = "dcc-base", "dcc-ram-base";
+   };
+
ufs_mem_hc: ufshc@1d84000 {
compatible = "qcom,sm8150-ufshc", "qcom,ufshc",
 "jedec,ufs-2.0";
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



[PATCH V1 5/6] MAINTAINERS: Add the entry for DCC(Data Capture and Compare) driver support

2021-03-10 Thread Souradeep Chowdhury
Add the entries for all the files added as part of driver support for
DCC (Data Capture and Compare).

Signed-off-by: Souradeep Chowdhury 
---
 MAINTAINERS | 8 
 1 file changed, 8 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index d92f85c..fb28218 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4969,6 +4969,14 @@ F:   include/linux/tfrc.h
 F: include/uapi/linux/dccp.h
 F: net/dccp/
 
+QTI DCC DRIVER
+M: Souradeep Chowdhury 
+L: linux-arm-...@vger.kernel.org
+S: Maintained
+F: Documentation/ABI/testing/sysfs-driver-dcc
+F: Documentation/devicetree/bindings/arm/msm/qcom,dcc.yaml
+F: drivers/soc/qcom/dcc.c
+
 DECnet NETWORK LAYER
 L: linux-decnet-u...@lists.sourceforge.net
 S: Orphan
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



[PATCH V1 3/6] soc: qcom: dcc: Add the sysfs variables to the Data Capture and Compare driver(DCC)

2021-03-10 Thread Souradeep Chowdhury
Add the sysfs variables to expose user space functionality such as
enabling and disabling DCC, configuring addresses and issuing software
triggers.  Also add the necessary methods backing them.

Signed-off-by: Souradeep Chowdhury 
---
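
As a generic sketch of the pattern behind each of these knobs
(illustrative only, not this driver's code):

    static ssize_t enable_store(struct device *dev,
                                struct device_attribute *attr,
                                const char *buf, size_t count)
    {
            bool val;
            int ret;

            ret = kstrtobool(buf, &val);
            if (ret)
                    return ret;
            /* ... enable or disable the DCC hardware here ... */
            return count;
    }
    static DEVICE_ATTR_WO(enable);
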
 drivers/soc/qcom/dcc.c | 1179 +++-
 1 file changed, 1171 insertions(+), 8 deletions(-)

diff --git a/drivers/soc/qcom/dcc.c b/drivers/soc/qcom/dcc.c
index 89816bf..61e5225 100644
--- a/drivers/soc/qcom/dcc.c
+++ b/drivers/soc/qcom/dcc.c
@@ -17,12 +17,30 @@
 #include 
 #include 
 
+
+#define TIMEOUT_US 100
+
+#define dcc_writel(drvdata, val, off)  \
+   writel((val), drvdata->base + dcc_offset_conv(drvdata, off))
 #define dcc_readl(drvdata, off)
\
readl(drvdata->base + dcc_offset_conv(drvdata, off))
 
+#define dcc_sram_readl(drvdata, off)   \
+   readl(drvdata->ram_base + off)
+
 /* DCC registers */
 #define DCC_HW_INFO0x04
 #define DCC_LL_NUM_INFO0x10
+#define DCC_STATUS 0x1C
+#define DCC_LL_LOCK(m) (0x34 + 0x80 * m)
+#define DCC_LL_CFG(m)  (0x38 + 0x80 * m)
+#define DCC_LL_BASE(m) (0x3c + 0x80 * m)
+#define DCC_FD_BASE(m) (0x40 + 0x80 * m)
+#define DCC_LL_TIMEOUT(m)  (0x44 + 0x80 * m)
+#define DCC_LL_INT_ENABLE(m)   (0x4C + 0x80 * m)
+#define DCC_LL_INT_STATUS(m)   (0x50 + 0x80 * m)
+#define DCC_LL_SW_TRIGGER(m)   (0x60 + 0x80 * m)
+#define DCC_LL_BUS_ACCESS_STATUS(m)(0x64 + 0x80 * m)
 
 #define DCC_MAP_LEVEL1 0x18
 #define DCC_MAP_LEVEL2 0x34
@@ -36,24 +54,38 @@
 #define DCC_FIX_LOOP_OFFSET16
 #define DCC_VER_INFO_BIT   9
 
+#define DCC_READ   0
+#define DCC_WRITE  1
+#define DCC_LOOP   2
+#define DCC_READ_WRITE 3
+
+#define MAX_DCC_OFFSET GENMASK(9, 2)
+#define MAX_DCC_LENGENMASK(6, 0)
+#define MAX_LOOP_CNT   GENMASK(7, 0)
+
+#define DCC_ADDR_DESCRIPTOR0x00
+#define DCC_LOOP_DESCRIPTORBIT(30)
+#define DCC_RD_MOD_WR_DESCRIPTOR   BIT(31)
+#define DCC_LINK_DESCRIPTORGENMASK(31, 30)
+
+#define DCC_READ_IND   0x00
+#define DCC_WRITE_IND  (BIT(28))
+
+#define DCC_AHB_IND0x00
+#define DCC_APB_INDBIT(29)
+
 #define DCC_MAX_LINK_LIST  8
 #define DCC_INVALID_LINK_LIST  GENMASK(7, 0)
 
 #define DCC_VER_MASK1  GENMASK(6, 0)
 #define DCC_VER_MASK2  GENMASK(5, 0)
 
-#define DCC_RD_MOD_WR_ADDR 0xC105E
+#define DCC_RD_MOD_WR_ADDR  0xC105E
 
 struct qcom_dcc_config {
const int dcc_ram_offset;
 };
 
-enum dcc_mem_map_ver {
-   DCC_MEM_MAP_VER1,
-   DCC_MEM_MAP_VER2,
-   DCC_MEM_MAP_VER3
-};
-
 enum dcc_descriptor_type {
DCC_ADDR_TYPE,
DCC_LOOP_TYPE,
@@ -61,6 +93,12 @@ enum dcc_descriptor_type {
DCC_WRITE_TYPE
 };
 
+enum dcc_mem_map_ver {
+   DCC_MEM_MAP_VER1,
+   DCC_MEM_MAP_VER2,
+   DCC_MEM_MAP_VER3
+};
+
 struct dcc_config_entry {
u32 base;
u32 offset;
@@ -98,6 +136,22 @@ struct dcc_drvdata {
u8  loopoff;
 };
 
+struct dcc_cfg_attr {
+   u32 addr;
+   u32 prev_addr;
+   u32 prev_off;
+   u32 link;
+   u32 sram_offset;
+};
+
+struct dcc_cfg_loop_attr {
+   u32 loop;
+   boolloop_start;
+   u32 loop_cnt;
+   u32 loop_len;
+   u32 loop_off;
+};
+
 static u32 dcc_offset_conv(struct dcc_drvdata *drvdata, u32 off)
 {
if (drvdata->mem_map_ver == DCC_MEM_MAP_VER1) {
@@ -114,6 +168,832 @@ static u32 dcc_offset_conv(struct dcc_drvdata *drvdata, 
u32 off)
return off;
 }
 
+static int dcc_sram_writel(struct dcc_drvdata *drvdata,
+   u32 val, u32 off)
+{
+   if (unlikely(off > (drvdata->ram_size - 4)))
+   return -EINVAL;
+
+   writel((val), drvdata->ram_base + off);
+
+   return 0;
+}
+
+static bool dcc_ready(struct dcc_drvdata *drvdata)
+{
+   u32 val;
+
+   /* poll until DCC ready */
+   if (!readl_poll_timeout((drvdata->base + DCC_STATUS), val,
+   (FIELD_GET(GENMASK(1, 0), val) == 0), 1, 
TIMEOUT_US))
+   return true;
+
+   return false;
+}
+
+static int dcc_read_status(struct dcc_drvdata *drvdata)
+{
+   int curr_list;
+   u32 bus_status;
+   u32 ll_cfg = 0;
+   u32 tmp_ll_cfg = 0;
+
+   for (curr_list = 0; curr_list < drvdata->nr_link_list; curr_list++) {
+   if (!drvdata->enable[curr_list])
+

[PATCH V1 1/6] dt-bindings: Added the yaml bindings for DCC

2021-03-10 Thread Souradeep Chowdhury
Documentation for Data Capture and Compare (DCC) device tree bindings
in YAML format.

Signed-off-by: Souradeep Chowdhury 
---
 .../devicetree/bindings/arm/msm/qcom,dcc.yaml  | 49 ++
 1 file changed, 49 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/arm/msm/qcom,dcc.yaml

diff --git a/Documentation/devicetree/bindings/arm/msm/qcom,dcc.yaml 
b/Documentation/devicetree/bindings/arm/msm/qcom,dcc.yaml
new file mode 100644
index 000..b8e9998
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/msm/qcom,dcc.yaml
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/msm/qcom,dcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Data Capture and Compare
+
+maintainers:
+  - Souradeep Chowdhury 
+
+description: |
+DCC (Data Capture and Compare) is a DMA engine which is used to save
+configuration data or system memory contents during catastrophic failure
+or SW trigger. DCC is used to capture and store data for debugging purposes.
+
+
+properties:
+  compatible:
+items:
+  - enum:
+  - qcom,sm8150-dcc
+  - const: qcom,dcc
+
+  reg:
+items:
+  - description: DCC base register region
+  - description: DCC RAM base register region
+
+  reg-names:
+items:
+  - const: dcc-base
+  - const: dcc-ram-base
+
+required:
+  - compatible
+  - reg
+  - reg-names
+
+additionalProperties: false
+
+examples:
+  - |
+dcc@10a2000{
+compatible = "qcom,sm8150-dcc","qcom,dcc";
+reg = <0x010a2000  0x1000>,
+  <0x010ad000  0x2000>;
+reg-names = "dcc-base", "dcc-ram-base";
+};
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



[PATCH V1 4/6] DCC: Added the sysfs entries for DCC(Data Capture and Compare) driver

2021-03-10 Thread Souradeep Chowdhury
The DCC is a DMA engine designed to store register values either in
case of a system crash or on software triggers manually issued by
the user. Using the DCC hardware and the sysfs interface of the driver
the user can exploit various functionalities of DCC. The user can specify
the register addresses, the values of which are stored by DCC in its
dedicated SRAM. The register addresses can be used either to read from,
write to, first read and then write to, or loop over. All these
options can be exploited using the sysfs interface given to the user.
Following are the sysfs interfaces exposed in DCC driver which are
documented
1)trigger
2)config
3)config_write
4)config_reset
5)enable
6)rd_mod_wr
7)loop

Signed-off-by: Souradeep Chowdhury 
---
 Documentation/ABI/testing/sysfs-driver-dcc | 74 ++
 1 file changed, 74 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-driver-dcc

diff --git a/Documentation/ABI/testing/sysfs-driver-dcc 
b/Documentation/ABI/testing/sysfs-driver-dcc
new file mode 100644
index 000..7a855ca
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-dcc
@@ -0,0 +1,74 @@
+What:   /sys/bus/platform/devices/.../trigger
+Date:   February 2021
+Contact:Souradeep Chowdhury 
+Description:
+   This file allows the software trigger to be enabled
+   by the user through the sysfs interface.  Through this
+   interface the user can manually start a software trigger
+   in DCC, whereby the DCC driver stores the current values
+   of the specified registers in the DCC SRAM.
+
+What:   /sys/bus/platform/devices/.../enable
+Date:   February 2021
+Contact:Souradeep Chowdhury 
+Description:
+   This file allows the user to manually enable or
+   disable the DCC driver.  The DCC hardware needs to be
+   enabled before use.
+
+What:   /sys/bus/platform/devices/.../config
+Date:   February 2021
+Contact:Souradeep Chowdhury 
+Description:
+   This file allows the user to configure the register
+   addresses for the DCC driver.  These register addresses
+   are used to read from, write to or loop through.
+   Separate sysfs files have been created to enable all
+   these options.
+
+What:   /sys/bus/platform/devices/.../config_write
+Date:   February 2021
+Contact:Souradeep Chowdhury 
+Description:
+   This file allows the user to write a value to the
+   register address given as argument.  The values are
+   entered in the form <register address> <value>.
+
+What:   /sys/bus/platform/devices/.../config_reset
+Date:   February 2021
+Contact:Souradeep Chowdhury 
+Description:
+   This file is used to reset the configuration of
+   the DCC driver to the default configuration.
+
+What:   /sys/bus/platform/devices/.../loop
+Date:   February 2021
+Contact:Souradeep Chowdhury 
+Description:
+   This file is used to enter the loop count, as the DCC
+   driver gives the option to loop multiple times on
+   the same register and store the values for each
+   loop.
+
+What:   /sys/bus/platform/devices/.../rd_mod_wr
+Date:   February 2021
+Contact:Souradeep Chowdhury 
+Description:
+   This file is used to read the value of the register
+   and then write the value given as an argument to the
+   register address in config.  The argument should be
+   given in the form <mask> <value>.
+
+What:   /sys/bus/platform/devices/.../ready
+Date:   February 2021
+Contact:Souradeep Chowdhury 
+Description:
+   This file is used to check whether the DCC
+   hardware is ready to take inputs.
+
+What:  /sys/bus/platform/devices/.../curr_list
+Date:  February 2021
+Contact:   Souradeep Chowdhury 
+Description:
+   This file is used to configure the linked-list data
+   to be used while configuring addresses.
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



[PATCH V1 2/6] soc: qcom: dcc: Add driver support for Data Capture and Compare unit(DCC)

2021-03-10 Thread Souradeep Chowdhury
The DCC is a DMA Engine designed to capture and store data
during system crash or software triggers. The DCC operates
based on linked-list entries which provide it with data and
addresses and the function it needs to perform. These
functions are read, write and loop. Add the basic driver
in this patch, which contains a probe method that instantiates
the resources needed by the driver. DCC has its own SRAM which
needs to be mapped at probe time as well.

Signed-off-by: Souradeep Chowdhury 
---
 drivers/soc/qcom/Kconfig  |   8 +
 drivers/soc/qcom/Makefile |   1 +
 drivers/soc/qcom/dcc.c| 388 ++
 3 files changed, 397 insertions(+)
 create mode 100644 drivers/soc/qcom/dcc.c

diff --git a/drivers/soc/qcom/Kconfig b/drivers/soc/qcom/Kconfig
index 79b568f..8819e0b 100644
--- a/drivers/soc/qcom/Kconfig
+++ b/drivers/soc/qcom/Kconfig
@@ -69,6 +69,14 @@ config QCOM_LLCC
  SDM845. This provides interfaces to clients that use the LLCC.
  Say yes here to enable LLCC slice driver.
 
+config QCOM_DCC
+   tristate "Qualcomm Technologies, Inc. Data Capture and Compare engine 
driver"
+   depends on ARCH_QCOM || COMPILE_TEST
+   help
+ This option enables driver for Data Capture and Compare engine. DCC
+ driver provides interface to configure DCC block and read back
+ captured data from DCC's internal SRAM.
+
 config QCOM_KRYO_L2_ACCESSORS
bool
depends on ARCH_QCOM && ARM64 || COMPILE_TEST
diff --git a/drivers/soc/qcom/Makefile b/drivers/soc/qcom/Makefile
index ad675a6..1b00870 100644
--- a/drivers/soc/qcom/Makefile
+++ b/drivers/soc/qcom/Makefile
@@ -26,3 +26,4 @@ obj-$(CONFIG_QCOM_LLCC) += llcc-qcom.o
 obj-$(CONFIG_QCOM_RPMHPD) += rpmhpd.o
 obj-$(CONFIG_QCOM_RPMPD) += rpmpd.o
 obj-$(CONFIG_QCOM_KRYO_L2_ACCESSORS) +=kryo-l2-accessors.o
+obj-$(CONFIG_QCOM_DCC) += dcc.o
diff --git a/drivers/soc/qcom/dcc.c b/drivers/soc/qcom/dcc.c
new file mode 100644
index 000..89816bf
--- /dev/null
+++ b/drivers/soc/qcom/dcc.c
@@ -0,0 +1,388 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2015-2021, The Linux Foundation. All rights reserved.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define dcc_readl(drvdata, off)
\
+   readl(drvdata->base + dcc_offset_conv(drvdata, off))
+
+/* DCC registers */
+#define DCC_HW_INFO0x04
+#define DCC_LL_NUM_INFO0x10
+
+#define DCC_MAP_LEVEL1 0x18
+#define DCC_MAP_LEVEL2 0x34
+#define DCC_MAP_LEVEL3 0x4C
+
+#define DCC_MAP_OFFSET10x10
+#define DCC_MAP_OFFSET20x18
+#define DCC_MAP_OFFSET30x1C
+#define DCC_MAP_OFFSET40x8
+
+#define DCC_FIX_LOOP_OFFSET16
+#define DCC_VER_INFO_BIT   9
+
+#define DCC_MAX_LINK_LIST  8
+#define DCC_INVALID_LINK_LIST  GENMASK(7, 0)
+
+#define DCC_VER_MASK1  GENMASK(6, 0)
+#define DCC_VER_MASK2  GENMASK(5, 0)
+
+#define DCC_RD_MOD_WR_ADDR 0xC105E
+
+struct qcom_dcc_config {
+   const int dcc_ram_offset;
+};
+
+enum dcc_mem_map_ver {
+   DCC_MEM_MAP_VER1,
+   DCC_MEM_MAP_VER2,
+   DCC_MEM_MAP_VER3
+};
+
+enum dcc_descriptor_type {
+   DCC_ADDR_TYPE,
+   DCC_LOOP_TYPE,
+   DCC_READ_WRITE_TYPE,
+   DCC_WRITE_TYPE
+};
+
+struct dcc_config_entry {
+   u32 base;
+   u32 offset;
+   u32 len;
+   u32 index;
+   u32 loop_cnt;
+   u32 write_val;
+   u32 mask;
+   boolapb_bus;
+   enum dcc_descriptor_typedesc_type;
+   struct list_headlist;
+};
+
+struct dcc_drvdata {
+   void __iomem*base;
+   u32 reg_size;
+   struct device   *dev;
+   struct mutexmutex;
+   void __iomem*ram_base;
+   u32 ram_size;
+   u32 ram_offset;
+   enum dcc_mem_map_vermem_map_ver;
+   u32 ram_cfg;
+   u32 ram_start;
+   bool*enable;
+   bool*configured;
+   boolinterrupt_disable;
+   char*sram_node;
+   struct cdev sram_dev;
+   struct class*sram_class;
+   struct list_head*cfg_head;
+   u32 *nr_config;
+   u32 nr_link_list;
+   u8 

[PATCH V1 0/6] Add driver support for Data Capture and Compare Engine(DCC) for SM8150

2021-03-10 Thread Souradeep Chowdhury
DCC (Data Capture and Compare) is a DMA engine designed for debugging purposes.
In case of a system crash or a manual software trigger by the user, the DCC
hardware stores the values at the configured register addresses, which can then
be used for debugging. The DCC driver provides the user with a sysfs interface
to configure the register addresses. The options that the DCC hardware provides
include reading from registers, writing to registers, first reading and then
writing to registers, and looping through the values of the same register.

In certain cases a register write needs to be executed for accessing the rest 
of the registers, also
the user might want to record the changing values of a particular register with 
time for which he has
the option to use the loop feature. The options mentioned above are exposed to 
the user by sysfs files
once the driver is probed. The details and usage of this sysfs files are 
documented in
Documentation/ABI/testing/sysfs-driver-dcc. As an example if a user wants to 
configure to store 100 words
starting from address 0x8050 he should give inputs as following to the 
sysfs config file:-

echo  0x8050 100 > /sys/bus/platform/devices/.../config

Similarly if the user wants to write to a register using DCC hardware he should 
give following input to
config_write sysfs file:-
echo 0x8000 0xFF > /sys/bus/platform/devices/10a2000.dcc/config_write
All this read and write occurs at crash time or if the user manually invokes a 
software trigger.
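
For reference, reading back the captured words after a trigger could then look
roughly like this from user space (a minimal sketch only; the SRAM character
device node name /dev/dcc_sram and the 32-bit word layout are assumptions for
illustration, not something this cover letter specifies):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Hypothetical device node exposed by the DCC SRAM cdev. */
	int fd = open("/dev/dcc_sram", O_RDONLY);
	uint32_t word;

	if (fd < 0)
		return 1;
	/* Each captured register value is assumed to be one 32-bit word. */
	while (read(fd, &word, sizeof(word)) == sizeof(word))
		printf("0x%08x\n", word);
	close(fd);
	return 0;
}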

Souradeep Chowdhury (6):
  dt-bindings: Added the yaml bindings for DCC
  soc: qcom: dcc: Add driver support for Data Capture and Compare
unit(DCC)
  soc: qcom: dcc: Add the sysfs variables to the Data Capture and
Compare driver(DCC)
  DCC: Added the sysfs entries for DCC(Data Capture and Compare) driver
  MAINTAINERS: Add the entry for DCC(Data Capture and Compare) driver
support
  arm64: dts: qcom: sm8150: Add Data Capture and Compare(DCC) support
node

 Documentation/ABI/testing/sysfs-driver-dcc |   74 +
 .../devicetree/bindings/arm/msm/qcom,dcc.yaml  |   49 +
 MAINTAINERS|8 +
 arch/arm64/boot/dts/qcom/sm8150.dtsi   |7 +
 drivers/soc/qcom/Kconfig   |8 +
 drivers/soc/qcom/Makefile  |1 +
 drivers/soc/qcom/dcc.c | 1551 
 7 files changed, 1698 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-driver-dcc
 create mode 100644 Documentation/devicetree/bindings/arm/msm/qcom,dcc.yaml
 create mode 100644 drivers/soc/qcom/dcc.c

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



Re: [PATCH 1/2] dt-bindings: remoteproc: qcom: Add SC7280 WPSS support

2021-03-10 Thread Bjorn Andersson
On Wed 10 Mar 01:28 CST 2021, Rakesh Pillai wrote:

> Add WPSS PIL loading support for SC7280 SoCs.
> 

Acked-by: Bjorn Andersson 

But can you please follow up with a patch that converts this to yaml?

Regards,
Bjorn

> Signed-off-by: Rakesh Pillai 
> ---
>  .../bindings/remoteproc/qcom,hexagon-v56.txt   | 35 
> --
>  1 file changed, 20 insertions(+), 15 deletions(-)
> 
> diff --git 
> a/Documentation/devicetree/bindings/remoteproc/qcom,hexagon-v56.txt 
> b/Documentation/devicetree/bindings/remoteproc/qcom,hexagon-v56.txt
> index 1337a3d..edad5e8 100644
> --- a/Documentation/devicetree/bindings/remoteproc/qcom,hexagon-v56.txt
> +++ b/Documentation/devicetree/bindings/remoteproc/qcom,hexagon-v56.txt
> @@ -9,6 +9,7 @@ on the Qualcomm Technology Inc. Hexagon v56 core.
>   Definition: must be one of:
>   "qcom,qcs404-cdsp-pil",
>   "qcom,sdm845-adsp-pil"
> + "qcom,sc7280-wpss-pil"
>  
>  - reg:
>   Usage: required
> @@ -24,7 +25,13 @@ on the Qualcomm Technology Inc. Hexagon v56 core.
>  - interrupt-names:
>   Usage: required
>   Value type: <stringlist>
> - Definition: must be "wdog", "fatal", "ready", "handover", "stop-ack"
> + Definition: The interrupts needed depends on the compatible string
> + qcom,sdm845-adsp-pil:
> + qcom,qcs404-cdsp-pil:
> + must be "wdog", "fatal", "ready", "handover", "stop-ack"
> + qcom,sc7280-wpss-pil:
> + must be "wdog", "fatal", "ready", "handover", "stop-ack"
> + "shutdown-ack"
>  
>  - clocks:
>   Usage: required
> @@ -35,19 +42,17 @@ on the Qualcomm Technology Inc. Hexagon v56 core.
>  - clock-names:
>   Usage: required for SDM845 ADSP
>   Value type: <stringlist>
> - Definition: List of clock input name strings sorted in the same
> - order as the clocks property. Definition must have
> - "xo", "sway_cbcr", "lpass_ahbs_aon_cbcr",
> - "lpass_ahbm_aon_cbcr", "qdsp6ss_xo", "qdsp6ss_sleep"
> - and "qdsp6ss_core".
> -
> -- clock-names:
> - Usage: required for QCS404 CDSP
> - Value type: <stringlist>
> - Definition: List of clock input name strings sorted in the same
> - order as the clocks property. Definition must have
> - "xo", "sway", "tbu", "bimc", "ahb_aon", "q6ss_slave",
> - "q6ss_master", "q6_axim".
> + Definition: The clocks needed depends on the compatible string
> + qcom,sdm845-adsp-pil:
> + must be "xo", "sway_cbcr", "lpass_ahbs_aon_cbcr",
> + "lpass_ahbm_aon_cbcr", "qdsp6ss_xo", "qdsp6ss_sleep",
> + "qdsp6ss_core"
> + qcom,qcs404-cdsp-pil:
> + must be "xo", "sway", "tbu", "bimc", "ahb_aon", "q6ss_slave",
> + "q6ss_master", "q6_axim"
> + qcom,sc7280-wpss-pil:
> + must be "gcc_wpss_ahb_bdg_mst_clk", "gcc_wpss_ahb_clk",
> + "gcc_wpss_rscp_clk"
>  
>  - power-domains:
>   Usage: required
> @@ -65,7 +70,7 @@ on the Qualcomm Technology Inc. Hexagon v56 core.
>  Definition: must be "pdc_sync" and "cc_lpass"
>  
>  - reset-names:
> -Usage: required for QCS404 CDSP
> +Usage: required for QCS404 CDSP, SC7280 WPSS
>  Value type: <stringlist>
>  Definition: must be "restart"
>  
> -- 
> 2.7.4
> 


Re: [RFC PATCH 0/3] hugetlb: add demote/split page functionality

2021-03-10 Thread Zi Yan
On 10 Mar 2021, at 11:23, Michal Hocko wrote:

> On Mon 08-03-21 16:18:52, Mike Kravetz wrote:
> [...]
>> Converting larger to smaller hugetlb pages can be accomplished today by
>> first freeing the larger page to the buddy allocator and then allocating
>> the smaller pages.  However, there are two issues with this approach:
>> 1) This process can take quite some time, especially if allocation of
>>the smaller pages is not immediate and requires migration/compaction.
>> 2) There is no guarantee that the total size of smaller pages allocated
>>will match the size of the larger page which was freed.  This is
>>because the area freed by the larger page could quickly be
>>fragmented.
>
> It will likely not surprise you that I have some level of reservation. While
> your concerns about reconfiguration via the existing interfaces are quite
> real, is this really a problem in practice? How often do you need such a
> reconfiguration?
>
> Is this all really worth the additional code to something as tricky as
> hugetlb code base?
>
>>  include/linux/hugetlb.h |   8 ++
>>  mm/hugetlb.c| 199 +++-
>>  2 files changed, 204 insertions(+), 3 deletions(-)
>>
>> -- 
>> 2.29.2
>>

The high-level goal of this patchset seems to be to enable flexible huge page
allocation from a single pool when multiple huge page sizes are available
to use. The limitation of the existing mechanism is that the user has to
specify how many huge pages and how many gigantic pages he/she wants
before the actual use.

I just want to throw an idea here, please ignore if it is too crazy.
Could we have a variant buddy allocator for huge page allocations,
which only has available huge page orders in the free list? For example,
if the user wants 2MB and 1GB pages, the allocator will only have order-9 and
order-18 pages; when order-9 pages run out, we can split order-18 pages;
if possible, adjacent order-9 pages can be merged back to order-18 pages.
A sketch of the split step follows below.
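
To make that concrete, the split step could look roughly like this
(hypothetical pool and free-list helpers; order-9 = 2MB and order-18 = 1GB
with 4K base pages):

/*
 * Sketch only: split one free 1GB (order-18) page into 512 2MB
 * (order-9) pages when the order-9 free list runs dry.  hp_pool,
 * hp_take_first() and hp_free_list_add() are made-up names.
 */
static int hp_split_gigantic(struct hp_pool *pool)
{
	struct page *page = hp_take_first(&pool->free[18]);
	unsigned long i, nr = 1UL << (18 - 9);	/* 512 pieces */

	if (!page)
		return -ENOMEM;

	for (i = 0; i < nr; i++)
		hp_free_list_add(&pool->free[9], page + (i << 9));

	return 0;
}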


—
Best Regards,
Yan Zi




Re: [PATCH v9 5/6] KVM: arm64: ioctl to fetch/store tags in a guest

2021-03-10 Thread Steven Price

On 09/03/2021 17:57, Marc Zyngier wrote:

On Mon, 01 Mar 2021 14:23:14 +,
Steven Price  wrote:


The VMM may not wish to have its own mapping of guest memory mapped
with PROT_MTE because this causes problems if the VMM has tag checking
enabled (the guest controls the tags in physical RAM and it's unlikely
the tags are correct for the VMM).

Instead add a new ioctl which allows the VMM to easily read/write the
tags from guest memory, allowing the VMM's mapping to be non-PROT_MTE
while the VMM can still read/write the tags for the purpose of
migration.

Signed-off-by: Steven Price 
---
  arch/arm64/include/uapi/asm/kvm.h | 13 +++
  arch/arm64/kvm/arm.c  | 57 +++
  include/uapi/linux/kvm.h  |  1 +
  3 files changed, 71 insertions(+)

diff --git a/arch/arm64/include/uapi/asm/kvm.h 
b/arch/arm64/include/uapi/asm/kvm.h
index 24223adae150..5fc2534ac5df 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -184,6 +184,19 @@ struct kvm_vcpu_events {
__u32 reserved[12];
  };
  
+struct kvm_arm_copy_mte_tags {

+   __u64 guest_ipa;
+   __u64 length;
+   union {
+   void __user *addr;
+   __u64 padding;
+   };
+   __u64 flags;


I'd be keen on a couple of reserved __64s. Just in case...


Fair enough, I'll add a __u64 reserved[2];


+};
+
+#define KVM_ARM_TAGS_TO_GUEST  0
+#define KVM_ARM_TAGS_FROM_GUEST		1
+
  /* If you need to interpret the index values, here is the key: */
  #define KVM_REG_ARM_COPROC_MASK	0x000000000FFF0000
  #define KVM_REG_ARM_COPROC_SHIFT  16
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 46bf319f6cb7..01d404833e24 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1297,6 +1297,53 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
}
  }
  
+static int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,

+ struct kvm_arm_copy_mte_tags *copy_tags)
+{
+   gpa_t guest_ipa = copy_tags->guest_ipa;
+   size_t length = copy_tags->length;
+   void __user *tags = copy_tags->addr;
+   gpa_t gfn;
+   bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST);
+
+   if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST)
+   return -EINVAL;
+
+   if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK)
+   return -EINVAL;


It is a bit odd to require userspace to provide a page-aligned
addr/size, as it now has to find out about the kernel's page
size. MTE_GRANULE_SIZE-aligned values would make more sense. Is there
an underlying reason for this?


No fundamental reason, my thoughts were:

 * It's likely user space is naturally going to be using page-aligned 
quantities during migration, so it already has to care about this.


 * It makes the loop below easier.

 * It's easy to relax the restriction in the future if it becomes a 
problem, much harder to tighten it without breaking anything.


But I can switch to MTE_GRANULE_SIZE if you'd prefer, let me know.


+
+   gfn = gpa_to_gfn(guest_ipa);
+
+   while (length > 0) {
+   kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL);
+   void *maddr;
+   unsigned long num_tags = PAGE_SIZE / MTE_GRANULE_SIZE;
+
+   if (is_error_noslot_pfn(pfn))
+   return -ENOENT;
+
+   maddr = page_address(pfn_to_page(pfn));
+
+   if (!write) {
+   num_tags = mte_copy_tags_to_user(tags, maddr, num_tags);
+   kvm_release_pfn_clean(pfn);
+   } else {
+   num_tags = mte_copy_tags_from_user(maddr, tags,
+  num_tags);
+   kvm_release_pfn_dirty(pfn);
+   }
+


Is it actually safe to do this without holding any lock, without
checking anything against the mmu_notifier_seq? What if the pages are
being swapped out? Or the memslot removed from under your feet?

It looks... dangerous. Do you even want to allow this while vcpus are
actually running?


Umm... yeah I'm not sure how I managed to forgot the locks. This should 
be holding kvm->slots_lock to prevent the slot going under our feet. I 
was surprised that lockdep didn't catch that, until I noticed I'd 
disabled it and discovered why (the model makes it incredibly slow). 
However I've done a run with it enabled now - and with the 
kvm->slots_lock taken it's happy.


gfn_to_pfn_prot() internally calls a variant of get_user_pages() - so 
swapping out shouldn't be a problem.


In terms of running with the vcpus running - given this is going to be 
used for migration I think that's pretty much a requirement. We want to 
be able to dump the tags while executing to enable early transfer of the 
memory.
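
For illustration, the VMM side of a migration pass would then be something
like the sketch below (the KVM_ARM_MTE_COPY_TAGS ioctl name is an assumption
here, and error handling is trimmed):

	struct kvm_arm_copy_mte_tags copy = {
		.guest_ipa = gpa,		/* page-aligned guest address */
		.length    = PAGE_SIZE,		/* page-aligned length */
		.addr      = tag_buf,		/* user buffer for the tags */
		.flags     = KVM_ARM_TAGS_FROM_GUEST,
	};

	/* Pull the tags for one guest page into tag_buf while vcpus run. */
	if (ioctl(vm_fd, KVM_ARM_MTE_COPY_TAGS, &copy) < 0)
		perror("KVM_ARM_MTE_COPY_TAGS");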


Steve


+   if (num_tags != PAGE_SIZE / MTE_GRANULE_SIZE)
+   

Re: [PATCH v2] mm: page_alloc: dump migrate-failed pages

2021-03-10 Thread Michal Hocko
On Wed 10-03-21 08:05:36, Minchan Kim wrote:
> On Wed, Mar 10, 2021 at 02:07:05PM +0100, Michal Hocko wrote:
[...]
> > The is a lot of churn indeed. Have you considered adding $FOO_lglvl
> > variants for those so that you can use them for your particular case
> > without affecting most of existing users? Something similar we have
> > discussed in other email thread regarding lru_add_drain_all?
> 
> I thought that way but didn't try it, since it couldn't make them
> atomic (for example, printk calls in other places/contexts would be
> affected by the $FOO_lglvl).

I do not follow. I meant something like the following (likely incomplete
but you should get an idea).

diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h
index 3468794f83d2..71b402eb8f78 100644
--- a/include/linux/page_owner.h
+++ b/include/linux/page_owner.h
@@ -14,7 +14,7 @@ extern void __set_page_owner(struct page *page,
 extern void __split_page_owner(struct page *page, unsigned int nr);
 extern void __copy_page_owner(struct page *oldpage, struct page *newpage);
 extern void __set_page_owner_migrate_reason(struct page *page, int reason);
-extern void __dump_page_owner(struct page *page);
+extern void __dump_page_owner(struct page *page, const char *loglvl);
 extern void pagetypeinfo_showmixedcount_print(struct seq_file *m,
pg_data_t *pgdat, struct zone *zone);
 
@@ -46,10 +46,10 @@ static inline void set_page_owner_migrate_reason(struct page *page, int reason)
 	if (static_branch_unlikely(&page_owner_inited))
__set_page_owner_migrate_reason(page, reason);
 }
-static inline void dump_page_owner(struct page *page)
+static inline void dump_page_owner(struct page *page, const char *loglvl)
 {
 	if (static_branch_unlikely(&page_owner_inited))
-   __dump_page_owner(page);
+   __dump_page_owner(page, loglvl);
 }
 #else
 static inline void reset_page_owner(struct page *page, unsigned int order)
@@ -69,7 +69,7 @@ static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
 static inline void set_page_owner_migrate_reason(struct page *page, int reason)
 {
 }
-static inline void dump_page_owner(struct page *page)
+static inline void dump_page_owner(struct page *page, const char *loglvl)
 {
 }
 #endif /* CONFIG_PAGE_OWNER */
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 9f8117c7cfdd..1b13135d9916 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -14,6 +14,18 @@
 #include 
 #include 
 
+void __stack_trace_print(const unsigned long *entries, unsigned int nr_entries,
+			 int spaces, const char *loglvl)
+{
+   unsigned int i;
+
+   if (WARN_ON(!entries))
+   return;
+
+   for (i = 0; i < nr_entries; i++)
+   printk("%s%*c%pS\n", loglvl, 1 + spaces, ' ', (void 
*)entries[i]);
+}
+
 /**
  * stack_trace_print - Print the entries in the stack trace
  * @entries:   Pointer to storage array
@@ -23,13 +35,7 @@
 void stack_trace_print(const unsigned long *entries, unsigned int nr_entries,
   int spaces)
 {
-   unsigned int i;
-
-   if (WARN_ON(!entries))
-   return;
-
-   for (i = 0; i < nr_entries; i++)
-   printk("%*c%pS\n", 1 + spaces, ' ', (void *)entries[i]);
+   __stack_trace_print(entries, nr_entries, spaces, KERN_DEFAULT);
 }
 EXPORT_SYMBOL_GPL(stack_trace_print);
 
diff --git a/mm/debug.c b/mm/debug.c
index 8a40b3fefbeb..b989ee2ffa89 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -42,7 +42,7 @@ const struct trace_print_flags vmaflag_names[] = {
{0, NULL}
 };
 
-void __dump_page(struct page *page, const char *reason)
+void __dump_page(struct page *page, const char *reason, const char *loglvl)
 {
struct page *head = compound_head(page);
struct address_space *mapping;
@@ -64,7 +64,7 @@ void __dump_page(struct page *page, const char *reason)
 * dump_page() when detected.
 */
if (page_poisoned) {
-   pr_warn("page:%px is uninitialized and poisoned", page);
+   printk("%spage:%px is uninitialized and poisoned", loglvl, 
page);
goto hex_only;
}
 
@@ -95,17 +95,17 @@ void __dump_page(struct page *page, const char *reason)
 */
mapcount = PageSlab(head) ? 0 : page_mapcount(page);
 
-   pr_warn("page:%p refcount:%d mapcount:%d mapping:%p index:%#lx 
pfn:%#lx\n",
+   printk("%spage:%p refcount:%d mapcount:%d mapping:%p index:%#lx 
pfn:%#lx\n", loglvl,
page, page_ref_count(head), mapcount, mapping,
page_to_pgoff(page), page_to_pfn(page));
if (compound) {
if (hpage_pincount_available(page)) {
-   pr_warn("head:%p order:%u compound_mapcount:%d 
compound_pincount:%d\n",
+   printk("%shead:%p order:%u compound_mapcount:%d 
compound_pincount:%d\n", loglvl,
head, 

[syzbot] BUG: unable to handle kernel access to user memory in schedule_tail

2021-03-10 Thread syzbot
Hello,

syzbot found the following issue on:

HEAD commit:0d7588ab riscv: process: Fix no prototype for arch_dup_tas..
git tree:   git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git 
fixes
console output: https://syzkaller.appspot.com/x/log.txt?x=1212c6e6d0
kernel config:  https://syzkaller.appspot.com/x/.config?x=e3c595255fb2d136
dashboard link: https://syzkaller.appspot.com/bug?extid=e74b94fe601ab9552d69
userspace arch: riscv64

Unfortunately, I don't have any reproducer for this issue yet.

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+e74b94fe601ab9552...@syzkaller.appspotmail.com

Unable to handle kernel access to user memory without uaccess routines at 
virtual address 2749f0d0
Oops [#1]
Modules linked in:
CPU: 1 PID: 4875 Comm: syz-executor.0 Not tainted 
5.12.0-rc2-syzkaller-00467-g0d7588ab9ef9 #0
Hardware name: riscv-virtio,qemu (DT)
epc : schedule_tail+0x72/0xb2 kernel/sched/core.c:4264
 ra : task_pid_vnr include/linux/sched.h:1421 [inline]
 ra : schedule_tail+0x70/0xb2 kernel/sched/core.c:4264
epc : ffe8c8b0 ra : ffe8c8ae sp : ffe025d17ec0
 gp : ffe005d25378 tp : ffe00f0d t0 : 
 t1 : 0001 t2 : 000f4240 s0 : ffe025d17ee0
 s1 : 2749f0d0 a0 : 002a a1 : 0003
 a2 : 1ffc0cfac500 a3 : ffec80cc a4 : 5ae9db91c19bbe00
 a5 :  a6 : 00f0 a7 : ffe82eba
 s2 : 0004 s3 : ffe00eef96c0 s4 : ffe022c77fe0
 s5 : 4000 s6 : ffe067d74e00 s7 : ffe067d74850
 s8 : ffe067d73e18 s9 : ffe067d74e00 s10: ffe00eef96e8
 s11: 00ae6cdf8368 t3 : 5ae9db91c19bbe00 t4 : ffc4043cafb2
 t5 : ffc4043cafba t6 : 0004
status: 0120 badaddr: 2749f0d0 cause: 000f
Call Trace:
[] schedule_tail+0x72/0xb2 kernel/sched/core.c:4264
[] ret_from_exception+0x0/0x14
Dumping ftrace buffer:
   (ftrace buffer empty)
---[ end trace b5f8f9231dc87dda ]---


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkal...@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.


Re: [PATCH] exit: trigger panic when init process is set to SIGNAL_GROUP_EXIT

2021-03-10 Thread Eric W. Biederman
qianli zhao  writes:

> Hi, Oleg
>
> Thanks for your reply.
>
>> To be honest, I don't understand the changelog. It seems that you want
>> to uglify the kernel to simplify the debugging of buggy init? Or what?
>
> My patch is for the following purposes:
> 1. I hope to fix the occurrence of an unexpected panic
> - [   24.705390] kernel BUG at include/linux/pid_namespace.h:98!
> This problem occurs when two init threads both enter do_exit().
> One of the init threads does the syscall sys_exit_group() and sets
> SIGNAL_GROUP_EXIT. The other init thread performs
> ret_to_user()->get_signal(), finds SIGNAL_GROUP_EXIT set, and then does
> do_group_exit()->do_exit().
> Since there are no alive init threads left, it finally goes to
> zap_pid_ns_processes() and BUG().
> Timing details are in the changelog.

At the start of your changelog and your patch subject you describe what
you are doing but not why.  For the next revision of the patch please
lead with the why it makes what you are trying to do much easier to
understand.

> 2. I hope to fix the problem that the coredump cannot be parsed after an
> init crash. Before my patch, every init sub-thread would finish do_exit(),
> and the last thread would trigger panic().
> Except for the thread that triggered the panic, the state (such as
> PF_EXITING etc.) of the other threads already shows them exiting, so we
> can't parse the coredump from a fulldump.
> In fact, when any one init thread has been set to SIGNAL_GROUP_EXIT, we
> can trigger the panic then and prevent other init threads from continuing
> to exit.
>
>> Nor can I understand the patch. I fail to understand the games with
>> SIGNAL_UNKILLABLE and ->siglock.
>
> When the first init thread panics, I don't want the other init threads to
> still exit, as this would destroy their state.
> So I use SIGNAL_UNKILLABLE to mark this state and prevent other init
> threads from continuing to exit.
> In addition, I use siglock to protect tsk->signal->flags.

It does not work to use SIGNAL_UNKILLABLE for this.  Normally init
has SIGNAL_UNKILLABLE set.  The only case that clears SIGNAL_UNKILLABLE
is force_sig_info_to_task.  If the init process exits with exit(2)
SIGNAL_UNKILLABLE will already be set.  Which means testing
SIGNAL_UNKILLABLE as your patch does will prevent the panic.

Further simply calling panic is sufficient to guarantee that the other
threads don't exit, and that whichever thread calls panic first
will be the reporting thread.  The rest of the threads will be caught
in panic_smp_self_stop(), if they happen to be running on other cpus.

So I would make the whole thing just be:

	/* If global init has exited,
	 * panic immediately to get a usable coredump.
	 */
	if (unlikely(is_global_init(tsk) &&
		     (thread_group_empty(tsk) ||
		      (tsk->signal->flags & SIGNAL_GROUP_EXIT)))) {
		panic("Attempted to kill init!  exitcode=0x%08x\n",
		      tsk->signal->group_exit_code ?: (int)code);
	}

The thread_group_empty test is needed to handle single threaded
inits.

Do you think you can respin your patch as something like that?

Eric

>
>> And iiuc with this patch the kernel will crash if init's sub-thread execs,
>> signal_group_exit() returns T in this case.
>
> Oleg Nesterov  于2021年3月10日周三 上午2:27写道:
>>
>> On 03/09, Qianli Zhao wrote:
>> >
>> > From: Qianli Zhao 
>> >
>> > Once any init thread finds SIGNAL_GROUP_EXIT, trigger panic immediately
>> > instead of last thread of global init has exited, and do not allow other
>> > init threads to exit, protect task/memory state of all sub-threads for
>> > get reliable init coredump
>>
>> To be honest, I don't understand the changelog. It seems that you want
>> to uglify the kernel to simplify the debugging of buggy init? Or what?
>>
>> Nor can I understand the patch. I fail to understand the games with
>> SIGNAL_UNKILLABLE and ->siglock.
>>
>> And iiuc with this patch the kernel will crash if init's sub-thread execs,
>> signal_group_exit() returns T in this case.
>>
>> Oleg.
>>
>> > [   24.705376] Kernel panic - not syncing: Attempted to kill init! 
>> > exitcode=0x7f00
>> > [   24.705382] CPU: 4 PID: 552 Comm: init Tainted: G S O
>> > 4.14.180-perf-g4483caa8ae80-dirty #1
>> > [   24.705390] kernel BUG at include/linux/pid_namespace.h:98!
>> >
>> > PID: 552   CPU: 4   COMMAND: "init"
>> > PID: 1 CPU: 7   COMMAND: "init"
>> > core4 core7
>> > ...   sys_exit_group()
>> >   do_group_exit()
>> >   - sig->flags = SIGNAL_GROUP_EXIT
>> >   - zap_other_threads()
>> >   do_exit()
>> >   - PF_EXITING is set
>> > ret_to_user()
>> > do_notify_resume()
>> > get_signal()
>> > - signal_group_exit
>> > - goto fatal;
>> > do_group_exit()
>> > do_exit()
>> > - PF_EXITING is set
>> > - panic("Attempted to kill init! exitcode=0x%08x\n")
>> >

Re: [PATCH 2/2] remoteproc: qcom: q6v5_wpss: Add support for sc7280 WPSS

2021-03-10 Thread Bjorn Andersson
On Wed 10 Mar 01:28 CST 2021, Rakesh Pillai wrote:

> Add support for PIL loading of the WPSS processor for SC7280.
> WPSS boot will be requested by the wifi driver, hence
> disable auto-boot for WPSS. Also add a separate shutdown
> sequence handler for WPSS.
> 
> Signed-off-by: Rakesh Pillai 
> ---
>  drivers/remoteproc/qcom_q6v5_adsp.c | 77 
> -
>  1 file changed, 76 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/remoteproc/qcom_q6v5_adsp.c 
> b/drivers/remoteproc/qcom_q6v5_adsp.c
> index e024502..dc6b91d 100644
> --- a/drivers/remoteproc/qcom_q6v5_adsp.c
> +++ b/drivers/remoteproc/qcom_q6v5_adsp.c
> @@ -58,6 +58,8 @@ struct adsp_pil_data {
>   const char *ssr_name;
>   const char *sysmon_name;
>   int ssctl_id;
> + bool is_wpss;
> + bool auto_boot;
>  
>   const char **clk_ids;
>   int num_clks;
> @@ -96,8 +98,54 @@ struct qcom_adsp {
>   struct qcom_rproc_glink glink_subdev;
>   struct qcom_rproc_ssr ssr_subdev;
>   struct qcom_sysmon *sysmon;
> +
> + int (*shutdown)(struct qcom_adsp *adsp);
>  };
>  
> +static int qcom_wpss_shutdown(struct qcom_adsp *adsp)
> +{
> + unsigned long timeout;
> + unsigned int val;
> + int ret;
> +
> + regmap_write(adsp->halt_map, adsp->halt_lpass + LPASS_HALTREQ_REG, 1);
> +
> + /* Wait for halt ACK from QDSP6 */
> + timeout = jiffies + msecs_to_jiffies(ACK_TIMEOUT);
> + for (;;) {
> + ret = regmap_read(adsp->halt_map,
> +   adsp->halt_lpass + LPASS_HALTACK_REG, &val);
> + if (ret || val || time_after(jiffies, timeout))
> + break;
> +
> + usleep_range(1000, 1100);
> + }
> +
> + /* Place the WPSS processor into reset */
> + reset_control_assert(adsp->restart);
> + /* wait after asserting subsystem restart from AOSS */
> + usleep_range(100, 105);
> + /* Remove the WPSS reset */
> + reset_control_deassert(adsp->restart);
> +
> + usleep_range(100, 105);
> +
> + regmap_write(adsp->halt_map, adsp->halt_lpass + LPASS_HALTREQ_REG, 0);
> +
> + /* Wait for halt ACK from QDSP6 */
> + timeout = jiffies + msecs_to_jiffies(ACK_TIMEOUT);
> + for (;;) {
> + ret = regmap_read(adsp->halt_map,
> +   adsp->halt_lpass + LPASS_HALTACK_REG, &val);
> + if (ret || !val || time_after(jiffies, timeout))
> + break;
> +
> + usleep_range(1000, 1100);
> + }
> +
> + return 0;
> +}
> +
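
As an aside, the two open-coded ACK polls above could likely be collapsed
with regmap_read_poll_timeout(); a sketch of the first wait, assuming
ACK_TIMEOUT is in milliseconds as the msecs_to_jiffies() usage suggests
(note this returns -ETIMEDOUT instead of silently falling through):

	/* Poll HALTACK until it is asserted, sleeping ~1ms between reads. */
	ret = regmap_read_poll_timeout(adsp->halt_map,
				       adsp->halt_lpass + LPASS_HALTACK_REG,
				       val, val != 0, 1000, 1000 * ACK_TIMEOUT);
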
>  static int qcom_adsp_shutdown(struct qcom_adsp *adsp)
>  {
>   unsigned long timeout;
> @@ -270,7 +318,7 @@ static int adsp_stop(struct rproc *rproc)
>   if (ret == -ETIMEDOUT)
>   dev_err(adsp->dev, "timed out on wait\n");
>  
> - ret = qcom_adsp_shutdown(adsp);
> + ret = adsp->shutdown(adsp);
>   if (ret)
>   dev_err(adsp->dev, "failed to shutdown: %d\n", ret);
>  
> @@ -439,6 +487,8 @@ static int adsp_probe(struct platform_device *pdev)
>   dev_err(>dev, "unable to allocate remoteproc\n");
>   return -ENOMEM;
>   }
> +
> + rproc->auto_boot = desc->auto_boot;
>   rproc_coredump_set_elf_info(rproc, ELFCLASS32, EM_NONE);
>  
>   adsp = (struct qcom_adsp *)rproc->priv;
> @@ -447,6 +497,11 @@ static int adsp_probe(struct platform_device *pdev)
>   adsp->info_name = desc->sysmon_name;
>   platform_set_drvdata(pdev, adsp);
>  
> + if (desc->is_wpss)
> + adsp->shutdown = qcom_wpss_shutdown;
> + else
> + adsp->shutdown = qcom_adsp_shutdown;
> +
>   ret = adsp_alloc_memory_region(adsp);
>   if (ret)
>   goto free_rproc;
> @@ -515,6 +570,8 @@ static const struct adsp_pil_data adsp_resource_init = {
>   .ssr_name = "lpass",
>   .sysmon_name = "adsp",
>   .ssctl_id = 0x14,
> + .is_wpss = false,
> + .auto_boot = true,
>   .clk_ids = (const char*[]) {
>   "sway_cbcr", "lpass_ahbs_aon_cbcr", "lpass_ahbm_aon_cbcr",
>   "qdsp6ss_xo", "qdsp6ss_sleep", "qdsp6ss_core", NULL
> @@ -528,6 +585,8 @@ static const struct adsp_pil_data cdsp_resource_init = {
>   .ssr_name = "cdsp",
>   .sysmon_name = "cdsp",
>   .ssctl_id = 0x17,
> + .is_wpss = false,
> + .auto_boot = true,
>   .clk_ids = (const char*[]) {
>   "sway", "tbu", "bimc", "ahb_aon", "q6ss_slave", "q6ss_master",
>   "q6_axim", NULL
> @@ -535,7 +594,23 @@ static const struct adsp_pil_data cdsp_resource_init = {
>   .num_clks = 7,
>  };
>  
> +static const struct adsp_pil_data wpss_resource_init = {
> + .crash_reason_smem = 626,
> + .firmware_name = "wpss.mdt",
> + .ssr_name = "wpss",
> + .sysmon_name = "wpss",
> + .ssctl_id = 0x19,
> + .is_wpss = true,
> + .auto_boot = false,

Why is auto_boot false for the WPSS?

> + .clk_ids = (const char*[]) {
> + "gcc_wpss_ahb_bdg_mst_clk", 

[PATCH V2 25/25] perf/x86/rapl: Add support for Intel Alder Lake

2021-03-10 Thread kan . liang
From: Zhang Rui 

Alder Lake RAPL support is the same as previous Sky Lake.
Add Alder Lake model for RAPL.

Reviewed-by: Andi Kleen 
Signed-off-by: Zhang Rui 
---
 arch/x86/events/rapl.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index f42a704..84a1042 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -800,6 +800,8 @@ static const struct x86_cpu_id rapl_model_match[] 
__initconst = {
 	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,		&model_hsx),
 	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L,		&model_skl),
 	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE,		&model_skl),
+	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE,		&model_skl),
+	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,		&model_skl),
 	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,	&model_spr),
 	X86_MATCH_VENDOR_FAM(AMD,	0x17,		&model_amd_fam17h),
 	X86_MATCH_VENDOR_FAM(HYGON,	0x18,		&model_amd_fam17h),
-- 
2.7.4



[PATCH V2 24/25] perf/x86/cstate: Add Alder Lake CPU support

2021-03-10 Thread kan . liang
From: Kan Liang 

Compared with the Rocket Lake, the CORE C1 Residency Counter is added
for Alder Lake, but the CORE C3 Residency Counter is removed. Other
counters are the same.

Create a new adl_cstates for Alder Lake. Update the comments
accordingly.

The External Design Specification (EDS) is not published yet. It comes
from an authoritative internal source.

The patch has been tested on real hardware.

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/intel/cstate.c | 39 +--
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 407eee5..4333990 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -40,7 +40,7 @@
  * Model specific counters:
  * MSR_CORE_C1_RES: CORE C1 Residency Counter
  *  perf code: 0x00
- *  Available model: SLM,AMT,GLM,CNL,TNT
+ *  Available model: SLM,AMT,GLM,CNL,TNT,ADL
  *  Scope: Core (each processor core has a MSR)
  * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
  *perf code: 0x01
@@ -51,46 +51,49 @@
  *perf code: 0x02
  *Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
  * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL,
- * TNT,RKL
+ * TNT,RKL,ADL
  *Scope: Core
  * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
  *perf code: 0x03
  *Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML,
- * ICL,TGL,RKL
+ * ICL,TGL,RKL,ADL
  *Scope: Core
  * MSR_PKG_C2_RESIDENCY:  Package C2 Residency Counter.
  *perf code: 0x00
  *Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL,
- * KBL,CML,ICL,TGL,TNT,RKL
+ * KBL,CML,ICL,TGL,TNT,RKL,ADL
  *Scope: Package (physical package)
  * MSR_PKG_C3_RESIDENCY:  Package C3 Residency Counter.
  *perf code: 0x01
  *Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL,
- * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL
+ * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL,
+ * ADL
  *Scope: Package (physical package)
  * MSR_PKG_C6_RESIDENCY:  Package C6 Residency Counter.
  *perf code: 0x02
  *Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
  * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL,
- * TNT,RKL
+ * TNT,RKL,ADL
  *Scope: Package (physical package)
  * MSR_PKG_C7_RESIDENCY:  Package C7 Residency Counter.
  *perf code: 0x03
  *Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL,
- * KBL,CML,ICL,TGL,RKL
+ * KBL,CML,ICL,TGL,RKL,ADL
  *Scope: Package (physical package)
  * MSR_PKG_C8_RESIDENCY:  Package C8 Residency Counter.
  *perf code: 0x04
- *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL
+ *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL,
+ * ADL
  *Scope: Package (physical package)
  * MSR_PKG_C9_RESIDENCY:  Package C9 Residency Counter.
  *perf code: 0x05
- *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL
+ *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL,
+ * ADL
  *Scope: Package (physical package)
  * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
  *perf code: 0x06
  *Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL,
- * TNT,RKL
+ * TNT,RKL,ADL
  *Scope: Package (physical package)
  *
  */
@@ -563,6 +566,20 @@ static const struct cstate_model icl_cstates __initconst = 
{
  BIT(PERF_CSTATE_PKG_C10_RES),
 };
 
+static const struct 

[PATCH V2 22/25] perf/x86/intel/uncore: Add Alder Lake support

2021-03-10 Thread kan . liang
From: Kan Liang 

The uncore subsystem for Alder Lake is similar to the previous Tiger
Lake.

The difference includes:
- New MSR addresses for global control, fixed counters, CBOX and ARB.
  Add a new adl_uncore_msr_ops for uncore operations.
- Add a new threshold field for CBOX.
- New PCIIDs for IMC devices.

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/intel/uncore.c |   7 ++
 arch/x86/events/intel/uncore.h |   1 +
 arch/x86/events/intel/uncore_snb.c | 131 +
 3 files changed, 139 insertions(+)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 33c8180..3ad5df2 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1625,6 +1625,11 @@ static const struct intel_uncore_init_fun 
rkl_uncore_init __initconst = {
.pci_init = skl_uncore_pci_init,
 };
 
+static const struct intel_uncore_init_fun adl_uncore_init __initconst = {
+   .cpu_init = adl_uncore_cpu_init,
+   .mmio_init = tgl_uncore_mmio_init,
+};
+
 static const struct intel_uncore_init_fun icx_uncore_init __initconst = {
.cpu_init = icx_uncore_cpu_init,
.pci_init = icx_uncore_pci_init,
@@ -1673,6 +1678,8 @@ static const struct x86_cpu_id intel_uncore_match[] 
__initconst = {
 	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L,		&tgl_l_uncore_init),
 	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE,		&tgl_uncore_init),
 	X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE,		&rkl_uncore_init),
+	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE,		&adl_uncore_init),
+	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,		&adl_uncore_init),
 	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D,	&snr_uncore_init),
{},
 };
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index a3c6e16..30e6557 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -567,6 +567,7 @@ void snb_uncore_cpu_init(void);
 void nhm_uncore_cpu_init(void);
 void skl_uncore_cpu_init(void);
 void icl_uncore_cpu_init(void);
+void adl_uncore_cpu_init(void);
 void tgl_uncore_cpu_init(void);
 void tgl_uncore_mmio_init(void);
 void tgl_l_uncore_mmio_init(void);
diff --git a/arch/x86/events/intel/uncore_snb.c 
b/arch/x86/events/intel/uncore_snb.c
index 5127128..0f63706 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -62,6 +62,8 @@
 #define PCI_DEVICE_ID_INTEL_TGL_H_IMC  0x9a36
 #define PCI_DEVICE_ID_INTEL_RKL_1_IMC  0x4c43
 #define PCI_DEVICE_ID_INTEL_RKL_2_IMC  0x4c53
+#define PCI_DEVICE_ID_INTEL_ADL_1_IMC  0x4660
+#define PCI_DEVICE_ID_INTEL_ADL_2_IMC  0x4641
 
 /* SNB event control */
 #define SNB_UNC_CTL_EV_SEL_MASK			0x000000ff
@@ -131,12 +133,33 @@
 #define ICL_UNC_ARB_PER_CTR0x3b1
 #define ICL_UNC_ARB_PERFEVTSEL 0x3b3
 
+/* ADL uncore global control */
+#define ADL_UNC_PERF_GLOBAL_CTL			0x2ff0
+#define ADL_UNC_FIXED_CTR_CTRL  0x2fde
+#define ADL_UNC_FIXED_CTR   0x2fdf
+
+/* ADL Cbo register */
+#define ADL_UNC_CBO_0_PER_CTR0 0x2002
+#define ADL_UNC_CBO_0_PERFEVTSEL0  0x2000
+#define ADL_UNC_CTL_THRESHOLD			0x3f000000
+#define ADL_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+SNB_UNC_CTL_UMASK_MASK | \
+SNB_UNC_CTL_EDGE_DET | \
+SNB_UNC_CTL_INVERT | \
+ADL_UNC_CTL_THRESHOLD)
+
+/* ADL ARB register */
+#define ADL_UNC_ARB_PER_CTR0   0x2FD2
+#define ADL_UNC_ARB_PERFEVTSEL0			0x2FD0
+#define ADL_UNC_ARB_MSR_OFFSET 0x8
+
 DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
 DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
 DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
 DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
 DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
 DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(threshold, threshold, "config:24-29");
 
 /* Sandy Bridge uncore support */
 static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct 
perf_event *event)
@@ -422,6 +445,106 @@ void tgl_uncore_cpu_init(void)
skl_uncore_msr_ops.init_box = rkl_uncore_msr_init_box;
 }
 
+static void adl_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+   if (box->pmu->pmu_idx == 0)
+   wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN);
+}
+
+static void adl_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+   wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN);
+}
+
+static void adl_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+   if (box->pmu->pmu_idx 

[PATCH V2 20/25] perf/x86/intel: Add Alder Lake Hybrid support

2021-03-10 Thread kan . liang
From: Kan Liang 

An Alder Lake hybrid system has two different types of cores, the Golden
Cove core and the Gracemont core. The Golden Cove cores are registered to
the "cpu_core" PMU. The Gracemont cores are registered to the "cpu_atom"
PMU.

The difference between the two PMUs include:
- Number of GP and fixed counters
- Events
- The "cpu_core" PMU supports Topdown metrics.
  The "cpu_atom" PMU supports PEBS-via-PT.

The "cpu_core" PMU is similar to the Sapphire Rapids PMU, but without
PMEM.
The "cpu_atom" PMU is similar to Tremont, but with different
event_constraints, extra_regs and number of counters.

The mem-loads AUX event workaround only applies to the Golden Cove core.

Users may disable all CPUs of the same CPU type on the command line or
in the BIOS. For this case, perf still initializes a PMU for the CPU
type, but will not register the PMU. The PMU will only be registered
when any corresponding CPU is online.
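
Once both PMUs are registered, a user can target either one by name from the
perf tool, e.g. (illustrative):

perf stat -e cpu_core/cycles/ -e cpu_atom/cycles/ -a sleep 1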

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/intel/core.c | 248 ++-
 arch/x86/events/intel/ds.c   |   7 ++
 arch/x86/events/perf_event.h |   7 ++
 3 files changed, 261 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 8ac948a..568e100 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2076,6 +2076,14 @@ static struct extra_reg intel_tnt_extra_regs[] 
__read_mostly = {
EVENT_EXTRA_END
 };
 
+static struct extra_reg intel_grt_extra_regs[] __read_mostly = {
+   /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
+	INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
+   INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x5d0),
+   EVENT_EXTRA_END
+};
+
 #define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */
 #define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */
 #define KNL_MCDRAM_LOCAL   BIT_ULL(21)
@@ -2430,6 +2438,16 @@ static int icl_set_topdown_event_period(struct 
perf_event *event)
return 0;
 }
 
+static int adl_set_topdown_event_period(struct perf_event *event)
+{
+   struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+   if (pmu->cpu_type != INTEL_HYBRID_TYPE_CORE)
+   return 0;
+
+   return icl_set_topdown_event_period(event);
+}
+
 static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx)
 {
u32 val;
@@ -2570,6 +2588,17 @@ static u64 icl_update_topdown_event(struct perf_event 
*event)
 x86_pmu.num_topdown_events - 
1);
 }
 
+static u64 adl_update_topdown_event(struct perf_event *event)
+{
+   struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+   if (pmu->cpu_type != INTEL_HYBRID_TYPE_CORE)
+   return 0;
+
+   return icl_update_topdown_event(event);
+}
+
+
 static void intel_pmu_read_topdown_event(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -3658,6 +3687,17 @@ static inline bool is_mem_loads_aux_event(struct 
perf_event *event)
return (event->attr.config & INTEL_ARCH_EVENT_MASK) == 
X86_CONFIG(.event=0x03, .umask=0x82);
 }
 
+static inline bool require_mem_loads_aux_event(struct perf_event *event)
+{
+   if (!(x86_pmu.flags & PMU_FL_MEM_LOADS_AUX))
+   return false;
+
+   if (is_hybrid())
+		return hybrid_pmu(event->pmu)->cpu_type == INTEL_HYBRID_TYPE_CORE;
+
+   return true;
+}
+
 static inline bool intel_pmu_has_cap(struct perf_event *event, int idx)
 {
union perf_capabilities *intel_cap;
@@ -3782,7 +3822,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
 * event. The rule is to simplify the implementation of the check.
 * That's because perf cannot have a complete group at the moment.
 */
-   if (x86_pmu.flags & PMU_FL_MEM_LOADS_AUX &&
+   if (require_mem_loads_aux_event(event) &&
(event->attr.sample_type & PERF_SAMPLE_DATA_SRC) &&
is_mem_loads_event(event)) {
struct perf_event *leader = event->group_leader;
@@ -4059,6 +4099,34 @@ tfa_get_event_constraints(struct cpu_hw_events *cpuc, 
int idx,
return c;
 }
 
+static struct event_constraint *
+adl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+   struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+   if (pmu->cpu_type == INTEL_HYBRID_TYPE_CORE)
+   return spr_get_event_constraints(cpuc, idx, event);
+   else if (pmu->cpu_type == INTEL_HYBRID_TYPE_ATOM)
+   return tnt_get_event_constraints(cpuc, idx, event);
+
+   WARN_ON(1);
+	return &emptyconstraint;
+}
+
+static int adl_hw_config(struct perf_event *event)
+{
+   struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+   if (pmu->cpu_type == 

[PATCH V2 19/25] perf/x86: Support filter_match callback

2021-03-10 Thread kan . liang
From: Kan Liang 

Implement the filter_match callback for X86, which checks whether an
event is schedulable on the current CPU.
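
For context, the hybrid implementation this hook enables could look like the
following sketch (assuming the x86_hybrid_pmu/supported_cpus infrastructure
introduced earlier in this series):

	static int intel_pmu_filter_match(struct perf_event *event)
	{
		struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
		unsigned int cpu = smp_processor_id();

		/* Schedulable only on a CPU this hybrid PMU supports. */
		return cpumask_test_cpu(cpu, &pmu->supported_cpus);
	}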

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   | 10 ++
 arch/x86/events/perf_event.h |  1 +
 2 files changed, 11 insertions(+)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 43f5529..b675283 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2625,6 +2625,14 @@ static int x86_pmu_aux_output_match(struct perf_event 
*event)
return 0;
 }
 
+static int x86_pmu_filter_match(struct perf_event *event)
+{
+   if (x86_pmu.filter_match)
+   return x86_pmu.filter_match(event);
+
+   return 1;
+}
+
 static struct pmu pmu = {
.pmu_enable = x86_pmu_enable,
.pmu_disable= x86_pmu_disable,
@@ -2652,6 +2660,8 @@ static struct pmu pmu = {
.check_period   = x86_pmu_check_period,
 
.aux_output_match   = x86_pmu_aux_output_match,
+
+   .filter_match   = x86_pmu_filter_match,
 };
 
 void arch_perf_update_userpage(struct perf_event *event,
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 36a08709..74bdcfe 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -865,6 +865,7 @@ struct x86_pmu {
 
int (*aux_output_match) (struct perf_event *event);
 
+   int (*filter_match)(struct perf_event *event);
/*
 * Hybrid support
 *
-- 
2.7.4



[PATCH V2 21/25] perf: Introduce PERF_TYPE_HARDWARE_PMU and PERF_TYPE_HW_CACHE_PMU

2021-03-10 Thread kan . liang
From: Kan Liang 

Current Hardware events and Hardware cache events have special perf
types, PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE. The two types don't
pass the PMU type in the user interface. For a hybrid system, the perf
subsystem doesn't know which PMU the events belong to. The first capable
PMU will always be assigned to the events. The events never get a chance
to run on the other capable PMUs.

Add PMU-aware versions, PERF_TYPE_HARDWARE_PMU and
PERF_TYPE_HW_CACHE_PMU. The PMU type ID is stored in attr.config[39:32].
Support the new types for X86.
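
Concretely, encoding a hardware event for one specific PMU from user space
would look roughly like this (a sketch; pmu_type is the PMU's type ID, e.g.
as read from /sys/bus/event_source/devices/<pmu>/type):

	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE_PMU;
	/* PMU type ID in config[39:32], hardware event ID in config[7:0]. */
	attr.config = ((__u64)pmu_type << 32) | PERF_COUNT_HW_INSTRUCTIONS;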

Suggested-by: Andi Kleen 
Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c  | 10 --
 include/uapi/linux/perf_event.h | 26 ++
 kernel/events/core.c| 14 +-
 3 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index b675283..f031d00 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -484,7 +484,7 @@ int x86_setup_perfctr(struct perf_event *event)
if (attr->type == event->pmu->type)
return x86_pmu_extra_regs(event->attr.config, event);
 
-   if (attr->type == PERF_TYPE_HW_CACHE)
+	if ((attr->type == PERF_TYPE_HW_CACHE) || (attr->type == PERF_TYPE_HW_CACHE_PMU))
return set_ext_hw_attr(hwc, event);
 
if (attr->config >= x86_pmu.max_events)
@@ -2427,9 +2427,15 @@ static int x86_pmu_event_init(struct perf_event *event)
 
if ((event->attr.type != event->pmu->type) &&
(event->attr.type != PERF_TYPE_HARDWARE) &&
-   (event->attr.type != PERF_TYPE_HW_CACHE))
+   (event->attr.type != PERF_TYPE_HW_CACHE) &&
+   (event->attr.type != PERF_TYPE_HARDWARE_PMU) &&
+   (event->attr.type != PERF_TYPE_HW_CACHE_PMU))
return -ENOENT;
 
+   if ((event->attr.type == PERF_TYPE_HARDWARE_PMU) ||
+   (event->attr.type == PERF_TYPE_HW_CACHE_PMU))
+   event->attr.config &= PERF_HW_CACHE_EVENT_MASK;
+
if (is_hybrid() && (event->cpu != -1)) {
pmu = hybrid_pmu(event->pmu);
+		if (!cpumask_test_cpu(event->cpu, &pmu->supported_cpus))
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index ad15e40..c0a511e 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -33,6 +33,8 @@ enum perf_type_id {
PERF_TYPE_HW_CACHE  = 3,
PERF_TYPE_RAW   = 4,
PERF_TYPE_BREAKPOINT= 5,
+   PERF_TYPE_HARDWARE_PMU  = 6,
+   PERF_TYPE_HW_CACHE_PMU  = 7,
 
PERF_TYPE_MAX,  /* non-ABI */
 };
@@ -95,6 +97,30 @@ enum perf_hw_cache_op_result_id {
 };
 
 /*
+ * attr.config layout for type PERF_TYPE_HARDWARE* and PERF_TYPE_HW_CACHE*
+ * PERF_TYPE_HARDWARE: 0xAA
+ * AA: hardware event ID
+ * PERF_TYPE_HW_CACHE: 0xCCBBAA
+ * AA: hardware cache ID
+ * BB: hardware cache op ID
+ * CC: hardware cache op result ID
+ * PERF_TYPE_HARDWARE_PMU: 0xDD000000AA
+ * AA: hardware event ID
+ * DD: PMU type ID
+ * PERF_TYPE_HW_CACHE_PMU: 0xDD00CCBBAA
+ * AA: hardware cache ID
+ * BB: hardware cache op ID
+ * CC: hardware cache op result ID
+ * DD: PMU type ID
+ */
+#define PERF_HW_CACHE_ID_SHIFT 0
+#define PERF_HW_CACHE_OP_ID_SHIFT  8
+#define PERF_HW_CACHE_OP_RESULT_ID_SHIFT   16
+#define PERF_HW_CACHE_EVENT_MASK		0xffffff
+
+#define PERF_PMU_TYPE_SHIFT32
+
+/*
  * Special "software" events provided by the kernel, even if the hardware
  * does not support performance events. These events measure various
  * physical and sw events of the kernel (and allow the profiling of them as
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0aeca5f..6d7524e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11059,6 +11059,14 @@ static int perf_try_init_event(struct pmu *pmu, struct 
perf_event *event)
return ret;
 }
 
+static bool perf_event_is_hw_pmu_type(struct perf_event *event)
+{
+   int type = event->attr.type;
+
+   return type == PERF_TYPE_HARDWARE_PMU ||
+  type == PERF_TYPE_HW_CACHE_PMU;
+}
+
 static struct pmu *perf_init_event(struct perf_event *event)
 {
int idx, type, ret;
@@ -11082,13 +11090,17 @@ static struct pmu *perf_init_event(struct perf_event 
*event)
if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)
type = PERF_TYPE_RAW;
 
+   if (perf_event_is_hw_pmu_type(event))
+   type = 

[PATCH V2 23/25] perf/x86/msr: Add Alder Lake CPU support

2021-03-10 Thread kan . liang
From: Kan Liang 

PPERF and SMI_COUNT MSRs are also supported on Alder Lake.

The External Design Specification (EDS) is not published yet. It comes
from an authoritative internal source.

The patch has been tested on real hardware.

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/msr.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index 680404c..c853b28 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -100,6 +100,8 @@ static bool test_intel(int idx, void *data)
case INTEL_FAM6_TIGERLAKE_L:
case INTEL_FAM6_TIGERLAKE:
case INTEL_FAM6_ROCKETLAKE:
+   case INTEL_FAM6_ALDERLAKE:
+   case INTEL_FAM6_ALDERLAKE_L:
if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
return true;
break;
-- 
2.7.4



[PATCH V2 18/25] perf/x86/intel: Add attr_update for Hybrid PMUs

2021-03-10 Thread kan . liang
From: Kan Liang 

The attribute_group for Hybrid PMUs should be different from the previous
cpu PMU. For example, cpumask is required for a Hybrid PMU. The PMU type
should be included in the event and format attribute.

Add hybrid_attr_update for the Hybrid PMU.
Check the PMU type in is_visible() function. Only display the event or
format for the matched Hybrid PMU.

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/intel/core.c | 120 ---
 1 file changed, 114 insertions(+), 6 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 0026a54..8ac948a 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -5266,6 +5266,106 @@ static const struct attribute_group *attr_update[] = {
NULL,
 };
 
+static bool is_attr_for_this_pmu(struct kobject *kobj, struct attribute *attr)
+{
+   struct device *dev = kobj_to_dev(kobj);
+   struct x86_hybrid_pmu *pmu =
+   container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+   struct perf_pmu_events_hybrid_attr *pmu_attr =
+		container_of(attr, struct perf_pmu_events_hybrid_attr, attr.attr);
+
+   return pmu->cpu_type & pmu_attr->pmu_type;
+}
+
+static umode_t hybrid_events_is_visible(struct kobject *kobj,
+   struct attribute *attr, int i)
+{
+   return is_attr_for_this_pmu(kobj, attr) ? attr->mode : 0;
+}
+
+static inline int hybrid_find_supported_cpu(struct x86_hybrid_pmu *pmu)
+{
+	int cpu = cpumask_first(&pmu->supported_cpus);
+
+   return (cpu >= nr_cpu_ids) ? -1 : cpu;
+}
+
+static umode_t hybrid_tsx_is_visible(struct kobject *kobj,
+struct attribute *attr, int i)
+{
+   struct device *dev = kobj_to_dev(kobj);
+   struct x86_hybrid_pmu *pmu =
+container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+   int cpu = hybrid_find_supported_cpu(pmu);
+
+	return (cpu >= 0) && is_attr_for_this_pmu(kobj, attr) &&
+	       cpu_has(&cpu_data(cpu), X86_FEATURE_RTM) ? attr->mode : 0;
+}
+
+static umode_t hybrid_format_is_visible(struct kobject *kobj,
+   struct attribute *attr, int i)
+{
+   struct device *dev = kobj_to_dev(kobj);
+   struct x86_hybrid_pmu *pmu =
+   container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+   struct perf_pmu_format_hybrid_attr *pmu_attr =
+		container_of(attr, struct perf_pmu_format_hybrid_attr, attr.attr);
+   int cpu = hybrid_find_supported_cpu(pmu);
+
+	return (cpu >= 0) && (pmu->cpu_type & pmu_attr->pmu_type) ? attr->mode : 0;
+}
+
+static struct attribute_group hybrid_group_events_td  = {
+   .name   = "events",
+   .is_visible = hybrid_events_is_visible,
+};
+
+static struct attribute_group hybrid_group_events_mem = {
+   .name   = "events",
+   .is_visible = hybrid_events_is_visible,
+};
+
+static struct attribute_group hybrid_group_events_tsx = {
+   .name   = "events",
+   .is_visible = hybrid_tsx_is_visible,
+};
+
+static struct attribute_group hybrid_group_format_extra = {
+   .name   = "format",
+   .is_visible = hybrid_format_is_visible,
+};
+
+static ssize_t intel_hybrid_get_attr_cpus(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+   struct x86_hybrid_pmu *pmu =
+   container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+
+	return cpumap_print_to_pagebuf(true, buf, &pmu->supported_cpus);
+}
+
+static DEVICE_ATTR(cpus, S_IRUGO, intel_hybrid_get_attr_cpus, NULL);
+static struct attribute *intel_hybrid_cpus_attrs[] = {
+	&dev_attr_cpus.attr,
+   NULL,
+};
+
+static struct attribute_group hybrid_group_cpus = {
+   .attrs  = intel_hybrid_cpus_attrs,
+};
+
+static const struct attribute_group *hybrid_attr_update[] = {
+	&hybrid_group_events_td,
+	&hybrid_group_events_mem,
+	&hybrid_group_events_tsx,
+	&group_caps_gen,
+	&group_caps_lbr,
+	&hybrid_group_format_extra,
+	&group_default,
+	&hybrid_group_cpus,
+   NULL,
+};
+
 static struct attribute *empty_attrs;
 
 __init int intel_pmu_init(void)
@@ -5896,14 +5996,22 @@ __init int intel_pmu_init(void)
 
snprintf(pmu_name_str, sizeof(pmu_name_str), "%s", name);
 
+   if (!is_hybrid()) {
+   group_events_td.attrs  = td_attr;
+   group_events_mem.attrs = mem_attr;
+   group_events_tsx.attrs = tsx_attr;
+   group_format_extra.attrs = extra_attr;
+   group_format_extra_skl.attrs = extra_skl_attr;
 
-   group_events_td.attrs  = td_attr;
-   group_events_mem.attrs = mem_attr;
-   group_events_tsx.attrs = tsx_attr;
-   group_format_extra.attrs = extra_attr;
-   group_format_extra_skl.attrs = extra_skl_attr;
+   

[PATCH V2 17/25] perf/x86: Add structures for the attributes of Hybrid PMUs

2021-03-10 Thread kan . liang
From: Kan Liang 

Hybrid PMUs have different events and formats. In theory, Hybrid PMU
specific attributes should be maintained in the dedicated struct
x86_hybrid_pmu, but it wastes space because the events and formats are
similar among Hybrid PMUs.

To reduce duplication, all hybrid PMUs will share a group of attributes
in the following patch. To distinguish an attribute from different
Hybrid PMUs, a PMU aware attribute structure is introduced. A PMU type
is required for the attribute structure. The type is for internal use; it
is not visible in the sysfs API.

Hybrid PMUs may support the same event name, but with different event
encoding, e.g., the mem-loads event on an Atom PMU has different event
encoding from a Core PMU. It brings issue if two attributes are
created for them. Current sysfs_update_group finds an attribute by
searching the attr name (aka event name). If two attributes have the
same event name, the first attribute will be replaced.
To address the issue, only one attribute is created for the event. The
event_str is extended and stores event encodings from all Hybrid PMUs.
Each event encoding is divided by ";". The order of the event encodings
must follow the order of the hybrid PMU index. The event_str is for internal
use as well. When a user wants to show the attribute of a Hybrid PMU,
only the corresponding part of the string is displayed.
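
As an illustration, a shared event attribute for two hybrid PMUs could then
be declared as below (the event encodings are made up for the example; the
type masks are those used elsewhere in this series):

	/*
	 * "event=0xc0" is shown on the core PMU and "event=0xc1" on the
	 * atom PMU; the order follows the hybrid PMU index.
	 */
	EVENT_ATTR_STR_HYBRID(my-event, my_event,
			      "event=0xc0;event=0xc1",
			      INTEL_HYBRID_TYPE_CORE | INTEL_HYBRID_TYPE_ATOM);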

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   | 43 +++
 arch/x86/events/perf_event.h | 19 +++
 include/linux/perf_event.h   | 12 
 3 files changed, 74 insertions(+)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 7dc1430..43f5529 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1864,6 +1864,49 @@ ssize_t events_ht_sysfs_show(struct device *dev, struct 
device_attribute *attr,
pmu_attr->event_str_noht);
 }
 
+ssize_t events_hybrid_sysfs_show(struct device *dev,
+struct device_attribute *attr,
+char *page)
+{
+   struct perf_pmu_events_hybrid_attr *pmu_attr =
+   container_of(attr, struct perf_pmu_events_hybrid_attr, attr);
+   struct x86_hybrid_pmu *pmu;
+   const char *str, *next_str;
+   int i;
+
+   if (hweight64(pmu_attr->pmu_type) == 1)
+   return sprintf(page, "%s", pmu_attr->event_str);
+
+   /*
+* Hybrid PMUs may support the same event name, but with different
+* event encoding, e.g., the mem-loads event on an Atom PMU has
+* different event encoding from a Core PMU.
+*
+* The event_str includes all event encodings. Each event encoding
+* is divided by ";". The order of the event encodings must follow
+* the order of the hybrid PMU index.
+*/
+   pmu = container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+
+   str = pmu_attr->event_str;
+   for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
+   if (!(x86_pmu.hybrid_pmu[i].cpu_type & pmu_attr->pmu_type))
+   continue;
+   if (x86_pmu.hybrid_pmu[i].cpu_type & pmu->cpu_type) {
+   next_str = strchr(str, ';');
+   if (next_str)
+				return snprintf(page, next_str - str + 1, "%s", str);
+   else
+   return sprintf(page, "%s", str);
+   }
+   str = strchr(str, ';');
+   str++;
+   }
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(events_hybrid_sysfs_show);
+
 EVENT_ATTR(cpu-cycles, CPU_CYCLES  );
 EVENT_ATTR(instructions,   INSTRUCTIONS);
 EVENT_ATTR(cache-references,   CACHE_REFERENCES);
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 809c036..36a08709 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -964,6 +964,22 @@ static struct perf_pmu_events_ht_attr event_attr_##v = {   
\
.event_str_ht   = ht,   \
 }
 
+#define EVENT_ATTR_STR_HYBRID(_name, v, str, _pmu) \
+static struct perf_pmu_events_hybrid_attr event_attr_##v = {   \
+   .attr   = __ATTR(_name, 0444, events_hybrid_sysfs_show, NULL),\
+   .id = 0,\
+   .event_str  = str,  \
+   .pmu_type   = _pmu, \
+}
+
+#define FORMAT_HYBRID_PTR(_id) (&format_attr_hybrid_##_id.attr.attr)
+
+#define FORMAT_ATTR_HYBRID(_name, _pmu)
\
+static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\
+   .attr   = __ATTR_RO(_name), \
+   .pmu_type  

[PATCH V2 16/25] perf/x86: Register hybrid PMUs

2021-03-10 Thread kan . liang
From: Kan Liang 

Different hybrid PMUs have different PMU capabilities and events. Perf
should register a dedicated PMU for each of them.

To check the X86 event, perf has to go through all possible hybrid PMUs.

Only the PMU for the boot CPU is registered in init_hw_perf_events()
because the boot CPU is the only online CPU at that moment.
The init_hybrid_pmu() is introduced to register and initialize the other
types of PMUs when a new type of CPU comes online.

All hybrid PMUs have capability PERF_PMU_CAP_HETEROGENEOUS_CPUS.
The PMU name for hybrid PMUs will be "cpu_XXX", which will be assigned
later in a separate patch.

The PMU type id for the core PMU is still PERF_TYPE_RAW. For the other
hybrid PMUs, the PMU type id is not hard-coded.

The event->cpu must be compatible with the supported CPUs of the PMU.
Add a check in x86_pmu_event_init().

The events in a group must be from the same type of hybrid PMU.
The fake cpuc used in the validation must be from the supported CPU of
the event->pmu.
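
A sketch of the kind of check this adds in x86_pmu_event_init(), after
__x86_pmu_event_init() succeeds (the supported_cpus field name is an
assumption about how the per-PMU CPU mask might be stored):

	/* reject events bound to a CPU that this hybrid PMU cannot serve */
	if (is_hybrid() && !err && event->cpu != -1) {
		struct x86_hybrid_pmu *hpmu = hybrid_pmu(event->pmu);

		if (!cpumask_test_cpu(event->cpu, &hpmu->supported_cpus))
			err = -ENOENT;
	}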

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   | 110 +---
 arch/x86/events/intel/core.c | 116 ++-
 arch/x86/events/perf_event.h |  12 +
 3 files changed, 218 insertions(+), 20 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 15d9325..7dc1430 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -481,7 +481,7 @@ int x86_setup_perfctr(struct perf_event *event)
	local64_set(&hwc->period_left, hwc->sample_period);
}
 
-   if (attr->type == PERF_TYPE_RAW)
+   if (attr->type == event->pmu->type)
return x86_pmu_extra_regs(event->attr.config, event);
 
if (attr->type == PERF_TYPE_HW_CACHE)
@@ -616,7 +616,7 @@ int x86_pmu_hw_config(struct perf_event *event)
if (!event->attr.exclude_kernel)
event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
 
-   if (event->attr.type == PERF_TYPE_RAW)
+   if (event->attr.type == event->pmu->type)
event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 
if (event->attr.sample_period && x86_pmu.limit_period) {
@@ -745,7 +745,17 @@ void x86_pmu_enable_all(int added)
 
 static inline int is_x86_event(struct perf_event *event)
 {
-	return event->pmu == &pmu;
+   int i;
+
+   if (!is_hybrid())
+		return event->pmu == &pmu;
+
+   for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
+		if (event->pmu == &x86_pmu.hybrid_pmu[i].pmu)
+   return true;
+   }
+
+   return false;
 }
 
 struct pmu *x86_get_pmu(unsigned int cpu)
@@ -2061,8 +2071,11 @@ static int __init init_hw_perf_events(void)
 
pmu.attr_update = x86_pmu.attr_update;
 
-   x86_pmu_show_pmu_cap(x86_pmu.num_counters, x86_pmu.num_counters_fixed,
-x86_pmu.intel_ctrl);
+   if (!is_hybrid()) {
+   x86_pmu_show_pmu_cap(x86_pmu.num_counters,
+x86_pmu.num_counters_fixed,
+x86_pmu.intel_ctrl);
+   }
 
if (!x86_pmu.read)
x86_pmu.read = _x86_pmu_read;
@@ -2092,9 +2105,37 @@ static int __init init_hw_perf_events(void)
if (err)
goto out1;
 
-	err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
-   if (err)
-   goto out2;
+   if (!is_hybrid()) {
+		err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
+   if (err)
+   goto out2;
+   } else {
+   u8 cpu_type = get_hybrid_cpu_type(smp_processor_id());
+   struct x86_hybrid_pmu *hybrid_pmu;
+   int i;
+
+   for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
+			hybrid_pmu = &x86_pmu.hybrid_pmu[i];
+
+   hybrid_pmu->pmu = pmu;
+   hybrid_pmu->pmu.type = -1;
+   hybrid_pmu->pmu.attr_update = x86_pmu.attr_update;
+			hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_HETEROGENEOUS_CPUS;
+
+   /* Only register the PMU for the boot CPU */
+   if (hybrid_pmu->cpu_type != cpu_type)
+   continue;
+
+			err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name,
+						x86_get_hybrid_pmu_type(cpu_type));
+   if (err) {
+				pr_warn("Failed to register a PMU, err %d\n", err);
+   kfree(x86_pmu.hybrid_pmu);
+   x86_pmu.hybrid_pmu = NULL;
+   goto out2;
+   }
+   }
+   }
 
return 0;
 
@@ -2219,16 +2260,25 @@ static void free_fake_cpuc(struct cpu_hw_events *cpuc)
kfree(cpuc);
 }
 
-static struct 

[PATCH V2 11/25] perf/x86/intel: Factor out intel_pmu_check_num_counters

2021-03-10 Thread kan . liang
From: Kan Liang 

Each Hybrid PMU has to check its own number of counters and mask fixed
counters before registration.

The intel_pmu_check_num_counters will be reused later when registering a
dedicated hybrid PMU.
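
For example, when each hybrid PMU is registered later in the series,
the helper could be invoked per PMU roughly like this (hybrid_pmu here
is a hypothetical pointer to one x86_hybrid_pmu instance):

	intel_pmu_check_num_counters(&hybrid_pmu->num_counters,
				     &hybrid_pmu->num_counters_fixed,
				     &hybrid_pmu->intel_ctrl,
				     (u64)fixed_mask);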

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/intel/core.c | 38 --
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 1030fa9..2da6ae2 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4215,6 +4215,26 @@ static void flip_smm_bit(void *data)
}
 }
 
+static void intel_pmu_check_num_counters(int *num_counters,
+int *num_counters_fixed,
+u64 *intel_ctrl, u64 fixed_mask)
+{
+   if (*num_counters > INTEL_PMC_MAX_GENERIC) {
+   WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
+*num_counters, INTEL_PMC_MAX_GENERIC);
+   *num_counters = INTEL_PMC_MAX_GENERIC;
+   }
+   *intel_ctrl = (1ULL << *num_counters) - 1;
+
+   if (*num_counters_fixed > INTEL_PMC_MAX_FIXED) {
+   WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
+*num_counters_fixed, INTEL_PMC_MAX_FIXED);
+   *num_counters_fixed = INTEL_PMC_MAX_FIXED;
+   }
+
+   *intel_ctrl |= fixed_mask << INTEL_PMC_IDX_FIXED;
+}
+
 static void intel_pmu_cpu_starting(int cpu)
 {
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
@@ -5706,20 +5726,10 @@ __init int intel_pmu_init(void)
 
x86_pmu.attr_update = attr_update;
 
-   if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
-   WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
-x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
-   x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
-   }
-   x86_pmu.intel_ctrl = (1ULL << x86_pmu.num_counters) - 1;
-
-   if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
-   WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
-x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED);
-   x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
-   }
-
-   x86_pmu.intel_ctrl |= (u64)fixed_mask << INTEL_PMC_IDX_FIXED;
+	intel_pmu_check_num_counters(&x86_pmu.num_counters,
+				     &x86_pmu.num_counters_fixed,
+				     &x86_pmu.intel_ctrl,
+				     (u64)fixed_mask);
 
/* AnyThread may be deprecated on arch perfmon v5 or later */
if (x86_pmu.intel_cap.anythread_deprecated)
-- 
2.7.4



[PATCH V2 10/25] perf/x86: Hybrid PMU support for extra_regs

2021-03-10 Thread kan . liang
From: Kan Liang 

Different hybrid PMUs may have different extra registers, e.g. a Core
PMU may have offcore registers, a frontend register and a ldlat
register, while an Atom core may only have offcore registers and a
ldlat register. Each hybrid PMU should use its own extra_regs.

An Intel Hybrid system should always have extra registers.
Unconditionally allocate shared_regs for an Intel Hybrid system.
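
The diffs below rely on a hybrid()/hybrid_pmu() accessor that picks the
per-PMU field on hybrid systems and falls back to the global x86_pmu
otherwise. A sketch of one way to write it (the exact definition lands
in perf_event.h elsewhere in the series):

static __always_inline struct x86_hybrid_pmu *hybrid_pmu(struct pmu *pmu)
{
	return container_of(pmu, struct x86_hybrid_pmu, pmu);
}

/* resolve a field either from the hybrid PMU or from the global x86_pmu */
#define hybrid(_pmu, _field)					\
(*({								\
	typeof(&x86_pmu._field) __Fp = &x86_pmu._field;		\
								\
	if (is_hybrid() && (_pmu))				\
		__Fp = &hybrid_pmu(_pmu)->_field;		\
								\
	__Fp;							\
}))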

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   |  5 +++--
 arch/x86/events/intel/core.c | 15 +--
 arch/x86/events/perf_event.h |  1 +
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 38c271a..32e8fed 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -149,15 +149,16 @@ u64 x86_perf_event_update(struct perf_event *event)
  */
 static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
 {
+   struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs);
struct hw_perf_event_extra *reg;
struct extra_reg *er;
 
	reg = &event->hw.extra_reg;
 
-   if (!x86_pmu.extra_regs)
+   if (!extra_regs)
return 0;
 
-   for (er = x86_pmu.extra_regs; er->msr; er++) {
+   for (er = extra_regs; er->msr; er++) {
if (er->event != (config & er->config_mask))
continue;
if (event->attr.config1 & ~er->valid_mask)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 572509f..1030fa9 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2966,8 +2966,10 @@ intel_vlbr_constraints(struct perf_event *event)
return NULL;
 }
 
-static int intel_alt_er(int idx, u64 config)
+static int intel_alt_er(struct cpu_hw_events *cpuc,
+   int idx, u64 config)
 {
+   struct extra_reg *extra_regs = hybrid(cpuc->pmu, extra_regs);
int alt_idx = idx;
 
if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
@@ -2979,7 +2981,7 @@ static int intel_alt_er(int idx, u64 config)
if (idx == EXTRA_REG_RSP_1)
alt_idx = EXTRA_REG_RSP_0;
 
-   if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask)
+   if (config & ~extra_regs[alt_idx].valid_mask)
return idx;
 
return alt_idx;
@@ -2987,15 +2989,16 @@ static int intel_alt_er(int idx, u64 config)
 
 static void intel_fixup_er(struct perf_event *event, int idx)
 {
+   struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs);
event->hw.extra_reg.idx = idx;
 
if (idx == EXTRA_REG_RSP_0) {
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
-   event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event;
+   event->hw.config |= extra_regs[EXTRA_REG_RSP_0].event;
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
} else if (idx == EXTRA_REG_RSP_1) {
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
-   event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event;
+   event->hw.config |= extra_regs[EXTRA_REG_RSP_1].event;
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
}
 }
@@ -3071,7 +3074,7 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
 */
c = NULL;
} else {
-   idx = intel_alt_er(idx, reg->config);
+   idx = intel_alt_er(cpuc, idx, reg->config);
if (idx != reg->idx) {
			raw_spin_unlock_irqrestore(&era->lock, flags);
goto again;
@@ -4158,7 +4161,7 @@ int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu)
 {
cpuc->pebs_record_size = x86_pmu.pebs_record_size;
 
-   if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
+   if (is_hybrid() || x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
cpuc->shared_regs = allocate_shared_regs(cpu);
if (!cpuc->shared_regs)
goto err;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 9316fec..e65a477 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -651,6 +651,7 @@ struct x86_hybrid_pmu {
[PERF_COUNT_HW_CACHE_RESULT_MAX];
struct event_constraint *event_constraints;
struct event_constraint *pebs_constraints;
+   struct extra_reg*extra_regs;
 };
 
 static __always_inline bool is_hybrid(void)
-- 
2.7.4



[PATCH V2 13/25] perf/x86/intel: Factor out intel_pmu_check_extra_regs

2021-03-10 Thread kan . liang
From: Kan Liang 

Each Hybrid PMU has to check and update its own extra registers before
registration.

The intel_pmu_check_extra_regs will be reused later when registering a
dedicated hybrid PMU.

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/intel/core.c | 37 +++--
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 6df8edd7..cdfbab3 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4278,6 +4278,28 @@ static void intel_pmu_check_event_constraints(struct event_constraint *event_con
}
 }
 
+static bool check_msr(unsigned long msr, u64 mask);
+
+static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs)
+{
+   struct extra_reg *er;
+
+   /*
+* Access extra MSR may cause #GP under certain circumstances.
+* E.g. KVM doesn't support offcore event
+* Check all extra_regs here.
+*/
+   if (!extra_regs)
+   return;
+
+   for (er = extra_regs; er->msr; er++) {
+   er->extra_msr_access = check_msr(er->msr, 0x11UL);
+   /* Disable LBR select mapping */
+   if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
+   x86_pmu.lbr_sel_map = NULL;
+   }
+}
+
 static void intel_pmu_cpu_starting(int cpu)
 {
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
@@ -5141,7 +5163,6 @@ __init int intel_pmu_init(void)
union cpuid10_eax eax;
union cpuid10_ebx ebx;
unsigned int fixed_mask;
-   struct extra_reg *er;
bool pmem = false;
int version, i;
char *name;
@@ -5798,19 +5819,7 @@ __init int intel_pmu_init(void)
if (x86_pmu.lbr_nr)
pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
 
-   /*
-* Access extra MSR may cause #GP under certain circumstances.
-* E.g. KVM doesn't support offcore event
-* Check all extra_regs here.
-*/
-   if (x86_pmu.extra_regs) {
-   for (er = x86_pmu.extra_regs; er->msr; er++) {
-   er->extra_msr_access = check_msr(er->msr, 0x11UL);
-   /* Disable LBR select mapping */
-   if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
-   x86_pmu.lbr_sel_map = NULL;
-   }
-   }
+   intel_pmu_check_extra_regs(x86_pmu.extra_regs);
 
/* Support full width counters using alternative MSR range */
if (x86_pmu.intel_cap.full_width_write) {
-- 
2.7.4



[PATCH V2 14/25] perf/x86: Remove temporary pmu assignment in event_init

2021-03-10 Thread kan . liang
From: Kan Liang 

The temporary pmu assignment in event_init is unnecessary.

The assignment was introduced by commit 8113070d6639 ("perf_events:
Add fast-path to the rescheduling code"). At that time, event->pmu was
not yet assigned when an event was initialized, so the assignment was
required. However, since commit 7e5b2a01d2ca ("perf: provide PMU when
initing events"), event->pmu is provided before event_init is invoked.
The temporary pmu assignment in event_init can therefore be removed.

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c | 11 ---
 1 file changed, 11 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 32e8fed..96b4a72 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2302,7 +2302,6 @@ static int validate_group(struct perf_event *event)
 
 static int x86_pmu_event_init(struct perf_event *event)
 {
-   struct pmu *tmp;
int err;
 
switch (event->attr.type) {
@@ -2317,20 +2316,10 @@ static int x86_pmu_event_init(struct perf_event *event)
 
err = __x86_pmu_event_init(event);
if (!err) {
-   /*
-* we temporarily connect event to its pmu
-* such that validate_group() can classify
-* it as an x86 event using is_x86_event()
-*/
-   tmp = event->pmu;
-   event->pmu = 
-
if (event->group_leader != event)
err = validate_group(event);
else
err = validate_event(event);
-
-   event->pmu = tmp;
}
if (err) {
if (event->destroy)
-- 
2.7.4



[PATCH V2 15/25] perf/x86: Factor out x86_pmu_show_pmu_cap

2021-03-10 Thread kan . liang
From: Kan Liang 

The PMU capabilities are different among hybrid PMUs. Perf should dump
the PMU capabilities information for each hybrid PMU.

Factor out x86_pmu_show_pmu_cap() which shows the PMU capabilities
information. The function will be reused later when registering a
dedicated hybrid PMU.

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   | 25 -
 arch/x86/events/perf_event.h |  3 +++
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 96b4a72..15d9325 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1987,6 +1987,20 @@ perf_guest_get_msrs_nop(int *nr)
return NULL;
 }
 
+void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
+ u64 intel_ctrl)
+{
+   pr_info("... version:%d\n", x86_pmu.version);
+   pr_info("... bit width:  %d\n", x86_pmu.cntval_bits);
+   pr_info("... generic registers:  %d\n", num_counters);
+   pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
+   pr_info("... max period: %016Lx\n", x86_pmu.max_period);
+   pr_info("... fixed-purpose events:   %lu\n",
+		hweight64((((1ULL << num_counters_fixed) - 1)
+				<< INTEL_PMC_IDX_FIXED) & intel_ctrl));
+   pr_info("... event mask: %016Lx\n", intel_ctrl);
+}
+
 static int __init init_hw_perf_events(void)
 {
struct x86_pmu_quirk *quirk;
@@ -2047,15 +2061,8 @@ static int __init init_hw_perf_events(void)
 
pmu.attr_update = x86_pmu.attr_update;
 
-   pr_info("... version:%d\n", x86_pmu.version);
-   pr_info("... bit width:  %d\n", x86_pmu.cntval_bits);
-   pr_info("... generic registers:  %d\n", x86_pmu.num_counters);
-   pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
-   pr_info("... max period: %016Lx\n", x86_pmu.max_period);
-   pr_info("... fixed-purpose events:   %lu\n",
-		hweight64((((1ULL << x86_pmu.num_counters_fixed) - 1)
-			<< INTEL_PMC_IDX_FIXED) & x86_pmu.intel_ctrl));
-   pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
+   x86_pmu_show_pmu_cap(x86_pmu.num_counters, x86_pmu.num_counters_fixed,
+x86_pmu.intel_ctrl);
 
if (!x86_pmu.read)
x86_pmu.read = _x86_pmu_read;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index e65a477..3ec5914 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1085,6 +1085,9 @@ void x86_pmu_enable_event(struct perf_event *event);
 
 int x86_pmu_handle_irq(struct pt_regs *regs);
 
+void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
+ u64 intel_ctrl);
+
 extern struct event_constraint emptyconstraint;
 
 extern struct event_constraint unconstrained;
-- 
2.7.4



[PATCH V2 12/25] perf/x86/intel: Factor out intel_pmu_check_event_constraints

2021-03-10 Thread kan . liang
From: Kan Liang 

Each Hybrid PMU has to check and update its own event constraints before
registration.

The intel_pmu_check_event_constraints will be reused later when
registering a dedicated hybrid PMU.

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/intel/core.c | 82 +---
 1 file changed, 47 insertions(+), 35 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 2da6ae2..6df8edd7 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4235,6 +4235,49 @@ static void intel_pmu_check_num_counters(int *num_counters,
*intel_ctrl |= fixed_mask << INTEL_PMC_IDX_FIXED;
 }
 
+static void intel_pmu_check_event_constraints(struct event_constraint *event_constraints,
+ int num_counters,
+ int num_counters_fixed,
+ u64 intel_ctrl)
+{
+   struct event_constraint *c;
+
+   if (!event_constraints)
+   return;
+
+   /*
+* event on fixed counter2 (REF_CYCLES) only works on this
+* counter, so do not extend mask to generic counters
+*/
+   for_each_event_constraint(c, event_constraints) {
+   /*
+* Don't extend the topdown slots and metrics
+* events to the generic counters.
+*/
+   if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) {
+   /*
+* Disable topdown slots and metrics events,
+* if slots event is not in CPUID.
+*/
+   if (!(INTEL_PMC_MSK_FIXED_SLOTS & intel_ctrl))
+   c->idxmsk64 = 0;
+   c->weight = hweight64(c->idxmsk64);
+   continue;
+   }
+
+   if (c->cmask == FIXED_EVENT_FLAGS) {
+   /* Disabled fixed counters which are not in CPUID */
+   c->idxmsk64 &= intel_ctrl;
+
+   if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES)
+   c->idxmsk64 |= (1ULL << num_counters) - 1;
+   }
+   c->idxmsk64 &=
+   ~(~0ULL << (INTEL_PMC_IDX_FIXED + num_counters_fixed));
+   c->weight = hweight64(c->idxmsk64);
+   }
+}
+
 static void intel_pmu_cpu_starting(int cpu)
 {
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
@@ -5097,7 +5140,6 @@ __init int intel_pmu_init(void)
union cpuid10_edx edx;
union cpuid10_eax eax;
union cpuid10_ebx ebx;
-   struct event_constraint *c;
unsigned int fixed_mask;
struct extra_reg *er;
bool pmem = false;
@@ -5735,40 +5777,10 @@ __init int intel_pmu_init(void)
if (x86_pmu.intel_cap.anythread_deprecated)
x86_pmu.format_attrs = intel_arch_formats_attr;
 
-   if (x86_pmu.event_constraints) {
-   /*
-* event on fixed counter2 (REF_CYCLES) only works on this
-* counter, so do not extend mask to generic counters
-*/
-   for_each_event_constraint(c, x86_pmu.event_constraints) {
-   /*
-* Don't extend the topdown slots and metrics
-* events to the generic counters.
-*/
-   if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) {
-   /*
-* Disable topdown slots and metrics events,
-* if slots event is not in CPUID.
-*/
-			if (!(INTEL_PMC_MSK_FIXED_SLOTS & x86_pmu.intel_ctrl))
-   c->idxmsk64 = 0;
-   c->weight = hweight64(c->idxmsk64);
-   continue;
-   }
-
-   if (c->cmask == FIXED_EVENT_FLAGS) {
-				/* Disabled fixed counters which are not in CPUID */
-   c->idxmsk64 &= x86_pmu.intel_ctrl;
-
-				if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES)
-					c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
-   }
-   c->idxmsk64 &=
-			~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
-   c->weight = hweight64(c->idxmsk64);
-   }
-   }
-
+   intel_pmu_check_event_constraints(x86_pmu.event_constraints,
+ x86_pmu.num_counters,
+ x86_pmu.num_counters_fixed,
+ x86_pmu.intel_ctrl);
/*
   

[PATCH V2 09/25] perf/x86: Hybrid PMU support for event constraints

2021-03-10 Thread kan . liang
From: Kan Liang 

The events are different among hybrid PMUs. Each hybrid PMU should use
its own event constraints.

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   | 3 ++-
 arch/x86/events/intel/core.c | 5 +++--
 arch/x86/events/intel/ds.c   | 5 +++--
 arch/x86/events/perf_event.h | 2 ++
 4 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 1db4a67..38c271a 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1522,6 +1522,7 @@ void perf_event_print_debug(void)
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
int num_counters = hybrid(cpuc->pmu, num_counters);
int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
+	struct event_constraint *pebs_constraints = hybrid(cpuc->pmu, pebs_constraints);
unsigned long flags;
int idx;
 
@@ -1541,7 +1542,7 @@ void perf_event_print_debug(void)
pr_info("CPU#%d: status: %016llx\n", cpu, status);
pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
pr_info("CPU#%d: fixed:  %016llx\n", cpu, fixed);
-   if (x86_pmu.pebs_constraints) {
+   if (pebs_constraints) {
rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
pr_info("CPU#%d: pebs:   %016llx\n", cpu, pebs);
}
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 540dba2..572509f 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3136,10 +3136,11 @@ struct event_constraint *
 x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
  struct perf_event *event)
 {
+	struct event_constraint *event_constraints = hybrid(cpuc->pmu, event_constraints);
struct event_constraint *c;
 
-   if (x86_pmu.event_constraints) {
-   for_each_event_constraint(c, x86_pmu.event_constraints) {
+   if (event_constraints) {
+   for_each_event_constraint(c, event_constraints) {
if (constraint_match(c, event->hw.config)) {
event->hw.flags |= c->flags;
return c;
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 312bf3b..f1402bc 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -959,13 +959,14 @@ struct event_constraint intel_spr_pebs_event_constraints[] = {
 
 struct event_constraint *intel_pebs_constraints(struct perf_event *event)
 {
+	struct event_constraint *pebs_constraints = hybrid(event->pmu, pebs_constraints);
struct event_constraint *c;
 
if (!event->attr.precise_ip)
return NULL;
 
-   if (x86_pmu.pebs_constraints) {
-   for_each_event_constraint(c, x86_pmu.pebs_constraints) {
+   if (pebs_constraints) {
+   for_each_event_constraint(c, pebs_constraints) {
if (constraint_match(c, event->hw.config)) {
event->hw.flags |= c->flags;
return c;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index f464bda..9316fec 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -649,6 +649,8 @@ struct x86_hybrid_pmu {
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX];
+   struct event_constraint *event_constraints;
+   struct event_constraint *pebs_constraints;
 };
 
 static __always_inline bool is_hybrid(void)
-- 
2.7.4



[PATCH V2 06/25] perf/x86: Hybrid PMU support for counters

2021-03-10 Thread kan . liang
From: Kan Liang 

The numbers of GP and fixed counters differ among hybrid PMUs.
Each hybrid PMU should use its own counter-related information.

When handling a certain hybrid PMU, apply the number of counters from
the corresponding hybrid PMU.

When reserving the counters in the initialization of a new event,
reserve all possible counters. Add num_hybrid_pmus to indicate the
number of hybrid PMUs.

The number of counters recorded in the global x86_pmu is for the
architecture counters, which are available for all hybrid PMUs. KVM
doesn't support the hybrid PMU yet. Return the number of the
architecture counters for now.

For the functions only available for the old platforms, e.g.,
intel_pmu_drain_pebs_nhm(), nothing is changed.

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   | 55 ++--
 arch/x86/events/intel/core.c |  8 ---
 arch/x86/events/intel/ds.c   | 14 +++
 arch/x86/events/perf_event.h |  4 
 4 files changed, 56 insertions(+), 25 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 90eb82e..039a851 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -180,16 +180,29 @@ static DEFINE_MUTEX(pmc_reserve_mutex);
 
 #ifdef CONFIG_X86_LOCAL_APIC
 
+static inline int get_possible_num_counters(void)
+{
+   int i, num_counters = x86_pmu.num_counters;
+
+   if (!is_hybrid())
+   return num_counters;
+
+   for (i = 0; i < x86_pmu.num_hybrid_pmus; i++)
+		num_counters = max_t(int, num_counters, x86_pmu.hybrid_pmu[i].num_counters);
+
+   return num_counters;
+}
+
 static bool reserve_pmc_hardware(void)
 {
-   int i;
+   int i, num_counters = get_possible_num_counters();
 
-   for (i = 0; i < x86_pmu.num_counters; i++) {
+   for (i = 0; i < num_counters; i++) {
if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
goto perfctr_fail;
}
 
-   for (i = 0; i < x86_pmu.num_counters; i++) {
+   for (i = 0; i < num_counters; i++) {
if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
goto eventsel_fail;
}
@@ -200,7 +213,7 @@ static bool reserve_pmc_hardware(void)
for (i--; i >= 0; i--)
release_evntsel_nmi(x86_pmu_config_addr(i));
 
-   i = x86_pmu.num_counters;
+   i = num_counters;
 
 perfctr_fail:
for (i--; i >= 0; i--)
@@ -211,9 +224,9 @@ static bool reserve_pmc_hardware(void)
 
 static void release_pmc_hardware(void)
 {
-   int i;
+   int i, num_counters = get_possible_num_counters();
 
-   for (i = 0; i < x86_pmu.num_counters; i++) {
+   for (i = 0; i < num_counters; i++) {
release_perfctr_nmi(x86_pmu_event_addr(i));
release_evntsel_nmi(x86_pmu_config_addr(i));
}
@@ -941,6 +954,7 @@ EXPORT_SYMBOL_GPL(perf_assign_events);
 
 int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 {
+   int num_counters = hybrid(cpuc->pmu, num_counters);
struct event_constraint *c;
struct perf_event *e;
int n0, i, wmin, wmax, unsched = 0;
@@ -1016,7 +1030,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 
/* slow path */
if (i != n) {
-   int gpmax = x86_pmu.num_counters;
+   int gpmax = num_counters;
 
/*
 * Do not allow scheduling of more than half the available
@@ -1037,7 +1051,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 * the extra Merge events needed by large increment events.
 */
if (x86_pmu.flags & PMU_FL_PAIR) {
-   gpmax = x86_pmu.num_counters - cpuc->n_pair;
+   gpmax = num_counters - cpuc->n_pair;
WARN_ON(gpmax <= 0);
}
 
@@ -1124,10 +1138,12 @@ static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event,
  */
 static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
 {
+   int num_counters = hybrid(cpuc->pmu, num_counters);
+   int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
struct perf_event *event;
int n, max_count;
 
-   max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
+   max_count = num_counters + num_counters_fixed;
 
/* current number of events already accepted */
n = cpuc->n_events;
@@ -1495,18 +1511,18 @@ void perf_event_print_debug(void)
 {
u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
u64 pebs, debugctl;
-   struct cpu_hw_events *cpuc;
+   int cpu = smp_processor_id();
+	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+   int num_counters = hybrid(cpuc->pmu, num_counters);
+	int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);

[PATCH V2 07/25] perf/x86: Hybrid PMU support for unconstrained

2021-03-10 Thread kan . liang
From: Kan Liang 

The unconstrained value depends on the number of GP and fixed counters.
Each hybrid PMU should use its own unconstrained.
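
A sketch of how the per-PMU value could be set up at registration time
(field names follow the x86_hybrid_pmu structure introduced earlier in
the series):

	/* all GP counters of this hybrid PMU, no fixed counters */
	hybrid_pmu->unconstrained = (struct event_constraint)
		__EVENT_CONSTRAINT(0, (1ULL << hybrid_pmu->num_counters) - 1,
				   0, hybrid_pmu->num_counters, 0, 0);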

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
---
 arch/x86/events/intel/core.c | 5 -
 arch/x86/events/perf_event.h | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index e187cf5..540dba2 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3147,7 +3147,10 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, 
int idx,
}
}
 
-	return &unconstrained;
+	if (!is_hybrid() || !cpuc->pmu)
+		return &unconstrained;
+
+	return &hybrid_pmu(cpuc->pmu)->unconstrained;
 }
 
 static struct event_constraint *
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index d2f97bd..7bce193 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -639,6 +639,7 @@ struct x86_hybrid_pmu {
int max_pebs_events;
int num_counters;
int num_counters_fixed;
+   struct event_constraint unconstrained;
 };
 
 static __always_inline bool is_hybrid(void)
-- 
2.7.4



[PATCH V2 08/25] perf/x86: Hybrid PMU support for hardware cache event

2021-03-10 Thread kan . liang
From: Kan Liang 

The hardware cache events are different among hybrid PMUs. Each hybrid
PMU should have its own hw cache event table.
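
For instance, hybrid registration code could populate the per-PMU
tables by copying from the model-specific tables; a sketch (the core_*
source table names are illustrative):

	memcpy(hybrid_pmu->hw_cache_event_ids, core_hw_cache_event_ids,
	       sizeof(hybrid_pmu->hw_cache_event_ids));
	memcpy(hybrid_pmu->hw_cache_extra_regs, core_hw_cache_extra_regs,
	       sizeof(hybrid_pmu->hw_cache_extra_regs));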

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   | 11 +--
 arch/x86/events/perf_event.h |  9 +
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 039a851..1db4a67 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -352,6 +352,7 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
 {
struct perf_event_attr *attr = >attr;
unsigned int cache_type, cache_op, cache_result;
+	struct x86_hybrid_pmu *pmu = is_hybrid() ? hybrid_pmu(event->pmu) : NULL;
u64 config, val;
 
config = attr->config;
@@ -371,7 +372,10 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
return -EINVAL;
	cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX);
 
-   val = hw_cache_event_ids[cache_type][cache_op][cache_result];
+   if (pmu)
+		val = pmu->hw_cache_event_ids[cache_type][cache_op][cache_result];
+   else
+   val = hw_cache_event_ids[cache_type][cache_op][cache_result];
 
if (val == 0)
return -ENOENT;
@@ -380,7 +384,10 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
return -EINVAL;
 
hwc->config |= val;
-   attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
+   if (pmu)
+		attr->config1 = pmu->hw_cache_extra_regs[cache_type][cache_op][cache_result];
+   else
+		attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
return x86_pmu_extra_regs(val, event);
 }
 
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 7bce193..f464bda 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -640,6 +640,15 @@ struct x86_hybrid_pmu {
int num_counters;
int num_counters_fixed;
struct event_constraint unconstrained;
+
+   u64 hw_cache_event_ids
+   [PERF_COUNT_HW_CACHE_MAX]
+   [PERF_COUNT_HW_CACHE_OP_MAX]
+   [PERF_COUNT_HW_CACHE_RESULT_MAX];
+   u64 hw_cache_extra_regs
+   [PERF_COUNT_HW_CACHE_MAX]
+   [PERF_COUNT_HW_CACHE_OP_MAX]
+   [PERF_COUNT_HW_CACHE_RESULT_MAX];
 };
 
 static __always_inline bool is_hybrid(void)
-- 
2.7.4



[PATCH V2 05/25] perf/x86: Hybrid PMU support for intel_ctrl

2021-03-10 Thread kan . liang
From: Kan Liang 

The intel_ctrl is the counter mask of a PMU. The PMU counter information
may be different among hybrid PMUs; each hybrid PMU should use its own
intel_ctrl to check and access the counters.

When handling a certain hybrid PMU, apply the intel_ctrl from the
corresponding hybrid PMU.

When checking the HW existence, apply the PMU and number of counters
from the corresponding hybrid PMU as well. Perf will check the HW
existence for each Hybrid PMU before registration. Expose the
check_hw_exists() for a later patch.
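
The hunks below pass the pmu into fixed_counter_disabled(); a sketch of
the updated helper, assuming the hybrid() accessor used elsewhere in
the series:

static inline bool fixed_counter_disabled(int i, struct pmu *pmu)
{
	u64 intel_ctrl = hybrid(pmu, intel_ctrl);

	/* disabled when no fixed-counter bit at or above index i is set */
	return !(intel_ctrl >> (i + INTEL_PMC_IDX_FIXED));
}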

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   | 14 +++---
 arch/x86/events/intel/core.c | 14 +-
 arch/x86/events/perf_event.h | 10 --
 3 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 316efb1..90eb82e 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -226,7 +226,7 @@ static void release_pmc_hardware(void) {}
 
 #endif
 
-static bool check_hw_exists(void)
+bool check_hw_exists(struct pmu *pmu, int num_counters, int num_counters_fixed)
 {
u64 val, val_fail = -1, val_new= ~0;
int i, reg, reg_fail = -1, ret = 0;
@@ -237,7 +237,7 @@ static bool check_hw_exists(void)
 * Check to see if the BIOS enabled any of the counters, if so
 * complain and bail.
 */
-   for (i = 0; i < x86_pmu.num_counters; i++) {
+   for (i = 0; i < num_counters; i++) {
reg = x86_pmu_config_addr(i);
ret = rdmsrl_safe(reg, );
if (ret)
@@ -251,13 +251,13 @@ static bool check_hw_exists(void)
}
}
 
-   if (x86_pmu.num_counters_fixed) {
+   if (num_counters_fixed) {
reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
ret = rdmsrl_safe(reg, );
if (ret)
goto msr_fail;
-   for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
-   if (fixed_counter_disabled(i))
+   for (i = 0; i < num_counters_fixed; i++) {
+   if (fixed_counter_disabled(i, pmu))
continue;
if (val & (0x03 << i*4)) {
bios_fail = 1;
@@ -1543,7 +1543,7 @@ void perf_event_print_debug(void)
cpu, idx, prev_left);
}
for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
-   if (fixed_counter_disabled(idx))
+   if (fixed_counter_disabled(idx, cpuc->pmu))
continue;
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
 
@@ -1995,7 +1995,7 @@ static int __init init_hw_perf_events(void)
pmu_check_apic();
 
/* sanity check that the hardware exists or is emulated */
-   if (!check_hw_exists())
+	if (!check_hw_exists(&pmu, x86_pmu.num_counters, x86_pmu.num_counters_fixed))
return 0;
 
pr_cont("%s PMU driver.\n", x86_pmu.name);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index e466a49..1f1c003 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2153,10 +2153,11 @@ static void intel_pmu_disable_all(void)
 static void __intel_pmu_enable_all(int added, bool pmi)
 {
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+   u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
 
intel_pmu_lbr_enable_all(pmi);
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
-   x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
+  intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
 
if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
struct perf_event *event =
@@ -2709,6 +2710,7 @@ int intel_pmu_save_and_restart(struct perf_event *event)
 static void intel_pmu_reset(void)
 {
struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
unsigned long flags;
int idx;
 
@@ -2724,7 +2726,7 @@ static void intel_pmu_reset(void)
wrmsrl_safe(x86_pmu_event_addr(idx),  0ull);
}
for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
-   if (fixed_counter_disabled(idx))
+   if (fixed_counter_disabled(idx, cpuc->pmu))
continue;
wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
}
@@ -2753,6 +2755,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int bit;
int handled = 0;
+   u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
 
inc_irq_stat(apic_perf_irqs);
 
@@ -2798,7 +2801,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
 
handled++;
x86_pmu.drain_pebs(regs, );
-   status &= 

[PATCH V2 03/25] perf/x86: Track pmu in per-CPU cpu_hw_events

2021-03-10 Thread kan . liang
From: Kan Liang 

Some platforms, e.g. Alder Lake, have hybrid architecture. In the same
package, there may be more than one type of CPU. The PMU capabilities
are different among different types of CPU. Perf will register a
dedicated PMU for each type of CPU.

Add a 'pmu' variable in the struct cpu_hw_events to track the dedicated
PMU of the current CPU.

Current x86_get_pmu() use the global 'pmu', which will be broken on a
hybrid platform. Modify it to apply the 'pmu' of the specific CPU.

Initialize the per-CPU 'pmu' variable with the global 'pmu'. There is
nothing changed for the non-hybrid platforms.

The is_x86_event() will be updated in the later patch ("perf/x86:
Register hybrid PMUs") for hybrid platforms. For the non-hybrid
platforms, nothing is changed here.

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   | 17 +
 arch/x86/events/intel/core.c |  2 +-
 arch/x86/events/intel/ds.c   |  4 ++--
 arch/x86/events/intel/lbr.c  |  9 +
 arch/x86/events/perf_event.h |  4 +++-
 5 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 6ddeed3..4873e40 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -45,9 +45,11 @@
 #include "perf_event.h"
 
 struct x86_pmu x86_pmu __read_mostly;
+static struct pmu pmu;
 
 DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
.enabled = 1,
	.pmu = &pmu,
 };
 
 DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key);
@@ -720,16 +722,23 @@ void x86_pmu_enable_all(int added)
}
 }
 
-static struct pmu pmu;
-
 static inline int is_x86_event(struct perf_event *event)
 {
	return event->pmu == &pmu;
 }
 
-struct pmu *x86_get_pmu(void)
+struct pmu *x86_get_pmu(unsigned int cpu)
 {
-	return &pmu;
+	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+   /*
+* All CPUs of the hybrid type have been offline.
+* The x86_get_pmu() should not be invoked.
+*/
+   if (WARN_ON_ONCE(!cpuc->pmu))
+		return &pmu;
+
+   return cpuc->pmu;
 }
 /*
  * Event scheduler state:
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 5bac48d..6a4d6b2 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4873,7 +4873,7 @@ static void update_tfa_sched(void *ignored)
 * and if so force schedule out for all event types all contexts
 */
if (test_bit(3, cpuc->active_mask))
-   perf_pmu_resched(x86_get_pmu());
+   perf_pmu_resched(x86_get_pmu(smp_processor_id()));
 }
 
 static ssize_t show_sysctl_tfa(struct device *cdev,
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 7ebae18..1bfea8c 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -2192,7 +2192,7 @@ void __init intel_ds_init(void)
PERF_SAMPLE_TIME;
x86_pmu.flags |= PMU_FL_PEBS_ALL;
pebs_qual = "-baseline";
-			x86_get_pmu()->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
+			x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
} else {
/* Only basic record supported */
x86_pmu.large_pebs_flags &=
@@ -2207,7 +2207,7 @@ void __init intel_ds_init(void)
 
if (x86_pmu.intel_cap.pebs_output_pt_available) {
pr_cont("PEBS-via-PT, ");
-			x86_get_pmu()->capabilities |= PERF_PMU_CAP_AUX_OUTPUT;
+			x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_AUX_OUTPUT;
}
 
break;
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 21890da..bb4486c 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -705,7 +705,7 @@ void intel_pmu_lbr_add(struct perf_event *event)
 
 void release_lbr_buffers(void)
 {
-   struct kmem_cache *kmem_cache = x86_get_pmu()->task_ctx_cache;
+   struct kmem_cache *kmem_cache;
struct cpu_hw_events *cpuc;
int cpu;
 
@@ -714,6 +714,7 @@ void release_lbr_buffers(void)
 
for_each_possible_cpu(cpu) {
		cpuc = per_cpu_ptr(&cpu_hw_events, cpu);
+   kmem_cache = x86_get_pmu(cpu)->task_ctx_cache;
if (kmem_cache && cpuc->lbr_xsave) {
kmem_cache_free(kmem_cache, cpuc->lbr_xsave);
cpuc->lbr_xsave = NULL;
@@ -1609,7 +1610,7 @@ void intel_pmu_lbr_init_hsw(void)
x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
 
-   x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0);
+   x86_get_pmu(smp_processor_id())->task_ctx_cache = 

[PATCH V2 04/25] perf/x86/intel: Hybrid PMU support for perf capabilities

2021-03-10 Thread kan . liang
From: Kan Liang 

Some platforms, e.g. Alder Lake, have hybrid architecture. Although most
PMU capabilities are the same, there are still some unique PMU
capabilities for different hybrid PMUs. Perf should register a dedicated
pmu for each hybrid PMU.

Add a new struct x86_hybrid_pmu, which saves the dedicated pmu and
capabilities for each hybrid PMU.

The architecture MSR, MSR_IA32_PERF_CAPABILITIES, only indicates the
architecture features which are available on all hybrid PMUs. The
architecture features are stored in the global x86_pmu.intel_cap.

For Alder Lake, the model-specific features are perf metrics and
PEBS-via-PT. The corresponding bits of the global x86_pmu.intel_cap
should be 0 for these two features. Perf should not use the global
intel_cap to check the features on a hybrid system.
Add a dedicated intel_cap in the x86_hybrid_pmu to store the
model-specific capabilities. Use the dedicated intel_cap to replace
the global intel_cap for thse two features. The dedicated intel_cap
will be set in the following "Add Alder Lake Hybrid support" patch.

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   |  6 --
 arch/x86/events/intel/core.c | 27 ++-
 arch/x86/events/intel/ds.c   |  2 +-
 arch/x86/events/perf_event.h | 35 +++
 arch/x86/include/asm/msr-index.h |  3 +++
 5 files changed, 65 insertions(+), 8 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 4873e40..316efb1 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1101,8 +1101,9 @@ static void del_nr_metric_event(struct cpu_hw_events *cpuc,
 static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event,
 int max_count, int n)
 {
+   union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap);
 
-   if (x86_pmu.intel_cap.perf_metrics && add_nr_metric_event(cpuc, event))
+   if (intel_cap.perf_metrics && add_nr_metric_event(cpuc, event))
return -EINVAL;
 
if (n >= max_count + cpuc->n_metric)
@@ -1578,6 +1579,7 @@ void x86_pmu_stop(struct perf_event *event, int flags)
 static void x86_pmu_del(struct perf_event *event, int flags)
 {
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+   union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap);
int i;
 
/*
@@ -1617,7 +1619,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
}
cpuc->event_constraint[i-1] = NULL;
--cpuc->n_events;
-   if (x86_pmu.intel_cap.perf_metrics)
+   if (intel_cap.perf_metrics)
del_nr_metric_event(cpuc, event);
 
perf_event_update_userpage(event);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 6a4d6b2..e466a49 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3646,6 +3646,15 @@ static inline bool is_mem_loads_aux_event(struct perf_event *event)
	return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0x03, .umask=0x82);
 }
 
+static inline bool intel_pmu_has_cap(struct perf_event *event, int idx)
+{
+   union perf_capabilities *intel_cap;
+
+	intel_cap = is_hybrid() ? &hybrid_pmu(event->pmu)->intel_cap :
+				  &x86_pmu.intel_cap;
+
+	return test_bit(idx, (unsigned long *)&intel_cap->capabilities);
+}
 
 static int intel_pmu_hw_config(struct perf_event *event)
 {
@@ -3709,7 +3718,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
 * with a slots event as group leader. When the slots event
 * is used in a metrics group, it too cannot support sampling.
 */
-   if (x86_pmu.intel_cap.perf_metrics && is_topdown_event(event)) {
+	if (intel_pmu_has_cap(event, PERF_CAP_METRICS_IDX) && is_topdown_event(event)) {
if (event->attr.config1 || event->attr.config2)
return -EINVAL;
 
@@ -4216,8 +4225,16 @@ static void intel_pmu_cpu_starting(int cpu)
if (x86_pmu.version > 1)
		flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
 
-   /* Disable perf metrics if any added CPU doesn't support it. */
-   if (x86_pmu.intel_cap.perf_metrics) {
+   /*
+* Disable perf metrics if any added CPU doesn't support it.
+*
+* Turn off the check for a hybrid architecture, because the
+* architecture MSR, MSR_IA32_PERF_CAPABILITIES, only indicate
+* the architecture features. The perf metrics is a model-specific
+* feature for now. The corresponding bit should always be 0 on
+* a hybrid platform, e.g., Alder Lake.
+*/
+   if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics) {
union perf_capabilities perf_cap;
 
rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities);
@@ -4327,7 +4344,7 @@ static int intel_pmu_check_period(struct 

[PATCH V2 1/25] x86/cpufeatures: Enumerate Intel Hybrid Technology feature bit

2021-03-10 Thread kan . liang
From: Ricardo Neri 

Add feature enumeration to identify a processor with Intel Hybrid
Technology: one in which CPUs of more than one type are in the same package.
On a hybrid processor, all CPUs support the same homogeneous (i.e.,
symmetric) instruction set. All CPUs enumerate the same features in CPUID.
Thus, software (user space and kernel) can run and migrate to any CPU in
the system as well as utilize any of the enumerated features without any
change or special provisions. The main differences among CPUs in a
hybrid processor are power and performance properties.
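
A minimal sketch of how a kernel subsystem can then gate
hybrid-specific code on the new bit:

	/* skip hybrid-specific setup on non-hybrid parts */
	if (!boot_cpu_has(X86_FEATURE_HYBRID_CPU))
		return;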

Cc: Andi Kleen 
Cc: Kan Liang 
Cc: "Peter Zijlstra (Intel)" 
Cc: "Rafael J. Wysocki" 
Cc: "Ravi V. Shankar" 
Cc: Srinivas Pandruvada 
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Len Brown 
Reviewed-by: Tony Luck 
Signed-off-by: Ricardo Neri 
---
Changes since v1 (as part of patchset for perf change for Alderlake)
 * None

Changes since v1 (in a separate posting):
 * Reworded commit message to clearly state what is Intel Hybrid
   Technology. Stress that all CPUs can run the same instruction
   set and support the same features.
---
 arch/x86/include/asm/cpufeatures.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index cc96e26d69f7..e7cfc9eedf8d 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -374,6 +374,7 @@
 #define X86_FEATURE_MD_CLEAR   (18*32+10) /* VERW clears CPU buffers */
 #define X86_FEATURE_TSX_FORCE_ABORT(18*32+13) /* "" TSX_FORCE_ABORT */
 #define X86_FEATURE_SERIALIZE  (18*32+14) /* SERIALIZE instruction */
+#define X86_FEATURE_HYBRID_CPU		(18*32+15) /* This part has CPUs of more than one type */
 #define X86_FEATURE_TSXLDTRK		(18*32+16) /* TSX Suspend Load Address Tracking */
 #define X86_FEATURE_PCONFIG(18*32+18) /* Intel PCONFIG */
 #define X86_FEATURE_ARCH_LBR   (18*32+19) /* Intel ARCH LBR */
-- 
2.17.1



[PATCH V2 2/25] x86/cpu: Add helper functions to get parameters of hybrid CPUs

2021-03-10 Thread kan . liang
From: Ricardo Neri 

On processors with Intel Hybrid Technology (i.e., one having more than one
type of CPU in the same package), all CPUs support the same instruction
set and enumerate the same features on CPUID. Thus, all software can run
on any CPU without restrictions. However, there may be model-specific
differences among types of CPUs. For instance, each type of CPU may support
a different number of performance counters. Also, machine check error banks
may be wired differently. Even though most software will not care about
these differences, kernel subsystems dealing with these differences must
know. Add a new member to cpuinfo_x86 that subsystems can query to know
the type of CPU.

Hybrid processors also have a native model ID to uniquely identify the
micro-architecture of each CPU. Please note that the native model ID is
not related to the existing x86_model_id read from CPUID leaf 0x1.

Add helper functions and definitions to get both the CPU type and the
combined CPU type and native model ID. Only expose get_hybrid_cpu_type().
The sole known use case (i.e., perf for hybrid processors) only wants to
know the type but not the model ID. get_hybrid_params() can be exposed in
the future if new use cases arise.

The Intel Software Developer's Manual defines the CPU type and the CPU
native model ID as 8-bit and 24-bit identifiers, respectively.

Cc: Andi Kleen 
Cc: Andy Lutomirski 
Cc: Dave Hansen 
Cc: Kan Liang 
Cc: "Peter Zijlstra (Intel)" 
Cc: "Rafael J. Wysocki" 
Cc: "Ravi V. Shankar" 
Cc: Srinivas Pandruvada 
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Len Brown 
Reviewed-by: Tony Luck 
Signed-off-by: Ricardo Neri 
---
Changes since v1 (as part of patchset for perf change for Alderlake)
 * Removed cpuinfo_x86.x86_cpu_type. It can be added later if needed.
   Instead, implement helper functions that subsystems can use.(Boris)
 * Add definitions for Atom and Core CPU types. (Kan)

Changes since v1 (in a separate posting)
 * Simplify code by using cpuid_eax(). (Boris)
 * Reworded the commit message to clarify the concept of Intel Hybrid
   Technology. Stress that all CPUs can run the same instruction set
   and support the same features.
---
 arch/x86/include/asm/cpu.h  |  6 
 arch/x86/include/asm/intel-family.h |  4 +++
 arch/x86/include/asm/processor.h|  3 ++
 arch/x86/kernel/cpu/intel.c | 44 +
 4 files changed, 57 insertions(+)

diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index da78ccbd493b..845c478dfcd4 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -45,6 +45,7 @@ extern void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c);
 extern void switch_to_sld(unsigned long tifn);
 extern bool handle_user_split_lock(struct pt_regs *regs, long error_code);
 extern bool handle_guest_split_lock(unsigned long ip);
+u8 get_hybrid_cpu_type(int cpu);
 #else
 static inline void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) {}
 static inline void switch_to_sld(unsigned long tifn) {}
@@ -57,6 +58,11 @@ static inline bool handle_guest_split_lock(unsigned long ip)
 {
return false;
 }
+
+static inline u8 get_hybrid_cpu_type(int cpu)
+{
+   return 0;
+}
 #endif
 #ifdef CONFIG_IA32_FEAT_CTL
 void init_ia32_feat_ctl(struct cpuinfo_x86 *c);
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 9abe842dbd84..b5fb475bdf10 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -134,4 +134,8 @@
 /* Family 5 */
 #define INTEL_FAM5_QUARK_X1000 0x09 /* Quark X1000 SoC */
 
+/* Types of CPUs in hybrid parts. */
+#define INTEL_HYBRID_TYPE_ATOM 0x20
+#define INTEL_HYBRID_TYPE_CORE 0x40
+
 #endif /* _ASM_X86_INTEL_FAMILY_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index dc6d149bf851..d43d7a11afba 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -166,6 +166,9 @@ enum cpuid_regs_idx {
 
 #define X86_VENDOR_UNKNOWN 0xff
 
+#define X86_HYBRID_CPU_TYPE_ID_SHIFT   24
+#define X86_HYBRID_CPU_NATIVE_MODEL_ID_MASK0xff
+
 /*
  * capabilities of CPUs
  */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 0e422a544835..aa0dac287c1f 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1195,3 +1195,47 @@ void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c)
cpu_model_supports_sld = true;
split_lock_setup();
 }
+
+static void read_hybrid_params(void *data)
+{
+   u32 *eax = data;
+
+   *eax = cpuid_eax(0x001a);
+}
+
+/**
+ * get_hybrid_params() - Get type and model ID of a hybrid CPU
+ * @cpu:   CPU of which hybrid parameters are queried.
+ *
+ * Returns the combined CPU type [31:24] and the native model ID [23:0] of a
+ * CPU in a hybrid processor. It must be called with preemption disabled.
+ * If @cpu is invalid or the processor 

[PATCH V2 00/25] Add Alder Lake support for perf (kernel)

2021-03-10 Thread kan . liang
From: Kan Liang 

Changes since V1:
- Drop all user space patches, which will be reviewed later separately.
- Don't save the CPU type in struct cpuinfo_x86. Instead, provide helper
  functions to get parameters of hybrid CPUs. (Boris)
- Rework the perf kernel patches according to Peter's suggestion. The
  key changes include,
  - Code style changes. Drop all the macro which names in capital
letters.
  - Drop the hybrid PMU index, track the pointer of the hybrid PMU in
the per-CPU struct cpu_hw_events.
  - Fix the x86_get_pmu() support
  - Fix the allocate_fake_cpuc() support
  - Fix validate_group() support
  - Dynamically allocate the *hybrid_pmu for each hybrid PMU

Alder Lake uses a hybrid architecture utilizing Golden Cove cores
and Gracemont cores. On such architectures, all CPUs support the same,
homogeneous and symmetric, instruction set. Also, CPUID enumerates
the same features for all CPUs. There may be model-specific differences,
such as those addressed in this patchset.

The first two patches enumerate the hybrid CPU feature bit and provide
helper functions to get parameters of hybrid CPUs. (The initial idea [1]
was to save the CPU type in a new field x86_cpu_type in struct cpuinfo_x86.
Since the only user of the new field is perf, querying the
X86_FEATURE_HYBRID_CPU at the call site is a simpler alternative.[2])
Compared with the initial submission, the below two concerns[3][4] are
also addressed,
- Provide a good use case, PMU.
- Clarify what Intel Hybrid Technology is and is not.

The PMU capabilities for Golden Cove core and Gracemont core are not the
same. The key differences include the number of counters, events, perf
metrics feature, and PEBS-via-PT feature. A dedicated hybrid PMU has to
be registered for each of them. However, the current perf X86 assumes
that there is only one CPU PMU. To handle the hybrid PMUs, the patchset
- Introduce a new struct x86_hybrid_pmu to save the unique capabilities
  from different PMUs. It's part of the global x86_pmu. The architecture
  capabilities, which are available for all PMUs, are still saved in
  the global x86_pmu. To save the space, the x86_hybrid_pmu is
  dynamically allocated.
- The hybrid PMU registration has been moved to cpu_starting(),
  because only the boot CPU is available when init_hw_perf_events()
  is invoked.
- Hybrid PMUs have different events and formats. Add new structures and
  helpers for events attribute and format attribute which take the PMU
  type into account.
- Add a PMU aware version PERF_TYPE_HARDWARE_PMU and
  PERF_TYPE_HW_CACHE_PMU to facilitate user space tools

The uncore, MSR and cstate are the same between hybrid CPUs.
Don't need to register hybrid PMUs for them.

The generic code kernel/events/core.c is not hybrid friendly either,
especially for the per-task monitoring. Peter once proposed a
patchset[5], but it hasn't been merged. This patchset doesn't intend to
improve the generic code (which can be improved later separately). It
still uses the capability PERF_PMU_CAP_HETEROGENEOUS_CPUS for each
hybrid PMUs. For per-task and system-wide monitoring, user space tools
have to create events on all available hybrid PMUs. The events which are
from different hybrid PMUs cannot be included in the same group.
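
A rough user-space sketch of what that means for tools: open one event
per hybrid PMU, using each PMU's dynamic type id from sysfs, and never
group events across PMUs (the cpu_core/cpu_atom names and the raw
config value are assumptions; the PMU names are assigned later in the
series):

#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int pmu_type(const char *path)
{
	FILE *f = fopen(path, "r");
	int type = -1;

	if (f) {
		if (fscanf(f, "%d", &type) != 1)
			type = -1;
		fclose(f);
	}
	return type;
}

int main(void)
{
	const char *paths[] = {
		"/sys/bus/event_source/devices/cpu_core/type",
		"/sys/bus/event_source/devices/cpu_atom/type",
	};
	struct perf_event_attr attr;
	int i, fd;

	for (i = 0; i < 2; i++) {
		int type = pmu_type(paths[i]);

		if (type < 0)
			continue;	/* PMU not present on this system */

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = type;	/* dynamic hybrid PMU type id */
		attr.config = 0x3c;	/* raw event encoding, illustrative */

		/* group fd is -1: events from different hybrid PMUs
		 * cannot share a group */
		fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
		if (fd >= 0) {
			printf("opened event on %s\n", paths[i]);
			close(fd);
		}
	}
	return 0;
}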

[1]. 
https://lore.kernel.org/lkml/20201002201931.2826-1-ricardo.neri-calde...@linux.intel.com/
[2]. https://lore.kernel.org/lkml/20210208175640.gd18...@zn.tnic/
[3]. https://lore.kernel.org/lkml/20201002203452.ge17...@zn.tnic/
[4]. https://lore.kernel.org/lkml/87r1qgccku@nanos.tec.linutronix.de/
[5]. 
https://lkml.kernel.org/r/20181010104559.go5...@hirez.programming.kicks-ass.net/

Kan Liang (22):
  perf/x86: Track pmu in per-CPU cpu_hw_events
  perf/x86/intel: Hybrid PMU support for perf capabilities
  perf/x86: Hybrid PMU support for intel_ctrl
  perf/x86: Hybrid PMU support for counters
  perf/x86: Hybrid PMU support for unconstrained
  perf/x86: Hybrid PMU support for hardware cache event
  perf/x86: Hybrid PMU support for event constraints
  perf/x86: Hybrid PMU support for extra_regs
  perf/x86/intel: Factor out intel_pmu_check_num_counters
  perf/x86/intel: Factor out intel_pmu_check_event_constraints
  perf/x86/intel: Factor out intel_pmu_check_extra_regs
  perf/x86: Remove temporary pmu assignment in event_init
  perf/x86: Factor out x86_pmu_show_pmu_cap
  perf/x86: Register hybrid PMUs
  perf/x86: Add structures for the attributes of Hybrid PMUs
  perf/x86/intel: Add attr_update for Hybrid PMUs
  perf/x86: Support filter_match callback
  perf/x86/intel: Add Alder Lake Hybrid support
  perf: Introduce PERF_TYPE_HARDWARE_PMU and PERF_TYPE_HW_CACHE_PMU
  perf/x86/intel/uncore: Add Alder Lake support
  perf/x86/msr: Add Alder Lake CPU support
  perf/x86/cstate: Add Alder Lake CPU support

Ricardo Neri (2):
  x86/cpufeatures: Enumerate Intel Hybrid Technology feature bit
  x86/cpu: Add helper functions to get parameters of hybrid CPUs

Zhang Rui (1):
  perf/x86/rapl: Add support for Intel 

Re: [RFC v2 3/5] arm64: socfpga: rename ARCH_STRATIX10 to ARCH_SOCFPGA64

2021-03-10 Thread Arnd Bergmann
On Wed, Mar 10, 2021 at 4:54 PM Krzysztof Kozlowski
 wrote:
> On 10/03/2021 16:47, Krzysztof Kozlowski wrote:
> > This edac Altera driver is very weird... it uses the same compatible
> > differently depending on whether this is 32-bit or 64-bit (e.g. Stratix
> > 10)! On ARMv7 the compatible means for example one IRQ... On ARMv8, we
> > have two. It's quite new code (2019, from Intel), not some ancient
> > legacy, so it should never have been accepted...
>
> Oh, it's not as horrible as it sounds. They actually have different
> compatibles for the edac driver with these differences (e.g. in interrupts).
> They just do not use them and instead check for the basic (common?)
> compatible and architecture... Anyway without testing I am not the
> person to fix the edac driver.

Ok, this should be fixed properly as you describe, but as a quick hack
it wouldn't be hard to just change the #ifdef to check for CONFIG_64BIT
instead of CONFIG_ARCH_STRATIX10 during the rename of the config
symbol.
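
Roughly, as a sketch only (the real guard sites in altera_edac.c may be
more numerous than this):

	/* was: #ifdef CONFIG_ARCH_STRATIX10 */
	#ifdef CONFIG_64BIT
		/* 64-bit parts (e.g. Stratix 10): two IRQs per instance */
	#else
		/* 32-bit parts: a single IRQ */
	#endif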

   Arnd


Re: [PATCH v5 3/5] riscv: Separate memory init from paging init

2021-03-10 Thread Geert Uytterhoeven
Hi Atish,

On Thu, Nov 19, 2020 at 1:40 AM Atish Patra  wrote:
> Currently, we perform some memory init functions in paging init. But,
> that will be an issue for NUMA support where DT needs to be flattened
> before numa initialization and memblock_present can only be called
> after numa initialization.
>
> Move memory initialization related functions to a separate function.
>
> Signed-off-by: Atish Patra 
> Reviewed-by: Greentime Hu 
> Reviewed-by: Anup Patel 
> Reviewed-by: Palmer Dabbelt 

This is now commit cbd34f4bb37d62d8 in v5.12-rc1, breaking the boot on
Vexriscv:

[0.00] earlycon: sbi0 at I/O port 0x0 (options '')
[0.00] printk: bootconsole [sbi0] enabled
[0.00] printk: debug: ignoring loglevel setting.
[0.00] Initial ramdisk at: 0x(ptrval) (8388608 bytes)
[0.00] Unable to handle kernel paging request at virtual
address c808
[0.00] Oops [#1]
[0.00] CPU: 0 PID: 0 Comm: swapper Not tainted
5.11.0-orangecrab-00023-g7c4fc8e3e982 #129
[0.00] epc: c04d6624 ra : c04d6524 sp : c05ddf70
[0.00]  gp : c0678bc0 tp : c05e5b40 t0 : c800
[0.00]  t1 : 0003 t2 :  s0 : c05ddfc0
[0.00]  s1 : c800 a0 :  a1 : c7e0
[0.00]  a2 : 0005 a3 : 0001 a4 : 000c
[0.00]  a5 :  a6 : c04fe000 a7 : 000c
[0.00]  s2 : c04fe098 s3 : 00a0 s4 : c760
[0.00]  s5 : c04fe0dc s6 : 8200 s7 : c059f19c
[0.00]  s8 : 81000200 s9 : c059f1b8 s10: 8200
[0.00]  s11: c059f19c t3 : 405dba80 t4 : c05e6f08
[0.00]  t5 : 81000200 t6 : 40501000
[0.00] status: 0100 badaddr: c808 cause: 000f
[0.00] random: get_random_bytes called from
print_oops_end_marker+0x38/0x7c with crng_init=0
[0.00] ---[ end trace  ]---
[0.00] Kernel panic - not syncing: Attempted to kill the idle task!

Note that I have "[PATCH v2 3/4] RISC-V: Fix L1_CACHE_BYTES for RV32"[1]
applied, to avoid another crash (7c4fc8e3e982 = v5.11 + [1] +
cherry-picked commits from the riscv-for-linus-5.12-mw0 pull request).

If I revert the L1_CACHE_BYTES change, the boot continues, but I'm back
to the old issue fixed by [1]:

[   22.126687] Freeing initrd memory: 8192K
[   22.321811] workingset: timestamp_bits=30 max_order=15 bucket_order=0
[   29.001509] Block layer SCSI generic (bsg) driver version 0.4
loaded (major 253)
[   29.021555] io scheduler mq-deadline registered
[   29.033692] io scheduler kyber registered
[   29.141294] Unable to handle kernel paging request at virtual
address 69726573
[   29.158523] Oops [#1]
[   29.162232] CPU: 0 PID: 1 Comm: swapper Not tainted
5.11.0-orangecrab-00023-g7c4fc8e3e982-dirty #132
[   29.171970] epc: c000d3b0 ra : c000eb74 sp : c182dca0
[   29.178786]  gp : c067aee0 tp : c183 t0 : c18d75e0
[   29.185935]  t1 : 0003 t2 :  s0 : c182dcb0
[   29.193028]  s1 :  a0 : c05eab14 a1 : c18d75c0
[   29.200067]  a2 : c7ffe384 a3 : 69726573 a4 : f00b
[   29.207095]  a5 : f000 a6 : c7f8 a7 : 
[   29.214141]  s2 : 01001f00 s3 : c05eb000 s4 : c067c000
[   29.221171]  s5 : c000ec0c s6 : 8000 s7 : c05eaad4
[   29.228200]  s8 : c05eab58 s9 : c05a1000 s10: c18d75c0
[   29.235238]  s11: c05eab14 t3 : 20b9a6cc t4 : 0001
[   29.242277]  t5 :  t6 : c188cd50
[   29.247588] status: 0120 badaddr: 69726573 cause: 000d
[   29.274424] ---[ end trace 69dee1b9ca96f1d6 ]---
[   29.282859] note: swapper[1] exited with preempt_count 1
[   29.293156] Kernel panic - not syncing: Attempted to kill init!
exitcode=0x000b

[1] 
https://lore.kernel.org/linux-riscv/20210111234504.3782179-4-atish.pa...@wdc.com/

Will have a deeper look later...

Thanks for any suggestions!

Gr{oetje,eeting}s,

Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds


Re: [PATCH V2] ASoC: soc-core: Prevent warning if no DMI table is present

2021-03-10 Thread Pierre-Louis Bossart




On 3/10/21 10:37 AM, Takashi Iwai wrote:
> On Wed, 10 Mar 2021 17:18:14 +0100,
> Mark Brown wrote:
> >
> > On Wed, Mar 10, 2021 at 09:44:07AM -0600, Pierre-Louis Bossart wrote:
> > > On 3/10/21 7:35 AM, Mark Brown wrote:
> >
> > > > Just change it to a system level check for ACPI, checking for OF would
> > > > leave problems for board files or any other alternative firmware
> > > > interfaces.
> >
> > > did you mean if (!IS_ENABLED(CONFIG_ACPI)) ?
> >
> > Is there a runtime check?
>
> Well, basically both DMI and ACPI are completely different things, so
> I don't think it's right to check the availability of ACPI as a signal
> of the availability of DMI.

would this work?

	if (!IS_ENABLED(CONFIG_DMI))
		return 0;
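
For reference, a runtime alternative could be the dmi_available flag set
by the DMI core during early boot; a minimal sketch, assuming the check
sits inside the existing CONFIG_DMI section of soc-core (the symbol may
need exporting for module builds):

	#include <linux/dmi.h>

	/* bail out early when the firmware provides no DMI table at all */
	if (!dmi_available)
		return 0;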


Re: [PATCH v10 1/2] scsi: ufs: Enable power management for wlun

2021-03-10 Thread Asutosh Das (asd)

On 3/10/2021 8:27 AM, Alan Stern wrote:

On Tue, Mar 09, 2021 at 08:04:53PM -0800, Asutosh Das (asd) wrote:

On 3/9/2021 7:14 PM, Alan Stern wrote:

On Tue, Mar 09, 2021 at 07:04:34PM -0800, Asutosh Das (asd) wrote:

Hello
I & Can (thanks CanG) debugged this further:

Looks like this issue can occur if the sd probe is asynchronous.

Essentially, the sd_probe() is done asynchronously and driver_probe_device()
invokes pm_runtime_get_suppliers() before invoking sd_probe().

But scsi_probe_and_add_lun() runs in a separate context.
So the scsi_autopm_put_device() invoked from scsi_scan_host() context
reduces the link->rpm_active to 1. And sd_probe() invokes
scsi_autopm_put_device() and starts a timer. And then driver_probe_device()
invoked from __device_attach_async_helper context reduces the
link->rpm_active to 1 thus enabling the supplier to suspend before the
consumer suspends.



I don't see a way around this. Please let me know if you
(@Alan/@Bart/@Adrian) have any thoughts on this.


How about changing the SCSI core so that it does a runtime_get before
starting an async probe, and the async probe routine does a
runtime_put when it is finished?  In other words, don't allow a device
to go into runtime suspend while it is waiting to be probed.

I don't think that would be too intrusive.

Alan Stern



Hi Alan
Thanks for the suggestion.

Am trying to understand:

Do you mean something like this:

int scsi_sysfs_add_sdev(struct scsi_device *sdev)
{

scsi_autopm_get_device(sdev);
pm_runtime_get_noresume(&sdev->sdev_gendev);
[...]
scsi_autopm_put_device(sdev);
[...]
}

static int sd_probe(struct device *dev)
{
[...]
pm_runtime_put_noidle(dev);
scsi_autopm_put_device(sdp);
[...]
}

This may work (I'm limited by my imagination in scsi layer :) ).


I'm not sure about this.  To be honest, I did not read the entirety of
your last message; it had way too much detail.  There's a time and place
for that, but when you're brainstorming to figure out the underlying
cause of a problem and come up with a strategy to fix it, you want to
concentrate on the overall picture, not the details.

As I understand the situation, you've got a SCSI target with multiple
logical units, let's say A and B, and you need to make sure that A never
goes into runtime suspend unless B is already suspended.  In other
words, B always has to suspend before A and resume after A.

To do this, you register a device link with A as the supplier and B as
the consumer.  Then the PM core takes care of the ordering for you.

But I don't understand when you set up the device link.  If the timing
is wrong then, thanks to async SCSI probing, you may have a situation
where A is registered before B and before the link is set up.  Then
there's temporarily nothing to stop A from suspending before B.

You also need to prevent each device from suspending before it is
probed.  That's the easy part I was trying to address before (although
it may not be so easy if the drivers are in loadable modules and not
present in the kernel).

You need to think through these issues before proposing actual changes.


But the pm_runtime_put_noidle() would have to be added to all registered
scsi_driver{}, perhaps? Or may be I can check for sdp->type?


Like this; it's too early to worry about this sort of thing.

Alan Stern


Hi Alan
Thanks. Understood.

I will check the details and see if I can come up with something.
I'll propose an alternate fix otherwise and drop this change altogether.

Thanks!
-asd

--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
Linux Foundation Collaborative Project


Re: [PATCH 4/9] drm: remove the drm file system

2021-03-10 Thread Al Viro
On Tue, Mar 09, 2021 at 04:53:43PM +0100, Christoph Hellwig wrote:
> Just use the generic anon_inode file system.

Are you changing the lifetime rules for that module?


Re: [PATCH v3 06/15] mfd: Add ROHM BD71815 ID

2021-03-10 Thread Lee Jones
On Wed, 10 Mar 2021, Matti Vaittinen wrote:

> 
> On Wed, 2021-03-10 at 13:31 +, Lee Jones wrote:
> > On Wed, 10 Mar 2021, Matti Vaittinen wrote:
> > 
> > > On Wed, 2021-03-10 at 11:17 +, Lee Jones wrote:
> > > > On Wed, 10 Mar 2021, Vaittinen, Matti wrote:
> > > > 
> > > > > Hello Lee,
> > > > > 
> > > > > On Wed, 2021-03-10 at 10:36 +, Lee Jones wrote:
> > > > > > On Mon, 08 Mar 2021, Matti Vaittinen wrote:
> > > > > > 
> > > > > > > Add chip ID for ROHM BD71815 and PMIC so that drivers can
> > > > > > > identify
> > > > > > > this IC.
> > > > > > > 
> > > > > > > Signed-off-by: Matti Vaittinen <
> > > > > > > matti.vaitti...@fi.rohmeurope.com>
> > > > > > > Acked-for-MFD-by: Lee Jones 
> > > > > > > ---
> > > > > > >  include/linux/mfd/rohm-generic.h | 1 +
> > > > > > >  1 file changed, 1 insertion(+)
> > > > > > > 
> > > > > > > diff --git a/include/linux/mfd/rohm-generic.h
> > > > > > > b/include/linux/mfd/rohm-generic.h
> > > > > > > index 66f673c35303..e5392bcbc098 100644
> > > > > > > --- a/include/linux/mfd/rohm-generic.h
> > > > > > > +++ b/include/linux/mfd/rohm-generic.h
> > > > > > > @@ -14,6 +14,7 @@ enum rohm_chip_type {
> > > > > > >   ROHM_CHIP_TYPE_BD71828,
> > > > > > >   ROHM_CHIP_TYPE_BD9571,
> > > > > > >   ROHM_CHIP_TYPE_BD9574,
> > > > > > > + ROHM_CHIP_TYPE_BD71815,
> > > > > > 
> > > > > > Is there a technical reason why these can't be re-ordered?
> > > > > 
> > > > > No, I don't think so.
> > > > > 
> > > > > BTW. there will probably be a (trivial) conflict here as both
> > > > > this
> > > > > series and the BD9576/BD9573 series add an ID here. Let me
> > > > > guess,
> > > > > you'd
> > > > 
> > > > That's fine.  I will resolve that manually.
> > > 
> > > Thanks :)
> > > 
> > > > > like to see them sorted?
> > > > 
> > > > Wouldn't that be nice? :)
> > > Aesthetics is not really my cup of tea. OTOH, if amount of IDs
> > > grow,
> > > then sorting helps spotting whether some IC has an ID here. So yes,
> > > it
> > > kind of makes sense.
> > 
> > By 'nice' I don't mean 'pretty'.
> > 
> > I mean 'improving readability/maintainability would be nice'.
> > 
> > > Can you do sorting while resolving the conflict between series or
> > > do
> > > you want me to
> > > a) do sorting if (when) I re-spin the series
> > > b) send separate sorting patch as a part of this series
> > > c) send sepatate sorting patch after all the pending patches
> > > touching
> > > these IDs have been merged?
> > 
> > I'll let you use your imagination.
> > 
> 
> Right :)
> 
> I'll sort the ID enum when I respin a series which is touching it, ok?
> Or do you want me to resend this even if there were no other changes?
> 
> It's just an old habit to add new enums at the bottom to maintain
> binary compatibility - which does not matter in this case.

I won't let this alone hold up merging of the whole set, but it looks
like you're still short of quite a few reviews.  I'd be surprised if
it's this version that gets applied.

-- 
Lee Jones [李琼斯]
Senior Technical Lead - Developer Services
Linaro.org │ Open source software for Arm SoCs
Follow Linaro: Facebook | Twitter | Blog


RE: [EXT] Re: [net-next] net: mvpp2: Add reserved port private flag configuration

2021-03-10 Thread Stefan Chulski



> -Original Message-
> From: Andrew Lunn 
> Sent: Wednesday, March 10, 2021 5:51 PM
> To: Stefan Chulski 
> Cc: net...@vger.kernel.org; thomas.petazz...@bootlin.com;
> da...@davemloft.net; Nadav Haklai ; Yan
> Markman ; linux-kernel@vger.kernel.org;
> k...@kernel.org; li...@armlinux.org.uk; m...@semihalf.com;
> rmk+ker...@armlinux.org.uk; aten...@kernel.org; rab...@solid-run.com
> Subject: [EXT] Re: [net-next] net: mvpp2: Add reserved port private flag
> configuration
> 
> External Email
> 
> --
> >  static void mvpp2_ethtool_get_strings(struct net_device *netdev, u32
> sset,
> >   u8 *data)
> >  {
> > struct mvpp2_port *port = netdev_priv(netdev);
> > int i, q;
> >
> > -   if (sset != ETH_SS_STATS)
> > -   return;
> > +   switch (sset) {
> > +   case ETH_SS_STATS:
> > +   for (i = 0; i < ARRAY_SIZE(mvpp2_ethtool_mib_regs); i++) {
> > +   strscpy(data, mvpp2_ethtool_mib_regs[i].string,
> > +   ETH_GSTRING_LEN);
> > +   data += ETH_GSTRING_LEN;
> > +   }
> 
> Hi Stefan
> 
> Maybe rename the existing function to
> mvpp2_ethtool_get_strings_stats() and turn it into a helper. Add a new
> mvpp2_ethtool_get_strings_priv() helper. And a new
> mvpp2_ethtool_get_strings() which just calls the two helpers. 

OK, I can do this.
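Something along these lines, as a rough sketch (helper names as you
suggested; the priv-flags body is a placeholder):

	static void mvpp2_ethtool_get_strings_stats(struct net_device *netdev,
						    u8 *data)
	{
		int i;

		for (i = 0; i < ARRAY_SIZE(mvpp2_ethtool_mib_regs); i++) {
			strscpy(data, mvpp2_ethtool_mib_regs[i].string,
				ETH_GSTRING_LEN);
			data += ETH_GSTRING_LEN;
		}
		/* ... remaining ETH_SS_STATS strings ... */
	}

	static void mvpp2_ethtool_get_strings_priv(struct net_device *netdev,
						   u8 *data)
	{
		/* copy the private flag names, e.g. "reserved-port" */
	}

	static void mvpp2_ethtool_get_strings(struct net_device *netdev,
					      u32 sset, u8 *data)
	{
		switch (sset) {
		case ETH_SS_STATS:
			mvpp2_ethtool_get_strings_stats(netdev, data);
			break;
		case ETH_SS_PRIV_FLAGS:
			mvpp2_ethtool_get_strings_priv(netdev, data);
			break;
		}
	}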

> Overall the
> patch should be smaller and much easier to review.
> 
> Andrew

Make it patch series? I can split it to 2/3 patches.
Thanks,
Stefan.



Re: [PATCH V2] ASoC: soc-core: Prevent warning if no DMI table is present

2021-03-10 Thread Takashi Iwai
On Wed, 10 Mar 2021 17:18:14 +0100,
Mark Brown wrote:
> 
> On Wed, Mar 10, 2021 at 09:44:07AM -0600, Pierre-Louis Bossart wrote:
> > On 3/10/21 7:35 AM, Mark Brown wrote:
> 
> > > Just change it to a system level check for ACPI, checking for OF would
> > > leave problems for board files or any other alternative firmware
> > > interfaces.
> 
> > did you mean if (!IS_ENABLED(CONFIG_ACPI)) ?
> 
> Is there a runtime check?

Well, basically both DMI and ACPI are completely different things, so
I don't think it's right to check the availability of ACPI as a signal
of the availability of DMI.


Takashi


Re: [PATCH 3/9] powerpc/pseries: remove the ppc-cmm file system

2021-03-10 Thread Al Viro
On Tue, Mar 09, 2021 at 04:53:42PM +0100, Christoph Hellwig wrote:
> Just use the generic anon_inode file system.

Umm...  The only problem I see here is the lifetime rules for
that module, and that's not something introduced in this patchset.
That said, it looks like the logic around that place is duplicated in
cmm.c, vmw_balloon.c and virtio_balloon.c, and I wonder if it would
be better off with a helper in mm/balloon.c to be used for that setup...


[PATCH]] drm/amdgpu/gfx9: add gfxoff quirk

2021-03-10 Thread Daniel Gomez
Disabling GFXOFF via the quirk list fixes a hardware lockup in
Ryzen V1605B, RAVEN 0x1002:0x15DD rev 0x83.

Signed-off-by: Daniel Gomez 
---

This patch is a continuation of the work here:
https://lkml.org/lkml/2021/2/3/122 where a hardware lockup was discussed and
a dma_fence deadlock was provoked as a side effect. To reproduce the issue
please refer to the above link.

The hardware lockup was introduced in 5.6-rc1 for our particular revision as it
wasn't part of the new blacklist. Before that, in kernel v5.5, this hardware was
working fine without any hardware lockup because GFXOFF was actually disabled
by the if condition for the CHIP_RAVEN case. So this patch adds the 'Radeon
Vega Mobile Series [1002:15dd] (rev 83)' to the blacklist to disable the GFXOFF.

But besides the fix, I'd like to ask where this revision comes from. Is it
an ASIC revision or is it hardcoded in the VBIOS by our vendor? From what I
can see, it comes from the ASIC and I wonder if somehow we can get an APU in the
future, 'not blacklisted', with the same problem. Then, should this table only
filter for the vendor and device and not the revision? Do you know if there are
any revisions for the 1002:15dd validated, tested and functional?

Logs:
[   27.708348] [drm] initializing kernel modesetting (RAVEN
0x1002:0x15DD 0x1002:0x15DD 0x83).
[   27.789156] amdgpu: ATOM BIOS: 113-RAVEN-115

Thanks in advance,
Daniel

 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 65db88bb6cbc..319d4b99aec8 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -1243,6 +1243,8 @@ static const struct amdgpu_gfxoff_quirk 
amdgpu_gfxoff_quirk_list[] = {
{ 0x1002, 0x15dd, 0x103c, 0x83e7, 0xd3 },
/* GFXOFF is unstable on C6 parts with a VBIOS 113-RAVEN-114 */
{ 0x1002, 0x15dd, 0x1002, 0x15dd, 0xc6 },
+   /* GFXOFF provokes a hw lockup on 83 parts with a VBIOS 113-RAVEN-115 */
+   { 0x1002, 0x15dd, 0x1002, 0x15dd, 0x83 },
{ 0, 0, 0, 0, 0 },
 };

--
2.30.1



Re: [PATCH v6 3/6] x86/elf: Support a new ELF aux vector AT_MINSIGSTKSZ

2021-03-10 Thread Bae, Chang Seok
On Mar 5, 2021, at 02:43, Borislav Petkov  wrote:
> On Sat, Feb 27, 2021 at 08:59:08AM -0800, Chang S. Bae wrote:
>> Historically, signal.h defines MINSIGSTKSZ (2KB) and SIGSTKSZ (8KB), for
>> use by all architectures with sigaltstack(2). Over time, the hardware state
>> size grew, but these constants did not evolve. Today, literal use of these
>> constants on several architectures may result in signal stack overflow, and
>> thus user data corruption.
>> 
>> A few years ago, the ARM team addressed this issue by establishing
>> getauxval(AT_MINSIGSTKSZ). This enables the kernel to supply at runtime
>> value that is an appropriate replacement on the current and future
>> hardware.
>> 
>> Add getauxval(AT_MINSIGSTKSZ) support to x86, analogous to the support
>> added for ARM in commit 94b07c1f8c39 ("arm64: signal: Report signal frame
>> size to userspace via auxv").
>> 
>> Also, include a documentation to describe x86-specific auxiliary vectors.
>> 
>> Reported-by: Florian Weimer 
>> Fixes: c2bc11f10a39 ("x86, AVX-512: Enable AVX-512 States Context Switch")
> 
> Right, so this has a Fixes: tag and points to bugzilla entry which talks
> about signal stack corruption with AVX-512F.
> 
> But if this is going to be backported to stable, then the patch(es)
> should be minimal and not contain documentation. And if so, one will
> need all three to be backported, which means, a cc:stable should contain
> a comment explaining that.
> 
> Or am I misreading and they should not need to be backported to stable
> because some ?
> 
> Also, I'm not sure backporting a patch to stable which changes ABI is
> ok. It probably is but I don't know.
> 
> So what's the deal here?

Yeah, right. While this attempts to fix the issue, it involves an ABI change.
Len and I think PATCH5 [1] is rather a backport candidate as it gives a more
reasonable behavior.

At least, I can make a new patch for this documentation if you think it is the
right way.

> You also need:
> 
> diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
> index 4693e192b447..d58614d5cde6 100644
> --- a/Documentation/x86/index.rst
> +++ b/Documentation/x86/index.rst
> @@ -35,3 +35,4 @@ x86-specific Documentation
>sva
>sgx
>features
> +   elf_auxvec
> 
> to add this to the TOC.

Ah, will do that.

>> +   #include <signal.h>
>> +   #include <sys/auxv.h>
>> +
>> +   #ifndef AT_MINSIGSTKSZ
>> +   #define AT_MINSIGSTKSZ   51
>> +   #endif
>> +
>> +   stack_t ss;
>> +   int err;
>> +
>> +   ss.ss_size = getauxval(AT_MINSIGSTKSZ) + SIGSTKSZ;
>> +   ss.ss_sp = malloc(ss.ss_size);
>> +   ...
>> +
>> +   err = sigaltstack(&ss, NULL);
>> +   ...
> 
> That source code needs some special markup to look like source code -
> currently, the result looks bad.

How about this code:

#include <assert.h>
#include <err.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/auxv.h>

#ifndef AT_MINSIGSTKSZ
#define AT_MINSIGSTKSZ  51
#endif

stack_t ss;

/* query the kernel-required minimum before sizing the allocation */
ss.ss_size = getauxval(AT_MINSIGSTKSZ) + SIGSTKSZ;
ss.ss_sp = malloc(ss.ss_size);
assert(ss.ss_sp);

ss.ss_flags = 0;

if (sigaltstack(&ss, NULL))
        err(1, "sigaltstack");


>> +2. The exposed auxiliary vectors
>> +-
>> +
>> +AT_SYSINFO
>> +The entry point to the system call function the virtual Dynamic Shared
>> +Object (vDSO), not exported on 64-bit.
> 
> I can't parse that sentence.
> 
>> +
>> +AT_SYSINFO_EHDR
>> +The start address of the page containing vDSO.
>   ^
>   the
>> +
>> +AT_MINSIGSTKSZ
>> +The minimum stack size required to deliver a signal. It is a calculated
>> +sigframe size based on the largest possible user context. When programs
>> +use sigaltstack() to provide alternate signal stack, that stack must be
>> +at least the size to function properly on this hardware. Note that this
>> +is a minimum of the kernel to correctly get to the signal handler.
> 
> I get what this is trying to say but it reads weird. Simplify pls.
> 
>> +Additional space must be added to handle objects pushed onto the stack
>> +by the signal handlers, as well as for nested signal delivery.
>> +
>> +The purpose of this parameter is to accommodate the different stack
>> +sizes required by different hardware configuration. E.g., the x86
>> +system supporting the Advanced Vector Extension needs at least 8KB more
>> +than the one without it.
> 
> That could be simplified too.

Rewrote like this:

AT_SYSINFO is used for locating the vsyscall entry point.  It is not exported
on 64-bit mode.

AT_SYSINFO_EHDR is the start address of the page containing the vDSO.

AT_MINSIGSTKSZ denotes the minimum stack size required by the kernel to
deliver a signal to user-space.  AT_MINSIGSTKSZ covers the space consumed
by the kernel to accommodate the user context for the current hardware
configuration.  It does not cover subsequent user-space stack
consumption, which must be added by the user.  (e.g. Above, user-space 

Re: [PATCH v4 0/4] Some improvement for Intel MAX 10 MFD drivers

2021-03-10 Thread Lee Jones
On Wed, 10 Mar 2021, Xu Yilun wrote:

> This patchset is some improvements for intel-m10-bmc and its subdevs.
> 
> Main changes from v1:
> - Add a patch (#2) to simplify the definition of the legacy version reg.
> - Add a patch (#4), add entry in MAINTAINERS for intel-m10-bmc mfd driver
>   and the subdev drivers.
> 
> Main changes from v2:
> - Add Tom Rix as the reviewer for intel-m10-bmc mfd driver and the subdev
>   drivers.
> - Rebased to 5.12-rc1
> 
> Main changes from v3:
> - Improve the comments for valid version check.

Good enough.

All applied, thanks.

-- 
Lee Jones [李琼斯]
Senior Technical Lead - Developer Services
Linaro.org │ Open source software for Arm SoCs
Follow Linaro: Facebook | Twitter | Blog


RE: [PATCH v3 1/3] scsi: ufshcd: use a function to calculate versions

2021-03-10 Thread Avri Altman
> @@ -9298,10 +9291,7 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem
> *mmio_base, unsigned int irq)
> /* Get UFS version supported by the controller */
> hba->ufs_version = ufshcd_get_ufs_version(hba);
> 
> -   if ((hba->ufs_version != UFSHCI_VERSION_10) &&
> -   (hba->ufs_version != UFSHCI_VERSION_11) &&
> -   (hba->ufs_version != UFSHCI_VERSION_20) &&
> -   (hba->ufs_version != UFSHCI_VERSION_21))
> +   if (hba->ufs_version < ufshci_version(1, 0))
> dev_err(hba->dev, "invalid UFS version 0x%x\n",
> hba->ufs_version);
Here you replace the specific allowable values with an expression
that doesn't really reflect those values.
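
For context, the helper introduced earlier in the series packs the
major/minor numbers into the UFSHCI VER register layout (major in bits
15:8, minor in bits 7:4), roughly:

	static inline u32 ufshci_version(u32 major, u32 minor)
	{
		return (major << 8) | (minor << 4);
	}

so ufshci_version(1, 0) is the lowest value the register can hold, and
the check above treats anything below it as invalid rather than listing
the individual supported versions.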


[PATCH 3/3] arm64: dts: qcom: sm8150: add i2c nodes

2021-03-10 Thread Caleb Connolly
Tested on the OnePlus 7 Pro (including DMA).

Signed-off-by: Caleb Connolly 
---
 arch/arm64/boot/dts/qcom/sm8150.dtsi | 521 +++
 1 file changed, 521 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/sm8150.dtsi 
b/arch/arm64/boot/dts/qcom/sm8150.dtsi
index 543417d74216..0a38ad54c715 100644
--- a/arch/arm64/boot/dts/qcom/sm8150.dtsi
+++ b/arch/arm64/boot/dts/qcom/sm8150.dtsi
@@ -588,6 +588,111 @@ qupv3_id_0: geniqup@8c {
#size-cells = <2>;
ranges;
status = "disabled";
+
+   i2c0: i2c@88 {
+   compatible = "qcom,geni-i2c";
+   reg = <0 0x0088 0 0x4000>;
+   clock-names = "se";
+   clocks = < GCC_QUPV3_WRAP0_S0_CLK>;
+   pinctrl-names = "default";
+   pinctrl-0 = <_i2c0_default>;
+   interrupts = ;
+   #address-cells = <1>;
+   #size-cells = <0>;
+   status = "disabled";
+   };
+
+   i2c1: i2c@884000 {
+   compatible = "qcom,geni-i2c";
+   reg = <0 0x00884000 0 0x4000>;
+   clock-names = "se";
+   clocks = < GCC_QUPV3_WRAP0_S1_CLK>;
+   pinctrl-names = "default";
+   pinctrl-0 = <_i2c1_default>;
+   interrupts = ;
+   #address-cells = <1>;
+   #size-cells = <0>;
+   status = "disabled";
+   };
+
+   i2c2: i2c@888000 {
+   compatible = "qcom,geni-i2c";
+   reg = <0 0x00888000 0 0x4000>;
+   clock-names = "se";
+   clocks = < GCC_QUPV3_WRAP0_S2_CLK>;
+   pinctrl-names = "default";
+   pinctrl-0 = <_i2c2_default>;
+   interrupts = ;
+   #address-cells = <1>;
+   #size-cells = <0>;
+   status = "disabled";
+   };
+
+   i2c3: i2c@88c000 {
+   compatible = "qcom,geni-i2c";
+   reg = <0 0x0088c000 0 0x4000>;
+   clock-names = "se";
+   clocks = < GCC_QUPV3_WRAP0_S3_CLK>;
+   pinctrl-names = "default";
+   pinctrl-0 = <_i2c3_default>;
+   interrupts = ;
+   #address-cells = <1>;
+   #size-cells = <0>;
+   status = "disabled";
+   };
+
+   i2c4: i2c@89 {
+   compatible = "qcom,geni-i2c";
+   reg = <0 0x0089 0 0x4000>;
+   clock-names = "se";
+   clocks = < GCC_QUPV3_WRAP0_S4_CLK>;
+   pinctrl-names = "default";
+   pinctrl-0 = <_i2c4_default>;
+   interrupts = ;
+   #address-cells = <1>;
+   #size-cells = <0>;
+   status = "disabled";
+   };
+
+   i2c5: i2c@894000 {
+   compatible = "qcom,geni-i2c";
+   reg = <0 0x00894000 0 0x4000>;
+   clock-names = "se";
+   clocks = < GCC_QUPV3_WRAP0_S5_CLK>;
+   pinctrl-names = "default";
+   pinctrl-0 = <_i2c5_default>;
+   interrupts = ;
+   #address-cells = <1>;
+   #size-cells = <0>;
+   status = "disabled";
+   };
+
+   i2c6: i2c@898000 {
+   compatible = "qcom,geni-i2c";
+   reg = <0 0x00898000 0 0x4000>;
+   clock-names = "se";
+   clocks = < GCC_QUPV3_WRAP0_S6_CLK>;
+   pinctrl-names = "default";
+   pinctrl-0 = <_i2c6_default>;
+   interrupts = ;
+   #address-cells = <1>;
+   #size-cells = <0>;
+

[PATCH 2/3] arm64: dts: qcom: sm8150: add iommus to qups

2021-03-10 Thread Caleb Connolly
Hook up the SMMU for doing DMA over i2c. Some peripherals like
touchscreens easily exceed 32-bytes per transfer, causing errors and
lockups without this.

Signed-off-by: Caleb Connolly 
---
Fixes i2c on the OnePlus 7, without this touching the screen with more
than 4 fingers causes the device to lock up and reboot.
---
 arch/arm64/boot/dts/qcom/sm8150.dtsi | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/sm8150.dtsi 
b/arch/arm64/boot/dts/qcom/sm8150.dtsi
index 03e05d98daf2..543417d74216 100644
--- a/arch/arm64/boot/dts/qcom/sm8150.dtsi
+++ b/arch/arm64/boot/dts/qcom/sm8150.dtsi
@@ -583,6 +583,7 @@ qupv3_id_0: geniqup@8c {
clock-names = "m-ahb", "s-ahb";
clocks = < GCC_QUPV3_WRAP_0_M_AHB_CLK>,
 < GCC_QUPV3_WRAP_0_S_AHB_CLK>;
+   iommus = <_smmu 0xc3 0x0>;
#address-cells = <2>;
#size-cells = <2>;
ranges;
@@ -595,6 +596,7 @@ qupv3_id_1: geniqup@ac {
clock-names = "m-ahb", "s-ahb";
clocks = < GCC_QUPV3_WRAP_1_M_AHB_CLK>,
 < GCC_QUPV3_WRAP_1_S_AHB_CLK>;
+   iommus = <_smmu 0x603 0x0>;
#address-cells = <2>;
#size-cells = <2>;
ranges;
@@ -617,6 +619,7 @@ qupv3_id_2: geniqup@cc {
clock-names = "m-ahb", "s-ahb";
clocks = < GCC_QUPV3_WRAP_2_M_AHB_CLK>,
 < GCC_QUPV3_WRAP_2_S_AHB_CLK>;
+   iommus = <_smmu 0x7a3 0x0>;
#address-cells = <2>;
#size-cells = <2>;
ranges;
-- 
2.29.2




[PATCH 1/3] arm64: dts: qcom: sm8150: add other QUP nodes

2021-03-10 Thread Caleb Connolly
Add the first and third qupv3 nodes used to hook
up peripherals on some devices.

Signed-off-by: Caleb Connolly 
---
 arch/arm64/boot/dts/qcom/sm8150.dtsi | 25 +
 1 file changed, 25 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/sm8150.dtsi 
b/arch/arm64/boot/dts/qcom/sm8150.dtsi
index e5bb17bc2f46..03e05d98daf2 100644
--- a/arch/arm64/boot/dts/qcom/sm8150.dtsi
+++ b/arch/arm64/boot/dts/qcom/sm8150.dtsi
@@ -577,6 +577,18 @@ gcc: clock-controller@10 {
 <_clk>;
};
 
+   qupv3_id_0: geniqup@8c {
+   compatible = "qcom,geni-se-qup";
+   reg = <0x0 0x008c 0x0 0x6000>;
+   clock-names = "m-ahb", "s-ahb";
+   clocks = < GCC_QUPV3_WRAP_0_M_AHB_CLK>,
+< GCC_QUPV3_WRAP_0_S_AHB_CLK>;
+   #address-cells = <2>;
+   #size-cells = <2>;
+   ranges;
+   status = "disabled";
+   };
+
qupv3_id_1: geniqup@ac {
compatible = "qcom,geni-se-qup";
reg = <0x0 0x00ac 0x0 0x6000>;
@@ -598,6 +610,19 @@ uart2: serial@a9 {
};
};
 
+   qupv3_id_2: geniqup@cc {
+   compatible = "qcom,geni-se-qup";
+   reg = <0x0 0x00cc 0x0 0x6000>;
+
+   clock-names = "m-ahb", "s-ahb";
+   clocks = < GCC_QUPV3_WRAP_2_M_AHB_CLK>,
+< GCC_QUPV3_WRAP_2_S_AHB_CLK>;
+   #address-cells = <2>;
+   #size-cells = <2>;
+   ranges;
+   status = "disabled";
+   };
+
config_noc: interconnect@150 {
compatible = "qcom,sm8150-config-noc";
reg = <0 0x0150 0 0x7400>;
-- 
2.29.2




Re: [PATCH] net: add net namespace inode for all net_dev events

2021-03-10 Thread Steven Rostedt
On Wed, 10 Mar 2021 17:03:40 +0800
Tony Lu  wrote:

> I use pahole to read vmlinux.o directly with defconfig and
> CONFIG_DEBUG_INFO enabled, the result shows 22 structs prefixed with
> trace_event_raw_ that have at least one hole.

I was thinking of pahole too ;-)

But the information can also be captured from the format files with simple
scripts as well. And perhaps be more tuned to see if there's actually a fix
for them, and ignore reporting it if there is no fix, as all trace events
are 4 byte aligned, and if we are off by one, sometimes it doesn't matter.

-- Steve


Re: [PATCH v6 1/6] uapi: Define the aux vector AT_MINSIGSTKSZ

2021-03-10 Thread Bae, Chang Seok
On Mar 1, 2021, at 11:09, Borislav Petkov  wrote:
> On Sat, Feb 27, 2021 at 08:59:06AM -0800, Chang S. Bae wrote:
>> 
>> diff --git a/include/uapi/linux/auxvec.h b/include/uapi/linux/auxvec.h
>> index abe5f2b6581b..15be98c75174 100644
>> --- a/include/uapi/linux/auxvec.h
>> +++ b/include/uapi/linux/auxvec.h
>> @@ -33,5 +33,8 @@
>> 
>> #define AT_EXECFN  31/* filename of program */
>> 
>> +#ifndef AT_MINSIGSTKSZ
>> +#define AT_MINSIGSTKSZ  51  /* stack needed for signal delivery  */
> 
> I know glibc's comment says a similar thing but the correct thing to say
> here should be "minimal stack size for signal delivery" or so. Even the
> variable name alludes to that too.

Yeah, you’re right.

Thanks,
Chang

Re: [PATCH V11 3/5] kbuild: Allow .dtso format for overlay source files

2021-03-10 Thread Masahiro Yamada
On Wed, Mar 10, 2021 at 11:47 PM Viresh Kumar  wrote:
>
> On 10-03-21, 20:24, Masahiro Yamada wrote:
> > On Wed, Mar 10, 2021 at 2:35 PM Viresh Kumar  
> > wrote:
> > > diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
> > > index bc045a54a34e..59e86f67f9e0 100644
> > > --- a/scripts/Makefile.lib
> > > +++ b/scripts/Makefile.lib
> > > @@ -339,7 +339,7 @@ $(obj)/%.dtb.S: $(obj)/%.dtb FORCE
> > >
> > >  quiet_cmd_dtc = DTC $@
> > >  cmd_dtc = $(HOSTCC) -E $(dtc_cpp_flags) -x assembler-with-cpp -o 
> > > $(dtc-tmp) $< ; \
> > > -   $(DTC) -O $(patsubst .%,%,$(suffix $@)) -o $@ -b 0 \
> > > +   $(DTC) -I dts -O $(patsubst .%,%,$(suffix $@)) -o $@ -b 0 \
> >
> > Even without "-I dts",
> >
> >inform = guess_input_format(arg, "dts");
> >
> > seems to fall back to "dts" anyway,
>
> I missed this TBH.
>
> > but I guess you wanted to make this explicit, correct?
>
> That can be a reason now :)
>
> > I will drop the ugly -O.
> > https://patchwork.kernel.org/project/linux-kbuild/patch/20210310110824.782209-1-masahi...@kernel.org/
>
> But if we are going to depend on DTC to guess it right, then we
> shouldn't add -I at all..
>
> > I will queue it to linux-kbuild/fixes.
> >
> >
> >
> > > $(addprefix -i,$(dir $<) $(DTC_INCLUDE)) $(DTC_FLAGS) \
> > > -d $(depfile).dtc.tmp $(dtc-tmp) ; \
> > > cat $(depfile).pre.tmp $(depfile).dtc.tmp > $(depfile)
> > > @@ -347,9 +347,13 @@ cmd_dtc = $(HOSTCC) -E $(dtc_cpp_flags) -x 
> > > assembler-with-cpp -o $(dtc-tmp) $< ;
> > >  $(obj)/%.dtb: $(src)/%.dts $(DTC) FORCE
> > > $(call if_changed_dep,dtc)
> > >
> > > +# Required for of unit-test files as they can't be renamed to .dtso
> >
> > If you go with *.dtso, I think you will rename
> > all *.dts under the drivers/ directory.
> >
> > What is blocking you from making this consistent?
>
> The unit-test dts files are designed differently (we have had lots of
> discussion between Frank and David on that) and they aren't purely
> overlay or base files. They are designed to do some tricky testing and
> renaming them to .dtso won't be right, we are just reusing them to do
> static (build time) testing as well.


I still do not understand.

If they are not overlay files, why
do you need to have them suffixed with .dtbo?

".dts -> .dtb" should be enough.

Why do you need to do ".dts  -> .dtbo" ?




> I think it would be better if we can drop the existing %.dtbo rule
> here (i.e. dtbo from .dts) and do some magic in unit-test's Makefile,
> so it is localised at least instead of it here.
>
> Any ideas for that ?

I do not know.

My impression is you are doing something fishy.




-- 
Best Regards
Masahiro Yamada


Re: [PATCH v10 1/2] scsi: ufs: Enable power management for wlun

2021-03-10 Thread Alan Stern
On Tue, Mar 09, 2021 at 08:04:53PM -0800, Asutosh Das (asd) wrote:
> On 3/9/2021 7:14 PM, Alan Stern wrote:
> > On Tue, Mar 09, 2021 at 07:04:34PM -0800, Asutosh Das (asd) wrote:
> > > Hello
> > > I & Can (thanks CanG) debugged this further:
> > > 
> > > Looks like this issue can occur if the sd probe is asynchronous.
> > > 
> > > Essentially, the sd_probe() is done asynchronously and 
> > > driver_probe_device()
> > > invokes pm_runtime_get_suppliers() before invoking sd_probe().
> > > 
> > > But scsi_probe_and_add_lun() runs in a separate context.
> > > So the scsi_autopm_put_device() invoked from scsi_scan_host() context
> > > reduces the link->rpm_active to 1. And sd_probe() invokes
> > > scsi_autopm_put_device() and starts a timer. And then 
> > > driver_probe_device()
> > > invoked from __device_attach_async_helper context reduces the
> > > link->rpm_active to 1 thus enabling the supplier to suspend before the
> > > consumer suspends.
> > 
> > > I don't see a way around this. Please let me know if you
> > > (@Alan/@Bart/@Adrian) have any thoughts on this.
> > 
> > How about changing the SCSI core so that it does a runtime_get before
> > starting an async probe, and the async probe routine does a
> > runtime_put when it is finished?  In other words, don't allow a device
> > to go into runtime suspend while it is waiting to be probed.
> > 
> > I don't think that would be too intrusive.
> > 
> > Alan Stern
> > 
> 
> Hi Alan
> Thanks for the suggestion.
> 
> Am trying to understand:
> 
> Do you mean something like this:
> 
> int scsi_sysfs_add_sdev(struct scsi_device *sdev)
> {
>   
>   scsi_autopm_get_device(sdev);
>   pm_runtime_get_noresume(&sdev->sdev_gendev);
>   [...]
>   scsi_autopm_put_device(sdev);
>   [...]
> }
> 
> static int sd_probe(struct device *dev)
> {
>   [...]
>   pm_runtime_put_noidle(dev);
>   scsi_autopm_put_device(sdp);
>   [...]
> }
> 
> This may work (I'm limited by my imagination in scsi layer :) ).

I'm not sure about this.  To be honest, I did not read the entirety of 
your last message; it had way too much detail.  There's a time and place 
for that, but when you're brainstorming to figure out the underlying 
cause of a problem and come up with a strategy to fix it, you want to 
concentrate on the overall picture, not the details.

As I understand the situation, you've got a SCSI target with multiple 
logical units, let's say A and B, and you need to make sure that A never 
goes into runtime suspend unless B is already suspended.  In other 
words, B always has to suspend before A and resume after A.

To do this, you register a device link with A as the supplier and B as 
the consumer.  Then the PM core takes care of the ordering for you.
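
Schematically (sdev_a/sdev_b are placeholder names for the two logical
units):

	/* B (consumer) suspends before and resumes after A (supplier) */
	struct device_link *link;

	link = device_link_add(&sdev_b->sdev_gendev, &sdev_a->sdev_gendev,
			       DL_FLAG_PM_RUNTIME | DL_FLAG_RPM_ACTIVE);
	if (!link)
		dev_warn(&sdev_b->sdev_gendev, "failed to create PM link\n");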

But I don't understand when you set up the device link.  If the timing 
is wrong then, thanks to async SCSI probing, you may have a situation 
where A is registered before B and before the link is set up.  Then 
there's temporarily nothing to stop A from suspending before B.

You also need to prevent each device from suspending before it is 
probed.  That's the easy part I was trying to address before (although 
it may not be so easy if the drivers are in loadable modules and not 
present in the kernel).

You need to think through these issues before proposing actual changes.

> But the pm_runtime_put_noidle() would have to be added to all registered
> scsi_driver{}, perhaps? Or may be I can check for sdp->type?

Like this; it's too early to worry about this sort of thing.

Alan Stern


[PATCH v2] mm, tracing: improve rss_stat tracepoint message

2021-03-10 Thread Ovidiu Panait
Adjust the rss_stat tracepoint to print the name of the resident page type
that got updated (e.g. MM_ANONPAGES/MM_FILEPAGES), rather than the numeric
index corresponding to it (the __entry->member value):

Before this patch:
--
rss_stat: mm_id=1216113068 curr=0 member=1 size=28672B
rss_stat: mm_id=1216113068 curr=0 member=1 size=0B
rss_stat: mm_id=534402304 curr=1 member=0 size=188416B
rss_stat: mm_id=534402304 curr=1 member=1 size=40960B

After this patch:
-
rss_stat: mm_id=1726253524 curr=1 type=MM_ANONPAGES size=40960B
rss_stat: mm_id=1726253524 curr=1 type=MM_FILEPAGES size=663552B
rss_stat: mm_id=1726253524 curr=1 type=MM_ANONPAGES size=65536B
rss_stat: mm_id=1726253524 curr=1 type=MM_FILEPAGES size=647168B

Use TRACE_DEFINE_ENUM()/__print_symbolic() logic to map the enum values to
the strings they represent, so that userspace tools can also parse the raw
data correctly.

Signed-off-by: Ovidiu Panait 
Signed-off-by: Steven Rostedt 
---
v2: - rework the patch so that user space tools can parse the raw
  data correctly

 include/trace/events/kmem.h | 24 ++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 3a60b6b6db32..829a75692cc0 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -343,6 +343,26 @@ static unsigned int __maybe_unused mm_ptr_to_hash(const 
void *ptr)
 #define __PTR_TO_HASHVAL
 #endif
 
+#define TRACE_MM_PAGES \
+   EM(MM_FILEPAGES)\
+   EM(MM_ANONPAGES)\
+   EM(MM_SWAPENTS) \
+   EMe(MM_SHMEMPAGES)
+
+#undef EM
+#undef EMe
+
+#define EM(a)  TRACE_DEFINE_ENUM(a);
+#define EMe(a) TRACE_DEFINE_ENUM(a);
+
+TRACE_MM_PAGES
+
+#undef EM
+#undef EMe
+
+#define EM(a)  { a, #a },
+#define EMe(a) { a, #a }
+
 TRACE_EVENT(rss_stat,
 
TP_PROTO(struct mm_struct *mm,
@@ -365,10 +385,10 @@ TRACE_EVENT(rss_stat,
__entry->size = (count << PAGE_SHIFT);
),
 
-   TP_printk("mm_id=%u curr=%d member=%d size=%ldB",
+   TP_printk("mm_id=%u curr=%d type=%s size=%ldB",
__entry->mm_id,
__entry->curr,
-   __entry->member,
+   __print_symbolic(__entry->member, TRACE_MM_PAGES),
__entry->size)
);
 #endif /* _TRACE_KMEM_H */
-- 
2.17.1



Re: [PATCH] arm64: make STACKPROTECTOR_PER_TASK configurable.

2021-03-10 Thread Michal Suchánek
On Wed, Mar 10, 2021 at 04:07:00AM +0900, Masahiro Yamada wrote:
> On Wed, Mar 10, 2021 at 12:10 AM Michal Suchánek  wrote:
> >
> > On Tue, Mar 09, 2021 at 11:53:21PM +0900, Masahiro Yamada wrote:
> > > On Tue, Mar 9, 2021 at 10:35 PM Michal Suchánek  wrote:
> > > >
> > > > On Tue, Mar 09, 2021 at 10:22:36PM +0900, Masahiro Yamada wrote:
> > > > > On Tue, Mar 9, 2021 at 9:35 PM Michal Suchanek  
> > > > > wrote:
> > > > > >
> > > > > > When using dummy-tools STACKPROTECTOR_PER_TASK is unconditionally
> > > > > > selected. This defeats the purpose of the all-enabled tool.
> > > > > >
> > > > > > Description copied from arm
> > > > > >
> > > > > > Cc: Masahiro Yamada 
> > > > > > Signed-off-by: Michal Suchanek 
> > > > >
> > > > >
> > > > > Could you explain what problem
> > > > > this patch is trying to solve?
> > > >
> > > > The option cannot be disabled when compiler has the required capability.
> > >
> > >
> > > Yes.
> > > Currently, this symbol claims "def_bool y",
> > > so there is no way to disable it.
> > >
> > > But, it comes from the nature of Kconfig in general.
> > >
> > > dummy-tools is completely unrelated here.
> >
> > dummy-tools makes all configuration options available in order to be
> > able to author configuration files on a system different from the one
> > where the kernel is built. This prevents authoring a configuration file
> > with this option disabled.
> 
> 
> No.
> dummy-tools enables as many $(cc-option, ...)
> and $(shell, ...) as possible. That's it.
> 
> 
> In my understanding, STACKPROTECTOR_PER_TASK
> should not be user-configurable.
> That is why 'def_bool y'.

And that's wrong. Either it's a required compiler feature and then it
should be always enabled. Or it is optional and then it should be
possible to disable - one or the other.

When it is optional you should be able to

 - (de)select it in randconfig
 - test/bisect the old code path without crippling your toolchain
 - author a configuration file that does not have the option enabled,
   for builds on a system that uses an older toolchain

Thanks

Michal


Re: [RFC PATCH 0/3] hugetlb: add demote/split page functionality

2021-03-10 Thread Michal Hocko
On Mon 08-03-21 16:18:52, Mike Kravetz wrote:
[...]
> Converting larger to smaller hugetlb pages can be accomplished today by
> first freeing the larger page to the buddy allocator and then allocating
> the smaller pages.  However, there are two issues with this approach:
> 1) This process can take quite some time, especially if allocation of
>the smaller pages is not immediate and requires migration/compaction.
> 2) There is no guarantee that the total size of smaller pages allocated
>will match the size of the larger page which was freed.  This is
>because the area freed by the larger page could quickly be
>fragmented.

It will likely not be a surprise that I have some reservations. While your
concerns about reconfiguration via existing interfaces are quite real, is
this really a problem in practice? How often do you need such a
reconfiguration?

Is this all really worth the additional code in something as tricky as
the hugetlb code base?

>  include/linux/hugetlb.h |   8 ++
>  mm/hugetlb.c| 199 +++-
>  2 files changed, 204 insertions(+), 3 deletions(-)
> 
> -- 
> 2.29.2
> 

-- 
Michal Hocko
SUSE Labs


[PATCH v5 3/3] arm64: dts: ti: k3-j7200: Add support for higher speed modes and update delay select values for MMCSD subsystems

2021-03-10 Thread Aswath Govindraju
The following speed modes are now supported in the J7200 SoC:
- HS200 and HS400 modes at 1.8 V card voltage, in MMCSD0 subsystem [1].
- UHS-I speed modes in MMCSD1 subsystem [1].

Add support for UHS-I modes by adding voltage regulator device tree nodes
and corresponding pinmux details, to power cycle and voltage switch cards.
Set respective tags in sdhci0 and remove no-1-8-v tag from sdhci1
device tree nodes.

Also update the delay values for various speed modes supported, based on
the latest J7200 datasheet[2].

[1] - section 12.3.6.1.1 MMCSD Features, in
  https://www.ti.com/lit/ug/spruiu1a/spruiu1a.pdf

[2] - https://www.ti.com/lit/ds/symlink/dra821a.pdf

Signed-off-by: Aswath Govindraju 
---
 .../dts/ti/k3-j7200-common-proc-board.dts | 42 +++
 arch/arm64/boot/dts/ti/k3-j7200-main.dtsi | 14 ++-
 2 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/boot/dts/ti/k3-j7200-common-proc-board.dts 
b/arch/arm64/boot/dts/ti/k3-j7200-common-proc-board.dts
index b493f939b09a..6f90c3b1cf45 100644
--- a/arch/arm64/boot/dts/ti/k3-j7200-common-proc-board.dts
+++ b/arch/arm64/boot/dts/ti/k3-j7200-common-proc-board.dts
@@ -16,6 +16,29 @@
stdout-path = "serial2:115200n8";
bootargs = "console=ttyS2,115200n8 
earlycon=ns16550a,mmio32,0x0280";
};
+
+   vdd_mmc1: fixedregulator-sd {
+   compatible = "regulator-fixed";
+   regulator-name = "vdd_mmc1";
+   regulator-min-microvolt = <330>;
+   regulator-max-microvolt = <330>;
+   regulator-boot-on;
+   enable-active-high;
+   gpios = < 2 GPIO_ACTIVE_HIGH>;
+   };
+
+   vdd_sd_dv: gpio-regulator-vdd-sd-dv {
+   compatible = "regulator-gpio";
+   regulator-name = "vdd_sd_dv";
+   pinctrl-names = "default";
+   pinctrl-0 = <_sd_dv_pins_default>;
+   regulator-min-microvolt = <180>;
+   regulator-max-microvolt = <330>;
+   regulator-boot-on;
+   gpios = <_gpio0 55 GPIO_ACTIVE_HIGH>;
+   states = <180 0x0
+ 330 0x1>;
+   };
 };
 
 _pmx0 {
@@ -45,6 +68,13 @@
 };
 
 _pmx0 {
+   main_i2c0_pins_default: main-i2c0-pins-default {
+   pinctrl-single,pins = <
+   J721E_IOPAD(0xd4, PIN_INPUT_PULLUP, 0) /* (V3) I2C0_SCL 
*/
+   J721E_IOPAD(0xd8, PIN_INPUT_PULLUP, 0) /* (W2) I2C0_SDA 
*/
+   >;
+   };
+
main_i2c1_pins_default: main-i2c1-pins-default {
pinctrl-single,pins = <
J721E_IOPAD(0xdc, PIN_INPUT_PULLUP, 3) /* (U3) 
ECAP0_IN_APWM_OUT.I2C1_SCL */
@@ -70,6 +100,12 @@
J721E_IOPAD(0x120, PIN_OUTPUT, 0) /* (T4) USB0_DRVVBUS 
*/
>;
};
+
+   vdd_sd_dv_pins_default: vdd_sd_dv_pins_default {
+   pinctrl-single,pins = <
+   J721E_IOPAD(0xd0, PIN_INPUT, 7) /* (T5) 
SPI0_D1.GPIO0_55 */
+   >;
+   };
 };
 
 _uart0 {
@@ -157,6 +193,10 @@
 };
 
 _i2c0 {
+   pinctrl-names = "default";
+   pinctrl-0 = <_i2c0_pins_default>;
+   clock-frequency = <40>;
+
exp1: gpio@20 {
compatible = "ti,tca6416";
reg = <0x20>;
@@ -206,6 +246,8 @@
/* SD card */
pinctrl-0 = <_mmc1_pins_default>;
pinctrl-names = "default";
+   vmmc-supply = <_mmc1>;
+   vqmmc-supply = <_sd_dv>;
ti,driver-strength-ohm = <50>;
disable-wp;
 };
diff --git a/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi 
b/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi
index e60650a62b14..f86c493a44f1 100644
--- a/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi
+++ b/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi
@@ -512,11 +512,16 @@
ti,otap-del-sel-mmc-hs = <0x0>;
ti,otap-del-sel-ddr52 = <0x6>;
ti,otap-del-sel-hs200 = <0x8>;
-   ti,otap-del-sel-hs400 = <0x0>;
+   ti,otap-del-sel-hs400 = <0x5>;
+   ti,itap-del-sel-legacy = <0x10>;
+   ti,itap-del-sel-mmc-hs = <0xa>;
ti,strobe-sel = <0x77>;
+   ti,clkbuf-sel = <0x7>;
ti,trm-icp = <0x8>;
bus-width = <8>;
mmc-ddr-1_8v;
+   mmc-hs200-1_8v;
+   mmc-hs400-1_8v;
dma-coherent;
};
 
@@ -534,7 +539,12 @@
ti,otap-del-sel-sdr50 = <0xc>;
ti,otap-del-sel-sdr104 = <0x5>;
ti,otap-del-sel-ddr50 = <0xc>;
-   no-1-8-v;
+   ti,itap-del-sel-legacy = <0x0>;
+   ti,itap-del-sel-sd-hs = <0x0>;
+   ti,itap-del-sel-sdr12 = <0x0>;
+   ti,itap-del-sel-sdr25 = <0x0>;
+   ti,clkbuf-sel = <0x7>;
+   ti,trm-icp = <0x8>;
dma-coherent;

Re: [PATCH v4 4/5] perf stat: Enable iostat mode for x86 platforms

2021-03-10 Thread Alexander Antonov



On 3/9/2021 10:51 AM, liuqi (BA) wrote:

Hi Alexander,

On 2021/2/3 21:58, Alexander Antonov wrote:

This functionality is based on recently introduced sysfs attributes
for Intel® Xeon® Scalable processor family (code name Skylake-SP):
Commit bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to
IIO PMON mapping")

Mode is intended to provide four I/O performance metrics in MB per each
PCIe root port:
  - Inbound Read: I/O devices below root port read from the host memory
  - Inbound Write: I/O devices below root port write to the host memory
  - Outbound Read: CPU reads from I/O devices below root port
  - Outbound Write: CPU writes to I/O devices below root port

Each metric requires only one uncore event, which increments at every 4B
transfer in the corresponding direction. The formulas to compute metrics
are generic:
 #EventCount * 4B / (1024 * 1024)

Signed-off-by: Alexander Antonov
---
  tools/perf/Documentation/perf-iostat.txt |  88 ++
  tools/perf/Makefile.perf |   5 +-
  tools/perf/arch/x86/util/Build   |   1 +
  tools/perf/arch/x86/util/iostat.c    | 345 +++
  tools/perf/command-list.txt  |   1 +
  tools/perf/perf-iostat.sh    |  12 +
  6 files changed, 451 insertions(+), 1 deletion(-)
  create mode 100644 tools/perf/Documentation/perf-iostat.txt
  create mode 100644 tools/perf/perf-iostat.sh

diff --git a/tools/perf/Documentation/perf-iostat.txt 
b/tools/perf/Documentation/perf-iostat.txt

new file mode 100644
index ..165176944031
--- /dev/null
+++ b/tools/perf/Documentation/perf-iostat.txt
@@ -0,0 +1,88 @@
+perf-iostat(1)
+===
+
+NAME
+
+perf-iostat - Show I/O performance metrics
+
+SYNOPSIS
+
+[verse]
+'perf iostat' list
+'perf iostat'  --  []
+
+DESCRIPTION
+---
+Mode is intended to provide four I/O performance metrics per each 
PCIe root port:

+
+- Inbound Read   - I/O devices below root port read from the host 
memory, in MB

+
+- Inbound Write  - I/O devices below root port write to the host 
memory, in MB

+
+- Outbound Read  - CPU reads from I/O devices below root port, in MB
+
+- Outbound Write - CPU writes to I/O devices below root port, in MB
+
+OPTIONS
+---
+...::
+    Any command you can specify in a shell.
+
+list::
+    List all PCIe root ports.


I noticed that the "iostat" command and the cmd_iostat() callback function are 
not registered in cmd_struct in perf.c. So I think "perf iostat list" 
perhaps cannot work properly.


I also test this patchset on x86 platform, and here is the log:

root@ubuntu:/home/lq# ./perf iostat list
perf: 'iostat' is not a perf-command. See 'perf --help'.
root@ubuntu:/home/lq# ./perf stat --iostat
^C
 Performance counter stats for 'system wide':

   port Inbound Read(MB)    Inbound Write(MB) Outbound 
Read(MB)   Outbound Write(MB)

:00    0 0    0  0
:80    0 0    0  0
:17    0 0    0  0
:85    0 0    0  0
:3a    0 0    0  0
:ae    0 0    0  0
:5d    0 0    0  0
:d7    0 0    0  0

   0.611303832 seconds time elapsed


root@ubuntu:/home/lq# ./perf stat --iostat=:17
^C
 Performance counter stats for 'system wide':

   port Inbound Read(MB)    Inbound Write(MB) Outbound 
Read(MB)   Outbound Write(MB)

:17    0 0    0  0

   0.521317572 seconds time elapsed

So how does following perf iostat list work, did I miss something?

Thanks,
Qi


Hello,

The 'iostat' mode uses the alias mechanism in perf, same as 'perf archive',
and in this case you don't need to add a function callback into cmd_struct.
For example, the command 'perf iostat list' will be converted to
'perf stat --iostat=list'.

After building the perf tool you should have two shell scripts in the tools/perf
directory, and one of them is executable, for example:
# make -C tools/perf
# ls -l tools/perf/perf-iostat*
-rwxr-xr-x 1 root root 290 Mar 10 18:17 perf-iostat
-rw-r--r-- 1 root root 290 Feb  3 15:14 perf-iostat.sh

It should be possible to run 'perf iostat' from the build directory:
# cd tools/perf
# ./perf iostat list
S0-uncore_iio_0<:00>
S1-uncore_iio_0<:80>
S0-uncore_iio_1<:17>
S1-uncore_iio_1<:85>
S0-uncore_iio_2<:3a>
S1-uncore_iio_2<:ae>
S0-uncore_iio_3<:5d>
S1-uncore_iio_3<:d7>

Also you can copy 'perf-iostat' to ~/libexec/perf-core/ or just run 
'make install'


# make install
# cp perf /usr/bin/
# ls -lh ~/libexec/perf-core/
total 24K
-rwxr-xr-x 1 root root 1.4K Mar 10 18:17 perf-archive
-rwxr-xr-x 1 root root  290 Mar 10 18:17 perf-iostat

[PATCH v5 2/3] arm64: dts: ti: k3-j7200-common-proc-board: Disable unused gpio modules

2021-03-10 Thread Aswath Govindraju
From: Faiz Abbas 

There are 6 gpio instances inside the SoC in 2 groups, as shown below:
Group one: wkup_gpio0, wkup_gpio1
Group two: main_gpio0, main_gpio2, main_gpio4, main_gpio6

Only one instance from each group can be used at a time. So use main_gpio0
and wkup_gpio0 in the current Linux context and disable the rest of the nodes.

Signed-off-by: Faiz Abbas 
Signed-off-by: Sekhar Nori 
Signed-off-by: Aswath Govindraju 
---
 .../boot/dts/ti/k3-j7200-common-proc-board.dts   | 16 
 1 file changed, 16 insertions(+)

diff --git a/arch/arm64/boot/dts/ti/k3-j7200-common-proc-board.dts 
b/arch/arm64/boot/dts/ti/k3-j7200-common-proc-board.dts
index 4a7182abccf5..b493f939b09a 100644
--- a/arch/arm64/boot/dts/ti/k3-j7200-common-proc-board.dts
+++ b/arch/arm64/boot/dts/ti/k3-j7200-common-proc-board.dts
@@ -122,6 +122,22 @@
status = "disabled";
 };
 
+_gpio2 {
+   status = "disabled";
+};
+
+_gpio4 {
+   status = "disabled";
+};
+
+_gpio6 {
+   status = "disabled";
+};
+
+_gpio1 {
+   status = "disabled";
+};
+
 _cpsw {
pinctrl-names = "default";
pinctrl-0 = <_cpsw_pins_default _mdio_pins_default>;
-- 
2.17.1



[PATCH v5 1/3] arm64: dts: ti: k3-j7200: Add gpio nodes

2021-03-10 Thread Aswath Govindraju
From: Faiz Abbas 

There are 4 instances of gpio modules in the main domain:
gpio0, gpio2, gpio4 and gpio6

Groups are created to provide protection between different processor
virtual worlds. Each of these modules' I/O pins are muxed within the
group. Exactly one module can be selected to control the corresponding
pin by selecting it in the pad mux configuration registers.

This group in the main domain pins out 69 lines (5 banks). Add DT nodes for
each module instance in the main domain.

Similar to the gpio groups in the main domain, there is one gpio group in
the wakeup domain with 2 module instances in it.

The gpio group pins out 72 pins (6 banks) of the first 85 gpio lines. Add
DT nodes for each module instance in the wakeup domain.

Signed-off-by: Faiz Abbas 
Signed-off-by: Sekhar Nori 
Signed-off-by: Aswath Govindraju 
---
 arch/arm64/boot/dts/ti/k3-j7200-main.dtsi | 72 +++
 .../boot/dts/ti/k3-j7200-mcu-wakeup.dtsi  | 34 +
 2 files changed, 106 insertions(+)

diff --git a/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi 
b/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi
index 17477ab0fd8e..e60650a62b14 100644
--- a/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi
+++ b/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi
@@ -672,6 +672,78 @@
};
};
 
+   main_gpio0: gpio@60 {
+   compatible = "ti,j721e-gpio", "ti,keystone-gpio";
+   reg = <0x00 0x0060 0x00 0x100>;
+   gpio-controller;
+   #gpio-cells = <2>;
+   interrupt-parent = <_gpio_intr>;
+   interrupts = <145>, <146>, <147>, <148>,
+<149>;
+   interrupt-controller;
+   #interrupt-cells = <2>;
+   #address-cells = <0>;
+   ti,ngpio = <69>;
+   ti,davinci-gpio-unbanked = <0>;
+   power-domains = <_pds 105 TI_SCI_PD_EXCLUSIVE>;
+   clocks = <_clks 105 0>;
+   clock-names = "gpio";
+   };
+
+   main_gpio2: gpio@61 {
+   compatible = "ti,j721e-gpio", "ti,keystone-gpio";
+   reg = <0x00 0x0061 0x00 0x100>;
+   gpio-controller;
+   #gpio-cells = <2>;
+   interrupt-parent = <_gpio_intr>;
+   interrupts = <154>, <155>, <156>, <157>,
+<158>;
+   interrupt-controller;
+   #interrupt-cells = <2>;
+   #address-cells = <0>;
+   ti,ngpio = <69>;
+   ti,davinci-gpio-unbanked = <0>;
+   power-domains = <_pds 107 TI_SCI_PD_EXCLUSIVE>;
+   clocks = <_clks 107 0>;
+   clock-names = "gpio";
+   };
+
+   main_gpio4: gpio@62 {
+   compatible = "ti,j721e-gpio", "ti,keystone-gpio";
+   reg = <0x00 0x0062 0x00 0x100>;
+   gpio-controller;
+   #gpio-cells = <2>;
+   interrupt-parent = <_gpio_intr>;
+   interrupts = <163>, <164>, <165>, <166>,
+<167>;
+   interrupt-controller;
+   #interrupt-cells = <2>;
+   #address-cells = <0>;
+   ti,ngpio = <69>;
+   ti,davinci-gpio-unbanked = <0>;
+   power-domains = <_pds 109 TI_SCI_PD_EXCLUSIVE>;
+   clocks = <_clks 109 0>;
+   clock-names = "gpio";
+   };
+
+   main_gpio6: gpio@63 {
+   compatible = "ti,j721e-gpio", "ti,keystone-gpio";
+   reg = <0x00 0x0063 0x00 0x100>;
+   gpio-controller;
+   #gpio-cells = <2>;
+   interrupt-parent = <_gpio_intr>;
+   interrupts = <172>, <173>, <174>, <175>,
+<176>;
+   interrupt-controller;
+   #interrupt-cells = <2>;
+   #address-cells = <0>;
+   ti,ngpio = <69>;
+   ti,davinci-gpio-unbanked = <0>;
+   power-domains = <_pds 111 TI_SCI_PD_EXCLUSIVE>;
+   clocks = <_clks 111 0>;
+   clock-names = "gpio";
+   };
+
main_r5fss0: r5fss@5c0 {
compatible = "ti,j7200-r5fss";
ti,cluster-mode = <1>;
diff --git a/arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi 
b/arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi
index 359e3e8a8cd0..1dd5b30edc6c 100644
--- a/arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi
+++ b/arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi
@@ -107,6 +107,40 @@
ti,interrupt-ranges = <16 960 16>;
};
 
+   wkup_gpio0: gpio@4211 {
+   compatible = "ti,j721e-gpio", "ti,keystone-gpio";
+   reg = <0x00 0x4211 0x00 0x100>;
+   gpio-controller;
+   #gpio-cells = <2>;
+   interrupt-parent = <_gpio_intr>;
+   interrupts = <103>, <104>, <105>, <106>, <107>, <108>;
+   interrupt-controller;
