[PATCH v4 17/28] afs: Print the operation debug_id when logging an unexpected data version

2021-03-10 Thread David Howells
Print the afs_operation debug_id when logging an unexpected change in the
data version.  This allows the logged message to be matched against
tracelines.
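
For illustration, the amended warning comes out along these lines (field
values hypothetical; the trailing op=%x is the afs_operation debug_id that
also shows up in the trace buffer):

    kAFS: vnode modified {100058:23860} 4fd26->4fd27 FS.FetchStatus (op=2f)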

Signed-off-by: David Howells 
cc: linux-...@lists.infradead.org
cc: linux-cach...@redhat.com
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/160588528377.3465195.2206051235095182302.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161118146111.1232039.11398082422487058312.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161042180.2537118.2471333561661033316.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340405772.1303470.3877167548944248214.st...@warthog.procyon.org.uk/
 # v3
---

 fs/afs/inode.c |5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 48519ee00eef..af6556bb3d6a 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -215,11 +215,12 @@ static void afs_apply_status(struct afs_operation *op,
 
if (vp->dv_before + vp->dv_delta != status->data_version) {
		if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
-			pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s\n",
+			pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s (op=%x)\n",
			vnode->fid.vid, vnode->fid.vnode,
			(unsigned long long)vp->dv_before + vp->dv_delta,
			(unsigned long long)status->data_version,
-   op->type ? op->type->name : "???");
+   op->type ? op->type->name : "???",
+   op->debug_id);
 
vnode->invalid_before = status->data_version;
if (vnode->status.type == AFS_FTYPE_DIR) {




[PATCH v4 15/28] afs: Disable use of the fscache I/O routines

2021-03-10 Thread David Howells
Disable use of the fscache I/O routines by the AFS filesystem.  It's about
to transition to passing iov_iters down, and fscache is about to have its
I/O path converted to use iov_iter, so all of that needs to change.

Signed-off-by: David Howells 
cc: linux-...@lists.infradead.org
cc: linux-cach...@redhat.com
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/158861209824.340223.1864211542341758994.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/159465768717.1376105.2229314852486665807.st...@warthog.procyon.org.uk/
Link: 
https://lore.kernel.org/r/160588457929.3465195.1730097418904945578.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161118143744.1232039.2727898205333669064.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161039077.2537118.7986870854927176905.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340403323.1303470.8159439948319423431.st...@warthog.procyon.org.uk/
 # v3
---

 fs/afs/file.c  |  199 ++--
 fs/afs/inode.c |2 -
 fs/afs/write.c |   10 ---
 3 files changed, 36 insertions(+), 175 deletions(-)

diff --git a/fs/afs/file.c b/fs/afs/file.c
index 85f5adf21aa0..6d43713fde01 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -203,24 +203,6 @@ void afs_put_read(struct afs_read *req)
}
 }
 
-#ifdef CONFIG_AFS_FSCACHE
-/*
- * deal with notification that a page was read from the cache
- */
-static void afs_file_readpage_read_complete(struct page *page,
-   void *data,
-   int error)
-{
-   _enter("%p,%p,%d", page, data, error);
-
-   /* if the read completes with an error, we just unlock the page and let
-* the VM reissue the readpage */
-   if (!error)
-   SetPageUptodate(page);
-   unlock_page(page);
-}
-#endif
-
 static void afs_fetch_data_success(struct afs_operation *op)
 {
struct afs_vnode *vnode = op->file[0].vnode;
@@ -288,89 +270,46 @@ int afs_page_filler(void *data, struct page *page)
	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
goto error;
 
-   /* is it cached? */
-#ifdef CONFIG_AFS_FSCACHE
-   ret = fscache_read_or_alloc_page(vnode->cache,
-page,
-afs_file_readpage_read_complete,
-NULL,
-GFP_KERNEL);
-#else
-   ret = -ENOBUFS;
-#endif
-   switch (ret) {
-   /* read BIO submitted (page in cache) */
-   case 0:
-   break;
-
-   /* page not yet cached */
-   case -ENODATA:
-   _debug("cache said ENODATA");
-   goto go_on;
-
-   /* page will not be cached */
-   case -ENOBUFS:
-   _debug("cache said ENOBUFS");
-
-   fallthrough;
-   default:
-   go_on:
-   req = kzalloc(struct_size(req, array, 1), GFP_KERNEL);
-   if (!req)
-   goto enomem;
-
-   /* We request a full page.  If the page is a partial one at the
-* end of the file, the server will return a short read and the
-* unmarshalling code will clear the unfilled space.
-*/
-	refcount_set(&req->usage, 1);
-   req->pos = (loff_t)page->index << PAGE_SHIFT;
-   req->len = PAGE_SIZE;
-   req->nr_pages = 1;
-   req->pages = req->array;
-   req->pages[0] = page;
-   get_page(page);
-
-   /* read the contents of the file from the server into the
-* page */
-   ret = afs_fetch_data(vnode, key, req);
-   afs_put_read(req);
-
-   if (ret < 0) {
-   if (ret == -ENOENT) {
-   _debug("got NOENT from server"
-  " - marking file deleted and stale");
-	set_bit(AFS_VNODE_DELETED, &vnode->flags);
-   ret = -ESTALE;
-   }
-
-#ifdef CONFIG_AFS_FSCACHE
-   fscache_uncache_page(vnode->cache, page);
-#endif
-   BUG_ON(PageFsCache(page));
-
-   if (ret == -EINTR ||
-   ret == -ENOMEM ||
-   ret == -ERESTARTSYS ||
-   ret == -EAGAIN)
-   goto error;
-   goto io_error;
-   }
+   req = kzalloc(struct_size(req, array, 1), GFP_KERNEL);
+   if (!req)
+   goto enomem;
 
-   SetPageUptodate(page);
+   /* We request a full page.  If the page is a partial one at the
+* end of the file, the server will return a short read and the
+	 * unmarshalling code will clear the unfilled space.

[PATCH v4 16/28] afs: Pass page into dirty region helpers to provide THP size

2021-03-10 Thread David Howells
Pass a pointer to the page being accessed into the dirty region helpers so
that the size of the page can be determined in case it's a transparent huge
page.

This also required the page to be passed into the afs_page_dirty trace
point - so there's no need to specifically pass in the index or private
data as these can be retrieved directly from the page struct.
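
To see what the page-size-aware encoding does, here is a minimal userspace
sketch of the same arithmetic.  It is an illustration under stated
assumptions, not kernel code: it hardcodes the 32-bit values of
__AFS_PAGE_PRIV_SHIFT (16) and __AFS_PAGE_PRIV_MASK (0x7fff) visible in the
hunks below, assumes PAGE_SHIFT == 12, and supplies a hypothetical encoder
consistent with the decode helpers in the diff (the real afs_page_dirty()
encoder is not shown in this excerpt):

    #include <stdio.h>

    #define PRIV_SHIFT 16       /* __AFS_PAGE_PRIV_SHIFT on 32-bit */
    #define PRIV_MASK  0x7fffUL /* __AFS_PAGE_PRIV_MASK on 32-bit */
    #define PAGE_SHIFT 12       /* assumed 4KiB base pages */

    /* Mirrors afs_page_dirty_resolution(): larger pages get coarser
     * dirty-region bounds so both bounds still fit in one long.
     */
    static unsigned int resolution(unsigned int thp_order)
    {
            int shift = thp_order + PAGE_SHIFT - (PRIV_SHIFT - 1);
            return shift > 0 ? shift : 0;
    }

    /* Hypothetical encoder, consistent with the decoders in the patch. */
    static unsigned long encode(unsigned int order, size_t from, size_t to)
    {
            unsigned int res = resolution(order);
            return (((to - 1) >> res) << PRIV_SHIFT) | (from >> res);
    }

    int main(void)
    {
            unsigned int order = 9;  /* 2MiB THP made of 4KiB pages */
            unsigned int res = resolution(order);
            unsigned long priv = encode(order, 100, 5000);
            size_t from = (priv & PRIV_MASK) << res;
            size_t to = (((priv >> PRIV_SHIFT) & PRIV_MASK) + 1) << res;

            /* Bounds widen to the 64-byte resolution: prints [64,5056). */
            printf("dirty [100,5000) stored as [%zu,%zu)\n", from, to);
            return 0;
    }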

Signed-off-by: David Howells 
cc: linux-...@lists.infradead.org
cc: linux-cach...@redhat.com
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/160588527183.3465195.16107942526481976308.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161118144921.1232039.11377711180492625929.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161040747.2537118.11435394902674511430.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340404553.1303470.11414163641767769882.st...@warthog.procyon.org.uk/
 # v3
---

 fs/afs/file.c  |   20 +++
 fs/afs/internal.h  |   16 ++--
 fs/afs/write.c |   60 ++--
 include/trace/events/afs.h |   23 ++---
 4 files changed, 55 insertions(+), 64 deletions(-)

diff --git a/fs/afs/file.c b/fs/afs/file.c
index 6d43713fde01..21868bfc3a44 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -515,8 +515,8 @@ static void afs_invalidate_dirty(struct page *page, unsigned int offset,
return;
 
/* We may need to shorten the dirty region */
-   f = afs_page_dirty_from(priv);
-   t = afs_page_dirty_to(priv);
+   f = afs_page_dirty_from(page, priv);
+   t = afs_page_dirty_to(page, priv);
 
if (t <= offset || f >= end)
return; /* Doesn't overlap */
@@ -534,17 +534,17 @@ static void afs_invalidate_dirty(struct page *page, unsigned int offset,
if (f == t)
goto undirty;
 
-   priv = afs_page_dirty(f, t);
+   priv = afs_page_dirty(page, f, t);
set_page_private(page, priv);
-   trace_afs_page_dirty(vnode, tracepoint_string("trunc"), page->index, 
priv);
+   trace_afs_page_dirty(vnode, tracepoint_string("trunc"), page);
return;
 
 undirty:
-   trace_afs_page_dirty(vnode, tracepoint_string("undirty"), page->index, 
priv);
+   trace_afs_page_dirty(vnode, tracepoint_string("undirty"), page);
clear_page_dirty_for_io(page);
 full_invalidate:
-   priv = (unsigned long)detach_page_private(page);
-   trace_afs_page_dirty(vnode, tracepoint_string("inval"), page->index, 
priv);
+   detach_page_private(page);
+   trace_afs_page_dirty(vnode, tracepoint_string("inval"), page);
 }
 
 /*
@@ -572,7 +572,6 @@ static void afs_invalidatepage(struct page *page, unsigned int offset,
 static int afs_releasepage(struct page *page, gfp_t gfp_flags)
 {
struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
-   unsigned long priv;
 
_enter("{{%llx:%llu}[%lu],%lx},%x",
   vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
@@ -581,9 +580,8 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags)
/* deny if page is being written to the cache and the caller hasn't
 * elected to wait */
if (PagePrivate(page)) {
-   priv = (unsigned long)detach_page_private(page);
-   trace_afs_page_dirty(vnode, tracepoint_string("rel"),
-page->index, priv);
+   detach_page_private(page);
+   trace_afs_page_dirty(vnode, tracepoint_string("rel"), page);
}
 
/* indicate that the page can be released */
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index b626e38e9ab5..180eae8134da 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -875,31 +875,31 @@ struct afs_vnode_cache_aux {
 #define __AFS_PAGE_PRIV_MMAPPED	0x8000UL
 #endif
 
-static inline unsigned int afs_page_dirty_resolution(void)
+static inline unsigned int afs_page_dirty_resolution(struct page *page)
 {
-   int shift = PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1);
+   int shift = thp_order(page) + PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1);
return (shift > 0) ? shift : 0;
 }
 
-static inline size_t afs_page_dirty_from(unsigned long priv)
+static inline size_t afs_page_dirty_from(struct page *page, unsigned long priv)
 {
unsigned long x = priv & __AFS_PAGE_PRIV_MASK;
 
/* The lower bound is inclusive */
-   return x << afs_page_dirty_resolution();
+   return x << afs_page_dirty_resolution(page);
 }
 
-static inline size_t afs_page_dirty_to(unsigned long priv)
+static inline size_t afs_page_dirty_to(struct page *page, unsigned long priv)
 {
	unsigned long x = (priv >> __AFS_PAGE_PRIV_SHIFT) & __AFS_PAGE_PRIV_MASK;
 
/* The upper bound is immediately beyond the region */
-   return (x + 1) << afs_page_dirty_resolution();
+   return (x + 1) << afs_page_dirty_resolution(page);

[PATCH v4 14/28] fscache, cachefiles: Add alternate API to use kiocb for read/write to cache

2021-03-10 Thread David Howells
Add an alternate API by which the cache can be accessed through a kiocb,
doing async DIO, rather than using the current API that tells the cache
where all the pages are.

The new API is intended to be used in conjunction with the netfs helper
library.  A filesystem must pick one or the other and not mix them.

Filesystems wanting to use the new API must #define FSCACHE_USE_NEW_IO_API
before #including the header.  This prevents them from continuing to use
the old API at the same time as there are incompatibilities in how the
PG_fscache page bit is used.
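
For illustration, a filesystem opting in does something like the following
in its own code (a sketch; the define must come before the include, and is
intended to stop the old page-based API being used at the same time):

    /* Select the kiocb-based cache I/O API before pulling in fscache.h. */
    #define FSCACHE_USE_NEW_IO_API
    #include <linux/fscache.h>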

Changes:
 - Use the vfs_iocb_iter_read/write() helpers[1]
 - Move initial definition of fscache_begin_read_operation() here.
 - Remove a commented-out line[2]
 - Combine ki->term_func calls in cachefiles_read_complete()[2].
 - Remove explicit NULL initialiser[2].
 - Remove extern on func decl[2].
 - Put in param names on func decl[2].
 - Remove redundant else[2].
 - Fill out the kdoc comment for fscache_begin_read_operation().
 - Rename fs/fscache/page2.c to io.c to match later patches.

Signed-off-by: David Howells 
Reviewed-by: Jeff Layton 
cc: Christoph Hellwig 
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/161118142558.1232039.17993829899588971439.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161037850.2537118.8819808229350326503.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340402057.1303470.8038373593844486698.st...@warthog.procyon.org.uk/
 # v3
Link: https://lore.kernel.org/r/20210216102614.ga27...@lst.de/ [1]
Link: https://lore.kernel.org/r/20210216084230.ga23...@lst.de/ [2]
---

 fs/cachefiles/Makefile|1 
 fs/cachefiles/interface.c |5 -
 fs/cachefiles/internal.h  |9 +
 fs/cachefiles/rdwr2.c |  403 +
 fs/fscache/Kconfig|1 
 fs/fscache/Makefile   |1 
 fs/fscache/internal.h |4 
 fs/fscache/io.c   |  116 
 fs/fscache/page.c |2 
 fs/fscache/stats.c|1 
 include/linux/fscache-cache.h |4 
 include/linux/fscache.h   |   39 
 12 files changed, 583 insertions(+), 3 deletions(-)
 create mode 100644 fs/cachefiles/rdwr2.c
 create mode 100644 fs/fscache/io.c

diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile
index 891dedda5905..ea17b169ea5e 100644
--- a/fs/cachefiles/Makefile
+++ b/fs/cachefiles/Makefile
@@ -11,6 +11,7 @@ cachefiles-y := \
main.o \
namei.o \
rdwr.o \
+   rdwr2.o \
security.o \
xattr.o
 
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 5efa6a3702c0..da3948fdb615 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -319,8 +319,8 @@ static void cachefiles_drop_object(struct fscache_object *_object)
 /*
  * dispose of a reference to an object
  */
-static void cachefiles_put_object(struct fscache_object *_object,
- enum fscache_obj_ref_trace why)
+void cachefiles_put_object(struct fscache_object *_object,
+  enum fscache_obj_ref_trace why)
 {
struct cachefiles_object *object;
struct fscache_cache *cache;
@@ -568,4 +568,5 @@ const struct fscache_cache_ops cachefiles_cache_ops = {
.uncache_page   = cachefiles_uncache_page,
.dissociate_pages   = cachefiles_dissociate_pages,
.check_consistency  = cachefiles_check_consistency,
+   .begin_read_operation   = cachefiles_begin_read_operation,
 };
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index cf9bd6401c2d..4ed83aa5253b 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -150,6 +150,9 @@ extern int cachefiles_has_space(struct cachefiles_cache *cache,
  */
 extern const struct fscache_cache_ops cachefiles_cache_ops;
 
+void cachefiles_put_object(struct fscache_object *_object,
+  enum fscache_obj_ref_trace why);
+
 /*
  * key.c
  */
@@ -217,6 +220,12 @@ extern int cachefiles_allocate_pages(struct fscache_retrieval *,
 extern int cachefiles_write_page(struct fscache_storage *, struct page *);
 extern void cachefiles_uncache_page(struct fscache_object *, struct page *);
 
+/*
+ * rdwr2.c
+ */
+extern int cachefiles_begin_read_operation(struct netfs_read_request *,
+  struct fscache_retrieval *);
+
 /*
  * security.c
  */
diff --git a/fs/cachefiles/rdwr2.c b/fs/cachefiles/rdwr2.c
new file mode 100644
index ..620959d1e95b
--- /dev/null
+++ b/fs/cachefiles/rdwr2.c
@@ -0,0 +1,403 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* kiocb-using read/write
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowe...@redhat.com)

[PATCH v4 13/28] netfs: Hold a ref on a page when PG_private_2 is set

2021-03-10 Thread David Howells
Take a reference on a page when PG_private_2 is set and drop it once the
bit is unlocked[1].
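
In outline, the two sides of the pairing look like this (a condensed
sketch of the hunks below, not a verbatim extract):

    /* Marking side (netfs_rreq_unlock): pin the page for as long as
     * PG_fscache (PG_private_2) is set on it.
     */
    get_page(page);
    SetPageFsCache(page);

    /* Unmarking side (netfs_rreq_unmark_after_write): unlock the bit,
     * then drop the pinning refs in batches via a pagevec.
     */
    unlock_page_fscache(page);
    if (pagevec_add(&pvec, page) == 0)
            pagevec_release(&pvec);  /* puts the accumulated page refs */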

Reported-by: Linus Torvalds 
Signed-off-by: David Howells 
cc: Matthew Wilcox 
cc: Linus Torvalds 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/CAHk-=wh+2gbF7XEjYc=HV9w_2uVzVf7vs60BPz0gFA=+pum...@mail.gmail.com/
 [1]
Link: https://lore.kernel.org/r/1331025.1612974...@warthog.procyon.org.uk/
Link: 
https://lore.kernel.org/r/161340400833.1303470.8070750156044982282.st...@warthog.procyon.org.uk/
 # v3
---

 fs/netfs/read_helper.c |   10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
index 0a7b76c11c98..ce11ca4c32e4 100644
--- a/fs/netfs/read_helper.c
+++ b/fs/netfs/read_helper.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include <linux/pagevec.h>
 #include 
 #include 
 #include 
@@ -238,10 +239,13 @@ static void netfs_rreq_unmark_after_write(struct netfs_read_request *rreq,
  bool was_async)
 {
struct netfs_read_subrequest *subreq;
+   struct pagevec pvec;
struct page *page;
pgoff_t unlocked = 0;
bool have_unlocked = false;
 
+	pagevec_init(&pvec);
+
rcu_read_lock();
 
	list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
@@ -255,6 +259,8 @@ static void netfs_rreq_unmark_after_write(struct netfs_read_request *rreq,
continue;
unlocked = page->index;
unlock_page_fscache(page);
+			if (pagevec_add(&pvec, page) == 0)
+				pagevec_release(&pvec);
have_unlocked = true;
}
}
@@ -413,8 +419,10 @@ static void netfs_rreq_unlock(struct netfs_read_request *rreq)
pg_failed = true;
break;
}
-			if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags))
+			if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags)) {
+   get_page(page);
SetPageFsCache(page);
+   }
pg_failed |= subreq_failed;
if (pgend < iopos + subreq->len)
break;




[PATCH v4 12/28] netfs: Define an interface to talk to a cache

2021-03-10 Thread David Howells
Add an interface to the netfs helper library for reading data from the
cache instead of downloading it from the server and support for writing
data just downloaded or cleared to the cache.

The API passes an iov_iter to the cache read/write routines to indicate the
data/buffer to be used.  This is done using the ITER_XARRAY type to provide
direct access to the netfs inode's pagecache.

When the netfs's ->begin_cache_operation() method is called, this must fill
in the cache_resources in the netfs_read_request struct, including the
netfs_cache_ops used by the helper lib to talk to the cache.  The helper
lib does not directly access the cache.
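
For a netfs that uses fscache, the hook can be little more than a
pass-through (a sketch; the my_netfs_* names are hypothetical, while
fscache_begin_read_operation() is the entry point a later patch in this
series provides for exactly this purpose):

    /* Sketch: hook the local cache into the read helpers.  The helper
     * library never calls fscache itself; it only uses the
     * netfs_cache_ops that this fills into rreq->cache_resources.
     */
    static int my_netfs_begin_cache_operation(struct netfs_read_request *rreq)
    {
            struct my_netfs_inode *ni = MY_NETFS_I(rreq->inode); /* hypothetical */

            return fscache_begin_read_operation(rreq, ni->cache);
    }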

Changes
- Added flag to netfs_subreq_terminated() to indicate that the caller may
  have been running async and stuff that might sleep needs punting to a
  workqueue (can't use in_softirq()[1]).
- Add missing inc of netfs_n_rh_read stat.
- Move initial definition of fscache_begin_read_operation() elsewhere.
- Need to call op->begin_cache_operation() from netfs_write_begin().

Signed-off-by: David Howells 
Reviewed-by: Jeff Layton 
cc: Matthew Wilcox 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/161118141321.1232039.8296910406755622458.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161036700.2537118.11170748455436854978.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340399569.1303470.1138884774643385730.st...@warthog.procyon.org.uk/
 # v3
Link: https://lore.kernel.org/r/20210216084230.ga23...@lst.de/ [1]
---

 fs/netfs/read_helper.c |  241 
 include/linux/netfs.h  |   49 ++
 2 files changed, 289 insertions(+), 1 deletion(-)

diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
index ce3a2f5844d1..0a7b76c11c98 100644
--- a/fs/netfs/read_helper.c
+++ b/fs/netfs/read_helper.c
@@ -88,6 +88,8 @@ static void netfs_free_read_request(struct work_struct *work)
if (rreq->netfs_priv)
rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv);
trace_netfs_rreq(rreq, netfs_rreq_trace_free);
+	if (rreq->cache_resources.ops)
+		rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
kfree(rreq);
netfs_stat_d(_n_rh_rreq);
 }
@@ -154,6 +156,34 @@ static void netfs_clear_unread(struct netfs_read_subrequest *subreq)
	iov_iter_zero(iov_iter_count(&iter), &iter);
 }
 
+static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
+   bool was_async)
+{
+   struct netfs_read_subrequest *subreq = priv;
+
+   netfs_subreq_terminated(subreq, transferred_or_error, was_async);
+}
+
+/*
+ * Issue a read against the cache.
+ * - Eats the caller's ref on subreq.
+ */
+static void netfs_read_from_cache(struct netfs_read_request *rreq,
+ struct netfs_read_subrequest *subreq,
+ bool seek_data)
+{
+	struct netfs_cache_resources *cres = &rreq->cache_resources;
+   struct iov_iter iter;
+
+   netfs_stat(_n_rh_read);
+	iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages,
+   subreq->start + subreq->transferred,
+   subreq->len   - subreq->transferred);
+
+	cres->ops->read(cres, subreq->start, &iter, seek_data,
+   netfs_cache_read_terminated, subreq);
+}
+
 /*
  * Fill a subrequest region with zeroes.
  */
@@ -198,6 +228,144 @@ static void netfs_rreq_completed(struct netfs_read_request *rreq, bool was_async
netfs_put_read_request(rreq, was_async);
 }
 
+/*
+ * Deal with the completion of writing the data to the cache.  We have to clear
+ * the PG_fscache bits on the pages involved and release the caller's ref.
+ *
+ * May be called in softirq mode and we inherit a ref from the caller.
+ */
+static void netfs_rreq_unmark_after_write(struct netfs_read_request *rreq,
+ bool was_async)
+{
+   struct netfs_read_subrequest *subreq;
+   struct page *page;
+   pgoff_t unlocked = 0;
+   bool have_unlocked = false;
+
+   rcu_read_lock();
+
+	list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+		XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE);
+
+		xas_for_each(&xas, page, (subreq->start + subreq->len - 1) / PAGE_SIZE) {
+   /* We might have multiple writes from the same huge
+* page, but we mustn't unlock a page more than once.
+*/
+   if (have_unlocked && page->index <= unlocked)
+   continue;
+   unlocked = page->index;
+   

Re: MaxLinear, please maintain your drivers was Re: [PATCH] leds: lgm: fix gpiolib dependency

2021-03-10 Thread Arnd Bergmann
On Wed, Mar 10, 2021 at 8:30 AM Pavel Machek  wrote:
>
> Hi!
>
> > > > I'd like people from Intel to contact me. There's more to fix there,
> > > > and AFAICT original author went away.
> > >
> > > The following message to  was
> > > undeliverable.
> >
> > > : Recipient
> > > +address rejected: User unknown in virtual mailbox table'
> >
> > > commit c3987cd2bca34ddfec69027acedb2fae5ffcf7a0
> > > Author: Amireddy Mallikarjuna reddy 
> >
> > I asked around, and got told Mallikarjuna has been "sold" to MaxLinear,
> > together with the rest of the Connected Home Division.  So he most likely
> > still works on this stuff, just under a different banner.
> >
> > > If someone knows how to contact the author, that would be welcome.
> >
> > Alas, no idea about his MaxLinear address.
>
> Thanks for the effort. Anyway, I suspect I'll just do this:

Maybe Hauke or John  (added both to cc) know who at MaxLinear is
responsible for maintaining the Lightning Mountain drivers now.

   Arnd

> diff --git a/drivers/leds/blink/Kconfig b/drivers/leds/blink/Kconfig
> index 6dedc58c47b3..79493f21d365 100644
> --- a/drivers/leds/blink/Kconfig
> +++ b/drivers/leds/blink/Kconfig
> @@ -1,14 +1,6 @@
> -menuconfig LEDS_BLINK
> -   bool "LED Blink support"
> -   depends on LEDS_CLASS
> -   help
> - This option enables blink support for the leds class.
> - If unsure, say Y.
> -
> -if LEDS_BLINK
> -
>  config LEDS_BLINK_LGM
> tristate "LED support for Intel LGM SoC series"
> +   depends on BROKEN
> depends on GPIOLIB
> depends on LEDS_CLASS
> depends on MFD_SYSCON
> @@ -17,5 +9,3 @@ config LEDS_BLINK_LGM
>   Parallel to serial conversion, which is also called SSO controller,
>   can drive external shift register for LED outputs.
>   This enables LED support for Serial Shift Output controller(SSO).
> -
> -endif # LEDS_BLINK
>
>
> --
> http://www.livejournal.com/~pavelmachek


[PATCH v4 11/28] netfs: Add write_begin helper

2021-03-10 Thread David Howells
Add a helper to do the pre-reading work for the netfs write_begin address
space op.

Changes
- Added flag to netfs_subreq_terminated() to indicate that the caller may
  have been running async and stuff that might sleep needs punting to a
  workqueue (can't use in_softirq()[1]).
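
A filesystem would typically wrap the helper in its ->write_begin()
address_space operation, roughly as below (a sketch; my_netfs_req_ops
stands for the netfs_read_request_ops table described in an earlier patch):

    static int my_netfs_write_begin(struct file *file,
                                    struct address_space *mapping,
                                    loff_t pos, unsigned int len,
                                    unsigned int flags,
                                    struct page **pagep, void **fsdata)
    {
            /* netfs_write_begin() pre-reads from the cache or the server
             * as needed and returns the grabbed, locked page in *pagep.
             */
            return netfs_write_begin(file, mapping, pos, len, flags,
                                     pagep, fsdata, &my_netfs_req_ops, NULL);
    }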

Signed-off-by: David Howells 
Reviewed-by: Jeff Layton 
cc: Matthew Wilcox 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/160588543960.3465195.2792938973035886168.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161118140165.1232039.16418853874312234477.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161035539.2537118.15674887534950908530.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340398368.1303470.11242918276563276090.st...@warthog.procyon.org.uk/
 # v3
Link: https://lore.kernel.org/r/20210216084230.ga23...@lst.de/ [1]
---

 fs/netfs/internal.h  |2 +
 fs/netfs/read_helper.c   |  165 ++
 fs/netfs/stats.c |   11 ++-
 include/linux/netfs.h|8 ++
 include/trace/events/netfs.h |4 +
 5 files changed, 186 insertions(+), 4 deletions(-)

diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 98b6f4516da1..b7f2c4459f33 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -34,8 +34,10 @@ extern atomic_t netfs_n_rh_read_failed;
 extern atomic_t netfs_n_rh_zero;
 extern atomic_t netfs_n_rh_short_read;
 extern atomic_t netfs_n_rh_write;
+extern atomic_t netfs_n_rh_write_begin;
 extern atomic_t netfs_n_rh_write_done;
 extern atomic_t netfs_n_rh_write_failed;
+extern atomic_t netfs_n_rh_write_zskip;
 
 
 static inline void netfs_stat(atomic_t *stat)
diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
index 8b03fcc2f8f1..ce3a2f5844d1 100644
--- a/fs/netfs/read_helper.c
+++ b/fs/netfs/read_helper.c
@@ -766,3 +766,168 @@ int netfs_readpage(struct file *file,
return ret;
 }
 EXPORT_SYMBOL(netfs_readpage);
+
+static void netfs_clear_thp(struct page *page)
+{
+   unsigned int i;
+
+   for (i = 0; i < thp_nr_pages(page); i++)
+   clear_highpage(page + i);
+}
+
+/**
+ * netfs_write_begin - Helper to prepare for writing
+ * @file: The file to read from
+ * @mapping: The mapping to read from
+ * @pos: File position at which the write will begin
+ * @len: The length of the write in this page
+ * @flags: AOP_* flags
+ * @_page: Where to put the resultant page
+ * @_fsdata: Place for the netfs to store a cookie
+ * @ops: The network filesystem's operations for the helper to use
+ * @netfs_priv: Private netfs data to be retained in the request
+ *
+ * Pre-read data for a write-begin request by drawing data from the cache if
+ * possible, or the netfs if not.  Space beyond the EOF is zero-filled.
+ * Multiple I/O requests from different sources will get munged together.  If
+ * necessary, the readahead window can be expanded in either direction to a
+ * more convenient alignment for RPC efficiency or to make storage in the cache
+ * feasible.
+ *
+ * The calling netfs must provide a table of operations, only one of which,
+ * issue_op, is mandatory.
+ *
+ * The check_write_begin() operation can be provided to check for and flush
+ * conflicting writes once the page is grabbed and locked.  It is passed a
+ * pointer to the fsdata cookie that gets returned to the VM to be passed to
+ * write_end.  It is permitted to sleep.  It should return 0 if the request
+ * should go ahead; unlock the page and return -EAGAIN to cause the page to be
+ * regot; or return an error.
+ *
+ * This is usable whether or not caching is enabled.
+ */
+int netfs_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned int len, unsigned int flags,
+ struct page **_page, void **_fsdata,
+ const struct netfs_read_request_ops *ops,
+ void *netfs_priv)
+{
+   struct netfs_read_request *rreq;
+   struct page *page, *xpage;
+   struct inode *inode = file_inode(file);
+   unsigned int debug_index = 0;
+   pgoff_t index = pos >> PAGE_SHIFT;
+   int pos_in_page = pos & ~PAGE_MASK;
+   loff_t size;
+   int ret;
+
+   struct readahead_control ractl = {
+   .file   = file,
+   .mapping= mapping,
+   ._index = index,
+   ._nr_pages  = 0,
+   };
+
+retry:
+   page = grab_cache_page_write_begin(mapping, index, 0);
+   if (!page)
+   return -ENOMEM;
+
+   if (ops->check_write_begin) {
+   /* Allow the netfs (eg. ceph) to flush conflicts. */
+   ret = ops->check_write_begin(file, pos, len, 

Re: [PATCH v4 2/4] hugetlb/userfaultfd: Forbid huge pmd sharing when uffd enabled

2021-03-10 Thread Peter Xu
On Wed, Mar 10, 2021 at 01:18:42PM +0530, Naresh Kamboju wrote:
> Hi Peter,

Hi, Naresh,

> 
> On Fri, 19 Feb 2021 at 04:43, Peter Xu  wrote:
> >
> > Huge pmd sharing could bring problem to userfaultfd.  The thing is that
> > userfaultfd is running its logic based on the special bits on page table
> > entries, however the huge pmd sharing could potentially share page table
> > entries for different address ranges.  That could cause issues on either:
> >
> >   - When sharing huge pmd page tables for an uffd write protected range, the
> > newly mapped huge pmd range will also be write protected unexpectedly, 
> > or,
> >
> >   - When we try to write protect a range of huge pmd shared range, we'll 
> > first
> > do huge_pmd_unshare() in hugetlb_change_protection(), however that also
> > means the UFFDIO_WRITEPROTECT could be silently skipped for the shared
> > region, which could lead to data loss.
> >
> > Since at it, a few other things are done altogether:
> >
> >   - Move want_pmd_share() from mm/hugetlb.c into linux/hugetlb.h, because
> > that's definitely something that arch code would like to use too
> >
> >   - ARM64 currently directly check against CONFIG_ARCH_WANT_HUGE_PMD_SHARE 
> > when
> > trying to share huge pmd.  Switch to the want_pmd_share() helper.
> >
> > Since at it, move vma_shareable() from huge_pmd_share() into 
> > want_pmd_share().
> >
> > Reviewed-by: Mike Kravetz 
> > Reviewed-by: Axel Rasmussen 
> > Signed-off-by: Peter Xu 
> > ---
> >  arch/arm64/mm/hugetlbpage.c   |  3 +--
> >  include/linux/hugetlb.h   |  2 ++
> >  include/linux/userfaultfd_k.h |  9 +
> >  mm/hugetlb.c  | 20 ++--
> >  4 files changed, 26 insertions(+), 8 deletions(-)
> >
> > diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c
> > index 6e3bcffe2837..58987a98e179 100644
> > --- a/arch/arm64/mm/hugetlbpage.c
> > +++ b/arch/arm64/mm/hugetlbpage.c
> > @@ -284,8 +284,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
> >  */
> > ptep = pte_alloc_map(mm, pmdp, addr);
> > } else if (sz == PMD_SIZE) {
> > -   if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) &&
> > -   pud_none(READ_ONCE(*pudp)))
> > +   if (want_pmd_share(vma, addr) && pud_none(READ_ONCE(*pudp)))
> 
> While building Linux next 20210310 tag for arm64 architecture with
> 
>   - CONFIG_ARM64_64K_PAGES=y
> 
> enabled the build failed due to below errors / warnings
> 
> make --silent --keep-going --jobs=8
> O=/home/tuxbuild/.cache/tuxmake/builds/1/tmp ARCH=arm64
> CROSS_COMPILE=aarch64-linux-gnu- 'CC=sccache aarch64-linux-gnu-gcc'
> 'HOSTCC=sccache gcc'
> aarch64-linux-gnu-ld: Unexpected GOT/PLT entries detected!
> aarch64-linux-gnu-ld: Unexpected run-time procedure linkages detected!
> aarch64-linux-gnu-ld: arch/arm64/mm/hugetlbpage.o: in function `huge_pte_alloc':
> hugetlbpage.c:(.text+0x7d8): undefined reference to `want_pmd_share'
> 
> Reported-by: Naresh Kamboju 

Sorry for the issue & thanks for the report.  Would you please check whether
the patch attached could fix the issue?

-- 
Peter Xu
From 4072042b415d1f78ac1ab017671e345b414ca6ab Mon Sep 17 00:00:00 2001
From: Peter Xu 
Date: Wed, 10 Mar 2021 11:48:56 -0500
Subject: [PATCH] mm/hugetlb: Fix build with !ARCH_WANT_HUGE_PMD_SHARE

want_pmd_share() is undefined with !ARCH_WANT_HUGE_PMD_SHARE since it's put
by accident into a "#ifdef ARCH_WANT_HUGE_PMD_SHARE" block.  Moving it out
won't work either since vma_shareable() is only defined within the block.
Define it for !ARCH_WANT_HUGE_PMD_SHARE instead.
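
In outline, the resulting structure of mm/hugetlb.c is (a sketch, with the
real function bodies elided):

    #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
    static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
    {
            return true;  /* stand-in for the real alignment/VMA checks */
    }

    bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
    {
            return vma_shareable(vma, addr);  /* plus the uffd checks */
    }
    #else  /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
    /* The fix: a stub here gives callers such as arm64's huge_pte_alloc()
     * something to link against when the arch doesn't opt in.
     */
    bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
    {
            return false;
    }
    #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */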

Fixes: 5b109cc1cdcc ("hugetlb/userfaultfd: forbid huge pmd sharing when uffd enabled")
Cc: Andrew Morton 
Cc: Mike Kravetz 
Cc: Axel Rasmussen 
Reported-by: Naresh Kamboju 
Signed-off-by: Peter Xu 
---
 mm/hugetlb.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d58f1456fe27..8dda7e034477 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5469,9 +5469,6 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
 
 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
 {
-#ifndef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
-   return false;
-#endif
 #ifdef CONFIG_USERFAULTFD
if (uffd_disable_huge_pmd_share(vma))
return false;
@@ -5616,6 +5613,11 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
 {
 }
+
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
+{
+   return false;
+}
 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
 
 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
-- 
2.26.2



[PATCH v4 10/28] netfs: Gather stats

2021-03-10 Thread David Howells
Gather statistics from the netfs interface that can be exported through a
seqfile.  This is intended to be called by a later patch when viewing
/proc/fs/fscache/stats.
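
The body of fs/netfs/stats.c is not shown in this excerpt; a sketch of what
the show side might look like, using a few of the counters declared below
(the real netfs_stats_show() and its output format may differ):

    #include <linux/seq_file.h>
    #include "internal.h"  /* declares the netfs_n_rh_* counters */

    int netfs_stats_show(struct seq_file *m, void *v)
    {
            seq_printf(m, "RdHelp : rr=%u sr=%u RA=%u RP=%u\n",
                       atomic_read(&netfs_n_rh_rreq),
                       atomic_read(&netfs_n_rh_sreq),
                       atomic_read(&netfs_n_rh_readahead),
                       atomic_read(&netfs_n_rh_readpage));
            return 0;
    }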

Signed-off-by: David Howells 
Reviewed-by: Jeff Layton 
cc: Matthew Wilcox 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/161118139247.1232039.10556850937548511068.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161034669.2537118.2761232524997091480.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340397101.1303470.17581910581108378458.st...@warthog.procyon.org.uk/
 # v3
---

 fs/netfs/Kconfig   |   15 +
 fs/netfs/Makefile  |3 +--
 fs/netfs/internal.h|   34 ++
 fs/netfs/read_helper.c |   23 
 fs/netfs/stats.c   |   54 
 include/linux/netfs.h  |1 +
 6 files changed, 128 insertions(+), 2 deletions(-)
 create mode 100644 fs/netfs/stats.c

diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig
index 2ebf90e6ca95..578112713703 100644
--- a/fs/netfs/Kconfig
+++ b/fs/netfs/Kconfig
@@ -6,3 +6,18 @@ config NETFS_SUPPORT
  This option enables support for network filesystems, including
  helpers for high-level buffered I/O, abstracting out read
  segmentation, local caching and transparent huge page support.
+
+config NETFS_STATS
+   bool "Gather statistical information on local caching"
+   depends on NETFS_SUPPORT && PROC_FS
+   help
+ This option causes statistical information to be gathered on local
+ caching and exported through file:
+
+   /proc/fs/fscache/stats
+
+ The gathering of statistics adds a certain amount of overhead to
+ execution as there are quite a few stats gathered, and on a
+ multi-CPU system these may be on cachelines that keep bouncing
+ between CPUs.  On the other hand, the stats are very useful for
+ debugging purposes.  Saying 'Y' here is recommended.
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index 4b4eff2ba369..c15bfc966d96 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
 
-netfs-y := \
-   read_helper.o
+netfs-y := read_helper.o stats.o
 
 obj-$(CONFIG_NETFS_SUPPORT) := netfs.o
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index ee665c0e7dc8..98b6f4516da1 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -16,8 +16,42 @@
  */
 extern unsigned int netfs_debug;
 
+/*
+ * stats.c
+ */
+#ifdef CONFIG_NETFS_STATS
+extern atomic_t netfs_n_rh_readahead;
+extern atomic_t netfs_n_rh_readpage;
+extern atomic_t netfs_n_rh_rreq;
+extern atomic_t netfs_n_rh_sreq;
+extern atomic_t netfs_n_rh_download;
+extern atomic_t netfs_n_rh_download_done;
+extern atomic_t netfs_n_rh_download_failed;
+extern atomic_t netfs_n_rh_download_instead;
+extern atomic_t netfs_n_rh_read;
+extern atomic_t netfs_n_rh_read_done;
+extern atomic_t netfs_n_rh_read_failed;
+extern atomic_t netfs_n_rh_zero;
+extern atomic_t netfs_n_rh_short_read;
+extern atomic_t netfs_n_rh_write;
+extern atomic_t netfs_n_rh_write_done;
+extern atomic_t netfs_n_rh_write_failed;
+
+
+static inline void netfs_stat(atomic_t *stat)
+{
+   atomic_inc(stat);
+}
+
+static inline void netfs_stat_d(atomic_t *stat)
+{
+   atomic_dec(stat);
+}
+
+#else
 #define netfs_stat(x) do {} while(0)
 #define netfs_stat_d(x) do {} while(0)
+#endif
 
 /*****************************************************************************/
 /*
diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
index e64d9dad0c36..8b03fcc2f8f1 100644
--- a/fs/netfs/read_helper.c
+++ b/fs/netfs/read_helper.c
@@ -56,6 +56,7 @@ static struct netfs_read_request *netfs_alloc_read_request(
	refcount_set(&rreq->usage, 1);
	__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
	ops->init_rreq(rreq, file);
+	netfs_stat(&netfs_n_rh_rreq);
}
 
return rreq;
@@ -88,6 +89,7 @@ static void netfs_free_read_request(struct work_struct *work)
rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv);
trace_netfs_rreq(rreq, netfs_rreq_trace_free);
kfree(rreq);
+	netfs_stat_d(&netfs_n_rh_rreq);
 }
 
 static void netfs_put_read_request(struct netfs_read_request *rreq, bool was_async)
@@ -117,6 +119,7 @@ static struct netfs_read_subrequest *netfs_alloc_subrequest(
	refcount_set(&subreq->usage, 2);
	subreq->rreq = rreq;
	netfs_get_read_request(rreq);
+	netfs_stat(&netfs_n_rh_sreq);
}
 
return subreq;
@@ -134,6 +137,7 @@ static void __netfs_put_subrequest(struct netfs_read_subrequest *subreq,
 

[PATCH v4 09/28] netfs: Add tracepoints

2021-03-10 Thread David Howells
Add three tracepoints to track the activity of the read helpers:

 (1) netfs/netfs_read

 This logs entry to the read helpers and also expansion of the range in
 a readahead request.

 (2) netfs/netfs_rreq

 This logs the progress of netfs_read_request objects which track
 read requests.  A read request may be a compound of multiple
 subrequests.

 (3) netfs/netfs_sreq

 This logs the progress of netfs_read_subrequest objects, which track
 the contributions from various sources to a read request.

Signed-off-by: David Howells 
Reviewed-by: Jeff Layton 
cc: Matthew Wilcox 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/161118138060.1232039.5353374588021776217.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161033468.2537118.14021843889844001905.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340395843.1303470.7355519662919639648.st...@warthog.procyon.org.uk/
 # v3
---

 fs/netfs/read_helper.c   |   28 ++
 include/linux/netfs.h|2 
 include/trace/events/netfs.h |  199 ++
 3 files changed, 229 insertions(+)
 create mode 100644 include/trace/events/netfs.h

diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c
index c5ed0dcc9571..e64d9dad0c36 100644
--- a/fs/netfs/read_helper.c
+++ b/fs/netfs/read_helper.c
@@ -16,6 +16,8 @@
 #include 
 #include 
 #include "internal.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/netfs.h>
 
 MODULE_DESCRIPTION("Network fs support");
 MODULE_AUTHOR("Red Hat, Inc.");
@@ -39,6 +41,7 @@ static struct netfs_read_request *netfs_alloc_read_request(
const struct netfs_read_request_ops *ops, void *netfs_priv,
struct file *file)
 {
+   static atomic_t debug_ids;
struct netfs_read_request *rreq;
 
rreq = kzalloc(sizeof(struct netfs_read_request), GFP_KERNEL);
@@ -47,6 +50,7 @@ static struct netfs_read_request *netfs_alloc_read_request(
rreq->netfs_priv = netfs_priv;
rreq->inode = file_inode(file);
rreq->i_size= i_size_read(rreq->inode);
	rreq->debug_id	= atomic_inc_return(&debug_ids);
	INIT_LIST_HEAD(&rreq->subrequests);
	INIT_WORK(&rreq->work, netfs_rreq_work);
	refcount_set(&rreq->usage, 1);
@@ -82,6 +86,7 @@ static void netfs_free_read_request(struct work_struct *work)
netfs_rreq_clear_subreqs(rreq, false);
if (rreq->netfs_priv)
rreq->netfs_ops->cleanup(rreq->mapping, rreq->netfs_priv);
+   trace_netfs_rreq(rreq, netfs_rreq_trace_free);
kfree(rreq);
 }
 
@@ -127,6 +132,7 @@ static void __netfs_put_subrequest(struct netfs_read_subrequest *subreq,
 {
struct netfs_read_request *rreq = subreq->rreq;
 
+   trace_netfs_sreq(subreq, netfs_sreq_trace_free);
kfree(subreq);
netfs_put_read_request(rreq, was_async);
 }
@@ -181,6 +187,7 @@ static void netfs_read_from_server(struct netfs_read_request *rreq,
  */
 static void netfs_rreq_completed(struct netfs_read_request *rreq, bool was_async)
 {
+   trace_netfs_rreq(rreq, netfs_rreq_trace_done);
netfs_rreq_clear_subreqs(rreq, was_async);
netfs_put_read_request(rreq, was_async);
 }
@@ -219,6 +226,8 @@ static void netfs_rreq_unlock(struct netfs_read_request *rreq)
iopos = 0;
subreq_failed = (subreq->error < 0);
 
+   trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);
+
rcu_read_lock();
	xas_for_each(&xas, page, last_page) {
unsigned int pgpos = (page->index - start_page) * PAGE_SIZE;
@@ -279,6 +288,8 @@ static void netfs_rreq_short_read(struct netfs_read_request *rreq,
	__clear_bit(NETFS_SREQ_SHORT_READ, &subreq->flags);
	__set_bit(NETFS_SREQ_SEEK_DATA_READ, &subreq->flags);
 
+   trace_netfs_sreq(subreq, netfs_sreq_trace_resubmit_short);
+
netfs_get_read_subrequest(subreq);
	atomic_inc(&rreq->nr_rd_ops);
netfs_read_from_server(rreq, subreq);
@@ -294,6 +305,8 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_read_request *rreq)
 
WARN_ON(in_interrupt());
 
+   trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit);
+
/* We don't want terminating submissions trying to wake us up whilst
 * we're still going through the list.
 */
@@ -306,6 +319,7 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_read_request *rreq)
break;
subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
subreq->error = 0;
+			trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead);
			netfs_get_read_subrequest(subreq);
			atomic_inc(&rreq->nr_rd_ops);
 

[PATCH v4 08/28] netfs: Provide readahead and readpage netfs helpers

2021-03-10 Thread David Howells
Add a pair of helper functions:

 (*) netfs_readahead()
 (*) netfs_readpage()

to do the work of handling a readahead or a readpage, where the page(s)
that form part of the request may be split between the local cache, the
server or just require clearing, and may be single pages and transparent
huge pages.  This is all handled within the helper.

Note that while both will read from the cache if there is data present,
only netfs_readahead() will expand the request beyond what it was asked to
do, and only netfs_readahead() will write back to the cache.

netfs_readpage(), on the other hand, is synchronous and only fetches the
page (which might be a THP) it is asked for.

The netfs gives the helper parameters from the VM, the cache cookie it
wants to use (or NULL) and a table of operations (only one of which is
mandatory):

 (*) expand_readahead() [optional]

 Called to allow the netfs to request an expansion of a readahead
 request to meet its own alignment requirements.  This is done by
 changing rreq->start and rreq->len.

 (*) clamp_length() [optional]

 Called to allow the netfs to cut down a subrequest to meet its own
 boundary requirements.  If it does this, the helper will generate
 additional subrequests until the full request is satisfied.

 (*) is_still_valid() [optional]

 Called to find out if the data just read from the cache has been
 invalidated and must be reread from the server.

 (*) issue_op() [required]

 Called to ask the netfs to issue a read to the server.  The subrequest
 describes the read.  The read request holds information about the file
 being accessed.

 The netfs can cache information in rreq->netfs_priv.

 Upon completion, the netfs should set the error and transferred count,
 may also set FSCACHE_SREQ_CLEAR_TAIL, and then call
 fscache_subreq_terminated().

 (*) done() [optional]

 Called after the pages have been unlocked.  The read request is still
 pinning the file and mapping and may still be pinning pages with
 PG_fscache.  rreq->error indicates any error that has been
 accumulated.

 (*) cleanup() [optional]

 Called when the helper is disposing of a finished read request.  This
 allows the netfs to clear rreq->netfs_priv.

Netfs support is enabled with CONFIG_NETFS_SUPPORT=y.  It will be built
even if CONFIG_FSCACHE=n and in this case much of it should be optimised
away, allowing the filesystem to use it even when caching is disabled.
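
Putting that together, a minimal user of the helpers might look like the
sketch below (my_netfs_* names are hypothetical; as described above, only
issue_op is mandatory):

    /* Smallest useful operations table plus a ->readpage() hook. */
    static void my_netfs_issue_op(struct netfs_read_subrequest *subreq)
    {
            /* Kick off an RPC for subreq->len bytes at subreq->start;
             * the completion path finishes with something like
             * netfs_subreq_terminated(subreq, bytes_or_error, was_async).
             */
    }

    static const struct netfs_read_request_ops my_netfs_req_ops = {
            .issue_op = my_netfs_issue_op,
    };

    static int my_netfs_readpage(struct file *file, struct page *page)
    {
            return netfs_readpage(file, page, &my_netfs_req_ops, NULL);
    }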

Changes:
 - Folded in a kerneldoc comment fix.
 - Folded in a fix for the error handling in the case that ENOMEM occurs.
 - Added flag to netfs_subreq_terminated() to indicate that the caller may
   have been running async and stuff that might sleep needs punting to a
   workqueue (can't use in_softirq()[1]).

Signed-off-by: David Howells 
Reviewed-by: Jeff Layton 
cc: Matthew Wilcox 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/160588497406.3465195.18003475695899726222.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161118136849.1232039.8923686136144228724.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161032290.2537118.13400578415247339173.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340394873.1303470.6237319335883242536.st...@warthog.procyon.org.uk/
 # v3
Link: https://lore.kernel.org/r/20210216084230.ga23...@lst.de/ [1]
---

 fs/Kconfig |1 
 fs/Makefile|1 
 fs/netfs/Makefile  |6 
 fs/netfs/internal.h|   61 
 fs/netfs/read_helper.c |  717 
 include/linux/netfs.h  |   82 +
 6 files changed, 868 insertions(+)
 create mode 100644 fs/netfs/Makefile
 create mode 100644 fs/netfs/internal.h
 create mode 100644 fs/netfs/read_helper.c

diff --git a/fs/Kconfig b/fs/Kconfig
index 462253ae483a..eccbcf1e3f2e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -125,6 +125,7 @@ source "fs/overlayfs/Kconfig"
 
 menu "Caches"
 
+source "fs/netfs/Kconfig"
 source "fs/fscache/Kconfig"
 source "fs/cachefiles/Kconfig"
 
diff --git a/fs/Makefile b/fs/Makefile
index 3215fe205256..9c708e1fbe8f 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -67,6 +67,7 @@ obj-y += devpts/
 obj-$(CONFIG_DLM)  += dlm/
  
 # Do not add any filesystems before this line
+obj-$(CONFIG_NETFS_SUPPORT)+= netfs/
 obj-$(CONFIG_FSCACHE)  += fscache/
 obj-$(CONFIG_REISERFS_FS)  += reiserfs/
 obj-$(CONFIG_EXT4_FS)  += ext4/
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
new file mode 100644
index ..4b4eff2ba369
--- /dev/null
+++ b/fs/netfs/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+
+netfs-y := \
+   read_helper.o
+

[PATCH v4 07/28] netfs, mm: Add unlock_page_fscache() and wait_on_page_fscache()

2021-03-10 Thread David Howells
Add unlock_page_fscache() as an alias of unlock_page_private_2().  This
allows a page 'locked' with PG_fscache to be unlocked.

Add wait_on_page_fscache() to wait for PG_fscache to be unlocked.
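
Typical use, in sketch form (page here is whichever page the netfs is
working on):

    /* Cache-write completion side: clear the PG_fscache 'lock' and wake
     * anyone sleeping in wait_on_page_fscache().
     */
    unlock_page_fscache(page);

    /* Modification side: don't touch the page while it is still being
     * written out to the local cache.  May sleep.
     */
    wait_on_page_fscache(page);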

[Linus suggested putting the fscache-themed functions into the
 caching-specific headers rather than pagemap.h[1]]

Signed-off-by: David Howells 
cc: Linus Torvalds 
cc: Matthew Wilcox 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: https://lore.kernel.org/r/1330473.1612974...@warthog.procyon.org.uk/
Link: 
https://lore.kernel.org/r/CAHk-=wjgA-74ddehziVk=xaemtkswpu1yw4uaro1r3ibs27...@mail.gmail.com/
 [1]
Link: 
https://lore.kernel.org/r/161340393568.1303470.4997526899111310530.st...@warthog.procyon.org.uk/
 # v3
---

 include/linux/netfs.h |   29 +
 1 file changed, 29 insertions(+)

diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index cc1102040488..5ed8c6dc7453 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -26,4 +26,33 @@
 #define TestSetPageFsCache(page)   TestSetPagePrivate2((page))
 #define TestClearPageFsCache(page) TestClearPagePrivate2((page))
 
+/**
+ * unlock_page_fscache - Unlock a page that's locked with PG_fscache
+ * @page: The page
+ *
+ * Unlocks a page that's locked with PG_fscache and wakes up sleepers in
+ * wait_on_page_fscache().  This page bit is used by the netfs helpers when a
+ * netfs page is being written to a local disk cache, thereby allowing writes
+ * to the cache for the same page to be serialised.
+ */
+static inline void unlock_page_fscache(struct page *page)
+{
+   unlock_page_private_2(page);
+}
+
+/**
+ * wait_on_page_fscache - Wait for PG_fscache to be cleared on a page
+ * @page: The page
+ *
+ * Wait for the PG_fscache (PG_private_2) page bit to be removed from a page.
+ * This is, for example, used to handle a netfs page being written to a local
+ * disk cache, thereby allowing writes to the cache for the same page to be
+ * serialised.
+ */
+static inline void wait_on_page_fscache(struct page *page)
+{
+   if (PageFsCache(page))
+   wait_on_page_bit(compound_head(page), PG_fscache);
+}
+
 #endif /* _LINUX_NETFS_H */




Re: [PATCH v1 1/1] fs: Allow no_new_privs tasks to call chroot(2)

2021-03-10 Thread Eric W. Biederman
Mickaël Salaün  writes:

> From: Mickaël Salaün 
>
> Being able to easily change root directories enables some development
> workflows and can be used as a tool to strengthen
> unprivileged security sandboxes.  chroot(2) is not an access-control
> mechanism per se, but it can be used to limit the absolute view of the
> filesystem, and then limit ways to access data and kernel interfaces
> (e.g. /proc, /sys, /dev, etc.).

Actually chroot does not so limit the view of things.  It only limits
the default view.

A process that is chrooted can always escape by something like
chroot("../../../../../../../../..").

So I don't see the point of allowing chroot once you are in your locked
down sandbox.

> Users may not wish to expose namespace complexity to potentially
> malicious processes, or limit their use because of limited resources.
> The chroot feature is much more simple (and limited) than the mount
> namespace, but can still be useful.  As for containers, users of
> chroot(2) should take care of file descriptors or data accessible by
> other means (e.g. current working directory, leaked FDs, passed FDs,
> devices, mount points, etc.).  There is a lot of literature that discuss
> the limitations of chroot, and users of this feature should be aware of
> the multiple ways to bypass it.  Using chroot(2) for security purposes
> can make sense if it is combined with other features (e.g. dedicated
> user, seccomp, LSM access-controls, etc.).
>
> One could argue that chroot(2) is useless without a properly populated
> root hierarchy (i.e. without /dev and /proc).  However, there are
> multiple use cases that don't require the chrooting process to create
> file hierarchies with special files nor mount points, e.g.:
> * A process sandboxing itself, once all its libraries are loaded, may
>   not need files other than regular files, or even no file at all.
> * Some pre-populated root hierarchies could be used to chroot into,
>   provided for instance by development environments or tailored
>   distributions.
> * Processes executed in a chroot may not require access to these special
>   files (e.g. with minimal runtimes, or by emulating some special files
>   with a LD_PRELOADed library or seccomp).
>
> Allowing a task to change its own root directory is not a threat to the
> system if we can prevent confused deputy attacks, which could be
> performed through execution of SUID-like binaries.  This can be
> prevented if the calling task sets PR_SET_NO_NEW_PRIVS on itself with
> prctl(2).  To only affect this task, its filesystem information must not
> be shared with other tasks, which can be achieved by not passing
> CLONE_FS to clone(2).  A similar no_new_privs check is already used by
> seccomp to avoid the same kind of security issues.  Furthermore, because
> of its security use and to avoid giving a new way for attackers to get
> out of a chroot (e.g. using /proc//root), an unprivileged chroot is
> only allowed if the new root directory is the same or beneath the
> current one.  This still allows a process to use a subset of its
> legitimate filesystem to chroot into and then further reduce its view of
> the filesystem.
>
> This change may not impact systems relying on other permission models
> than POSIX capabilities (e.g. Tomoyo).  Being able to use chroot(2) on
> such systems may require to update their security policies.
>
> Only the chroot system call is relaxed with this no_new_privs check; the
> init_chroot() helper doesn't require such change.
>
> Allowing unprivileged users to use chroot(2) is one of the initial
> objectives of no_new_privs:
> https://www.kernel.org/doc/html/latest/userspace-api/no_new_privs.html
> This patch is a follow-up of a previous one sent by Andy Lutomirski, but
> with less limitations:
> https://lore.kernel.org/lkml/0e2f0f54e19bff53a3739ecfddb4ffa9a6dbde4d.1327858005.git.l...@amacapital.net/

Last time I remember talking architecture we agreed that user namespaces
would be used for enabling features and that no_new_privs would just be
used to lock-down userspace.  That way no_new_privs could be kept simple
and trivial to audit and understand.

You can build your sandbox and use chroot if you use a user namespace at
the start.  A mount namespace would also help lock things down.  Still
allowing chroot after the sandbox has been built, a seccomp filter has
been installed and no_new_privs has been enabled seems like it is asking
for trouble and may weaken existing sandboxes.

So I think we need a pretty compelling use case to consider allowing
chroot(2).  You haven't even mentioned what your usecase is at this
point so I don't know why we would tackle that complexity.

Eric



[PATCH v4 06/28] netfs, mm: Move PG_fscache helper funcs to linux/netfs.h

2021-03-10 Thread David Howells
Move the PG_fscache related helper funcs (such as SetPageFsCache()) to
linux/netfs.h rather than linux/fscache.h as the intention is to move to a
model where they're used by the network filesystem and the helper library,
but not by fscache/cachefiles itself.

Signed-off-by: David Howells 
cc: Matthew Wilcox 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/161340392347.1303470.18065131603507621762.st...@warthog.procyon.org.uk/
 # v3
---

 include/linux/fscache.h |   11 +--
 include/linux/netfs.h   |   29 +
 2 files changed, 30 insertions(+), 10 deletions(-)
 create mode 100644 include/linux/netfs.h

diff --git a/include/linux/fscache.h b/include/linux/fscache.h
index a1c928fe98e7..1f8dc72369ee 100644
--- a/include/linux/fscache.h
+++ b/include/linux/fscache.h
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include <linux/netfs.h>
 
 #if defined(CONFIG_FSCACHE) || defined(CONFIG_FSCACHE_MODULE)
 #define fscache_available() (1)
@@ -29,16 +30,6 @@
 #endif
 
 
-/*
- * overload PG_private_2 to give us PG_fscache - this is used to indicate that
- * a page is currently backed by a local disk cache
- */
-#define PageFsCache(page)  PagePrivate2((page))
-#define SetPageFsCache(page)   SetPagePrivate2((page))
-#define ClearPageFsCache(page) ClearPagePrivate2((page))
-#define TestSetPageFsCache(page)   TestSetPagePrivate2((page))
-#define TestClearPageFsCache(page) TestClearPagePrivate2((page))
-
 /* pattern used to fill dead space in an index entry */
 #define FSCACHE_INDEX_DEADFILL_PATTERN 0x79
 
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
new file mode 100644
index ..cc1102040488
--- /dev/null
+++ b/include/linux/netfs.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Network filesystem support services.
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowe...@redhat.com)
+ *
+ * See:
+ *
+ * Documentation/filesystems/netfs_library.rst
+ *
+ * for a description of the network filesystem interface declared here.
+ */
+
+#ifndef _LINUX_NETFS_H
+#define _LINUX_NETFS_H
+
+#include <linux/pagemap.h>
+
+/*
+ * Overload PG_private_2 to give us PG_fscache - this is used to indicate that
+ * a page is currently backed by a local disk cache
+ */
+#define PageFsCache(page)  PagePrivate2((page))
+#define SetPageFsCache(page)   SetPagePrivate2((page))
+#define ClearPageFsCache(page) ClearPagePrivate2((page))
+#define TestSetPageFsCache(page)   TestSetPagePrivate2((page))
+#define TestClearPageFsCache(page) TestClearPagePrivate2((page))
+
+#endif /* _LINUX_NETFS_H */




[PATCH v4 04/28] netfs: Make a netfs helper module

2021-03-10 Thread David Howells
Make a netfs helper module to manage read request segmentation, caching
support and transparent huge page support on behalf of a network
filesystem.

Signed-off-by: David Howells 
Reviewed-by: Jeff Layton 
cc: Matthew Wilcox 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: 
https://lore.kernel.org/r/160588496284.3465195.10102643717770106661.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161118135638.1232039.1622182202673126285.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161031028.2537118.1213974428943508753.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340391427.1303470.14884950716721956560.st...@warthog.procyon.org.uk/
 # v3
---

 fs/netfs/Kconfig |8 
 1 file changed, 8 insertions(+)
 create mode 100644 fs/netfs/Kconfig

diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig
new file mode 100644
index ..2ebf90e6ca95
--- /dev/null
+++ b/fs/netfs/Kconfig
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config NETFS_SUPPORT
+   tristate "Support for network filesystem high-level I/O"
+   help
+ This option enables support for network filesystems, including
+ helpers for high-level buffered I/O, abstracting out read
+ segmentation, local caching and transparent huge page support.




[PATCH v4 05/28] netfs: Documentation for helper library

2021-03-10 Thread David Howells
Add interface documentation for the netfs helper library.

Signed-off-by: David Howells 
---

 Documentation/filesystems/index.rst |1 
 Documentation/filesystems/netfs_library.rst |  526 +++
 2 files changed, 527 insertions(+)
 create mode 100644 Documentation/filesystems/netfs_library.rst

diff --git a/Documentation/filesystems/index.rst 
b/Documentation/filesystems/index.rst
index 1f76b1cb3348..d4853cb919d2 100644
--- a/Documentation/filesystems/index.rst
+++ b/Documentation/filesystems/index.rst
@@ -53,6 +53,7 @@ filesystem implementations.
journalling
fscrypt
fsverity
+   netfs_library
 
 Filesystems
 ===
diff --git a/Documentation/filesystems/netfs_library.rst 
b/Documentation/filesystems/netfs_library.rst
new file mode 100644
index ..57a641847818
--- /dev/null
+++ b/Documentation/filesystems/netfs_library.rst
@@ -0,0 +1,526 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=
+NETWORK FILESYSTEM HELPER LIBRARY
+=
+
+.. Contents:
+
+ - Overview.
+ - Buffered read helpers.
+   - Read helper functions.
+   - Read helper structures.
+   - Read helper operations.
+   - Read helper procedure.
+   - Read helper cache API.
+
+
+Overview
+
+
+The network filesystem helper library is a set of functions designed to aid a
+network filesystem in implementing VM/VFS operations.  For the moment, that
+just includes turning various VM buffered read operations into requests to read
+from the server.  The helper library, however, can also interpose other
+services, such as local caching or local data encryption.
+
+Note that the library module doesn't link against local caching directly, so
+access must be provided by the netfs.
+
+
+Buffered Read Helpers
+=
+
+The library provides a set of read helpers that handle the ->readpage(),
+->readahead() and much of the ->write_begin() VM operations and translate them
+into a common call framework.
+
+The following services are provided:
+
+ * Handles transparent huge pages (THPs).
+
+ * Insulates the netfs from VM interface changes.
+
+ * Allows the netfs to arbitrarily split reads up into pieces, even ones that
+   don't match page sizes or page alignments and that may cross pages.
+
+ * Allows the netfs to expand a readahead request in both directions to meet
+   its needs.
+
+ * Allows the netfs to partially fulfil a read, which will then be resubmitted.
+
+ * Handles local caching, allowing cached data and server-read data to be
+   interleaved for a single request.
+
+ * Handles clearing of buffer regions that aren't present on the server.
+
+ * Handles retrying of reads that failed, switching reads from the cache to the
+   server as necessary.
+
+ * In the future, this is a place where other services can be performed, such as
+   local encryption of data to be stored remotely or in the cache.
+
+From the network filesystem, the helpers require a table of operations.  This
+includes a mandatory method to issue a read operation along with a number of
+optional methods.
+
+
+Read Helper Functions
+-
+
+Three read helpers are provided::
+
+ * void netfs_readahead(struct readahead_control *ractl,
+   const struct netfs_read_request_ops *ops,
+   void *netfs_priv);
+ * int netfs_readpage(struct file *file,
+ struct page *page,
+ const struct netfs_read_request_ops *ops,
+ void *netfs_priv);
+ * int netfs_write_begin(struct file *file,
+struct address_space *mapping,
+loff_t pos,
+unsigned int len,
+unsigned int flags,
+struct page **_page,
+void **_fsdata,
+const struct netfs_read_request_ops *ops,
+void *netfs_priv);
+
+Each corresponds to a VM operation, with the addition of a couple of parameters
+for the use of the read helpers:
+
+ * ``ops``
+
+   A table of operations through which the helpers can talk to the filesystem.
+
+ * ``netfs_priv``
+
+   Filesystem private data (can be NULL).
+
+Both of these values will be stored into the read request structure.
+
+For ->readahead() and ->readpage(), the network filesystem should just jump
+into the corresponding read helper; whereas for ->write_begin(), it may be a
+little more complicated as the network filesystem might want to flush
+conflicting writes or track dirty data and needs to put the acquired page if an
+error occurs after calling the helper.
+
+The helpers manage the read request, calling back into the network filesystem
+through the supplied table of operations.  Waits will be performed as
+necessary before returning for helpers that are meant to be synchronous.
+
+If an error occurs and netfs_priv is non-NULL, ops->cleanup() will be called to
+deal with it.  If 

[PATCH v4 03/28] mm: Implement readahead_control pageset expansion

2021-03-10 Thread David Howells
Provide a function, readahead_expand(), that expands the set of pages
specified by a readahead_control object to encompass a revised area with a
proposed start and length.

The proposed area must include all of the old area and may be expanded yet
more by this function so that the edges align on (transparent huge) page
boundaries as allocated.

The expansion will be cut short if a page already exists in either of the
areas being expanded into.  Note that any expansion made in such a case is
not rolled back.

This will be used by fscache so that reads can be expanded to cache granule
boundaries, thereby allowing whole granules to be stored in the cache, but
there are other potential users also.
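
As a minimal sketch of the intended use (illustrative only; the 256KiB
granule size and the rounding policy are assumptions of the example):

    static void expand_to_granule(struct readahead_control *ractl)
    {
            /* Round the window out to whole 256KiB cache granules. */
            loff_t start = round_down(readahead_pos(ractl), 256 * 1024);
            size_t len = round_up(readahead_pos(ractl) +
                                  readahead_length(ractl),
                                  256 * 1024) - start;

            readahead_expand(ractl, start, len);
            /* Re-check ractl here; the expansion may have been cut short. */
    }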

Changes:
- Moved the declaration of readahead_expand() to a better place[1].

Suggested-by: Matthew Wilcox (Oracle) 
Signed-off-by: David Howells 
cc: Matthew Wilcox (Oracle) 
cc: Alexander Viro 
cc: Christoph Hellwig 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: https://lore.kernel.org/r/20210217161358.gm2858...@casper.infradead.org/ 
[1]
Link: 
https://lore.kernel.org/r/159974633888.2094769.8326206446358128373.st...@warthog.procyon.org.uk/
Link: 
https://lore.kernel.org/r/160588479816.3465195.553952688795241765.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161118131787.1232039.4863969952441067985.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161028670.2537118.13831420617039766044.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340389201.1303470.14353807284546854878.st...@warthog.procyon.org.uk/
 # v3
---

 include/linux/pagemap.h |2 +
 mm/readahead.c  |   70 +++
 2 files changed, 72 insertions(+)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index ac91b33f3873..bf05e99ce588 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -818,6 +818,8 @@ void page_cache_sync_ra(struct readahead_control *, struct 
file_ra_state *,
unsigned long req_count);
 void page_cache_async_ra(struct readahead_control *, struct file_ra_state *,
struct page *, unsigned long req_count);
+void readahead_expand(struct readahead_control *ractl,
+ loff_t new_start, size_t new_len);
 
 /**
  * page_cache_sync_readahead - generic file readahead
diff --git a/mm/readahead.c b/mm/readahead.c
index c5b0457415be..4446dada0bc2 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -638,3 +638,73 @@ SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, 
size_t, count)
 {
return ksys_readahead(fd, offset, count);
 }
+
+/**
+ * readahead_expand - Expand a readahead request
+ * @ractl: The request to be expanded
+ * @new_start: The revised start
+ * @new_len: The revised size of the request
+ *
+ * Attempt to expand a readahead request outwards from the current size to the
+ * specified size by inserting locked pages before and after the current window
+ * to increase the size to the new window.  This may involve the insertion of
+ * THPs, in which case the window may get expanded even beyond what was
+ * requested.
+ *
+ * The algorithm will stop if it encounters a conflicting page already in the
+ * pagecache and leave a smaller expansion than requested.
+ *
+ * The caller must check for this by examining the revised @ractl object for a
+ * different expansion than was requested.
+ */
+void readahead_expand(struct readahead_control *ractl,
+ loff_t new_start, size_t new_len)
+{
+   struct address_space *mapping = ractl->mapping;
+   pgoff_t new_index, new_nr_pages;
+   gfp_t gfp_mask = readahead_gfp_mask(mapping);
+
+   new_index = new_start / PAGE_SIZE;
+
+   /* Expand the leading edge downwards */
+   while (ractl->_index > new_index) {
+   unsigned long index = ractl->_index - 1;
+   struct page *page = xa_load(&mapping->i_pages, index);
+
+   if (page && !xa_is_value(page))
+   return; /* Page apparently present */
+
+   page = __page_cache_alloc(gfp_mask);
+   if (!page)
+   return;
+   if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) {
+   put_page(page);
+   return;
+   }
+
+   ractl->_nr_pages++;
+   ractl->_index = page->index;
+   }
+
+   new_len += new_start - readahead_pos(ractl);
+   new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE);
+
+   /* Expand the trailing edge upwards */
+   while (ractl->_nr_pages < new_nr_pages) {
+   unsigned long index = ractl->_index + ractl->_nr_pages;
+   struct page *page = xa_load(&mapping->i_pages, index);
+
+

Re: [PATCH] net: bonding: fix error return code of bond_neigh_init()

2021-03-10 Thread Eric Dumazet



On 3/10/21 10:24 AM, Roi Dayan wrote:
> 
> 
> On 2021-03-08 5:11 AM, Jia-Ju Bai wrote:
>> When slave is NULL or slave_ops->ndo_neigh_setup is NULL, no error
>> return code of bond_neigh_init() is assigned.
>> To fix this bug, ret is assigned with -EINVAL in these cases.
>>
>> Fixes: 9e99bfefdbce ("bonding: fix bond_neigh_init()")
>> Reported-by: TOTE Robot 
>> Signed-off-by: Jia-Ju Bai 
>> ---
>>   drivers/net/bonding/bond_main.c | 8 ++--
>>   1 file changed, 6 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/net/bonding/bond_main.c 
>> b/drivers/net/bonding/bond_main.c
>> index 74cbbb22470b..456315bef3a8 100644
>> --- a/drivers/net/bonding/bond_main.c
>> +++ b/drivers/net/bonding/bond_main.c
>> @@ -3978,11 +3978,15 @@ static int bond_neigh_init(struct neighbour *n)
>>     rcu_read_lock();
>>   slave = bond_first_slave_rcu(bond);
>> -    if (!slave)
>> +    if (!slave) {
>> +    ret = -EINVAL;
>>   goto out;
>> +    }
>>   slave_ops = slave->dev->netdev_ops;
>> -    if (!slave_ops->ndo_neigh_setup)
>> +    if (!slave_ops->ndo_neigh_setup) {
>> +    ret = -EINVAL;
>>   goto out;
>> +    }
>>     /* TODO: find another way [1] to implement this.
>>    * Passing a zeroed structure is fragile,
>>
> 
> 
> Hi,
> 
> This breaks basic functionality that always worked. A slave doesn't need
> to exist nor to implement ndo_neigh_setup.
> Now trying to add a neigh entry fails because of that.
> This commit needs to be reverted.
> 
> Thanks,
> Roi

Agreed, this commit made no sense, please revert.


[PATCH v4 02/28] mm: Add an unlock function for PG_private_2/PG_fscache

2021-03-10 Thread David Howells
Add a function, unlock_page_private_2(), to unlock PG_private_2 analogous
to unlock_page() for PG_locked.  Add a kerneldoc banner indicating the
example usage case.

A wrapper will need to be placed in the netfs header in the patch that adds
that.
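
Presumably a trivial static inline, something like this sketch:

    /* In the netfs header: */
    static inline void unlock_page_fscache(struct page *page)
    {
            unlock_page_private_2(page);
    }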

[This implements a suggestion by Linus[1] to not mix the terminology of
 PG_private_2 and PG_fscache in the mm core function]

Changes:
- Remove extern from the declaration[2].

Suggested-by: Linus Torvalds 
Signed-off-by: David Howells 
Reviewed-by: Christoph Hellwig 
cc: Matthew Wilcox (Oracle) 
cc: Alexander Viro 
cc: Christoph Hellwig 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: https://lore.kernel.org/r/1330473.1612974...@warthog.procyon.org.uk/ # v1
Link: 
https://lore.kernel.org/r/CAHk-=wjgA-74ddehziVk=xaemtkswpu1yw4uaro1r3ibs27...@mail.gmail.com/
 [1]
Link: https://lore.kernel.org/r/20210216102659.ga27...@lst.de/ [2]
Link: 
https://lore.kernel.org/r/161340387944.1303470.7944159520278177652.st...@warthog.procyon.org.uk/
 # v3
---

 include/linux/pagemap.h |1 +
 mm/filemap.c|   20 
 2 files changed, 21 insertions(+)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 20225b067583..ac91b33f3873 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -591,6 +591,7 @@ extern int __lock_page_async(struct page *page, struct 
wait_page_queue *wait);
 extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
unsigned int flags);
 extern void unlock_page(struct page *page);
+void unlock_page_private_2(struct page *page);
 
 /*
  * Return true if the page was successfully locked
diff --git a/mm/filemap.c b/mm/filemap.c
index 43700480d897..925964b67583 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1432,6 +1432,26 @@ void unlock_page(struct page *page)
 }
 EXPORT_SYMBOL(unlock_page);
 
+/**
+ * unlock_page_private_2 - Unlock a page that's locked with PG_private_2
+ * @page: The page
+ *
+ * Unlocks a page that's locked with PG_private_2 and wakes up sleepers in
+ * wait_on_page_private_2().
+ *
+ * This is, for example, used when a netfs page is being written to a local
+ * disk cache, thereby allowing writes to the cache for the same page to be
+ * serialised.
+ */
+void unlock_page_private_2(struct page *page)
+{
+   page = compound_head(page);
+   VM_BUG_ON_PAGE(!PagePrivate2(page), page);
+   clear_bit_unlock(PG_private_2, >flags);
+   wake_up_page_bit(page, PG_private_2);
+}
+EXPORT_SYMBOL(unlock_page_private_2);
+
 /**
  * end_page_writeback - end writeback against a page
  * @page: the page




Re: [PATCH] kbuild: remove unneeded -O option to dtc

2021-03-10 Thread Rob Herring
On Wed, Mar 10, 2021 at 4:09 AM Masahiro Yamada  wrote:
>
> This piece of code converts the target suffix to the dtc -O option:
>
> *.dtb  ->  -O dtb
> *.dt.yaml  ->  -O yaml
>
> Commit ce88c9c79455 ("kbuild: Add support to build overlays (%.dtbo)")
> added the third case:
>
> *.dtbo ->  -O dtbo
>
> This works thanks to commit 163f0469bf2e ("dtc: Allow overlays to have
> .dtbo extension") in the upstream DTC, which has already been pulled in
> the kernel.
>
> However, I think it is a bit odd because "dtbo" is not a format name.
> At least, it does not show up in the help message of dtc.
>
> $ scripts/dtc/dtc --help
>   [ snip ]
>   -O, --out-format 
> Output formats are:
> dts - device tree source text
> dtb - device tree blob
> yaml - device tree encoded as YAML
> asm - assembler source
>
> So, I am not a big fan of the second hunk of that change:
>
> } else if (streq(outform, "dtbo")) {
> dt_to_blob(outf, dti, outversion);
>
> Anyway, we did not need to do this in Makefile in the first place.
>
> guess_type_by_name() had already understood ".yaml" before commit
> 4f0e3a57d6eb ("kbuild: Add support for DT binding schema checks"),
> and now does ".dtbo" as well.
>
> Makefile does not need to duplicate the same logic. Let's leave it
> to dtc.
>
> Signed-off-by: Masahiro Yamada 
> ---
>
>  scripts/Makefile.lib | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
> index eee59184de64..90a4e04cd8f5 100644
> --- a/scripts/Makefile.lib
> +++ b/scripts/Makefile.lib
> @@ -327,7 +327,7 @@ $(obj)/%.dtb.S: $(obj)/%.dtb FORCE
>
>  quiet_cmd_dtc = DTC $@
>  cmd_dtc = $(HOSTCC) -E $(dtc_cpp_flags) -x assembler-with-cpp -o $(dtc-tmp) 
> $< ; \
> -   $(DTC) -O $(patsubst .%,%,$(suffix $@)) -o $@ -b 0 \
> +   $(DTC) -o $@ -b 0 \
> $(addprefix -i,$(dir $<) $(DTC_INCLUDE)) $(DTC_FLAGS) \
> -d $(depfile).dtc.tmp $(dtc-tmp) ; \
> cat $(depfile).pre.tmp $(depfile).dtc.tmp > $(depfile)

I think you should also drop 'yaml' from:

$(call if_changed_rule,dtc,yaml)

Though that's somewhat separate, so either way:

Acked-by: Rob Herring 

Rob


Re: [PATCH] fb_defio: Remove custom address_space_operations

2021-03-10 Thread Christoph Hellwig
On Wed, Mar 10, 2021 at 01:51:28PM +, Matthew Wilcox (Oracle) wrote:
> There's no need to give the page an address_space.  Leaving the
> page->mapping as NULL will cause the VM to handle set_page_dirty()
> the same way that it's set now, and that was the only reason to
> set the address_space in the first place.
> 
> Signed-off-by: Matthew Wilcox (Oracle) 


Looks good:

Reviewed-by: Christoph Hellwig 


[PATCH v4 01/28] iov_iter: Add ITER_XARRAY

2021-03-10 Thread David Howells
Add an iterator, ITER_XARRAY, that walks through a set of pages attached to
an xarray, starting at a given page and offset and walking for the
specified amount of bytes.  The iterator supports transparent huge pages.

The iterate_xarray() macro calls the helper function with the RCU read
lock held.  I think that this is only a problem for iov_iter_for_each_range()
- and that returns an error for ITER_XARRAY (also, this function does not
appear to be called).

The caller must guarantee that the pages are all present and they must be
locked using PG_locked, PG_writeback or PG_fscache to prevent them from
going away or being migrated whilst they're being accessed.

This is useful for copying data from socket buffers to inodes in network
filesystems and for transferring data between those inodes and the cache
using direct I/O.

Whilst it is true that ITER_BVEC could be used instead, that would require
a bio_vec array to be allocated to refer to all the pages - which should be
redundant if inode->i_pages also points to all these pages.

Note that older versions of this patch implemented an ITER_MAPPING instead,
which was almost the same.
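
For illustration (not code from this patch; start and len are placeholders),
a netfs might describe a span of an inode's pagecache like so:

    struct iov_iter iter;

    /*
     * Cover len bytes at file position start, backed by the pages in
     * inode->i_mapping->i_pages; the pages must already be present and
     * locked as described above.
     */
    iov_iter_xarray(&iter, READ, &inode->i_mapping->i_pages, start, len);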

Signed-off-by: David Howells 
cc: Alexander Viro 
cc: Matthew Wilcox (Oracle) 
cc: Christoph Hellwig 
cc: linux...@kvack.org
cc: linux-cach...@redhat.com
cc: linux-...@lists.infradead.org
cc: linux-...@vger.kernel.org
cc: linux-c...@vger.kernel.org
cc: ceph-de...@vger.kernel.org
cc: v9fs-develo...@lists.sourceforge.net
cc: linux-fsde...@vger.kernel.org
Link: https://lore.kernel.org/r/3577430.1579705...@warthog.procyon.org.uk/ # rfc
Link: 
https://lore.kernel.org/r/158861205740.340223.16592990225607814022.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/159465785214.1376674.6062549291411362531.st...@warthog.procyon.org.uk/
Link: 
https://lore.kernel.org/r/160588477334.3465195.3608963255682568730.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161118129703.1232039.17141248432017826976.st...@warthog.procyon.org.uk/
 # rfc
Link: 
https://lore.kernel.org/r/161161026313.2537118.14676007075365418649.st...@warthog.procyon.org.uk/
 # v2
Link: 
https://lore.kernel.org/r/161340386671.1303470.10752208972482479840.st...@warthog.procyon.org.uk/
 # v3
---

 include/linux/uio.h |   11 ++
 lib/iov_iter.c  |  313 +++
 2 files changed, 301 insertions(+), 23 deletions(-)

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 27ff8eb786dc..5f5ffc45d4aa 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -10,6 +10,7 @@
 #include 
 
 struct page;
+struct address_space;
 struct pipe_inode_info;
 
 struct kvec {
@@ -24,6 +25,7 @@ enum iter_type {
ITER_BVEC = 16,
ITER_PIPE = 32,
ITER_DISCARD = 64,
+   ITER_XARRAY = 128,
 };
 
 struct iov_iter {
@@ -39,6 +41,7 @@ struct iov_iter {
const struct iovec *iov;
const struct kvec *kvec;
const struct bio_vec *bvec;
+   struct xarray *xarray;
struct pipe_inode_info *pipe;
};
union {
@@ -47,6 +50,7 @@ struct iov_iter {
unsigned int head;
unsigned int start_head;
};
+   loff_t xarray_start;
};
 };
 
@@ -80,6 +84,11 @@ static inline bool iov_iter_is_discard(const struct iov_iter 
*i)
return iov_iter_type(i) == ITER_DISCARD;
 }
 
+static inline bool iov_iter_is_xarray(const struct iov_iter *i)
+{
+   return iov_iter_type(i) == ITER_XARRAY;
+}
+
 static inline unsigned char iov_iter_rw(const struct iov_iter *i)
 {
return i->type & (READ | WRITE);
@@ -221,6 +230,8 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int 
direction, const struct bio_
 void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct 
pipe_inode_info *pipe,
size_t count);
 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t 
count);
+void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray 
*xarray,
+loff_t start, size_t count);
 ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
size_t maxsize, unsigned maxpages, size_t *start);
 ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f66c62aa7154..f808c625c11e 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -76,7 +76,44 @@
}   \
 }
 
-#define iterate_all_kinds(i, n, v, I, B, K) {  \
+#define iterate_xarray(i, n, __v, skip, STEP) {\
+   struct page *head = NULL;   \
+   size_t wanted = n, seg, offset; \
+   loff_t start = i->xarray_start + skip;  \
+   pgoff_t index = start >> PAGE_SHIFT;\
+   int j; 

[PATCH v4 00/28] Network fs helper library & fscache kiocb API

2021-03-10 Thread David Howells


Here's a set of patches to do two things:

 (1) Add a helper library to handle the new VM readahead interface.  This
 is intended to be used unconditionally by the filesystem (whether or
 not caching is enabled) and provides a common framework for doing
 caching, transparent huge pages and, in the future, possibly fscrypt
 and read bandwidth maximisation.  It also allows the netfs and the
 cache to align, expand and slice up a read request from the VM in
 various ways; the netfs need only provide a function to read a stretch
 of data to the pagecache and the helper takes care of the rest.

 (2) Add an alternative fscache/cachefiles I/O API that uses the kiocb
 facility to do async DIO to transfer data to/from the netfs's pages,
 rather than using readpage with wait queue snooping on one side and
 vfs_write() on the other.  It also uses less memory, since it doesn't
 do buffered I/O on the backing file.

 Note that this uses SEEK_HOLE/SEEK_DATA to locate the data available
 to be read from the cache.  Whilst this is an improvement from the
 bmap interface, it still has a problem with regard to a modern
 extent-based filesystem inserting or removing bridging blocks of
 zeros.  Fixing that requires a much greater overhaul.
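
     Purely as an illustration (with backing_file standing in for the
     cache's open backing file), the probing in (2) amounts to roughly:

	loff_t start, end;

	start = vfs_llseek(backing_file, pos, SEEK_DATA);
	if (start < 0)
		return start;	/* error, or no data at or after pos */
	end = vfs_llseek(backing_file, start, SEEK_HOLE);
	/* [start, end) can come from the cache; the rest needs the server. */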

This is a step towards overhauling the fscache API.  The change is opt-in
on the part of the network filesystem.  A netfs should not try to mix the
old and the new API because of conflicting ways of handling pages and the
PG_fscache page flag and because it would be mixing DIO with buffered I/O.
Further, the helper library can't be used with the old API.

This does not change any of the fscache cookie handling APIs or the way
invalidation is done.

In the near term, I intend to deprecate and remove the old I/O API
(fscache_allocate_page{,s}(), fscache_read_or_alloc_page{,s}(),
fscache_write_page() and fscache_uncache_page()) and eventually replace
most of fscache/cachefiles with something simpler and easier to follow.

The patchset contains the following parts:

 (1) Some helper patches, including provision of an ITER_XARRAY iov
 iterator and a function to do readahead expansion.

 (2) Patches to add the netfs helper library.

 (3) A patch to add the fscache/cachefiles kiocb API.

 (4) Patches to add support in AFS for this.

Jeff Layton has patches to add support in Ceph for this.

Dave Wysochanski also has patches for NFS for this, though they're not
included on this branch as there's an issue with PNFS.

With this, AFS without a cache passes all expected xfstests; with a cache,
there's an extra failure, but that's also there before these patches.
Fixing that probably requires a greater overhaul.  Ceph and NFS also pass
the expected tests.

These patches can be found also on:


https://git.kernel.org/pub/scm/linux/kernel/git/dhowells/linux-fs.git/log/?h=fscache-netfs-lib


Changes
===

ver #4:
  Fixed some review comments from Christoph Hellwig, including dropping
  the export of rw_verify_area()[3] and some minor stuff[4].

  Moved the declaration of readahead_expand() to a better location[5].

  Rebased to v5.12-rc2 and added a bunch of references into individual
  commits.

  Dropped Ceph support - that will go through the maintainer's tree.

  Added interface documentation for the netfs helper library.

ver #3:
  Rolled in the bug fixes.

  Adjusted the functions that unlock and wait for PG_fscache according
  to Linus's suggestion[1].

  Hold a ref on a page when PG_fscache is set as per Linus's
  suggestion[2].

  Dropped NFS support and added Ceph support.

ver #2:
  Fixed some bugs and added NFS support.

Link: 
https://lore.kernel.org/r/CAHk-=wh+2gbF7XEjYc=HV9w_2uVzVf7vs60BPz0gFA=+pum...@mail.gmail.com/
 [1]
Link: 
https://lore.kernel.org/r/CAHk-=wjgA-74ddehziVk=xaemtkswpu1yw4uaro1r3ibs27...@mail.gmail.com/
 [2]
Link: https://lore.kernel.org/r/20210216102614.ga27...@lst.de/ [3]
Link: https://lore.kernel.org/r/20210216084230.ga23...@lst.de/ [4]
Link: https://lore.kernel.org/r/20210217161358.gm2858...@casper.infradead.org/ 
[5]


References
==

These patches have been published for review before, firstly as part of a
larger set:

Link: 
https://lore.kernel.org/r/158861203563.340223.7585359869938129395.st...@warthog.procyon.org.uk/

Link: 
https://lore.kernel.org/r/159465766378.1376105.11619976251039287525.st...@warthog.procyon.org.uk/
Link: 
https://lore.kernel.org/r/159465784033.1376674.18106463693989811037.st...@warthog.procyon.org.uk/
Link: 
https://lore.kernel.org/r/159465821598.1377938.2046362270225008168.st...@warthog.procyon.org.uk/

Link: 
https://lore.kernel.org/r/160588455242.3465195.3214733858273019178.st...@warthog.procyon.org.uk/

Then as a cut-down set:

Link: 
https://lore.kernel.org/r/161118128472.1232039.11746799833066425131.st...@warthog.procyon.org.uk/
 # v1

Link: 

Re: [PATCH V2 1/25] x86/cpufeatures: Enumerate Intel Hybrid Technology feature bit

2021-03-10 Thread Borislav Petkov
On Wed, Mar 10, 2021 at 08:37:37AM -0800, kan.li...@linux.intel.com wrote:
> From: Ricardo Neri 
> 
> Add feature enumeration to identify a processor with Intel Hybrid
> Technology: one in which CPUs of more than one type are in the same package.
> On a hybrid processor, all CPUs support the same homogeneous (i.e.,
> symmetric) instruction set. All CPUs enumerate the same features in CPUID.
> Thus, software (user space and kernel) can run and migrate to any CPU in
> the system as well as utilize any of the enumerated features without any
> change or special provisions. The main differences among CPUs in a hybrid
> processor are power and performance properties.
> 
> Cc: Andi Kleen 
> Cc: Kan Liang 
> Cc: "Peter Zijlstra (Intel)" 
> Cc: "Rafael J. Wysocki" 
> Cc: "Ravi V. Shankar" 
> Cc: Srinivas Pandruvada 
> Cc: linux-kernel@vger.kernel.org
> Reviewed-by: Len Brown 
> Reviewed-by: Tony Luck 
> Signed-off-by: Ricardo Neri 
> ---
> Changes since v1 (as part of patchset for perf change for Alderlake)
>  * None
> 
> Changes since v1 (in a separate posting):
>  * Reworded commit message to clearly state what is Intel Hybrid
>Technology. Stress that all CPUs can run the same instruction
>set and support the same features.
> ---
>  arch/x86/include/asm/cpufeatures.h | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/arch/x86/include/asm/cpufeatures.h 
> b/arch/x86/include/asm/cpufeatures.h
> index cc96e26d69f7..e7cfc9eedf8d 100644
> --- a/arch/x86/include/asm/cpufeatures.h
> +++ b/arch/x86/include/asm/cpufeatures.h
> @@ -374,6 +374,7 @@
>  #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */
>  #define X86_FEATURE_TSX_FORCE_ABORT  (18*32+13) /* "" TSX_FORCE_ABORT */
>  #define X86_FEATURE_SERIALIZE(18*32+14) /* SERIALIZE 
> instruction */
> +#define X86_FEATURE_HYBRID_CPU   (18*32+15) /* This part has 
> CPUs of more than one type */

  /* "" This ...

unless you have a valid use case for "hybrid_cpu" being present there.

Thx.

-- 
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette


Re: [PATCH V2] ASoC: soc-core: Prevent warning if no DMI table is present

2021-03-10 Thread Mark Brown
On Wed, Mar 10, 2021 at 10:41:18AM -0600, Pierre-Louis Bossart wrote:

> would this work?

> if (!IS_ENABLED(CONFIG_DMI))
> return 0;

Build time dependencies aren't going to help anything, arm64 (and to my
understanding some future x86 systems, LynxPoint IIRC) supports both DT
and ACPI and so you have kernels built with support for both.
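
A runtime test would look more like this sketch (dmi_available is set by
the DMI scanning code when a table is found; assumes CONFIG_DMI):

    #include <linux/dmi.h>

	if (!dmi_available)
		return 0;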




[RESEND] dt-bindings: spi: Convert NXP flexspi to json schema

2021-03-10 Thread Kuldeep Singh
Convert the NXP FlexSPI binding to DT schema format using json-schema.

Signed-off-by: Kuldeep Singh 
---
 .../bindings/spi/nxp,spi-nxp-fspi.yaml| 85 +++
 .../devicetree/bindings/spi/spi-nxp-fspi.txt  | 43 --
 MAINTAINERS   |  2 +-
 3 files changed, 86 insertions(+), 44 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/spi/nxp,spi-nxp-fspi.yaml
 delete mode 100644 Documentation/devicetree/bindings/spi/spi-nxp-fspi.txt

diff --git a/Documentation/devicetree/bindings/spi/nxp,spi-nxp-fspi.yaml 
b/Documentation/devicetree/bindings/spi/nxp,spi-nxp-fspi.yaml
new file mode 100644
index ..e3f2c5aae847
--- /dev/null
+++ b/Documentation/devicetree/bindings/spi/nxp,spi-nxp-fspi.yaml
@@ -0,0 +1,85 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/spi/nxp,spi-nxp-fspi.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: NXP Flex Serial Peripheral Interface (FSPI)
+
+maintainers:
+  - Ashish Kumar 
+
+allOf:
+  - $ref: "spi-controller.yaml#"
+
+properties:
+  compatible:
+enum:
+  - nxp,lx2160a-fspi
+  - nxp,imx8qxp-fspi
+  - nxp,imx8mm-fspi
+  - nxp,imx8dxl-fspi
+
+  reg:
+items:
+  - description: registers
+  - description: memory mapping
+
+  reg-names:
+items:
+  - const: fspi_base
+  - const: fspi_mmap
+
+  interrupts:
+maxItems: 1
+
+  clocks:
+items:
+  - description: SoC SPI fspi_en clock
+  - description: SoC SPI fspi clock
+
+  clock-names:
+items:
+  - const: fspi_en
+  - const: fspi
+
+required:
+  - compatible
+  - reg
+  - reg-names
+  - interrupts
+  - clocks
+  - clock-names
+
+unevaluatedProperties: false
+
+examples:
+  - |
+#include <dt-bindings/clock/fsl,qoriq-clockgen.h>
+#include <dt-bindings/interrupt-controller/arm-gic.h>
+
+soc {
+#address-cells = <2>;
+#size-cells = <2>;
+
+spi@20c {
+compatible = "nxp,lx2160a-fspi";
+reg = <0x0 0x20c 0x0 0x10>,
+  <0x0 0x2000 0x0 0x1000>;
+reg-names = "fspi_base", "fspi_mmap";
+interrupts = <GIC_SPI 25 IRQ_TYPE_LEVEL_HIGH>;
+clocks = <&clockgen QORIQ_CLK_PLATFORM_PLL QORIQ_CLK_PLL_DIV(4)>,
+ <&clockgen QORIQ_CLK_PLATFORM_PLL QORIQ_CLK_PLL_DIV(4)>;
+clock-names = "fspi_en", "fspi";
+#address-cells = <1>;
+#size-cells = <0>;
+
+flash@0 {
+compatible = "jedec,spi-nor";
+spi-max-frequency = <5000>;
+reg = <0>;
+spi-rx-bus-width = <8>;
+spi-tx-bus-width = <8>;
+};
+};
+};
diff --git a/Documentation/devicetree/bindings/spi/spi-nxp-fspi.txt 
b/Documentation/devicetree/bindings/spi/spi-nxp-fspi.txt
deleted file mode 100644
index df178d1b62e6..
--- a/Documentation/devicetree/bindings/spi/spi-nxp-fspi.txt
+++ /dev/null
@@ -1,43 +0,0 @@
-* NXP Flex Serial Peripheral Interface (FSPI)
-
-Required properties:
-  - compatible : Should be "nxp,lx2160a-fspi"
-   "nxp,imx8qxp-fspi"
-   "nxp,imx8mm-fspi"
-   "nxp,imx8dxl-fspi"
-
-  - reg :First contains the register location and length,
- Second contains the memory mapping address and length
-  - reg-names :  Should contain the resource reg names:
-- fspi_base: configuration register address space
- - fspi_mmap: memory mapped address space
-  - interrupts : Should contain the interrupt for the device
-
-Required SPI slave node properties:
-  - reg :There are two buses (A and B) with two chip selects each.
- This encodes to which bus and CS the flash is connected:
- - <0>: Bus A, CS 0
- - <1>: Bus A, CS 1
- - <2>: Bus B, CS 0
- - <3>: Bus B, CS 1
-
-Example showing the usage of two SPI NOR slave devices on bus A:
-
-fspi0: spi@20c {
-   compatible = "nxp,lx2160a-fspi";
-   reg = <0x0 0x20c 0x0 0x1>, <0x0 0x2000 0x0 0x1000>;
-   reg-names = "fspi_base", "fspi_mmap";
-   interrupts = <0 25 0x4>; /* Level high type */
-   clocks = <&clockgen 4 3>, <&clockgen 4 3>;
-   clock-names = "fspi_en", "fspi";
-
-   mt35xu512aba0: flash@0 {
-   reg = <0>;
-   
-   };
-
-   mt35xu512aba1: flash@1 {
-   reg = <1>;
-   
-   };
-};
diff --git a/MAINTAINERS b/MAINTAINERS
index d92f85ca831d..8729f7b50945 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12832,7 +12832,7 @@ M:  Ashish Kumar 
 R: Yogesh Gaur 
 L: linux-...@vger.kernel.org
 S: Maintained
-F: Documentation/devicetree/bindings/spi/spi-nxp-fspi.txt
+F: Documentation/devicetree/bindings/spi/nxp,spi-nxp-fspi.yaml
 F: drivers/spi/spi-nxp-fspi.c
 
 NXP FXAS21002C DRIVER
-- 
2.25.1



Re: [PATCH V2] ASoC: soc-core: Prevent warning if no DMI table is present

2021-03-10 Thread Mark Brown
On Wed, Mar 10, 2021 at 05:37:25PM +0100, Takashi Iwai wrote:
> Mark Brown wrote:

> > > did you mean if (!IS_ENABLED(CONFIG_ACPI)) ?

> > Is there a runtime check?

> Well, basically both DMI and ACPI are completely different things, so
> I don't think it's right to check the availability of ACPI as a signal
> of the availability of DMI.

In theory they are only somewhat related, but in practice they're both
part of a holistic system model - ACPI users complain if their system
does not also provide DMI information.




Re: [PATCH] i2c: i2c-scmi: Drop unused ACPI_MODULE_NAME definition

2021-03-10 Thread Rafael J. Wysocki
On Wed, Mar 10, 2021 at 5:08 PM Wolfram Sang  wrote:
>
> On Wed, Mar 10, 2021 at 03:47:10PM +0100, Rafael J. Wysocki wrote:
> > On Fri, Mar 5, 2021 at 7:29 PM Rafael J. Wysocki  wrote:
> > >
> > > From: Rafael J. Wysocki 
> > >
> > > The ACPI_MODULE_NAME() definition is only used by the message
> > > printing macros from ACPICA that are not used by the code in
> > > question, so it is redundant.  Drop it.
> > >
> > > No functional impact.
> > >
> > > Signed-off-by: Rafael J. Wysocki 
> >
> > If there are no concerns regarding this, I'll queue it up for 5.13 in
> > the ACPI tree, thanks!
>
> I'd prefer the I2C tree a tad to avoid conflicts. Any reason for the
> ACPI tree?

There are some patches doing this type of a cleanup in the ACPI tree,
but this is the only reason, so please route it through the i2c tree
if that is preferred.


Re: [PATCH V2 16/25] perf/x86: Register hybrid PMUs

2021-03-10 Thread Dave Hansen
On 3/10/21 8:37 AM, kan.li...@linux.intel.com wrote:
> - err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
> - if (err)
> - goto out2;
> + if (!is_hybrid()) {
> + err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
> + if (err)
> + goto out2;
> + } else {
> + u8 cpu_type = get_hybrid_cpu_type(smp_processor_id());
> + struct x86_hybrid_pmu *hybrid_pmu;
> + int i;

Where's the preempt_disable()?

> +static void init_hybrid_pmu(int cpu)
> +{
> + unsigned int fixed_mask, unused_eax, unused_ebx, unused_edx;
> + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
> + u8 cpu_type = get_hybrid_cpu_type(cpu);
> + struct x86_hybrid_pmu *pmu = NULL;
> + struct perf_cpu_context *cpuctx;
> + int i;

Ditto.

Are we really sure the IPIs are worth the trouble?  Why don't we just
cache the leaf when we bring the CPU up like just about every other
thing we read from CPUID?
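
Something like this sketch, say (names illustrative; the helper would read
CPUID leaf 0x1A on the local CPU from a hotplug callback):

    static DEFINE_PER_CPU(u8, hybrid_cpu_type);

    static int hybrid_cpu_starting(unsigned int cpu)
    {
            /* Runs on the incoming CPU, so CPUID reports its own type. */
            this_cpu_write(hybrid_cpu_type, get_this_hybrid_cpu_type());
            return 0;
    }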


Re: [PATCH] arm64: dts: qcom: sc7280: Add WPSS remoteproc node

2021-03-10 Thread Bjorn Andersson
On Wed 10 Mar 01:37 CST 2021, Rakesh Pillai wrote:

> Add the WPSS remoteproc node in dts for
> PIL loading.
> 
> Signed-off-by: Rakesh Pillai 
> ---
> - This change is dependent on the below patch series
> 1) https://lore.kernel.org/patchwork/project/lkml/list/?series=487403
> 2) https://lore.kernel.org/patchwork/project/lkml/list/?series=488365
> ---
>  arch/arm64/boot/dts/qcom/sc7280-idp.dts |  4 +++
>  arch/arm64/boot/dts/qcom/sc7280.dtsi| 47 
> +
>  2 files changed, 51 insertions(+)
> 
> diff --git a/arch/arm64/boot/dts/qcom/sc7280-idp.dts 
> b/arch/arm64/boot/dts/qcom/sc7280-idp.dts
> index 950ecb2..603f56b 100644
> --- a/arch/arm64/boot/dts/qcom/sc7280-idp.dts
> +++ b/arch/arm64/boot/dts/qcom/sc7280-idp.dts
> @@ -26,6 +26,10 @@
>   status = "okay";
>  };
>  
> +&remoteproc_wpss {
> + status = "okay";
> +};
> +
>   {
>   status = "okay";
>  };
> diff --git a/arch/arm64/boot/dts/qcom/sc7280.dtsi 
> b/arch/arm64/boot/dts/qcom/sc7280.dtsi
> index 8af6d77..26dd466 100644
> --- a/arch/arm64/boot/dts/qcom/sc7280.dtsi
> +++ b/arch/arm64/boot/dts/qcom/sc7280.dtsi
> @@ -53,6 +53,16 @@
>   no-map;
>   reg = <0x0 0x80b0 0x0 0x10>;
>   };
> +
> + wlan_fw_mem: memory@80c0 {
> + no-map;
> + reg = <0x0 0x80c0 0x0 0xc0>;
> + };
> +
> + wpss_mem: memory@9ae0 {
> + no-map;
> + reg = <0x0 0x9ae0 0x0 0x190>;
> + };
>   };
>  
>   cpus {
> @@ -305,6 +315,43 @@
>   };
>   };
>  
> + remoteproc_wpss: remoteproc@8a0 {
> + compatible = "qcom,sc7280-wpss-pil";
> + reg = <0 0x08a0 0 0x1>;
> +
> + interrupts-extended = <&intc GIC_SPI 587 
> IRQ_TYPE_EDGE_RISING>,
> +   <&wpss_smp2p_in 0 0>,

IRQ_TYPE_NONE?

Apart from that this looks good.

Regards,
Bjorn

> +   <&wpss_smp2p_in 1 0>,
> +   <&wpss_smp2p_in 2 0>,
> +   <&wpss_smp2p_in 3 0>,
> +   <&wpss_smp2p_in 7 0>;
> + interrupt-names = "wdog", "fatal", "ready", "handover",
> +   "stop-ack", "shutdown-ack";
> +
> + memory-region = <&wpss_mem>;
> +
> + qcom,smem-states = <&wpss_smp2p_out 0>;
> + qcom,smem-state-names = "stop";
> +
> + resets = <&aoss_reset AOSS_CC_WCSS_RESTART>;
> + reset-names = "restart";
> +
> + qcom,halt-regs = <&tcsr_mutex_regs 0x37000>;
> +
> + status = "disabled";
> +
> + glink-edge {
> + interrupts-extended = <&ipcc IPCC_CLIENT_WPSS
> +  
> IPCC_MPROC_SIGNAL_GLINK_QMP
> +  
> IRQ_TYPE_EDGE_RISING>;
> + mboxes = <&ipcc IPCC_CLIENT_WPSS
> + IPCC_MPROC_SIGNAL_GLINK_QMP>;
> +
> + label = "wpss";
> + qcom,remote-pid = <13>;
> + };
> + };
> +
>   pdc: interrupt-controller@b22 {
>   compatible = "qcom,sc7280-pdc", "qcom,pdc";
>   reg = <0 0x0b22 0 0x3>;
> -- 
> 2.7.4
> 


Re: [PATCH] net: add net namespace inode for all net_dev events

2021-03-10 Thread Steven Rostedt
On Wed, 10 Mar 2021 17:03:40 +0800
Tony Lu  wrote:

> On Tue, Mar 09, 2021 at 12:40:11PM -0500, Steven Rostedt wrote:


> > The above shows 10 bytes wasted for this event.
> > 


> 
> I use pahole to read vmlinux.o directly with defconfig and
> CONFIG_DEBUG_INFO enabled; the result shows 22 structs prefixed with
> trace_event_raw_ that have at least one hole.
> 

> 
> pahole shows there are 5 holes with 10 bytes in net_dev_start_xmit event.
> 

Oh, an I'm glad that my analysis matched with pahole ;-)

-- Steve


Re: [RFC v2 5/5] clk: socfpga: allow compile testing of Stratix 10 / Agilex clocks

2021-03-10 Thread Arnd Bergmann
On Wed, Mar 10, 2021 at 9:38 AM Krzysztof Kozlowski
 wrote:
> --- a/drivers/clk/socfpga/Kconfig
> +++ b/drivers/clk/socfpga/Kconfig
> @@ -1,6 +1,17 @@
>  # SPDX-License-Identifier: GPL-2.0
> +config COMMON_CLK_SOCFPGA
> +   bool "Intel SoCFPGA family clock support" if COMPILE_TEST && 
> !ARCH_SOCFPGA && !ARCH_SOCFPGA64
> +   depends on ARCH_SOCFPGA || ARCH_SOCFPGA64 || COMPILE_TEST
> +   default y if ARCH_SOCFPGA || ARCH_SOCFPGA64

I think the 'depends on' line here is redundant if you also have the
'if' line and the default.

Arnd


[PATCH V1 6/6] arm64: dts: qcom: sm8150: Add Data Capture and Compare(DCC) support node

2021-03-10 Thread Souradeep Chowdhury
Add the DCC (Data Capture and Compare) device tree node entry along with
the addresses of its register regions.

Signed-off-by: Souradeep Chowdhury 
---
 arch/arm64/boot/dts/qcom/sm8150.dtsi | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/sm8150.dtsi 
b/arch/arm64/boot/dts/qcom/sm8150.dtsi
index e5bb17b..5c564b1 100644
--- a/arch/arm64/boot/dts/qcom/sm8150.dtsi
+++ b/arch/arm64/boot/dts/qcom/sm8150.dtsi
@@ -654,6 +654,13 @@
interrupts = ;
};
 
+   dcc@10a2000 {
+   compatible = "qcom,sm8150-dcc", "qcom,dcc";
+   reg = <0x0 0x010a2000 0x0 0x1000>,
+ <0x0 0x010ad000 0x0 0x3000>;
+   reg-names = "dcc-base", "dcc-ram-base";
+   };
+
ufs_mem_hc: ufshc@1d84000 {
compatible = "qcom,sm8150-ufshc", "qcom,ufshc",
 "jedec,ufs-2.0";
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



[PATCH V1 5/6] MAINTAINERS: Add the entry for DCC(Data Capture and Compare) driver support

2021-03-10 Thread Souradeep Chowdhury
Add the entries for all the files added as part of driver support for
DCC (Data Capture and Compare).

Signed-off-by: Souradeep Chowdhury 
---
 MAINTAINERS | 8 
 1 file changed, 8 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index d92f85c..fb28218 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4969,6 +4969,14 @@ F:   include/linux/tfrc.h
 F: include/uapi/linux/dccp.h
 F: net/dccp/
 
+QTI DCC DRIVER
+M: Souradeep Chowdhury 
+L: linux-arm-...@vger.kernel.org
+S: Maintained
+F: Documentation/ABI/testing/sysfs-driver-dcc
+F: Documentation/devicetree/bindings/arm/msm/qcom,dcc.yaml
+F: drivers/soc/qcom/dcc.c
+
 DECnet NETWORK LAYER
 L: linux-decnet-u...@lists.sourceforge.net
 S: Orphan
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



[PATCH V1 3/6] soc: qcom: dcc: Add the sysfs variables to the Data Capture and Compare driver(DCC)

2021-03-10 Thread Souradeep Chowdhury
Add the sysfs variables to expose user space functionality such as
enabling and disabling DCC, configuring addresses and issuing software
triggers.  Also add the necessary methods backing them.

Signed-off-by: Souradeep Chowdhury 
---
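
As a generic sketch of the pattern behind each of these knobs
(illustrative only, not this driver's code):

    static ssize_t enable_store(struct device *dev,
                                struct device_attribute *attr,
                                const char *buf, size_t count)
    {
            bool val;
            int ret;

            ret = kstrtobool(buf, &val);
            if (ret)
                    return ret;
            /* ... enable or disable the DCC hardware here ... */
            return count;
    }
    static DEVICE_ATTR_WO(enable);
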
 drivers/soc/qcom/dcc.c | 1179 +++-
 1 file changed, 1171 insertions(+), 8 deletions(-)

diff --git a/drivers/soc/qcom/dcc.c b/drivers/soc/qcom/dcc.c
index 89816bf..61e5225 100644
--- a/drivers/soc/qcom/dcc.c
+++ b/drivers/soc/qcom/dcc.c
@@ -17,12 +17,30 @@
 #include 
 #include 
 
+
+#define TIMEOUT_US 100
+
+#define dcc_writel(drvdata, val, off)  \
+   writel((val), drvdata->base + dcc_offset_conv(drvdata, off))
 #define dcc_readl(drvdata, off)
\
readl(drvdata->base + dcc_offset_conv(drvdata, off))
 
+#define dcc_sram_readl(drvdata, off)   \
+   readl(drvdata->ram_base + off)
+
 /* DCC registers */
 #define DCC_HW_INFO0x04
 #define DCC_LL_NUM_INFO0x10
+#define DCC_STATUS 0x1C
+#define DCC_LL_LOCK(m) (0x34 + 0x80 * m)
+#define DCC_LL_CFG(m)  (0x38 + 0x80 * m)
+#define DCC_LL_BASE(m) (0x3c + 0x80 * m)
+#define DCC_FD_BASE(m) (0x40 + 0x80 * m)
+#define DCC_LL_TIMEOUT(m)  (0x44 + 0x80 * m)
+#define DCC_LL_INT_ENABLE(m)   (0x4C + 0x80 * m)
+#define DCC_LL_INT_STATUS(m)   (0x50 + 0x80 * m)
+#define DCC_LL_SW_TRIGGER(m)   (0x60 + 0x80 * m)
+#define DCC_LL_BUS_ACCESS_STATUS(m)(0x64 + 0x80 * m)
 
 #define DCC_MAP_LEVEL1 0x18
 #define DCC_MAP_LEVEL2 0x34
@@ -36,24 +54,38 @@
 #define DCC_FIX_LOOP_OFFSET16
 #define DCC_VER_INFO_BIT   9
 
+#define DCC_READ   0
+#define DCC_WRITE  1
+#define DCC_LOOP   2
+#define DCC_READ_WRITE 3
+
+#define MAX_DCC_OFFSET GENMASK(9, 2)
+#define MAX_DCC_LENGENMASK(6, 0)
+#define MAX_LOOP_CNT   GENMASK(7, 0)
+
+#define DCC_ADDR_DESCRIPTOR0x00
+#define DCC_LOOP_DESCRIPTORBIT(30)
+#define DCC_RD_MOD_WR_DESCRIPTOR   BIT(31)
+#define DCC_LINK_DESCRIPTORGENMASK(31, 30)
+
+#define DCC_READ_IND   0x00
+#define DCC_WRITE_IND  (BIT(28))
+
+#define DCC_AHB_IND0x00
+#define DCC_APB_INDBIT(29)
+
 #define DCC_MAX_LINK_LIST  8
 #define DCC_INVALID_LINK_LIST  GENMASK(7, 0)
 
 #define DCC_VER_MASK1  GENMASK(6, 0)
 #define DCC_VER_MASK2  GENMASK(5, 0)
 
-#define DCC_RD_MOD_WR_ADDR 0xC105E
+#define DCC_RD_MOD_WR_ADDR  0xC105E
 
 struct qcom_dcc_config {
const int dcc_ram_offset;
 };
 
-enum dcc_mem_map_ver {
-   DCC_MEM_MAP_VER1,
-   DCC_MEM_MAP_VER2,
-   DCC_MEM_MAP_VER3
-};
-
 enum dcc_descriptor_type {
DCC_ADDR_TYPE,
DCC_LOOP_TYPE,
@@ -61,6 +93,12 @@ enum dcc_descriptor_type {
DCC_WRITE_TYPE
 };
 
+enum dcc_mem_map_ver {
+   DCC_MEM_MAP_VER1,
+   DCC_MEM_MAP_VER2,
+   DCC_MEM_MAP_VER3
+};
+
 struct dcc_config_entry {
u32 base;
u32 offset;
@@ -98,6 +136,22 @@ struct dcc_drvdata {
u8  loopoff;
 };
 
+struct dcc_cfg_attr {
+   u32 addr;
+   u32 prev_addr;
+   u32 prev_off;
+   u32 link;
+   u32 sram_offset;
+};
+
+struct dcc_cfg_loop_attr {
+   u32 loop;
+   boolloop_start;
+   u32 loop_cnt;
+   u32 loop_len;
+   u32 loop_off;
+};
+
 static u32 dcc_offset_conv(struct dcc_drvdata *drvdata, u32 off)
 {
if (drvdata->mem_map_ver == DCC_MEM_MAP_VER1) {
@@ -114,6 +168,832 @@ static u32 dcc_offset_conv(struct dcc_drvdata *drvdata, 
u32 off)
return off;
 }
 
+static int dcc_sram_writel(struct dcc_drvdata *drvdata,
+   u32 val, u32 off)
+{
+   if (unlikely(off > (drvdata->ram_size - 4)))
+   return -EINVAL;
+
+   writel((val), drvdata->ram_base + off);
+
+   return 0;
+}
+
+static bool dcc_ready(struct dcc_drvdata *drvdata)
+{
+   u32 val;
+
+   /* poll until DCC ready */
+   if (!readl_poll_timeout((drvdata->base + DCC_STATUS), val,
+   (FIELD_GET(GENMASK(1, 0), val) == 0), 1, 
TIMEOUT_US))
+   return true;
+
+   return false;
+}
+
+static int dcc_read_status(struct dcc_drvdata *drvdata)
+{
+   int curr_list;
+   u32 bus_status;
+   u32 ll_cfg = 0;
+   u32 tmp_ll_cfg = 0;
+
+   for (curr_list = 0; curr_list < drvdata->nr_link_list; curr_list++) {
+   if (!drvdata->enable[curr_list])
+

[PATCH V1 1/6] dt-bindings: Added the yaml bindings for DCC

2021-03-10 Thread Souradeep Chowdhury
Documentation for Data Capture and Compare (DCC) device tree bindings
in YAML format.

Signed-off-by: Souradeep Chowdhury 
---
 .../devicetree/bindings/arm/msm/qcom,dcc.yaml  | 49 ++
 1 file changed, 49 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/arm/msm/qcom,dcc.yaml

diff --git a/Documentation/devicetree/bindings/arm/msm/qcom,dcc.yaml 
b/Documentation/devicetree/bindings/arm/msm/qcom,dcc.yaml
new file mode 100644
index 000..b8e9998
--- /dev/null
+++ b/Documentation/devicetree/bindings/arm/msm/qcom,dcc.yaml
@@ -0,0 +1,49 @@
+# SPDX-License-Identifier: (GPL-2.0-or-later OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/arm/msm/qcom,dcc.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Data Capture and Compare
+
+maintainers:
+  - Souradeep Chowdhury 
+
+description: |
+DCC (Data Capture and Compare) is a DMA engine which is used to save
+configuration data or system memory contents during catastrophic failure
+or SW trigger. DCC is used to capture and store data for debugging purposes.
+
+
+properties:
+  compatible:
+items:
+  - enum:
+  - qcom,sm8150-dcc
+  - const: qcom,dcc
+
+  reg:
+items:
+  - description: DCC base register region
+  - description: DCC RAM base register region
+
+  reg-names:
+items:
+  - const: dcc-base
+  - const: dcc-ram-base
+
+required:
+  - compatible
+  - reg
+  - reg-names
+
+additionalProperties: false
+
+examples:
+  - |
+dcc@10a2000{
+compatible = "qcom,sm8150-dcc","qcom,dcc";
+reg = <0x010a2000  0x1000>,
+  <0x010ad000  0x2000>;
+reg-names = "dcc-base", "dcc-ram-base";
+};
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



[PATCH V1 4/6] DCC: Added the sysfs entries for DCC(Data Capture and Compare) driver

2021-03-10 Thread Souradeep Chowdhury
The DCC is a DMA engine designed to store register values either in
case of a system crash or on software triggers manually issued by
the user. Using the DCC hardware and the sysfs interface of the driver
the user can exploit various functionalities of DCC. The user can specify
the register addresses, the values of which are stored by DCC in its
dedicated SRAM. The register addresses can be used either to read from,
write to, first read and then write to, or loop over. All these
options can be exploited using the sysfs interface given to the user.
Following are the sysfs interfaces exposed in DCC driver which are
documented
1)trigger
2)config
3)config_write
4)config_reset
5)enable
6)rd_mod_wr
7)loop

Signed-off-by: Souradeep Chowdhury 
---
 Documentation/ABI/testing/sysfs-driver-dcc | 74 ++
 1 file changed, 74 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-driver-dcc

diff --git a/Documentation/ABI/testing/sysfs-driver-dcc 
b/Documentation/ABI/testing/sysfs-driver-dcc
new file mode 100644
index 000..7a855ca
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-driver-dcc
@@ -0,0 +1,74 @@
+What:   /sys/bus/platform/devices/.../trigger
+Date:   February 2021
+Contact:Souradeep Chowdhury 
+Description:
+   This file allows the software trigger to be enabled
+   by the user through the sysfs interface.  Through this
+   interface the user can manually start a software trigger
+   in DCC, whereby the DCC driver stores the current values
+   of the specified registers in the DCC SRAM.
+
+What:   /sys/bus/platform/devices/.../enable
+Date:   February 2021
+Contact:Souradeep Chowdhury 
+Description:
+   This file allows the user to manually enable or
+   disable the DCC driver.  The DCC hardware needs to be
+   enabled before use.
+
+What:   /sys/bus/platform/devices/.../config
+Date:   February 2021
+Contact:Souradeep Chowdhury 
+Description:
+   This file allows the user to configure the register
+   addresses for the DCC driver.  These register addresses
+   are used to read from, write to or loop through.
+   Separate sysfs files have been created to enable all
+   these options.
+
+What:   /sys/bus/platform/devices/.../config_write
+Date:   February 2021
+Contact:Souradeep Chowdhury 
+Description:
+   This file allows the user to write a value to the
+   register address given as argument.  The values are
+   entered in the form <register address> <value>.
+
+What:   /sys/bus/platform/devices/.../config_reset
+Date:   February 2021
+Contact:Souradeep Chowdhury 
+Description:
+   This file is used to reset the configuration of
+   the DCC driver to the default configuration.
+
+What:   /sys/bus/platform/devices/.../loop
+Date:   February 2021
+Contact:Souradeep Chowdhury 
+Description:
+   This file is used to enter the loop count, as the DCC
+   driver gives the option to loop multiple times on
+   the same register and store the values for each
+   loop.
+
+What:   /sys/bus/platform/devices/.../rd_mod_wr
+Date:   February 2021
+Contact:Souradeep Chowdhury 
+Description:
+   This file is used to read the value of the register
+   and then write the value given as an argument to the
+   register address in config.  The argument should be
+   given in the form <mask> <value>.
+
+What:   /sys/bus/platform/devices/.../ready
+Date:   February 2021
+Contact:Souradeep Chowdhury 
+Description:
+   This file is used to check whether the DCC
+   hardware is ready to take inputs.
+
+What:  /sys/bus/platform/devices/.../curr_list
+Date:  February 2021
+Contact:   Souradeep Chowdhury 
+Description:
+   This file is used to configure the linked-list data
+   to be used while configuring addresses.
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



[PATCH V1 2/6] soc: qcom: dcc: Add driver support for Data Capture and Compare unit(DCC)

2021-03-10 Thread Souradeep Chowdhury
The DCC is a DMA Engine designed to capture and store data
during system crash or software triggers. The DCC operates
based on linked-list entries which provide it with data and
addresses and the function it needs to perform. These
functions are read, write and loop. Add the basic driver
in this patch, which contains a probe method that instantiates
the resources needed by the driver. DCC has its own SRAM which
needs to be mapped at probe time as well.

Signed-off-by: Souradeep Chowdhury 
---
 drivers/soc/qcom/Kconfig  |   8 +
 drivers/soc/qcom/Makefile |   1 +
 drivers/soc/qcom/dcc.c| 388 ++
 3 files changed, 397 insertions(+)
 create mode 100644 drivers/soc/qcom/dcc.c

diff --git a/drivers/soc/qcom/Kconfig b/drivers/soc/qcom/Kconfig
index 79b568f..8819e0b 100644
--- a/drivers/soc/qcom/Kconfig
+++ b/drivers/soc/qcom/Kconfig
@@ -69,6 +69,14 @@ config QCOM_LLCC
  SDM845. This provides interfaces to clients that use the LLCC.
  Say yes here to enable LLCC slice driver.
 
+config QCOM_DCC
+   tristate "Qualcomm Technologies, Inc. Data Capture and Compare engine 
driver"
+   depends on ARCH_QCOM || COMPILE_TEST
+   help
+ This option enables driver for Data Capture and Compare engine. DCC
+ driver provides interface to configure DCC block and read back
+ captured data from DCC's internal SRAM.
+
 config QCOM_KRYO_L2_ACCESSORS
bool
depends on ARCH_QCOM && ARM64 || COMPILE_TEST
diff --git a/drivers/soc/qcom/Makefile b/drivers/soc/qcom/Makefile
index ad675a6..1b00870 100644
--- a/drivers/soc/qcom/Makefile
+++ b/drivers/soc/qcom/Makefile
@@ -26,3 +26,4 @@ obj-$(CONFIG_QCOM_LLCC) += llcc-qcom.o
 obj-$(CONFIG_QCOM_RPMHPD) += rpmhpd.o
 obj-$(CONFIG_QCOM_RPMPD) += rpmpd.o
 obj-$(CONFIG_QCOM_KRYO_L2_ACCESSORS) +=kryo-l2-accessors.o
+obj-$(CONFIG_QCOM_DCC) += dcc.o
diff --git a/drivers/soc/qcom/dcc.c b/drivers/soc/qcom/dcc.c
new file mode 100644
index 000..89816bf
--- /dev/null
+++ b/drivers/soc/qcom/dcc.c
@@ -0,0 +1,388 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2015-2021, The Linux Foundation. All rights reserved.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define dcc_readl(drvdata, off)
\
+   readl(drvdata->base + dcc_offset_conv(drvdata, off))
+
+/* DCC registers */
+#define DCC_HW_INFO0x04
+#define DCC_LL_NUM_INFO0x10
+
+#define DCC_MAP_LEVEL1 0x18
+#define DCC_MAP_LEVEL2 0x34
+#define DCC_MAP_LEVEL3 0x4C
+
+#define DCC_MAP_OFFSET10x10
+#define DCC_MAP_OFFSET20x18
+#define DCC_MAP_OFFSET30x1C
+#define DCC_MAP_OFFSET40x8
+
+#define DCC_FIX_LOOP_OFFSET16
+#define DCC_VER_INFO_BIT   9
+
+#define DCC_MAX_LINK_LIST  8
+#define DCC_INVALID_LINK_LIST  GENMASK(7, 0)
+
+#define DCC_VER_MASK1  GENMASK(6, 0)
+#define DCC_VER_MASK2  GENMASK(5, 0)
+
+#define DCC_RD_MOD_WR_ADDR 0xC105E
+
+struct qcom_dcc_config {
+   const int dcc_ram_offset;
+};
+
+enum dcc_mem_map_ver {
+   DCC_MEM_MAP_VER1,
+   DCC_MEM_MAP_VER2,
+   DCC_MEM_MAP_VER3
+};
+
+enum dcc_descriptor_type {
+   DCC_ADDR_TYPE,
+   DCC_LOOP_TYPE,
+   DCC_READ_WRITE_TYPE,
+   DCC_WRITE_TYPE
+};
+
+struct dcc_config_entry {
+   u32 base;
+   u32 offset;
+   u32 len;
+   u32 index;
+   u32 loop_cnt;
+   u32 write_val;
+   u32 mask;
+   boolapb_bus;
+   enum dcc_descriptor_typedesc_type;
+   struct list_headlist;
+};
+
+struct dcc_drvdata {
+   void __iomem*base;
+   u32 reg_size;
+   struct device   *dev;
+   struct mutexmutex;
+   void __iomem*ram_base;
+   u32 ram_size;
+   u32 ram_offset;
+   enum dcc_mem_map_vermem_map_ver;
+   u32 ram_cfg;
+   u32 ram_start;
+   bool*enable;
+   bool*configured;
+   boolinterrupt_disable;
+   char*sram_node;
+   struct cdev sram_dev;
+   struct class*sram_class;
+   struct list_head*cfg_head;
+   u32 *nr_config;
+   u32 nr_link_list;
+   u8 

[PATCH V1 0/6] Add driver support for Data Capture and Compare Engine(DCC) for SM8150

2021-03-10 Thread Souradeep Chowdhury
DCC (Data Capture and Compare) is a DMA engine designed for debugging purposes.
In case of a system crash or a manual software trigger by the user, the DCC
hardware stores the values at the configured register addresses, which can then
be used for debugging. The DCC driver provides the user with a sysfs interface
to configure the register addresses. The options that the DCC hardware provides
include reading from registers, writing to registers, first reading and then
writing to registers, and looping through the values of the same register.

In certain cases a register write needs to be executed for accessing the rest 
of the registers, also
the user might want to record the changing values of a particular register with 
time for which he has
the option to use the loop feature. The options mentioned above are exposed to 
the user by sysfs files
once the driver is probed. The details and usage of this sysfs files are 
documented in
Documentation/ABI/testing/sysfs-driver-dcc. As an example if a user wants to 
configure to store 100 words
starting from address 0x8050 he should give inputs as following to the 
sysfs config file:-

echo  0x8050 100 > /sys/bus/platform/devices/.../config

Similarly if the user wants to write to a register using DCC hardware he should 
give following input to
config_write sysfs file:-
echo 0x8000 0xFF > /sys/bus/platform/devices/10a2000.dcc/config_write
All this read and write occurs at crash time or if the user manually invokes a 
software trigger.
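
For reference, reading back the captured words after a trigger could then look
roughly like this from user space (a minimal sketch only; the SRAM character
device node name /dev/dcc_sram and the 32-bit word layout are assumptions for
illustration, not something this cover letter specifies):

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Hypothetical device node exposed by the DCC SRAM cdev. */
	int fd = open("/dev/dcc_sram", O_RDONLY);
	uint32_t word;

	if (fd < 0)
		return 1;
	/* Each captured register value is assumed to be one 32-bit word. */
	while (read(fd, &word, sizeof(word)) == sizeof(word))
		printf("0x%08x\n", word);
	close(fd);
	return 0;
}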

Souradeep Chowdhury (6):
  dt-bindings: Added the yaml bindings for DCC
  soc: qcom: dcc: Add driver support for Data Capture and Compare
unit(DCC)
  soc: qcom: dcc: Add the sysfs variables to the Data Capture and
Compare driver(DCC)
  DCC: Added the sysfs entries for DCC(Data Capture and Compare) driver
  MAINTAINERS: Add the entry for DCC(Data Capture and Compare) driver
support
  arm64: dts: qcom: sm8150: Add Data Capture and Compare(DCC) support
node

 Documentation/ABI/testing/sysfs-driver-dcc |   74 +
 .../devicetree/bindings/arm/msm/qcom,dcc.yaml  |   49 +
 MAINTAINERS|8 +
 arch/arm64/boot/dts/qcom/sm8150.dtsi   |7 +
 drivers/soc/qcom/Kconfig   |8 +
 drivers/soc/qcom/Makefile  |1 +
 drivers/soc/qcom/dcc.c | 1551 
 7 files changed, 1698 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-driver-dcc
 create mode 100644 Documentation/devicetree/bindings/arm/msm/qcom,dcc.yaml
 create mode 100644 drivers/soc/qcom/dcc.c

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



Re: [PATCH 1/2] dt-bindings: remoteproc: qcom: Add SC7280 WPSS support

2021-03-10 Thread Bjorn Andersson
On Wed 10 Mar 01:28 CST 2021, Rakesh Pillai wrote:

> Add WPSS PIL loading support for SC7280 SoCs.
> 

Acked-by: Bjorn Andersson 

But can you please follow up with a patch that converts this to yaml?

Regards,
Bjorn

> Signed-off-by: Rakesh Pillai 
> ---
>  .../bindings/remoteproc/qcom,hexagon-v56.txt   | 35 
> --
>  1 file changed, 20 insertions(+), 15 deletions(-)
> 
> diff --git 
> a/Documentation/devicetree/bindings/remoteproc/qcom,hexagon-v56.txt 
> b/Documentation/devicetree/bindings/remoteproc/qcom,hexagon-v56.txt
> index 1337a3d..edad5e8 100644
> --- a/Documentation/devicetree/bindings/remoteproc/qcom,hexagon-v56.txt
> +++ b/Documentation/devicetree/bindings/remoteproc/qcom,hexagon-v56.txt
> @@ -9,6 +9,7 @@ on the Qualcomm Technology Inc. Hexagon v56 core.
>   Definition: must be one of:
>   "qcom,qcs404-cdsp-pil",
>   "qcom,sdm845-adsp-pil"
> + "qcom,sc7280-wpss-pil"
>  
>  - reg:
>   Usage: required
> @@ -24,7 +25,13 @@ on the Qualcomm Technology Inc. Hexagon v56 core.
>  - interrupt-names:
>   Usage: required
>   Value type: <stringlist>
> - Definition: must be "wdog", "fatal", "ready", "handover", "stop-ack"
> + Definition: The interrupts needed depends on the compatible string
> + qcom,sdm845-adsp-pil:
> + qcom,qcs404-cdsp-pil:
> + must be "wdog", "fatal", "ready", "handover", "stop-ack"
> + qcom,sc7280-wpss-pil:
> + must be "wdog", "fatal", "ready", "handover", "stop-ack"
> + "shutdown-ack"
>  
>  - clocks:
>   Usage: required
> @@ -35,19 +42,17 @@ on the Qualcomm Technology Inc. Hexagon v56 core.
>  - clock-names:
>   Usage: required for SDM845 ADSP
>   Value type: <stringlist>
> - Definition: List of clock input name strings sorted in the same
> - order as the clocks property. Definition must have
> - "xo", "sway_cbcr", "lpass_ahbs_aon_cbcr",
> - "lpass_ahbm_aon_cbcr", "qdsp6ss_xo", "qdsp6ss_sleep"
> - and "qdsp6ss_core".
> -
> -- clock-names:
> - Usage: required for QCS404 CDSP
> - Value type: <stringlist>
> - Definition: List of clock input name strings sorted in the same
> - order as the clocks property. Definition must have
> - "xo", "sway", "tbu", "bimc", "ahb_aon", "q6ss_slave",
> - "q6ss_master", "q6_axim".
> + Definition: The clocks needed depends on the compatible string
> + qcom,sdm845-adsp-pil:
> + must be "xo", "sway_cbcr", "lpass_ahbs_aon_cbcr",
> + "lpass_ahbm_aon_cbcr", "qdsp6ss_xo", "qdsp6ss_sleep",
> + "qdsp6ss_core"
> + qcom,qcs404-cdsp-pil:
> + must be "xo", "sway", "tbu", "bimc", "ahb_aon", "q6ss_slave",
> + "q6ss_master", "q6_axim"
> + qcom,sc7280-wpss-pil:
> + must be "gcc_wpss_ahb_bdg_mst_clk", "gcc_wpss_ahb_clk",
> + "gcc_wpss_rscp_clk"
>  
>  - power-domains:
>   Usage: required
> @@ -65,7 +70,7 @@ on the Qualcomm Technology Inc. Hexagon v56 core.
>  Definition: must be "pdc_sync" and "cc_lpass"
>  
>  - reset-names:
> -Usage: required for QCS404 CDSP
> +Usage: required for QCS404 CDSP, SC7280 WPSS
>  Value type: <stringlist>
>  Definition: must be "restart"
>  
> -- 
> 2.7.4
> 


Re: [RFC PATCH 0/3] hugetlb: add demote/split page functionality

2021-03-10 Thread Zi Yan
On 10 Mar 2021, at 11:23, Michal Hocko wrote:

> On Mon 08-03-21 16:18:52, Mike Kravetz wrote:
> [...]
>> Converting larger to smaller hugetlb pages can be accomplished today by
>> first freeing the larger page to the buddy allocator and then allocating
>> the smaller pages.  However, there are two issues with this approach:
>> 1) This process can take quite some time, especially if allocation of
>>the smaller pages is not immediate and requires migration/compaction.
>> 2) There is no guarantee that the total size of smaller pages allocated
>>will match the size of the larger page which was freed.  This is
>>because the area freed by the larger page could quickly be
>>fragmented.
>
> It will likely not surprise you that I have some level of reservation. While
> your concerns about reconfiguration via the existing interfaces are quite
> real, is this really a problem in practice? How often do you need such a
> reconfiguration?
>
> Is this all really worth the additional code to something as tricky as
> hugetlb code base?
>
>>  include/linux/hugetlb.h |   8 ++
>>  mm/hugetlb.c| 199 +++-
>>  2 files changed, 204 insertions(+), 3 deletions(-)
>>
>> -- 
>> 2.29.2
>>

The high-level goal of this patchset seems to be to enable flexible huge page
allocation from a single pool when multiple huge page sizes are available
to use. The limitation of the existing mechanism is that the user has to
specify how many huge pages and how many gigantic pages he/she wants
before the actual use.

I just want to throw an idea here, please ignore if it is too crazy.
Could we have a variant buddy allocator for huge page allocations,
which only has available huge page orders in the free list? For example,
if the user wants 2MB and 1GB pages, the allocator will only have order-9 and
order-18 pages; when order-9 pages run out, we can split order-18 pages;
if possible, adjacent order-9 pages can be merged back to order-18 pages.
A sketch of the split step follows below.
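
To make that concrete, the split step could look roughly like this
(hypothetical pool and free-list helpers; order-9 = 2MB and order-18 = 1GB
with 4K base pages):

/*
 * Sketch only: split one free 1GB (order-18) page into 512 2MB
 * (order-9) pages when the order-9 free list runs dry.  hp_pool,
 * hp_take_first() and hp_free_list_add() are made-up names.
 */
static int hp_split_gigantic(struct hp_pool *pool)
{
	struct page *page = hp_take_first(&pool->free[18]);
	unsigned long i, nr = 1UL << (18 - 9);	/* 512 pieces */

	if (!page)
		return -ENOMEM;

	for (i = 0; i < nr; i++)
		hp_free_list_add(&pool->free[9], page + (i << 9));

	return 0;
}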


—
Best Regards,
Yan Zi




Re: [PATCH v9 5/6] KVM: arm64: ioctl to fetch/store tags in a guest

2021-03-10 Thread Steven Price

On 09/03/2021 17:57, Marc Zyngier wrote:

On Mon, 01 Mar 2021 14:23:14 +,
Steven Price  wrote:


The VMM may not wish to have its own mapping of guest memory mapped
with PROT_MTE because this causes problems if the VMM has tag checking
enabled (the guest controls the tags in physical RAM and it's unlikely
the tags are correct for the VMM).

Instead add a new ioctl which allows the VMM to easily read/write the
tags from guest memory, allowing the VMM's mapping to be non-PROT_MTE
while the VMM can still read/write the tags for the purpose of
migration.

Signed-off-by: Steven Price 
---
  arch/arm64/include/uapi/asm/kvm.h | 13 +++
  arch/arm64/kvm/arm.c  | 57 +++
  include/uapi/linux/kvm.h  |  1 +
  3 files changed, 71 insertions(+)

diff --git a/arch/arm64/include/uapi/asm/kvm.h 
b/arch/arm64/include/uapi/asm/kvm.h
index 24223adae150..5fc2534ac5df 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -184,6 +184,19 @@ struct kvm_vcpu_events {
__u32 reserved[12];
  };
  
+struct kvm_arm_copy_mte_tags {

+   __u64 guest_ipa;
+   __u64 length;
+   union {
+   void __user *addr;
+   __u64 padding;
+   };
+   __u64 flags;


I'd be keen on a couple of reserved __64s. Just in case...


Fair enough, I'll add a __u64 reserved[2];


+};
+
+#define KVM_ARM_TAGS_TO_GUEST  0
+#define KVM_ARM_TAGS_FROM_GUEST		1
+
  /* If you need to interpret the index values, here is the key: */
  #define KVM_REG_ARM_COPROC_MASK	0x000000000FFF0000
  #define KVM_REG_ARM_COPROC_SHIFT  16
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 46bf319f6cb7..01d404833e24 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -1297,6 +1297,53 @@ static int kvm_vm_ioctl_set_device_addr(struct kvm *kvm,
}
  }
  
+static int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm,

+ struct kvm_arm_copy_mte_tags *copy_tags)
+{
+   gpa_t guest_ipa = copy_tags->guest_ipa;
+   size_t length = copy_tags->length;
+   void __user *tags = copy_tags->addr;
+   gpa_t gfn;
+   bool write = !(copy_tags->flags & KVM_ARM_TAGS_FROM_GUEST);
+
+   if (copy_tags->flags & ~KVM_ARM_TAGS_FROM_GUEST)
+   return -EINVAL;
+
+   if (length & ~PAGE_MASK || guest_ipa & ~PAGE_MASK)
+   return -EINVAL;


It is a bit odd to require userspace to provide a page-aligned
addr/size, as it now has to find out about the kernel's page
size. MTE_GRANULE_SIZE-aligned values would make more sense. Is there
an underlying reason for this?


No fundamental reason, my thoughts were:

 * It's likely user space is naturally going to be using page-aligned 
quantities during migration, so it already has to care about this.


 * It makes the loop below easier.

 * It's easy to relax the restriction in the future if it becomes a 
problem, much harder to tighten it without breaking anything.


But I can switch to MTE_GRANULE_SIZE if you'd prefer, let me know.


+
+   gfn = gpa_to_gfn(guest_ipa);
+
+   while (length > 0) {
+   kvm_pfn_t pfn = gfn_to_pfn_prot(kvm, gfn, write, NULL);
+   void *maddr;
+   unsigned long num_tags = PAGE_SIZE / MTE_GRANULE_SIZE;
+
+   if (is_error_noslot_pfn(pfn))
+   return -ENOENT;
+
+   maddr = page_address(pfn_to_page(pfn));
+
+   if (!write) {
+   num_tags = mte_copy_tags_to_user(tags, maddr, num_tags);
+   kvm_release_pfn_clean(pfn);
+   } else {
+   num_tags = mte_copy_tags_from_user(maddr, tags,
+  num_tags);
+   kvm_release_pfn_dirty(pfn);
+   }
+


Is it actually safe to do this without holding any lock, without
checking anything against the mmu_notifier_seq? What if the pages are
being swapped out? Or the memslot removed from under your feet?

It looks... dangerous. Do you even want to allow this while vcpus are
actually running?


Umm... yeah I'm not sure how I managed to forgot the locks. This should 
be holding kvm->slots_lock to prevent the slot going under our feet. I 
was surprised that lockdep didn't catch that, until I noticed I'd 
disabled it and discovered why (the model makes it incredibly slow). 
However I've done a run with it enabled now - and with the 
kvm->slots_lock taken it's happy.


gfn_to_pfn_prot() internally calls a variant of get_user_pages() - so 
swapping out shouldn't be a problem.


In terms of running with the vcpus running - given this is going to be 
used for migration I think that's pretty much a requirement. We want to 
be able to dump the tags while executing to enable early transfer of the 
memory.
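
For illustration, the VMM side of a migration pass would then be something
like the sketch below (the KVM_ARM_MTE_COPY_TAGS ioctl name is an assumption
here, and error handling is trimmed):

	struct kvm_arm_copy_mte_tags copy = {
		.guest_ipa = gpa,		/* page-aligned guest address */
		.length    = PAGE_SIZE,		/* page-aligned length */
		.addr      = tag_buf,		/* user buffer for the tags */
		.flags     = KVM_ARM_TAGS_FROM_GUEST,
	};

	/* Pull the tags for one guest page into tag_buf while vcpus run. */
	if (ioctl(vm_fd, KVM_ARM_MTE_COPY_TAGS, &copy) < 0)
		perror("KVM_ARM_MTE_COPY_TAGS");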


Steve


+   if (num_tags != PAGE_SIZE / MTE_GRANULE_SIZE)
+   

Re: [PATCH v2] mm: page_alloc: dump migrate-failed pages

2021-03-10 Thread Michal Hocko
On Wed 10-03-21 08:05:36, Minchan Kim wrote:
> On Wed, Mar 10, 2021 at 02:07:05PM +0100, Michal Hocko wrote:
[...]
> > The is a lot of churn indeed. Have you considered adding $FOO_lglvl
> > variants for those so that you can use them for your particular case
> > without affecting most of existing users? Something similar we have
> > discussed in other email thread regarding lru_add_drain_all?
> 
> I thought that way but didn't try it, since it couldn't make them
> atomic (for example, printk calls in other places/contexts would be
> affected by the $FOO_lglvl).

I do not follow. I meant something like the following (likely incomplete
but you should get an idea).

diff --git a/include/linux/page_owner.h b/include/linux/page_owner.h
index 3468794f83d2..71b402eb8f78 100644
--- a/include/linux/page_owner.h
+++ b/include/linux/page_owner.h
@@ -14,7 +14,7 @@ extern void __set_page_owner(struct page *page,
 extern void __split_page_owner(struct page *page, unsigned int nr);
 extern void __copy_page_owner(struct page *oldpage, struct page *newpage);
 extern void __set_page_owner_migrate_reason(struct page *page, int reason);
-extern void __dump_page_owner(struct page *page);
+extern void __dump_page_owner(struct page *page, const char *loglvl);
 extern void pagetypeinfo_showmixedcount_print(struct seq_file *m,
pg_data_t *pgdat, struct zone *zone);
 
@@ -46,10 +46,10 @@ static inline void set_page_owner_migrate_reason(struct page *page, int reason)
 	if (static_branch_unlikely(&page_owner_inited))
__set_page_owner_migrate_reason(page, reason);
 }
-static inline void dump_page_owner(struct page *page)
+static inline void dump_page_owner(struct page *page, const char *loglvl)
 {
 	if (static_branch_unlikely(&page_owner_inited))
-   __dump_page_owner(page);
+   __dump_page_owner(page, loglvl);
 }
 #else
 static inline void reset_page_owner(struct page *page, unsigned int order)
@@ -69,7 +69,7 @@ static inline void copy_page_owner(struct page *oldpage, struct page *newpage)
 static inline void set_page_owner_migrate_reason(struct page *page, int reason)
 {
 }
-static inline void dump_page_owner(struct page *page)
+static inline void dump_page_owner(struct page *page, const char *loglvl)
 {
 }
 #endif /* CONFIG_PAGE_OWNER */
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 9f8117c7cfdd..1b13135d9916 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -14,6 +14,18 @@
 #include 
 #include 
 
+void __stack_trace_print(const unsigned long *entries, unsigned int nr_entries,
+			 int spaces, const char *loglvl)
+{
+   unsigned int i;
+
+   if (WARN_ON(!entries))
+   return;
+
+   for (i = 0; i < nr_entries; i++)
+   printk("%s%*c%pS\n", loglvl, 1 + spaces, ' ', (void 
*)entries[i]);
+}
+
 /**
  * stack_trace_print - Print the entries in the stack trace
  * @entries:   Pointer to storage array
@@ -23,13 +35,7 @@
 void stack_trace_print(const unsigned long *entries, unsigned int nr_entries,
   int spaces)
 {
-   unsigned int i;
-
-   if (WARN_ON(!entries))
-   return;
-
-   for (i = 0; i < nr_entries; i++)
-   printk("%*c%pS\n", 1 + spaces, ' ', (void *)entries[i]);
+   __stack_trace_print(entries, nr_entries, spaces, KERN_DEFAULT);
 }
 EXPORT_SYMBOL_GPL(stack_trace_print);
 
diff --git a/mm/debug.c b/mm/debug.c
index 8a40b3fefbeb..b989ee2ffa89 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -42,7 +42,7 @@ const struct trace_print_flags vmaflag_names[] = {
{0, NULL}
 };
 
-void __dump_page(struct page *page, const char *reason)
+void __dump_page(struct page *page, const char *reason, const char *loglvl)
 {
struct page *head = compound_head(page);
struct address_space *mapping;
@@ -64,7 +64,7 @@ void __dump_page(struct page *page, const char *reason)
 * dump_page() when detected.
 */
if (page_poisoned) {
-   pr_warn("page:%px is uninitialized and poisoned", page);
+   printk("%spage:%px is uninitialized and poisoned", loglvl, 
page);
goto hex_only;
}
 
@@ -95,17 +95,17 @@ void __dump_page(struct page *page, const char *reason)
 */
mapcount = PageSlab(head) ? 0 : page_mapcount(page);
 
-   pr_warn("page:%p refcount:%d mapcount:%d mapping:%p index:%#lx 
pfn:%#lx\n",
+   printk("%spage:%p refcount:%d mapcount:%d mapping:%p index:%#lx 
pfn:%#lx\n", loglvl,
page, page_ref_count(head), mapcount, mapping,
page_to_pgoff(page), page_to_pfn(page));
if (compound) {
if (hpage_pincount_available(page)) {
-   pr_warn("head:%p order:%u compound_mapcount:%d 
compound_pincount:%d\n",
+   printk("%shead:%p order:%u compound_mapcount:%d 
compound_pincount:%d\n", loglvl,
head, 

[syzbot] BUG: unable to handle kernel access to user memory in schedule_tail

2021-03-10 Thread syzbot
Hello,

syzbot found the following issue on:

HEAD commit:0d7588ab riscv: process: Fix no prototype for arch_dup_tas..
git tree:   git://git.kernel.org/pub/scm/linux/kernel/git/riscv/linux.git 
fixes
console output: https://syzkaller.appspot.com/x/log.txt?x=1212c6e6d0
kernel config:  https://syzkaller.appspot.com/x/.config?x=e3c595255fb2d136
dashboard link: https://syzkaller.appspot.com/bug?extid=e74b94fe601ab9552d69
userspace arch: riscv64

Unfortunately, I don't have any reproducer for this issue yet.

IMPORTANT: if you fix the issue, please add the following tag to the commit:
Reported-by: syzbot+e74b94fe601ab9552...@syzkaller.appspotmail.com

Unable to handle kernel access to user memory without uaccess routines at 
virtual address 2749f0d0
Oops [#1]
Modules linked in:
CPU: 1 PID: 4875 Comm: syz-executor.0 Not tainted 
5.12.0-rc2-syzkaller-00467-g0d7588ab9ef9 #0
Hardware name: riscv-virtio,qemu (DT)
epc : schedule_tail+0x72/0xb2 kernel/sched/core.c:4264
 ra : task_pid_vnr include/linux/sched.h:1421 [inline]
 ra : schedule_tail+0x70/0xb2 kernel/sched/core.c:4264
epc : ffe8c8b0 ra : ffe8c8ae sp : ffe025d17ec0
 gp : ffe005d25378 tp : ffe00f0d t0 : 
 t1 : 0001 t2 : 000f4240 s0 : ffe025d17ee0
 s1 : 2749f0d0 a0 : 002a a1 : 0003
 a2 : 1ffc0cfac500 a3 : ffec80cc a4 : 5ae9db91c19bbe00
 a5 :  a6 : 00f0 a7 : ffe82eba
 s2 : 0004 s3 : ffe00eef96c0 s4 : ffe022c77fe0
 s5 : 4000 s6 : ffe067d74e00 s7 : ffe067d74850
 s8 : ffe067d73e18 s9 : ffe067d74e00 s10: ffe00eef96e8
 s11: 00ae6cdf8368 t3 : 5ae9db91c19bbe00 t4 : ffc4043cafb2
 t5 : ffc4043cafba t6 : 0004
status: 0120 badaddr: 2749f0d0 cause: 000f
Call Trace:
[] schedule_tail+0x72/0xb2 kernel/sched/core.c:4264
[] ret_from_exception+0x0/0x14
Dumping ftrace buffer:
   (ftrace buffer empty)
---[ end trace b5f8f9231dc87dda ]---


---
This report is generated by a bot. It may contain errors.
See https://goo.gl/tpsmEJ for more information about syzbot.
syzbot engineers can be reached at syzkal...@googlegroups.com.

syzbot will keep track of this issue. See:
https://goo.gl/tpsmEJ#status for how to communicate with syzbot.


Re: [PATCH] exit: trigger panic when init process is set to SIGNAL_GROUP_EXIT

2021-03-10 Thread Eric W. Biederman
qianli zhao  writes:

> Hi, Oleg
>
> Thanks for your reply.
>
>> To be honest, I don't understand the changelog. It seems that you want
>> to uglify the kernel to simplify the debugging of buggy init? Or what?
>
> My patch is for the following purposes:
> 1. I hope to fix the occurrence of an unexpected panic
> - [   24.705390] kernel BUG at include/linux/pid_namespace.h:98!
> This problem occurs when two init threads both enter do_exit().
> One of the init threads does the syscall sys_exit_group() and sets
> SIGNAL_GROUP_EXIT. The other init thread performs
> ret_to_user()->get_signal(), finds SIGNAL_GROUP_EXIT set, and then does
> do_group_exit()->do_exit().
> Since there are no alive init threads left, it finally goes to
> zap_pid_ns_processes() and BUG().
> Timing details are in the changelog.

At the start of your changelog and your patch subject you describe what
you are doing but not why.  For the next revision of the patch please
lead with the why it makes what you are trying to do much easier to
understand.

> 2. I hope to fix the problem that the coredump cannot be parsed after an
> init crash. Before my patch, every init sub-thread would finish do_exit(),
> and the last thread would trigger panic().
> Except for the thread that triggered the panic, the state (such as
> PF_EXITING etc.) of the other threads already shows them exiting, so we
> can't parse the coredump from a fulldump.
> In fact, when any one init thread has been set to SIGNAL_GROUP_EXIT, we
> can trigger the panic then and prevent other init threads from continuing
> to exit.
>
>> Nor can I understand the patch. I fail to understand the games with
>> SIGNAL_UNKILLABLE and ->siglock.
>
> When the first init thread panics, I don't want the other init threads to
> still exit, as this would destroy their state.
> So I use SIGNAL_UNKILLABLE to mark this state and prevent other init
> threads from continuing to exit.
> In addition, I use siglock to protect tsk->signal->flags.

It does not work to use SIGNAL_UNKILLABLE for this.  Normally init
has SIGNAL_UNKILLABLE set.  The only case that clears SIGNAL_UNKILLABLE
is force_sig_info_to_task.  If the init process exits with exit(2)
SIGNAL_UNKILLABLE will already be set.  Which means testing
SIGNAL_UNKILLABLE as your patch does will prevent the panic.

Further simply calling panic is sufficient to guarantee that the other
threads don't exit, and that whichever thread calls panic first
will be the reporting thread.  The rest of the threads will be caught
in panic_smp_self_stop(), if they happen to be running on other cpus.

So I would make the whole thing just be:

	/* If global init has exited,
	 * panic immediately to get a usable coredump.
	 */
	if (unlikely(is_global_init(tsk) &&
		     (thread_group_empty(tsk) ||
		      (tsk->signal->flags & SIGNAL_GROUP_EXIT)))) {
		panic("Attempted to kill init!  exitcode=0x%08x\n",
		      tsk->signal->group_exit_code ?: (int)code);
	}

The thread_group_empty test is needed to handle single threaded
inits.

Do you think you can respin your patch as something like that?

Eric

>
>> And iiuc with this patch the kernel will crash if init's sub-thread execs,
>> signal_group_exit() returns T in this case.
>
> Oleg Nesterov  于2021年3月10日周三 上午2:27写道:
>>
>> On 03/09, Qianli Zhao wrote:
>> >
>> > From: Qianli Zhao 
>> >
>> > Once any init thread finds SIGNAL_GROUP_EXIT, trigger panic immediately
>> > instead of last thread of global init has exited, and do not allow other
>> > init threads to exit, protect task/memory state of all sub-threads for
>> > get reliable init coredump
>>
>> To be honest, I don't understand the changelog. It seems that you want
>> to uglify the kernel to simplify the debugging of buggy init? Or what?
>>
>> Nor can I understand the patch. I fail to understand the games with
>> SIGNAL_UNKILLABLE and ->siglock.
>>
>> And iiuc with this patch the kernel will crash if init's sub-thread execs,
>> signal_group_exit() returns T in this case.
>>
>> Oleg.
>>
>> > [   24.705376] Kernel panic - not syncing: Attempted to kill init! 
>> > exitcode=0x7f00
>> > [   24.705382] CPU: 4 PID: 552 Comm: init Tainted: G S O
>> > 4.14.180-perf-g4483caa8ae80-dirty #1
>> > [   24.705390] kernel BUG at include/linux/pid_namespace.h:98!
>> >
>> > PID: 552   CPU: 4   COMMAND: "init"
>> > PID: 1 CPU: 7   COMMAND: "init"
>> > core4 core7
>> > ...   sys_exit_group()
>> >   do_group_exit()
>> >   - sig->flags = SIGNAL_GROUP_EXIT
>> >   - zap_other_threads()
>> >   do_exit()
>> >   - PF_EXITING is set
>> > ret_to_user()
>> > do_notify_resume()
>> > get_signal()
>> > - signal_group_exit
>> > - goto fatal;
>> > do_group_exit()
>> > do_exit()
>> > - PF_EXITING is set
>> > - panic("Attempted to kill init! exitcode=0x%08x\n")
>> >

Re: [PATCH 2/2] remoteproc: qcom: q6v5_wpss: Add support for sc7280 WPSS

2021-03-10 Thread Bjorn Andersson
On Wed 10 Mar 01:28 CST 2021, Rakesh Pillai wrote:

> Add support for PIL loading of the WPSS processor for SC7280.
> WPSS boot will be requested by the wifi driver, hence
> disable auto-boot for WPSS. Also add a separate shutdown
> sequence handler for WPSS.
> 
> Signed-off-by: Rakesh Pillai 
> ---
>  drivers/remoteproc/qcom_q6v5_adsp.c | 77 
> -
>  1 file changed, 76 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/remoteproc/qcom_q6v5_adsp.c 
> b/drivers/remoteproc/qcom_q6v5_adsp.c
> index e024502..dc6b91d 100644
> --- a/drivers/remoteproc/qcom_q6v5_adsp.c
> +++ b/drivers/remoteproc/qcom_q6v5_adsp.c
> @@ -58,6 +58,8 @@ struct adsp_pil_data {
>   const char *ssr_name;
>   const char *sysmon_name;
>   int ssctl_id;
> + bool is_wpss;
> + bool auto_boot;
>  
>   const char **clk_ids;
>   int num_clks;
> @@ -96,8 +98,54 @@ struct qcom_adsp {
>   struct qcom_rproc_glink glink_subdev;
>   struct qcom_rproc_ssr ssr_subdev;
>   struct qcom_sysmon *sysmon;
> +
> + int (*shutdown)(struct qcom_adsp *adsp);
>  };
>  
> +static int qcom_wpss_shutdown(struct qcom_adsp *adsp)
> +{
> + unsigned long timeout;
> + unsigned int val;
> + int ret;
> +
> + regmap_write(adsp->halt_map, adsp->halt_lpass + LPASS_HALTREQ_REG, 1);
> +
> + /* Wait for halt ACK from QDSP6 */
> + timeout = jiffies + msecs_to_jiffies(ACK_TIMEOUT);
> + for (;;) {
> + ret = regmap_read(adsp->halt_map,
> +   adsp->halt_lpass + LPASS_HALTACK_REG, &val);
> + if (ret || val || time_after(jiffies, timeout))
> + break;
> +
> + usleep_range(1000, 1100);
> + }
> +
> + /* Place the WPSS processor into reset */
> + reset_control_assert(adsp->restart);
> + /* wait after asserting subsystem restart from AOSS */
> + usleep_range(100, 105);
> + /* Remove the WPSS reset */
> + reset_control_deassert(adsp->restart);
> +
> + usleep_range(100, 105);
> +
> + regmap_write(adsp->halt_map, adsp->halt_lpass + LPASS_HALTREQ_REG, 0);
> +
> + /* Wait for halt ACK from QDSP6 */
> + timeout = jiffies + msecs_to_jiffies(ACK_TIMEOUT);
> + for (;;) {
> + ret = regmap_read(adsp->halt_map,
> +   adsp->halt_lpass + LPASS_HALTACK_REG, &val);
> + if (ret || !val || time_after(jiffies, timeout))
> + break;
> +
> + usleep_range(1000, 1100);
> + }
> +
> + return 0;
> +}
> +
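
As an aside, the two open-coded ACK polls above could likely be collapsed
with regmap_read_poll_timeout(); a sketch of the first wait, assuming
ACK_TIMEOUT is in milliseconds as the msecs_to_jiffies() usage suggests
(note this returns -ETIMEDOUT instead of silently falling through):

	/* Poll HALTACK until it is asserted, sleeping ~1ms between reads. */
	ret = regmap_read_poll_timeout(adsp->halt_map,
				       adsp->halt_lpass + LPASS_HALTACK_REG,
				       val, val != 0, 1000, 1000 * ACK_TIMEOUT);
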
>  static int qcom_adsp_shutdown(struct qcom_adsp *adsp)
>  {
>   unsigned long timeout;
> @@ -270,7 +318,7 @@ static int adsp_stop(struct rproc *rproc)
>   if (ret == -ETIMEDOUT)
>   dev_err(adsp->dev, "timed out on wait\n");
>  
> - ret = qcom_adsp_shutdown(adsp);
> + ret = adsp->shutdown(adsp);
>   if (ret)
>   dev_err(adsp->dev, "failed to shutdown: %d\n", ret);
>  
> @@ -439,6 +487,8 @@ static int adsp_probe(struct platform_device *pdev)
>   dev_err(>dev, "unable to allocate remoteproc\n");
>   return -ENOMEM;
>   }
> +
> + rproc->auto_boot = desc->auto_boot;
>   rproc_coredump_set_elf_info(rproc, ELFCLASS32, EM_NONE);
>  
>   adsp = (struct qcom_adsp *)rproc->priv;
> @@ -447,6 +497,11 @@ static int adsp_probe(struct platform_device *pdev)
>   adsp->info_name = desc->sysmon_name;
>   platform_set_drvdata(pdev, adsp);
>  
> + if (desc->is_wpss)
> + adsp->shutdown = qcom_wpss_shutdown;
> + else
> + adsp->shutdown = qcom_adsp_shutdown;
> +
>   ret = adsp_alloc_memory_region(adsp);
>   if (ret)
>   goto free_rproc;
> @@ -515,6 +570,8 @@ static const struct adsp_pil_data adsp_resource_init = {
>   .ssr_name = "lpass",
>   .sysmon_name = "adsp",
>   .ssctl_id = 0x14,
> + .is_wpss = false,
> + .auto_boot = true,
>   .clk_ids = (const char*[]) {
>   "sway_cbcr", "lpass_ahbs_aon_cbcr", "lpass_ahbm_aon_cbcr",
>   "qdsp6ss_xo", "qdsp6ss_sleep", "qdsp6ss_core", NULL
> @@ -528,6 +585,8 @@ static const struct adsp_pil_data cdsp_resource_init = {
>   .ssr_name = "cdsp",
>   .sysmon_name = "cdsp",
>   .ssctl_id = 0x17,
> + .is_wpss = false,
> + .auto_boot = true,
>   .clk_ids = (const char*[]) {
>   "sway", "tbu", "bimc", "ahb_aon", "q6ss_slave", "q6ss_master",
>   "q6_axim", NULL
> @@ -535,7 +594,23 @@ static const struct adsp_pil_data cdsp_resource_init = {
>   .num_clks = 7,
>  };
>  
> +static const struct adsp_pil_data wpss_resource_init = {
> + .crash_reason_smem = 626,
> + .firmware_name = "wpss.mdt",
> + .ssr_name = "wpss",
> + .sysmon_name = "wpss",
> + .ssctl_id = 0x19,
> + .is_wpss = true,
> + .auto_boot = false,

Why is auto_boot false for the WPSS?

> + .clk_ids = (const char*[]) {
> + "gcc_wpss_ahb_bdg_mst_clk", 

[PATCH V2 25/25] perf/x86/rapl: Add support for Intel Alder Lake

2021-03-10 Thread kan . liang
From: Zhang Rui 

Alder Lake RAPL support is the same as previous Sky Lake.
Add Alder Lake model for RAPL.

Reviewed-by: Andi Kleen 
Signed-off-by: Zhang Rui 
---
 arch/x86/events/rapl.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index f42a704..84a1042 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -800,6 +800,8 @@ static const struct x86_cpu_id rapl_model_match[] 
__initconst = {
 	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,		&model_hsx),
 	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L,		&model_skl),
 	X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE,		&model_skl),
+	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE,		&model_skl),
+	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,		&model_skl),
 	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,	&model_spr),
 	X86_MATCH_VENDOR_FAM(AMD,	0x17,		&model_amd_fam17h),
 	X86_MATCH_VENDOR_FAM(HYGON,	0x18,		&model_amd_fam17h),
-- 
2.7.4



[PATCH V2 24/25] perf/x86/cstate: Add Alder Lake CPU support

2021-03-10 Thread kan . liang
From: Kan Liang 

Compared with the Rocket Lake, the CORE C1 Residency Counter is added
for Alder Lake, but the CORE C3 Residency Counter is removed. Other
counters are the same.

Create a new adl_cstates for Alder Lake. Update the comments
accordingly.

The External Design Specification (EDS) is not published yet. It comes
from an authoritative internal source.

The patch has been tested on real hardware.

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/intel/cstate.c | 39 +--
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 407eee5..4333990 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -40,7 +40,7 @@
  * Model specific counters:
  * MSR_CORE_C1_RES: CORE C1 Residency Counter
  *  perf code: 0x00
- *  Available model: SLM,AMT,GLM,CNL,TNT
+ *  Available model: SLM,AMT,GLM,CNL,TNT,ADL
  *  Scope: Core (each processor core has a MSR)
  * MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
  *perf code: 0x01
@@ -51,46 +51,49 @@
  *perf code: 0x02
  *Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
  * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL,
- * TNT,RKL
+ * TNT,RKL,ADL
  *Scope: Core
  * MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
  *perf code: 0x03
  *Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML,
- * ICL,TGL,RKL
+ * ICL,TGL,RKL,ADL
  *Scope: Core
  * MSR_PKG_C2_RESIDENCY:  Package C2 Residency Counter.
  *perf code: 0x00
  *Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL,
- * KBL,CML,ICL,TGL,TNT,RKL
+ * KBL,CML,ICL,TGL,TNT,RKL,ADL
  *Scope: Package (physical package)
  * MSR_PKG_C3_RESIDENCY:  Package C3 Residency Counter.
  *perf code: 0x01
  *Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL,
- * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL
+ * GLM,CNL,KBL,CML,ICL,TGL,TNT,RKL,
+ * ADL
  *Scope: Package (physical package)
  * MSR_PKG_C6_RESIDENCY:  Package C6 Residency Counter.
  *perf code: 0x02
  *Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
  * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL,
- * TNT,RKL
+ * TNT,RKL,ADL
  *Scope: Package (physical package)
  * MSR_PKG_C7_RESIDENCY:  Package C7 Residency Counter.
  *perf code: 0x03
  *Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL,
- * KBL,CML,ICL,TGL,RKL
+ * KBL,CML,ICL,TGL,RKL,ADL
  *Scope: Package (physical package)
  * MSR_PKG_C8_RESIDENCY:  Package C8 Residency Counter.
  *perf code: 0x04
- *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL
+ *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL,
+ * ADL
  *Scope: Package (physical package)
  * MSR_PKG_C9_RESIDENCY:  Package C9 Residency Counter.
  *perf code: 0x05
- *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL
+ *Available model: HSW ULT,KBL,CNL,CML,ICL,TGL,RKL,
+ * ADL
  *Scope: Package (physical package)
  * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
  *perf code: 0x06
  *Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL,
- * TNT,RKL
+ * TNT,RKL,ADL
  *Scope: Package (physical package)
  *
  */
@@ -563,6 +566,20 @@ static const struct cstate_model icl_cstates __initconst = 
{
  BIT(PERF_CSTATE_PKG_C10_RES),
 };
 
+static const struct 

[PATCH V2 22/25] perf/x86/intel/uncore: Add Alder Lake support

2021-03-10 Thread kan . liang
From: Kan Liang 

The uncore subsystem for Alder Lake is similar to the previous Tiger
Lake.

The difference includes:
- New MSR addresses for global control, fixed counters, CBOX and ARB.
  Add a new adl_uncore_msr_ops for uncore operations.
- Add a new threshold field for CBOX.
- New PCIIDs for IMC devices.

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/intel/uncore.c |   7 ++
 arch/x86/events/intel/uncore.h |   1 +
 arch/x86/events/intel/uncore_snb.c | 131 +
 3 files changed, 139 insertions(+)

diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 33c8180..3ad5df2 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1625,6 +1625,11 @@ static const struct intel_uncore_init_fun 
rkl_uncore_init __initconst = {
.pci_init = skl_uncore_pci_init,
 };
 
+static const struct intel_uncore_init_fun adl_uncore_init __initconst = {
+   .cpu_init = adl_uncore_cpu_init,
+   .mmio_init = tgl_uncore_mmio_init,
+};
+
 static const struct intel_uncore_init_fun icx_uncore_init __initconst = {
.cpu_init = icx_uncore_cpu_init,
.pci_init = icx_uncore_pci_init,
@@ -1673,6 +1678,8 @@ static const struct x86_cpu_id intel_uncore_match[] 
__initconst = {
 	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L,		&tgl_l_uncore_init),
 	X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE,		&tgl_uncore_init),
 	X86_MATCH_INTEL_FAM6_MODEL(ROCKETLAKE,		&rkl_uncore_init),
+	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE,		&adl_uncore_init),
+	X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,		&adl_uncore_init),
 	X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D,	&snr_uncore_init),
{},
 };
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index a3c6e16..30e6557 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -567,6 +567,7 @@ void snb_uncore_cpu_init(void);
 void nhm_uncore_cpu_init(void);
 void skl_uncore_cpu_init(void);
 void icl_uncore_cpu_init(void);
+void adl_uncore_cpu_init(void);
 void tgl_uncore_cpu_init(void);
 void tgl_uncore_mmio_init(void);
 void tgl_l_uncore_mmio_init(void);
diff --git a/arch/x86/events/intel/uncore_snb.c 
b/arch/x86/events/intel/uncore_snb.c
index 5127128..0f63706 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -62,6 +62,8 @@
 #define PCI_DEVICE_ID_INTEL_TGL_H_IMC  0x9a36
 #define PCI_DEVICE_ID_INTEL_RKL_1_IMC  0x4c43
 #define PCI_DEVICE_ID_INTEL_RKL_2_IMC  0x4c53
+#define PCI_DEVICE_ID_INTEL_ADL_1_IMC  0x4660
+#define PCI_DEVICE_ID_INTEL_ADL_2_IMC  0x4641
 
 /* SNB event control */
 #define SNB_UNC_CTL_EV_SEL_MASK			0x000000ff
@@ -131,12 +133,33 @@
 #define ICL_UNC_ARB_PER_CTR0x3b1
 #define ICL_UNC_ARB_PERFEVTSEL 0x3b3
 
+/* ADL uncore global control */
+#define ADL_UNC_PERF_GLOBAL_CTL			0x2ff0
+#define ADL_UNC_FIXED_CTR_CTRL  0x2fde
+#define ADL_UNC_FIXED_CTR   0x2fdf
+
+/* ADL Cbo register */
+#define ADL_UNC_CBO_0_PER_CTR0 0x2002
+#define ADL_UNC_CBO_0_PERFEVTSEL0  0x2000
+#define ADL_UNC_CTL_THRESHOLD			0x3f000000
+#define ADL_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+SNB_UNC_CTL_UMASK_MASK | \
+SNB_UNC_CTL_EDGE_DET | \
+SNB_UNC_CTL_INVERT | \
+ADL_UNC_CTL_THRESHOLD)
+
+/* ADL ARB register */
+#define ADL_UNC_ARB_PER_CTR0   0x2FD2
+#define ADL_UNC_ARB_PERFEVTSEL0			0x2FD0
+#define ADL_UNC_ARB_MSR_OFFSET 0x8
+
 DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
 DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
 DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
 DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
 DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
 DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31");
+DEFINE_UNCORE_FORMAT_ATTR(threshold, threshold, "config:24-29");
 
 /* Sandy Bridge uncore support */
 static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct 
perf_event *event)
@@ -422,6 +445,106 @@ void tgl_uncore_cpu_init(void)
skl_uncore_msr_ops.init_box = rkl_uncore_msr_init_box;
 }
 
+static void adl_uncore_msr_init_box(struct intel_uncore_box *box)
+{
+   if (box->pmu->pmu_idx == 0)
+   wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN);
+}
+
+static void adl_uncore_msr_enable_box(struct intel_uncore_box *box)
+{
+   wrmsrl(ADL_UNC_PERF_GLOBAL_CTL, SNB_UNC_GLOBAL_CTL_EN);
+}
+
+static void adl_uncore_msr_disable_box(struct intel_uncore_box *box)
+{
+   if (box->pmu->pmu_idx 

[PATCH V2 20/25] perf/x86/intel: Add Alder Lake Hybrid support

2021-03-10 Thread kan . liang
From: Kan Liang 

An Alder Lake hybrid system has two different types of cores, the Golden
Cove core and the Gracemont core. The Golden Cove cores are registered to
the "cpu_core" PMU. The Gracemont cores are registered to the "cpu_atom"
PMU.

The difference between the two PMUs include:
- Number of GP and fixed counters
- Events
- The "cpu_core" PMU supports Topdown metrics.
  The "cpu_atom" PMU supports PEBS-via-PT.

The "cpu_core" PMU is similar to the Sapphire Rapids PMU, but without
PMEM.
The "cpu_atom" PMU is similar to Tremont, but with different
event_constraints, extra_regs and number of counters.

The mem-loads AUX event workaround only applies to the Golden Cove core.

Users may disable all CPUs of the same CPU type on the command line or
in the BIOS. For this case, perf still initializes a PMU for the CPU
type, but will not register the PMU. The PMU will only be registered
when any corresponding CPU is online.
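
Once both PMUs are registered, a user can target either one by name from the
perf tool, e.g. (illustrative):

perf stat -e cpu_core/cycles/ -e cpu_atom/cycles/ -a sleep 1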

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/intel/core.c | 248 ++-
 arch/x86/events/intel/ds.c   |   7 ++
 arch/x86/events/perf_event.h |   7 ++
 3 files changed, 261 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 8ac948a..568e100 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2076,6 +2076,14 @@ static struct extra_reg intel_tnt_extra_regs[] 
__read_mostly = {
EVENT_EXTRA_END
 };
 
+static struct extra_reg intel_grt_extra_regs[] __read_mostly = {
+   /* must define OFFCORE_RSP_X first, see intel_fixup_er() */
+	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
+	INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1),
+   INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x5d0),
+   EVENT_EXTRA_END
+};
+
 #define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */
 #define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */
 #define KNL_MCDRAM_LOCAL   BIT_ULL(21)
@@ -2430,6 +2438,16 @@ static int icl_set_topdown_event_period(struct 
perf_event *event)
return 0;
 }
 
+static int adl_set_topdown_event_period(struct perf_event *event)
+{
+   struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+   if (pmu->cpu_type != INTEL_HYBRID_TYPE_CORE)
+   return 0;
+
+   return icl_set_topdown_event_period(event);
+}
+
 static inline u64 icl_get_metrics_event_value(u64 metric, u64 slots, int idx)
 {
u32 val;
@@ -2570,6 +2588,17 @@ static u64 icl_update_topdown_event(struct perf_event 
*event)
 x86_pmu.num_topdown_events - 
1);
 }
 
+static u64 adl_update_topdown_event(struct perf_event *event)
+{
+   struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+   if (pmu->cpu_type != INTEL_HYBRID_TYPE_CORE)
+   return 0;
+
+   return icl_update_topdown_event(event);
+}
+
+
 static void intel_pmu_read_topdown_event(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -3658,6 +3687,17 @@ static inline bool is_mem_loads_aux_event(struct 
perf_event *event)
return (event->attr.config & INTEL_ARCH_EVENT_MASK) == 
X86_CONFIG(.event=0x03, .umask=0x82);
 }
 
+static inline bool require_mem_loads_aux_event(struct perf_event *event)
+{
+   if (!(x86_pmu.flags & PMU_FL_MEM_LOADS_AUX))
+   return false;
+
+   if (is_hybrid())
+		return hybrid_pmu(event->pmu)->cpu_type == INTEL_HYBRID_TYPE_CORE;
+
+   return true;
+}
+
 static inline bool intel_pmu_has_cap(struct perf_event *event, int idx)
 {
union perf_capabilities *intel_cap;
@@ -3782,7 +3822,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
 * event. The rule is to simplify the implementation of the check.
 * That's because perf cannot have a complete group at the moment.
 */
-   if (x86_pmu.flags & PMU_FL_MEM_LOADS_AUX &&
+   if (require_mem_loads_aux_event(event) &&
(event->attr.sample_type & PERF_SAMPLE_DATA_SRC) &&
is_mem_loads_event(event)) {
struct perf_event *leader = event->group_leader;
@@ -4059,6 +4099,34 @@ tfa_get_event_constraints(struct cpu_hw_events *cpuc, 
int idx,
return c;
 }
 
+static struct event_constraint *
+adl_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+ struct perf_event *event)
+{
+   struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+   if (pmu->cpu_type == INTEL_HYBRID_TYPE_CORE)
+   return spr_get_event_constraints(cpuc, idx, event);
+   else if (pmu->cpu_type == INTEL_HYBRID_TYPE_ATOM)
+   return tnt_get_event_constraints(cpuc, idx, event);
+
+   WARN_ON(1);
+	return &emptyconstraint;
+}
+
+static int adl_hw_config(struct perf_event *event)
+{
+   struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
+
+   if (pmu->cpu_type == 

[PATCH V2 19/25] perf/x86: Support filter_match callback

2021-03-10 Thread kan . liang
From: Kan Liang 

Implement the filter_match callback for X86, which checks whether an
event is schedulable on the current CPU.
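
For context, the hybrid implementation this hook enables could look like the
following sketch (assuming the x86_hybrid_pmu/supported_cpus infrastructure
introduced earlier in this series):

	static int intel_pmu_filter_match(struct perf_event *event)
	{
		struct x86_hybrid_pmu *pmu = hybrid_pmu(event->pmu);
		unsigned int cpu = smp_processor_id();

		/* Schedulable only on a CPU this hybrid PMU supports. */
		return cpumask_test_cpu(cpu, &pmu->supported_cpus);
	}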

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   | 10 ++
 arch/x86/events/perf_event.h |  1 +
 2 files changed, 11 insertions(+)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 43f5529..b675283 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2625,6 +2625,14 @@ static int x86_pmu_aux_output_match(struct perf_event 
*event)
return 0;
 }
 
+static int x86_pmu_filter_match(struct perf_event *event)
+{
+   if (x86_pmu.filter_match)
+   return x86_pmu.filter_match(event);
+
+   return 1;
+}
+
 static struct pmu pmu = {
.pmu_enable = x86_pmu_enable,
.pmu_disable= x86_pmu_disable,
@@ -2652,6 +2660,8 @@ static struct pmu pmu = {
.check_period   = x86_pmu_check_period,
 
.aux_output_match   = x86_pmu_aux_output_match,
+
+   .filter_match   = x86_pmu_filter_match,
 };
 
 void arch_perf_update_userpage(struct perf_event *event,
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 36a08709..74bdcfe 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -865,6 +865,7 @@ struct x86_pmu {
 
int (*aux_output_match) (struct perf_event *event);
 
+   int (*filter_match)(struct perf_event *event);
/*
 * Hybrid support
 *
-- 
2.7.4



[PATCH V2 21/25] perf: Introduce PERF_TYPE_HARDWARE_PMU and PERF_TYPE_HW_CACHE_PMU

2021-03-10 Thread kan . liang
From: Kan Liang 

Current Hardware events and Hardware cache events have special perf
types, PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE. The two types don't
pass the PMU type in the user interface. For a hybrid system, the perf
subsystem doesn't know which PMU the events belong to. The first capable
PMU will always be assigned to the events. The events never get a chance
to run on the other capable PMUs.

Add PMU-aware versions, PERF_TYPE_HARDWARE_PMU and
PERF_TYPE_HW_CACHE_PMU. The PMU type ID is stored in attr.config[39:32].
Support the new types for X86.
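
Concretely, encoding a hardware event for one specific PMU from user space
would look roughly like this (a sketch; pmu_type is the PMU's type ID, e.g.
as read from /sys/bus/event_source/devices/<pmu>/type):

	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE_PMU;
	/* PMU type ID in config[39:32], hardware event ID in config[7:0]. */
	attr.config = ((__u64)pmu_type << 32) | PERF_COUNT_HW_INSTRUCTIONS;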

Suggested-by: Andi Kleen 
Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c  | 10 --
 include/uapi/linux/perf_event.h | 26 ++
 kernel/events/core.c| 14 +-
 3 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index b675283..f031d00 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -484,7 +484,7 @@ int x86_setup_perfctr(struct perf_event *event)
if (attr->type == event->pmu->type)
return x86_pmu_extra_regs(event->attr.config, event);
 
-   if (attr->type == PERF_TYPE_HW_CACHE)
+	if ((attr->type == PERF_TYPE_HW_CACHE) || (attr->type == PERF_TYPE_HW_CACHE_PMU))
return set_ext_hw_attr(hwc, event);
 
if (attr->config >= x86_pmu.max_events)
@@ -2427,9 +2427,15 @@ static int x86_pmu_event_init(struct perf_event *event)
 
if ((event->attr.type != event->pmu->type) &&
(event->attr.type != PERF_TYPE_HARDWARE) &&
-   (event->attr.type != PERF_TYPE_HW_CACHE))
+   (event->attr.type != PERF_TYPE_HW_CACHE) &&
+   (event->attr.type != PERF_TYPE_HARDWARE_PMU) &&
+   (event->attr.type != PERF_TYPE_HW_CACHE_PMU))
return -ENOENT;
 
+   if ((event->attr.type == PERF_TYPE_HARDWARE_PMU) ||
+   (event->attr.type == PERF_TYPE_HW_CACHE_PMU))
+   event->attr.config &= PERF_HW_CACHE_EVENT_MASK;
+
if (is_hybrid() && (event->cpu != -1)) {
pmu = hybrid_pmu(event->pmu);
+		if (!cpumask_test_cpu(event->cpu, &pmu->supported_cpus))
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index ad15e40..c0a511e 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -33,6 +33,8 @@ enum perf_type_id {
PERF_TYPE_HW_CACHE  = 3,
PERF_TYPE_RAW   = 4,
PERF_TYPE_BREAKPOINT= 5,
+   PERF_TYPE_HARDWARE_PMU  = 6,
+   PERF_TYPE_HW_CACHE_PMU  = 7,
 
PERF_TYPE_MAX,  /* non-ABI */
 };
@@ -95,6 +97,30 @@ enum perf_hw_cache_op_result_id {
 };
 
 /*
+ * attr.config layout for type PERF_TYPE_HARDWARE* and PERF_TYPE_HW_CACHE*
+ * PERF_TYPE_HARDWARE: 0xAA
+ * AA: hardware event ID
+ * PERF_TYPE_HW_CACHE: 0xCCBBAA
+ * AA: hardware cache ID
+ * BB: hardware cache op ID
+ * CC: hardware cache op result ID
+ * PERF_TYPE_HARDWARE_PMU: 0xDD000000AA
+ * AA: hardware event ID
+ * DD: PMU type ID
+ * PERF_TYPE_HW_CACHE_PMU: 0xDD00CCBBAA
+ * AA: hardware cache ID
+ * BB: hardware cache op ID
+ * CC: hardware cache op result ID
+ * DD: PMU type ID
+ */
+#define PERF_HW_CACHE_ID_SHIFT 0
+#define PERF_HW_CACHE_OP_ID_SHIFT  8
+#define PERF_HW_CACHE_OP_RESULT_ID_SHIFT   16
+#define PERF_HW_CACHE_EVENT_MASK		0xffffff
+
+#define PERF_PMU_TYPE_SHIFT32
+
+/*
  * Special "software" events provided by the kernel, even if the hardware
  * does not support performance events. These events measure various
  * physical and sw events of the kernel (and allow the profiling of them as
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0aeca5f..6d7524e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11059,6 +11059,14 @@ static int perf_try_init_event(struct pmu *pmu, struct 
perf_event *event)
return ret;
 }
 
+static bool perf_event_is_hw_pmu_type(struct perf_event *event)
+{
+   int type = event->attr.type;
+
+   return type == PERF_TYPE_HARDWARE_PMU ||
+  type == PERF_TYPE_HW_CACHE_PMU;
+}
+
 static struct pmu *perf_init_event(struct perf_event *event)
 {
int idx, type, ret;
@@ -11082,13 +11090,17 @@ static struct pmu *perf_init_event(struct perf_event 
*event)
if (type == PERF_TYPE_HARDWARE || type == PERF_TYPE_HW_CACHE)
type = PERF_TYPE_RAW;
 
+   if (perf_event_is_hw_pmu_type(event))
+   type = 

[PATCH V2 23/25] perf/x86/msr: Add Alder Lake CPU support

2021-03-10 Thread kan . liang
From: Kan Liang 

PPERF and SMI_COUNT MSRs are also supported on Alder Lake.

The External Design Specification (EDS) is not published yet. It comes
from an authoritative internal source.

The patch has been tested on real hardware.

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/msr.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index 680404c..c853b28 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -100,6 +100,8 @@ static bool test_intel(int idx, void *data)
case INTEL_FAM6_TIGERLAKE_L:
case INTEL_FAM6_TIGERLAKE:
case INTEL_FAM6_ROCKETLAKE:
+   case INTEL_FAM6_ALDERLAKE:
+   case INTEL_FAM6_ALDERLAKE_L:
if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
return true;
break;
-- 
2.7.4



[PATCH V2 18/25] perf/x86/intel: Add attr_update for Hybrid PMUs

2021-03-10 Thread kan . liang
From: Kan Liang 

The attribute_group for Hybrid PMUs should be different from the previous
cpu PMU. For example, cpumask is required for a Hybrid PMU. The PMU type
should be included in the event and format attribute.

Add hybrid_attr_update for the Hybrid PMU.
Check the PMU type in is_visible() function. Only display the event or
format for the matched Hybrid PMU.

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/intel/core.c | 120 ---
 1 file changed, 114 insertions(+), 6 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 0026a54..8ac948a 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -5266,6 +5266,106 @@ static const struct attribute_group *attr_update[] = {
NULL,
 };
 
+static bool is_attr_for_this_pmu(struct kobject *kobj, struct attribute *attr)
+{
+   struct device *dev = kobj_to_dev(kobj);
+   struct x86_hybrid_pmu *pmu =
+   container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+   struct perf_pmu_events_hybrid_attr *pmu_attr =
+		container_of(attr, struct perf_pmu_events_hybrid_attr, attr.attr);
+
+   return pmu->cpu_type & pmu_attr->pmu_type;
+}
+
+static umode_t hybrid_events_is_visible(struct kobject *kobj,
+   struct attribute *attr, int i)
+{
+   return is_attr_for_this_pmu(kobj, attr) ? attr->mode : 0;
+}
+
+static inline int hybrid_find_supported_cpu(struct x86_hybrid_pmu *pmu)
+{
+	int cpu = cpumask_first(&pmu->supported_cpus);
+
+   return (cpu >= nr_cpu_ids) ? -1 : cpu;
+}
+
+static umode_t hybrid_tsx_is_visible(struct kobject *kobj,
+struct attribute *attr, int i)
+{
+   struct device *dev = kobj_to_dev(kobj);
+   struct x86_hybrid_pmu *pmu =
+container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+   int cpu = hybrid_find_supported_cpu(pmu);
+
+	return (cpu >= 0) && is_attr_for_this_pmu(kobj, attr) &&
+	       cpu_has(&cpu_data(cpu), X86_FEATURE_RTM) ? attr->mode : 0;
+}
+
+static umode_t hybrid_format_is_visible(struct kobject *kobj,
+   struct attribute *attr, int i)
+{
+   struct device *dev = kobj_to_dev(kobj);
+   struct x86_hybrid_pmu *pmu =
+   container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+   struct perf_pmu_format_hybrid_attr *pmu_attr =
+		container_of(attr, struct perf_pmu_format_hybrid_attr, attr.attr);
+   int cpu = hybrid_find_supported_cpu(pmu);
+
+	return (cpu >= 0) && (pmu->cpu_type & pmu_attr->pmu_type) ? attr->mode : 0;
+}
+
+static struct attribute_group hybrid_group_events_td  = {
+   .name   = "events",
+   .is_visible = hybrid_events_is_visible,
+};
+
+static struct attribute_group hybrid_group_events_mem = {
+   .name   = "events",
+   .is_visible = hybrid_events_is_visible,
+};
+
+static struct attribute_group hybrid_group_events_tsx = {
+   .name   = "events",
+   .is_visible = hybrid_tsx_is_visible,
+};
+
+static struct attribute_group hybrid_group_format_extra = {
+   .name   = "format",
+   .is_visible = hybrid_format_is_visible,
+};
+
+static ssize_t intel_hybrid_get_attr_cpus(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+   struct x86_hybrid_pmu *pmu =
+   container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+
+	return cpumap_print_to_pagebuf(true, buf, &pmu->supported_cpus);
+}
+
+static DEVICE_ATTR(cpus, S_IRUGO, intel_hybrid_get_attr_cpus, NULL);
+static struct attribute *intel_hybrid_cpus_attrs[] = {
+	&dev_attr_cpus.attr,
+   NULL,
+};
+
+static struct attribute_group hybrid_group_cpus = {
+   .attrs  = intel_hybrid_cpus_attrs,
+};
+
+static const struct attribute_group *hybrid_attr_update[] = {
+	&hybrid_group_events_td,
+	&hybrid_group_events_mem,
+	&hybrid_group_events_tsx,
+	&group_caps_gen,
+	&group_caps_lbr,
+	&hybrid_group_format_extra,
+	&group_default,
+	&hybrid_group_cpus,
+   NULL,
+};
+
 static struct attribute *empty_attrs;
 
 __init int intel_pmu_init(void)
@@ -5896,14 +5996,22 @@ __init int intel_pmu_init(void)
 
snprintf(pmu_name_str, sizeof(pmu_name_str), "%s", name);
 
+   if (!is_hybrid()) {
+   group_events_td.attrs  = td_attr;
+   group_events_mem.attrs = mem_attr;
+   group_events_tsx.attrs = tsx_attr;
+   group_format_extra.attrs = extra_attr;
+   group_format_extra_skl.attrs = extra_skl_attr;
 
-   group_events_td.attrs  = td_attr;
-   group_events_mem.attrs = mem_attr;
-   group_events_tsx.attrs = tsx_attr;
-   group_format_extra.attrs = extra_attr;
-   group_format_extra_skl.attrs = extra_skl_attr;
+   

[PATCH V2 17/25] perf/x86: Add structures for the attributes of Hybrid PMUs

2021-03-10 Thread kan . liang
From: Kan Liang 

Hybrid PMUs have different events and formats. In theory, Hybrid PMU
specific attributes should be maintained in the dedicated struct
x86_hybrid_pmu, but it wastes space because the events and formats are
similar among Hybrid PMUs.

To reduce duplication, all hybrid PMUs will share a group of attributes
in the following patch. To distinguish an attribute from different
Hybrid PMUs, a PMU aware attribute structure is introduced. A PMU type
is required for the attribute structure. The type is for internal use; it
is not visible in the sysfs API.

Hybrid PMUs may support the same event name, but with different event
encoding, e.g., the mem-loads event on an Atom PMU has different event
encoding from a Core PMU. It brings issue if two attributes are
created for them. Current sysfs_update_group finds an attribute by
searching the attr name (aka event name). If two attributes have the
same event name, the first attribute will be replaced.
To address the issue, only one attribute is created for the event. The
event_str is extended and stores event encodings from all Hybrid PMUs.
Each event encoding is divided by ";". The order of the event encodings
must follow the order of the hybrid PMU index. The event_str is for internal
use as well. When a user wants to show the attribute of a Hybrid PMU,
only the corresponding part of the string is displayed.
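
As an illustration, a shared event attribute for two hybrid PMUs could then
be declared as below (the event encodings are made up for the example; the
type masks are those used elsewhere in this series):

	/*
	 * "event=0xc0" is shown on the core PMU and "event=0xc1" on the
	 * atom PMU; the order follows the hybrid PMU index.
	 */
	EVENT_ATTR_STR_HYBRID(my-event, my_event,
			      "event=0xc0;event=0xc1",
			      INTEL_HYBRID_TYPE_CORE | INTEL_HYBRID_TYPE_ATOM);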

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   | 43 +++
 arch/x86/events/perf_event.h | 19 +++
 include/linux/perf_event.h   | 12 
 3 files changed, 74 insertions(+)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 7dc1430..43f5529 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1864,6 +1864,49 @@ ssize_t events_ht_sysfs_show(struct device *dev, struct 
device_attribute *attr,
pmu_attr->event_str_noht);
 }
 
+ssize_t events_hybrid_sysfs_show(struct device *dev,
+struct device_attribute *attr,
+char *page)
+{
+   struct perf_pmu_events_hybrid_attr *pmu_attr =
+   container_of(attr, struct perf_pmu_events_hybrid_attr, attr);
+   struct x86_hybrid_pmu *pmu;
+   const char *str, *next_str;
+   int i;
+
+   if (hweight64(pmu_attr->pmu_type) == 1)
+   return sprintf(page, "%s", pmu_attr->event_str);
+
+   /*
+* Hybrid PMUs may support the same event name, but with different
+* event encoding, e.g., the mem-loads event on an Atom PMU has
+* different event encoding from a Core PMU.
+*
+* The event_str includes all event encodings. Each event encoding
+* is divided by ";". The order of the event encodings must follow
+* the order of the hybrid PMU index.
+*/
+   pmu = container_of(dev_get_drvdata(dev), struct x86_hybrid_pmu, pmu);
+
+   str = pmu_attr->event_str;
+   for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
+   if (!(x86_pmu.hybrid_pmu[i].cpu_type & pmu_attr->pmu_type))
+   continue;
+   if (x86_pmu.hybrid_pmu[i].cpu_type & pmu->cpu_type) {
+   next_str = strchr(str, ';');
+   if (next_str)
+				return snprintf(page, next_str - str + 1, "%s", str);
+   else
+   return sprintf(page, "%s", str);
+   }
+   str = strchr(str, ';');
+   str++;
+   }
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(events_hybrid_sysfs_show);
+
 EVENT_ATTR(cpu-cycles, CPU_CYCLES  );
 EVENT_ATTR(instructions,   INSTRUCTIONS);
 EVENT_ATTR(cache-references,   CACHE_REFERENCES);
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 809c036..36a08709 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -964,6 +964,22 @@ static struct perf_pmu_events_ht_attr event_attr_##v = {   
\
.event_str_ht   = ht,   \
 }
 
+#define EVENT_ATTR_STR_HYBRID(_name, v, str, _pmu) \
+static struct perf_pmu_events_hybrid_attr event_attr_##v = {   \
+   .attr   = __ATTR(_name, 0444, events_hybrid_sysfs_show, NULL),\
+   .id = 0,\
+   .event_str  = str,  \
+   .pmu_type   = _pmu, \
+}
+
+#define FORMAT_HYBRID_PTR(_id) (&format_attr_hybrid_##_id.attr.attr)
+
+#define FORMAT_ATTR_HYBRID(_name, _pmu)
\
+static struct perf_pmu_format_hybrid_attr format_attr_hybrid_##_name = {\
+   .attr   = __ATTR_RO(_name), \
+   .pmu_type  

[PATCH V2 16/25] perf/x86: Register hybrid PMUs

2021-03-10 Thread kan . liang
From: Kan Liang 

Different hybrid PMUs have different PMU capabilities and events. Perf
should register a dedicated PMU for each of them.

To check the X86 event, perf has to go through all possible hybrid PMUs.

Only the PMU for the boot CPU is registered in init_hw_perf_events()
because the boot CPU is the only online CPU at that moment.
The init_hybrid_pmu() is introduced to register and initialize the other
types of PMUs when a new type of CPU comes online.

All hybrid PMUs have capability PERF_PMU_CAP_HETEROGENEOUS_CPUS.
The PMU name for hybrid PMUs will be "cpu_XXX", which will be assigned
later in a separate patch.

The PMU type id for the core PMU is still PERF_TYPE_RAW. For the other
hybrid PMUs, the PMU type id is not hard-coded.

The event->cpu must be compatible with the supported CPUs of the PMU.
Add a check in x86_pmu_event_init().

The events in a group must be from the same type of hybrid PMU.
The fake cpuc used in the validation must be from the supported CPU of
the event->pmu.
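
A sketch of the kind of check this adds in x86_pmu_event_init(), after
__x86_pmu_event_init() succeeds (the supported_cpus field name is an
assumption about how the per-PMU CPU mask might be stored):

	/* reject events bound to a CPU that this hybrid PMU cannot serve */
	if (is_hybrid() && !err && event->cpu != -1) {
		struct x86_hybrid_pmu *hpmu = hybrid_pmu(event->pmu);

		if (!cpumask_test_cpu(event->cpu, &hpmu->supported_cpus))
			err = -ENOENT;
	}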

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   | 110 +---
 arch/x86/events/intel/core.c | 116 ++-
 arch/x86/events/perf_event.h |  12 +
 3 files changed, 218 insertions(+), 20 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 15d9325..7dc1430 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -481,7 +481,7 @@ int x86_setup_perfctr(struct perf_event *event)
	local64_set(&hwc->period_left, hwc->sample_period);
}
 
-   if (attr->type == PERF_TYPE_RAW)
+   if (attr->type == event->pmu->type)
return x86_pmu_extra_regs(event->attr.config, event);
 
if (attr->type == PERF_TYPE_HW_CACHE)
@@ -616,7 +616,7 @@ int x86_pmu_hw_config(struct perf_event *event)
if (!event->attr.exclude_kernel)
event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
 
-   if (event->attr.type == PERF_TYPE_RAW)
+   if (event->attr.type == event->pmu->type)
event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 
if (event->attr.sample_period && x86_pmu.limit_period) {
@@ -745,7 +745,17 @@ void x86_pmu_enable_all(int added)
 
 static inline int is_x86_event(struct perf_event *event)
 {
-	return event->pmu == &pmu;
+   int i;
+
+   if (!is_hybrid())
+		return event->pmu == &pmu;
+
+   for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
+		if (event->pmu == &x86_pmu.hybrid_pmu[i].pmu)
+   return true;
+   }
+
+   return false;
 }
 
 struct pmu *x86_get_pmu(unsigned int cpu)
@@ -2061,8 +2071,11 @@ static int __init init_hw_perf_events(void)
 
pmu.attr_update = x86_pmu.attr_update;
 
-   x86_pmu_show_pmu_cap(x86_pmu.num_counters, x86_pmu.num_counters_fixed,
-x86_pmu.intel_ctrl);
+   if (!is_hybrid()) {
+   x86_pmu_show_pmu_cap(x86_pmu.num_counters,
+x86_pmu.num_counters_fixed,
+x86_pmu.intel_ctrl);
+   }
 
if (!x86_pmu.read)
x86_pmu.read = _x86_pmu_read;
@@ -2092,9 +2105,37 @@ static int __init init_hw_perf_events(void)
if (err)
goto out1;
 
-	err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
-   if (err)
-   goto out2;
+   if (!is_hybrid()) {
+		err = perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
+   if (err)
+   goto out2;
+   } else {
+   u8 cpu_type = get_hybrid_cpu_type(smp_processor_id());
+   struct x86_hybrid_pmu *hybrid_pmu;
+   int i;
+
+   for (i = 0; i < x86_pmu.num_hybrid_pmus; i++) {
+			hybrid_pmu = &x86_pmu.hybrid_pmu[i];
+
+   hybrid_pmu->pmu = pmu;
+   hybrid_pmu->pmu.type = -1;
+   hybrid_pmu->pmu.attr_update = x86_pmu.attr_update;
+			hybrid_pmu->pmu.capabilities |= PERF_PMU_CAP_HETEROGENEOUS_CPUS;
+
+   /* Only register the PMU for the boot CPU */
+   if (hybrid_pmu->cpu_type != cpu_type)
+   continue;
+
+			err = perf_pmu_register(&hybrid_pmu->pmu, hybrid_pmu->name,
+						x86_get_hybrid_pmu_type(cpu_type));
+   if (err) {
+				pr_warn("Failed to register a PMU, err %d\n", err);
+   kfree(x86_pmu.hybrid_pmu);
+   x86_pmu.hybrid_pmu = NULL;
+   goto out2;
+   }
+   }
+   }
 
return 0;
 
@@ -2219,16 +2260,25 @@ static void free_fake_cpuc(struct cpu_hw_events *cpuc)
kfree(cpuc);
 }
 
-static struct 

[PATCH V2 11/25] perf/x86/intel: Factor out intel_pmu_check_num_counters

2021-03-10 Thread kan . liang
From: Kan Liang 

Each Hybrid PMU has to check its own number of counters and mask fixed
counters before registration.

The intel_pmu_check_num_counters will be reused later when registering a
dedicated hybrid PMU.
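
For example, when each hybrid PMU is registered later in the series,
the helper could be invoked per PMU roughly like this (hybrid_pmu here
is a hypothetical pointer to one x86_hybrid_pmu instance):

	intel_pmu_check_num_counters(&hybrid_pmu->num_counters,
				     &hybrid_pmu->num_counters_fixed,
				     &hybrid_pmu->intel_ctrl,
				     (u64)fixed_mask);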

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/intel/core.c | 38 --
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 1030fa9..2da6ae2 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4215,6 +4215,26 @@ static void flip_smm_bit(void *data)
}
 }
 
+static void intel_pmu_check_num_counters(int *num_counters,
+int *num_counters_fixed,
+u64 *intel_ctrl, u64 fixed_mask)
+{
+   if (*num_counters > INTEL_PMC_MAX_GENERIC) {
+   WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
+*num_counters, INTEL_PMC_MAX_GENERIC);
+   *num_counters = INTEL_PMC_MAX_GENERIC;
+   }
+   *intel_ctrl = (1ULL << *num_counters) - 1;
+
+   if (*num_counters_fixed > INTEL_PMC_MAX_FIXED) {
+   WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
+*num_counters_fixed, INTEL_PMC_MAX_FIXED);
+   *num_counters_fixed = INTEL_PMC_MAX_FIXED;
+   }
+
+   *intel_ctrl |= fixed_mask << INTEL_PMC_IDX_FIXED;
+}
+
 static void intel_pmu_cpu_starting(int cpu)
 {
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
@@ -5706,20 +5726,10 @@ __init int intel_pmu_init(void)
 
x86_pmu.attr_update = attr_update;
 
-   if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) {
-   WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
-x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC);
-   x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC;
-   }
-   x86_pmu.intel_ctrl = (1ULL << x86_pmu.num_counters) - 1;
-
-   if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) {
-   WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
-x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED);
-   x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED;
-   }
-
-   x86_pmu.intel_ctrl |= (u64)fixed_mask << INTEL_PMC_IDX_FIXED;
+	intel_pmu_check_num_counters(&x86_pmu.num_counters,
+				     &x86_pmu.num_counters_fixed,
+				     &x86_pmu.intel_ctrl,
+				     (u64)fixed_mask);
 
/* AnyThread may be deprecated on arch perfmon v5 or later */
if (x86_pmu.intel_cap.anythread_deprecated)
-- 
2.7.4



[PATCH V2 10/25] perf/x86: Hybrid PMU support for extra_regs

2021-03-10 Thread kan . liang
From: Kan Liang 

Different hybrid PMUs may have different extra registers, e.g. a Core
PMU may have offcore registers, a frontend register and a ldlat
register, while an Atom core may only have offcore registers and a
ldlat register. Each hybrid PMU should use its own extra_regs.

An Intel Hybrid system should always have extra registers.
Unconditionally allocate shared_regs for an Intel Hybrid system.
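
The diffs below rely on a hybrid()/hybrid_pmu() accessor that picks the
per-PMU field on hybrid systems and falls back to the global x86_pmu
otherwise. A sketch of one way to write it (the exact definition lands
in perf_event.h elsewhere in the series):

static __always_inline struct x86_hybrid_pmu *hybrid_pmu(struct pmu *pmu)
{
	return container_of(pmu, struct x86_hybrid_pmu, pmu);
}

/* resolve a field either from the hybrid PMU or from the global x86_pmu */
#define hybrid(_pmu, _field)					\
(*({								\
	typeof(&x86_pmu._field) __Fp = &x86_pmu._field;		\
								\
	if (is_hybrid() && (_pmu))				\
		__Fp = &hybrid_pmu(_pmu)->_field;		\
								\
	__Fp;							\
}))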

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   |  5 +++--
 arch/x86/events/intel/core.c | 15 +--
 arch/x86/events/perf_event.h |  1 +
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 38c271a..32e8fed 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -149,15 +149,16 @@ u64 x86_perf_event_update(struct perf_event *event)
  */
 static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
 {
+   struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs);
struct hw_perf_event_extra *reg;
struct extra_reg *er;
 
	reg = &event->hw.extra_reg;
 
-   if (!x86_pmu.extra_regs)
+   if (!extra_regs)
return 0;
 
-   for (er = x86_pmu.extra_regs; er->msr; er++) {
+   for (er = extra_regs; er->msr; er++) {
if (er->event != (config & er->config_mask))
continue;
if (event->attr.config1 & ~er->valid_mask)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 572509f..1030fa9 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2966,8 +2966,10 @@ intel_vlbr_constraints(struct perf_event *event)
return NULL;
 }
 
-static int intel_alt_er(int idx, u64 config)
+static int intel_alt_er(struct cpu_hw_events *cpuc,
+   int idx, u64 config)
 {
+   struct extra_reg *extra_regs = hybrid(cpuc->pmu, extra_regs);
int alt_idx = idx;
 
if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
@@ -2979,7 +2981,7 @@ static int intel_alt_er(int idx, u64 config)
if (idx == EXTRA_REG_RSP_1)
alt_idx = EXTRA_REG_RSP_0;
 
-   if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask)
+   if (config & ~extra_regs[alt_idx].valid_mask)
return idx;
 
return alt_idx;
@@ -2987,15 +2989,16 @@ static int intel_alt_er(int idx, u64 config)
 
 static void intel_fixup_er(struct perf_event *event, int idx)
 {
+   struct extra_reg *extra_regs = hybrid(event->pmu, extra_regs);
event->hw.extra_reg.idx = idx;
 
if (idx == EXTRA_REG_RSP_0) {
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
-   event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event;
+   event->hw.config |= extra_regs[EXTRA_REG_RSP_0].event;
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0;
} else if (idx == EXTRA_REG_RSP_1) {
event->hw.config &= ~INTEL_ARCH_EVENT_MASK;
-   event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event;
+   event->hw.config |= extra_regs[EXTRA_REG_RSP_1].event;
event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1;
}
 }
@@ -3071,7 +3074,7 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc,
 */
c = NULL;
} else {
-   idx = intel_alt_er(idx, reg->config);
+   idx = intel_alt_er(cpuc, idx, reg->config);
if (idx != reg->idx) {
			raw_spin_unlock_irqrestore(&era->lock, flags);
goto again;
@@ -4158,7 +4161,7 @@ int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu)
 {
cpuc->pebs_record_size = x86_pmu.pebs_record_size;
 
-   if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
+   if (is_hybrid() || x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
cpuc->shared_regs = allocate_shared_regs(cpu);
if (!cpuc->shared_regs)
goto err;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 9316fec..e65a477 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -651,6 +651,7 @@ struct x86_hybrid_pmu {
[PERF_COUNT_HW_CACHE_RESULT_MAX];
struct event_constraint *event_constraints;
struct event_constraint *pebs_constraints;
+   struct extra_reg*extra_regs;
 };
 
 static __always_inline bool is_hybrid(void)
-- 
2.7.4



[PATCH V2 13/25] perf/x86/intel: Factor out intel_pmu_check_extra_regs

2021-03-10 Thread kan . liang
From: Kan Liang 

Each Hybrid PMU has to check and update its own extra registers before
registration.

The intel_pmu_check_extra_regs will be reused later when registering a
dedicated hybrid PMU.

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/intel/core.c | 37 +++--
 1 file changed, 23 insertions(+), 14 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 6df8edd7..cdfbab3 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4278,6 +4278,28 @@ static void intel_pmu_check_event_constraints(struct event_constraint *event_con
}
 }
 
+static bool check_msr(unsigned long msr, u64 mask);
+
+static void intel_pmu_check_extra_regs(struct extra_reg *extra_regs)
+{
+   struct extra_reg *er;
+
+   /*
+* Access extra MSR may cause #GP under certain circumstances.
+* E.g. KVM doesn't support offcore event
+* Check all extra_regs here.
+*/
+   if (!extra_regs)
+   return;
+
+   for (er = extra_regs; er->msr; er++) {
+   er->extra_msr_access = check_msr(er->msr, 0x11UL);
+   /* Disable LBR select mapping */
+   if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
+   x86_pmu.lbr_sel_map = NULL;
+   }
+}
+
 static void intel_pmu_cpu_starting(int cpu)
 {
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
@@ -5141,7 +5163,6 @@ __init int intel_pmu_init(void)
union cpuid10_eax eax;
union cpuid10_ebx ebx;
unsigned int fixed_mask;
-   struct extra_reg *er;
bool pmem = false;
int version, i;
char *name;
@@ -5798,19 +5819,7 @@ __init int intel_pmu_init(void)
if (x86_pmu.lbr_nr)
pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
 
-   /*
-* Access extra MSR may cause #GP under certain circumstances.
-* E.g. KVM doesn't support offcore event
-* Check all extra_regs here.
-*/
-   if (x86_pmu.extra_regs) {
-   for (er = x86_pmu.extra_regs; er->msr; er++) {
-   er->extra_msr_access = check_msr(er->msr, 0x11UL);
-   /* Disable LBR select mapping */
-   if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
-   x86_pmu.lbr_sel_map = NULL;
-   }
-   }
+   intel_pmu_check_extra_regs(x86_pmu.extra_regs);
 
/* Support full width counters using alternative MSR range */
if (x86_pmu.intel_cap.full_width_write) {
-- 
2.7.4



[PATCH V2 14/25] perf/x86: Remove temporary pmu assignment in event_init

2021-03-10 Thread kan . liang
From: Kan Liang 

The temporary pmu assignment in event_init is unnecessary.

The assignment was introduced by commit 8113070d6639 ("perf_events:
Add fast-path to the rescheduling code"). At that time, event->pmu was
not yet assigned when an event was initialized, so the assignment was
required. However, since commit 7e5b2a01d2ca ("perf: provide PMU when
initing events"), event->pmu is provided before event_init is invoked.
The temporary pmu assignment in event_init can therefore be removed.

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c | 11 ---
 1 file changed, 11 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 32e8fed..96b4a72 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2302,7 +2302,6 @@ static int validate_group(struct perf_event *event)
 
 static int x86_pmu_event_init(struct perf_event *event)
 {
-   struct pmu *tmp;
int err;
 
switch (event->attr.type) {
@@ -2317,20 +2316,10 @@ static int x86_pmu_event_init(struct perf_event *event)
 
err = __x86_pmu_event_init(event);
if (!err) {
-   /*
-* we temporarily connect event to its pmu
-* such that validate_group() can classify
-* it as an x86 event using is_x86_event()
-*/
-   tmp = event->pmu;
-   event->pmu = 
-
if (event->group_leader != event)
err = validate_group(event);
else
err = validate_event(event);
-
-   event->pmu = tmp;
}
if (err) {
if (event->destroy)
-- 
2.7.4



[PATCH V2 15/25] perf/x86: Factor out x86_pmu_show_pmu_cap

2021-03-10 Thread kan . liang
From: Kan Liang 

The PMU capabilities are different among hybrid PMUs. Perf should dump
the PMU capabilities information for each hybrid PMU.

Factor out x86_pmu_show_pmu_cap() which shows the PMU capabilities
information. The function will be reused later when registering a
dedicated hybrid PMU.

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   | 25 -
 arch/x86/events/perf_event.h |  3 +++
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 96b4a72..15d9325 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1987,6 +1987,20 @@ perf_guest_get_msrs_nop(int *nr)
return NULL;
 }
 
+void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
+ u64 intel_ctrl)
+{
+   pr_info("... version:%d\n", x86_pmu.version);
+   pr_info("... bit width:  %d\n", x86_pmu.cntval_bits);
+   pr_info("... generic registers:  %d\n", num_counters);
+   pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
+   pr_info("... max period: %016Lx\n", x86_pmu.max_period);
+   pr_info("... fixed-purpose events:   %lu\n",
+		hweight64((((1ULL << num_counters_fixed) - 1)
+				<< INTEL_PMC_IDX_FIXED) & intel_ctrl));
+   pr_info("... event mask: %016Lx\n", intel_ctrl);
+}
+
 static int __init init_hw_perf_events(void)
 {
struct x86_pmu_quirk *quirk;
@@ -2047,15 +2061,8 @@ static int __init init_hw_perf_events(void)
 
pmu.attr_update = x86_pmu.attr_update;
 
-   pr_info("... version:%d\n", x86_pmu.version);
-   pr_info("... bit width:  %d\n", x86_pmu.cntval_bits);
-   pr_info("... generic registers:  %d\n", x86_pmu.num_counters);
-   pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
-   pr_info("... max period: %016Lx\n", x86_pmu.max_period);
-   pr_info("... fixed-purpose events:   %lu\n",
-		hweight64((((1ULL << x86_pmu.num_counters_fixed) - 1)
-			<< INTEL_PMC_IDX_FIXED) & x86_pmu.intel_ctrl));
-   pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
+   x86_pmu_show_pmu_cap(x86_pmu.num_counters, x86_pmu.num_counters_fixed,
+x86_pmu.intel_ctrl);
 
if (!x86_pmu.read)
x86_pmu.read = _x86_pmu_read;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index e65a477..3ec5914 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1085,6 +1085,9 @@ void x86_pmu_enable_event(struct perf_event *event);
 
 int x86_pmu_handle_irq(struct pt_regs *regs);
 
+void x86_pmu_show_pmu_cap(int num_counters, int num_counters_fixed,
+ u64 intel_ctrl);
+
 extern struct event_constraint emptyconstraint;
 
 extern struct event_constraint unconstrained;
-- 
2.7.4



[PATCH V2 12/25] perf/x86/intel: Factor out intel_pmu_check_event_constraints

2021-03-10 Thread kan . liang
From: Kan Liang 

Each Hybrid PMU has to check and update its own event constraints before
registration.

The intel_pmu_check_event_constraints will be reused later when
registering a dedicated hybrid PMU.

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/intel/core.c | 82 +---
 1 file changed, 47 insertions(+), 35 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 2da6ae2..6df8edd7 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4235,6 +4235,49 @@ static void intel_pmu_check_num_counters(int *num_counters,
*intel_ctrl |= fixed_mask << INTEL_PMC_IDX_FIXED;
 }
 
+static void intel_pmu_check_event_constraints(struct event_constraint *event_constraints,
+ int num_counters,
+ int num_counters_fixed,
+ u64 intel_ctrl)
+{
+   struct event_constraint *c;
+
+   if (!event_constraints)
+   return;
+
+   /*
+* event on fixed counter2 (REF_CYCLES) only works on this
+* counter, so do not extend mask to generic counters
+*/
+   for_each_event_constraint(c, event_constraints) {
+   /*
+* Don't extend the topdown slots and metrics
+* events to the generic counters.
+*/
+   if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) {
+   /*
+* Disable topdown slots and metrics events,
+* if slots event is not in CPUID.
+*/
+   if (!(INTEL_PMC_MSK_FIXED_SLOTS & intel_ctrl))
+   c->idxmsk64 = 0;
+   c->weight = hweight64(c->idxmsk64);
+   continue;
+   }
+
+   if (c->cmask == FIXED_EVENT_FLAGS) {
+   /* Disabled fixed counters which are not in CPUID */
+   c->idxmsk64 &= intel_ctrl;
+
+   if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES)
+   c->idxmsk64 |= (1ULL << num_counters) - 1;
+   }
+   c->idxmsk64 &=
+   ~(~0ULL << (INTEL_PMC_IDX_FIXED + num_counters_fixed));
+   c->weight = hweight64(c->idxmsk64);
+   }
+}
+
 static void intel_pmu_cpu_starting(int cpu)
 {
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
@@ -5097,7 +5140,6 @@ __init int intel_pmu_init(void)
union cpuid10_edx edx;
union cpuid10_eax eax;
union cpuid10_ebx ebx;
-   struct event_constraint *c;
unsigned int fixed_mask;
struct extra_reg *er;
bool pmem = false;
@@ -5735,40 +5777,10 @@ __init int intel_pmu_init(void)
if (x86_pmu.intel_cap.anythread_deprecated)
x86_pmu.format_attrs = intel_arch_formats_attr;
 
-   if (x86_pmu.event_constraints) {
-   /*
-* event on fixed counter2 (REF_CYCLES) only works on this
-* counter, so do not extend mask to generic counters
-*/
-   for_each_event_constraint(c, x86_pmu.event_constraints) {
-   /*
-* Don't extend the topdown slots and metrics
-* events to the generic counters.
-*/
-   if (c->idxmsk64 & INTEL_PMC_MSK_TOPDOWN) {
-   /*
-* Disable topdown slots and metrics events,
-* if slots event is not in CPUID.
-*/
-			if (!(INTEL_PMC_MSK_FIXED_SLOTS & x86_pmu.intel_ctrl))
-   c->idxmsk64 = 0;
-   c->weight = hweight64(c->idxmsk64);
-   continue;
-   }
-
-   if (c->cmask == FIXED_EVENT_FLAGS) {
-				/* Disabled fixed counters which are not in CPUID */
-   c->idxmsk64 &= x86_pmu.intel_ctrl;
-
-				if (c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES)
-					c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
-   }
-   c->idxmsk64 &=
-			~(~0ULL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed));
-   c->weight = hweight64(c->idxmsk64);
-   }
-   }
-
+   intel_pmu_check_event_constraints(x86_pmu.event_constraints,
+ x86_pmu.num_counters,
+ x86_pmu.num_counters_fixed,
+ x86_pmu.intel_ctrl);
/*
   

[PATCH V2 09/25] perf/x86: Hybrid PMU support for event constraints

2021-03-10 Thread kan . liang
From: Kan Liang 

The events are different among hybrid PMUs. Each hybrid PMU should use
its own event constraints.

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   | 3 ++-
 arch/x86/events/intel/core.c | 5 +++--
 arch/x86/events/intel/ds.c   | 5 +++--
 arch/x86/events/perf_event.h | 2 ++
 4 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 1db4a67..38c271a 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1522,6 +1522,7 @@ void perf_event_print_debug(void)
	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
int num_counters = hybrid(cpuc->pmu, num_counters);
int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
+	struct event_constraint *pebs_constraints = hybrid(cpuc->pmu, pebs_constraints);
unsigned long flags;
int idx;
 
@@ -1541,7 +1542,7 @@ void perf_event_print_debug(void)
pr_info("CPU#%d: status: %016llx\n", cpu, status);
pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
pr_info("CPU#%d: fixed:  %016llx\n", cpu, fixed);
-   if (x86_pmu.pebs_constraints) {
+   if (pebs_constraints) {
rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
pr_info("CPU#%d: pebs:   %016llx\n", cpu, pebs);
}
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 540dba2..572509f 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3136,10 +3136,11 @@ struct event_constraint *
 x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
  struct perf_event *event)
 {
+	struct event_constraint *event_constraints = hybrid(cpuc->pmu, event_constraints);
struct event_constraint *c;
 
-   if (x86_pmu.event_constraints) {
-   for_each_event_constraint(c, x86_pmu.event_constraints) {
+   if (event_constraints) {
+   for_each_event_constraint(c, event_constraints) {
if (constraint_match(c, event->hw.config)) {
event->hw.flags |= c->flags;
return c;
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 312bf3b..f1402bc 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -959,13 +959,14 @@ struct event_constraint intel_spr_pebs_event_constraints[] = {
 
 struct event_constraint *intel_pebs_constraints(struct perf_event *event)
 {
+	struct event_constraint *pebs_constraints = hybrid(event->pmu, pebs_constraints);
struct event_constraint *c;
 
if (!event->attr.precise_ip)
return NULL;
 
-   if (x86_pmu.pebs_constraints) {
-   for_each_event_constraint(c, x86_pmu.pebs_constraints) {
+   if (pebs_constraints) {
+   for_each_event_constraint(c, pebs_constraints) {
if (constraint_match(c, event->hw.config)) {
event->hw.flags |= c->flags;
return c;
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index f464bda..9316fec 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -649,6 +649,8 @@ struct x86_hybrid_pmu {
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX];
+   struct event_constraint *event_constraints;
+   struct event_constraint *pebs_constraints;
 };
 
 static __always_inline bool is_hybrid(void)
-- 
2.7.4



[PATCH V2 06/25] perf/x86: Hybrid PMU support for counters

2021-03-10 Thread kan . liang
From: Kan Liang 

The numbers of GP and fixed counters differ among hybrid PMUs.
Each hybrid PMU should use its own counter-related information.

When handling a certain hybrid PMU, apply the number of counters from
the corresponding hybrid PMU.

When reserving the counters in the initialization of a new event,
reserve all possible counters. Add num_hybrid_pmus to indicate the
number of hybrid PMUs.

The number of counters recorded in the global x86_pmu is for the
architecture counters, which are available for all hybrid PMUs. KVM
doesn't support the hybrid PMU yet. Return the number of the
architecture counters for now.

For the functions only available for the old platforms, e.g.,
intel_pmu_drain_pebs_nhm(), nothing is changed.

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   | 55 ++--
 arch/x86/events/intel/core.c |  8 ---
 arch/x86/events/intel/ds.c   | 14 +++
 arch/x86/events/perf_event.h |  4 
 4 files changed, 56 insertions(+), 25 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 90eb82e..039a851 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -180,16 +180,29 @@ static DEFINE_MUTEX(pmc_reserve_mutex);
 
 #ifdef CONFIG_X86_LOCAL_APIC
 
+static inline int get_possible_num_counters(void)
+{
+   int i, num_counters = x86_pmu.num_counters;
+
+   if (!is_hybrid())
+   return num_counters;
+
+   for (i = 0; i < x86_pmu.num_hybrid_pmus; i++)
+		num_counters = max_t(int, num_counters, x86_pmu.hybrid_pmu[i].num_counters);
+
+   return num_counters;
+}
+
 static bool reserve_pmc_hardware(void)
 {
-   int i;
+   int i, num_counters = get_possible_num_counters();
 
-   for (i = 0; i < x86_pmu.num_counters; i++) {
+   for (i = 0; i < num_counters; i++) {
if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
goto perfctr_fail;
}
 
-   for (i = 0; i < x86_pmu.num_counters; i++) {
+   for (i = 0; i < num_counters; i++) {
if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
goto eventsel_fail;
}
@@ -200,7 +213,7 @@ static bool reserve_pmc_hardware(void)
for (i--; i >= 0; i--)
release_evntsel_nmi(x86_pmu_config_addr(i));
 
-   i = x86_pmu.num_counters;
+   i = num_counters;
 
 perfctr_fail:
for (i--; i >= 0; i--)
@@ -211,9 +224,9 @@ static bool reserve_pmc_hardware(void)
 
 static void release_pmc_hardware(void)
 {
-   int i;
+   int i, num_counters = get_possible_num_counters();
 
-   for (i = 0; i < x86_pmu.num_counters; i++) {
+   for (i = 0; i < num_counters; i++) {
release_perfctr_nmi(x86_pmu_event_addr(i));
release_evntsel_nmi(x86_pmu_config_addr(i));
}
@@ -941,6 +954,7 @@ EXPORT_SYMBOL_GPL(perf_assign_events);
 
 int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 {
+   int num_counters = hybrid(cpuc->pmu, num_counters);
struct event_constraint *c;
struct perf_event *e;
int n0, i, wmin, wmax, unsched = 0;
@@ -1016,7 +1030,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 
/* slow path */
if (i != n) {
-   int gpmax = x86_pmu.num_counters;
+   int gpmax = num_counters;
 
/*
 * Do not allow scheduling of more than half the available
@@ -1037,7 +1051,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 * the extra Merge events needed by large increment events.
 */
if (x86_pmu.flags & PMU_FL_PAIR) {
-   gpmax = x86_pmu.num_counters - cpuc->n_pair;
+   gpmax = num_counters - cpuc->n_pair;
WARN_ON(gpmax <= 0);
}
 
@@ -1124,10 +1138,12 @@ static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event,
  */
 static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
 {
+   int num_counters = hybrid(cpuc->pmu, num_counters);
+   int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);
struct perf_event *event;
int n, max_count;
 
-   max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
+   max_count = num_counters + num_counters_fixed;
 
/* current number of events already accepted */
n = cpuc->n_events;
@@ -1495,18 +1511,18 @@ void perf_event_print_debug(void)
 {
u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
u64 pebs, debugctl;
-   struct cpu_hw_events *cpuc;
+   int cpu = smp_processor_id();
+	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+   int num_counters = hybrid(cpuc->pmu, num_counters);
+	int num_counters_fixed = hybrid(cpuc->pmu, num_counters_fixed);

[PATCH V2 07/25] perf/x86: Hybrid PMU support for unconstrained

2021-03-10 Thread kan . liang
From: Kan Liang 

The unconstrained value depends on the number of GP and fixed counters.
Each hybrid PMU should use its own unconstrained.
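
A sketch of how the per-PMU value could be set up at registration time
(field names follow the x86_hybrid_pmu structure introduced earlier in
the series):

	/* all GP counters of this hybrid PMU, no fixed counters */
	hybrid_pmu->unconstrained = (struct event_constraint)
		__EVENT_CONSTRAINT(0, (1ULL << hybrid_pmu->num_counters) - 1,
				   0, hybrid_pmu->num_counters, 0, 0);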

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
---
 arch/x86/events/intel/core.c | 5 -
 arch/x86/events/perf_event.h | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index e187cf5..540dba2 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3147,7 +3147,10 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, 
int idx,
}
}
 
-	return &unconstrained;
+	if (!is_hybrid() || !cpuc->pmu)
+		return &unconstrained;
+
+	return &hybrid_pmu(cpuc->pmu)->unconstrained;
 }
 
 static struct event_constraint *
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index d2f97bd..7bce193 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -639,6 +639,7 @@ struct x86_hybrid_pmu {
int max_pebs_events;
int num_counters;
int num_counters_fixed;
+   struct event_constraint unconstrained;
 };
 
 static __always_inline bool is_hybrid(void)
-- 
2.7.4



[PATCH V2 08/25] perf/x86: Hybrid PMU support for hardware cache event

2021-03-10 Thread kan . liang
From: Kan Liang 

The hardware cache events are different among hybrid PMUs. Each hybrid
PMU should have its own hw cache event table.
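
For instance, hybrid registration code could populate the per-PMU
tables by copying from the model-specific tables; a sketch (the core_*
source table names are illustrative):

	memcpy(hybrid_pmu->hw_cache_event_ids, core_hw_cache_event_ids,
	       sizeof(hybrid_pmu->hw_cache_event_ids));
	memcpy(hybrid_pmu->hw_cache_extra_regs, core_hw_cache_extra_regs,
	       sizeof(hybrid_pmu->hw_cache_extra_regs));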

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   | 11 +--
 arch/x86/events/perf_event.h |  9 +
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 039a851..1db4a67 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -352,6 +352,7 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
 {
struct perf_event_attr *attr = >attr;
unsigned int cache_type, cache_op, cache_result;
+	struct x86_hybrid_pmu *pmu = is_hybrid() ? hybrid_pmu(event->pmu) : NULL;
u64 config, val;
 
config = attr->config;
@@ -371,7 +372,10 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
return -EINVAL;
	cache_result = array_index_nospec(cache_result, PERF_COUNT_HW_CACHE_RESULT_MAX);
 
-   val = hw_cache_event_ids[cache_type][cache_op][cache_result];
+   if (pmu)
+		val = pmu->hw_cache_event_ids[cache_type][cache_op][cache_result];
+   else
+   val = hw_cache_event_ids[cache_type][cache_op][cache_result];
 
if (val == 0)
return -ENOENT;
@@ -380,7 +384,10 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
return -EINVAL;
 
hwc->config |= val;
-   attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
+   if (pmu)
+		attr->config1 = pmu->hw_cache_extra_regs[cache_type][cache_op][cache_result];
+   else
+		attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
return x86_pmu_extra_regs(val, event);
 }
 
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 7bce193..f464bda 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -640,6 +640,15 @@ struct x86_hybrid_pmu {
int num_counters;
int num_counters_fixed;
struct event_constraint unconstrained;
+
+   u64 hw_cache_event_ids
+   [PERF_COUNT_HW_CACHE_MAX]
+   [PERF_COUNT_HW_CACHE_OP_MAX]
+   [PERF_COUNT_HW_CACHE_RESULT_MAX];
+   u64 hw_cache_extra_regs
+   [PERF_COUNT_HW_CACHE_MAX]
+   [PERF_COUNT_HW_CACHE_OP_MAX]
+   [PERF_COUNT_HW_CACHE_RESULT_MAX];
 };
 
 static __always_inline bool is_hybrid(void)
-- 
2.7.4



[PATCH V2 05/25] perf/x86: Hybrid PMU support for intel_ctrl

2021-03-10 Thread kan . liang
From: Kan Liang 

The intel_ctrl is the counter mask of a PMU. The PMU counter information
may be different among hybrid PMUs; each hybrid PMU should use its own
intel_ctrl to check and access the counters.

When handling a certain hybrid PMU, apply the intel_ctrl from the
corresponding hybrid PMU.

When checking the HW existence, apply the PMU and number of counters
from the corresponding hybrid PMU as well. Perf will check the HW
existence for each Hybrid PMU before registration. Expose the
check_hw_exists() for a later patch.
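
The hunks below pass the pmu into fixed_counter_disabled(); a sketch of
the updated helper, assuming the hybrid() accessor used elsewhere in
the series:

static inline bool fixed_counter_disabled(int i, struct pmu *pmu)
{
	u64 intel_ctrl = hybrid(pmu, intel_ctrl);

	/* disabled when no fixed-counter bit at or above index i is set */
	return !(intel_ctrl >> (i + INTEL_PMC_IDX_FIXED));
}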

Reviewed-by: Andi Kleen 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   | 14 +++---
 arch/x86/events/intel/core.c | 14 +-
 arch/x86/events/perf_event.h | 10 --
 3 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 316efb1..90eb82e 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -226,7 +226,7 @@ static void release_pmc_hardware(void) {}
 
 #endif
 
-static bool check_hw_exists(void)
+bool check_hw_exists(struct pmu *pmu, int num_counters, int num_counters_fixed)
 {
u64 val, val_fail = -1, val_new= ~0;
int i, reg, reg_fail = -1, ret = 0;
@@ -237,7 +237,7 @@ static bool check_hw_exists(void)
 * Check to see if the BIOS enabled any of the counters, if so
 * complain and bail.
 */
-   for (i = 0; i < x86_pmu.num_counters; i++) {
+   for (i = 0; i < num_counters; i++) {
reg = x86_pmu_config_addr(i);
ret = rdmsrl_safe(reg, );
if (ret)
@@ -251,13 +251,13 @@ static bool check_hw_exists(void)
}
}
 
-   if (x86_pmu.num_counters_fixed) {
+   if (num_counters_fixed) {
reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
ret = rdmsrl_safe(reg, );
if (ret)
goto msr_fail;
-   for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
-   if (fixed_counter_disabled(i))
+   for (i = 0; i < num_counters_fixed; i++) {
+   if (fixed_counter_disabled(i, pmu))
continue;
if (val & (0x03 << i*4)) {
bios_fail = 1;
@@ -1543,7 +1543,7 @@ void perf_event_print_debug(void)
cpu, idx, prev_left);
}
for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
-   if (fixed_counter_disabled(idx))
+   if (fixed_counter_disabled(idx, cpuc->pmu))
continue;
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
 
@@ -1995,7 +1995,7 @@ static int __init init_hw_perf_events(void)
pmu_check_apic();
 
/* sanity check that the hardware exists or is emulated */
-   if (!check_hw_exists())
+	if (!check_hw_exists(&pmu, x86_pmu.num_counters, x86_pmu.num_counters_fixed))
return 0;
 
pr_cont("%s PMU driver.\n", x86_pmu.name);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index e466a49..1f1c003 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2153,10 +2153,11 @@ static void intel_pmu_disable_all(void)
 static void __intel_pmu_enable_all(int added, bool pmi)
 {
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+   u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
 
intel_pmu_lbr_enable_all(pmi);
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
-   x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
+  intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
 
if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
struct perf_event *event =
@@ -2709,6 +2710,7 @@ int intel_pmu_save_and_restart(struct perf_event *event)
 static void intel_pmu_reset(void)
 {
struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
unsigned long flags;
int idx;
 
@@ -2724,7 +2726,7 @@ static void intel_pmu_reset(void)
wrmsrl_safe(x86_pmu_event_addr(idx),  0ull);
}
for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
-   if (fixed_counter_disabled(idx))
+   if (fixed_counter_disabled(idx, cpuc->pmu))
continue;
wrmsrl_safe(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
}
@@ -2753,6 +2755,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
int bit;
int handled = 0;
+   u64 intel_ctrl = hybrid(cpuc->pmu, intel_ctrl);
 
inc_irq_stat(apic_perf_irqs);
 
@@ -2798,7 +2801,7 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status)
 
handled++;
x86_pmu.drain_pebs(regs, );
-   status &= 

[PATCH V2 03/25] perf/x86: Track pmu in per-CPU cpu_hw_events

2021-03-10 Thread kan . liang
From: Kan Liang 

Some platforms, e.g. Alder Lake, have hybrid architecture. In the same
package, there may be more than one type of CPU. The PMU capabilities
are different among different types of CPU. Perf will register a
dedicated PMU for each type of CPU.

Add a 'pmu' variable in the struct cpu_hw_events to track the dedicated
PMU of the current CPU.

Current x86_get_pmu() use the global 'pmu', which will be broken on a
hybrid platform. Modify it to apply the 'pmu' of the specific CPU.

Initialize the per-CPU 'pmu' variable with the global 'pmu'. There is
nothing changed for the non-hybrid platforms.

The is_x86_event() will be updated in the later patch ("perf/x86:
Register hybrid PMUs") for hybrid platforms. For the non-hybrid
platforms, nothing is changed here.

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   | 17 +
 arch/x86/events/intel/core.c |  2 +-
 arch/x86/events/intel/ds.c   |  4 ++--
 arch/x86/events/intel/lbr.c  |  9 +
 arch/x86/events/perf_event.h |  4 +++-
 5 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 6ddeed3..4873e40 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -45,9 +45,11 @@
 #include "perf_event.h"
 
 struct x86_pmu x86_pmu __read_mostly;
+static struct pmu pmu;
 
 DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
.enabled = 1,
	.pmu = &pmu,
 };
 
 DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key);
@@ -720,16 +722,23 @@ void x86_pmu_enable_all(int added)
}
 }
 
-static struct pmu pmu;
-
 static inline int is_x86_event(struct perf_event *event)
 {
	return event->pmu == &pmu;
 }
 
-struct pmu *x86_get_pmu(void)
+struct pmu *x86_get_pmu(unsigned int cpu)
 {
-	return &pmu;
+	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+
+   /*
+* All CPUs of the hybrid type have been offline.
+* The x86_get_pmu() should not be invoked.
+*/
+   if (WARN_ON_ONCE(!cpuc->pmu))
+		return &pmu;
+
+   return cpuc->pmu;
 }
 /*
  * Event scheduler state:
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 5bac48d..6a4d6b2 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4873,7 +4873,7 @@ static void update_tfa_sched(void *ignored)
 * and if so force schedule out for all event types all contexts
 */
if (test_bit(3, cpuc->active_mask))
-   perf_pmu_resched(x86_get_pmu());
+   perf_pmu_resched(x86_get_pmu(smp_processor_id()));
 }
 
 static ssize_t show_sysctl_tfa(struct device *cdev,
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 7ebae18..1bfea8c 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -2192,7 +2192,7 @@ void __init intel_ds_init(void)
PERF_SAMPLE_TIME;
x86_pmu.flags |= PMU_FL_PEBS_ALL;
pebs_qual = "-baseline";
-			x86_get_pmu()->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
+			x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_EXTENDED_REGS;
} else {
/* Only basic record supported */
x86_pmu.large_pebs_flags &=
@@ -2207,7 +2207,7 @@ void __init intel_ds_init(void)
 
if (x86_pmu.intel_cap.pebs_output_pt_available) {
pr_cont("PEBS-via-PT, ");
-			x86_get_pmu()->capabilities |= PERF_PMU_CAP_AUX_OUTPUT;
+			x86_get_pmu(smp_processor_id())->capabilities |= PERF_PMU_CAP_AUX_OUTPUT;
}
 
break;
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 21890da..bb4486c 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -705,7 +705,7 @@ void intel_pmu_lbr_add(struct perf_event *event)
 
 void release_lbr_buffers(void)
 {
-   struct kmem_cache *kmem_cache = x86_get_pmu()->task_ctx_cache;
+   struct kmem_cache *kmem_cache;
struct cpu_hw_events *cpuc;
int cpu;
 
@@ -714,6 +714,7 @@ void release_lbr_buffers(void)
 
for_each_possible_cpu(cpu) {
		cpuc = per_cpu_ptr(&cpu_hw_events, cpu);
+   kmem_cache = x86_get_pmu(cpu)->task_ctx_cache;
if (kmem_cache && cpuc->lbr_xsave) {
kmem_cache_free(kmem_cache, cpuc->lbr_xsave);
cpuc->lbr_xsave = NULL;
@@ -1609,7 +1610,7 @@ void intel_pmu_lbr_init_hsw(void)
x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
 
-   x86_get_pmu()->task_ctx_cache = create_lbr_kmem_cache(size, 0);
+   x86_get_pmu(smp_processor_id())->task_ctx_cache = 

[PATCH V2 04/25] perf/x86/intel: Hybrid PMU support for perf capabilities

2021-03-10 Thread kan . liang
From: Kan Liang 

Some platforms, e.g. Alder Lake, have hybrid architecture. Although most
PMU capabilities are the same, there are still some unique PMU
capabilities for different hybrid PMUs. Perf should register a dedicated
pmu for each hybrid PMU.

Add a new struct x86_hybrid_pmu, which saves the dedicated pmu and
capabilities for each hybrid PMU.

The architecture MSR, MSR_IA32_PERF_CAPABILITIES, only indicates the
architecture features which are available on all hybrid PMUs. The
architecture features are stored in the global x86_pmu.intel_cap.

For Alder Lake, the model-specific features are perf metrics and
PEBS-via-PT. The corresponding bits of the global x86_pmu.intel_cap
should be 0 for these two features. Perf should not use the global
intel_cap to check the features on a hybrid system.
Add a dedicated intel_cap in the x86_hybrid_pmu to store the
model-specific capabilities. Use the dedicated intel_cap to replace
the global intel_cap for thse two features. The dedicated intel_cap
will be set in the following "Add Alder Lake Hybrid support" patch.

Suggested-by: Peter Zijlstra (Intel) 
Signed-off-by: Kan Liang 
---
 arch/x86/events/core.c   |  6 --
 arch/x86/events/intel/core.c | 27 ++-
 arch/x86/events/intel/ds.c   |  2 +-
 arch/x86/events/perf_event.h | 35 +++
 arch/x86/include/asm/msr-index.h |  3 +++
 5 files changed, 65 insertions(+), 8 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 4873e40..316efb1 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1101,8 +1101,9 @@ static void del_nr_metric_event(struct cpu_hw_events *cpuc,
 static int collect_event(struct cpu_hw_events *cpuc, struct perf_event *event,
 int max_count, int n)
 {
+   union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap);
 
-   if (x86_pmu.intel_cap.perf_metrics && add_nr_metric_event(cpuc, event))
+   if (intel_cap.perf_metrics && add_nr_metric_event(cpuc, event))
return -EINVAL;
 
if (n >= max_count + cpuc->n_metric)
@@ -1578,6 +1579,7 @@ void x86_pmu_stop(struct perf_event *event, int flags)
 static void x86_pmu_del(struct perf_event *event, int flags)
 {
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+   union perf_capabilities intel_cap = hybrid(cpuc->pmu, intel_cap);
int i;
 
/*
@@ -1617,7 +1619,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
}
cpuc->event_constraint[i-1] = NULL;
--cpuc->n_events;
-   if (x86_pmu.intel_cap.perf_metrics)
+   if (intel_cap.perf_metrics)
del_nr_metric_event(cpuc, event);
 
perf_event_update_userpage(event);
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 6a4d6b2..e466a49 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3646,6 +3646,15 @@ static inline bool is_mem_loads_aux_event(struct perf_event *event)
	return (event->attr.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0x03, .umask=0x82);
 }
 
+static inline bool intel_pmu_has_cap(struct perf_event *event, int idx)
+{
+   union perf_capabilities *intel_cap;
+
+	intel_cap = is_hybrid() ? &hybrid_pmu(event->pmu)->intel_cap :
+				  &x86_pmu.intel_cap;
+
+	return test_bit(idx, (unsigned long *)&intel_cap->capabilities);
+}
 
 static int intel_pmu_hw_config(struct perf_event *event)
 {
@@ -3709,7 +3718,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
 * with a slots event as group leader. When the slots event
 * is used in a metrics group, it too cannot support sampling.
 */
-   if (x86_pmu.intel_cap.perf_metrics && is_topdown_event(event)) {
+	if (intel_pmu_has_cap(event, PERF_CAP_METRICS_IDX) && is_topdown_event(event)) {
if (event->attr.config1 || event->attr.config2)
return -EINVAL;
 
@@ -4216,8 +4225,16 @@ static void intel_pmu_cpu_starting(int cpu)
if (x86_pmu.version > 1)
		flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
 
-   /* Disable perf metrics if any added CPU doesn't support it. */
-   if (x86_pmu.intel_cap.perf_metrics) {
+   /*
+* Disable perf metrics if any added CPU doesn't support it.
+*
+* Turn off the check for a hybrid architecture, because the
+* architecture MSR, MSR_IA32_PERF_CAPABILITIES, only indicate
+* the architecture features. The perf metrics is a model-specific
+* feature for now. The corresponding bit should always be 0 on
+* a hybrid platform, e.g., Alder Lake.
+*/
+   if (!is_hybrid() && x86_pmu.intel_cap.perf_metrics) {
union perf_capabilities perf_cap;
 
rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap.capabilities);
@@ -4327,7 +4344,7 @@ static int intel_pmu_check_period(struct 

[PATCH V2 1/25] x86/cpufeatures: Enumerate Intel Hybrid Technology feature bit

2021-03-10 Thread kan . liang
From: Ricardo Neri 

Add feature enumeration to identify a processor with Intel Hybrid
Technology: one in which CPUs of more than one type are in the same package.
On a hybrid processor, all CPUs support the same homogeneous (i.e.,
symmetric) instruction set. All CPUs enumerate the same features in CPUID.
Thus, software (user space and kernel) can run and migrate to any CPU in
the system as well as utilize any of the enumerated features without any
change or special provisions. The main differences among CPUs in a
hybrid processor are power and performance properties.
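
A minimal sketch of how a kernel subsystem can then gate
hybrid-specific code on the new bit:

	/* skip hybrid-specific setup on non-hybrid parts */
	if (!boot_cpu_has(X86_FEATURE_HYBRID_CPU))
		return;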

Cc: Andi Kleen 
Cc: Kan Liang 
Cc: "Peter Zijlstra (Intel)" 
Cc: "Rafael J. Wysocki" 
Cc: "Ravi V. Shankar" 
Cc: Srinivas Pandruvada 
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Len Brown 
Reviewed-by: Tony Luck 
Signed-off-by: Ricardo Neri 
---
Changes since v1 (as part of patchset for perf change for Alderlake)
 * None

Changes since v1 (in a separate posting):
 * Reworded commit message to clearly state what is Intel Hybrid
   Technology. Stress that all CPUs can run the same instruction
   set and support the same features.
---
 arch/x86/include/asm/cpufeatures.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index cc96e26d69f7..e7cfc9eedf8d 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -374,6 +374,7 @@
 #define X86_FEATURE_MD_CLEAR   (18*32+10) /* VERW clears CPU buffers */
 #define X86_FEATURE_TSX_FORCE_ABORT(18*32+13) /* "" TSX_FORCE_ABORT */
 #define X86_FEATURE_SERIALIZE  (18*32+14) /* SERIALIZE instruction */
+#define X86_FEATURE_HYBRID_CPU		(18*32+15) /* This part has CPUs of more than one type */
 #define X86_FEATURE_TSXLDTRK		(18*32+16) /* TSX Suspend Load Address Tracking */
 #define X86_FEATURE_PCONFIG(18*32+18) /* Intel PCONFIG */
 #define X86_FEATURE_ARCH_LBR   (18*32+19) /* Intel ARCH LBR */
-- 
2.17.1



[PATCH V2 2/25] x86/cpu: Add helper functions to get parameters of hybrid CPUs

2021-03-10 Thread kan . liang
From: Ricardo Neri 

On processors with Intel Hybrid Technology (i.e., one having more than one
type of CPU in the same package), all CPUs support the same instruction
set and enumerate the same features on CPUID. Thus, all software can run
on any CPU without restrictions. However, there may be model-specific
differences among types of CPUs. For instance, each type of CPU may support
a different number of performance counters. Also, machine check error banks
may be wired differently. Even though most software will not care about
these differences, kernel subsystems dealing with these differences must
know. Add a new member to cpuinfo_x86 that subsystems can query to know
the type of CPU.

Hybrid processors also have a native model ID to uniquely identify the
micro-architecture of each CPU. Please note that the native model ID is
not related to the existing x86_model_id read from CPUID leaf 0x1.

Add helper functions and definitions to get both the CPU type and the
combined CPU type and native model ID. Only expose get_hybrid_cpu_type().
The sole known use case (i.e., perf for hybrid processors) only wants to
know the type but not the model ID. get_hybrid_params() can be exposed in
the future if new use cases arise.

The Intel Software Developer's Manual defines the CPU type and the CPU
native model ID as 8-bit and 24-bit identifiers, respectively.

Cc: Andi Kleen 
Cc: Andy Lutomirski 
Cc: Dave Hansen 
Cc: Kan Liang 
Cc: "Peter Zijlstra (Intel)" 
Cc: "Rafael J. Wysocki" 
Cc: "Ravi V. Shankar" 
Cc: Srinivas Pandruvada 
Cc: linux-kernel@vger.kernel.org
Reviewed-by: Len Brown 
Reviewed-by: Tony Luck 
Signed-off-by: Ricardo Neri 
---
Changes since v1 (as part of patchset for perf change for Alderlake)
 * Removed cpuinfo_x86.x86_cpu_type. It can be added later if needed.
   Instead, implement helper functions that subsystems can use.(Boris)
 * Add definitions for Atom and Core CPU types. (Kan)

Changes since v1 (in a separate posting)
 * Simplify code by using cpuid_eax(). (Boris)
 * Reworded the commit message to clarify the concept of Intel Hybrid
   Technology. Stress that all CPUs can run the same instruction set
   and support the same features.
---
 arch/x86/include/asm/cpu.h  |  6 
 arch/x86/include/asm/intel-family.h |  4 +++
 arch/x86/include/asm/processor.h|  3 ++
 arch/x86/kernel/cpu/intel.c | 44 +
 4 files changed, 57 insertions(+)

diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index da78ccbd493b..845c478dfcd4 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -45,6 +45,7 @@ extern void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c);
 extern void switch_to_sld(unsigned long tifn);
 extern bool handle_user_split_lock(struct pt_regs *regs, long error_code);
 extern bool handle_guest_split_lock(unsigned long ip);
+u8 get_hybrid_cpu_type(int cpu);
 #else
 static inline void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c) {}
 static inline void switch_to_sld(unsigned long tifn) {}
@@ -57,6 +58,11 @@ static inline bool handle_guest_split_lock(unsigned long ip)
 {
return false;
 }
+
+static inline u8 get_hybrid_cpu_type(int cpu)
+{
+   return 0;
+}
 #endif
 #ifdef CONFIG_IA32_FEAT_CTL
 void init_ia32_feat_ctl(struct cpuinfo_x86 *c);
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index 9abe842dbd84..b5fb475bdf10 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -134,4 +134,8 @@
 /* Family 5 */
 #define INTEL_FAM5_QUARK_X1000 0x09 /* Quark X1000 SoC */
 
+/* Types of CPUs in hybrid parts. */
+#define INTEL_HYBRID_TYPE_ATOM 0x20
+#define INTEL_HYBRID_TYPE_CORE 0x40
+
 #endif /* _ASM_X86_INTEL_FAMILY_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index dc6d149bf851..d43d7a11afba 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -166,6 +166,9 @@ enum cpuid_regs_idx {
 
 #define X86_VENDOR_UNKNOWN 0xff
 
+#define X86_HYBRID_CPU_TYPE_ID_SHIFT   24
+#define X86_HYBRID_CPU_NATIVE_MODEL_ID_MASK0xff
+
 /*
  * capabilities of CPUs
  */
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 0e422a544835..aa0dac287c1f 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -1195,3 +1195,47 @@ void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c)
cpu_model_supports_sld = true;
split_lock_setup();
 }
+
+static void read_hybrid_params(void *data)
+{
+   u32 *eax = data;
+
+   *eax = cpuid_eax(0x001a);
+}
+
+/**
+ * get_hybrid_params() - Get type and model ID of a hybrid CPU
+ * @cpu:   CPU of which hybrid parameters are queried.
+ *
+ * Returns the combined CPU type [31:24] and the native model ID [23:0] of a
+ * CPU in a hybrid processor. It must be called with preemption disabled.
+ * If @cpu is invalid or the processor 

[PATCH V2 00/25] Add Alder Lake support for perf (kernel)

2021-03-10 Thread kan . liang
From: Kan Liang 

Changes since V1:
- Drop all user space patches, which will be reviewed later separately.
- Don't save the CPU type in struct cpuinfo_x86. Instead, provide helper
  functions to get parameters of hybrid CPUs. (Boris)
- Rework the perf kernel patches according to Peter's suggestion. The
  key changes include,
  - Code style changes. Drop all the macro which names in capital
letters.
  - Drop the hybrid PMU index, track the pointer of the hybrid PMU in
the per-CPU struct cpu_hw_events.
  - Fix the x86_get_pmu() support
  - Fix the allocate_fake_cpuc() support
  - Fix validate_group() support
  - Dynamically allocate the *hybrid_pmu for each hybrid PMU

Alder Lake uses a hybrid architecture utilizing Golden Cove cores
and Gracemont cores. On such architectures, all CPUs support the same,
homogeneous and symmetric, instruction set. Also, CPUID enumerates
the same features for all CPUs. There may be model-specific differences,
such as those addressed in this patchset.

The first two patches enumerate the hybrid CPU feature bit and provide
helper functions to get parameters of hybrid CPUs. (The initial idea [1]
was to save the CPU type in a new field x86_cpu_type in struct cpuinfo_x86.
Since the only user of the new field is perf, querying the
X86_FEATURE_HYBRID_CPU at the call site is a simpler alternative.[2])
Compared with the initial submission, the below two concerns[3][4] are
also addressed,
- Provide a good use case, PMU.
- Clarify what Intel Hybrid Technology is and is not.

The PMU capabilities for Golden Cove core and Gracemont core are not the
same. The key differences include the number of counters, events, perf
metrics feature, and PEBS-via-PT feature. A dedicated hybrid PMU has to
be registered for each of them. However, the current perf X86 assumes
that there is only one CPU PMU. To handle the hybrid PMUs, the patchset
- Introduce a new struct x86_hybrid_pmu to save the unique capabilities
  from different PMUs. It's part of the global x86_pmu. The architecture
  capabilities, which are available for all PMUs, are still saved in
  the global x86_pmu. To save the space, the x86_hybrid_pmu is
  dynamically allocated.
- The hybrid PMU registration has been moved to cpu_starting(),
  because only the boot CPU is available when init_hw_perf_events()
  is invoked.
- Hybrid PMUs have different events and formats. Add new structures and
  helpers for events attribute and format attribute which take the PMU
  type into account.
- Add a PMU aware version PERF_TYPE_HARDWARE_PMU and
  PERF_TYPE_HW_CACHE_PMU to facilitate user space tools

The uncore, MSR and cstate are the same between hybrid CPUs.
Don't need to register hybrid PMUs for them.

The generic code kernel/events/core.c is not hybrid friendly either,
especially for the per-task monitoring. Peter once proposed a
patchset[5], but it hasn't been merged. This patchset doesn't intend to
improve the generic code (which can be improved later separately). It
still uses the capability PERF_PMU_CAP_HETEROGENEOUS_CPUS for each
hybrid PMUs. For per-task and system-wide monitoring, user space tools
have to create events on all available hybrid PMUs. The events which are
from different hybrid PMUs cannot be included in the same group.
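
A rough user-space sketch of what that means for tools: open one event
per hybrid PMU, using each PMU's dynamic type id from sysfs, and never
group events across PMUs (the cpu_core/cpu_atom names and the raw
config value are assumptions; the PMU names are assigned later in the
series):

#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int pmu_type(const char *path)
{
	FILE *f = fopen(path, "r");
	int type = -1;

	if (f) {
		if (fscanf(f, "%d", &type) != 1)
			type = -1;
		fclose(f);
	}
	return type;
}

int main(void)
{
	const char *paths[] = {
		"/sys/bus/event_source/devices/cpu_core/type",
		"/sys/bus/event_source/devices/cpu_atom/type",
	};
	struct perf_event_attr attr;
	int i, fd;

	for (i = 0; i < 2; i++) {
		int type = pmu_type(paths[i]);

		if (type < 0)
			continue;	/* PMU not present on this system */

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = type;	/* dynamic hybrid PMU type id */
		attr.config = 0x3c;	/* raw event encoding, illustrative */

		/* group fd is -1: events from different hybrid PMUs
		 * cannot share a group */
		fd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, 0);
		if (fd >= 0) {
			printf("opened event on %s\n", paths[i]);
			close(fd);
		}
	}
	return 0;
}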

[1]. 
https://lore.kernel.org/lkml/20201002201931.2826-1-ricardo.neri-calde...@linux.intel.com/
[2]. https://lore.kernel.org/lkml/20210208175640.gd18...@zn.tnic/
[3]. https://lore.kernel.org/lkml/20201002203452.ge17...@zn.tnic/
[4]. https://lore.kernel.org/lkml/87r1qgccku@nanos.tec.linutronix.de/
[5]. 
https://lkml.kernel.org/r/20181010104559.go5...@hirez.programming.kicks-ass.net/

Kan Liang (22):
  perf/x86: Track pmu in per-CPU cpu_hw_events
  perf/x86/intel: Hybrid PMU support for perf capabilities
  perf/x86: Hybrid PMU support for intel_ctrl
  perf/x86: Hybrid PMU support for counters
  perf/x86: Hybrid PMU support for unconstrained
  perf/x86: Hybrid PMU support for hardware cache event
  perf/x86: Hybrid PMU support for event constraints
  perf/x86: Hybrid PMU support for extra_regs
  perf/x86/intel: Factor out intel_pmu_check_num_counters
  perf/x86/intel: Factor out intel_pmu_check_event_constraints
  perf/x86/intel: Factor out intel_pmu_check_extra_regs
  perf/x86: Remove temporary pmu assignment in event_init
  perf/x86: Factor out x86_pmu_show_pmu_cap
  perf/x86: Register hybrid PMUs
  perf/x86: Add structures for the attributes of Hybrid PMUs
  perf/x86/intel: Add attr_update for Hybrid PMUs
  perf/x86: Support filter_match callback
  perf/x86/intel: Add Alder Lake Hybrid support
  perf: Introduce PERF_TYPE_HARDWARE_PMU and PERF_TYPE_HW_CACHE_PMU
  perf/x86/intel/uncore: Add Alder Lake support
  perf/x86/msr: Add Alder Lake CPU support
  perf/x86/cstate: Add Alder Lake CPU support

Ricardo Neri (2):
  x86/cpufeatures: Enumerate Intel Hybrid Technology feature bit
  x86/cpu: Add helper functions to get parameters of hybrid CPUs

Zhang Rui (1):
  perf/x86/rapl: Add support for Intel 

Re: [RFC v2 3/5] arm64: socfpga: rename ARCH_STRATIX10 to ARCH_SOCFPGA64

2021-03-10 Thread Arnd Bergmann
On Wed, Mar 10, 2021 at 4:54 PM Krzysztof Kozlowski
 wrote:
> On 10/03/2021 16:47, Krzysztof Kozlowski wrote:
> > This edac Altera driver is very weird... it uses the same compatible
> > differently depending on whether this is 32-bit or 64-bit (e.g. Stratix
> > 10)! On ARMv7 the compatible means for example one IRQ... On ARMv8, we
> > have two. It's quite new code (2019, from Intel), not some ancient
> > legacy, so it should never have been accepted...
>
> Oh, it's not as horrible as it sounds. They actually have different
> compatibles for the edac driver with these differences (e.g. in interrupts).
> They just do not use them and instead check for the basic (common?)
> compatible and architecture... Anyway without testing I am not the
> person to fix the edac driver.

Ok, this should be fixed properly as you describe, but as a quick hack
it wouldn't be hard to just change the #ifdef to check for CONFIG_64BIT
instead of CONFIG_ARCH_STRATIX10 during the rename of the config
symbol.
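
Roughly, as a sketch only (the real guard sites in altera_edac.c may be
more numerous than this):

	/* was: #ifdef CONFIG_ARCH_STRATIX10 */
	#ifdef CONFIG_64BIT
		/* 64-bit parts (e.g. Stratix 10): two IRQs per instance */
	#else
		/* 32-bit parts: a single IRQ */
	#endif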

   Arnd


Re: [PATCH v5 3/5] riscv: Separate memory init from paging init

2021-03-10 Thread Geert Uytterhoeven
Hi Atish,

On Thu, Nov 19, 2020 at 1:40 AM Atish Patra  wrote:
> Currently, we perform some memory init functions in paging init. But,
> that will be an issue for NUMA support where DT needs to be flattened
> before numa initialization and memblock_present can only be called
> after numa initialization.
>
> Move memory initialization related functions to a separate function.
>
> Signed-off-by: Atish Patra 
> Reviewed-by: Greentime Hu 
> Reviewed-by: Anup Patel 
> Reviewed-by: Palmer Dabbelt 

This is now commit cbd34f4bb37d62d8 in v5.12-rc1, breaking the boot on
Vexriscv:

[0.00] earlycon: sbi0 at I/O port 0x0 (options '')
[0.00] printk: bootconsole [sbi0] enabled
[0.00] printk: debug: ignoring loglevel setting.
[0.00] Initial ramdisk at: 0x(ptrval) (8388608 bytes)
[0.00] Unable to handle kernel paging request at virtual
address c808
[0.00] Oops [#1]
[0.00] CPU: 0 PID: 0 Comm: swapper Not tainted
5.11.0-orangecrab-00023-g7c4fc8e3e982 #129
[0.00] epc: c04d6624 ra : c04d6524 sp : c05ddf70
[0.00]  gp : c0678bc0 tp : c05e5b40 t0 : c800
[0.00]  t1 : 0003 t2 :  s0 : c05ddfc0
[0.00]  s1 : c800 a0 :  a1 : c7e0
[0.00]  a2 : 0005 a3 : 0001 a4 : 000c
[0.00]  a5 :  a6 : c04fe000 a7 : 000c
[0.00]  s2 : c04fe098 s3 : 00a0 s4 : c760
[0.00]  s5 : c04fe0dc s6 : 8200 s7 : c059f19c
[0.00]  s8 : 81000200 s9 : c059f1b8 s10: 8200
[0.00]  s11: c059f19c t3 : 405dba80 t4 : c05e6f08
[0.00]  t5 : 81000200 t6 : 40501000
[0.00] status: 0100 badaddr: c808 cause: 000f
[0.00] random: get_random_bytes called from
print_oops_end_marker+0x38/0x7c with crng_init=0
[0.00] ---[ end trace  ]---
[0.00] Kernel panic - not syncing: Attempted to kill the idle task!

Note that I have "[PATCH v2 3/4] RISC-V: Fix L1_CACHE_BYTES for RV32"[1]
applied, to avoid another crash (7c4fc8e3e982 = v5.11 + [1] +
cherry-picked commits from the riscv-for-linus-5.12-mw0 pull request).

If I revert the L1_CACHE_BYTES change, the boot continues, but I'm back
to the old issue fixed by [1]:

[   22.126687] Freeing initrd memory: 8192K
[   22.321811] workingset: timestamp_bits=30 max_order=15 bucket_order=0
[   29.001509] Block layer SCSI generic (bsg) driver version 0.4
loaded (major 253)
[   29.021555] io scheduler mq-deadline registered
[   29.033692] io scheduler kyber registered
[   29.141294] Unable to handle kernel paging request at virtual
address 69726573
[   29.158523] Oops [#1]
[   29.162232] CPU: 0 PID: 1 Comm: swapper Not tainted
5.11.0-orangecrab-00023-g7c4fc8e3e982-dirty #132
[   29.171970] epc: c000d3b0 ra : c000eb74 sp : c182dca0
[   29.178786]  gp : c067aee0 tp : c183 t0 : c18d75e0
[   29.185935]  t1 : 0003 t2 :  s0 : c182dcb0
[   29.193028]  s1 :  a0 : c05eab14 a1 : c18d75c0
[   29.200067]  a2 : c7ffe384 a3 : 69726573 a4 : f00b
[   29.207095]  a5 : f000 a6 : c7f8 a7 : 
[   29.214141]  s2 : 01001f00 s3 : c05eb000 s4 : c067c000
[   29.221171]  s5 : c000ec0c s6 : 8000 s7 : c05eaad4
[   29.228200]  s8 : c05eab58 s9 : c05a1000 s10: c18d75c0
[   29.235238]  s11: c05eab14 t3 : 20b9a6cc t4 : 0001
[   29.242277]  t5 :  t6 : c188cd50
[   29.247588] status: 0120 badaddr: 69726573 cause: 000d
[   29.274424] ---[ end trace 69dee1b9ca96f1d6 ]---
[   29.282859] note: swapper[1] exited with preempt_count 1
[   29.293156] Kernel panic - not syncing: Attempted to kill init!
exitcode=0x000b

[1] 
https://lore.kernel.org/linux-riscv/20210111234504.3782179-4-atish.pa...@wdc.com/

Will have a deeper look later...

Thanks for any suggestions!

Gr{oetje,eeting}s,

Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds


Re: [PATCH V2] ASoC: soc-core: Prevent warning if no DMI table is present

2021-03-10 Thread Pierre-Louis Bossart




On 3/10/21 10:37 AM, Takashi Iwai wrote:
> On Wed, 10 Mar 2021 17:18:14 +0100,
> Mark Brown wrote:
> >
> > On Wed, Mar 10, 2021 at 09:44:07AM -0600, Pierre-Louis Bossart wrote:
> > > On 3/10/21 7:35 AM, Mark Brown wrote:
> >
> > > > Just change it to a system level check for ACPI, checking for OF would
> > > > leave problems for board files or any other alternative firmware
> > > > interfaces.
> >
> > > did you mean if (!IS_ENABLED(CONFIG_ACPI)) ?
> >
> > Is there a runtime check?
>
> Well, basically both DMI and ACPI are completely different things, so
> I don't think it's right to check the availability of ACPI as a signal
> of the availability of DMI.

would this work?

	if (!IS_ENABLED(CONFIG_DMI))
		return 0;
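
For reference, a runtime alternative could be the dmi_available flag set
by the DMI core during early boot; a minimal sketch, assuming the check
sits inside the existing CONFIG_DMI section of soc-core (the symbol may
need exporting for module builds):

	#include <linux/dmi.h>

	/* bail out early when the firmware provides no DMI table at all */
	if (!dmi_available)
		return 0;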


Re: [PATCH v10 1/2] scsi: ufs: Enable power management for wlun

2021-03-10 Thread Asutosh Das (asd)

On 3/10/2021 8:27 AM, Alan Stern wrote:

On Tue, Mar 09, 2021 at 08:04:53PM -0800, Asutosh Das (asd) wrote:

On 3/9/2021 7:14 PM, Alan Stern wrote:

On Tue, Mar 09, 2021 at 07:04:34PM -0800, Asutosh Das (asd) wrote:

Hello
I & Can (thanks CanG) debugged this further:

Looks like this issue can occur if the sd probe is asynchronous.

Essentially, the sd_probe() is done asynchronously and driver_probe_device()
invokes pm_runtime_get_suppliers() before invoking sd_probe().

But scsi_probe_and_add_lun() runs in a separate context.
So the scsi_autopm_put_device() invoked from scsi_scan_host() context
reduces the link->rpm_active to 1. And sd_probe() invokes
scsi_autopm_put_device() and starts a timer. And then driver_probe_device()
invoked from __device_attach_async_helper context reduces the
link->rpm_active to 1 thus enabling the supplier to suspend before the
consumer suspends.



I don't see a way around this. Please let me know if you
(@Alan/@Bart/@Adrian) have any thoughts on this.


How about changing the SCSI core so that it does a runtime_get before
starting an async probe, and the async probe routine does a
runtime_put when it is finished?  In other words, don't allow a device
to go into runtime suspend while it is waiting to be probed.

I don't think that would be too intrusive.

Alan Stern



Hi Alan
Thanks for the suggestion.

Am trying to understand:

Do you mean something like this:

int scsi_sysfs_add_sdev(struct scsi_device *sdev)
{

scsi_autopm_get_device(sdev);
pm_runtime_get_noresume(&sdev->sdev_gendev);
[...]
scsi_autopm_put_device(sdev);
[...]
}

static int sd_probe(struct device *dev)
{
[...]
pm_runtime_put_noidle(dev);
scsi_autopm_put_device(sdp);
[...]
}

This may work (I'm limited by my imagination in scsi layer :) ).


I'm not sure about this.  To be honest, I did not read the entirety of
your last message; it had way too much detail.  There's a time and place
for that, but when you're brainstorming to figure out the underlying
cause of a problem and come up with a strategy to fix it, you want to
concentrate on the overall picture, not the details.

As I understand the situation, you've got a SCSI target with multiple
logical units, let's say A and B, and you need to make sure that A never
goes into runtime suspend unless B is already suspended.  In other
words, B always has to suspend before A and resume after A.

To do this, you register a device link with A as the supplier and B as
the consumer.  Then the PM core takes care of the ordering for you.

But I don't understand when you set up the device link.  If the timing
is wrong then, thanks to async SCSI probing, you may have a situation
where A is registered before B and before the link is set up.  Then
there's temporarily nothing to stop A from suspending before B.

You also need to prevent each device from suspending before it is
probed.  That's the easy part I was trying to address before (although
it may not be so easy if the drivers are in loadable modules and not
present in the kernel).

You need to think through these issues before proposing actual changes.


But the pm_runtime_put_noidle() would have to be added to all registered
scsi_driver{}, perhaps? Or may be I can check for sdp->type?


Like this; it's too early to worry about this sort of thing.

Alan Stern


Hi Alan
Thanks. Understood.

I will check the details and see if I can come up with something.
I'll propose an alternate fix otherwise and drop this change altogether.

Thanks!
-asd

--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
Linux Foundation Collaborative Project


Re: [PATCH 4/9] drm: remove the drm file system

2021-03-10 Thread Al Viro
On Tue, Mar 09, 2021 at 04:53:43PM +0100, Christoph Hellwig wrote:
> Just use the generic anon_inode file system.

Are you changing the lifetime rules for that module?


Re: [PATCH v3 06/15] mfd: Add ROHM BD71815 ID

2021-03-10 Thread Lee Jones
On Wed, 10 Mar 2021, Matti Vaittinen wrote:

> 
> On Wed, 2021-03-10 at 13:31 +, Lee Jones wrote:
> > On Wed, 10 Mar 2021, Matti Vaittinen wrote:
> > 
> > > On Wed, 2021-03-10 at 11:17 +, Lee Jones wrote:
> > > > On Wed, 10 Mar 2021, Vaittinen, Matti wrote:
> > > > 
> > > > > Hello Lee,
> > > > > 
> > > > > On Wed, 2021-03-10 at 10:36 +, Lee Jones wrote:
> > > > > > On Mon, 08 Mar 2021, Matti Vaittinen wrote:
> > > > > > 
> > > > > > > Add chip ID for ROHM BD71815 and PMIC so that drivers can
> > > > > > > identify
> > > > > > > this IC.
> > > > > > > 
> > > > > > > Signed-off-by: Matti Vaittinen <
> > > > > > > matti.vaitti...@fi.rohmeurope.com>
> > > > > > > Acked-for-MFD-by: Lee Jones 
> > > > > > > ---
> > > > > > >  include/linux/mfd/rohm-generic.h | 1 +
> > > > > > >  1 file changed, 1 insertion(+)
> > > > > > > 
> > > > > > > diff --git a/include/linux/mfd/rohm-generic.h
> > > > > > > b/include/linux/mfd/rohm-generic.h
> > > > > > > index 66f673c35303..e5392bcbc098 100644
> > > > > > > --- a/include/linux/mfd/rohm-generic.h
> > > > > > > +++ b/include/linux/mfd/rohm-generic.h
> > > > > > > @@ -14,6 +14,7 @@ enum rohm_chip_type {
> > > > > > >   ROHM_CHIP_TYPE_BD71828,
> > > > > > >   ROHM_CHIP_TYPE_BD9571,
> > > > > > >   ROHM_CHIP_TYPE_BD9574,
> > > > > > > + ROHM_CHIP_TYPE_BD71815,
> > > > > > 
> > > > > > Is there a technical reason why these can't be re-ordered?
> > > > > 
> > > > > No, I don't think so.
> > > > > 
> > > > > BTW. there will probably be a (trivial) conflict here as both
> > > > > this
> > > > > series and the BD9576/BD9573 series add an ID here. Let me
> > > > > guess,
> > > > > you'd
> > > > 
> > > > That's fine.  I will resolve that manually.
> > > 
> > > Thanks :)
> > > 
> > > > > like to see them sorted?
> > > > 
> > > > Wouldn't that be nice? :)
> > > Aesthetics is not really my cup of tea. OTOH, if amount of IDs
> > > grow,
> > > then sorting helps spotting whether some IC has an ID here. So yes,
> > > it
> > > kind of makes sense.
> > 
> > By 'nice' I don't mean 'pretty'.
> > 
> > I mean 'improving readability/maintainability would be nice'.
> > 
> > > Can you do sorting while resolving the conflict between series or
> > > do
> > > you want me to
> > > a) do sorting if (when) I re-spin the series
> > > b) send separate sorting patch as a part of this series
> > > c) send sepatate sorting patch after all the pending patches
> > > touching
> > > these IDs have been merged?
> > 
> > I'll let you use your imagination.
> > 
> 
> Right :)
> 
> I'll sort the ID enum when I respin a series which is touching it, ok?
> Or do you want me to resend this even if there were no other changes?
> 
> It's just an old habit to add new enums at the bottom to maintain
> binary compatibility - which does not matter in this case.

I won't let this alone hold up merging of the whole set, but it looks
like you're still short of quite a few reviews.  I'd be surprised if
it's this version that gets applied.

-- 
Lee Jones [李琼斯]
Senior Technical Lead - Developer Services
Linaro.org │ Open source software for Arm SoCs
Follow Linaro: Facebook | Twitter | Blog


RE: [EXT] Re: [net-next] net: mvpp2: Add reserved port private flag configuration

2021-03-10 Thread Stefan Chulski



> -Original Message-
> From: Andrew Lunn 
> Sent: Wednesday, March 10, 2021 5:51 PM
> To: Stefan Chulski 
> Cc: net...@vger.kernel.org; thomas.petazz...@bootlin.com;
> da...@davemloft.net; Nadav Haklai ; Yan
> Markman ; linux-kernel@vger.kernel.org;
> k...@kernel.org; li...@armlinux.org.uk; m...@semihalf.com;
> rmk+ker...@armlinux.org.uk; aten...@kernel.org; rab...@solid-run.com
> Subject: [EXT] Re: [net-next] net: mvpp2: Add reserved port private flag
> configuration
> 
> External Email
> 
> --
> >  static void mvpp2_ethtool_get_strings(struct net_device *netdev, u32
> sset,
> >   u8 *data)
> >  {
> > struct mvpp2_port *port = netdev_priv(netdev);
> > int i, q;
> >
> > -   if (sset != ETH_SS_STATS)
> > -   return;
> > +   switch (sset) {
> > +   case ETH_SS_STATS:
> > +   for (i = 0; i < ARRAY_SIZE(mvpp2_ethtool_mib_regs); i++) {
> > +   strscpy(data, mvpp2_ethtool_mib_regs[i].string,
> > +   ETH_GSTRING_LEN);
> > +   data += ETH_GSTRING_LEN;
> > +   }
> 
> Hi Stefan
> 
> Maybe rename the existing function to
> mvpp2_ethtool_get_strings_stats() and turn it into a helper. Add a new
> mvpp2_ethtool_get_strings_priv() helper. And a new
> mvpp2_ethtool_get_strings() which just calls the two helpers. 

OK, I can do this.
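Something along these lines, as a rough sketch (helper names as you
suggested; the priv-flags body is a placeholder):

	static void mvpp2_ethtool_get_strings_stats(struct net_device *netdev,
						    u8 *data)
	{
		int i;

		for (i = 0; i < ARRAY_SIZE(mvpp2_ethtool_mib_regs); i++) {
			strscpy(data, mvpp2_ethtool_mib_regs[i].string,
				ETH_GSTRING_LEN);
			data += ETH_GSTRING_LEN;
		}
		/* ... remaining ETH_SS_STATS strings ... */
	}

	static void mvpp2_ethtool_get_strings_priv(struct net_device *netdev,
						   u8 *data)
	{
		/* copy the private flag names, e.g. "reserved-port" */
	}

	static void mvpp2_ethtool_get_strings(struct net_device *netdev,
					      u32 sset, u8 *data)
	{
		switch (sset) {
		case ETH_SS_STATS:
			mvpp2_ethtool_get_strings_stats(netdev, data);
			break;
		case ETH_SS_PRIV_FLAGS:
			mvpp2_ethtool_get_strings_priv(netdev, data);
			break;
		}
	}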

> Overall the
> patch should be smaller and much easier to review.
> 
> Andrew

Make it patch series? I can split it to 2/3 patches.
Thanks,
Stefan.



Re: [PATCH V2] ASoC: soc-core: Prevent warning if no DMI table is present

2021-03-10 Thread Takashi Iwai
On Wed, 10 Mar 2021 17:18:14 +0100,
Mark Brown wrote:
> 
> On Wed, Mar 10, 2021 at 09:44:07AM -0600, Pierre-Louis Bossart wrote:
> > On 3/10/21 7:35 AM, Mark Brown wrote:
> 
> > > Just change it to a system level check for ACPI, checking for OF would
> > > leave problems for board files or any other alternative firmware
> > > interfaces.
> 
> > did you mean if (!IS_ENABLED(CONFIG_ACPI)) ?
> 
> Is there a runtime check?

Well, basically both DMI and ACPI are completely different things, so
I don't think it's right to check the availability of ACPI as a signal
of the availability of DMI.


Takashi


Re: [PATCH 3/9] powerpc/pseries: remove the ppc-cmm file system

2021-03-10 Thread Al Viro
On Tue, Mar 09, 2021 at 04:53:42PM +0100, Christoph Hellwig wrote:
> Just use the generic anon_inode file system.

Umm...  The only problem I see here is the lifetime rules for
that module, and that's not something introduced in this patchset.
That said, it looks like the logic around that place is duplicated in
cmm.c, vmw_balloon.c and virtio_balloon.c, and I wonder if it would
be better off with a helper in mm/balloon.c to be used for that setup...


[PATCH]] drm/amdgpu/gfx9: add gfxoff quirk

2021-03-10 Thread Daniel Gomez
Disabling GFXOFF via the quirk list fixes a hardware lockup in
Ryzen V1605B, RAVEN 0x1002:0x15DD rev 0x83.

Signed-off-by: Daniel Gomez 
---

This patch is a continuation of the work here:
https://lkml.org/lkml/2021/2/3/122 where a hardware lockup was discussed and
a dma_fence deadlock was provoked as a side effect. To reproduce the issue
please refer to the above link.

The hardware lockup was introduced in 5.6-rc1 for our particular revision as it
wasn't part of the new blacklist. Before that, in kernel v5.5, this hardware was
working fine without any hardware lockup because GFXOFF was actually disabled
by the if condition for the CHIP_RAVEN case. So this patch adds the 'Radeon
Vega Mobile Series [1002:15dd] (rev 83)' to the blacklist to disable the GFXOFF.

But besides the fix, I'd like to ask where this revision comes from. Is it
an ASIC revision or is it hardcoded in the VBIOS by our vendor? From what I
can see, it comes from the ASIC and I wonder if somehow we can get an APU in the
future, 'not blacklisted', with the same problem. Then, should this table only
filter for the vendor and device and not the revision? Do you know if there are
any revisions for the 1002:15dd validated, tested and functional?

Logs:
[   27.708348] [drm] initializing kernel modesetting (RAVEN
0x1002:0x15DD 0x1002:0x15DD 0x83).
[   27.789156] amdgpu: ATOM BIOS: 113-RAVEN-115

Thanks in advance,
Daniel

 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 65db88bb6cbc..319d4b99aec8 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -1243,6 +1243,8 @@ static const struct amdgpu_gfxoff_quirk 
amdgpu_gfxoff_quirk_list[] = {
{ 0x1002, 0x15dd, 0x103c, 0x83e7, 0xd3 },
/* GFXOFF is unstable on C6 parts with a VBIOS 113-RAVEN-114 */
{ 0x1002, 0x15dd, 0x1002, 0x15dd, 0xc6 },
+   /* GFXOFF provokes a hw lockup on 83 parts with a VBIOS 113-RAVEN-115 */
+   { 0x1002, 0x15dd, 0x1002, 0x15dd, 0x83 },
{ 0, 0, 0, 0, 0 },
 };

--
2.30.1



Re: [PATCH v6 3/6] x86/elf: Support a new ELF aux vector AT_MINSIGSTKSZ

2021-03-10 Thread Bae, Chang Seok
On Mar 5, 2021, at 02:43, Borislav Petkov  wrote:
> On Sat, Feb 27, 2021 at 08:59:08AM -0800, Chang S. Bae wrote:
>> Historically, signal.h defines MINSIGSTKSZ (2KB) and SIGSTKSZ (8KB), for
>> use by all architectures with sigaltstack(2). Over time, the hardware state
>> size grew, but these constants did not evolve. Today, literal use of these
>> constants on several architectures may result in signal stack overflow, and
>> thus user data corruption.
>> 
>> A few years ago, the ARM team addressed this issue by establishing
>> getauxval(AT_MINSIGSTKSZ). This enables the kernel to supply at runtime
>> value that is an appropriate replacement on the current and future
>> hardware.
>> 
>> Add getauxval(AT_MINSIGSTKSZ) support to x86, analogous to the support
>> added for ARM in commit 94b07c1f8c39 ("arm64: signal: Report signal frame
>> size to userspace via auxv").
>> 
>> Also, include a documentation to describe x86-specific auxiliary vectors.
>> 
>> Reported-by: Florian Weimer 
>> Fixes: c2bc11f10a39 ("x86, AVX-512: Enable AVX-512 States Context Switch")
> 
> Right, so this has a Fixes: tag and points to bugzilla entry which talks
> about signal stack corruption with AVX-512F.
> 
> But if this is going to be backported to stable, then the patch(es)
> should be minimal and not contain documentation. And if so, one will
> need all three to be backported, which means, a cc:stable should contain
> a comment explaining that.
> 
> Or am I misreading and they should not need to be backported to stable
> because some ?
> 
> Also, I'm not sure backporting a patch to stable which changes ABI is
> ok. It probably is but I don't know.
> 
> So what's the deal here?

Yeah, right. While this attempts to fix the issue, it involves an ABI change.
Len and I think PATCH5 [1] is rather a backport candidate as it gives a more
reasonable behavior.

At least, I can make a new patch for this documentation if you think it is the
right way.

> You also need:
> 
> diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
> index 4693e192b447..d58614d5cde6 100644
> --- a/Documentation/x86/index.rst
> +++ b/Documentation/x86/index.rst
> @@ -35,3 +35,4 @@ x86-specific Documentation
>sva
>sgx
>features
> +   elf_auxvec
> 
> to add this to the TOC.

Ah, will do that.

>> +   #include <signal.h>
>> +   #include <sys/auxv.h>
>> +
>> +   #ifndef AT_MINSIGSTKSZ
>> +   #define AT_MINSIGSTKSZ   51
>> +   #endif
>> +
>> +   stack_t ss;
>> +   int err;
>> +
>> +   ss.ss_size = getauxval(AT_MINSIGSTKSZ) + SIGSTKSZ;
>> +   ss.ss_sp = malloc(ss.ss_size);
>> +   ...
>> +
>> +   err = sigaltstack(&ss, NULL);
>> +   ...
> 
> That source code needs some special markup to look like source code -
> currently, the result looks bad.

How about this code:

#include <assert.h>
#include <err.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/auxv.h>

#ifndef AT_MINSIGSTKSZ
#define AT_MINSIGSTKSZ  51
#endif

stack_t ss;

/* query the kernel-required minimum before sizing the allocation */
ss.ss_size = getauxval(AT_MINSIGSTKSZ) + SIGSTKSZ;
ss.ss_sp = malloc(ss.ss_size);
assert(ss.ss_sp);

ss.ss_flags = 0;

if (sigaltstack(&ss, NULL))
        err(1, "sigaltstack");


>> +2. The exposed auxiliary vectors
>> +-
>> +
>> +AT_SYSINFO
>> +The entry point to the system call function the virtual Dynamic Shared
>> +Object (vDSO), not exported on 64-bit.
> 
> I can't parse that sentence.
> 
>> +
>> +AT_SYSINFO_EHDR
>> +The start address of the page containing vDSO.
>   ^
>   the
>> +
>> +AT_MINSIGSTKSZ
>> +The minimum stack size required to deliver a signal. It is a calculated
>> +sigframe size based on the largest possible user context. When programs
>> +use sigaltstack() to provide alternate signal stack, that stack must be
>> +at least the size to function properly on this hardware. Note that this
>> +is a minimum of the kernel to correctly get to the signal handler.
> 
> I get what this is trying to say but it reads weird. Simplify pls.
> 
>> +Additional space must be added to handle objects pushed onto the stack
>> +by the signal handlers, as well as for nested signal delivery.
>> +
>> +The purpose of this parameter is to accommodate the different stack
>> +sizes required by different hardware configuration. E.g., the x86
>> +system supporting the Advanced Vector Extension needs at least 8KB more
>> +than the one without it.
> 
> That could be simplified too.

Rewrote like this:

AT_SYSINFO is used for locating the vsyscall entry point.  It is not exported
on 64-bit mode.

AT_SYSINFO_EHDR is the start address of the page containing the vDSO.

AT_MINSIGSTKSZ denotes the minimum stack size required by the kernel to
deliver a signal to user-space.  AT_MINSIGSTKSZ covers the space consumed
by the kernel to accommodate the user context for the current hardware
configuration.  It does not cover subsequent user-space stack
consumption, which must be added by the user.  (e.g. Above, user-space 

Re: [PATCH v4 0/4] Some improvement for Intel MAX 10 MFD drivers

2021-03-10 Thread Lee Jones
On Wed, 10 Mar 2021, Xu Yilun wrote:

> This patchset is some improvements for intel-m10-bmc and its subdevs.
> 
> Main changes from v1:
> - Add a patch (#2) to simplify the definition of the legacy version reg.
> - Add a patch (#4), add entry in MAINTAINERS for intel-m10-bmc mfd driver
>   and the subdev drivers.
> 
> Main changes from v2:
> - Add Tom Rix as the reviewer for intel-m10-bmc mfd driver and the subdev
>   drivers.
> - Rebased to 5.12-rc1
> 
> Main changes from v3:
> - Improve the comments for valid version check.

Good enough.

All applied, thanks.

-- 
Lee Jones [李琼斯]
Senior Technical Lead - Developer Services
Linaro.org │ Open source software for Arm SoCs
Follow Linaro: Facebook | Twitter | Blog


RE: [PATCH v3 1/3] scsi: ufshcd: use a function to calculate versions

2021-03-10 Thread Avri Altman
> @@ -9298,10 +9291,7 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem
> *mmio_base, unsigned int irq)
> /* Get UFS version supported by the controller */
> hba->ufs_version = ufshcd_get_ufs_version(hba);
> 
> -   if ((hba->ufs_version != UFSHCI_VERSION_10) &&
> -   (hba->ufs_version != UFSHCI_VERSION_11) &&
> -   (hba->ufs_version != UFSHCI_VERSION_20) &&
> -   (hba->ufs_version != UFSHCI_VERSION_21))
> +   if (hba->ufs_version < ufshci_version(1, 0))
> dev_err(hba->dev, "invalid UFS version 0x%x\n",
> hba->ufs_version);
Here you replace the specific allowable values with an expression
that doesn't really reflect those values.
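
For context, the helper introduced earlier in the series packs the
major/minor numbers into the UFSHCI VER register layout (major in bits
15:8, minor in bits 7:4), roughly:

	static inline u32 ufshci_version(u32 major, u32 minor)
	{
		return (major << 8) | (minor << 4);
	}

so ufshci_version(1, 0) is the lowest value the register can hold, and
the check above treats anything below it as invalid rather than listing
the individual supported versions.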


[PATCH 3/3] arm64: dts: qcom: sm8150: add i2c nodes

2021-03-10 Thread Caleb Connolly
Tested on the OnePlus 7 Pro (including DMA).

Signed-off-by: Caleb Connolly 
---
 arch/arm64/boot/dts/qcom/sm8150.dtsi | 521 +++
 1 file changed, 521 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/sm8150.dtsi 
b/arch/arm64/boot/dts/qcom/sm8150.dtsi
index 543417d74216..0a38ad54c715 100644
--- a/arch/arm64/boot/dts/qcom/sm8150.dtsi
+++ b/arch/arm64/boot/dts/qcom/sm8150.dtsi
@@ -588,6 +588,111 @@ qupv3_id_0: geniqup@8c {
#size-cells = <2>;
ranges;
status = "disabled";
+
+   i2c0: i2c@88 {
+   compatible = "qcom,geni-i2c";
+   reg = <0 0x0088 0 0x4000>;
+   clock-names = "se";
+   clocks = < GCC_QUPV3_WRAP0_S0_CLK>;
+   pinctrl-names = "default";
+   pinctrl-0 = <_i2c0_default>;
+   interrupts = ;
+   #address-cells = <1>;
+   #size-cells = <0>;
+   status = "disabled";
+   };
+
+   i2c1: i2c@884000 {
+   compatible = "qcom,geni-i2c";
+   reg = <0 0x00884000 0 0x4000>;
+   clock-names = "se";
+   clocks = < GCC_QUPV3_WRAP0_S1_CLK>;
+   pinctrl-names = "default";
+   pinctrl-0 = <_i2c1_default>;
+   interrupts = ;
+   #address-cells = <1>;
+   #size-cells = <0>;
+   status = "disabled";
+   };
+
+   i2c2: i2c@888000 {
+   compatible = "qcom,geni-i2c";
+   reg = <0 0x00888000 0 0x4000>;
+   clock-names = "se";
+   clocks = < GCC_QUPV3_WRAP0_S2_CLK>;
+   pinctrl-names = "default";
+   pinctrl-0 = <_i2c2_default>;
+   interrupts = ;
+   #address-cells = <1>;
+   #size-cells = <0>;
+   status = "disabled";
+   };
+
+   i2c3: i2c@88c000 {
+   compatible = "qcom,geni-i2c";
+   reg = <0 0x0088c000 0 0x4000>;
+   clock-names = "se";
+   clocks = < GCC_QUPV3_WRAP0_S3_CLK>;
+   pinctrl-names = "default";
+   pinctrl-0 = <_i2c3_default>;
+   interrupts = ;
+   #address-cells = <1>;
+   #size-cells = <0>;
+   status = "disabled";
+   };
+
+   i2c4: i2c@89 {
+   compatible = "qcom,geni-i2c";
+   reg = <0 0x0089 0 0x4000>;
+   clock-names = "se";
+   clocks = < GCC_QUPV3_WRAP0_S4_CLK>;
+   pinctrl-names = "default";
+   pinctrl-0 = <_i2c4_default>;
+   interrupts = ;
+   #address-cells = <1>;
+   #size-cells = <0>;
+   status = "disabled";
+   };
+
+   i2c5: i2c@894000 {
+   compatible = "qcom,geni-i2c";
+   reg = <0 0x00894000 0 0x4000>;
+   clock-names = "se";
+   clocks = < GCC_QUPV3_WRAP0_S5_CLK>;
+   pinctrl-names = "default";
+   pinctrl-0 = <_i2c5_default>;
+   interrupts = ;
+   #address-cells = <1>;
+   #size-cells = <0>;
+   status = "disabled";
+   };
+
+   i2c6: i2c@898000 {
+   compatible = "qcom,geni-i2c";
+   reg = <0 0x00898000 0 0x4000>;
+   clock-names = "se";
+   clocks = < GCC_QUPV3_WRAP0_S6_CLK>;
+   pinctrl-names = "default";
+   pinctrl-0 = <_i2c6_default>;
+   interrupts = ;
+   #address-cells = <1>;
+   #size-cells = <0>;
+

[PATCH 2/3] arm64: dts: qcom: sm8150: add iommus to qups

2021-03-10 Thread Caleb Connolly
Hook up the SMMU for doing DMA over i2c. Some peripherals like
touchscreens easily exceed 32-bytes per transfer, causing errors and
lockups without this.

Signed-off-by: Caleb Connolly 
---
Fixes i2c on the OnePlus 7, without this touching the screen with more
than 4 fingers causes the device to lock up and reboot.
---
 arch/arm64/boot/dts/qcom/sm8150.dtsi | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/sm8150.dtsi 
b/arch/arm64/boot/dts/qcom/sm8150.dtsi
index 03e05d98daf2..543417d74216 100644
--- a/arch/arm64/boot/dts/qcom/sm8150.dtsi
+++ b/arch/arm64/boot/dts/qcom/sm8150.dtsi
@@ -583,6 +583,7 @@ qupv3_id_0: geniqup@8c {
clock-names = "m-ahb", "s-ahb";
clocks = < GCC_QUPV3_WRAP_0_M_AHB_CLK>,
 < GCC_QUPV3_WRAP_0_S_AHB_CLK>;
+   iommus = <_smmu 0xc3 0x0>;
#address-cells = <2>;
#size-cells = <2>;
ranges;
@@ -595,6 +596,7 @@ qupv3_id_1: geniqup@ac {
clock-names = "m-ahb", "s-ahb";
clocks = < GCC_QUPV3_WRAP_1_M_AHB_CLK>,
 < GCC_QUPV3_WRAP_1_S_AHB_CLK>;
+   iommus = <_smmu 0x603 0x0>;
#address-cells = <2>;
#size-cells = <2>;
ranges;
@@ -617,6 +619,7 @@ qupv3_id_2: geniqup@cc {
clock-names = "m-ahb", "s-ahb";
clocks = < GCC_QUPV3_WRAP_2_M_AHB_CLK>,
 < GCC_QUPV3_WRAP_2_S_AHB_CLK>;
+   iommus = <_smmu 0x7a3 0x0>;
#address-cells = <2>;
#size-cells = <2>;
ranges;
-- 
2.29.2




[PATCH 1/3] arm64: dts: qcom: sm8150: add other QUP nodes

2021-03-10 Thread Caleb Connolly
Add the first and third qupv3 nodes used to hook
up peripherals on some devices.

Signed-off-by: Caleb Connolly 
---
 arch/arm64/boot/dts/qcom/sm8150.dtsi | 25 +
 1 file changed, 25 insertions(+)

diff --git a/arch/arm64/boot/dts/qcom/sm8150.dtsi 
b/arch/arm64/boot/dts/qcom/sm8150.dtsi
index e5bb17bc2f46..03e05d98daf2 100644
--- a/arch/arm64/boot/dts/qcom/sm8150.dtsi
+++ b/arch/arm64/boot/dts/qcom/sm8150.dtsi
@@ -577,6 +577,18 @@ gcc: clock-controller@10 {
 <_clk>;
};
 
+   qupv3_id_0: geniqup@8c {
+   compatible = "qcom,geni-se-qup";
+   reg = <0x0 0x008c 0x0 0x6000>;
+   clock-names = "m-ahb", "s-ahb";
+   clocks = < GCC_QUPV3_WRAP_0_M_AHB_CLK>,
+< GCC_QUPV3_WRAP_0_S_AHB_CLK>;
+   #address-cells = <2>;
+   #size-cells = <2>;
+   ranges;
+   status = "disabled";
+   };
+
qupv3_id_1: geniqup@ac {
compatible = "qcom,geni-se-qup";
reg = <0x0 0x00ac 0x0 0x6000>;
@@ -598,6 +610,19 @@ uart2: serial@a9 {
};
};
 
+   qupv3_id_2: geniqup@cc {
+   compatible = "qcom,geni-se-qup";
+   reg = <0x0 0x00cc 0x0 0x6000>;
+
+   clock-names = "m-ahb", "s-ahb";
+   clocks = < GCC_QUPV3_WRAP_2_M_AHB_CLK>,
+< GCC_QUPV3_WRAP_2_S_AHB_CLK>;
+   #address-cells = <2>;
+   #size-cells = <2>;
+   ranges;
+   status = "disabled";
+   };
+
config_noc: interconnect@150 {
compatible = "qcom,sm8150-config-noc";
reg = <0 0x0150 0 0x7400>;
-- 
2.29.2




Re: [PATCH] net: add net namespace inode for all net_dev events

2021-03-10 Thread Steven Rostedt
On Wed, 10 Mar 2021 17:03:40 +0800
Tony Lu  wrote:

> I use pahole to read vmlinux.o directly with defconfig and
> CONFIG_DEBUG_INFO enabled, the result shows 22 structs prefixed with
> trace_event_raw_ that have at least one hole.

I was thinking of pahole too ;-)

But the information can also be captured from the format files with simple
scripts as well. And perhaps be more tuned to see if there's actually a fix
for them, and ignore reporting it if there is no fix, as all trace events
are 4 byte aligned, and if we are off by one, sometimes it doesn't matter.

-- Steve


Re: [PATCH v6 1/6] uapi: Define the aux vector AT_MINSIGSTKSZ

2021-03-10 Thread Bae, Chang Seok
On Mar 1, 2021, at 11:09, Borislav Petkov  wrote:
> On Sat, Feb 27, 2021 at 08:59:06AM -0800, Chang S. Bae wrote:
>> 
>> diff --git a/include/uapi/linux/auxvec.h b/include/uapi/linux/auxvec.h
>> index abe5f2b6581b..15be98c75174 100644
>> --- a/include/uapi/linux/auxvec.h
>> +++ b/include/uapi/linux/auxvec.h
>> @@ -33,5 +33,8 @@
>> 
>> #define AT_EXECFN  31/* filename of program */
>> 
>> +#ifndef AT_MINSIGSTKSZ
>> +#define AT_MINSIGSTKSZ  51  /* stack needed for signal delivery  */
> 
> I know glibc's comment says a similar thing but the correct thing to say
> here should be "minimal stack size for signal delivery" or so. Even the
> variable name alludes to that too.

Yeah, you’re right.

Thanks,
Chang

Re: [PATCH V11 3/5] kbuild: Allow .dtso format for overlay source files

2021-03-10 Thread Masahiro Yamada
On Wed, Mar 10, 2021 at 11:47 PM Viresh Kumar  wrote:
>
> On 10-03-21, 20:24, Masahiro Yamada wrote:
> > On Wed, Mar 10, 2021 at 2:35 PM Viresh Kumar  
> > wrote:
> > > diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
> > > index bc045a54a34e..59e86f67f9e0 100644
> > > --- a/scripts/Makefile.lib
> > > +++ b/scripts/Makefile.lib
> > > @@ -339,7 +339,7 @@ $(obj)/%.dtb.S: $(obj)/%.dtb FORCE
> > >
> > >  quiet_cmd_dtc = DTC $@
> > >  cmd_dtc = $(HOSTCC) -E $(dtc_cpp_flags) -x assembler-with-cpp -o 
> > > $(dtc-tmp) $< ; \
> > > -   $(DTC) -O $(patsubst .%,%,$(suffix $@)) -o $@ -b 0 \
> > > +   $(DTC) -I dts -O $(patsubst .%,%,$(suffix $@)) -o $@ -b 0 \
> >
> > Even without "-I dts",
> >
> >inform = guess_input_format(arg, "dts");
> >
> > seems to fall back to "dts" anyway,
>
> I missed this TBH.
>
> > but I guess you wanted to make this explicit, correct?
>
> That can be a reason now :)
>
> > I will drop the ugly -O.
> > https://patchwork.kernel.org/project/linux-kbuild/patch/20210310110824.782209-1-masahi...@kernel.org/
>
> But if we are going to depend on DTC to guess it right, then we
> shouldn't add -I at all..
>
> > I will queue it to linux-kbuild/fixes.
> >
> >
> >
> > > $(addprefix -i,$(dir $<) $(DTC_INCLUDE)) $(DTC_FLAGS) \
> > > -d $(depfile).dtc.tmp $(dtc-tmp) ; \
> > > cat $(depfile).pre.tmp $(depfile).dtc.tmp > $(depfile)
> > > @@ -347,9 +347,13 @@ cmd_dtc = $(HOSTCC) -E $(dtc_cpp_flags) -x 
> > > assembler-with-cpp -o $(dtc-tmp) $< ;
> > >  $(obj)/%.dtb: $(src)/%.dts $(DTC) FORCE
> > > $(call if_changed_dep,dtc)
> > >
> > > +# Required for of unit-test files as they can't be renamed to .dtso
> >
> > If you go with *.dtso, I think you will rename
> > all *.dts under the drivers/ directory.
> >
> > What is blocking you from making this consistent?
>
> The unit-test dts files are designed differently (we have had lots of
> discussion between Frank and David on that) and they aren't purely
> overlay or base files. They are designed to do some tricky testing and
> renaming them to .dtso won't be right, we are just reusing them to do
> static (build time) testing as well.


I still do not understand.

If they are not overlay files, why
do you need to have them suffixed with .dtbo?

".dts -> .dtb" should be enough.

Why do you need to do ".dts  -> .dtbo" ?




> I think it would be better if we can drop the existing %.dtbo rule
> here (i.e. dtbo from .dts) and do some magic in unit-test's Makefile,
> so it is localised at least instead of it here.
>
> Any ideas for that ?

I do not know.

My impression is you are doing something fishy.




-- 
Best Regards
Masahiro Yamada


Re: [PATCH v10 1/2] scsi: ufs: Enable power management for wlun

2021-03-10 Thread Alan Stern
On Tue, Mar 09, 2021 at 08:04:53PM -0800, Asutosh Das (asd) wrote:
> On 3/9/2021 7:14 PM, Alan Stern wrote:
> > On Tue, Mar 09, 2021 at 07:04:34PM -0800, Asutosh Das (asd) wrote:
> > > Hello
> > > I & Can (thanks CanG) debugged this further:
> > > 
> > > Looks like this issue can occur if the sd probe is asynchronous.
> > > 
> > > Essentially, the sd_probe() is done asynchronously and 
> > > driver_probe_device()
> > > invokes pm_runtime_get_suppliers() before invoking sd_probe().
> > > 
> > > But scsi_probe_and_add_lun() runs in a separate context.
> > > So the scsi_autopm_put_device() invoked from scsi_scan_host() context
> > > reduces the link->rpm_active to 1. And sd_probe() invokes
> > > scsi_autopm_put_device() and starts a timer. And then 
> > > driver_probe_device()
> > > invoked from __device_attach_async_helper context reduces the
> > > link->rpm_active to 1 thus enabling the supplier to suspend before the
> > > consumer suspends.
> > 
> > > I don't see a way around this. Please let me know if you
> > > (@Alan/@Bart/@Adrian) have any thoughts on this.
> > 
> > How about changing the SCSI core so that it does a runtime_get before
> > starting an async probe, and the async probe routine does a
> > runtime_put when it is finished?  In other words, don't allow a device
> > to go into runtime suspend while it is waiting to be probed.
> > 
> > I don't think that would be too intrusive.
> > 
> > Alan Stern
> > 
> 
> Hi Alan
> Thanks for the suggestion.
> 
> Am trying to understand:
> 
> Do you mean something like this:
> 
> int scsi_sysfs_add_sdev(struct scsi_device *sdev)
> {
>   
>   scsi_autopm_get_device(sdev);
>   pm_runtime_get_noresume(&sdev->sdev_gendev);
>   [...]
>   scsi_autopm_put_device(sdev);
>   [...]
> }
> 
> static int sd_probe(struct device *dev)
> {
>   [...]
>   pm_runtime_put_noidle(dev);
>   scsi_autopm_put_device(sdp);
>   [...]
> }
> 
> This may work (I'm limited by my imagination in scsi layer :) ).

I'm not sure about this.  To be honest, I did not read the entirety of 
your last message; it had way too much detail.  There's a time and place 
for that, but when you're brainstorming to figure out the underlying 
cause of a problem and come up with a strategy to fix it, you want to 
concentrate on the overall picture, not the details.

As I understand the situation, you've got a SCSI target with multiple 
logical units, let's say A and B, and you need to make sure that A never 
goes into runtime suspend unless B is already suspended.  In other 
words, B always has to suspend before A and resume after A.

To do this, you register a device link with A as the supplier and B as 
the consumer.  Then the PM core takes care of the ordering for you.
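
Schematically (sdev_a/sdev_b are placeholder names for the two logical
units):

	/* B (consumer) suspends before and resumes after A (supplier) */
	struct device_link *link;

	link = device_link_add(&sdev_b->sdev_gendev, &sdev_a->sdev_gendev,
			       DL_FLAG_PM_RUNTIME | DL_FLAG_RPM_ACTIVE);
	if (!link)
		dev_warn(&sdev_b->sdev_gendev, "failed to create PM link\n");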

But I don't understand when you set up the device link.  If the timing 
is wrong then, thanks to async SCSI probing, you may have a situation 
where A is registered before B and before the link is set up.  Then 
there's temporarily nothing to stop A from suspending before B.

You also need to prevent each device from suspending before it is 
probed.  That's the easy part I was trying to address before (although 
it may not be so easy if the drivers are in loadable modules and not 
present in the kernel).

You need to think through these issues before proposing actual changes.

> But the pm_runtime_put_noidle() would have to be added to all registered
> scsi_driver{}, perhaps? Or may be I can check for sdp->type?

Like this; it's too early to worry about this sort of thing.

Alan Stern


[PATCH v2] mm, tracing: improve rss_stat tracepoint message

2021-03-10 Thread Ovidiu Panait
Adjust the rss_stat tracepoint to print the name of the resident page type
that got updated (e.g. MM_ANONPAGES/MM_FILEPAGES), rather than the numeric
index corresponding to it (the __entry->member value):

Before this patch:
--
rss_stat: mm_id=1216113068 curr=0 member=1 size=28672B
rss_stat: mm_id=1216113068 curr=0 member=1 size=0B
rss_stat: mm_id=534402304 curr=1 member=0 size=188416B
rss_stat: mm_id=534402304 curr=1 member=1 size=40960B

After this patch:
-
rss_stat: mm_id=1726253524 curr=1 type=MM_ANONPAGES size=40960B
rss_stat: mm_id=1726253524 curr=1 type=MM_FILEPAGES size=663552B
rss_stat: mm_id=1726253524 curr=1 type=MM_ANONPAGES size=65536B
rss_stat: mm_id=1726253524 curr=1 type=MM_FILEPAGES size=647168B

Use TRACE_DEFINE_ENUM()/__print_symbolic() logic to map the enum values to
the strings they represent, so that userspace tools can also parse the raw
data correctly.

Signed-off-by: Ovidiu Panait 
Signed-off-by: Steven Rostedt 
---
v2: - rework the patch so that user space tools can parse the raw
  data correctly

 include/trace/events/kmem.h | 24 ++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 3a60b6b6db32..829a75692cc0 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -343,6 +343,26 @@ static unsigned int __maybe_unused mm_ptr_to_hash(const 
void *ptr)
 #define __PTR_TO_HASHVAL
 #endif
 
+#define TRACE_MM_PAGES \
+   EM(MM_FILEPAGES)\
+   EM(MM_ANONPAGES)\
+   EM(MM_SWAPENTS) \
+   EMe(MM_SHMEMPAGES)
+
+#undef EM
+#undef EMe
+
+#define EM(a)  TRACE_DEFINE_ENUM(a);
+#define EMe(a) TRACE_DEFINE_ENUM(a);
+
+TRACE_MM_PAGES
+
+#undef EM
+#undef EMe
+
+#define EM(a)  { a, #a },
+#define EMe(a) { a, #a }
+
 TRACE_EVENT(rss_stat,
 
TP_PROTO(struct mm_struct *mm,
@@ -365,10 +385,10 @@ TRACE_EVENT(rss_stat,
__entry->size = (count << PAGE_SHIFT);
),
 
-   TP_printk("mm_id=%u curr=%d member=%d size=%ldB",
+   TP_printk("mm_id=%u curr=%d type=%s size=%ldB",
__entry->mm_id,
__entry->curr,
-   __entry->member,
+   __print_symbolic(__entry->member, TRACE_MM_PAGES),
__entry->size)
);
 #endif /* _TRACE_KMEM_H */
-- 
2.17.1



Re: [PATCH] arm64: make STACKPROTECTOR_PER_TASK configurable.

2021-03-10 Thread Michal Suchánek
On Wed, Mar 10, 2021 at 04:07:00AM +0900, Masahiro Yamada wrote:
> On Wed, Mar 10, 2021 at 12:10 AM Michal Suchánek  wrote:
> >
> > On Tue, Mar 09, 2021 at 11:53:21PM +0900, Masahiro Yamada wrote:
> > > On Tue, Mar 9, 2021 at 10:35 PM Michal Suchánek  wrote:
> > > >
> > > > On Tue, Mar 09, 2021 at 10:22:36PM +0900, Masahiro Yamada wrote:
> > > > > On Tue, Mar 9, 2021 at 9:35 PM Michal Suchanek  
> > > > > wrote:
> > > > > >
> > > > > > When using dummy-tools STACKPROTECTOR_PER_TASK is unconditionally
> > > > > > selected. This defeats the purpose of the all-enabled tool.
> > > > > >
> > > > > > Description copied from arm
> > > > > >
> > > > > > Cc: Masahiro Yamada 
> > > > > > Signed-off-by: Michal Suchanek 
> > > > >
> > > > >
> > > > > Could you explain what problem
> > > > > this patch is trying to solve?
> > > >
> > > > The option cannot be disabled when compiler has the required capability.
> > >
> > >
> > > Yes.
> > > Currently, this symbol claims "def_bool y",
> > > so there is no way to disable it.
> > >
> > > But, it comes from the nature of Kconfig in general.
> > >
> > > dummy-tools is completely unrelated here.
> >
> > dummy-tools makes all configuration options available in order to be
> > able to author configuration files on a system different from the one
> > where the kernel is built. This prevents authoring a configuration file
> > with this option disabled.
> 
> 
> No.
> dummy-tools enables as many $(cc-option, ...)
> and $(shell, ...) as possible. That's it.
> 
> 
> In my understanding, STACKPROTECTOR_PER_TASK
> should not be user-configurable.
> That is why 'def_bool y'.

And that's wrong. Either it's a required compiler feature and then it
should be always enabled. Or it is optional and then it should be
possible to disable - one or the other.

When it is optional you should be able to

 - (de)select it in randconfig
 - test/bisect the old code path without crippling your toolchain
 - author a configuration file that does not have the option enabled,
   for builds on a system that uses an older toolchain

Thanks

Michal


Re: [RFC PATCH 0/3] hugetlb: add demote/split page functionality

2021-03-10 Thread Michal Hocko
On Mon 08-03-21 16:18:52, Mike Kravetz wrote:
[...]
> Converting larger to smaller hugetlb pages can be accomplished today by
> first freeing the larger page to the buddy allocator and then allocating
> the smaller pages.  However, there are two issues with this approach:
> 1) This process can take quite some time, especially if allocation of
>the smaller pages is not immediate and requires migration/compaction.
> 2) There is no guarantee that the total size of smaller pages allocated
>will match the size of the larger page which was freed.  This is
>because the area freed by the larger page could quickly be
>fragmented.

It will likely not be a surprise that I have some reservations. While your
concerns about reconfiguration via existing interfaces are quite real, is
this really a problem in practice? How often do you need such a
reconfiguration?

Is this all really worth the additional code in something as tricky as
the hugetlb code base?

>  include/linux/hugetlb.h |   8 ++
>  mm/hugetlb.c| 199 +++-
>  2 files changed, 204 insertions(+), 3 deletions(-)
> 
> -- 
> 2.29.2
> 

-- 
Michal Hocko
SUSE Labs


[PATCH v5 3/3] arm64: dts: ti: k3-j7200: Add support for higher speed modes and update delay select values for MMCSD subsystems

2021-03-10 Thread Aswath Govindraju
The following speed modes are now supported in the J7200 SoC:
- HS200 and HS400 modes at 1.8 V card voltage, in MMCSD0 subsystem [1].
- UHS-I speed modes in MMCSD1 subsystem [1].

Add support for UHS-I modes by adding voltage regulator device tree nodes
and corresponding pinmux details, to power cycle and voltage switch cards.
Set respective tags in sdhci0 and remove no-1-8-v tag from sdhci1
device tree nodes.

Also update the delay values for various speed modes supported, based on
the latest J7200 datasheet[2].

[1] - section 12.3.6.1.1 MMCSD Features, in
  https://www.ti.com/lit/ug/spruiu1a/spruiu1a.pdf

[2] - https://www.ti.com/lit/ds/symlink/dra821a.pdf

Signed-off-by: Aswath Govindraju 
---
 .../dts/ti/k3-j7200-common-proc-board.dts | 42 +++
 arch/arm64/boot/dts/ti/k3-j7200-main.dtsi | 14 ++-
 2 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/boot/dts/ti/k3-j7200-common-proc-board.dts 
b/arch/arm64/boot/dts/ti/k3-j7200-common-proc-board.dts
index b493f939b09a..6f90c3b1cf45 100644
--- a/arch/arm64/boot/dts/ti/k3-j7200-common-proc-board.dts
+++ b/arch/arm64/boot/dts/ti/k3-j7200-common-proc-board.dts
@@ -16,6 +16,29 @@
stdout-path = "serial2:115200n8";
bootargs = "console=ttyS2,115200n8 
earlycon=ns16550a,mmio32,0x0280";
};
+
+   vdd_mmc1: fixedregulator-sd {
+   compatible = "regulator-fixed";
+   regulator-name = "vdd_mmc1";
+   regulator-min-microvolt = <330>;
+   regulator-max-microvolt = <330>;
+   regulator-boot-on;
+   enable-active-high;
+   gpios = < 2 GPIO_ACTIVE_HIGH>;
+   };
+
+   vdd_sd_dv: gpio-regulator-vdd-sd-dv {
+   compatible = "regulator-gpio";
+   regulator-name = "vdd_sd_dv";
+   pinctrl-names = "default";
+   pinctrl-0 = <_sd_dv_pins_default>;
+   regulator-min-microvolt = <180>;
+   regulator-max-microvolt = <330>;
+   regulator-boot-on;
+   gpios = <_gpio0 55 GPIO_ACTIVE_HIGH>;
+   states = <180 0x0
+ 330 0x1>;
+   };
 };
 
 _pmx0 {
@@ -45,6 +68,13 @@
 };
 
 _pmx0 {
+   main_i2c0_pins_default: main-i2c0-pins-default {
+   pinctrl-single,pins = <
+   J721E_IOPAD(0xd4, PIN_INPUT_PULLUP, 0) /* (V3) I2C0_SCL 
*/
+   J721E_IOPAD(0xd8, PIN_INPUT_PULLUP, 0) /* (W2) I2C0_SDA 
*/
+   >;
+   };
+
main_i2c1_pins_default: main-i2c1-pins-default {
pinctrl-single,pins = <
J721E_IOPAD(0xdc, PIN_INPUT_PULLUP, 3) /* (U3) 
ECAP0_IN_APWM_OUT.I2C1_SCL */
@@ -70,6 +100,12 @@
J721E_IOPAD(0x120, PIN_OUTPUT, 0) /* (T4) USB0_DRVVBUS 
*/
>;
};
+
+   vdd_sd_dv_pins_default: vdd_sd_dv_pins_default {
+   pinctrl-single,pins = <
+   J721E_IOPAD(0xd0, PIN_INPUT, 7) /* (T5) 
SPI0_D1.GPIO0_55 */
+   >;
+   };
 };
 
 _uart0 {
@@ -157,6 +193,10 @@
 };
 
 _i2c0 {
+   pinctrl-names = "default";
+   pinctrl-0 = <_i2c0_pins_default>;
+   clock-frequency = <40>;
+
exp1: gpio@20 {
compatible = "ti,tca6416";
reg = <0x20>;
@@ -206,6 +246,8 @@
/* SD card */
pinctrl-0 = <_mmc1_pins_default>;
pinctrl-names = "default";
+   vmmc-supply = <_mmc1>;
+   vqmmc-supply = <_sd_dv>;
ti,driver-strength-ohm = <50>;
disable-wp;
 };
diff --git a/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi 
b/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi
index e60650a62b14..f86c493a44f1 100644
--- a/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi
+++ b/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi
@@ -512,11 +512,16 @@
ti,otap-del-sel-mmc-hs = <0x0>;
ti,otap-del-sel-ddr52 = <0x6>;
ti,otap-del-sel-hs200 = <0x8>;
-   ti,otap-del-sel-hs400 = <0x0>;
+   ti,otap-del-sel-hs400 = <0x5>;
+   ti,itap-del-sel-legacy = <0x10>;
+   ti,itap-del-sel-mmc-hs = <0xa>;
ti,strobe-sel = <0x77>;
+   ti,clkbuf-sel = <0x7>;
ti,trm-icp = <0x8>;
bus-width = <8>;
mmc-ddr-1_8v;
+   mmc-hs200-1_8v;
+   mmc-hs400-1_8v;
dma-coherent;
};
 
@@ -534,7 +539,12 @@
ti,otap-del-sel-sdr50 = <0xc>;
ti,otap-del-sel-sdr104 = <0x5>;
ti,otap-del-sel-ddr50 = <0xc>;
-   no-1-8-v;
+   ti,itap-del-sel-legacy = <0x0>;
+   ti,itap-del-sel-sd-hs = <0x0>;
+   ti,itap-del-sel-sdr12 = <0x0>;
+   ti,itap-del-sel-sdr25 = <0x0>;
+   ti,clkbuf-sel = <0x7>;
+   ti,trm-icp = <0x8>;
dma-coherent;

Re: [PATCH v4 4/5] perf stat: Enable iostat mode for x86 platforms

2021-03-10 Thread Alexander Antonov



On 3/9/2021 10:51 AM, liuqi (BA) wrote:

Hi Alexander,

On 2021/2/3 21:58, Alexander Antonov wrote:

This functionality is based on recently introduced sysfs attributes
for Intel® Xeon® Scalable processor family (code name Skylake-SP):
Commit bb42b3d39781 ("perf/x86/intel/uncore: Expose an Uncore unit to
IIO PMON mapping")

Mode is intended to provide four I/O performance metrics in MB per each
PCIe root port:
  - Inbound Read: I/O devices below root port read from the host memory
  - Inbound Write: I/O devices below root port write to the host memory
  - Outbound Read: CPU reads from I/O devices below root port
  - Outbound Write: CPU writes to I/O devices below root port

Each metric requires only one uncore event, which increments at every 4B
transfer in the corresponding direction. The formulas to compute metrics
are generic:
 #EventCount * 4B / (1024 * 1024)

Signed-off-by: Alexander Antonov
---
  tools/perf/Documentation/perf-iostat.txt |  88 ++
  tools/perf/Makefile.perf |   5 +-
  tools/perf/arch/x86/util/Build   |   1 +
  tools/perf/arch/x86/util/iostat.c    | 345 +++
  tools/perf/command-list.txt  |   1 +
  tools/perf/perf-iostat.sh    |  12 +
  6 files changed, 451 insertions(+), 1 deletion(-)
  create mode 100644 tools/perf/Documentation/perf-iostat.txt
  create mode 100644 tools/perf/perf-iostat.sh

diff --git a/tools/perf/Documentation/perf-iostat.txt 
b/tools/perf/Documentation/perf-iostat.txt

new file mode 100644
index ..165176944031
--- /dev/null
+++ b/tools/perf/Documentation/perf-iostat.txt
@@ -0,0 +1,88 @@
+perf-iostat(1)
+===
+
+NAME
+
+perf-iostat - Show I/O performance metrics
+
+SYNOPSIS
+
+[verse]
+'perf iostat' list
+'perf iostat'  --  []
+
+DESCRIPTION
+---
+Mode is intended to provide four I/O performance metrics per each 
PCIe root port:

+
+- Inbound Read   - I/O devices below root port read from the host 
memory, in MB

+
+- Inbound Write  - I/O devices below root port write to the host 
memory, in MB

+
+- Outbound Read  - CPU reads from I/O devices below root port, in MB
+
+- Outbound Write - CPU writes to I/O devices below root port, in MB
+
+OPTIONS
+---
+...::
+    Any command you can specify in a shell.
+
+list::
+    List all PCIe root ports.


I noticed that the "iostat" command and the cmd_iostat() callback function are 
not registered in cmd_struct in perf.c. So I think "perf iostat list" 
perhaps cannot work properly.


I also test this patchset on x86 platform, and here is the log:

root@ubuntu:/home/lq# ./perf iostat list
perf: 'iostat' is not a perf-command. See 'perf --help'.
root@ubuntu:/home/lq# ./perf stat --iostat
^C
 Performance counter stats for 'system wide':

   port Inbound Read(MB)    Inbound Write(MB) Outbound 
Read(MB)   Outbound Write(MB)

:00    0 0    0  0
:80    0 0    0  0
:17    0 0    0  0
:85    0 0    0  0
:3a    0 0    0  0
:ae    0 0    0  0
:5d    0 0    0  0
:d7    0 0    0  0

   0.611303832 seconds time elapsed


root@ubuntu:/home/lq# ./perf stat --iostat=:17
^C
 Performance counter stats for 'system wide':

   port Inbound Read(MB)    Inbound Write(MB) Outbound 
Read(MB)   Outbound Write(MB)

:17    0 0    0  0

   0.521317572 seconds time elapsed

So how does following perf iostat list work, did I miss something?

Thanks,
Qi


Hello,

The 'iostat' mode uses the alias mechanism in perf, same as 'perf archive',
and in this case you don't need to add a function callback into cmd_struct.
For example, the command 'perf iostat list' will be converted to
'perf stat --iostat=list'.

After building the perf tool you should have two shell scripts in the tools/perf
directory, and one of them is executable, for example:
# make -C tools/perf
# ls -l tools/perf/perf-iostat*
-rwxr-xr-x 1 root root 290 Mar 10 18:17 perf-iostat
-rw-r--r-- 1 root root 290 Feb  3 15:14 perf-iostat.sh

It should be possible to run 'perf iostat' from the build directory:
# cd tools/perf
# ./perf iostat list
S0-uncore_iio_0<:00>
S1-uncore_iio_0<:80>
S0-uncore_iio_1<:17>
S1-uncore_iio_1<:85>
S0-uncore_iio_2<:3a>
S1-uncore_iio_2<:ae>
S0-uncore_iio_3<:5d>
S1-uncore_iio_3<:d7>

Also you can copy 'perf-iostat' to ~/libexec/perf-core/ or just run 
'make install'


# make install
# cp perf /usr/bin/
# ls -lh ~/libexec/perf-core/
total 24K
-rwxr-xr-x 1 root root 1.4K Mar 10 18:17 perf-archive
-rwxr-xr-x 1 root root  290 Mar 10 18:17 perf-iostat

[PATCH v5 2/3] arm64: dts: ti: k3-j7200-common-proc-board: Disable unused gpio modules

2021-03-10 Thread Aswath Govindraju
From: Faiz Abbas 

There are 6 gpio instances inside the SoC in 2 groups, as shown below:
Group one: wkup_gpio0, wkup_gpio1
Group two: main_gpio0, main_gpio2, main_gpio4, main_gpio6

Only one instance from each group can be used at a time. So use main_gpio0
and wkup_gpio0 in the current Linux context and disable the rest of the nodes.

Signed-off-by: Faiz Abbas 
Signed-off-by: Sekhar Nori 
Signed-off-by: Aswath Govindraju 
---
 .../boot/dts/ti/k3-j7200-common-proc-board.dts   | 16 
 1 file changed, 16 insertions(+)

diff --git a/arch/arm64/boot/dts/ti/k3-j7200-common-proc-board.dts 
b/arch/arm64/boot/dts/ti/k3-j7200-common-proc-board.dts
index 4a7182abccf5..b493f939b09a 100644
--- a/arch/arm64/boot/dts/ti/k3-j7200-common-proc-board.dts
+++ b/arch/arm64/boot/dts/ti/k3-j7200-common-proc-board.dts
@@ -122,6 +122,22 @@
status = "disabled";
 };
 
+_gpio2 {
+   status = "disabled";
+};
+
+_gpio4 {
+   status = "disabled";
+};
+
+_gpio6 {
+   status = "disabled";
+};
+
+_gpio1 {
+   status = "disabled";
+};
+
 _cpsw {
pinctrl-names = "default";
pinctrl-0 = <_cpsw_pins_default _mdio_pins_default>;
-- 
2.17.1



[PATCH v5 1/3] arm64: dts: ti: k3-j7200: Add gpio nodes

2021-03-10 Thread Aswath Govindraju
From: Faiz Abbas 

There are 4 instances of gpio modules in the main domain:
gpio0, gpio2, gpio4 and gpio6

Groups are created to provide protection between different processor
virtual worlds. Each of these modules' I/O pins are muxed within the
group. Exactly one module can be selected to control the corresponding
pin by selecting it in the pad mux configuration registers.

This group in the main domain pins out 69 lines (5 banks). Add DT nodes for
each module instance in the main domain.

Similar to the gpio groups in the main domain, there is one gpio group in
the wakeup domain with 2 module instances in it.

The gpio group pins out 72 pins (6 banks) of the first 85 gpio lines. Add
DT nodes for each module instance in the wakeup domain.

Signed-off-by: Faiz Abbas 
Signed-off-by: Sekhar Nori 
Signed-off-by: Aswath Govindraju 
---
 arch/arm64/boot/dts/ti/k3-j7200-main.dtsi | 72 +++
 .../boot/dts/ti/k3-j7200-mcu-wakeup.dtsi  | 34 +
 2 files changed, 106 insertions(+)

diff --git a/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi 
b/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi
index 17477ab0fd8e..e60650a62b14 100644
--- a/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi
+++ b/arch/arm64/boot/dts/ti/k3-j7200-main.dtsi
@@ -672,6 +672,78 @@
};
};
 
+   main_gpio0: gpio@60 {
+   compatible = "ti,j721e-gpio", "ti,keystone-gpio";
+   reg = <0x00 0x0060 0x00 0x100>;
+   gpio-controller;
+   #gpio-cells = <2>;
+   interrupt-parent = <_gpio_intr>;
+   interrupts = <145>, <146>, <147>, <148>,
+<149>;
+   interrupt-controller;
+   #interrupt-cells = <2>;
+   #address-cells = <0>;
+   ti,ngpio = <69>;
+   ti,davinci-gpio-unbanked = <0>;
+   power-domains = <_pds 105 TI_SCI_PD_EXCLUSIVE>;
+   clocks = <_clks 105 0>;
+   clock-names = "gpio";
+   };
+
+   main_gpio2: gpio@61 {
+   compatible = "ti,j721e-gpio", "ti,keystone-gpio";
+   reg = <0x00 0x0061 0x00 0x100>;
+   gpio-controller;
+   #gpio-cells = <2>;
+   interrupt-parent = <_gpio_intr>;
+   interrupts = <154>, <155>, <156>, <157>,
+<158>;
+   interrupt-controller;
+   #interrupt-cells = <2>;
+   #address-cells = <0>;
+   ti,ngpio = <69>;
+   ti,davinci-gpio-unbanked = <0>;
+   power-domains = <_pds 107 TI_SCI_PD_EXCLUSIVE>;
+   clocks = <_clks 107 0>;
+   clock-names = "gpio";
+   };
+
+   main_gpio4: gpio@62 {
+   compatible = "ti,j721e-gpio", "ti,keystone-gpio";
+   reg = <0x00 0x0062 0x00 0x100>;
+   gpio-controller;
+   #gpio-cells = <2>;
+   interrupt-parent = <_gpio_intr>;
+   interrupts = <163>, <164>, <165>, <166>,
+<167>;
+   interrupt-controller;
+   #interrupt-cells = <2>;
+   #address-cells = <0>;
+   ti,ngpio = <69>;
+   ti,davinci-gpio-unbanked = <0>;
+   power-domains = <_pds 109 TI_SCI_PD_EXCLUSIVE>;
+   clocks = <_clks 109 0>;
+   clock-names = "gpio";
+   };
+
+   main_gpio6: gpio@63 {
+   compatible = "ti,j721e-gpio", "ti,keystone-gpio";
+   reg = <0x00 0x0063 0x00 0x100>;
+   gpio-controller;
+   #gpio-cells = <2>;
+   interrupt-parent = <_gpio_intr>;
+   interrupts = <172>, <173>, <174>, <175>,
+<176>;
+   interrupt-controller;
+   #interrupt-cells = <2>;
+   #address-cells = <0>;
+   ti,ngpio = <69>;
+   ti,davinci-gpio-unbanked = <0>;
+   power-domains = <_pds 111 TI_SCI_PD_EXCLUSIVE>;
+   clocks = <_clks 111 0>;
+   clock-names = "gpio";
+   };
+
main_r5fss0: r5fss@5c0 {
compatible = "ti,j7200-r5fss";
ti,cluster-mode = <1>;
diff --git a/arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi 
b/arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi
index 359e3e8a8cd0..1dd5b30edc6c 100644
--- a/arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi
+++ b/arch/arm64/boot/dts/ti/k3-j7200-mcu-wakeup.dtsi
@@ -107,6 +107,40 @@
ti,interrupt-ranges = <16 960 16>;
};
 
+   wkup_gpio0: gpio@4211 {
+   compatible = "ti,j721e-gpio", "ti,keystone-gpio";
+   reg = <0x00 0x4211 0x00 0x100>;
+   gpio-controller;
+   #gpio-cells = <2>;
+   interrupt-parent = <_gpio_intr>;
+   interrupts = <103>, <104>, <105>, <106>, <107>, <108>;
+   interrupt-controller;
