[PATCH RFC] IB/mad: remove obsolete snoop interface

2015-09-30 Thread ira . weiny
From: Ira Weiny <ira.we...@intel.com>

This interface has no current users and is obsolete.

Signed-off-by: Ira Weiny <ira.we...@intel.com>
---

This has undergone a medium level of testing.  I have run it against
mlx4, qib, and OPA hardware and it does not seem to cause any regressions.

 drivers/infiniband/core/mad.c  | 226 +
 drivers/infiniband/core/mad_priv.h |  13 ---
 2 files changed, 5 insertions(+), 234 deletions(-)

diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 4b5c72311deb..08ab92604e88 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -428,132 +428,12 @@ error1:
 }
 EXPORT_SYMBOL(ib_register_mad_agent);
 
-static inline int is_snooping_sends(int mad_snoop_flags)
-{
-   return (mad_snoop_flags &
-   (/*IB_MAD_SNOOP_POSTED_SENDS |
-IB_MAD_SNOOP_RMPP_SENDS |*/
-IB_MAD_SNOOP_SEND_COMPLETIONS /*|
-IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS*/));
-}
-
-static inline int is_snooping_recvs(int mad_snoop_flags)
-{
-   return (mad_snoop_flags &
-   (IB_MAD_SNOOP_RECVS /*|
-IB_MAD_SNOOP_RMPP_RECVS*/));
-}
-
-static int register_snoop_agent(struct ib_mad_qp_info *qp_info,
-   struct ib_mad_snoop_private *mad_snoop_priv)
-{
-   struct ib_mad_snoop_private **new_snoop_table;
-   unsigned long flags;
-   int i;
-
-   spin_lock_irqsave(&qp_info->snoop_lock, flags);
-   /* Check for empty slot in array. */
-   for (i = 0; i < qp_info->snoop_table_size; i++)
-   if (!qp_info->snoop_table[i])
-   break;
-
-   if (i == qp_info->snoop_table_size) {
-   /* Grow table. */
-   new_snoop_table = krealloc(qp_info->snoop_table,
-  sizeof mad_snoop_priv *
-  (qp_info->snoop_table_size + 1),
-  GFP_ATOMIC);
-   if (!new_snoop_table) {
-   i = -ENOMEM;
-   goto out;
-   }
-
-   qp_info->snoop_table = new_snoop_table;
-   qp_info->snoop_table_size++;
-   }
-   qp_info->snoop_table[i] = mad_snoop_priv;
-   atomic_inc(&qp_info->snoop_count);
-out:
-   spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
-   return i;
-}
-
-struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device,
-  u8 port_num,
-  enum ib_qp_type qp_type,
-  int mad_snoop_flags,
-  ib_mad_snoop_handler snoop_handler,
-  ib_mad_recv_handler recv_handler,
-  void *context)
-{
-   struct ib_mad_port_private *port_priv;
-   struct ib_mad_agent *ret;
-   struct ib_mad_snoop_private *mad_snoop_priv;
-   int qpn;
-
-   /* Validate parameters */
-   if ((is_snooping_sends(mad_snoop_flags) && !snoop_handler) ||
-   (is_snooping_recvs(mad_snoop_flags) && !recv_handler)) {
-   ret = ERR_PTR(-EINVAL);
-   goto error1;
-   }
-   qpn = get_spl_qp_index(qp_type);
-   if (qpn == -1) {
-   ret = ERR_PTR(-EINVAL);
-   goto error1;
-   }
-   port_priv = ib_get_mad_port(device, port_num);
-   if (!port_priv) {
-   ret = ERR_PTR(-ENODEV);
-   goto error1;
-   }
-   /* Allocate structures */
-   mad_snoop_priv = kzalloc(sizeof *mad_snoop_priv, GFP_KERNEL);
-   if (!mad_snoop_priv) {
-   ret = ERR_PTR(-ENOMEM);
-   goto error1;
-   }
-
-   /* Now, fill in the various structures */
-   mad_snoop_priv->qp_info = &port_priv->qp_info[qpn];
-   mad_snoop_priv->agent.device = device;
-   mad_snoop_priv->agent.recv_handler = recv_handler;
-   mad_snoop_priv->agent.snoop_handler = snoop_handler;
-   mad_snoop_priv->agent.context = context;
-   mad_snoop_priv->agent.qp = port_priv->qp_info[qpn].qp;
-   mad_snoop_priv->agent.port_num = port_num;
-   mad_snoop_priv->mad_snoop_flags = mad_snoop_flags;
-   init_completion(&mad_snoop_priv->comp);
-   mad_snoop_priv->snoop_index = register_snoop_agent(
-   &port_priv->qp_info[qpn],
-   mad_snoop_priv);
-   if (mad_snoop_priv->snoop_index < 0) {
-   ret = ERR_PTR(mad_snoop_priv->snoop_index);
-   goto error2;
-   }
-
-   atomic_set(&mad_snoop_priv->refcount, 1);
-   return &mad_snoop_priv->agent;
-
-error2:
-   kfree(mad_sn

Re: [PATCH 0/7] IB/hfi1: Remove write() and use ioctl() for user access

2016-04-15 Thread Ira Weiny
On Fri, Apr 15, 2016 at 07:01:26AM +0300, Leon Romanovsky wrote:
> On Thu, Apr 14, 2016 at 01:48:31PM -0400, Ira Weiny wrote:
> > On Thu, Apr 14, 2016 at 10:45:50AM -0600, Jason Gunthorpe wrote:
> > > On Thu, Apr 14, 2016 at 08:41:35AM -0700, Dennis Dalessandro wrote:
> > > > This patch series removes the write() interface for user access in 
> > > > favor of an
> > > > ioctl() based approach. This is in response to the complaint that we had
> > > > different handlers for write() and writev() doing different things and 
> > > > expecting
> > > > different types of data. See:
> > > 
> > > I think we should wait on applying these patches until we globally sort 
> > > out
> > > what to do with the rdma uapi.
> > > 
> > > It just doesn't make alot of sense for drivers to have their own personal
> > > char devices. :(
> > 
> > I'm afraid I have to disagree at this time.  Someday we may have "1 char 
> > device
> > to rule them all" but right now we don't have any line of sight to that
> > solution.  It may be _years_ before we can agree to the semantics which will
> > work for all high speed, kernel bypass, rdma, low latency, network devices.
> 
> You didn't ever try to come and work on the solution. We talked about
> finite time frame (_months_) which is doable based on knowledge that user
> space parts are developed by the same companies and all our future changes
> will be in one subsystem.

How can you say that I am not working on a solution?

We spent most of last week discussing possible solutions and I am in support of
a more common core.  But ask yourself this.


If hfi1 did not support verbs at all would this even be an issue?


> 
> You were supposed to prepare "wish list" from this new API as an initial
> phase. If you do it, you will find that it is very short and in the
> initial meeting you will see that it similar to other participants in
> linux-rdma community.

The list of operations may be short.  But the way in which you do those in a
performant way for each hardware device is _very_ different.  This is a problem
which has been debated for years and no one has come up with an elegant
solution.  Every solution ends up being, to quote a presenter at last week's
conference, "shoving a square peg into a round hole".

Until we all admit 2 things.

1) That there are devices which don't operate on QPs
2) That the High Speed interconnect core should present something more
   abstract than a QP interface

we are not really creating a common layer.

I do admit Jason's idea has some merit, but I'm just not sure it provides so much
benefit that it is worth the effort at this time.

Ira



Re: [PATCH 0/7] IB/hfi1: Remove write() and use ioctl() for user access

2016-04-15 Thread Ira Weiny
On Sat, Apr 16, 2016 at 12:23:28AM +0300, Leon Romanovsky wrote:

> 
> Intel as usual decided to do it in their way and the result is presented
> on this mailing list.

Excuse me, but this statement is completely unfair.  We were specifically asked
by Al and Linus to fix our char device with regards to the write/writev
inconsistency.

https://www.spinics.net/lists/linux-rdma/msg34451.html

Which is _exactly_ what this patch series does.

Do you have a technical reason that this patch series does not fix the
write/writev issue brought up by Al?

Ira



Re: [PATCH 0/7] IB/hfi1: Remove write() and use ioctl() for user access

2016-04-18 Thread Ira Weiny
On Mon, Apr 18, 2016 at 11:24:11AM -0700, Christoph Hellwig wrote:
> On Mon, Apr 18, 2016 at 11:40:47AM -0600, Jason Gunthorpe wrote:
> > I wasn't arguing this should integrate into verbs in some way, only
> > that the way to access the driver-specific uAPI of a RDMA device should
> > be through the RDMA common uAPI and not through a random char dev.
> 
> Well, it's stuff not related to our RDMA userspace API (which _is_
> Verbs, not counting for the complete crackpot abuse in usnic), but
> very device specific. 
> 
> The stuff the intel driver are doing isn't pretty, but unfortunately
> not unusual either - lots of SCSI or network driver have ioctls
> like that.  Now we could argue if the ioctls should be on the
> main node (uverbs) or a driver-private chardev, or not exist
> at all and people will have to patch the driver with some vendor
> version if they really need it.  Examples for either of these
> choices exist in the tree.

I'm a bit confused by your suggestion that "people will have to patch
the driver with some vendor version if they really need it."

Could you elaborate?

PSM is the primary performant path for this device.  Without it this device is
severely limited in its intended functionality.

We are strongly motivated to have all of our functionality included in the
mainstream kernel.  So for eprom/snoop we would really like to find a way to
include all this functionality.

Ira



Re: [PATCH 0/7] IB/hfi1: Remove write() and use ioctl() for user access

2016-04-14 Thread Ira Weiny
On Thu, Apr 14, 2016 at 10:45:50AM -0600, Jason Gunthorpe wrote:
> On Thu, Apr 14, 2016 at 08:41:35AM -0700, Dennis Dalessandro wrote:
> > This patch series removes the write() interface for user access in favor of 
> > an
> > ioctl() based approach. This is in response to the complaint that we had
> > different handlers for write() and writev() doing different things and 
> > expecting
> > different types of data. See:
> 
> I think we should wait on applying these patches until we globally sort out
> what to do with the rdma uapi.
> 
> It just doesn't make alot of sense for drivers to have their own personal
> char devices. :(

I'm afraid I have to disagree at this time.  Someday we may have "1 char device
to rule them all" but right now we don't have any line of sight to that
solution.  It may be _years_ before we can agree to the semantics which will
work for all high speed, kernel bypass, rdma, low latency, network devices.

We need to fix the write/writev problem now.[1]

Ira

[1] https://www.spinics.net/lists/linux-rdma/msg34451.html



Re: [PATCH] staging/rdma/hfi1: select CRC32

2016-04-14 Thread Ira Weiny
On Fri, Apr 01, 2016 at 01:04:05AM +0200, Markus Böhme wrote:
> The function parse_platform_config in firmware.c calls crc32_le.
> Building without CRC32 selected causes a link error:
> 
> drivers/built-in.o: In function `parse_platform_config':
> (.text+0x92ffa): undefined reference to `crc32_le'
> 
> Signed-off-by: Markus Böhme <markus.boe...@mailbox.org>

Doug did you pick this up?

Reviewed-by: Ira Weiny <ira.we...@intel.com>

> ---
>  drivers/staging/rdma/hfi1/Kconfig |1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/staging/rdma/hfi1/Kconfig 
> b/drivers/staging/rdma/hfi1/Kconfig
> index 3e668d8..a925fb0 100644
> --- a/drivers/staging/rdma/hfi1/Kconfig
> +++ b/drivers/staging/rdma/hfi1/Kconfig
> @@ -2,6 +2,7 @@ config INFINIBAND_HFI1
>   tristate "Intel OPA Gen1 support"
>   depends on X86_64 && INFINIBAND_RDMAVT
>   select MMU_NOTIFIER
> + select CRC32
>   default m
>   ---help---
>   This is a low-level driver for Intel OPA Gen1 adapter.
> -- 
> 1.7.10.4
> 


Re: [PATCH] infiniband-diags/saquery.c: switchinfo support added

2013-02-28 Thread Ira Weiny
Your mailer line wrapped the patch.  Could you resend?

Also, could you add this to doc/rst/saquery.8.in.rst?

Thanks,
Ira

On Thu, 28 Feb 2013 08:11:38 +0200
"Husam Kahalah"  wrote:

> From d61f2580b829fa2fc56aea15bd98ef3a607d9aba Mon Sep 17 00:00:00 2001
> 
> From: Husam kahalah 
> Date: Mon, 25 Feb 2013 10:53:25 +0200
> Subject: [PATCH]  infiniband-diags/saquery.c: switchinfo support added
> 
> ---
>  src/saquery.c |   55 
> +++
>  1 files changed, 55 insertions(+), 0 deletions(-)
> 
> diff --git a/src/saquery.c b/src/saquery.c
> index d31d77d..6390bcd 100644
> --- a/src/saquery.c
> +++ b/src/saquery.c
> @@ -481,6 +481,41 @@ static void dump_service_record(void *data)
>  cl_ntoh64(p_sr->service_data64[1]));
>  }
> 
> +static void dump_switch_info_record(void *data)
> +{
> +  ib_switch_info_record_t *p_sir = data;
> +
> +  printf("SwitchInfoRecord dump:\n"
> +"\t\tRID\n"
> +"\t\tlid.%u\n"
> +"\t\tSwitchInfo dump:\n"
> +"\t\tlin_cap.0x%X\n"
> +"\t\trand_cap0x%X\n"
> +"\t\tmcast_cap...0x%X\n"
> +"\t\tlin_top.0x%X\n"
> +"\t\tdef_port%u\n"
> +"\t\tdef_mcast_pri_port..%u\n"
> +"\t\tdef_mcast_not_port..%u\n"
> +"\t\tlife_state..%u\n"
> +"\t\tlids_per_port...0x%X\n"
> +"\t\tenforce_cap.0x%X\n"
> +"\t\tflags...%u\n"
> +"\t\tmcast_top...0x%X\n",
> +cl_ntoh16(p_sir->lid),
> +cl_ntoh16(p_sir->switch_info.lin_cap),
> +cl_ntoh16(p_sir->switch_info.rand_cap),
> +cl_ntoh16(p_sir->switch_info.mcast_cap),
> +cl_ntoh16(p_sir->switch_info.lin_top),
> +p_sir->switch_info.def_port,
> +p_sir->switch_info.def_mcast_pri_port,
> +p_sir->switch_info.def_mcast_not_port,
> +p_sir->switch_info.life_state,
> +cl_ntoh16(p_sir->switch_info.lids_per_port),
> +cl_ntoh16(p_sir->switch_info.enforce_cap),
> +p_sir->switch_info.flags,
> +cl_ntoh16(p_sir->switch_info.mcast_top));
> +}
> +
>  static void dump_inform_info_record(void *data)
>  {
>   char gid_str[INET6_ADDRSTRLEN];
> @@ -1150,6 +1185,24 @@ static int query_service_records(const struct 
> query_cmd *q, struct sa_handle * h
>dump_service_record);
>  }
> 
> +static int query_switchinfo_records(const struct query_cmd *q,
> + struct sa_handle * h, struct query_params *p,
> + int argc, char *argv[])
> +{
> +  ib_switch_info_record_t swir;
> + ib_net64_t comp_mask = 0;
> + int lid = 0;
> +
> + if (argc > 0)
> +  parse_lid_and_ports(h, argv[0], &lid, NULL, NULL);
> +
> +  memset(&swir, 0, sizeof(swir));
> +  CHECK_AND_SET_VAL(lid, 16, 0, swir.lid, SWIR, LID);
> +
> + return get_and_dump_any_records(h, IB_SA_ATTR_SWITCHINFORECORD, 0, 
> comp_mask,
> + &swir, sizeof(swir), dump_switch_info_record);
> +}
> +
>  static int query_inform_info_records(const struct query_cmd *q,
>   struct sa_handle * h, struct query_params *p,
>   int argc, char *argv[])
> @@ -1342,6 +1395,8 @@ static const struct query_cmd query_cmds[] = {
>"[[mlid]/[position]/[block]]", query_mft_records},
>{"GUIDInfoRecord", "GIR", IB_SA_ATTR_GUIDINFORECORD,
>"[[lid]/[block]]", query_guidinfo_records},
> +  {"SwitchInfoRecord", "SWIR", IB_SA_ATTR_SWITCHINFORECORD,
> +  "[lid]", query_switchinfo_records},
>{0}
>  };
> 
> -- 
> 1.7.1


-- 
Ira Weiny
Member of Technical Staff
Lawrence Livermore National Lab
925-423-8008
wei...@llnl.gov


Re: [PATCH] infiniband-diags/saquery.c: switchinfo support added

2013-02-21 Thread Ira Weiny
ptarg)
>   case 'X':
>p->proxy_join = strtoul(optarg, NULL, 0);
>break;
> + case 22 :
> +  p->swir_lid = (ib_net16_t) strtoul(optarg, NULL, 0);
> + break;

As above no need for the additional option.

>default:
>   return -1;
>   }
> @@ -1584,6 +1641,7 @@ int main(int argc, char **argv)
>{"x", 'x', 0, NULL, "get LinkRecord info"},
>{"c", 'c', 0, NULL, "get the SA's class port info"},
>{"S", 'S', 0, NULL, "get ServiceRecord info"},
> +  {"W", 'W', 0, NULL, "get SwitchInfoRecord"},
>{"I", 'I', 0, NULL, "get InformInfoRecord (subscription) info"},
>{"list", 'D', 0, NULL, "the node desc of the CA's"},
>{"src-to-dst", 1, 1, "", "get a PathRecord for"
> @@ -1633,6 +1691,9 @@ int main(int argc, char **argv)
>{"scope", 21, 1, NULL, "Scope (MCMemberRecord)"},
>{"join_state", 'J', 1, NULL, "Join state (MCMemberRecord)"},
>{"proxy_join", 'X', 1, NULL, "Proxy join (MCMemberRecord)"},
> +  {"swir_lid", 22, 1, NULL,
> +  "switchInfo_lid (SwitchInfoRecord)"
> + " LID of switch port 0"},

As above no need for the additional option.

Thanks,
Ira

>{0}
>};
>  
> -- 
> 1.7.1
>  


-- 
Ira Weiny
Member of Technical Staff
Lawrence Livermore National Lab
925-423-8008
wei...@llnl.gov


Re: [PATCH] infiniband-diags/saquery.c: switchinfo support added

2013-02-21 Thread Ira Weiny
On Thu, 21 Feb 2013 10:09:00 -0800
Ira Weiny  wrote:

[snip]

> > + ib_net16_t swir_lid;
> 
> We don't need this field or lid option.  Other records simply have an 
> optional parameter to specify the lid.
> 
> The usage should be:
> 
> SwitchRecord [lid]
> 

Sorry I meant "SwitchInfoRecord" or SWIR.

Ira

[snip]


Re: [PATCH] infiniband-diags/saquery.c: switchinfo support added

2013-02-24 Thread Ira Weiny
On Sun, 24 Feb 2013 10:22:47 +0200
"Husam Kahalah"  wrote:

> Thank you for your time, I have the following notes:
> I plan to add support for more records other than switchInfoRecord like 
> smInfo, multipath, nodeInfo

That would be great!

> I also plan to add component filters for those records' attributes in the same
> way that pathInfo and MCR were implemented; this is the reason why I started
> adding attributes to the query_params structure.
> I'm not talking about RID (record identifier) attributes only, but also
> about other attributes that may differ between records.
> If it is acceptable to add to the query_params structure in that way, let me
> know; otherwise I will add just the RID attributes in the suggested way.
> 

I am not against adding fields to query_params.  However, we have a standard 
way to specify the RID components like LID which should be followed.

If you need new components to be specified for new attributes it is fine to add 
them to query_params.  That said, any components which already have options 
should be overloaded; for example MultiPathRecord.SL should use the --sl option.
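
To make that pattern concrete, here is a rough sketch of what a handler for a
hypothetical FooRecord could look like (illustrative only: ib_foo_record_t,
IB_SA_ATTR_FOORECORD and dump_foo_record() are made-up names, and it simply
mirrors the existing query_*_records handlers in saquery.c):

    static int query_foo_records(const struct query_cmd *q,
                                 struct sa_handle *h, struct query_params *p,
                                 int argc, char *argv[])
    {
            ib_foo_record_t foo;            /* hypothetical record type */
            ib_net64_t comp_mask = 0;
            int lid = 0;

            /* RID components come from the optional positional argument... */
            if (argc > 0)
                    parse_lid_and_ports(h, argv[0], &lid, NULL, NULL);

            memset(&foo, 0, sizeof(foo));
            CHECK_AND_SET_VAL(lid, 16, 0, foo.lid, FOO, LID);

            /* ...while components that already have options (e.g. --sl) are
             * taken from query_params rather than new record-specific options */
            CHECK_AND_SET_VAL(p->sl, 8, -1, foo.sl, FOO, SL);

            return get_and_dump_any_records(h, IB_SA_ATTR_FOORECORD, 0, comp_mask,
                                            &foo, sizeof(foo), dump_foo_record);
    }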

Does this make sense?

Thanks,
Ira

> 
> -----Original Message-
> From: Ira Weiny 
> To: Ira Weiny 
> Cc: "Husam Kahalah" , linux-r...@vger.kernel.org, 
> linux-kernel@vger.kernel.org
> Date: Thu, 21 Feb 2013 10:33:44 -0800
> Subject: Re: [PATCH] infiniband-diags/saquery.c: switchinfo support added
> 
> On Thu, 21 Feb 2013 10:09:00 -0800
> Ira Weiny  wrote:
> 
> [snip]
> 
> > > + ib_net16_t swir_lid;
> >
> > We don't need this field or lid option.  Other records simply have an 
> optional parameter to specify the lid.
> >
> > The usage should be:
> >
> > SwitchRecord [lid]
> >
> 
> Sorry I meant "SwitchInfoRecord" or SWIR.
> 
> Ira
> 
> [snip]


-- 
Ira Weiny
Member of Technical Staff
Lawrence Livermore National Lab
925-423-8008
wei...@llnl.gov


Re: [PATCH v2] Documentation/dax: Update description of DAX policy changing

2021-01-05 Thread Ira Weiny
On Wed, Jan 06, 2021 at 09:50:00AM +0800, Hao Li wrote:
> After commit 77573fa310d9 ("fs: Kill DCACHE_DONTCACHE dentry even if
> DCACHE_REFERENCED is set"), changes to DAX policy will take effect
> as soon as all references to this file are gone.
> 
> Update the documentation accordingly.
> 
> Signed-off-by: Hao Li 

LGTM

Reviewed-by: Ira Weiny 
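
As an aside, toggling the persistent FS_XFLAG_DAX hint from user space uses the
standard FS_IOC_FSGETXATTR/FS_IOC_FSSETXATTR ioctls; a minimal sketch (error
handling trimmed) looks like the following, and with the change above any files
that are already open keep their old S_DAX behaviour until every process has
closed them:

    #include <sys/ioctl.h>
    #include <linux/fs.h>

    static int set_dax_hint(int fd, int enable)
    {
            struct fsxattr fsx;

            if (ioctl(fd, FS_IOC_FSGETXATTR, &fsx) < 0)
                    return -1;
            if (enable)
                    fsx.fsx_xflags |= FS_XFLAG_DAX;
            else
                    fsx.fsx_xflags &= ~FS_XFLAG_DAX;
            return ioctl(fd, FS_IOC_FSSETXATTR, &fsx);
    }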

> ---
> Changes in v2:
>   * simplify sentences and fix style problems.
> 
>  Documentation/filesystems/dax.txt | 17 +++--
>  1 file changed, 3 insertions(+), 14 deletions(-)
> 
> diff --git a/Documentation/filesystems/dax.txt 
> b/Documentation/filesystems/dax.txt
> index 8fdb78f3c6c9..e03c20564f3a 100644
> --- a/Documentation/filesystems/dax.txt
> +++ b/Documentation/filesystems/dax.txt
> @@ -83,20 +83,9 @@ Summary
> directories.  This has runtime constraints and limitations that are
> described in 6) below.
>  
> - 6. When changing the S_DAX policy via toggling the persistent FS_XFLAG_DAX 
> flag,
> -the change in behaviour for existing regular files may not occur
> -immediately.  If the change must take effect immediately, the 
> administrator
> -needs to:
> -
> -a) stop the application so there are no active references to the data set
> -   the policy change will affect
> -
> -b) evict the data set from kernel caches so it will be re-instantiated 
> when
> -   the application is restarted. This can be achieved by:
> -
> -   i. drop-caches
> -   ii. a filesystem unmount and mount cycle
> -   iii. a system reboot
> + 6. When changing the S_DAX policy via toggling the persistent FS_XFLAG_DAX
> +flag, the change to existing regular files won't take effect until the
> +files are closed by all processes.
>  
>  
>  Details
> -- 
> 2.29.2
> 
> 
> 


[PATCH] memremap: Convert devmap static branch to {inc,dec}

2020-08-10 Thread ira . weiny
From: Ira Weiny 

While reviewing Protection Key Supervisor support it was pointed out
that using a counter to track static branch enable was an anti-pattern
which was better solved using the provided static_branch_{inc,dec}
functions.[1]

Fix up devmap_managed_key to work the same way.  This should also be
safer because the counter-based approach has a very small (very unlikely)
race when multiple callers try to enable at the same time.

[1] 
https://lore.kernel.org/lkml/20200714194031.gi5...@worktop.programming.kicks-ass.net/
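
For context, a minimal sketch of the two patterns (illustrative only; some_key
and the surrounding helpers are made up for the example):

    DEFINE_STATIC_KEY_FALSE(some_key);

    /* Old pattern: the count and the key are updated in two separate steps,
     * so a second caller can return before the first has enabled the key. */
    static atomic_t enable_count;

    static void get_old(void)
    {
            if (atomic_inc_return(&enable_count) == 1)
                    static_branch_enable(&some_key);
    }

    /* New pattern: static_branch_inc()/dec() keep the reference count inside
     * the static-key API, so concurrent enable/disable is handled for us. */
    static void get_new(void)
    {
            static_branch_inc(&some_key);
    }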

Cc: Dan Williams 
Cc: Vishal Verma 
Signed-off-by: Ira Weiny 
---
 mm/memremap.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/mm/memremap.c b/mm/memremap.c
index 03e38b7a38f1..9fb9ad500e78 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -40,12 +40,10 @@ EXPORT_SYMBOL_GPL(memremap_compat_align);
 #ifdef CONFIG_DEV_PAGEMAP_OPS
 DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
 EXPORT_SYMBOL(devmap_managed_key);
-static atomic_t devmap_managed_enable;
 
 static void devmap_managed_enable_put(void)
 {
-   if (atomic_dec_and_test(&devmap_managed_enable))
-   static_branch_disable(&devmap_managed_key);
+   static_branch_dec(&devmap_managed_key);
 }
 
 static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
@@ -56,8 +54,7 @@ static int devmap_managed_enable_get(struct dev_pagemap 
*pgmap)
return -EINVAL;
}
 
-   if (atomic_inc_return(&devmap_managed_enable) == 1)
-   static_branch_enable(&devmap_managed_key);
+   static_branch_inc(&devmap_managed_key);
return 0;
 }
 #else
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH] net/tls: Fix kmap usage

2020-08-10 Thread ira . weiny
From: Ira Weiny 

When MSG_OOB is specified to tls_device_sendpage() the mapped page is
never unmapped.

Hold off mapping the page until after the flags are checked and the page
is actually needed.
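
The underlying rule, sketched generically (this is not the tls_device.c code
itself): every kmap() must be paired with a kunmap() on all exit paths, so the
simplest fix is to take the mapping only after the early-return checks:

    static int send_example(struct page *page, int offset, int flags)
    {
            char *kaddr;

            if (flags & MSG_OOB)            /* bail out before mapping anything */
                    return -EOPNOTSUPP;

            kaddr = kmap(page);             /* map only once we know we need it */
            /* ... build the kvec from kaddr + offset and send it ... */
            kunmap(page);                   /* always paired on this path */
            return 0;
    }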

Signed-off-by: Ira Weiny 
---
 net/tls/tls_device.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index 0e55f8365ce2..0cbad566f281 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -561,7 +561,7 @@ int tls_device_sendpage(struct sock *sk, struct page *page,
 {
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct iov_iter msg_iter;
-   char *kaddr = kmap(page);
+   char *kaddr;
struct kvec iov;
int rc;
 
@@ -576,6 +576,7 @@ int tls_device_sendpage(struct sock *sk, struct page *page,
goto out;
}
 
+   kaddr = kmap(page);
iov.iov_base = kaddr + offset;
iov.iov_len = size;
iov_iter_kvec(&msg_iter, WRITE, &iov, 1, size);
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH 0/2] Cyrpto: Clean up kmap() use

2020-08-10 Thread ira . weiny
From: Ira Weiny 

While going through kmap() users the following 2 issues were found via code
inspection.

Ira Weiny (2):
  crypto/ux500: Fix kmap() bug
  crypto: Remove unused async iterators

 crypto/ahash.c| 41 +++
 drivers/crypto/ux500/hash/hash_core.c | 30 
 include/crypto/internal/hash.h| 13 -
 3 files changed, 22 insertions(+), 62 deletions(-)

-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH 1/2] crypto/ux500: Fix kmap() bug

2020-08-10 Thread ira . weiny
From: Ira Weiny 

Once the crypto hash walk is started by crypto_hash_walk_first()
returning non-zero, crypto_hash_walk_done() must be called to unmap any
memory which was mapped by *_walk_first().

Ensure crypto_hash_walk_done() is called properly by:

1) Re-arranging the check for device data so that it happens prior to
   calling *_walk_first()
2) On error, calling crypto_hash_walk_done() with an error code to
   allow the hash walk code to clean up.

While we are at it clean up the 'out' label to be more meaningful.
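
In other words, the expected calling pattern is roughly the following (a
simplified sketch of the crypto_hash_walk contract; process_chunk() is a
made-up stand-in for the driver's per-chunk work):

    static int hash_example(struct ahash_request *req)
    {
            struct crypto_hash_walk walk;
            int msg_length = crypto_hash_walk_first(req, &walk); /* may map a page */

            while (msg_length > 0) {
                    int err = process_chunk(walk.data, msg_length);

                    if (err)
                            /* pass the error in so the walk unmaps and terminates */
                            return crypto_hash_walk_done(&walk, err);

                    /* 0 advances to the next chunk; returns 0 once the walk is done */
                    msg_length = crypto_hash_walk_done(&walk, 0);
            }
            return msg_length;
    }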

Signed-off-by: Ira Weiny 

---
Not tested.
Found via code inspection.
---
 drivers/crypto/ux500/hash/hash_core.c | 30 ---
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/drivers/crypto/ux500/hash/hash_core.c 
b/drivers/crypto/ux500/hash/hash_core.c
index c24f2db8d5e8..5f407a3420f8 100644
--- a/drivers/crypto/ux500/hash/hash_core.c
+++ b/drivers/crypto/ux500/hash/hash_core.c
@@ -1071,27 +1071,32 @@ int hash_hw_update(struct ahash_request *req)
struct hash_ctx *ctx = crypto_ahash_ctx(tfm);
struct hash_req_ctx *req_ctx = ahash_request_ctx(req);
struct crypto_hash_walk walk;
-   int msg_length = crypto_hash_walk_first(req, &walk);
-
-   /* Empty message ("") is correct indata */
-   if (msg_length == 0)
-   return ret;
+   int msg_length;
 
index = req_ctx->state.index;
buffer = (u8 *)req_ctx->state.buffer;
 
+   ret = hash_get_device_data(ctx, &device_data);
+   if (ret)
+   return ret;
+
+   msg_length = crypto_hash_walk_first(req, &walk);
+
+   /* Empty message ("") is correct indata */
+   if (msg_length == 0) {
+   ret = 0;
+   goto release_dev;
+   }
+
/* Check if ctx->state.length + msg_length
   overflows */
if (msg_length > (req_ctx->state.length.low_word + msg_length) &&
HASH_HIGH_WORD_MAX_VAL == req_ctx->state.length.high_word) {
pr_err("%s: HASH_MSG_LENGTH_OVERFLOW!\n", __func__);
-   return -EPERM;
+   ret = crypto_hash_walk_done(&walk, -EPERM);
+   goto release_dev;
}
 
-   ret = hash_get_device_data(ctx, &device_data);
-   if (ret)
-   return ret;
-
/* Main loop */
while (0 != msg_length) {
data_buffer = walk.data;
@@ -1101,7 +1106,8 @@ int hash_hw_update(struct ahash_request *req)
if (ret) {
dev_err(device_data->dev, "%s: 
hash_internal_hw_update() failed!\n",
__func__);
-   goto out;
+   crypto_hash_walk_done(&walk, ret);
+   goto release_dev;
}
 
msg_length = crypto_hash_walk_done(&walk, 0);
@@ -1111,7 +1117,7 @@ int hash_hw_update(struct ahash_request *req)
dev_dbg(device_data->dev, "%s: indata length=%d, bin=%d\n",
__func__, req_ctx->state.index, req_ctx->state.bit_index);
 
-out:
+release_dev:
release_hash_device(device_data);
 
return ret;
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH 2/2] crypto: Remove unused async iterators

2020-08-10 Thread ira . weiny
From: Ira Weiny 

Revert "crypto: hash - Add real ahash walk interface"
This reverts commit 75ecb231ff45b54afa9f4ec9137965c3c00868f4.

The callers of the functions added by that commit were removed in commit
ab8085c130ed.

Remove these now-unused functions.

Fixes: ab8085c130ed ("crypto: x86 - remove SHA multibuffer routines and 
mcryptd")
Cc: Ard Biesheuvel 
Signed-off-by: Ira Weiny 
---
 crypto/ahash.c | 41 --
 include/crypto/internal/hash.h | 13 ---
 2 files changed, 4 insertions(+), 50 deletions(-)

diff --git a/crypto/ahash.c b/crypto/ahash.c
index 68a0f0cb75c4..9c23b606949e 100644
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -10,7 +10,6 @@
 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -46,10 +45,7 @@ static int hash_walk_next(struct crypto_hash_walk *walk)
unsigned int nbytes = min(walk->entrylen,
  ((unsigned int)(PAGE_SIZE)) - offset);
 
-   if (walk->flags & CRYPTO_ALG_ASYNC)
-   walk->data = kmap(walk->pg);
-   else
-   walk->data = kmap_atomic(walk->pg);
+   walk->data = kmap_atomic(walk->pg);
walk->data += offset;
 
if (offset & alignmask) {
@@ -99,16 +95,8 @@ int crypto_hash_walk_done(struct crypto_hash_walk *walk, int 
err)
}
}
 
-   if (walk->flags & CRYPTO_ALG_ASYNC)
-   kunmap(walk->pg);
-   else {
-   kunmap_atomic(walk->data);
-   /*
-* The may sleep test only makes sense for sync users.
-* Async users don't need to sleep here anyway.
-*/
-   crypto_yield(walk->flags);
-   }
+   kunmap_atomic(walk->data);
+   crypto_yield(walk->flags);
 
if (err)
return err;
@@ -140,33 +128,12 @@ int crypto_hash_walk_first(struct ahash_request *req,
 
walk->alignmask = crypto_ahash_alignmask(crypto_ahash_reqtfm(req));
walk->sg = req->src;
-   walk->flags = req->base.flags & CRYPTO_TFM_REQ_MASK;
+   walk->flags = req->base.flags;
 
return hash_walk_new_entry(walk);
 }
 EXPORT_SYMBOL_GPL(crypto_hash_walk_first);
 
-int crypto_ahash_walk_first(struct ahash_request *req,
-   struct crypto_hash_walk *walk)
-{
-   walk->total = req->nbytes;
-
-   if (!walk->total) {
-   walk->entrylen = 0;
-   return 0;
-   }
-
-   walk->alignmask = crypto_ahash_alignmask(crypto_ahash_reqtfm(req));
-   walk->sg = req->src;
-   walk->flags = req->base.flags & CRYPTO_TFM_REQ_MASK;
-   walk->flags |= CRYPTO_ALG_ASYNC;
-
-   BUILD_BUG_ON(CRYPTO_TFM_REQ_MASK & CRYPTO_ALG_ASYNC);
-
-   return hash_walk_new_entry(walk);
-}
-EXPORT_SYMBOL_GPL(crypto_ahash_walk_first);
-
 static int ahash_setkey_unaligned(struct crypto_ahash *tfm, const u8 *key,
unsigned int keylen)
 {
diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h
index 89f6f46ab2b8..6d3ad5ac4d28 100644
--- a/include/crypto/internal/hash.h
+++ b/include/crypto/internal/hash.h
@@ -62,25 +62,12 @@ struct crypto_shash_spawn {
 int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err);
 int crypto_hash_walk_first(struct ahash_request *req,
   struct crypto_hash_walk *walk);
-int crypto_ahash_walk_first(struct ahash_request *req,
-  struct crypto_hash_walk *walk);
-
-static inline int crypto_ahash_walk_done(struct crypto_hash_walk *walk,
-int err)
-{
-   return crypto_hash_walk_done(walk, err);
-}
 
 static inline int crypto_hash_walk_last(struct crypto_hash_walk *walk)
 {
return !(walk->entrylen | walk->total);
 }
 
-static inline int crypto_ahash_walk_last(struct crypto_hash_walk *walk)
-{
-   return crypto_hash_walk_last(walk);
-}
-
 int crypto_register_ahash(struct ahash_alg *alg);
 void crypto_unregister_ahash(struct ahash_alg *alg);
 int crypto_register_ahashes(struct ahash_alg *algs, int count);
-- 
2.28.0.rc0.12.gb6a658bd00c9



Re: [PATCH] man/statx: Add STATX_ATTR_DAX

2020-09-29 Thread Ira Weiny
On Tue, Sep 29, 2020 at 10:38:46AM +0200, Michael Kerrisk (man-pages) wrote:
> Hello Ira,
> 
> On 9/28/20 6:42 PM, Ira Weiny wrote:
> > On Mon, May 04, 2020 at 05:20:16PM -0700, 'Ira Weiny' wrote:
> >> From: Ira Weiny 
> >>
> >> Linux 5.8 is slated to have STATX_ATTR_DAX support.
> >>
> >> https://lore.kernel.org/lkml/20200428002142.404144-4-ira.we...@intel.com/
> >> https://lore.kernel.org/lkml/20200504161352.GA13783@magnolia/
> >>
> >> Add the text to the statx man page.
> >>
> >> Signed-off-by: Ira Weiny 
> > 
> > Have I sent this to the wrong list?  Or perhaps I have missed a reply.
> 
> No, it's just me being a bit slow, I'm sorry. Thank you for pinging.

NP

> 
> > I don't see this applied to the man-pages project.[1]  But perhaps I am 
> > looking
> > at the wrong place?
> 
> Your patch is applied now, and pushed to kernel .org. Thanks!

Sweet!  Thank you!
Ira

> 
> Cheers,
> 
> Michael
> 


Re: [PATCH RFC V3 4/9] x86/pks: Preserve the PKRS MSR on context switch

2020-10-14 Thread Ira Weiny
On Tue, Oct 13, 2020 at 11:31:45AM -0700, Dave Hansen wrote:
> On 10/9/20 12:42 PM, ira.we...@intel.com wrote:
> > From: Ira Weiny 
> > 
> > The PKRS MSR is defined as a per-logical-processor register.  This
> > isolates memory access by logical CPU.  Unfortunately, the MSR is not
> > managed by XSAVE.  Therefore, tasks must save/restore the MSR value on
> > context switch.
> > 
> > Define a saved PKRS value in the task struct, as well as a cached
> > per-logical-processor MSR value which mirrors the MSR value of the
> > current CPU.  Initialize all tasks with the default MSR value.  Then, on
> > schedule in, check the saved task MSR vs the per-cpu value.  If
> > different proceed to write the MSR.  If not avoid the overhead of the
> > MSR write and continue.
> 
> It's probably nice to note how the WRMSR is special here, in addition to
> the comments below.

Sure,

> 
> >  #endif /*_ASM_X86_PKEYS_INTERNAL_H */
> > diff --git a/arch/x86/include/asm/processor.h 
> > b/arch/x86/include/asm/processor.h
> > index 97143d87994c..da2381136b2d 100644
> > --- a/arch/x86/include/asm/processor.h
> > +++ b/arch/x86/include/asm/processor.h
> > @@ -18,6 +18,7 @@ struct vm86;
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  #include 
> >  #include 
> >  #include 
> > @@ -542,6 +543,11 @@ struct thread_struct {
> >  
> > unsigned intsig_on_uaccess_err:1;
> >  
> > +#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
> > +   /* Saved Protection key register for supervisor mappings */
> > +   u32 saved_pkrs;
> > +#endif
> 
> Could you take a look around thread_struct and see if there are some
> other MSRs near which you can stash this?  This seems like a bit of a
> lonely place.

Are you more concerned with aesthetics or the in-memory struct layout?

How about I put it after error_code?

unsigned long   error_code; 
+   
+#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
+   /* Saved Protection key register for supervisor mappings */ 
+   u32 saved_pkrs; 
+#endif 
+   

?

> 
> ...
> >  void flush_thread(void)
> >  {
> > struct task_struct *tsk = current;
> > @@ -195,6 +212,8 @@ void flush_thread(void)
> > memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
> >  
> > fpu__clear_all(&tsk->thread.fpu);
> > +
> > +   pks_init_task(tsk);
> >  }
> >  
> >  void disable_TSC(void)
> > @@ -644,6 +663,8 @@ void __switch_to_xtra(struct task_struct *prev_p, 
> > struct task_struct *next_p)
> >  
> > if ((tifp ^ tifn) & _TIF_SLD)
> > switch_to_sld(tifn);
> > +
> > +   pks_sched_in();
> >  }
> >  
> >  /*
> > diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
> > index 3cf8f775f36d..30f65dd3d0c5 100644
> > --- a/arch/x86/mm/pkeys.c
> > +++ b/arch/x86/mm/pkeys.c
> > @@ -229,3 +229,31 @@ u32 update_pkey_val(u32 pk_reg, int pkey, unsigned int 
> > flags)
> >  
> > return pk_reg;
> >  }
> > +
> > +DEFINE_PER_CPU(u32, pkrs_cache);
> > +
> > +/**
> > + * It should also be noted that the underlying WRMSR(MSR_IA32_PKRS) is not
> > + * serializing but still maintains ordering properties similar to WRPKRU.
> > + * The current SDM section on PKRS needs updating but should be the same as
> > + * that of WRPKRU.  So to quote from the WRPKRU text:
> > + *
> > + * WRPKRU will never execute transiently. Memory accesses
> > + * affected by PKRU register will not execute (even transiently)
> > + * until all prior executions of WRPKRU have completed execution
> > + * and updated the PKRU register.
> > + */
> > +void write_pkrs(u32 new_pkrs)
> > +{
> > +   u32 *pkrs;
> > +
> > +   if (!static_cpu_has(X86_FEATURE_PKS))
> > +   return;
> > +
> > +   pkrs = get_cpu_ptr(&pkrs_cache);
> > +   if (*pkrs != new_pkrs) {
> > +   *pkrs = new_pkrs;
> > +   wrmsrl(MSR_IA32_PKRS, new_pkrs);
> > +   }
> > +   put_cpu_ptr(pkrs);
> > +}
> > 
> 
> It bugs me a *bit* that this is being called in a preempt-disabled
> region, but we still bother with 

Re: [PATCH RFC V3 5/9] x86/pks: Add PKS kernel API

2020-10-14 Thread Ira Weiny
On Tue, Oct 13, 2020 at 11:43:57AM -0700, Dave Hansen wrote:
> > +static inline void pks_update_protection(int pkey, unsigned long 
> > protection)
> > +{
> > +   current->thread.saved_pkrs = update_pkey_val(current->thread.saved_pkrs,
> > +pkey, protection);
> > +   preempt_disable();
> > +   write_pkrs(current->thread.saved_pkrs);
> > +   preempt_enable();
> > +}
> 
> Why does this need preempt count manipulation in addition to the
> get/put_cpu_var() inside of write_pkrs()?

This is a bug.  The disable should be around the update_pkey_val().
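
That is, presumably something along these lines (a sketch of the intended fix,
not the final code):

    static inline void pks_update_protection(int pkey, unsigned long protection)
    {
            /* keep the read-modify-write of saved_pkrs and the MSR write
             * on the same CPU */
            preempt_disable();
            current->thread.saved_pkrs = update_pkey_val(current->thread.saved_pkrs,
                                                         pkey, protection);
            write_pkrs(current->thread.saved_pkrs);
            preempt_enable();
    }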

> 
> > +/**
> > + * PKS access control functions
> > + *
> > + * Change the access of the domain specified by the pkey.  These are global
> > + * updates.  They only affects the current running thread.  It is 
> > undefined and
> > + * a bug for users to call this without having allocated a pkey and using 
> > it as
> > + * pkey here.
> > + *
> > + * pks_mknoaccess()
> > + * Disable all access to the domain
> > + * pks_mkread()
> > + * Make the domain Read only
> > + * pks_mkrdwr()
> > + * Make the domain Read/Write
> > + *
> > + * @pkey the pkey for which the access should change.
> > + *
> > + */
> > +void pks_mknoaccess(int pkey)
> > +{
> > +   pks_update_protection(pkey, PKEY_DISABLE_ACCESS);
> > +}
> > +EXPORT_SYMBOL_GPL(pks_mknoaccess);
> 
> These are named like PTE manipulation functions, which is kinda weird.
> 
> What's wrong with: pks_disable_access(pkey) ?

Internal review suggested these names.  I'm not dead set on them.

FWIW I would rather they not get too wordy.

I was trying to get some consistency with pks_mk*() as meaning PKS 'make' X.

To me, 'disable' implies a state transition, whereas 'make' implies we are
'setting' an absolute value.  I think the latter is a better name.  And 'make'
made more sense because 'set' is so overloaded IMHO.

> 
> > +void pks_mkread(int pkey)
> > +{
> > +   pks_update_protection(pkey, PKEY_DISABLE_WRITE);
> > +}
> > +EXPORT_SYMBOL_GPL(pks_mkread);
> 
> I really don't like this name.  It doesn't make readable, or even
> read-only, *especially* if it was already access-disabled.

Ok.

But it does make sense if going from access-disabled to read, correct?  I could
see this being better named pks_mkreadonly() so that going from RW to this would
make more sense.  Especially after thinking about it above, 'read only' needs to
be in the name.

Before I change anything I'd like to get consensus on naming.

How about the following?

pks_mk_noaccess()
pks_mk_readonly()
pks_mk_readwrite()

?
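
For reference, a caller of the API with those names would look roughly like
this (a sketch only; "my_driver" is a made-up consumer and error handling is
trimmed):

    int pkey = pks_key_alloc("my_driver");

    if (pkey >= 0) {
            pks_mk_readwrite(pkey);         /* open the domain for this thread */
            /* ... access mappings whose PTEs carry this pkey ... */
            pks_mk_noaccess(pkey);          /* close it again */

            pks_key_free(pkey);
    }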

> 
> > +static const char pks_key_user0[] = "kernel";
> > +
> > +/* Store names of allocated keys for debug.  Key 0 is reserved for the 
> > kernel.  */
> > +static const char *pks_key_users[PKS_NUM_KEYS] = {
> > +   pks_key_user0
> > +};
> > +
> > +/*
> > + * Each key is represented by a bit.  Bit 0 is set for key 0 and reserved 
> > for
> > + * its use.  We use ulong for the bit operations but only 16 bits are used.
> > + */
> > +static unsigned long pks_key_allocation_map = 1 << PKS_KERN_DEFAULT_KEY;
> > +
> > +/*
> > + * pks_key_alloc - Allocate a PKS key
> > + *
> > + * @pkey_user: String stored for debugging of key exhaustion.  The caller 
> > is
> > + * responsible to maintain this memory until pks_key_free().
> > + */
> > +int pks_key_alloc(const char * const pkey_user)
> > +{
> > +   int nr;
> > +
> > +   if (!cpu_feature_enabled(X86_FEATURE_PKS))
> > +   return -EINVAL;
> 
> I'm not sure I like -EINVAL for this.  I thought we returned -ENOSPC for
> this case for user pkeys.

-ENOTSUP?

I'm not really sure anyone will need to know the difference between the
platform not supporting the key vs running out of them.  But they are 2
different error conditions.

> 
> > +   while (1) {
> > +   nr = find_first_zero_bit(&pks_key_allocation_map, PKS_NUM_KEYS);
> > +   if (nr >= PKS_NUM_KEYS) {
> > +   pr_info("Cannot allocate supervisor key for %s.\n",
> > +   pkey_user);
> > +   return -ENOSPC;

We return -ENOSPC here when running out of keys.

> > +   }
> > +   if (!test_and_set_bit_lock(nr, &pks_key_allocation_map))
> > +   break;
> > +   }
> > +
> > +   /* for debugging key exhaustion */
> > +   pks_key_users[nr] = pkey_user;
> > +
> > +   return nr;
> > +}
> > +EXPORT_SYMBOL_GPL(pks_key_alloc);
> > +
> > +/*
> > + * pks_key_free - Free a previously allocate PKS key
> > + *
> > + * @pkey: Key to be free'ed
> > + */
> > +void pks_key_free(int pkey)
> > +{
> > +   if (!cpu_feature_enabled(X86_FEATURE_PKS))
> > +   return;
> > +
> > +   if (pkey >= PKS_NUM_KEYS || pkey <= PKS_KERN_DEFAULT_KEY)
> > +   return;
> 
> This seems worthy of a WARN_ON_ONCE() at least.  It's essentially
> corrupt data coming into a kernel API.

Ok, Done,
Ira



Re: [PATCH RFC V3 7/9] x86/entry: Preserve PKRS MSR across exceptions

2020-10-14 Thread Ira Weiny
On Tue, Oct 13, 2020 at 11:52:32AM -0700, Dave Hansen wrote:
> On 10/9/20 12:42 PM, ira.we...@intel.com wrote:
> > @@ -341,6 +341,9 @@ noinstr void irqentry_enter(struct pt_regs *regs, 
> > irqentry_state_t *state)
> > /* Use the combo lockdep/tracing function */
> > trace_hardirqs_off();
> > instrumentation_end();
> > +
> > +done:
> > +   irq_save_pkrs(state);
> >  }
> 
> One nit: This saves *and* sets PKRS.  It's not obvious from the call
> here that PKRS is altered at this site.  Seems like there could be a
> better name.
> 
> Even if we did:
> 
>   irq_save_set_pkrs(state, INIT_VAL);
> 
> It would probably compile down to the same thing, but be *really*
> obvious what's going on.

I suppose that is true.  But I think it is odd having a parameter which is the
same for every call site.

But I'm not going to quibble over something like this.

Changed,
Ira

> 
> >  void irqentry_exit_cond_resched(void)
> > @@ -362,7 +365,12 @@ noinstr void irqentry_exit(struct pt_regs *regs, 
> > irqentry_state_t *state)
> > /* Check whether this returns to user mode */
> > if (user_mode(regs)) {
> > irqentry_exit_to_user_mode(regs);
> > -   } else if (!regs_irqs_disabled(regs)) {
> > +   return;
> > +   }
> > +
> > +   irq_restore_pkrs(state);
> > +
> > +   if (!regs_irqs_disabled(regs)) {
> > /*
> >  * If RCU was not watching on entry this needs to be done
> >  * carefully and needs the same ordering of lockdep/tracing
> > 
> 


Re: [PATCH RFC V3 8/9] x86/fault: Report the PKRS state on fault

2020-10-14 Thread Ira Weiny
On Tue, Oct 13, 2020 at 11:56:53AM -0700, Dave Hansen wrote:
> > @@ -548,6 +549,11 @@ show_fault_oops(struct pt_regs *regs, unsigned long 
> > error_code, unsigned long ad
> >  (error_code & X86_PF_PK)? "protection keys violation" :
> >"permissions violation");
> >  
> > +#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
> > +   if (irq_state && (error_code & X86_PF_PK))
> > +   pr_alert("PKRS: 0x%x\n", irq_state->pkrs);
> > +#endif
> 
> This means everyone will see 'PKRS: 0x0', even if they're on non-PKS
> hardware.  I think I'd rather have this only show PKRS when we're on
> cpu_feature_enabled(PKS) hardware.

Good catch, thanks.

> 
> ...
> > @@ -1148,14 +1156,15 @@ static int fault_in_kernel_space(unsigned long 
> > address)
> >   */
> >  static void
> >  do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
> > -  unsigned long address)
> > +  unsigned long address, irqentry_state_t *irq_state)
> >  {
> > /*
> > -* Protection keys exceptions only happen on user pages.  We
> > -* have no user pages in the kernel portion of the address
> > -* space, so do not expect them here.
> > +* If protection keys are not enabled for kernel space
> > +* do not expect Pkey errors here.
> >  */
> 
> Let's fix the double-negative:
> 
>   /*
> >* PF_PK is only expected on kernel addresses when
>* supervisor pkeys are enabled:
>*/

done. thanks.

> 
> > -   WARN_ON_ONCE(hw_error_code & X86_PF_PK);
> > +   if (!IS_ENABLED(CONFIG_ARCH_HAS_SUPERVISOR_PKEYS) ||
> > +   !cpu_feature_enabled(X86_FEATURE_PKS))
> > +   WARN_ON_ONCE(hw_error_code & X86_PF_PK);
> 
> Yeah, please stick X86_FEATURE_PKS in disabled-features so you can use
> cpu_feature_enabled(X86_FEATURE_PKS) by itself here..

done.

thanks,
Ira



Re: [PATCH RFC V3 7/9] x86/entry: Preserve PKRS MSR across exceptions

2020-10-14 Thread Ira Weiny
On Wed, Oct 14, 2020 at 09:06:44PM -0700, Dave Hansen wrote:
> On 10/14/20 8:46 PM, Ira Weiny wrote:
> > On Tue, Oct 13, 2020 at 11:52:32AM -0700, Dave Hansen wrote:
> >> On 10/9/20 12:42 PM, ira.we...@intel.com wrote:
> >>> @@ -341,6 +341,9 @@ noinstr void irqentry_enter(struct pt_regs *regs, 
> >>> irqentry_state_t *state)
> >>>   /* Use the combo lockdep/tracing function */
> >>>   trace_hardirqs_off();
> >>>   instrumentation_end();
> >>> +
> >>> +done:
> >>> + irq_save_pkrs(state);
> >>>  }
> >> One nit: This saves *and* sets PKRS.  It's not obvious from the call
> >> here that PKRS is altered at this site.  Seems like there could be a
> >> better name.
> >>
> >> Even if we did:
> >>
> >>irq_save_set_pkrs(state, INIT_VAL);
> >>
> >> It would probably compile down to the same thing, but be *really*
> >> obvious what's going on.
> > I suppose that is true.  But I think it is odd having a parameter which is 
> > the
> > same for every call site.
> 
> Well, it depends on what you optimize for.  I'm trying to optimize for
> the code being understood quickly the first time someone reads it.  To
> me, that's more important than minimizing the number of function
> parameters (which are essentially free).
>

Agreed.  Sorry, I was not trying to be confrontational.  There are just enough
other things that are going to take me time to get right that I need to focus on
them...  :-D

Sorry,
Ira


Re: [PATCH RFC V3 9/9] x86/pks: Add PKS test code

2020-10-14 Thread Ira Weiny
On Tue, Oct 13, 2020 at 12:02:07PM -0700, Dave Hansen wrote:
> On 10/9/20 12:42 PM, ira.we...@intel.com wrote:
> >  #ifdef CONFIG_X86_32
> > /*
> >  * We can fault-in kernel-space virtual memory on-demand. The
> > diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h
> > index cc3510cde64e..f9552bd9341f 100644
> > --- a/include/linux/pkeys.h
> > +++ b/include/linux/pkeys.h
> > @@ -47,7 +47,6 @@ static inline bool arch_pkeys_enabled(void)
> >  static inline void copy_init_pkru_to_fpregs(void)
> >  {
> >  }
> > -
> >  #endif /* ! CONFIG_ARCH_HAS_PKEYS */
> 
> ^ Whitespace damage

Done.

> 
> >  #ifndef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
> > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> > index 0c781f912f9f..f015c09ba5a1 100644
> > --- a/lib/Kconfig.debug
> > +++ b/lib/Kconfig.debug
> > @@ -2400,6 +2400,18 @@ config HYPERV_TESTING
> > help
> >   Select this option to enable Hyper-V vmbus testing.
> >  
> > +config PKS_TESTING
> > +   bool "PKey(S)upervisor testing"
> 
> Seems like we need a space in there somewhere.

heheh...  yea...

> 
> > +   pid = fork();
> > +   if (pid == 0) {
> > +   fd = open("/sys/kernel/debug/x86/run_pks", O_RDWR);
> > +   if (fd < 0) {
> > +   printf("cannot open file\n");
> > +   return -1;
> > +   }
> > +
> 
> Will this return code make anybody mad?  Should we have a nicer return
> code for when this is running on non-PKS hardware?

I'm not sure it will matter much but I think it is better to report the missing
file.[1]

> 
> I'm not going to be too picky about this.  I'll just ask one question:
> Has this found real bugs for you?

Many, especially regressions as things have changed.

> 
> Reviewed-by: Dave Hansen 
> 

Thanks,
Ira

[1]

diff --git a/tools/testing/selftests/x86/test_pks.c 
b/tools/testing/selftests/x86/test_pks.c
index 8037a2a9ff5f..11be4e212d54 100644
--- a/tools/testing/selftests/x86/test_pks.c
+++ b/tools/testing/selftests/x86/test_pks.c
@@ -11,6 +11,8 @@
 #include 
 #include 
 
+#define PKS_TEST_FILE "/sys/kernel/debug/x86/run_pks"
+
 int main(void)
 {
cpu_set_t cpuset;
@@ -25,9 +27,9 @@ int main(void)
 
pid = fork();
if (pid == 0) {
-   fd = open("/sys/kernel/debug/x86/run_pks", O_RDWR);
+   fd = open(PKS_TEST_FILE, O_RDWR);
if (fd < 0) {
-   printf("cannot open file\n");
+   printf("cannot open %s\n", PKS_TEST_FILE);
return -1;
}
 
@@ -45,9 +47,9 @@ int main(void)
} else {
sleep(2);
 
-   fd = open("/sys/kernel/debug/x86/run_pks", O_RDWR);
+   fd = open(PKS_TEST_FILE, O_RDWR);
if (fd < 0) {
-   printf("cannot open file\n");
+   printf("cannot open %s\n", PKS_TEST_FILE);
return -1;
}
 



Re: [PATCH] IB/rdmavt: Fix sizeof mismatch

2020-10-08 Thread Ira Weiny
On Thu, Oct 08, 2020 at 10:52:04AM +0100, Colin King wrote:
> From: Colin Ian King 
> 
> An incorrect sizeof is being used, struct rvt_ibport ** is not correct,
> it should be struct rvt_ibport *. Note that since ** is the same size as
> * this is not causing any issues.  Improve this fix by using
> sizeof(*rdi->ports) as this allows us to not even reference the type
> of the pointer.  Also remove line breaks as the entire statement can
> fit on one line.
> 
> Addresses-Coverity: ("Sizeof not portable (SIZEOF_MISMATCH)")
> Fixes: ff6acd69518e ("IB/rdmavt: Add device structure allocation")
> Signed-off-by: Colin Ian King 

Reviewed-by: Ira Weiny 
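
For reference, the idiom being applied, sketched outside of rdmavt with a
generic struct foo (illustrative only):

    struct foo **table;

    /* preferred: sizes the allocation from the pointee, so it stays correct
     * even if the element type changes later */
    table = kcalloc(nports, sizeof(*table), GFP_KERNEL);

    /* fragile: spelling the type by hand invites mistakes such as writing
     * sizeof(struct foo **) where sizeof(struct foo *) was meant */
    table = kcalloc(nports, sizeof(struct foo *), GFP_KERNEL);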

> ---
>  drivers/infiniband/sw/rdmavt/vt.c | 4 +---
>  1 file changed, 1 insertion(+), 3 deletions(-)
> 
> diff --git a/drivers/infiniband/sw/rdmavt/vt.c 
> b/drivers/infiniband/sw/rdmavt/vt.c
> index f904bb34477a..2d534c450f3c 100644
> --- a/drivers/infiniband/sw/rdmavt/vt.c
> +++ b/drivers/infiniband/sw/rdmavt/vt.c
> @@ -95,9 +95,7 @@ struct rvt_dev_info *rvt_alloc_device(size_t size, int 
> nports)
>   if (!rdi)
>   return rdi;
>  
> - rdi->ports = kcalloc(nports,
> -  sizeof(struct rvt_ibport **),
> -  GFP_KERNEL);
> + rdi->ports = kcalloc(nports, sizeof(*rdi->ports), GFP_KERNEL);
>   if (!rdi->ports)
>   ib_dealloc_device(>ibdev);
>  
> -- 
> 2.27.0
> 


Re: [PATCH] mm: make device private reference counts zero based

2020-10-09 Thread Ira Weiny
On Thu, Oct 08, 2020 at 10:25:44AM -0700, Ralph Campbell wrote:
> ZONE_DEVICE struct pages have an extra reference count that complicates the
> code for put_page() and several places in the kernel that need to check the
> reference count to see that a page is not being used (gup, compaction,
> migration, etc.). Clean up the code so the reference count doesn't need to
> be treated specially for device private pages, leaving DAX as still being
> a special case.

What about the check in mc_handle_swap_pte()?

mm/memcontrol.c:

5513 /*
5514  * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which 
have
5515  * a refcount of 1 when free (unlike normal page)
5516  */
5517 if (!page_ref_add_unless(page, 1, 1))
5518 return NULL;

... does that need to change?  Perhaps just the comment?

> 
> Signed-off-by: Ralph Campbell 
> ---
> 

[snip]

>  
>  void put_devmap_managed_page(struct page *page);
> diff --git a/lib/test_hmm.c b/lib/test_hmm.c
> index e151a7f10519..bf92a261fa6f 100644
> --- a/lib/test_hmm.c
> +++ b/lib/test_hmm.c
> @@ -509,10 +509,15 @@ static bool dmirror_allocate_chunk(struct 
> dmirror_device *mdevice,
>   mdevice->devmem_count * (DEVMEM_CHUNK_SIZE / (1024 * 1024)),
>   pfn_first, pfn_last);
>  
> + /*
> +  * Pages are created with an initial reference count of one but should
> +  * have a reference count of zero while in the free state.
> +  */
>   spin_lock(>lock);
>   for (pfn = pfn_first; pfn < pfn_last; pfn++) {
>   struct page *page = pfn_to_page(pfn);
>  
> + set_page_count(page, 0);

This confuses me.  How does this and init_page_count() not confuse the buddy
allocator?  Don't you have to reset the refcount somewhere after the test?

>   page->zone_device_data = mdevice->free_pages;
>   mdevice->free_pages = page;
>   }
> @@ -561,7 +566,7 @@ static struct page *dmirror_devmem_alloc_page(struct 
> dmirror_device *mdevice)
>   }
>  
>   dpage->zone_device_data = rpage;
> - get_page(dpage);
> + init_page_count(dpage);
>   lock_page(dpage);
>   return dpage;
>  
> diff --git a/mm/internal.h b/mm/internal.h
> index c43ccdddb0f6..e1443b73aa9b 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
>  

[snip]

> diff --git a/mm/swap.c b/mm/swap.c
> index 0eb057141a04..93d880c6f73c 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -116,12 +116,11 @@ static void __put_compound_page(struct page *page)
>  void __put_page(struct page *page)
>  {
>   if (is_zone_device_page(page)) {
> - put_dev_pagemap(page->pgmap);
> -
>   /*
>* The page belongs to the device that created pgmap. Do
>* not return it to page allocator.
>*/
> + free_zone_device_page(page);

I really like this.

Ira



[PATCH RFC V3 3/9] x86/pks: Enable Protection Keys Supervisor (PKS)

2020-10-09 Thread ira . weiny
From: Fenghua Yu 

Protection Keys for Supervisor pages (PKS) enables fast, hardware thread
specific, manipulation of permission restrictions on supervisor page
mappings.  It uses the same mechanism of Protection Keys as those on
User mappings but applies that mechanism to supervisor mappings using a
supervisor specific MSR.

Kernel users can thus define 'domains' of page mappings which have an
extra level of protection beyond those specified in the supervisor page
table entries.

Define ARCH_HAS_SUPERVISOR_PKEYS to distinguish this functionality from
the existing ARCH_HAS_PKEYS and then enable PKS when configured and
indicated by the CPU instance.  While not strictly necessary in this
patch, ARCH_HAS_SUPERVISOR_PKEYS separates this functionality through
the patch series so it is introduced here.

Reviewed-by: Dan Williams 
Co-developed-by: Ira Weiny 
Signed-off-by: Ira Weiny 
Signed-off-by: Fenghua Yu 
---
 arch/x86/Kconfig|  1 +
 arch/x86/include/asm/cpufeatures.h  |  1 +
 arch/x86/include/uapi/asm/processor-flags.h |  2 ++
 arch/x86/kernel/cpu/common.c| 15 +++
 mm/Kconfig  |  2 ++
 5 files changed, 21 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7101ac64bb20..1bfb912342a3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1873,6 +1873,7 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS
depends on X86_64 && (CPU_SUP_INTEL || CPU_SUP_AMD)
select ARCH_USES_HIGH_VMA_FLAGS
select ARCH_HAS_PKEYS
+   select ARCH_HAS_SUPERVISOR_PKEYS
help
  Memory Protection Keys provides a mechanism for enforcing
  page-based protections, but without requiring modification of the
diff --git a/arch/x86/include/asm/cpufeatures.h 
b/arch/x86/include/asm/cpufeatures.h
index 2901d5df4366..7646b03fc3f4 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -353,6 +353,7 @@
 #define X86_FEATURE_CLDEMOTE   (16*32+25) /* CLDEMOTE instruction */
 #define X86_FEATURE_MOVDIRI(16*32+27) /* MOVDIRI instruction */
 #define X86_FEATURE_MOVDIR64B  (16*32+28) /* MOVDIR64B instruction */
+#define X86_FEATURE_PKS(16*32+31) /* Protection Keys 
for Supervisor pages */
 
 /* AMD-defined CPU features, CPUID level 0x8007 (EBX), word 17 */
 #define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery 
support */
diff --git a/arch/x86/include/uapi/asm/processor-flags.h 
b/arch/x86/include/uapi/asm/processor-flags.h
index bcba3c643e63..191c574b2390 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -130,6 +130,8 @@
 #define X86_CR4_SMAP   _BITUL(X86_CR4_SMAP_BIT)
 #define X86_CR4_PKE_BIT22 /* enable Protection Keys support */
 #define X86_CR4_PKE_BITUL(X86_CR4_PKE_BIT)
+#define X86_CR4_PKS_BIT24 /* enable Protection Keys for 
Supervisor */
+#define X86_CR4_PKS_BITUL(X86_CR4_PKS_BIT)
 
 /*
  * x86-64 Task Priority Register, CR8
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c5d6f17d9b9d..a129d5e4afab 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1447,6 +1447,20 @@ static void validate_apic_and_package_id(struct 
cpuinfo_x86 *c)
 #endif
 }
 
+/*
+ * PKS is independent of PKU and either or both may be supported on a CPU.
+ * Configure PKS if the cpu supports the feature.
+ */
+static void setup_pks(void)
+{
+   if (!IS_ENABLED(CONFIG_ARCH_HAS_SUPERVISOR_PKEYS))
+   return;
+   if (!cpu_feature_enabled(X86_FEATURE_PKS))
+   return;
+
+   cr4_set_bits(X86_CR4_PKS);
+}
+
 /*
  * This does the hard work of actually picking apart the CPU stuff...
  */
@@ -1544,6 +1558,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 
x86_init_rdrand(c);
setup_pku(c);
+   setup_pks();
 
/*
 * Clear/Set all flags overridden by options, need do it
diff --git a/mm/Kconfig b/mm/Kconfig
index 6c974888f86f..1b9bc004d9bc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -822,6 +822,8 @@ config ARCH_USES_HIGH_VMA_FLAGS
bool
 config ARCH_HAS_PKEYS
bool
+config ARCH_HAS_SUPERVISOR_PKEYS
+   bool
 
 config PERCPU_STATS
bool "Collect percpu memory statistics"
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC V3 6/9] x86/entry: Pass irqentry_state_t by reference

2020-10-09 Thread ira . weiny
From: Ira Weiny 

In preparation for adding PKS information to struct irqentry_state_t,
change all call sites and usages to pass the struct by reference
instead of by value.

Signed-off-by: Ira Weiny 
---
 arch/x86/entry/common.c | 16 +++-
 arch/x86/include/asm/idtentry.h | 29 +
 arch/x86/kernel/kvm.c   |  4 ++--
 arch/x86/kernel/nmi.c   |  7 ---
 arch/x86/kernel/traps.c | 21 +
 arch/x86/mm/fault.c |  4 ++--
 include/linux/entry-common.h|  7 ---
 kernel/entry/common.c   | 20 
 8 files changed, 57 insertions(+), 51 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 870efeec8bda..305da13770b6 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -209,9 +209,9 @@ SYSCALL_DEFINE0(ni_syscall)
return -ENOSYS;
 }
 
-noinstr bool idtentry_enter_nmi(struct pt_regs *regs)
+noinstr void idtentry_enter_nmi(struct pt_regs *regs, irqentry_state_t 
*irq_state)
 {
-   bool irq_state = lockdep_hardirqs_enabled();
+   irq_state->exit_rcu = lockdep_hardirqs_enabled();
 
__nmi_enter();
lockdep_hardirqs_off(CALLER_ADDR0);
@@ -222,15 +222,13 @@ noinstr bool idtentry_enter_nmi(struct pt_regs *regs)
trace_hardirqs_off_finish();
ftrace_nmi_enter();
instrumentation_end();
-
-   return irq_state;
 }
 
-noinstr void idtentry_exit_nmi(struct pt_regs *regs, bool restore)
+noinstr void idtentry_exit_nmi(struct pt_regs *regs, irqentry_state_t 
*irq_state)
 {
instrumentation_begin();
ftrace_nmi_exit();
-   if (restore) {
+   if (irq_state->exit_rcu) {
trace_hardirqs_on_prepare();
lockdep_hardirqs_on_prepare(CALLER_ADDR0);
}
@@ -238,7 +236,7 @@ noinstr void idtentry_exit_nmi(struct pt_regs *regs, bool 
restore)
 
rcu_nmi_exit();
lockdep_hardirq_exit();
-   if (restore)
+   if (irq_state->exit_rcu)
lockdep_hardirqs_on(CALLER_ADDR0);
__nmi_exit();
 }
@@ -295,7 +293,7 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct 
pt_regs *regs)
bool inhcall;
irqentry_state_t state;
 
-   state = irqentry_enter(regs);
+   irqentry_enter(regs, );
old_regs = set_irq_regs(regs);
 
instrumentation_begin();
@@ -311,7 +309,7 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct 
pt_regs *regs)
instrumentation_end();
restore_inhcall(inhcall);
} else {
-   irqentry_exit(regs, state);
+   irqentry_exit(regs, );
}
 }
 #endif /* CONFIG_XEN_PV */
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index a0638640f1ed..622889ba21d0 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -11,8 +11,8 @@
 
 #include 
 
-bool idtentry_enter_nmi(struct pt_regs *regs);
-void idtentry_exit_nmi(struct pt_regs *regs, bool irq_state);
+void idtentry_enter_nmi(struct pt_regs *regs, irqentry_state_t *irq_state);
+void idtentry_exit_nmi(struct pt_regs *regs, irqentry_state_t *irq_state);
 
 /**
  * DECLARE_IDTENTRY - Declare functions for simple IDT entry points
@@ -52,12 +52,13 @@ static __always_inline void __##func(struct pt_regs *regs); 
\
\
 __visible noinstr void func(struct pt_regs *regs)  \
 {  \
-   irqentry_state_t state = irqentry_enter(regs);  \
+   irqentry_state_t state; \
\
+   irqentry_enter(regs, );   \
instrumentation_begin();\
__##func (regs);\
instrumentation_end();  \
-   irqentry_exit(regs, state); \
+   irqentry_exit(regs, );\
 }  \
\
 static __always_inline void __##func(struct pt_regs *regs)
@@ -99,12 +100,13 @@ static __always_inline void __##func(struct pt_regs *regs, 
\
 __visible noinstr void func(struct pt_regs *regs,  \
unsigned long error_code)   \
 {  \
-   irqentry_state_t state = irqentry_enter(regs);  \
+   irqentry_stat

[PATCH RFC V3 5/9] x86/pks: Add PKS kernel API

2020-10-09 Thread ira . weiny
From: Fenghua Yu 

PKS allows kernel users to define domains of page mappings which have
additional protections beyond the paging protections.

Add an API to allocate, use, and free a protection key which identifies
such a domain.  Export 5 new symbols pks_key_alloc(), pks_mknoaccess(),
pks_mkread(), pks_mkrdwr(), and pks_key_free().  Add 2 new macros;
PAGE_KERNEL_PKEY(key) and _PAGE_PKEY(pkey).
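
For illustration, a minimal usage sketch of this API (the mapping step and the
error-value convention below are assumptions for the example, not part of this
patch):

	int pkey = pks_key_alloc("my_user");
	if (pkey < 0)			/* allocation can fail, see the docs below */
		return pkey;

	/* hypothetical helper: create a mapping whose PTEs carry this pkey */
	vaddr = my_map_pages(pages, nr_pages, PAGE_KERNEL_PKEY(pkey));

	pks_mknoaccess(pkey);		/* default the domain to no access */

	/* ... later, in the thread that needs the data ... */
	pks_mkrdwr(pkey);		/* open a write window for this thread */
	memcpy(vaddr, src, len);
	pks_mknoaccess(pkey);		/* and close it again */

	pks_key_free(pkey);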

Update the protection key documentation to cover pkeys on supervisor
pages.

Co-developed-by: Ira Weiny 
Signed-off-by: Ira Weiny 
Signed-off-by: Fenghua Yu 
---
 Documentation/core-api/protection-keys.rst | 101 ++---
 arch/x86/include/asm/pgtable_types.h   |  12 ++
 arch/x86/include/asm/pkeys.h   |  11 ++
 arch/x86/include/asm/pkeys_common.h|   4 +
 arch/x86/mm/pkeys.c| 125 +
 include/linux/pgtable.h|   4 +
 include/linux/pkeys.h  |  22 
 7 files changed, 261 insertions(+), 18 deletions(-)

diff --git a/Documentation/core-api/protection-keys.rst 
b/Documentation/core-api/protection-keys.rst
index ec575e72d0b2..00a046a913e4 100644
--- a/Documentation/core-api/protection-keys.rst
+++ b/Documentation/core-api/protection-keys.rst
@@ -4,25 +4,33 @@
 Memory Protection Keys
 ==
 
-Memory Protection Keys for Userspace (PKU aka PKEYs) is a feature
-which is found on Intel's Skylake (and later) "Scalable Processor"
-Server CPUs. It will be available in future non-server Intel parts
-and future AMD processors.
-
-For anyone wishing to test or use this feature, it is available in
-Amazon's EC2 C5 instances and is known to work there using an Ubuntu
-17.04 image.
-
 Memory Protection Keys provides a mechanism for enforcing page-based
 protections, but without requiring modification of the page tables
-when an application changes protection domains.  It works by
-dedicating 4 previously ignored bits in each page table entry to a
-"protection key", giving 16 possible keys.
+when an application changes protection domains.
+
+PKeys Userspace (PKU) is a feature which is found on Intel's Skylake "Scalable
+Processor" Server CPUs and later.  And It will be available in future
+non-server Intel parts and future AMD processors.
+
+Future Intel processors will support Protection Keys for Supervisor pages
+(PKS).
+
+For anyone wishing to test or use user space pkeys, it is available in Amazon's
+EC2 C5 instances and is known to work there using an Ubuntu 17.04 image.
+
+pkeys work by dedicating 4 previously Reserved bits in each page table entry to
+a "protection key", giving 16 possible keys.  User and Supervisor pages are
+treated separately.
+
+Protections for each page are controlled with per CPU registers for each type
+of page User and Supervisor.  Each of these 32 bit register stores two separate
+bits (Access Disable and Write Disable) for each key.
 
-There is also a new user-accessible register (PKRU) with two separate
-bits (Access Disable and Write Disable) for each key.  Being a CPU
-register, PKRU is inherently thread-local, potentially giving each
-thread a different set of protections from every other thread.
+For Userspace the register is user-accessible (rdpkru/wrpkru).  For
+Supervisor, the register (MSR_IA32_PKRS) is accessible only to the kernel.
+
+Being a CPU register, pkeys are inherently thread-local, potentially giving
+each thread an independent set of protections from every other thread.
 
 There are two new instructions (RDPKRU/WRPKRU) for reading and writing
 to the new register.  The feature is only available in 64-bit mode,
@@ -30,8 +38,11 @@ even though there is theoretically space in the PAE PTEs.  
These
 permissions are enforced on data access only and have no effect on
 instruction fetches.
 
-Syscalls
-
+For kernel space rdmsr/wrmsr are used to access the kernel MSRs.
+
+
+Syscalls for user space keys
+
 
 There are 3 system calls which directly interact with pkeys::
 
@@ -98,3 +109,57 @@ with a read()::
 The kernel will send a SIGSEGV in both cases, but si_code will be set
 to SEGV_PKERR when violating protection keys versus SEGV_ACCERR when
 the plain mprotect() permissions are violated.
+
+
+Kernel API for PKS support
+==
+
+The following interface is used to allocate, use, and free a pkey which defines
+a 'protection domain' within the kernel.  Setting a pkey value in a supervisor
+mapping adds that mapping to the protection domain.
+
+int pks_key_alloc(const char * const pkey_user);
+#define PAGE_KERNEL_PKEY(pkey)
+#define _PAGE_KEY(pkey)
+void pks_mknoaccess(int pkey);
+void pks_mkread(int pkey);
+void pks_mkrdwr(int pkey);
+void pks_key_free(int pkey);
+
+pks_key_alloc() allocates keys dynamically to allow better use of the limited
+key space.
+
+Callers of pks_key_alloc() _must_ be prepared for

[PATCH RFC V3 0/9] PKS: Add Protection Keys Supervisor (PKS) support RFC v3

2020-10-09 Thread ira . weiny
From: Ira Weiny 

This RFC series has been reviewed by Dave Hansen.

Introduce a new page protection mechanism for supervisor pages, Protection Key
Supervisor (PKS).

2 use cases for PKS are being developed, trusted keys and PMEM.  Trusted keys
is a newer use case which is still being explored.  PMEM was submitted as part
of the RFC (v2) series[1].  However, since then it was found that some callers
of kmap() require a global implementation of PKS.  Specifically some users of
kmap() expect mappings to be available to all kernel threads.  While global use
of PKS is rare it needs to be included for correctness.  Unfortunately the
kmap() updates required a large patch series to make the needed changes at the
various kmap() call sites so that patch set has been split out.  Because the
global PKS feature is only required for that use case it will be deferred to
that set as well.[2]  This patch set is being submitted as a precursor to both
of the use cases.

For an overview of the entire PKS ecosystem, a git tree including this series
and the 2 use cases can be found here:

https://github.com/weiny2/linux-kernel/tree/pks-rfc-v3


PKS enables protections on 'domains' of supervisor pages to limit supervisor
mode access to those pages beyond the normal paging protections.  PKS works in
a similar fashion to user space pkeys, PKU.  As with PKU, supervisor pkeys are
checked in addition to normal paging protections and Access or Writes can be
disabled via a MSR update without TLB flushes when permissions change.  Also
like PKU, a page mapping is assigned to a domain by setting pkey bits in the
page table entry for that mapping.

Access is controlled through a PKRS register which is updated via WRMSR/RDMSR.

XSAVE is not supported for the PKRS MSR.  Therefore the implementation
saves/restores the MSR across context switches and during exceptions.  Nested
exceptions are supported by each exception getting a new PKS state.

For consistent behavior with current paging protections, pkey 0 is reserved and
configured to allow full access via the pkey mechanism, thus preserving the
default paging protections on mappings with the default pkey value of 0.

Other keys (1-15) are allocated by an allocator which prepares us for key
contention from day one.  Kernel users should be prepared for the allocator to
fail either because of key exhaustion or due to PKS not being supported on the
arch and/or CPU instance.

The following are key attributes of PKS.

   1) Fast switching of permissions
1a) Prevents access without page table manipulations
1b) No TLB flushes required
   2) Works on a per thread basis

PKS is available with 4 and 5 level paging.  Like PKRU it consumes 4 bits from
the PTE to store the pkey within the entry.
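
For reference, each key gets 2 bits in the 32 bit PKRS MSR; the defines below
come from patch 1/9 of this series and the worked value is only an example:

	#define PKR_AD_BIT		0x1	/* Access Disable */
	#define PKR_WD_BIT		0x2	/* Write Disable */
	#define PKR_BITS_PER_PKEY	2
	#define PKR_AD_KEY(pkey)	(PKR_AD_BIT << ((pkey) * PKR_BITS_PER_PKEY))

	/*
	 * e.g. PKR_AD_KEY(3) == 0x1 << 6 == 0x40: access to pkey 3 is disabled,
	 * while pkey 0 (bits 1:0 == 0) keeps full access, preserving the default
	 * paging protections described above.
	 */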


[1] https://lore.kernel.org/lkml/20200717072056.73134-1-ira.we...@intel.com/
[2] 
https://github.com/weiny2/linux-kernel/commit/f10abb0f0d7b4e14f03fc8890313a5830cde1e49
and a testing patch

https://github.com/weiny2/linux-kernel/commit/2a8e0fc7654a7c69b243d628f63b01ff26a5a797


Fenghua Yu (3):
  x86/fpu: Refactor arch_set_user_pkey_access() for PKS support
  x86/pks: Enable Protection Keys Supervisor (PKS)
  x86/pks: Add PKS kernel API

Ira Weiny (6):
  x86/pkeys: Create pkeys_common.h
  x86/pks: Preserve the PKRS MSR on context switch
  x86/entry: Pass irqentry_state_t by reference
  x86/entry: Preserve PKRS MSR across exceptions
  x86/fault: Report the PKRS state on fault
  x86/pks: Add PKS test code

 Documentation/core-api/protection-keys.rst  | 102 ++-
 arch/x86/Kconfig|   1 +
 arch/x86/entry/common.c |  57 +-
 arch/x86/include/asm/cpufeatures.h  |   1 +
 arch/x86/include/asm/idtentry.h |  29 +-
 arch/x86/include/asm/msr-index.h|   1 +
 arch/x86/include/asm/pgtable.h  |  13 +-
 arch/x86/include/asm/pgtable_types.h|  12 +
 arch/x86/include/asm/pkeys.h|  15 +
 arch/x86/include/asm/pkeys_common.h |  36 +
 arch/x86/include/asm/processor.h|  13 +
 arch/x86/include/uapi/asm/processor-flags.h |   2 +
 arch/x86/kernel/cpu/common.c|  17 +
 arch/x86/kernel/cpu/mce/core.c  |   4 +
 arch/x86/kernel/fpu/xstate.c|  22 +-
 arch/x86/kernel/kvm.c   |   4 +-
 arch/x86/kernel/nmi.c   |   7 +-
 arch/x86/kernel/process.c   |  21 +
 arch/x86/kernel/traps.c |  21 +-
 arch/x86/mm/fault.c |  86 ++-
 arch/x86/mm/pkeys.c | 188 +-
 include/linux/entry-common.h|  19 +-
 include/linux/pgtable.h |   4 +
 include/linux/pkeys.h   |  23 +-
 kernel/entry/common.c   |  28 +-
 lib/Kconfig.debug   |  12 +
 lib/Makefile|   3 +
 lib/pks/Makefile

[PATCH RFC V3 4/9] x86/pks: Preserve the PKRS MSR on context switch

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The PKRS MSR is defined as a per-logical-processor register.  This
isolates memory access by logical CPU.  Unfortunately, the MSR is not
managed by XSAVE.  Therefore, tasks must save/restore the MSR value on
context switch.

Define a saved PKRS value in the task struct, as well as a cached
per-logical-processor MSR value which mirrors the MSR value of the
current CPU.  Initialize all tasks with the default MSR value.  Then, on
schedule in, check the saved task MSR vs the per-cpu value.  If
different, proceed to write the MSR.  If not, avoid the overhead of the
MSR write and continue.

Follow on patches will update the saved PKRS as well as the MSR if
needed.
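
A simplified sketch of that schedule-in check (the helper name pks_sched_in()
is a guess here, and whether the comparison lives in the caller or inside
write_pkrs() is an implementation detail of the patch below):

	DECLARE_PER_CPU(u32, pkrs_cache);

	static inline void pks_sched_in(void)
	{
		/*
		 * Only pay for the WRMSR when the incoming task's saved value
		 * differs from what this CPU currently holds.
		 */
		if (this_cpu_read(pkrs_cache) != current->thread.saved_pkrs)
			write_pkrs(current->thread.saved_pkrs);
	}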

Co-developed-by: Fenghua Yu 
Signed-off-by: Fenghua Yu 
Signed-off-by: Ira Weiny 
---
 arch/x86/include/asm/msr-index.h|  1 +
 arch/x86/include/asm/pkeys_common.h | 20 
 arch/x86/include/asm/processor.h| 13 +
 arch/x86/kernel/cpu/common.c|  2 ++
 arch/x86/kernel/process.c   | 21 +
 arch/x86/mm/pkeys.c | 28 
 6 files changed, 85 insertions(+)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 2859ee4f39a8..e467e087f1b3 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -747,6 +747,7 @@
 
 #define MSR_IA32_TSC_DEADLINE  0x06E0
 
+#define MSR_IA32_PKRS  0x06E1
 
 #define MSR_TSX_FORCE_ABORT0x010F
 
diff --git a/arch/x86/include/asm/pkeys_common.h 
b/arch/x86/include/asm/pkeys_common.h
index a9f086f1e4b4..05781be33c14 100644
--- a/arch/x86/include/asm/pkeys_common.h
+++ b/arch/x86/include/asm/pkeys_common.h
@@ -8,4 +8,24 @@
 
 #define PKR_AD_KEY(pkey)   (PKR_AD_BIT << ((pkey) * PKR_BITS_PER_PKEY))
 
+/*
+ * Define a default PKRS value for each task.
+ *
+ * Key 0 has no restriction.  All other keys are set to the most restrictive
+ * value which is access disabled (AD=1).
+ *
+ * NOTE: This needs to be a macro to be used as part of the INIT_THREAD macro.
+ */
+#define INIT_PKRS_VALUE (PKR_AD_KEY(1) | PKR_AD_KEY(2) | PKR_AD_KEY(3) | \
+PKR_AD_KEY(4) | PKR_AD_KEY(5) | PKR_AD_KEY(6) | \
+PKR_AD_KEY(7) | PKR_AD_KEY(8) | PKR_AD_KEY(9) | \
+PKR_AD_KEY(10) | PKR_AD_KEY(11) | PKR_AD_KEY(12) | \
+PKR_AD_KEY(13) | PKR_AD_KEY(14) | PKR_AD_KEY(15))
+
+#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
+void write_pkrs(u32 new_pkrs);
+#else
+static inline void write_pkrs(u32 new_pkrs) { }
+#endif
+
 #endif /*_ASM_X86_PKEYS_INTERNAL_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 97143d87994c..da2381136b2d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -18,6 +18,7 @@ struct vm86;
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -542,6 +543,11 @@ struct thread_struct {
 
unsigned intsig_on_uaccess_err:1;
 
+#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
+   /* Saved Protection key register for supervisor mappings */
+   u32 saved_pkrs;
+#endif
+
/* Floating point and extended processor state */
struct fpu  fpu;
/*
@@ -840,8 +846,15 @@ static inline void spin_lock_prefetch(const void *x)
 #define STACK_TOP  TASK_SIZE_LOW
 #define STACK_TOP_MAX  TASK_SIZE_MAX
 
+#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
+#define INIT_THREAD_PKRS   .saved_pkrs = INIT_PKRS_VALUE
+#else
+#define INIT_THREAD_PKRS   0
+#endif
+
 #define INIT_THREAD  { \
.addr_limit = KERNEL_DS,\
+   INIT_THREAD_PKRS,   \
 }
 
 extern unsigned long KSTK_ESP(struct task_struct *task);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a129d5e4afab..968863d59b6c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -57,6 +57,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "cpu.h"
 
@@ -1458,6 +1459,7 @@ static void setup_pks(void)
if (!cpu_feature_enabled(X86_FEATURE_PKS))
return;
 
+   write_pkrs(INIT_PKRS_VALUE);
cr4_set_bits(X86_CR4_PKS);
 }
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ba4593a913fa..eb3a95a69392 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -43,6 +43,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "process.h"
 
@@ -187,6 +188,22 @@ int copy_thread(unsigned long clone_flags, unsigned long 
sp, unsigned long arg,
return ret;
 }
 
+#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
+DECLARE_PER_CPU(u32, pkrs_cache);
+static inline void pks_init_task(struct task_struct *tsk)
+{
+   /* New tasks get the most restrictive

[PATCH RFC V3 2/9] x86/fpu: Refactor arch_set_user_pkey_access() for PKS support

2020-10-09 Thread ira . weiny
From: Fenghua Yu 

Define a helper, update_pkey_val(), which will be used to support both
Protection Key User (PKU) and the new Protection Key for Supervisor
(PKS) in subsequent patches.
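
As a worked example of the bit manipulation this helper centralizes (using
PKR_AD_BIT = 0x1, PKR_WD_BIT = 0x2 and PKR_BITS_PER_PKEY = 2 from the previous
patch):

	/*
	 * update_pkey_val(0, 2, PKEY_DISABLE_ACCESS):
	 *   pkey_shift = 2 * PKR_BITS_PER_PKEY   = 4
	 *   clear mask = ((1 << 2) - 1) << 4     = 0x30
	 *   set bits   = PKR_AD_BIT << 4         = 0x10
	 * => returns 0x10: access disabled for pkey 2, all other keys untouched.
	 */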

Co-developed-by: Ira Weiny 
Signed-off-by: Ira Weiny 
Signed-off-by: Fenghua Yu 
---
 arch/x86/include/asm/pkeys.h |  2 ++
 arch/x86/kernel/fpu/xstate.c | 22 --
 arch/x86/mm/pkeys.c  | 21 +
 3 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index f9feba80894b..4526245b03e5 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -136,4 +136,6 @@ static inline int vma_pkey(struct vm_area_struct *vma)
return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT;
 }
 
+u32 update_pkey_val(u32 pk_reg, int pkey, unsigned int flags);
+
 #endif /*_ASM_X86_PKEYS_H */
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index b55cf3acd82a..725f10670d0a 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -990,9 +990,7 @@ const void *get_xsave_field_ptr(int xfeature_nr)
 int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val)
 {
-   u32 old_pkru;
-   int pkey_shift = (pkey * PKR_BITS_PER_PKEY);
-   u32 new_pkru_bits = 0;
+   u32 pkru;
 
/*
 * This check implies XSAVE support.  OSPKE only gets
@@ -1008,21 +1006,9 @@ int arch_set_user_pkey_access(struct task_struct *tsk, 
int pkey,
 */
WARN_ON_ONCE(pkey >= arch_max_pkey());
 
-   /* Set the bits we need in PKRU:  */
-   if (init_val & PKEY_DISABLE_ACCESS)
-   new_pkru_bits |= PKR_AD_BIT;
-   if (init_val & PKEY_DISABLE_WRITE)
-   new_pkru_bits |= PKR_WD_BIT;
-
-   /* Shift the bits in to the correct place in PKRU for pkey: */
-   new_pkru_bits <<= pkey_shift;
-
-   /* Get old PKRU and mask off any old bits in place: */
-   old_pkru = read_pkru();
-   old_pkru &= ~((PKR_AD_BIT|PKR_WD_BIT) << pkey_shift);
-
-   /* Write old part along with new part: */
-   write_pkru(old_pkru | new_pkru_bits);
+   pkru = read_pkru();
+   pkru = update_pkey_val(pkru, pkey, init_val);
+   write_pkru(pkru);
 
return 0;
 }
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index f5efb4007e74..3cf8f775f36d 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -208,3 +208,24 @@ static __init int setup_init_pkru(char *opt)
return 1;
 }
 __setup("init_pkru=", setup_init_pkru);
+
+/*
+ * Update the pk_reg value and return it.
+ *
+ * Kernel users use the same flags as user space:
+ * PKEY_DISABLE_ACCESS
+ * PKEY_DISABLE_WRITE
+ */
+u32 update_pkey_val(u32 pk_reg, int pkey, unsigned int flags)
+{
+   int pkey_shift = pkey * PKR_BITS_PER_PKEY;
+
+   pk_reg &= ~(((1 << PKR_BITS_PER_PKEY) - 1) << pkey_shift);
+
+   if (flags & PKEY_DISABLE_ACCESS)
+   pk_reg |= PKR_AD_BIT << pkey_shift;
+   if (flags & PKEY_DISABLE_WRITE)
+   pk_reg |= PKR_WD_BIT << pkey_shift;
+
+   return pk_reg;
+}
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC V3 9/9] x86/pks: Add PKS test code

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The core PKS functionality provides an interface for kernel users to
reserve keys for their domains, set up the page tables with those keys, and
control access to those domains when needed.

Define test code which exercises the core functionality of PKS via a
debugfs entry.  Basic checks can be triggered on boot with a kernel
command line option while both basic and preemption checks can be
triggered with separate debugfs values.

debugfs controls are:

'0' -- Run access tests with a single pkey
'1' -- Set up the pkey register with no access for the pkey allocated to
   this fd
'2' -- Check that the pkey register updated in '1' is still the same.
   (To be used after a forced context switch.)
'3' -- Allocate all pkeys possible and run tests on each pkey allocated.
   DEFAULT when run at boot.

Closing the fd will clean up and release the pkey; therefore, to exercise
context switch testing a user space program is provided in:

.../tools/testing/selftests/x86/test_pks.c
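
A condensed sketch of the user space side (paraphrasing test_pks.c; error
handling and the fork/sleep choreography are omitted):

	int fd = open("/sys/kernel/debug/x86/run_pks", O_RDWR);

	write(fd, "1", 1);	/* arm no-access on the pkey allocated to this fd */
	/* ... force a context switch, as the fork/sleep in test_pks.c does ... */
	write(fd, "2", 1);	/* check that the armed PKRS value survived */
	close(fd);		/* cleanup: the pkey is released */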

Co-developed-by: Fenghua Yu 
Signed-off-by: Fenghua Yu 
Signed-off-by: Ira Weiny 
---
 Documentation/core-api/protection-keys.rst |   1 +
 arch/x86/mm/fault.c|  23 +
 include/linux/pkeys.h  |   1 -
 lib/Kconfig.debug  |  12 +
 lib/Makefile   |   3 +
 lib/pks/Makefile   |   3 +
 lib/pks/pks_test.c | 690 +
 tools/testing/selftests/x86/Makefile   |   3 +-
 tools/testing/selftests/x86/test_pks.c |  65 ++
 9 files changed, 799 insertions(+), 2 deletions(-)
 create mode 100644 lib/pks/Makefile
 create mode 100644 lib/pks/pks_test.c
 create mode 100644 tools/testing/selftests/x86/test_pks.c

diff --git a/Documentation/core-api/protection-keys.rst 
b/Documentation/core-api/protection-keys.rst
index 00a046a913e4..c60366921d60 100644
--- a/Documentation/core-api/protection-keys.rst
+++ b/Documentation/core-api/protection-keys.rst
@@ -163,3 +163,4 @@ of WRPKRU.  So to quote from the WRPKRU text:
until all prior executions of WRPKRU have completed execution
and updated the PKRU register.
 
+Example code can be found in lib/pks/pks_test.c
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ee761c993f58..dd5af9399131 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -18,6 +18,7 @@
 #include  /* faulthandler_disabled()  */
 #include  /* efi_recover_from_page_fault()*/
 #include 
+#include 
 
 #include /* boot_cpu_has, ...*/
 #include  /* dotraplinkage, ...   */
@@ -1149,6 +1150,25 @@ static int fault_in_kernel_space(unsigned long address)
return address >= TASK_SIZE_MAX;
 }
 
+#ifdef CONFIG_PKS_TESTING
+bool pks_test_callback(irqentry_state_t *irq_state);
+static bool handle_pks_testing(unsigned long hw_error_code, irqentry_state_t 
*irq_state)
+{
+   /*
+* If we get a protection key exception it could be because we
+* are running the PKS test.  If so, pks_test_callback() will
+* clear the protection mechanism and return true to indicate
+* the fault was handled.
+*/
+   return (hw_error_code & X86_PF_PK) && pks_test_callback(irq_state);
+}
+#else
+static bool handle_pks_testing(unsigned long hw_error_code, irqentry_state_t 
*irq_state)
+{
+   return false;
+}
+#endif
+
 /*
  * Called for all faults where 'address' is part of the kernel address
  * space.  Might get called for faults that originate from *code* that
@@ -1166,6 +1186,9 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long 
hw_error_code,
!cpu_feature_enabled(X86_FEATURE_PKS))
WARN_ON_ONCE(hw_error_code & X86_PF_PK);
 
+   if (handle_pks_testing(hw_error_code, irq_state))
+   return;
+
 #ifdef CONFIG_X86_32
/*
 * We can fault-in kernel-space virtual memory on-demand. The
diff --git a/include/linux/pkeys.h b/include/linux/pkeys.h
index cc3510cde64e..f9552bd9341f 100644
--- a/include/linux/pkeys.h
+++ b/include/linux/pkeys.h
@@ -47,7 +47,6 @@ static inline bool arch_pkeys_enabled(void)
 static inline void copy_init_pkru_to_fpregs(void)
 {
 }
-
 #endif /* ! CONFIG_ARCH_HAS_PKEYS */
 
 #ifndef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 0c781f912f9f..f015c09ba5a1 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2400,6 +2400,18 @@ config HYPERV_TESTING
help
  Select this option to enable Hyper-V vmbus testing.
 
+config PKS_TESTING
+   bool "PKey(S)upervisor testing"
+   default n
+   depends on ARCH_HAS_SUPERVISOR_PKEYS
+   help
+ Select this option to enable testing of PKS core software and
+ hardware.  The PKS core provides a mechanism to allocate keys as well
+ as maintain the protection settings across context

[PATCH RFC V3 7/9] x86/entry: Preserve PKRS MSR across exceptions

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The PKRS MSR is not managed by XSAVE.  It is preserved through a context
switch but this support leaves exception handling code open to memory
accesses during exceptions.

2 possible places for preserving this state were considered,
irqentry_state_t or pt_regs.[1]  pt_regs was much more complicated and
was potentially fraught with unintended consequences.[2]
irqentry_state_t was already an object being used in the exception
handling and is straightforward.  It is also easy for any number of
nested states to be tracked and can eventually be enhanced to store the
reference counting required to support PKS through kmap reentry.

Preserve the current task's PKRS values in irqentry_state_t on exception
entry and restore them on exception exit.

Each nested exception is further saved allowing for any number of levels
of exception handling.
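
In idtentry terms the resulting flow is roughly as below (simplified; the
actual hooks are irq_save_pkrs()/irq_restore_pkrs() wired into the entry/exit
paths by this patch):

	irqentry_state_t state;

	irqentry_enter(regs, &state);	/* saves the task's PKRS, loads INIT_PKRS_VALUE */
	/*
	 * The handler runs with the restrictive default; a nested exception
	 * repeats the same save/restore with its own on-stack state.
	 */
	irqentry_exit(regs, &state);	/* restores the PKRS saved on entry */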

Peter and Thomas both suggested parts of the patch, IDT and NMI respectively.

[1] 
https://lore.kernel.org/lkml/calcetrve1i5jdyzd_bcctxqjn+ze3t38efpgjxn1f577m36...@mail.gmail.com/
[2] https://lore.kernel.org/lkml/874kpxx4jf@nanos.tec.linutronix.de/#t

Cc: Dave Hansen 
Cc: Andy Lutomirski 
Suggested-by: Peter Zijlstra 
Suggested-by: Thomas Gleixner 
Signed-off-by: Ira Weiny 
---
 arch/x86/entry/common.c | 43 +
 arch/x86/include/asm/pkeys_common.h |  5 ++--
 arch/x86/kernel/cpu/mce/core.c  |  4 +++
 arch/x86/mm/pkeys.c |  2 +-
 include/linux/entry-common.h| 12 
 kernel/entry/common.c   | 12 ++--
 6 files changed, 73 insertions(+), 5 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 305da13770b6..324a8fd5ac10 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef CONFIG_XEN_PV
 #include 
@@ -222,6 +223,8 @@ noinstr void idtentry_enter_nmi(struct pt_regs *regs, 
irqentry_state_t *irq_stat
trace_hardirqs_off_finish();
ftrace_nmi_enter();
instrumentation_end();
+
+   irq_save_pkrs(irq_state);
 }
 
 noinstr void idtentry_exit_nmi(struct pt_regs *regs, irqentry_state_t 
*irq_state)
@@ -238,9 +241,47 @@ noinstr void idtentry_exit_nmi(struct pt_regs *regs, 
irqentry_state_t *irq_state
lockdep_hardirq_exit();
if (irq_state->exit_rcu)
lockdep_hardirqs_on(CALLER_ADDR0);
+
+   irq_restore_pkrs(irq_state);
__nmi_exit();
 }
 
+#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
+/*
+ * PKRS is a per-logical-processor MSR which overlays additional protection for
+ * pages which have been mapped with a protection key.
+ *
+ * The register is not maintained with XSAVE so we have to maintain the MSR
+ * value in software during context switch and exception handling.
+ *
+ * Context switches save the MSR in the task struct thus taking that value to
+ * other processors if necessary.
+ *
+ * To protect against exceptions having access to this memory we save the
+ * current running value and set the default PKRS value for the duration of the
+ * exception.  Thus preventing exception handlers from having the elevated
+ * access of the interrupted task.
+ */
+noinstr void irq_save_pkrs(irqentry_state_t *state)
+{
+   if (!cpu_feature_enabled(X86_FEATURE_PKS))
+   return;
+
+   state->thread_pkrs = current->thread.saved_pkrs;
+   state->pkrs = this_cpu_read(pkrs_cache);
+   write_pkrs(INIT_PKRS_VALUE);
+}
+
+noinstr void irq_restore_pkrs(irqentry_state_t *state)
+{
+   if (!cpu_feature_enabled(X86_FEATURE_PKS))
+   return;
+
+   write_pkrs(state->pkrs);
+   current->thread.saved_pkrs = state->thread_pkrs;
+}
+#endif /* CONFIG_ARCH_HAS_SUPERVISOR_PKEYS */
+
 #ifdef CONFIG_XEN_PV
 #ifndef CONFIG_PREEMPTION
 /*
@@ -304,6 +345,8 @@ __visible noinstr void xen_pv_evtchn_do_upcall(struct 
pt_regs *regs)
 
inhcall = get_and_clear_inhcall();
if (inhcall && !WARN_ON_ONCE(state.exit_rcu)) {
+   /* Normally called by irqentry_exit, we must restore pkrs here 
*/
+   irq_restore_pkrs();
instrumentation_begin();
irqentry_exit_cond_resched();
instrumentation_end();
diff --git a/arch/x86/include/asm/pkeys_common.h 
b/arch/x86/include/asm/pkeys_common.h
index 40464c170522..8961e2ddd6ff 100644
--- a/arch/x86/include/asm/pkeys_common.h
+++ b/arch/x86/include/asm/pkeys_common.h
@@ -27,9 +27,10 @@
 #definePKS_NUM_KEYS16
 
 #ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
-void write_pkrs(u32 new_pkrs);
+DECLARE_PER_CPU(u32, pkrs_cache);
+noinstr void write_pkrs(u32 new_pkrs);
 #else
-static inline void write_pkrs(u32 new_pkrs) { }
+static __always_inline void write_pkrs(u32 new_pkrs) { }
 #endif
 
 #endif /*_ASM_X86_PKEYS_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index f43a78bde670..abcd41f19669 100644
--- a/a

[PATCH RFC V3 8/9] x86/fault: Report the PKRS state on fault

2020-10-09 Thread ira . weiny
From: Ira Weiny 

When only user space pkeys are enabled, faulting within the kernel was an
unexpected condition which should never happen; therefore a WARN_ON was
added to the kernel fault handler to detect if it ever did.  Now that
PKS can be enabled this is no longer the case.

Report a Pkey fault with a normal splat and add the PKRS state to the
fault splat text.  Note the PKS register is reset during an exception
therefore the saved PKRS value from before the beginning of the
exception is passed down.

If PKS is not enabled, or not active, maintain the WARN_ON_ONCE() from
before.

Because each fault has its own state the pkrs information will be
correctly reported even if a fault 'faults'.

Suggested-by: Andy Lutomirski 
Signed-off-by: Ira Weiny 
---
 arch/x86/mm/fault.c | 59 ++---
 1 file changed, 34 insertions(+), 25 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index e55bc4bff389..ee761c993f58 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -504,7 +504,8 @@ static void show_ldttss(const struct desc_ptr *gdt, const 
char *name, u16 index)
 }
 
 static void
-show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long 
address)
+show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long 
address,
+   irqentry_state_t *irq_state)
 {
if (!oops_may_print())
return;
@@ -548,6 +549,11 @@ show_fault_oops(struct pt_regs *regs, unsigned long 
error_code, unsigned long ad
 (error_code & X86_PF_PK)? "protection keys violation" :
   "permissions violation");
 
+#ifdef CONFIG_ARCH_HAS_SUPERVISOR_PKEYS
+   if (irq_state && (error_code & X86_PF_PK))
+   pr_alert("PKRS: 0x%x\n", irq_state->pkrs);
+#endif
+
if (!(error_code & X86_PF_USER) && user_mode(regs)) {
struct desc_ptr idt, gdt;
u16 ldtr, tr;
@@ -626,7 +632,8 @@ static void set_signal_archinfo(unsigned long address,
 
 static noinline void
 no_context(struct pt_regs *regs, unsigned long error_code,
-  unsigned long address, int signal, int si_code)
+  unsigned long address, int signal, int si_code,
+  irqentry_state_t *irq_state)
 {
struct task_struct *tsk = current;
unsigned long flags;
@@ -732,7 +739,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 */
flags = oops_begin();
 
-   show_fault_oops(regs, error_code, address);
+   show_fault_oops(regs, error_code, address, irq_state);
 
if (task_stack_end_corrupted(tsk))
printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
@@ -785,7 +792,8 @@ static bool is_vsyscall_vaddr(unsigned long vaddr)
 
 static void
 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
-  unsigned long address, u32 pkey, int si_code)
+  unsigned long address, u32 pkey, int si_code,
+  irqentry_state_t *state)
 {
struct task_struct *tsk = current;
 
@@ -832,14 +840,14 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned 
long error_code,
if (is_f00f_bug(regs, address))
return;
 
-   no_context(regs, error_code, address, SIGSEGV, si_code);
+   no_context(regs, error_code, address, SIGSEGV, si_code, state);
 }
 
 static noinline void
 bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
-unsigned long address)
+unsigned long address, irqentry_state_t *state)
 {
-   __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);
+   __bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR, 
state);
 }
 
 static void
@@ -853,7 +861,7 @@ __bad_area(struct pt_regs *regs, unsigned long error_code,
 */
mmap_read_unlock(mm);
 
-   __bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
+   __bad_area_nosemaphore(regs, error_code, address, pkey, si_code, NULL);
 }
 
 static noinline void
@@ -923,7 +931,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, 
unsigned long address,
 {
/* Kernel mode? Handle exceptions or die: */
if (!(error_code & X86_PF_USER)) {
-   no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
+   no_context(regs, error_code, address, SIGBUS, BUS_ADRERR, NULL);
return;
}
 
@@ -957,7 +965,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long 
error_code,
   unsigned long address, vm_fault_t fault)
 {
if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
-   no_context(regs, error_code, address, 0, 0);
+   no_context(regs, error_code, address, 0, 0, NULL);
return;
   

[PATCH RFC V3 1/9] x86/pkeys: Create pkeys_common.h

2020-10-09 Thread ira . weiny
From: Ira Weiny 

Protection Keys User (PKU) and Protection Keys Supervisor (PKS) work in
similar fashions and can share common defines.  Normally, these defines
would be put in asm/pkeys.h to be used internally and externally to the
arch code.  However, the defines are required in pgtable.h and inclusion
of pkeys.h in that header creates complex dependencies which are best
resolved in a separate header.

Share these defines by moving them into a new header, change their
names to reflect the common use, and include the header where needed.

Signed-off-by: Ira Weiny 

---
NOTE: The initialization of init_pkru_value causes checkpatch errors
because of the space after the '(' in the macros.  We leave this as is
because it is more readable in this format.  And it was existing code.
---
 arch/x86/include/asm/pgtable.h  | 13 ++---
 arch/x86/include/asm/pkeys.h|  2 ++
 arch/x86/include/asm/pkeys_common.h | 11 +++
 arch/x86/kernel/fpu/xstate.c|  8 
 arch/x86/mm/pkeys.c | 14 ++
 5 files changed, 29 insertions(+), 19 deletions(-)
 create mode 100644 arch/x86/include/asm/pkeys_common.h

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index b836138ce852..2576154be6cf 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1361,9 +1361,7 @@ static inline pmd_t pmd_swp_clear_uffd_wp(pmd_t pmd)
 }
 #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_WP */
 
-#define PKRU_AD_BIT 0x1
-#define PKRU_WD_BIT 0x2
-#define PKRU_BITS_PER_PKEY 2
+#include 
 
 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
 extern u32 init_pkru_value;
@@ -1373,18 +1371,19 @@ extern u32 init_pkru_value;
 
 static inline bool __pkru_allows_read(u32 pkru, u16 pkey)
 {
-   int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
-   return !(pkru & (PKRU_AD_BIT << pkru_pkey_bits));
+   int pkru_pkey_bits = pkey * PKR_BITS_PER_PKEY;
+
+   return !(pkru & (PKR_AD_BIT << pkru_pkey_bits));
 }
 
 static inline bool __pkru_allows_write(u32 pkru, u16 pkey)
 {
-   int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY;
+   int pkru_pkey_bits = pkey * PKR_BITS_PER_PKEY;
/*
 * Access-disable disables writes too so we need to check
 * both bits here.
 */
-   return !(pkru & ((PKRU_AD_BIT|PKRU_WD_BIT) << pkru_pkey_bits));
+   return !(pkru & ((PKR_AD_BIT|PKR_WD_BIT) << pkru_pkey_bits));
 }
 
 static inline u16 pte_flags_pkey(unsigned long pte_flags)
diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h
index 2ff9b98812b7..f9feba80894b 100644
--- a/arch/x86/include/asm/pkeys.h
+++ b/arch/x86/include/asm/pkeys.h
@@ -2,6 +2,8 @@
 #ifndef _ASM_X86_PKEYS_H
 #define _ASM_X86_PKEYS_H
 
+#include 
+
 #define ARCH_DEFAULT_PKEY  0
 
 /*
diff --git a/arch/x86/include/asm/pkeys_common.h 
b/arch/x86/include/asm/pkeys_common.h
new file mode 100644
index ..a9f086f1e4b4
--- /dev/null
+++ b/arch/x86/include/asm/pkeys_common.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_PKEYS_INTERNAL_H
+#define _ASM_X86_PKEYS_INTERNAL_H
+
+#define PKR_AD_BIT 0x1
+#define PKR_WD_BIT 0x2
+#define PKR_BITS_PER_PKEY 2
+
+#define PKR_AD_KEY(pkey)   (PKR_AD_BIT << ((pkey) * PKR_BITS_PER_PKEY))
+
+#endif /*_ASM_X86_PKEYS_INTERNAL_H */
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 038e19c0019e..b55cf3acd82a 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -991,7 +991,7 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int 
pkey,
unsigned long init_val)
 {
u32 old_pkru;
-   int pkey_shift = (pkey * PKRU_BITS_PER_PKEY);
+   int pkey_shift = (pkey * PKR_BITS_PER_PKEY);
u32 new_pkru_bits = 0;
 
/*
@@ -1010,16 +1010,16 @@ int arch_set_user_pkey_access(struct task_struct *tsk, 
int pkey,
 
/* Set the bits we need in PKRU:  */
if (init_val & PKEY_DISABLE_ACCESS)
-   new_pkru_bits |= PKRU_AD_BIT;
+   new_pkru_bits |= PKR_AD_BIT;
if (init_val & PKEY_DISABLE_WRITE)
-   new_pkru_bits |= PKRU_WD_BIT;
+   new_pkru_bits |= PKR_WD_BIT;
 
/* Shift the bits in to the correct place in PKRU for pkey: */
new_pkru_bits <<= pkey_shift;
 
/* Get old PKRU and mask off any old bits in place: */
old_pkru = read_pkru();
-   old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
+   old_pkru &= ~((PKR_AD_BIT|PKR_WD_BIT) << pkey_shift);
 
/* Write old part along with new part: */
write_pkru(old_pkru | new_pkru_bits);
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index 8873ed1438a9..f5efb4007e74 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -111,19 +111,17 @@ int __arch_override_mprotect_pkey(struct vm_area_struct 
*vma, int prot, int pkey
r

[PATCH RFC PKS/PMEM 01/58] x86/pks: Add a global pkrs option

2020-10-09 Thread ira . weiny
From: Ira Weiny 

Some users, such as kmap(), sometimes require PKS to be global.
However, updating all CPUs, and worse yet all threads, is expensive.

Introduce a global PKRS state which is checked at critical times to
allow the state to enable access when global PKS is required.  To
accomplish this with minimal locking, the code is carefully designed
with the following key concepts.

1) Borrow the idea of lazy TLB invalidations from the fault handler
   code.  When enabling PKS access we anticipate that other threads are
   not yet running.  However, if they are we catch the fault and clean
   up the MSR value.

2) When disabling PKS access we force all MSR values across all CPU's.
   This is required to block access as soon as possible.[1]  However, it
   is key that we never attempt to update the per-task PKS values
   directly.  See next point.

3) Per-task PKS values never get updated with global PKS values.  This
   is key to prevent locking requirements and a nearly intractable
   problem of trying to update every task in the system.  Here are a few
   key points.

   3a) The MSR value can be updated with the global PKS value if that
   global value happened to change while the task was running.

   3b) If the task was sleeping while the global PKS was updated then
   the global value is added in when task's are scheduled.

   3c) If the global PKS value restricts access the MSR is updated as
   soon as possible[1] and the thread value is not updated which ensures
   the thread does not retain the elevated privileges after a context
   switch.

4) Follow on patches must be careful to preserve the separation of the
   thread PKRS value and the MSR value.

5) Access Disable on any individual pkey is turned into (Access Disable
   | Write Disable) to facilitate faster integration of the global value
   into the thread local MSR through a simple '&' operation.  Doing
   otherwise would result in complicated individual bit manipulation for
   each pkey.  (A rough sketch of this merge follows below.)

[1] There is a race condition which is ignored for performance reasons.  This
potentially allows access to a thread until the end of its time slice.  After
the context switch the global value will be restored.
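
As a rough illustration of point 5 above (the name pkrs_global_cache is
hypothetical here; the real plumbing is in the patch below), merging the
global value into the thread value can then be a single '&':

	/*
	 * Every global restriction is stored as (AD | WD), so the AND never
	 * accidentally clears a restriction the thread still needs, while a
	 * globally enabled key (bits == 0) always wins.
	 */
	u32 pkrs = current->thread.saved_pkrs & READ_ONCE(pkrs_global_cache);
	write_pkrs(pkrs);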

Signed-off-by: Ira Weiny 
---
 Documentation/core-api/protection-keys.rst |  11 +-
 arch/x86/entry/common.c|   7 +
 arch/x86/include/asm/pkeys.h   |   6 +-
 arch/x86/include/asm/pkeys_common.h|   8 +-
 arch/x86/kernel/process.c  |  74 +++-
 arch/x86/mm/fault.c| 189 -
 arch/x86/mm/pkeys.c|  88 --
 include/linux/pkeys.h  |   6 +-
 lib/pks/pks_test.c |  16 +-
 9 files changed, 329 insertions(+), 76 deletions(-)

diff --git a/Documentation/core-api/protection-keys.rst 
b/Documentation/core-api/protection-keys.rst
index c60366921d60..9e8a98653e13 100644
--- a/Documentation/core-api/protection-keys.rst
+++ b/Documentation/core-api/protection-keys.rst
@@ -121,9 +121,9 @@ mapping adds that mapping to the protection domain.
 int pks_key_alloc(const char * const pkey_user);
 #define PAGE_KERNEL_PKEY(pkey)
 #define _PAGE_KEY(pkey)
-void pks_mknoaccess(int pkey);
-void pks_mkread(int pkey);
-void pks_mkrdwr(int pkey);
+void pks_mknoaccess(int pkey, bool global);
+void pks_mkread(int pkey, bool global);
+void pks_mkrdwr(int pkey, bool global);
 void pks_key_free(int pkey);
 
 pks_key_alloc() allocates keys dynamically to allow better use of the limited
@@ -141,7 +141,10 @@ _PAGE_KEY().
 The pks_mk*() family of calls allows kernel users the ability to change the
 protections for the domain identified by the pkey specified.  3 states are
 available pks_mknoaccess(), pks_mkread(), and pks_mkrdwr() which set the access
-to none, read, and read/write respectively.
+to none, read, and read/write respectively.  'global' specifies that the
+protection should be set across all threads (logical CPU's) not just the
+current running thread/CPU.  This increases the overhead of PKS and lessens the
+protection so it should be used sparingly.
 
 Finally, pks_key_free() allows a user to return the key to the allocator for
 use by others.
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 324a8fd5ac10..86ad32e0095e 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -261,12 +261,19 @@ noinstr void idtentry_exit_nmi(struct pt_regs *regs, 
irqentry_state_t *irq_state
  * current running value and set the default PKRS value for the duration of the
  * exception.  Thus preventing exception handlers from having the elevated
  * access of the interrupted task.
+ *
+ * NOTE That the thread saved PKRS must be preserved separately to ensure
+ * global overrides do not 'stick' on a thread.
  */
 noinstr void irq_save_pkrs(irqentry_state_t *s

[PATCH RFC PKS/PMEM 00/58] PMEM: Introduce stray write protection for PMEM

2020-10-09 Thread ira . weiny
From: Ira Weiny 

Should a stray write in the kernel occur, persistent memory is affected more
than regular memory.  A write to the wrong area of memory could result in
latent data corruption which will persist after a reboot.  PKS provides a
nice way to restrict access to persistent memory kernel mappings, while
providing fast access when needed.

Since the last RFC[1] this patch set has grown quite a bit.  It now depends on
the core patches submitted separately.


https://lore.kernel.org/lkml/20201009194258.3207172-1-ira.we...@intel.com/

And contained in the git tree here:

https://github.com/weiny2/linux-kernel/tree/pks-rfc-v3

However, functionally there is only 1 major change from the last RFC.
Specifically, kmap() is most often used within a single thread in a 'map/do
something/unmap' pattern.  In fact this is the pattern used in ~90% of the
callers of kmap().  This pattern works very well for the pmem use case and the
testing which was done.  However, there were another ~20-30 kmap users which do
not follow this pattern.  Some of them seem to expect the mapping to be
'global' while others require a detailed audit to be sure.[2][3]

While we don't anticipate global mappings to pmem there is a danger in
changing the semantics of kmap().  Effectively, this would cause an unresolved
page fault with little to no information about why.

There were a number of options considered.

1) Attempt to change all the thread local kmap() calls to kmap_atomic()
2) Introduce a flags parameter to kmap() to indicate if the mapping should be
   global or not
3) Change ~20-30 call sites to 'kmap_global()' to indicate that they require a
   global mapping of the pages
4) Change ~209 call sites to 'kmap_thread()' to indicate that the mapping is to
   be used within that thread of execution only

Option 1 is simply not feasible: kmap_atomic() does not have the same semantics
as kmap() within a single thread.  Option 2 would require all of the call sites of
kmap() to change.  Option 3 seems like a good minimal change but there is a
danger that new code may miss the semantic change of kmap() and not get the
behavior intended for future users.  Therefore, option #4 was chosen.
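
The thread-local pattern option #4 captures is roughly the following
(illustrative; the per-driver conversions follow later in this series):

	addr = kmap_thread(page);		/* access granted to this thread only */
	memcpy(addr + offset, src, len);	/* do something with the mapping */
	kunmap_thread(page);			/* access revoked */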

To handle the global PKRS state in the most efficient manner possible, we
lazily override the thread specific PKRS key value only when needed because we
anticipate PKS will not be needed most of the time.  And even
when it is used, 90% of the time it is a thread local call.


[1] https://lore.kernel.org/lkml/20200717072056.73134-1-ira.we...@intel.com/

[2] The following list of callers continue calling kmap() (utilizing the global
PKRS).  It would be nice if more of them could be converted to kmap_thread()

drivers/firewire/net.c: ptr = 
kmap(dev->broadcast_rcv_buffer.pages[u]);
drivers/gpu/drm/i915/gem/i915_gem_pages.c:  return 
kmap(sg_page(sgt->sgl));
drivers/gpu/drm/ttm/ttm_bo_util.c:  map->virtual = 
kmap(map->page);
drivers/infiniband/hw/qib/qib_user_sdma.c:  mpage = kmap(page);
drivers/misc/vmw_vmci/vmci_host.c:  context->notify = 
kmap(context->notify_page) + (uva & (PAGE_SIZE - 1));
drivers/misc/xilinx_sdfec.c:addr = kmap(pages[i]);
drivers/mmc/host/usdhi6rol0.c:  host->pg.mapped = 
kmap(host->pg.page);
drivers/mmc/host/usdhi6rol0.c:  host->pg.mapped = kmap(host->pg.page);
drivers/mmc/host/usdhi6rol0.c:  host->pg.mapped = kmap(host->pg.page);
drivers/nvme/target/tcp.c:  iov->iov_base = 
kmap(sg_page(sg)) + sg->offset + sg_offset;
drivers/scsi/libiscsi_tcp.c:segment->sg_mapped = 
kmap(sg_page(sg));
drivers/target/iscsi/iscsi_target.c:iov[i].iov_base = 
kmap(sg_page(sg)) + sg->offset + page_off;
drivers/target/target_core_transport.c: return 
kmap(sg_page(sg)) + sg->offset;
fs/btrfs/check-integrity.c: block_ctx->datav[i] = 
kmap(block_ctx->pagev[i]);
fs/ceph/dir.c:  cache_ctl->dentries = kmap(cache_ctl->page);
fs/ceph/inode.c:ctl->dentries = kmap(ctl->page);
fs/erofs/zpvec.h:   kmap_atomic(ctor->curr) : 
kmap(ctor->curr);
lib/scatterlist.c:  miter->addr = kmap(miter->page) + 
miter->__offset;
net/ceph/pagelist.c:pl->mapped_tail = kmap(page);
net/ceph/pagelist.c:pl->mapped_tail = kmap(page);
virt/kvm/kvm_main.c:hva = kmap(page);

[3] The following appear to follow the same pattern as ext2 which was converted
after some code audit.  So I _think_ they too could be converted to
k[un]map_thread().

fs/freevxfs/vxfs_subr.c|75| kmap(pp);
fs/jfs/jfs_metapage.c|102| kmap(page);
fs/jfs/jfs_metapage.c|156| kmap(page);
fs/minix/dir.c|72| kmap(pa

[PATCH RFC PKS/PMEM 02/58] x86/pks/test: Add testing for global option

2020-10-09 Thread ira . weiny
From: Ira Weiny 

Now that PKS can be enabled globally (for all threads) add a test which
spawns a thread and tests the same PKS functionality.

The test enables/disables PKS in 1 thread while attempting to access the
page in another thread.  We use the same test array as in the 'local'
PKS testing.

Signed-off-by: Ira Weiny 
---
 arch/x86/mm/fault.c |   4 ++
 lib/pks/pks_test.c  | 128 +---
 2 files changed, 124 insertions(+), 8 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 4b4ff9efa298..4c74f52fbc23 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1108,6 +1108,10 @@ static int spurious_kernel_fault_check(unsigned long 
error_code, pte_t *pte,
if (global_pkey_is_enabled(pte, is_write, irq_state))
return 1;
 
+   /*
+* NOTE: This must be after the global_pkey_is_enabled() call
+* to allow the fixup code to be tested.
+*/
if (handle_pks_testing(error_code, irq_state))
return 1;
 
diff --git a/lib/pks/pks_test.c b/lib/pks/pks_test.c
index 286c8b8457da..dfddccbe4cb6 100644
--- a/lib/pks/pks_test.c
+++ b/lib/pks/pks_test.c
@@ -154,7 +154,8 @@ static void check_exception(irqentry_state_t *irq_state)
}
 
/* Check the exception state */
-   if (!check_pkrs(test_armed_key, PKEY_DISABLE_ACCESS)) {
+   if (!check_pkrs(test_armed_key,
+   PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)) {
pr_err(" FAIL: PKRS cache and MSR\n");
test_exception_ctx->pass = false;
}
@@ -308,24 +309,29 @@ static int test_it(struct pks_test_ctx *ctx, struct 
pks_access_test *test, void
return ret;
 }
 
-static int run_access_test(struct pks_test_ctx *ctx,
-  struct pks_access_test *test,
-  void *ptr)
+static void set_protection(int pkey, enum pks_access_mode mode, bool global)
 {
-   switch (test->mode) {
+   switch (mode) {
case PKS_TEST_NO_ACCESS:
-   pks_mknoaccess(ctx->pkey, false);
+   pks_mknoaccess(pkey, global);
break;
case PKS_TEST_RDWR:
-   pks_mkrdwr(ctx->pkey, false);
+   pks_mkrdwr(pkey, global);
break;
case PKS_TEST_RDONLY:
-   pks_mkread(ctx->pkey, false);
+   pks_mkread(pkey, global);
break;
default:
pr_err("BUG in test invalid mode\n");
break;
}
+}
+
+static int run_access_test(struct pks_test_ctx *ctx,
+  struct pks_access_test *test,
+  void *ptr)
+{
+   set_protection(ctx->pkey, test->mode, false);
 
return test_it(ctx, test, ptr);
 }
@@ -516,6 +522,110 @@ static void run_exception_test(void)
 pass ? "PASS" : "FAIL");
 }
 
+struct shared_data {
+   struct mutex lock;
+   struct pks_test_ctx *ctx;
+   void *kmap_addr;
+   struct pks_access_test *test;
+};
+
+static int thread_main(void *d)
+{
+   struct shared_data *data = d;
+   struct pks_test_ctx *ctx = data->ctx;
+
+   while (!kthread_should_stop()) {
+   mutex_lock(&data->lock);
+   /*
+* Wait for the main thread to hand us the page.  We should be
+* spinning, so hopefully we will not have picked up the global
+* value from being scheduled in.
+*/
+   if (data->kmap_addr) {
+   if (test_it(ctx, data->test, data->kmap_addr))
+   ctx->pass = false;
+   data->kmap_addr = NULL;
+   }
+   mutex_unlock(&data->lock);
+   }
+
+   return 0;
+}
+
+static void run_thread_access_test(struct shared_data *data,
+  struct pks_test_ctx *ctx,
+  struct pks_access_test *test,
+  void *ptr)
+{
+   set_protection(ctx->pkey, test->mode, true);
+
+   pr_info("checking...  mode %s; write %s\n",
+   get_mode_str(test->mode), test->write ? "TRUE" : "FALSE");
+
+   mutex_lock(&data->lock);
+   data->test = test;
+   data->kmap_addr = ptr;
+   mutex_unlock(&data->lock);
+
+   while (data->kmap_addr) {
+   msleep(10);
+   }
+}
+
+static void run_global_test(void)
+{
+   struct task_struct *other_task;
+   struct pks_test_ctx *ctx;
+   struct shared_data data;
+   bool pass = true;
+   void *ptr;
+   int i;
+
+   pr_inf

[PATCH RFC PKS/PMEM 10/58] drivers/rdma: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in these drivers are localized to a single thread.  To
avoid the overhead of global PKRS updates, use the new kmap_thread()
call.

Cc: Mike Marciniszyn 
Cc: Dennis Dalessandro 
Cc: Doug Ledford 
Cc: Jason Gunthorpe 
Cc: Faisal Latif 
Cc: Shiraz Saleem 
Cc: Bernard Metzler 
Signed-off-by: Ira Weiny 
---
 drivers/infiniband/hw/hfi1/sdma.c  |  4 ++--
 drivers/infiniband/hw/i40iw/i40iw_cm.c | 10 +-
 drivers/infiniband/sw/siw/siw_qp_tx.c  | 14 +++---
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/sdma.c 
b/drivers/infiniband/hw/hfi1/sdma.c
index 04575c9afd61..09d206e3229a 100644
--- a/drivers/infiniband/hw/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -3130,7 +3130,7 @@ int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, 
struct sdma_txreq *tx,
}
 
if (type == SDMA_MAP_PAGE) {
-   kvaddr = kmap(page);
+   kvaddr = kmap_thread(page);
kvaddr += offset;
} else if (WARN_ON(!kvaddr)) {
__sdma_txclean(dd, tx);
@@ -3140,7 +3140,7 @@ int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, 
struct sdma_txreq *tx,
memcpy(tx->coalesce_buf + tx->coalesce_idx, kvaddr, len);
tx->coalesce_idx += len;
if (type == SDMA_MAP_PAGE)
-   kunmap(page);
+   kunmap_thread(page);
 
/* If there is more data, return */
if (tx->tlen - tx->coalesce_idx)
diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c 
b/drivers/infiniband/hw/i40iw/i40iw_cm.c
index a3b95805c154..122d7a5642a1 100644
--- a/drivers/infiniband/hw/i40iw/i40iw_cm.c
+++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
@@ -3721,7 +3721,7 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct 
iw_cm_conn_param *conn_param)
ibmr->device = iwpd->ibpd.device;
iwqp->lsmm_mr = ibmr;
if (iwqp->page)
-   iwqp->sc_qp.qp_uk.sq_base = kmap(iwqp->page);
+   iwqp->sc_qp.qp_uk.sq_base = kmap_thread(iwqp->page);
dev->iw_priv_qp_ops->qp_send_lsmm(&iwqp->sc_qp,
iwqp->ietf_mem.va,
(accept.size + conn_param->private_data_len),
@@ -3729,12 +3729,12 @@ int i40iw_accept(struct iw_cm_id *cm_id, struct 
iw_cm_conn_param *conn_param)
 
} else {
if (iwqp->page)
-   iwqp->sc_qp.qp_uk.sq_base = kmap(iwqp->page);
+   iwqp->sc_qp.qp_uk.sq_base = kmap_thread(iwqp->page);
dev->iw_priv_qp_ops->qp_send_lsmm(&iwqp->sc_qp, NULL, 0, 0);
}
 
if (iwqp->page)
-   kunmap(iwqp->page);
+   kunmap_thread(iwqp->page);
 
iwqp->cm_id = cm_id;
cm_node->cm_id = cm_id;
@@ -4102,10 +4102,10 @@ static void i40iw_cm_event_connected(struct 
i40iw_cm_event *event)
i40iw_cm_init_tsa_conn(iwqp, cm_node);
read0 = (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO);
if (iwqp->page)
-   iwqp->sc_qp.qp_uk.sq_base = kmap(iwqp->page);
+   iwqp->sc_qp.qp_uk.sq_base = kmap_thread(iwqp->page);
dev->iw_priv_qp_ops->qp_send_rtt(&iwqp->sc_qp, read0);
if (iwqp->page)
-   kunmap(iwqp->page);
+   kunmap_thread(iwqp->page);
 
memset(&attr, 0, sizeof(attr));
attr.qp_state = IB_QPS_RTS;
diff --git a/drivers/infiniband/sw/siw/siw_qp_tx.c 
b/drivers/infiniband/sw/siw/siw_qp_tx.c
index d19d8325588b..4ed37c328d02 100644
--- a/drivers/infiniband/sw/siw/siw_qp_tx.c
+++ b/drivers/infiniband/sw/siw/siw_qp_tx.c
@@ -76,7 +76,7 @@ static int siw_try_1seg(struct siw_iwarp_tx *c_tx, void 
*paddr)
if (unlikely(!p))
return -EFAULT;
 
-   buffer = kmap(p);
+   buffer = kmap_thread(p);
 
if (likely(PAGE_SIZE - off >= bytes)) {
memcpy(paddr, buffer + off, bytes);
@@ -84,7 +84,7 @@ static int siw_try_1seg(struct siw_iwarp_tx *c_tx, void 
*paddr)
unsigned long part = bytes - (PAGE_SIZE - off);
 
memcpy(paddr, buffer + off, part);
-   kunmap(p);
+   kunmap_thread(p);
 
if (!mem->is_pbl)
p = siw_get_upage(mem->umem,
@@ -96,10 +96,10 @@ static int siw_try_1seg(struct siw_iwarp_tx *c_tx, void 
*paddr)
if (unlikely(!p))
return -EFAULT;
 
-   

[PATCH RFC PKS/PMEM 09/58] drivers/gpu: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

These kmap() calls in the gpu stack are localized to a single thread.
To avoid the overhead of global PKRS updates, use the new kmap_thread()
call.

Cc: David Airlie 
Cc: Daniel Vetter 
Cc: Patrik Jakobsson 
Signed-off-by: Ira Weiny 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c  | 12 ++--
 drivers/gpu/drm/gma500/gma_display.c |  4 ++--
 drivers/gpu/drm/gma500/mmu.c | 10 +-
 drivers/gpu/drm/i915/gem/i915_gem_shmem.c|  4 ++--
 .../gpu/drm/i915/gem/selftests/i915_gem_context.c|  4 ++--
 drivers/gpu/drm/i915/gem/selftests/i915_gem_mman.c   |  8 
 drivers/gpu/drm/i915/gt/intel_ggtt_fencing.c |  4 ++--
 drivers/gpu/drm/i915/gt/intel_gtt.c  |  4 ++--
 drivers/gpu/drm/i915/gt/shmem_utils.c|  4 ++--
 drivers/gpu/drm/i915/i915_gem.c  |  8 
 drivers/gpu/drm/i915/i915_gpu_error.c|  4 ++--
 drivers/gpu/drm/i915/selftests/i915_perf.c   |  4 ++--
 drivers/gpu/drm/radeon/radeon_ttm.c  |  4 ++--
 13 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 978bae731398..bd564bccb7a3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -2437,11 +2437,11 @@ static ssize_t amdgpu_ttm_gtt_read(struct file *f, char 
__user *buf,
 
page = adev->gart.pages[p];
if (page) {
-   ptr = kmap(page);
+   ptr = kmap_thread(page);
ptr += off;
 
r = copy_to_user(buf, ptr, cur_size);
-   kunmap(adev->gart.pages[p]);
+   kunmap_thread(adev->gart.pages[p]);
} else
r = clear_user(buf, cur_size);
 
@@ -2507,9 +2507,9 @@ static ssize_t amdgpu_iomem_read(struct file *f, char 
__user *buf,
if (p->mapping != adev->mman.bdev.dev_mapping)
return -EPERM;
 
-   ptr = kmap(p);
+   ptr = kmap_thread(p);
r = copy_to_user(buf, ptr + off, bytes);
-   kunmap(p);
+   kunmap_thread(p);
if (r)
return -EFAULT;
 
@@ -2558,9 +2558,9 @@ static ssize_t amdgpu_iomem_write(struct file *f, const 
char __user *buf,
if (p->mapping != adev->mman.bdev.dev_mapping)
return -EPERM;
 
-   ptr = kmap(p);
+   ptr = kmap_thread(p);
r = copy_from_user(ptr + off, buf, bytes);
-   kunmap(p);
+   kunmap_thread(p);
if (r)
return -EFAULT;
 
diff --git a/drivers/gpu/drm/gma500/gma_display.c 
b/drivers/gpu/drm/gma500/gma_display.c
index 3df6d6e850f5..35f4e55c941f 100644
--- a/drivers/gpu/drm/gma500/gma_display.c
+++ b/drivers/gpu/drm/gma500/gma_display.c
@@ -400,9 +400,9 @@ int gma_crtc_cursor_set(struct drm_crtc *crtc,
/* Copy the cursor to cursor mem */
tmp_dst = dev_priv->vram_addr + cursor_gt->offset;
for (i = 0; i < cursor_pages; i++) {
-   tmp_src = kmap(gt->pages[i]);
+   tmp_src = kmap_thread(gt->pages[i]);
memcpy(tmp_dst, tmp_src, PAGE_SIZE);
-   kunmap(gt->pages[i]);
+   kunmap_thread(gt->pages[i]);
tmp_dst += PAGE_SIZE;
}
 
diff --git a/drivers/gpu/drm/gma500/mmu.c b/drivers/gpu/drm/gma500/mmu.c
index 505044c9a673..fba7a3a461fd 100644
--- a/drivers/gpu/drm/gma500/mmu.c
+++ b/drivers/gpu/drm/gma500/mmu.c
@@ -192,20 +192,20 @@ struct psb_mmu_pd *psb_mmu_alloc_pd(struct psb_mmu_driver 
*driver,
pd->invalid_pte = 0;
}
 
-   v = kmap(pd->dummy_pt);
+   v = kmap_thread(pd->dummy_pt);
for (i = 0; i < (PAGE_SIZE / sizeof(uint32_t)); ++i)
v[i] = pd->invalid_pte;
 
-   kunmap(pd->dummy_pt);
+   kunmap_thread(pd->dummy_pt);
 
-   v = kmap(pd->p);
+   v = kmap_thread(pd->p);
for (i = 0; i < (PAGE_SIZE / sizeof(uint32_t)); ++i)
v[i] = pd->invalid_pde;
 
-   kunmap(pd->p);
+   kunmap_thread(pd->p);
 
clear_page(kmap(pd->dummy_page));
-   kunmap(pd->dummy_page);
+   kunmap_thread(pd->dummy_page);
 
pd->tables = vmalloc_user(sizeof(struct psb_mmu_pt *) * 1024);
if (!pd->tables)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c 
b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
index 38113d3c0138..274424795fb7 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_shmem.c
@@ -566,9 +566,9 @@ i915_g

[PATCH RFC PKS/PMEM 16/58] fs/gfs2: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To avoid
the overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Bob Peterson 
Cc: Andreas Gruenbacher 
Signed-off-by: Ira Weiny 
---
 fs/gfs2/bmap.c   | 4 ++--
 fs/gfs2/ops_fstype.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 0f69fbd4af66..375af4528411 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -67,7 +67,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct 
buffer_head *dibh,
}
 
if (!PageUptodate(page)) {
-   void *kaddr = kmap(page);
+   void *kaddr = kmap_thread(page);
u64 dsize = i_size_read(inode);
  
if (dsize > gfs2_max_stuffed_size(ip))
@@ -75,7 +75,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct 
buffer_head *dibh,
 
memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
-   kunmap(page);
+   kunmap_thread(page);
 
SetPageUptodate(page);
}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 6d18d2c91add..a5d20d9b504a 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -263,9 +263,9 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t 
sector, int silent)
__free_page(page);
return -EIO;
}
-   p = kmap(page);
+   p = kmap_thread(page);
gfs2_sb_in(sdp, p);
-   kunmap(page);
+   kunmap_thread(page);
__free_page(page);
return gfs2_check_sb(sdp, silent);
 }
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 06/58] kmap: Introduce k[un]map_thread debugging

2020-10-09 Thread ira . weiny
From: Ira Weiny 

Most kmap() callers use the map within a single thread and have no need
for the protection domain to be enabled globally.

To differentiate these kmap users, new k[un]map_thread() calls were
introduced which are thread local.

To aid in debugging the new use of kmap_thread(), add a reference count,
a check on that count, and tracing to ID where mapping errors occur.
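
The out-of-line debug wrappers live in mm/debug.c (that hunk is truncated in
this archive); only as a sketch, assuming the __kmap()/__kunmap() internals
and the trace events shown below, they are expected to look roughly like:

	/* Sketch, not the actual mm/debug.c hunk. */
	void *kmap_thread(struct page *page)
	{
		trace_kmap_thread(current, page, __builtin_return_address(0),
				  ++current->kmap_thread_cnt);
		return __kmap(page, false);
	}

	void kunmap_thread(struct page *page)
	{
		__kunmap(page, false);
		trace_kunmap_thread(current, page, __builtin_return_address(0),
				    --current->kmap_thread_cnt);
	}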

Cc: Juri Lelli 
Cc: Vincent Guittot 
Cc: Dietmar Eggemann 
Cc: Steven Rostedt 
Cc: Ben Segall 
Cc: Mel Gorman 
Signed-off-by: Ira Weiny 
---
 include/linux/highmem.h|  5 +++
 include/linux/sched.h  |  5 +++
 include/trace/events/kmap_thread.h | 56 ++
 init/init_task.c   |  3 ++
 kernel/fork.c  | 15 
 lib/Kconfig.debug  |  8 +
 mm/debug.c | 23 
 7 files changed, 115 insertions(+)
 create mode 100644 include/trace/events/kmap_thread.h

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index ef7813544719..22d1c000802e 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -247,6 +247,10 @@ static inline void kunmap(struct page *page)
__kunmap(page, true);
 }
 
+#ifdef CONFIG_DEBUG_KMAP_THREAD
+void *kmap_thread(struct page *page);
+void kunmap_thread(struct page *page);
+#else
 static inline void *kmap_thread(struct page *page)
 {
return __kmap(page, false);
@@ -255,6 +259,7 @@ static inline void kunmap_thread(struct page *page)
 {
__kunmap(page, false);
 }
+#endif
 
 /*
  * Prevent people trying to call kunmap_atomic() as if it were kunmap()
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 25d97ab6c757..4627ea4a49e6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1318,6 +1318,11 @@ struct task_struct {
 #ifdef CONFIG_ZONE_DEVICE_ACCESS_PROTECTION
unsigned intdev_page_access_ref;
 #endif
+
+#ifdef CONFIG_DEBUG_KMAP_THREAD
+   unsigned intkmap_thread_cnt;
+#endif
+
/*
 * New fields for task_struct should be added above here, so that
 * they are included in the randomized portion of task_struct.
diff --git a/include/trace/events/kmap_thread.h 
b/include/trace/events/kmap_thread.h
new file mode 100644
index ..e7143cfe0daf
--- /dev/null
+++ b/include/trace/events/kmap_thread.h
@@ -0,0 +1,56 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+
+/*
+ * Copyright (c) 2020 Intel Corporation.  All rights reserved.
+ *
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kmap_thread
+
+#if !defined(_TRACE_KMAP_THREAD_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KMAP_THREAD_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(kmap_thread_template,
+   TP_PROTO(struct task_struct *tsk, struct page *page,
+void *caller_addr, int cnt),
+   TP_ARGS(tsk, page, caller_addr, cnt),
+
+   TP_STRUCT__entry(
+   __field(int, pid)
+   __field(struct page *, page)
+   __field(void *, caller_addr)
+   __field(int, cnt)
+   ),
+
+   TP_fast_assign(
+   __entry->pid = tsk->pid;
+   __entry->page = page;
+   __entry->caller_addr = caller_addr;
+   __entry->cnt = cnt;
+   ),
+
+   TP_printk("PID %d; (%d) %pS %p",
+   __entry->pid,
+   __entry->cnt,
+   __entry->caller_addr,
+   __entry->page
+   )
+);
+
+DEFINE_EVENT(kmap_thread_template, kmap_thread,
+   TP_PROTO(struct task_struct *tsk, struct page *page,
+void *caller_addr, int cnt),
+   TP_ARGS(tsk, page, caller_addr, cnt));
+
+DEFINE_EVENT(kmap_thread_template, kunmap_thread,
+   TP_PROTO(struct task_struct *tsk, struct page *page,
+void *caller_addr, int cnt),
+   TP_ARGS(tsk, page, caller_addr, cnt));
+
+
+#endif /* _TRACE_KMAP_THREAD_H */
+
+#include <trace/define_trace.h>
diff --git a/init/init_task.c b/init/init_task.c
index 9b39f25de59b..19f09965eb34 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -212,6 +212,9 @@ struct task_struct init_task
 #ifdef CONFIG_ZONE_DEVICE_ACCESS_PROTECTION
.dev_page_access_ref = 0,
 #endif
+#ifdef CONFIG_DEBUG_KMAP_THREAD
+   .kmap_thread_cnt = 0,
+#endif
 };
 EXPORT_SYMBOL(init_task);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index b6a3ee328a89..2c66e49b7614 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -722,6 +722,17 @@ static inline void put_signal_struct(struct signal_struct 
*sig)
free_signal_struct(sig);
 }
 
+#ifdef CONFIG_DEBUG_KMAP_THREAD
+static void check_outstanding_kmap_thread(struct task_struct *tsk)
+{
+   if (tsk->kmap_thread_cnt)
+   pr_warn(KERN_ERR "WARNING: PID %d; Failed to kunmap_thread() [cnt %d]\n",
+   tsk->pid, tsk->kmap_thread_cnt);
+}
+#else
+static void check_outs

[PATCH RFC PKS/PMEM 15/58] fs/ecryptfs: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To avoid
the overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Herbert Xu 
Cc: Eric Biggers 
Cc: Aditya Pakki 
Signed-off-by: Ira Weiny 
---
 fs/ecryptfs/crypto.c | 8 
 fs/ecryptfs/read_write.c | 8 
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 0681540c48d9..e73e00994bee 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -469,10 +469,10 @@ int ecryptfs_encrypt_page(struct page *page)
}
 
lower_offset = lower_offset_for_page(crypt_stat, page);
-   enc_extent_virt = kmap(enc_extent_page);
+   enc_extent_virt = kmap_thread(enc_extent_page);
rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset,
  PAGE_SIZE);
-   kunmap(enc_extent_page);
+   kunmap_thread(enc_extent_page);
if (rc < 0) {
ecryptfs_printk(KERN_ERR,
"Error attempting to write lower page; rc = [%d]\n",
@@ -518,10 +518,10 @@ int ecryptfs_decrypt_page(struct page *page)
BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
 
lower_offset = lower_offset_for_page(crypt_stat, page);
-   page_virt = kmap(page);
+   page_virt = kmap_thread(page);
rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_SIZE,
 ecryptfs_inode);
-   kunmap(page);
+   kunmap_thread(page);
if (rc < 0) {
ecryptfs_printk(KERN_ERR,
"Error attempting to read lower page; rc = [%d]\n",
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 0438997ac9d8..5eca4330c0c0 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -64,11 +64,11 @@ int ecryptfs_write_lower_page_segment(struct inode 
*ecryptfs_inode,
 
offset = ((((loff_t)page_for_lower->index) << PAGE_SHIFT)
  + offset_in_page);
-   virt = kmap(page_for_lower);
+   virt = kmap_thread(page_for_lower);
rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size);
if (rc > 0)
rc = 0;
-   kunmap(page_for_lower);
+   kunmap_thread(page_for_lower);
return rc;
 }
 
@@ -251,11 +251,11 @@ int ecryptfs_read_lower_page_segment(struct page 
*page_for_ecryptfs,
int rc;
 
offset = ((((loff_t)page_index) << PAGE_SHIFT) + offset_in_page);
-   virt = kmap(page_for_ecryptfs);
+   virt = kmap_thread(page_for_ecryptfs);
rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode);
if (rc > 0)
rc = 0;
-   kunmap(page_for_ecryptfs);
+   kunmap_thread(page_for_ecryptfs);
flush_dcache_page(page_for_ecryptfs);
return rc;
 }
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 17/58] fs/nilfs2: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To avoid
the overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Ryusuke Konishi 
Signed-off-by: Ira Weiny 
---
 fs/nilfs2/alloc.c  | 34 +-
 fs/nilfs2/cpfile.c |  4 ++--
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index adf3bb0a8048..2aa4c34094ef 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -524,7 +524,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh);
if (ret < 0)
return ret;
-   desc_kaddr = kmap(desc_bh->b_page);
+   desc_kaddr = kmap_thread(desc_bh->b_page);
desc = nilfs_palloc_block_get_group_desc(
inode, group, desc_bh, desc_kaddr);
n = nilfs_palloc_rest_groups_in_desc_block(inode, group,
@@ -536,7 +536,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
inode, group, 1, &bitmap_bh);
if (ret < 0)
goto out_desc;
-   bitmap_kaddr = kmap(bitmap_bh->b_page);
+   bitmap_kaddr = kmap_thread(bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
pos = nilfs_palloc_find_available_slot(
bitmap, group_offset,
@@ -547,21 +547,21 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
desc, lock, -1);
req->pr_entry_nr =
entries_per_group * group + pos;
-   kunmap(desc_bh->b_page);
-   kunmap(bitmap_bh->b_page);
+   kunmap_thread(desc_bh->b_page);
+   kunmap_thread(bitmap_bh->b_page);
 
req->pr_desc_bh = desc_bh;
req->pr_bitmap_bh = bitmap_bh;
return 0;
}
-   kunmap(bitmap_bh->b_page);
+   kunmap_thread(bitmap_bh->b_page);
brelse(bitmap_bh);
}
 
group_offset = 0;
}
 
-   kunmap(desc_bh->b_page);
+   kunmap_thread(desc_bh->b_page);
brelse(desc_bh);
}
 
@@ -569,7 +569,7 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
return -ENOSPC;
 
  out_desc:
-   kunmap(desc_bh->b_page);
+   kunmap_thread(desc_bh->b_page);
brelse(desc_bh);
return ret;
 }
@@ -605,10 +605,10 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
spinlock_t *lock;
 
group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
-   desc_kaddr = kmap(req->pr_desc_bh->b_page);
+   desc_kaddr = kmap_thread(req->pr_desc_bh->b_page);
desc = nilfs_palloc_block_get_group_desc(inode, group,
 req->pr_desc_bh, desc_kaddr);
-   bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
+   bitmap_kaddr = kmap_thread(req->pr_bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
lock = nilfs_mdt_bgl_lock(inode, group);
 
@@ -620,8 +620,8 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
else
nilfs_palloc_group_desc_add_entries(desc, lock, 1);
 
-   kunmap(req->pr_bitmap_bh->b_page);
-   kunmap(req->pr_desc_bh->b_page);
+   kunmap_thread(req->pr_bitmap_bh->b_page);
+   kunmap_thread(req->pr_desc_bh->b_page);
 
mark_buffer_dirty(req->pr_desc_bh);
mark_buffer_dirty(req->pr_bitmap_bh);
@@ -646,10 +646,10 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
spinlock_t *lock;
 
group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
-   desc_kaddr = kmap(req->pr_desc_bh->b_page);
+   desc_kaddr = kmap_thread(req->pr_desc_bh->b_page);
desc = nilfs_palloc_block_get_group_desc(inode, group,
 req->pr_desc_bh, desc_kaddr);
-   bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
+   bitmap_kaddr = kmap_thread(req->pr_bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
lock = nilfs_mdt_bgl_lock(inode, group);
 
@@ -661,8 +661,8 @@ void nilfs_palloc_abort_all

[PATCH RFC PKS/PMEM 18/58] fs/hfs: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To avoid
the overhead of global PKRS updates, use the new kmap_thread() call.

Signed-off-by: Ira Weiny 
---
 fs/hfs/bnode.c | 14 +++---
 fs/hfs/btree.c | 20 ++--
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index b63a4df7327b..8b4d02576405 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -23,8 +23,8 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf,
off += node->page_offset;
page = node->page[0];
 
-   memcpy(buf, kmap(page) + off, len);
-   kunmap(page);
+   memcpy(buf, kmap_thread(page) + off, len);
+   kunmap_thread(page);
 }
 
 u16 hfs_bnode_read_u16(struct hfs_bnode *node, int off)
@@ -108,9 +108,9 @@ void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst,
src_page = src_node->page[0];
dst_page = dst_node->page[0];
 
-   memcpy(kmap(dst_page) + dst, kmap(src_page) + src, len);
-   kunmap(src_page);
-   kunmap(dst_page);
+   memcpy(kmap_thread(dst_page) + dst, kmap_thread(src_page) + src, len);
+   kunmap_thread(src_page);
+   kunmap_thread(dst_page);
set_page_dirty(dst_page);
 }
 
@@ -125,9 +125,9 @@ void hfs_bnode_move(struct hfs_bnode *node, int dst, int 
src, int len)
src += node->page_offset;
dst += node->page_offset;
page = node->page[0];
-   ptr = kmap(page);
+   ptr = kmap_thread(page);
memmove(ptr + dst, ptr + src, len);
-   kunmap(page);
+   kunmap_thread(page);
set_page_dirty(page);
 }
 
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 19017d296173..bd4a6d35e361 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -80,7 +80,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 
id, btree_keycmp ke
goto free_inode;
 
/* Load the header */
-   head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
+   head = (struct hfs_btree_header_rec *)(kmap_thread(page) + sizeof(struct hfs_bnode_desc));
tree->root = be32_to_cpu(head->root);
tree->leaf_count = be32_to_cpu(head->leaf_count);
tree->leaf_head = be32_to_cpu(head->leaf_head);
@@ -119,7 +119,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, 
u32 id, btree_keycmp ke
tree->node_size_shift = ffs(size) - 1;
tree->pages_per_bnode = (tree->node_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
-   kunmap(page);
+   kunmap_thread(page);
put_page(page);
return tree;
 
@@ -268,7 +268,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
 
off += node->page_offset;
pagep = node->page + (off >> PAGE_SHIFT);
-   data = kmap(*pagep);
+   data = kmap_thread(*pagep);
off &= ~PAGE_MASK;
idx = 0;
 
@@ -281,7 +281,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
idx += i;
data[off] |= m;
set_page_dirty(*pagep);
-   kunmap(*pagep);
+   kunmap_thread(*pagep);
tree->free_nodes--;
mark_inode_dirty(tree->inode);
hfs_bnode_put(node);
@@ -290,14 +290,14 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
}
}
if (++off >= PAGE_SIZE) {
-   kunmap(*pagep);
-   data = kmap(*++pagep);
+   kunmap_thread(*pagep);
+   data = kmap_thread(*++pagep);
off = 0;
}
idx += 8;
len--;
}
-   kunmap(*pagep);
+   kunmap_thread(*pagep);
nidx = node->next;
if (!nidx) {
printk(KERN_DEBUG "create new bmap node...\n");
@@ -313,7 +313,7 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
off = off16;
off += node->page_offset;
pagep = node->page + (off >> PAGE_SHIFT);
-   data = kmap(*pagep);
+   data = kmap_thread(*pagep);
off &= ~PAGE_MASK;
}
 }
@@ -360,7 +360,7 @@ void hfs_bmap_free(struct hfs_bnode *node)
}
off += node->page_offset + nidx / 8;
page = node->page[off >> PAGE_SHIFT];
-   data = kmap(page);
+   data = kmap_thread(page);
off &= ~PAGE_MASK;
m = 1 <

[PATCH RFC PKS/PMEM 56/58] dax: Stray access protection for dax_direct_access()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

dax_direct_access() is a special case of accessing pmem via a page
offset and without a struct page.

Because the dax driver is well aware of the special protections it has
mapped memory with, call dev_access_[en|dis]able() directly instead of
the unnecessary overhead of trying to get a page to kmap.

Similar to kmap, we leverage existing functions, dax_read_[un]lock(),
because they are already required to surround the use of the memory
returned from dax_direct_access().
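
To see why hooking dax_read_[un]lock() is sufficient, consider a sketch of a
typical caller (hypothetical function; the dax_direct_access() signature is
assumed from the kernel this series is based on):

	#include <linux/dax.h>
	#include <linux/pfn_t.h>
	#include <linux/string.h>

	/* Every direct access already sits inside the read lock. */
	static void copy_one_page(struct dax_device *dax_dev, pgoff_t pgoff,
				  void *dst)
	{
		void *kaddr;
		pfn_t pfn;
		long nr;
		int id;

		id = dax_read_lock();		/* dev_access_enable() here */
		nr = dax_direct_access(dax_dev, pgoff, 1, &kaddr, &pfn);
		if (nr > 0)
			memcpy(dst, kaddr, PAGE_SIZE);
		dax_read_unlock(id);		/* dev_access_disable() here */
	}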

Signed-off-by: Ira Weiny 
---
 drivers/dax/super.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index e84070b55463..0ddb3ee73e36 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -30,6 +30,7 @@ static DEFINE_SPINLOCK(dax_host_lock);
 
 int dax_read_lock(void)
 {
+   dev_access_enable(false);
return srcu_read_lock(&dax_srcu);
 }
 EXPORT_SYMBOL_GPL(dax_read_lock);
@@ -37,6 +38,7 @@ EXPORT_SYMBOL_GPL(dax_read_lock);
 void dax_read_unlock(int id)
 {
srcu_read_unlock(&dax_srcu, id);
+   dev_access_disable(false);
 }
 EXPORT_SYMBOL_GPL(dax_read_unlock);
 
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 48/58] drivers/md: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

These kmap() calls are localized to a single thread.  To avoid the
overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Coly Li  (maintainer:BCACHE (BLOCK LAYER CACHE))
Cc: Kent Overstreet  (maintainer:BCACHE (BLOCK LAYER 
CACHE))
Signed-off-by: Ira Weiny 
---
 drivers/md/bcache/request.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index c7cadaafa947..a4571f6d09dd 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -44,10 +44,10 @@ static void bio_csum(struct bio *bio, struct bkey *k)
uint64_t csum = 0;
 
bio_for_each_segment(bv, bio, iter) {
-   void *d = kmap(bv.bv_page) + bv.bv_offset;
+   void *d = kmap_thread(bv.bv_page) + bv.bv_offset;
 
csum = bch_crc64_update(csum, d, bv.bv_len);
-   kunmap(bv.bv_page);
+   kunmap_thread(bv.bv_page);
}
 
k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 39/58] fs/jffs2: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

These kmap() calls are localized to a single thread.  To avoid the
overhead of global PKRS updates, use the new kmap_thread() call.

Signed-off-by: Ira Weiny 
---
 fs/jffs2/file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 3e6d54f9b011..14dd2b18cc16 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -287,13 +287,13 @@ static int jffs2_write_end(struct file *filp, struct 
address_space *mapping,
 
/* In 2.4, it was already kmapped by generic_file_write(). Doesn't
   hurt to do it again. The alternative is ifdefs, which are ugly. */
-   kmap(pg);
+   kmap_thread(pg);
 
ret = jffs2_write_inode_range(c, f, ri, page_address(pg) + aligned_start,
  (pg->index << PAGE_SHIFT) + aligned_start,
end - aligned_start, &writtenlen);
 
-   kunmap(pg);
+   kunmap_thread(pg);
 
if (ret) {
/* There was an error writing. */
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 49/58] drivers/misc: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

These kmap() calls are localized to a single thread.  To avoid the
overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Greg Kroah-Hartman 
Signed-off-by: Ira Weiny 
---
 drivers/misc/vmw_vmci/vmci_queue_pair.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/misc/vmw_vmci/vmci_queue_pair.c 
b/drivers/misc/vmw_vmci/vmci_queue_pair.c
index 8531ae781195..f308abb8ad03 100644
--- a/drivers/misc/vmw_vmci/vmci_queue_pair.c
+++ b/drivers/misc/vmw_vmci/vmci_queue_pair.c
@@ -343,7 +343,7 @@ static int qp_memcpy_to_queue_iter(struct vmci_queue *queue,
size_t to_copy;
 
if (kernel_if->host)
-   va = kmap(kernel_if->u.h.page[page_index]);
+   va = kmap_thread(kernel_if->u.h.page[page_index]);
else
va = kernel_if->u.g.vas[page_index + 1];
/* Skip header. */
@@ -357,12 +357,12 @@ static int qp_memcpy_to_queue_iter(struct vmci_queue 
*queue,
if (!copy_from_iter_full((u8 *)va + page_offset, to_copy,
 from)) {
if (kernel_if->host)
-   kunmap(kernel_if->u.h.page[page_index]);
+   kunmap_thread(kernel_if->u.h.page[page_index]);
return VMCI_ERROR_INVALID_ARGS;
}
bytes_copied += to_copy;
if (kernel_if->host)
-   kunmap(kernel_if->u.h.page[page_index]);
+   kunmap_thread(kernel_if->u.h.page[page_index]);
}
 
return VMCI_SUCCESS;
@@ -391,7 +391,7 @@ static int qp_memcpy_from_queue_iter(struct iov_iter *to,
int err;
 
if (kernel_if->host)
-   va = kmap(kernel_if->u.h.page[page_index]);
+   va = kmap_thread(kernel_if->u.h.page[page_index]);
else
va = kernel_if->u.g.vas[page_index + 1];
/* Skip header. */
@@ -405,12 +405,12 @@ static int qp_memcpy_from_queue_iter(struct iov_iter *to,
err = copy_to_iter((u8 *)va + page_offset, to_copy, to);
if (err != to_copy) {
if (kernel_if->host)
-   kunmap(kernel_if->u.h.page[page_index]);
+   kunmap_thread(kernel_if->u.h.page[page_index]);
return VMCI_ERROR_INVALID_ARGS;
}
bytes_copied += to_copy;
if (kernel_if->host)
-   kunmap(kernel_if->u.h.page[page_index]);
+   kunmap_thread(kernel_if->u.h.page[page_index]);
}
 
return VMCI_SUCCESS;
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 55/58] samples: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

These kmap() calls are localized to a single thread.  To avoid the
overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Kirti Wankhede 
Signed-off-by: Ira Weiny 
---
 samples/vfio-mdev/mbochs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/samples/vfio-mdev/mbochs.c b/samples/vfio-mdev/mbochs.c
index 3cc5e5921682..6d95422c0b46 100644
--- a/samples/vfio-mdev/mbochs.c
+++ b/samples/vfio-mdev/mbochs.c
@@ -479,12 +479,12 @@ static ssize_t mdev_access(struct mdev_device *mdev, char 
*buf, size_t count,
pos -= MBOCHS_MMIO_BAR_OFFSET;
poff = pos & ~PAGE_MASK;
pg = __mbochs_get_page(mdev_state, pos >> PAGE_SHIFT);
-   map = kmap(pg);
+   map = kmap_thread(pg);
if (is_write)
memcpy(map + poff, buf, count);
else
memcpy(buf, map + poff, count);
-   kunmap(pg);
+   kunmap_thread(pg);
put_page(pg);
 
} else {
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 58/58] [dax|pmem]: Enable stray access protection

2020-10-09 Thread ira . weiny
From: Ira Weiny 

Protecting against stray writes is particularly important for PMEM
because, unlike writes to anonymous memory, writes to PMEM persist
across a reboot.  Thus data corruption could result in permanent loss of
data.

While stray writes are more serious than reads, protection is also
enabled for reads.  This helps to detect bugs in code which would
incorrectly access device memory and prevents a more serious machine
check should those buggy reads hit a poisoned page.

Enable stray access protection by setting the flag in pgmap which
requests it.  There is no option presented to the user.  If Zone Device
Access Protection is not supported, this flag will have no effect.

Signed-off-by: Ira Weiny 
---
 drivers/dax/device.c  | 2 ++
 drivers/nvdimm/pmem.c | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/drivers/dax/device.c b/drivers/dax/device.c
index 1e89513f3c59..e6fb35b4f0fb 100644
--- a/drivers/dax/device.c
+++ b/drivers/dax/device.c
@@ -430,6 +430,8 @@ int dev_dax_probe(struct device *dev)
}
 
dev_dax->pgmap.type = MEMORY_DEVICE_GENERIC;
+   dev_dax->pgmap.flags |= PGMAP_PROT_ENABLED;
+
addr = devm_memremap_pages(dev, &dev_dax->pgmap);
if (IS_ERR(addr))
return PTR_ERR(addr);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index e4dc1ae990fc..9fcd8338e23f 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -426,6 +426,8 @@ static int pmem_attach_disk(struct device *dev,
return -EBUSY;
}
 
+   pmem->pgmap.flags |= PGMAP_PROT_ENABLED;
+
q = blk_alloc_queue(dev_to_node(dev));
if (!q)
return -ENOMEM;
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 54/58] powerpc: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

These kmap() calls are localized to a single thread.  To avoid the
overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Signed-off-by: Ira Weiny 
---
 arch/powerpc/mm/mem.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 42e25874f5a8..6ef557b8dda6 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -573,9 +573,9 @@ void flush_icache_user_page(struct vm_area_struct *vma, 
struct page *page,
 {
unsigned long maddr;
 
-   maddr = (unsigned long) kmap(page) + (addr & ~PAGE_MASK);
+   maddr = (unsigned long) kmap_thread(page) + (addr & ~PAGE_MASK);
flush_icache_range(maddr, maddr + len);
-   kunmap(page);
+   kunmap_thread(page);
 }
 
 /*
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 53/58] lib: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

These kmap() calls are localized to a single thread.  To avoid the
overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Alexander Viro 
Cc: "Jérôme Glisse" 
Cc: Martin KaFai Lau 
Cc: Song Liu 
Cc: Yonghong Song 
Cc: Andrii Nakryiko 
Cc: John Fastabend 
Cc: KP Singh 
Signed-off-by: Ira Weiny 
---
 lib/iov_iter.c | 12 ++--
 lib/test_bpf.c |  4 ++--
 lib/test_hmm.c |  8 
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 5e40786c8f12..1d47f957cf95 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -208,7 +208,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, 
size_t offset, size_t b
}
/* Too bad - revert to non-atomic kmap */
 
-   kaddr = kmap(page);
+   kaddr = kmap_thread(page);
from = kaddr + offset;
left = copyout(buf, from, copy);
copy -= left;
@@ -225,7 +225,7 @@ static size_t copy_page_to_iter_iovec(struct page *page, 
size_t offset, size_t b
from += copy;
bytes -= copy;
}
-   kunmap(page);
+   kunmap_thread(page);
 
 done:
if (skip == iov->iov_len) {
@@ -292,7 +292,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, 
size_t offset, size_t
}
/* Too bad - revert to non-atomic kmap */
 
-   kaddr = kmap(page);
+   kaddr = kmap_thread(page);
to = kaddr + offset;
left = copyin(to, buf, copy);
copy -= left;
@@ -309,7 +309,7 @@ static size_t copy_page_from_iter_iovec(struct page *page, 
size_t offset, size_t
to += copy;
bytes -= copy;
}
-   kunmap(page);
+   kunmap_thread(page);
 
 done:
if (skip == iov->iov_len) {
@@ -1742,10 +1742,10 @@ int iov_iter_for_each_range(struct iov_iter *i, size_t 
bytes,
return 0;
 
iterate_all_kinds(i, bytes, v, -EINVAL, ({
-   w.iov_base = kmap(v.bv_page) + v.bv_offset;
+   w.iov_base = kmap_thread(v.bv_page) + v.bv_offset;
w.iov_len = v.bv_len;
err = f(&w, context);
-   kunmap(v.bv_page);
+   kunmap_thread(v.bv_page);
err;}), ({
w = v;
err = f(&w, context);})
diff --git a/lib/test_bpf.c b/lib/test_bpf.c
index ca7d635bccd9..441f822f56ba 100644
--- a/lib/test_bpf.c
+++ b/lib/test_bpf.c
@@ -6506,11 +6506,11 @@ static void *generate_test_data(struct bpf_test *test, 
int sub)
if (!page)
goto err_kfree_skb;
 
-   ptr = kmap(page);
+   ptr = kmap_thread(page);
if (!ptr)
goto err_free_page;
memcpy(ptr, test->frag_data, MAX_DATA);
-   kunmap(page);
+   kunmap_thread(page);
skb_add_rx_frag(skb, 0, page, 0, MAX_DATA, MAX_DATA);
}
 
diff --git a/lib/test_hmm.c b/lib/test_hmm.c
index e7dc3de355b7..e40d26f97f45 100644
--- a/lib/test_hmm.c
+++ b/lib/test_hmm.c
@@ -329,9 +329,9 @@ static int dmirror_do_read(struct dmirror *dmirror, 
unsigned long start,
if (!page)
return -ENOENT;
 
-   tmp = kmap(page);
+   tmp = kmap_thread(page);
memcpy(ptr, tmp, PAGE_SIZE);
-   kunmap(page);
+   kunmap_thread(page);
 
ptr += PAGE_SIZE;
bounce->cpages++;
@@ -398,9 +398,9 @@ static int dmirror_do_write(struct dmirror *dmirror, 
unsigned long start,
if (!page || xa_pointer_tag(entry) != DPT_XA_TAG_WRITE)
return -ENOENT;
 
-   tmp = kmap(page);
+   tmp = kmap_thread(page);
memcpy(tmp, ptr, PAGE_SIZE);
-   kunmap(page);
+   kunmap_thread(page);
 
ptr += PAGE_SIZE;
bounce->cpages++;
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 57/58] nvdimm/pmem: Stray access protection for pmem->virt_addr

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The pmem driver uses a cached virtual address to access its memory
directly.  Because the nvdimm driver is well aware of the special
protections it has mapped memory with, we call dev_access_[en|dis]able()
around the direct pmem->virt_addr (pmem_addr) usage instead of the
unnecessary overhead of trying to get a page to kmap.

Signed-off-by: Ira Weiny 
---
 drivers/nvdimm/pmem.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index fab29b514372..e4dc1ae990fc 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -148,7 +148,9 @@ static blk_status_t pmem_do_read(struct pmem_device *pmem,
if (unlikely(is_bad_pmem(&pmem->bb, sector, len)))
return BLK_STS_IOERR;
 
+   dev_access_enable(false);
rc = read_pmem(page, page_off, pmem_addr, len);
+   dev_access_disable(false);
flush_dcache_page(page);
return rc;
 }
@@ -180,11 +182,13 @@ static blk_status_t pmem_do_write(struct pmem_device 
*pmem,
 * after clear poison.
 */
flush_dcache_page(page);
+   dev_access_enable(false);
write_pmem(pmem_addr, page, page_off, len);
if (unlikely(bad_pmem)) {
rc = pmem_clear_poison(pmem, pmem_off, len);
write_pmem(pmem_addr, page, page_off, len);
}
+   dev_access_disable(false);
 
return rc;
 }
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 52/58] mm: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

These kmap() calls are localized to a single thread.  To avoid the
overhead of global PKRS updates, use the new kmap_thread() call.

Signed-off-by: Ira Weiny 
---
 mm/memory.c  | 8 
 mm/swapfile.c| 4 ++--
 mm/userfaultfd.c | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index fcfc4ca36eba..75a054882d7a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4945,7 +4945,7 @@ int __access_remote_vm(struct task_struct *tsk, struct 
mm_struct *mm,
if (bytes > PAGE_SIZE-offset)
bytes = PAGE_SIZE-offset;
 
-   maddr = kmap(page);
+   maddr = kmap_thread(page);
if (write) {
copy_to_user_page(vma, page, addr,
  maddr + offset, buf, bytes);
@@ -4954,7 +4954,7 @@ int __access_remote_vm(struct task_struct *tsk, struct 
mm_struct *mm,
copy_from_user_page(vma, page, addr,
buf, maddr + offset, bytes);
}
-   kunmap(page);
+   kunmap_thread(page);
put_page(page);
}
len -= bytes;
@@ -5216,14 +5216,14 @@ long copy_huge_page_from_user(struct page *dst_page,
 
for (i = 0; i < pages_per_huge_page; i++) {
if (allow_pagefault)
-   page_kaddr = kmap(dst_page + i);
+   page_kaddr = kmap_thread(dst_page + i);
else
page_kaddr = kmap_atomic(dst_page + i);
rc = copy_from_user(page_kaddr,
(const void __user *)(src + i * PAGE_SIZE),
PAGE_SIZE);
if (allow_pagefault)
-   kunmap(dst_page + i);
+   kunmap_thread(dst_page + i);
else
kunmap_atomic(page_kaddr);
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index debc94155f74..e3296ff95648 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3219,7 +3219,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, 
int, swap_flags)
error = PTR_ERR(page);
goto bad_swap_unlock_inode;
}
-   swap_header = kmap(page);
+   swap_header = kmap_thread(page);
 
maxpages = read_swap_header(p, swap_header, inode);
if (unlikely(!maxpages)) {
@@ -3395,7 +3395,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, 
int, swap_flags)
filp_close(swap_file, NULL);
 out:
if (page && !IS_ERR(page)) {
-   kunmap(page);
+   kunmap_thread(page);
put_page(page);
}
if (name)
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 9a3d451402d7..4d38c881bb2d 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -586,11 +586,11 @@ static __always_inline ssize_t __mcopy_atomic(struct 
mm_struct *dst_mm,
mmap_read_unlock(dst_mm);
BUG_ON(!page);
 
-   page_kaddr = kmap(page);
+   page_kaddr = kmap_thread(page);
err = copy_from_user(page_kaddr,
 (const void __user *) src_addr,
 PAGE_SIZE);
-   kunmap(page);
+   kunmap_thread(page);
if (unlikely(err)) {
err = -EFAULT;
goto out;
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 50/58] drivers/android: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

These kmap() calls are localized to a single thread.  To avoid the
overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Greg Kroah-Hartman 
Signed-off-by: Ira Weiny 
---
 drivers/android/binder_alloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index 69609696a843..5f50856caad7 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -1118,9 +1118,9 @@ binder_alloc_copy_user_to_buffer(struct binder_alloc 
*alloc,
page = binder_alloc_get_page(alloc, buffer,
buffer_offset, &pgoff);
size = min_t(size_t, bytes, PAGE_SIZE - pgoff);
-   kptr = kmap(page) + pgoff;
+   kptr = kmap_thread(page) + pgoff;
ret = copy_from_user(kptr, from, size);
-   kunmap(page);
+   kunmap_thread(page);
if (ret)
return bytes - size + ret;
bytes -= size;
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 51/58] kernel: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

This kmap() call is localized to a single thread.  To avoid the
overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Eric Biederman 
Signed-off-by: Ira Weiny 
---
 kernel/kexec_core.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index c19c0dad1ebe..272a9920c0d6 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -815,7 +815,7 @@ static int kimage_load_normal_segment(struct kimage *image,
if (result < 0)
goto out;
 
-   ptr = kmap(page);
+   ptr = kmap_thread(page);
/* Start with a clear page */
clear_page(ptr);
ptr += maddr & ~PAGE_MASK;
@@ -828,7 +828,7 @@ static int kimage_load_normal_segment(struct kimage *image,
memcpy(ptr, kbuf, uchunk);
else
result = copy_from_user(ptr, buf, uchunk);
-   kunmap(page);
+   kunmap_thread(page);
if (result) {
result = -EFAULT;
goto out;
@@ -879,7 +879,7 @@ static int kimage_load_crash_segment(struct kimage *image,
goto out;
}
arch_kexec_post_alloc_pages(page_address(page), 1, 0);
-   ptr = kmap(page);
+   ptr = kmap_thread(page);
ptr += maddr & ~PAGE_MASK;
mchunk = min_t(size_t, mbytes,
PAGE_SIZE - (maddr & ~PAGE_MASK));
@@ -895,7 +895,7 @@ static int kimage_load_crash_segment(struct kimage *image,
else
result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page);
-   kunmap(page);
+   kunmap_thread(page);
arch_kexec_pre_free_pages(page_address(page), 1);
if (result) {
result = -EFAULT;
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 40/58] net: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

These kmap() calls in these drivers are localized to a single thread.
To avoid the overhead of global PKRS updates, use the new kmap_thread()
call.

Cc: "David S. Miller" 
Cc: Jakub Kicinski 
Cc: Alexey Kuznetsov 
Cc: Hideaki YOSHIFUJI 
Cc: Trond Myklebust 
Cc: Anna Schumaker 
Cc: Boris Pismenny 
Cc: Aviad Yehezkel 
Cc: John Fastabend 
Cc: Daniel Borkmann 
Signed-off-by: Ira Weiny 
---
 net/ceph/messenger.c | 4 ++--
 net/core/datagram.c  | 4 ++--
 net/core/sock.c  | 8 
 net/ipv4/ip_output.c | 4 ++--
 net/sunrpc/cache.c   | 4 ++--
 net/sunrpc/xdr.c | 8 
 net/tls/tls_device.c | 4 ++--
 7 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index d4d7a0e52491..0c49b8e333da 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1535,10 +1535,10 @@ static u32 ceph_crc32c_page(u32 crc, struct page *page,
 {
char *kaddr;
 
-   kaddr = kmap(page);
+   kaddr = kmap_thread(page);
BUG_ON(kaddr == NULL);
crc = crc32c(crc, kaddr + page_offset, length);
-   kunmap(page);
+   kunmap_thread(page);
 
return crc;
 }
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 639745d4f3b9..cbd0a343074a 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -441,14 +441,14 @@ static int __skb_datagram_iter(const struct sk_buff *skb, 
int offset,
end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
struct page *page = skb_frag_page(frag);
-   u8 *vaddr = kmap(page);
+   u8 *vaddr = kmap_thread(page);
 
if (copy > len)
copy = len;
n = INDIRECT_CALL_1(cb, simple_copy_to_iter,
vaddr + skb_frag_off(frag) + offset - start,
copy, data, to);
-   kunmap(page);
+   kunmap_thread(page);
offset += n;
if (n != copy)
goto short_copy;
diff --git a/net/core/sock.c b/net/core/sock.c
index 6c5c6b18eff4..9b46a75cd8c1 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2846,11 +2846,11 @@ ssize_t sock_no_sendpage(struct socket *sock, struct 
page *page, int offset, siz
ssize_t res;
struct msghdr msg = {.msg_flags = flags};
struct kvec iov;
-   char *kaddr = kmap(page);
+   char *kaddr = kmap_thread(page);
iov.iov_base = kaddr + offset;
iov.iov_len = size;
res = kernel_sendmsg(sock, &msg, &iov, 1, size);
-   kunmap(page);
+   kunmap_thread(page);
return res;
 }
 EXPORT_SYMBOL(sock_no_sendpage);
@@ -2861,12 +2861,12 @@ ssize_t sock_no_sendpage_locked(struct sock *sk, struct 
page *page,
ssize_t res;
struct msghdr msg = {.msg_flags = flags};
struct kvec iov;
-   char *kaddr = kmap(page);
+   char *kaddr = kmap_thread(page);
 
iov.iov_base = kaddr + offset;
iov.iov_len = size;
res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
-   kunmap(page);
+   kunmap_thread(page);
return res;
 }
 EXPORT_SYMBOL(sock_no_sendpage_locked);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e6f2ada9e7d5..05304fb251a4 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -949,9 +949,9 @@ csum_page(struct page *page, int offset, int copy)
 {
char *kaddr;
__wsum csum;
-   kaddr = kmap(page);
+   kaddr = kmap_thread(page);
csum = csum_partial(kaddr + offset, copy, 0);
-   kunmap(page);
+   kunmap_thread(page);
return csum;
 }
 
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index baef5ee43dbb..88193f2a8e6f 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -935,9 +935,9 @@ static ssize_t cache_downcall(struct address_space *mapping,
if (!page)
goto out_slow;
 
-   kaddr = kmap(page);
+   kaddr = kmap_thread(page);
ret = cache_do_downcall(kaddr, buf, count, cd);
-   kunmap(page);
+   kunmap_thread(page);
unlock_page(page);
put_page(page);
return ret;
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index be11d672b5b9..00afbb48fb0a 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1353,7 +1353,7 @@ xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
base &= ~PAGE_MASK;
avail_page = min_t(unsigned int, PAGE_SIZE - base,
avail_here);
-   c = kmap(*ppages) + base;
+   c = kmap_thread(*ppages) + base;
 
while (avail_here) {
avail_here -= avail_page;
@@ -1429,9 +1429,9 @@ xdr_xcode_array2(struct xdr_buf *bu

[PATCH RFC PKS/PMEM 46/58] drivers/staging: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

These kmap() calls are localized to a single thread.  To avoid the
overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Greg Kroah-Hartman 
Signed-off-by: Ira Weiny 
---
 drivers/staging/rts5208/rtsx_transport.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/staging/rts5208/rtsx_transport.c 
b/drivers/staging/rts5208/rtsx_transport.c
index 0027bcf638ad..f747cc23951b 100644
--- a/drivers/staging/rts5208/rtsx_transport.c
+++ b/drivers/staging/rts5208/rtsx_transport.c
@@ -92,13 +92,13 @@ unsigned int rtsx_stor_access_xfer_buf(unsigned char 
*buffer,
while (sglen > 0) {
unsigned int plen = min(sglen, (unsigned int)
PAGE_SIZE - poff);
-   unsigned char *ptr = kmap(page);
+   unsigned char *ptr = kmap_thread(page);
 
if (dir == TO_XFER_BUF)
memcpy(ptr + poff, buffer + cnt, plen);
else
memcpy(buffer + cnt, ptr + poff, plen);
-   kunmap(page);
+   kunmap_thread(page);
 
/* Start at the beginning of the next page */
poff = 0;
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 45/58] drivers/firmware: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

These kmap() calls are localized to a single thread.  To avoid the
overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Ard Biesheuvel 
Signed-off-by: Ira Weiny 
---
 drivers/firmware/efi/capsule-loader.c | 6 +++---
 drivers/firmware/efi/capsule.c| 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/firmware/efi/capsule-loader.c 
b/drivers/firmware/efi/capsule-loader.c
index 4dde8edd53b6..aa2e0b5940fd 100644
--- a/drivers/firmware/efi/capsule-loader.c
+++ b/drivers/firmware/efi/capsule-loader.c
@@ -197,7 +197,7 @@ static ssize_t efi_capsule_write(struct file *file, const 
char __user *buff,
page = cap_info->pages[cap_info->index - 1];
}
 
-   kbuff = kmap(page);
+   kbuff = kmap_thread(page);
kbuff += PAGE_SIZE - cap_info->page_bytes_remain;
 
/* Copy capsule binary data from user space to kernel space buffer */
@@ -217,7 +217,7 @@ static ssize_t efi_capsule_write(struct file *file, const 
char __user *buff,
}
 
cap_info->count += write_byte;
-   kunmap(page);
+   kunmap_thread(page);
 
/* Submit the full binary to efi_capsule_update() API */
if (cap_info->header.headersize > 0 &&
@@ -236,7 +236,7 @@ static ssize_t efi_capsule_write(struct file *file, const 
char __user *buff,
return write_byte;
 
 fail_unmap:
-   kunmap(page);
+   kunmap_thread(page);
 failed:
efi_free_all_buff_pages(cap_info);
return ret;
diff --git a/drivers/firmware/efi/capsule.c b/drivers/firmware/efi/capsule.c
index 598b7800d14e..edb7797b0e4f 100644
--- a/drivers/firmware/efi/capsule.c
+++ b/drivers/firmware/efi/capsule.c
@@ -244,7 +244,7 @@ int efi_capsule_update(efi_capsule_header_t *capsule, 
phys_addr_t *pages)
for (i = 0; i < sg_count; i++) {
efi_capsule_block_desc_t *sglist;
 
-   sglist = kmap(sg_pages[i]);
+   sglist = kmap_thread(sg_pages[i]);
 
for (j = 0; j < SGLIST_PER_PAGE && count > 0; j++) {
u64 sz = min_t(u64, imagesize,
@@ -265,7 +265,7 @@ int efi_capsule_update(efi_capsule_header_t *capsule, 
phys_addr_t *pages)
else
sglist[j].data = page_to_phys(sg_pages[i + 1]);
 
-   kunmap(sg_pages[i]);
+   kunmap_thread(sg_pages[i]);
}
 
mutex_lock(&capsule_mutex);
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 47/58] drivers/mtd: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

These kmap() calls are localized to a single thread.  To avoid the
overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Miquel Raynal 
Cc: Richard Weinberger 
Cc: Vignesh Raghavendra 
Signed-off-by: Ira Weiny 
---
 drivers/mtd/mtd_blkdevs.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 0c05f77f9b21..4b18998273fa 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -88,14 +88,14 @@ static blk_status_t do_blktrans_request(struct 
mtd_blktrans_ops *tr,
return BLK_STS_IOERR;
return BLK_STS_OK;
case REQ_OP_READ:
-   buf = kmap(bio_page(req->bio)) + bio_offset(req->bio);
+   buf = kmap_thread(bio_page(req->bio)) + bio_offset(req->bio);
for (; nsect > 0; nsect--, block++, buf += tr->blksize) {
if (tr->readsect(dev, block, buf)) {
-   kunmap(bio_page(req->bio));
+   kunmap_thread(bio_page(req->bio));
return BLK_STS_IOERR;
}
}
-   kunmap(bio_page(req->bio));
+   kunmap_thread(bio_page(req->bio));
rq_flush_dcache_pages(req);
return BLK_STS_OK;
case REQ_OP_WRITE:
@@ -103,14 +103,14 @@ static blk_status_t do_blktrans_request(struct 
mtd_blktrans_ops *tr,
return BLK_STS_IOERR;
 
rq_flush_dcache_pages(req);
-   buf = kmap(bio_page(req->bio)) + bio_offset(req->bio);
+   buf = kmap_thread(bio_page(req->bio)) + bio_offset(req->bio);
for (; nsect > 0; nsect--, block++, buf += tr->blksize) {
if (tr->writesect(dev, block, buf)) {
-   kunmap(bio_page(req->bio));
+   kunmap_thread(bio_page(req->bio));
return BLK_STS_IOERR;
}
}
-   kunmap(bio_page(req->bio));
+   kunmap_thread(bio_page(req->bio));
return BLK_STS_OK;
default:
return BLK_STS_IOERR;
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 44/58] drivers/xen: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

These kmap() calls are localized to a single thread.  To avoid the
overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Stefano Stabellini 
Signed-off-by: Ira Weiny 
---
 drivers/xen/gntalloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c
index 3fa40c723e8e..3b78e055feff 100644
--- a/drivers/xen/gntalloc.c
+++ b/drivers/xen/gntalloc.c
@@ -184,9 +184,9 @@ static int add_grefs(struct ioctl_gntalloc_alloc_gref *op,
 static void __del_gref(struct gntalloc_gref *gref)
 {
if (gref->notify.flags & UNMAP_NOTIFY_CLEAR_BYTE) {
-   uint8_t *tmp = kmap(gref->page);
+   uint8_t *tmp = kmap_thread(gref->page);
tmp[gref->notify.pgoff] = 0;
-   kunmap(gref->page);
+   kunmap_thread(gref->page);
}
if (gref->notify.flags & UNMAP_NOTIFY_SEND_EVENT) {
notify_remote_via_evtchn(gref->notify.event);
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 43/58] drivers/mmc: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

These kmap() calls are localized to a single thread.  To avoid the
overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Ulf Hansson 
Cc: Sascha Sommer 
Signed-off-by: Ira Weiny 
---
 drivers/mmc/host/mmc_spi.c| 4 ++--
 drivers/mmc/host/sdricoh_cs.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c
index 18a850f37ddc..ab28e7103b8d 100644
--- a/drivers/mmc/host/mmc_spi.c
+++ b/drivers/mmc/host/mmc_spi.c
@@ -918,7 +918,7 @@ mmc_spi_data_do(struct mmc_spi_host *host, struct 
mmc_command *cmd,
}
 
/* allow pio too; we don't allow highmem */
-   kmap_addr = kmap(sg_page(sg));
+   kmap_addr = kmap_thread(sg_page(sg));
if (direction == DMA_TO_DEVICE)
t->tx_buf = kmap_addr + sg->offset;
else
@@ -950,7 +950,7 @@ mmc_spi_data_do(struct mmc_spi_host *host, struct 
mmc_command *cmd,
/* discard mappings */
if (direction == DMA_FROM_DEVICE)
flush_kernel_dcache_page(sg_page(sg));
-   kunmap(sg_page(sg));
+   kunmap_thread(sg_page(sg));
if (dma_dev)
dma_unmap_page(dma_dev, dma_addr, PAGE_SIZE, dir);
 
diff --git a/drivers/mmc/host/sdricoh_cs.c b/drivers/mmc/host/sdricoh_cs.c
index 76a8cd3a186f..7806bc69c4f1 100644
--- a/drivers/mmc/host/sdricoh_cs.c
+++ b/drivers/mmc/host/sdricoh_cs.c
@@ -312,11 +312,11 @@ static void sdricoh_request(struct mmc_host *mmc, struct 
mmc_request *mrq)
int result;
page = sg_page(data->sg);
 
-   buf = kmap(page) + data->sg->offset + (len * i);
+   buf = kmap_thread(page) + data->sg->offset + (len * i);
result =
sdricoh_blockio(host,
data->flags & MMC_DATA_READ, buf, len);
-   kunmap(page);
+   kunmap_thread(page);
flush_dcache_page(page);
if (result) {
dev_err(dev, "sdricoh_request: cmd %i "
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 38/58] fs/isofs: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

These kmap() calls are localized to a single thread.  To avoid the
overhead of global PKRS updates, use the new kmap_thread() call.

Signed-off-by: Ira Weiny 
---
 fs/isofs/compress.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index bc12ac7e2312..ddd3fd99d2e1 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -344,7 +344,7 @@ static int zisofs_readpage(struct file *file, struct page 
*page)
pages[i] = grab_cache_page_nowait(mapping, index);
if (pages[i]) {
ClearPageError(pages[i]);
-   kmap(pages[i]);
+   kmap_thread(pages[i]);
}
}
 
@@ -356,7 +356,7 @@ static int zisofs_readpage(struct file *file, struct page 
*page)
flush_dcache_page(pages[i]);
if (i == full_page && err)
SetPageError(pages[i]);
-   kunmap(pages[i]);
+   kunmap_thread(pages[i]);
unlock_page(pages[i]);
if (i != full_page)
put_page(pages[i]);
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 37/58] fs/ext2: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

These kmap() calls are localized to a single thread.  To avoid the
overhead of global PKRS updates, use the new kmap_thread() call instead.

Cc: Jan Kara 
Signed-off-by: Ira Weiny 
---
 fs/ext2/dir.c  | 2 +-
 fs/ext2/ext2.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index f3194bf20733..abe97ba458c8 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -196,7 +196,7 @@ static struct page * ext2_get_page(struct inode *dir, 
unsigned long n,
struct address_space *mapping = dir->i_mapping;
struct page *page = read_mapping_page(mapping, n, NULL);
if (!IS_ERR(page)) {
-   kmap(page);
+   kmap_thread(page);
if (unlikely(!PageChecked(page))) {
if (PageError(page) || !ext2_check_page(page, quiet))
goto fail;
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 021ec8b42ac3..9bcb6714c255 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -749,7 +749,7 @@ extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode 
*, struct page **);
 extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct 
page *, struct inode *, int);
 static inline void ext2_put_page(struct page *page)
 {
-   kunmap(page);
+   kunmap_thread(page);
put_page(page);
 }
 
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 42/58] drivers/scsi: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

These kmap() calls are localized to a single thread.  To avoid the
overhead of global PKRS updates, use the new kmap_thread() call.

Cc: "James E.J. Bottomley" 
Cc: "Martin K. Petersen" 
Signed-off-by: Ira Weiny 
---
 drivers/scsi/ipr.c | 8 
 drivers/scsi/pmcraid.c | 8 
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c
index b0aa58d117cc..a5a0b8feb661 100644
--- a/drivers/scsi/ipr.c
+++ b/drivers/scsi/ipr.c
@@ -3923,9 +3923,9 @@ static int ipr_copy_ucode_buffer(struct ipr_sglist 
*sglist,
buffer += bsize_elem) {
struct page *page = sg_page(sg);
 
-   kaddr = kmap(page);
+   kaddr = kmap_thread(page);
memcpy(kaddr, buffer, bsize_elem);
-   kunmap(page);
+   kunmap_thread(page);
 
sg->length = bsize_elem;
 
@@ -3938,9 +3938,9 @@ static int ipr_copy_ucode_buffer(struct ipr_sglist 
*sglist,
if (len % bsize_elem) {
struct page *page = sg_page(sg);
 
-   kaddr = kmap(page);
+   kaddr = kmap_thread(page);
memcpy(kaddr, buffer, len % bsize_elem);
-   kunmap(page);
+   kunmap_thread(page);
 
sg->length = len % bsize_elem;
}
diff --git a/drivers/scsi/pmcraid.c b/drivers/scsi/pmcraid.c
index aa9ae2ae8579..4b05ba4b8a11 100644
--- a/drivers/scsi/pmcraid.c
+++ b/drivers/scsi/pmcraid.c
@@ -3269,13 +3269,13 @@ static int pmcraid_copy_sglist(
for (i = 0; i < (len / bsize_elem); i++, sg = sg_next(sg), buffer += 
bsize_elem) {
struct page *page = sg_page(sg);
 
-   kaddr = kmap(page);
+   kaddr = kmap_thread(page);
if (direction == DMA_TO_DEVICE)
rc = copy_from_user(kaddr, buffer, bsize_elem);
else
rc = copy_to_user(buffer, kaddr, bsize_elem);
 
-   kunmap(page);
+   kunmap_thread(page);
 
if (rc) {
pmcraid_err("failed to copy user data into sg list\n");
@@ -3288,14 +3288,14 @@ static int pmcraid_copy_sglist(
if (len % bsize_elem) {
struct page *page = sg_page(sg);
 
-   kaddr = kmap(page);
+   kaddr = kmap_thread(page);
 
if (direction == DMA_TO_DEVICE)
rc = copy_from_user(kaddr, buffer, len % bsize_elem);
else
rc = copy_to_user(buffer, kaddr, len % bsize_elem);
 
-   kunmap(page);
+   kunmap_thread(page);
 
sg->length = len % bsize_elem;
}
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 36/58] fs/ext2: Use ext2_put_page

2020-10-09 Thread ira . weiny
From: Ira Weiny 

There are 3 places in namei.c where the equivalent of ext2_put_page() is
open-coded.  We want to use k[un]map_thread() instead of k[un]map() in
ext2_[get|put]_page().

Move ext2_put_page() to ext2.h and use it in namei.c in prep for
converting the k[un]map() code.
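
With the helper centralized in ext2.h, the follow-on conversion (patch
37/58, earlier in this archive) only has to touch this one definition.
The combined result of the two patches is simply:

    /* fs/ext2/ext2.h, after this patch plus patch 37/58 */
    static inline void ext2_put_page(struct page *page)
    {
            kunmap_thread(page);    /* kunmap() until 37/58 converts it */
            put_page(page);
    }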

Cc: Jan Kara 
Signed-off-by: Ira Weiny 
---
 fs/ext2/dir.c   |  6 --
 fs/ext2/ext2.h  |  8 
 fs/ext2/namei.c | 15 +--
 3 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 70355ab6740e..f3194bf20733 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -66,12 +66,6 @@ static inline unsigned ext2_chunk_size(struct inode *inode)
return inode->i_sb->s_blocksize;
 }
 
-static inline void ext2_put_page(struct page *page)
-{
-   kunmap(page);
-   put_page(page);
-}
-
 /*
  * Return the offset into page `page_nr' of the last valid
  * byte in that page, plus one.
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 5136b7289e8d..021ec8b42ac3 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -16,6 +16,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 /* XXX Here for now... not interested in restructing headers JUST now */
 
@@ -745,6 +747,12 @@ extern int ext2_delete_entry (struct ext2_dir_entry_2 *, 
struct page *);
 extern int ext2_empty_dir (struct inode *);
 extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
 extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct 
page *, struct inode *, int);
+static inline void ext2_put_page(struct page *page)
+{
+   kunmap(page);
+   put_page(page);
+}
+
 
 /* ialloc.c */
 extern struct inode * ext2_new_inode (struct inode *, umode_t, const struct 
qstr *);
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 5bf2c145643b..ea980f1e2e99 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -389,23 +389,18 @@ static int ext2_rename (struct inode * old_dir, struct 
dentry * old_dentry,
if (dir_de) {
if (old_dir != new_dir)
ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0);
-   else {
-   kunmap(dir_page);
-   put_page(dir_page);
-   }
+   else
+   ext2_put_page(dir_page);
inode_dec_link_count(old_dir);
}
return 0;
 
 
 out_dir:
-   if (dir_de) {
-   kunmap(dir_page);
-   put_page(dir_page);
-   }
+   if (dir_de)
+   ext2_put_page(dir_page);
 out_old:
-   kunmap(old_page);
-   put_page(old_page);
+   ext2_put_page(old_page);
 out:
return err;
 }
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 41/58] drivers/target: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

These kmap() calls in this driver are localized to a single thread.  To
avoid the overhead of global PKRS updates, use the new kmap_thread()
call.

Signed-off-by: Ira Weiny 
---
 drivers/target/target_core_iblock.c| 4 ++--
 drivers/target/target_core_rd.c| 4 ++--
 drivers/target/target_core_transport.c | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/target/target_core_iblock.c 
b/drivers/target/target_core_iblock.c
index 1c181d31f4c8..df7b1568edb3 100644
--- a/drivers/target/target_core_iblock.c
+++ b/drivers/target/target_core_iblock.c
@@ -415,7 +415,7 @@ iblock_execute_zero_out(struct block_device *bdev, struct 
se_cmd *cmd)
unsigned char *buf, *not_zero;
int ret;
 
-   buf = kmap(sg_page(sg)) + sg->offset;
+   buf = kmap_thread(sg_page(sg)) + sg->offset;
if (!buf)
return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
/*
@@ -423,7 +423,7 @@ iblock_execute_zero_out(struct block_device *bdev, struct 
se_cmd *cmd)
 * incoming WRITE_SAME payload does not contain zeros.
 */
not_zero = memchr_inv(buf, 0x00, cmd->data_length);
-   kunmap(sg_page(sg));
+   kunmap_thread(sg_page(sg));
 
if (not_zero)
return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
diff --git a/drivers/target/target_core_rd.c b/drivers/target/target_core_rd.c
index 408bd975170b..dbbdd39c5bf9 100644
--- a/drivers/target/target_core_rd.c
+++ b/drivers/target/target_core_rd.c
@@ -159,9 +159,9 @@ static int rd_allocate_sgl_table(struct rd_dev *rd_dev, 
struct rd_dev_sg_table *
sg_assign_page(&sg[j], pg);
sg[j].length = PAGE_SIZE;
 
-   p = kmap(pg);
+   p = kmap_thread(pg);
memset(p, init_payload, PAGE_SIZE);
-   kunmap(pg);
+   kunmap_thread(pg);
}
 
page_offset += sg_per_table;
diff --git a/drivers/target/target_core_transport.c 
b/drivers/target/target_core_transport.c
index ff26ab0a5f60..8d0bae5a92e5 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -1692,11 +1692,11 @@ int target_submit_cmd_map_sgls(struct se_cmd *se_cmd, 
struct se_session *se_sess
unsigned char *buf = NULL;
 
if (sgl)
-   buf = kmap(sg_page(sgl)) + sgl->offset;
+   buf = kmap_thread(sg_page(sgl)) + sgl->offset;
 
if (buf) {
memset(buf, 0, sgl->length);
-   kunmap(sg_page(sgl));
+   kunmap_thread(sg_page(sgl));
}
}
 
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 31/58] fs/vboxsf: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To avoid
the overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Hans de Goede 
Signed-off-by: Ira Weiny 
---
 fs/vboxsf/file.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c
index c4ab5996d97a..d9c7e6b7b4cc 100644
--- a/fs/vboxsf/file.c
+++ b/fs/vboxsf/file.c
@@ -216,7 +216,7 @@ static int vboxsf_readpage(struct file *file, struct page 
*page)
u8 *buf;
int err;
 
-   buf = kmap(page);
+   buf = kmap_thread(page);
 
err = vboxsf_read(sf_handle->root, sf_handle->handle, off, &nread, buf);
if (err == 0) {
@@ -227,7 +227,7 @@ static int vboxsf_readpage(struct file *file, struct page 
*page)
SetPageError(page);
}
 
-   kunmap(page);
+   kunmap_thread(page);
unlock_page(page);
return err;
 }
@@ -268,10 +268,10 @@ static int vboxsf_writepage(struct page *page, struct 
writeback_control *wbc)
if (!sf_handle)
return -EBADF;
 
-   buf = kmap(page);
+   buf = kmap_thread(page);
err = vboxsf_write(sf_handle->root, sf_handle->handle,
   off, &nwritten, buf);
-   kunmap(page);
+   kunmap_thread(page);
 
kref_put(&sf_handle->refcount, vboxsf_handle_release);
 
@@ -302,10 +302,10 @@ static int vboxsf_write_end(struct file *file, struct 
address_space *mapping,
if (!PageUptodate(page) && copied < len)
zero_user(page, from + copied, len - copied);
 
-   buf = kmap(page);
+   buf = kmap_thread(page);
err = vboxsf_write(sf_handle->root, sf_handle->handle,
   pos, &nwritten, buf + from);
-   kunmap(page);
+   kunmap_thread(page);
 
if (err) {
nwritten = 0;
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 29/58] fs/ntfs: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To avoid
the overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Anton Altaparmakov 
Signed-off-by: Ira Weiny 
---
 fs/ntfs/aops.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index bb0a43860ad2..11633d732809 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1099,7 +1099,7 @@ static int ntfs_write_mst_block(struct page *page,
if (!nr_bhs)
goto done;
/* Map the page so we can access its contents. */
-   kaddr = kmap(page);
+   kaddr = kmap_thread(page);
/* Clear the page uptodate flag whilst the mst fixups are applied. */
BUG_ON(!PageUptodate(page));
ClearPageUptodate(page);
@@ -1276,7 +1276,7 @@ static int ntfs_write_mst_block(struct page *page,
iput(VFS_I(base_tni));
}
SetPageUptodate(page);
-   kunmap(page);
+   kunmap_thread(page);
 done:
if (unlikely(err && err != -ENOMEM)) {
/*
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 35/58] fs: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

These kmap() calls are localized to a single thread.  To avoid the
overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Alexander Viro 
Cc: Jens Axboe 
Signed-off-by: Ira Weiny 
---
 fs/aio.c  |  4 ++--
 fs/binfmt_elf.c   |  4 ++--
 fs/binfmt_elf_fdpic.c |  4 ++--
 fs/exec.c | 10 +-
 fs/io_uring.c |  4 ++--
 fs/splice.c   |  4 ++--
 6 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index d5ec30385566..27f95996d25f 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1223,10 +1223,10 @@ static long aio_read_events_ring(struct kioctx *ctx,
avail = min(avail, nr - ret);
avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos);
 
-   ev = kmap(page);
+   ev = kmap_thread(page);
copy_ret = copy_to_user(event + ret, ev + pos,
sizeof(*ev) * avail);
-   kunmap(page);
+   kunmap_thread(page);
 
if (unlikely(copy_ret)) {
ret = -EFAULT;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 13d053982dd7..1a332ef1ae03 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -2430,9 +2430,9 @@ static int elf_core_dump(struct coredump_params *cprm)
 
page = get_dump_page(addr);
if (page) {
-   void *kaddr = kmap(page);
+   void *kaddr = kmap_thread(page);
stop = !dump_emit(cprm, kaddr, PAGE_SIZE);
-   kunmap(page);
+   kunmap_thread(page);
put_page(page);
} else
stop = !dump_skip(cprm, PAGE_SIZE);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 50f845702b92..8fbe188e0fdd 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1542,9 +1542,9 @@ static bool elf_fdpic_dump_segments(struct 
coredump_params *cprm)
bool res;
struct page *page = get_dump_page(addr);
if (page) {
-   void *kaddr = kmap(page);
+   void *kaddr = kmap_thread(page);
res = dump_emit(cprm, kaddr, PAGE_SIZE);
-   kunmap(page);
+   kunmap_thread(page);
put_page(page);
} else {
res = dump_skip(cprm, PAGE_SIZE);
diff --git a/fs/exec.c b/fs/exec.c
index a91003e28eaa..3948b8511e3a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -575,11 +575,11 @@ static int copy_strings(int argc, struct user_arg_ptr 
argv,
 
if (kmapped_page) {
flush_kernel_dcache_page(kmapped_page);
-   kunmap(kmapped_page);
+   kunmap_thread(kmapped_page);
put_arg_page(kmapped_page);
}
kmapped_page = page;
-   kaddr = kmap(kmapped_page);
+   kaddr = kmap_thread(kmapped_page);
kpos = pos & PAGE_MASK;
flush_arg_page(bprm, kpos, kmapped_page);
}
@@ -593,7 +593,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv,
 out:
if (kmapped_page) {
flush_kernel_dcache_page(kmapped_page);
-   kunmap(kmapped_page);
+   kunmap_thread(kmapped_page);
put_arg_page(kmapped_page);
}
return ret;
@@ -871,11 +871,11 @@ int transfer_args_to_stack(struct linux_binprm *bprm,
 
for (index = MAX_ARG_PAGES - 1; index >= stop; index--) {
unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0;
-   char *src = kmap(bprm->page[index]) + offset;
+   char *src = kmap_thread(bprm->page[index]) + offset;
sp -= PAGE_SIZE - offset;
if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0)
ret = -EFAULT;
-   kunmap(bprm->page[index]);
+   kunmap_thread(bprm->page[index]);
if (ret)
goto out;
}
diff --git a/fs/io_uring.c b/fs/io_uring.c
index aae0ef2ec34d..f59bb079822d 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -2903,7 +2903,7 @@ static ssize_t loop_rw_iter(int rw, struct file *file, 
struct kiocb *kiocb,
iovec = iov_iter_iovec(iter);
} else {
/* fixed buffers import bvec */
-   iovec.

[PATCH RFC PKS/PMEM 33/58] fs/cramfs: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To avoid
the overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Nicolas Pitre 
Signed-off-by: Ira Weiny 
---
 fs/cramfs/inode.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 912308600d39..003c014a42ed 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -247,8 +247,8 @@ static void *cramfs_blkdev_read(struct super_block *sb, 
unsigned int offset,
struct page *page = pages[i];
 
if (page) {
-   memcpy(data, kmap(page), PAGE_SIZE);
-   kunmap(page);
+   memcpy(data, kmap_thread(page), PAGE_SIZE);
+   kunmap_thread(page);
put_page(page);
} else
memset(data, 0, PAGE_SIZE);
@@ -826,7 +826,7 @@ static int cramfs_readpage(struct file *file, struct page 
*page)
 
maxblock = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
bytes_filled = 0;
-   pgdata = kmap(page);
+   pgdata = kmap_thread(page);
 
if (page->index < maxblock) {
struct super_block *sb = inode->i_sb;
@@ -914,13 +914,13 @@ static int cramfs_readpage(struct file *file, struct page 
*page)
 
memset(pgdata + bytes_filled, 0, PAGE_SIZE - bytes_filled);
flush_dcache_page(page);
-   kunmap(page);
+   kunmap_thread(page);
SetPageUptodate(page);
unlock_page(page);
return 0;
 
 err:
-   kunmap(page);
+   kunmap_thread(page);
ClearPageUptodate(page);
SetPageError(page);
unlock_page(page);
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 28/58] fs/cachefiles: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To avoid
the overhead of global PKRS updates, use the new kmap_thread() call.

Cc: David Howells 
Signed-off-by: Ira Weiny 
---
 fs/cachefiles/rdwr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 3080cda9e824..2468e5c067ba 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -936,9 +936,9 @@ int cachefiles_write_page(struct fscache_storage *op, 
struct page *page)
}
}
 
-   data = kmap(page);
+   data = kmap_thread(page);
ret = kernel_write(file, data, len, &pos);
-   kunmap(page);
+   kunmap_thread(page);
fput(file);
if (ret != len)
goto error_eio;
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 26/58] fs/zonefs: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To avoid
the overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Damien Le Moal 
Cc: Naohiro Aota 
Signed-off-by: Ira Weiny 
---
 fs/zonefs/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index 8ec7c8f109d7..2fd6c86beee1 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -1297,7 +1297,7 @@ static int zonefs_read_super(struct super_block *sb)
if (ret)
goto free_page;
 
-   super = kmap(page);
+   super = kmap_thread(page);
 
ret = -EINVAL;
if (le32_to_cpu(super->s_magic) != ZONEFS_MAGIC)
@@ -1349,7 +1349,7 @@ static int zonefs_read_super(struct super_block *sb)
ret = 0;
 
 unmap:
-   kunmap(page);
+   kunmap_thread(page);
 free_page:
__free_page(page);
 
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 32/58] fs/hostfs: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To avoid
the overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Jeff Dike 
Cc: Richard Weinberger 
Cc: Anton Ivanov 
Signed-off-by: Ira Weiny 
---
 fs/hostfs/hostfs_kern.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index c070c0d8e3e9..608efd0f83cb 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -409,7 +409,7 @@ static int hostfs_writepage(struct page *page, struct 
writeback_control *wbc)
if (page->index >= end_index)
count = inode->i_size & (PAGE_SIZE-1);
 
-   buffer = kmap(page);
+   buffer = kmap_thread(page);
 
err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count);
if (err != count) {
@@ -425,7 +425,7 @@ static int hostfs_writepage(struct page *page, struct 
writeback_control *wbc)
err = 0;
 
  out:
-   kunmap(page);
+   kunmap_thread(page);
 
unlock_page(page);
return err;
@@ -437,7 +437,7 @@ static int hostfs_readpage(struct file *file, struct page 
*page)
loff_t start = page_offset(page);
int bytes_read, ret = 0;
 
-   buffer = kmap(page);
+   buffer = kmap_thread(page);
bytes_read = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer,
PAGE_SIZE);
if (bytes_read < 0) {
@@ -454,7 +454,7 @@ static int hostfs_readpage(struct file *file, struct page 
*page)
 
  out:
flush_dcache_page(page);
-   kunmap(page);
+   kunmap_thread(page);
unlock_page(page);
return ret;
 }
@@ -480,9 +480,9 @@ static int hostfs_write_end(struct file *file, struct 
address_space *mapping,
unsigned from = pos & (PAGE_SIZE - 1);
int err;
 
-   buffer = kmap(page);
+   buffer = kmap_thread(page);
err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer + from, copied);
-   kunmap(page);
+   kunmap_thread(page);
 
if (!PageUptodate(page) && err == PAGE_SIZE)
SetPageUptodate(page);
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 34/58] fs/erofs: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To avoid
the overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Gao Xiang 
Cc: Chao Yu 
Signed-off-by: Ira Weiny 
---
 fs/erofs/super.c | 4 ++--
 fs/erofs/xattr.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index ddaa516c008a..41696b60f1b3 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -139,7 +139,7 @@ static int erofs_read_superblock(struct super_block *sb)
 
sbi = EROFS_SB(sb);
 
-   data = kmap(page);
+   data = kmap_thread(page);
dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET);
 
ret = -EINVAL;
@@ -189,7 +189,7 @@ static int erofs_read_superblock(struct super_block *sb)
}
ret = 0;
 out:
-   kunmap(page);
+   kunmap_thread(page);
put_page(page);
return ret;
 }
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index c8c381eadcd6..1771baa99d77 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -20,7 +20,7 @@ static inline void xattr_iter_end(struct xattr_iter *it, bool 
atomic)
 {
/* the only user of kunmap() is 'init_inode_xattrs' */
if (!atomic)
-   kunmap(it->page);
+   kunmap_thread(it->page);
else
kunmap_atomic(it->kaddr);
 
@@ -96,7 +96,7 @@ static int init_inode_xattrs(struct inode *inode)
}
 
/* read in shared xattr array (non-atomic, see kmalloc below) */
-   it.kaddr = kmap(it.page);
+   it.kaddr = kmap_thread(it.page);
atomic_map = false;
 
ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs);
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 30/58] fs/romfs: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To avoid
the overhead of global PKRS updates, use the new kmap_thread() call.

Signed-off-by: Ira Weiny 
---
 fs/romfs/super.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index e582d001f792..9050074c6755 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -107,7 +107,7 @@ static int romfs_readpage(struct file *file, struct page 
*page)
void *buf;
int ret;
 
-   buf = kmap(page);
+   buf = kmap_thread(page);
if (!buf)
return -ENOMEM;
 
@@ -136,7 +136,7 @@ static int romfs_readpage(struct file *file, struct page 
*page)
SetPageUptodate(page);
 
flush_dcache_page(page);
-   kunmap(page);
+   kunmap_thread(page);
unlock_page(page);
return ret;
 }
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 27/58] fs/ubifs: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To avoid
the overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Richard Weinberger 
Signed-off-by: Ira Weiny 
---
 fs/ubifs/file.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index b77d1637bbbc..a3537447a885 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -111,7 +111,7 @@ static int do_readpage(struct page *page)
ubifs_assert(c, !PageChecked(page));
ubifs_assert(c, !PagePrivate(page));
 
-   addr = kmap(page);
+   addr = kmap_thread(page);
 
block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT;
@@ -174,7 +174,7 @@ static int do_readpage(struct page *page)
SetPageUptodate(page);
ClearPageError(page);
flush_dcache_page(page);
-   kunmap(page);
+   kunmap_thread(page);
return 0;
 
 error:
@@ -182,7 +182,7 @@ static int do_readpage(struct page *page)
ClearPageUptodate(page);
SetPageError(page);
flush_dcache_page(page);
-   kunmap(page);
+   kunmap_thread(page);
return err;
 }
 
@@ -616,7 +616,7 @@ static int populate_page(struct ubifs_info *c, struct page 
*page,
dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx",
inode->i_ino, page->index, i_size, page->flags);
 
-   addr = zaddr = kmap(page);
+   addr = zaddr = kmap_thread(page);
 
end_index = (i_size - 1) >> PAGE_SHIFT;
if (!i_size || page->index > end_index) {
@@ -692,7 +692,7 @@ static int populate_page(struct ubifs_info *c, struct page 
*page,
SetPageUptodate(page);
ClearPageError(page);
flush_dcache_page(page);
-   kunmap(page);
+   kunmap_thread(page);
*n = nn;
return 0;
 
@@ -700,7 +700,7 @@ static int populate_page(struct ubifs_info *c, struct page 
*page,
ClearPageUptodate(page);
SetPageError(page);
flush_dcache_page(page);
-   kunmap(page);
+   kunmap_thread(page);
ubifs_err(c, "bad data node (block %u, inode %lu)",
  page_block, inode->i_ino);
return -EINVAL;
@@ -918,7 +918,7 @@ static int do_writepage(struct page *page, int len)
/* Update radix tree tags */
set_page_writeback(page);
 
-   addr = kmap(page);
+   addr = kmap_thread(page);
block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
i = 0;
while (len) {
@@ -950,7 +950,7 @@ static int do_writepage(struct page *page, int len)
ClearPagePrivate(page);
ClearPageChecked(page);
 
-   kunmap(page);
+   kunmap_thread(page);
unlock_page(page);
end_page_writeback(page);
return err;
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 23/58] fs/fuse: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To avoid
the overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Miklos Szeredi 
Signed-off-by: Ira Weiny 
---
 fs/fuse/readdir.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 90e3f01bd796..953ffe6f56e3 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -536,9 +536,9 @@ static int fuse_readdir_cached(struct file *file, struct 
dir_context *ctx)
 * Contents of the page are now protected against changing by holding
 * the page lock.
 */
-   addr = kmap(page);
+   addr = kmap_thread(page);
res = fuse_parse_cache(ff, addr, size, ctx);
-   kunmap(page);
+   kunmap_thread(page);
unlock_page(page);
put_page(page);
 
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 21/58] fs/nfs: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To avoid
the overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Trond Myklebust 
Cc: Anna Schumaker 
Signed-off-by: Ira Weiny 
---
 fs/nfs/dir.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index cb52db9a0cfb..fee321acccb4 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -213,7 +213,7 @@ int nfs_readdir_make_qstr(struct qstr *string, const char 
*name, unsigned int le
 static
 int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page)
 {
-   struct nfs_cache_array *array = kmap(page);
+   struct nfs_cache_array *array = kmap_thread(page);
struct nfs_cache_array_entry *cache_entry;
int ret;
 
@@ -235,7 +235,7 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, 
struct page *page)
if (entry->eof != 0)
array->eof_index = array->size;
 out:
-   kunmap(page);
+   kunmap_thread(page);
return ret;
 }
 
@@ -347,7 +347,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
struct nfs_cache_array *array;
int status;
 
-   array = kmap(desc->page);
+   array = kmap_thread(desc->page);
 
if (*desc->dir_cookie == 0)
status = nfs_readdir_search_for_pos(array, desc);
@@ -359,7 +359,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc)
desc->current_index += array->size;
desc->page_index++;
}
-   kunmap(desc->page);
+   kunmap_thread(desc->page);
return status;
 }
 
@@ -602,10 +602,10 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t 
*desc, struct nfs_entry *en
 
 out_nopages:
if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) {
-   array = kmap(page);
+   array = kmap_thread(page);
array->eof_index = array->size;
status = 0;
-   kunmap(page);
+   kunmap_thread(page);
}
 
put_page(scratch);
@@ -669,7 +669,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t 
*desc, struct page *page,
goto out;
}
 
-   array = kmap(page);
+   array = kmap_thread(page);
 
status = nfs_readdir_alloc_pages(pages, array_size);
if (status < 0)
@@ -691,7 +691,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t 
*desc, struct page *page,
 
nfs_readdir_free_pages(pages, array_size);
 out_release_array:
-   kunmap(page);
+   kunmap_thread(page);
nfs4_label_free(entry.label);
 out:
nfs_free_fattr(entry.fattr);
@@ -803,7 +803,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
struct nfs_cache_array *array = NULL;
struct nfs_open_dir_context *ctx = file->private_data;
 
-   array = kmap(desc->page);
+   array = kmap_thread(desc->page);
for (i = desc->cache_entry_index; i < array->size; i++) {
struct nfs_cache_array_entry *ent;
 
@@ -827,7 +827,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
if (array->eof_index >= 0)
desc->eof = true;
 
-   kunmap(desc->page);
+   kunmap_thread(desc->page);
dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; 
returning = %d\n",
(unsigned long long)*desc->dir_cookie, res);
return res;
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 25/58] fs/reiserfs: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To avoid
the overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Jan Kara 
Cc: "Theodore Ts'o" 
Cc: Randy Dunlap 
Cc: Alex Shi 
Signed-off-by: Ira Weiny 
---
 fs/reiserfs/journal.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index e98f99338f8f..be8f56261e8c 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -4194,11 +4194,11 @@ static int do_journal_end(struct 
reiserfs_transaction_handle *th, int flags)
SB_ONDISK_JOURNAL_SIZE(sb)));
set_buffer_uptodate(tmp_bh);
page = cn->bh->b_page;
-   addr = kmap(page);
+   addr = kmap_thread(page);
memcpy(tmp_bh->b_data,
   addr + offset_in_page(cn->bh->b_data),
   cn->bh->b_size);
-   kunmap(page);
+   kunmap_thread(page);
mark_buffer_dirty(tmp_bh);
jindex++;
set_buffer_journal_dirty(cn->bh);
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 24/58] fs/freevxfs: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To avoid
the overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Christoph Hellwig 
Signed-off-by: Ira Weiny 
---
 fs/freevxfs/vxfs_immed.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c
index bfc780c682fb..9c42fec4cd85 100644
--- a/fs/freevxfs/vxfs_immed.c
+++ b/fs/freevxfs/vxfs_immed.c
@@ -69,9 +69,9 @@ vxfs_immed_readpage(struct file *fp, struct page *pp)
u_int64_t   offset = (u_int64_t)pp->index << PAGE_SHIFT;
caddr_t kaddr;
 
-   kaddr = kmap(pp);
+   kaddr = kmap_thread(pp);
memcpy(kaddr, vip->vii_immed.vi_immed + offset, PAGE_SIZE);
-   kunmap(pp);
+   kunmap_thread(pp);

flush_dcache_page(pp);
SetPageUptodate(pp);
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 20/58] fs/jffs2: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To avoid
the overhead of global PKRS updates, use the new kmap_thread() call.

Cc: David Woodhouse 
Cc: Richard Weinberger 
Signed-off-by: Ira Weiny 
---
 fs/jffs2/file.c | 4 ++--
 fs/jffs2/gc.c   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index f8fb89b10227..3e6d54f9b011 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -88,7 +88,7 @@ static int jffs2_do_readpage_nolock (struct inode *inode, 
struct page *pg)
 
BUG_ON(!PageLocked(pg));
 
-   pg_buf = kmap(pg);
+   pg_buf = kmap_thread(pg);
/* FIXME: Can kmap fail? */
 
ret = jffs2_read_inode_range(c, f, pg_buf, pg->index << PAGE_SHIFT,
@@ -103,7 +103,7 @@ static int jffs2_do_readpage_nolock (struct inode *inode, 
struct page *pg)
}
 
flush_dcache_page(pg);
-   kunmap(pg);
+   kunmap_thread(pg);
 
jffs2_dbg(2, "readpage finished\n");
return ret;
diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c
index 373b3b7c9f44..a7259783ab84 100644
--- a/fs/jffs2/gc.c
+++ b/fs/jffs2/gc.c
@@ -1335,7 +1335,7 @@ static int jffs2_garbage_collect_dnode(struct 
jffs2_sb_info *c, struct jffs2_era
return PTR_ERR(page);
}
 
-   pg_ptr = kmap(page);
+   pg_ptr = kmap_thread(page);
mutex_lock(&f->sem);
 
offset = start;
@@ -1400,7 +1400,7 @@ static int jffs2_garbage_collect_dnode(struct 
jffs2_sb_info *c, struct jffs2_era
}
}
 
-   kunmap(page);
+   kunmap_thread(page);
put_page(page);
return ret;
 }
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 19/58] fs/hfsplus: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To avoid
the overhead of global PKRS updates, use the new kmap_thread() call.

Signed-off-by: Ira Weiny 
---
 fs/hfsplus/bitmap.c |  20 -
 fs/hfsplus/bnode.c  | 102 ++--
 fs/hfsplus/btree.c  |  18 
 3 files changed, 70 insertions(+), 70 deletions(-)

diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index cebce0cfe340..9ec7c1559a0c 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -39,7 +39,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size,
start = size;
goto out;
}
-   pptr = kmap(page);
+   pptr = kmap_thread(page);
curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
i = offset % 32;
offset &= ~(PAGE_CACHE_BITS - 1);
@@ -74,7 +74,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size,
}
curr++;
}
-   kunmap(page);
+   kunmap_thread(page);
offset += PAGE_CACHE_BITS;
if (offset >= size)
break;
@@ -84,7 +84,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size,
start = size;
goto out;
}
-   curr = pptr = kmap(page);
+   curr = pptr = kmap_thread(page);
if ((size ^ offset) / PAGE_CACHE_BITS)
end = pptr + PAGE_CACHE_BITS / 32;
else
@@ -127,7 +127,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size,
len -= 32;
}
set_page_dirty(page);
-   kunmap(page);
+   kunmap_thread(page);
offset += PAGE_CACHE_BITS;
page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS,
 NULL);
@@ -135,7 +135,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size,
start = size;
goto out;
}
-   pptr = kmap(page);
+   pptr = kmap_thread(page);
curr = pptr;
end = pptr + PAGE_CACHE_BITS / 32;
}
@@ -151,7 +151,7 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size,
 done:
*curr = cpu_to_be32(n);
set_page_dirty(page);
-   kunmap(page);
+   kunmap_thread(page);
*max = offset + (curr - pptr) * 32 + i - start;
sbi->free_blocks -= *max;
hfsplus_mark_mdb_dirty(sb);
@@ -185,7 +185,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, 
u32 count)
page = read_mapping_page(mapping, pnr, NULL);
if (IS_ERR(page))
goto kaboom;
-   pptr = kmap(page);
+   pptr = kmap_thread(page);
curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32;
end = pptr + PAGE_CACHE_BITS / 32;
len = count;
@@ -215,11 +215,11 @@ int hfsplus_block_free(struct super_block *sb, u32 
offset, u32 count)
if (!count)
break;
set_page_dirty(page);
-   kunmap(page);
+   kunmap_thread(page);
page = read_mapping_page(mapping, ++pnr, NULL);
if (IS_ERR(page))
goto kaboom;
-   pptr = kmap(page);
+   pptr = kmap_thread(page);
curr = pptr;
end = pptr + PAGE_CACHE_BITS / 32;
}
@@ -231,7 +231,7 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, 
u32 count)
}
 out:
set_page_dirty(page);
-   kunmap(page);
+   kunmap_thread(page);
sbi->free_blocks += len;
hfsplus_mark_mdb_dirty(sb);
mutex_unlock(&sbi->alloc_mutex);
diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c
index 177fae4e6581..62757d92fbbd 100644
--- a/fs/hfsplus/bnode.c
+++ b/fs/hfsplus/bnode.c
@@ -29,14 +29,14 @@ void hfs_bnode_read(struct hfs_bnode *node, void *buf, int 
off, int len)
off &= ~PAGE_MASK;
 
l = min_t(int, len, PAGE_SIZE - off);
-   memcpy(buf, kmap(*pagep) + off, l);
-   kunmap(*pagep);
+   memcpy(buf, kmap_thread(*pagep) + off, l);
+   kunmap_thread(*pagep);
 
while ((len -= l) != 0) {
buf += l;
l = min_t(int, len, PAGE_SIZE);
-   memcpy(buf, kmap(*++pagep), l);
-   kunmap(*pagep);
+   memcpy(buf, kmap_thread(*++pagep), l);
+   kunmap_thread(*pagep);
}
 }
 
@@ -82,16 +82,16 @@ void hfs_bnode_write(struct hfs_bnode *node, void *buf, int 
off, int len)
off &= ~PAGE_MASK;
 
l = min_t(int, len, PAGE_SIZE - off);
-   memcpy(kmap(*pagep) + off, buf, l);
+   memcpy(kmap_thread(*pagep) + off, buf, l);
set_page_dirt

[PATCH RFC PKS/PMEM 22/58] fs/f2fs: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To avoid
the overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Jaegeuk Kim 
Cc: Chao Yu 
Signed-off-by: Ira Weiny 
---
 fs/f2fs/f2fs.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index d9e52a7f3702..ff72a45a577e 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -2410,12 +2410,12 @@ static inline struct page *f2fs_pagecache_get_page(
 
 static inline void f2fs_copy_page(struct page *src, struct page *dst)
 {
-   char *src_kaddr = kmap(src);
-   char *dst_kaddr = kmap(dst);
+   char *src_kaddr = kmap_thread(src);
+   char *dst_kaddr = kmap_thread(dst);
 
memcpy(dst_kaddr, src_kaddr, PAGE_SIZE);
-   kunmap(dst);
-   kunmap(src);
+   kunmap_thread(dst);
+   kunmap_thread(src);
 }
 
 static inline void f2fs_put_page(struct page *page, int unlock)
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 13/58] fs/btrfs: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To avoid
the overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Chris Mason 
Cc: Josef Bacik 
Cc: David Sterba 
Signed-off-by: Ira Weiny 
---
 fs/btrfs/check-integrity.c |  4 ++--
 fs/btrfs/compression.c |  4 ++--
 fs/btrfs/inode.c   | 16 
 fs/btrfs/lzo.c | 24 
 fs/btrfs/raid56.c  | 34 +-
 fs/btrfs/reflink.c |  8 
 fs/btrfs/send.c|  4 ++--
 fs/btrfs/zlib.c| 32 
 fs/btrfs/zstd.c| 20 ++--
 9 files changed, 73 insertions(+), 73 deletions(-)

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 81a8c87a5afb..9e5a02512ab5 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -2706,7 +2706,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
 
bio_for_each_segment(bvec, bio, iter) {
BUG_ON(bvec.bv_len != PAGE_SIZE);
-   mapped_datav[i] = kmap(bvec.bv_page);
+   mapped_datav[i] = kmap_thread(bvec.bv_page);
i++;
 
if (dev_state->state->print_mask &
@@ -2720,7 +2720,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
  bio, &bio_is_patched,
  bio->bi_opf);
bio_for_each_segment(bvec, bio, iter)
-   kunmap(bvec.bv_page);
+   kunmap_thread(bvec.bv_page);
kfree(mapped_datav);
} else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
if (dev_state->state->print_mask &
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 1ab56a734e70..5944fb36d68a 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -1626,7 +1626,7 @@ static void heuristic_collect_sample(struct inode *inode, 
u64 start, u64 end,
curr_sample_pos = 0;
while (index < index_end) {
page = find_get_page(inode->i_mapping, index);
-   in_data = kmap(page);
+   in_data = kmap_thread(page);
/* Handle case where the start is not aligned to PAGE_SIZE */
i = start % PAGE_SIZE;
while (i < PAGE_SIZE - SAMPLING_READ_SIZE) {
@@ -1639,7 +1639,7 @@ static void heuristic_collect_sample(struct inode *inode, 
u64 start, u64 end,
start += SAMPLING_INTERVAL;
curr_sample_pos += SAMPLING_READ_SIZE;
}
-   kunmap(page);
+   kunmap_thread(page);
put_page(page);
 
index++;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9570458aa847..9710a52c6c42 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4603,7 +4603,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t 
from, loff_t len,
if (offset != blocksize) {
if (!len)
len = blocksize - offset;
-   kaddr = kmap(page);
+   kaddr = kmap_thread(page);
if (front)
memset(kaddr + (block_start - page_offset(page)),
0, offset);
@@ -4611,7 +4611,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t 
from, loff_t len,
memset(kaddr + (block_start - page_offset(page)) +  
offset,
0, len);
flush_dcache_page(page);
-   kunmap(page);
+   kunmap_thread(page);
}
ClearPageChecked(page);
set_page_dirty(page);
@@ -6509,9 +6509,9 @@ static noinline int uncompress_inline(struct btrfs_path 
*path,
 */
 
if (max_size + pg_offset < PAGE_SIZE) {
-   char *map = kmap(page);
+   char *map = kmap_thread(page);
memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - 
pg_offset);
-   kunmap(page);
+   kunmap_thread(page);
}
kfree(tmp);
return ret;
@@ -6704,7 +6704,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode 
*inode,
goto out;
}
} else {
-   map = kmap(page);
+   map = kmap_thread(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
   copy_size);
if (pg_offset + copy_size < PAGE_SIZE) {
@@ -6712,7 +6712,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode 
*inode,
   PAGE_SIZE - pg_offset -
   

[PATCH RFC PKS/PMEM 12/58] fs/afs: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To
avoid the overhead of global PKRS updates, use the new kmap_thread()
call.

Cc: David Howells 
Signed-off-by: Ira Weiny 
---
 fs/afs/dir.c  | 16 
 fs/afs/dir_edit.c | 16 
 fs/afs/mntpt.c|  4 ++--
 fs/afs/write.c|  4 ++--
 4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 1d2e61e0ab04..5d01cdb590de 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -127,14 +127,14 @@ static bool afs_dir_check_page(struct afs_vnode *dvnode, 
struct page *page,
qty /= sizeof(union afs_xdr_dir_block);
 
/* check them */
-   dbuf = kmap(page);
+   dbuf = kmap_thread(page);
for (tmp = 0; tmp < qty; tmp++) {
if (dbuf->blocks[tmp].hdr.magic != AFS_DIR_MAGIC) {
printk("kAFS: %s(%lx): bad magic %d/%d is %04hx\n",
   __func__, dvnode->vfs_inode.i_ino, tmp, qty,
   ntohs(dbuf->blocks[tmp].hdr.magic));
trace_afs_dir_check_failed(dvnode, off, i_size);
-   kunmap(page);
+   kunmap_thread(page);
trace_afs_file_error(dvnode, -EIO, 
afs_file_error_dir_bad_magic);
goto error;
}
@@ -146,7 +146,7 @@ static bool afs_dir_check_page(struct afs_vnode *dvnode, 
struct page *page,
((u8 *)>blocks[tmp])[AFS_DIR_BLOCK_SIZE - 1] = 0;
}
 
-   kunmap(page);
+   kunmap_thread(page);
 
 checked:
afs_stat_v(dvnode, n_read_dir);
@@ -177,13 +177,13 @@ static bool afs_dir_check_pages(struct afs_vnode *dvnode, 
struct afs_read *req)
req->pos, req->index, req->nr_pages, req->offset);
 
for (i = 0; i < req->nr_pages; i++) {
-   dbuf = kmap(req->pages[i]);
+   dbuf = kmap_thread(req->pages[i]);
for (j = 0; j < qty; j++) {
union afs_xdr_dir_block *block = >blocks[j];
 
pr_warn("[%02x] %32phN\n", i * qty + j, block);
}
-   kunmap(req->pages[i]);
+   kunmap_thread(req->pages[i]);
}
return false;
 }
@@ -481,7 +481,7 @@ static int afs_dir_iterate(struct inode *dir, struct 
dir_context *ctx,
 
limit = blkoff & ~(PAGE_SIZE - 1);
 
-   dbuf = kmap(page);
+   dbuf = kmap_thread(page);
 
/* deal with the individual blocks stashed on this page */
do {
@@ -489,7 +489,7 @@ static int afs_dir_iterate(struct inode *dir, struct 
dir_context *ctx,
   sizeof(union afs_xdr_dir_block)];
ret = afs_dir_iterate_block(dvnode, ctx, dblock, 
blkoff);
if (ret != 1) {
-   kunmap(page);
+   kunmap_thread(page);
goto out;
}
 
@@ -497,7 +497,7 @@ static int afs_dir_iterate(struct inode *dir, struct 
dir_context *ctx,
 
} while (ctx->pos < dir->i_size && blkoff < limit);
 
-   kunmap(page);
+   kunmap_thread(page);
ret = 0;
}
 
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index b108528bf010..35ed6828e205 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -218,7 +218,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
need_slots = round_up(12 + name->len + 1 + 4, AFS_DIR_DIRENT_SIZE);
need_slots /= AFS_DIR_DIRENT_SIZE;
 
-   meta_page = kmap(page0);
+   meta_page = kmap_thread(page0);
meta = &meta_page->blocks[0];
if (i_size == 0)
goto new_directory;
@@ -247,7 +247,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
set_page_private(page, 1);
SetPagePrivate(page);
}
-   dir_page = kmap(page);
+   dir_page = kmap_thread(page);
}
 
/* Abandon the edit if we got a callback break. */
@@ -284,7 +284,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
 
if (page != page0) {
unlock_page(page);
-   kunmap(page);
+   kunmap_thread(page);
put_page(page);
}
}
@@ -323,7 +323,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode,
afs_set_contig_bits(block, slot, need_slots);
if (page != page0) {
unlock_page(page);
-   kunmap(page);
+   kunmap_thread(page);
put_page(page);
}
 
@@ -337,7 +337,7 @@ void afs_edit_dir_add(struct afs

[PATCH RFC PKS/PMEM 14/58] fs/cifs: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this FS are localized to a single thread.  To avoid
the overhead of global PKRS updates, use the new kmap_thread() call.

Cc: Steve French 
Signed-off-by: Ira Weiny 
---
 fs/cifs/cifsencrypt.c |  6 +++---
 fs/cifs/file.c| 16 
 fs/cifs/smb2ops.c |  8 
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 9daa256f69d4..2f8232d01a56 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -82,17 +82,17 @@ int __cifs_calc_signature(struct smb_rqst *rqst,
 
rqst_page_get_length(rqst, i, &len, &offset);
 
-   kaddr = (char *) kmap(rqst->rq_pages[i]) + offset;
+   kaddr = (char *) kmap_thread(rqst->rq_pages[i]) + offset;
 
rc = crypto_shash_update(shash, kaddr, len);
if (rc) {
cifs_dbg(VFS, "%s: Could not update with payload\n",
 __func__);
-   kunmap(rqst->rq_pages[i]);
+   kunmap_thread(rqst->rq_pages[i]);
return rc;
}
 
-   kunmap(rqst->rq_pages[i]);
+   kunmap_thread(rqst->rq_pages[i]);
}
 
rc = crypto_shash_final(shash, signature);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index be46fab4c96d..6db2caab8852 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2145,17 +2145,17 @@ static int cifs_partialpagewrite(struct page *page, 
unsigned from, unsigned to)
inode = page->mapping->host;
 
offset += (loff_t)from;
-   write_data = kmap(page);
+   write_data = kmap_thread(page);
write_data += from;
 
if ((to > PAGE_SIZE) || (from > to)) {
-   kunmap(page);
+   kunmap_thread(page);
return -EIO;
}
 
/* racing with truncate? */
if (offset > mapping->host->i_size) {
-   kunmap(page);
+   kunmap_thread(page);
return 0; /* don't care */
}
 
@@ -2183,7 +2183,7 @@ static int cifs_partialpagewrite(struct page *page, 
unsigned from, unsigned to)
rc = -EIO;
}
 
-   kunmap(page);
+   kunmap_thread(page);
return rc;
 }
 
@@ -2559,10 +2559,10 @@ static int cifs_write_end(struct file *file, struct 
address_space *mapping,
   known which we might as well leverage */
/* BB check if anything else missing out of ppw
   such as updating last write time */
-   page_data = kmap(page);
+   page_data = kmap_thread(page);
rc = cifs_write(cfile, pid, page_data + offset, copied, );
/* if (rc < 0) should we set writebehind rc? */
-   kunmap(page);
+   kunmap_thread(page);
 
free_xid(xid);
} else {
@@ -4511,7 +4511,7 @@ static int cifs_readpage_worker(struct file *file, struct 
page *page,
if (rc == 0)
goto read_complete;
 
-   read_data = kmap(page);
+   read_data = kmap_thread(page);
/* for reads over a certain size could initiate async read ahead */
 
rc = cifs_read(file, read_data, PAGE_SIZE, poffset);
@@ -4540,7 +4540,7 @@ static int cifs_readpage_worker(struct file *file, struct 
page *page,
rc = 0;
 
 io_error:
-   kunmap(page);
+   kunmap_thread(page);
unlock_page(page);
 
 read_complete:
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 32f90dc82c84..a3e7ebab38b6 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -4068,12 +4068,12 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, 
int num_rqst,
 
rqst_page_get_length(&new_rq[i], j, &len, &offset);
 
-   dst = (char *) kmap(new_rq[i].rq_pages[j]) + offset;
-   src = (char *) kmap(old_rq[i - 1].rq_pages[j]) + offset;
+   dst = (char *) kmap_thread(new_rq[i].rq_pages[j]) + 
offset;
+   src = (char *) kmap_thread(old_rq[i - 1].rq_pages[j]) + 
offset;
 
memcpy(dst, src, len);
-   kunmap(new_rq[i].rq_pages[j]);
-   kunmap(old_rq[i - 1].rq_pages[j]);
+   kunmap_thread(new_rq[i].rq_pages[j]);
+   kunmap_thread(old_rq[i - 1].rq_pages[j]);
}
}
 
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 11/58] drivers/net: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in these drivers are localized to a single thread.  To
avoid the overhead of global PKRS updates, use the new kmap_thread()
call.

Cc: "David S. Miller" 
Cc: Jakub Kicinski 
Cc: Jesse Brandeburg 
Signed-off-by: Ira Weiny 
---
 drivers/net/ethernet/intel/igb/igb_ethtool.c | 4 ++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c 
b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index 6e8231c1ddf0..ac9189752012 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -1794,14 +1794,14 @@ static int igb_check_lbtest_frame(struct igb_rx_buffer 
*rx_buffer,
 
frame_size >>= 1;
 
-   data = kmap(rx_buffer->page);
+   data = kmap_thread(rx_buffer->page);
 
if (data[3] != 0xFF ||
data[frame_size + 10] != 0xBE ||
data[frame_size + 12] != 0xAF)
match = false;
 
-   kunmap(rx_buffer->page);
+   kunmap_thread(rx_buffer->page);
 
return match;
 }
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index 71ec908266a6..7d469425f8b4 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -1963,14 +1963,14 @@ static bool ixgbe_check_lbtest_frame(struct 
ixgbe_rx_buffer *rx_buffer,
 
frame_size >>= 1;
 
-   data = kmap(rx_buffer->page) + rx_buffer->page_offset;
+   data = kmap_thread(rx_buffer->page) + rx_buffer->page_offset;
 
if (data[3] != 0xFF ||
data[frame_size + 10] != 0xBE ||
data[frame_size + 12] != 0xAF)
match = false;
 
-   kunmap(rx_buffer->page);
+   kunmap_thread(rx_buffer->page);
 
return match;
 }
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 07/58] drivers/drbd: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this driver are localized to a single thread.  To
avoid the overhead of global PKRS updates, use the new kmap_thread()
call.

Cc: Jens Axboe 
Signed-off-by: Ira Weiny 
---
 drivers/block/drbd/drbd_main.c |  4 ++--
 drivers/block/drbd/drbd_receiver.c | 12 ++--
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 573dbf6f0c31..f0d0c6b0745e 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1532,9 +1532,9 @@ static int _drbd_no_send_page(struct drbd_peer_device 
*peer_device, struct page
int err;
 
socket = peer_device->connection->data.socket;
-   addr = kmap(page) + offset;
+   addr = kmap_thread(page) + offset;
err = drbd_send_all(peer_device->connection, socket, addr, size, 
msg_flags);
-   kunmap(page);
+   kunmap_thread(page);
if (!err)
peer_device->device->send_cnt += size >> 9;
return err;
diff --git a/drivers/block/drbd/drbd_receiver.c 
b/drivers/block/drbd/drbd_receiver.c
index 422363daa618..4704bc0564e2 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1951,13 +1951,13 @@ read_in_block(struct drbd_peer_device *peer_device, u64 
id, sector_t sector,
page = peer_req->pages;
page_chain_for_each(page) {
unsigned len = min_t(int, ds, PAGE_SIZE);
-   data = kmap(page);
+   data = kmap_thread(page);
err = drbd_recv_all_warn(peer_device->connection, data, len);
if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) {
drbd_err(device, "Fault injection: Corrupting data on 
receive\n");
data[0] = data[0] ^ (unsigned long)-1;
}
-   kunmap(page);
+   kunmap_thread(page);
if (err) {
drbd_free_peer_req(device, peer_req);
return NULL;
@@ -1992,7 +1992,7 @@ static int drbd_drain_block(struct drbd_peer_device 
*peer_device, int data_size)
 
page = drbd_alloc_pages(peer_device, 1, 1);
 
-   data = kmap(page);
+   data = kmap_thread(page);
while (data_size) {
unsigned int len = min_t(int, data_size, PAGE_SIZE);
 
@@ -2001,7 +2001,7 @@ static int drbd_drain_block(struct drbd_peer_device 
*peer_device, int data_size)
break;
data_size -= len;
}
-   kunmap(page);
+   kunmap_thread(page);
drbd_free_pages(peer_device->device, page, 0);
return err;
 }
@@ -2033,10 +2033,10 @@ static int recv_dless_read(struct drbd_peer_device 
*peer_device, struct drbd_req
D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector);
 
bio_for_each_segment(bvec, bio, iter) {
-   void *mapped = kmap(bvec.bv_page) + bvec.bv_offset;
+   void *mapped = kmap_thread(bvec.bv_page) + bvec.bv_offset;
expect = min_t(int, data_size, bvec.bv_len);
err = drbd_recv_all_warn(peer_device->connection, mapped, 
expect);
-   kunmap(bvec.bv_page);
+   kunmap_thread(bvec.bv_page);
if (err)
return err;
data_size -= expect;
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 08/58] drivers/firmware_loader: Utilize new kmap_thread()

2020-10-09 Thread ira . weiny
From: Ira Weiny 

The kmap() calls in this driver are localized to a single thread.  To
avoid the overhead of global PKRS updates, use the new kmap_thread()
call.

Cc: Luis Chamberlain 
Signed-off-by: Ira Weiny 
---
 drivers/base/firmware_loader/fallback.c | 4 ++--
 drivers/base/firmware_loader/main.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/base/firmware_loader/fallback.c 
b/drivers/base/firmware_loader/fallback.c
index 283ca2de76d4..22dea9ba7a37 100644
--- a/drivers/base/firmware_loader/fallback.c
+++ b/drivers/base/firmware_loader/fallback.c
@@ -322,14 +322,14 @@ static void firmware_rw(struct fw_priv *fw_priv, char 
*buffer,
int page_ofs = offset & (PAGE_SIZE-1);
int page_cnt = min_t(size_t, PAGE_SIZE - page_ofs, count);
 
-   page_data = kmap(fw_priv->pages[page_nr]);
+   page_data = kmap_thread(fw_priv->pages[page_nr]);
 
if (read)
memcpy(buffer, page_data + page_ofs, page_cnt);
else
memcpy(page_data + page_ofs, buffer, page_cnt);
 
-   kunmap(fw_priv->pages[page_nr]);
+   kunmap_thread(fw_priv->pages[page_nr]);
buffer += page_cnt;
offset += page_cnt;
count -= page_cnt;
diff --git a/drivers/base/firmware_loader/main.c 
b/drivers/base/firmware_loader/main.c
index 63b9714a0154..cc884c9f8742 100644
--- a/drivers/base/firmware_loader/main.c
+++ b/drivers/base/firmware_loader/main.c
@@ -409,11 +409,11 @@ static int fw_decompress_xz_pages(struct device *dev, 
struct fw_priv *fw_priv,
 
/* decompress onto the new allocated page */
page = fw_priv->pages[fw_priv->nr_pages - 1];
-   xz_buf.out = kmap(page);
+   xz_buf.out = kmap_thread(page);
xz_buf.out_pos = 0;
xz_buf.out_size = PAGE_SIZE;
xz_ret = xz_dec_run(xz_dec, &xz_buf);
-   kunmap(page);
+   kunmap_thread(page);
fw_priv->size += xz_buf.out_pos;
/* partial decompression means either end or error */
if (xz_buf.out_pos != PAGE_SIZE)
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 05/58] kmap: Introduce k[un]map_thread

2020-10-09 Thread ira . weiny
From: Ira Weiny 

To correctly support the semantics of kmap() with Kernel protection keys
(PKS), kmap() may be required to set the protections on multiple
processors (globally).  Enabling PKS globally can be very expensive
depending on the requested operation.  Furthermore, enabling a domain
globally reduces the protection afforded by PKS.

Most kmap() callers (approximately 209 of 229) use the mapping within a single
thread and have no need for the protection domain to be enabled globally.
However, the remaining callers do not follow this pattern and, as best I can
tell, expect the mapping to be 'global' and available to any thread that may
access it.[1]

We don't anticipate global mappings to pmem; however, in general there is a
danger in changing the semantics of kmap().  Doing so would effectively cause
an unresolved page fault with little to no information about why the failure
occurred.

To resolve this a number of options were considered.

1) Attempt to change all the thread local kmap() calls to kmap_atomic()[2]
2) Introduce a flags parameter to kmap() to indicate if the mapping should be
   global or not
3) Change ~20 call sites to 'kmap_global()' to indicate that they require a
   global enablement of the pages.
4) Change ~209 call sites to 'kmap_thread()' to indicate that the mapping is to
   be used within that thread of execution only

Option 1 is simply not feasible.  Option 2 would require all of the call sites
of kmap() to change.  Option 3 seems like a good minimal change but there is a
danger that new code may miss the semantic change of kmap() and not get the
behavior the developer intended.  Therefore, #4 was chosen.

Subsequent patches will convert roughly 90% of the kmap() callers to this new
call, leaving about 10% of the existing callers to enable PKS globally.
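
As an illustration of the intended conversion (this sketch is not part of the
patch; 'page', 'buf' and 'len' are placeholder names), a typical thread-local
caller only swaps its map/unmap calls:

static void copy_from_page_sketch(struct page *page, void *buf, size_t len)
{
        /* was: void *addr = kmap(page); */
        void *addr = kmap_thread(page); /* mapping used only by this thread */

        memcpy(buf, addr, len);

        /* was: kunmap(page); */
        kunmap_thread(page);            /* drops access for this thread only */
}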

Cc: Randy Dunlap 
Signed-off-by: Ira Weiny 
---
 include/linux/highmem.h | 34 ++
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 2a9806e3b8d2..ef7813544719 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -60,7 +60,7 @@ static inline void kmap_flush_tlb(unsigned long addr) { }
 #endif
 
 void *kmap_high(struct page *page);
-static inline void *kmap(struct page *page)
+static inline void *__kmap(struct page *page, bool global)
 {
void *addr;
 
@@ -74,20 +74,20 @@ static inline void *kmap(struct page *page)
 * Even non-highmem pages may have additional access protections which
 * need to be checked and potentially enabled.
 */
-   dev_page_enable_access(page, true);
+   dev_page_enable_access(page, global);
return addr;
 }
 
 void kunmap_high(struct page *page);
 
-static inline void kunmap(struct page *page)
+static inline void __kunmap(struct page *page, bool global)
 {
might_sleep();
/*
 * Even non-highmem pages may have additional access protections which
 * need to be checked and potentially disabled.
 */
-   dev_page_disable_access(page, true);
+   dev_page_disable_access(page, global);
if (!PageHighMem(page))
return;
kunmap_high(page);
@@ -160,10 +160,10 @@ static inline struct page *kmap_to_page(void *addr)
 
 static inline unsigned long totalhigh_pages(void) { return 0UL; }
 
-static inline void *kmap(struct page *page)
+static inline void *__kmap(struct page *page, bool global)
 {
might_sleep();
-   dev_page_enable_access(page, true);
+   dev_page_enable_access(page, global);
return page_address(page);
 }
 
@@ -171,9 +171,9 @@ static inline void kunmap_high(struct page *page)
 {
 }
 
-static inline void kunmap(struct page *page)
+static inline void __kunmap(struct page *page, bool global)
 {
-   dev_page_disable_access(page, true);
+   dev_page_disable_access(page, global);
 #ifdef ARCH_HAS_FLUSH_ON_KUNMAP
kunmap_flush_on_unmap(page_address(page));
 #endif
@@ -238,6 +238,24 @@ static inline void kmap_atomic_idx_pop(void)
 
 #endif
 
+static inline void *kmap(struct page *page)
+{
+   return __kmap(page, true);
+}
+static inline void kunmap(struct page *page)
+{
+   __kunmap(page, true);
+}
+
+static inline void *kmap_thread(struct page *page)
+{
+   return __kmap(page, false);
+}
+static inline void kunmap_thread(struct page *page)
+{
+   __kunmap(page, false);
+}
+
 /*
  * Prevent people trying to call kunmap_atomic() as if it were kunmap()
  * kunmap_atomic() should get the return value of kmap_atomic, not the page.
-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/PMEM 04/58] kmap: Add stray access protection for device pages

2020-10-09 Thread ira . weiny
From: Ira Weiny 

Device managed pages may have additional protections.  These protections
need to be removed prior to valid use by kernel users.

Check for special treatment of device managed pages in kmap and take
action if needed.  We use kmap as an interface for generic kernel code
because under normal circumstances it would be a bug for general kernel
code to not use kmap prior to accessing kernel memory.  Therefore, this
should allow any valid kernel users to seamlessly use these pages
without issues.

Because of the critical nature of kmap(), it must be pointed out that the
overhead on regular DRAM is carefully kept as low as possible.  Furthermore,
the underlying MSR write required on protected device pages is cheaper than a
typical MSR write.

Specifically, WRMSR(MSR_IA32_PKRS) is not serializing but still
maintains ordering properties similar to WRPKRU.  The current SDM
section on PKRS needs updating but should be the same as that of WRPKRU.
So to quote from the WRPKRU text:

WRPKRU will never execute speculatively. Memory accesses
affected by PKRU register will not execute (even speculatively)
until all prior executions of WRPKRU have completed execution
and updated the PKRU register.

Still, this will make accessing pmem more expensive from the kernel, but the
overhead is minimized, and many pmem users access this memory through user
page mappings, which are not affected at all.
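
Existing call sites are unchanged; as a sketch (the function and variable
names below are illustrative, not from this patch), the usual mapping pattern
now picks up the protection toggle inside the helpers themselves:

static void read_dev_page_sketch(struct page *page, void *dst, size_t len)
{
        void *src = kmap_atomic(page);  /* dev_page_enable_access(page, false) */

        memcpy(dst, src, len);

        kunmap_atomic(src);             /* dev_page_disable_access(..., false) */
}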

Cc: Randy Dunlap 
Signed-off-by: Ira Weiny 
---
 include/linux/highmem.h | 32 +++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 14e6202ce47f..2a9806e3b8d2 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -31,6 +32,20 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size)
 
 #include 
 
+static inline void dev_page_enable_access(struct page *page, bool global)
+{
+   if (!page_is_access_protected(page))
+   return;
+   dev_access_enable(global);
+}
+
+static inline void dev_page_disable_access(struct page *page, bool global)
+{
+   if (!page_is_access_protected(page))
+   return;
+   dev_access_disable(global);
+}
+
 #ifdef CONFIG_HIGHMEM
 extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot);
 extern void kunmap_atomic_high(void *kvaddr);
@@ -55,6 +70,11 @@ static inline void *kmap(struct page *page)
else
addr = kmap_high(page);
kmap_flush_tlb((unsigned long)addr);
+   /*
+* Even non-highmem pages may have additional access protections which
+* need to be checked and potentially enabled.
+*/
+   dev_page_enable_access(page, true);
return addr;
 }
 
@@ -63,6 +83,11 @@ void kunmap_high(struct page *page);
 static inline void kunmap(struct page *page)
 {
might_sleep();
+   /*
+* Even non-highmem pages may have additional access protections which
+* need to be checked and potentially disabled.
+*/
+   dev_page_disable_access(page, true);
if (!PageHighMem(page))
return;
kunmap_high(page);
@@ -85,6 +110,7 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot)
 {
preempt_disable();
pagefault_disable();
+   dev_page_enable_access(page, false);
if (!PageHighMem(page))
return page_address(page);
return kmap_atomic_high_prot(page, prot);
@@ -137,6 +163,7 @@ static inline unsigned long totalhigh_pages(void) { return 0UL; }
 static inline void *kmap(struct page *page)
 {
might_sleep();
+   dev_page_enable_access(page, true);
return page_address(page);
 }
 
@@ -146,6 +173,7 @@ static inline void kunmap_high(struct page *page)
 
 static inline void kunmap(struct page *page)
 {
+   dev_page_disable_access(page, true);
 #ifdef ARCH_HAS_FLUSH_ON_KUNMAP
kunmap_flush_on_unmap(page_address(page));
 #endif
@@ -155,6 +183,7 @@ static inline void *kmap_atomic(struct page *page)
 {
preempt_disable();
pagefault_disable();
+   dev_page_enable_access(page, false);
return page_address(page);
 }
 #define kmap_atomic_prot(page, prot)   kmap_atomic(page)
@@ -216,7 +245,8 @@ static inline void kmap_atomic_idx_pop(void)
 #define kunmap_atomic(addr) \
 do {\
BUILD_BUG_ON(__same_type((addr), struct page *));   \
-   kunmap_atomic_high(addr);  \
+   dev_page_disable_access(kmap_to_page(addr), false); \
+   kunmap_atomic_high(addr);   \
pagefault_enable(); \
preempt_enable();   \
 } while (0)
-- 
2.28.0

[PATCH RFC PKS/PMEM 03/58] memremap: Add zone device access protection

2020-10-09 Thread ira . weiny
From: Ira Weiny 

Device managed memory exposes itself to the kernel direct map which
allows stray pointers to access these device memories.

Stray pointers to normal memory may result in a crash or other undesirable
behavior which, while unfortunate, is usually recoverable with a reboot.
Stray accesses, specifically stray writes, to areas such as non-volatile
memory are permanent in nature and are thus more likely to result in permanent
user data loss than stray accesses to other memory areas.

Furthermore, protecting against reads can also help guard against speculative
reads of poisoned areas, though this is a secondary reason.

Set up an infrastructure for extra device access protection. Then
implement the new protection using the new Protection Keys Supervisor
(PKS) on architectures which support it.

To enable this extra protection, devices specify a flag in the pgmap to
indicate that these areas wish to use additional protection.

Kernel code which intends to access this memory can do so automatically
through the use of the kmap infrastructure, which calls into
dev_access_[enable|disable]() as described here.  The kmap infrastructure
changes are implemented in a follow-on patch.

In addition, users can directly enable/disable the access through
dev_access_[enable|disable]() if they have a priori knowledge of the
type of pages they are accessing.
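
For example (a sketch, not code from this series; the helper and variable
names are illustrative), a caller that knows it is about to touch a run of
protected pmem pages can bracket the whole batch itself instead of relying on
per-page kmap() toggles:

static void copy_pmem_pages_sketch(struct page **pages, unsigned int nr,
                                   void *dst)
{
        unsigned int i;

        dev_access_enable(false);       /* false == thread-local; one PKRS update */
        for (i = 0; i < nr; i++)
                memcpy(dst + i * PAGE_SIZE, page_address(pages[i]), PAGE_SIZE);
        dev_access_disable(false);      /* protection restored */
}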

All calls to enable/disable protection flow through
dev_access_[enable|disable]() and are nestable by the use of a per-task
reference count.  This reference count does two things.

1) Allows a thread to nest calls to disable protection such that the
   first call to re-enable protection does not 'break' the last access of
   the pmem device memory.

2) Provides faster performance by avoiding lots of MSR writes.  For
   example, looping over a sequence of pmem pages.
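
As a sketch of how the nesting behaves (illustrative only, not code from this
patch), a kmap_thread() call inside an already-enabled region only bumps the
count; protection is re-enabled when the outermost disable drops it back to
zero:

        void *addr;

        dev_access_enable(false);       /* ref 0 -> 1: protection disabled   */
        addr = kmap_thread(page);       /* ref 1 -> 2: nested, count only    */
        memset(addr, 0, PAGE_SIZE);
        kunmap_thread(page);            /* ref 2 -> 1: still disabled        */
        dev_access_disable(false);      /* ref 1 -> 0: protection re-enabled */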

In addition, we must ensure the reference count is preserved through an
exception, so we add the count to irqentry_state_t and save/restore it,
giving exceptions their own count should they use a kmap call.

The following shows how this works through an exception:

	...
		// ref == 0
		dev_access_enable()  // ref += 1 ==> disable protection
			irq()
				// enable protection
				// ref = 0
				_handler()
					dev_access_enable()   // ref += 1 ==> disable protection
					dev_access_disable()  // ref -= 1 ==> enable protection
				// WARN_ON(ref != 0)
				// disable protection
		do_pmem_thing()  // all good here
		dev_access_disable() // ref -= 1 ==> 0 ==> enable protection
	...

Nested exceptions operate the same way with each exception storing the
interrupted exception state all the way down.

The pkey value is never freed, as this optimizes the implementation to be
either on or off using a static branch conditional in the fast paths.
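
Roughly (this is an illustration only, not the code in this patch; the static
key and helper names are made up), the fast path looks like:

DEFINE_STATIC_KEY_FALSE(dev_pks_enabled_key);   /* flipped once, never cleared */

static void dev_access_enable_sketch(bool global)
{
        if (!static_branch_unlikely(&dev_pks_enabled_key))
                return;         /* no protected pgmaps registered: no-op */

        if (!current->dev_page_access_ref++ || global) {
                /* first (or global) enable: write PKRS for the device pkey */
        }
}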

Cc: Juri Lelli 
Cc: Vincent Guittot 
Cc: Dietmar Eggemann 
Cc: Steven Rostedt 
Cc: Ben Segall 
Cc: Mel Gorman 
Signed-off-by: Ira Weiny 
---
 arch/x86/entry/common.c  | 21 +
 include/linux/entry-common.h |  3 ++
 include/linux/memremap.h |  1 +
 include/linux/mm.h   | 43 +
 include/linux/sched.h|  3 ++
 init/init_task.c |  3 ++
 kernel/fork.c|  3 ++
 mm/Kconfig   | 13 ++
 mm/memremap.c| 90 
 9 files changed, 180 insertions(+)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 86ad32e0095e..3680724c1a4d 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -264,12 +264,27 @@ noinstr void idtentry_exit_nmi(struct pt_regs *regs, irqentry_state_t *irq_state
  *
  * NOTE That the thread saved PKRS must be preserved separately to ensure
  * global overrides do not 'stick' on a thread.
+ *
+ * Furthermore, Zone Device Access Protection maintains access in a re-entrant
+ * manner through a reference count which also needs to be maintained should
+ * exception handlers use those interfaces for memory access.  Here we start
+ * off the exception handler ref count to 0 and ensure it is 0 when the
+ * exception is done.  Then restore it for the interrupted task.
  */
 noinstr void irq_save_pkrs(irqentry_state_t *state)
 {
if (!cpu_feature_enabled(X86_FEATURE_PKS))
return;
 
+#ifdef CONFIG_ZONE_DEVICE_ACCESS_PROTECTION
+   /*
+* Save the ref count of the current running process and set it to 0
+* for any irq users to properly track re-entrance
+*/
+   state->pkrs_ref = current->dev_page_access_ref;
+   current->dev_page_access_ref = 0;
+#endif
+
/*
 * The thread_pkrs must be maintained separately to prevent global
 * overrides from 'sticking' on a thread.

[PATCH RFC PKS/Trusted keys 0/2] trusted keys: Add PKS protection to trusted keys

2020-10-09 Thread ira . weiny
From: Ira Weiny 

Leaking a trusted key would be a critical security issue.  PKS provides an
additional mechanism to restrict access to the memory holding trusted keys.

This series depends on the core patches and PMEM PKS API change submitted
separately.

Core PKS support:

https://lore.kernel.org/lkml/20201009194258.3207172-1-ira.we...@intel.com/

PKS/PMEM support (includes global API change):

https://lore.kernel.org/lkml/20201009195033.3208459-1-ira.we...@intel.com/

And contained in the git tree here:

https://github.com/weiny2/linux-kernel/tree/pks-rfc-v3


Provide a skeleton of a new allocation call which creates a PKS-restricted
mapping of the trusted key memory.  Allocate a PKS domain (pkey), create a
mapping with that key, and enable/disable access as needed to that mapping.
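
Roughly, the consumer-side pattern (a sketch based on the patches below; the
function and variable names other than the enable/disable helpers are
illustrative) looks like:

static void use_trusted_key_sketch(struct trusted_key_payload *payload,
                                   u8 *out, unsigned int len)
{
        trusted_key_enable_access();    /* open the PKS 'domain' for this thread/CPU */
        memcpy(out, payload->key, min(len, payload->key_len));
        trusted_key_disable_access();   /* close it again */
}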

The issue with this approach is that it fails to protect the direct mapping.
The current ideas to protect the direct mapping are:

1) Do nothing.
2) Allow the direct map to be fragmented through a set_memory_pks()-like call.
3) Piggy back on secretmem's solution to map out some direct map memory
   then overlay that with PKS.
4) Integrate PKS into secretmem and use this enhanced secretmem for
   trusted keys.

Doing nothing does not really provide the level of security we need for this
use case.

Allowing the direct map to fragment may be OK, as trusted keys don't use a lot
of pages and are usually allocated early in system boot, but that is not
always the case.  In addition, the current code could be changed to not
allocate an entire page for each key, limiting the number of pages, and
therefore the fragmentation, needed.

The use of secretmem is complicated by its newness, but whatever solution is
used there should be used here as well, probably through some nice interface.

The final thought is to determine whether a 'general allocator' for
PKS-protected memory should be developed at all.  The current implementation
is limited in key space, so a higher bar to entry may be a good thing.  On the
other hand, dealing with mappings is complicated for the average driver
writer, and doing this wrong could result in a false sense of 'security'.


Elena Reshetova (1):
  keys/trusted: protect trusted keys using PKS

Ira Weiny (1):
  vmalloc: Add vmalloc_pks() call

 Documentation/core-api/protection-keys.rst |  4 +
 include/keys/trusted-type.h|  2 +-
 include/keys/trusted_tpm.h | 15 
 include/linux/vmalloc.h|  1 +
 mm/vmalloc.c   | 28 +++
 security/keys/encrypted-keys/encrypted.c   | 38 ++---
 security/keys/trusted-keys/trusted_tpm1.c  | 90 +++---
 security/keys/trusted-keys/trusted_tpm2.c  |  9 +++
 8 files changed, 164 insertions(+), 23 deletions(-)

-- 
2.28.0.rc0.12.gb6a658bd00c9



[PATCH RFC PKS/Trusted keys 2/2] keys/trusted: protect trusted keys using PKS

2020-10-09 Thread ira . weiny
From: Elena Reshetova 

Enable additional protection for trusted keys against arbitrary reads using
the Protection Keys Supervisor (PKS) feature on architectures which support
it.  The PKS feature allows different 'domains' of page mappings which have an
extra level of protection beyond that specified in the supervisor page table
entries.  Pages can be placed into such a 'domain' upon allocation, and access
to them is enforced at per-thread, per-CPU granularity.

The PKS protection 'domain' is acquired upon TPM initialization and, if
successful, will be used to limit access (both read and write) to the pages
containing the trusted key plaintext buffer.  When a legitimate operation
needs to be performed on this buffer, access to it is temporarily enabled for
a particular thread on a particular CPU.

This is not a perfect solution, but a first step towards providing additional
protection for the trusted keys residing in kernel memory.  Currently, it
still does not prevent direct-map alias attacks, i.e. when the page with the
key content is accessed directly via the kernel direct map.  In order to
address this concern, future work could utilize PKS in secretmem[1] and/or
secretmem in trusted keys.  For this patch, we are focused on just using PKS
to remove the non-direct-map alias access to this key data when not needed.

In order to limit access to the pages containing the trusted key
plaintext buffer, trusted_key_enable/disable_access functions are used.
The "disable" function removes access (both read and write) to the pages
allocated for the trusted keys PKS 'domain' on a given CPU and for the
given thread, and the "enable" one enables read and write access on a
given CPU and for the given thread. By default the access to the trusted
keys PKS 'domain' is disabled for all threads on all CPUs.

[1] https://lore.kernel.org/linux-mm/20200924132904.1391-1-r...@kernel.org/

Signed-off-by: Elena Reshetova 
Signed-off-by: Ira Weiny 
---
 include/keys/trusted-type.h   |  2 +-
 include/keys/trusted_tpm.h| 15 
 security/keys/encrypted-keys/encrypted.c  | 38 +++---
 security/keys/trusted-keys/trusted_tpm1.c | 90 ---
 security/keys/trusted-keys/trusted_tpm2.c |  9 +++
 5 files changed, 131 insertions(+), 23 deletions(-)

diff --git a/include/keys/trusted-type.h b/include/keys/trusted-type.h
index a94c03a61d8f..826525e22b45 100644
--- a/include/keys/trusted-type.h
+++ b/include/keys/trusted-type.h
@@ -22,7 +22,7 @@ struct trusted_key_payload {
unsigned int key_len;
unsigned int blob_len;
unsigned char migratable;
-   unsigned char key[MAX_KEY_SIZE + 1];
+   unsigned char  *key;
unsigned char blob[MAX_BLOB_SIZE];
 };
 
diff --git a/include/keys/trusted_tpm.h b/include/keys/trusted_tpm.h
index a56d8e1298f2..5aa254c70820 100644
--- a/include/keys/trusted_tpm.h
+++ b/include/keys/trusted_tpm.h
@@ -47,6 +47,21 @@ int tpm2_unseal_trusted(struct tpm_chip *chip,
struct trusted_key_payload *payload,
struct trusted_key_options *options);
 
+#if defined(CONFIG_TRUSTED_KEYS) || \
+	(defined(CONFIG_TRUSTED_KEYS_MODULE) && defined(CONFIG_ENCRYPTED_KEYS_MODULE))
+void trusted_key_enable_access(void);
+void trusted_key_disable_access(void);
+#else
+static inline void trusted_key_enable_access(void)
+{
+
+}
+static inline void trusted_key_disable_access(void)
+{
+
+}
+#endif
+
 #define TPM_DEBUG 0
 
 #if TPM_DEBUG
diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c
index 192e531c146f..d996e9fd2dec 100644
--- a/security/keys/encrypted-keys/encrypted.c
+++ b/security/keys/encrypted-keys/encrypted.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -344,10 +345,12 @@ static int calc_hmac(u8 *digest, const u8 *key, unsigned int keylen,
 }
 
 enum derived_key_type { ENC_KEY, AUTH_KEY };
+enum master_key_type { TRUSTED_KEY, USER_KEY };
 
 /* Derive authentication/encryption key from trusted key */
 static int get_derived_key(u8 *derived_key, enum derived_key_type key_type,
-  const u8 *master_key, size_t master_keylen)
+  const u8 *master_key, size_t master_keylen,
+  enum master_key_type m_key_type)
 {
u8 *derived_buf;
unsigned int derived_buf_len;
@@ -366,8 +369,13 @@ static int get_derived_key(u8 *derived_key, enum derived_key_type key_type,
else
strcpy(derived_buf, "ENC_KEY");
 
+   if (unlikely(m_key_type == TRUSTED_KEY))
+   trusted_key_enable_access();
memcpy(derived_buf + strlen(derived_buf) + 1, master_key,
   master_keylen);
+   if (unlikely(m_key_type == TRUSTED_KEY))
+   trusted_key_disable_access();
+
ret = crypto_shash_tfm_digest(
