Re: [PATCH v4 0/3] cgroup: New misc cgroup controller

2021-04-04 Thread Vipin Sharma
On Sun, Apr 4, 2021 at 10:35 AM Tejun Heo  wrote:
>
> Applied to cgroup/for-5.13. If there are further issues, let's address them
> incrementally.
>
> Thanks.
>
> --
> tejun

Thanks Tejun for accepting and guiding through each version of this
patch series.


[PATCH v4 3/3] svm/sev: Register SEV and SEV-ES ASIDs to the misc controller

2021-03-29 Thread Vipin Sharma
Secure Encrypted Virtualization (SEV) and Secure Encrypted
Virtualization - Encrypted State (SEV-ES) ASIDs are used to encrypt KVMs
on AMD platform. These ASIDs are available in the limited quantities on
a host.

Register their capacity and usage to the misc controller for tracking
via cgroups.

Signed-off-by: Vipin Sharma 
Reviewed-by: David Rientjes 
---
 arch/x86/kvm/svm/sev.c  | 70 +++--
 arch/x86/kvm/svm/svm.h  |  1 +
 include/linux/misc_cgroup.h |  6 
 kernel/cgroup/misc.c|  6 
 4 files changed, 73 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 874ea309279f..214eefb20414 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -28,6 +29,21 @@
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 
+#ifndef CONFIG_KVM_AMD_SEV
+/*
+ * When this config is not defined, SEV feature is not supported and APIs in
+ * this file are not used but this file still gets compiled into the KVM AMD
+ * module.
+ *
+ * We will not have MISC_CG_RES_SEV and MISC_CG_RES_SEV_ES entries in the enum
+ * misc_res_type {} defined in linux/misc_cgroup.h.
+ *
+ * Below macros allow compilation to succeed.
+ */
+#define MISC_CG_RES_SEV MISC_CG_RES_TYPES
+#define MISC_CG_RES_SEV_ES MISC_CG_RES_TYPES
+#endif
+
 static u8 sev_enc_bit;
 static int sev_flush_asids(void);
 static DECLARE_RWSEM(sev_deactivate_lock);
@@ -89,8 +105,19 @@ static bool __sev_recycle_asids(int min_asid, int max_asid)
 
 static int sev_asid_new(struct kvm_sev_info *sev)
 {
-   int pos, min_asid, max_asid;
+   int pos, min_asid, max_asid, ret;
bool retry = true;
+   enum misc_res_type type;
+
+   type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+   WARN_ON(sev->misc_cg);
+   sev->misc_cg = get_current_misc_cg();
+   ret = misc_cg_try_charge(type, sev->misc_cg, 1);
+   if (ret) {
+   put_misc_cg(sev->misc_cg);
+   sev->misc_cg = NULL;
+   return ret;
+   }
 
mutex_lock(_bitmap_lock);
 
@@ -108,7 +135,8 @@ static int sev_asid_new(struct kvm_sev_info *sev)
goto again;
}
mutex_unlock(_bitmap_lock);
-   return -EBUSY;
+   ret = -EBUSY;
+   goto e_uncharge;
}
 
__set_bit(pos, sev_asid_bitmap);
@@ -116,6 +144,11 @@ static int sev_asid_new(struct kvm_sev_info *sev)
mutex_unlock(_bitmap_lock);
 
return pos + 1;
+e_uncharge:
+   misc_cg_uncharge(type, sev->misc_cg, 1);
+   put_misc_cg(sev->misc_cg);
+   sev->misc_cg = NULL;
+   return ret;
 }
 
 static int sev_get_asid(struct kvm *kvm)
@@ -125,14 +158,15 @@ static int sev_get_asid(struct kvm *kvm)
return sev->asid;
 }
 
-static void sev_asid_free(int asid)
+static void sev_asid_free(struct kvm_sev_info *sev)
 {
struct svm_cpu_data *sd;
int cpu, pos;
+   enum misc_res_type type;
 
mutex_lock(_bitmap_lock);
 
-   pos = asid - 1;
+   pos = sev->asid - 1;
__set_bit(pos, sev_reclaim_asid_bitmap);
 
for_each_possible_cpu(cpu) {
@@ -141,6 +175,11 @@ static void sev_asid_free(int asid)
}
 
mutex_unlock(_bitmap_lock);
+
+   type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+   misc_cg_uncharge(type, sev->misc_cg, 1);
+   put_misc_cg(sev->misc_cg);
+   sev->misc_cg = NULL;
 }
 
 static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
@@ -188,19 +227,20 @@ static int sev_guest_init(struct kvm *kvm, struct 
kvm_sev_cmd *argp)
asid = sev_asid_new(sev);
if (asid < 0)
return ret;
+   sev->asid = asid;
 
ret = sev_platform_init(>error);
if (ret)
goto e_free;
 
sev->active = true;
-   sev->asid = asid;
INIT_LIST_HEAD(>regions_list);
 
return 0;
 
 e_free:
-   sev_asid_free(asid);
+   sev_asid_free(sev);
+   sev->asid = 0;
return ret;
 }
 
@@ -1315,12 +1355,12 @@ void sev_vm_destroy(struct kvm *kvm)
mutex_unlock(>lock);
 
sev_unbind_asid(kvm, sev->handle);
-   sev_asid_free(sev->asid);
+   sev_asid_free(sev);
 }
 
 void __init sev_hardware_setup(void)
 {
-   unsigned int eax, ebx, ecx, edx;
+   unsigned int eax, ebx, ecx, edx, sev_asid_count, sev_es_asid_count;
bool sev_es_supported = false;
bool sev_supported = false;
 
@@ -1352,7 +1392,11 @@ void __init sev_hardware_setup(void)
if (!sev_reclaim_asid_bitmap)
goto out;
 
-   pr_info("SEV supported: %u ASIDs\n", max_sev_asid - min_sev_asid + 1);
+   sev_asid_count = max_sev_asid - min_sev_asid + 1;
+   if (misc_cg_set_capacity(MISC_CG_RES_SEV, se

[PATCH v4 1/3] cgroup: Add misc cgroup controller

2021-03-29 Thread Vipin Sharma
The Miscellaneous cgroup provides the resource limiting and tracking
mechanism for the scalar resources which cannot be abstracted like the
other cgroup resources. Controller is enabled by the CONFIG_CGROUP_MISC
config option.

A resource can be added to the controller via enum misc_res_type{} in
the include/linux/misc_cgroup.h file and the corresponding name via
misc_res_name[] in the kernel/cgroup/misc.c file. Provider of the
resource must set its capacity prior to using the resource by calling
misc_cg_set_capacity().

Once a capacity is set then the resource usage can be updated using
charge and uncharge APIs. All of the APIs to interact with misc
controller are in include/linux/misc_cgroup.h.

Miscellaneous controller provides 3 interface files. If two misc
resources (res_a and res_b) are registered then:

misc.capacity
A read-only flat-keyed file shown only in the root cgroup.  It shows
miscellaneous scalar resources available on the platform along with
their quantities::

$ cat misc.capacity
res_a 50
res_b 10

misc.current
A read-only flat-keyed file shown in the non-root cgroups.  It shows
the current usage of the resources in the cgroup and its children::

$ cat misc.current
res_a 3
res_b 0

misc.max
A read-write flat-keyed file shown in the non root cgroups. Allowed
maximum usage of the resources in the cgroup and its children.::

$ cat misc.max
res_a max
res_b 4

Limit can be set by::

# echo res_a 1 > misc.max

Limit can be set to max by::

# echo res_a max > misc.max

Limits can be set more than the capacity value in the misc.capacity
file.

Signed-off-by: Vipin Sharma 
Reviewed-by: David Rientjes 
---
 include/linux/cgroup_subsys.h |   4 +
 include/linux/misc_cgroup.h   | 126 +++
 init/Kconfig  |  14 ++
 kernel/cgroup/Makefile|   1 +
 kernel/cgroup/misc.c  | 401 ++
 5 files changed, 546 insertions(+)
 create mode 100644 include/linux/misc_cgroup.h
 create mode 100644 kernel/cgroup/misc.c

diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index acb77dcff3b4..445235487230 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -61,6 +61,10 @@ SUBSYS(pids)
 SUBSYS(rdma)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_MISC)
+SUBSYS(misc)
+#endif
+
 /*
  * The following subsystems are not supported on the default hierarchy.
  */
diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h
new file mode 100644
index ..1195d36558b4
--- /dev/null
+++ b/include/linux/misc_cgroup.h
@@ -0,0 +1,126 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Miscellaneous cgroup controller.
+ *
+ * Copyright 2020 Google LLC
+ * Author: Vipin Sharma 
+ */
+#ifndef _MISC_CGROUP_H_
+#define _MISC_CGROUP_H_
+
+/**
+ * Types of misc cgroup entries supported by the host.
+ */
+enum misc_res_type {
+   MISC_CG_RES_TYPES
+};
+
+struct misc_cg;
+
+#ifdef CONFIG_CGROUP_MISC
+
+#include 
+
+/**
+ * struct misc_res: Per cgroup per misc type resource
+ * @max: Maximum limit on the resource.
+ * @usage: Current usage of the resource.
+ * @failed: True if charged failed for the resource in a cgroup.
+ */
+struct misc_res {
+   unsigned long max;
+   atomic_long_t usage;
+   bool failed;
+};
+
+/**
+ * struct misc_cg - Miscellaneous controller's cgroup structure.
+ * @css: cgroup subsys state object.
+ * @res: Array of misc resources usage in the cgroup.
+ */
+struct misc_cg {
+   struct cgroup_subsys_state css;
+   struct misc_res res[MISC_CG_RES_TYPES];
+};
+
+unsigned long misc_cg_res_total_usage(enum misc_res_type type);
+int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity);
+int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
+  unsigned long amount);
+void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg,
+ unsigned long amount);
+
+/**
+ * css_misc() - Get misc cgroup from the css.
+ * @css: cgroup subsys state object.
+ *
+ * Context: Any context.
+ * Return:
+ * * %NULL - If @css is null.
+ * * struct misc_cg* - misc cgroup pointer of the passed css.
+ */
+static inline struct misc_cg *css_misc(struct cgroup_subsys_state *css)
+{
+   return css ? container_of(css, struct misc_cg, css) : NULL;
+}
+
+/*
+ * get_current_misc_cg() - Find and get the misc cgroup of the current task.
+ *
+ * Returned cgroup has its ref count increased by 1. Caller must call
+ * put_misc_cg() to return the reference.
+ *
+ * Return: Misc cgroup to which the current task belongs to.
+ */
+static inline struct misc_cg *get_current_misc_cg(void)
+{
+   return css_misc(task_get_css(current, misc_cgrp_id));
+}
+
+/*
+ * put_misc_cg() - Put the misc cgroup and reduce its ref count.
+ * @cg - cgroup to put.
+ */
+static inline void put_misc_cg(struct misc_cg *cg)
+{
+   if (cg)
+   css_put(>css);
+}
+
+#else /* !CONFIG_CG

[PATCH v4 2/3] cgroup: Miscellaneous cgroup documentation.

2021-03-29 Thread Vipin Sharma
Documentation of miscellaneous cgroup controller. This new controller is
used to track and limit the usage of scalar resources.

Signed-off-by: Vipin Sharma 
Reviewed-by: David Rientjes 
---
 Documentation/admin-guide/cgroup-v1/index.rst |  1 +
 Documentation/admin-guide/cgroup-v1/misc.rst  |  4 +
 Documentation/admin-guide/cgroup-v2.rst   | 73 ++-
 3 files changed, 76 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/admin-guide/cgroup-v1/misc.rst

diff --git a/Documentation/admin-guide/cgroup-v1/index.rst 
b/Documentation/admin-guide/cgroup-v1/index.rst
index 226f64473e8e..99fbc8a64ba9 100644
--- a/Documentation/admin-guide/cgroup-v1/index.rst
+++ b/Documentation/admin-guide/cgroup-v1/index.rst
@@ -17,6 +17,7 @@ Control Groups version 1
 hugetlb
 memcg_test
 memory
+misc
 net_cls
 net_prio
 pids
diff --git a/Documentation/admin-guide/cgroup-v1/misc.rst 
b/Documentation/admin-guide/cgroup-v1/misc.rst
new file mode 100644
index ..661614c24df3
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/misc.rst
@@ -0,0 +1,4 @@
+===
+Misc controller
+===
+Please refer "Misc" documentation in Documentation/admin-guide/cgroup-v2.rst
diff --git a/Documentation/admin-guide/cgroup-v2.rst 
b/Documentation/admin-guide/cgroup-v2.rst
index 64c62b979f2f..b1e81aa8598a 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -65,8 +65,11 @@ v1 is available under 
:ref:`Documentation/admin-guide/cgroup-v1/index.rst  misc.max
+
+   Limit can be set to max by::
+
+ # echo res_a max > misc.max
+
+Limits can be set higher than the capacity value in the misc.capacity
+file.
+
+Migration and Ownership
+~~~
+
+A miscellaneous scalar resource is charged to the cgroup in which it is used
+first, and stays charged to that cgroup until that resource is freed. Migrating
+a process to a different cgroup does not move the charge to the destination
+cgroup where the process has moved.
+
+Others
+--
+
 perf_event
 ~~
 
-- 
2.31.0.291.g576ba9dcdaf-goog



[PATCH v4 0/3] cgroup: New misc cgroup controller

2021-03-29 Thread Vipin Sharma
Hello,

This patch series is creating a new misc cgroup controller for limiting
and tracking of resources which are not abstract like other cgroup
controllers.

This controller was initially proposed as encryption_id but after the
feedbacks and use cases for other resources, it is now changed to misc
cgroup.
https://lore.kernel.org/lkml/20210108012846.4134815-2-vipi...@google.com/

Most of the cloud infrastructure use cgroups for knowing the host state,
track the resources usage, enforce limits on them, etc. They use this
info to optimize work allocation in the fleet and make sure no rogue job
consumes more than it needs and starves others.

There are resources on a system which are not abstract enough like other
cgroup controllers and are available in a limited quantity on a host.

One of them is Secure Encrypted Virtualization (SEV) ASID on AMD CPU.
SEV ASIDs are used for creating encrypted VMs. SEV is mostly be used by
the cloud providers for providing confidential VMs. Since SEV ASIDs are
limited, there is a need to schedule encrypted VMs in a cloud
infrastructure based on SEV ASIDs availability and also to limit its
usage.

There are similar requirements for other resource types like TDX keys,
IOASIDs and SEID.

Adding these resources to a cgroup controller is a natural choice with
least amount of friction. Cgroup itself says it is a mechanism to
distribute system resources along the hierarchy in a controlled
mechanism and configurable manner. Most of the resources in cgroups are
abstracted enough but there are still some resources which are not
abstract but have limited availability or have specific use cases.

Misc controller is a generic controller which can be used by these
kinds of resources.

One suggestion was to use BPF for this purpose, however, there are
couple of things which might not be addressed with BPF:
1. Which controller to use in v1 case? These are not abstract resources
   so in v1 where each controller have their own hierarchy it might not
   be easy to identify the best controller to use for BPF.

2. Abstracting out a single BPF program which can help with all of the
   resources types might not be possible, because resources we are
   working with are not similar and abstract enough, for example network
   packets, and there will be different places in the source code to use
   these resources.

A new cgroup controller tends to give much easier and well integrated
solution when it comes to scheduling and limiting a resource with
existing tools in a cloud infrastructure.

Changes in RFC v4:
1. Misc controller patch is split into two patches. One for generic misc
   controller and second for adding SEV and SEV-ES resource.
2. Using READ_ONCE and WRITE_ONCE for variable accesses.
3. Updated documentation.
4. Changed EXPORT_SYMBOL to EXPORT_SYMBOL_GPL.
5. Included cgroup header in misc_cgroup.h.
6. misc_cg_reduce_charge changed to misc_cg_cancel_charge.
7. misc_cg set to NULL after uncharge.
8. Added WARN_ON if misc_cg not NULL before charging in SEV/SEV-ES.

Changes in RFC v3:
1. Changed implementation to support 64 bit counters.
2. Print kernel logs only once per resource per cgroup.
3. Capacity can be set less than the current usage.

Changes in RFC v2:
1. Documentation fixes.
2. Added kernel log messages.
3. Changed charge API to treat misc_cg as input parameter.
4. Added helper APIs to get and release references on the cgroup.

[1] https://lore.kernel.org/lkml/20210218195549.1696769-1-vipi...@google.com
[2] https://lore.kernel.org/lkml/20210302081705.1990283-1-vipi...@google.com/
[3] https://lore.kernel.org/lkml/20210304231946.2766648-1-vipi...@google.com/

Vipin Sharma (3):
  cgroup: Add misc cgroup controller
  cgroup: Miscellaneous cgroup documentation.
  svm/sev: Register SEV and SEV-ES ASIDs to the misc controller

 Documentation/admin-guide/cgroup-v1/index.rst |   1 +
 Documentation/admin-guide/cgroup-v1/misc.rst  |   4 +
 Documentation/admin-guide/cgroup-v2.rst   |  73 +++-
 arch/x86/kvm/svm/sev.c|  70 ++-
 arch/x86/kvm/svm/svm.h|   1 +
 include/linux/cgroup_subsys.h |   4 +
 include/linux/misc_cgroup.h   | 132 ++
 init/Kconfig  |  14 +
 kernel/cgroup/Makefile|   1 +
 kernel/cgroup/misc.c  | 407 ++
 10 files changed, 695 insertions(+), 12 deletions(-)
 create mode 100644 Documentation/admin-guide/cgroup-v1/misc.rst
 create mode 100644 include/linux/misc_cgroup.h
 create mode 100644 kernel/cgroup/misc.c

-- 
2.31.0.291.g576ba9dcdaf-goog



Re: [Patch v3 1/2] cgroup: sev: Add misc cgroup controller

2021-03-24 Thread Vipin Sharma
On Wed, Mar 24, 2021 at 09:17:01AM -0700, Jacob Pan wrote:
> I didn't mean the users of misc_cgroup will use css directly. I meant if I
> want to use misc cgruop in ioasid.c, I have to do the following to avoid
> undefined css:
> #include 
> #include 
> 
> So it might be simpler if you do #include  inside
> misc_cgroup.h. Then in ioasid.c, I only need to do
> #include .

Sorry, I misunderstood the comment first time. I agree with you, I will
add cgroup header file in the misc_cgroup.h after  #ifdef
CONFIG_CGROUP_MISC statement.

Thanks
Vipin


Re: [Patch v3 1/2] cgroup: sev: Add misc cgroup controller

2021-03-22 Thread Vipin Sharma
On Fri, Mar 19, 2021 at 02:28:01PM -0700, Jacob Pan wrote:
> On Thu,  4 Mar 2021 15:19:45 -0800, Vipin Sharma  wrote:
> > +#ifndef _MISC_CGROUP_H_
> > +#define _MISC_CGROUP_H_
> > +
> nit: should you do #include ?
> Otherwise, css may be undefined.

User of this controller will use get_curernt_misc_cg() API which returns
a pointer. Ideally the user should use this pointer and they shouldn't have
any need to access "css" in their code. They also don't need to create a
object of 'struct misc_cg{}', because that won't be correct misc cgroup
object. They should just declare a pointer like we are doing here in
'struct kvm_sev_info {}'.

If they do need to use "css" then they can include cgroup header in their
code.

Let me know if I am overlooking something here.

Thanks
Vipin Sharma


Re: [Patch v3 0/2] cgroup: New misc cgroup controller

2021-03-22 Thread Vipin Sharma
On Mon, Mar 15, 2021 at 08:10:09PM +0100, Michal Koutný wrote:
> On Fri, Mar 12, 2021 at 09:49:26AM -0800, Vipin Sharma  
> wrote:
> > I will add some more information in the cover letter of the next version.
> Thanks.
> 
> > Each one coming up with their own interaction is a duplicate effort
> > when they all need similar thing.
> Could this be expressed as a new BPF hook (when allocating/freeing such
> a resource unit)?
> 
> The decision could be made based on the configured limit or even some
> other predicate.
> 
> (I saw this proposed already but I haven't seen some more reasoning
> whether it's worse/better. IMO, BPF hooks are "cheaper" than full-blown
> controllers, though it's still new user API.)

I am not much knowledgeable with BPF, so, I might be wrong here.

There are couple of things which might not be addressed with BPF:
1. Which controller to use in v1 case? These are not abstract resources
   so in v1 where each controller have their own hierarchy it might not
   be easy to identify the best controller.

2. It seems to me that we won't be able to abstract out a single BPF
   program which can help with all of the resources types we are
   planning to use, again, because it is not an abstract type like
   network packets, and there will be different places in the source
   code to use these resources.

To me a cgroup tends to give much easier and well integrated solution when it
comes to scheduling and limiting a resource with existing tools in a
cloud infrastructure.

> 
> 
> > As per my understanding this is the only for way for loadable modules
> > (kvm-amd in this case) to access Kernel APIs. Let me know if there is a
> > better way to do it.
> I understood the symbols are exported for such modularized builds.
> However, making them non-GPL exposes them to any out-of-tree modules,
> although, the resource types are supposed to stay hardcoded in the misc
> controller. So my point was to make them EXPORT_SYMBOL_GPL to mark
> they're just a means of implementing the modularized builds and not an
> API. (But they'd remain API for out-of-tree GPL modules anyway, so take
> this reasoning of mine with a grain of salt.)
> 
I see, I will change it to GPL.

Thanks
Vipin



Re: [RFC v2 2/2] cgroup: sev: Miscellaneous cgroup documentation.

2021-03-12 Thread Vipin Sharma
On Fri, Mar 12, 2021 at 12:58:21PM -0800, Jacob Pan wrote:
> Hi Vipin & Tejun,
> 
> Sorry for the late reply, I sent from a different email address than I
> intended. Please see my comments inline.
> 
> 
> On Thu, 4 Mar 2021 03:51:16 -0500, Tejun Heo  wrote:
> 
> > Hello,
> > 
> > On Wed, Mar 03, 2021 at 10:22:03PM -0800, Vipin Sharma wrote:
> > > > I am trying to see if IOASIDs cgroup can also fit in this misc
> > > > controller as yet another resource type.
> > > > https://lore.kernel.org/linux-iommu/20210303131726.7a8cb169@jacob-builder/T/#u
> > > > However, unlike sev IOASIDs need to be migrated if the process is
> > > > moved to another cgroup. i.e. charge the destination and uncharge the
> > > > source.
> > > > 
> > > > Do you think this behavior can be achieved by differentiating resource
> > > > types? i.e. add attach callbacks for certain types. Having a single
> > > > misc interface seems cleaner than creating another controller.  
> > > 
> > > I think it makes sense to add support for migration for the resources
> > > which need it. Resources like SEV, SEV-ES will not participate in
> > > migration and won't stop can_attach() to succeed, other resources which
> > > need migration will allow or stop based on their limits and capacity in
> > > the destination.  
> > 
> Sounds good. Perhaps some capability/feature flags for each resource such
> that different behavior can be accommodated?
> Could you please include me in your future posting? I will rebase on yours.

Hi Jacob

Based on Tejun's response, I will not add charge migration support in
misc controller.

I can definitly add you in my future posting, if you still wanna use it
without charge migration support.

Thanks
Vipin


Re: [Patch v3 1/2] cgroup: sev: Add misc cgroup controller

2021-03-12 Thread Vipin Sharma
On Thu, Mar 11, 2021 at 07:59:03PM +0100, Michal Koutný wrote:
> > +#ifndef CONFIG_KVM_AMD_SEV
> > +/*
> > + * When this config is not defined, SEV feature is not supported and APIs 
> > in
> > + * this file are not used but this file still gets compiled into the KVM 
> > AMD
> > + * module.
> > + *
> > + * We will not have MISC_CG_RES_SEV and MISC_CG_RES_SEV_ES entries in the 
> > enum
> > + * misc_res_type {} defined in linux/misc_cgroup.h.
> BTW, was there any progress on conditioning sev.c build on
> CONFIG_KVM_AMD_SEV? (So that the defines workaround isn't needeed.)

Tom, Brijesh,
Is this something you guys thought about or have some plans to do in the
future? Basically to not include sev.c in compilation if
CONFIG_KVM_AMD_SEV is disabled.

Michal,
This should not be a blocker for misc controller. This thing can change
independent of misc cgroup.

Thanks
Vipin


Re: [Patch v3 1/2] cgroup: sev: Add misc cgroup controller

2021-03-12 Thread Vipin Sharma
On Thu, Mar 11, 2021 at 07:59:03PM +0100, Michal Koutný wrote:
> Given different two-fold nature (SEV caller vs misc controller) of some
> remarks below, I think it makes sense to split this into two patches:
> a) generic controller implementation,
> b) hooking the controller into SEV ASIDs management.

Sounds good. I will split it.

> > +   if (misc_res_capacity[type])
> > +   cg->res[type].max = max;
> In theory, parallel writers can clash here, so having the limit atomic
> type to prevent this would resolve it. See also commit a713af394cf3
> ("cgroup: pids: use atomic64_t for pids->limit").

We should be fine without atomic64_t because we are using unsigned
long and not 64 bit explicitly. This will work on both 32 and 64 bit
machines.

But I will add READ_ONCE and WRITE_ONCE because of potential chances of
load tearing and store tearing.

Do you agree?

> > +static int misc_cg_capacity_show(struct seq_file *sf, void *v)
> > +{
> > +   int i;
> > +   unsigned long cap;
> > +
> > +   for (i = 0; i < MISC_CG_RES_TYPES; i++) {
> > +   cap = READ_ONCE(misc_res_capacity[i]);
> Why is READ_ONCE only here and not in other places that (actually) check
> against the set capacity value? Also, there should be a paired
> WRITE_ONCCE in misc_cg_set_capacity().

This was only here to avoid multiple reads of capacity and making sure
if condition and seq_print will see the same value. Also, I was not
aware of load and store tearing of properly aligned and machine word
size variables. I will add READ_ONCE and WRITE_ONCE at
other places.

Thanks
Vipin


Re: [Patch v3 0/2] cgroup: New misc cgroup controller

2021-03-12 Thread Vipin Sharma
On Thu, Mar 11, 2021 at 07:58:19PM +0100, Michal Koutný wrote:
> I admit, I didn't follow the past dicussions completely, however,
> (Vipin) could it be in the cover letter/commit messages shortly
> summarized why cgroups and a controller were chosen to implement
> restrictions of these resources, what were the alternatives any why were
> they rejected?

I will add some more information in the cover letter of the next version.

Basically, SEV will mostly be used by cloud providers for providing
confidential VMs. Since they are limited we need a good way to schedule
these jobs in cloud infrastructure. To achieve this we either come up
with some ioctl for "/dev/sev" to know about its usage, availability,
etc. This requires existing scheduling mechanism in the cloud to have an
extension for this interaction. Now same thing needs to be done for TDX.
IBM SEID doesn't have scarcity of this resource but they are also
interested in tracking and limiting the usage. Each one coming up with
their own interaction is a duplicate effort when they all need similar
thing. One can say that abstraction should be at KVM level but these
resources can be used outside VM as well.

Most of the cloud infrastructure use cgroups for knowing the host state,
track the resources usage, enforce limits on them, etc. They use this
info to optimize work allocation in the fleet and make sure no rogue job
consumes more than it needs and starves other. Adding these resources
to cgroup is a natural choice with least friction. Cgroup itself says it
is a mechanism to distribute system resources along the hierarchy in a
controlled mechanism and configurable manner. Most of the resources in
cgroups are abstracted enough but their are still resources which are
not abstract but have limited availability or have specific use cases.

> 
> In the previous discussion, I saw the reasoning for the list of the
> resources to be hardwired in the controller itself in order to get some
> scrutiny of possible changes. That makes sense to me. But with that, is
> it necessary to commit to the new controller API via EXPORT_SYMBOL? (I
> don't mean this as a licensing question but what the external API should
> be (if any).)

As per my understanding this is the only for way for loadable modules
(kvm-amd in this case) to access Kernel APIs. Let me know if there is a
better way to do it.

> 
> Besides the generic remarks above, I'd still suggest some slight
> implementation changes, posted inline to the patch.

I will work on them.

I appreciate you guys taking out time and helping me out with this patch
series.

Thanks
Vipin


[Patch v3 2/2] cgroup: sev: Miscellaneous cgroup documentation.

2021-03-04 Thread Vipin Sharma
Documentation of miscellaneous cgroup controller. This new controller is
used to track and limit the usage of scalar resources.

Signed-off-by: Vipin Sharma 
Reviewed-by: David Rientjes 
---
 Documentation/admin-guide/cgroup-v1/index.rst |  1 +
 Documentation/admin-guide/cgroup-v1/misc.rst  |  4 ++
 Documentation/admin-guide/cgroup-v2.rst   | 69 ++-
 3 files changed, 72 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/admin-guide/cgroup-v1/misc.rst

diff --git a/Documentation/admin-guide/cgroup-v1/index.rst 
b/Documentation/admin-guide/cgroup-v1/index.rst
index 226f64473e8e..99fbc8a64ba9 100644
--- a/Documentation/admin-guide/cgroup-v1/index.rst
+++ b/Documentation/admin-guide/cgroup-v1/index.rst
@@ -17,6 +17,7 @@ Control Groups version 1
 hugetlb
 memcg_test
 memory
+misc
 net_cls
 net_prio
 pids
diff --git a/Documentation/admin-guide/cgroup-v1/misc.rst 
b/Documentation/admin-guide/cgroup-v1/misc.rst
new file mode 100644
index ..661614c24df3
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/misc.rst
@@ -0,0 +1,4 @@
+===
+Misc controller
+===
+Please refer "Misc" documentation in Documentation/admin-guide/cgroup-v2.rst
diff --git a/Documentation/admin-guide/cgroup-v2.rst 
b/Documentation/admin-guide/cgroup-v2.rst
index 1de8695c264b..74777323b7fd 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -63,8 +63,11 @@ v1 is available under 
:ref:`Documentation/admin-guide/cgroup-v1/index.rst  misc.max
+
+   Limit can be set to max by::
+
+ # echo sev max > misc.max
+
+Limits can be set higher than the capacity value in the misc.capacity
+file.
+
+Migration and Ownership
+~~~
+
+A miscellaneous scalar resource is charged to the cgroup in which it is used
+first, and stays charged to that cgroup until that resource is freed. Migrating
+a process to a different cgroup does not move the charge to the destination
+cgroup where the process has moved.
+
+Others
+--
+
 perf_event
 ~~
 
-- 
2.30.1.766.gb4fecdf3b7-goog



[Patch v3 1/2] cgroup: sev: Add misc cgroup controller

2021-03-04 Thread Vipin Sharma
The Miscellaneous cgroup provides the resource limiting and tracking
mechanism for the scalar resources which cannot be abstracted like the
other cgroup resources. Controller is enabled by the CONFIG_CGROUP_MISC
config option.

The first two resources added to the miscellaneous controller are Secure
Encrypted Virtualization (SEV) ASIDs and SEV - Encrypted State (SEV-ES)
ASIDs. These limited ASIDs are used for encrypting virtual machines
memory on the AMD platform

Miscellaneous controller provides 3 interface files:

misc.capacity
  A read-only flat-keyed file shown only in the root cgroup.  It shows
  miscellaneous scalar resources available on the platform along with
  their quantities::

$ cat misc.capacity
sev 50
sev_es 10

misc.current
  A read-only flat-keyed file shown in the non-root cgroups.  It shows
  the current usage of the resources in the cgroup and its children::

$ cat misc.current
sev 3
sev_es 0

misc.max
  A read-write flat-keyed file shown in the non root cgroups. Allowed
  maximum usage of the resources in the cgroup and its children.::

$ cat misc.max
sev max
sev_es 4

  Limit can be set by::

# echo sev 1 > misc.max

  Limit can be set to max by::

# echo sev max > misc.max

  Limits can be set more than the capacity value in the misc.capacity
  file.

Signed-off-by: Vipin Sharma 
Reviewed-by: David Rientjes 
---
 arch/x86/kvm/svm/sev.c|  65 +-
 arch/x86/kvm/svm/svm.h|   1 +
 include/linux/cgroup_subsys.h |   4 +
 include/linux/misc_cgroup.h   | 130 +++
 init/Kconfig  |  14 ++
 kernel/cgroup/Makefile|   1 +
 kernel/cgroup/misc.c  | 402 ++
 7 files changed, 607 insertions(+), 10 deletions(-)
 create mode 100644 include/linux/misc_cgroup.h
 create mode 100644 kernel/cgroup/misc.c

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 48017fef1cd9..dd05a1522862 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -27,6 +28,21 @@
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 
+#ifndef CONFIG_KVM_AMD_SEV
+/*
+ * When this config is not defined, SEV feature is not supported and APIs in
+ * this file are not used but this file still gets compiled into the KVM AMD
+ * module.
+ *
+ * We will not have MISC_CG_RES_SEV and MISC_CG_RES_SEV_ES entries in the enum
+ * misc_res_type {} defined in linux/misc_cgroup.h.
+ *
+ * Below macros allow compilation to succeed.
+ */
+#define MISC_CG_RES_SEV MISC_CG_RES_TYPES
+#define MISC_CG_RES_SEV_ES MISC_CG_RES_TYPES
+#endif
+
 static u8 sev_enc_bit;
 static int sev_flush_asids(void);
 static DECLARE_RWSEM(sev_deactivate_lock);
@@ -88,8 +104,17 @@ static bool __sev_recycle_asids(int min_asid, int max_asid)
 
 static int sev_asid_new(struct kvm_sev_info *sev)
 {
-   int pos, min_asid, max_asid;
+   int pos, min_asid, max_asid, ret;
bool retry = true;
+   enum misc_res_type type;
+
+   type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+   sev->misc_cg = get_current_misc_cg();
+   ret = misc_cg_try_charge(type, sev->misc_cg, 1);
+   if (ret) {
+   put_misc_cg(sev->misc_cg);
+   return ret;
+   }
 
mutex_lock(_bitmap_lock);
 
@@ -107,7 +132,8 @@ static int sev_asid_new(struct kvm_sev_info *sev)
goto again;
}
mutex_unlock(_bitmap_lock);
-   return -EBUSY;
+   ret = -EBUSY;
+   goto e_uncharge;
}
 
__set_bit(pos, sev_asid_bitmap);
@@ -115,6 +141,10 @@ static int sev_asid_new(struct kvm_sev_info *sev)
mutex_unlock(_bitmap_lock);
 
return pos + 1;
+e_uncharge:
+   misc_cg_uncharge(type, sev->misc_cg, 1);
+   put_misc_cg(sev->misc_cg);
+   return ret;
 }
 
 static int sev_get_asid(struct kvm *kvm)
@@ -124,14 +154,15 @@ static int sev_get_asid(struct kvm *kvm)
return sev->asid;
 }
 
-static void sev_asid_free(int asid)
+static void sev_asid_free(struct kvm_sev_info *sev)
 {
struct svm_cpu_data *sd;
int cpu, pos;
+   enum misc_res_type type;
 
mutex_lock(_bitmap_lock);
 
-   pos = asid - 1;
+   pos = sev->asid - 1;
__set_bit(pos, sev_reclaim_asid_bitmap);
 
for_each_possible_cpu(cpu) {
@@ -140,6 +171,10 @@ static void sev_asid_free(int asid)
}
 
mutex_unlock(_bitmap_lock);
+
+   type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+   misc_cg_uncharge(type, sev->misc_cg, 1);
+   put_misc_cg(sev->misc_cg);
 }
 
 static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
@@ -187,19 +222,19 @@ static int sev_guest_init(struct kvm *kvm, struct 
kvm_sev_cmd *argp)
asid = sev_asid_new(sev);
i

[Patch v3 0/2] cgroup: New misc cgroup controller

2021-03-04 Thread Vipin Sharma
Hello

This patch series is creating a new misc cgroup controller for limiting
and tracking of resources which are not abstract like other cgroup
controllers.

This controller was initially proposed as encryption_id but after
the feedbacks, it is now changed to misc cgroup.
https://lore.kernel.org/lkml/20210108012846.4134815-2-vipi...@google.com/

Changes in RFC v3:
1. Changed implementation to support 64 bit counters.
2. Print kernel logs only once per resource per cgroup.
3. Capacity can be set less than the current usage.

Changes in RFC v2:
1. Documentation fixes.
2. Added kernel log messages.
3. Changed charge API to treat misc_cg as input parameter.
4. Added helper APIs to get and release references on the cgroup.

[1] https://lore.kernel.org/lkml/20210218195549.1696769-1-vipi...@google.com
[2] https://lore.kernel.org/lkml/20210302081705.1990283-1-vipi...@google.com/

Vipin Sharma (2):
  cgroup: sev: Add misc cgroup controller
  cgroup: sev: Miscellaneous cgroup documentation.

 Documentation/admin-guide/cgroup-v1/index.rst |   1 +
 Documentation/admin-guide/cgroup-v1/misc.rst  |   4 +
 Documentation/admin-guide/cgroup-v2.rst   |  69 ++-
 arch/x86/kvm/svm/sev.c|  65 ++-
 arch/x86/kvm/svm/svm.h|   1 +
 include/linux/cgroup_subsys.h |   4 +
 include/linux/misc_cgroup.h   | 130 ++
 init/Kconfig  |  14 +
 kernel/cgroup/Makefile|   1 +
 kernel/cgroup/misc.c  | 402 ++
 10 files changed, 679 insertions(+), 12 deletions(-)
 create mode 100644 Documentation/admin-guide/cgroup-v1/misc.rst
 create mode 100644 include/linux/misc_cgroup.h
 create mode 100644 kernel/cgroup/misc.c

-- 
2.30.1.766.gb4fecdf3b7-goog



Re: [RFC v2 2/2] cgroup: sev: Miscellaneous cgroup documentation.

2021-03-03 Thread Vipin Sharma
On Wed, Mar 03, 2021 at 06:55:13PM -0800, Jacob Pan wrote:
> Hi Vipin,
> 
> On Tue,  2 Mar 2021 00:17:05 -0800, Vipin Sharma  wrote:
> 
> > +Migration and Ownership
> > +~~~
> > +
> > +A miscellaneous scalar resource is charged to the cgroup in which it is
> > used +first, and stays charged to that cgroup until that resource is
> > freed. Migrating +a process to a different cgroup does not move the
> > charge to the destination +cgroup where the process has moved.
> > +
> I am trying to see if IOASIDs cgroup can also fit in this misc controller
> as yet another resource type.
> https://lore.kernel.org/linux-iommu/20210303131726.7a8cb169@jacob-builder/T/#u
> However, unlike sev IOASIDs need to be migrated if the process is moved to
> another cgroup. i.e. charge the destination and uncharge the source.
> 
> Do you think this behavior can be achieved by differentiating resource
> types? i.e. add attach callbacks for certain types. Having a single misc
> interface seems cleaner than creating another controller.

I think it makes sense to add support for migration for the resources
which need it. Resources like SEV, SEV-ES will not participate in
migration and won't stop can_attach() to succeed, other resources which
need migration will allow or stop based on their limits and capacity in
the destination.


Re: [RFC v2 1/2] cgroup: sev: Add misc cgroup controller

2021-03-03 Thread Vipin Sharma
On Wed, Mar 03, 2021 at 10:42:37AM -0500, Tejun Heo wrote:
> > +   atomic_t usage;
> > +};
> 
> Can we do 64bits so that something which counts memory can use this too?
> 
Sure.

> > +
> > +   if (usage > capacity)
> > +   return -EBUSY;
> 
> I'd rather go with allowing bringing down capacity below usage so that the
> users can set it to a lower value to drain existing usages while denying new
> ones. It's not like it's difficult to check the current total usage from the
> caller side, so I'm not sure it's very useful to shift the condition check
> here.
> 

Okay, I will change the code to set new capacity unconditionally.

Right now there is no API for the caller to know total usage, unless they
keep their own tally, I was thinking it will be useful to add one more API

unsigned long misc_cg_res_total_usage(enum misc_res_type type)

It will return root_cg usage for "type" resource.
Will it be fine?

> > +   pr_info("cgroup: charge rejected by misc controller for 
> > %s resource in ",
> > +   misc_res_name[type]);
> > +   pr_cont_cgroup_path(i->css.cgroup);
> > +   pr_cont("\n");
> 
> Should have commented on this in the priv thread but don't print something
> on every rejection. This often becomes a nuisance and can make an easy DoS
> vector at worst. If you wanna do it, print it once per cgroup or sth like
> that.

I didn't think in that way. Thanks, I will print it once per cgroup.

Thanks
Vipin


[RFC v2 2/2] cgroup: sev: Miscellaneous cgroup documentation.

2021-03-02 Thread Vipin Sharma
Documentation of miscellaneous cgroup controller. This new controller is
used to track and limit the usage of scalar resources.

Signed-off-by: Vipin Sharma 
Reviewed-by: David Rientjes 
---
 Documentation/admin-guide/cgroup-v1/index.rst |  1 +
 Documentation/admin-guide/cgroup-v1/misc.rst  |  4 ++
 Documentation/admin-guide/cgroup-v2.rst   | 69 ++-
 3 files changed, 72 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/admin-guide/cgroup-v1/misc.rst

diff --git a/Documentation/admin-guide/cgroup-v1/index.rst 
b/Documentation/admin-guide/cgroup-v1/index.rst
index 226f64473e8e..99fbc8a64ba9 100644
--- a/Documentation/admin-guide/cgroup-v1/index.rst
+++ b/Documentation/admin-guide/cgroup-v1/index.rst
@@ -17,6 +17,7 @@ Control Groups version 1
 hugetlb
 memcg_test
 memory
+misc
 net_cls
 net_prio
 pids
diff --git a/Documentation/admin-guide/cgroup-v1/misc.rst 
b/Documentation/admin-guide/cgroup-v1/misc.rst
new file mode 100644
index ..661614c24df3
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/misc.rst
@@ -0,0 +1,4 @@
+===
+Misc controller
+===
+Please refer "Misc" documentation in Documentation/admin-guide/cgroup-v2.rst
diff --git a/Documentation/admin-guide/cgroup-v2.rst 
b/Documentation/admin-guide/cgroup-v2.rst
index 1de8695c264b..74777323b7fd 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -63,8 +63,11 @@ v1 is available under 
:ref:`Documentation/admin-guide/cgroup-v1/index.rst  misc.max
+
+   Limit can be set to max by::
+
+ # echo sev max > misc.max
+
+Limits can be set higher than the capacity value in the misc.capacity
+file.
+
+Migration and Ownership
+~~~
+
+A miscellaneous scalar resource is charged to the cgroup in which it is used
+first, and stays charged to that cgroup until that resource is freed. Migrating
+a process to a different cgroup does not move the charge to the destination
+cgroup where the process has moved.
+
+Others
+--
+
 perf_event
 ~~
 
-- 
2.30.1.766.gb4fecdf3b7-goog



[RFC v2 1/2] cgroup: sev: Add misc cgroup controller

2021-03-02 Thread Vipin Sharma
The Miscellaneous cgroup provides the resource limiting and tracking
mechanism for the scalar resources which cannot be abstracted like the
other cgroup resources. Controller is enabled by the CONFIG_CGROUP_MISC
config option.

The first two resources added to the miscellaneous controller are Secure
Encrypted Virtualization (SEV) ASIDs and SEV - Encrypted State (SEV-ES)
ASIDs. These limited ASIDs are used for encrypting virtual machines
memory on the AMD platform

Miscellaneous controller provides 3 interface files:

misc.capacity
  A read-only flat-keyed file shown only in the root cgroup.  It shows
  miscellaneous scalar resources available on the platform along with
  their quantities::

$ cat misc.capacity
sev 50
sev_es 10

misc.current
  A read-only flat-keyed file shown in the non-root cgroups.  It shows
  the current usage of the resources in the cgroup and its children::

$ cat misc.current
sev 3
sev_es 0

misc.max
  A read-write flat-keyed file shown in the non root cgroups. Allowed
  maximum usage of the resources in the cgroup and its children.::

$ cat misc.max
sev max
sev_es 4

  Limit can be set by::

# echo sev 1 > misc.max

  Limit can be set to max by::

# echo sev max > misc.max

  Limits can be set more than the capacity value in the misc.capacity
  file.

Signed-off-by: Vipin Sharma 
Reviewed-by: David Rientjes 
---
 arch/x86/kvm/svm/sev.c|  65 +-
 arch/x86/kvm/svm/svm.h|   1 +
 include/linux/cgroup_subsys.h |   4 +
 include/linux/misc_cgroup.h   | 122 ++
 init/Kconfig  |  14 ++
 kernel/cgroup/Makefile|   1 +
 kernel/cgroup/misc.c  | 423 ++
 7 files changed, 620 insertions(+), 10 deletions(-)
 create mode 100644 include/linux/misc_cgroup.h
 create mode 100644 kernel/cgroup/misc.c

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 48017fef1cd9..dd05a1522862 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -27,6 +28,21 @@
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 
+#ifndef CONFIG_KVM_AMD_SEV
+/*
+ * When this config is not defined, SEV feature is not supported and APIs in
+ * this file are not used but this file still gets compiled into the KVM AMD
+ * module.
+ *
+ * We will not have MISC_CG_RES_SEV and MISC_CG_RES_SEV_ES entries in the enum
+ * misc_res_type {} defined in linux/misc_cgroup.h.
+ *
+ * Below macros allow compilation to succeed.
+ */
+#define MISC_CG_RES_SEV MISC_CG_RES_TYPES
+#define MISC_CG_RES_SEV_ES MISC_CG_RES_TYPES
+#endif
+
 static u8 sev_enc_bit;
 static int sev_flush_asids(void);
 static DECLARE_RWSEM(sev_deactivate_lock);
@@ -88,8 +104,17 @@ static bool __sev_recycle_asids(int min_asid, int max_asid)
 
 static int sev_asid_new(struct kvm_sev_info *sev)
 {
-   int pos, min_asid, max_asid;
+   int pos, min_asid, max_asid, ret;
bool retry = true;
+   enum misc_res_type type;
+
+   type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+   sev->misc_cg = get_current_misc_cg();
+   ret = misc_cg_try_charge(type, sev->misc_cg, 1);
+   if (ret) {
+   put_misc_cg(sev->misc_cg);
+   return ret;
+   }
 
mutex_lock(_bitmap_lock);
 
@@ -107,7 +132,8 @@ static int sev_asid_new(struct kvm_sev_info *sev)
goto again;
}
mutex_unlock(_bitmap_lock);
-   return -EBUSY;
+   ret = -EBUSY;
+   goto e_uncharge;
}
 
__set_bit(pos, sev_asid_bitmap);
@@ -115,6 +141,10 @@ static int sev_asid_new(struct kvm_sev_info *sev)
mutex_unlock(_bitmap_lock);
 
return pos + 1;
+e_uncharge:
+   misc_cg_uncharge(type, sev->misc_cg, 1);
+   put_misc_cg(sev->misc_cg);
+   return ret;
 }
 
 static int sev_get_asid(struct kvm *kvm)
@@ -124,14 +154,15 @@ static int sev_get_asid(struct kvm *kvm)
return sev->asid;
 }
 
-static void sev_asid_free(int asid)
+static void sev_asid_free(struct kvm_sev_info *sev)
 {
struct svm_cpu_data *sd;
int cpu, pos;
+   enum misc_res_type type;
 
mutex_lock(_bitmap_lock);
 
-   pos = asid - 1;
+   pos = sev->asid - 1;
__set_bit(pos, sev_reclaim_asid_bitmap);
 
for_each_possible_cpu(cpu) {
@@ -140,6 +171,10 @@ static void sev_asid_free(int asid)
}
 
mutex_unlock(_bitmap_lock);
+
+   type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+   misc_cg_uncharge(type, sev->misc_cg, 1);
+   put_misc_cg(sev->misc_cg);
 }
 
 static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
@@ -187,19 +222,19 @@ static int sev_guest_init(struct kvm *kvm, struct 
kvm_sev_cmd *argp)
asid = sev_asid_new(sev);
i

[RFC v2 0/2] cgroup: New misc cgroup controller

2021-03-02 Thread Vipin Sharma
Hello

This patch series is creating a new misc cgroup controller for limiting
and tracking of resources which are not abstract like other cgroup
controllers.

This controller was initially proposed as encryption_id but after
the feedbacks, it is now changed to misc cgroup.
https://lore.kernel.org/lkml/20210108012846.4134815-2-vipi...@google.com/

Changes in RFC v2:
1. Documentation fixes.
2. Added kernel log messages.
3. Changed charge API to treat misc_cg as input parameter.
4. Added helper APIs to get and release references on the cgroup.

[1] https://lore.kernel.org/lkml/20210218195549.1696769-1-vipi...@google.com
Vipin Sharma (2):
  cgroup: sev: Add misc cgroup controller
  cgroup: sev: Miscellaneous cgroup documentation.

 Documentation/admin-guide/cgroup-v1/index.rst |   1 +
 Documentation/admin-guide/cgroup-v1/misc.rst  |   4 +
 Documentation/admin-guide/cgroup-v2.rst   |  69 ++-
 arch/x86/kvm/svm/sev.c|  65 ++-
 arch/x86/kvm/svm/svm.h|   1 +
 include/linux/cgroup_subsys.h |   4 +
 include/linux/misc_cgroup.h   | 122 +
 init/Kconfig  |  14 +
 kernel/cgroup/Makefile|   1 +
 kernel/cgroup/misc.c  | 423 ++
 10 files changed, 692 insertions(+), 12 deletions(-)
 create mode 100644 Documentation/admin-guide/cgroup-v1/misc.rst
 create mode 100644 include/linux/misc_cgroup.h
 create mode 100644 kernel/cgroup/misc.c

-- 
2.30.1.766.gb4fecdf3b7-goog



Re: [RFC 1/2] cgroup: sev: Add misc cgroup controller

2021-02-25 Thread Vipin Sharma
On Thu, Feb 25, 2021 at 10:52:49AM +0100, Michal Koutný wrote:
> On Wed, Feb 24, 2021 at 08:57:36PM -0800, Vipin Sharma  
> wrote:
> > This function is meant for hot unplug functionality too.
> Then I'm wondering if the current form is sufficient, i.e. the generic
> controller can hardly implement preemption but possibly it should
> prevent any additional charges of the resource. (Or this can be
> implemented the other subsystem and explained in the
> misc_cg_set_capacity() docs.)

My approach here is that it is the responsibility of the caller to:
1. Check the return value and proceed accordingly.
2. Ideally, let all of the usage be 0 before deactivating this resource
   by setting capacity to 0

But I see your point that it makes sense for this call to always
succeed. I think I can simplify this function now to just have xchg() (for
memory barrier) so that new value is immediately reflected in
misc_cg_try_charge() and no new charges will succeed.

Is the above change good?

> 
> > Just to be on the same page are you talking about adding an events file
> > like in pids?
> Actually, I meant just the kernel log message. As it's the simpler part
> and even pid events have some inconsistencies wrt hierarchical
> reporting.

I see, thanks, I will add some log messages, 

if (new_usage > res->max || new_usage > misc_res_capacity[type)) {
  pr_info("cgroup: charge rejected by misc controller for %s resource in ",
  misc_res_name[type]);
  pr_cont_cgroup_path(i->css.cgroup);
  pr_cont("\n");
  ...
}

Only difference compared to pids will be that here logs will be printed
for every failure.

I was thinking to add more information in the log like what is the current
limits (max and capacity) and what new usage would have been. Will there
be any objection to extra information?

> 
> > However, if I take reference at the first charge and remove reference at
> > last uncharge then I can keep the ref count in correct sync.
> I see now how it works. I still find it a bit complex. What about making
> misc_cg an input parameter and making it the callers responsibility to
> keep a reference? (Perhaps with helpers for the most common case.)

Yeah, that can simplify the misc controller, I will have to add couple of
more helper APIs for callers having simple use cases. I will make this
change.

Thanks
Vipin


Re: [RFC 1/2] cgroup: sev: Add misc cgroup controller

2021-02-24 Thread Vipin Sharma
On Tue, Feb 23, 2021 at 07:24:55PM +0100, Michal Koutný wrote:
> On Thu, Feb 18, 2021 at 11:55:48AM -0800, Vipin Sharma  
> wrote:
> > --- a/arch/x86/kvm/svm/sev.c
> > +++ b/arch/x86/kvm/svm/sev.c
> > [...]
> > +#ifndef CONFIG_KVM_AMD_SEV
> > +/*
> > + * When this config is not defined, SEV feature is not supported and APIs 
> > in
> > + * this file are not used but this file still gets compiled into the KVM 
> > AMD
> > + * module.
> I'm not familiar with the layout of KVM/SEV compile targets but wouldn't
> it be simpler to exclude whole svm/sev.c when !CONFIG_KVM_AMD_SEV?
> 

Tom,
Is there any plan to exclude sev.c compilation if CONFIG_KVM_AMD_SEV is
not set?

> > +++ b/kernel/cgroup/misc.c
> > [...]
> > +/**
> > + * misc_cg_set_capacity() - Set the capacity of the misc cgroup res.
> > + * @type: Type of the misc res.
> > + * @capacity: Supported capacity of the misc res on the host.
> > + *
> > + * If capacity is 0 then the charging a misc cgroup fails for that type.
> > + *
> > + * The caller must serialize invocations on the same resource.
> > + *
> > + * Context: Process context.
> > + * Return:
> > + * * %0 - Successfully registered the capacity.
> > + * * %-EINVAL - If @type is invalid.
> > + * * %-EBUSY - If current usage is more than the capacity.
> When is this function supposed to be called? At boot only or is this
> meant for some kind of hot unplug functionality too?
> 

This function is meant for hot unplug functionality too.

> > +int misc_cg_try_charge(enum misc_res_type type, struct misc_cg **cg,
> > +  unsigned int amount)
> > [...]
> > +   new_usage = atomic_add_return(amount, >usage);
> > +   if (new_usage > res->max ||
> > +   new_usage > misc_res_capacity[type]) {
> > +   ret = -EBUSY;
> I'm not sure the user of this resource accounting will always be able to
> interpret EBUSY returned from depths of the subsystem.
> See what's done in pids controller in order to give some useful
> information about why operation failed.

Just to be on the same page are you talking about adding an events file
like in pids?

> 
> > +   goto err_charge;
> > +   }
> > +
> > +   // First one to charge gets a reference.
> > +   if (new_usage == amount)
> > +   css_get(>css);
> 1) Use the /* comment */ style.
> 2) You pin the whole path from task_cg up to root (on the first charge).
> That's unnecessary since children reference their parents.
> Also why do you get the reference only for the first charger? While it
> may work, it seems too convoluted to me.
> It'd be worth documenting what the caller can expect wrt to ref count of
> the returned misc_cg.

Suppose a user charges 5 resources in a single charge call but uncharges
them in 5 separate calls one by one. I cannot take reference on every
charge and put the reference for every uncharge as it is not guaranteed
to have equal number of charge-uncharge pairs and we will end up with
the wrong ref count.

However, if I take reference at the first charge and remove reference at
last uncharge then I can keep the ref count in correct sync.

I can rewrite if condition to (new_usage == amount && task_cg == i)
this will avoid pinning whole path up to the root. I was thinking that
original code was simpler, clearly I was wrong.

Thanks
Vipin


Re: [RFC 0/2] cgroup: New misc cgroup controller

2021-02-24 Thread Vipin Sharma
On Tue, Feb 23, 2021 at 07:24:33PM +0100, Michal Koutný wrote:
> Hello.
> 
> On Thu, Feb 18, 2021 at 11:55:47AM -0800, Vipin Sharma  
> wrote:
> > This patch is creating a new misc cgroup controller for allocation and
> > tracking of resources which are not abstract like other cgroup
> > controllers.
> Please don't refer to this as "allocation" anywhere, that has a specific
> meaning (see Resource Distribution Models in
> Documentation/admin-gruide/cgroup-v2.rst).

Yeah, it should be "Limits". I will update the text.

> 
> > This controller was initially proposed as encryption_id but after
> > the feedbacks, it is now changed to misc cgroup.
> > https://lore.kernel.org/lkml/20210108012846.4134815-2-vipi...@google.com/
> Interesting generalization. I wonder what else could fit under this as
> well. (It resembles pids controller on the cover.)
> 
> > Please provide any feedback for this RFC or if it is good for
> > merging then I can send a patch for merging.
> A new controller is added exposed with v1 attributes. I'm not convinced
> it is desirable to change the frozen v1 controllers' API? (And therefore
> promote it as well.)

This is a very trivial controller. I believe once it becomes public it
can be helpful down the line to the v1 users, who cannot use v2 yet, for
some simple resource accounting, like us, we have the need for ASIDs
accounting in v1 until we move to v2. If the community has strong
objection then I can remove v1 support.

> 
> Beware, bikeshedding. The name is very non-descriptive, potentially
> suggesting catch-all semantics. It'd deserve a further thought. My idea
> would be limit(s) or counter controller.

Limit and counter are kind of suggesting the underlying technique for
accounting instead of a generic name to denote resource types managed by
this controller. One can argue that if top level names are similar to
Resource Destribution Model then may be it makes sense to combine
resources with similar accounting technique under one controller.

I am looking at misc as a controller which is for resources not having
proper home in other controllers or not big enough to deserve their own
controller.

I agree with you coming up with a name which check all boxes of
requirements won't be possible. We have discussed name sev cgroup,
kvm cgroup, encryption_id cgroup before reaching to an agreement on misc
cgroup. I am open to other names if they are better suited for this
controller and makes more sense.

Thanks
Vipin


Re: [RFC 2/2] cgroup: sev: Miscellaneous cgroup documentation.

2021-02-24 Thread Vipin Sharma
On Fri, Feb 19, 2021 at 11:02:41AM -0800, Randy Dunlap wrote:
> > +++ b/Documentation/admin-guide/cgroup-v1/misc.rst
> > @@ -0,0 +1 @@
> > +/Documentation/admin-guide/cgroup-v2.rst
> What is the purpose of this (above) file?

This new controller has both cgroup v1 and v2 support. Tejun suggested
if we can point to v2 doc from v1. If this is not recommended approach I
can add all of the v2 documention of misc controller here, let me know.

I missed a heading and adding this file in cgroup-v1/index.rst.
Fixed it now.

> > +Limits can be set more than the capacity value in the misc.capacity
> 
>  higher than
> 

Done

> > +a process to a different cgroup do not move the charge to the destination
> 
>does

Done

> > +Others
> >  
> 
> That underline is too short for "Others".
> 

Fixed.

> Try building this doc file, please.
> 
> next-20210219/Documentation/admin-guide/cgroup-v2.rst:2196: WARNING: 
> Unexpected indentation.
> next-20210219/Documentation/admin-guide/cgroup-v2.rst:2203: WARNING: 
> Unexpected indentation.
> next-20210219/Documentation/admin-guide/cgroup-v2.rst:2210: WARNING: 
> Unexpected indentation.
> next-20210219/Documentation/admin-guide/cgroup-v2.rst:2232: WARNING: Title 
> underline too short.
> 
> Others
> 
> next-20210219/Documentation/admin-guide/cgroup-v2.rst:2232: WARNING: Title 
> underline too short.
> 
> 
> I think that the first 3 warnings are due to missing a blank line after ::
> or they could have something to do with mixed tabs and spaces in the misc.*
> properties descriptions.
> 

Sorry, I was not familiar with Sphinx build and didn't build using that.
I have fixed all of the above warnings. My next patch will reflect
fixes.

Thanks



[RFC 2/2] cgroup: sev: Miscellaneous cgroup documentation.

2021-02-18 Thread Vipin Sharma
Documentation of miscellaneous cgroup controller. This new controller is
used to track and limit usage of scalar resources.

Signed-off-by: Vipin Sharma 
Reviewed-by: David Rientjes 
---
 Documentation/admin-guide/cgroup-v1/misc.rst |  1 +
 Documentation/admin-guide/cgroup-v2.rst  | 64 +++-
 2 files changed, 63 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/admin-guide/cgroup-v1/misc.rst

diff --git a/Documentation/admin-guide/cgroup-v1/misc.rst 
b/Documentation/admin-guide/cgroup-v1/misc.rst
new file mode 100644
index ..8e9e9311daeb
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/misc.rst
@@ -0,0 +1 @@
+/Documentation/admin-guide/cgroup-v2.rst
diff --git a/Documentation/admin-guide/cgroup-v2.rst 
b/Documentation/admin-guide/cgroup-v2.rst
index 1de8695c264b..1a41a3623b9b 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -63,8 +63,11 @@ v1 is available under 
:ref:`Documentation/admin-guide/cgroup-v1/index.rst  misc.max
+
+   Limit can be set to max by::
+ # echo sev max > misc.max
+
+Limits can be set more than the capacity value in the misc.capacity
+file.
+
+Migration and Ownership
+~~~
+
+A miscellaneous scalar resource is charged to the cgroup in which it is used
+first, and stays charged to that cgroup until that resource is freed. Migrating
+a process to a different cgroup do not move the charge to the destination
+cgroup where the process has moved.
+
+Others
 
 
 perf_event
-- 
2.30.0.617.g56c4b15f3c-goog



[RFC 1/2] cgroup: sev: Add misc cgroup controller

2021-02-18 Thread Vipin Sharma
The Miscellaneous cgroup provides the resource allocation and tracking
mechanism for the scalar resources which cannot be abstracted like the
other cgroup resources. Controller is enabled by the CONFIG_CGROUP_MISC
config option.

The first two resources added to the miscellaneous controller are Secure
Encrypted Virtualization (SEV) ASIDs and SEV - Encrypted State (SEV-ES)
ASIDs. These limited ASIDs are used for encrypting virtual machines
memory on the AMD platform

Miscellaneous controller provides 3 interface files:

misc.capacity
  A read-only flat-keyed file shown only in the root cgroup.  It shows
  miscellaneous scalar resources available on the platform along with
  their quantities
$ cat misc.capacity
sev 50
sev_es 10

misc.current
  A read-only flat-keyed file shown in the non-root cgroups.  It shows
  the current usage of the resources in the cgroup and its children.
$ cat misc.current
sev 3
sev_es 0

misc.max
  A read-write flat-keyed file shown in the non root cgroups. Allowed
  maximum usage of the resources in the cgroup and its children.::
$ cat misc.max
sev max
sev_es 4

  Limit can be set by::
# echo sev 1 > misc.max

  Limit can be set to max by::
# echo sev max > misc.max

  Limits can be set more than the capacity value in the misc.capacity
  file.

Signed-off-by: Vipin Sharma 
Reviewed-by: David Rientjes 
---
 arch/x86/kvm/svm/sev.c|  60 -
 arch/x86/kvm/svm/svm.h|   1 +
 include/linux/cgroup_subsys.h |   4 +
 include/linux/misc_cgroup.h   |  75 ++
 init/Kconfig  |  14 ++
 kernel/cgroup/Makefile|   1 +
 kernel/cgroup/misc.c  | 456 ++
 7 files changed, 601 insertions(+), 10 deletions(-)
 create mode 100644 include/linux/misc_cgroup.h
 create mode 100644 kernel/cgroup/misc.c

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 48017fef1cd9..1617bb2ce83e 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -27,6 +28,21 @@
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 
+#ifndef CONFIG_KVM_AMD_SEV
+/*
+ * When this config is not defined, SEV feature is not supported and APIs in
+ * this file are not used but this file still gets compiled into the KVM AMD
+ * module.
+ *
+ * We will not have MISC_CG_RES_SEV and MISC_CG_RES_SEV_ES entries in the enum
+ * misc_res_type {} defined in linux/misc_cgroup.h.
+ *
+ * Below macros allow compilation to succeed.
+ */
+#define MISC_CG_RES_SEV MISC_CG_RES_TYPES
+#define MISC_CG_RES_SEV_ES MISC_CG_RES_TYPES
+#endif
+
 static u8 sev_enc_bit;
 static int sev_flush_asids(void);
 static DECLARE_RWSEM(sev_deactivate_lock);
@@ -88,8 +104,14 @@ static bool __sev_recycle_asids(int min_asid, int max_asid)
 
 static int sev_asid_new(struct kvm_sev_info *sev)
 {
-   int pos, min_asid, max_asid;
+   int pos, min_asid, max_asid, ret;
bool retry = true;
+   enum misc_res_type type;
+
+   type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+   ret = misc_cg_try_charge(type, >misc_cg, 1);
+   if (ret)
+   return ret;
 
mutex_lock(_bitmap_lock);
 
@@ -107,7 +129,8 @@ static int sev_asid_new(struct kvm_sev_info *sev)
goto again;
}
mutex_unlock(_bitmap_lock);
-   return -EBUSY;
+   ret = -EBUSY;
+   goto e_uncharge;
}
 
__set_bit(pos, sev_asid_bitmap);
@@ -115,6 +138,9 @@ static int sev_asid_new(struct kvm_sev_info *sev)
mutex_unlock(_bitmap_lock);
 
return pos + 1;
+e_uncharge:
+   misc_cg_uncharge(type, sev->misc_cg, 1);
+   return ret;
 }
 
 static int sev_get_asid(struct kvm *kvm)
@@ -124,14 +150,15 @@ static int sev_get_asid(struct kvm *kvm)
return sev->asid;
 }
 
-static void sev_asid_free(int asid)
+static void sev_asid_free(struct kvm_sev_info *sev)
 {
struct svm_cpu_data *sd;
int cpu, pos;
+   enum misc_res_type type;
 
mutex_lock(_bitmap_lock);
 
-   pos = asid - 1;
+   pos = sev->asid - 1;
__set_bit(pos, sev_reclaim_asid_bitmap);
 
for_each_possible_cpu(cpu) {
@@ -140,6 +167,9 @@ static void sev_asid_free(int asid)
}
 
mutex_unlock(_bitmap_lock);
+
+   type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV;
+   misc_cg_uncharge(type, sev->misc_cg, 1);
 }
 
 static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
@@ -187,19 +217,19 @@ static int sev_guest_init(struct kvm *kvm, struct 
kvm_sev_cmd *argp)
asid = sev_asid_new(sev);
if (asid < 0)
return ret;
+   sev->asid = asid;
 
ret = sev_platform_init(>error);
if (ret)
goto e_free;
 
sev->

[RFC 0/2] cgroup: New misc cgroup controller

2021-02-18 Thread Vipin Sharma
Hello,

This patch is creating a new misc cgroup controller for allocation and
tracking of resources which are not abstract like other cgroup
controllers.

This controller was initially proposed as encryption_id but after
the feedbacks, it is now changed to misc cgroup.
https://lore.kernel.org/lkml/20210108012846.4134815-2-vipi...@google.com/

Changes from the encryption_id controller are:
1. There are only 3 files misc.{capacity, max, current} for all
   resources compared to each resource having their own 3 files in
   encryption_id cgroup.
2. If a resource capacity is 0 then it is considered inactive and won't
   show up in control files.
2. This is a lockless implementation similar to page counter APIs
   compared to single lock implementation in encryption_id cgroup.

Please provide any feedback for this RFC or if it is good for
merging then I can send a patch for merging.

Thanks

Vipin Sharma (2):
  cgroup: sev: Add misc cgroup controller
  cgroup: sev: Miscellaneous cgroup documentation.

 Documentation/admin-guide/cgroup-v1/misc.rst |   1 +
 Documentation/admin-guide/cgroup-v2.rst  |  64 ++-
 arch/x86/kvm/svm/sev.c   |  60 ++-
 arch/x86/kvm/svm/svm.h   |   1 +
 include/linux/cgroup_subsys.h|   4 +
 include/linux/misc_cgroup.h  |  75 +++
 init/Kconfig |  14 +
 kernel/cgroup/Makefile   |   1 +
 kernel/cgroup/misc.c | 456 +++
 9 files changed, 664 insertions(+), 12 deletions(-)
 create mode 100644 Documentation/admin-guide/cgroup-v1/misc.rst
 create mode 100644 include/linux/misc_cgroup.h
 create mode 100644 kernel/cgroup/misc.c

-- 
2.30.0.617.g56c4b15f3c-goog



Re: [Patch v4 1/2] cgroup: svm: Add Encryption ID controller

2021-01-26 Thread Vipin Sharma
On Tue, Jan 26, 2021 at 05:01:04PM -0500, Tejun Heo wrote:
> The whole thing seems pretty immature to me and I agree with you that coming
> up with an abstraction at this stage feels risky.
> 
> I'm leaning towards creating a misc controller to shove these things into:
> 
> * misc.max and misc.current: nested keyed files listing max and current
>   usage for the cgroup.
> 
> * Have an API to activate or update a given resource with total resource
>   count. I'd much prefer the resource list to be in the controller itself
>   rather than being through some dynamic API just so that there is some
>   review in what keys get added.
> 
> * Top level cgroup lists which resource is active and how many are
>   available.

Sounds good, we can have a single top level stat file

misc.stat
  Shows how many are supported on the host:
  $ cat misc.stat
  sev 500
  sev_es 10

If total value of some resource is 0 then it will be considered inactive and
won't show in misc.{stat, current, max}

We discussed earlier, instead of having "stat" file we should show
"current" and "capacity" files in the root but I think we can just have stat
at top showing total resources to keep it consistent with other cgroup
files.

Thanks
Vipin



Re: [Patch v4 1/2] cgroup: svm: Add Encryption ID controller

2021-01-21 Thread Vipin Sharma
On Wed, Jan 20, 2021 at 06:32:56PM -0500, Tejun Heo wrote:
> I don't know how many times I have to repeat the same point to get it
> across. For any question about actual abstraction, you haven't provided any
> kind of actual research or analysis and just keep pushing the same thing
> over and over again. Maybe the situation is such that it makes sense to
> change the rule but that needs substantial justifications. I've been asking
> to see whether there are such justifications but all I've been getting are
> empty answers. Until such discussions take place, please consider the series
> nacked and please excuse if I don't respond promptly in this thread.

I am sorry Tejun that you felt your feedback and questions are being ignored
or not answered properly by me. It was not my intent. Let me try again.

I am not able to come up with an abstraction for underlying the hardware
like we have for memory, cpu, and io with their respective cgroup
controllers, because each vendor is solving VM security issue in
different ways. For example:

s390 is using Ultravisor (UV) to disable access to the VMs memory from
the host.  All KVM interaction with their Protected Virtual Machines
(PVM) are handled through UV APIs. Here an encrypted guest image is
loaded first which is decrypted by UV and then UV disallows access to
PVMs memory and register state from KVM or other PVMs. PVMs are assigned
IDs known as secure execution IDs (SEID).  These IDs are not scarce
resource on the host.

AMD is encrypting runtime memory of a VM using an hardware AES engine in
the memory controller and keys are managed by an Arm based coprocessor
inside the CPU, for encryption and decryption of the data flow between
CPU and memory.  Their offering is known as Secure Encrypted
Virtualization (SEV). There are also two more enhanced offerings SEV-ES,
(memory + guest register state encryption), SEV-SNP (SEV-ES + memory
integrity protection + TCB rollback) in later generation of CPUs. At any
time only a limited number of IDs can be used simultaneously in the
processor. Initially only SEV IDs we available on the CPUs but in the
later generations of CPUs with the addition of SEV-ES, IDs were divided
in two groups SEV ASIDs for SEV guests, and SEV-ES ASIDs for SEV-ES and
SEV-SNP VMs. SEV firmware doesn't allow SEV ASIDs to launch SEV-ES and
SEV-SNP VMs. Ideally, I think its better to use SEV-SNP as it provides
highest protection but support in vmm and guest kernels are not there
yet. Also, old HW will not be able to run SEV-ES or SEV-SNP as they can
only run SEV ASIDs. I dont have data in terms of drawbacks running VM on
SEV-SNP in terms of speed and cost but I think it will be dependent on
workloads.

Intel has come up with Trusted Domain Extension (TDX) for their secure
VMs offering. They allow a VM to use multiple keys for private pages and
for pages shared with other VMs. Overall, this is called as Multi-Key
Total Memory Encryption (MKTME). A fixed number of encryption keys are
supported in MKTME engine. During execution these keys are identified
using KeyIDs which are present in upper bits of platform physical
addresses.

Only limited form of abstraction present here is that all are providing
a way to have secure VMs and processes, either through single key
encryption, multiple key encryptions or access denial.

A common abstraction of different underlying security behavior/approach
can mislead users in giving impression that all secure VMs/processes are
same. In my opinion, this kind of thing can work when we talk about
memory, cpu, etc, but for security related stuff will do more harm to
the end user than the benefit of simplicity of abstraction. The name of
the underlying feature also tells what kind of security guarantees a
user can expect on the platform for a VM and what kind is used.

Taking a step back, in the current scenario, we have some global shared
resources which are limited for SEV, SEV-ES, and TDX. There is also a
need for tracking and controlling on all 4 features for now. This is a
case for some kind of cgroup behavior to limit and control an aggregate
of processes using these system resources. After all, "cgroup is a
mechanism to organize processes hierarchically and distribute system
resources along the hierarchy in a controlled and configurable manner."

We are using SEV in KVM and outside KVM also for other products on
horizon. As cgroups are commonly used in many infrastructures for
resource control, scheduling, and tracking, this patch is helping us in
allocating jobs in the infrastructure along with memory, cpu and other
constraints in a coherent way.

If you feel encryption id cgroup is not good for long term or a
too specific use case then may be there should be a common cgroup which
can be a home for this kind and other kind of future resources where
there is need to limit a global resource allocation but are not abstract
or cannot be abstracted as the other existing cgroups. My current patch
is very generic and with 

Re: [Patch v4 1/2] cgroup: svm: Add Encryption ID controller

2021-01-20 Thread Vipin Sharma
On Wed, Jan 20, 2021 at 11:40:18AM -0500, Tejun Heo wrote:
> Hello,
> 
> On Tue, Jan 19, 2021 at 11:13:51PM -0800, Vipin Sharma wrote:
> > > Can you please elaborate? I skimmed through the amd manual and it seemed 
> > > to
> > > say that SEV-ES ASIDs are superset of SEV but !SEV-ES ASIDs. What's the 
> > > use
> > > case for mixing those two?
> > 
> > For example, customers can be given options for which kind of protection 
> > they
> > want to choose for their workloads based on factors like data protection
> > requirement, cost, speed, etc.
> 
> So, I'm looking for is a bit more in-depth analysis than that. ie. What's
> the downside of SEV && !SEV-ES and is the disticntion something inherently
> useful?

I will leave this for AMD folks to respond, as they can give much better
answer than me.

> > > I'm very reluctant to ack vendor specific interfaces for a few reasons but
> > > most importantly because they usually indicate abstraction and/or the
> > > underlying feature not being sufficiently developed and they tend to 
> > > become
> > > baggages after a while. So, here are my suggestions:
> > 
> > My first patch was only for SEV, but soon we got comments that this can
> > be abstracted and used by TDX and SEID for their use cases.
> > 
> > I see this patch as providing an abstraction for simple accounting of
> > resources used for creating secure execution contexts. Here, secure
> > execution is achieved through different means. SEID, TDX, and SEV
> > provide security using different features and capabilities. I am not
> > sure if we will reach a point where all three and other vendors will use
> > the same approach and technology for this purpose.
> > 
> > Instead of each one coming up with their own resource tracking for their
> > features, this patch is providing a common framework and cgroup for
> > tracking these resources.
> 
> What's implemented is a shared place where similar things can be thrown in
> bu from user's perspective the underlying hardware feature isn't really
> abstracted. It's just exposing whatever hardware knobs there are. If you
> look at any other cgroup controllers, nothing is exposing this level of
> hardware dependent details and I'd really like to keep it that way.

RDMA cgroup expose hardware details to users. In rdma.{max, current}
interface files we can see actual hardware names. Only difference
compared to Encryption ID cgroup is that latter is exposing that detail
via file names.

Will you prefer that encryption ID cgroup do things similar to RDMA
cgroup? It can have 3 files
1. encids.capacity (only on root)
   Shows features (SEV, SEV-ES, TDX, SEID) available along with capacity
   on the host.
   $ cat encids.capacity
   sev 400
   sev-es 100

2. encids.max (only on non-root)
   Allows setting of the max value of a feature in the cgroup.
   It will only show max for features shown in the capacity file.
   $ cat encids.max
   sev max
   sev-es 100

3. encids.current (all levels)
   Shows total getting used in the cgroup and its descendants.
   $ cat encids.current
   sev 3
   sev-es 0

> 
> So, what I'm asking for is more in-depth analysis of the landscape and
> inherent differences among different vendor implementations to see whether
> there can be better approaches or we should just wait and see.
> 
> > > * If there can be a shared abstraction which hopefully makes intuitive
> > >   sense, that'd be ideal. It doesn't have to be one knob but it shouldn't 
> > > be
> > >   something arbitrary to specific vendors.
> > 
> > I think we should see these as features provided on a host. Tasks can
> > be executed securely on a host with the guarantees provided by the
> > specific feature (SEV, SEV-ES, TDX, SEID) used by the task.
> > 
> > I don't think each H/W vendor can agree to a common set of security
> > guarantees and approach.
> 
> Do TDX and SEID have multiple key types tho?
To my limited knowledge I don't think so. I don't know their future
plans.

> 
> > > * If we aren't there yet and vendor-specific interface is a must, attach
> > >   that part to an interface which is already vendor-aware.
> > Sorry, I don't understand this approach. Can you please give more
> > details about it?
> 
> Attaching the interface to kvm side, most likely, instead of exposing the
> feature through cgroup.
I am little confused, do you mean moving files from the kernel/cgroup/
to kvm related directories or you are recommending not to use cgroup at
all?  I hope it is the former :)

Only issue with this is that TDX is not limited to KVM, they have
potential use cases for MKTME without KVM.

Thanks
Vipin


Re: [Patch v4 1/2] cgroup: svm: Add Encryption ID controller

2021-01-19 Thread Vipin Sharma
On Tue, Jan 19, 2021 at 10:51:24AM -0500, Tejun Heo wrote:
> Hello,
> 
> On Fri, Jan 15, 2021 at 08:32:19PM -0800, Vipin Sharma wrote:
> > SEV-ES has stronger memory encryption gurantees compared to SEV, apart
> > from encrypting the application memory it also encrypts register state
> > among other things. In a single host ASIDs can be distributed between
> > these two types by BIOS settings.
> > 
> > Currently, Google Cloud has Confidential VM machines offering using SEV.
> > ASIDs are not compatible between SEV and SEV-ES, so a VM running on SEV
> > cannot run on SEV-ES and vice versa
> > 
> > There are use cases for both types of VMs getting used in future.
> 
> Can you please elaborate? I skimmed through the amd manual and it seemed to
> say that SEV-ES ASIDs are superset of SEV but !SEV-ES ASIDs. What's the use
> case for mixing those two?

For example, customers can be given options for which kind of protection they
want to choose for their workloads based on factors like data protection
requirement, cost, speed, etc.

In terms of features SEV-ES is superset of SEV but that doesn't mean SEV
ASIDs are superset of SEV ASIDs. SEV ASIDs cannot be used for SEV-ES VMs
and similarly SEV-ES ASIDs cannot be used for SEV VMs. Once a system is
booted, based on the BIOS settings each type will have their own
capacity and that number cannot be changed until the next boot and BIOS
changes.

We are not mixing the two types of ASIDs, they are separate and used
separately.

> 
> > > > > > Other ID types can be easily added in the controller in the same 
> > > > > > way.
> > > > > 
> > > > > I'm not sure this is necessarily a good thing.
> > > > 
> > > > This is to just say that when Intel and PowerPC changes are ready it
> > > > won't be difficult for them to add their controller.
> > > 
> > > I'm not really enthused about having per-hardware-type control knobs. None
> > > of other controllers behave that way. Unless it can be abstracted into
> > > something common, I'm likely to object.
> > 
> > There was a discussion in Patch v1 and consensus was to have individual
> > files because it makes kernel implementation extremely simple.
> > 
> > https://lore.kernel.org/lkml/alpine.deb.2.23.453.2011131615510.333...@chino.kir.corp.google.com/#t
> 
> I'm very reluctant to ack vendor specific interfaces for a few reasons but
> most importantly because they usually indicate abstraction and/or the
> underlying feature not being sufficiently developed and they tend to become
> baggages after a while. So, here are my suggestions:

My first patch was only for SEV, but soon we got comments that this can
be abstracted and used by TDX and SEID for their use cases.

I see this patch as providing an abstraction for simple accounting of
resources used for creating secure execution contexts. Here, secure
execution is achieved through different means. SEID, TDX, and SEV
provide security using different features and capabilities. I am not
sure if we will reach a point where all three and other vendors will use
the same approach and technology for this purpose.

Instead of each one coming up with their own resource tracking for their
features, this patch is providing a common framework and cgroup for
tracking these resources.

> 
> * If there can be a shared abstraction which hopefully makes intuitive
>   sense, that'd be ideal. It doesn't have to be one knob but it shouldn't be
>   something arbitrary to specific vendors.

I think we should see these as features provided on a host. Tasks can
be executed securely on a host with the guarantees provided by the
specific feature (SEV, SEV-ES, TDX, SEID) used by the task.

I don't think each H/W vendor can agree to a common set of security
guarantees and approach.

> 
> * If we aren't there yet and vendor-specific interface is a must, attach
>   that part to an interface which is already vendor-aware.
Sorry, I don't understand this approach. Can you please give more
details about it?

Thanks
Vipin


Re: [Patch v5 2/2] cgroup: svm: Encryption IDs cgroup documentation.

2021-01-19 Thread Vipin Sharma
On Mon, Jan 18, 2021 at 9:55 AM Randy Dunlap  wrote:
>
> On 1/15/21 6:32 PM, Vipin Sharma wrote:
> > Documentation of Encryption IDs controller. This new controller is used
> > to track and limit usage of hardware memory encryption capabilities on
> > the CPUs.
> >
> > Signed-off-by: Vipin Sharma 
> > Reviewed-by: David Rientjes 
> > Reviewed-by: Dionna Glaze 
> > ---
> >  .../admin-guide/cgroup-v1/encryption_ids.rst  |  1 +
> >  Documentation/admin-guide/cgroup-v2.rst   | 78 ++-
> >  2 files changed, 77 insertions(+), 2 deletions(-)
> >  create mode 100644 Documentation/admin-guide/cgroup-v1/encryption_ids.rst
> >
> > diff --git a/Documentation/admin-guide/cgroup-v1/encryption_ids.rst 
> > b/Documentation/admin-guide/cgroup-v1/encryption_ids.rst
> > new file mode 100644
> > index ..8e9e9311daeb
> > --- /dev/null
> > +++ b/Documentation/admin-guide/cgroup-v1/encryption_ids.rst
> > @@ -0,0 +1 @@
> > +/Documentation/admin-guide/cgroup-v2.rst
> > diff --git a/Documentation/admin-guide/cgroup-v2.rst 
> > b/Documentation/admin-guide/cgroup-v2.rst
> > index 63521cd36ce5..72993571de2e 100644
> > --- a/Documentation/admin-guide/cgroup-v2.rst
> > +++ b/Documentation/admin-guide/cgroup-v2.rst
> > @@ -63,8 +63,11 @@ v1 is available under 
> > :ref:`Documentation/admin-guide/cgroup-v1/index.rst  > 5-7-1. RDMA Interface Files
> >   5-8. HugeTLB
> > 5.8-1. HugeTLB Interface Files
> > - 5-8. Misc
> > -   5-8-1. perf_event
> > + 5-9. Encryption IDs
> > +   5.9-1 Encryption IDs Interface Files
> > +   5.9-2 Migration and Ownership
> > + 5-10. Misc
> > +   5-10-1. perf_event
> >   5-N. Non-normative information
> > 5-N-1. CPU controller root cgroup process behaviour
> > 5-N-2. IO controller root cgroup process behaviour
> > @@ -2160,6 +2163,77 @@ HugeTLB Interface Files
> >   are local to the cgroup i.e. not hierarchical. The file modified event
> >   generated on this file reflects only the local events.
> >
> > +Encryption IDs
> > +--
> > +
> > +There are multiple hardware memory encryption capabilities provided by the
> > +hardware vendors, like Secure Encrypted Virtualization (SEV) and SEV 
> > Encrypted
> > +State (SEV-ES) from AMD.
> > +
> > +These features are being used in encrypting virtual machines (VMs) and user
> > +space programs. However, only a small number of keys/IDs can be used
> > +simultaneously.
> > +
> > +This limited availability of these IDs requires system admin to optimize
>
>   admins
>
> > +allocation, control, and track the usage of the resources in the cloud
> > +infrastructure. This resource also needs to be protected from getting 
> > exhausted
> > +by some malicious program and causing starvation for other programs.
> > +
> > +Encryption IDs controller provides capability to register the resource for
>
>The Encryption IDs controller provides the capability to register the 
> resource for
>
> > +controlling and tracking through the cgroups.
>
> through cgroups.
>
> > +
> > +Encryption IDs Interface Files
> > +~~
> > +
> > +Each encryption ID type have their own interface files,
>
>has its own
>
> > +encids.[ID TYPE].{max, current, stat}, where "ID TYPE" can be sev and
>
>  or
>
> > +sev-es.
> > +
> > +  encids.[ID TYPE].stat
> > +A read-only flat-keyed single value file. This file exists only in 
> > the
> > +root cgroup.
> > +
> > +It shows the total number of encryption IDs available and 
> > currently in
> > +use on the platform::
> > +  # cat encids.sev.stat
> > +  total 509
> > +  used 0
>
> This is described above as a single-value file...
>
> Is the max value a hardware limit or a software (flexible) limit?
>
>
> > +
> > +  encids.[ID TYPE].max
> > +A read-write file which exists on the non-root cgroups. File is 
> > used to
> > +set maximum count of "[ID TYPE]" which can be used in the cgroup.
> > +
> > +Limit can be set to max by::
> > +  # echo max > encids.sev.max
> > +
> > +Limit can be set by::
> > +   

Re: [Patch v4 1/2] cgroup: svm: Add Encryption ID controller

2021-01-15 Thread Vipin Sharma
On Fri, Jan 15, 2021 at 10:43:32PM -0500, Tejun Heo wrote:
> On Fri, Jan 15, 2021 at 02:18:40PM -0800, Vipin Sharma wrote:
> > > * Why is .sev a separate namespace? Isn't the controller supposed to cover
> > >   encryption ids across different implementations? It's not like multiple
> > >   types of IDs can be in use on the same machine, right?
> > > 
> > 
> > On AMD platform we have two types SEV and SEV-ES which can exists
> > simultaneously and they have their own quota.
> 
> Can you please give a brief explanation of the two and lay out a scenario
> where the two are being used / allocated disjointly?
> 

SEV-ES has stronger memory encryption gurantees compared to SEV, apart
from encrypting the application memory it also encrypts register state
among other things. In a single host ASIDs can be distributed between
these two types by BIOS settings.

Currently, Google Cloud has Confidential VM machines offering using SEV.
ASIDs are not compatible between SEV and SEV-ES, so a VM running on SEV
cannot run on SEV-ES and vice versa

There are use cases for both types of VMs getting used in future.

> > > > Other ID types can be easily added in the controller in the same way.
> > > 
> > > I'm not sure this is necessarily a good thing.
> > 
> > This is to just say that when Intel and PowerPC changes are ready it
> > won't be difficult for them to add their controller.
> 
> I'm not really enthused about having per-hardware-type control knobs. None
> of other controllers behave that way. Unless it can be abstracted into
> something common, I'm likely to object.

There was a discussion in Patch v1 and consensus was to have individual
files because it makes kernel implementation extremely simple.

https://lore.kernel.org/lkml/alpine.deb.2.23.453.2011131615510.333...@chino.kir.corp.google.com/#t

> 
> > > > +static int enc_id_cg_stat_show(struct seq_file *sf, void *v)
> > > > +{
> > > > +   unsigned long flags;
> > > > +   enum encryption_id_type type = seq_cft(sf)->private;
> > > > +
> > > > +   spin_lock_irqsave(_id_cg_lock, flags);
> > > > +
> > > > +   seq_printf(sf, "total %u\n", enc_id_capacity[type]);
> > > > +   seq_printf(sf, "used %u\n", root_cg.res[type].usage);
> > > 
> > > Dup with .current and no need to show total on every cgroup, right?
> > 
> > This is for the stat file which will only be seen in the root cgroup
> > directory.  It is to know overall picture for the resource, what is the
> > total capacity and what is the current usage. ".current" file is not
> > shown on the root cgroup.
> 
> Ah, missed the flags. It's odd for the usage to be presented in two
> different ways tho. I think it'd make more sense w/ cgroup.current at root
> level. Is the total number available somewhere else in the system?

This information is not available anywhere else in the system. Only
other way to get this value is to use CPUID instruction (0x801F) of
the processor. Which also has disdvantage if sev module in kernel
doesn't use all of the available ASIDs for its work (right now it uses
all) then there will be a mismatch between what user get through their
code and what is actually getting used in the kernel by sev.

In cgroup v2, I didn't see current files for other cgroups in root
folder that is why I didn't show that file in root folder.

Will you be fine if I show two files in the root, something like:

encids.sev.capacity
encids.sev.current

In non root folder, it will be:
encids.sev.max
encids.sev.current

I still prefer encids.sev.stat, as it won't repeat same information in
each cgroup but let me know what you think.

Thanks


[Patch v5 2/2] cgroup: svm: Encryption IDs cgroup documentation.

2021-01-15 Thread Vipin Sharma
Documentation of Encryption IDs controller. This new controller is used
to track and limit usage of hardware memory encryption capabilities on
the CPUs.

Signed-off-by: Vipin Sharma 
Reviewed-by: David Rientjes 
Reviewed-by: Dionna Glaze 
---
 .../admin-guide/cgroup-v1/encryption_ids.rst  |  1 +
 Documentation/admin-guide/cgroup-v2.rst   | 78 ++-
 2 files changed, 77 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/admin-guide/cgroup-v1/encryption_ids.rst

diff --git a/Documentation/admin-guide/cgroup-v1/encryption_ids.rst 
b/Documentation/admin-guide/cgroup-v1/encryption_ids.rst
new file mode 100644
index ..8e9e9311daeb
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/encryption_ids.rst
@@ -0,0 +1 @@
+/Documentation/admin-guide/cgroup-v2.rst
diff --git a/Documentation/admin-guide/cgroup-v2.rst 
b/Documentation/admin-guide/cgroup-v2.rst
index 63521cd36ce5..72993571de2e 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -63,8 +63,11 @@ v1 is available under 
:ref:`Documentation/admin-guide/cgroup-v1/index.rst  encids.sev.max
+
+Limit can be set by::
+  # echo 100 > encids.sev.max
+
+This file shows the max limit of the encryption ID in the cgroup::
+  # cat encids.sev.max
+  max
+
+OR::
+  # cat encids.sev.max
+  100
+
+Limits can be set more than the "total" capacity value in the
+encids.[ID TYPE].stat file, however, the controller ensures
+that the usage never exceeds the "total" and the max limit.
+
+  encids.[ID TYPE].current
+A read-only single value file which exists on non-root cgroups.
+
+Shows the total number of encrypted IDs being used in the cgroup.
+
+Migration and Ownership
+~~~
+
+An encryption ID is charged to the cgroup in which it is used first, and
+stays charged to that cgroup until that ID is freed. Migrating a process
+to a different cgroup do not move the charge to the destination cgroup
+where the process has moved.
+
 Misc
 
 
-- 
2.30.0.284.gd98b1dd5eaa7-goog



[Patch v5 1/2] cgroup: svm: Add Encryption ID controller

2021-01-15 Thread Vipin Sharma
Hardware memory encryption is available on multiple generic CPUs. For
example AMD has Secure Encrypted Virtualization (SEV) and SEV -
Encrypted State (SEV-ES).

These memory encryptions are useful in creating encrypted virtual
machines (VMs) and user space programs.

There are limited number of encryption IDs that can be used
simultaneously on a machine for encryption. This generates a need for
the system admin to track, limit, allocate resources, and optimally
schedule VMs and user workloads in the cloud infrastructure. Some
malicious programs can exhaust all of these resources on a host causing
starvation of other workloads.

Encryption ID controller allows control of these resources using
Cgroups.

Controller is enabled by CGROUP_ENCRYPTION_IDS config option.
Encryption controller provide 3 interface files for each encryption ID
type. For example, in SEV:

1. encids.sev.max
Sets the maximum usage of SEV IDs in the cgroup.
2. encids.sev.current
Current usage of SEV IDs in the cgroup and its children.
3. encids.sev.stat
Shown only at the root cgroup. Displays total SEV IDs available
on the platform and current usage count.

Signed-off-by: Vipin Sharma 
Reviewed-by: David Rientjes 
Reviewed-by: Dionna Glaze 
Acked-by: Brijesh Singh 
---
 arch/x86/kvm/svm/sev.c|  52 +++-
 include/linux/cgroup_subsys.h |   4 +
 include/linux/encryption_ids_cgroup.h |  72 +
 include/linux/kvm_host.h  |   4 +
 init/Kconfig  |  14 +
 kernel/cgroup/Makefile|   1 +
 kernel/cgroup/encryption_ids.c| 421 ++
 7 files changed, 556 insertions(+), 12 deletions(-)
 create mode 100644 include/linux/encryption_ids_cgroup.h
 create mode 100644 kernel/cgroup/encryption_ids.c

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index c8ffdbc81709..13d9e9ea6dc8 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -86,10 +87,18 @@ static bool __sev_recycle_asids(int min_asid, int max_asid)
return true;
 }
 
-static int sev_asid_new(struct kvm_sev_info *sev)
+static int sev_asid_new(struct kvm *kvm)
 {
-   int pos, min_asid, max_asid;
+   int pos, min_asid, max_asid, ret;
+   struct kvm_sev_info *sev = _kvm_svm(kvm)->sev_info;
bool retry = true;
+   enum encryption_id_type type;
+
+   type = sev->es_active ? ENCRYPTION_ID_SEV_ES : ENCRYPTION_ID_SEV;
+
+   ret = enc_id_cg_try_charge(kvm, type, 1);
+   if (ret)
+   return ret;
 
mutex_lock(_bitmap_lock);
 
@@ -107,7 +116,8 @@ static int sev_asid_new(struct kvm_sev_info *sev)
goto again;
}
mutex_unlock(_bitmap_lock);
-   return -EBUSY;
+   ret = -EBUSY;
+   goto e_uncharge;
}
 
__set_bit(pos, sev_asid_bitmap);
@@ -115,6 +125,9 @@ static int sev_asid_new(struct kvm_sev_info *sev)
mutex_unlock(_bitmap_lock);
 
return pos + 1;
+e_uncharge:
+   enc_id_cg_uncharge(kvm, type, 1);
+   return ret;
 }
 
 static int sev_get_asid(struct kvm *kvm)
@@ -124,14 +137,16 @@ static int sev_get_asid(struct kvm *kvm)
return sev->asid;
 }
 
-static void sev_asid_free(int asid)
+static void sev_asid_free(struct kvm *kvm)
 {
struct svm_cpu_data *sd;
int cpu, pos;
+   struct kvm_sev_info *sev = _kvm_svm(kvm)->sev_info;
+   enum encryption_id_type type;
 
mutex_lock(_bitmap_lock);
 
-   pos = asid - 1;
+   pos = sev->asid - 1;
__set_bit(pos, sev_reclaim_asid_bitmap);
 
for_each_possible_cpu(cpu) {
@@ -140,6 +155,9 @@ static void sev_asid_free(int asid)
}
 
mutex_unlock(_bitmap_lock);
+
+   type = sev->es_active ? ENCRYPTION_ID_SEV_ES : ENCRYPTION_ID_SEV;
+   enc_id_cg_uncharge(kvm, type, 1);
 }
 
 static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
@@ -184,22 +202,22 @@ static int sev_guest_init(struct kvm *kvm, struct 
kvm_sev_cmd *argp)
if (unlikely(sev->active))
return ret;
 
-   asid = sev_asid_new(sev);
+   asid = sev_asid_new(kvm);
if (asid < 0)
return ret;
+   sev->asid = asid;
 
ret = sev_platform_init(>error);
if (ret)
goto e_free;
 
sev->active = true;
-   sev->asid = asid;
INIT_LIST_HEAD(>regions_list);
 
return 0;
 
 e_free:
-   sev_asid_free(asid);
+   sev_asid_free(kvm);
return ret;
 }
 
@@ -1240,12 +1258,12 @@ void sev_vm_destroy(struct kvm *kvm)
mutex_unlock(>lock);
 
sev_unbind_asid(kvm, sev->handle);
-   sev_asid_free(sev->asid);
+   sev_asid_free(kvm);
 }
 
 void __init sev_hardware_setup(void)
 {
-   unsigned int eax, ebx, ecx, edx;
+  

[Patch v5 0/2] cgroup: KVM: New Encryption IDs cgroup controller

2021-01-15 Thread Vipin Sharma
Hello,

This patch adds a new cgroup controller, Encryption IDs, to track and
limit the usage of encryption IDs on a host.

AMD provides Secure Encrypted Virtualization (SEV) and SEV with
Encrypted State (SEV-ES) to encrypt the guest OS's memory using limited
number of Address Space Identifiers (ASIDs).

This limited number of ASIDs creates issues like SEV ASID starvation and
unoptimized scheduling in the cloud infrastucture.

In the RFC patch v1, I provided only SEV cgroup controller but based
on the feedback and discussion it became clear that this cgroup
controller can be extended to be used by Intel's Trusted Domain
Extension (TDX) and s390's protected virtualization Secure Execution IDs
(SEID)

This patch series provides a generic Encryption IDs controller with
tracking support of the SEV and SEV-ES ASIDs.

Changes in v5:
- Changed controller filenames from encryption_ids.*.* to encids.*.*
- Documentation of cgroup v1 now points to cgroup v2.

Changes in v4:
- The max value can be set lower than the current.
- Added SEV-ES support.

Changes in v3:
- Fixes a build error when CONFIG_CGROUP is disabled.

Changes in v2:
- Changed cgroup name from sev to encryption_ids.
- Replaced SEV specific names in APIs and documentations with generic
  encryption IDs.
- Providing 3 cgroup files per encryption ID type. For example in SEV,
  - encryption_ids.sev.stat (only in the root cgroup directory).
  - encryption_ids.sev.max
  - encryption_ids.sev.current

[1] https://lore.kernel.org/lkml/20200922004024.3699923-1-vipi...@google.com/
[2] https://lore.kernel.org/lkml/20201208213531.2626955-1-vipi...@google.com/
[3] https://lore.kernel.org/lkml/20201209205413.3391139-1-vipi...@google.com/
[4] https://lore.kernel.org/lkml/20210108012846.4134815-1-vipi...@google.com/

Vipin Sharma (2):
  cgroup: svm: Add Encryption ID controller
  cgroup: svm: Encryption IDs cgroup documentation.

 .../admin-guide/cgroup-v1/encryption_ids.rst  |   1 +
 Documentation/admin-guide/cgroup-v2.rst   |  78 +++-
 arch/x86/kvm/svm/sev.c|  52 ++-
 include/linux/cgroup_subsys.h |   4 +
 include/linux/encryption_ids_cgroup.h |  72 +++
 include/linux/kvm_host.h  |   4 +
 init/Kconfig  |  14 +
 kernel/cgroup/Makefile|   1 +
 kernel/cgroup/encryption_ids.c| 421 ++
 9 files changed, 633 insertions(+), 14 deletions(-)
 create mode 100644 Documentation/admin-guide/cgroup-v1/encryption_ids.rst
 create mode 100644 include/linux/encryption_ids_cgroup.h
 create mode 100644 kernel/cgroup/encryption_ids.c

-- 
2.30.0.284.gd98b1dd5eaa7-goog



Re: [Patch v4 1/2] cgroup: svm: Add Encryption ID controller

2021-01-15 Thread Vipin Sharma
On Fri, Jan 15, 2021 at 03:59:25PM -0500, Tejun Heo wrote:
> Hello,
> 
> On Thu, Jan 07, 2021 at 05:28:45PM -0800, Vipin Sharma wrote:
> > 1. encrpytion_ids.sev.max
> > Sets the maximum usage of SEV IDs in the cgroup.
> > 2. encryption_ids.sev.current
> > Current usage of SEV IDs in the cgroup and its children.
> > 3. encryption_ids.sev.stat
> > Shown only at the root cgroup. Displays total SEV IDs available
> > on the platform and current usage count.
> 
> Sorry, should have raised these earlier:
> 
> * Can we shorten the name to encids?

Sure.

> 
> * Why is .sev a separate namespace? Isn't the controller supposed to cover
>   encryption ids across different implementations? It's not like multiple
>   types of IDs can be in use on the same machine, right?
> 

On AMD platform we have two types SEV and SEV-ES which can exists
simultaneously and they have their own quota.

> > Other ID types can be easily added in the controller in the same way.
> 
> I'm not sure this is necessarily a good thing.

This is to just say that when Intel and PowerPC changes are ready it
won't be difficult for them to add their controller.

> 
> > +/**
> > + * enc_id_cg_uncharge_hierarchy() - Uncharge the enryption ID cgroup 
> > hierarchy.
> > + * @start_cg: Starting cgroup.
> > + * @stop_cg: cgroup at which uncharge stops.
> > + * @type: type of encryption ID to uncharge.
> > + * @amount: Charge amount.
> > + *
> > + * Uncharge the cgroup tree from the given start cgroup to the stop cgroup.
> > + *
> > + * Context: Any context. Expects enc_id_cg_lock to be held by the caller.
> > + */
> > +static void enc_id_cg_uncharge_hierarchy(struct encryption_id_cgroup 
> > *start_cg,
> > +struct encryption_id_cgroup *stop_cg,
> > +enum encryption_id_type type,
> > +unsigned int amount)
> > +{
> > +   struct encryption_id_cgroup *i;
> > +
> > +   lockdep_assert_held(_id_cg_lock);
> > +
> > +   for (i = start_cg; i != stop_cg; i = parent_enc(i)) {
> > +   WARN_ON_ONCE(i->res[type].usage < amount);
> > +   i->res[type].usage -= amount;
> > +   }
> > +   css_put(_cg->css);
> 
> I'm curious whether this is necessary given that a css can't be destroyed
> while tasks are attached. Are there cases where this wouldn't hold true? If
> so, it'd be great to have some comments on how that can happen.

We are not moving charges when a task moves out. This can lead us to the
cases where all of the tasks in the cgroup have moved out but it
still has charges. In that scenarios cgroup can be deleted. Taking a
reference will make sure cgroup is atleast present internally.

Also, struct encryption_ic_cgroup has a reference to the cgroup which is
used during uncharge call to correctly identify from which cgroup charge
should be deducted.

> 
> > +/**
> > + * enc_id_cg_max_write() - Update the maximum limit of the cgroup.
> > + * @of: Handler for the file.
> > + * @buf: Data from the user. It should be either "max", 0, or a positive
> > + *  integer.
> > + * @nbytes: Number of bytes of the data.
> > + * @off: Offset in the file.
> > + *
> > + * Uses cft->private value to determine for which enryption ID type 
> > results be
> > + * shown.
> > + *
> > + * Context: Any context. Takes and releases enc_id_cg_lock.
> > + * Return:
> > + * * >= 0 - Number of bytes processed in the input.
> > + * * -EINVAL - If buf is not valid.
> > + * * -ERANGE - If number is bigger than unsigned int capacity.
> > + * * -EBUSY - If usage can become more than max limit.
> 
> The aboves are stale, right?

-EBUSY is not valid anymore. We can now set max to be less than the usage. I
will remove it in the next patch.

> 
> > +static int enc_id_cg_stat_show(struct seq_file *sf, void *v)
> > +{
> > +   unsigned long flags;
> > +   enum encryption_id_type type = seq_cft(sf)->private;
> > +
> > +   spin_lock_irqsave(_id_cg_lock, flags);
> > +
> > +   seq_printf(sf, "total %u\n", enc_id_capacity[type]);
> > +   seq_printf(sf, "used %u\n", root_cg.res[type].usage);
> 
> Dup with .current and no need to show total on every cgroup, right?

This is for the stat file which will only be seen in the root cgroup
directory.  It is to know overall picture for the resource, what is the
total capacity and what is the current usage. ".current" file is not
shown on the root cgroup.

This information is good for resource allocation in the cloud
infrastructure.

> 
> Thanks.
> 
> -- 
> tejun


Re: [Patch v4 2/2] cgroup: svm: Encryption IDs cgroup documentation.

2021-01-15 Thread Vipin Sharma
On Fri, Jan 15, 2021 at 04:00:26PM -0500, Tejun Heo wrote:
> On Thu, Jan 07, 2021 at 05:28:46PM -0800, Vipin Sharma wrote:
> > Documentation for both cgroup versions, v1 and v2, of Encryption IDs
> > controller. This new controller is used to track and limit usage of
> > hardware memory encryption capabilities on the CPUs.
> > 
> > Signed-off-by: Vipin Sharma 
> > Reviewed-by: David Rientjes 
> > Reviewed-by: Dionna Glaze 
> > ---
> >  .../admin-guide/cgroup-v1/encryption_ids.rst  | 108 ++
> >  Documentation/admin-guide/cgroup-v2.rst   |  78 -
> 
> Given how trivial it is, I'm not gonna object to adding new v1 interface but
> maybe just point to v2 doc from v1?
> 

Sure, I will just add the path to v2 doc in v1.

> Thanks.
> 
> -- 
> tejun


[Patch v4 1/2] cgroup: svm: Add Encryption ID controller

2021-01-07 Thread Vipin Sharma
Hardware memory encryption is available on multiple generic CPUs. For
example AMD has Secure Encrypted Virtualization (SEV) and SEV -
Encrypted State (SEV-ES).

These memory encryptions are useful in creating encrypted virtual
machines (VMs) and user space programs.

There are limited number of encryption IDs that can be used
simultaneously on a machine for encryption. This generates a need for
the system admin to track, limit, allocate resources, and optimally
schedule VMs and user workloads in the cloud infrastructure. Some
malicious programs can exhaust all of these resources on a host causing
starvation of other workloads.

Encryption ID controller allows control of these resources using
Cgroups.

Controller is enabled by CGROUP_ENCRYPTION_IDS config option.
Encryption controller provide 3 interface files for each encryption ID
type. For example, in SEV:

1. encrpytion_ids.sev.max
Sets the maximum usage of SEV IDs in the cgroup.
2. encryption_ids.sev.current
Current usage of SEV IDs in the cgroup and its children.
3. encryption_ids.sev.stat
Shown only at the root cgroup. Displays total SEV IDs available
on the platform and current usage count.

Other ID types can be easily added in the controller in the same way.

Signed-off-by: Vipin Sharma 
Reviewed-by: David Rientjes 
Reviewed-by: Dionna Glaze 
---
 arch/x86/kvm/svm/sev.c|  52 +++-
 include/linux/cgroup_subsys.h |   4 +
 include/linux/encryption_ids_cgroup.h |  72 +
 include/linux/kvm_host.h  |   4 +
 init/Kconfig  |  14 +
 kernel/cgroup/Makefile|   1 +
 kernel/cgroup/encryption_ids.c| 422 ++
 7 files changed, 557 insertions(+), 12 deletions(-)
 create mode 100644 include/linux/encryption_ids_cgroup.h
 create mode 100644 kernel/cgroup/encryption_ids.c

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 9858d5ae9ddd..1924ab2eaf11 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -86,10 +87,18 @@ static bool __sev_recycle_asids(int min_asid, int max_asid)
return true;
 }
 
-static int sev_asid_new(struct kvm_sev_info *sev)
+static int sev_asid_new(struct kvm *kvm)
 {
-   int pos, min_asid, max_asid;
+   int pos, min_asid, max_asid, ret;
+   struct kvm_sev_info *sev = _kvm_svm(kvm)->sev_info;
bool retry = true;
+   enum encryption_id_type type;
+
+   type = sev->es_active ? ENCRYPTION_ID_SEV_ES : ENCRYPTION_ID_SEV;
+
+   ret = enc_id_cg_try_charge(kvm, type, 1);
+   if (ret)
+   return ret;
 
mutex_lock(_bitmap_lock);
 
@@ -107,7 +116,8 @@ static int sev_asid_new(struct kvm_sev_info *sev)
goto again;
}
mutex_unlock(_bitmap_lock);
-   return -EBUSY;
+   ret = -EBUSY;
+   goto e_uncharge;
}
 
__set_bit(pos, sev_asid_bitmap);
@@ -115,6 +125,9 @@ static int sev_asid_new(struct kvm_sev_info *sev)
mutex_unlock(_bitmap_lock);
 
return pos + 1;
+e_uncharge:
+   enc_id_cg_uncharge(kvm, type, 1);
+   return ret;
 }
 
 static int sev_get_asid(struct kvm *kvm)
@@ -124,14 +137,16 @@ static int sev_get_asid(struct kvm *kvm)
return sev->asid;
 }
 
-static void sev_asid_free(int asid)
+static void sev_asid_free(struct kvm *kvm)
 {
struct svm_cpu_data *sd;
int cpu, pos;
+   struct kvm_sev_info *sev = _kvm_svm(kvm)->sev_info;
+   enum encryption_id_type type;
 
mutex_lock(_bitmap_lock);
 
-   pos = asid - 1;
+   pos = sev->asid - 1;
__set_bit(pos, sev_reclaim_asid_bitmap);
 
for_each_possible_cpu(cpu) {
@@ -140,6 +155,9 @@ static void sev_asid_free(int asid)
}
 
mutex_unlock(_bitmap_lock);
+
+   type = sev->es_active ? ENCRYPTION_ID_SEV_ES : ENCRYPTION_ID_SEV;
+   enc_id_cg_uncharge(kvm, type, 1);
 }
 
 static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
@@ -184,22 +202,22 @@ static int sev_guest_init(struct kvm *kvm, struct 
kvm_sev_cmd *argp)
if (unlikely(sev->active))
return ret;
 
-   asid = sev_asid_new(sev);
+   asid = sev_asid_new(kvm);
if (asid < 0)
return ret;
+   sev->asid = asid;
 
ret = sev_platform_init(>error);
if (ret)
goto e_free;
 
sev->active = true;
-   sev->asid = asid;
INIT_LIST_HEAD(>regions_list);
 
return 0;
 
 e_free:
-   sev_asid_free(asid);
+   sev_asid_free(kvm);
return ret;
 }
 
@@ -1240,12 +1258,12 @@ void sev_vm_destroy(struct kvm *kvm)
mutex_unlock(>lock);
 
sev_unbind_asid(kvm, sev->handle);
-   sev_asid_free(sev->asid);
+   sev_asid_free(kvm);
 }
 
 void __init sev_hardw

[Patch v4 2/2] cgroup: svm: Encryption IDs cgroup documentation.

2021-01-07 Thread Vipin Sharma
Documentation for both cgroup versions, v1 and v2, of Encryption IDs
controller. This new controller is used to track and limit usage of
hardware memory encryption capabilities on the CPUs.

Signed-off-by: Vipin Sharma 
Reviewed-by: David Rientjes 
Reviewed-by: Dionna Glaze 
---
 .../admin-guide/cgroup-v1/encryption_ids.rst  | 108 ++
 Documentation/admin-guide/cgroup-v2.rst   |  78 -
 2 files changed, 184 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/admin-guide/cgroup-v1/encryption_ids.rst

diff --git a/Documentation/admin-guide/cgroup-v1/encryption_ids.rst 
b/Documentation/admin-guide/cgroup-v1/encryption_ids.rst
new file mode 100644
index ..891143b4e229
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/encryption_ids.rst
@@ -0,0 +1,108 @@
+=
+Encryption IDs Controller
+=
+
+Overview
+
+There are multiple hardware memory encryption capabilities provided by the
+hardware vendors, like Secure Encrypted Virtualization (SEV) and SEV Encrypted
+State (SEV-ES) from AMD.
+
+These features are being used in encrypting virtual machines (VMs) and user
+space programs. However, only a small number of keys/IDs can be used
+simultaneously.
+
+This limited availability of these IDs requires system admin to optimize
+allocation, control, and track the usage of the resources in the cloud
+infrastructure. This resource also needs to be protected from getting exhausted
+by some malicious program and causing starvation for other programs.
+
+Encryption IDs controller provides capability to register the resource for
+controlling and tracking through the cgroups.
+
+How to Enable Controller
+
+
+- Enable Encryption controller::
+
+CONFIG_CGROUP_ENCRYPTION_IDS=y
+
+- Above options will build Encryption controller support in the kernel.
+  To mount the Encryption controller::
+
+mount -t cgroup -o encryption none /sys/fs/cgroup/encryption
+
+
+Interface Files
+===
+Each encryption ID type have their own interface files,
+encryption_id.[ID TYPE].{max, current, stat}, where "ID TYPE" can be sev and
+sev-es.
+
+  encryption_ids.[ID TYPE].stat
+A read-only flat-keyed single value file. This file exists only in the
+root cgroup.
+
+It shows the total number of encryption IDs available and currently in
+use on the platform::
+  # cat encryption.sev.stat
+  total 509
+  used 0
+
+  encryption_ids.[ID TYPE].max
+A read-write file which exists on the non-root cgroups. File is used to
+set maximum count of "[ID TYPE]" which can be used in the cgroup.
+
+Limit can be set to max by::
+  # echo max > encryption.sev.max
+
+Limit can be set by::
+  # echo 100 > encryption.sev.max
+
+This file shows the max limit of the encryption ID in the cgroup::
+  # cat encryption.sev.max
+  max
+
+OR::
+  # cat encryption.sev.max
+  100
+
+Limits can be set more than the "total" capacity value in the
+encryption_ids.[ID TYPE].stat file, however, the controller ensures
+that the usage never exceeds the "total" and the max limit.
+
+  encryption_ids.[ID TYPE].current
+A read-only single value file which exists on non-root cgroups.
+
+Shows the total number of encrypted IDs being used in the cgroup.
+
+Hierarchy
+=
+
+Encryption IDs controller supports hierarchical accounting. It supports
+following features:
+
+1. Current usage in the cgroup shows IDs used in the cgroup and its descendent 
cgroups.
+2. Current usage can never exceed the corresponding max limit set in the cgroup
+   and its ancestor's chain up to the root.
+
+Suppose the following example hierarchy::
+
+root
+/  \
+   AB
+   |
+   C
+
+1. A will show the count of IDs used in A and C.
+2. C's current IDs usage may not exceed any of the max limits set in C, A, or
+   root.
+
+Migration and ownership
+===
+
+An encryption ID is charged to the cgroup in which it is used first, and
+stays charged to that cgroup until that ID is freed. Migrating a process
+to a different cgroup do not move the charge to the destination cgroup
+where the process has moved.
+
diff --git a/Documentation/admin-guide/cgroup-v2.rst 
b/Documentation/admin-guide/cgroup-v2.rst
index 63521cd36ce5..b6ea47b9e882 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -63,8 +63,11 @@ v1 is available under 
:ref:`Documentation/admin-guide/cgroup-v1/index.rst  encryption.sev.max
+
+Limit can be set by::
+  # echo 100 > encryption.sev.max
+
+This file shows the max limit of the encryption ID in the cgroup::
+

[Patch v4 0/2] cgroup: KVM: New Encryption IDs cgroup controller

2021-01-07 Thread Vipin Sharma
Hello,

This patch adds a new cgroup controller, Encryption IDs, to track and
limit the usage of encryption IDs on a host.

AMD provides Secure Encrypted Virtualization (SEV) and SEV with
Encrypted State (SEV-ES) to encrypt the guest OS's memory using limited
number of Address Space Identifiers (ASIDs).

This limited number of ASIDs creates issues like SEV ASID starvation and
unoptimized scheduling in the cloud infrastucture.

In the RFC patch v1, I provided only SEV cgroup controller but based
on the feedback and discussion it became clear that this cgroup
controller can be extended to be used by Intel's Trusted Domain
Extension (TDX) and s390's protected virtualization Secure Execution IDs
(SEID)

This patch series provides a generic Encryption IDs controller with
tracking support of the SEV and SEV-ES ASIDs.

Changes in v4:
- The max value can be set lower than the current.
- Added SEV-ES support.

Changes in v3:
- Fixes a build error when CONFIG_CGROUP is disabled.

Changes in v2:
- Changed cgroup name from sev to encryption_ids.
- Replaced SEV specific names in APIs and documentations with generic
  encryption IDs.
- Providing 3 cgroup files per encryption ID type. For example in SEV,
  - encryption_ids.sev.stat (only in the root cgroup directory).
  - encryption_ids.sev.max
  - encryption_ids.sev.current

[1] https://lore.kernel.org/lkml/20200922004024.3699923-1-vipi...@google.com/
[2] https://lore.kernel.org/lkml/20201208213531.2626955-1-vipi...@google.com/
[3] https://lore.kernel.org/lkml/20201209205413.3391139-1-vipi...@google.com/

Vipin Sharma (2):
  cgroup: svm: Add Encryption ID controller
  cgroup: svm: Encryption IDs cgroup documentation.

 .../admin-guide/cgroup-v1/encryption_ids.rst  | 108 +
 Documentation/admin-guide/cgroup-v2.rst   |  78 +++-
 arch/x86/kvm/svm/sev.c|  52 ++-
 include/linux/cgroup_subsys.h |   4 +
 include/linux/encryption_ids_cgroup.h |  72 +++
 include/linux/kvm_host.h  |   4 +
 init/Kconfig  |  14 +
 kernel/cgroup/Makefile|   1 +
 kernel/cgroup/encryption_ids.c| 422 ++
 9 files changed, 741 insertions(+), 14 deletions(-)
 create mode 100644 Documentation/admin-guide/cgroup-v1/encryption_ids.rst
 create mode 100644 include/linux/encryption_ids_cgroup.h
 create mode 100644 kernel/cgroup/encryption_ids.c

-- 
2.29.2.729.g45daf8777d-goog



Re: [Patch v3 0/2] cgroup: KVM: New Encryption IDs cgroup controller

2021-01-06 Thread Vipin Sharma
On Tue, Jan 05, 2021 at 10:36:40AM -0500, Tejun Heo wrote:
> Happy new year!
> 
> On Wed, Dec 16, 2020 at 12:02:37PM -0800, Vipin Sharma wrote:
> > I like the idea of having a separate controller to keep the code simple and
> > easier for maintenance.
> 
> Yeah, the more I think about it, keeping it separate seems like the right
> thing to do. What bothers me primarily is that the internal logic is
> identical between the RDMA controller and this one. If you wanna try
> factoring them out into library, great. If not, I don't think it should
> block merging this controller. We can get to refactoring later.
> 

Happy new year!
Sounds great, I will send out a new patch which will not reject the new
max limit based on the current usage. It will not include refactoring
out common code between RDMA and Encryption ID controller. We can pursue
that later.

Thanks
Vipin


Re: [Patch v3 0/2] cgroup: KVM: New Encryption IDs cgroup controller

2020-12-16 Thread Vipin Sharma
On Wed, Dec 9, 2020 at 12:59 PM Tejun Heo  wrote:
> * I don't have an overall objection. In terms of behavior, the only thing
>   which stood out was input rejection depending on the current usage. The
>   preferred way of handling that is rejecting future allocations rather than
>   failing configuration as that makes it impossible e.g. to lower limit and
>   drain existing usages from outside the container.

Thanks. In next version of the patch I will remove rejection of max value
based on current usage.

On Wed, Dec 16, 2020 at 7:27 AM Tejun Heo  wrote:
> On Thu, Dec 10, 2020 at 03:44:35PM -0800, David Rientjes wrote:
> > Concern with a single misc controller would be that any subsystem that
> > wants to use it has to exactly fit this support: current, max, stat,
> > nothing more.  The moment a controller needs some additional support, and
> > its controller is already implemented in previous kernel versionv as a
> > part of "misc," we face a problem.
>
> Yeah, that's a valid concern, but given the history, there doesn't seem to
> be much need beyond that for these use cases and the limited need seems
> inherent to the way the resources are defined and consumed. So yeah, it can
> either way.

I think a misc controller should be able support other "Resource Distribution
Models" mentioned in the cgroup v2 documentation besides limits. There might be
use cases in future which want to use weight, protection or allocation models.
If that happens it will be more difficult to support these different resources.
This will also mean the same hierarchy might get charged differently by the
same controller.

I like the idea of having a separate controller to keep the code simple and
easier for maintenance.

If you decide to have a separate misc controller please let me know what will
be the overall expectations and I can change my patch to reflect that,
otherwise I can send out a new patch with just removal of max input rejection.

My understanding with a "misc" controller is it will be something like
- cgroup v2
  cgroup/misc.encryption_ids.{sev, tdx, seid}.{stat, max, current}

- cgroup v1
  cgroup/misc/misc.encryption_ids.{sev, tdx, seid}.{stat, max, current}

Thanks
Vipin


[Patch v3 2/2] cgroup: svm: Encryption IDs cgroup documentation.

2020-12-09 Thread Vipin Sharma
Documentation for both cgroup versions, v1 and v2, of Encryption IDs
controller. This new controller is used to track and limit usage of
hardware memory encryption capabilities on the CPUs.

Signed-off-by: Vipin Sharma 
Reviewed-by: David Rientjes 
Reviewed-by: Dionna Glaze 
---
 .../admin-guide/cgroup-v1/encryption_ids.rst  | 108 ++
 Documentation/admin-guide/cgroup-v2.rst   |  78 -
 2 files changed, 184 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/admin-guide/cgroup-v1/encryption_ids.rst

diff --git a/Documentation/admin-guide/cgroup-v1/encryption_ids.rst 
b/Documentation/admin-guide/cgroup-v1/encryption_ids.rst
new file mode 100644
index ..891143b4e229
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/encryption_ids.rst
@@ -0,0 +1,108 @@
+=
+Encryption IDs Controller
+=
+
+Overview
+
+There are multiple hardware memory encryption capabilities provided by the
+hardware vendors, like Secure Encrypted Virtualization (SEV) and SEV Encrypted
+State (SEV-ES) from AMD.
+
+These features are being used in encrypting virtual machines (VMs) and user
+space programs. However, only a small number of keys/IDs can be used
+simultaneously.
+
+This limited availability of these IDs requires system admin to optimize
+allocation, control, and track the usage of the resources in the cloud
+infrastructure. This resource also needs to be protected from getting exhausted
+by some malicious program and causing starvation for other programs.
+
+Encryption IDs controller provides capability to register the resource for
+controlling and tracking through the cgroups.
+
+How to Enable Controller
+
+
+- Enable Encryption controller::
+
+CONFIG_CGROUP_ENCRYPTION_IDS=y
+
+- Above options will build Encryption controller support in the kernel.
+  To mount the Encryption controller::
+
+mount -t cgroup -o encryption none /sys/fs/cgroup/encryption
+
+
+Interface Files
+===
+Each encryption ID type have their own interface files,
+encryption_id.[ID TYPE].{max, current, stat}, where "ID TYPE" can be sev and
+sev-es.
+
+  encryption_ids.[ID TYPE].stat
+A read-only flat-keyed single value file. This file exists only in the
+root cgroup.
+
+It shows the total number of encryption IDs available and currently in
+use on the platform::
+  # cat encryption.sev.stat
+  total 509
+  used 0
+
+  encryption_ids.[ID TYPE].max
+A read-write file which exists on the non-root cgroups. File is used to
+set maximum count of "[ID TYPE]" which can be used in the cgroup.
+
+Limit can be set to max by::
+  # echo max > encryption.sev.max
+
+Limit can be set by::
+  # echo 100 > encryption.sev.max
+
+This file shows the max limit of the encryption ID in the cgroup::
+  # cat encryption.sev.max
+  max
+
+OR::
+  # cat encryption.sev.max
+  100
+
+Limits can be set more than the "total" capacity value in the
+encryption_ids.[ID TYPE].stat file, however, the controller ensures
+that the usage never exceeds the "total" and the max limit.
+
+  encryption_ids.[ID TYPE].current
+A read-only single value file which exists on non-root cgroups.
+
+Shows the total number of encrypted IDs being used in the cgroup.
+
+Hierarchy
+=
+
+Encryption IDs controller supports hierarchical accounting. It supports
+following features:
+
+1. Current usage in the cgroup shows IDs used in the cgroup and its descendent 
cgroups.
+2. Current usage can never exceed the corresponding max limit set in the cgroup
+   and its ancestor's chain up to the root.
+
+Suppose the following example hierarchy::
+
+root
+/  \
+   AB
+   |
+   C
+
+1. A will show the count of IDs used in A and C.
+2. C's current IDs usage may not exceed any of the max limits set in C, A, or
+   root.
+
+Migration and ownership
+===
+
+An encryption ID is charged to the cgroup in which it is used first, and
+stays charged to that cgroup until that ID is freed. Migrating a process
+to a different cgroup do not move the charge to the destination cgroup
+where the process has moved.
+
diff --git a/Documentation/admin-guide/cgroup-v2.rst 
b/Documentation/admin-guide/cgroup-v2.rst
index 608d7c279396..7938bb7c6e1c 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -63,8 +63,11 @@ v1 is available under 
:ref:`Documentation/admin-guide/cgroup-v1/index.rst  encryption.sev.max
+
+Limit can be set by::
+  # echo 100 > encryption.sev.max
+
+This file shows the max limit of the encryption ID in the cgroup::
+

[Patch v3 1/2] cgroup: svm: Add Encryption ID controller

2020-12-09 Thread Vipin Sharma
Hardware memory encryption is available on multiple generic CPUs. For
example AMD has Secure Encrypted Virtualization (SEV) and SEV -
Encrypted State (SEV-ES).

These memory encryptions are useful in creating encrypted virtual
machines (VMs) and user space programs.

There are limited number of encryption IDs that can be used
simultaneously on a machine for encryption. This generates a need for
the system admin to track, limit, allocate resources, and optimally
schedule VMs and user workloads in the cloud infrastructure. Some
malicious programs can exhaust all of these resources on a host causing
starvation of other workloads.

Encryption ID controller allows control of these resources using
Cgroups.

Controller is enabled by CGROUP_ENCRYPTION_IDS config option.
Encryption controller provide 3 interface files for each encryption ID
type. For example, in SEV:

1. encryption_ids.sev.stat
Shown only at the root cgroup. Displays total SEV IDs available
on the platform and current usage count.
2. encrpytion_ids.sev.max
Sets the maximum usage of SEV IDs in the cgroup.
3. encryption_ids.sev.current
Current usage of SEV IDs in the cgroup and its children.

Other ID types can be easily added in the controller in the same way.

Signed-off-by: Vipin Sharma 
Reviewed-by: David Rientjes 
Reviewed-by: Dionna Glaze 
Reported-by: kernel test robot 
---
 arch/x86/kvm/svm/sev.c|  28 +-
 include/linux/cgroup_subsys.h |   4 +
 include/linux/encryption_ids_cgroup.h |  71 +
 include/linux/kvm_host.h  |   4 +
 init/Kconfig  |  14 +
 kernel/cgroup/Makefile|   1 +
 kernel/cgroup/encryption_ids.c| 430 ++
 7 files changed, 545 insertions(+), 7 deletions(-)
 create mode 100644 include/linux/encryption_ids_cgroup.h
 create mode 100644 kernel/cgroup/encryption_ids.c

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 566f4d18185b..83c23d1d568a 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "x86.h"
 #include "svm.h"
@@ -77,10 +78,14 @@ static bool __sev_recycle_asids(void)
return true;
 }
 
-static int sev_asid_new(void)
+static int sev_asid_new(struct kvm *kvm)
 {
bool retry = true;
-   int pos;
+   int pos, ret;
+
+   ret = enc_id_cg_try_charge(kvm, ENCRYPTION_ID_SEV, 1);
+   if (ret)
+   return ret;
 
mutex_lock(_bitmap_lock);
 
@@ -95,7 +100,8 @@ static int sev_asid_new(void)
goto again;
}
mutex_unlock(_bitmap_lock);
-   return -EBUSY;
+   ret = -EBUSY;
+   goto e_uncharge;
}
 
__set_bit(pos, sev_asid_bitmap);
@@ -103,6 +109,9 @@ static int sev_asid_new(void)
mutex_unlock(_bitmap_lock);
 
return pos + 1;
+e_uncharge:
+   enc_id_cg_uncharge(kvm, ENCRYPTION_ID_SEV, 1);
+   return ret;
 }
 
 static int sev_get_asid(struct kvm *kvm)
@@ -112,7 +121,7 @@ static int sev_get_asid(struct kvm *kvm)
return sev->asid;
 }
 
-static void sev_asid_free(int asid)
+static void sev_asid_free(struct kvm *kvm, int asid)
 {
struct svm_cpu_data *sd;
int cpu, pos;
@@ -128,6 +137,7 @@ static void sev_asid_free(int asid)
}
 
mutex_unlock(_bitmap_lock);
+   enc_id_cg_uncharge(kvm, ENCRYPTION_ID_SEV, 1);
 }
 
 static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
@@ -172,7 +182,7 @@ static int sev_guest_init(struct kvm *kvm, struct 
kvm_sev_cmd *argp)
if (unlikely(sev->active))
return ret;
 
-   asid = sev_asid_new();
+   asid = sev_asid_new(kvm);
if (asid < 0)
return ret;
 
@@ -187,7 +197,7 @@ static int sev_guest_init(struct kvm *kvm, struct 
kvm_sev_cmd *argp)
return 0;
 
 e_free:
-   sev_asid_free(asid);
+   sev_asid_free(kvm, asid);
return ret;
 }
 
@@ -1122,7 +1132,7 @@ void sev_vm_destroy(struct kvm *kvm)
mutex_unlock(>lock);
 
sev_unbind_asid(kvm, sev->handle);
-   sev_asid_free(sev->asid);
+   sev_asid_free(kvm, sev->asid);
 }
 
 int __init sev_hardware_setup(void)
@@ -1148,6 +1158,9 @@ int __init sev_hardware_setup(void)
if (!sev_reclaim_asid_bitmap)
return 1;
 
+   if (enc_id_cg_set_capacity(ENCRYPTION_ID_SEV, max_sev_asid))
+   return 1;
+
status = kmalloc(sizeof(*status), GFP_KERNEL);
if (!status)
return 1;
@@ -1177,6 +1190,7 @@ void sev_hardware_teardown(void)
 
bitmap_free(sev_asid_bitmap);
bitmap_free(sev_reclaim_asid_bitmap);
+   enc_id_cg_set_capacity(ENCRYPTION_ID_SEV, 0);
 
sev_flush_asids();
 }
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index acb77dcff3b4..83754f58c05e 100644

[Patch v3 0/2] cgroup: KVM: New Encryption IDs cgroup controller

2020-12-09 Thread Vipin Sharma
Hello,

This patch adds a new cgroup controller, Encryption IDs, to track and
limit the usage of encryption IDs on a host.

AMD provides Secure Encrypted Virtualization (SEV) and SEV with
Encrypted State (SEV-ES) to encrypt the guest OS's memory using limited
number of Address Space Identifiers (ASIDs).

This limited number of ASIDs creates issues like SEV ASID starvation and
unoptimized scheduling in the cloud infrastucture.

In the RFC patch v1, I provided only SEV cgroup controller but based
on the feedback and discussion it became clear that this cgroup
controller can be extended to be used by Intel's Trusted Domain
Extension (TDX) and s390's protected virtualization Secure Execution IDs
(SEID)

This patch series provides a generic Encryption IDs controller with
tracking support of the SEV ASIDs.

Changes in v3:
- Fixes a build error when CONFIG_CGROUP is disabled.

Changes in v2:
- Changed cgroup name from sev to encryption_ids.
- Replaced SEV specific names in APIs and documentations with generic
  encryption IDs.
- Providing 3 cgroup files per encryption ID type. For example in SEV,
  - encryption_ids.sev.stat (only in the root cgroup directory).
  - encryption_ids.sev.max
  - encryption_ids.sev.current

Thanks
Vipin Sharma

[1] https://lore.kernel.org/lkml/20200922004024.3699923-1-vipi...@google.com/#r
[2] https://lore.kernel.org/lkml/20201208213531.2626955-1-vipi...@google.com/

 .../admin-guide/cgroup-v1/encryption_ids.rst  | 108 +
 Documentation/admin-guide/cgroup-v2.rst   |  78 +++-
 arch/x86/kvm/svm/sev.c|  28 +-
 include/linux/cgroup_subsys.h |   4 +
 include/linux/encryption_ids_cgroup.h |  71 +++
 include/linux/kvm_host.h  |   4 +
 init/Kconfig  |  14 +
 kernel/cgroup/Makefile|   1 +
 kernel/cgroup/encryption_ids.c| 430 ++
 9 files changed, 729 insertions(+), 9 deletions(-)
 create mode 100644 Documentation/admin-guide/cgroup-v1/encryption_ids.rst
 create mode 100644 include/linux/encryption_ids_cgroup.h
 create mode 100644 kernel/cgroup/encryption_ids.c

-- 
2.29.2.576.ga3fc446d84-goog



Re: [Patch v2 0/2] cgroup: KVM: New Encryption IDs cgroup controller

2020-12-09 Thread Vipin Sharma
On Tue, Dec 08, 2020 at 01:35:29PM -0800, Vipin Sharma wrote:
> Hello,
> 
> This patch adds a new cgroup controller, Encryption IDs, to track and
> limit the usage of encryption IDs on a host.
> 
> AMD provides Secure Encrypted Virtualization (SEV) and SEV with
> Encrypted State (SEV-ES) to encrypt the guest OS's memory using limited
> number of Address Space Identifiers (ASIDs).
> 
> This limited number of ASIDs creates issues like SEV ASID starvation and
> unoptimized scheduling in the cloud infrastucture.
> 
> In the RFC patch v1, I provided only SEV cgroup controller but based
> on the feedback and discussion it became clear that this cgroup
> controller can be extended to be used by Intel's Trusted Domain
> Extension (TDX) and s390's protected virtualization Secure Execution IDs
> (SEID)
> 
> This patch series provides a generic Encryption IDs controller with
> tracking support of the SEV ASIDs.
> 
> Changes in v2:
> - Changed cgroup name from sev to encryption_ids.
> - Replaced SEV specific names in APIs and documentations with generic
>   encryption IDs.
> - Providing 3 cgroup files per encryption ID type. For example in SEV,
>   - encryption_ids.sev.stat (only in the root cgroup directory).
>   - encryption_ids.sev.max
>   - encryption_ids.sev.current
> 
> Thanks
> Vipin Sharma
> 
> [1] 
> https://lore.kernel.org/lkml/20200922004024.3699923-1-vipi...@google.com/#r
> 
> Vipin Sharma (2):
>   cgroup: svm: Add Encryption ID controller
>   cgroup: svm: Encryption IDs cgroup documentation.
> 
>  .../admin-guide/cgroup-v1/encryption_ids.rst  | 108 +
>  Documentation/admin-guide/cgroup-v2.rst   |  78 +++-
>  arch/x86/kvm/svm/sev.c|  28 +-
>  include/linux/cgroup_subsys.h |   4 +
>  include/linux/encryption_ids_cgroup.h |  70 +++
>  include/linux/kvm_host.h  |   4 +
>  init/Kconfig  |  14 +
>  kernel/cgroup/Makefile|   1 +
>  kernel/cgroup/encryption_ids.c| 430 ++
>  9 files changed, 728 insertions(+), 9 deletions(-)
>  create mode 100644 Documentation/admin-guide/cgroup-v1/encryption_ids.rst
>  create mode 100644 include/linux/encryption_ids_cgroup.h
>  create mode 100644 kernel/cgroup/encryption_ids.c
> 
> --
> 2.29.2.576.ga3fc446d84-goog
> 

Please ignore this version of patch series, I will send out v3 soon. v2
has build failure when CONFIG_CGROUP is disabled.


[Patch v2 2/2] cgroup: SVM: Encryption IDs cgroup documentation.

2020-12-08 Thread Vipin Sharma
Documentation for both cgroup versions, v1 and v2, of Encryption IDs
controller. This new controller is used to track and limit usage of
hardware memory encryption capabilities on the CPUs.

Signed-off-by: Vipin Sharma 
Reviewed-by: David Rientjes 
Reviewed-by: Dionna Glaze 
---
 .../admin-guide/cgroup-v1/encryption_ids.rst  | 108 ++
 Documentation/admin-guide/cgroup-v2.rst   |  78 -
 2 files changed, 184 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/admin-guide/cgroup-v1/encryption_ids.rst

diff --git a/Documentation/admin-guide/cgroup-v1/encryption_ids.rst 
b/Documentation/admin-guide/cgroup-v1/encryption_ids.rst
new file mode 100644
index ..891143b4e229
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/encryption_ids.rst
@@ -0,0 +1,108 @@
+=
+Encryption IDs Controller
+=
+
+Overview
+
+There are multiple hardware memory encryption capabilities provided by the
+hardware vendors, like Secure Encrypted Virtualization (SEV) and SEV Encrypted
+State (SEV-ES) from AMD.
+
+These features are being used in encrypting virtual machines (VMs) and user
+space programs. However, only a small number of keys/IDs can be used
+simultaneously.
+
+This limited availability of these IDs requires system admin to optimize
+allocation, control, and track the usage of the resources in the cloud
+infrastructure. This resource also needs to be protected from getting exhausted
+by some malicious program and causing starvation for other programs.
+
+Encryption IDs controller provides capability to register the resource for
+controlling and tracking through the cgroups.
+
+How to Enable Controller
+
+
+- Enable Encryption controller::
+
+CONFIG_CGROUP_ENCRYPTION_IDS=y
+
+- Above options will build Encryption controller support in the kernel.
+  To mount the Encryption controller::
+
+mount -t cgroup -o encryption none /sys/fs/cgroup/encryption
+
+
+Interface Files
+===
+Each encryption ID type have their own interface files,
+encryption_id.[ID TYPE].{max, current, stat}, where "ID TYPE" can be sev and
+sev-es.
+
+  encryption_ids.[ID TYPE].stat
+A read-only flat-keyed single value file. This file exists only in the
+root cgroup.
+
+It shows the total number of encryption IDs available and currently in
+use on the platform::
+  # cat encryption.sev.stat
+  total 509
+  used 0
+
+  encryption_ids.[ID TYPE].max
+A read-write file which exists on the non-root cgroups. File is used to
+set maximum count of "[ID TYPE]" which can be used in the cgroup.
+
+Limit can be set to max by::
+  # echo max > encryption.sev.max
+
+Limit can be set by::
+  # echo 100 > encryption.sev.max
+
+This file shows the max limit of the encryption ID in the cgroup::
+  # cat encryption.sev.max
+  max
+
+OR::
+  # cat encryption.sev.max
+  100
+
+Limits can be set more than the "total" capacity value in the
+encryption_ids.[ID TYPE].stat file, however, the controller ensures
+that the usage never exceeds the "total" and the max limit.
+
+  encryption_ids.[ID TYPE].current
+A read-only single value file which exists on non-root cgroups.
+
+Shows the total number of encrypted IDs being used in the cgroup.
+
+Hierarchy
+=
+
+Encryption IDs controller supports hierarchical accounting. It supports
+following features:
+
+1. Current usage in the cgroup shows IDs used in the cgroup and its descendent 
cgroups.
+2. Current usage can never exceed the corresponding max limit set in the cgroup
+   and its ancestor's chain up to the root.
+
+Suppose the following example hierarchy::
+
+root
+/  \
+   AB
+   |
+   C
+
+1. A will show the count of IDs used in A and C.
+2. C's current IDs usage may not exceed any of the max limits set in C, A, or
+   root.
+
+Migration and ownership
+===
+
+An encryption ID is charged to the cgroup in which it is used first, and
+stays charged to that cgroup until that ID is freed. Migrating a process
+to a different cgroup do not move the charge to the destination cgroup
+where the process has moved.
+
diff --git a/Documentation/admin-guide/cgroup-v2.rst 
b/Documentation/admin-guide/cgroup-v2.rst
index 608d7c279396..7938bb7c6e1c 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -63,8 +63,11 @@ v1 is available under 
:ref:`Documentation/admin-guide/cgroup-v1/index.rst  encryption.sev.max
+
+Limit can be set by::
+  # echo 100 > encryption.sev.max
+
+This file shows the max limit of the encryption ID in the cgroup::
+

[Patch v2 1/2] cgroup: SVM: Add Encryption ID controller

2020-12-08 Thread Vipin Sharma
Hardware memory encryption is available on multiple generic CPUs. For
example AMD has Secure Encrypted Virtualization (SEV) and SEV -
Encrypted State (SEV-ES).

These memory encryptions are useful in creating encrypted virtual
machines (VMs) and user space programs.

There are limited number of encryption IDs that can be used
simultaneously on a machine for encryption. This generates a need for
the system admin to track, limit, allocate resources, and optimally
schedule VMs and user workloads in the cloud infrastructure. Some
malicious programs can exhaust all of these resources on a host causing
starvation of other workloads.

Encryption ID controller allows control of these resources using
Cgroups.

Controller is enabled by CGROUP_ENCRYPTION_IDS config option.
Encryption controller provide 3 interface files for each encryption ID
type. For example, in SEV:

1. encryption_ids.sev.stat
Shown only at the root cgroup. Displays total SEV IDs available
on the platform and current usage count.
2. encrpytion_ids.sev.max
Sets the maximum usage of SEV IDs in the cgroup.
3. encryption_ids.sev.current
Current usage of SEV IDs in the cgroup and its children.

Other ID types can be easily added in the controller in the same way.

Signed-off-by: Vipin Sharma 
Reviewed-by: David Rientjes 
Reviewed-by: Dionna Glaze 
---
 arch/x86/kvm/svm/sev.c|  28 +-
 include/linux/cgroup_subsys.h |   4 +
 include/linux/encryption_ids_cgroup.h |  70 +
 include/linux/kvm_host.h  |   4 +
 init/Kconfig  |  14 +
 kernel/cgroup/Makefile|   1 +
 kernel/cgroup/encryption_ids.c| 430 ++
 7 files changed, 544 insertions(+), 7 deletions(-)
 create mode 100644 include/linux/encryption_ids_cgroup.h
 create mode 100644 kernel/cgroup/encryption_ids.c

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 566f4d18185b..83c23d1d568a 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "x86.h"
 #include "svm.h"
@@ -77,10 +78,14 @@ static bool __sev_recycle_asids(void)
return true;
 }
 
-static int sev_asid_new(void)
+static int sev_asid_new(struct kvm *kvm)
 {
bool retry = true;
-   int pos;
+   int pos, ret;
+
+   ret = enc_id_cg_try_charge(kvm, ENCRYPTION_ID_SEV, 1);
+   if (ret)
+   return ret;
 
mutex_lock(_bitmap_lock);
 
@@ -95,7 +100,8 @@ static int sev_asid_new(void)
goto again;
}
mutex_unlock(_bitmap_lock);
-   return -EBUSY;
+   ret = -EBUSY;
+   goto e_uncharge;
}
 
__set_bit(pos, sev_asid_bitmap);
@@ -103,6 +109,9 @@ static int sev_asid_new(void)
mutex_unlock(_bitmap_lock);
 
return pos + 1;
+e_uncharge:
+   enc_id_cg_uncharge(kvm, ENCRYPTION_ID_SEV, 1);
+   return ret;
 }
 
 static int sev_get_asid(struct kvm *kvm)
@@ -112,7 +121,7 @@ static int sev_get_asid(struct kvm *kvm)
return sev->asid;
 }
 
-static void sev_asid_free(int asid)
+static void sev_asid_free(struct kvm *kvm, int asid)
 {
struct svm_cpu_data *sd;
int cpu, pos;
@@ -128,6 +137,7 @@ static void sev_asid_free(int asid)
}
 
mutex_unlock(_bitmap_lock);
+   enc_id_cg_uncharge(kvm, ENCRYPTION_ID_SEV, 1);
 }
 
 static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
@@ -172,7 +182,7 @@ static int sev_guest_init(struct kvm *kvm, struct 
kvm_sev_cmd *argp)
if (unlikely(sev->active))
return ret;
 
-   asid = sev_asid_new();
+   asid = sev_asid_new(kvm);
if (asid < 0)
return ret;
 
@@ -187,7 +197,7 @@ static int sev_guest_init(struct kvm *kvm, struct 
kvm_sev_cmd *argp)
return 0;
 
 e_free:
-   sev_asid_free(asid);
+   sev_asid_free(kvm, asid);
return ret;
 }
 
@@ -1122,7 +1132,7 @@ void sev_vm_destroy(struct kvm *kvm)
mutex_unlock(>lock);
 
sev_unbind_asid(kvm, sev->handle);
-   sev_asid_free(sev->asid);
+   sev_asid_free(kvm, sev->asid);
 }
 
 int __init sev_hardware_setup(void)
@@ -1148,6 +1158,9 @@ int __init sev_hardware_setup(void)
if (!sev_reclaim_asid_bitmap)
return 1;
 
+   if (enc_id_cg_set_capacity(ENCRYPTION_ID_SEV, max_sev_asid))
+   return 1;
+
status = kmalloc(sizeof(*status), GFP_KERNEL);
if (!status)
return 1;
@@ -1177,6 +1190,7 @@ void sev_hardware_teardown(void)
 
bitmap_free(sev_asid_bitmap);
bitmap_free(sev_reclaim_asid_bitmap);
+   enc_id_cg_set_capacity(ENCRYPTION_ID_SEV, 0);
 
sev_flush_asids();
 }
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index acb77dcff3b4..83754f58c05e 100644
--- a/include/linux/cgroup_subsys.h
+++

[Patch v2 0/2] cgroup: KVM: New Encryption IDs cgroup controller

2020-12-08 Thread Vipin Sharma
Hello,

This patch adds a new cgroup controller, Encryption IDs, to track and
limit the usage of encryption IDs on a host.

AMD provides Secure Encrypted Virtualization (SEV) and SEV with
Encrypted State (SEV-ES) to encrypt the guest OS's memory using limited
number of Address Space Identifiers (ASIDs).

This limited number of ASIDs creates issues like SEV ASID starvation and
unoptimized scheduling in the cloud infrastucture.

In the RFC patch v1, I provided only SEV cgroup controller but based
on the feedback and discussion it became clear that this cgroup
controller can be extended to be used by Intel's Trusted Domain
Extension (TDX) and s390's protected virtualization Secure Execution IDs
(SEID)

This patch series provides a generic Encryption IDs controller with
tracking support of the SEV ASIDs.

Changes in v2:
- Changed cgroup name from sev to encryption_ids.
- Replaced SEV specific names in APIs and documentations with generic
  encryption IDs.
- Providing 3 cgroup files per encryption ID type. For example in SEV,
  - encryption_ids.sev.stat (only in the root cgroup directory).
  - encryption_ids.sev.max
  - encryption_ids.sev.current

Thanks
Vipin Sharma

[1] https://lore.kernel.org/lkml/20200922004024.3699923-1-vipi...@google.com/#r

Vipin Sharma (2):
  cgroup: svm: Add Encryption ID controller
  cgroup: svm: Encryption IDs cgroup documentation.

 .../admin-guide/cgroup-v1/encryption_ids.rst  | 108 +
 Documentation/admin-guide/cgroup-v2.rst   |  78 +++-
 arch/x86/kvm/svm/sev.c|  28 +-
 include/linux/cgroup_subsys.h |   4 +
 include/linux/encryption_ids_cgroup.h |  70 +++
 include/linux/kvm_host.h  |   4 +
 init/Kconfig  |  14 +
 kernel/cgroup/Makefile|   1 +
 kernel/cgroup/encryption_ids.c| 430 ++
 9 files changed, 728 insertions(+), 9 deletions(-)
 create mode 100644 Documentation/admin-guide/cgroup-v1/encryption_ids.rst
 create mode 100644 include/linux/encryption_ids_cgroup.h
 create mode 100644 kernel/cgroup/encryption_ids.c

--
2.29.2.576.ga3fc446d84-goog



Re: [RFC Patch 0/2] KVM: SVM: Cgroup support for SVM SEV ASIDs

2020-11-24 Thread Vipin Sharma
On Tue, Nov 24, 2020 at 09:27:25PM +, Sean Christopherson wrote:
> On Tue, Nov 24, 2020, Vipin Sharma wrote:
> > On Tue, Nov 24, 2020 at 12:18:45PM -0800, David Rientjes wrote:
> > > On Tue, 24 Nov 2020, Vipin Sharma wrote:
> > > 
> > > > > > Looping Janosch and Christian back into the thread. 
> > > > > >   
> > > > > > 
> > > > > >   
> > > > > > I interpret this suggestion as  
> > > > > >   
> > > > > > encryption.{sev,sev_es,keyids}.{max,current,events} for AMD and 
> > > > > > Intel 
> > > > > 
> > > > > I think it makes sense to use encryption_ids instead of simply 
> > > > > encryption, that
> > > > > way it's clear the cgroup is accounting ids as opposed to restricting 
> > > > > what
> > > > > techs can be used on yes/no basis.
> > > > > 
> > > 
> > > Agreed.
> > > 
> > > > > > offerings, which was my thought on this as well.
> > > > > >   
> > > > > > 
> > > > > >   
> > > > > > Certainly the kernel could provide a single interface for all of 
> > > > > > these and
> > > > > > key value pairs depending on the underlying encryption technology 
> > > > > > but it  
> > > > > > seems to only introduce additional complexity in the kernel in 
> > > > > > string 
> > > > > > parsing that can otherwise be avoided.  I think we all agree that a 
> > > > > > single
> > > > > > interface for all encryption keys or one-value-per-file could be 
> > > > > > done in  
> > > > > > the kernel and handled by any userspace agent that is configuring 
> > > > > > these   
> > > > > > values. 
> > > > > >   
> > > > > > 
> > > > > >   
> > > > > > I think Vipin is adding a root level file that describes how many 
> > > > > > keys we 
> > > > > > have available on the platform for each technology.  So I think 
> > > > > > this comes
> > > > > > down to, for example, a single encryption.max file vs   
> > > > > >   
> > > > > > encryption.{sev,sev_es,keyid}.max.  SEV and SEV-ES ASIDs are 
> > > > > > provisioned  
> > > > > 
> > > > > Are you suggesting that the cgroup omit "current" and "events"?  I 
> > > > > agree there's
> > > > > no need to enumerate platform total, but not knowing how many of the 
> > > > > allowed IDs
> > > > > have been allocated seems problematic.
> > > > > 
> > > > 
> > > > We will be showing encryption_ids.{sev,sev_es}.{max,current}
> > > > I am inclined to not provide "events" as I am not using it, let me know
> > > > if this file is required, I can provide it then.
> 
> I've no objection to omitting current until it's needed.
> 
> > > > I will provide an encryption_ids.{sev,sev_es}.stat file, which shows
> > > > total available ids on the platform. This one will be useful for
> > > > scheduling jobs in the cloud infrastructure based on total supported
> > > > capacity.
> > > > 
> > > 
> > > Makes sense.  I assume the stat file is only at the cgroup root level 
> > > since it would otherwise be duplicating its contents in every cgroup 
> > > under 
> > > it.  Probably not very helpful for child cgroup to see stat = 509 ASIDs 
> > > but max = 100 :)
> > 
> > Yes, only at root.
> 
> Is a root level stat file needed?  Can't the infrastructure do .max - .current
> on the root cgroup to calculate the number of available ids in the system?

For an efficient scheduling of workloads in the cloud infrastructure, a
scheduler needs to know the total capacity supported and the current
usage of the host to get the overall picture. T

Re: [RFC Patch 0/2] KVM: SVM: Cgroup support for SVM SEV ASIDs

2020-11-24 Thread Vipin Sharma
On Tue, Nov 24, 2020 at 12:18:45PM -0800, David Rientjes wrote:
> On Tue, 24 Nov 2020, Vipin Sharma wrote:
> 
> > > > Looping Janosch and Christian back into the thread. 
> > > >   
> > > > 
> > > >   
> > > > I interpret this suggestion as  
> > > >   
> > > > encryption.{sev,sev_es,keyids}.{max,current,events} for AMD and Intel   
> > > >   
> > > 
> > > I think it makes sense to use encryption_ids instead of simply 
> > > encryption, that
> > > way it's clear the cgroup is accounting ids as opposed to restricting what
> > > techs can be used on yes/no basis.
> > > 
> 
> Agreed.
> 
> > > > offerings, which was my thought on this as well.
> > > >   
> > > > 
> > > >   
> > > > Certainly the kernel could provide a single interface for all of these 
> > > > and
> > > > key value pairs depending on the underlying encryption technology but 
> > > > it  
> > > > seems to only introduce additional complexity in the kernel in string   
> > > >   
> > > > parsing that can otherwise be avoided.  I think we all agree that a 
> > > > single
> > > > interface for all encryption keys or one-value-per-file could be done 
> > > > in  
> > > > the kernel and handled by any userspace agent that is configuring these 
> > > >   
> > > > values. 
> > > >   
> > > > 
> > > >   
> > > > I think Vipin is adding a root level file that describes how many keys 
> > > > we 
> > > > have available on the platform for each technology.  So I think this 
> > > > comes
> > > > down to, for example, a single encryption.max file vs   
> > > >   
> > > > encryption.{sev,sev_es,keyid}.max.  SEV and SEV-ES ASIDs are 
> > > > provisioned  
> > > 
> > > Are you suggesting that the cgroup omit "current" and "events"?  I agree 
> > > there's
> > > no need to enumerate platform total, but not knowing how many of the 
> > > allowed IDs
> > > have been allocated seems problematic.
> > > 
> > 
> > We will be showing encryption_ids.{sev,sev_es}.{max,current}
> > I am inclined to not provide "events" as I am not using it, let me know
> > if this file is required, I can provide it then.
> > 
> > I will provide an encryption_ids.{sev,sev_es}.stat file, which shows
> > total available ids on the platform. This one will be useful for
> > scheduling jobs in the cloud infrastructure based on total supported
> > capacity.
> > 
> 
> Makes sense.  I assume the stat file is only at the cgroup root level 
> since it would otherwise be duplicating its contents in every cgroup under 
> it.  Probably not very helpful for child cgroup to see stat = 509 ASIDs 
> but max = 100 :)

Yes, only at root.


Re: [RFC Patch 0/2] KVM: SVM: Cgroup support for SVM SEV ASIDs

2020-11-24 Thread Vipin Sharma
On Tue, Nov 24, 2020 at 07:16:29PM +, Sean Christopherson wrote:
> On Fri, Nov 13, 2020, David Rientjes wrote:   
>   
> > 
> >   
> > On Mon, 2 Nov 2020, Sean Christopherson wrote:  
> >   
> > 
> >   
> > > On Fri, Oct 02, 2020 at 01:48:10PM -0700, Vipin Sharma wrote: 
> > >   
> > > > On Fri, Sep 25, 2020 at 03:22:20PM -0700, Vipin Sharma wrote:   
> > > >   
> > > > > I agree with you that the abstract name is better than the concrete   
> > > > >   
> > > > > name, I also feel that we must provide HW extensions. Here is one 
> > > > >   
> > > > > approach: 
> > > > >   
> > > > >   
> > > > >   
> > > > > Cgroup name: cpu_encryption, encryption_slots, or memcrypt (open to   
> > > > >   
> > > > > suggestions)  
> > > > >   
> > > > >   
> > > > >   
> > > > > Control files: slots.{max, current, events}   
> > > > >   
> > >   
> > >   
> > > I don't particularly like the "slots" name, mostly because it could be 
> > > confused
> > > with KVM's memslots.  Maybe encryption_ids.ids.{max, current, events}?  I 
> > > don't
> > > love those names either, but "encryption" and "IDs" are the two obvious   
> > >   
> > > commonalities betwee TDX's encryption key IDs and SEV's encryption 
> > > address  
> > > space IDs.
> > >   
> > >   
> > >   
> > 
> >   
> > Looping Janosch and Christian back into the thread. 
> >   
> > 
> >   
> > I interpret this suggestion as  
> >   
> > encryption.{sev,sev_es,keyids}.{max,current,events} for AMD and Intel   
> >   
> 
> I think it makes sense to use encryption_ids instead of simply encryption, 
> that
> way it's clear the cgroup is accounting ids as opposed to restricting what
> techs can be used on yes/no basis.
> 
> > offerings, which was my thought on this as well.
> >   
> > 
> >   
> > Certainly the kernel could provide a single interface for all of these and  
> >   
> > key value pairs depending on the underlying encryption technology but it
> >   
> > seems to only introduce additional complexity in the kernel in string   
> >   
> > parsing that can otherwise be avoided.  I think we all agree that a single  
> >   
> > interface for all encryption keys or one-value-per-file could be done in
> >   
> > the kernel and handled by any userspace agent that is configuring these 
> >   
> > values. 
> >   
> > 
> >   
> > I think Vipin is adding a root level file that describes how many keys we   
> >   
> > have available on the platform for each technology.  So I think this comes  
> >   
> > down to, for example, a single encryption.max file vs   
> >   
> > encryption.{sev,sev_es,keyid}.max.  SEV and SEV-ES ASIDs are provisioned
> >   
> 
> Are you suggesting that the cgroup omit "current" and "events"?  I agree 
> there's
> no need to enumerate platform total, but not knowing how many of the allowed 
> IDs
> have been allocated seems problematic.
> 

We will be showing encryption_ids.{sev,sev_es}.{max,current}
I am inclined t

Re: [RFC Patch 0/2] KVM: SVM: Cgroup support for SVM SEV ASIDs

2020-10-02 Thread Vipin Sharma
On Fri, Sep 25, 2020 at 03:22:20PM -0700, Vipin Sharma wrote:
> On Thu, Sep 24, 2020 at 02:55:18PM -0500, Tom Lendacky wrote:
> > On 9/24/20 2:21 PM, Sean Christopherson wrote:
> > > On Tue, Sep 22, 2020 at 02:14:04PM -0700, Vipin Sharma wrote:
> > > > On Mon, Sep 21, 2020 at 06:48:38PM -0700, Sean Christopherson wrote:
> > > > > On Mon, Sep 21, 2020 at 05:40:22PM -0700, Vipin Sharma wrote:
> > > > > > Hello,
> > > > > > 
> > > > > > This patch series adds a new SEV controller for tracking and 
> > > > > > limiting
> > > > > > the usage of SEV ASIDs on the AMD SVM platform.
> > > > > > 
> > > > > > SEV ASIDs are used in creating encrypted VM and lightweight 
> > > > > > sandboxes
> > > > > > but this resource is in very limited quantity on a host.
> > > > > > 
> > > > > > This limited quantity creates issues like SEV ASID starvation and
> > > > > > unoptimized scheduling in the cloud infrastructure.
> > > > > > 
> > > > > > SEV controller provides SEV ASID tracking and resource control
> > > > > > mechanisms.
> > > > > 
> > > > > This should be genericized to not be SEV specific.  TDX has a similar
> > > > > scarcity issue in the form of key IDs, which IIUC are analogous to 
> > > > > SEV ASIDs
> > > > > (gave myself a quick crash course on SEV ASIDs).  Functionally, I 
> > > > > doubt it
> > > > > would change anything, I think it'd just be a bunch of renaming.  The 
> > > > > hardest
> > > > > part would probably be figuring out a name :-).
> > > > > 
> > > > > Another idea would be to go even more generic and implement a KVM 
> > > > > cgroup
> > > > > that accounts the number of VMs of a particular type, e.g. legacy, 
> > > > > SEV,
> > > > > SEV-ES?, and TDX.  That has potential future problems though as it 
> > > > > falls
> > > > > apart if hardware every supports 1:MANY VMs:KEYS, or if there is a 
> > > > > need to
> > > > > account keys outside of KVM, e.g. if MKTME for non-KVM cases ever 
> > > > > sees the
> > > > > light of day.
> > > > 
> > > > I read about the TDX and its use of the KeyID for encrypting VMs. TDX
> > > > has two kinds of KeyIDs private and shared.
> > > 
> > > To clarify, "shared" KeyIDs are simply legacy MKTME KeyIDs.  This is 
> > > relevant
> > > because those KeyIDs can be used without TDX or KVM in the picture.
> > > 
> > > > On AMD platform there are two types of ASIDs for encryption.
> > > > 1. SEV ASID - Normal runtime guest memory encryption.
> > > > 2. SEV-ES ASID - Extends SEV ASID by adding register state encryption 
> > > > with
> > > >  integrity.
> > > > 
> > > > Both types of ASIDs have their own maximum value which is provisioned in
> > > > the firmware
> > > 
> > > Ugh, I missed that detail in the SEV-ES RFC.  Does SNP add another ASID 
> > > type,
> > > or does it reuse SEV-ES ASIDs?  If it does add another type, is that trend
> > > expected to continue, i.e. will SEV end up with SEV, SEV-ES, SEV-ES-SNP,
> > > SEV-ES-SNP-X, SEV-ES-SNP-X-Y, etc...?
> > 
> > SEV-SNP and SEV-ES share the same ASID range.
> > 
> > Thanks,
> > Tom
> > 
> > > 
> > > > So, we are talking about 4 different types of resources:
> > > > 1. AMD SEV ASID (implemented in this patch as sev.* files in SEV cgroup)
> > > > 2. AMD SEV-ES ASID (in future, adding files like sev_es.*)
> > > > 3. Intel TDX private KeyID
> > > > 4. Intel TDX shared KeyID
> > > > 
> > > > TDX private KeyID is similar to SEV and SEV-ES ASID. I think coming up
> > > > with the same name which can be used by both platforms will not be easy,
> > > > and extensible with the future enhancements. This will get even more
> > > > difficult if Arm also comes up with something similar but with different
> > > > nuances.
> > > 
> > > Honest question, what's easier for userspace/orchestration layers?  
> > > Having an
> > > abstract but common name, or conrete but different names?  My gut 
> > > reaction is
> > > to provide

Re: [RFC Patch 0/2] KVM: SVM: Cgroup support for SVM SEV ASIDs

2020-09-25 Thread Vipin Sharma
On Thu, Sep 24, 2020 at 02:55:18PM -0500, Tom Lendacky wrote:
> On 9/24/20 2:21 PM, Sean Christopherson wrote:
> > On Tue, Sep 22, 2020 at 02:14:04PM -0700, Vipin Sharma wrote:
> > > On Mon, Sep 21, 2020 at 06:48:38PM -0700, Sean Christopherson wrote:
> > > > On Mon, Sep 21, 2020 at 05:40:22PM -0700, Vipin Sharma wrote:
> > > > > Hello,
> > > > > 
> > > > > This patch series adds a new SEV controller for tracking and limiting
> > > > > the usage of SEV ASIDs on the AMD SVM platform.
> > > > > 
> > > > > SEV ASIDs are used in creating encrypted VM and lightweight sandboxes
> > > > > but this resource is in very limited quantity on a host.
> > > > > 
> > > > > This limited quantity creates issues like SEV ASID starvation and
> > > > > unoptimized scheduling in the cloud infrastructure.
> > > > > 
> > > > > SEV controller provides SEV ASID tracking and resource control
> > > > > mechanisms.
> > > > 
> > > > This should be genericized to not be SEV specific.  TDX has a similar
> > > > scarcity issue in the form of key IDs, which IIUC are analogous to SEV 
> > > > ASIDs
> > > > (gave myself a quick crash course on SEV ASIDs).  Functionally, I doubt 
> > > > it
> > > > would change anything, I think it'd just be a bunch of renaming.  The 
> > > > hardest
> > > > part would probably be figuring out a name :-).
> > > > 
> > > > Another idea would be to go even more generic and implement a KVM cgroup
> > > > that accounts the number of VMs of a particular type, e.g. legacy, SEV,
> > > > SEV-ES?, and TDX.  That has potential future problems though as it falls
> > > > apart if hardware every supports 1:MANY VMs:KEYS, or if there is a need 
> > > > to
> > > > account keys outside of KVM, e.g. if MKTME for non-KVM cases ever sees 
> > > > the
> > > > light of day.
> > > 
> > > I read about the TDX and its use of the KeyID for encrypting VMs. TDX
> > > has two kinds of KeyIDs private and shared.
> > 
> > To clarify, "shared" KeyIDs are simply legacy MKTME KeyIDs.  This is 
> > relevant
> > because those KeyIDs can be used without TDX or KVM in the picture.
> > 
> > > On AMD platform there are two types of ASIDs for encryption.
> > > 1. SEV ASID - Normal runtime guest memory encryption.
> > > 2. SEV-ES ASID - Extends SEV ASID by adding register state encryption with
> > >integrity.
> > > 
> > > Both types of ASIDs have their own maximum value which is provisioned in
> > > the firmware
> > 
> > Ugh, I missed that detail in the SEV-ES RFC.  Does SNP add another ASID 
> > type,
> > or does it reuse SEV-ES ASIDs?  If it does add another type, is that trend
> > expected to continue, i.e. will SEV end up with SEV, SEV-ES, SEV-ES-SNP,
> > SEV-ES-SNP-X, SEV-ES-SNP-X-Y, etc...?
> 
> SEV-SNP and SEV-ES share the same ASID range.
> 
> Thanks,
> Tom
> 
> > 
> > > So, we are talking about 4 different types of resources:
> > > 1. AMD SEV ASID (implemented in this patch as sev.* files in SEV cgroup)
> > > 2. AMD SEV-ES ASID (in future, adding files like sev_es.*)
> > > 3. Intel TDX private KeyID
> > > 4. Intel TDX shared KeyID
> > > 
> > > TDX private KeyID is similar to SEV and SEV-ES ASID. I think coming up
> > > with the same name which can be used by both platforms will not be easy,
> > > and extensible with the future enhancements. This will get even more
> > > difficult if Arm also comes up with something similar but with different
> > > nuances.
> > 
> > Honest question, what's easier for userspace/orchestration layers?  Having 
> > an
> > abstract but common name, or conrete but different names?  My gut reaction 
> > is
> > to provide a common interface, but I can see how that could do more harm 
> > than
> > good, e.g. some amount of hardware capabilitiy discovery is possible with
> > concrete names.  And I'm guessing there's already a fair amount of vendor
> > specific knowledge bleeding into userspace for these features...

I agree with you that the abstract name is better than the concrete
name, I also feel that we must provide HW extensions. Here is one
approach:

Cgroup name: cpu_encryption, encryption_slots, or memcrypt (open to
suggestions)

Control files: slots.{max, current, events}

Contents of the slot.max:
default max
 

Re: [RFC Patch 0/2] KVM: SVM: Cgroup support for SVM SEV ASIDs

2020-09-22 Thread Vipin Sharma
On Mon, Sep 21, 2020 at 06:48:38PM -0700, Sean Christopherson wrote:
> On Mon, Sep 21, 2020 at 05:40:22PM -0700, Vipin Sharma wrote:
> > Hello,
> > 
> > This patch series adds a new SEV controller for tracking and limiting
> > the usage of SEV ASIDs on the AMD SVM platform.
> > 
> > SEV ASIDs are used in creating encrypted VM and lightweight sandboxes
> > but this resource is in very limited quantity on a host.
> > 
> > This limited quantity creates issues like SEV ASID starvation and
> > unoptimized scheduling in the cloud infrastructure.
> > 
> > SEV controller provides SEV ASID tracking and resource control
> > mechanisms.
> 
> This should be genericized to not be SEV specific.  TDX has a similar
> scarcity issue in the form of key IDs, which IIUC are analogous to SEV ASIDs
> (gave myself a quick crash course on SEV ASIDs).  Functionally, I doubt it
> would change anything, I think it'd just be a bunch of renaming.  The hardest
> part would probably be figuring out a name :-).
> 
> Another idea would be to go even more generic and implement a KVM cgroup
> that accounts the number of VMs of a particular type, e.g. legacy, SEV,
> SEV-ES?, and TDX.  That has potential future problems though as it falls
> apart if hardware every supports 1:MANY VMs:KEYS, or if there is a need to
> account keys outside of KVM, e.g. if MKTME for non-KVM cases ever sees the
> light of day.

I read about the TDX and its use of the KeyID for encrypting VMs. TDX
has two kinds of KeyIDs private and shared.

On AMD platform there are two types of ASIDs for encryption.
1. SEV ASID - Normal runtime guest memory encryption.
2. SEV-ES ASID - Extends SEV ASID by adding register state encryption with
 integrity.

Both types of ASIDs have their own maximum value which is provisioned in
the firmware

So, we are talking about 4 different types of resources:
1. AMD SEV ASID (implemented in this patch as sev.* files in SEV cgroup)
2. AMD SEV-ES ASID (in future, adding files like sev_es.*)
3. Intel TDX private KeyID
4. Intel TDX shared KeyID

TDX private KeyID is similar to SEV and SEV-ES ASID. I think coming up
with the same name which can be used by both platforms will not be easy,
and extensible with the future enhancements. This will get even more
difficult if Arm also comes up with something similar but with different
nuances.

I like the idea of the KVM cgroup and when it is mounted it will have
different files based on the hardware platform.

1. KVM cgroup on AMD will have:
sev.max & sev.current.
sev_es.max & sev_es.current.

2. KVM cgroup mounted on Intel:
tdx_private_keys.max
tdx_shared_keys.max

The KVM cgroup can be used to have control files which are generic (no
use case in my mind right now) and hardware platform specific files
also.


Re: [RFC Patch 1/2] KVM: SVM: Create SEV cgroup controller.

2020-09-22 Thread Vipin Sharma
On Mon, Sep 21, 2020 at 06:22:28PM -0700, Sean Christopherson wrote:
> On Mon, Sep 21, 2020 at 06:04:04PM -0700, Randy Dunlap wrote:
> > Hi,
> > 
> > On 9/21/20 5:40 PM, Vipin Sharma wrote:
> > > diff --git a/init/Kconfig b/init/Kconfig
> > > index d6a0b31b13dc..1a57c362b803 100644
> > > --- a/init/Kconfig
> > > +++ b/init/Kconfig
> > > @@ -1101,6 +1101,20 @@ config CGROUP_BPF
> > > BPF_CGROUP_INET_INGRESS will be executed on the ingress path of
> > > inet sockets.
> > >  
> > > +config CGROUP_SEV
> > > + bool "SEV ASID controller"
> > > + depends on KVM_AMD_SEV
> > > + default n
> > > + help
> > > +   Provides a controller for AMD SEV ASIDs. This controller limits and
> > > +   shows the total usage of SEV ASIDs used in encrypted VMs on AMD
> > > +   processors. Whenever a new encrypted VM is created using SEV on an
> > > +   AMD processor, this controller will check the current limit in the
> > > +   cgroup to which the task belongs and will deny the SEV ASID if the
> > > +   cgroup has already reached its limit.
> > > +
> > > +   Say N if unsure.
> > 
> > Something here (either in the bool prompt string or the help text) should
> > let a reader know w.t.h. SEV means.
> > 
> > Without having to look in other places...
> 
> ASIDs too.  I'd also love to see more info in the docs and/or cover letter
> to explain why ASID management on SEV requires a cgroup.  I know what an
> ASID is, and have a decent idea of how KVM manages ASIDs for legacy VMs, but
> I know nothing about why ASIDs are limited for SEV and not legacy VMs.

Thanks for the feedback, I will add more details in the Kconfig and the
documentation about SEV and ASID.


[RFC Patch 1/2] KVM: SVM: Create SEV cgroup controller.

2020-09-21 Thread Vipin Sharma
Create SEV cgroup controller for SEV ASIDs on the AMD platform.

SEV ASIDs are used to encrypt virtual machines memory and isolate the
guests from the hypervisor. However, number of SEV ASIDs are limited on
a platform. This leads to the resource constraints and cause issues
like:

1. Some applications exhausting all of the SEV ASIDs and depriving
   others on a host.
2. No capability with the system admin to allocate and limit the number
   of SEV ASIDs used by tasks.
3. Difficult for the cloud service providers to optimally schedule VMs
   and sandboxes across its fleet without knowing the overall picture of
   SEV ASIDs usage.

SEV controller tracks the usage and provides capability to limit SEV
ASIDs used by tasks.

Controller is enabled by CGROUP_SEV config option, it is dependent on
KVM_AMD_SEV option in the config file.

SEV Controller has 3 interface files:

1. max - Sets the max limit of the SEV ASIDs in the cgroup.

2. current - Shows the current count of the SEV ASIDs in the cgroup.

3. events - Event file to show the SEV ASIDs allocation denied in the
cgroup.

When kvm-amd module is installed it calls SEV controller API and informs
how many SEV ASIDs are available on the platform. Controller use this
value to allocate an array which stores ASID to cgroup mapping.

New SEV ASID allocation gets charged to the task's SEV cgroup. Migration
of charge is not supported, so, a charged ASID remains charged to the
same cgroup until that SEV ASID is freed. This feature is similar to the
memory cgroup as it is a stateful resource

On deletion of an empty cgroup whose tasks have moved to some other
cgroup but a SEV ASID is still charged to it, the SEV ASID gets mapped
to the parent cgroup.

Mapping array tells which cgroup to uncharge, and update mapping when
the cgroup is deleted. Mapping array is freed when kvm-amd module is
unloaded.

Signed-off-by: Vipin Sharma 
Reviewed-by: David Rientjes 
Reviewed-by: Dionna Glaze 
Reviewed-by: Erdem Aktas 
---
 arch/x86/kvm/Makefile |   1 +
 arch/x86/kvm/svm/sev.c|  16 +-
 arch/x86/kvm/svm/sev_cgroup.c | 414 ++
 arch/x86/kvm/svm/sev_cgroup.h |  40 
 include/linux/cgroup_subsys.h |   3 +
 init/Kconfig  |  14 ++
 6 files changed, 487 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/kvm/svm/sev_cgroup.c
 create mode 100644 arch/x86/kvm/svm/sev_cgroup.h

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 4a3081e9f4b5..bbbf10fc1b50 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -16,6 +16,7 @@ kvm-$(CONFIG_KVM_ASYNC_PF)+= $(KVM)/async_pf.o
 kvm-y  += x86.o emulate.o i8259.o irq.o lapic.o \
   i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \
   hyperv.o debugfs.o mmu/mmu.o mmu/page_track.o
+kvm-$(CONFIG_CGROUP_SEV)   += svm/sev_cgroup.o
 
 kvm-intel-y+= vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o 
vmx/evmcs.o vmx/nested.o
 kvm-amd-y  += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o 
svm/avic.o svm/sev.o
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 7bf7bf734979..2cc0bea21a76 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -17,6 +17,7 @@
 
 #include "x86.h"
 #include "svm.h"
+#include "sev_cgroup.h"
 
 static int sev_flush_asids(void);
 static DECLARE_RWSEM(sev_deactivate_lock);
@@ -80,7 +81,7 @@ static bool __sev_recycle_asids(void)
 static int sev_asid_new(void)
 {
bool retry = true;
-   int pos;
+   int pos, ret;
 
mutex_lock(_bitmap_lock);
 
@@ -98,6 +99,12 @@ static int sev_asid_new(void)
return -EBUSY;
}
 
+   ret = sev_asid_try_charge(pos);
+   if (ret) {
+   mutex_unlock(_bitmap_lock);
+   return ret;
+   }
+
__set_bit(pos, sev_asid_bitmap);
 
mutex_unlock(_bitmap_lock);
@@ -127,6 +134,8 @@ static void sev_asid_free(int asid)
sd->sev_vmcbs[pos] = NULL;
}
 
+   sev_asid_uncharge(pos);
+
mutex_unlock(_bitmap_lock);
 }
 
@@ -1143,6 +1152,9 @@ int __init sev_hardware_setup(void)
if (!status)
return 1;
 
+   if (sev_cgroup_setup(max_sev_asid))
+   return 1;
+
/*
 * Check SEV platform status.
 *
@@ -1157,6 +1169,7 @@ int __init sev_hardware_setup(void)
pr_info("SEV supported\n");
 
 err:
+   sev_cgroup_teardown();
kfree(status);
return rc;
 }
@@ -1170,6 +1183,7 @@ void sev_hardware_teardown(void)
bitmap_free(sev_reclaim_asid_bitmap);
 
sev_flush_asids();
+   sev_cgroup_teardown();
 }
 
 void pre_sev_run(struct vcpu_svm *svm, int cpu)
diff --git a/arch/x86/kvm/svm/sev_cgroup.c b/arch/x86/kvm/svm/sev_cgroup.c
new file mode 100644
index ..f76a934b8cf2
--- /dev/null
+++ b/arch/x86/kvm/svm/sev_cgroup.c
@@ -0,0 +1,

[RFC Patch 0/2] KVM: SVM: Cgroup support for SVM SEV ASIDs

2020-09-21 Thread Vipin Sharma
Hello,

This patch series adds a new SEV controller for tracking and limiting
the usage of SEV ASIDs on the AMD SVM platform.

SEV ASIDs are used in creating encrypted VM and lightweight sandboxes
but this resource is in very limited quantity on a host.

This limited quantity creates issues like SEV ASID starvation and
unoptimized scheduling in the cloud infrastructure.

SEV controller provides SEV ASID tracking and resource control
mechanisms.

Patch 1 - Overview, motivation, and implementation details of the SEV
  controller.
Patch 2 - Kernel documentation of the SEV controller for both
  cgroup v1 and v2.

Thanks

Vipin Sharma (2):
  KVM: SVM: Create SEV cgroup controller.
  KVM: SVM: SEV cgroup controller documentation

 Documentation/admin-guide/cgroup-v1/sev.rst |  94 +
 Documentation/admin-guide/cgroup-v2.rst |  56 ++-
 arch/x86/kvm/Makefile   |   1 +
 arch/x86/kvm/svm/sev.c  |  16 +-
 arch/x86/kvm/svm/sev_cgroup.c   | 414 
 arch/x86/kvm/svm/sev_cgroup.h   |  40 ++
 include/linux/cgroup_subsys.h   |   3 +
 init/Kconfig|  14 +
 8 files changed, 634 insertions(+), 4 deletions(-)
 create mode 100644 Documentation/admin-guide/cgroup-v1/sev.rst
 create mode 100644 arch/x86/kvm/svm/sev_cgroup.c
 create mode 100644 arch/x86/kvm/svm/sev_cgroup.h

-- 
2.28.0.681.g6f77f65b4e-goog



[RFC Patch 2/2] KVM: SVM: SEV cgroup controller documentation

2020-09-21 Thread Vipin Sharma
SEV cgroup controller documentation.

Documentation for both cgroup versions, v1 and v2, of SEV cgroup
controller. SEV controller is used to distribute and account SEV ASIDs
usage by KVM on AMD processor.

Signed-off-by: Vipin Sharma 
Reviewed-by: David Rientjes 
Reviewed-by: Dionna Glaze 
Reviewed-by: Erdem Aktas 
---
 Documentation/admin-guide/cgroup-v1/sev.rst | 94 +
 Documentation/admin-guide/cgroup-v2.rst | 56 +++-
 2 files changed, 147 insertions(+), 3 deletions(-)
 create mode 100644 Documentation/admin-guide/cgroup-v1/sev.rst

diff --git a/Documentation/admin-guide/cgroup-v1/sev.rst 
b/Documentation/admin-guide/cgroup-v1/sev.rst
new file mode 100644
index ..04d0024360a1
--- /dev/null
+++ b/Documentation/admin-guide/cgroup-v1/sev.rst
@@ -0,0 +1,94 @@
+==
+SEV Controller
+==
+
+Overview
+
+
+The SEV controller regulates the distribution of SEV ASIDs. SEV ASIDs are used
+in creating encrypted VMs on AMD processors. SEV ASIDs are stateful and one
+ASID is only used in one KVM object at a time. It cannot be used with other KVM
+before unbinding it from the previous KVM.
+
+All SEV ASIDs are tracked by this controller and it allows for accounting and
+distribution of this resource.
+
+How to Enable Controller
+
+
+- Enable memory encryption on AMD platform::
+
+CONFIG_KVM_AMD_SEV=y
+
+- Enable SEV controller::
+
+CONFIG_CGROUP_SEV=y
+
+- Above options will build SEV controller support in the kernel.
+  To mount sev controller::
+
+mount -t cgroup -o sev none /sys/fs/cgroup/sev
+
+Interface Files
+==
+
+  sev.current
+A read-only single value file which exists on non-root cgroups.
+
+The total number of SEV ASIDs currently in use by the cgroup and its
+descendants.
+
+  sev.max
+A read-write single value file which exists on non-root cgroups. The
+default is "max".
+
+SEV ASIDs usage hard limit. If the cgroup's current SEV ASIDs usage
+reach this limit then the new SEV VMs creation will return error
+-EBUSY.  This limit cannot be set lower than sev.current.
+
+  sev.events
+A read-only flat-keyed single value file which exists on non-root
+cgroups. A value change in this file generates a file modified event.
+
+  max
+ The number of times the cgroup's SEV ASIDs usage was about to
+ go over the max limit. This is a tally of SEV VM creation
+ failures in the cgroup.
+
+Hierarchy
+=
+
+SEV controller supports hierarchical accounting. It supports following
+features:
+
+1. SEV ASID usage in the cgroup includes itself and its descendent cgroups.
+2. SEV ASID usage can never exceed the max limit set in the cgroup and its
+   ancestor's chain up to the root.
+3. SEV events keep a tally of SEV VM creation failures in the cgroup and not in
+   its child subtree.
+
+Suppose the following example hierarchy::
+
+root
+/  \
+   AB
+   |
+   C
+
+1. A will show the count of SEV ASID used in A and C.
+2. C's SEV ASID usage may not exceed any of the max limits set in C, A, or
+   root.
+3. A's event file lists only SEV VM creation failed in A, and not the ones in
+   C.
+
+Migration and SEV ASID ownership
+
+
+An SEV ASID is charged to the cgroup which instantiated it, and stays charged
+to that cgroup until that SEV ASID is freed. Migrating a process to a different
+cgroup do not move the SEV ASID charge to the destination cgroup where the
+process has moved.
+
+Deletion of a cgroup with existing ASIDs charges will migrate those ASIDs to
+the parent cgroup.
+
diff --git a/Documentation/admin-guide/cgroup-v2.rst 
b/Documentation/admin-guide/cgroup-v2.rst
index 6be43781ec7f..66b8bdee8ff3 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -63,8 +63,11 @@ v1 is available under 
:ref:`Documentation/admin-guide/cgroup-v1/index.rst