Re: [PATCH v5 16/18] x86/sgx: Limit process EPC usage with misc cgroup controller

2023-10-22 Thread Haitao Huang

On Mon, 09 Oct 2023 19:26:01 -0500, Huang, Kai  wrote:



@@ -332,6 +336,7 @@ void sgx_isolate_epc_pages(struct sgx_epc_lru_lists  
*lru, size_t nr_to_scan,

  * sgx_reclaim_epc_pages() - Reclaim EPC pages from the consumers
  * @nr_to_scan: Number of EPC pages to scan for reclaim
  * @ignore_age: Reclaim a page even if it is young
+ * @epc_cg: EPC cgroup from which to reclaim
  *
  * Take a fixed number of pages from the head of the active page pool  
and
  * reclaim them to the enclave's private shmem files. Skip the pages,  
which have
@@ -345,7 +350,8 @@ void sgx_isolate_epc_pages(struct sgx_epc_lru_lists  
*lru, size_t nr_to_scan,
  * problematic as it would increase the lock contention too much,  
which would

  * halt forward progress.
  */
-size_t sgx_reclaim_epc_pages(size_t nr_to_scan, bool ignore_age)
+size_t sgx_reclaim_epc_pages(size_t nr_to_scan, bool ignore_age,
+struct sgx_epc_cgroup *epc_cg)
 {
struct sgx_backing backing[SGX_NR_TO_SCAN_MAX];
struct sgx_epc_page *epc_page, *tmp;
@@ -355,7 +361,15 @@ size_t sgx_reclaim_epc_pages(size_t nr_to_scan,  
bool ignore_age)

LIST_HEAD(iso);
size_t ret, i;

-   sgx_isolate_epc_pages(_global_lru, nr_to_scan, );
+   /*
+* If a specific cgroup is not being targeted, take from the global
+* list first, even when cgroups are enabled.  If there are
+* pages on the global LRU then they should get reclaimed asap.
+*/


This is probably an obsolete comment I should have removed. When cgroup
is enabled, reclaimable pages will always be in a cgroup, the root by default.
The (!epc_cg) condition is harmless but not needed because the global list
will be empty if cgroup is enabled.



+   if (!IS_ENABLED(CONFIG_CGROUP_SGX_EPC) || !epc_cg)
+   sgx_isolate_epc_pages(_global_lru, _to_scan, );
+
+   sgx_epc_cgroup_isolate_pages(epc_cg, _to_scan, );




So it should have been:

+   if (!IS_ENABLED(CONFIG_CGROUP_SGX_EPC))
+   sgx_isolate_epc_pages(_global_lru, _to_scan, );
+   else
+   sgx_epc_cgroup_isolate_pages(epc_cg, _to_scan, );

Or just encapsulate the difference in  sgx_epc_cgroup_isolate_pages

(I wish such code can be somehow moved to the earlier patches, so that  
we can

get early idea that how sgx_reclaim_epc_pages() is supposed to be used.)



I'll try to restructure and split this patch. Now that we are not
going to deal with unreclaimable pages, it'd be simpler and also easier to
restructure.


So here when we are not targeting a specific EPC cgroup, we always  
reclaim from

the global list first, ...

[...]



if (list_empty())
return 0;
@@ -423,7 +437,7 @@ static bool sgx_should_reclaim(unsigned long  
watermark)

 void sgx_reclaim_direct(void)
 {
if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
-   sgx_reclaim_epc_pages(SGX_NR_TO_SCAN, false);
+   sgx_reclaim_epc_pages(SGX_NR_TO_SCAN, false, NULL);


... and we always try to reclaim from the global list first when direct
reclaim is

desired, even if the enclave is within some EPC cgroup.  ...


 }

 static int ksgxd(void *p)
@@ -446,7 +460,7 @@ static int ksgxd(void *p)
 sgx_should_reclaim(SGX_NR_HIGH_PAGES));

if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
-   sgx_reclaim_epc_pages(SGX_NR_TO_SCAN, false);
+   sgx_reclaim_epc_pages(SGX_NR_TO_SCAN, false, NULL);


... and in ksgxd() as well, which I guess is somehow acceptable.  ...



cond_resched();
}
@@ -600,6 +614,11 @@ int sgx_drop_epc_page(struct sgx_epc_page *page)
 struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
 {
struct sgx_epc_page *page;
+   struct sgx_epc_cgroup *epc_cg;
+
+   epc_cg = sgx_epc_cgroup_try_charge(reclaim);
+   if (IS_ERR(epc_cg))
+   return ERR_CAST(epc_cg);


I think I need to add comments to clarify that, after this point, reclaiming
is done by the global reclaimer only to keep the global free page watermark
satisfied. So all reclaiming is from the root if cgroup is enabled, otherwise
from the global LRU (no change from the current implementation).




for ( ; ; ) {
page = __sgx_alloc_epc_page();
@@ -608,8 +627,10 @@ struct sgx_epc_page *sgx_alloc_epc_page(void  
*owner, bool reclaim)

break;
}

-   if (!sgx_can_reclaim())
-   return ERR_PTR(-ENOMEM);
+   if (!sgx_can_reclaim()) {
+   page = ERR_PTR(-ENOMEM);
+   break;
+   }

if (!reclaim) {
page = ERR_PTR(-EBUSY);
@@ -621,10 +642,17 @@ struct sgx_epc_page *sgx_alloc_epc_page(void  
*owner, bool reclaim)

break;
}

-   

Re: [PATCH v5 16/18] x86/sgx: Limit process EPC usage with misc cgroup controller

2023-10-19 Thread Haitao Huang

On Tue, 17 Oct 2023 13:54:54 -0500, Michal Koutný  wrote:

On Fri, Sep 22, 2023 at 08:06:55PM -0700, Haitao Huang  
 wrote:

+static void sgx_epc_cgroup_free(struct misc_cg *cg)
+{
+   struct sgx_epc_cgroup *epc_cg;
+
+   epc_cg = sgx_epc_cgroup_from_misc_cg(cg);


It should check for !epc_cg since the misc controller implementation
in misc_cg_alloc() would roll back even on non-allocated resources.


Good catch. Will fix.




+   cancel_work_sync(_cg->reclaim_work);
+   kfree(epc_cg);
+}
+
+static void sgx_epc_cgroup_max_write(struct misc_cg *cg)
+{
+   struct sgx_epc_reclaim_control rc;
+   struct sgx_epc_cgroup *epc_cg;
+
+   epc_cg = sgx_epc_cgroup_from_misc_cg(cg);
+
+   sgx_epc_reclaim_control_init(, epc_cg);
+   /* Let the reclaimer to do the work so user is not blocked */
+   queue_work(sgx_epc_cg_wq, _cg->reclaim_work);


This is weird. The writer will never learn about the result of the
operation.


Right. With the new plan, this callback will be removed.

Thanks
Haitao


Re: [PATCH v5 16/18] x86/sgx: Limit process EPC usage with misc cgroup controller

2023-10-17 Thread Michal Koutný
On Fri, Sep 22, 2023 at 08:06:55PM -0700, Haitao Huang 
 wrote:
> +static void sgx_epc_cgroup_free(struct misc_cg *cg)
> +{
> + struct sgx_epc_cgroup *epc_cg;
> +
> + epc_cg = sgx_epc_cgroup_from_misc_cg(cg);

It should check for !epc_cg since the misc controller implementation
in misc_cg_alloc() would roll back even on non-allocated resources.

> + cancel_work_sync(_cg->reclaim_work);
> + kfree(epc_cg);
> +}
> +
> +static void sgx_epc_cgroup_max_write(struct misc_cg *cg)
> +{
> + struct sgx_epc_reclaim_control rc;
> + struct sgx_epc_cgroup *epc_cg;
> +
> + epc_cg = sgx_epc_cgroup_from_misc_cg(cg);
> +
> + sgx_epc_reclaim_control_init(, epc_cg);
> + /* Let the reclaimer to do the work so user is not blocked */
> + queue_work(sgx_epc_cg_wq, _cg->reclaim_work);

This is weird. The writer will never learn about the result of the
operation.

Thanks,
Michal


signature.asc
Description: PGP signature


Re: [PATCH v5 16/18] x86/sgx: Limit process EPC usage with misc cgroup controller

2023-10-10 Thread Huang, Kai
On Fri, 2023-09-22 at 20:06 -0700, Haitao Huang wrote:
> From: Kristen Carlson Accardi 
> 
> Implement support for cgroup control of SGX Enclave Page Cache (EPC)
> memory using the misc cgroup controller. EPC memory is independent
> from normal system memory, e.g. must be reserved at boot from RAM and
> cannot be converted between EPC and normal memory while the system is
> running. EPC is managed by the SGX subsystem and is not accounted by
> the memory controller.
> 
> Much like normal system memory, EPC memory can be overcommitted via
> virtual memory techniques and pages can be swapped out of the EPC to
> their backing store (normal system memory, e.g. shmem).  The SGX EPC
> subsystem is analogous to the memory subsystem and the SGX EPC controller
> is in turn analogous to the memory controller; it implements limit and
> protection models for EPC memory.
> 
> The misc controller provides a mechanism to set a hard limit of EPC
> usage via the "sgx_epc" resource in "misc.max". The total EPC memory
> available on the system is reported via the "sgx_epc" resource in
> "misc.capacity".
> 
> This patch was modified from its original version to use the misc cgroup
> controller instead of a custom controller.
> 
> 

[...]

> 
> 7) Other minor refactoring:
> - Remove unused params in epc_cgroup APIs
> - centralize uncharge into sgx_free_epc_page()
> ---
>  arch/x86/Kconfig |  13 +
>  arch/x86/kernel/cpu/sgx/Makefile |   1 +
>  arch/x86/kernel/cpu/sgx/epc_cgroup.c | 415 +++
>  arch/x86/kernel/cpu/sgx/epc_cgroup.h |  59 
>  arch/x86/kernel/cpu/sgx/main.c   |  68 -
>  arch/x86/kernel/cpu/sgx/sgx.h|  17 +-
>  6 files changed, 556 insertions(+), 17 deletions(-)
>  create mode 100644 arch/x86/kernel/cpu/sgx/epc_cgroup.c
>  create mode 100644 arch/x86/kernel/cpu/sgx/epc_cgroup.h

As mentioned before, this patch is pretty large, so it's hard to review.  I
think we should try to split it into smaller patches so that they are
reviewable.

I cannot recall how many times I've scrolled up and down just to find some
function.

Any idea to further split this patch?

Also, I am thinking _perhaps_ the way of organizing the patches of this patchset
can be improved.  I had an impression that this patchset is organized in this
way:  

1) There are many small patches to adjust the elemental code pieces to suit EPC
cgroup support, but many of them don't have enough design information to
justify them, and only say "EPC cgroup will use later" etc.

2) And then the EPC cgroup is implemented in one large patch at the end.

Both 1) and 2) are hard to review.  I need to do a lot of back and forth to
review this series.

I am not finger pointing at anything because it's not easy at all, but just want
to explore options that may make this series easier to review.

For instance, will below make more sense:

Instead of implementing EPC cgroup in one big patch, we introduce key
structures, elemental helpers in separate patch(es) at early position so that
it's easy to review some basic logic/conversion.

And then we may move some key logic of handling EPC cgroup, e.g., reclaim logic,
to early patches when we adjust the elemental code pieces for EPC cgroup.

Perhaps it's worth a try, but that's just my 2 cents.


Re: [PATCH v5 16/18] x86/sgx: Limit process EPC usage with misc cgroup controller

2023-10-10 Thread Huang, Kai

> +
> +static inline struct sgx_epc_cgroup *sgx_epc_cgroup_from_misc_cg(struct 
> misc_cg *cg)
> +{
> + if (cg)
> + return (struct sgx_epc_cgroup 
> *)(cg->res[MISC_CG_RES_SGX_EPC].priv);
> +
> + return NULL;
> +}
> +
> 

Is it a good idea to allow passing a NULL @cg to this basic function?

Why not only call this function when @cg is valid?

> +
> +static int __sgx_epc_cgroup_try_charge(struct sgx_epc_cgroup *epc_cg,
> +bool reclaim)
> +{
> + struct sgx_epc_reclaim_control rc;
> + unsigned int nr_empty = 0;
> +
> + sgx_epc_reclaim_control_init(, epc_cg);
> +
> + for (;;) {
> + if (!misc_cg_try_charge(MISC_CG_RES_SGX_EPC, epc_cg->cg,
> + PAGE_SIZE))
> + break;
> +
> + if (sgx_epc_cgroup_lru_empty(epc_cg))
> + return -ENOMEM;
> +
> + if (signal_pending(current))
> + return -ERESTARTSYS;
> +
> + if (!reclaim) {
> + queue_work(sgx_epc_cg_wq, _cg->reclaim_work);
> + return -EBUSY;
> + }
> +
> + if (!sgx_epc_cgroup_reclaim_pages(1, )) {
> + if (sgx_epc_cgroup_reclaim_failed()) {
> + if (++nr_empty > SGX_EPC_RECLAIM_OOM_THRESHOLD)
> + return -ENOMEM;
> + schedule();
> + }
> + }
> + }
> + if (epc_cg->cg != misc_cg_root())
> + css_get(_cg->cg->css);

I don't quite understand why root is treated specially.

And I thought get_current_misc_cg() in sgx_epc_cgroup_try_charge() already grabs
the reference before calling this function?  Why do it again?

> +
> + return 0;
> +}
> +
> +/**
> + * sgx_epc_cgroup_try_charge() - hierarchically try to charge a single EPC 
> page
> + * @mm:  the mm_struct of the process to charge
> + * @reclaim: whether or not synchronous reclaim is allowed
> + *
> + * Returns EPC cgroup or NULL on success, -errno on failure.
> + */
> +struct sgx_epc_cgroup *sgx_epc_cgroup_try_charge(bool reclaim)
> +{
> + struct sgx_epc_cgroup *epc_cg;
> + int ret;
> +
> + if (sgx_epc_cgroup_disabled())
> + return NULL;
> +
> + epc_cg = sgx_epc_cgroup_from_misc_cg(get_current_misc_cg());
> + ret = __sgx_epc_cgroup_try_charge(epc_cg, reclaim);
> + put_misc_cg(epc_cg->cg);
> +
> + if (ret)
> + return ERR_PTR(ret);
> +
> + return epc_cg;
> +}
> +
> +/**
> + * sgx_epc_cgroup_uncharge() - hierarchically uncharge EPC pages
> + * @epc_cg:  the charged epc cgroup
> + */
> +void sgx_epc_cgroup_uncharge(struct sgx_epc_cgroup *epc_cg)
> +{
> + if (sgx_epc_cgroup_disabled())
> + return;
> +
> + misc_cg_uncharge(MISC_CG_RES_SGX_EPC, epc_cg->cg, PAGE_SIZE);
> +
> + if (epc_cg->cg != misc_cg_root())
> + put_misc_cg(epc_cg->cg);

Again why root is special?  And where is the get_misc_cg()?

Oh is it the 

if (epc_cg->cg != misc_cg_root())
css_get(_cg->cg->css);

in __sgx_epc_cgroup_try_charge()?

That's horrible to follow.  Can this be explicitly done in
sgx_epc_cgroup_try_charge() and sgx_epc_cgroup_uncharge(), that is, grab the
reference in the former and release the reference in the latter?



Re: [PATCH v5 16/18] x86/sgx: Limit process EPC usage with misc cgroup controller

2023-10-09 Thread Huang, Kai

> @@ -332,6 +336,7 @@ void sgx_isolate_epc_pages(struct sgx_epc_lru_lists *lru, 
> size_t nr_to_scan,
>   * sgx_reclaim_epc_pages() - Reclaim EPC pages from the consumers
>   * @nr_to_scan:   Number of EPC pages to scan for reclaim
>   * @ignore_age:   Reclaim a page even if it is young
> + * @epc_cg:   EPC cgroup from which to reclaim
>   *
>   * Take a fixed number of pages from the head of the active page pool and
>   * reclaim them to the enclave's private shmem files. Skip the pages, which 
> have
> @@ -345,7 +350,8 @@ void sgx_isolate_epc_pages(struct sgx_epc_lru_lists *lru, 
> size_t nr_to_scan,
>   * problematic as it would increase the lock contention too much, which would
>   * halt forward progress.
>   */
> -size_t sgx_reclaim_epc_pages(size_t nr_to_scan, bool ignore_age)
> +size_t sgx_reclaim_epc_pages(size_t nr_to_scan, bool ignore_age,
> +  struct sgx_epc_cgroup *epc_cg)
>  {
>   struct sgx_backing backing[SGX_NR_TO_SCAN_MAX];
>   struct sgx_epc_page *epc_page, *tmp;
> @@ -355,7 +361,15 @@ size_t sgx_reclaim_epc_pages(size_t nr_to_scan, bool 
> ignore_age)
>   LIST_HEAD(iso);
>   size_t ret, i;
>  
> - sgx_isolate_epc_pages(_global_lru, nr_to_scan, );
> + /*
> +  * If a specific cgroup is not being targeted, take from the global
> +  * list first, even when cgroups are enabled.  If there are
> +  * pages on the global LRU then they should get reclaimed asap.
> +  */
> + if (!IS_ENABLED(CONFIG_CGROUP_SGX_EPC) || !epc_cg)
> + sgx_isolate_epc_pages(_global_lru, _to_scan, );
> +
> + sgx_epc_cgroup_isolate_pages(epc_cg, _to_scan, );

(I wish such code can be somehow moved to the earlier patches, so that we can
get early idea that how sgx_reclaim_epc_pages() is supposed to be used.)

So here when we are not targeting a specific EPC cgroup, we always reclaim from
the global list first, ...

[...]

>  
>   if (list_empty())
>   return 0;
> @@ -423,7 +437,7 @@ static bool sgx_should_reclaim(unsigned long watermark)
>  void sgx_reclaim_direct(void)
>  {
>   if (sgx_should_reclaim(SGX_NR_LOW_PAGES))
> - sgx_reclaim_epc_pages(SGX_NR_TO_SCAN, false);
> + sgx_reclaim_epc_pages(SGX_NR_TO_SCAN, false, NULL);

... and we always try to reclaim from the global list first when direct reclaim
is desired, even if the enclave is within some EPC cgroup.  ...

>  }
>  
>  static int ksgxd(void *p)
> @@ -446,7 +460,7 @@ static int ksgxd(void *p)
>sgx_should_reclaim(SGX_NR_HIGH_PAGES));
>  
>   if (sgx_should_reclaim(SGX_NR_HIGH_PAGES))
> - sgx_reclaim_epc_pages(SGX_NR_TO_SCAN, false);
> + sgx_reclaim_epc_pages(SGX_NR_TO_SCAN, false, NULL);

... and in ksgxd() as well, which I guess is somehow acceptable.  ...

>  
>   cond_resched();
>   }
> @@ -600,6 +614,11 @@ int sgx_drop_epc_page(struct sgx_epc_page *page)
>  struct sgx_epc_page *sgx_alloc_epc_page(void *owner, bool reclaim)
>  {
>   struct sgx_epc_page *page;
> + struct sgx_epc_cgroup *epc_cg;
> +
> + epc_cg = sgx_epc_cgroup_try_charge(reclaim);
> + if (IS_ERR(epc_cg))
> + return ERR_CAST(epc_cg);
>  
>   for ( ; ; ) {
>   page = __sgx_alloc_epc_page();
> @@ -608,8 +627,10 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, 
> bool reclaim)
>   break;
>   }
>  
> - if (!sgx_can_reclaim())
> - return ERR_PTR(-ENOMEM);
> + if (!sgx_can_reclaim()) {
> + page = ERR_PTR(-ENOMEM);
> + break;
> + }
>  
>   if (!reclaim) {
>   page = ERR_PTR(-EBUSY);
> @@ -621,10 +642,17 @@ struct sgx_epc_page *sgx_alloc_epc_page(void *owner, 
> bool reclaim)
>   break;
>   }
>  
> - sgx_reclaim_epc_pages(SGX_NR_TO_SCAN, false);
> + sgx_reclaim_epc_pages(SGX_NR_TO_SCAN, false, NULL);

... and when an EPC page is allocated, no matter whether the EPC page belongs to
any cgroup or not.

When we are allocating EPC page for one enclave, if that enclave belongs to some
cgroup, is it more reasonable to reclaim EPC pages from its own group (and the
children under it)?

You already got the current EPC cgroup at the beginning of sgx_alloc_epc_page()
when you want to charge the EPC allocation.

>   cond_resched();
>   }
>  


Re: [PATCH v5 16/18] x86/sgx: Limit process EPC usage with misc cgroup controller

2023-10-09 Thread Huang, Kai

> +static inline struct sgx_epc_lru_lists *epc_cg_lru(struct sgx_epc_cgroup 
> *epc_cg)
> +{
> + if (epc_cg)
> + return _cg->lru;
> + return NULL;
> +}
> 

It's legal to return NULL EPC cgroup for a given EPC page, i.e., when the
enclave isn't assigned to any cgroup.  But ...

>  
>  static inline struct sgx_epc_lru_lists *sgx_lru_lists(struct sgx_epc_page 
> *epc_page)
>  {
> + if (IS_ENABLED(CONFIG_CGROUP_SGX_EPC))
> + return epc_cg_lru(epc_page->epc_cg);
> +
>   return _global_lru;
>  }

... here is it legal to return a NULL LRU list?

It appears you always want to return a valid LRU list.  That is, if EPC cgroup
is enabled, and when the EPC page doesn't belong to any cgroup, then you want to
return the sgx_global_lru ?



Re: [PATCH v5 16/18] x86/sgx: Limit process EPC usage with misc cgroup controller

2023-10-09 Thread Huang, Kai

> +/**
> + * sgx_epc_cgroup_lru_empty() - check if a cgroup tree has no pages on its 
> lrus
> + * @root:root of the tree to check
> + *
> + * Return: %true if all cgroups under the specified root have empty LRU 
> lists.
> + * Used to avoid livelocks due to a cgroup having a non-zero charge count but
> + * no pages on its LRUs, e.g. due to a dead enclave waiting to be released or
> + * because all pages in the cgroup are unreclaimable.
> + */
> +bool sgx_epc_cgroup_lru_empty(struct sgx_epc_cgroup *root)
> +{
> + struct cgroup_subsys_state *css_root;
> + struct cgroup_subsys_state *pos;
> + struct sgx_epc_cgroup *epc_cg;
> + bool ret = true;
> +
> + /*
> +  * Caller ensure css_root ref acquired
> +  */
> + css_root = root ? >cg->css : &(misc_cg_root()->css);
> +
> + rcu_read_lock();
> + css_for_each_descendant_pre(pos, css_root) {
> + if (!css_tryget(pos))
> + break;
> +
> + rcu_read_unlock();
> +
> + epc_cg = sgx_epc_cgroup_from_misc_cg(css_misc(pos));
> +
> + spin_lock(_cg->lru.lock);
> + ret = list_empty(_cg->lru.reclaimable);
> + spin_unlock(_cg->lru.lock);
> +
> + rcu_read_lock();
> + css_put(pos);
> + if (!ret)
> + break;
> + }
> +
> + rcu_read_unlock();
> +
> + return ret;
> +}
> 

[...]

> 
>  static inline bool sgx_can_reclaim(void)
>  {
> + if (IS_ENABLED(CONFIG_CGROUP_SGX_EPC))
> + return !sgx_epc_cgroup_lru_empty(NULL);
> +

Is it better to keep a root sgx_epc_cgroup and pass the root instead of NULL?

>   return !list_empty(_global_lru.reclaimable);
>  }
>  


Re: [PATCH v5 16/18] x86/sgx: Limit process EPC usage with misc cgroup controller

2023-10-05 Thread Huang, Kai

> ---
>  arch/x86/Kconfig |  13 +
>  arch/x86/kernel/cpu/sgx/Makefile |   1 +
>  arch/x86/kernel/cpu/sgx/epc_cgroup.c | 415 +++
>  arch/x86/kernel/cpu/sgx/epc_cgroup.h |  59 
>  arch/x86/kernel/cpu/sgx/main.c   |  68 -
>  arch/x86/kernel/cpu/sgx/sgx.h|  17 +-
>  6 files changed, 556 insertions(+), 17 deletions(-)
>  create mode 100644 arch/x86/kernel/cpu/sgx/epc_cgroup.c
>  create mode 100644 arch/x86/kernel/cpu/sgx/epc_cgroup.h

Given how large this patch is, it's better to split if we can.

It seems we can at least split ...

[...]

> 
> @@ -970,6 +1005,7 @@ static void __init arch_update_sysfs_visibility(int nid) 
> {}
>  static bool __init sgx_page_cache_init(void)
>  {
>   u32 eax, ebx, ecx, edx, type;
> + u64 capacity = 0;
>   u64 pa, size;
>   int nid;
>   int i;
> @@ -1020,6 +1056,7 @@ static bool __init sgx_page_cache_init(void)
>  
>   sgx_epc_sections[i].node =  _numa_nodes[nid];
>   sgx_numa_nodes[nid].size += size;
> + capacity += size;
>  
>   sgx_nr_epc_sections++;
>   }
> @@ -1029,6 +1066,9 @@ static bool __init sgx_page_cache_init(void)
>   return false;
>   }
>  
> + misc_cg_set_capacity(MISC_CG_RES_SGX_EPC, capacity);
> + sgx_epc_total_pages = capacity >> PAGE_SHIFT;
> +
>   return true;
>  }
> 

... setting up capacity out as a separate patch, as it is a top-level only file
showing the maximum instances of the resource.

I'll review rest later.


Re: [PATCH v5 16/18] x86/sgx: Limit process EPC usage with misc cgroup controller

2023-09-25 Thread Jarkko Sakkinen
On Sat Sep 23, 2023 at 6:06 AM EEST, Haitao Huang wrote:
> From: Kristen Carlson Accardi 
>
> Implement support for cgroup control of SGX Enclave Page Cache (EPC)
> memory using the misc cgroup controller. EPC memory is independent
> from normal system memory, e.g. must be reserved at boot from RAM and
> cannot be converted between EPC and normal memory while the system is
> running. EPC is managed by the SGX subsystem and is not accounted by
> the memory controller.
>
> Much like normal system memory, EPC memory can be overcommitted via
> virtual memory techniques and pages can be swapped out of the EPC to
> their backing store (normal system memory, e.g. shmem).  The SGX EPC
> subsystem is analogous to the memory subsystem and the SGX EPC controller
> is in turn analogous to the memory controller; it implements limit and
> protection models for EPC memory.
>
> The misc controller provides a mechanism to set a hard limit of EPC
> usage via the "sgx_epc" resource in "misc.max". The total EPC memory
> available on the system is reported via the "sgx_epc" resource in
> "misc.capacity".
>
> This patch was modified from its original version to use the misc cgroup
> controller instead of a custom controller.
>
> Co-developed-by: Sean Christopherson 
> Signed-off-by: Sean Christopherson 
> Signed-off-by: Kristen Carlson Accardi 
> Co-developed-by: Haitao Huang 
> Signed-off-by: Haitao Huang 
> Tested-by: Mikko Ylinen 
>
> Cc: Sean Christopherson 
> ---
> V5:
> - kernel-doc fixes (Jarkko)
>
> V4:
> - Fix a white space issue in Kconfig (Randy).
> - Update comments for LRU list as it can be owned by a cgroup.
> - Fix comments for sgx_reclaim_epc_pages() and use IS_ENABLED consistently 
> (Mikko)
>
> V3:
>
> 1) Use the same maximum number of reclaiming candidate pages to be
> processed, SGX_NR_TO_SCAN_MAX, for each reclaiming iteration in both
> cgroup worker function and ksgxd. This fixes an overflow in the
> backing store buffer with the same fixed size allocated on stack in
> sgx_reclaim_epc_pages().
>
> 2) Initialize max for root EPC cgroup. Otherwise, all
> misc_cg_try_charge() calls would fail as it checks for all limits of
> ancestors all the way to the root node.
>
> 3) Start reclaiming whenever misc_cg_try_charge fails. Removed all
> re-checks for limits and current usage. For all purposes and intent,
> when misc_try_charge() fails, reclaiming is needed. This also corrects
> an error of not reclaiming when the child limit is larger than one of
> its ancestors.
>
> 4) Handle failure on charging to the root EPC cgroup. Failure on charging
> to root means we are at or above capacity, so start reclaiming or return
> OOM error.
>
> 5) Removed the custom cgroup tree walking iterator with epoch tracking
> logic. Replaced it with just the plain css_for_each_descendant_pre
> iterator. The custom iterator implemented a rather complex epoch scheme
> I believe was intended to prevent extra reclaiming from multiple worker
> threads doing the same walk but it turned out not matter much as each
> thread would only reclaim when usage is above limit. Using the plain
> css_for_each_descendant_pre iterator simplified code a bit.
>
> 6) Do not reclaim synchronously in misc_max_write callback which would
> block the user. Instead queue an async work item to run the reclaiming
> loop.
>
> 7) Other minor refactoring:
> - Remove unused params in epc_cgroup APIs
> - centralize uncharge into sgx_free_epc_page()
> ---
>  arch/x86/Kconfig |  13 +
>  arch/x86/kernel/cpu/sgx/Makefile |   1 +
>  arch/x86/kernel/cpu/sgx/epc_cgroup.c | 415 +++
>  arch/x86/kernel/cpu/sgx/epc_cgroup.h |  59 
>  arch/x86/kernel/cpu/sgx/main.c   |  68 -
>  arch/x86/kernel/cpu/sgx/sgx.h|  17 +-
>  6 files changed, 556 insertions(+), 17 deletions(-)
>  create mode 100644 arch/x86/kernel/cpu/sgx/epc_cgroup.c
>  create mode 100644 arch/x86/kernel/cpu/sgx/epc_cgroup.h
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 66bfabae8814..e17c5dc3aea4 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1921,6 +1921,19 @@ config X86_SGX
>  
> If unsure, say N.
>  
> +config CGROUP_SGX_EPC
> + bool "Miscellaneous Cgroup Controller for Enclave Page Cache (EPC) for 
> Intel SGX"
> + depends on X86_SGX && CGROUP_MISC
> + help
> +   Provides control over the EPC footprint of tasks in a cgroup via
> +   the Miscellaneous cgroup controller.
> +
> +   EPC is a subset of regular memory that is usable only by SGX
> +   enclaves and is very limited in quantity, e.g. less than 1%
> +   of total DRAM.
> +
> +   Say N if unsure.
> +
>  config X86_USER_SHADOW_STACK
>   bool "X86 userspace shadow stack"
>   depends on AS_WRUSS
> diff --git a/arch/x86/kernel/cpu/sgx/Makefile 
> b/arch/x86/kernel/cpu/sgx/Makefile
> index 9c1656779b2a..12901a488da7 100644
> --- a/arch/x86/kernel/cpu/sgx/Makefile
> +++ b/arch/x86/kernel/cpu/sgx/Makefile
> @@ -4,3 +4,4