[PATCH V2 1/9] fork: Make IO worker options flag based

2021-09-21 Thread Mike Christie
This patchset adds a couple of new options to kernel_clone_args for IO
thread-like/related users. Instead of adding a new field to
kernel_clone_args for each option, this moves us to a flags-based
approach by first converting io_thread.
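
For illustration only (not part of the patch), a caller now sets a bit
instead of a dedicated field, and future options become new bits rather
than new struct members:

	struct kernel_clone_args args = {
		.flags		= CLONE_VM | CLONE_UNTRACED,
		.worker_flags	= KERN_WORKER_IO,	/* was: .io_thread = 1 */
	};

	if (args.worker_flags & KERN_WORKER_IO)
		/* mark the task as an io_uring worker */;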

Signed-off-by: Mike Christie 
---
 include/linux/sched/task.h | 4 +++-
 kernel/fork.c  | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index ef02be869cf2..48417c735438 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -18,8 +18,11 @@ struct css_set;
 /* All the bits taken by the old clone syscall. */
 #define CLONE_LEGACY_FLAGS 0xffffffffULL
 
+#define KERN_WORKER_IO BIT(0)
+
 struct kernel_clone_args {
u64 flags;
+   u32 worker_flags;
int __user *pidfd;
int __user *child_tid;
int __user *parent_tid;
@@ -31,7 +34,6 @@ struct kernel_clone_args {
/* Number of elements in *set_tid */
size_t set_tid_size;
int cgroup;
-   int io_thread;
struct cgroup *cgrp;
struct css_set *cset;
 };
diff --git a/kernel/fork.c b/kernel/fork.c
index 38681ad44c76..3988106e9609 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2026,7 +2026,7 @@ static __latent_entropy struct task_struct *copy_process(
p = dup_task_struct(current, node);
if (!p)
goto fork_out;
-   if (args->io_thread) {
+   if (args->worker_flags & KERN_WORKER_IO) {
/*
 * Mark us an IO worker, and block any signal that isn't
 * fatal or STOP
@@ -2526,7 +2526,7 @@ struct task_struct *create_io_thread(int (*fn)(void *), 
void *arg, int node)
.exit_signal= (lower_32_bits(flags) & CSIGNAL),
.stack  = (unsigned long)fn,
.stack_size = (unsigned long)arg,
-   .io_thread  = 1,
+   .worker_flags   = KERN_WORKER_IO,
};
 
	return copy_process(NULL, 0, node, &args);
-- 
2.25.1



[PATCH V2 2/9] fork: pass worker_flags to copy_thread

2021-09-21 Thread Mike Christie
We need to break up PF_IO_WORKER into the parts that are used for
scheduling and signal handling and the part that tells copy_thread to
treat it as a special type of thread during setup. This patch passes the
worker_flags to copy_thread, so in the next patch we can add new worker
flags that copy_thread can act on.
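
For reference, the per-architecture change spread across the files below
is mechanical: every copy_thread() prototype simply gains a trailing
worker_flags argument, as in this sketch:

	/* before */
	int copy_thread(unsigned long clone_flags, unsigned long usp,
			unsigned long kthread_arg, struct task_struct *p,
			unsigned long tls);

	/* after */
	int copy_thread(unsigned long clone_flags, unsigned long usp,
			unsigned long kthread_arg, struct task_struct *p,
			unsigned long tls, u32 worker_flags);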

Signed-off-by: Mike Christie 
---
 arch/alpha/kernel/process.c  | 2 +-
 arch/arc/kernel/process.c| 2 +-
 arch/arm/kernel/process.c| 3 ++-
 arch/arm64/kernel/process.c  | 3 ++-
 arch/csky/kernel/process.c   | 3 ++-
 arch/h8300/kernel/process.c  | 3 ++-
 arch/hexagon/kernel/process.c| 2 +-
 arch/ia64/kernel/process.c   | 3 ++-
 arch/m68k/kernel/process.c   | 2 +-
 arch/microblaze/kernel/process.c | 2 +-
 arch/mips/kernel/process.c   | 2 +-
 arch/nds32/kernel/process.c  | 3 ++-
 arch/nios2/kernel/process.c  | 2 +-
 arch/openrisc/kernel/process.c   | 3 ++-
 arch/parisc/kernel/process.c | 3 ++-
 arch/powerpc/kernel/process.c| 2 +-
 arch/riscv/kernel/process.c  | 2 +-
 arch/s390/kernel/process.c   | 3 ++-
 arch/sh/kernel/process_32.c  | 2 +-
 arch/sparc/kernel/process_32.c   | 2 +-
 arch/sparc/kernel/process_64.c   | 2 +-
 arch/um/kernel/process.c | 3 ++-
 arch/x86/kernel/process.c| 2 +-
 arch/xtensa/kernel/process.c | 2 +-
 include/linux/sched/task.h   | 2 +-
 kernel/fork.c| 3 ++-
 26 files changed, 37 insertions(+), 26 deletions(-)

diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index a5123ea426ce..6005b0dfe7e2 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -235,7 +235,7 @@ release_thread(struct task_struct *dead_task)
  */
 int copy_thread(unsigned long clone_flags, unsigned long usp,
unsigned long kthread_arg, struct task_struct *p,
-   unsigned long tls)
+   unsigned long tls, u32 worker_flags)
 {
extern void ret_from_fork(void);
extern void ret_from_kernel_thread(void);
diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c
index 3793876f42d9..4e307e5b5205 100644
--- a/arch/arc/kernel/process.c
+++ b/arch/arc/kernel/process.c
@@ -164,7 +164,7 @@ asmlinkage void ret_from_fork(void);
  */
 int copy_thread(unsigned long clone_flags, unsigned long usp,
unsigned long kthread_arg, struct task_struct *p,
-   unsigned long tls)
+   unsigned long tls, u32 worker_flags)
 {
struct pt_regs *c_regs;/* child's pt_regs */
unsigned long *childksp;   /* to unwind out of __switch_to() */
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 0e2d3051741e..07aeb6ab 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -230,7 +230,8 @@ void release_thread(struct task_struct *dead_task)
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
 int copy_thread(unsigned long clone_flags, unsigned long stack_start,
-   unsigned long stk_sz, struct task_struct *p, unsigned long tls)
+   unsigned long stk_sz, struct task_struct *p, unsigned long tls,
+   u32 worker_flags)
 {
struct thread_info *thread = task_thread_info(p);
struct pt_regs *childregs = task_pt_regs(p);
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 40adb8cdbf5a..7979ec253c29 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -316,7 +316,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct 
task_struct *src)
 asmlinkage void ret_from_fork(void) asm("ret_from_fork");
 
 int copy_thread(unsigned long clone_flags, unsigned long stack_start,
-   unsigned long stk_sz, struct task_struct *p, unsigned long tls)
+   unsigned long stk_sz, struct task_struct *p, unsigned long tls,
+   u32 worker_flags)
 {
struct pt_regs *childregs = task_pt_regs(p);
 
diff --git a/arch/csky/kernel/process.c b/arch/csky/kernel/process.c
index 3d0ca22cd0e2..f38b668515ae 100644
--- a/arch/csky/kernel/process.c
+++ b/arch/csky/kernel/process.c
@@ -34,7 +34,8 @@ int copy_thread(unsigned long clone_flags,
unsigned long usp,
unsigned long kthread_arg,
struct task_struct *p,
-   unsigned long tls)
+   unsigned long tls,
+   u32 worker_flags)
 {
struct switch_stack *childstack;
struct pt_regs *childregs = task_pt_regs(p);
diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c
index 2ac27e4248a4..9a8f6c033ad1 100644
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -106,7 +106,8 @@ void flush_thread(void)
 }
 
 int copy_thread(unsigned long clone_flags, unsigned long usp,
-   unsigned long topstk, struct task_struct *p, unsigned long tls)
+   unsigned long topstk, struct task_struct *p, unsigned long tls,
+   u32 

[PATCH V2 4/9] fork: add option to not clone or dup files

2021-09-21 Thread Mike Christie
Each vhost device gets a thread that is used to perform IO and management
operations. Instead of a thread that is accessing a device, the thread is
part of the device, so when it calls the kernel_worker() function added in
the next patch we can't dup or clone the parent's files/FDs because that
would do an extra increment on ourselves.

Later, when the Qemu process exits:

do_exit -> exit_files -> put_files_struct -> close_files

we would leak the device's resources because of that extra refcount
on the fd or files_struct.

This patch adds a no_files option so these worker threads can avoid
taking an extra refcount on themselves.
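
As an illustrative walk-through (approximate counts, not a trace of real
code), the lifetime cycle being avoided looks like this:

	/* qemu opens the vhost device; its files_struct count = 1 */
	/* worker dups/clones the files_struct:         count = 2 */
	/* qemu exits: do_exit -> exit_files ->
	 * put_files_struct:                            count = 1 */
	/* close_files() never runs, so the vhost fd stays open, and the
	 * worker only exits when that fd is closed: a circular reference. */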

Signed-off-by: Mike Christie 
---
 include/linux/sched/task.h |  1 +
 kernel/fork.c  | 11 +--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index cf7c9fffc839..e165cc67fd3c 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -20,6 +20,7 @@ struct css_set;
 
 #define KERN_WORKER_IO BIT(0)
 #define KERN_WORKER_USER   BIT(1)
+#define KERN_WORKER_NO_FILES   BIT(2)
 
 struct kernel_clone_args {
u64 flags;
diff --git a/kernel/fork.c b/kernel/fork.c
index 4b0e8257993b..98264cf1d6a6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1532,7 +1532,8 @@ static int copy_fs(unsigned long clone_flags, struct 
task_struct *tsk)
return 0;
 }
 
-static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
+static int copy_files(unsigned long clone_flags, struct task_struct *tsk,
+ int no_files)
 {
struct files_struct *oldf, *newf;
int error = 0;
@@ -1544,6 +1545,11 @@ static int copy_files(unsigned long clone_flags, struct 
task_struct *tsk)
if (!oldf)
goto out;
 
+   if (no_files) {
+   tsk->files = NULL;
+   goto out;
+   }
+
if (clone_flags & CLONE_FILES) {
		atomic_inc(&oldf->count);
goto out;
@@ -2179,7 +2185,8 @@ static __latent_entropy struct task_struct *copy_process(
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_security;
-   retval = copy_files(clone_flags, p);
+   retval = copy_files(clone_flags, p,
+   args->worker_flags & KERN_WORKER_NO_FILES);
if (retval)
goto bad_fork_cleanup_semundo;
retval = copy_fs(clone_flags, p);
-- 
2.25.1



[PATCH V2 5/9] fork: add helper to clone a process

2021-09-21 Thread Mike Christie
The vhost layer has similar requirements as io_uring where its worker
threads need to access the userspace thread's memory, want to inherit the
parent's cgroups and namespaces, and be checked against the parent's
RLIMITs. Right now, the vhost layer uses the kthread API which has
kthread_use_mm for mem access, and those threads can use
cgroup_attach_task_all for v1 cgroups, but there are no helpers for the
other items.

This adds a helper to clone a process so we can inherit everything we
want in one call. It's a more generic version of create_io_thread which
will be used by the vhost layer and io_uring in later patches in this set.
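
A hypothetical caller (function and thread names invented for
illustration) would pair the two helpers like this:

	struct task_struct *tsk;

	tsk = kernel_worker(my_worker_fn, my_data, NUMA_NO_NODE,
			    CLONE_FS | CLONE_FILES, 0);
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);

	/* the task is created inactive; this names it and wakes it up */
	kernel_worker_start(tsk, "my-worker-%d", current->pid);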

Signed-off-by: Mike Christie 
---
 include/linux/sched/task.h |  6 -
 kernel/fork.c  | 48 ++
 2 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index e165cc67fd3c..ba0499b6627c 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -87,7 +87,11 @@ extern void exit_files(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
 
 extern pid_t kernel_clone(struct kernel_clone_args *kargs);
-struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node);
+struct task_struct *create_io_thread(int (*fn)(void *i), void *arg, int node);
+struct task_struct *kernel_worker(int (*fn)(void *), void *arg, int node,
+ unsigned long clone_flags, u32 worker_flags);
+__printf(2, 3)
+void kernel_worker_start(struct task_struct *tsk, const char namefmt[], ...);
 struct task_struct *fork_idle(int);
 struct mm_struct *copy_init_mm(void);
 extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
diff --git a/kernel/fork.c b/kernel/fork.c
index 98264cf1d6a6..3f3fcabffa5f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2540,6 +2540,54 @@ struct task_struct *create_io_thread(int (*fn)(void *), 
void *arg, int node)
	return copy_process(NULL, 0, node, &args);
 }
 
+/**
+ * kernel_worker - create a copy of a process to be used by the kernel
+ * @fn: function to run in the new task
+ * @arg: data to be passed to fn
+ * @node: numa node to allocate task from
+ * @clone_flags: CLONE flags
+ * @worker_flags: KERN_WORKER flags
+ *
+ * This returns a created task, or an error pointer. The returned task is
+ * inactive, and the caller must fire it up through kernel_worker_start(). If
+ * this is a PF_IO_WORKER, all signals but KILL and STOP are blocked.
+ */
+struct task_struct *kernel_worker(int (*fn)(void *), void *arg, int node,
+ unsigned long clone_flags, u32 worker_flags)
+{
+   struct kernel_clone_args args = {
+   .flags  = ((lower_32_bits(clone_flags) | CLONE_VM |
+  CLONE_UNTRACED) & ~CSIGNAL),
+   .exit_signal= (lower_32_bits(clone_flags) & CSIGNAL),
+   .stack  = (unsigned long)fn,
+   .stack_size = (unsigned long)arg,
+   .worker_flags   = KERN_WORKER_USER | worker_flags,
+   };
+
+   return copy_process(NULL, 0, node, &args);
+}
+EXPORT_SYMBOL_GPL(kernel_worker);
+
+/**
+ * kernel_worker_start - Start a task created with kernel_worker
+ * @tsk: task to wake up
+ * @namefmt: printf-style format string for the thread name
+ * @arg: arguments for @namefmt
+ */
+void kernel_worker_start(struct task_struct *tsk, const char namefmt[], ...)
+{
+   char name[TASK_COMM_LEN];
+   va_list args;
+
+   va_start(args, namefmt);
+   vsnprintf(name, sizeof(name), namefmt, args);
+   set_task_comm(tsk, name);
+   va_end(args);
+
+   wake_up_new_task(tsk);
+}
+EXPORT_SYMBOL_GPL(kernel_worker_start);
+
 /*
  *  Ok, this is the main fork-routine.
  *
-- 
2.25.1



[PATCH V2 3/9] fork: move PF_IO_WORKER's kernel frame setup to new flag

2021-09-21 Thread Mike Christie
The vhost worker threads need the same frame setup as io_uring's worker
threads, but handle signals differently and do not need the same
scheduling behavior. This patch separates the frame setup parts of
PF_IO_WORKER into a kernel_clone_args flag, KERN_WORKER_USER.
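
In each architecture the test changes from inferring the thread type out
of the task's scheduling flags to checking the explicit clone-time flag,
roughly:

	/* before: PF_IO_WORKER implied the special frame setup */
	if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {

	/* after: the frame setup is requested explicitly */
	if (unlikely(p->flags & PF_KTHREAD ||
		     worker_flags & KERN_WORKER_USER)) {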

Signed-off-by: Mike Christie 
---
 arch/alpha/kernel/process.c  | 3 ++-
 arch/arc/kernel/process.c| 3 ++-
 arch/arm/kernel/process.c| 3 ++-
 arch/arm64/kernel/process.c  | 3 ++-
 arch/csky/kernel/process.c   | 3 ++-
 arch/h8300/kernel/process.c  | 3 ++-
 arch/hexagon/kernel/process.c| 3 ++-
 arch/ia64/kernel/process.c   | 3 ++-
 arch/m68k/kernel/process.c   | 3 ++-
 arch/microblaze/kernel/process.c | 3 ++-
 arch/mips/kernel/process.c   | 3 ++-
 arch/nds32/kernel/process.c  | 3 ++-
 arch/nios2/kernel/process.c  | 3 ++-
 arch/openrisc/kernel/process.c   | 3 ++-
 arch/parisc/kernel/process.c | 3 ++-
 arch/powerpc/kernel/process.c| 3 ++-
 arch/riscv/kernel/process.c  | 3 ++-
 arch/s390/kernel/process.c   | 3 ++-
 arch/sh/kernel/process_32.c  | 3 ++-
 arch/sparc/kernel/process_32.c   | 3 ++-
 arch/sparc/kernel/process_64.c   | 3 ++-
 arch/um/kernel/process.c | 3 ++-
 arch/x86/kernel/process.c| 4 ++--
 arch/xtensa/kernel/process.c | 3 ++-
 include/linux/sched/task.h   | 1 +
 kernel/fork.c| 2 +-
 26 files changed, 50 insertions(+), 26 deletions(-)

diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index 6005b0dfe7e2..f7432f921fb2 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -249,7 +249,8 @@ int copy_thread(unsigned long clone_flags, unsigned long 
usp,
childti->pcb.ksp = (unsigned long) childstack;
childti->pcb.flags = 1; /* set FEN, clear everything else */
 
-   if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
+   if (unlikely(p->flags & (PF_KTHREAD) ||
+worker_flags & KERN_WORKER_USER)) {
/* kernel thread */
memset(childstack, 0,
sizeof(struct switch_stack) + sizeof(struct pt_regs));
diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c
index 4e307e5b5205..4dca64153972 100644
--- a/arch/arc/kernel/process.c
+++ b/arch/arc/kernel/process.c
@@ -191,7 +191,8 @@ int copy_thread(unsigned long clone_flags, unsigned long 
usp,
childksp[0] = 0;/* fp */
childksp[1] = (unsigned long)ret_from_fork; /* blink */
 
-   if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
+   if (unlikely(p->flags & (PF_KTHREAD) ||
+worker_flags & KERN_WORKER_USER)) {
memset(c_regs, 0, sizeof(struct pt_regs));
 
c_callee->r13 = kthread_arg;
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 07aeb6ab..3d80f8ee2829 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -248,7 +248,8 @@ int copy_thread(unsigned long clone_flags, unsigned long 
stack_start,
thread->cpu_domain = get_domain();
 #endif
 
-   if (likely(!(p->flags & (PF_KTHREAD | PF_IO_WORKER)))) {
+   if (unlikely(p->flags & (PF_KTHREAD) ||
+worker_flags & KERN_WORKER_USER)) {
*childregs = *current_pt_regs();
childregs->ARM_r0 = 0;
if (stack_start)
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 7979ec253c29..2fcc2be22d25 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -334,7 +334,8 @@ int copy_thread(unsigned long clone_flags, unsigned long 
stack_start,
 
ptrauth_thread_init_kernel(p);
 
-   if (likely(!(p->flags & (PF_KTHREAD | PF_IO_WORKER)))) {
+   if (unlikely(p->flags & (PF_KTHREAD) ||
+worker_flags & KERN_WORKER_USER)) {
*childregs = *current_pt_regs();
childregs->regs[0] = 0;
 
diff --git a/arch/csky/kernel/process.c b/arch/csky/kernel/process.c
index f38b668515ae..17927e15f4ff 100644
--- a/arch/csky/kernel/process.c
+++ b/arch/csky/kernel/process.c
@@ -50,7 +50,8 @@ int copy_thread(unsigned long clone_flags,
/* setup thread.sp for switch_to !!! */
p->thread.sp = (unsigned long)childstack;
 
-   if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
+   if (unlikely(p->flags & (PF_KTHREAD) ||
+worker_flags & KERN_WORKER_USER)) {
memset(childregs, 0, sizeof(struct pt_regs));
childstack->r15 = (unsigned long) ret_from_kernel_thread;
childstack->r10 = kthread_arg;
diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c
index 9a8f6c033ad1..066001ca931a 100644
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -113,7 +113,8 @@ int copy_thread(unsigned long clone_flags, unsigned long 
usp,
 
childregs = (struct pt_regs *) (THREAD_SIZE + 

[PATCH V2 7/9] fork: Add worker flag to ignore signals

2021-09-21 Thread Mike Christie
The kthread API creates threads that ignore all signals by default so
modules like vhost that will move from that API to kernel_worker will
not be expecting them. This patch adds a worker flag that tells
kernel_worker to set up the task to ignore signals.
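
A caller migrating from the kthread API then passes the new flag, as the
vhost conversion in patch 9 does:

	tsk = kernel_worker(vhost_worker, worker, NUMA_NO_NODE,
			    CLONE_FS | CLONE_CLEAR_SIGHAND,
			    KERN_WORKER_NO_FILES | KERN_WORKER_NO_SIGS);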

Signed-off-by: Mike Christie 
---
 include/linux/sched/task.h |  1 +
 kernel/fork.c  | 11 ++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 781abbc1c288..aefa0d221b57 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -21,6 +21,7 @@ struct css_set;
 #define KERN_WORKER_IO BIT(0)
 #define KERN_WORKER_USER   BIT(1)
 #define KERN_WORKER_NO_FILES   BIT(2)
+#define KERN_WORKER_NO_SIGS    BIT(3)
 
 struct kernel_clone_args {
u64 flags;
diff --git a/kernel/fork.c b/kernel/fork.c
index 3f3fcabffa5f..34d3dca70cfb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2555,6 +2555,8 @@ struct task_struct *create_io_thread(int (*fn)(void *), 
void *arg, int node)
 struct task_struct *kernel_worker(int (*fn)(void *), void *arg, int node,
  unsigned long clone_flags, u32 worker_flags)
 {
+   struct task_struct *tsk;
+
struct kernel_clone_args args = {
.flags  = ((lower_32_bits(clone_flags) | CLONE_VM |
   CLONE_UNTRACED) & ~CSIGNAL),
@@ -2564,7 +2566,14 @@ struct task_struct *kernel_worker(int (*fn)(void *), 
void *arg, int node,
.worker_flags   = KERN_WORKER_USER | worker_flags,
};
 
-   return copy_process(NULL, 0, node, );
+   tsk = copy_process(NULL, 0, node, );
+   if (IS_ERR(tsk))
+   return tsk;
+
+   if (worker_flags & KERN_WORKER_NO_SIGS)
+   ignore_signals(tsk);
+
+   return tsk;
 }
 EXPORT_SYMBOL_GPL(kernel_worker);
 
-- 
2.25.1



[PATCH V2 6/9] io_uring: switch to kernel_worker

2021-09-21 Thread Mike Christie
Convert io_uring and io-wq to use kernel_worker.

Signed-off-by: Mike Christie 
---
 fs/io-wq.c | 15 ---
 fs/io_uring.c  | 11 +--
 include/linux/sched/task.h |  1 -
 3 files changed, 13 insertions(+), 14 deletions(-)

diff --git a/fs/io-wq.c b/fs/io-wq.c
index c2e0e8e80949..74e68132a227 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -69,6 +69,9 @@ struct io_worker {
 
 #define IO_WQ_NR_HASH_BUCKETS  (1u << IO_WQ_HASH_ORDER)
 
+#define IO_WQ_CLONE_FLAGS  (CLONE_FS | CLONE_FILES | CLONE_SIGHAND | \
+CLONE_THREAD | CLONE_IO)
+
 struct io_wqe_acct {
unsigned nr_workers;
unsigned max_workers;
@@ -549,13 +552,9 @@ static int io_wqe_worker(void *data)
struct io_wqe *wqe = worker->wqe;
struct io_wq *wq = wqe->wq;
bool last_timeout = false;
-   char buf[TASK_COMM_LEN];
 
worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING);
 
-   snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid);
-   set_task_comm(current, buf);
-
	while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
long ret;
 
@@ -652,7 +651,7 @@ static void io_init_new_worker(struct io_wqe *wqe, struct 
io_worker *worker,
	list_add_tail_rcu(&worker->all_list, &wqe->all_list);
	worker->flags |= IO_WORKER_F_FREE;
	raw_spin_unlock(&wqe->lock);
-   wake_up_new_task(tsk);
+   kernel_worker_start(tsk, "iou-wrk-%d", wqe->wq->task->pid);
 }
 
 static bool io_wq_work_match_all(struct io_wq_work *work, void *data)
@@ -682,7 +681,8 @@ static void create_worker_cont(struct callback_head *cb)
worker = container_of(cb, struct io_worker, create_work);
	clear_bit_unlock(0, &worker->create_state);
wqe = worker->wqe;
-   tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
+   tsk = kernel_worker(io_wqe_worker, worker, wqe->node,
+   IO_WQ_CLONE_FLAGS, KERN_WORKER_IO);
if (!IS_ERR(tsk)) {
io_init_new_worker(wqe, worker, tsk);
io_worker_release(worker);
@@ -752,7 +752,8 @@ static bool create_io_worker(struct io_wq *wq, struct 
io_wqe *wqe, int index)
if (index == IO_WQ_ACCT_BOUND)
worker->flags |= IO_WORKER_F_BOUND;
 
-   tsk = create_io_thread(io_wqe_worker, worker, wqe->node);
+   tsk = kernel_worker(io_wqe_worker, worker, wqe->node, IO_WQ_CLONE_FLAGS,
+   KERN_WORKER_IO);
if (!IS_ERR(tsk)) {
io_init_new_worker(wqe, worker, tsk);
} else if (!io_should_retry_thread(PTR_ERR(tsk))) {
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e372d5b9f6dc..1df7bec8bd76 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -7393,12 +7393,8 @@ static int io_sq_thread(void *data)
struct io_sq_data *sqd = data;
struct io_ring_ctx *ctx;
unsigned long timeout = 0;
-   char buf[TASK_COMM_LEN];
DEFINE_WAIT(wait);
 
-   snprintf(buf, sizeof(buf), "iou-sqp-%d", sqd->task_pid);
-   set_task_comm(current, buf);
-
if (sqd->sq_cpu != -1)
set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu));
else
@@ -8580,6 +8576,8 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
fdput(f);
}
if (ctx->flags & IORING_SETUP_SQPOLL) {
+   unsigned long flags = CLONE_FS | CLONE_FILES | CLONE_SIGHAND |
+   CLONE_THREAD | CLONE_IO;
struct task_struct *tsk;
struct io_sq_data *sqd;
bool attached;
@@ -8621,7 +8619,8 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
 
sqd->task_pid = current->pid;
sqd->task_tgid = current->tgid;
-   tsk = create_io_thread(io_sq_thread, sqd, NUMA_NO_NODE);
+   tsk = kernel_worker(io_sq_thread, sqd, NUMA_NO_NODE,
+   flags, KERN_WORKER_IO);
if (IS_ERR(tsk)) {
ret = PTR_ERR(tsk);
goto err_sqpoll;
@@ -8629,7 +8628,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx,
 
sqd->thread = tsk;
ret = io_uring_alloc_task_context(tsk, ctx);
-   wake_up_new_task(tsk);
+   kernel_worker_start(tsk, "iou-sqp-%d", sqd->task_pid);
if (ret)
goto err;
} else if (p->flags & IORING_SETUP_SQ_AFF) {
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index ba0499b6627c..781abbc1c288 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -87,7 +87,6 @@ extern void exit_files(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
 
 extern pid_t kernel_clone(struct kernel_clone_args *kargs);
-struct task_struct *create_io_thread(int (*fn)(void *i), void *arg, int node);
 struct task_struct *kernel_worker(int (*fn)(void *), void *arg, int node,
  

[PATCH V2 9/9] vhost: use kernel_worker to check RLIMITs and inherit v2 cgroups

2021-09-21 Thread Mike Christie
For vhost workers we use the kthread API, which inherits its values from
and checks against the kthreadd thread. This results in cgroups v2 not
working and the wrong RLIMITs being checked. This patch has us use the
kernel_worker function, which will inherit its values/checks from the
thread that owns the device.

Signed-off-by: Mike Christie 
---
 drivers/vhost/vhost.c | 68 ---
 drivers/vhost/vhost.h |  7 -
 2 files changed, 31 insertions(+), 44 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index c9a1f706989c..7a5142dcde1b 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -22,7 +22,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -344,17 +343,14 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 static int vhost_worker(void *data)
 {
struct vhost_worker *worker = data;
-   struct vhost_dev *dev = worker->dev;
struct vhost_work *work, *work_next;
struct llist_node *node;
 
-   kthread_use_mm(dev->mm);
-
for (;;) {
/* mb paired w/ kthread_stop */
set_current_state(TASK_INTERRUPTIBLE);
 
-   if (kthread_should_stop()) {
+   if (test_bit(VHOST_WORKER_FLAG_STOP, &worker->flags)) {
__set_current_state(TASK_RUNNING);
break;
}
@@ -376,8 +372,9 @@ static int vhost_worker(void *data)
schedule();
}
}
-   kthread_unuse_mm(dev->mm);
-   return 0;
+
+   complete(worker->exit_done);
+   do_exit(0);
 }
 
 static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
@@ -517,31 +514,6 @@ long vhost_dev_check_owner(struct vhost_dev *dev)
 }
 EXPORT_SYMBOL_GPL(vhost_dev_check_owner);
 
-struct vhost_attach_cgroups_struct {
-   struct vhost_work work;
-   struct task_struct *owner;
-   int ret;
-};
-
-static void vhost_attach_cgroups_work(struct vhost_work *work)
-{
-   struct vhost_attach_cgroups_struct *s;
-
-   s = container_of(work, struct vhost_attach_cgroups_struct, work);
-   s->ret = cgroup_attach_task_all(s->owner, current);
-}
-
-static int vhost_attach_cgroups(struct vhost_dev *dev)
-{
-   struct vhost_attach_cgroups_struct attach;
-
-   attach.owner = current;
-   vhost_work_init(&attach.work, vhost_attach_cgroups_work);
-   vhost_work_queue(dev, &attach.work);
-   vhost_work_dev_flush(dev);
-   return attach.ret;
-}
-
 /* Caller should have device mutex */
 bool vhost_dev_has_owner(struct vhost_dev *dev)
 {
@@ -579,6 +551,16 @@ static void vhost_detach_mm(struct vhost_dev *dev)
dev->mm = NULL;
 }
 
+static void vhost_worker_stop(struct vhost_worker *worker)
+{
+   DECLARE_COMPLETION_ONSTACK(exit_done);
+
+   worker->exit_done = &exit_done;
+   set_bit(VHOST_WORKER_FLAG_STOP, >flags);
+   wake_up_process(worker->task);
+   wait_for_completion(worker->exit_done);
+}
+
 static void vhost_worker_free(struct vhost_dev *dev)
 {
struct vhost_worker *worker = dev->worker;
@@ -588,7 +570,7 @@ static void vhost_worker_free(struct vhost_dev *dev)
 
dev->worker = NULL;
	WARN_ON(!llist_empty(&worker->work_list));
-   kthread_stop(worker->task);
+   vhost_worker_stop(worker);
kfree(worker);
 }
 
@@ -603,27 +585,27 @@ static int vhost_worker_create(struct vhost_dev *dev)
return -ENOMEM;
 
dev->worker = worker;
-   worker->dev = dev;
worker->kcov_handle = kcov_common_handle();
	init_llist_head(&worker->work_list);
 
-   task = kthread_create(vhost_worker, worker, "vhost-%d", current->pid);
+   /*
+* vhost used to use the kthread API which ignores all signals by
+* default and the drivers expect this behavior. So we do not want to
+* inherit the parent's signal handlers and set our worker to ignore
+* everything below.
+*/
+   task = kernel_worker(vhost_worker, worker, NUMA_NO_NODE,
+CLONE_FS | CLONE_CLEAR_SIGHAND,
+KERN_WORKER_NO_FILES | KERN_WORKER_NO_SIGS);
if (IS_ERR(task)) {
ret = PTR_ERR(task);
goto free_worker;
}
 
worker->task = task;
-   wake_up_process(task); /* avoid contributing to loadavg */
-
-   ret = vhost_attach_cgroups(dev);
-   if (ret)
-   goto stop_worker;
-
+   kernel_worker_start(task, "vhost-%d", current->pid);
return 0;
 
-stop_worker:
-   kthread_stop(worker->task);
 free_worker:
kfree(worker);
dev->worker = NULL;
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 102ce25e4e13..09748694cb66 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -25,11 +25,16 @@ struct vhost_work {
unsigned long   flags;
 };
 
+enum {
+   VHOST_WORKER_FLAG_STOP,
+};
+
 struct vhost_worker {

[PATCH V2 8/9] vhost: move worker thread fields to new struct

2021-09-21 Thread Mike Christie
This is just a prep patch. It moves the worker-related fields to a new
vhost_worker struct and moves the code around to create some helpers that
will be used in the next patches.
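
The vhost.h hunk is truncated in this archive; reconstructed from the
fields the code below references, the new struct is approximately:

	struct vhost_worker {
		struct task_struct	*task;
		struct llist_head	work_list;
		struct vhost_dev	*dev;
		u64			kcov_handle;
	};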

Signed-off-by: Mike Christie 
Reviewed-by: Stefan Hajnoczi 
---
 drivers/vhost/vhost.c | 98 ---
 drivers/vhost/vhost.h | 11 +++--
 2 files changed, 72 insertions(+), 37 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 59edb5a1ffe2..c9a1f706989c 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -263,8 +263,8 @@ void vhost_work_queue(struct vhost_dev *dev, struct 
vhost_work *work)
 * sure it was not in the list.
 * test_and_set_bit() implies a memory barrier.
 */
-   llist_add(&work->node, &dev->work_list);
-   wake_up_process(dev->worker);
+   llist_add(&work->node, &dev->worker->work_list);
+   wake_up_process(dev->worker->task);
}
 }
 EXPORT_SYMBOL_GPL(vhost_work_queue);
@@ -272,7 +272,7 @@ EXPORT_SYMBOL_GPL(vhost_work_queue);
 /* A lockless hint for busy polling code to exit the loop */
 bool vhost_has_work(struct vhost_dev *dev)
 {
-   return !llist_empty(&dev->work_list);
+   return dev->worker && !llist_empty(&dev->worker->work_list);
 }
 EXPORT_SYMBOL_GPL(vhost_has_work);
 
@@ -343,7 +343,8 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 
 static int vhost_worker(void *data)
 {
-   struct vhost_dev *dev = data;
+   struct vhost_worker *worker = data;
+   struct vhost_dev *dev = worker->dev;
struct vhost_work *work, *work_next;
struct llist_node *node;
 
@@ -358,7 +359,7 @@ static int vhost_worker(void *data)
break;
}
 
-   node = llist_del_all(&dev->work_list);
+   node = llist_del_all(&worker->work_list);
if (!node)
schedule();
 
@@ -368,7 +369,7 @@ static int vhost_worker(void *data)
llist_for_each_entry_safe(work, work_next, node, node) {
	clear_bit(VHOST_WORK_QUEUED, &work->flags);
__set_current_state(TASK_RUNNING);
-   kcov_remote_start_common(dev->kcov_handle);
+   kcov_remote_start_common(worker->kcov_handle);
work->fn(work);
kcov_remote_stop();
if (need_resched())
@@ -487,7 +488,6 @@ void vhost_dev_init(struct vhost_dev *dev,
dev->byte_weight = byte_weight;
dev->use_worker = use_worker;
dev->msg_handler = msg_handler;
-   init_llist_head(&dev->work_list);
	init_waitqueue_head(&dev->wait);
	INIT_LIST_HEAD(&dev->read_list);
	INIT_LIST_HEAD(&dev->pending_list);
@@ -579,10 +579,60 @@ static void vhost_detach_mm(struct vhost_dev *dev)
dev->mm = NULL;
 }
 
+static void vhost_worker_free(struct vhost_dev *dev)
+{
+   struct vhost_worker *worker = dev->worker;
+
+   if (!worker)
+   return;
+
+   dev->worker = NULL;
+   WARN_ON(!llist_empty(&worker->work_list));
+   kthread_stop(worker->task);
+   kfree(worker);
+}
+
+static int vhost_worker_create(struct vhost_dev *dev)
+{
+   struct vhost_worker *worker;
+   struct task_struct *task;
+   int ret;
+
+   worker = kzalloc(sizeof(*worker), GFP_KERNEL_ACCOUNT);
+   if (!worker)
+   return -ENOMEM;
+
+   dev->worker = worker;
+   worker->dev = dev;
+   worker->kcov_handle = kcov_common_handle();
+   init_llist_head(&worker->work_list);
+
+   task = kthread_create(vhost_worker, worker, "vhost-%d", current->pid);
+   if (IS_ERR(task)) {
+   ret = PTR_ERR(task);
+   goto free_worker;
+   }
+
+   worker->task = task;
+   wake_up_process(task); /* avoid contributing to loadavg */
+
+   ret = vhost_attach_cgroups(dev);
+   if (ret)
+   goto stop_worker;
+
+   return 0;
+
+stop_worker:
+   kthread_stop(worker->task);
+free_worker:
+   kfree(worker);
+   dev->worker = NULL;
+   return ret;
+}
+
 /* Caller should have device mutex */
 long vhost_dev_set_owner(struct vhost_dev *dev)
 {
-   struct task_struct *worker;
int err;
 
/* Is there an owner already? */
@@ -593,36 +643,21 @@ long vhost_dev_set_owner(struct vhost_dev *dev)
 
vhost_attach_mm(dev);
 
-   dev->kcov_handle = kcov_common_handle();
if (dev->use_worker) {
-   worker = kthread_create(vhost_worker, dev,
-   "vhost-%d", current->pid);
-   if (IS_ERR(worker)) {
-   err = PTR_ERR(worker);
-   goto err_worker;
-   }
-
-   dev->worker = worker;
-   wake_up_process(worker); /* avoid contributing to loadavg */
-
-   err = vhost_attach_cgroups(dev);
+   err = vhost_worker_create(dev);
 

[PATCH V2 0/9] Use copy_process/create_io_thread in vhost layer

2021-09-21 Thread Mike Christie
The following patches were made over Linus's tree but also apply over
Jens's 5.16 io_uring branch and Michael's vhost/next branch.

This is version 2 of the patchset and should handle all the review
comments posted in V1 here:

https://lore.kernel.org/all/20210916212051.6918-1-michael.chris...@oracle.com/

If I missed a comment, please let me know.

This patchset allows the vhost layer to do a copy_process on the thread
that does the VHOST_SET_OWNER ioctl like how io_uring does a copy_process
against its userspace app (Jens, the patches make create_io_thread more
generic so that's why you are cc'd). This allows the vhost layer's worker
threads to inherit cgroups, namespaces, address space, etc and this worker
thread will also be accounted for against that owner/parent process's
RLIMIT_NPROC limit.

If you are not familiar with qemu and vhost here is more detailed
problem description:

Qemu will create vhost devices in the kernel which perform network, SCSI,
etc IO and management operations from worker threads created by the
kthread API. Because the kthread API does a copy_process on the kthreadd
thread, the vhost layer has to use kthread_use_mm to access the Qemu
thread's memory and cgroup_attach_task_all to add itself to the Qemu
thread's cgroups.

The problem with this approach is that we then have to add new functions/
args/functionality for everything we want to inherit. I started doing
that here:

https://lkml.org/lkml/2021/6/23/1233

for the RLIMIT_NPROC check, but it seems it might be easier to just
inherit everything from the beginning, because I'd need to do something
like that patch several times. For example, the current approach does not
support cgroups v2 so commands like virsh emulatorpin do not work. The
qemu process can go over its RLIMIT_NPROC. And for future vhost interfaces
where we export the vhost thread pid we will want the namespace info.

V2:
- Rename kernel_copy_process to kernel_worker.
- Instead of exporting functions, make kernel_worker() a proper
  function/API that does common work for the caller.
- Instead of adding new fields to kernel_clone_args for each option
  make it flag based similar to CLONE_*.
- Drop unused completion struct in vhost.
- Fix compile warnings by merging vhost cgroup cleanup patch and
  vhost conversion patch.
 




Re: [PATCH 1/1] virtio/s390: fix vritio-ccw device teardown

2021-09-21 Thread Cornelia Huck
On Tue, Sep 21 2021, Halil Pasic  wrote:

> On Mon, 20 Sep 2021 12:07:23 +0200
> Cornelia Huck  wrote:
>
>> On Mon, Sep 20 2021, Vineeth Vijayan  wrote:
>> 
>> > On Mon, 2021-09-20 at 00:39 +0200, Halil Pasic wrote:  
>> >> On Fri, 17 Sep 2021 10:40:20 +0200
>> >> Cornelia Huck  wrote:
>> >>   
>> > ...snip...  
>> >> > > 
>> >> > > Thanks, if I find time for it, I will try to understand this
>> >> > > better and
>> >> > > come back with my findings.
>> >> > >
>> >> > > > > * Can virtio_ccw_remove() get called while !cdev->online and 
>> >> > > > >   virtio_ccw_online() is running on a different cpu? If yes,
>> >> > > > > what would
>> >> > > > >   happen then?  
>> >> > > > 
>> >> > > > All of the remove/online/... etc. callbacks are invoked via the
>> >> > > > ccw bus
>> >> > > > code. We have to trust that it gets it correct :) (Or have the
>> >> > > > common
>> >> > > > I/O layer maintainers double-check it.)
>> >> > > > 
>> >> > > 
>> >> > > Vineeth, what is your take on this? Are the struct ccw_driver
>> >> > > virtio_ccw_remove and the virtio_ccw_online callbacks mutually
>> >> > > exclusive. Please notice that we may initiate the onlining by
>> >> > > calling ccw_device_set_online() from a workqueue.
>> >> > > 
>> >> > > @Conny: I'm not sure what is your definition of 'it gets it
>> >> > > correct'...
>> >> > > I doubt CIO can make things 100% foolproof in this area.
>> >> > 
>> >> > Not 100% foolproof, but "don't online a device that is in the
>> >> > progress
>> >> > of going away" seems pretty basic to me.
>> >> >   
>> >> 
>> >> I hope Vineeth will chime in on this.  
>> > Considering the online/offline processing, 
>> > The ccw_device_set_offline function or the online/offline is handled
>> > inside device_lock. Also, the online_store function takes care of
>> > avoiding multiple online/offline processing. 
>> >
>> > Now, when we consider the unconditional remove of the device,
>> > I am not familiar with the virtio_ccw driver. My assumptions are based
>> > on how CIO/dasd drivers works. If i understand correctly, the dasd
>> > driver sets different flags to make sure that a device_open is getting
>> > prevented while the the device is in progress of offline-ing.   
>> 
>> Hm, if we are invoking the online/offline callbacks under the device
>> lock already, 
>
> I believe we have a misunderstanding here. I believe that Vineeth is
> trying to tell us, that online_store_handle_online() and
> online_store_handle_offline() are called under the device lock of
> the ccw device. Right, Vineeth?
>
> Conny, I believe, by online/offline callbacks, you mean
> virtio_ccw_online() and virtio_ccw_offline(), right?

Whatever the common I/O layer invokes.

>
> But the thing is that virtio_ccw_online() may get called (and is
> typically called, AFAICT) with no locks held via:
> virtio_ccw_probe() --> async_schedule(virtio_ccw_auto_online, cdev)
> -*-> virtio_ccw_auto_online(cdev) --> ccw_device_set_online(cdev) -->
> virtio_ccw_online()

That's the common I/O layer in there again?

>
> Furthermore after a closer look, I believe because we don't take
> a reference to the cdev in probe, we may get virtio_ccw_auto_online()
> called with an invalid pointer (the pointer is guaranteed to be valid
> in probe, but because of async we have no guarantee that it will be
> called in the context of probe).
>
> Shouldn't we take a reference to the cdev in probe? BTW what is the
> reason for the async?

I don't know.

>
>
>> how would that affect the remove callback?
>
> I believe dev->bus->remove(dev) is called by 
> bus_remove_device() with the device lock held. I.e. I believe that means
> that virtio_ccw_remove() is called with the ccw devices device lock
> held. Vineeth can you confirm that?
>
>
> The thing is, both virtio_ccw_remove() and virtio_ccw_offline() are
> very similar, with the notable exception that offline assumes we are
> online() at the moment, while remove() does the same only if it
> decides based on vcdev && cdev->online that we are online.
>
>
>> Shouldn't they
>> be serialized under the device lock already? I think we are fine.
>
> AFAICT virtio_ccw_remove() and virtio_ccw_offline() are serialized
> against each other under the device lock. And also against
> virtio_ccw_online() iff it was initiated via the sysfs, and not via
> the auto-online mechanism.
>
> Thus I don't think we are fine at the moment.

I don't understand this, sorry.

>
>> 
>> For dasd, I think they also need to deal with the block device
>> lifetimes. For virtio-ccw, we are basically a transport that does not
>> know about devices further down the chain (that are associated with the
>> virtio device, whose lifetime is tied to online/offline processing.) I'd
>> presume that the serialization above would be enough.
>> 
>
> I don't know about dasd that much. For the reasons stated above, I don't
> think the serialization we have right now is entirely sufficient.

I'm not sure it makes sense to discuss this further 

Re: [PATCH RFC v1 03/11] iommu/virtio: Handle incoming page faults

2021-09-21 Thread Jean-Philippe Brucker
On Fri, Apr 23, 2021 at 03:21:39PM +0530, Vivek Gautam wrote:
> Redirect the incoming page faults to the registered fault handler
> that can take the fault information such as, pasid, page request
> group-id, address and pasid flags.
> 
> Signed-off-by: Vivek Gautam 
> ---
>  drivers/iommu/virtio-iommu.c  | 80 ++-
>  include/uapi/linux/virtio_iommu.h |  1 +
>  2 files changed, 80 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
> index c970f386f031..fd237cad1ce5 100644
> --- a/drivers/iommu/virtio-iommu.c
> +++ b/drivers/iommu/virtio-iommu.c
> @@ -37,6 +37,13 @@
>  /* Some architectures need an Address Space ID for each page table */
>  DEFINE_XARRAY_ALLOC1(viommu_asid_xa);
>  
> +struct viommu_dev_pri_work {
> + struct work_struct  work;
> + struct viommu_dev   *dev;
> + struct virtio_iommu_fault   *vfault;
> + u32 endpoint;
> +};
> +
>  struct viommu_dev {
>   struct iommu_device iommu;
>   struct device   *dev;
> @@ -49,6 +56,8 @@ struct viommu_dev {
>   struct list_headrequests;
>   void*evts;
>   struct list_headendpoints;
> + struct workqueue_struct *pri_wq;
> + struct viommu_dev_pri_work  *pri_work;

IOPF already has a workqueue, so the driver doesn't need one.
iommu_report_device_fault() should be fast enough to be called from the
event handler.
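
A rough sketch of that simplification (untested; error handling and the
endpoint lookup elided) would build the event inline and drop the
private workqueue:

	static int viommu_fault_handler(struct viommu_dev *viommu,
					struct virtio_iommu_fault *fault)
	{
		struct viommu_endpoint *vdev; /* found from fault->endpoint */
		struct iommu_fault_event fault_evt = {
			.fault.type = IOMMU_FAULT_PAGE_REQ,
		};

		/* ... fill fault_evt.fault.prm from *fault and look up the
		 * endpoint's device, as viommu_handle_ppr() does ... */
		return iommu_report_device_fault(vdev->dev, &fault_evt);
	}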

>  
>   /* Device configuration */
>   struct iommu_domain_geometrygeometry;
> @@ -666,6 +675,58 @@ static int viommu_probe_endpoint(struct viommu_dev 
> *viommu, struct device *dev)
>   return ret;
>  }
>  
> +static void viommu_handle_ppr(struct work_struct *work)
> +{
> + struct viommu_dev_pri_work *pwork =
> + container_of(work, struct viommu_dev_pri_work, 
> work);
> + struct viommu_dev *viommu = pwork->dev;
> + struct virtio_iommu_fault *vfault = pwork->vfault;
> + struct viommu_endpoint *vdev;
> + struct viommu_ep_entry *ep;
> + struct iommu_fault_event fault_evt = {
> + .fault.type = IOMMU_FAULT_PAGE_REQ,
> + };
> + struct iommu_fault_page_request *prq = &fault_evt.fault.prm;
> +
> + u32 flags   = le32_to_cpu(vfault->flags);
> + u32 prq_flags   = le32_to_cpu(vfault->pr_evt_flags);
> + u32 endpoint= pwork->endpoint;
> +
> + memset(prq, 0, sizeof(struct iommu_fault_page_request));

The fault_evt struct is already initialized

> + prq->addr = le64_to_cpu(vfault->address);
> +
> + if (prq_flags & VIRTIO_IOMMU_FAULT_PRQ_F_LAST_PAGE)
> + prq->flags |= IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE;
> + if (prq_flags & VIRTIO_IOMMU_FAULT_PRQ_F_PASID_VALID) {
> + prq->flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID;
> + prq->pasid = le32_to_cpu(vfault->pasid);
> + prq->grpid = le32_to_cpu(vfault->grpid);
> + }
> +
> + if (flags & VIRTIO_IOMMU_FAULT_F_READ)
> + prq->perm |= IOMMU_FAULT_PERM_READ;
> + if (flags & VIRTIO_IOMMU_FAULT_F_WRITE)
> + prq->perm |= IOMMU_FAULT_PERM_WRITE;
> + if (flags & VIRTIO_IOMMU_FAULT_F_EXEC)
> + prq->perm |= IOMMU_FAULT_PERM_EXEC;
> + if (flags & VIRTIO_IOMMU_FAULT_F_PRIV)
> + prq->perm |= IOMMU_FAULT_PERM_PRIV;
> +
> + list_for_each_entry(ep, &viommu->endpoints, list) {
> + if (ep->eid == endpoint) {
> + vdev = ep->vdev;
> + break;
> + }
> + }
> +
> + if ((prq_flags & VIRTIO_IOMMU_FAULT_PRQ_F_PASID_VALID) &&
> + (prq_flags & VIRTIO_IOMMU_FAULT_PRQ_F_NEEDS_PASID))
> + prq->flags |= IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID;
> +
> + if (iommu_report_device_fault(vdev->dev, &fault_evt))
> + dev_err(vdev->dev, "Couldn't handle page request\n");

An error likely means that nobody registered a fault handler, but we could
display a few more details about the fault that would help debug the
endpoint

> +}
> +
>  static int viommu_fault_handler(struct viommu_dev *viommu,
>   struct virtio_iommu_fault *fault)
>  {
> @@ -679,7 +740,13 @@ static int viommu_fault_handler(struct viommu_dev 
> *viommu,
>   u32 pasid   = le32_to_cpu(fault->pasid);
>  
>   if (type == VIRTIO_IOMMU_FAULT_F_PAGE_REQ) {
> - dev_info(viommu->dev, "Page request fault - unhandled\n");
> + dev_info_ratelimited(viommu->dev,
> +  "Page request fault from EP %u\n",
> +  endpoint);

That's rather for debugging the virtio-iommu driver, so should be
dev_dbg() (or removed entirely)

> +
> + viommu->pri_work->vfault = fault;
> + viommu->pri_work->endpoint = endpoint;
> + queue_work(viommu->pri_wq, &viommu->pri_work->work);
>

Re: [PATCH RFC v1 05/11] iommu/virtio: Add SVA feature and related enable/disable callbacks

2021-09-21 Thread Jean-Philippe Brucker
On Fri, Apr 23, 2021 at 03:21:41PM +0530, Vivek Gautam wrote:
> Add a feature flag to virtio iommu for Shared virtual addressing
> (SVA). This feature would indicate the availability of a path for handling
> device page faults, and the provision for sending a page response.

In this case the feature should probably be called PAGE_REQUEST or
similar. SVA aggregates PF + PASID + shared page tables

Thanks,
Jean

> Also add necessary methods to enable and disable SVA so that the
> masters can enable the SVA path. This also requires enabling the
> PRI capability on the device.
> 
> Signed-off-by: Vivek Gautam 
> ---
>  drivers/iommu/virtio-iommu.c  | 268 ++
>  include/uapi/linux/virtio_iommu.h |   1 +
>  2 files changed, 269 insertions(+)
> 
> diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
> index 3da5f0807711..250c137a211b 100644
> --- a/drivers/iommu/virtio-iommu.c
> +++ b/drivers/iommu/virtio-iommu.c
> @@ -18,6 +18,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -26,6 +27,7 @@
>  
>  #include 
>  #include "iommu-pasid-table.h"
> +#include "iommu-sva-lib.h"
>  
>  #define MSI_IOVA_BASE0x800
>  #define MSI_IOVA_LENGTH  0x10
> @@ -37,6 +39,9 @@
>  /* Some architectures need an Address Space ID for each page table */
>  DEFINE_XARRAY_ALLOC1(viommu_asid_xa);
>  
> +static DEFINE_MUTEX(sva_lock);
> +static DEFINE_MUTEX(iopf_lock);
> +
>  struct viommu_dev_pri_work {
>   struct work_struct  work;
>   struct viommu_dev   *dev;
> @@ -71,6 +76,7 @@ struct viommu_dev {
>  
>   boolhas_map:1;
>   boolhas_table:1;
> + boolhas_sva:1;
>  };
>  
>  struct viommu_mapping {
> @@ -124,6 +130,12 @@ struct viommu_endpoint {
>   void*pstf;
>   /* Preferred page table format */
>   void*pgtf;
> +
> + /* sva */
> + boolats_supported;
> + boolpri_supported;
> + boolsva_enabled;
> + booliopf_enabled;
>  };
>  
>  struct viommu_ep_entry {
> @@ -582,6 +594,64 @@ static int viommu_add_pstf(struct viommu_endpoint *vdev, 
> void *pstf, size_t len)
>   return 0;
>  }
>  
> +static int viommu_init_ats_pri(struct viommu_endpoint *vdev)
> +{
> + struct device *dev = vdev->dev;
> + struct pci_dev *pdev = to_pci_dev(dev);
> +
> + if (!dev_is_pci(vdev->dev))
> + return -EINVAL;
> +
> + if (pci_ats_supported(pdev))
> + vdev->ats_supported = true;
> +
> + if (pci_pri_supported(pdev))
> + vdev->pri_supported = true;
> +
> + return 0;
> +}
> +
> +static int viommu_enable_pri(struct viommu_endpoint *vdev)
> +{
> + int ret;
> + struct pci_dev *pdev;
> +
> + /* Let's allow only 4 requests for PRI right now */
> + size_t max_inflight_pprs = 4;
> +
> + if (!vdev->pri_supported || !vdev->ats_supported)
> + return -ENODEV;
> +
> + pdev = to_pci_dev(vdev->dev);
> +
> + ret = pci_reset_pri(pdev);
> + if (ret)
> + return ret;
> +
> + ret = pci_enable_pri(pdev, max_inflight_pprs);
> + if (ret) {
> + dev_err(vdev->dev, "cannot enable PRI: %d\n", ret);
> + return ret;
> + }
> +
> + return 0;
> +}
> +
> +static void viommu_disable_pri(struct viommu_endpoint *vdev)
> +{
> + struct pci_dev *pdev;
> +
> + if (!dev_is_pci(vdev->dev))
> + return;
> +
> + pdev = to_pci_dev(vdev->dev);
> +
> + if (!pdev->pri_enabled)
> + return;
> +
> + pci_disable_pri(pdev);
> +}
> +
>  static int viommu_init_queues(struct viommu_dev *viommu)
>  {
>   viommu->iopf_pri = iopf_queue_alloc(dev_name(viommu->dev));
> @@ -684,6 +754,10 @@ static int viommu_probe_endpoint(struct viommu_dev 
> *viommu, struct device *dev)
>   if (ret)
>   goto out_free_eps;
>  
> + ret = viommu_init_ats_pri(vdev);
> + if (ret)
> + goto out_free_eps;
> +
>   kfree(probe);
>   return 0;
>  
> @@ -1681,6 +1755,194 @@ static int viommu_of_xlate(struct device *dev, struct 
> of_phandle_args *args)
>   return iommu_fwspec_add_ids(dev, args->args, 1);
>  }
>  
> +static bool viommu_endpoint_iopf_supported(struct viommu_endpoint *vdev)
> +{
> + /* TODO: support Stall model later */
> + return vdev->pri_supported;
> +}
> +
> +bool viommu_endpoint_sva_supported(struct viommu_endpoint *vdev)
> +{
> + struct viommu_dev *viommu = vdev->viommu;
> +
> + if (!viommu->has_sva)
> + return false;
> +
> + return vdev->pasid_bits;
> +}
> +
> +bool viommu_endpoint_sva_enabled(struct viommu_endpoint *vdev)
> +{
> + bool enabled;
> +
> + mutex_lock(&sva_lock);
> + 

Re: [PATCH RFC v1 08/11] iommu/arm-smmu-v3: Implement shared context alloc and free ops

2021-09-21 Thread Jean-Philippe Brucker
On Fri, Apr 23, 2021 at 03:21:44PM +0530, Vivek Gautam wrote:
> Implementing the alloc_shared_cd and free_shared_cd in cd-lib, and
> start using them for arm-smmu-v3-sva implementation.
> 
> Signed-off-by: Vivek Gautam 
> ---
>  .../arm/arm-smmu-v3/arm-smmu-v3-cd-lib.c  | 71 
>  .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c   | 83 ---
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c   |  1 -
>  drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h   | 14 
>  4 files changed, 73 insertions(+), 96 deletions(-)
> 
> diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-cd-lib.c 
> b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-cd-lib.c
> index 537b7c784d40..b87829796596 100644
> --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-cd-lib.c
> +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-cd-lib.c
> @@ -285,16 +285,14 @@ static bool arm_smmu_free_asid(struct xarray *xa, void 
> *cookie_cd)
>   * descriptor is using it, try to replace it.
>   */
>  static struct arm_smmu_ctx_desc *
> -arm_smmu_share_asid(struct mm_struct *mm, u16 asid)
> +arm_smmu_share_asid(struct iommu_pasid_table *tbl, struct mm_struct *mm,
> + struct xarray *xa, u16 asid, u32 asid_bits)

xa and asid_bits could be stored in some arch-specific section of the
iommu_pasid_table struct. Other table drivers wouldn't need those
arguments.

More a comment for the parent series: it may be clearer to give a
different prefix to functions in this file (arm_smmu_cd_, pst_arm_?).
Reading this patch I'm a little confused by what belongs in the IOMMU
driver and what is done by this library. (I also keep reading 'tbl' as
'tlb'. Maybe we could make it 'table' since that doesn't take a lot more
space)

>  {
>   int ret;
>   u32 new_asid;
>   struct arm_smmu_ctx_desc *cd;
> - struct arm_smmu_device *smmu;
> - struct arm_smmu_domain *smmu_domain;
> - struct iommu_pasid_table *tbl;
>  
> - cd = xa_load(&arm_smmu_asid_xa, asid);
> + cd = xa_load(xa, asid);
>   if (!cd)
>   return NULL;
>  
> @@ -306,12 +304,8 @@ arm_smmu_share_asid(struct mm_struct *mm, u16 asid)
>   return cd;
>   }
>  
> - smmu_domain = container_of(cd, struct arm_smmu_domain, s1_cfg.cd);
> - smmu = smmu_domain->smmu;
> - tbl = smmu_domain->tbl;
> -
> - ret = xa_alloc(&arm_smmu_asid_xa, &new_asid, cd,
> -XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL);
> + ret = xa_alloc(xa, &new_asid, cd, XA_LIMIT(1, (1 << asid_bits) - 1),
> +GFP_KERNEL);
>   if (ret)
>   return ERR_PTR(-ENOSPC);
>   /*
> @@ -325,48 +319,52 @@ arm_smmu_share_asid(struct mm_struct *mm, u16 asid)
>* be some overlap between use of both ASIDs, until we invalidate the
>* TLB.
>*/
> - ret = iommu_psdtable_write(tbl, &tbl->cfg, 0, cd);
> + ret = arm_smmu_write_ctx_desc(&tbl->cfg, 0, cd);
>   if (ret)
>   return ERR_PTR(-ENOSYS);
>  
>   /* Invalidate TLB entries previously associated with that context */
> - iommu_psdtable_flush_tlb(tbl, smmu_domain, asid);
> + iommu_psdtable_flush_tlb(tbl, tbl->cookie, asid);
>  
> - xa_erase(&arm_smmu_asid_xa, asid);
> + xa_erase(xa, asid);
>   return NULL;
>  }
>  
> -struct arm_smmu_ctx_desc *
> -arm_smmu_alloc_shared_cd(struct iommu_pasid_table *tbl, struct mm_struct *mm)
> +static struct iommu_psdtable_mmu_notifier *
> +arm_smmu_alloc_shared_cd(struct iommu_pasid_table *tbl, struct mm_struct *mm,
> +  struct xarray *xa, u32 asid_bits)
>  {
>   u16 asid;
>   int err = 0;
>   u64 tcr, par, reg;
>   struct arm_smmu_ctx_desc *cd;
>   struct arm_smmu_ctx_desc *ret = NULL;
> + struct iommu_psdtable_mmu_notifier *pst_mn;
>  
>   asid = arm64_mm_context_get(mm);
>   if (!asid)
>   return ERR_PTR(-ESRCH);
>  
> + pst_mn = kzalloc(sizeof(*pst_mn), GFP_KERNEL);
> + if (!pst_mn) {
> + err = -ENOMEM;
> + goto out_put_context;
> + }
> +
>   cd = kzalloc(sizeof(*cd), GFP_KERNEL);
>   if (!cd) {
>   err = -ENOMEM;
> - goto out_put_context;
> + goto out_free_mn;
>   }
>  
>   refcount_set(&cd->refs, 1);
>  
> - mutex_lock(&arm_smmu_asid_lock);
> - ret = arm_smmu_share_asid(mm, asid);
> + ret = arm_smmu_share_asid(tbl, mm, xa, asid, asid_bits);
>   if (ret) {
> - mutex_unlock(&arm_smmu_asid_lock);
>   goto out_free_cd;
>   }
>  
> - err = xa_insert(&arm_smmu_asid_xa, asid, cd, GFP_KERNEL);
> - mutex_unlock(&arm_smmu_asid_lock);
> -
> + err = xa_insert(xa, asid, cd, GFP_KERNEL);
>   if (err)
>   goto out_free_asid;
>  
> @@ -406,21 +404,26 @@ arm_smmu_alloc_shared_cd(struct iommu_pasid_table *tbl, 
> struct mm_struct *mm)
>   cd->asid = asid;
>   cd->mm = mm;
>  
> - return cd;
> + pst_mn->vendor.cd = cd;
> + return pst_mn;
>  
>  out_free_asid:
> - 

Re: [PATCH RFC v1 09/11] iommu/virtio: Implement sva bind/unbind calls

2021-09-21 Thread Jean-Philippe Brucker
On Fri, Apr 23, 2021 at 03:21:45PM +0530, Vivek Gautam wrote:
> SVA bind and unbind implementations will allow preparing a translation
> context with CPU page tables that can be programmed into host iommu
> hardware to realize shared address space utilization between the CPU
> and virtualized devices using virtio-iommu.
> 
> Signed-off-by: Vivek Gautam 
> ---
>  drivers/iommu/virtio-iommu.c  | 199 +-
>  include/uapi/linux/virtio_iommu.h |   2 +
>  2 files changed, 199 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
> index 250c137a211b..08f1294baeab 100644
> --- a/drivers/iommu/virtio-iommu.c
> +++ b/drivers/iommu/virtio-iommu.c
> @@ -14,6 +14,9 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -28,6 +31,7 @@
>  #include 
>  #include "iommu-pasid-table.h"
>  #include "iommu-sva-lib.h"
> +#include "io-pgtable-arm.h"

Is this used here?

>  
>  #define MSI_IOVA_BASE0x800
>  #define MSI_IOVA_LENGTH  0x10
> @@ -41,6 +45,7 @@ DEFINE_XARRAY_ALLOC1(viommu_asid_xa);
>  
>  static DEFINE_MUTEX(sva_lock);
>  static DEFINE_MUTEX(iopf_lock);
> +static DEFINE_MUTEX(viommu_asid_lock);
>  
>  struct viommu_dev_pri_work {
>   struct work_struct  work;
> @@ -88,10 +93,22 @@ struct viommu_mapping {
>  struct viommu_mm {
>   int pasid;
>   u64 archid;
> + struct viommu_sva_bond  *bond;
>   struct io_pgtable_ops   *ops;
>   struct viommu_domain*domain;
>  };
>  
> +struct viommu_sva_bond {
> + struct iommu_svasva;
> + struct mm_struct*mm;
> + struct iommu_psdtable_mmu_notifier  *viommu_mn;
> + struct list_headlist;
> + refcount_t  refs;
> +};
> +
> +#define sva_to_viommu_bond(handle) \
> + container_of(handle, struct viommu_sva_bond, sva)
> +
>  struct viommu_domain {
>   struct iommu_domain domain;
>   struct viommu_dev   *viommu;
> @@ -136,6 +153,7 @@ struct viommu_endpoint {
>   boolpri_supported;
>   boolsva_enabled;
>   booliopf_enabled;
> + struct list_headbonds;
>  };
>  
>  struct viommu_ep_entry {
> @@ -1423,14 +1441,15 @@ static int viommu_attach_pasid_table(struct 
> viommu_endpoint *vdev,
>  
>   pst_cfg->iommu_dev = viommu->dev->parent;
>  
> + mutex_lock(&viommu_asid_lock);
>   /* Prepare PASID tables info to allocate a new table */
>   ret = viommu_prepare_pst(vdev, pst_cfg, fmt);
>   if (ret)
> - return ret;
> + goto err_out_unlock;
>  
>   ret = iommu_psdtable_alloc(tbl, pst_cfg);
>   if (ret)
> - return ret;
> + goto err_out_unlock;
>  
>   pst_cfg->iommu_dev = viommu->dev->parent;
>   pst_cfg->fmt = PASID_TABLE_ARM_SMMU_V3;
> @@ -1452,6 +1471,7 @@ static int viommu_attach_pasid_table(struct 
> viommu_endpoint *vdev,
>   if (ret)
>   goto err_free_ops;
>   }
> + mutex_unlock(&viommu_asid_lock);
>   } else {
>   /* TODO: otherwise, check for compatibility with vdev. */
>   return -ENOSYS;
> @@ -1467,6 +1487,8 @@ static int viommu_attach_pasid_table(struct 
> viommu_endpoint *vdev,
>  err_free_psdtable:
>   iommu_psdtable_free(tbl, &tbl->cfg);
>  
> +err_out_unlock:
> + mutex_unlock(&viommu_asid_lock);
>   return ret;
>  }
>  
> @@ -1706,6 +1728,7 @@ static struct iommu_device *viommu_probe_device(struct 
> device *dev)
>   vdev->dev = dev;
>   vdev->viommu = viommu;
>   INIT_LIST_HEAD(&vdev->resv_regions);
> + INIT_LIST_HEAD(&vdev->bonds);
>   dev_iommu_priv_set(dev, vdev);
>  
>   if (viommu->probe_size) {
> @@ -1755,6 +1778,175 @@ static int viommu_of_xlate(struct device *dev, struct 
> of_phandle_args *args)
>   return iommu_fwspec_add_ids(dev, args->args, 1);
>  }
>  
> +static u32 viommu_sva_get_pasid(struct iommu_sva *handle)
> +{
> + struct viommu_sva_bond *bond = sva_to_viommu_bond(handle);
> +
> + return bond->mm->pasid;
> +}
> +
> +static void viommu_mmu_notifier_free(struct mmu_notifier *mn)
> +{
> + kfree(mn_to_pstiommu(mn));
> +}
> +
> +static struct mmu_notifier_ops viommu_mmu_notifier_ops = {
> + .free_notifier  = viommu_mmu_notifier_free,

.invalidate_range and .release will be needed as well, to keep up to date
with changes to the address space
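
Something along these lines would do it. A minimal sketch, with
viommu_atc_inv_range() and viommu_atc_inv_all() as hypothetical
invalidation helpers and mn_to_pstiommu() being the accessor this
patch already uses:

static void viommu_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
                                                 struct mm_struct *mm,
                                                 unsigned long start,
                                                 unsigned long end)
{
        struct iommu_psdtable_mmu_notifier *viommu_mn = mn_to_pstiommu(mn);

        /* Flush the device TLBs/ATC for the VA range that just changed. */
        viommu_atc_inv_range(viommu_mn, start, end);
}

static void viommu_mmu_notifier_release(struct mmu_notifier *mn,
                                        struct mm_struct *mm)
{
        struct iommu_psdtable_mmu_notifier *viommu_mn = mn_to_pstiommu(mn);

        /*
         * The address space is going away: quiesce DMA on this PASID and
         * invalidate everything before the mm frees the page tables.
         */
        viommu_atc_inv_all(viommu_mn);
}

static struct mmu_notifier_ops viommu_mmu_notifier_ops = {
        .invalidate_range       = viommu_mmu_notifier_invalidate_range,
        .release                = viommu_mmu_notifier_release,
        .free_notifier          = viommu_mmu_notifier_free,
};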

> +};
> +
> +/* Allocate or get existing MMU notifier for this {domain, mm} pair */
> +static struct iommu_psdtable_mmu_notifier *
> +viommu_mmu_notifier_get(struct 

Re: [PATCH RFC v1 01/11] uapi/virtio-iommu: Add page request grp-id and flags information

2021-09-21 Thread Jean-Philippe Brucker
Hi Vivek,

Thanks a lot for your work on this

On Fri, Apr 23, 2021 at 03:21:37PM +0530, Vivek Gautam wrote:
> Add fault information for the group-id and the necessary flags for
> page request faults, so that they can be handled by the page fault
> handler in the virtio-iommu driver.
> 
> Signed-off-by: Vivek Gautam 
> Cc: Joerg Roedel 
> Cc: Will Deacon 
> Cc: Robin Murphy 
> Cc: Jean-Philippe Brucker 
> Cc: Eric Auger 
> Cc: Alex Williamson 
> Cc: Kevin Tian 
> Cc: Jacob Pan 
> Cc: Liu Yi L 
> Cc: Lorenzo Pieralisi 
> Cc: Shameerali Kolothum Thodi 
> ---
>  include/uapi/linux/virtio_iommu.h | 13 +
>  1 file changed, 13 insertions(+)
> 
> diff --git a/include/uapi/linux/virtio_iommu.h b/include/uapi/linux/virtio_iommu.h
> index f8bf927a0689..accc3318ce46 100644
> --- a/include/uapi/linux/virtio_iommu.h
> +++ b/include/uapi/linux/virtio_iommu.h
> @@ -307,14 +307,27 @@ struct virtio_iommu_req_invalidate {
>  #define VIRTIO_IOMMU_FAULT_F_DMA_UNRECOV 1
>  #define VIRTIO_IOMMU_FAULT_F_PAGE_REQ 2
>  
> +#define VIRTIO_IOMMU_FAULT_PRQ_F_PASID_VALID (1 << 0)
> +#define VIRTIO_IOMMU_FAULT_PRQ_F_LAST_PAGE   (1 << 1)
> +#define VIRTIO_IOMMU_FAULT_PRQ_F_PRIV_DATA   (1 << 2)
> +#define VIRTIO_IOMMU_FAULT_PRQ_F_NEEDS_PASID (1 << 3)

I don't think this one is necessary here. The NEEDS_PASID flags added by
commit 970471914c67 ("iommu: Allow page responses without PASID") mainly
helps Linux keep track of things internally. It does tell the fault
handler whether to reply with PASID or not, but we don't need that here.
The virtio-iommu driver knows whether a PASID is required by looking at
the "PRG Response PASID Required" bit in the PCIe capability. For non-PCIe
faults (e.g. SMMU stall), I'm guessing we'll need a PROBE property to
declare that the endpoint supports recoverable faults anyway, so "PASID
required in response" can go through there as well.
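
For the PCIe case that lookup could be as simple as the sketch below
(assuming vdev->dev is the endpoint's struct device;
pci_prg_resp_pasid_required() is the existing helper from
<linux/pci-ats.h>):

static bool viommu_page_resp_needs_pasid(struct viommu_endpoint *vdev)
{
        struct pci_dev *pdev;

        /* Non-PCIe endpoints would declare this via a PROBE property. */
        if (!dev_is_pci(vdev->dev))
                return false;

        pdev = to_pci_dev(vdev->dev);
        return pci_prg_resp_pasid_required(pdev);
}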

> +
> +#define VIRTIO_IOMMU_FAULT_UNREC_F_PASID_VALID   (1 << 0)
> +#define VIRTIO_IOMMU_FAULT_UNREC_F_ADDR_VALID(1 << 1)
> +#define VIRTIO_IOMMU_FAULT_UNREC_F_FETCH_ADDR_VALID  (1 << 2)
> +
>  struct virtio_iommu_fault {
>   __u8 reason;
>   __u8 reserved[3];
>   __le16  flt_type;
>   __u8 reserved2[2];
> + /* flags is actually permission flags */

It's also used for declaring validity of fields.
VIRTIO_IOMMU_FAULT_F_ADDRESS already tells whether the address field is
valid, so all the other flags introduced by this patch can go in here.

>   __le32  flags;
> + /* flags for PASID and Page request handling info */
> + __le32  pr_evt_flags;
>   __le32  endpoint;
>   __le32  pasid;
> + __le32  grpid;

I'm not sure why we made it 32-bit in the Linux UAPI; it's a little
wasteful. The PCIe PRGI is 9 bits and the SMMU STAG is 16 bits. Since
the scope of the grpid
is the endpoint, 16-bit means 64k in-flight faults per endpoint, which
seems more than enough.

New fields must be appended at the end of the struct, because old drivers
will expect to find the 'endpoint' field at this offset. You could remove
'reserved3' while adding 'grpid', to keep the struct layout.

>   __u8 reserved3[4];
>   __le64  address;
>   __u8 reserved4[8];


So the base structure, currently in the spec, looks like this:

struct virtio_iommu_fault {
u8   reason;
u8   reserved[3];
le32 flags;
le32 endpoint;
le32 reserved1;
le64 address;
};

#define VIRTIO_IOMMU_FAULT_F_READ   (1 << 0)
#define VIRTIO_IOMMU_FAULT_F_WRITE  (1 << 1)
#define VIRTIO_IOMMU_FAULT_F_ADDRESS(1 << 8)

The extended struct could be:

struct virtio_iommu_fault {
u8   reason;
u8   reserved[3];
le32 flags;
le32 endpoint;
le32 pasid;
le64 address;
/* Page request group ID */
le16 group_id;
u8   reserved1[6];
/* For VT-d private data */
le64 private_data[2];
};

#define VIRTIO_IOMMU_FAULT_F_READ   (1 << 0)
#define VIRTIO_IOMMU_FAULT_F_WRITE  (1 << 1)
#define VIRTIO_IOMMU_FAULT_F_EXEC   (1 << 2)
#define VIRTIO_IOMMU_FAULT_F_PRIVILEGED (1 << 3)
/* Last fault in group */
#define VIRTIO_IOMMU_FAULT_F_LAST   (1 << 4)
/* Fault is a recoverable page request and requires a response */
#define VIRTIO_IOMMU_FAULT_F_PAGE_REQ   (1 << 5)


Re: [PATCH RFC v1 02/11] iommu/virtio: Maintain a list of endpoints served by viommu_dev

2021-09-21 Thread Jean-Philippe Brucker
On Fri, Apr 23, 2021 at 03:21:38PM +0530, Vivek Gautam wrote:
> Keeping a record of the endpoints that are served by the virtio-iommu
> device would help in redirecting page fault requests to the correct
> endpoint device for handling.
> 
> Signed-off-by: Vivek Gautam 
> ---
>  drivers/iommu/virtio-iommu.c | 21 +
>  1 file changed, 21 insertions(+)
> 
> diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
> index 50039070e2aa..c970f386f031 100644
> --- a/drivers/iommu/virtio-iommu.c
> +++ b/drivers/iommu/virtio-iommu.c
> @@ -48,6 +48,7 @@ struct viommu_dev {
>   spinlock_t  request_lock;
>   struct list_head requests;
>   void *evts;
> + struct list_headendpoints;

As we're going to search by ID, an xarray or rb_tree would be more
appropriate than a list

>  
>   /* Device configuration */
>   struct iommu_domain_geometry geometry;
> @@ -115,6 +116,12 @@ struct viommu_endpoint {
>   void *pgtf;
>  };
>  
> +struct viommu_ep_entry {
> + u32 eid;
> + struct viommu_endpoint  *vdev;
> + struct list_head list;
> +};

No need for a separate struct, I think you can just add the list head and
id into viommu_endpoint.
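
Putting those two points together, roughly (a sketch, assuming
viommu_endpoint gains an 'eid' field and viommu_dev an xarray keyed
by endpoint ID):

struct viommu_endpoint {
        /* ... existing fields ... */
        u32                     eid;
};

struct viommu_dev {
        /* ... existing fields ... */
        struct xarray           endpoints;      /* viommu_endpoint, by eid */
};

        /* at probe time: */
        xa_init(&viommu->endpoints);

        /* when registering an endpoint: */
        vdev->eid = eid;
        ret = xa_insert(&viommu->endpoints, eid, vdev, GFP_KERNEL);

        /* when a page request arrives for 'eid': */
        vdev = xa_load(&viommu->endpoints, eid);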

> +
>  struct viommu_request {
>   struct list_head list;
>   void *writeback;
> @@ -573,6 +580,7 @@ static int viommu_probe_endpoint(struct viommu_dev *viommu, struct device *dev)
>   size_t probe_len;
>   struct virtio_iommu_req_probe *probe;
>   struct virtio_iommu_probe_property *prop;
> + struct viommu_ep_entry *ep;
>   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
>   struct viommu_endpoint *vdev = dev_iommu_priv_get(dev);
>  
> @@ -640,6 +648,18 @@ static int viommu_probe_endpoint(struct viommu_dev *viommu, struct device *dev)
>   prop = (void *)probe->properties + cur;
>   type = le16_to_cpu(prop->type) & VIRTIO_IOMMU_PROBE_T_MASK;
>   }
> + if (ret)
> + goto out_free;
> +
> + ep = kzalloc(sizeof(*ep), GFP_KERNEL);
> + if (!ep) {
> + ret = -ENOMEM;
> + goto out_free;
> + }
> + ep->eid = probe->endpoint;
> + ep->vdev = vdev;
> +
> + list_add(&ep->list, &viommu->endpoints);

This should be in viommu_probe_device() (viommu_probe_endpoint() is only
called if F_PROBE is negotiated). I think we need a lock for this
list/xarray
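
For instance, something like the sketch below in viommu_probe_device(),
keyed on the fwspec IDs. It assumes the function's existing error
unwind label; note that xa_insert() already serializes the update with
the xarray's internal lock, so whether an extra mutex is needed
depends on what else must stay consistent:

        struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
        unsigned int i;
        int ret;

        /* Record every endpoint ID this device may fault on. */
        for (i = 0; i < fwspec->num_ids; i++) {
                ret = xa_insert(&viommu->endpoints, fwspec->ids[i], vdev,
                                GFP_KERNEL);
                if (ret)
                        goto err_free_dev;      /* existing error label */
        }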

Thanks,
Jean

>  
>  out_free:
>   kfree(probe);
> @@ -1649,6 +1669,7 @@ static int viommu_probe(struct virtio_device *vdev)
>   viommu->dev = dev;
>   viommu->vdev = vdev;
>   INIT_LIST_HEAD(&viommu->requests);
> + INIT_LIST_HEAD(&viommu->endpoints);
>  
>   ret = viommu_init_vqs(viommu);
>   if (ret)
> -- 
> 2.17.1
> 


Re: [PATCH RFC v1 10/11] uapi/virtio-iommu: Add a new request type to send page response

2021-09-21 Thread Jean-Philippe Brucker
On Fri, Apr 23, 2021 at 03:21:46PM +0530, Vivek Gautam wrote:
> Once the page faults are handled, the response has to be sent to the
> virtio-iommu backend, from where it can be forwarded to the host to
> complete the response to an I/O page fault generated by the device.
> Add a new virtqueue request type to handle this.
> 
> Signed-off-by: Vivek Gautam 
> ---
>  include/uapi/linux/virtio_iommu.h | 18 ++
>  1 file changed, 18 insertions(+)
> 
> diff --git a/include/uapi/linux/virtio_iommu.h b/include/uapi/linux/virtio_iommu.h
> index c12d9b6a7243..1b174b98663a 100644
> --- a/include/uapi/linux/virtio_iommu.h
> +++ b/include/uapi/linux/virtio_iommu.h
> @@ -48,6 +48,7 @@ struct virtio_iommu_config {
>  #define VIRTIO_IOMMU_T_PROBE 0x05
>  #define VIRTIO_IOMMU_T_ATTACH_TABLE  0x06
>  #define VIRTIO_IOMMU_T_INVALIDATE0x07
> +#define VIRTIO_IOMMU_T_PAGE_RESP 0x08
>  
>  /* Status types */
>  #define VIRTIO_IOMMU_S_OK0x00
> @@ -70,6 +71,23 @@ struct virtio_iommu_req_tail {
>   __u8reserved[3];
>  };
>  
> +struct virtio_iommu_req_page_resp {
> + struct virtio_iommu_req_head head;
> + __le32  domain;

I don't think we need this field, since the fault report doesn't come with
a domain.

> + __le32  endpoint;
> +#define VIRTIO_IOMMU_PAGE_RESP_PASID_VALID   (1 << 0)

To be consistent with the rest of the document this would be
VIRTIO_IOMMU_PAGE_RESP_F_PASID_VALID

> + __le32  flags;
> + __le32  pasid;
> + __le32  grpid;
> +#define VIRTIO_IOMMU_PAGE_RESP_SUCCESS   (0x0)
> +#define VIRTIO_IOMMU_PAGE_RESP_INVALID   (0x1)
> +#define VIRTIO_IOMMU_PAGE_RESP_FAILURE   (0x2)
> + __le16  resp_code;
> + __u8 pasid_valid;

This field isn't needed since there already is
VIRTIO_IOMMU_PAGE_RESP_PASID_VALID

> + __u8 reserved[9];
> + struct virtio_iommu_req_tail tail;
> +};

I'd align the size of the struct to 16 bytes, but I don't think that's
strictly necessary.
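
Folding all of the above in (drop 'domain' and 'pasid_valid', rename
the flag, pad to a multiple of 16 bytes), the request could look like
the sketch below; 32 bytes total, assuming the usual 4-byte head and
tail:

struct virtio_iommu_req_page_resp {
        struct virtio_iommu_req_head    head;
        __le32                          endpoint;
#define VIRTIO_IOMMU_PAGE_RESP_F_PASID_VALID    (1 << 0)
        __le32                          flags;
        __le32                          pasid;
        __le32                          grpid;
#define VIRTIO_IOMMU_PAGE_RESP_SUCCESS          (0x0)
#define VIRTIO_IOMMU_PAGE_RESP_INVALID          (0x1)
#define VIRTIO_IOMMU_PAGE_RESP_FAILURE          (0x2)
        __le16                          resp_code;
        __u8                            reserved[6];
        struct virtio_iommu_req_tail    tail;
};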

Thanks,
Jean

> +
>  struct virtio_iommu_req_attach {
>   struct virtio_iommu_req_head head;
>   __le32  domain;
> -- 
> 2.17.1
> 


Re: [PATCH 1/1] virtio/s390: fix virtio-ccw device teardown

2021-09-21 Thread Halil Pasic
On Tue, 21 Sep 2021 15:31:03 +0200
Vineeth Vijayan  wrote:

> On Tue, 2021-09-21 at 05:25 +0200, Halil Pasic wrote:
> > On Mon, 20 Sep 2021 12:07:23 +0200
> > Cornelia Huck  wrote:
> >   
> > > On Mon, Sep 20 2021, Vineeth Vijayan  wrote:
> > >   
> > > > On Mon, 2021-09-20 at 00:39 +0200, Halil Pasic wrote:
> > > > > On Fri, 17 Sep 2021 10:40:20 +0200
> > > > > Cornelia Huck  wrote:
> > > > > 
> > > > ...snip...
> > > > > > > Thanks, if I find time for it, I will try to understand
> > > > > > > this
> > > > > > > better and
> > > > > > > come back with my findings.
> > > > > > >  
> > > > > > > > > * Can virtio_ccw_remove() get called while !cdev->online and
> > > > > > > > >   virtio_ccw_online() is running on a different cpu? If
> > > > > > > > > yes,
> > > > > > > > > what would
> > > > > > > > >   happen then?
> > > > > > > > 
> > > > > > > > All of the remove/online/... etc. callbacks are invoked
> > > > > > > > via the
> > > > > > > > ccw bus
> > > > > > > > code. We have to trust that it gets it correct :) (Or
> > > > > > > > have the
> > > > > > > > common
> > > > > > > > I/O layer maintainers double-check it.)
> > > > > > > >   
> > > > > > > 
> > > > > > > Vineeth, what is your take on this? Are the struct
> > > > > > > ccw_driver
> > > > > > > virtio_ccw_remove and the virtio_ccw_online callbacks
> > > > > > > mutually
> > > > > > > exclusive. Please notice that we may initiate the onlining
> > > > > > > by
> > > > > > > calling ccw_device_set_online() from a workqueue.
> > > > > > > 
> > > > > > > @Conny: I'm not sure what is your definition of 'it gets it
> > > > > > > correct'...
> > > > > > > I doubt CIO can make things 100% foolproof in this
> > > > > > > area.  
> > > > > > 
> > > > > > Not 100% foolproof, but "don't online a device that is in the
> > > > > > process of going away" seems pretty basic to me.
> > > > > > 
> > > > > 
> > > > > I hope Vineeth will chime in on this.
> > > > Considering the online/offline processing: the
> > > > ccw_device_set_offline() function and its online counterpart are
> > > > handled inside the device lock. Also, the online_store() function
> > > > takes care of avoiding concurrent online/offline processing.
> > > > 
> > > > Now, when we consider the unconditional remove of the device:
> > > > I am not familiar with the virtio_ccw driver. My assumptions are
> > > > based on how the CIO/dasd drivers work. If I understand correctly,
> > > > the dasd driver sets different flags to make sure that a
> > > > device_open is prevented while the device is in the process of
> > > > going offline.
> > > 
> > > Hm, if we are invoking the online/offline callbacks under the
> > > device
> > > lock already,   
> > 
> > I believe we have a misunderstanding here. I believe that Vineeth is
> > trying to tell us that online_store_handle_offline() and
> > online_store_handle_online() are called under the device lock of
> > the ccw device. Right, Vineeth?
> Yes. I wanted to bring out both scenarios: the set_offline()/set_online()
> calls and the unconditional-remove call.

I don't understand the paragraph above. I can't map the terms
set_offline/_online() and unconditional-remove call to chunks of code.
:( 

> For set_online, virtio_ccw_online() is also invoked with the ccwlock
> held (ref: ccw_device_set_online).

I don't think virtio_ccw_online() is invoked with the ccwlock held. I
think we call virtio_ccw_online() in this line:
https://elixir.bootlin.com/linux/v5.15-rc2/source/drivers/s390/cio/device.c#L394
and we have released the cdev->ccwlock literally 2 lines above.
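
i.e. the flow there is roughly the following (paraphrased, so please
double-check against the actual cio code):

        /* in ccw_device_set_online(): */
        spin_lock_irq(cdev->ccwlock);
        ret = ccw_device_online(cdev);
        spin_unlock_irq(cdev->ccwlock);
        /* ... wait for the device FSM to settle ... */
        if (cdev->drv->set_online)
                ret = cdev->drv->set_online(cdev);  /* ccwlock not held */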


> > 
> > Conny, I believe, by online/offline callbacks, you mean
> > virtio_ccw_online() and virtio_ccw_offline(), right?
> > 
> > But the thing is that virtio_ccw_online() may get called (and is
> > typically called, AFAICT) with no locks held via:
> > virtio_ccw_probe() --> async_schedule(virtio_ccw_auto_online, cdev)
> > -*-> virtio_ccw_auto_online(cdev) --> ccw_device_set_online(cdev) -->
> > virtio_ccw_online()
> > 
> > Furthermore, after a closer look, I believe that because we don't take
> > a reference to the cdev in probe, we may get virtio_ccw_auto_online()
> > called with an invalid pointer (the pointer is guaranteed to be valid
> > in probe, but because of the async call we have no guarantee that it
> > will run in the context of probe).
> > 
> > Shouldn't we take a reference to the cdev in probe?  
> We just had a quick look at the virtio_ccw_probe() function.
> Did you mean to have a get_device() during the probe() and put_device()
> just after the virtio_ccw_auto_online()?

Yes, that would ensure that cdev pointer is still valid when
virtio_ccw_auto_online() is executed, and that things are cleaned up
properly, I guess. But I'm not 100% sure about all the interactions.
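
Concretely, I had something like this in mind (an untested sketch):

static void virtio_ccw_auto_online(void *data, async_cookie_t cookie)
{
        struct ccw_device *cdev = data;
        int ret;

        ret = ccw_device_set_online(cdev);
        if (ret)
                dev_warn(&cdev->dev, "Failed to set online: %d\n", ret);
        /* Drop the reference taken in virtio_ccw_probe(). */
        put_device(&cdev->dev);
}

static int virtio_ccw_probe(struct ccw_device *cdev)
{
        /* Keep cdev alive until the async onlining has run. */
        get_device(&cdev->dev);
        async_schedule(virtio_ccw_auto_online, cdev);
        return 0;
}
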
AFAIR ccw_device_set_online(cdev) would bail out if !drv. But then
we have the case where we already assigned it to a new driver