Re: [PATCH v8 4/6] LoongArch: KVM: Add vcpu search support from physical cpuid

2024-05-08 Thread maobibo




On 2024/5/8 1:00 PM, Huacai Chen wrote:

On Tue, May 7, 2024 at 11:06 AM maobibo  wrote:




On 2024/5/7 10:05 AM, Huacai Chen wrote:

On Tue, May 7, 2024 at 9:40 AM maobibo  wrote:




On 2024/5/6 10:17 PM, Huacai Chen wrote:

On Mon, May 6, 2024 at 6:05 PM maobibo  wrote:




On 2024/5/6 5:40 PM, Huacai Chen wrote:

On Mon, May 6, 2024 at 5:35 PM maobibo  wrote:




On 2024/5/6 4:59 PM, Huacai Chen wrote:

On Mon, May 6, 2024 at 4:18 PM maobibo  wrote:




On 2024/5/6 3:06 PM, Huacai Chen wrote:

Hi, Bibo,

On Mon, May 6, 2024 at 2:36 PM maobibo  wrote:




On 2024/5/6 9:49 AM, Huacai Chen wrote:

Hi, Bibo,

On Sun, Apr 28, 2024 at 6:05 PM Bibo Mao  wrote:


Physical cpuid is used for interrupt routing for irqchips such as the
ipi/msi/extioi interrupt controllers. The physical cpuid is stored
in the CSR register LOONGARCH_CSR_CPUID; it cannot be changed once a vcpu
is created, and the physical cpuid of two vcpus cannot be the same.

Different irqchips have different size declarations for the physical cpuid:
the max cpuid value for CSR LOONGARCH_CSR_CPUID on 3A5000 is 512, the max cpuid
supported by the IPI hardware is 1024, 256 for the extioi irqchip, and 65536
for the MSI irqchip.

The smallest value from all the interrupt controllers is selected now,
and the max cpuid size is defined as 256 by KVM, which comes from the
extioi irqchip.

Signed-off-by: Bibo Mao 
---
arch/loongarch/include/asm/kvm_host.h | 26 
arch/loongarch/include/asm/kvm_vcpu.h |  1 +
arch/loongarch/kvm/vcpu.c | 93 ++-
arch/loongarch/kvm/vm.c   | 11 
4 files changed, 130 insertions(+), 1 deletion(-)

diff --git a/arch/loongarch/include/asm/kvm_host.h 
b/arch/loongarch/include/asm/kvm_host.h
index 2d62f7b0d377..3ba16ef1fe69 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -64,6 +64,30 @@ struct kvm_world_switch {

#define MAX_PGTABLE_LEVELS 4

+/*
+ * Physical cpu id is used for interrupt routing; there are different
+ * definitions of the physical cpuid on different hardware.
+ *  For the LOONGARCH_CSR_CPUID register, the max cpuid size is 512
+ *  For IPI HW, the max dest CPUID size is 1024
+ *  For the extioi interrupt controller, the max dest CPUID size is 256
+ *  For the MSI interrupt controller, the max supported CPUID size is 65536
+ *
+ * Currently the max CPUID is defined as 256 for the KVM hypervisor; in the
+ * future it will be expanded to 4096, including at most 16 packages. And
+ * every package supports at most 256 vcpus.
+ */
+#define KVM_MAX_PHYID  256
+
+struct kvm_phyid_info {
+   struct kvm_vcpu *vcpu;
+   boolenabled;
+};
+
+struct kvm_phyid_map {
+   int max_phyid;
+   struct kvm_phyid_info phys_map[KVM_MAX_PHYID];
+};
+
struct kvm_arch {
   /* Guest physical mm */
   kvm_pte_t *pgd;
@@ -71,6 +95,8 @@ struct kvm_arch {
   unsigned long invalid_ptes[MAX_PGTABLE_LEVELS];
   unsigned int  pte_shifts[MAX_PGTABLE_LEVELS];
   unsigned int  root_level;
+   spinlock_tphyid_map_lock;
+   struct kvm_phyid_map  *phyid_map;

   s64 time_offset;
   struct kvm_context __percpu *vmcs;
diff --git a/arch/loongarch/include/asm/kvm_vcpu.h 
b/arch/loongarch/include/asm/kvm_vcpu.h
index 0cb4fdb8a9b5..9f53950959da 100644
--- a/arch/loongarch/include/asm/kvm_vcpu.h
+++ b/arch/loongarch/include/asm/kvm_vcpu.h
@@ -81,6 +81,7 @@ void kvm_save_timer(struct kvm_vcpu *vcpu);
void kvm_restore_timer(struct kvm_vcpu *vcpu);

int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct 
kvm_interrupt *irq);
+struct kvm_vcpu *kvm_get_vcpu_by_cpuid(struct kvm *kvm, int cpuid);

/*
 * Loongarch KVM guest interrupt handling
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 3a8779065f73..b633fd28b8db 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -274,6 +274,95 @@ static int _kvm_getcsr(struct kvm_vcpu *vcpu, unsigned int 
id, u64 *val)
   return 0;
}

+static inline int kvm_set_cpuid(struct kvm_vcpu *vcpu, u64 val)
+{
+   int cpuid;
+   struct loongarch_csrs *csr = vcpu->arch.csr;
+   struct kvm_phyid_map  *map;
+
+   if (val >= KVM_MAX_PHYID)
+   return -EINVAL;
+
+   cpuid = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_ESTAT);
+   map = vcpu->kvm->arch.phyid_map;
+   spin_lock(&vcpu->kvm->arch.phyid_map_lock);
+   if (map->phys_map[cpuid].enabled) {
+   /*
+* Cpuid is already set before
+* Forbid changing different cpuid at runtime
+*/
+   if (cpuid != val) {
+   /*
+* Cpuid 0 is initial value for vcpu, maybe invalid
+* unset value for vcpu
+*/
+   if (cpuid) {
+   spin_unlock(>
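
A minimal sketch of what the declared kvm_get_vcpu_by_cpuid() helper can look
like on top of the phyid_map introduced above; this is an illustration built
from the structures in the patch, not the exact code from the series:

struct kvm_vcpu *kvm_get_vcpu_by_cpuid(struct kvm *kvm, int cpuid)
{
        struct kvm_phyid_map *map;

        if (cpuid < 0 || cpuid >= KVM_MAX_PHYID)
                return NULL;

        map = kvm->arch.phyid_map;
        if (!map->phys_map[cpuid].enabled)
                return NULL;

        /* The owning vcpu is recorded when the guest writes CSR.CPUID */
        return map->phys_map[cpuid].vcpu;
}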

Re: [PATCH v8 4/6] LoongArch: KVM: Add vcpu search support from physical cpuid

2024-05-06 Thread maobibo




On 2024/5/7 10:05 AM, Huacai Chen wrote:

On Tue, May 7, 2024 at 9:40 AM maobibo  wrote:




On 2024/5/6 10:17 PM, Huacai Chen wrote:

On Mon, May 6, 2024 at 6:05 PM maobibo  wrote:




On 2024/5/6 5:40 PM, Huacai Chen wrote:

On Mon, May 6, 2024 at 5:35 PM maobibo  wrote:




On 2024/5/6 4:59 PM, Huacai Chen wrote:

On Mon, May 6, 2024 at 4:18 PM maobibo  wrote:




On 2024/5/6 3:06 PM, Huacai Chen wrote:

Hi, Bibo,

On Mon, May 6, 2024 at 2:36 PM maobibo  wrote:




On 2024/5/6 9:49 AM, Huacai Chen wrote:

Hi, Bibo,

On Sun, Apr 28, 2024 at 6:05 PM Bibo Mao  wrote:


Physical cpuid is used for interrupt routing for irqchips such as the
ipi/msi/extioi interrupt controllers. The physical cpuid is stored
in the CSR register LOONGARCH_CSR_CPUID; it cannot be changed once a vcpu
is created, and the physical cpuid of two vcpus cannot be the same.

Different irqchips have different size declarations for the physical cpuid:
the max cpuid value for CSR LOONGARCH_CSR_CPUID on 3A5000 is 512, the max cpuid
supported by the IPI hardware is 1024, 256 for the extioi irqchip, and 65536
for the MSI irqchip.

The smallest value from all the interrupt controllers is selected now,
and the max cpuid size is defined as 256 by KVM, which comes from the
extioi irqchip.

Signed-off-by: Bibo Mao 
---
   arch/loongarch/include/asm/kvm_host.h | 26 
   arch/loongarch/include/asm/kvm_vcpu.h |  1 +
   arch/loongarch/kvm/vcpu.c | 93 ++-
   arch/loongarch/kvm/vm.c   | 11 
   4 files changed, 130 insertions(+), 1 deletion(-)

diff --git a/arch/loongarch/include/asm/kvm_host.h 
b/arch/loongarch/include/asm/kvm_host.h
index 2d62f7b0d377..3ba16ef1fe69 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -64,6 +64,30 @@ struct kvm_world_switch {

   #define MAX_PGTABLE_LEVELS 4

+/*
+ * Physical cpu id is used for interrupt routing; there are different
+ * definitions of the physical cpuid on different hardware.
+ *  For the LOONGARCH_CSR_CPUID register, the max cpuid size is 512
+ *  For IPI HW, the max dest CPUID size is 1024
+ *  For the extioi interrupt controller, the max dest CPUID size is 256
+ *  For the MSI interrupt controller, the max supported CPUID size is 65536
+ *
+ * Currently the max CPUID is defined as 256 for the KVM hypervisor; in the
+ * future it will be expanded to 4096, including at most 16 packages. And
+ * every package supports at most 256 vcpus.
+ */
+#define KVM_MAX_PHYID  256
+
+struct kvm_phyid_info {
+   struct kvm_vcpu *vcpu;
+   boolenabled;
+};
+
+struct kvm_phyid_map {
+   int max_phyid;
+   struct kvm_phyid_info phys_map[KVM_MAX_PHYID];
+};
+
   struct kvm_arch {
  /* Guest physical mm */
  kvm_pte_t *pgd;
@@ -71,6 +95,8 @@ struct kvm_arch {
  unsigned long invalid_ptes[MAX_PGTABLE_LEVELS];
  unsigned int  pte_shifts[MAX_PGTABLE_LEVELS];
  unsigned int  root_level;
+   spinlock_tphyid_map_lock;
+   struct kvm_phyid_map  *phyid_map;

  s64 time_offset;
  struct kvm_context __percpu *vmcs;
diff --git a/arch/loongarch/include/asm/kvm_vcpu.h 
b/arch/loongarch/include/asm/kvm_vcpu.h
index 0cb4fdb8a9b5..9f53950959da 100644
--- a/arch/loongarch/include/asm/kvm_vcpu.h
+++ b/arch/loongarch/include/asm/kvm_vcpu.h
@@ -81,6 +81,7 @@ void kvm_save_timer(struct kvm_vcpu *vcpu);
   void kvm_restore_timer(struct kvm_vcpu *vcpu);

   int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt 
*irq);
+struct kvm_vcpu *kvm_get_vcpu_by_cpuid(struct kvm *kvm, int cpuid);

   /*
* Loongarch KVM guest interrupt handling
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 3a8779065f73..b633fd28b8db 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -274,6 +274,95 @@ static int _kvm_getcsr(struct kvm_vcpu *vcpu, unsigned int 
id, u64 *val)
  return 0;
   }

+static inline int kvm_set_cpuid(struct kvm_vcpu *vcpu, u64 val)
+{
+   int cpuid;
+   struct loongarch_csrs *csr = vcpu->arch.csr;
+   struct kvm_phyid_map  *map;
+
+   if (val >= KVM_MAX_PHYID)
+   return -EINVAL;
+
+   cpuid = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_ESTAT);
+   map = vcpu->kvm->arch.phyid_map;
+   spin_lock(&vcpu->kvm->arch.phyid_map_lock);
+   if (map->phys_map[cpuid].enabled) {
+   /*
+* Cpuid is already set before
+* Forbid changing different cpuid at runtime
+*/
+   if (cpuid != val) {
+   /*
+* Cpuid 0 is initial value for vcpu, maybe invalid
+* unset value for vcpu
+*/
+   if (cpuid) {
+   spin_unlock(&vcpu->kvm->arch.phyid_map_lock);
+   return -EINVAL;
+

Re: [PATCH v8 4/6] LoongArch: KVM: Add vcpu search support from physical cpuid

2024-05-06 Thread maobibo




On 2024/5/6 10:17 PM, Huacai Chen wrote:

On Mon, May 6, 2024 at 6:05 PM maobibo  wrote:




On 2024/5/6 5:40 PM, Huacai Chen wrote:

On Mon, May 6, 2024 at 5:35 PM maobibo  wrote:




On 2024/5/6 4:59 PM, Huacai Chen wrote:

On Mon, May 6, 2024 at 4:18 PM maobibo  wrote:




On 2024/5/6 3:06 PM, Huacai Chen wrote:

Hi, Bibo,

On Mon, May 6, 2024 at 2:36 PM maobibo  wrote:




On 2024/5/6 9:49 AM, Huacai Chen wrote:

Hi, Bibo,

On Sun, Apr 28, 2024 at 6:05 PM Bibo Mao  wrote:


Physical cpuid is used for interrupt routing for irqchips such as the
ipi/msi/extioi interrupt controllers. The physical cpuid is stored
in the CSR register LOONGARCH_CSR_CPUID; it cannot be changed once a vcpu
is created, and the physical cpuid of two vcpus cannot be the same.

Different irqchips have different size declarations for the physical cpuid:
the max cpuid value for CSR LOONGARCH_CSR_CPUID on 3A5000 is 512, the max cpuid
supported by the IPI hardware is 1024, 256 for the extioi irqchip, and 65536
for the MSI irqchip.

The smallest value from all the interrupt controllers is selected now,
and the max cpuid size is defined as 256 by KVM, which comes from the
extioi irqchip.

Signed-off-by: Bibo Mao 
---
  arch/loongarch/include/asm/kvm_host.h | 26 
  arch/loongarch/include/asm/kvm_vcpu.h |  1 +
  arch/loongarch/kvm/vcpu.c | 93 ++-
  arch/loongarch/kvm/vm.c   | 11 
  4 files changed, 130 insertions(+), 1 deletion(-)

diff --git a/arch/loongarch/include/asm/kvm_host.h 
b/arch/loongarch/include/asm/kvm_host.h
index 2d62f7b0d377..3ba16ef1fe69 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -64,6 +64,30 @@ struct kvm_world_switch {

  #define MAX_PGTABLE_LEVELS 4

+/*
+ * Physical cpu id is used for interrupt routing; there are different
+ * definitions of the physical cpuid on different hardware.
+ *  For the LOONGARCH_CSR_CPUID register, the max cpuid size is 512
+ *  For IPI HW, the max dest CPUID size is 1024
+ *  For the extioi interrupt controller, the max dest CPUID size is 256
+ *  For the MSI interrupt controller, the max supported CPUID size is 65536
+ *
+ * Currently the max CPUID is defined as 256 for the KVM hypervisor; in the
+ * future it will be expanded to 4096, including at most 16 packages. And
+ * every package supports at most 256 vcpus.
+ */
+#define KVM_MAX_PHYID  256
+
+struct kvm_phyid_info {
+   struct kvm_vcpu *vcpu;
+   boolenabled;
+};
+
+struct kvm_phyid_map {
+   int max_phyid;
+   struct kvm_phyid_info phys_map[KVM_MAX_PHYID];
+};
+
  struct kvm_arch {
 /* Guest physical mm */
 kvm_pte_t *pgd;
@@ -71,6 +95,8 @@ struct kvm_arch {
 unsigned long invalid_ptes[MAX_PGTABLE_LEVELS];
 unsigned int  pte_shifts[MAX_PGTABLE_LEVELS];
 unsigned int  root_level;
+   spinlock_tphyid_map_lock;
+   struct kvm_phyid_map  *phyid_map;

 s64 time_offset;
 struct kvm_context __percpu *vmcs;
diff --git a/arch/loongarch/include/asm/kvm_vcpu.h 
b/arch/loongarch/include/asm/kvm_vcpu.h
index 0cb4fdb8a9b5..9f53950959da 100644
--- a/arch/loongarch/include/asm/kvm_vcpu.h
+++ b/arch/loongarch/include/asm/kvm_vcpu.h
@@ -81,6 +81,7 @@ void kvm_save_timer(struct kvm_vcpu *vcpu);
  void kvm_restore_timer(struct kvm_vcpu *vcpu);

  int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt 
*irq);
+struct kvm_vcpu *kvm_get_vcpu_by_cpuid(struct kvm *kvm, int cpuid);

  /*
   * Loongarch KVM guest interrupt handling
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 3a8779065f73..b633fd28b8db 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -274,6 +274,95 @@ static int _kvm_getcsr(struct kvm_vcpu *vcpu, unsigned int 
id, u64 *val)
 return 0;
  }

+static inline int kvm_set_cpuid(struct kvm_vcpu *vcpu, u64 val)
+{
+   int cpuid;
+   struct loongarch_csrs *csr = vcpu->arch.csr;
+   struct kvm_phyid_map  *map;
+
+   if (val >= KVM_MAX_PHYID)
+   return -EINVAL;
+
+   cpuid = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_ESTAT);
+   map = vcpu->kvm->arch.phyid_map;
+   spin_lock(&vcpu->kvm->arch.phyid_map_lock);
+   if (map->phys_map[cpuid].enabled) {
+   /*
+* Cpuid is already set before
+* Forbid changing different cpuid at runtime
+*/
+   if (cpuid != val) {
+   /*
+* Cpuid 0 is initial value for vcpu, maybe invalid
+* unset value for vcpu
+*/
+   if (cpuid) {
+   spin_unlock(&vcpu->kvm->arch.phyid_map_lock);
+   return -EINVAL;
+   }
+   } else {
+/* Discard duplicated cpuid set */
+   spin_unloc

Re: [PATCH v8 4/6] LoongArch: KVM: Add vcpu search support from physical cpuid

2024-05-06 Thread maobibo




On 2024/5/6 5:40 PM, Huacai Chen wrote:

On Mon, May 6, 2024 at 5:35 PM maobibo  wrote:




On 2024/5/6 4:59 PM, Huacai Chen wrote:

On Mon, May 6, 2024 at 4:18 PM maobibo  wrote:




On 2024/5/6 3:06 PM, Huacai Chen wrote:

Hi, Bibo,

On Mon, May 6, 2024 at 2:36 PM maobibo  wrote:




On 2024/5/6 9:49 AM, Huacai Chen wrote:

Hi, Bibo,

On Sun, Apr 28, 2024 at 6:05 PM Bibo Mao  wrote:


Physical cpuid is used for interrupt routing for irqchips such as the
ipi/msi/extioi interrupt controllers. The physical cpuid is stored
in the CSR register LOONGARCH_CSR_CPUID; it cannot be changed once a vcpu
is created, and the physical cpuid of two vcpus cannot be the same.

Different irqchips have different size declarations for the physical cpuid:
the max cpuid value for CSR LOONGARCH_CSR_CPUID on 3A5000 is 512, the max cpuid
supported by the IPI hardware is 1024, 256 for the extioi irqchip, and 65536
for the MSI irqchip.

The smallest value from all the interrupt controllers is selected now,
and the max cpuid size is defined as 256 by KVM, which comes from the
extioi irqchip.

Signed-off-by: Bibo Mao 
---
 arch/loongarch/include/asm/kvm_host.h | 26 
 arch/loongarch/include/asm/kvm_vcpu.h |  1 +
 arch/loongarch/kvm/vcpu.c | 93 ++-
 arch/loongarch/kvm/vm.c   | 11 
 4 files changed, 130 insertions(+), 1 deletion(-)

diff --git a/arch/loongarch/include/asm/kvm_host.h 
b/arch/loongarch/include/asm/kvm_host.h
index 2d62f7b0d377..3ba16ef1fe69 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -64,6 +64,30 @@ struct kvm_world_switch {

 #define MAX_PGTABLE_LEVELS 4

+/*
+ * Physical cpu id is used for interrupt routing; there are different
+ * definitions of the physical cpuid on different hardware.
+ *  For the LOONGARCH_CSR_CPUID register, the max cpuid size is 512
+ *  For IPI HW, the max dest CPUID size is 1024
+ *  For the extioi interrupt controller, the max dest CPUID size is 256
+ *  For the MSI interrupt controller, the max supported CPUID size is 65536
+ *
+ * Currently the max CPUID is defined as 256 for the KVM hypervisor; in the
+ * future it will be expanded to 4096, including at most 16 packages. And
+ * every package supports at most 256 vcpus.
+ */
+#define KVM_MAX_PHYID  256
+
+struct kvm_phyid_info {
+   struct kvm_vcpu *vcpu;
+   boolenabled;
+};
+
+struct kvm_phyid_map {
+   int max_phyid;
+   struct kvm_phyid_info phys_map[KVM_MAX_PHYID];
+};
+
 struct kvm_arch {
/* Guest physical mm */
kvm_pte_t *pgd;
@@ -71,6 +95,8 @@ struct kvm_arch {
unsigned long invalid_ptes[MAX_PGTABLE_LEVELS];
unsigned int  pte_shifts[MAX_PGTABLE_LEVELS];
unsigned int  root_level;
+   spinlock_tphyid_map_lock;
+   struct kvm_phyid_map  *phyid_map;

s64 time_offset;
struct kvm_context __percpu *vmcs;
diff --git a/arch/loongarch/include/asm/kvm_vcpu.h 
b/arch/loongarch/include/asm/kvm_vcpu.h
index 0cb4fdb8a9b5..9f53950959da 100644
--- a/arch/loongarch/include/asm/kvm_vcpu.h
+++ b/arch/loongarch/include/asm/kvm_vcpu.h
@@ -81,6 +81,7 @@ void kvm_save_timer(struct kvm_vcpu *vcpu);
 void kvm_restore_timer(struct kvm_vcpu *vcpu);

 int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt 
*irq);
+struct kvm_vcpu *kvm_get_vcpu_by_cpuid(struct kvm *kvm, int cpuid);

 /*
  * Loongarch KVM guest interrupt handling
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 3a8779065f73..b633fd28b8db 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -274,6 +274,95 @@ static int _kvm_getcsr(struct kvm_vcpu *vcpu, unsigned int 
id, u64 *val)
return 0;
 }

+static inline int kvm_set_cpuid(struct kvm_vcpu *vcpu, u64 val)
+{
+   int cpuid;
+   struct loongarch_csrs *csr = vcpu->arch.csr;
+   struct kvm_phyid_map  *map;
+
+   if (val >= KVM_MAX_PHYID)
+   return -EINVAL;
+
+   cpuid = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_ESTAT);
+   map = vcpu->kvm->arch.phyid_map;
+   spin_lock(&vcpu->kvm->arch.phyid_map_lock);
+   if (map->phys_map[cpuid].enabled) {
+   /*
+* Cpuid is already set before
+* Forbid changing different cpuid at runtime
+*/
+   if (cpuid != val) {
+   /*
+* Cpuid 0 is initial value for vcpu, maybe invalid
+* unset value for vcpu
+*/
+   if (cpuid) {
+   spin_unlock(&vcpu->kvm->arch.phyid_map_lock);
+   return -EINVAL;
+   }
+   } else {
+/* Discard duplicated cpuid set */
+   spin_unlock(&vcpu->kvm->arch.phyid_map_lock);
+   return 0;
+   }
+   }

I have changed the log

Re: [PATCH v8 4/6] LoongArch: KVM: Add vcpu search support from physical cpuid

2024-05-06 Thread maobibo




On 2024/5/6 4:59 PM, Huacai Chen wrote:

On Mon, May 6, 2024 at 4:18 PM maobibo  wrote:




On 2024/5/6 3:06 PM, Huacai Chen wrote:

Hi, Bibo,

On Mon, May 6, 2024 at 2:36 PM maobibo  wrote:




On 2024/5/6 9:49 AM, Huacai Chen wrote:

Hi, Bibo,

On Sun, Apr 28, 2024 at 6:05 PM Bibo Mao  wrote:


Physical cpuid is used for interrupt routing for irqchips such as the
ipi/msi/extioi interrupt controllers. The physical cpuid is stored
in the CSR register LOONGARCH_CSR_CPUID; it cannot be changed once a vcpu
is created, and the physical cpuid of two vcpus cannot be the same.

Different irqchips have different size declarations for the physical cpuid:
the max cpuid value for CSR LOONGARCH_CSR_CPUID on 3A5000 is 512, the max cpuid
supported by the IPI hardware is 1024, 256 for the extioi irqchip, and 65536
for the MSI irqchip.

The smallest value from all the interrupt controllers is selected now,
and the max cpuid size is defined as 256 by KVM, which comes from the
extioi irqchip.

Signed-off-by: Bibo Mao 
---
arch/loongarch/include/asm/kvm_host.h | 26 
arch/loongarch/include/asm/kvm_vcpu.h |  1 +
arch/loongarch/kvm/vcpu.c | 93 ++-
arch/loongarch/kvm/vm.c   | 11 
4 files changed, 130 insertions(+), 1 deletion(-)

diff --git a/arch/loongarch/include/asm/kvm_host.h 
b/arch/loongarch/include/asm/kvm_host.h
index 2d62f7b0d377..3ba16ef1fe69 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -64,6 +64,30 @@ struct kvm_world_switch {

#define MAX_PGTABLE_LEVELS 4

+/*
+ * Physical cpu id is used for interrupt routing; there are different
+ * definitions of the physical cpuid on different hardware.
+ *  For the LOONGARCH_CSR_CPUID register, the max cpuid size is 512
+ *  For IPI HW, the max dest CPUID size is 1024
+ *  For the extioi interrupt controller, the max dest CPUID size is 256
+ *  For the MSI interrupt controller, the max supported CPUID size is 65536
+ *
+ * Currently the max CPUID is defined as 256 for the KVM hypervisor; in the
+ * future it will be expanded to 4096, including at most 16 packages. And
+ * every package supports at most 256 vcpus.
+ */
+#define KVM_MAX_PHYID  256
+
+struct kvm_phyid_info {
+   struct kvm_vcpu *vcpu;
+   boolenabled;
+};
+
+struct kvm_phyid_map {
+   int max_phyid;
+   struct kvm_phyid_info phys_map[KVM_MAX_PHYID];
+};
+
struct kvm_arch {
   /* Guest physical mm */
   kvm_pte_t *pgd;
@@ -71,6 +95,8 @@ struct kvm_arch {
   unsigned long invalid_ptes[MAX_PGTABLE_LEVELS];
   unsigned int  pte_shifts[MAX_PGTABLE_LEVELS];
   unsigned int  root_level;
+   spinlock_tphyid_map_lock;
+   struct kvm_phyid_map  *phyid_map;

   s64 time_offset;
   struct kvm_context __percpu *vmcs;
diff --git a/arch/loongarch/include/asm/kvm_vcpu.h 
b/arch/loongarch/include/asm/kvm_vcpu.h
index 0cb4fdb8a9b5..9f53950959da 100644
--- a/arch/loongarch/include/asm/kvm_vcpu.h
+++ b/arch/loongarch/include/asm/kvm_vcpu.h
@@ -81,6 +81,7 @@ void kvm_save_timer(struct kvm_vcpu *vcpu);
void kvm_restore_timer(struct kvm_vcpu *vcpu);

int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt 
*irq);
+struct kvm_vcpu *kvm_get_vcpu_by_cpuid(struct kvm *kvm, int cpuid);

/*
 * Loongarch KVM guest interrupt handling
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 3a8779065f73..b633fd28b8db 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -274,6 +274,95 @@ static int _kvm_getcsr(struct kvm_vcpu *vcpu, unsigned int 
id, u64 *val)
   return 0;
}

+static inline int kvm_set_cpuid(struct kvm_vcpu *vcpu, u64 val)
+{
+   int cpuid;
+   struct loongarch_csrs *csr = vcpu->arch.csr;
+   struct kvm_phyid_map  *map;
+
+   if (val >= KVM_MAX_PHYID)
+   return -EINVAL;
+
+   cpuid = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_ESTAT);
+   map = vcpu->kvm->arch.phyid_map;
+   spin_lock(&vcpu->kvm->arch.phyid_map_lock);
+   if (map->phys_map[cpuid].enabled) {
+   /*
+* Cpuid is already set before
+* Forbid changing different cpuid at runtime
+*/
+   if (cpuid != val) {
+   /*
+* Cpuid 0 is initial value for vcpu, maybe invalid
+* unset value for vcpu
+*/
+   if (cpuid) {
+   spin_unlock(&vcpu->kvm->arch.phyid_map_lock);
+   return -EINVAL;
+   }
+   } else {
+/* Discard duplicated cpuid set */
+   spin_unlock(&vcpu->kvm->arch.phyid_map_lock);
+   return 0;
+   }
+   }

I have changed the logic and comments when I apply, you can double
check whether it is correct.

I checkout the latest vers

Re: [PATCH v8 4/6] LoongArch: KVM: Add vcpu search support from physical cpuid

2024-05-06 Thread maobibo




On 2024/5/6 3:06 PM, Huacai Chen wrote:

Hi, Bibo,

On Mon, May 6, 2024 at 2:36 PM maobibo  wrote:




On 2024/5/6 9:49 AM, Huacai Chen wrote:

Hi, Bibo,

On Sun, Apr 28, 2024 at 6:05 PM Bibo Mao  wrote:


Physical cpuid is used for interrupt routing for irqchips such as the
ipi/msi/extioi interrupt controllers. The physical cpuid is stored
in the CSR register LOONGARCH_CSR_CPUID; it cannot be changed once a vcpu
is created, and the physical cpuid of two vcpus cannot be the same.

Different irqchips have different size declarations for the physical cpuid:
the max cpuid value for CSR LOONGARCH_CSR_CPUID on 3A5000 is 512, the max cpuid
supported by the IPI hardware is 1024, 256 for the extioi irqchip, and 65536
for the MSI irqchip.

The smallest value from all the interrupt controllers is selected now,
and the max cpuid size is defined as 256 by KVM, which comes from the
extioi irqchip.

Signed-off-by: Bibo Mao 
---
   arch/loongarch/include/asm/kvm_host.h | 26 
   arch/loongarch/include/asm/kvm_vcpu.h |  1 +
   arch/loongarch/kvm/vcpu.c | 93 ++-
   arch/loongarch/kvm/vm.c   | 11 
   4 files changed, 130 insertions(+), 1 deletion(-)

diff --git a/arch/loongarch/include/asm/kvm_host.h 
b/arch/loongarch/include/asm/kvm_host.h
index 2d62f7b0d377..3ba16ef1fe69 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -64,6 +64,30 @@ struct kvm_world_switch {

   #define MAX_PGTABLE_LEVELS 4

+/*
+ * Physical cpu id is used for interrupt routing; there are different
+ * definitions of the physical cpuid on different hardware.
+ *  For the LOONGARCH_CSR_CPUID register, the max cpuid size is 512
+ *  For IPI HW, the max dest CPUID size is 1024
+ *  For the extioi interrupt controller, the max dest CPUID size is 256
+ *  For the MSI interrupt controller, the max supported CPUID size is 65536
+ *
+ * Currently the max CPUID is defined as 256 for the KVM hypervisor; in the
+ * future it will be expanded to 4096, including at most 16 packages. And
+ * every package supports at most 256 vcpus.
+ */
+#define KVM_MAX_PHYID  256
+
+struct kvm_phyid_info {
+   struct kvm_vcpu *vcpu;
+   boolenabled;
+};
+
+struct kvm_phyid_map {
+   int max_phyid;
+   struct kvm_phyid_info phys_map[KVM_MAX_PHYID];
+};
+
   struct kvm_arch {
  /* Guest physical mm */
  kvm_pte_t *pgd;
@@ -71,6 +95,8 @@ struct kvm_arch {
  unsigned long invalid_ptes[MAX_PGTABLE_LEVELS];
  unsigned int  pte_shifts[MAX_PGTABLE_LEVELS];
  unsigned int  root_level;
+   spinlock_tphyid_map_lock;
+   struct kvm_phyid_map  *phyid_map;

  s64 time_offset;
  struct kvm_context __percpu *vmcs;
diff --git a/arch/loongarch/include/asm/kvm_vcpu.h 
b/arch/loongarch/include/asm/kvm_vcpu.h
index 0cb4fdb8a9b5..9f53950959da 100644
--- a/arch/loongarch/include/asm/kvm_vcpu.h
+++ b/arch/loongarch/include/asm/kvm_vcpu.h
@@ -81,6 +81,7 @@ void kvm_save_timer(struct kvm_vcpu *vcpu);
   void kvm_restore_timer(struct kvm_vcpu *vcpu);

   int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt 
*irq);
+struct kvm_vcpu *kvm_get_vcpu_by_cpuid(struct kvm *kvm, int cpuid);

   /*
* Loongarch KVM guest interrupt handling
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 3a8779065f73..b633fd28b8db 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -274,6 +274,95 @@ static int _kvm_getcsr(struct kvm_vcpu *vcpu, unsigned int 
id, u64 *val)
  return 0;
   }

+static inline int kvm_set_cpuid(struct kvm_vcpu *vcpu, u64 val)
+{
+   int cpuid;
+   struct loongarch_csrs *csr = vcpu->arch.csr;
+   struct kvm_phyid_map  *map;
+
+   if (val >= KVM_MAX_PHYID)
+   return -EINVAL;
+
+   cpuid = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_ESTAT);
+   map = vcpu->kvm->arch.phyid_map;
+   spin_lock(&vcpu->kvm->arch.phyid_map_lock);
+   if (map->phys_map[cpuid].enabled) {
+   /*
+* Cpuid is already set before
+* Forbid changing different cpuid at runtime
+*/
+   if (cpuid != val) {
+   /*
+* Cpuid 0 is initial value for vcpu, maybe invalid
+* unset value for vcpu
+*/
+   if (cpuid) {
+   spin_unlock(&vcpu->kvm->arch.phyid_map_lock);
+   return -EINVAL;
+   }
+   } else {
+/* Discard duplicated cpuid set */
+   spin_unlock(&vcpu->kvm->arch.phyid_map_lock);
+   return 0;
+   }
+   }

I have changed the logic and comments when I apply, you can double
check whether it is correct.

I checkout the latest version, the modification in function
kvm_set_cpuid() is good for me.

Now the modified version is like this:

+

Re: [PATCH v8 6/6] LoongArch: Add pv ipi support on guest kernel side

2024-05-06 Thread maobibo




On 2024/5/6 9:53 AM, Huacai Chen wrote:

Hi, Bibo,

On Sun, Apr 28, 2024 at 6:05 PM Bibo Mao  wrote:


PARAVIRT option and pv ipi are added on the guest kernel side, and function
pv_ipi_init() adds the ipi sending and ipi receiving hooks. This function
first checks whether the system runs in VM mode. If the kernel runs in VM mode,
it calls kvm_para_available() to detect the current hypervisor
type. Now only KVM type detection is supported; the paravirt functions can
work only if the current hypervisor type is KVM, since only KVM is
supported on LoongArch now.

PV IPI uses virtual IPI sender and virtual IPI receiver functions. With the
virtual IPI sender, the ipi message is stored in DDR memory rather than in
emulated HW. IPI multicast is supported, and 128 vcpus can receive IPIs
at the same time, like the x86 KVM method. The hypercall method is used for
IPI sending.

With the virtual IPI receiver, HW SW0 is used rather than the real IPI HW. Since
a VCPU has a separate HW SW0 like the HW timer, there is no trap in the IPI
interrupt acknowledge. And since the IPI message is stored in DDR, there is no
trap to get the IPI message.

Signed-off-by: Bibo Mao 
---
  arch/loongarch/Kconfig|   9 ++
  arch/loongarch/include/asm/hardirq.h  |   1 +
  arch/loongarch/include/asm/paravirt.h |  27 
  .../include/asm/paravirt_api_clock.h  |   1 +
  arch/loongarch/kernel/Makefile|   1 +
  arch/loongarch/kernel/irq.c   |   2 +-
  arch/loongarch/kernel/paravirt.c  | 151 ++
  arch/loongarch/kernel/smp.c   |   4 +-
  8 files changed, 194 insertions(+), 2 deletions(-)
  create mode 100644 arch/loongarch/include/asm/paravirt.h
  create mode 100644 arch/loongarch/include/asm/paravirt_api_clock.h
  create mode 100644 arch/loongarch/kernel/paravirt.c

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 54ad04dacdee..0a1540a8853e 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -583,6 +583,15 @@ config CPU_HAS_PREFETCH
 bool
 default y

+config PARAVIRT
+   bool "Enable paravirtualization code"
+   depends on AS_HAS_LVZ_EXTENSION
+   help
+  This changes the kernel so it can modify itself when it is run
+ under a hypervisor, potentially improving performance significantly
+ over full virtualization.  However, when run without a hypervisor
+ the kernel is theoretically slower and slightly larger.
+
  config ARCH_SUPPORTS_KEXEC
 def_bool y

diff --git a/arch/loongarch/include/asm/hardirq.h 
b/arch/loongarch/include/asm/hardirq.h
index 9f0038e19c7f..b26d596a73aa 100644
--- a/arch/loongarch/include/asm/hardirq.h
+++ b/arch/loongarch/include/asm/hardirq.h
@@ -21,6 +21,7 @@ enum ipi_msg_type {
  typedef struct {
 unsigned int ipi_irqs[NR_IPI];
 unsigned int __softirq_pending;
+   atomic_t message ____cacheline_aligned_in_smp;
   } ____cacheline_aligned irq_cpustat_t;

  DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
diff --git a/arch/loongarch/include/asm/paravirt.h 
b/arch/loongarch/include/asm/paravirt.h
new file mode 100644
index ..58f7b7b89f2c
--- /dev/null
+++ b/arch/loongarch/include/asm/paravirt.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_LOONGARCH_PARAVIRT_H
+#define _ASM_LOONGARCH_PARAVIRT_H
+
+#ifdef CONFIG_PARAVIRT
+#include 
+struct static_key;
+extern struct static_key paravirt_steal_enabled;
+extern struct static_key paravirt_steal_rq_enabled;
+
+u64 dummy_steal_clock(int cpu);
+DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock);
+
+static inline u64 paravirt_steal_clock(int cpu)
+{
+   return static_call(pv_steal_clock)(cpu);
+}
+
+int pv_ipi_init(void);
+#else
+static inline int pv_ipi_init(void)
+{
+   return 0;
+}
+
+#endif // CONFIG_PARAVIRT
+#endif
diff --git a/arch/loongarch/include/asm/paravirt_api_clock.h 
b/arch/loongarch/include/asm/paravirt_api_clock.h
new file mode 100644
index ..65ac7cee0dad
--- /dev/null
+++ b/arch/loongarch/include/asm/paravirt_api_clock.h
@@ -0,0 +1 @@
+#include 
diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile
index 3a7620b66bc6..c9bfeda89e40 100644
--- a/arch/loongarch/kernel/Makefile
+++ b/arch/loongarch/kernel/Makefile
@@ -51,6 +51,7 @@ obj-$(CONFIG_MODULES) += module.o module-sections.o
  obj-$(CONFIG_STACKTRACE)   += stacktrace.o

  obj-$(CONFIG_PROC_FS)  += proc.o
+obj-$(CONFIG_PARAVIRT) += paravirt.o

  obj-$(CONFIG_SMP)  += smp.o

diff --git a/arch/loongarch/kernel/irq.c b/arch/loongarch/kernel/irq.c
index ce36897d1e5a..4863e6c1b739 100644
--- a/arch/loongarch/kernel/irq.c
+++ b/arch/loongarch/kernel/irq.c
@@ -113,5 +113,5 @@ void __init init_IRQ(void)
 per_cpu(irq_stack, i), per_cpu(irq_stack, i) + 
IRQ_STACK_SIZE);
 }

-   set_csr_ecfg(ECFGF_IP0 | ECFGF_IP1 | ECFGF_IP2 | ECFGF_IPI | ECFGF_PMC);
+   
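
The per-cpu "atomic_t message" field added to irq_cpustat_t above is what lets
the receiving side pick up IPI payloads from memory instead of trapping on
emulated IOCSR registers. A simplified sketch of such a receive path follows;
the action bits and counters reuse the existing LoongArch SMP definitions, but
the handler shape here is an assumption, not the exact code in this patch:

static irqreturn_t pv_ipi_interrupt(int irq, void *dev)
{
        unsigned int action;
        irq_cpustat_t *info = this_cpu_ptr(&irq_stat);

        /* Fetch and clear the pending actions with one memory operation */
        action = atomic_xchg(&info->message, 0);

        if (action & SMP_RESCHEDULE) {
                scheduler_ipi();
                info->ipi_irqs[IPI_RESCHEDULE]++;
        }

        if (action & SMP_CALL_FUNCTION) {
                generic_smp_call_function_interrupt();
                info->ipi_irqs[IPI_CALL_FUNCTION]++;
        }

        return IRQ_HANDLED;
}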

Re: [PATCH v8 4/6] LoongArch: KVM: Add vcpu search support from physical cpuid

2024-05-06 Thread maobibo




On 2024/5/6 9:49 AM, Huacai Chen wrote:

Hi, Bibo,

On Sun, Apr 28, 2024 at 6:05 PM Bibo Mao  wrote:


Physical cpuid is used for interrupt routing for irqchips such as the
ipi/msi/extioi interrupt controllers. The physical cpuid is stored
in the CSR register LOONGARCH_CSR_CPUID; it cannot be changed once a vcpu
is created, and the physical cpuid of two vcpus cannot be the same.

Different irqchips have different size declarations for the physical cpuid:
the max cpuid value for CSR LOONGARCH_CSR_CPUID on 3A5000 is 512, the max cpuid
supported by the IPI hardware is 1024, 256 for the extioi irqchip, and 65536
for the MSI irqchip.

The smallest value from all the interrupt controllers is selected now,
and the max cpuid size is defined as 256 by KVM, which comes from the
extioi irqchip.

Signed-off-by: Bibo Mao 
---
  arch/loongarch/include/asm/kvm_host.h | 26 
  arch/loongarch/include/asm/kvm_vcpu.h |  1 +
  arch/loongarch/kvm/vcpu.c | 93 ++-
  arch/loongarch/kvm/vm.c   | 11 
  4 files changed, 130 insertions(+), 1 deletion(-)

diff --git a/arch/loongarch/include/asm/kvm_host.h 
b/arch/loongarch/include/asm/kvm_host.h
index 2d62f7b0d377..3ba16ef1fe69 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -64,6 +64,30 @@ struct kvm_world_switch {

  #define MAX_PGTABLE_LEVELS 4

+/*
+ * Physical cpu id is used for interrupt routing; there are different
+ * definitions of the physical cpuid on different hardware.
+ *  For the LOONGARCH_CSR_CPUID register, the max cpuid size is 512
+ *  For IPI HW, the max dest CPUID size is 1024
+ *  For the extioi interrupt controller, the max dest CPUID size is 256
+ *  For the MSI interrupt controller, the max supported CPUID size is 65536
+ *
+ * Currently the max CPUID is defined as 256 for the KVM hypervisor; in the
+ * future it will be expanded to 4096, including at most 16 packages. And
+ * every package supports at most 256 vcpus.
+ */
+#define KVM_MAX_PHYID  256
+
+struct kvm_phyid_info {
+   struct kvm_vcpu *vcpu;
+   boolenabled;
+};
+
+struct kvm_phyid_map {
+   int max_phyid;
+   struct kvm_phyid_info phys_map[KVM_MAX_PHYID];
+};
+
  struct kvm_arch {
 /* Guest physical mm */
 kvm_pte_t *pgd;
@@ -71,6 +95,8 @@ struct kvm_arch {
 unsigned long invalid_ptes[MAX_PGTABLE_LEVELS];
 unsigned int  pte_shifts[MAX_PGTABLE_LEVELS];
 unsigned int  root_level;
+   spinlock_tphyid_map_lock;
+   struct kvm_phyid_map  *phyid_map;

 s64 time_offset;
 struct kvm_context __percpu *vmcs;
diff --git a/arch/loongarch/include/asm/kvm_vcpu.h 
b/arch/loongarch/include/asm/kvm_vcpu.h
index 0cb4fdb8a9b5..9f53950959da 100644
--- a/arch/loongarch/include/asm/kvm_vcpu.h
+++ b/arch/loongarch/include/asm/kvm_vcpu.h
@@ -81,6 +81,7 @@ void kvm_save_timer(struct kvm_vcpu *vcpu);
  void kvm_restore_timer(struct kvm_vcpu *vcpu);

  int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt 
*irq);
+struct kvm_vcpu *kvm_get_vcpu_by_cpuid(struct kvm *kvm, int cpuid);

  /*
   * Loongarch KVM guest interrupt handling
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 3a8779065f73..b633fd28b8db 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -274,6 +274,95 @@ static int _kvm_getcsr(struct kvm_vcpu *vcpu, unsigned int 
id, u64 *val)
 return 0;
  }

+static inline int kvm_set_cpuid(struct kvm_vcpu *vcpu, u64 val)
+{
+   int cpuid;
+   struct loongarch_csrs *csr = vcpu->arch.csr;
+   struct kvm_phyid_map  *map;
+
+   if (val >= KVM_MAX_PHYID)
+   return -EINVAL;
+
+   cpuid = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_ESTAT);
+   map = vcpu->kvm->arch.phyid_map;
+   spin_lock(&vcpu->kvm->arch.phyid_map_lock);
+   if (map->phys_map[cpuid].enabled) {
+   /*
+* Cpuid is already set before
+* Forbid changing different cpuid at runtime
+*/
+   if (cpuid != val) {
+   /*
+* Cpuid 0 is initial value for vcpu, maybe invalid
+* unset value for vcpu
+*/
+   if (cpuid) {
+   spin_unlock(&vcpu->kvm->arch.phyid_map_lock);
+   return -EINVAL;
+   }
+   } else {
+/* Discard duplicated cpuid set */
+   spin_unlock(&vcpu->kvm->arch.phyid_map_lock);
+   return 0;
+   }
+   }

I have changed the logic and comments when I apply, you can double
check whether it is correct.
I checkout the latest version, the modification in function 
kvm_set_cpuid() is good for me.



+
+   if (map->phys_map[val].enabled) {
+   /*
+* New cpuid is already set with other vcpu
+* Forbid sharing the same cpuid between different vcpus
+   

Re: [PATCH v8 0/6] LoongArch: Add pv ipi support on LoongArch VM

2024-05-06 Thread maobibo




On 2024/5/6 9:45 AM, Huacai Chen wrote:

Hi, Bibo,

I have done an off-list discussion with some KVM experts, and they
think user-space has the right to know PV features, so the cpucfg
solution is acceptable.

And I applied this series with some modifications at
https://git.kernel.org/pub/scm/linux/kernel/git/chenhuacai/linux-loongson.git/log/?h=loongarch-kvm
You can test it now. But it seems the upstream qemu cannot enable PV IPI now.
A VM with 128/256 vcpus boots with this series from the loongarch-kvm branch,
and pv ipi works according to the information in "cat /proc/interrupts". A small
modification to qemu is needed, like the one below, and we
will submit the patch to qemu after this series is merged.

diff --git a/hw/loongarch/virt.c b/hw/loongarch/virt.c
index 441d764843..9f7556cd93 100644
--- a/hw/loongarch/virt.c
+++ b/hw/loongarch/virt.c
@@ -15,6 +15,8 @@
 #include "sysemu/runstate.h"
 #include "sysemu/reset.h"
 #include "sysemu/rtc.h"
+#include "sysemu/tcg.h"
+#include "sysemu/kvm.h"
 #include "hw/loongarch/virt.h"
 #include "exec/address-spaces.h"
 #include "hw/irq.h"
@@ -786,12 +788,18 @@ static void loongarch_qemu_write(void *opaque, 
hwaddr addr,


 static uint64_t loongarch_qemu_read(void *opaque, hwaddr addr, 
unsigned size)

 {
+uint64_t ret = 0;
+
 switch (addr) {
 case VERSION_REG:
 return 0x11ULL;
 case FEATURE_REG:
-return 1ULL << IOCSRF_MSI | 1ULL << IOCSRF_EXTIOI |
+ret = 1ULL << IOCSRF_MSI | 1ULL << IOCSRF_EXTIOI |
1ULL << IOCSRF_CSRIPI;
+if (kvm_enabled()) {
+ret |= 1ULL << IOCSRF_VM;
+}
+   return ret;
 case VENDOR_REG:
 return 0x6e6f73676e6f6f4cULL; /* "Loongson" */
 case CPUNAME_REG:


Regards
Bibo Mao


I will reply to other patches about my modifications.

Huacai

On Sun, Apr 28, 2024 at 6:05 PM Bibo Mao  wrote:


On a physical machine, the ipi HW uses IOCSR registers; however, there is a trap
into the hypervisor when a vcpu accesses IOCSR registers if the system is in VM
mode. SWI is an interrupt mechanism like SGI on ARM: software can send an
interrupt to a CPU, except that on LoongArch SWI can only be sent to the local
CPU now. So SWI cannot be used for IPI on a real HW system; however, it can be
used in a VM when combined with the hypercall method. An IPI can be sent with
the hypercall method and an SWI interrupt is injected into the vcpu, so the vcpu
can treat the SWI interrupt as an IPI.

With PV IPI supported, there is one trap for IPI sending and no trap for IPI
receiving. With the IOCSR HW ipi method, there is one trap for IPI sending and
two traps for IPI receiving.

Also IPI multicast support is added for the VM; the idea comes from the x86 PV
ipi. An IPI can be sent to 128 vcpus at one time. With IPI multicast support,
traps are reduced greatly.

Here is the microbenchmark data with the "perf bench futex wake" testcase on a
3C5000 single-way machine; there are 16 cpus on a 3C5000 single-way machine, and
the VM has 16 vcpus as well. The benchmark data is the time in ms to wake up 16
threads; the performance is better if the value is smaller.

physical machine 0.0176 ms
VM original  0.1140 ms
VM with pv ipi patch 0.0481 ms

It boots with 128/256 vcpus, and it passes running the runltp command
with package ltp-20230516.

---
v7 --- v8:
  1. Remove kernel PLV mode checking with cpucfg emulation for hypervisor
feature inquiry.
  2. Remove the document about the loongarch hypercall ABI per request of huacai;
will add English/Chinese docs at the same time later.

v6 --- v7:
   1. Refine LoongArch virt document by review comments.
   2. Add function kvm_read_reg()/kvm_write_reg() in hypercall emulation,
and later it can be used for other trap emulations.

v5 --- v6:
   1. Add privilege checking when emulating cpucfg at index 0x400 --
0x40FF, return 0 if not executed in kernel mode.
   2. Add a document about LoongArch pv ipi in the newly created directory
Documentation/virt/kvm/loongarch/
   3. Fix pv ipi handling in kvm backend function kvm_pv_send_ipi(),
where min should be increased by BITS_PER_LONG for the second bitmap, otherwise
a VM with more than 64 vcpus fails to boot.
   4. Adjust patch order and code refine with review comments.

v4 --- v5:
   1. Refresh function/macro name from review comments.

v3 --- v4:
   1. Rename the pv ipi hook functions call_func_ipi() and
call_func_single_ipi() to send_ipi_mask()/send_ipi_single(), since pv
ipi is used for both remote function call and reschedule notification.
   2. Refresh changelog.

v2 --- v3:
   1. Add 128 vcpu ipi multicast support like x86
   2. Change cpucfg base address from 0x1000 to 0x4000, in order
to avoid conflicts with future hw usage
   3. Adjust patch order in this patchset, move patch
Refine-ipi-ops-on-LoongArch-platform to the first one.

v1 --- v2:
   1. Add hw cpuid map support since ipi routing uses hw cpuid
   2. Refine changelog description
   3. Add hypercall statistic support for vcpu
   4. Set percpu pv ipi message buffer aligned with cacheline
   5. Refine pv ipi 
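
For reference, the IOCSRF_VM bit advertised in FEATURE_REG by the qemu hunk
earlier in this message is what the guest can test to decide whether it runs in
VM mode before probing the hypervisor type, as described for pv_ipi_init(). A
hedged sketch of that check: the IOCSR accessors and bit definitions are the
existing LoongArch ones, but the helper name is made up and this is not the
exact code from the series:

static bool kvm_guest_mode_detected(void)
{
        u64 feature;

        /* FEATURE_REG exposes IOCSRF_VM only when running under a hypervisor */
        feature = iocsr_read64(LOONGARCH_IOCSR_FEATURES);

        return !!(feature & IOCSRF_VM);
}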

Re: [PATCH v8 2/6] LoongArch: KVM: Add hypercall instruction emulation support

2024-05-05 Thread maobibo




On 2024/5/6 9:54 AM, Huacai Chen wrote:

Hi, Bibo,

On Sun, Apr 28, 2024 at 6:05 PM Bibo Mao  wrote:


On LoongArch systems there is a hypercall instruction dedicated to
virtualization. When the system executes this instruction on the host side,
an illegal instruction exception is reported; however, it traps
into the host when executed in VM mode.

When the hypercall is emulated, the A0 register is set to the value
KVM_HCALL_INVALID_CODE rather than injecting an EXCCODE_INE invalid
instruction exception, so the VM can continue executing the following code.

Signed-off-by: Bibo Mao 
---
  arch/loongarch/include/asm/Kbuild  |  1 -
  arch/loongarch/include/asm/kvm_para.h  | 26 ++
  arch/loongarch/include/uapi/asm/Kbuild |  2 --
  arch/loongarch/kvm/exit.c  | 10 ++
  4 files changed, 36 insertions(+), 3 deletions(-)
  create mode 100644 arch/loongarch/include/asm/kvm_para.h
  delete mode 100644 arch/loongarch/include/uapi/asm/Kbuild

diff --git a/arch/loongarch/include/asm/Kbuild 
b/arch/loongarch/include/asm/Kbuild
index 2dbec7853ae8..c862672ed953 100644
--- a/arch/loongarch/include/asm/Kbuild
+++ b/arch/loongarch/include/asm/Kbuild
@@ -26,4 +26,3 @@ generic-y += poll.h
  generic-y += param.h
  generic-y += posix_types.h
  generic-y += resource.h
-generic-y += kvm_para.h
diff --git a/arch/loongarch/include/asm/kvm_para.h 
b/arch/loongarch/include/asm/kvm_para.h
new file mode 100644
index ..d48f993ae206
--- /dev/null
+++ b/arch/loongarch/include/asm/kvm_para.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_LOONGARCH_KVM_PARA_H
+#define _ASM_LOONGARCH_KVM_PARA_H
+
+/*
+ * LoongArch hypercall return code
+ */
+#define KVM_HCALL_STATUS_SUCCESS   0
+#define KVM_HCALL_INVALID_CODE -1UL
+#define KVM_HCALL_INVALID_PARAMETER-2UL
+
+static inline unsigned int kvm_arch_para_features(void)
+{
+   return 0;
+}
+
+static inline unsigned int kvm_arch_para_hints(void)
+{
+   return 0;
+}
+
+static inline bool kvm_check_and_clear_guest_paused(void)
+{
+   return false;
+}
+#endif /* _ASM_LOONGARCH_KVM_PARA_H */
diff --git a/arch/loongarch/include/uapi/asm/Kbuild 
b/arch/loongarch/include/uapi/asm/Kbuild
deleted file mode 100644
index 4aa680ca2e5f..
--- a/arch/loongarch/include/uapi/asm/Kbuild
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-generic-y += kvm_para.h

This file shouldn't be removed.
Yes, the uapi kvm_para.h is needed for LoongArch, and there will be a problem
if it is removed. It should be kept unchanged.


Regards
Bibo Mao


Huacai


diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c
index ed1d89d53e2e..923bbca9bd22 100644
--- a/arch/loongarch/kvm/exit.c
+++ b/arch/loongarch/kvm/exit.c
@@ -685,6 +685,15 @@ static int kvm_handle_lasx_disabled(struct kvm_vcpu *vcpu)
 return RESUME_GUEST;
  }

+static int kvm_handle_hypercall(struct kvm_vcpu *vcpu)
+{
+   update_pc(>arch);
+
+   /* Treat it as a noop instruction, only set the return value */
+   vcpu->arch.gprs[LOONGARCH_GPR_A0] = KVM_HCALL_INVALID_CODE;
+   return RESUME_GUEST;
+}
+
  /*
   * LoongArch KVM callback handling for unimplemented guest exiting
   */
@@ -716,6 +725,7 @@ static exit_handle_fn kvm_fault_tables[EXCCODE_INT_START] = 
{
 [EXCCODE_LSXDIS]= kvm_handle_lsx_disabled,
 [EXCCODE_LASXDIS]   = kvm_handle_lasx_disabled,
 [EXCCODE_GSPR]  = kvm_handle_gspr,
+   [EXCCODE_HVC]   = kvm_handle_hypercall,
  };

  int kvm_handle_fault(struct kvm_vcpu *vcpu, int fault)
--
2.39.3
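
For context, the handler above only consumes the hypercall and puts
KVM_HCALL_INVALID_CODE into A0. On the guest side, a hypercall would typically
be issued roughly as below; this is a sketch that assumes the LoongArch "hvcl"
instruction and an a7/a0 calling convention, not code from this patch:

static inline long kvm_hypercall0(u64 fid)
{
        register long ret __asm__("a0");
        register unsigned long fun __asm__("a7") = fid;

        __asm__ __volatile__(
                "hvcl 0\n\t"    /* traps into the hypervisor in VM mode */
                : "=r" (ret)
                : "r" (fun)
                : "memory");

        return ret;
}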







Re: [PATCH v8 4/6] LoongArch: KVM: Add vcpu search support from physical cpuid

2024-05-05 Thread maobibo

Huacai,

Many thanks for reviewing the pv ipi patchset.
I reply inline.

On 2024/5/6 9:49 AM, Huacai Chen wrote:

Hi, Bibo,

On Sun, Apr 28, 2024 at 6:05 PM Bibo Mao  wrote:


Physical cpuid is used for interrupt routing for irqchips such as the
ipi/msi/extioi interrupt controllers. The physical cpuid is stored
in the CSR register LOONGARCH_CSR_CPUID; it cannot be changed once a vcpu
is created, and the physical cpuid of two vcpus cannot be the same.

Different irqchips have different size declarations for the physical cpuid:
the max cpuid value for CSR LOONGARCH_CSR_CPUID on 3A5000 is 512, the max cpuid
supported by the IPI hardware is 1024, 256 for the extioi irqchip, and 65536
for the MSI irqchip.

The smallest value from all the interrupt controllers is selected now,
and the max cpuid size is defined as 256 by KVM, which comes from the
extioi irqchip.

Signed-off-by: Bibo Mao 
---
  arch/loongarch/include/asm/kvm_host.h | 26 
  arch/loongarch/include/asm/kvm_vcpu.h |  1 +
  arch/loongarch/kvm/vcpu.c | 93 ++-
  arch/loongarch/kvm/vm.c   | 11 
  4 files changed, 130 insertions(+), 1 deletion(-)

diff --git a/arch/loongarch/include/asm/kvm_host.h 
b/arch/loongarch/include/asm/kvm_host.h
index 2d62f7b0d377..3ba16ef1fe69 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -64,6 +64,30 @@ struct kvm_world_switch {

  #define MAX_PGTABLE_LEVELS 4

+/*
+ * Physical cpu id is used for interrupt routing; there are different
+ * definitions of the physical cpuid on different hardware.
+ *  For the LOONGARCH_CSR_CPUID register, the max cpuid size is 512
+ *  For IPI HW, the max dest CPUID size is 1024
+ *  For the extioi interrupt controller, the max dest CPUID size is 256
+ *  For the MSI interrupt controller, the max supported CPUID size is 65536
+ *
+ * Currently the max CPUID is defined as 256 for the KVM hypervisor; in the
+ * future it will be expanded to 4096, including at most 16 packages. And
+ * every package supports at most 256 vcpus.
+ */
+#define KVM_MAX_PHYID  256
+
+struct kvm_phyid_info {
+   struct kvm_vcpu *vcpu;
+   boolenabled;
+};
+
+struct kvm_phyid_map {
+   int max_phyid;
+   struct kvm_phyid_info phys_map[KVM_MAX_PHYID];
+};
+
  struct kvm_arch {
 /* Guest physical mm */
 kvm_pte_t *pgd;
@@ -71,6 +95,8 @@ struct kvm_arch {
 unsigned long invalid_ptes[MAX_PGTABLE_LEVELS];
 unsigned int  pte_shifts[MAX_PGTABLE_LEVELS];
 unsigned int  root_level;
+   spinlock_tphyid_map_lock;
+   struct kvm_phyid_map  *phyid_map;

 s64 time_offset;
 struct kvm_context __percpu *vmcs;
diff --git a/arch/loongarch/include/asm/kvm_vcpu.h 
b/arch/loongarch/include/asm/kvm_vcpu.h
index 0cb4fdb8a9b5..9f53950959da 100644
--- a/arch/loongarch/include/asm/kvm_vcpu.h
+++ b/arch/loongarch/include/asm/kvm_vcpu.h
@@ -81,6 +81,7 @@ void kvm_save_timer(struct kvm_vcpu *vcpu);
  void kvm_restore_timer(struct kvm_vcpu *vcpu);

  int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt 
*irq);
+struct kvm_vcpu *kvm_get_vcpu_by_cpuid(struct kvm *kvm, int cpuid);

  /*
   * Loongarch KVM guest interrupt handling
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 3a8779065f73..b633fd28b8db 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -274,6 +274,95 @@ static int _kvm_getcsr(struct kvm_vcpu *vcpu, unsigned int 
id, u64 *val)
 return 0;
  }

+static inline int kvm_set_cpuid(struct kvm_vcpu *vcpu, u64 val)
+{
+   int cpuid;
+   struct loongarch_csrs *csr = vcpu->arch.csr;
+   struct kvm_phyid_map  *map;
+
+   if (val >= KVM_MAX_PHYID)
+   return -EINVAL;
+
+   cpuid = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_ESTAT);
+   map = vcpu->kvm->arch.phyid_map;
+   spin_lock(&vcpu->kvm->arch.phyid_map_lock);
+   if (map->phys_map[cpuid].enabled) {
+   /*
+* Cpuid is already set before
+* Forbid changing different cpuid at runtime
+*/
+   if (cpuid != val) {
+   /*
+* Cpuid 0 is initial value for vcpu, maybe invalid
+* unset value for vcpu
+*/
+   if (cpuid) {
+   spin_unlock(&vcpu->kvm->arch.phyid_map_lock);
+   return -EINVAL;
+   }
+   } else {
+/* Discard duplicated cpuid set */
+   spin_unlock(&vcpu->kvm->arch.phyid_map_lock);
+   return 0;
+   }
+   }

I have changed the logic and comments when I apply, you can double
check whether it is correct.

Will do.




+
+   if (map->phys_map[val].enabled) {
+   /*
+* New cpuid is already set with other vcpu
+* Forbid sharing the same cpuid between different vcpus
+*/
+   
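
The quoted hunk stops at the check that the new cpuid must not already belong
to another vcpu. The remaining steps are essentially bookkeeping; the
hypothetical helper below (name made up, not the merged code) illustrates what
that tail has to do, with phyid_map_lock held by the caller:

static int kvm_claim_phyid(struct kvm_vcpu *vcpu, struct kvm_phyid_map *map, u64 val)
{
        /* Refuse to share one physical cpuid between different vcpus */
        if (map->phys_map[val].enabled && map->phys_map[val].vcpu != vcpu)
                return -EINVAL;

        /* Record the mapping and mirror the value into the shadow CSR */
        kvm_write_sw_gcsr(vcpu->arch.csr, LOONGARCH_CSR_CPUID, val);
        map->phys_map[val].vcpu = vcpu;
        map->phys_map[val].enabled = true;
        if (map->max_phyid < val)
                map->max_phyid = val;

        return 0;
}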

Re: [PATCH v2 2/2] LoongArch: Add steal time support in guest side

2024-04-30 Thread maobibo




On 2024/4/30 4:23 PM, Huacai Chen wrote:

Hi, Bibo,

On Tue, Apr 30, 2024 at 9:45 AM Bibo Mao  wrote:


A percpu struct kvm_steal_time is added here; its size is 64 bytes and it is
also 64-byte aligned, so that the whole structure stays within one physical
page.

When a vcpu is onlined, the function pv_enable_steal_time() is called. This
function passes the guest physical address of struct kvm_steal_time and
tells the hypervisor to enable steal time. When a vcpu goes offline, the
physical address is set to 0, which tells the hypervisor to disable steal time.

Here is the output of vmstat on the guest when there is workload on both the
host and the guest. It includes the steal time stat information.

procs ---memory-- -io -system-- --cpu-
  r  b   swpd   free  inact active   bibo   in   cs us sy id wa st
15  1  0 7583616 184112  72208200  162   52 31  6 43  0 20
17  0  0 7583616 184704  721920 0 6318 6885  5 60  8  5 22
16  0  0 7583616 185392  721440 0 1766 1081  0 49  0  1 50
16  0  0 7583616 184816  723040 0 6300 6166  4 62 12  2 20
18  0  0 7583632 184480  722400 0 2814 1754  2 58  4  1 35

Signed-off-by: Bibo Mao 
---
  arch/loongarch/Kconfig|  11 +++
  arch/loongarch/include/asm/paravirt.h |   5 +
  arch/loongarch/kernel/paravirt.c  | 131 ++
  arch/loongarch/kernel/time.c  |   2 +
  4 files changed, 149 insertions(+)

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 0a1540a8853e..f3a03c33a052 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -592,6 +592,17 @@ config PARAVIRT
   over full virtualization.  However, when run without a hypervisor
   the kernel is theoretically slower and slightly larger.

+config PARAVIRT_TIME_ACCOUNTING
+   bool "Paravirtual steal time accounting"
+   select PARAVIRT
+   help
+ Select this option to enable fine granularity task steal time
+ accounting. Time spent executing other tasks in parallel with
+ the current vCPU is discounted from the vCPU power. To account for
+ that, there can be a small performance impact.
+
+ If in doubt, say N here.
+

Can we use a hidden selection manner, which means:

config PARAVIRT_TIME_ACCOUNTING
def_bool PARAVIRT

Because I think we needn't give too many choices to users (and bring
much more effort to test).

PowerPC even hides all the PARAVIRT config options...
see arch/powerpc/platforms/pseries/Kconfig


I do not know either :(  It is just used at the beginning; maybe there is
a negative effect or potential issue, so I just think that it can be selected
by the user for easier debugging. After it matures, the hidden selection
manner can be used.


Regards
Bibo Mao


Huacai


  config ARCH_SUPPORTS_KEXEC
 def_bool y

diff --git a/arch/loongarch/include/asm/paravirt.h 
b/arch/loongarch/include/asm/paravirt.h
index 58f7b7b89f2c..fe27fb5e82b8 100644
--- a/arch/loongarch/include/asm/paravirt.h
+++ b/arch/loongarch/include/asm/paravirt.h
@@ -17,11 +17,16 @@ static inline u64 paravirt_steal_clock(int cpu)
  }

  int pv_ipi_init(void);
+int __init pv_time_init(void);
  #else
  static inline int pv_ipi_init(void)
  {
 return 0;
  }

+static inline int pv_time_init(void)
+{
+   return 0;
+}
  #endif // CONFIG_PARAVIRT
  #endif
diff --git a/arch/loongarch/kernel/paravirt.c b/arch/loongarch/kernel/paravirt.c
index 9044ed62045c..3f83afc7e2d2 100644
--- a/arch/loongarch/kernel/paravirt.c
+++ b/arch/loongarch/kernel/paravirt.c
@@ -5,10 +5,13 @@
  #include 
  #include 
  #include 
+#include 
  #include 

  struct static_key paravirt_steal_enabled;
  struct static_key paravirt_steal_rq_enabled;
+static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
+static int has_steal_clock;

  static u64 native_steal_clock(int cpu)
  {
@@ -17,6 +20,57 @@ static u64 native_steal_clock(int cpu)

  DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock);

+static bool steal_acc = true;
+static int __init parse_no_stealacc(char *arg)
+{
+   steal_acc = false;
+   return 0;
+}
+early_param("no-steal-acc", parse_no_stealacc);
+
+static u64 para_steal_clock(int cpu)
+{
+   u64 steal;
+   struct kvm_steal_time *src;
+   int version;
+
+   src = &per_cpu(steal_time, cpu);
+   do {
+
+   version = src->version;
+   /* Make sure that the version is read before the steal */
+   virt_rmb();
+   steal = src->steal;
+   /* Make sure that the steal is read before the next version */
+   virt_rmb();
+
+   } while ((version & 1) || (version != src->version));
+   return steal;
+}
+
+static int pv_enable_steal_time(void)
+{
+   int cpu = smp_processor_id();
+   struct kvm_steal_time *st;
+   unsigned long addr;
+
+   if (!has_steal_clock)
+   return -EPERM;
+
+   st = &per_cpu(steal_time, cpu);
+   addr = 
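
The read loop in para_steal_clock() above treats the version field as a
sequence counter: the writer makes it odd before touching the steal value and
even again afterwards, so a concurrent update is detected and the read is
retried. A sketch of that writer side, assuming the same record layout as the
guest's struct kvm_steal_time (illustrative only, not the hypervisor code from
this series):

static void publish_steal_time(struct kvm_steal_time *st, u64 steal)
{
        st->version += 1;       /* odd: update in progress */
        smp_wmb();              /* order the version bump before the data */

        st->steal = steal;

        smp_wmb();              /* order the data before the final bump */
        st->version += 1;       /* even: record is consistent again */
}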

Re: [PATCH 1/4] KVM: delete .change_pte MMU notifier callback

2024-04-08 Thread maobibo




On 2024/4/5 7:58 PM, Paolo Bonzini wrote:

The .change_pte() MMU notifier callback was intended as an
optimization. The original point of it was that KSM could tell KVM to flip
its secondary PTE to a new location without having to first zap it. At
the time there was also an .invalidate_page() callback; both of them were
*not* bracketed by calls to mmu_notifier_invalidate_range_{start,end}(),
and .invalidate_page() also doubled as a fallback implementation of
change_pte().

Later on, however, both callbacks were changed to occur within an
invalidate_range_start/end() block.

In the case of .change_pte(), commit 6bdb913f0a70 ("mm: wrap calls to
set_pte_at_notify with invalidate_range_start and invalidate_range_end",
2012-10-09) did so to remove the fallback from .invalidate_page() to
change_pte() and allow sleepable .invalidate_page() hooks.

This however made KVM's usage of the .change_pte() callback completely
moot, because KVM unmaps the sPTEs during .invalidate_range_start()
and therefore .change_pte() has no hope of finding a sPTE to change.
Drop the generic KVM code that dispatches to kvm_set_spte_gfn(), as
well as all the architecture specific implementations.

Signed-off-by: Paolo Bonzini 
---
  arch/arm64/kvm/mmu.c  | 34 -
  arch/loongarch/include/asm/kvm_host.h |  1 -
  arch/loongarch/kvm/mmu.c  | 32 
  arch/mips/kvm/mmu.c   | 30 ---
  arch/powerpc/include/asm/kvm_ppc.h|  1 -
  arch/powerpc/kvm/book3s.c |  5 ---
  arch/powerpc/kvm/book3s.h |  1 -
  arch/powerpc/kvm/book3s_64_mmu_hv.c   | 12 --
  arch/powerpc/kvm/book3s_hv.c  |  1 -
  arch/powerpc/kvm/book3s_pr.c  |  7 
  arch/powerpc/kvm/e500_mmu_host.c  |  6 ---
  arch/riscv/kvm/mmu.c  | 20 --
  arch/x86/kvm/mmu/mmu.c| 54 +--
  arch/x86/kvm/mmu/spte.c   | 16 
  arch/x86/kvm/mmu/spte.h   |  2 -
  arch/x86/kvm/mmu/tdp_mmu.c| 46 ---
  arch/x86/kvm/mmu/tdp_mmu.h|  1 -
  include/linux/kvm_host.h  |  2 -
  include/trace/events/kvm.h| 15 
  virt/kvm/kvm_main.c   | 43 -
  20 files changed, 2 insertions(+), 327 deletions(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index dc04bc767865..ff17849be9f4 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1768,40 +1768,6 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct 
kvm_gfn_range *range)
return false;
  }
  
-bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)

-{
-   kvm_pfn_t pfn = pte_pfn(range->arg.pte);
-
-   if (!kvm->arch.mmu.pgt)
-   return false;
-
-   WARN_ON(range->end - range->start != 1);
-
-   /*
-* If the page isn't tagged, defer to user_mem_abort() for sanitising
-* the MTE tags. The S2 pte should have been unmapped by
-* mmu_notifier_invalidate_range_end().
-*/
-   if (kvm_has_mte(kvm) && !page_mte_tagged(pfn_to_page(pfn)))
-   return false;
-
-   /*
-* We've moved a page around, probably through CoW, so let's treat
-* it just like a translation fault and the map handler will clean
-* the cache to the PoC.
-*
-* The MMU notifiers will have unmapped a huge PMD before calling
-* ->change_pte() (which in turn calls kvm_set_spte_gfn()) and
-* therefore we never need to clear out a huge PMD through this
-* calling path and a memcache is not required.
-*/
-   kvm_pgtable_stage2_map(kvm->arch.mmu.pgt, range->start << PAGE_SHIFT,
-  PAGE_SIZE, __pfn_to_phys(pfn),
-  KVM_PGTABLE_PROT_R, NULL, 0);
-
-   return false;
-}
-
  bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
  {
u64 size = (range->end - range->start) << PAGE_SHIFT;
diff --git a/arch/loongarch/include/asm/kvm_host.h 
b/arch/loongarch/include/asm/kvm_host.h
index 2d62f7b0d377..69305441f40d 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -203,7 +203,6 @@ void kvm_flush_tlb_all(void);
  void kvm_flush_tlb_gpa(struct kvm_vcpu *vcpu, unsigned long gpa);
  int kvm_handle_mm_fault(struct kvm_vcpu *vcpu, unsigned long badv, bool 
write);
  
-void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);

  int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long 
end, bool blockable);
  int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
  int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
diff --git a/arch/loongarch/kvm/mmu.c b/arch/loongarch/kvm/mmu.c
index a556cff35740..98883aa23ab8 100644
--- a/arch/loongarch/kvm/mmu.c
+++ b/arch/loongarch/kvm/mmu.c
@@ -494,38 +494,6 @@ bool kvm_unmap_gfn_range(struct 

Re: [PATCH v7 3/7] LoongArch: KVM: Add cpucfg area for kvm hypervisor

2024-04-01 Thread maobibo




On 2024/4/2 上午10:49, Xi Ruoyao wrote:

On Tue, 2024-04-02 at 09:43 +0800, maobibo wrote:

Sorry for the late reply, but I think it may be a bit non-constructive
to repeatedly submit the same code without due explanation in our
previous review threads. Let me try to recollect some of the details
though...

Because your review comments about the hypercall method are wrong, I need not
adopt them.


Again it's unfair to say so considering the lack of LVZ documentation.

/* snip */



1. T0-T7 are scratch registers during SYSCALL ABI, this is what you
suggest, does there exist information leaking to user space from T0-T7
registers?


It's not a problem.  When syscall returns RESTORE_ALL_AND_RET is invoked
despite T0-T7 are not saved.  So a "junk" value will be read from the
leading PT_SIZE bytes of the kernel stack for this thread.

The leading PT_SIZE bytes of the kernel stack is dedicated for storing
the struct pt_regs representing the reg file of the thread in the
userspace.
Not all syscalls use the leading PT_SIZE bytes of the kernel stack. It is 
complicated if a syscall is combined with interrupts and signals.




Thus we may only read out the userspace T0-T7 value stored when the same
thread was interrupted or trapped last time, or 0 (if the thread was
never interrupted or trapped before).

And it's impossible to read some data used by the kernel internally, or
some data of another thread.
Are you sure that it's impossible to read some data used by the kernel 
internally?


Regards
Bibo Mao


But indeed there is some improvement here.  Zeroing these registers
seems cleaner than reading out the junk values, and also faster (move
$t0, $r0 is faster than ld.d $t0, $sp, PT_R12).  Not sure if it's worthy
to violate Huacai's "keep things simple" aspiration though.






Re: [PATCH v7 3/7] LoongArch: KVM: Add cpucfg area for kvm hypervisor

2024-04-01 Thread maobibo




On 2024/4/2 上午10:49, Xi Ruoyao wrote:

On Tue, 2024-04-02 at 09:43 +0800, maobibo wrote:

Sorry for the late reply, but I think it may be a bit non-constructive
to repeatedly submit the same code without due explanation in our
previous review threads. Let me try to recollect some of the details
though...

Because your review comments about the hypercall method are wrong, I need not
adopt them.


Again it's unfair to say so considering the lack of LVZ documentation.

/* snip */



1. T0-T7 are scratch registers during SYSCALL ABI, this is what you
suggest, does there exist information leaking to user space from T0-T7
registers?


It's not a problem.  When syscall returns RESTORE_ALL_AND_RET is invoked
despite T0-T7 are not saved.  So a "junk" value will be read from the
leading PT_SIZE bytes of the kernel stack for this thread.

For you it is a "junk" value; some people may think it is useful.

There is another issue, since the kernel restores the T0-T7 registers while user 
space saves T0-T7. Why are T0-T7 scratch registers rather than preserved 
registers as on other architectures? What is the advantage if they are 
scratch registers?


Regards
Bibo Mao


The leading PT_SIZE bytes of the kernel stack is dedicated for storing
the struct pt_regs representing the reg file of the thread in the
userspace.

Thus we may only read out the userspace T0-T7 value stored when the same
thread was interrupted or trapped last time, or 0 (if the thread was
never interrupted or trapped before).

And it's impossible to read some data used by the kernel internally, or
some data of another thread.

But indeed there is some improvement here.  Zeroing these registers
seems cleaner than reading out the junk values, and also faster (move
$t0, $r0 is faster than ld.d $t0, $sp, PT_R12).  Not sure if it's worthy
to violate Huacai's "keep things simple" aspiration though.






Re: [PATCH v7 3/7] LoongArch: KVM: Add cpucfg area for kvm hypervisor

2024-04-01 Thread maobibo




On 2024/3/24 上午3:02, WANG Xuerui wrote:

On 3/15/24 16:07, Bibo Mao wrote:

Instruction cpucfg can be used to get processor features. And there
is trap exception when it is executed in VM mode, and also it is
to provide cpu features to VM. On real hardware cpucfg area 0 - 20
is used.  Here one specified area 0x40000000 -- 0x400000ff is used
for KVM hypervisor to provide PV features, and the area can be extended
for other hypervisors in future. This area will never be used for
real HW, it is only used by software.

Signed-off-by: Bibo Mao 
---
  arch/loongarch/include/asm/inst.h  |  1 +
  arch/loongarch/include/asm/loongarch.h | 10 +
  arch/loongarch/kvm/exit.c  | 59 +++---
  3 files changed, 54 insertions(+), 16 deletions(-)



Sorry for the late reply, but I think it may be a bit non-constructive 
to repeatedly submit the same code without due explanation in our 
previous review threads. Let me try to recollect some of the details 
though...
Because your review comments about the hypercall method are wrong, I need not 
adopt them.


If I remember correctly, during the previous reviews, it was mentioned 
that the only upsides of using CPUCFG were:


- it was exactly identical to the x86 approach,
- it would not require access to the LoongArch Reference Manual Volume 3 
to use, and

- it was plain old data.

But, for the first point, we don't have to follow x86 convention after 
X86 virtualization is successfully and widely applied in our life and 
products. It is normal to follow it if there are no obvious issues.


all. The second reason might be compelling, but on the one hand that's 
another problem orthogonal to the current one, and on the other hand 
HVCL is:


- already effectively public because of the fact that this very patchset 
is public,
- its semantics is trivial to implement even without access to the LVZ 
manual, because of its striking similarity with SYSCALL, and
- by being a function call, we reserve the possibility for hypervisors 
to invoke logic for self-identification purposes, even if this is likely 
overkill from today's perspective.


And, even if we decide that using HVCL for self-identification is 
overkill after all, we still have another choice that's IOCSR. We 
already read LOONGARCH_IOCSR_FEATURES (0x8) for its bit 11 (IOCSRF_VM) 
to populate the CPU_FEATURE_HYPERVISOR bit, and it's only natural that 
we put the identification word in the IOCSR space. As far as I can see, 
the IOCSR space is plenty and equally available for making reservations; 
it can only be even easier when it's done by a Loongson team.
IOCSR method is possible also; in chip design CPUCFG is used for cpu 
features and IOCSR is for device features. Here the CPUCFG method is 
selected. I am the KVM LoongArch maintainer and I can decide which method to 
select if the method works well. Is that right?


If you are interested in KVM LoongArch, you can submit more patches and 
become a maintainer, or write new hypervisor support such as Xen/Xvisor etc, 
and use your method.


Also, if you are interested in the Linux kernel, there are some issues. Can you 
help to improve them?


1. T0-T7 are scratch registers during SYSCALL ABI, this is what you 
suggest, does there exist information leaking to user space from T0-T7 
registers?


2. LoongArch KVM depends on AS_HAS_LVZ_EXTENSION, which requires the 
latest binutils. It is also what you suggest. Some kernel developers 
do not have the latest binutils, so when common kvm code is modified and 
LoongArch KVM fails to compile, they cannot notice it since their 
LoongArch cross-compiler is old and LoongArch KVM is disabled. This issue 
can be found at https://lkml.org/lkml/2023/11/15/828.


Regards
Bibo Mao


Finally, I've mentioned multiple times, that varying CPUCFG behavior 
based on PLV is not something well documented on the manuals, hence not 
friendly to low-level developers. Devs of third-party firmware and/or 
kernels do exist, I've personally spoken to some of them on the 
2023-11-18 3A6000 release event; in order for the varying CPUCFG 
behavior approach to pass for me, at the very least, the LoongArch 
reference manual must be amended to explicitly include an explanation of 
it, and a reference to potential use cases.







Re: [PATCH v7 7/7] Documentation: KVM: Add hypercall for LoongArch

2024-04-01 Thread maobibo




On 2024/3/24 上午2:40, WANG Xuerui wrote:

On 3/15/24 16:11, Bibo Mao wrote:

[snip]
+KVM hypercall ABI
+=
+
+Hypercall ABI on KVM is simple, only one scratch register a0 and at most
+five generic registers used as input parameter. FP register and 
vector register
+is not used for input register and should not be modified during 
hypercall.
+Hypercall function can be inlined since there is only one scratch 
register.


Maybe it's better to describe the list of preserved registers with an 
expression such as "all non-GPR registers shall remain unmodified after 
returning from the hypercall", to guard ourselves against future ISA 
state additions.
Sorry, I do not understand. What is the meaning of "all non-GPR 
registers"?  Can you give an example?


Regards
Bibo Mao


But I still maintain that it's better to promise less here, and only 
hint on the extensive preservation of context as an implementation 
detail. It is for not losing our ability to save/restore less in the 
future, should we decide to do so.







Re: [PATCH v6 7/7] Documentation: KVM: Add hypercall for LoongArch

2024-03-05 Thread maobibo




On 2024/3/6 上午2:26, WANG Xuerui wrote:

On 3/4/24 17:10, maobibo wrote:

On 2024/3/2 下午5:41, WANG Xuerui wrote:

On 3/2/24 16:47, Bibo Mao wrote:

[snip]
+Querying for existence
+==
+
+To find out if we're running on KVM or not, cpucfg can be used with 
index
+CPUCFG_KVM_BASE (0x4000), cpucfg range between 0x4000 - 
0x40FF
+is marked as a specially reserved range. All existing and future 
processors

+will not implement any features in this range.
+
+When Linux is running on KVM, cpucfg with index CPUCFG_KVM_BASE 
(0x40000000)

+returns magic string "KVM\0"
+
+Once you determined you're running under a PV capable KVM, you can 
now use

+hypercalls as described below.
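
As a concrete illustration of the probing sequence described above, a minimal
guest-side sketch; it mirrors the kvm_para_available() helper from the guest
patches, and read_cpucfg() plus the CPUCFG_KVM_SIG/KVM_SIGNATURE macros are
the ones defined in the cpucfg patch of this series:

#include <linux/string.h>

static bool running_on_kvm(void)
{
	unsigned int sig = read_cpucfg(CPUCFG_KVM_SIG);	/* leaf 0x40000000 */

	/* the leaf returns the bytes "KVM\0" packed into one 32-bit word */
	return memcmp(&sig, KVM_SIGNATURE, 4) == 0;
}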


So this is still the approach similar to the x86 CPUID-based 
implementation. But here the non-privileged behavior isn't specified 
-- I see there is PLV checking in Patch 3 but it's safer to have the 
requirement spelled out here too.


But I still think this approach touches more places than strictly 
needed. As it is currently the case in 
arch/loongarch/kernel/cpu-probe.c, the FEATURES IOCSR is checked for 
a bit IOCSRF_VM that already signifies presence of a hypervisor; if 
this information can be interpreted as availability of the HVCL 
instruction (which I suppose is the case -- a hypervisor can always 
trap-and-emulate in case HVCL isn't provided by hardware), here we 
can already start making calls with HVCL.


We can and should define a uniform interface for probing the 
hypervisor kind, similar to the centrally-managed RISC-V SBI 
implementation ID registry [1]: otherwise future non-KVM hypervisors 
would have to


1. somehow pretend they are KVM and eventually fail to do so, leading 
to subtle incompatibilities,

2. invent another way of probing for their existence,
3. piggy-back on the current KVM definition, which is inelegant 
(reading the LoongArch-KVM-defined CPUCFG leaf only to find it's not 
KVM) and utterly makes the definition here *not* KVM-specific.


[1]: 
https://github.com/riscv-non-isa/riscv-sbi-doc/blob/v2.0/src/ext-base.adoc 



Sorry, I know nothing about riscv. Can you describe how 
sbi_get_mimpid() is implemented in detail? Is it a simple library, or does it 
need to trap into secure mode or into hypervisor mode?


For these simple interfaces you can expect trivial implementation. See 
for example [OpenSBI]'s respective code.


[OpenSBI]: 
https://github.com/riscv-software-src/opensbi/blob/v1.4/lib/sbi/sbi_ecall.c#L29-L34 




My take on this:

To check if we are running on Linux KVM or not, first check IOCSR 0x8 
(``LOONGARCH_IOCSR_FEATURES``) for bit 11 (``IOCSRF_VM``); we are 
running under a hypervisor if the bit is set. Then invoke ``HVCL 0`` 
to find out the hypervisor implementation ID; a return value in 
``$a0`` of 0x004d564b (``KVM\0``) means Linux KVM, in which case the 
rest of the convention applies.
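
For illustration only, a sketch of what this two-step probe could look like on
the guest side. iocsr_read32(), LOONGARCH_IOCSR_FEATURES and IOCSRF_VM already
exist in the kernel; hvcl0() and the HYPERVISOR_* constants are hypothetical,
since an ``HVCL 0`` identification call is Xuerui's proposal and is not what
this series implements:

static int detect_hypervisor(void)
{
	unsigned long id;

	/* Step 1: are we running under any hypervisor at all? */
	if (!(iocsr_read32(LOONGARCH_IOCSR_FEATURES) & IOCSRF_VM))
		return HYPERVISOR_NONE;		/* hypothetical constant */

	/* Step 2 (proposed only): ask which hypervisor via "hvcl 0" */
	id = hvcl0();				/* hypothetical wrapper issuing HVCL 0 */
	if (id == 0x004d564bUL)			/* "KVM\0" as a little-endian word */
		return HYPERVISOR_KVM;

	return HYPERVISOR_UNKNOWN;		/* hypothetical constant */
}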


I do not think so. `HVCL 0` requires that hypercall ABIs be 
unified for all hypervisors. That is not necessary; each 
hypervisor can have its own hypercall ABI.


I don't think agreeing upon the ABI of HVCL 0 is going to affect ABI of 
other hypercalls. Plus, as long as people don't invent something that 
they think is smart and deviate from the platform calling convention, 
I'd expect every hypervisor to have identical ABI apart from the exact 
HVCL operation ID chosen.



+
+KVM hypercall ABI
+=
+
+Hypercall ABI on KVM is simple, only one scratch register a0 (v0) 
and at most
+five generic registers used as input parameter. FP register and 
vector register
+is not used for input register and should not be modified during 
hypercall.
+Hypercall function can be inlined since there is only one scratch 
register.


It should be pointed out explicitly that on hypercall return all 
architectural state except ``$a0`` is preserved. Or is the whole 
``$a0 - $t8`` range clobbered, just like with Linux syscalls?


Well, a return value description will be added. What do you think about the 
meaning of the return value for the KVM_HCALL_FUNC_PV_IPI hypercall?  The 
number of CPUs with the IPI delivered successfully, like KVM x86, or simply 
success/failure?


What is the advantage of having $a0 - $t8 clobbered?


Because then a hypercall is going to behave identically to an ordinary C 
function call, which is easy for people and compilers to understand.


If you really understand the detailed behavior of hypercalls/syscalls, the 
conclusion may be different.


If T0 - T8 are clobbered by the hypercall instruction, the hypercall caller 
needs to save the clobbered registers; currently the hypercall exception 
saves/restores all the registers during VM exits. In that case the hypercall 
caller need not save general registers, and it is not necessary to make them 
scratch registers in the hypercall ABI.


Until now all the discussion has been at the macro level, with no detailed code.

Can you show me some example code where T0-T8 need not be saved/restored 
during a LoongArch hypercall exception?


Regards
Bibo Mao

It seems that with linux Loongarch sy

Re: [PATCH v6 7/7] Documentation: KVM: Add hypercall for LoongArch

2024-03-04 Thread maobibo




On 2024/3/2 下午5:41, WANG Xuerui wrote:

On 3/2/24 16:47, Bibo Mao wrote:

Add documentation topic for using pv_virt when running as a guest
on KVM hypervisor.

Signed-off-by: Bibo Mao 
---
  Documentation/virt/kvm/index.rst  |  1 +
  .../virt/kvm/loongarch/hypercalls.rst | 79 +++
  Documentation/virt/kvm/loongarch/index.rst    | 10 +++
  3 files changed, 90 insertions(+)
  create mode 100644 Documentation/virt/kvm/loongarch/hypercalls.rst
  create mode 100644 Documentation/virt/kvm/loongarch/index.rst

diff --git a/Documentation/virt/kvm/index.rst 
b/Documentation/virt/kvm/index.rst

index ad13ec55ddfe..9ca5a45c2140 100644
--- a/Documentation/virt/kvm/index.rst
+++ b/Documentation/virt/kvm/index.rst
@@ -14,6 +14,7 @@ KVM
 s390/index
 ppc-pv
 x86/index
+   loongarch/index
 locking
 vcpu-requests
diff --git a/Documentation/virt/kvm/loongarch/hypercalls.rst 
b/Documentation/virt/kvm/loongarch/hypercalls.rst

new file mode 100644
index ..1679e48d67d2
--- /dev/null
+++ b/Documentation/virt/kvm/loongarch/hypercalls.rst
@@ -0,0 +1,79 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+===
+The LoongArch paravirtual interface
+===
+
+KVM hypercalls use the HVCL instruction with code 0x100, and the 
hypercall

+number is put in a0 and up to five arguments may be placed in a1-a5, the
+return value is placed in v0 (alias with a0).


Just say a0: the name v0 is long deprecated (has been the case ever 
since LoongArch got mainlined).



Sure, will modify since you are a compiler expert :)


+
+The code for that interface can be found in arch/loongarch/kvm/*
+
+Querying for existence
+==
+
+To find out if we're running on KVM or not, cpucfg can be used with 
index
+CPUCFG_KVM_BASE (0x40000000), cpucfg range between 0x40000000 - 0x400000FF
+is marked as a specially reserved range. All existing and future 
processors

+will not implement any features in this range.
+
+When Linux is running on KVM, cpucfg with index CPUCFG_KVM_BASE 
(0x40000000)

+returns magic string "KVM\0"
+
+Once you determined you're running under a PV capable KVM, you can 
now use

+hypercalls as described below.


So this is still the approach similar to the x86 CPUID-based 
implementation. But here the non-privileged behavior isn't specified -- 
I see there is PLV checking in Patch 3 but it's safer to have the 
requirement spelled out here too.


But I still think this approach touches more places than strictly 
needed. As it is currently the case in 
arch/loongarch/kernel/cpu-probe.c, the FEATURES IOCSR is checked for a 
bit IOCSRF_VM that already signifies presence of a hypervisor; if this 
information can be interpreted as availability of the HVCL instruction 
(which I suppose is the case -- a hypervisor can always trap-and-emulate 
in case HVCL isn't provided by hardware), here we can already start 
making calls with HVCL.


We can and should define a uniform interface for probing the hypervisor 
kind, similar to the centrally-managed RISC-V SBI implementation ID 
registry [1]: otherwise future non-KVM hypervisors would have to


1. somehow pretend they are KVM and eventually fail to do so, leading to 
subtle incompatibilities,

2. invent another way of probing for their existence,
3. piggy-back on the current KVM definition, which is inelegant (reading 
the LoongArch-KVM-defined CPUCFG leaf only to find it's not KVM) and 
utterly makes the definition here *not* KVM-specific.


[1]: 
https://github.com/riscv-non-isa/riscv-sbi-doc/blob/v2.0/src/ext-base.adoc


Sorry, I know nothing about riscv. Can you describe how sbi_get_mimpid() 
is implemented in detail? Is it a simple library, or does it need to trap into 
secure mode or into hypervisor mode?



My take on this:

To check if we are running on Linux KVM or not, first check IOCSR 0x8 
(``LOONGARCH_IOCSR_FEATURES``) for bit 11 (``IOCSRF_VM``); we are 
running under a hypervisor if the bit is set. Then invoke ``HVCL 0`` to 
find out the hypervisor implementation ID; a return value in ``$a0`` of 
0x004d564b (``KVM\0``) means Linux KVM, in which case the rest of the 
convention applies.


I do not think so. `HVCL 0` requires that hypercall ABIs be unified 
for all hypervisors. That is not necessary; each hypervisor can 
have its own hypercall ABI.



+
+KVM hypercall ABI
+=
+
+Hypercall ABI on KVM is simple, only one scratch register a0 (v0) and 
at most
+five generic registers used as input parameter. FP register and 
vector register
+is not used for input register and should not be modified during 
hypercall.
+Hypercall function can be inlined since there is only one scratch 
register.


It should be pointed out explicitly that on hypercall return all 
Well, a return value description will be added. What do you think about the 
meaning of the return value for the KVM_HCALL_FUNC_PV_IPI hypercall?  The number 
of CPUs with IPI delivered 

Re: [PATCH v5 3/6] LoongArch: KVM: Add cpucfg area for kvm hypervisor

2024-03-01 Thread maobibo




On 2024/2/27 下午6:19, WANG Xuerui wrote:

On 2/27/24 18:12, maobibo wrote:



On 2024/2/27 下午5:10, WANG Xuerui wrote:

On 2/27/24 11:14, maobibo wrote:



On 2024/2/27 上午4:02, Jiaxun Yang wrote:



On Feb 26, 2024, at 8:04 AM, maobibo wrote:

On 2024/2/26 下午2:12, Huacai Chen wrote:
On Mon, Feb 26, 2024 at 10:04 AM maobibo  
wrote:




On 2024/2/24 下午5:13, Huacai Chen wrote:

Hi, Bibo,

On Thu, Feb 22, 2024 at 11:28 AM Bibo Mao  
wrote:


Instruction cpucfg can be used to get processor features. And 
there

is trap exception when it is executed in VM mode, and also it is
to provide cpu features to VM. On real hardware cpucfg area 0 
- 20
is used.  Here one specified area 0x40000000 -- 0x400000ff is
used
for KVM hypervisor to provide PV features, and the area can be
extended

for other hypervisors in future. This area will never be used for
real HW, it is only used by software.
After reading and thinking, I find that the hypercall method 
which is

used in our productive kernel is better than this cpucfg method.
Because hypercall is more simple and straightforward, plus we 
don't

worry about conflicting with the real hardware.
No, I do not think so. cpucfg is simpler than hypercall; hypercall can
only be in effect when the system runs in guest mode. In some scenarios
like TCG mode, hypercall is an illegal instruction, however cpucfg can work.

Nearly all architectures use hypercall except x86 for its historical
Only x86 support multiple hypervisors and there is multiple 
hypervisor

in x86 only. It is an advantage, not historical reason.


I do believe that all those stuff should not be exposed to guest 
user space

for security reasons.
Can you add PLV checking when cpucfg 0x40000000-0x400000FF is 
emulated? If it is user mode the return value is zero, and if it is kernel 
mode the emulated value will be returned. It can avoid information leaking.


I've suggested this approach in another reply [1], but I've rechecked 
the manual, and it turns out this behavior is not permitted by the 
current wording. See LoongArch Reference Manual v1.10, Volume 1, 
Section 2.2.10.5 "CPUCFG":


 > CPUCFG 访问未定义的配置字将读回全 0 值。
 >
 > Reads of undefined CPUCFG configuration words shall return 
all-zeroes.


This sentence mentions no distinction based on privilege modes, so it 
can only mean the behavior applies universally regardless of 
privilege modes.


I think if you want to make CPUCFG behavior PLV-dependent, you may 
have to ask the LoongArch spec editors, internally or in public, for 
a new spec revision.
No, CPUCFG behavior between CPUCFG0-CPUCFG21 is unchanged, only that 
it can be defined by software since CPUCFG 0x4 is used by 
software.


The 0x40000000 range is not mentioned in the manuals. I know you've 
confirmed privately with HW team but this needs to be properly 
documented for public projects to properly rely on.


(There are already multiple third-party LoongArch implementers as of 
late 2023, so any ISA-level change like this would best be 
coordinated, to minimize surprises.)

With document Vol 4-23
https://www.intel.com/content/dam/develop/external/us/en/documents/335592-sdm-vol-4.pdf 



There is one line "MSR address range between 40000000H - 400000FFH is 
marked as a specially reserved range. All existing and
future processors will not implement any features using any MSR in 
this range."


Thanks for providing this info, now at least we know why it's this 
specific range of 0x40XX that's chosen.




It only says that it is reserved, it does not say detailed software 
behavior. Software behavior is defined in hypervisor such as:
https://github.com/MicrosoftDocs/Virtualization-Documentation/blob/main/tlfs/Requirements%20for%20Implementing%20the%20Microsoft%20Hypervisor%20Interface.pdf 


https://kb.vmware.com/s/article/1009458

If hypercall method is used, there should be ABI also like aarch64:
https://documentation-service.arm.com/static/6013e5faeee5236980d08619


Yes proper documentation of public API surface is always necessary 
*before* doing real work. Because right now the hypercall provider is 
Linux KVM, maybe we can document the existing and planned hypercall 
usage and ABI in the kernel docs along with code changes.



yes, I will add hypercall in kernel docs.

The SMC calling convention is an ABI between the OS and secure firmware; LoongArch 
has no secure mode, so it is not necessary to have such a doc. The hypercall 
calling convention is related to the hypervisor SW, not to the CPU 
HW vendor. Each hypervisor may have its own separate hypercall calling 
convention, just like syscall ABIs.


Regards
Bibo Mao




Re: [PATCH v5 3/6] LoongArch: KVM: Add cpucfg area for kvm hypervisor

2024-02-27 Thread maobibo




On 2024/2/27 下午5:10, WANG Xuerui wrote:

On 2/27/24 11:14, maobibo wrote:



On 2024/2/27 上午4:02, Jiaxun Yang wrote:



On Feb 26, 2024, at 8:04 AM, maobibo wrote:

On 2024/2/26 下午2:12, Huacai Chen wrote:

On Mon, Feb 26, 2024 at 10:04 AM maobibo  wrote:




On 2024/2/24 下午5:13, Huacai Chen wrote:

Hi, Bibo,

On Thu, Feb 22, 2024 at 11:28 AM Bibo Mao  
wrote:


Instruction cpucfg can be used to get processor features. And there
is trap exception when it is executed in VM mode, and also it is
to provide cpu features to VM. On real hardware cpucfg area 0 - 20
is used.  Here one specified area 0x40000000 -- 0x400000ff is used
for KVM hypervisor to provide PV features, and the area can be
extended

for other hypervisors in future. This area will never be used for
real HW, it is only used by software.
After reading and thinking, I find that the hypercall method 
which is

used in our productive kernel is better than this cpucfg method.
Because hypercall is more simple and straightforward, plus we don't
worry about conflicting with the real hardware.

No, I do not think so. cpucfg is simpler than hypercall; hypercall can
only be in effect when the system runs in guest mode. In some scenarios
like TCG mode, hypercall is an illegal instruction, however cpucfg can work.

Nearly all architectures use hypercall except x86 for its historical

Only x86 support multiple hypervisors and there is multiple hypervisor
in x86 only. It is an advantage, not historical reason.


I do believe that all those stuff should not be exposed to guest user 
space

for security reasons.
Can you add PLV checking when cpucfg 0x40000000-0x400000FF is 
emulated? If it is user mode the return value is zero, and if it is kernel 
mode the emulated value will be returned. It can avoid information leaking.


I've suggested this approach in another reply [1], but I've rechecked 
the manual, and it turns out this behavior is not permitted by the 
current wording. See LoongArch Reference Manual v1.10, Volume 1, Section 
2.2.10.5 "CPUCFG":


 > CPUCFG 访问未定义的配置字将读回全 0 值。
 >
 > Reads of undefined CPUCFG configuration words shall return all-zeroes.

This sentence mentions no distinction based on privilege modes, so it 
can only mean the behavior applies universally regardless of privilege 
modes.


I think if you want to make CPUCFG behavior PLV-dependent, you may have 
to ask the LoongArch spec editors, internally or in public, for a new 
spec revision.
No, CPUCFG behavior between CPUCFG0-CPUCFG21 is unchanged, only that it 
can be defined by software since CPUCFG 0x4 is used by software. >
(There are already multiple third-party LoongArch implementers as of 
late 2023, so any ISA-level change like this would best be coordinated, 
to minimize surprises.)

With document Vol 4-23
https://www.intel.com/content/dam/develop/external/us/en/documents/335592-sdm-vol-4.pdf

There is one line "MSR address range between 40000000H - 400000FFH is 
marked as a specially reserved range. All existing and
future processors will not implement any features using any MSR in this 
range."


It only says that it is reserved, it does not say detailed software 
behavior. Software behavior is defined in hypervisor such as:

https://github.com/MicrosoftDocs/Virtualization-Documentation/blob/main/tlfs/Requirements%20for%20Implementing%20the%20Microsoft%20Hypervisor%20Interface.pdf
https://kb.vmware.com/s/article/1009458

If hypercall method is used, there should be ABI also like aarch64:
https://documentation-service.arm.com/static/6013e5faeee5236980d08619

Regards
Bibo Mao



[1]: 
https://lore.kernel.org/loongarch/d8994f0f-d789-46d2-bc4d-f9b37fb39...@xen0n.name/ 








Re: [PATCH v5 3/6] LoongArch: KVM: Add cpucfg area for kvm hypervisor

2024-02-27 Thread maobibo




On 2024/2/27 下午5:05, Xi Ruoyao wrote:

On Tue, 2024-02-27 at 15:09 +0800, maobibo wrote:


It is difficult to find an area unused by HW for CSR method since the
small CSR ID range.


We may use IOCSR instead.  In kernel/cpu-probe.c there are already some
IOCSR reads.


yes, IOCSR can be used and one IOCSR area will be reserved for software. 
In general CSR/CPUCFG is to describe CPU features and IOCSR is to 
describe board/device features.


CSR area is limited to 16K, it is difficult for HW guys to reserve 
special area for software usage. IOCSR/CPUCFG is possible however.


Regards
Bibo Mao



It is difficult to find an area unused by HW for CSR method since the
small CSR ID range. However it is easy for cpucfg. Here I doubt whether
you really know about LoongArch LVZ.


It's unfair to accuse the others for not knowing about LVZ considering
the lack of public documentation.






Re: [PATCH v5 3/6] LoongArch: KVM: Add cpucfg area for kvm hypervisor

2024-02-26 Thread maobibo




On 2024/2/27 下午1:23, Jiaxun Yang wrote:



On Feb 27, 2024, at 3:14 AM, maobibo wrote:

On 2024/2/27 上午4:02, Jiaxun Yang wrote:



On Feb 26, 2024, at 8:04 AM, maobibo wrote:

On 2024/2/26 下午2:12, Huacai Chen wrote:

On Mon, Feb 26, 2024 at 10:04 AM maobibo  wrote:




On 2024/2/24 下午5:13, Huacai Chen wrote:

Hi, Bibo,

On Thu, Feb 22, 2024 at 11:28 AM Bibo Mao  wrote:


Instruction cpucfg can be used to get processor features. And there
is trap exception when it is executed in VM mode, and also it is
to provide cpu features to VM. On real hardware cpucfg area 0 - 20
is used.  Here one specified area 0x40000000 -- 0x400000ff is used
for KVM hypervisor to provide PV features, and the area can be extended
for other hypervisors in future. This area will never be used for
real HW, it is only used by software.

After reading and thinking, I find that the hypercall method which is
used in our productive kernel is better than this cpucfg method.
Because hypercall is more simple and straightforward, plus we don't
worry about conflicting with the real hardware.

No, I do not think so. cpucfg is simpler than hypercall; hypercall can
only be in effect when the system runs in guest mode. In some scenarios like TCG
mode, hypercall is an illegal instruction, however cpucfg can work.

Nearly all architectures use hypercall except x86 for its historical

Only x86 support multiple hypervisors and there is multiple hypervisor
in x86 only. It is an advantage, not historical reason.


I do believe that all those stuff should not be exposed to guest user space
for security reasons.

Can you add PLV checking when cpucfg 0x40000000-0x400000FF is emulated?
If it is user mode the return value is zero, and if it is kernel mode the emulated
value will be returned. It can avoid information leaking.


Please don’t do insane stuff here, applications are not expecting exception from
cpucfg.
Sorry, I do not understand. Can you describe the behavior about cpucfg 
instruction from applications? Why is there no exception for cpucfg.






Also for different implementations of hypervisors they may have different
PV features behavior, using hypcall to perform feature detection
can pass more information to help us cope with hypervisor diversity.

How can different hypervisors be detected in the first place?  On x86 MSRs are
used for detection of all hypervisors, and on ARM64 hyperv uses
acpi_gbl_FADT and kvm forcibly uses smc; host mode can execute the smc
instruction without an exception on ARM64.


That’s hypcall ABI design choices, those information can come from firmware
or privileged CSRs on LoongArch.
Firstly, the firmware or privileged CSRs are not related to hypcall ABI 
design choices.  With the CSR instruction, the CSR ID is encoded in the 
instruction and the CSR ID range is 16K; for the cpucfg instruction, the cpucfg 
area is passed in a register, so the range is UINT_MAX at least.


It is difficult to find an area unused by HW for CSR method since the 
small CSR ID range. However it is easy for cpucfg. Here I doubt whether 
you really know about LoongArch LVZ.






I do not know why hypercall is better than cpucfg on LoongArch; cpucfg
is a basic instruction however hypercall is not, it is part of the LVZ feature.


KVM can only work with LVZ right?
The Linux kernel needs to boot well in both TCG and KVM mode, and hypercall is an 
illegal instruction in TCG mode.


Regards
Bibo Mao







reasons. If we use CPUCFG, then the hypervisor information is
unnecessarily leaked to userspace, and this may be a security issue.
Meanwhile, I don't think TCG mode needs PV features.

Besides PV features, there are other features that differ from real hw, such
as virtio devices and the virtual interrupt controller.


Those are *device* level information, they must be passed in firmware
interfaces to keep processor emulation sane.

File arch/x86/hyperv/hv_apic.c can be referenced, apic features comes
from ms_hyperv.hints and HYPERV_CPUID_ENLIGHTMENT_INFO cpuid info, not
must be passed by firmware interface.


That’s not KVM, that’s Hyper V. At Linux Kernel we enjoy the benefits of better
modularity on device abstractions, please don’t break it.

Thanks



Regards
Bibo Mao


Thanks



Regards
Bibo Mao



I consulted with Jiaxun before, and maybe he can give some more comments.



Extioi virtualization extension will be added later, cpucfg can be used
to get extioi features. It is unlikely that extioi driver depends on
PARA_VIRT macro if hypercall is used to get features.

CPUCFG is per-core information, if we really need something about
extioi, it should be in iocsr (LOONGARCH_IOCSR_FEATURES).


Huacai



Regards
Bibo Mao



Huacai



Signed-off-by: Bibo Mao 
---
 arch/loongarch/include/asm/inst.h  |  1 +
 arch/loongarch/include/asm/loongarch.h | 10 ++
 arch/loongarch/kvm/exit.c  | 46 +-
 3 files changed, 41 insertions(+), 16 deletions(-)

diff --git a/arch/loongarch/include/asm/inst.h 
b/arch/loongarch/include/asm/inst.h
index d8f637f9e400..ad120f924905 100644
--- a/arch/loongarch/include/asm/inst.h
+++ b/arch

Re: [PATCH v5 3/6] LoongArch: KVM: Add cpucfg area for kvm hypervisor

2024-02-26 Thread maobibo




On 2024/2/27 上午4:02, Jiaxun Yang wrote:



On Feb 26, 2024, at 8:04 AM, maobibo wrote:

On 2024/2/26 下午2:12, Huacai Chen wrote:

On Mon, Feb 26, 2024 at 10:04 AM maobibo  wrote:




On 2024/2/24 下午5:13, Huacai Chen wrote:

Hi, Bibo,

On Thu, Feb 22, 2024 at 11:28 AM Bibo Mao  wrote:


Instruction cpucfg can be used to get processor features. And there
is trap exception when it is executed in VM mode, and also it is
to provide cpu features to VM. On real hardware cpucfg area 0 - 20
is used.  Here one specified area 0x40000000 -- 0x400000ff is used
for KVM hypervisor to provide PV features, and the area can be extended
for other hypervisors in future. This area will never be used for
real HW, it is only used by software.

After reading and thinking, I find that the hypercall method which is
used in our productive kernel is better than this cpucfg method.
Because hypercall is more simple and straightforward, plus we don't
worry about conflicting with the real hardware.

No, I do not think so. cpucfg is simpler than hypercall; hypercall can
only be in effect when the system runs in guest mode. In some scenarios like TCG
mode, hypercall is an illegal instruction, however cpucfg can work.

Nearly all architectures use hypercall except x86 for its historical

Only x86 support multiple hypervisors and there is multiple hypervisor
in x86 only. It is an advantage, not historical reason.


I do believe that all those stuff should not be exposed to guest user space
for security reasons.
Can you add PLV checking when cpucfg 0x40000000-0x400000FF is emulated? 
If it is user mode the return value is zero, and if it is kernel mode the emulated 
value will be returned. It can avoid information leaking.




Also for different implementations of hypervisors they may have different
PV features behavior, using hypcall to perform feature detection
can pass more information to help us cope with hypervisor diversity.
How can different hypervisors be detected in the first place?  On x86 MSRs are 
used for detection of all hypervisors, and on ARM64 hyperv uses 
acpi_gbl_FADT and kvm forcibly uses smc; host mode can execute the smc 
instruction without an exception on ARM64.


I do not know why hypercall is better than cpucfg on LoongArch; cpucfg 
is a basic instruction however hypercall is not, it is part of the LVZ feature.





reasons. If we use CPUCFG, then the hypervisor information is
unnecessarily leaked to userspace, and this may be a security issue.
Meanwhile, I don't think TCG mode needs PV features.

Besides PV features, there are other features that differ from real hw, such
as virtio devices and the virtual interrupt controller.


Those are *device* level information, they must be passed in firmware
interfaces to keep processor emulation sane.
File arch/x86/hyperv/hv_apic.c can be referenced, apic features comes 
from ms_hyperv.hints and HYPERV_CPUID_ENLIGHTMENT_INFO cpuid info, not 
must be passed by firmware interface.


Regards
Bibo Mao


Thanks



Regards
Bibo Mao



I consulted with Jiaxun before, and maybe he can give some more comments.



Extioi virtualization extension will be added later, cpucfg can be used
to get extioi features. It is unlikely that extioi driver depends on
PARA_VIRT macro if hypercall is used to get features.

CPUCFG is per-core information, if we really need something about
extioi, it should be in iocsr (LOONGARCH_IOCSR_FEATURES).


Huacai



Regards
Bibo Mao



Huacai



Signed-off-by: Bibo Mao 
---
arch/loongarch/include/asm/inst.h  |  1 +
arch/loongarch/include/asm/loongarch.h | 10 ++
arch/loongarch/kvm/exit.c  | 46 +-
3 files changed, 41 insertions(+), 16 deletions(-)

diff --git a/arch/loongarch/include/asm/inst.h 
b/arch/loongarch/include/asm/inst.h
index d8f637f9e400..ad120f924905 100644
--- a/arch/loongarch/include/asm/inst.h
+++ b/arch/loongarch/include/asm/inst.h
@@ -67,6 +67,7 @@ enum reg2_op {
   revhd_op= 0x11,
   extwh_op= 0x16,
   extwb_op= 0x17,
+   cpucfg_op   = 0x1b,
   iocsrrdb_op = 0x19200,
   iocsrrdh_op = 0x19201,
   iocsrrdw_op = 0x19202,
diff --git a/arch/loongarch/include/asm/loongarch.h 
b/arch/loongarch/include/asm/loongarch.h
index 46366e783c84..a1d22e8b6f94 100644
--- a/arch/loongarch/include/asm/loongarch.h
+++ b/arch/loongarch/include/asm/loongarch.h
@@ -158,6 +158,16 @@
#define  CPUCFG48_VFPU_CG  BIT(2)
#define  CPUCFG48_RAM_CG   BIT(3)

+/*
+ * cpucfg index area: 0x40000000 -- 0x400000ff
+ * SW emulation for KVM hypervisor
+ */
+#define CPUCFG_KVM_BASE			0x40000000UL
+#define CPUCFG_KVM_SIZE			0x100
+#define CPUCFG_KVM_SIG			CPUCFG_KVM_BASE
+#define  KVM_SIGNATURE			"KVM\0"
+#define CPUCFG_KVM_FEATURE		(CPUCFG_KVM_BASE + 4)
+
#ifndef __ASSEMBLY__

/* CSR */
diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c
index 923bbca9bd22..6a38fd59d
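
On the guest side, the CPUCFG_KVM_FEATURE leaf defined above is meant to be
read the same way as the signature leaf; a hedged sketch, where the
KVM_FEATURE_PV_IPI bit position is illustrative rather than taken from this
patch:

#define KVM_FEATURE_PV_IPI	BIT(1)	/* illustrative bit assignment */

static bool kvm_pv_ipi_supported(void)
{
	unsigned int feature = read_cpucfg(CPUCFG_KVM_FEATURE);	/* base + 4 */

	return !!(feature & KVM_FEATURE_PV_IPI);
}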

Re: [PATCH v5 3/6] LoongArch: KVM: Add cpucfg area for kvm hypervisor

2024-02-26 Thread maobibo




On 2024/2/26 下午2:12, Huacai Chen wrote:

On Mon, Feb 26, 2024 at 10:04 AM maobibo  wrote:




On 2024/2/24 下午5:13, Huacai Chen wrote:

Hi, Bibo,

On Thu, Feb 22, 2024 at 11:28 AM Bibo Mao  wrote:


Instruction cpucfg can be used to get processor features. And there
is trap exception when it is executed in VM mode, and also it is
to provide cpu features to VM. On real hardware cpucfg area 0 - 20
is used.  Here one specified area 0x40000000 -- 0x400000ff is used
for KVM hypervisor to provide PV features, and the area can be extended
for other hypervisors in future. This area will never be used for
real HW, it is only used by software.

After reading and thinking, I find that the hypercall method which is
used in our productive kernel is better than this cpucfg method.
Because hypercall is more simple and straightforward, plus we don't
worry about conflicting with the real hardware.

No, I do not think so. cpucfg is simpler than hypercall; hypercall can
only be in effect when the system runs in guest mode. In some scenarios like TCG
mode, hypercall is an illegal instruction, however cpucfg can work.

Nearly all architectures use hypercall except x86 for its historical
Only x86 support multiple hypervisors and there is multiple hypervisor 
in x86 only. It is an advantage, not historical reason.



reasons. If we use CPUCFG, then the hypervisor information is
unnecessarily leaked to userspace, and this may be a security issue.
Meanwhile, I don't think TCG mode needs PV features.
Besides PV features, there are other features that differ from real hw, such 
as virtio devices and the virtual interrupt controller.


Regards
Bibo Mao



I consulted with Jiaxun before, and maybe he can give some more comments.



Extioi virtualization extension will be added later, cpucfg can be used
to get extioi features. It is unlikely that extioi driver depends on
PARA_VIRT macro if hypercall is used to get features.

CPUCFG is per-core information, if we really need something about
extioi, it should be in iocsr (LOONGARCH_IOCSR_FEATURES).


Huacai



Regards
Bibo Mao



Huacai



Signed-off-by: Bibo Mao 
---
   arch/loongarch/include/asm/inst.h  |  1 +
   arch/loongarch/include/asm/loongarch.h | 10 ++
   arch/loongarch/kvm/exit.c  | 46 +-
   3 files changed, 41 insertions(+), 16 deletions(-)

diff --git a/arch/loongarch/include/asm/inst.h 
b/arch/loongarch/include/asm/inst.h
index d8f637f9e400..ad120f924905 100644
--- a/arch/loongarch/include/asm/inst.h
+++ b/arch/loongarch/include/asm/inst.h
@@ -67,6 +67,7 @@ enum reg2_op {
  revhd_op= 0x11,
  extwh_op= 0x16,
  extwb_op= 0x17,
+   cpucfg_op   = 0x1b,
  iocsrrdb_op = 0x19200,
  iocsrrdh_op = 0x19201,
  iocsrrdw_op = 0x19202,
diff --git a/arch/loongarch/include/asm/loongarch.h 
b/arch/loongarch/include/asm/loongarch.h
index 46366e783c84..a1d22e8b6f94 100644
--- a/arch/loongarch/include/asm/loongarch.h
+++ b/arch/loongarch/include/asm/loongarch.h
@@ -158,6 +158,16 @@
   #define  CPUCFG48_VFPU_CG  BIT(2)
   #define  CPUCFG48_RAM_CG   BIT(3)

+/*
+ * cpucfg index area: 0x40000000 -- 0x400000ff
+ * SW emulation for KVM hypervisor
+ */
+#define CPUCFG_KVM_BASE			0x40000000UL
+#define CPUCFG_KVM_SIZE			0x100
+#define CPUCFG_KVM_SIG			CPUCFG_KVM_BASE
+#define  KVM_SIGNATURE			"KVM\0"
+#define CPUCFG_KVM_FEATURE		(CPUCFG_KVM_BASE + 4)
+
   #ifndef __ASSEMBLY__

   /* CSR */
diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c
index 923bbca9bd22..6a38fd59d86d 100644
--- a/arch/loongarch/kvm/exit.c
+++ b/arch/loongarch/kvm/exit.c
@@ -206,10 +206,37 @@ int kvm_emu_idle(struct kvm_vcpu *vcpu)
  return EMULATE_DONE;
   }

-static int kvm_trap_handle_gspr(struct kvm_vcpu *vcpu)
+static int kvm_emu_cpucfg(struct kvm_vcpu *vcpu, larch_inst inst)
   {
  int rd, rj;
  unsigned int index;
+
+   rd = inst.reg2_format.rd;
+   rj = inst.reg2_format.rj;
+   ++vcpu->stat.cpucfg_exits;
+   index = vcpu->arch.gprs[rj];
+
+   /*
+* By LoongArch Reference Manual 2.2.10.5
+* Return value is 0 for undefined cpucfg index
+*/
+   switch (index) {
+   case 0 ... (KVM_MAX_CPUCFG_REGS - 1):
+   vcpu->arch.gprs[rd] = vcpu->arch.cpucfg[index];
+   break;
+   case CPUCFG_KVM_SIG:
+   vcpu->arch.gprs[rd] = *(unsigned int *)KVM_SIGNATURE;
+   break;
+   default:
+   vcpu->arch.gprs[rd] = 0;
+   break;
+   }
+
+   return EMULATE_DONE;
+}
+
+static int kvm_trap_handle_gspr(struct kvm_vcpu *vcpu)
+{
  unsigned long curr_pc;
  larch_inst inst;
  enum emulation_result er = EMULATE_DONE;
@@ -224,21 +251,8 @@ static int kvm_trap_handle_gspr(struct kvm_vcpu *vcpu)
  er = EM

Re: [PATCH v5 3/6] LoongArch: KVM: Add cpucfg area for kvm hypervisor

2024-02-26 Thread maobibo




On 2024/2/26 下午1:25, WANG Xuerui wrote:

Hi,

On 2/26/24 10:04, maobibo wrote:

On 2024/2/24 下午5:13, Huacai Chen wrote:

Hi, Bibo,

On Thu, Feb 22, 2024 at 11:28 AM Bibo Mao  wrote:


Instruction cpucfg can be used to get processor features. And there
is trap exception when it is executed in VM mode, and also it is
to provide cpu features to VM. On real hardware cpucfg area 0 - 20
is used.  Here one specified area 0x40000000 -- 0x400000ff is used
for KVM hypervisor to provide PV features, and the area can be extended
for other hypervisors in future. This area will never be used for
real HW, it is only used by software.

After reading and thinking, I find that the hypercall method which is
used in our productive kernel is better than this cpucfg method.
Because hypercall is more simple and straightforward, plus we don't
worry about conflicting with the real hardware.

No, I do not think so. cpucfg is simpler than hypercall; hypercall can
only be in effect when the system runs in guest mode. In some scenarios like TCG
mode, hypercall is an illegal instruction, however cpucfg can work.


While the CPUCFG instruction is universally available, it's also 
unprivileged, so any additional CPUCFG behavior also automatically 
becomes UAPI, which likely isn't what you expect. Hypervisor 
implementation details shouldn't be leaked to userland because it has no 
reason to care -- even though userland learns about the capabilities, it 
cannot actually access the resources, because relevant CSRs and/or 
instructions are privileged. Worse, the unnecessary exposure of 
information could be a problem security-wise.

cpucfg is read-only and used to represent current hw cpu features,
so why do you think there is a security issue?  Is there a security issue with 
cpucfg2 and cpucfg6, since they can be accessed in user space also?


PMU feature is defined in cpucfg6, PMU driver is written in kernel mode.


A possible way to preserve the unprivileged CPUCFG behavior would be 
acting differently based on guest CSR.CRMD.PLV: only returning data for 
the new configuration space when guest is not in PLV3. But this behavior 
isn't explicitly allowed nor disallowed in the LoongArch manuals, and is 
in my opinion unnecessarily complex.
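
For reference, a minimal sketch of that PLV-gated variant (guest_plv() is a
hypothetical helper returning the guest's CSR.CRMD.PLV value; this is not what
the posted patch does):

static unsigned long kvm_emu_read_kvm_sig(struct kvm_vcpu *vcpu)
{
	/* hide the software-defined leaf from guest user mode (PLV3) */
	if (guest_plv(vcpu) == PLV_USER)
		return 0;

	return *(unsigned int *)KVM_SIGNATURE;	/* "KVM\0" for privileged guest code */
}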


And regarding the lack of hypcall support from QEMU system mode 
emulation on TCG, I'd argue it's simply a matter of adding support in 
target/loongarch64. This would be attractive because it will enable easy 
development and testing of hypervisor software with QEMU -- both locally 
and in CI.

Hypercall is part of the hardware-assisted virtualization feature LVZ; do you think
only adding the hypercall instruction without LVZ is possible?



Extioi virtualization extension will be added later, cpucfg can be 
used to get extioi features. It is unlikely that extioi driver depends 
on PARA_VIRT macro if hypercall is used to get features.
And the EXTIOI feature too isn't something usable from unprivileged 
code, so I don't think it will affect the conclusions above.

Sorry, I do not know what you mean.


Regards
Bibo Mao








Re: [PATCH v5 6/6] LoongArch: Add pv ipi support on LoongArch system

2024-02-25 Thread maobibo




On 2024/2/24 下午5:19, Huacai Chen wrote:

Hi, Bibo,

On Thu, Feb 22, 2024 at 11:28 AM Bibo Mao  wrote:


On LoongArch systems, the ipi hw uses iocsr registers: there is one iocsr
register access on ipi sending, and two iocsr accesses on ipi receiving
in the ipi interrupt handler. In VM mode every iocsr access causes the
VM to trap into the hypervisor, so one ipi hw notification results in
three traps.

PV ipi is added for the VM: the hypercall instruction is used on the ipi
sender side, and the hypervisor injects an SWI into the destination vcpu.
In the SWI interrupt handler only the estat CSR register is written to
clear the irq, and estat CSR accesses do not trap into the hypervisor.
So with pv ipi supported there is one trap on the pv ipi sender side and
no trap on the ipi receiver side; an ipi notification costs only one trap.

Also this patch adds ipi multicast support; the method is similar to x86.
With ipi multicast support, an ipi notification can be sent to at most
128 vcpus at one time, which greatly reduces the number of traps into
the hypervisor.

Signed-off-by: Bibo Mao 
---
  arch/loongarch/include/asm/hardirq.h   |   1 +
  arch/loongarch/include/asm/kvm_host.h  |   1 +
  arch/loongarch/include/asm/kvm_para.h  | 123 +
  arch/loongarch/include/asm/loongarch.h |   1 +
  arch/loongarch/kernel/irq.c|   2 +-
  arch/loongarch/kernel/paravirt.c   | 112 ++
  arch/loongarch/kernel/setup.c  |   1 +
  arch/loongarch/kernel/smp.c|   2 +-
  arch/loongarch/kvm/exit.c  |  73 ++-
  arch/loongarch/kvm/vcpu.c  |   1 +
  10 files changed, 313 insertions(+), 4 deletions(-)

diff --git a/arch/loongarch/include/asm/hardirq.h 
b/arch/loongarch/include/asm/hardirq.h
index 9f0038e19c7f..b26d596a73aa 100644
--- a/arch/loongarch/include/asm/hardirq.h
+++ b/arch/loongarch/include/asm/hardirq.h
@@ -21,6 +21,7 @@ enum ipi_msg_type {
  typedef struct {
 unsigned int ipi_irqs[NR_IPI];
 unsigned int __softirq_pending;
+   atomic_t message ____cacheline_aligned_in_smp;
  } ____cacheline_aligned irq_cpustat_t;

  DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
diff --git a/arch/loongarch/include/asm/kvm_host.h 
b/arch/loongarch/include/asm/kvm_host.h
index 3ba16ef1fe69..0b96c6303cf7 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -43,6 +43,7 @@ struct kvm_vcpu_stat {
 u64 idle_exits;
 u64 cpucfg_exits;
 u64 signal_exits;
+   u64 hypercall_exits;
  };

  #define KVM_MEM_HUGEPAGE_CAPABLE   (1UL << 0)
diff --git a/arch/loongarch/include/asm/kvm_para.h 
b/arch/loongarch/include/asm/kvm_para.h
index af5d677a9052..a82bffbbf8a1 100644
--- a/arch/loongarch/include/asm/kvm_para.h
+++ b/arch/loongarch/include/asm/kvm_para.h
@@ -8,6 +8,9 @@
  #define HYPERVISOR_KVM 1
  #define HYPERVISOR_VENDOR_SHIFT8
  #define HYPERCALL_CODE(vendor, code)   ((vendor << HYPERVISOR_VENDOR_SHIFT) + 
code)
+#define KVM_HCALL_CODE_PV_SERVICE  0
+#define KVM_HCALL_PV_SERVICE   HYPERCALL_CODE(HYPERVISOR_KVM, 
KVM_HCALL_CODE_PV_SERVICE)
+#define  KVM_HCALL_FUNC_PV_IPI 1

  /*
   * LoongArch hypercall return code
@@ -16,6 +19,126 @@
  #define KVM_HCALL_INVALID_CODE -1UL
  #define KVM_HCALL_INVALID_PARAMETER-2UL

+/*
+ * Hypercall interface for KVM hypervisor
+ *
+ * a0: function identifier
+ * a1-a6: args
+ * Return value will be placed in v0.
+ * Up to 6 arguments are passed in a1, a2, a3, a4, a5, a6.
+ */
+static __always_inline long kvm_hypercall(u64 fid)
+{
+   register long ret asm("v0");
+   register unsigned long fun asm("a0") = fid;
+
+   __asm__ __volatile__(
+   "hvcl "__stringify(KVM_HCALL_PV_SERVICE)
+   : "=r" (ret)
+   : "r" (fun)
+   : "memory"
+   );
+
+   return ret;
+}
+
+static __always_inline long kvm_hypercall1(u64 fid, unsigned long arg0)
+{
+   register long ret asm("v0");
+   register unsigned long fun asm("a0") = fid;
+   register unsigned long a1  asm("a1") = arg0;
+
+   __asm__ __volatile__(
+   "hvcl "__stringify(KVM_HCALL_PV_SERVICE)
+   : "=r" (ret)
+   : "r" (fun), "r" (a1)
+   : "memory"
+   );
+
+   return ret;
+}
+
+static __always_inline long kvm_hypercall2(u64 fid,
+   unsigned long arg0, unsigned long arg1)
+{
+   register long ret asm("v0");
+   register unsigned long fun asm("a0") = fid;
+   register unsigned long a1  asm("a1") = arg0;
+   register unsigned long a2  asm("a2") = arg1;
+
+   __asm__ __volatile__(
+   "hvcl "__stringify(KVM_HCALL_PV_SERVICE)
+   : "=r" (ret)
+   : "r" (fun), "r" (a1), "r" (a2)
+   : "memory"
+   );
+
+   return ret;
+}
+
+static __always_inline long 
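
As a usage illustration, the multicast send path built on these wrappers could
look roughly like the sketch below. The argument layout (two 64-bit destination
bitmaps in a1/a2 plus the lowest cpu id in a3) is my reading of this patch and
should be checked against the patch itself; the sketch also ignores the
logical-to-physical cpu mapping and the case where the mask spans more than
128 cpus:

static void pv_send_ipi_mask(const struct cpumask *mask, unsigned int action)
{
	int cpu, min;
	u64 bitmap[2] = { 0, 0 };

	if (cpumask_empty(mask))
		return;

	/* record the pending action per target cpu (consumed by the SWI
	 * handler), then pack destinations relative to the smallest cpu id */
	min = cpumask_first(mask);
	for_each_cpu(cpu, mask) {
		atomic_or(BIT(action), &per_cpu(irq_stat, cpu).message);
		__set_bit(cpu - min, (unsigned long *)bitmap);
	}

	/* one trap delivers the IPI to every vcpu in the bitmap */
	kvm_hypercall3(KVM_HCALL_FUNC_PV_IPI, bitmap[0], bitmap[1], min);
}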

Re: [PATCH v5 4/6] LoongArch: Add paravirt interface for guest kernel

2024-02-25 Thread maobibo




On 2024/2/24 下午5:15, Huacai Chen wrote:

Hi, Bibo,

On Thu, Feb 22, 2024 at 11:28 AM Bibo Mao  wrote:


The paravirt interface pv_ipi_init() is added here for the guest kernel; it
first checks whether the system runs in VM mode. If the kernel runs in VM mode,
it calls kvm_para_available() to detect the current VMM type.
Now only the KVM VMM type is detected, and the paravirt functions can work only if
the current VMM is the KVM hypervisor, since KVM is the only hypervisor
supported on LoongArch now.

pv_ipi_init() has no effect yet; it is a dummy function.

Signed-off-by: Bibo Mao 
---
  arch/loongarch/Kconfig|  9 
  arch/loongarch/include/asm/kvm_para.h |  7 
  arch/loongarch/include/asm/paravirt.h | 27 
  .../include/asm/paravirt_api_clock.h  |  1 +
  arch/loongarch/kernel/Makefile|  1 +
  arch/loongarch/kernel/paravirt.c  | 41 +++
  arch/loongarch/kernel/setup.c |  1 +
  7 files changed, 87 insertions(+)
  create mode 100644 arch/loongarch/include/asm/paravirt.h
  create mode 100644 arch/loongarch/include/asm/paravirt_api_clock.h
  create mode 100644 arch/loongarch/kernel/paravirt.c

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 929f68926b34..fdaae9a0435c 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -587,6 +587,15 @@ config CPU_HAS_PREFETCH
 bool
 default y

+config PARAVIRT
+   bool "Enable paravirtualization code"
+   depends on AS_HAS_LVZ_EXTENSION
+   help
+  This changes the kernel so it can modify itself when it is run
+ under a hypervisor, potentially improving performance significantly
+ over full virtualization.  However, when run without a hypervisor
+ the kernel is theoretically slower and slightly larger.
+
  config ARCH_SUPPORTS_KEXEC
 def_bool y

diff --git a/arch/loongarch/include/asm/kvm_para.h 
b/arch/loongarch/include/asm/kvm_para.h
index d48f993ae206..af5d677a9052 100644
--- a/arch/loongarch/include/asm/kvm_para.h
+++ b/arch/loongarch/include/asm/kvm_para.h
@@ -2,6 +2,13 @@
  #ifndef _ASM_LOONGARCH_KVM_PARA_H
  #define _ASM_LOONGARCH_KVM_PARA_H

+/*
+ * Hypercall code field
+ */
+#define HYPERVISOR_KVM 1
+#define HYPERVISOR_VENDOR_SHIFT8
+#define HYPERCALL_CODE(vendor, code)   ((vendor << HYPERVISOR_VENDOR_SHIFT) + 
code)
+
  /*
   * LoongArch hypercall return code
   */
diff --git a/arch/loongarch/include/asm/paravirt.h 
b/arch/loongarch/include/asm/paravirt.h
new file mode 100644
index ..58f7b7b89f2c
--- /dev/null
+++ b/arch/loongarch/include/asm/paravirt.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_LOONGARCH_PARAVIRT_H
+#define _ASM_LOONGARCH_PARAVIRT_H
+
+#ifdef CONFIG_PARAVIRT
+#include 
+struct static_key;
+extern struct static_key paravirt_steal_enabled;
+extern struct static_key paravirt_steal_rq_enabled;
+
+u64 dummy_steal_clock(int cpu);
+DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock);
+
+static inline u64 paravirt_steal_clock(int cpu)
+{
+   return static_call(pv_steal_clock)(cpu);
+}
+
+int pv_ipi_init(void);
+#else
+static inline int pv_ipi_init(void)
+{
+   return 0;
+}
+
+#endif // CONFIG_PARAVIRT
+#endif
diff --git a/arch/loongarch/include/asm/paravirt_api_clock.h 
b/arch/loongarch/include/asm/paravirt_api_clock.h
new file mode 100644
index ..65ac7cee0dad
--- /dev/null
+++ b/arch/loongarch/include/asm/paravirt_api_clock.h
@@ -0,0 +1 @@
+#include 
diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile
index 3c808c680370..662e6e9de12d 100644
--- a/arch/loongarch/kernel/Makefile
+++ b/arch/loongarch/kernel/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_MODULES) += module.o module-sections.o
  obj-$(CONFIG_STACKTRACE)   += stacktrace.o

  obj-$(CONFIG_PROC_FS)  += proc.o
+obj-$(CONFIG_PARAVIRT) += paravirt.o

  obj-$(CONFIG_SMP)  += smp.o

diff --git a/arch/loongarch/kernel/paravirt.c b/arch/loongarch/kernel/paravirt.c
new file mode 100644
index ..5cf794e8490f
--- /dev/null
+++ b/arch/loongarch/kernel/paravirt.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct static_key paravirt_steal_enabled;
+struct static_key paravirt_steal_rq_enabled;
+
+static u64 native_steal_clock(int cpu)
+{
+   return 0;
+}
+
+DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock);
+
+static bool kvm_para_available(void)
+{
+   static int hypervisor_type;
+   int config;
+
+   if (!hypervisor_type) {
+   config = read_cpucfg(CPUCFG_KVM_SIG);
+   if (!memcmp(&config, KVM_SIGNATURE, 4))
+   hypervisor_type = HYPERVISOR_KVM;
+   }
+
+   return hypervisor_type == HYPERVISOR_KVM;
+}
+
+int __init pv_ipi_init(void)
+{
+   if (!cpu_has_hypervisor)
+   
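
For readers wondering how the pv_steal_clock static call defined above stops
being a dummy: a later patch in the series (not this one) is expected to flip
it roughly as sketched below, using the existing static_call/static_key
machinery; paravt_steal_clock stands for the real guest-side reader and is
named here only for illustration:

static int __init pv_time_init(void)
{
	if (!kvm_para_available())
		return 0;

	/* from here on paravirt_steal_clock() calls the real reader */
	static_call_update(pv_steal_clock, paravt_steal_clock);

	static_key_slow_inc(&paravirt_steal_enabled);
	if (IS_ENABLED(CONFIG_PARAVIRT_TIME_ACCOUNTING))
		static_key_slow_inc(&paravirt_steal_rq_enabled);

	return 0;
}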

Re: [PATCH v5 3/6] LoongArch: KVM: Add cpucfg area for kvm hypervisor

2024-02-25 Thread maobibo




On 2024/2/24 下午5:13, Huacai Chen wrote:

Hi, Bibo,

On Thu, Feb 22, 2024 at 11:28 AM Bibo Mao  wrote:


The cpucfg instruction can be used to get processor features, and it
traps with an exception when executed in VM mode, so it can also be
used to provide cpu features to the VM. On real hardware the cpucfg
area 0 - 20 is used.  Here one specified area 0x4000 -- 0x40ff is
reserved for the KVM hypervisor to provide PV features, and the area
can be extended for other hypervisors in the future. This area will
never be used by real HW; it is only used by software.

After reading and thinking, I find that the hypercall method which is
used in our productive kernel is better than this cpucfg method.
Because hypercall is more simple and straightforward, plus we don't
worry about conflicting with the real hardware.
About cpucfg area 0x4000 -- 0x40ff, I have negotiated with chip 
designer. This area will never be used with real hardware.


Regards
Bibo Mao


Huacai



Signed-off-by: Bibo Mao 
---
  arch/loongarch/include/asm/inst.h  |  1 +
  arch/loongarch/include/asm/loongarch.h | 10 ++
  arch/loongarch/kvm/exit.c  | 46 +-
  3 files changed, 41 insertions(+), 16 deletions(-)

diff --git a/arch/loongarch/include/asm/inst.h 
b/arch/loongarch/include/asm/inst.h
index d8f637f9e400..ad120f924905 100644
--- a/arch/loongarch/include/asm/inst.h
+++ b/arch/loongarch/include/asm/inst.h
@@ -67,6 +67,7 @@ enum reg2_op {
 revhd_op= 0x11,
 extwh_op= 0x16,
 extwb_op= 0x17,
+   cpucfg_op   = 0x1b,
 iocsrrdb_op = 0x19200,
 iocsrrdh_op = 0x19201,
 iocsrrdw_op = 0x19202,
diff --git a/arch/loongarch/include/asm/loongarch.h 
b/arch/loongarch/include/asm/loongarch.h
index 46366e783c84..a1d22e8b6f94 100644
--- a/arch/loongarch/include/asm/loongarch.h
+++ b/arch/loongarch/include/asm/loongarch.h
@@ -158,6 +158,16 @@
  #define  CPUCFG48_VFPU_CG  BIT(2)
  #define  CPUCFG48_RAM_CG   BIT(3)

+/*
+ * cpucfg index area: 0x4000 -- 0x40ff
+ * SW emulation for KVM hypervirsor
+ */
+#define CPUCFG_KVM_BASE0x4000UL
+#define CPUCFG_KVM_SIZE0x100
+#define CPUCFG_KVM_SIG CPUCFG_KVM_BASE
+#define  KVM_SIGNATURE "KVM\0"
+#define CPUCFG_KVM_FEATURE (CPUCFG_KVM_BASE + 4)
+
  #ifndef __ASSEMBLY__

  /* CSR */
diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c
index 923bbca9bd22..6a38fd59d86d 100644
--- a/arch/loongarch/kvm/exit.c
+++ b/arch/loongarch/kvm/exit.c
@@ -206,10 +206,37 @@ int kvm_emu_idle(struct kvm_vcpu *vcpu)
 return EMULATE_DONE;
  }

-static int kvm_trap_handle_gspr(struct kvm_vcpu *vcpu)
+static int kvm_emu_cpucfg(struct kvm_vcpu *vcpu, larch_inst inst)
  {
 int rd, rj;
 unsigned int index;
+
+   rd = inst.reg2_format.rd;
+   rj = inst.reg2_format.rj;
+   ++vcpu->stat.cpucfg_exits;
+   index = vcpu->arch.gprs[rj];
+
+   /*
+* By LoongArch Reference Manual 2.2.10.5
+* Return value is 0 for undefined cpucfg index
+*/
+   switch (index) {
+   case 0 ... (KVM_MAX_CPUCFG_REGS - 1):
+   vcpu->arch.gprs[rd] = vcpu->arch.cpucfg[index];
+   break;
+   case CPUCFG_KVM_SIG:
+   vcpu->arch.gprs[rd] = *(unsigned int *)KVM_SIGNATURE;
+   break;
+   default:
+   vcpu->arch.gprs[rd] = 0;
+   break;
+   }
+
+   return EMULATE_DONE;
+}
+
+static int kvm_trap_handle_gspr(struct kvm_vcpu *vcpu)
+{
 unsigned long curr_pc;
 larch_inst inst;
 enum emulation_result er = EMULATE_DONE;
@@ -224,21 +251,8 @@ static int kvm_trap_handle_gspr(struct kvm_vcpu *vcpu)
 er = EMULATE_FAIL;
 switch (((inst.word >> 24) & 0xff)) {
 case 0x0: /* CPUCFG GSPR */
-   if (inst.reg2_format.opcode == 0x1B) {
-   rd = inst.reg2_format.rd;
-   rj = inst.reg2_format.rj;
-   ++vcpu->stat.cpucfg_exits;
-   index = vcpu->arch.gprs[rj];
-   er = EMULATE_DONE;
-   /*
-* By LoongArch Reference Manual 2.2.10.5
-* return value is 0 for undefined cpucfg index
-*/
-   if (index < KVM_MAX_CPUCFG_REGS)
-   vcpu->arch.gprs[rd] = vcpu->arch.cpucfg[index];
-   else
-   vcpu->arch.gprs[rd] = 0;
-   }
+   if (inst.reg2_format.opcode == cpucfg_op)
+   er = kvm_emu_cpucfg(vcpu, inst);
 break;
 case 0x4: /* CSR{RD,WR,XCHG} GSPR */
 er = kvm_handle_csr(vcpu, inst);
--
2.39.3






Re: [PATCH v5 3/6] LoongArch: KVM: Add cpucfg area for kvm hypervisor

2024-02-25 Thread maobibo




On 2024/2/24 下午5:13, Huacai Chen wrote:

Hi, Bibo,

On Thu, Feb 22, 2024 at 11:28 AM Bibo Mao  wrote:


The cpucfg instruction can be used to get processor features, and it
traps with an exception when executed in VM mode, so it can also be
used to provide cpu features to the VM. On real hardware the cpucfg
area 0 - 20 is used.  Here one specified area 0x4000 -- 0x40ff is
reserved for the KVM hypervisor to provide PV features, and the area
can be extended for other hypervisors in the future. This area will
never be used by real HW; it is only used by software.

After reading and thinking, I find that the hypercall method which is
used in our productive kernel is better than this cpucfg method.
Because hypercall is more simple and straightforward, plus we don't
worry about conflicting with the real hardware.

No, I do not think so. cpucfg is simpler than hypercall; hypercall can
only take effect when the system runs in guest mode. In some scenarios,
like TCG mode, hypercall is an illegal instruction, whereas cpucfg still
works.


The extioi virtualization extension will be added later, and cpucfg can be
used to get the extioi features. The extioi driver should not have to depend
on the PARA_VIRT macro, which it would if hypercall were used to get the
features.
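
To make the TCG point concrete, here is a minimal user-space sketch of the
cpucfg-style probe, with read_cpucfg() stubbed out and the index value chosen
only for illustration (the real window base comes from the patch). On hardware
or under TCG an unimplemented cpucfg index simply reads back as 0 and the
signature test fails cleanly, while an hvcl-based probe would first have to
survive an invalid-instruction exception.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define CPUCFG_KVM_SIG  0x40000000u     /* placeholder index for this sketch */
#define KVM_SIGNATURE   "KVM\0"

/* Stub: pretend we run under KVM and return the packed "KVM\0" signature. */
static uint32_t read_cpucfg(uint32_t index)
{
        uint32_t sig = 0;

        if (index == CPUCFG_KVM_SIG)
                memcpy(&sig, KVM_SIGNATURE, 4);
        return sig;
}

int main(void)
{
        uint32_t config = read_cpucfg(CPUCFG_KVM_SIG);
        int under_kvm = !memcmp(&config, KVM_SIGNATURE, 4);

        printf("hypervisor: %s\n", under_kvm ? "KVM" : "none/TCG");
        return 0;
}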


Regards
Bibo Mao



Huacai



Signed-off-by: Bibo Mao 
---
  arch/loongarch/include/asm/inst.h  |  1 +
  arch/loongarch/include/asm/loongarch.h | 10 ++
  arch/loongarch/kvm/exit.c  | 46 +-
  3 files changed, 41 insertions(+), 16 deletions(-)

diff --git a/arch/loongarch/include/asm/inst.h 
b/arch/loongarch/include/asm/inst.h
index d8f637f9e400..ad120f924905 100644
--- a/arch/loongarch/include/asm/inst.h
+++ b/arch/loongarch/include/asm/inst.h
@@ -67,6 +67,7 @@ enum reg2_op {
 revhd_op= 0x11,
 extwh_op= 0x16,
 extwb_op= 0x17,
+   cpucfg_op   = 0x1b,
 iocsrrdb_op = 0x19200,
 iocsrrdh_op = 0x19201,
 iocsrrdw_op = 0x19202,
diff --git a/arch/loongarch/include/asm/loongarch.h 
b/arch/loongarch/include/asm/loongarch.h
index 46366e783c84..a1d22e8b6f94 100644
--- a/arch/loongarch/include/asm/loongarch.h
+++ b/arch/loongarch/include/asm/loongarch.h
@@ -158,6 +158,16 @@
  #define  CPUCFG48_VFPU_CG  BIT(2)
  #define  CPUCFG48_RAM_CG   BIT(3)

+/*
+ * cpucfg index area: 0x4000 -- 0x40ff
+ * SW emulation for KVM hypervirsor
+ */
+#define CPUCFG_KVM_BASE0x4000UL
+#define CPUCFG_KVM_SIZE0x100
+#define CPUCFG_KVM_SIG CPUCFG_KVM_BASE
+#define  KVM_SIGNATURE "KVM\0"
+#define CPUCFG_KVM_FEATURE (CPUCFG_KVM_BASE + 4)
+
  #ifndef __ASSEMBLY__

  /* CSR */
diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c
index 923bbca9bd22..6a38fd59d86d 100644
--- a/arch/loongarch/kvm/exit.c
+++ b/arch/loongarch/kvm/exit.c
@@ -206,10 +206,37 @@ int kvm_emu_idle(struct kvm_vcpu *vcpu)
 return EMULATE_DONE;
  }

-static int kvm_trap_handle_gspr(struct kvm_vcpu *vcpu)
+static int kvm_emu_cpucfg(struct kvm_vcpu *vcpu, larch_inst inst)
  {
 int rd, rj;
 unsigned int index;
+
+   rd = inst.reg2_format.rd;
+   rj = inst.reg2_format.rj;
+   ++vcpu->stat.cpucfg_exits;
+   index = vcpu->arch.gprs[rj];
+
+   /*
+* By LoongArch Reference Manual 2.2.10.5
+* Return value is 0 for undefined cpucfg index
+*/
+   switch (index) {
+   case 0 ... (KVM_MAX_CPUCFG_REGS - 1):
+   vcpu->arch.gprs[rd] = vcpu->arch.cpucfg[index];
+   break;
+   case CPUCFG_KVM_SIG:
+   vcpu->arch.gprs[rd] = *(unsigned int *)KVM_SIGNATURE;
+   break;
+   default:
+   vcpu->arch.gprs[rd] = 0;
+   break;
+   }
+
+   return EMULATE_DONE;
+}
+
+static int kvm_trap_handle_gspr(struct kvm_vcpu *vcpu)
+{
 unsigned long curr_pc;
 larch_inst inst;
 enum emulation_result er = EMULATE_DONE;
@@ -224,21 +251,8 @@ static int kvm_trap_handle_gspr(struct kvm_vcpu *vcpu)
 er = EMULATE_FAIL;
 switch (((inst.word >> 24) & 0xff)) {
 case 0x0: /* CPUCFG GSPR */
-   if (inst.reg2_format.opcode == 0x1B) {
-   rd = inst.reg2_format.rd;
-   rj = inst.reg2_format.rj;
-   ++vcpu->stat.cpucfg_exits;
-   index = vcpu->arch.gprs[rj];
-   er = EMULATE_DONE;
-   /*
-* By LoongArch Reference Manual 2.2.10.5
-* return value is 0 for undefined cpucfg index
-*/
-   if (index < KVM_MAX_CPUCFG_REGS)
-   vcpu->arch.gprs[rd] = vcpu->arch.cpucfg[index];
-   else
-   vcpu->arch.gprs[rd] = 0;
-   }
+   if (inst.reg2_format.opcode == 

Re: [PATCH v4 0/6] LoongArch: Add pv ipi support on LoongArch VM

2024-02-22 Thread maobibo




On 2024/2/22 下午5:34, WANG Xuerui wrote:

On 2/17/24 11:15, maobibo wrote:

On 2024/2/15 下午6:25, WANG Xuerui wrote:

On 2/15/24 18:11, WANG Xuerui wrote:
Sorry for the late reply (and Happy Chinese New Year), and thanks 
for providing microbenchmark numbers! But it seems the more 
comprehensive CoreMark results were omitted (that's also absent in 
v3)? While the 


Of course the benchmark suite should be UnixBench instead of 
CoreMark. Lesson: don't multi-task code reviews, especially not after 
consuming beer -- a cup of coffee won't fully cancel the influence. ;-)


Where is the rule about benchmark choices like UnixBench/CoreMark for an ipi
improvement?


Sorry for the late reply. The rules are mostly unwritten, but in general 
you can think of the preference of benchmark suites as a matter of 
"effectiveness" -- the closer it's to some real workload in the wild, 
the better. Micro-benchmarks are okay for illustrating the points, but 
without demonstrating the impact on realistic workloads, a change could 
be "useless" in practice or even decrease various performance metrics 
(be that throughput or latency or anything that matters in the certain 
case), but get accepted without notice.
Yes, a micro-benchmark cannot represent the real world, however that does 
not mean UnixBench/CoreMark must be run. You need to point out what the 
negative effect of the code is, or what realistic scenario may benefit, 
and then point to a reasonable benchmark that is sensitive to IPIs rather 
than blindly naming UnixBench/CoreMark.


Regards
Bibo Mao








Re: [PATCH v4 6/6] LoongArch: Add pv ipi support on LoongArch system

2024-02-19 Thread maobibo




On 2024/2/19 下午5:02, Huacai Chen wrote:

On Mon, Feb 19, 2024 at 3:37 PM maobibo  wrote:




On 2024/2/19 下午3:16, Huacai Chen wrote:

On Mon, Feb 19, 2024 at 12:18 PM maobibo  wrote:




On 2024/2/19 上午10:45, Huacai Chen wrote:

Hi, Bibo,

On Thu, Feb 1, 2024 at 11:20 AM Bibo Mao  wrote:


On LoongArch system, ipi hw uses iocsr registers, there is one iocsr
register access on ipi sending, and two iocsr access on ipi receiving
which is ipi interrupt handler. On VM mode all iocsr registers
accessing will cause VM to trap into hypervisor. So with ipi hw
notification once there will be three times of trap.

This patch adds pv ipi support for VM, hypercall instruction is used
to ipi sender, and hypervisor will inject SWI on the VM. During SWI
interrupt handler, only estat CSR register is written to clear irq.
Estat CSR register access will not trap into hypervisor. So with pv ipi
supported, pv ipi sender will trap into hypervsor one time, pv ipi
revicer will not trap, there is only one time of trap.

Also this patch adds ipi multicast support, the method is similar with
x86. With ipi multicast support, ipi notification can be sent to at most
128 vcpus at one time. It reduces trap times into hypervisor greatly.

Signed-off-by: Bibo Mao 
---
arch/loongarch/include/asm/hardirq.h   |   1 +
arch/loongarch/include/asm/kvm_host.h  |   1 +
arch/loongarch/include/asm/kvm_para.h  | 124 +
arch/loongarch/include/asm/loongarch.h |   1 +
arch/loongarch/kernel/irq.c|   2 +-
arch/loongarch/kernel/paravirt.c   | 113 ++
arch/loongarch/kernel/smp.c|   2 +-
arch/loongarch/kvm/exit.c  |  73 ++-
arch/loongarch/kvm/vcpu.c  |   1 +
9 files changed, 314 insertions(+), 4 deletions(-)

diff --git a/arch/loongarch/include/asm/hardirq.h 
b/arch/loongarch/include/asm/hardirq.h
index 9f0038e19c7f..8a611843c1f0 100644
--- a/arch/loongarch/include/asm/hardirq.h
+++ b/arch/loongarch/include/asm/hardirq.h
@@ -21,6 +21,7 @@ enum ipi_msg_type {
typedef struct {
   unsigned int ipi_irqs[NR_IPI];
   unsigned int __softirq_pending;
+   atomic_t messages cacheline_aligned_in_smp;

Do we really need atomic_t? A plain "unsigned int" can reduce cost
significantly.

For IPI there are multiple senders and one receiver; the sender uses
atomic_fetch_or(action, >messages) and the receiver uses
atomic_xchg(>messages, 0) to clear the messages.

A synchronization mechanism is needed between the senders and the
receiver, and atomic is the simplest method.
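
As a user-space illustration of that sender/receiver pairing, the sketch below
uses plain C11 atomics in place of the kernel's atomic_t; the per-CPU
placement, the interrupt delivery itself, and the "only kick when the word was
previously empty" refinement are assumptions of this illustration rather than
quotes from the patch.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define ACTION_RESCHEDULE       1
#define ACTION_CALL_FUNCTION    2

static atomic_uint messages;            /* stands in for the per-CPU message word */

static void *sender(void *arg)
{
        unsigned int action = 1u << (unsigned long)arg;
        /* Post the action bit; the old value says whether an IPI is already pending. */
        unsigned int old = atomic_fetch_or(&messages, action);

        if (!old) {
                /* here the real code would send one hardware or PV IPI */
        }
        return NULL;
}

int main(void)
{
        pthread_t t[2];

        pthread_create(&t[0], NULL, sender, (void *)(unsigned long)ACTION_RESCHEDULE);
        pthread_create(&t[1], NULL, sender, (void *)(unsigned long)ACTION_CALL_FUNCTION);
        pthread_join(t[0], NULL);
        pthread_join(t[1], NULL);

        /* Receiver side: grab and clear all pending actions in one atomic step. */
        printf("pending actions: %#x\n", atomic_exchange(&messages, 0));
        return 0;
}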

At least from receiver side, the native IPI doesn't need atomic for
read and clear:
static u32 ipi_read_clear(int cpu)
{
  u32 action;

  /* Load the ipi register to figure out what we're supposed to do */
  action = iocsr_read32(LOONGARCH_IOCSR_IPI_STATUS);
  /* Clear the ipi register to clear the interrupt */
  iocsr_write32(action, LOONGARCH_IOCSR_IPI_CLEAR);
  wbflush();

That is because on physical hardware these are two separate IOCSR
registers, and there is no way to do an atomic read-and-clear on IOCSR
registers.

However, if the ipi message is stored in DDR memory, atomic read/clear
can be used. You can compare the cost of one iocsr_read32 + one
iocsr_write32 + wbflush with one atomic_xchg(>messages, 0).

Atomic ops are of course cheaper than iocsr, but unsigned int is even cheaper.
Can you give an example of how to use unsigned int for the ipi sender and 
the ipi handler? When the ipi message is cleared, the other ipi senders need 
to know immediately, otherwise they will issue the hypercall notification 
unconditionally.


Regards
Bibo Mao





Regards
Bibo Mao


  return action;
}




} cacheline_aligned irq_cpustat_t;

DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
diff --git a/arch/loongarch/include/asm/kvm_host.h 
b/arch/loongarch/include/asm/kvm_host.h
index 57399d7cf8b7..1bf927e2bfac 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -43,6 +43,7 @@ struct kvm_vcpu_stat {
   u64 idle_exits;
   u64 cpucfg_exits;
   u64 signal_exits;
+   u64 hvcl_exits;

hypercall_exits is better.

yeap, hypercall_exits is better, will fix in next version.



};

#define KVM_MEM_HUGEPAGE_CAPABLE   (1UL << 0)
diff --git a/arch/loongarch/include/asm/kvm_para.h 
b/arch/loongarch/include/asm/kvm_para.h
index 41200e922a82..a25a84e372b9 100644
--- a/arch/loongarch/include/asm/kvm_para.h
+++ b/arch/loongarch/include/asm/kvm_para.h
@@ -9,6 +9,10 @@
#define HYPERVISOR_VENDOR_SHIFT8
#define HYPERCALL_CODE(vendor, code)   ((vendor << HYPERVISOR_VENDOR_SHIFT) 
+ code)

+#define KVM_HC_CODE_SERVICE0
+#define KVM_HC_SERVICE HYPERCALL_CODE(HYPERVISOR_KVM, 
KVM_HC_CODE_SERVICE)
+#define  KVM_HC_FUNC_IPI   1

Change HC to HCALL is bette

Re: [PATCH v4 4/6] LoongArch: Add paravirt interface for guest kernel

2024-02-19 Thread maobibo




On 2024/2/19 下午5:38, Huacai Chen wrote:

On Mon, Feb 19, 2024 at 5:21 PM maobibo  wrote:




On 2024/2/19 下午4:48, Huacai Chen wrote:

On Mon, Feb 19, 2024 at 12:11 PM maobibo  wrote:




On 2024/2/19 上午10:42, Huacai Chen wrote:

Hi, Bibo,

On Thu, Feb 1, 2024 at 11:19 AM Bibo Mao  wrote:


The patch adds paravirt interface for guest kernel, function
pv_guest_initi() firstly checks whether system runs on VM mode. If kernel
runs on VM mode, it will call function kvm_para_available() to detect
whether current VMM is KVM hypervisor. And the paravirt function can work
only if current VMM is KVM hypervisor, since there is only KVM hypervisor
supported on LoongArch now.

This patch only adds paravirt interface for guest kernel, however there
is not effective pv functions added here.

Signed-off-by: Bibo Mao 
---
arch/loongarch/Kconfig|  9 
arch/loongarch/include/asm/kvm_para.h |  7 
arch/loongarch/include/asm/paravirt.h | 27 
.../include/asm/paravirt_api_clock.h  |  1 +
arch/loongarch/kernel/Makefile|  1 +
arch/loongarch/kernel/paravirt.c  | 41 +++
arch/loongarch/kernel/setup.c |  2 +
7 files changed, 88 insertions(+)
create mode 100644 arch/loongarch/include/asm/paravirt.h
create mode 100644 arch/loongarch/include/asm/paravirt_api_clock.h
create mode 100644 arch/loongarch/kernel/paravirt.c

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 10959e6c3583..817a56dff80f 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -585,6 +585,15 @@ config CPU_HAS_PREFETCH
   bool
   default y

+config PARAVIRT
+   bool "Enable paravirtualization code"
+   depends on AS_HAS_LVZ_EXTENSION
+   help
+  This changes the kernel so it can modify itself when it is run
+ under a hypervisor, potentially improving performance significantly
+ over full virtualization.  However, when run without a hypervisor
+ the kernel is theoretically slower and slightly larger.
+
config ARCH_SUPPORTS_KEXEC
   def_bool y

diff --git a/arch/loongarch/include/asm/kvm_para.h 
b/arch/loongarch/include/asm/kvm_para.h
index 9425d3b7e486..41200e922a82 100644
--- a/arch/loongarch/include/asm/kvm_para.h
+++ b/arch/loongarch/include/asm/kvm_para.h
@@ -2,6 +2,13 @@
#ifndef _ASM_LOONGARCH_KVM_PARA_H
#define _ASM_LOONGARCH_KVM_PARA_H

+/*
+ * Hypcall code field
+ */
+#define HYPERVISOR_KVM 1
+#define HYPERVISOR_VENDOR_SHIFT8
+#define HYPERCALL_CODE(vendor, code)   ((vendor << HYPERVISOR_VENDOR_SHIFT) + 
code)
+
/*
 * LoongArch hypcall return code
 */
diff --git a/arch/loongarch/include/asm/paravirt.h 
b/arch/loongarch/include/asm/paravirt.h
new file mode 100644
index ..b64813592ba0
--- /dev/null
+++ b/arch/loongarch/include/asm/paravirt.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_LOONGARCH_PARAVIRT_H
+#define _ASM_LOONGARCH_PARAVIRT_H
+
+#ifdef CONFIG_PARAVIRT
+#include 
+struct static_key;
+extern struct static_key paravirt_steal_enabled;
+extern struct static_key paravirt_steal_rq_enabled;
+
+u64 dummy_steal_clock(int cpu);
+DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock);
+
+static inline u64 paravirt_steal_clock(int cpu)
+{
+   return static_call(pv_steal_clock)(cpu);
+}

The steal time code can be removed in this patch, I think.


Originally I want to remove this piece of code, but it fails to compile
if CONFIG_PARAVIRT is selected. Here is reference code, function
paravirt_steal_clock() must be defined if CONFIG_PARAVIRT is selected.

static __always_inline u64 steal_account_process_time(u64 maxtime)
{
#ifdef CONFIG_PARAVIRT
   if (static_key_false(&paravirt_steal_enabled)) {
   u64 steal;

   steal = paravirt_steal_clock(smp_processor_id());
   steal -= this_rq()->prev_steal_time;
   steal = min(steal, maxtime);
   account_steal_time(steal);
   this_rq()->prev_steal_time += steal;

   return steal;
   }
#endif
   return 0;
}

OK, then keep it.




+
+int pv_guest_init(void);
+#else
+static inline int pv_guest_init(void)
+{
+   return 0;
+}
+
+#endif // CONFIG_PARAVIRT
+#endif
diff --git a/arch/loongarch/include/asm/paravirt_api_clock.h 
b/arch/loongarch/include/asm/paravirt_api_clock.h
new file mode 100644
index ..65ac7cee0dad
--- /dev/null
+++ b/arch/loongarch/include/asm/paravirt_api_clock.h
@@ -0,0 +1 @@
+#include 
diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile
index 3c808c680370..662e6e9de12d 100644
--- a/arch/loongarch/kernel/Makefile
+++ b/arch/loongarch/kernel/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_MODULES) += module.o module-sections.o
obj-$(CONFIG_STACKTRACE)

Re: [PATCH v4 4/6] LoongArch: Add paravirt interface for guest kernel

2024-02-19 Thread maobibo




On 2024/2/19 下午4:48, Huacai Chen wrote:

On Mon, Feb 19, 2024 at 12:11 PM maobibo  wrote:




On 2024/2/19 上午10:42, Huacai Chen wrote:

Hi, Bibo,

On Thu, Feb 1, 2024 at 11:19 AM Bibo Mao  wrote:


The patch adds paravirt interface for guest kernel, function
pv_guest_initi() firstly checks whether system runs on VM mode. If kernel
runs on VM mode, it will call function kvm_para_available() to detect
whether current VMM is KVM hypervisor. And the paravirt function can work
only if current VMM is KVM hypervisor, since there is only KVM hypervisor
supported on LoongArch now.

This patch only adds paravirt interface for guest kernel, however there
is not effective pv functions added here.

Signed-off-by: Bibo Mao 
---
   arch/loongarch/Kconfig|  9 
   arch/loongarch/include/asm/kvm_para.h |  7 
   arch/loongarch/include/asm/paravirt.h | 27 
   .../include/asm/paravirt_api_clock.h  |  1 +
   arch/loongarch/kernel/Makefile|  1 +
   arch/loongarch/kernel/paravirt.c  | 41 +++
   arch/loongarch/kernel/setup.c |  2 +
   7 files changed, 88 insertions(+)
   create mode 100644 arch/loongarch/include/asm/paravirt.h
   create mode 100644 arch/loongarch/include/asm/paravirt_api_clock.h
   create mode 100644 arch/loongarch/kernel/paravirt.c

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 10959e6c3583..817a56dff80f 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -585,6 +585,15 @@ config CPU_HAS_PREFETCH
  bool
  default y

+config PARAVIRT
+   bool "Enable paravirtualization code"
+   depends on AS_HAS_LVZ_EXTENSION
+   help
+  This changes the kernel so it can modify itself when it is run
+ under a hypervisor, potentially improving performance significantly
+ over full virtualization.  However, when run without a hypervisor
+ the kernel is theoretically slower and slightly larger.
+
   config ARCH_SUPPORTS_KEXEC
  def_bool y

diff --git a/arch/loongarch/include/asm/kvm_para.h 
b/arch/loongarch/include/asm/kvm_para.h
index 9425d3b7e486..41200e922a82 100644
--- a/arch/loongarch/include/asm/kvm_para.h
+++ b/arch/loongarch/include/asm/kvm_para.h
@@ -2,6 +2,13 @@
   #ifndef _ASM_LOONGARCH_KVM_PARA_H
   #define _ASM_LOONGARCH_KVM_PARA_H

+/*
+ * Hypcall code field
+ */
+#define HYPERVISOR_KVM 1
+#define HYPERVISOR_VENDOR_SHIFT8
+#define HYPERCALL_CODE(vendor, code)   ((vendor << HYPERVISOR_VENDOR_SHIFT) + 
code)
+
   /*
* LoongArch hypcall return code
*/
diff --git a/arch/loongarch/include/asm/paravirt.h 
b/arch/loongarch/include/asm/paravirt.h
new file mode 100644
index ..b64813592ba0
--- /dev/null
+++ b/arch/loongarch/include/asm/paravirt.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_LOONGARCH_PARAVIRT_H
+#define _ASM_LOONGARCH_PARAVIRT_H
+
+#ifdef CONFIG_PARAVIRT
+#include 
+struct static_key;
+extern struct static_key paravirt_steal_enabled;
+extern struct static_key paravirt_steal_rq_enabled;
+
+u64 dummy_steal_clock(int cpu);
+DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock);
+
+static inline u64 paravirt_steal_clock(int cpu)
+{
+   return static_call(pv_steal_clock)(cpu);
+}

The steal time code can be removed in this patch, I think.


Originally I want to remove this piece of code, but it fails to compile
if CONFIG_PARAVIRT is selected. Here is reference code, function
paravirt_steal_clock() must be defined if CONFIG_PARAVIRT is selected.

static __always_inline u64 steal_account_process_time(u64 maxtime)
{
#ifdef CONFIG_PARAVIRT
  if (static_key_false(&paravirt_steal_enabled)) {
  u64 steal;

  steal = paravirt_steal_clock(smp_processor_id());
  steal -= this_rq()->prev_steal_time;
  steal = min(steal, maxtime);
  account_steal_time(steal);
  this_rq()->prev_steal_time += steal;

  return steal;
  }
#endif
  return 0;
}

OK, then keep it.




+
+int pv_guest_init(void);
+#else
+static inline int pv_guest_init(void)
+{
+   return 0;
+}
+
+#endif // CONFIG_PARAVIRT
+#endif
diff --git a/arch/loongarch/include/asm/paravirt_api_clock.h 
b/arch/loongarch/include/asm/paravirt_api_clock.h
new file mode 100644
index ..65ac7cee0dad
--- /dev/null
+++ b/arch/loongarch/include/asm/paravirt_api_clock.h
@@ -0,0 +1 @@
+#include 
diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile
index 3c808c680370..662e6e9de12d 100644
--- a/arch/loongarch/kernel/Makefile
+++ b/arch/loongarch/kernel/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_MODULES) += module.o module-sections.o
   obj-$(CONFIG_STACKTRACE)   += stacktrace.o

   obj-$(CONFIG_PROC_FS)  += proc.o
+obj-$(CONFIG_PARAVIRT) += paravirt.

Re: [PATCH v4 6/6] LoongArch: Add pv ipi support on LoongArch system

2024-02-18 Thread maobibo




On 2024/2/19 下午3:16, Huacai Chen wrote:

On Mon, Feb 19, 2024 at 12:18 PM maobibo  wrote:




On 2024/2/19 上午10:45, Huacai Chen wrote:

Hi, Bibo,

On Thu, Feb 1, 2024 at 11:20 AM Bibo Mao  wrote:


On LoongArch system, ipi hw uses iocsr registers, there is one iocsr
register access on ipi sending, and two iocsr access on ipi receiving
which is ipi interrupt handler. On VM mode all iocsr registers
accessing will cause VM to trap into hypervisor. So with ipi hw
notification once there will be three times of trap.

This patch adds pv ipi support for VM, hypercall instruction is used
to ipi sender, and hypervisor will inject SWI on the VM. During SWI
interrupt handler, only estat CSR register is written to clear irq.
Estat CSR register access will not trap into hypervisor. So with pv ipi
supported, pv ipi sender will trap into hypervsor one time, pv ipi
revicer will not trap, there is only one time of trap.

Also this patch adds ipi multicast support, the method is similar with
x86. With ipi multicast support, ipi notification can be sent to at most
128 vcpus at one time. It reduces trap times into hypervisor greatly.

Signed-off-by: Bibo Mao 
---
   arch/loongarch/include/asm/hardirq.h   |   1 +
   arch/loongarch/include/asm/kvm_host.h  |   1 +
   arch/loongarch/include/asm/kvm_para.h  | 124 +
   arch/loongarch/include/asm/loongarch.h |   1 +
   arch/loongarch/kernel/irq.c|   2 +-
   arch/loongarch/kernel/paravirt.c   | 113 ++
   arch/loongarch/kernel/smp.c|   2 +-
   arch/loongarch/kvm/exit.c  |  73 ++-
   arch/loongarch/kvm/vcpu.c  |   1 +
   9 files changed, 314 insertions(+), 4 deletions(-)

diff --git a/arch/loongarch/include/asm/hardirq.h 
b/arch/loongarch/include/asm/hardirq.h
index 9f0038e19c7f..8a611843c1f0 100644
--- a/arch/loongarch/include/asm/hardirq.h
+++ b/arch/loongarch/include/asm/hardirq.h
@@ -21,6 +21,7 @@ enum ipi_msg_type {
   typedef struct {
  unsigned int ipi_irqs[NR_IPI];
  unsigned int __softirq_pending;
+   atomic_t messages cacheline_aligned_in_smp;

Do we really need atomic_t? A plain "unsigned int" can reduce cost
significantly.

For IPI there are multiple senders and one receiver; the sender uses
atomic_fetch_or(action, >messages) and the receiver uses
atomic_xchg(>messages, 0) to clear the messages.

A synchronization mechanism is needed between the senders and the
receiver, and atomic is the simplest method.

At least from receiver side, the native IPI doesn't need atomic for
read and clear:
static u32 ipi_read_clear(int cpu)
{
 u32 action;

 /* Load the ipi register to figure out what we're supposed to do */
 action = iocsr_read32(LOONGARCH_IOCSR_IPI_STATUS);
 /* Clear the ipi register to clear the interrupt */
 iocsr_write32(action, LOONGARCH_IOCSR_IPI_CLEAR);
 wbflush();
That is because on physical hardware these are two separate IOCSR 
registers, and there is no way to do an atomic read-and-clear on IOCSR registers.


However, if the ipi message is stored in DDR memory, atomic read/clear 
can be used. You can compare the cost of one iocsr_read32 + one iocsr_write32 + 
wbflush with one atomic_xchg(>messages, 0).


Regards
Bibo Mao


 return action;
}




   } cacheline_aligned irq_cpustat_t;

   DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
diff --git a/arch/loongarch/include/asm/kvm_host.h 
b/arch/loongarch/include/asm/kvm_host.h
index 57399d7cf8b7..1bf927e2bfac 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -43,6 +43,7 @@ struct kvm_vcpu_stat {
  u64 idle_exits;
  u64 cpucfg_exits;
  u64 signal_exits;
+   u64 hvcl_exits;

hypercall_exits is better.

yeap, hypercall_exits is better, will fix in next version.



   };

   #define KVM_MEM_HUGEPAGE_CAPABLE   (1UL << 0)
diff --git a/arch/loongarch/include/asm/kvm_para.h 
b/arch/loongarch/include/asm/kvm_para.h
index 41200e922a82..a25a84e372b9 100644
--- a/arch/loongarch/include/asm/kvm_para.h
+++ b/arch/loongarch/include/asm/kvm_para.h
@@ -9,6 +9,10 @@
   #define HYPERVISOR_VENDOR_SHIFT8
   #define HYPERCALL_CODE(vendor, code)   ((vendor << HYPERVISOR_VENDOR_SHIFT) 
+ code)

+#define KVM_HC_CODE_SERVICE0
+#define KVM_HC_SERVICE HYPERCALL_CODE(HYPERVISOR_KVM, 
KVM_HC_CODE_SERVICE)
+#define  KVM_HC_FUNC_IPI   1

Change HC to HCALL is better.

will modify in next version.



+
   /*
* LoongArch hypcall return code
*/
@@ -16,6 +20,126 @@
   #define KVM_HC_INVALID_CODE-1UL
   #define KVM_HC_INVALID_PARAMETER   -2UL

+/*
+ * Hypercalls interface for KVM hypervisor
+ *
+ * a0: function identifier
+ * a1-a6: args
+ * Return value will be placed in v0.
+ * Up to 6 arguments are passed in a1, a2, a3, a4, a5, a6.
+ */
+static __always_inline long kvm_hypercall(u64 fid)
+{
+ 

Re: [PATCH v4 6/6] LoongArch: Add pv ipi support on LoongArch system

2024-02-18 Thread maobibo




On 2024/2/19 上午10:45, Huacai Chen wrote:

Hi, Bibo,

On Thu, Feb 1, 2024 at 11:20 AM Bibo Mao  wrote:


On LoongArch system, ipi hw uses iocsr registers, there is one iocsr
register access on ipi sending, and two iocsr access on ipi receiving
which is ipi interrupt handler. On VM mode all iocsr registers
accessing will cause VM to trap into hypervisor. So with ipi hw
notification once there will be three times of trap.

This patch adds pv ipi support for VM, hypercall instruction is used
to ipi sender, and hypervisor will inject SWI on the VM. During SWI
interrupt handler, only estat CSR register is written to clear irq.
Estat CSR register access will not trap into hypervisor. So with pv ipi
supported, pv ipi sender will trap into hypervsor one time, pv ipi
revicer will not trap, there is only one time of trap.

Also this patch adds ipi multicast support, the method is similar with
x86. With ipi multicast support, ipi notification can be sent to at most
128 vcpus at one time. It reduces trap times into hypervisor greatly.

Signed-off-by: Bibo Mao 
---
  arch/loongarch/include/asm/hardirq.h   |   1 +
  arch/loongarch/include/asm/kvm_host.h  |   1 +
  arch/loongarch/include/asm/kvm_para.h  | 124 +
  arch/loongarch/include/asm/loongarch.h |   1 +
  arch/loongarch/kernel/irq.c|   2 +-
  arch/loongarch/kernel/paravirt.c   | 113 ++
  arch/loongarch/kernel/smp.c|   2 +-
  arch/loongarch/kvm/exit.c  |  73 ++-
  arch/loongarch/kvm/vcpu.c  |   1 +
  9 files changed, 314 insertions(+), 4 deletions(-)

diff --git a/arch/loongarch/include/asm/hardirq.h 
b/arch/loongarch/include/asm/hardirq.h
index 9f0038e19c7f..8a611843c1f0 100644
--- a/arch/loongarch/include/asm/hardirq.h
+++ b/arch/loongarch/include/asm/hardirq.h
@@ -21,6 +21,7 @@ enum ipi_msg_type {
  typedef struct {
 unsigned int ipi_irqs[NR_IPI];
 unsigned int __softirq_pending;
+   atomic_t messages cacheline_aligned_in_smp;

Do we really need atomic_t? A plain "unsigned int" can reduce cost
significantly.
For IPI there are multiple senders and one receiver; the sender uses 
atomic_fetch_or(action, >messages) and the receiver uses 
atomic_xchg(>messages, 0) to clear the messages.


A synchronization mechanism is needed between the senders and the 
receiver, and atomic is the simplest method.



  } cacheline_aligned irq_cpustat_t;

  DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
diff --git a/arch/loongarch/include/asm/kvm_host.h 
b/arch/loongarch/include/asm/kvm_host.h
index 57399d7cf8b7..1bf927e2bfac 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -43,6 +43,7 @@ struct kvm_vcpu_stat {
 u64 idle_exits;
 u64 cpucfg_exits;
 u64 signal_exits;
+   u64 hvcl_exits;

hypercall_exits is better.

yeap, hypercall_exits is better, will fix in next version.



  };

  #define KVM_MEM_HUGEPAGE_CAPABLE   (1UL << 0)
diff --git a/arch/loongarch/include/asm/kvm_para.h 
b/arch/loongarch/include/asm/kvm_para.h
index 41200e922a82..a25a84e372b9 100644
--- a/arch/loongarch/include/asm/kvm_para.h
+++ b/arch/loongarch/include/asm/kvm_para.h
@@ -9,6 +9,10 @@
  #define HYPERVISOR_VENDOR_SHIFT8
  #define HYPERCALL_CODE(vendor, code)   ((vendor << HYPERVISOR_VENDOR_SHIFT) + 
code)

+#define KVM_HC_CODE_SERVICE0
+#define KVM_HC_SERVICE HYPERCALL_CODE(HYPERVISOR_KVM, 
KVM_HC_CODE_SERVICE)
+#define  KVM_HC_FUNC_IPI   1

Change HC to HCALL is better.

will modify in next version.



+
  /*
   * LoongArch hypcall return code
   */
@@ -16,6 +20,126 @@
  #define KVM_HC_INVALID_CODE-1UL
  #define KVM_HC_INVALID_PARAMETER   -2UL

+/*
+ * Hypercalls interface for KVM hypervisor
+ *
+ * a0: function identifier
+ * a1-a6: args
+ * Return value will be placed in v0.
+ * Up to 6 arguments are passed in a1, a2, a3, a4, a5, a6.
+ */
+static __always_inline long kvm_hypercall(u64 fid)
+{
+   register long ret asm("v0");
+   register unsigned long fun asm("a0") = fid;
+
+   __asm__ __volatile__(
+   "hvcl "__stringify(KVM_HC_SERVICE)
+   : "=r" (ret)
+   : "r" (fun)
+   : "memory"
+   );
+
+   return ret;
+}
+
+static __always_inline long kvm_hypercall1(u64 fid, unsigned long arg0)
+{
+   register long ret asm("v0");
+   register unsigned long fun asm("a0") = fid;
+   register unsigned long a1  asm("a1") = arg0;
+
+   __asm__ __volatile__(
+   "hvcl "__stringify(KVM_HC_SERVICE)
+   : "=r" (ret)
+   : "r" (fun), "r" (a1)
+   : "memory"
+   );
+
+   return ret;
+}
+
+static __always_inline long kvm_hypercall2(u64 fid,
+   unsigned long arg0, unsigned long arg1)
+{
+   register long ret asm("v0");
+   register unsigned long fun asm("a0") = fid;
+   register 

Re: [PATCH v4 4/6] LoongArch: Add paravirt interface for guest kernel

2024-02-18 Thread maobibo




On 2024/2/19 上午10:42, Huacai Chen wrote:

Hi, Bibo,

On Thu, Feb 1, 2024 at 11:19 AM Bibo Mao  wrote:


The patch adds paravirt interface for guest kernel, function
pv_guest_initi() firstly checks whether system runs on VM mode. If kernel
runs on VM mode, it will call function kvm_para_available() to detect
whether current VMM is KVM hypervisor. And the paravirt function can work
only if current VMM is KVM hypervisor, since there is only KVM hypervisor
supported on LoongArch now.

This patch only adds paravirt interface for guest kernel, however there
is not effective pv functions added here.

Signed-off-by: Bibo Mao 
---
  arch/loongarch/Kconfig|  9 
  arch/loongarch/include/asm/kvm_para.h |  7 
  arch/loongarch/include/asm/paravirt.h | 27 
  .../include/asm/paravirt_api_clock.h  |  1 +
  arch/loongarch/kernel/Makefile|  1 +
  arch/loongarch/kernel/paravirt.c  | 41 +++
  arch/loongarch/kernel/setup.c |  2 +
  7 files changed, 88 insertions(+)
  create mode 100644 arch/loongarch/include/asm/paravirt.h
  create mode 100644 arch/loongarch/include/asm/paravirt_api_clock.h
  create mode 100644 arch/loongarch/kernel/paravirt.c

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index 10959e6c3583..817a56dff80f 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -585,6 +585,15 @@ config CPU_HAS_PREFETCH
 bool
 default y

+config PARAVIRT
+   bool "Enable paravirtualization code"
+   depends on AS_HAS_LVZ_EXTENSION
+   help
+  This changes the kernel so it can modify itself when it is run
+ under a hypervisor, potentially improving performance significantly
+ over full virtualization.  However, when run without a hypervisor
+ the kernel is theoretically slower and slightly larger.
+
  config ARCH_SUPPORTS_KEXEC
 def_bool y

diff --git a/arch/loongarch/include/asm/kvm_para.h 
b/arch/loongarch/include/asm/kvm_para.h
index 9425d3b7e486..41200e922a82 100644
--- a/arch/loongarch/include/asm/kvm_para.h
+++ b/arch/loongarch/include/asm/kvm_para.h
@@ -2,6 +2,13 @@
  #ifndef _ASM_LOONGARCH_KVM_PARA_H
  #define _ASM_LOONGARCH_KVM_PARA_H

+/*
+ * Hypcall code field
+ */
+#define HYPERVISOR_KVM 1
+#define HYPERVISOR_VENDOR_SHIFT8
+#define HYPERCALL_CODE(vendor, code)   ((vendor << HYPERVISOR_VENDOR_SHIFT) + 
code)
+
  /*
   * LoongArch hypcall return code
   */
diff --git a/arch/loongarch/include/asm/paravirt.h 
b/arch/loongarch/include/asm/paravirt.h
new file mode 100644
index ..b64813592ba0
--- /dev/null
+++ b/arch/loongarch/include/asm/paravirt.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_LOONGARCH_PARAVIRT_H
+#define _ASM_LOONGARCH_PARAVIRT_H
+
+#ifdef CONFIG_PARAVIRT
+#include 
+struct static_key;
+extern struct static_key paravirt_steal_enabled;
+extern struct static_key paravirt_steal_rq_enabled;
+
+u64 dummy_steal_clock(int cpu);
+DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock);
+
+static inline u64 paravirt_steal_clock(int cpu)
+{
+   return static_call(pv_steal_clock)(cpu);
+}

The steal time code can be removed in this patch, I think.

Originally I want to remove this piece of code, but it fails to compile 
if CONFIG_PARAVIRT is selected. Here is reference code, function 
paravirt_steal_clock() must be defined if CONFIG_PARAVIRT is selected.


static __always_inline u64 steal_account_process_time(u64 maxtime)
{
#ifdef CONFIG_PARAVIRT
if (static_key_false(&paravirt_steal_enabled)) {
u64 steal;

steal = paravirt_steal_clock(smp_processor_id());
steal -= this_rq()->prev_steal_time;
steal = min(steal, maxtime);
account_steal_time(steal);
this_rq()->prev_steal_time += steal;

return steal;
}
#endif
return 0;
}


+
+int pv_guest_init(void);
+#else
+static inline int pv_guest_init(void)
+{
+   return 0;
+}
+
+#endif // CONFIG_PARAVIRT
+#endif
diff --git a/arch/loongarch/include/asm/paravirt_api_clock.h 
b/arch/loongarch/include/asm/paravirt_api_clock.h
new file mode 100644
index ..65ac7cee0dad
--- /dev/null
+++ b/arch/loongarch/include/asm/paravirt_api_clock.h
@@ -0,0 +1 @@
+#include 
diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile
index 3c808c680370..662e6e9de12d 100644
--- a/arch/loongarch/kernel/Makefile
+++ b/arch/loongarch/kernel/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_MODULES) += module.o module-sections.o
  obj-$(CONFIG_STACKTRACE)   += stacktrace.o

  obj-$(CONFIG_PROC_FS)  += proc.o
+obj-$(CONFIG_PARAVIRT) += paravirt.o

  obj-$(CONFIG_SMP)  += smp.o

diff --git a/arch/loongarch/kernel/paravirt.c b/arch/loongarch/kernel/paravirt.c
new file mode 100644
index ..21d01d05791a
--- /dev/null

Re: [PATCH v4 2/6] LoongArch: KVM: Add hypercall instruction emulation support

2024-02-18 Thread maobibo



On 2024/2/19 上午10:41, Huacai Chen wrote:

Hi, Bibo,

On Thu, Feb 1, 2024 at 11:19 AM Bibo Mao  wrote:


On LoongArch system, hypercall instruction is supported when system
runs on VM mode. This patch adds dummy function with hypercall
instruction emulation, rather than inject EXCCODE_INE invalid
instruction exception.

Signed-off-by: Bibo Mao 
---
  arch/loongarch/include/asm/Kbuild  |  1 -
  arch/loongarch/include/asm/kvm_para.h  | 26 ++
  arch/loongarch/include/uapi/asm/Kbuild |  2 --
  arch/loongarch/kvm/exit.c  | 10 ++
  4 files changed, 36 insertions(+), 3 deletions(-)
  create mode 100644 arch/loongarch/include/asm/kvm_para.h
  delete mode 100644 arch/loongarch/include/uapi/asm/Kbuild

diff --git a/arch/loongarch/include/asm/Kbuild 
b/arch/loongarch/include/asm/Kbuild
index 93783fa24f6e..22991a6f0e2b 100644
--- a/arch/loongarch/include/asm/Kbuild
+++ b/arch/loongarch/include/asm/Kbuild
@@ -23,4 +23,3 @@ generic-y += poll.h
  generic-y += param.h
  generic-y += posix_types.h
  generic-y += resource.h
-generic-y += kvm_para.h
diff --git a/arch/loongarch/include/asm/kvm_para.h 
b/arch/loongarch/include/asm/kvm_para.h
new file mode 100644
index ..9425d3b7e486
--- /dev/null
+++ b/arch/loongarch/include/asm/kvm_para.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_LOONGARCH_KVM_PARA_H
+#define _ASM_LOONGARCH_KVM_PARA_H
+
+/*
+ * LoongArch hypcall return code

Maybe using "hypercall" in comments is better.

will modify in next patch.




+ */
+#define KVM_HC_STATUS_SUCCESS  0
+#define KVM_HC_INVALID_CODE-1UL
+#define KVM_HC_INVALID_PARAMETER   -2UL

Maybe KVM_HCALL_SUCCESS/KVM_HCALL_INVALID_CODE/KVM_HCALL_PARAMETER is better.

yes, KVM_HCALL_ sounds better. Will modify it.

Regards
Bibo Mao


Huacai


+
+static inline unsigned int kvm_arch_para_features(void)
+{
+   return 0;
+}
+
+static inline unsigned int kvm_arch_para_hints(void)
+{
+   return 0;
+}
+
+static inline bool kvm_check_and_clear_guest_paused(void)
+{
+   return false;
+}
+#endif /* _ASM_LOONGARCH_KVM_PARA_H */
diff --git a/arch/loongarch/include/uapi/asm/Kbuild 
b/arch/loongarch/include/uapi/asm/Kbuild
deleted file mode 100644
index 4aa680ca2e5f..
--- a/arch/loongarch/include/uapi/asm/Kbuild
+++ /dev/null
@@ -1,2 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-generic-y += kvm_para.h
diff --git a/arch/loongarch/kvm/exit.c b/arch/loongarch/kvm/exit.c
index ed1d89d53e2e..d15c71320a11 100644
--- a/arch/loongarch/kvm/exit.c
+++ b/arch/loongarch/kvm/exit.c
@@ -685,6 +685,15 @@ static int kvm_handle_lasx_disabled(struct kvm_vcpu *vcpu)
 return RESUME_GUEST;
  }

+static int kvm_handle_hypcall(struct kvm_vcpu *vcpu)
+{
+   update_pc(>arch);
+
+   /* Treat it as noop intruction, only set return value */
+   vcpu->arch.gprs[LOONGARCH_GPR_A0] = KVM_HC_INVALID_CODE;
+   return RESUME_GUEST;
+}
+
  /*
   * LoongArch KVM callback handling for unimplemented guest exiting
   */
@@ -716,6 +725,7 @@ static exit_handle_fn kvm_fault_tables[EXCCODE_INT_START] = 
{
 [EXCCODE_LSXDIS]= kvm_handle_lsx_disabled,
 [EXCCODE_LASXDIS]   = kvm_handle_lasx_disabled,
 [EXCCODE_GSPR]  = kvm_handle_gspr,
+   [EXCCODE_HVC]   = kvm_handle_hypcall,
  };

  int kvm_handle_fault(struct kvm_vcpu *vcpu, int fault)
--
2.39.3






Re: [PATCH v4 1/6] LoongArch/smp: Refine ipi ops on LoongArch platform

2024-02-18 Thread maobibo

Huacai,

Thanks for your review; I reply inline.

On 2024/2/19 上午10:39, Huacai Chen wrote:

Hi, Bibo,

On Thu, Feb 1, 2024 at 11:19 AM Bibo Mao  wrote:


This patch refines ipi handling on LoongArch platform, there are
three changes with this patch.
1. Add generic get_percpu_irq() api, replace some percpu irq functions
such as get_ipi_irq()/get_pmc_irq()/get_timer_irq() with get_percpu_irq().

2. Change parameter action definition with function
loongson_send_ipi_single() and loongson_send_ipi_mask(). Normal decimal
encoding is used rather than binary bitmap encoding for ipi action, ipi
hw sender uses devimal action code, and ipi receiver will get binary bitmap
encoding, the ipi hw will convert it into bitmap in ipi message buffer.

What is "devimal" here? Maybe decimal?

yeap, it should be decimal.





3. Add structure smp_ops on LoongArch platform so that pv ipi can be used
later.
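
For illustration, here is a tiny stand-alone sketch of the encoding described
in point 2 above (the names mirror the ACTTION_/SMP_ defines added by this
patch, with the spelling normalized; the IPI block's code-to-bit conversion is
only simulated here):

#include <stdio.h>

#define BIT(n)                  (1u << (n))

#define ACTION_BOOT_CPU         0
#define ACTION_RESCHEDULE       1
#define ACTION_CALL_FUNCTION    2

#define SMP_RESCHEDULE          BIT(ACTION_RESCHEDULE)
#define SMP_CALL_FUNCTION       BIT(ACTION_CALL_FUNCTION)

static unsigned int ipi_status;         /* stands in for the IPI message buffer */

/* The sender hands the hardware a plain action code ... */
static void hw_send_ipi(unsigned int action)
{
        ipi_status |= BIT(action);      /* ... and the IPI block stores it as a bit. */
}

int main(void)
{
        hw_send_ipi(ACTION_RESCHEDULE);                 /* sender side: decimal code */

        if (ipi_status & SMP_RESCHEDULE)                /* receiver side: bitmap test */
                printf("reschedule IPI pending\n");
        return 0;
}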

Signed-off-by: Bibo Mao 
---
  arch/loongarch/include/asm/hardirq.h |  4 ++
  arch/loongarch/include/asm/irq.h | 10 -
  arch/loongarch/include/asm/smp.h | 31 +++
  arch/loongarch/kernel/irq.c  | 22 +--
  arch/loongarch/kernel/perf_event.c   | 14 +--
  arch/loongarch/kernel/smp.c  | 58 +++-
  arch/loongarch/kernel/time.c | 12 +-
  7 files changed, 71 insertions(+), 80 deletions(-)

diff --git a/arch/loongarch/include/asm/hardirq.h 
b/arch/loongarch/include/asm/hardirq.h
index 0ef3b18f8980..9f0038e19c7f 100644
--- a/arch/loongarch/include/asm/hardirq.h
+++ b/arch/loongarch/include/asm/hardirq.h
@@ -12,6 +12,10 @@
  extern void ack_bad_irq(unsigned int irq);
  #define ack_bad_irq ack_bad_irq

+enum ipi_msg_type {
+   IPI_RESCHEDULE,
+   IPI_CALL_FUNCTION,
+};
  #define NR_IPI 2

  typedef struct {
diff --git a/arch/loongarch/include/asm/irq.h b/arch/loongarch/include/asm/irq.h
index 218b4da0ea90..00101b6d601e 100644
--- a/arch/loongarch/include/asm/irq.h
+++ b/arch/loongarch/include/asm/irq.h
@@ -117,8 +117,16 @@ extern struct fwnode_handle *liointc_handle;
  extern struct fwnode_handle *pch_lpc_handle;
  extern struct fwnode_handle *pch_pic_handle[MAX_IO_PICS];

-extern irqreturn_t loongson_ipi_interrupt(int irq, void *dev);
+static inline int get_percpu_irq(int vector)
+{
+   struct irq_domain *d;
+
+   d = irq_find_matching_fwnode(cpuintc_handle, DOMAIN_BUS_ANY);
+   if (d)
+   return irq_create_mapping(d, vector);

+   return -EINVAL;
+}
  #include 

  #endif /* _ASM_IRQ_H */
diff --git a/arch/loongarch/include/asm/smp.h b/arch/loongarch/include/asm/smp.h
index f81e5f01d619..8a42632b038a 100644
--- a/arch/loongarch/include/asm/smp.h
+++ b/arch/loongarch/include/asm/smp.h
@@ -12,6 +12,13 @@
  #include 
  #include 

+struct smp_ops {
+   void (*init_ipi)(void);
+   void (*send_ipi_mask)(const struct cpumask *mask, unsigned int action);
+   void (*send_ipi_single)(int cpu, unsigned int action);
+};
+
+extern struct smp_ops smp_ops;
  extern int smp_num_siblings;
  extern int num_processors;
  extern int disabled_cpus;
@@ -24,8 +31,6 @@ void loongson_prepare_cpus(unsigned int max_cpus);
  void loongson_boot_secondary(int cpu, struct task_struct *idle);
  void loongson_init_secondary(void);
  void loongson_smp_finish(void);
-void loongson_send_ipi_single(int cpu, unsigned int action);
-void loongson_send_ipi_mask(const struct cpumask *mask, unsigned int action);
  #ifdef CONFIG_HOTPLUG_CPU
  int loongson_cpu_disable(void);
  void loongson_cpu_die(unsigned int cpu);
@@ -59,9 +64,12 @@ extern int __cpu_logical_map[NR_CPUS];

  #define cpu_physical_id(cpu)   cpu_logical_map(cpu)

-#define SMP_BOOT_CPU   0x1
-#define SMP_RESCHEDULE 0x2
-#define SMP_CALL_FUNCTION  0x4
+#define ACTTION_BOOT_CPU   0
+#define ACTTION_RESCHEDULE 1
+#define ACTTION_CALL_FUNCTION  2

ACTTION? ACTION?

it should be ACTION_xxx, will refresh it in next patch.

Regards
Bibo Mao


Huacai


+#define SMP_BOOT_CPU   BIT(ACTTION_BOOT_CPU)
+#define SMP_RESCHEDULE BIT(ACTTION_RESCHEDULE)
+#define SMP_CALL_FUNCTION  BIT(ACTTION_CALL_FUNCTION)

  struct secondary_data {
 unsigned long stack;
@@ -71,7 +79,8 @@ extern struct secondary_data cpuboot_data;

  extern asmlinkage void smpboot_entry(void);
  extern asmlinkage void start_secondary(void);
-
+extern void arch_send_call_function_single_ipi(int cpu);
+extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
  extern void calculate_cpu_foreign_map(void);

  /*
@@ -79,16 +88,6 @@ extern void calculate_cpu_foreign_map(void);
   */
  extern void show_ipi_list(struct seq_file *p, int prec);

-static inline void arch_send_call_function_single_ipi(int cpu)
-{
-   loongson_send_ipi_single(cpu, SMP_CALL_FUNCTION);
-}
-
-static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
-{
-   loongson_send_ipi_mask(mask, SMP_CALL_FUNCTION);
-}
-
  #ifdef CONFIG_HOTPLUG_CPU
  static inline int 

Re: [PATCH v4 0/6] LoongArch: Add pv ipi support on LoongArch VM

2024-02-16 Thread maobibo




On 2024/2/15 下午6:25, WANG Xuerui wrote:

On 2/15/24 18:11, WANG Xuerui wrote:
Sorry for the late reply (and Happy Chinese New Year), and thanks for 
providing microbenchmark numbers! But it seems the more comprehensive 
CoreMark results were omitted (that's also absent in v3)? While the 


Of course the benchmark suite should be UnixBench instead of CoreMark. 
Lesson: don't multi-task code reviews, especially not after consuming 
beer -- a cup of coffee won't fully cancel the influence. ;-)


Where is the rule about benchmark choices like UnixBench/CoreMark for an ipi 
improvement?


Regards
Bibo Mao




Re: [PATCH v3 5/6] LoongArch: KVM: Add physical cpuid map support

2024-01-29 Thread maobibo




On 2024/1/29 下午9:11, Huacai Chen wrote:

Hi, Bibo,

Without this patch I can also create a SMP VM, so what problem does
this patch want to solve?
With the ipi irqchip, the physical cpuid is used for the destination cpu 
rather than the logical cpuid. When the ipi device is emulated on the qemu 
side, find_cpu_by_archid() in hw/intc/loongarch_ipi.c looks up the 
destination vcpu.


Here, with the hypercall method, ipi is emulated on the kvm kernel side, so
the same physical-cpuid search logic is needed, and the function 
kvm_get_vcpu_by_cpuid() is used by the pv_ipi backend.
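
The lookup itself is not part of the hunks quoted below, so only as a rough
sketch (built from the kvm_phyid_map structures this patch introduces, with
just enough surrounding types to compile), the cpuid-to-vcpu search could look
roughly like this:

#include <stdbool.h>
#include <stddef.h>

#define KVM_MAX_PHYID   256

struct kvm_vcpu;                                /* opaque for this sketch */

struct kvm_phyid_info {
        struct kvm_vcpu *vcpu;
        bool enabled;
};

struct kvm_phyid_map {
        int max_phyid;
        struct kvm_phyid_info phys_map[KVM_MAX_PHYID];
};

/* Only the field the lookup needs is mirrored from struct kvm/kvm_arch. */
struct kvm {
        struct {
                struct kvm_phyid_map *phyid_map;
        } arch;
};

struct kvm_vcpu *kvm_get_vcpu_by_cpuid(struct kvm *kvm, int cpuid)
{
        struct kvm_phyid_map *map = kvm->arch.phyid_map;

        if (cpuid < 0 || cpuid >= KVM_MAX_PHYID)
                return NULL;
        if (!map->phys_map[cpuid].enabled)
                return NULL;
        return map->phys_map[cpuid].vcpu;
}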


Regards
Bibo Mao



Huacai

On Mon, Jan 22, 2024 at 6:03 PM Bibo Mao  wrote:


Physical cpuid is used to irq routing for irqchips such as ipi/msi/
extioi interrupt controller. And physical cpuid is stored at CSR
register LOONGARCH_CSR_CPUID, it can not be changed once vcpu is
created. Since different irqchips have different size definition
about physical cpuid, KVM uses the smallest cpuid from extioi, and
the max cpuid size is defines as 256.

Signed-off-by: Bibo Mao 
---
  arch/loongarch/include/asm/kvm_host.h | 26 
  arch/loongarch/include/asm/kvm_vcpu.h |  1 +
  arch/loongarch/kvm/vcpu.c | 93 ++-
  arch/loongarch/kvm/vm.c   | 11 
  4 files changed, 130 insertions(+), 1 deletion(-)

diff --git a/arch/loongarch/include/asm/kvm_host.h 
b/arch/loongarch/include/asm/kvm_host.h
index 2d62f7b0d377..57399d7cf8b7 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -64,6 +64,30 @@ struct kvm_world_switch {

  #define MAX_PGTABLE_LEVELS 4

+/*
+ * Physical cpu id is used for interrupt routing, there are different
+ * definitions about physical cpuid on different hardwares.
+ *  For LOONGARCH_CSR_CPUID register, max cpuid size if 512
+ *  For IPI HW, max dest CPUID size 1024
+ *  For extioi interrupt controller, max dest CPUID size is 256
+ *  For MSI interrupt controller, max supported CPUID size is 65536
+ *
+ * Currently max CPUID is defined as 256 for KVM hypervisor, in future
+ * it will be expanded to 4096, including 16 packages at most. And every
+ * package supports at most 256 vcpus
+ */
+#define KVM_MAX_PHYID  256
+
+struct kvm_phyid_info {
+   struct kvm_vcpu *vcpu;
+   boolenabled;
+};
+
+struct kvm_phyid_map {
+   int max_phyid;
+   struct kvm_phyid_info phys_map[KVM_MAX_PHYID];
+};
+
  struct kvm_arch {
 /* Guest physical mm */
 kvm_pte_t *pgd;
@@ -71,6 +95,8 @@ struct kvm_arch {
 unsigned long invalid_ptes[MAX_PGTABLE_LEVELS];
 unsigned int  pte_shifts[MAX_PGTABLE_LEVELS];
 unsigned int  root_level;
+   struct mutex  phyid_map_lock;
+   struct kvm_phyid_map  *phyid_map;

 s64 time_offset;
 struct kvm_context __percpu *vmcs;
diff --git a/arch/loongarch/include/asm/kvm_vcpu.h 
b/arch/loongarch/include/asm/kvm_vcpu.h
index e71ceb88f29e..2402129ee955 100644
--- a/arch/loongarch/include/asm/kvm_vcpu.h
+++ b/arch/loongarch/include/asm/kvm_vcpu.h
@@ -81,6 +81,7 @@ void kvm_save_timer(struct kvm_vcpu *vcpu);
  void kvm_restore_timer(struct kvm_vcpu *vcpu);

  int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt 
*irq);
+struct kvm_vcpu *kvm_get_vcpu_by_cpuid(struct kvm *kvm, int cpuid);

  /*
   * Loongarch KVM guest interrupt handling
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 27701991886d..97ca9c7160e6 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -274,6 +274,95 @@ static int _kvm_getcsr(struct kvm_vcpu *vcpu, unsigned int 
id, u64 *val)
 return 0;
  }

+static inline int kvm_set_cpuid(struct kvm_vcpu *vcpu, u64 val)
+{
+   int cpuid;
+   struct loongarch_csrs *csr = vcpu->arch.csr;
+   struct kvm_phyid_map  *map;
+
+   if (val >= KVM_MAX_PHYID)
+   return -EINVAL;
+
+   cpuid = kvm_read_sw_gcsr(csr, LOONGARCH_CSR_ESTAT);
+   map = vcpu->kvm->arch.phyid_map;
+   mutex_lock(&vcpu->kvm->arch.phyid_map_lock);
+   if (map->phys_map[cpuid].enabled) {
+   /*
+* Cpuid is already set before
+* Forbid changing different cpuid at runtime
+*/
+   if (cpuid != val) {
+   /*
+* Cpuid 0 is initial value for vcpu, maybe invalid
+* unset value for vcpu
+*/
+   if (cpuid) {
+   mutex_unlock(&vcpu->kvm->arch.phyid_map_lock);
+   return -EINVAL;
+   }
+   } else {
+/* Discard duplicated cpuid set */
+   mutex_unlock(&vcpu->kvm->arch.phyid_map_lock);
+   return 0;
+   }
+   }
+
+   if (map->phys_map[val].enabled) {
+   /*
+* New cpuid is already set with other vcpu
+* Forbid sharing 

Re: [PATCH v3 6/6] LoongArch: Add pv ipi support on LoongArch system

2024-01-29 Thread maobibo




On 2024/1/29 下午9:10, Huacai Chen wrote:

Hi, Bibo,

On Mon, Jan 22, 2024 at 6:03 PM Bibo Mao  wrote:


On LoongArch systems the ipi hardware uses iocsr registers: there is one
iocsr register access on the ipi sender side and two iocsr accesses on the
ipi receiver side, in the ipi interrupt handler. In VM mode every iocsr
register access traps into the hypervisor, so one hardware ipi notification
costs three traps.

This patch adds pv ipi support for VMs: the hypercall instruction is used
on the ipi sender side, and the hypervisor injects a SWI into the VM. In
the SWI interrupt handler only the estat CSR register is written to clear
the irq, and estat CSR access does not trap into the hypervisor. So with
pv ipi the sender traps into the hypervisor once and the receiver does not
trap at all, leaving a single trap per ipi.

This patch also adds ipi multicast support, using a method similar to x86.
With ipi multicast support an ipi notification can be sent to at most
128 vcpus at one time, which greatly reduces the number of traps into the
hypervisor.
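
As a hedged sketch of what "up to 128 vcpus in one notification" can mean in
practice, the code below mirrors the x86 PV IPI idea of two 64-bit masks
relative to the lowest destination CPU plus that base CPU, pushed through one
hypercall. The real argument layout of the KVM_HC_FUNC_IPI call is not shown
in the quoted hunks, so this is an assumption for illustration only, and
kvm_hypercall3() is stubbed here.

#include <stdint.h>
#include <stdio.h>

#define KVM_HC_FUNC_IPI 1

/* Stub standing in for the hvcl-based wrapper defined in this patch. */
static long kvm_hypercall3(uint64_t fid, unsigned long a0,
                           unsigned long a1, unsigned long a2)
{
        printf("hvcl fid=%llu low=%#lx high=%#lx min=%lu\n",
               (unsigned long long)fid, a0, a1, a2);
        return 0;
}

/* Send one PV IPI to every CPU in cpus[], assuming their ids span < 128. */
static void pv_send_ipi_mask(const int *cpus, int n)
{
        unsigned long low = 0, high = 0;
        int i, min = cpus[0];

        for (i = 1; i < n; i++)
                if (cpus[i] < min)
                        min = cpus[i];

        for (i = 0; i < n; i++) {
                int off = cpus[i] - min;

                if (off < 64)
                        low |= 1UL << off;
                else if (off < 128)
                        high |= 1UL << (off - 64);
                /* a wider span would need a second hypercall */
        }

        kvm_hypercall3(KVM_HC_FUNC_IPI, low, high, (unsigned long)min);
}

int main(void)
{
        int cpus[] = { 3, 5, 70 };

        pv_send_ipi_mask(cpus, 3);
        return 0;
}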

Signed-off-by: Bibo Mao 
---
  arch/loongarch/include/asm/hardirq.h   |   1 +
  arch/loongarch/include/asm/kvm_host.h  |   1 +
  arch/loongarch/include/asm/kvm_para.h  | 124 +
  arch/loongarch/include/asm/loongarch.h |   1 +
  arch/loongarch/kernel/irq.c|   2 +-
  arch/loongarch/kernel/paravirt.c   | 113 ++
  arch/loongarch/kernel/smp.c|   2 +-
  arch/loongarch/kvm/exit.c  |  73 ++-
  arch/loongarch/kvm/vcpu.c  |   1 +
  9 files changed, 314 insertions(+), 4 deletions(-)

diff --git a/arch/loongarch/include/asm/hardirq.h 
b/arch/loongarch/include/asm/hardirq.h
index 9f0038e19c7f..8a611843c1f0 100644
--- a/arch/loongarch/include/asm/hardirq.h
+++ b/arch/loongarch/include/asm/hardirq.h
@@ -21,6 +21,7 @@ enum ipi_msg_type {
  typedef struct {
 unsigned int ipi_irqs[NR_IPI];
 unsigned int __softirq_pending;
+   atomic_t messages cacheline_aligned_in_smp;
  } cacheline_aligned irq_cpustat_t;

  DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
diff --git a/arch/loongarch/include/asm/kvm_host.h 
b/arch/loongarch/include/asm/kvm_host.h
index 57399d7cf8b7..1bf927e2bfac 100644
--- a/arch/loongarch/include/asm/kvm_host.h
+++ b/arch/loongarch/include/asm/kvm_host.h
@@ -43,6 +43,7 @@ struct kvm_vcpu_stat {
 u64 idle_exits;
 u64 cpucfg_exits;
 u64 signal_exits;
+   u64 hvcl_exits;
  };

  #define KVM_MEM_HUGEPAGE_CAPABLE   (1UL << 0)
diff --git a/arch/loongarch/include/asm/kvm_para.h 
b/arch/loongarch/include/asm/kvm_para.h
index 41200e922a82..a25a84e372b9 100644
--- a/arch/loongarch/include/asm/kvm_para.h
+++ b/arch/loongarch/include/asm/kvm_para.h
@@ -9,6 +9,10 @@
  #define HYPERVISOR_VENDOR_SHIFT8
  #define HYPERCALL_CODE(vendor, code)   ((vendor << HYPERVISOR_VENDOR_SHIFT) + 
code)

+#define KVM_HC_CODE_SERVICE0
+#define KVM_HC_SERVICE HYPERCALL_CODE(HYPERVISOR_KVM, 
KVM_HC_CODE_SERVICE)
+#define  KVM_HC_FUNC_IPI   1
+
  /*
   * LoongArch hypcall return code
   */
@@ -16,6 +20,126 @@
  #define KVM_HC_INVALID_CODE-1UL
  #define KVM_HC_INVALID_PARAMETER   -2UL

+/*
+ * Hypercalls interface for KVM hypervisor
+ *
+ * a0: function identifier
+ * a1-a6: args
+ * Return value will be placed in v0.
+ * Up to 6 arguments are passed in a1, a2, a3, a4, a5, a6.
+ */
+static __always_inline long kvm_hypercall(u64 fid)
+{
+   register long ret asm("v0");
+   register unsigned long fun asm("a0") = fid;
+
+   __asm__ __volatile__(
+   "hvcl "__stringify(KVM_HC_SERVICE)
+   : "=r" (ret)
+   : "r" (fun)
+   : "memory"
+   );
+
+   return ret;
+}
+
+static __always_inline long kvm_hypercall1(u64 fid, unsigned long arg0)
+{
+   register long ret asm("v0");
+   register unsigned long fun asm("a0") = fid;
+   register unsigned long a1  asm("a1") = arg0;
+
+   __asm__ __volatile__(
+   "hvcl "__stringify(KVM_HC_SERVICE)
+   : "=r" (ret)
+   : "r" (fun), "r" (a1)
+   : "memory"
+   );
+
+   return ret;
+}
+
+static __always_inline long kvm_hypercall2(u64 fid,
+   unsigned long arg0, unsigned long arg1)
+{
+   register long ret asm("v0");
+   register unsigned long fun asm("a0") = fid;
+   register unsigned long a1  asm("a1") = arg0;
+   register unsigned long a2  asm("a2") = arg1;
+
+   __asm__ __volatile__(
+   "hvcl "__stringify(KVM_HC_SERVICE)
+   : "=r" (ret)
+   : "r" (fun), "r" (a1), "r" (a2)
+   : "memory"
+   );
+
+   return ret;
+}
+
+static __always_inline long kvm_hypercall3(u64 fid,
+   unsigned long arg0, unsigned long arg1, unsigned long arg2)
+{
+   register long ret asm("v0");
+   

Re: [PATCH v3 1/6] LoongArch/smp: Refine ipi ops on LoongArch platform

2024-01-29 Thread maobibo




On 2024/1/29 下午8:38, Huacai Chen wrote:

Hi, Bibo,

On Mon, Jan 22, 2024 at 6:03 PM Bibo Mao  wrote:


This patch refines ipi handling on LoongArch platform, there are
three changes with this patch.
1. Add generic get_percpu_irq api, replace some percpu irq function
such as get_ipi_irq/get_pmc_irq/get_timer_irq with get_percpu_irq.

2. Change parameter action definition with function
loongson_send_ipi_single and loongson_send_ipi_mask. Code encoding is used
here rather than bitmap encoding for ipi action, ipi hw sender uses action
code, and ipi receiver will get action bitmap encoding, the ipi hw will
convert it into bitmap in ipi message buffer.

3. Add smp_ops on LoongArch platform so that pv ipi can be used later.

Signed-off-by: Bibo Mao 
---
  arch/loongarch/include/asm/hardirq.h |  4 ++
  arch/loongarch/include/asm/irq.h | 10 -
  arch/loongarch/include/asm/smp.h | 31 +++
  arch/loongarch/kernel/irq.c  | 22 +--
  arch/loongarch/kernel/perf_event.c   | 14 +--
  arch/loongarch/kernel/smp.c  | 58 +++-
  arch/loongarch/kernel/time.c | 12 +-
  7 files changed, 71 insertions(+), 80 deletions(-)

diff --git a/arch/loongarch/include/asm/hardirq.h 
b/arch/loongarch/include/asm/hardirq.h
index 0ef3b18f8980..9f0038e19c7f 100644
--- a/arch/loongarch/include/asm/hardirq.h
+++ b/arch/loongarch/include/asm/hardirq.h
@@ -12,6 +12,10 @@
  extern void ack_bad_irq(unsigned int irq);
  #define ack_bad_irq ack_bad_irq

+enum ipi_msg_type {
+   IPI_RESCHEDULE,
+   IPI_CALL_FUNCTION,
+};
  #define NR_IPI 2

  typedef struct {
diff --git a/arch/loongarch/include/asm/irq.h b/arch/loongarch/include/asm/irq.h
index 218b4da0ea90..00101b6d601e 100644
--- a/arch/loongarch/include/asm/irq.h
+++ b/arch/loongarch/include/asm/irq.h
@@ -117,8 +117,16 @@ extern struct fwnode_handle *liointc_handle;
  extern struct fwnode_handle *pch_lpc_handle;
  extern struct fwnode_handle *pch_pic_handle[MAX_IO_PICS];

-extern irqreturn_t loongson_ipi_interrupt(int irq, void *dev);
+static inline int get_percpu_irq(int vector)
+{
+   struct irq_domain *d;
+
+   d = irq_find_matching_fwnode(cpuintc_handle, DOMAIN_BUS_ANY);
+   if (d)
+   return irq_create_mapping(d, vector);

+   return -EINVAL;
+}
  #include 

  #endif /* _ASM_IRQ_H */
diff --git a/arch/loongarch/include/asm/smp.h b/arch/loongarch/include/asm/smp.h
index f81e5f01d619..330f1cb3741c 100644
--- a/arch/loongarch/include/asm/smp.h
+++ b/arch/loongarch/include/asm/smp.h
@@ -12,6 +12,13 @@
  #include 
  #include 

+struct smp_ops {
+   void (*call_func_ipi)(const struct cpumask *mask, unsigned int action);
+   void (*call_func_single_ipi)(int cpu, unsigned int action);

To keep consistency, it is better to use call_func_ipi_single and
call_func_ipi_mask.
Yes, how about using send_ipi_single/send_ipi_mask here, since both
arch_smp_send_reschedule() and arch_send_call_function_single_ipi() go
through smp_ops? (See the sketch at the end of this message.)





+   void (*ipi_init)(void);
+};
+
+extern struct smp_ops smp_ops;
  extern int smp_num_siblings;
  extern int num_processors;
  extern int disabled_cpus;
@@ -24,8 +31,6 @@ void loongson_prepare_cpus(unsigned int max_cpus);
  void loongson_boot_secondary(int cpu, struct task_struct *idle);
  void loongson_init_secondary(void);
  void loongson_smp_finish(void);
-void loongson_send_ipi_single(int cpu, unsigned int action);
-void loongson_send_ipi_mask(const struct cpumask *mask, unsigned int action);
  #ifdef CONFIG_HOTPLUG_CPU
  int loongson_cpu_disable(void);
  void loongson_cpu_die(unsigned int cpu);
@@ -59,9 +64,12 @@ extern int __cpu_logical_map[NR_CPUS];

  #define cpu_physical_id(cpu)   cpu_logical_map(cpu)

-#define SMP_BOOT_CPU   0x1
-#define SMP_RESCHEDULE 0x2
-#define SMP_CALL_FUNCTION  0x4
+#define ACTION_BOOT_CPU   0
+#define ACTION_RESCHEDULE 1
+#define ACTION_CALL_FUNCTION  2
+#define SMP_BOOT_CPU   BIT(ACTION_BOOT_CPU)
+#define SMP_RESCHEDULE BIT(ACTION_RESCHEDULE)
+#define SMP_CALL_FUNCTION  BIT(ACTION_CALL_FUNCTION)

  struct secondary_data {
 unsigned long stack;
@@ -71,7 +79,8 @@ extern struct secondary_data cpuboot_data;

  extern asmlinkage void smpboot_entry(void);
  extern asmlinkage void start_secondary(void);
-
+extern void arch_send_call_function_single_ipi(int cpu);
+extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);

Similarly, to keep consistency, it is better to use
arch_send_function_ipi_single and arch_send_function_ipi_mask.
These two functions are used by all architectures and are called from common
code such as send_call_function_single_ipi(). They do the same thing as the
removed static inline functions shown below:


 -static inline void arch_send_call_function_single_ipi(int cpu)
 -{
 -   loongson_send_ipi_single(cpu, SMP_CALL_FUNCTION);
 -}
 -
 -static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 -{
 -   loongson_send_ipi_mask(mask, SMP_CALL_FUNCTION);
 -}
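(For illustration only -- a rough sketch of how the removed inlines could be
re-implemented on top of smp_ops, using the send_ipi_single/send_ipi_mask
names discussed above; this is not the submitted patch.)

struct smp_ops {
	void (*ipi_init)(void);
	void (*send_ipi_single)(int cpu, unsigned int action);
	void (*send_ipi_mask)(const struct cpumask *mask, unsigned int action);
};

void arch_send_call_function_single_ipi(int cpu)
{
	smp_ops.send_ipi_single(cpu, ACTION_CALL_FUNCTION);
}

void arch_send_call_function_ipi_mask(const struct cpumask *mask)
{
	smp_ops.send_ipi_mask(mask, ACTION_CALL_FUNCTION);
}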

Re: [PATCH 4/5] LoongArch: Add paravirt interface for guest kernel

2024-01-03 Thread maobibo




On 2024/1/3 下午4:14, Juergen Gross wrote:

On 03.01.24 09:00, maobibo wrote:



On 2024/1/3 下午3:40, Jürgen Groß wrote:

On 03.01.24 08:16, Bibo Mao wrote:

The patch add paravirt interface for guest kernel, it checks whether
system runs on VM mode. If it is, it will detect hypervisor type. And
returns true it is KVM hypervisor, else return false. Currently only
KVM hypervisor is supported, so there is only hypervisor detection
for KVM type.


I guess you are talking of pv_guest_init() here? Or do you mean
kvm_para_available()?

Yes, it is pv_guest_init(). It would be better if all hypervisor detection
were done in pv_guest_init(). Currently there is only the KVM hypervisor,
so kvm_para_available() is hard-coded in pv_guest_init() here.


I think this is no problem as long as there are not more hypervisors
supported.



I can split the file paravirt.c into paravirt.c and kvm.c: paravirt.c would
be used for hypervisor detection, and the code related to pv_ipi would move
into kvm.c.


I wouldn't do that right now.

Just be a little bit more specific in the commit message (use the related
function name instead of "it").

Sure, will do this in next version, and thanks for your guidance.

Regards
Bibo Mao



Juergen





Re: [PATCH 4/5] LoongArch: Add paravirt interface for guest kernel

2024-01-03 Thread maobibo




On 2024/1/3 下午3:40, Jürgen Groß wrote:

On 03.01.24 08:16, Bibo Mao wrote:

The patch add paravirt interface for guest kernel, it checks whether
system runs on VM mode. If it is, it will detect hypervisor type. And
returns true it is KVM hypervisor, else return false. Currently only
KVM hypervisor is supported, so there is only hypervisor detection
for KVM type.


I guess you are talking of pv_guest_init() here? Or do you mean
kvm_para_available()?

Yes, it is pv_guest_init(). It would be better if all hypervisor detection
were done in pv_guest_init(). Currently there is only the KVM hypervisor,
so kvm_para_available() is hard-coded in pv_guest_init() here.


I can split the file paravirt.c into paravirt.c and kvm.c: paravirt.c would
be used for hypervisor detection, and the code related to pv_ipi would move
into kvm.c.
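(For illustration only -- a minimal, hypothetical sketch of the split being
described. kvm_para_available() is mentioned above; cpu_has_hypervisor and
pv_kvm_init() are names assumed just for this example.)

/* paravirt.c: generic entry point, only does hypervisor detection */
int __init pv_guest_init(void)
{
	if (!cpu_has_hypervisor)	/* not running as a guest */
		return 0;

	if (kvm_para_available())	/* KVM is the only hypervisor handled today */
		return pv_kvm_init();	/* KVM-specific bits (e.g. pv_ipi) live in kvm.c */

	return 0;
}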


Regards
Bibo Mao




Signed-off-by: Bibo Mao 
---
  arch/loongarch/Kconfig    |  8 
  arch/loongarch/include/asm/kvm_para.h |  7 
  arch/loongarch/include/asm/paravirt.h | 27 
  .../include/asm/paravirt_api_clock.h  |  1 +
  arch/loongarch/kernel/Makefile    |  1 +
  arch/loongarch/kernel/paravirt.c  | 41 +++
  arch/loongarch/kernel/setup.c |  2 +
  7 files changed, 87 insertions(+)
  create mode 100644 arch/loongarch/include/asm/paravirt.h
  create mode 100644 arch/loongarch/include/asm/paravirt_api_clock.h
  create mode 100644 arch/loongarch/kernel/paravirt.c

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index ee123820a476..940e5960d297 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -564,6 +564,14 @@ config CPU_HAS_PREFETCH
  bool
  default y
+config PARAVIRT
+    bool "Enable paravirtualization code"
+    help
+  This changes the kernel so it can modify itself when it is run
+  under a hypervisor, potentially improving performance 
significantly

+  over full virtualization.  However, when run without a hypervisor
+  the kernel is theoretically slower and slightly larger.
+
  config ARCH_SUPPORTS_KEXEC
  def_bool y
diff --git a/arch/loongarch/include/asm/kvm_para.h 
b/arch/loongarch/include/asm/kvm_para.h

index 9425d3b7e486..41200e922a82 100644
--- a/arch/loongarch/include/asm/kvm_para.h
+++ b/arch/loongarch/include/asm/kvm_para.h
@@ -2,6 +2,13 @@
  #ifndef _ASM_LOONGARCH_KVM_PARA_H
  #define _ASM_LOONGARCH_KVM_PARA_H
+/*
+ * Hypcall code field
+ */
+#define HYPERVISOR_KVM    1
+#define HYPERVISOR_VENDOR_SHIFT    8
+#define HYPERCALL_CODE(vendor, code)    ((vendor << 
HYPERVISOR_VENDOR_SHIFT) + code)

+
  /*
   * LoongArch hypcall return code
   */
diff --git a/arch/loongarch/include/asm/paravirt.h 
b/arch/loongarch/include/asm/paravirt.h

new file mode 100644
index ..b64813592ba0
--- /dev/null
+++ b/arch/loongarch/include/asm/paravirt.h
@@ -0,0 +1,27 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_LOONGARCH_PARAVIRT_H
+#define _ASM_LOONGARCH_PARAVIRT_H
+
+#ifdef CONFIG_PARAVIRT
+#include 
+struct static_key;
+extern struct static_key paravirt_steal_enabled;
+extern struct static_key paravirt_steal_rq_enabled;
+
+u64 dummy_steal_clock(int cpu);
+DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock);
+
+static inline u64 paravirt_steal_clock(int cpu)
+{
+    return static_call(pv_steal_clock)(cpu);
+}
+
+int pv_guest_init(void);
+#else
+static inline int pv_guest_init(void)
+{
+    return 0;
+}
+
+#endif // CONFIG_PARAVIRT
+#endif
diff --git a/arch/loongarch/include/asm/paravirt_api_clock.h 
b/arch/loongarch/include/asm/paravirt_api_clock.h

new file mode 100644
index ..65ac7cee0dad
--- /dev/null
+++ b/arch/loongarch/include/asm/paravirt_api_clock.h
@@ -0,0 +1 @@
+#include 
diff --git a/arch/loongarch/kernel/Makefile 
b/arch/loongarch/kernel/Makefile

index 3c808c680370..662e6e9de12d 100644
--- a/arch/loongarch/kernel/Makefile
+++ b/arch/loongarch/kernel/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_MODULES)    += module.o 
module-sections.o

  obj-$(CONFIG_STACKTRACE)    += stacktrace.o
  obj-$(CONFIG_PROC_FS)    += proc.o
+obj-$(CONFIG_PARAVIRT)    += paravirt.o
  obj-$(CONFIG_SMP)    += smp.o
diff --git a/arch/loongarch/kernel/paravirt.c 
b/arch/loongarch/kernel/paravirt.c

new file mode 100644
index ..21d01d05791a
--- /dev/null
+++ b/arch/loongarch/kernel/paravirt.c
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct static_key paravirt_steal_enabled;
+struct static_key paravirt_steal_rq_enabled;
+
+static u64 native_steal_clock(int cpu)
+{
+    return 0;
+}
+
+DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock);


This is the 4th arch with the same definition of native_steal_clock() and
pv_steal_clock. I think we should add a common file kernel/paravirt.c and
move the related functions from the archs into the new file.
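(For illustration only -- a rough sketch of the kind of shared file being
suggested, essentially the per-arch code quoted just above hoisted into a
common kernel/paravirt.c; this is not an actual submitted patch.)

// SPDX-License-Identifier: GPL-2.0
#include <linux/jump_label.h>
#include <linux/static_call.h>
#include <linux/types.h>

struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;

/* Default: no steal time reported when not running under a hypervisor. */
static u64 native_steal_clock(int cpu)
{
	return 0;
}

DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock);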

If you don't want to do that I can prepare a 

Re: [PATCH 1/3] mm: set page fault address for update_mmu_cache_pmd

2020-06-30 Thread maobibo



On 06/30/2020 06:42 PM, maobibo wrote:
> 
> 
> On 06/30/2020 06:09 PM, Kirill A. Shutemov wrote:
>> On Wed, Jun 24, 2020 at 05:26:30PM +0800, Bibo Mao wrote:
>>> update_mmu_cache_pmd is used to update tlb for the pmd entry by
>>> software. On MIPS system, the tlb entry indexed by page fault
>>> address maybe exists already, only that tlb entry may be small
>>> page, also it may be huge page. Before updating pmd entry with
>>> huge page size, older tlb entry need to be invalidated.
>>>
>>> Here page fault address is passed to function update_mmu_cache_pmd,
>>> rather than pmd huge page start address. The page fault address
>>> can be used for invalidating older tlb entry.
>>>
>>> Signed-off-by: Bibo Mao 
>>> ---
>>>  arch/mips/include/asm/pgtable.h | 9 +
>>>  mm/huge_memory.c| 7 ---
>>>  mm/memory.c | 2 +-
>>>  3 files changed, 14 insertions(+), 4 deletions(-)
>>>
>>> diff --git a/arch/mips/include/asm/pgtable.h 
>>> b/arch/mips/include/asm/pgtable.h
>>> index dd7a0f5..bd81661 100644
>>> --- a/arch/mips/include/asm/pgtable.h
>>> +++ b/arch/mips/include/asm/pgtable.h
>>> @@ -554,11 +554,20 @@ static inline void update_mmu_cache(struct 
>>> vm_area_struct *vma,
>>>  #define__HAVE_ARCH_UPDATE_MMU_TLB
>>>  #define update_mmu_tlb update_mmu_cache
>>>  
>>> +extern void local_flush_tlb_page(struct vm_area_struct *vma,
>>> +   unsigned long page);
>>>  static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
>>> unsigned long address, pmd_t *pmdp)
>>>  {
>>> pte_t pte = *(pte_t *)pmdp;
>>>  
>>> +   /*
>>> +* If pmd_none is true, older tlb entry will be normal page.
>>> +* here to invalidate older tlb entry indexed by address
>>> +* parameter address must be page fault address rather than
>>> +* start address of pmd huge page
>>> +*/
>>> +   local_flush_tlb_page(vma, address);
>>
>> Can't say I follow what is going on.
>>
>> Why local? What happens on SMP?
>>
>> And don't you want to flush PMD_SIZE range around the address?
> There exists two conditions:
> 1. The address is accessed for the first time, there will be one tlb entry 
> with normal page
>size, and privilege for the tlb entry is none. If new tlb entry wants to 
> be added with
>huge page size, the older tlb entry needs to be removed.  Local flushing 
> is enough, if there
>are smp threads running, there will be page fault handing since privilege 
> level is none. During
>page fault handling, the other threads will do the same work, flush local 
> entry, update new entry
>with huge page size.
> 
> 2. It is not accessed by the first time, there exists old tlb entry with huge 
> page such as COW scenery.
>local_flush_tlb_page is not necessary here, old tlb with huge page will be 
> replace with new tlb
>in function __update_tlb.
> 
> For PMD_SIZE range around the address, there exists one tlb entry with huge 
> page size, or one tlb entry
> with normal page size and zero privilege. It is impossible that there exists 
> two or more tlb entries
> with normal page within PMD_SIZE range, so we do not need flush pmd range 
> size, just flush one tlb entry
> is ok.
Sorry for the noise; please discard the patch.

Actually, two or more TLB entries with normal pages can exist within a
PMD_SIZE range. If multiple threads run on UP or on a single CPU, the threads
access the same huge page but different normal pages. A page fault happens on
thread1, and thread1 is scheduled out during page fault handling; thread2 is
then scheduled in and faults again, so there will be two TLB entries with
normal pages. This problem exists even without the patch.


> 
> regards
> bibo,mao
> 
>>
>>> __update_tlb(vma, address, pte);
>>>  }
>>>  
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 78c84be..0f9187b 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -780,6 +780,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, 
>>> unsigned long addr,
>>> pgtable_t pgtable)
>>>  {
>>> struct mm_struct *mm = vma->vm_mm;
>>> +   unsigned long start = addr & PMD_MASK;
>>> pmd_t entry;
>>> spinlock_t *ptl;
>>>  
>>> @@ -792,7 +793,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, 
>>> unsigned long add

Re: [PATCH 1/3] mm: set page fault address for update_mmu_cache_pmd

2020-06-30 Thread maobibo



On 06/30/2020 06:09 PM, Kirill A. Shutemov wrote:
> On Wed, Jun 24, 2020 at 05:26:30PM +0800, Bibo Mao wrote:
>> update_mmu_cache_pmd is used to update tlb for the pmd entry by
>> software. On MIPS system, the tlb entry indexed by page fault
>> address maybe exists already, only that tlb entry may be small
>> page, also it may be huge page. Before updating pmd entry with
>> huge page size, older tlb entry need to be invalidated.
>>
>> Here page fault address is passed to function update_mmu_cache_pmd,
>> rather than pmd huge page start address. The page fault address
>> can be used for invalidating older tlb entry.
>>
>> Signed-off-by: Bibo Mao 
>> ---
>>  arch/mips/include/asm/pgtable.h | 9 +
>>  mm/huge_memory.c| 7 ---
>>  mm/memory.c | 2 +-
>>  3 files changed, 14 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/mips/include/asm/pgtable.h 
>> b/arch/mips/include/asm/pgtable.h
>> index dd7a0f5..bd81661 100644
>> --- a/arch/mips/include/asm/pgtable.h
>> +++ b/arch/mips/include/asm/pgtable.h
>> @@ -554,11 +554,20 @@ static inline void update_mmu_cache(struct 
>> vm_area_struct *vma,
>>  #define __HAVE_ARCH_UPDATE_MMU_TLB
>>  #define update_mmu_tlb  update_mmu_cache
>>  
>> +extern void local_flush_tlb_page(struct vm_area_struct *vma,
>> +unsigned long page);
>>  static inline void update_mmu_cache_pmd(struct vm_area_struct *vma,
>>  unsigned long address, pmd_t *pmdp)
>>  {
>>  pte_t pte = *(pte_t *)pmdp;
>>  
>> +/*
>> + * If pmd_none is true, older tlb entry will be normal page.
>> + * here to invalidate older tlb entry indexed by address
>> + * parameter address must be page fault address rather than
>> + * start address of pmd huge page
>> + */
>> +local_flush_tlb_page(vma, address);
> 
> Can't say I follow what is going on.
> 
> Why local? What happens on SMP?
> 
> And don't you want to flush PMD_SIZE range around the address?
There are two cases:
1. The address is accessed for the first time: there will be one TLB entry
   with normal page size, and the privilege for that TLB entry is none. If a
   new TLB entry with huge page size is to be added, the older TLB entry needs
   to be removed. Local flushing is enough; if other SMP threads are running,
   they will take page faults since the privilege level is none, and during
   page fault handling those threads will do the same work: flush the local
   entry and install the new entry with huge page size.

2. The address is not being accessed for the first time: an old TLB entry with
   a huge page already exists, such as in the COW scenario.
   local_flush_tlb_page is not necessary here; the old huge-page TLB entry
   will be replaced with the new one in __update_tlb().

For the PMD_SIZE range around the address, there is either one TLB entry with
huge page size, or one TLB entry with normal page size and zero privilege. It
is impossible for two or more TLB entries with normal pages to exist within
the PMD_SIZE range, so we do not need to flush the whole PMD range; flushing
one TLB entry is enough.

regards
bibo,mao

> 
>>  __update_tlb(vma, address, pte);
>>  }
>>  
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 78c84be..0f9187b 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -780,6 +780,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, 
>> unsigned long addr,
>>  pgtable_t pgtable)
>>  {
>>  struct mm_struct *mm = vma->vm_mm;
>> +unsigned long start = addr & PMD_MASK;
>>  pmd_t entry;
>>  spinlock_t *ptl;
>>  
>> @@ -792,7 +793,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, 
>> unsigned long addr,
>>  }
>>  entry = pmd_mkyoung(*pmd);
>>  entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>> -if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
>> +if (pmdp_set_access_flags(vma, start, pmd, entry, 1))
>>  update_mmu_cache_pmd(vma, addr, pmd);
>>  }
>>  
>> @@ -813,7 +814,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, 
>> unsigned long addr,
>>  pgtable = NULL;
>>  }
>>  
>> -set_pmd_at(mm, addr, pmd, entry);
>> +set_pmd_at(mm, start, pmd, entry);
>>  update_mmu_cache_pmd(vma, addr, pmd);
>>  
>>  out_unlock:
>> @@ -864,7 +865,7 @@ vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, 
>> pfn_t pfn,
>>  
>>  track_pfn_insert(vma, &pgprot, pfn);
>>  
>> -insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
>> +insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn, pgprot, write, 
>> pgtable);
>>  return VM_FAULT_NOPAGE;
>>  }
>>  EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd_prot);
>> diff --git a/mm/memory.c b/mm/memory.c
>> index dc7f354..c703458 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -3592,7 +3592,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, 
>> struct page *page)
>>  
>>

Re: [PATCH 2/3] mm/huge_memory.c: update tlb entry if pmd is changed

2020-06-25 Thread maobibo



On 06/25/2020 08:30 AM, Mike Kravetz wrote:
> On 6/24/20 2:26 AM, Bibo Mao wrote:
>> When set_pmd_at is called in function do_huge_pmd_anonymous_page,
>> new tlb entry can be added by software on MIPS platform.
>>
>> Here add update_mmu_cache_pmd when pmd entry is set, and
>> update_mmu_cache_pmd is defined as empty excepts arc/mips platform.
>> This patch has no negative effect on other platforms except arc/mips
>> system.
> 
> I am confused by this comment.  It appears that update_mmu_cache_pmd
> is defined as non-empty on arc, mips, powerpc and sparc architectures.
> Am I missing something?
Ohh, sparc is missing here; it is not defined as empty there. On powerpc it is
defined as empty.

> 
> If those architectures do provide update_mmu_cache_pmd, then the previous
> patch and this one now call update_mmu_cache_pmd with the actual faulting
> address instead of the huge page aligned address.  This was intentional
> for mips.  However, are there any potential issues on the other architectures?
It is not special to MIPS; it is only that the fault address is useful on MIPS
systems. In huge_pmd_set_accessed/do_huge_pmd_wp_page, update_mmu_cache_pmd is
already called with vmf->address rather than the start address of the PMD page.

regards
bibo,mao

> I am no expert in any of those architectures.  arc looks like it could be
> problematic as update_mmu_cache_pmd calls update_mmu_cache and then
> operates on (address & PAGE_MASK).  That could now be different.



Re: [PATCH] MIPS: Do not flush tlb when setting pmd entry

2020-06-23 Thread maobibo



On 06/22/2020 11:48 PM, Thomas Bogendoerfer wrote:
> On Sat, Jun 20, 2020 at 11:47:35AM +0800, maobibo wrote:
>>
>>
>> On 06/17/2020 07:14 PM, Thomas Bogendoerfer wrote:
>>> On Tue, Jun 16, 2020 at 06:34:21PM +0800, maobibo wrote:
>>>>
>>>>
>>>> On 06/15/2020 06:14 PM, Thomas Bogendoerfer wrote:
>>>>> On Wed, Jun 03, 2020 at 05:42:13PM +0800, Bibo Mao wrote:
>>>>>> Function set_pmd_at is to set pmd entry, if tlb entry need to
>>>>>> be flushed, there exists pmdp_huge_clear_flush alike function
>>>>>> before set_pmd_at is called. So it is not necessary to call
>>>>>> flush_tlb_all in this function.
>>>>>
>>>>> have you checked all set_pmd_at() calls ? I found a few case where
>>>>> it's not clear to me, if tlb flushing is done... If you think this
>>>>> is still the right thing to do, please change arch/mips/mm/pgtable-32.c
>>>>> as well.
>>>> well, I will double check this and do more testing about thp and hugepage.
>>>
>>> I was more concerned about
>>>
>>> fs/dax.c
>>> fs/proc/task_mmu.c
>>> mm/rmap.c
>>
>> I think that flush_tlb_all should not be called in function set_pmd_at
>> on mips platform. However update_mmu_cache_pmd() should be called __after__
>> set_pmd_at() function to update tlb entry at some places, it is another 
>> issue.
>> Here is my analysis in the three files where set_pmd_at is called.
>> [..]
> 
> thank you for confirming that we are good with removing flush_tlb_all().
Sorry, there is a problem with removing flush_tlb_all(). If pmd_none is true,
the pmd points to invalid_pte_table, and there may already be a PTE entry with
normal page size for the fault address. We need to invalidate this PTE entry,
as is done in build_huge_handler_tail() in arch/mips/mm/tlbex.c.

I will send another patch.
> 
> Thomas.
> 



Re: [PATCH v8] mm: Proactive compaction

2020-06-22 Thread maobibo



On 06/23/2020 10:26 AM, Nathan Chancellor wrote:
> On Tue, Jun 16, 2020 at 01:45:27PM -0700, Nitin Gupta wrote:
>> For some applications, we need to allocate almost all memory as
>> hugepages. However, on a running system, higher-order allocations can
>> fail if the memory is fragmented. Linux kernel currently does on-demand
>> compaction as we request more hugepages, but this style of compaction
>> incurs very high latency. Experiments with one-time full memory
>> compaction (followed by hugepage allocations) show that kernel is able
>> to restore a highly fragmented memory state to a fairly compacted memory
>> state within <1 sec for a 32G system. Such data suggests that a more
>> proactive compaction can help us allocate a large fraction of memory as
>> hugepages keeping allocation latencies low.
>>
>> For a more proactive compaction, the approach taken here is to define a
>> new sysctl called 'vm.compaction_proactiveness' which dictates bounds
>> for external fragmentation which kcompactd tries to maintain.
>>
>> The tunable takes a value in range [0, 100], with a default of 20.
>>
>> Note that a previous version of this patch [1] was found to introduce
>> too many tunables (per-order extfrag{low, high}), but this one reduces
>> them to just one sysctl. Also, the new tunable is an opaque value
>> instead of asking for specific bounds of "external fragmentation", which
>> would have been difficult to estimate. The internal interpretation of
>> this opaque value allows for future fine-tuning.
>>
>> Currently, we use a simple translation from this tunable to [low, high]
>> "fragmentation score" thresholds (low=100-proactiveness, high=low+10%).
>> The score for a node is defined as weighted mean of per-zone external
>> fragmentation. A zone's present_pages determines its weight.
>>
>> To periodically check per-node score, we reuse per-node kcompactd
>> threads, which are woken up every 500 milliseconds to check the same. If
>> a node's score exceeds its high threshold (as derived from user-provided
>> proactiveness value), proactive compaction is started until its score
>> reaches its low threshold value. By default, proactiveness is set to 20,
>> which implies threshold values of low=80 and high=90.
>>
>> This patch is largely based on ideas from Michal Hocko [2]. See also the
>> LWN article [3].
>>
>> Performance data
>> 
>>
>> System: x64_64, 1T RAM, 80 CPU threads.
>> Kernel: 5.6.0-rc3 + this patch
>>
>> echo madvise | sudo tee /sys/kernel/mm/transparent_hugepage/enabled
>> echo madvise | sudo tee /sys/kernel/mm/transparent_hugepage/defrag
>>
>> Before starting the driver, the system was fragmented from a userspace
>> program that allocates all memory and then for each 2M aligned section,
>> frees 3/4 of base pages using munmap. The workload is mainly anonymous
>> userspace pages, which are easy to move around. I intentionally avoided
>> unmovable pages in this test to see how much latency we incur when
>> hugepage allocations hit direct compaction.
>>
>> 1. Kernel hugepage allocation latencies
>>
>> With the system in such a fragmented state, a kernel driver then
>> allocates as many hugepages as possible and measures allocation
>> latency:
>>
>> (all latency values are in microseconds)
>>
>> - With vanilla 5.6.0-rc3
>>
>>   percentile latency
>>   –– –––
>> 5    7894
>>10    9496
>>25   12561
>>30   15295
>>40   18244
>>50   21229
>>60   27556
>>75   30147
>>80   31047
>>90   32859
>>95   33799
>>
>> Total 2M hugepages allocated = 383859 (749G worth of hugepages out of
>> 762G total free => 98% of free memory could be allocated as hugepages)
>>
>> - With 5.6.0-rc3 + this patch, with proactiveness=20
>>
>> sysctl -w vm.compaction_proactiveness=20
>>
>>   percentile latency
>>   –– –––
>> 5   2
>>10   2
>>25   3
>>30   3
>>40   3
>>50   4
>>60   4
>>75   4
>>80   4
>>90   5
>>95 429
>>
>> Total 2M hugepages allocated = 384105 (750G worth of hugepages out of
>> 762G total free => 98% of free memory could be allocated as hugepages)
>>
>> 2. JAVA heap allocation
>>
>> In this test, we first fragment memory using the same method as for (1).
>>
>> Then, we start a Java process with a heap size set to 700G and request
>> the heap to be allocated with THP hugepages. We also set THP to madvise
>> to allow hugepage backing of this heap.
>>
>> /usr/bin/time
>>  java -Xms700G -Xmx700G -XX:+UseTransparentHugePages -XX:+AlwaysPreTouch
>>
>> The above command allocates 700G of Java heap using hugepages.
>>
>> - With vanilla 5.6.0-rc3
>>
>> 17.39user 1666.48system 27:37.89elapsed
>>
>> - With 5.6.0-rc3 + this patch, with proactiveness=20
>>
>> 8.35user 194.58system 3:19.62elapsed
>>
>> Elapsed time remains around 3:15, as proactiveness is further increased.

Re: [PATCH] MIPS: Do not flush tlb when setting pmd entry

2020-06-19 Thread maobibo



On 06/17/2020 07:14 PM, Thomas Bogendoerfer wrote:
> On Tue, Jun 16, 2020 at 06:34:21PM +0800, maobibo wrote:
>>
>>
>> On 06/15/2020 06:14 PM, Thomas Bogendoerfer wrote:
>>> On Wed, Jun 03, 2020 at 05:42:13PM +0800, Bibo Mao wrote:
>>>> Function set_pmd_at is to set pmd entry, if tlb entry need to
>>>> be flushed, there exists pmdp_huge_clear_flush alike function
>>>> before set_pmd_at is called. So it is not necessary to call
>>>> flush_tlb_all in this function.
>>>
>>> have you checked all set_pmd_at() calls ? I found a few case where
>>> it's not clear to me, if tlb flushing is done... If you think this
>>> is still the right thing to do, please change arch/mips/mm/pgtable-32.c
>>> as well.
>> well, I will double check this and do more testing about thp and hugepage.
> 
> I was more concerned about
> 
> fs/dax.c
> fs/proc/task_mmu.c
> mm/rmap.c

I think flush_tlb_all should not be called in set_pmd_at() on the MIPS
platform. However, update_mmu_cache_pmd() should be called __after__
set_pmd_at() to update the TLB entry in some places; that is a separate issue.
Here is my analysis of the three files where set_pmd_at is called.

in file fs/dax.c
--
if (pmdp) {
#ifdef CONFIG_FS_DAX_PMD
pmd_t pmd;

if (pfn != pmd_pfn(*pmdp))
goto unlock_pmd;  
if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
goto unlock_pmd;

flush_cache_page(vma, address, pfn);
pmd = pmdp_invalidate(vma, address, pmdp);
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
set_pmd_at(vma->vm_mm, address, pmdp, pmd);
unlock_pmd:  
#endif 
--
pmdp_invalidate is already called here to flush the PMD range, so it is not
necessary to flush it again in set_pmd_at.

--
if (!pmd_none(*(vmf->pmd))) {
spin_unlock(ptl);
goto fallback;
}

if (pgtable) {
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
mm_inc_nr_ptes(vma->vm_mm);
}
pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
pmd_entry = pmd_mkhuge(pmd_entry);
set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
spin_unlock(ptl);
trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
return VM_FAULT_NOPAGE;
--
The pmd entry is none, so there is no need to flush the PMD range.


in file fs/proc/task_mmu.c
--
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
pmd_t old, pmd = *pmdp;

if (pmd_present(pmd)) {
/* See comment in change_huge_pmd() */
old = pmdp_invalidate(vma, addr, pmdp);
if (pmd_dirty(old))
pmd = pmd_mkdirty(pmd);
if (pmd_young(old))
pmd = pmd_mkyoung(pmd);

pmd = pmd_wrprotect(pmd);
pmd = pmd_clear_soft_dirty(pmd);

set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
} else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
pmd = pmd_swp_clear_soft_dirty(pmd);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
}
}
--
At the first place where set_pmd_at is called, pmdp_invalidate is called to
flush the PMD range. At the second place, pmd_swp_clear_soft_dirty(pmd) is
equal to pmd on MIPS, so the pmd entry does not change and there is no need
to flush the PMD range.

in file linux/mm/rmap.c:
--
pmd_t *pmd = pvmw.pmd;
pmd_t entry;
  
if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
continue;

flush_cache_page(vma, address, page_to_pfn(page));
entry = pmdp_invalidate(vma, address, pmd);
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
set_pmd_at(vma->vm_mm, address, pmd, entry);
ret = 1;
--
pmdp_invalidate is called to flush the PMD range, so there is no need to flush
it again in set_pmd_at.
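(For illustration only -- a sketch of what the change under discussion amounts
to in the MIPS set_pmd_at() implementation; this is not the submitted patch,
and note the later reply in this thread about the pmd_none case, which led to
a revised approach.)

void set_pmd_at(struct mm_struct *mm, unsigned long addr,
		pmd_t *pmdp, pmd_t pmd)
{
	*pmdp = pmd;
	/*
	 * No flush_tlb_all() here: per the analysis above, callers that need
	 * a flush already do pmdp_invalidate() (or equivalent) beforehand.
	 */
}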

> 
> Thomas.
> 



Re: [PATCH] MIPS: Do not flush tlb when setting pmd entry

2020-06-16 Thread maobibo



On 06/15/2020 06:14 PM, Thomas Bogendoerfer wrote:
> On Wed, Jun 03, 2020 at 05:42:13PM +0800, Bibo Mao wrote:
>> Function set_pmd_at is to set pmd entry, if tlb entry need to
>> be flushed, there exists pmdp_huge_clear_flush alike function
>> before set_pmd_at is called. So it is not necessary to call
>> flush_tlb_all in this function.
> 
> have you checked all set_pmd_at() calls ? I found a few case where
> it's not clear to me, if tlb flushing is done... If you think this
> is still the right thing to do, please change arch/mips/mm/pgtable-32.c
> as well.
Well, I will double-check this and do more testing of THP and hugepage.

regards
bibo,mao
> 
> Thomas.
> 



Re: [PATCH 1/2] MIPS: set page access bit with pgprot on some MIPS platform

2020-06-05 Thread maobibo



On 06/05/2020 05:39 PM, Jiaxun Yang wrote:
> On Fri,  5 Jun 2020 17:11:05 +0800
> Bibo Mao  wrote:
> 
>> On MIPS system which has rixi hardware bit, page access bit is not
>> set in pgrot. For memory reading, there will be one page fault to
>> allocate physical page; however valid bit is not set, there will
>> be the second fast tlb-miss fault handling to set valid/access bit.
>>
>> This patch set page access/valid bit with pgrot if there is reading
>> access privilege. It will reduce one tlb-miss handling for memory
>> reading access.
>>
>> The valid/access bit will be cleared in order to track memory
>> accessing activity. If the page is accessed, tlb-miss fast handling
>> will set valid/access bit, pte_sw_mkyoung is not necessary in slow
>> page fault path. This patch removes pte_sw_mkyoung function which
>> is defined as empty function except MIPS system.
>>
>> Signed-off-by: Bibo Mao 
>> ---
> 
> Thanks for tracking it down.
> 
> Could you please make the patch tittle more clear?
> "Some" looks confuse to me, "systems with RIXI" would be better.

Sure, will add it.

> 
> - Jiaxun
> 



Re: [GIT PULL] MIPS changes for v5.8-rc1

2020-06-03 Thread maobibo



On 06/04/2020 05:00 AM, Linus Torvalds wrote:
> On Tue, Jun 2, 2020 at 5:55 AM Thomas Bogendoerfer
>  wrote:
>>
>> Bibo Mao (4):
>>   mm/memory.c: Add memory read privilege on page fault handling
> 
> Hmm. That's a horribly named commit, but can you clarify why this
> didn't just use the existing pte_mkyoung?
> 
> These are all paths that set the dirty bit early if it's a write, I
> don't see why they wouldn't set the accessed bit too.
> 
> Even on architectures that manage the accessed bit in hardware, this
> might be a (tiny) performance advantage because it avoids a page table
> walker microfault to set the bit when it's accessed.
On architectures that manage the accessed bit in hardware, the accessed bit is
set from the beginning even if there is no memory access. On MIPS the accessed
bit is not set from the beginning; it is set during page fault handling.

There are two ways to solve this. One is to set the accessed bit in the page
fault path, in the same way the dirty bit is set for write access; this adds a
one- or two-cycle penalty for other architectures. The other is to set the
accessed bit from the beginning, like other architectures, even if there is no
memory access.

I am OK with the former method; it only costs other arches one or two cycles.
For the latter, I am investigating why the accessed bit is not set from the
beginning on MIPS.
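(Illustrative sketch of the first option -- setting the accessed bit in the
fault path the same way the dirty bit is handled for writes. The lines are
adapted from the fault-handling pattern shown elsewhere in these threads; the
helper name is made up for the example.)

static void example_set_young_at_fault(struct vm_fault *vmf, struct page *page)
{
	struct vm_area_struct *vma = vmf->vma;
	pte_t entry;

	entry = mk_pte(page, vma->vm_page_prot);
	entry = pte_mkyoung(entry);	/* option 1: mark accessed at fault time */
	if (vma->vm_flags & VM_WRITE)
		entry = pte_mkwrite(pte_mkdirty(entry));	/* dirty already works this way */

	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
}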

regards
bibo, mao

> 
> We already do the pte_mkyoung() in paths like cow_user_page(), so the
> others may have been just oversights?
> 
> Or was there possibly some reason we didn't want to do it here?
> 
> Linus
> 



Re: [PATCH v7 2/4] mm/memory.c: Update local TLB if PTE entry exists

2020-05-28 Thread maobibo



On 05/29/2020 03:23 AM, Andrew Morton wrote:
> On Wed, 27 May 2020 10:25:18 +0800 Bibo Mao  wrote:
> 
>> If two threads concurrently fault at the same page, the thread that
>> won the race updates the PTE and its local TLB. For now, the other
>> thread gives up, simply does nothing, and continues.
>>
>> It could happen that this second thread triggers another fault, whereby
>> it only updates its local TLB while handling the fault. Instead of
>> triggering another fault, let's directly update the local TLB of the
>> second thread. Function update_mmu_tlb is used here to update local
>> TLB on the second thread, and it is defined as empty on other arches.
>>
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -2752,6 +2752,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
>>  new_page = old_page;
>>  page_copied = 1;
>>  } else {
>> +update_mmu_tlb(vma, vmf->address, vmf->pte);
>>  mem_cgroup_cancel_charge(new_page, memcg, false);
>>  }
>>  
> 
> When applying your patches on top of the -mm tree's changes, the above
> hunk didn't apply.  The entire `else' block was removed by
> https://ozlabs.org/~akpm/mmotm/broken-out/mm-memcontrol-convert-anon-and-file-thp-to-new-mem_cgroup_charge-api.patch
> 
> I assumed that dropping this hunk was appropriate.  Please check?
Yes, that is appropriate. Sorry to bother you; I should have formatted the
patch against the mm tree in the first place.



Re: [PATCH v3 3/3] mm/memory.c: Add memory read privilege before filling PTE entry

2020-05-27 Thread maobibo



On 05/28/2020 04:55 AM, Hugh Dickins wrote:
> On Tue, 19 May 2020, maobibo wrote:
>> On 05/19/2020 04:57 AM, Andrew Morton wrote:
>>> On Mon, 18 May 2020 13:08:49 +0800 Bibo Mao  wrote:
>>>
>>>> On mips platform, hw PTE entry valid bit is set in pte_mkyoung
>>>> function, it is used to set physical page with readable privilege.
>>>
>>> pte_mkyoung() seems to be a strange place to set the pte's valid bit. 
>>> Why is it done there?  Can it be done within mips's mk_pte()?
>> On MIPS system hardware cannot set PAGE_ACCESS bit when accessing the page,
>> software sets PAGE_ACCESS software bit and PAGE_VALID hw bit together during 
>> page
>> fault stage.
>>
>> If mk_pte is called in page fault flow, it is ok to set both bits. If it is 
>> not 
>> called in page fault, PAGE_ACCESS is set however there is no actual memory 
>> accessing.
> 
> Sorry for joining in so late, but would you please explain that some more:
> preferably in the final commit message, if not here.
> 
> I still don't understand why this is not done in the same way as on other
> architectures - those that care (I just checked x86, powerpc, arm, arm64,
> but not all of them) make sure that all the bits they want are there in
> mm/mmap.c's protection_map[16], which then feeds into vma->vm_page_prot,
> and so into mk_pte() as Andrew indicated.
> 
> And I can see that arch/mips/mm/cache.c has a setup_protection_map()
> to do that: why does it not set the additional bits that you want?
> including the valid bit and the accessed (young) bit, as others do.
> Are you saying that there are circumstances in which it is wrong
> for mk_pte() to set the additional bits?
MIPS is actually strange here: _PAGE_ACCESSED is not set in protection_map.
I do not understand the MIPS history either.

On x86/arm64/powerpc systems, the _PAGE_ACCESSED bit is set from the
beginning. How does software then track how frequently a page is accessed?
Does software not care about the current state of the _PAGE_ACCESSED bit, and
simply call madvise_cold to clear the bit and then watch whether it changes?

regards
bibo,mao
> 
> I'm afraid that generic mm developers will have no clue as to whether
> or not to add a pte_sw_mkyoung() after a mk_pte(); and generic source
> will be the cleaner if it turns out not to be needed (but thank you
> for making sure that it does nothing on the other architectures).
> 
> Hugh
> 



Re: [PATCH v6 1/4] MIPS: Do not flush tlb page when updating PTE entry

2020-05-26 Thread maobibo



On 05/25/2020 04:31 PM, Sergei Shtylyov wrote:
> On 25.05.2020 11:12, Sergei Shtylyov wrote:
> 
>>> It is not necessary to flush tlb page on all CPUs if suitable PTE
>>> entry exists already during page fault handling, just updating
>>> TLB is fine.
>>>
>>> Here redefine flush_tlb_fix_spurious_fault as empty on MIPS system.
>>
>> Need empty line here.
>>
>>> V6:
>>> - Add update_mmu_tlb function as empty on all platform except mips
>>>system, we use this function to update local tlb for page fault
>>>smp-race handling
>>> V5:
>>> - define update_mmu_cache function specified on MIPS platform, and
>>>add page fault smp-race stats info
>>> V4:
>>> - add pte_sw_mkyoung function to implement readable privilege, and
>>>this function is  only in effect on MIPS system.
>>> - add page valid bit judgement in function pte_modify
>>> V3:
>>> - add detailed changelog, modify typo issue in patch V2
>>> v2:
>>> - split flush_tlb_fix_spurious_fault and tlb update into two patches
>>> - comments typo modification
>>> - separate tlb update and add pte readable privilege into two patches
>>
>>It was a bad idea to keep the version change log in the 1st patch only,
>> we have either cover letter for that, or all the individual patches...
> 
>Sorry for noticing this only now. With 4 patches, you should have a cover 
> letter anyway...
Thanks for reviewing my patch, a cover letter will be added.

> 

>>> Signed-off-by: Bibo Mao 
>> [...]
> 
> MBR, Sergei



Re: [PATCH v6 1/4] MIPS: Do not flush tlb page when updating PTE entry

2020-05-26 Thread maobibo



On 05/26/2020 05:42 AM, Andrew Morton wrote:
> On Mon, 25 May 2020 10:52:37 +0800 Bibo Mao  wrote:
> 
>> It is not necessary to flush tlb page on all CPUs if suitable PTE
>> entry exists already during page fault handling, just updating
>> TLB is fine.
>>
>> Here redefine flush_tlb_fix_spurious_fault as empty on MIPS system.
>>
>> ...
>>
>> --- a/arch/mips/include/asm/pgtable.h
>> +++ b/arch/mips/include/asm/pgtable.h
>> @@ -478,6 +478,8 @@ static inline pgprot_t pgprot_writecombine(pgprot_t 
>> _prot)
>>  return __pgprot(prot);
>>  }
>>  
>> +#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)
>> +
> 
> static inline C would be preferred, if that works.  For a number of reasons:
> 
> - looks nicer
> 
> - more likely to get a code comment (for some reason)
> 
> - adds typechecking.  So right now a MIPS developer could do
> 
>   struct wibble a;
>   struct wobble b;
> 
>   flush_tlb_fix_spurious_fault(&a, &b);
> 
>   and there would be no compiler warning.  Then the code gets merged
>   upstream and in come the embarrassing emails!
> 
> - avoids unused-var warnings
> 
>   foo()
>   {
>   struct address_space a;
>   struct vm_area_struct v;
> 
>   flush_tlb_fix_spurious_fault(&a, &v);
>   }
> 
> will generate unused-variable warnings if
> flush_tlb_fix_spurious_fault() is a macro.  Making
> flush_tlb_fix_spurious_fault() inlined C prevents this.
> 
Sure, I will modify this and send another version.
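(For illustration, the static inline form being asked for might look roughly
like this; the empty body mirrors the do { } while (0) macro in the patch.)

static inline void flush_tlb_fix_spurious_fault(struct vm_area_struct *vma,
						unsigned long address)
{
	/* Nothing to do on MIPS: the faulting CPU just updates its own TLB. */
}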



Re: [PATCH v6 3/4] mm/memory.c: Add memory read privilege on page fault handling

2020-05-26 Thread maobibo



On 05/26/2020 05:44 AM, Andrew Morton wrote:
> On Mon, 25 May 2020 10:52:39 +0800 Bibo Mao  wrote:
> 
>> Here add pte_sw_mkyoung function to make page readable on MIPS
>> platform during page fault handling. This patch improves page
>> fault latency about 10% on my MIPS machine with lmbench
>> lat_pagefault case.
>>
>> It is noop function on other arches, there is no negative
>> influence on those architectures.
> 
> Acked-by: Andrew Morton 
> 
> Should I take these, or would the mips tree be preferred?  I'm OK
> either way, but probably the MIPS tree would be better?
Thanks for reviewing again and again.
This patch is based on mips-next, so the MIPS tree would probably be better.



Re: [PATCH v5 2/4] mm/memory.c: Update local TLB if PTE entry exists

2020-05-22 Thread maobibo



On 05/22/2020 03:22 AM, Andrew Morton wrote:
> On Thu, 21 May 2020 11:30:35 +0800 Bibo Mao  wrote:
> 
>> If two threads concurrently fault at the same address, the thread that
>> won the race updates the PTE and its local TLB. For now, the other
>> thread gives up, simply does nothing, and continues.
>>
>> It could happen that this second thread triggers another fault, whereby
>> it only updates its local TLB while handling the fault. Instead of
>> triggering another fault, let's directly update the local TLB of the
>> second thread.
>>
>> It is only useful to architectures where software can update TLB, it may
>> bring out some negative effect if update_mmu_cache is used for other
>> purpose also. It seldom happens where multiple threads access the same
>> page at the same time, so the negative effect is limited on other arches.
>>
>> With specjvm2008 workload, smp-race pgfault counts is about 3% to 4%
>> of the total pgfault counts by watching /proc/vmstats information
>>
> 
> I'm sorry to keep thrashing this for so long, but I'd really prefer not
> to add any overhead to architectures which don't need it.  However,
> we're getting somewhere!
> 
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -2436,10 +2436,9 @@ static inline bool cow_user_page(struct page *dst, 
>> struct page *src,
>>  if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
>>  /*
>>   * Other thread has already handled the fault
>> - * and we don't need to do anything. If it's
>> - * not the case, the fault will be triggered
>> - * again on the same address.
>> + * and update local tlb only
>>   */
>> +update_mmu_cache(vma, addr, vmf->pte);
> 
> Now, all the patch does is to add new calls to update_mmu_cache().
> 
> So can we replace all these with a call to a new
> update_mmu_cache_sw_tlb() (or whatever) which is a no-op on
> architectures which don't need the additional call?
> 
> Also, I wonder about the long-term maintainability.  People who
> regularly work on this code won't be thinking of this MIPS peculiarity
> and it's likely that any new calls to update_mmu_cache_sw_tlb() won't
> be added where they should have been.  Hopefully copy-and-paste from
> the existing code will serve us well.  Please do ensure that the
> update_mmu_cache_sw_tlb() implementation is carefully commented so
> that people can understand where they should (and shouldn't) include
> this call.
Well, I will do that. MIPS is somewhat different from generic architectures:
older MIPS systems do not support hardware page-table walk, so software is
required to update the TLB entries.
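(A minimal sketch of the kind of no-op wrapper being requested -- an empty
generic fallback that MIPS overrides, as the v6 posting later does with
update_mmu_tlb; the placement and naming here are illustrative.)

/* include/asm-generic/pgtable.h: default is a no-op */
#ifndef __HAVE_ARCH_UPDATE_MMU_TLB
static inline void update_mmu_tlb(struct vm_area_struct *vma,
				  unsigned long address, pte_t *ptep)
{
}
#endif

/* arch/mips/include/asm/pgtable.h: MIPS refreshes its software-managed TLB */
#define __HAVE_ARCH_UPDATE_MMU_TLB
#define update_mmu_tlb	update_mmu_cache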

regards
bibo, mao

 



Re: [PATCH v4 3/4] mm/memory.c: Add memory read privilege on page fault handling

2020-05-20 Thread maobibo



On 05/20/2020 09:30 AM, Andrew Morton wrote:
> On Tue, 19 May 2020 18:03:29 +0800 Bibo Mao  wrote:
> 
>> Here add pte_sw_mkyoung function to make page readable on MIPS
>> platform during page fault handling. This patch improves page
>> fault latency about 10% on my MIPS machine with lmbench
>> lat_pagefault case.
>>
>> It is noop function on other arches, there is no negative
>> influence on those architectures.
>>
>> --- a/arch/mips/include/asm/pgtable.h
>> +++ b/arch/mips/include/asm/pgtable.h
>> @@ -414,6 +414,8 @@ static inline pte_t pte_mkyoung(pte_t pte)
>>  return pte;
>>  }
>>  
>> +#define pte_sw_mkyoung  pte_mkyoung
>> +
>>  #ifdef CONFIG_MIPS_HUGE_TLB_SUPPORT
>>  static inline int pte_huge(pte_t pte)   { return pte_val(pte) & 
>> _PAGE_HUGE; }
>>  
>> --- a/include/asm-generic/pgtable.h
>> +++ b/include/asm-generic/pgtable.h
>> @@ -227,6 +227,21 @@ static inline void ptep_set_wrprotect(struct mm_struct 
>> *mm, unsigned long addres
>>  }
>>  #endif
>>  
>> +/*
>> + * On some architectures hardware does not set page access bit when 
>> accessing
>> + * memory page, it is responsibilty of software setting this bit. It brings
>> + * out extra page fault penalty to track page access bit. For optimization 
>> page
>> + * access bit can be set during all page fault flow on these arches.
>> + * To be differentiate with macro pte_mkyoung, this macro is used on 
>> platforms
>> + * where software maintains page access bit.
>> + */
>> +#ifndef pte_sw_mkyoung
>> +static inline pte_t pte_sw_mkyoung(pte_t pte)
>> +{
>> +return pte;
>> +}
>> +#endif
> 
> Yup, that's neat enough.  Thanks for making this change.  It looks like
> all architectures include asm-generic/pgtable.h so that's fine.
> 
> It's conventional to add a
> 
> #define pte_sw_mkyoung pte_sw_mkyoung
> 
> immediately above the #endif there, so we can't try to implement
> pte_sw_mkyoung() twice if this header gets included twice.  But the
> header has #ifndef _ASM_GENERIC_PGTABLE_H around the whole thing so
> that should be OK.

Sure, will do, and thanks for your kind help and guidance.

 



Re: [PATCH v4 2/4] mm/memory.c: Update local TLB if PTE entry exists

2020-05-20 Thread maobibo



On 05/20/2020 09:26 AM, Andrew Morton wrote:
> On Tue, 19 May 2020 18:03:28 +0800 Bibo Mao  wrote:
> 
>> If two threads concurrently fault at the same address, the thread that
>> won the race updates the PTE and its local TLB. For now, the other
>> thread gives up, simply does nothing, and continues.
>>
>> It could happen that this second thread triggers another fault, whereby
>> it only updates its local TLB while handling the fault. Instead of
>> triggering another fault, let's directly update the local TLB of the
>> second thread.
>>
>> It is only useful to architectures where software can update TLB, it may
>> bring out some negative effect if update_mmu_cache is used for other
>> purpose also. It seldom happens where multiple threads access the same
>> page at the same time, so the negative effect is limited on other arches.
> 
> I'm still worried about the impact on other architectures.  The
> additional update_mmu_cache() calls won't occur only when multiple
> threads are racing against the same page, I think?  For example,
> insert_pfn() will do this when making a read-only page a writable one.
How about defining the ptep_set_access_flags function like this on the MIPS
system, the same as on the riscv platform?

static inline int ptep_set_access_flags(struct vm_area_struct *vma,
					unsigned long address, pte_t *ptep,
					pte_t entry, int dirty)
{
	if (!pte_same(*ptep, entry))
		set_pte_at(vma->vm_mm, address, ptep, entry);
	/*
	 * update_mmu_cache will unconditionally execute, handling both
	 * the case that the PTE changed and the spurious fault case.
	 */
	return true;
}

And the following pieces of code could then be kept unchanged (the hunks below
would no longer be needed), so the overall change would be smaller.
@@ -1770,8 +1770,8 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, 
unsigned long addr,
}
entry = pte_mkyoung(*pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-   if (ptep_set_access_flags(vma, addr, pte, entry, 1))
-   update_mmu_cache(vma, addr, pte);
+   ptep_set_access_flags(vma, addr, pte, entry, 1);
+   update_mmu_cache(vma, addr, pte);
}

@@ -2436,17 +2436,16 @@ static inline bool cow_user_page(struct page *dst, 
struct page *src,
entry = pte_mkyoung(vmf->orig_pte);
-   if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
-   update_mmu_cache(vma, addr, vmf->pte);
+   ptep_set_access_flags(vma, addr, vmf->pte, entry, 0);
+   update_mmu_cache(vma, addr, vmf->pte);
}
@@ -2618,8 +2618,8 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = pte_mkyoung(vmf->orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-   if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
-   update_mmu_cache(vma, vmf->address, vmf->pte);
+   ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1);
+   update_mmu_cache(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
 }



> 
> Would you have time to add some instrumentation into update_mmu_cache()
> (maybe a tracepoint) and see what effect this change has upon the
> frequency at which update_mmu_cache() is called for a selection of
> workloads?  And add this info to the changelog to set minds at ease?
OK, I will add some instrumentation data in the changelog.

 



Re: [PATCH v3 3/3] mm/memory.c: Add memory read privilege before filling PTE entry

2020-05-18 Thread maobibo



On 05/19/2020 04:57 AM, Andrew Morton wrote:
> On Mon, 18 May 2020 13:08:49 +0800 Bibo Mao  wrote:
> 
>> On mips platform, hw PTE entry valid bit is set in pte_mkyoung
>> function, it is used to set physical page with readable privilege.
> 
> pte_mkyoung() seems to be a strange place to set the pte's valid bit. 
> Why is it done there?  Can it be done within mips's mk_pte()?
On MIPS the hardware cannot set the PAGE_ACCESS bit when accessing a page;
software sets the PAGE_ACCESS software bit and the PAGE_VALID hardware bit
together during the page fault stage.

If mk_pte is called in the page fault path, it is OK to set both bits. If it
is called outside of page fault handling, PAGE_ACCESS would be set even though
there is no actual memory access.

> 
>> Here add pte_mkyoung function to make page readable on MIPS platform
>> during page fault handling. This patch improves page fault latency
>> about 10% on my MIPS machine with lmbench lat_pagefault case.
>>
>> ...
>>
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -2704,6 +2704,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
>>  }
>>  flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
>>  entry = mk_pte(new_page, vma->vm_page_prot);
>> +entry = pte_mkyoung(entry);
>>  entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> 
> What is the effect on non-mips machines?  If it's only "additional
> overhead" then it would be better to add a new pte_mkvalid() (or
> whatever) and arrange for it to be a no-op on all but mips?

How about adding a pte_sw_mkyoung-like function which is a no-op on all but
MIPS? This function would be used to set the PAGE_ACCESS bit and the
PAGE_VALID bit on the MIPS platform.

regards
bibo,mao

> 
>>  /*
>>   * Clear the pte entry and flush it first, before updating the
>> @@ -3378,6 +3379,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault 
>> *vmf)
>>  __SetPageUptodate(page);
>>  
>>  entry = mk_pte(page, vma->vm_page_prot);
>> +entry = pte_mkyoung(entry);
>>  if (vma->vm_flags & VM_WRITE)
>>  entry = pte_mkwrite(pte_mkdirty(entry));
>>  
>> @@ -3660,6 +3662,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct 
>> mem_cgroup *memcg,
>>  
>>  flush_icache_page(vma, page);
>>  entry = mk_pte(page, vma->vm_page_prot);
>> +entry = pte_mkyoung(entry);
>>  if (write)
>>  entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>>  /* copy-on-write page */
>> diff --git a/mm/mprotect.c b/mm/mprotect.c
>> index 494192ca..673f1cd 100644
>> --- a/mm/mprotect.c
>> +++ b/mm/mprotect.c
>> @@ -131,6 +131,8 @@ static unsigned long change_pte_range(struct 
>> vm_area_struct *vma, pmd_t *pmd,
>>  ptent = pte_clear_uffd_wp(ptent);
>>  }
>>  
>> +if (vma->vm_flags & VM_READ)
>> +ptent = pte_mkyoung(ptent);
>>  /* Avoid taking write faults for known dirty pages */
>>  if (dirty_accountable && pte_dirty(ptent) &&
>>  (pte_soft_dirty(ptent) ||



Re: mm/memory.c: Add update local tlb for smp race

2020-05-17 Thread maobibo



On 05/16/2020 05:34 PM, maobibo wrote:
> 
> 
> On 05/15/2020 09:50 PM, David Hildenbrand wrote:
>> On 14.05.20 08:50, Bibo Mao wrote:
>>> If there are two threads hitting page fault at the address, one
>>> thread updates pte entry and local tlb, the other thread can update
>>> local tlb also, rather than give up and let page fault happening
>>> again.
>>
>> Let me suggest
>>
>> "mm/memory: optimize concurrent page faults at same address
>>
>> If two threads concurrently fault at the same address, the thread that
>> won the race updates the PTE and its local TLB. For now, the other
>> thread gives up, simply does nothing, and continues.
>>
>> It could happen that this second thread triggers another fault, whereby
>> it only updates its local TLB while handling the fault. Instead of
>> triggering another fault, let's directly update the local TLB of the
>> second thread.
>> "
>>
>> If I got the intention of this patch correctly.
>>
>> Are there any performance numbers to support this patch?
>>
>> (I can't say too much about the correctness and/or usefulness of this patch)
> 
> yes, that is the situation. On MIPS platform software can update TLB,
> so update_mmu_cache is used here. This does not happen frequently, and with 
> the three series patches in later mail. I test lat_pagefault in lmbench, here 
> is is result:
> 
> with these three series patches, 
> # ./lat_pagefault  -N 10  /tmp/1 
> Pagefaults on /tmp/1: 1.4973 microseconds
> # ./lat_pagefault -P 4 -N 10  /tmp/1 
> Pagefaults on /tmp/1: 1.5716 microseconds
> 
> original version, without these three series patch
> #  ./lat_pagefault  -N 10  /tmp/1 
> Pagefaults on /tmp/1: 1.6489 microseconds
> # ./lat_pagefault -P 4 -N 10  /tmp/1
> Pagefaults on /tmp/1: 1.7214 microseconds
> 

I tested the three patches one by one; there is no obvious improvement in the
lat_pagefault case. I guess it seldom happens that multiple threads access
the same page at the same time.

The improvement comes from another modification, where pte_mkyoung is added
to grant readable privilege on MIPS systems.

regards
bibo, mao

>>>
>>> modified:   mm/memory.c
>>
>> This does not belong into a patch description.
> 
> well, I will modify the patch description.
> 
> regards
> bibo,mao
> 
> 
>>
>>
>>> Signed-off-by: Bibo Mao 
>>> ---
>>>  mm/memory.c | 30 ++
>>>  1 file changed, 22 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/mm/memory.c b/mm/memory.c
>>> index f703fe8..3a741ce 100644
>>> --- a/mm/memory.c
>>> +++ b/mm/memory.c
>>> @@ -2436,11 +2436,10 @@ static inline bool cow_user_page(struct page *dst, 
>>> struct page *src,
>>> if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
>>> /*
>>>  * Other thread has already handled the fault
>>> -* and we don't need to do anything. If it's
>>> -* not the case, the fault will be triggered
>>> -* again on the same address.
>>> +* and update local tlb only
>>>  */
>>> ret = false;
>>> +   update_mmu_cache(vma, addr, vmf->pte);
>>> goto pte_unlock;
>>> }
>>>  
>>> @@ -2463,8 +2462,9 @@ static inline bool cow_user_page(struct page *dst, 
>>> struct page *src,
>>> vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
>>> locked = true;
>>> if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
>>> -   /* The PTE changed under us. Retry page fault. */
>>> +   /* The PTE changed under us. update local tlb */
>>> ret = false;
>>> +   update_mmu_cache(vma, addr, vmf->pte);
>>> goto pte_unlock;
>>> }
>>>  
>>> @@ -2704,6 +2704,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
>>> }
>>> flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
>>> entry = mk_pte(new_page, vma->vm_page_prot);
>>> +   entry = pte_mkyoung(entry);
>>> entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>>> /*
>>>  * Clear the pte entry and flush it first, before updating the

Re: [PATCH 2/3] mm/memory.c: Update local TLB if PTE entry exists

2020-05-16 Thread maobibo



On 05/16/2020 04:40 AM, Andrew Morton wrote:
> On Fri, 15 May 2020 12:10:08 +0800 Bibo Mao  wrote:
> 
>> If there are two threads hitting page fault at the same page,
>> one thread updates PTE entry and local TLB, the other can
>> update local tlb also, rather than give up and do page fault
>> again.
>>
>> ...
>>
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -1770,8 +1770,8 @@ static vm_fault_t insert_pfn(struct vm_area_struct 
>> *vma, unsigned long addr,
>>  }
>>  entry = pte_mkyoung(*pte);
>>  entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>> -if (ptep_set_access_flags(vma, addr, pte, entry, 1))
>> -update_mmu_cache(vma, addr, pte);
>> +ptep_set_access_flags(vma, addr, pte, entry, 1);
>> +update_mmu_cache(vma, addr, pte);
> 
> Presumably these changes mean that other architectures will run
> update_mmu_cache() more frequently than they used to.  How much more
> frequently, and what will be the impact of this change?  (Please fully
> explain all this in the changelog).
> 
It is only useful on architectures where software can update the TLB. If
update_mmu_cache is used for some other purpose on an architecture, the extra
calls could have some impact; I will explain all of this in the changelog.
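
Roughly speaking (paraphrased, not verbatim kernel code), update_mmu_cache() has
two shapes: on architectures with a hardware page-table walker it is an empty
inline that the compiler discards, so extra call sites cost nothing there, while
on a software-refill TLB architecture such as MIPS it preloads the local TLB with
the new PTE, which is what makes the extra calls useful:

/* Shape on architectures with a hardware page-table walker (sketch):
 * the call compiles away, so extra call sites have no runtime cost. */
static inline void update_mmu_cache(struct vm_area_struct *vma,
				    unsigned long address, pte_t *ptep)
{
}

/* Shape on a software-refill TLB architecture such as MIPS (sketch;
 * only one of these definitions exists in any given build): write the
 * new PTE into the current CPU's TLB so the faulting access can
 * retire without taking another fault. */
static inline void update_mmu_cache(struct vm_area_struct *vma,
				    unsigned long address, pte_t *ptep)
{
	__update_tlb(vma, address, *ptep);
}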

>>  }
>>  goto out_unlock;
>>  }
>>
>> ...
>>
>> @@ -2463,7 +2462,8 @@ static inline bool cow_user_page(struct page *dst, 
>> struct page *src,
>>  vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
>>  locked = true;
>>  if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
>> -/* The PTE changed under us. Retry page fault. */
>> +/* The PTE changed under us, update local tlb */
>> +pdate_mmu_cache(vma, addr, vmf->pte);
> 
> Missing a 'u' there.  Which tells me this patch isn't the one which you
> tested!
> 
Sorry about that; I will refresh the patch and fix this obvious typo.

regards
bibo, mao
>>  ret = false;
>>  goto pte_unlock;
>>  }
>>
>> ...
>>



Re: mm/memory.c: Add update local tlb for smp race

2020-05-16 Thread maobibo



On 05/15/2020 09:50 PM, David Hildenbrand wrote:
> On 14.05.20 08:50, Bibo Mao wrote:
>> If there are two threads hitting page fault at the address, one
>> thread updates pte entry and local tlb, the other thread can update
>> local tlb also, rather than give up and let page fault happening
>> again.
> 
> Let me suggest
> 
> "mm/memory: optimize concurrent page faults at same address
> 
> If two threads concurrently fault at the same address, the thread that
> won the race updates the PTE and its local TLB. For now, the other
> thread gives up, simply does nothing, and continues.
> 
> It could happen that this second thread triggers another fault, whereby
> it only updates its local TLB while handling the fault. Instead of
> triggering another fault, let's directly update the local TLB of the
> second thread.
> "
> 
> If I got the intention of this patch correctly.
> 
> Are there any performance numbers to support this patch?
> 
> (I can't say too much about the correctness and/or usefulness of this patch)

Yes, that is the situation. On the MIPS platform software can update the TLB, so
update_mmu_cache is used here. This does not happen frequently. Together with the
three series patches in a later mail, I tested lat_pagefault in lmbench; here is
the result:

with these three series patches, 
# ./lat_pagefault  -N 10  /tmp/1 
Pagefaults on /tmp/1: 1.4973 microseconds
# ./lat_pagefault -P 4 -N 10  /tmp/1 
Pagefaults on /tmp/1: 1.5716 microseconds

original version, without these three series patch
#  ./lat_pagefault  -N 10  /tmp/1 
Pagefaults on /tmp/1: 1.6489 microseconds
# ./lat_pagefault -P 4 -N 10  /tmp/1
Pagefaults on /tmp/1: 1.7214 microseconds
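
For reference, that is roughly a 9% reduction in single-threaded page-fault
latency ((1.6489 - 1.4973) / 1.6489 is about 9.2%) and about 8.7% for the -P 4
run, assuming the runs are directly comparable.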

>>
>>  modified:   mm/memory.c
> 
> This does not belong into a patch description.

well, I will modify the patch description.

regards
bibo,mao


> 
> 
>> Signed-off-by: Bibo Mao 
>> ---
>>  mm/memory.c | 30 ++
>>  1 file changed, 22 insertions(+), 8 deletions(-)
>>
>> diff --git a/mm/memory.c b/mm/memory.c
>> index f703fe8..3a741ce 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -2436,11 +2436,10 @@ static inline bool cow_user_page(struct page *dst, 
>> struct page *src,
>>  if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
>>  /*
>>   * Other thread has already handled the fault
>> - * and we don't need to do anything. If it's
>> - * not the case, the fault will be triggered
>> - * again on the same address.
>> + * and update local tlb only
>>   */
>>  ret = false;
>> +update_mmu_cache(vma, addr, vmf->pte);
>>  goto pte_unlock;
>>  }
>>  
>> @@ -2463,8 +2462,9 @@ static inline bool cow_user_page(struct page *dst, 
>> struct page *src,
>>  vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
>>  locked = true;
>>  if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
>> -/* The PTE changed under us. Retry page fault. */
>> +/* The PTE changed under us. update local tlb */
>>  ret = false;
>> +update_mmu_cache(vma, addr, vmf->pte);
>>  goto pte_unlock;
>>  }
>>  
>> @@ -2704,6 +2704,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
>>  }
>>  flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
>>  entry = mk_pte(new_page, vma->vm_page_prot);
>> +entry = pte_mkyoung(entry);
>>  entry = maybe_mkwrite(pte_mkdirty(entry), vma);
>>  /*
>>   * Clear the pte entry and flush it first, before updating the
>> @@ -2752,6 +2753,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
>>  new_page = old_page;
>>  page_copied = 1;
>>  } else {
>> +update_mmu_cache(vma, vmf->address, vmf->pte);
>>  mem_cgroup_cancel_charge(new_page, memcg, false);
>>  }
>>  
>> @@ -2812,6 +2814,7 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
>>   * pte_offset_map_lock.
>>   */
>>  if (!pte_same(*vmf->pte, vmf->orig_pte)) {
>> +update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
>>  pte_unmap_unlock(vmf->pte, vmf->ptl);
>>  return VM_FAULT_NOPAGE;
>>  }
>> @@ -2936,6 +2939,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
>>  vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
>>  vmf->address, &vmf->ptl);
>>  if (!pte_same(*vmf->pte, vmf->orig_pte)) {
>> +update_mmu_cache(vma, vmf->address, vmf->pte);
>>  unlock_page(vmf->page);
>>  pte_unmap_unlock(vmf->pte, vmf->ptl);
>>  put_page(vmf->page);
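
As a side note, a minimal userspace sketch of the access pattern these hunks
target (a hypothetical demo, not part of the patch set; it exercises the pattern
but does not guarantee that the two faults actually race): two threads first-touch
the same freshly mapped page, and the thread that loses the PTE race is the one
that now just refreshes its local TLB instead of retrying the fault.

#include <pthread.h>
#include <stdio.h>
#include <sys/mman.h>

static volatile char *page;

static void *touch(void *arg)
{
	/* Both threads read the same byte; each read can fault if the
	 * page is not yet present in that CPU's TLB/PTE view. */
	char c = page[0];
	(void)c;
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	page = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (page == MAP_FAILED)
		return 1;

	pthread_create(&a, NULL, touch, NULL);
	pthread_create(&b, NULL, touch, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("both threads touched the page\n");
	return 0;
}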

Re: [PATCH] MIPS: update tlb even if pte entry has no change

2020-05-14 Thread maobibo



On 05/14/2020 05:37 PM, Sergei Shtylyov wrote:
> On 14.05.2020 12:35, Sergei Shtylyov wrote:
> 
>>> From: bibo mao 
>>>
>>> If there are two threads reading the same memory and tlb miss happens,
>>> one thread fills pte entry, the other reads new pte value during page fault
>>> handling. PTE value may be updated before page faul, so the process need
>>
>> Fault.
> 
>And "needs".
> 
>>> need update tlb still.
> 
>Oh, and one "need" is enough. :-)

Thanks for reviewing my patch; I will fix this typo in the next version.

Best Regards
bibo, mao

> 
>>> Also this patch define flush_tlb_fix_spurious_fault as empty, since it not
>>> necessary to flush the page for all CPUs
>>>
>>> Signed-off-by: Bibo Mao 
>> [...]
> 
> MBR, Sergei



Re: [PATCH] MIPS: update tlb even if pte entry has no change

2020-05-14 Thread maobibo



On 05/14/2020 05:33 PM, Mike Rapoport wrote:
> On Thu, May 14, 2020 at 10:17:57AM +0800, Bibo Mao wrote:
>> From: bibo mao 
>>
>> If there are two threads reading the same memory and tlb miss happens,
>> one thread fills pte entry, the other reads new pte value during page fault
>> handling. PTE value may be updated before page faul, so the process need
>> need update tlb still.
>>
>> Also this patch define flush_tlb_fix_spurious_fault as empty, since it not
>> necessary to flush the page for all CPUs
> 
> I'm not familiar enough with MIPS TLB workings, but it seems that adding
> a MIPS version of ptep_set_access_flags() and relaxing
> flush_tlb_fix_spurious_fault() are two separate patches.

Well, I will split it into two patches.
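
For background, here is a rough sketch of the generic fault-path consumer of
ptep_set_access_flags() (paraphrased from memory, not verbatim mm/memory.c):
update_mmu_cache() is only called when the PTE actually changed, which is exactly
the gap the MIPS override fills by refreshing the local TLB even when nothing
changed.

/* Paraphrased sketch of the tail of handle_pte_fault(): */
entry = pte_mkyoung(vmf->orig_pte);
if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
			  vmf->flags & FAULT_FLAG_WRITE)) {
	/* PTE changed: preload/refresh the local TLB. */
	update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
} else {
	/* PTE unchanged: the generic code treats the fault as spurious
	 * and at most flushes the local TLB entry for write faults. */
	if (vmf->flags & FAULT_FLAG_WRITE)
		flush_tlb_fix_spurious_fault(vmf->vma, vmf->address);
}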


regards
bibo, mao
> 
>> Signed-off-by: Bibo Mao 
>> ---
>>  arch/mips/include/asm/pgtable.h | 20 
>>  1 file changed, 20 insertions(+)
>>
>> diff --git a/arch/mips/include/asm/pgtable.h 
>> b/arch/mips/include/asm/pgtable.h
>> index aab0ec1..d0a4940 100644
>> --- a/arch/mips/include/asm/pgtable.h
>> +++ b/arch/mips/include/asm/pgtable.h
>> @@ -635,6 +635,26 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd)
>>  return pmd;
>>  }
>>  
>> +#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0)
>> +
>> +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
>> +int ptep_set_access_flags(struct vm_area_struct *vma,
>> +unsigned long address, pte_t *ptep,
>> +pte_t entry, int dirty)
>> +{
>> +int changed = !pte_same(*ptep, entry);
>> +
>> +if (changed)
>> +set_pte_at(vma->vm_mm, address, ptep, entry);
>> +else
>> +/* update tlb with latest pte entry still, tlb entry is old
>> + * since there is page fault
>> + */
>> +update_mmu_cache(vma, address, ptep);
>> +
>> +return changed;
>> +}
>> +
>>  /*
>>   * The generic version pmdp_huge_get_and_clear uses a version of 
>> pmd_clear() with a
>>   * different prototype.
>> -- 
>> 1.8.3.1
>>
>