[PATCH][resend] perf x86 kvm-stat: support to analyze kvm msr

2021-04-15 Thread Li RongQing
From: Lei Zhao 

usage:
- kvm stat
  run a command and gather performance counter statistics

- show the result:
  perf kvm stat report --event=msr

See the msr events:

Analyze events for all VMs, all VCPUs:

MSR Access Samples  Samples% Time%  Min Time Max Time  Avg time

  0x6e0:W   67007  98.17%   98.31%  0.59us   10.69us  0.90us ( +-  0.10% )
  0x830:W1186   1.74%1.60%  0.53us  108.34us  0.82us ( +- 11.02% )
   0x3b:R  66   0.10%0.09%  0.56us1.26us  0.80us ( +-  3.24% )

Total Samples:68259, Total events handled time:61150.95us.

Signed-off-by: Li RongQing 
Signed-off-by: Lei Zhao 
---
 tools/perf/arch/x86/util/kvm-stat.c |   46 +++
 1 files changed, 46 insertions(+), 0 deletions(-)

diff --git a/tools/perf/arch/x86/util/kvm-stat.c 
b/tools/perf/arch/x86/util/kvm-stat.c
index 0729204..c5dd54f 100644
--- a/tools/perf/arch/x86/util/kvm-stat.c
+++ b/tools/perf/arch/x86/util/kvm-stat.c
@@ -133,11 +133,56 @@ static void ioport_event_decode_key(struct perf_kvm_stat 
*kvm __maybe_unused,
.name = "IO Port Access"
 };
 
+ /* The time of emulation msr is from kvm_msr to kvm_entry. */
+static void msr_event_get_key(struct evsel *evsel,
+struct perf_sample *sample,
+struct event_key *key)
+{
+   key->key  = evsel__intval(evsel, sample, "ecx");
+   key->info = evsel__intval(evsel, sample, "write");
+}
+
+static bool msr_event_begin(struct evsel *evsel,
+  struct perf_sample *sample,
+  struct event_key *key)
+{
+   if (!strcmp(evsel->name, "kvm:kvm_msr")) {
+   msr_event_get_key(evsel, sample, key);
+   return true;
+   }
+
+   return false;
+}
+
+static bool msr_event_end(struct evsel *evsel,
+struct perf_sample *sample __maybe_unused,
+struct event_key *key __maybe_unused)
+{
+   return kvm_entry_event(evsel);
+}
+
+static void msr_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
+   struct event_key *key,
+   char *decode)
+{
+   scnprintf(decode, decode_str_len, "%#llx:%s",
+ (unsigned long long)key->key,
+ key->info ? "W" : "R");
+}
+
+static struct kvm_events_ops msr_events = {
+   .is_begin_event = msr_event_begin,
+   .is_end_event = msr_event_end,
+   .decode_key = msr_event_decode_key,
+   .name = "MSR Access"
+};
+
 const char *kvm_events_tp[] = {
"kvm:kvm_entry",
"kvm:kvm_exit",
"kvm:kvm_mmio",
"kvm:kvm_pio",
+   "kvm:kvm_msr",
NULL,
 };
 
@@ -145,6 +190,7 @@ struct kvm_reg_events_ops kvm_reg_events_ops[] = {
	{ .name = "vmexit", .ops = &exit_events },
	{ .name = "mmio", .ops = &mmio_events },
	{ .name = "ioport", .ops = &ioport_events },
+	{ .name = "msr", .ops = &msr_events },
{ NULL, NULL },
 };
 
-- 
1.7.1



[PATCH] perf x86 kvm-stat: support to analyze kvm msr

2021-03-24 Thread Li RongQing
From: Lei Zhao 

usage:
- kvm stat
  run a command and gather performance counter statistics

- show the result:
  perf kvm stat report --event=msr

See the msr events:

Analyze events for all VMs, all VCPUs:

MSR Access Samples  Samples% Time%  Min Time Max Time  Avg time

  0x6e0:W   67007  98.17%   98.31%  0.59us   10.69us  0.90us ( +-  0.10% )
  0x830:W1186   1.74%1.60%  0.53us  108.34us  0.82us ( +- 11.02% )
   0x3b:R  66   0.10%0.09%  0.56us1.26us  0.80us ( +-  3.24% )

Total Samples:68259, Total events handled time:61150.95us.

Signed-off-by: Li RongQing 
Signed-off-by: Lei Zhao 
---
 tools/perf/arch/x86/util/kvm-stat.c | 46 +
 1 file changed, 46 insertions(+)

diff --git a/tools/perf/arch/x86/util/kvm-stat.c 
b/tools/perf/arch/x86/util/kvm-stat.c
index 072920475b65..c5dd54f6ef5e 100644
--- a/tools/perf/arch/x86/util/kvm-stat.c
+++ b/tools/perf/arch/x86/util/kvm-stat.c
@@ -133,11 +133,56 @@ static struct kvm_events_ops ioport_events = {
.name = "IO Port Access"
 };
 
+ /* The time of emulation msr is from kvm_msr to kvm_entry. */
+static void msr_event_get_key(struct evsel *evsel,
+struct perf_sample *sample,
+struct event_key *key)
+{
+   key->key  = evsel__intval(evsel, sample, "ecx");
+   key->info = evsel__intval(evsel, sample, "write");
+}
+
+static bool msr_event_begin(struct evsel *evsel,
+  struct perf_sample *sample,
+  struct event_key *key)
+{
+   if (!strcmp(evsel->name, "kvm:kvm_msr")) {
+   msr_event_get_key(evsel, sample, key);
+   return true;
+   }
+
+   return false;
+}
+
+static bool msr_event_end(struct evsel *evsel,
+struct perf_sample *sample __maybe_unused,
+struct event_key *key __maybe_unused)
+{
+   return kvm_entry_event(evsel);
+}
+
+static void msr_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
+   struct event_key *key,
+   char *decode)
+{
+   scnprintf(decode, decode_str_len, "%#llx:%s",
+ (unsigned long long)key->key,
+ key->info ? "W" : "R");
+}
+
+static struct kvm_events_ops msr_events = {
+   .is_begin_event = msr_event_begin,
+   .is_end_event = msr_event_end,
+   .decode_key = msr_event_decode_key,
+   .name = "MSR Access"
+};
+
 const char *kvm_events_tp[] = {
"kvm:kvm_entry",
"kvm:kvm_exit",
"kvm:kvm_mmio",
"kvm:kvm_pio",
+   "kvm:kvm_msr",
NULL,
 };
 
@@ -145,6 +190,7 @@ struct kvm_reg_events_ops kvm_reg_events_ops[] = {
	{ .name = "vmexit", .ops = &exit_events },
	{ .name = "mmio", .ops = &mmio_events },
	{ .name = "ioport", .ops = &ioport_events },
+	{ .name = "msr", .ops = &msr_events },
{ NULL, NULL },
 };
 
-- 
2.17.3



[PATCH] alarmtimer: Do not mess with an enqueued hrtimer

2021-01-06 Thread Li RongQing
When an hrtimer is already enqueued, its expiry time must not be
changed; otherwise this corrupts the ordering of the timerqueue
RB tree. If another hrtimer is enqueued before this hrtimer is
restarted, the whole RB tree is completely hosed.

Fixes: 6cffe00f7d4e ("alarmtimer: Add functions for timerfd support")
Signed-off-by: Li RongQing 
---
 kernel/time/alarmtimer.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index f4ace1bf8382..3b34995ab8d2 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -388,8 +388,7 @@ void alarm_restart(struct alarm *alarm)
unsigned long flags;
 
	spin_lock_irqsave(&base->lock, flags);
-	hrtimer_set_expires(&alarm->timer, alarm->node.expires);
-	hrtimer_restart(&alarm->timer);
+	hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
	alarmtimer_enqueue(base, alarm);
	spin_unlock_irqrestore(&base->lock, flags);
 }
-- 
2.17.3



[PATCH] random: get rid of dead codes from credit_entropy_bits

2020-06-28 Thread Li RongQing
After commit 90ea1c6436d2 ("random: remove the blocking pool"),
has_initialized is always zero, and the 'initialized' field of
struct entropy_store is no longer used.

Signed-off-by: Li RongQing 
---
 drivers/char/random.c | 8 +---
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/drivers/char/random.c b/drivers/char/random.c
index a7cf6aa65908..288cc4464a69 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -500,7 +500,6 @@ struct entropy_store {
unsigned short add_ptr;
unsigned short input_rotate;
int entropy_count;
-   unsigned int initialized:1;
unsigned int last_data_init:1;
__u8 last_data[EXTRACT_SIZE];
 };
@@ -660,7 +659,7 @@ static void process_random_ready_list(void)
  */
 static void credit_entropy_bits(struct entropy_store *r, int nbits)
 {
-   int entropy_count, orig, has_initialized = 0;
+   int entropy_count, orig;
const int pool_size = r->poolinfo->poolfracbits;
int nfrac = nbits << ENTROPY_SHIFT;
 
@@ -717,11 +716,6 @@ static void credit_entropy_bits(struct entropy_store *r, 
int nbits)
	if (cmpxchg(&r->entropy_count, orig, entropy_count) != orig)
		goto retry;
 
-	if (has_initialized) {
-		r->initialized = 1;
-		kill_fasync(&fasync, SIGIO, POLL_IN);
-	}
-
trace_credit_entropy_bits(r->name, nbits,
  entropy_count >> ENTROPY_SHIFT, _RET_IP_);
 
-- 
2.16.2



[PATCH][v7] KVM: X86: support APERF/MPERF registers

2020-06-08 Thread Li RongQing
Guest kernel reports a fixed cpu frequency in /proc/cpuinfo,
this is confused to user when turbo is enable, and aperf/mperf
can be used to show current cpu frequency after 7d5905dc14a
"(x86 / CPU: Always show current CPU frequency in /proc/cpuinfo)"
so guest should support aperf/mperf capability

This patch implements aperf/mperf by three mode: none, software
emulation, and pass-through

None: default mode, guest does not support aperf/mperf

Software emulation: the period of aperf/mperf in guest mode are
accumulated as emulated value

Pass-though: it is only suitable for pinned vcpu

And a per-VM capability is added to configure aperfmperf mode

Signed-off-by: Li RongQing 
Signed-off-by: Chai Wen 
Signed-off-by: Jia Lina 
---
diff v6:
drop the unneed check from kvm_update_cpuid and __do_cpuid_func
add the validation check in kvm_vm_ioctl_enable_cap
thank for Jim Mattson,  Paolo Bonzini and Xiaoyao Li

diff v5:
return error if guest is configured with aperf/mperf, but host cpu has not

diff v4:
fix maybe-uninitialized warning

diff v3:
fix interception of MSR_IA32_APERF/MPERF in svm
thanks for wei.huang2 

diff v2:
support aperfmperf pass though
move common codes to kvm_get_msr_common
thanks for Xiaoyao Li and Peter Zijlstra

diff v1:
1. support AMD, but not test
2. support per-vm capability to enable
Documentation/virt/kvm/api.rst  | 16 
 arch/x86/include/asm/kvm_host.h | 11 
 arch/x86/kvm/svm/svm.c  |  8 ++
 arch/x86/kvm/vmx/vmx.c  |  6 +
 arch/x86/kvm/x86.c  | 56 +
 arch/x86/kvm/x86.h  | 15 +++
 include/uapi/linux/kvm.h|  1 +
 7 files changed, 113 insertions(+)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 426f94582b7a..ae30ac02a771 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6150,3 +6150,19 @@ KVM can therefore start protected VMs.
 This capability governs the KVM_S390_PV_COMMAND ioctl and the
 KVM_MP_STATE_LOAD MP_STATE. KVM_SET_MP_STATE can fail for protected
 guests when the state change is invalid.
+
+8.23 KVM_CAP_APERFMPERF
+
+
+:Architectures: x86
+:Parameters: args[0] is aperfmperf mode;
+ 0 for not support, it is default mode
+ 1 for software emulation
+ 2 for pass-through which is only suitable for pinned vcpu
+:Returns: 0 on success, -EINVAL when args[0] contains invalid,
+   -EBUSY if vcpus has been created
+
+Enabling this capability on a VM provides guest with aperf/mperf
+register, which are used to get cpu running frequency currently
+
+Do not enable KVM_CAP_APERFMPERF if host does not support aperf/mperf
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1da5858501ca..7d1d3668c4f1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -829,6 +829,9 @@ struct kvm_vcpu_arch {
 
/* AMD MSRC001_0015 Hardware Configuration */
u64 msr_hwcr;
+
+   u64 v_mperf;
+   u64 v_aperf;
 };
 
 struct kvm_lpage_info {
@@ -907,6 +910,12 @@ enum kvm_irqchip_mode {
KVM_IRQCHIP_SPLIT,/* created with KVM_CAP_SPLIT_IRQCHIP */
 };
 
+enum kvm_aperfmperf_mode {
+   KVM_APERFMPERF_NONE,
+   KVM_APERFMPERF_SOFT,  /* software emulate aperfmperf */
+   KVM_APERFMPERF_PT,/* pass-through aperfmperf to guest */
+};
+
 #define APICV_INHIBIT_REASON_DISABLE0
 #define APICV_INHIBIT_REASON_HYPERV 1
 #define APICV_INHIBIT_REASON_NESTED 2
@@ -1004,6 +1013,8 @@ struct kvm_arch {
 
struct kvm_pmu_event_filter *pmu_event_filter;
struct task_struct *nx_lpage_recovery_thread;
+
+   enum kvm_aperfmperf_mode aperfmperf_mode;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 9e333b91ff78..0db7d866e09f 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1198,6 +1198,14 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
svm->msrpm = page_address(msrpm_pages);
svm_vcpu_init_msrpm(svm->msrpm);
 
+   if (guest_aperfmperf_soft(vcpu->kvm)) {
+   set_msr_interception(svm->msrpm, MSR_IA32_MPERF, 0, 0);
+   set_msr_interception(svm->msrpm, MSR_IA32_APERF, 0, 0);
+   } else if (guest_aperfmperf_pt(vcpu->kvm)) {
+   set_msr_interception(svm->msrpm, MSR_IA32_MPERF, 1, 0);
+   set_msr_interception(svm->msrpm, MSR_IA32_APERF, 1, 0);
+   }
+
svm->nested.msrpm = page_address(nested_msrpm_pages);
svm_vcpu_init_msrpm(svm->nested.msrpm);
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 170cc76a581f..952e3728ca86 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6914,6 +6914,12 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
vmx_disable_intercept_for_msr(msr_bi

答复: [PATCH][v6] KVM: X86: support APERF/MPERF registers

2020-06-06 Thread Li,Rongqing


> -邮件原件-
> 发件人: Paolo Bonzini [mailto:pbonz...@redhat.com]
> 发送时间: 2020年6月6日 1:22
> 收件人: Jim Mattson 
> 抄送: Xiaoyao Li ; Li,Rongqing ;
> LKML ; kvm list ; the
> arch/x86 maintainers ; H . Peter Anvin ;
> Borislav Petkov ; Ingo Molnar ; Thomas
> Gleixner ; Wanpeng Li ; Vitaly
> Kuznetsov ; Sean Christopherson
> ; wei.hua...@amd.com
> 主题: Re: [PATCH][v6] KVM: X86: support APERF/MPERF registers
> 
> On 05/06/20 19:16, Jim Mattson wrote:
> >>>> @@ -4930,6 +4939,11 @@ int kvm_vm_ioctl_enable_cap(struct kvm
> *kvm,
> >>>>   kvm->arch.exception_payload_enabled = cap->args[0];
> >>>>   r = 0;
> >>>>   break;
> >>>> +case KVM_CAP_APERFMPERF:
> >>>> +kvm->arch.aperfmperf_mode =
> >>>> +boot_cpu_has(X86_FEATURE_APERFMPERF) ?
> cap->args[0] :
> >>>> + 0;
> >>> Shouldn't check whether cap->args[0] is a valid value?
> >> Yes, only valid values should be allowed.
> >>
> >> Also, it should fail with -EINVAL if the host does not have
> >> X86_FEATURE_APERFMPERF.
> > Should enabling/disabling this capability be disallowed once vCPUs
> > have been created?
> >
> 
> That's a good idea, yes.
> 
> Paolo


Thank you all, I will send a new version

-Li


答复: 答复: [PATCH][v6] KVM: X86: support APERF/MPERF registers

2020-06-05 Thread Li,Rongqing


> -邮件原件-
> 发件人: Like Xu [mailto:like...@linux.intel.com]
> 发送时间: 2020年6月5日 13:29
> 收件人: Li,Rongqing ; like...@intel.com
> 抄送: linux-kernel@vger.kernel.org; k...@vger.kernel.org; x...@kernel.org;
> h...@zytor.com; b...@alien8.de; mi...@redhat.com; t...@linutronix.de;
> jmatt...@google.com; wanpen...@tencent.com; vkuzn...@redhat.com;
> sean.j.christopher...@intel.com; pbonz...@redhat.com; xiaoyao...@intel.com;
> wei.hua...@amd.com
> 主题: Re: 答复: [PATCH][v6] KVM: X86: support APERF/MPERF registers
> 
> On 2020/6/5 12:23, Li,Rongqing wrote:
> >
> >
> >> -邮件原件-
> >> 发件人: Xu, Like [mailto:like...@intel.com]
> >> 发送时间: 2020年6月5日 10:32
> >> 收件人: Li,Rongqing 
> >> 抄送: linux-kernel@vger.kernel.org; k...@vger.kernel.org;
> >> x...@kernel.org; h...@zytor.com; b...@alien8.de; mi...@redhat.com;
> >> t...@linutronix.de; jmatt...@google.com; wanpen...@tencent.com;
> >> vkuzn...@redhat.com; sean.j.christopher...@intel.com;
> >> pbonz...@redhat.com; xiaoyao...@intel.com; wei.hua...@amd.com
> >> 主题: Re: [PATCH][v6] KVM: X86: support APERF/MPERF registers
> >>
> >> Hi RongQing,
> >>
> >> On 2020/6/5 9:44, Li RongQing wrote:
> >>> Guest kernel reports a fixed cpu frequency in /proc/cpuinfo, this is
> >>> confused to user when turbo is enable, and aperf/mperf can be used
> >>> to show current cpu frequency after 7d5905dc14a
> >>> "(x86 / CPU: Always show current CPU frequency in /proc/cpuinfo)"
> >>> so guest should support aperf/mperf capability
> >>>
> >>> This patch implements aperf/mperf by three mode: none, software
> >>> emulation, and pass-through
> >>>
> >>> None: default mode, guest does not support aperf/mperf
> >> s/None/Note
> >>>
> >>> Software emulation: the period of aperf/mperf in guest mode are
> >>> accumulated as emulated value
> >>>
> >>> Pass-though: it is only suitable for KVM_HINTS_REALTIME, Because
> >>> that hint guarantees we have a 1:1 vCPU:CPU binding and guaranteed
> >>> no over-commit.
> >> The flag "KVM_HINTS_REALTIME 0" (in the
> >> Documentation/virt/kvm/cpuid.rst) is claimed as "guest checks this
> >> feature bit to determine that vCPUs are never preempted for an unlimited
> time allowing optimizations".
> >>
> >> I couldn't see its relationship with "1:1 vCPU: pCPU binding".
> >> The patch doesn't check this flag as well for your pass-through purpose.
> >>
> >> Thanks,
> >> Like Xu
> >
> >
> > I think this is user space jobs to bind HINT_REALTIME and mperf passthrough,
> KVM just do what userspace wants.
> >
> 
> That's fine for user space to bind HINT_REALTIME and mperf passthrough,
> But I was asking why HINT_REALTIME means "1:1 vCPU: pCPU binding".
> 
> As you said, "Pass-though: it is only suitable for KVM_HINTS_REALTIME", which
> means, KVM needs to make sure the kvm->arch.aperfmperf_mode value could
> "only" be set to KVM_APERFMPERF_PT when the check
> kvm_para_has_hint(KVM_HINTS_REALTIME) is passed.
> 

pining vcpu can ensure that guest get correct mperf/aperf, but a user
has the choice to not pin, at that condition, do not think it is bug, this 
wants to say

> Specifically, the KVM_HINTS_REALTIME is a per-kvm capability while the
> kvm_aperfmperf_mode is a per-vm capability. It's unresolved.
> 

Do you have any solution?

-Rongqing

> KVM doesn't always do what userspace wants especially you're trying to
> expose some features about power and thermal management in the
> virtualization context.
> 
> > and this gives user space a possibility, guest has passthrough
> > mperfaperf without HINT_REALTIME, guest can get coarse cpu frequency
> > without performance effect if guest can endure error frequency
> > occasionally
> >
> 
> 
> >
> > -Li
> >



答复: [PATCH][v6] KVM: X86: support APERF/MPERF registers

2020-06-04 Thread Li,Rongqing


> -邮件原件-
> 发件人: Xu, Like [mailto:like...@intel.com]
> 发送时间: 2020年6月5日 10:32
> 收件人: Li,Rongqing 
> 抄送: linux-kernel@vger.kernel.org; k...@vger.kernel.org; x...@kernel.org;
> h...@zytor.com; b...@alien8.de; mi...@redhat.com; t...@linutronix.de;
> jmatt...@google.com; wanpen...@tencent.com; vkuzn...@redhat.com;
> sean.j.christopher...@intel.com; pbonz...@redhat.com; xiaoyao...@intel.com;
> wei.hua...@amd.com
> 主题: Re: [PATCH][v6] KVM: X86: support APERF/MPERF registers
> 
> Hi RongQing,
> 
> On 2020/6/5 9:44, Li RongQing wrote:
> > Guest kernel reports a fixed cpu frequency in /proc/cpuinfo, this is
> > confused to user when turbo is enable, and aperf/mperf can be used to
> > show current cpu frequency after 7d5905dc14a
> > "(x86 / CPU: Always show current CPU frequency in /proc/cpuinfo)"
> > so guest should support aperf/mperf capability
> >
> > This patch implements aperf/mperf by three mode: none, software
> > emulation, and pass-through
> >
> > None: default mode, guest does not support aperf/mperf
> s/None/Note
> >
> > Software emulation: the period of aperf/mperf in guest mode are
> > accumulated as emulated value
> >
> > Pass-though: it is only suitable for KVM_HINTS_REALTIME, Because that
> > hint guarantees we have a 1:1 vCPU:CPU binding and guaranteed no
> > over-commit.
> The flag "KVM_HINTS_REALTIME 0" (in the Documentation/virt/kvm/cpuid.rst)
> is claimed as "guest checks this feature bit to determine that vCPUs are never
> preempted for an unlimited time allowing optimizations".
> 
> I couldn't see its relationship with "1:1 vCPU: pCPU binding".
> The patch doesn't check this flag as well for your pass-through purpose.
> 
> Thanks,
> Like Xu


I think this is user space jobs to bind HINT_REALTIME and mperf passthrough, 
KVM just do what userspace wants.

and this gives user space a possibility, guest has passthrough mperfaperf 
without HINT_REALTIME, guest can get coarse cpu frequency without performance 
effect if guest can endure error frequency occasionally


-Li 



[PATCH][v6] KVM: X86: support APERF/MPERF registers

2020-06-04 Thread Li RongQing
Guest kernel reports a fixed cpu frequency in /proc/cpuinfo,
this is confused to user when turbo is enable, and aperf/mperf
can be used to show current cpu frequency after 7d5905dc14a
"(x86 / CPU: Always show current CPU frequency in /proc/cpuinfo)"
so guest should support aperf/mperf capability

This patch implements aperf/mperf by three mode: none, software
emulation, and pass-through

None: default mode, guest does not support aperf/mperf

Software emulation: the period of aperf/mperf in guest mode are
accumulated as emulated value

Pass-though: it is only suitable for KVM_HINTS_REALTIME, Because
that hint guarantees we have a 1:1 vCPU:CPU binding and guaranteed
no over-commit.

And a per-VM capability is added to configure aperfmperf mode

Signed-off-by: Li RongQing 
Signed-off-by: Chai Wen 
Signed-off-by: Jia Lina 
---
diff v5:
return error if guest is configured with mperf/aperf, but host cpu has not

diff v4:
fix maybe-uninitialized warning

diff v3:
fix interception of MSR_IA32_MPERF/APERF in svm

diff v2:
support aperfmperf pass though
move common codes to kvm_get_msr_common

diff v1:
1. support AMD, but not test
2. support per-vm capability to enable


 Documentation/virt/kvm/api.rst  | 10 ++
 arch/x86/include/asm/kvm_host.h | 11 +++
 arch/x86/kvm/cpuid.c| 15 ++-
 arch/x86/kvm/svm/svm.c  |  8 
 arch/x86/kvm/vmx/vmx.c  |  6 ++
 arch/x86/kvm/x86.c  | 42 +
 arch/x86/kvm/x86.h  | 15 +++
 include/uapi/linux/kvm.h|  1 +
 8 files changed, 107 insertions(+), 1 deletion(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index d871dacb984e..f854f4da6fd8 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6126,3 +6126,13 @@ KVM can therefore start protected VMs.
 This capability governs the KVM_S390_PV_COMMAND ioctl and the
 KVM_MP_STATE_LOAD MP_STATE. KVM_SET_MP_STATE can fail for protected
 guests when the state change is invalid.
+
+8.23 KVM_CAP_APERFMPERF
+
+
+:Architectures: x86
+:Parameters: args[0] is aperfmperf mode;
+ 0 for not support, 1 for software emulation, 2 for pass-through
+:Returns: 0 on success; -1 on error
+
+This capability indicates that KVM supports APERF and MPERF MSR registers
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fd78bd44b2d6..14643f8af9c4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -824,6 +824,9 @@ struct kvm_vcpu_arch {
 
/* AMD MSRC001_0015 Hardware Configuration */
u64 msr_hwcr;
+
+   u64 v_mperf;
+   u64 v_aperf;
 };
 
 struct kvm_lpage_info {
@@ -889,6 +892,12 @@ enum kvm_irqchip_mode {
KVM_IRQCHIP_SPLIT,/* created with KVM_CAP_SPLIT_IRQCHIP */
 };
 
+enum kvm_aperfmperf_mode {
+   KVM_APERFMPERF_NONE,
+   KVM_APERFMPERF_SOFT,  /* software emulate aperfmperf */
+   KVM_APERFMPERF_PT,/* pass-through aperfmperf to guest */
+};
+
 #define APICV_INHIBIT_REASON_DISABLE0
 #define APICV_INHIBIT_REASON_HYPERV 1
 #define APICV_INHIBIT_REASON_NESTED 2
@@ -986,6 +995,8 @@ struct kvm_arch {
 
struct kvm_pmu_event_filter *pmu_event_filter;
struct task_struct *nx_lpage_recovery_thread;
+
+   enum kvm_aperfmperf_mode aperfmperf_mode;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index cd708b0b460a..80f18b29a845 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -122,6 +122,16 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
   MSR_IA32_MISC_ENABLE_MWAIT);
}
 
+   best = kvm_find_cpuid_entry(vcpu, 6, 0);
+   if (best) {
+   if (guest_has_aperfmperf(vcpu->kvm)) {
+   if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
+   return -EINVAL;
+   best->ecx |= 1;
+   } else {
+   best->ecx &= ~1;
+   }
+   }
/* Note, maxphyaddr must be updated before tdp_level. */
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
vcpu->arch.tdp_level = kvm_x86_ops.get_tdp_level(vcpu);
@@ -557,7 +567,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array 
*array, u32 function)
case 6: /* Thermal management */
entry->eax = 0x4; /* allow ARAT */
entry->ebx = 0;
-   entry->ecx = 0;
+   if (boot_cpu_has(X86_FEATURE_APERFMPERF))
+   entry->ecx = 0x1;
+   else
+   entry->ecx = 0x0;
entry->edx = 0;
break;
/* function 7 has additional index. */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e9c0fb68387d..1d38fe3afc0d

答复: 答复: [PATCH][v5] KVM: X86: support APERF/MPERF registers

2020-06-04 Thread Li,Rongqing
> IMO, If we really want to ensure the correctness of userspace provided CPUID
> settings, we need to return ERROR to userspace instead of fixing it siliently.
> 

Ok , I will make it return a error

Thanks

-Li


> - Xiaoyao


答复: [PATCH][v5] KVM: X86: support APERF/MPERF registers

2020-05-30 Thread Li,Rongqing


> -邮件原件-
> 发件人: Xiaoyao Li [mailto:xiaoyao...@intel.com]
> 发送时间: 2020年5月30日 18:40
> 收件人: Li,Rongqing ; linux-kernel@vger.kernel.org;
> k...@vger.kernel.org; x...@kernel.org; h...@zytor.com; b...@alien8.de;
> mi...@redhat.com; t...@linutronix.de; jmatt...@google.com;
> wanpen...@tencent.com; vkuzn...@redhat.com;
> sean.j.christopher...@intel.com; pbonz...@redhat.com;
> wei.hua...@amd.com
> 主题: Re: [PATCH][v5] KVM: X86: support APERF/MPERF registers
> 
> On 5/30/2020 12:35 PM, Li RongQing wrote:
> > Guest kernel reports a fixed cpu frequency in /proc/cpuinfo, this is
> > confused to user when turbo is enable, and aperf/mperf can be used to
> > show current cpu frequency after 7d5905dc14a
> > "(x86 / CPU: Always show current CPU frequency in /proc/cpuinfo)"
> > so guest should support aperf/mperf capability
> >
> > This patch implements aperf/mperf by three mode: none, software
> > emulation, and pass-through
> >
> > None: default mode, guest does not support aperf/mperf
> >
> > Software emulation: the period of aperf/mperf in guest mode are
> > accumulated as emulated value
> >
> > Pass-though: it is only suitable for KVM_HINTS_REALTIME, Because that
> > hint guarantees we have a 1:1 vCPU:CPU binding and guaranteed no
> > over-commit.
> >
> > And a per-VM capability is added to configure aperfmperf mode
> >
> 
> [...]
> 
> > diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index
> > cd708b0b460a..c960dda4251b 100644
> > --- a/arch/x86/kvm/cpuid.c
> > +++ b/arch/x86/kvm/cpuid.c
> > @@ -122,6 +122,14 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
> >MSR_IA32_MISC_ENABLE_MWAIT);
> > }
> >
> > +   best = kvm_find_cpuid_entry(vcpu, 6, 0);
> > +   if (best) {
> > +   if (guest_has_aperfmperf(vcpu->kvm) &&
> > +   boot_cpu_has(X86_FEATURE_APERFMPERF))
> > +   best->ecx |= 1;
> > +   else
> > +   best->ecx &= ~1;
> > +   }
> 
> In my understanding, KVM allows userspace to set a CPUID feature bit for
> guest even if hardware doesn't support the feature.
> 
> So what makes X86_FEATURE_APERFMPERF different here? Is there any
> concern I miss?
> 
> -Xiaoyao

Whether software emulation for aperf/mperf or pass-through depends on host cpu 
aperf/mperf feature.
 
Software emulation: the period of aperf/mperf in guest mode are accumulated as 
emulated value

-Li


[PATCH][v5] KVM: X86: support APERF/MPERF registers

2020-05-29 Thread Li RongQing
Guest kernel reports a fixed cpu frequency in /proc/cpuinfo,
this is confused to user when turbo is enable, and aperf/mperf
can be used to show current cpu frequency after 7d5905dc14a
"(x86 / CPU: Always show current CPU frequency in /proc/cpuinfo)"
so guest should support aperf/mperf capability

This patch implements aperf/mperf by three mode: none, software
emulation, and pass-through

None: default mode, guest does not support aperf/mperf

Software emulation: the period of aperf/mperf in guest mode are
accumulated as emulated value

Pass-though: it is only suitable for KVM_HINTS_REALTIME, Because
that hint guarantees we have a 1:1 vCPU:CPU binding and guaranteed
no over-commit.

And a per-VM capability is added to configure aperfmperf mode

Signed-off-by: Li RongQing 
Signed-off-by: Chai Wen 
Signed-off-by: Jia Lina 
---
diff v4:
fix maybe-uninitialized warning

diff v3:
fix interception of MSR_IA32_MPERF/APERF in svm

diff v2:
support aperfmperf pass though
move common codes to kvm_get_msr_common

diff v1:
1. support AMD, but not test
2. support per-vm capability to enable

 Documentation/virt/kvm/api.rst  | 10 ++
 arch/x86/include/asm/kvm_host.h | 11 +++
 arch/x86/kvm/cpuid.c| 13 -
 arch/x86/kvm/svm/svm.c  |  8 
 arch/x86/kvm/vmx/vmx.c  |  6 ++
 arch/x86/kvm/x86.c  | 42 +
 arch/x86/kvm/x86.h  | 15 +++
 include/uapi/linux/kvm.h|  1 +
 8 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index d871dacb984e..f854f4da6fd8 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6126,3 +6126,13 @@ KVM can therefore start protected VMs.
 This capability governs the KVM_S390_PV_COMMAND ioctl and the
 KVM_MP_STATE_LOAD MP_STATE. KVM_SET_MP_STATE can fail for protected
 guests when the state change is invalid.
+
+8.23 KVM_CAP_APERFMPERF
+
+
+:Architectures: x86
+:Parameters: args[0] is aperfmperf mode;
+ 0 for not support, 1 for software emulation, 2 for pass-through
+:Returns: 0 on success; -1 on error
+
+This capability indicates that KVM supports APERF and MPERF MSR registers
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index fd78bd44b2d6..14643f8af9c4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -824,6 +824,9 @@ struct kvm_vcpu_arch {
 
/* AMD MSRC001_0015 Hardware Configuration */
u64 msr_hwcr;
+
+   u64 v_mperf;
+   u64 v_aperf;
 };
 
 struct kvm_lpage_info {
@@ -889,6 +892,12 @@ enum kvm_irqchip_mode {
KVM_IRQCHIP_SPLIT,/* created with KVM_CAP_SPLIT_IRQCHIP */
 };
 
+enum kvm_aperfmperf_mode {
+   KVM_APERFMPERF_NONE,
+   KVM_APERFMPERF_SOFT,  /* software emulate aperfmperf */
+   KVM_APERFMPERF_PT,/* pass-through aperfmperf to guest */
+};
+
 #define APICV_INHIBIT_REASON_DISABLE0
 #define APICV_INHIBIT_REASON_HYPERV 1
 #define APICV_INHIBIT_REASON_NESTED 2
@@ -986,6 +995,8 @@ struct kvm_arch {
 
struct kvm_pmu_event_filter *pmu_event_filter;
struct task_struct *nx_lpage_recovery_thread;
+
+   enum kvm_aperfmperf_mode aperfmperf_mode;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index cd708b0b460a..c960dda4251b 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -122,6 +122,14 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
   MSR_IA32_MISC_ENABLE_MWAIT);
}
 
+   best = kvm_find_cpuid_entry(vcpu, 6, 0);
+   if (best) {
+   if (guest_has_aperfmperf(vcpu->kvm) &&
+   boot_cpu_has(X86_FEATURE_APERFMPERF))
+   best->ecx |= 1;
+   else
+   best->ecx &= ~1;
+   }
/* Note, maxphyaddr must be updated before tdp_level. */
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
vcpu->arch.tdp_level = kvm_x86_ops.get_tdp_level(vcpu);
@@ -557,7 +565,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array 
*array, u32 function)
case 6: /* Thermal management */
entry->eax = 0x4; /* allow ARAT */
entry->ebx = 0;
-   entry->ecx = 0;
+   if (boot_cpu_has(X86_FEATURE_APERFMPERF))
+   entry->ecx = 0x1;
+   else
+   entry->ecx = 0x0;
entry->edx = 0;
break;
/* function 7 has additional index. */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index e9c0fb68387d..1d38fe3afc0d 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1200,6 +1200,14 @@ static int svm_create_vcpu(struct kvm_vc

答复: [PATCH] [v4] KVM: X86: support APERF/MPERF registers

2020-05-07 Thread Li,Rongqing
> Hi Li,
> 
> Thank you for the patch! Perhaps something to improve:
> 
> [auto build test WARNING on kvm/linux-next] [also build test WARNING on
> next-20200505] [cannot apply to tip/auto-latest linus/master linux/master
> v5.7-rc4] [if your patch is applied to the wrong git tree, please drop us a 
> note to
> help improve the system. BTW, we also suggest to use '--base' option to 
> specify
> the base tree in git format-patch, please see
> https://stackoverflow.com/a/37406982]
> 
> url:
> https://github.com/0day-ci/linux/commits/Li-RongQing/KVM-X86-support-APE
> RF-MPERF-registers/20200507-023327
> base:   https://git.kernel.org/pub/scm/virt/kvm/kvm.git linux-next
> config: x86_64-allmodconfig (attached as .config)
> compiler: gcc-7 (Ubuntu 7.5.0-6ubuntu2) 7.5.0
> reproduce:
> # save the attached .config to linux build tree
> make ARCH=x86_64
> 
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kbuild test robot 
> 
> Note: it may well be a FALSE warning. FWIW you are at least aware of it now.
> http://gcc.gnu.org/wiki/Better_Uninitialized_Warnings
> 
> All warnings (new ones prefixed by >>):
> 
>arch/x86/kvm/x86.c: In function 'vcpu_enter_guest':
> >> arch/x86/kvm/x86.c:8219:13: warning: 'aperf' may be used
> >> uninitialized in this function [-Wmaybe-uninitialized]
>  u64 mperf, aperf;
> ^
> >> arch/x86/kvm/x86.c:8219:6: warning: 'mperf' may be used uninitialized
> >> in this function [-Wmaybe-uninitialized]
>  u64 mperf, aperf;
>  ^

I think this is a FALSE warning, set and use mperf/aperf only if 
enable_aperfmperf is true


-Li


[PATCH] [v4] KVM: X86: support APERF/MPERF registers

2020-05-06 Thread Li RongQing
The guest kernel reports a fixed cpu frequency in /proc/cpuinfo,
which is confusing to the user when turbo is enabled; aperf/mperf
can be used to show the current cpu frequency after commit 7d5905dc14a
"(x86 / CPU: Always show current CPU frequency in /proc/cpuinfo)",
so the guest should support the aperf/mperf capability

This patch implements aperf/mperf in three modes: none, software
emulation, and pass-through

none: default mode, the guest does not support aperf/mperf

software emulation: the periods of aperf/mperf spent in guest mode
are accumulated as the emulated values

pass-through: it is only suitable for KVM_HINTS_REALTIME, because
that hint guarantees we have a 1:1 vCPU:CPU binding and guaranteed
no over-commit.

and a per-VM capability is added to configure aperfmperf mode

Signed-off-by: Li RongQing 
Signed-off-by: Chai Wen 
Signed-off-by: Jia Lina 
---
diff v3:
fix interception of MSR_IA32_MPERF/APERF in svm

diff v2:
support aperfmperf pass though
move common codes to kvm_get_msr_common

diff v1:
1. support AMD, but not test
2. support per-vm capability to enable

 Documentation/virt/kvm/api.rst  | 10 ++
 arch/x86/include/asm/kvm_host.h | 11 +++
 arch/x86/kvm/cpuid.c| 13 -
 arch/x86/kvm/svm/svm.c  |  8 
 arch/x86/kvm/vmx/vmx.c  |  6 ++
 arch/x86/kvm/x86.c  | 42 +
 arch/x86/kvm/x86.h  | 15 +++
 include/uapi/linux/kvm.h|  1 +
 8 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index d871dac..f854f4d 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6126,3 +6126,13 @@ KVM can therefore start protected VMs.
 This capability governs the KVM_S390_PV_COMMAND ioctl and the
 KVM_MP_STATE_LOAD MP_STATE. KVM_SET_MP_STATE can fail for protected
 guests when the state change is invalid.
+
+8.23 KVM_CAP_APERFMPERF
+
+
+:Architectures: x86
+:Parameters: args[0] is aperfmperf mode;
 0 for not supported, 1 for software emulation, 2 for pass-through
+:Returns: 0 on success; -1 on error
+
+This capability indicates that KVM supports APERF and MPERF MSR registers
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a239a29..cccbb24 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -825,6 +825,9 @@ struct kvm_vcpu_arch {
 
/* AMD MSRC001_0015 Hardware Configuration */
u64 msr_hwcr;
+
+   u64 v_mperf;
+   u64 v_aperf;
 };
 
 struct kvm_lpage_info {
@@ -890,6 +893,12 @@ enum kvm_irqchip_mode {
KVM_IRQCHIP_SPLIT,/* created with KVM_CAP_SPLIT_IRQCHIP */
 };
 
+enum kvm_aperfmperf_mode {
+   KVM_APERFMPERF_NONE,
+   KVM_APERFMPERF_SOFT,  /* software emulate aperfmperf */
+   KVM_APERFMPERF_PT,/* pass-through aperfmperf to guest */
+};
+
 #define APICV_INHIBIT_REASON_DISABLE0
 #define APICV_INHIBIT_REASON_HYPERV 1
 #define APICV_INHIBIT_REASON_NESTED 2
@@ -987,6 +996,8 @@ struct kvm_arch {
 
struct kvm_pmu_event_filter *pmu_event_filter;
struct task_struct *nx_lpage_recovery_thread;
+
+   enum kvm_aperfmperf_mode aperfmperf_mode;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 6828be9..8a9771e5 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -124,6 +124,14 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
   MSR_IA32_MISC_ENABLE_MWAIT);
}
 
+   best = kvm_find_cpuid_entry(vcpu, 6, 0);
+   if (best) {
+   if (guest_has_aperfmperf(vcpu->kvm) &&
+   boot_cpu_has(X86_FEATURE_APERFMPERF))
+   best->ecx |= 1;
+   else
+   best->ecx &= ~1;
+   }
/* Update physical-address width */
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
kvm_mmu_reset_context(vcpu);
@@ -558,7 +566,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array 
*array, u32 function)
case 6: /* Thermal management */
entry->eax = 0x4; /* allow ARAT */
entry->ebx = 0;
-   entry->ecx = 0;
+   if (boot_cpu_has(X86_FEATURE_APERFMPERF))
+   entry->ecx = 0x1;
+   else
+   entry->ecx = 0x0;
entry->edx = 0;
break;
/* function 7 has additional index. */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index c86f727..4fa002d 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -1213,6 +1213,14 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
svm->msrpm = page_address(msrpm_pages);
svm_vcpu_init_msrpm(svm->msrpm);
 
+   if (guest_aperfmperf_soft(vcp

答复: [PATCH] [v3] kvm: x86: support APERF/MPERF registers

2020-05-03 Thread Li,Rongqing


> -邮件原件-
> 发件人: Wei Huang [mailto:wei.hua...@amd.com]
> 发送时间: 2020年5月2日 5:31
> 收件人: Li,Rongqing 
> 抄送: linux-kernel@vger.kernel.org; k...@vger.kernel.org; x...@kernel.org;
> h...@zytor.com; b...@alien8.de; mi...@redhat.com; t...@linutronix.de;
> jmatt...@google.com; wanpen...@tencent.com; vkuzn...@redhat.com;
> sean.j.christopher...@intel.com; pbonz...@redhat.com; xiaoyao...@intel.com
> 主题: Re: [PATCH] [v3] kvm: x86: support APERF/MPERF registers
> 重要性: 高
> 
> On 04/30 06:45, Li RongQing wrote:
> > Guest kernel reports a fixed cpu frequency in /proc/cpuinfo, this is
> > confused to user when turbo is enable, and aperf/mperf can be used to
> > show current cpu frequency after 7d5905dc14a
> > "(x86 / CPU: Always show current CPU frequency in /proc/cpuinfo)"
> > so guest should support aperf/mperf capability
> >
> > this patch implements aperf/mperf by three mode: none, software
>   
>   This
> 
> > emulation, and pass-through
> >
> > none: default mode, guest does not support aperf/mperf
> >
> > software emulation: the period of aperf/mperf in guest mode are
> > accumulated as emulated value
> >
> > pass-though: it is only suitable for KVM_HINTS_REALTIME, Because that
> > hint guarantees we have a 1:1 vCPU:CPU binding and guaranteed no
> > over-commit.
> 
> If we save/restore the values of aperf/mperf properly during vcpu migration
> among different cores, is pinning still required?
> 

I think it can be as a new mode, it maybe add more msr operation, like write

> >
> > and a per-VM capability is added to configure aperfmperf mode
> >
> > Signed-off-by: Li RongQing 
> > Signed-off-by: Chai Wen 
> > Signed-off-by: Jia Lina 
> > ---
> > diff v2:
> > support aperfmperf pass though
> > move common codes to kvm_get_msr_common
> >
> > diff v1:
> > 1. support AMD, but not test
> 
> pt-mode doesn't work doesn't work on AMD. See below.
> 
> > 2. support per-vm capability to enable  Documentation/virt/kvm/api.rst
> > | 10 ++  arch/x86/include/asm/kvm_host.h | 11 +++
> >  arch/x86/kvm/cpuid.c| 13 -
> >  arch/x86/kvm/svm.c  |  8 
> >  arch/x86/kvm/vmx/vmx.c  |  6 ++
> >  arch/x86/kvm/x86.c  | 42
> +
> >  arch/x86/kvm/x86.h  | 15 +++
> >  include/uapi/linux/kvm.h|  1 +
> >  8 files changed, 105 insertions(+), 1 deletion(-)
> >
> > diff --git a/Documentation/virt/kvm/api.rst
> > b/Documentation/virt/kvm/api.rst index efbbe570aa9b..c3be3b6a1717
> > 100644
> > --- a/Documentation/virt/kvm/api.rst
> > +++ b/Documentation/virt/kvm/api.rst
> > @@ -6109,3 +6109,13 @@ KVM can therefore start protected VMs.
> >  This capability governs the KVM_S390_PV_COMMAND ioctl and the
> > KVM_MP_STATE_LOAD MP_STATE. KVM_SET_MP_STATE can fail for
> protected
> > guests when the state change is invalid.
> > +
> > +8.23 KVM_CAP_APERFMPERF
> > +
> > +
> > +:Architectures: x86
> > +:Parameters: args[0] is aperfmperf mode;
> > + 0 for not support, 1 for software emulation, 2 for
> > +pass-through
> > +:Returns: 0 on success; -1 on error
> > +
> > +This capability indicates that KVM supports APERF and MPERF MSR
> > +registers
> > diff --git a/arch/x86/include/asm/kvm_host.h
> > b/arch/x86/include/asm/kvm_host.h index 42a2d0d3984a..81477f676f60
> > 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -820,6 +820,9 @@ struct kvm_vcpu_arch {
> >
> > /* AMD MSRC001_0015 Hardware Configuration */
> > u64 msr_hwcr;
> > +
> > +   u64 v_mperf;
> > +   u64 v_aperf;
> >  };
> >
> >  struct kvm_lpage_info {
> > @@ -885,6 +888,12 @@ enum kvm_irqchip_mode {
> > KVM_IRQCHIP_SPLIT,/* created with KVM_CAP_SPLIT_IRQCHIP
> */
> >  };
> >
> > +enum kvm_aperfmperf_mode {
> > +   KVM_APERFMPERF_NONE,
> > +   KVM_APERFMPERF_SOFT,  /* software emulate aperfmperf */
> > +   KVM_APERFMPERF_PT,/* pass-through aperfmperf to guest */
> > +};
> > +
> >  #define APICV_INHIBIT_REASON_DISABLE0
> >  #define APICV_INHIBIT_REASON_HYPERV 1
> >  #define APICV_INHIBIT_REASON_NESTED 2
> > @@ -982,6 +991,8 @@ struct kvm_arch {
> >
> > struct kvm_pmu_event_filter *pmu_event_filter;
> > struct task_struct *nx_lpage_recovery_thread;
> > +
> &g

[PATCH] [v3] kvm: x86: support APERF/MPERF registers

2020-04-30 Thread Li RongQing
The guest kernel reports a fixed cpu frequency in /proc/cpuinfo,
which is confusing to the user when turbo is enabled; aperf/mperf
can be used to show the current cpu frequency after commit 7d5905dc14a
"(x86 / CPU: Always show current CPU frequency in /proc/cpuinfo)",
so the guest should support the aperf/mperf capability

This patch implements aperf/mperf in three modes: none, software
emulation, and pass-through

none: default mode, the guest does not support aperf/mperf

software emulation: the periods of aperf/mperf spent in guest mode
are accumulated as the emulated values

pass-through: it is only suitable for KVM_HINTS_REALTIME, because
that hint guarantees we have a 1:1 vCPU:CPU binding and guaranteed
no over-commit.

and a per-VM capability is added to configure aperfmperf mode

Signed-off-by: Li RongQing 
Signed-off-by: Chai Wen 
Signed-off-by: Jia Lina 
---
diff v2:
support aperfmperf pass though
move common codes to kvm_get_msr_common

diff v1:
1. support AMD, but not test
2. support per-vm capability to enable
 Documentation/virt/kvm/api.rst  | 10 ++
 arch/x86/include/asm/kvm_host.h | 11 +++
 arch/x86/kvm/cpuid.c| 13 -
 arch/x86/kvm/svm.c  |  8 
 arch/x86/kvm/vmx/vmx.c  |  6 ++
 arch/x86/kvm/x86.c  | 42 +
 arch/x86/kvm/x86.h  | 15 +++
 include/uapi/linux/kvm.h|  1 +
 8 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index efbbe570aa9b..c3be3b6a1717 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6109,3 +6109,13 @@ KVM can therefore start protected VMs.
 This capability governs the KVM_S390_PV_COMMAND ioctl and the
 KVM_MP_STATE_LOAD MP_STATE. KVM_SET_MP_STATE can fail for protected
 guests when the state change is invalid.
+
+8.23 KVM_CAP_APERFMPERF
+
+
+:Architectures: x86
+:Parameters: args[0] is aperfmperf mode;
+ 0 for not support, 1 for software emulation, 2 for pass-through
+:Returns: 0 on success; -1 on error
+
+This capability indicates that KVM supports APERF and MPERF MSR registers
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 42a2d0d3984a..81477f676f60 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -820,6 +820,9 @@ struct kvm_vcpu_arch {
 
/* AMD MSRC001_0015 Hardware Configuration */
u64 msr_hwcr;
+
+   u64 v_mperf;
+   u64 v_aperf;
 };
 
 struct kvm_lpage_info {
@@ -885,6 +888,12 @@ enum kvm_irqchip_mode {
KVM_IRQCHIP_SPLIT,/* created with KVM_CAP_SPLIT_IRQCHIP */
 };
 
+enum kvm_aperfmperf_mode {
+   KVM_APERFMPERF_NONE,
+   KVM_APERFMPERF_SOFT,  /* software emulate aperfmperf */
+   KVM_APERFMPERF_PT,/* pass-through aperfmperf to guest */
+};
+
 #define APICV_INHIBIT_REASON_DISABLE0
 #define APICV_INHIBIT_REASON_HYPERV 1
 #define APICV_INHIBIT_REASON_NESTED 2
@@ -982,6 +991,8 @@ struct kvm_arch {
 
struct kvm_pmu_event_filter *pmu_event_filter;
struct task_struct *nx_lpage_recovery_thread;
+
+   enum kvm_aperfmperf_mode aperfmperf_mode;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 901cd1fdecd9..7a64ea2c3eef 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -124,6 +124,14 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
   MSR_IA32_MISC_ENABLE_MWAIT);
}
 
+   best = kvm_find_cpuid_entry(vcpu, 6, 0);
+   if (best) {
+   if (guest_has_aperfmperf(vcpu->kvm) &&
+   boot_cpu_has(X86_FEATURE_APERFMPERF))
+   best->ecx |= 1;
+   else
+   best->ecx &= ~1;
+   }
/* Update physical-address width */
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
kvm_mmu_reset_context(vcpu);
@@ -558,7 +566,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array 
*array, u32 function)
case 6: /* Thermal management */
entry->eax = 0x4; /* allow ARAT */
entry->ebx = 0;
-   entry->ecx = 0;
+   if (boot_cpu_has(X86_FEATURE_APERFMPERF))
+   entry->ecx = 0x1;
+   else
+   entry->ecx = 0x0;
entry->edx = 0;
break;
/* function 7 has additional index. */
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 851e9cc79930..5646b6475049 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2292,6 +2292,14 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
svm->msrpm = page_address(msrpm_pages);
svm_vcpu_init_msrpm(svm->msrpm);
 
+   if (guest_aperfmperf_soft(vcpu->kvm)) {
+   

答复: [PATCH][v2] kvm: x86: emulate APERF/MPERF registers

2020-04-30 Thread Li,Rongqing


> -邮件原件-
> 发件人: Xiaoyao Li [mailto:xiaoyao...@intel.com]
> 发送时间: 2020年4月30日 14:49
> 收件人: Li,Rongqing ; linux-kernel@vger.kernel.org;
> k...@vger.kernel.org; x...@kernel.org; h...@zytor.com; b...@alien8.de;
> mi...@redhat.com; t...@linutronix.de; j...@8bytes.org;
> jmatt...@google.com; wanpen...@tencent.com; vkuzn...@redhat.com;
> sean.j.christopher...@intel.com; pbonz...@redhat.com
> 主题: Re: [PATCH][v2] kvm: x86: emulate APERF/MPERF registers
> 
> On 4/29/2020 1:46 PM, Li RongQing wrote:
> > Guest kernel reports a fixed cpu frequency in /proc/cpuinfo, this is
> > confused to user when turbo is enable, and aperf/mperf can be used to
> > show current cpu frequency after 7d5905dc14a
> > "(x86 / CPU: Always show current CPU frequency in /proc/cpuinfo)"
> > so we should emulate aperf mperf to achieve it
> >
> > the period of aperf/mperf in guest mode are accumulated as emulated
> > value, and add per-VM knod to enable emulate mperfaperf
> >
> > diff v1:
> > 1. support AMD
> > 2. support per-vm capability to enable
> >
> [...]
> > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index
> > 851e9cc79930..1d157a8dba46 100644
> > --- a/arch/x86/kvm/svm.c
> > +++ b/arch/x86/kvm/svm.c
> > @@ -4310,6 +4310,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu,
> struct msr_data *msr_info)
> > case MSR_F10H_DECFG:
> > msr_info->data = svm->msr_decfg;
> > break;
> > +   case MSR_IA32_MPERF:
> > +   msr_info->data = vcpu->arch.v_mperf;
> > +   break;
> > +   case MSR_IA32_APERF:
> > +   msr_info->data = vcpu->arch.v_aperf;
> > +   break;
> > default:
> > return kvm_get_msr_common(vcpu, msr_info);
> > }
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index
> > 91749f1254e8..b05e276e262b 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -1914,6 +1914,12 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu,
> struct msr_data *msr_info)
> > !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
> > return 1;
> > goto find_shared_msr;
> > +   case MSR_IA32_MPERF:
> > +   msr_info->data = vcpu->arch.v_mperf;
> > +   break;
> > +   case MSR_IA32_APERF:
> > +   msr_info->data = vcpu->arch.v_aperf;
> > +   break;
> 
> They are same for both vmx and svm, you can put them in
> kvm_get_msr_common()
> 

Ok

> BTW, are those two MSR always readable regardless of guest's CPUID?

It should be, not sure if there is abnormal

thanks
-LiRongQing

> > default:
> > find_shared_msr:
> > msr = find_msr_entry(vmx, msr_info->index);
> 



答复: [PATCH][v2] kvm: x86: emulate APERF/MPERF registers

2020-04-29 Thread Li,Rongqing


> -邮件原件-
> 发件人: Paolo Bonzini [mailto:pbonz...@redhat.com]
> 发送时间: 2020年4月29日 18:21
> 收件人: Peter Zijlstra ; Li,Rongqing
> 
> 抄送: linux-kernel@vger.kernel.org; k...@vger.kernel.org; x...@kernel.org;
> h...@zytor.com; b...@alien8.de; mi...@redhat.com; t...@linutronix.de;
> j...@8bytes.org; jmatt...@google.com; wanpen...@tencent.com;
> vkuzn...@redhat.com; sean.j.christopher...@intel.com
> 主题: Re: [PATCH][v2] kvm: x86: emulate APERF/MPERF registers
> 
> On 29/04/20 10:54, Peter Zijlstra wrote:
> > On Wed, Apr 29, 2020 at 01:46:36PM +0800, Li RongQing wrote:
> >> Guest kernel reports a fixed cpu frequency in /proc/cpuinfo, this is
> >> confused to user when turbo is enable, and aperf/mperf can be used to
> >> show current cpu frequency after 7d5905dc14a
> >> "(x86 / CPU: Always show current CPU frequency in /proc/cpuinfo)"
> >> so we should emulate aperf mperf to achieve it
> >>
> >> the period of aperf/mperf in guest mode are accumulated as emulated
> >> value, and add per-VM knod to enable emulate mperfaperf
> >>
> >> diff v1:
> >> 1. support AMD
> >> 2. support per-vm capability to enable
> > Would it make sense to provide a pass-through APERF/MPERF for
> > KVM_HINTS_REALTIME ? Because that hint guarantees we have a 1:1
> > vCPU:CPU binding and guaranteed no over-commit.
> >
> 
> Yes but that's up to userspace.
> 
> Paolo

It seems the kernel should give userspace the capability to disable the 
interception of mperf/aperf for KVM_HINTS_REALTIME

So I will change this patch to support three mperfaperf modes: none, software 
emulation, and pass-through (pt)


diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1d157a8dba46..6b05f78bde78 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1657,9 +1657,11 @@ static void init_vmcb(struct vcpu_svm *svm)
set_intercept(svm, INTERCEPT_SKINIT);
set_intercept(svm, INTERCEPT_WBINVD);
set_intercept(svm, INTERCEPT_XSETBV);
-   set_intercept(svm, INTERCEPT_RDPRU);
set_intercept(svm, INTERCEPT_RSM);
 
+   if (!guest_mperfaperf_pt(svm->vcpu.kvm))
+   set_intercept(svm, INTERCEPT_RDPRU);
+
if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
set_intercept(svm, INTERCEPT_MONITOR);
set_intercept(svm, INTERCEPT_MWAIT);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b05e276e262b..231732924c50 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6765,6 +6765,12 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
vmx_disable_intercept_for_msr(msr_bitmap, 
MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
vmx_disable_intercept_for_msr(msr_bitmap, 
MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
}
+
+   if (guest_mperfaperf_pt(vcpu->kvm)) {
+   vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_MPERF, 
MSR_TYPE_R);
+   vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_APERF, 
MSR_TYPE_R);
+   }
+
vmx->msr_bitmap_mode = 0;
 
vmx->loaded_vmcs = >vmcs01;


-Li


答复: [PATCH][v2] kvm: x86: emulate APERF/MPERF registers

2020-04-29 Thread Li,Rongqing
> 
> Would it make sense to provide a pass-through APERF/MPERF for
> KVM_HINTS_REALTIME ? Because that hint guarantees we have a 1:1
> vCPU:CPU binding and guaranteed no over-commit.

Make sense

I think this can be done in a separate patch for KVM_HINTS_REALTIME

Thanks

=Li


答复: [PATCH] x86: move turbo_disabled() out of intel_set_max_freq_ratio

2020-04-29 Thread Li,Rongqing

> Hello,
> 
> the problem is that turbo can be enabled/disabled by the firmware at runtime,
> after the machine has booted.
> 
> This happens for example with the Dell XPS 13, where turbo gets disabled by
> the firmware if the machine is disconnected from AC power and runs on battery.
> The laptop could boot on battery (turbo disabled), then after some time the
> user connects the AC power supply, turbo gets enabled, and with your patch
> we wouldn't know what is the turbo_freq/base_freq ratio to do frequency
> invariance (we skipped reading MSR_TURBO_RATIO_LIMIT at boot because
> turbo was disabled at that timed).
> 
> This behavior was requested by reviewers in this thread:
> https://lore.kernel.org/lkml/1906426.HDqaVa71mF@kreacher/
> and implemented with 918229cdd5ab ("x86/intel_pstate: Handle runtime turbo
> disablement/enablement in frequency invariance").
> 


Thanks for you explanation

Sorry for noise

-Li

> 
> Thanks,
> Giovanni Gherdovich


[PATCH][v2] kvm: x86: emulate APERF/MPERF registers

2020-04-28 Thread Li RongQing
The guest kernel reports a fixed cpu frequency in /proc/cpuinfo,
which is confusing to the user when turbo is enabled; aperf/mperf
can be used to show the current cpu frequency after commit 7d5905dc14a
"(x86 / CPU: Always show current CPU frequency in /proc/cpuinfo)",
so we should emulate aperf/mperf to achieve this

the periods of aperf/mperf spent in guest mode are accumulated as
the emulated values, and a per-VM knob is added to enable emulation
of mperf/aperf

diff v1:
1. support AMD
2. support per-vm capability to enable

Signed-off-by: Li RongQing 
Signed-off-by: Chai Wen 
Signed-off-by: Jia Lina 
---
 Documentation/virt/kvm/api.rst  |  7 +++
 arch/x86/include/asm/kvm_host.h |  4 
 arch/x86/kvm/cpuid.c| 13 -
 arch/x86/kvm/svm.c  |  6 ++
 arch/x86/kvm/vmx/vmx.c  |  6 ++
 arch/x86/kvm/x86.c  | 37 +
 arch/x86/kvm/x86.h  |  6 ++
 include/uapi/linux/kvm.h|  1 +
 8 files changed, 79 insertions(+), 1 deletion(-)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index efbbe570aa9b..dc4b4036e5d2 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6109,3 +6109,10 @@ KVM can therefore start protected VMs.
 This capability governs the KVM_S390_PV_COMMAND ioctl and the
 KVM_MP_STATE_LOAD MP_STATE. KVM_SET_MP_STATE can fail for protected
 guests when the state change is invalid.
+
+8.23 KVM_CAP_MPERFAPERF
+
+
+:Architectures: x86
+
+This capability indicates that KVM supports APERF and MPERF MSR registers
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 42a2d0d3984a..58fd3254804f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -820,6 +820,9 @@ struct kvm_vcpu_arch {
 
/* AMD MSRC001_0015 Hardware Configuration */
u64 msr_hwcr;
+
+   u64 v_mperf;
+   u64 v_aperf;
 };
 
 struct kvm_lpage_info {
@@ -979,6 +982,7 @@ struct kvm_arch {
 
bool guest_can_read_msr_platform_info;
bool exception_payload_enabled;
+   bool guest_has_mperfaperf;
 
struct kvm_pmu_event_filter *pmu_event_filter;
struct task_struct *nx_lpage_recovery_thread;
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 901cd1fdecd9..3bdd907981b5 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -124,6 +124,14 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
   MSR_IA32_MISC_ENABLE_MWAIT);
}
 
+   best = kvm_find_cpuid_entry(vcpu, 6, 0);
+   if (best) {
+   if (guest_has_mperfaperf(vcpu->kvm) &&
+   boot_cpu_has(X86_FEATURE_APERFMPERF))
+   best->ecx |= 1;
+   else
+   best->ecx &= ~1;
+   }
/* Update physical-address width */
vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
kvm_mmu_reset_context(vcpu);
@@ -558,7 +566,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array 
*array, u32 function)
case 6: /* Thermal management */
entry->eax = 0x4; /* allow ARAT */
entry->ebx = 0;
-   entry->ecx = 0;
+   if (boot_cpu_has(X86_FEATURE_APERFMPERF))
+   entry->ecx = 0x1;
+   else
+   entry->ecx = 0x0;
entry->edx = 0;
break;
/* function 7 has additional index. */
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 851e9cc79930..1d157a8dba46 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4310,6 +4310,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info)
case MSR_F10H_DECFG:
msr_info->data = svm->msr_decfg;
break;
+   case MSR_IA32_MPERF:
+   msr_info->data = vcpu->arch.v_mperf;
+   break;
+   case MSR_IA32_APERF:
+   msr_info->data = vcpu->arch.v_aperf;
+   break;
default:
return kvm_get_msr_common(vcpu, msr_info);
}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 91749f1254e8..b05e276e262b 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1914,6 +1914,12 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info)
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
goto find_shared_msr;
+   case MSR_IA32_MPERF:
+   msr_info->data = vcpu->arch.v_mperf;
+   break;
+   case MSR_IA32_APERF:
+   msr_info->data = vcpu->arch.v_aperf;
+   break;
default:
find_shared_msr:
msr = find_msr_entry(vmx, msr_info->index);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/

[PATCH] x86: move turbo_disabled() out of intel_set_max_freq_ratio

2020-04-28 Thread Li RongQing
Move the turbo_disabled() check before intel_set_max_freq_ratio():
when turbo is disabled, the max frequency ratio is a constant
value, so it is unnecessary to read the MSR_TURBO_RATIO* MSRs to
compute it

Signed-off-by: Li RongQing 
---
 arch/x86/kernel/smpboot.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index fe3ab9632f3b..8979c459df2f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1987,7 +1987,7 @@ static bool intel_set_max_freq_ratio(void)
 out:
arch_turbo_freq_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE,
base_freq);
-   arch_set_max_freq_ratio(turbo_disabled());
+   arch_set_max_freq_ratio(false);
return true;
 }
 
@@ -2009,6 +2009,9 @@ static void init_freq_invariance(void)
if (smp_processor_id() != 0 || !boot_cpu_has(X86_FEATURE_APERFMPERF))
return;
 
+   if (turbo_disabled())
+   return;
+
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
ret = intel_set_max_freq_ratio();
 
-- 
2.16.2



答复: [PATCH][v2] watchdog/hardlockup: reassign last_timestamp when enable nmi event

2019-10-22 Thread Li,Rongqing
Ping

Thanks

-Li

> -邮件原件-
> 发件人: linux-kernel-ow...@vger.kernel.org
> [mailto:linux-kernel-ow...@vger.kernel.org] 代表 Li RongQing
> 发送时间: 2019年10月15日 14:34
> 收件人: a...@linux-foundation.org; sergey.senozhatsky.w...@gmail.com;
> linux-kernel@vger.kernel.org; t...@linutronix.de; kan.li...@intel.com
> 主题: [PATCH][v2] watchdog/hardlockup: reassign last_timestamp when
> enable nmi event
> 
> last_timestamp is not initialized and is zero after boot, or stop to forward 
> when
> nmi watchdog is disabled; and false positives still is possible when restart 
> NMI
> timer after stopping 120 seconds
> 
> so reassign last_timestamp always when enable nmi event
> 
> Fixes: 7edaeb6841df ("kernel/watchdog: Prevent false positives with turbo
> modes")
> Signed-off-by: Li RongQing 
> Signed-off-by: Zhang Yu 
> ---
> 
> v1-->v2: make it be able to be compiled on no
> v1-->CONFIG_HARDLOCKUP_CHECK_TIMESTAMP platform
> 
> kernel/watchdog_hld.c | 18 +-
>  1 file changed, 17 insertions(+), 1 deletion(-)
> 
> diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index
> 247bf0b1582c..f14d18280387 100644
> --- a/kernel/watchdog_hld.c
> +++ b/kernel/watchdog_hld.c
> @@ -91,11 +91,24 @@ static bool watchdog_check_timestamp(void)
>   __this_cpu_write(last_timestamp, now);
>   return true;
>  }
> +
> +static void watchdog_touch_timestamp(int cpu) {
> +
> + ktime_t now = ktime_get_mono_fast_ns();
> +
> + per_cpu(last_timestamp, cpu) = now;
> +}
>  #else
>  static inline bool watchdog_check_timestamp(void)  {
>   return true;
>  }
> +
> +static void watchdog_touch_timestamp(int cpu) {
> +
> +}
>  #endif
> 
>  static struct perf_event_attr wd_hw_attr = { @@ -196,6 +209,7 @@ void
> hardlockup_detector_perf_enable(void)
>   if (!atomic_fetch_inc(_cpus))
>   pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
> 
> + watchdog_touch_timestamp(smp_processor_id());
>   perf_event_enable(this_cpu_read(watchdog_ev));
>  }
> 
> @@ -274,8 +288,10 @@ void __init hardlockup_detector_perf_restart(void)
>   for_each_online_cpu(cpu) {
>   struct perf_event *event = per_cpu(watchdog_ev, cpu);
> 
> - if (event)
> + if (event) {
> + watchdog_touch_timestamp(cpu);
>   perf_event_enable(event);
> + }
>   }
>  }
> 
> --
> 2.16.2



[PATCH] mm: remove VM_ACCT(PAGE_SIZE) when charge and uncharge

2019-10-17 Thread Li RongQing
VM_ACCT(PAGE_SIZE) is one, so it is unnecessary to multiply by it

Signed-off-by: Li RongQing 
---
 mm/shmem.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index cd570cc79c76..f01df46ef2ff 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -202,14 +202,13 @@ static inline int shmem_acct_block(unsigned long flags, 
long pages)
if (!(flags & VM_NORESERVE))
return 0;
 
-   return security_vm_enough_memory_mm(current->mm,
-   pages * VM_ACCT(PAGE_SIZE));
+   return security_vm_enough_memory_mm(current->mm, pages);
 }
 
 static inline void shmem_unacct_blocks(unsigned long flags, long pages)
 {
if (flags & VM_NORESERVE)
-   vm_unacct_memory(pages * VM_ACCT(PAGE_SIZE));
+   vm_unacct_memory(pages);
 }
 
 static inline bool shmem_inode_acct_block(struct inode *inode, long pages)
-- 
2.16.2



[PATCH][v2] watchdog/hardlockup: reassign last_timestamp when enable nmi event

2019-10-15 Thread Li RongQing
last_timestamp is not initialized and is zero after boot, or stops
advancing when the nmi watchdog is disabled; false positives are
still possible when the NMI timer is restarted after being stopped
for 120 seconds

so always reassign last_timestamp when enabling the nmi event

Fixes: 7edaeb6841df ("kernel/watchdog: Prevent false positives with turbo 
modes")
Signed-off-by: Li RongQing 
Signed-off-by: Zhang Yu 
---

v1-->v2: make it be able to be compiled on no CONFIG_HARDLOCKUP_CHECK_TIMESTAMP 
platform 

kernel/watchdog_hld.c | 18 +-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 247bf0b1582c..f14d18280387 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -91,11 +91,24 @@ static bool watchdog_check_timestamp(void)
__this_cpu_write(last_timestamp, now);
return true;
 }
+
+static void watchdog_touch_timestamp(int cpu)
+{
+
+   ktime_t now = ktime_get_mono_fast_ns();
+
+   per_cpu(last_timestamp, cpu) = now;
+}
 #else
 static inline bool watchdog_check_timestamp(void)
 {
return true;
 }
+
+static void watchdog_touch_timestamp(int cpu)
+{
+
+}
 #endif
 
 static struct perf_event_attr wd_hw_attr = {
@@ -196,6 +209,7 @@ void hardlockup_detector_perf_enable(void)
if (!atomic_fetch_inc(_cpus))
pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
 
+   watchdog_touch_timestamp(smp_processor_id());
perf_event_enable(this_cpu_read(watchdog_ev));
 }
 
@@ -274,8 +288,10 @@ void __init hardlockup_detector_perf_restart(void)
for_each_online_cpu(cpu) {
struct perf_event *event = per_cpu(watchdog_ev, cpu);
 
-   if (event)
+   if (event) {
+   watchdog_touch_timestamp(cpu);
perf_event_enable(event);
+   }
}
 }
 
-- 
2.16.2



[PATCH] watchdog/hardlockup: reassign last_timestamp when enable nmi event

2019-10-12 Thread Li RongQing
last_timestamp is not initialized and is zero after boot, or stops
advancing while the nmi watchdog is disabled; false positives are
still possible when the NMI timer is restarted after being stopped
for 120 seconds

so always reassign last_timestamp when enabling the nmi event

Fixes: 7edaeb6841df ("kernel/watchdog: Prevent false positives with turbo 
modes")
Signed-off-by: Li RongQing 
---
 kernel/watchdog_hld.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 247bf0b1582c..fc3a5c7ccd82 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -189,6 +189,8 @@ static int hardlockup_detector_event_create(void)
  */
 void hardlockup_detector_perf_enable(void)
 {
+   ktime_t now = ktime_get_mono_fast_ns();
+
if (hardlockup_detector_event_create())
return;
 
@@ -196,6 +198,7 @@ void hardlockup_detector_perf_enable(void)
if (!atomic_fetch_inc(_cpus))
pr_info("Enabled. Permanently consumes one hw-PMU counter.\n");
 
+   this_cpu_write(last_timestamp, now);
perf_event_enable(this_cpu_read(watchdog_ev));
 }
 
@@ -274,8 +277,12 @@ void __init hardlockup_detector_perf_restart(void)
for_each_online_cpu(cpu) {
struct perf_event *event = per_cpu(watchdog_ev, cpu);
 
-   if (event)
+   if (event) {
+   ktime_t now = ktime_get_mono_fast_ns();
+
+   per_cpu(last_timestamp, cpu) = now;
perf_event_enable(event);
+   }
}
 }
 
-- 
2.16.2



[PATCH][RFC] perf/x86: avoid false-positives hard lockup

2019-10-10 Thread Li RongQing
if the perf counter is used as the nmi watchdog, two NMIs within one
soft watchdog sample period will trigger a hard lockup

make sure the remaining time is not less than the soft watchdog period
by comparing against 3/5 of the period before skipping forward, since
the soft watchdog sample period is 2/5 of watchdog_thresh (the nmi
watchdog sample period), as computed by set_sample_period

Signed-off-by: Li RongQing 
---
 arch/x86/events/core.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 7b21455d7504..1f5309456d4c 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -1196,7 +1196,11 @@ int x86_perf_event_set_period(struct perf_event *event)
/*
 * If we are way outside a reasonable range then just skip forward:
 */
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF
+   if (unlikely(left <= -(period * 3 / 5))) {
+#else
if (unlikely(left <= -period)) {
+#endif
left = period;
local64_set(>period_left, left);
hwc->last_period = period;
-- 
2.16.2



[tip: timers/urgent] timer: Read jiffies once when forwarding base clk

2019-09-19 Thread tip-bot2 for Li RongQing
The following commit has been merged into the timers/urgent branch of tip:

Commit-ID: e430d802d6a3aaf61bd3ed03d9404888a29b9bf9
Gitweb:
https://git.kernel.org/tip/e430d802d6a3aaf61bd3ed03d9404888a29b9bf9
Author:Li RongQing 
AuthorDate:Thu, 19 Sep 2019 20:04:47 +08:00
Committer: Thomas Gleixner 
CommitterDate: Thu, 19 Sep 2019 17:50:11 +02:00

timer: Read jiffies once when forwarding base clk

The timer delayed for more than 3 seconds warning was triggered during
testing.

  Workqueue: events_unbound sched_tick_remote
  RIP: 0010:sched_tick_remote+0xee/0x100
  ...
  Call Trace:
   process_one_work+0x18c/0x3a0
   worker_thread+0x30/0x380
   kthread+0x113/0x130
   ret_from_fork+0x22/0x40

The reason is that the code in collect_expired_timers() uses jiffies
unprotected:

if (next_event > jiffies)
base->clk = jiffies;

As the compiler is allowed to reload the value base->clk can advance
between the check and the store and in the worst case advance farther than
next event. That causes the timer expiry to be delayed until the wheel
pointer wraps around.

Convert the code to use READ_ONCE()

Fixes: 236968383cf5 ("timers: Optimize collect_expired_timers() for NOHZ")
Signed-off-by: Li RongQing 
Signed-off-by: Liang ZhiCheng 
Signed-off-by: Thomas Gleixner 
Cc: sta...@vger.kernel.org
Link: 
https://lkml.kernel.org/r/1568894687-14499-1-git-send-email-lirongq...@baidu.com

---
 kernel/time/timer.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 0e315a2..4820823 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1678,24 +1678,26 @@ void timer_clear_idle(void)
 static int collect_expired_timers(struct timer_base *base,
  struct hlist_head *heads)
 {
+   unsigned long now = READ_ONCE(jiffies);
+
/*
 * NOHZ optimization. After a long idle sleep we need to forward the
 * base to current jiffies. Avoid a loop by searching the bitfield for
 * the next expiring timer.
 */
-   if ((long)(jiffies - base->clk) > 2) {
+   if ((long)(now - base->clk) > 2) {
unsigned long next = __next_timer_interrupt(base);
 
/*
 * If the next timer is ahead of time forward to current
 * jiffies, otherwise forward to the next expiry time:
 */
-   if (time_after(next, jiffies)) {
+   if (time_after(next, now)) {
/*
 * The call site will increment base->clk and then
 * terminate the expiry loop immediately.
 */
-   base->clk = jiffies;
+   base->clk = now;
return 0;
}
base->clk = next;


[PATCH] timer: read jiffies once when forwarding base clk

2019-09-19 Thread Li RongQing
The below calltrace was reported, the cause is that timer is delayed
bigger than 3 seconds

Hardware name: New H3C Technologies Co.,Ltd. UniServer R4950 G3/RS41R4950, 
BIOS 2.00.06 V700R003
Workqueue: events_unbound sched_tick_remote
RIP: 0010:sched_tick_remote+0xee/0x100
...
Call Trace:
process_one_work+0x18c/0x3a0
worker_thread+0x30/0x380
? process_one_work+0x3a0/0x3a0
kthread+0x113/0x130
? kthread_create_worker_on_cpu+0x70/0x70
ret_from_fork+0x22/0x40
---[ end trace 41bd884127493e39 ]---

then write a program to test timer latency, it can reproduce this issue

   static void sched_l_tick(struct timer_list *t)
   {
   unsigned long delta = jiffies - set_time;

   if (delta > 3*HZ)
   printk("abnormal %ld %d\n", delta, raw_smp_processor_id());

   set_time = jiffies+HZ;
   mod_timer(t, jiffies + HZ);
}

further investigation shows that jiffies may change while advancing this
base clk; reading jiffies twice may leave base clk bigger than the true
next event, so the timer firing is skipped — therefore read jiffies once,

Signed-off-by: Li RongQing 
Signed-off-by: Liang ZhiCheng 
---
 kernel/time/timer.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 343c7ba33b1c..e2dbd0223635 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1593,24 +1593,27 @@ void timer_clear_idle(void)
 static int collect_expired_timers(struct timer_base *base,
  struct hlist_head *heads)
 {
+   unsigned long jnow;
+
+   jnow = READ_ONCE(jiffies);
/*
 * NOHZ optimization. After a long idle sleep we need to forward the
 * base to current jiffies. Avoid a loop by searching the bitfield for
 * the next expiring timer.
 */
-   if ((long)(jiffies - base->clk) > 2) {
+   if ((long)(jnow - base->clk) > 2) {
unsigned long next = __next_timer_interrupt(base);
 
/*
 * If the next timer is ahead of time forward to current
 * jiffies, otherwise forward to the next expiry time:
 */
-   if (time_after(next, jiffies)) {
+   if (time_after(next, jnow)) {
/*
 * The call site will increment base->clk and then
 * terminate the expiry loop immediately.
 */
-   base->clk = jiffies;
+   base->clk = jnow;
return 0;
}
base->clk = next;
-- 
2.16.2



答复: [PATCH] TTY: serial_core, add ->install

2019-04-17 Thread Li,Rongqing


> -邮件原件-
> 发件人: linux-kernel-ow...@vger.kernel.org
> [mailto:linux-kernel-ow...@vger.kernel.org] 代表 Jiri Slaby
> 发送时间: 2019年4月17日 16:59
> 收件人: gre...@linuxfoundation.org
> 抄送: linux-ser...@vger.kernel.org; linux-kernel@vger.kernel.org; Jiri Slaby
> ; Li,Rongqing ; Wang,Li(ACG Cloud)
> ; Zhang,Yu(ACG Cloud) ;
> stable 
> 主题: [PATCH] TTY: serial_core, add ->install
> 
> We need to compute the uart state only on the first open. This is usually 
> what is
> done in the ->install hook. serial_core used to do this in ->open on every 
> open.
> So move it to ->install.
> 
> As a side effect, it ensures the state is set properly in the window after
> tty_init_dev is called, but before uart_open. This fixes a bunch of races
> between tty_open and flush_to_ldisc we were dealing with recently.
> 
> One of such bugs was attempted to fix in commit fedb5760648a (serial:
> fix race between flush_to_ldisc and tty_open), but it only took care of a 
> couple
> of functions (uart_start and uart_unthrottle).  I was able to reproduce the
> crash on a SLE system, but in uart_write_room which is also called from
> flush_to_ldisc via process_echoes. I was *unable* to reproduce the bug 
> locally.
> It is due to having this patch in my queue since 2012!
> 
>  general protection fault:  [#1] SMP KASAN PTI
>  CPU: 1 PID: 5 Comm: kworker/u4:0 Tainted: G L
> 4.12.14-396-default #1 SLE15-SP1 (unreleased)
>  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
> rel-1.12.0-0-ga698c89-prebuilt.qemu.org 04/01/2014
>  Workqueue: events_unbound flush_to_ldisc
>  task: 8800427d8040 task.stack: 8800427f
>  RIP: 0010:uart_write_room+0xc4/0x590
>  RSP: 0018:8800427f7088 EFLAGS: 00010202
>  RAX: dc00 RBX:  RCX: 
>  RDX: 002f RSI: 00ee RDI: 88003888bd90
>  RBP: b9545850 R08: 0001 R09: 0400
>  R10: 8800427d825c R11: 006e R12: 1100084fee12
>  R13: c94c5000 R14: 88003888bb28 R15: 0178
>  FS:  () GS:88004330()
> knlGS:
>  CS:  0010 DS:  ES:  CR0: 80050033
>  CR2: 561da0794148 CR3: 0ebf4000 CR4: 06e0
> Call Trace:
>   tty_write_room+0x6d/0xc0
>   __process_echoes+0x55/0x870
>   n_tty_receive_buf_common+0x105e/0x26d0
>   tty_ldisc_receive_buf+0xb7/0x1c0
>   tty_port_default_receive_buf+0x107/0x180
>   flush_to_ldisc+0x35d/0x5c0
> ...
> 
> 0 in rbx means tty->driver_data is NULL in uart_write_room. 0x178 is tried to
> be dereferenced (0x178 >> 3 is 0x2f in rdx) at uart_write_room+0xc4. 0x178 is
> exactly (struct uart_state *)NULL->refcount used in uart_port_lock from
> uart_write_room.
> 
> So revert the upstream commit here as my local patch should fix the whole
> family.
> 
> Signed-off-by: Jiri Slaby 
> Cc: Li RongQing 
> Cc: Wang Li 
> Cc: Zhang Yu 
> Cc: Greg Kroah-Hartman 
> Cc: stable 
> ---
> 
> = NOTE =
> 
> Could you test your use-case at Baidu, guys, please?
> 

Sorry, we do not have the environment to test it; it happens when we upgrade the BMC

-RongQing



[PATCH] mm, slab: remove unneed check in cpuup_canceled

2019-03-21 Thread Li RongQing
nc is a member of the percpu-allocated memory and can never be NULL

Signed-off-by: Li RongQing 
---
 mm/slab.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index 28652e4218e0..f1420e14875a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -990,10 +990,8 @@ static void cpuup_canceled(long cpu)
 
/* cpu is dead; no one can alloc from it. */
nc = per_cpu_ptr(cachep->cpu_cache, cpu);
-   if (nc) {
-   free_block(cachep, nc->entry, nc->avail, node, );
-   nc->avail = 0;
-   }
+   free_block(cachep, nc->entry, nc->avail, node, );
+   nc->avail = 0;
 
if (!cpumask_empty(mask)) {
spin_unlock_irq(>list_lock);
-- 
2.16.2



[PATCH][v2] ipc: prevent lockup on alloc_msg and free_msg

2019-03-07 Thread Li RongQing
From: Li Rongqing 

msgctl10 of ltp triggers the following lockup when CONFIG_KASAN
is enabled on large-memory SMP systems: page initialization can
take a long time if msgctl10 requests a huge block of memory, and
it will block the rcu scheduler, so release the cpu actively.

After adding schedule() in free_msg, free_msg can no longer be called
while holding a spinlock, so add the msgs to a tmp list and free them
outside the spinlock

[79441.630467] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
[79441.637566] rcu: Tasks blocked on level-1 rcu_node (CPUs 16-31): P32505
[79441.645355] rcu: Tasks blocked on level-1 rcu_node (CPUs 48-63): P34978
[79441.653149] rcu: (detected by 11, t=35024 jiffies, g=44237529, 
q=16542267)
[79441.661247] msgctl10R  running task21608 32505   2794 0x0082
[79441.669455] Call Trace:
[79441.736659]  preempt_schedule_irq+0x4c/0xb0
[79441.741578]  retint_kernel+0x1b/0x2d
[79441.745796] RIP: 0010:__is_insn_slot_addr+0xfb/0x250
[79441.751595] Code: 82 1d 00 48 8b 9b 90 00 00 00 4c 89 f7 49 c1 ee 03 e8 59 
83 1d 00 48 b8 00 00 00 00 00 fc ff df 4c 39 eb 48 89 9d 58 ff ff ff <41> c6 04 
06 f8 74 66 4c 8d 75 98 4c 89 f1 48 c1 e9 03 48 01 c8 48
[79441.773232] RSP: 0018:88bce041f758 EFLAGS: 0246 ORIG_RAX: 
ff13
[79441.782071] RAX: dc00 RBX: 8471bc50 RCX: 828a2a57
[79441.790337] RDX: dc00 RSI: dc00 RDI: 88bce041f780
[79441.798612] RBP: 88bce041f828 R08: ed15f3f4c5b3 R09: ed15f3f4c5b3
[79441.806877] R10: 0001 R11: ed15f3f4c5b2 R12: 00318aee9b73
[79441.815139] R13: 8471bc50 R14: 11179c083ef0 R15: 11179c083eec
[79441.848618]  kernel_text_address+0xc1/0x100
[79441.853542]  __kernel_text_address+0xe/0x30
[79441.858453]  unwind_get_return_address+0x2f/0x50
[79441.863864]  __save_stack_trace+0x92/0x100
[79441.868742]  create_object+0x380/0x650
[79441.911831]  __kmalloc+0x14c/0x2b0
[79441.915874]  load_msg+0x38/0x1a0
[79441.919726]  do_msgsnd+0x19e/0xcf0
[79442.006475]  do_syscall_64+0x117/0x400
[79442.037964]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

[79386.022357] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
[79386.029455] rcu: Tasks blocked on level-1 rcu_node (CPUs 0-15): P32170
[79386.037146] rcu: (detected by 14, t=35016 jiffies, g=44237525, 
q=12423063)
[79386.045242] msgctl10R  running task21608 32170  32155 0x0082
[79386.053447] Call Trace:
[79386.107584]  preempt_schedule_irq+0x4c/0xb0
[79386.112495]  retint_kernel+0x1b/0x2d
[79386.116712] RIP: 0010:lock_acquire+0x4d/0x340
[79386.121816] Code: 48 81 ec c0 00 00 00 45 89 c6 4d 89 cf 48 8d 6c 24 20 48 
89 3c 24 48 8d bb e4 0c 00 00 89 74 24 0c 48 c7 44 24 20 b3 8a b5 41 <48> c1 ed 
03 48 c7 44 24 28 b4 25 18 84 48 c7 44 24 30 d0 54 7a 82
[79386.143446] RSP: 0018:88af83417738 EFLAGS: 0282 ORIG_RAX: 
ff13
[79386.152278] RAX: dc00 RBX: 88bd335f3080 RCX: 0002
[79386.160543] RDX:  RSI:  RDI: 88bd335f3d64
[79386.168798] RBP: 88af83417758 R08:  R09: 
[79386.177049] R10: 0001 R11: ed13f3f745b2 R12: 
[79386.185308] R13: 0002 R14:  R15: 
[79386.213791]  is_bpf_text_address+0x32/0xe0
[79386.223516]  kernel_text_address+0xec/0x100
[79386.233532]  __kernel_text_address+0xe/0x30
[79386.238448]  unwind_get_return_address+0x2f/0x50
[79386.243858]  __save_stack_trace+0x92/0x100
[79386.252648]  save_stack+0x32/0xb0
[79386.357923]  __kasan_slab_free+0x130/0x180
[79386.362745]  kfree+0xfa/0x2d0
[79386.366291]  free_msg+0x24/0x50
[79386.370020]  do_msgrcv+0x508/0xe60
[79386.446596]  do_syscall_64+0x117/0x400
[79386.478122]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

Signed-off-by: Li RongQing 
Signed-off-by: Zhang Yu 
---
v1 -->v2: move free_msg out of spinlock in mqueue_evict_inode
  move schedule() from loop end to loop start, since it does not 
isolate two kmallocs in loop end


 ipc/mqueue.c  | 10 --
 ipc/msgutil.c |  6 ++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index c0d58f390..bce7af154 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -391,7 +391,8 @@ static void mqueue_evict_inode(struct inode *inode)
struct user_struct *user;
unsigned long mq_bytes, mq_treesize;
struct ipc_namespace *ipc_ns;
-   struct msg_msg *msg;
+   struct msg_msg *msg, *nmsg;
+   LIST_HEAD(tmp_msg);
 
clear_inode(inode);
 
@@ -402,10 +403,15 @@ static void mqueue_evict_inode(struct inode *inode)
info = MQUEUE_I(inode);
spin_lock(>lock);
while ((msg = msg_get(info)) != NULL)
-   free_msg(msg);
+   list_add_tail(>m_list, _msg);
kfree(info->node_cache);
spin_unlock(>lock);
 
+   list_for_each_entry_s

答复: [PATCH] ipc: prevent lockup on alloc_msg and free_msg

2019-03-07 Thread Li,Rongqing


> -邮件原件-
> 发件人: linux-kernel-ow...@vger.kernel.org
> [mailto:linux-kernel-ow...@vger.kernel.org] 代表 Andrew Morton
> 发送时间: 2019年3月8日 2:10
> 收件人: Li,Rongqing 
> 抄送: linux-kernel@vger.kernel.org; Davidlohr Bueso ;
> Dominik Brodowski ; Manfred Spraul
> ; Arnd Bergmann 
> 主题: Re: [PATCH] ipc: prevent lockup on alloc_msg and free_msg
> 
> On Thu,  7 Mar 2019 16:10:22 +0800 Li RongQing 
> wrote:
> 
> > From: Li Rongqing 
> >
> > msgctl10 of ltp triggers the following lockup When CONFIG_KASAN is
> > enabled on large memory SMP systems, the pages initialization can take
> > a long time, if msgctl10 requests a huge block memory, and it will
> > block rcu scheduler, so release cpu actively.
> >
> > ...
> >
> > Signed-off-by: Zhang Yu 
> > Signed-off-by: Li RongQing 
> 
> This signoff ordering somewhat implies that Zhang Yu was the author.
> But you added "From: Li Rongqing", so you will be recorded as the patch's
> author.  Is this correct?
> 
Thanks for your review.
I will revert this order

> > --- a/ipc/msgutil.c
> > +++ b/ipc/msgutil.c
> > @@ -18,6 +18,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >
> >  #include "util.h"
> >
> > @@ -72,6 +73,7 @@ static struct msg_msg *alloc_msg(size_t len)
> > seg->next = NULL;
> > pseg = >next;
> > len -= alen;
> > +   cond_resched();
> > }
> 
> This looks OK.
> 
> > return msg;
> > @@ -178,5 +180,6 @@ void free_msg(struct msg_msg *msg)
> > struct msg_msgseg *tmp = seg->next;
> > kfree(seg);
> > seg = tmp;
> > +   cond_resched();
> > }
> 
> This does not.  mqueue_evict_inode() (at least) calls free_msg() from under
> spin_lock().


I will try to fix it by moving the free_msg() out of spinlock , thanks

-RongQing



[PATCH] ipc: prevent lockup on alloc_msg and free_msg

2019-03-07 Thread Li RongQing
From: Li Rongqing 

msgctl10 of ltp triggers the following lockup when CONFIG_KASAN
is enabled on large-memory SMP systems: page initialization can
take a long time if msgctl10 requests a huge block of memory, and
it will block the rcu scheduler, so release the cpu actively.

[79441.630467] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
[79441.637566] rcu: Tasks blocked on level-1 rcu_node (CPUs 16-31): P32505
[79441.645355] rcu: Tasks blocked on level-1 rcu_node (CPUs 48-63): P34978
[79441.653149] rcu: (detected by 11, t=35024 jiffies, g=44237529, 
q=16542267)
[79441.661247] msgctl10R  running task21608 32505   2794 0x0082
[79441.669455] Call Trace:
[79441.736659]  preempt_schedule_irq+0x4c/0xb0
[79441.741578]  retint_kernel+0x1b/0x2d
[79441.745796] RIP: 0010:__is_insn_slot_addr+0xfb/0x250
[79441.751595] Code: 82 1d 00 48 8b 9b 90 00 00 00 4c 89 f7 49 c1 ee 03 e8 59 
83 1d 00 48 b8 00 00 00 00 00 fc ff df 4c 39 eb 48 89 9d 58 ff ff ff <41> c6 04 
06 f8 74 66 4c 8d 75 98 4c 89 f1 48 c1 e9 03 48 01 c8 48
[79441.773232] RSP: 0018:88bce041f758 EFLAGS: 0246 ORIG_RAX: 
ff13
[79441.782071] RAX: dc00 RBX: 8471bc50 RCX: 828a2a57
[79441.790337] RDX: dc00 RSI: dc00 RDI: 88bce041f780
[79441.798612] RBP: 88bce041f828 R08: ed15f3f4c5b3 R09: ed15f3f4c5b3
[79441.806877] R10: 0001 R11: ed15f3f4c5b2 R12: 00318aee9b73
[79441.815139] R13: 8471bc50 R14: 11179c083ef0 R15: 11179c083eec
[79441.848618]  kernel_text_address+0xc1/0x100
[79441.853542]  __kernel_text_address+0xe/0x30
[79441.858453]  unwind_get_return_address+0x2f/0x50
[79441.863864]  __save_stack_trace+0x92/0x100
[79441.868742]  create_object+0x380/0x650
[79441.911831]  __kmalloc+0x14c/0x2b0
[79441.915874]  load_msg+0x38/0x1a0
[79441.919726]  do_msgsnd+0x19e/0xcf0
[79442.006475]  do_syscall_64+0x117/0x400
[79442.037964]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

[79386.022357] rcu: INFO: rcu_preempt detected stalls on CPUs/tasks:
[79386.029455] rcu: Tasks blocked on level-1 rcu_node (CPUs 0-15): P32170
[79386.037146] rcu: (detected by 14, t=35016 jiffies, g=44237525, 
q=12423063)
[79386.045242] msgctl10R  running task21608 32170  32155 0x0082
[79386.053447] Call Trace:
[79386.107584]  preempt_schedule_irq+0x4c/0xb0
[79386.112495]  retint_kernel+0x1b/0x2d
[79386.116712] RIP: 0010:lock_acquire+0x4d/0x340
[79386.121816] Code: 48 81 ec c0 00 00 00 45 89 c6 4d 89 cf 48 8d 6c 24 20 48 
89 3c 24 48 8d bb e4 0c 00 00 89 74 24 0c 48 c7 44 24 20 b3 8a b5 41 <48> c1 ed 
03 48 c7 44 24 28 b4 25 18 84 48 c7 44 24 30 d0 54 7a 82
[79386.143446] RSP: 0018:88af83417738 EFLAGS: 0282 ORIG_RAX: 
ff13
[79386.152278] RAX: dc00 RBX: 88bd335f3080 RCX: 0002
[79386.160543] RDX:  RSI:  RDI: 88bd335f3d64
[79386.168798] RBP: 88af83417758 R08:  R09: 
[79386.177049] R10: 0001 R11: ed13f3f745b2 R12: 
[79386.185308] R13: 0002 R14:  R15: 
[79386.213791]  is_bpf_text_address+0x32/0xe0
[79386.223516]  kernel_text_address+0xec/0x100
[79386.233532]  __kernel_text_address+0xe/0x30
[79386.238448]  unwind_get_return_address+0x2f/0x50
[79386.243858]  __save_stack_trace+0x92/0x100
[79386.252648]  save_stack+0x32/0xb0
[79386.357923]  __kasan_slab_free+0x130/0x180
[79386.362745]  kfree+0xfa/0x2d0
[79386.366291]  free_msg+0x24/0x50
[79386.370020]  do_msgrcv+0x508/0xe60
[79386.446596]  do_syscall_64+0x117/0x400
[79386.478122]  entry_SYSCALL_64_after_hwframe+0x49/0xbe

Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 ipc/msgutil.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ipc/msgutil.c b/ipc/msgutil.c
index 84598025a..b5d0fc27e 100644
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "util.h"
 
@@ -72,6 +73,7 @@ static struct msg_msg *alloc_msg(size_t len)
seg->next = NULL;
pseg = >next;
len -= alen;
+   cond_resched();
}
 
return msg;
@@ -178,5 +180,6 @@ void free_msg(struct msg_msg *msg)
struct msg_msgseg *tmp = seg->next;
kfree(seg);
seg = tmp;
+   cond_resched();
}
 }
-- 
2.16.2



答复: [PATCH] audit: fix a memleak caused by auditing load module

2019-03-05 Thread Li,Rongqing


> -邮件原件-
> 发件人: Paul Moore [mailto:p...@paul-moore.com]
> 发送时间: 2019年3月5日 22:18
> 收件人: Li,Rongqing 
> 抄送: Eric Paris ; linux-au...@redhat.com;
> linux-kernel@vger.kernel.org
> 主题: Re: [PATCH] audit: fix a memleak caused by auditing load module
> 
> On Tue, Mar 5, 2019 at 6:14 AM Li RongQing  wrote:
> > we should always free context->module.name, since it will be allocated
> > unconditionally and audit_log_start() can fail with other reasons, and
> > audit_log_exit maybe not called
> >
> > unreferenced object 0x88af90837d20 (size 8):
> >   comm "modprobe", pid 1036, jiffies 4294704867 (age 3069.138s)
> >   hex dump (first 8 bytes):
> > 69 78 67 62 65 00 ff ff  ixgbe...
> >   backtrace:
> > [<08da28fe>] __audit_log_kern_module+0x33/0x80
> > [<c1491e61>] load_module+0x64f/0x3850
> > [<7fc9ae3f>] __do_sys_init_module+0x218/0x250
> > [<00d4a478>] do_syscall_64+0x117/0x400
> > [<4924ded8>] entry_SYSCALL_64_after_hwframe+0x49/0xbe
> > [<00007dc331dd>] 0x
> >
> > Fixes: ca86cad7380e3 ("audit: log module name on init_module")
> > Signed-off-by: Zhang Yu 
> > Signed-off-by: Li RongQing 
> > ---
> >  kernel/auditsc.c | 22 --
> >  1 file changed, 20 insertions(+), 2 deletions(-)
> >
> > diff --git a/kernel/auditsc.c b/kernel/auditsc.c index
> > b2d1f043f..2bd80375f 100644
> > --- a/kernel/auditsc.c
> > +++ b/kernel/auditsc.c
> > @@ -1186,8 +1186,13 @@ static void show_special(struct audit_context
> *context, int *call_panic)
> > int i;
> >
> > ab = audit_log_start(context, GFP_KERNEL, context->type);
> > -   if (!ab)
> > +   if (!ab) {
> > +   if (context->type == AUDIT_KERN_MODULE) {
> > +   kfree(context->module.name);
> > +   context->module.name = NULL;
> > +   }
> > return;
> > +   }
> 
> Hello.
> 
> Thanks for the patch, but I have to ask if you've considered freeing the 
> module
> name in audit_free_context()?  That seems like the correct way to solve this
> issue.
> 

It does not work that move the freeing of module.name in audit_free_context

Since we should free module.name based on context->types is AUDIT_KERN_MODULE, 
but __audit_syscall_exit is called first, and will set context->type to 0,
When audit_free_context is called, context->type is 0, will cause to fail.

I will change this patches as below:

commit ee32ec2354b47a824e5e63d4f46567d577a02824 (HEAD -> master)
Author: Li RongQing 
Date:   Tue Mar 5 15:42:09 2019 +0800

audit: fix a memleak caused by auditing load module

module.name is allocated unconditionally when auditing a module
load, but audit_log_start() can fail for other reasons, or
audit_log_exit may not be called, so module.name is never released

so always free module.name in audit_free_context

unreferenced object 0x88af90837d20 (size 8):
  comm "modprobe", pid 1036, jiffies 4294704867 (age 3069.138s)
  hex dump (first 8 bytes):
69 78 67 62 65 00 ff ff  ixgbe...
  backtrace:
[<08da28fe>] __audit_log_kern_module+0x33/0x80
[<c1491e61>] load_module+0x64f/0x3850
[<7fc9ae3f>] __do_sys_init_module+0x218/0x250
[<00d4a478>] do_syscall_64+0x117/0x400
[<4924ded8>] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[<7dc331dd>] 0x

Fixes: ca86cad7380e3 ("audit: log module name on init_module")
Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b2d1f043f..07728b07a 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -964,6 +964,9 @@ int audit_alloc(struct task_struct *tsk)
 
 static inline void audit_free_context(struct audit_context *context)
 {
+   if (context->type == AUDIT_KERN_MODULE)
+   kfree(context->module.name);
+
audit_free_names(context);
unroll_tree_refs(context, NULL, 0);
free_tree_refs(context);
@@ -1282,6 +1285,8 @@ static void show_special(struct audit_context *context, 
int *call_panic)
if (context->module.name) {
audit_log_untrustedstring(ab, context->module.name);
kfree(context->module.name);
+   context->module.name = NULL;
+   context->type = 0;
} else
 

[PATCH] audit: fix a memleak caused by auditing load module

2019-03-05 Thread Li RongQing
we should always free context->module.name, since it is
allocated unconditionally, audit_log_start() can fail for
other reasons, and audit_log_exit may not be called

unreferenced object 0x88af90837d20 (size 8):
  comm "modprobe", pid 1036, jiffies 4294704867 (age 3069.138s)
  hex dump (first 8 bytes):
69 78 67 62 65 00 ff ff  ixgbe...
  backtrace:
[<08da28fe>] __audit_log_kern_module+0x33/0x80
[<c1491e61>] load_module+0x64f/0x3850
[<7fc9ae3f>] __do_sys_init_module+0x218/0x250
[<00d4a478>] do_syscall_64+0x117/0x400
[<4924ded8>] entry_SYSCALL_64_after_hwframe+0x49/0xbe
[<7dc331dd>] 0x

Fixes: ca86cad7380e3 ("audit: log module name on init_module")
Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 kernel/auditsc.c | 22 --
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b2d1f043f..2bd80375f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1186,8 +1186,13 @@ static void show_special(struct audit_context *context, 
int *call_panic)
int i;
 
ab = audit_log_start(context, GFP_KERNEL, context->type);
-   if (!ab)
+   if (!ab) {
+   if (context->type == AUDIT_KERN_MODULE) {
+   kfree(context->module.name);
+   context->module.name = NULL;
+   }
return;
+   }
 
switch (context->type) {
case AUDIT_SOCKETCALL: {
@@ -1354,8 +1359,15 @@ static void audit_log_exit(struct audit_context 
*context, struct task_struct *ts
context->personality = tsk->personality;
 
ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
-   if (!ab)
+
+   if (!ab) {
+   if (context->type == AUDIT_KERN_MODULE) {
+   kfree(context->module.name);
+   context->module.name = NULL;
+   }
return; /* audit_panic has been called */
+   }
+
audit_log_format(ab, "arch=%x syscall=%d",
 context->arch, context->major);
if (context->personality != PER_LINUX)
@@ -1576,6 +1588,12 @@ void __audit_syscall_exit(int success, long return_code)
 
if (context->in_syscall && context->current_state == 
AUDIT_RECORD_CONTEXT)
audit_log_exit(context, current);
+   else {
+   if (context->type == AUDIT_KERN_MODULE) {
+   kfree(context->module.name);
+   context->module.name = NULL;
+   }
+   }
 
context->in_syscall = 0;
context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
-- 
2.16.2



[PATCH][v2] time: Introduce jiffies64_to_msecs()

2019-02-27 Thread Li RongQing
there is a similar helper in net/netfilter/nf_tables_api.c;
this may become a commonly needed helper someday, so move it to
time.c

Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 v1-->v2: using jiffies64_to_msecs in nf_tables_api.c

 include/linux/jiffies.h   |  1 +
 kernel/time/time.c| 10 ++
 net/netfilter/nf_tables_api.c |  4 +---
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index fa928242567d..1b6d31da7cbc 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -297,6 +297,7 @@ static inline u64 jiffies_to_nsecs(const unsigned long j)
 }
 
 extern u64 jiffies64_to_nsecs(u64 j);
+extern u64 jiffies64_to_msecs(u64 j);
 
 extern unsigned long __msecs_to_jiffies(const unsigned int m);
 #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 2edb5088a70b..0083eb711fb7 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -719,6 +719,16 @@ u64 jiffies64_to_nsecs(u64 j)
 }
 EXPORT_SYMBOL(jiffies64_to_nsecs);
 
+u64 jiffies64_to_msecs(const u64 j)
+{
+#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
+   return (MSEC_PER_SEC / HZ) * j;
+#else
+   return div_u64(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN);
+#endif
+}
+EXPORT_SYMBOL(jiffies64_to_msecs);
+
 /**
  * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
  *
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index e1a88ba2249e..8763b2798788 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3184,9 +3184,7 @@ static int nf_msecs_to_jiffies64(const struct nlattr 
*nla, u64 *result)
 
 static __be64 nf_jiffies64_to_msecs(u64 input)
 {
-   u64 ms = jiffies64_to_nsecs(input);
-
-   return cpu_to_be64(div_u64(ms, NSEC_PER_MSEC));
+   return cpu_to_be64(jiffies64_to_msecs(input));
 }
 
 static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
-- 
2.16.2



答复: [PATCH] time: Introduce jiffies64_to_msecs()

2019-02-27 Thread Li,Rongqing


> -邮件原件-
> 发件人: Thomas Gleixner [mailto:t...@linutronix.de]
> 发送时间: 2019年2月27日 19:18
> 收件人: Li,Rongqing 
> 抄送: netfilter-de...@vger.kernel.org; linux-kernel@vger.kernel.org;
> sb...@kernel.org; john.stu...@linaro.org
> 主题: Re: [PATCH] time: Introduce jiffies64_to_msecs()
> 
> On Wed, 27 Feb 2019, Li RongQing wrote:
> 
> > there is a similar helper in net/netfilter/nf_tables_api.c, this maybe
> > become a common request someday.
> 
> Maybe is not a really good justification for adding that. At least you should
> provide a patch which replaces the instance in the netfilter code.
> 
> Thanks,
> 


OK, I will send V2

Thanks
-rongqing



[PATCH] time: Introduce jiffies64_to_msecs()

2019-02-27 Thread Li RongQing
there is a similar helper in net/netfilter/nf_tables_api.c;
this may become a commonly needed helper someday.

it also avoids calling div_u64 in some configurations

Signed-off-by: Li RongQing 
---
 include/linux/jiffies.h |  1 +
 kernel/time/time.c  | 10 ++
 2 files changed, 11 insertions(+)

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index fa928242567d..1b6d31da7cbc 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -297,6 +297,7 @@ static inline u64 jiffies_to_nsecs(const unsigned long j)
 }
 
 extern u64 jiffies64_to_nsecs(u64 j);
+extern u64 jiffies64_to_msecs(u64 j);
 
 extern unsigned long __msecs_to_jiffies(const unsigned int m);
 #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 2edb5088a70b..0083eb711fb7 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -719,6 +719,16 @@ u64 jiffies64_to_nsecs(u64 j)
 }
 EXPORT_SYMBOL(jiffies64_to_nsecs);
 
+u64 jiffies64_to_msecs(const u64 j)
+{
+#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
+   return (MSEC_PER_SEC / HZ) * j;
+#else
+   return div_u64(j * HZ_TO_MSEC_NUM, HZ_TO_MSEC_DEN);
+#endif
+}
+EXPORT_SYMBOL(jiffies64_to_msecs);
+
 /**
  * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
  *
-- 
2.16.2



答复: [PATCH] XArray tests: allocation has to be GFP_ATOMIC under rcu_read_lock

2019-02-10 Thread Li,Rongqing


> -邮件原件-
> 发件人: Matthew Wilcox [mailto:wi...@infradead.org]
> 发送时间: 2019年2月3日 7:20
> 收件人: Li,Rongqing 
> 抄送: linux-kernel@vger.kernel.org
> 主题: Re: [PATCH] XArray tests: allocation has to be GFP_ATOMIC under
> rcu_read_lock
> 
> On Tue, Jan 29, 2019 at 07:08:42PM +0800, Li RongQing wrote:
> > -   XA_BUG_ON(xa, xa_store_index(xa, 1, GFP_KERNEL) != NULL);
> > +   XA_BUG_ON(xa, xa_store_index(xa, 1, GFP_ATOMIC) != NULL);
> 
> Did you try running this change in userspace too?  I don't think it'll work.
> 
> I think the right change would be:
> 

Could you send this patch?

Thanks

-Li RongQing 


> +++ b/lib/test_xarray.c
> @@ -107,8 +107,11 @@ static noinline void check_xas_retry(struct xarray
> *xa)
> XA_BUG_ON(xa, xas.xa_node != XAS_RESTART);
> XA_BUG_ON(xa, xas_next_entry(&xas, ULONG_MAX) !=
> xa_mk_value(0));
> XA_BUG_ON(xa, xas.xa_node != NULL);
> +   rcu_read_unlock();
> 
> XA_BUG_ON(xa, xa_store_index(xa, 1, GFP_KERNEL) != NULL);
> +
> +   rcu_read_lock();
> XA_BUG_ON(xa, !xa_is_internal(xas_reload(&xas)));
> xas.xa_node = XAS_RESTART;
> XA_BUG_ON(xa, xas_next_entry(&xas, ULONG_MAX) !=
> xa_mk_value(0));



答复: [PATCH][V5] tty: fix race between flush_to_ldisc and tty_open

2019-01-31 Thread Li,Rongqing


> -邮件原件-
> 发件人: linux-kernel-ow...@vger.kernel.org
> [mailto:linux-kernel-ow...@vger.kernel.org] 代表 Greg Kroah-Hartman
> 发送时间: 2019年1月31日 18:55
> 收件人: Li,Rongqing 
> 抄送: linux-ser...@vger.kernel.org; linux-kernel@vger.kernel.org;
> jsl...@suse.com; gko...@codeaurora.org
> 主题: Re: [PATCH][V5] tty: fix race between flush_to_ldisc and tty_open
> 
> On Thu, Jan 31, 2019 at 05:43:16PM +0800, Li RongQing wrote:
> > There still is a race window after the commit b027e2298bd588
> > ("tty: fix data race between tty_init_dev and flush of buf"), and we
> > encountered this crash issue if receive_buf call comes before tty
> > initialization completes in tty_open and
> > tty->driver_data may be NULL.
> >
> > CPU0CPU1
> > 
> >   tty_open
> >tty_init_dev
> >  tty_ldisc_unlock
> >schedule flush_to_ldisc
> > receive_buf
> >   tty_port_default_receive_buf
> >tty_ldisc_receive_buf
> > n_tty_receive_buf_common
> >   __receive_buf
> >uart_flush_chars
> > uart_start
> > /*tty->driver_data is NULL*/
> >tty->ops->open
> >/*init tty->driver_data*/
> >
> > it can be fixed by extending ldisc semaphore lock in tty_init_dev to
> > driver_data initialized completely after tty->ops->open(), but this
> > will lead to get lock on one function and unlock in some other
> > function, and hard to maintain, so fix this race only by checking
> > tty->driver_data when receiving, and return if tty->driver_data
> > is NULL, and n_tty_receive_buf_common maybe calls uart_unthrottle, so
> > add the same check
> >
> > Signed-off-by: Wang Li 
> > Signed-off-by: Zhang Yu 
> > Signed-off-by: Li RongQing 
> > ---
> > V5: move check into uart_start from n_tty_receive_buf_common
> > V4: add version information
> > V3: not used ldisc semaphore lock, only checking tty->driver_data with
> > NULL
> > V2: fix building error by EXPORT_SYMBOL tty_ldisc_unlock
> > V1: extend ldisc lock to protect that tty->driver_data is inited
> >
> >  drivers/tty/serial/serial_core.c | 6 ++
> >  1 file changed, 6 insertions(+)
> >
> > diff --git a/drivers/tty/serial/serial_core.c
> > b/drivers/tty/serial/serial_core.c
> > index 5c01bb6d1c24..556f50aa1b58 100644
> > --- a/drivers/tty/serial/serial_core.c
> > +++ b/drivers/tty/serial/serial_core.c
> > @@ -130,6 +130,9 @@ static void uart_start(struct tty_struct *tty)
> > struct uart_port *port;
> > unsigned long flags;
> >
> > +   if (!state)
> > +   return;
> > +
> > port = uart_port_lock(state, flags);
> > __uart_start(tty);
> > uart_port_unlock(port, flags);
> > @@ -727,6 +730,9 @@ static void uart_unthrottle(struct tty_struct *tty)
> > upstat_t mask = UPSTAT_SYNC_FIFO;
> > struct uart_port *port;
> >
> > +   if (!state)
> > +   return;
> > +
> > port = uart_port_ref(state);
> > if (!port)
> > return;
> > --
> > 2.16.2
> 
> 
> Hm, I wrote this patch, not you, right?  So shouldn't I get the
> credit/blame for it?  :)
> 

Welcome you to add your credit/blame/signature
and I am not clear about the rule, and am afraid it would look fake

> Also, this is a bug in the serial code, not necessarily the tty layer,
> so the subject should change...
> 
> And you did test this, right?

I add some delay in tty_init_dev to simulate this issue.  it can fix this my 
issue.

Thanks

-RongQing

> 
> thanks,
> 
> greg k-h


[PATCH][V5] tty: fix race between flush_to_ldisc and tty_open

2019-01-31 Thread Li RongQing
There still is a race window after the commit b027e2298bd588
("tty: fix data race between tty_init_dev and flush of buf"),
and we encountered this crash issue if receive_buf call comes
before tty initialization completes in tty_open and
tty->driver_data may be NULL.

CPU0CPU1

  tty_open
   tty_init_dev
 tty_ldisc_unlock
   schedule
flush_to_ldisc
 receive_buf
  tty_port_default_receive_buf
   tty_ldisc_receive_buf
n_tty_receive_buf_common
  __receive_buf
   uart_flush_chars
uart_start
/*tty->driver_data is NULL*/
   tty->ops->open
   /*init tty->driver_data*/

it can be fixed by extending ldisc semaphore lock in tty_init_dev
to driver_data initialized completely after tty->ops->open(), but
this will lead to get lock on one function and unlock in some other
function, and hard to maintain, so fix this race only by checking
tty->driver_data when receiving, and return if tty->driver_data
is NULL, and n_tty_receive_buf_common maybe calls uart_unthrottle,
so add the same check

Signed-off-by: Wang Li 
Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
V5: move check into uart_start from n_tty_receive_buf_common
V4: add version information
V3: not used ldisc semaphore lock, only checking tty->driver_data with NULL
V2: fix building error by EXPORT_SYMBOL tty_ldisc_unlock
V1: extend ldisc lock to protect that tty->driver_data is inited

 drivers/tty/serial/serial_core.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
index 5c01bb6d1c24..556f50aa1b58 100644
--- a/drivers/tty/serial/serial_core.c
+++ b/drivers/tty/serial/serial_core.c
@@ -130,6 +130,9 @@ static void uart_start(struct tty_struct *tty)
struct uart_port *port;
unsigned long flags;
 
+   if (!state)
+   return;
+
port = uart_port_lock(state, flags);
__uart_start(tty);
uart_port_unlock(port, flags);
@@ -727,6 +730,9 @@ static void uart_unthrottle(struct tty_struct *tty)
upstat_t mask = UPSTAT_SYNC_FIFO;
struct uart_port *port;
 
+   if (!state)
+   return;
+
port = uart_port_ref(state);
if (!port)
return;
-- 
2.16.2



答复: 答复: 答复: [PATCH][v4] tty: fix race between flush_to_ldisc and tty_open

2019-01-30 Thread Li,Rongqing


> -邮件原件-
> 发件人: Greg KH [mailto:gre...@linuxfoundation.org]
> 发送时间: 2019年1月31日 14:52
> 收件人: Li,Rongqing 
> 抄送: jsl...@suse.com; linux-kernel@vger.kernel.org; gko...@codeaurora.org;
> linux-ser...@vger.kernel.org
> 主题: Re: 答复: 答复: [PATCH][v4] tty: fix race between flush_to_ldisc and
> tty_open
> 
> On Thu, Jan 31, 2019 at 02:15:35AM +, Li,Rongqing wrote:
> >
> >
> > > -邮件原件-
> > > 发件人: Greg KH [mailto:gre...@linuxfoundation.org]
> > > 发送时间: 2019年1月30日 21:17
> > > 收件人: Li,Rongqing 
> > > 抄送: jsl...@suse.com; linux-kernel@vger.kernel.org;
> > > gko...@codeaurora.org; linux-ser...@vger.kernel.org
> > > 主题: Re: 答复: [PATCH][v4] tty: fix race between flush_to_ldisc and
> > > tty_open
> > >
> > > On Wed, Jan 30, 2019 at 12:48:42PM +, Li,Rongqing wrote:
> > > >
> > > >
> > > > > -邮件原件-
> > > > > 发件人: linux-kernel-ow...@vger.kernel.org
> > > > > [mailto:linux-kernel-ow...@vger.kernel.org] 代表 Greg KH
> > > > > 发送时间: 2019年1月30日 18:19
> > > > > 收件人: Li,Rongqing 
> > > > > 抄送: jsl...@suse.com; linux-kernel@vger.kernel.org;
> > > > > gko...@codeaurora.org
> > > > > 主题: Re: [PATCH][v4] tty: fix race between flush_to_ldisc and
> > > > > tty_open
> > > > >
> > > > > On Fri, Jan 18, 2019 at 05:27:17PM +0800, Li RongQing wrote:
> > > > > > There still is a race window after the commit b027e2298bd588
> > > > > > ("tty: fix data race between tty_init_dev and flush of buf"),
> > > > > > and we encountered this crash issue if receive_buf call comes
> > > > > > before tty initialization completes in n_tty_open and
> > > > > > tty->driver_data may be NULL.
> > > > > >
> > > > > > CPU0CPU1
> > > > > > 
> > > > > >  n_tty_open
> > > > > >tty_init_dev
> > > > > >  tty_ldisc_unlock
> > > > > >schedule
> flush_to_ldisc
> > > > > > receive_buf
> > > > > >   tty_port_default_receive_buf
> > > > > >tty_ldisc_receive_buf
> > > > > > n_tty_receive_buf_common
> > > > > >   __receive_buf
> > > > > >uart_flush_chars
> > > > > > uart_start
> > > > > > /*tty->driver_data is NULL*/
> > > > > >tty->ops->open
> > > > > >/*init tty->driver_data*/
> > > > > >
> > > > > > it can be fixed by extending ldisc semaphore lock in
> > > > > > tty_init_dev to driver_data initialized completely after
> > > > > > tty->ops->open(), but this will lead to put lock on one
> > > > > > function and unlock in some other function, and hard to
> > > > > > maintain, so fix this race only by checking
> > > > > > tty->driver_data when receiving, and return if
> > > > > > tty->tty->driver_data
> > > > > > is NULL
> > > > > >
> > > > > > Signed-off-by: Wang Li 
> > > > > > Signed-off-by: Zhang Yu 
> > > > > > Signed-off-by: Li RongQing 
> > > > > > ---
> > > > > > V4: add version information
> > > > > > V3: not used ldisc semaphore lock, only checking
> > > > > > tty->driver_data with NULL
> > > > > > V2: fix building error by EXPORT_SYMBOL tty_ldisc_unlock
> > > > > > V1: extend ldisc lock to protect that tty->driver_data is
> > > > > > inited
> > > > > >
> > > > > > drivers/tty/tty_port.c | 3 +++
> > > > > >  1 file changed, 3 insertions(+)
> > > > > >
> > > > > > diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c
> > > > > > index
> > > > > > 044c3cbdcfa4..86d0bec38322 100644
> > > > > > --- a/drivers/tty/tty_port.c
> > > > > > +++ b/drivers/tty/tty_port.c
> > > > > > @@ -31,6 +31,9 @@ static int
> > > > 

答复: 答复: [PATCH][v4] tty: fix race between flush_to_ldisc and tty_open

2019-01-30 Thread Li,Rongqing


> -邮件原件-
> 发件人: Greg KH [mailto:gre...@linuxfoundation.org]
> 发送时间: 2019年1月30日 21:17
> 收件人: Li,Rongqing 
> 抄送: jsl...@suse.com; linux-kernel@vger.kernel.org; gko...@codeaurora.org;
> linux-ser...@vger.kernel.org
> 主题: Re: 答复: [PATCH][v4] tty: fix race between flush_to_ldisc and tty_open
> 
> On Wed, Jan 30, 2019 at 12:48:42PM +, Li,Rongqing wrote:
> >
> >
> > > -邮件原件-
> > > 发件人: linux-kernel-ow...@vger.kernel.org
> > > [mailto:linux-kernel-ow...@vger.kernel.org] 代表 Greg KH
> > > 发送时间: 2019年1月30日 18:19
> > > 收件人: Li,Rongqing 
> > > 抄送: jsl...@suse.com; linux-kernel@vger.kernel.org;
> > > gko...@codeaurora.org
> > > 主题: Re: [PATCH][v4] tty: fix race between flush_to_ldisc and
> > > tty_open
> > >
> > > On Fri, Jan 18, 2019 at 05:27:17PM +0800, Li RongQing wrote:
> > > > There still is a race window after the commit b027e2298bd588
> > > > ("tty: fix data race between tty_init_dev and flush of buf"), and
> > > > we encountered this crash issue if receive_buf call comes before
> > > > tty initialization completes in n_tty_open and
> > > > tty->driver_data may be NULL.
> > > >
> > > > CPU0CPU1
> > > > 
> > > >  n_tty_open
> > > >tty_init_dev
> > > >  tty_ldisc_unlock
> > > >schedule flush_to_ldisc
> > > > receive_buf
> > > >   tty_port_default_receive_buf
> > > >tty_ldisc_receive_buf
> > > > n_tty_receive_buf_common
> > > >   __receive_buf
> > > >uart_flush_chars
> > > > uart_start
> > > > /*tty->driver_data is NULL*/
> > > >tty->ops->open
> > > >/*init tty->driver_data*/
> > > >
> > > > it can be fixed by extending ldisc semaphore lock in tty_init_dev
> > > > to driver_data initialized completely after tty->ops->open(), but
> > > > this will lead to put lock on one function and unlock in some
> > > > other function, and hard to maintain, so fix this race only by
> > > > checking
> > > > tty->driver_data when receiving, and return if tty->driver_data
> > > > is NULL
> > > >
> > > > Signed-off-by: Wang Li 
> > > > Signed-off-by: Zhang Yu 
> > > > Signed-off-by: Li RongQing 
> > > > ---
> > > > V4: add version information
> > > > V3: not used ldisc semaphore lock, only checking tty->driver_data
> > > > with NULL
> > > > V2: fix building error by EXPORT_SYMBOL tty_ldisc_unlock
> > > > V1: extend ldisc lock to protect that tty->driver_data is inited
> > > >
> > > > drivers/tty/tty_port.c | 3 +++
> > > >  1 file changed, 3 insertions(+)
> > > >
> > > > diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c index
> > > > 044c3cbdcfa4..86d0bec38322 100644
> > > > --- a/drivers/tty/tty_port.c
> > > > +++ b/drivers/tty/tty_port.c
> > > > @@ -31,6 +31,9 @@ static int tty_port_default_receive_buf(struct
> > > > tty_port
> > > *port,
> > > > if (!tty)
> > > > return 0;
> > > >
> > > > +   if (!tty->driver_data)
> > > > +   return 0;
> > > > +
> > >
> > > How is this working?  What is setting driver_data to NULL to "stop" this
> race?
> > >
> >
> >
> > if tty->driver_data is NULL and return,  tty_port_default_receive_buf
> > will not step to uart_start which access tty->driver_data and trigger
> > panic before tty_open, so it can fix the system panic
> >
> > > There's no requirement that a tty driver set this field to NULL when it is
> "done"
> > > with the tty device, so I think you are just getting lucky in that
> > > your specific driver happens to be doing this.
> > >
> >
> > when tty_open is running, tty is allocated by kzalloc in tty_init_dev
> > which called by tty_open_by_driver, tty is inited to 0
> >
> > > What driver are you testing this against?
> > >
> >
> > 8250
> 
> Ok, as this is specific to the uart core, how about this patch instead:
> 
> diff --git a/drivers/tty/serial/serial_core.c 
> b/drivers/tty/serial/serial_core.c
> index 5c01bb6d1c24..b56a6250df3f 100644
> --- a/drivers/tty/serial/serial_core.c
> +++ b/drivers/tty/serial/serial_core.c
> @@ -130,6 +130,9 @@ static void uart_start(struct tty_struct *tty)
>   struct uart_port *port;
>   unsigned long flags;
> 
> + if (!state)
> + return;
> +
>   port = uart_port_lock(state, flags);
>   __uart_start(tty);
>   uart_port_unlock(port, flags);


If we move the check into uart_start, I am afraid it may not fully fix this
issue,
Since n_tty_receive_buf_common maybe call n_tty_check_throttle/ 
tty_unthrottle_safe which maybe use the tty->driver_data

if tty is not fully opened, I think no gain to step into more function

thanks

-RongQing


答复: [PATCH][v4] tty: fix race between flush_to_ldisc and tty_open

2019-01-30 Thread Li,Rongqing


> -邮件原件-
> 发件人: linux-kernel-ow...@vger.kernel.org
> [mailto:linux-kernel-ow...@vger.kernel.org] 代表 Greg KH
> 发送时间: 2019年1月30日 18:19
> 收件人: Li,Rongqing 
> 抄送: jsl...@suse.com; linux-kernel@vger.kernel.org; gko...@codeaurora.org
> 主题: Re: [PATCH][v4] tty: fix race between flush_to_ldisc and tty_open
> 
> On Fri, Jan 18, 2019 at 05:27:17PM +0800, Li RongQing wrote:
> > There still is a race window after the commit b027e2298bd588
> > ("tty: fix data race between tty_init_dev and flush of buf"), and we
> > encountered this crash issue if receive_buf call comes before tty
> > initialization completes in n_tty_open and
> > tty->driver_data may be NULL.
> >
> > CPU0CPU1
> > 
> >  n_tty_open
> >tty_init_dev
> >  tty_ldisc_unlock
> >schedule flush_to_ldisc
> > receive_buf
> >   tty_port_default_receive_buf
> >tty_ldisc_receive_buf
> > n_tty_receive_buf_common
> >   __receive_buf
> >uart_flush_chars
> > uart_start
> > /*tty->driver_data is NULL*/
> >tty->ops->open
> >/*init tty->driver_data*/
> >
> > it can be fixed by extending ldisc semaphore lock in tty_init_dev to
> > driver_data initialized completely after tty->ops->open(), but this
> > will lead to put lock on one function and unlock in some other
> > function, and hard to maintain, so fix this race only by checking
> > tty->driver_data when receiving, and return if tty->driver_data
> > is NULL
> >
> > Signed-off-by: Wang Li 
> > Signed-off-by: Zhang Yu 
> > Signed-off-by: Li RongQing 
> > ---
> > V4: add version information
> > V3: not used ldisc semaphore lock, only checking tty->driver_data with
> > NULL
> > V2: fix building error by EXPORT_SYMBOL tty_ldisc_unlock
> > V1: extend ldisc lock to protect that tty->driver_data is inited
> >
> > drivers/tty/tty_port.c | 3 +++
> >  1 file changed, 3 insertions(+)
> >
> > diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c index
> > 044c3cbdcfa4..86d0bec38322 100644
> > --- a/drivers/tty/tty_port.c
> > +++ b/drivers/tty/tty_port.c
> > @@ -31,6 +31,9 @@ static int tty_port_default_receive_buf(struct tty_port
> *port,
> > if (!tty)
> > return 0;
> >
> > +   if (!tty->driver_data)
> > +   return 0;
> > +
> 
> How is this working?  What is setting driver_data to NULL to "stop" this race?
> 


if tty->driver_data is NULL and return,  tty_port_default_receive_buf will not 
step to
uart_start which access tty->driver_data and trigger panic before tty_open, so 
it can
fix the system panic

> There's no requirement that a tty driver set this field to NULL when it is 
> "done"
> with the tty device, so I think you are just getting lucky in that your 
> specific
> driver happens to be doing this.
> 

when tty_open is running, tty is allocated by kzalloc in tty_init_dev which 
called
by tty_open_by_driver, tty is inited to 0

> What driver are you testing this against?
> 

8250

Thanks

-RongQing

> thanks,
> 
> greg k-h


答复: [PATCH][v4] tty: fix race between flush_to_ldisc and tty_open

2019-01-30 Thread Li,Rongqing


> -邮件原件-
> 发件人: Kohli, Gaurav [mailto:gko...@codeaurora.org]
> 发送时间: 2019年1月18日 20:51
> 收件人: Li,Rongqing ; gre...@linuxfoundation.org;
> jsl...@suse.com; linux-kernel@vger.kernel.org
> 主题: Re: [PATCH][v4] tty: fix race between flush_to_ldisc and tty_open
> 
> 
> 
> On 1/18/2019 2:57 PM, Li RongQing wrote:
> > There still is a race window after the commit b027e2298bd588
> > ("tty: fix data race between tty_init_dev and flush of buf"), and we
> > encountered this crash issue if receive_buf call comes before tty
> > initialization completes in n_tty_open and
> > tty->driver_data may be NULL.
> >
> > CPU0CPU1
> > 
> >   n_tty_open
> > tty_init_dev
> >   tty_ldisc_unlock
> > schedule flush_to_ldisc
> >   receive_buf
> >tty_port_default_receive_buf
> > tty_ldisc_receive_buf
> >  n_tty_receive_buf_common
> >__receive_buf
> > uart_flush_chars
> >  uart_start
> >  /*tty->driver_data is NULL*/
> > tty->ops->open
> > /*init tty->driver_data*/
> >
> > it can be fixed by extending ldisc semaphore lock in tty_init_dev to
> > driver_data initialized completely after tty->ops->open(), but this
> > will lead to put lock on one function and unlock in some other
> > function, and hard to maintain, so fix this race only by checking
> > tty->driver_data when receiving, and return if tty->driver_data
> > is NULL
> >
> > Signed-off-by: Wang Li 
> > Signed-off-by: Zhang Yu 
> > Signed-off-by: Li RongQing 
> > ---
> > V4: add version information
> > V3: not used ldisc semaphore lock, only checking tty->driver_data with
> > NULL
> > V2: fix building error by EXPORT_SYMBOL tty_ldisc_unlock
> > V1: extend ldisc lock to protect that tty->driver_data is inited
> >
> > drivers/tty/tty_port.c | 3 +++
> >   1 file changed, 3 insertions(+)
> >
> > diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c index
> > 044c3cbdcfa4..86d0bec38322 100644
> > --- a/drivers/tty/tty_port.c
> > +++ b/drivers/tty/tty_port.c
> > @@ -31,6 +31,9 @@ static int tty_port_default_receive_buf(struct tty_port
> *port,
> > if (!tty)
> > return 0;
> >
> > +   if (!tty->driver_data)
> > +   return 0;
> > +
> > disc = tty_ldisc_ref(tty);
> > if (!disc)
> > return 0;
> >
> Acked-by: Gaurav Kohli 
> 
> It looks good to me w.r.t previous approach, but Let's Maintainer decide once.
> 

Thanks for your review, this one is simple and safe, it is used as live-patch 
online

-RongQing


> Regards
> Gaurav
> --
> Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc.
> is a member of the Code Aurora Forum, a Linux Foundation Collaborative
> Project.


[PATCH] XArray tests: allocation has to be GFP_ATOMIC under rcu_read_lock

2019-01-29 Thread Li RongQing
Signed-off-by: Li RongQing 
---
 lib/test_xarray.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/test_xarray.c b/lib/test_xarray.c
index c596a957f764..5b671c83b73d 100644
--- a/lib/test_xarray.c
+++ b/lib/test_xarray.c
@@ -108,7 +108,7 @@ static noinline void check_xas_retry(struct xarray *xa)
XA_BUG_ON(xa, xas_next_entry(&xas, ULONG_MAX) != xa_mk_value(0));
XA_BUG_ON(xa, xas.xa_node != NULL);
 
-   XA_BUG_ON(xa, xa_store_index(xa, 1, GFP_KERNEL) != NULL);
+   XA_BUG_ON(xa, xa_store_index(xa, 1, GFP_ATOMIC) != NULL);
XA_BUG_ON(xa, !xa_is_internal(xas_reload(&xas)));
xas.xa_node = XAS_RESTART;
XA_BUG_ON(xa, xas_next_entry(&xas, ULONG_MAX) != xa_mk_value(0));
-- 
2.16.2



[PATCH][v4] tty: fix race between flush_to_ldisc and tty_open

2019-01-18 Thread Li RongQing
There still is a race window after the commit b027e2298bd588
("tty: fix data race between tty_init_dev and flush of buf"),
and we encountered this crash issue if receive_buf call comes
before tty initialization completes in n_tty_open and
tty->driver_data may be NULL.

CPU0CPU1

 n_tty_open
   tty_init_dev
 tty_ldisc_unlock
   schedule
flush_to_ldisc
 receive_buf
  tty_port_default_receive_buf
   tty_ldisc_receive_buf
n_tty_receive_buf_common
  __receive_buf
   uart_flush_chars
uart_start
/*tty->driver_data is NULL*/
   tty->ops->open
   /*init tty->driver_data*/

it can be fixed by extending ldisc semaphore lock in tty_init_dev
to driver_data initialized completely after tty->ops->open(), but
this will lead to put lock on one function and unlock in some other
function, and hard to maintain, so fix this race only by checking
tty->driver_data when receiving, and return if tty->driver_data
is NULL

Signed-off-by: Wang Li 
Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
V4: add version information
V3: not used ldisc semaphore lock, only checking tty->driver_data with NULL
V2: fix building error by EXPORT_SYMBOL tty_ldisc_unlock
V1: extend ldisc lock to protect that tty->driver_data is inited 

drivers/tty/tty_port.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c
index 044c3cbdcfa4..86d0bec38322 100644
--- a/drivers/tty/tty_port.c
+++ b/drivers/tty/tty_port.c
@@ -31,6 +31,9 @@ static int tty_port_default_receive_buf(struct tty_port *port,
if (!tty)
return 0;
 
+   if (!tty->driver_data)
+   return 0;
+
disc = tty_ldisc_ref(tty);
if (!disc)
return 0;
-- 
2.16.2



[PATCH][v3] tty: fix race between flush_to_ldisc and tty_open

2019-01-18 Thread Li RongQing
There still is a race window after the commit b027e2298bd588
("tty: fix data race between tty_init_dev and flush of buf"),
and we encountered this crash issue if receive_buf call comes
before tty initialization completes in n_tty_open and
tty->driver_data may be NULL.

CPU0CPU1

 n_tty_open
   tty_init_dev
 tty_ldisc_unlock
   schedule
flush_to_ldisc
 receive_buf
  tty_port_default_receive_buf
   tty_ldisc_receive_buf
n_tty_receive_buf_common
  __receive_buf
   uart_flush_chars
uart_start
/*tty->driver_data is NULL*/
   tty->ops->open
   /*init tty->driver_data*/

it can be fixed by extending ldisc semaphore lock in tty_init_dev
to driver_data initialized completely after tty->ops->open(), but
this will lead to put lock on one function and unlock in some other
function, and hard to maintain, so fix this race only by checking
tty->driver_data when receiving, and return if tty->driver_data
is NULL

Signed-off-by: Wang Li 
Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 drivers/tty/tty_port.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c
index 044c3cbdcfa4..86d0bec38322 100644
--- a/drivers/tty/tty_port.c
+++ b/drivers/tty/tty_port.c
@@ -31,6 +31,9 @@ static int tty_port_default_receive_buf(struct tty_port *port,
if (!tty)
return 0;
 
+   if (!tty->driver_data)
+   return 0;
+
disc = tty_ldisc_ref(tty);
if (!disc)
return 0;
-- 
2.16.2



答复: [PATCH][v2] tty: fix race between flush_to_ldisc and tty_open

2019-01-13 Thread Li,Rongqing
> -邮件原件-
> 发件人: Kohli, Gaurav [mailto:gko...@codeaurora.org]
> 发送时间: 2019年1月11日 21:57
> 收件人: Li,Rongqing ; linux-kernel@vger.kernel.org;
> jsl...@suse.com; gre...@linuxfoundation.org; a...@linux.intel.com
> 主题: Re: [PATCH][v2] tty: fix race between flush_to_ldisc and tty_open
> 
> Hi,
> 
> it don't seems to be good idea to put lock on one function and unlock in some
> other function. If in future some one has to call tty_init_dev, how he can 
> track
> the unlocking as well of ldisc lock.
> 
> Regards
> Gaurav
> 

This similar condition has existed for a long time, tty_unlock(tty) must be 
called in
some other functions who call tty_init_dev, since tty_init_dev hold tty_lock, 
and does
not release it

so I think it is user responsibility to fully understand tty_init_dev; and I 
can add some 
comments for tty_init_dev if this patch can be acceptable;


or a workaround like below:
diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c
index a42a028a9d4e..2f5ad256b6ad 100644
--- a/drivers/tty/tty_buffer.c
+++ b/drivers/tty/tty_buffer.c
@@ -425,7 +425,7 @@ static void flush_to_ldisc(struct work_struct *work)
struct tty_ldisc *disc;
 
tty = port->itty;
-   if (tty == NULL)
+   if (tty == NULL || tty->driver_data == NULL)
return;
 
disc = tty_ldisc_ref(tty);

thanks

-RongQing





[PATCH][V2][resend] tty: fix race between flush_to_ldisc and tty_open

2019-01-10 Thread Li RongQing
There still is a race window after the commit b027e2298bd588
("tty: fix data race between tty_init_dev and flush of buf"),
if receive_buf call comes before tty initialization completes
in n_tty_open and tty->driver_data may be NULL.

CPU0CPU1

 n_tty_open
   tty_init_dev
 tty_ldisc_unlock
   schedule
flush_to_ldisc
  n_tty_receive_buf
uart_flush_chars
  uart_start
  /*tty->driver_data is NULL*/
   tty->ops->open
   /*init tty->driver_data*/

Extending ldisc semaphore lock in tty_init_dev to driver_data
initialized completely after tty->ops->open().

Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 drivers/staging/speakup/spk_ttyio.c |  1 +
 drivers/tty/pty.c   |  2 ++
 drivers/tty/serdev/serdev-ttyport.c |  2 ++
 drivers/tty/tty_io.c| 14 +++---
 drivers/tty/tty_ldisc.c |  1 +
 5 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/staging/speakup/spk_ttyio.c 
b/drivers/staging/speakup/spk_ttyio.c
index 979e3ae249c1..c31f08c98383 100644
--- a/drivers/staging/speakup/spk_ttyio.c
+++ b/drivers/staging/speakup/spk_ttyio.c
@@ -155,6 +155,7 @@ static int spk_ttyio_initialise_ldisc(struct spk_synth 
*synth)
else
ret = -ENODEV;
 
+   tty_ldisc_unlock(tty);
if (ret) {
tty_unlock(tty);
return ret;
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index 00099a8439d2..1b9684d4f718 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -873,9 +873,11 @@ static int ptmx_open(struct inode *inode, struct file 
*filp)
 
tty_debug_hangup(tty, "opening (count=%d)\n", tty->count);
 
+   tty_ldisc_unlock(tty);
tty_unlock(tty);
return 0;
 err_release:
+   tty_ldisc_unlock(tty);
tty_unlock(tty);
// This will also put-ref the fsi
tty_release(inode, filp);
diff --git a/drivers/tty/serdev/serdev-ttyport.c 
b/drivers/tty/serdev/serdev-ttyport.c
index fa1672993b4c..ce16cb303e28 100644
--- a/drivers/tty/serdev/serdev-ttyport.c
+++ b/drivers/tty/serdev/serdev-ttyport.c
@@ -123,6 +123,7 @@ static int ttyport_open(struct serdev_controller *ctrl)
if (ret)
goto err_close;
 
+   tty_ldisc_unlock(tty);
tty_unlock(serport->tty);
 
/* Bring the UART into a known 8 bits no parity hw fc state */
@@ -145,6 +146,7 @@ static int ttyport_open(struct serdev_controller *ctrl)
 err_close:
tty->ops->close(tty, NULL);
 err_unlock:
+   tty_ldisc_unlock(tty);
tty_unlock(tty);
tty_release_struct(tty, serport->tty_idx);
 
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 687250ec8032..199f45e2e1b1 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -1351,7 +1351,6 @@ struct tty_struct *tty_init_dev(struct tty_driver 
*driver, int idx)
retval = tty_ldisc_setup(tty, tty->link);
if (retval)
goto err_release_tty;
-   tty_ldisc_unlock(tty);
/* Return the tty locked so that it cannot vanish under the caller */
return tty;
 
@@ -1926,7 +1925,7 @@ EXPORT_SYMBOL_GPL(tty_kopen);
  *   - concurrent tty removal from driver table
  */
 static struct tty_struct *tty_open_by_driver(dev_t device, struct inode *inode,
-struct file *filp)
+struct file *filp, bool *unlock)
 {
struct tty_struct *tty;
struct tty_driver *driver = NULL;
@@ -1970,6 +1969,7 @@ static struct tty_struct *tty_open_by_driver(dev_t 
device, struct inode *inode,
}
} else { /* Returns with the tty_lock held for now */
tty = tty_init_dev(driver, index);
+   *unlock = true;
mutex_unlock(&tty_mutex);
}
 out:
@@ -2007,6 +2007,7 @@ static int tty_open(struct inode *inode, struct file 
*filp)
int noctty, retval;
dev_t device = inode->i_rdev;
unsigned saved_flags = filp->f_flags;
+   bool unlock = false;
 
nonseekable_open(inode, filp);
 
@@ -2017,7 +2018,7 @@ static int tty_open(struct inode *inode, struct file 
*filp)
 
tty = tty_open_current_tty(device, filp);
if (!tty)
-   tty = tty_open_by_driver(device, inode, filp);
+   tty = tty_open_by_driver(device, inode, filp, &unlock);
 
if (IS_ERR(tty)) {
tty_free_file(filp);
@@ -2042,6 +2043,10 @@ static int tty_open(struct inode *inode, struct file 
*filp)
if (retval) {
tty_debug_hangup(tty, "open error %d, releasing\n", retv

[PATCH][v2] tty: fix race between flush_to_ldisc and tty_open

2018-12-23 Thread Li RongQing
There still is a race window after the commit b027e2298bd588
("tty: fix data race between tty_init_dev and flush of buf"),
if receive_buf call comes before tty initialization completes
in n_tty_open and tty->driver_data may be NULL.

CPU0 CPU1
 
 n_tty_open
   tty_init_dev
 tty_ldisc_unlock
   schedule
flush_to_ldisc
  n_tty_receive_buf
uart_flush_chars
  uart_start
  /*tty->driver_data is NULL*/
   tty->ops->open
   /*init tty->driver_data*/

Extending ldisc semaphore lock in tty_init_dev to driver_data
initialized completely after tty->ops->open().

Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 drivers/staging/speakup/spk_ttyio.c |  1 +
 drivers/tty/pty.c   |  2 ++
 drivers/tty/serdev/serdev-ttyport.c |  2 ++
 drivers/tty/tty_io.c| 14 +++---
 drivers/tty/tty_ldisc.c |  1 +
 5 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/drivers/staging/speakup/spk_ttyio.c 
b/drivers/staging/speakup/spk_ttyio.c
index 979e3ae249c1..c31f08c98383 100644
--- a/drivers/staging/speakup/spk_ttyio.c
+++ b/drivers/staging/speakup/spk_ttyio.c
@@ -155,6 +155,7 @@ static int spk_ttyio_initialise_ldisc(struct spk_synth 
*synth)
else
ret = -ENODEV;
 
+   tty_ldisc_unlock(tty);
if (ret) {
tty_unlock(tty);
return ret;
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index 00099a8439d2..1b9684d4f718 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -873,9 +873,11 @@ static int ptmx_open(struct inode *inode, struct file 
*filp)
 
tty_debug_hangup(tty, "opening (count=%d)\n", tty->count);
 
+   tty_ldisc_unlock(tty);
tty_unlock(tty);
return 0;
 err_release:
+   tty_ldisc_unlock(tty);
tty_unlock(tty);
// This will also put-ref the fsi
tty_release(inode, filp);
diff --git a/drivers/tty/serdev/serdev-ttyport.c 
b/drivers/tty/serdev/serdev-ttyport.c
index fa1672993b4c..ce16cb303e28 100644
--- a/drivers/tty/serdev/serdev-ttyport.c
+++ b/drivers/tty/serdev/serdev-ttyport.c
@@ -123,6 +123,7 @@ static int ttyport_open(struct serdev_controller *ctrl)
if (ret)
goto err_close;
 
+   tty_ldisc_unlock(tty);
tty_unlock(serport->tty);
 
/* Bring the UART into a known 8 bits no parity hw fc state */
@@ -145,6 +146,7 @@ static int ttyport_open(struct serdev_controller *ctrl)
 err_close:
tty->ops->close(tty, NULL);
 err_unlock:
+   tty_ldisc_unlock(tty);
tty_unlock(tty);
tty_release_struct(tty, serport->tty_idx);
 
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 687250ec8032..199f45e2e1b1 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -1351,7 +1351,6 @@ struct tty_struct *tty_init_dev(struct tty_driver 
*driver, int idx)
retval = tty_ldisc_setup(tty, tty->link);
if (retval)
goto err_release_tty;
-   tty_ldisc_unlock(tty);
/* Return the tty locked so that it cannot vanish under the caller */
return tty;
 
@@ -1926,7 +1925,7 @@ EXPORT_SYMBOL_GPL(tty_kopen);
  *   - concurrent tty removal from driver table
  */
 static struct tty_struct *tty_open_by_driver(dev_t device, struct inode *inode,
-struct file *filp)
+struct file *filp, bool *unlock)
 {
struct tty_struct *tty;
struct tty_driver *driver = NULL;
@@ -1970,6 +1969,7 @@ static struct tty_struct *tty_open_by_driver(dev_t 
device, struct inode *inode,
}
} else { /* Returns with the tty_lock held for now */
tty = tty_init_dev(driver, index);
+   *unlock = true;
mutex_unlock(_mutex);
}
 out:
@@ -2007,6 +2007,7 @@ static int tty_open(struct inode *inode, struct file 
*filp)
int noctty, retval;
dev_t device = inode->i_rdev;
unsigned saved_flags = filp->f_flags;
+   bool unlock = false;
 
nonseekable_open(inode, filp);
 
@@ -2017,7 +2018,7 @@ static int tty_open(struct inode *inode, struct file 
*filp)
 
tty = tty_open_current_tty(device, filp);
if (!tty)
-   tty = tty_open_by_driver(device, inode, filp);
+   tty = tty_open_by_driver(device, inode, filp, );
 
if (IS_ERR(tty)) {
tty_free_file(filp);
@@ -2042,6 +2043,10 @@ static int tty_open(struct inode *inode, struct file 
*filp)
if (retval) {
tty_debug_hangup(tty, "open error %d, releasing\n", retval);
 
+   if (unlock) {
+

[PATCH] tty: fix race between flush_to_ldisc and tty_open

2018-12-21 Thread Li RongQing
There can still be a race after commit b027e2298bd588
("tty: fix data race between tty_init_dev and flush of buf"):
if a receive_buf call comes before tty initialization completes
in n_tty_open, tty->driver_data may still be NULL.

CPU0CPU1

  n_tty_open
  tty_init_dev
  tty_ldisc_unlock
  schedule
flush_to_ldisc
n_tty_receive_buf
uart_flush_chars
uart_start

Extend the ldisc semaphore lock in tty_init_dev until driver_data
is completely initialized after tty->ops->open().

Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
We saw this bug on CentOS 7.4, and believe b027e2298bd588 cannot
fix it, since driver_data is still NULL;

the trace is that:

[exception RIP: uart_start+24]
RIP: 814908b8  RSP: 881c23ab7d18  RFLAGS: 00010282
RAX:   RBX: 88182bafc400  RCX: 
RDX: 00020001  RSI: 001d  RDI: 88182bafc400
RBP: 881c23ab7d30   R8: 881fff956060   R9: 81b5dc20
R10: 5a07  R11: 0001  R12: 88182bafc400
R13: 881ff90c852d  R14: 88182baff400  R15: 
ORIG_RAX:   CS: 0010  SS: 0018
#11 [881c23ab7d38] uart_flush_chars at 81490ade
#12 [881c23ab7d48] n_tty_receive_buf at 81478b1d
#13 [881c23ab7de0] flush_to_ldisc at 8147bdc4
#14 [881c23ab7e28] process_one_work at 8109e210
#15 [881c23ab7e70] worker_thread at 8109e69e
#16 [881c23ab7ec8] kthread at 810a62d1
#17 [881c23ab7f50] ret_from_fork at 8171d677

other one

PID: 922TASK: 881c7fbc1f40  CPU: 29  COMMAND: "agetty"
 #0 [8818e9677a20] __schedule at 817114f2
 #1 [8818e9677a78] preempt_schedule at 81711f4f
 #2 [8818e9677a90] _raw_spin_unlock_irqrestore at 81713f7d
 #3 [8818e9677aa8] __wake_up at 810b03c4
 #4 [8818e9677ae0] n_tty_set_termios at 814770d3
 #5 [8818e9677b10] n_tty_open at 814772ec` 
 
 
 drivers/staging/speakup/spk_ttyio.c |  1 +
 drivers/tty/pty.c   |  2 ++
 drivers/tty/serdev/serdev-ttyport.c |  2 ++
 drivers/tty/tty_io.c| 14 +++---
 4 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/drivers/staging/speakup/spk_ttyio.c 
b/drivers/staging/speakup/spk_ttyio.c
index 979e3ae249c1..c31f08c98383 100644
--- a/drivers/staging/speakup/spk_ttyio.c
+++ b/drivers/staging/speakup/spk_ttyio.c
@@ -155,6 +155,7 @@ static int spk_ttyio_initialise_ldisc(struct spk_synth 
*synth)
else
ret = -ENODEV;
 
+   tty_ldisc_unlock(tty);
if (ret) {
tty_unlock(tty);
return ret;
diff --git a/drivers/tty/pty.c b/drivers/tty/pty.c
index 00099a8439d2..1b9684d4f718 100644
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -873,9 +873,11 @@ static int ptmx_open(struct inode *inode, struct file 
*filp)
 
tty_debug_hangup(tty, "opening (count=%d)\n", tty->count);
 
+   tty_ldisc_unlock(tty);
tty_unlock(tty);
return 0;
 err_release:
+   tty_ldisc_unlock(tty);
tty_unlock(tty);
// This will also put-ref the fsi
tty_release(inode, filp);
diff --git a/drivers/tty/serdev/serdev-ttyport.c 
b/drivers/tty/serdev/serdev-ttyport.c
index fa1672993b4c..ce16cb303e28 100644
--- a/drivers/tty/serdev/serdev-ttyport.c
+++ b/drivers/tty/serdev/serdev-ttyport.c
@@ -123,6 +123,7 @@ static int ttyport_open(struct serdev_controller *ctrl)
if (ret)
goto err_close;
 
+   tty_ldisc_unlock(tty);
tty_unlock(serport->tty);
 
/* Bring the UART into a known 8 bits no parity hw fc state */
@@ -145,6 +146,7 @@ static int ttyport_open(struct serdev_controller *ctrl)
 err_close:
tty->ops->close(tty, NULL);
 err_unlock:
+   tty_ldisc_unlock(tty);
tty_unlock(tty);
tty_release_struct(tty, serport->tty_idx);
 
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 687250ec8032..199f45e2e1b1 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -1351,7 +1351,6 @@ struct tty_struct *tty_init_dev(struct tty_driver 
*driver, int idx)
retval = tty_ldisc_setup(tty, tty->link);
if (retval)
goto err_release_tty;
-   tty_ldisc_unlock(tty);
/* Return the tty locked so that it cannot vanish under the caller */
return tty;
 
@@ -1926,7 +1925,7 @@ EXPORT_SYMBOL_GPL(tty_kopen);
  *   - concurrent tty removal from driver table
  */
 static struct tty_struct *tty_open_by_driver(dev_t device, struct inode *inode,
-struct file *filp)
+ 

[PATCH] memcg: remove congestion wait when force empty

2018-09-12 Thread Li RongQing
memory.force_empty is used to empty a memory cgroup's memory before
rmdir'ing it, to avoid charging that memory to the parent cgroup

when try_to_free_mem_cgroup_pages returns 0, the code guesses there
may be lots of writeback and waits. But the waiting and sleeping are
already done in shrink_inactive_list, based on the number of isolated
pages, so remove this wait to reduce unnecessary delay

Signed-off-by: Li RongQing 
---
 mm/memcontrol.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4ead5a4817de..35bd43eaa97e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2897,12 +2897,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup 
*memcg)
 
progress = try_to_free_mem_cgroup_pages(memcg, 1,
GFP_KERNEL, true);
-   if (!progress) {
+   if (!progress)
nr_retries--;
-   /* maybe some writeback is necessary */
-   congestion_wait(BLK_RW_ASYNC, HZ/10);
-   }
-
}
 
return 0;
-- 
2.16.2



[PATCH] memcg: remove congestion wait when force empty

2018-09-12 Thread Li RongQing
memory.force_empty is used to empty a memory cgroup's memory before
rmdir'ing it, to avoid charging that memory to the parent cgroup

when try_to_free_mem_cgroup_pages returns 0, the code guesses there
may be lots of writeback and waits. But the waiting and sleeping are
already done in shrink_inactive_list, based on the number of isolated
pages, so remove this wait to reduce unnecessary delay

Signed-off-by: Li RongQing 
---
 mm/memcontrol.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4ead5a4817de..35bd43eaa97e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2897,12 +2897,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup 
*memcg)
 
progress = try_to_free_mem_cgroup_pages(memcg, 1,
GFP_KERNEL, true);
-   if (!progress) {
+   if (!progress)
nr_retries--;
-   /* maybe some writeback is necessary */
-   congestion_wait(BLK_RW_ASYNC, HZ/10);
-   }
-
}
 
return 0;
-- 
2.16.2



[PATCH] mm: introduce kvvirt_to_page() helper

2018-08-16 Thread Li RongQing
The new helper returns the page that maps the given address. Several
subsystems open-code this logic, like mem_to_page in xfs_buf.c and
pgv_to_page in af_packet.c; after this change, they can be unified

Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 include/linux/mm.h | 8 
 1 file changed, 8 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 68a5121694ef..bb34a3c71df5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -599,6 +599,14 @@ static inline void *kvcalloc(size_t n, size_t size, gfp_t 
flags)
return kvmalloc_array(n, size, flags | __GFP_ZERO);
 }
 
+static inline struct page *kvvirt_to_page(const void *addr)
+{
+   if (!is_vmalloc_addr(addr))
+   return virt_to_page(addr);
+   else
+   return vmalloc_to_page(addr);
+}
+
 extern void kvfree(const void *addr);
 
 static inline atomic_t *compound_mapcount_ptr(struct page *page)
-- 
2.16.2



[PATCH] mm: introduce kvvirt_to_page() helper

2018-08-16 Thread Li RongQing
The new helper returns the page that maps the given address. Several
subsystems open-code this logic, like mem_to_page in xfs_buf.c and
pgv_to_page in af_packet.c; after this change, they can be unified

Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 include/linux/mm.h | 8 
 1 file changed, 8 insertions(+)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 68a5121694ef..bb34a3c71df5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -599,6 +599,14 @@ static inline void *kvcalloc(size_t n, size_t size, gfp_t 
flags)
return kvmalloc_array(n, size, flags | __GFP_ZERO);
 }
 
+static inline struct page *kvvirt_to_page(const void *addr)
+{
+   if (!is_vmalloc_addr(addr))
+   return virt_to_page(addr);
+   else
+   return vmalloc_to_page(addr);
+}
+
 extern void kvfree(const void *addr);
 
 static inline atomic_t *compound_mapcount_ptr(struct page *page)
-- 
2.16.2



[PATCH 1/2] mm: add a function to return a bdi_writeback dirty page statistic

2018-08-01 Thread Li RongQing
This is a preparation for optimising the full writeback
triggered when reclaiming memory

Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 include/linux/memcontrol.h | 2 +-
 mm/memcontrol.c| 6 ++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6c6fb116e925..58e29555ac81 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1141,7 +1141,7 @@ struct wb_domain *mem_cgroup_wb_domain(struct 
bdi_writeback *wb);
 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 unsigned long *pheadroom, unsigned long *pdirty,
 unsigned long *pwriteback);
-
+unsigned long mem_cgroup_wb_dirty_stats(struct bdi_writeback *wb);
 #else  /* CONFIG_CGROUP_WRITEBACK */
 
 static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8c0280b3143e..82d3061e91d1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3640,6 +3640,12 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, 
unsigned long *pfilepages,
}
 }
 
+unsigned long mem_cgroup_wb_dirty_stats(struct bdi_writeback *wb)
+{
+   struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+
+   return memcg_page_state(memcg, NR_FILE_DIRTY);
+}
 #else  /* CONFIG_CGROUP_WRITEBACK */
 
 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
-- 
2.16.2



[PATCH 1/2] mm: add a function to return a bdi_writeback dirty page statistic

2018-08-01 Thread Li RongQing
This is a preparation for optimising the full writeback
triggered when reclaiming memory

Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 include/linux/memcontrol.h | 2 +-
 mm/memcontrol.c| 6 ++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6c6fb116e925..58e29555ac81 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1141,7 +1141,7 @@ struct wb_domain *mem_cgroup_wb_domain(struct 
bdi_writeback *wb);
 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 unsigned long *pheadroom, unsigned long *pdirty,
 unsigned long *pwriteback);
-
+unsigned long mem_cgroup_wb_dirty_stats(struct bdi_writeback *wb);
 #else  /* CONFIG_CGROUP_WRITEBACK */
 
 static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8c0280b3143e..82d3061e91d1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3640,6 +3640,12 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, 
unsigned long *pfilepages,
}
 }
 
+unsigned long mem_cgroup_wb_dirty_stats(struct bdi_writeback *wb)
+{
+   struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+
+   return memcg_page_state(memcg, NR_FILE_DIRTY);
+}
 #else  /* CONFIG_CGROUP_WRITEBACK */
 
 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
-- 
2.16.2



[PATCH 2/2] fs/writeback: do memory cgroup related writeback firstly

2018-08-01 Thread Li RongQing
When a machine has hundreds of memory cgroups, each generating more
or fewer dirty pages, and one of those cgroups is under heavy memory
pressure and constantly tries to reclaim dirty pages, it triggers
writeback for all cgroups, which is inefficient:

1. if the memory used by a memory cgroup reaches its limit,
it is useless to write back other cgroups.
2. other cgroups can wait longer to merge write requests

so replace the full flush with flushing only the writeback of the
memory cgroup whose task tries to reclaim memory and triggers
writeback; if nothing is written back, fall back to a full flush

After this patch, write performance improves by 5% in the setup below:
  $mount -t cgroup none -o memory /cgroups/memory/
  $mkdir /cgroups/memory/x1
  $echo $$ > /cgroups/memory/x1/tasks
  $echo 100M > /cgroups/memory/x1/memory.limit_in_bytes
  $cd /cgroups/memory/
  $seq 1|xargs  mkdir
  $fio -filename=/home/test1 -direct=0 -iodepth 1 -thread -rw=write 
-ioengine=libaio -bs=16k -size=20G
Before:
WRITE: io=20480MB, aggrb=779031KB/s, minb=779031KB/s, maxb=779031KB/s, 
mint=26920msec, maxt=26920msec
After:
WRITE: io=20480MB, aggrb=831708KB/s, minb=831708KB/s, maxb=831708KB/s, 
mint=25215msec, maxt=25215msec

This patch can also reduce I/O utilization in the following condition:
there are two disks, one used to store all kinds of logs (which should
see little I/O pressure) and the other used to store Hadoop data that
writes lots of data to disk; yet both disks' I/O utilization is high
in practice, because when Hadoop reclaims memory it wakes up writeback
for all memory cgroups.

Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 fs/fs-writeback.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 471d863958bc..475cada5d1cf 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -35,6 +35,11 @@
  */
 #define MIN_WRITEBACK_PAGES(4096UL >> (PAGE_SHIFT - 10))
 
+/*
+ * if WB cgroup dirty pages is bigger than it, not start a full flush
+ */
+#define MIN_WB_DIRTY_PAGES 64
+
 struct wb_completion {
atomic_tcnt;
 };
@@ -2005,6 +2010,32 @@ void wakeup_flusher_threads(enum wb_reason reason)
if (blk_needs_flush_plug(current))
blk_schedule_flush_plug(current);
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+   if (reason == WB_REASON_VMSCAN) {
+   unsigned long tmp, pdirty = 0;
+
+   rcu_read_lock();
+   list_for_each_entry_rcu(bdi, _list, bdi_list) {
+   struct bdi_writeback *wb = wb_find_current(bdi);
+
+   if (wb) {
+   tmp = mem_cgroup_wb_dirty_stats(wb);
+   if (tmp) {
+   pdirty += tmp;
+   wb_start_writeback(wb, reason);
+
+   if (wb == >wb)
+   pdirty += MIN_WB_DIRTY_PAGES;
+   }
+   }
+   }
+   rcu_read_unlock();
+
+   if (pdirty > MIN_WB_DIRTY_PAGES)
+   return;
+   }
+#endif
+
rcu_read_lock();
list_for_each_entry_rcu(bdi, _list, bdi_list)
__wakeup_flusher_threads_bdi(bdi, reason);
-- 
2.16.2



[PATCH 2/2] fs/writeback: do memory cgroup related writeback firstly

2018-08-01 Thread Li RongQing
When a machine has hundreds of memory cgroups, each generating more
or fewer dirty pages, and one of those cgroups is under heavy memory
pressure and constantly tries to reclaim dirty pages, it triggers
writeback for all cgroups, which is inefficient:

1. if the memory used by a memory cgroup reaches its limit,
it is useless to write back other cgroups.
2. other cgroups can wait longer to merge write requests

so replace the full flush with flushing only the writeback of the
memory cgroup whose task tries to reclaim memory and triggers
writeback; if nothing is written back, fall back to a full flush

After this patch, write performance improves by 5% in the setup below:
  $mount -t cgroup none -o memory /cgroups/memory/
  $mkdir /cgroups/memory/x1
  $echo $$ > /cgroups/memory/x1/tasks
  $echo 100M > /cgroups/memory/x1/memory.limit_in_bytes
  $cd /cgroups/memory/
  $seq 1|xargs  mkdir
  $fio -filename=/home/test1 -direct=0 -iodepth 1 -thread -rw=write 
-ioengine=libaio -bs=16k -size=20G
Before:
WRITE: io=20480MB, aggrb=779031KB/s, minb=779031KB/s, maxb=779031KB/s, 
mint=26920msec, maxt=26920msec
After:
WRITE: io=20480MB, aggrb=831708KB/s, minb=831708KB/s, maxb=831708KB/s, 
mint=25215msec, maxt=25215msec

This patch can also reduce I/O utilization in the following condition:
there are two disks, one used to store all kinds of logs (which should
see little I/O pressure) and the other used to store Hadoop data that
writes lots of data to disk; yet both disks' I/O utilization is high
in practice, because when Hadoop reclaims memory it wakes up writeback
for all memory cgroups.

Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 fs/fs-writeback.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 471d863958bc..475cada5d1cf 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -35,6 +35,11 @@
  */
 #define MIN_WRITEBACK_PAGES(4096UL >> (PAGE_SHIFT - 10))
 
+/*
+ * if WB cgroup dirty pages is bigger than it, not start a full flush
+ */
+#define MIN_WB_DIRTY_PAGES 64
+
 struct wb_completion {
atomic_tcnt;
 };
@@ -2005,6 +2010,32 @@ void wakeup_flusher_threads(enum wb_reason reason)
if (blk_needs_flush_plug(current))
blk_schedule_flush_plug(current);
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+   if (reason == WB_REASON_VMSCAN) {
+   unsigned long tmp, pdirty = 0;
+
+   rcu_read_lock();
+   list_for_each_entry_rcu(bdi, _list, bdi_list) {
+   struct bdi_writeback *wb = wb_find_current(bdi);
+
+   if (wb) {
+   tmp = mem_cgroup_wb_dirty_stats(wb);
+   if (tmp) {
+   pdirty += tmp;
+   wb_start_writeback(wb, reason);
+
+   if (wb == >wb)
+   pdirty += MIN_WB_DIRTY_PAGES;
+   }
+   }
+   }
+   rcu_read_unlock();
+
+   if (pdirty > MIN_WB_DIRTY_PAGES)
+   return;
+   }
+#endif
+
rcu_read_lock();
list_for_each_entry_rcu(bdi, _list, bdi_list)
__wakeup_flusher_threads_bdi(bdi, reason);
-- 
2.16.2



[PATCH] x86/acpi: fix the comments in acpi_parse_lapic

2018-06-07 Thread Li RongQing
The comment should say this permits preallocating memory for all NR_CPUS

Signed-off-by: Li RongQing 
---
 arch/x86/kernel/acpi/boot.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 3b20607d581b..8ae88605a5eb 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -223,7 +223,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, 
const unsigned long end)
 * We need to register disabled CPU as well to permit
 * counting disabled CPUs. This allows us to size
 * cpus_possible_map more accurately, to permit
-* to not preallocating memory for all NR_CPUS
+* to preallocating memory for all NR_CPUS
 * when we use CPU hotplug.
 */
if (!apic->apic_id_valid(apic_id)) {
@@ -260,7 +260,7 @@ acpi_parse_lapic(struct acpi_subtable_header * header, 
const unsigned long end)
 * We need to register disabled CPU as well to permit
 * counting disabled CPUs. This allows us to size
 * cpus_possible_map more accurately, to permit
-* to not preallocating memory for all NR_CPUS
+* to preallocating memory for all NR_CPUS
 * when we use CPU hotplug.
 */
acpi_register_lapic(processor->id,  /* APIC ID */
-- 
2.16.2



[PATCH] x86/acpi: fix the comments in acpi_parse_lapic

2018-06-07 Thread Li RongQing
The comment should say this permits preallocating memory for all NR_CPUS

Signed-off-by: Li RongQing 
---
 arch/x86/kernel/acpi/boot.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 3b20607d581b..8ae88605a5eb 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -223,7 +223,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, 
const unsigned long end)
 * We need to register disabled CPU as well to permit
 * counting disabled CPUs. This allows us to size
 * cpus_possible_map more accurately, to permit
-* to not preallocating memory for all NR_CPUS
+* to preallocating memory for all NR_CPUS
 * when we use CPU hotplug.
 */
if (!apic->apic_id_valid(apic_id)) {
@@ -260,7 +260,7 @@ acpi_parse_lapic(struct acpi_subtable_header * header, 
const unsigned long end)
 * We need to register disabled CPU as well to permit
 * counting disabled CPUs. This allows us to size
 * cpus_possible_map more accurately, to permit
-* to not preallocating memory for all NR_CPUS
+* to preallocating memory for all NR_CPUS
 * when we use CPU hotplug.
 */
acpi_register_lapic(processor->id,  /* APIC ID */
-- 
2.16.2



[PATCH] sched/fair: remove stale comments about tg_unthrottle_up

2018-04-10 Thread Li RongQing
After commit 82958366cfea ("sched: Replace update_shares weight
distribution with per-entity computation"), tg_unthrottle_up
no longer updates the weight, so the comment is stale

Signed-off-by: Li RongQing <lirongq...@baidu.com>
---
 kernel/sched/fair.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0951d1c58d2f..b885ed6fd97b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4703,7 +4703,6 @@ static inline int throttled_lb_pair(struct task_group *tg,
   throttled_hierarchy(dest_cfs_rq);
 }
 
-/* updated child weight may affect parent so we have to do this bottom up */
 static int tg_unthrottle_up(struct task_group *tg, void *data)
 {
struct rq *rq = data;
-- 
2.11.0



[PATCH] sched/fair: remove stale comments about tg_unthrottle_up

2018-04-10 Thread Li RongQing
After commit 82958366cfea ("sched: Replace update_shares weight
distribution with per-entity computation"), tg_unthrottle_up
no longer updates the weight, so the comment is stale

Signed-off-by: Li RongQing 
---
 kernel/sched/fair.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0951d1c58d2f..b885ed6fd97b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4703,7 +4703,6 @@ static inline int throttled_lb_pair(struct task_group *tg,
   throttled_hierarchy(dest_cfs_rq);
 }
 
-/* updated child weight may affect parent so we have to do this bottom up */
 static int tg_unthrottle_up(struct task_group *tg, void *data)
 {
struct rq *rq = data;
-- 
2.11.0



[tip:x86/urgent] x86/apic: Fix signedness bug in APIC ID validity checks

2018-04-10 Thread tip-bot for Li RongQing
Commit-ID:  a774635db5c430cbf21fa5d2f2df3d23aaa8e782
Gitweb: https://git.kernel.org/tip/a774635db5c430cbf21fa5d2f2df3d23aaa8e782
Author: Li RongQing <lirongq...@baidu.com>
AuthorDate: Tue, 10 Apr 2018 09:16:06 +0800
Committer:  Thomas Gleixner <t...@linutronix.de>
CommitDate: Tue, 10 Apr 2018 16:46:39 +0200

x86/apic: Fix signedness bug in APIC ID validity checks

The APIC ID as parsed from ACPI MADT is validity checked with the
apic->apic_id_valid() callback, which depends on the selected APIC type.

For non X2APIC types APIC IDs >= 0xFF are invalid, but values > 0x7FFF
are detected as valid. This happens because the 'apicid' argument of the
apic_id_valid() callback is type 'int'. So the resulting comparison

   apicid < 0xFF

evaluates to true for all unsigned int values > 0x7FFF which are handed
to default_apic_id_valid(). As a consequence, invalid APIC IDs in !X2APIC
mode are considered valid and accounted as possible CPUs.

Change the apicid argument type of the apic_id_valid() callback to u32 so
the evaluation is unsigned and returns the correct result.

[ tglx: Massaged changelog ]

Signed-off-by: Li RongQing <lirongq...@baidu.com>
Signed-off-by: Thomas Gleixner <t...@linutronix.de>
Cc: sta...@vger.kernel.org
Cc: jgr...@suse.com
Cc: Dou Liyang <douly.f...@cn.fujitsu.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: h...@zytor.com
Link: 
https://lkml.kernel.org/r/1523322966-10296-1-git-send-email-lirongq...@baidu.com
---
 arch/x86/include/asm/apic.h  |  4 ++--
 arch/x86/kernel/acpi/boot.c  | 13 -
 arch/x86/kernel/apic/apic_common.c   |  2 +-
 arch/x86/kernel/apic/apic_numachip.c |  2 +-
 arch/x86/kernel/apic/x2apic.h|  2 +-
 arch/x86/kernel/apic/x2apic_phys.c   |  2 +-
 arch/x86/kernel/apic/x2apic_uv_x.c   |  2 +-
 arch/x86/xen/apic.c  |  2 +-
 8 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 98722773391d..f01eef8b392e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -319,7 +319,7 @@ struct apic {
/* Probe, setup and smpboot functions */
int (*probe)(void);
int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
-   int (*apic_id_valid)(int apicid);
+   int (*apic_id_valid)(u32 apicid);
int (*apic_id_registered)(void);
 
bool(*check_apicid_used)(physid_mask_t *map, int apicid);
@@ -492,7 +492,7 @@ static inline unsigned int read_apic_id(void)
return apic->get_apic_id(reg);
 }
 
-extern int default_apic_id_valid(int apicid);
+extern int default_apic_id_valid(u32 apicid);
 extern int default_acpi_madt_oem_check(char *, char *);
 extern void default_setup_apic_routing(void);
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 2aa92094b59d..5ee33a6e33bb 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -200,7 +200,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, 
const unsigned long end)
 {
struct acpi_madt_local_x2apic *processor = NULL;
 #ifdef CONFIG_X86_X2APIC
-   int apic_id;
+   u32 apic_id;
u8 enabled;
 #endif
 
@@ -222,10 +222,13 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, 
const unsigned long end)
 * to not preallocating memory for all NR_CPUS
 * when we use CPU hotplug.
 */
-   if (!apic->apic_id_valid(apic_id) && enabled)
-   printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
-   else
-   acpi_register_lapic(apic_id, processor->uid, enabled);
+   if (!apic->apic_id_valid(apic_id)) {
+   if (enabled)
+   pr_warn(PREFIX "x2apic entry ignored\n");
+   return 0;
+   }
+
+   acpi_register_lapic(apic_id, processor->uid, enabled);
 #else
printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
 #endif
diff --git a/arch/x86/kernel/apic/apic_common.c 
b/arch/x86/kernel/apic/apic_common.c
index a360801779ae..02b4839478b1 100644
--- a/arch/x86/kernel/apic/apic_common.c
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -40,7 +40,7 @@ int default_check_phys_apicid_present(int phys_apicid)
return physid_isset(phys_apicid, phys_cpu_present_map);
 }
 
-int default_apic_id_valid(int apicid)
+int default_apic_id_valid(u32 apicid)
 {
return (apicid < 255);
 }
diff --git a/arch/x86/kernel/apic/apic_numachip.c 
b/arch/x86/kernel/apic/apic_numachip.c
index 134e04506ab4..78778b54f904 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -56,7 +56,7 @@ static u32 numachip2_set_apic_id(unsigned int id)
return id << 24;
 }
 
-static int numachip_apic_id_valid(int apicid)
+static int numachip_apic_id_valid(u32 apicid)
 {
/* Trust what bootloader passes in MADT *

[tip:x86/urgent] x86/apic: Fix signedness bug in APIC ID validity checks

2018-04-10 Thread tip-bot for Li RongQing
Commit-ID:  a774635db5c430cbf21fa5d2f2df3d23aaa8e782
Gitweb: https://git.kernel.org/tip/a774635db5c430cbf21fa5d2f2df3d23aaa8e782
Author: Li RongQing 
AuthorDate: Tue, 10 Apr 2018 09:16:06 +0800
Committer:  Thomas Gleixner 
CommitDate: Tue, 10 Apr 2018 16:46:39 +0200

x86/apic: Fix signedness bug in APIC ID validity checks

The APIC ID as parsed from ACPI MADT is validity checked with the
apic->apic_id_valid() callback, which depends on the selected APIC type.

For non X2APIC types APIC IDs >= 0xFF are invalid, but values > 0x7FFF
are detected as valid. This happens because the 'apicid' argument of the
apic_id_valid() callback is type 'int'. So the resulting comparison

   apicid < 0xFF

evaluates to true for all unsigned int values > 0x7FFF which are handed
to default_apic_id_valid(). As a consequence, invalid APIC IDs in !X2APIC
mode are considered valid and accounted as possible CPUs.

Change the apicid argument type of the apic_id_valid() callback to u32 so
the evaluation is unsigned and returns the correct result.

[ tglx: Massaged changelog ]

Signed-off-by: Li RongQing 
Signed-off-by: Thomas Gleixner 
Cc: sta...@vger.kernel.org
Cc: jgr...@suse.com
Cc: Dou Liyang 
Cc: Peter Zijlstra 
Cc: h...@zytor.com
Link: 
https://lkml.kernel.org/r/1523322966-10296-1-git-send-email-lirongq...@baidu.com
---
 arch/x86/include/asm/apic.h  |  4 ++--
 arch/x86/kernel/acpi/boot.c  | 13 -
 arch/x86/kernel/apic/apic_common.c   |  2 +-
 arch/x86/kernel/apic/apic_numachip.c |  2 +-
 arch/x86/kernel/apic/x2apic.h|  2 +-
 arch/x86/kernel/apic/x2apic_phys.c   |  2 +-
 arch/x86/kernel/apic/x2apic_uv_x.c   |  2 +-
 arch/x86/xen/apic.c  |  2 +-
 8 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 98722773391d..f01eef8b392e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -319,7 +319,7 @@ struct apic {
/* Probe, setup and smpboot functions */
int (*probe)(void);
int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
-   int (*apic_id_valid)(int apicid);
+   int (*apic_id_valid)(u32 apicid);
int (*apic_id_registered)(void);
 
bool(*check_apicid_used)(physid_mask_t *map, int apicid);
@@ -492,7 +492,7 @@ static inline unsigned int read_apic_id(void)
return apic->get_apic_id(reg);
 }
 
-extern int default_apic_id_valid(int apicid);
+extern int default_apic_id_valid(u32 apicid);
 extern int default_acpi_madt_oem_check(char *, char *);
 extern void default_setup_apic_routing(void);
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 2aa92094b59d..5ee33a6e33bb 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -200,7 +200,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, 
const unsigned long end)
 {
struct acpi_madt_local_x2apic *processor = NULL;
 #ifdef CONFIG_X86_X2APIC
-   int apic_id;
+   u32 apic_id;
u8 enabled;
 #endif
 
@@ -222,10 +222,13 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, 
const unsigned long end)
 * to not preallocating memory for all NR_CPUS
 * when we use CPU hotplug.
 */
-   if (!apic->apic_id_valid(apic_id) && enabled)
-   printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
-   else
-   acpi_register_lapic(apic_id, processor->uid, enabled);
+   if (!apic->apic_id_valid(apic_id)) {
+   if (enabled)
+   pr_warn(PREFIX "x2apic entry ignored\n");
+   return 0;
+   }
+
+   acpi_register_lapic(apic_id, processor->uid, enabled);
 #else
printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
 #endif
diff --git a/arch/x86/kernel/apic/apic_common.c 
b/arch/x86/kernel/apic/apic_common.c
index a360801779ae..02b4839478b1 100644
--- a/arch/x86/kernel/apic/apic_common.c
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -40,7 +40,7 @@ int default_check_phys_apicid_present(int phys_apicid)
return physid_isset(phys_apicid, phys_cpu_present_map);
 }
 
-int default_apic_id_valid(int apicid)
+int default_apic_id_valid(u32 apicid)
 {
return (apicid < 255);
 }
diff --git a/arch/x86/kernel/apic/apic_numachip.c 
b/arch/x86/kernel/apic/apic_numachip.c
index 134e04506ab4..78778b54f904 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -56,7 +56,7 @@ static u32 numachip2_set_apic_id(unsigned int id)
return id << 24;
 }
 
-static int numachip_apic_id_valid(int apicid)
+static int numachip_apic_id_valid(u32 apicid)
 {
/* Trust what bootloader passes in MADT */
return 1;
diff --git a/arch/x86/kernel/apic/x2apic.h b/arch/x86/kernel/apic/x2apic.h
index b107de381cb5..a49b3604027f 100644
--- a/arch/x86/kernel/apic

[tip:x86/urgent] x86/apic: Fix signedness bug in APIC ID validity checks

2018-04-10 Thread tip-bot for Li RongQing
Commit-ID:  57e95bee9f89560c5952409be8b41a4a2b7e384c
Gitweb: https://git.kernel.org/tip/57e95bee9f89560c5952409be8b41a4a2b7e384c
Author: Li RongQing <lirongq...@baidu.com>
AuthorDate: Tue, 10 Apr 2018 09:16:06 +0800
Committer:  Thomas Gleixner <t...@linutronix.de>
CommitDate: Tue, 10 Apr 2018 16:33:21 +0200

x86/apic: Fix signedness bug in APIC ID validity checks

The APIC ID as parsed from ACPI MADT is validity checked with the
apic->apic_id_valid() callback, which depends on the selected APIC type.

For non X2APIC types APIC IDs >= 0xFF are invalid, but values > 0x7FFFFFFF
are detected as valid. This happens because the 'apicid' argument of the
apic_id_valid() callback is type 'int'. So the resulting comparison

    apicid < 0xFF

evaluates to true for all unsigned int values > 0x7FFFFFFF which are handed
to default_apic_id_valid(). As a consequence, invalid APIC IDs in !X2APIC
mode are considered valid and accounted as possible CPUs.

Change the apicid argument type of the apic_id_valid() callback to u32 so
the evaluation is unsigned and returns the correct result.

[ tglx: Massaged changelog ]

Signed-off-by: Li RongQing <lirongq...@baidu.com>
Signed-off-by: Thomas Gleixner <t...@linutronix.de>
Cc: sta...@vger.kernel.org
Cc: jgr...@suse.com
Cc: Dou Liyang <douly.f...@cn.fujitsu.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: h...@zytor.com
Link: 
https://lkml.kernel.org/r/1523322966-10296-1-git-send-email-lirongq...@baidu.com

---
 arch/x86/include/asm/apic.h  |  4 ++--
 arch/x86/kernel/acpi/boot.c  | 13 -
 arch/x86/kernel/apic/apic_common.c   |  2 +-
 arch/x86/kernel/apic/apic_numachip.c |  2 +-
 arch/x86/kernel/apic/x2apic.h|  2 +-
 arch/x86/kernel/apic/x2apic_phys.c   |  2 +-
 arch/x86/kernel/apic/x2apic_uv_x.c   |  2 +-
 arch/x86/xen/apic.c  |  2 +-
 8 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 98722773391d..f01eef8b392e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -319,7 +319,7 @@ struct apic {
/* Probe, setup and smpboot functions */
int (*probe)(void);
int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
-   int (*apic_id_valid)(int apicid);
+   int (*apic_id_valid)(u32 apicid);
int (*apic_id_registered)(void);
 
bool(*check_apicid_used)(physid_mask_t *map, int apicid);
@@ -492,7 +492,7 @@ static inline unsigned int read_apic_id(void)
return apic->get_apic_id(reg);
 }
 
-extern int default_apic_id_valid(int apicid);
+extern int default_apic_id_valid(u32 apicid);
 extern int default_acpi_madt_oem_check(char *, char *);
 extern void default_setup_apic_routing(void);
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 2aa92094b59d..cc7ddc932cf2 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -200,7 +200,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, 
const unsigned long end)
 {
struct acpi_madt_local_x2apic *processor = NULL;
 #ifdef CONFIG_X86_X2APIC
-   int apic_id;
+   u32 apic_id;
u8 enabled;
 #endif
 
@@ -222,10 +222,13 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, 
const unsigned long end)
 * to not preallocating memory for all NR_CPUS
 * when we use CPU hotplug.
 */
-   if (!apic->apic_id_valid(apic_id) && enabled)
-   printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
-   else
-   acpi_register_lapic(apic_id, processor->uid, enabled);
+   if (!apic->apic_id_valid(apic_id)) {
+   if (enabled)
+   printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
+   return 0;
+   }
+
+   acpi_register_lapic(apic_id, processor->uid, enabled);
 #else
printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
 #endif
diff --git a/arch/x86/kernel/apic/apic_common.c 
b/arch/x86/kernel/apic/apic_common.c
index a360801779ae..02b4839478b1 100644
--- a/arch/x86/kernel/apic/apic_common.c
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -40,7 +40,7 @@ int default_check_phys_apicid_present(int phys_apicid)
return physid_isset(phys_apicid, phys_cpu_present_map);
 }
 
-int default_apic_id_valid(int apicid)
+int default_apic_id_valid(u32 apicid)
 {
return (apicid < 255);
 }
diff --git a/arch/x86/kernel/apic/apic_numachip.c 
b/arch/x86/kernel/apic/apic_numachip.c
index 134e04506ab4..78778b54f904 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -56,7 +56,7 @@ static u32 numachip2_set_apic_id(unsigned int id)
return id << 24;
 }
 
-static int numachip_apic_id_valid(int apicid)
+static int numachip_apic_id_valid(u32 apicid)
 {
/* Trust what bootloader pas

[tip:x86/urgent] x86/apic: Fix signedness bug in APIC ID validity checks

2018-04-10 Thread tip-bot for Li RongQing
Commit-ID:  57e95bee9f89560c5952409be8b41a4a2b7e384c
Gitweb: https://git.kernel.org/tip/57e95bee9f89560c5952409be8b41a4a2b7e384c
Author: Li RongQing 
AuthorDate: Tue, 10 Apr 2018 09:16:06 +0800
Committer:  Thomas Gleixner 
CommitDate: Tue, 10 Apr 2018 16:33:21 +0200

x86/apic: Fix signedness bug in APIC ID validity checks

The APIC ID as parsed from ACPI MADT is validity checked with the
apic->apic_id_valid() callback, which depends on the selected APIC type.

For non X2APIC types APIC IDs >= 0xFF are invalid, but values > 0x7FFFFFFF
are detected as valid. This happens because the 'apicid' argument of the
apic_id_valid() callback is type 'int'. So the resulting comparison

    apicid < 0xFF

evaluates to true for all unsigned int values > 0x7FFFFFFF which are handed
to default_apic_id_valid(). As a consequence, invalid APIC IDs in !X2APIC
mode are considered valid and accounted as possible CPUs.

Change the apicid argument type of the apic_id_valid() callback to u32 so
the evaluation is unsigned and returns the correct result.

[ tglx: Massaged changelog ]

Signed-off-by: Li RongQing 
Signed-off-by: Thomas Gleixner 
Cc: sta...@vger.kernel.org
Cc: jgr...@suse.com
Cc: Dou Liyang 
Cc: Peter Zijlstra 
Cc: h...@zytor.com
Link: 
https://lkml.kernel.org/r/1523322966-10296-1-git-send-email-lirongq...@baidu.com

---
 arch/x86/include/asm/apic.h  |  4 ++--
 arch/x86/kernel/acpi/boot.c  | 13 -
 arch/x86/kernel/apic/apic_common.c   |  2 +-
 arch/x86/kernel/apic/apic_numachip.c |  2 +-
 arch/x86/kernel/apic/x2apic.h|  2 +-
 arch/x86/kernel/apic/x2apic_phys.c   |  2 +-
 arch/x86/kernel/apic/x2apic_uv_x.c   |  2 +-
 arch/x86/xen/apic.c  |  2 +-
 8 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 98722773391d..f01eef8b392e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -319,7 +319,7 @@ struct apic {
/* Probe, setup and smpboot functions */
int (*probe)(void);
int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
-   int (*apic_id_valid)(int apicid);
+   int (*apic_id_valid)(u32 apicid);
int (*apic_id_registered)(void);
 
bool(*check_apicid_used)(physid_mask_t *map, int apicid);
@@ -492,7 +492,7 @@ static inline unsigned int read_apic_id(void)
return apic->get_apic_id(reg);
 }
 
-extern int default_apic_id_valid(int apicid);
+extern int default_apic_id_valid(u32 apicid);
 extern int default_acpi_madt_oem_check(char *, char *);
 extern void default_setup_apic_routing(void);
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 2aa92094b59d..cc7ddc932cf2 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -200,7 +200,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, 
const unsigned long end)
 {
struct acpi_madt_local_x2apic *processor = NULL;
 #ifdef CONFIG_X86_X2APIC
-   int apic_id;
+   u32 apic_id;
u8 enabled;
 #endif
 
@@ -222,10 +222,13 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, 
const unsigned long end)
 * to not preallocating memory for all NR_CPUS
 * when we use CPU hotplug.
 */
-   if (!apic->apic_id_valid(apic_id) && enabled)
-   printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
-   else
-   acpi_register_lapic(apic_id, processor->uid, enabled);
+   if (!apic->apic_id_valid(apic_id)) {
+   if (enabled)
+   printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
+   return 0;
+   }
+
+   acpi_register_lapic(apic_id, processor->uid, enabled);
 #else
printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
 #endif
diff --git a/arch/x86/kernel/apic/apic_common.c 
b/arch/x86/kernel/apic/apic_common.c
index a360801779ae..02b4839478b1 100644
--- a/arch/x86/kernel/apic/apic_common.c
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -40,7 +40,7 @@ int default_check_phys_apicid_present(int phys_apicid)
return physid_isset(phys_apicid, phys_cpu_present_map);
 }
 
-int default_apic_id_valid(int apicid)
+int default_apic_id_valid(u32 apicid)
 {
return (apicid < 255);
 }
diff --git a/arch/x86/kernel/apic/apic_numachip.c 
b/arch/x86/kernel/apic/apic_numachip.c
index 134e04506ab4..78778b54f904 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -56,7 +56,7 @@ static u32 numachip2_set_apic_id(unsigned int id)
return id << 24;
 }
 
-static int numachip_apic_id_valid(int apicid)
+static int numachip_apic_id_valid(u32 apicid)
 {
/* Trust what bootloader passes in MADT */
return 1;
diff --git a/arch/x86/kernel/apic/x2apic.h b/arch/x86/kernel/apic/x2apic.h
index b107de381cb5..a49b3604027f 100644
--- a/arch/x86/kernel

[PATCH][V2] x86/acpi: Prevent all the X2APIC Id from being parsed in non-x2apic mode

2018-04-09 Thread Li RongQing
The values of x2APIC IDs are greater than 0xff in ACPI MADT. If the apic
is apic_flat, default_apic_id_valid() is called to check the id, which
is converted from u32 to int and will return true if the id is larger
than 0x7fffffff; this is wrong.

and if local_apic_id is invalid, we should prevent it from being
accounted

This fixes a bug that some Purley platform displays many possible cpu

Signed-off-by: Li RongQing <lirongq...@baidu.com>
Cc: Dou Liyang <douly.f...@cn.fujitsu.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Thomas Gleixner <t...@linutronix.de>
---
 arch/x86/include/asm/apic.h  |  4 ++--
 arch/x86/kernel/acpi/boot.c  | 13 -
 arch/x86/kernel/apic/apic_common.c   |  2 +-
 arch/x86/kernel/apic/apic_numachip.c |  2 +-
 arch/x86/kernel/apic/x2apic.h|  2 +-
 arch/x86/kernel/apic/x2apic_phys.c   |  2 +-
 arch/x86/kernel/apic/x2apic_uv_x.c   |  2 +-
 arch/x86/xen/apic.c  |  2 +-
 8 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 40a3d3642f3a..08acd954f00e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -313,7 +313,7 @@ struct apic {
/* Probe, setup and smpboot functions */
int (*probe)(void);
int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
-   int (*apic_id_valid)(int apicid);
+   int (*apic_id_valid)(u32 apicid);
int (*apic_id_registered)(void);
 
bool(*check_apicid_used)(physid_mask_t *map, int apicid);
@@ -486,7 +486,7 @@ static inline unsigned int read_apic_id(void)
return apic->get_apic_id(reg);
 }
 
-extern int default_apic_id_valid(int apicid);
+extern int default_apic_id_valid(u32 apicid);
 extern int default_acpi_madt_oem_check(char *, char *);
 extern void default_setup_apic_routing(void);
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 7a37d9357bc4..4ba949de1ca9 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -200,7 +200,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, 
const unsigned long end)
 {
struct acpi_madt_local_x2apic *processor = NULL;
 #ifdef CONFIG_X86_X2APIC
-   int apic_id;
+   u32 apic_id;
u8 enabled;
 #endif
 
@@ -222,10 +222,13 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, 
const unsigned long end)
 * to not preallocating memory for all NR_CPUS
 * when we use CPU hotplug.
 */
-   if (!apic->apic_id_valid(apic_id) && enabled)
-   printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
-   else
-   acpi_register_lapic(apic_id, processor->uid, enabled);
+   if (!apic->apic_id_valid(apic_id)) {
+   if (enabled)
+   printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
+   return 0;
+   }
+
+   acpi_register_lapic(apic_id, processor->uid, enabled);
 #else
printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
 #endif
diff --git a/arch/x86/kernel/apic/apic_common.c 
b/arch/x86/kernel/apic/apic_common.c
index a360801779ae..02b4839478b1 100644
--- a/arch/x86/kernel/apic/apic_common.c
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -40,7 +40,7 @@ int default_check_phys_apicid_present(int phys_apicid)
return physid_isset(phys_apicid, phys_cpu_present_map);
 }
 
-int default_apic_id_valid(int apicid)
+int default_apic_id_valid(u32 apicid)
 {
return (apicid < 255);
 }
diff --git a/arch/x86/kernel/apic/apic_numachip.c 
b/arch/x86/kernel/apic/apic_numachip.c
index 134e04506ab4..78778b54f904 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -56,7 +56,7 @@ static u32 numachip2_set_apic_id(unsigned int id)
return id << 24;
 }
 
-static int numachip_apic_id_valid(int apicid)
+static int numachip_apic_id_valid(u32 apicid)
 {
/* Trust what bootloader passes in MADT */
return 1;
diff --git a/arch/x86/kernel/apic/x2apic.h b/arch/x86/kernel/apic/x2apic.h
index b107de381cb5..a49b3604027f 100644
--- a/arch/x86/kernel/apic/x2apic.h
+++ b/arch/x86/kernel/apic/x2apic.h
@@ -1,6 +1,6 @@
 /* Common bits for X2APIC cluster/physical modes. */
 
-int x2apic_apic_id_valid(int apicid);
+int x2apic_apic_id_valid(u32 apicid);
 int x2apic_apic_id_registered(void);
 void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int 
dest);
 unsigned int x2apic_get_apic_id(unsigned long id);
diff --git a/arch/x86/kernel/apic/x2apic_phys.c 
b/arch/x86/kernel/apic/x2apic_phys.c
index e2829bf40e4a..b5cf9e7b3830 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -101,7 +101,7 @@ static int x2apic_phys_probe(void)
 }
 
 /* Common x2apic functions, also used by x2apic_cluster */
-int x2apic_apic_id_valid(int apicid)
+int x2apic_apic_id_valid(u32 a

[PATCH][V2] x86/acpi: Prevent all the X2APIC Id from being parsed in non-x2apic mode

2018-04-09 Thread Li RongQing
The values of x2APIC IDs are greater than 0xff in ACPI MADT. If the apic
is apic_flat, default_apic_id_valid() is called to check the id, which
is converted from u32 to int and will return true if the id is larger
than 0x7fffffff; this is wrong.

and if local_apic_id is invalid, we should prevent it from being
accounted

This fixes a bug that some Purley platform displays many possible cpu

Signed-off-by: Li RongQing 
Cc: Dou Liyang 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
---
 arch/x86/include/asm/apic.h  |  4 ++--
 arch/x86/kernel/acpi/boot.c  | 13 -
 arch/x86/kernel/apic/apic_common.c   |  2 +-
 arch/x86/kernel/apic/apic_numachip.c |  2 +-
 arch/x86/kernel/apic/x2apic.h|  2 +-
 arch/x86/kernel/apic/x2apic_phys.c   |  2 +-
 arch/x86/kernel/apic/x2apic_uv_x.c   |  2 +-
 arch/x86/xen/apic.c  |  2 +-
 8 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 40a3d3642f3a..08acd954f00e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -313,7 +313,7 @@ struct apic {
/* Probe, setup and smpboot functions */
int (*probe)(void);
int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
-   int (*apic_id_valid)(int apicid);
+   int (*apic_id_valid)(u32 apicid);
int (*apic_id_registered)(void);
 
bool(*check_apicid_used)(physid_mask_t *map, int apicid);
@@ -486,7 +486,7 @@ static inline unsigned int read_apic_id(void)
return apic->get_apic_id(reg);
 }
 
-extern int default_apic_id_valid(int apicid);
+extern int default_apic_id_valid(u32 apicid);
 extern int default_acpi_madt_oem_check(char *, char *);
 extern void default_setup_apic_routing(void);
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 7a37d9357bc4..4ba949de1ca9 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -200,7 +200,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, 
const unsigned long end)
 {
struct acpi_madt_local_x2apic *processor = NULL;
 #ifdef CONFIG_X86_X2APIC
-   int apic_id;
+   u32 apic_id;
u8 enabled;
 #endif
 
@@ -222,10 +222,13 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, 
const unsigned long end)
 * to not preallocating memory for all NR_CPUS
 * when we use CPU hotplug.
 */
-   if (!apic->apic_id_valid(apic_id) && enabled)
-   printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
-   else
-   acpi_register_lapic(apic_id, processor->uid, enabled);
+   if (!apic->apic_id_valid(apic_id)) {
+   if (enabled)
+   printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
+   return 0;
+   }
+
+   acpi_register_lapic(apic_id, processor->uid, enabled);
 #else
printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
 #endif
diff --git a/arch/x86/kernel/apic/apic_common.c 
b/arch/x86/kernel/apic/apic_common.c
index a360801779ae..02b4839478b1 100644
--- a/arch/x86/kernel/apic/apic_common.c
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -40,7 +40,7 @@ int default_check_phys_apicid_present(int phys_apicid)
return physid_isset(phys_apicid, phys_cpu_present_map);
 }
 
-int default_apic_id_valid(int apicid)
+int default_apic_id_valid(u32 apicid)
 {
return (apicid < 255);
 }
diff --git a/arch/x86/kernel/apic/apic_numachip.c 
b/arch/x86/kernel/apic/apic_numachip.c
index 134e04506ab4..78778b54f904 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -56,7 +56,7 @@ static u32 numachip2_set_apic_id(unsigned int id)
return id << 24;
 }
 
-static int numachip_apic_id_valid(int apicid)
+static int numachip_apic_id_valid(u32 apicid)
 {
/* Trust what bootloader passes in MADT */
return 1;
diff --git a/arch/x86/kernel/apic/x2apic.h b/arch/x86/kernel/apic/x2apic.h
index b107de381cb5..a49b3604027f 100644
--- a/arch/x86/kernel/apic/x2apic.h
+++ b/arch/x86/kernel/apic/x2apic.h
@@ -1,6 +1,6 @@
 /* Common bits for X2APIC cluster/physical modes. */
 
-int x2apic_apic_id_valid(int apicid);
+int x2apic_apic_id_valid(u32 apicid);
 int x2apic_apic_id_registered(void);
 void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int 
dest);
 unsigned int x2apic_get_apic_id(unsigned long id);
diff --git a/arch/x86/kernel/apic/x2apic_phys.c 
b/arch/x86/kernel/apic/x2apic_phys.c
index e2829bf40e4a..b5cf9e7b3830 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -101,7 +101,7 @@ static int x2apic_phys_probe(void)
 }
 
 /* Common x2apic functions, also used by x2apic_cluster */
-int x2apic_apic_id_valid(int apicid)
+int x2apic_apic_id_valid(u32 apicid)
 {
return 1;
 }
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c 
b/arch/x86/kernel/apic/x2apic_uv_x.c

答复: 答复: [RFC PATCH] x86/acpi: Prevent x2apic id -1 from being accounted

2018-04-09 Thread Li,Rongqing


> -邮件原件-
> 发件人: linux-kernel-ow...@vger.kernel.org [mailto:linux-kernel-
> ow...@vger.kernel.org] 代表 Dou Liyang
> 发送时间: 2018年4月9日 16:47
> 收件人: Li,Rongqing <lirongq...@baidu.com>; linux-kernel@vger.kernel.org;
> t...@linutronix.de; mi...@redhat.com; h...@zytor.com; jgr...@suse.com;
> x...@kernel.org; pet...@infradead.org
> 主题: Re: 答复: [RFC PATCH] x86/acpi: Prevent x2apic id -1 from being
> accounted
> 
> RongQing,
> 
> At 04/09/2018 02:38 PM, Li,Rongqing wrote:
> >
> >
> >> -邮件原件-
> >> 发件人: Dou Liyang [mailto:douly.f...@cn.fujitsu.com]
> >> 发送时间: 2018年4月9日 13:38
> >> 收件人: Li,Rongqing <lirongq...@baidu.com>;
> >> linux-kernel@vger.kernel.org; t...@linutronix.de; mi...@redhat.com;
> >> h...@zytor.com; jgr...@suse.com; x...@kernel.org;
> pet...@infradead.org
> >> 主题: Re: [RFC PATCH] x86/acpi: Prevent x2apic id -1 from being
> >> accounted
> >>
> >> Hi RongQing,
> >>
> >> Is there a local x2apic whose ID is 0xffffffff in your machine?
> >>
> >
> > I think no
> 
> [...]
> 
> > [0.00] ACPI: X2APIC (apic_id[0xffffffff] uid[0x00] disabled)
> > [0.00] ACPI: X2APIC (apic_id[0xffffffff] uid[0x01] disabled)
> > [0.00] ACPI: X2APIC (apic_id[0xffffffff] uid[0x02] disabled)
> 
> Ah, sure enough!
> 
> [...]
> >
> >
> >> At 04/08/2018 07:38 PM, Li RongQing wrote:
> >>> local_apic_id of acpi_madt_local_x2apic is u32, it is converted to
> >>> int when checked by default_apic_id_valid() and return true if it is
> >>> larger than 0x7fff, this is wrong
> >>>
> >>
> >> For x2apic enabled systems,
> >>
> >> - the byte length of X2APIC ID is 4, and it can be larger than
> >>   0x7fff in theory
> >>
> >
> > Yes
> >
> >> - the ->apic_id_valid points to x2apic_apic_id_valid(), which always
> >>   returns _true_, not default_apic_id_valid().
> >>
> >
> > To this machine, the apic is apic_flat
> 
> I see, I am sorry the title and changelog make me misunderstand.
> 
>    Here, actually, we prevent all the X2APIC IDs from being parsed in
>non-x2apic mode, not just 0xffffffff, because the values of x2APIC IDs
>must be 255 or greater in ACPI MADT.
> 

Thanks. How about the below modification:

commit 96ba42cf87ce0e62d54c01bfa9a9479b2e87
Author: Li RongQing <lirongq...@baidu.com>
Date:   Sun Apr 8 18:54:10 2018 +0800

x86/acpi: Prevent all the X2APIC Id from being parsed in non-x2apic mode

the values of x2APIC ID is greater than 0xff in ACPI MADT, if acpi
is apic_flat, default_apic_id_valid() is called to check id which
is converted from u32 to int, will return true if id is larger than
0x7fff, this is wrong

and if local_apic_id is invalid, we should prevent it from being
accounted

This fixes a bug that Purley platform displays too many possible cpu

Signed-off-by: Li RongQing <lirongq...@baidu.com>
Suggested-by: Dou Liyang <douly.f...@cn.fujitsu.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Thomas Gleixner <t...@linutronix.de>

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 40a3d3642f3a..08acd954f00e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -313,7 +313,7 @@ struct apic {
/* Probe, setup and smpboot functions */
int (*probe)(void);
int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
-   int (*apic_id_valid)(int apicid);
+   int (*apic_id_valid)(u32 apicid);
int (*apic_id_registered)(void);
 
bool(*check_apicid_used)(physid_mask_t *map, int apicid);
@@ -486,7 +486,7 @@ static inline unsigned int read_apic_id(void)
return apic->get_apic_id(reg);
 }
 
-extern int default_apic_id_valid(int apicid);
+extern int default_apic_id_valid(u32 apicid);
 extern int default_acpi_madt_oem_check(char *, char *);
 extern void default_setup_apic_routing(void);
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 7a37d9357bc4..4ba949de1ca9 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -200,7 +200,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, 
const unsigned long end)
 {
struct acpi_madt_local_x2apic *processor = NULL;
 #ifdef CONFIG_X86_X2APIC
-   int apic_id;
+   u32 apic_id;
u8 enabled;
 #endif
 
@@ -222,10 +222,13 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, 
const unsigned long end)
 * to not preallocating memory for all NR_CPUS
 * when we use CPU hotplug.

答复: 答复: [RFC PATCH] x86/acpi: Prevent x2apic id -1 from being accounted

2018-04-09 Thread Li,Rongqing


> -邮件原件-
> 发件人: linux-kernel-ow...@vger.kernel.org [mailto:linux-kernel-
> ow...@vger.kernel.org] 代表 Dou Liyang
> 发送时间: 2018年4月9日 16:47
> 收件人: Li,Rongqing ; linux-kernel@vger.kernel.org;
> t...@linutronix.de; mi...@redhat.com; h...@zytor.com; jgr...@suse.com;
> x...@kernel.org; pet...@infradead.org
> 主题: Re: 答复: [RFC PATCH] x86/acpi: Prevent x2apic id -1 from being
> accounted
> 
> RongQing,
> 
> At 04/09/2018 02:38 PM, Li,Rongqing wrote:
> >
> >
> >> -邮件原件-
> >> 发件人: Dou Liyang [mailto:douly.f...@cn.fujitsu.com]
> >> 发送时间: 2018年4月9日 13:38
> >> 收件人: Li,Rongqing ;
> >> linux-kernel@vger.kernel.org; t...@linutronix.de; mi...@redhat.com;
> >> h...@zytor.com; jgr...@suse.com; x...@kernel.org;
> pet...@infradead.org
> >> 主题: Re: [RFC PATCH] x86/acpi: Prevent x2apic id -1 from being
> >> accounted
> >>
> >> Hi RongQing,
> >>
> >> Is there a local x2apic whose ID is 0xffffffff in your machine?
> >>
> >
> > I think no
> 
> [...]
> 
> > [0.00] ACPI: X2APIC (apic_id[0xffffffff] uid[0x00] disabled)
> > [0.000000] ACPI: X2APIC (apic_id[0xffffffff] uid[0x01] disabled)
> > [0.00] ACPI: X2APIC (apic_id[0xffffffff] uid[0x02] disabled)
> 
> Ah, sure enough!
> 
> [...]
> >
> >
> >> At 04/08/2018 07:38 PM, Li RongQing wrote:
> >>> local_apic_id of acpi_madt_local_x2apic is u32, it is converted to
> >>> int when checked by default_apic_id_valid() and return true if it is
> >>> larger than 0x7fff, this is wrong
> >>>
> >>
> >> For x2apic enabled systems,
> >>
> >> - the byte length of X2APIC ID is 4, and it can be larger than
> >>   0x7fff in theory
> >>
> >
> > Yes
> >
> >> - the ->apic_id_valid points to x2apic_apic_id_valid(), which always
> >>   returns _true_, not default_apic_id_valid().
> >>
> >
> > To this machine, the apic is apic_flat
> 
> I see, I am sorry the title and changelog make me misunderstand.
> 
>    Here, actually, we prevent all the X2APIC IDs from being parsed in
>non-x2apic mode, not just 0xffffffff, because the values of x2APIC IDs
>must be 255 or greater in ACPI MADT.
> 

Thanks. How about the below modification:

commit 96ba42cf87ce0e62d54c01bfa9a9479b2e87
Author: Li RongQing 
Date:   Sun Apr 8 18:54:10 2018 +0800

x86/acpi: Prevent all the X2APIC Id from being parsed in non-x2apic mode

the values of x2APIC ID is greater than 0xff in ACPI MADT, if acpi
is apic_flat, default_apic_id_valid() is called to check id which
is converted from u32 to int, will return true if id is larger than
0x7fff, this is wrong

and if local_apic_id is invalid, we should prevent it from being
accounted

This fixes a bug that Purley platform displays too many possible cpu

Signed-off-by: Li RongQing 
Suggested-by: Dou Liyang 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 40a3d3642f3a..08acd954f00e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -313,7 +313,7 @@ struct apic {
/* Probe, setup and smpboot functions */
int (*probe)(void);
int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
-   int (*apic_id_valid)(int apicid);
+   int (*apic_id_valid)(u32 apicid);
int (*apic_id_registered)(void);
 
bool(*check_apicid_used)(physid_mask_t *map, int apicid);
@@ -486,7 +486,7 @@ static inline unsigned int read_apic_id(void)
return apic->get_apic_id(reg);
 }
 
-extern int default_apic_id_valid(int apicid);
+extern int default_apic_id_valid(u32 apicid);
 extern int default_acpi_madt_oem_check(char *, char *);
 extern void default_setup_apic_routing(void);
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 7a37d9357bc4..4ba949de1ca9 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -200,7 +200,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, 
const unsigned long end)
 {
struct acpi_madt_local_x2apic *processor = NULL;
 #ifdef CONFIG_X86_X2APIC
-   int apic_id;
+   u32 apic_id;
u8 enabled;
 #endif
 
@@ -222,10 +222,13 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, 
const unsigned long end)
 * to not preallocating memory for all NR_CPUS
 * when we use CPU hotplug.
 */
-   if (!apic->apic_id_valid(apic_id) && enabled)
-   printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
-   else
-   acpi_register_lapic(apic_i

[RFC PATCH] x86/acpi: Prevent x2apic id -1 from being accounted

2018-04-08 Thread Li RongQing
local_apic_id of acpi_madt_local_x2apic is u32, it is converted to
int when checked by default_apic_id_valid() and return true if it is
larger than 0x7fff, this is wrong

and if local_apic_id is invalid, we should prevent it from being
accounted

This fixes a bug that Purley platform displays too many possible cpu

Signed-off-by: Li RongQing <lirongq...@baidu.com>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: Dou Liyang <douly.f...@cn.fujitsu.com>
---
 arch/x86/include/asm/apic.h  |  4 ++--
 arch/x86/kernel/acpi/boot.c  | 10 ++
 arch/x86/kernel/apic/apic_common.c   |  2 +-
 arch/x86/kernel/apic/apic_numachip.c |  2 +-
 arch/x86/kernel/apic/x2apic.h|  2 +-
 arch/x86/kernel/apic/x2apic_phys.c   |  2 +-
 arch/x86/kernel/apic/x2apic_uv_x.c   |  2 +-
 arch/x86/xen/apic.c  |  2 +-
 8 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 40a3d3642f3a..08acd954f00e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -313,7 +313,7 @@ struct apic {
/* Probe, setup and smpboot functions */
int (*probe)(void);
int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
-   int (*apic_id_valid)(int apicid);
+   int (*apic_id_valid)(u32 apicid);
int (*apic_id_registered)(void);
 
bool(*check_apicid_used)(physid_mask_t *map, int apicid);
@@ -486,7 +486,7 @@ static inline unsigned int read_apic_id(void)
return apic->get_apic_id(reg);
 }
 
-extern int default_apic_id_valid(int apicid);
+extern int default_apic_id_valid(u32 apicid);
 extern int default_acpi_madt_oem_check(char *, char *);
 extern void default_setup_apic_routing(void);
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 7a37d9357bc4..7412564dc2a7 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -200,7 +200,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, 
const unsigned long end)
 {
struct acpi_madt_local_x2apic *processor = NULL;
 #ifdef CONFIG_X86_X2APIC
-   int apic_id;
+   u32 apic_id;
u8 enabled;
 #endif
 
@@ -222,10 +222,12 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, 
const unsigned long end)
 * to not preallocating memory for all NR_CPUS
 * when we use CPU hotplug.
 */
-   if (!apic->apic_id_valid(apic_id) && enabled)
+   if (!apic->apic_id_valid(apic_id)) {
printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
-   else
-   acpi_register_lapic(apic_id, processor->uid, enabled);
+   return 0;
+   }
+
+   acpi_register_lapic(apic_id, processor->uid, enabled);
 #else
printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
 #endif
diff --git a/arch/x86/kernel/apic/apic_common.c 
b/arch/x86/kernel/apic/apic_common.c
index a360801779ae..02b4839478b1 100644
--- a/arch/x86/kernel/apic/apic_common.c
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -40,7 +40,7 @@ int default_check_phys_apicid_present(int phys_apicid)
return physid_isset(phys_apicid, phys_cpu_present_map);
 }
 
-int default_apic_id_valid(int apicid)
+int default_apic_id_valid(u32 apicid)
 {
return (apicid < 255);
 }
diff --git a/arch/x86/kernel/apic/apic_numachip.c 
b/arch/x86/kernel/apic/apic_numachip.c
index 134e04506ab4..78778b54f904 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -56,7 +56,7 @@ static u32 numachip2_set_apic_id(unsigned int id)
return id << 24;
 }
 
-static int numachip_apic_id_valid(int apicid)
+static int numachip_apic_id_valid(u32 apicid)
 {
/* Trust what bootloader passes in MADT */
return 1;
diff --git a/arch/x86/kernel/apic/x2apic.h b/arch/x86/kernel/apic/x2apic.h
index b107de381cb5..a49b3604027f 100644
--- a/arch/x86/kernel/apic/x2apic.h
+++ b/arch/x86/kernel/apic/x2apic.h
@@ -1,6 +1,6 @@
 /* Common bits for X2APIC cluster/physical modes. */
 
-int x2apic_apic_id_valid(int apicid);
+int x2apic_apic_id_valid(u32 apicid);
 int x2apic_apic_id_registered(void);
 void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int 
dest);
 unsigned int x2apic_get_apic_id(unsigned long id);
diff --git a/arch/x86/kernel/apic/x2apic_phys.c 
b/arch/x86/kernel/apic/x2apic_phys.c
index e2829bf40e4a..b5cf9e7b3830 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -101,7 +101,7 @@ static int x2apic_phys_probe(void)
 }
 
 /* Common x2apic functions, also used by x2apic_cluster */
-int x2apic_apic_id_valid(int apicid)
+int x2apic_apic_id_valid(u32 apicid)
 {
return 1;
 }
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c 
b/arch/x86/kernel/apic/x2apic_uv_x.c
index f11910b44638..efaf2d4f9c3c 100644
--- a/arch/x86/

[RFC PATCH] x86/acpi: Prevent x2apic id -1 from being accounted

2018-04-08 Thread Li RongQing
local_apic_id of acpi_madt_local_x2apic is u32; it is converted to
int when checked by default_apic_id_valid() and returns true if it is
larger than 0x7fffffff (negative when seen as int), this is wrong

and if local_apic_id is invalid, we should prevent it from being
accounted

This fixes a bug that Purley platform displays too many possible cpu

Signed-off-by: Li RongQing 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: Dou Liyang 
---
 arch/x86/include/asm/apic.h  |  4 ++--
 arch/x86/kernel/acpi/boot.c  | 10 ++
 arch/x86/kernel/apic/apic_common.c   |  2 +-
 arch/x86/kernel/apic/apic_numachip.c |  2 +-
 arch/x86/kernel/apic/x2apic.h|  2 +-
 arch/x86/kernel/apic/x2apic_phys.c   |  2 +-
 arch/x86/kernel/apic/x2apic_uv_x.c   |  2 +-
 arch/x86/xen/apic.c  |  2 +-
 8 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 40a3d3642f3a..08acd954f00e 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -313,7 +313,7 @@ struct apic {
/* Probe, setup and smpboot functions */
int (*probe)(void);
int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
-   int (*apic_id_valid)(int apicid);
+   int (*apic_id_valid)(u32 apicid);
int (*apic_id_registered)(void);
 
bool(*check_apicid_used)(physid_mask_t *map, int apicid);
@@ -486,7 +486,7 @@ static inline unsigned int read_apic_id(void)
return apic->get_apic_id(reg);
 }
 
-extern int default_apic_id_valid(int apicid);
+extern int default_apic_id_valid(u32 apicid);
 extern int default_acpi_madt_oem_check(char *, char *);
 extern void default_setup_apic_routing(void);
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 7a37d9357bc4..7412564dc2a7 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -200,7 +200,7 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, 
const unsigned long end)
 {
struct acpi_madt_local_x2apic *processor = NULL;
 #ifdef CONFIG_X86_X2APIC
-   int apic_id;
+   u32 apic_id;
u8 enabled;
 #endif
 
@@ -222,10 +222,12 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, 
const unsigned long end)
 * to not preallocating memory for all NR_CPUS
 * when we use CPU hotplug.
 */
-   if (!apic->apic_id_valid(apic_id) && enabled)
+   if (!apic->apic_id_valid(apic_id)) {
printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
-   else
-   acpi_register_lapic(apic_id, processor->uid, enabled);
+   return 0;
+   }
+
+   acpi_register_lapic(apic_id, processor->uid, enabled);
 #else
printk(KERN_WARNING PREFIX "x2apic entry ignored\n");
 #endif
diff --git a/arch/x86/kernel/apic/apic_common.c 
b/arch/x86/kernel/apic/apic_common.c
index a360801779ae..02b4839478b1 100644
--- a/arch/x86/kernel/apic/apic_common.c
+++ b/arch/x86/kernel/apic/apic_common.c
@@ -40,7 +40,7 @@ int default_check_phys_apicid_present(int phys_apicid)
return physid_isset(phys_apicid, phys_cpu_present_map);
 }
 
-int default_apic_id_valid(int apicid)
+int default_apic_id_valid(u32 apicid)
 {
return (apicid < 255);
 }
diff --git a/arch/x86/kernel/apic/apic_numachip.c 
b/arch/x86/kernel/apic/apic_numachip.c
index 134e04506ab4..78778b54f904 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -56,7 +56,7 @@ static u32 numachip2_set_apic_id(unsigned int id)
return id << 24;
 }
 
-static int numachip_apic_id_valid(int apicid)
+static int numachip_apic_id_valid(u32 apicid)
 {
/* Trust what bootloader passes in MADT */
return 1;
diff --git a/arch/x86/kernel/apic/x2apic.h b/arch/x86/kernel/apic/x2apic.h
index b107de381cb5..a49b3604027f 100644
--- a/arch/x86/kernel/apic/x2apic.h
+++ b/arch/x86/kernel/apic/x2apic.h
@@ -1,6 +1,6 @@
 /* Common bits for X2APIC cluster/physical modes. */
 
-int x2apic_apic_id_valid(int apicid);
+int x2apic_apic_id_valid(u32 apicid);
 int x2apic_apic_id_registered(void);
 void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int 
dest);
 unsigned int x2apic_get_apic_id(unsigned long id);
diff --git a/arch/x86/kernel/apic/x2apic_phys.c 
b/arch/x86/kernel/apic/x2apic_phys.c
index e2829bf40e4a..b5cf9e7b3830 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -101,7 +101,7 @@ static int x2apic_phys_probe(void)
 }
 
 /* Common x2apic functions, also used by x2apic_cluster */
-int x2apic_apic_id_valid(int apicid)
+int x2apic_apic_id_valid(u32 apicid)
 {
return 1;
 }
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c 
b/arch/x86/kernel/apic/x2apic_uv_x.c
index f11910b44638..efaf2d4f9c3c 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -557,7 +557,7 @@ static void uv_send_IPI_all(

re: [PATCH] mm: avoid the unnecessary waiting when force empty a cgroup

2018-04-03 Thread Li,Rongqing


> -邮件原件-
> 发件人: Michal Hocko [mailto:mho...@kernel.org]
> 发送时间: 2018年4月3日 16:05
> 收件人: Li,Rongqing <lirongq...@baidu.com>
> 抄送: han...@cmpxchg.org; vdavydov@gmail.com;
> cgro...@vger.kernel.org; linux...@kvack.org;
> linux-kernel@vger.kernel.org
> 主题: Re: [PATCH] mm: avoid the unnecessary waiting when force empty a
> cgroup
> 
> On Tue 03-04-18 15:12:09, Li RongQing wrote:
> > The number of writeback and dirty page can be read out from memcg, the
> > unnecessary waiting can be avoided by these counts
> 
> This changelog doesn't explain the problem and how the patch fixes it.

If a process in a memory cgroup takes some RSS, when force empty this memory 
cgroup, congestion_wait will be called unconditionally, there is 0.5 seconds 
delay

If use this patch, nearly no delay.


> Why do we need another throttling when we do already throttle in the reclaim
> path?

Do you mean we should remove congestion_wait(BLK_RW_ASYNC, HZ/10) from 
mem_cgroup_force_empty, since try_to_free_mem_cgroup_pages 
[shrink_inactive_list] has called congestion_wait


-RongQing

> 
> > Signed-off-by: Li RongQing <lirongq...@baidu.com>
> > ---
> >  mm/memcontrol.c | 8 ++--
> >  1 file changed, 6 insertions(+), 2 deletions(-)
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c index
> > 9ec024b862ac..5258651bd4ec 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -2613,9 +2613,13 @@ static int mem_cgroup_force_empty(struct
> mem_cgroup *memcg)
> > progress = try_to_free_mem_cgroup_pages(memcg, 1,
> > GFP_KERNEL, true);
> > if (!progress) {
> > +   unsigned long num;
> > +
> > +   num = memcg_page_state(memcg, NR_WRITEBACK) +
> > +   memcg_page_state(memcg, NR_FILE_DIRTY);
> > nr_retries--;
> > -   /* maybe some writeback is necessary */
> > -   congestion_wait(BLK_RW_ASYNC, HZ/10);
> > +   if (num)
> > +   congestion_wait(BLK_RW_ASYNC, HZ/10);
> > }
> >
> > }
> > --
> > 2.11.0
> 
> --
> Michal Hocko
> SUSE Labs


re: [PATCH] mm: avoid the unnecessary waiting when force empty a cgroup

2018-04-03 Thread Li,Rongqing


> -邮件原件-
> 发件人: Michal Hocko [mailto:mho...@kernel.org]
> 发送时间: 2018年4月3日 16:05
> 收件人: Li,Rongqing 
> 抄送: han...@cmpxchg.org; vdavydov@gmail.com;
> cgro...@vger.kernel.org; linux...@kvack.org;
> linux-kernel@vger.kernel.org
> 主题: Re: [PATCH] mm: avoid the unnecessary waiting when force empty a
> cgroup
> 
> On Tue 03-04-18 15:12:09, Li RongQing wrote:
> > The number of writeback and dirty page can be read out from memcg, the
> > unnecessary waiting can be avoided by these counts
> 
> This changelog doesn't explain the problem and how the patch fixes it.

If a process in a memory cgroup takes some RSS, when force empty this memory 
cgroup, congestion_wait will be called unconditionally, there is 0.5 seconds 
delay

If use this patch, nearly no delay.


> Why do we need another throttling when we do already throttle in the reclaim
> path?

Do you mean we should remove congestion_wait(BLK_RW_ASYNC, HZ/10) from 
mem_cgroup_force_empty, since try_to_free_mem_cgroup_pages 
[shrink_inactive_list] has called congestion_wait


-RongQing

> 
> > Signed-off-by: Li RongQing 
> > ---
> >  mm/memcontrol.c | 8 ++--
> >  1 file changed, 6 insertions(+), 2 deletions(-)
> >
> > diff --git a/mm/memcontrol.c b/mm/memcontrol.c index
> > 9ec024b862ac..5258651bd4ec 100644
> > --- a/mm/memcontrol.c
> > +++ b/mm/memcontrol.c
> > @@ -2613,9 +2613,13 @@ static int mem_cgroup_force_empty(struct
> mem_cgroup *memcg)
> > progress = try_to_free_mem_cgroup_pages(memcg, 1,
> > GFP_KERNEL, true);
> > if (!progress) {
> > +   unsigned long num;
> > +
> > +   num = memcg_page_state(memcg, NR_WRITEBACK) +
> > +   memcg_page_state(memcg, NR_FILE_DIRTY);
> > nr_retries--;
> > -   /* maybe some writeback is necessary */
> > -   congestion_wait(BLK_RW_ASYNC, HZ/10);
> > +   if (num)
> > +   congestion_wait(BLK_RW_ASYNC, HZ/10);
> > }
> >
> > }
> > --
> > 2.11.0
> 
> --
> Michal Hocko
> SUSE Labs


[PATCH] mm: avoid the unnecessary waiting when force empty a cgroup

2018-04-03 Thread Li RongQing
The number of writeback and dirty page can be read out from memcg,
the unnecessary waiting can be avoided by these counts

Signed-off-by: Li RongQing <lirongq...@baidu.com>
---
 mm/memcontrol.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9ec024b862ac..5258651bd4ec 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2613,9 +2613,13 @@ static int mem_cgroup_force_empty(struct mem_cgroup 
*memcg)
progress = try_to_free_mem_cgroup_pages(memcg, 1,
GFP_KERNEL, true);
if (!progress) {
+   unsigned long num;
+
+   num = memcg_page_state(memcg, NR_WRITEBACK) +
+   memcg_page_state(memcg, NR_FILE_DIRTY);
nr_retries--;
-   /* maybe some writeback is necessary */
-   congestion_wait(BLK_RW_ASYNC, HZ/10);
+   if (num)
+   congestion_wait(BLK_RW_ASYNC, HZ/10);
}
 
}
-- 
2.11.0



[PATCH] mm: avoid the unnecessary waiting when force empty a cgroup

2018-04-03 Thread Li RongQing
The number of writeback and dirty page can be read out from memcg,
the unnecessary waiting can be avoided by these counts

Signed-off-by: Li RongQing 
---
 mm/memcontrol.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9ec024b862ac..5258651bd4ec 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2613,9 +2613,13 @@ static int mem_cgroup_force_empty(struct mem_cgroup 
*memcg)
progress = try_to_free_mem_cgroup_pages(memcg, 1,
GFP_KERNEL, true);
if (!progress) {
+   unsigned long num;
+
+   num = memcg_page_state(memcg, NR_WRITEBACK) +
+   memcg_page_state(memcg, NR_FILE_DIRTY);
nr_retries--;
-   /* maybe some writeback is necessary */
-   congestion_wait(BLK_RW_ASYNC, HZ/10);
+   if (num)
+   congestion_wait(BLK_RW_ASYNC, HZ/10);
}
 
}
-- 
2.11.0



[PATCH] genirq: only scan the present CPUs

2018-04-01 Thread Li RongQing
lots of application will read /proc/stat, like ps and vmstat, but we
find the reading time are spreading on Purley platform which has lots
of possible CPUs and interrupt.

To reduce the reading time, only scan the present CPUs, not all possible
CPUs, which speeds the reading of /proc/stat 20 times on Purley platform
which has 56 present CPUs, and 224 possible CPUs

Signed-off-by: Li RongQing <lirongq...@baidu.com>
---
 kernel/irq/irqdesc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 49b54e9979cc..8f489b73733e 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -902,7 +902,7 @@ unsigned int kstat_irqs(unsigned int irq)
 
if (!desc || !desc->kstat_irqs)
return 0;
-   for_each_possible_cpu(cpu)
+   for_each_present_cpu(cpu)
sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
return sum;
 }
-- 
2.11.0



[PATCH] genirq: only scan the present CPUs

2018-04-01 Thread Li RongQing
lots of application will read /proc/stat, like ps and vmstat, but we
find the reading time are spreading on Purley platform which has lots
of possible CPUs and interrupt.

To reduce the reading time, only scan the present CPUs, not all possible
CPUs, which speeds the reading of /proc/stat 20 times on Purley platform
which has 56 present CPUs, and 224 possible CPUs

Signed-off-by: Li RongQing 
---
 kernel/irq/irqdesc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 49b54e9979cc..8f489b73733e 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -902,7 +902,7 @@ unsigned int kstat_irqs(unsigned int irq)
 
if (!desc || !desc->kstat_irqs)
return 0;
-   for_each_possible_cpu(cpu)
+   for_each_present_cpu(cpu)
sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
return sum;
 }
-- 
2.11.0



[PATCH] mm: limit a process RSS

2018-03-29 Thread Li RongQing
we cannot limit a process RSS although there is ulimit -m,
not sure why and when ulimit -m is not working, make it work

similar requirement:
https://stackoverflow.com/questions/3360348/why-ulimit-cant-limit-resident-memory-successfully-and-how

Signed-off-by: Li RongQing <lirongq...@baidu.com>
---
 mm/memory.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/mm/memory.c b/mm/memory.c
index 5fcfc24904d1..50cf9399477c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4140,6 +4140,9 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned 
long address,
ret = __handle_mm_fault(vma, address, flags);
 
if (flags & FAULT_FLAG_USER) {
+   unsigned long total_rss = get_mm_rss(current->mm);
+   u64 rlimit;
+
mem_cgroup_oom_disable();
/*
 * The task may have entered a memcg OOM situation but
@@ -4149,6 +4152,17 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned 
long address,
 */
if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
mem_cgroup_oom_synchronize(false);
+
+   rlimit = current->signal->rlim[RLIMIT_RSS].rlim_cur;
+
+   if (unlikely(total_rss > (rlimit >> PAGE_SHIFT)) &&
+   (current->pid != 1)) {
+
+   pr_info("kill process %s rsslimit[%lluK] rss[%luK]\n",
+   current->comm, (rlimit >> 10),
+   total_rss << (PAGE_SHIFT - 10));
+   do_group_exit(SIGKILL);
+   }
}
 
return ret;
-- 
2.11.0



[PATCH] mm: limit a process RSS

2018-03-29 Thread Li RongQing
we cannot limit a process RSS although there is ulimit -m,
not sure why and when ulimit -m is not working, make it work

similar requirement:
https://stackoverflow.com/questions/3360348/why-ulimit-cant-limit-resident-memory-successfully-and-how

Signed-off-by: Li RongQing 
---
 mm/memory.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/mm/memory.c b/mm/memory.c
index 5fcfc24904d1..50cf9399477c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4140,6 +4140,9 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned 
long address,
ret = __handle_mm_fault(vma, address, flags);
 
if (flags & FAULT_FLAG_USER) {
+   unsigned long total_rss = get_mm_rss(current->mm);
+   u64 rlimit;
+
mem_cgroup_oom_disable();
/*
 * The task may have entered a memcg OOM situation but
@@ -4149,6 +4152,17 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned 
long address,
 */
if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
mem_cgroup_oom_synchronize(false);
+
+   rlimit = current->signal->rlim[RLIMIT_RSS].rlim_cur;
+
+   if (unlikely(total_rss > (rlimit >> PAGE_SHIFT)) &&
+   (current->pid != 1)) {
+
+   pr_info("kill process %s rsslimit[%lluK] rss[%luK]\n",
+   current->comm, (rlimit >> 10),
+   total_rss << (PAGE_SHIFT - 10));
+   do_group_exit(SIGKILL);
+   }
}
 
return ret;
-- 
2.11.0



[PATCH] mm/list_lru: replace spinlock with RCU in __list_lru_count_one

2018-03-27 Thread Li RongQing
when reclaiming memory, shrink_slab will take lots of time even if
no memory is reclaimed, since list_lru_count_one called by it
needs to take a spinlock

try to optimize it by replacing spinlock with RCU in
__list_lru_count_one

$dd if=aaa  of=bbb  bs=1k count=3886080
$rm -f bbb
$time echo 1 >/cgroup/memory/test/memory.limit_in_bytes

Before: 0m0.415s ===> after: 0m0.395s

Signed-off-by: Li RongQing <lirongq...@baidu.com>
---
 include/linux/list_lru.h |  2 ++
 mm/list_lru.c| 69 ++--
 2 files changed, 51 insertions(+), 20 deletions(-)

diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index bb8129a3474d..ae472538038e 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -29,6 +29,7 @@ struct list_lru_one {
struct list_headlist;
/* may become negative during memcg reparenting */
longnr_items;
+   struct rcu_head rcu;
 };
 
 struct list_lru_memcg {
@@ -46,6 +47,7 @@ struct list_lru_node {
struct list_lru_memcg   *memcg_lrus;
 #endif
long nr_items;
+   struct rcu_head rcu;
 } cacheline_aligned_in_smp;
 
 struct list_lru {
diff --git a/mm/list_lru.c b/mm/list_lru.c
index fd41e969ede5..4c58ed861729 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -52,13 +52,13 @@ static inline bool list_lru_memcg_aware(struct list_lru 
*lru)
 static inline struct list_lru_one *
 list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
 {
-   /*
-* The lock protects the array of per cgroup lists from relocation
-* (see memcg_update_list_lru_node).
-*/
-   lockdep_assert_held(&nlru->lock);
-   if (nlru->memcg_lrus && idx >= 0)
-   return nlru->memcg_lrus->lru[idx];
+   struct list_lru_memcg *tmp;
+
+   WARN_ON_ONCE(!rcu_read_lock_held());
+
+   tmp = rcu_dereference(nlru->memcg_lrus);
+   if (tmp && idx >= 0)
+   return rcu_dereference(tmp->lru[idx]);
 
	return &nlru->lru;
 }
@@ -113,14 +113,17 @@ bool list_lru_add(struct list_lru *lru, struct list_head 
*item)
struct list_lru_one *l;
 
	spin_lock(&nlru->lock);
+   rcu_read_lock();
	if (list_empty(item)) {
	l = list_lru_from_kmem(nlru, item);
	list_add_tail(item, &l->list);
	l->nr_items++;
	nlru->nr_items++;
+   rcu_read_unlock();
	spin_unlock(&nlru->lock);
	return true;
	}
+   rcu_read_unlock();
	spin_unlock(&nlru->lock);
return false;
 }
@@ -133,14 +136,17 @@ bool list_lru_del(struct list_lru *lru, struct list_head 
*item)
struct list_lru_one *l;
 
	spin_lock(&nlru->lock);
+   rcu_read_lock();
	if (!list_empty(item)) {
	l = list_lru_from_kmem(nlru, item);
	list_del_init(item);
	l->nr_items--;
	nlru->nr_items--;
+   rcu_read_unlock();
	spin_unlock(&nlru->lock);
	return true;
	}
+   rcu_read_unlock();
	spin_unlock(&nlru->lock);
return false;
 }
@@ -166,12 +172,13 @@ static unsigned long __list_lru_count_one(struct list_lru 
*lru,
 {
struct list_lru_node *nlru = >node[nid];
struct list_lru_one *l;
-   unsigned long count;
+   unsigned long count = 0;
 
-   spin_lock(&nlru->lock);
+   rcu_read_lock();
	l = list_lru_from_memcg_idx(nlru, memcg_idx);
-   count = l->nr_items;
-   spin_unlock(&nlru->lock);
+   if (l)
+   count = l->nr_items;
+   rcu_read_unlock();
 
return count;
 }
@@ -204,6 +211,7 @@ __list_lru_walk_one(struct list_lru *lru, int nid, int 
memcg_idx,
unsigned long isolated = 0;
 
	spin_lock(&nlru->lock);
+   rcu_read_lock();
	l = list_lru_from_memcg_idx(nlru, memcg_idx);
 restart:
	list_for_each_safe(item, n, &l->list) {
@@ -250,6 +258,7 @@ __list_lru_walk_one(struct list_lru *lru, int nid, int 
memcg_idx,
}
}
 
+   rcu_read_unlock();
	spin_unlock(&nlru->lock);
return isolated;
 }
@@ -296,9 +305,14 @@ static void __memcg_destroy_list_lru_node(struct 
list_lru_memcg *memcg_lrus,
  int begin, int end)
 {
int i;
+   struct list_lru_one *tmp;
 
-   for (i = begin; i < end; i++)
-   kfree(memcg_lrus->lru[i]);
+   for (i = begin; i < end; i++) {
+   tmp = memcg_lrus->lru[i];
+   rcu_assign_pointer(memcg_lrus->lru[i], NULL);
+   if (tmp)
+   kfree_rcu(tmp, rcu);
+   }
 }
 
 static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus,
@@ -314,7 +328,7 @@ static int __memcg_init_list_lru_node(struct list_lru_memcg 
*memcg_lrus,
 

[PATCH] mm/list_lru: replace spinlock with RCU in __list_lru_count_one

2018-03-27 Thread Li RongQing
when reclaiming memory, shrink_slab will take lots of time even if
no memory is reclaimed, since list_lru_count_one called by it
needs to take a spinlock

try to optimize it by replacing spinlock with RCU in
__list_lru_count_one

$dd if=aaa  of=bbb  bs=1k count=3886080
$rm -f bbb
$time echo 1 >/cgroup/memory/test/memory.limit_in_bytes

Before: 0m0.415s ===> after: 0m0.395s

Signed-off-by: Li RongQing 
---
 include/linux/list_lru.h |  2 ++
 mm/list_lru.c| 69 ++--
 2 files changed, 51 insertions(+), 20 deletions(-)

diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index bb8129a3474d..ae472538038e 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -29,6 +29,7 @@ struct list_lru_one {
struct list_headlist;
/* may become negative during memcg reparenting */
longnr_items;
+   struct rcu_head rcu;
 };
 
 struct list_lru_memcg {
@@ -46,6 +47,7 @@ struct list_lru_node {
struct list_lru_memcg   *memcg_lrus;
 #endif
long nr_items;
+   struct rcu_head rcu;
 } cacheline_aligned_in_smp;
 
 struct list_lru {
diff --git a/mm/list_lru.c b/mm/list_lru.c
index fd41e969ede5..4c58ed861729 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -52,13 +52,13 @@ static inline bool list_lru_memcg_aware(struct list_lru 
*lru)
 static inline struct list_lru_one *
 list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
 {
-   /*
-* The lock protects the array of per cgroup lists from relocation
-* (see memcg_update_list_lru_node).
-*/
-   lockdep_assert_held(&nlru->lock);
-   if (nlru->memcg_lrus && idx >= 0)
-   return nlru->memcg_lrus->lru[idx];
+   struct list_lru_memcg *tmp;
+
+   WARN_ON_ONCE(!rcu_read_lock_held());
+
+   tmp = rcu_dereference(nlru->memcg_lrus);
+   if (tmp && idx >= 0)
+   return rcu_dereference(tmp->lru[idx]);
 
	return &nlru->lru;
 }
@@ -113,14 +113,17 @@ bool list_lru_add(struct list_lru *lru, struct list_head 
*item)
struct list_lru_one *l;
 
	spin_lock(&nlru->lock);
+   rcu_read_lock();
	if (list_empty(item)) {
	l = list_lru_from_kmem(nlru, item);
	list_add_tail(item, &l->list);
	l->nr_items++;
	nlru->nr_items++;
+   rcu_read_unlock();
	spin_unlock(&nlru->lock);
	return true;
	}
+   rcu_read_unlock();
	spin_unlock(&nlru->lock);
return false;
 }
@@ -133,14 +136,17 @@ bool list_lru_del(struct list_lru *lru, struct list_head 
*item)
struct list_lru_one *l;
 
	spin_lock(&nlru->lock);
+   rcu_read_lock();
	if (!list_empty(item)) {
	l = list_lru_from_kmem(nlru, item);
	list_del_init(item);
	l->nr_items--;
	nlru->nr_items--;
+   rcu_read_unlock();
	spin_unlock(&nlru->lock);
	return true;
	}
+   rcu_read_unlock();
	spin_unlock(&nlru->lock);
return false;
 }
@@ -166,12 +172,13 @@ static unsigned long __list_lru_count_one(struct list_lru 
*lru,
 {
struct list_lru_node *nlru = >node[nid];
struct list_lru_one *l;
-   unsigned long count;
+   unsigned long count = 0;
 
-   spin_lock(&nlru->lock);
+   rcu_read_lock();
	l = list_lru_from_memcg_idx(nlru, memcg_idx);
-   count = l->nr_items;
-   spin_unlock(&nlru->lock);
+   if (l)
+   count = l->nr_items;
+   rcu_read_unlock();
 
return count;
 }
@@ -204,6 +211,7 @@ __list_lru_walk_one(struct list_lru *lru, int nid, int 
memcg_idx,
unsigned long isolated = 0;
 
	spin_lock(&nlru->lock);
+   rcu_read_lock();
	l = list_lru_from_memcg_idx(nlru, memcg_idx);
 restart:
	list_for_each_safe(item, n, &l->list) {
@@ -250,6 +258,7 @@ __list_lru_walk_one(struct list_lru *lru, int nid, int 
memcg_idx,
}
}
 
+   rcu_read_unlock();
	spin_unlock(&nlru->lock);
return isolated;
 }
@@ -296,9 +305,14 @@ static void __memcg_destroy_list_lru_node(struct 
list_lru_memcg *memcg_lrus,
  int begin, int end)
 {
int i;
+   struct list_lru_one *tmp;
 
-   for (i = begin; i < end; i++)
-   kfree(memcg_lrus->lru[i]);
+   for (i = begin; i < end; i++) {
+   tmp = memcg_lrus->lru[i];
+   rcu_assign_pointer(memcg_lrus->lru[i], NULL);
+   if (tmp)
+   kfree_rcu(tmp, rcu);
+   }
 }
 
 static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus,
@@ -314,7 +328,7 @@ static int __memcg_init_list_lru_node(struct list_lru_memcg 
*memcg_lrus,
goto fail;
 

答复: 答复: 答复: [PATCH] mm/memcontrol.c: speed up to force empty a memory cgroup

2018-03-23 Thread Li,Rongqing


> -邮件原件-
> 发件人: Michal Hocko [mailto:mho...@kernel.org]
> 发送时间: 2018年3月23日 18:09
> 收件人: Li,Rongqing <lirongq...@baidu.com>
> 抄送: linux-kernel@vger.kernel.org; linux...@kvack.org;
> cgro...@vger.kernel.org; han...@cmpxchg.org; Andrey Ryabinin
> <aryabi...@virtuozzo.com>
> 主题: Re: 答复: 答复: [PATCH] mm/memcontrol.c: speed up to force empty
> a memory cgroup
> 
> On Fri 23-03-18 02:58:36, Li,Rongqing wrote:
> >
> >
> > > -邮件原件-
> > > 发件人: linux-kernel-ow...@vger.kernel.org
> > > [mailto:linux-kernel-ow...@vger.kernel.org] 代表 Li,Rongqing
> > > 发送时间: 2018年3月19日 18:52
> > > 收件人: Michal Hocko <mho...@kernel.org>
> > > 抄送: linux-kernel@vger.kernel.org; linux...@kvack.org;
> > > cgro...@vger.kernel.org; han...@cmpxchg.org; Andrey Ryabinin
> > > <aryabi...@virtuozzo.com>
> > > 主题: 答复: 答复: [PATCH] mm/memcontrol.c: speed up to force
> empty a
> > > memory cgroup
> > >
> > >
> > >
> > > > -邮件原件-
> > > > 发件人: Michal Hocko [mailto:mho...@kernel.org]
> > > > 发送时间: 2018年3月19日 18:38
> > > > 收件人: Li,Rongqing <lirongq...@baidu.com>
> > > > 抄送: linux-kernel@vger.kernel.org; linux...@kvack.org;
> > > > cgro...@vger.kernel.org; han...@cmpxchg.org; Andrey Ryabinin
> > > > <aryabi...@virtuozzo.com>
> > > > 主题: Re: 答复: [PATCH] mm/memcontrol.c: speed up to force empty
> a
> > > memory
> > > > cgroup
> > > >
> > > > On Mon 19-03-18 10:00:41, Li,Rongqing wrote:
> > > > >
> > > > >
> > > > > > -邮件原件-
> > > > > > 发件人: Michal Hocko [mailto:mho...@kernel.org]
> > > > > > 发送时间: 2018年3月19日 16:54
> > > > > > 收件人: Li,Rongqing <lirongq...@baidu.com>
> > > > > > 抄送: linux-kernel@vger.kernel.org; linux...@kvack.org;
> > > > > > cgro...@vger.kernel.org; han...@cmpxchg.org; Andrey Ryabinin
> > > > > > <aryabi...@virtuozzo.com>
> > > > > > 主题: Re: [PATCH] mm/memcontrol.c: speed up to force empty a
> > > > memory
> > > > > > cgroup
> > > > > >
> > > > > > On Mon 19-03-18 16:29:30, Li RongQing wrote:
> > > > > > > mem_cgroup_force_empty() tries to free only 32
> > > > (SWAP_CLUSTER_MAX)
> > > > > > > pages on each iteration, if a memory cgroup has lots of page
> > > > > > > cache, it will take many iterations to empty all page cache,
> > > > > > > so increase the reclaimed number per iteration to speed it
> > > > > > > up. same as in
> > > > > > > mem_cgroup_resize_limit()
> > > > > > >
> > > > > > > a simple test show:
> > > > > > >
> > > > > > >   $dd if=aaa  of=bbb  bs=1k count=3886080
> > > > > > >   $rm -f bbb
> > > > > > >   $time echo
> > > > 1 >/cgroup/memory/test/memory.limit_in_bytes
> > > > > > >
> > > > > > > Before: 0m0.252s ===> after: 0m0.178s
> > > > > >
> > > > > > Andrey was proposing something similar [1]. My main objection
> > > > > > was that his approach might lead to over-reclaim. Your
> > > > > > approach is more conservative because it just increases the
> > > > > > batch size. The size is still rather arbitrary. Same as
> > > > > > SWAP_CLUSTER_MAX but that one is a commonly used unit of
> reclaim in the MM code.
> > > > > >
> > > > > > I would be really curious about more detailed explanation why
> > > > > > having a larger batch yields to a better performance because
> > > > > > we are doingg SWAP_CLUSTER_MAX batches at the lower reclaim
> > > > > > level
> > > anyway.
> > > > > >
> > > > >
> > > > > Although SWAP_CLUSTER_MAX is used at the lower level, but the
> > > > > call stack of try_to_free_mem_cgroup_pages is too long, increase
> > > > > the nr_to_reclaim can reduce times of calling
> > > > > function[do_try_to_free_pages, shrink_zones, shrink_node ]
> > > > >
> > > > > mem_cgroup_resize_limit
> > > > > --->try_to_free_mem_cgroup_pages:  .nr_to_reclaim = max(1024,
> > > > > --->SW

答复: 答复: 答复: [PATCH] mm/memcontrol.c: speed up to force empty a memory cgroup

2018-03-23 Thread Li,Rongqing


> -邮件原件-
> 发件人: Michal Hocko [mailto:mho...@kernel.org]
> 发送时间: 2018年3月23日 18:09
> 收件人: Li,Rongqing 
> 抄送: linux-kernel@vger.kernel.org; linux...@kvack.org;
> cgro...@vger.kernel.org; han...@cmpxchg.org; Andrey Ryabinin
> 
> 主题: Re: 答复: 答复: [PATCH] mm/memcontrol.c: speed up to force empty
> a memory cgroup
> 
> On Fri 23-03-18 02:58:36, Li,Rongqing wrote:
> >
> >
> > > -邮件原件-
> > > 发件人: linux-kernel-ow...@vger.kernel.org
> > > [mailto:linux-kernel-ow...@vger.kernel.org] 代表 Li,Rongqing
> > > 发送时间: 2018年3月19日 18:52
> > > 收件人: Michal Hocko 
> > > 抄送: linux-kernel@vger.kernel.org; linux...@kvack.org;
> > > cgro...@vger.kernel.org; han...@cmpxchg.org; Andrey Ryabinin
> > > 
> > > 主题: 答复: 答复: [PATCH] mm/memcontrol.c: speed up to force
> empty a
> > > memory cgroup
> > >
> > >
> > >
> > > > -邮件原件-
> > > > 发件人: Michal Hocko [mailto:mho...@kernel.org]
> > > > 发送时间: 2018年3月19日 18:38
> > > > 收件人: Li,Rongqing 
> > > > 抄送: linux-kernel@vger.kernel.org; linux...@kvack.org;
> > > > cgro...@vger.kernel.org; han...@cmpxchg.org; Andrey Ryabinin
> > > > 
> > > > 主题: Re: 答复: [PATCH] mm/memcontrol.c: speed up to force empty
> a
> > > memory
> > > > cgroup
> > > >
> > > > On Mon 19-03-18 10:00:41, Li,Rongqing wrote:
> > > > >
> > > > >
> > > > > > -邮件原件-
> > > > > > 发件人: Michal Hocko [mailto:mho...@kernel.org]
> > > > > > 发送时间: 2018年3月19日 16:54
> > > > > > 收件人: Li,Rongqing 
> > > > > > 抄送: linux-kernel@vger.kernel.org; linux...@kvack.org;
> > > > > > cgro...@vger.kernel.org; han...@cmpxchg.org; Andrey Ryabinin
> > > > > > 
> > > > > > 主题: Re: [PATCH] mm/memcontrol.c: speed up to force empty a
> > > > memory
> > > > > > cgroup
> > > > > >
> > > > > > On Mon 19-03-18 16:29:30, Li RongQing wrote:
> > > > > > > mem_cgroup_force_empty() tries to free only 32
> > > > (SWAP_CLUSTER_MAX)
> > > > > > > pages on each iteration, if a memory cgroup has lots of page
> > > > > > > cache, it will take many iterations to empty all page cache,
> > > > > > > so increase the reclaimed number per iteration to speed it
> > > > > > > up. same as in
> > > > > > > mem_cgroup_resize_limit()
> > > > > > >
> > > > > > > a simple test show:
> > > > > > >
> > > > > > >   $dd if=aaa  of=bbb  bs=1k count=3886080
> > > > > > >   $rm -f bbb
> > > > > > >   $time echo
> > > > 1 >/cgroup/memory/test/memory.limit_in_bytes
> > > > > > >
> > > > > > > Before: 0m0.252s ===> after: 0m0.178s
> > > > > >
> > > > > > Andrey was proposing something similar [1]. My main objection
> > > > > > was that his approach might lead to over-reclaim. Your
> > > > > > approach is more conservative because it just increases the
> > > > > > batch size. The size is still rather arbitrary. Same as
> > > > > > SWAP_CLUSTER_MAX but that one is a commonly used unit of
> reclaim in the MM code.
> > > > > >
> > > > > > I would be really curious about more detailed explanation why
> > > > > > having a larger batch yields to a better performance because
> > > > > > we are doingg SWAP_CLUSTER_MAX batches at the lower reclaim
> > > > > > level
> > > anyway.
> > > > > >
> > > > >
> > > > > Although SWAP_CLUSTER_MAX is used at the lower level, but the
> > > > > call stack of try_to_free_mem_cgroup_pages is too long, increase
> > > > > the nr_to_reclaim can reduce times of calling
> > > > > function[do_try_to_free_pages, shrink_zones, shrink_node ]
> > > > >
> > > > > mem_cgroup_resize_limit
> > > > > --->try_to_free_mem_cgroup_pages:  .nr_to_reclaim = max(1024,
> > > > > --->SWAP_CLUSTER_MAX),
> > > > >---> do_try_to_free_pages
> > > > >  ---> shrink_zones
> > > > >   --->shrink_node
> > > > >---> shrink_node_memcg
> > > > >

答复: 答复: [PATCH] mm/memcontrol.c: speed up to force empty a memory cgroup

2018-03-22 Thread Li,Rongqing


> -邮件原件-
> 发件人: linux-kernel-ow...@vger.kernel.org
> [mailto:linux-kernel-ow...@vger.kernel.org] 代表 Li,Rongqing
> 发送时间: 2018年3月19日 18:52
> 收件人: Michal Hocko <mho...@kernel.org>
> 抄送: linux-kernel@vger.kernel.org; linux...@kvack.org;
> cgro...@vger.kernel.org; han...@cmpxchg.org; Andrey Ryabinin
> <aryabi...@virtuozzo.com>
> 主题: 答复: 答复: [PATCH] mm/memcontrol.c: speed up to force empty a
> memory cgroup
> 
> 
> 
> > -邮件原件-
> > 发件人: Michal Hocko [mailto:mho...@kernel.org]
> > 发送时间: 2018年3月19日 18:38
> > 收件人: Li,Rongqing <lirongq...@baidu.com>
> > 抄送: linux-kernel@vger.kernel.org; linux...@kvack.org;
> > cgro...@vger.kernel.org; han...@cmpxchg.org; Andrey Ryabinin
> > <aryabi...@virtuozzo.com>
> > 主题: Re: 答复: [PATCH] mm/memcontrol.c: speed up to force empty a
> memory
> > cgroup
> >
> > On Mon 19-03-18 10:00:41, Li,Rongqing wrote:
> > >
> > >
> > > > -邮件原件-
> > > > 发件人: Michal Hocko [mailto:mho...@kernel.org]
> > > > 发送时间: 2018年3月19日 16:54
> > > > 收件人: Li,Rongqing <lirongq...@baidu.com>
> > > > 抄送: linux-kernel@vger.kernel.org; linux...@kvack.org;
> > > > cgro...@vger.kernel.org; han...@cmpxchg.org; Andrey Ryabinin
> > > > <aryabi...@virtuozzo.com>
> > > > 主题: Re: [PATCH] mm/memcontrol.c: speed up to force empty a
> > memory
> > > > cgroup
> > > >
> > > > On Mon 19-03-18 16:29:30, Li RongQing wrote:
> > > > > mem_cgroup_force_empty() tries to free only 32
> > (SWAP_CLUSTER_MAX)
> > > > > pages on each iteration, if a memory cgroup has lots of page
> > > > > cache, it will take many iterations to empty all page cache, so
> > > > > increase the reclaimed number per iteration to speed it up. same
> > > > > as in
> > > > > mem_cgroup_resize_limit()
> > > > >
> > > > > a simple test show:
> > > > >
> > > > >   $dd if=aaa  of=bbb  bs=1k count=3886080
> > > > >   $rm -f bbb
> > > > >   $time echo
> > 1 >/cgroup/memory/test/memory.limit_in_bytes
> > > > >
> > > > > Before: 0m0.252s ===> after: 0m0.178s
> > > >
> > > > Andrey was proposing something similar [1]. My main objection was
> > > > that his approach might lead to over-reclaim. Your approach is
> > > > more conservative because it just increases the batch size. The
> > > > size is still rather arbitrary. Same as SWAP_CLUSTER_MAX but that
> > > > one is a commonly used unit of reclaim in the MM code.
> > > >
> > > > I would be really curious about more detailed explanation why
> > > > having a larger batch yields to a better performance because we
> > > > are doing SWAP_CLUSTER_MAX batches at the lower reclaim level
> anyway.
> > > >
> > >
> > > Although SWAP_CLUSTER_MAX is used at the lower level, but the call
> > > stack of try_to_free_mem_cgroup_pages is too long, increase the
> > > nr_to_reclaim can reduce times of calling
> > > function[do_try_to_free_pages, shrink_zones, shrink_node ]
> > >
> > > mem_cgroup_resize_limit
> > > --->try_to_free_mem_cgroup_pages:  .nr_to_reclaim = max(1024,
> > > --->SWAP_CLUSTER_MAX),
> > >---> do_try_to_free_pages
> > >  ---> shrink_zones
> > >   --->shrink_node
> > >---> shrink_node_memcg
> > >  ---> shrink_list  <---loop will happen in this place
> > [times=1024/32]
> > >---> shrink_page_list
> >
> > Can you actually measure this to be the culprit. Because we should
> > rethink our call path if it is too complicated/deep to perform well.
> > Adding arbitrary batch sizes doesn't sound like a good way to go to me.
> 
> Ok, I will try
> 
http://pasted.co/4edbcfff

This is the result from the ftrace graph; it may prove that the deep call path
leads to low performance.

And increasing the number of pages reclaimed in try_to_free_mem_cgroup_pages
can reduce the number of calls to shrink_slab, which saves time. In my case,
page caches occupy most of the memory and slab is small, but shrink_slab will
be called every time.

Mutex_lock 1 us

try_to_free_mem_cgroup_pages
  do_try_to_free_pages ! 185.020 us
shrink_node  ! 116.529 us
  shrink_node_memcg   39.203
  shrink_inactive_list  33.960
  shrink_slab   72.955

shrink_node  61.502 us
  shrink_node_memcg   3.955
  shrink_slab   54.296 us

-RongQing

> -RongQing
> > --
> > Michal Hocko
> > SUSE Labs


答复: 答复: [PATCH] mm/memcontrol.c: speed up to force empty a memory cgroup

2018-03-22 Thread Li,Rongqing


> -邮件原件-
> 发件人: linux-kernel-ow...@vger.kernel.org
> [mailto:linux-kernel-ow...@vger.kernel.org] 代表 Li,Rongqing
> 发送时间: 2018年3月19日 18:52
> 收件人: Michal Hocko 
> 抄送: linux-kernel@vger.kernel.org; linux...@kvack.org;
> cgro...@vger.kernel.org; han...@cmpxchg.org; Andrey Ryabinin
> 
> 主题: 答复: 答复: [PATCH] mm/memcontrol.c: speed up to force empty a
> memory cgroup
> 
> 
> 
> > -邮件原件-
> > 发件人: Michal Hocko [mailto:mho...@kernel.org]
> > 发送时间: 2018年3月19日 18:38
> > 收件人: Li,Rongqing 
> > 抄送: linux-kernel@vger.kernel.org; linux...@kvack.org;
> > cgro...@vger.kernel.org; han...@cmpxchg.org; Andrey Ryabinin
> > 
> > 主题: Re: 答复: [PATCH] mm/memcontrol.c: speed up to force empty a
> memory
> > cgroup
> >
> > On Mon 19-03-18 10:00:41, Li,Rongqing wrote:
> > >
> > >
> > > > -邮件原件-
> > > > 发件人: Michal Hocko [mailto:mho...@kernel.org]
> > > > 发送时间: 2018年3月19日 16:54
> > > > 收件人: Li,Rongqing 
> > > > 抄送: linux-kernel@vger.kernel.org; linux...@kvack.org;
> > > > cgro...@vger.kernel.org; han...@cmpxchg.org; Andrey Ryabinin
> > > > 
> > > > 主题: Re: [PATCH] mm/memcontrol.c: speed up to force empty a
> > memory
> > > > cgroup
> > > >
> > > > On Mon 19-03-18 16:29:30, Li RongQing wrote:
> > > > > mem_cgroup_force_empty() tries to free only 32
> > (SWAP_CLUSTER_MAX)
> > > > > pages on each iteration, if a memory cgroup has lots of page
> > > > > cache, it will take many iterations to empty all page cache, so
> > > > > increase the reclaimed number per iteration to speed it up. same
> > > > > as in
> > > > > mem_cgroup_resize_limit()
> > > > >
> > > > > a simple test show:
> > > > >
> > > > >   $dd if=aaa  of=bbb  bs=1k count=3886080
> > > > >   $rm -f bbb
> > > > >   $time echo
> > 1 >/cgroup/memory/test/memory.limit_in_bytes
> > > > >
> > > > > Before: 0m0.252s ===> after: 0m0.178s
> > > >
> > > > Andrey was proposing something similar [1]. My main objection was
> > > > that his approach might lead to over-reclaim. Your approach is
> > > > more conservative because it just increases the batch size. The
> > > > size is still rather arbitrary. Same as SWAP_CLUSTER_MAX but that
> > > > one is a commonly used unit of reclaim in the MM code.
> > > >
> > > > I would be really curious about more detailed explanation why
> > > > having a larger batch yields to a better performance because we
> > > > are doing SWAP_CLUSTER_MAX batches at the lower reclaim level
> anyway.
> > > >
> > >
> > > Although SWAP_CLUSTER_MAX is used at the lower level, but the call
> > > stack of try_to_free_mem_cgroup_pages is too long, increase the
> > > nr_to_reclaim can reduce times of calling
> > > function[do_try_to_free_pages, shrink_zones, shrink_node ]
> > >
> > > mem_cgroup_resize_limit
> > > --->try_to_free_mem_cgroup_pages:  .nr_to_reclaim = max(1024,
> > > --->SWAP_CLUSTER_MAX),
> > >---> do_try_to_free_pages
> > >  ---> shrink_zones
> > >   --->shrink_node
> > >---> shrink_node_memcg
> > >  ---> shrink_list  <---loop will happen in this place
> > [times=1024/32]
> > >---> shrink_page_list
> >
> > Can you actually measure this to be the culprit. Because we should
> > rethink our call path if it is too complicated/deep to perform well.
> > Adding arbitrary batch sizes doesn't sound like a good way to go to me.
> 
> Ok, I will try
> 
http://pasted.co/4edbcfff

This is the result from the ftrace graph; it may prove that the deep call path
leads to low performance.

And increasing the number of pages reclaimed in try_to_free_mem_cgroup_pages
can reduce the number of calls to shrink_slab, which saves time. In my case,
page caches occupy most of the memory and slab is small, but shrink_slab will
be called every time.

Mutex_lock 1 us

try_to_free_mem_cgroup_pages
  do_try_to_free_pages ! 185.020 us
shrink_node  ! 116.529 us
  shrink_node_memcg   39.203
  shrink_inactive_list  33.960
  shrink_slab   72.955

shrink_node  61.502 us
  shrink_node_memcg   3.955
  shrink_slab   54.296 us

-RongQing

> -RongQing
> > --
> > Michal Hocko
> > SUSE Labs


答复: 答复: [PATCH] mm/memcontrol.c: speed up to force empty a memory cgroup

2018-03-19 Thread Li,Rongqing


> -邮件原件-
> 发件人: Michal Hocko [mailto:mho...@kernel.org]
> 发送时间: 2018年3月19日 18:38
> 收件人: Li,Rongqing <lirongq...@baidu.com>
> 抄送: linux-kernel@vger.kernel.org; linux...@kvack.org;
> cgro...@vger.kernel.org; han...@cmpxchg.org; Andrey Ryabinin
> <aryabi...@virtuozzo.com>
> 主题: Re: 答复: [PATCH] mm/memcontrol.c: speed up to force empty a
> memory cgroup
> 
> On Mon 19-03-18 10:00:41, Li,Rongqing wrote:
> >
> >
> > > -邮件原件-
> > > 发件人: Michal Hocko [mailto:mho...@kernel.org]
> > > 发送时间: 2018年3月19日 16:54
> > > 收件人: Li,Rongqing <lirongq...@baidu.com>
> > > 抄送: linux-kernel@vger.kernel.org; linux...@kvack.org;
> > > cgro...@vger.kernel.org; han...@cmpxchg.org; Andrey Ryabinin
> > > <aryabi...@virtuozzo.com>
> > > 主题: Re: [PATCH] mm/memcontrol.c: speed up to force empty a
> memory
> > > cgroup
> > >
> > > On Mon 19-03-18 16:29:30, Li RongQing wrote:
> > > > mem_cgroup_force_empty() tries to free only 32
> (SWAP_CLUSTER_MAX)
> > > > pages on each iteration, if a memory cgroup has lots of page
> > > > cache, it will take many iterations to empty all page cache, so
> > > > increase the reclaimed number per iteration to speed it up. same
> > > > as in
> > > > mem_cgroup_resize_limit()
> > > >
> > > > a simple test show:
> > > >
> > > >   $dd if=aaa  of=bbb  bs=1k count=3886080
> > > >   $rm -f bbb
> > > >   $time echo
> 1 >/cgroup/memory/test/memory.limit_in_bytes
> > > >
> > > > Before: 0m0.252s ===> after: 0m0.178s
> > >
> > > Andrey was proposing something similar [1]. My main objection was
> > > that his approach might lead to over-reclaim. Your approach is more
> > > conservative because it just increases the batch size. The size is
> > > still rather arbitrary. Same as SWAP_CLUSTER_MAX but that one is a
> > > commonly used unit of reclaim in the MM code.
> > >
> > > I would be really curious about more detailed explanation why having
> > > a larger batch yields to a better performance because we are doing
> > > SWAP_CLUSTER_MAX batches at the lower reclaim level anyway.
> > >
> >
> > Although SWAP_CLUSTER_MAX is used at the lower level, but the call
> > stack of try_to_free_mem_cgroup_pages is too long, increase the
> > nr_to_reclaim can reduce times of calling
> > function[do_try_to_free_pages, shrink_zones, shrink_node ]
> >
> > mem_cgroup_resize_limit
> > --->try_to_free_mem_cgroup_pages:  .nr_to_reclaim = max(1024,
> > --->SWAP_CLUSTER_MAX),
> >---> do_try_to_free_pages
> >  ---> shrink_zones
> >   --->shrink_node
> >---> shrink_node_memcg
> >  ---> shrink_list  <---loop will happen in this place
> [times=1024/32]
> >---> shrink_page_list
> 
> Can you actually measure this to be the culprit. Because we should rethink
> our call path if it is too complicated/deep to perform well.
> Adding arbitrary batch sizes doesn't sound like a good way to go to me.

Ok, I will try

-RongQing
> --
> Michal Hocko
> SUSE Labs


答复: 答复: [PATCH] mm/memcontrol.c: speed up to force empty a memory cgroup

2018-03-19 Thread Li,Rongqing


> -邮件原件-
> 发件人: Michal Hocko [mailto:mho...@kernel.org]
> 发送时间: 2018年3月19日 18:38
> 收件人: Li,Rongqing 
> 抄送: linux-kernel@vger.kernel.org; linux...@kvack.org;
> cgro...@vger.kernel.org; han...@cmpxchg.org; Andrey Ryabinin
> 
> 主题: Re: 答复: [PATCH] mm/memcontrol.c: speed up to force empty a
> memory cgroup
> 
> On Mon 19-03-18 10:00:41, Li,Rongqing wrote:
> >
> >
> > > -邮件原件-
> > > 发件人: Michal Hocko [mailto:mho...@kernel.org]
> > > 发送时间: 2018年3月19日 16:54
> > > 收件人: Li,Rongqing 
> > > 抄送: linux-kernel@vger.kernel.org; linux...@kvack.org;
> > > cgro...@vger.kernel.org; han...@cmpxchg.org; Andrey Ryabinin
> > > 
> > > 主题: Re: [PATCH] mm/memcontrol.c: speed up to force empty a
> memory
> > > cgroup
> > >
> > > On Mon 19-03-18 16:29:30, Li RongQing wrote:
> > > > mem_cgroup_force_empty() tries to free only 32
> (SWAP_CLUSTER_MAX)
> > > > pages on each iteration, if a memory cgroup has lots of page
> > > > cache, it will take many iterations to empty all page cache, so
> > > > increase the reclaimed number per iteration to speed it up. same
> > > > as in
> > > > mem_cgroup_resize_limit()
> > > >
> > > > a simple test show:
> > > >
> > > >   $dd if=aaa  of=bbb  bs=1k count=3886080
> > > >   $rm -f bbb
> > > >   $time echo
> 1 >/cgroup/memory/test/memory.limit_in_bytes
> > > >
> > > > Before: 0m0.252s ===> after: 0m0.178s
> > >
> > > Andrey was proposing something similar [1]. My main objection was
> > > that his approach might lead to over-reclaim. Your approach is more
> > > conservative because it just increases the batch size. The size is
> > > still rather arbitrary. Same as SWAP_CLUSTER_MAX but that one is a
> > > commonly used unit of reclaim in the MM code.
> > >
> > > I would be really curious about more detailed explanation why having
> > > a larger batch yields to a better performance because we are doing
> > > SWAP_CLUSTER_MAX batches at the lower reclaim level anyway.
> > >
> >
> > Although SWAP_CLUSTER_MAX is used at the lower level, but the call
> > stack of try_to_free_mem_cgroup_pages is too long, increase the
> > nr_to_reclaim can reduce times of calling
> > function[do_try_to_free_pages, shrink_zones, shrink_node ]
> >
> > mem_cgroup_resize_limit
> > --->try_to_free_mem_cgroup_pages:  .nr_to_reclaim = max(1024,
> > --->SWAP_CLUSTER_MAX),
> >---> do_try_to_free_pages
> >  ---> shrink_zones
> >   --->shrink_node
> >---> shrink_node_memcg
> >  ---> shrink_list  <---loop will happen in this place
> [times=1024/32]
> >---> shrink_page_list
> 
> Can you actually measure this to be the culprit. Because we should rethink
> our call path if it is too complicated/deep to perform well.
> Adding arbitrary batch sizes doesn't sound like a good way to go to me.

Ok, I will try

-RongQing
> --
> Michal Hocko
> SUSE Labs


[PATCH] mm/memcontrol.c: speed up to force empty a memory cgroup

2018-03-19 Thread Li RongQing
mem_cgroup_force_empty() tries to free only 32 (SWAP_CLUSTER_MAX) pages
on each iteration, if a memory cgroup has lots of page cache, it will
take many iterations to empty all page cache, so increase the reclaimed
number per iteration to speed it up. same as in mem_cgroup_resize_limit()

a simple test shows:

  $dd if=aaa  of=bbb  bs=1k count=3886080
  $rm -f bbb
  $time echo 1 >/cgroup/memory/test/memory.limit_in_bytes

Before: 0m0.252s ===> after: 0m0.178s

Signed-off-by: Li RongQing <lirongq...@baidu.com>
---
 mm/memcontrol.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 670e99b68aa6..8910d9e8e908 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2480,7 +2480,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup 
*memcg,
if (!ret)
break;
 
-   if (!try_to_free_mem_cgroup_pages(memcg, 1,
+   if (!try_to_free_mem_cgroup_pages(memcg, 1024,
GFP_KERNEL, !memsw)) {
ret = -EBUSY;
break;
@@ -2610,7 +2610,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup 
*memcg)
if (signal_pending(current))
return -EINTR;
 
-   progress = try_to_free_mem_cgroup_pages(memcg, 1,
+   progress = try_to_free_mem_cgroup_pages(memcg, 1024,
GFP_KERNEL, true);
if (!progress) {
nr_retries--;
-- 
2.11.0



[PATCH] mm/memcontrol.c: speed up to force empty a memory cgroup

2018-03-19 Thread Li RongQing
mem_cgroup_force_empty() tries to free only 32 (SWAP_CLUSTER_MAX) pages
on each iteration, if a memory cgroup has lots of page cache, it will
take many iterations to empty all page cache, so increase the reclaimed
number per iteration to speed it up. same as in mem_cgroup_resize_limit()

a simple test shows:

  $dd if=aaa  of=bbb  bs=1k count=3886080
  $rm -f bbb
  $time echo 1 >/cgroup/memory/test/memory.limit_in_bytes

Before: 0m0.252s ===> after: 0m0.178s

Signed-off-by: Li RongQing 
---
 mm/memcontrol.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 670e99b68aa6..8910d9e8e908 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2480,7 +2480,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup 
*memcg,
if (!ret)
break;
 
-   if (!try_to_free_mem_cgroup_pages(memcg, 1,
+   if (!try_to_free_mem_cgroup_pages(memcg, 1024,
GFP_KERNEL, !memsw)) {
ret = -EBUSY;
break;
@@ -2610,7 +2610,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup 
*memcg)
if (signal_pending(current))
return -EINTR;
 
-   progress = try_to_free_mem_cgroup_pages(memcg, 1,
+   progress = try_to_free_mem_cgroup_pages(memcg, 1024,
GFP_KERNEL, true);
if (!progress) {
nr_retries--;
-- 
2.11.0



[tip:x86/urgent] Documentation, x86, resctrl: Make text and sample command match

2018-02-28 Thread tip-bot for Li RongQing
Commit-ID:  30009746168da0f1f648881f77083c40e226a8a0
Gitweb: https://git.kernel.org/tip/30009746168da0f1f648881f77083c40e226a8a0
Author: Li RongQing <lirongq...@baidu.com>
AuthorDate: Tue, 27 Feb 2018 14:17:51 +0800
Committer:  Thomas Gleixner <t...@linutronix.de>
CommitDate: Wed, 28 Feb 2018 19:59:05 +0100

Documentation, x86, resctrl: Make text and sample command match

The text says "Move the cpus 4-7 over to p1", but the sample command writes
to p0/cpus.

Signed-off-by: Li RongQing <lirongq...@baidu.com>
Signed-off-by: Thomas Gleixner <t...@linutronix.de>
Cc: fenghua...@intel.com
Cc: linux-...@vger.kernel.org
Link: 
https://lkml.kernel.org/r/1519712271-8802-1-git-send-email-lirongq...@baidu.com

---
 Documentation/x86/intel_rdt_ui.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/x86/intel_rdt_ui.txt 
b/Documentation/x86/intel_rdt_ui.txt
index 756fd76b78a6..71c30984e94d 100644
--- a/Documentation/x86/intel_rdt_ui.txt
+++ b/Documentation/x86/intel_rdt_ui.txt
@@ -671,7 +671,7 @@ occupancy of the real time threads on these cores.
 # mkdir p1
 
 Move the cpus 4-7 over to p1
-# echo f0 > p0/cpus
+# echo f0 > p1/cpus
 
 View the llc occupancy snapshot
 


[tip:x86/urgent] Documentation, x86, resctrl: Make text and sample command match

2018-02-28 Thread tip-bot for Li RongQing
Commit-ID:  30009746168da0f1f648881f77083c40e226a8a0
Gitweb: https://git.kernel.org/tip/30009746168da0f1f648881f77083c40e226a8a0
Author: Li RongQing 
AuthorDate: Tue, 27 Feb 2018 14:17:51 +0800
Committer:  Thomas Gleixner 
CommitDate: Wed, 28 Feb 2018 19:59:05 +0100

Documentation, x86, resctrl: Make text and sample command match

The text says "Move the cpus 4-7 over to p1", but the sample command writes
to p0/cpus.

Signed-off-by: Li RongQing 
Signed-off-by: Thomas Gleixner 
Cc: fenghua...@intel.com
Cc: linux-...@vger.kernel.org
Link: 
https://lkml.kernel.org/r/1519712271-8802-1-git-send-email-lirongq...@baidu.com

---
 Documentation/x86/intel_rdt_ui.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/x86/intel_rdt_ui.txt 
b/Documentation/x86/intel_rdt_ui.txt
index 756fd76b78a6..71c30984e94d 100644
--- a/Documentation/x86/intel_rdt_ui.txt
+++ b/Documentation/x86/intel_rdt_ui.txt
@@ -671,7 +671,7 @@ occupancy of the real time threads on these cores.
 # mkdir p1
 
 Move the cpus 4-7 over to p1
-# echo f0 > p0/cpus
+# echo f0 > p1/cpus
 
 View the llc occupancy snapshot
 


[PATCH] Documentation: fix the wrong path in intel_rdt_ui.txt

2018-02-26 Thread Li RongQing
the note says "Move the cpus 4-7 over to p1", but echo command
writes f0 to p0/cpus

Signed-off-by: Li RongQing <lirongq...@baidu.com>
Cc: Fenghua Yu <fenghua...@intel.com>
---
 Documentation/x86/intel_rdt_ui.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/x86/intel_rdt_ui.txt 
b/Documentation/x86/intel_rdt_ui.txt
index 756fd76b78a6..71c30984e94d 100644
--- a/Documentation/x86/intel_rdt_ui.txt
+++ b/Documentation/x86/intel_rdt_ui.txt
@@ -671,7 +671,7 @@ occupancy of the real time threads on these cores.
 # mkdir p1
 
 Move the cpus 4-7 over to p1
-# echo f0 > p0/cpus
+# echo f0 > p1/cpus
 
 View the llc occupancy snapshot
 
-- 
2.11.0



[PATCH] Documentation: fix the wrong path in intel_rdt_ui.txt

2018-02-26 Thread Li RongQing
the note says "Move the cpus 4-7 over to p1", but echo command
writes f0 to p0/cpus

Signed-off-by: Li RongQing 
Cc: Fenghua Yu 
---
 Documentation/x86/intel_rdt_ui.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/x86/intel_rdt_ui.txt 
b/Documentation/x86/intel_rdt_ui.txt
index 756fd76b78a6..71c30984e94d 100644
--- a/Documentation/x86/intel_rdt_ui.txt
+++ b/Documentation/x86/intel_rdt_ui.txt
@@ -671,7 +671,7 @@ occupancy of the real time threads on these cores.
 # mkdir p1
 
 Move the cpus 4-7 over to p1
-# echo f0 > p0/cpus
+# echo f0 > p1/cpus
 
 View the llc occupancy snapshot
 
-- 
2.11.0



Re: high latency in CFS when disable autogroup, bug or not?

2017-10-23 Thread Li RongQing
I found the root cause: the delayed process had run for a very long time
before, its vruntime became very large, and it must wait for all other
processes; so it is delayed

the reason is below:

1. there is a low weight load process A (nice=19, weight=15)
2. there is a process B which is doing IO
3. there is a process C whose nice is 0

the running status looks like the following:

Step 1: when C is running, B wakeup, and preempt C;  B start to run
Step 2: when B sleeps, vruntime of A is min, but A cannot preempt
C(cfq->last); then C restore to run; then repeat Step 1;


A cannot preempt B and C, since the vruntime of B and C is not larger
than 4ms*1024/15 [sched_wakeup_granularity_ns * 1024 / weight of nice 19]


but this condition will block all other processes (about 500 processes)
from running for 4ms*1024/15;


so I think we should consider more when doing preempt

/*
 * Pick the next process, keeping these things in mind, in this order:
 * 1) keep things fair between processes/task groups
 * 2) pick the "next" process, since someone really wants that to run
 * 3) pick the "last" process, for cache locality
 * 4) do not run the "skip" process, if something else is available
 */
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
struct sched_entity *se = __pick_first_entity(cfs_rq);
struct sched_entity *left = se;

/*
 * Avoid running the skip buddy, if running something else can
 * be done without getting too unfair.
 */
if (cfs_rq->skip == se) {
struct sched_entity *second = __pick_next_entity(se);
if (second && wakeup_preempt_entity(second, left) < 1)
se = second;
}

/*
 * Prefer last buddy, try to return the CPU to a preempted task.
 */
if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
se = cfs_rq->last;

/*
 * Someone really wants this to run. If it's not unfair, run it.
 */
if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
se = cfs_rq->next;

clear_buddies(cfs_rq, se);

return se;
}


On Fri, Sep 1, 2017 at 11:28 AM, Li RongQing <roy.qing...@gmail.com> wrote:
> I have a programe to test CFS latency or fairness on high load,
> the test shows CFS has high latency when disable autogroup
>
> the program is in
> https://gist.github.com/anonymous/af21ae289cfa5c6310eeac73b7a478ff.
> the program creates many threads; every thread does something like:
>
> do {
> now =gettime()
> loop to spend 10 microseconds
> delta = gettime()-now
>
> if (delta > max_latency) {
>  max_latency=delta
>  printf( max_latency);
> }
> } while(1)
>
>
> I run this program on a machine with 48 processors; this program
> created 2000 threads,
> then every processor has about 40 pthread,  every thread should finish
> its 10ms computation in 400ms= 40pthread*10ms
> but test result show some thread takes very long time
>
> # ./pipe_test -n 2000
> eat cpu with 2000 threads, delay time 1000 ms
> 648 delay 1037 ms
> 48923 delay 1038 ms
> 1810 delay 1349 ms
> 49142 delay 1482 ms
> 1728 delay 1574 ms
> 1518 delay 1713 ms
> 808 delay 1714 ms
> 1702 delay 1733 ms
> 49004 delay 1783 ms
> 48821 delay 1785 ms
> 451 delay 1865 ms
> 990 delay 1910 ms
> 1626 delay 1957 ms
> 537 delay 2420 ms
> 2021 delay 3242 ms
> 763 delay 3488 ms
> 622 delay 3614 ms
> 1887 delay 4474 ms
> 1267 delay 4924 ms
> 721 delay 5406 ms
> 1810 delay 5873 ms
> 1470 delay 5969 ms
> 1702 delay 7834 ms
> 48821 delay 8063 ms
> ^C
>
> The kernel version is 4.9.23 and I disable autogroup;   if autogroup
> is enabled, no this issue
>
>
> ftrace result:
>
><...>-48821 [003] d... 64151.635476: sched_switch:
> prev_comm=a.out prev_pid=48821 prev_prio=120 prev_state=R ==>
> next_comm=baas_agent next_pid=17118 next_prio=120
><...>-48821 [003] d... 64151.636531: sched_switch:
> prev_comm=a.out prev_pid=48821 prev_prio=120 prev_state=R ==>
> next_comm=baas_agent next_pid=17118 next_prio=120
><...>-48821 [003] d... 64151.639570: sched_switch:
> prev_comm=a.out prev_pid=48821 prev_prio=120 prev_state=R ==>
> next_comm=baas_agent next_pid=17275 next_prio=120
><...>-48821 [003] d... 64159.703051: sched_switch:
> prev_comm=a.out prev_pid=48821 prev_prio=120 prev_state=R+ ==>
> next_comm=kworker/u97:0 next_pid=36929 next_prio=120
><...>-48821 [003] d... 64159.703091: sched_switch:
> prev_comm=a.out prev_pid=48821 prev_prio=120 prev_state=R ==>
> next_comm=kworker/u97:0 next_pid=36929 next_prio=120
><...>-48821 [003] d... 64159.703978: sched_switch:
> prev_comm=a.out prev_pid=48821 prev_prio=120 prev_

Re: high latency in CFS when disable autogroup, bug or not?

2017-10-23 Thread Li RongQing
I found the root cause: the delayed process had run for a very long time
before, its vruntime became very large, and it must wait for all other
processes; so it is delayed

the reason is below:

1. there is a low weight load process A (nice=19, weight=15)
2. there is a process B which is doing IO
3. there is a process C whose nice is 0

the running status looks like the following:

Step 1: when C is running, B wakeup, and preempt C;  B start to run
Step 2: when B sleeps, vruntime of A is min, but A cannot preempt
C(cfq->last); then C restore to run; then repeat Step 1;


A cannot preempt B and C, since the vruntime of B and C is not larger
than 4ms*1024/15 [sched_wakeup_granularity_ns * 1024 / weight of nice 19]


but this condition will block all other processes (about 500 processes)
from running for 4ms*1024/15;


so I think we should consider more when doing preempt

/*
 * Pick the next process, keeping these things in mind, in this order:
 * 1) keep things fair between processes/task groups
 * 2) pick the "next" process, since someone really wants that to run
 * 3) pick the "last" process, for cache locality
 * 4) do not run the "skip" process, if something else is available
 */
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
struct sched_entity *se = __pick_first_entity(cfs_rq);
struct sched_entity *left = se;

/*
 * Avoid running the skip buddy, if running something else can
 * be done without getting too unfair.
 */
if (cfs_rq->skip == se) {
struct sched_entity *second = __pick_next_entity(se);
if (second && wakeup_preempt_entity(second, left) < 1)
se = second;
}

/*
 * Prefer last buddy, try to return the CPU to a preempted task.
 */
if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
se = cfs_rq->last;

/*
 * Someone really wants this to run. If it's not unfair, run it.
 */
if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
se = cfs_rq->next;

clear_buddies(cfs_rq, se);

return se;
}


On Fri, Sep 1, 2017 at 11:28 AM, Li RongQing  wrote:
> I have a programe to test CFS latency or fairness on high load,
> the test shows CFS has high latency when disable autogroup
>
> the program is in
> https://gist.github.com/anonymous/af21ae289cfa5c6310eeac73b7a478ff.
> the programe creates many threads, every thread does thing like:
>
> do {
> now =gettime()
> loop to spend 10 microseconds
> delta = gettime()-now
>
> if (delta > max_latency) {
>  max_latency=delta
>  printf( max_latency);
> }
> } while(1)
>
>
> I run this program on a machine with 48 processor,  this programe
> created 2000 thread,
> then every processor has about 40 pthread,  every thread should finish
> its 10ms computation in 400ms= 40pthread*10ms
> but test result show some thread takes very long time
>
> # ./pipe_test -n 2000
> eat cpu with 2000 threads, delay time 1000 ms
> 648 delay 1037 ms
> 48923 delay 1038 ms
> 1810 delay 1349 ms
> 49142 delay 1482 ms
> 1728 delay 1574 ms
> 1518 delay 1713 ms
> 808 delay 1714 ms
> 1702 delay 1733 ms
> 49004 delay 1783 ms
> 48821 delay 1785 ms
> 451 delay 1865 ms
> 990 delay 1910 ms
> 1626 delay 1957 ms
> 537 delay 2420 ms
> 2021 delay 3242 ms
> 763 delay 3488 ms
> 622 delay 3614 ms
> 1887 delay 4474 ms
> 1267 delay 4924 ms
> 721 delay 5406 ms
> 1810 delay 5873 ms
> 1470 delay 5969 ms
> 1702 delay 7834 ms
> 48821 delay 8063 ms
> ^C
>
> The kernel version is 4.9.23 and I disable autogroup;   if autogroup
> is enabled, no this issue
>
>
> ftrace result:
>
><...>-48821 [003] d... 64151.635476: sched_switch:
> prev_comm=a.out prev_pid=48821 prev_prio=120 prev_state=R ==>
> next_comm=baas_agent next_pid=17118 next_prio=120
><...>-48821 [003] d... 64151.636531: sched_switch:
> prev_comm=a.out prev_pid=48821 prev_prio=120 prev_state=R ==>
> next_comm=baas_agent next_pid=17118 next_prio=120
><...>-48821 [003] d... 64151.639570: sched_switch:
> prev_comm=a.out prev_pid=48821 prev_prio=120 prev_state=R ==>
> next_comm=baas_agent next_pid=17275 next_prio=120
><...>-48821 [003] d... 64159.703051: sched_switch:
> prev_comm=a.out prev_pid=48821 prev_prio=120 prev_state=R+ ==>
> next_comm=kworker/u97:0 next_pid=36929 next_prio=120
><...>-48821 [003] d... 64159.703091: sched_switch:
> prev_comm=a.out prev_pid=48821 prev_prio=120 prev_state=R ==>
> next_comm=kworker/u97:0 next_pid=36929 next_prio=120
><...>-48821 [003] d... 64159.703978: sched_switch:
> prev_comm=a.out prev_pid=48821 prev_prio=120 prev_state=R ==>
> next_

high latency in CFS when disable autogroup, bug or not?

2017-08-31 Thread Li RongQing
I have a program to test CFS latency or fairness under high load;
the test shows CFS has high latency when autogroup is disabled

the program is in
https://gist.github.com/anonymous/af21ae289cfa5c6310eeac73b7a478ff.
the program creates many threads; every thread does something like:

do {
now =gettime()
loop to spend 10 microseconds
delta = gettime()-now

if (delta > max_latency) {
 max_latency=delta
 printf( max_latency);
}
} while(1)


I run this program on a machine with 48 processor,  this programe
created 2000 thread,
then every processor has about 40 pthread,  every thread should finish
its 10ms computation in 400ms= 40pthread*10ms
but test result show some thread takes very long time

# ./pipe_test -n 2000
eat cpu with 2000 threads, delay time 1000 ms
648 delay 1037 ms
48923 delay 1038 ms
1810 delay 1349 ms
49142 delay 1482 ms
1728 delay 1574 ms
1518 delay 1713 ms
808 delay 1714 ms
1702 delay 1733 ms
49004 delay 1783 ms
48821 delay 1785 ms
451 delay 1865 ms
990 delay 1910 ms
1626 delay 1957 ms
537 delay 2420 ms
2021 delay 3242 ms
763 delay 3488 ms
622 delay 3614 ms
1887 delay 4474 ms
1267 delay 4924 ms
721 delay 5406 ms
1810 delay 5873 ms
1470 delay 5969 ms
1702 delay 7834 ms
48821 delay 8063 ms
^C

The kernel version is 4.9.23 and I disable autogroup;   if autogroup
is enabled, no this issue


ftrace result:

   <...>-48821 [003] d... 64151.635476: sched_switch:
prev_comm=a.out prev_pid=48821 prev_prio=120 prev_state=R ==>
next_comm=baas_agent next_pid=17118 next_prio=120
   <...>-48821 [003] d... 64151.636531: sched_switch:
prev_comm=a.out prev_pid=48821 prev_prio=120 prev_state=R ==>
next_comm=baas_agent next_pid=17118 next_prio=120
   <...>-48821 [003] d... 64151.639570: sched_switch:
prev_comm=a.out prev_pid=48821 prev_prio=120 prev_state=R ==>
next_comm=baas_agent next_pid=17275 next_prio=120
   <...>-48821 [003] d... 64159.703051: sched_switch:
prev_comm=a.out prev_pid=48821 prev_prio=120 prev_state=R+ ==>
next_comm=kworker/u97:0 next_pid=36929 next_prio=120
   <...>-48821 [003] d... 64159.703091: sched_switch:
prev_comm=a.out prev_pid=48821 prev_prio=120 prev_state=R ==>
next_comm=kworker/u97:0 next_pid=36929 next_prio=120
   <...>-48821 [003] d... 64159.703978: sched_switch:
prev_comm=a.out prev_pid=48821 prev_prio=120 prev_state=R ==>
next_comm=baas_agent next_pid=17275 next_prio=120
   <...>-48821 [003] d... 64159.705542: sched_switch:
prev_comm=a.out prev_pid=48821 prev_prio=120 prev_state=R ==>
next_comm=baas_agent next_pid=16879 next_prio=120


# grep sched_migrate_task trace|grep 48821
   <...>-688   [019] d... 64161.828654: sched_migrate_task:
comm=a.out pid=48821 prio=120 orig_cpu=3 dest_cpu=19
   <...>-48821 [019] d... 64161.828862: sched_migrate_task:
comm=a.out pid=49053 prio=120 orig_cpu=43 dest_cpu=19
#


  1   2   >