any comments?

On 2017/5/4 10:31, Zefan Li wrote:
> It is assumed that the head of cache_groups always has valid RMID,
> which isn't true.
> 
> Currently, when we deallocate RMIDs from conflicting events we don't
> move those events to the tail of cache_groups, so one of them can
> happen to be at the head. Another case is that we allocate RMIDs for
> all the events except the head event in intel_cqm_sched_in_event().
> 
> Besides, there is another bug: we retry rotating without resetting
> nr_needed and start in __intel_cqm_rmid_rotate().
> 
> These bugs combined led to the following oops.
> 
> WARNING: at arch/x86/kernel/cpu/perf_event_intel_cqm.c:186 
> __put_rmid+0x28/0x80()
> ...
>  [<ffffffff8103a578>] __put_rmid+0x28/0x80
>  [<ffffffff8103a74a>] intel_cqm_rmid_rotate+0xba/0x440
>  [<ffffffff8109d8cb>] process_one_work+0x17b/0x470
>  [<ffffffff8109e69b>] worker_thread+0x11b/0x400
> ...
> BUG: unable to handle kernel NULL pointer dereference at           (null)
> ...
>  [<ffffffff8103a74a>] intel_cqm_rmid_rotate+0xba/0x440
>  [<ffffffff8109d8cb>] process_one_work+0x17b/0x470
>  [<ffffffff8109e69b>] worker_thread+0x11b/0x400
> 
> Cc: sta...@vger.kernel.org
> Signed-off-by: Zefan Li <lize...@huawei.com>
> ---
>  arch/x86/events/intel/cqm.c | 19 ++++++++++++++++---
>  1 file changed, 16 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
> index 8c00dc0..c06a5ba 100644
> --- a/arch/x86/events/intel/cqm.c
> +++ b/arch/x86/events/intel/cqm.c
> @@ -553,8 +553,13 @@ static bool intel_cqm_sched_in_event(u32 rmid)
>  
>       leader = list_first_entry(&cache_groups, struct perf_event,
>                                 hw.cqm_groups_entry);
> -     event = leader;
>  
> +     if (!list_empty(&cache_groups) && !__rmid_valid(leader->hw.cqm_rmid)) {
> +             intel_cqm_xchg_rmid(leader, rmid);
> +             return true;
> +     }
> +
> +     event = leader;
>       list_for_each_entry_continue(event, &cache_groups,
>                                    hw.cqm_groups_entry) {
>               if (__rmid_valid(event->hw.cqm_rmid))
> @@ -721,6 +726,7 @@ static void intel_cqm_sched_out_conflicting_events(struct 
> perf_event *event)
>  {
>       struct perf_event *group, *g;
>       u32 rmid;
> +     LIST_HEAD(conflicting_groups);
>  
>       lockdep_assert_held(&cache_mutex);
>  
> @@ -744,7 +750,11 @@ static void 
> intel_cqm_sched_out_conflicting_events(struct perf_event *event)
>  
>               intel_cqm_xchg_rmid(group, INVALID_RMID);
>               __put_rmid(rmid);
> +             list_move_tail(&group->hw.cqm_groups_entry,
> +                            &conflicting_groups);
>       }
> +
> +     list_splice_tail(&conflicting_groups, &cache_groups);
>  }
>  
>  /*
> @@ -773,9 +783,9 @@ static void intel_cqm_sched_out_conflicting_events(struct 
> perf_event *event)
>   */
>  static bool __intel_cqm_rmid_rotate(void)
>  {
> -     struct perf_event *group, *start = NULL;
> +     struct perf_event *group, *start;
>       unsigned int threshold_limit;
> -     unsigned int nr_needed = 0;
> +     unsigned int nr_needed;
>       unsigned int nr_available;
>       bool rotated = false;
>  
> @@ -789,6 +799,9 @@ static bool __intel_cqm_rmid_rotate(void)
>       if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
>               goto out;
>  
> +     nr_needed = 0;
> +     start = NULL;
> +
>       list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
>               if (!__rmid_valid(group->hw.cqm_rmid)) {
>                       if (!start)
> 

Reply via email to