[PATCH 1/3] drm/amdgpu: fix xgmi perfmon a-b-a problem
Mapping hw counters per event config will cause ABA problems so map per event instead. v2: Discontinue starting perf counters if add fails. Make it clear what's happening with pmc_start. Signed-off-by: Jonathan Kim --- drivers/gpu/drm/amd/amdgpu/amdgpu_df.h | 6 +- drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c | 42 ++ drivers/gpu/drm/amd/amdgpu/df_v3_6.c| 105 +++- 3 files changed, 78 insertions(+), 75 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h index 373cdebe0e2f..52488bb45112 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h @@ -44,11 +44,11 @@ struct amdgpu_df_funcs { void (*enable_ecc_force_par_wr_rmw)(struct amdgpu_device *adev, bool enable); int (*pmc_start)(struct amdgpu_device *adev, uint64_t config, -int is_add); +int counter_idx, int is_add); int (*pmc_stop)(struct amdgpu_device *adev, uint64_t config, -int is_remove); +int counter_idx, int is_remove); void (*pmc_get_count)(struct amdgpu_device *adev, uint64_t config, -uint64_t *count); +int counter_idx, uint64_t *count); uint64_t (*get_fica)(struct amdgpu_device *adev, uint32_t ficaa_val); void (*set_fica)(struct amdgpu_device *adev, uint32_t ficaa_val, uint32_t ficadl_val, uint32_t ficadh_val); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c index 69af462db34d..1b0ec715c8ba 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c @@ -64,6 +64,7 @@ static void amdgpu_perf_start(struct perf_event *event, int flags) struct amdgpu_pmu_entry *pe = container_of(event->pmu, struct amdgpu_pmu_entry, pmu); + int target_cntr = 0; if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) return; @@ -73,17 +74,24 @@ static void amdgpu_perf_start(struct perf_event *event, int flags) switch (pe->pmu_perf_type) { case PERF_TYPE_AMDGPU_DF: - if (!(flags & PERF_EF_RELOAD)) - pe->adev->df.funcs->pmc_start(pe->adev, hwc->config, 1); + if (!(flags & PERF_EF_RELOAD)) { + target_cntr = pe->adev->df.funcs->pmc_start(pe->adev, + hwc->config, 0 /* unused */, + 1 /* add counter */); + if (target_cntr < 0) + break; + + hwc->idx = target_cntr; + } - pe->adev->df.funcs->pmc_start(pe->adev, hwc->config, 0); + pe->adev->df.funcs->pmc_start(pe->adev, hwc->config, + hwc->idx, 0); break; default: break; } perf_event_update_userpage(event); - } /* read perf counter */ @@ -101,8 +109,8 @@ static void amdgpu_perf_read(struct perf_event *event) switch (pe->pmu_perf_type) { case PERF_TYPE_AMDGPU_DF: - pe->adev->df.funcs->pmc_get_count(pe->adev, hwc->config, - &count); + pe->adev->df.funcs->pmc_get_count(pe->adev, + hwc->config, hwc->idx, &count); break; default: count = 0; @@ -126,7 +134,8 @@ static void amdgpu_perf_stop(struct perf_event *event, int flags) switch (pe->pmu_perf_type) { case PERF_TYPE_AMDGPU_DF: - pe->adev->df.funcs->pmc_stop(pe->adev, hwc->config, 0); + pe->adev->df.funcs->pmc_stop(pe->adev, hwc->config, hwc->idx, + 0); break; default: break; @@ -142,12 +151,11 @@ static void amdgpu_perf_stop(struct perf_event *event, int flags) hwc->state |= PERF_HES_UPTODATE; } -/* add perf counter */ +/* add perf counter */ static int amdgpu_perf_add(struct perf_event *event, int flags) { struct hw_perf_event *hwc = &event->hw; - int retval; - + int retval = 0, target_cntr; struct amdgpu_pmu_entry *pe = container_of(event->pmu, struct amdgpu_pmu_entry, pmu); @@ -156,8 +164,14 @@ static int amdgpu_perf_add(struct pe
[PATCH 1/3] drm/amdgpu: fix xgmi perfmon a-b-a problem
Mapping hw counters per event config will cause ABA problems so map per event instead. v2: Discontinue starting perf counters if add fails. Make it clear what's happening with pmc_start. Signed-off-by: Jonathan Kim --- drivers/gpu/drm/amd/amdgpu/amdgpu_df.h | 6 +- drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c | 42 ++ drivers/gpu/drm/amd/amdgpu/df_v3_6.c| 105 +++- 3 files changed, 78 insertions(+), 75 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h index 373cdebe0e2f..52488bb45112 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h @@ -44,11 +44,11 @@ struct amdgpu_df_funcs { void (*enable_ecc_force_par_wr_rmw)(struct amdgpu_device *adev, bool enable); int (*pmc_start)(struct amdgpu_device *adev, uint64_t config, -int is_add); +int counter_idx, int is_add); int (*pmc_stop)(struct amdgpu_device *adev, uint64_t config, -int is_remove); +int counter_idx, int is_remove); void (*pmc_get_count)(struct amdgpu_device *adev, uint64_t config, -uint64_t *count); +int counter_idx, uint64_t *count); uint64_t (*get_fica)(struct amdgpu_device *adev, uint32_t ficaa_val); void (*set_fica)(struct amdgpu_device *adev, uint32_t ficaa_val, uint32_t ficadl_val, uint32_t ficadh_val); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c index 69af462db34d..1b0ec715c8ba 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c @@ -64,6 +64,7 @@ static void amdgpu_perf_start(struct perf_event *event, int flags) struct amdgpu_pmu_entry *pe = container_of(event->pmu, struct amdgpu_pmu_entry, pmu); + int target_cntr = 0; if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) return; @@ -73,17 +74,24 @@ static void amdgpu_perf_start(struct perf_event *event, int flags) switch (pe->pmu_perf_type) { case PERF_TYPE_AMDGPU_DF: - if (!(flags & PERF_EF_RELOAD)) - pe->adev->df.funcs->pmc_start(pe->adev, hwc->config, 1); + if (!(flags & PERF_EF_RELOAD)) { + target_cntr = pe->adev->df.funcs->pmc_start(pe->adev, + hwc->config, 0 /* unused */, + 1 /* add counter */); + if (target_cntr < 0) + break; + + hwc->idx = target_cntr; + } - pe->adev->df.funcs->pmc_start(pe->adev, hwc->config, 0); + pe->adev->df.funcs->pmc_start(pe->adev, hwc->config, + hwc->idx, 0); break; default: break; } perf_event_update_userpage(event); - } /* read perf counter */ @@ -101,8 +109,8 @@ static void amdgpu_perf_read(struct perf_event *event) switch (pe->pmu_perf_type) { case PERF_TYPE_AMDGPU_DF: - pe->adev->df.funcs->pmc_get_count(pe->adev, hwc->config, - &count); + pe->adev->df.funcs->pmc_get_count(pe->adev, + hwc->config, hwc->idx, &count); break; default: count = 0; @@ -126,7 +134,8 @@ static void amdgpu_perf_stop(struct perf_event *event, int flags) switch (pe->pmu_perf_type) { case PERF_TYPE_AMDGPU_DF: - pe->adev->df.funcs->pmc_stop(pe->adev, hwc->config, 0); + pe->adev->df.funcs->pmc_stop(pe->adev, hwc->config, hwc->idx, + 0); break; default: break; @@ -142,12 +151,11 @@ static void amdgpu_perf_stop(struct perf_event *event, int flags) hwc->state |= PERF_HES_UPTODATE; } -/* add perf counter */ +/* add perf counter */ static int amdgpu_perf_add(struct perf_event *event, int flags) { struct hw_perf_event *hwc = &event->hw; - int retval; - + int retval = 0, target_cntr; struct amdgpu_pmu_entry *pe = container_of(event->pmu, struct amdgpu_pmu_entry, pmu); @@ -156,8 +164,14 @@ static int amdgpu_perf_add(struct pe
[PATCH 1/3] drm/amdgpu: fix xgmi perfmon a-b-a problem
Mapping hw counters per event config will cause ABA problems so map per event instead. v2: Discontinue starting perf counters if add fails. Make it clear what's happening with pmc_start. Signed-off-by: Jonathan Kim --- drivers/gpu/drm/amd/amdgpu/amdgpu_df.h | 6 +- drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c | 42 ++ drivers/gpu/drm/amd/amdgpu/df_v3_6.c| 105 +++- 3 files changed, 78 insertions(+), 75 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h index 373cdebe0e2f..52488bb45112 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h @@ -44,11 +44,11 @@ struct amdgpu_df_funcs { void (*enable_ecc_force_par_wr_rmw)(struct amdgpu_device *adev, bool enable); int (*pmc_start)(struct amdgpu_device *adev, uint64_t config, -int is_add); +int counter_idx, int is_add); int (*pmc_stop)(struct amdgpu_device *adev, uint64_t config, -int is_remove); +int counter_idx, int is_remove); void (*pmc_get_count)(struct amdgpu_device *adev, uint64_t config, -uint64_t *count); +int counter_idx, uint64_t *count); uint64_t (*get_fica)(struct amdgpu_device *adev, uint32_t ficaa_val); void (*set_fica)(struct amdgpu_device *adev, uint32_t ficaa_val, uint32_t ficadl_val, uint32_t ficadh_val); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c index 69af462db34d..1b0ec715c8ba 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c @@ -64,6 +64,7 @@ static void amdgpu_perf_start(struct perf_event *event, int flags) struct amdgpu_pmu_entry *pe = container_of(event->pmu, struct amdgpu_pmu_entry, pmu); + int target_cntr = 0; if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) return; @@ -73,17 +74,24 @@ static void amdgpu_perf_start(struct perf_event *event, int flags) switch (pe->pmu_perf_type) { case PERF_TYPE_AMDGPU_DF: - if (!(flags & PERF_EF_RELOAD)) - pe->adev->df.funcs->pmc_start(pe->adev, hwc->config, 1); + if (!(flags & PERF_EF_RELOAD)) { + target_cntr = pe->adev->df.funcs->pmc_start(pe->adev, + hwc->config, 0 /* unused */, + 1 /* add counter */); + if (target_cntr < 0) + break; + + hwc->idx = target_cntr; + } - pe->adev->df.funcs->pmc_start(pe->adev, hwc->config, 0); + pe->adev->df.funcs->pmc_start(pe->adev, hwc->config, + hwc->idx, 0); break; default: break; } perf_event_update_userpage(event); - } /* read perf counter */ @@ -101,8 +109,8 @@ static void amdgpu_perf_read(struct perf_event *event) switch (pe->pmu_perf_type) { case PERF_TYPE_AMDGPU_DF: - pe->adev->df.funcs->pmc_get_count(pe->adev, hwc->config, - &count); + pe->adev->df.funcs->pmc_get_count(pe->adev, + hwc->config, hwc->idx, &count); break; default: count = 0; @@ -126,7 +134,8 @@ static void amdgpu_perf_stop(struct perf_event *event, int flags) switch (pe->pmu_perf_type) { case PERF_TYPE_AMDGPU_DF: - pe->adev->df.funcs->pmc_stop(pe->adev, hwc->config, 0); + pe->adev->df.funcs->pmc_stop(pe->adev, hwc->config, hwc->idx, + 0); break; default: break; @@ -142,12 +151,11 @@ static void amdgpu_perf_stop(struct perf_event *event, int flags) hwc->state |= PERF_HES_UPTODATE; } -/* add perf counter */ +/* add perf counter */ static int amdgpu_perf_add(struct perf_event *event, int flags) { struct hw_perf_event *hwc = &event->hw; - int retval; - + int retval = 0, target_cntr; struct amdgpu_pmu_entry *pe = container_of(event->pmu, struct amdgpu_pmu_entry, pmu); @@ -156,8 +164,14 @@ static int amdgpu_perf_add(struct pe
[PATCH 1/3] drm/amdgpu: fix xgmi perfmon a-b-a problem
Mapping hw counters per event config will cause ABA problems so map per event instead. v2: Discontinue starting perf counters if add fails. Make it clear what's happening with pmc_start. Signed-off-by: Jonathan Kim --- drivers/gpu/drm/amd/amdgpu/amdgpu_df.h | 6 +- drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c | 42 ++ drivers/gpu/drm/amd/amdgpu/df_v3_6.c| 105 +++- 3 files changed, 78 insertions(+), 75 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h index 373cdebe0e2f..52488bb45112 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h @@ -44,11 +44,11 @@ struct amdgpu_df_funcs { void (*enable_ecc_force_par_wr_rmw)(struct amdgpu_device *adev, bool enable); int (*pmc_start)(struct amdgpu_device *adev, uint64_t config, -int is_add); +int counter_idx, int is_add); int (*pmc_stop)(struct amdgpu_device *adev, uint64_t config, -int is_remove); +int counter_idx, int is_remove); void (*pmc_get_count)(struct amdgpu_device *adev, uint64_t config, -uint64_t *count); +int counter_idx, uint64_t *count); uint64_t (*get_fica)(struct amdgpu_device *adev, uint32_t ficaa_val); void (*set_fica)(struct amdgpu_device *adev, uint32_t ficaa_val, uint32_t ficadl_val, uint32_t ficadh_val); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c index 69af462db34d..1b0ec715c8ba 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c @@ -64,6 +64,7 @@ static void amdgpu_perf_start(struct perf_event *event, int flags) struct amdgpu_pmu_entry *pe = container_of(event->pmu, struct amdgpu_pmu_entry, pmu); + int target_cntr = 0; if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) return; @@ -73,17 +74,24 @@ static void amdgpu_perf_start(struct perf_event *event, int flags) switch (pe->pmu_perf_type) { case PERF_TYPE_AMDGPU_DF: - if (!(flags & PERF_EF_RELOAD)) - pe->adev->df.funcs->pmc_start(pe->adev, hwc->config, 1); + if (!(flags & PERF_EF_RELOAD)) { + target_cntr = pe->adev->df.funcs->pmc_start(pe->adev, + hwc->config, 0 /* unused */, + 1 /* add counter */); + if (target_cntr < 0) + break; + + hwc->idx = target_cntr; + } - pe->adev->df.funcs->pmc_start(pe->adev, hwc->config, 0); + pe->adev->df.funcs->pmc_start(pe->adev, hwc->config, + hwc->idx, 0); break; default: break; } perf_event_update_userpage(event); - } /* read perf counter */ @@ -101,8 +109,8 @@ static void amdgpu_perf_read(struct perf_event *event) switch (pe->pmu_perf_type) { case PERF_TYPE_AMDGPU_DF: - pe->adev->df.funcs->pmc_get_count(pe->adev, hwc->config, - &count); + pe->adev->df.funcs->pmc_get_count(pe->adev, + hwc->config, hwc->idx, &count); break; default: count = 0; @@ -126,7 +134,8 @@ static void amdgpu_perf_stop(struct perf_event *event, int flags) switch (pe->pmu_perf_type) { case PERF_TYPE_AMDGPU_DF: - pe->adev->df.funcs->pmc_stop(pe->adev, hwc->config, 0); + pe->adev->df.funcs->pmc_stop(pe->adev, hwc->config, hwc->idx, + 0); break; default: break; @@ -142,12 +151,11 @@ static void amdgpu_perf_stop(struct perf_event *event, int flags) hwc->state |= PERF_HES_UPTODATE; } -/* add perf counter */ +/* add perf counter */ static int amdgpu_perf_add(struct perf_event *event, int flags) { struct hw_perf_event *hwc = &event->hw; - int retval; - + int retval = 0, target_cntr; struct amdgpu_pmu_entry *pe = container_of(event->pmu, struct amdgpu_pmu_entry, pmu); @@ -156,8 +164,14 @@ static int amdgpu_perf_add(struct pe
[PATCH 1/3] drm/amdgpu: fix xgmi perfmon a-b-a problem
[AMD Official Use Only - Internal Distribution Only] > -Original Message- > From: Kim, Jonathan > Sent: Tuesday, September 15, 2020 5:51 PM > To: amd-...@lists.ffreedesktop.org > Cc: Kasiviswanathan, Harish ; Kim, > Jonathan ; Kim, Jonathan > > Subject: [PATCH 1/3] drm/amdgpu: fix xgmi perfmon a-b-a problem > > Mapping hw counters per event config will cause ABA problems so map per > event instead. > > v2: Discontinue starting perf counters if add fails. Make it clear what's > happening with pmc_start. > > Signed-off-by: Jonathan Kim > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_df.h | 6 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c | 42 ++ > drivers/gpu/drm/amd/amdgpu/df_v3_6.c| 105 +++- > 3 files changed, 78 insertions(+), 75 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h > index 373cdebe0e2f..52488bb45112 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_df.h > @@ -44,11 +44,11 @@ struct amdgpu_df_funcs { > void (*enable_ecc_force_par_wr_rmw)(struct amdgpu_device > *adev, > bool enable); > int (*pmc_start)(struct amdgpu_device *adev, uint64_t config, > - int is_add); > + int counter_idx, int is_add); > int (*pmc_stop)(struct amdgpu_device *adev, uint64_t config, > - int is_remove); > + int counter_idx, int is_remove); > void (*pmc_get_count)(struct amdgpu_device *adev, uint64_t config, > - uint64_t *count); > + int counter_idx, uint64_t *count); > uint64_t (*get_fica)(struct amdgpu_device *adev, uint32_t > ficaa_val); > void (*set_fica)(struct amdgpu_device *adev, uint32_t ficaa_val, > uint32_t ficadl_val, uint32_t ficadh_val); diff --git > a/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c > index 69af462db34d..1b0ec715c8ba 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pmu.c > @@ -64,6 +64,7 @@ static void amdgpu_perf_start(struct perf_event > *event, int flags) > struct amdgpu_pmu_entry *pe = container_of(event->pmu, >struct amdgpu_pmu_entry, >pmu); > +int target_cntr = 0; > > if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) > return; > @@ -73,17 +74,24 @@ static void amdgpu_perf_start(struct perf_event > *event, int flags) > > switch (pe->pmu_perf_type) { > case PERF_TYPE_AMDGPU_DF: > -if (!(flags & PERF_EF_RELOAD)) > -pe->adev->df.funcs->pmc_start(pe->adev, hwc- > >config, 1); > +if (!(flags & PERF_EF_RELOAD)) { > +target_cntr = pe->adev->df.funcs->pmc_start(pe- > >adev, > +hwc->config, 0 /* unused */, > +1 /* add counter */); > +if (target_cntr < 0) > +break; > + > +hwc->idx = target_cntr; > +} > > -pe->adev->df.funcs->pmc_start(pe->adev, hwc->config, 0); > +pe->adev->df.funcs->pmc_start(pe->adev, hwc->config, > +hwc->idx, 0); > break; > default: > break; > } > > perf_event_update_userpage(event); > - > } > > /* read perf counter */ > @@ -101,8 +109,8 @@ static void amdgpu_perf_read(struct perf_event > *event) > > switch (pe->pmu_perf_type) { > case PERF_TYPE_AMDGPU_DF: > -pe->adev->df.funcs->pmc_get_count(pe->adev, hwc- > >config, > - &count); > +pe->adev->df.funcs->pmc_get_count(pe->adev, > +hwc->config, hwc->idx, > &count); > break; > default: > count = 0; > @@ -126,7 +134,8 @@ static void amdgpu_perf_stop(struct perf_event > *event, int flags) > > switch (pe->pmu_perf_type) { > case PERF_TYPE_AMDGPU_DF: > -pe->adev->df.funcs->pmc_stop(pe->adev, hwc->config, 0); > +pe->adev->df.funcs->pmc_stop(pe->adev, hwc->config, hwc- > >idx, > +0); > break; > default: > break; > @@ -142,12 +151,11 @@ static void amdgpu_perf_stop(struct perf_event > *event, int flags) > hwc->state |= PERF_HES_UPTODATE; > } > > -/* add perf counter */ > +/* add perf counter */ > static int amdgpu_perf_add(struct perf_event *event, int flags) { > struct hw_perf_event *hwc = &event->hw; > -int retval; > - > +int retval = 0, target_cntr; > struct amdgpu_pmu_entry *pe = container_of(event->pmu, >struct amdgpu_pmu_entry, >pmu); > @@ -156,8 +164,14 @@ static int amdgpu_perf_add(struct perf_event > *event, int flags) > > switch (pe->pmu_perf_type) { > case PERF_TYPE_AMDGPU_DF: > -retval = pe->adev->df.funcs->pmc_start(pe->adev, > - hwc->config, 1); > +target_cntr = pe->adev->df.funcs->pmc_start(pe->a