This patch contains the PMU interrupt support.

Perfmon2 registers an interrupt handler for the PMU in order:
        - to support 64-bit counter emulation
        - to support event-based sampling

Typically, the PMU interrupt vector is assigned a very high priority
to ensure better coverage when sampling.

On PMU interrupt, the handler, pfm_overflow_handler(), is invoked and:
        - checks which counters overflowed
        - increments the corresponding 64-bit software counters
        - if a 64-bit software counter overflows, a counter overflow
          is declared, otherwise execution resumes (see the sketch below)
        - on counter overflow, checks whether a sampling format is in
          use and, if so, calls its handler routine
        - if notification is requested, appends a message to the
          context message queue and wakes up any potential waiters
        - switches to a new event set if requested by the user (i.e.,
          switch on overflow)
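
The 64-bit counter emulation at the heart of the handler can be
sketched as follows (an illustrative, self-contained fragment, not
the patch code itself; the helper name is made up for the example):

        /*
         * On each hardware overflow the 64-bit software counter
         * advances by one full hardware period (ovfl_mask + 1).
         * A 64-bit wrap-around, i.e., the new value being smaller
         * than the old one, is what the handler declares as a
         * counter overflow.
         */
        static int pfm_soft_ovfl_check(u64 *soft_pmd, u64 ovfl_mask)
        {
                u64 old_val = *soft_pmd;
                u64 new_val = old_val + ovfl_mask + 1;

                *soft_pmd = new_val;

                return old_val > new_val;       /* 1 => 64-bit overflow */
        }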

--- linux-2.6.22.base/perfmon/perfmon_intr.c    1969-12-31 16:00:00.000000000 -0800
+++ linux-2.6.22/perfmon/perfmon_intr.c 2007-05-29 03:24:14.000000000 -0700
@@ -0,0 +1,558 @@
+/*
+ * perfmon_intr.c: perfmon2 interrupt handling
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian <[EMAIL PROTECTED]>
+ *                David Mosberger-Tang <[EMAIL PROTECTED]>
+ *
+ * More information about perfmon available at:
+ *     http://perfmon2.sf.net
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include <linux/kernel.h>
+#include <linux/perfmon.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+static inline void pfm_mask_monitoring(struct pfm_context *ctx,
+                                      struct pfm_event_set *set)
+{
+       u64 now;
+
+       PFM_DBG_ovfl("masking monitoring");
+
+       now = sched_clock();
+
+       /*
+        * we save the PMD values such that we can read them while
+        * MASKED without having to stop the thread, which is
+        * unnecessary because monitoring is already stopped
+        *
+        * XXX: could be avoided in system-wide
+        */
+       pfm_modview_begin(set);
+       pfm_save_pmds(ctx, set);
+       pfm_modview_end(set);
+       pfm_arch_mask_monitoring(ctx, set);
+       /*
+        * accumulate the set duration up to this point
+        */
+       set->duration += now - set->duration_start;
+}
+
+/*
+ * main overflow processing routine.
+ *
+ * set->npend_ovfls is 0 when returning from this function even though
+ * set->ovfl_pmds[] may have bits set. When leaving, set->npend_ovfls
+ * must never be used to determine if there was a pending overflow.
+ */
+static void pfm_overflow_handler(struct pfm_context *ctx, struct pfm_event_set *set,
+                                unsigned long ip,
+                                struct pt_regs *regs)
+{
+       struct pfm_ovfl_arg *ovfl_arg;
+       struct pfm_event_set *set_orig;
+       void *hdr;
+       u64 old_val, ovfl_mask, new_val, ovfl_thres;
+       u64 *ovfl_notify, *ovfl_pmds, *pend_ovfls;
+       u64 *smpl_pmds, *reset_pmds;
+       u64 now, *pmds, t0, t1;
+       u32 ovfl_ctrl, num_ovfl, num_ovfl_orig;
+       u16 i, max_pmd, max_cnt_pmd, first_cnt_pmd;
+       u8 must_switch, has_64b_ovfl;
+       u8 ctx_block, has_notify, has_ovfl_sw;
+
+       now = t0 = sched_clock();
+
+       ovfl_mask = pfm_pmu_conf->ovfl_mask;
+
+       max_pmd = pfm_pmu_conf->max_pmd;
+       first_cnt_pmd = pfm_pmu_conf->first_cnt_pmd;
+       max_cnt_pmd = pfm_pmu_conf->max_cnt_pmd;
+
+       ovfl_pmds = set->ovfl_pmds;
+       num_ovfl = num_ovfl_orig = set->npend_ovfls;
+       pend_ovfls = set->povfl_pmds;
+       has_ovfl_sw = set->flags & PFM_SETFL_OVFL_SWITCH;
+       set_orig = set;
+
+       if (unlikely(ctx->state == PFM_CTX_ZOMBIE))
+               goto stop_monitoring;
+
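+       /* no 64-bit overflow observed yet, no set switch requested yet */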
+       must_switch = has_64b_ovfl = 0;
+
+       hdr = ctx->smpl_addr;
+
+       PFM_DBG_ovfl("ovfl_pmds=0x%llx npend=%u ip=%p, blocking=%d "
+                    "u_pmds=0x%llx use_fmt=%u",
+                    (unsigned long long)pend_ovfls[0],
+                    num_ovfl,
+                    (void *)ip,
+                    ctx->flags.block,
+                    (unsigned long long)set->used_pmds[0],
+                    ctx->smpl_fmt != NULL);
+
+       /*
+        * initialize temporary bitvectors
+        * we allocate bitvectors in the context
+        * rather than on the stack to minimize stack
+        * space consumption. The PMU interrupt priority is very
+        * high, which implies possible deep nesting of interrupts
+        * and hence limited kernel stack space.
+        *
+        * This is safe because a context can only be in the
+        * overflow handler once at a time
+        */
+       reset_pmds = set->reset_pmds;
+       ovfl_notify = ctx->ovfl_ovfl_notify;
+       pmds = set->view->set_pmds;
+
+       bitmap_zero(ulp(reset_pmds), max_pmd);
+
+       pfm_modview_begin(set);
+
+       /*
+        * first we update the virtual counters
+        *
+        * we leverage num_ovfl to minimize the number of
+        * iterations of the loop.
+        *
+        * The i < max_cnt_pmd test is just a sanity check
+        */
+       for (i = first_cnt_pmd; num_ovfl && i < max_cnt_pmd; i++) {
+               /*
+                * skip pmd which did not overflow
+                */
+               if (!test_bit(i, ulp(pend_ovfls)))
+                       continue;
+
+               num_ovfl--;
+
+               /*
+                * Note that the pmd is not necessarily 0 at this point as
+                * qualified events may have happened before the PMU was
+                * frozen. The residual count is not taken into consideration
+                * here but will be with any read of the pmd
+                */
+               old_val = new_val = pmds[i];
+               ovfl_thres = set->pmds[i].ovflsw_thres;
+               new_val += 1 + ovfl_mask;
+               pmds[i] = new_val;
+
+               /*
+                * check for overflow condition
+                */
+               if (likely(old_val > new_val)) {
+                       has_64b_ovfl = 1;
+                       if (has_ovfl_sw && ovfl_thres > 0) {
+                               if (ovfl_thres == 1)
+                                       must_switch = 1;
+                               set->pmds[i].ovflsw_thres = ovfl_thres - 1;
+                       }
+
+                       /*
+                        * what to reset because of this overflow
+                        */
+                       __set_bit(i, ulp(reset_pmds));
+
+                       bitmap_or(ulp(reset_pmds),
+                                 ulp(reset_pmds),
+                                 ulp(set->pmds[i].reset_pmds),
+                                 max_pmd);
+
+               } else {
+                       /* only keep track of 64-bit overflows */
+                       __clear_bit(i, ulp(pend_ovfls));
+                       /*
+                        * on some PMUs, it may be necessary to re-arm the PMD
+                        */
+                       pfm_arch_ovfl_reset_pmd(ctx, i);
+               }
+
+               PFM_DBG_ovfl("pmd%u=0x%llx old_val=0x%llx "
+                            "hw_pmd=0x%llx o_pmds=0x%llx must_switch=%u "
+                            "o_thres=%llu o_thres_ref=%llu",
+                            i,
+                            (unsigned long long)new_val,
+                            (unsigned long long)old_val,
+                            (unsigned long long)pfm_read_pmd(ctx, i),
+                            (unsigned long long)ovfl_pmds[0],
+                            must_switch,
+                            (unsigned long long)set->pmds[i].ovflsw_thres,
+                            (unsigned long long)set->pmds[i].ovflsw_ref_thres);
+       }
+       pfm_modview_end(set);
+
+       /*
+        * mark the overflow as consumed
+        */
+       set->npend_ovfls = 0;
+
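+       /* cache the blocking flag; it is cleared below if we do not mask */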
+       ctx_block = ctx->flags.block;
+
+       t1 = sched_clock();
+       __get_cpu_var(pfm_stats).ccnt0 += t1 - t0;
+       t0 = t1;
+
+       /*
+        * there was no 64-bit overflow, nothing else to do
+        */
+       if (!has_64b_ovfl)
+               return;
+
+       /*
+        * copy pend_ovfls to ovfl_pmds. The copy is used in
+        * the notification message and by getinfo_evtsets().
+        *
+        * pend_ovfls is modified to reflect only 64-bit overflows
+        */
+       bitmap_copy(ulp(ovfl_pmds),
+                   ulp(pend_ovfls),
+                   max_cnt_pmd);
+
+       /*
+        * build ovfl_notify bitmask from ovfl_pmds
+        */
+       bitmap_and(ulp(ovfl_notify),
+                  ulp(pend_ovfls),
+                  ulp(set->ovfl_notify),
+                  max_cnt_pmd);
+
+       has_notify = !bitmap_empty(ulp(ovfl_notify), max_cnt_pmd);
+       /*
+        * must reset for next set of overflows
+        */
+       bitmap_zero(ulp(pend_ovfls), max_cnt_pmd);
+
+       /*
+        * check for format
+        */
+       if (likely(ctx->smpl_fmt)) {
+               u64 start_cycles, end_cycles;
+               u64 *cnt_pmds;
+               int j, k, ret = 0;
+
+               ovfl_ctrl = 0;
+               num_ovfl = num_ovfl_orig;
+               ovfl_arg = &ctx->ovfl_arg;
+               cnt_pmds = pfm_pmu_conf->cnt_pmds;
+
+               ovfl_arg->active_set = set->id;
+
+               for (i = first_cnt_pmd; num_ovfl && !ret; i++) {
+
+                       if (!test_bit(i, ulp(ovfl_pmds)))
+                               continue;
+
+                       num_ovfl--;
+
+                       ovfl_arg->ovfl_pmd = i;
+                       ovfl_arg->ovfl_ctrl = 0;
+
+                       ovfl_arg->pmd_last_reset = set->pmds[i].lval;
+                       ovfl_arg->pmd_eventid = set->pmds[i].eventid;
+
+                       /*
+                        * copy values of pmds of interest;
+                        * the sampling format may use them.
+                        * We do not initialize the unused smpl_pmds_values
+                        */
+                       k = 0;
+                       smpl_pmds = set->pmds[i].smpl_pmds;
+                       if (!bitmap_empty(ulp(smpl_pmds), max_pmd)) {
+
+                               for (j = 0; j < max_pmd; j++) {
+
+                                       if (!test_bit(j, ulp(smpl_pmds)))
+                                               continue;
+
+                                       new_val = pfm_read_pmd(ctx, j);
+
+                                       /* for counters, build 64-bit value */
+                                       if (test_bit(j, ulp(cnt_pmds))) {
+                                               new_val = (set->view->set_pmds[j] & ~ovfl_mask)
+                                                       | (new_val & ovfl_mask);
+                                       }
+                                       ovfl_arg->smpl_pmds_values[k++] = new_val;
+
+                                       PFM_DBG_ovfl("s_pmd_val[%u]="
+                                                    "pmd%u=0x%llx",
+                                                    k, j,
+                                                    (unsigned long long)new_val);
+                               }
+                       }
+                       ovfl_arg->num_smpl_pmds = k;
+
+                       __get_cpu_var(pfm_stats).fmt_handler_calls++;
+
+                       start_cycles = sched_clock();
+
+                       /*
+                        * call custom buffer format record (handler) routine
+                        */
+                       ret = (*ctx->smpl_fmt->fmt_handler)(hdr,
+                                                           ovfl_arg,
+                                                           ip,
+                                                           now,
+                                                           regs);
+
+                       end_cycles = sched_clock();
+
+                       /*
+                        * for PFM_OVFL_CTRL_MASK and PFM_OVFL_CTRL_NOTIFY
+                        * we take the union
+                        *
+                        * The reset_pmds mask is constructed automatically
+                        * on overflow. When the actual reset takes place
+                        * depends on the masking, switch and notification
+                        * status. It may be deferred until pfm_restart().
+                        */
+                       ovfl_ctrl |= ovfl_arg->ovfl_ctrl;
+
+                       __get_cpu_var(pfm_stats).fmt_handler_ns += end_cycles
+                                                                - start_cycles;
+               }
+               /*
+                * when the format cannot handle the rest of the overflow,
+                * we abort right here
+                */
+               if (ret) {
+                       PFM_DBG_ovfl("handler aborted at PMD%u ret=%d",
+                                    i, ret);
+               }
+       } else {
+               /*
+                * When no sampling format is used, the default
+                * is:
+                *      - mask monitoring
+                *      - notify user if requested
+                *
+                * If notification is not requested, monitoring is masked
+                * and overflowed counters are not reset (saturation).
+                * This mimics the behavior of the default sampling format.
+                */
+               ovfl_ctrl = PFM_OVFL_CTRL_NOTIFY;
+
+               if (!must_switch || has_notify)
+                       ovfl_ctrl |= PFM_OVFL_CTRL_MASK;
+       }
+       t1 = sched_clock();
+       __get_cpu_var(pfm_stats).ccnt1 += t1 - t0;
+       t0 = t1;
+
+       PFM_DBG_ovfl("set%u o_notify=0x%llx o_pmds=0x%llx "
+                    "r_pmds=0x%llx ovfl_ctrl=0x%x",
+                    set->id,
+                    (unsigned long long)ovfl_notify[0],
+                    (unsigned long long)ovfl_pmds[0],
+                    (unsigned long long)reset_pmds[0],
+                    ovfl_ctrl);
+
+       /*
+        * we only reset (short reset) when we are not masking. Otherwise
+        * the reset is postponed until restart.
+        */
+       if (likely(!(ovfl_ctrl & PFM_OVFL_CTRL_MASK))) {
+               if (must_switch) {
+                       /*
+                        * pfm_switch_sets() takes care
+                        * of resetting new set if needed
+                        */
+                       pfm_switch_sets_from_intr(ctx);
+
+                       /*
+                        * update our view of the active set
+                        */
+                       set = ctx->active_set;
+
+                       must_switch = 0;
+               } else if (ovfl_ctrl & PFM_OVFL_CTRL_RESET) {
+                       u16 nn;
+                       t0 = sched_clock();
+                       nn = bitmap_weight(ulp(reset_pmds), max_pmd);
+                       if (nn)
+                               pfm_reset_pmds(ctx, set, nn, PFM_PMD_RESET_SHORT);
+                       __get_cpu_var(pfm_stats).ccnt5 += sched_clock() - t0;
+               }
+               /*
+                * do not block if not masked
+                */
+               ctx_block = 0;
+       } else {
+               pfm_mask_monitoring(ctx, set);
+               ctx->state = PFM_CTX_MASKED;
+               ctx->flags.can_restart = 1;
+       }
+       /*
+        * if we have not switched here, then remember the request
+        * for when monitoring is restarted
+        */
+       if (must_switch)
+               set->priv_flags |= PFM_SETFL_PRIV_SWITCH;
+
+       /*
+        * block only if CTRL_NOTIFY+CTRL_MASK and requested by user
+        *
+        * Defer notification until last operation in the handler
+        * to avoid spinlock contention
+        */
+       if (has_notify && (ovfl_ctrl & PFM_OVFL_CTRL_NOTIFY)) {
+               if (ctx_block) {
+                       ctx->flags.work_type = PFM_WORK_BLOCK;
+                       set_thread_flag(TIF_PERFMON_WORK);
+               }
+               pfm_ovfl_notify_user(ctx, set_orig, ip);
+       }
+
+       t1 = sched_clock();
+       __get_cpu_var(pfm_stats).ccnt2 += t1 - t0;
+
+       return;
+
+stop_monitoring:
+       /*
+        * Does not happen for a system-wide context nor for a
+        * self-monitored context. We cannot attach to a kernel-only
+        * thread, thus it is safe to set TIF bits, i.e., the thread
+        * will eventually leave the kernel or die and either we will
+        * catch the context and clean it up in pfm_handler_work() or
+        * pfm_exit_thread().
+        */
+       PFM_DBG_ovfl("ctx is zombie, converted to spurious");
+
+       __pfm_stop(ctx);
+       ctx->flags.work_type = PFM_WORK_ZOMBIE;
+       set_thread_flag(TIF_PERFMON_WORK);
+}
+
+/*
+ * interrupts are masked
+ *
+ * Context locking necessary to avoid concurrent accesses from other CPUs
+ *     - For per-thread, we must prevent pfm_restart() which works when
+ *       context is LOADED or MASKED
+ */
+static void __pfm_interrupt_handler(unsigned long iip, struct pt_regs *regs)
+{
+       struct task_struct *task;
+       struct pfm_context *ctx;
+       struct pfm_event_set *set;
+       u64 t0;
+
+       t0 = sched_clock();
+       __get_cpu_var(pfm_stats).ovfl_intr_all_count++;
+
+       task = __get_cpu_var(pmu_owner);
+       ctx = __get_cpu_var(pmu_ctx);
+
+       if (unlikely(ctx == NULL)) {
+               PFM_DBG_ovfl("no ctx");
+               goto spurious;
+       }
+
+       spin_lock(&ctx->lock);
+
+       set = ctx->active_set;
+
+       /*
+        * For SMP per-thread, it is not possible to have
+        * owner != NULL && task != current.
+        *
+        * For UP per-thread, because of lazy save, it
+        * is possible to receive an interrupt in another task
+        * which is not using the PMU. This means
+        * that the interrupt was in-flight at the
+        * time of pfm_ctxswout_thread(). In that
+        * case it will be replayed when the task
+        * is scheduled again. Hence we convert to spurious.
+        *
+        * The basic rule is that an overflow is always
+        * processed in the context of the task that
+        * generated it for all per-thread contexts.
+        *
+        * for system-wide, task is always NULL
+        */
+#ifndef CONFIG_SMP
+       if (unlikely((task && current->pfm_context != ctx))) {
+               PFM_DBG_ovfl("spurious: not owned by current task");
+               goto spurious;
+       }
+#endif
+       if (unlikely(!pfm_arch_is_active(ctx))) {
+               PFM_DBG_ovfl("spurious: monitoring non active");
+               goto spurious;
+       }
+
+       /*
+        * freeze PMU and collect overflowed PMD registers
+        * into set->povfl_pmds. Number of overflowed PMDs reported
+        * in set->npend_ovfls
+        */
+       pfm_arch_intr_freeze_pmu(ctx, set);
+       if (unlikely(!set->npend_ovfls)) {
+               PFM_DBG_ovfl("no npend_ovfls");
+               goto spurious;
+       }
+
+       __get_cpu_var(pfm_stats).ovfl_intr_regular_count++;
+
+       __get_cpu_var(pfm_stats).ccnt3 += sched_clock() - t0;
+
+       pfm_overflow_handler(ctx, set, iip, regs);
+
+       pfm_arch_intr_unfreeze_pmu(ctx);
+
+       __get_cpu_var(pfm_stats).ccnt4 += sched_clock() - t0;
+
+       spin_unlock(&ctx->lock);
+
+       return;
+
+spurious:
+       /* ctx may be NULL */
+       pfm_arch_intr_unfreeze_pmu(ctx);
+       if (ctx)
+               spin_unlock(&ctx->lock);
+}
+
+void pfm_interrupt_handler(unsigned long iip, struct pt_regs *regs)
+{
+       u64 start;
+
+       BUG_ON(!irqs_disabled());
+
+       start = sched_clock();
+
+       __pfm_interrupt_handler(iip, regs);
+
+       __get_cpu_var(pfm_stats).ovfl_intr_ns += sched_clock() - start;
+}
+EXPORT_SYMBOL(pfm_interrupt_handler);
+