On 13/02/2018, Jason Vas Dias <jason.vas.d...@gmail.com> wrote: > Good day - > > I'd much appreciate some advice as to why, on my Intel x86_64 > ( DisplayFamily_DisplayModel : 06_3CH ), running either Linux 4.12.10, > or Linux 3.10.0, any attempt to count all of : > PERF_COUNT_HW_BRANCH_INSTRUCTIONS > (or raw config 0xC4) , and > PERF_COUNT_HW_BRANCH_MISSES > (or raw config 0xC5), and > combined with > PERF_COUNT_HW_CACHE_REFERENCES > (or raw config 0x4F2E ), and > PERF_COUNT_HW_CACHE_MISSES > (or raw config 0x412E) , > results in ALL COUNTERS BEING 0 in a read of the Group FD or > mmap sample area. > > This is demonstrated by the example program, which will > use perf_event_open() to create a Group Leader FD for the first event, > and associate all other events with that Event Group , so that it > will read all events on the group FD . > > The perf_event_open() calls and the ioctl(event_fd, PERF_EVENT_IOC_ID, &id) > calls all return successfully , but if I combine ANY of > ( PERF_COUNT_HW_BRANCH_INSTRUCTIONS, > PERF_COUNT_HW_BRANCH_MISSES > ) with any of > ( PERF_COUNT_HW_CACHE_REFERENCES, > PERF_COUNT_HW_CACHE_MISSES > ) in the Event Group, ALL events have '0' event->value. > > Demo : > 1. Compile program to use kernel mapped Generic Events: > $ gcc -std=gnu11 -o perf_bug perf_bug.c > Running program shows all counters have 0 values, since both > CACHE & BRANCH hits+misses are being requested: > > $ ./perf_bug > EVENT: Branch Instructions : 0 > EVENT: Branch Misses : 0 > EVENT: Instructions : 0 > EVENT: CPU Cycles : 0 > EVENT: Ref. CPU Cycles : 0 > EVENT: Bus Cycles : 0 > EVENT: Cache References : 0 > EVENT: Cache Misses : 0 > > NOT registering interest in EITHER the BRANCH counters > OR the CACHE counters fixes the problem: > > Compile without registering for BRANCH_INSTRUCTIONS > or BRANCH_MISSES: > $ gcc -std=gnu11 -DNO_BUG_NO_BRANCH -o perf_bug perf_bug.c > $ ./perf_bug > EVENT: Instructions : 914 > EVENT: CPU Cycles : 4110 > EVENT: Ref. 
CPU Cycles : 4437 > EVENT: Bus Cycles : 152 > EVENT: Cache References : 1 > EVENT: Cache Misses : 1 > > Compile without registering for CACHE_REFERENCES or CACHE_MISSES: > $ gcc -std=gnu11 -DNO_BUG_NO_CACHE -o perf_bug perf_bug.c > $ ./perf_bug > EVENT: Branch Instructions : 106 > EVENT: Branch Misses : 6 > EVENT: Instructions : 914 > EVENT: CPU Cycles : 4132 > EVENT: Ref. CPU Cycles : 8526 > EVENT: Bus Cycles : 295 > > The same thing happens if I do not use Generic Events, but rather > "dynamic raw PMU" events, by putting the hex values from > /sys/bus/event_source/devices/cpu/events/? into the perf_event_attr > config, OR'ed with (1<<63), and using the PERF_TYPE_RAW perf_event_attr > type value : > > $ gcc -DUSE_RAW_PMU -o perf_bug perf_bug.c > $ ./perf_bug > EVENT: Branch Instructions : 0 > EVENT: Branch Misses : 0 > EVENT: Instructions : 0 > EVENT: CPU Cycles : 0 > EVENT: Ref. CPU Cycles : 0 > EVENT: Bus Cycles : 0 > EVENT: Cache References : 0 > EVENT: Cache Misses : 0 > > > $ gcc -DUSE_RAW_PMU -DNO_BUG_NO_BRANCH -o perf_bug perf_bug.c > $ ./perf_bug > EVENT: Instructions : 914 > EVENT: CPU Cycles : 4102 > EVENT: Ref. CPU Cycles : 4959 > EVENT: Bus Cycles : 171 > EVENT: Cache References : 2 > EVENT: Cache Misses : 2 > > $ gcc -DUSE_RAW_PMU -DNO_BUG_NO_CACHE -o perf_bug perf_bug.c > $ ./perf_bug > EVENT: Branch Instructions : 106 > EVENT: Branch Misses : 6 > EVENT: Instructions : 914 > EVENT: CPU Cycles : 4108 > EVENT: Ref. 
CPU Cycles : 10817 > EVENT: Bus Cycles : 373 > > > The perf tool itself seems to have the same issue: > > With CACHE & BRANCH counters does not work : > $ perf stat -e '{r0c4,r0c5,r0c0,r03c,r0300,r013c,r04F2E,r0412E}:SIu' sleep > 1 > > Performance counter stats for 'sleep 1': > > <not counted> r0c4 > (0.00%) > <not counted> r0c5 > (0.00%) > <not counted> r0c0 > (0.00%) > <not counted> r03c > (0.00%) > <not counted> r0300 > (0.00%) > <not counted> r013c > (0.00%) > <not counted> r04F2E > (0.00%) > <not supported> r0412E > > 1.001652932 seconds time elapsed > > Some events weren't counted. Try disabling the NMI watchdog: > echo 0 > /proc/sys/kernel/nmi_watchdog > perf stat ... > echo 1 > /proc/sys/kernel/nmi_watchdog > > Disabling the NMI watchdog makes no difference . > > It is very strange that perf thinks 'r0412E' is not supported : > $ cat /sys/bus/event_source/devices/cpu/cache_misses > event=0x2e,umask=0x41 > > The kernel should not be advertizing an unsupported event > in a /sys/bus/event_source/devices/cpu/events/ file, should it ? > > So perf stat has the same problem - without either Cache or Branch > counters seems to work fine: > > without cache: > $ perf stat -e '{r0c4,r0c5,r0c0,r03c,r0300,r013c}:SIu' sleep 1 > > Performance counter stats for 'sleep 1': > > 37740 r0c4 > 3557 r0c5 > 188552 r0c0 > 311684 r03c > 360963 r0300 > 12461 r013c > > 1.001508109 seconds time elapsed > > without branch: > $ perf stat -e '{r0c0,r03c,r0300,r013c,r04F2E,r0412E}:SIu' sleep 1 > > Performance counter stats for 'sleep 1': > > 188554 r0c0 > 320242 r03c > 452748 r0300 > 15633 r013c > 4145 r04F2E > 3022 r0412E > > 1.001810421 seconds time elapsed > > proving again that perf's claim that 'r0412E' is not supported is bogus. 
> The Intel SDM's table 19-1 Architectural events, which ALL Intel CPUs > are meant to support, does include 'Event: 2EH | Umask: 4FH : LLC > Reference ' and 'Event: 2EH | Umask: 41H : LLC Miss' , as well as : > 'Event : C4H | Umask: 00H : Branch Instructions Retired' and > 'Event : C5H | Umask: 00H : Branch Misses Retired' . > So why can't perf count them all in the same group? > > Please , can anyone enlighten me as to what is going on here ? > > Why can't I count all of > ( BRANCH_INSTRUCTIONS , BRANCH_MISSES , > CACHE_REFERENCES, CACHE_MISSES > ) > in the same Perf Event Group ? > > Thanks in advance for any replies, > Best Regards, > Jason >
Actually, it appears that ONLY the combination of 'BRANCH_MISSES' and 'CACHE_MISSES' makes all sampled counter values 0; if either counter is not requested, all other counters have non-zero values. I've updated the program to reflect this. And nmi_watchdog=1 DOES make a difference: if nmi_watchdog is > 0, then ANY combination of {BRANCH,CACHE}_{REFS,MISSES} makes ALL sampled counter values 0, but if nmi_watchdog is 0, then only the combination of BRANCH_MISSES and CACHE_MISSES makes all sampled values 0. This is a really nasty bug and makes the Linux PERF facility rather unusable: because of this bug, Linux PERF provides no way of measuring cache and branch prediction performance at the same time for the same instruction sequence. Can anyone suggest a valid reason for this? Or any workarounds? If no one suggests a workaround or a valid reason, I guess I should raise this as a serious bug. Thanks & Regards, Jason
/* Demonstration of Linux PERF bug:
 *   Linux is unable to count BRANCH_INSTRUCTIONS or BRANCH_MISSES
 *   at the same time as CACHE_REFERENCES or CACHE_MISSES.
 *
 * The program opens every configured event in ONE event group (the first
 * event is the group leader), runs a tiny workload with the group enabled,
 * reads all counters back through the leader FD and prints them.
 *
 * Build-time switches (all optional):
 *   USE_RAW_PMU             use PERF_TYPE_RAW codes instead of generic events
 *   NO_BUG_NO_BRANCH        leave out both branch events
 *   NO_BUG_NO_BRANCH_INST   leave out only "Branch Instructions"
 *   NO_BUG_NO_BRANCH_MISS   leave out only "Branch Misses"
 *   NO_BUG_NO_CACHE         leave out both cache events
 *   NO_BUG_NO_CACHE_REF     leave out only "Cache References"
 *   NO_BUG_NO_CACHE_MISS    leave out only "Cache Misses"
 *
 * Exit status: 0 if at least one counter was non-zero, 1 otherwise
 * (including every error path).
 */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE /* syscall() is not declared under a strict -std=cNN */
#endif
#include <sys/types.h>
#include <stdint.h>
#include <inttypes.h>
#include <stdbool.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <linux/perf_event.h>

/* Raw PMU config as used by this demonstration: the event/umask code with
 * bit 63 set.  UINT64_C keeps the shift well-defined even where
 * 'unsigned long' is only 32 bits wide.
 */
#define RAW_CFG(code) ((UINT64_C(1) << 63) | (uint64_t)(code))

/* Thin wrapper around the perf_event_open system call, which has no libc
 * wrapper of its own.  Returns the new event FD, or -1 with errno set.
 */
static int
perf_event_open
( struct perf_event_attr *hw_event,
  pid_t pid,
  int cpu,
  int group_fd,
  unsigned long flags
)
{
    return (int) syscall(__NR_perf_event_open, hw_event, pid, cpu, group_fd, flags);
}

int main( int argc, const char *const* argv)
{
    (void) argc;
    (void) argv;

    /* One entry per event in the group.  Every entry ends with a trailing
     * comma so that ANY combination of the NO_BUG_* switches still yields a
     * syntactically valid initializer.
     */
    struct evcfg
    {
        uint64_t perf_type;
        uint64_t perf_cfg;
        const char *name;
        int fd;          /* -1 until the event has been opened */
        uint64_t id;     /* kernel-assigned ID, from PERF_EVENT_IOC_ID */
    } pe [] =
    {
#ifndef USE_RAW_PMU
        // so we can test using Generic Kernel Event mapping:
#ifndef NO_BUG_NO_BRANCH
#ifndef NO_BUG_NO_BRANCH_INST
        { PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "Branch Instructions", -1, 0 },
#endif
#ifndef NO_BUG_NO_BRANCH_MISS
        { PERF_TYPE_HARDWARE, PERF_COUNT_HW_BRANCH_MISSES,       "Branch Misses",       -1, 0 },
#endif
#endif
        { PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS,        "Instructions",        -1, 0 },
        { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES,          "CPU Cycles",          -1, 0 },
        { PERF_TYPE_HARDWARE, PERF_COUNT_HW_REF_CPU_CYCLES,      "Ref. CPU Cycles",     -1, 0 },
        { PERF_TYPE_HARDWARE, PERF_COUNT_HW_BUS_CYCLES,          "Bus Cycles",          -1, 0 },
#ifndef NO_BUG_NO_CACHE
#ifndef NO_BUG_NO_CACHE_REF
        { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_REFERENCES,    "Cache References",    -1, 0 },
#endif
#ifndef NO_BUG_NO_CACHE_MISS
        { PERF_TYPE_HARDWARE, PERF_COUNT_HW_CACHE_MISSES,        "Cache Misses",        -1, 0 },
#endif
#endif
#else
        // or test using raw PMU codes - these come from the Intel SDM, Chapter 19, Table 19-1,
        // and I've checked they are identical to the values in
        // /sys/bus/event_source/devices/cpu/events/
#ifndef NO_BUG_NO_BRANCH
#ifndef NO_BUG_NO_BRANCH_INST
        { PERF_TYPE_RAW, RAW_CFG(0xC4),    "Branch Instructions", -1, 0 }, // branch_instructions
#endif
#ifndef NO_BUG_NO_BRANCH_MISS
        { PERF_TYPE_RAW, RAW_CFG(0xC5),    "Branch Misses",       -1, 0 }, // branch_misses
#endif
#endif
        { PERF_TYPE_RAW, RAW_CFG(0xC0),    "Instructions",        -1, 0 }, // instructions
        { PERF_TYPE_RAW, RAW_CFG(0x3C),    "CPU Cycles",          -1, 0 }, // cpu cycles
        { PERF_TYPE_RAW, RAW_CFG(0x0300),  "Ref. CPU Cycles",     -1, 0 }, // ref cpu cycles
        { PERF_TYPE_RAW, RAW_CFG(0x013C),  "Bus Cycles",          -1, 0 }, // bus cycles
#ifndef NO_BUG_NO_CACHE
#ifndef NO_BUG_NO_CACHE_REF
        { PERF_TYPE_RAW, RAW_CFG(0x04F2E), "Cache References",    -1, 0 }, // cache references
#endif
#ifndef NO_BUG_NO_CACHE_MISS
        { PERF_TYPE_RAW, RAW_CFG(0x0412E), "Cache Misses",        -1, 0 }, // cache misses
#endif
#endif
#endif
    };
#define N_EV (sizeof(pe)/sizeof(pe[0]))

    /* Buffer layout the kernel produces for read_format ==
     * PERF_FORMAT_GROUP | PERF_FORMAT_ID |
     * PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING.
     */
    struct
    {
        uint64_t nr, time_enabled, time_running;
        struct event
        {
            uint64_t value, id;
        } ev[N_EV];
    } events;

    int rc = 1;                       /* pessimistic until proven otherwise */
    int fd = -1;                      /* group leader FD, -1 until first open */
    const pid_t pid = getpid();
    bool non_zero_event = false;
    uint64_t a_num = 0x0102030405060708;
    uint64_t b_num = ~a_num;
    int cnt = 100;
    ssize_t n_read;

    /* Open every event; the first one becomes the leader, the rest join it. */
    for (size_t i = 0; i < N_EV; i++)
    {
        struct perf_event_attr pea;

        memset(&pea, '\0', sizeof(pea));
        pea.size        = PERF_ATTR_SIZE_VER5;
        pea.type        = pe[i].perf_type;
        pea.config      = pe[i].perf_cfg;
        pea.read_format = PERF_FORMAT_GROUP
                        | PERF_FORMAT_ID
                        | PERF_FORMAT_TOTAL_TIME_ENABLED
                        | PERF_FORMAT_TOTAL_TIME_RUNNING;
        pea.disabled       = 1;
        pea.exclude_kernel = 1;
        pea.exclude_idle   = 1;
        pea.exclude_hv     = 1;

        pe[i].fd = perf_event_open(&pea, pid, -1, fd, 0);
        if (pe[i].fd == -1)
        {
            fprintf(stderr, "perf_event_open failed : %d : '%s'.\n", errno, strerror(errno));
            goto out;
        }
        if (fd == -1)
            fd = pe[i].fd; // this is the Group Leader FD

        if (0 != ioctl(pe[i].fd, PERF_EVENT_IOC_ID, &pe[i].id))
        {
            fprintf(stderr, "ioctl(fd, PERF_EVENT_IOC_ID) failed for #%zu : %d : '%s'.\n",
                    i, errno, strerror(errno));
            goto out;
        }
    }

    if (0 != ioctl(fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP))
    {
        fprintf(stderr, "ioctl(fd, PERF_EVENT_IOC_RESET) failed : %d : '%s'.\n",
                errno, strerror(errno));
        goto out;
    }

    // do something to measure - let's try 100 long divisions:
    if (0 != ioctl(fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP))
    {
        fprintf(stderr, "ioctl(fd, PERF_EVENT_IOC_ENABLE) failed : %d : '%s'.\n",
                errno, strerror(errno));
        goto out;
    }

    do
    {
        a_num = (b_num /= a_num);
    } while (--cnt);

    if (0 != ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP))
    {
        fprintf(stderr, "ioctl(fd, PERF_EVENT_IOC_DISABLE) failed : %d : '%s'.\n",
                errno, strerror(errno));
        goto out;
    }

    /* One read on the leader returns the whole group. */
    n_read = read(fd, &events, sizeof(events));
    if (n_read < 0)
    {
        fprintf(stderr, "read of event group leader FD failed : %d : '%s'.\n",
                errno, strerror(errno));
        goto out;
    }
    if ((size_t) n_read != sizeof(events))
    {
        /* errno is not meaningful for a short read, so do not print it. */
        fprintf(stderr, "read of event group leader FD was short : %zd of %zu bytes.\n",
                n_read, sizeof(events));
        goto out;
    }

    if (events.nr != N_EV)
    {
        fprintf(stderr, "unexpected number of events read: %" PRIu64 "\n", events.nr);
        goto out;
    }

    /* time_running == 0 means the group spent no time counting on the PMU,
     * so all-zero values below are "not counted" rather than measured zeros.
     * A group is only scheduled as a whole; it may be asking for more
     * hardware counters than are available at once.
     */
    if (events.time_running == 0)
    {
        fprintf(stderr,
                "warning: event group was never scheduled "
                "(time_enabled=%" PRIu64 ", time_running=0); "
                "the group may need more hardware counters than are available.\n",
                events.time_enabled);
    }

    /* Match each returned value to its configuration by kernel event ID. */
    for (uint64_t k = 0; k < events.nr; k++)
    {
        const struct event *ev = &events.ev[k];
        const struct evcfg *match = NULL;

        for (size_t i = 0; i < N_EV; i++)
        {
            if (pe[i].id == ev->id)
            {
                match = &pe[i];
                break;
            }
        }
        if (match == NULL)
        {
            fprintf(stderr, "Kernel returned unknown event ID: %" PRIu64 "\n", ev->id);
            goto out;
        }

        printf("EVENT: %s : %" PRIu64 "\n", match->name, ev->value);
        if (ev->value != 0)
            non_zero_event = true;
    }

    rc = non_zero_event ? 0 : 1;

out:
    /* Release every event FD that was successfully opened. */
    for (size_t i = 0; i < N_EV; i++)
    {
        if (pe[i].fd != -1)
            close(pe[i].fd);
    }
    return rc;
}