Hi,

Can you try:
- Reproducing with a more recent LTTng-UST/LTTng-Tools (e.g. 2.13)? LTTng 2.7
  is not supported anymore.
- Reproducing with "per-pid" UST buffers rather than "per-uid"?
- Reproducing without the "tracefile rotation" mode (without the
  --tracefile-count and --tracefile-size options to the channel)? See the
  example commands below.

Also, do you happen to have traced applications which are frequently killed
asynchronously while some of their threads are actively tracing?
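For instance, a session along these lines would cover the last two points at
once (a sketch only: the session name is a placeholder, and the sizes and
timers mirror the channel attributes quoted below):

  lttng create repro-session
  # Per-pid buffers, same geometry/timers as your channel0, but without
  # --tracefile-size/--tracefile-count, so tracefile rotation stays disabled.
  lttng enable-channel --userspace --buffers-pid \
          --subbuf-size 1M --num-subbuf 16 \
          --switch-timer 20000000 --read-timer 60000000 \
          channel0
  lttng enable-event --userspace --channel channel0 --all
  lttng start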
Thanks,

Mathieu

----- On Dec 7, 2021, at 4:03 AM, lttng-dev <lttng-dev@lists.lttng.org> wrote:

> Hi, lttng-dev:
>
> I recently ran into a strange problem involving lttng-consumerd and CTF
> files. The CTF files belonging to some CPUs stopped rotating, while the CTF
> files belonging to the other CPUs kept working as usual. I am very sure all
> CPUs were producing spans all the time.
>
> # date; ls -ltrh channel0_0_* | tail -n 3; date; ls -ltrh channel0_1_* | tail -n 3
> Tue Dec 7 16:25:45 CST 2021
> -rw-rw---- 1 root root 1.8M Dec 7 16:20 channel0_0_17
> -rw-rw---- 1 root root 2.0M Dec 7 16:23 channel0_0_18
> -rw-rw---- 1 root root 916K Dec 7 16:24 channel0_0_19
> Tue Dec 7 16:25:45 CST 2021
> -rw-rw---- 1 root root 1.7M Dec 6 00:30 channel0_1_8
> -rw-rw---- 1 root root 1.9M Dec 6 00:31 channel0_1_9
> -rw-rw---- 1 root root 388K Dec 6 00:32 channel0_1_10
>
> Notice that the CTF file for CPU 0 (channel0_0_19) was modified just now,
> but the CTF file for CPU 1 (channel0_1_10) stopped being updated on Dec 6.
>
> I attached gdb to lttng-consumerd (with breakpoints at
> lib_ring_buffer_channel_do_read() and lib_ring_buffer_poll_deliver() -- I
> configured a read timer on the channels). See the following annotated
> disassembly of lib_ring_buffer_poll_deliver():
>
> 0x00007f0f2418a445 <+213>: mov 0x0(%r13),%rcx     rcx:0x7f0f0402e610 (handle->table)
> 0x00007f0f2418a449 <+217>: mov 0x98(%rsi),%r9     rsi:0x7f0efb86f000 (buf) r9:0x86c400000 (consumed_old = buf->consumed)
> 0x00007f0f2418a450 <+224>: lea 0x150(%rsi),%rdi   rdi:0x7f0efda75150
> 0x00007f0f2418a457 <+231>: mov 0x150(%rsi),%rax   rax:3 (cpu?)
> 0x00007f0f2418a45e <+238>: mov 0x78(%rbp),%rdx    rbp:chan rdx:0x1000000 (chan->backend.buf_size)
> 0x00007f0f2418a462 <+242>: mov 0x88(%rbp),%r8d    r8d:0x14 (20) (chan->backend.buf_size_order)
> 0x00007f0f2418a469 <+249>: cmp %rax,0x8(%rcx)
> 0x00007f0f2418a46d <+253>: jbe 0x7f0f2418a80c <lib_ring_buffer_channel_do_read+1180>
> 0x00007f0f2418a473 <+259>: shl $0x6,%rax          rax:192
> 0x00007f0f2418a477 <+263>: sub $0x1,%rdx          rdx:0xffffff (16777215) (chan->backend.buf_size - 1)
> 0x00007f0f2418a47b <+267>: lea 0x10(%rax,%rcx,1),%r10  rax:192 rcx:0x7f0f0402e610 (handle->table) r10:0x7f0f0402e6e0
> 0x00007f0f2418a480 <+272>: and %r9,%rdx           r9:0x8fdc00000 rdx:0xc00000 (12582912) buf_trunc(consumed_old, chan->backend.buf_size - 1)
> 0x00007f0f2418a483 <+275>: mov 0x8(%rdi),%rax     rdi:0x7f0efda75150 rax:2688 (ref_offset = (size_t) ref->offset)
> 0x00007f0f2418a487 <+279>: mov %r8d,%ecx          ecx:0x14 (20)
> 0x00007f0f2418a48a <+282>: shr %cl,%rdx           (....)
> 0x00007f0f2418a48d <+285>: shl $0x7,%rdx
> 0x00007f0f2418a491 <+289>: add %rax,%rdx
> 0x00007f0f2418a494 <+292>: lea 0x80(%rdx),%rax
> 0x00007f0f2418a49b <+299>: cmp 0x28(%r10),%rax
> 0x00007f0f2418a49f <+303>: ja 0x7f0f2418a80c <lib_ring_buffer_channel_do_read+1180>
> 0x00007f0f2418a4a5 <+309>: mov 0x20(%r10),%rax    r10:0x7f0f0402e6e0 rax:0x7f0efb86f000 (buf)
> 0x00007f0f2418a4a9 <+313>: add %rdx,%rax          rdx:3200 rax:0x7f0efb86fc80
> 0x00007f0f2418a4ac <+316>: mov (%rax),%rax        rax:0x86c00000 (commit_count!!! Incremented _once_ at sb switch: cc_sb)
> 0x00007f0f2418a4af <+319>: mov 0x80(%rbp),%r8     r8:0x100000 (chan->backend.subbuf_size)
> 0x00007f0f2418a4b6 <+326>: mov 0x78(%rbp),%rdx    rdx:0x1000000 (chan->backend.buf_size)
> 0x00007f0f2418a4ba <+330>: mov 0x8c(%rbp),%ecx    ecx:4
> 0x00007f0f2418a4c0 <+336>: mov 0x80(%rsi),%rdi    rdi:0x86d400000
> 0x00007f0f2418a4c7 <+343>: sub %r8,%rax           rax:0x86b00000 (commit_count - chan->backend.subbuf_size)
> 0x00007f0f2418a4ca <+346>: and 0x8(%rbp),%rax     rax & chan->commit_count_mask = 0x86b00000
> 0x00007f0f2418a4ce <+350>: neg %rdx               rdx:0x1000000 (16M, chan->backend.buf_size) --> 0xffffffffff000000 (-16777216)
> 0x00007f0f2418a4d1 <+353>: and %r9,%rdx           r9:0x86c400000 (consumed_old) rdx:0x86c000000
> 0x00007f0f2418a4d4 <+356>: shr %cl,%rdx           cl:4 rdx:0x86c00000
> 0x00007f0f2418a4d7 <+359>: cmp %rdx,%rax          rax:0x86b00000 rdx:0x86c00000
> 0x00007f0f2418a4da <+362>: jne 0x7f0f2418a411 <lib_ring_buffer_channel_do_read+161>
> 0x00007f0f2418a4e0 <+368>: mov %r8,%rax
>
> 209 static inline
> 210 int lib_ring_buffer_poll_deliver(const struct lttng_ust_lib_ring_buffer_config *config,
> 211                                  struct lttng_ust_lib_ring_buffer *buf,
> 212                                  struct channel *chan,
> 213                                  struct lttng_ust_shm_handle *handle)
> 214 {
> 215         unsigned long consumed_old, consumed_idx, commit_count, write_offset;
> 216
> 217         consumed_old = uatomic_read(&buf->consumed);
> 218         consumed_idx = subbuf_index(consumed_old, chan);
> 219         commit_count = v_read(config, &shmp_index(handle, buf->commit_cold, consumed_idx)->cc_sb);
> 220         /*
> 221          * No memory barrier here, since we are only interested
> 222          * in a statistically correct polling result. The next poll will
> 223          * get the data if we are racing. The mb() that ensures correct
> 224          * memory order is in get_subbuf.
> 225          */
> 226         write_offset = v_read(config, &buf->offset);
> 227
> 228         /*
> 229          * Check that the subbuffer we are trying to consume has been
> 230          * already fully committed.
> 231          */
> 232
> 233         if (((commit_count - chan->backend.subbuf_size)
> 234              & chan->commit_count_mask)
> 235             - (buf_trunc(consumed_old, chan)
> 236                >> chan->backend.num_subbuf_order)
> 237             != 0)
> 238                 return 0;
> 239
> 240         /*
> 241          * Check that we are not about to read the same subbuffer in
> 242          * which the writer head is.
> 243          */
> 244         if (subbuf_trunc(write_offset, chan) - subbuf_trunc(consumed_old, chan)
> 245             == 0)
> 246                 return 0;
> 247
> 248         return 1;
> 249
> 250 }
>
> It seems that lib_ring_buffer_channel_do_read() -> lib_ring_buffer_poll_deliver()
> returned 0 at line 238 ($consumed_old:0x86c400000, $commit_count:0x86c00000,
> $write_offset:0x86d400000).
>
> Buffer type: per UID
>
> Channels:
> -------------
> - channel0: [enabled]
>
>     Attributes:
>       overwrite mode: 0
>       subbufers size: 1048576
>       number of subbufers: 16
>       switch timer interval: 20000000
>       read timer interval: 60000000
>       trace file count: 80
>       trace file size (bytes): 2097152
>       output: mmap()
>
> PS: The LTTng version is 2.7 (I know it is a very old version :)).
>
> Do you have any ideas about this problem?
> Thanks in advance
> Thanks a lot
> zhenyu.ren
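To make the failing check at line 238 concrete: plugging the values captured
above into the same computation shows the cold commit counter lagging by
exactly one sub-buffer. A small standalone sketch (not LTTng code; the names
mirror the quoted source, and commit_count_mask is assumed to be
~0UL >> num_subbuf_order as in the ring buffer frontend):

#include <stdio.h>

int main(void)
{
        /* Values captured in gdb. */
        unsigned long consumed_old = 0x86c400000UL;  /* buf->consumed */
        unsigned long commit_count = 0x86c00000UL;   /* commit_cold[idx].cc_sb */
        unsigned long subbuf_size  = 0x100000UL;     /* 1 MiB sub-buffers */
        unsigned long buf_size     = 0x1000000UL;    /* 16 MiB = 16 sub-buffers */
        unsigned int num_subbuf_order = 4;           /* log2(16) */
        unsigned long commit_count_mask = ~0UL >> num_subbuf_order; /* assumption */

        /* buf_trunc(): round consumed_old down to a buffer-size multiple. */
        unsigned long trunc = consumed_old & ~(buf_size - 1);

        unsigned long expected = trunc >> num_subbuf_order;
        unsigned long actual = (commit_count - subbuf_size) & commit_count_mask;

        printf("expected=0x%lx actual=0x%lx delta=0x%lx\n",
               expected, actual, expected - actual);
        return 0;
}

This prints expected=0x86c00000 actual=0x86b00000 delta=0x100000: cc_sb lags
by exactly one sub-buffer, so lib_ring_buffer_poll_deliver() keeps returning 0
and that sub-buffer is never delivered. A writer killed between reserving
space and performing the final sub-buffer commit could leave cc_sb lagging
like this, which is why the questions above matter.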
--
Mathieu Desnoyers
EfficiOS Inc.
http://www.efficios.com