Hello, > On 26 Jan 2026, at 3:27, Ming Lei <[email protected]> wrote: > > On Sun, Jan 25, 2026 at 08:35:20PM +0200, Alexander Atanasov wrote: >> On 25.01.26 17:28, Ming Lei wrote: >>> On Fri, Jan 23, 2026 at 05:00:33PM +0200, Alexander Atanasov wrote: >>>> >>>> >> >> [snip] >> >>> Can you test the following patch and see if re-order still can happen? >>> >>> >>> diff --git a/tools/testing/selftests/ublk/test_generic_01.sh >>> b/tools/testing/selftests/ublk/test_generic_01.sh >>> index 26cf3c7ceeb5..26d5e52ece29 100755 >>> --- a/tools/testing/selftests/ublk/test_generic_01.sh >>> +++ b/tools/testing/selftests/ublk/test_generic_01.sh >>> @@ -13,7 +13,7 @@ if ! _have_program fio; then >>> exit "$UBLK_SKIP_CODE" >>> fi >>> -_prep_test "null" "sequential io order" >>> +_prep_test "null" "ublk dispatch won't reorder IO" >>> dev_id=$(_add_ublk_dev -t null) >>> _check_add_dev $TID $? >>> @@ -39,9 +39,13 @@ fio --name=write_seq \ >>> ERR_CODE=$? >>> kill "$btrace_pid" >>> wait >>> -if grep -q "io_out_of_order" "$UBLK_TMP"; then >>> - cat "$UBLK_TMP" >>> + >>> +# Check for out-of-order completions detected by bpftrace >>> +if grep -q "^out_of_order:" "$UBLK_TMP"; then >>> + echo "I/O reordering detected:" >>> + grep "^out_of_order:" "$UBLK_TMP" >>> ERR_CODE=255 >>> fi >>> + >>> _cleanup_test "null" >>> _show_result $TID $ERR_CODE >>> diff --git a/tools/testing/selftests/ublk/trace/seq_io.bt >>> b/tools/testing/selftests/ublk/trace/seq_io.bt >>> index b2f60a92b118..60ac40e66606 100644 >>> --- a/tools/testing/selftests/ublk/trace/seq_io.bt >>> +++ b/tools/testing/selftests/ublk/trace/seq_io.bt >>> @@ -2,23 +2,45 @@ >>> $1: dev_t >>> $2: RWBS >>> $3: strlen($2) >>> + >>> + Track request order between block_io_start and block_rq_complete. >>> + For each request, record its start sequence number and verify >>> + completions happen in the same order. >>> */ >>> + >>> BEGIN { >>> - @last_rw[$1, str($2)] = (uint64)0; >>> + @start_seq = (uint64)0; >>> + @complete_seq = (uint64)0; >>> + @out_of_order = (uint64)0; >>> +} >>> + >>> +tracepoint:block:block_io_start >>> +{ >>> + if ((int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)) { >>> + @start_order[args.sector] = @start_seq; >>> + @start_seq = @start_seq + 1; >>> + } >>> } >>> + >>> tracepoint:block:block_rq_complete >>> { >>> - $dev = $1; >>> if ((int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)) { >>> - $last = @last_rw[$dev, str($2)]; >>> - if ((uint64)args.sector != $last) { >>> - printf("io_out_of_order: exp %llu actual %llu\n", >>> - args.sector, $last); >>> + $expected_order = @start_order[args.sector]; >>> + if ($expected_order != @complete_seq) { >>> + printf("out_of_order: sector %llu started at seq %llu but completed at >>> seq %llu\n", >>> + args.sector, $expected_order, @complete_seq); >>> + @out_of_order = @out_of_order + 1; >>> } >>> - @last_rw[$dev, str($2)] = (args.sector + args.nr_sector); >>> + delete(@start_order[args.sector]); >>> + @complete_seq = @complete_seq + 1; >>> } >>> } >>> END { >>> - clear(@last_rw); >>> + printf("total_start: %llu total_complete: %llu out_of_order: %llu\n", >>> + @start_seq, @complete_seq, @out_of_order); >>> + clear(@start_order); >>> + clear(@start_seq); >>> + clear(@complete_seq); >>> + clear(@out_of_order); >>> } >> >> >> First run after reboot always fails: >> >> ./test_generic_01.sh >> ublk selftest: generic_01 starting at 2026-01-25T19:08:47+02:00 >> I/O reordering detected: >> out_of_order: sector 112040 started at seq 88 but completed at seq 0 >> out_of_order: sector 112048 started at seq 89 but completed at seq 1 >> out_of_order: sector 112056 started at seq 90 but completed at seq 2 >> out_of_order: sector 112064 started at seq 91 but completed at seq 3 >> out_of_order: sector 112072 started at seq 92 but completed at seq 4 >> out_of_order: sector 112080 started at seq 93 but completed at seq 5 >> out_of_order: sector 112088 started at seq 94 but completed at seq 6 >> out_of_order: sector 112096 started at seq 95 but completed at seq 7 >> out_of_order: sector 112104 started at seq 96 but completed at seq 8 >> out_of_order: sector 112112 started at seq 97 but completed at seq 9 >> out_of_order: sector 112120 started at seq 98 but completed at seq 10 >> out_of_order: sector 112128 started at seq 99 but completed at seq 11 > > In the 1st bpf attachment, fio may be started between attaching > block_start_req and block_complete_req. Maybe the following patch can > solve it: > > > diff --git a/tools/testing/selftests/ublk/test_generic_01.sh > b/tools/testing/selftests/ublk/test_generic_01.sh > index 26cf3c7ceeb5..15cea9e55e1c 100755 > --- a/tools/testing/selftests/ublk/test_generic_01.sh > +++ b/tools/testing/selftests/ublk/test_generic_01.sh > @@ -13,7 +13,7 @@ if ! _have_program fio; then > exit "$UBLK_SKIP_CODE" > fi > > -_prep_test "null" "sequential io order" > +_prep_test "null" "ublk dispatch won't reorder IO" > > dev_id=$(_add_ublk_dev -t null) > _check_add_dev $TID $? > @@ -21,15 +21,20 @@ _check_add_dev $TID $? > dev_t=$(_get_disk_dev_t "$dev_id") > bpftrace trace/seq_io.bt "$dev_t" "W" 1 > "$UBLK_TMP" 2>&1 & > btrace_pid=$! > -sleep 2 > > -if ! kill -0 "$btrace_pid" > /dev/null 2>&1; then > +# Wait for bpftrace probes to be attached (BEGIN block prints BPFTRACE_READY) > +for _ in $(seq 100); do > + grep -q "BPFTRACE_READY" "$UBLK_TMP" 2>/dev/null && break > + sleep 0.1 > +done > + > +if ! kill -0 "$btrace_pid" 2>/dev/null; then > _cleanup_test "null" > exit "$UBLK_SKIP_CODE" > fi > > -# run fio over this ublk disk > -fio --name=write_seq \ > +# run fio over this ublk disk (pinned to CPU 0) > +taskset -c 0 fio --name=write_seq \ > --filename=/dev/ublkb"${dev_id}" \ > --ioengine=libaio --iodepth=16 \ > --rw=write \ > @@ -39,9 +44,13 @@ fio --name=write_seq \ > ERR_CODE=$? > kill "$btrace_pid" > wait > -if grep -q "io_out_of_order" "$UBLK_TMP"; then > - cat "$UBLK_TMP" > + > +# Check for out-of-order completions detected by bpftrace > +if grep -q "^out_of_order:" "$UBLK_TMP"; then > + echo "I/O reordering detected:" > + grep "^out_of_order:" "$UBLK_TMP" > ERR_CODE=255 > fi > + > _cleanup_test "null" > _show_result $TID $ERR_CODE > diff --git a/tools/testing/selftests/ublk/trace/seq_io.bt > b/tools/testing/selftests/ublk/trace/seq_io.bt > index b2f60a92b118..8ebe13d24c67 100644 > --- a/tools/testing/selftests/ublk/trace/seq_io.bt > +++ b/tools/testing/selftests/ublk/trace/seq_io.bt > @@ -2,23 +2,64 @@ > $1: dev_t > $2: RWBS > $3: strlen($2) > + > + Track request order between block_io_start and block_rq_complete. > + For each request, record its start sequence number and verify > + completions happen in the same order. > + > + Sequence starts at 1 so that 0 means "never seen" (bpftrace maps > + return 0 for missing keys). On first valid completion, sync > + complete_seq to handle probe attachment races. > + > + block_rq_complete is listed first to ensure it's attached before > + block_io_start, reducing the window for missed completions. > */ > + > BEGIN { > - @last_rw[$1, str($2)] = (uint64)0; > + @start_seq = (uint64)1; > + @complete_seq = (uint64)0; > + @out_of_order = (uint64)0; > + /* Initialize map type with dummy entry */ > + @start_order[0] = (uint64)0; > + delete(@start_order[0]); > + printf("BPFTRACE_READY\n"); > } > + > tracepoint:block:block_rq_complete > { > - $dev = $1; > if ((int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)) { > - $last = @last_rw[$dev, str($2)]; > - if ((uint64)args.sector != $last) { > - printf("io_out_of_order: exp %llu actual %llu\n", > - args.sector, $last); > + $expected = @start_order[args.sector]; > + if ($expected > 0) { > + /* Sync complete_seq on first valid completion */ > + if (@complete_seq == 0) { > + @complete_seq = $expected; > + } > + > + if ($expected != @complete_seq) { > + printf("out_of_order: sector %llu started at seq %llu but completed at seq > %llu\n", > + args.sector, $expected, @complete_seq); > + @out_of_order = @out_of_order + 1; > + } > + delete(@start_order[args.sector]); > + @complete_seq = @complete_seq + 1; > } > - @last_rw[$dev, str($2)] = (args.sector + args.nr_sector); > + /* $expected == 0 means we never saw this sector start, skip it */ > + } > +} > + > +tracepoint:block:block_io_start > +{ > + if ((int64)args.dev == $1 && !strncmp(args.rwbs, str($2), $3)) { > + @start_order[args.sector] = @start_seq; > + @start_seq = @start_seq + 1; > } > } > > END { > - clear(@last_rw); > + printf("total_start: %llu total_complete: %llu out_of_order: %llu\n", > + @start_seq - 1, @complete_seq, @out_of_order); > + clear(@start_order); > + clear(@start_seq); > + clear(@complete_seq); > + clear(@out_of_order); > } >
This solves it - no more out of order detected. Looks like this auto mount explains why it misses a few seqs on the first run. [ 99.548551] ublk selftest: generic_01 starting at 2026-01-26T10:24:07+02:00 [ 104.173575] NOTICE: Automounting of tracing to debugfs is deprecated and will be removed in 2030 [ 112.680519] ublk selftest: generic_01 done at 2026-01-26T10:24:20+02:00 [ 116.564281] ublk selftest: generic_01 starting at 2026-01-26T10:24:24+02:00 [ 130.658273] ublk selftest: generic_01 done at 2026-01-26T10:24:38+02:00 have fun, alex
