= hirsute verification =
ubuntu@blanka:~/nvidia-dgx-2/tests$ cat /proc/version
Linux version 5.11.0-42-generic (buildd@lgw01-amd64-041) (gcc (Ubuntu 10.3.0-1ubuntu1) 10.3.0, GNU ld (GNU Binutils for Ubuntu) 2.36.1) #46-Ubuntu SMP Fri Nov 26 12:04:17 UTC 2021
ubuntu@blanka:~/nvidia-dgx-2/tests$ ./nvidia-peermem-test.sh
+ export DEBCONF_FRONTEND=noninteractive
+ DEBCONF_FRONTEND=noninteractive
+ export DEBIAN_PRIORITY=critical
+ DEBIAN_PRIORITY=critical
+ SERVER_IFACE=enp148s0
+ SERVER_IP=192.168.5.1/24
+ SERVER_IB_BDF=0000:4b:00.0
+ CLIENT_IFACE=enp18s0
+ CLIENT_IP=192.168.5.2/24
+ CLIENT_IB_BDF=0000:ba:00.0
+ trap cleanup EXIT
+ sudo service unattended-upgrades stop
+ install_cuda_perftest
+ local release
+ local components
+ dpkg-query -W -f '${Version}' perftest
+ grep -q '+cuda.1$'
+ return
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_0
++++ dirname
../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:04.0/0000:0a:00.0/0000:0b:00.0/0000:0c:00.0/infiniband/mlx5_0
+++ dirname
../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:04.0/0000:0a:00.0/0000:0b:00.0/0000:0c:00.0/infiniband
++ basename
../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:04.0/0000:0a:00.0/0000:0b:00.0/0000:0c:00.0
+ bdf=0000:0c:00.0
+ case "$bdf" in
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_1
++++ dirname
../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:08.0/0000:10:00.0/0000:11:10.0/0000:12:00.0/infiniband/mlx5_1
+++ dirname
../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:08.0/0000:10:00.0/0000:11:10.0/0000:12:00.0/infiniband
++ basename
../../devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:08.0/0000:10:00.0/0000:11:10.0/0000:12:00.0
+ bdf=0000:12:00.0
+ case "$bdf" in
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_2
++++ dirname
../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:04.0/0000:49:00.0/0000:4a:00.0/0000:4b:00.0/infiniband/mlx5_2
+++ dirname
../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:04.0/0000:49:00.0/0000:4a:00.0/0000:4b:00.0/infiniband
++ basename
../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:04.0/0000:49:00.0/0000:4a:00.0/0000:4b:00.0
+ bdf=0000:4b:00.0
+ case "$bdf" in
++ basename /sys/class/infiniband/mlx5_2
+ server_ib_dev=mlx5_2
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_3
++++ dirname
../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:08.0/0000:50:00.0/0000:51:10.0/0000:54:00.0/infiniband/mlx5_3
+++ dirname
../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:08.0/0000:50:00.0/0000:51:10.0/0000:54:00.0/infiniband
++ basename
../../devices/pci0000:40/0000:40:01.1/0000:41:00.0/0000:42:08.0/0000:50:00.0/0000:51:10.0/0000:54:00.0
+ bdf=0000:54:00.0
+ case "$bdf" in
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_4
++++ dirname
../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:04.0/0000:8b:00.0/0000:8c:00.0/0000:8d:00.0/infiniband/mlx5_4
+++ dirname
../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:04.0/0000:8b:00.0/0000:8c:00.0/0000:8d:00.0/infiniband
++ basename
../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:04.0/0000:8b:00.0/0000:8c:00.0/0000:8d:00.0
+ bdf=0000:8d:00.0
+ case "$bdf" in
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_5
++++ dirname
../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:08.0/0000:92:00.0/0000:93:10.0/0000:94:00.0/infiniband/mlx5_5
+++ dirname
../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:08.0/0000:92:00.0/0000:93:10.0/0000:94:00.0/infiniband
++ basename
../../devices/pci0000:80/0000:80:01.1/0000:81:00.0/0000:82:08.0/0000:92:00.0/0000:93:10.0/0000:94:00.0
+ bdf=0000:94:00.0
+ case "$bdf" in
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_6
++++ dirname
../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:04.0/0000:b8:00.0/0000:b9:00.0/0000:ba:00.0/infiniband/mlx5_6
+++ dirname
../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:04.0/0000:b8:00.0/0000:b9:00.0/0000:ba:00.0/infiniband
++ basename
../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:04.0/0000:b8:00.0/0000:b9:00.0/0000:ba:00.0
+ bdf=0000:ba:00.0
+ case "$bdf" in
++ basename /sys/class/infiniband/mlx5_6
+ client_ib_dev=mlx5_6
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_7
++++ dirname
../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:08.0/0000:be:00.0/0000:bf:10.0/0000:cc:00.0/infiniband/mlx5_7
+++ dirname
../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:08.0/0000:be:00.0/0000:bf:10.0/0000:cc:00.0/infiniband
++ basename
../../devices/pci0000:b0/0000:b0:01.1/0000:b1:00.0/0000:b2:08.0/0000:be:00.0/0000:bf:10.0/0000:cc:00.0
+ bdf=0000:cc:00.0
+ case "$bdf" in
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_8
++++ dirname
../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.0/infiniband/mlx5_8
+++ dirname ../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.0/infiniband
++ basename ../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.0
+ bdf=0000:e1:00.0
+ case "$bdf" in
+ for ibdev in /sys/class/infiniband/*
+++++ readlink /sys/class/infiniband/mlx5_9
++++ dirname
../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.1/infiniband/mlx5_9
+++ dirname ../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.1/infiniband
++ basename ../../devices/pci0000:e0/0000:e0:03.1/0000:e1:00.1
+ bdf=0000:e1:00.1
+ case "$bdf" in
+ '[' -z mlx5_6 ']'
+ '[' -z mlx5_2 ']'
+ sudo rdma system set netns exclusive
+ sudo ip netns add peermemclient
+ sudo rdma dev set mlx5_6 netns peermemclient
+ sudo ip netns exec peermemclient ip link set dev lo up
+ sudo ip link set netns peermemclient enp18s0
+ sudo ip netns exec peermemclient ip addr add dev enp18s0 192.168.5.2/24
+ sudo ip netns exec peermemclient ip link set dev enp18s0 up
+ sudo ip addr add dev enp148s0 192.168.5.1/24
+ sudo ip link set dev enp148s0 up
+ sudo modprobe ib_umad
+ sudo modprobe nvidia-peermem
+ sudo_apt install -y opensm
+ sudo --preserve-env=DEBCONF_FRONTEND,DEBIAN_PRIORITY apt install -y opensm
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
opensm is already the newest version (3.3.23-2).
0 upgraded, 0 newly installed, 0 to remove and 0 not upgraded.
+ sudo service opensm start
+ use_cuda_needs_devid
+ ib_write_bw --help
+ grep use_cuda=
--use_cuda=<cuda device id> Use CUDA specific device for GPUDirect RDMA testing
+ return 0
+ server_use_cuda_arg=--use_cuda=0
+ client_use_cuda_arg=--use_cuda=1
+ srvpid=7209
+ sleep 5
+ sudo ib_write_bw -a -d mlx5_2 --use_cuda=0
************************************
* Waiting for client to connect... *
************************************
+ sudo ip netns exec peermemclient ib_write_bw -a -d mlx5_6 192.168.5.1 --use_cuda=1
initializing CUDA
initializing CUDA
Listing all CUDA devices in system:
CUDA device 0: PCIe address is 07:00
CUDA device 1: PCIe address is 0F:00
CUDA device 2: PCIe address is 47:00
CUDA device 3: PCIe address is 4E:00
CUDA device 4: PCIe address is 87:00
CUDA device 5: PCIe address is 90:00
CUDA device 6: PCIe address is B7:00
CUDA device 7: PCIe address is BD:00
Picking device No. 1
[pid = 7216, dev = 1] device name = [NVIDIA A100-SXM4-40GB]
creating CUDA Ctx
Listing all CUDA devices in system:
CUDA device 0: PCIe address is 07:00
CUDA device 1: PCIe address is 0F:00
CUDA device 2: PCIe address is 47:00
CUDA device 3: PCIe address is 4E:00
CUDA device 4: PCIe address is 87:00
CUDA device 5: PCIe address is 90:00
CUDA device 6: PCIe address is B7:00
CUDA device 7: PCIe address is BD:00
Picking device No. 0
[pid = 7211, dev = 0] device name = [NVIDIA A100-SXM4-40GB]
creating CUDA Ctx
making it the current CUDA Ctx
cuMemAlloc() of a 16777216 bytes GPU buffer
allocated GPU buffer address at 00007f0eba000000 pointer=0x7f0eba000000
---------------------------------------------------------------------------------------
RDMA_Write BW Test
Dual-port : OFF Device : mlx5_6
Number of qps : 1 Transport type : IB
Connection type : RC Using SRQ : OFF
PCIe relax order: ON
ibv_wr* API : ON
TX depth : 128
CQ Moderation : 100
Mtu : 4096[B]
Link type : IB
Max inline data : 0[B]
rdma_cm QPs : OFF
Data ex. method : Ethernet
---------------------------------------------------------------------------------------
making it the current CUDA Ctx
cuMemAlloc() of a 16777216 bytes GPU buffer
allocated GPU buffer address at 00007f682e000000 pointer=0x7f682e000000
---------------------------------------------------------------------------------------
RDMA_Write BW Test
Dual-port : OFF Device : mlx5_2
Number of qps : 1 Transport type : IB
Connection type : RC Using SRQ : OFF
PCIe relax order: ON
ibv_wr* API : ON
CQ Moderation : 100
Mtu : 4096[B]
Link type : IB
Max inline data : 0[B]
rdma_cm QPs : OFF
Data ex. method : Ethernet
---------------------------------------------------------------------------------------
local address: LID 0x01 QPN 0x0107 PSN 0x90c1f2 RKey 0x17ecdc VAddr
0x007f682e800000
local address: LID 0x02 QPN 0x1883 PSN 0xa82bae RKey 0x17ece2 VAddr
0x007f0eba800000
remote address: LID 0x02 QPN 0x1883 PSN 0xa82bae RKey 0x17ece2 VAddr
0x007f0eba800000
remote address: LID 0x01 QPN 0x0107 PSN 0x90c1f2 RKey 0x17ecdc VAddr
0x007f682e800000
---------------------------------------------------------------------------------------
#bytes #iterations BW peak[MB/sec] BW average[MB/sec] MsgRate[Mpps]
---------------------------------------------------------------------------------------
#bytes #iterations BW peak[MB/sec] BW average[MB/sec] MsgRate[Mpps]
Conflicting CPU frequency values detected: 1500.000000 != 3391.375000. CPU
Frequency is not max.
2 5000 4.11 4.10 2.151153
Conflicting CPU frequency values detected: 1500.000000 != 3345.763000. CPU
Frequency is not max.
4 5000 8.07 8.04 2.108648
Conflicting CPU frequency values detected: 1500.000000 != 3362.509000. CPU
Frequency is not max.
8 5000 16.13 16.13 2.113996
Conflicting CPU frequency values detected: 1500.000000 != 3335.048000. CPU
Frequency is not max.
16 5000 32.30 32.19 2.109436
Conflicting CPU frequency values detected: 1500.000000 != 3339.906000. CPU
Frequency is not max.
32 5000 64.41 64.38 2.109663
Conflicting CPU frequency values detected: 1500.000000 != 3333.100000. CPU
Frequency is not max.
64 5000 129.43 129.12 2.115557
Conflicting CPU frequency values detected: 1500.000000 != 3349.864000. CPU
Frequency is not max.
128 5000 257.89 257.16 2.106668
Conflicting CPU frequency values detected: 1500.000000 != 3350.294000. CPU
Frequency is not max.
256 5000 516.27 515.84 2.112864
Conflicting CPU frequency values detected: 1500.000000 != 3340.996000. CPU
Frequency is not max.
512 5000 1024.81 1024.72 2.098633
Conflicting CPU frequency values detected: 1500.000000 != 3356.251000. CPU
Frequency is not max.
1024 5000 2053.47 2053.08 2.102352
Conflicting CPU frequency values detected: 1500.000000 != 3339.107000. CPU
Frequency is not max.
2048 5000 3864.52 3720.22 1.904755
Conflicting CPU frequency values detected: 1500.000000 != 3355.693000. CPU
Frequency is not max.
4096 5000 4494.10 4083.37 1.045344
Conflicting CPU frequency values detected: 1500.000000 != 3342.793000. CPU
Frequency is not max.
8192 5000 4590.54 4425.60 0.566476
Conflicting CPU frequency values detected: 1500.000000 != 3351.159000. CPU
Frequency is not max.
16384 5000 4517.28 4279.27 0.273873
Conflicting CPU frequency values detected: 1500.000000 != 3314.743000. CPU
Frequency is not max.
32768 5000 4460.95 4387.03 0.140385
Conflicting CPU frequency values detected: 1500.000000 != 3305.732000. CPU
Frequency is not max.
65536 5000 4465.92 4408.98 0.070544
Conflicting CPU frequency values detected: 1500.000000 != 3310.266000. CPU
Frequency is not max.
131072 5000 4449.90 4422.93 0.035383
Conflicting CPU frequency values detected: 1500.000000 != 3364.586000. CPU
Frequency is not max.
262144 5000 4443.64 4439.50 0.017758
Conflicting CPU frequency values detected: 1500.000000 != 3325.738000. CPU
Frequency is not max.
524288 5000 4444.42 4441.08 0.008882
Conflicting CPU frequency values detected: 1500.000000 != 3391.764000. CPU
Frequency is not max.
1048576 5000 4453.77 4452.52 0.004453
Conflicting CPU frequency values detected: 1500.000000 != 3391.441000. CPU
Frequency is not max.
2097152 5000 4450.29 4449.44 0.002225
Conflicting CPU frequency values detected: 1500.000000 != 1958.593000. CPU
Frequency is not max.
4194304 5000 4452.98 4451.38 0.001113
Conflicting CPU frequency values detected: 1500.000000 != 2246.050000. CPU
Frequency is not max.
8388608 5000 4453.11 4452.79 0.000557
---------------------------------------------------------------------------------------
8388608 5000 4453.11 4452.79 0.000557
---------------------------------------------------------------------------------------
deallocating RX GPU buffer 00007f682e000000
deallocating RX GPU buffer 00007f0eba000000
destroying current CUDA Ctx
destroying current CUDA Ctx
+ cleanup
+ '[' -n 7209 ']'
+ test -d /proc/7209
+ sudo kill 7209
kill: (7209): No such process
+ /bin/true
+ '[' -z '' ']'
+ sudo ip addr del dev enp148s0 192.168.5.1/24
+ sudo ip netns exec peermemclient ip addr del dev enp18s0 192.168.5.2/24
+ sudo ip netns delete peermemclient
ubuntu@blanka:~/nvidia-dgx-2/tests$ echo $?
0
** Tags removed: verification-needed-hirsute
** Tags added: verification-done-hirsute
--
You received this bug notification because you are a member of Ubuntu
Bugs, which is subscribed to Ubuntu.
https://bugs.launchpad.net/bugs/1947206
Title:
Updates to ib_peer_memory requested by Nvidia
To manage notifications about this bug go to:
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1947206/+subscriptions
--
ubuntu-bugs mailing list
[email protected]
https://lists.ubuntu.com/mailman/listinfo/ubuntu-bugs