csanchezdll opened a new pull request, #8060: URL: https://github.com/apache/nuttx/pull/8060
## Summary Related mailing list thread: https://www.mail-archive.com/[email protected]/msg08872.html The CAN synthetic error frame generation code uses d_len/d_buf and calls can_input to inject the generated frame. This is (correctly) done from a worker thread, but it is currently not protected by net_lock/net_unlock. OTOH, fdcan_txpoll, which also calls uses d_len/d_buf and calls can_input, can be called from the application thread. Currently, is it possible for error frame generation code to "steal" the buffer, causing the frame coming from last write() to be silently dropped. ## Impact This can cause TX frame loss. ## Testing My board uses stm32h7. stm32h7 SocketCAN driver does not have error frame generation, but I have ported it from main stm32 arch directory (the driver is essentially identical). The following simple program shows this (CAN interface name might need to change in other boards): ``` #include <net/if.h> #include <string.h> #include <sys/ioctl.h> #include <sys/socket.h> #include <unistd.h> #include <nuttx/can.h> #include <netpacket/can.h> int main (int argc, char *argv[]) { int s; struct can_frame f; struct ifreq i; struct sockaddr_can a; s = socket(PF_CAN, SOCK_RAW | SOCK_NONBLOCK, CAN_RAW); memset(&a, 0, sizeof(a)); a.can_family = AF_CAN; strncpy(i.ifr_name, "can3", IFNAMSIZ); ioctl(s, SIOCGIFINDEX, &i); memset(&a, 0, sizeof(a)); a.can_family = AF_CAN; a.can_ifindex = i.ifr_ifindex; bind(s, (struct sockaddr *) &a, sizeof(a)); i.ifr_flags |= IFF_UP; ioctl(s, SIOCSIFFLAGS, &i); memset(&f, 0, sizeof(f)); f.can_id = 0x555; f.can_dlc = 8; write(s, &f, sizeof(f)); write(s, &f, sizeof(f)); return 0; } ``` Running without this patch: ``` Breakpoint 1, fdcan_txpoll (dev=0x24001cb8 <g_fdcan1+228>) at /home/carlossanchez/prj/go-posix/patched/nuttx/arch/arm/src/chip/stm32_fdcan_sock.c:957 957 { (gdb) c Continuing. Breakpoint 1, fdcan_txpoll (dev=0x24001cb8 <g_fdcan1+228>) at /home/carlossanchez/prj/go-posix/patched/nuttx/arch/arm/src/chip/stm32_fdcan_sock.c:957 957 { (gdb) p g_fdcan1->dev.d_len $1 = 16 (gdb) next 958 struct fdcan_driver_s *priv = (gdb) 965 if (priv->dev.d_len > 0) (gdb) p g_fdcan1->dev.d_len $2 = 0 ``` It can be seen `priv->dev.d_len` was overwritten, before `fdcan_txpoll` can actually begin processing it so the TX frame is lost. However, `write` return successfully (because the code is not supposed to fail at this point): ``` (gdb) up #1 0x08029d12 in devif_poll_can_connections (dev=0x24001cb8 <g_fdcan1+228>, callback=0x8013431 <fdcan_txpoll>) at devif/devif_poll.c:261 261 bstop = callback(dev); (gdb) #2 0x08029d94 in devif_poll (dev=0x24001cb8 <g_fdcan1+228>, callback=0x8013431 <fdcan_txpoll>) at devif/devif_poll.c:675 675 bstop = devif_poll_can_connections(dev, callback); (gdb) #3 0x0801342a in fdcan_txavail_work (arg=0x24001bd4 <g_fdcan1>) at /home/carlossanchez/prj/go-posix/patched/nuttx/arch/arm/src/chip/stm32_fdcan_sock.c:1904 1904 devif_poll(&priv->dev, fdcan_txpoll); (gdb) #4 fdcan_txavail (dev=<optimized out>) at /home/carlossanchez/prj/go-posix/patched/nuttx/arch/arm/src/chip/stm32_fdcan_sock.c:1944 1944 fdcan_txavail_work(priv); (gdb) #5 0x0802a8ea in netdev_txnotify_dev (dev=0x24001cb8 <g_fdcan1+228>) at netdev/netdev_txnotify.c:130 130 dev->d_txavail(dev); (gdb) #6 0x08039abc in can_sendmsg (psock=0x30000f00, msg=0x3000a1f4, flags=0) at can/can_sendmsg.c:259 259 netdev_txnotify_dev(dev); (gdb) #7 0x08038b46 in psock_sendmsg (psock=0x30000f00, msg=0x3000a1f4, flags=0) at socket/sendmsg.c:93 93 return psock->s_sockif->si_sendmsg(psock, msg, flags); (gdb) #8 0x08038838 in psock_send (psock=0x30000f00, buf=0x3000a2d0, len=16, flags=0) at socket/send.c:90 90 return psock_sendmsg(psock, &msg, flags); (gdb) #9 0x0802d39c in sock_file_write (filep=0x300062b0, buffer=0x3000a2d0 "\274k\001\b", buflen=16) at socket/socket.c:129 129 return psock_send(filep->f_priv, buffer, buflen, 0); (gdb) #10 0x0802cd98 in file_write (filep=0x300062b0, buf=0x3000a2d0, nbytes=16) at vfs/fs_write.c:89 89 return inode->u.i_ops->write(filep, buf, nbytes); (gdb) #11 0x0802cdda in nx_write (fd=3, buf=0x3000a2d0, nbytes=16) at vfs/fs_write.c:138 138 ret = file_write(filep, buf, nbytes); (gdb) #12 0x0802cdfc in write (fd=3, buf=0x3000a2d0, nbytes=16) at vfs/fs_write.c:202 202 ret = nx_write(fd, buf, nbytes); (gdb) finish Run till exit from #12 0x0802cdfc in write (fd=3, buf=0x3000a2d0, nbytes=16) at vfs/fs_write.c:202 main (argc=<optimized out>, argv=<optimized out>) at ../../../application/test/socket_can/socket_can.c:37 37 return 0; Value returned is $3 = 16 ``` With my patch: ``` Breakpoint 1, fdcan_txpoll (dev=0x24001cb8 <g_fdcan1+228>) at /home/carlossanchez/prj/go-posix/patched/nuttx/arch/arm/src/chip/stm32_fdcan_sock.c:957 957 { (gdb) p g_fdcan1->dev.d_len $1 = 16 (gdb) next 958 struct fdcan_driver_s *priv = (gdb) 965 if (priv->dev.d_len > 0) (gdb) p g_fdcan1->dev.d_len $2 = 16 ``` So the buffer is not overwritten now, and we can see the worker thread is stopped at `net_lock`: ``` (gdb) info_nxthreads target examined _target_arch.name=armv7e-m $_target_has_fpu : 1 $_target_has_smp : 0 saved current_tcb (pid=3) 0 Thread 0x240021bc (Name: Idle Task, State: Ready, Priority: 0, Stack: 768/1000) PC: 0x8027c70 in up_idle() 1 Thread 0x30000320 (Name: hpwork, State: Waiting,Semaphore, Priority: 224, Stack: 812/1992) PC: 0x8027932 in sys_call2() * 3 Thread 0x30005f80 (Name: main, State: Running, Priority: 100, Stack: 1216/16344) PC: 0x130 in No() (gdb) nxthread_all_bt 0 Thread 0x240021bc (Name: Idle Task, State: Ready, Priority: 0, Stack: 768/1000) PC: 0x8027c70 in up_idle() #0 up_idle () at common/arm_idle.c:48 #1 0x08015e70 in nx_start () at init/nx_start.c:742 #2 0x08010366 in __start () at chip/stm32_start.c:267 1 Thread 0x30000320 (Name: hpwork, State: Waiting,Semaphore, Priority: 224, Stack: 812/1992) PC: 0x8027932 in sys_call2() #0 sys_call2 (nbr=2, parm1=805307284, parm2=805347420) at /home/carlossanchez/prj/go-posix/patched/nuttx/include/arch/syscall.h:194 #1 0x08027952 in arm_switchcontext (saveregs=0x30000394, restoreregs=0x3000a05c) at common/arm_switchcontext.c:46 #2 0x08027690 in up_block_task (tcb=0x30000320, task_state=TSTATE_WAIT_SEM) at common/arm_blocktask.c:142 #3 0x080162fc in nxsem_wait (sem=0x240006e8 <g_netlock>) at semaphore/sem_wait.c:155 #4 0x08016342 in nxsem_wait_uninterruptible (sem=0x240006e8 <g_netlock>) at semaphore/sem_wait.c:224 #5 0x0802a50e in _net_takesem () at utils/net_lock.c:72 #6 0x0802a61a in net_lock () at utils/net_lock.c:173 #7 0x080135da in fdcan_error_work (arg=0x24001bd4 <g_fdcan1>) at /home/carlossanchez/prj/go-posix/patched/nuttx/arch/arm/src/chip/stm32_fdcan_sock.c:2736 #8 0x08016eee in work_thread (argc=2, argv=0x300006f8) at wqueue/kwork_thread.c:178 #9 0x08016c20 in nxtask_start () at task/task_start.c:129 #10 0x00000000 in ?? () 3 Thread 0x30005f80 (Name: main, State: Running, Priority: 100, Stack: 1216/16344) PC: 0xc0 in No() #0 0x000000c0 in ?? () #1 0x24005530 in idle_stack () Backtrace stopped: previous frame identical to this frame (corrupt stack?) ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
