crash in nfs_timer

Alexander Bluhm Thu, 03 May 2018 11:57:23 -0700

Hi,

I have seen a crash in the regress/sys/nfs test.  It was during
run-regress-cleanup, the last command I see is:


umount -f -t nfs -h 127.0.0.1 -a || true

uvm_fault(0xd0d2f2a8, 0xefff0000, 0, 1) -> e
kernel: page fault trap, code=0
Stopped at      nfs_timer+0x30: cmpl    $0,0xc(%ebx)

ddb{0}> trace
nfs_timer(d631c300) at nfs_timer+0x30
softclock_thread(d570a444) at softclock_thread+0xc7

ddb{0}> ps
   PID     TID   PPID    UID  S       FLAGS  WAIT          COMMAND
*89864  371985      0      0  7  0x40014200                softclock

ddb{0}> show register
ds                  0x10
es                  0x10
fs                  0x20
gs                     0
edi           0xd631c300        end+0x5521300
esi           0xd0601600        nfs_timer
ebp           0xf52d7e7c
ebx           0xefff0017
edx                  0x1
ecx           0xd570a444        end+0x490f444
eax                    0
eip           0xd0601630        nfs_timer+0x30
cs                   0x8
eflags           0x10286        __ALIGN_SIZE+0xf286
esp           0xf52d7e6c
ss                  0x10
nfs_timer+0x30: cmpl    $0,0xc(%ebx)

The struct nfsmount *nmp passed as argument to nfs_timer() has been
freed.
ddb{0}> x/x 0xd631c300,8 
0xd631c300:     efffeecc
0xd631c304:     efff0017
0xd631c308:     ba65fc7c
0xd631c30c:     efffeecc
0xd631c310:     efffeecc
0xd631c314:     efffeecc
0xd631c318:     efffeecc
0xd631c31c:     efffeecc

%ebx contains the bad value from nmp->nm_reqsq.
00001af0 <nfs_timer>:
/usr/src/sys/nfs/nfs_socket.c:1138
*   1b20:       83 7b 0c 00             cmpl   $0x0,0xc(%ebx)
    1b24:       0f 85 f6 02 00 00       jne    1e20 <nfs_timer+0x330>
    1b2a:       8b 4b 24                mov    0x24(%ebx),%ecx
    1b2d:       f6 c1 04                test   $0x4,%cl
    1b30:       0f 85 ea 02 00 00       jne    1e20 <nfs_timer+0x330>
/usr/src/sys/nfs/nfs_socket.c:1140

When accessing nmp->nm_reqsq->r_mrep it crashes.
nfs/nfs_socket.c
  1136          NET_LOCK();
  1137          TAILQ_FOREACH(rep, &nmp->nm_reqsq, r_chain) {
* 1138                  if (rep->r_mrep || (rep->r_flags & R_SOFTTERM))
  1139                          continue;

I guess that the timeout is run after the nfs unmount.  Maybe it
was sleeping in NET_LOCK().  We should delay the free(9) of struct
nfsmount after all timeouts are run.  That is how we solve the issue
elsewhere.

ok?

bluhm

Index: nfs/nfs_vfsops.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/nfs/nfs_vfsops.c,v
retrieving revision 1.119
diff -u -p -r1.119 nfs_vfsops.c
--- nfs/nfs_vfsops.c    2 May 2018 02:24:56 -0000       1.119
+++ nfs/nfs_vfsops.c    3 May 2018 18:46:33 -0000
@@ -84,6 +84,7 @@ int   nfs_start(struct mount *, int, struc
 int    nfs_statfs(struct mount *, struct statfs *, struct proc *);
 int    nfs_sync(struct mount *, int, int, struct ucred *, struct proc *);
 int    nfs_unmount(struct mount *, int, struct proc *);
+void   nfs_reaper(void *);
 int    nfs_vget(struct mount *, ino_t, struct vnode **);
 int    nfs_vptofh(struct vnode *, struct fid *);
 int    nfs_mountroot(void);
@@ -746,9 +747,21 @@ nfs_unmount(struct mount *mp, int mntfla
        nfs_disconnect(nmp);
        m_freem(nmp->nm_nam);
        timeout_del(&nmp->nm_rtimeout);
-       free(nmp, M_NFSMNT, sizeof(*nmp));
+       timeout_set_proc(&nmp->nm_rtimeout, nfs_reaper, nmp);
+       timeout_add(&nmp->nm_rtimeout, 0);
        mp->mnt_data = NULL;
        return (0);
+}
+
+/*
+ * Delay nfs mount point free until pending or sleeping timeouts have finished.
+ */
+void
+nfs_reaper(void *arg)
+{
+       struct nfsmount *nmp = arg;
+
+       free(nmp, M_NFSMNT, sizeof(*nmp));
 }
 
 /*

crash in nfs_timer

Reply via email to