Hello,

Hrvoje gave a try to experimental diff, which trades rw-locks in pf(4)
for mutexes [1]. Hrvoje soon discovered machine panics, when doing 'pfctl -ss'
The callstack looks as follows:

panic: acquiring blockable sleep lock with spinlock or critical section
held (rwlock) vmmaplk
Stopped at      db_enter+0x10:  popq    %rbp
    TID    PID    UID     PRFLAGS     PFLAGS  CPU  COMMAND
*512895  28841      0         0x3          0    3K pfctl
db_enter() at db_enter+0x10
panic(ffffffff81e19411) at panic+0x12a
witness_checkorder(fffffd83b09b4d18,1,0) at witness_checkorder+0xbce
rw_enter_read(fffffd83b09b4d08) at rw_enter_read+0x38
uvmfault_lookup(ffff8000238e3418,0) at uvmfault_lookup+0x8a
uvm_fault_check(ffff8000238e3418,ffff8000238e3450,ffff8000238e3478) at
uvm_fault_check+0x32
uvm_fault(fffffd83b09b4d00,e36553c000,0,2) at uvm_fault+0xfc
kpageflttrap(ffff8000238e3590,e36553c000) at kpageflttrap+0x131
kerntrap(ffff8000238e3590) at kerntrap+0x91
alltraps_kern_meltdown() at alltraps_kern_meltdown+0x7b
copyout() at copyout+0x53
VOP_IOCTL(fffffd83b24340e0,c0104419,ffff8000238e3930,1,fffffd842f7d7120,ffff800
0239597a8) at VOP_IOCTL+0x68
vn_ioctl(fffffd83b294edc0,c0104419,ffff8000238e3930,ffff8000239597a8) at
vn_ioctl+0x75
sys_ioctl(ffff8000239597a8,ffff8000238e3a40,ffff8000238e3aa0) at
sys_ioctl+0x2c4
end trace frame: 0xffff8000238e3b00, count: 0
https://www.openbsd.org/ddb.html
reports.  Insufficient info makes it difficult to find and fix bugs.
ddb{3}>

The problem comes from DIOCGETSTATES in pfioctl() here:
1775 
1776                 NET_LOCK();
1777                 PF_STATE_ENTER_READ();
1778                 state = TAILQ_FIRST(&state_list);
1779                 while (state) {
1780                         if (state->timeout != PFTM_UNLINKED) {
1781                                 if ((nr+1) * sizeof(*p) > ps->ps_len)
1782                                         break;
1783                                 pf_state_export(pstore, state);
1784                                 error = copyout(pstore, p, sizeof(*p));
1785                                 if (error) {
1786                                         free(pstore, M_TEMP, 
sizeof(*pstore));
1787                                         PF_STATE_EXIT_READ();
1788                                         NET_UNLOCK();
1789                                         goto fail;
1790                                 }
1791                                 p++;
1792                                 nr++;
1793                         }
1794                         state = TAILQ_NEXT(state, entry_list);
1795                 }
1796                 PF_STATE_EXIT_READ();
1797                 NET_UNLOCK();
1798 

at line 1784 we do copyout() while holding mutex. As I've mentioned glitch is a
specific to experimental diff [1]. However this made me thinking, that
it's not a good idea to do copyout() while holding NET_LOCK() and state_lock.

Diff below moves copyout() at line 1784 outside of protection of both locks.
The approach I took is relatively straightforward:

    let DIOCGETSTATES to allocate hold_states array, which will keep
    references to states.

    grab locks and take references, keep those references in hold
    array.

    drop locks, export states and do copyout, while walking
    array of references.

    drop references, release hold_states array.

does it make sense? If we agree that this approach makes sense
I'll commit this diff and revisit other such places we currently
have in pfioctl().

thanks and
regards
sashan

[1] https://marc.info/?l=openbsd-tech&m=162138181106887&w=2

--------8<---------------8<---------------8<------------------8<--------
diff --git a/sys/net/pf_ioctl.c b/sys/net/pf_ioctl.c
index ae7bb008351..0d4ac97a92c 100644
--- a/sys/net/pf_ioctl.c
+++ b/sys/net/pf_ioctl.c
@@ -1762,43 +1762,58 @@ pfioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, 
struct proc *p)
                struct pf_state         *state;
                struct pfsync_state     *p, *pstore;
                u_int32_t                nr = 0;
+               struct pf_state         **hold_states;
+               u_int32_t                hold_len, i;
 
                if (ps->ps_len == 0) {
                        nr = pf_status.states;
                        ps->ps_len = sizeof(struct pfsync_state) * nr;
                        break;
+               } else {
+                       hold_len = ps->ps_len / sizeof(struct pfsync_state);
+                       hold_len = MIN(hold_len, pf_status.states);
                }
 
                pstore = malloc(sizeof(*pstore), M_TEMP, M_WAITOK);
+               hold_states = mallocarray(hold_len + 1,
+                   sizeof(struct pf_state *), M_TEMP, M_WAITOK | M_ZERO);
 
                p = ps->ps_states;
 
+               i = 0;
                NET_LOCK();
                PF_STATE_ENTER_READ();
-               state = TAILQ_FIRST(&state_list);
-               while (state) {
+               TAILQ_FOREACH(state, &state_list, entry_list) {
+                       hold_states[i++] = pf_state_ref(state);
+                       if (i >= hold_len)
+                               break;
+               }
+               PF_STATE_EXIT_READ();
+               NET_UNLOCK();
+
+               i = 0;
+               while ((state = hold_states[i++]) != NULL) {
                        if (state->timeout != PFTM_UNLINKED) {
-                               if ((nr+1) * sizeof(*p) > ps->ps_len)
-                                       break;
                                pf_state_export(pstore, state);
                                error = copyout(pstore, p, sizeof(*p));
-                               if (error) {
-                                       free(pstore, M_TEMP, sizeof(*pstore));
-                                       PF_STATE_EXIT_READ();
-                                       NET_UNLOCK();
-                                       goto fail;
-                               }
+                               if (error)
+                                       break;
                                p++;
                                nr++;
                        }
-                       state = TAILQ_NEXT(state, entry_list);
+                       pf_state_unref(state);
                }
-               PF_STATE_EXIT_READ();
-               NET_UNLOCK();
 
-               ps->ps_len = sizeof(struct pfsync_state) * nr;
+               if (error) {
+                       pf_state_unref(state);
+                       while ((state = hold_states[i++]) != NULL)
+                               pf_state_unref(state);
+               } else
+                       ps->ps_len = sizeof(struct pfsync_state) * nr;
 
                free(pstore, M_TEMP, sizeof(*pstore));
+               free(hold_states, M_TEMP,
+                   sizeof(struct pf_state *) * (hold_len + 1));
                break;
        }
 

Reply via email to