Hi,

Initially, I just wanted to try the examples in CMSG_DATA(3).
They are a bit "incomplete", so I had to guess a few things.

This is the result of my tests:

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>

#include <err.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define RECVFD_DO_CLOSE         0x1
#define RECVFD_PASS_IOVEC       0x2
#define RECVFD_DEBUG            0x4
#define SENDFD_DEBUG            0x8

int     sendfd(int, u_int, u_int);
int     recvfd(int, u_int, u_int);

/*
 * Usage: send-recv-fd [-bc] [-d debug] [-n iterations] [-t socktype]
 *
 * -b passes a dummy iovec structure to recvmsg (fixes -t stream -n $n)
 * -c makes the child process close the fd received via recvmsg
 * -r verbose recvmsg calls
 * -s verbose sendmsg calls
 * -n specifies how many copies of a fd should be passed from the parent
 *    to the child
 * -t specifies the socket type.  can be stream (default), dgram, or seqtype
 */

int
main(int argc, char **argv)
{
        int              c;
        u_int            flags = 0;
        u_int            count = 5;
        int              socktype = SOCK_STREAM;
        const char      *errstr = NULL;
        int              fds[2];

        while ((c = getopt(argc, argv, "bcn:rst:")) != -1)
                switch (c) {
                case 'b':
                        flags |= RECVFD_PASS_IOVEC;
                        break;
                case 'c':
                        flags |= RECVFD_DO_CLOSE;
                        break;
                case 'n':
                        count = strtonum(optarg, 1, INT_MAX, &errstr);
                        if (errstr)
                                errx(1, "option n: %s is %s", optarg, errstr);
                        break;
                case 'r':
                        flags |= RECVFD_DEBUG;
                        break;
                case 's':
                        flags |= SENDFD_DEBUG;
                        break;
                case 't':
                        if (!strcmp(optarg, "dgram"))
                                socktype = SOCK_DGRAM;
                        else if (!strcmp(optarg, "seqpacket"))
                                socktype = SOCK_SEQPACKET;
                        else if (!strcmp(optarg, "stream"))
                                socktype = SOCK_STREAM;
                        else
                                errx(1, "bad type %s", optarg);
                        break;
                default:
                        errx(1, "unknown option %c", c);
                }

        if (socketpair(AF_LOCAL, socktype, 0, fds) == -1)
                err(1, "socketpair");

        switch (fork()) {
        case -1:
                err(1, "fork");
                /* NOTREACHED */
        case 0:
                sendfd(fds[0], count, flags);
                break;
        default:
                recvfd(fds[1], count, flags);
                wait(NULL);
                break;
        }
        return 0;
}

int
sendfd(int fd, u_int count, u_int flags)
{
        struct msghdr    msg;
        struct cmsghdr  *cmsg;
        union {
                struct cmsghdr  hdr;
                unsigned char   buf[CMSG_SPACE(sizeof(int))];
        } cmsgbuf;
        ssize_t          sent;
        u_int            i = 1;

        while (i <= count) {
                memset(&msg, 0, sizeof(msg));
                msg.msg_control = &cmsgbuf.buf;
                msg.msg_controllen = sizeof(cmsgbuf.buf);

                cmsg = CMSG_FIRSTHDR(&msg);
                cmsg->cmsg_len = CMSG_LEN(sizeof(int));
                cmsg->cmsg_level = SOL_SOCKET;
                cmsg->cmsg_type = SCM_RIGHTS;
                *(int *)CMSG_DATA(cmsg) = 2;

                if (flags & SENDFD_DEBUG)
                        fprintf(stderr, "debug: iteration %d, calling 
sendmsg\n",
                            i);

                sent = sendmsg(fd, &msg, 0);

                if (flags & SENDFD_DEBUG)
                        fprintf(stderr, "debug: iteration %d, sendmsg sent %zd 
bytes\n",
                            i, sent);

                if (sent == -1) {
                        if (errno == ENOBUFS)
                                usleep(20000);
                        else
                                err(1, "sendmsg");
                } else {
                        i++;
                }
        }

        return 0;
}

int
recvfd(int fd, u_int count, u_int flags)
{
        struct msghdr    msg;
        struct cmsghdr  *cmsg;
        union {
                struct cmsghdr  hdr;
                unsigned char   buf[CMSG_SPACE(sizeof(int))];
        } cmsgbuf;
        char            iov_buffer[1024];
        struct iovec    iov;
        char            fdstr[32];
        ssize_t         received;
        int             passed_fd;
        u_int           i = 1;

        while (i <= count) {
                memset(&msg, 0, sizeof(msg));
                msg.msg_control = &cmsgbuf.buf;
                msg.msg_controllen = sizeof(cmsgbuf.buf);

                if (flags & RECVFD_PASS_IOVEC) {
                        iov.iov_base = iov_buffer;
                        iov.iov_len = sizeof(iov_buffer);
                        msg.msg_iov = &iov;
                        msg.msg_iovlen = 1;
                }

                if (flags & RECVFD_DEBUG)
                        fprintf(stderr, "debug: iteration %d, calling 
recvmsg\n",
                            i);

                received = recvmsg(fd, &msg, 0);

                if (flags & RECVFD_DEBUG)
                        fprintf(stderr, "debug: iteration %d, recvmsg got %zd 
bytes\n",
                            i, received);

                if (received == -1)
                        err(1, "recvmsg");

                if (msg.msg_flags & MSG_TRUNC)
                        warnx("control message truncated (MSG_TRUNC)");

                if (msg.msg_flags & MSG_CTRUNC)
                        warnx("control message truncated (MSG_CTRUNC)");

                cmsg = CMSG_FIRSTHDR(&msg);
                if (cmsg == NULL)
                        warnx("empty control message");

                for (; cmsg != NULL;
                     cmsg = CMSG_NXTHDR(&msg, cmsg)) {

                        if (cmsg->cmsg_len == CMSG_LEN(sizeof(int)) &&
                            cmsg->cmsg_level == SOL_SOCKET &&
                            cmsg->cmsg_type == SCM_RIGHTS) {
                                passed_fd = *(int *)CMSG_DATA(cmsg);
                                snprintf(fdstr, sizeof fdstr, "new fd: %d\n",
                                    passed_fd);
                                if (write(passed_fd, fdstr, strlen(fdstr)) == 
-1)
                                        warn("%s: write", __func__);
                                if (flags & RECVFD_DO_CLOSE)
                                        close(passed_fd);
                        }
                }
                i++;
        }

        return 0;
}


The first problem I had was when checking msg.msg_flags.  For SOCK_DGRAM
and SOCK_SEQPACKET sockets, MSG_TRUNC was set.  This sounded weird,
since the sending side pushed only control messages, but *not a single
byte of data*.  I could still get a valid fd, though.

So I switched to SOCK_STREAM, and the truncation warning went away. But
then I could not send fds in a loop: CMSG_FIRSTHDR yielded NULL
(msg.msg_controllen was zero), except at the first iteration of the
loop.

So the two issues show up when you want to call sendmsg and recvmsg
with no data at all, but only control messages.  sendmsg allocates an
empty mbuf for data (m_len == 0), this mbuf is not discarded and freed
by soreceive for SOCK_STREAM sockets, and stays in the socket buffer.

I understand that this API was probably not designed to pass only
control messages, and no data at all.  But it is easily fixable, and the
manpage could provide portability information.

So here's the diff I came up with:
- set MSG_TRUNC for atomic protocols only if there is actually a data
  loss
- drop the remaining mbuf(s) if the protocol is atomic *or* it is an
  empty message

I'm running this, and I can't see the downsides, but I could use eyes
and comments.

Index: kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.127
diff -u -p -r1.127 uipc_socket.c
--- kern/uipc_socket.c  7 Apr 2014 10:04:17 -0000       1.127
+++ kern/uipc_socket.c  14 May 2014 02:56:09 -0000
@@ -935,13 +935,15 @@ dontblock:
                }
        }
 
-       if (m && pr->pr_flags & PR_ATOMIC) {
+       if (m != NULL && m->m_len != 0 && pr->pr_flags & PR_ATOMIC)
+               /* Drop the remaining data later. */
                flags |= MSG_TRUNC;
-               if ((flags & MSG_PEEK) == 0)
-                       (void) sbdroprecord(&so->so_rcv);
-       }
+
        if ((flags & MSG_PEEK) == 0) {
-               if (m == NULL) {
+               if (m != NULL) {
+                       if (m->m_len == 0 || pr->pr_flags & PR_ATOMIC)
+                               sbdroprecord(&so->so_rcv);
+               } else {
                        /*
                         * First part is an inline SB_EMPTY_FIXUP().  Second
                         * part makes sure sb_lastrecord is up-to-date if

Reply via email to