Hi,
Initially, I just wanted to try the examples in CMSG_DATA(3).
They are a bit "incomplete", so I had to guess a few things.
This is the result of my tests:
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/wait.h>
#include <err.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#define RECVFD_DO_CLOSE 0x1
#define RECVFD_PASS_IOVEC 0x2
#define RECVFD_DEBUG 0x4
#define SENDFD_DEBUG 0x8
int sendfd(int, u_int, u_int);
int recvfd(int, u_int, u_int);
/*
* Usage: send-recv-fd [-bc] [-d debug] [-n iterations] [-t socktype]
*
* -b passes a dummy iovec structure to recvmsg (fixes -t stream -n $n)
* -c makes the child process close the fd received via recvmsg
* -r verbose recvmsg calls
* -s verbose sendmsg calls
* -n specifies how many copies of a fd should be passed from the parent
* to the child
* -t specifies the socket type. can be stream (default), dgram, or seqtype
*/
int
main(int argc, char **argv)
{
int c;
u_int flags = 0;
u_int count = 5;
int socktype = SOCK_STREAM;
const char *errstr = NULL;
int fds[2];
while ((c = getopt(argc, argv, "bcn:rst:")) != -1)
switch (c) {
case 'b':
flags |= RECVFD_PASS_IOVEC;
break;
case 'c':
flags |= RECVFD_DO_CLOSE;
break;
case 'n':
count = strtonum(optarg, 1, INT_MAX, &errstr);
if (errstr)
errx(1, "option n: %s is %s", optarg, errstr);
break;
case 'r':
flags |= RECVFD_DEBUG;
break;
case 's':
flags |= SENDFD_DEBUG;
break;
case 't':
if (!strcmp(optarg, "dgram"))
socktype = SOCK_DGRAM;
else if (!strcmp(optarg, "seqpacket"))
socktype = SOCK_SEQPACKET;
else if (!strcmp(optarg, "stream"))
socktype = SOCK_STREAM;
else
errx(1, "bad type %s", optarg);
break;
default:
errx(1, "unknown option %c", c);
}
if (socketpair(AF_LOCAL, socktype, 0, fds) == -1)
err(1, "socketpair");
switch (fork()) {
case -1:
err(1, "fork");
/* NOTREACHED */
case 0:
sendfd(fds[0], count, flags);
break;
default:
recvfd(fds[1], count, flags);
wait(NULL);
break;
}
return 0;
}
int
sendfd(int fd, u_int count, u_int flags)
{
struct msghdr msg;
struct cmsghdr *cmsg;
union {
struct cmsghdr hdr;
unsigned char buf[CMSG_SPACE(sizeof(int))];
} cmsgbuf;
ssize_t sent;
u_int i = 1;
while (i <= count) {
memset(&msg, 0, sizeof(msg));
msg.msg_control = &cmsgbuf.buf;
msg.msg_controllen = sizeof(cmsgbuf.buf);
cmsg = CMSG_FIRSTHDR(&msg);
cmsg->cmsg_len = CMSG_LEN(sizeof(int));
cmsg->cmsg_level = SOL_SOCKET;
cmsg->cmsg_type = SCM_RIGHTS;
*(int *)CMSG_DATA(cmsg) = 2;
if (flags & SENDFD_DEBUG)
fprintf(stderr, "debug: iteration %d, calling
sendmsg\n",
i);
sent = sendmsg(fd, &msg, 0);
if (flags & SENDFD_DEBUG)
fprintf(stderr, "debug: iteration %d, sendmsg sent %zd
bytes\n",
i, sent);
if (sent == -1) {
if (errno == ENOBUFS)
usleep(20000);
else
err(1, "sendmsg");
} else {
i++;
}
}
return 0;
}
int
recvfd(int fd, u_int count, u_int flags)
{
struct msghdr msg;
struct cmsghdr *cmsg;
union {
struct cmsghdr hdr;
unsigned char buf[CMSG_SPACE(sizeof(int))];
} cmsgbuf;
char iov_buffer[1024];
struct iovec iov;
char fdstr[32];
ssize_t received;
int passed_fd;
u_int i = 1;
while (i <= count) {
memset(&msg, 0, sizeof(msg));
msg.msg_control = &cmsgbuf.buf;
msg.msg_controllen = sizeof(cmsgbuf.buf);
if (flags & RECVFD_PASS_IOVEC) {
iov.iov_base = iov_buffer;
iov.iov_len = sizeof(iov_buffer);
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
}
if (flags & RECVFD_DEBUG)
fprintf(stderr, "debug: iteration %d, calling
recvmsg\n",
i);
received = recvmsg(fd, &msg, 0);
if (flags & RECVFD_DEBUG)
fprintf(stderr, "debug: iteration %d, recvmsg got %zd
bytes\n",
i, received);
if (received == -1)
err(1, "recvmsg");
if (msg.msg_flags & MSG_TRUNC)
warnx("control message truncated (MSG_TRUNC)");
if (msg.msg_flags & MSG_CTRUNC)
warnx("control message truncated (MSG_CTRUNC)");
cmsg = CMSG_FIRSTHDR(&msg);
if (cmsg == NULL)
warnx("empty control message");
for (; cmsg != NULL;
cmsg = CMSG_NXTHDR(&msg, cmsg)) {
if (cmsg->cmsg_len == CMSG_LEN(sizeof(int)) &&
cmsg->cmsg_level == SOL_SOCKET &&
cmsg->cmsg_type == SCM_RIGHTS) {
passed_fd = *(int *)CMSG_DATA(cmsg);
snprintf(fdstr, sizeof fdstr, "new fd: %d\n",
passed_fd);
if (write(passed_fd, fdstr, strlen(fdstr)) ==
-1)
warn("%s: write", __func__);
if (flags & RECVFD_DO_CLOSE)
close(passed_fd);
}
}
i++;
}
return 0;
}
The first problem I had was when checking msg.msg_flags. For SOCK_DGRAM
and SOCK_SEQPACKET sockets, MSG_TRUNC was set. This sounded weird,
since the sending side pushed only control messages, but *not a single
byte of data*. I could still get a valid fd, though.
So I switched to SOCK_STREAM, and the truncation warning went away. But
then I could not send fds in a loop: CMSG_FIRSTHDR yielded NULL
(msg.msg_controllen was zero), except at the first iteration of the
loop.
So the two issues show up when you want to call sendmsg and recvmsg
with no data at all, but only control messages. sendmsg allocates an
empty mbuf for data (m_len == 0), this mbuf is not discarded and freed
by soreceive for SOCK_STREAM sockets, and stays in the socket buffer.
I understand that this API was probably not designed to pass only
control messages, and no data at all. But it is easily fixable, and the
manpage could provide portability information.
So here's the diff I came up with:
- set MSG_TRUNC for atomic protocols only if there is actually a data
loss
- drop the remaining mbuf(s) if the protocol is atomic *or* it is an
empty message
I'm running this, and I can't see the downsides, but I could use eyes
and comments.
Index: kern/uipc_socket.c
===================================================================
RCS file: /cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.127
diff -u -p -r1.127 uipc_socket.c
--- kern/uipc_socket.c 7 Apr 2014 10:04:17 -0000 1.127
+++ kern/uipc_socket.c 14 May 2014 02:56:09 -0000
@@ -935,13 +935,15 @@ dontblock:
}
}
- if (m && pr->pr_flags & PR_ATOMIC) {
+ if (m != NULL && m->m_len != 0 && pr->pr_flags & PR_ATOMIC)
+ /* Drop the remaining data later. */
flags |= MSG_TRUNC;
- if ((flags & MSG_PEEK) == 0)
- (void) sbdroprecord(&so->so_rcv);
- }
+
if ((flags & MSG_PEEK) == 0) {
- if (m == NULL) {
+ if (m != NULL) {
+ if (m->m_len == 0 || pr->pr_flags & PR_ATOMIC)
+ sbdroprecord(&so->so_rcv);
+ } else {
/*
* First part is an inline SB_EMPTY_FIXUP(). Second
* part makes sure sb_lastrecord is up-to-date if