On Tue, Sep 10, 2019 at 12:52:42PM +0200, Florian Westphal wrote:
> Pablo Neira Ayuso <[email protected]> wrote:
> > On Tue, Sep 10, 2019 at 12:19:18AM +0200, Florian Westphal wrote:
> > [...]
> > > diff --git a/src/mnl.c b/src/mnl.c
> > > index 9c1f5356c9b9..d664564e16af 100644
> > > --- a/src/mnl.c
> > > +++ b/src/mnl.c
> > > @@ -311,8 +311,6 @@ int mnl_batch_talk(struct netlink_ctx *ctx, struct
> > > list_head *err_list,
> > > int ret, fd = mnl_socket_get_fd(nl), portid = mnl_socket_get_portid(nl);
> > > uint32_t iov_len = nftnl_batch_iovec_len(ctx->batch);
> > > char rcv_buf[MNL_SOCKET_BUFFER_SIZE];
> > > - unsigned int enobuf_restarts = 0;
> > > - size_t avg_msg_size, batch_size;
> > > const struct sockaddr_nl snl = {
> > > .nl_family = AF_NETLINK
> > > };
> > > @@ -321,17 +319,22 @@ int mnl_batch_talk(struct netlink_ctx *ctx, struct
> > > list_head *err_list,
> > > .tv_usec = 0
> > > };
> > > struct iovec iov[iov_len];
> > > - unsigned int scale = 4;
> > > struct msghdr msg = {};
> > > fd_set readfds;
> > >
> > > mnl_set_sndbuffer(ctx->nft->nf_sock, ctx->batch);
> > >
> > > - batch_size = mnl_nft_batch_to_msg(ctx, &msg, &snl, iov, iov_len);
> > > - avg_msg_size = div_round_up(batch_size, num_cmds);
> > > + mnl_nft_batch_to_msg(ctx, &msg, &snl, iov, iov_len);
> > >
> > > -restart:
> > > - mnl_set_rcvbuffer(ctx->nft->nf_sock, num_cmds * avg_msg_size * scale);
> > > + if (nft_output_echo(&ctx->nft->output)) {
> > > + size_t buffer_size = MNL_SOCKET_BUFFER_SIZE * 1024;
> > > + size_t new_buffer_size = num_cmds * 1024;
> >
> > Probably simplify all of this to?
> >
> > mnl_set_rcvbuffer(ctx->nft->nf_sock, (1 << 10) * num_cmds);
>
> Reason for above patch was to avoid any risk for normal operations by
> restricting the recvbuffer tuning to echo-mode and also adding a
> lower thresh.
>
> For some reason I don't like the idea of setting only 1k recvbuf by
> default in the extreme case.
I'd still like to keep setting the receive buffer for the non-echo
case: a ruleset with lots of acknowledgments (lots of errors) might hit
ENOBUFS. I remember that was reproducible.
Probably this? It's based on your patch.
diff --git a/src/mnl.c b/src/mnl.c
index 9c1f5356c9b9..8031bd6add80 100644
--- a/src/mnl.c
+++ b/src/mnl.c
@@ -304,6 +304,8 @@ static ssize_t mnl_nft_socket_sendmsg(struct netlink_ctx *ctx,
return sendmsg(mnl_socket_get_fd(ctx->nft->nf_sock), msg, 0);
}
+#define NFT_MNL_ECHO_RCVBUFF_DEFAULT (MNL_SOCKET_BUFFER_SIZE * 1024)
+
int mnl_batch_talk(struct netlink_ctx *ctx, struct list_head *err_list,
uint32_t num_cmds)
{
@@ -311,8 +313,6 @@ int mnl_batch_talk(struct netlink_ctx *ctx, struct list_head *err_list,
int ret, fd = mnl_socket_get_fd(nl), portid = mnl_socket_get_portid(nl);
uint32_t iov_len = nftnl_batch_iovec_len(ctx->batch);
char rcv_buf[MNL_SOCKET_BUFFER_SIZE];
- unsigned int enobuf_restarts = 0;
- size_t avg_msg_size, batch_size;
const struct sockaddr_nl snl = {
.nl_family = AF_NETLINK
};
@@ -321,17 +321,24 @@ int mnl_batch_talk(struct netlink_ctx *ctx, struct list_head *err_list,
.tv_usec = 0
};
struct iovec iov[iov_len];
- unsigned int scale = 4;
struct msghdr msg = {};
+ unsigned int rcvbufsiz;
+ size_t batch_size;
fd_set readfds;
mnl_set_sndbuffer(ctx->nft->nf_sock, ctx->batch);
batch_size = mnl_nft_batch_to_msg(ctx, &msg, &snl, iov, iov_len);
- avg_msg_size = div_round_up(batch_size, num_cmds);
-restart:
- mnl_set_rcvbuffer(ctx->nft->nf_sock, num_cmds * avg_msg_size * scale);
+ if (nft_output_echo(&ctx->nft->output)) {
+ rcvbufsiz = num_cmds * 1024;
+ if (rcvbufsiz < NFT_MNL_ECHO_RCVBUFF_DEFAULT)
+ rcvbufsiz = NFT_MNL_ECHO_RCVBUFF_DEFAULT;
+ } else {
+ rcvbufsiz = num_cmds * div_round_up(batch_size, num_cmds) * 4;
+ }
+
+ mnl_set_rcvbuffer(ctx->nft->nf_sock, rcvbufsiz);
ret = mnl_nft_socket_sendmsg(ctx, &msg);
if (ret == -1)
@@ -350,13 +357,8 @@ restart:
break;
ret = mnl_socket_recvfrom(nl, rcv_buf, sizeof(rcv_buf));
- if (ret == -1) {
- if (errno == ENOBUFS && enobuf_restarts++ < 3) {
- scale *= 2;
- goto restart;
- }
+ if (ret == -1)
return -1;
- }
ret = mnl_cb_run(rcv_buf, ret, 0, portid, &netlink_echo_callback, ctx);
/* Continue on error, make sure we get all acknowledgments */