Flavio Leitner <[email protected]> writes:
> On Wed, Mar 08, 2023 at 05:37:11PM -0500, Aaron Conole wrote:
>> Open vSwitch generally tries to let the underlying operating system
>> managed the low level details of hardware, for example DMA mapping,
>> bus arbitration, etc. However, when using DPDK, the underlying
>> operating system yields control of many of these details to userspace
>> for management.
>>
>> In the case of some DPDK port drivers, configuring rte_flow or even
>> allocating resources may require access to iopl/ioperm calls, which
>> are guarded by the CAP_SYS_RAWIO privilege on linux systems. These
>> calls are dangerous, and can allow a process to completely compromise
>> a system. However, they are needed in the case of some userspace
>> driver code which manages the hardware (for example, the mlx
>> implementation of backend support for rte_flow).
>>
>> Here, we create an opt-in flag passed to the command line to allow
>> this access. We need to do this before ever accessing the database,
>> because we want to drop all privileges asap, and cannot wait for
>> a connection to the database to be established and functional before
>> dropping. There may be distribution specific ways to do capability
>> management as well (using for example, systemd), but they are not
>> as universal to the vswitchd as a flag.
>>
>> Reviewed-by: Simon Horman <[email protected]>
>> Signed-off-by: Aaron Conole <[email protected]>
>> ---
>> v1->v2: update daemon-windows for daemon_become_new_user
>>
>> v2->v3: update daemon-windows for daemon_start
>> change log messages to be clearer
>> update the manpage to provide example of why
>> one would want the flag
>>
>> NEWS | 4 ++++
>> lib/daemon-unix.c | 29 +++++++++++++++++++++--------
>> lib/daemon-windows.c | 6 ++++--
>> lib/daemon.c | 2 +-
>> lib/daemon.h | 4 ++--
>> ovsdb/ovsdb-client.c | 6 +++---
>> ovsdb/ovsdb-server.c | 4 ++--
>> tests/test-netflow.c | 2 +-
>> tests/test-sflow.c | 2 +-
>> tests/test-unixctl.c | 2 +-
>> utilities/ovs-ofctl.c | 4 ++--
>> utilities/ovs-testcontroller.c | 4 ++--
>> vswitchd/ovs-vswitchd.8.in | 9 +++++++++
>> vswitchd/ovs-vswitchd.c | 11 ++++++++++-
>> 14 files changed, 63 insertions(+), 26 deletions(-)
>>
>> diff --git a/NEWS b/NEWS
>> index 85b3496214..65f35dcdd5 100644
>> --- a/NEWS
>> +++ b/NEWS
>> @@ -10,6 +10,10 @@ Post-v3.1.0
>> in order to create OVSDB sockets with access mode of 0770.
>> - QoS:
>> * Added new configuration option 'jitter' for a linux-netem QoS type.
>> + - DPDK:
>> + * ovs-vswitchd will keep the CAP_SYS_RAWIO capability when started
>> + with the --hw-rawio-access command line option. This allows the
>> + process extra privileges when mapping physical interconnect memory.
>>
>>
>> v3.1.0 - 16 Feb 2023
>> diff --git a/lib/daemon-unix.c b/lib/daemon-unix.c
>> index 1a7ba427d7..dd839015ab 100644
>> --- a/lib/daemon-unix.c
>> +++ b/lib/daemon-unix.c
>> @@ -88,7 +88,8 @@ static bool switch_user = false;
>> static uid_t uid;
>> static gid_t gid;
>> static char *user = NULL;
>> -static void daemon_become_new_user__(bool access_datapath);
>> +static void daemon_become_new_user__(bool access_datapath,
>> + bool access_hardware_ports);
>>
>> static void check_already_running(void);
>> static int lock_pidfile(FILE *, int command);
>> @@ -443,13 +444,13 @@ monitor_daemon(pid_t daemon_pid)
>> * daemonize_complete()) or that it failed to start up (by exiting with a
>> * nonzero exit code). */
>> void
>> -daemonize_start(bool access_datapath)
>> +daemonize_start(bool access_datapath, bool access_hardware_ports)
>> {
>> assert_single_threaded();
>> daemonize_fd = -1;
>>
>> if (switch_user) {
>> - daemon_become_new_user__(access_datapath);
>> + daemon_become_new_user__(access_datapath, access_hardware_ports);
>> switch_user = false;
>> }
>>
>> @@ -807,7 +808,8 @@ daemon_become_new_user_unix(void)
>> /* Linux specific implementation of daemon_become_new_user()
>> * using libcap-ng. */
>> static void
>> -daemon_become_new_user_linux(bool access_datapath OVS_UNUSED)
>> +daemon_become_new_user_linux(bool access_datapath OVS_UNUSED,
>> + bool access_hardware_ports OVS_UNUSED)
>> {
>> #if defined __linux__ && HAVE_LIBCAPNG
>> int ret;
>> @@ -827,6 +829,16 @@ daemon_become_new_user_linux(bool access_datapath
>> OVS_UNUSED)
>> ret = capng_update(CAPNG_ADD, cap_sets, CAP_NET_ADMIN)
>> || capng_update(CAPNG_ADD, cap_sets, CAP_NET_RAW)
>> || capng_update(CAPNG_ADD, cap_sets,
>> CAP_NET_BROADCAST);
>> +#ifdef DPDK_NETDEV
>> + if (access_hardware_ports && !ret) {
>> + ret = capng_update(CAPNG_ADD, cap_sets, CAP_SYS_RAWIO);
>> + VLOG_INFO("The Linux capability CAP_SYS_RAWIO
>> enabled.");
>
> Shouldn't it be ... CAP_SYS_RAWIO is enabled. ?
I can add the 'is' - but FYI, I copied all the log messages that were
suggested in previous emails.
>
>> + }
>> +#else
>> + if (access_hardware_ports) {
>> + VLOG_WARN("No driver requires Linux capability CAP_SYS_RAWIO,
>> disabling it.");
>
> Although it is not my personal preference, log messages respects
> the 79 characters limit. There is an example in this same file at
> line #405: "waiting until 10 seconds since last..."
Okay, I can spin a v4
> Works for me. This is how I tested:
> # /usr/local/share/openvswitch/scripts/ovs-ctl \
> --ovs-user='openvswitch:hugetlbfs' \
> --ovs-vswitchd-options='--hw-rawio-access' start
>
> # getpcaps $(pidof ovs-vswitchd)
> 26923:
> cap_net_bind_service,cap_net_broadcast,cap_net_admin,cap_net_raw,cap_ipc_lock,cap_sys_rawio=ep
> 26922:
> cap_net_bind_service,cap_net_broadcast,cap_net_admin,cap_net_raw,cap_ipc_lock,cap_sys_rawio=ep
>
>
> # /usr/local/share/openvswitch/scripts/ovs-ctl stop
> Exiting ovs-vswitchd (26923).
> Exiting ovsdb-server (26910).
>
>
> # /usr/local/share/openvswitch/scripts/ovs-ctl \
> --ovs-user='openvswitch:hugetlbfs' start
> # getpcaps $(pidof ovs-vswitchd)
> 27010:
> cap_net_bind_service,cap_net_broadcast,cap_net_admin,cap_net_raw,cap_ipc_lock=ep
> 27009:
> cap_net_bind_service,cap_net_broadcast,cap_net_admin,cap_net_raw,cap_ipc_lock=ep
Thanks for showing the result.
I didn't want to make a unit test for this because I wasn't sure
if there would ever be a reliable way to test for it (ie: we don't know
that the testing environment gets run as root, with those capabilities
available, etc). Also, I used 'capsh --decode' and didn't use getpcaps
before - neat!
> Thanks,
> fbl
>
>
>> + }
>> +#endif
>> }
>> } else {
>> ret = -1;
>> @@ -854,7 +866,7 @@ daemon_become_new_user_linux(bool access_datapath
>> OVS_UNUSED)
>> }
>>
>> static void
>> -daemon_become_new_user__(bool access_datapath)
>> +daemon_become_new_user__(bool access_datapath, bool access_hardware_ports)
>> {
>> /* If vlog file has been created, change its owner to the non-root user
>> * as specifed by the --user option. */
>> @@ -862,7 +874,8 @@ daemon_become_new_user__(bool access_datapath)
>>
>> if (LINUX) {
>> if (LIBCAPNG) {
>> - daemon_become_new_user_linux(access_datapath);
>> + daemon_become_new_user_linux(access_datapath,
>> + access_hardware_ports);
>> } else {
>> VLOG_FATAL("%s: fail to downgrade user using libcap-ng. "
>> "(libcap-ng is not configured at compile time), "
>> @@ -877,11 +890,11 @@ daemon_become_new_user__(bool access_datapath)
>> * However, there in case the user switch needs to be done
>> * before daemonize_start(), the following API can be used. */
>> void
>> -daemon_become_new_user(bool access_datapath)
>> +daemon_become_new_user(bool access_datapath, bool access_hardware_ports)
>> {
>> assert_single_threaded();
>> if (switch_user) {
>> - daemon_become_new_user__(access_datapath);
>> + daemon_become_new_user__(access_datapath, access_hardware_ports);
>> /* daemonize_start() should not switch user again. */
>> switch_user = false;
>> }
>> diff --git a/lib/daemon-windows.c b/lib/daemon-windows.c
>> index 7e5f264f5b..4e6bbe0f04 100644
>> --- a/lib/daemon-windows.c
>> +++ b/lib/daemon-windows.c
>> @@ -498,7 +498,8 @@ make_pidfile(void)
>> }
>>
>> void
>> -daemonize_start(bool access_datapath OVS_UNUSED)
>> +daemonize_start(bool access_datapath OVS_UNUSED,
>> + bool access_hardware_ports OVS_UNUSED)
>> {
>> if (pidfile) {
>> make_pidfile();
>> @@ -526,7 +527,8 @@ daemonize_complete(void)
>> }
>>
>> void
>> -daemon_become_new_user(bool access_datapath OVS_UNUSED)
>> +daemon_become_new_user(bool access_datapath OVS_UNUSED,
>> + bool access_hardware_ports OVS_UNUSED)
>> {
>> }
>>
>> diff --git a/lib/daemon.c b/lib/daemon.c
>> index 3249c5ab4b..1e1c019eb1 100644
>> --- a/lib/daemon.c
>> +++ b/lib/daemon.c
>> @@ -48,7 +48,7 @@ get_detach(void)
>> void
>> daemonize(void)
>> {
>> - daemonize_start(false);
>> + daemonize_start(false, false);
>> daemonize_complete();
>> }
>>
>> diff --git a/lib/daemon.h b/lib/daemon.h
>> index 0941574963..42372d1463 100644
>> --- a/lib/daemon.h
>> +++ b/lib/daemon.h
>> @@ -167,10 +167,10 @@ void set_detach(void);
>> bool get_detach(void);
>> void daemon_save_fd(int fd);
>> void daemonize(void);
>> -void daemonize_start(bool access_datapath);
>> +void daemonize_start(bool access_datapath, bool access_hardware_ports);
>> void daemonize_complete(void);
>> void daemon_set_new_user(const char * user_spec);
>> -void daemon_become_new_user(bool access_datapath);
>> +void daemon_become_new_user(bool access_datapath, bool
>> access_hardware_ports);
>> void daemon_usage(void);
>> void daemon_disable_self_confinement(void);
>> bool daemon_should_self_confine(void);
>> diff --git a/ovsdb/ovsdb-client.c b/ovsdb/ovsdb-client.c
>> index f1b8d64910..bae2c5f041 100644
>> --- a/ovsdb/ovsdb-client.c
>> +++ b/ovsdb/ovsdb-client.c
>> @@ -250,7 +250,7 @@ main(int argc, char *argv[])
>> parse_options(argc, argv);
>> fatal_ignore_sigpipe();
>>
>> - daemon_become_new_user(false);
>> + daemon_become_new_user(false, false);
>> if (optind >= argc) {
>> ovs_fatal(0, "missing command name; use --help for help");
>> }
>> @@ -1392,7 +1392,7 @@ do_monitor__(struct jsonrpc *rpc, const char *database,
>>
>> daemon_save_fd(STDOUT_FILENO);
>> daemon_save_fd(STDERR_FILENO);
>> - daemonize_start(false);
>> + daemonize_start(false, false);
>> if (get_detach()) {
>> int error;
>>
>> @@ -2276,7 +2276,7 @@ do_lock(struct jsonrpc *rpc, const char *method, const
>> char *lock)
>> getting a reply of the previous
>> request. */
>> daemon_save_fd(STDOUT_FILENO);
>> - daemonize_start(false);
>> + daemonize_start(false, false);
>> lock_req_init(&lock_req, method, lock);
>>
>> if (get_detach()) {
>> diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c
>> index 33ca4910d7..4fea2dbda7 100644
>> --- a/ovsdb/ovsdb-server.c
>> +++ b/ovsdb/ovsdb-server.c
>> @@ -341,7 +341,7 @@ main(int argc, char *argv[])
>> &run_command, &sync_from, &sync_exclude, &active);
>> is_backup = sync_from && !active;
>>
>> - daemon_become_new_user(false);
>> + daemon_become_new_user(false, false);
>>
>> /* Create and initialize 'config_tmpfile' as a temporary file to hold
>> * ovsdb-server's most basic configuration, and then save our initial
>> @@ -359,7 +359,7 @@ main(int argc, char *argv[])
>> save_config__(config_tmpfile, &remotes, &db_filenames, sync_from,
>> sync_exclude, is_backup);
>>
>> - daemonize_start(false);
>> + daemonize_start(false, false);
>>
>> /* Load the saved config. */
>> load_config(config_tmpfile, &remotes, &db_filenames, &sync_from,
>> diff --git a/tests/test-netflow.c b/tests/test-netflow.c
>> index d2322d4509..7f89cfcae0 100644
>> --- a/tests/test-netflow.c
>> +++ b/tests/test-netflow.c
>> @@ -195,7 +195,7 @@ test_netflow_main(int argc, char *argv[])
>> }
>>
>> daemon_save_fd(STDOUT_FILENO);
>> - daemonize_start(false);
>> + daemonize_start(false, false);
>>
>> error = unixctl_server_create(NULL, &server);
>> if (error) {
>> diff --git a/tests/test-sflow.c b/tests/test-sflow.c
>> index 460d4d6c54..3c617bdd16 100644
>> --- a/tests/test-sflow.c
>> +++ b/tests/test-sflow.c
>> @@ -709,7 +709,7 @@ test_sflow_main(int argc, char *argv[])
>> }
>>
>> daemon_save_fd(STDOUT_FILENO);
>> - daemonize_start(false);
>> + daemonize_start(false, false);
>>
>> error = unixctl_server_create(NULL, &server);
>> if (error) {
>> diff --git a/tests/test-unixctl.c b/tests/test-unixctl.c
>> index 3eadf54cd9..9e89827895 100644
>> --- a/tests/test-unixctl.c
>> +++ b/tests/test-unixctl.c
>> @@ -83,7 +83,7 @@ test_unixctl_main(int argc, char *argv[])
>> fatal_ignore_sigpipe();
>> parse_options(&argc, &argv, &unixctl_path);
>>
>> - daemonize_start(false);
>> + daemonize_start(false, false);
>> int retval = unixctl_server_create(unixctl_path, &unixctl);
>> if (retval) {
>> exit(EXIT_FAILURE);
>> diff --git a/utilities/ovs-ofctl.c b/utilities/ovs-ofctl.c
>> index eabec18a36..f81f5f759a 100644
>> --- a/utilities/ovs-ofctl.c
>> +++ b/utilities/ovs-ofctl.c
>> @@ -173,7 +173,7 @@ main(int argc, char *argv[])
>> ctx.argc = argc - optind;
>> ctx.argv = argv + optind;
>>
>> - daemon_become_new_user(false);
>> + daemon_become_new_user(false, false);
>> if (read_only) {
>> ovs_cmdl_run_command_read_only(&ctx, get_all_commands());
>> } else {
>> @@ -2127,7 +2127,7 @@ monitor_vconn(struct vconn *vconn, bool
>> reply_to_echo_requests,
>> int error;
>>
>> daemon_save_fd(STDERR_FILENO);
>> - daemonize_start(false);
>> + daemonize_start(false, false);
>> error = unixctl_server_create(unixctl_path, &server);
>> if (error) {
>> ovs_fatal(error, "failed to create unixctl server");
>> diff --git a/utilities/ovs-testcontroller.c b/utilities/ovs-testcontroller.c
>> index b489ff5fc7..9f2fbfdf51 100644
>> --- a/utilities/ovs-testcontroller.c
>> +++ b/utilities/ovs-testcontroller.c
>> @@ -109,7 +109,7 @@ main(int argc, char *argv[])
>> parse_options(argc, argv);
>> fatal_ignore_sigpipe();
>>
>> - daemon_become_new_user(false);
>> + daemon_become_new_user(false, false);
>>
>> if (argc - optind < 1) {
>> ovs_fatal(0, "at least one vconn argument required; "
>> @@ -148,7 +148,7 @@ main(int argc, char *argv[])
>> ovs_fatal(0, "no active or passive switch connections");
>> }
>>
>> - daemonize_start(false);
>> + daemonize_start(false, false);
>>
>> retval = unixctl_server_create(unixctl_path, &unixctl);
>> if (retval) {
>> diff --git a/vswitchd/ovs-vswitchd.8.in b/vswitchd/ovs-vswitchd.8.in
>> index 9569265fcb..10c6e077ba 100644
>> --- a/vswitchd/ovs-vswitchd.8.in
>> +++ b/vswitchd/ovs-vswitchd.8.in
>> @@ -81,6 +81,15 @@ unavailable or unsuccessful.
>> .SS "DPDK Options"
>> For details on initializing \fBovs\-vswitchd\fR to use DPDK ports,
>> refer to the documentation or \fBovs\-vswitchd.conf.db\fR(5).
>> +.SS "DPDK HW Access Options"
>> +.IP "\fB\-\-hw\-rawio\-access\fR"
>> +Tells \fBovs\-vswitchd\fR to retain the \fBCAP_SYS_RAWIO\fR capability,
>> +to allow userspace drivers access to raw hardware memory. This will
>> +also allow the \fBovs\-vswitchd\fR daemon to call \fBiopl()\fR and
>> +\fBioperm()\fR functions as well as access memory devices to set port
>> +access. This is a \fBvery\fR powerful capability, so generally only
>> +enable as needed for specific hardware (for example mlx5 with full
>> +hardware offload via rte_flow).
>> .SS "Daemon Options"
>> .ds DD \
>> \fBovs\-vswitchd\fR detaches only after it has connected to the \
>> diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c
>> index 407bfc60eb..a244d2f709 100644
>> --- a/vswitchd/ovs-vswitchd.c
>> +++ b/vswitchd/ovs-vswitchd.c
>> @@ -60,6 +60,9 @@ VLOG_DEFINE_THIS_MODULE(vswitchd);
>> * the kernel from paging any of its memory to disk. */
>> static bool want_mlockall;
>>
>> +/* --hw-rawio-access: If set, retains CAP_SYS_RAWIO privileges. */
>> +static bool hw_rawio_access;
>> +
>> static unixctl_cb_func ovs_vswitchd_exit;
>>
>> static char *parse_options(int argc, char *argv[], char **unixctl_path);
>> @@ -89,7 +92,7 @@ main(int argc, char *argv[])
>> remote = parse_options(argc, argv, &unixctl_path);
>> fatal_ignore_sigpipe();
>>
>> - daemonize_start(true);
>> + daemonize_start(true, hw_rawio_access);
>>
>> if (want_mlockall) {
>> #ifdef HAVE_MLOCKALL
>> @@ -169,6 +172,7 @@ parse_options(int argc, char *argv[], char
>> **unixctl_pathp)
>> OPT_DPDK,
>> SSL_OPTION_ENUMS,
>> OPT_DUMMY_NUMA,
>> + OPT_HW_RAWIO_ACCESS,
>> };
>> static const struct option long_options[] = {
>> {"help", no_argument, NULL, 'h'},
>> @@ -185,6 +189,7 @@ parse_options(int argc, char *argv[], char
>> **unixctl_pathp)
>> {"disable-system-route", no_argument, NULL,
>> OPT_DISABLE_SYSTEM_ROUTE},
>> {"dpdk", optional_argument, NULL, OPT_DPDK},
>> {"dummy-numa", required_argument, NULL, OPT_DUMMY_NUMA},
>> + {"hw-rawio-access", no_argument, NULL, OPT_HW_RAWIO_ACCESS},
>> {NULL, 0, NULL, 0},
>> };
>> char *short_options =
>> ovs_cmdl_long_options_to_short_options(long_options);
>> @@ -249,6 +254,10 @@ parse_options(int argc, char *argv[], char
>> **unixctl_pathp)
>> ovs_numa_set_dummy(optarg);
>> break;
>>
>> + case OPT_HW_RAWIO_ACCESS:
>> + hw_rawio_access = true;
>> + break;
>> +
>> default:
>> abort();
>> }
>> --
>> 2.39.2
>>
>> _______________________________________________
>> dev mailing list
>> [email protected]
>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev