On Fri, May 16, 2025 at 03:28:53PM -0500, Eric Blake via Libguestfs wrote:
> Instead of serializing connections (which in turn serializes requests
> because it is unsafe to open more than one handle into the filesystem
> at the same time), use the recently-added APIs to open the filesystem
> during .after_fork, then use the fact that ext2 code then supports
> parallel access to files within the single system handle.  Still, even
> if the underlying plugin accepts parallel actions, we serialize
> because e2fsprogs is not re-entrant.
> ---
> 
> Given the recent discussion on the openonce filter, I dug up and
> polished this old patch I had on the ext2 filter (I had started it in
> 2021, then abandoned it at the time when I couldn't figure out why it
> was not multi-conn consistent; but now I know it is because of a
> limitation in e2fsprogs).
> 
>  tests/Makefile.am           |   4 +-
>  filters/ext2/ext2.c         | 225 ++++++++++++++++++++++--------------
>  tests/test-ext2-parallel.sh | 105 +++++++++++++++++
>  3 files changed, 244 insertions(+), 90 deletions(-)
>  create mode 100755 tests/test-ext2-parallel.sh
> 
> diff --git a/tests/Makefile.am b/tests/Makefile.am
> index 7a7f6d37..63730821 100644
> --- a/tests/Makefile.am
> +++ b/tests/Makefile.am
> @@ -1835,8 +1835,8 @@ test_ext2_SOURCES = test-ext2.c test.h
>  test_ext2_CFLAGS = $(WARNINGS_CFLAGS) $(LIBGUESTFS_CFLAGS)
>  test_ext2_LDADD = libtest.la $(LIBGUESTFS_LIBS)
> 
> -TESTS += test-ext2-exportname.sh
> -EXTRA_DIST += test-ext2-exportname.sh
> +TESTS += test-ext2-exportname.sh test-ext2-parallel.sh
> +EXTRA_DIST += test-ext2-exportname.sh test-ext2-parallel.sh
> 
>  check_DATA += ext2.img
>  CLEANFILES += ext2.img
> diff --git a/filters/ext2/ext2.c b/filters/ext2/ext2.c
> index fb564c65..492740cd 100644
> --- a/filters/ext2/ext2.c
> +++ b/filters/ext2/ext2.c
> @@ -36,6 +36,7 @@
>  #include <stdlib.h>
>  #include <inttypes.h>
>  #include <errno.h>
> +#include <stdbool.h>
> 
>  /* Inlining is broken in the ext2fs header file.  Disable it by
>   * defining the following:
> @@ -55,6 +56,14 @@
>   */
>  static const char *file;
> 
> +/* Filesystem handle, shared between all client connections. */
> +static ext2_filsys fs;
> +
> +/* Plugin access shared between all client connections, and doubles as
> + * the "name" parameter for ext2fs_open.
> + */
> +static nbdkit_next *plugin;
> +
>  static void
>  ext2_load (void)
>  {
> @@ -89,7 +98,8 @@ ext2_config_complete (nbdkit_next_config_complete *next, 
> nbdkit_backend *nxdata)
>    if (strcmp (file, "exportname") == 0)
>      file = NULL;
>    else if (file[0] != '/') {
> -    nbdkit_error ("the file parameter must refer to an absolute path");
> +    nbdkit_error ("the file parameter must be 'exportname' or refer to "
> +                  "an absolute path");
>      return -1;
>    }
> 
> @@ -100,13 +110,87 @@ ext2_config_complete (nbdkit_next_config_complete 
> *next, nbdkit_backend *nxdata)
>    "ext2file=<FILENAME>  (required) Absolute name of file to serve inside\n" \
>    "                     the disk image, or 'exportname' for client choice."
> 
> +/* Opening more than one instance of the filesystem in parallel is a
> + * recipe for disaster, so instead we open a single instance during
> + * after_fork to share among all client connections.
> + */
> +static int
> +ext2_after_fork (nbdkit_backend *nxdata)
> +{
> +  CLEANUP_FREE char *name = NULL;
> +  int fs_flags;
> +  int64_t r;
> +  errcode_t err;
> +
> +  /* It would be desirable for ???nbdkit -r??? to behave the same way as
> +   * ???mount -o ro???.  But we don't know the state of the readonly flag

I think the mailing list may have munged some Unicode codepoints here.

> +   * until ext2_open is called, so for now we can't do that.  We could
> +   * add a knob during .config if desired; but until then, we blindly
> +   * request write access to the underlying plugin, for journal
> +   * replay.
> +   *
> +   * Similarly, there is no sane way to pass the client's exportname
> +   * through to the plugin (whether or not .config was set to serve a
> +   * single file or to let the client choose by exportname), so
> +   * blindly ask for "" and rely on the plugin's default.
> +   */
> +  plugin = nbdkit_next_context_open (nxdata, 0, "", true);
> +  if (plugin == NULL) {
> +    nbdkit_error ("unable to open plugin");
> +    return -1;
> +  }
> +  if (plugin->prepare (plugin) == -1)
> +    goto fail;
> +
> +  fs_flags = 0;
> +#ifdef EXT2_FLAG_64BITS
> +  fs_flags |= EXT2_FLAG_64BITS;
> +#endif
> +  r = plugin->get_size (plugin);
> +  if (r == -1)
> +    goto fail;
> +  /* XXX See note above about a knob for read-only */
> +  r = plugin->can_write (plugin);
> +  if (r == -1)
> +    goto fail;
> +  if (r == 1)
> +    fs_flags |= EXT2_FLAG_RW;
> +
> +  name = nbdkit_io_encode (plugin);
> +  if (!name) {
> +    nbdkit_error ("nbdkit_io_encode: %m");
> +    goto fail;
> +  }
> +
> +  err = ext2fs_open (name, fs_flags, 0, 0, nbdkit_io_manager, &fs);
> +  if (err != 0) {
> +    nbdkit_error ("open: %s", error_message (err));
> +    goto fail;
> +  }
> +
> +  return 0;
> +
> + fail:
> +  plugin->finalize (plugin);
> +  nbdkit_next_context_close (plugin);
> +  return -1;
> +}
> +
> +static void
> +ext2_cleanup (nbdkit_backend *nxdata)
> +{
> +  if (fs)
> +    ext2fs_close (fs);
> +  plugin->finalize (plugin);
> +  nbdkit_next_context_close (plugin);
> +}

Yes, calling nbdkit_next_context_close in .unload doesn't work.  I
didn't exactly get to the bottom of why, but I think it's because of
the order that the plugin and filter get unmapped from memory.

>  /* The per-connection handle. */
>  struct handle {
>    const char *exportname;       /* Client export name. */
> -  ext2_filsys fs;               /* Filesystem handle. */
>    ext2_ino_t ino;               /* Inode of open file. */
>    ext2_file_t file;             /* File handle. */
> -  nbdkit_next *next;            /* "name" parameter to ext2fs_open. */
> +  nbdkit_context *context;      /* Access to the filter context. */
>  };
> 
>  /* Export list. */
> @@ -117,18 +201,17 @@ ext2_list_exports (nbdkit_next_list_exports *next, 
> nbdkit_backend *nxdata,
>    /* If we are honoring export names, the default export "" won't
>     * work, and we must not leak export names from the underlying
>     * plugin.  Advertising all filenames within the ext2 image could be
> -   * huge, and even if we wanted to, it would require that we could
> -   * open the plugin prior to the client reaching our .open.  So leave
> -   * the list empty instead.
> +   * huge, although we could do it if we wanted to since the
> +   * filesystem was already opened in .after_fork.
>     */
>    if (!file)
>      return 0;
> 
>    /* If we are serving a specific ext2file, we don't care what export
> -   * name the user passes, but the underlying plugin might; there's no
> -   * harm in advertising that list.
> +   * name the user passes, but it's too late to pass that on to the
> +   * underlying plugin, so advertise just "".
>     */
> -  return next (nxdata, readonly, exports);
> +  return nbdkit_use_default_export (exports);
>  }
> 
>  /* Default export. */
> @@ -170,17 +253,7 @@ ext2_open (nbdkit_next_open *next, nbdkit_context 
> *nxdata,
>      return NULL;
>    }
> 
> -  /* If file == NULL (ie. using exportname) then don't
> -   * pass the client exportname to the lower layers.
> -   */
> -  exportname = file ? exportname : "";
> -
> -  /* Request write access to the underlying plugin, for journal replay. */
> -  if (next (nxdata, 0, exportname) == -1) {
> -    free (h);
> -    return NULL;
> -  }
> -
> +  h->context = nxdata;
>    return h;
>  }
> 
> @@ -189,36 +262,12 @@ ext2_prepare (nbdkit_next *next, void *handle, int 
> readonly)
>  {
>    struct handle *h = handle;
>    errcode_t err;
> -  int fs_flags;
>    int file_flags;
>    struct ext2_inode inode;
> -  int64_t r;
>    CLEANUP_FREE char *name = NULL;
>    const char *fname = file ?: h->exportname;
>    CLEANUP_FREE char *absname = NULL;
> -
> -  fs_flags = 0;
> -#ifdef EXT2_FLAG_64BITS
> -  fs_flags |= EXT2_FLAG_64BITS;
> -#endif
> -  r = next->get_size (next);
> -  if (r == -1)
> -    return -1;
> -  r = next->can_write (next);
> -  if (r == -1)
> -    return -1;
> -  if (r == 0)
> -    readonly = 1;
> -
> -  if (!readonly)
> -    fs_flags |= EXT2_FLAG_RW;
> -
> -  h->next = next;
> -  name = nbdkit_io_encode (next);
> -  if (!name) {
> -    nbdkit_error ("nbdkit_io_encode: %m");
> -    return -1;
> -  }
> +  nbdkit_next *old;
> 
>    if (fname[0] != '/') {
>      if (asprintf (&absname, "/%s", fname) < 0) {
> @@ -228,53 +277,58 @@ ext2_prepare (nbdkit_next *next, void *handle, int 
> readonly)
>      fname = absname;
>    }
> 
> -  err = ext2fs_open (name, fs_flags, 0, 0, nbdkit_io_manager, &h->fs);
> -  if (err != 0) {
> -    nbdkit_error ("open: %s", error_message (err));
> -    goto err0;
> -  }
> -
>    if (strcmp (fname, "/") == 0)
>      /* probably gonna fail, but we'll catch it later */
>      h->ino = EXT2_ROOT_INO;
>    else {
> -    err = ext2fs_namei (h->fs, EXT2_ROOT_INO, EXT2_ROOT_INO,
> +    err = ext2fs_namei (fs, EXT2_ROOT_INO, EXT2_ROOT_INO,
>                          &fname[1], &h->ino);
>      if (err != 0) {
>        nbdkit_error ("%s: namei: %s", fname, error_message (err));
> -      goto err1;
> +      return -1;
>      }
>    }
> 
>    /* Check that fname is a regular file.
>     * XXX This won't follow symlinks, we'd have to do that manually.
>     */
> -  err = ext2fs_read_inode (h->fs, h->ino, &inode);
> +  err = ext2fs_read_inode (fs, h->ino, &inode);
>    if (err != 0) {
>      nbdkit_error ("%s: inode: %s", fname, error_message (err));
> -    goto err1;
> +    return -1;
>    }
>    if (!LINUX_S_ISREG (inode.i_mode)) {
>      nbdkit_error ("%s: must be a regular file in the disk image", fname);
> -    goto err1;
> +    return -1;
>    }
> 
>    file_flags = 0;
>    if (!readonly)
>      file_flags |= EXT2_FILE_WRITE;
> -  err = ext2fs_file_open2 (h->fs, h->ino, NULL, file_flags, &h->file);
> +  err = ext2fs_file_open2 (fs, h->ino, NULL, file_flags, &h->file);
>    if (err != 0) {
>      nbdkit_error ("%s: open: %s", fname, error_message (err));
> -    goto err1;
> +    return -1;
>    }
> 
> +  /* Associate our shared backend with this connection, so we don't
> +   * have to override every single callback function.
> +   */
> +  old = nbdkit_context_set_next (h->context, plugin);
> +  assert (old == NULL);
>    return 0;
> +}
> 
> - err1:
> -  ext2fs_close (h->fs);
> -  h->fs = NULL;
> - err0:
> -  return -1;
> +static int
> +ext2_finalize (nbdkit_next *next, void *handle)
> +{
> +  struct handle *h = handle;
> +  nbdkit_next *old;
> +
> +  /* Ensure our shared plugin handle survives past this connection. */
> +  old = nbdkit_context_set_next (h->context, NULL);
> +  assert (old == next);
> +  return 0;
>  }
> 
>  /* Free up the per-connection handle. */
> @@ -283,10 +337,8 @@ ext2_close (void *handle)
>  {
>    struct handle *h = handle;
> 
> -  if (h->fs) {
> +  if (h->file)
>      ext2fs_file_close (h->file);
> -    ext2fs_close (h->fs);
> -  }
>    free (h);
>  }
> 
> @@ -306,12 +358,10 @@ ext2_can_cache (nbdkit_next *next, void *handle)
>  static int
>  ext2_can_multi_conn (nbdkit_next *next, void *handle)
>  {
> -  /* Since we do not permit parallel connections, it does not matter
> -   * what we advertise here, and we could just as easily inherit the
> -   * plugin's .can_multi_conn.  But realistically, if we adjust
> -   * .thread_model, we cannot advertise support unless .flush is
> -   * consistent, and that would require inspecting the ext2 source
> -   * code, so for now, we hard-code a safe answer.
> +  /* Although we permit parallel client connections multiplexed into
> +   * the single shared filesystem handle, we absolutely know that ext2
> +   * does not share caches between separate opens of the same inode.
> +   * Hard-code the only correct answer.
>     */
>    return 0;
>  }
> @@ -324,7 +374,7 @@ ext2_can_flush (nbdkit_next *next, void *handle)
>     * plugin ability, since ext2 wants to flush the filesystem into
>     * permanent storage when possible.
>     */
> -  if (next->can_flush (next) == -1)
> +  if (plugin->can_flush (plugin) == -1)
>      return -1;
>    return 1;
>  }
> @@ -340,7 +390,7 @@ ext2_can_zero (nbdkit_next *next, void *handle)
>     * though we don't implement .zero, the file system wants to know if
>     * it can use next->zero() during io_zeroout.
>     */
> -  if (next->can_zero (next) == -1)
> +  if (plugin->can_zero (plugin) == -1)
>      return -1;
>    return NBDKIT_ZERO_EMULATE;
>  }
> @@ -353,26 +403,22 @@ ext2_can_trim (nbdkit_next *next, void *handle)
>     * implement .trim, the file system wants to know if it can use
>     * next->trim() during io_discard.
>     */
> -  if (next->can_trim (next) == -1)
> +  if (plugin->can_trim (plugin) == -1)
>      return -1;
>    return 0;
>  }
> 
> -/* It might be possible to relax this, but it's complicated.
> - *
> - * It's desirable for ???nbdkit -r??? to behave the same way as
> - * ???mount -o ro???.  But we don't know the state of the readonly flag
> - * until ext2_open is called (because the NBD client can also request
> - * a readonly connection).  So we could not set the "ro" flag if we
> - * opened the filesystem any earlier (eg in ext2_config).
> - *
> - * So out of necessity we have one ext2_filsys handle per connection,
> - * but if we allowed parallel work on those handles then we would get
> - * data corruption, so we need to serialize connections.
> +/* ext2 is generally not re-entrant; even if the underlying plugin
> + * supports parallel actions, at most one thread should be
> + * manipulating the ext2 filesystem.  Since multiple clients are
> + * sharing the same common handle into the plugin, this tells nbdkit
> + * to execute only one action at a time.
>   */
>  static int ext2_thread_model (int next_thread_model)
>  {
> -  return NBDKIT_THREAD_MODEL_SERIALIZE_CONNECTIONS;
> +  if (next_thread_model == NBDKIT_THREAD_MODEL_SERIALIZE_CONNECTIONS)
> +    return next_thread_model;
> +  return NBDKIT_THREAD_MODEL_SERIALIZE_ALL_REQUESTS;
>  }
> 
>  /* Description. */
> @@ -382,7 +428,7 @@ ext2_export_description (nbdkit_next *next, void *handle)
>    struct handle *h = handle;
>    const char *fname = file ?: h->exportname;
>    const char *slash = fname[0] == '/' ? "" : "/";
> -  const char *base = next->export_description (next);
> +  const char *base = plugin->export_description (plugin);
> 
>    if (!base)
>      return NULL;
> @@ -510,10 +556,13 @@ static struct nbdkit_filter filter = {
>    .config_complete    = ext2_config_complete,
>    .config_help        = ext2_config_help,
>    .thread_model       = ext2_thread_model,
> +  .after_fork         = ext2_after_fork,
> +  .cleanup            = ext2_cleanup,
>    .list_exports       = ext2_list_exports,
>    .default_export     = ext2_default_export,
>    .open               = ext2_open,
>    .prepare            = ext2_prepare,
> +  .finalize           = ext2_finalize,
>    .close              = ext2_close,
>    .can_fua            = ext2_can_fua,
>    .can_cache          = ext2_can_cache,
> diff --git a/tests/test-ext2-parallel.sh b/tests/test-ext2-parallel.sh
> new file mode 100755
> index 00000000..62dc10a8
> --- /dev/null
> +++ b/tests/test-ext2-parallel.sh
> @@ -0,0 +1,105 @@
> +#!/usr/bin/env bash
> +# nbdkit
> +# Copyright (C) Red Hat Inc.
> +#
> +# Redistribution and use in source and binary forms, with or without
> +# modification, are permitted provided that the following conditions are
> +# met:
> +#
> +# * Redistributions of source code must retain the above copyright
> +# notice, this list of conditions and the following disclaimer.
> +#
> +# * Redistributions in binary form must reproduce the above copyright
> +# notice, this list of conditions and the following disclaimer in the
> +# documentation and/or other materials provided with the distribution.
> +#
> +# * Neither the name of Red Hat nor the names of its contributors may be
> +# used to endorse or promote products derived from this software without
> +# specific prior written permission.
> +#
> +# THIS SOFTWARE IS PROVIDED BY RED HAT AND CONTRIBUTORS ''AS IS'' AND
> +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
> +# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
> +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL RED HAT OR
> +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
> +# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
> +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
> +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
> +# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> +# SUCH DAMAGE.
> +
> +source ./functions.sh
> +set -e
> +set -x
> +
> +requires_plugin file
> +requires nbdsh
> +requires guestfish --version
> +
> +sock=$(mktemp -u /tmp/nbdkit-test-sock.XXXXXX)
> +files="$sock ext2-parallel.pid ext2-parallel.out ext2-parallel.err \
> +ext2-parallel.img"
> +rm -f $files
> +cleanup_fn rm -f $files
> +
> +# Create an image with two writable files
> +guestfish \
> +    sparse ext2-parallel.img 10M : \
> +    run : \
> +    mkfs ext4 /dev/sda : \
> +    mount /dev/sda / : \
> +    write /one hello : \
> +    write /two goodbye
> +
> +# Set up a long-running server responsive to the client's export name
> +start_nbdkit -P ext2-parallel.pid -U $sock --filter=ext2 \
> +    file ext2-parallel.img ext2file=exportname
> +
> +# Demonstrate 3 clients with parallel connections (but interleaved actions):
> +# first reads from /one,
> +# second writes to /one,
> +# third reads from /two
> +# second flushes
> +# first reads updated /one
> +export sock
> +nbdsh -c '
> +import os
> +sock = os.getenv("sock")
> +
> +h1 = nbd.NBD()
> +h1.set_export_name("/one")
> +h1.connect_unix(sock)
> +
> +h2 = nbd.NBD()
> +h2.set_export_name("/one")
> +h2.connect_unix(sock)
> +
> +h3 = nbd.NBD()
> +h3.set_export_name("/two")
> +h3.connect_unix(sock)
> +
> +buf1 = h1.pread(5, 0)
> +assert buf1 == b"hello"
> +h2.pwrite(b"world", 0)
> +buf3 = h3.pread(7, 0)
> +assert buf3 == b"goodbye"
> +h2.flush()
> +
> +# Even with the flush, two handles visiting the same inodes do not share
> +# a cache.  A new handle sees the updated inode content...
> +h4 = nbd.NBD()
> +h4.set_export_name("/one")
> +h4.connect_unix(sock)
> +buf4 = h4.pread(5, 0)
> +assert buf4 == b"world"
> +# ...but the older handle still sees old data.
> +buf1 = h1.pread(5, 0)
> +assert buf1 == b"hello"
> +
> +h1.shutdown()
> +h2.shutdown()
> +h3.shutdown()
> +h4.shutdown()
> +'
> -- 

Looks fine here, so:

Reviewed-by: Richard W.M. Jones <rjo...@redhat.com>

Thanks,

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
virt-builder quickly builds VMs from scratch
http://libguestfs.org/virt-builder.1.html
_______________________________________________
Libguestfs mailing list -- guestfs@lists.libguestfs.org
To unsubscribe send an email to guestfs-le...@lists.libguestfs.org

Reply via email to