The branch, master has been updated
       via  8ec9903426ec4e559df8ac8306a8ebcdf0706176 (commit)
       via  0dcfa9ce1baa9f2074a002fdb5c8b88cc5db28db (commit)
      from  1ff9696306894c136015f83456e4c6e039e31e26 (commit)

http://gitweb.samba.org/?p=samba.git;a=shortlog;h=master


- Log -----------------------------------------------------------------
commit 8ec9903426ec4e559df8ac8306a8ebcdf0706176
Author: Tim Prouty <[email protected]>
Date:   Fri Feb 20 13:27:39 2009 -0800

    s3 OneFS: Add an atomic sendfile implementation

commit 0dcfa9ce1baa9f2074a002fdb5c8b88cc5db28db
Author: Tim Prouty <[email protected]>
Date:   Fri Feb 20 13:28:36 2009 -0800

    s3: If sendfile returns 0 bytes read, fall back to the normal read path
    
    This allows sendfile implementations that are atomic to avoid having
    to send zeros or kill the client connection on a short read (usually
    the file was truncated).

-----------------------------------------------------------------------

Summary of changes:
 source3/modules/onefs.h        |   10 ++
 source3/modules/onefs_system.c |  257 ++++++++++++++++++++++++++++++++++++++++
 source3/modules/vfs_onefs.c    |   15 +++
 source3/smbd/reply.c           |   24 ++++
 4 files changed, 306 insertions(+), 0 deletions(-)


Changeset truncated at 500 lines:

diff --git a/source3/modules/onefs.h b/source3/modules/onefs.h
index ea452a4..a70664b 100644
--- a/source3/modules/onefs.h
+++ b/source3/modules/onefs.h
@@ -47,6 +47,8 @@ enum onefs_acl_wire_format
 #define PARM_ATIME_STATIC_DEFAULT NULL
 #define PARM_ATIME_SLOP                "atime now slop"
 #define PARM_ATIME_SLOP_DEFAULT         0
+#define PARM_ATOMIC_SENDFILE "atomic sendfile"
+#define PARM_ATOMIC_SENDFILE_DEFAULT true
 #define PARM_CREATOR_OWNER_GETS_FULL_CONTROL "creator owner gets full control"
 #define PARM_CREATOR_OWNER_GETS_FULL_CONTROL_DEFAULT true
 #define PARM_CTIME_NOW         "ctime now files"
@@ -63,6 +65,10 @@ enum onefs_acl_wire_format
 #define PARM_MTIME_SLOP_DEFAULT        0
 #define PARM_USE_READDIRPLUS "use readdirplus"
 #define PARM_USE_READDIRPLUS_DEFAULT true
+#define PARM_SENDFILE_LARGE_READS "sendfile large reads"
+#define PARM_SENDFILE_LARGE_READS_DEFAULT false
+#define PARM_SENDFILE_SAFE "sendfile safe"
+#define PARM_SENDFILE_SAFE_DEFAULT true
 #define PARM_SIMPLE_FILE_SHARING_COMPATIBILITY_MODE "simple file sharing compatibility mode"
 #define PARM_SIMPLE_FILE_SHARING_COMPATIBILITY_MODE_DEFAULT false
 #define PARM_UNMAPPABLE_SIDS_DENY_EVERYONE "unmappable sids deny everyone"
@@ -254,6 +260,10 @@ int onefs_sys_create_file(connection_struct *conn,
                          uint32_t ntfs_flags,
                          int *granted_oplock);
 
+ssize_t onefs_sys_sendfile(connection_struct *conn, int tofd, int fromfd,
+                          const DATA_BLOB *header, SMB_OFF_T offset,
+                          size_t count);
+
 ssize_t onefs_sys_recvfile(int fromfd, int tofd, SMB_OFF_T offset,
                           size_t count);
 
diff --git a/source3/modules/onefs_system.c b/source3/modules/onefs_system.c
index 3a86b4b..1080289 100644
--- a/source3/modules/onefs_system.c
+++ b/source3/modules/onefs_system.c
@@ -178,6 +178,263 @@ int onefs_sys_create_file(connection_struct *conn,
 }
 
 /**
+ * FreeBSD based sendfile implementation that allows for atomic semantics.
+ */
+static ssize_t onefs_sys_do_sendfile(int tofd, int fromfd,
+    const DATA_BLOB *header, SMB_OFF_T offset, size_t count, bool atomic)
+{
+       size_t total=0;
+       struct sf_hdtr hdr;
+       struct iovec hdtrl;
+       size_t hdr_len = 0;
+       int flags = 0;
+
+       if (atomic) {
+               flags = SF_ATOMIC;
+       }
+
+       hdr.headers = &hdtrl;
+       hdr.hdr_cnt = 1;
+       hdr.trailers = NULL;
+       hdr.trl_cnt = 0;
+
+       /* Set up the header iovec. */
+       if (header) {
+               hdtrl.iov_base = header->data;
+               hdtrl.iov_len = hdr_len = header->length;
+       } else {
+               hdtrl.iov_base = NULL;
+               hdtrl.iov_len = 0;
+       }
+
+       total = count;
+       while (total + hdtrl.iov_len) {
+               SMB_OFF_T nwritten;
+               int ret;
+
+               /*
+                * FreeBSD sendfile returns 0 on success, -1 on error.
+                * Remember, the tofd and fromfd are reversed..... :-).
+                * nwritten includes the header data sent.
+                */
+
+               do {
+                       ret = sendfile(fromfd, tofd, offset, total, &hdr,
+                                      &nwritten, flags);
+               } while (ret == -1 && errno == EINTR);
+
+               /* On error we're done. */
+               if (ret == -1) {
+                       return -1;
+               }
+
+               /*
+                * If this was an ATOMIC sendfile, nwritten doesn't
+                * necessarily indicate an error.  It could mean count > than
+                * what sendfile can handle atomically (usually 64K) or that
+                * there was a short read due to the file being truncated.
+                */
+               if (nwritten == 0) {
+                       return atomic ? 0 : -1;
+               }
+
+               /*
+                * An atomic sendfile should never send partial data!
+                */
+               if (atomic && nwritten != total + hdtrl.iov_len) {
+                       DEBUG(0,("Atomic sendfile() sent partial data: "
+                                "%llu of %d\n", nwritten,
+                                total + hdtrl.iov_len));
+                       return -1;
+               }
+
+               /*
+                * If this was a short (signal interrupted) write we may need
+                * to subtract it from the header data, or null out the header
+                * data altogether if we wrote more than hdtrl.iov_len bytes.
+                * We change nwritten to be the number of file bytes written.
+                */
+
+               if (hdtrl.iov_base && hdtrl.iov_len) {
+                       if (nwritten >= hdtrl.iov_len) {
+                               nwritten -= hdtrl.iov_len;
+                               hdtrl.iov_base = NULL;
+                               hdtrl.iov_len = 0;
+                       } else {
+                               hdtrl.iov_base =
+                                   (caddr_t)hdtrl.iov_base + nwritten;
+                               hdtrl.iov_len -= nwritten;
+                               nwritten = 0;
+                       }
+               }
+               total -= nwritten;
+               offset += nwritten;
+       }
+       return count + hdr_len;
+}
+
+/**
+ * Handles the subtleties of using sendfile with CIFS.
+ */
+ssize_t onefs_sys_sendfile(connection_struct *conn, int tofd, int fromfd,
+                          const DATA_BLOB *header, SMB_OFF_T offset,
+                          size_t count)
+{
+       bool atomic = false;
+       ssize_t ret = 0;
+
+       if (lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE,
+                        PARM_ATOMIC_SENDFILE,
+                        PARM_ATOMIC_SENDFILE_DEFAULT)) {
+               atomic = true;
+       }
+
+       /* Try the sendfile */
+       ret = onefs_sys_do_sendfile(tofd, fromfd, header, offset, count,
+                                   atomic);
+
+       /* If the sendfile wasn't atomic, we're done. */
+       if (!atomic) {
+               DEBUG(10, ("non-atomic sendfile read %ul bytes", ret));
+               return ret;
+       }
+
+       /*
+        * Atomic sendfile takes care to not write anything to the socket
+        * until all of the requested bytes have been read from the file.
+        * There are two atomic cases that need to be handled.
+        *
+        *  1. The file was truncated causing less data to be read than was
+        *     requested.  In this case, we return back to the caller to
+        *     indicate 0 bytes were written to the socket.  This should
+        *     prompt the caller to fallback to the standard read path: read
+        *     the data, create a header that indicates how many bytes were
+        *     actually read, and send the header/data back to the client.
+        *
+        *     This saves us from standard sendfile behavior of sending a
+        *     header promising more data then will actually be sent.  The
+        *     only two options are to close the socket and kill the client
+        *     connection, or write a bunch of 0s.  Closing the client
+        *     connection is bad because there could actually be multiple
+        *     sessions multiplexed from the same client that are all dropped
+        *     because of a truncate.  Writing the remaining data as 0s also
+        *     isn't good, because the client will have an incorrect version
+        *     of the file.  If the file is written back to the server, the 0s
+        *     will be written back.  Fortunately, atomic sendfile allows us
+        *     to avoid making this choice in most cases.
+        *
+        *  2. One downside of atomic sendfile, is that there is a limit on
+        *     the number of bytes that can be sent atomically.  The kernel
+        *     has a limited amount of mbuf space that it can read file data
+        *     into without exhausting the system's mbufs, so a buffer of
+        *     length xfsize is used.  The xfsize at the time of writing this
+        *     is 64K.  xfsize bytes are read from the file, and subsequently
+        *     written to the socket.  This makes it impossible to do the
+        *     sendfile atomically for a byte count > xfsize.
+        *
+        *     To cope with large requests, atomic sendfile returns -1 with
+        *     errno set to E2BIG.  Since windows maxes out at 64K writes,
+        *     this is currently only a concern with non-windows clients.
+        *     Posix extensions allow the full 24bit bytecount field to be
+        *     used in ReadAndX, and clients such as smbclient and the linux
+        *     cifs client can request up to 16MB reads!  There are a few
+        *     options for handling large sendfile requests.
+        *
+        *      a. Fall back to the standard read path.  This is unacceptable
+        *         because it would require prohibitively large mallocs.
+        *
+        *      b. Fall back to using samba's fake_send_file which emulates
+        *         the kernel sendfile in userspace.  This still has the same
+        *         problem of sending the header before all of the data has
+        *         been read, so it doesn't buy us anything, and has worse
+        *         performance than the kernel's zero-copy sendfile.
+        *
+        *      c. Use non-atomic sendfile syscall to attempt a zero copy
+        *         read, and hope that there isn't a short read due to
+        *         truncation.  In the case of a short read, there are two
+        *         options:
+        *
+        *          1. Kill the client connection
+        *
+        *          2. Write zeros to the socket for the remaining bytes
+        *             promised in the header.
+        *
+        *         It is safer from a data corruption perspective to kill the
+        *         client connection, so this is our default behavior, but if
+        *         this causes problems this can be configured to write zeros
+        *         via smb.conf.
+        */
+
+       /* Handle case 1: short read -> truncated file. */
+       if (ret == 0) {
+               return ret;
+       }
+
+       /* Handle case 2: large read. */
+       if (ret == -1 && errno == E2BIG) {
+
+               if (!lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE,
+                                PARM_SENDFILE_LARGE_READS,
+                                PARM_SENDFILE_LARGE_READS_DEFAULT)) {
+                       DEBUG(3, ("Not attempting non-atomic large sendfile: "
+                                 "%lu bytes\n", count));
+                       return 0;
+               }
+
+               if (count < 0x10000) {
+                       DEBUG(0, ("Count < 2^16 and E2BIG was returned! %lu",
+                                 count));
+               }
+
+               DEBUG(10, ("attempting non-atomic large sendfile: %lu bytes\n",
+                          count));
+
+               /* Try a non-atomic sendfile. */
+               ret = onefs_sys_do_sendfile(tofd, fromfd, header, offset,
+                                           count, false);
+               /* Real error: kill the client connection. */
+               if (ret == -1) {
+                       DEBUG(1, ("error on non-atomic large sendfile "
+                                 "(%lu bytes): %s\n", count,
+                                 strerror(errno)));
+                       return ret;
+               }
+
+               /* Short read: kill the client connection. */
+               if (ret != count + header->length) {
+                       DEBUG(1, ("short read on non-atomic large sendfile "
+                                 "(%lu of %lu bytes): %s\n", ret, count,
+                                 strerror(errno)));
+
+                       /*
+                        * Returning ret here would cause us to drop into the
+                        * codepath that calls sendfile_short_send, which
+                        * sends the client a bunch of zeros instead.
+                        * Returning -1 kills the connection.
+                        */
+                       if (lp_parm_bool(SNUM(conn), PARM_ONEFS_TYPE,
+                               PARM_SENDFILE_SAFE,
+                               PARM_SENDFILE_SAFE_DEFAULT)) {
+                               return -1;
+                       }
+
+                       return ret;
+               }
+
+               DEBUG(10, ("non-atomic large sendfile successful\n"));
+       }
+
+       /* There was error in the atomic sendfile. */
+       if (ret == -1) {
+               DEBUG(1, ("error on %s sendfile (%lu bytes): %s\n",
+                         atomic ? "atomic" : "non-atomic",
+                         count, strerror(errno)));
+       }
+
+       return ret;
+}
+
+/**
  * Only talloc the spill buffer once (reallocing when necessary).
  */
 static char *get_spill_buffer(size_t new_count)
diff --git a/source3/modules/vfs_onefs.c b/source3/modules/vfs_onefs.c
index f0c6a9d..60c2c97 100644
--- a/source3/modules/vfs_onefs.c
+++ b/source3/modules/vfs_onefs.c
@@ -156,6 +156,19 @@ static int onefs_open(vfs_handle_struct *handle, const char *fname,
        return SMB_VFS_NEXT_OPEN(handle, fname, fsp, flags, mode);
 }
 
+static ssize_t onefs_sendfile(vfs_handle_struct *handle, int tofd,
+                             files_struct *fromfsp, const DATA_BLOB *header,
+                             SMB_OFF_T offset, size_t count)
+{
+       ssize_t result;
+
+       START_PROFILE_BYTES(syscall_sendfile, count);
+       result = onefs_sys_sendfile(handle->conn, tofd, fromfsp->fh->fd,
+                                   header, offset, count);
+       END_PROFILE(syscall_sendfile);
+       return result;
+}
+
 static ssize_t onefs_recvfile(vfs_handle_struct *handle, int fromfd,
                              files_struct *tofsp, SMB_OFF_T offset,
                              size_t count)
@@ -340,6 +353,8 @@ static vfs_op_tuple onefs_ops[] = {
         SMB_VFS_LAYER_OPAQUE},
        {SMB_VFS_OP(onefs_close), SMB_VFS_OP_CLOSE,
         SMB_VFS_LAYER_TRANSPARENT},
+       {SMB_VFS_OP(onefs_sendfile), SMB_VFS_OP_SENDFILE,
+        SMB_VFS_LAYER_OPAQUE},
        {SMB_VFS_OP(onefs_recvfile), SMB_VFS_OP_RECVFILE,
         SMB_VFS_LAYER_OPAQUE},
        {SMB_VFS_OP(onefs_rename), SMB_VFS_OP_RENAME,
diff --git a/source3/smbd/reply.c b/source3/smbd/reply.c
index 457f941..b30ef23 100644
--- a/source3/smbd/reply.c
+++ b/source3/smbd/reply.c
@@ -2788,6 +2788,18 @@ static void send_file_readbraw(connection_struct *conn,
                        DEBUG(0,("send_file_readbraw: sendfile failed for file %s (%s). Terminating\n",
                                fsp->fsp_name, strerror(errno) ));
                        exit_server_cleanly("send_file_readbraw sendfile failed");
+               } else if (sendfile_read == 0) {
+                       /*
+                        * Some sendfile implementations return 0 to indicate
+                        * that there was a short read, but nothing was
+                        * actually written to the socket.  In this case,
+                        * fallback to the normal read path so the header gets
+                        * the correct byte count.
+                        */
+                       DEBUG(3, ("send_file_readbraw: sendfile sent zero "
+                                 "bytes falling back to the normal read: "
+                                 "%s\n", fsp->fsp_name));
+                       goto normal_readbraw;
                }
 
                /* Deal with possible short send. */
@@ -3284,6 +3296,18 @@ static void send_file_readX(connection_struct *conn, struct smb_request *req,
                        DEBUG(0,("send_file_readX: sendfile failed for file %s (%s). Terminating\n",
                                fsp->fsp_name, strerror(errno) ));
                        exit_server_cleanly("send_file_readX sendfile failed");
+               } else if (nread == 0) {
+                       /*
+                        * Some sendfile implementations return 0 to indicate
+                        * that there was a short read, but nothing was
+                        * actually written to the socket.  In this case,
+                        * fallback to the normal read path so the header gets
+                        * the correct byte count.
+                        */
+                       DEBUG(3, ("send_file_readX: sendfile sent zero bytes "
+                                 "falling back to the normal read: %s\n",
+                                 fsp->fsp_name));
+                       goto normal_read;
                }
 
                DEBUG( 3, ( "send_file_readX: sendfile fnum=%d max=%d nread=%d\n",


-- 
Samba Shared Repository

Reply via email to