Re: [hackers] [PATCH][sbase] tar: use bigger buffer size to increase performance

2017-09-11 Thread Hiltjo Posthuma
On Sun, Sep 10, 2017 at 06:58:27PM +0200, Silvan Jegen wrote:

> Hi Hiltjo
> 
> Thanks for the review!
> 
> On Sat, Sep 09, 2017 at 01:06:21PM +0200, Hiltjo Posthuma wrote:
> > On Sat, Sep 09, 2017 at 11:08:42AM +0200, Silvan Jegen wrote:
> > > From: Jim Beveridge 
> > > 
> > > The original code is by Jim Beveridge working on Fuchsia. I merged it
> > > with slight changes.
> > > 
> > 
> > To be clear: is it under the sbase LICENSE?
> 
> Yes, from what I can tell the License for this code is the same as for
> the rest of sbase.
> 
> > >  #define BLKSIZ 512
> > > +// COPY_CHUNK_SIZE must be a power of 2
> > > +#define COPY_CHUNK_SIZE 8192
> > >  
> > 
> > Instead of COPY_CHUNK_SIZE it might be worthwhile to query the pagesize, but
> > i've not tested it.
> 
> Yes, I will have a look.
> 

Hey,

A correction of myself: I meant blocksize not pagesize.

Something like this (simplified):

struct stat st;
char *buf;
size_t bufsiz;
int fd;

if (fstat(fd, &st) < 0)
bufsiz = BUFSIZ;
else
bufsiz = MAX(st.st_blksize, BUFSIZ);
buf = ecalloc(1, bufsiz);
for (...) {
n = read(fd, buf, bufsiz);
...
}

I've tested this on a few sbase tools like cat and it significantly increases
throughput. I'll work on a patch.

-- 
Kind regards,
Hiltjo



Re: [hackers] [PATCH][sbase] tar: use bigger buffer size to increase performance

2017-09-10 Thread Silvan Jegen
Hi

On Sat, Sep 09, 2017 at 10:29:21AM -0700, Michael Forney wrote:
> On Sat, Sep 9, 2017 at 2:08 AM, Silvan Jegen  wrote:
> > From: Jim Beveridge 
> >
> > The original code is by Jim Beveridge working on Fuchsia. I merged it
> > with slight changes.
> >
> > Time to tar two 1GB files:
> >
> > Before patch:
> >
> > real0m6.428s
> > user0m0.245s
> > sys 0m4.881s
> >
> > real0m6.454s
> > user0m0.239s
> > sys 0m4.883s
> >
> > real0m6.515s
> > user0m0.259s
> > sys 0m4.839s
> >
> > After patch:
> >
> > real0m4.755s
> > user0m0.026s
> > sys 0m1.598s
> >
> > real0m4.788s
> > user0m0.063s
> > sys 0m1.578s
> >
> > real0m4.822s
> > user0m0.007s
> > sys 0m1.662s
> >
> > A similar speedup can be observed for untaring.
> >
> > In addition to the buffer size increase we change the code to only create
> > directories for non-compliant tar files and we check for st to be NULL
> > in the recursive copy function.
> 
> He also sent me a pull request on my github branch for oasis:
> https://github.com/michaelforney/sbase/pull/2

Ah, I did not see that one.
 

> I think we should work on fixing correctness of tar before trying to
> optimize it. Currently it does not handle short reads or writes at all
> (when working with pipes). I was thinking we should add a readall in
> libutil analogous to writeall and then make use of that.

That sounds good to me.
 

> Regarding COPY_CHUNK_SIZE, it is probably a good idea to put that in
> util.h (perhaps with a different name). concat has the same problem
> with a small BUFSIZ (musl's is only 1024).

What do you think of Hiltjo's idea of querying for the page size with

long sz = sysconf(_SC_PAGESIZE);

or similar? Such code could still be put into util.h of course.
 

> > ---
> >  tar.c | 72 
> > +++
> >  1 file changed, 55 insertions(+), 17 deletions(-)
> >
> > diff --git a/tar.c b/tar.c
> > index 53a737c..8cd1abe 100644
> > --- a/tar.c
> > +++ b/tar.c
> > @@ -16,6 +16,8 @@
> >  #include "util.h"
> >
> >  #define BLKSIZ 512
> > +// COPY_CHUNK_SIZE must be a power of 2
> > +#define COPY_CHUNK_SIZE 8192
> >
> >  enum Type {
> > REG   = '0',
> > @@ -236,10 +238,13 @@ archive(const char *path)
> > ewrite(tarfd, b, BLKSIZ);
> >
> > if (fd != -1) {
> > -   while ((l = eread(fd, b, BLKSIZ)) > 0) {
> > -   if (l < BLKSIZ)
> > -   memset(b + l, 0, BLKSIZ - l);
> > -   ewrite(tarfd, b, BLKSIZ);
> > +   char chunk[COPY_CHUNK_SIZE];
> > +   while ((l = eread(fd, chunk, COPY_CHUNK_SIZE)) > 0) {
> > +   // Ceiling to BLKSIZ boundary
> > +   int ceilsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
> > +   if (l < ceilsize)
> > +   memset(chunk + l, 0, ceilsize - l);
> > +   ewrite(tarfd, chunk, ceilsize);
> > }
> > close(fd);
> > }
> > @@ -250,7 +255,7 @@ archive(const char *path)
> >  static int
> >  unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> >  {
> > -   char lname[101], *tmp, *p;
> > +   char lname[101], *p;
> > long mode, major, minor, type, mtime, uid, gid;
> > struct header *h = (struct header *)b;
> > int fd = -1;
> > @@ -261,9 +266,13 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> > if (remove(fname) < 0 && errno != ENOENT)
> > weprintf("remove %s:", fname);
> >
> > -   tmp = estrdup(fname);
> > -   mkdirp(dirname(tmp), 0777, 0777);
> > -   free(tmp);
> > +   // tar files normally create the directory chain. This is a fallback
> > +   // for noncompliant tar files.
> > +   if (h->type != DIRECTORY) {
> > +   char* tmp = estrdup(fname);
> > +   mkdirp(dirname(tmp), 0777, 0777);
> > +   free(tmp);
> > +   }
> 
> If you tar a file within another directory, you end up with just one
> entry. This check means that the parent directory won't be created
> when trying to extract this file. Other tar implementations are able
> to extract such an archive.
> 
> >
> > switch (h->type) {
> > case REG:
> > @@ -319,9 +328,25 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> > eprintf("strtol %s: invalid number\n", h->gid);
> >
> > if (fd != -1) {
> > -   for (; l > 0; l -= BLKSIZ)
> > -   if (eread(tarfd, b, BLKSIZ) > 0)
> > -   ewrite(fd, b, MIN(l, BLKSIZ));
> > +   // Ceiling to BLKSIZ boundary
> > +   int readsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
> > +   char chunk[COPY_CHUNK_SIZE];
> > +   int lastread = 0;
> > +
> > +   for (; readsize > 0; l -= lastread, readsize -= 

Re: [hackers] [PATCH][sbase] tar: use bigger buffer size to increase performance

2017-09-10 Thread Silvan Jegen
Hi Hiltjo

Thanks for the review!

On Sat, Sep 09, 2017 at 01:06:21PM +0200, Hiltjo Posthuma wrote:
> On Sat, Sep 09, 2017 at 11:08:42AM +0200, Silvan Jegen wrote:
> > From: Jim Beveridge 
> > 
> > The original code is by Jim Beveridge working on Fuchsia. I merged it
> > with slight changes.
> > 
> 
> To be clear: is it under the sbase LICENSE?

Yes, from what I can tell the License for this code is the same as for
the rest of sbase.

> >  #define BLKSIZ 512
> > +// COPY_CHUNK_SIZE must be a power of 2
> > +#define COPY_CHUNK_SIZE 8192
> >  
> 
> Instead of COPY_CHUNK_SIZE it might be worthwhile to query the pagesize, but
> i've not tested it.

Yes, I will have a look.

 
> >  enum Type {
> > REG   = '0',
> > @@ -236,10 +238,13 @@ archive(const char *path)
> > ewrite(tarfd, b, BLKSIZ);
> >  
> > if (fd != -1) {
> > -   while ((l = eread(fd, b, BLKSIZ)) > 0) {
> > -   if (l < BLKSIZ)
> > -   memset(b + l, 0, BLKSIZ - l);
> > -   ewrite(tarfd, b, BLKSIZ);
> > +   char chunk[COPY_CHUNK_SIZE];
> > +   while ((l = eread(fd, chunk, COPY_CHUNK_SIZE)) > 0) {
> > +   // Ceiling to BLKSIZ boundary
> > +   int ceilsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
> > +   if (l < ceilsize)
> > +   memset(chunk + l, 0, ceilsize - l);
> > +   ewrite(tarfd, chunk, ceilsize);
> > }
> > close(fd);
> > }
> > @@ -250,7 +255,7 @@ archive(const char *path)
> >  static int
> >  unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> >  {
> > -   char lname[101], *tmp, *p;
> > +   char lname[101], *p;
> > long mode, major, minor, type, mtime, uid, gid;
> > struct header *h = (struct header *)b;
> > int fd = -1;
> > @@ -261,9 +266,13 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> > if (remove(fname) < 0 && errno != ENOENT)
> > weprintf("remove %s:", fname);
> >  
> > -   tmp = estrdup(fname);
> > -   mkdirp(dirname(tmp), 0777, 0777);
> > -   free(tmp);
> > +   // tar files normally create the directory chain. This is a fallback
> > +   // for noncompliant tar files.
> > +   if (h->type != DIRECTORY) {
> > +   char* tmp = estrdup(fname);
> > +   mkdirp(dirname(tmp), 0777, 0777);
> > +   free(tmp);
> > +   }
> >  
> > switch (h->type) {
> > case REG:
> > @@ -319,9 +328,25 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> > eprintf("strtol %s: invalid number\n", h->gid);
> >  
> > if (fd != -1) {
> > -   for (; l > 0; l -= BLKSIZ)
> > -   if (eread(tarfd, b, BLKSIZ) > 0)
> > -   ewrite(fd, b, MIN(l, BLKSIZ));
> > +   // Ceiling to BLKSIZ boundary
> > +   int readsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
> > +   char chunk[COPY_CHUNK_SIZE];
> > +   int lastread = 0;
> > +
> > +   for (; readsize > 0; l -= lastread, readsize -= lastread) {
> > +   int chunk_size = MIN(readsize, COPY_CHUNK_SIZE);
> > +   // Short reads are legal, so don't expect to read
> > +   // everything that was requested.
> > +   lastread = eread(tarfd, chunk, chunk_size);
> > +   if (lastread == 0) {
> > +   close(fd);
> > +   remove(fname);
> 
> Do all the tar tools remove the file in this case? It might be better to not
> remove it.

You are right, this code should probably be removed if we use a
ReadAll-like function.


> > +   eprintf("unexpected end of file reading %s.\n",
> > +   fname);
> > +   }
> > +
> > +   ewrite(fd, chunk, MIN(l, lastread));
> > +   }
> > close(fd);
> > }
> >  
> > @@ -331,7 +356,7 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> > times[0].tv_sec = times[1].tv_sec = mtime;
> > times[0].tv_nsec = times[1].tv_nsec = 0;
> > if (!mflag && utimensat(AT_FDCWD, fname, times, AT_SYMLINK_NOFOLLOW) < 
> > 0)
> > -   weprintf("utimensat %s:\n", fname);
> > +   weprintf("utimensat %s %d:\n", fname, errno);
> > if (h->type == SYMLINK) {
> > if (!getuid() && lchown(fname, uid, gid))
> > weprintf("lchown %s:\n", fname);
> > @@ -349,10 +374,23 @@ static void
> >  skipblk(ssize_t l)
> >  {
> > char b[BLKSIZ];
> > -
> > -   for (; l > 0; l -= BLKSIZ)
> > -   if (!eread(tarfd, b, BLKSIZ))
> > -   break;
> > +   int lastread = 0;
> > +   // Ceiling to BLKSIZ boundary
> > +   int ceilsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
> > +
> > +   off_t offset = lseek(tarfd, ceilsize, SEEK_CUR);
> > +   if (offset >= ceilsize)
> > +   return;
> > +   if (errno != ESPIPE)
> > +   eprintf("unexpected end of file.\n");
> > +
> > +   // 

Re: [hackers] [PATCH][sbase] tar: use bigger buffer size to increase performance

2017-09-09 Thread Michael Forney
Make sure to CC Jim in replies; he is not subscribed to the list.

On Sat, Sep 9, 2017 at 1:31 PM, Jim Beveridge  wrote:
> Several comments to clear up some points about my tar code:

Thanks, Jim. I still have a few questions.

I also have to insist that these issues are tackled one-by-one rather
in one large commit.

> The CHUNK changes were not intended to optimize the general case, that was
> just a happy side effect. The changes were intended to address severe
> performance problems on platforms (like Fuchsia, at least at the moment)
> that don't do write-behind and/or read-ahead caching.

Yes, this needs to be addressed, however, I think we should do this
with a copy buffer size define in util.h, since libutil/concat.c
should use a larger buffer as well.

> My code supports short reads. I specifically had a test bed that was capable
> of reproducing short reads from a pipe and I spent quite a bit of time
> making that case work properly. Perhaps I'm reading too much into your
> comments, but I don't see the point of waiting on support for short write
> support to commit these changes supporting short reads.

Yes, but I think adding a utility function to deal with the case of
reading an entire amount (like plan9's readn or go's io.ReadFull)
would simplify this.

> Regarding your comment about, "If you tar a file within another directory,
> you end up with just one entry. This check means that the parent directory
> won't be created when trying to extract this file. Other tar implementations
> are able to extract such an archive." I completely agree, but I'm not sure
> what's driving your comment? My implementation supports this case.

Sorry, I should have tested and provided an example. You're right,
that case is supported by your patch. However, the following case is
not:

$ mkdir -p a/b
$ tar -cf a.tar a/b
$ rm -r a
$ tar -xf a.tar
tar: mkdir a/b: No such file or directory

> The
> previous implementation had issues because it never allowed the directory
> creation switch case to do its job, which caused loss of permissions on
> directory creation and unnecessary user-facing warnings in some cases.

Can you provide a concrete example of one of these cases? As far as I
understand it, if all the directories are present in the archive, they
will be created by the directory creation switch case with the
appropriate permissions. mkdirp should be harmless on a parent
directory that already exists since it never removes directories or
changes their mode.

Which warnings are you talking about? mkdirp should not warn if the
call to mkdir fails with EEXIST, and since we call it with the dirname
of fname, it should not be creating directories for the tar entry
itself.

I also wonder if this recent commit solves your problem, but even if
it does, I'm curious about the behavior you were observing:

https://git.suckless.org/sbase/commit/libutil/mkdirp.c?id=6ac5f01cc94b2a6f7b6406ddd151e7b4d8fb1d7d

> Finally, st should not be added into the if clause. That was a missing merge
> in my code. I fixed this in my pull request to be:
> if (r->depth && S_ISDIR(st->st_mode))

Okay, thanks for confirming.



Re: [hackers] [PATCH][sbase] tar: use bigger buffer size to increase performance

2017-09-09 Thread Michael Forney
On Sat, Sep 9, 2017 at 2:08 AM, Silvan Jegen  wrote:
> From: Jim Beveridge 
>
> The original code is by Jim Beveridge working on Fuchsia. I merged it
> with slight changes.
>
> Time to tar two 1GB files:
>
> Before patch:
>
> real0m6.428s
> user0m0.245s
> sys 0m4.881s
>
> real0m6.454s
> user0m0.239s
> sys 0m4.883s
>
> real0m6.515s
> user0m0.259s
> sys 0m4.839s
>
> After patch:
>
> real0m4.755s
> user0m0.026s
> sys 0m1.598s
>
> real0m4.788s
> user0m0.063s
> sys 0m1.578s
>
> real0m4.822s
> user0m0.007s
> sys 0m1.662s
>
> A similar speedup can be observed for untaring.
>
> In addition to the buffer size increase we change the code to only create
> directories for non-compliant tar files and we check for st to be NULL
> in the recursive copy function.

He also sent me a pull request on my github branch for oasis:
https://github.com/michaelforney/sbase/pull/2

I think we should work on fixing correctness of tar before trying to
optimize it. Currently it does not handle short reads or writes at all
(when working with pipes). I was thinking we should add a readall in
libutil analogous to writeall and then make use of that.

Regarding COPY_CHUNK_SIZE, it is probably a good idea to put that in
util.h (perhaps with a different name). concat has the same problem
with a small BUFSIZ (musl's is only 1024).

> ---
>  tar.c | 72 
> +++
>  1 file changed, 55 insertions(+), 17 deletions(-)
>
> diff --git a/tar.c b/tar.c
> index 53a737c..8cd1abe 100644
> --- a/tar.c
> +++ b/tar.c
> @@ -16,6 +16,8 @@
>  #include "util.h"
>
>  #define BLKSIZ 512
> +// COPY_CHUNK_SIZE must be a power of 2
> +#define COPY_CHUNK_SIZE 8192
>
>  enum Type {
> REG   = '0',
> @@ -236,10 +238,13 @@ archive(const char *path)
> ewrite(tarfd, b, BLKSIZ);
>
> if (fd != -1) {
> -   while ((l = eread(fd, b, BLKSIZ)) > 0) {
> -   if (l < BLKSIZ)
> -   memset(b + l, 0, BLKSIZ - l);
> -   ewrite(tarfd, b, BLKSIZ);
> +   char chunk[COPY_CHUNK_SIZE];
> +   while ((l = eread(fd, chunk, COPY_CHUNK_SIZE)) > 0) {
> +   // Ceiling to BLKSIZ boundary
> +   int ceilsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
> +   if (l < ceilsize)
> +   memset(chunk + l, 0, ceilsize - l);
> +   ewrite(tarfd, chunk, ceilsize);
> }
> close(fd);
> }
> @@ -250,7 +255,7 @@ archive(const char *path)
>  static int
>  unarchive(char *fname, ssize_t l, char b[BLKSIZ])
>  {
> -   char lname[101], *tmp, *p;
> +   char lname[101], *p;
> long mode, major, minor, type, mtime, uid, gid;
> struct header *h = (struct header *)b;
> int fd = -1;
> @@ -261,9 +266,13 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> if (remove(fname) < 0 && errno != ENOENT)
> weprintf("remove %s:", fname);
>
> -   tmp = estrdup(fname);
> -   mkdirp(dirname(tmp), 0777, 0777);
> -   free(tmp);
> +   // tar files normally create the directory chain. This is a fallback
> +   // for noncompliant tar files.
> +   if (h->type != DIRECTORY) {
> +   char* tmp = estrdup(fname);
> +   mkdirp(dirname(tmp), 0777, 0777);
> +   free(tmp);
> +   }

If you tar a file within another directory, you end up with just one
entry. This check means that the parent directory won't be created
when trying to extract this file. Other tar implementations are able
to extract such an archive.

>
> switch (h->type) {
> case REG:
> @@ -319,9 +328,25 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
> eprintf("strtol %s: invalid number\n", h->gid);
>
> if (fd != -1) {
> -   for (; l > 0; l -= BLKSIZ)
> -   if (eread(tarfd, b, BLKSIZ) > 0)
> -   ewrite(fd, b, MIN(l, BLKSIZ));
> +   // Ceiling to BLKSIZ boundary
> +   int readsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
> +   char chunk[COPY_CHUNK_SIZE];
> +   int lastread = 0;
> +
> +   for (; readsize > 0; l -= lastread, readsize -= lastread) {
> +   int chunk_size = MIN(readsize, COPY_CHUNK_SIZE);
> +   // Short reads are legal, so don't expect to read
> +   // everything that was requested.
> +   lastread = eread(tarfd, chunk, chunk_size);
> +   if (lastread == 0) {
> +   close(fd);
> +   remove(fname);
> +   eprintf("unexpected end of file reading 
> %s.\n",
> +   

Re: [hackers] [PATCH][sbase] tar: use bigger buffer size to increase performance

2017-09-09 Thread Hiltjo Posthuma
On Sat, Sep 09, 2017 at 11:08:42AM +0200, Silvan Jegen wrote:
> From: Jim Beveridge 
> 
> The original code is by Jim Beveridge working on Fuchsia. I merged it
> with slight changes.
> 

To be clear: is it under the sbase LICENSE?

> Time to tar two 1GB files:
> 
> Before patch:
> 
> real  0m6.428s
> user  0m0.245s
> sys   0m4.881s
> 
> real  0m6.454s
> user  0m0.239s
> sys   0m4.883s
> 
> real  0m6.515s
> user  0m0.259s
> sys   0m4.839s
> 
> After patch:
> 
> real  0m4.755s
> user  0m0.026s
> sys   0m1.598s
> 
> real  0m4.788s
> user  0m0.063s
> sys   0m1.578s
> 
> real  0m4.822s
> user  0m0.007s
> sys   0m1.662s
> 
> A similar speedup can be observed for untaring.
> 
> In addition to the buffer size increase we change the code to only create
> directories for non-compliant tar files and we check for st to be NULL
> in the recursive copy function.
> ---
>  tar.c | 72 
> +++
>  1 file changed, 55 insertions(+), 17 deletions(-)
> 
> diff --git a/tar.c b/tar.c
> index 53a737c..8cd1abe 100644
> --- a/tar.c
> +++ b/tar.c
> @@ -16,6 +16,8 @@
>  #include "util.h"
>  
>  #define BLKSIZ 512
> +// COPY_CHUNK_SIZE must be a power of 2
> +#define COPY_CHUNK_SIZE 8192
>  

Instead of COPY_CHUNK_SIZE it might be worthwhile to query the pagesize, but
i've not tested it.

>  enum Type {
>   REG   = '0',
> @@ -236,10 +238,13 @@ archive(const char *path)
>   ewrite(tarfd, b, BLKSIZ);
>  
>   if (fd != -1) {
> - while ((l = eread(fd, b, BLKSIZ)) > 0) {
> - if (l < BLKSIZ)
> - memset(b + l, 0, BLKSIZ - l);
> - ewrite(tarfd, b, BLKSIZ);
> + char chunk[COPY_CHUNK_SIZE];
> + while ((l = eread(fd, chunk, COPY_CHUNK_SIZE)) > 0) {
> + // Ceiling to BLKSIZ boundary
> + int ceilsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
> + if (l < ceilsize)
> + memset(chunk + l, 0, ceilsize - l);
> + ewrite(tarfd, chunk, ceilsize);
>   }
>   close(fd);
>   }
> @@ -250,7 +255,7 @@ archive(const char *path)
>  static int
>  unarchive(char *fname, ssize_t l, char b[BLKSIZ])
>  {
> - char lname[101], *tmp, *p;
> + char lname[101], *p;
>   long mode, major, minor, type, mtime, uid, gid;
>   struct header *h = (struct header *)b;
>   int fd = -1;
> @@ -261,9 +266,13 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
>   if (remove(fname) < 0 && errno != ENOENT)
>   weprintf("remove %s:", fname);
>  
> - tmp = estrdup(fname);
> - mkdirp(dirname(tmp), 0777, 0777);
> - free(tmp);
> + // tar files normally create the directory chain. This is a fallback
> + // for noncompliant tar files.
> + if (h->type != DIRECTORY) {
> + char* tmp = estrdup(fname);
> + mkdirp(dirname(tmp), 0777, 0777);
> + free(tmp);
> + }
>  
>   switch (h->type) {
>   case REG:
> @@ -319,9 +328,25 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
>   eprintf("strtol %s: invalid number\n", h->gid);
>  
>   if (fd != -1) {
> - for (; l > 0; l -= BLKSIZ)
> - if (eread(tarfd, b, BLKSIZ) > 0)
> - ewrite(fd, b, MIN(l, BLKSIZ));
> + // Ceiling to BLKSIZ boundary
> + int readsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
> + char chunk[COPY_CHUNK_SIZE];
> + int lastread = 0;
> +
> + for (; readsize > 0; l -= lastread, readsize -= lastread) {
> + int chunk_size = MIN(readsize, COPY_CHUNK_SIZE);
> + // Short reads are legal, so don't expect to read
> + // everything that was requested.
> + lastread = eread(tarfd, chunk, chunk_size);
> + if (lastread == 0) {
> + close(fd);
> + remove(fname);

Do all the tar tools remove the file in this case? It might be better to not
remove it.

> + eprintf("unexpected end of file reading %s.\n",
> + fname);
> + }
> +
> + ewrite(fd, chunk, MIN(l, lastread));
> + }
>   close(fd);
>   }
>  
> @@ -331,7 +356,7 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
>   times[0].tv_sec = times[1].tv_sec = mtime;
>   times[0].tv_nsec = times[1].tv_nsec = 0;
>   if (!mflag && utimensat(AT_FDCWD, fname, times, AT_SYMLINK_NOFOLLOW) < 
> 0)
> - weprintf("utimensat %s:\n", fname);
> + weprintf("utimensat %s %d:\n", fname, errno);
>   if (h->type == SYMLINK) {
>   if (!getuid() && lchown(fname, uid, gid))
>   weprintf("lchown %s:\n", fname);
> @@ -349,10 

[hackers] [PATCH][sbase] tar: use bigger buffer size to increase performance

2017-09-09 Thread Silvan Jegen
From: Jim Beveridge 

The original code is by Jim Beveridge working on Fuchsia. I merged it
with slight changes.

Time to tar two 1GB files:

Before patch:

real0m6.428s
user0m0.245s
sys 0m4.881s

real0m6.454s
user0m0.239s
sys 0m4.883s

real0m6.515s
user0m0.259s
sys 0m4.839s

After patch:

real0m4.755s
user0m0.026s
sys 0m1.598s

real0m4.788s
user0m0.063s
sys 0m1.578s

real0m4.822s
user0m0.007s
sys 0m1.662s

A similar speedup can be observed for untaring.

In addition to the buffer size increase we change the code to only create
directories for non-compliant tar files and we check for st to be NULL
in the recursive copy function.
---
 tar.c | 72 +++
 1 file changed, 55 insertions(+), 17 deletions(-)

diff --git a/tar.c b/tar.c
index 53a737c..8cd1abe 100644
--- a/tar.c
+++ b/tar.c
@@ -16,6 +16,8 @@
 #include "util.h"
 
 #define BLKSIZ 512
+// COPY_CHUNK_SIZE must be a power of 2
+#define COPY_CHUNK_SIZE 8192
 
 enum Type {
REG   = '0',
@@ -236,10 +238,13 @@ archive(const char *path)
ewrite(tarfd, b, BLKSIZ);
 
if (fd != -1) {
-   while ((l = eread(fd, b, BLKSIZ)) > 0) {
-   if (l < BLKSIZ)
-   memset(b + l, 0, BLKSIZ - l);
-   ewrite(tarfd, b, BLKSIZ);
+   char chunk[COPY_CHUNK_SIZE];
+   while ((l = eread(fd, chunk, COPY_CHUNK_SIZE)) > 0) {
+   // Ceiling to BLKSIZ boundary
+   int ceilsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
+   if (l < ceilsize)
+   memset(chunk + l, 0, ceilsize - l);
+   ewrite(tarfd, chunk, ceilsize);
}
close(fd);
}
@@ -250,7 +255,7 @@ archive(const char *path)
 static int
 unarchive(char *fname, ssize_t l, char b[BLKSIZ])
 {
-   char lname[101], *tmp, *p;
+   char lname[101], *p;
long mode, major, minor, type, mtime, uid, gid;
struct header *h = (struct header *)b;
int fd = -1;
@@ -261,9 +266,13 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
if (remove(fname) < 0 && errno != ENOENT)
weprintf("remove %s:", fname);
 
-   tmp = estrdup(fname);
-   mkdirp(dirname(tmp), 0777, 0777);
-   free(tmp);
+   // tar files normally create the directory chain. This is a fallback
+   // for noncompliant tar files.
+   if (h->type != DIRECTORY) {
+   char* tmp = estrdup(fname);
+   mkdirp(dirname(tmp), 0777, 0777);
+   free(tmp);
+   }
 
switch (h->type) {
case REG:
@@ -319,9 +328,25 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
eprintf("strtol %s: invalid number\n", h->gid);
 
if (fd != -1) {
-   for (; l > 0; l -= BLKSIZ)
-   if (eread(tarfd, b, BLKSIZ) > 0)
-   ewrite(fd, b, MIN(l, BLKSIZ));
+   // Ceiling to BLKSIZ boundary
+   int readsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
+   char chunk[COPY_CHUNK_SIZE];
+   int lastread = 0;
+
+   for (; readsize > 0; l -= lastread, readsize -= lastread) {
+   int chunk_size = MIN(readsize, COPY_CHUNK_SIZE);
+   // Short reads are legal, so don't expect to read
+   // everything that was requested.
+   lastread = eread(tarfd, chunk, chunk_size);
+   if (lastread == 0) {
+   close(fd);
+   remove(fname);
+   eprintf("unexpected end of file reading %s.\n",
+   fname);
+   }
+
+   ewrite(fd, chunk, MIN(l, lastread));
+   }
close(fd);
}
 
@@ -331,7 +356,7 @@ unarchive(char *fname, ssize_t l, char b[BLKSIZ])
times[0].tv_sec = times[1].tv_sec = mtime;
times[0].tv_nsec = times[1].tv_nsec = 0;
if (!mflag && utimensat(AT_FDCWD, fname, times, AT_SYMLINK_NOFOLLOW) < 
0)
-   weprintf("utimensat %s:\n", fname);
+   weprintf("utimensat %s %d:\n", fname, errno);
if (h->type == SYMLINK) {
if (!getuid() && lchown(fname, uid, gid))
weprintf("lchown %s:\n", fname);
@@ -349,10 +374,23 @@ static void
 skipblk(ssize_t l)
 {
char b[BLKSIZ];
-
-   for (; l > 0; l -= BLKSIZ)
-   if (!eread(tarfd, b, BLKSIZ))
-   break;
+   int lastread = 0;
+   // Ceiling to BLKSIZ boundary
+   int ceilsize = (l + (BLKSIZ-1)) & ~(BLKSIZ-1);
+
+   off_t offset = lseek(tarfd, ceilsize, SEEK_CUR);
+   if (offset >= ceilsize)
+