Steffen Prohaska <proha...@zib.de> writes:

> The data is streamed to the filter process anyway.  Better avoid mapping
> the file if possible.  This is especially useful if a clean filter
> reduces the size, for example if it computes a sha1 for binary data,
> like git media.  The file size that the previous implementation could
> handle was limited by the available address space; large files for
> example could not be handled with (32-bit) msysgit.  The new
> implementation can filter files of any size as long as the filter output
> is small enough.
>
> The new code path is only taken if the filter is required.  The filter
> consumes data directly from the fd.  The original data is not available
> to git, so it must fail if the filter fails.

Yay ;-)

>
> The test that exercises required filters is modified to verify that the
> data actually has been modified on its way from the file system to the
> object store.
>
> The expectation on the process size is tested using /usr/bin/time.  An
> alternative would have been tcsh, which could be used to print memory
> information as follows:
>
>     tcsh -c 'set time=(0 "%M"); <cmd>'
>
> Although the logic could perhaps be simplified with tcsh, I chose to use
> 'time' to avoid a dependency on tcsh.
>
> Signed-off-by: Steffen Prohaska <proha...@zib.de>
> ---
>  convert.c             | 58 ++++++++++++++++++++++++++++++++++++++-----
>  convert.h             | 10 +++++---
>  sha1_file.c           | 29 ++++++++++++++++++++--
>  t/t0021-conversion.sh | 68 
> +++++++++++++++++++++++++++++++++++++++++++++++----
>  4 files changed, 149 insertions(+), 16 deletions(-)
>
> diff --git a/convert.c b/convert.c
> index cb5fbb4..58a516a 100644
> --- a/convert.c
> +++ b/convert.c
> @@ -312,11 +312,12 @@ static int crlf_to_worktree(const char *path, const 
> char *src, size_t len,
>  struct filter_params {
>       const char *src;
>       unsigned long size;
> +     int fd;
>       const char *cmd;
>       const char *path;
>  };
>  
> -static int filter_buffer(int in, int out, void *data)
> +static int filter_buffer_or_fd(int in, int out, void *data)
>  {
>       /*
>        * Spawn cmd and feed the buffer contents through its stdin.
> @@ -325,6 +326,7 @@ static int filter_buffer(int in, int out, void *data)
>       struct filter_params *params = (struct filter_params *)data;
>       int write_err, status;
>       const char *argv[] = { NULL, NULL };
> +     int fd;
>  
>       /* apply % substitution to cmd */
>       struct strbuf cmd = STRBUF_INIT;
> @@ -355,7 +357,17 @@ static int filter_buffer(int in, int out, void *data)
>  
>       sigchain_push(SIGPIPE, SIG_IGN);
>  
> -     write_err = (write_in_full(child_process.in, params->src, params->size) 
> < 0);
> +     if (params->src) {
> +         write_err = (write_in_full(child_process.in, params->src, 
> params->size) < 0);
> +     } else {
> +         /* dup(), because copy_fd() closes the input fd. */
> +         fd = dup(params->fd);
> +         if (fd < 0)
> +             write_err = error("failed to dup file descriptor.");
> +         else
> +             write_err = copy_fd(fd, child_process.in);
> +     }
> +
>       if (close(child_process.in))
>               write_err = 1;
>       if (write_err)
> @@ -371,7 +383,7 @@ static int filter_buffer(int in, int out, void *data)
>       return (write_err || status);
>  }
>  
> -static int apply_filter(const char *path, const char *src, size_t len,
> +static int apply_filter(const char *path, const char *src, size_t len, int 
> fd,
>                          struct strbuf *dst, const char *cmd)
>  {
>       /*
> @@ -392,11 +404,12 @@ static int apply_filter(const char *path, const char 
> *src, size_t len,
>               return 1;
>  
>       memset(&async, 0, sizeof(async));
> -     async.proc = filter_buffer;
> +     async.proc = filter_buffer_or_fd;
>       async.data = &params;
>       async.out = -1;
>       params.src = src;
>       params.size = len;
> +     params.fd = fd;
>       params.cmd = cmd;
>       params.path = path;
>  
> @@ -747,6 +760,22 @@ static void convert_attrs(struct conv_attrs *ca, const 
> char *path)
>       }
>  }
>  
> +int would_convert_to_git_filter_fd(const char *path) {

Style; opening brace on its own line:

        int would_convert_to_git_filter_fd(const char *path)
        {

> +     struct conv_attrs ca;
> +     convert_attrs(&ca, path);
> +
> +     if (!ca.drv)

As we do not allow decl-after-stmt, it is easier to read if the
first blank line in the function was between the local variable
definition and the first statement.  I.e.

        int would_convert_to_git_filter_fd(const char *path)
        {
                struct ...;

                convert_attrs(...);
                if (!ca.drv)


> +         return 0;
> +
> +     /* Apply a filter to an fd only if the filter is required to succeed.
> +      * We must die if the filter fails, because the original data before
> +      * filtering is not available. */

 /*
  * multi-line
  * comment style.
  */

> +     if (!ca.drv->required)
> +         return 0;
> +
> +     return apply_filter(path, 0, 0, -1, 0, ca.drv->clean);

What's the significance of "-1" here?  Does somebody in the
callchain from apply_filter() check if fd < 0 and act differently
(not a complaint nor rhetoric question)?

Spell out the first "0" as "NULL" if you meant src == NULL
(similarly for dst).

> diff --git a/convert.h b/convert.h
> index 0c2143c..d9d853c 100644
> --- a/convert.h
> +++ b/convert.h
> @@ -40,11 +40,15 @@ extern int convert_to_working_tree(const char *path, 
> const char *src,
>                                  size_t len, struct strbuf *dst);
>  extern int renormalize_buffer(const char *path, const char *src, size_t len,
>                             struct strbuf *dst);
> -static inline int would_convert_to_git(const char *path, const char *src,
> -                                    size_t len, enum safe_crlf checksafe)
> +static inline int would_convert_to_git(const char *path)
>  {
> -     return convert_to_git(path, src, len, NULL, checksafe);
> +     return convert_to_git(path, NULL, 0, NULL, 0);
>  }

Is this change because there was only one caller that passed nothing
meaningful but "path"?  It would have been nicer to see such a change
as a clean-up _before_ the real patch, if that was the case.
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to