Re: [PATCH v1 2/2] convert.c: stream and early out

2016-10-10 Thread Junio C Hamano
tbo...@web.de writes:

> -static void gather_stats(const char *buf, unsigned long size, struct 
> text_stat *stats)
> +static void gather_stats_partly(const char *buf, unsigned long len,
> + struct text_stat *stats, unsigned earlyout)
>  {

I think it is OK not to rename the function (you'd be passing earlyout=0
for callers that want exact stat, right?).

>   unsigned long i;
>  
> - memset(stats, 0, sizeof(*stats));
> -
> - for (i = 0; i < size; i++) {
> + if (!buf || !len)
> + return;
> + for (i = 0; i < len; i++) {
>   unsigned char c = buf[i];
>   if (c == '\r') {
> - if (i+1 < size && buf[i+1] == '\n') {
> + stats->stat_bits |= CONVERT_STAT_BITS_ANY_CR;
> + if (i+1 < len && buf[i+1] == '\n') {
>   stats->crlf++;
>   i++;
> - } else
> + stats->stat_bits |= CONVERT_STAT_BITS_TXT_CRLF;
> + } else {
>   stats->lonecr++;
> + stats->stat_bits |= CONVERT_STAT_BITS_BIN;
> + }
>   continue;
>   }
>   if (c == '\n') {
>   stats->lonelf++;
> + stats->stat_bits |= CONVERT_STAT_BITS_TXT_LF;
>   continue;
>   }
>   if (c == 127)
> @@ -67,7 +74,7 @@ static void gather_stats(const char *buf, unsigned long 
> size, struct text_stat *
>   stats->printable++;
>   break;
>   case 0:
> - stats->nul++;
> + stats->stat_bits |= CONVERT_STAT_BITS_BIN;
>   /* fall through */
>   default:
>   stats->nonprintable++;


So depending on the distribution of the bytes in the file, the
bitfields in stats->stat_bits will be filled one bit at a time in
random order.

> @@ -75,10 +82,12 @@ static void gather_stats(const char *buf, unsigned long 
> size, struct text_stat *
>   }
>   else
>   stats->printable++;
> + if (stats->stat_bits & earlyout)
> + break; /* We found what we have been searching for */

But an "earlyout" says that if "any" of the earlyout bits is seen, we
can return.

It somehow felt a bit too limited to me in my initial reading, but I
guess I shouldn't be surprised to see that such a limited interface
is sufficient for a file-local helper function ;-).  

The only caller for which the semantics of this exit condition matter
is the one that wants to know "do we have NUL or CR anywhere?", so I
guess this should be sufficient.

>   }
>  
>   /* If file ends with EOF then don't count this EOF as non-printable. */
> - if (size >= 1 && buf[size-1] == '\032')
> + if (len >= 1 && buf[len-1] == '\032')
>   stats->nonprintable--;

This noise is somewhat irritating.  Was there a reason why size was
a bad name for the variable?

> +static const char *convert_stats_ascii(unsigned convert_stats)
>  {
> - unsigned int convert_stats = gather_convert_stats(data, size);
> -
> + const unsigned mask = CONVERT_STAT_BITS_TXT_LF |
> + CONVERT_STAT_BITS_TXT_CRLF;
>   if (convert_stats & CONVERT_STAT_BITS_BIN)
>   return "-text";
> - switch (convert_stats) {
> + switch (convert_stats & mask) {
>   case CONVERT_STAT_BITS_TXT_LF:
>   return "lf";
>   case CONVERT_STAT_BITS_TXT_CRLF:

Subtle.  The caller runs the stat collection with early-out set to
BITS_BIN, so that this can set "-text" early.  It knows that without
BITS_BIN, the stat was taken for the whole contents and the check for
lf or crlf can be reliable.

I wonder if we can/need to do something to remove this subtleness
out of this callchain, which could be a source of confusion.

> @@ -132,24 +162,45 @@ static const char *gather_convert_stats_ascii(const 
> char *data, unsigned long si
>   }
>  }
>  
> +static unsigned get_convert_stats_wt(const char *path)
> +{
> + struct text_stat stats;
> + unsigned earlyout = CONVERT_STAT_BITS_BIN;
> + int fd;
> + memset(&stats, 0, sizeof(stats));
> + fd = open(path, O_RDONLY);
> + if (fd < 0)
> + return 0;
> + for (;;) {
> + char buf[2*1024];

Where does this 2kB come from?  Out of thin air?



[PATCH v1 2/2] convert.c: stream and early out

2016-10-09 Thread tboegi
From: Torsten Bögershausen 

When statistics are gathered for the autocrlf handling, the search in
the content can be stopped early, e.g. when
- a search for binary is done, and a NUL character is found
- a search for CRLF is done, and the first CRLF is found.

Similarly, when statistics for binary vs non-binary are gathered:
Whenever a lone CR or NUL is found, the search can be aborted.

When checking out files in "auto" mode, any file that has a "lone CR"
or a CRLF will not be converted, so the search can be aborted early.

Add the new bit, CONVERT_STAT_BITS_ANY_CR,
which is set for either lone CR or CRLF.

Many binary files have a NUL very early (within the first few bytes,
at the latest within the first 1..2K).
It is often not necessary to load the whole content of a file or blob
into memory.

Use a streaming handling for blobs and files in the worktree.
---
 convert.c | 195 +-
 1 file changed, 130 insertions(+), 65 deletions(-)

diff --git a/convert.c b/convert.c
index 077f5e6..6a625e5 100644
--- a/convert.c
+++ b/convert.c
@@ -3,6 +3,7 @@
 #include "run-command.h"
 #include "quote.h"
 #include "sigchain.h"
+#include "streaming.h"
 
 /*
  * convert.c - convert a file when checking it out and checking it in.
@@ -13,10 +14,10 @@
  * translation when the "text" attribute or "auto_crlf" option is set.
  */
 
-/* Stat bits: When BIN is set, the txt bits are unset */
 #define CONVERT_STAT_BITS_TXT_LF0x1
 #define CONVERT_STAT_BITS_TXT_CRLF  0x2
 #define CONVERT_STAT_BITS_BIN   0x4
+#define CONVERT_STAT_BITS_ANY_CR0x8
 
 enum crlf_action {
CRLF_UNDEFINED,
@@ -31,30 +32,36 @@ enum crlf_action {
 
 struct text_stat {
/* NUL, CR, LF and CRLF counts */
-   unsigned nul, lonecr, lonelf, crlf;
+   unsigned stat_bits, lonecr, lonelf, crlf;
 
/* These are just approximations! */
unsigned printable, nonprintable;
 };
 
-static void gather_stats(const char *buf, unsigned long size, struct text_stat 
*stats)
+static void gather_stats_partly(const char *buf, unsigned long len,
+   struct text_stat *stats, unsigned earlyout)
 {
unsigned long i;
 
-   memset(stats, 0, sizeof(*stats));
-
-   for (i = 0; i < size; i++) {
+   if (!buf || !len)
+   return;
+   for (i = 0; i < len; i++) {
unsigned char c = buf[i];
if (c == '\r') {
-   if (i+1 < size && buf[i+1] == '\n') {
+   stats->stat_bits |= CONVERT_STAT_BITS_ANY_CR;
+   if (i+1 < len && buf[i+1] == '\n') {
stats->crlf++;
i++;
-   } else
+   stats->stat_bits |= CONVERT_STAT_BITS_TXT_CRLF;
+   } else {
stats->lonecr++;
+   stats->stat_bits |= CONVERT_STAT_BITS_BIN;
+   }
continue;
}
if (c == '\n') {
stats->lonelf++;
+   stats->stat_bits |= CONVERT_STAT_BITS_TXT_LF;
continue;
}
if (c == 127)
@@ -67,7 +74,7 @@ static void gather_stats(const char *buf, unsigned long size, 
struct text_stat *
stats->printable++;
break;
case 0:
-   stats->nul++;
+   stats->stat_bits |= CONVERT_STAT_BITS_BIN;
/* fall through */
default:
stats->nonprintable++;
@@ -75,10 +82,12 @@ static void gather_stats(const char *buf, unsigned long 
size, struct text_stat *
}
else
stats->printable++;
+   if (stats->stat_bits & earlyout)
+   break; /* We found what we have been searching for */
}
 
/* If file ends with EOF then don't count this EOF as non-printable. */
-   if (size >= 1 && buf[size-1] == '\032')
+   if (len >= 1 && buf[len-1] == '\032')
stats->nonprintable--;
 }
 
@@ -86,41 +95,62 @@ static void gather_stats(const char *buf, unsigned long 
size, struct text_stat *
  * The same heuristics as diff.c::mmfile_is_binary()
  * We treat files with bare CR as binary
  */
-static int convert_is_binary(unsigned long size, const struct text_stat *stats)
+static void convert_nonprintable(struct text_stat *stats)
 {
-   if (stats->lonecr)
-   return 1;
-   if (stats->nul)
-   return 1;
if ((stats->printable >> 7) < stats->nonprintable)
-   return 1;
-   return 0;
+   stats->stat_bits |= CONVERT_STAT_BITS_BIN;
 }
 
-static unsigned int gather_convert_stats(const char *data,