[email protected] writes:
> -static void gather_stats(const char *buf, unsigned long size, struct
> text_stat *stats)
> +static void gather_stats_partly(const char *buf, unsigned long len,
> + struct text_stat *stats, unsigned earlyout)
> {
I think it is OK not to rename the function (you'd be passing earlyout=0
for callers that want exact stat, right?).
> unsigned long i;
>
> - memset(stats, 0, sizeof(*stats));
> -
> - for (i = 0; i < size; i++) {
> + if (!buf || !len)
> + return;
> + for (i = 0; i < len; i++) {
> unsigned char c = buf[i];
> if (c == '\r') {
> - if (i+1 < size && buf[i+1] == '\n') {
> + stats->stat_bits |= CONVERT_STAT_BITS_ANY_CR;
> + if (i+1 < len && buf[i+1] == '\n') {
> stats->crlf++;
> i++;
> - } else
> + stats->stat_bits |= CONVERT_STAT_BITS_TXT_CRLF;
> + } else {
> stats->lonecr++;
> + stats->stat_bits |= CONVERT_STAT_BITS_BIN;
> + }
> continue;
> }
> if (c == '\n') {
> stats->lonelf++;
> + stats->stat_bits |= CONVERT_STAT_BITS_TXT_LF;
> continue;
> }
> if (c == 127)
> @@ -67,7 +74,7 @@ static void gather_stats(const char *buf, unsigned long
> size, struct text_stat *
> stats->printable++;
> break;
> case 0:
> - stats->nul++;
> + stats->stat_bits |= CONVERT_STAT_BITS_BIN;
> /* fall through */
> default:
> stats->nonprintable++;
So depending on the distribution of the bytes in the file, the
bitfields in stats->stat_bits will be filled one bit at a time in
random order.
> @@ -75,10 +82,12 @@ static void gather_stats(const char *buf, unsigned long
> size, struct text_stat *
> }
> else
> stats->printable++;
> + if (stats->stat_bits & earlyout)
> + break; /* We found what we have been searching for */
But an "earlyout" says that if "any" of the earlyout bit is seen, we
can return.
It somehow felt a bit too limited to me in my initial reading, but I
guess I shouldn't be surprised to see that such a limited interface
is sufficient for a file-local helper function ;-).
The only caller that the semantics of this exit condition matters is
the one that wants to know "do we have NUL or CR anywhere?", so I
guess this should be sufficient.
> }
>
> /* If file ends with EOF then don't count this EOF as non-printable. */
> - if (size >= 1 && buf[size-1] == '\032')
> + if (len >= 1 && buf[len-1] == '\032')
> stats->nonprintable--;
This noise is somewhat irritating. Was there a reason why size was
a bad name for the variable?
> +static const char *convert_stats_ascii(unsigned convert_stats)
> {
> - unsigned int convert_stats = gather_convert_stats(data, size);
> -
> + const unsigned mask = CONVERT_STAT_BITS_TXT_LF |
> + CONVERT_STAT_BITS_TXT_CRLF;
> if (convert_stats & CONVERT_STAT_BITS_BIN)
> return "-text";
> - switch (convert_stats) {
> + switch (convert_stats & mask) {
> case CONVERT_STAT_BITS_TXT_LF:
> return "lf";
> case CONVERT_STAT_BITS_TXT_CRLF:
Subtle. The caller runs the stat colllection with early-out set to
BITS_BIN, so that this can set "-text" early. It knows that without
BITS_BIN, the stat was taken for the whole contents and the check lf
or crlf can be reliable.
I wonder if we can/need to do something to remove this subtleness
out of this callchain, which could be a source of confusion.
> @@ -132,24 +162,45 @@ static const char *gather_convert_stats_ascii(const
> char *data, unsigned long si
> }
> }
>
> +static unsigned get_convert_stats_wt(const char *path)
> +{
> + struct text_stat stats;
> + unsigned earlyout = CONVERT_STAT_BITS_BIN;
> + int fd;
> + memset(&stats, 0, sizeof(stats));
> + fd = open(path, O_RDONLY);
> + if (fd < 0)
> + return 0;
> + for (;;) {
> + char buf[2*1024];
Where is this 2kB come from? Out of thin air?