This patch moves some logic from 'fold' into a local Gnulib module that
can be used by other programs.
The module is used in a similar manner to getc (). Here is a trivial
example of its use:
#include "mbbuf.h"
mcel_t g;
while ((g = mbbuf_next_char (istream)).ch != MBBUF_EOF)
fwrite (mbbuf_prev_char (g), 1, g.len, stdout);
This module should make it pretty easy to port programs that use getc ()
and putc (). And it will make sure that invalid multi-byte characters
are treated consistently across programs.
Performance is the same as coreutils-9.8 since the logic isn't really
changed.
Thoughts?
Collin
>From 7c67faa24ee4ee1738669c065fc08c28d94527f4 Mon Sep 17 00:00:00 2001
Message-ID: <7c67faa24ee4ee1738669c065fc08c28d94527f4.1759116939.git.collin.fu...@gmail.com>
From: Collin Funk <[email protected]>
Date: Sun, 28 Sep 2025 20:16:26 -0700
Subject: [PATCH] fold: move multi-byte character reading to a module
* gl/modules/mbbuf: New file.
* gl/lib/mbbuf.c: Likewise.
* gl/lib/mbbuf.h: Likewise.
* bootstrap.conf (gnulib_modules): Add mbbuf.
* src/ioblksize.h: Add include guards.
* src/fold.c: Include mbbuf.h.
(fold_file): Use the mbbuf functions instead of calling fread and
handling the input buffer ourselves.
---
bootstrap.conf | 1 +
gl/lib/mbbuf.c | 24 ++++++++
gl/lib/mbbuf.h | 102 +++++++++++++++++++++++++++++++
gl/modules/mbbuf | 23 +++++++
src/fold.c | 154 +++++++++++++++++++----------------------------
src/ioblksize.h | 5 ++
6 files changed, 216 insertions(+), 93 deletions(-)
create mode 100644 gl/lib/mbbuf.c
create mode 100644 gl/lib/mbbuf.h
create mode 100644 gl/modules/mbbuf
diff --git a/bootstrap.conf b/bootstrap.conf
index adf09910d..f470aa48b 100644
--- a/bootstrap.conf
+++ b/bootstrap.conf
@@ -169,6 +169,7 @@ gnulib_modules="
maintainer-makefile
malloc-gnu
manywarnings
+ mbbuf
mbrlen
mbrtoc32
mbrtowc
diff --git a/gl/lib/mbbuf.c b/gl/lib/mbbuf.c
new file mode 100644
index 000000000..36f680221
--- /dev/null
+++ b/gl/lib/mbbuf.c
@@ -0,0 +1,24 @@
+/* Buffering for multi-byte characters.
+ Copyright 2025 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+#include <config.h>
+
+#define MBBUF_INLINE _GL_EXTERN_INLINE
+#include "mbbuf.h"
+
+char mbbuf[IO_BUFSIZE];  /* Input buffer.  */
+idx_t mbbuf_offset = 0;  /* The current position in MBBUF.  */
+idx_t mbbuf_length = 0;  /* The number of bytes in MBBUF.  */
diff --git a/gl/lib/mbbuf.h b/gl/lib/mbbuf.h
new file mode 100644
index 000000000..757f17435
--- /dev/null
+++ b/gl/lib/mbbuf.h
@@ -0,0 +1,102 @@
+/* Buffering for multi-byte characters.
+ Copyright 2025 Free Software Foundation, Inc.
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>. */
+
+#ifndef _MBBUF_H
+#define _MBBUF_H 1
+
+#ifndef _GL_INLINE_HEADER_BEGIN
+# error "Please include config.h first."
+#endif
+
+#include <stdio.h>
+#include <sys/stat.h>
+#include <stdlib.h>
+#include <stddef.h>
+
+#include "mcel.h"
+#include "idx.h"
+#include "ioblksize.h"
+
+_GL_INLINE_HEADER_BEGIN
+#ifndef MBBUF_INLINE
+# define MBBUF_INLINE _GL_INLINE
+#endif
+
+/* End of file. */
+#define MBBUF_EOF UINT32_MAX
+
+/* MBBUF_EOF should not be a valid character. */
+static_assert (MCEL_CHAR_MAX < MBBUF_EOF);
+
+/* Input buffer. */
+extern char mbbuf[IO_BUFSIZE];
+
+/* The current position in MBBUF. */
+extern idx_t mbbuf_offset;
+
+/* The number of bytes in MBBUF. */
+extern idx_t mbbuf_length;
+
+/* Get the next character in the buffer, filling it from FP if necessary.
+   Return a character whose ch is MBBUF_EOF once the input is exhausted.  For
+   an invalid sequence, return its first byte with len 1 and err still set.  */
+MBBUF_INLINE mcel_t
+mbbuf_next_char (FILE *fp)
+{
+  idx_t available = mbbuf_length - mbbuf_offset;
+  /* Refill when a whole character may not be buffered, keeping unread bytes.  */
+  if (available < MCEL_LEN_MAX && !feof (fp))
+    {
+      idx_t mbbuf_start;
+      if (!(0 < available))
+        mbbuf_start = 0;
+      else
+        {
+          memmove (mbbuf, mbbuf + mbbuf_offset, available);
+          mbbuf_start = available;
+        }
+      mbbuf_length = fread (mbbuf + mbbuf_start, 1,
+                            sizeof mbbuf - mbbuf_start, fp) + mbbuf_start;
+      mbbuf_offset = 0;
+      available = mbbuf_length - mbbuf_offset;
+    }
+  if (available <= 0)
+    return (mcel_t) { .ch = MBBUF_EOF };
+  mcel_t g = mcel_scan (mbbuf + mbbuf_offset, mbbuf + mbbuf_length);
+  if (! g.err)
+    mbbuf_offset += g.len;
+  else
+    {
+      /* Return the byte as unsigned char so 0xFF cannot equal MBBUF_EOF.  */
+      g.ch = (unsigned char) mbbuf[mbbuf_offset++];
+      g.len = 1;
+    }
+  return g;
+}
+
+/* Return a pointer into MBBUF to the first byte of G, which must be the
+   character most recently returned by mbbuf_next_char.  */
+MBBUF_INLINE char *
+mbbuf_prev_char (mcel_t g)
+{
+  if (! (g.len <= mbbuf_offset))
+    unreachable ();
+  return &mbbuf[mbbuf_offset - g.len];
+}
+
+_GL_INLINE_HEADER_END
+
+#endif
diff --git a/gl/modules/mbbuf b/gl/modules/mbbuf
new file mode 100644
index 000000000..9b5719b25
--- /dev/null
+++ b/gl/modules/mbbuf
@@ -0,0 +1,23 @@
+Description:
+Buffering for multi-byte characters.
+
+Files:
+lib/mbbuf.c
+lib/mbbuf.h
+
+Depends-on:
+extern-inline
+
+configure.ac:
+
+Makefile.am:
+lib_SOURCES += mbbuf.c mbbuf.h
+
+Include:
+"mbbuf.h"
+
+License:
+GPL
+
+Maintainer:
+all
diff --git a/src/fold.c b/src/fold.c
index e90837143..0605b9ae7 100644
--- a/src/fold.c
+++ b/src/fold.c
@@ -27,6 +27,7 @@
#include "fadvise.h"
#include "ioblksize.h"
#include "mcel.h"
+#include "mbbuf.h"
#include "xdectoint.h"
#define TAB_WIDTH 8
@@ -152,9 +153,6 @@ fold_file (char const *filename, size_t width)
size_t column = 0; /* Screen column where next char will go. */
idx_t offset_out = 0; /* Index in 'line_out' for next char. */
static char line_out[IO_BUFSIZE];
- static char line_in[IO_BUFSIZE];
- static size_t offset_in = 0;
- static size_t length_in = 0;
int saved_errno;
if (streq (filename, "-"))
@@ -173,115 +171,85 @@ fold_file (char const *filename, size_t width)
fadvise (istream, FADVISE_SEQUENTIAL);
- while (0 < (length_in = fread (line_in + offset_in, 1,
- sizeof line_in - offset_in, istream))
- || 0 < offset_in)
+ mcel_t g;
+ while ((g = mbbuf_next_char (istream)).ch != MBBUF_EOF)
{
- char *p = line_in;
- char *lim = p + length_in + offset_in;
- mcel_t g;
- for (; p < lim; p += g.len)
+ if (g.ch == '\n')
{
- g = mcel_scan (p, lim);
- if (g.err)
- {
- /* Replace the character with the byte if it cannot be a
- truncated multibyte sequence. */
- if (!(lim - p <= MCEL_LEN_MAX) || length_in == 0)
- g.ch = p[0];
- else
- {
- /* It may be a truncated multibyte sequence. Move it to the
- front of the input buffer. */
- memmove (line_in, p, lim - p);
- offset_in = lim - p;
- goto next_line;
- }
- }
- if (g.ch == '\n')
- {
- write_out (line_out, offset_out, /*newline=*/ true);
- column = offset_out = 0;
- continue;
- }
- rescan:
- column = adjust_column (column, g);
+ write_out (line_out, offset_out, /*newline=*/ true);
+ column = offset_out = 0;
+ continue;
+ }
+ rescan:
+ column = adjust_column (column, g);
- if (column > width)
+ if (column > width)
+ {
+ /* This character would make the line too long.
+ Print the line plus a newline, and make this character
+ start the next line. */
+ if (break_spaces)
{
- /* This character would make the line too long.
- Print the line plus a newline, and make this character
- start the next line. */
- if (break_spaces)
- {
- int space_length = 0;
- idx_t logical_end = offset_out;
- char *logical_p = line_out;
- char *logical_lim = logical_p + logical_end;
+ int space_length = 0;
+ idx_t logical_end = offset_out;
+ char *logical_p = line_out;
+ char *logical_lim = logical_p + logical_end;
- for (mcel_t g2; logical_p < logical_lim; logical_p += g2.len)
- {
- g2 = mcel_scan (logical_p, logical_lim);
- if (c32isblank (g2.ch) && ! c32isnbspace (g2.ch))
- {
- space_length = g2.len;
- logical_end = logical_p - line_out;
- }
- }
-
- if (space_length)
+ for (mcel_t g2; logical_p < logical_lim; logical_p += g2.len)
+ {
+ g2 = mcel_scan (logical_p, logical_lim);
+ if (c32isblank (g2.ch) && ! c32isnbspace (g2.ch))
{
- logical_end += space_length;
- /* Found a blank. Don't output the part after it. */
- write_out (line_out, logical_end, /*newline=*/ true);
- /* Move the remainder to the beginning of the next line.
- The areas being copied here might overlap. */
- memmove (line_out, line_out + logical_end,
- offset_out - logical_end);
- offset_out -= logical_end;
- column = 0;
- char *printed_p = line_out;
- char *printed_lim = printed_p + offset_out;
- for (mcel_t g2; printed_p < printed_lim;
- printed_p += g2.len)
- {
- g2 = mcel_scan (printed_p, printed_lim);
- column = adjust_column (column, g2);
- }
- goto rescan;
+ space_length = g2.len;
+ logical_end = logical_p - line_out;
}
}
- if (offset_out == 0)
+ if (space_length)
{
- memcpy (line_out, p, g.len);
- offset_out += g.len;
- continue;
+ logical_end += space_length;
+ /* Found a blank. Don't output the part after it. */
+ write_out (line_out, logical_end, /*newline=*/ true);
+ /* Move the remainder to the beginning of the next line.
+ The areas being copied here might overlap. */
+ memmove (line_out, line_out + logical_end,
+ offset_out - logical_end);
+ offset_out -= logical_end;
+ column = 0;
+ char *printed_p = line_out;
+ char *printed_lim = printed_p + offset_out;
+ for (mcel_t g2; printed_p < printed_lim;
+ printed_p += g2.len)
+ {
+ g2 = mcel_scan (printed_p, printed_lim);
+ column = adjust_column (column, g2);
+ }
+ goto rescan;
}
-
- write_out (line_out, offset_out, /*newline=*/ true);
- column = offset_out = 0;
- goto rescan;
}
- /* This can occur if we have read characters with a width of
- zero. */
- if (sizeof line_out <= offset_out + g.len)
+ if (offset_out == 0)
{
- write_out (line_out, offset_out, /*newline=*/ false);
- offset_out = 0;
+ memcpy (line_out, mbbuf_prev_char (g), g.len);
+ offset_out += g.len;
+ continue;
}
- memcpy (line_out + offset_out, p, g.len);
- offset_out += g.len;
+ write_out (line_out, offset_out, /*newline=*/ true);
+ column = offset_out = 0;
+ goto rescan;
}
- if (feof (istream))
- break;
- /* We read a full buffer of complete characters. */
- offset_in = 0;
+ /* This can occur if we have read characters with a width of
+ zero. */
+ if (sizeof line_out <= offset_out + g.len)
+ {
+ write_out (line_out, offset_out, /*newline=*/ false);
+ offset_out = 0;
+ }
- next_line:;
+ memcpy (line_out + offset_out, mbbuf_prev_char (g), g.len);
+ offset_out += g.len;
}
saved_errno = errno;
diff --git a/src/ioblksize.h b/src/ioblksize.h
index 133ca122b..4de266aaf 100644
--- a/src/ioblksize.h
+++ b/src/ioblksize.h
@@ -16,6 +16,9 @@
/* Include this file _after_ system headers if possible. */
+#ifndef _IOBLKSIZE_H
+#define _IOBLKSIZE_H 1
+
/* sys/stat.h and minmax.h will already have been included by system.h. */
#include <stdbit.h>
#include "stat-size.h"
@@ -109,3 +112,5 @@ io_blksize (struct stat const *st)
as that is asking for trouble. */
return MIN (SYS_BUFSIZE_MAX, blocksize);
}
+
+#endif
--
2.51.0