Hello,
On 01/07/2015 03:09 PM, Eric Blake wrote:
> On 01/07/2015 11:49 AM, Markus Elfring wrote:
>> Do I read the current documentation correctly in
>> the way that zero-terminated lines are not supported
>> so far?
>> http://www.gnu.org/software/coreutils/manual/html_node/split-invocation.html
> Correct. Patches welcome. In the meantime:
Attached is a quick attempt at splitting with a custom line separator ('-t=SEP')
and with NUL as the separator ('-z').
It's not complete, but could be improved if this is the right direction.
- assaf
>From d63d7708ba16494a2968490a686916a93b4a805e Mon Sep 17 00:00:00 2001
From: "A. Gordon" <[email protected]>
Date: Wed, 7 Jan 2015 18:30:28 -0500
Subject: [PATCH 1/3] split: replace hard-coded '\n' with a variable
---
src/split.c | 15 +++++++++------
1 file changed, 9 insertions(+), 6 deletions(-)
diff --git a/src/split.c b/src/split.c
index ef672f4..71fc9e2 100644
--- a/src/split.c
+++ b/src/split.c
@@ -108,6 +108,9 @@ static bool elide_empty_files;
input to output, which is much slower, so disabled by default. */
static bool unbuffered;
+/* The character marking end of line. Default to \n. */
+static char eolchar = '\n';
+
/* The split mode to use. */
enum Split_type
{
@@ -630,10 +633,10 @@ lines_split (uintmax_t n_lines, char *buf, size_t bufsize)
error (EXIT_FAILURE, errno, "%s", infile);
bp = bp_out = buf;
eob = bp + n_read;
- *eob = '\n';
+ *eob = eolchar;
while (true)
{
- bp = memchr (bp, '\n', eob - bp + 1);
+ bp = memchr (bp, eolchar, eob - bp + 1);
if (bp == eob)
{
if (eob != bp_out) /* do not write 0 bytes! */
@@ -692,10 +695,10 @@ line_bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize)
/* Have enough for split. */
split_rest = n_bytes - n_out - n_hold;
eoc = sob + split_rest - 1;
- eol = memrchr (sob, '\n', split_rest);
+ eol = memrchr (sob, eolchar, split_rest);
}
else
- eol = memrchr (sob, '\n', n_left);
+ eol = memrchr (sob, eolchar, n_left);
/* Output hold space if possible. */
if (n_hold && !(!eol && n_out))
@@ -833,7 +836,7 @@ lines_chunk_split (uintmax_t k, uintmax_t n, char *buf, size_t bufsize,
/* Begin looking for '\n' at last byte of chunk. */
off_t skip = MIN (n_read, MAX (0, chunk_end - n_written));
- char *bp_out = memchr (bp + skip, '\n', n_read - skip);
+ char *bp_out = memchr (bp + skip, eolchar, n_read - skip);
if (bp_out++)
next = true;
else
@@ -1080,7 +1083,7 @@ lines_rr (uintmax_t k, uintmax_t n, char *buf, size_t bufsize)
bool next = false;
/* Find end of line. */
- char *bp_out = memchr (bp, '\n', eob - bp);
+ char *bp_out = memchr (bp, eolchar, eob - bp);
if (bp_out)
{
bp_out++;
--
1.9.1
>From 5086fa3cec116086b8b6be895dc9a91d3e27dc59 Mon Sep 17 00:00:00 2001
From: "A. Gordon" <[email protected]>
Date: Wed, 7 Jan 2015 18:40:14 -0500
Subject: [PATCH 2/3] split: accept -t=SEP/-z options
---
src/split.c | 40 ++++++++++++++++++++++++++++++++++++++--
1 file changed, 38 insertions(+), 2 deletions(-)
diff --git a/src/split.c b/src/split.c
index 71fc9e2..cb63b03 100644
--- a/src/split.c
+++ b/src/split.c
@@ -109,7 +109,8 @@ static bool elide_empty_files;
static bool unbuffered;
/* The character marking end of line. Default to \n. */
-static char eolchar = '\n';
+enum { DEFAULT_EOL = '\n' };
+static char eolchar = DEFAULT_EOL;
/* The split mode to use. */
enum Split_type
@@ -142,6 +143,8 @@ static struct option const longopts[] =
{"numeric-suffixes", optional_argument, NULL, 'd'},
{"filter", required_argument, NULL, FILTER_OPTION},
{"verbose", no_argument, NULL, VERBOSE_OPTION},
+ {"line-separator", required_argument, NULL, 't'},
+ {"zero-terminated", no_argument, NULL, 'z'},
{"-io-blksize", required_argument, NULL,
IO_BLKSIZE_OPTION}, /* do not document */
{GETOPT_HELP_OPTION_DECL},
@@ -226,7 +229,9 @@ is -, read standard input.\n\
--filter=COMMAND write to shell COMMAND; file name is $FILE\n\
-l, --lines=NUMBER put NUMBER lines per output file\n\
-n, --number=CHUNKS generate CHUNKS output files; see explanation below\n\
+ -t, --line-separator=SEP use SEP instead of new-line as line separator\n\
-u, --unbuffered immediately copy input to output with '-n r/...'\n\
+ -z, --zero-terminated line delimiter is NUL, not newline\n\
"), DEFAULT_SUFFIX_LENGTH);
fputs (_("\
--verbose print a diagnostic just before each\n\
@@ -1227,7 +1232,7 @@ main (int argc, char **argv)
int this_optind = optind ? optind : 1;
char *slash;
- c = getopt_long (argc, argv, "0123456789C:a:b:del:n:u",
+ c = getopt_long (argc, argv, "0123456789C:a:b:del:n:t:uz",
longopts, NULL);
if (c == -1)
break;
@@ -1306,6 +1311,37 @@ main (int argc, char **argv)
unbuffered = true;
break;
+ case 't':
+ {
+ char neweol = optarg[0];
+ if (! neweol)
+ error (EXIT_FAILURE, 0, _("empty line-delimiter"));
+ if (optarg[1])
+ {
+ if (STREQ (optarg, "\\0"))
+ neweol = '\0';
+ else
+ {
+ /* Provoke with 'split -txx'. Complain about
+ "multi-character delimiter" instead of "multibyte delimiter",
+ so that the diagnostic's wording does not need to be
+ changed once multibyte characters are supported. */
+ error (EXIT_FAILURE, 0, _("multi-character delimiter %s"),
+ quote (optarg));
+ }
+ }
+ if (eolchar != DEFAULT_EOL && neweol != eolchar)
+ error (EXIT_FAILURE, 0, _("incompatible line-delimiters"));
+ eolchar = neweol;
+ }
+ break;
+
+ case 'z':
+ if (eolchar != DEFAULT_EOL && eolchar != '\0')
+ error (EXIT_FAILURE, 0, _("incompatible line-delimiters"));
+ eolchar = '\0';
+ break;
+
case '0':
case '1':
case '2':
--
1.9.1
>From 029fed81ed90bd0dcaf34295c4819792c01ab3c6 Mon Sep 17 00:00:00 2001
From: "A. Gordon" <[email protected]>
Date: Wed, 7 Jan 2015 19:21:15 -0500
Subject: [PATCH 3/3] tests: test split with custom line separators
---
tests/local.mk | 1 +
tests/split/lines-sep.sh | 74 ++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 75 insertions(+)
create mode 100644 tests/split/lines-sep.sh
diff --git a/tests/local.mk b/tests/local.mk
index 6fc8599..14dfaf3 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -355,6 +355,7 @@ all_tests = \
tests/split/b-chunk.sh \
tests/split/fail.sh \
tests/split/lines.sh \
+ tests/split/lines-sep.sh \
tests/split/line-bytes.sh \
tests/split/l-chunk.sh \
tests/split/r-chunk.sh \
diff --git a/tests/split/lines-sep.sh b/tests/split/lines-sep.sh
new file mode 100644
index 0000000..eb98b1d
--- /dev/null
+++ b/tests/split/lines-sep.sh
@@ -0,0 +1,74 @@
+#!/bin/sh
+# test split with custom line separators
+
+# Copyright (C) 2002-2015 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ split
+
+# Prepare input/expected-output files,
+# with newline, zero, colon line-separators.
+printf '1\n2\n3\n4\n5\n' > in1-nl || framework_failure_
+printf '1\n2\n' > exp1-nl || framework_failure_
+printf '3\n4\n' > exp2-nl || framework_failure_
+printf '5\n' > exp3-nl || framework_failure_
+
+printf '1\0002\0003\0004\0005\000' > in1-z || framework_failure_
+printf '1\0002\000' > exp1-z || framework_failure_
+printf '3\0004\000' > exp2-z || framework_failure_
+printf '5\000' > exp3-z || framework_failure_
+
+printf '1:2:3:4:5:' > in1-cln || framework_failure_
+printf '1:2:' > exp1-cln || framework_failure_
+printf '3:4:' > exp2-cln || framework_failure_
+printf '5:' > exp3-cln || framework_failure_
+
+
+run_split()
+{
+ # test number (should be unique, to avoid output file dups)
+ num=$1
+ # suffix of test files (nl/z/cln)
+ suf=$2
+ shift 2
+
+ split --lines=2 $@ in1-$suf x$num- > out-$suf || return 1
+
+ compare exp1-$suf x$num-aa || return 1
+ compare exp2-$suf x$num-ab || return 1
+ compare exp3-$suf x$num-ac || return 1
+ test -f x$num-ad && return 1
+
+ return 0
+}
+
+
+# Test newline, without '-t' option (the default)
+run_split 1 nl || { warn_ "test 1 failed" ; fail=1 ; }
+
+#FIXME: Test newline specified as custom line separator
+#run_split 2 nl -t$'\n' || { warn_ "test 2 failed" ; fail=1 ; }
+
+# Test null line-separator with '-z'
+run_split 3 z -z || { warn_ "test 3 failed" ; fail=1 ; }
+
+#FIXME: Test null line-separator with '-t'
+#run_split 4 z -t$'\0' || { warn_ "test 4 failed" ; fail=1 ; }
+
+# Test non-default line-separator with '-t'
+run_split 5 cln -t: || { warn_ "test 5 failed" ; fail=1 ; }
+
+Exit $fail
--
1.9.1