Hello,

On 01/07/2015 03:09 PM, Eric Blake wrote:
> On 01/07/2015 11:49 AM, Markus Elfring wrote:
>> Do I read the current documentation correctly in
>> the way that zero-terminated lines are not supported
>> so far?
>> http://www.gnu.org/software/coreutils/manual/html_node/split-invocation.html
>
> Correct.  Patches welcome.  In the meantime:

Attached is a quick attempt at splitting with a custom line separator
('-t=SEP') and NUL ('-z').

It's not complete, but could be improved if this is the right direction.

- assaf

From d63d7708ba16494a2968490a686916a93b4a805e Mon Sep 17 00:00:00 2001
From: "A. Gordon" <[email protected]>
Date: Wed, 7 Jan 2015 18:30:28 -0500
Subject: [PATCH 1/3] split: replace hard-coded '\n' with a variable

---
 src/split.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/split.c b/src/split.c
index ef672f4..71fc9e2 100644
--- a/src/split.c
+++ b/src/split.c
@@ -108,6 +108,9 @@ static bool elide_empty_files;
    input to output, which is much slower, so disabled by default.  */
 static bool unbuffered;
 
+/* The character marking end of line. Default to \n. */
+static char eolchar = '\n';
+
 /* The split mode to use.  */
 enum Split_type
 {
@@ -630,10 +633,10 @@ lines_split (uintmax_t n_lines, char *buf, size_t bufsize)
         error (EXIT_FAILURE, errno, "%s", infile);
       bp = bp_out = buf;
       eob = bp + n_read;
-      *eob = '\n';
+      *eob = eolchar;
       while (true)
         {
-          bp = memchr (bp, '\n', eob - bp + 1);
+          bp = memchr (bp, eolchar, eob - bp + 1);
           if (bp == eob)
             {
               if (eob != bp_out) /* do not write 0 bytes! */
@@ -692,10 +695,10 @@ line_bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize)
               /* Have enough for split.  */
               split_rest = n_bytes - n_out - n_hold;
               eoc = sob + split_rest - 1;
-              eol = memrchr (sob, '\n', split_rest);
+              eol = memrchr (sob, eolchar, split_rest);
             }
           else
-            eol = memrchr (sob, '\n', n_left);
+            eol = memrchr (sob, eolchar, n_left);
 
           /* Output hold space if possible.  */
           if (n_hold && !(!eol && n_out))
@@ -833,7 +836,7 @@ lines_chunk_split (uintmax_t k, uintmax_t n, char *buf, size_t bufsize,
 
           /* Begin looking for '\n' at last byte of chunk.  */
           off_t skip = MIN (n_read, MAX (0, chunk_end - n_written));
-          char *bp_out = memchr (bp + skip, '\n', n_read - skip);
+          char *bp_out = memchr (bp + skip, eolchar, n_read - skip);
           if (bp_out++)
             next = true;
           else
@@ -1080,7 +1083,7 @@ lines_rr (uintmax_t k, uintmax_t n, char *buf, size_t bufsize)
           bool next = false;
 
           /* Find end of line. */
-          char *bp_out = memchr (bp, '\n', eob - bp);
+          char *bp_out = memchr (bp, eolchar, eob - bp);
           if (bp_out)
             {
               bp_out++;
-- 
1.9.1


From 5086fa3cec116086b8b6be895dc9a91d3e27dc59 Mon Sep 17 00:00:00 2001
From: "A. Gordon" <[email protected]>
Date: Wed, 7 Jan 2015 18:40:14 -0500
Subject: [PATCH 2/3] split: accept -t=SEP/-z options

---
 src/split.c | 40 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)

diff --git a/src/split.c b/src/split.c
index 71fc9e2..cb63b03 100644
--- a/src/split.c
+++ b/src/split.c
@@ -109,7 +109,8 @@ static bool elide_empty_files;
 static bool unbuffered;
 
 /* The character marking end of line. Default to \n. */
-static char eolchar = '\n';
+enum { DEFAULT_EOL = '\n' };
+static char eolchar = DEFAULT_EOL;
 
 /* The split mode to use.  */
 enum Split_type
@@ -142,6 +143,8 @@ static struct option const longopts[] =
   {"numeric-suffixes", optional_argument, NULL, 'd'},
   {"filter", required_argument, NULL, FILTER_OPTION},
   {"verbose", no_argument, NULL, VERBOSE_OPTION},
+  {"line-separator", required_argument, NULL, 't'},
+  {"zero-terminated", no_argument, NULL, 'z'},
   {"-io-blksize", required_argument, NULL,
    IO_BLKSIZE_OPTION}, /* do not document */
   {GETOPT_HELP_OPTION_DECL},
@@ -226,7 +229,9 @@ is -, read standard input.\n\
       --filter=COMMAND    write to shell COMMAND; file name is $FILE\n\
   -l, --lines=NUMBER      put NUMBER lines per output file\n\
   -n, --number=CHUNKS     generate CHUNKS output files; see explanation below\n\
+  -t, --line-separator=SEP  use SEP instead of new-line as line separator\n\
   -u, --unbuffered        immediately copy input to output with '-n r/...'\n\
+  -z, --zero-terminated   line delimiter is NUL, not newline\n\
 "), DEFAULT_SUFFIX_LENGTH);
       fputs (_("\
       --verbose           print a diagnostic just before each\n\
@@ -1227,7 +1232,7 @@ main (int argc, char **argv)
       int this_optind = optind ? optind : 1;
       char *slash;
 
-      c = getopt_long (argc, argv, "0123456789C:a:b:del:n:u",
+      c = getopt_long (argc, argv, "0123456789C:a:b:del:n:t:uz",
                        longopts, NULL);
       if (c == -1)
         break;
@@ -1306,6 +1311,37 @@ main (int argc, char **argv)
           unbuffered = true;
           break;
 
+        case 't':
+          {
+            char neweol = optarg[0];
+            if (! neweol)
+              error (EXIT_FAILURE, 0, _("empty line-delimiter"));
+            if (optarg[1])
+              {
+                if (STREQ (optarg, "\\0"))
+                  neweol = '\0';
+                else
+                  {
+                    /* Provoke with 'split -txx'.  Say "multi-character
+                       delimiter" rather than "multibyte delimiter", so
+                       that the diagnostic's wording does not need to be
+                       changed once multibyte characters are supported.  */
+                    error (EXIT_FAILURE, 0, _("multi-character delimiter %s"),
+                           quote (optarg));
+                  }
+              }
+            if (eolchar != DEFAULT_EOL && neweol != eolchar)
+              error (EXIT_FAILURE, 0, _("incompatible line-delimiters"));
+            eolchar = neweol;
+          }
+          break;
+
+        case 'z':
+          if (eolchar != DEFAULT_EOL && eolchar != '\0')
+            error (EXIT_FAILURE, 0, _("incompatible line-delimiters"));
+          eolchar = '\0';
+          break;
+
         case '0':
         case '1':
         case '2':
-- 
1.9.1


From 029fed81ed90bd0dcaf34295c4819792c01ab3c6 Mon Sep 17 00:00:00 2001
From: "A. Gordon" <[email protected]>
Date: Wed, 7 Jan 2015 19:21:15 -0500
Subject: [PATCH 3/3] tests: test split with custom line separators

---
 tests/local.mk           |  1 +
 tests/split/lines-sep.sh | 74 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+)
 create mode 100644 tests/split/lines-sep.sh

diff --git a/tests/local.mk b/tests/local.mk
index 6fc8599..14dfaf3 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -355,6 +355,7 @@ all_tests =					\
   tests/split/b-chunk.sh			\
   tests/split/fail.sh				\
   tests/split/lines.sh				\
+  tests/split/lines-sep.sh          \
   tests/split/line-bytes.sh			\
   tests/split/l-chunk.sh			\
   tests/split/r-chunk.sh			\
diff --git a/tests/split/lines-sep.sh b/tests/split/lines-sep.sh
new file mode 100644
index 0000000..eb98b1d
--- /dev/null
+++ b/tests/split/lines-sep.sh
@@ -0,0 +1,74 @@
+#!/bin/sh
+# test split with custom line separators
+
+# Copyright (C) 2002-2015 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ split
+
+# Prepare input/expected-output files,
+# with newline, zero, colon line-separators.
+printf '1\n2\n3\n4\n5\n' > in1-nl || framework_failure_
+printf '1\n2\n' > exp1-nl || framework_failure_
+printf '3\n4\n' > exp2-nl || framework_failure_
+printf '5\n'    > exp3-nl || framework_failure_
+
+printf '1\0002\0003\0004\0005\000' > in1-z || framework_failure_
+printf '1\0002\000' > exp1-z || framework_failure_
+printf '3\0004\000' > exp2-z || framework_failure_
+printf '5\000'      > exp3-z || framework_failure_
+
+printf '1:2:3:4:5:' > in1-cln || framework_failure_
+printf '1:2:' > exp1-cln || framework_failure_
+printf '3:4:' > exp2-cln || framework_failure_
+printf '5:'   > exp3-cln || framework_failure_
+
+
+run_split()
+{
+  # $1: test number (must be unique, so output file names never collide)
+  num=$1
+  # $2: suffix of the input/expected files (nl/z/cln); rest: split options
+  suf=$2
+  shift 2
+  # "$@" is quoted so each extra option reaches split as exactly one word.
+  split --lines=2 "$@" "in1-$suf" "x$num-" > "out-$suf" || return 1
+  # The three output files must match the expected ones; no fourth file.
+  compare "exp1-$suf" "x$num-aa" || return 1
+  compare "exp2-$suf" "x$num-ab" || return 1
+  compare "exp3-$suf" "x$num-ac" || return 1
+  test -f "x$num-ad" && return 1
+
+  return 0
+}
+
+
+# Test newline, without '-t' option (the default)
+run_split 1 nl         || { warn_ "test 1 failed" ; fail=1 ; }
+
+#FIXME: Test newline specified as custom line separator
+#run_split 2 nl '-t$\n' || { warn_ "test 2 failed" ; fail=1 ; }
+
+# Test null line-separator with '-z'
+run_split 3 z -z       || { warn_ "test 3 failed" ; fail=1 ; }
+
+#FIXME: Test null line-separator with '-t'
+#run_split 4 z -t$'\0'  || { warn_ "test 4 failed" ; fail=1 ; }
+
+# Test non-default line-separator with '-t'
+run_split 5 cln -t:    || { warn_ "test 5 failed" ; fail=1 ; }
+
+Exit $fail
-- 
1.9.1

Reply via email to