From ea032c79ac28ff8146e3652e0a3ec7e684968486 Mon Sep 17 00:00:00 2001
From: "A. Gordon" <assafgordon@gmail.com>
Date: Wed, 7 Jan 2015 18:30:28 -0500
Subject: [PATCH 1/6] split: replace hard-coded '\n' with a variable

---
 src/split.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/split.c b/src/split.c
index ef672f4..71fc9e2 100644
--- a/src/split.c
+++ b/src/split.c
@@ -108,6 +108,9 @@ static bool elide_empty_files;
    input to output, which is much slower, so disabled by default.  */
 static bool unbuffered;
 
+/* The character marking end of line. Default to \n. */
+static char eolchar = '\n';
+
 /* The split mode to use.  */
 enum Split_type
 {
@@ -630,10 +633,10 @@ lines_split (uintmax_t n_lines, char *buf, size_t bufsize)
         error (EXIT_FAILURE, errno, "%s", infile);
       bp = bp_out = buf;
       eob = bp + n_read;
-      *eob = '\n';
+      *eob = eolchar;
       while (true)
         {
-          bp = memchr (bp, '\n', eob - bp + 1);
+          bp = memchr (bp, eolchar, eob - bp + 1);
           if (bp == eob)
             {
               if (eob != bp_out) /* do not write 0 bytes! */
@@ -692,10 +695,10 @@ line_bytes_split (uintmax_t n_bytes, char *buf, size_t bufsize)
               /* Have enough for split.  */
               split_rest = n_bytes - n_out - n_hold;
               eoc = sob + split_rest - 1;
-              eol = memrchr (sob, '\n', split_rest);
+              eol = memrchr (sob, eolchar, split_rest);
             }
           else
-            eol = memrchr (sob, '\n', n_left);
+            eol = memrchr (sob, eolchar, n_left);
 
           /* Output hold space if possible.  */
           if (n_hold && !(!eol && n_out))
@@ -833,7 +836,7 @@ lines_chunk_split (uintmax_t k, uintmax_t n, char *buf, size_t bufsize,
 
           /* Begin looking for '\n' at last byte of chunk.  */
           off_t skip = MIN (n_read, MAX (0, chunk_end - n_written));
-          char *bp_out = memchr (bp + skip, '\n', n_read - skip);
+          char *bp_out = memchr (bp + skip, eolchar, n_read - skip);
           if (bp_out++)
             next = true;
           else
@@ -1080,7 +1083,7 @@ lines_rr (uintmax_t k, uintmax_t n, char *buf, size_t bufsize)
           bool next = false;
 
           /* Find end of line. */
-          char *bp_out = memchr (bp, '\n', eob - bp);
+          char *bp_out = memchr (bp, eolchar, eob - bp);
           if (bp_out)
             {
               bp_out++;
-- 
2.1.3


From b97ad51c8e2cd3742d6dba8b13f7b4b3a9cf6a23 Mon Sep 17 00:00:00 2001
From: "A. Gordon" <assafgordon@gmail.com>
Date: Wed, 7 Jan 2015 18:40:14 -0500
Subject: [PATCH 2/6] split: accept -t SEP and -z options

---
 src/split.c | 40 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 2 deletions(-)

diff --git a/src/split.c b/src/split.c
index 71fc9e2..a90ddc5 100644
--- a/src/split.c
+++ b/src/split.c
@@ -109,7 +109,8 @@ static bool elide_empty_files;
 static bool unbuffered;
 
 /* The character marking end of line. Default to \n. */
-static char eolchar = '\n';
+enum { DEFAULT_EOL = '\n' };
+static char eolchar = DEFAULT_EOL;
 
 /* The split mode to use.  */
 enum Split_type
@@ -142,6 +143,8 @@ static struct option const longopts[] =
   {"numeric-suffixes", optional_argument, NULL, 'd'},
   {"filter", required_argument, NULL, FILTER_OPTION},
   {"verbose", no_argument, NULL, VERBOSE_OPTION},
+  {"line-separator", required_argument, NULL, 't'},
+  {"zero-terminated", no_argument, NULL, 'z'},
   {"-io-blksize", required_argument, NULL,
    IO_BLKSIZE_OPTION}, /* do not document */
   {GETOPT_HELP_OPTION_DECL},
@@ -226,7 +229,9 @@ is -, read standard input.\n\
       --filter=COMMAND    write to shell COMMAND; file name is $FILE\n\
   -l, --lines=NUMBER      put NUMBER lines per output file\n\
   -n, --number=CHUNKS     generate CHUNKS output files; see explanation below\n\
+  -t, --line-separator=SEP  use SEP instead of newline as line separator\n\
   -u, --unbuffered        immediately copy input to output with '-n r/...'\n\
+  -z, --zero-terminated   line delimiter is NUL, not newline\n\
 "), DEFAULT_SUFFIX_LENGTH);
       fputs (_("\
       --verbose           print a diagnostic just before each\n\
@@ -1227,7 +1232,7 @@ main (int argc, char **argv)
       int this_optind = optind ? optind : 1;
       char *slash;
 
-      c = getopt_long (argc, argv, "0123456789C:a:b:del:n:u",
+      c = getopt_long (argc, argv, "0123456789C:a:b:del:n:t:uz",
                        longopts, NULL);
       if (c == -1)
         break;
@@ -1306,6 +1311,37 @@ main (int argc, char **argv)
           unbuffered = true;
           break;
 
+        case 't':
+          {
+            char neweol = optarg[0];
+            if (! neweol)
+              error (EXIT_FAILURE, 0, _("empty line-delimiter"));
+            if (optarg[1])
+              {
+                if (STREQ (optarg, "\\0"))
+                  neweol = '\0';
+                else
+                  {
+                    /* Provoke with 'split -txx'.  Say "multi-character
+                       delimiter" rather than "multibyte delimiter", so
+                       that the diagnostic's wording does not need to be
+                       changed once multibyte characters are supported.  */
+                    error (EXIT_FAILURE, 0, _("multi-character delimiter %s"),
+                           quote (optarg));
+                  }
+              }
+            if (eolchar != DEFAULT_EOL && neweol != eolchar)
+              error (EXIT_FAILURE, 0, _("incompatible line-delimiters"));
+            eolchar = neweol;
+          }
+          break;
+
+        case 'z':
+          if (eolchar != DEFAULT_EOL && eolchar != '\0')
+            error (EXIT_FAILURE, 0, _("incompatible line-delimiters"));
+          eolchar = '\0';
+          break;
+
         case '0':
         case '1':
         case '2':
-- 
2.1.3


From 937dbafb9198a564a81f631b7bd182341804b079 Mon Sep 17 00:00:00 2001
From: "A. Gordon" <assafgordon@gmail.com>
Date: Wed, 7 Jan 2015 19:21:15 -0500
Subject: [PATCH 3/6] tests: test split with custom line separators

---
 tests/local.mk           |   1 +
 tests/split/lines-sep.sh | 113 +++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 114 insertions(+)
 create mode 100755 tests/split/lines-sep.sh

diff --git a/tests/local.mk b/tests/local.mk
index 6fc8599..14dfaf3 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -355,6 +355,7 @@ all_tests =					\
   tests/split/b-chunk.sh			\
   tests/split/fail.sh				\
   tests/split/lines.sh				\
+  tests/split/lines-sep.sh          \
   tests/split/line-bytes.sh			\
   tests/split/l-chunk.sh			\
   tests/split/r-chunk.sh			\
diff --git a/tests/split/lines-sep.sh b/tests/split/lines-sep.sh
new file mode 100755
index 0000000..e0727dc
--- /dev/null
+++ b/tests/split/lines-sep.sh
@@ -0,0 +1,113 @@
+#!/bin/sh
+# test split with custom line separators
+
+# Copyright (C) 2015 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ split
+
+# Create a five-record input (in1-SUF) and the three chunks expected from
+# 'split --lines=2' (exp[123]-SUF), with newline (nl) as the separator.
+printf '1\n2\n3\n4\n5\n' > in1-nl || framework_failure_
+printf '1\n2\n' > exp1-nl || framework_failure_
+printf '3\n4\n' > exp2-nl || framework_failure_
+printf '5\n'    > exp3-nl || framework_failure_
+# Same data with NUL (z); \000 is three digits so the next digit stays data.
+printf '1\0002\0003\0004\0005\000' > in1-z || framework_failure_
+printf '1\0002\000' > exp1-z || framework_failure_
+printf '3\0004\000' > exp2-z || framework_failure_
+printf '5\000'      > exp3-z || framework_failure_
+# Same data with colon (cln) as the separator.
+printf '1:2:3:4:5:' > in1-cln || framework_failure_
+printf '1:2:' > exp1-cln || framework_failure_
+printf '3:4:' > exp2-cln || framework_failure_
+printf '5:'   > exp3-cln || framework_failure_
+
+
+run_split()
+{
+  # Usage: run_split NUM SUF [SPLIT-OPTION]...
+  # Split in1-SUF at 2 lines per file; compare output with exp[123]-SUF.
+  num=$1  # test number (must be unique, so output files do not collide)
+  suf=$2  # suffix of the input/expected files (nl/z/cln)
+  # any remaining parameters are passed to 'split'
+  shift 2
+
+  split --lines=2 "$@" in1-$suf x$num- > out-$suf || return 1
+  # Expect exactly three output files; a fourth (x$num-ad) is a failure.
+  compare exp1-$suf x$num-aa || return 1
+  compare exp2-$suf x$num-ab || return 1
+  compare exp3-$suf x$num-ac || return 1
+  test -f x$num-ad && return 1
+
+  return 0
+}
+
+
+# Test newline, without '-t' option (the default)
+run_split 1 nl         || { warn_ "test 1 failed" ; fail=1 ; }
+
+# Test newline specified as custom line separator.  Build the newline
+# portably: $'\n' is not POSIX sh, and $(...) strips trailing newlines.
+nl=$(printf '\nx'); nl=${nl%x}
+run_split 2 nl "-t$nl" || { warn_ "test 2 failed" ; fail=1 ; }
+
+# Test null line-separator with '-z'
+run_split 3 z -z       || { warn_ "test 3 failed" ; fail=1 ; }
+
+# Test null line-separator with '-t'
+run_split 4 z -t\\0    || { warn_ "test 4 failed" ; fail=1 ; }
+
+# Test non-default line-separator with '-t'
+run_split 5 cln -t:    || { warn_ "test 5 failed" ; fail=1 ; }
+
+#
+# Test usage edge cases (option parsing only; input is /dev/null)
+#
+
+# Should fail: '-t' requires an argument
+split -t </dev/null >/dev/null 2>/dev/null \
+  && { warn_ "-t without argument did not trigger an error" ; fail=1 ; }
+
+# Should fail: multi-character separator
+split -txx </dev/null >/dev/null 2>/dev/null \
+  && { warn_ "-txx did not trigger an error" ; fail=1 ; }
+
+# Should fail: two different '-t' separators
+split -ta -tb </dev/null >/dev/null 2>/dev/null \
+  && { warn_ "-ta -tb did not trigger an error" ; fail=1 ; }
+
+# Should fail: '-t' with a non-NUL separator, followed by '-z'
+split -ta -z </dev/null >/dev/null 2>/dev/null \
+  && { warn_ "-ta -z did not trigger an error" ; fail=1 ; }
+
+# Should fail: '-z' followed by '-t' with a non-NUL separator
+split -z -ta </dev/null >/dev/null 2>/dev/null \
+  && { warn_ "-z -ta did not trigger an error" ; fail=1 ; }
+
+# Should not fail: same separator given more than once
+split -t: -t: </dev/null >/dev/null 2>/dev/null \
+  || { warn_ "-t: -t: triggered an error" ; fail=1 ; }
+
+# Should not fail: NUL separator given via both '-z' and '-t\0', either order
+split -z -t\\0 </dev/null >/dev/null 2>/dev/null \
+  || { warn_ "-z -t\\0 triggered an error" ; fail=1 ; }
+split -t\\0 -z </dev/null >/dev/null 2>/dev/null \
+  || { warn_ "-t\\0 -z triggered an error" ; fail=1 ; }
+
+
+
+Exit $fail
-- 
2.1.3


From 71d98ae9672d9cfac8d7abc62f8e42f0a172702f Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgordon@gmail.com>
Date: Thu, 8 Jan 2015 00:20:06 -0500
Subject: [PATCH 4/6] split: update TODO comment regarding -p/-t

See:
http://lists.gnu.org/archive/html/coreutils/2015-01/msg00008.html
---
 src/split.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/split.c b/src/split.c
index a90ddc5..fce4dae 100644
--- a/src/split.c
+++ b/src/split.c
@@ -17,9 +17,7 @@
 /* By tege@sics.se, with rms.
 
    To do:
-   * Implement -t CHAR or -t REGEX to specify break characters other
-     than newline. */
-
+   * Support -p REGEX as in BSD's split.  */
 #include <config.h>
 
 #include <assert.h>
-- 
2.1.3


From a8d7abafa1fd734ae6b91a3a7512e8aadd395264 Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgordon@gmail.com>
Date: Thu, 8 Jan 2015 01:18:09 -0500
Subject: [PATCH 5/6] doc: mention split's -t/-z options in texinfo

---
 doc/coreutils.texi | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index f6aef2d..abf46d8 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -3505,6 +3505,14 @@ than the number requested, or if a line is so long as to completely
 span a chunk.  The output file sequence numbers, always run consecutively
 even when this option is specified.
 
+@item -t @var{separator}
+@itemx --line-separator=@var{separator}
+@opindex -t
+@opindex --line-separator
+@cindex line separator character
+Use character @var{separator} as the line separator instead of the default
+newline character (ASCII LF).
+
 @item -u
 @itemx --unbuffered
 @opindex -u
@@ -3516,6 +3524,18 @@ which is a much slower mode of operation.
 @opindex --verbose
 Write a diagnostic just before each output file is opened.
 
+@item -z
+@itemx --zero-terminated
+@opindex -z
+@opindex --zero-terminated
+@cindex process zero-terminated lines
+Use zero byte (ASCII NUL character) as the line separator instead of the
+default newline character (ASCII LF).
+This option can be useful in conjunction with @samp{perl -0} or
+@samp{find -print0} and @samp{xargs -0} which do the same in order to
+reliably handle arbitrary file names (even those containing blanks
+or other special characters).
+
 @end table
 
 @exitstatus
-- 
2.1.3


From 932e5e1f71a19eb3fb5325ae744df3193a05f11e Mon Sep 17 00:00:00 2001
From: Assaf Gordon <assafgordon@gmail.com>
Date: Thu, 8 Jan 2015 01:49:31 -0500
Subject: [PATCH 6/6] doc: mention split's -z/-t in NEWS

---
 NEWS | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/NEWS b/NEWS
index b81154d..5993b15 100644
--- a/NEWS
+++ b/NEWS
@@ -42,6 +42,8 @@ GNU coreutils NEWS                                    -*- outline -*-
   dd accepts a new status=progress level to print data transfer statistics
   on stderr approximately every second.
 
+  split accepts new options: --zero-terminated (-z), --line-separator=X (-tX).
+
 ** Changes in behavior
 
   df no longer suppresses separate exports of the same remote device, as
-- 
2.1.3

