commit 40978e333d3093e5f338c94c7435a79b6256f362
Author: FRIGN <[email protected]>
Date:   Thu Jan 22 12:32:50 2015 +0100

    Add UTF-8-delimiter-support to cut(1)
    
    Now you can specify a multibyte-delimiter to cut, which should
    definitely be possible for the end-user (Fuck POSIX).
    Looking at GNU/coreutils' cut(1)[0], which basically ignores both the
    difference between characters and bytes and the -n option, and which is
    bloated as hell, one has to wonder why it is still the default. This is insane!
    Things like this personally keep me motivated to make sbase better
    every day.
    
    [0]: http://git.savannah.gnu.org/gitweb/?p=coreutils.git;a=blob;f=src/cut.c;hb=HEAD
         NSFW! You have been warned.

diff --git a/README b/README
index ff90332..81b182f 100644
--- a/README
+++ b/README
@@ -22,7 +22,7 @@ The following tools are implemented ('*' == finished, '#' == UTF-8 support,
 =* comm            yes                             none
 =  cp              no                              -H, -i, -L
 =* cron            non-posix                       none
- * cut             yes                             none
+#* cut             yes                             none
 =  date            yes                             none
 =  dirname         yes                             none
 =  du              no                              -H, -L, -x
diff --git a/cut.1 b/cut.1
index 0be731e..72654e1 100644
--- a/cut.1
+++ b/cut.1
@@ -1,4 +1,4 @@
-.Dd January 18, 2015
+.Dd January 22, 2015
 .Dt CUT 1 sbase\-VERSION
 .Sh NAME
 .Nm cut
@@ -67,4 +67,4 @@ utility is compliant with the
 specification.
 .Pp
 The possibility of separating numbers and ranges with a space
-is an extension to that specification.
+and specifying multibyte delimiters is an extension to that specification.
diff --git a/cut.c b/cut.c
index c14ea25..50c2695 100644
--- a/cut.c
+++ b/cut.c
@@ -4,6 +4,7 @@
 #include <string.h>
 
 #include "text.h"
+#include "utf.h"
 #include "util.h"
 
 typedef struct Range {
@@ -11,11 +12,12 @@ typedef struct Range {
        struct Range *next;
 } Range;
 
-static Range *list = NULL;
-static char mode = 0;
-static char delim = '\t';
-static int nflag = 0;
-static int sflag = 0;
+static Range *list     = NULL;
+static char   mode     = 0;
+static Rune   delim    = '\t';
+static size_t delimlen = 1;
+static int    nflag    = 0;
+static int    sflag    = 0;
 
 static void
 insert(Range *r)
@@ -70,10 +72,11 @@ static size_t
 seek(const char *s, size_t pos, size_t *prev, size_t count)
 {
        const char *t;
-       size_t n = pos - *prev;
+       size_t n = pos - *prev, i;
+       Rune r;
 
        if (mode == 'b') {
-               if ((t = memchr(s, 0, n)))
+               if ((t = memchr(s, '\0', n)))
                        return t - s;
                if (nflag)
                        while (n && !UTF8_POINT(s[n]))
@@ -85,11 +88,18 @@ seek(const char *s, size_t pos, size_t *prev, size_t count)
                        if (UTF8_POINT(*t) && !--n)
                                break;
        } else {
-               for (t = (count < 2) ? s : s + 1; n && *t; t++)
-                       if (*t == delim && !--n && count)
+               for (t = (count < delimlen + 1) ? s : s + delimlen; n && *t; ) {
+                       for (i = 1; t[i]; i++)
+                               if (fullrune(t, i))
+                                       break;
+                       charntorune(&r, t, i);
+                       if (r == delim && !--n && count)
                                break;
+                       t += i;
+               }
        }
        *prev = pos;
+
        return t - s;
 }
 
@@ -106,20 +116,22 @@ cut(FILE *fp)
        while ((len = getline(&buf, &size, fp)) != -1) {
                if (len && buf[len - 1] == '\n')
                        buf[len - 1] = '\0';
-               if (mode == 'f' && !strchr(buf, delim)) {
+               if (mode == 'f' && !utfrune(buf, delim)) {
                        if (!sflag)
                                puts(buf);
                        continue;
                }
                for (i = 0, p = 1, s = buf, r = list; r; r = r->next, s += n) {
-                       s += seek(s, r->min, &p, i++);
+                       s += seek(s, r->min, &p, i);
+                       i += (mode == 'f') ? delimlen : 1;
                        if (!*s)
                                break;
                        if (!r->max) {
                                fputs(s, stdout);
                                break;
                        }
-                       n = seek(s, r->max + 1, &p, i++);
+                       n = seek(s, r->max + 1, &p, i);
+                       i += (mode == 'f') ? delimlen : 1;
                        if (fwrite(s, 1, n, stdout) != n)
                                eprintf("write error:");
                }
@@ -139,16 +151,27 @@ int
 main(int argc, char *argv[])
 {
        FILE *fp;
+       int i;
+       char *m, *d;
 
        ARGBEGIN {
        case 'b':
        case 'c':
        case 'f':
                mode = ARGC();
-               parselist(ARGF());
+               m = ARGF();
+               if (!m)
+                       usage();
+               parselist(m);
                break;
        case 'd':
-               delim = *ARGF();
+               if(!(d = ARGF()))
+                       usage();
+               for (i = 1; i <= strlen(d); i++)
+                       if (fullrune(d, i))
+                               break;
+               charntorune(&delim, d, i);
+               delimlen = i;
                break;
        case 'n':
                nflag = 1;
@@ -162,7 +185,6 @@ main(int argc, char *argv[])
 
        if (!mode)
                usage();
-
        if (!argc)
                cut(stdin);
        else for (; argc--; argv++) {

Reply via email to