Hello! Here is my patch to add -k support to sort and to remove the -u flag from it, because -u does not work (and even less so with my -k patch). The current -u implementation uses strcmp on adjacent lines of the already sorted output to check whether they are the same, but that already fails when "-n -u" is given: lines that compare equal numerically (e.g. "1" and "01") are not byte-identical, so they are not recognized as duplicates. A correct implementation of -u should instead check whether the sort comparison function considers the two lines equal.
The -k implementation is incomplete: it does not support modifiers that are specific to a single key definition (e.g. "-k 2,3n"). Regards, Jakob Kramer
>From 1ac4b7f4339c78b08cdb942b310a4c653ce8d1b1 Mon Sep 17 00:00:00 2001 From: Jakob Kramer <jakob.kra...@gmx.de> Date: Sat, 12 Apr 2014 17:53:10 +0200 Subject: [PATCH] sort: add -k, remove -u Options that are specific to a single key definition are not supported (e.g. "sort -k 2,3n -k 4,4"). Should you try to specify such definitions, sort will return with EXIT_FAILURE and an error message. Instead, all key definitions exclusively use the global settings. It always behaves like -b was set. I removed -u because it does not work the way that it was implemented here. It should be rewritten so that it checks if the sort function thinks that the strings were the same. --- sort.1 | 20 ++++++-- sort.c | 167 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------ 2 files changed, 169 insertions(+), 18 deletions(-) diff --git a/sort.1 b/sort.1 index 7913357..80fa692 100644 --- a/sort.1 +++ b/sort.1 @@ -3,7 +3,10 @@ sort \- sort lines .SH SYNOPSIS .B sort -.RB [ \-nru ] +.RB [ \-nr ] +.RB [ \-k +.I key +.R ]... .RI [ file ...] .SH DESCRIPTION .B sort @@ -17,5 +20,16 @@ perform a numeric sort. .B \-r reverses the sort. .TP -.B \-u -prints repeated lines only once. +.B \-k key +specifies a key definition of the form \fBS\fR[.\fBs\fR][,\fBE\fR[.\fBe\fR]], +where +.B S, +.B s, +.B E, +and +.B e +are the starting column, starting character in that column, ending column and +the ending character of that column respectively. If they are not specified, +s refers to the first character of the specified starting column, E refers to +the last column of every line, and e refers to the last character of that last +column. diff --git a/sort.c b/sort.c index 348e16b..f43464f 100644 --- a/sort.c +++ b/sort.c @@ -1,4 +1,5 @@ /* See LICENSE file for copyright and license details. 
*/ +#include <ctype.h> #include <stdbool.h> #include <stdio.h> #include <stdlib.h> @@ -7,10 +8,30 @@ #include "text.h" #include "util.h" +struct keydef { + unsigned start_column; + unsigned end_column; + unsigned start_char; + unsigned end_char; +}; + +struct kdlist { + struct keydef keydef; + struct kdlist *next; +}; + +static struct kdlist *head = NULL; +static struct kdlist *curr = NULL; + +static void addkeydef(char *); +static void freelist(void); static int linecmp(const char **, const char **); +static char *next_nonblank(char *); +static char *next_blank(char *); +static int parse_keydef(struct keydef *, char *); +static char *columns(char *, const struct keydef *); static bool rflag = false; -static bool uflag = false; static bool nflag = false; static struct linebuf linebuf = EMPTY_LINEBUF; @@ -18,7 +39,7 @@ static struct linebuf linebuf = EMPTY_LINEBUF; static void usage(void) { - eprintf("usage: %s [-nru] [file...]\n", argv0); + enprintf(2, "usage: %s [-nr] [-k def]... [file...]\n", argv0); } int @@ -34,18 +55,20 @@ main(int argc, char *argv[]) case 'r': rflag = true; break; - case 'u': - uflag = true; + case 'k': + addkeydef(EARGF(usage())); break; default: usage(); } ARGEND; + addkeydef("1"); + if(argc == 0) { getlines(stdin, &linebuf); } else for(; argc > 0; argc--, argv++) { if(!(fp = fopen(argv[0], "r"))) { - weprintf("fopen %s:", argv[0]); + enprintf(2, "fopen %s:", argv[0]); continue; } getlines(fp, &linebuf); @@ -55,24 +78,138 @@ main(int argc, char *argv[]) (int (*)(const void *, const void *))linecmp); for(i = 0; i < linebuf.nlines; i++) { - if(!uflag || i == 0 || strcmp(linebuf.lines[i], - linebuf.lines[i-1]) != 0) { - fputs(linebuf.lines[i], stdout); - } + fputs(linebuf.lines[i], stdout); } + freelist(); return EXIT_SUCCESS; } -int +static void +addkeydef(char *def) +{ + struct kdlist *node; + + node = malloc(sizeof(*node)); + if(!node) + enprintf(2, "malloc:"); + if(!head) + head = node; + if(parse_keydef(&node->keydef, def)) + enprintf(2, 
"parse_keydef:"); + if(curr) + curr->next = node; + node->next = NULL; + curr = node; +} + +static void +freelist(void) +{ + struct kdlist *node; + struct kdlist *tmp; + + for(node = head; node; node = tmp) { + tmp = node->next; + free(node); + } +} + +static int linecmp(const char **a, const char **b) { - if (nflag) { - if (rflag) - return strtoul(*b, 0, 10) - strtoul(*a, 0, 10); + char *s1, *s2; + int res = 0; + struct kdlist *node; + + for(node = head; node && res == 0; node = node->next) { + s1 = columns((char *)*a, &node->keydef); + s2 = columns((char *)*b, &node->keydef); + + /* don't consider modifiers if it's the default key + * definition that was implicitly added */ + if(!(node == head) && !node->next) + res = strcmp(s1, s2); + else if(nflag) + res = strtoul(s1, 0, 10) - strtoul(s2, 0, 10); else - return strtoul(*a, 0, 10) - strtoul(*b, 0, 10); + res = strcmp(s1, s2); + + free(s1); + free(s2); + } + return rflag ? -res : res; +} + +static int +parse_keydef(struct keydef *kd, char *s) +{ + char *rest = s; + kd->start_column = 1; + kd->start_char = 1; + /* 0 means end of line */ + kd->end_column = 0; + kd->end_char = 0; + + kd->start_column = strtoul(rest, &rest, 10); + if(!kd->start_column) + enprintf(2, "starting column cannot be 0\n"); + if(*rest == '.') + kd->start_char = strtoul(rest+1, &rest, 10); + if(*rest == ',') { + kd->end_column = strtoul(rest+1, &rest, 10); + if(kd->end_column < kd->start_column) + enprintf(2, ",%u is too small\n", kd->end_column); } - return strcmp(*a, *b) * (rflag ? 
-1 : +1); + if(*rest == '.') + kd->end_char = strtoul(rest+1, &rest, 10); + if(*rest != '\0') + return -1; + return 0; } +static char * +next_nonblank(char *s) +{ + for(; *s && isblank(*s); s++); + return s; +} + +static char * +next_blank(char *s) +{ + for(; *s && !isblank(*s); s++); + return s; +} + +static char * +columns(char *line, const struct keydef *kd) +{ + char *rest; + char *start, *end; + unsigned i; + for(rest = line, i = 0; i < kd->start_column; i++) { + if(i != 0) + rest = next_blank(rest); + rest = next_nonblank(rest); + } + for(i = 1; i < kd->start_char && !isblank(*rest); i++, rest++); + start = rest; + + if(kd->end_column) { + for(rest = line, i = 0; i < kd->end_column; i++) { + if(i != 0) + rest = next_blank(rest); + rest = next_nonblank(rest); + } + if(kd->end_char) { + for(i = 1; i < kd->end_char && *rest && !isblank(*rest); i++, rest++); + } else { + rest = next_blank(rest); + } + end = rest; + } else { + end = rest + strlen(rest); + } + return strndup(start, end - start); +} -- 1.8.5.1