Hello, I'd like to (re)suggest a feature for the join program - the ability to automatically build an output format line (similar to, but easier than, using "-o").
I've previously mentioned it here (but got no favorable responses): http://lists.gnu.org/archive/html/bug-coreutils/2009-11/msg00151.html Several people have been using this option for a year now (on our local servers), so I thought I might try to suggest it again. The full patch is attached, and also available here: http://cancan.cshl.edu/labmembers/gordon/files/join_auto_format_2010_10_06.patch Here's the common use case: Given two tabular files, with a common key in the first column, and many numeric (or other) values in the other columns, the user wants to join them together easily. One requirement is that empty/missing values should be populated with "00". File 1 ====== bar 10 13 15 16 11 32 foo 10 10 11 12 13 14 File 2 ====== bar 99 91 90 93 91 93 baz 90 91 99 96 97 95 Desired joined output ============== bar 10 13 15 16 11 32 99 91 90 93 91 93 baz 00 00 00 00 00 00 90 91 99 96 97 95 foo 10 10 11 12 13 14 00 00 00 00 00 00 There is no technical problem in achieving this, the parameters would be: "-a1 -a2 -e 00 -o 0,1.2,1.3,1.4,1.5,1.6,1.7,2.2,2.3,2.4,2.5,2.6,2.7" But building the "-o" parameter is cumbersome and error-prone (imagine files with dozens of columns, which is very common in my case). The "--auto-format" feature simply builds the "-o" format line automatically, based on the number of columns from both input files. The auto-generated format order is: Key-column, all columns (except key) from first file, all columns (except key) from second file. The parameters for the above use case become: "-a1 -a2 -e 00 --auto-format" If "--auto-format" is not specified, there's no change to the rest of the workflow. If both "--auto-format" and "-o XXXX" are specified, the "-o" takes precedence. Please let me know what you think about it. Best regards, -gordon
>From 710fb9423391def7cb95c9c6ae911c5958f492db Mon Sep 17 00:00:00 2001 From: Assaf Gordon <[email protected]> Date: Wed, 6 Oct 2010 15:55:18 -0400 Subject: [PATCH 1/2] Join: add '--auto-format' option. --- doc/coreutils.texi | 8 ++++++++ src/join.c | 37 ++++++++++++++++++++++++++++++++++++- tests/misc/join | 19 +++++++++++++++++++ 3 files changed, 63 insertions(+), 1 deletions(-) diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 4d17ed1..3b10608 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -5563,6 +5563,14 @@ specified format. The header lines will not be checked for ordering even if @option{--check-order} is specified. Also if the header lines from each file do not match, the heading fields from the first file will be used. +@item --auto-format +@opindex --auto-format +Automatically detects output format based on the number of fields in the +first line of each input file (as if the user explicitly specified @samp{-o}). +Allows using @samp{-e} without a-priori knowledge of the fields in the input files. +The join field is printed first, followed by the remaining fields from the first +file and the second file. 
+ @item -i @itemx --ignore-case @opindex -i diff --git a/src/join.c b/src/join.c index 6eaad65..17ad2f3 100644 --- a/src/join.c +++ b/src/join.c @@ -139,7 +139,8 @@ enum { CHECK_ORDER_OPTION = CHAR_MAX + 1, NOCHECK_ORDER_OPTION, - HEADER_LINE_OPTION + HEADER_LINE_OPTION, + AUTO_FORMAT_OPTION }; @@ -149,6 +150,7 @@ static struct option const longopts[] = {"check-order", no_argument, NULL, CHECK_ORDER_OPTION}, {"nocheck-order", no_argument, NULL, NOCHECK_ORDER_OPTION}, {"header", no_argument, NULL, HEADER_LINE_OPTION}, + {"auto-format", no_argument, NULL, AUTO_FORMAT_OPTION}, {GETOPT_HELP_OPTION_DECL}, {GETOPT_VERSION_OPTION_DECL}, {NULL, 0, NULL, 0} @@ -164,6 +166,12 @@ static bool ignore_case; join them without checking for ordering */ static bool join_header_lines; +/* if nonzero, automatically build a specific output field list, + based on the first line of each input file */ +static bool auto_output_format; + +static void build_output_format(struct line const *line1, struct line const* line2); + void usage (int status) { @@ -200,6 +208,8 @@ by whitespace. When FILE1 or FILE2 (not both) is -, read standard input.\n\ --nocheck-order do not check that the input is correctly sorted\n\ --header treat the first line in each file as field headers,\n\ print them without trying to pair them\n\ + --auto-format Automatically build output format, based on the first\n\ + line of each input file. 
Allows '-e' without using '-o'\n\ "), stdout); fputs (HELP_OPTION_DESCRIPTION, stdout); fputs (VERSION_OPTION_DESCRIPTION, stdout); @@ -636,6 +646,9 @@ join (FILE *fp1, FILE *fp2) advance_seq (fp2, &seq2, true, 2); } + if (auto_output_format && seq1.count && seq2.count) + build_output_format(seq1.lines[0],seq2.lines[0]); + while (seq1.count && seq2.count) { size_t i; @@ -947,6 +960,24 @@ add_file_name (char *name, char *names[2], *optc_status = MIGHT_BE_O_ARG; } +static void +build_output_format(struct line const *line1, struct line const* line2) +{ + int i ; + if (outlist_head.next) + return; + + add_field(0,0); + for (i = 0; i < join_field_1 && i < line1->nfields; ++i) + add_field(1,i); + for (i = join_field_1 + 1; i < line1->nfields; ++i) + add_field(1,i); + for (i = 0; i < join_field_2 && i < line2->nfields; ++i) + add_field(2,i); + for (i = join_field_2 + 1; i < line2->nfields; ++i) + add_field(2,i); +} + int main (int argc, char **argv) { @@ -1077,6 +1108,10 @@ main (int argc, char **argv) join_header_lines = true; break; + case AUTO_FORMAT_OPTION: + auto_output_format = true; + break; + case_GETOPT_HELP_CHAR; case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS); diff --git a/tests/misc/join b/tests/misc/join index a319b94..b07a18b 100755 --- a/tests/misc/join +++ b/tests/misc/join @@ -218,6 +218,25 @@ my @tv = ( ['header-5', '--header', [ "ID1 Name\n1 A\n2 B\n", "ID2 Color\n1 red\n"], "ID1 Name Color\n1 A red\n", 0], +# Auto-format +['autoformat-1', '-j1 -a1 -a2 --auto-format -e FOO', + ["1 a\n2 b\n", "1 X\n3 Y\n"], "1 a X\n2 b FOO\n3 FOO Y\n", 0], + +# Auto-format, with empty filler (no '-e' specified)- +# should print column delimiters (space characters), but no filler. +# This should be equivalent to specifying "-o 0,1.2,2.2" without "-e". +['autoformat-2', '-j1 -a1 -a2 --auto-format', + ["1 a\n2 b\n", "1 X\n3 Y\n"], "1 a X\n2 b \n3 Y\n", 0], + +# auto-format sanity check: specify explicit output format without -e, +# make sure it matches the above test. 
+['autoformat-3', '-j1 -a1 -a2 -o 0,1.2,2.2', + ["1 a\n2 b\n", "1 X\n3 Y\n"], "1 a X\n2 b \n3 Y\n", 0], + +# both auto-format and explicit output format (different format than 'auto'), +# auto-format should be silently ignored. +['autoformat-4', '-j1 -a1 -a2 -e FOO --auto-format -o 0,2.2,1.2', + ["1 a\n2 b\n", "1 X\n3 Y\n"], "1 X a\n2 FOO b\n3 Y FOO\n", 0], ); # Convert the above old-style test vectors to the newer -- 1.7.1 >From 7dd975752358da66e7ce91aa08d72e6be08cfc02 Mon Sep 17 00:00:00 2001 From: Assaf Gordon <[email protected]> Date: Wed, 6 Oct 2010 16:04:07 -0400 Subject: [PATCH 2/2] join (with autoformat) - update NEWS --- NEWS | 3 +++ 1 files changed, 3 insertions(+), 0 deletions(-) diff --git a/NEWS b/NEWS index 22f257b..fc023bb 100644 --- a/NEWS +++ b/NEWS @@ -40,6 +40,9 @@ GNU coreutils NEWS -*- outline -*- for a file. It also accepts the %w and %W format directives for outputting the birth time of a file, if one is available. + join now accepts the option --auto-format, to automatically + detect the output format without requiring explicit -o. + ** Changes in behavior df now consistently prints the device name for a bind mounted file, -- 1.7.1
