bug#24015: [PATCH v2 1/3] sort: deduplicate code for traversing numbers

2016-07-18 Thread Pádraig Brady
On 18/07/16 22:07, Pádraig Brady wrote:
> Excellent. I'll push all three patches.
> 
> I'll adjust the first summary like s/sort:/maint: sort.c:/
> since there is no functionality change
> 
> I'll also add the check for the sv_SE locale in the test.

Oops you'd done that already.

One change I did make to the test was to change
the exp{1,3} bashism to exp1 exp3, as the tests need
to run under any POSIX compliant shell.
I tested that with:

  make SHELL=/bin/dash TESTS=tests/misc/sort-h-thousands-sep.sh SUBDIRS=. check

cheers,
Pádraig





bug#24015: [PATCH v2 1/3] sort: deduplicate code for traversing numbers

2016-07-18 Thread Pádraig Brady
Excellent. I'll push all three patches.

I'll adjust the first summary like s/sort:/maint: sort.c:/
since there is no functionality change

I'll also add the check for the sv_SE locale in the test.

thanks!
Pádraig.






bug#24015: [PATCH v2 3/3] sort: with -h, disallow thousands separator between number and unit

2016-07-18 Thread Kamil Dudka
* src/sort.c (traverse_raw_number): Accept thousands separator only
if it is immediately followed by a digit.
* tests/misc/sort-h-thousands-sep.sh: Cover the fix for this bug.

Suggested by Pádraig Brady in http://bugs.gnu.org/24015
---
 src/sort.c | 11 ++-
 tests/misc/sort-h-thousands-sep.sh | 24 
 2 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/src/sort.c b/src/sort.c
index 038f6ae..6b2dc84 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -1895,6 +1895,7 @@ traverse_raw_number (char const **number)
   char const *p = *number;
   unsigned char ch;
   unsigned char max_digit = '\0';
+  bool ends_with_thousands_sep = false;
 
   /* Scan to end of number.
  Decimals or separators not followed by digits stop the scan.
@@ -1910,10 +1911,18 @@ traverse_raw_number (char const **number)
   /* Allow to skip only one occurrence of thousands_sep to avoid finding
  the unit in the next column in case thousands_sep matches as blank
  and is used as column delimiter.  */
-  if (*p == thousands_sep)
+  ends_with_thousands_sep = (*p == thousands_sep);
+  if (ends_with_thousands_sep)
 ++p;
 }
 
+  if (ends_with_thousands_sep)
+{
+  /* thousands_sep not followed by digit is not allowed.  */
+  *number = p - /* already incremented twice */ 2;
+  return max_digit;
+}
+
   if (ch == decimal_point)
 while (ISDIGIT (ch = *p++))
   if (max_digit < ch)
diff --git a/tests/misc/sort-h-thousands-sep.sh 
b/tests/misc/sort-h-thousands-sep.sh
index 17f1b6c..1168268 100755
--- a/tests/misc/sort-h-thousands-sep.sh
+++ b/tests/misc/sort-h-thousands-sep.sh
@@ -21,25 +21,25 @@ print_ver_ sort
 test "$(LC_ALL=sv_SE locale thousands_sep)" = ' ' \
   || skip_ 'The Swedish locale with blank thousands separator is unavailable.'
 
-tee exp1 > in << _EOF_
-1   1k  4 003   1M
-2k  2M  4 002   2
-3M  3   4 001   3k
+tee exp{1,3} > in << _EOF_
+1   1k  1 M 4 003   1M
+2k  2M  2 k 4 002   2
+3M  3   3 G 4 001   3k
 _EOF_
 
 cat > exp2 << _EOF_
-3M  3   4 001   3k
-1   1k  4 003   1M
-2k  2M  4 002   2
+3M  3   3 G 4 001   3k
+1   1k  1 M 4 003   1M
+2k  2M  2 k 4 002   2
 _EOF_
 
-cat > exp3 << _EOF_
-3M  3   4 001   3k
-2k  2M  4 002   2
-1   1k  4 003   1M
+cat > exp5 << _EOF_
+3M  3   3 G 4 001   3k
+2k  2M  2 k 4 002   2
+1   1k  1 M 4 003   1M
 _EOF_
 
-for i in 1 2 3; do
+for i in 1 2 3 5; do
   LC_ALL="sv_SE.utf8" sort -h -k $i "in" > "out${i}" || fail=1
   compare "exp${i}" "out${i}" || fail=1
 done
-- 
2.5.5






bug#24015: [PATCH v2 1/3] sort: deduplicate code for traversing numbers

2016-07-18 Thread Kamil Dudka
* src/sort.c (traverse_raw_number): New function for traversing numbers.
(find_unit_order): Use traverse_raw_number () instead of open-coding it.
(debug_key): Use traverse_raw_number () instead of open-coding it.
---
 src/sort.c | 63 ++
 1 file changed, 35 insertions(+), 28 deletions(-)

diff --git a/src/sort.c b/src/sort.c
index f717604..58c1167 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -1885,18 +1885,16 @@ static char const unit_order[UCHAR_LIM] =
 #endif
   };
 
-/* Return an integer that represents the order of magnitude of the
-   unit following the number.  The number may contain thousands
-   separators and a decimal point, but it may not contain leading blanks.
-   Negative numbers get negative orders; zero numbers have a zero order.  */
-
-static int _GL_ATTRIBUTE_PURE
-find_unit_order (char const *number)
+/* Traverse number given as *number consisting of digits, thousands_sep, and
+   decimal_point chars only.  Returns the highest digit found in the number,
+   or '\0' if no digit has been found.  Upon return *number points at the
+   character that immediately follows after the given number.  */
+static unsigned char
+traverse_raw_number (char const **number)
 {
-  bool minus_sign = (*number == '-');
-  char const *p = number + minus_sign;
-  int nonzero = 0;
+  char const *p = *number;
   unsigned char ch;
+  unsigned char max_digit = '\0';
 
   /* Scan to end of number.
  Decimals or separators not followed by digits stop the scan.
@@ -1907,16 +1905,34 @@ find_unit_order (char const *number)
   do
 {
   while (ISDIGIT (ch = *p++))
-nonzero |= ch - '0';
+if (max_digit < ch)
+  max_digit = ch;
 }
   while (ch == thousands_sep);
 
   if (ch == decimal_point)
 while (ISDIGIT (ch = *p++))
-  nonzero |= ch - '0';
+  if (max_digit < ch)
+max_digit = ch;
+
+  *number = p - 1;
+  return max_digit;
+}
+
+/* Return an integer that represents the order of magnitude of the
+   unit following the number.  The number may contain thousands
+   separators and a decimal point, but it may not contain leading blanks.
+   Negative numbers get negative orders; zero numbers have a zero order.  */
 
-  if (nonzero)
+static int _GL_ATTRIBUTE_PURE
+find_unit_order (char const *number)
+{
+  bool minus_sign = (*number == '-');
+  char const *p = number + minus_sign;
+  unsigned char max_digit = traverse_raw_number ();
+  if ('0' < max_digit)
 {
+  unsigned char ch = *p;
   int order = unit_order[ch];
   return (minus_sign ? -order : order);
 }
@@ -2293,23 +2309,14 @@ debug_key (struct line const *line, struct keyfield 
const *key)
 ignore_value (strtold (beg, _lim));
   else if (key->numeric || key->human_numeric)
 {
-  char *p = beg + (beg < lim && *beg == '-');
-  bool found_digit = false;
-  unsigned char ch;
-
-  do
+  char const *p = beg + (beg < lim && *beg == '-');
+  unsigned char max_digit = traverse_raw_number ();
+  if ('0' <= max_digit)
 {
-  while (ISDIGIT (ch = *p++))
-found_digit = true;
+  unsigned char ch = *p;
+  tighter_lim = (char *) p
++ (key->human_numeric && unit_order[ch]);
 }
-  while (ch == thousands_sep);
-
-  if (ch == decimal_point)
-while (ISDIGIT (ch = *p++))
-  found_digit = true;
-
-  if (found_digit)
-tighter_lim = p - ! (key->human_numeric && unit_order[ch]);
 }
   else
 tighter_lim = lim;
-- 
2.5.5






bug#24015: [PATCH v2 2/3] sort: make -h work with -k and blank used as thousands separator

2016-07-18 Thread Kamil Dudka
* src/sort.c (traverse_raw_number): Allow to skip only one occurrence
of thousands_sep to avoid finding the unit in the next column in case
thousands_sep matches as blank and is used as column delimiter.
* tests/misc/sort-h-thousands-sep.sh: Add regression test for this bug.
* tests/local.mk: Reference the test.
* NEWS: Mention the bug fix.
Reported at https://bugzilla.redhat.com/1355780
---
 NEWS   |  2 ++
 src/sort.c | 14 
 tests/local.mk |  1 +
 tests/misc/sort-h-thousands-sep.sh | 47 ++
 4 files changed, 59 insertions(+), 5 deletions(-)
 create mode 100755 tests/misc/sort-h-thousands-sep.sh

diff --git a/NEWS b/NEWS
index 4d8fb45..736b95e 100644
--- a/NEWS
+++ b/NEWS
@@ -21,6 +21,8 @@ GNU coreutils NEWS-*- 
outline -*-
   nl now resets numbering for each page section rather than just for each page.
   [This bug was present in "the beginning".]
 
+  sort -h -k now works even in locales that use blank as thousands separator.
+
   stty --help no longer outputs extraneous gettext header lines
   for translated languages. [bug introduced in coreutils-8.24]
 
diff --git a/src/sort.c b/src/sort.c
index 58c1167..038f6ae 100644
--- a/src/sort.c
+++ b/src/sort.c
@@ -1902,13 +1902,17 @@ traverse_raw_number (char const **number)
  to be lacking in units.
  FIXME: add support for multibyte thousands_sep and decimal_point.  */
 
-  do
+  while (ISDIGIT (ch = *p++))
 {
-  while (ISDIGIT (ch = *p++))
-if (max_digit < ch)
-  max_digit = ch;
+  if (max_digit < ch)
+max_digit = ch;
+
+  /* Allow to skip only one occurrence of thousands_sep to avoid finding
+ the unit in the next column in case thousands_sep matches as blank
+ and is used as column delimiter.  */
+  if (*p == thousands_sep)
+++p;
 }
-  while (ch == thousands_sep);
 
   if (ch == decimal_point)
 while (ISDIGIT (ch = *p++))
diff --git a/tests/local.mk b/tests/local.mk
index 27cbf6e..889142a 100644
--- a/tests/local.mk
+++ b/tests/local.mk
@@ -348,6 +348,7 @@ all_tests = \
   tests/misc/sort-discrim.sh   \
   tests/misc/sort-files0-from.pl   \
   tests/misc/sort-float.sh \
+  tests/misc/sort-h-thousands-sep.sh   \
   tests/misc/sort-merge.pl \
   tests/misc/sort-merge-fdlimit.sh \
   tests/misc/sort-month.sh \
diff --git a/tests/misc/sort-h-thousands-sep.sh 
b/tests/misc/sort-h-thousands-sep.sh
new file mode 100755
index 000..17f1b6c
--- /dev/null
+++ b/tests/misc/sort-h-thousands-sep.sh
@@ -0,0 +1,47 @@
+#!/bin/sh
+# exercise 'sort -h' in locales where thousands separator is blank
+
+# Copyright (C) 2016 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see .
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ sort
+test "$(LC_ALL=sv_SE locale thousands_sep)" = ' ' \
+  || skip_ 'The Swedish locale with blank thousands separator is unavailable.'
+
+tee exp1 > in << _EOF_
+1   1k  4 003   1M
+2k  2M  4 002   2
+3M  3   4 001   3k
+_EOF_
+
+cat > exp2 << _EOF_
+3M  3   4 001   3k
+1   1k  4 003   1M
+2k  2M  4 002   2
+_EOF_
+
+cat > exp3 << _EOF_
+3M  3   4 001   3k
+2k  2M  4 002   2
+1   1k  4 003   1M
+_EOF_
+
+for i in 1 2 3; do
+  LC_ALL="sv_SE.utf8" sort -h -k $i "in" > "out${i}" || fail=1
+  compare "exp${i}" "out${i}" || fail=1
+done
+
+Exit $fail
-- 
2.5.5