2009/4/24 Pádraig Brady <[email protected]>:
> Michael Speer wrote:
>> I wrote the following patch to the 7.2 branch of coreutils to allow
>> `sort` to sort by human readable byte sizes. I looked around a bit to
>> see what the status of previous attempts to integrate this
>> functionality was, but didn't see any very recent activity. This is
>> my first interaction with coreutils, so if I missed something obvious,
>> please point me towards it.
>>
>> Is the last potential patch (
>> http://www.mail-archive.com/[email protected]/msg14080.html )
>> moving through? If not, if I cleaned this up ( tabs, documentation,
>> and test cases ) and applied it to the current HEAD on savannah is
>> there a chance of getting this functionality into sort?
>
> Thanks for reviving this again.
> There was a more recent attempt that petered out unfortunately:
> http://www.mail-archive.com/[email protected]/msg14080.html
>
>>
>> Patch assumptions :
>> * that numbers will use the best representation ( never uses 1024b
>> instead of 1k, etc )
>> * that the sizes will be specified via suffixes of b, K, M, G, T, P,
>> E, Z, Y or their alternately cased variants
>>
>> The first assumption results in checking only the suffix when they differ.
>> This enables it to match the output of `du -h / du --si`, but possibly
>> not other tools that do not conform to these assumptions.
>
> The consensus was that these assumptions are appropriate and useful.
>
> We assume C99 support now for coreutils so I tweaked your patch,
> the main change being to greatly shrink the lookup table initialisation.
> Note I commented out the lower case letters (except 'k') as I don't
> think any coreutils generate those and they could preclude supporting
> other suffixes in future. I'm not sure about doing that but I think it's
> better to err on the side of too few suffixes than too many?
>
That's much more readable. I added an explicit size to the lookup
table's declaration. The standards do not reference the lowercase
letters you commented out, so I deleted them outright.
> Something else to consider is to flag when
> a mixture of SI and IEC units is used, as
> the lack of support for this might not be obvious
> to users and could cause difficult-to-debug issues.
> I.e., flag an error if the following input is presented.
> 999MB
> 998MiB
> I added a very quick hack for that to the patch for illustration.
>
While du only outputs the first letter, this makes the change better
suited to more general use. I added a bounds check, but I do not think
anything beyond your illustration is needed.
> I also noticed that you didn't terminate the fields before
> processing as was done for the other numeric sorts?
> So I changed that also in the attached patch but didn't
> analyze it TBH.
>
Your change was entirely appropriate. I should have done that originally.
>
> p.s. obviously docs and help and tests need to be written,
> but we can do that after we get the implementation done.
>
I've attached the updated diff.
Thanks for taking an interest in this.
Michael Speer
--- orig/coreutils-7.2/src/sort.c 2009-03-29 13:44:10.000000000 -0400
+++ coreutils-7.2/src/sort.c 2009-04-25 04:46:06.000000000 -0400
@@ -176,6 +176,8 @@
bool random; /* Sort by random hash of key. */
bool general_numeric; /* Flag for general, numeric comparison.
Handle numbers in exponential notation. */
+ bool human_numeric; /* Flag for sorting by human readable
+ units with either SI or IEC prefixes */
bool month; /* Flag for comparison by month name. */
bool reverse; /* Reverse the sense of comparison. */
bool version; /* sort by version number */
@@ -336,6 +338,10 @@
-i, --ignore-nonprinting consider only printable characters\n\
-M, --month-sort compare (unknown) < `JAN' < ... < `DEC'\n\
"), stdout);
+ fputs(_("\
+ -h, --human-numeric-sort compare string numerical values ending in units\n\
+ prefixed with either SI xor IEC prefixes\n\
+"), stdout);
fputs (_("\
-n, --numeric-sort compare according to string numerical value\n\
-R, --random-sort sort by random hash of keys\n\
@@ -426,7 +432,7 @@
SORT_OPTION
};
-static char const short_options[] = "-bcCdfgik:mMno:rRsS:t:T:uVy:z";
+static char const short_options[] = "-bcCdfghik:mMno:rRsS:t:T:uVy:z";
static struct option const long_options[] =
{
@@ -442,6 +448,7 @@
{"merge", no_argument, NULL, 'm'},
{"month-sort", no_argument, NULL, 'M'},
{"numeric-sort", no_argument, NULL, 'n'},
+ {"human-numeric-sort", no_argument, NULL, 'h'},
{"version-sort", no_argument, NULL, 'V'},
{"random-sort", no_argument, NULL, 'R'},
{"random-source", required_argument, NULL, RANDOM_SOURCE_OPTION},
@@ -1673,6 +1680,57 @@
return strnumcmp (a, b, decimal_point, thousands_sep);
}
+/* error if a mixture of SI and IEC units used. */
+static void
+check_mixed_SI_IEC (char prefix)
+{
+ static int seen_si = -1;
+ bool si_present = prefix == 'i';
+ if (seen_si != -1 && seen_si != si_present)
+ error (SORT_FAILURE, 0, _("Both SI and IEC prefixes present on units"));
+ seen_si = si_present;
+}
+
+/* Compare numbers ending in units with SI xor IEC prefixes
+ <none/unknown> < K < M < G < T < P < E < Z < Y
+ Assume that numbers are properly abbreviated.
+ i.e. input will never have 5000K instead of 5M
+*/
+static int
+human_compare(const char *a, const char *b)
+{
+ static const char weights [UCHAR_LIM] = {
+ ['K']=1, ['M']=2, ['G']=3, ['T']=4, ['P']=5, ['E']=6, ['Z']=7, ['Y']=8,
+ ['k']=1,
+ };
+
+ while (blanks[to_uchar (*a)])
+ a++;
+ while (blanks[to_uchar (*b)])
+ b++;
+
+ const char *ar = a;
+ const char *br = b;
+
+ while( ISDIGIT (*ar) || (*ar) == decimal_point || (*ar) == thousands_sep )
+ ar++;
+ while( ISDIGIT (*br) || (*br) == decimal_point || (*br) == thousands_sep )
+ br++;
+
+ if( *ar )
+ check_mixed_SI_IEC (*(ar+1));
+
+ if( *br )
+ check_mixed_SI_IEC (*(br+1));
+
+ int aw = weights[to_uchar (*ar)];
+ int bw = weights[to_uchar (*br)];
+
+ return (aw > bw ? 1
+ : aw < bw ? -1
+ : strnumcmp ( a , b , decimal_point , thousands_sep));
+}
+
static int
general_numcompare (const char *sa, const char *sb)
{
@@ -1917,13 +1975,14 @@
if (key->random)
diff = compare_random (texta, lena, textb, lenb);
- else if (key->numeric | key->general_numeric)
+ else if (key->numeric | key->general_numeric | key->human_numeric)
{
char savea = *lima, saveb = *limb;
*lima = *limb = '\0';
- diff = ((key->numeric ? numcompare : general_numcompare)
- (texta, textb));
+ diff = ((key->numeric ? numcompare
+ : key->general_numeric ? general_numcompare
+ : human_compare) (texta, textb));
*lima = savea, *limb = saveb;
}
else if (key->version)
@@ -2887,7 +2946,7 @@
for (key = keylist; key; key = key->next)
if ((1 < (key->random + key->numeric + key->general_numeric + key->month
- + key->version + !!key->ignore))
+ + key->version + (!!key->ignore) + key->human_numeric))
|| (key->random && key->translate))
{
/* The following is too big, but guaranteed to be "big enough". */
@@ -2899,6 +2958,8 @@
*p++ = 'f';
if (key->general_numeric)
*p++ = 'g';
+ if (key->human_numeric)
+ *p++ = 'h';
if (key->ignore == nonprinting)
*p++ = 'i';
if (key->month)
@@ -2990,6 +3051,9 @@
case 'g':
key->general_numeric = true;
break;
+ case 'h':
+ key->human_numeric = true;
+ break;
case 'i':
/* Option order should not matter, so don't let -i override
-d. -d implies -i, but -i does not imply -d. */
@@ -3138,7 +3202,8 @@
gkey.sword = gkey.eword = SIZE_MAX;
gkey.ignore = NULL;
gkey.translate = NULL;
- gkey.numeric = gkey.general_numeric = gkey.random = gkey.version = false;
+ gkey.numeric = gkey.general_numeric = gkey.human_numeric = false;
+ gkey.random = gkey.version = false;
gkey.month = gkey.reverse = false;
gkey.skipsblanks = gkey.skipeblanks = false;
@@ -3217,6 +3282,7 @@
case 'd':
case 'f':
case 'g':
+ case 'h':
case 'i':
case 'M':
case 'n':
@@ -3469,6 +3535,7 @@
| key->numeric
| key->version
| key->general_numeric
+ | key->human_numeric
| key->random)))
{
key->ignore = gkey.ignore;
@@ -3478,6 +3545,7 @@
key->month = gkey.month;
key->numeric = gkey.numeric;
key->general_numeric = gkey.general_numeric;
+ key->human_numeric = gkey.human_numeric;
key->random = gkey.random;
key->reverse = gkey.reverse;
key->version = gkey.version;
@@ -3493,6 +3561,7 @@
| gkey.month
| gkey.numeric
| gkey.general_numeric
+ | gkey.human_numeric
| gkey.random
| gkey.version)))
{
_______________________________________________
Bug-coreutils mailing list
[email protected]
http://lists.gnu.org/mailman/listinfo/bug-coreutils