Hi there!
While working on a small awk program in an arm64 I found that bitwise
operations are broken.
Looking under the hood I found that awk numbers are doubles[1], whereas bitwise
operations are performed over unsigned longs[2].
The problem:
- double is typically 2^53
- unsigned long is 2^32 in 32bit archs
- unsigned long is typically 2^64 in 64bit archs
So, the result of a unsigned long bitwise operation is stored on a double
This means that data is lost in 64bit archs that use 64bit unsigned longs when
the result is greater than 2^53. For example, operating with a simple compl(0)
on an arm64 or x64 Linux generates unexpected results:
awk 'BEGIN{print compl(0)%4}'
It returns 0 instead of 3.
But it works on GNU Awk, why?
Well, apparently all gawk bitwise operations return the result of a function
called make_integer[3] which in turn calls another function that fixes the
issue I described above: adjust_uint[4].
adjust_uint basically truncates sizes greater than 2^53 (like 2^64 unsigned
long) to 2^53 from the left, preserving low order bits.
So I went ahead and shamelessly copied adjust_uint into Busybox Awk and it
worked!
And here I am submitting a patch with the changes adapted to Busybox :)
This adaptation includes:
- Replacing uintmax_t with unsigned long on adjust_uint and the
count_trailing_zeros helper, as the result of bitwise operations on Busybox is
unsigned long
- Replacing GCC __builtin_ctzll (unsigned long long) with GCC __builtin_ctzl
(unsigned long)
- Including float.h for the FLT_RADIX macro
- Removing some macros that adapt adjust_uint when gawk numbers are long
doubles in some platforms
- Renaming some macros and their mention in the original gawk comments
Cheers,
Carlos
[1] -
https://git.busybox.net/busybox/tree/editors/awk.c?id=c8c1fcdba163f264a503380bc63485aacd09214c#n123
[2] -
https://git.busybox.net/busybox/tree/editors/awk.c?id=c8c1fcdba163f264a503380bc63485aacd09214c#n1048
[3] -
https://git.savannah.gnu.org/cgit/gawk.git/tree/builtin.c?id=d434cb3ce61e0cc5e26180da914f1a58223897a2#n3565
[4] -
https://git.savannah.gnu.org/cgit/gawk.git/tree/floatcomp.c?id=d434cb3ce61e0cc5e26180da914f1a58223897a2#n91
diff --git a/editors/awk.c b/editors/awk.c
index 728ee8685..d41922460 100644
--- a/editors/awk.c
+++ b/editors/awk.c
@@ -48,6 +48,7 @@
#include "libbb.h"
#include "xregex.h"
#include <math.h>
+#include <float.h>
/* This is a NOEXEC applet. Be very careful! */
@@ -914,6 +915,69 @@ static double my_strtod_or_hexoct(char **pp)
# define my_strtod_or_hexoct(p) my_strtod(p)
#endif
+// Code shamelessly stolen from GNU Awk floatcomp.c
+#ifndef CHAR_BIT
+#define CHAR_BIT 8
+#endif
+
+#ifndef FLT_RADIX
+#define FLT_RADIX 2
+#endif
+
+/*
+ * The number of bits in a double fraction, assuming FLT_RADIX is
+ * either 2 or 16. IEEE and VAX formats use radix 2, and IBM
+ * mainframe format uses radix 16; we know of no other radices in
+ * practical use.
+ */
+#if FLT_RADIX != 2 && FLT_RADIX != 16
+Please port the following code to your weird host;
+#endif
+#define AWK_DOUBLE_FRACTION_BITS 53 * (FLT_RADIX == 2 ? 1 : 4)
+
+#ifdef __GNUC__
+#define USE_BUILTIN_CTZL 3 < (__GNUC__ + (4 <= __GNUC_MINOR__))
+#else
+#define USE_BUILTIN_CTZL 0
+#endif
+
+static int count_trailing_zeros(unsigned long n)
+{
+#if USE_BUILTIN_CTZL
+ return __builtin_ctzl(n);
+#else
+ int i = 0;
+ for (; (n & 3) == 0; n >>= 2)
+ i += 2;
+ return i + (1 & ~n);
+#endif
+#undef USE_BUILTIN_CTZL
+}
+
+static unsigned long adjust_uint(unsigned long n)
+{
+ /*
+ * If unsigned long is so wide that double cannot represent all its
+ * values, strip leading nonzero bits of integers that are so large
+ * that they cannot be represented exactly as double, so that their
+ * low order bits are represented exactly, without rounding errors.
+ * This is more desirable in practice, since it means the user sees
+ * integers that are the same width as the double fractions.
+ */
+ int wordbits = CHAR_BIT * sizeof n;
+ if (AWK_DOUBLE_FRACTION_BITS < wordbits) {
+ unsigned long one = 1;
+ unsigned long sentinel = one << (wordbits - AWK_DOUBLE_FRACTION_BITS);
+ int shift = count_trailing_zeros(n | sentinel);
+ unsigned long mask = (one << AWK_DOUBLE_FRACTION_BITS) - 1;
+
+ n &= mask << shift;
+ }
+
+ return n;
+}
+#undef AWK_DOUBLE_FRACTION_BITS
+
/* -------- working with variables (set/get/copy/etc) -------- */
static void fmt_num(const char *format, double n)
@@ -2688,27 +2752,27 @@ static NOINLINE var *exec_builtin(node *op, var *res)
/* Bitwise ops must assume that operands are unsigned. GNU Awk 3.1.5:
* awk '{ print or(-1,1) }' gives "4.29497e+09", not "-2.xxxe+09" */
case B_an:
- setvar_i(res, getvar_i_int(av[0]) & getvar_i_int(av[1]));
+ setvar_i(res, adjust_uint(getvar_i_int(av[0]) & getvar_i_int(av[1])));
break;
case B_co:
- setvar_i(res, ~getvar_i_int(av[0]));
+ setvar_i(res, adjust_uint(~getvar_i_int(av[0])));
break;
case B_ls:
- setvar_i(res, getvar_i_int(av[0]) << getvar_i_int(av[1]));
+ setvar_i(res, adjust_uint(getvar_i_int(av[0]) << getvar_i_int(av[1])));
break;
case B_or:
- setvar_i(res, getvar_i_int(av[0]) | getvar_i_int(av[1]));
+ setvar_i(res, adjust_uint(getvar_i_int(av[0]) | getvar_i_int(av[1])));
break;
case B_rs:
- setvar_i(res, getvar_i_int(av[0]) >> getvar_i_int(av[1]));
+ setvar_i(res, adjust_uint(getvar_i_int(av[0]) >> getvar_i_int(av[1])));
break;
case B_xo:
- setvar_i(res, getvar_i_int(av[0]) ^ getvar_i_int(av[1]));
+ setvar_i(res, adjust_uint(getvar_i_int(av[0]) ^ getvar_i_int(av[1])));
break;
case B_lo:
_______________________________________________
busybox mailing list
[email protected]
http://lists.busybox.net/mailman/listinfo/busybox