Awk bitwise operations are broken on 64bit archs - includes patch with fix

caribpa Tue, 27 Sep 2022 03:33:09 -0700

Hi there!

While working on a small awk program in an arm64 I found that bitwise 
operations are broken.


Looking under the hood I found that awk numbers are doubles[1], whereas bitwise 
operations are performed over unsigned longs[2].

The problem:
  - double is typically 2^53
  - unsigned long is 2^32 in 32bit archs
  - unsigned long is typically 2^64 in 64bit archs

So, the result of a unsigned long bitwise operation is stored on a double

This means that data is lost in 64bit archs that use 64bit unsigned longs when 
the result is greater than 2^53. For example, operating with a simple compl(0) 
on an arm64 or x64 Linux generates unexpected results:

  awk 'BEGIN{print compl(0)%4}'

It returns 0 instead of 3.

But it works on GNU Awk, why?

Well, apparently all gawk bitwise operations return the result of a function 
called make_integer[3] which in turn calls another function that fixes the 
issue I described above: adjust_uint[4].

adjust_uint basically truncates sizes greater than 2^53 (like 2^64 unsigned 
long) to 2^53 from the left, preserving low order bits.

So I went ahead and shamelessly copied adjust_uint into Busybox Awk and it 
worked!

And here I am submitting a patch with the changes adapted to Busybox :)

This adaptation includes:
  - Replacing uintmax_t with unsigned long on adjust_uint and the 
count_trailing_zeros helper, as the result of bitwise operations on Busybox is 
unsigned long
  - Replacing GCC __builtin_ctzll (unsigned long long) with GCC __builtin_ctzl 
(unsigned long)
  - Including float.h for the FLT_RADIX macro
  - Removing some macros that adapt adjust_uint when gawk numbers are long 
doubles in some platforms
  - Renaming some macros and their mention in the original gawk comments

Cheers,
Carlos


[1] - 
https://git.busybox.net/busybox/tree/editors/awk.c?id=c8c1fcdba163f264a503380bc63485aacd09214c#n123
[2] - 
https://git.busybox.net/busybox/tree/editors/awk.c?id=c8c1fcdba163f264a503380bc63485aacd09214c#n1048
[3] - 
https://git.savannah.gnu.org/cgit/gawk.git/tree/builtin.c?id=d434cb3ce61e0cc5e26180da914f1a58223897a2#n3565
[4] - 
https://git.savannah.gnu.org/cgit/gawk.git/tree/floatcomp.c?id=d434cb3ce61e0cc5e26180da914f1a58223897a2#n91

diff --git a/editors/awk.c b/editors/awk.c
index 728ee8685..d41922460 100644
--- a/editors/awk.c
+++ b/editors/awk.c
@@ -48,6 +48,7 @@
 #include "libbb.h"
 #include "xregex.h"
 #include <math.h>
+#include <float.h>
 
 /* This is a NOEXEC applet. Be very careful! */
 
@@ -914,6 +915,69 @@ static double my_strtod_or_hexoct(char **pp)
 # define my_strtod_or_hexoct(p) my_strtod(p)
 #endif
 
+// Code shamelessly stolen from GNU Awk floatcomp.c
+#ifndef CHAR_BIT
+#define CHAR_BIT 8
+#endif
+
+#ifndef FLT_RADIX
+#define FLT_RADIX 2
+#endif
+
+/*
+ * The number of bits in a double fraction, assuming FLT_RADIX is
+ * either 2 or 16.  IEEE and VAX formats use radix 2, and IBM
+ * mainframe format uses radix 16; we know of no other radices in
+ * practical use.
+ */
+#if FLT_RADIX != 2 && FLT_RADIX != 16
+Please port the following code to your weird host;
+#endif
+#define AWK_DOUBLE_FRACTION_BITS 53 * (FLT_RADIX == 2 ? 1 : 4)
+
+#ifdef __GNUC__
+#define USE_BUILTIN_CTZL 3 < (__GNUC__ + (4 <= __GNUC_MINOR__))
+#else
+#define USE_BUILTIN_CTZL 0
+#endif
+
+static int count_trailing_zeros(unsigned long n)
+{
+#if USE_BUILTIN_CTZL
+	return __builtin_ctzl(n);
+#else
+	int i = 0;
+	for (; (n & 3) == 0; n >>= 2)
+		i += 2;
+	return i + (1 & ~n);
+#endif
+#undef USE_BUILTIN_CTZL
+}
+
+static unsigned long adjust_uint(unsigned long n)
+{
+	/*
+	 * If unsigned long is so wide that double cannot represent all its
+	 * values, strip leading nonzero bits of integers that are so large
+	 * that they cannot be represented exactly as double, so that their
+	 * low order bits are represented exactly, without rounding errors.
+	 * This is more desirable in practice, since it means the user sees
+	 * integers that are the same width as the double fractions.
+	 */
+	int wordbits = CHAR_BIT * sizeof n;
+	if (AWK_DOUBLE_FRACTION_BITS < wordbits) {
+		unsigned long one = 1;
+		unsigned long sentinel = one << (wordbits - AWK_DOUBLE_FRACTION_BITS);
+		int shift = count_trailing_zeros(n | sentinel);
+		unsigned long mask = (one << AWK_DOUBLE_FRACTION_BITS) - 1;
+
+		n &= mask << shift;
+	}
+
+	return n;
+}
+#undef AWK_DOUBLE_FRACTION_BITS
+
 /* -------- working with variables (set/get/copy/etc) -------- */
 
 static void fmt_num(const char *format, double n)
@@ -2688,27 +2752,27 @@ static NOINLINE var *exec_builtin(node *op, var *res)
 	/* Bitwise ops must assume that operands are unsigned. GNU Awk 3.1.5:
 	 * awk '{ print or(-1,1) }' gives "4.29497e+09", not "-2.xxxe+09" */
 	case B_an:
-		setvar_i(res, getvar_i_int(av[0]) & getvar_i_int(av[1]));
+		setvar_i(res, adjust_uint(getvar_i_int(av[0]) & getvar_i_int(av[1])));
 		break;
 
 	case B_co:
-		setvar_i(res, ~getvar_i_int(av[0]));
+		setvar_i(res, adjust_uint(~getvar_i_int(av[0])));
 		break;
 
 	case B_ls:
-		setvar_i(res, getvar_i_int(av[0]) << getvar_i_int(av[1]));
+		setvar_i(res, adjust_uint(getvar_i_int(av[0]) << getvar_i_int(av[1])));
 		break;
 
 	case B_or:
-		setvar_i(res, getvar_i_int(av[0]) | getvar_i_int(av[1]));
+		setvar_i(res, adjust_uint(getvar_i_int(av[0]) | getvar_i_int(av[1])));
 		break;
 
 	case B_rs:
-		setvar_i(res, getvar_i_int(av[0]) >> getvar_i_int(av[1]));
+		setvar_i(res, adjust_uint(getvar_i_int(av[0]) >> getvar_i_int(av[1])));
 		break;
 
 	case B_xo:
-		setvar_i(res, getvar_i_int(av[0]) ^ getvar_i_int(av[1]));
+		setvar_i(res, adjust_uint(getvar_i_int(av[0]) ^ getvar_i_int(av[1])));
 		break;
 
 	case B_lo:

_______________________________________________
busybox mailing list
[email protected]
http://lists.busybox.net/mailman/listinfo/busybox

Awk bitwise operations are broken on 64bit archs - includes patch with fix

Reply via email to