[Qemu-devel] [PATCH v2 0/2] ARM: fix Neon vrecpe instruction.

2011-02-16 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

These 2 patches fix the ARM Neon vrecpe instruction by matching the
algorithm described in the ARM ARM.

With these patches, qemu passes my ARM/Neon tests.

Patch #1 modifies softfloat by exporting float32_nan and
float32_infinity. For consistency, I have also moved all the
target-dependent definitions of floatXX_default_nan to softfloat.h (i.e.
the 16, 64, x80 and 128 bit versions in addition to the 32 bit one).

Patch #2 uses these newly exported values and uses the vrecpe
algorithm described in the ARM ARM.
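
For reference, the estimation step of the ARM ARM pseudocode boils down
to the following when written with plain C doubles (an illustrative
model only, under the assumption that the input is already normalized;
the patch performs the same steps with softfloat operations so that the
result does not depend on the host FPU):

    /* Sketch of the ARM ARM reciprocal estimate step.
     * 'a' must already be normalized to [0.5, 1.0). */
    static double recip_estimate_model(double a)
    {
        int q = (int)(a * 512.0);       /* a in units of 1/512, rounded down */
        double r = 1.0 / (((double)q + 0.5) / 512.0);
        int s = (int)(256.0 * r + 0.5); /* r in units of 1/256, rounded to nearest */
        return (double)s / 256.0;
    }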

Christophe Lyon (2):
  softfloat: export float32_nan and float32_infinity.
  target-arm: fix support for vrecpe.

 fpu/softfloat-specialize.h |   68 --------------
 fpu/softfloat.h            |   71 +++++++++++++++
 target-arm/helper.c        |   84 ++++++++++++-----
 3 files changed, 143 insertions(+), 80 deletions(-)

-- 
1.7.2.3




[Qemu-devel] [PATCH 1/2] softfloat: export float32_nan and float32_infinity.

2011-02-16 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

These two special values are needed to implement some helper
functions, which return these values in some cases.

This patch also moves the definitions of default_nan for the 16, 64, x80
and 128 bit floats, for consistency with float32.
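
A sketch of the intended use (hypothetical fragment, not part of this
patch; 'a' stands for a float32 input and 's' for the float_status in
use):

    if (float32_is_any_nan(a)) {
        return float32_nan;              /* needs the exported NaN pattern */
    } else if (float32_is_zero(a)) {
        float_raise(float_flag_divbyzero, s);
        return float32_infinity;         /* needs the exported constant */
    }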

Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 fpu/softfloat-specialize.h |   68 --------------------------
 fpu/softfloat.h            |   71 ++++++++++++++++++++++++++++
 2 files changed, 71 insertions(+), 68 deletions(-)

diff --git a/fpu/softfloat-specialize.h b/fpu/softfloat-specialize.h
index 2d025bf..adc5ada 100644
--- a/fpu/softfloat-specialize.h
+++ b/fpu/softfloat-specialize.h
@@ -30,12 +30,6 @@ these four paragraphs for those parts of this code that are retained.
 
 =============================================================================*/
 
-#if defined(TARGET_MIPS) || defined(TARGET_SH4)
-#define SNAN_BIT_IS_ONE 1
-#else
-#define SNAN_BIT_IS_ONE 0
-#endif
-
 /*----------------------------------------------------------------------------
 | Raises the exceptions specified by `flags'.  Floating-point traps can be
 | defined here if desired.  It is currently not possible for such a trap
@@ -57,17 +51,6 @@ typedef struct {
 } commonNaNT;
 
 /*----------------------------------------------------------------------------
-| The pattern for a default generated half-precision NaN.
-*----------------------------------------------------------------------------*/
-#if defined(TARGET_ARM)
-#define float16_default_nan make_float16(0x7E00)
-#elif SNAN_BIT_IS_ONE
-#define float16_default_nan make_float16(0x7DFF)
-#else
-#define float16_default_nan make_float16(0xFE00)
-#endif
-
-/*----------------------------------------------------------------------------
 | Returns 1 if the half-precision floating-point value `a' is a quiet
 | NaN; otherwise returns 0.
 *----------------------------------------------------------------------------*/
@@ -158,19 +141,6 @@ static float16 commonNaNToFloat16(commonNaNT a STATUS_PARAM)
 }
 
 /*----------------------------------------------------------------------------
-| The pattern for a default generated single-precision NaN.
-*----------------------------------------------------------------------------*/
-#if defined(TARGET_SPARC)
-#define float32_default_nan make_float32(0x7FFFFFFF)
-#elif defined(TARGET_PPC) || defined(TARGET_ARM) || defined(TARGET_ALPHA)
-#define float32_default_nan make_float32(0x7FC00000)
-#elif SNAN_BIT_IS_ONE
-#define float32_default_nan make_float32(0x7FBFFFFF)
-#else
-#define float32_default_nan make_float32(0xFFC00000)
-#endif
-
-/*----------------------------------------------------------------------------
 | Returns 1 if the single-precision floating-point value `a' is a quiet
 | NaN; otherwise returns 0.
 *----------------------------------------------------------------------------*/
@@ -413,19 +383,6 @@ static float32 propagateFloat32NaN( float32 a, float32 b STATUS_PARAM)
 }
 
 /*----------------------------------------------------------------------------
-| The pattern for a default generated double-precision NaN.
-*----------------------------------------------------------------------------*/
-#if defined(TARGET_SPARC)
-#define float64_default_nan make_float64(LIT64( 0x7FFFFFFFFFFFFFFF ))
-#elif defined(TARGET_PPC) || defined(TARGET_ARM) || defined(TARGET_ALPHA)
-#define float64_default_nan make_float64(LIT64( 0x7FF8000000000000 ))
-#elif SNAN_BIT_IS_ONE
-#define float64_default_nan make_float64(LIT64( 0x7FF7FFFFFFFFFFFF ))
-#else
-#define float64_default_nan make_float64(LIT64( 0xFFF8000000000000 ))
-#endif
-
-/*----------------------------------------------------------------------------
 | Returns 1 if the double-precision floating-point value `a' is a quiet
 | NaN; otherwise returns 0.
 *----------------------------------------------------------------------------*/
@@ -564,19 +521,6 @@ static float64 propagateFloat64NaN( float64 a, float64 b STATUS_PARAM)
 #ifdef FLOATX80
 
 /*----------------------------------------------------------------------------
-| The pattern for a default generated extended double-precision NaN.  The
-| `high' and `low' values hold the most- and least-significant bits,
-| respectively.
-*----------------------------------------------------------------------------*/
-#if SNAN_BIT_IS_ONE
-#define floatx80_default_nan_high 0x7FFF
-#define floatx80_default_nan_low  LIT64( 0xBFFFFFFFFFFFFFFF )
-#else
-#define floatx80_default_nan_high 0xFFFF
-#define floatx80_default_nan_low  LIT64( 0xC000000000000000 )
-#endif
-
-/*----------------------------------------------------------------------------
 | Returns 1 if the extended double-precision floating-point value `a' is a
 | quiet NaN; otherwise returns 0. This slightly differs from the same
 | function for other types as floatx80 has an explicit bit.
@@ -728,18 +672,6 @@ static floatx80 propagateFloatx80NaN( floatx80

[Qemu-devel] [PATCH 2/2] target-arm: fix support for vrecpe.

2011-02-16 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

Now use the same algorithm as described in the ARM ARM.
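
One step worth spelling out: the result exponent is computed as
253 - a_exp.  With IEEE bias 127, 1/((1+m) * 2^(e-127)) is equal to
(2/(1+m)) * 2^((253-e)-127), and 2/(1+m) always lies in (1,2], so
253 - e is exactly the biased exponent of the estimate.  A standalone
check of that identity (illustrative only, plain C doubles):

    #include <assert.h>
    #include <math.h>

    int main(void)
    {
        int e;
        for (e = 1; e <= 252; e++) {
            /* a has significand 1.5 and biased exponent e */
            double a = ldexp(1.5, e - 127);
            double lhs = 1.0 / a;
            double rhs = (2.0 / 1.5) * ldexp(1.0, (253 - e) - 127);
            assert(fabs(lhs / rhs - 1.0) < 1e-15);
        }
        return 0;
    }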

Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/helper.c |   84 +++---
 1 files changed, 72 insertions(+), 12 deletions(-)

diff --git a/target-arm/helper.c b/target-arm/helper.c
index 7f63a28..a17df42 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -2687,13 +2687,68 @@ float32 HELPER(rsqrts_f32)(float32 a, float32 b, CPUState *env)
 
 /* NEON helpers.  */
 
-/* TODO: The architecture specifies the value that the estimate functions
-   should return.  We return the exact reciprocal/root instead.  */
+/* The algorithm that must be used to calculate the estimate
+ * is specified by the ARM ARM.
+ */
+static float64 recip_estimate(float64 a, CPUState *env)
+{
+    float_status *s = &env->vfp.standard_fp_status;
+    float64 one = int64_to_float64(1, s);
+    /* q = (int)(a * 512.0) */
+    float64 x512 = int64_to_float64(512, s);
+    float64 q = float64_mul(x512, a, s);
+    int64_t q_int = float64_to_int64_round_to_zero(q, s);
+
+    /* r = 1.0 / (((double)q + 0.5) / 512.0) */
+    q = int64_to_float64(q_int, s);
+    float64 half = float64_div(one, int64_to_float64(2, s), s);
+    q = float64_add(q, half, s);
+    q = float64_div(q, x512, s);
+    q = float64_div(one, q, s);
+
+    /* s = (int)(256.0 * r + 0.5) */
+    float64 x256 = int64_to_float64(256, s);
+    q = float64_mul(q, x256, s);
+    q = float64_add(q, half, s);
+    q_int = float64_to_int64_round_to_zero(q, s);
+
+    /* return (double)s / 256.0 */
+    return float64_div(int64_to_float64(q_int, s), x256, s);
+}
+
 float32 HELPER(recpe_f32)(float32 a, CPUState *env)
 {
-    float_status *s = &env->vfp.fp_status;
-    float32 one = int32_to_float32(1, s);
-    return float32_div(one, a, s);
+    float_status *s = &env->vfp.standard_fp_status;
+    float64 f64;
+    uint32_t val32 = float32_val(a);
+
+    int result_exp;
+    int a_exp = (val32 & 0x7F800000) >> 23;
+    int sign = val32 & 0x80000000;
+
+    if (float32_is_any_nan(a)) {
+        return float32_maybe_silence_nan(a);
+    } else if (float32_is_infinity(a)) {
+        return float32_zero;
+    } else if (float32_is_zero(a)) {
+        float_raise(float_flag_divbyzero, s);
+        return float32_infinity;
+    } else if (a_exp >= 253) {
+        float_raise(float_flag_underflow, s);
+        return float32_zero;
+    }
+
+    f64 = make_float64((0x3FEULL << 52)
+                       | ((int64_t)(val32 & 0x7FFFFF) << 29));
+
+    result_exp = 253 - a_exp;
+
+    f64 = recip_estimate(f64, env);
+
+    val32 = sign
+        | ((result_exp & 0xFF) << 23)
+        | ((float64_val(f64) >> 29) & 0x7FFFFF);
+    return make_float32(val32);
 }
 
 float32 HELPER(rsqrte_f32)(float32 a, CPUState *env)
@@ -2705,13 +2760,18 @@ float32 HELPER(rsqrte_f32)(float32 a, CPUState *env)
 
 uint32_t HELPER(recpe_u32)(uint32_t a, CPUState *env)
 {
-    float_status *s = &env->vfp.fp_status;
-    float32 tmp;
-    tmp = int32_to_float32(a, s);
-    tmp = float32_scalbn(tmp, -32, s);
-    tmp = helper_recpe_f32(tmp, env);
-    tmp = float32_scalbn(tmp, 31, s);
-    return float32_to_int32(tmp, s);
+    float64 f64;
+
+    if ((a & 0x80000000) == 0) {
+        return 0xffffffff;
+    }
+
+    f64 = make_float64((0x3FEULL << 52)
+                       | ((int64_t)(a & 0x7FFFFFFF) << 21));
+
+    f64 = recip_estimate(f64, env);
+
+    return 0x80000000 | ((float64_val(f64) >> 21) & 0x7FFFFFFF);
 }
 
 uint32_t HELPER(rsqrte_u32)(uint32_t a, CPUState *env)
-- 
1.7.2.3




[Qemu-devel] [PATCH v3 0/3] ARM: fix Neon vrecpe and vrsqrte instructions.

2011-02-16 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

These 3 patches fix the ARM Neon vrecpe and vrsqrte instructions by
matching the algorithms described in the ARM ARM.

With these patches, qemu passes my ARM/Neon tests.

Patch #1 modifies softfloat by exporting float32_default_nan and
float32_infinity. For consistency, I have also moved all the
target-dependent definitions of floatXX_default_nan to softfloat.h (i.e.
the 16, 64, x80 and 128 bit versions in addition to the 32 bit one).

It also adds float32_set_sign() to help return the right special
values (-0, -infinity).

Patch #2 uses these newly exported values and uses the vrecpe
algorithm described in the ARM ARM.

Patch #3 uses these newly exported values and uses the vrsqrte
algorithm described in the ARM ARM.
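
float32_set_sign() keeps the helpers short: the signed special values
can be produced in one line each, e.g. (these calls are taken from the
helpers in patches #2 and #3):

    return float32_set_sign(float32_infinity, float32_is_neg(a)); /* +/-Inf */
    return float32_set_sign(float32_zero, float32_is_neg(a));     /* +/-0   */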

Christophe Lyon (3):
  softfloat: export float32_default_nan, and float32_infinity. Add
float32_set_sign().
  target-arm: fix support for vrecpe.
  target-arm: fix support for vrsqrte.

 fpu/softfloat-specialize.h |   68 -----------
 fpu/softfloat.h            |   75 ++++++++++++
 target-arm/helper.c        |  206 ++++++++++++++++++++-----
 3 files changed, 259 insertions(+), 90 deletions(-)

-- 
1.7.2.3




[Qemu-devel] [PATCH 1/3] softfloat: export float32_default_nan, and float32_infinity. Add float32_set_sign().

2011-02-16 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

These special values are needed to implement some helper functions,
which return these values in some cases.

This patch also moves the definitions of default_nan for the 16, 64, x80
and 128 bit floats, for consistency with float32.

Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 fpu/softfloat-specialize.h |   68 --------------------------
 fpu/softfloat.h            |   75 +++++++++++++++++++++++++++++
 2 files changed, 75 insertions(+), 68 deletions(-)

diff --git a/fpu/softfloat-specialize.h b/fpu/softfloat-specialize.h
index 2d025bf..adc5ada 100644
--- a/fpu/softfloat-specialize.h
+++ b/fpu/softfloat-specialize.h
@@ -30,12 +30,6 @@ these four paragraphs for those parts of this code that are retained.
 
 =============================================================================*/
 
-#if defined(TARGET_MIPS) || defined(TARGET_SH4)
-#define SNAN_BIT_IS_ONE 1
-#else
-#define SNAN_BIT_IS_ONE 0
-#endif
-
 /*----------------------------------------------------------------------------
 | Raises the exceptions specified by `flags'.  Floating-point traps can be
 | defined here if desired.  It is currently not possible for such a trap
@@ -57,17 +51,6 @@ typedef struct {
 } commonNaNT;
 
 /*----------------------------------------------------------------------------
-| The pattern for a default generated half-precision NaN.
-*----------------------------------------------------------------------------*/
-#if defined(TARGET_ARM)
-#define float16_default_nan make_float16(0x7E00)
-#elif SNAN_BIT_IS_ONE
-#define float16_default_nan make_float16(0x7DFF)
-#else
-#define float16_default_nan make_float16(0xFE00)
-#endif
-
-/*----------------------------------------------------------------------------
 | Returns 1 if the half-precision floating-point value `a' is a quiet
 | NaN; otherwise returns 0.
 *----------------------------------------------------------------------------*/
@@ -158,19 +141,6 @@ static float16 commonNaNToFloat16(commonNaNT a STATUS_PARAM)
 }
 
 /*----------------------------------------------------------------------------
-| The pattern for a default generated single-precision NaN.
-*----------------------------------------------------------------------------*/
-#if defined(TARGET_SPARC)
-#define float32_default_nan make_float32(0x7FFFFFFF)
-#elif defined(TARGET_PPC) || defined(TARGET_ARM) || defined(TARGET_ALPHA)
-#define float32_default_nan make_float32(0x7FC00000)
-#elif SNAN_BIT_IS_ONE
-#define float32_default_nan make_float32(0x7FBFFFFF)
-#else
-#define float32_default_nan make_float32(0xFFC00000)
-#endif
-
-/*----------------------------------------------------------------------------
 | Returns 1 if the single-precision floating-point value `a' is a quiet
 | NaN; otherwise returns 0.
 *----------------------------------------------------------------------------*/
@@ -413,19 +383,6 @@ static float32 propagateFloat32NaN( float32 a, float32 b STATUS_PARAM)
 }
 
 /*----------------------------------------------------------------------------
-| The pattern for a default generated double-precision NaN.
-*----------------------------------------------------------------------------*/
-#if defined(TARGET_SPARC)
-#define float64_default_nan make_float64(LIT64( 0x7FFFFFFFFFFFFFFF ))
-#elif defined(TARGET_PPC) || defined(TARGET_ARM) || defined(TARGET_ALPHA)
-#define float64_default_nan make_float64(LIT64( 0x7FF8000000000000 ))
-#elif SNAN_BIT_IS_ONE
-#define float64_default_nan make_float64(LIT64( 0x7FF7FFFFFFFFFFFF ))
-#else
-#define float64_default_nan make_float64(LIT64( 0xFFF8000000000000 ))
-#endif
-
-/*----------------------------------------------------------------------------
 | Returns 1 if the double-precision floating-point value `a' is a quiet
 | NaN; otherwise returns 0.
 *----------------------------------------------------------------------------*/
@@ -564,19 +521,6 @@ static float64 propagateFloat64NaN( float64 a, float64 b STATUS_PARAM)
 #ifdef FLOATX80
 
 /*----------------------------------------------------------------------------
-| The pattern for a default generated extended double-precision NaN.  The
-| `high' and `low' values hold the most- and least-significant bits,
-| respectively.
-*----------------------------------------------------------------------------*/
-#if SNAN_BIT_IS_ONE
-#define floatx80_default_nan_high 0x7FFF
-#define floatx80_default_nan_low  LIT64( 0xBFFFFFFFFFFFFFFF )
-#else
-#define floatx80_default_nan_high 0xFFFF
-#define floatx80_default_nan_low  LIT64( 0xC000000000000000 )
-#endif
-
-/*----------------------------------------------------------------------------
 | Returns 1 if the extended double-precision floating-point value `a' is a
 | quiet NaN; otherwise returns 0. This slightly differs from the same
 | function for other types as floatx80 has an explicit bit.
@@ -728,18 +672,6 @@ static floatx80 propagateFloatx80NaN( floatx80 a,

[Qemu-devel] [PATCH 3/3] target-arm: fix support for vrsqrte.

2011-02-16 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

Now use the same algorithm as described in the ARM ARM.
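
For reference, the estimation step corresponds to the following model in
plain C doubles (illustrative only, assuming an already-normalized input
and <math.h> sqrt; the patch implements the same steps with softfloat so
results do not depend on the host FPU):

    /* Sketch of the ARM ARM reciprocal square root estimate step.
     * 'a' must already be normalized to [0.25, 1.0). */
    static double recip_sqrt_estimate_model(double a)
    {
        double r;
        if (a < 0.5) {                  /* 0.25 <= a < 0.5 */
            int q0 = (int)(a * 512.0);
            r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0);
        } else {                        /* 0.5 <= a < 1.0 */
            int q1 = (int)(a * 256.0);
            r = 1.0 / sqrt(((double)q1 + 0.5) / 256.0);
        }
        int s = (int)(256.0 * r + 0.5);
        return (double)s / 256.0;
    }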

Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/helper.c |  122 ++
 1 files changed, 112 insertions(+), 10 deletions(-)

diff --git a/target-arm/helper.c b/target-arm/helper.c
index 7751d21..f0f2231 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -2751,11 +2751,105 @@ float32 HELPER(recpe_f32)(float32 a, CPUState *env)
     return make_float32(val32);
 }
 
+/* The algorithm that must be used to calculate the estimate
+ * is specified by the ARM ARM.
+ */
+static float64 recip_sqrt_estimate(float64 a, CPUState *env)
+{
+    float_status *s = &env->vfp.standard_fp_status;
+    float64 one = int64_to_float64(1, s);
+    float64 half = float64_div(one, int64_to_float64(2, s), s);
+    float64 x256 = int64_to_float64(256, s);
+    float64 q;
+    int64_t q_int;
+
+    if (float64_lt(a, half, s)) {
+        /* range 0.25 <= a < 0.5 */
+
+        /* a in units of 1/512 rounded down */
+        /* q0 = (int)(a * 512.0);  */
+        float64 x512 = int64_to_float64(512, s);
+        q = float64_mul(x512, a, s);
+        q_int = float64_to_int64_round_to_zero(q, s);
+
+        /* reciprocal root r */
+        /* r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0);  */
+        q = int64_to_float64(q_int, s);
+        q = float64_add(q, half, s);
+        q = float64_div(q, x512, s);
+        q = float64_sqrt(q, s);
+        q = float64_div(one, q, s);
+    } else {
+        /* range 0.5 <= a < 1.0 */
+
+        /* a in units of 1/256 rounded down */
+        /* q1 = (int)(a * 256.0); */
+        q = float64_mul(x256, a, s);
+        int64_t q_int = float64_to_int64_round_to_zero(q, s);
+
+        /* reciprocal root r */
+        /* r = 1.0 / sqrt(((double)q1 + 0.5) / 256); */
+        q = int64_to_float64(q_int, s);
+        q = float64_add(q, half, s);
+        q = float64_div(q, x256, s);
+        q = float64_sqrt(q, s);
+        q = float64_div(one, q, s);
+    }
+    /* r in units of 1/256 rounded to nearest */
+    /* s = (int)(256.0 * r + 0.5); */
+
+    q = float64_mul(q, x256, s);
+    q = float64_add(q, half, s);
+    q_int = float64_to_int64_round_to_zero(q, s);
+
+    /* return (double)s / 256.0;*/
+    return float64_div(int64_to_float64(q_int, s), x256, s);
+}
+
+
 float32 HELPER(rsqrte_f32)(float32 a, CPUState *env)
 {
-    float_status *s = &env->vfp.fp_status;
-    float32 one = int32_to_float32(1, s);
-    return float32_div(one, float32_sqrt(a, s), s);
+    float_status *s = &env->vfp.standard_fp_status;
+    int result_exp;
+    float64 f64;
+    int32_t val;
+    int64_t val64;
+
+    val = float32_val(a);
+
+    if (float32_is_any_nan(a)) {
+        return float32_default_nan;
+    } else if (float32_is_zero(a)) {
+        float_raise(float_flag_divbyzero, s);
+        return float32_set_sign(float32_infinity, float32_is_neg(a));
+    } else if (val < 0) {
+        float_raise(float_flag_invalid, s);
+        return float32_default_nan;
+    } else if (float32_is_infinity(a)) {
+        return float32_zero;
+    }
+
+    /* Normalize to a double-precision value between 0.25 and 1.0,
+     * preserving the parity of the exponent.  */
+    if ((val & 0x800000) == 0) {
+        f64 = make_float64(((uint64_t)(val & 0x80000000) << 32)
+                           | (0x3feULL << 52)
+                           | ((uint64_t)(val & 0x7fffff) << 29));
+    } else {
+        f64 = make_float64(((uint64_t)(val & 0x80000000) << 32)
+                           | (0x3fdULL << 52)
+                           | ((uint64_t)(val & 0x7fffff) << 29));
+    }
+
+    result_exp = (380 - ((val & 0x7f800000) >> 23)) / 2;
+
+    f64 = recip_sqrt_estimate(f64, env);
+
+    val64 = float64_val(f64);
+
+    val = ((val64 >> 63) & 0x80000000)
+        | ((result_exp & 0xff) << 23)
+        | ((val64 >> 29) & 0x7fffff);
+    return make_float32(val);
 }
 
 uint32_t HELPER(recpe_u32)(uint32_t a, CPUState *env)
@@ -2776,13 +2870,21 @@ uint32_t HELPER(recpe_u32)(uint32_t a, CPUState *env)
 
 uint32_t HELPER(rsqrte_u32)(uint32_t a, CPUState *env)
 {
-    float_status *s = &env->vfp.fp_status;
-    float32 tmp;
-    tmp = int32_to_float32(a, s);
-    tmp = float32_scalbn(tmp, -32, s);
-    tmp = helper_rsqrte_f32(tmp, env);
-    tmp = float32_scalbn(tmp, 31, s);
-    return float32_to_int32(tmp, s);
+    float64 f64;
+
+    if ((a & 0xc0000000) == 0) {
+        return 0xffffffff;
+    }
+
+    if (a & 0x80000000) {
+        f64 = make_float64((0x3feULL << 52)
+                           | ((uint64_t)(a & 0x7fffffff) << 21));
+    } else { /* bits 31-30 == '01' */
+        f64 = make_float64((0x3fdULL << 52)
+                           | ((uint64_t)(a & 0x3fffffff) << 22));
+    }
+
+    f64 = recip_sqrt_estimate(f64, env);
+
+    return 0x80000000 | ((float64_val(f64) >> 21) & 0x7fffffff);
 }
 
 void HELPER(set_teecr)(CPUState *env, uint32_t val)
-- 
1.7.2.3




[Qemu-devel] [PATCH 2/3] target-arm: fix support for vrecpe.

2011-02-16 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

Now use the same algorithm as described in the ARM ARM.

Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/helper.c |   84 +++---
 1 files changed, 72 insertions(+), 12 deletions(-)

diff --git a/target-arm/helper.c b/target-arm/helper.c
index 7f63a28..7751d21 100644
--- a/target-arm/helper.c
+++ b/target-arm/helper.c
@@ -2687,13 +2687,68 @@ float32 HELPER(rsqrts_f32)(float32 a, float32 b, CPUState *env)
 
 /* NEON helpers.  */
 
-/* TODO: The architecture specifies the value that the estimate functions
-   should return.  We return the exact reciprocal/root instead.  */
+/* The algorithm that must be used to calculate the estimate
+ * is specified by the ARM ARM.
+ */
+static float64 recip_estimate(float64 a, CPUState *env)
+{
+    float_status *s = &env->vfp.standard_fp_status;
+    float64 one = int64_to_float64(1, s);
+    /* q = (int)(a * 512.0) */
+    float64 x512 = int64_to_float64(512, s);
+    float64 q = float64_mul(x512, a, s);
+    int64_t q_int = float64_to_int64_round_to_zero(q, s);
+
+    /* r = 1.0 / (((double)q + 0.5) / 512.0) */
+    q = int64_to_float64(q_int, s);
+    float64 half = float64_div(one, int64_to_float64(2, s), s);
+    q = float64_add(q, half, s);
+    q = float64_div(q, x512, s);
+    q = float64_div(one, q, s);
+
+    /* s = (int)(256.0 * r + 0.5) */
+    float64 x256 = int64_to_float64(256, s);
+    q = float64_mul(q, x256, s);
+    q = float64_add(q, half, s);
+    q_int = float64_to_int64_round_to_zero(q, s);
+
+    /* return (double)s / 256.0 */
+    return float64_div(int64_to_float64(q_int, s), x256, s);
+}
+
 float32 HELPER(recpe_f32)(float32 a, CPUState *env)
 {
-    float_status *s = &env->vfp.fp_status;
-    float32 one = int32_to_float32(1, s);
-    return float32_div(one, a, s);
+    float_status *s = &env->vfp.standard_fp_status;
+    float64 f64;
+    uint32_t val32 = float32_val(a);
+
+    int result_exp;
+    int a_exp = (val32 & 0x7f800000) >> 23;
+    int sign = val32 & 0x80000000;
+
+    if (float32_is_any_nan(a)) {
+        return float32_default_nan;
+    } else if (float32_is_infinity(a)) {
+        return float32_set_sign(float32_zero, float32_is_neg(a));
+    } else if (float32_is_zero_or_denormal(a)) {
+        float_raise(float_flag_divbyzero, s);
+        return float32_set_sign(float32_infinity, float32_is_neg(a));
+    } else if (a_exp >= 253) {
+        float_raise(float_flag_underflow, s);
+        return float32_set_sign(float32_zero, float32_is_neg(a));
+    }
+
+    f64 = make_float64((0x3feULL << 52)
+                       | ((int64_t)(val32 & 0x7fffff) << 29));
+
+    result_exp = 253 - a_exp;
+
+    f64 = recip_estimate(f64, env);
+
+    val32 = sign
+        | ((result_exp & 0xff) << 23)
+        | ((float64_val(f64) >> 29) & 0x7fffff);
+    return make_float32(val32);
 }
 
 float32 HELPER(rsqrte_f32)(float32 a, CPUState *env)
@@ -2705,13 +2760,18 @@ float32 HELPER(rsqrte_f32)(float32 a, CPUState *env)
 
 uint32_t HELPER(recpe_u32)(uint32_t a, CPUState *env)
 {
-    float_status *s = &env->vfp.fp_status;
-    float32 tmp;
-    tmp = int32_to_float32(a, s);
-    tmp = float32_scalbn(tmp, -32, s);
-    tmp = helper_recpe_f32(tmp, env);
-    tmp = float32_scalbn(tmp, 31, s);
-    return float32_to_int32(tmp, s);
+    float64 f64;
+
+    if ((a & 0x80000000) == 0) {
+        return 0xffffffff;
+    }
+
+    f64 = make_float64((0x3feULL << 52)
+                       | ((int64_t)(a & 0x7fffffff) << 21));
+
+    f64 = recip_estimate(f64, env);
+
+    return 0x80000000 | ((float64_val(f64) >> 21) & 0x7fffffff);
 }
 
 uint32_t HELPER(rsqrte_u32)(uint32_t a, CPUState *env)
-- 
1.7.2.3




[Qemu-devel] [PATCH 5/6] target-arm: fix Neon VQSHRN and VSHRN.

2011-02-11 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

Call the normal shift helpers instead of the rounding ones.
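
The difference matters because the rounding helpers add a rounding
constant before shifting, while VQSHRN/VSHRN must truncate.  A small
illustration with a 16-bit lane and a right shift by 4:

    uint16_t x = 0x00ff;
    uint16_t plain   = x >> 4;              /* 0x000f: what VSHRN needs */
    uint16_t rounded = (x + (1 << 3)) >> 4; /* 0x0010: what VRSHRN does */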

Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/translate.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/target-arm/translate.c b/target-arm/translate.c
index 8791bc5..ace533f 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -4095,8 +4095,8 @@ static inline void gen_neon_shift_narrow(int size, TCGv var, TCGv shift,
     } else {
         if (u) {
             switch (size) {
-            case 1: gen_helper_neon_rshl_u16(var, var, shift); break;
-            case 2: gen_helper_neon_rshl_u32(var, var, shift); break;
+            case 1: gen_helper_neon_shl_u16(var, var, shift); break;
+            case 2: gen_helper_neon_shl_u32(var, var, shift); break;
             default: abort();
             }
         } else {
-- 
1.7.2.3




[Qemu-devel] [PATCH 4/6] target-arm: fix saturated values for Neon right shifts.

2011-02-11 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

Fix value returned by signed qrshl helpers (8, 16 and 32 bits).
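
The point of the fix: on saturation a signed left shift must return the
INT_MAX/INT_MIN of the lane width, not a value derived from a hard-coded
32-bit shift.  An illustrative fragment mirroring the helper's logic for
the 32-bit case:

    int32_t val = 0x40000000;
    int32_t dest = val << 2;            /* overflows the lane */
    if ((dest >> 2) != val) {           /* overflow detected: saturate */
        dest = (val < 0) ? INT32_MIN : INT32_MAX;   /* here: 0x7fffffff */
    }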

Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/neon_helper.c |8 ++--
 1 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c
index 907f7b7..83d610a 100644
--- a/target-arm/neon_helper.c
+++ b/target-arm/neon_helper.c
@@ -903,7 +903,7 @@ uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
         dest = src1 << tmp; \
         if ((dest >> tmp) != src1) { \
             SET_QC(); \
-            dest = src1 >> 31; \
+            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)) - (src1 > 0 ? 1 : 0); \
         } \
     }} while (0)
 NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
@@ -924,7 +924,11 @@ uint32_t HELPER(neon_qrshl_s32)(CPUState *env, uint32_t valop, uint32_t shiftop)
         dest = val << shift;
         if ((dest >> shift) != val) {
             SET_QC();
-            dest = (uint32_t)(1 << (sizeof(val) * 8 - 1)) - (val > 0 ? 1 : 0);
+            if (val < 0) {
+                dest = INT32_MIN;
+            } else {
+                dest = INT32_MAX;
+            }
         }
     }
     return dest;
-- 
1.7.2.3




[Qemu-devel] [PATCH 2/6] target-arm: fix Neon right shifts with shift amount == input width.

2011-02-11 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

Fix rshl helpers (s8, s16, s64, u8, u16)
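
For a rounding right shift by exactly the lane width, only the sign bit
can round the result up, so the helpers compute it explicitly.  An
illustrative trace of the fixed code path for an 8-bit lane:

    /* tmp == -8, i.e. shift right by the full lane width, src1 == -1:
     *   dest = src1 >> (-tmp - 1);   ->  src1 >> 7  ->  -1 (sign)
     *   dest++;                      ->   0
     *   dest >>= 1;                  ->   0
     * The old code shifted by (tmp - 1) == -9, which is undefined in C. */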

Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/neon_helper.c |6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c
index 3f1f3d4..1ac362f 100644
--- a/target-arm/neon_helper.c
+++ b/target-arm/neon_helper.c
@@ -548,7 +548,7 @@ uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
     } else if (tmp < -(ssize_t)sizeof(src1) * 8) { \
         dest = src1 >> (sizeof(src1) * 8 - 1); \
     } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
-        dest = src1 >> (tmp - 1); \
+        dest = src1 >> (-tmp - 1); \
         dest++; \
         dest >>= 1; \
     } else if (tmp < 0) { \
@@ -594,7 +594,7 @@ uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
         val = 0;
     } else if (shift < -64) {
         val >>= 63;
-    } else if (shift == -63) {
+    } else if (shift == -64) {
         val >>= 63;
         val++;
         val >>= 1;
@@ -622,7 +622,7 @@ uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
             tmp < -(ssize_t)sizeof(src1) * 8) { \
         dest = 0; \
     } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
-        dest = src1 >> (tmp - 1); \
+        dest = src1 >> (-tmp - 1); \
     } else if (tmp < 0) { \
         dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
     } else { \
-- 
1.7.2.3




[Qemu-devel] [PATCH v3 0/6] target-arm: Fix Neon shift instructions.

2011-02-11 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

This patch series provides fixes such that ARM Neon instructions
VRSHR, VRSRA, VQRSHRN, VQRSHRUN, VRSHRN, VQSHRN, VSHRN, VQSHRUN now
pass all my tests.

I have reworked all these patches and I hope they are now easier to
review.

Christophe Lyon (6):
  target-arm: Fix rounding constant addition for Neon shift
instructions.
  target-arm: fix Neon right shifts with shift amount == input width.
  target-arm: fix unsigned 64 bit right shifts.
  target-arm: fix saturated values for Neon right shifts.
  target-arm: fix Neon VQSHRN and VSHRN.
  target-arm: fix decoding of Neon 64 bit shifts.

 target-arm/neon_helper.c |  163 +-
 target-arm/translate.c   |   47 +-
 2 files changed, 176 insertions(+), 34 deletions(-)

-- 
1.7.2.3




[Qemu-devel] [PATCH 6/6] target-arm: fix decoding of Neon 64 bit shifts.

2011-02-11 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

Fix decoding of the 64 bit variants of VSHRN, VRSHRN, VQSHRN, VQSHRUN,
VQRSHRN and VQRSHRUN, taking into account whether the inputs are
unsigned or not.
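
For reference, the mapping this produces (following the patch's
input_unsigned = (op == 8) ? !u : u):

    op == 8, u == 0:  VSHRN / VRSHRN      (plain narrow, unsigned input shift)
    op == 8, u == 1:  VQSHRUN / VQRSHRUN  (signed input, unsigned saturation)
    op == 9, u == 0:  VQSHRN / VQRSHRN    (signed input and saturation)
    op == 9, u == 1:  VQSHRN / VQRSHRN    (unsigned input and saturation)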

Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/translate.c |   43 ---
 1 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/target-arm/translate.c b/target-arm/translate.c
index ace533f..10b8c5f 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -4815,6 +4815,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
             } else if (op < 10) {
                 /* Shift by immediate and narrow:
                    VSHRN, VRSHRN, VQSHRN, VQRSHRN.  */
+                int input_unsigned = (op == 8) ? !u : u;
+
                 shift = shift - (1 << (size + 3));
                 size++;
                 switch (size) {
@@ -4841,33 +4843,44 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
                 if (size == 3) {
                     neon_load_reg64(cpu_V0, rm + pass);
                     if (q) {
-                        if (u)
-                            gen_helper_neon_rshl_u64(cpu_V0, cpu_V0, tmp64);
-                        else
-                            gen_helper_neon_rshl_s64(cpu_V0, cpu_V0, tmp64);
+                        if (input_unsigned) {
+                            gen_helper_neon_rshl_u64(cpu_V0, cpu_V0,
+                                                     tmp64);
+                        } else {
+                            gen_helper_neon_rshl_s64(cpu_V0, cpu_V0,
+                                                     tmp64);
+                        }
                     } else {
-                        if (u)
-                            gen_helper_neon_shl_u64(cpu_V0, cpu_V0, tmp64);
-                        else
-                            gen_helper_neon_shl_s64(cpu_V0, cpu_V0, tmp64);
+                        if (input_unsigned) {
+                            gen_helper_neon_shl_u64(cpu_V0, cpu_V0,
+                                                    tmp64);
+                        } else {
+                            gen_helper_neon_shl_s64(cpu_V0, cpu_V0,
+                                                    tmp64);
+                        }
                     }
                 } else {
                     tmp = neon_load_reg(rm + pass, 0);
-                    gen_neon_shift_narrow(size, tmp, tmp2, q, u);
+                    gen_neon_shift_narrow(size, tmp, tmp2, q,
+                                          input_unsigned);
                     tmp3 = neon_load_reg(rm + pass, 1);
-                    gen_neon_shift_narrow(size, tmp3, tmp2, q, u);
+                    gen_neon_shift_narrow(size, tmp3, tmp2, q,
+                                          input_unsigned);
                     tcg_gen_concat_i32_i64(cpu_V0, tmp, tmp3);
                     dead_tmp(tmp);
                     dead_tmp(tmp3);
                 }
                 tmp = new_tmp();
-                if (op == 8 && !u) {
-                    gen_neon_narrow(size - 1, tmp, cpu_V0);
+                if (op == 8) {
+                    if (u) { /* VQSHRUN / VQRSHRUN */
+                        gen_neon_unarrow_sats(size - 1, tmp, cpu_V0);
+                    } else { /* VSHRN / VRSHRN */
+                        gen_neon_narrow(size - 1, tmp, cpu_V0);
+                    }
                 } else {
-                    if (op == 8)
+                    if (u) { /* VQSHRN / VQRSHRN */
+                        gen_neon_narrow_satu(size - 1, tmp, cpu_V0);
+                    } else { /* VQSHRN / VQRSHRN */
                         gen_neon_narrow_sats(size - 1, tmp, cpu_V0);
-                    else
-                        gen_neon_narrow_satu(size - 1, tmp, cpu_V0);
+                    }
                 }
                 neon_store_reg(rd, pass, tmp);
             } /* for pass */
-- 
1.7.2.3




[Qemu-devel] [PATCH 1/6] target-arm: Fix rounding constant addition for Neon shift instructions.

2011-02-11 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

Handle cases where adding the rounding constant could overflow in Neon
shift instructions: VRSHR, VRSRA, VQRSHRN, VQRSHRUN, VRSHRN.
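
A concrete failure case (illustrative): for a 32-bit lane with
val == INT32_MAX and shift == -1, the rounding constant is 1 and the
addition wraps before the shift.  Widening the addition fixes it:

    int32_t val = INT32_MAX;                            /* 0x7fffffff */
    int8_t shift = -1;
    int64_t big = (int64_t)val + (1 << (-1 - shift));   /* 0x80000000, no wrap */
    int32_t dest = big >> -shift;                       /* 0x40000000, correct */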

Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/neon_helper.c |  149 ++
 1 files changed, 137 insertions(+), 12 deletions(-)

diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c
index cf82072..3f1f3d4 100644
--- a/target-arm/neon_helper.c
+++ b/target-arm/neon_helper.c
@@ -558,9 +558,34 @@ uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
     }} while (0)
 NEON_VOP(rshl_s8, neon_s8, 4)
 NEON_VOP(rshl_s16, neon_s16, 2)
-NEON_VOP(rshl_s32, neon_s32, 1)
 #undef NEON_FN
 
+/* The addition of the rounding constant may overflow, so we use an
+ * intermediate 64 bits accumulator.  */
+uint32_t HELPER(neon_rshl_s32)(uint32_t valop, uint32_t shiftop)
+{
+    int32_t dest;
+    int32_t val = (int32_t)valop;
+    int8_t shift = (int8_t)shiftop;
+    if (shift >= 32) {
+        dest = 0;
+    } else if (shift < -32) {
+        dest = val >> 31;
+    } else if (shift == -32) {
+        dest = val >> 31;
+        dest++;
+        dest >>= 1;
+    } else if (shift < 0) {
+        int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
+        dest = big_dest >> -shift;
+    } else {
+        dest = val << shift;
+    }
+    return dest;
+}
+
+/* Handling addition overflow with 64 bits inputs values is more
+ * tricky than with 32 bits values.  */
 uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
 {
     int8_t shift = (int8_t)shiftop;
@@ -574,7 +599,16 @@ uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
         val++;
         val >>= 1;
     } else if (shift < 0) {
-        val = (val + ((int64_t)1 << (-1 - shift))) >> -shift;
+        val >>= (-shift - 1);
+        if (val == INT64_MAX) {
+            /* In this case, it means that the rounding constant is 1,
+             * and the addition would overflow. Return the actual
+             * result directly.  */
+            val = 0x4000000000000000LL;
+        } else {
+            val++;
+            val >>= 1;
+        }
     } else {
         val <<= shift;
     }
@@ -596,9 +630,29 @@ uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
     }} while (0)
 NEON_VOP(rshl_u8, neon_u8, 4)
 NEON_VOP(rshl_u16, neon_u16, 2)
-NEON_VOP(rshl_u32, neon_u32, 1)
 #undef NEON_FN
 
+/* The addition of the rounding constant may overflow, so we use an
+ * intermediate 64 bits accumulator.  */
+uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shiftop)
+{
+    uint32_t dest;
+    int8_t shift = (int8_t)shiftop;
+    if (shift >= 32 || shift < -32) {
+        dest = 0;
+    } else if (shift == -32) {
+        dest = val >> 31;
+    } else if (shift < 0) {
+        uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
+        dest = big_dest >> -shift;
+    } else {
+        dest = val << shift;
+    }
+    return dest;
+}
+
+/* Handling addition overflow with 64 bits inputs values is more
+ * tricky than with 32 bits values.  */
 uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
 {
     int8_t shift = (uint8_t)shiftop;
@@ -607,9 +661,17 @@ uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
     } else if (shift == -64) {
         /* Rounding a 1-bit result just preserves that bit.  */
         val >>= 63;
-    } if (shift < 0) {
-        val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;
-        val >>= -shift;
+    } else if (shift < 0) {
+        val >>= (-shift - 1);
+        if (val == UINT64_MAX) {
+            /* In this case, it means that the rounding constant is 1,
+             * and the addition would overflow. Return the actual
+             * result directly.  */
+            val = 0x8000000000000000ULL;
+        } else {
+            val++;
+            val >>= 1;
+        }
     } else {
         val <<= shift;
     }
@@ -784,14 +846,43 @@ uint64_t HELPER(neon_qshlu_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
     }} while (0)
 NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
 NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
-NEON_VOP_ENV(qrshl_u32, neon_u32, 1)
 #undef NEON_FN
 
+/* The addition of the rounding constant may overflow, so we use an
+ * intermediate 64 bits accumulator.  */
+uint32_t HELPER(neon_qrshl_u32)(CPUState *env, uint32_t val, uint32_t shiftop)
+{
+    uint32_t dest;
+    int8_t shift = (int8_t)shiftop;
+    if (shift < 0) {
+        uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
+        dest = big_dest >> -shift;
+    } else {
+        dest = val << shift;
+        if ((dest >> shift) != val) {
+            SET_QC();
+            dest = ~0;
+        }
+    }
+    return dest;
+}
+
+/* Handling addition overflow with 64 bits inputs values is more
+ * tricky than with 32 bits values.  */
 uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
 {
     int8_t shift = (int8_t)shiftop;
     if (shift < 0) {
-        val = (val + (1 <<

[Qemu-devel] [PATCH 3/6] target-arm: fix unsigned 64 bit right shifts.

2011-02-11 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

Fix range of shift amounts which always give 0 as result.
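
Illustrative consequence of the old test:

    /* old: if (shift >= 64 || shift < 64) is true for any ordinary
     * amount, e.g. shift == 1, so neon_rshl_u64(2, 1) returned 0
     * instead of 4.  The fixed test only zeroes the result for
     * shift >= 64 or shift < -64. */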

Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/neon_helper.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c
index 1ac362f..907f7b7 100644
--- a/target-arm/neon_helper.c
+++ b/target-arm/neon_helper.c
@@ -656,7 +656,7 @@ uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shiftop)
 uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
 {
     int8_t shift = (uint8_t)shiftop;
-    if (shift >= 64 || shift < 64) {
+    if (shift >= 64 || shift < -64) {
         val = 0;
     } else if (shift == -64) {
         /* Rounding a 1-bit result just preserves that bit.  */
-- 
1.7.2.3




[Qemu-devel] [PATCH 6/8] target-arm: Fix Neon VQ(R)SHRN instructions.

2011-01-31 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

Handle unsigned variant of VQ(R)SHRN instructions.

Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/translate.c |8 ++--
 1 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/target-arm/translate.c b/target-arm/translate.c
index a614e34..61d4c4c 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -4865,8 +4865,12 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
                     } else { /* VSHRN / VRSHRN */
                         gen_neon_narrow(size - 1, tmp, cpu_V0);
                     }
-                } else { /* VQSHRN / VQRSHRN */
-                    gen_neon_narrow_satu(size - 1, tmp, cpu_V0);
+                } else {
+                    if (u) { /* VQSHRUN / VQRSHRUN */
+                        gen_neon_narrow_satu(size - 1, tmp, cpu_V0);
+                    } else { /* VQSHRN / VQRSHRN */
+                        gen_neon_narrow_sats(size - 1, tmp, cpu_V0);
+                    }
                 }
                 neon_store_reg(rd, pass, tmp);
             } /* for pass */
-- 
1.7.2.3




[Qemu-devel] [PATCH v2 0/8] target-arm: Fix Neon instructions VQMOVUN VQRSHL VQRSHRN VQRSHRUN VQSHRN VQSHRUN VSLI VSRI

2011-01-31 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

This patchset combines fixes from the Meego tree (Peter Maydell, Juha
Riihimäki) and my own fixes such that ARM Neon instructions VQMOVUN
VQRSHL VQRSHRN VQRSHRUN VQSHRN VQSHRUN VSLI VSRI now pass all my
tests.

Christophe Lyon (3):
  Fixes for several shift instructions: VRSHL, VRSHR, VRSHRN, VSHLL,
VRSRA.
  target-arm: Fix Neon VQ(R)SHRN instructions.
  target-arm: Fix VQRSHL Neon instructions (signed/unsigned 64 bits and
signed 32 bits variants).

Juha Riihimäki (1):
  target-arm: fix neon vqrshl instruction

Meego (4):
  Create and use neon_unarrow_sat* helpers
  VQRSHRN related changes
  fiddle decoding of 64 bit shift by imm and narrow
  implement vsli.64, vsri.64

 target-arm/helpers.h |3 +
 target-arm/neon_helper.c |  195 ++
 target-arm/translate.c   |   98 +--
 3 files changed, 253 insertions(+), 43 deletions(-)

-- 
1.7.2.3




[Qemu-devel] [PATCH 7/8] target-arm: implement vsli.64, vsri.64

2011-01-31 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com
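
VSLI inserts the shifted source into the destination, keeping the
destination bits that the shifted value does not cover; VSRI is the
right-shift counterpart.  A mask selects which bits come from the
source and which are preserved (illustrative, for a shift amount of 8;
'shifted_src' is a hypothetical name for the already-shifted value):

    /* VSLI.64: mask = ~0ull << 8   source supplies bits 63..8,
     *                              destination keeps bits 7..0   */
    /* VSRI.64: mask = ~0ull >> 8   source supplies bits 55..0,
     *                              destination keeps bits 63..56 */
    dest = (shifted_src & mask) | (dest & ~mask);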

Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/translate.c |   11 ++-
 1 files changed, 10 insertions(+), 1 deletions(-)

diff --git a/target-arm/translate.c b/target-arm/translate.c
index 61d4c4c..9150242 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -4700,7 +4700,16 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
                     tcg_gen_add_i64(cpu_V0, cpu_V0, cpu_V1);
                 } else if (op == 4 || (op == 5 && u)) {
                     /* Insert */
-                    cpu_abort(env, "VS[LR]I.64 not implemented");
+                    neon_load_reg64(cpu_V1, rd + pass);
+                    uint64_t mask;
+                    if (op == 4) {
+                        mask = 0xffffffffffffffffull >> -shift;
+                    } else {
+                        mask = 0xffffffffffffffffull << shift;
+                    }
+                    tcg_gen_andi_i64(cpu_V0, cpu_V0, mask);
+                    tcg_gen_andi_i64(cpu_V1, cpu_V1, ~mask);
+                    tcg_gen_or_i64(cpu_V0, cpu_V0, cpu_V1);
                 }
                 neon_store_reg64(cpu_V0, rd + pass);
             } else { /* size < 3 */
-- 
1.7.2.3




[Qemu-devel] [PATCH 1/8] target-arm: Fixes for several shift instructions: VRSHL, VRSHR, VRSHRN, VSHLL, VRSRA.

2011-01-31 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

For variants with rounding, fix cases where adding the rounding
constant could overflow.

For VSHLL, fix bit mask.
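
The loop-based fix gives up precision for range only when it must
(illustrative trace for the 64-bit case):

    /* val == INT64_MAX (0x7fffffffffffffff), shift == -1, round == 1:
     * val + round would wrap to INT64_MIN and round cannot be reduced
     * further, so the known-exact result is returned directly:
     * round(0x7fffffffffffffff / 2) == 0x4000000000000000. */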

Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/neon_helper.c |   61 ++---
 target-arm/translate.c   |   12 +++-
 2 files changed, 61 insertions(+), 12 deletions(-)

diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c
index fead152..6c832b4 100644
--- a/target-arm/neon_helper.c
+++ b/target-arm/neon_helper.c
@@ -451,6 +451,9 @@ uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
     return val;
 }
 
+/* The addition of the rounding constant may overflow, so we use an
+ * intermediate 64 bits accumulator, which is really needed only when
+ * dealing with 32 bits input values.  */
 #define NEON_FN(dest, src1, src2) do { \
     int8_t tmp; \
     tmp = (int8_t)src2; \
@@ -459,11 +462,12 @@ uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
     } else if (tmp < -(ssize_t)sizeof(src1) * 8) { \
         dest = src1 >> (sizeof(src1) * 8 - 1); \
     } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
-        dest = src1 >> (tmp - 1); \
+        dest = src1 >> (-tmp - 1); \
         dest++; \
         dest >>= 1; \
     } else if (tmp < 0) { \
-        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
+        int64_t big_dest = ((int64_t)src1 + (1 << (-1 - tmp))); \
+        dest = big_dest >> -tmp; \
     } else { \
         dest = src1 << tmp; \
     }} while (0)
@@ -472,6 +476,8 @@ NEON_VOP(rshl_s16, neon_s16, 2)
 NEON_VOP(rshl_s32, neon_s32, 1)
 #undef NEON_FN
 
+/* Handling addition overflow with 64 bits inputs values is more
+ * tricky than with 32 bits values.  */
 uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
 {
     int8_t shift = (int8_t)shiftop;
@@ -480,18 +486,37 @@ uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
         val = 0;
     } else if (shift < -64) {
         val >>= 63;
-    } else if (shift == -63) {
+    } else if (shift == -64) {
         val >>= 63;
         val++;
         val >>= 1;
     } else if (shift < 0) {
-        val = (val + ((int64_t)1 << (-1 - shift))) >> -shift;
+        int64_t round = (int64_t)1 << (-1 - shift);
+        /* Reduce the range as long as the addition overflows.  It's
+         * sufficient to check if (val+round) is < 0 and val > 0
+         * because round is > 0.  */
+        while ((val > 0) && ((val + round) < 0) && round > 1) {
+            shift++;
+            round >>= 1;
+            val >>= 1;
+        }
+        if ((val > 0) && (val + round) < 0) {
+            /* If addition still overflows at this point, it means
+             * that round==1, thus shift==-1, and also that
+             * val==0x7FFFFFFFFFFFFFFF.  */
+            val = 0x4000000000000000LL;
+        } else {
+            val = (val + round) >> -shift;
+        }
     } else {
         val <<= shift;
     }
     return val;
 }
 
+/* The addition of the rounding constant may overflow, so we use an
+ * intermediate 64 bits accumulator, which is really needed only when
+ * dealing with 32 bits input values.  */
 #define NEON_FN(dest, src1, src2) do { \
     int8_t tmp; \
     tmp = (int8_t)src2; \
@@ -499,9 +524,10 @@ uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
         tmp < -(ssize_t)sizeof(src1) * 8) { \
         dest = 0; \
     } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
-        dest = src1 >> (tmp - 1); \
+        dest = src1 >> (-tmp - 1); \
     } else if (tmp < 0) { \
-        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
+        uint64_t big_dest = ((uint64_t)src1 + (1 << (-1 - tmp))); \
+        dest = big_dest >> -tmp; \
     } else { \
         dest = src1 << tmp; \
     }} while (0)
@@ -513,14 +539,29 @@ NEON_VOP(rshl_u32, neon_u32, 1)
 uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
 {
     int8_t shift = (uint8_t)shiftop;
-    if (shift >= 64 || shift < 64) {
+    if (shift >= 64 || shift < -64) {
         val = 0;
     } else if (shift == -64) {
         /* Rounding a 1-bit result just preserves that bit.  */
         val >>= 63;
-    } if (shift < 0) {
-        val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;
-        val >>= -shift;
+    } else if (shift < 0) {
+        uint64_t round = (uint64_t)1 << (-1 - shift);
+        /* Reduce the range as long as the addition overflows.  It's
+         * sufficient to check if (val+round) is < val
+         * because val and round are > 0.  */
+        while (((val + round) < val) && round > 1) {
+            shift++;
+            round >>= 1;
+            val >>= 1;
+        }
+        if ((val + round) < val) {
+            /* If addition still overflows at this point, it means
+             * that round==1, thus shift==-1, and also that
+             * val==0xFFFFFFFFFFFFFFFF.  */
+            val = 0x8000000000000000ULL;
+        } else {
+            val = (val + round) >> -shift;
+        }
     } else {
         val <<= shift;
     }
diff --git a/target-arm/translate.c 

[Qemu-devel] [PATCH 8/8] target-arm: Fix VQRSHL Neon instructions (signed/unsigned 64 bits and signed 32 bits variants).

2011-01-31 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

The addition of the rounding constant could cause overflows.

Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/neon_helper.c |   50 ++---
 1 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c
index 46fcdc4..2f96575 100644
--- a/target-arm/neon_helper.c
+++ b/target-arm/neon_helper.c
@@ -758,7 +758,23 @@ uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
 {
     int8_t shift = (int8_t)shiftop;
     if (shift < 0) {
-        val = (val + (1 << (-1 - shift))) >> -shift;
+        uint64_t round = (uint64_t)1 << (-1 - shift);
+        /* Reduce the range as long as the addition overflows.  It's
+         * sufficient to check if (val+round) is < val
+         * because val and round are > 0.  */
+        while (((val + round) < val) && round > 1) {
+            shift++;
+            round >>= 1;
+            val >>= 1;
+        }
+        if ((val + round) < val) {
+            /* If addition still overflows at this point, it means
+             * that round==1, thus shift==-1, and also that
+             * val==0xFFFFFFFFFFFFFFFF.  */
+            val = 0x8000000000000000ULL;
+        } else {
+            val = (val + round) >> -shift;
+        }
     } else {
         uint64_t tmp = val;
         val <<= shift;
@@ -770,11 +786,15 @@ uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
     return val;
 }
 
+/* The addition of the rounding constant may overflow, so we use an
+ * intermediate 64 bits accumulator, which is really needed only when
+ * dealing with 32 bits input values.  */
 #define NEON_FN(dest, src1, src2) do { \
     int8_t tmp; \
     tmp = (int8_t)src2; \
     if (tmp < 0) { \
-        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
+        int64_t big_dest = ((int64_t)src1 + (1 << (-1 - tmp))); \
+        dest = big_dest >> -tmp; \
     } else { \
         dest = src1 << tmp; \
         if ((dest >> tmp) != src1) { \
@@ -787,19 +807,41 @@ NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
 NEON_VOP_ENV(qrshl_s32, neon_s32, 1)
 #undef NEON_FN
 
+/* Handling addition overflow with 64 bits inputs values is more
+ * tricky than with 32 bits values.  */
 uint64_t HELPER(neon_qrshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
 {
     int8_t shift = (uint8_t)shiftop;
     int64_t val = valop;
 
     if (shift < 0) {
-        val = (val + (1 << (-1 - shift))) >> -shift;
+        int64_t round = (int64_t)1 << (-1 - shift);
+        /* Reduce the range as long as the addition overflows.  It's
+         * sufficient to check if (val+round) is < 0 and val > 0
+         * because round is > 0.  */
+        while ((val > 0) && ((val + round) < 0) && round > 1) {
+            shift++;
+            round >>= 1;
+            val >>= 1;
+        }
+        if ((val > 0) && (val + round) < 0) {
+            /* If addition still overflows at this point, it means
+             * that round==1, thus shift==-1, and also that
+             * val==0x7FFFFFFFFFFFFFFF.  */
+            val = 0x4000000000000000LL;
+        } else {
+            val = (val + round) >> -shift;
+        }
     } else {
         int64_t tmp = val;
         val <<= shift;
         if ((val >> shift) != tmp) {
             SET_QC();
-            val = tmp >> 31;
+            if (tmp < 0) {
+                val = INT64_MIN;
+            } else {
+                val = INT64_MAX;
+            }
         }
     }
     return val;
-- 
1.7.2.3




[Qemu-devel] [PATCH 4/8] target-arm: fiddle decoding of 64 bit shift by imm and narrow

2011-01-31 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

Tweak decoding of the shift-by-imm and narrow 64 bit insns
(VSHRN, VRSHRN, VQSHRN, VQSHRUN, VQRSHRN, VQRSHRUN).

Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/translate.c |   28 ++--
 1 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/target-arm/translate.c b/target-arm/translate.c
index 9ca5b82..a614e34 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -4831,21 +4831,29 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
                 if (size == 3) {
                     neon_load_reg64(cpu_V0, rm + pass);
                     if (q) {
-                        if (u)
-                            gen_helper_neon_rshl_u64(cpu_V0, cpu_V0, tmp64);
-                        else
-                            gen_helper_neon_rshl_s64(cpu_V0, cpu_V0, tmp64);
+                        if ((op == 8 && !u) || (op == 9 && u)) {
+                            gen_helper_neon_rshl_u64(cpu_V0, cpu_V0,
+                                                     tmp64);
+                        } else {
+                            gen_helper_neon_rshl_s64(cpu_V0, cpu_V0,
+                                                     tmp64);
+                        }
                     } else {
-                        if (u)
-                            gen_helper_neon_shl_u64(cpu_V0, cpu_V0, tmp64);
-                        else
-                            gen_helper_neon_shl_s64(cpu_V0, cpu_V0, tmp64);
+                        if ((op == 8 && !u) || (op == 9 && u)) {
+                            gen_helper_neon_shl_u64(cpu_V0, cpu_V0,
+                                                    tmp64);
+                        } else {
+                            gen_helper_neon_shl_s64(cpu_V0, cpu_V0,
+                                                    tmp64);
+                        }
                     }
                 } else {
                     tmp = neon_load_reg(rm + pass, 0);
-                    gen_neon_shift_narrow(size, tmp, tmp2, q, u);
+                    gen_neon_shift_narrow(size, tmp, tmp2, q,
+                                          (op == 8) ? !u : u);
                     tmp3 = neon_load_reg(rm + pass, 1);
-                    gen_neon_shift_narrow(size, tmp3, tmp2, q, u);
+                    gen_neon_shift_narrow(size, tmp3, tmp2, q,
+                                          (op == 8) ? !u : u);
                     tcg_gen_concat_i32_i64(cpu_V0, tmp, tmp3);
                     dead_tmp(tmp);
                     dead_tmp(tmp3);
-- 
1.7.2.3




[Qemu-devel] [PATCH 5/8] target-arm: fix neon vqrshl instruction

2011-01-31 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

Signed-off-by: Juha Riihimäki juha.riihim...@nokia.com
Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/neon_helper.c |   21 ++---
 1 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c
index 891b812..46fcdc4 100644
--- a/target-arm/neon_helper.c
+++ b/target-arm/neon_helper.c
@@ -736,9 +736,24 @@ uint64_t HELPER(neon_qshlu_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
     }} while (0)
 NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
 NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
-NEON_VOP_ENV(qrshl_u32, neon_u32, 1)
 #undef NEON_FN
 
+uint32_t HELPER(neon_qrshl_u32)(CPUState *env, uint32_t val, uint32_t shiftop)
+{
+    int8_t shift = (int8_t)shiftop;
+    if (shift < 0) {
+        val = ((uint64_t)val + (1 << (-1 - shift))) >> -shift;
+    } else {
+        uint32_t tmp = val;
+        val <<= shift;
+        if ((val >> shift) != tmp) {
+            SET_QC();
+            val = ~0;
+        }
+    }
+    return val;
+}
+
 uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
 {
     int8_t shift = (int8_t)shiftop;
@@ -764,7 +779,7 @@ uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
         dest = src1 << tmp; \
         if ((dest >> tmp) != src1) { \
             SET_QC(); \
-            dest = src1 >> 31; \
+            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)) - (src1 > 0 ? 1 : 0); \
         } \
     }} while (0)
 NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
@@ -780,7 +795,7 @@ uint64_t HELPER(neon_qrshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
     if (shift < 0) {
         val = (val + (1 << (-1 - shift))) >> -shift;
     } else {
-        int64_t tmp = val;;
+        int64_t tmp = val;
         val <<= shift;
         if ((val >> shift) != tmp) {
             SET_QC();
-- 
1.7.2.3




[Qemu-devel] [PATCH 2/8] target-arm: Create and use neon_unarrow_sat* helpers

2011-01-31 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

Fix VQMOVUN, improve VQSHRUN and VQRSHRUN.
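
"Unarrow" here means narrowing a signed wide value to an unsigned
result, so saturation has to clamp on both sides.  Illustrative
behaviour for a 32-bit lane narrowed to 16 bits:

    /* neon_unarrow_sat16 behaviour:
     *   -5      -> 0x0000  (negative input, QC set)
     *   0x12345 -> 0xffff  (too large, QC set)
     *   0x1234  -> 0x1234  (in range, QC untouched)
     */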

Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/helpers.h |3 ++
 target-arm/neon_helper.c |   63 ++
 target-arm/translate.c   |   43 ++-
 3 files changed, 96 insertions(+), 13 deletions(-)

diff --git a/target-arm/helpers.h b/target-arm/helpers.h
index b88ebae..8cc6a44 100644
--- a/target-arm/helpers.h
+++ b/target-arm/helpers.h
@@ -295,10 +295,13 @@ DEF_HELPER_3(neon_qrdmulh_s32, i32, env, i32, i32)
 
 DEF_HELPER_1(neon_narrow_u8, i32, i64)
 DEF_HELPER_1(neon_narrow_u16, i32, i64)
+DEF_HELPER_2(neon_unarrow_sat8, i32, env, i64)
 DEF_HELPER_2(neon_narrow_sat_u8, i32, env, i64)
 DEF_HELPER_2(neon_narrow_sat_s8, i32, env, i64)
+DEF_HELPER_2(neon_unarrow_sat16, i32, env, i64)
 DEF_HELPER_2(neon_narrow_sat_u16, i32, env, i64)
 DEF_HELPER_2(neon_narrow_sat_s16, i32, env, i64)
+DEF_HELPER_2(neon_unarrow_sat32, i32, env, i64)
 DEF_HELPER_2(neon_narrow_sat_u32, i32, env, i64)
 DEF_HELPER_2(neon_narrow_sat_s32, i32, env, i64)
 DEF_HELPER_1(neon_narrow_high_u8, i32, i64)
diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c
index 6c832b4..891b812 100644
--- a/target-arm/neon_helper.c
+++ b/target-arm/neon_helper.c
@@ -1005,6 +1005,33 @@ uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
 }
 
+uint32_t HELPER(neon_unarrow_sat8)(CPUState *env, uint64_t x)
+{
+    uint16_t s;
+    uint8_t d;
+    uint32_t res = 0;
+#define SAT8(n) \
+    s = x >> n; \
+    if (s & 0x8000) { \
+        SET_QC(); \
+    } else { \
+        if (s > 0xff) { \
+            d = 0xff; \
+            SET_QC(); \
+        } else  { \
+            d = s; \
+        } \
+        res |= (uint32_t)d << (n / 2); \
+    }
+
+    SAT8(0);
+    SAT8(16);
+    SAT8(32);
+    SAT8(48);
+#undef SAT8
+    return res;
+}
+
 uint32_t HELPER(neon_narrow_sat_u8)(CPUState *env, uint64_t x)
 {
 uint16_t s;
@@ -1051,6 +1078,29 @@ uint32_t HELPER(neon_narrow_sat_s8)(CPUState *env, uint64_t x)
 return res;
 }
 
+uint32_t HELPER(neon_unarrow_sat16)(CPUState *env, uint64_t x)
+{
+    uint32_t high;
+    uint32_t low;
+    low = x;
+    if (low & 0x80000000) {
+        low = 0;
+        SET_QC();
+    } else if (low > 0xffff) {
+        low = 0xffff;
+        SET_QC();
+    }
+    high = x >> 32;
+    if (high & 0x80000000) {
+        high = 0;
+        SET_QC();
+    } else if (high > 0xffff) {
+        high = 0xffff;
+        SET_QC();
+    }
+    return low | (high << 16);
+}
+
 uint32_t HELPER(neon_narrow_sat_u16)(CPUState *env, uint64_t x)
 {
 uint32_t high;
@@ -1085,6 +1135,19 @@ uint32_t HELPER(neon_narrow_sat_s16)(CPUState *env, uint64_t x)
     return (uint16_t)low | (high << 16);
 }
 
+uint32_t HELPER(neon_unarrow_sat32)(CPUState *env, uint64_t x)
+{
+    if (x & 0x8000000000000000ull) {
+        SET_QC();
+        return 0;
+    }
+    if (x > 0xffffffffu) {
+        SET_QC();
+        return 0xffffffffu;
+    }
+    return x;
+}
+
 uint32_t HELPER(neon_narrow_sat_u32)(CPUState *env, uint64_t x)
 {
     if (x > 0xffffffffu) {
diff --git a/target-arm/translate.c b/target-arm/translate.c
index b44f7a1..6dd024d 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -4071,6 +4071,16 @@ static inline void gen_neon_narrow_satu(int size, TCGv dest, TCGv_i64 src)
     }
 }
 
+static inline void gen_neon_unarrow_sats(int size, TCGv dest, TCGv_i64 src)
+{
+    switch(size) {
+    case 0: gen_helper_neon_unarrow_sat8(dest, cpu_env, src); break;
+    case 1: gen_helper_neon_unarrow_sat16(dest, cpu_env, src); break;
+    case 2: gen_helper_neon_unarrow_sat32(dest, cpu_env, src); break;
+    default: abort();
+    }
+}
+
 static inline void gen_neon_shift_narrow(int size, TCGv var, TCGv shift,
  int q, int u)
 {
@@ -4841,13 +4851,14 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
                     dead_tmp(tmp3);
                 }
                 tmp = new_tmp();
-                if (op == 8 && !u) {
-                    gen_neon_narrow(size - 1, tmp, cpu_V0);
-                } else {
-                    if (op == 8)
-                        gen_neon_narrow_sats(size - 1, tmp, cpu_V0);
-                    else
-                        gen_neon_narrow_satu(size - 1, tmp, cpu_V0);
+                if (op == 8) {
+                    if (u) { /* VQSHRUN / VQRSHRUN */
+                        gen_neon_unarrow_sats(size - 1, tmp, cpu_V0);
+                    } else { /* VSHRN / VRSHRN */
+                        gen_neon_narrow(size - 1, tmp, cpu_V0);
+                    }
+                } else { /* VQSHRN / VQRSHRN */
+                    gen_neon_narrow_satu(size - 1, tmp, cpu_V0);
                 }
                 neon_store_reg(rd, 

[Qemu-devel] [PATCH 3/8] target-arm: VQRSHRN related changes

2011-01-31 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

More fixes for VQSHRN and VQSHRUN.

Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/translate.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/target-arm/translate.c b/target-arm/translate.c
index 6dd024d..9ca5b82 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -4101,8 +4101,8 @@ static inline void gen_neon_shift_narrow(int size, TCGv var, TCGv shift,
     } else {
         if (u) {
             switch (size) {
-            case 1: gen_helper_neon_rshl_u16(var, var, shift); break;
-            case 2: gen_helper_neon_rshl_u32(var, var, shift); break;
+            case 1: gen_helper_neon_shl_u16(var, var, shift); break;
+            case 2: gen_helper_neon_shl_u32(var, var, shift); break;
             default: abort();
             }
         } else {
-- 
1.7.2.3




[Qemu-devel] [PATCH 0/8] target-arm: Fix Neon instructions VQMOVUN VQRSHL VQRSHRN VQRSHRUN VQSHRN VQSHRUN VSLI VSRI

2011-01-28 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

This patchset combines fixes from the Meego tree (Peter Maydell, Juha
Riihimäki) and my own fixes such that ARM Neon instructions VQMOVUN
VQRSHL VQRSHRN VQRSHRUN VQSHRN VQSHRUN VSLI VSRI now pass all my
tests.

Christophe Lyon (3):
  Fixes for several shift instructions: VRSHL, VRSHR, VRSHRN, VSHLL,
VRSRA.
  target-arm: Fix Neon VQ(R)SHRN instructions.
  target-arm: Fix VQRSHL Neon instructions (signed/unsigned 64 bits and
signed 32 bits variants).

Juha Riihimäki (1):
  target-arm: fix neon vqrshl instruction

Peter Maydell (4):
  Create and use neon_unarrow_sat* helpers
  VQRSHRN related changes
  fiddle decoding of 64 bit shift by imm and narrow
  implement vsli.64, vsri.64

 target-arm/helpers.h |3 +
 target-arm/neon_helper.c |  195 ++
 target-arm/translate.c   |  103 ++---
 3 files changed, 257 insertions(+), 44 deletions(-)

-- 
1.7.2.3




[Qemu-devel] [PATCH 7/8] implement vsli.64, vsri.64

2011-01-28 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

Signed-off-by: Peter Maydell peter.mayd...@linaro.org
Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/translate.c |   11 ++-
 1 files changed, 10 insertions(+), 1 deletions(-)

diff --git a/target-arm/translate.c b/target-arm/translate.c
index 3b14b8f..984df08 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -4711,7 +4711,16 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
                     tcg_gen_add_i64(cpu_V0, cpu_V0, cpu_V1);
                 } else if (op == 4 || (op == 5 && u)) {
                     /* Insert */
-                    cpu_abort(env, "VS[LR]I.64 not implemented");
+                    neon_load_reg64(cpu_V1, rd + pass);
+                    uint64_t mask;
+                    if (op == 4) {
+                        mask = 0xffffffffffffffffull >> -shift;
+                    } else {
+                        mask = 0xffffffffffffffffull << shift;
+                    }
+                    tcg_gen_andi_i64(cpu_V0, cpu_V0, mask);
+                    tcg_gen_andi_i64(cpu_V1, cpu_V1, ~mask);
+                    tcg_gen_or_i64(cpu_V0, cpu_V0, cpu_V1);
                 }
                 neon_store_reg64(cpu_V0, rd + pass);
             } else { /* size < 3 */
-- 
1.7.2.3




[Qemu-devel] [PATCH 1/8] target-arm: Fixes for several shift instructions: VRSHL, VRSHR, VRSHRN, VSHLL, VRSRA.

2011-01-28 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

Handle corner cases where the addition of the rounding constant could
cause overflows.

Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/neon_helper.c |   61 ++---
 target-arm/translate.c   |   17 ++--
 2 files changed, 65 insertions(+), 13 deletions(-)

diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c
index bf29bbe..5971275 100644
--- a/target-arm/neon_helper.c
+++ b/target-arm/neon_helper.c
@@ -540,6 +540,9 @@ uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t 
shiftop)
 return val;
 }
 
+/* The addition of the rounding constant may overflow, so we use an
+ * intermediate 64 bits accumulator, which is really needed only when
+ * dealing with 32 bits input values.  */
 #define NEON_FN(dest, src1, src2) do { \
 int8_t tmp; \
 tmp = (int8_t)src2; \
@@ -548,11 +551,12 @@ uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t 
shiftop)
 } else if (tmp  -(ssize_t)sizeof(src1) * 8) { \
 dest = src1  (sizeof(src1) * 8 - 1); \
 } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
-dest = src1  (tmp - 1); \
+dest = src1  (-tmp - 1); \
 dest++; \
 dest = 1; \
 } else if (tmp  0) { \
-dest = (src1 + (1  (-1 - tmp)))  -tmp; \
+int64_t big_dest = ((int64_t)src1 + (1  (-1 - tmp))); \
+dest = big_dest  -tmp; \
 } else { \
 dest = src1  tmp; \
 }} while (0)
@@ -561,6 +565,8 @@ NEON_VOP(rshl_s16, neon_s16, 2)
 NEON_VOP(rshl_s32, neon_s32, 1)
 #undef NEON_FN
 
+/* Handling addition overflow with 64 bits inputs values is more
+ * tricky than with 32 bits values.  */
 uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
 {
 int8_t shift = (int8_t)shiftop;
@@ -569,18 +575,37 @@ uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t 
shiftop)
 val = 0;
 } else if (shift  -64) {
 val = 63;
-} else if (shift == -63) {
+} else if (shift == -64) {
 val = 63;
 val++;
 val = 1;
 } else if (shift  0) {
-val = (val + ((int64_t)1  (-1 - shift)))  -shift;
+int64_t round = (int64_t)1  (-1 - shift);
+/* Reduce the range as long as the addition overflows.  It's
+ * sufficient to check if (val+round) is  0 and val  0
+ * because round is  0.  */
+while ((val  0)  ((val + round)  0)  round  1) {
+shift++;
+round = 1;
+val = 1;
+}
+if ((val  0)  (val + round)  0) {
+/* If addition still overflows at this point, it means
+ * that round==1, thus shift==-1, and also that
+ * val==0x7FFF.  */
+val = 0x4000LL;
+} else {
+val = (val + round)  -shift;
+}
 } else {
 val = shift;
 }
 return val;
 }
 
+/* The addition of the rounding constant may overflow, so we use an
+ * intermediate 64 bits accumulator, which is really needed only when
+ * dealing with 32 bits input values.  */
 #define NEON_FN(dest, src1, src2) do { \
 int8_t tmp; \
 tmp = (int8_t)src2; \
@@ -588,9 +613,10 @@ uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t 
shiftop)
 tmp  -(ssize_t)sizeof(src1) * 8) { \
 dest = 0; \
 } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
-dest = src1  (tmp - 1); \
+dest = src1  (-tmp - 1); \
 } else if (tmp  0) { \
-dest = (src1 + (1  (-1 - tmp)))  -tmp; \
+uint64_t big_dest = ((uint64_t)src1 + (1  (-1 - tmp))); \
+dest = big_dest  -tmp; \
 } else { \
 dest = src1  tmp; \
 }} while (0)
@@ -602,14 +628,29 @@ NEON_VOP(rshl_u32, neon_u32, 1)
 uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
 {
     int8_t shift = (uint8_t)shiftop;
-    if (shift >= 64 || shift < 64) {
+    if (shift >= 64 || shift < -64) {
         val = 0;
     } else if (shift == -64) {
         /* Rounding a 1-bit result just preserves that bit.  */
         val >>= 63;
-    } if (shift < 0) {
-        val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;
-        val >>= -shift;
+    } else if (shift < 0) {
+        uint64_t round = (uint64_t)1 << (-1 - shift);
+        /* Reduce the range as long as the addition overflows.  It's
+         * sufficient to check if (val + round) is < val,
+         * because val and round are > 0.  */
+        while (((val + round) < val) && round > 1) {
+            shift++;
+            round >>= 1;
+            val >>= 1;
+        }
+        if ((val + round) < val) {
+            /* If addition still overflows at this point, it means
+             * that round == 1, thus shift == -1, and also that
+             * val == 0xFFFFFFFFFFFFFFFF.  */
+            val = 0x8000000000000000LL;
+        } else {
+            val = (val + round) >> -shift;
+        }
     } else {
         val <<= shift;
     }
diff --git a/target-arm/translate.c b/target-arm/translate.c

[Qemu-devel] [PATCH 4/8] target-arm: fiddle decoding of 64 bit shift by imm and narrow

2011-01-28 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

Tweak decoding of the shift-by-imm and narrow 64-bit insns
(VSHRN, VRSHRN, VQSHRN, VQSHRUN, VQRSHRN, VQRSHRUN).
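
As a reading aid (an editorial sketch, not part of the patch; the
function name is invented), the signedness choice that the new
condition in this patch encodes for the wide 64-bit shift is:

    /* Whether the 64-bit input should be shifted as an unsigned
     * value, as a function of (op, u), per the condition below. */
    static int wide_shift_is_unsigned(int op, int u)
    {
        /* op == 8: VSHRN/VRSHRN (u = 0, plain narrow of an unsigned
         *          wide shift) or VQSHRUN/VQRSHRUN (u = 1, signed
         *          input saturated to unsigned);
         * op == 9: VQSHRN/VQRSHRN, where u selects the unsigned
         *          variant. */
        return (op == 8 && !u) || (op == 9 && u);
    }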

Signed-off-by: Peter Maydell peter.mayd...@linaro.org
Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/translate.c |   28 ++--
 1 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/target-arm/translate.c b/target-arm/translate.c
index 3537698..452cb71 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -4842,21 +4842,29 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
                 if (size == 3) {
                     neon_load_reg64(cpu_V0, rm + pass);
                     if (q) {
-                      if (u)
-                        gen_helper_neon_rshl_u64(cpu_V0, cpu_V0, tmp64);
-                      else
-                        gen_helper_neon_rshl_s64(cpu_V0, cpu_V0, tmp64);
+                        if ((op == 8 && !u) || (op == 9 && u)) {
+                            gen_helper_neon_rshl_u64(cpu_V0, cpu_V0,
+                                                     tmp64);
+                        } else {
+                            gen_helper_neon_rshl_s64(cpu_V0, cpu_V0,
+                                                     tmp64);
+                        }
                     } else {
-                      if (u)
-                        gen_helper_neon_shl_u64(cpu_V0, cpu_V0, tmp64);
-                      else
-                        gen_helper_neon_shl_s64(cpu_V0, cpu_V0, tmp64);
+                        if ((op == 8 && !u) || (op == 9 && u)) {
+                            gen_helper_neon_shl_u64(cpu_V0, cpu_V0,
+                                                    tmp64);
+                        } else {
+                            gen_helper_neon_shl_s64(cpu_V0, cpu_V0,
+                                                    tmp64);
+                        }
                     }
                 } else {
                     tmp = neon_load_reg(rm + pass, 0);
-                    gen_neon_shift_narrow(size, tmp, tmp2, q, u);
+                    gen_neon_shift_narrow(size, tmp, tmp2, q,
+                                          (op == 8) ? !u : u);
                     tmp3 = neon_load_reg(rm + pass, 1);
-                    gen_neon_shift_narrow(size, tmp3, tmp2, q, u);
+                    gen_neon_shift_narrow(size, tmp3, tmp2, q,
+                                          (op == 8) ? !u : u);
                     tcg_gen_concat_i32_i64(cpu_V0, tmp, tmp3);
                     dead_tmp(tmp);
                     dead_tmp(tmp3);
-- 
1.7.2.3




[Qemu-devel] [PATCH 6/8] target-arm: Fix Neon VQ(R)SHRN instructions.

2011-01-28 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

Handle unsigned variant of VQ(R)SHRN instructions.
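
For reference, a scalar editorial sketch of the difference the u bit
selects here (illustrative names, not the actual helpers; 32-to-16
case):

    #include <stdint.h>

    /* Unsigned saturation: clamp to [0, 0xffff] and flag QC. */
    static uint16_t sat_u16(uint32_t x, int *qc)
    {
        if (x > 0xffff) {
            *qc = 1;
            return 0xffff;
        }
        return (uint16_t)x;
    }

    /* Signed saturation: clamp to [-0x8000, 0x7fff] and flag QC. */
    static int16_t sat_s16(int32_t x, int *qc)
    {
        if (x > 0x7fff) {
            *qc = 1;
            return 0x7fff;
        }
        if (x < -0x8000) {
            *qc = 1;
            return -0x8000;
        }
        return (int16_t)x;
    }

Before this patch, both variants went through the unsigned path.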

Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/translate.c |8 ++--
 1 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/target-arm/translate.c b/target-arm/translate.c
index 452cb71..3b14b8f 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -4876,8 +4876,12 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
 } else { /* VSHRN / VRSHRN */
 gen_neon_narrow(size - 1, tmp, cpu_V0);
 }
-} else { /* VQSHRN / VQRSHRN */
-gen_neon_narrow_satu(size - 1, tmp, cpu_V0);
+} else {
+if (u) { /* VQSHRUN / VQRSHRUN */
+gen_neon_narrow_satu(size - 1, tmp, cpu_V0);
+} else { /* VQSHRN / VQRSHRN */
+gen_neon_narrow_sats(size - 1, tmp, cpu_V0);
+}
 }
 neon_store_reg(rd, pass, tmp);
 } /* for pass */
-- 
1.7.2.3




[Qemu-devel] [PATCH 5/8] target-arm: fix neon vqrshl instruction

2011-01-28 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

Signed-off-by: Juha Riihimäki juha.riihim...@nokia.com
Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/neon_helper.c |   21 ++---
 1 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c
index 71e3c74..3337c52 100644
--- a/target-arm/neon_helper.c
+++ b/target-arm/neon_helper.c
@@ -825,9 +825,24 @@ uint64_t HELPER(neon_qshlu_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
 }} while (0)
 NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
 NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
-NEON_VOP_ENV(qrshl_u32, neon_u32, 1)
 #undef NEON_FN
 
+uint32_t HELPER(neon_qrshl_u32)(CPUState *env, uint32_t val, uint32_t shiftop)
+{
+    int8_t shift = (int8_t)shiftop;
+    if (shift < 0) {
+        val = ((uint64_t)val + (1 << (-1 - shift))) >> -shift;
+    } else {
+        uint32_t tmp = val;
+        val <<= shift;
+        if ((val >> shift) != tmp) {
+            SET_QC();
+            val = ~0;
+        }
+    }
+    return val;
+}
+
 uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
 {
 int8_t shift = (int8_t)shiftop;
@@ -853,7 +868,7 @@ uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
         dest = src1 << tmp; \
         if ((dest >> tmp) != src1) { \
             SET_QC(); \
-            dest = src1 >> 31; \
+            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)) - (src1 > 0 ? 1 : 0); \
         } \
     }} while (0)
 NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
@@ -869,7 +884,7 @@ uint64_t HELPER(neon_qrshl_s64)(CPUState *env, uint64_t 
valop, uint64_t shiftop)
 if (shift  0) {
 val = (val + (1  (-1 - shift)))  -shift;
 } else {
-int64_t tmp = val;;
+int64_t tmp = val;
 val = shift;
 if ((val  shift) != tmp) {
 SET_QC();
-- 
1.7.2.3




[Qemu-devel] [PATCH 3/8] target-arm: VQRSHRN related changes

2011-01-28 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

More fixes for VQSHRN and VQSHRUN.
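
As a reading aid, assuming (as the surrounding code suggests) that the
q argument of gen_neon_shift_narrow selects the rounding variants,
the intended helper dispatch that this hunk restores is, in editorial
sketch form:

    /* Intended helper family per (round, unsigned); before this fix
     * the non-rounding unsigned path called the rounding helpers. */
    enum shift_helper { SHL_S, SHL_U, RSHL_S, RSHL_U };

    static enum shift_helper pick_helper(int round, int u)
    {
        if (round) {
            return u ? RSHL_U : RSHL_S;
        }
        return u ? SHL_U : SHL_S;  /* was RSHL_U when u was set */
    }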

Signed-off-by: Peter Maydell peter.mayd...@linaro.org
Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/translate.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/target-arm/translate.c b/target-arm/translate.c
index cda5a73..3537698 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -4108,8 +4108,8 @@ static inline void gen_neon_shift_narrow(int size, TCGv var, TCGv shift,
 } else {
 if (u) {
 switch (size) {
-case 1: gen_helper_neon_rshl_u16(var, var, shift); break;
-case 2: gen_helper_neon_rshl_u32(var, var, shift); break;
+case 1: gen_helper_neon_shl_u16(var, var, shift); break;
+case 2: gen_helper_neon_shl_u32(var, var, shift); break;
 default: abort();
 }
 } else {
-- 
1.7.2.3




[Qemu-devel] [PATCH 2/8] target-arm: Create and use neon_unarrow_sat* helpers

2011-01-28 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

Fix VQMOVUN, improve VQSHRUN and VQRSHRUN.
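
A scalar model of what "unarrow" means here (editorial sketch with an
illustrative name, 32-to-16 case): a signed wide value is saturated to
an unsigned narrower type, so negative inputs clamp to 0 and set QC.

    #include <stdint.h>

    static uint16_t unarrow_sat_s32_to_u16(int32_t x, int *qc)
    {
        if (x < 0) {
            *qc = 1;      /* negative input saturates to 0 */
            return 0;
        }
        if (x > 0xffff) {
            *qc = 1;      /* too large, clamp to the unsigned max */
            return 0xffff;
        }
        return (uint16_t)x;
    }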

Signed-off-by: Peter Maydell peter.mayd...@linaro.org
Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/helpers.h |3 ++
 target-arm/neon_helper.c |   63 ++
 target-arm/translate.c   |   43 ++-
 3 files changed, 96 insertions(+), 13 deletions(-)

diff --git a/target-arm/helpers.h b/target-arm/helpers.h
index 8a2564e..4d0de00 100644
--- a/target-arm/helpers.h
+++ b/target-arm/helpers.h
@@ -299,10 +299,13 @@ DEF_HELPER_3(neon_qrdmulh_s32, i32, env, i32, i32)
 
 DEF_HELPER_1(neon_narrow_u8, i32, i64)
 DEF_HELPER_1(neon_narrow_u16, i32, i64)
+DEF_HELPER_2(neon_unarrow_sat8, i32, env, i64)
 DEF_HELPER_2(neon_narrow_sat_u8, i32, env, i64)
 DEF_HELPER_2(neon_narrow_sat_s8, i32, env, i64)
+DEF_HELPER_2(neon_unarrow_sat16, i32, env, i64)
 DEF_HELPER_2(neon_narrow_sat_u16, i32, env, i64)
 DEF_HELPER_2(neon_narrow_sat_s16, i32, env, i64)
+DEF_HELPER_2(neon_unarrow_sat32, i32, env, i64)
 DEF_HELPER_2(neon_narrow_sat_u32, i32, env, i64)
 DEF_HELPER_2(neon_narrow_sat_s32, i32, env, i64)
 DEF_HELPER_1(neon_narrow_high_u8, i32, i64)
diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c
index 5971275..71e3c74 100644
--- a/target-arm/neon_helper.c
+++ b/target-arm/neon_helper.c
@@ -1094,6 +1094,33 @@ uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
 }
 
+uint32_t HELPER(neon_unarrow_sat8)(CPUState *env, uint64_t x)
+{
+    uint16_t s;
+    uint8_t d;
+    uint32_t res = 0;
+#define SAT8(n) \
+    s = x >> n; \
+    if (s & 0x8000) { \
+        SET_QC(); \
+    } else { \
+        if (s > 0xff) { \
+            d = 0xff; \
+            SET_QC(); \
+        } else { \
+            d = s; \
+        } \
+        res |= (uint32_t)d << (n / 2); \
+    }
+
+    SAT8(0);
+    SAT8(16);
+    SAT8(32);
+    SAT8(48);
+#undef SAT8
+    return res;
+}
+
 uint32_t HELPER(neon_narrow_sat_u8)(CPUState *env, uint64_t x)
 {
 uint16_t s;
@@ -1140,6 +1167,29 @@ uint32_t HELPER(neon_narrow_sat_s8)(CPUState *env, uint64_t x)
 return res;
 }
 
+uint32_t HELPER(neon_unarrow_sat16)(CPUState *env, uint64_t x)
+{
+    uint32_t high;
+    uint32_t low;
+    low = x;
+    if (low & 0x80000000) {
+        low = 0;
+        SET_QC();
+    } else if (low > 0xffff) {
+        low = 0xffff;
+        SET_QC();
+    }
+    high = x >> 32;
+    if (high & 0x80000000) {
+        high = 0;
+        SET_QC();
+    } else if (high > 0xffff) {
+        high = 0xffff;
+        SET_QC();
+    }
+    return low | (high << 16);
+}
+
 uint32_t HELPER(neon_narrow_sat_u16)(CPUState *env, uint64_t x)
 {
 uint32_t high;
@@ -1174,6 +1224,19 @@ uint32_t HELPER(neon_narrow_sat_s16)(CPUState *env, uint64_t x)
     return (uint16_t)low | (high << 16);
 }
 
+uint32_t HELPER(neon_unarrow_sat32)(CPUState *env, uint64_t x)
+{
+    if (x & 0x8000000000000000ull) {
+        SET_QC();
+        return 0;
+    }
+    if (x > 0xffffffffu) {
+        SET_QC();
+        return 0xffffffffu;
+    }
+    return x;
+}
+
 uint32_t HELPER(neon_narrow_sat_u32)(CPUState *env, uint64_t x)
 {
     if (x > 0xffffffffu) {
diff --git a/target-arm/translate.c b/target-arm/translate.c
index b14fa4b..cda5a73 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -4078,6 +4078,16 @@ static inline void gen_neon_narrow_satu(int size, TCGv dest, TCGv_i64 src)
 }
 }
 
+static inline void gen_neon_unarrow_sats(int size, TCGv dest, TCGv_i64 src)
+{
+switch(size) {
+case 0: gen_helper_neon_unarrow_sat8(dest, cpu_env, src); break;
+case 1: gen_helper_neon_unarrow_sat16(dest, cpu_env, src); break;
+case 2: gen_helper_neon_unarrow_sat32(dest, cpu_env, src); break;
+default: abort();
+}
+}
+
 static inline void gen_neon_shift_narrow(int size, TCGv var, TCGv shift,
  int q, int u)
 {
@@ -4852,13 +4862,14 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn)
 dead_tmp(tmp3);
 }
 tmp = new_tmp();
-if (op == 8 && !u) {
-gen_neon_narrow(size - 1, tmp, cpu_V0);
-} else {
-if (op == 8)
-gen_neon_narrow_sats(size - 1, tmp, cpu_V0);
-else
-gen_neon_narrow_satu(size - 1, tmp, cpu_V0);
+if (op == 8) {
+if (u) { /* VQSHRUN / VQRSHRUN */
+gen_neon_unarrow_sats(size - 1, tmp, cpu_V0);
+} else { /* VSHRN / VRSHRN */
+gen_neon_narrow(size - 1, tmp, cpu_V0);
+}
+} else { /* VQSHRN / VQRSHRN */
+gen_neon_narrow_satu(size - 1, tmp, cpu_V0);
 

[Qemu-devel] [PATCH 8/8] target-arm: Fix VQRSHL Neon instructions (signed/unsigned 64-bit and signed 32-bit variants).

2011-01-28 Thread christophe.lyon
From: Christophe Lyon christophe.l...@st.com

The addition of the rounding constant could cause overflows.
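
For illustration (a standalone editorial sketch, not the patch itself;
it assumes -64 < shift < 0, since the helpers handle shift == -64
separately), the range-reduction idea for the unsigned 64-bit case:

    #include <stdint.h>

    static uint64_t rshift_round_u64(uint64_t val, int shift)
    {
        uint64_t round = (uint64_t)1 << (-1 - shift);
        /* While val + round wraps, halve val and round and shift one
         * bit less; the dropped low bit cannot carry into the bits
         * that survive the final shift, so the result is unchanged. */
        while ((val + round) < val && round > 1) {
            shift++;
            round >>= 1;
            val >>= 1;
        }
        if ((val + round) < val) {
            /* Still wraps: round == 1, shift == -1, val == UINT64_MAX. */
            return 0x8000000000000000ull;
        }
        return (val + round) >> -shift;
    }

For example, rshift_round_u64(UINT64_MAX, -1) returns 1ull << 63, the
correctly rounded half of UINT64_MAX, where the pre-patch expression
(val + round) >> -shift would wrap to 0.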

Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/neon_helper.c |   50 ++---
 1 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c
index 3337c52..9faa348 100644
--- a/target-arm/neon_helper.c
+++ b/target-arm/neon_helper.c
@@ -847,7 +847,23 @@ uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
 {
     int8_t shift = (int8_t)shiftop;
     if (shift < 0) {
-        val = (val + (1 << (-1 - shift))) >> -shift;
+        uint64_t round = (uint64_t)1 << (-1 - shift);
+        /* Reduce the range as long as the addition overflows.  It's
+         * sufficient to check if (val + round) is < val,
+         * because val and round are > 0.  */
+        while (((val + round) < val) && round > 1) {
+            shift++;
+            round >>= 1;
+            val >>= 1;
+        }
+        if ((val + round) < val) {
+            /* If addition still overflows at this point, it means
+             * that round == 1, thus shift == -1, and also that
+             * val == 0xFFFFFFFFFFFFFFFF.  */
+            val = 0x8000000000000000LL;
+        } else {
+            val = (val + round) >> -shift;
+        }
     } else {
         uint64_t tmp = val;
         val <<= shift;
@@ -859,11 +875,15 @@ uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
 return val;
 }
 
+/* The addition of the rounding constant may overflow, so we use an
+ * intermediate 64-bit accumulator, which is really needed only when
+ * dealing with 32-bit input values.  */
 #define NEON_FN(dest, src1, src2) do { \
     int8_t tmp; \
     tmp = (int8_t)src2; \
     if (tmp < 0) { \
-        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
+        int64_t big_dest = ((int64_t)src1 + (1 << (-1 - tmp))); \
+        dest = big_dest >> -tmp; \
     } else { \
         dest = src1 << tmp; \
         if ((dest >> tmp) != src1) { \
@@ -876,19 +896,41 @@ NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
 NEON_VOP_ENV(qrshl_s32, neon_s32, 1)
 #undef NEON_FN
 
+/* Handling addition overflow with 64-bit input values is more
+ * tricky than with 32-bit values.  */
 uint64_t HELPER(neon_qrshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
 {
 int8_t shift = (uint8_t)shiftop;
 int64_t val = valop;
 
     if (shift < 0) {
-        val = (val + (1 << (-1 - shift))) >> -shift;
+        int64_t round = (int64_t)1 << (-1 - shift);
+        /* Reduce the range as long as the addition overflows.  It's
+         * sufficient to check if (val + round) is < 0 and val > 0,
+         * because round is > 0.  */
+        while ((val > 0) && ((val + round) < 0) && round > 1) {
+            shift++;
+            round >>= 1;
+            val >>= 1;
+        }
+        if ((val > 0) && (val + round) < 0) {
+            /* If addition still overflows at this point, it means
+             * that round == 1, thus shift == -1, and also that
+             * val == 0x7FFFFFFFFFFFFFFF.  */
+            val = 0x4000000000000000LL;
+        } else {
+            val = (val + round) >> -shift;
+        }
     } else {
         int64_t tmp = val;
         val <<= shift;
         if ((val >> shift) != tmp) {
             SET_QC();
-            val = tmp >> 31;
+            if (tmp < 0) {
+                val = INT64_MIN;
+            } else {
+                val = INT64_MAX;
+            }
 }
 }
 return val;
-- 
1.7.2.3