[Qemu-devel] [PATCH v2 0/2] ARM: fix Neon vrecpe instruction.
From: Christophe Lyon christophe.l...@st.com These 2 patches fix the ARM Neon vrecpe instruction by matching the algorithm described in the ARM ARM. With these patches, qemu passes my ARM/Neon tests. Patch #1 modifies softfloat by exporting float32_nan and float32_infinity. For consistency, I have also moved all the target-dependent definitions of floatXX_default_nan to softfloat.h (i.e. the 16, 64, x80 and 128 bits versions in addition to the 32 bits ones). Patch #2 uses these newly exported values and uses the vrecpe algorithm described in the ARM ARM. Christophe Lyon (2): softfloat: export float32_nan and float32_infinity. target-arm: fix support for vrecpe. fpu/softfloat-specialize.h | 68 --- fpu/softfloat.h| 71 + target-arm/helper.c| 84 +-- 3 files changed, 143 insertions(+), 80 deletions(-) -- 1.7.2.3
[Qemu-devel] [PATCH 1/2] softfloat: export float32_nan and float32_infinity.
From: Christophe Lyon christophe.l...@st.com These two special values are needed to implement some helper functions, which return these values in some cases. This patch also moves the definitions of default_nan for 16, 64, x80 and 128 bits floats for consistency with float32. Signed-off-by: Christophe Lyon christophe.l...@st.com --- fpu/softfloat-specialize.h | 68 -- fpu/softfloat.h| 71 2 files changed, 71 insertions(+), 68 deletions(-) diff --git a/fpu/softfloat-specialize.h b/fpu/softfloat-specialize.h index 2d025bf..adc5ada 100644 --- a/fpu/softfloat-specialize.h +++ b/fpu/softfloat-specialize.h @@ -30,12 +30,6 @@ these four paragraphs for those parts of this code that are retained. =*/ -#if defined(TARGET_MIPS) || defined(TARGET_SH4) -#define SNAN_BIT_IS_ONE1 -#else -#define SNAN_BIT_IS_ONE0 -#endif - /* | Raises the exceptions specified by `flags'. Floating-point traps can be | defined here if desired. It is currently not possible for such a trap @@ -57,17 +51,6 @@ typedef struct { } commonNaNT; /* -| The pattern for a default generated half-precision NaN. -**/ -#if defined(TARGET_ARM) -#define float16_default_nan make_float16(0x7E00) -#elif SNAN_BIT_IS_ONE -#define float16_default_nan make_float16(0x7DFF) -#else -#define float16_default_nan make_float16(0xFE00) -#endif - -/* | Returns 1 if the half-precision floating-point value `a' is a quiet | NaN; otherwise returns 0. **/ @@ -158,19 +141,6 @@ static float16 commonNaNToFloat16(commonNaNT a STATUS_PARAM) } /* -| The pattern for a default generated single-precision NaN. 
-**/ -#if defined(TARGET_SPARC) -#define float32_default_nan make_float32(0x7FFF) -#elif defined(TARGET_PPC) || defined(TARGET_ARM) || defined(TARGET_ALPHA) -#define float32_default_nan make_float32(0x7FC0) -#elif SNAN_BIT_IS_ONE -#define float32_default_nan make_float32(0x7FBF) -#else -#define float32_default_nan make_float32(0xFFC0) -#endif - -/* | Returns 1 if the single-precision floating-point value `a' is a quiet | NaN; otherwise returns 0. **/ @@ -413,19 +383,6 @@ static float32 propagateFloat32NaN( float32 a, float32 b STATUS_PARAM) } /* -| The pattern for a default generated double-precision NaN. -**/ -#if defined(TARGET_SPARC) -#define float64_default_nan make_float64(LIT64( 0x7FFF )) -#elif defined(TARGET_PPC) || defined(TARGET_ARM) || defined(TARGET_ALPHA) -#define float64_default_nan make_float64(LIT64( 0x7FF8 )) -#elif SNAN_BIT_IS_ONE -#define float64_default_nan make_float64(LIT64( 0x7FF7 )) -#else -#define float64_default_nan make_float64(LIT64( 0xFFF8 )) -#endif - -/* | Returns 1 if the double-precision floating-point value `a' is a quiet | NaN; otherwise returns 0. **/ @@ -564,19 +521,6 @@ static float64 propagateFloat64NaN( float64 a, float64 b STATUS_PARAM) #ifdef FLOATX80 /* -| The pattern for a default generated extended double-precision NaN. The -| `high' and `low' values hold the most- and least-significant bits, -| respectively. -**/ -#if SNAN_BIT_IS_ONE -#define floatx80_default_nan_high 0x7FFF -#define floatx80_default_nan_low LIT64( 0xBFFF ) -#else -#define floatx80_default_nan_high 0x -#define floatx80_default_nan_low LIT64( 0xC000 ) -#endif - -/* | Returns 1 if the extended double-precision floating-point value `a' is a | quiet NaN; otherwise returns 0. This slightly differs from the same | function for other types as floatx80 has an explicit bit. @@ -728,18 +672,6 @@ static floatx80 propagateFloatx80NaN( floatx80
[Qemu-devel] [PATCH 2/2] target-arm: fix support for vrecpe.
From: Christophe Lyon christophe.l...@st.com Now use the same algorithm as described in the ARM ARM. Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/helper.c | 84 +++--- 1 files changed, 72 insertions(+), 12 deletions(-) diff --git a/target-arm/helper.c b/target-arm/helper.c index 7f63a28..a17df42 100644 --- a/target-arm/helper.c +++ b/target-arm/helper.c @@ -2687,13 +2687,68 @@ float32 HELPER(rsqrts_f32)(float32 a, float32 b, CPUState *env) /* NEON helpers. */ -/* TODO: The architecture specifies the value that the estimate functions - should return. We return the exact reciprocal/root instead. */ +/* The algorithm that must be used to calculate the estimate + * is specified by the ARM ARM. + */ +static float64 recip_estimate(float64 a, CPUState *env) +{ +float_status *s = env-vfp.standard_fp_status; +float64 one = int64_to_float64(1, s); +/* q = (int)(a * 512.0) */ +float64 x512 = int64_to_float64(512, s); +float64 q = float64_mul(x512, a, s); +int64_t q_int = float64_to_int64_round_to_zero(q, s); + +/* r = 1.0 / (((double)q + 0.5) / 512.0) */ +q = int64_to_float64(q_int, s); +float64 half = float64_div(one, int64_to_float64(2, s), s); +q = float64_add(q, half, s); +q = float64_div(q, x512, s); +q = float64_div(one, q, s); + +/* s = (int)(256.0 * r + 0.5) */ +float64 x256 = int64_to_float64(256, s); +q = float64_mul(q, x256, s); +q = float64_add(q, half, s); +q_int = float64_to_int64_round_to_zero(q, s); + +/* return (double)s / 256.0 */ +return float64_div(int64_to_float64(q_int, s), x256, s); +} + float32 HELPER(recpe_f32)(float32 a, CPUState *env) { -float_status *s = env-vfp.fp_status; -float32 one = int32_to_float32(1, s); -return float32_div(one, a, s); +float_status *s = env-vfp.standard_fp_status; +float64 f64; +uint32_t val32 = float32_val(a); + +int result_exp; +int a_exp = (val32 0x7F80) 23; +int sign = val32 0x8000; + +if (float32_is_any_nan(a)) { +return float32_maybe_silence_nan(a); +} else if (float32_is_infinity(a)) { +return 
float32_zero; +} else if (float32_is_zero(a)) { +float_raise(float_flag_divbyzero, s); +return float32_infinity; +} else if (a_exp = 253) { +float_raise(float_flag_underflow, s); +return float32_zero; +} + +f64 = make_float64((0x3FEULL 52) + | ((int64_t)(val32 0x7F) 29)); + +result_exp = 253 - a_exp; + +f64 = recip_estimate(f64, env); + +val32 = sign +| ((result_exp 0xFF) 23) +| ((float64_val(f64) 29) 0x7F); +return make_float32(val32); } float32 HELPER(rsqrte_f32)(float32 a, CPUState *env) @@ -2705,13 +2760,18 @@ float32 HELPER(rsqrte_f32)(float32 a, CPUState *env) uint32_t HELPER(recpe_u32)(uint32_t a, CPUState *env) { -float_status *s = env-vfp.fp_status; -float32 tmp; -tmp = int32_to_float32(a, s); -tmp = float32_scalbn(tmp, -32, s); -tmp = helper_recpe_f32(tmp, env); -tmp = float32_scalbn(tmp, 31, s); -return float32_to_int32(tmp, s); +float64 f64; + +if ((a 0x8000) == 0) { +return 0x; +} + +f64 = make_float64((0x3FEULL 52) + | ((int64_t)(a 0x7FFF) 21)); + +f64 = recip_estimate (f64, env); + +return 0x8000 | ((float64_val(f64) 21) 0x7FFF); } uint32_t HELPER(rsqrte_u32)(uint32_t a, CPUState *env) -- 1.7.2.3
[Qemu-devel] [PATCH v3 0/3] ARM: fix Neon vrecpe and vrsqrte instructions.
From: Christophe Lyon christophe.l...@st.com These 3 patches fix the ARM Neon vrecpe and vrsqrte instructions by matching the algorithms described in the ARM ARM. With these patches, qemu passes my ARM/Neon tests. Patch #1 modifies softfloat by exporting float32_default_nan and float32_infinity. For consistency, I have also moved all the target-dependent definitions of floatXX_default_nan to softfloat.h (i.e. the 16, 64, x80 and 128 bits versions in addition to the 32 bits ones). It also adds float32_set_sign() to help return the right special values (-0, -infinity). Patch #2 uses these newly exported values and uses the vrecpe algorithm described in the ARM ARM. Patch #3 uses these newly exported values and uses the vrsqrte algorithm described in the ARM ARM. Christophe Lyon (3): softfloat: export float32_default_nan, and float32_infinity. Add float32_set_sign(). target-arm: fix support for vrecpe. target-arm: fix support for vrsqrte. fpu/softfloat-specialize.h | 68 --- fpu/softfloat.h| 75 target-arm/helper.c| 206 +++- 3 files changed, 259 insertions(+), 90 deletions(-) -- 1.7.2.3
[Qemu-devel] [PATCH 1/3] softfloat: export float32_default_nan, and float32_infinity. Add float32_set_sign().
From: Christophe Lyon christophe.l...@st.com These special values are needed to implement some helper functions, which return these values in some cases. This patch also moves the definitions of default_nan for 16, 64, x80 and 128 bits floats for consistency with float32. Signed-off-by: Christophe Lyon christophe.l...@st.com --- fpu/softfloat-specialize.h | 68 --- fpu/softfloat.h| 75 2 files changed, 75 insertions(+), 68 deletions(-) diff --git a/fpu/softfloat-specialize.h b/fpu/softfloat-specialize.h index 2d025bf..adc5ada 100644 --- a/fpu/softfloat-specialize.h +++ b/fpu/softfloat-specialize.h @@ -30,12 +30,6 @@ these four paragraphs for those parts of this code that are retained. =*/ -#if defined(TARGET_MIPS) || defined(TARGET_SH4) -#define SNAN_BIT_IS_ONE1 -#else -#define SNAN_BIT_IS_ONE0 -#endif - /* | Raises the exceptions specified by `flags'. Floating-point traps can be | defined here if desired. It is currently not possible for such a trap @@ -57,17 +51,6 @@ typedef struct { } commonNaNT; /* -| The pattern for a default generated half-precision NaN. -**/ -#if defined(TARGET_ARM) -#define float16_default_nan make_float16(0x7E00) -#elif SNAN_BIT_IS_ONE -#define float16_default_nan make_float16(0x7DFF) -#else -#define float16_default_nan make_float16(0xFE00) -#endif - -/* | Returns 1 if the half-precision floating-point value `a' is a quiet | NaN; otherwise returns 0. **/ @@ -158,19 +141,6 @@ static float16 commonNaNToFloat16(commonNaNT a STATUS_PARAM) } /* -| The pattern for a default generated single-precision NaN. -**/ -#if defined(TARGET_SPARC) -#define float32_default_nan make_float32(0x7FFF) -#elif defined(TARGET_PPC) || defined(TARGET_ARM) || defined(TARGET_ALPHA) -#define float32_default_nan make_float32(0x7FC0) -#elif SNAN_BIT_IS_ONE -#define float32_default_nan make_float32(0x7FBF) -#else -#define float32_default_nan make_float32(0xFFC0) -#endif - -/* | Returns 1 if the single-precision floating-point value `a' is a quiet | NaN; otherwise returns 0. 
**/ @@ -413,19 +383,6 @@ static float32 propagateFloat32NaN( float32 a, float32 b STATUS_PARAM) } /* -| The pattern for a default generated double-precision NaN. -**/ -#if defined(TARGET_SPARC) -#define float64_default_nan make_float64(LIT64( 0x7FFF )) -#elif defined(TARGET_PPC) || defined(TARGET_ARM) || defined(TARGET_ALPHA) -#define float64_default_nan make_float64(LIT64( 0x7FF8 )) -#elif SNAN_BIT_IS_ONE -#define float64_default_nan make_float64(LIT64( 0x7FF7 )) -#else -#define float64_default_nan make_float64(LIT64( 0xFFF8 )) -#endif - -/* | Returns 1 if the double-precision floating-point value `a' is a quiet | NaN; otherwise returns 0. **/ @@ -564,19 +521,6 @@ static float64 propagateFloat64NaN( float64 a, float64 b STATUS_PARAM) #ifdef FLOATX80 /* -| The pattern for a default generated extended double-precision NaN. The -| `high' and `low' values hold the most- and least-significant bits, -| respectively. -**/ -#if SNAN_BIT_IS_ONE -#define floatx80_default_nan_high 0x7FFF -#define floatx80_default_nan_low LIT64( 0xBFFF ) -#else -#define floatx80_default_nan_high 0x -#define floatx80_default_nan_low LIT64( 0xC000 ) -#endif - -/* | Returns 1 if the extended double-precision floating-point value `a' is a | quiet NaN; otherwise returns 0. This slightly differs from the same | function for other types as floatx80 has an explicit bit. @@ -728,18 +672,6 @@ static floatx80 propagateFloatx80NaN( floatx80 a,
[Qemu-devel] [PATCH 3/3] target-arm: fix support for vrsqrte.
From: Christophe Lyon christophe.l...@st.com Now use the same algorithm as described in the ARM ARM. Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/helper.c | 122 ++ 1 files changed, 112 insertions(+), 10 deletions(-) diff --git a/target-arm/helper.c b/target-arm/helper.c index 7751d21..f0f2231 100644 --- a/target-arm/helper.c +++ b/target-arm/helper.c @@ -2751,11 +2751,105 @@ float32 HELPER(recpe_f32)(float32 a, CPUState *env) return make_float32(val32); } +/* The algorithm that must be used to calculate the estimate + * is specified by the ARM ARM. + */ +static float64 recip_sqrt_estimate(float64 a, CPUState *env) +{ +float_status *s = env-vfp.standard_fp_status; +float64 one = int64_to_float64(1, s); +float64 half = float64_div(one, int64_to_float64(2, s), s); +float64 x256 = int64_to_float64(256, s); +float64 q; +int64_t q_int; + +if (float64_lt(a, half, s)) { +/* range 0.25 = a 0.5 */ + +/* a in units of 1/512 rounded down */ +/* q0 = (int)(a * 512.0); */ +float64 x512 = int64_to_float64(512, s); +q = float64_mul(x512, a, s); +q_int = float64_to_int64_round_to_zero(q, s); + +/* reciprocal root r */ +/* r = 1.0 / sqrt(((double)q0 + 0.5) / 512.0); */ +q = int64_to_float64(q_int, s); +q = float64_add(q, half, s); +q = float64_div(q, x512, s); +q = float64_sqrt(q, s); +q = float64_div(one, q, s); +} else { +/* range 0.5 = a 1.0 */ + +/* a in units of 1/256 rounded down */ +/* q1 = (int)(a * 256.0); */ +q = float64_mul(x256, a, s); +int64_t q_int = float64_to_int64_round_to_zero(q, s); + +/* reciprocal root r */ +/* r = 1.0 /sqrt(((double)q1 + 0.5) / 256); */ +q = int64_to_float64(q_int, s); +q = float64_add(q, half, s); +q = float64_div(q, x256, s); +q = float64_sqrt(q, s); +q = float64_div(one, q, s); +} +/* r in units of 1/256 rounded to nearest */ +/* s = (int)(256.0 * r + 0.5); */ + +q = float64_mul(q, x256,s ); +q = float64_add(q, half, s); +q_int = float64_to_int64_round_to_zero(q, s); + +/* return (double)s / 256.0;*/ +return 
float64_div(int64_to_float64(q_int, s), x256, s); +} + float32 HELPER(rsqrte_f32)(float32 a, CPUState *env) { -float_status *s = env-vfp.fp_status; -float32 one = int32_to_float32(1, s); -return float32_div(one, float32_sqrt(a, s), s); +float_status *s = env-vfp.standard_fp_status; +int result_exp; +float64 f64; +int32_t val; +int64_t val64; + +val = float32_val(a); + +if (float32_is_any_nan(a)) { +return float32_default_nan; +} else if (float32_is_zero(a)) { +float_raise(float_flag_divbyzero, s); +return float32_set_sign(float32_infinity, float32_is_neg(a)); +} else if (val 0) { +float_raise(float_flag_invalid, s); +return float32_default_nan; +} else if (float32_is_infinity(a)) { +return float32_zero; +} + +/* Normalize to a double-precision value between 0.25 and 1.0, + * preserving the parity of the exponent. */ +if ((val 0x80) == 0) { +f64 = make_float64(((uint64_t)(val 0x8000) 32) + | (0x3feULL 52) + | ((uint64_t)(val 0x7ff) 29)); +} else { +f64 = make_float64(((uint64_t)(val 0x8000) 32) + | (0x3fdULL 52) + | ((uint64_t)(val 0x7ff) 29)); +} + +result_exp = (380 - ((val 0x7f80) 23)) / 2; + +f64 = recip_sqrt_estimate(f64, env); + +val64 = float64_val(f64); + +val = ((val64 63) 0x8000) +| ((result_exp 0xff) 23) +| ((val64 29) 0x7f); +return make_float32(val); } uint32_t HELPER(recpe_u32)(uint32_t a, CPUState *env) @@ -2776,13 +2870,21 @@ uint32_t HELPER(recpe_u32)(uint32_t a, CPUState *env) uint32_t HELPER(rsqrte_u32)(uint32_t a, CPUState *env) { -float_status *s = env-vfp.fp_status; -float32 tmp; -tmp = int32_to_float32(a, s); -tmp = float32_scalbn(tmp, -32, s); -tmp = helper_rsqrte_f32(tmp, env); -tmp = float32_scalbn(tmp, 31, s); -return float32_to_int32(tmp, s); +float64 f64; + +if ((a 0xc000) == 0) return 0x; + +if (a 0x8000) { +f64 = make_float64((0x3feULL 52) + | ((uint64_t)(a 0x7fff) 21)); +} else { /* bits 31-30 == '01' */ +f64 = make_float64((0x3fdULL 52) + | ((uint64_t)(a 0x3fff) 22)); +} + +f64 = recip_sqrt_estimate(f64, env); + +return 0x8000 | 
((float64_val(f64) 21) 0x7fff); } void HELPER(set_teecr)(CPUState *env, uint32_t val) -- 1.7.2.3
[Qemu-devel] [PATCH 2/3] target-arm: fix support for vrecpe.
From: Christophe Lyon christophe.l...@st.com Now use the same algorithm as described in the ARM ARM. Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/helper.c | 84 +++--- 1 files changed, 72 insertions(+), 12 deletions(-) diff --git a/target-arm/helper.c b/target-arm/helper.c index 7f63a28..7751d21 100644 --- a/target-arm/helper.c +++ b/target-arm/helper.c @@ -2687,13 +2687,68 @@ float32 HELPER(rsqrts_f32)(float32 a, float32 b, CPUState *env) /* NEON helpers. */ -/* TODO: The architecture specifies the value that the estimate functions - should return. We return the exact reciprocal/root instead. */ +/* The algorithm that must be used to calculate the estimate + * is specified by the ARM ARM. + */ +static float64 recip_estimate(float64 a, CPUState *env) +{ +float_status *s = env-vfp.standard_fp_status; +float64 one = int64_to_float64(1, s); +/* q = (int)(a * 512.0) */ +float64 x512 = int64_to_float64(512, s); +float64 q = float64_mul(x512, a, s); +int64_t q_int = float64_to_int64_round_to_zero(q, s); + +/* r = 1.0 / (((double)q + 0.5) / 512.0) */ +q = int64_to_float64(q_int, s); +float64 half = float64_div(one, int64_to_float64(2, s), s); +q = float64_add(q, half, s); +q = float64_div(q, x512, s); +q = float64_div(one, q, s); + +/* s = (int)(256.0 * r + 0.5) */ +float64 x256 = int64_to_float64(256, s); +q = float64_mul(q, x256, s); +q = float64_add(q, half, s); +q_int = float64_to_int64_round_to_zero(q, s); + +/* return (double)s / 256.0 */ +return float64_div(int64_to_float64(q_int, s), x256, s); +} + float32 HELPER(recpe_f32)(float32 a, CPUState *env) { -float_status *s = env-vfp.fp_status; -float32 one = int32_to_float32(1, s); -return float32_div(one, a, s); +float_status *s = env-vfp.standard_fp_status; +float64 f64; +uint32_t val32 = float32_val(a); + +int result_exp; +int a_exp = (val32 0x7f80) 23; +int sign = val32 0x8000; + +if (float32_is_any_nan(a)) { +return float32_default_nan; +} else if (float32_is_infinity(a)) { +return 
float32_set_sign(float32_zero, float32_is_neg(a)); +} else if (float32_is_zero_or_denormal(a)) { +float_raise(float_flag_divbyzero, s); +return float32_set_sign(float32_infinity, float32_is_neg(a)); +} else if (a_exp = 253) { +float_raise(float_flag_underflow, s); +return float32_set_sign(float32_zero, float32_is_neg(a)); +} + +f64 = make_float64((0x3feULL 52) + | ((int64_t)(val32 0x7f) 29)); + +result_exp = 253 - a_exp; + +f64 = recip_estimate(f64, env); + +val32 = sign +| ((result_exp 0xff) 23) +| ((float64_val(f64) 29) 0x7f); +return make_float32(val32); } float32 HELPER(rsqrte_f32)(float32 a, CPUState *env) @@ -2705,13 +2760,18 @@ float32 HELPER(rsqrte_f32)(float32 a, CPUState *env) uint32_t HELPER(recpe_u32)(uint32_t a, CPUState *env) { -float_status *s = env-vfp.fp_status; -float32 tmp; -tmp = int32_to_float32(a, s); -tmp = float32_scalbn(tmp, -32, s); -tmp = helper_recpe_f32(tmp, env); -tmp = float32_scalbn(tmp, 31, s); -return float32_to_int32(tmp, s); +float64 f64; + +if ((a 0x8000) == 0) { +return 0x; +} + +f64 = make_float64((0x3feULL 52) + | ((int64_t)(a 0x7fff) 21)); + +f64 = recip_estimate (f64, env); + +return 0x8000 | ((float64_val(f64) 21) 0x7fff); } uint32_t HELPER(rsqrte_u32)(uint32_t a, CPUState *env) -- 1.7.2.3
[Qemu-devel] [PATCH 5/6] target-arm: fix Neon VQSHRN and VSHRN.
From: Christophe Lyon christophe.l...@st.com Call the normal shift helpers instead of the rounding ones. Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/translate.c |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/target-arm/translate.c b/target-arm/translate.c index 8791bc5..ace533f 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -4095,8 +4095,8 @@ static inline void gen_neon_shift_narrow(int size, TCGv var, TCGv shift, } else { if (u) { switch (size) { -case 1: gen_helper_neon_rshl_u16(var, var, shift); break; -case 2: gen_helper_neon_rshl_u32(var, var, shift); break; +case 1: gen_helper_neon_shl_u16(var, var, shift); break; +case 2: gen_helper_neon_shl_u32(var, var, shift); break; default: abort(); } } else { -- 1.7.2.3
[Qemu-devel] [PATCH 4/6] target-arm: fix saturated values for Neon right shifts.
From: Christophe Lyon christophe.l...@st.com Fix value returned by signed qrshl helpers (8, 16 and 32 bits). Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/neon_helper.c |8 ++-- 1 files changed, 6 insertions(+), 2 deletions(-) diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c index 907f7b7..83d610a 100644 --- a/target-arm/neon_helper.c +++ b/target-arm/neon_helper.c @@ -903,7 +903,7 @@ uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop) dest = src1 tmp; \ if ((dest tmp) != src1) { \ SET_QC(); \ -dest = src1 31; \ +dest = (uint32_t)(1 (sizeof(src1) * 8 - 1)) - (src1 0 ? 1 : 0); \ } \ }} while (0) NEON_VOP_ENV(qrshl_s8, neon_s8, 4) @@ -924,7 +924,11 @@ uint32_t HELPER(neon_qrshl_s32)(CPUState *env, uint32_t valop, uint32_t shiftop) dest = val shift; if ((dest shift) != val) { SET_QC(); -dest = (uint32_t)(1 (sizeof(val) * 8 - 1)) - (val 0 ? 1 : 0); +if (val 0) { +dest = INT32_MIN; +} else { +dest = INT32_MAX; +} } } return dest; -- 1.7.2.3
[Qemu-devel] [PATCH 2/6] target-arm: fix Neon right shifts with shift amount == input width.
From: Christophe Lyon christophe.l...@st.com Fix rshl helpers (s8, s16, s64, u8, u16) Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/neon_helper.c |6 +++--- 1 files changed, 3 insertions(+), 3 deletions(-) diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c index 3f1f3d4..1ac362f 100644 --- a/target-arm/neon_helper.c +++ b/target-arm/neon_helper.c @@ -548,7 +548,7 @@ uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop) } else if (tmp -(ssize_t)sizeof(src1) * 8) { \ dest = src1 (sizeof(src1) * 8 - 1); \ } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \ -dest = src1 (tmp - 1); \ +dest = src1 (-tmp - 1); \ dest++; \ dest = 1; \ } else if (tmp 0) { \ @@ -594,7 +594,7 @@ uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop) val = 0; } else if (shift -64) { val = 63; -} else if (shift == -63) { +} else if (shift == -64) { val = 63; val++; val = 1; @@ -622,7 +622,7 @@ uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop) tmp -(ssize_t)sizeof(src1) * 8) { \ dest = 0; \ } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \ -dest = src1 (tmp - 1); \ +dest = src1 (-tmp - 1); \ } else if (tmp 0) { \ dest = (src1 + (1 (-1 - tmp))) -tmp; \ } else { \ -- 1.7.2.3
[Qemu-devel] [PATCH v3 0/6] target-arm: Fix Neon shift instructions.
From: Christophe Lyon christophe.l...@st.com This patch series provides fixes such that ARM Neon instructions VRSHR, VRSRA, VQRSHRN, VQRSHRUN, VRSHRN, VQSHRN, VSHRN, VQSHRUN now pass all my tests. I have reworked all these patches and I hope they are now easier to review. Christophe Lyon (6): target-arm: Fix rounding constant addition for Neon shift instructions. target-arm: fix Neon right shifts with shift amount == input width. target-arm: fix unsigned 64 bit right shifts. target-arm: fix saturated values for Neon right shifts. target-arm: fix Neon VQSHRN and VSHRN. target-arm: fix decoding of Neon 64 bit shifts. target-arm/neon_helper.c | 163 +- target-arm/translate.c | 47 +- 2 files changed, 176 insertions(+), 34 deletions(-) -- 1.7.2.3
[Qemu-devel] [PATCH 6/6] target-arm: fix decoding of Neon 64 bit shifts.
From: Christophe Lyon christophe.l...@st.com Fix decoding of 64 bits variants of VSHRN, VRSHRN, VQSHRN, VQSHRUN, VQRSHRN, VQRSHRUN, taking into account whether inputs are unsigned or not. Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/translate.c | 43 --- 1 files changed, 28 insertions(+), 15 deletions(-) diff --git a/target-arm/translate.c b/target-arm/translate.c index ace533f..10b8c5f 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -4815,6 +4815,8 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) } else if (op 10) { /* Shift by immediate and narrow: VSHRN, VRSHRN, VQSHRN, VQRSHRN. */ +int input_unsigned = (op == 8) ? !u : u; + shift = shift - (1 (size + 3)); size++; switch (size) { @@ -4841,33 +4843,44 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) if (size == 3) { neon_load_reg64(cpu_V0, rm + pass); if (q) { - if (u) -gen_helper_neon_rshl_u64(cpu_V0, cpu_V0, tmp64); - else -gen_helper_neon_rshl_s64(cpu_V0, cpu_V0, tmp64); +if (input_unsigned) { +gen_helper_neon_rshl_u64(cpu_V0, cpu_V0, + tmp64); +} else { +gen_helper_neon_rshl_s64(cpu_V0, cpu_V0, + tmp64); +} } else { - if (u) -gen_helper_neon_shl_u64(cpu_V0, cpu_V0, tmp64); - else -gen_helper_neon_shl_s64(cpu_V0, cpu_V0, tmp64); +if (input_unsigned) { +gen_helper_neon_shl_u64(cpu_V0, cpu_V0, +tmp64); +} else { +gen_helper_neon_shl_s64(cpu_V0, cpu_V0, +tmp64); +} } } else { tmp = neon_load_reg(rm + pass, 0); -gen_neon_shift_narrow(size, tmp, tmp2, q, u); +gen_neon_shift_narrow(size, tmp, tmp2, q, input_unsigned); tmp3 = neon_load_reg(rm + pass, 1); -gen_neon_shift_narrow(size, tmp3, tmp2, q, u); +gen_neon_shift_narrow(size, tmp3, tmp2, q, input_unsigned); tcg_gen_concat_i32_i64(cpu_V0, tmp, tmp3); dead_tmp(tmp); dead_tmp(tmp3); } tmp = new_tmp(); -if (op == 8 !u) { -gen_neon_narrow(size - 1, tmp, cpu_V0); +if (op == 8) { +if (u) { /* VQSHRUN / VQRSHRUN */ +gen_neon_unarrow_sats(size - 1, tmp, cpu_V0); 
+} else { /* VSHRN / VRSHRN */ +gen_neon_narrow(size - 1, tmp, cpu_V0); +} } else { -if (op == 8) +if (u) { /* VQSHRN / VQRSHRN */ +gen_neon_narrow_satu(size - 1, tmp, cpu_V0); +} else { /* VQSHRN / VQRSHRN */ gen_neon_narrow_sats(size - 1, tmp, cpu_V0); -else -gen_neon_narrow_satu(size - 1, tmp, cpu_V0); +} } neon_store_reg(rd, pass, tmp); } /* for pass */ -- 1.7.2.3
[Qemu-devel] [PATCH 1/6] target-arm: Fix rounding constant addition for Neon shift instructions.
From: Christophe Lyon christophe.l...@st.com Handle cases where adding the rounding constant could overflow in Neon shift instructions: VRSHR, VRSRA, VQRSHRN, VQRSHRUN, VRSHRN. Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/neon_helper.c | 149 ++ 1 files changed, 137 insertions(+), 12 deletions(-) diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c index cf82072..3f1f3d4 100644 --- a/target-arm/neon_helper.c +++ b/target-arm/neon_helper.c @@ -558,9 +558,34 @@ uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop) }} while (0) NEON_VOP(rshl_s8, neon_s8, 4) NEON_VOP(rshl_s16, neon_s16, 2) -NEON_VOP(rshl_s32, neon_s32, 1) #undef NEON_FN +/* The addition of the rounding constant may overflow, so we use an + * intermediate 64 bits accumulator. */ +uint32_t HELPER(neon_rshl_s32)(uint32_t valop, uint32_t shiftop) +{ +int32_t dest; +int32_t val = (int32_t)valop; +int8_t shift = (int8_t)shiftop; +if (shift = 32) { +dest = 0; +} else if (shift -32) { +dest = val 31; +} else if (shift == -32) { +dest = val 31; +dest++; +dest = 1; +} else if (shift 0) { +int64_t big_dest = ((int64_t)val + (1 (-1 - shift))); +dest = big_dest -shift; +} else { +dest = val shift; +} +return dest; +} + +/* Handling addition overflow with 64 bits inputs values is more + * tricky than with 32 bits values. */ uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop) { int8_t shift = (int8_t)shiftop; @@ -574,7 +599,16 @@ uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop) val++; val = 1; } else if (shift 0) { -val = (val + ((int64_t)1 (-1 - shift))) -shift; +val = (-shift - 1); +if (val == INT64_MAX) { +/* In this case, it means that the rounding constant is 1, + * and the addition would overflow. Return the actual + * result directly. 
*/ +val = 0x4000LL; +} else { +val++; +val = 1; +} } else { val = shift; } @@ -596,9 +630,29 @@ uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop) }} while (0) NEON_VOP(rshl_u8, neon_u8, 4) NEON_VOP(rshl_u16, neon_u16, 2) -NEON_VOP(rshl_u32, neon_u32, 1) #undef NEON_FN +/* The addition of the rounding constant may overflow, so we use an + * intermediate 64 bits accumulator. */ +uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shiftop) +{ +uint32_t dest; +int8_t shift = (int8_t)shiftop; +if (shift = 32 || shift -32) { +dest = 0; +} else if (shift == -32) { +dest = val 31; +} else if (shift 0) { +uint64_t big_dest = ((uint64_t)val + (1 (-1 - shift))); +dest = big_dest -shift; +} else { +dest = val shift; +} +return dest; +} + +/* Handling addition overflow with 64 bits inputs values is more + * tricky than with 32 bits values. */ uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop) { int8_t shift = (uint8_t)shiftop; @@ -607,9 +661,17 @@ uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop) } else if (shift == -64) { /* Rounding a 1-bit result just preserves that bit. */ val = 63; -} if (shift 0) { -val = (val + ((uint64_t)1 (-1 - shift))) -shift; -val = -shift; +} else if (shift 0) { +val = (-shift - 1); +if (val == UINT64_MAX) { +/* In this case, it means that the rounding constant is 1, + * and the addition would overflow. Return the actual + * result directly. */ +val = 0x8000ULL; +} else { +val++; +val = 1; +} } else { val = shift; } @@ -784,14 +846,43 @@ uint64_t HELPER(neon_qshlu_s64)(CPUState *env, uint64_t valop, uint64_t shiftop) }} while (0) NEON_VOP_ENV(qrshl_u8, neon_u8, 4) NEON_VOP_ENV(qrshl_u16, neon_u16, 2) -NEON_VOP_ENV(qrshl_u32, neon_u32, 1) #undef NEON_FN +/* The addition of the rounding constant may overflow, so we use an + * intermediate 64 bits accumulator. 
*/ +uint32_t HELPER(neon_qrshl_u32)(CPUState *env, uint32_t val, uint32_t shiftop) +{ +uint32_t dest; +int8_t shift = (int8_t)shiftop; +if (shift 0) { +uint64_t big_dest = ((uint64_t)val + ( 1 (-1 - shift))); +dest = big_dest -shift; +} else { +dest = val shift; +if ((dest shift) != val) { +SET_QC(); +dest = ~0; +} +} +return dest; +} + +/* Handling addition overflow with 64 bits inputs values is more + * tricky than with 32 bits values. */ uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop) { int8_t shift = (int8_t)shiftop; if (shift 0) { -val = (val + (1
[Qemu-devel] [PATCH 3/6] target-arm: fix unsigned 64 bit right shifts.
From: Christophe Lyon christophe.l...@st.com Fix range of shift amounts which always give 0 as result. Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/neon_helper.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c index 1ac362f..907f7b7 100644 --- a/target-arm/neon_helper.c +++ b/target-arm/neon_helper.c @@ -656,7 +656,7 @@ uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shiftop) uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop) { int8_t shift = (uint8_t)shiftop; -if (shift = 64 || shift 64) { +if (shift = 64 || shift -64) { val = 0; } else if (shift == -64) { /* Rounding a 1-bit result just preserves that bit. */ -- 1.7.2.3
[Qemu-devel] [PATCH 6/8] target-arm: Fix Neon VQ(R)SHRN instructions.
From: Christophe Lyon christophe.l...@st.com Handle unsigned variant of VQ(R)SHRN instructions. Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/translate.c |8 ++-- 1 files changed, 6 insertions(+), 2 deletions(-) diff --git a/target-arm/translate.c b/target-arm/translate.c index a614e34..61d4c4c 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -4865,8 +4865,12 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) } else { /* VSHRN / VRSHRN */ gen_neon_narrow(size - 1, tmp, cpu_V0); } -} else { /* VQSHRN / VQRSHRN */ -gen_neon_narrow_satu(size - 1, tmp, cpu_V0); +} else { +if (u) { /* VQSHRUN / VQRSHRUN */ +gen_neon_narrow_satu(size - 1, tmp, cpu_V0); +} else { /* VQSHRN / VQRSHRN */ +gen_neon_narrow_sats(size - 1, tmp, cpu_V0); +} } neon_store_reg(rd, pass, tmp); } /* for pass */ -- 1.7.2.3
[Qemu-devel] [PATCH v2 0/8] target-arm: Fix Neon instructions VQMOVUN VQRSHL VQRSHRN VQRSHRUN VQSHRN VQSHRUN VSLI VSRI
From: Christophe Lyon christophe.l...@st.com This patchset combines fixes from the Meego tree (Peter Maydell, Juha Riihimäki) and my own fixes such that ARM Neon instructions VQMOVUN VQRSHL VQRSHRN VQRSHRUN VQSHRN VQSHRUN VSLI VSRI now pass all my tests. Christophe Lyon (3): Fixes for several shift instructions: VRSHL, VRSHR, VRSHRN, VSHLL, VRSRA. target-arm: Fix Neon VQ(R)SHRN instructions. target-arm: Fix VQRSHL Neon instructions (signed/unsigned 64 bits and signed 32 bits variants). Juha Riihimäki (1): target-arm: fix neon vqrshl instruction Meego (4): Create and use neon_unarrow_sat* helpers VQRSHRN related changes fiddle decoding of 64 bit shift by imm and narrow implement vsli.64, vsri.64 target-arm/helpers.h |3 + target-arm/neon_helper.c | 195 ++ target-arm/translate.c | 98 +-- 3 files changed, 253 insertions(+), 43 deletions(-) -- 1.7.2.3
[Qemu-devel] [PATCH 7/8] target-arm: implement vsli.64, vsri.64
From: Christophe Lyon christophe.l...@st.com Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/translate.c | 11 ++++++++++- 1 files changed, 10 insertions(+), 1 deletions(-) diff --git a/target-arm/translate.c b/target-arm/translate.c index 61d4c4c..9150242 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -4700,7 +4700,16 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) tcg_gen_add_i64(cpu_V0, cpu_V0, cpu_V1); } else if (op == 4 || (op == 5 && u)) { /* Insert */ -cpu_abort(env, "VS[LR]I.64 not implemented"); +neon_load_reg64(cpu_V1, rd + pass); +uint64_t mask; +if (op == 4) { +mask = 0xffffffffffffffffull >> -shift; +} else { +mask = 0xffffffffffffffffull << shift; +} +tcg_gen_andi_i64(cpu_V0, cpu_V0, mask); +tcg_gen_andi_i64(cpu_V1, cpu_V1, ~mask); +tcg_gen_or_i64(cpu_V0, cpu_V0, cpu_V1); } neon_store_reg64(cpu_V0, rd + pass); } else { /* size 3 */ -- 1.7.2.3
[Qemu-devel] [PATCH 1/8] target-arm: Fixes for several shift instructions: VRSHL, VRSHR, VRSHRN, VSHLL, VRSRA.
From: Christophe Lyon christophe.l...@st.com For variants with rounding, fix cases where adding the rounding constant could overflow. For VSHLL, fix bit mask. Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/neon_helper.c | 61 ++--- target-arm/translate.c | 12 +++- 2 files changed, 61 insertions(+), 12 deletions(-) diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c index fead152..6c832b4 100644 --- a/target-arm/neon_helper.c +++ b/target-arm/neon_helper.c @@ -451,6 +451,9 @@ uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop) return val; } +/* The addition of the rounding constant may overflow, so we use an + * intermediate 64 bits accumulator, which is really needed only when + * dealing with 32 bits input values. */ #define NEON_FN(dest, src1, src2) do { \ int8_t tmp; \ tmp = (int8_t)src2; \ @@ -459,11 +462,12 @@ uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop) } else if (tmp -(ssize_t)sizeof(src1) * 8) { \ dest = src1 (sizeof(src1) * 8 - 1); \ } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \ -dest = src1 (tmp - 1); \ +dest = src1 (-tmp - 1); \ dest++; \ dest = 1; \ } else if (tmp 0) { \ -dest = (src1 + (1 (-1 - tmp))) -tmp; \ +int64_t big_dest = ((int64_t)src1 + (1 (-1 - tmp))); \ +dest = big_dest -tmp; \ } else { \ dest = src1 tmp; \ }} while (0) @@ -472,6 +476,8 @@ NEON_VOP(rshl_s16, neon_s16, 2) NEON_VOP(rshl_s32, neon_s32, 1) #undef NEON_FN +/* Handling addition overflow with 64 bits inputs values is more + * tricky than with 32 bits values. 
*/ uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop) { int8_t shift = (int8_t)shiftop; @@ -480,18 +486,37 @@ uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop) val = 0; } else if (shift -64) { val = 63; -} else if (shift == -63) { +} else if (shift == -64) { val = 63; val++; val = 1; } else if (shift 0) { -val = (val + ((int64_t)1 (-1 - shift))) -shift; +int64_t round = (int64_t)1 (-1 - shift); +/* Reduce the range as long as the addition overflows. It's + * sufficient to check if (val+round) is 0 and val 0 + * because round is 0. */ +while ((val 0) ((val + round) 0) round 1) { +shift++; +round = 1; +val = 1; +} +if ((val 0) (val + round) 0) { +/* If addition still overflows at this point, it means + * that round==1, thus shift==-1, and also that + * val==0x7FFF. */ +val = 0x4000LL; +} else { +val = (val + round) -shift; +} } else { val = shift; } return val; } +/* The addition of the rounding constant may overflow, so we use an + * intermediate 64 bits accumulator, which is really needed only when + * dealing with 32 bits input values. */ #define NEON_FN(dest, src1, src2) do { \ int8_t tmp; \ tmp = (int8_t)src2; \ @@ -499,9 +524,10 @@ uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop) tmp -(ssize_t)sizeof(src1) * 8) { \ dest = 0; \ } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \ -dest = src1 (tmp - 1); \ +dest = src1 (-tmp - 1); \ } else if (tmp 0) { \ -dest = (src1 + (1 (-1 - tmp))) -tmp; \ +uint64_t big_dest = ((uint64_t)src1 + (1 (-1 - tmp))); \ +dest = big_dest -tmp; \ } else { \ dest = src1 tmp; \ }} while (0) @@ -513,14 +539,29 @@ NEON_VOP(rshl_u32, neon_u32, 1) uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop) { int8_t shift = (uint8_t)shiftop; -if (shift = 64 || shift 64) { +if (shift = 64 || shift -64) { val = 0; } else if (shift == -64) { /* Rounding a 1-bit result just preserves that bit. 
*/ val = 63; -} if (shift 0) { -val = (val + ((uint64_t)1 (-1 - shift))) -shift; -val = -shift; +} else if (shift 0) { +uint64_t round = (uint64_t)1 (-1 - shift); +/* Reduce the range as long as the addition overflows. It's + * sufficient to check if (val+round) is val + * because val and round are 0. */ +while (((val + round) val) round 1) { +shift++; +round = 1; +val = 1; +} +if ((val + round) val) { +/* If addition still overflows at this point, it means + * that round==1, thus shift==-1, and also that + * val==0xFFF. */ +val = 0x8000LL; +} else { +val = (val + round) -shift; +} } else { val = shift; } diff --git a/target-arm/translate.c
[Qemu-devel] [PATCH 8/8] target-arm: Fix VQRSHL Neon instructions (signed/unsigned 64 bits and signed 32 bits variants).
From: Christophe Lyon christophe.l...@st.com The addition of the rounding constant could cause overflows. Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/neon_helper.c | 50 ++--- 1 files changed, 46 insertions(+), 4 deletions(-) diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c index 46fcdc4..2f96575 100644 --- a/target-arm/neon_helper.c +++ b/target-arm/neon_helper.c @@ -758,7 +758,23 @@ uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop) { int8_t shift = (int8_t)shiftop; if (shift 0) { -val = (val + (1 (-1 - shift))) -shift; +uint64_t round = (uint64_t)1 (-1 - shift); +/* Reduce the range as long as the addition overflows. It's + * sufficient to check if (val+round) is val + * because val and round are 0. */ +while (((val + round) val) round 1) { +shift++; +round = 1; +val = 1; +} +if ((val + round) val) { +/* If addition still overflows at this point, it means + * that round==1, thus shift==-1, and also that + * val==0xFFF. */ +val = 0x8000LL; +} else { +val = (val + round) -shift; +} } else { \ uint64_t tmp = val; val = shift; @@ -770,11 +786,15 @@ uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop) return val; } +/* The addition of the rounding constant may overflow, so we use an + * intermediate 64 bits accumulator, which is really needed only when + * dealing with 32 bits input values. */ #define NEON_FN(dest, src1, src2) do { \ int8_t tmp; \ tmp = (int8_t)src2; \ if (tmp 0) { \ -dest = (src1 + (1 (-1 - tmp))) -tmp; \ +int64_t big_dest = ((int64_t)src1 + (1 (-1 - tmp))); \ +dest = big_dest -tmp; \ } else { \ dest = src1 tmp; \ if ((dest tmp) != src1) { \ @@ -787,19 +807,41 @@ NEON_VOP_ENV(qrshl_s16, neon_s16, 2) NEON_VOP_ENV(qrshl_s32, neon_s32, 1) #undef NEON_FN +/* Handling addition overflow with 64 bits inputs values is more + * tricky than with 32 bits values. 
*/ uint64_t HELPER(neon_qrshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop) { int8_t shift = (uint8_t)shiftop; int64_t val = valop; if (shift 0) { -val = (val + (1 (-1 - shift))) -shift; +int64_t round = (int64_t)1 (-1 - shift); +/* Reduce the range as long as the addition overflows. It's + * sufficient to check if (val+round) is 0 and val 0 + * because round is 0. */ +while ((val 0) ((val + round) 0) round 1) { +shift++; +round = 1; +val = 1; +} +if ((val 0) (val + round) 0) { +/* If addition still overflows at this point, it means + * that round==1, thus shift==-1, and also that + * val==0x7FFF. */ +val = 0x4000LL; +} else { +val = (val + round) -shift; +} } else { int64_t tmp = val; val = shift; if ((val shift) != tmp) { SET_QC(); -val = tmp 31; +if (tmp 0) { +val = INT64_MIN; +} else { +val = INT64_MAX; +} } } return val; -- 1.7.2.3
[Qemu-devel] [PATCH 4/8] target-arm: fiddle decoding of 64 bit shift by imm and narrow
From: Christophe Lyon christophe.l...@st.com Tweak decoding of the shift-by-imm and narrow 64 bit insns (VSHRN, VRSHRN, VQSHRN, VQSHRUN, VQRSHRN, VQRSHRUN). Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/translate.c | 28 ++-- 1 files changed, 18 insertions(+), 10 deletions(-) diff --git a/target-arm/translate.c b/target-arm/translate.c index 9ca5b82..a614e34 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -4831,21 +4831,29 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) if (size == 3) { neon_load_reg64(cpu_V0, rm + pass); if (q) { - if (u) -gen_helper_neon_rshl_u64(cpu_V0, cpu_V0, tmp64); - else -gen_helper_neon_rshl_s64(cpu_V0, cpu_V0, tmp64); +if ((op == 8 !u) || (op == 9 u)) { +gen_helper_neon_rshl_u64(cpu_V0, cpu_V0, + tmp64); +} else { +gen_helper_neon_rshl_s64(cpu_V0, cpu_V0, + tmp64); +} } else { - if (u) -gen_helper_neon_shl_u64(cpu_V0, cpu_V0, tmp64); - else -gen_helper_neon_shl_s64(cpu_V0, cpu_V0, tmp64); +if ((op == 8 !u) || (op == 9 u)) { +gen_helper_neon_shl_u64(cpu_V0, cpu_V0, +tmp64); +} else { +gen_helper_neon_shl_s64(cpu_V0, cpu_V0, +tmp64); +} } } else { tmp = neon_load_reg(rm + pass, 0); -gen_neon_shift_narrow(size, tmp, tmp2, q, u); +gen_neon_shift_narrow(size, tmp, tmp2, q, + (op == 8) ? !u : u); tmp3 = neon_load_reg(rm + pass, 1); -gen_neon_shift_narrow(size, tmp3, tmp2, q, u); +gen_neon_shift_narrow(size, tmp3, tmp2, q, + (op == 8) ? !u : u); tcg_gen_concat_i32_i64(cpu_V0, tmp, tmp3); dead_tmp(tmp); dead_tmp(tmp3); -- 1.7.2.3
[Qemu-devel] [PATCH 5/8] target-arm: fix neon vqrshl instruction
From: Christophe Lyon christophe.l...@st.com Signed-off-by: Juha Riihimäki juha.riihim...@nokia.com Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/neon_helper.c | 21 ++--- 1 files changed, 18 insertions(+), 3 deletions(-) diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c index 891b812..46fcdc4 100644 --- a/target-arm/neon_helper.c +++ b/target-arm/neon_helper.c @@ -736,9 +736,24 @@ uint64_t HELPER(neon_qshlu_s64)(CPUState *env, uint64_t valop, uint64_t shiftop) }} while (0) NEON_VOP_ENV(qrshl_u8, neon_u8, 4) NEON_VOP_ENV(qrshl_u16, neon_u16, 2) -NEON_VOP_ENV(qrshl_u32, neon_u32, 1) #undef NEON_FN +uint32_t HELPER(neon_qrshl_u32)(CPUState *env, uint32_t val, uint32_t shiftop) +{ +int8_t shift = (int8_t)shiftop; +if (shift 0) { +val = ((uint64_t)val + (1 (-1 - shift))) -shift; +} else { +uint32_t tmp = val; +val = shift; +if ((val shift) != tmp) { +SET_QC(); +val = ~0; +} +} +return val; +} + uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop) { int8_t shift = (int8_t)shiftop; @@ -764,7 +779,7 @@ uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop) dest = src1 tmp; \ if ((dest tmp) != src1) { \ SET_QC(); \ -dest = src1 31; \ +dest = (uint32_t)(1 (sizeof(src1) * 8 - 1)) - (src1 0 ? 1 : 0); \ } \ }} while (0) NEON_VOP_ENV(qrshl_s8, neon_s8, 4) @@ -780,7 +795,7 @@ uint64_t HELPER(neon_qrshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop) if (shift 0) { val = (val + (1 (-1 - shift))) -shift; } else { -int64_t tmp = val;; +int64_t tmp = val; val = shift; if ((val shift) != tmp) { SET_QC(); -- 1.7.2.3
[Qemu-devel] [PATCH 2/8] target-arm: Create and use neon_unarrow_sat* helpers
From: Christophe Lyon christophe.l...@st.com Fix VQMOVUN, improve VQSHRUN and VQRSHRUN. Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/helpers.h |3 ++ target-arm/neon_helper.c | 63 ++ target-arm/translate.c | 43 ++- 3 files changed, 96 insertions(+), 13 deletions(-) diff --git a/target-arm/helpers.h b/target-arm/helpers.h index b88ebae..8cc6a44 100644 --- a/target-arm/helpers.h +++ b/target-arm/helpers.h @@ -295,10 +295,13 @@ DEF_HELPER_3(neon_qrdmulh_s32, i32, env, i32, i32) DEF_HELPER_1(neon_narrow_u8, i32, i64) DEF_HELPER_1(neon_narrow_u16, i32, i64) +DEF_HELPER_2(neon_unarrow_sat8, i32, env, i64) DEF_HELPER_2(neon_narrow_sat_u8, i32, env, i64) DEF_HELPER_2(neon_narrow_sat_s8, i32, env, i64) +DEF_HELPER_2(neon_unarrow_sat16, i32, env, i64) DEF_HELPER_2(neon_narrow_sat_u16, i32, env, i64) DEF_HELPER_2(neon_narrow_sat_s16, i32, env, i64) +DEF_HELPER_2(neon_unarrow_sat32, i32, env, i64) DEF_HELPER_2(neon_narrow_sat_u32, i32, env, i64) DEF_HELPER_2(neon_narrow_sat_s32, i32, env, i64) DEF_HELPER_1(neon_narrow_high_u8, i32, i64) diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c index 6c832b4..891b812 100644 --- a/target-arm/neon_helper.c +++ b/target-arm/neon_helper.c @@ -1005,6 +1005,33 @@ uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x) return ((x 16) 0x) | ((x 32) 0x); } +uint32_t HELPER(neon_unarrow_sat8)(CPUState *env, uint64_t x) +{ +uint16_t s; +uint8_t d; +uint32_t res = 0; +#define SAT8(n) \ +s = x n; \ +if (s 0x8000) { \ +SET_QC(); \ +} else { \ +if (s 0xff) { \ +d = 0xff; \ +SET_QC(); \ +} else { \ +d = s; \ +} \ +res |= (uint32_t)d (n / 2); \ +} + +SAT8(0); +SAT8(16); +SAT8(32); +SAT8(48); +#undef SAT8 +return res; +} + uint32_t HELPER(neon_narrow_sat_u8)(CPUState *env, uint64_t x) { uint16_t s; @@ -1051,6 +1078,29 @@ uint32_t HELPER(neon_narrow_sat_s8)(CPUState *env, uint64_t x) return res; } +uint32_t HELPER(neon_unarrow_sat16)(CPUState *env, uint64_t x) +{ +uint32_t high; +uint32_t low; +low = x; +if (low 
0x8000) { +low = 0; +SET_QC(); +} else if (low 0x) { +low = 0x; +SET_QC(); +} +high = x 32; +if (high 0x8000) { +high = 0; +SET_QC(); +} else if (high 0x) { +high = 0x; +SET_QC(); +} +return low | (high 16); +} + uint32_t HELPER(neon_narrow_sat_u16)(CPUState *env, uint64_t x) { uint32_t high; @@ -1085,6 +1135,19 @@ uint32_t HELPER(neon_narrow_sat_s16)(CPUState *env, uint64_t x) return (uint16_t)low | (high 16); } +uint32_t HELPER(neon_unarrow_sat32)(CPUState *env, uint64_t x) +{ +if (x 0x8000ull) { +SET_QC(); +return 0; +} +if (x 0xu) { +SET_QC(); +return 0xu; +} +return x; +} + uint32_t HELPER(neon_narrow_sat_u32)(CPUState *env, uint64_t x) { if (x 0xu) { diff --git a/target-arm/translate.c b/target-arm/translate.c index b44f7a1..6dd024d 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -4071,6 +4071,16 @@ static inline void gen_neon_narrow_satu(int size, TCGv dest, TCGv_i64 src) } } +static inline void gen_neon_unarrow_sats(int size, TCGv dest, TCGv_i64 src) +{ +switch(size) { +case 0: gen_helper_neon_unarrow_sat8(dest, cpu_env, src); break; +case 1: gen_helper_neon_unarrow_sat16(dest, cpu_env, src); break; +case 2: gen_helper_neon_unarrow_sat32(dest, cpu_env, src); break; +default: abort(); +} +} + static inline void gen_neon_shift_narrow(int size, TCGv var, TCGv shift, int q, int u) { @@ -4841,13 +4851,14 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) dead_tmp(tmp3); } tmp = new_tmp(); -if (op == 8 !u) { -gen_neon_narrow(size - 1, tmp, cpu_V0); -} else { -if (op == 8) -gen_neon_narrow_sats(size - 1, tmp, cpu_V0); -else -gen_neon_narrow_satu(size - 1, tmp, cpu_V0); +if (op == 8) { +if (u) { /* VQSHRUN / VQRSHRUN */ +gen_neon_unarrow_sats(size - 1, tmp, cpu_V0); +} else { /* VSHRN / VRSHRN */ +gen_neon_narrow(size - 1, tmp, cpu_V0); +} +} else { /* VQSHRN / VQRSHRN */ +gen_neon_narrow_satu(size - 1, tmp, cpu_V0); } neon_store_reg(rd,
[Qemu-devel] [PATCH 3/8] target-arm: VQRSHRN related changes
From: Christophe Lyon christophe.l...@st.com More fixes for VQSHRN and VQSHRUN. Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/translate.c |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/target-arm/translate.c b/target-arm/translate.c index 6dd024d..9ca5b82 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -4101,8 +4101,8 @@ static inline void gen_neon_shift_narrow(int size, TCGv var, TCGv shift, } else { if (u) { switch (size) { -case 1: gen_helper_neon_rshl_u16(var, var, shift); break; -case 2: gen_helper_neon_rshl_u32(var, var, shift); break; +case 1: gen_helper_neon_shl_u16(var, var, shift); break; +case 2: gen_helper_neon_shl_u32(var, var, shift); break; default: abort(); } } else { -- 1.7.2.3
[Qemu-devel] [PATCH 0/8] target-arm: Fix Neon instructions VQMOVUN VQRSHL VQRSHRN VQRSHRUN VQSHRN VQSHRUN VSLI VSRI
From: Christophe Lyon christophe.l...@st.com This patchset combines fixes from the Meego tree (Peter Maydell, Juha Riihimäki) and my own fixes such that ARM Neon instructions VQMOVUN VQRSHL VQRSHRN VQRSHRUN VQSHRN VQSHRUN VSLI VSRI now pass all my tests. Christophe Lyon (3): Fixes for several shift instructions: VRSHL, VRSHR, VRSHRN, VSHLL, VRSRA. target-arm: Fix Neon VQ(R)SHRN instructions. target-arm: Fix VQRSHL Neon instructions (signed/unsigned 64 bits and signed 32 bits variants). Juha Riihimäki (1): target-arm: fix neon vqrshl instruction Peter Maydell (4): Create and use neon_unarrow_sat* helpers VQRSHRN related changes fiddle decoding of 64 bit shift by imm and narrow implement vsli.64, vsri.64 target-arm/helpers.h |3 + target-arm/neon_helper.c | 195 ++ target-arm/translate.c | 103 ++--- 3 files changed, 257 insertions(+), 44 deletions(-) -- 1.7.2.3
[Qemu-devel] [PATCH 7/8] implement vsli.64, vsri.64
From: Christophe Lyon christophe.l...@st.com Signed-off-by: Peter Maydell peter.mayd...@linaro.org Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/translate.c | 11 ++++++++++- 1 files changed, 10 insertions(+), 1 deletions(-) diff --git a/target-arm/translate.c b/target-arm/translate.c index 3b14b8f..984df08 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -4711,7 +4711,16 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) tcg_gen_add_i64(cpu_V0, cpu_V0, cpu_V1); } else if (op == 4 || (op == 5 && u)) { /* Insert */ -cpu_abort(env, "VS[LR]I.64 not implemented"); +neon_load_reg64(cpu_V1, rd + pass); +uint64_t mask; +if (op == 4) { +mask = 0xffffffffffffffffull >> -shift; +} else { +mask = 0xffffffffffffffffull << shift; +} +tcg_gen_andi_i64(cpu_V0, cpu_V0, mask); +tcg_gen_andi_i64(cpu_V1, cpu_V1, ~mask); +tcg_gen_or_i64(cpu_V0, cpu_V0, cpu_V1); } neon_store_reg64(cpu_V0, rd + pass); } else { /* size 3 */ -- 1.7.2.3
[Qemu-devel] [PATCH 1/8] target-arm: Fixes for several shift instructions: VRSHL, VRSHR, VRSHRN, VSHLL, VRSRA.
From: Christophe Lyon christophe.l...@st.com Handle corner cases where the addition of the rounding constant could cause overflows. Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/neon_helper.c | 61 ++--- target-arm/translate.c | 17 ++-- 2 files changed, 65 insertions(+), 13 deletions(-) diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c index bf29bbe..5971275 100644 --- a/target-arm/neon_helper.c +++ b/target-arm/neon_helper.c @@ -540,6 +540,9 @@ uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop) return val; } +/* The addition of the rounding constant may overflow, so we use an + * intermediate 64 bits accumulator, which is really needed only when + * dealing with 32 bits input values. */ #define NEON_FN(dest, src1, src2) do { \ int8_t tmp; \ tmp = (int8_t)src2; \ @@ -548,11 +551,12 @@ uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop) } else if (tmp -(ssize_t)sizeof(src1) * 8) { \ dest = src1 (sizeof(src1) * 8 - 1); \ } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \ -dest = src1 (tmp - 1); \ +dest = src1 (-tmp - 1); \ dest++; \ dest = 1; \ } else if (tmp 0) { \ -dest = (src1 + (1 (-1 - tmp))) -tmp; \ +int64_t big_dest = ((int64_t)src1 + (1 (-1 - tmp))); \ +dest = big_dest -tmp; \ } else { \ dest = src1 tmp; \ }} while (0) @@ -561,6 +565,8 @@ NEON_VOP(rshl_s16, neon_s16, 2) NEON_VOP(rshl_s32, neon_s32, 1) #undef NEON_FN +/* Handling addition overflow with 64 bits inputs values is more + * tricky than with 32 bits values. */ uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop) { int8_t shift = (int8_t)shiftop; @@ -569,18 +575,37 @@ uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop) val = 0; } else if (shift -64) { val = 63; -} else if (shift == -63) { +} else if (shift == -64) { val = 63; val++; val = 1; } else if (shift 0) { -val = (val + ((int64_t)1 (-1 - shift))) -shift; +int64_t round = (int64_t)1 (-1 - shift); +/* Reduce the range as long as the addition overflows. 
It's + * sufficient to check if (val+round) is 0 and val 0 + * because round is 0. */ +while ((val 0) ((val + round) 0) round 1) { +shift++; +round = 1; +val = 1; +} +if ((val 0) (val + round) 0) { +/* If addition still overflows at this point, it means + * that round==1, thus shift==-1, and also that + * val==0x7FFF. */ +val = 0x4000LL; +} else { +val = (val + round) -shift; +} } else { val = shift; } return val; } +/* The addition of the rounding constant may overflow, so we use an + * intermediate 64 bits accumulator, which is really needed only when + * dealing with 32 bits input values. */ #define NEON_FN(dest, src1, src2) do { \ int8_t tmp; \ tmp = (int8_t)src2; \ @@ -588,9 +613,10 @@ uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop) tmp -(ssize_t)sizeof(src1) * 8) { \ dest = 0; \ } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \ -dest = src1 (tmp - 1); \ +dest = src1 (-tmp - 1); \ } else if (tmp 0) { \ -dest = (src1 + (1 (-1 - tmp))) -tmp; \ +uint64_t big_dest = ((uint64_t)src1 + (1 (-1 - tmp))); \ +dest = big_dest -tmp; \ } else { \ dest = src1 tmp; \ }} while (0) @@ -602,14 +628,29 @@ NEON_VOP(rshl_u32, neon_u32, 1) uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop) { int8_t shift = (uint8_t)shiftop; -if (shift = 64 || shift 64) { +if (shift = 64 || shift -64) { val = 0; } else if (shift == -64) { /* Rounding a 1-bit result just preserves that bit. */ val = 63; -} if (shift 0) { -val = (val + ((uint64_t)1 (-1 - shift))) -shift; -val = -shift; +} else if (shift 0) { +uint64_t round = (uint64_t)1 (-1 - shift); +/* Reduce the range as long as the addition overflows. It's + * sufficient to check if (val+round) is val + * because val and round are 0. */ +while (((val + round) val) round 1) { +shift++; +round = 1; +val = 1; +} +if ((val + round) val) { +/* If addition still overflows at this point, it means + * that round==1, thus shift==-1, and also that + * val==0xFFF. 
*/ +val = 0x8000LL; +} else { +val = (val + round) -shift; +} } else { val = shift; } diff --git a/target-arm/translate.c b/target-arm/translate.c
[Qemu-devel] [PATCH 4/8] target-arm: fiddle decoding of 64 bit shift by imm and narrow
From: Christophe Lyon christophe.l...@st.com Tweak decoding of the shift-by-imm and narrow 64 bit insns (VSHRN, VRSHRN, VQSHRN, VQSHRUN, VQRSHRN, VQRSHRUN). Signed-off-by: Peter Maydell peter.mayd...@linaro.org Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/translate.c | 28 ++-- 1 files changed, 18 insertions(+), 10 deletions(-) diff --git a/target-arm/translate.c b/target-arm/translate.c index 3537698..452cb71 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -4842,21 +4842,29 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) if (size == 3) { neon_load_reg64(cpu_V0, rm + pass); if (q) { - if (u) -gen_helper_neon_rshl_u64(cpu_V0, cpu_V0, tmp64); - else -gen_helper_neon_rshl_s64(cpu_V0, cpu_V0, tmp64); +if ((op == 8 !u) || (op == 9 u)) { +gen_helper_neon_rshl_u64(cpu_V0, cpu_V0, + tmp64); +} else { +gen_helper_neon_rshl_s64(cpu_V0, cpu_V0, + tmp64); +} } else { - if (u) -gen_helper_neon_shl_u64(cpu_V0, cpu_V0, tmp64); - else -gen_helper_neon_shl_s64(cpu_V0, cpu_V0, tmp64); +if ((op == 8 !u) || (op == 9 u)) { +gen_helper_neon_shl_u64(cpu_V0, cpu_V0, +tmp64); +} else { +gen_helper_neon_shl_s64(cpu_V0, cpu_V0, +tmp64); +} } } else { tmp = neon_load_reg(rm + pass, 0); -gen_neon_shift_narrow(size, tmp, tmp2, q, u); +gen_neon_shift_narrow(size, tmp, tmp2, q, + (op == 8) ? !u : u); tmp3 = neon_load_reg(rm + pass, 1); -gen_neon_shift_narrow(size, tmp3, tmp2, q, u); +gen_neon_shift_narrow(size, tmp3, tmp2, q, + (op == 8) ? !u : u); tcg_gen_concat_i32_i64(cpu_V0, tmp, tmp3); dead_tmp(tmp); dead_tmp(tmp3); -- 1.7.2.3
[Qemu-devel] [PATCH 6/8] target-arm: Fix Neon VQ(R)SHRN instructions.
From: Christophe Lyon christophe.l...@st.com Handle unsigned variant of VQ(R)SHRN instructions. Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/translate.c |8 ++-- 1 files changed, 6 insertions(+), 2 deletions(-) diff --git a/target-arm/translate.c b/target-arm/translate.c index 452cb71..3b14b8f 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -4876,8 +4876,12 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) } else { /* VSHRN / VRSHRN */ gen_neon_narrow(size - 1, tmp, cpu_V0); } -} else { /* VQSHRN / VQRSHRN */ -gen_neon_narrow_satu(size - 1, tmp, cpu_V0); +} else { +if (u) { /* VQSHRUN / VQRSHRUN */ +gen_neon_narrow_satu(size - 1, tmp, cpu_V0); +} else { /* VQSHRN / VQRSHRN */ +gen_neon_narrow_sats(size - 1, tmp, cpu_V0); +} } neon_store_reg(rd, pass, tmp); } /* for pass */ -- 1.7.2.3
[Qemu-devel] [PATCH 5/8] target-arm: fix neon vqrshl instruction
From: Christophe Lyon christophe.l...@st.com Signed-off-by: Juha Riihimäki juha.riihim...@nokia.com Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/neon_helper.c | 21 ++--- 1 files changed, 18 insertions(+), 3 deletions(-) diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c index 71e3c74..3337c52 100644 --- a/target-arm/neon_helper.c +++ b/target-arm/neon_helper.c @@ -825,9 +825,24 @@ uint64_t HELPER(neon_qshlu_s64)(CPUState *env, uint64_t valop, uint64_t shiftop) }} while (0) NEON_VOP_ENV(qrshl_u8, neon_u8, 4) NEON_VOP_ENV(qrshl_u16, neon_u16, 2) -NEON_VOP_ENV(qrshl_u32, neon_u32, 1) #undef NEON_FN +uint32_t HELPER(neon_qrshl_u32)(CPUState *env, uint32_t val, uint32_t shiftop) +{ +int8_t shift = (int8_t)shiftop; +if (shift 0) { +val = ((uint64_t)val + (1 (-1 - shift))) -shift; +} else { +uint32_t tmp = val; +val = shift; +if ((val shift) != tmp) { +SET_QC(); +val = ~0; +} +} +return val; +} + uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop) { int8_t shift = (int8_t)shiftop; @@ -853,7 +868,7 @@ uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop) dest = src1 tmp; \ if ((dest tmp) != src1) { \ SET_QC(); \ -dest = src1 31; \ +dest = (uint32_t)(1 (sizeof(src1) * 8 - 1)) - (src1 0 ? 1 : 0); \ } \ }} while (0) NEON_VOP_ENV(qrshl_s8, neon_s8, 4) @@ -869,7 +884,7 @@ uint64_t HELPER(neon_qrshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop) if (shift 0) { val = (val + (1 (-1 - shift))) -shift; } else { -int64_t tmp = val;; +int64_t tmp = val; val = shift; if ((val shift) != tmp) { SET_QC(); -- 1.7.2.3
[Qemu-devel] [PATCH 3/8] target-arm: VQRSHRN related changes
From: Christophe Lyon christophe.l...@st.com More fixes for VQSHRN and VQSHRUN. Signed-off-by: Peter Maydell peter.mayd...@linaro.org Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/translate.c |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/target-arm/translate.c b/target-arm/translate.c index cda5a73..3537698 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -4108,8 +4108,8 @@ static inline void gen_neon_shift_narrow(int size, TCGv var, TCGv shift, } else { if (u) { switch (size) { -case 1: gen_helper_neon_rshl_u16(var, var, shift); break; -case 2: gen_helper_neon_rshl_u32(var, var, shift); break; +case 1: gen_helper_neon_shl_u16(var, var, shift); break; +case 2: gen_helper_neon_shl_u32(var, var, shift); break; default: abort(); } } else { -- 1.7.2.3
[Qemu-devel] [PATCH 2/8] target-arm: Create and use neon_unarrow_sat* helpers
From: Christophe Lyon christophe.l...@st.com Fix VQMOVUN, improve VQSHRUN and VQRSHRUN. Signed-off-by: Peter Maydell peter.mayd...@linaro.org Signed-off-by: Christophe Lyon christophe.l...@st.com --- target-arm/helpers.h |3 ++ target-arm/neon_helper.c | 63 ++ target-arm/translate.c | 43 ++- 3 files changed, 96 insertions(+), 13 deletions(-) diff --git a/target-arm/helpers.h b/target-arm/helpers.h index 8a2564e..4d0de00 100644 --- a/target-arm/helpers.h +++ b/target-arm/helpers.h @@ -299,10 +299,13 @@ DEF_HELPER_3(neon_qrdmulh_s32, i32, env, i32, i32) DEF_HELPER_1(neon_narrow_u8, i32, i64) DEF_HELPER_1(neon_narrow_u16, i32, i64) +DEF_HELPER_2(neon_unarrow_sat8, i32, env, i64) DEF_HELPER_2(neon_narrow_sat_u8, i32, env, i64) DEF_HELPER_2(neon_narrow_sat_s8, i32, env, i64) +DEF_HELPER_2(neon_unarrow_sat16, i32, env, i64) DEF_HELPER_2(neon_narrow_sat_u16, i32, env, i64) DEF_HELPER_2(neon_narrow_sat_s16, i32, env, i64) +DEF_HELPER_2(neon_unarrow_sat32, i32, env, i64) DEF_HELPER_2(neon_narrow_sat_u32, i32, env, i64) DEF_HELPER_2(neon_narrow_sat_s32, i32, env, i64) DEF_HELPER_1(neon_narrow_high_u8, i32, i64) diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c index 5971275..71e3c74 100644 --- a/target-arm/neon_helper.c +++ b/target-arm/neon_helper.c @@ -1094,6 +1094,33 @@ uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x) return ((x 16) 0x) | ((x 32) 0x); } +uint32_t HELPER(neon_unarrow_sat8)(CPUState *env, uint64_t x) +{ +uint16_t s; +uint8_t d; +uint32_t res = 0; +#define SAT8(n) \ +s = x n; \ +if (s 0x8000) { \ +SET_QC(); \ +} else { \ +if (s 0xff) { \ +d = 0xff; \ +SET_QC(); \ +} else { \ +d = s; \ +} \ +res |= (uint32_t)d (n / 2); \ +} + +SAT8(0); +SAT8(16); +SAT8(32); +SAT8(48); +#undef SAT8 +return res; +} + uint32_t HELPER(neon_narrow_sat_u8)(CPUState *env, uint64_t x) { uint16_t s; @@ -1140,6 +1167,29 @@ uint32_t HELPER(neon_narrow_sat_s8)(CPUState *env, uint64_t x) return res; } +uint32_t HELPER(neon_unarrow_sat16)(CPUState *env, uint64_t x) +{ 
+uint32_t high; +uint32_t low; +low = x; +if (low 0x8000) { +low = 0; +SET_QC(); +} else if (low 0x) { +low = 0x; +SET_QC(); +} +high = x 32; +if (high 0x8000) { +high = 0; +SET_QC(); +} else if (high 0x) { +high = 0x; +SET_QC(); +} +return low | (high 16); +} + uint32_t HELPER(neon_narrow_sat_u16)(CPUState *env, uint64_t x) { uint32_t high; @@ -1174,6 +1224,19 @@ uint32_t HELPER(neon_narrow_sat_s16)(CPUState *env, uint64_t x) return (uint16_t)low | (high 16); } +uint32_t HELPER(neon_unarrow_sat32)(CPUState *env, uint64_t x) +{ +if (x 0x8000ull) { +SET_QC(); +return 0; +} +if (x 0xu) { +SET_QC(); +return 0xu; +} +return x; +} + uint32_t HELPER(neon_narrow_sat_u32)(CPUState *env, uint64_t x) { if (x 0xu) { diff --git a/target-arm/translate.c b/target-arm/translate.c index b14fa4b..cda5a73 100644 --- a/target-arm/translate.c +++ b/target-arm/translate.c @@ -4078,6 +4078,16 @@ static inline void gen_neon_narrow_satu(int size, TCGv dest, TCGv_i64 src) } } +static inline void gen_neon_unarrow_sats(int size, TCGv dest, TCGv_i64 src) +{ +switch(size) { +case 0: gen_helper_neon_unarrow_sat8(dest, cpu_env, src); break; +case 1: gen_helper_neon_unarrow_sat16(dest, cpu_env, src); break; +case 2: gen_helper_neon_unarrow_sat32(dest, cpu_env, src); break; +default: abort(); +} +} + static inline void gen_neon_shift_narrow(int size, TCGv var, TCGv shift, int q, int u) { @@ -4852,13 +4862,14 @@ static int disas_neon_data_insn(CPUState * env, DisasContext *s, uint32_t insn) dead_tmp(tmp3); } tmp = new_tmp(); -if (op == 8 !u) { -gen_neon_narrow(size - 1, tmp, cpu_V0); -} else { -if (op == 8) -gen_neon_narrow_sats(size - 1, tmp, cpu_V0); -else -gen_neon_narrow_satu(size - 1, tmp, cpu_V0); +if (op == 8) { +if (u) { /* VQSHRUN / VQRSHRUN */ +gen_neon_unarrow_sats(size - 1, tmp, cpu_V0); +} else { /* VSHRN / VRSHRN */ +gen_neon_narrow(size - 1, tmp, cpu_V0); +} +} else { /* VQSHRN / VQRSHRN */ +gen_neon_narrow_satu(size - 1, tmp, cpu_V0);
[Qemu-devel] [PATCH 8/8] target-arm: Fix VQRSHL Neon instructions (signed/unsigned 64 bits and signed 32 bits variants).
From: Christophe Lyon christophe.l...@st.com

The addition of the rounding constant could cause overflows.

Signed-off-by: Christophe Lyon christophe.l...@st.com
---
 target-arm/neon_helper.c |   50 ++++++++++++++++++++++++++++++++++++++---
 1 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/target-arm/neon_helper.c b/target-arm/neon_helper.c
index 3337c52..9faa348 100644
--- a/target-arm/neon_helper.c
+++ b/target-arm/neon_helper.c
@@ -847,7 +847,23 @@ uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
 {
     int8_t shift = (int8_t)shiftop;
     if (shift < 0) {
-        val = (val + (1 << (-1 - shift))) >> -shift;
+        uint64_t round = (uint64_t)1 << (-1 - shift);
+        /* Reduce the range as long as the addition overflows.  It's
+         * sufficient to check if (val+round) is < val
+         * because val and round are > 0.  */
+        while (((val + round) < val) && round > 1) {
+            shift++;
+            round >>= 1;
+            val >>= 1;
+        }
+        if ((val + round) < val) {
+            /* If addition still overflows at this point, it means
+             * that round==1, thus shift==-1, and also that
+             * val==0xFFFFFFFFFFFFFFFF.  */
+            val = 0x8000000000000000LL;
+        } else {
+            val = (val + round) >> -shift;
+        }
     } else {
         uint64_t tmp = val;
         val <<= shift;
@@ -859,11 +875,15 @@ uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
     return val;
 }
 
+/* The addition of the rounding constant may overflow, so we use an
+ * intermediate 64 bits accumulator, which is really needed only when
+ * dealing with 32 bits input values.  */
 #define NEON_FN(dest, src1, src2) do { \
     int8_t tmp; \
     tmp = (int8_t)src2; \
     if (tmp < 0) { \
-        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
+        int64_t big_dest = ((int64_t)src1 + (1 << (-1 - tmp))); \
+        dest = big_dest >> -tmp; \
     } else { \
         dest = src1 << tmp; \
         if ((dest >> tmp) != src1) { \
@@ -876,19 +896,41 @@
 NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
 NEON_VOP_ENV(qrshl_s32, neon_s32, 1)
 #undef NEON_FN
 
+/* Handling addition overflow with 64 bits inputs values is more
+ * tricky than with 32 bits values.
+ */
 uint64_t HELPER(neon_qrshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
 {
     int8_t shift = (uint8_t)shiftop;
     int64_t val = valop;
     if (shift < 0) {
-        val = (val + (1 << (-1 - shift))) >> -shift;
+        int64_t round = (int64_t)1 << (-1 - shift);
+        /* Reduce the range as long as the addition overflows.  It's
+         * sufficient to check if (val+round) is < 0 and val > 0
+         * because round is > 0.  */
+        while ((val > 0) && ((val + round) < 0) && round > 1) {
+            shift++;
+            round >>= 1;
+            val >>= 1;
+        }
+        if ((val > 0) && (val + round) < 0) {
+            /* If addition still overflows at this point, it means
+             * that round==1, thus shift==-1, and also that
+             * val==0x7FFFFFFFFFFFFFFF.  */
+            val = 0x4000000000000000LL;
+        } else {
+            val = (val + round) >> -shift;
+        }
     } else {
         int64_t tmp = val;
         val <<= shift;
         if ((val >> shift) != tmp) {
             SET_QC();
-            val = tmp >> 31;
+            if (tmp < 0) {
+                val = INT64_MIN;
+            } else {
+                val = INT64_MAX;
+            }
         }
     }
     return val;
-- 
1.7.2.3