Re: [Qemu-devel] [PATCH v7 08/26] tcg/i386: Add vector operations

2017-12-27 Thread Kirill Batuzov
On Mon, 18 Dec 2017, Richard Henderson wrote:

> The x86 vector instruction set is extremely irregular.  With newer
> editions, Intel has filled in some of the blanks.  However, we don't
> get many 64-bit operations until SSE4.2, introduced in 2009.
> 
> The subsequent edition was for AVX1, introduced in 2011, which added
> three-operand addressing, and adjusts how all instructions should be
> encoded.
> 
> Given the relatively narrow 2 year window between possible to support
> and desirable to support, and to vastly simplify code maintenance,
> I am only planning to support AVX1 and later cpus.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/i386/tcg-target.h |  36 ++-
>  tcg/i386/tcg-target.inc.c | 561 ++
>  2 files changed, 546 insertions(+), 51 deletions(-)
> 

> diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
> index 63d27f10e7..e9a4d92598 100644
> --- a/tcg/i386/tcg-target.inc.c
> +++ b/tcg/i386/tcg-target.inc.c

> -static inline void tcg_out_mov(TCGContext *s, TCGType type,
> -   TCGReg ret, TCGReg arg)
> +static void tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
> +{
> +int rexw = 0;
> +
> +if (arg == ret) {
> +return;
> +}
> +switch (type) {
> +case TCG_TYPE_I64:
> +rexw = P_REXW;
> +/* fallthru */
> +case TCG_TYPE_I32:
> +if (ret < 16) {
> +if (arg < 16) {
> +tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
> +} else {
> +tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, ret, 0, arg);
> +}
> +} else {
> +if (arg < 16) {
> +tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
> +} else {
> +tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
> +}
> +}
> +break;
> +
> +case TCG_TYPE_V64:
> +tcg_debug_assert(ret >= 16 && arg >= 16);
> +tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
> +break;
> +case TCG_TYPE_V128:
> +tcg_debug_assert(ret >= 16 && arg >= 16);
> +tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
> +break;
> +case TCG_TYPE_V256:
> +tcg_debug_assert(ret >= 16 && arg >= 16);
> +tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
> +break;
> +
> +default:
> +g_assert_not_reached();
> +}
> +}

I think something is wrong with the instruction encodings here. It looks like
  tcg_out_mov(&tcg_ctx, TCG_TYPE_I64, TCG_REG_EBP, TCG_REG_XMM0)
produces
  vmovq %xmm5, %rax
instead of a move from %xmm0 to %rbp.

Here is the dump.

IN: 
0x00400580:  4e040c41  dup  v1.4s, w2
0x00400584:  4b0203e2  neg  w2, w2
0x00400588:  3d800021  str  q1, [x1]
0x0040058c:  d65f03c0  ret  

OP after optimization and liveness analysis:
 ld_i32 tmp0,env,$0xffec  dead: 1
 movi_i32 tmp1,$0x0
 brcond_i32 tmp0,tmp1,lt,$L0  dead: 0 1

  00400580  
 dup_vec v128,e32,tmp2,x2
 st_vec v128,e8,tmp2,env,$0x8b0   dead: 0

  00400584  
 ext32u_i64 tmp4,x2   dead: 1
 neg_i64 tmp5,tmp4dead: 1
 ext32u_i64 x2,tmp5   sync: 0  dead: 0 1

<...>

OUT: [size=111]
0x6075bf40:  41 8b 6e ec              movl     -0x14(%r14), %ebp
0x6075bf44:  85 ed                    testl    %ebp, %ebp
0x6075bf46:  0f 8c 59 00 00 00        jl       0x6075bfa5
0x6075bf4c:  c4 c1 7a 7e 46 50        vmovq    0x50(%r14), %xmm0
0x6075bf52:  c5 f9 70 c8 00           vpshufd  $0, %xmm0, %xmm1
0x6075bf57:  c4 c1 7a 7f 8e b0 08 00  vmovdqu  %xmm1, 0x8b0(%r14)
0x6075bf5f:  00
0x6075bf60:  c4 e1 f9 7e e8           vmovq    %xmm5, %rax
0x6075bf65:  8b ed                    movl     %ebp, %ebp
0x6075bf67:  48 f7 dd                 negq     %rbp
0x6075bf6a:  8b ed                    movl     %ebp, %ebp
0x6075bf6c:  49 89 6e 50              movq     %rbp, 0x50(%r14)
<...>

%xmm5 is used uninitialized, there is no move from either %xmm0 or
0x50(%r14) to %ebp, and there are two unnecessary movl %ebp, %ebp instructions.
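
For what it's worth, decoding the stray instruction (c4 e1 f9 7e e8 = VMOVQ
with ModRM reg=5, r/m=0) suggests that in the GPR<->XMM cases the register
arguments to tcg_out_vex_modrm() are in the wrong order: for MOVD_EyVy the
XMM register belongs in the ModRM reg field and the GPR in the r/m field.
A sketch of the suspected correction (untested, inferred only from the dump
above):

    if (arg < 16) {
        tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
    } else {
        /* XMM source in the reg field, GPR destination in r/m.  */
        tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
    }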

-- 
Kirill



Re: [Qemu-devel] [PATCH v7 02/26] tcg: Add generic vector expanders

2017-12-27 Thread Kirill Batuzov
On Mon, 18 Dec 2017, Richard Henderson wrote:

> Signed-off-by: Richard Henderson 
> ---
>  Makefile.target  |2 +-
>  accel/tcg/tcg-runtime.h  |   29 ++
>  tcg/tcg-gvec-desc.h  |   49 ++
>  tcg/tcg-op-gvec.h|  152 ++
>  tcg/tcg-op.h |1 +
>  accel/tcg/tcg-runtime-gvec.c |  295 
>  tcg/tcg-op-gvec.c | 1099 ++
>  tcg/tcg-op-vec.c |   36 +-
>  accel/tcg/Makefile.objs  |2 +-
>  9 files changed, 1655 insertions(+), 10 deletions(-)
>  create mode 100644 tcg/tcg-gvec-desc.h
>  create mode 100644 tcg/tcg-op-gvec.h
>  create mode 100644 accel/tcg/tcg-runtime-gvec.c
>  create mode 100644 tcg/tcg-op-gvec.c
> 

> diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
> new file mode 100644
> index 00..120e301096
> --- /dev/null
> +++ b/tcg/tcg-op-gvec.c

> +/* Set OPRSZ bytes at DOFS to replications of IN or IN_C.  */
> +static void do_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
> +   uint32_t maxsz, TCGv_i32 in, uint32_t in_c,
> +   void (*ool)(TCGv_ptr, TCGv_i32, TCGv_i32))
> +{
> +TCGType type;
> +TCGv_vec t_vec;
> +uint32_t i;
> +
> +assert(vece <= MO_32);
> +
> +if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) {
> +type = TCG_TYPE_V256;
> +} else if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) {
> +type = TCG_TYPE_V128;
> +} else if (TCG_TARGET_HAS_v64 && check_size_impl(oprsz, 8)) {
> +type = TCG_TYPE_V64;
> +} else {
> +if (check_size_impl(oprsz, 4)) {
> +TCGv_i32 t_i32 = tcg_temp_new_i32();
> +
> +if (in) {
> +switch (vece) {
> +case MO_8:
> +tcg_gen_deposit_i32(t_i32, in, in, 8, 24);
> +in = t_i32;
> +/* fallthru */
> +case MO_16:
> +tcg_gen_deposit_i32(t_i32, in, in, 16, 16);
> +break;
> +}

If vece == MO_32 then t_i32 will be left uninitialized here... 

> +} else {
> +switch (vece) {
> +case MO_8:
> +in_c = (in_c & 0xff) * 0x01010101;
> +break;
> +case MO_16:
> +in_c = deposit32(in_c, 16, 16, in_c);
> +break;
> +}
> +tcg_gen_movi_i32(t_i32, in_c);
> +}
> +
> +for (i = 0; i < oprsz; i += 4) {
> +tcg_gen_st_i32(t_i32, cpu_env, dofs + i);
> +}

...and used uninitialized here.
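
A minimal way to plug that hole (just a sketch of one option, not necessarily
how it should be fixed) would be an explicit MO_32 arm in the switch above:

    case MO_32:
        tcg_gen_mov_i32(t_i32, in);   /* nothing to replicate, plain copy */
        break;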

> +tcg_temp_free_i32(t_i32);
> +goto done;
> +} else {
> +TCGv_i32 t_i32 = in ? in : tcg_const_i32(in_c);
> +TCGv_ptr a0 = tcg_temp_new_ptr();
> +TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));
> +
> +tcg_gen_addi_ptr(a0, cpu_env, dofs);
> +ool(a0, desc, t_i32);
> +
> +tcg_temp_free_ptr(a0);
> +tcg_temp_free_i32(desc);
> +if (in == NULL) {
> +tcg_temp_free_i32(t_i32);
> +}
> +return;
> +}
> +}
> +
> +t_vec = tcg_temp_new_vec(type);
> +if (in) {
> +tcg_gen_dup_i32_vec(vece, t_vec, in);
> +} else {
> +switch (vece) {
> +case MO_8:
> +tcg_gen_dup8i_vec(t_vec, in_c);
> +break;
> +case MO_16:
> +tcg_gen_dup16i_vec(t_vec, in_c);
> +break;
> +default:
> +tcg_gen_dup32i_vec(t_vec, in_c);
> +break;
> +}
> +}
> +
> +i = 0;
> +if (TCG_TARGET_HAS_v256) {
> +for (; i + 32 <= oprsz; i += 32) {
> +tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
> +}
> +}
> +if (TCG_TARGET_HAS_v128) {
> +for (; i + 16 <= oprsz; i += 16) {
> +tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
> +}
> +}
> +if (TCG_TARGET_HAS_v64) {
> +for (; i < oprsz; i += 8) {
> +tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
> +}
> +}
> +tcg_temp_free_vec(t_vec);
> +
> + done:
> +tcg_debug_assert(i == oprsz);
> +if (i < maxsz) {
> +expand_clr(dofs + i, maxsz - i);
> +}
> +}
> +
> +/* Likewise, but with 64-bit quantities.  */
> +static void do_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
> +   uint32_t maxsz, TCGv_i64 in, uint64_t in_c)
> +{
> +TCGType type;
> +TCGv_vec t_vec;
> +uint32_t i;
> +
> +assert(vece <= MO_64);
> +
> +if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) {
> +type = TCG_TYPE_V256;
> +} else if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) {
> +type = TCG_TYPE_V128;
> +} else if (TCG_TARGET_HAS_v64 && TCG_TARGET_REG_BITS =

Re: [Qemu-devel] [PATCH v6 05/26] tcg: Add generic vector expanders

2017-12-06 Thread Kirill Batuzov
On Tue, 21 Nov 2017, Richard Henderson wrote:

> diff --git a/tcg/tcg-op-gvec.c b/tcg/tcg-op-gvec.c
> new file mode 100644
> index 00..925c293f9c
> --- /dev/null
> +++ b/tcg/tcg-op-gvec.c

<...>

> +/* Set OPRSZ bytes at DOFS to replications of IN or IN_C.  */
> +static void do_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
> +   uint32_t maxsz, TCGv_i32 in, uint32_t in_c,
> +   void (*ool)(TCGv_ptr, TCGv_i32, TCGv_i32))
> +{
> +TCGv_vec t_vec;
> +uint32_t i;
> +
> +if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) {
> +t_vec = tcg_temp_new_vec(TCG_TYPE_V256);
> +} else if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) {
> +t_vec = tcg_temp_new_vec(TCG_TYPE_V128);
> +} else if (TCG_TARGET_HAS_v64 && check_size_impl(oprsz, 8)) {
> +t_vec = tcg_temp_new_vec(TCG_TYPE_V64);
> +} else  {
> +TCGv_i32 t_i32 = in ? in : tcg_const_i32(in_c);
> +
> +if (check_size_impl(oprsz, 4)) {
> +for (i = 0; i < oprsz; i += 4) {
> +tcg_gen_st_i32(t_i32, cpu_env, dofs + i);
> +}

You are ignoring the VECE argument here and always duplicating a 32-bit value.
For this to be correct for smaller VECE, you need to prepare this 32-bit
value beforehand (see the sketch after the dump below). The current code
produces an incorrect expansion:

IN: 
0x004005e8:  d2824682  movz x2, #0x1234
0x004005ec:  4e010c41  dup  v1.16b, w2

OP:

  004005e8  
 movi_i64 x2,$0x1234

  004005ec  
 st_i32 x2,env,$0x8b0
 st_i32 x2,env,$0x8b4
 st_i32 x2,env,$0x8b8
 st_i32 x2,env,$0x8bc
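
For reference, the replication being described amounts to the following
host-side computation before the per-word stores (a minimal plain-C sketch,
not taken from the patch; it assumes the usual MO_8/MO_16/MO_32 encoding of
0/1/2):

    #include <stdint.h>

    /* Replicate a sub-32-bit element value across a full 32-bit word.  */
    static uint32_t dup_to_u32(unsigned vece, uint32_t in)
    {
        switch (vece) {
        case 0: /* MO_8: copy the low byte into all four bytes */
            return (in & 0xff) * 0x01010101u;
        case 1: /* MO_16: copy the low halfword into both halfwords */
            return (in & 0xffff) * 0x00010001u;
        default: /* MO_32: already the right width */
            return in;
        }
    }

The v7 version of do_dup_i32 quoted earlier does essentially this, using
tcg_gen_deposit_i32 for the non-constant case and the equivalent arithmetic
for constants.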



> +if (in == NULL) {
> +tcg_temp_free_i32(t_i32);
> +}
> +goto done;
> +} else {
> +TCGv_ptr a0 = tcg_temp_new_ptr();
> +TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));
> +
> +tcg_gen_addi_ptr(a0, cpu_env, dofs);
> +ool(a0, desc, t_i32);
> +
> +tcg_temp_free_ptr(a0);
> +tcg_temp_free_i32(desc);
> +if (in == NULL) {
> +tcg_temp_free_i32(t_i32);
> +}
> +return;
> +}
> +}
> +
> +if (in) {
> +tcg_gen_dup_i32_vec(vece, t_vec, in);
> +} else {
> +tcg_gen_dup32i_vec(t_vec, in_c);

Maybe it would be better to call this function tcg_gen_dupi_i32_vec?
That would match the current naming convention, e.g. tcg_gen_addi_i32.

> +}
> +
> +i = 0;
> +if (TCG_TARGET_HAS_v256) {
> +for (; i + 32 <= oprsz; i += 32) {
> +tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
> +}
> +}
> +if (TCG_TARGET_HAS_v128) {
> +for (; i + 16 <= oprsz; i += 16) {
> +tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
> +}
> +}
> +if (TCG_TARGET_HAS_v64) {
> +for (; i < oprsz; i += 8) {
> +tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
> +}
> +}
> +
> + done:
> +tcg_debug_assert(i == oprsz);
> +if (i < maxsz) {
> +expand_clr(dofs + i, maxsz - i);
> +}
> +}
> +

<...>
(the following is context)

> +void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
> +  uint32_t maxsz, TCGv_i32 in)
> +{
> +typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
> +static dup_fn * const fns[3] = {
> +gen_helper_gvec_dup8,
> +gen_helper_gvec_dup16,
> +gen_helper_gvec_dup32
> +};
> +
> +check_size_align(oprsz, maxsz, dofs);
> +tcg_debug_assert(vece <= MO_32);
> +do_dup_i32(vece, dofs, oprsz, maxsz, in, 0, fns[vece]);
> +}
> +
> +void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
> +  uint32_t maxsz, TCGv_i64 in)
> +{
> +check_size_align(oprsz, maxsz, dofs);
> +tcg_debug_assert(vece <= MO_64);
> +if (vece <= MO_32) {
> +/* This works for both host register sizes.  */
> +tcg_gen_gvec_dup_i32(vece, dofs, oprsz, maxsz, (TCGv_i32)in);
> +} else {
> +do_dup_i64(vece, dofs, oprsz, maxsz, in, 0);
> +}
> +}
> +
> +void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
> +  uint32_t oprsz, uint32_t maxsz)
> +{
> +tcg_debug_assert(vece <= MO_64);
> +if (vece <= MO_32) {
> +TCGv_i32 in = tcg_temp_new_i32();
> +switch (vece) {
> +case MO_8:
> +tcg_gen_ld8u_i32(in, cpu_env, aofs);
> +break;
> +case MO_16:
> +tcg_gen_ld16u_i32(in, cpu_env, aofs);
> +break;
> +case MO_32:
> +tcg_gen_ld_i32(in, cpu_env, aofs);
> +break;
> +}
> +tcg_gen_gvec_dup_i32(vece, dofs, oprsz, maxsz, in);
> +tcg_temp_free_i32(in);
> +} else {
> +TCGv_i64 in = tcg_temp_new_i64();
> +tcg_gen_ld_i64(in, cpu_

Re: [Qemu-devel] [PATCH v6 21/26] tcg: Add generic vector ops for multiplication

2017-12-05 Thread Kirill Batuzov
On Tue, 21 Nov 2017, Richard Henderson wrote:

> Signed-off-by: Richard Henderson 

> +void tcg_gen_mul_vec(unsigned vece, TCGv_vec r, TCGv_vec a, TCGv_vec b)
> +{
> +TCGTemp *rt = tcgv_vec_temp(r);
> +TCGTemp *at = tcgv_vec_temp(a);
> +TCGTemp *bt = tcgv_vec_temp(b);
> +TCGArg ri = temp_arg(rt);
> +TCGArg ai = temp_arg(at);
> +TCGArg bi = temp_arg(bt);
> +TCGType type = rt->base_type;
> +int can;
> +
> +tcg_debug_assert(at->base_type == type);
> +tcg_debug_assert(bt->base_type == type);
> +can = tcg_can_emit_vec_op(INDEX_op_cmp_vec, type, vece);

Should be INDEX_op_mul_vec in the line above.

> +if (can > 0) {
> +vec_gen_3(INDEX_op_mul_vec, type, vece, ri, ai, bi);
> +} else {
> +tcg_debug_assert(can < 0);
> +tcg_expand_vec_op(INDEX_op_mul_vec, type, vece, ri, ai, bi);
> +}
> +}

-- 
Kirill



Re: [Qemu-devel] [PATCH RFC 2/3] tcg/optimize: do copy propagation for memory locations

2017-11-23 Thread Kirill Batuzov
On Wed, 22 Nov 2017, Richard Henderson wrote:

> On 11/09/2017 03:41 PM, Kirill Batuzov wrote:
> > +typedef struct TCGMemLocation {
> > +/* Offset is relative to ENV. Only fields of CPUState are accounted.  
> > */
> > +tcg_target_ulong offset;
> > +tcg_target_ulong size;
> > +TCGType type;
> > +/* Pointer to a temp containing a valid copy of this memory location.  
> > */
> > +TCGTemp *copy;
> > +/* Pointer to the next memory location containing copy of the same
> > +   content.  */
> > +struct TCGMemLocation *next_copy;
> 
> Did you ever find copies of memories that weren't also copies within temps?
> I.e. you could have found this through copy->next_copy?

Yes. This happens when a temp was stored to multiple memory locations.

> 
> > +/* Double-linked list of all memory locations.  */
> > +struct TCGMemLocation *next;
> > +struct TCGMemLocation **prev_ptr;
> 
> Use QTAILQ_* for common double-linked-list manipulation.
> 
> > +struct TCGMemLocation *mem_locations;
> > +struct TCGMemLocation *free_mls;
> 
> These can't be globals anymore -- we're do multi-threaded code generation now.

Then they should be moved to TCGContext I assume?

> 
> > @@ -77,12 +125,27 @@ static void reset_ts(TCGTemp *ts)
> >  struct tcg_temp_info *pi = ts_info(ti->prev_copy);
> >  struct tcg_temp_info *ni = ts_info(ti->next_copy);
> >  
> > +if (ti->mem_loc && ts_is_copy(ts) && 0) {
> > +TCGMemLocation *ml, *nml;
> > +for (ml = ti->mem_loc; ml; ml = nml) {
> > +nml = ml->next_copy;
> > +ml->copy = ti->next_copy;
> > +ml->next_copy = ni->mem_loc;
> > +ni->mem_loc = ml;
> > +}
> > +} else {
> > +while (ti->mem_loc) {
> > +reset_ml(ti->mem_loc);
> > +}
> 
> Why would a single temp be associated with more than one memory?
>

Because it was stored to multiple memory locations, and when reading from
any of these locations we want to access the temp instead.

For example, this happens when we translate the ARM32 VDUP instruction. One
value is duplicated into all elements of the vector. When the elements of the
vector are accessed later, we want to use this value instead of rereading
it from memory.

> > +static TCGOpcode ld_to_mov(TCGOpcode op)
> > +{
> > +#define LD_TO_EXT(sz, w) \
> > +case glue(glue(INDEX_op_ld, sz), glue(_i, w)):   \
> > +return glue(glue(INDEX_op_ext, sz), glue(_i, w))
> > +
> > +switch (op) {
> > +LD_TO_EXT(8s, 32);
> > +LD_TO_EXT(8u, 32);
> > +LD_TO_EXT(16s, 32);
> > +LD_TO_EXT(16u, 32);
> > +LD_TO_EXT(8s, 64);
> > +LD_TO_EXT(8u, 64);
> > +LD_TO_EXT(16s, 64);
> > +LD_TO_EXT(16u, 64);
> > +LD_TO_EXT(32s, 64);
> > +LD_TO_EXT(32u, 64);
> 
> How many extensions did you find?  Or is this Just In Case?
> 

One, in an AArch64 build of x264, so this is more just in case. But
it may become useful if we try to emulate some 8- or 16-bit architectures.

-- 
Kirill



[Qemu-devel] [PATCH RFC 3/3] tcg/optimize: handle vector loads and stores during copy propagation

2017-11-09 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/optimize.c | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index da7f069444..1b6962c6c5 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -318,6 +318,8 @@ static TCGOpcode ld_to_mov(TCGOpcode op)
 return INDEX_op_mov_i32;
 case INDEX_op_ld_i64:
 return INDEX_op_mov_i64;
+case INDEX_op_ld_vec:
+return INDEX_op_mov_vec;
 default:
 tcg_abort();
 }
@@ -782,6 +784,13 @@ static bool swap_commutative2(TCGArg *p1, TCGArg *p2)
 return false;
 }
 
+static int tcg_vec_size(const TCGOp *op)
+{
+TCGArg arg = op->args[0];
+TCGTemp *tmp = arg_temp(arg);
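+/* TCG_TYPE_V64 is 8 bytes (1 << 3); each successive vector type doubles it. */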
+return 1 << (3 + tmp->base_type - TCG_TYPE_V64);
+}
+
 static int ldst_size(const TCGOp *op)
 {
 switch (op->opc) {
@@ -802,6 +811,9 @@ static int ldst_size(const TCGOp *op)
 case INDEX_op_st_i64:
 case INDEX_op_ld_i64:
 return 8;
+case INDEX_op_ld_vec:
+case INDEX_op_st_vec:
+return tcg_vec_size(op);
 default:
 /* Some unsupported opcode? */
 tcg_abort();
@@ -1660,6 +1672,7 @@ void tcg_optimize(TCGContext *s)
 CASE_OP_32_64(st16):
 CASE_OP_32_64(st):
 case INDEX_op_st32_i64:
+case INDEX_op_st_vec:
 if (op->args[1] == tcgv_ptr_arg(cpu_env)) {
 remove_ml_range(op->args[2], ldst_size(op));
 new_ml(op->args[2], ldst_size(op), arg_temp(op->args[0]));
@@ -1677,6 +1690,7 @@ void tcg_optimize(TCGContext *s)
 CASE_OP_32_64(ld):
 case INDEX_op_ld32s_i64:
 case INDEX_op_ld32u_i64:
+case INDEX_op_ld_vec:
 /* Only loads that are relative to ENV can be handled.  */
 if (op->args[1] == tcgv_ptr_arg(cpu_env)) {
 ml = find_ml(op->args[2], ldst_size(op),
@@ -1689,6 +1703,14 @@ void tcg_optimize(TCGContext *s)
 TCGTemp *copy = find_better_copy(s, ml->copy);
 tcg_opt_gen_mov(s, op, op->args[0], temp_arg(copy));
 break;
+} else if (re == INDEX_op_mov_vec) {
+if (ts_are_copies(arg_temp(op->args[0]), ml->copy)) {
+tcg_op_remove(s, op);
+break;
+}
+op->opc = re;
+op->args[1] = temp_arg(find_better_copy(s, ml->copy));
+op->args[2] = op->args[3];
 } else {
 if (tcg_op_defs[re].flags & TCG_OPF_NOT_PRESENT) {
 /* Required operation is not supported by host.  */
-- 
2.11.0




[Qemu-devel] [PATCH RFC 1/3] tcg: support MOV_VEC and MOVI_VEC opcodes in register allocator

2017-11-09 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/tcg.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index a7854a59a1..6db7dd526a 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -3327,10 +3327,12 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
 switch (opc) {
 case INDEX_op_mov_i32:
 case INDEX_op_mov_i64:
+case INDEX_op_mov_vec:
 tcg_reg_alloc_mov(s, op);
 break;
 case INDEX_op_movi_i32:
 case INDEX_op_movi_i64:
+case INDEX_op_movi_vec:
 tcg_reg_alloc_movi(s, op);
 break;
 case INDEX_op_insn_start:
-- 
2.11.0




[Qemu-devel] [PATCH RFC 2/3] tcg/optimize: do copy propagation for memory locations

2017-11-09 Thread Kirill Batuzov
During the copy propagation phase, keep track of memory locations that store the
value of a known live variable. Only memory locations that are addressed relative
to ENV are tracked; any other access types are handled conservatively.

When a load is encountered, the source memory location is checked against the
list of known memory locations. If its content is a copy of some variable, then a
MOV or EXT from that variable is issued instead of the load. This allows us to
keep the values of CPUState fields that are not represented by global variables
in host registers during computations involving them.

Signed-off-by: Kirill Batuzov 
---
 tcg/optimize.c | 266 +
 1 file changed, 266 insertions(+)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index 847dfa44c9..da7f069444 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -38,8 +38,28 @@ struct tcg_temp_info {
 TCGTemp *next_copy;
 tcg_target_ulong val;
 tcg_target_ulong mask;
+struct TCGMemLocation *mem_loc;
 };
 
+typedef struct TCGMemLocation {
+/* Offset is relative to ENV. Only fields of CPUState are accounted.  */
+tcg_target_ulong offset;
+tcg_target_ulong size;
+TCGType type;
+/* Pointer to a temp containing a valid copy of this memory location.  */
+TCGTemp *copy;
+/* Pointer to the next memory location containing copy of the same
+   content.  */
+struct TCGMemLocation *next_copy;
+
+/* Double-linked list of all memory locations.  */
+struct TCGMemLocation *next;
+struct TCGMemLocation **prev_ptr;
+} TCGMemLocation;
+
+struct TCGMemLocation *mem_locations;
+struct TCGMemLocation *free_mls;
+
 static inline struct tcg_temp_info *ts_info(TCGTemp *ts)
 {
 return ts->state_ptr;
@@ -70,6 +90,34 @@ static inline bool arg_is_copy(TCGArg arg)
 return ts_is_copy(arg_temp(arg));
 }
 
+/* Reset MEMORY LOCATION state. */
+static void reset_ml(TCGMemLocation *ml)
+{
+if (!ml) {
+return ;
+}
+if (ml->copy) {
+TCGMemLocation **prev_ptr = &ts_info(ml->copy)->mem_loc;
+TCGMemLocation *cur_ptr = ts_info(ml->copy)->mem_loc;
+while (cur_ptr && cur_ptr != ml) {
+prev_ptr = &cur_ptr->next_copy;
+cur_ptr = cur_ptr->next_copy;
+}
+*prev_ptr = ml->next_copy;
+if (ts_info(ml->copy)->mem_loc == ml) {
+ts_info(ml->copy)->mem_loc = ml->next_copy;
+}
+}
+
+*ml->prev_ptr = ml->next;
+if (ml->next) {
+ml->next->prev_ptr = ml->prev_ptr;
+}
+ml->prev_ptr = NULL;
+ml->next = free_mls;
+free_mls = ml;
+}
+
 /* Reset TEMP's state, possibly removing the temp for the list of copies.  */
 static void reset_ts(TCGTemp *ts)
 {
@@ -77,12 +125,27 @@ static void reset_ts(TCGTemp *ts)
 struct tcg_temp_info *pi = ts_info(ti->prev_copy);
 struct tcg_temp_info *ni = ts_info(ti->next_copy);
 
+if (ti->mem_loc && ts_is_copy(ts) && 0) {
+TCGMemLocation *ml, *nml;
+for (ml = ti->mem_loc; ml; ml = nml) {
+nml = ml->next_copy;
+ml->copy = ti->next_copy;
+ml->next_copy = ni->mem_loc;
+ni->mem_loc = ml;
+}
+} else {
+while (ti->mem_loc) {
+reset_ml(ti->mem_loc);
+}
+}
+
 ni->prev_copy = ti->prev_copy;
 pi->next_copy = ti->next_copy;
 ti->next_copy = ts;
 ti->prev_copy = ts;
 ti->is_const = false;
 ti->mask = -1;
+ti->mem_loc = NULL;
 }
 
 static void reset_temp(TCGArg arg)
@@ -103,6 +166,7 @@ static void init_ts_info(struct tcg_temp_info *infos,
 ti->prev_copy = ts;
 ti->is_const = false;
 ti->mask = -1;
+ti->mem_loc = NULL;
 set_bit(idx, temps_used->l);
 }
 }
@@ -119,6 +183,92 @@ static int op_bits(TCGOpcode op)
 return def->flags & TCG_OPF_64BIT ? 64 : 32;
 }
 
+/* Allocate a new MEMORY LOCATION or reuse a free one.  */
+static TCGMemLocation *alloc_ml(void)
+{
+if (free_mls) {
+TCGMemLocation *ml = free_mls;
+free_mls = free_mls->next;
+return ml;
+}
+return tcg_malloc(sizeof(TCGMemLocation));
+}
+
+/* Allocate and initialize MEMORY LOCATION.  */
+static TCGMemLocation *new_ml(tcg_target_ulong off, tcg_target_ulong sz,
+  TCGTemp *copy)
+{
+TCGMemLocation *ml = alloc_ml();
+
+ml->offset = off;
+ml->size = sz;
+ml->copy = copy;
+if (copy) {
+ml->type = copy->base_type;
+ml->next_copy = ts_info(copy)->mem_loc;
+ts_info(copy)->mem_loc = ml;
+} else {
+tcg_abort();
+}
+ml->next = mem_locations;
+if (ml->next) {
+ml->next->prev_ptr = &ml->next;
+}
+ml->prev_ptr = &mem_locations;
+

[Qemu-devel] [PATCH RFC 0/3] TCG: do copy propagation through memory locations

2017-11-09 Thread Kirill Batuzov
This patch series is based on native-vector-registers-3:
  git://github.com/rth7680/qemu.git native-vector-registers-3

The particular goal of this change was to retain values of guest vector registers
in host vector registers across consecutive guest instructions.

The relation between memory locations and variables is many-to-many.
Variables can be copies of each other; multiple variables can hold the same
value as the one stored in a memory location. Any variable can be stored to
multiple memory locations as well. To represent all this, a data structure that
can handle the following operations is needed.

 (0) Allocate and deallocate memory locations. The exact number of possible
     memory locations is unknown, but there should not be too many of them
     known to the algorithm simultaneously.
 (1) Find a memory location with a specified offset, size and type among all
     memory locations. Needed to replace LOADs.
 (2) For a memory location, find a variable containing the same value. Also
     needed to replace LOADs.
 (3) Remove memory locations overlapping with a specified range of addresses.
     Needed to remove memory locations affected by STOREs.
 (4) For a variable, find all memory locations containing the same value.
     In case the value of the variable has changed, these memory locations
     should not reference this variable any more.

In the proposed implementation all these cases are handled by multiple lists
of memory locations:
 - a list of unused memory location descriptors,
 - a list of all known memory locations,
 - for every variable, a list of memory locations containing the same value.

The change was tested on the x264 video encoder compiled for ARM32 and run
using qemu-linux-user on an x86_64 host. Some loads were replaced by MOVs, but
no change in performance was observed.

Unfortunately, the x264 video encoder compiled for ARM64 crashed under
qemu-linux-user.

On an artificial test case a nearly 3x speedup was observed (8s vs 22s).

IN:

0x004005c0:  4ea18400  add v0.4s, v0.4s, v1.4s
0x004005c4:  4ea18400  add v0.4s, v0.4s, v1.4s
0x004005c8:  4ea18400  add v0.4s, v0.4s, v1.4s
0x004005cc:  4ea18400  add v0.4s, v0.4s, v1.4s


OP:

  004005c0  
 ld_vec tmp7,env,$0x8a0,$0x1
 ld_vec tmp8,env,$0x8b0,$0x1
 add32_vec tmp9,tmp7,tmp8,$0x1
 st_vec tmp9,env,$0x8a0,$0x1

  004005c4  
 ld_vec tmp7,env,$0x8a0,$0x1
 ld_vec tmp8,env,$0x8b0,$0x1
 add32_vec tmp9,tmp7,tmp8,$0x1
 st_vec tmp9,env,$0x8a0,$0x1

  004005c8  
 ld_vec tmp7,env,$0x8a0,$0x1
 ld_vec tmp8,env,$0x8b0,$0x1
 add32_vec tmp9,tmp7,tmp8,$0x1
 st_vec tmp9,env,$0x8a0,$0x1

  004005cc  
 ld_vec tmp7,env,$0x8a0,$0x1
 ld_vec tmp8,env,$0x8b0,$0x1
 add32_vec tmp9,tmp7,tmp8,$0x1
 st_vec tmp9,env,$0x8a0,$0x1


OP after optimization and liveness analysis:

  004005c0  
 ld_vec tmp7,env,$0x8a0,$0x1
 ld_vec tmp8,env,$0x8b0,$0x1
 add32_vec tmp9,tmp7,tmp8,$0x1dead: 1
 st_vec tmp9,env,$0x8a0,$0x1

  004005c4  
 mov_vec tmp7,tmp9,$0x1   dead: 1
 add32_vec tmp9,tmp7,tmp8,$0x1dead: 1
 st_vec tmp9,env,$0x8a0,$0x1

  004005c8  
 mov_vec tmp7,tmp9,$0x1   dead: 1
 add32_vec tmp9,tmp7,tmp8,$0x1dead: 1
 st_vec tmp9,env,$0x8a0,$0x1

  004005cc  
 mov_vec tmp7,tmp9,$0x1   dead: 1
 add32_vec tmp9,tmp7,tmp8,$0x1dead: 1 2
 st_vec tmp9,env,$0x8a0,$0x1  dead: 0 1


I'm not particularly happy about the current implementation.
 - The data structure seems to be a bit too complicated for the task at hand.
   Maybe I'm doing something wrong?
 - The current data structure is tightly coupled to struct tcg_temp_info and is
   part of the optimization pass. A very similar data structure will be needed
   in liveness analysis to eliminate redundant STOREs.

Having SSA (or at least single assignment per basic block) would help a lot.
It would remove use case (4) completely, and with it the need for a per-variable
list of memory locations, leaving only one list. Another result would be that
operations on a TCGMemLocation would no longer need to modify the TCGTemp or
tcg_temp_info structures, making TCGMemLocation reusable in liveness analysis
or register allocation.

But we do not have SSA (yet?).

Any thoughts or comments?

Kirill Batuzov (3):
  tcg: support MOV_VEC and MOVI_VEC opcodes in register allocator
  tcg/optimize: do copy propagation for memory locations
  tcg/optimize: handle vector loads and stores during copy propagation

 tcg/optimize.c | 288 +
 tcg/tcg.c  |   2 +
 2 fil

Re: [Qemu-devel] [RFC PATCH 0/9] TCG Vector types and example conversion

2017-08-22 Thread Kirill Batuzov
On Fri, 18 Aug 2017, Richard Henderson wrote:

> On 08/18/2017 04:33 AM, Kirill Batuzov wrote:
> > From my own experiments some time ago,
> > 
> > (1) translating vector instructions to vector instructions in TCG is faster 
> > than
> > 
> > (2) translating vector instructions to series of scalar instructions in TCG,
> > which is faster than*
> > 
> > (3) translating vector instructions to single helper calls, which is faster
> > than*
> > 
> > (4) translating vector instructions to helper calls for each vector element.
> > 
> > (*) (2) and (3) may change their respective places in case of some
> > complicated instructions.
> 
> This was my gut feeling as well.  With the caveat that for the ARM SVE case of
> 2048-bit registers we cannot afford to expand inline due to generated code 
> size.
> 
> > ARM (at least ARM32, I have not checked aarch64 in this regard) uses the
> > last, the slowest scheme. As far as I understand, you want to change
> > it to the third approach. This approach is used in SSE emulation; maybe
> > you can use a similar structure of helpers?
> > 
> > I still hope to finish my own series about implementation of the first
> > approach. I apologize for the long delay since last update and hope to
> > send next version somewhere next week. I do not think our series
> > contradict each other: you are trying to optimize existing general
> > purpose case while I'm trying to optimize case where both host and guest
> > support vector instructions. Since I'm experimenting on ARM32, we'll not
> > have much merge conflicts either.
> 
> I posted my own, different, take on vectorization yesterday as well.
> 
>   http://lists.nongnu.org/archive/html/qemu-devel/2017-08/msg03272.html
> 
> The primary difference between my version and your version is that I do not
> allow target/cpu/translate*.c to create vector types.  All of the host vector
> expansion is done within tcg/*.c.

I took a look at your approach. The only problem with it is that in the
current implementation it does not allow keeping vector variables in
registers between consecutive guest instructions. But this can be
changed. To do it we need to make copy propagation work with memory
locations as well, and dead code elimination able to remove excess
stores to memory. While in the general case this can be troublesome, if we
limit the analysis to addresses of the form [env + Const] it becomes
relatively easy. I've done a similar thing in my series to track interference
between memory operations and vector global variables. In the case of your
series this affects only performance, so it does not need to be part of the
initial series and can be added later as a separate patch. I can take care of
this once the initial series is pulled to master.

Overall I like your approach the most of the three:
 - it handles different representations of guest vectors with host
   vectors seamlessly (unlike my approach, where I still do not know how
   to get it right),
 - it provides better performance than Alex's (and the same as mine once
   we add a bit of alias analysis),
 - it moves in the direction of representing guest vectors not as
   globals, but as a pair (offset, size) in a special address space
   (this approach was used successfully in Valgrind and it handles
   intersecting registers much better than what we have now; we are
   moving in this direction anyway; see the sketch below).
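
A purely illustrative sketch of that (offset, size) representation (the names
are hypothetical, not from either series): aliasing between e.g. an ARM Q
register and its D halves becomes a simple range check.

    #include <stdbool.h>
    #include <stdint.h>

    /* A guest register slice described by its position in the guest state.  */
    typedef struct GuestSlice {
        uint32_t offset;   /* byte offset into CPUArchState */
        uint32_t size;     /* length in bytes */
    } GuestSlice;

    /* Two slices alias iff their byte ranges overlap.  */
    static bool slices_overlap(GuestSlice a, GuestSlice b)
    {
        return a.offset < b.offset + b.size && b.offset < a.offset + a.size;
    }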

-- 
Kirill



Re: [Qemu-devel] [RFC PATCH 0/9] TCG Vector types and example conversion

2017-08-18 Thread Kirill Batuzov


On Thu, 17 Aug 2017, Alex Bennée wrote:

> Hi,
> 
> With upcoming work on SVE I've been looking at the way we implement
> vector registers in QEMU's TCG. The current orthodoxy is to decompose
> the vector into a series of TCG registers, often calling a helper
> function the calculation of each element. The result of the helper is
> then is then stored back in the vector representation afterwards.
> There are occasional outliers like simd_tbl which access elements
> directly from a passed CPUFooState env pointer but these are rare.
> 
> This series introduces the concept of TCGv_vec type. This is a pointer
> to the start of the in memory representation of an arbitrarily long
> vector register. This is passed to a helper function as a pointer
> along with a normal TCG register containing information about the
> actual vector length and any additional information the helper needs
> to do the operation. The hope* is this saves on the churn of having
> the TCG do things element by element and allows the compiler to use
> native vector operations to streamline the helpers.
> 
> There are some downsides to this approach. The first is you have to be
> careful about register aliasing. If you are doing a same reg to same
> reg operation you need to make a copy of the vector so you don't
> trample your input data as you go. The second is this involves
> changing some of the assumptions the TCG makes about things. I've
> managed to keep all the changes within the core TCG code for now but
> so far it has only been tested for the tcg_call path which is the only
> place where TCGv_vec's should turn up. It is possible to do the same
> thing without touching the TCG code generation by using TCGv_ptrs and
> manually emitting tcg_addi ops to pass the correct address. Richard
> has been exploring this approach with his series. The downside of that
> is you do miss the ability to have named global vector registers which
> makes reading the TCG dumps a little easier.
> 
> I've only patched one helper in this series which implements the
> indexed smull. This is because it appears in the profiles for my test
> case which was using an arm64 ffmpeg to transcode:
> 
>   ./ffmpeg.arm64 -i big_buck_bunny_480p_surround-fix.avi \
> -threads 1 -qscale:v 3 -f null -
> 
> * hope. On an earlier revision (which included sqshrn conversions) I
>   had measured a minor saving but this had disappeared once I measured
>   the final code. However the profile is fairly dominated by
>   softfloat.
> 
> master:
>  8.05%  qemu-aarch64  qemu-aarch64 [.] roundAndPackFloat32
>  7.28%  qemu-aarch64  qemu-aarch64 [.] float32_mul
>  6.56%  qemu-aarch64  qemu-aarch64 [.] helper_lookup_tb_ptr
>  5.31%  qemu-aarch64  qemu-aarch64 [.] float32_muladd
>  4.09%  qemu-aarch64  qemu-aarch64 [.] helper_neon_mull_s16
>  4.00%  qemu-aarch64  qemu-aarch64 [.] addFloat32Sigs
>  3.86%  qemu-aarch64  qemu-aarch64 [.] subFloat32Sigs
>  2.26%  qemu-aarch64  qemu-aarch64 [.] helper_simd_tbl
>  2.00%  qemu-aarch64  qemu-aarch64 [.] float32_add
>  1.81%  qemu-aarch64  qemu-aarch64 [.] helper_neon_unarrow_sat8
>  1.64%  qemu-aarch64  qemu-aarch64 [.] float32_sub
>  1.43%  qemu-aarch64  qemu-aarch64 [.] helper_neon_subl_u32
>  0.98%  qemu-aarch64  qemu-aarch64 [.] helper_neon_widen_u8
> 
> tcg-native-vectors-rfc:
>  7.93%  qemu-aarch64  qemu-aarch64 [.] roundAndPackFloat32
>  7.54%  qemu-aarch64  qemu-aarch64 [.] float32_mul
>  6.29%  qemu-aarch64  qemu-aarch64 [.] helper_lookup_tb_ptr
>  5.39%  qemu-aarch64  qemu-aarch64 [.] float32_muladd
>  3.92%  qemu-aarch64  qemu-aarch64 [.] addFloat32Sigs
>  3.86%  qemu-aarch64  qemu-aarch64 [.] subFloat32Sigs
>  3.62%  qemu-aarch64  qemu-aarch64 [.] helper_advsimd_smull_idx_s32
>  2.19%  qemu-aarch64  qemu-aarch64 [.] helper_simd_tbl
>  2.09%  qemu-aarch64  qemu-aarch64 [.] helper_neon_mull_s16
>  1.99%  qemu-aarch64  qemu-aarch64 [.] float32_add
>  1.79%  qemu-aarch64  qemu-aarch64 [.] helper_neon_unarrow_sat8
>  1.62%  qemu-aarch64  qemu-aarch64 [.] float32_sub
>  1.43%  qemu-aarch64  qemu-aarch64 [.] helper_neon_subl_u32
>  1.00%  qemu-aarch64  qemu-aarch64 [.] helper_neon_widen_u8
>  0.98%  qemu-aarch64  qemu-aarch64 [.] helper_neon_addl_u32
> 
> At the moment the default compiler settings don't actually vectorise
> the helper. I could get it to once I added some alignment guarantees
> but the casting I did broke the instruction emulation so I haven't
> included that patch in this series.
> 
> Given the results why continue investigating th

Re: [Qemu-devel] [PATCH v2.1 13/21] tcg/i386: support remaining vector addition operations

2017-02-21 Thread Kirill Batuzov
On Tue, 21 Feb 2017, Philippe Mathieu-Daudé wrote:

> Hi Kirill,
> 
> could you check my previous comment?
>

Hi Philippe,

thank you for your comments. I've seen them and I'll apply the changes you
suggested in the next version of the series. I was just hoping to get
a bit more feedback before I proceed to v3.

-- 
Kirill


Re: [Qemu-devel] [PATCH v2.1 00/20] Emulate guest vector operations with host vector operations

2017-02-21 Thread Kirill Batuzov
On Thu, 2 Feb 2017, Kirill Batuzov wrote:

> The goal of these patch series is to set up an infrastructure to emulate
> guest vector operations using host vector operations. Preliminary
> experiments show that simply translating loads and stores increases
> performance of x264 video codec by 10%. The performance of a gcc vectorized
> for loop increased 2x.
> 
> To be able to emulate guest vector operations using host vector operations,
> several things need to be done.
> 
> 1. Corresponding vector types should be added to TCG. These series add
> TCG_v128 and TCG_v64. I've made TCG_v64 a different type than TCG_i64
> because it usually needs to be allocated to different registers and
> supports different operations.
> 
> 2. Load/store operations for these new types need to be implemented.
> 
> 3. For seamless transition from current model to a new one we need to
> handle cases where memory occupied by global variable can be accessed via
> pointer to the CPUArchState structure. A very simple conservative alias
> analysis has been added to do it. This analysis tracks memory loads and
> stores that overlap with fields of CPUArchState and provides this
> information to the register allocator. The allocator then spills and
> reloads affected globals when needed.
> 
> 4. Allow overlapping globals. For scalar registers this is a rare case, and
> overlapping registers can be handled as a single one (ah, al, ax, eax,
> rax). In ARM every Q-register consists of two D-registers, each consisting of
> two S-registers. Handling 4 S-registers as one because they are parts of
> the same Q-register is way too inefficient.
> 
> 5. Add new memory addressing mode to MMU code for large accesses and create
> needed helpers. Only 128-bit vectors have been handled for now.
> 
> 6. Create TCG opcodes for vector operations. Only addition has been handled
> in this series. Each operation has a wrapper that checks if the backend
> supports the corresponding operation or not. In one case the vector opcode
> is generated, in the other the operation is emulated with scalar
> operations. The emulation code is generated inline for performance reasons
> (there is a huge performance difference between inline generation
> and calling a helper). As a positive side effect this will eventually allow
> merging similar emulation code for vector instructions from different
> frontends into a target-independent implementation.
> 
> 7. Use new operations in the frontend (ARM was used in these series).
> 
> 8. Support new operations in the backend (x86_64 was used in these series).
> 
> For experiments I have used ARM guest on x86_64 host. I wanted some pair of
> different architectures with vector extensions both. ARM and x86_64 pair
> fits well.
> 
> v1 -> v2:
>  - represent v128 type with smaller types when it is not supported by the host
>  - detect AVX support and use AVX instructions when available
>  - tcg/README updated
>  - generate two v64 adds instead of one v128 when applicable
>  - rebased to newer master
>  - overlap detection for temps added (it needs to be explicitly called from
>_translate_init)
>  - the stack is used to temporary store 128 bit variables to memory
>(instead of the TCGContext field)
> 
> v2 -> v2.1
>  - automatic build failure fixed
> 
> Outstanding issues:
>  - qemu_ld_v128 and qemu_st_v128 do not generate fallback code if the host
>    does not support 128-bit registers. The reason is that I do not know how to
>    handle differing host/guest endianness (do we swap only the bytes within
>    elements, or whole vectors?). Different targets seem to have different
>    ideas on how this should be done.
>

Ping?

-- 
Kirill




[Qemu-devel] [PATCH v2.1 14/21] tcg: do not rely on exact values of MO_BSWAP or MO_SIGN in backend

2017-02-02 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/aarch64/tcg-target.inc.c |  4 ++--
 tcg/arm/tcg-target.inc.c |  4 ++--
 tcg/i386/tcg-target.inc.c|  4 ++--
 tcg/mips/tcg-target.inc.c|  4 ++--
 tcg/ppc/tcg-target.inc.c |  4 ++--
 tcg/s390/tcg-target.inc.c|  4 ++--
 tcg/sparc/tcg-target.inc.c   | 12 ++--
 tcg/tcg-op.c |  4 ++--
 tcg/tcg.h|  1 +
 9 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
index 6d227a5..2b0b548 100644
--- a/tcg/aarch64/tcg-target.inc.c
+++ b/tcg/aarch64/tcg-target.inc.c
@@ -1032,7 +1032,7 @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, 
TCGReg d,
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  * TCGMemOpIdx oi, uintptr_t ra)
  */
-static void * const qemu_ld_helpers[16] = {
+static void * const qemu_ld_helpers[] = {
 [MO_UB]   = helper_ret_ldub_mmu,
 [MO_LEUW] = helper_le_lduw_mmu,
 [MO_LEUL] = helper_le_ldul_mmu,
@@ -1046,7 +1046,7 @@ static void * const qemu_ld_helpers[16] = {
  * uintxx_t val, TCGMemOpIdx oi,
  * uintptr_t ra)
  */
-static void * const qemu_st_helpers[16] = {
+static void * const qemu_st_helpers[] = {
 [MO_UB]   = helper_ret_stb_mmu,
 [MO_LEUW] = helper_le_stw_mmu,
 [MO_LEUL] = helper_le_stl_mmu,
diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
index e75a6d4..f603f02 100644
--- a/tcg/arm/tcg-target.inc.c
+++ b/tcg/arm/tcg-target.inc.c
@@ -1058,7 +1058,7 @@ static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  * int mmu_idx, uintptr_t ra)
  */
-static void * const qemu_ld_helpers[16] = {
+static void * const qemu_ld_helpers[] = {
 [MO_UB]   = helper_ret_ldub_mmu,
 [MO_SB]   = helper_ret_ldsb_mmu,
 
@@ -1078,7 +1078,7 @@ static void * const qemu_ld_helpers[16] = {
 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
  * uintxx_t val, int mmu_idx, uintptr_t ra)
  */
-static void * const qemu_st_helpers[16] = {
+static void * const qemu_st_helpers[] = {
 [MO_UB]   = helper_ret_stb_mmu,
 [MO_LEUW] = helper_le_stw_mmu,
 [MO_LEUL] = helper_le_stl_mmu,
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index d8f0d81..263c15e 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -1334,7 +1334,7 @@ static void tcg_out_nopn(TCGContext *s, int n)
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  * int mmu_idx, uintptr_t ra)
  */
-static void * const qemu_ld_helpers[16] = {
+static void * const qemu_ld_helpers[] = {
 [MO_UB]   = helper_ret_ldub_mmu,
 [MO_LEUW] = helper_le_lduw_mmu,
 [MO_LEUL] = helper_le_ldul_mmu,
@@ -1347,7 +1347,7 @@ static void * const qemu_ld_helpers[16] = {
 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
  * uintxx_t val, int mmu_idx, uintptr_t ra)
  */
-static void * const qemu_st_helpers[16] = {
+static void * const qemu_st_helpers[] = {
 [MO_UB]   = helper_ret_stb_mmu,
 [MO_LEUW] = helper_le_stw_mmu,
 [MO_LEUL] = helper_le_stl_mmu,
diff --git a/tcg/mips/tcg-target.inc.c b/tcg/mips/tcg-target.inc.c
index 01ac7b2..4f2d5d1 100644
--- a/tcg/mips/tcg-target.inc.c
+++ b/tcg/mips/tcg-target.inc.c
@@ -1108,7 +1108,7 @@ static void tcg_out_call(TCGContext *s, tcg_insn_unit 
*arg)
 }
 
 #if defined(CONFIG_SOFTMMU)
-static void * const qemu_ld_helpers[16] = {
+static void * const qemu_ld_helpers[] = {
 [MO_UB]   = helper_ret_ldub_mmu,
 [MO_SB]   = helper_ret_ldsb_mmu,
 [MO_LEUW] = helper_le_lduw_mmu,
@@ -1125,7 +1125,7 @@ static void * const qemu_ld_helpers[16] = {
 #endif
 };
 
-static void * const qemu_st_helpers[16] = {
+static void * const qemu_st_helpers[] = {
 [MO_UB]   = helper_ret_stb_mmu,
 [MO_LEUW] = helper_le_stw_mmu,
 [MO_LEUL] = helper_le_stl_mmu,
diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c
index 64f67d2..680050b 100644
--- a/tcg/ppc/tcg-target.inc.c
+++ b/tcg/ppc/tcg-target.inc.c
@@ -1419,7 +1419,7 @@ static const uint32_t qemu_exts_opc[4] = {
 /* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
  * int mmu_idx, uintptr_t ra)
  */
-static void * const qemu_ld_helpers[16] = {
+static void * const qemu_ld_helpers[] = {
 [MO_UB]   = helper_ret_ldub_mmu,
 [MO_LEUW] = helper_le_lduw_mmu,
 [MO_LEUL] = helper_le_ldul_mmu,
@@ -1432,7 +1432,7 @@ static void * const qemu_ld_helpers[16] = {
 /* helper signature: helper_st_mmu(CPUState *env, target_ulong addr,
  * uintxx_t val, int mmu_idx, uintptr_t ra)
  */
-static void * const qemu_st_helpers[16

[Qemu-devel] [PATCH v2.1 18/21] softmmu: create helpers for vector loads

2017-02-02 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 cputlb.c  |   4 +
 softmmu_template_vector.h | 266 ++
 tcg/tcg.h |   5 +
 3 files changed, 275 insertions(+)
 create mode 100644 softmmu_template_vector.h

diff --git a/cputlb.c b/cputlb.c
index 6c39927..41c9a01 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -660,6 +660,10 @@ static void *atomic_mmu_lookup(CPUArchState *env, 
target_ulong addr,
 #define DATA_SIZE 8
 #include "softmmu_template.h"
 
+#define SHIFT 4
+#include "softmmu_template_vector.h"
+#undef MMUSUFFIX
+
 /* First set of helpers allows passing in of OI and RETADDR.  This makes
them callable from other helpers.  */
 
diff --git a/softmmu_template_vector.h b/softmmu_template_vector.h
new file mode 100644
index 000..b286d65
--- /dev/null
+++ b/softmmu_template_vector.h
@@ -0,0 +1,266 @@
+/*
+ *  Software MMU support
+ *
+ * Generate helpers used by TCG for qemu_ld/st vector ops and code
+ * load functions.
+ *
+ * Included from target op helpers and exec.c.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include "qemu/timer.h"
+#include "exec/address-spaces.h"
+#include "exec/memory.h"
+
+#define DATA_SIZE (1 << SHIFT)
+
+#if DATA_SIZE == 16
+#define SUFFIX v128
+#else
+#error unsupported data size
+#endif
+
+
+#ifdef SOFTMMU_CODE_ACCESS
+#define READ_ACCESS_TYPE MMU_INST_FETCH
+#define ADDR_READ addr_code
+#else
+#define READ_ACCESS_TYPE MMU_DATA_LOAD
+#define ADDR_READ addr_read
+#endif
+
+#define helper_te_ld_name  glue(glue(helper_te_ld, SUFFIX), MMUSUFFIX)
+#define helper_te_st_name  glue(glue(helper_te_st, SUFFIX), MMUSUFFIX)
+
+#ifndef SOFTMMU_CODE_ACCESS
+static inline void glue(io_read, SUFFIX)(CPUArchState *env,
+ CPUIOTLBEntry *iotlbentry,
+ target_ulong addr,
+ uintptr_t retaddr,
+ uint8_t *res)
+{
+CPUState *cpu = ENV_GET_CPU(env);
+hwaddr physaddr = iotlbentry->addr;
+MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs);
+int i;
+
+assert(0); /* Needs testing */
+
+physaddr = (physaddr & TARGET_PAGE_MASK) + addr;
+cpu->mem_io_pc = retaddr;
+if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) {
+cpu_io_recompile(cpu, retaddr);
+}
+
+cpu->mem_io_vaddr = addr;
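+/* Transfer the 16-byte vector as a sequence of 8-byte MMIO reads.  */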
+for (i = 0; i < (1 << SHIFT); i += 8) {
+memory_region_dispatch_read(mr, physaddr + i, (uint64_t *)(res + i),
+8, iotlbentry->attrs);
+}
+}
+#endif
+
+void helper_te_ld_name(CPUArchState *env, target_ulong addr,
+   TCGMemOpIdx oi, uintptr_t retaddr, uint8_t *res)
+{
+unsigned mmu_idx = get_mmuidx(oi);
+int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+target_ulong tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
+uintptr_t haddr;
+int i;
+
+/* Adjust the given return address.  */
+retaddr -= GETPC_ADJ;
+
+/* If the TLB entry is for a different page, reload and try again.  */
+if ((addr & TARGET_PAGE_MASK)
+ != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
+if ((addr & (DATA_SIZE - 1)) != 0
+&& (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
+cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
+ mmu_idx, retaddr);
+}
+if (!VICTIM_TLB_HIT(ADDR_READ, addr)) {
+tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
+ mmu_idx, retaddr);
+}
+tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
+}
+
+/* Handle an IO access.  */
+if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
+CPUIOTLBEntry *iotlbentry;
+if ((addr & (DATA_SIZE - 1)) != 0) {
+goto do_unaligned_access;
+}
+iotlbentry = &env->iotlb[mmu_idx][index];
+
+/* ??? Note that the io helpers always read data in the target
+   byte ordering.  We should push the LE/BE request down into io.  */
+glue(io_read, SUFFIX)(env, iotlbe

[Qemu-devel] [PATCH v2.1 19/21] tcg/i386: add support for qemu_ld_v128/qemu_st_v128 ops

2017-02-02 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/i386/tcg-target.inc.c | 68 ++-
 1 file changed, 61 insertions(+), 7 deletions(-)

diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 1e6edc0..4647e97 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -1342,6 +1342,7 @@ static void * const qemu_ld_helpers[] = {
 [MO_BEUW] = helper_be_lduw_mmu,
 [MO_BEUL] = helper_be_ldul_mmu,
 [MO_BEQ]  = helper_be_ldq_mmu,
+[MO_128]  = helper_te_ldv128_mmu,
 };
 
 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
@@ -1355,6 +1356,7 @@ static void * const qemu_st_helpers[] = {
 [MO_BEUW] = helper_be_stw_mmu,
 [MO_BEUL] = helper_be_stl_mmu,
 [MO_BEQ]  = helper_be_stq_mmu,
+[MO_128]  = helper_te_stv128_mmu,
 };
 
 /* Perform the TLB load and compare.
@@ -1521,12 +1523,30 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 ofs += 4;
 
 tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
+
+if ((opc & MO_SSIZE) == MO_128) {
+ofs += 4;
+tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_EAX, TCG_REG_ESP);
+tcg_out_addi(s, TCG_REG_EAX, TCG_STATIC_CALL_ARGS_SIZE - 16);
+tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_EAX, TCG_REG_ESP, ofs);
+}
 } else {
 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
 /* The second argument is already loaded with addrlo.  */
 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
 tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
  (uintptr_t)l->raddr);
+if ((opc & MO_SSIZE) == MO_128) {
+tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_EAX, TCG_REG_ESP);
+tcg_out_addi(s, TCG_REG_EAX, TCG_STATIC_CALL_ARGS_SIZE - 16);
+if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
+tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[4],
+TCG_REG_EAX);
+} else {
+tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_EAX,
+TCG_REG_ESP, TCG_TARGET_CALL_STACK_OFFSET);
+}
+}
 }
 
 tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
@@ -1562,6 +1582,11 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
 }
 break;
+case MO_128:
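+/* The helper deposited the loaded vector into the reserved 16-byte
+   stack slot; read it back into the destination register.  */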
+tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_EAX, TCG_REG_ESP);
+tcg_out_addi(s, TCG_REG_EAX, TCG_STATIC_CALL_ARGS_SIZE - 16);
+tcg_out_ld(s, TCG_TYPE_V128, l->datalo_reg, TCG_REG_EAX, 0);
+break;
 default:
 tcg_abort();
 }
@@ -1601,12 +1626,20 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 ofs += 4;
 }
 
-tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
-ofs += 4;
-
-if (s_bits == MO_64) {
-tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
+if (s_bits == MO_128) {
+tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_EAX, TCG_REG_ESP);
+tcg_out_addi(s, TCG_REG_EAX, TCG_STATIC_CALL_ARGS_SIZE - 16);
+tcg_out_st(s, TCG_TYPE_V128, l->datalo_reg, TCG_REG_EAX, 0);
+tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_EAX, TCG_REG_ESP, ofs);
 ofs += 4;
+} else {
+tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
+ofs += 4;
+
+if (s_bits == MO_64) {
+tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
+ofs += 4;
+}
 }
 
 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
@@ -1618,8 +1651,16 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 } else {
 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
 /* The second argument is already loaded with addrlo.  */
-tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
-tcg_target_call_iarg_regs[2], l->datalo_reg);
+if (s_bits == MO_128) {
+tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_RAX, TCG_REG_ESP);
+tcg_out_addi(s, TCG_REG_RAX, TCG_STATIC_CALL_ARGS_SIZE - 16);
+tcg_out_st(s, TCG_TYPE_V128, l->datalo_reg, TCG_REG_RAX, 0);
+tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[2],
+TCG_REG_RAX);
+} else {
+tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
+tcg_target_call_iarg_regs[2], l->datalo_reg);
+}
 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
 
 if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
@@ -1751,6 +1792,

[Qemu-devel] [PATCH v2.1 11/21] tcg/i386: add support for vector opcodes

2017-02-02 Thread Kirill Batuzov
To be able to generate vector operations in a TCG backend we need to do
several things.

1. We need to tell the register allocator about the target's vector registers.
   In the case of x86 we'll use xmm0..xmm7. xmm7 is designated as a scratch
   register; the others can be used by the register allocator.

2. We need a new constraint to indicate where to use vector registers. In
   this commit the 'V' constraint is introduced.

3. We need to be able to generate the bare minimum: load, store and reg-to-reg
   move. MOVDQU is used for loads and stores, MOVDQA for reg-to-reg
   moves.

4. Finally we need to support any other opcodes we want. INDEX_op_add_i32x4
   is the only one for now. The PADDD instruction handles it perfectly.

Signed-off-by: Kirill Batuzov 
---
 tcg/i386/tcg-target.h |  34 +-
 tcg/i386/tcg-target.inc.c | 111 +++---
 2 files changed, 137 insertions(+), 8 deletions(-)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 21d96ec..b0704e8 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -29,8 +29,16 @@
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 31
 
 #ifdef __x86_64__
-# define TCG_TARGET_REG_BITS  64
-# define TCG_TARGET_NB_REGS   16
+# if defined(TARGET_WORDS_BIGENDIAN) == defined(HOST_WORDS_BIGENDIAN)
+#  define TCG_TARGET_HAS_REG128 1
+# endif
+# ifdef TCG_TARGET_HAS_REG128
+#  define TCG_TARGET_REG_BITS  64
+#  define TCG_TARGET_NB_REGS   32
+# else
+#  define TCG_TARGET_REG_BITS  64
+#  define TCG_TARGET_NB_REGS   16
+# endif
 #else
 # define TCG_TARGET_REG_BITS  32
 # define TCG_TARGET_NB_REGS8
@@ -56,6 +64,24 @@ typedef enum {
 TCG_REG_R13,
 TCG_REG_R14,
 TCG_REG_R15,
+
+TCG_REG_XMM0,
+TCG_REG_XMM1,
+TCG_REG_XMM2,
+TCG_REG_XMM3,
+TCG_REG_XMM4,
+TCG_REG_XMM5,
+TCG_REG_XMM6,
+TCG_REG_XMM7,
+TCG_REG_XMM8,
+TCG_REG_XMM9,
+TCG_REG_XMM10,
+TCG_REG_XMM11,
+TCG_REG_XMM12,
+TCG_REG_XMM13,
+TCG_REG_XMM14,
+TCG_REG_XMM15,
+
 TCG_REG_RAX = TCG_REG_EAX,
 TCG_REG_RCX = TCG_REG_ECX,
 TCG_REG_RDX = TCG_REG_EDX,
@@ -144,6 +170,10 @@ extern bool have_popcnt;
 #define TCG_TARGET_HAS_mulsh_i64 0
 #endif
 
+#ifdef TCG_TARGET_HAS_REG128
+#define TCG_TARGET_HAS_add_i32x4 1
+#endif
+
 #define TCG_TARGET_deposit_i32_valid(ofs, len) \
 (((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \
  ((ofs) == 0 && (len) == 16))
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 5918008..3e718f3 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -32,6 +32,11 @@ static const char * const 
tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
 #else
 "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
 #endif
+#ifdef TCG_TARGET_HAS_REG128
+"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14",
+"%xmm15",
+#endif
 };
 #endif
 
@@ -61,6 +66,24 @@ static const int tcg_target_reg_alloc_order[] = {
 TCG_REG_EDX,
 TCG_REG_EAX,
 #endif
+#ifdef TCG_TARGET_HAS_REG128
+TCG_REG_XMM0,
+TCG_REG_XMM1,
+TCG_REG_XMM2,
+TCG_REG_XMM3,
+TCG_REG_XMM4,
+TCG_REG_XMM5,
+TCG_REG_XMM6,
+/*  TCG_REG_XMM7, <- scratch register */
+TCG_REG_XMM8,
+TCG_REG_XMM9,
+TCG_REG_XMM10,
+TCG_REG_XMM11,
+TCG_REG_XMM12,
+TCG_REG_XMM13,
+TCG_REG_XMM14,
+TCG_REG_XMM15,
+#endif
 };
 
 static const int tcg_target_call_iarg_regs[] = {
@@ -247,6 +270,10 @@ static const char 
*target_parse_constraint(TCGArgConstraint *ct,
 case 'I':
 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
 break;
+case 'V':
+ct->ct |= TCG_CT_REG;
+tcg_regset_set32(ct->u.regs, 0, 0xff);
+break;
 
 default:
 return NULL;
@@ -302,6 +329,9 @@ static inline int tcg_target_const_match(tcg_target_long 
val, TCGType type,
 #define P_SIMDF30x1 /* 0xf3 opcode prefix */
 #define P_SIMDF20x2 /* 0xf2 opcode prefix */
 
+#define P_SSE_660F  (P_DATA16 | P_EXT)
+#define P_SSE_F30F  (P_SIMDF3 | P_EXT)
+
 #define OPC_ARITH_EvIz (0x81)
 #define OPC_ARITH_EvIb (0x83)
 #define OPC_ARITH_GvEv (0x03)  /* ... plus (ARITH_FOO << 3) */
@@ -357,6 +387,11 @@ static inline int tcg_target_const_match(tcg_target_long 
val, TCGType type,
 #define OPC_GRP3_Ev(0xf7)
 #define OPC_GRP5   (0xff)
 
+#define OPC_MOVDQU_M2R  (0x6f | P_SSE_F30F)  /* load 128-bit value */
+#define OPC_MOVDQU_R2M  (0x7f | P_SSE_F30F)  /* store 128-bit value */
+#define OP

[Qemu-devel] [PATCH v2.1 21/21] tcg/README: update README to include information about vector opcodes

2017-02-02 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/README | 47 ++-
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/tcg/README b/tcg/README
index a9858c2..209dbc4 100644
--- a/tcg/README
+++ b/tcg/README
@@ -53,9 +53,18 @@ an "undefined result".
 
 TCG instructions operate on variables which are temporaries, local
 temporaries or globals. TCG instructions and variables are strongly
-typed. Two types are supported: 32 bit integers and 64 bit
-integers. Pointers are defined as an alias to 32 bit or 64 bit
-integers depending on the TCG target word size.
+typed. Several types are supported:
+
+* 32 bit integers,
+
+* 64 bit integers,
+
+* 64 bit vectors,
+
+* 128 bit vectors.
+
+Pointers are defined as an alias to 32 bit or 64 bit integers
+depending on the TCG target word size.
 
 Each instruction has a fixed number of output variable operands, input
 variable operands and always constant operands.
@@ -208,6 +217,22 @@ t0=t1%t2 (signed). Undefined behavior if division by zero 
or overflow.
 
 t0=t1%t2 (unsigned). Undefined behavior if division by zero.
 
+* add_i8x16 t0, t1, t2
+add_i16x8 t0, t1, t2
+add_i32x4 t0, t1, t2
+add_i64x2 t0, t1, t2
+
+t0=t1+t2 where t0, t1 and t2 are 128 bit vectors of 8, 16, 32 or 64 bit
+integers.
+
+* add_i8x8 t0, t1, t2
+add_i16x4 t0, t1, t2
+add_i32x2 t0, t1, t2
+add_i64x1 t0, t1, t2
+
+t0=t1+t2 where t0, t1 and t2 are 64 bit vectors of 8, 16, 32 or 64 bit
+integers.
+
 * Logical
 
 * and_i32/i64 t0, t1, t2
@@ -477,8 +502,8 @@ current TB was linked to this TB. Otherwise execute the next
 instructions. Only indices 0 and 1 are valid and tcg_gen_goto_tb may be issued
 at most once with each slot index per TB.
 
-* qemu_ld_i32/i64 t0, t1, flags, memidx
-* qemu_st_i32/i64 t0, t1, flags, memidx
+* qemu_ld_i32/i64/v128 t0, t1, flags, memidx
+* qemu_st_i32/i64/v128 t0, t1, flags, memidx
 
 Load data at the guest address t1 into t0, or store data in t0 at guest
 address t1.  The _i32/_i64 size applies to the size of the input/output
@@ -488,6 +513,9 @@ and the width of the memory operation is controlled by 
flags.
 Both t0 and t1 may be split into little-endian ordered pairs of registers
 if dealing with 64-bit quantities on a 32-bit host.
 
+The _v128 size can only be used to read exactly 128 bits. Host and target
+are required to be of the same endianness for it to work.
+
 The memidx selects the qemu tlb index to use (e.g. user or kernel access).
 The flags are the TCGMemOp bits, selecting the sign, width, and endianness
 of the memory access.
@@ -538,6 +566,15 @@ Floating point operations are not supported in this 
version. A
 previous incarnation of the code generator had full support of them,
 but it is better to concentrate on integer operations first.
 
+To support vector operations, the backend must define:
+- TCG_TARGET_HAS_REGV64 for the 64 bit vector type and/or
+- TCG_TARGET_HAS_REG128 for the 128 bit vector type.
+For supported types, load and store operations must be supported. An
+arbitrary set of other vector operations may be supported. Vector operations
+that were not explicitly declared as supported (by defining
+TCG_TARGET_HAS_ to 1) will never appear in the intermediate
+representation. In this case, the emulation code will be emitted instead.
+
 4.2) Constraints
 
 GCC like constraints are used to define the constraints of every
-- 
2.1.4




[Qemu-devel] [PATCH v2.1 20/21] target/arm: load two consecutive 64-bits vector regs as a 128-bit vector reg

2017-02-02 Thread Kirill Batuzov
The ARM instruction set does not have loads to 128-bit vector registers (q-regs).
Instead it can read several consecutive 64-bit vector registers (d-regs),
which is what GCC uses to load 128-bit registers from memory.

For vector operations to work we need to detect such loads and transform them
into 128-bit loads to 128-bit temporaries.
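
As an illustration of the pattern being detected (a sketch, not code taken
from the patch), GCC typically fills a q-register through its two d-reg halves:

    /* guest code (illustrative):  vld1.64 {d0, d1}, [r0]                  */
    /* with this patch the even-rd, nregs == 2 case is recognised and      */
    /* emitted as one 128-bit TCG load into the covering q-register:       */
    tcg_gen_qemu_ld_v128(cpu_Q[0], aa32addr, get_mem_index(s), MO_TE | MO_128);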

Signed-off-by: Kirill Batuzov 
---
 target/arm/translate.c | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/target/arm/translate.c b/target/arm/translate.c
index 90e14df..5bd0b1c 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -4710,6 +4710,21 @@ static int disas_neon_ls_insn(DisasContext *s, uint32_t 
insn)
 tcg_gen_addi_i32(addr, addr, 1 << size);
 }
 if (size == 3) {
+#ifdef TCG_TARGET_HAS_REG128
+if (rd % 2 == 0 && nregs == 2) {
+TCGv aa32addr = gen_aa32_addr(s, addr, MO_TE | MO_128);
+/* 128-bit load */
+if (load) {
+tcg_gen_qemu_ld_v128(cpu_Q[rd / 2], aa32addr,
+ get_mem_index(s), MO_TE | MO_128);
+} else {
+tcg_gen_qemu_st_v128(cpu_Q[rd / 2], aa32addr,
+ get_mem_index(s), MO_TE | MO_128);
+}
+tcg_temp_free(aa32addr);
+break;
+}
+#endif
 tmp64 = tcg_temp_new_i64();
 if (load) {
 gen_aa32_ld64(s, tmp64, addr, get_mem_index(s));
-- 
2.1.4




[Qemu-devel] [PATCH v2.1 15/21] target/aarch64: do not check for non-existent TCGMemOp

2017-02-02 Thread Kirill Batuzov
MO_64|MO_SIGN is not a valid TCGMemOp. This code compiles only because, by
coincidence, this value equals the MO_SSIGN mask defined in the same enum.

Signed-off-by: Kirill Batuzov 
---

A bugfix which is only indirectly related to this series; other changes in
the series merely exposed the problem.
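
For reference, the arithmetic behind "compiles by coincidence", using the
pre-existing enum values visible in patch 16/21 of this series (an
illustration, not part of the fix):

    /* values from the unmodified TCGMemOp enum */
    enum { MO_64 = 3, MO_SIZE = 3, MO_SIGN = 4 };
    /* MO_64 | MO_SIGN == 7 == MO_SIZE | MO_SIGN, i.e. the case label only
       coincides with the combined size/sign mask and does not name a
       distinct memory operation. */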

---
 target/arm/translate-a64.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index d0352e2..8a1f70e 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -990,7 +990,6 @@ static void read_vec_element(DisasContext *s, TCGv_i64 
tcg_dest, int srcidx,
 tcg_gen_ld32s_i64(tcg_dest, cpu_env, vect_off);
 break;
 case MO_64:
-case MO_64|MO_SIGN:
 tcg_gen_ld_i64(tcg_dest, cpu_env, vect_off);
 break;
 default:
-- 
2.1.4




[Qemu-devel] [PATCH v2.1 05/21] tcg: add simple alias analysis

2017-02-02 Thread Kirill Batuzov
Add a simple alias analysis to TCG which finds out memory loads and stores
that overlap with CPUState. This information can be used later in liveness
analysis to ensure correctness of register allocation. In particular, if a load
or store overlaps with the memory location of some global variable, this
variable should be spilled and reloaded at the appropriate times.

Previously no such analysis was performed and for correctness reasons it was
required that no load/store operations overlap with memory locations of global
variables.
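
As a concrete illustration of what the analysis has to catch (hypothetical
frontend code and offset, not taken from the patch), consider a guest
register that exists both as a TCG global and as plain memory reachable
through the env pointer:

    /* assume "reg1" is a TCG global backed by env + 0x10 */
    tcg_gen_st_i32(val, cpu_env, 0x10);   /* raw store overlapping reg1  */
    tcg_gen_mov_i32(dst, reg1);           /* reg1 must be reloaded first */

Without the alias information the register allocator could keep using a stale
copy of reg1 in a host register after the store.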

Signed-off-by: Kirill Batuzov 
---

I believe the checkpatch warning here to be a false positive.

---
 tcg/optimize.c | 146 +
 tcg/tcg.h  |  17 +++
 2 files changed, 163 insertions(+)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index adfc56c..2347ce3 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -34,6 +34,7 @@
 
 struct tcg_temp_info {
 bool is_const;
+bool is_base;
 uint16_t prev_copy;
 uint16_t next_copy;
 tcg_target_ulong val;
@@ -61,6 +62,7 @@ static void reset_temp(TCGArg temp)
 temps[temp].next_copy = temp;
 temps[temp].prev_copy = temp;
 temps[temp].is_const = false;
+temps[temp].is_base = false;
 temps[temp].mask = -1;
 }
 
@@ -1429,3 +1431,147 @@ void tcg_optimize(TCGContext *s)
 }
 }
 }
+
+/* Simple alias analysis. It finds out which load/store operations overlap
+   with CPUArchState. The result is stored in TCGContext and can be used
+   during liveness analysis and register allocation. */
+void tcg_alias_analysis(TCGContext *s)
+{
+int oi, oi_next;
+
+reset_all_temps(s->nb_temps);
+temps[GET_TCGV_PTR(s->tcg_env)].is_base = true;
+temps[GET_TCGV_PTR(s->tcg_env)].val = 0;
+
+for (oi = s->gen_op_buf[0].next; oi != 0; oi = oi_next) {
+int nb_oargs, i;
+int size;
+TCGAliasType tp;
+
+TCGOp * const op = &s->gen_op_buf[oi];
+TCGArg * const args = &s->gen_opparam_buf[op->args];
+TCGOpcode opc = op->opc;
+const TCGOpDef *def = &tcg_op_defs[opc];
+
+oi_next = op->next;
+
+if (opc == INDEX_op_call) {
+nb_oargs = op->callo;
+} else {
+nb_oargs = def->nb_oargs;
+}
+
+s->alias_info[oi] = (TCGAliasInfo){
+TCG_NOT_ALIAS,
+false,
+0,
+0
+};
+
+switch (opc) {
+CASE_OP_32_64(movi):
+temps[args[0]].is_const = 1;
+temps[args[0]].val = args[1];
+break;
+CASE_OP_32_64(mov):
+temps[args[0]].is_const = temps[args[1]].is_const;
+temps[args[0]].is_base = temps[args[1]].is_base;
+temps[args[0]].val = temps[args[1]].val;
+break;
+CASE_OP_32_64(add):
+CASE_OP_32_64(sub):
+if (temps[args[1]].is_base && temps[args[2]].is_const) {
+temps[args[0]].is_base = true;
+temps[args[0]].is_const = false;
+temps[args[0]].val =
+do_constant_folding(opc, temps[args[1]].val,
+temps[args[2]].val);
+} else {
+reset_temp(args[0]);
+}
+break;
+CASE_OP_32_64(ld8s):
+CASE_OP_32_64(ld8u):
+size = 1;
+tp = TCG_ALIAS_READ;
+goto do_ldst;
+CASE_OP_32_64(ld16s):
+CASE_OP_32_64(ld16u):
+size = 2;
+tp = TCG_ALIAS_READ;
+goto do_ldst;
+case INDEX_op_ld_i32:
+case INDEX_op_ld32s_i64:
+case INDEX_op_ld32u_i64:
+size = 4;
+tp = TCG_ALIAS_READ;
+goto do_ldst;
+case INDEX_op_ld_i64:
+size = 8;
+tp = TCG_ALIAS_READ;
+goto do_ldst;
+case INDEX_op_ld_v128:
+size = 16;
+tp = TCG_ALIAS_READ;
+goto do_ldst;
+CASE_OP_32_64(st8):
+size = 1;
+tp = TCG_ALIAS_WRITE;
+goto do_ldst;
+CASE_OP_32_64(st16):
+size = 2;
+tp = TCG_ALIAS_WRITE;
+goto do_ldst;
+case INDEX_op_st_i32:
+case INDEX_op_st32_i64:
+size = 4;
+tp = TCG_ALIAS_WRITE;
+goto do_ldst;
+case INDEX_op_st_i64:
+size = 8;
+tp = TCG_ALIAS_WRITE;
+goto do_ldst;
+case INDEX_op_st_v128:
+size = 16;
+tp = TCG_ALIAS_WRITE;
+goto do_ldst;
+do_ldst:
+if (temps[args[1]].is_base) {
+TCGArg val;
+#if TCG_TARGET_REG_BITS == 32
+val = do_constant_folding(INDEX_op_add_i32,
+  temps[args[1]].val,
+  args[2]);
+#else
+  

[Qemu-devel] [PATCH v2.1 02/21] tcg: add support for 64bit vector type

2017-02-02 Thread Kirill Batuzov
Introduce TCG_TYPE_V64 and corresponding TCGv_v64 for TCG temps. Add helper
functions that work with temps of this new type.
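
A minimal usage sketch (assuming a tcg_temp_new_v64() convenience wrapper
around tcg_temp_new_internal_v64(), mirroring the existing i32/i64 wrappers;
only the internal function and tcg_temp_free_v64() appear in this patch):

    TCGv_v64 tmp = tcg_temp_new_v64();   /* assumed wrapper */
    /* ... generate ops that define and use tmp ... */
    tcg_gen_discard_v64(tmp);
    tcg_temp_free_v64(tmp);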

Signed-off-by: Kirill Batuzov 
---
 tcg/tcg-op.h | 23 +++
 tcg/tcg.c| 13 +
 tcg/tcg.h| 34 ++
 3 files changed, 70 insertions(+)

diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 5abf8b2..517745e 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -266,6 +266,24 @@ static inline void tcg_gen_op3_v128(TCGOpcode opc, 
TCGv_v128 a1,
 GET_TCGV_V128(a3));
 }
 
+static inline void tcg_gen_op1_v64(TCGOpcode opc, TCGv_v64 a1)
+{
+tcg_gen_op1(&tcg_ctx, opc, GET_TCGV_V64(a1));
+}
+
+static inline void tcg_gen_op2_v64(TCGOpcode opc, TCGv_v64 a1,
+TCGv_v64 a2)
+{
+tcg_gen_op2(&tcg_ctx, opc, GET_TCGV_V64(a1), GET_TCGV_V64(a2));
+}
+
+static inline void tcg_gen_op3_v64(TCGOpcode opc, TCGv_v64 a1,
+TCGv_v64 a2, TCGv_v64 a3)
+{
+tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_V64(a1), GET_TCGV_V64(a2),
+GET_TCGV_V64(a3));
+}
+
 /* Generic ops.  */
 
 static inline void gen_set_label(TCGLabel *l)
@@ -478,6 +496,11 @@ static inline void tcg_gen_discard_v128(TCGv_v128 arg)
 tcg_gen_op1_v128(INDEX_op_discard, arg);
 }
 
+static inline void tcg_gen_discard_v64(TCGv_v64 arg)
+{
+tcg_gen_op1_v64(INDEX_op_discard, arg);
+}
+
 /* 64 bit ops */
 
 void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 2a5e83b..5e69103 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -641,6 +641,14 @@ TCGv_i64 tcg_temp_new_internal_i64(int temp_local)
 return MAKE_TCGV_I64(idx);
 }
 
+TCGv_v64 tcg_temp_new_internal_v64(int temp_local)
+{
+int idx;
+
+idx = tcg_temp_new_internal(TCG_TYPE_V64, temp_local);
+return MAKE_TCGV_V64(idx);
+}
+
 TCGv_v128 tcg_temp_new_internal_v128(int temp_local)
 {
 int idx;
@@ -681,6 +689,11 @@ void tcg_temp_free_i64(TCGv_i64 arg)
 tcg_temp_free_internal(GET_TCGV_I64(arg));
 }
 
+void tcg_temp_free_v64(TCGv_v64 arg)
+{
+tcg_temp_free_internal(GET_TCGV_V64(arg));
+}
+
 void tcg_temp_free_v128(TCGv_v128 arg)
 {
 tcg_temp_free_internal(GET_TCGV_V128(arg));
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 56484e7..fa455ae 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -246,6 +246,7 @@ typedef struct TCGPool {
 typedef enum TCGType {
 TCG_TYPE_I32,
 TCG_TYPE_I64,
+TCG_TYPE_V64,
 TCG_TYPE_V128,
 TCG_TYPE_COUNT, /* number of different types */
 
@@ -422,6 +423,7 @@ typedef tcg_target_ulong TCGArg;
 typedef struct TCGv_i32_d *TCGv_i32;
 typedef struct TCGv_i64_d *TCGv_i64;
 typedef struct TCGv_ptr_d *TCGv_ptr;
+typedef struct TCGv_v64_d *TCGv_v64;
 typedef struct TCGv_v128_d *TCGv_v128;
 typedef TCGv_ptr TCGv_env;
 #if TARGET_LONG_BITS == 32
@@ -447,6 +449,11 @@ static inline TCGv_ptr QEMU_ARTIFICIAL 
MAKE_TCGV_PTR(intptr_t i)
 return (TCGv_ptr)i;
 }
 
+static inline TCGv_v64 QEMU_ARTIFICIAL MAKE_TCGV_V64(intptr_t i)
+{
+return (TCGv_v64)i;
+}
+
 static inline TCGv_v128 QEMU_ARTIFICIAL MAKE_TCGV_V128(intptr_t i)
 {
 return (TCGv_v128)i;
@@ -467,6 +474,11 @@ static inline intptr_t QEMU_ARTIFICIAL 
GET_TCGV_PTR(TCGv_ptr t)
 return (intptr_t)t;
 }
 
+static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_V64(TCGv_v64 t)
+{
+return (intptr_t)t;
+}
+
 static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_V128(TCGv_v128 t)
 {
 return (intptr_t)t;
@@ -479,17 +491,20 @@ static inline intptr_t QEMU_ARTIFICIAL 
GET_TCGV_V128(TCGv_v128 t)
 
 #define TCGV_EQUAL_I32(a, b) (GET_TCGV_I32(a) == GET_TCGV_I32(b))
 #define TCGV_EQUAL_I64(a, b) (GET_TCGV_I64(a) == GET_TCGV_I64(b))
+#define TCGV_EQUAL_V64(a, b) (GET_TCGV_V64(a) == GET_TCGV_V64(b))
 #define TCGV_EQUAL_V128(a, b) (GET_TCGV_V128(a) == GET_TCGV_V128(b))
 #define TCGV_EQUAL_PTR(a, b) (GET_TCGV_PTR(a) == GET_TCGV_PTR(b))
 
 /* Dummy definition to avoid compiler warnings.  */
 #define TCGV_UNUSED_I32(x) x = MAKE_TCGV_I32(-1)
 #define TCGV_UNUSED_I64(x) x = MAKE_TCGV_I64(-1)
+#define TCGV_UNUSED_V64(x) x = MAKE_TCGV_V64(-1)
 #define TCGV_UNUSED_V128(x) x = MAKE_TCGV_V128(-1)
 #define TCGV_UNUSED_PTR(x) x = MAKE_TCGV_PTR(-1)
 
 #define TCGV_IS_UNUSED_I32(x) (GET_TCGV_I32(x) == -1)
 #define TCGV_IS_UNUSED_I64(x) (GET_TCGV_I64(x) == -1)
+#define TCGV_IS_UNUSED_V64(x) (GET_TCGV_V64(x) == -1)
 #define TCGV_IS_UNUSED_V128(x) (GET_TCGV_V128(x) == -1)
 #define TCGV_IS_UNUSED_PTR(x) (GET_TCGV_PTR(x) == -1)
 
@@ -813,10 +828,12 @@ TCGv_i64 tcg_global_reg_new_i64(TCGReg reg, const char 
*name);
 
 TCGv_i32 tcg_temp_new_internal_i32(int temp_local);
 TCGv_i64 tcg_temp_new_internal_i64(int temp_local);
+TCGv_v64 tcg_temp_new_internal_v64(int temp_local);
 TCGv_v128 tcg_temp_new_internal_v128(int temp_local);
 
 void tcg_temp_free_i32(TCGv_i32 arg);
 void tcg_temp_free_i64(TCGv_i64 arg);
+void tcg_temp_free_v64(TCGv_v64 arg);
 void tcg_temp_free_v128(TCGv_v128 arg);
 
 static inl

[Qemu-devel] [PATCH v2.1 00/20] Emulate guest vector operations with host vector operations

2017-02-02 Thread Kirill Batuzov
The goal of this patch series is to set up an infrastructure to emulate
guest vector operations using host vector operations. Preliminary
experiments show that simply translating loads and stores increases the
performance of the x264 video codec by 10%. The performance of a GCC
vectorized for loop increased 2x.

To be able to emulate guest vector operations using host vector operations,
several things need to be done.

1. Corresponding vector types should be added to TCG. This series adds
TCG_v128 and TCG_v64. I've made TCG_v64 a different type from TCG_i64
because it usually needs to be allocated to different registers and
supports different operations.

2. Load/store operations for these new types need to be implemented.

3. For a seamless transition from the current model to the new one we need
to handle cases where memory occupied by a global variable can be accessed
via a pointer to the CPUArchState structure. A very simple conservative alias
analysis has been added to do it. This analysis tracks memory loads and
stores that overlap with fields of CPUArchState and provides this
information to the register allocator. The allocator then spills and
reloads affected globals when needed.

4. Allow overlapping globals. For scalar registers this is a rare case, and
overlapping registers can be handled as a single one (ah, al, ax, eax,
rax). In ARM every Q-register consists of two D-registers, each consisting
of two S-registers. Handling 4 S-registers as one just because they are
parts of the same Q-register is way too inefficient.

5. Add new memory addressing mode to MMU code for large accesses and create
needed helpers. Only 128-bit vectors have been handled for now.

6. Create TCG opcodes for vector operations. Only addition has been handled
in this series. Each operation has a wrapper that checks whether the backend
supports the corresponding operation. In one case the vector opcode
is generated, in the other the operation is emulated with scalar
operations. The emulation code is generated inline for performance reasons
(there is a huge performance difference between inline generation
and calling a helper). As a positive side effect this will eventually allow
merging similar emulation code for vector instructions from different
frontends into a target-independent implementation.
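
The wrapper pattern described above looks roughly like this (a sketch only;
it reuses the TCG_TARGET_HAS_* macros and opcode names from later patches,
and the scalar-emulation helper name is purely illustrative):

    static inline void tcg_gen_add_i32x4(TCGv_v128 res, TCGv_v128 a, TCGv_v128 b)
    {
    #ifdef TCG_TARGET_HAS_add_i32x4
        /* backend advertises the opcode: emit it directly */
        tcg_gen_op3_v128(INDEX_op_add_i32x4, res, a, b);
    #else
        /* emulate inline with scalar loads/adds/stores on the variables'
           memory locations -- no helper call */
        gen_add_i32x4_with_scalar_ops(res, a, b);   /* hypothetical */
    #endif
    }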

7. Use the new operations in the frontend (ARM was used in this series).

8. Support the new operations in the backend (x86_64 was used in this series).

For experiments I have used an ARM guest on an x86_64 host. I wanted a pair
of different architectures that both have vector extensions. The ARM and
x86_64 pair fits well.

v1 -> v2:
 - represent v128 type with smaller types when it is not supported by the host
 - detect AVX support and use AVX instructions when available
 - tcg/README updated
 - generate two v64 adds instead of one v128 when applicable
 - rebased to newer master
 - overlap detection for temps added (it needs to be explicitly called from
   _translate_init)
 - the stack is used to temporarily store 128-bit variables in memory
   (instead of the TCGContext field)

v2 -> v2.1
 - automatic build failure fixed

Outstanding issues:
 - qemu_ld_v128 and qemu_st_v128 do not generate fallback code if the host
   does not support 128-bit registers. The reason is that I do not know how
   to handle differing host/guest endianness (do we swap only the bytes
   within each element, or the whole vector? see the illustration below).
   Different targets seem to have different ideas on how this should be done.
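
To illustrate the ambiguity (hypothetical values, little-endian guest on a
big-endian host):

    /* A guest v128 holding the 32-bit elements
     *   {0x11223344, 0x55667788, 0x99aabbcc, 0xddeeff00}
     * is stored little-endian as the bytes
     *   44 33 22 11 88 77 66 55 cc bb aa 99 00 ff ee dd
     * Swapping the bytes within each element gives
     *   11 22 33 44 55 66 77 88 99 aa bb cc dd ee ff 00
     * while reversing the whole 16-byte vector gives
     *   dd ee ff 00 99 aa bb cc 55 66 77 88 11 22 33 44
     * i.e. the element order changes as well.  Which layout the fallback
     * should produce depends on how the target defines its vector registers.
     */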

Kirill Batuzov (20):
  tcg: add support for 128bit vector type
  tcg: add support for 64bit vector type
  tcg: support representing vector type with smaller vector or scalar
types
  tcg: add ld_v128, ld_v64, st_v128 and st_v64 opcodes
  tcg: add simple alias analysis
  tcg: use results of alias analysis in liveness analysis
  tcg: allow globals to overlap
  tcg: add vector addition operations
  target/arm: support access to vector guest registers as globals
  target/arm: use vector opcode to handle vadd. instruction
  tcg/i386: add support for vector opcodes
  tcg/i386: support 64-bit vector operations
  tcg/i386: support remaining vector addition operations
  tcg: do not rely on exact values of MO_BSWAP or MO_SIGN in backend
  tcg: introduce new TCGMemOp - MO_128
  tcg: introduce qemu_ld_v128 and qemu_st_v128 opcodes
  softmmu: create helpers for vector loads
  tcg/i386: add support for qemu_ld_v128/qemu_st_v128 ops
  target/arm: load two consecutive 64-bits vector regs as a 128-bit
vector reg
  tcg/README: update README to include information about vector opcodes

Kirill Batuzov (21):
  tcg: add support for 128bit vector type
  tcg: add support for 64bit vector type
  tcg: support representing vector type with smaller vector or scalar
types
  tcg: add ld_v128, ld_v64, st_v128 and st_v64 opcodes
  tcg: add simple alias analysis
  tcg: use results of alias analysis in liveness analysis
  tcg: allow globals to overlap
  tcg: add vector addition operations
  target/a

[Qemu-devel] [PATCH v2.1 08/21] tcg: add vector addition operations

2017-02-02 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---

Support for representing a v128 addition as two v64 additions has been added.
As a result the GEN_VECT_WRAPPER_HALVES macro was added. It is larger and more
complicated than the original GEN_VECT_WRAPPER (which is still used for v64
additions because they do not have half operations (v32 additions)).

GEN_VECT_WRAPPER_HALVES seems to grow fast (in size and complexity) for each
supported representation. Calling tcg_gen_add_ may not be desirable
because last-resort fallback code is better generated for the whole vector, as
that requires fewer additional operations.

Some additional performance optimization can be done by creating a hand-written
tcg_gen_internal_ for some cases (for example, add_i8x16). Such a
function would still operate on memory locations but would use 64-bit scalar
additions with some bit masking, as Richard suggested in the v1 discussion (see
the sketch below). This series is focused on infrastructure (not on the
optimization of particular instructions), so I have not included this
optimization yet.
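
For reference, one shape such a masked 64-bit addition could take (a sketch of
the kind of trick referred to above; whether this matches the exact v1
suggestion is an assumption):

    #include <stdint.h>

    /* Packed 8x8-bit addition with a single 64-bit scalar add: sum the low
       7 bits of every byte, then patch in the top bits separately so that
       carries never cross a byte boundary. */
    static inline uint64_t add_i8x8_swar(uint64_t a, uint64_t b)
    {
        const uint64_t msb = 0x8080808080808080ull;
        return ((a & ~msb) + (b & ~msb)) ^ ((a ^ b) & msb);
    }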

---
 tcg/tcg-op.c  |  64 ++
 tcg/tcg-op.h  | 167 ++
 tcg/tcg-opc.h |  12 +
 tcg/tcg.c |  12 +
 tcg/tcg.h |  43 +++
 5 files changed, 298 insertions(+)

diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 95a39b7..8a19eee 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -3038,3 +3038,67 @@ static void tcg_gen_mov2_i64(TCGv_i64 r, TCGv_i64 a, 
TCGv_i64 b)
 GEN_ATOMIC_HELPER(xchg, mov2, 0)
 
 #undef GEN_ATOMIC_HELPER
+
+/* Find a memory location for 128-bit TCG variable. */
+void tcg_v128_to_ptr(TCGv_v128 tmp, TCGv_ptr base, int slot,
+ TCGv_ptr *real_base, intptr_t *real_offset, int is_read)
+{
+int idx = GET_TCGV_V128(tmp);
+assert(idx >= 0 && idx < tcg_ctx.nb_temps);
+if (idx < tcg_ctx.nb_globals) {
+/* Globals use their locations within CPUArchState. */
+int env = GET_TCGV_PTR(tcg_ctx.tcg_env);
+TCGTemp *ts_env = &tcg_ctx.temps[env];
+TCGTemp *ts_arg = &tcg_ctx.temps[idx];
+
+/* Sanity checks: global's memory locations must be addressed
+   relative to ENV. */
+assert(ts_env->val_type == TEMP_VAL_REG &&
+   ts_env == ts_arg->mem_base &&
+   ts_arg->mem_allocated);
+
+*real_base = tcg_ctx.tcg_env;
+*real_offset = ts_arg->mem_offset;
+} else {
+/* Temporaries use swap space in TCGContext. Since we already have
+   a 128-bit temporary we'll assume that the target supports 128-bit
+   loads and stores. */
+*real_base = base;
+*real_offset = slot * 16;
+if (is_read) {
+tcg_gen_st_v128(tmp, base, slot * 16);
+}
+}
+}
+
+/* Find a memory location for 64-bit vector TCG variable. */
+void tcg_v64_to_ptr(TCGv_v64 tmp, TCGv_ptr base, int slot,
+TCGv_ptr *real_base, intptr_t *real_offset, int is_read)
+{
+int idx = GET_TCGV_V64(tmp);
+assert(idx >= 0 && idx < tcg_ctx.nb_temps);
+if (idx < tcg_ctx.nb_globals) {
+/* Globals use their locations within CPUArchState. */
+int env = GET_TCGV_PTR(tcg_ctx.tcg_env);
+TCGTemp *ts_env = &tcg_ctx.temps[env];
+TCGTemp *ts_arg = &tcg_ctx.temps[idx];
+
+/* Sanity checks: global's memory locations must be addressed
+   relative to ENV. */
+assert(ts_env->val_type == TEMP_VAL_REG &&
+   ts_env == ts_arg->mem_base &&
+   ts_arg->mem_allocated);
+
+*real_base = tcg_ctx.tcg_env;
+*real_offset = ts_arg->mem_offset;
+} else {
+/* Temporaries use swap space in TCGContext. Since we already have
+   a 128-bit temporary we'll assume that the target supports 128-bit
+   loads and stores. */
+*real_base = base;
+*real_offset = slot * 16;
+if (is_read) {
+tcg_gen_st_v64(tmp, base, slot * 16);
+}
+}
+}
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 250493b..3727be7 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -1195,6 +1195,10 @@ void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, 
TCGv_i64, TCGArg, TCGMemOp);
 tcg_gen_add_i32(TCGV_PTR_TO_NAT(R), TCGV_PTR_TO_NAT(A), TCGV_PTR_TO_NAT(B))
 # define tcg_gen_addi_ptr(R, A, B) \
 tcg_gen_addi_i32(TCGV_PTR_TO_NAT(R), TCGV_PTR_TO_NAT(A), (B))
+# define tcg_gen_mov_ptr(R, B) \
+tcg_gen_mov_i32(TCGV_PTR_TO_NAT(R), TCGV_PTR_TO_NAT(B))
+# define tcg_gen_movi_ptr(R, B) \
+tcg_gen_movi_i32(TCGV_PTR_TO_NAT(R), (B))
 # define tcg_gen_ext_i32_ptr(R, A) \
 tcg_gen_mov_i32(TCGV_PTR_TO_NAT(R), (A))
 #else
@@ -1206,6 +1210,169 @@ void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, 
TCGv_i64, TCGArg, TCGMemOp);
 tcg_gen_add_i64(TCGV_PTR_TO_NAT(R), TCGV_PTR_TO_NAT(A), TCGV_PTR_TO_NAT(B))
 # define

[Qemu-devel] [PATCH v2.1 07/21] tcg: allow globals to overlap

2017-02-02 Thread Kirill Batuzov
Sometimes the target architecture may allow some parts of a register to be
accessed as a different register. If both of these registers are
implemented as globals in QEMU, then their contents will overlap and a
change to one global will also change the value of the other. To handle
such situations properly, some fixes are needed in the register allocator
and liveness analysis.
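
Taking the ARM registers used later in the series as an example, the intended
relationship is roughly the following (an illustration of the sub_temps and
overlap_temps arrays introduced below, not literal contents from the patch):

    /* q0 occupies the same CPUArchState bytes as d0 and d1, so:
     *   q0->sub_temps     ~ { d0, d1, (TCGArg)-1 }    parts of q0
     *   d0->overlap_temps ~ { q0, (TCGArg)-1 }        temps covering d0
     * Writing d0 then forces q0 to be synced back to memory, while
     * writing q0 marks the cached values of d0 and d1 as dead.
     */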

Signed-off-by: Kirill Batuzov 
---
 tcg/optimize.c |  19 -
 tcg/tcg.c  | 128 +
 tcg/tcg.h  |  20 +
 3 files changed, 166 insertions(+), 1 deletion(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index 2347ce3..7a69ff0 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -55,7 +55,7 @@ static inline bool temp_is_copy(TCGArg arg)
 }
 
 /* Reset TEMP's state, possibly removing the temp for the list of copies.  */
-static void reset_temp(TCGArg temp)
+static void reset_this_temp(TCGArg temp)
 {
 temps[temps[temp].next_copy].prev_copy = temps[temp].prev_copy;
 temps[temps[temp].prev_copy].next_copy = temps[temp].next_copy;
@@ -66,6 +66,23 @@ static void reset_temp(TCGArg temp)
 temps[temp].mask = -1;
 }
 
+static void reset_temp(TCGArg temp)
+{
+int i;
+TCGTemp *ts = &tcg_ctx.temps[temp];
+reset_this_temp(temp);
+if (ts->sub_temps) {
+for (i = 0; ts->sub_temps[i] != (TCGArg)-1; i++) {
+reset_this_temp(ts->sub_temps[i]);
+}
+}
+if (ts->overlap_temps) {
+for (i = 0; ts->overlap_temps[i] != (TCGArg)-1; i++) {
+reset_this_temp(ts->overlap_temps[i]);
+}
+}
+}
+
 /* Reset all temporaries, given that there are NB_TEMPS of them.  */
 static void reset_all_temps(int nb_temps)
 {
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 27e5944..a8df040 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -623,9 +623,13 @@ int tcg_global_mem_new_internal(TCGType base_type, 
TCGv_ptr base,
 ts2->mem_base = base_ts;
 ts2->mem_offset = offset + cur_offset;
 ts2->name = g_strdup_printf("%s_%d", name, i);
+ts2->sub_temps = NULL;
+ts2->overlap_temps = NULL;
 ts1 = ts2;
 }
 }
+ts->sub_temps = NULL;
+ts->overlap_temps = NULL;
 return temp_idx(s, ts);
 }
 
@@ -1514,6 +1518,35 @@ static int tcg_temp_overlap(TCGContext *s, const TCGTemp 
*tmp,
 }
 }
 
+static void tcg_temp_arr_apply(const TCGArg *arr, uint8_t *temp_state,
+   uint8_t temp_val)
+{
+TCGArg i;
+if (!arr) {
+return ;
+}
+for (i = 0; arr[i] != (TCGArg)-1; i++) {
+temp_state[arr[i]] = temp_val;
+}
+}
+
+static void tcg_sub_temps_dead(TCGContext *s, TCGArg tmp, uint8_t *temp_state)
+{
+tcg_temp_arr_apply(s->temps[tmp].sub_temps, temp_state, TS_DEAD);
+}
+
+static void tcg_sub_temps_sync(TCGContext *s, TCGArg tmp, uint8_t *temp_state)
+{
+tcg_temp_arr_apply(s->temps[tmp].sub_temps, temp_state, TS_MEM | TS_DEAD);
+}
+
+static void tcg_overlap_temps_sync(TCGContext *s, TCGArg tmp,
+   uint8_t *temp_state)
+{
+tcg_temp_arr_apply(s->temps[tmp].overlap_temps, temp_state,
+   TS_MEM | TS_DEAD);
+}
+
 /* Liveness analysis : update the opc_arg_life array to tell if a
given input arguments is dead. Instructions updating dead
temporaries are removed. */
@@ -1568,6 +1601,11 @@ static void liveness_pass_1(TCGContext *s, uint8_t 
*temp_state)
 if (temp_state[arg] & TS_MEM) {
 arg_life |= SYNC_ARG << i;
 }
+/* sub_temps are also dead */
+tcg_sub_temps_dead(&tcg_ctx, arg, temp_state);
+/* overlap_temps need to go to memory */
+tcg_overlap_temps_sync(&tcg_ctx, arg, temp_state);
+
 temp_state[arg] = TS_DEAD;
 }
 
@@ -1595,6 +1633,11 @@ static void liveness_pass_1(TCGContext *s, uint8_t 
*temp_state)
 for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
 arg = args[i];
 if (arg != TCG_CALL_DUMMY_ARG) {
+/* both sub_temps and overlap_temps need to go
+   to memory */
+tcg_sub_temps_sync(&tcg_ctx, arg, temp_state);
+tcg_overlap_temps_sync(&tcg_ctx, arg, temp_state);
+
 temp_state[arg] &= ~TS_DEAD;
 }
 }
@@ -1713,6 +1756,11 @@ static void liveness_pass_1(TCGContext *s, uint8_t 
*temp_state)
 if (temp_state[arg] & TS_MEM) {
 arg_life |= SYNC_ARG << i;
 }
+/* sub_temps are also dead */
+

[Qemu-devel] [PATCH v2.1 12/21] tcg/i386: support 64-bit vector operations

2017-02-02 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/i386/tcg-target.h |  1 +
 tcg/i386/tcg-target.inc.c | 22 ++
 2 files changed, 23 insertions(+)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index b0704e8..755ebaa 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -31,6 +31,7 @@
 #ifdef __x86_64__
 # if defined(TARGET_WORDS_BIGENDIAN) == defined(HOST_WORDS_BIGENDIAN)
 #  define TCG_TARGET_HAS_REG128 1
+#  define TCG_TARGET_HAS_REGV64 1
 # endif
 # ifdef TCG_TARGET_HAS_REG128
 #  define TCG_TARGET_REG_BITS  64
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 3e718f3..208bb81 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -390,6 +390,9 @@ static inline int tcg_target_const_match(tcg_target_long 
val, TCGType type,
 #define OPC_MOVDQU_M2R  (0x6f | P_SSE_F30F)  /* load 128-bit value */
 #define OPC_MOVDQU_R2M  (0x7f | P_SSE_F30F)  /* store 128-bit value */
 #define OPC_MOVDQA_R2R  (0x6f | P_SSE_660F)  /* reg-to-reg 128-bit mov */
+#define OPC_MOVQ_M2R(0x7e | P_SSE_F30F)
+#define OPC_MOVQ_R2M(0xd6 | P_SSE_660F)
+#define OPC_MOVQ_R2R(0x7e | P_SSE_F30F)
 #define OPC_PADDD   (0xfe | P_SSE_660F)
 
 /* Group 1 opcode extensions for 0x80-0x83.
@@ -700,6 +703,15 @@ static inline void tcg_out_mov(TCGContext *s, TCGType type,
 tcg_out_modrm(s, OPC_MOVDQA_R2R, ret, arg);
 }
 break;
+case TCG_TYPE_V64:
+ret -= TCG_REG_XMM0;
+arg -= TCG_REG_XMM0;
+if (have_avx) {
+tcg_out_vex_modrm(s, OPC_MOVQ_R2R, ret, 15, arg);
+} else {
+tcg_out_modrm(s, OPC_MOVQ_R2R, ret, arg);
+}
+break;
 case TCG_TYPE_I32:
 case TCG_TYPE_I64:
 opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
@@ -788,6 +800,10 @@ static inline void tcg_out_ld(TCGContext *s, TCGType type, 
TCGReg ret,
 ret -= TCG_REG_XMM0;
 tcg_out_modrm_offset(s, OPC_MOVDQU_M2R, ret, arg1, arg2);
 break;
+case TCG_TYPE_V64:
+ret -= TCG_REG_XMM0;
+tcg_out_modrm_offset(s, OPC_MOVQ_M2R, ret, arg1, arg2);
+break;
 case TCG_TYPE_I32:
 case TCG_TYPE_I64:
 opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
@@ -807,6 +823,10 @@ static inline void tcg_out_st(TCGContext *s, TCGType type, 
TCGReg arg,
 arg -= TCG_REG_XMM0;
 tcg_out_modrm_offset(s, OPC_MOVDQU_R2M, arg, arg1, arg2);
 break;
+case TCG_TYPE_V64:
+arg -= TCG_REG_XMM0;
+tcg_out_modrm_offset(s, OPC_MOVQ_R2M, arg, arg1, arg2);
+break;
 case TCG_TYPE_I32:
 case TCG_TYPE_I64:
 opc = OPC_MOVL_EvGv + (type == TCG_TYPE_I64 ? P_REXW : 0);
@@ -2407,6 +2427,8 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode 
op)
 
 case INDEX_op_ld_v128:
 case INDEX_op_st_v128:
+case INDEX_op_ld_v64:
+case INDEX_op_st_v64:
 return &V_r;
 
 case INDEX_op_st8_i32:
-- 
2.1.4




[Qemu-devel] [PATCH v2.1 10/21] target/arm: use vector opcode to handle vadd. instruction

2017-02-02 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 target/arm/translate.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/target/arm/translate.c b/target/arm/translate.c
index d7578e2..90e14df 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -5628,6 +5628,37 @@ static int disas_neon_data_insn(DisasContext *s, 
uint32_t insn)
 return 1;
 }
 
+/* Use vector ops to handle what we can */
+switch (op) {
+case NEON_3R_VADD_VSUB:
+if (!u) {
+void (* const gen_add_v128[])(TCGv_v128, TCGv_v128,
+ TCGv_v128) = {
+tcg_gen_add_i8x16,
+tcg_gen_add_i16x8,
+tcg_gen_add_i32x4,
+tcg_gen_add_i64x2
+};
+void (* const gen_add_v64[])(TCGv_v64, TCGv_v64,
+ TCGv_v64) = {
+tcg_gen_add_i8x8,
+tcg_gen_add_i16x4,
+tcg_gen_add_i32x2,
+tcg_gen_add_i64x1
+};
+if (q) {
+gen_add_v128[size](cpu_Q[rd >> 1], cpu_Q[rn >> 1],
+   cpu_Q[rm >> 1]);
+} else {
+gen_add_v64[size](cpu_D[rd], cpu_D[rn], cpu_D[rm]);
+}
+return 0;
+}
+break;
+default:
+break;
+}
+
 for (pass = 0; pass < (q ? 4 : 2); pass++) {
 
 if (pairwise) {
-- 
2.1.4




[Qemu-devel] [PATCH v2.1 13/21] tcg/i386: support remaining vector addition operations

2017-02-02 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---

I believe the checkpatch warning here to be a false positive.

---
 tcg/i386/tcg-target.h | 10 +
 tcg/i386/tcg-target.inc.c | 54 +--
 2 files changed, 62 insertions(+), 2 deletions(-)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 755ebaa..bd6cfe1 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -172,7 +172,17 @@ extern bool have_popcnt;
 #endif
 
 #ifdef TCG_TARGET_HAS_REG128
+#define TCG_TARGET_HAS_add_i8x16    1
+#define TCG_TARGET_HAS_add_i16x8    1
 #define TCG_TARGET_HAS_add_i32x4    1
+#define TCG_TARGET_HAS_add_i64x2    1
+#endif
+
+#ifdef TCG_TARGET_HAS_REGV64
+#define TCG_TARGET_HAS_add_i8x8     1
+#define TCG_TARGET_HAS_add_i16x4    1
+#define TCG_TARGET_HAS_add_i32x2    1
+#define TCG_TARGET_HAS_add_i64x1    1
 #endif
 
 #define TCG_TARGET_deposit_i32_valid(ofs, len) \
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 208bb81..d8f0d81 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -168,6 +168,11 @@ static bool have_lzcnt;
 #else
 # define have_lzcnt 0
 #endif
+#if defined(CONFIG_CPUID_H) && defined(bit_AVX) && defined(bit_OSXSAVE)
+static bool have_avx;
+#else
+# define have_avx 0
+#endif
 
 static tcg_insn_unit *tb_ret_addr;
 
@@ -393,7 +398,10 @@ static inline int tcg_target_const_match(tcg_target_long 
val, TCGType type,
 #define OPC_MOVQ_M2R(0x7e | P_SSE_F30F)
 #define OPC_MOVQ_R2M(0xd6 | P_SSE_660F)
 #define OPC_MOVQ_R2R(0x7e | P_SSE_F30F)
+#define OPC_PADDB   (0xfc | P_SSE_660F)
+#define OPC_PADDW   (0xfd | P_SSE_660F)
 #define OPC_PADDD   (0xfe | P_SSE_660F)
+#define OPC_PADDQ   (0xd4 | P_SSE_660F)
 
 /* Group 1 opcode extensions for 0x80-0x83.
These are also used as modifiers for OPC_ARITH.  */
@@ -1963,6 +1971,19 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode 
opc,
 TCGArg a0, a1, a2;
 int c, const_a2, vexop, rexw = 0;
 
+static const int vect_binop[] = {
+[INDEX_op_add_i8x16] = OPC_PADDB,
+[INDEX_op_add_i16x8] = OPC_PADDW,
+[INDEX_op_add_i32x4] = OPC_PADDD,
+[INDEX_op_add_i64x2] = OPC_PADDQ,
+
+[INDEX_op_add_i8x8]  = OPC_PADDB,
+[INDEX_op_add_i16x4] = OPC_PADDW,
+[INDEX_op_add_i32x2] = OPC_PADDD,
+[INDEX_op_add_i64x1] = OPC_PADDQ,
+};
+
+
 #if TCG_TARGET_REG_BITS == 64
 # define OP_32_64(x) \
 case glue(glue(INDEX_op_, x), _i64): \
@@ -1972,6 +1993,17 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode 
opc,
 # define OP_32_64(x) \
 case glue(glue(INDEX_op_, x), _i32)
 #endif
+#define OP_V128_ALL(x) \
+case glue(glue(INDEX_op_, x), _i8x16): \
+case glue(glue(INDEX_op_, x), _i16x8): \
+case glue(glue(INDEX_op_, x), _i32x4): \
+case glue(glue(INDEX_op_, x), _i64x2)
+
+#define OP_V64_ALL(x) \
+case glue(glue(INDEX_op_, x), _i8x8):  \
+case glue(glue(INDEX_op_, x), _i16x4): \
+case glue(glue(INDEX_op_, x), _i32x2): \
+case glue(glue(INDEX_op_, x), _i64x1)
 
 /* Hoist the loads of the most common arguments.  */
 a0 = args[0];
@@ -2369,8 +2401,13 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode 
opc,
 tcg_out_mb(s, a0);
 break;
 
-case INDEX_op_add_i32x4:
-tcg_out_modrm(s, OPC_PADDD, args[0], args[2]);
+OP_V128_ALL(add):
+OP_V64_ALL(add):
+if (have_avx) {
+tcg_out_vex_modrm(s, vect_binop[opc], args[0], args[1], args[2]);
+} else {
+tcg_out_modrm(s, vect_binop[opc], args[0], args[2]);
+}
 break;
 
 case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
@@ -2383,6 +2420,8 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode 
opc,
 }
 
 #undef OP_32_64
+#undef OP_V128_ALL
+#undef OP_V64_ALL
 }
 
 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
@@ -2613,7 +2652,14 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode 
op)
 return &s2;
 }
 
+case INDEX_op_add_i8x16:
+case INDEX_op_add_i16x8:
 case INDEX_op_add_i32x4:
+case INDEX_op_add_i64x2:
+case INDEX_op_add_i8x8:
+case INDEX_op_add_i16x4:
+case INDEX_op_add_i32x2:
+case INDEX_op_add_i64x1:
 return &V_0_V;
 
 default:
@@ -2728,6 +2774,10 @@ static void tcg_target_init(TCGContext *s)
 #ifdef bit_POPCNT
 have_popcnt = (c & bit_POPCNT) != 0;
 #endif
+#if defined(bit_AVX) && defined(bit_OSXSAVE)
+have_avx = (c & (bit_AVX | bit_OSXSAVE)) == (bit_AVX | bit_OSXSAVE);
+#endif
+
 }
 
 if (max >= 7) {
-- 
2.1.4




[Qemu-devel] [PATCH v2.1 09/21] target/arm: support access to vector guest registers as globals

2017-02-02 Thread Kirill Batuzov
To support vector guest registers as globals we need to do two things:

1) create corresponding globals,
2) mark which globals can overlap.

Signed-off-by: Kirill Batuzov 
---

For vector registers I used the same coding style as was used for scalar
registers. Should I change the brace placement for them all?

---
 target/arm/translate.c | 30 --
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/target/arm/translate.c b/target/arm/translate.c
index 493c627..d7578e2 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -65,6 +65,8 @@ static TCGv_i32 cpu_R[16];
 TCGv_i32 cpu_CF, cpu_NF, cpu_VF, cpu_ZF;
 TCGv_i64 cpu_exclusive_addr;
 TCGv_i64 cpu_exclusive_val;
+static TCGv_v128 cpu_Q[16];
+static TCGv_v64 cpu_D[32];
 
 /* FIXME:  These should be removed.  */
 static TCGv_i32 cpu_F0s, cpu_F1s;
@@ -72,10 +74,20 @@ static TCGv_i64 cpu_F0d, cpu_F1d;
 
 #include "exec/gen-icount.h"
 
-static const char *regnames[] =
+static const char *regnames_r[] =
 { "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7",
   "r8", "r9", "r10", "r11", "r12", "r13", "r14", "pc" };
 
+static const char *regnames_q[] =
+{ "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+  "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" };
+
+static const char *regnames_d[] =
+{ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
+  "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+  "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" };
+
 /* initialize TCG globals.  */
 void arm_translate_init(void)
 {
@@ -87,8 +99,22 @@ void arm_translate_init(void)
 for (i = 0; i < 16; i++) {
 cpu_R[i] = tcg_global_mem_new_i32(cpu_env,
   offsetof(CPUARMState, regs[i]),
-  regnames[i]);
+  regnames_r[i]);
+}
+for (i = 0; i < 16; i++) {
+cpu_Q[i] = tcg_global_mem_new_v128(cpu_env,
+   offsetof(CPUARMState,
+vfp.regs[2 * i]),
+   regnames_q[i]);
 }
+for (i = 0; i < 32; i++) {
+cpu_D[i] = tcg_global_mem_new_v64(cpu_env,
+  offsetof(CPUARMState, vfp.regs[i]),
+  regnames_d[i]);
+}
+
+tcg_detect_overlapping_temps(&tcg_ctx);
+
 cpu_CF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, CF), "CF");
 cpu_NF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, NF), "NF");
 cpu_VF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, VF), "VF");
-- 
2.1.4




[Qemu-devel] [PATCH v2.1 06/21] tcg: use results of alias analysis in liveness analysis

2017-02-02 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/tcg.c | 61 +
 1 file changed, 61 insertions(+)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index 18d97ec..27e5944 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -564,6 +564,11 @@ static intptr_t tcg_type_size(TCGType type)
 }
 }
 
+static intptr_t tcg_temp_size(const TCGTemp *tmp)
+{
+return tcg_type_size(tmp->type);
+}
+
 int tcg_global_mem_new_internal(TCGType base_type, TCGv_ptr base,
 intptr_t offset, const char *name)
 {
@@ -1472,6 +1477,43 @@ static inline void tcg_la_bb_end(TCGContext *s, uint8_t 
*temp_state)
 }
 }
 
+/* Check if memory write completely overwrites temp's memory location.
+   If this is the case then the temp can be considered dead. */
+static int tcg_temp_overwrite(TCGContext *s, const TCGTemp *tmp,
+   const TCGAliasInfo *ai)
+{
+if (!(ai->alias_type & TCG_ALIAS_WRITE) || !ai->fixed_offset) {
+return 0;
+}
+if (tmp->mem_base != &s->temps[GET_TCGV_PTR(s->tcg_env)]) {
+return 0;
+}
+if (ai->offset > tmp->mem_offset
+|| ai->offset + ai->size < tmp->mem_offset + tcg_temp_size(tmp)) {
+return 0;
+}
+return 1;
+}
+
+/* Check if memory read or write overlaps with temp's memory location.
+   If this is the case then the temp must be synced to memory. */
+static int tcg_temp_overlap(TCGContext *s, const TCGTemp *tmp,
+const TCGAliasInfo *ai)
+{
+if (!ai->fixed_offset || tmp->fixed_reg) {
+return 0;
+}
+if (tmp->mem_base != &s->temps[GET_TCGV_PTR(s->tcg_env)]) {
+return 1;
+}
+if (ai->offset >= tmp->mem_offset + tcg_temp_size(tmp)
+|| ai->offset + ai->size <= tmp->mem_offset) {
+return 0;
+} else {
+return 1;
+}
+}
+
 /* Liveness analysis : update the opc_arg_life array to tell if a
given input arguments is dead. Instructions updating dead
temporaries are removed. */
@@ -1674,6 +1716,23 @@ static void liveness_pass_1(TCGContext *s, uint8_t 
*temp_state)
 temp_state[arg] = TS_DEAD;
 }
 
+/* record if the operation uses some globals' memory location 
*/
+if (s->alias_info[oi].alias_type != TCG_NOT_ALIAS) {
+for (i = 0; i < s->nb_globals; i++) {
+if (tcg_temp_overwrite(s, &s->temps[i],
+   &s->alias_info[oi])) {
+temp_state[i] = TS_DEAD;
+} else if (tcg_temp_overlap(s, &s->temps[i],
+&s->alias_info[oi])) {
+if (s->alias_info[oi].alias_type & TCG_ALIAS_READ) 
{
+temp_state[i] = TS_MEM | TS_DEAD;
+} else if (!(temp_state[i] & TS_DEAD)) {
+temp_state[i] |= TS_MEM;
+}
+}
+}
+}
+
 /* if end of basic block, update */
 if (def->flags & TCG_OPF_BB_END) {
 tcg_la_bb_end(s, temp_state);
@@ -2622,6 +2681,8 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
 s->la_time -= profile_getclock();
 #endif
 
+tcg_alias_analysis(s);
+
 {
 uint8_t *temp_state = tcg_malloc(s->nb_temps + s->nb_indirects);
 
-- 
2.1.4




[Qemu-devel] [PATCH v2.1 01/21] tcg: add support for 128bit vector type

2017-02-02 Thread Kirill Batuzov
Introduce TCG_TYPE_V128 and corresponding TCGv_v128 for TCG temps. Add helper
functions that work with temps of this new type.

Signed-off-by: Kirill Batuzov 
---
 tcg/tcg-op.h | 24 
 tcg/tcg.c| 13 +
 tcg/tcg.h| 34 ++
 3 files changed, 71 insertions(+)

diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index c68e300..5abf8b2 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -248,6 +248,23 @@ static inline void tcg_gen_op6ii_i64(TCGOpcode opc, 
TCGv_i64 a1, TCGv_i64 a2,
 GET_TCGV_I64(a3), GET_TCGV_I64(a4), a5, a6);
 }
 
+static inline void tcg_gen_op1_v128(TCGOpcode opc, TCGv_v128 a1)
+{
+tcg_gen_op1(&tcg_ctx, opc, GET_TCGV_V128(a1));
+}
+
+static inline void tcg_gen_op2_v128(TCGOpcode opc, TCGv_v128 a1,
+TCGv_v128 a2)
+{
+tcg_gen_op2(&tcg_ctx, opc, GET_TCGV_V128(a1), GET_TCGV_V128(a2));
+}
+
+static inline void tcg_gen_op3_v128(TCGOpcode opc, TCGv_v128 a1,
+TCGv_v128 a2, TCGv_v128 a3)
+{
+tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_V128(a1), GET_TCGV_V128(a2),
+GET_TCGV_V128(a3));
+}
 
 /* Generic ops.  */
 
@@ -454,6 +471,13 @@ static inline void tcg_gen_not_i32(TCGv_i32 ret, TCGv_i32 
arg)
 }
 }
 
+/* Vector ops */
+
+static inline void tcg_gen_discard_v128(TCGv_v128 arg)
+{
+tcg_gen_op1_v128(INDEX_op_discard, arg);
+}
+
 /* 64 bit ops */
 
 void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
diff --git a/tcg/tcg.c b/tcg/tcg.c
index cb898f1..2a5e83b 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -641,6 +641,14 @@ TCGv_i64 tcg_temp_new_internal_i64(int temp_local)
 return MAKE_TCGV_I64(idx);
 }
 
+TCGv_v128 tcg_temp_new_internal_v128(int temp_local)
+{
+int idx;
+
+idx = tcg_temp_new_internal(TCG_TYPE_V128, temp_local);
+return MAKE_TCGV_V128(idx);
+}
+
 static void tcg_temp_free_internal(int idx)
 {
 TCGContext *s = &tcg_ctx;
@@ -673,6 +681,11 @@ void tcg_temp_free_i64(TCGv_i64 arg)
 tcg_temp_free_internal(GET_TCGV_I64(arg));
 }
 
+void tcg_temp_free_v128(TCGv_v128 arg)
+{
+tcg_temp_free_internal(GET_TCGV_V128(arg));
+}
+
 TCGv_i32 tcg_const_i32(int32_t val)
 {
 TCGv_i32 t0;
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 631c6f6..56484e7 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -246,6 +246,7 @@ typedef struct TCGPool {
 typedef enum TCGType {
 TCG_TYPE_I32,
 TCG_TYPE_I64,
+TCG_TYPE_V128,
 TCG_TYPE_COUNT, /* number of different types */
 
 /* An alias for the size of the host register.  */
@@ -421,6 +422,7 @@ typedef tcg_target_ulong TCGArg;
 typedef struct TCGv_i32_d *TCGv_i32;
 typedef struct TCGv_i64_d *TCGv_i64;
 typedef struct TCGv_ptr_d *TCGv_ptr;
+typedef struct TCGv_v128_d *TCGv_v128;
 typedef TCGv_ptr TCGv_env;
 #if TARGET_LONG_BITS == 32
 #define TCGv TCGv_i32
@@ -445,6 +447,11 @@ static inline TCGv_ptr QEMU_ARTIFICIAL 
MAKE_TCGV_PTR(intptr_t i)
 return (TCGv_ptr)i;
 }
 
+static inline TCGv_v128 QEMU_ARTIFICIAL MAKE_TCGV_V128(intptr_t i)
+{
+return (TCGv_v128)i;
+}
+
 static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_I32(TCGv_i32 t)
 {
 return (intptr_t)t;
@@ -460,6 +467,11 @@ static inline intptr_t QEMU_ARTIFICIAL 
GET_TCGV_PTR(TCGv_ptr t)
 return (intptr_t)t;
 }
 
+static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_V128(TCGv_v128 t)
+{
+return (intptr_t)t;
+}
+
 #if TCG_TARGET_REG_BITS == 32
 #define TCGV_LOW(t) MAKE_TCGV_I32(GET_TCGV_I64(t))
 #define TCGV_HIGH(t) MAKE_TCGV_I32(GET_TCGV_I64(t) + 1)
@@ -467,15 +479,18 @@ static inline intptr_t QEMU_ARTIFICIAL 
GET_TCGV_PTR(TCGv_ptr t)
 
 #define TCGV_EQUAL_I32(a, b) (GET_TCGV_I32(a) == GET_TCGV_I32(b))
 #define TCGV_EQUAL_I64(a, b) (GET_TCGV_I64(a) == GET_TCGV_I64(b))
+#define TCGV_EQUAL_V128(a, b) (GET_TCGV_V128(a) == GET_TCGV_V128(b))
 #define TCGV_EQUAL_PTR(a, b) (GET_TCGV_PTR(a) == GET_TCGV_PTR(b))
 
 /* Dummy definition to avoid compiler warnings.  */
 #define TCGV_UNUSED_I32(x) x = MAKE_TCGV_I32(-1)
 #define TCGV_UNUSED_I64(x) x = MAKE_TCGV_I64(-1)
+#define TCGV_UNUSED_V128(x) x = MAKE_TCGV_V128(-1)
 #define TCGV_UNUSED_PTR(x) x = MAKE_TCGV_PTR(-1)
 
 #define TCGV_IS_UNUSED_I32(x) (GET_TCGV_I32(x) == -1)
 #define TCGV_IS_UNUSED_I64(x) (GET_TCGV_I64(x) == -1)
+#define TCGV_IS_UNUSED_V128(x) (GET_TCGV_V128(x) == -1)
 #define TCGV_IS_UNUSED_PTR(x) (GET_TCGV_PTR(x) == -1)
 
 /* call flags */
@@ -798,9 +813,11 @@ TCGv_i64 tcg_global_reg_new_i64(TCGReg reg, const char 
*name);
 
 TCGv_i32 tcg_temp_new_internal_i32(int temp_local);
 TCGv_i64 tcg_temp_new_internal_i64(int temp_local);
+TCGv_v128 tcg_temp_new_internal_v128(int temp_local);
 
 void tcg_temp_free_i32(TCGv_i32 arg);
 void tcg_temp_free_i64(TCGv_i64 arg);
+void tcg_temp_free_v128(TCGv_v128 arg);
 
 static inline TCGv_i32 tcg_global_mem_new_i32(TCGv_ptr reg, intptr_t offset,
   const char *name)
@@ -836,6 +853,23 @@ static inline TCGv_i64 tcg_temp_local_new_i64(v

[Qemu-devel] [PATCH v2.1 16/21] tcg: introduce new TCGMemOp - MO_128

2017-02-02 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/tcg.h | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tcg/tcg.h b/tcg/tcg.h
index 5e0c6da..63a83f9 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -306,11 +306,12 @@ typedef enum TCGMemOp {
 MO_16= 1,
 MO_32= 2,
 MO_64= 3,
-MO_SIZE  = 3,   /* Mask for the above.  */
+MO_128   = 4,
+MO_SIZE  = 7,   /* Mask for the above.  */
 
-MO_SIGN  = 4,   /* Sign-extended, otherwise zero-extended.  */
+MO_SIGN  = 8,   /* Sign-extended, otherwise zero-extended.  */
 
-MO_BSWAP = 8,   /* Host reverse endian.  */
+MO_BSWAP = 16,   /* Host reverse endian.  */
 #ifdef HOST_WORDS_BIGENDIAN
 MO_LE= MO_BSWAP,
 MO_BE= 0,
@@ -342,7 +343,7 @@ typedef enum TCGMemOp {
  * - an alignment to a specified size, which may be more or less than
  *   the access size (MO_ALIGN_x where 'x' is a size in bytes);
  */
-MO_ASHIFT = 4,
+MO_ASHIFT = 5,
 MO_AMASK = 7 << MO_ASHIFT,
 #ifdef ALIGNED_ONLY
 MO_ALIGN = 0,
-- 
2.1.4




[Qemu-devel] [PATCH v2.1 04/21] tcg: add ld_v128, ld_v64, st_v128 and st_v64 opcodes

2017-02-02 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/tcg-op.h  | 38 ++
 tcg/tcg-opc.h | 18 ++
 2 files changed, 56 insertions(+)

diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 517745e..250493b 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -501,6 +501,44 @@ static inline void tcg_gen_discard_v64(TCGv_v64 arg)
 tcg_gen_op1_v64(INDEX_op_discard, arg);
 }
 
+static inline void tcg_gen_ldst_op_v128(TCGOpcode opc, TCGv_v128 val,
+   TCGv_ptr base, TCGArg offset)
+{
+tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_V128(val), GET_TCGV_PTR(base),
+offset);
+}
+
+static inline void tcg_gen_st_v128(TCGv_v128 arg1, TCGv_ptr arg2,
+   tcg_target_long offset)
+{
+tcg_gen_ldst_op_v128(INDEX_op_st_v128, arg1, arg2, offset);
+}
+
+static inline void tcg_gen_ld_v128(TCGv_v128 ret, TCGv_ptr arg2,
+   tcg_target_long offset)
+{
+tcg_gen_ldst_op_v128(INDEX_op_ld_v128, ret, arg2, offset);
+}
+
+static inline void tcg_gen_ldst_op_v64(TCGOpcode opc, TCGv_v64 val,
+   TCGv_ptr base, TCGArg offset)
+{
+tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_V64(val), GET_TCGV_PTR(base),
+offset);
+}
+
+static inline void tcg_gen_st_v64(TCGv_v64 arg1, TCGv_ptr arg2,
+  tcg_target_long offset)
+{
+tcg_gen_ldst_op_v64(INDEX_op_st_v64, arg1, arg2, offset);
+}
+
+static inline void tcg_gen_ld_v64(TCGv_v64 ret, TCGv_ptr arg2,
+  tcg_target_long offset)
+{
+tcg_gen_ldst_op_v64(INDEX_op_ld_v64, ret, arg2, offset);
+}
+
 /* 64 bit ops */
 
 void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index f06f894..2365c97 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -42,6 +42,18 @@ DEF(br, 0, 0, 1, TCG_OPF_BB_END)
 # define IMPL64  TCG_OPF_64BIT
 #endif
 
+#ifdef TCG_TARGET_HAS_REG128
+# define IMPL128 0
+#else
+# define IMPL128 TCG_OPF_NOT_PRESENT
+#endif
+
+#ifdef TCG_TARGET_HAS_REGV64
+# define IMPLV64 0
+#else
+# define IMPLV64 TCG_OPF_NOT_PRESENT
+#endif
+
 DEF(mb, 0, 0, 1, 0)
 
 DEF(mov_i32, 1, 1, 0, TCG_OPF_NOT_PRESENT)
@@ -188,6 +200,12 @@ DEF(mulsh_i64, 1, 2, 0, IMPL(TCG_TARGET_HAS_mulsh_i64))
 #define TLADDR_ARGS  (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? 1 : 2)
 #define DATA64_ARGS  (TCG_TARGET_REG_BITS == 64 ? 1 : 2)
 
+/* load/store */
+DEF(st_v128, 0, 2, 1, IMPL128)
+DEF(ld_v128, 1, 1, 1, IMPL128)
+DEF(st_v64, 0, 2, 1, IMPLV64)
+DEF(ld_v64, 1, 1, 1, IMPLV64)
+
 /* QEMU specific */
 DEF(insn_start, 0, 0, TLADDR_ARGS * TARGET_INSN_START_WORDS,
 TCG_OPF_NOT_PRESENT)
-- 
2.1.4




[Qemu-devel] [PATCH v2.1 17/21] tcg: introduce qemu_ld_v128 and qemu_st_v128 opcodes

2017-02-02 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/i386/tcg-target.inc.c |  5 +
 tcg/tcg-op.c  | 24 
 tcg/tcg-op.h  | 15 +++
 tcg/tcg-opc.h |  4 
 4 files changed, 48 insertions(+)

diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 263c15e..1e6edc0 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -2448,6 +2448,7 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode 
op)
 = { .args_ct_str = { "L", "L", "L", "L" } };
 static const TCGTargetOpDef V_r = { .args_ct_str  = { "V", "r" } };
 static const TCGTargetOpDef V_0_V = { .args_ct_str  = { "V", "0", "V" } };
+static const TCGTargetOpDef V_L = { .args_ct_str  = { "V", "L" } };
 
 switch (op) {
 case INDEX_op_ld8u_i32:
@@ -2662,6 +2663,10 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode 
op)
 case INDEX_op_add_i64x1:
 return &V_0_V;
 
+case INDEX_op_qemu_ld_v128:
+case INDEX_op_qemu_st_v128:
+return &V_L;
+
 default:
 break;
 }
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 0dfe611..db74017 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -3102,3 +3102,27 @@ void tcg_v64_to_ptr(TCGv_v64 tmp, TCGv_ptr base, int 
slot,
 }
 }
 }
+
+void tcg_gen_qemu_ld_v128(TCGv_v128 val, TCGv addr, TCGArg idx,
+  TCGMemOp memop)
+{
+#ifdef TCG_TARGET_HAS_REG128
+tcg_debug_assert((memop & MO_BSWAP) == MO_TE);
+TCGMemOpIdx oi = make_memop_idx(memop, idx);
+tcg_gen_op3si_v128(INDEX_op_qemu_ld_v128, val, addr, oi);
+#else
+g_assert_not_reached();
+#endif
+}
+
+void tcg_gen_qemu_st_v128(TCGv_v128 val, TCGv addr, TCGArg idx,
+  TCGMemOp memop)
+{
+#ifdef TCG_TARGET_HAS_REG128
+tcg_debug_assert((memop & MO_BSWAP) == MO_TE);
+TCGMemOpIdx oi = make_memop_idx(memop, idx);
+tcg_gen_op3si_v128(INDEX_op_qemu_st_v128, val, addr, oi);
+#else
+g_assert_not_reached();
+#endif
+}
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 3727be7..dc1d032 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -266,6 +266,19 @@ static inline void tcg_gen_op3_v128(TCGOpcode opc, 
TCGv_v128 a1,
 GET_TCGV_V128(a3));
 }
 
+static inline void tcg_gen_op3si_v128(TCGOpcode opc, TCGv_v128 a1,
+  TCGv a2, TCGArg a3)
+{
+#if TARGET_LONG_BITS == 64 && TCG_TARGET_REG_BITS == 32
+tcg_gen_op4(&tcg_ctx, opc, GET_TCGV_V128(a1), GET_TCGV_I32(TCGV_LOW(a2)),
+GET_TCGV_I32(TCGV_HIGH(a2)), a3);
+#elif TARGET_LONG_BITS == 32
+tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_V128(a1), GET_TCGV_I32(a2), a3);
+#else
+tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_V128(a1), GET_TCGV_I64(a2), a3);
+#endif
+}
+
 static inline void tcg_gen_op1_v64(TCGOpcode opc, TCGv_v64 a1)
 {
 tcg_gen_op1(&tcg_ctx, opc, GET_TCGV_V64(a1));
@@ -909,6 +922,8 @@ void tcg_gen_qemu_ld_i32(TCGv_i32, TCGv, TCGArg, TCGMemOp);
 void tcg_gen_qemu_st_i32(TCGv_i32, TCGv, TCGArg, TCGMemOp);
 void tcg_gen_qemu_ld_i64(TCGv_i64, TCGv, TCGArg, TCGMemOp);
 void tcg_gen_qemu_st_i64(TCGv_i64, TCGv, TCGArg, TCGMemOp);
+void tcg_gen_qemu_ld_v128(TCGv_v128, TCGv, TCGArg, TCGMemOp);
+void tcg_gen_qemu_st_v128(TCGv_v128, TCGv, TCGArg, TCGMemOp);
 
 static inline void tcg_gen_qemu_ld8u(TCGv ret, TCGv addr, int mem_index)
 {
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index 4c8f195..6c2e697 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -232,6 +232,10 @@ DEF(qemu_ld_i64, DATA64_ARGS, TLADDR_ARGS, 1,
 TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT)
 DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1,
 TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT)
+DEF(qemu_ld_v128, 1, 1, 1,
+TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | IMPL128)
+DEF(qemu_st_v128, 0, 2, 1,
+TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | IMPL128)
 
 #undef TLADDR_ARGS
 #undef DATA64_ARGS
-- 
2.1.4




[Qemu-devel] [PATCH v2.1 03/21] tcg: support representing vector type with smaller vector or scalar types

2017-02-02 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---

This is not as bad as I thought it would be.
Only two cases: type == base_type and type != base_type.
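
As an illustration of the two cases (hypothetical host configurations, using
the q0 global from the ARM patches): with TCG_TARGET_HAS_REG128 the 128-bit
global keeps its base type unchanged (type == base_type); on a 64-bit host
with neither vector type supported it is split into two I64 sub-globals named
by the g_strdup_printf() calls below:

    /* base_type = TCG_TYPE_V128, chosen type = TCG_TYPE_I64, count = 2:
     *   "q0_0" at offset + 0   (little-endian host)
     *   "q0_1" at offset + 8
     * on a big-endian host the two offsets are swapped                */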

---
 tcg/tcg.c | 136 +-
 1 file changed, 91 insertions(+), 45 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index 5e69103..18d97ec 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -523,12 +523,54 @@ TCGv_i64 tcg_global_reg_new_i64(TCGReg reg, const char 
*name)
 return MAKE_TCGV_I64(idx);
 }
 
-int tcg_global_mem_new_internal(TCGType type, TCGv_ptr base,
+static TCGType tcg_choose_type(TCGType type)
+{
+switch (type) {
+case TCG_TYPE_I64:
+if (TCG_TARGET_REG_BITS == 64) {
+return TCG_TYPE_I64;
+}
+/* Fallthrough */
+case TCG_TYPE_I32:
+return TCG_TYPE_I32;
+case TCG_TYPE_V128:
+#ifdef TCG_TARGET_HAS_REG128
+return TCG_TYPE_V128;
+#endif
+/* Fallthrough */
+case TCG_TYPE_V64:
+#ifdef TCG_TARGET_HAS_REGV64
+return TCG_TYPE_V64;
+#else
+return tcg_choose_type(TCG_TYPE_I64);
+#endif
+default:
+g_assert_not_reached();
+}
+}
+
+static intptr_t tcg_type_size(TCGType type)
+{
+switch (type) {
+case TCG_TYPE_I32:
+return 4;
+case TCG_TYPE_I64:
+case TCG_TYPE_V64:
+return 8;
+case TCG_TYPE_V128:
+return 16;
+default:
+g_assert_not_reached();
+}
+}
+
+int tcg_global_mem_new_internal(TCGType base_type, TCGv_ptr base,
 intptr_t offset, const char *name)
 {
 TCGContext *s = &tcg_ctx;
 TCGTemp *base_ts = &s->temps[GET_TCGV_PTR(base)];
 TCGTemp *ts = tcg_global_alloc(s);
+TCGType type = tcg_choose_type(base_type);
 int indirect_reg = 0, bigendian = 0;
 #ifdef HOST_WORDS_BIGENDIAN
 bigendian = 1;
@@ -543,47 +585,51 @@ int tcg_global_mem_new_internal(TCGType type, TCGv_ptr 
base,
 indirect_reg = 1;
 }
 
-if (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64) {
-TCGTemp *ts2 = tcg_global_alloc(s);
-char buf[64];
-
-ts->base_type = TCG_TYPE_I64;
-ts->type = TCG_TYPE_I32;
+if (type == base_type) {
+ts->base_type = type;
+ts->type = type;
 ts->indirect_reg = indirect_reg;
 ts->mem_allocated = 1;
 ts->mem_base = base_ts;
-ts->mem_offset = offset + bigendian * 4;
-pstrcpy(buf, sizeof(buf), name);
-pstrcat(buf, sizeof(buf), "_0");
-ts->name = strdup(buf);
-
-tcg_debug_assert(ts2 == ts + 1);
-ts2->base_type = TCG_TYPE_I64;
-ts2->type = TCG_TYPE_I32;
-ts2->indirect_reg = indirect_reg;
-ts2->mem_allocated = 1;
-ts2->mem_base = base_ts;
-ts2->mem_offset = offset + (1 - bigendian) * 4;
-pstrcpy(buf, sizeof(buf), name);
-pstrcat(buf, sizeof(buf), "_1");
-ts2->name = strdup(buf);
+ts->mem_offset = offset;
+ts->name = name;
 } else {
-ts->base_type = type;
+int i, count = tcg_type_size(base_type) / tcg_type_size(type);
+TCGTemp *ts2, *ts1 = ts;
+int cur_offset =
+bigendian ? tcg_type_size(base_type) - tcg_type_size(type) : 0;
+
+ts->base_type = base_type;
 ts->type = type;
 ts->indirect_reg = indirect_reg;
 ts->mem_allocated = 1;
 ts->mem_base = base_ts;
-ts->mem_offset = offset;
-ts->name = name;
+ts->mem_offset = offset + cur_offset;
+ts->name = g_strdup_printf("%s_0", name);
+
+for (i = 1; i < count; i++) {
+ts2 = tcg_global_alloc(s);
+tcg_debug_assert(ts2 == ts1 + 1);
+cur_offset += (bigendian ? -1 : 1) * tcg_type_size(type);
+ts2->base_type = base_type;
+ts2->type = type;
+ts2->indirect_reg = indirect_reg;
+ts2->mem_allocated = 1;
+ts2->mem_base = base_ts;
+ts2->mem_offset = offset + cur_offset;
+ts2->name = g_strdup_printf("%s_%d", name, i);
+ts1 = ts2;
+}
 }
 return temp_idx(s, ts);
 }
 
-static int tcg_temp_new_internal(TCGType type, int temp_local)
+static int tcg_temp_new_internal(TCGType base_type, int temp_local)
 {
 TCGContext *s = &tcg_ctx;
 TCGTemp *ts;
 int idx, k;
+TCGType type = tcg_choose_type(base_type);
 
 k = type + (temp_local ? TCG_TYPE_COUNT : 0);
 idx = find_first_bit(s->free_temps[k].l, TCG_MAX_TEMPS);
@@ -593,28 +639,28 @@ static int tcg_temp_new_internal(TCGType type, int 
temp_local)
 
 ts = &s->temps[idx];
 ts->temp_allocated = 1;
-tcg_debug_assert(ts->base_type == type);
+tcg_debug_assert(ts->base_type == base_type);
 tcg_debug_asser

[Qemu-devel] [PATCH v2 14/20] tcg: do not rely on exact values of MO_BSWAP or MO_SIGN in backend

2017-02-01 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/aarch64/tcg-target.inc.c |  4 ++--
 tcg/arm/tcg-target.inc.c |  4 ++--
 tcg/i386/tcg-target.inc.c|  4 ++--
 tcg/mips/tcg-target.inc.c|  4 ++--
 tcg/ppc/tcg-target.inc.c |  4 ++--
 tcg/s390/tcg-target.inc.c|  4 ++--
 tcg/sparc/tcg-target.inc.c   | 12 ++--
 tcg/tcg-op.c |  4 ++--
 tcg/tcg.h|  1 +
 9 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
index 6d227a5..2b0b548 100644
--- a/tcg/aarch64/tcg-target.inc.c
+++ b/tcg/aarch64/tcg-target.inc.c
@@ -1032,7 +1032,7 @@ static void tcg_out_cltz(TCGContext *s, TCGType ext, 
TCGReg d,
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  * TCGMemOpIdx oi, uintptr_t ra)
  */
-static void * const qemu_ld_helpers[16] = {
+static void * const qemu_ld_helpers[] = {
 [MO_UB]   = helper_ret_ldub_mmu,
 [MO_LEUW] = helper_le_lduw_mmu,
 [MO_LEUL] = helper_le_ldul_mmu,
@@ -1046,7 +1046,7 @@ static void * const qemu_ld_helpers[16] = {
  * uintxx_t val, TCGMemOpIdx oi,
  * uintptr_t ra)
  */
-static void * const qemu_st_helpers[16] = {
+static void * const qemu_st_helpers[] = {
 [MO_UB]   = helper_ret_stb_mmu,
 [MO_LEUW] = helper_le_stw_mmu,
 [MO_LEUL] = helper_le_stl_mmu,
diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
index e75a6d4..f603f02 100644
--- a/tcg/arm/tcg-target.inc.c
+++ b/tcg/arm/tcg-target.inc.c
@@ -1058,7 +1058,7 @@ static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  * int mmu_idx, uintptr_t ra)
  */
-static void * const qemu_ld_helpers[16] = {
+static void * const qemu_ld_helpers[] = {
 [MO_UB]   = helper_ret_ldub_mmu,
 [MO_SB]   = helper_ret_ldsb_mmu,
 
@@ -1078,7 +1078,7 @@ static void * const qemu_ld_helpers[16] = {
 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
  * uintxx_t val, int mmu_idx, uintptr_t ra)
  */
-static void * const qemu_st_helpers[16] = {
+static void * const qemu_st_helpers[] = {
 [MO_UB]   = helper_ret_stb_mmu,
 [MO_LEUW] = helper_le_stw_mmu,
 [MO_LEUL] = helper_le_stl_mmu,
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index d8f0d81..263c15e 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -1334,7 +1334,7 @@ static void tcg_out_nopn(TCGContext *s, int n)
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  * int mmu_idx, uintptr_t ra)
  */
-static void * const qemu_ld_helpers[16] = {
+static void * const qemu_ld_helpers[] = {
 [MO_UB]   = helper_ret_ldub_mmu,
 [MO_LEUW] = helper_le_lduw_mmu,
 [MO_LEUL] = helper_le_ldul_mmu,
@@ -1347,7 +1347,7 @@ static void * const qemu_ld_helpers[16] = {
 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
  * uintxx_t val, int mmu_idx, uintptr_t ra)
  */
-static void * const qemu_st_helpers[16] = {
+static void * const qemu_st_helpers[] = {
 [MO_UB]   = helper_ret_stb_mmu,
 [MO_LEUW] = helper_le_stw_mmu,
 [MO_LEUL] = helper_le_stl_mmu,
diff --git a/tcg/mips/tcg-target.inc.c b/tcg/mips/tcg-target.inc.c
index 01ac7b2..4f2d5d1 100644
--- a/tcg/mips/tcg-target.inc.c
+++ b/tcg/mips/tcg-target.inc.c
@@ -1108,7 +1108,7 @@ static void tcg_out_call(TCGContext *s, tcg_insn_unit 
*arg)
 }
 
 #if defined(CONFIG_SOFTMMU)
-static void * const qemu_ld_helpers[16] = {
+static void * const qemu_ld_helpers[] = {
 [MO_UB]   = helper_ret_ldub_mmu,
 [MO_SB]   = helper_ret_ldsb_mmu,
 [MO_LEUW] = helper_le_lduw_mmu,
@@ -1125,7 +1125,7 @@ static void * const qemu_ld_helpers[16] = {
 #endif
 };
 
-static void * const qemu_st_helpers[16] = {
+static void * const qemu_st_helpers[] = {
 [MO_UB]   = helper_ret_stb_mmu,
 [MO_LEUW] = helper_le_stw_mmu,
 [MO_LEUL] = helper_le_stl_mmu,
diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c
index 64f67d2..680050b 100644
--- a/tcg/ppc/tcg-target.inc.c
+++ b/tcg/ppc/tcg-target.inc.c
@@ -1419,7 +1419,7 @@ static const uint32_t qemu_exts_opc[4] = {
 /* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
  * int mmu_idx, uintptr_t ra)
  */
-static void * const qemu_ld_helpers[16] = {
+static void * const qemu_ld_helpers[] = {
 [MO_UB]   = helper_ret_ldub_mmu,
 [MO_LEUW] = helper_le_lduw_mmu,
 [MO_LEUL] = helper_le_ldul_mmu,
@@ -1432,7 +1432,7 @@ static void * const qemu_ld_helpers[16] = {
 /* helper signature: helper_st_mmu(CPUState *env, target_ulong addr,
  * uintxx_t val, int mmu_idx, uintptr_t ra)
  */
-static void * const qemu_st_helpers[16

[Qemu-devel] [PATCH v2 17/20] softmmu: create helpers for vector loads

2017-02-01 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 cputlb.c  |   4 +
 softmmu_template_vector.h | 266 ++
 tcg/tcg.h |   5 +
 3 files changed, 275 insertions(+)
 create mode 100644 softmmu_template_vector.h

diff --git a/cputlb.c b/cputlb.c
index 6c39927..41c9a01 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -660,6 +660,10 @@ static void *atomic_mmu_lookup(CPUArchState *env, 
target_ulong addr,
 #define DATA_SIZE 8
 #include "softmmu_template.h"
 
+#define SHIFT 4
+#include "softmmu_template_vector.h"
+#undef MMUSUFFIX
+
 /* First set of helpers allows passing in of OI and RETADDR.  This makes
them callable from other helpers.  */
 
diff --git a/softmmu_template_vector.h b/softmmu_template_vector.h
new file mode 100644
index 000..b286d65
--- /dev/null
+++ b/softmmu_template_vector.h
@@ -0,0 +1,266 @@
+/*
+ *  Software MMU support
+ *
+ * Generate helpers used by TCG for qemu_ld/st vector ops and code
+ * load functions.
+ *
+ * Included from target op helpers and exec.c.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include "qemu/timer.h"
+#include "exec/address-spaces.h"
+#include "exec/memory.h"
+
+#define DATA_SIZE (1 << SHIFT)
+
+#if DATA_SIZE == 16
+#define SUFFIX v128
+#else
+#error unsupported data size
+#endif
+
+
+#ifdef SOFTMMU_CODE_ACCESS
+#define READ_ACCESS_TYPE MMU_INST_FETCH
+#define ADDR_READ addr_code
+#else
+#define READ_ACCESS_TYPE MMU_DATA_LOAD
+#define ADDR_READ addr_read
+#endif
+
+#define helper_te_ld_name  glue(glue(helper_te_ld, SUFFIX), MMUSUFFIX)
+#define helper_te_st_name  glue(glue(helper_te_st, SUFFIX), MMUSUFFIX)
+
+#ifndef SOFTMMU_CODE_ACCESS
+static inline void glue(io_read, SUFFIX)(CPUArchState *env,
+ CPUIOTLBEntry *iotlbentry,
+ target_ulong addr,
+ uintptr_t retaddr,
+ uint8_t *res)
+{
+CPUState *cpu = ENV_GET_CPU(env);
+hwaddr physaddr = iotlbentry->addr;
+MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs);
+int i;
+
+assert(0); /* Needs testing */
+
+physaddr = (physaddr & TARGET_PAGE_MASK) + addr;
+cpu->mem_io_pc = retaddr;
+if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) {
+cpu_io_recompile(cpu, retaddr);
+}
+
+cpu->mem_io_vaddr = addr;
+for (i = 0; i < (1 << SHIFT); i += 8) {
+memory_region_dispatch_read(mr, physaddr + i, (uint64_t *)(res + i),
+8, iotlbentry->attrs);
+}
+}
+#endif
+
+void helper_te_ld_name(CPUArchState *env, target_ulong addr,
+   TCGMemOpIdx oi, uintptr_t retaddr, uint8_t *res)
+{
+unsigned mmu_idx = get_mmuidx(oi);
+int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+target_ulong tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
+uintptr_t haddr;
+int i;
+
+/* Adjust the given return address.  */
+retaddr -= GETPC_ADJ;
+
+/* If the TLB entry is for a different page, reload and try again.  */
+if ((addr & TARGET_PAGE_MASK)
+ != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
+if ((addr & (DATA_SIZE - 1)) != 0
+&& (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
+cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
+ mmu_idx, retaddr);
+}
+if (!VICTIM_TLB_HIT(ADDR_READ, addr)) {
+tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
+ mmu_idx, retaddr);
+}
+tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
+}
+
+/* Handle an IO access.  */
+if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
+CPUIOTLBEntry *iotlbentry;
+if ((addr & (DATA_SIZE - 1)) != 0) {
+goto do_unaligned_access;
+}
+iotlbentry = &env->iotlb[mmu_idx][index];
+
+/* ??? Note that the io helpers always read data in the target
+   byte ordering.  We should push the LE/BE request down into io.  */
+glue(io_read, SUFFIX)(env, iotlbe

[Qemu-devel] [PATCH v2 20/20] tcg/README: update README to include information about vector opcodes

2017-02-01 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/README | 47 ++-
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/tcg/README b/tcg/README
index a9858c2..209dbc4 100644
--- a/tcg/README
+++ b/tcg/README
@@ -53,9 +53,18 @@ an "undefined result".
 
 TCG instructions operate on variables which are temporaries, local
 temporaries or globals. TCG instructions and variables are strongly
-typed. Two types are supported: 32 bit integers and 64 bit
-integers. Pointers are defined as an alias to 32 bit or 64 bit
-integers depending on the TCG target word size.
+typed. Several types are supported:
+
+* 32 bit integers,
+
+* 64 bit integers,
+
+* 64 bit vectors,
+
+* 128 bit vectors.
+
+Pointers are defined as an alias to 32 bit or 64 bit integers
+depending on the TCG target word size.
 
 Each instruction has a fixed number of output variable operands, input
 variable operands and always constant operands.
@@ -208,6 +217,22 @@ t0=t1%t2 (signed). Undefined behavior if division by zero 
or overflow.
 
 t0=t1%t2 (unsigned). Undefined behavior if division by zero.
 
+* add_i8x16 t0, t1, t2
+add_i16x8 t0, t1, t2
+add_i32x4 t0, t1, t2
+add_i64x2 t0, t1, t2
+
+t0=t1+t2 where t0, t1 and t2 are 128 bit vectors of 8, 16, 32 or 64 bit
+integers.
+
+* add_i8x8 t0, t1, t2
+add_i16x4 t0, t1, t2
+add_i32x2 t0, t1, t2
+add_i64x1 t0, t1, t2
+
+t0=t1+t2 where t0, t1 and t2 are 64 bit vectors of 8, 16, 32 or 64 bit
+integers.
+
 * Logical
 
 * and_i32/i64 t0, t1, t2
@@ -477,8 +502,8 @@ current TB was linked to this TB. Otherwise execute the next
 instructions. Only indices 0 and 1 are valid and tcg_gen_goto_tb may be issued
 at most once with each slot index per TB.
 
-* qemu_ld_i32/i64 t0, t1, flags, memidx
-* qemu_st_i32/i64 t0, t1, flags, memidx
+* qemu_ld_i32/i64/v128 t0, t1, flags, memidx
+* qemu_st_i32/i64/v128 t0, t1, flags, memidx
 
 Load data at the guest address t1 into t0, or store data in t0 at guest
 address t1.  The _i32/_i64 size applies to the size of the input/output
@@ -488,6 +513,9 @@ and the width of the memory operation is controlled by 
flags.
 Both t0 and t1 may be split into little-endian ordered pairs of registers
 if dealing with 64-bit quantities on a 32-bit host.
 
+The _v128 size can only be used to read exactly 128 bit. Host and target
+are required to be of the same endianness for it to work.
+
 The memidx selects the qemu tlb index to use (e.g. user or kernel access).
 The flags are the TCGMemOp bits, selecting the sign, width, and endianness
 of the memory access.
@@ -538,6 +566,15 @@ Floating point operations are not supported in this 
version. A
 previous incarnation of the code generator had full support of them,
 but it is better to concentrate on integer operations first.
 
+To support vector operations, the backend must define:
+- TCG_TARGET_HAS_REGV64 for the 64 bit vector type and/or
+- TCG_TARGET_HAS_REG128 for the 128 bit vector type.
+For supported types, load and store operations must be supported. An
+arbitrary set of other vector operations may be supported. Vector operations
+that were not explicitly declared as supported (by defining
+TCG_TARGET_HAS_ to 1) will never appear in the intermediate
+representation. In this case, the emulation code will be emitted instead.
+
 4.2) Constraints
 
 GCC like constraints are used to define the constraints of every
-- 
2.1.4




[Qemu-devel] [PATCH v2 13/20] tcg/i386: support remaining vector addition operations

2017-02-01 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/i386/tcg-target.h | 10 +
 tcg/i386/tcg-target.inc.c | 54 +--
 2 files changed, 62 insertions(+), 2 deletions(-)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 755ebaa..bd6cfe1 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -172,7 +172,17 @@ extern bool have_popcnt;
 #endif
 
 #ifdef TCG_TARGET_HAS_REG128
+#define TCG_TARGET_HAS_add_i8x16 1
+#define TCG_TARGET_HAS_add_i16x8 1
 #define TCG_TARGET_HAS_add_i32x4 1
+#define TCG_TARGET_HAS_add_i64x2 1
+#endif
+
+#ifdef TCG_TARGET_HAS_REGV64
+#define TCG_TARGET_HAS_add_i8x8 1
+#define TCG_TARGET_HAS_add_i16x4 1
+#define TCG_TARGET_HAS_add_i32x2 1
+#define TCG_TARGET_HAS_add_i64x1 1
 #endif
 
 #define TCG_TARGET_deposit_i32_valid(ofs, len) \
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 208bb81..d8f0d81 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -168,6 +168,11 @@ static bool have_lzcnt;
 #else
 # define have_lzcnt 0
 #endif
+#if defined(CONFIG_CPUID_H) && defined(bit_AVX) && defined(bit_OSXSAVE)
+static bool have_avx;
+#else
+# define have_avx 0
+#endif
 
 static tcg_insn_unit *tb_ret_addr;
 
@@ -393,7 +398,10 @@ static inline int tcg_target_const_match(tcg_target_long 
val, TCGType type,
 #define OPC_MOVQ_M2R (0x7e | P_SSE_F30F)
 #define OPC_MOVQ_R2M (0xd6 | P_SSE_660F)
 #define OPC_MOVQ_R2R (0x7e | P_SSE_F30F)
+#define OPC_PADDB   (0xfc | P_SSE_660F)
+#define OPC_PADDW   (0xfd | P_SSE_660F)
 #define OPC_PADDD   (0xfe | P_SSE_660F)
+#define OPC_PADDQ   (0xd4 | P_SSE_660F)
 
 /* Group 1 opcode extensions for 0x80-0x83.
These are also used as modifiers for OPC_ARITH.  */
@@ -1963,6 +1971,19 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode 
opc,
 TCGArg a0, a1, a2;
 int c, const_a2, vexop, rexw = 0;
 
+static const int vect_binop[] = {
+[INDEX_op_add_i8x16] = OPC_PADDB,
+[INDEX_op_add_i16x8] = OPC_PADDW,
+[INDEX_op_add_i32x4] = OPC_PADDD,
+[INDEX_op_add_i64x2] = OPC_PADDQ,
+
+[INDEX_op_add_i8x8]  = OPC_PADDB,
+[INDEX_op_add_i16x4] = OPC_PADDW,
+[INDEX_op_add_i32x2] = OPC_PADDD,
+[INDEX_op_add_i64x1] = OPC_PADDQ,
+};
+
+
 #if TCG_TARGET_REG_BITS == 64
 # define OP_32_64(x) \
 case glue(glue(INDEX_op_, x), _i64): \
@@ -1972,6 +1993,17 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode 
opc,
 # define OP_32_64(x) \
 case glue(glue(INDEX_op_, x), _i32)
 #endif
+#define OP_V128_ALL(x) \
+case glue(glue(INDEX_op_, x), _i8x16): \
+case glue(glue(INDEX_op_, x), _i16x8): \
+case glue(glue(INDEX_op_, x), _i32x4): \
+case glue(glue(INDEX_op_, x), _i64x2)
+
+#define OP_V64_ALL(x) \
+case glue(glue(INDEX_op_, x), _i8x8):  \
+case glue(glue(INDEX_op_, x), _i16x4): \
+case glue(glue(INDEX_op_, x), _i32x2): \
+case glue(glue(INDEX_op_, x), _i64x1)
 
 /* Hoist the loads of the most common arguments.  */
 a0 = args[0];
@@ -2369,8 +2401,13 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode 
opc,
 tcg_out_mb(s, a0);
 break;
 
-case INDEX_op_add_i32x4:
-tcg_out_modrm(s, OPC_PADDD, args[0], args[2]);
+OP_V128_ALL(add):
+OP_V64_ALL(add):
+if (have_avx) {
+tcg_out_vex_modrm(s, vect_binop[opc], args[0], args[1], args[2]);
+} else {
+tcg_out_modrm(s, vect_binop[opc], args[0], args[2]);
+}
 break;
 
 case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
@@ -2383,6 +2420,8 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode 
opc,
 }
 
 #undef OP_32_64
+#undef OP_V128_ALL
+#undef OP_V64_ALL
 }
 
 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
@@ -2613,7 +2652,14 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode 
op)
 return &s2;
 }
 
+case INDEX_op_add_i8x16:
+case INDEX_op_add_i16x8:
 case INDEX_op_add_i32x4:
+case INDEX_op_add_i64x2:
+case INDEX_op_add_i8x8:
+case INDEX_op_add_i16x4:
+case INDEX_op_add_i32x2:
+case INDEX_op_add_i64x1:
 return &V_0_V;
 
 default:
@@ -2728,6 +2774,10 @@ static void tcg_target_init(TCGContext *s)
 #ifdef bit_POPCNT
 have_popcnt = (c & bit_POPCNT) != 0;
 #endif
+#if defined(bit_AVX) && defined(bit_OSXSAVE)
+have_avx = (c & (bit_AVX | bit_OSXSAVE)) == (bit_AVX | bit_OSXSAVE);
+#endif
+
 }
 
 if (max >= 7) {
-- 
2.1.4




[Qemu-devel] [PATCH v2 01/20] tcg: add support for 128bit vector type

2017-02-01 Thread Kirill Batuzov
Introduce TCG_TYPE_V128 and corresponding TCGv_v128 for TCG temps. Add helper
functions that work with temps of this new type.

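As a usage illustration, here is a standalone sketch of the opaque-index
pattern that the diff below extends to the new type; the wrappers are copied
in simplified form (no QEMU_ARTIFICIAL), and the index value is made up.

/* A TCG variable is just an index into the temps array, wrapped in a
 * distinct pointer type so the compiler catches accidental mixing of
 * i32/i64/v128 temps; the handle is never dereferenced. */
#include <stdio.h>
#include <stdint.h>

typedef struct TCGv_v128_d *TCGv_v128;

static inline TCGv_v128 MAKE_TCGV_V128(intptr_t i) { return (TCGv_v128)i; }
static inline intptr_t GET_TCGV_V128(TCGv_v128 t) { return (intptr_t)t; }

int main(void)
{
    TCGv_v128 t = MAKE_TCGV_V128(42);   /* hypothetical index of a v128 temp */
    printf("temp index = %ld\n", (long)GET_TCGV_V128(t));
    return 0;
}
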
Signed-off-by: Kirill Batuzov 
---
 tcg/tcg-op.h | 24 
 tcg/tcg.c| 13 +
 tcg/tcg.h| 34 ++
 3 files changed, 71 insertions(+)

diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index c68e300..5abf8b2 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -248,6 +248,23 @@ static inline void tcg_gen_op6ii_i64(TCGOpcode opc, 
TCGv_i64 a1, TCGv_i64 a2,
 GET_TCGV_I64(a3), GET_TCGV_I64(a4), a5, a6);
 }
 
+static inline void tcg_gen_op1_v128(TCGOpcode opc, TCGv_v128 a1)
+{
+tcg_gen_op1(&tcg_ctx, opc, GET_TCGV_V128(a1));
+}
+
+static inline void tcg_gen_op2_v128(TCGOpcode opc, TCGv_v128 a1,
+TCGv_v128 a2)
+{
+tcg_gen_op2(&tcg_ctx, opc, GET_TCGV_V128(a1), GET_TCGV_V128(a2));
+}
+
+static inline void tcg_gen_op3_v128(TCGOpcode opc, TCGv_v128 a1,
+TCGv_v128 a2, TCGv_v128 a3)
+{
+tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_V128(a1), GET_TCGV_V128(a2),
+GET_TCGV_V128(a3));
+}
 
 /* Generic ops.  */
 
@@ -454,6 +471,13 @@ static inline void tcg_gen_not_i32(TCGv_i32 ret, TCGv_i32 
arg)
 }
 }
 
+/* Vector ops */
+
+static inline void tcg_gen_discard_v128(TCGv_v128 arg)
+{
+tcg_gen_op1_v128(INDEX_op_discard, arg);
+}
+
 /* 64 bit ops */
 
 void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
diff --git a/tcg/tcg.c b/tcg/tcg.c
index cb898f1..2a5e83b 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -641,6 +641,14 @@ TCGv_i64 tcg_temp_new_internal_i64(int temp_local)
 return MAKE_TCGV_I64(idx);
 }
 
+TCGv_v128 tcg_temp_new_internal_v128(int temp_local)
+{
+int idx;
+
+idx = tcg_temp_new_internal(TCG_TYPE_V128, temp_local);
+return MAKE_TCGV_V128(idx);
+}
+
 static void tcg_temp_free_internal(int idx)
 {
 TCGContext *s = &tcg_ctx;
@@ -673,6 +681,11 @@ void tcg_temp_free_i64(TCGv_i64 arg)
 tcg_temp_free_internal(GET_TCGV_I64(arg));
 }
 
+void tcg_temp_free_v128(TCGv_v128 arg)
+{
+tcg_temp_free_internal(GET_TCGV_V128(arg));
+}
+
 TCGv_i32 tcg_const_i32(int32_t val)
 {
 TCGv_i32 t0;
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 631c6f6..56484e7 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -246,6 +246,7 @@ typedef struct TCGPool {
 typedef enum TCGType {
 TCG_TYPE_I32,
 TCG_TYPE_I64,
+TCG_TYPE_V128,
 TCG_TYPE_COUNT, /* number of different types */
 
 /* An alias for the size of the host register.  */
@@ -421,6 +422,7 @@ typedef tcg_target_ulong TCGArg;
 typedef struct TCGv_i32_d *TCGv_i32;
 typedef struct TCGv_i64_d *TCGv_i64;
 typedef struct TCGv_ptr_d *TCGv_ptr;
+typedef struct TCGv_v128_d *TCGv_v128;
 typedef TCGv_ptr TCGv_env;
 #if TARGET_LONG_BITS == 32
 #define TCGv TCGv_i32
@@ -445,6 +447,11 @@ static inline TCGv_ptr QEMU_ARTIFICIAL 
MAKE_TCGV_PTR(intptr_t i)
 return (TCGv_ptr)i;
 }
 
+static inline TCGv_v128 QEMU_ARTIFICIAL MAKE_TCGV_V128(intptr_t i)
+{
+return (TCGv_v128)i;
+}
+
 static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_I32(TCGv_i32 t)
 {
 return (intptr_t)t;
@@ -460,6 +467,11 @@ static inline intptr_t QEMU_ARTIFICIAL 
GET_TCGV_PTR(TCGv_ptr t)
 return (intptr_t)t;
 }
 
+static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_V128(TCGv_v128 t)
+{
+return (intptr_t)t;
+}
+
 #if TCG_TARGET_REG_BITS == 32
 #define TCGV_LOW(t) MAKE_TCGV_I32(GET_TCGV_I64(t))
 #define TCGV_HIGH(t) MAKE_TCGV_I32(GET_TCGV_I64(t) + 1)
@@ -467,15 +479,18 @@ static inline intptr_t QEMU_ARTIFICIAL 
GET_TCGV_PTR(TCGv_ptr t)
 
 #define TCGV_EQUAL_I32(a, b) (GET_TCGV_I32(a) == GET_TCGV_I32(b))
 #define TCGV_EQUAL_I64(a, b) (GET_TCGV_I64(a) == GET_TCGV_I64(b))
+#define TCGV_EQUAL_V128(a, b) (GET_TCGV_V128(a) == GET_TCGV_V128(b))
 #define TCGV_EQUAL_PTR(a, b) (GET_TCGV_PTR(a) == GET_TCGV_PTR(b))
 
 /* Dummy definition to avoid compiler warnings.  */
 #define TCGV_UNUSED_I32(x) x = MAKE_TCGV_I32(-1)
 #define TCGV_UNUSED_I64(x) x = MAKE_TCGV_I64(-1)
+#define TCGV_UNUSED_V128(x) x = MAKE_TCGV_V128(-1)
 #define TCGV_UNUSED_PTR(x) x = MAKE_TCGV_PTR(-1)
 
 #define TCGV_IS_UNUSED_I32(x) (GET_TCGV_I32(x) == -1)
 #define TCGV_IS_UNUSED_I64(x) (GET_TCGV_I64(x) == -1)
+#define TCGV_IS_UNUSED_V128(x) (GET_TCGV_V128(x) == -1)
 #define TCGV_IS_UNUSED_PTR(x) (GET_TCGV_PTR(x) == -1)
 
 /* call flags */
@@ -798,9 +813,11 @@ TCGv_i64 tcg_global_reg_new_i64(TCGReg reg, const char 
*name);
 
 TCGv_i32 tcg_temp_new_internal_i32(int temp_local);
 TCGv_i64 tcg_temp_new_internal_i64(int temp_local);
+TCGv_v128 tcg_temp_new_internal_v128(int temp_local);
 
 void tcg_temp_free_i32(TCGv_i32 arg);
 void tcg_temp_free_i64(TCGv_i64 arg);
+void tcg_temp_free_v128(TCGv_v128 arg);
 
 static inline TCGv_i32 tcg_global_mem_new_i32(TCGv_ptr reg, intptr_t offset,
   const char *name)
@@ -836,6 +853,23 @@ static inline TCGv_i64 tcg_temp_local_new_i64(v

[Qemu-devel] [PATCH v2 11/20] tcg/i386: add support for vector opcodes

2017-02-01 Thread Kirill Batuzov
To be able to generate vector operations in a TCG backend we need to do
several things.

1. We need to tell the register allocator about the target's vector registers.
   In the case of x86 we'll use xmm0..xmm7; xmm7 is designated as a scratch
   register, the others can be used by the register allocator.

2. We need a new constraint to indicate where to use vector registers. In
   this commit the 'V' constraint is introduced.

3. We need to be able to generate the bare minimum: load, store and reg-to-reg
   move. MOVDQU is used for loads and stores, MOVDQA for reg-to-reg moves.

4. Finally we need to support any other opcodes we want. INDEX_op_add_i32x4
   is the only one for now; the PADDD instruction handles it perfectly (a
   byte-level sketch of its encoding follows this list).

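For illustration, here is a standalone sketch (not QEMU code) of the
byte-level encoding behind the OPC_PADDD define added below; the function
name is made up, and it deliberately ignores REX/VEX and handles only
xmm0..xmm7, unlike the real tcg_out_modrm() machinery.

/* PADDD xmm_dst, xmm_src is encoded as 66 0f fe /r: the 0x66 prefix and the
 * 0x0f escape correspond to the P_SSE_660F flags, 0xfe is the opcode, and
 * the ModRM byte selects the two registers (mod=11, reg=dst, rm=src). */
#include <stdio.h>
#include <stdint.h>

static void emit_paddd_reg_reg(uint8_t dst, uint8_t src, uint8_t out[4])
{
    out[0] = 0x66;                              /* operand-size prefix */
    out[1] = 0x0f;                              /* two-byte opcode escape */
    out[2] = 0xfe;                              /* PADDD */
    out[3] = (uint8_t)(0xc0 | (dst << 3) | src); /* ModRM, register-register */
}

int main(void)
{
    uint8_t buf[4];
    emit_paddd_reg_reg(0, 1, buf);              /* paddd %xmm1, %xmm0 */
    for (int i = 0; i < 4; i++) {
        printf("%02x ", buf[i]);                /* prints: 66 0f fe c1 */
    }
    printf("\n");
    return 0;
}
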
Signed-off-by: Kirill Batuzov 
---
 tcg/i386/tcg-target.h |  34 +-
 tcg/i386/tcg-target.inc.c | 111 +++---
 2 files changed, 137 insertions(+), 8 deletions(-)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 21d96ec..b0704e8 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -29,8 +29,16 @@
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 31
 
 #ifdef __x86_64__
-# define TCG_TARGET_REG_BITS  64
-# define TCG_TARGET_NB_REGS   16
+# if defined(TARGET_WORDS_BIGENDIAN) == defined(HOST_WORDS_BIGENDIAN)
+#  define TCG_TARGET_HAS_REG128 1
+# endif
+# ifdef TCG_TARGET_HAS_REG128
+#  define TCG_TARGET_REG_BITS  64
+#  define TCG_TARGET_NB_REGS   32
+# else
+#  define TCG_TARGET_REG_BITS  64
+#  define TCG_TARGET_NB_REGS   16
+# endif
 #else
 # define TCG_TARGET_REG_BITS  32
 # define TCG_TARGET_NB_REGS8
@@ -56,6 +64,24 @@ typedef enum {
 TCG_REG_R13,
 TCG_REG_R14,
 TCG_REG_R15,
+
+TCG_REG_XMM0,
+TCG_REG_XMM1,
+TCG_REG_XMM2,
+TCG_REG_XMM3,
+TCG_REG_XMM4,
+TCG_REG_XMM5,
+TCG_REG_XMM6,
+TCG_REG_XMM7,
+TCG_REG_XMM8,
+TCG_REG_XMM9,
+TCG_REG_XMM10,
+TCG_REG_XMM11,
+TCG_REG_XMM12,
+TCG_REG_XMM13,
+TCG_REG_XMM14,
+TCG_REG_XMM15,
+
 TCG_REG_RAX = TCG_REG_EAX,
 TCG_REG_RCX = TCG_REG_ECX,
 TCG_REG_RDX = TCG_REG_EDX,
@@ -144,6 +170,10 @@ extern bool have_popcnt;
 #define TCG_TARGET_HAS_mulsh_i64 0
 #endif
 
+#ifdef TCG_TARGET_HAS_REG128
+#define TCG_TARGET_HAS_add_i32x4 1
+#endif
+
 #define TCG_TARGET_deposit_i32_valid(ofs, len) \
 (((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \
  ((ofs) == 0 && (len) == 16))
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 5918008..3e718f3 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -32,6 +32,11 @@ static const char * const 
tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
 #else
 "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
 #endif
+#ifdef TCG_TARGET_HAS_REG128
+"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+"%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12", "%xmm13", "%xmm14",
+"%xmm15",
+#endif
 };
 #endif
 
@@ -61,6 +66,24 @@ static const int tcg_target_reg_alloc_order[] = {
 TCG_REG_EDX,
 TCG_REG_EAX,
 #endif
+#ifdef TCG_TARGET_HAS_REG128
+TCG_REG_XMM0,
+TCG_REG_XMM1,
+TCG_REG_XMM2,
+TCG_REG_XMM3,
+TCG_REG_XMM4,
+TCG_REG_XMM5,
+TCG_REG_XMM6,
+/*  TCG_REG_XMM7, <- scratch register */
+TCG_REG_XMM8,
+TCG_REG_XMM9,
+TCG_REG_XMM10,
+TCG_REG_XMM11,
+TCG_REG_XMM12,
+TCG_REG_XMM13,
+TCG_REG_XMM14,
+TCG_REG_XMM15,
+#endif
 };
 
 static const int tcg_target_call_iarg_regs[] = {
@@ -247,6 +270,10 @@ static const char 
*target_parse_constraint(TCGArgConstraint *ct,
 case 'I':
 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
 break;
+case 'V':
+ct->ct |= TCG_CT_REG;
+tcg_regset_set32(ct->u.regs, 0, 0xff);
+break;
 
 default:
 return NULL;
@@ -302,6 +329,9 @@ static inline int tcg_target_const_match(tcg_target_long 
val, TCGType type,
 #define P_SIMDF30x1 /* 0xf3 opcode prefix */
 #define P_SIMDF20x2 /* 0xf2 opcode prefix */
 
+#define P_SSE_660F  (P_DATA16 | P_EXT)
+#define P_SSE_F30F  (P_SIMDF3 | P_EXT)
+
 #define OPC_ARITH_EvIz (0x81)
 #define OPC_ARITH_EvIb (0x83)
 #define OPC_ARITH_GvEv (0x03)  /* ... plus (ARITH_FOO << 3) */
@@ -357,6 +387,11 @@ static inline int tcg_target_const_match(tcg_target_long 
val, TCGType type,
 #define OPC_GRP3_Ev (0xf7)
 #define OPC_GRP5   (0xff)
 
+#define OPC_MOVDQU_M2R  (0x6f | P_SSE_F30F)  /* load 128-bit value */
+#define OPC_MOVDQU_R2M  (0x7f | P_SSE_F30F)  /* store 128-bit value */
+#define OP

[Qemu-devel] [PATCH v2 04/20] tcg: add ld_v128, ld_v64, st_v128 and st_v64 opcodes

2017-02-01 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/tcg-op.h  | 38 ++
 tcg/tcg-opc.h | 18 ++
 2 files changed, 56 insertions(+)

diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 517745e..250493b 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -501,6 +501,44 @@ static inline void tcg_gen_discard_v64(TCGv_v64 arg)
 tcg_gen_op1_v64(INDEX_op_discard, arg);
 }
 
+static inline void tcg_gen_ldst_op_v128(TCGOpcode opc, TCGv_v128 val,
+   TCGv_ptr base, TCGArg offset)
+{
+tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_V128(val), GET_TCGV_PTR(base),
+offset);
+}
+
+static inline void tcg_gen_st_v128(TCGv_v128 arg1, TCGv_ptr arg2,
+   tcg_target_long offset)
+{
+tcg_gen_ldst_op_v128(INDEX_op_st_v128, arg1, arg2, offset);
+}
+
+static inline void tcg_gen_ld_v128(TCGv_v128 ret, TCGv_ptr arg2,
+   tcg_target_long offset)
+{
+tcg_gen_ldst_op_v128(INDEX_op_ld_v128, ret, arg2, offset);
+}
+
+static inline void tcg_gen_ldst_op_v64(TCGOpcode opc, TCGv_v64 val,
+   TCGv_ptr base, TCGArg offset)
+{
+tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_V64(val), GET_TCGV_PTR(base),
+offset);
+}
+
+static inline void tcg_gen_st_v64(TCGv_v64 arg1, TCGv_ptr arg2,
+  tcg_target_long offset)
+{
+tcg_gen_ldst_op_v64(INDEX_op_st_v64, arg1, arg2, offset);
+}
+
+static inline void tcg_gen_ld_v64(TCGv_v64 ret, TCGv_ptr arg2,
+  tcg_target_long offset)
+{
+tcg_gen_ldst_op_v64(INDEX_op_ld_v64, ret, arg2, offset);
+}
+
 /* 64 bit ops */
 
 void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index f06f894..2365c97 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -42,6 +42,18 @@ DEF(br, 0, 0, 1, TCG_OPF_BB_END)
 # define IMPL64  TCG_OPF_64BIT
 #endif
 
+#ifdef TCG_TARGET_HAS_REG128
+# define IMPL128 0
+#else
+# define IMPL128 TCG_OPF_NOT_PRESENT
+#endif
+
+#ifdef TCG_TARGET_HAS_REGV64
+# define IMPLV64 0
+#else
+# define IMPLV64 TCG_OPF_NOT_PRESENT
+#endif
+
 DEF(mb, 0, 0, 1, 0)
 
 DEF(mov_i32, 1, 1, 0, TCG_OPF_NOT_PRESENT)
@@ -188,6 +200,12 @@ DEF(mulsh_i64, 1, 2, 0, IMPL(TCG_TARGET_HAS_mulsh_i64))
 #define TLADDR_ARGS  (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? 1 : 2)
 #define DATA64_ARGS  (TCG_TARGET_REG_BITS == 64 ? 1 : 2)
 
+/* load/store */
+DEF(st_v128, 0, 2, 1, IMPL128)
+DEF(ld_v128, 1, 1, 1, IMPL128)
+DEF(st_v64, 0, 2, 1, IMPLV64)
+DEF(ld_v64, 1, 1, 1, IMPLV64)
+
 /* QEMU specific */
 DEF(insn_start, 0, 0, TLADDR_ARGS * TARGET_INSN_START_WORDS,
 TCG_OPF_NOT_PRESENT)
-- 
2.1.4




[Qemu-devel] [PATCH v2 19/20] target/arm: load two consecutive 64-bits vector regs as a 128-bit vector reg

2017-02-01 Thread Kirill Batuzov
The ARM instruction set does not have loads to a 128-bit vector register
(q-reg). Instead it can read several consecutive 64-bit vector registers
(d-regs), which is what GCC uses to load 128-bit values from memory.

For vector operations to work we need to detect such loads and transform them
into 128-bit loads to 128-bit temporaries.

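To make the condition concrete, here is a trivial standalone sketch of the
detection rule used in the hunk below; the register numbering is illustrative.

/* A NEON load/store of two consecutive d-registers starting at an even
 * index covers exactly one q-register, so it can be emitted as a single
 * 128-bit access; anything else keeps the existing 64-bit path. */
#include <stdio.h>

int main(void)
{
    int nregs = 2;
    for (int rd = 0; rd < 4; rd++) {
        if (rd % 2 == 0 && nregs == 2) {
            printf("d%d,d%d -> one 128-bit access to q%d\n", rd, rd + 1, rd / 2);
        } else {
            printf("d%d,d%d -> two 64-bit accesses\n", rd, rd + 1);
        }
    }
    return 0;
}
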
Signed-off-by: Kirill Batuzov 
---
 target/arm/translate.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/target/arm/translate.c b/target/arm/translate.c
index 90e14df..76f9927 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -4710,6 +4710,19 @@ static int disas_neon_ls_insn(DisasContext *s, uint32_t 
insn)
 tcg_gen_addi_i32(addr, addr, 1 << size);
 }
 if (size == 3) {
+#ifdef TCG_TARGET_HAS_REG128
+if (rd % 2 == 0 && nregs == 2) {
+/* 128-bit load */
+if (load) {
+tcg_gen_qemu_ld_v128(cpu_Q[rd / 2], addr,
+ get_mem_index(s), MO_LE | MO_128);
+} else {
+tcg_gen_qemu_st_v128(cpu_Q[rd / 2], addr,
+ get_mem_index(s), MO_LE | MO_128);
+}
+break;
+}
+#endif
 tmp64 = tcg_temp_new_i64();
 if (load) {
 gen_aa32_ld64(s, tmp64, addr, get_mem_index(s));
-- 
2.1.4




[Qemu-devel] [PATCH v2 08/20] tcg: add vector addition operations

2017-02-01 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---

Support for representing a v128 addition as two v64 additions has been added.
As a result the GEN_VECT_WRAPPER_HALVES macro was added. It is larger and more
complicated than the original GEN_VECT_WRAPPER (which is still used for v64
additions because they do not have half operations (v32 additions)).

GEN_VECT_WRAPPER_HALVES seems to grow fast (in size and complexity) for each
supported representation. Calling tcg_gen_add_ may not be desirable
because last-resort fallback code is better generated for the whole vector, as
that requires fewer additional operations.

Some additional performance optimization can be done by creating hand-written
tcg_gen_internal_ for some cases (for example, add_i8x16). Such a function
would still operate on memory locations but would use 64-bit scalar additions
with some bit masking, as Richard suggested in the v1 discussion. This series
is focused on infrastructure (not on optimization of particular instructions),
so I have not included this optimization yet.

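For reference, here is a standalone sketch of the kind of "64-bit scalar
additions with some bit masking" mentioned above: a SWAR byte-wise add over
one 64-bit lane (two such lanes would cover a v128). This is one common
formulation; whether it matches the exact variant from the v1 discussion is
not shown here.

#include <stdio.h>
#include <stdint.h>

/* Add the eight bytes of a and b independently, mod 256, using only 64-bit
 * scalar arithmetic: add the low 7 bits of every byte, then patch the top
 * bit of each byte back in with xor, so carries never cross byte borders. */
static uint64_t add_i8x8_swar(uint64_t a, uint64_t b)
{
    uint64_t low = (a & 0x7f7f7f7f7f7f7f7full) + (b & 0x7f7f7f7f7f7f7f7full);
    return low ^ ((a ^ b) & 0x8080808080808080ull);
}

int main(void)
{
    uint64_t a = 0x01ff7f80a5a5a5a5ull;
    uint64_t b = 0x0101010101010101ull;
    /* prints 02008081a6a6a6a6: each byte added independently, mod 256 */
    printf("%016llx\n", (unsigned long long)add_i8x8_swar(a, b));
    return 0;
}
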
---
 tcg/tcg-op.c  |  64 ++
 tcg/tcg-op.h  | 167 ++
 tcg/tcg-opc.h |  12 +
 tcg/tcg.c |  12 +
 tcg/tcg.h |  43 +++
 5 files changed, 298 insertions(+)

diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 95a39b7..8a19eee 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -3038,3 +3038,67 @@ static void tcg_gen_mov2_i64(TCGv_i64 r, TCGv_i64 a, 
TCGv_i64 b)
 GEN_ATOMIC_HELPER(xchg, mov2, 0)
 
 #undef GEN_ATOMIC_HELPER
+
+/* Find a memory location for 128-bit TCG variable. */
+void tcg_v128_to_ptr(TCGv_v128 tmp, TCGv_ptr base, int slot,
+ TCGv_ptr *real_base, intptr_t *real_offset, int is_read)
+{
+int idx = GET_TCGV_V128(tmp);
+assert(idx >= 0 && idx < tcg_ctx.nb_temps);
+if (idx < tcg_ctx.nb_globals) {
+/* Globals use their locations within CPUArchState. */
+int env = GET_TCGV_PTR(tcg_ctx.tcg_env);
+TCGTemp *ts_env = &tcg_ctx.temps[env];
+TCGTemp *ts_arg = &tcg_ctx.temps[idx];
+
+/* Sanity checks: global's memory locations must be addressed
+   relative to ENV. */
+assert(ts_env->val_type == TEMP_VAL_REG &&
+   ts_env == ts_arg->mem_base &&
+   ts_arg->mem_allocated);
+
+*real_base = tcg_ctx.tcg_env;
+*real_offset = ts_arg->mem_offset;
+} else {
+/* Temporaries use swap space in TCGContext. Since we already have
+   a 128-bit temporary we'll assume that the target supports 128-bit
+   loads and stores. */
+*real_base = base;
+*real_offset = slot * 16;
+if (is_read) {
+tcg_gen_st_v128(tmp, base, slot * 16);
+}
+}
+}
+
+/* Find a memory location for 64-bit vector TCG variable. */
+void tcg_v64_to_ptr(TCGv_v64 tmp, TCGv_ptr base, int slot,
+TCGv_ptr *real_base, intptr_t *real_offset, int is_read)
+{
+int idx = GET_TCGV_V64(tmp);
+assert(idx >= 0 && idx < tcg_ctx.nb_temps);
+if (idx < tcg_ctx.nb_globals) {
+/* Globals use their locations within CPUArchState. */
+int env = GET_TCGV_PTR(tcg_ctx.tcg_env);
+TCGTemp *ts_env = &tcg_ctx.temps[env];
+TCGTemp *ts_arg = &tcg_ctx.temps[idx];
+
+/* Sanity checks: global's memory locations must be addressed
+   relative to ENV. */
+assert(ts_env->val_type == TEMP_VAL_REG &&
+   ts_env == ts_arg->mem_base &&
+   ts_arg->mem_allocated);
+
+*real_base = tcg_ctx.tcg_env;
+*real_offset = ts_arg->mem_offset;
+} else {
+/* Temporaries use swap space in TCGContext. Since we already have
+   a 128-bit temporary we'll assume that the target supports 128-bit
+   loads and stores. */
+*real_base = base;
+*real_offset = slot * 16;
+if (is_read) {
+tcg_gen_st_v64(tmp, base, slot * 16);
+}
+}
+}
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 250493b..3727be7 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -1195,6 +1195,10 @@ void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, 
TCGv_i64, TCGArg, TCGMemOp);
 tcg_gen_add_i32(TCGV_PTR_TO_NAT(R), TCGV_PTR_TO_NAT(A), TCGV_PTR_TO_NAT(B))
 # define tcg_gen_addi_ptr(R, A, B) \
 tcg_gen_addi_i32(TCGV_PTR_TO_NAT(R), TCGV_PTR_TO_NAT(A), (B))
+# define tcg_gen_mov_ptr(R, B) \
+tcg_gen_mov_i32(TCGV_PTR_TO_NAT(R), TCGV_PTR_TO_NAT(B))
+# define tcg_gen_movi_ptr(R, B) \
+tcg_gen_movi_i32(TCGV_PTR_TO_NAT(R), (B))
 # define tcg_gen_ext_i32_ptr(R, A) \
 tcg_gen_mov_i32(TCGV_PTR_TO_NAT(R), (A))
 #else
@@ -1206,6 +1210,169 @@ void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, 
TCGv_i64, TCGArg, TCGMemOp);
 tcg_gen_add_i64(TCGV_PTR_TO_NAT(R), TCGV_PTR_TO_NAT(A), TCGV_PTR_TO_NAT(B))
 # define

[Qemu-devel] [PATCH v2 05/20] tcg: add simple alias analysis

2017-02-01 Thread Kirill Batuzov
Add a simple alias analysis to TCG which identifies memory loads and stores
that overlap with CPUArchState. This information can be used later in liveness
analysis to ensure correctness of register allocation. In particular, if a
load or store overlaps with the memory location of some global variable, this
variable should be spilled and reloaded at appropriate times.

Previously no such analysis was performed, and for correctness it was required
that no load/store operation overlap with the memory location of any global
variable.

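As a minimal illustration of the core check, here is a standalone sketch of
the interval-overlap test such an analysis relies on; the offsets are made up,
and the real patch of course tracks much more state.

/* A load/store of `size` bytes at env offset `off` conflicts with a global
 * kept at [g_off, g_off + g_size) inside CPUArchState iff the two intervals
 * intersect; on a conflict the allocator must spill/reload that global. */
#include <stdbool.h>
#include <stdio.h>

static bool overlaps(long off, long size, long g_off, long g_size)
{
    return off < g_off + g_size && g_off < off + size;
}

int main(void)
{
    long g_off = 0x8b0, g_size = 16;   /* hypothetical 128-bit vector global */

    printf("%d\n", overlaps(0x8b8, 8, g_off, g_size));  /* 1: must spill/reload */
    printf("%d\n", overlaps(0x8c0, 8, g_off, g_size));  /* 0: unrelated access */
    return 0;
}
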
Signed-off-by: Kirill Batuzov 
---
 tcg/optimize.c | 146 +
 tcg/tcg.h  |  17 +++
 2 files changed, 163 insertions(+)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index adfc56c..2347ce3 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -34,6 +34,7 @@
 
 struct tcg_temp_info {
 bool is_const;
+bool is_base;
 uint16_t prev_copy;
 uint16_t next_copy;
 tcg_target_ulong val;
@@ -61,6 +62,7 @@ static void reset_temp(TCGArg temp)
 temps[temp].next_copy = temp;
 temps[temp].prev_copy = temp;
 temps[temp].is_const = false;
+temps[temp].is_base = false;
 temps[temp].mask = -1;
 }
 
@@ -1429,3 +1431,147 @@ void tcg_optimize(TCGContext *s)
 }
 }
 }
+
+/* Simple alias analysis. It finds out which load/store operations overlap
+   with CPUArchState. The result is stored in TCGContext and can be used
+   during liveness analysis and register allocation. */
+void tcg_alias_analysis(TCGContext *s)
+{
+int oi, oi_next;
+
+reset_all_temps(s->nb_temps);
+temps[GET_TCGV_PTR(s->tcg_env)].is_base = true;
+temps[GET_TCGV_PTR(s->tcg_env)].val = 0;
+
+for (oi = s->gen_op_buf[0].next; oi != 0; oi = oi_next) {
+int nb_oargs, i;
+int size;
+TCGAliasType tp;
+
+TCGOp * const op = &s->gen_op_buf[oi];
+TCGArg * const args = &s->gen_opparam_buf[op->args];
+TCGOpcode opc = op->opc;
+const TCGOpDef *def = &tcg_op_defs[opc];
+
+oi_next = op->next;
+
+if (opc == INDEX_op_call) {
+nb_oargs = op->callo;
+} else {
+nb_oargs = def->nb_oargs;
+}
+
+s->alias_info[oi] = (TCGAliasInfo){
+TCG_NOT_ALIAS,
+false,
+0,
+0
+};
+
+switch (opc) {
+CASE_OP_32_64(movi):
+temps[args[0]].is_const = 1;
+temps[args[0]].val = args[1];
+break;
+CASE_OP_32_64(mov):
+temps[args[0]].is_const = temps[args[1]].is_const;
+temps[args[0]].is_base = temps[args[1]].is_base;
+temps[args[0]].val = temps[args[1]].val;
+break;
+CASE_OP_32_64(add):
+CASE_OP_32_64(sub):
+if (temps[args[1]].is_base && temps[args[2]].is_const) {
+temps[args[0]].is_base = true;
+temps[args[0]].is_const = false;
+temps[args[0]].val =
+do_constant_folding(opc, temps[args[1]].val,
+temps[args[2]].val);
+} else {
+reset_temp(args[0]);
+}
+break;
+CASE_OP_32_64(ld8s):
+CASE_OP_32_64(ld8u):
+size = 1;
+tp = TCG_ALIAS_READ;
+goto do_ldst;
+CASE_OP_32_64(ld16s):
+CASE_OP_32_64(ld16u):
+size = 2;
+tp = TCG_ALIAS_READ;
+goto do_ldst;
+case INDEX_op_ld_i32:
+case INDEX_op_ld32s_i64:
+case INDEX_op_ld32u_i64:
+size = 4;
+tp = TCG_ALIAS_READ;
+goto do_ldst;
+case INDEX_op_ld_i64:
+size = 8;
+tp = TCG_ALIAS_READ;
+goto do_ldst;
+case INDEX_op_ld_v128:
+size = 16;
+tp = TCG_ALIAS_READ;
+goto do_ldst;
+CASE_OP_32_64(st8):
+size = 1;
+tp = TCG_ALIAS_WRITE;
+goto do_ldst;
+CASE_OP_32_64(st16):
+size = 2;
+tp = TCG_ALIAS_WRITE;
+goto do_ldst;
+case INDEX_op_st_i32:
+case INDEX_op_st32_i64:
+size = 4;
+tp = TCG_ALIAS_WRITE;
+goto do_ldst;
+case INDEX_op_st_i64:
+size = 8;
+tp = TCG_ALIAS_WRITE;
+goto do_ldst;
+case INDEX_op_st_v128:
+size = 16;
+tp = TCG_ALIAS_WRITE;
+goto do_ldst;
+do_ldst:
+if (temps[args[1]].is_base) {
+TCGArg val;
+#if TCG_TARGET_REG_BITS == 32
+val = do_constant_folding(INDEX_op_add_i32,
+  temps[args[1]].val,
+  args[2]);
+#else
+val = do_constant_folding(INDEX_op_add_i64,
+

[Qemu-devel] [PATCH v2 18/20] tcg/i386: add support for qemu_ld_v128/qemu_st_v128 ops

2017-02-01 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/i386/tcg-target.inc.c | 68 ++-
 1 file changed, 61 insertions(+), 7 deletions(-)

diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 1e6edc0..4647e97 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -1342,6 +1342,7 @@ static void * const qemu_ld_helpers[] = {
 [MO_BEUW] = helper_be_lduw_mmu,
 [MO_BEUL] = helper_be_ldul_mmu,
 [MO_BEQ]  = helper_be_ldq_mmu,
+[MO_128]  = helper_te_ldv128_mmu,
 };
 
 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
@@ -1355,6 +1356,7 @@ static void * const qemu_st_helpers[] = {
 [MO_BEUW] = helper_be_stw_mmu,
 [MO_BEUL] = helper_be_stl_mmu,
 [MO_BEQ]  = helper_be_stq_mmu,
+[MO_128]  = helper_te_stv128_mmu,
 };
 
 /* Perform the TLB load and compare.
@@ -1521,12 +1523,30 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 ofs += 4;
 
 tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
+
+if ((opc & MO_SSIZE) == MO_128) {
+ofs += 4;
+tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_EAX, TCG_REG_ESP);
+tcg_out_addi(s, TCG_REG_EAX, TCG_STATIC_CALL_ARGS_SIZE - 16);
+tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_EAX, TCG_REG_ESP, ofs);
+}
 } else {
 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
 /* The second argument is already loaded with addrlo.  */
 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
 tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
  (uintptr_t)l->raddr);
+if ((opc & MO_SSIZE) == MO_128) {
+tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_EAX, TCG_REG_ESP);
+tcg_out_addi(s, TCG_REG_EAX, TCG_STATIC_CALL_ARGS_SIZE - 16);
+if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
+tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[4],
+TCG_REG_EAX);
+} else {
+tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_EAX,
+TCG_REG_ESP, TCG_TARGET_CALL_STACK_OFFSET);
+}
+}
 }
 
 tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
@@ -1562,6 +1582,11 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
 }
 break;
+case MO_128:
+tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_EAX, TCG_REG_ESP);
+tcg_out_addi(s, TCG_REG_EAX, TCG_STATIC_CALL_ARGS_SIZE - 16);
+tcg_out_ld(s, TCG_TYPE_V128, l->datalo_reg, TCG_REG_EAX, 0);
+break;
 default:
 tcg_abort();
 }
@@ -1601,12 +1626,20 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 ofs += 4;
 }
 
-tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
-ofs += 4;
-
-if (s_bits == MO_64) {
-tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
+if (s_bits == MO_128) {
+tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_EAX, TCG_REG_ESP);
+tcg_out_addi(s, TCG_REG_EAX, TCG_STATIC_CALL_ARGS_SIZE - 16);
+tcg_out_st(s, TCG_TYPE_V128, l->datalo_reg, TCG_REG_EAX, 0);
+tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_EAX, TCG_REG_ESP, ofs);
 ofs += 4;
+} else {
+tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
+ofs += 4;
+
+if (s_bits == MO_64) {
+tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
+ofs += 4;
+}
 }
 
 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
@@ -1618,8 +1651,16 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 } else {
 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
 /* The second argument is already loaded with addrlo.  */
-tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
-tcg_target_call_iarg_regs[2], l->datalo_reg);
+if (s_bits == MO_128) {
+tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_RAX, TCG_REG_ESP);
+tcg_out_addi(s, TCG_REG_RAX, TCG_STATIC_CALL_ARGS_SIZE - 16);
+tcg_out_st(s, TCG_TYPE_V128, l->datalo_reg, TCG_REG_RAX, 0);
+tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[2],
+TCG_REG_RAX);
+} else {
+tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
+tcg_target_call_iarg_regs[2], l->datalo_reg);
+}
 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
 
 if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
@@ -1751,6 +1792,

[Qemu-devel] [PATCH v2 12/20] tcg/i386: support 64-bit vector operations

2017-02-01 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/i386/tcg-target.h |  1 +
 tcg/i386/tcg-target.inc.c | 22 ++
 2 files changed, 23 insertions(+)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index b0704e8..755ebaa 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -31,6 +31,7 @@
 #ifdef __x86_64__
 # if defined(TARGET_WORDS_BIGENDIAN) == defined(HOST_WORDS_BIGENDIAN)
 #  define TCG_TARGET_HAS_REG128 1
+#  define TCG_TARGET_HAS_REGV64 1
 # endif
 # ifdef TCG_TARGET_HAS_REG128
 #  define TCG_TARGET_REG_BITS  64
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 3e718f3..208bb81 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -390,6 +390,9 @@ static inline int tcg_target_const_match(tcg_target_long 
val, TCGType type,
#define OPC_MOVDQU_M2R  (0x6f | P_SSE_F30F)  /* load 128-bit value */
#define OPC_MOVDQU_R2M  (0x7f | P_SSE_F30F)  /* store 128-bit value */
 #define OPC_MOVDQA_R2R  (0x6f | P_SSE_660F)  /* reg-to-reg 128-bit mov */
+#define OPC_MOVQ_M2R (0x7e | P_SSE_F30F)
+#define OPC_MOVQ_R2M (0xd6 | P_SSE_660F)
+#define OPC_MOVQ_R2R (0x7e | P_SSE_F30F)
 #define OPC_PADDD   (0xfe | P_SSE_660F)
 
 /* Group 1 opcode extensions for 0x80-0x83.
@@ -700,6 +703,15 @@ static inline void tcg_out_mov(TCGContext *s, TCGType type,
 tcg_out_modrm(s, OPC_MOVDQA_R2R, ret, arg);
 }
 break;
+case TCG_TYPE_V64:
+ret -= TCG_REG_XMM0;
+arg -= TCG_REG_XMM0;
+if (have_avx) {
+tcg_out_vex_modrm(s, OPC_MOVQ_R2R, ret, 15, arg);
+} else {
+tcg_out_modrm(s, OPC_MOVQ_R2R, ret, arg);
+}
+break;
 case TCG_TYPE_I32:
 case TCG_TYPE_I64:
 opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
@@ -788,6 +800,10 @@ static inline void tcg_out_ld(TCGContext *s, TCGType type, 
TCGReg ret,
 ret -= TCG_REG_XMM0;
 tcg_out_modrm_offset(s, OPC_MOVDQU_M2R, ret, arg1, arg2);
 break;
+case TCG_TYPE_V64:
+ret -= TCG_REG_XMM0;
+tcg_out_modrm_offset(s, OPC_MOVQ_M2R, ret, arg1, arg2);
+break;
 case TCG_TYPE_I32:
 case TCG_TYPE_I64:
 opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
@@ -807,6 +823,10 @@ static inline void tcg_out_st(TCGContext *s, TCGType type, 
TCGReg arg,
 arg -= TCG_REG_XMM0;
 tcg_out_modrm_offset(s, OPC_MOVDQU_R2M, arg, arg1, arg2);
 break;
+case TCG_TYPE_V64:
+arg -= TCG_REG_XMM0;
+tcg_out_modrm_offset(s, OPC_MOVQ_R2M, arg, arg1, arg2);
+break;
 case TCG_TYPE_I32:
 case TCG_TYPE_I64:
 opc = OPC_MOVL_EvGv + (type == TCG_TYPE_I64 ? P_REXW : 0);
@@ -2407,6 +2427,8 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode 
op)
 
 case INDEX_op_ld_v128:
 case INDEX_op_st_v128:
+case INDEX_op_ld_v64:
+case INDEX_op_st_v64:
 return &V_r;
 
 case INDEX_op_st8_i32:
-- 
2.1.4




[Qemu-devel] [PATCH v2 15/20] tcg: introduce new TCGMemOp - MO_128

2017-02-01 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/tcg.h | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tcg/tcg.h b/tcg/tcg.h
index 5e0c6da..63a83f9 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -306,11 +306,12 @@ typedef enum TCGMemOp {
 MO_16= 1,
 MO_32= 2,
 MO_64= 3,
-MO_SIZE  = 3,   /* Mask for the above.  */
+MO_128   = 4,
+MO_SIZE  = 7,   /* Mask for the above.  */
 
-MO_SIGN  = 4,   /* Sign-extended, otherwise zero-extended.  */
+MO_SIGN  = 8,   /* Sign-extended, otherwise zero-extended.  */
 
-MO_BSWAP = 8,   /* Host reverse endian.  */
+MO_BSWAP = 16,   /* Host reverse endian.  */
 #ifdef HOST_WORDS_BIGENDIAN
 MO_LE= MO_BSWAP,
 MO_BE= 0,
@@ -342,7 +343,7 @@ typedef enum TCGMemOp {
  * - an alignment to a specified size, which may be more or less than
  *   the access size (MO_ALIGN_x where 'x' is a size in bytes);
  */
-MO_ASHIFT = 4,
+MO_ASHIFT = 5,
 MO_AMASK = 7 << MO_ASHIFT,
 #ifdef ALIGNED_ONLY
 MO_ALIGN = 0,
-- 
2.1.4




[Qemu-devel] [PATCH v2 03/20] tcg: support representing vector type with smaller vector or scalar types

2017-02-01 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---

This is not as bad as I thought it would be.
Only two cases: type == base_type and type != base_type.

---
 tcg/tcg.c | 136 +-
 1 file changed, 91 insertions(+), 45 deletions(-)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index 5e69103..18d97ec 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -523,12 +523,54 @@ TCGv_i64 tcg_global_reg_new_i64(TCGReg reg, const char 
*name)
 return MAKE_TCGV_I64(idx);
 }
 
-int tcg_global_mem_new_internal(TCGType type, TCGv_ptr base,
+static TCGType tcg_choose_type(TCGType type)
+{
+switch (type) {
+case TCG_TYPE_I64:
+if (TCG_TARGET_REG_BITS == 64) {
+return TCG_TYPE_I64;
+}
+/* Fallthrough */
+case TCG_TYPE_I32:
+return TCG_TYPE_I32;
+case TCG_TYPE_V128:
+#ifdef TCG_TARGET_HAS_REG128
+return TCG_TYPE_V128;
+#endif
+/* Fallthrough */
+case TCG_TYPE_V64:
+#ifdef TCG_TARGET_HAS_REGV64
+return TCG_TYPE_V64;
+#else
+return tcg_choose_type(TCG_TYPE_I64);
+#endif
+default:
+g_assert_not_reached();
+}
+}
+
+static intptr_t tcg_type_size(TCGType type)
+{
+switch (type) {
+case TCG_TYPE_I32:
+return 4;
+case TCG_TYPE_I64:
+case TCG_TYPE_V64:
+return 8;
+case TCG_TYPE_V128:
+return 16;
+default:
+g_assert_not_reached();
+}
+}
+
+int tcg_global_mem_new_internal(TCGType base_type, TCGv_ptr base,
 intptr_t offset, const char *name)
 {
 TCGContext *s = &tcg_ctx;
 TCGTemp *base_ts = &s->temps[GET_TCGV_PTR(base)];
 TCGTemp *ts = tcg_global_alloc(s);
+TCGType type = tcg_choose_type(base_type);
 int indirect_reg = 0, bigendian = 0;
 #ifdef HOST_WORDS_BIGENDIAN
 bigendian = 1;
@@ -543,47 +585,51 @@ int tcg_global_mem_new_internal(TCGType type, TCGv_ptr 
base,
 indirect_reg = 1;
 }
 
-if (TCG_TARGET_REG_BITS == 32 && type == TCG_TYPE_I64) {
-TCGTemp *ts2 = tcg_global_alloc(s);
-char buf[64];
-
-ts->base_type = TCG_TYPE_I64;
-ts->type = TCG_TYPE_I32;
+if (type == base_type) {
+ts->base_type = type;
+ts->type = type;
 ts->indirect_reg = indirect_reg;
 ts->mem_allocated = 1;
 ts->mem_base = base_ts;
-ts->mem_offset = offset + bigendian * 4;
-pstrcpy(buf, sizeof(buf), name);
-pstrcat(buf, sizeof(buf), "_0");
-ts->name = strdup(buf);
-
-tcg_debug_assert(ts2 == ts + 1);
-ts2->base_type = TCG_TYPE_I64;
-ts2->type = TCG_TYPE_I32;
-ts2->indirect_reg = indirect_reg;
-ts2->mem_allocated = 1;
-ts2->mem_base = base_ts;
-ts2->mem_offset = offset + (1 - bigendian) * 4;
-pstrcpy(buf, sizeof(buf), name);
-pstrcat(buf, sizeof(buf), "_1");
-ts2->name = strdup(buf);
+ts->mem_offset = offset;
+ts->name = name;
 } else {
-ts->base_type = type;
+int i, count = tcg_type_size(base_type) / tcg_type_size(type);
+TCGTemp *ts2, *ts1 = ts;
+int cur_offset =
+bigendian ? tcg_type_size(base_type) - tcg_type_size(type) : 0;
+
+ts->base_type = base_type;
 ts->type = type;
 ts->indirect_reg = indirect_reg;
 ts->mem_allocated = 1;
 ts->mem_base = base_ts;
-ts->mem_offset = offset;
-ts->name = name;
+ts->mem_offset = offset + cur_offset;
+ts->name = g_strdup_printf("%s_0", name);
+
+for (i = 1; i < count; i++) {
+ts2 = tcg_global_alloc(s);
+tcg_debug_assert(ts2 == ts1 + 1);
+cur_offset += (bigendian ? -1 : 1) * tcg_type_size(type);
+ts2->base_type = base_type;
+ts2->type = type;
+ts2->indirect_reg = indirect_reg;
+ts2->mem_allocated = 1;
+ts2->mem_base = base_ts;
+ts2->mem_offset = offset + cur_offset;
+ts2->name = g_strdup_printf("%s_%d", name, i);
+ts1 = ts2;
+}
 }
 return temp_idx(s, ts);
 }
 
-static int tcg_temp_new_internal(TCGType type, int temp_local)
+static int tcg_temp_new_internal(TCGType base_type, int temp_local)
 {
 TCGContext *s = &tcg_ctx;
 TCGTemp *ts;
 int idx, k;
+TCGType type = tcg_choose_type(base_type);
 
 k = type + (temp_local ? TCG_TYPE_COUNT : 0);
 idx = find_first_bit(s->free_temps[k].l, TCG_MAX_TEMPS);
@@ -593,28 +639,28 @@ static int tcg_temp_new_internal(TCGType type, int 
temp_local)
 
 ts = &s->temps[idx];
 ts->temp_allocated = 1;
-tcg_debug_assert(ts->base_type == type);
+tcg_debug_assert(ts->base_type == base_type);
 tcg_debug_asser

[Qemu-devel] [PATCH v2 00/20] Emulate guest vector operations with host vector operations

2017-02-01 Thread Kirill Batuzov
The goal of this patch series is to set up an infrastructure to emulate
guest vector operations using host vector operations. Preliminary
experiments show that simply translating loads and stores increases the
performance of the x264 video codec by 10%. The performance of a
GCC-vectorized for loop increased 2x.

To be able to emulate guest vector operations using host vector operations,
several things need to be done.

1. Corresponding vector types should be added to TCG. This series adds
TCG_v128 and TCG_v64. I've made TCG_v64 a different type from TCG_i64
because it usually needs to be allocated to different registers and
supports different operations.

2. Load/store operations for these new types need to be implemented.

3. For a seamless transition from the current model to the new one we need to
handle cases where memory occupied by a global variable can be accessed via a
pointer to the CPUArchState structure. A very simple, conservative alias
analysis has been added to do this. It tracks memory loads and stores that
overlap with fields of CPUArchState and provides this information to the
register allocator. The allocator then spills and reloads affected globals
when needed.
4. Allow overlapping globals. For scalar registers this is a rare case, and
overlapping registers can be handled as a single one (ah, al, ax, eax,
rax). In ARM every Q-register consists of two D-registers, each consisting of
two S-registers. Handling 4 S-registers as one because they are parts of
the same Q-register is way too inefficient.

5. Add a new memory addressing mode to the MMU code for large accesses and
create the needed helpers. Only 128-bit vectors have been handled for now.

6. Create TCG opcodes for vector operations. Only addition has been handled
in this series. Each operation has a wrapper that checks whether the backend
supports the corresponding operation. If it does, the vector opcode is
generated; otherwise the operation is emulated with scalar operations (see
the sketch after this list). The emulation code is generated inline for
performance reasons (there is a huge performance difference between inline
generation and calling a helper). As a positive side effect this will
eventually allow similar emulation code for vector instructions from
different frontends to be merged into a target-independent implementation.

7. Use new operations in the frontend (ARM was used in these series).

8. Support new operations in the backend (x86_64 was used in these series).

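Here is a minimal standalone sketch of the wrapper pattern described in
point 6; the names and the printf-based "code generation" are illustrative
only, not the real API.

#include <stdio.h>

#define TARGET_HAS_add_i32x4 0          /* pretend the backend lacks the opcode */

/* Each tcg_gen_<op>_<type>() checks whether the backend declared support for
 * the vector opcode and either emits it or expands the operation inline with
 * scalar operations. */
static void tcg_gen_add_i32x4_sketch(void)
{
    if (TARGET_HAS_add_i32x4) {
        printf("emit add_i32x4 opcode\n");
    } else {
        for (int i = 0; i < 4; i++) {   /* inline scalar fallback */
            printf("emit 32-bit load/add/store for element %d\n", i);
        }
    }
}

int main(void)
{
    tcg_gen_add_i32x4_sketch();
    return 0;
}
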
For experiments I have used an ARM guest on an x86_64 host. I wanted a pair
of different architectures that both have vector extensions; ARM and x86_64
fit well.

v1 -> v2:
 - represent v128 type with smaller types when it is not supported by the host
 - detect AVX support and use AVX instructions when available
 - tcg/README updated
 - generate two v64 adds instead of one v128 when applicable
 - rebased to newer master
 - overlap detection for temps added (it needs to be explicitly called from
   _translate_init)
 - the stack is used to temporarily store 128-bit variables in memory
   (instead of the TCGContext field)

Outstanding issues:
 - qemu_ld_v128 and qemu_st_v128 do not generate fallback code if the host
   does not support 128-bit registers. The reason is that I do not know how
   to handle differing host/guest endianness (do we swap only the bytes
   within elements, or the whole vector?). Different targets seem to have
   different ideas on how this should be done (the sketch below illustrates
   both options).

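To make the two interpretations concrete, here is a standalone sketch showing
what each choice does to the 16 bytes of a vector treated as four 32-bit
elements; it only illustrates the question, it does not answer it.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint8_t in[16], a[16], b[16];

    for (int i = 0; i < 16; i++) {
        in[i] = (uint8_t)i;
    }
    for (int i = 0; i < 16; i++) {
        a[i] = in[(i & ~3) + (3 - (i & 3))];   /* swap bytes within each element */
        b[i] = in[15 - i];                     /* swap the whole vector */
    }

    printf("input:             ");
    for (int i = 0; i < 16; i++) {
        printf("%02x ", in[i]);
    }
    printf("\nper-element swap:  ");
    for (int i = 0; i < 16; i++) {
        printf("%02x ", a[i]);
    }
    printf("\nwhole-vector swap: ");
    for (int i = 0; i < 16; i++) {
        printf("%02x ", b[i]);
    }
    printf("\n");
    return 0;
}
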
Kirill Batuzov (20):
  tcg: add support for 128bit vector type
  tcg: add support for 64bit vector type
  tcg: support representing vector type with smaller vector or scalar
types
  tcg: add ld_v128, ld_v64, st_v128 and st_v64 opcodes
  tcg: add simple alias analysis
  tcg: use results of alias analysis in liveness analysis
  tcg: allow globals to overlap
  tcg: add vector addition operations
  target/arm: support access to vector guest registers as globals
  target/arm: use vector opcode to handle vadd. instruction
  tcg/i386: add support for vector opcodes
  tcg/i386: support 64-bit vector operations
  tcg/i386: support remaining vector addition operations
  tcg: do not rely on exact values of MO_BSWAP or MO_SIGN in backend
  tcg: introduce new TCGMemOp - MO_128
  tcg: introduce qemu_ld_v128 and qemu_st_v128 opcodes
  softmmu: create helpers for vector loads
  tcg/i386: add support for qemu_ld_v128/qemu_st_v128 ops
  target/arm: load two consecutive 64-bits vector regs as a 128-bit
vector reg
  tcg/README: update README to include information about vector opcodes

 cputlb.c |   4 +
 softmmu_template_vector.h| 266 +++
 target/arm/translate.c   |  74 -
 tcg/README   |  47 +-
 tcg/aarch64/tcg-target.inc.c |   4 +-
 tcg/arm/tcg-target.inc.c |   4 +-
 tcg/i386/tcg-target.h|  45 +-
 tcg/i386/tcg-target.inc.c| 260 +--
 tcg/mips/tcg-target.inc.c|   4 +-
 tcg/optimize.c

[Qemu-devel] [PATCH v2 16/20] tcg: introduce qemu_ld_v128 and qemu_st_v128 opcodes

2017-02-01 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/i386/tcg-target.inc.c |  5 +
 tcg/tcg-op.c  | 24 
 tcg/tcg-op.h  | 15 +++
 tcg/tcg-opc.h |  4 
 4 files changed, 48 insertions(+)

diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 263c15e..1e6edc0 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -2448,6 +2448,7 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode 
op)
 = { .args_ct_str = { "L", "L", "L", "L" } };
 static const TCGTargetOpDef V_r = { .args_ct_str  = { "V", "r" } };
 static const TCGTargetOpDef V_0_V = { .args_ct_str  = { "V", "0", "V" } };
+static const TCGTargetOpDef V_L = { .args_ct_str  = { "V", "L" } };
 
 switch (op) {
 case INDEX_op_ld8u_i32:
@@ -2662,6 +2663,10 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode 
op)
 case INDEX_op_add_i64x1:
 return &V_0_V;
 
+case INDEX_op_qemu_ld_v128:
+case INDEX_op_qemu_st_v128:
+return &V_L;
+
 default:
 break;
 }
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 0dfe611..db74017 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -3102,3 +3102,27 @@ void tcg_v64_to_ptr(TCGv_v64 tmp, TCGv_ptr base, int 
slot,
 }
 }
 }
+
+void tcg_gen_qemu_ld_v128(TCGv_v128 val, TCGv addr, TCGArg idx,
+  TCGMemOp memop)
+{
+#ifdef TCG_TARGET_HAS_REG128
+tcg_debug_assert((memop & MO_BSWAP) == MO_TE);
+TCGMemOpIdx oi = make_memop_idx(memop, idx);
+tcg_gen_op3si_v128(INDEX_op_qemu_ld_v128, val, addr, oi);
+#else
+g_assert_not_reached();
+#endif
+}
+
+void tcg_gen_qemu_st_v128(TCGv_v128 val, TCGv addr, TCGArg idx,
+  TCGMemOp memop)
+{
+#ifdef TCG_TARGET_HAS_REG128
+tcg_debug_assert((memop & MO_BSWAP) == MO_TE);
+TCGMemOpIdx oi = make_memop_idx(memop, idx);
+tcg_gen_op3si_v128(INDEX_op_qemu_st_v128, val, addr, oi);
+#else
+g_assert_not_reached();
+#endif
+}
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 3727be7..dc1d032 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -266,6 +266,19 @@ static inline void tcg_gen_op3_v128(TCGOpcode opc, 
TCGv_v128 a1,
 GET_TCGV_V128(a3));
 }
 
+static inline void tcg_gen_op3si_v128(TCGOpcode opc, TCGv_v128 a1,
+  TCGv a2, TCGArg a3)
+{
+#if TARGET_LONG_BITS == 64 && TCG_TARGET_REG_BITS == 32
+tcg_gen_op4(&tcg_ctx, opc, GET_TCGV_V128(a1), GET_TCGV_I32(TCGV_LOW(a2)),
+GET_TCGV_I32(TCGV_HIGH(a2)), a3);
+#elif TARGET_LONG_BITS == 32
+tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_V128(a1), GET_TCGV_I32(a2), a3);
+#else
+tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_V128(a1), GET_TCGV_I64(a2), a3);
+#endif
+}
+
 static inline void tcg_gen_op1_v64(TCGOpcode opc, TCGv_v64 a1)
 {
 tcg_gen_op1(&tcg_ctx, opc, GET_TCGV_V64(a1));
@@ -909,6 +922,8 @@ void tcg_gen_qemu_ld_i32(TCGv_i32, TCGv, TCGArg, TCGMemOp);
 void tcg_gen_qemu_st_i32(TCGv_i32, TCGv, TCGArg, TCGMemOp);
 void tcg_gen_qemu_ld_i64(TCGv_i64, TCGv, TCGArg, TCGMemOp);
 void tcg_gen_qemu_st_i64(TCGv_i64, TCGv, TCGArg, TCGMemOp);
+void tcg_gen_qemu_ld_v128(TCGv_v128, TCGv, TCGArg, TCGMemOp);
+void tcg_gen_qemu_st_v128(TCGv_v128, TCGv, TCGArg, TCGMemOp);
 
 static inline void tcg_gen_qemu_ld8u(TCGv ret, TCGv addr, int mem_index)
 {
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index 4c8f195..6c2e697 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -232,6 +232,10 @@ DEF(qemu_ld_i64, DATA64_ARGS, TLADDR_ARGS, 1,
 TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT)
 DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1,
 TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT)
+DEF(qemu_ld_v128, 1, 1, 1,
+TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | IMPL128)
+DEF(qemu_st_v128, 0, 2, 1,
+TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | IMPL128)
 
 #undef TLADDR_ARGS
 #undef DATA64_ARGS
-- 
2.1.4
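
For illustration, a front end would emit these opcodes roughly as below when
the backend advertises 128-bit registers; this mirrors the ARM patch later in
the series (rd, addr and s are the usual translator state there):

    #ifdef TCG_TARGET_HAS_REG128
        if (load) {
            tcg_gen_qemu_ld_v128(cpu_Q[rd / 2], addr,
                                 get_mem_index(s), MO_LE | MO_128);
        } else {
            tcg_gen_qemu_st_v128(cpu_Q[rd / 2], addr,
                                 get_mem_index(s), MO_LE | MO_128);
        }
    #endif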




[Qemu-devel] [PATCH v2 06/20] tcg: use results of alias analysis in liveness analysis

2017-02-01 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/tcg.c | 61 +
 1 file changed, 61 insertions(+)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index 18d97ec..27e5944 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -564,6 +564,11 @@ static intptr_t tcg_type_size(TCGType type)
 }
 }
 
+static intptr_t tcg_temp_size(const TCGTemp *tmp)
+{
+return tcg_type_size(tmp->type);
+}
+
 int tcg_global_mem_new_internal(TCGType base_type, TCGv_ptr base,
 intptr_t offset, const char *name)
 {
@@ -1472,6 +1477,43 @@ static inline void tcg_la_bb_end(TCGContext *s, uint8_t 
*temp_state)
 }
 }
 
+/* Check if memory write completely overwrites temp's memory location.
+   If this is the case then the temp can be considered dead. */
+static int tcg_temp_overwrite(TCGContext *s, const TCGTemp *tmp,
+   const TCGAliasInfo *ai)
+{
+if (!(ai->alias_type & TCG_ALIAS_WRITE) || !ai->fixed_offset) {
+return 0;
+}
+if (tmp->mem_base != &s->temps[GET_TCGV_PTR(s->tcg_env)]) {
+return 0;
+}
+if (ai->offset > tmp->mem_offset
+|| ai->offset + ai->size < tmp->mem_offset + tcg_temp_size(tmp)) {
+return 0;
+}
+return 1;
+}
+
+/* Check if memory read or write overlaps with temp's memory location.
+   If this is the case then the temp must be synced to memory. */
+static int tcg_temp_overlap(TCGContext *s, const TCGTemp *tmp,
+const TCGAliasInfo *ai)
+{
+if (!ai->fixed_offset || tmp->fixed_reg) {
+return 0;
+}
+if (tmp->mem_base != &s->temps[GET_TCGV_PTR(s->tcg_env)]) {
+return 1;
+}
+if (ai->offset >= tmp->mem_offset + tcg_temp_size(tmp)
+|| ai->offset + ai->size <= tmp->mem_offset) {
+return 0;
+} else {
+return 1;
+}
+}
+
 /* Liveness analysis : update the opc_arg_life array to tell if a
given input arguments is dead. Instructions updating dead
temporaries are removed. */
@@ -1674,6 +1716,23 @@ static void liveness_pass_1(TCGContext *s, uint8_t 
*temp_state)
 temp_state[arg] = TS_DEAD;
 }
 
+/* record if the operation uses some globals' memory location 
*/
+if (s->alias_info[oi].alias_type != TCG_NOT_ALIAS) {
+for (i = 0; i < s->nb_globals; i++) {
+if (tcg_temp_overwrite(s, &s->temps[i],
+   &s->alias_info[oi])) {
+temp_state[i] = TS_DEAD;
+} else if (tcg_temp_overlap(s, &s->temps[i],
+&s->alias_info[oi])) {
+if (s->alias_info[oi].alias_type & TCG_ALIAS_READ) 
{
+temp_state[i] = TS_MEM | TS_DEAD;
+} else if (!(temp_state[i] & TS_DEAD)) {
+temp_state[i] |= TS_MEM;
+}
+}
+}
+}
+
 /* if end of basic block, update */
 if (def->flags & TCG_OPF_BB_END) {
 tcg_la_bb_end(s, temp_state);
@@ -2622,6 +2681,8 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
 s->la_time -= profile_getclock();
 #endif
 
+tcg_alias_analysis(s);
+
 {
 uint8_t *temp_state = tcg_malloc(s->nb_temps + s->nb_indirects);
 
-- 
2.1.4
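
To make the overwrite/overlap checks above concrete, here is the same interval
logic as a stand-alone sketch with plain integers (offsets relative to env;
names are illustrative, not taken from the patch):

    #include <stdbool.h>
    #include <stdint.h>

    /* A write to [off, off + size) makes a global living at
       [mem_offset, mem_offset + tmp_size) dead only if it covers it
       completely. */
    static bool write_covers_temp(intptr_t off, intptr_t size,
                                  intptr_t mem_offset, intptr_t tmp_size)
    {
        return off <= mem_offset && off + size >= mem_offset + tmp_size;
    }

    /* Any access touching part of the global forces it to be synced. */
    static bool access_overlaps_temp(intptr_t off, intptr_t size,
                                     intptr_t mem_offset, intptr_t tmp_size)
    {
        return off < mem_offset + tmp_size && off + size > mem_offset;
    }

    /* Example: a 16-byte store at env+0x100 covers a v128 global at
       env+0x100 (the global can be marked dead), while an 8-byte store
       at env+0x108 only overlaps it (the global must be synced first). */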




[Qemu-devel] [PATCH v2 02/20] tcg: add support for 64bit vector type

2017-02-01 Thread Kirill Batuzov
Introduce TCG_TYPE_V64 and corresponding TCGv_v64 for TCG temps. Add helper
functions that work with temps of this new type.

Signed-off-by: Kirill Batuzov 
---
 tcg/tcg-op.h | 23 +++
 tcg/tcg.c| 13 +
 tcg/tcg.h| 34 ++
 3 files changed, 70 insertions(+)

diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 5abf8b2..517745e 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -266,6 +266,24 @@ static inline void tcg_gen_op3_v128(TCGOpcode opc, 
TCGv_v128 a1,
 GET_TCGV_V128(a3));
 }
 
+static inline void tcg_gen_op1_v64(TCGOpcode opc, TCGv_v64 a1)
+{
+tcg_gen_op1(&tcg_ctx, opc, GET_TCGV_V64(a1));
+}
+
+static inline void tcg_gen_op2_v64(TCGOpcode opc, TCGv_v64 a1,
+TCGv_v64 a2)
+{
+tcg_gen_op2(&tcg_ctx, opc, GET_TCGV_V64(a1), GET_TCGV_V64(a2));
+}
+
+static inline void tcg_gen_op3_v64(TCGOpcode opc, TCGv_v64 a1,
+TCGv_v64 a2, TCGv_v64 a3)
+{
+tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_V64(a1), GET_TCGV_V64(a2),
+GET_TCGV_V64(a3));
+}
+
 /* Generic ops.  */
 
 static inline void gen_set_label(TCGLabel *l)
@@ -478,6 +496,11 @@ static inline void tcg_gen_discard_v128(TCGv_v128 arg)
 tcg_gen_op1_v128(INDEX_op_discard, arg);
 }
 
+static inline void tcg_gen_discard_v64(TCGv_v64 arg)
+{
+tcg_gen_op1_v64(INDEX_op_discard, arg);
+}
+
 /* 64 bit ops */
 
 void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 2a5e83b..5e69103 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -641,6 +641,14 @@ TCGv_i64 tcg_temp_new_internal_i64(int temp_local)
 return MAKE_TCGV_I64(idx);
 }
 
+TCGv_v64 tcg_temp_new_internal_v64(int temp_local)
+{
+int idx;
+
+idx = tcg_temp_new_internal(TCG_TYPE_V64, temp_local);
+return MAKE_TCGV_V64(idx);
+}
+
 TCGv_v128 tcg_temp_new_internal_v128(int temp_local)
 {
 int idx;
@@ -681,6 +689,11 @@ void tcg_temp_free_i64(TCGv_i64 arg)
 tcg_temp_free_internal(GET_TCGV_I64(arg));
 }
 
+void tcg_temp_free_v64(TCGv_v64 arg)
+{
+tcg_temp_free_internal(GET_TCGV_V64(arg));
+}
+
 void tcg_temp_free_v128(TCGv_v128 arg)
 {
 tcg_temp_free_internal(GET_TCGV_V128(arg));
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 56484e7..fa455ae 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -246,6 +246,7 @@ typedef struct TCGPool {
 typedef enum TCGType {
 TCG_TYPE_I32,
 TCG_TYPE_I64,
+TCG_TYPE_V64,
 TCG_TYPE_V128,
 TCG_TYPE_COUNT, /* number of different types */
 
@@ -422,6 +423,7 @@ typedef tcg_target_ulong TCGArg;
 typedef struct TCGv_i32_d *TCGv_i32;
 typedef struct TCGv_i64_d *TCGv_i64;
 typedef struct TCGv_ptr_d *TCGv_ptr;
+typedef struct TCGv_v64_d *TCGv_v64;
 typedef struct TCGv_v128_d *TCGv_v128;
 typedef TCGv_ptr TCGv_env;
 #if TARGET_LONG_BITS == 32
@@ -447,6 +449,11 @@ static inline TCGv_ptr QEMU_ARTIFICIAL 
MAKE_TCGV_PTR(intptr_t i)
 return (TCGv_ptr)i;
 }
 
+static inline TCGv_v64 QEMU_ARTIFICIAL MAKE_TCGV_V64(intptr_t i)
+{
+return (TCGv_v64)i;
+}
+
 static inline TCGv_v128 QEMU_ARTIFICIAL MAKE_TCGV_V128(intptr_t i)
 {
 return (TCGv_v128)i;
@@ -467,6 +474,11 @@ static inline intptr_t QEMU_ARTIFICIAL 
GET_TCGV_PTR(TCGv_ptr t)
 return (intptr_t)t;
 }
 
+static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_V64(TCGv_v64 t)
+{
+return (intptr_t)t;
+}
+
 static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_V128(TCGv_v128 t)
 {
 return (intptr_t)t;
@@ -479,17 +491,20 @@ static inline intptr_t QEMU_ARTIFICIAL 
GET_TCGV_V128(TCGv_v128 t)
 
 #define TCGV_EQUAL_I32(a, b) (GET_TCGV_I32(a) == GET_TCGV_I32(b))
 #define TCGV_EQUAL_I64(a, b) (GET_TCGV_I64(a) == GET_TCGV_I64(b))
+#define TCGV_EQUAL_V64(a, b) (GET_TCGV_V64(a) == GET_TCGV_V64(b))
 #define TCGV_EQUAL_V128(a, b) (GET_TCGV_V128(a) == GET_TCGV_V128(b))
 #define TCGV_EQUAL_PTR(a, b) (GET_TCGV_PTR(a) == GET_TCGV_PTR(b))
 
 /* Dummy definition to avoid compiler warnings.  */
 #define TCGV_UNUSED_I32(x) x = MAKE_TCGV_I32(-1)
 #define TCGV_UNUSED_I64(x) x = MAKE_TCGV_I64(-1)
+#define TCGV_UNUSED_V64(x) x = MAKE_TCGV_V64(-1)
 #define TCGV_UNUSED_V128(x) x = MAKE_TCGV_V128(-1)
 #define TCGV_UNUSED_PTR(x) x = MAKE_TCGV_PTR(-1)
 
 #define TCGV_IS_UNUSED_I32(x) (GET_TCGV_I32(x) == -1)
 #define TCGV_IS_UNUSED_I64(x) (GET_TCGV_I64(x) == -1)
+#define TCGV_IS_UNUSED_V64(x) (GET_TCGV_V64(x) == -1)
 #define TCGV_IS_UNUSED_V128(x) (GET_TCGV_V128(x) == -1)
 #define TCGV_IS_UNUSED_PTR(x) (GET_TCGV_PTR(x) == -1)
 
@@ -813,10 +828,12 @@ TCGv_i64 tcg_global_reg_new_i64(TCGReg reg, const char 
*name);
 
 TCGv_i32 tcg_temp_new_internal_i32(int temp_local);
 TCGv_i64 tcg_temp_new_internal_i64(int temp_local);
+TCGv_v64 tcg_temp_new_internal_v64(int temp_local);
 TCGv_v128 tcg_temp_new_internal_v128(int temp_local);
 
 void tcg_temp_free_i32(TCGv_i32 arg);
 void tcg_temp_free_i64(TCGv_i64 arg);
+void tcg_temp_free_v64(TCGv_v64 arg);
 void tcg_temp_free_v128(TCGv_v128 arg);
 
 static inl

[Qemu-devel] [PATCH v2 07/20] tcg: allow globals to overlap

2017-02-01 Thread Kirill Batuzov
Sometimes the target architecture may allow some parts of a register to be
accessed as a different register. If both of these registers are
implemented as globals in QEMU, then their contents will overlap and a
change to one global will also change the value of the other. To handle
such situations properly, some fixes are needed in the register allocator
and liveness analysis.

Signed-off-by: Kirill Batuzov 
---
 tcg/optimize.c |  19 -
 tcg/tcg.c  | 128 +
 tcg/tcg.h  |  20 +
 3 files changed, 166 insertions(+), 1 deletion(-)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index 2347ce3..7a69ff0 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -55,7 +55,7 @@ static inline bool temp_is_copy(TCGArg arg)
 }
 
 /* Reset TEMP's state, possibly removing the temp for the list of copies.  */
-static void reset_temp(TCGArg temp)
+static void reset_this_temp(TCGArg temp)
 {
 temps[temps[temp].next_copy].prev_copy = temps[temp].prev_copy;
 temps[temps[temp].prev_copy].next_copy = temps[temp].next_copy;
@@ -66,6 +66,23 @@ static void reset_temp(TCGArg temp)
 temps[temp].mask = -1;
 }
 
+static void reset_temp(TCGArg temp)
+{
+int i;
+TCGTemp *ts = &tcg_ctx.temps[temp];
+reset_this_temp(temp);
+if (ts->sub_temps) {
+for (i = 0; ts->sub_temps[i] != (TCGArg)-1; i++) {
+reset_this_temp(ts->sub_temps[i]);
+}
+}
+if (ts->overlap_temps) {
+for (i = 0; ts->overlap_temps[i] != (TCGArg)-1; i++) {
+reset_this_temp(ts->overlap_temps[i]);
+}
+}
+}
+
 /* Reset all temporaries, given that there are NB_TEMPS of them.  */
 static void reset_all_temps(int nb_temps)
 {
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 27e5944..a8df040 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -623,9 +623,13 @@ int tcg_global_mem_new_internal(TCGType base_type, 
TCGv_ptr base,
 ts2->mem_base = base_ts;
 ts2->mem_offset = offset + cur_offset;
 ts2->name = g_strdup_printf("%s_%d", name, i);
+ts2->sub_temps = NULL;
+ts2->overlap_temps = NULL;
 ts1 = ts2;
 }
 }
+ts->sub_temps = NULL;
+ts->overlap_temps = NULL;
 return temp_idx(s, ts);
 }
 
@@ -1514,6 +1518,35 @@ static int tcg_temp_overlap(TCGContext *s, const TCGTemp 
*tmp,
 }
 }
 
+static void tcg_temp_arr_apply(const TCGArg *arr, uint8_t *temp_state,
+   uint8_t temp_val)
+{
+TCGArg i;
+if (!arr) {
+return ;
+}
+for (i = 0; arr[i] != (TCGArg)-1; i++) {
+temp_state[arr[i]] = temp_val;
+}
+}
+
+static void tcg_sub_temps_dead(TCGContext *s, TCGArg tmp, uint8_t *temp_state)
+{
+tcg_temp_arr_apply(s->temps[tmp].sub_temps, temp_state, TS_DEAD);
+}
+
+static void tcg_sub_temps_sync(TCGContext *s, TCGArg tmp, uint8_t *temp_state)
+{
+tcg_temp_arr_apply(s->temps[tmp].sub_temps, temp_state, TS_MEM | TS_DEAD);
+}
+
+static void tcg_overlap_temps_sync(TCGContext *s, TCGArg tmp,
+   uint8_t *temp_state)
+{
+tcg_temp_arr_apply(s->temps[tmp].overlap_temps, temp_state,
+   TS_MEM | TS_DEAD);
+}
+
 /* Liveness analysis : update the opc_arg_life array to tell if a
given input arguments is dead. Instructions updating dead
temporaries are removed. */
@@ -1568,6 +1601,11 @@ static void liveness_pass_1(TCGContext *s, uint8_t 
*temp_state)
 if (temp_state[arg] & TS_MEM) {
 arg_life |= SYNC_ARG << i;
 }
+/* sub_temps are also dead */
+tcg_sub_temps_dead(&tcg_ctx, arg, temp_state);
+/* overlap_temps need to go to memory */
+tcg_overlap_temps_sync(&tcg_ctx, arg, temp_state);
+
 temp_state[arg] = TS_DEAD;
 }
 
@@ -1595,6 +1633,11 @@ static void liveness_pass_1(TCGContext *s, uint8_t 
*temp_state)
 for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
 arg = args[i];
 if (arg != TCG_CALL_DUMMY_ARG) {
+/* both sub_temps and overlap_temps need to go
+   to memory */
+tcg_sub_temps_sync(&tcg_ctx, arg, temp_state);
+tcg_overlap_temps_sync(&tcg_ctx, arg, temp_state);
+
 temp_state[arg] &= ~TS_DEAD;
 }
 }
@@ -1713,6 +1756,11 @@ static void liveness_pass_1(TCGContext *s, uint8_t 
*temp_state)
 if (temp_state[arg] & TS_MEM) {
 arg_life |= SYNC_ARG << i;
 }
+/* sub_temps are also dead */
+

[Qemu-devel] [PATCH v2 09/20] target/arm: support access to vector guest registers as globals

2017-02-01 Thread Kirill Batuzov
To support vector guest registers as globals we need to do two things:

1) create corresponding globals,
2) mark which globals can overlap (see the layout sketch below).
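
For instance, with the offsets used below, each q-register shares its bytes
with two d-registers (a sketch of the layout, not code from this patch):

    /* q[i] is created at offsetof(CPUARMState, vfp.regs[2 * i]), 16 bytes */
    /* d[j] is created at offsetof(CPUARMState, vfp.regs[j]),      8 bytes */
    /* so q0 covers exactly the bytes of d0 and d1, q1 those of d2 and d3, */
    /* and a write to d1 must invalidate any value cached for q0.          */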

Signed-off-by: Kirill Batuzov 
---
 target/arm/translate.c | 30 --
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/target/arm/translate.c b/target/arm/translate.c
index 493c627..d7578e2 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -65,6 +65,8 @@ static TCGv_i32 cpu_R[16];
 TCGv_i32 cpu_CF, cpu_NF, cpu_VF, cpu_ZF;
 TCGv_i64 cpu_exclusive_addr;
 TCGv_i64 cpu_exclusive_val;
+static TCGv_v128 cpu_Q[16];
+static TCGv_v64 cpu_D[32];
 
 /* FIXME:  These should be removed.  */
 static TCGv_i32 cpu_F0s, cpu_F1s;
@@ -72,10 +74,20 @@ static TCGv_i64 cpu_F0d, cpu_F1d;
 
 #include "exec/gen-icount.h"
 
-static const char *regnames[] =
+static const char *regnames_r[] =
 { "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7",
   "r8", "r9", "r10", "r11", "r12", "r13", "r14", "pc" };
 
+static const char *regnames_q[] =
+{ "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+  "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" };
+
+static const char *regnames_d[] =
+{ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
+  "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+  "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" };
+
 /* initialize TCG globals.  */
 void arm_translate_init(void)
 {
@@ -87,8 +99,22 @@ void arm_translate_init(void)
 for (i = 0; i < 16; i++) {
 cpu_R[i] = tcg_global_mem_new_i32(cpu_env,
   offsetof(CPUARMState, regs[i]),
-  regnames[i]);
+  regnames_r[i]);
+}
+for (i = 0; i < 16; i++) {
+cpu_Q[i] = tcg_global_mem_new_v128(cpu_env,
+   offsetof(CPUARMState,
+vfp.regs[2 * i]),
+   regnames_q[i]);
 }
+for (i = 0; i < 32; i++) {
+cpu_D[i] = tcg_global_mem_new_v64(cpu_env,
+  offsetof(CPUARMState, vfp.regs[i]),
+  regnames_d[i]);
+}
+
+tcg_detect_overlapping_temps(&tcg_ctx);
+
 cpu_CF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, CF), "CF");
 cpu_NF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, NF), "NF");
 cpu_VF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, VF), "VF");
-- 
2.1.4




[Qemu-devel] [PATCH v2 10/20] target/arm: use vector opcode to handle vadd. instruction

2017-02-01 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 target/arm/translate.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/target/arm/translate.c b/target/arm/translate.c
index d7578e2..90e14df 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -5628,6 +5628,37 @@ static int disas_neon_data_insn(DisasContext *s, 
uint32_t insn)
 return 1;
 }
 
+/* Use vector ops to handle what we can */
+switch (op) {
+case NEON_3R_VADD_VSUB:
+if (!u) {
+void (* const gen_add_v128[])(TCGv_v128, TCGv_v128,
+ TCGv_v128) = {
+tcg_gen_add_i8x16,
+tcg_gen_add_i16x8,
+tcg_gen_add_i32x4,
+tcg_gen_add_i64x2
+};
+void (* const gen_add_v64[])(TCGv_v64, TCGv_v64,
+ TCGv_v64) = {
+tcg_gen_add_i8x8,
+tcg_gen_add_i16x4,
+tcg_gen_add_i32x2,
+tcg_gen_add_i64x1
+};
+if (q) {
+gen_add_v128[size](cpu_Q[rd >> 1], cpu_Q[rn >> 1],
+   cpu_Q[rm >> 1]);
+} else {
+gen_add_v64[size](cpu_D[rd], cpu_D[rn], cpu_D[rm]);
+}
+return 0;
+}
+break;
+default:
+break;
+}
+
 for (pass = 0; pass < (q ? 4 : 2); pass++) {
 
 if (pairwise) {
-- 
2.1.4




Re: [Qemu-devel] [PATCH 01/18] tcg: add support for 128bit vector type

2017-01-24 Thread Kirill Batuzov
On Mon, 23 Jan 2017, Richard Henderson wrote:

> On 01/23/2017 02:30 AM, Kirill Batuzov wrote:
> > Because 4 adds on 4 i32 registers work good only when the size of
> > vector elements matches the size of scalar variables we use for
> > representation of a vector. add_i16x8 will not be that great if we use
> > 4 i32 variables: each will need to be split into two values, processed
> > independently and merged back afterwards.
> 
> Certainly.  But that's pretty much exactly how they are processed now.  
> Usually
> via a helper function that accepts an i64 input as a pair of i32 arguments.
> 
> > Scalar variables lack primitives to work with them as vectors of shorter
> > values. This is one of the reasons I added v64 type instead of using i64
> > for 64-bit vector operations. And this is the reason I'm so opposed to
> > using them to represent vector types if vector registers are not
> > supported by host. Handling vector operations with element size that
> > does not match representation will be complicated, may require special
> > handling for different operations and will produce a lot of if-s in code.
> 
> A lot of if's?  I've no idea what you're talking about.
> 
> A v64 type makes sense because generally we're going to allocate them to a
> different register set than i64.  That said, i64 is perfectly adequate for
> implementing add_i8x8:
> 
>   t0  = in1 & 0x7f7f7f7f7f7f7f7f
>   t1  = in0 + t0;
>   t2  = in1 & 0x8080808080808080
>   out = t1 ^ t2
> 
> This is less expensive than addition by pieces if there are at least 4 pieces.
> 
> > The method I'm proposing can handle any operation regardless of
> > representation. This includes handling situation where host supports
> > vector registers but does not support required operation (for example 
> > SSE/AVX does not support multiplication of vectors of 8-bit values).
> 
> Not for nothing but it's trivial to expand with punpcklbw, punpckhbw, pmullw,
> pand, packuswb.  That said, if an expansion gets too complicated, it's still
> better to move it into a helper than expand 16 * (load, op, store).
>

I'm a bit lost in the discussion so let me try to summarise. As far as I
understand there is only one major point on which we disagree: is it
worth representing vector variables as sequences of scalar ones?

Pros:
1. We will not get phantom variables of unsupported type like we do in
my current implementation.

2. If we manage to efficiently emulate large enough number of vector
operations using scalar types we'll get some performance benefits. In
this case scalar variables can be allocated on registers and stay there
across several consecutive guest instructions.

I personally doubt the first "if": logical operations will be fine,
addition and subtraction can be implemented, maybe shifts, but
everything else will end up as helpers (and they are expensive
from a performance perspective).

Cons:
1. Additional cases for each possible representation in
tcg_global_mem_new_internal and tcg_temp_new_internal. I do not see how I
can use existing i64 as a pair of i32 recursively. TCG supports only one
level of indirection: there is a "type" of variable, and a "base_type"
it is used to represent. i64 code does not check "base_type" explicitly,
so if I pass two consecutive i32 variables to these functions they will
work, but this sounds like some dirty hack to me.

2. Additional cases for each possible representation in the
tcg_gen_* wrappers. We need to generate adequate expansion
code for each representation. That is, if we do not default to a memory
location every time (in which case why bother with different
representations to begin with).

3. TCG variable exhaustion: to (potentially) represent AVX-512
registers with 32-bit variables we'll need 512 of them (32 registers of
512 bits each). TCG_MAX_TEMPS is 512. Sure, it can be increased.

Making something a global variable is only beneficial when we can carry
its value in a register from one operation to another (so we'll get
ld+op1+op2+st instead of ld+op1+st+ld+op2+st). I'm not sure that the subset
of operations we can effectively emulate is large enough for this to
happen often, but my experience with vector operations is limited so it
might be.
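
For what it's worth, the byte-wise addition trick quoted above can be written
as a small self-contained helper; this is the usual SWAR variant that masks
both operands so that per-byte carries cannot escape their byte (a sketch,
not code from the series):

    #include <stdint.h>

    /* add_i8x8 emulated on a single 64-bit scalar */
    static uint64_t add_i8x8(uint64_t a, uint64_t b)
    {
        uint64_t lo7 = (a & 0x7f7f7f7f7f7f7f7fULL)
                     + (b & 0x7f7f7f7f7f7f7f7fULL); /* no carry past bit 7 */
        return lo7 ^ ((a ^ b) & 0x8080808080808080ULL); /* restore top bits */
    }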

Let's do the following: in v2 I'll add a representation of v128 as a pair
of v64 and update the tcg_gen_* wrappers. We'll see how this works
out and decide whether it is worth following up with a representation of
v128 as a sequence of scalar types.

-- 
Kirill



Re: [Qemu-devel] [PATCH 01/18] tcg: add support for 128bit vector type

2017-01-23 Thread Kirill Batuzov
On Sat, 21 Jan 2017, Richard Henderson wrote:

> On 01/19/2017 08:54 AM, Kirill Batuzov wrote:
> > 
> > Wrappers issue emulation code instead of operation if it is not supported by
> > host.
> > 
> > tcg_gen_add_i32x4 looks like this:
> > 
> > if (TCG_TARGET_HAS_add_i32x4) {
> > tcg_gen_op3_v128(INDEX_op_add_i32x4, args[0], args[1], args[2]);
> > } else {
> > for (i = 0; i < 4; i++) {
> > tcg_gen_ld_i32(...);
> > tcg_gen_ld_i32(...);
> > tcg_gen_add_i32(...);
> > tcg_gen_st_i32(...);
> > }
> > }
> 
> To me that begs the question of why you wouldn't issue 4 adds on 4 i32
> registers instead.
>

Because 4 adds on 4 i32 registers work well only when the size of the
vector elements matches the size of the scalar variables we use to
represent the vector. add_i16x8 will not be that great if we use
4 i32 variables: each will need to be split into two values, processed
independently and merged back afterwards. And when we create a variable we
do not know which operations will be performed on it.

Scalar variables lack primitives to work with them as vectors of shorter
values. This is one of the reasons I added v64 type instead of using i64
for 64-bit vector operations. And this is the reason I'm so opposed to
using them to represent vector types if vector registers are not
supported by host. Handling vector operations with element size that
does not match representation will be complicated, may require special
handling for different operations and will produce a lot of if-s in code.

The method I'm proposing can handle any operation regardless of
representation. This includes handling situation where host supports
vector registers but does not support required operation (for example 
SSE/AVX does not support multiplication of vectors of 8-bit values).

-- 
Kirill



Re: [Qemu-devel] [PATCH 01/18] tcg: add support for 128bit vector type

2017-01-19 Thread Kirill Batuzov

On 19.01.2017 18:09, Richard Henderson wrote:

On 01/19/2017 05:04 AM, Kirill Batuzov wrote:

On Wed, 18 Jan 2017, Richard Henderson wrote:


On 01/17/2017 01:07 AM, Kirill Batuzov wrote:

+static inline TCGv_v128 tcg_global_mem_new_v128(TCGv_ptr reg, intptr_t
offset,
+const char *name)
+{
+int idx = tcg_global_mem_new_internal(TCG_TYPE_V128, reg, offset,
name);
+return MAKE_TCGV_V128(idx);
+}


You shouldn't allow a v128 type to be created if the host doesn't
support it.


The idea here was to create it either way, but make sure no operation
will ever be issued if host does not support it (tcg_gen_* wrappers take
care of this).


Huh?  If you issue *no* operation, then how is the operation being
emulated?


Wrappers issue emulation code instead of the operation if it is not
supported by the host.


tcg_gen_add_i32x4 looks like this:

if (TCG_TARGET_HAS_add_i32x4) {
tcg_gen_op3_v128(INDEX_op_add_i32x4, args[0], args[1], args[2]);
} else {
for (i = 0; i < 4; i++) {
tcg_gen_ld_i32(...);
tcg_gen_ld_i32(...);
tcg_gen_add_i32(...);
tcg_gen_st_i32(...);
}
}

So no operation working directly with TCGV_v128 temp should appear 
anywhere in the intermediate representation unless host claims it 
supports it (in which case it must support 128-bit type as well).





I'm not sure about this last part. The host may not have i64, so there
should be another case - 4 x i32. So we'll get 4 cases for v128:


Recursively you'd get 4 x i32, but at least they'll be tagged
TCG_TYPE_I64, and be handled by the rest of the tcg code generator like
it should be.



v128
2 x v64
2 x i64
4 x i32

3 cases will need to be added to tcg_temp_new_internal and
tcg_global_new_mem_internal, two of which are rather useless (2 x i64,
4 x i32).
Introduction of v256 will add 4 more cases two of which will be useless
again. This sounds like too much code that serves no purpose to me.


Useless?  Surely you mean "used by hosts that don't implement v128".


I meant that a host that doesn't support the v128 type will not use these
variables. It'll use their memory locations instead, so it does not
matter how we represent them. The only TCG code that'll see them is the
tcg_gen_* wrappers, which know how to deal with them.


2 x v64 is a different story. We can generate much better emulation code
if we represent a v128 variable as a pair of v64 variables and work with
them as variables.




I think one of us is very confused about how you intend to generate
fallback code.  Perhaps any future patchset revision should update
tcg/README first.



Sure, I'll do it in v2.

--
Kirill



Re: [Qemu-devel] [PATCH 01/18] tcg: add support for 128bit vector type

2017-01-19 Thread Kirill Batuzov
On Wed, 18 Jan 2017, Richard Henderson wrote:

> On 01/17/2017 01:07 AM, Kirill Batuzov wrote:
> > +static inline TCGv_v128 tcg_global_mem_new_v128(TCGv_ptr reg, intptr_t
> > offset,
> > +const char *name)
> > +{
> > +int idx = tcg_global_mem_new_internal(TCG_TYPE_V128, reg, offset,
> > name);
> > +return MAKE_TCGV_V128(idx);
> > +}
> 
> You shouldn't allow a v128 type to be created if the host doesn't support it.

The idea here was to create it either way, but make sure no operation
will ever be issued if host does not support it (tcg_gen_* wrappers take
care of this).

> 
> You may want to treat v128 as a pair of v64 if the host supports that.
> Although there's limited applicability there, since only minor hosts (MIPS,
> Sparc, ia64) have 64-bit-only vector extensions.
> 
> That said, treating v128 as 2 x v64 scales nicely when we add v256.  Which, if
> we've already gone this far, is clearly how avx2 guest support should be
> implemented.
> 
> For hosts that have had no vector support added, you may want to represent
> v128 as 2 x i64, for the purpose of intermediate expansion.
>

I'm not sure about this last part. The host may not have i64, so there
should be another case - 4 x i32. So we'll get 4 cases for v128:

v128
2 x v64
2 x i64
4 x i32

3 cases will need to be added to tcg_temp_new_internal and
tcg_global_mem_new_internal, two of which are rather useless (2 x i64, 4 x i32).
Introduction of v256 will add 4 more cases, two of which will be useless
again. This sounds like too much code that serves no purpose to me.

Maybe we can only adapt the 2 x v64 (and later 2 x v128 and maybe 4 x v64)
cases and just generate a v128 temp that'll never be used if none of these
worked?

-- 
Kirill



Re: [Qemu-devel] [PATCH 10/18] tcg/i386: add support for vector opcodes

2017-01-18 Thread Kirill Batuzov
On Tue, 17 Jan 2017, Richard Henderson wrote:

> On 01/17/2017 01:07 AM, Kirill Batuzov wrote:
> > To be able to generate vector operations in a TCG backend we need to do
> > several things.
> > 
> > 1. We need to tell the register allocator about vector target's register.
> >In case of x86 we'll use xmm0..xmm7. xmm7 is designated as a scratch
> >register, others can be used by the register allocator.
> > 
> > 2. We need a new constraint to indicate where to use vector registers. In
> >this commit the 'V' constraint is introduced.
> > 
> > 3. We need to be able to generate bare minimum: load, store and reg-to-reg
> >move. MOVDQU is used for loads and stores. MOVDQA is used for reg-to-reg
> >moves.
> > 
> > 4. Finally we need to support any other opcodes we want. INDEX_op_add_i32x4
> >is the only one for now. The PADDD instruction handles it perfectly.
> > 
> > Signed-off-by: Kirill Batuzov 
> > ---
> >  tcg/i386/tcg-target.h |  24 +-
> >  tcg/i386/tcg-target.inc.c | 109
> > +++---
> >  2 files changed, 125 insertions(+), 8 deletions(-)
> > 
> > diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
> > index 524cfc6..974a58b 100644
> > --- a/tcg/i386/tcg-target.h
> > +++ b/tcg/i386/tcg-target.h
> > @@ -29,8 +29,14 @@
> >  #define TCG_TARGET_TLB_DISPLACEMENT_BITS 31
> > 
> >  #ifdef __x86_64__
> > -# define TCG_TARGET_REG_BITS  64
> > -# define TCG_TARGET_NB_REGS   16
> > +# define TCG_TARGET_HAS_REG128 1
> > +# ifdef TCG_TARGET_HAS_REG128
> > +#  define TCG_TARGET_REG_BITS  64
> > +#  define TCG_TARGET_NB_REGS   24
> > +# else
> > +#  define TCG_TARGET_REG_BITS  64
> > +#  define TCG_TARGET_NB_REGS   16
> > +# endif
> >  #else
> >  # define TCG_TARGET_REG_BITS  32
> >  # define TCG_TARGET_NB_REGS8
> > @@ -56,6 +62,16 @@ typedef enum {
> >  TCG_REG_R13,
> >  TCG_REG_R14,
> >  TCG_REG_R15,
> > +#ifdef TCG_TARGET_HAS_REG128
> > +TCG_REG_XMM0,
> > +TCG_REG_XMM1,
> > +TCG_REG_XMM2,
> > +TCG_REG_XMM3,
> > +TCG_REG_XMM4,
> > +TCG_REG_XMM5,
> > +TCG_REG_XMM6,
> > +TCG_REG_XMM7,
> > +#endif
> 
> There's no need to conditionalize this.  The registers can be always defined
> even if they're not used.  We really really really want to keep ifdefs to an
> absolute minimum.
> 
> Why are you not defining xmm8-15?

At first I thought about supporting both x86_64 and i386 targets, but
put this idea away (at least for the time being). Since defining xmm8-15
does not contradict anything (as I see it now) I'll add them too.

> 
> > @@ -634,9 +662,24 @@ static inline void tgen_arithr(TCGContext *s, int
> > subop, int dest, int src)
> >  static inline void tcg_out_mov(TCGContext *s, TCGType type,
> > TCGReg ret, TCGReg arg)
> >  {
> > +int opc;
> >  if (arg != ret) {
> > -int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
> > -tcg_out_modrm(s, opc, ret, arg);
> > +switch (type) {
> > +#ifdef TCG_TARGET_HAS_REG128
> > +case TCG_TYPE_V128:
> > +ret -= TCG_REG_XMM0;
> > +arg -= TCG_REG_XMM0;
> > +tcg_out_modrm(s, OPC_MOVDQA_R2R, ret, arg);
> > +break;
> > +#endif
> > +case TCG_TYPE_I32:
> > +case TCG_TYPE_I64:
> > +opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
> > +tcg_out_modrm(s, opc, ret, arg);
> > +break;
> > +default:
> > +assert(0);
> 
> g_assert_not_reached().
> 
> Again, no ifdefs.
> 
> We probably want to generate avx1 code when the cpu supports it, to avoid mode
> switches in the vector registers.  In this case, simply issue the same opcode,
> vex encoded.
> 
> > +#ifdef TCG_TARGET_HAS_REG128
> > +{ INDEX_op_add_i32x4, { "V", "0", "V" } },
> > +#endif
> 
> And, clearly, you need to rebase.
> 

I was too late to notice that some conflicting tcg-related pull has hit
master after my last rebase. Sorry. v2 will be rebased.

-- 
Kirill



[Qemu-devel] [PATCH 18/18] target/arm: load two consecutive 64-bits vector regs as a 128-bit vector reg

2017-01-17 Thread Kirill Batuzov
The ARM instruction set does not have loads to a 128-bit vector register
(q-reg). Instead it can read several consecutive 64-bit vector registers
(d-regs), which is what GCC uses to load 128-bit registers from memory.

For vector operations to work we need to detect such loads and transform them
into 128-bit loads to 128-bit temporaries.

Signed-off-by: Kirill Batuzov 
---
 target/arm/translate.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/target/arm/translate.c b/target/arm/translate.c
index 4378d44..8b28f77 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -4748,6 +4748,19 @@ static int disas_neon_ls_insn(DisasContext *s, uint32_t 
insn)
 tcg_gen_addi_i32(addr, addr, 1 << size);
 }
 if (size == 3) {
+#ifdef TCG_TARGET_HAS_REG128
+if (rd % 2 == 0 && nregs == 2) {
+/* 128-bit load */
+if (load) {
+tcg_gen_qemu_ld_v128(cpu_Q[rd / 2], addr,
+ get_mem_index(s), MO_LE | MO_128);
+} else {
+tcg_gen_qemu_st_v128(cpu_Q[rd / 2], addr,
+ get_mem_index(s), MO_LE | MO_128);
+}
+break;
+}
+#endif
 tmp64 = tcg_temp_new_i64();
 if (load) {
 gen_aa32_ld64(s, tmp64, addr, get_mem_index(s));
-- 
2.1.4




[Qemu-devel] [PATCH 07/18] tcg: add vector addition operations

2017-01-17 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/tcg-op.h  | 169 ++
 tcg/tcg-opc.h |  12 +
 tcg/tcg.h |  29 ++
 3 files changed, 210 insertions(+)

diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index c469ea3..5de74d3 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -1153,6 +1153,8 @@ void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, 
TCGv_i64, TCGArg, TCGMemOp);
 tcg_gen_add_i32(TCGV_PTR_TO_NAT(R), TCGV_PTR_TO_NAT(A), TCGV_PTR_TO_NAT(B))
 # define tcg_gen_addi_ptr(R, A, B) \
 tcg_gen_addi_i32(TCGV_PTR_TO_NAT(R), TCGV_PTR_TO_NAT(A), (B))
+# define tcg_gen_movi_ptr(R, B) \
+tcg_gen_movi_i32(TCGV_PTR_TO_NAT(R), (B))
 # define tcg_gen_ext_i32_ptr(R, A) \
 tcg_gen_mov_i32(TCGV_PTR_TO_NAT(R), (A))
 #else
@@ -1164,6 +1166,173 @@ void tcg_gen_atomic_xor_fetch_i64(TCGv_i64, TCGv, 
TCGv_i64, TCGArg, TCGMemOp);
 tcg_gen_add_i64(TCGV_PTR_TO_NAT(R), TCGV_PTR_TO_NAT(A), TCGV_PTR_TO_NAT(B))
 # define tcg_gen_addi_ptr(R, A, B) \
 tcg_gen_addi_i64(TCGV_PTR_TO_NAT(R), TCGV_PTR_TO_NAT(A), (B))
+# define tcg_gen_movi_ptr(R, B) \
+tcg_gen_movi_i64(TCGV_PTR_TO_NAT(R), (B))
 # define tcg_gen_ext_i32_ptr(R, A) \
 tcg_gen_ext_i32_i64(TCGV_PTR_TO_NAT(R), (A))
 #endif /* UINTPTR_MAX == UINT32_MAX */
+
+/***/
+/* 64-bit and 128-bit vector arithmetic.  */
+
+static inline void *tcg_v128_swap_slot(int n)
+{
+return &tcg_ctx.v128_swap[n * 16];
+}
+
+/* Find a memory location for 128-bit TCG variable. */
+static inline void tcg_v128_to_ptr(TCGv_v128 tmp, TCGv_ptr base, int slot,
+   TCGv_ptr *real_base, intptr_t *real_offset,
+   int is_read)
+{
+int idx = GET_TCGV_V128(tmp);
+assert(idx >= 0 && idx < tcg_ctx.nb_temps);
+if (idx < tcg_ctx.nb_globals) {
+/* Globals use their locations within CPUArchState. */
+int env = GET_TCGV_PTR(tcg_ctx.tcg_env);
+TCGTemp *ts_env = &tcg_ctx.temps[env];
+TCGTemp *ts_arg = &tcg_ctx.temps[idx];
+
+/* Sanity checks: global's memory locations must be addressed
+   relative to ENV. */
+assert(ts_env->val_type == TEMP_VAL_REG &&
+   ts_env == ts_arg->mem_base &&
+   ts_arg->mem_allocated);
+
+*real_base = tcg_ctx.tcg_env;
+*real_offset = ts_arg->mem_offset;
+} else {
+/* Temporaries use swap space in TCGContext. Since we already have
+   a 128-bit temporary we'll assume that the target supports 128-bit
+   loads and stores. */
+*real_base = base;
+*real_offset = slot * 16;
+if (is_read) {
+tcg_gen_st_v128(tmp, base, slot * 16);
+}
+}
+}
+
+/* Find a memory location for 64-bit vector TCG variable. */
+static inline void tcg_v64_to_ptr(TCGv_v64 tmp, TCGv_ptr base, int slot,
+  TCGv_ptr *real_base, intptr_t *real_offset,
+  int is_read)
+{
+int idx = GET_TCGV_V64(tmp);
+assert(idx >= 0 && idx < tcg_ctx.nb_temps);
+if (idx < tcg_ctx.nb_globals) {
+/* Globals use their locations within CPUArchState. */
+int env = GET_TCGV_PTR(tcg_ctx.tcg_env);
+TCGTemp *ts_env = &tcg_ctx.temps[env];
+TCGTemp *ts_arg = &tcg_ctx.temps[idx];
+
+/* Sanity checks: global's memory locations must be addressed
+   relative to ENV. */
+assert(ts_env->val_type == TEMP_VAL_REG &&
+   ts_env == ts_arg->mem_base &&
+   ts_arg->mem_allocated);
+
+*real_base = tcg_ctx.tcg_env;
+*real_offset = ts_arg->mem_offset;
+} else {
+/* Temporaries use swap space in TCGContext. Since we already have
+   a 128-bit temporary we'll assume that the target supports 128-bit
+   loads and stores. */
+*real_base = base;
+*real_offset = slot * 16;
+if (is_read) {
+tcg_gen_st_v64(tmp, base, slot * 16);
+}
+}
+}
+
+#define GEN_VECT_WRAPPER(name, type, func)   \
+static inline void glue(tcg_gen_, name)(glue(TCGv_, type) res,   \
+glue(TCGv_, type) arg1,  \
+glue(TCGv_, type) arg2)  \
+{\
+if (glue(TCG_TARGET_HAS_, name)) {   \
+glue(tcg_gen_op3_, type)(glue(INDEX_op_, name), res, arg1,   \
+ arg2);  \
+} else { \
+TCGv_ptr base = tcg_temp_new_ptr(); 

[Qemu-devel] [PATCH 12/18] tcg/i386: support remaining vector addition operations

2017-01-17 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/i386/tcg-target.h | 10 ++
 tcg/i386/tcg-target.inc.c | 37 +
 2 files changed, 47 insertions(+)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 849b339..5deb08e 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -151,7 +151,17 @@ extern bool have_bmi1;
 #endif
 
 #ifdef TCG_TARGET_HAS_REG128
+#define TCG_TARGET_HAS_add_i8x161
+#define TCG_TARGET_HAS_add_i16x81
 #define TCG_TARGET_HAS_add_i32x41
+#define TCG_TARGET_HAS_add_i64x21
+#endif
+
+#ifdef TCG_TARGET_HAS_REGV64
+#define TCG_TARGET_HAS_add_i8x8 1
+#define TCG_TARGET_HAS_add_i16x41
+#define TCG_TARGET_HAS_add_i32x21
+#define TCG_TARGET_HAS_add_i64x11
 #endif
 
 #define TCG_TARGET_deposit_i32_valid(ofs, len) \
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index a2d5e09..d00bd12 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -377,7 +377,10 @@ static inline int tcg_target_const_match(tcg_target_long 
val, TCGType type,
 #define OPC_MOVQ_M2R(0x7e | P_SSE_F30F)
 #define OPC_MOVQ_R2M(0xd6 | P_SSE_660F)
 #define OPC_MOVQ_R2R(0xd6 | P_SSE_660F)
+#define OPC_PADDB   (0xfc | P_SSE_660F)
+#define OPC_PADDW   (0xfd | P_SSE_660F)
 #define OPC_PADDD   (0xfe | P_SSE_660F)
+#define OPC_PADDQ   (0xd4 | P_SSE_660F)
 
 /* Group 1 opcode extensions for 0x80-0x83.
These are also used as modifiers for OPC_ARITH.  */
@@ -2251,9 +2254,33 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode 
opc,
 break;
 
 #ifdef TCG_TARGET_HAS_REG128
+case INDEX_op_add_i8x16:
+tcg_out_modrm(s, OPC_PADDB, args[0], args[2]);
+break;
+case INDEX_op_add_i16x8:
+tcg_out_modrm(s, OPC_PADDW, args[0], args[2]);
+break;
 case INDEX_op_add_i32x4:
 tcg_out_modrm(s, OPC_PADDD, args[0], args[2]);
 break;
+case INDEX_op_add_i64x2:
+tcg_out_modrm(s, OPC_PADDQ, args[0], args[2]);
+break;
+#endif
+
+#ifdef TCG_TARGET_HAS_REGV64
+case INDEX_op_add_i8x8:
+tcg_out_modrm(s, OPC_PADDB, args[0], args[2]);
+break;
+case INDEX_op_add_i16x4:
+tcg_out_modrm(s, OPC_PADDW, args[0], args[2]);
+break;
+case INDEX_op_add_i32x2:
+tcg_out_modrm(s, OPC_PADDD, args[0], args[2]);
+break;
+case INDEX_op_add_i64x1:
+tcg_out_modrm(s, OPC_PADDQ, args[0], args[2]);
+break;
 #endif
 
 case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
@@ -2411,7 +2438,17 @@ static const TCGTargetOpDef x86_op_defs[] = {
 #endif
 
 #ifdef TCG_TARGET_HAS_REG128
+{ INDEX_op_add_i8x16, { "V", "0", "V" } },
+{ INDEX_op_add_i16x8, { "V", "0", "V" } },
 { INDEX_op_add_i32x4, { "V", "0", "V" } },
+{ INDEX_op_add_i64x2, { "V", "0", "V" } },
+#endif
+
+#ifdef TCG_TARGET_HAS_REGV64
+{ INDEX_op_add_i8x8, { "V", "0", "V" } },
+{ INDEX_op_add_i16x4, { "V", "0", "V" } },
+{ INDEX_op_add_i32x2, { "V", "0", "V" } },
+{ INDEX_op_add_i64x1, { "V", "0", "V" } },
 #endif
 { -1 },
 };
-- 
2.1.4




[Qemu-devel] [PATCH 09/18] target/arm: use vector opcode to handle vadd. instruction

2017-01-17 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 target/arm/translate.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/target/arm/translate.c b/target/arm/translate.c
index 2b81b5d..4378d44 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -5666,6 +5666,37 @@ static int disas_neon_data_insn(DisasContext *s, 
uint32_t insn)
 return 1;
 }
 
+/* Use vector ops to handle what we can */
+switch (op) {
+case NEON_3R_VADD_VSUB:
+if (!u) {
+void (* const gen_add_v128[])(TCGv_v128, TCGv_v128,
+ TCGv_v128) = {
+tcg_gen_add_i8x16,
+tcg_gen_add_i16x8,
+tcg_gen_add_i32x4,
+tcg_gen_add_i64x2
+};
+void (* const gen_add_v64[])(TCGv_v64, TCGv_v64,
+ TCGv_v64) = {
+tcg_gen_add_i8x8,
+tcg_gen_add_i16x4,
+tcg_gen_add_i32x2,
+tcg_gen_add_i64x1
+};
+if (q) {
+gen_add_v128[size](cpu_Q[rd >> 1], cpu_Q[rn >> 1],
+   cpu_Q[rm >> 1]);
+} else {
+gen_add_v64[size](cpu_D[rd], cpu_D[rn], cpu_D[rm]);
+}
+return 0;
+}
+break;
+default:
+break;
+}
+
 for (pass = 0; pass < (q ? 4 : 2); pass++) {
 
 if (pairwise) {
-- 
2.1.4




[Qemu-devel] [PATCH 11/18] tcg/i386: support 64-bit vector operations

2017-01-17 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/i386/tcg-target.h |  1 +
 tcg/i386/tcg-target.inc.c | 27 +++
 2 files changed, 28 insertions(+)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 974a58b..849b339 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -30,6 +30,7 @@
 
 #ifdef __x86_64__
 # define TCG_TARGET_HAS_REG128 1
+# define TCG_TARGET_HAS_REGV64 1
 # ifdef TCG_TARGET_HAS_REG128
 #  define TCG_TARGET_REG_BITS  64
 #  define TCG_TARGET_NB_REGS   24
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 69e3198..a2d5e09 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -374,6 +374,9 @@ static inline int tcg_target_const_match(tcg_target_long 
val, TCGType type,
 #define OPC_MOVDQU_M2R  (0x6f | P_SSE_F30F)  /* store 128-bit value */
 #define OPC_MOVDQU_R2M  (0x7f | P_SSE_F30F)  /* load 128-bit value */
 #define OPC_MOVDQA_R2R  (0x6f | P_SSE_660F)  /* reg-to-reg 128-bit mov */
+#define OPC_MOVQ_M2R(0x7e | P_SSE_F30F)
+#define OPC_MOVQ_R2M(0xd6 | P_SSE_660F)
+#define OPC_MOVQ_R2R(0xd6 | P_SSE_660F)
 #define OPC_PADDD   (0xfe | P_SSE_660F)
 
 /* Group 1 opcode extensions for 0x80-0x83.
@@ -672,6 +675,13 @@ static inline void tcg_out_mov(TCGContext *s, TCGType type,
 tcg_out_modrm(s, OPC_MOVDQA_R2R, ret, arg);
 break;
 #endif
+#ifdef TCG_TARGET_HAS_REGV64
+case TCG_TYPE_V64:
+ret -= TCG_REG_XMM0;
+arg -= TCG_REG_XMM0;
+tcg_out_modrm(s, OPC_MOVQ_R2R, ret, arg);
+break;
+#endif
 case TCG_TYPE_I32:
 case TCG_TYPE_I64:
 opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
@@ -762,6 +772,12 @@ static inline void tcg_out_ld(TCGContext *s, TCGType type, 
TCGReg ret,
 tcg_out_modrm_offset(s, OPC_MOVDQU_M2R, ret, arg1, arg2);
 break;
 #endif
+#ifdef TCG_TARGET_HAS_REGV64
+case TCG_TYPE_V64:
+ret -= TCG_REG_XMM0;
+tcg_out_modrm_offset(s, OPC_MOVQ_M2R, ret, arg1, arg2);
+break;
+#endif
 case TCG_TYPE_I32:
 case TCG_TYPE_I64:
 opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
@@ -783,6 +799,12 @@ static inline void tcg_out_st(TCGContext *s, TCGType type, 
TCGReg arg,
 tcg_out_modrm_offset(s, OPC_MOVDQU_R2M, arg, arg1, arg2);
 break;
 #endif
+#ifdef TCG_TARGET_HAS_REGV64
+case TCG_TYPE_V64:
+arg -= TCG_REG_XMM0;
+tcg_out_modrm_offset(s, OPC_MOVQ_R2M, arg, arg1, arg2);
+break;
+#endif
 case TCG_TYPE_I32:
 case TCG_TYPE_I64:
 opc = OPC_MOVL_EvGv + (type == TCG_TYPE_I64 ? P_REXW : 0);
@@ -2264,6 +2286,11 @@ static const TCGTargetOpDef x86_op_defs[] = {
 { INDEX_op_st_v128, { "V", "r" } },
 #endif
 
+#ifdef TCG_TARGET_HAS_REGV64
+{ INDEX_op_ld_v64, { "V", "r" } },
+{ INDEX_op_st_v64, { "V", "r" } },
+#endif
+
 { INDEX_op_add_i32, { "r", "r", "ri" } },
 { INDEX_op_sub_i32, { "r", "0", "ri" } },
 { INDEX_op_mul_i32, { "r", "0", "ri" } },
-- 
2.1.4




[Qemu-devel] [PATCH 04/18] tcg: add simple alias analysis

2017-01-17 Thread Kirill Batuzov
Add a simple alias analysis to TCG which finds memory loads and stores
that overlap with CPUArchState. This information can be used later in liveness
analysis to ensure correctness of register allocation. In particular, if a load
or store overlaps with the memory location of some global variable, this
variable should be spilled and reloaded at appropriate times.

Previously no such analysis was performed, and for correctness reasons it was
required that no load/store operations overlap with memory locations of global
variables.
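
As an illustration of the bookkeeping (names here are made up for the sketch,
not taken from the patch), the analysis only needs to know which temps hold
env plus a compile-time constant:

    #include <stdbool.h>
    #include <stdint.h>

    /* per-temp state: is the temp known to be env + constant offset? */
    typedef struct {
        bool is_base;
        intptr_t off;
    } BaseInfo;

    /* mov dst, src */
    static void track_mov(BaseInfo *dst, const BaseInfo *src)
    {
        *dst = *src;
    }

    /* add dst, src, #imm (sub is the same with a negated immediate) */
    static void track_addi(BaseInfo *dst, const BaseInfo *src, intptr_t imm)
    {
        dst->is_base = src->is_base;
        dst->off = src->off + imm;
    }

    /* st_v128 val, base, #ofs: if base is env + k, the op writes the
       16 bytes at [env + k + ofs, env + k + ofs + 16) and can be matched
       against the memory locations of globals; otherwise nothing is known
       and the access must be treated as possibly aliasing anything. */
    static bool classify_store_v128(const BaseInfo *base, intptr_t ofs,
                                    intptr_t *start, intptr_t *size)
    {
        if (!base->is_base) {
            return false;
        }
        *start = base->off + ofs;
        *size = 16;
        return true;
    }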

Signed-off-by: Kirill Batuzov 
---

checkpatch complains here, but I believe this to be a false positive.

---
 tcg/optimize.c | 146 +
 tcg/tcg.h  |  17 +++
 2 files changed, 163 insertions(+)

diff --git a/tcg/optimize.c b/tcg/optimize.c
index 0f13490..1d0eac2 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -34,6 +34,7 @@
 
 struct tcg_temp_info {
 bool is_const;
+bool is_base;
 uint16_t prev_copy;
 uint16_t next_copy;
 tcg_target_ulong val;
@@ -61,6 +62,7 @@ static void reset_temp(TCGArg temp)
 temps[temp].next_copy = temp;
 temps[temp].prev_copy = temp;
 temps[temp].is_const = false;
+temps[temp].is_base = false;
 temps[temp].mask = -1;
 }
 
@@ -1335,3 +1337,147 @@ void tcg_optimize(TCGContext *s)
 }
 }
 }
+
+/* Simple alias analysis. It finds out which load/store operations overlap
+   with CPUArchState. The result is stored in TCGContext and can be used
+   during liveness analysis and register allocation. */
+void tcg_alias_analysis(TCGContext *s)
+{
+int oi, oi_next;
+
+reset_all_temps(s->nb_temps);
+temps[GET_TCGV_PTR(s->tcg_env)].is_base = true;
+temps[GET_TCGV_PTR(s->tcg_env)].val = 0;
+
+for (oi = s->gen_op_buf[0].next; oi != 0; oi = oi_next) {
+int nb_oargs, i;
+int size;
+TCGAliasType tp;
+
+TCGOp * const op = &s->gen_op_buf[oi];
+TCGArg * const args = &s->gen_opparam_buf[op->args];
+TCGOpcode opc = op->opc;
+const TCGOpDef *def = &tcg_op_defs[opc];
+
+oi_next = op->next;
+
+if (opc == INDEX_op_call) {
+nb_oargs = op->callo;
+} else {
+nb_oargs = def->nb_oargs;
+}
+
+s->alias_info[oi] = (TCGAliasInfo){
+TCG_NOT_ALIAS,
+false,
+0,
+0
+};
+
+switch (opc) {
+CASE_OP_32_64(movi):
+temps[args[0]].is_const = 1;
+temps[args[0]].val = args[1];
+break;
+CASE_OP_32_64(mov):
+temps[args[0]].is_const = temps[args[1]].is_const;
+temps[args[0]].is_base = temps[args[1]].is_base;
+temps[args[0]].val = temps[args[1]].val;
+break;
+CASE_OP_32_64(add):
+CASE_OP_32_64(sub):
+if (temps[args[1]].is_base && temps[args[2]].is_const) {
+temps[args[0]].is_base = true;
+temps[args[0]].is_const = false;
+temps[args[0]].val =
+do_constant_folding(opc, temps[args[1]].val,
+temps[args[2]].val);
+} else {
+reset_temp(args[0]);
+}
+CASE_OP_32_64(ld8s):
+CASE_OP_32_64(ld8u):
+size = 1;
+tp = TCG_ALIAS_READ;
+goto do_ldst;
+CASE_OP_32_64(ld16s):
+CASE_OP_32_64(ld16u):
+size = 2;
+tp = TCG_ALIAS_READ;
+goto do_ldst;
+case INDEX_op_ld_i32:
+case INDEX_op_ld32s_i64:
+case INDEX_op_ld32u_i64:
+size = 4;
+tp = TCG_ALIAS_READ;
+goto do_ldst;
+case INDEX_op_ld_i64:
+size = 8;
+tp = TCG_ALIAS_READ;
+goto do_ldst;
+case INDEX_op_ld_v128:
+size = 16;
+tp = TCG_ALIAS_READ;
+goto do_ldst;
+CASE_OP_32_64(st8):
+size = 1;
+tp = TCG_ALIAS_WRITE;
+goto do_ldst;
+CASE_OP_32_64(st16):
+size = 2;
+tp = TCG_ALIAS_WRITE;
+goto do_ldst;
+case INDEX_op_st_i32:
+case INDEX_op_st32_i64:
+size = 4;
+tp = TCG_ALIAS_WRITE;
+goto do_ldst;
+case INDEX_op_st_i64:
+size = 8;
+tp = TCG_ALIAS_WRITE;
+goto do_ldst;
+case INDEX_op_st_v128:
+size = 16;
+tp = TCG_ALIAS_WRITE;
+goto do_ldst;
+do_ldst:
+if (temps[args[1]].is_base) {
+TCGArg val;
+#if TCG_TARGET_REG_BITS == 32
+val = do_constant_folding(INDEX_op_add_i32,
+  temps[args[1]].val,
+  args[2]);
+#else
+  

[Qemu-devel] [PATCH 03/18] tcg: add ld_v128, ld_v64, st_v128 and st_v64 opcodes

2017-01-17 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/tcg-op.h  | 38 ++
 tcg/tcg-opc.h | 18 ++
 2 files changed, 56 insertions(+)

diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 173fb24..c469ea3 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -489,6 +489,44 @@ static inline void tcg_gen_discard_v64(TCGv_v64 arg)
 tcg_gen_op1_v64(INDEX_op_discard, arg);
 }
 
+static inline void tcg_gen_ldst_op_v128(TCGOpcode opc, TCGv_v128 val,
+   TCGv_ptr base, TCGArg offset)
+{
+tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_V128(val), GET_TCGV_PTR(base),
+offset);
+}
+
+static inline void tcg_gen_st_v128(TCGv_v128 arg1, TCGv_ptr arg2,
+   tcg_target_long offset)
+{
+tcg_gen_ldst_op_v128(INDEX_op_st_v128, arg1, arg2, offset);
+}
+
+static inline void tcg_gen_ld_v128(TCGv_v128 ret, TCGv_ptr arg2,
+   tcg_target_long offset)
+{
+tcg_gen_ldst_op_v128(INDEX_op_ld_v128, ret, arg2, offset);
+}
+
+static inline void tcg_gen_ldst_op_v64(TCGOpcode opc, TCGv_v64 val,
+   TCGv_ptr base, TCGArg offset)
+{
+tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_V64(val), GET_TCGV_PTR(base),
+offset);
+}
+
+static inline void tcg_gen_st_v64(TCGv_v64 arg1, TCGv_ptr arg2,
+  tcg_target_long offset)
+{
+tcg_gen_ldst_op_v64(INDEX_op_st_v64, arg1, arg2, offset);
+}
+
+static inline void tcg_gen_ld_v64(TCGv_v64 ret, TCGv_ptr arg2,
+  tcg_target_long offset)
+{
+tcg_gen_ldst_op_v64(INDEX_op_ld_v64, ret, arg2, offset);
+}
+
 /* 64 bit ops */
 
 void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index 45528d2..d622592 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -42,6 +42,18 @@ DEF(br, 0, 0, 1, TCG_OPF_BB_END)
 # define IMPL64  TCG_OPF_64BIT
 #endif
 
+#ifdef TCG_TARGET_HAS_REG128
+# define IMPL128 0
+#else
+# define IMPL128 TCG_OPF_NOT_PRESENT
+#endif
+
+#ifdef TCG_TARGET_HAS_REGV64
+# define IMPLV64 0
+#else
+# define IMPLV64 TCG_OPF_NOT_PRESENT
+#endif
+
 DEF(mb, 0, 0, 1, 0)
 
 DEF(mov_i32, 1, 1, 0, TCG_OPF_NOT_PRESENT)
@@ -178,6 +190,12 @@ DEF(mulsh_i64, 1, 2, 0, IMPL(TCG_TARGET_HAS_mulsh_i64))
 #define TLADDR_ARGS  (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? 1 : 2)
 #define DATA64_ARGS  (TCG_TARGET_REG_BITS == 64 ? 1 : 2)
 
+/* load/store */
+DEF(st_v128, 0, 2, 1, IMPL128)
+DEF(ld_v128, 1, 1, 1, IMPL128)
+DEF(st_v64, 0, 2, 1, IMPLV64)
+DEF(ld_v64, 1, 1, 1, IMPLV64)
+
 /* QEMU specific */
 DEF(insn_start, 0, 0, TLADDR_ARGS * TARGET_INSN_START_WORDS,
 TCG_OPF_NOT_PRESENT)
-- 
2.1.4




[Qemu-devel] [PATCH 01/18] tcg: add support for 128bit vector type

2017-01-17 Thread Kirill Batuzov
Introduce TCG_TYPE_V128 and corresponding TCGv_v128 for TCG temps. Add helper
functions that work with temps of this new type.

Signed-off-by: Kirill Batuzov 
---
 tcg/tcg-op.h | 24 
 tcg/tcg.c| 13 +
 tcg/tcg.h| 34 ++
 3 files changed, 71 insertions(+)

diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 6d044b7..df077d6 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -248,6 +248,23 @@ static inline void tcg_gen_op6ii_i64(TCGOpcode opc, 
TCGv_i64 a1, TCGv_i64 a2,
 GET_TCGV_I64(a3), GET_TCGV_I64(a4), a5, a6);
 }
 
+static inline void tcg_gen_op1_v128(TCGOpcode opc, TCGv_v128 a1)
+{
+tcg_gen_op1(&tcg_ctx, opc, GET_TCGV_V128(a1));
+}
+
+static inline void tcg_gen_op2_v128(TCGOpcode opc, TCGv_v128 a1,
+TCGv_v128 a2)
+{
+tcg_gen_op2(&tcg_ctx, opc, GET_TCGV_V128(a1), GET_TCGV_V128(a2));
+}
+
+static inline void tcg_gen_op3_v128(TCGOpcode opc, TCGv_v128 a1,
+TCGv_v128 a2, TCGv_v128 a3)
+{
+tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_V128(a1), GET_TCGV_V128(a2),
+GET_TCGV_V128(a3));
+}
 
 /* Generic ops.  */
 
@@ -442,6 +459,13 @@ static inline void tcg_gen_not_i32(TCGv_i32 ret, TCGv_i32 
arg)
 }
 }
 
+/* Vector ops */
+
+static inline void tcg_gen_discard_v128(TCGv_v128 arg)
+{
+tcg_gen_op1_v128(INDEX_op_discard, arg);
+}
+
 /* 64 bit ops */
 
 void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
diff --git a/tcg/tcg.c b/tcg/tcg.c
index aabf94f..b20a044 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -637,6 +637,14 @@ TCGv_i64 tcg_temp_new_internal_i64(int temp_local)
 return MAKE_TCGV_I64(idx);
 }
 
+TCGv_v128 tcg_temp_new_internal_v128(int temp_local)
+{
+int idx;
+
+idx = tcg_temp_new_internal(TCG_TYPE_V128, temp_local);
+return MAKE_TCGV_V128(idx);
+}
+
 static void tcg_temp_free_internal(int idx)
 {
 TCGContext *s = &tcg_ctx;
@@ -669,6 +677,11 @@ void tcg_temp_free_i64(TCGv_i64 arg)
 tcg_temp_free_internal(GET_TCGV_I64(arg));
 }
 
+void tcg_temp_free_v128(TCGv_v128 arg)
+{
+tcg_temp_free_internal(GET_TCGV_V128(arg));
+}
+
 TCGv_i32 tcg_const_i32(int32_t val)
 {
 TCGv_i32 t0;
diff --git a/tcg/tcg.h b/tcg/tcg.h
index a35e4c4..b9aa56b 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -235,6 +235,7 @@ typedef struct TCGPool {
 typedef enum TCGType {
 TCG_TYPE_I32,
 TCG_TYPE_I64,
+TCG_TYPE_V128,
 TCG_TYPE_COUNT, /* number of different types */
 
 /* An alias for the size of the host register.  */
@@ -410,6 +411,7 @@ typedef tcg_target_ulong TCGArg;
 typedef struct TCGv_i32_d *TCGv_i32;
 typedef struct TCGv_i64_d *TCGv_i64;
 typedef struct TCGv_ptr_d *TCGv_ptr;
+typedef struct TCGv_v128_d *TCGv_v128;
 typedef TCGv_ptr TCGv_env;
 #if TARGET_LONG_BITS == 32
 #define TCGv TCGv_i32
@@ -434,6 +436,11 @@ static inline TCGv_ptr QEMU_ARTIFICIAL 
MAKE_TCGV_PTR(intptr_t i)
 return (TCGv_ptr)i;
 }
 
+static inline TCGv_v128 QEMU_ARTIFICIAL MAKE_TCGV_V128(intptr_t i)
+{
+return (TCGv_v128)i;
+}
+
 static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_I32(TCGv_i32 t)
 {
 return (intptr_t)t;
@@ -449,6 +456,11 @@ static inline intptr_t QEMU_ARTIFICIAL 
GET_TCGV_PTR(TCGv_ptr t)
 return (intptr_t)t;
 }
 
+static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_V128(TCGv_v128 t)
+{
+return (intptr_t)t;
+}
+
 #if TCG_TARGET_REG_BITS == 32
 #define TCGV_LOW(t) MAKE_TCGV_I32(GET_TCGV_I64(t))
 #define TCGV_HIGH(t) MAKE_TCGV_I32(GET_TCGV_I64(t) + 1)
@@ -456,15 +468,18 @@ static inline intptr_t QEMU_ARTIFICIAL 
GET_TCGV_PTR(TCGv_ptr t)
 
 #define TCGV_EQUAL_I32(a, b) (GET_TCGV_I32(a) == GET_TCGV_I32(b))
 #define TCGV_EQUAL_I64(a, b) (GET_TCGV_I64(a) == GET_TCGV_I64(b))
+#define TCGV_EQUAL_V128(a, b) (GET_TCGV_V128(a) == GET_TCGV_V128(b))
 #define TCGV_EQUAL_PTR(a, b) (GET_TCGV_PTR(a) == GET_TCGV_PTR(b))
 
 /* Dummy definition to avoid compiler warnings.  */
 #define TCGV_UNUSED_I32(x) x = MAKE_TCGV_I32(-1)
 #define TCGV_UNUSED_I64(x) x = MAKE_TCGV_I64(-1)
+#define TCGV_UNUSED_V128(x) x = MAKE_TCGV_V128(-1)
 #define TCGV_UNUSED_PTR(x) x = MAKE_TCGV_PTR(-1)
 
 #define TCGV_IS_UNUSED_I32(x) (GET_TCGV_I32(x) == -1)
 #define TCGV_IS_UNUSED_I64(x) (GET_TCGV_I64(x) == -1)
+#define TCGV_IS_UNUSED_V128(x) (GET_TCGV_V128(x) == -1)
 #define TCGV_IS_UNUSED_PTR(x) (GET_TCGV_PTR(x) == -1)
 
 /* call flags */
@@ -787,9 +802,11 @@ TCGv_i64 tcg_global_reg_new_i64(TCGReg reg, const char 
*name);
 
 TCGv_i32 tcg_temp_new_internal_i32(int temp_local);
 TCGv_i64 tcg_temp_new_internal_i64(int temp_local);
+TCGv_v128 tcg_temp_new_internal_v128(int temp_local);
 
 void tcg_temp_free_i32(TCGv_i32 arg);
 void tcg_temp_free_i64(TCGv_i64 arg);
+void tcg_temp_free_v128(TCGv_v128 arg);
 
 static inline TCGv_i32 tcg_global_mem_new_i32(TCGv_ptr reg, intptr_t offset,
   const char *name)
@@ -825,6 +842,23 @@ static inline TCGv_i64 tcg_temp_local_new_i64(v

[Qemu-devel] [PATCH 17/18] tcg/i386: add support for qemu_ld_v128/qemu_st_v128 ops

2017-01-17 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/i386/tcg-target.inc.c | 63 +--
 1 file changed, 56 insertions(+), 7 deletions(-)

diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index c28fd09..a48da20 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -1265,6 +1265,7 @@ static void * const qemu_ld_helpers[] = {
 [MO_BEUW] = helper_be_lduw_mmu,
 [MO_BEUL] = helper_be_ldul_mmu,
 [MO_BEQ]  = helper_be_ldq_mmu,
+[MO_128]  = helper_te_ldv128_mmu,
 };
 
 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
@@ -1278,6 +1279,7 @@ static void * const qemu_st_helpers[] = {
 [MO_BEUW] = helper_be_stw_mmu,
 [MO_BEUL] = helper_be_stl_mmu,
 [MO_BEQ]  = helper_be_stq_mmu,
+[MO_128]  = helper_te_stv128_mmu,
 };
 
 /* Perform the TLB load and compare.
@@ -1444,12 +1446,27 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 ofs += 4;
 
 tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
+
+if ((opc & MO_SSIZE) == MO_128) {
+ofs += 4;
+tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)s->v128_swap,
+TCG_REG_ESP, ofs);
+}
 } else {
 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
 /* The second argument is already loaded with addrlo.  */
 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
 tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
  (uintptr_t)l->raddr);
+if ((opc & MO_SSIZE) == MO_128) {
+if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
+tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[4],
+ (uintptr_t)s->v128_swap);
+} else {
+tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)s->v128_swap,
+TCG_REG_ESP, TCG_TARGET_CALL_STACK_OFFSET);
+}
+}
 }
 
 tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
@@ -1485,6 +1502,10 @@ static void tcg_out_qemu_ld_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
 }
 break;
+case MO_128:
+tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, (uintptr_t)s->v128_swap);
+tcg_out_ld(s, TCG_TYPE_V128, l->datalo_reg, TCG_REG_EAX, 0);
+break;
 default:
 tcg_abort();
 }
@@ -1524,12 +1545,19 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 ofs += 4;
 }
 
-tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
-ofs += 4;
-
-if (s_bits == MO_64) {
-tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
+if (s_bits == MO_128) {
+tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, 
(uintptr_t)s->v128_swap);
+tcg_out_st(s, TCG_TYPE_V128, l->datalo_reg, TCG_REG_EAX, 0);
+tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_EAX, TCG_REG_ESP, ofs);
+ofs += 4;
+} else {
+tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
 ofs += 4;
+
+if (s_bits == MO_64) {
+tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
+ofs += 4;
+}
 }
 
 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
@@ -1541,8 +1569,16 @@ static void tcg_out_qemu_st_slow_path(TCGContext *s, 
TCGLabelQemuLdst *l)
 } else {
 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
 /* The second argument is already loaded with addrlo.  */
-tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
-tcg_target_call_iarg_regs[2], l->datalo_reg);
+if (s_bits == MO_128) {
+tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_RAX,
+ (uintptr_t)s->v128_swap);
+tcg_out_st(s, TCG_TYPE_V128, l->datalo_reg, TCG_REG_RAX, 0);
+tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[2],
+TCG_REG_RAX);
+} else {
+tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
+tcg_target_call_iarg_regs[2], l->datalo_reg);
+}
 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
 
 if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
@@ -1674,6 +1710,10 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg 
datalo, TCGReg datahi,
 }
 }
 break;
+case MO_128:
+tcg_out_modrm_sib_offset(s, OPC_MOVDQU_M2R + seg, datalo,
+ base, index, 0, ofs);
+break;
 default:
 tcg_abort();
 }
@@ -1817,6 +1857,9 @@ static 

[Qemu-devel] [PATCH 14/18] tcg: introduce new TCGMemOp - MO_128

2017-01-17 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/tcg.h | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tcg/tcg.h b/tcg/tcg.h
index cb672f2..f205c6b 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -295,11 +295,12 @@ typedef enum TCGMemOp {
 MO_16= 1,
 MO_32= 2,
 MO_64= 3,
-MO_SIZE  = 3,   /* Mask for the above.  */
+MO_128   = 4,
+MO_SIZE  = 7,   /* Mask for the above.  */
 
-MO_SIGN  = 4,   /* Sign-extended, otherwise zero-extended.  */
+MO_SIGN  = 8,   /* Sign-extended, otherwise zero-extended.  */
 
-MO_BSWAP = 8,   /* Host reverse endian.  */
+MO_BSWAP = 16,   /* Host reverse endian.  */
 #ifdef HOST_WORDS_BIGENDIAN
 MO_LE= MO_BSWAP,
 MO_BE= 0,
@@ -331,7 +332,7 @@ typedef enum TCGMemOp {
  * - an alignment to a specified size, which may be more or less than
  *   the access size (MO_ALIGN_x where 'x' is a size in bytes);
  */
-MO_ASHIFT = 4,
+MO_ASHIFT = 5,
 MO_AMASK = 7 << MO_ASHIFT,
 #ifdef ALIGNED_ONLY
 MO_ALIGN = 0,
-- 
2.1.4




[Qemu-devel] [PATCH 15/18] tcg: introduce qemu_ld_v128 and qemu_st_v128 opcodes

2017-01-17 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/i386/tcg-target.inc.c |  5 +
 tcg/tcg-op.c  | 16 
 tcg/tcg-op.h  |  8 
 tcg/tcg-opc.h |  4 
 4 files changed, 33 insertions(+)

diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index cd9de4d..c28fd09 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -2438,6 +2438,11 @@ static const TCGTargetOpDef x86_op_defs[] = {
 #endif
 
 #ifdef TCG_TARGET_HAS_REG128
+{ INDEX_op_qemu_ld_v128, { "V", "L" } },
+{ INDEX_op_qemu_st_v128, { "V", "L" } },
+#endif
+
+#ifdef TCG_TARGET_HAS_REG128
 { INDEX_op_add_i8x16, { "V", "0", "V" } },
 { INDEX_op_add_i16x8, { "V", "0", "V" } },
 { INDEX_op_add_i32x4, { "V", "0", "V" } },
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 0925fab..dd92e71 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -2350,3 +2350,19 @@ static void tcg_gen_mov2_i64(TCGv_i64 r, TCGv_i64 a, 
TCGv_i64 b)
 GEN_ATOMIC_HELPER(xchg, mov2, 0)
 
 #undef GEN_ATOMIC_HELPER
+
+void tcg_gen_qemu_ld_v128(TCGv_v128 val, TCGv addr, TCGArg idx,
+  TCGMemOp memop)
+{
+assert((memop & MO_BSWAP) == MO_TE);
+TCGMemOpIdx oi = make_memop_idx(memop, idx);
+tcg_gen_op3si_v128(INDEX_op_qemu_ld_v128, val, addr, oi);
+}
+
+void tcg_gen_qemu_st_v128(TCGv_v128 val, TCGv addr, TCGArg idx,
+  TCGMemOp memop)
+{
+assert((memop & MO_BSWAP) == MO_TE);
+TCGMemOpIdx oi = make_memop_idx(memop, idx);
+tcg_gen_op3si_v128(INDEX_op_qemu_st_v128, val, addr, oi);
+}
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 5de74d3..4646f87 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -266,6 +266,12 @@ static inline void tcg_gen_op3_v128(TCGOpcode opc, 
TCGv_v128 a1,
 GET_TCGV_V128(a3));
 }
 
+static inline void tcg_gen_op3si_v128(TCGOpcode opc, TCGv_v128 a1,
+ TCGv_i32 a2, TCGArg a3)
+{
+tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_V128(a1), GET_TCGV_I32(a2), a3);
+}
+
 static inline void tcg_gen_op1_v64(TCGOpcode opc, TCGv_v64 a1)
 {
 tcg_gen_op1(&tcg_ctx, opc, GET_TCGV_V64(a1));
@@ -885,6 +891,8 @@ void tcg_gen_qemu_ld_i32(TCGv_i32, TCGv, TCGArg, TCGMemOp);
 void tcg_gen_qemu_st_i32(TCGv_i32, TCGv, TCGArg, TCGMemOp);
 void tcg_gen_qemu_ld_i64(TCGv_i64, TCGv, TCGArg, TCGMemOp);
 void tcg_gen_qemu_st_i64(TCGv_i64, TCGv, TCGArg, TCGMemOp);
+void tcg_gen_qemu_ld_v128(TCGv_v128, TCGv, TCGArg, TCGMemOp);
+void tcg_gen_qemu_st_v128(TCGv_v128, TCGv, TCGArg, TCGMemOp);
 
 static inline void tcg_gen_qemu_ld8u(TCGv ret, TCGv addr, int mem_index)
 {
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index 0022535..8ff1416 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -222,6 +222,10 @@ DEF(qemu_ld_i64, DATA64_ARGS, TLADDR_ARGS, 1,
 TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT)
 DEF(qemu_st_i64, 0, TLADDR_ARGS + DATA64_ARGS, 1,
 TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | TCG_OPF_64BIT)
+DEF(qemu_ld_v128, 1, 1, 1,
+TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | IMPL128)
+DEF(qemu_st_v128, 0, 2, 1,
+TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS | IMPL128)
 
 #undef TLADDR_ARGS
 #undef DATA64_ARGS
-- 
2.1.4




[Qemu-devel] [PATCH 02/18] tcg: add support for 64bit vector type

2017-01-17 Thread Kirill Batuzov
Introduce TCG_TYPE_V64 and a corresponding TCGv_v64 for TCG temps. Add helper
functions that work with temps of this new type.
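
Not part of the patch text, but as a rough usage sketch: a target could then
expose a 64-bit guest vector register as a global and allocate v64 temps. This
assumes tcg_global_mem_new_v64() exists as the v64 counterpart of the existing
_i32/_i64 wrappers (patch 08 later in the series uses it exactly this way); the
temp helpers are the ones added below.

    /* Sketch only, not the author's code. */
    TCGv_v64 d0  = tcg_global_mem_new_v64(cpu_env,
                                          offsetof(CPUARMState, vfp.regs[0]),
                                          "d0");
    TCGv_v64 tmp = tcg_temp_new_internal_v64(0);   /* 0 = not a local temp */
    /* ... emit v64 opcodes on d0 and tmp ... */
    tcg_temp_free_v64(tmp);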

Signed-off-by: Kirill Batuzov 
---
 tcg/tcg-op.h | 23 +++
 tcg/tcg.c| 13 +
 tcg/tcg.h| 34 ++
 3 files changed, 70 insertions(+)

diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index df077d6..173fb24 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -266,6 +266,24 @@ static inline void tcg_gen_op3_v128(TCGOpcode opc, 
TCGv_v128 a1,
 GET_TCGV_V128(a3));
 }
 
+static inline void tcg_gen_op1_v64(TCGOpcode opc, TCGv_v64 a1)
+{
+tcg_gen_op1(&tcg_ctx, opc, GET_TCGV_V64(a1));
+}
+
+static inline void tcg_gen_op2_v64(TCGOpcode opc, TCGv_v64 a1,
+TCGv_v64 a2)
+{
+tcg_gen_op2(&tcg_ctx, opc, GET_TCGV_V64(a1), GET_TCGV_V64(a2));
+}
+
+static inline void tcg_gen_op3_v64(TCGOpcode opc, TCGv_v64 a1,
+TCGv_v64 a2, TCGv_v64 a3)
+{
+tcg_gen_op3(&tcg_ctx, opc, GET_TCGV_V64(a1), GET_TCGV_V64(a2),
+GET_TCGV_V64(a3));
+}
+
 /* Generic ops.  */
 
 static inline void gen_set_label(TCGLabel *l)
@@ -466,6 +484,11 @@ static inline void tcg_gen_discard_v128(TCGv_v128 arg)
 tcg_gen_op1_v128(INDEX_op_discard, arg);
 }
 
+static inline void tcg_gen_discard_v64(TCGv_v64 arg)
+{
+tcg_gen_op1_v64(INDEX_op_discard, arg);
+}
+
 /* 64 bit ops */
 
 void tcg_gen_addi_i64(TCGv_i64 ret, TCGv_i64 arg1, int64_t arg2);
diff --git a/tcg/tcg.c b/tcg/tcg.c
index b20a044..e81d1c4 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -637,6 +637,14 @@ TCGv_i64 tcg_temp_new_internal_i64(int temp_local)
 return MAKE_TCGV_I64(idx);
 }
 
+TCGv_v64 tcg_temp_new_internal_v64(int temp_local)
+{
+int idx;
+
+idx = tcg_temp_new_internal(TCG_TYPE_V64, temp_local);
+return MAKE_TCGV_V64(idx);
+}
+
 TCGv_v128 tcg_temp_new_internal_v128(int temp_local)
 {
 int idx;
@@ -677,6 +685,11 @@ void tcg_temp_free_i64(TCGv_i64 arg)
 tcg_temp_free_internal(GET_TCGV_I64(arg));
 }
 
+void tcg_temp_free_v64(TCGv_v64 arg)
+{
+tcg_temp_free_internal(GET_TCGV_V64(arg));
+}
+
 void tcg_temp_free_v128(TCGv_v128 arg)
 {
 tcg_temp_free_internal(GET_TCGV_V128(arg));
diff --git a/tcg/tcg.h b/tcg/tcg.h
index b9aa56b..397ba86 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -235,6 +235,7 @@ typedef struct TCGPool {
 typedef enum TCGType {
 TCG_TYPE_I32,
 TCG_TYPE_I64,
+TCG_TYPE_V64,
 TCG_TYPE_V128,
 TCG_TYPE_COUNT, /* number of different types */
 
@@ -411,6 +412,7 @@ typedef tcg_target_ulong TCGArg;
 typedef struct TCGv_i32_d *TCGv_i32;
 typedef struct TCGv_i64_d *TCGv_i64;
 typedef struct TCGv_ptr_d *TCGv_ptr;
+typedef struct TCGv_v64_d *TCGv_v64;
 typedef struct TCGv_v128_d *TCGv_v128;
 typedef TCGv_ptr TCGv_env;
 #if TARGET_LONG_BITS == 32
@@ -436,6 +438,11 @@ static inline TCGv_ptr QEMU_ARTIFICIAL 
MAKE_TCGV_PTR(intptr_t i)
 return (TCGv_ptr)i;
 }
 
+static inline TCGv_v64 QEMU_ARTIFICIAL MAKE_TCGV_V64(intptr_t i)
+{
+return (TCGv_v64)i;
+}
+
 static inline TCGv_v128 QEMU_ARTIFICIAL MAKE_TCGV_V128(intptr_t i)
 {
 return (TCGv_v128)i;
@@ -456,6 +463,11 @@ static inline intptr_t QEMU_ARTIFICIAL 
GET_TCGV_PTR(TCGv_ptr t)
 return (intptr_t)t;
 }
 
+static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_V64(TCGv_v64 t)
+{
+return (intptr_t)t;
+}
+
 static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_V128(TCGv_v128 t)
 {
 return (intptr_t)t;
@@ -468,17 +480,20 @@ static inline intptr_t QEMU_ARTIFICIAL 
GET_TCGV_V128(TCGv_v128 t)
 
 #define TCGV_EQUAL_I32(a, b) (GET_TCGV_I32(a) == GET_TCGV_I32(b))
 #define TCGV_EQUAL_I64(a, b) (GET_TCGV_I64(a) == GET_TCGV_I64(b))
+#define TCGV_EQUAL_V64(a, b) (GET_TCGV_V64(a) == GET_TCGV_V64(b))
 #define TCGV_EQUAL_V128(a, b) (GET_TCGV_V128(a) == GET_TCGV_V128(b))
 #define TCGV_EQUAL_PTR(a, b) (GET_TCGV_PTR(a) == GET_TCGV_PTR(b))
 
 /* Dummy definition to avoid compiler warnings.  */
 #define TCGV_UNUSED_I32(x) x = MAKE_TCGV_I32(-1)
 #define TCGV_UNUSED_I64(x) x = MAKE_TCGV_I64(-1)
+#define TCGV_UNUSED_V64(x) x = MAKE_TCGV_V64(-1)
 #define TCGV_UNUSED_V128(x) x = MAKE_TCGV_V128(-1)
 #define TCGV_UNUSED_PTR(x) x = MAKE_TCGV_PTR(-1)
 
 #define TCGV_IS_UNUSED_I32(x) (GET_TCGV_I32(x) == -1)
 #define TCGV_IS_UNUSED_I64(x) (GET_TCGV_I64(x) == -1)
+#define TCGV_IS_UNUSED_V64(x) (GET_TCGV_V64(x) == -1)
 #define TCGV_IS_UNUSED_V128(x) (GET_TCGV_V128(x) == -1)
 #define TCGV_IS_UNUSED_PTR(x) (GET_TCGV_PTR(x) == -1)
 
@@ -802,10 +817,12 @@ TCGv_i64 tcg_global_reg_new_i64(TCGReg reg, const char 
*name);
 
 TCGv_i32 tcg_temp_new_internal_i32(int temp_local);
 TCGv_i64 tcg_temp_new_internal_i64(int temp_local);
+TCGv_v64 tcg_temp_new_internal_v64(int temp_local);
 TCGv_v128 tcg_temp_new_internal_v128(int temp_local);
 
 void tcg_temp_free_i32(TCGv_i32 arg);
 void tcg_temp_free_i64(TCGv_i64 arg);
+void tcg_temp_free_v64(TCGv_v64 arg);
 void tcg_temp_free_v128(TCGv_v128 arg);
 
 static inl

[Qemu-devel] [PATCH 13/18] tcg: do not rely on exact values of MO_BSWAP or MO_SIGN in backend

2017-01-17 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/aarch64/tcg-target.inc.c |  4 ++--
 tcg/arm/tcg-target.inc.c |  4 ++--
 tcg/i386/tcg-target.inc.c|  4 ++--
 tcg/mips/tcg-target.inc.c|  4 ++--
 tcg/ppc/tcg-target.inc.c |  4 ++--
 tcg/s390/tcg-target.inc.c|  4 ++--
 tcg/sparc/tcg-target.inc.c   | 12 ++--
 tcg/tcg-op.c |  4 ++--
 tcg/tcg.h|  1 +
 9 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
index 1939d35..a3314e3 100644
--- a/tcg/aarch64/tcg-target.inc.c
+++ b/tcg/aarch64/tcg-target.inc.c
@@ -1002,7 +1002,7 @@ static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  * TCGMemOpIdx oi, uintptr_t ra)
  */
-static void * const qemu_ld_helpers[16] = {
+static void * const qemu_ld_helpers[] = {
 [MO_UB]   = helper_ret_ldub_mmu,
 [MO_LEUW] = helper_le_lduw_mmu,
 [MO_LEUL] = helper_le_ldul_mmu,
@@ -1016,7 +1016,7 @@ static void * const qemu_ld_helpers[16] = {
  * uintxx_t val, TCGMemOpIdx oi,
  * uintptr_t ra)
  */
-static void * const qemu_st_helpers[16] = {
+static void * const qemu_st_helpers[] = {
 [MO_UB]   = helper_ret_stb_mmu,
 [MO_LEUW] = helper_le_stw_mmu,
 [MO_LEUL] = helper_le_stl_mmu,
diff --git a/tcg/arm/tcg-target.inc.c b/tcg/arm/tcg-target.inc.c
index ffa0d40..c685785 100644
--- a/tcg/arm/tcg-target.inc.c
+++ b/tcg/arm/tcg-target.inc.c
@@ -1083,7 +1083,7 @@ static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  * int mmu_idx, uintptr_t ra)
  */
-static void * const qemu_ld_helpers[16] = {
+static void * const qemu_ld_helpers[] = {
 [MO_UB]   = helper_ret_ldub_mmu,
 [MO_SB]   = helper_ret_ldsb_mmu,
 
@@ -1103,7 +1103,7 @@ static void * const qemu_ld_helpers[16] = {
 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
  * uintxx_t val, int mmu_idx, uintptr_t ra)
  */
-static void * const qemu_st_helpers[16] = {
+static void * const qemu_st_helpers[] = {
 [MO_UB]   = helper_ret_stb_mmu,
 [MO_LEUW] = helper_le_stw_mmu,
 [MO_LEUL] = helper_le_stl_mmu,
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index d00bd12..cd9de4d 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -1257,7 +1257,7 @@ static void tcg_out_nopn(TCGContext *s, int n)
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  * int mmu_idx, uintptr_t ra)
  */
-static void * const qemu_ld_helpers[16] = {
+static void * const qemu_ld_helpers[] = {
 [MO_UB]   = helper_ret_ldub_mmu,
 [MO_LEUW] = helper_le_lduw_mmu,
 [MO_LEUL] = helper_le_ldul_mmu,
@@ -1270,7 +1270,7 @@ static void * const qemu_ld_helpers[16] = {
 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
  * uintxx_t val, int mmu_idx, uintptr_t ra)
  */
-static void * const qemu_st_helpers[16] = {
+static void * const qemu_st_helpers[] = {
 [MO_UB]   = helper_ret_stb_mmu,
 [MO_LEUW] = helper_le_stw_mmu,
 [MO_LEUL] = helper_le_stl_mmu,
diff --git a/tcg/mips/tcg-target.inc.c b/tcg/mips/tcg-target.inc.c
index 5b2fe98..f9c02c9 100644
--- a/tcg/mips/tcg-target.inc.c
+++ b/tcg/mips/tcg-target.inc.c
@@ -1101,7 +1101,7 @@ static void tcg_out_call(TCGContext *s, tcg_insn_unit 
*arg)
 }
 
 #if defined(CONFIG_SOFTMMU)
-static void * const qemu_ld_helpers[16] = {
+static void * const qemu_ld_helpers[] = {
 [MO_UB]   = helper_ret_ldub_mmu,
 [MO_SB]   = helper_ret_ldsb_mmu,
 [MO_LEUW] = helper_le_lduw_mmu,
@@ -1118,7 +1118,7 @@ static void * const qemu_ld_helpers[16] = {
 #endif
 };
 
-static void * const qemu_st_helpers[16] = {
+static void * const qemu_st_helpers[] = {
 [MO_UB]   = helper_ret_stb_mmu,
 [MO_LEUW] = helper_le_stw_mmu,
 [MO_LEUL] = helper_le_stl_mmu,
diff --git a/tcg/ppc/tcg-target.inc.c b/tcg/ppc/tcg-target.inc.c
index a3262cf..b3fde1e 100644
--- a/tcg/ppc/tcg-target.inc.c
+++ b/tcg/ppc/tcg-target.inc.c
@@ -1383,7 +1383,7 @@ static const uint32_t qemu_exts_opc[4] = {
 /* helper signature: helper_ld_mmu(CPUState *env, target_ulong addr,
  * int mmu_idx, uintptr_t ra)
  */
-static void * const qemu_ld_helpers[16] = {
+static void * const qemu_ld_helpers[] = {
 [MO_UB]   = helper_ret_ldub_mmu,
 [MO_LEUW] = helper_le_lduw_mmu,
 [MO_LEUL] = helper_le_ldul_mmu,
@@ -1396,7 +1396,7 @@ static void * const qemu_ld_helpers[16] = {
 /* helper signature: helper_st_mmu(CPUState *env, target_ulong addr,
  * uintxx_t val, int mmu_idx, uintptr_t ra)
  */
-static void * const qemu_st_helpers[16

[Qemu-devel] [PATCH 06/18] tcg: allow globals to overlap

2017-01-17 Thread Kirill Batuzov
Sometimes the target architecture may allow some parts of a register to be
accessed as a different register. If both of these registers are
implemented as globals in QEMU, then their contents will overlap and a
change to one global will also change the value of the other. To handle
such situations properly, some fixes are needed in the register allocator
and liveness analysis.
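
As an illustration, this is roughly how patch 08 later in the series wires it
up for ARM, using the -1-terminated sub_temps/overlap_temps arrays added here
and the setters (tcg_temp_set_sub_temps()/tcg_temp_set_overlap_temps()) that
the series provides; a sketch, not the exact posted code:

    /* Inside the target's translate-init code: d0/d1 together are exactly q0
       (sub_temps of q0), and q0 overlaps each of d0 and d1.  */
    static TCGArg q0_sub[3];       /* -1 terminated */
    static TCGArg q0_overlap[2];   /* -1 terminated */

    q0_sub[0] = GET_TCGV_V64(cpu_D[0]);
    q0_sub[1] = GET_TCGV_V64(cpu_D[1]);
    q0_sub[2] = (TCGArg)-1;
    q0_overlap[0] = GET_TCGV_V128(cpu_Q[0]);
    q0_overlap[1] = (TCGArg)-1;

    tcg_temp_set_sub_temps(GET_TCGV_V128(cpu_Q[0]), q0_sub);
    tcg_temp_set_overlap_temps(GET_TCGV_V64(cpu_D[0]), q0_overlap);
    tcg_temp_set_overlap_temps(GET_TCGV_V64(cpu_D[1]), q0_overlap);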

Signed-off-by: Kirill Batuzov 
---
 tcg/tcg.c | 49 +
 tcg/tcg.h | 18 ++
 2 files changed, 67 insertions(+)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index 2f97c13..330a1c0 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -572,6 +572,8 @@ int tcg_global_mem_new_internal(TCGType type, TCGv_ptr base,
 ts->mem_offset = offset;
 ts->name = name;
 }
+ts->sub_temps = NULL;
+ts->overlap_temps = NULL;
 return temp_idx(s, ts);
 }
 
@@ -1500,6 +1502,35 @@ static int tcg_temp_overlap(TCGContext *s, const TCGTemp 
*tmp,
 }
 }
 
+static void tcg_temp_arr_apply(const TCGArg *arr, uint8_t *temp_state,
+   uint8_t temp_val)
+{
+TCGArg i;
+if (!arr) {
+return ;
+}
+for (i = 0; arr[i] != (TCGArg)-1; i++) {
+temp_state[arr[i]] = temp_val;
+}
+}
+
+static void tcg_sub_temps_dead(TCGContext *s, TCGArg tmp, uint8_t *temp_state)
+{
+tcg_temp_arr_apply(s->temps[tmp].sub_temps, temp_state, TS_DEAD);
+}
+
+static void tcg_sub_temps_sync(TCGContext *s, TCGArg tmp, uint8_t *temp_state)
+{
+tcg_temp_arr_apply(s->temps[tmp].sub_temps, temp_state, TS_MEM | TS_DEAD);
+}
+
+static void tcg_overlap_temps_sync(TCGContext *s, TCGArg tmp,
+   uint8_t *temp_state)
+{
+tcg_temp_arr_apply(s->temps[tmp].overlap_temps, temp_state,
+   TS_MEM | TS_DEAD);
+}
+
 /* Liveness analysis : update the opc_arg_life array to tell if a
given input arguments is dead. Instructions updating dead
temporaries are removed. */
@@ -1554,6 +1585,11 @@ static void liveness_pass_1(TCGContext *s, uint8_t 
*temp_state)
 if (temp_state[arg] & TS_MEM) {
 arg_life |= SYNC_ARG << i;
 }
+/* sub_temps are also dead */
+tcg_sub_temps_dead(&tcg_ctx, arg, temp_state);
+/* overlap_temps need to go to memory */
+tcg_overlap_temps_sync(&tcg_ctx, arg, temp_state);
+
 temp_state[arg] = TS_DEAD;
 }
 
@@ -1581,6 +1617,11 @@ static void liveness_pass_1(TCGContext *s, uint8_t 
*temp_state)
 for (i = nb_oargs; i < nb_iargs + nb_oargs; i++) {
 arg = args[i];
 if (arg != TCG_CALL_DUMMY_ARG) {
+/* both sub_temps and overlap_temps need to go
+   to memory */
+tcg_sub_temps_sync(&tcg_ctx, arg, temp_state);
+tcg_overlap_temps_sync(&tcg_ctx, arg, temp_state);
+
 temp_state[arg] &= ~TS_DEAD;
 }
 }
@@ -1699,6 +1740,11 @@ static void liveness_pass_1(TCGContext *s, uint8_t 
*temp_state)
 if (temp_state[arg] & TS_MEM) {
 arg_life |= SYNC_ARG << i;
 }
+/* sub_temps are also dead */
+tcg_sub_temps_dead(&tcg_ctx, arg, temp_state);
+/* overlap_temps need to go to memory */
+tcg_overlap_temps_sync(&tcg_ctx, arg, temp_state);
+
 temp_state[arg] = TS_DEAD;
 }
 
@@ -1739,6 +1785,9 @@ static void liveness_pass_1(TCGContext *s, uint8_t 
*temp_state)
 /* input arguments are live for preceding opcodes */
 for (i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
 temp_state[args[i]] &= ~TS_DEAD;
+/* both sub_temps and overlap_temps need to go to memory */
+tcg_sub_temps_sync(&tcg_ctx, arg, temp_state);
+tcg_overlap_temps_sync(&tcg_ctx, arg, temp_state);
 }
 }
 break;
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 921892f..6473228 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -623,6 +623,14 @@ typedef struct TCGTemp {
 struct TCGTemp *mem_base;
 intptr_t mem_offset;
 const char *name;
+
+/* -1 terminated array of temps that are parts of this temp.
+   All bits of them are part of this temp. */
+const TCGArg *sub_temps;
+/* -1 terminated array of temps that overlap with this temp.
+   Some bits of them are part of this temp, but some are not. sub_temps
+   are not included here. */
+const TCGArg *overlap_temps;
 } TCGTemp;
 
 ty

[Qemu-devel] [PATCH 16/18] softmmu: create helpers for vector loads

2017-01-17 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 cputlb.c  |   4 +
 softmmu_template_vector.h | 266 ++
 tcg/tcg.h |   5 +
 3 files changed, 275 insertions(+)
 create mode 100644 softmmu_template_vector.h

diff --git a/cputlb.c b/cputlb.c
index 813279f..e174773 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -669,6 +669,10 @@ static void *atomic_mmu_lookup(CPUArchState *env, 
target_ulong addr,
 #define DATA_SIZE 8
 #include "softmmu_template.h"
 
+#define SHIFT 4
+#include "softmmu_template_vector.h"
+#undef MMUSUFFIX
+
 /* First set of helpers allows passing in of OI and RETADDR.  This makes
them callable from other helpers.  */
 
diff --git a/softmmu_template_vector.h b/softmmu_template_vector.h
new file mode 100644
index 000..b286d65
--- /dev/null
+++ b/softmmu_template_vector.h
@@ -0,0 +1,266 @@
+/*
+ *  Software MMU support
+ *
+ * Generate helpers used by TCG for qemu_ld/st vector ops and code
+ * load functions.
+ *
+ * Included from target op helpers and exec.c.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include "qemu/timer.h"
+#include "exec/address-spaces.h"
+#include "exec/memory.h"
+
+#define DATA_SIZE (1 << SHIFT)
+
+#if DATA_SIZE == 16
+#define SUFFIX v128
+#else
+#error unsupported data size
+#endif
+
+
+#ifdef SOFTMMU_CODE_ACCESS
+#define READ_ACCESS_TYPE MMU_INST_FETCH
+#define ADDR_READ addr_code
+#else
+#define READ_ACCESS_TYPE MMU_DATA_LOAD
+#define ADDR_READ addr_read
+#endif
+
+#define helper_te_ld_name  glue(glue(helper_te_ld, SUFFIX), MMUSUFFIX)
+#define helper_te_st_name  glue(glue(helper_te_st, SUFFIX), MMUSUFFIX)
+
+#ifndef SOFTMMU_CODE_ACCESS
+static inline void glue(io_read, SUFFIX)(CPUArchState *env,
+ CPUIOTLBEntry *iotlbentry,
+ target_ulong addr,
+ uintptr_t retaddr,
+ uint8_t *res)
+{
+CPUState *cpu = ENV_GET_CPU(env);
+hwaddr physaddr = iotlbentry->addr;
+MemoryRegion *mr = iotlb_to_region(cpu, physaddr, iotlbentry->attrs);
+int i;
+
+assert(0); /* Needs testing */
+
+physaddr = (physaddr & TARGET_PAGE_MASK) + addr;
+cpu->mem_io_pc = retaddr;
+if (mr != &io_mem_rom && mr != &io_mem_notdirty && !cpu->can_do_io) {
+cpu_io_recompile(cpu, retaddr);
+}
+
+cpu->mem_io_vaddr = addr;
+for (i = 0; i < (1 << SHIFT); i += 8) {
+memory_region_dispatch_read(mr, physaddr + i, (uint64_t *)(res + i),
+8, iotlbentry->attrs);
+}
+}
+#endif
+
+void helper_te_ld_name(CPUArchState *env, target_ulong addr,
+   TCGMemOpIdx oi, uintptr_t retaddr, uint8_t *res)
+{
+unsigned mmu_idx = get_mmuidx(oi);
+int index = (addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
+target_ulong tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
+uintptr_t haddr;
+int i;
+
+/* Adjust the given return address.  */
+retaddr -= GETPC_ADJ;
+
+/* If the TLB entry is for a different page, reload and try again.  */
+if ((addr & TARGET_PAGE_MASK)
+ != (tlb_addr & (TARGET_PAGE_MASK | TLB_INVALID_MASK))) {
+if ((addr & (DATA_SIZE - 1)) != 0
+&& (get_memop(oi) & MO_AMASK) == MO_ALIGN) {
+cpu_unaligned_access(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
+ mmu_idx, retaddr);
+}
+if (!VICTIM_TLB_HIT(ADDR_READ, addr)) {
+tlb_fill(ENV_GET_CPU(env), addr, READ_ACCESS_TYPE,
+ mmu_idx, retaddr);
+}
+tlb_addr = env->tlb_table[mmu_idx][index].ADDR_READ;
+}
+
+/* Handle an IO access.  */
+if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
+CPUIOTLBEntry *iotlbentry;
+if ((addr & (DATA_SIZE - 1)) != 0) {
+goto do_unaligned_access;
+}
+iotlbentry = &env->iotlb[mmu_idx][index];
+
+/* ??? Note that the io helpers always read data in the target
+   byte ordering.  We should push the LE/BE request down into io.  */
+glue(io_read, SUFFIX)(env, iotlbe

[Qemu-devel] [PATCH 10/18] tcg/i386: add support for vector opcodes

2017-01-17 Thread Kirill Batuzov
To be able to generate vector operations in a TCG backend we need to do
several things.

1. We need to tell the register allocator about the target's vector registers.
   In the case of x86 we'll use xmm0..xmm7. xmm7 is designated as a scratch
   register, the others can be used by the register allocator.

2. We need a new constraint to indicate where to use vector registers. In
   this commit the 'V' constraint is introduced.

3. We need to be able to generate the bare minimum: load, store and reg-to-reg
   move. MOVDQU is used for loads and stores. MOVDQA is used for reg-to-reg
   moves.

4. Finally we need to support any other opcodes we want. INDEX_op_add_i32x4
   is the only one for now. The PADDD instruction handles it perfectly.
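
The tcg_out_op() hunk that actually emits PADDD is cut off in this archive, so
the following is only a sketch of what that case presumably looks like, based
on the OPC_PADDD define from this patch and the { "V", "0", "V" } constraint
used for INDEX_op_add_i32x4 elsewhere in the series (not the exact posted
code):

    /* args[0] is both destination and first source (the "0" constraint),
       so a single PADDD xmm_dst, xmm_src does the work.  */
    case INDEX_op_add_i32x4:
        tcg_out_modrm(s, OPC_PADDD, args[0], args[2]);
        break;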

Signed-off-by: Kirill Batuzov 
---
 tcg/i386/tcg-target.h |  24 +-
 tcg/i386/tcg-target.inc.c | 109 +++---
 2 files changed, 125 insertions(+), 8 deletions(-)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 524cfc6..974a58b 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -29,8 +29,14 @@
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 31
 
 #ifdef __x86_64__
-# define TCG_TARGET_REG_BITS  64
-# define TCG_TARGET_NB_REGS   16
+# define TCG_TARGET_HAS_REG128 1
+# ifdef TCG_TARGET_HAS_REG128
+#  define TCG_TARGET_REG_BITS  64
+#  define TCG_TARGET_NB_REGS   24
+# else
+#  define TCG_TARGET_REG_BITS  64
+#  define TCG_TARGET_NB_REGS   16
+# endif
 #else
 # define TCG_TARGET_REG_BITS  32
 # define TCG_TARGET_NB_REGS8
@@ -56,6 +62,16 @@ typedef enum {
 TCG_REG_R13,
 TCG_REG_R14,
 TCG_REG_R15,
+#ifdef TCG_TARGET_HAS_REG128
+TCG_REG_XMM0,
+TCG_REG_XMM1,
+TCG_REG_XMM2,
+TCG_REG_XMM3,
+TCG_REG_XMM4,
+TCG_REG_XMM5,
+TCG_REG_XMM6,
+TCG_REG_XMM7,
+#endif
 TCG_REG_RAX = TCG_REG_EAX,
 TCG_REG_RCX = TCG_REG_ECX,
 TCG_REG_RDX = TCG_REG_EDX,
@@ -133,6 +149,10 @@ extern bool have_bmi1;
 #define TCG_TARGET_HAS_mulsh_i640
 #endif
 
+#ifdef TCG_TARGET_HAS_REG128
+#define TCG_TARGET_HAS_add_i32x41
+#endif
+
 #define TCG_TARGET_deposit_i32_valid(ofs, len) \
 (((ofs) == 0 && (len) == 8) || ((ofs) == 8 && (len) == 8) || \
  ((ofs) == 0 && (len) == 16))
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index eeb1777..69e3198 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -32,6 +32,9 @@ static const char * const 
tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
 #else
 "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
 #endif
+#ifdef TCG_TARGET_HAS_REG128
+"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+#endif
 };
 #endif
 
@@ -61,6 +64,16 @@ static const int tcg_target_reg_alloc_order[] = {
 TCG_REG_EDX,
 TCG_REG_EAX,
 #endif
+#ifdef TCG_TARGET_HAS_REG128
+TCG_REG_XMM0,
+TCG_REG_XMM1,
+TCG_REG_XMM2,
+TCG_REG_XMM3,
+TCG_REG_XMM4,
+TCG_REG_XMM5,
+TCG_REG_XMM6,
+/*  TCG_REG_XMM7, <- scratch register */
+#endif
 };
 
 static const int tcg_target_call_iarg_regs[] = {
@@ -247,6 +260,10 @@ static int target_parse_constraint(TCGArgConstraint *ct, 
const char **pct_str)
 case 'I':
 ct->ct |= TCG_CT_CONST_I32;
 break;
+case 'V':
+ct->ct |= TCG_CT_REG;
+tcg_regset_set32(ct->u.regs, 0, 0xff);
+break;
 
 default:
 return -1;
@@ -301,6 +318,9 @@ static inline int tcg_target_const_match(tcg_target_long 
val, TCGType type,
 #define P_SIMDF30x1 /* 0xf3 opcode prefix */
 #define P_SIMDF20x2 /* 0xf2 opcode prefix */
 
+#define P_SSE_660F  (P_DATA16 | P_EXT)
+#define P_SSE_F30F  (P_SIMDF3 | P_EXT)
+
 #define OPC_ARITH_EvIz (0x81)
 #define OPC_ARITH_EvIb (0x83)
 #define OPC_ARITH_GvEv (0x03)  /* ... plus (ARITH_FOO << 3) */
@@ -351,6 +371,11 @@ static inline int tcg_target_const_match(tcg_target_long 
val, TCGType type,
 #define OPC_GRP3_Ev(0xf7)
 #define OPC_GRP5   (0xff)
 
+#define OPC_MOVDQU_M2R  (0x6f | P_SSE_F30F)  /* load 128-bit value */
+#define OPC_MOVDQU_R2M  (0x7f | P_SSE_F30F)  /* store 128-bit value */
+#define OPC_MOVDQA_R2R  (0x6f | P_SSE_660F)  /* reg-to-reg 128-bit mov */
+#define OPC_PADDD   (0xfe | P_SSE_660F)
+
 /* Group 1 opcode extensions for 0x80-0x83.
These are also used as modifiers for OPC_ARITH.  */
 #define ARITH_ADD 0
@@ -428,6 +453,9 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int 
rm, int x)
 tcg_debug_assert((opc & P_REXW) == 0);
 tcg_out8(s, 0x66);
 }
+if (opc & P_SIMDF3) {
+tcg_out8(s, 0xf3);
+}
 if (opc & P_ADDR32) {
 tcg_out8(s, 0x67);
 }
@@ -634,9 +662,24 @@ static inlin

[Qemu-devel] [PATCH 05/18] tcg: use results of alias analysis in liveness analysis

2017-01-17 Thread Kirill Batuzov
Signed-off-by: Kirill Batuzov 
---
 tcg/tcg.c | 71 +++
 1 file changed, 71 insertions(+)

diff --git a/tcg/tcg.c b/tcg/tcg.c
index e81d1c4..2f97c13 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1448,6 +1448,58 @@ static inline void tcg_la_bb_end(TCGContext *s, uint8_t 
*temp_state)
 }
 }
 
+static intptr_t tcg_temp_size(const TCGTemp *tmp)
+{
+switch (tmp->base_type) {
+case TCG_TYPE_I32:
+return 4;
+case TCG_TYPE_I64:
+case TCG_TYPE_V64:
+return 8;
+case TCG_TYPE_V128:
+return 16;
+default:
+tcg_abort();
+}
+}
+
+/* Check if memory write completely overwrites temp's memory location.
+   If this is the case then the temp can be considered dead. */
+static int tcg_temp_overwrite(TCGContext *s, const TCGTemp *tmp,
+   const TCGAliasInfo *ai)
+{
+if (!(ai->alias_type & TCG_ALIAS_WRITE) || !ai->fixed_offset) {
+return 0;
+}
+if (tmp->mem_base != &s->temps[GET_TCGV_PTR(s->tcg_env)]) {
+return 0;
+}
+if (ai->offset > tmp->mem_offset
+|| ai->offset + ai->size < tmp->mem_offset + tcg_temp_size(tmp)) {
+return 0;
+}
+return 1;
+}
+
+/* Check if memory read or write overlaps with temp's memory location.
+   If this is the case then the temp must be synced to memory. */
+static int tcg_temp_overlap(TCGContext *s, const TCGTemp *tmp,
+const TCGAliasInfo *ai)
+{
+if (!ai->fixed_offset || tmp->fixed_reg) {
+return 0;
+}
+if (tmp->mem_base != &s->temps[GET_TCGV_PTR(s->tcg_env)]) {
+return 1;
+}
+if (ai->offset >= tmp->mem_offset + tcg_temp_size(tmp)
+|| ai->offset + ai->size <= tmp->mem_offset) {
+return 0;
+} else {
+return 1;
+}
+}
+
 /* Liveness analysis : update the opc_arg_life array to tell if a
given input arguments is dead. Instructions updating dead
temporaries are removed. */
@@ -1650,6 +1702,23 @@ static void liveness_pass_1(TCGContext *s, uint8_t 
*temp_state)
 temp_state[arg] = TS_DEAD;
 }
 
+/* record if the operation uses some globals' memory location 
*/
+if (s->alias_info[oi].alias_type != TCG_NOT_ALIAS) {
+for (i = 0; i < s->nb_globals; i++) {
+if (tcg_temp_overwrite(s, &s->temps[i],
+   &s->alias_info[oi])) {
+temp_state[i] = TS_DEAD;
+} else if (tcg_temp_overlap(s, &s->temps[i],
+&s->alias_info[oi])) {
+if (s->alias_info[oi].alias_type & TCG_ALIAS_READ) 
{
+temp_state[i] = TS_MEM | TS_DEAD;
+} else if (!(temp_state[i] & TS_DEAD)) {
+temp_state[i] |= TS_MEM;
+}
+}
+}
+}
+
 /* if end of basic block, update */
 if (def->flags & TCG_OPF_BB_END) {
 tcg_la_bb_end(s, temp_state);
@@ -2591,6 +2660,8 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb)
 s->la_time -= profile_getclock();
 #endif
 
+tcg_alias_analysis(s);
+
 {
 uint8_t *temp_state = tcg_malloc(s->nb_temps + s->nb_indirects);
 
-- 
2.1.4




[Qemu-devel] [PATCH 08/18] target/arm: support access to vector guest registers as globals

2017-01-17 Thread Kirill Batuzov
To support vector guest registers as globals we need to do two things:

1) create corresponding globals,
2) mark which globals can overlap,

Signed-off-by: Kirill Batuzov 
---

I've declared regnames for new globals the same way they used to be declared for
scalar regs. checkpatch complains about it. Should I move '{' to the same line
for all 3 arrays?

---
 target/arm/translate.c | 45 +++--
 1 file changed, 43 insertions(+), 2 deletions(-)

diff --git a/target/arm/translate.c b/target/arm/translate.c
index 0ad9070..2b81b5d 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -65,6 +65,12 @@ static TCGv_i32 cpu_R[16];
 TCGv_i32 cpu_CF, cpu_NF, cpu_VF, cpu_ZF;
 TCGv_i64 cpu_exclusive_addr;
 TCGv_i64 cpu_exclusive_val;
+static TCGv_v128 cpu_Q[16];
+static TCGv_v64 cpu_D[32];
+#ifdef CONFIG_USER_ONLY
+TCGv_i64 cpu_exclusive_test;
+TCGv_i32 cpu_exclusive_info;
+#endif
 
 /* FIXME:  These should be removed.  */
 static TCGv_i32 cpu_F0s, cpu_F1s;
@@ -72,14 +78,26 @@ static TCGv_i64 cpu_F0d, cpu_F1d;
 
 #include "exec/gen-icount.h"
 
-static const char *regnames[] =
+static const char *regnames_r[] =
 { "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7",
   "r8", "r9", "r10", "r11", "r12", "r13", "r14", "pc" };
 
+static const char *regnames_q[] =
+{ "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+  "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" };
+
+static const char *regnames_d[] =
+{ "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
+  "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
+  "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" };
+
 /* initialize TCG globals.  */
 void arm_translate_init(void)
 {
 int i;
+static TCGArg overlap_temps[16][2];
+static TCGArg sub_temps[16][3];
 
 cpu_env = tcg_global_reg_new_ptr(TCG_AREG0, "env");
 tcg_ctx.tcg_env = cpu_env;
@@ -87,7 +105,30 @@ void arm_translate_init(void)
 for (i = 0; i < 16; i++) {
 cpu_R[i] = tcg_global_mem_new_i32(cpu_env,
   offsetof(CPUARMState, regs[i]),
-  regnames[i]);
+  regnames_r[i]);
+}
+for (i = 0; i < 16; i++) {
+cpu_Q[i] = tcg_global_mem_new_v128(cpu_env,
+   offsetof(CPUARMState,
+vfp.regs[2 * i]),
+   regnames_q[i]);
+}
+for (i = 0; i < 32; i++) {
+cpu_D[i] = tcg_global_mem_new_v64(cpu_env,
+  offsetof(CPUARMState, vfp.regs[i]),
+  regnames_d[i]);
+}
+for (i = 0; i < 16; i++) {
+overlap_temps[i][0] = GET_TCGV_V128(cpu_Q[i]);
+overlap_temps[i][1] = (TCGArg)-1;
+sub_temps[i][0] = GET_TCGV_V64(cpu_D[i * 2]);
+sub_temps[i][1] = GET_TCGV_V64(cpu_D[i * 2 + 1]);
+sub_temps[i][2] = (TCGArg)-1;
+tcg_temp_set_overlap_temps(GET_TCGV_V64(cpu_D[i * 2]),
+   overlap_temps[i]);
+tcg_temp_set_overlap_temps(GET_TCGV_V64(cpu_D[i * 2 + 1]),
+   overlap_temps[i]);
+tcg_temp_set_sub_temps(GET_TCGV_V128(cpu_Q[i]), sub_temps[i]);
 }
 cpu_CF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, CF), "CF");
 cpu_NF = tcg_global_mem_new_i32(cpu_env, offsetof(CPUARMState, NF), "NF");
-- 
2.1.4




[Qemu-devel] [PATCH 00/18] Emulate guest vector operations with host vector operations

2017-01-17 Thread Kirill Batuzov
The goal of this patch series is to set up an infrastructure to emulate
guest vector operations using host vector operations. Preliminary
experiments show that simply translating loads and stores increases the
performance of the x264 video codec by 10%. The performance of a gcc-vectorized
for loop increased 2x.

To be able to emulate guest vector operations using host vector operations,
several things need to be done.

1. Corresponding vector types should be added to TCG. This series adds
TCG_v128 and TCG_v64. I've made TCG_v64 a different type from TCG_i64
because it usually needs to be allocated to different registers and
supports different operations.

2. Load/store operations for these new types need to be implemented.

3. For a seamless transition from the current model to a new one we need to
handle cases where the memory occupied by a global variable can be accessed
via a pointer to the CPUArchState structure. A very simple conservative alias
analysis has been added to do it. This analysis tracks memory loads and
stores that overlap with fields of CPUArchState and provides this
information to the register allocator. The allocator then spills and
reloads affected globals when needed.

4. Allow overlapping globals. For scalar registers this is a rare case, and
overlapping registers can be handled as a single one (ah, al, ax, eax,
rax). In ARM every Q-register consists of two D-registers, each consisting of
two S-registers. Handling 4 S-registers as one because they are parts of
the same Q-register is way too inefficient.

5. Add a new memory addressing mode to the MMU code for large accesses and
create the needed helpers. Only 128-bit vectors have been handled for now.

6. Create TCG opcodes for vector operations. Only addition has been handled
in this series. Each operation has a wrapper that checks whether the backend
supports the corresponding operation. In one case the vector opcode
is generated, in the other the operation is emulated with scalar
operations. The emulation code is generated inline for performance reasons
(there is a huge performance difference between inline generation
and calling a helper). As a positive side effect this will eventually allow
similar emulation code for vector instructions from different frontends to
be merged into a target-independent implementation. A sketch of such a
wrapper follows this list.

7. Use new operations in the frontend (ARM was used in these series).

8. Support new operations in the backend (x86_64 was used in these series).
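
As referenced in point 6, a rough sketch of the wrapper shape (not the exact
code from the series; tcg_gen_add_i32x4_emul() is a hypothetical name standing
in for the inline scalar-emulation path):

    static inline void tcg_gen_add_i32x4(TCGv_v128 res, TCGv_v128 a, TCGv_v128 b)
    {
    #ifdef TCG_TARGET_HAS_add_i32x4
        /* backend supports the opcode: emit a single vector op */
        tcg_gen_op3_v128(INDEX_op_add_i32x4, res, a, b);
    #else
        /* otherwise generate inline scalar emulation: load each 32-bit lane
           from the temps' memory locations, add, and store the result back */
        tcg_gen_add_i32x4_emul(res, a, b);
    #endif
    }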

For experiments I have used ARM guest on x86_64 host. I wanted some pair of
different architectures with vector extensions both. ARM and x86_64 pair
fits well.

Kirill Batuzov (18):
  tcg: add support for 128bit vector type
  tcg: add support for 64bit vector type
  tcg: add ld_v128, ld_v64, st_v128 and st_v64 opcodes
  tcg: add simple alias analysis
  tcg: use results of alias analysis in liveness analysis
  tcg: allow globals to overlap
  tcg: add vector addition operations
  target/arm: support access to vector guest registers as globals
  target/arm: use vector opcode to handle vadd. instruction
  tcg/i386: add support for vector opcodes
  tcg/i386: support 64-bit vector operations
  tcg/i386: support remaining vector addition operations
  tcg: do not rely on exact values of MO_BSWAP or MO_SIGN in backend
  tcg: introduce new TCGMemOp - MO_128
  tcg: introduce qemu_ld_v128 and qemu_st_v128 opcodes
  softmmu: create helpers for vector loads
  tcg/i386: add support for qemu_ld_v128/qemu_st_v128 ops
  target/arm: load two consecutive 64-bits vector regs as a 128-bit
vector reg

 cputlb.c |   4 +
 softmmu_template_vector.h| 266 +++
 target/arm/translate.c   |  89 ++-
 tcg/aarch64/tcg-target.inc.c |   4 +-
 tcg/arm/tcg-target.inc.c |   4 +-
 tcg/i386/tcg-target.h|  35 +-
 tcg/i386/tcg-target.inc.c| 245 ---
 tcg/mips/tcg-target.inc.c|   4 +-
 tcg/optimize.c   | 146 
 tcg/ppc/tcg-target.inc.c |   4 +-
 tcg/s390/tcg-target.inc.c|   4 +-
 tcg/sparc/tcg-target.inc.c   |  12 +-
 tcg/tcg-op.c |  20 +++-
 tcg/tcg-op.h | 262 ++
 tcg/tcg-opc.h|  34 ++
 tcg/tcg.c| 146 
 tcg/tcg.h| 147 +++-
 17 files changed, 1385 insertions(+), 41 deletions(-)
 create mode 100644 softmmu_template_vector.h

-- 
2.1.4




Re: [Qemu-devel] Commit 812c1057f, Handle G_IO_HUP in tcp_chr_read for tcp chardev, broke CloudStack

2015-07-17 Thread Kirill Batuzov
On Fri, 17 Jul 2015, Nils Carlson wrote:

> Hi,
> 
> The commit 812c1057f, Handle G_IO_HUP in tcp_chr_read for tcp chardev, broke
> CloudStack. CloudStack was relying on fire-and-forget style messaging across a
> unix socket to the VM. Because the host "fires" the message and then closes
> the socket a HUP is present on the line when the VM starts reading the socket.
> Commit 812c1057f ensured that the socket was checked for a HUP prior to
> calling recv, causing recv never to be called by the VM and no data to be
> read.
> 
> I've posted a patch, attached here, which moves the HUP detection to after all
> data has been read, but only for Linux as I suspect windows requires HUPs to
> be detected prior to reading data.
> 
> Could you comment on the validity of this assumption? I would be really happy
> to have this issue solved as it stops us from upgrading to later versions of
> qemu.

I do not think your assumption is valid. The original goal of commit 812c1057f
was to handle all conditions in one watch because the glib implementation for
Windows does not support multiple watches on one channel. Any changes
regarding the order in which conditions are checked were unintended. On the
other hand, I do not know whether in the pre-812c1057f implementation (with
multiple watches) this order was defined, implementation
defined or undefined.

Some time ago another solution for this problem was proposed but was
unfortunately never committed (slipped through the cracks?).

[PATCH v3] qemu-char: Do not disconnect when there's data for reading
https://lists.gnu.org/archive/html/qemu-devel/2014-09/msg03857.html

My comments on why I think it's better to handle disconnects with POSIX
return values can be found in the discussion of the first version of the
patch above.

https://lists.gnu.org/archive/html/qemu-devel/2014-09/msg03261.html

Can you verify that the above patch v3 solves your problem? I would really
prefer to use return values instead of GIOConditions. They are much more
reliable and better documented.

>
> Amit also has concerns regarding the return values from the tcp_chr_read
> function, which seem a bit odd as they are all TRUE, even for failure paths.
> 
> All feedback very much appreciated.
> 
> Best Regards,
> Nils Carlson
> 
> 
> 



[Qemu-devel] [PATCH v2] target-arm: check that LSB <= MSB in BFI instruction

2015-01-30 Thread Kirill Batuzov
The documentation states that if LSB > MSB in the BFI instruction the behaviour
is unpredictable. Currently QEMU crashes because of an assertion failure in
this case:

tcg/tcg-op.h:2061: tcg_gen_deposit_i32: Assertion `len <= 32' failed.

While an assertion failure may meet the "unpredictable" definition, this
behaviour is undesirable because it allows an unprivileged guest program
to crash the emulator together with the OS and other programs.

This patch addresses the issue by raising an illegal instruction exception
if LSB > MSB. Only the ARM decoder is affected because the Thumb decoder already
has this check in place.

To reproduce the issue, run the following program

int main(void) {
asm volatile (".long 0x07c00c12" :: );
return 0;
}

compiled with
  gcc -marm -static badop_arm.c -o badop_arm

Signed-off-by: Kirill Batuzov 
---
 target-arm/translate.c |4 
 1 file changed, 4 insertions(+)

diff --git a/target-arm/translate.c b/target-arm/translate.c
index bdfcdf1..2c1c2a7 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -8739,6 +8739,10 @@ static void disas_arm_insn(DisasContext *s, unsigned int 
insn)
 ARCH(6T2);
 shift = (insn >> 7) & 0x1f;
 i = (insn >> 16) & 0x1f;
+if (i < shift) {
+/* UNPREDICTABLE; we choose to UNDEF */
+goto illegal_op;
+}
 i = i + 1 - shift;
 if (rm == 15) {
 tmp = tcg_temp_new_i32();
-- 
1.7.10.4




[Qemu-devel] [PATCH] target-arm: check that LSB <= MSB in BFI instruction

2015-01-30 Thread Kirill Batuzov
The documentation states that if LSB > MSB in the BFI instruction the behaviour
is unpredictable. Currently QEMU crashes because of an assertion failure in
this case:

tcg/tcg-op.h:2061: tcg_gen_deposit_i32: Assertion `len <= 32' failed.

While an assertion failure may meet the "unpredictable" definition, this
behaviour is undesirable because it allows an unprivileged guest program
to crash the emulator together with the OS and other programs.

This patch addresses the issue by raising an illegal instruction exception
if LSB > MSB. Only the ARM decoder is affected because the Thumb decoder already
has this check in place.

To reproduce the issue, run the following program

int main(void) {
asm volatile (".long 0x07c00c12" :: );
return 0;
}

compiled with
  gcc -marm -static badop_arm.c -o badop_arm

Signed-off-by: Kirill Batuzov 
---
 target-arm/translate.c |2 ++
 1 file changed, 2 insertions(+)

diff --git a/target-arm/translate.c b/target-arm/translate.c
index bdfcdf1..2821289 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -8739,6 +8739,8 @@ static void disas_arm_insn(DisasContext *s, unsigned int 
insn)
 ARCH(6T2);
 shift = (insn >> 7) & 0x1f;
 i = (insn >> 16) & 0x1f;
+if (i < shift)
+goto illegal_op;
 i = i + 1 - shift;
 if (rm == 15) {
 tmp = tcg_temp_new_i32();
-- 
1.7.10.4




Re: [Qemu-devel] Update on TCG Multithreading

2014-12-02 Thread Kirill Batuzov
On Mon, 1 Dec 2014, Mark Burton wrote:
> 
> One issue I’d like to see more opinions on is the question of a cache per 
> core, or a shared cache.
> I have heard anecdotal evidence that a shared cache gives a major performance 
> benefit….
> Does anybody have anything more concrete?

There is a theoretical and experimental comparison of these approaches in
the PQEMU article (you've cited it on the wiki page), only the authors name them
differently: they call the cache-per-core approach "Separate Code Cache" (SCC)
and the shared cache "Unified Code Cache" (UCC).

-- 
Kirill

Re: [Qemu-devel] [PATCH] hw/arm/realview.c: Fix memory leak in realview_init()

2014-11-20 Thread Kirill Batuzov
> On 20 November 2014 11:53, Kirill Batuzov  wrote:
> > I'm surprised that this small patch caused so much controversy. It seems
> > very simple and straightforward to me.
> >
> > This patch fixes a memory leak. The fact that it indeed was a memory
> > leak is indicated by Valgrind output (Memcheck's false-positives are
> > extremely rare unless you do some really nasty things with your pointers).
> > It can be verified manually too: there are only 4 occurrences of 'ram_lo'
> > in realview.c.
> 
> It's in exactly the same situation as the other blocks of memory
> like ram_hi in that file: we allocate it and then don't care about
> freeing it, because we don't happen to have a board state struct.
> The correct fix if you care about this kind of thing would be
> to have a board state struct which had MemoryRegion fields (not
> MemoryRegion* fields). We have lots of bits of memory that we
> allocate once on startup and then don't care about freeing.
>

I think we are talking about two slightly different problems here. Indeed
ram_hi is allocated, then used until QEMU exits but is never freed. Yet
it is never completely lost: there is at least one pointer to it in the
memory hierarchy. Valgrind calls such situations "still reachable" and
does not consider them errors (because the memory is in use until the very
moment the program exits; at least it cannot be proven otherwise).

ram_lo is different. It can be added to the memory hierarchy, in which case
it will behave exactly the same way as ram_hi. But it may not be used at
all, in which case all pointers to it will be lost. This is a real
memory leak. Valgrind reports such situations as "definitely lost" and
they are considered errors (because it can be proven that the memory was
allocated, is not in use and was not freed).
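
To illustrate with a minimal standalone example (not QEMU code):

    #include <stdlib.h>

    void *root;                    /* a pointer survives until exit            */

    int main(void)
    {
        root = malloc(16);         /* like ram_hi: "still reachable", no error */
        void *lo = malloc(16);     /* like an unused ram_lo ...                */
        lo = NULL;                 /* ... last pointer gone: "definitely lost" */
        return 0;
    }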

In our case ram_lo was reported as "definitely lost" while ram_hi was
"still reachable" and was never reported as an error.

This patch addresses the second problem (when ram_lo is "definitely
lost") because it has a very short and simple solution, while you are
arguing that we need to address the first problem - which is also valid,
but it is a different problem that will need a different and much larger
solution.

> It just
> doesn't seem to me very useful to merely silence the warning
> rather than actually fixing the underlying thing that the
> warning is telling you about.
> 

As I described above, it actually solves the problem Valgrind reports.
It is just a different problem than you are talking about.


> I'll probably put it in, because it's not very harmful.

Either way is fine with me. I'm still sure this patch is worthwhile but on
the other hand it is not that big of an issue to be arguing about it for
too long.

-- 
Kirill



Re: [Qemu-devel] [PATCH] hw/arm/realview.c: Fix memory leak in realview_init()

2014-11-20 Thread Kirill Batuzov
On Wed, 19 Nov 2014, Peter Maydell wrote:
> 
> Not for 2.2,

Fair enough.

> and I'm still not really convinced in
> general that it's worthwhile at all.
>

I'm surprised that this small patch caused so much controversy. It seems
very simple and straightforward to me.

This patch fixes a memory leak. The fact that it indeed was a memory
leak is indicated by Valgrind output (Memcheck's false-positives are
extremely rare unless you do some really nasty things with your pointers).
It can be verified manually too: there are only 4 occurrences of 'ram_lo'
in realview.c.

By fixing the memory leak this patch silences warnings from automatic checking
tools like Valgrind. Not having minor warnings is good because it simplifies
the use of such tools to find new and important bugs.

This patch is local: it does not affect any other function except
realview_init.

Given all this I can see benefits of this patch with no real downsides to it.
Is this enough to prove it's worthwhile?

-- 
Kirill



Re: [Qemu-devel] [PATCH RFC 0/7] Translate guest vector operations to host vector operations

2014-11-11 Thread Kirill Batuzov
On Thu, 16 Oct 2014, Kirill Batuzov wrote:

> > (4) Consider supporting generic vector operations in the TCG?
> 
> I gave it a go and was quite happy with the result. I have implemented the 
> add_i32x4
> opcode which is addition of 128-bit vectors composed of four 32-bit integers
> and used it to translate NEON vadd.i32 to SSE paddd instruction. 



> 
> Why I think all this is worth doing:
> 
> (1) Performance. 200% speedup is a lot. My test was specifically crafted and 
> real
> life applications may not have that much vector operations on average, but
> there is a specific class of applications where it will matter a lot - 
> media
> processing applications like ffmpeg.
> 
> (2) Some unification of common operations. Right now every target reimplements
> common vector operations (like vector add/sub/mul/min/compare etc.). We 
> can
> do it once in the common TCG code.
> 
> Still there are some cons I mentioned earlier. The need to support a lot of
> opcodes is the most significant in the long run I think. So before I commit my
> time to conversion of more operations I'd like to hear your opinions if this
> approach is acceptable and worth spending efforts.
> 
> Kirill Batuzov (7):
>   tcg: add support for 128bit vector type
>   tcg: store ENV global in TCGContext
>   tcg: add sync_temp opcode
>   tcg: add add_i32x4 opcode
>   target-arm: support access to 128-bit guest registers as globals
>   target-arm: use add_i32x4 opcode to handle vadd.i32 instruction
>   tcg/i386: add support for vector opcodes
> 
>  target-arm/translate.c |   30 ++-
>  tcg/i386/tcg-target.c  |  103 ---
>  tcg/i386/tcg-target.h  |   24 -
>  tcg/tcg-op.h   |  141 
> 
>  tcg/tcg-opc.h  |   13 +
>  tcg/tcg.c  |   36 +
>  tcg/tcg.h  |   34 
>  7 files changed, 371 insertions(+), 10 deletions(-)
> 
> 

Ping? Any more comments?

-- 
Kirill



Re: [Qemu-devel] [PATCH] hw/arm/realview.c: Fix memory leak in realview_init()

2014-10-31 Thread Kirill Batuzov
On Fri, 31 Oct 2014, Peter Maydell wrote:

> On 31 October 2014 10:42, Nikita Belov  wrote:
> > On 2014-10-29 19:03, Peter Maydell wrote:
> >> We leak all of the MemoryRegions we allocate here, because we
> >> don't have a persistent state struct to keep them in. This
> >> doesn't really matter much because they're generally needed
> >> for the lifetime of the QEMU process anyway, and we only call
> >> board init functions once. So why worry about ram_lo in
> >> particular (and why this board in particular)?
> 
> > Indeed, generally we need memory regions for the lifetime of QEMU, but
> > 'mem_lo'
> > is different. It may not be used at all. We use 'ram_lo' only when a
> > condition is
> > true, in other case we will lose this pointer. Because of that if the
> > condition is
> > false we have memory leak immediately (not when QEMU exits).
> 
> No, ram_lo is exactly the same as the other memory regions
> here: we allocate it in this function, we don't keep any
> kind of pointer to it after we leave this function,

This is not true. We keep a pointer to the memory region when we add it as
a subregion of another region. As long as we have a pointer to the root
region(s) we have a pointer to any other used region, which is not the
case for unused ones.

Actually, it is impossible to use a dynamically allocated piece of
memory without having a pointer to it at the same time.

> and
> we rely on it being freed on QEMU exit. The fact that we
> don't happen to use ram_lo in all cases is irrelevant.
>

-- 
Kirill



Re: [Qemu-devel] [PATCH] MAINTAINERS: Add more TCG files

2014-10-22 Thread Kirill Batuzov
On Wed, 22 Oct 2014, Paolo Bonzini wrote:

> Unfortunately, TCG files do not really have a maintainer yet.
> But at least there will be fewer unmaintained files.
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  MAINTAINERS | 16 
>  1 file changed, 16 insertions(+)
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 206bf7e..70d58a5 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -61,6 +61,16 @@ L: secal...@redhat.com
>  
>  Guest CPU cores (TCG):
>  --
> +Overall
> +M: qemu-devel@nongnu.org
> +S: Odd fixes
> +F: cpu-exec.c
> +F: cputlb.c
> +F: softmmu_template.h
> +F: translate-all.c
> +F: include/exec/cpu_ldst_template.h

You probably want to add here
F: include/exec/cpu_ldst.h

and maybe
F: include/exec/cputlb.h (it covers functions from cputlb.c and exec.c)

> +F: include/exec/helper*.h
> +
>  Alpha
>  M: Richard Henderson 
>  S: Maintained
> @@ -873,6 +883,12 @@ F: include/sysemu/seccomp.h
>  
>  Usermode Emulation
>  --
> +Overall
> +M: Riku Voipio 
> +S: Maintained
> +F: thunk.c
> +F: user-exec.c
> +
>  BSD user
>  M: Blue Swirl 
>  S: Maintained
> 



Re: [Qemu-devel] [PATCH] get_maintainer.pl: Default to --no-git-fallback

2014-10-21 Thread Kirill Batuzov
On Tue, 21 Oct 2014, Markus Armbruster wrote:

> Paolo Bonzini  writes:
> 
> > On 10/20/2014 04:15 PM, Michael S. Tsirkin wrote:
> >> What do you want to happen in this case?
> >> Won't this cause even more patches to fall to the floor?
> >>
> >> The benefit seems marginal, the risk high.
> >
> > I agree with Michael.
> >
> > Can we detect if get_maintainer.pl is invoked as a cccmd, and in this
> > case default to --no-git-fallback?  If it is invoked manually, I would
> > like to show the committers (I will then cherry pick the right ones).
> 
> I don't like context-sensitive defaults.  Too much magic.
> 
> What about this: if get_maintainer.pl comes up empty, it points you to
> --git-fallback.
>

I am in favor of this. Empty output does not tell a new contributor what
to do next. So either this, or document it on the corresponding wiki page.

-- 
Kirill



Re: [Qemu-devel] [PATCH] get_maintainer.pl: Default to --no-git-fallback

2014-10-21 Thread Kirill Batuzov
On Tue, 21 Oct 2014, Markus Armbruster wrote:

> "Michael S. Tsirkin"  writes:
> 
> > On Mon, Oct 20, 2014 at 03:04:44PM +0100, Peter Maydell wrote:
> >> On 20 October 2014 10:19, Markus Armbruster  wrote:
> >> > Contributors rely on this script to find maintainers to copy.  The
> >> > script falls back to git when no exact MAINTAINERS pattern matches.
> >> > When that happens, recent contributors get copied, which tends not be
> >> > particularly useful.  Some contributors find it even annoying.
> >> >
> >> > Flip the default to "don't fall back to git".  Use --git-fallback to
> >> > ask it to fall back to git.
> >> >
> >> > Signed-off-by: Markus Armbruster 
> >> 
> >> Good idea.
> >> 
> >> Reviewed-by: Peter Maydell 
> >> 
> >> -- PMM
> >
> > What do you want to happen in this case?
> > Won't this cause even more patches to fall to the floor?
> >
> > The benefit seems marginal, the risk high.
> >
> > I would be OK with this if you also go over history
> > and assign maintainers to all core files which lack
> > maintainers listed in MAINTAINERS.
> 
> Define "core files".
>

Files implementing common infrastructure used by different guests on
different hosts? This is probably the part least covered in MAINTAINERS.
MAINTAINERS covers target architectures, host architectures and guest
machines pretty well (each of them is a well-defined independent subsystem).
On the other hand a lot of common files are missing from MAINTAINERS:
cpu-exec.c, hw/core/*.c, cputlb.c, etc.

> I don't think I (or anyone) should *assign* maintainers.  We've always
> let people volunteer for the maintainer role.  Prodding them to
> volunteer is fine, but shanghaiing them outright is a different matter.
> 

Maybe we can start searching for volunteers by making a list of
unmaintained files grouped by subsystem? It is hard to find volunteers
when we do not know exactly what we need them for.

> We do have too may files lacking maintainers.  See
> 
> Subject: MAINTAINERS leaves too many files uncovered
> Date: Mon, 20 Oct 2014 11:19:44 +0200
> Message-ID: <87mw8rumhb@blackfin.pond.sub.org>
> https://lists.nongnu.org/archive/html/qemu-devel/2014-10/msg01951.html
> 
> > I'm yet to see contributors who are annoyed but we
> > can always blacklist specific people.
> 
> Quite a few have grumbled, both in this thread and elsewhere.  Usually,
> for every one who grumbles, there are several quietly annoyed.
> 
> 



Re: [Qemu-devel] [PATCH RFC 0/7] Translate guest vector operations to host vector operations

2014-10-16 Thread Kirill Batuzov
On Thu, 16 Oct 2014, Alex Bennée wrote:

> >
> > From Valgrind experience there are enough genericism. Valgrind can translate
> > SSE, AltiVec and NEON instructions to vector opcodes. Most of the opcodes 
> > are
> > reused between instruction sets.
> 
> Doesn't Valgrind have the advantage of same-arch->same-arch (I've not
> looked at it's generated code in detail though).
>

Yes, they have this advantage, but Valgrind tools look at the intermediate
code in an architecture-independent way. For the tools to work, an opcode's
semantics must be preserved across different architectures. For example
Iop_QAdd16Sx4 (addition with saturation) must have the same meaning on ARM
(the vqadd.s16 instruction) and on x86 (the paddsw instruction). So in most
cases where Valgrind uses the same opcode for different instructions from
different architectures, QEMU can do the same.
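
For illustration (my example, not from Valgrind or the patches), the shared
per-lane semantics of Iop_QAdd16Sx4 can be written in plain C; both vqadd.s16
and paddsw apply this operation to each 16-bit lane:

#include <stdint.h>

/* Saturating signed 16-bit addition: clamp the 32-bit sum to the int16_t
   range instead of wrapping around. */
static int16_t qadd16_s(int16_t a, int16_t b)
{
    int32_t sum = (int32_t)a + (int32_t)b;
    if (sum > INT16_MAX) {
        return INT16_MAX;
    }
    if (sum < INT16_MIN) {
        return INT16_MIN;
    }
    return (int16_t)sum;
}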

> > But keep in mind - there are a lot of vector opcodes. Much much more than
> > scalar ones. You can see full list in Valgrind sources
> > (VEX/pub/libvex_ir.h).
> 
> I think we could only approach this is in a piecemeal way guided by
> performance bottlenecks when we find them.
> 

I'm not sure this will work. In my example the larger part of the speedup
comes from the fact that I could keep values in registers and did not need to
save and load them for each vadd.i32 instruction. To be able to do this on a
real-life application we need to support as large a fraction of its vector
instructions as possible. In short: the speedup does not come from faster
emulation of one instruction but from the interaction between sequential
guest instructions.

> > We can reduce the amount of opcodes by converting vector element size from 
> > part
> > of an opcode to a constant argument. But we will lose some flexibility 
> > offered
> > by the TARGET_HAS_opcode macro when target has support for some sizes but 
> > not for
> > others. For example SSE has vector minimum for sizes i8x16, i16x8, i32x4 but
> > does not have one for size i64x2. 
> >
> > Some implementation details and concerns.
> >
> > The most problematic issue was the fact that with vector registers we have 
> > one
> > entity that can be accessed as both global variable and memory location. I
> > solved it by introducing the sync_temp opcode that instructs register 
> > allocator to
> > save global variable to its memory location if it is on the register. If a
> > variable is not on a register or memory is already coherent - no store is 
> > issued,
> > so performance penalty for it is minimal. Still this approach has a serious
> > drawback: we need to generate sync_temp explicitly. But I do not know any 
> > better
> > way to achieve consistency.
> 
> I'm not sure I follow. I thought we only needed the memory access when
> the backend can't support the vector width operations so shouldn't have
> stuff in the vector registers?
> 

Target support for vector operations is not binary ("support all" or
"support none"). In most cases a target will support some large subset, but
the remaining guest vector operations will have to be emulated. In that case
we'll need to access guest vector registers as memory locations.

Scalar operations which are not supported as opcodes are very uncommon, so a
helper with a large performance overhead is a reasonable option for them. I'd
like to avoid such heavy helpers for vector operations because unsupported
opcodes will be more common there.

Another reason is the transition from the existing code to vector opcodes.
During the transition we'll have a mix of old code (which accesses the
registers as memory) and new code (which accesses them as globals). Doing the
transition in one go is unrealistic.

> > Note that as of this RFC I have not finished conversion of ARM guest so 
> > mixing
> > NEON with VFP code can cause a miscompile.
> >
> > The second problem is that a backend may or may not support vector 
> > operations. We
> > do not want each frontend to check it on every operation. I created a 
> > wrapper that
> > generates vector opcode if it is supported or generates emulation code.
> >
> > For add_i32x4 emulation code is generated inline. I tried to make it a 
> > helper
> > but got a very significant performance loss (5x slowdown). I'm not sure 
> > about
> > the cause but I suspect that memory was a bottleneck and extra stores needed
> > by calling conventions mattered a lot.
> 
> So the generic helper was more API heavy than the existing NEON helpers?

The existing NEON implementation generates emulation code inline too. That
is how I found out that my helper was slow.


-- 
Kirill

[Qemu-devel] [PATCH RFC 4/7] tcg: add add_i32x4 opcode

2014-10-16 Thread Kirill Batuzov
Introduce the INDEX_op_add_i32x4 opcode, which adds two 128-bit variables as
vectors of four 32-bit integers.

Add a tcg_gen_add_i32x4 wrapper function that generates this opcode. If a TCG
target does not support it, the wrapper falls back to emulating the vector
operation as a series of scalar ones. Wrapper arguments should be globals
unless the frontend is sure that the backend has at least some support for
vector operations (by "some support" I mean loads, stores and moves).

Note that emulation of a vector operation with scalar ones is done inline. An
attempt to do it as a helper resulted in a serious performance degradation.
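
For illustration, here is a sketch (my own, not the patch code) of how the
inline scalar fallback can be emitted once the three 128-bit values have been
resolved to (base, offset) memory locations, e.g. via tcg_v128_to_ptr() from
this series:

/* Emulate add_i32x4 with four scalar 32-bit adds through memory.
   (db, doff) is the destination, (ab, aoff) and (bb, boff) the sources. */
static void gen_add_i32x4_scalar(TCGv_ptr db, intptr_t doff,
                                 TCGv_ptr ab, intptr_t aoff,
                                 TCGv_ptr bb, intptr_t boff)
{
    int i;

    for (i = 0; i < 4; i++) {
        TCGv_i32 t0 = tcg_temp_new_i32();
        TCGv_i32 t1 = tcg_temp_new_i32();
        tcg_gen_ld_i32(t0, ab, aoff + i * 4);
        tcg_gen_ld_i32(t1, bb, boff + i * 4);
        tcg_gen_add_i32(t0, t0, t1);
        tcg_gen_st_i32(t0, db, doff + i * 4);
        tcg_temp_free_i32(t0);
        tcg_temp_free_i32(t1);
    }
}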

Signed-off-by: Kirill Batuzov 
---
 tcg/tcg-op.h  |  108 +
 tcg/tcg-opc.h |   12 +++
 tcg/tcg.h |5 +++
 3 files changed, 125 insertions(+)

diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index ea2b14f..c5f777d 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -139,6 +139,15 @@ static inline void tcg_gen_ldst_op_i64(TCGOpcode opc, 
TCGv_i64 val,
 *tcg_ctx.gen_opparam_ptr++ = offset;
 }
 
+static inline void tcg_gen_ldst_op_v128(TCGOpcode opc, TCGv_v128 val,
+   TCGv_ptr base, TCGArg offset)
+{
+*tcg_ctx.gen_opc_ptr++ = opc;
+*tcg_ctx.gen_opparam_ptr++ = GET_TCGV_V128(val);
+*tcg_ctx.gen_opparam_ptr++ = GET_TCGV_PTR(base);
+*tcg_ctx.gen_opparam_ptr++ = offset;
+}
+
 static inline void tcg_gen_op4_i32(TCGOpcode opc, TCGv_i32 arg1, TCGv_i32 arg2,
TCGv_i32 arg3, TCGv_i32 arg4)
 {
@@ -1069,6 +1078,11 @@ static inline void tcg_gen_ld_i64(TCGv_i64 ret, TCGv_ptr 
arg2, tcg_target_long o
 tcg_gen_ldst_op_i64(INDEX_op_ld_i64, ret, arg2, offset);
 }
 
+static inline void tcg_gen_ld_v128(TCGv_v128 ret, TCGv_ptr arg2, tcg_target_long offset)
+{
+tcg_gen_ldst_op_v128(INDEX_op_ld_v128, ret, arg2, offset);
+}
+
 static inline void tcg_gen_st8_i64(TCGv_i64 arg1, TCGv_ptr arg2,
tcg_target_long offset)
 {
@@ -1092,6 +1106,11 @@ static inline void tcg_gen_st_i64(TCGv_i64 arg1, 
TCGv_ptr arg2, tcg_target_long
 tcg_gen_ldst_op_i64(INDEX_op_st_i64, arg1, arg2, offset);
 }
 
+static inline void tcg_gen_st_v128(TCGv_v128 arg1, TCGv_ptr arg2, tcg_target_long offset)
+{
+tcg_gen_ldst_op_v128(INDEX_op_st_v128, arg1, arg2, offset);
+}
+
 static inline void tcg_gen_add_i64(TCGv_i64 ret, TCGv_i64 arg1, TCGv_i64 arg2)
 {
 tcg_gen_op3_i64(INDEX_op_add_i64, ret, arg1, arg2);
@@ -2780,6 +2799,8 @@ static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv 
addr, int mem_index)
 tcg_gen_add_i32(TCGV_PTR_TO_NAT(R), TCGV_PTR_TO_NAT(A), TCGV_PTR_TO_NAT(B))
 # define tcg_gen_addi_ptr(R, A, B) \
 tcg_gen_addi_i32(TCGV_PTR_TO_NAT(R), TCGV_PTR_TO_NAT(A), (B))
+# define tcg_gen_movi_ptr(R, B) \
+tcg_gen_movi_i32(TCGV_PTR_TO_NAT(R), (B))
 # define tcg_gen_ext_i32_ptr(R, A) \
 tcg_gen_mov_i32(TCGV_PTR_TO_NAT(R), (A))
 #else
@@ -2791,6 +2812,93 @@ static inline void tcg_gen_qemu_st64(TCGv_i64 arg, TCGv 
addr, int mem_index)
 tcg_gen_add_i64(TCGV_PTR_TO_NAT(R), TCGV_PTR_TO_NAT(A), TCGV_PTR_TO_NAT(B))
 # define tcg_gen_addi_ptr(R, A, B) \
 tcg_gen_addi_i64(TCGV_PTR_TO_NAT(R), TCGV_PTR_TO_NAT(A), (B))
+# define tcg_gen_movi_ptr(R, B) \
+tcg_gen_movi_i64(TCGV_PTR_TO_NAT(R), (B))
 # define tcg_gen_ext_i32_ptr(R, A) \
 tcg_gen_ext_i32_i64(TCGV_PTR_TO_NAT(R), (A))
 #endif /* UINTPTR_MAX == UINT32_MAX */
+
+/***/
+/* 128-bit vector arithmetic.  */
+
+static inline void *tcg_v128_swap_slot(int n)
+{
+return &tcg_ctx.v128_swap[n * 16];
+}
+
+/* Find a memory location for 128-bit TCG variable. */
+static inline void tcg_v128_to_ptr(TCGv_v128 tmp, TCGv_ptr base, int slot,
+   TCGv_ptr *real_base, intptr_t *real_offset,
+   int is_read)
+{
+int idx = GET_TCGV_V128(tmp);
+assert(idx >= 0 && idx < tcg_ctx.nb_temps);
+if (idx < tcg_ctx.nb_globals) {
+/* Globals use their locations within CPUArchState. */
+int env = GET_TCGV_PTR(tcg_ctx.cpu_env);
+TCGTemp *ts_env = &tcg_ctx.temps[env];
+TCGTemp *ts_arg = &tcg_ctx.temps[idx];
+
+/* Sanity checks: global's memory locations must be addressed
+   relative to ENV. */
+assert(ts_env->val_type == TEMP_VAL_REG &&
+   ts_env->reg == ts_arg->mem_reg &&
+   ts_arg->mem_allocated);
+
+*real_base = tcg_ctx.cpu_env;
+*real_offset = ts_arg->mem_offset;
+
+if (is_read) {
+tcg_gen_sync_temp_v128(tmp);
+} else {
+tcg_gen_discard_v128(tmp);
+}
+} else {
+/* Temporaries use swap space in TCGContext. Since we already have
+   a 128-bit temporary we'll assume that the target supports 128-bit
+   loads and store

[Qemu-devel] [PATCH RFC 7/7] tcg/i386: add support for vector opcodes

2014-10-16 Thread Kirill Batuzov
To be able to generate vector operations in the TCG backend we need to do
several things.

1. We need to tell the register allocator about the target's vector registers.
   In the case of x86 we'll use xmm0..xmm7. xmm7 is designated as a scratch
   register; the others can be used by the register allocator.

2. We need a new constraint to indicate where to use vector registers. In this
   commit the constraint 'V' is introduced (see the sketch after this list).

3. We need to be able to generate the bare minimum: load, store and reg-to-reg
   move. MOVDQU is used for loads and stores. MOVDQA is used for reg-to-reg
   moves.

4. Finally we need to support any other opcodes we want. INDEX_op_add_i32x4 is
   the only one for now. The PADDD instruction handles it perfectly.
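
A minimal sketch of how the 'V' constraint is expected to appear in the x86
constraint table (the corresponding hunk is cut off in this excerpt, so the
entry below is an assumption, not a quote from the patch):

static const TCGTargetOpDef x86_op_defs[] = {
    /* ... existing scalar entries elided ... */
    { INDEX_op_add_i32x4, { "V", "V", "V" } }, /* output and inputs in xmm regs */
};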

Signed-off-by: Kirill Batuzov 
---
 tcg/i386/tcg-target.c |  103 ++---
 tcg/i386/tcg-target.h |   24 +++-
 2 files changed, 119 insertions(+), 8 deletions(-)

diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
index 4133dcf..f26750d 100644
--- a/tcg/i386/tcg-target.c
+++ b/tcg/i386/tcg-target.c
@@ -32,6 +32,9 @@ static const char * const 
tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
 #else
 "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
 #endif
+#ifdef TCG_TARGET_HAS_REG128
+"%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+#endif
 };
 #endif
 
@@ -61,6 +64,16 @@ static const int tcg_target_reg_alloc_order[] = {
 TCG_REG_EDX,
 TCG_REG_EAX,
 #endif
+#ifdef TCG_TARGET_HAS_REG128
+TCG_REG_XMM0,
+TCG_REG_XMM1,
+TCG_REG_XMM2,
+TCG_REG_XMM3,
+TCG_REG_XMM4,
+TCG_REG_XMM5,
+TCG_REG_XMM6,
+/*  TCG_REG_XMM7, <- scratch register */
+#endif
 };
 
 static const int tcg_target_call_iarg_regs[] = {
@@ -247,6 +260,10 @@ static int target_parse_constraint(TCGArgConstraint *ct, 
const char **pct_str)
 case 'I':
 ct->ct |= TCG_CT_CONST_I32;
 break;
+case 'V':
+ct->ct |= TCG_CT_REG;
+tcg_regset_set32(ct->u.regs, 0, 0xff);
+break;
 
 default:
 return -1;
@@ -301,6 +318,9 @@ static inline int tcg_target_const_match(tcg_target_long 
val, TCGType type,
 #define P_SIMDF30x1 /* 0xf3 opcode prefix */
 #define P_SIMDF20x2 /* 0xf2 opcode prefix */
 
+#define P_SSE_660F  (P_DATA16 | P_EXT)
+#define P_SSE_F30F  (P_SIMDF3 | P_EXT)
+
 #define OPC_ARITH_EvIz (0x81)
 #define OPC_ARITH_EvIb (0x83)
 #define OPC_ARITH_GvEv (0x03)  /* ... plus (ARITH_FOO << 3) */
@@ -351,6 +371,11 @@ static inline int tcg_target_const_match(tcg_target_long 
val, TCGType type,
 #define OPC_GRP3_Ev(0xf7)
 #define OPC_GRP5   (0xff)
 
+#define OPC_MOVDQU_M2R  (0x6f | P_SSE_F30F)  /* load 128-bit value (mem to reg) */
+#define OPC_MOVDQU_R2M  (0x7f | P_SSE_F30F)  /* store 128-bit value (reg to mem) */
+#define OPC_MOVDQA_R2R  (0x6f | P_SSE_660F)  /* reg-to-reg 128-bit mov */
+#define OPC_PADDD   (0xfe | P_SSE_660F)
+
 /* Group 1 opcode extensions for 0x80-0x83.
These are also used as modifiers for OPC_ARITH.  */
 #define ARITH_ADD 0
@@ -428,6 +453,9 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, int 
rm, int x)
 assert((opc & P_REXW) == 0);
 tcg_out8(s, 0x66);
 }
+if (opc & P_SIMDF3) {
+tcg_out8(s, 0xf3);
+}
 if (opc & P_ADDR32) {
 tcg_out8(s, 0x67);
 }
@@ -634,9 +662,22 @@ static inline void tgen_arithr(TCGContext *s, int subop, 
int dest, int src)
 static inline void tcg_out_mov(TCGContext *s, TCGType type,
TCGReg ret, TCGReg arg)
 {
+int opc;
 if (arg != ret) {
-int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
-tcg_out_modrm(s, opc, ret, arg);
+switch (type) {
+case TCG_TYPE_V128:
+ret -= TCG_REG_XMM0;
+arg -= TCG_REG_XMM0;
+tcg_out_modrm(s, OPC_MOVDQA_R2R, ret, arg);
+break;
+case TCG_TYPE_I32:
+case TCG_TYPE_I64:
+opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
+tcg_out_modrm(s, opc, ret, arg);
+break;
+default:
+assert(0);
+}
 }
 }
 
@@ -699,15 +740,39 @@ static inline void tcg_out_pop(TCGContext *s, int reg)
 static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
   TCGReg arg1, intptr_t arg2)
 {
-int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
-tcg_out_modrm_offset(s, opc, ret, arg1, arg2);
+int opc;
+switch (type) {
+case TCG_TYPE_V128:
+ret -= TCG_REG_XMM0;
+tcg_out_modrm_offset(s, OPC_MOVDQU_M2R, ret, arg1, arg2);
+break;
+case TCG_TYPE_I32:
+case TCG_TYPE_I64:
+

[Qemu-devel] [PATCH RFC 0/7] Translate guest vector operations to host vector operations

2014-10-16 Thread Kirill Batuzov
ers we have NEON where a 128-bit Q register consists of two 64-bit
D registers, each consisting of two 32-bit S registers. I think I'll need
to add an alias list to each global, listing every other global it can clobber,
and then iterate over it in the optimizer. Fortunately this list will be static
and not very long.
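
A minimal sketch of what such a static alias table could look like (the names
and layout are my illustration, not part of this series):

/* Each global lists the other globals whose storage it overlaps, so the
   optimizer can invalidate them whenever this global is written.
   For ARM NEON, q0 overlaps d0 and d1, which in turn overlap s0..s3. */
typedef struct TCGGlobalAlias {
    int nb_aliases;        /* number of overlapping globals       */
    const int *aliases;    /* their indices in the globals array  */
} TCGGlobalAlias;

The optimizer would walk this list whenever a global is written and drop
whatever it knows about the listed globals.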

Why I think all this is worth doing:

(1) Performance. 200% speedup is a lot. My test was specifically crafted and
real-life applications may not have that many vector operations on average,
but there is a specific class of applications where it will matter a lot -
media processing applications like ffmpeg.

(2) Some unification of common operations. Right now every target reimplements
common vector operations (like vector add/sub/mul/min/compare etc.). We can
do it once in the common TCG code.

Still there are some cons I mentioned earlier. The need to support a lot of
opcodes is the most significant one in the long run, I think. So before I
commit my time to converting more operations I'd like to hear your opinions
on whether this approach is acceptable and worth spending the effort.

Kirill Batuzov (7):
  tcg: add support for 128bit vector type
  tcg: store ENV global in TCGContext
  tcg: add sync_temp opcode
  tcg: add add_i32x4 opcode
  target-arm: support access to 128-bit guest registers as globals
  target-arm: use add_i32x4 opcode to handle vadd.i32 instruction
  tcg/i386: add support for vector opcodes

 target-arm/translate.c |   30 ++-
 tcg/i386/tcg-target.c  |  103 ---
 tcg/i386/tcg-target.h  |   24 -
 tcg/tcg-op.h   |  141 
 tcg/tcg-opc.h  |   13 +
 tcg/tcg.c  |   36 +
 tcg/tcg.h  |   34 
 7 files changed, 371 insertions(+), 10 deletions(-)

-- 
1.7.10.4




[Qemu-devel] [PATCH RFC 6/7] target-arm: use add_i32x4 opcode to handle vadd.i32 instruction

2014-10-16 Thread Kirill Batuzov

Signed-off-by: Kirill Batuzov 
---
 target-arm/translate.c |   12 
 1 file changed, 12 insertions(+)

diff --git a/target-arm/translate.c b/target-arm/translate.c
index 22855d8..00ea5cf 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -5239,6 +5239,18 @@ static int disas_neon_data_insn(CPUARMState * env, 
DisasContext *s, uint32_t ins
 return 1;
 }
 
+/* Use vector ops to handle what we can */
+switch (op) {
+case NEON_3R_VADD_VSUB:
+if (!u && size == 2) {
+tcg_gen_add_i32x4(cpu_Q[rd >> 1], cpu_Q[rn >> 1], cpu_Q[rm >> 1]);
+return 0;
+}
+break;
+default:
+break;
+}
+
 for (pass = 0; pass < (q ? 4 : 2); pass++) {
 
 if (pairwise) {
-- 
1.7.10.4




[Qemu-devel] [PATCH RFC 1/7] tcg: add support for 128bit vector type

2014-10-16 Thread Kirill Batuzov
Introduce TCG_TYPE_V128 and corresponding TCGv_v128 for TCG temps. Add wrapper
functions that work with temps of this new type.

Signed-off-by: Kirill Batuzov 
---
 tcg/tcg-op.h |   23 +++
 tcg/tcg.c|   24 
 tcg/tcg.h|   28 
 3 files changed, 75 insertions(+)

diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 019dd9b..81291fd 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -345,6 +345,29 @@ static inline void tcg_gen_op6ii_i64(TCGOpcode opc, 
TCGv_i64 arg1,
 *tcg_ctx.gen_opparam_ptr++ = arg6;
 }
 
+static inline void tcg_gen_op1_v128(TCGOpcode opc, TCGv_v128 arg1)
+{
+*tcg_ctx.gen_opc_ptr++ = opc;
+*tcg_ctx.gen_opparam_ptr++ = GET_TCGV_V128(arg1);
+}
+
+static inline void tcg_gen_op2_v128(TCGOpcode opc, TCGv_v128 arg1,
+TCGv_v128 arg2)
+{
+*tcg_ctx.gen_opc_ptr++ = opc;
+*tcg_ctx.gen_opparam_ptr++ = GET_TCGV_V128(arg1);
+*tcg_ctx.gen_opparam_ptr++ = GET_TCGV_V128(arg2);
+}
+
+static inline void tcg_gen_op3_v128(TCGOpcode opc, TCGv_v128 arg1,
+TCGv_v128 arg2, TCGv_v128 arg3)
+{
+*tcg_ctx.gen_opc_ptr++ = opc;
+*tcg_ctx.gen_opparam_ptr++ = GET_TCGV_V128(arg1);
+*tcg_ctx.gen_opparam_ptr++ = GET_TCGV_V128(arg2);
+*tcg_ctx.gen_opparam_ptr++ = GET_TCGV_V128(arg3);
+}
+
 static inline void tcg_add_param_i32(TCGv_i32 val)
 {
 *tcg_ctx.gen_opparam_ptr++ = GET_TCGV_I32(val);
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 7a84b87..d01f357 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -542,6 +542,12 @@ TCGv_i64 tcg_global_mem_new_i64(int reg, intptr_t offset, 
const char *name)
 return MAKE_TCGV_I64(idx);
 }
 
+TCGv_v128 tcg_global_mem_new_v128(int reg, intptr_t offset, const char *name)
+{
+int idx = tcg_global_mem_new_internal(TCG_TYPE_V128, reg, offset, name);
+return MAKE_TCGV_V128(idx);
+}
+
 static inline int tcg_temp_new_internal(TCGType type, int temp_local)
 {
 TCGContext *s = &tcg_ctx;
@@ -612,6 +618,14 @@ TCGv_i64 tcg_temp_new_internal_i64(int temp_local)
 return MAKE_TCGV_I64(idx);
 }
 
+TCGv_v128 tcg_temp_new_internal_v128(int temp_local)
+{
+int idx;
+
+idx = tcg_temp_new_internal(TCG_TYPE_V128, temp_local);
+return MAKE_TCGV_V128(idx);
+}
+
 static void tcg_temp_free_internal(int idx)
 {
 TCGContext *s = &tcg_ctx;
@@ -644,6 +658,11 @@ void tcg_temp_free_i64(TCGv_i64 arg)
 tcg_temp_free_internal(GET_TCGV_I64(arg));
 }
 
+void tcg_temp_free_v128(TCGv_v128 arg)
+{
+tcg_temp_free_internal(GET_TCGV_V128(arg));
+}
+
 TCGv_i32 tcg_const_i32(int32_t val)
 {
 TCGv_i32 t0;
@@ -1062,6 +1081,11 @@ char *tcg_get_arg_str_i64(TCGContext *s, char *buf, int 
buf_size, TCGv_i64 arg)
 return tcg_get_arg_str_idx(s, buf, buf_size, GET_TCGV_I64(arg));
 }
 
+char *tcg_get_arg_str_v128(TCGContext *s, char *buf, int buf_size, TCGv_v128 arg)
+{
+return tcg_get_arg_str_idx(s, buf, buf_size, GET_TCGV_V128(arg));
+}
+
 /* Find helper name.  */
 static inline const char *tcg_find_helper(TCGContext *s, uintptr_t val)
 {
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 7285f71..01dbede 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -194,6 +194,7 @@ typedef struct TCGPool {
 typedef enum TCGType {
 TCG_TYPE_I32,
 TCG_TYPE_I64,
+TCG_TYPE_V128,
 TCG_TYPE_COUNT, /* number of different types */
 
 /* An alias for the size of the host register.  */
@@ -286,6 +287,7 @@ typedef tcg_target_ulong TCGArg;
 typedef struct TCGv_i32_d *TCGv_i32;
 typedef struct TCGv_i64_d *TCGv_i64;
 typedef struct TCGv_ptr_d *TCGv_ptr;
+typedef struct TCGv_v128_d *TCGv_v128;
 
 static inline TCGv_i32 QEMU_ARTIFICIAL MAKE_TCGV_I32(intptr_t i)
 {
@@ -302,6 +304,11 @@ static inline TCGv_ptr QEMU_ARTIFICIAL 
MAKE_TCGV_PTR(intptr_t i)
 return (TCGv_ptr)i;
 }
 
+static inline TCGv_v128 QEMU_ARTIFICIAL MAKE_TCGV_V128(intptr_t i)
+{
+return (TCGv_v128)i;
+}
+
 static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_I32(TCGv_i32 t)
 {
 return (intptr_t)t;
@@ -317,6 +324,11 @@ static inline intptr_t QEMU_ARTIFICIAL 
GET_TCGV_PTR(TCGv_ptr t)
 return (intptr_t)t;
 }
 
+static inline intptr_t QEMU_ARTIFICIAL GET_TCGV_V128(TCGv_v128 t)
+{
+return (intptr_t)t;
+}
+
 #if TCG_TARGET_REG_BITS == 32
 #define TCGV_LOW(t) MAKE_TCGV_I32(GET_TCGV_I64(t))
 #define TCGV_HIGH(t) MAKE_TCGV_I32(GET_TCGV_I64(t) + 1)
@@ -324,15 +336,18 @@ static inline intptr_t QEMU_ARTIFICIAL 
GET_TCGV_PTR(TCGv_ptr t)
 
 #define TCGV_EQUAL_I32(a, b) (GET_TCGV_I32(a) == GET_TCGV_I32(b))
 #define TCGV_EQUAL_I64(a, b) (GET_TCGV_I64(a) == GET_TCGV_I64(b))
+#define TCGV_EQUAL_V128(a, b) (GET_TCGV_V128(a) == GET_TCGV_V128(b))
 #define TCGV_EQUAL_PTR(a, b) (GET_TCGV_PTR(a) == GET_TCGV_PTR(b))
 
 /* Dummy definition to avoid compiler warnings.  */
 #define TCGV_UNUSED_I32(x) x = MAKE_TCGV_I32(-1)
 #define TCGV_UNUSED_I64(x) x = MAKE_TCGV_I64(-1)
+#define TCGV_UNUSED_V128(x) x = MAKE_TCGV_V128(-1)
 #define TCGV_UNUSED_PTR(x) x = MAKE_TCGV_PTR(-1)
 

[Qemu-devel] [PATCH RFC 2/7] tcg: store ENV global in TCGContext

2014-10-16 Thread Kirill Batuzov
When a TCG backend does not support some vector operation we need to emulate
it. Unlike the arguments of scalar operations, vector values are hard to
operate on directly or to pass as function arguments (because a target may
lack support for the corresponding type). To avoid this we will use pointers
to host memory locations holding the values of temporaries. For globals these
memory locations must be their canonical locations in CPUArchState, because
moving them around is expensive and hard to implement.

Fortunately globals always have memory locations statically assigned to them.
They are addressed relative to AREG0. To express direct access to this memory
in TCG opcodes we need to know the global variable ENV (which corresponds to
this AREG0).

Add a field to TCGContext. Frontends can save ENV there during translate_init.
It will only be used when handling vector operations, so targets that do not
use vector support do not need to set it.
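
For example (a sketch of the intended use, not part of this patch), a frontend
could record ENV right after creating it in its translate_init, here shown for
target-arm:

void arm_translate_init(void)
{
    cpu_env = tcg_global_reg_new_ptr(TCG_AREG0, "env");
    /* Let common TCG code address vector globals relative to ENV. */
    tcg_ctx.cpu_env = cpu_env;
    /* ... the rest of the existing global registrations ... */
}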

Signed-off-by: Kirill Batuzov 
---
 tcg/tcg.h |1 +
 1 file changed, 1 insertion(+)

diff --git a/tcg/tcg.h b/tcg/tcg.h
index 01dbede..83fb0d3 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -496,6 +496,7 @@ struct TCGContext {
 tcg_insn_unit *code_ptr;
 TCGTemp temps[TCG_MAX_TEMPS]; /* globals first, temps after */
 TCGTempSet free_temps[TCG_TYPE_COUNT * 2];
+TCGv_ptr cpu_env; /* used to access memory locations for vector globals */
 
 GHashTable *helpers;
 
-- 
1.7.10.4




[Qemu-devel] [PATCH RFC 3/7] tcg: add sync_temp opcode

2014-10-16 Thread Kirill Batuzov
Currently every field of CPUArchState can be accessed from TCG-generated code
either as a memory location or as a global, but not both. In order to be able
to mix these two approaches we need to restore consistency between the value
of a global (possibly kept in a register) and the value in the corresponding
memory location.

Introduce the sync_temp TCGOpcode, which instructs the register allocator to
save the value of a global into its memory location.

Signed-off-by: Kirill Batuzov 
---
 tcg/tcg-op.h  |   10 ++
 tcg/tcg-opc.h |1 +
 tcg/tcg.c |   12 
 3 files changed, 23 insertions(+)

diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index 81291fd..ea2b14f 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -1808,6 +1808,16 @@ static inline void tcg_gen_discard_i64(TCGv_i64 arg)
 #endif
 }
 
+static inline void tcg_gen_discard_v128(TCGv_v128 arg)
+{
+tcg_gen_op1_v128(INDEX_op_discard, arg);
+}
+
+static inline void tcg_gen_sync_temp_v128(TCGv_v128 arg)
+{
+tcg_gen_op1_v128(INDEX_op_sync_temp, arg);
+}
+
 static inline void tcg_gen_andc_i32(TCGv_i32 ret, TCGv_i32 arg1, TCGv_i32 arg2)
 {
 if (TCG_TARGET_HAS_andc_i32) {
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index 042d442..0916d83 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -37,6 +37,7 @@ DEF(nop3, 0, 0, 3, TCG_OPF_NOT_PRESENT)
 DEF(nopn, 0, 0, 1, TCG_OPF_NOT_PRESENT)
 
 DEF(discard, 1, 0, 0, TCG_OPF_NOT_PRESENT)
+DEF(sync_temp, 0, 1, 0, TCG_OPF_NOT_PRESENT)
 DEF(set_label, 0, 0, 1, TCG_OPF_BB_END | TCG_OPF_NOT_PRESENT)
 
 /* variable number of parameters */
diff --git a/tcg/tcg.c b/tcg/tcg.c
index d01f357..ff157b7 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -1553,6 +1553,11 @@ static void tcg_liveness_analysis(TCGContext *s)
 dead_temps[args[0]] = 1;
 mem_temps[args[0]] = 0;
 break;
+case INDEX_op_sync_temp:
+args--;
+dead_temps[args[0]] = 1;
+mem_temps[args[0]] = 1;
+break;
 case INDEX_op_end:
 break;
 
@@ -2527,6 +2532,13 @@ static inline int tcg_gen_code_common(TCGContext *s,
 case INDEX_op_discard:
 temp_dead(s, args[0]);
 break;
+case INDEX_op_sync_temp:
+/* We use it only for globals currently. */
+assert(args[0] < s->nb_globals);
+if (s->temps[args[0]].val_type == TEMP_VAL_REG) {
+tcg_reg_free(s, s->temps[args[0]].reg);
+}
+break;
 case INDEX_op_set_label:
 tcg_reg_alloc_bb_end(s, s->reserved_regs);
 tcg_out_label(s, args[0], s->code_ptr);
-- 
1.7.10.4




[Qemu-devel] [PATCH RFC 5/7] target-arm: support access to 128-bit guest registers as globals

2014-10-16 Thread Kirill Batuzov
To support 128-bit guest registers as globals we need to do two things:

1) create the corresponding globals,
2) add sync_temp/discard to code that accesses these registers as memory
   locations.

Note that the second part is not complete in this RFC yet, and mixing NEON
with VFP code can result in a miscompile.

Signed-off-by: Kirill Batuzov 
---
 target-arm/translate.c |   18 --
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/target-arm/translate.c b/target-arm/translate.c
index 8a2994f..22855d8 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -64,6 +64,7 @@ TCGv_ptr cpu_env;
 /* We reuse the same 64-bit temporaries for efficiency.  */
 static TCGv_i64 cpu_V0, cpu_V1, cpu_M0;
 static TCGv_i32 cpu_R[16];
+static TCGv_v128 cpu_Q[16];
 static TCGv_i32 cpu_CF, cpu_NF, cpu_VF, cpu_ZF;
 static TCGv_i64 cpu_exclusive_addr;
 static TCGv_i64 cpu_exclusive_val;
@@ -78,10 +79,14 @@ static TCGv_i64 cpu_F0d, cpu_F1d;
 
 #include "exec/gen-icount.h"
 
-static const char *regnames[] =
+static const char *regnames_r[] =
 { "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7",
   "r8", "r9", "r10", "r11", "r12", "r13", "r14", "pc" };
 
+static const char *regnames_q[] =
+{ "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+  "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" };
+
 /* initialize TCG globals.  */
 void arm_translate_init(void)
 {
@@ -92,7 +97,12 @@ void arm_translate_init(void)
 for (i = 0; i < 16; i++) {
 cpu_R[i] = tcg_global_mem_new_i32(TCG_AREG0,
   offsetof(CPUARMState, regs[i]),
-  regnames[i]);
+  regnames_r[i]);
+}
+for (i = 0; i < 16; i++) {
+cpu_Q[i] = tcg_global_mem_new_v128(TCG_AREG0,
+   offsetof(CPUARMState, vfp.regs[2 * i]),
+   regnames_q[i]);
 }
 cpu_CF = tcg_global_mem_new_i32(TCG_AREG0, offsetof(CPUARMState, CF), "CF");
 cpu_NF = tcg_global_mem_new_i32(TCG_AREG0, offsetof(CPUARMState, NF), "NF");
@@ -1237,23 +1247,27 @@ neon_reg_offset (int reg, int n)
 static TCGv_i32 neon_load_reg(int reg, int pass)
 {
 TCGv_i32 tmp = tcg_temp_new_i32();
+tcg_gen_sync_temp_v128(cpu_Q[reg >> 1]);
 tcg_gen_ld_i32(tmp, cpu_env, neon_reg_offset(reg, pass));
 return tmp;
 }
 
 static void neon_store_reg(int reg, int pass, TCGv_i32 var)
 {
+tcg_gen_discard_v128(cpu_Q[reg >> 1]);
 tcg_gen_st_i32(var, cpu_env, neon_reg_offset(reg, pass));
 tcg_temp_free_i32(var);
 }
 
 static inline void neon_load_reg64(TCGv_i64 var, int reg)
 {
+tcg_gen_sync_temp_v128(cpu_Q[reg >> 1]);
 tcg_gen_ld_i64(var, cpu_env, vfp_reg_offset(1, reg));
 }
 
 static inline void neon_store_reg64(TCGv_i64 var, int reg)
 {
+tcg_gen_discard_v128(cpu_Q[reg >> 1]);
 tcg_gen_st_i64(var, cpu_env, vfp_reg_offset(1, reg));
 }
 
-- 
1.7.10.4



