Add flag -fassume-phsa that is on by default. If -fno-assume-phsa
is given, these optimizations are disabled.

With this flag, gccbrig can generate GENERIC that assumes we are
targeting a phsa-runtime based implementation, which allows us
to expose the work-item context accesses to retrieve WI IDs etc.
which helps optimizers.

First optimization that takes advantage of this is to get rid of
the setworkitemid calls whenever we have non-inlined calls that
use IDs internally.

Other optimizations added in this commit:

- expand absoluteid to similar level of simplicity as workitemid.
At the moment absoluteid is the best indexing ID to end up with
WG vectorization.
- propagate ID variables closer to their uses. This is mainly
to avoid known useless casts, which confuse at least scalar
evolution analysis.
- use signed long long for storing IDs. Unsigned integers have
defined wraparound semantics, which confuse at least scalar
evolution analysis, leading to unvectorizable WI loops.
- also refactor some BRIG function generation helpers to brig_function.
- no point in having the wi-loop as a for-loop. It's really
a do...while and SCEV can analyze it just fine still.
- add consts to ptrs etc. in BRIG builtin defs.
Improves optimization opportunities.
- add qualifiers to generated function parameters.
Const and restrict on the hidden local/private pointers,
the arg buffer and the context pointer help some optimizations.
---
 gcc/brig-builtins.def                              |  27 +-
 gcc/brig/brigfrontend/brig-basic-inst-handler.cc   | 172 +---
 gcc/brig/brigfrontend/brig-branch-inst-handler.cc  |  21 +-
 gcc/brig/brigfrontend/brig-cmp-inst-handler.cc     |   6 +-
 gcc/brig/brigfrontend/brig-code-entry-handler.cc   | 503 +----------
 gcc/brig/brigfrontend/brig-code-entry-handler.h    |  21 -
 gcc/brig/brigfrontend/brig-control-handler.cc      |  20 +-
 gcc/brig/brigfrontend/brig-cvt-inst-handler.cc     |   6 +
 gcc/brig/brigfrontend/brig-function-handler.cc     |  89 +-
gcc/brig/brigfrontend/brig-function.cc | 925 +++++++++++++++++++--
 gcc/brig/brigfrontend/brig-function.h              |  43 +
 gcc/brig/brigfrontend/brig-label-handler.cc        |   3 +
 gcc/brig/brigfrontend/brig-lane-inst-handler.cc    |   2 +-
 gcc/brig/brigfrontend/brig-mem-inst-handler.cc     |   7 +-
 gcc/brig/brigfrontend/phsa.h                       |   9 +
 gcc/brig/lang.opt                                  |   5 +
 gcc/builtin-types.def                              |   4 +
 gcc/testsuite/brig.dg/test/gimple/smoke_test.hsail |  10 +-
 libhsail-rt/include/internal/phsa-rt.h             |   1 -
 libhsail-rt/include/internal/workitems.h           |  50 +-
 libhsail-rt/rt/workitems.c                         |  84 +-
 21 files changed, 1195 insertions(+), 813 deletions(-)

>From 56864a873079ab21087474abe19949f93be9b3d3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pekka=20J=C3=A4=C3=A4skel=C3=A4inen?=
 <pekka.jaaskelai...@parmance.com>
Date: Sat, 17 Feb 2018 10:16:03 +0200
Subject: [PATCH 7/8] [BRIGFE] phsa-specific optimizations

Add flag -fassume-phsa that is on by default. If -fno-assume-phsa
is given, these optimizations are disabled.

With this flag, gccbrig can generate GENERIC that assumes we are
targeting a phsa-runtime based implementation, which allows us
to expose the work-item context accesses to retrieve WI IDs etc.
which helps optimizers.

First optimization that takes advantage of this is to get rid of
the setworkitemid calls whenever we have non-inlined calls that
use IDs internally.

Other optimizations added in this commit:

- expand absoluteid to similar level of simplicity as workitemid.
At the moment absoluteid is the best indexing ID to end up with
WG vectorization.
- propagate ID variables closer to their uses. This is mainly
to avoid known useless casts, which confuse at least scalar
evolution analysis.
- use signed long long for storing IDs. Unsigned integers have
defined wraparound semantics, which confuse at least scalar
evolution analysis, leading to unvectorizable WI loops.
- also refactor some BRIG function generation helpers to brig_function.
- no point in having the wi-loop as a for-loop. It's really
a do...while and SCEV can analyze it just fine still.
- add consts to ptrs etc. in BRIG builtin defs.
Improves optimization opportunities.
- add qualifiers to generated function parameters.
Const and restrict on the hidden local/private pointers,
the arg buffer and the context pointer help some optimizations.
---
 gcc/brig-builtins.def                              |  27 +-
 gcc/brig/brigfrontend/brig-basic-inst-handler.cc   | 172 +---
 gcc/brig/brigfrontend/brig-branch-inst-handler.cc  |  21 +-
 gcc/brig/brigfrontend/brig-cmp-inst-handler.cc     |   6 +-
 gcc/brig/brigfrontend/brig-code-entry-handler.cc   | 503 +----------
 gcc/brig/brigfrontend/brig-code-entry-handler.h    |  21 -
 gcc/brig/brigfrontend/brig-control-handler.cc      |  20 +-
 gcc/brig/brigfrontend/brig-cvt-inst-handler.cc     |   6 +
 gcc/brig/brigfrontend/brig-function-handler.cc     |  89 +-
 gcc/brig/brigfrontend/brig-function.cc             | 925 +++++++++++++++++++--
 gcc/brig/brigfrontend/brig-function.h              |  43 +
 gcc/brig/brigfrontend/brig-label-handler.cc        |   3 +
 gcc/brig/brigfrontend/brig-lane-inst-handler.cc    |   2 +-
 gcc/brig/brigfrontend/brig-mem-inst-handler.cc     |   7 +-
 gcc/brig/brigfrontend/phsa.h                       |   9 +
 gcc/brig/lang.opt                                  |   5 +
 gcc/builtin-types.def                              |   4 +
 gcc/testsuite/brig.dg/test/gimple/smoke_test.hsail |  10 +-
 libhsail-rt/include/internal/phsa-rt.h             |   1 -
 libhsail-rt/include/internal/workitems.h           |  50 +-
 libhsail-rt/rt/workitems.c                         |  84 +-
 21 files changed, 1195 insertions(+), 813 deletions(-)

diff --git a/gcc/brig-builtins.def b/gcc/brig-builtins.def
index f94f7e6..c2e8d2c 100644
--- a/gcc/brig-builtins.def
+++ b/gcc/brig-builtins.def
@@ -45,25 +45,25 @@ DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_GRIDSIZE, BRIG_OPCODE_GRIDSIZE,
 
 DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMFLATABSID_U32,
 		  BRIG_OPCODE_WORKITEMFLATABSID, BRIG_TYPE_U32,
-		  "__hsail_workitemflatabsid_u32", BT_FN_UINT_PTR,
-		  ATTR_NOTHROW_LEAF_LIST)
+		  "__hsail_workitemflatabsid_u32", BT_FN_UINT_CONST_PTR,
+		  ATTR_PURE_NOTHROW_LEAF_LIST)
 
 DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMFLATABSID_U64,
 		  BRIG_OPCODE_WORKITEMFLATABSID, BRIG_TYPE_U64,
-		  "__hsail_workitemflatabsid_u64", BT_FN_ULONG_PTR,
-		  ATTR_NOTHROW_LEAF_LIST)
+		  "__hsail_workitemflatabsid_u64", BT_FN_ULONG_CONST_PTR,
+		  ATTR_PURE_NOTHROW_LEAF_LIST)
 
 DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMFLATID, BRIG_OPCODE_WORKITEMFLATID,
-		  BRIG_TYPE_U32, "__hsail_workitemflatid", BT_FN_UINT_PTR,
-		  ATTR_NOTHROW_LEAF_LIST)
+		  BRIG_TYPE_U32, "__hsail_workitemflatid", BT_FN_UINT_CONST_PTR,
+		  ATTR_PURE_NOTHROW_LEAF_LIST)
 
 DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKITEMID, BRIG_OPCODE_WORKITEMID,
-		  BRIG_TYPE_U32, "__hsail_workitemid", BT_FN_UINT_UINT_PTR,
-		  ATTR_NOTHROW_LEAF_LIST)
+		  BRIG_TYPE_U32, "__hsail_workitemid",
+		  BT_FN_UINT_UINT_CONST_PTR, ATTR_PURE_NOTHROW_LEAF_LIST)
 
 DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKGROUPID, BRIG_OPCODE_WORKGROUPID,
-		  BRIG_TYPE_U32, "__hsail_workgroupid", BT_FN_UINT_UINT_PTR,
-		  ATTR_PURE_NOTHROW_LEAF_LIST)
+		  BRIG_TYPE_U32, "__hsail_workgroupid",
+		  BT_FN_UINT_UINT_CONST_PTR, ATTR_PURE_NOTHROW_LEAF_LIST)
 
 DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_CURRENTWORKITEMFLATID,
 		  BRIG_OPCODE_CURRENTWORKITEMFLATID,
@@ -90,11 +90,12 @@ DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_PACKETCOMPLETIONSIG_SIG32,
 
 DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_CURRENTWORKGROUPSIZE,
 		  BRIG_OPCODE_CURRENTWORKGROUPSIZE, BRIG_TYPE_U32,
-		  "__hsail_currentworkgroupsize", BT_FN_UINT_UINT_PTR,
+		  "__hsail_currentworkgroupsize", BT_FN_UINT_UINT_CONST_PTR,
 		  ATTR_PURE_NOTHROW_LEAF_LIST)
 
 DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_WORKGROUPSIZE, BRIG_OPCODE_WORKGROUPSIZE,
-		  BRIG_TYPE_U32, "__hsail_workgroupsize", BT_FN_UINT_UINT_PTR,
+		  BRIG_TYPE_U32, "__hsail_workgroupsize",
+		  BT_FN_UINT_UINT_CONST_PTR,
 		  ATTR_PURE_NOTHROW_LEAF_LIST)
 
 DEF_HSAIL_BUILTIN (BUILT_IN_HSAIL_DIM, BRIG_OPCODE_DIM,
@@ -565,7 +566,7 @@ DEF_HSAIL_INTR_BUILTIN (BUILT_IN_HSAIL_SETWORKITEMID, "__hsail_setworkitemid",
 
 DEF_HSAIL_INTR_BUILTIN (BUILT_IN_HSAIL_LAUNCH_WG_FUNC,
 		       "__hsail_launch_wg_function",
-		       BT_FN_VOID_PTR_PTR_PTR, ATTR_NOTHROW_LEAF_LIST)
+		       BT_FN_VOID_PTR_PTR_UINT32, ATTR_NOTHROW_LEAF_LIST)
 
 DEF_HSAIL_INTR_BUILTIN (BUILT_IN_HSAIL_LAUNCH_KERNEL,
 		       "__hsail_launch_kernel",
diff --git a/gcc/brig/brigfrontend/brig-basic-inst-handler.cc b/gcc/brig/brigfrontend/brig-basic-inst-handler.cc
index 283da7a..c8224ae 100644
--- a/gcc/brig/brigfrontend/brig-basic-inst-handler.cc
+++ b/gcc/brig/brigfrontend/brig-basic-inst-handler.cc
@@ -105,7 +105,8 @@ brig_basic_inst_handler::build_shuffle (tree arith_type,
   /* Unpack the tightly packed mask elements to BIT_FIELD_REFs
      from which to construct the mask vector as understood by
      VEC_PERM_EXPR.  */
-  tree mask_operand = add_temp_var ("shuffle_mask", operands[2]);
+  tree mask_operand
+    = m_parent.m_cf->add_temp_var ("shuffle_mask", operands[2]);
 
   tree mask_element_type
     = build_nonstandard_integer_type (input_mask_element_size, true);
@@ -219,10 +220,11 @@ brig_basic_inst_handler::build_pack (tree_stl_vec &operands)
   tree wide_type = build_nonstandard_integer_type (vecsize, 1);
 
   tree src_vect = build_resize_convert_view (wide_type, operands[0]);
-  src_vect = add_temp_var ("src_vect", src_vect);
+  src_vect = m_parent.m_cf->add_temp_var ("src_vect", src_vect);
 
   tree scalar = operands[1];
-  scalar = add_temp_var ("scalar", convert_to_integer (wide_type, scalar));
+  scalar = m_parent.m_cf->add_temp_var ("scalar",
+					convert_to_integer (wide_type, scalar));
 
   tree pos = operands[2];
 
@@ -230,21 +232,22 @@ brig_basic_inst_handler::build_pack (tree_stl_vec &operands)
      Zero them for well-defined semantics.  */
   tree t = build2 (BIT_AND_EXPR, TREE_TYPE (pos), operands[2],
 		   build_int_cstu (TREE_TYPE (pos), ecount - 1));
-  pos = add_temp_var ("pos", convert (wide_type, t));
+  pos = m_parent.m_cf->add_temp_var ("pos", convert (wide_type, t));
 
   tree element_type = TREE_TYPE (TREE_TYPE (operands[0]));
   size_t element_width = int_size_in_bytes (element_type) * BITS_PER_UNIT;
   tree ewidth = build_int_cstu (wide_type, element_width);
 
   tree bitoffset = build2 (MULT_EXPR, wide_type, ewidth, pos);
-  bitoffset = add_temp_var ("offset", bitoffset);
+  bitoffset = m_parent.m_cf->add_temp_var ("offset", bitoffset);
 
   uint64_t mask_int
     = element_width == 64 ? (uint64_t) -1 : ((uint64_t) 1 << element_width) - 1;
 
   tree mask = build_int_cstu (wide_type, mask_int);
 
-  mask = add_temp_var ("mask", convert_to_integer (wide_type, mask));
+  mask = m_parent.m_cf->add_temp_var ("mask",
+				      convert_to_integer (wide_type, mask));
 
   tree clearing_mask
     = build1 (BIT_NOT_EXPR, wide_type,
@@ -311,7 +314,8 @@ brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode,
 					  tree arith_type,
 					  tree_stl_vec &operands)
 {
-  tree_code opcode = get_tree_code_for_hsa_opcode (brig_opcode, brig_type);
+  tree_code opcode
+    = brig_function::get_tree_code_for_hsa_opcode (brig_opcode, brig_type);
 
   BrigType16_t inner_type = brig_type & BRIG_TYPE_BASE_MASK;
 
@@ -388,8 +392,8 @@ brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode,
 	     on which cannot be used in general to remain HSAIL compliant.
 	     Perhaps a builtin call would be better option here.  */
 	  return build2 (RDIV_EXPR, arith_type, build_one_cst (arith_type),
-			 expand_or_call_builtin (BRIG_OPCODE_SQRT, brig_type,
-						 arith_type, operands));
+			 m_parent.m_cf->expand_or_call_builtin
+			 (BRIG_OPCODE_SQRT, brig_type, arith_type, operands));
 	}
       else if (brig_opcode == BRIG_OPCODE_NRCP)
 	{
@@ -410,8 +414,8 @@ brig_basic_inst_handler::build_inst_expr (BrigOpcode16_t brig_opcode,
 	gcc_unreachable ();
     }
   else if (opcode == CALL_EXPR)
-    return expand_or_call_builtin (brig_opcode, brig_type, arith_type,
-				   operands);
+    return m_parent.m_cf->expand_or_call_builtin (brig_opcode, brig_type,
+						  arith_type, operands);
   else if (output_count == 1)
     {
       if (input_count == 1)
@@ -520,7 +524,8 @@ brig_basic_inst_handler::operator () (const BrigBase *base)
     in_operands[0] = build_lower_element_broadcast (in_operands[0]);
 
   tree_code opcode
-    = get_tree_code_for_hsa_opcode (brig_inst->opcode, brig_inst_type);
+    = brig_function::get_tree_code_for_hsa_opcode (brig_inst->opcode,
+						   brig_inst_type);
 
   if (p >= BRIG_PACK_PPSAT && p <= BRIG_PACK_PSAT)
     {
@@ -566,11 +571,11 @@ brig_basic_inst_handler::operator () (const BrigBase *base)
       */
       tree_stl_vec operand0_elements;
       if (input_count > 0)
-	unpack (in_operands[0], operand0_elements);
+	m_parent.m_cf->unpack (in_operands[0], operand0_elements);
 
       tree_stl_vec operand1_elements;
       if (input_count > 1)
-	unpack (in_operands[1], operand1_elements);
+	m_parent.m_cf->unpack (in_operands[1], operand1_elements);
 
       tree_stl_vec result_elements;
 
@@ -617,7 +622,7 @@ brig_basic_inst_handler::operator () (const BrigBase *base)
 
 	  result_elements.push_back (convert (scalar_type, scalar_expr));
 	}
-      instr_expr = pack (result_elements);
+      instr_expr = m_parent.m_cf->pack (result_elements);
     }
   else
     {
@@ -728,140 +733,3 @@ brig_basic_inst_handler::build_lower_element_broadcast (tree vec_operand)
 		 vec_operand, mask);
 }
 
-/* Returns the tree code that should be used to implement the given
-   HSA instruction opcode (BRIG_OPCODE) for the given type of instruction
-   (BRIG_TYPE).  In case the opcode cannot be mapped to a TREE node directly,
-   returns TREE_LIST (if it can be emulated with a simple chain of tree
-   nodes) or CALL_EXPR if the opcode should be implemented using a builtin
-   call.  */
-
-tree_code
-brig_basic_inst_handler::get_tree_code_for_hsa_opcode
-  (BrigOpcode16_t brig_opcode, BrigType16_t brig_type) const
-{
-  BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK;
-  switch (brig_opcode)
-    {
-    case BRIG_OPCODE_NOP:
-      return NOP_EXPR;
-    case BRIG_OPCODE_ADD:
-      return PLUS_EXPR;
-    case BRIG_OPCODE_CMOV:
-      if (brig_inner_type == brig_type)
-	return COND_EXPR;
-      else
-	return VEC_COND_EXPR;
-    case BRIG_OPCODE_SUB:
-      return MINUS_EXPR;
-    case BRIG_OPCODE_MUL:
-    case BRIG_OPCODE_MUL24:
-      return MULT_EXPR;
-    case BRIG_OPCODE_MULHI:
-    case BRIG_OPCODE_MUL24HI:
-      return MULT_HIGHPART_EXPR;
-    case BRIG_OPCODE_DIV:
-      if (gccbrig_is_float_type (brig_inner_type))
-	return RDIV_EXPR;
-      else
-	return TRUNC_DIV_EXPR;
-    case BRIG_OPCODE_NEG:
-      return NEGATE_EXPR;
-    case BRIG_OPCODE_MIN:
-      if (gccbrig_is_float_type (brig_inner_type))
-	return CALL_EXPR;
-      else
-	return MIN_EXPR;
-    case BRIG_OPCODE_MAX:
-      if (gccbrig_is_float_type (brig_inner_type))
-	return CALL_EXPR;
-      else
-	return MAX_EXPR;
-    case BRIG_OPCODE_FMA:
-      return FMA_EXPR;
-    case BRIG_OPCODE_ABS:
-      return ABS_EXPR;
-    case BRIG_OPCODE_SHL:
-      return LSHIFT_EXPR;
-    case BRIG_OPCODE_SHR:
-      return RSHIFT_EXPR;
-    case BRIG_OPCODE_OR:
-      return BIT_IOR_EXPR;
-    case BRIG_OPCODE_XOR:
-      return BIT_XOR_EXPR;
-    case BRIG_OPCODE_AND:
-      return BIT_AND_EXPR;
-    case BRIG_OPCODE_NOT:
-      return BIT_NOT_EXPR;
-    case BRIG_OPCODE_RET:
-      return RETURN_EXPR;
-    case BRIG_OPCODE_MOV:
-    case BRIG_OPCODE_LDF:
-      return MODIFY_EXPR;
-    case BRIG_OPCODE_LD:
-    case BRIG_OPCODE_ST:
-      return MEM_REF;
-    case BRIG_OPCODE_BR:
-      return GOTO_EXPR;
-    case BRIG_OPCODE_REM:
-      if (brig_type == BRIG_TYPE_U64 || brig_type == BRIG_TYPE_U32)
-	return TRUNC_MOD_EXPR;
-      else
-	return CALL_EXPR;
-    case BRIG_OPCODE_NRCP:
-    case BRIG_OPCODE_NRSQRT:
-      /* Implement as 1/f (x).  gcc should pattern detect that and
-	 use a native instruction, if available, for it.  */
-      return TREE_LIST;
-    case BRIG_OPCODE_FLOOR:
-    case BRIG_OPCODE_CEIL:
-    case BRIG_OPCODE_SQRT:
-    case BRIG_OPCODE_NSQRT:
-    case BRIG_OPCODE_RINT:
-    case BRIG_OPCODE_TRUNC:
-    case BRIG_OPCODE_POPCOUNT:
-    case BRIG_OPCODE_COPYSIGN:
-    case BRIG_OPCODE_NCOS:
-    case BRIG_OPCODE_NSIN:
-    case BRIG_OPCODE_NLOG2:
-    case BRIG_OPCODE_NEXP2:
-    case BRIG_OPCODE_NFMA:
-      /* Class has type B1 regardless of the float type, thus
-	 the below builtin map search cannot find it.  */
-    case BRIG_OPCODE_CLASS:
-    case BRIG_OPCODE_WORKITEMABSID:
-      return CALL_EXPR;
-    default:
-
-      /* Some BRIG opcodes can use the same builtins for unsigned and
-	 signed types.  Force these cases to unsigned types.
-      */
-
-      if (brig_opcode == BRIG_OPCODE_BORROW
-	  || brig_opcode == BRIG_OPCODE_CARRY
-	  || brig_opcode == BRIG_OPCODE_LASTBIT
-	  || brig_opcode == BRIG_OPCODE_BITINSERT)
-	{
-	  if (brig_type == BRIG_TYPE_S32)
-	    brig_type = BRIG_TYPE_U32;
-	  else if (brig_type == BRIG_TYPE_S64)
-	    brig_type = BRIG_TYPE_U64;
-	}
-
-
-      builtin_map::const_iterator i
-	= s_custom_builtins.find (std::make_pair (brig_opcode, brig_type));
-      if (i != s_custom_builtins.end ())
-	return CALL_EXPR;
-      else if (s_custom_builtins.find
-	       (std::make_pair (brig_opcode, brig_inner_type))
-	       != s_custom_builtins.end ())
-	return CALL_EXPR;
-      if (brig_inner_type == BRIG_TYPE_F16
-	  && s_custom_builtins.find
-	  (std::make_pair (brig_opcode, BRIG_TYPE_F32))
-	  != s_custom_builtins.end ())
-	return CALL_EXPR;
-      break;
-    }
-  return TREE_LIST; /* Emulate using a chain of nodes.  */
-}
diff --git a/gcc/brig/brigfrontend/brig-branch-inst-handler.cc b/gcc/brig/brigfrontend/brig-branch-inst-handler.cc
index 1340b74..b6baf13 100644
--- a/gcc/brig/brigfrontend/brig-branch-inst-handler.cc
+++ b/gcc/brig/brigfrontend/brig-branch-inst-handler.cc
@@ -119,10 +119,11 @@ brig_branch_inst_handler::operator () (const BrigBase *base)
 	 memory.  */
 
       tree group_local_offset
-	= add_temp_var ("group_local_offset",
-			build_int_cst
-			(uint32_type_node,
-			 m_parent.m_cf->m_local_group_variables.size()));
+	= m_parent.m_cf->add_temp_var ("group_local_offset",
+				       build_int_cst
+				       (uint32_type_node,
+					m_parent.m_cf->
+					m_local_group_variables.size()));
 
       /* TODO: ensure the callee's frame is aligned!  */
 
@@ -152,6 +153,7 @@ brig_branch_inst_handler::operator () (const BrigBase *base)
       m_parent.m_cf->m_called_functions.push_back (func_ref);
       if (DECL_EXTERNAL (func_ref))
 	m_parent.add_decl_call (call);
+      m_parent.m_cf->start_new_bb ();
 
       return base->byteCount;
     }
@@ -216,18 +218,21 @@ brig_branch_inst_handler::operator () (const BrigBase *base)
 	 ensure the barrier won't be duplicated or moved out of loops etc.
 	 Like the 'noduplicate' of LLVM.  Same goes for fbarriers.  */
       m_parent.m_cf->append_statement
-	(expand_or_call_builtin (brig_inst->opcode, BRIG_TYPE_NONE, NULL_TREE,
-				 call_operands));
+	(m_parent.m_cf->expand_or_call_builtin (brig_inst->opcode,
+						BRIG_TYPE_NONE, NULL_TREE,
+						call_operands));
     }
   else if (brig_inst->opcode >= BRIG_OPCODE_ARRIVEFBAR
 	   && brig_inst->opcode <= BRIG_OPCODE_WAITFBAR)
     {
       m_parent.m_cf->m_has_barriers = true;
       m_parent.m_cf->append_statement
-	(expand_or_call_builtin (brig_inst->opcode, BRIG_TYPE_NONE,
-				 uint32_type_node, operands));
+	(m_parent.m_cf->expand_or_call_builtin (brig_inst->opcode,
+						BRIG_TYPE_NONE,
+						uint32_type_node, operands));
     }
   else
     gcc_unreachable ();
+  m_parent.m_cf->start_new_bb ();
   return base->byteCount;
 }
diff --git a/gcc/brig/brigfrontend/brig-cmp-inst-handler.cc b/gcc/brig/brigfrontend/brig-cmp-inst-handler.cc
index 1155ead..729e3fd 100644
--- a/gcc/brig/brigfrontend/brig-cmp-inst-handler.cc
+++ b/gcc/brig/brigfrontend/brig-cmp-inst-handler.cc
@@ -180,17 +180,17 @@ brig_cmp_inst_handler::operator () (const BrigBase *base)
 	 results, we must now truncate the result vector to S16s so it
 	 fits to the destination register.  We can build the target vector
 	 type from the f16 storage type (unsigned ints).  */
-      expr = add_temp_var ("wide_cmp_result", expr);
+      expr = m_parent.m_cf->add_temp_var ("wide_cmp_result", expr);
       tree_stl_vec wide_elements;
       tree_stl_vec shrunk_elements;
-      unpack (expr, wide_elements);
+      m_parent.m_cf->unpack (expr, wide_elements);
       for (size_t i = 0; i < wide_elements.size (); ++i)
 	{
 	  tree wide = wide_elements.at (i);
 	  shrunk_elements.push_back
 	    (convert_to_integer (short_integer_type_node, wide));
 	}
-      expr = pack (shrunk_elements);
+      expr = m_parent.m_cf->pack (shrunk_elements);
     }
   build_output_assignment (*inst_base, operands[0], expr);
 
diff --git a/gcc/brig/brigfrontend/brig-code-entry-handler.cc b/gcc/brig/brigfrontend/brig-code-entry-handler.cc
index 36a8deb..4fa37fd 100644
--- a/gcc/brig/brigfrontend/brig-code-entry-handler.cc
+++ b/gcc/brig/brigfrontend/brig-code-entry-handler.cc
@@ -41,24 +41,9 @@
 #include "brig-builtins.h"
 #include "fold-const.h"
 
-brig_code_entry_handler::builtin_map brig_code_entry_handler::s_custom_builtins;
-
 brig_code_entry_handler::brig_code_entry_handler (brig_to_generic &parent)
   : brig_entry_handler (parent)
 {
-  if (s_custom_builtins.size () > 0) return;
-
-  /* Populate the builtin index.  */
-#undef DEF_HSAIL_ATOMIC_BUILTIN
-#undef DEF_HSAIL_CVT_ZEROI_SAT_BUILTIN
-#undef DEF_HSAIL_INTR_BUILTIN
-#undef DEF_HSAIL_SAT_BUILTIN
-#undef DEF_HSAIL_BUILTIN
-#define DEF_HSAIL_BUILTIN(ENUM, HSAIL_OPCODE, HSAIL_TYPE, NAME, TYPE, ATTRS) \
-  s_custom_builtins[std::make_pair (HSAIL_OPCODE, HSAIL_TYPE)]		\
-    = builtin_decl_explicit (ENUM);
-
-#include "brig-builtins.def"
 }
 
 /* Build a tree operand which is a reference to a piece of code.  REF is the
@@ -301,18 +286,18 @@ brig_code_entry_handler::build_address_operand
 
 	  tree local_size
 	    = build2 (MULT_EXPR, uint32_type_node,
-		      expand_or_call_builtin (BRIG_OPCODE_WORKGROUPSIZE,
-					      BRIG_TYPE_U32,
-					      uint32_type_node, uint32_0),
-		      expand_or_call_builtin (BRIG_OPCODE_WORKGROUPSIZE,
-					      BRIG_TYPE_U32,
-					      uint32_type_node, uint32_1));
+		      m_parent.m_cf->expand_or_call_builtin
+		      (BRIG_OPCODE_WORKGROUPSIZE, BRIG_TYPE_U32,
+		       uint32_type_node, uint32_0),
+		      m_parent.m_cf->expand_or_call_builtin
+		      (BRIG_OPCODE_WORKGROUPSIZE, BRIG_TYPE_U32,
+		       uint32_type_node, uint32_1));
 
 	  local_size
 	    = build2 (MULT_EXPR, uint32_type_node,
-		      expand_or_call_builtin (BRIG_OPCODE_WORKGROUPSIZE,
-					      BRIG_TYPE_U32,
-					      uint32_type_node, uint32_2),
+		      m_parent.m_cf->expand_or_call_builtin
+		      (BRIG_OPCODE_WORKGROUPSIZE, BRIG_TYPE_U32,
+		       uint32_type_node, uint32_2),
 		      local_size);
 
 	  tree var_region
@@ -324,9 +309,9 @@ brig_code_entry_handler::build_address_operand
 	    = build2 (MULT_EXPR, uint32_type_node,
 		      build_int_cst (uint32_type_node,
 				     m_parent.private_variable_size (var_name)),
-		      expand_or_call_builtin (BRIG_OPCODE_WORKITEMFLATID,
-					      BRIG_TYPE_U32,
-					      uint32_type_node, operands));
+		      m_parent.m_cf->expand_or_call_builtin
+		      (BRIG_OPCODE_WORKITEMFLATID, BRIG_TYPE_U32,
+		       uint32_type_node, operands));
 
 	  tree var_offset
 	    = build2 (PLUS_EXPR, uint32_type_node, var_region, pos);
@@ -336,8 +321,9 @@ brig_code_entry_handler::build_address_operand
 	     offset to a flat address by adding it as an offset to a (private
 	     or group) base pointer later on.  Same applies to group_var_offset.  */
 	  symbol_base
-	    = add_temp_var ("priv_var_offset",
-			    convert (size_type_node, var_offset));
+	    = m_parent.m_cf->add_temp_var ("priv_var_offset",
+					   convert (size_type_node,
+						    var_offset));
 	}
       else if (segment == BRIG_SEGMENT_ARG)
 	{
@@ -699,138 +685,6 @@ brig_code_entry_handler::get_tree_expr_type_for_hsa_type
     return gccbrig_tree_type_for_hsa_type (brig_type);
 }
 
-/* In case the HSA instruction must be implemented using a builtin,
-   this function is called to get the correct builtin function.
-   TYPE is the instruction tree type, BRIG_OPCODE the opcode of the
-   brig instruction and BRIG_TYPE the brig instruction's type.  */
-
-tree
-brig_code_entry_handler::get_builtin_for_hsa_opcode
-  (tree type, BrigOpcode16_t brig_opcode, BrigType16_t brig_type) const
-{
-  tree builtin = NULL_TREE;
-  tree builtin_type = type;
-
-  /* For vector types, first find the scalar version of the builtin.  */
-  if (type != NULL_TREE && VECTOR_TYPE_P (type))
-    builtin_type = TREE_TYPE (type);
-  BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK;
-
-  /* Some BRIG opcodes can use the same builtins for unsigned and
-     signed types.  Force these cases to unsigned types.  */
-
-  if (brig_opcode == BRIG_OPCODE_BORROW
-      || brig_opcode == BRIG_OPCODE_CARRY
-      || brig_opcode == BRIG_OPCODE_LASTBIT
-      || brig_opcode == BRIG_OPCODE_BITINSERT)
-    {
-      if (brig_type == BRIG_TYPE_S32)
-	brig_type = BRIG_TYPE_U32;
-      else if (brig_type == BRIG_TYPE_S64)
-	brig_type = BRIG_TYPE_U64;
-    }
-
-  switch (brig_opcode)
-    {
-    case BRIG_OPCODE_FLOOR:
-      builtin = mathfn_built_in (builtin_type, BUILT_IN_FLOOR);
-      break;
-    case BRIG_OPCODE_CEIL:
-      builtin = mathfn_built_in (builtin_type, BUILT_IN_CEIL);
-      break;
-    case BRIG_OPCODE_SQRT:
-    case BRIG_OPCODE_NSQRT:
-      builtin = mathfn_built_in (builtin_type, BUILT_IN_SQRT);
-      break;
-    case BRIG_OPCODE_RINT:
-      builtin = mathfn_built_in (builtin_type, BUILT_IN_RINT);
-      break;
-    case BRIG_OPCODE_TRUNC:
-      builtin = mathfn_built_in (builtin_type, BUILT_IN_TRUNC);
-      break;
-    case BRIG_OPCODE_COPYSIGN:
-      builtin = mathfn_built_in (builtin_type, BUILT_IN_COPYSIGN);
-      break;
-    case BRIG_OPCODE_NSIN:
-      builtin = mathfn_built_in (builtin_type, BUILT_IN_SIN);
-      break;
-    case BRIG_OPCODE_NLOG2:
-      builtin = mathfn_built_in (builtin_type, BUILT_IN_LOG2);
-      break;
-    case BRIG_OPCODE_NEXP2:
-      builtin = mathfn_built_in (builtin_type, BUILT_IN_EXP2);
-      break;
-    case BRIG_OPCODE_NFMA:
-      builtin = mathfn_built_in (builtin_type, BUILT_IN_FMA);
-      break;
-    case BRIG_OPCODE_NCOS:
-      builtin = mathfn_built_in (builtin_type, BUILT_IN_COS);
-      break;
-    case BRIG_OPCODE_POPCOUNT:
-      /* Popcount should be typed by its argument type (the return value
-	 is always u32).  Let's use a b64 version for also for b32 for now.  */
-      return builtin_decl_explicit (BUILT_IN_POPCOUNTL);
-    case BRIG_OPCODE_BORROW:
-      /* Borrow uses the same builtin for unsigned and signed types.  */
-      if (brig_type == BRIG_TYPE_S32 || brig_type == BRIG_TYPE_U32)
-	return builtin_decl_explicit (BUILT_IN_HSAIL_BORROW_U32);
-      else
-	return builtin_decl_explicit (BUILT_IN_HSAIL_BORROW_U64);
-    case BRIG_OPCODE_CARRY:
-      /* Carry also uses the same builtin for unsigned and signed types.  */
-      if (brig_type == BRIG_TYPE_S32 || brig_type == BRIG_TYPE_U32)
-	return builtin_decl_explicit (BUILT_IN_HSAIL_CARRY_U32);
-      else
-	return builtin_decl_explicit (BUILT_IN_HSAIL_CARRY_U64);
-    default:
-
-      /* Use our builtin index for finding a proper builtin for the BRIG
-	 opcode and BRIG type.  This takes care most of the builtin cases,
-	 the special cases are handled in the separate 'case' statements
-	 above.  */
-      builtin_map::const_iterator i
-	= s_custom_builtins.find (std::make_pair (brig_opcode, brig_type));
-      if (i != s_custom_builtins.end ())
-	return (*i).second;
-
-      if (brig_inner_type != brig_type)
-	{
-	  /* Try to find a scalar built-in we could use.  */
-	  i = s_custom_builtins.find
-	    (std::make_pair (brig_opcode, brig_inner_type));
-	  if (i != s_custom_builtins.end ())
-	    return (*i).second;
-	}
-
-      /* In case this is an fp16 operation that is promoted to fp32,
-	 try to find a fp32 scalar built-in.  */
-      if (brig_inner_type == BRIG_TYPE_F16)
-	{
-	  i = s_custom_builtins.find
-	    (std::make_pair (brig_opcode, BRIG_TYPE_F32));
-	  if (i != s_custom_builtins.end ())
-	    return (*i).second;
-	}
-      gcc_unreachable ();
-    }
-
-  if (VECTOR_TYPE_P (type) && builtin != NULL_TREE)
-    {
-      /* Try to find a vectorized version of the built-in.
-	 TODO: properly assert that builtin is a mathfn builtin? */
-      tree vec_builtin
-	= targetm.vectorize.builtin_vectorized_function
-	(builtin_mathfn_code (builtin), type, type);
-      if (vec_builtin != NULL_TREE)
-	return vec_builtin;
-      else
-	return builtin;
-    }
-  if (builtin == NULL_TREE)
-    gcc_unreachable ();
-  return builtin;
-}
-
 /* Return the correct GENERIC type for storing comparison results
    of operand with the type given in SOURCE_TYPE.  */
 
@@ -848,264 +702,6 @@ brig_code_entry_handler::get_comparison_result_type (tree source_type)
     return gccbrig_tree_type_for_hsa_type (BRIG_TYPE_B1);
 }
 
-/* Returns true in case the given opcode needs to know about work-item context
-   data.  In such case the context data is passed as a pointer to a work-item
-   context object, as the last argument in the builtin call.  */
-
-bool
-brig_code_entry_handler::needs_workitem_context_data
-  (BrigOpcode16_t brig_opcode) const
-{
-  switch (brig_opcode)
-    {
-    case BRIG_OPCODE_WORKITEMABSID:
-    case BRIG_OPCODE_WORKITEMFLATABSID:
-    case BRIG_OPCODE_WORKITEMFLATID:
-    case BRIG_OPCODE_CURRENTWORKITEMFLATID:
-    case BRIG_OPCODE_WORKITEMID:
-    case BRIG_OPCODE_WORKGROUPID:
-    case BRIG_OPCODE_WORKGROUPSIZE:
-    case BRIG_OPCODE_CURRENTWORKGROUPSIZE:
-    case BRIG_OPCODE_GRIDGROUPS:
-    case BRIG_OPCODE_GRIDSIZE:
-    case BRIG_OPCODE_DIM:
-    case BRIG_OPCODE_PACKETID:
-    case BRIG_OPCODE_PACKETCOMPLETIONSIG:
-    case BRIG_OPCODE_BARRIER:
-    case BRIG_OPCODE_WAVEBARRIER:
-    case BRIG_OPCODE_ARRIVEFBAR:
-    case BRIG_OPCODE_INITFBAR:
-    case BRIG_OPCODE_JOINFBAR:
-    case BRIG_OPCODE_LEAVEFBAR:
-    case BRIG_OPCODE_RELEASEFBAR:
-    case BRIG_OPCODE_WAITFBAR:
-    case BRIG_OPCODE_CUID:
-    case BRIG_OPCODE_MAXCUID:
-    case BRIG_OPCODE_DEBUGTRAP:
-    case BRIG_OPCODE_GROUPBASEPTR:
-    case BRIG_OPCODE_KERNARGBASEPTR:
-    case BRIG_OPCODE_ALLOCA:
-      return true;
-    default:
-      return false;
-    };
-}
-
-/* Returns true in case the given opcode that would normally be generated
-   as a builtin call can be expanded to tree nodes.  */
-
-bool
-brig_code_entry_handler::can_expand_builtin (BrigOpcode16_t brig_opcode) const
-{
-  switch (brig_opcode)
-    {
-    case BRIG_OPCODE_WORKITEMFLATABSID:
-    case BRIG_OPCODE_WORKITEMFLATID:
-    case BRIG_OPCODE_WORKITEMABSID:
-    case BRIG_OPCODE_WORKGROUPSIZE:
-    case BRIG_OPCODE_CURRENTWORKGROUPSIZE:
-      /* TODO: expand more builtins.  */
-      return true;
-    default:
-      return false;
-    };
-}
-
-/* Try to expand the given builtin call to reuse a previously generated
-   variable, if possible.  If not, just call the given builtin.
-   BRIG_OPCODE and BRIG_TYPE identify the builtin's BRIG opcode/type,
-   ARITH_TYPE its GENERIC type, and OPERANDS contains the builtin's
-   input operands.  */
-
-tree
-brig_code_entry_handler::expand_or_call_builtin (BrigOpcode16_t brig_opcode,
-						 BrigType16_t brig_type,
-						 tree arith_type,
-						 tree_stl_vec &operands)
-{
-  if (m_parent.m_cf->m_is_kernel && can_expand_builtin (brig_opcode))
-    return expand_builtin (brig_opcode, operands);
-
-  tree built_in
-    = get_builtin_for_hsa_opcode (arith_type, brig_opcode, brig_type);
-
-  if (!VECTOR_TYPE_P (TREE_TYPE (TREE_TYPE (built_in)))
-      && arith_type != NULL_TREE && VECTOR_TYPE_P (arith_type)
-      && brig_opcode != BRIG_OPCODE_LERP
-      && brig_opcode != BRIG_OPCODE_PACKCVT
-      && brig_opcode != BRIG_OPCODE_SAD
-      && brig_opcode != BRIG_OPCODE_SADHI)
-    {
-      /* Call the scalar built-in for all elements in the vector.  */
-      tree_stl_vec operand0_elements;
-      if (operands.size () > 0)
-	unpack (operands[0], operand0_elements);
-
-      tree_stl_vec operand1_elements;
-      if (operands.size () > 1)
-	unpack (operands[1], operand1_elements);
-
-      tree_stl_vec result_elements;
-
-      size_t element_count = gccbrig_type_vector_subparts (arith_type);
-      for (size_t i = 0; i < element_count; ++i)
-	{
-	  tree_stl_vec call_operands;
-	  if (operand0_elements.size () > 0)
-	    call_operands.push_back (operand0_elements.at (i));
-
-	  if (operand1_elements.size () > 0)
-	    call_operands.push_back (operand1_elements.at (i));
-
-	  result_elements.push_back
-	    (expand_or_call_builtin (brig_opcode, brig_type,
-				     TREE_TYPE (arith_type),
-				     call_operands));
-	}
-      return pack (result_elements);
-    }
-
-  tree_stl_vec call_operands;
-  tree_stl_vec operand_types;
-
-  tree arg_type_chain = TYPE_ARG_TYPES (TREE_TYPE (built_in));
-
-  for (size_t i = 0; i < operands.size (); ++i)
-    {
-      tree operand_type = TREE_VALUE (arg_type_chain);
-      call_operands.push_back (convert (operand_type, operands[i]));
-      operand_types.push_back (operand_type);
-      arg_type_chain = TREE_CHAIN (arg_type_chain);
-    }
-
-  if (needs_workitem_context_data (brig_opcode))
-    {
-      call_operands.push_back (m_parent.m_cf->m_context_arg);
-      operand_types.push_back (ptr_type_node);
-      m_parent.m_cf->m_has_unexpanded_dp_builtins = true;
-    }
-
-  size_t operand_count = call_operands.size ();
-
-  call_operands.resize (4, NULL_TREE);
-  operand_types.resize (4, NULL_TREE);
-  for (size_t i = 0; i < operand_count; ++i)
-    call_operands.at (i) = build_resize_convert_view (operand_types.at (i),
-						      call_operands.at (i));
-
-  tree fnptr = build_fold_addr_expr (built_in);
-  return build_call_array (TREE_TYPE (TREE_TYPE (built_in)), fnptr,
-			   operand_count, &call_operands[0]);
-}
-
-/* Instead of calling a built-in, reuse a previously returned value known to
-   be still valid.  This is beneficial especially for the work-item
-   identification related builtins as not having them as calls can lead to
-   more easily vectorizable parallel loops for multi work-item work-groups.
-   BRIG_OPCODE identifies the builtin and OPERANDS store the operands.  */
-
-tree
-brig_code_entry_handler::expand_builtin (BrigOpcode16_t brig_opcode,
-					 tree_stl_vec &operands)
-{
-  tree_stl_vec uint32_0 = tree_stl_vec (1, build_int_cst (uint32_type_node, 0));
-
-  tree_stl_vec uint32_1 = tree_stl_vec (1, build_int_cst (uint32_type_node, 1));
-
-  tree_stl_vec uint32_2 = tree_stl_vec (1, build_int_cst (uint32_type_node, 2));
-
-  if (brig_opcode == BRIG_OPCODE_WORKITEMFLATABSID)
-    {
-      tree id0 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_0);
-      id0 = convert (uint64_type_node, id0);
-
-      tree id1 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_1);
-      id1 = convert (uint64_type_node, id1);
-
-      tree id2 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_2);
-      id2 = convert (uint64_type_node, id2);
-
-      tree max0 = convert (uint64_type_node,
-			   m_parent.m_cf->m_grid_size_vars[0]);
-      tree max1 = convert (uint64_type_node,
-			   m_parent.m_cf->m_grid_size_vars[1]);
-
-      tree id2_x_max0_x_max1 = build2 (MULT_EXPR, uint64_type_node, id2, max0);
-      id2_x_max0_x_max1
-	= build2 (MULT_EXPR, uint64_type_node, id2_x_max0_x_max1, max1);
-
-      tree id1_x_max0 = build2 (MULT_EXPR, uint64_type_node, id1, max0);
-
-      tree sum = build2 (PLUS_EXPR, uint64_type_node, id0, id1_x_max0);
-      sum = build2 (PLUS_EXPR, uint64_type_node, sum, id2_x_max0_x_max1);
-
-      return add_temp_var ("workitemflatabsid", sum);
-    }
-  else if (brig_opcode == BRIG_OPCODE_WORKITEMABSID)
-    {
-      HOST_WIDE_INT dim = int_constant_value (operands[0]);
-
-      tree local_id_var = m_parent.m_cf->m_local_id_vars[dim];
-      tree wg_id_var = m_parent.m_cf->m_wg_id_vars[dim];
-      tree wg_size_var = m_parent.m_cf->m_wg_size_vars[dim];
-
-      tree wg_id_x_wg_size = build2 (MULT_EXPR, uint32_type_node,
-				     convert (uint32_type_node, wg_id_var),
-				     convert (uint32_type_node, wg_size_var));
-      tree sum
-	= build2 (PLUS_EXPR, uint32_type_node, wg_id_x_wg_size, local_id_var);
-
-      return add_temp_var (std::string ("workitemabsid_")
-			   + (char) ((int) 'x' + dim), sum);
-    }
-  else if (brig_opcode == BRIG_OPCODE_WORKITEMFLATID)
-    {
-      tree z_x_wgsx_wgsy
-	= build2 (MULT_EXPR, uint32_type_node,
-		  m_parent.m_cf->m_local_id_vars[2],
-		  m_parent.m_cf->m_wg_size_vars[0]);
-      z_x_wgsx_wgsy = build2 (MULT_EXPR, uint32_type_node, z_x_wgsx_wgsy,
-			      m_parent.m_cf->m_wg_size_vars[1]);
-
-      tree y_x_wgsx
-	= build2 (MULT_EXPR, uint32_type_node,
-		  m_parent.m_cf->m_local_id_vars[1],
-		  m_parent.m_cf->m_wg_size_vars[0]);
-
-      tree sum = build2 (PLUS_EXPR, uint32_type_node, y_x_wgsx, z_x_wgsx_wgsy);
-      sum = build2 (PLUS_EXPR, uint32_type_node,
-		    m_parent.m_cf->m_local_id_vars[0],
-		    sum);
-      return add_temp_var ("workitemflatid", sum);
-    }
-  else if (brig_opcode == BRIG_OPCODE_WORKGROUPSIZE)
-    {
-      HOST_WIDE_INT dim = int_constant_value (operands[0]);
-      return m_parent.m_cf->m_wg_size_vars[dim];
-    }
-  else if (brig_opcode == BRIG_OPCODE_CURRENTWORKGROUPSIZE)
-    {
-      HOST_WIDE_INT dim = int_constant_value (operands[0]);
-      return m_parent.m_cf->m_cur_wg_size_vars[dim];
-    }
-  else
-    gcc_unreachable ();
-
-  return NULL_TREE;
-}
-
-/* Appends and returns a new temp variable and an accompanying assignment
-   statement that stores the value of the given EXPR and has the given NAME.  */
-
-tree
-brig_code_entry_handler::add_temp_var (std::string name, tree expr)
-{
-  tree temp_var = create_tmp_var (TREE_TYPE (expr), name.c_str ());
-  tree assign = build2 (MODIFY_EXPR, TREE_TYPE (temp_var), temp_var, expr);
-  m_parent.m_cf->append_statement (assign);
-  return temp_var;
-}
-
 /* Creates a FP32 to FP16 conversion call, assuming the source and destination
    are FP32 type variables.  */
 
@@ -1387,7 +983,6 @@ brig_code_entry_handler::build_output_assignment (const BrigInstBase &brig_inst,
      variable type (can be any type; see get_m_var_declfor_reg @
      brig-function.cc).  */
   tree output_type = TREE_TYPE (output);
-  tree input_type = TREE_TYPE (inst_expr);
   bool is_fp16 = (brig_inst.type & BRIG_TYPE_BASE_MASK) == BRIG_TYPE_F16
 		 && brig_inst.base.kind != BRIG_KIND_INST_MEM
 		 && !gccbrig_is_bit_operation (brig_inst.opcode);
@@ -1396,6 +991,13 @@ brig_code_entry_handler::build_output_assignment (const BrigInstBase &brig_inst,
   bool ftz = false;
   const BrigBase *base = &brig_inst.base;
 
+  if (m_parent.m_cf->is_id_val (inst_expr))
+    inst_expr = m_parent.m_cf->id_val (inst_expr);
+
+  tree input_type = TREE_TYPE (inst_expr);
+
+  m_parent.m_cf->add_reg_var_update (output, inst_expr);
+
   if (base->kind == BRIG_KIND_INST_MOD)
     {
       const BrigInstMod *mod = (const BrigInstMod *) base;
@@ -1418,13 +1020,13 @@ brig_code_entry_handler::build_output_assignment (const BrigInstBase &brig_inst,
     {
       /* Ensure we don't duplicate the arithmetics to the arguments of the bit
 	 field reference operators.  */
-      inst_expr = add_temp_var ("before_ftz", inst_expr);
+      inst_expr = m_parent.m_cf->add_temp_var ("before_ftz", inst_expr);
       inst_expr = flush_to_zero (is_fp16) (*this, inst_expr);
     }
 
   if (is_fp16)
     {
-      inst_expr = add_temp_var ("before_f2h", inst_expr);
+      inst_expr = m_parent.m_cf->add_temp_var ("before_f2h", inst_expr);
       tree f2h_output = build_f2h_conversion (inst_expr);
       tree conv = build_resize_convert_view (output_type, f2h_output);
       tree assign = build2 (MODIFY_EXPR, output_type, output, conv);
@@ -1486,62 +1088,6 @@ brig_code_entry_handler::append_statement (tree stmt)
   m_parent.m_cf->append_statement (stmt);
 }
 
-/* Unpacks the elements of the vector in VALUE to scalars (bit field
-   references) in ELEMENTS.  */
-
-void
-brig_code_entry_handler::unpack (tree value, tree_stl_vec &elements)
-{
-  size_t vec_size = int_size_in_bytes (TREE_TYPE (value));
-  size_t element_size
-    = int_size_in_bytes (TREE_TYPE (TREE_TYPE (value))) * BITS_PER_UNIT;
-  size_t element_count
-    = vec_size * BITS_PER_UNIT / element_size;
-
-  tree input_element_type = TREE_TYPE (TREE_TYPE (value));
-
-  value = add_temp_var ("unpack_input", value);
-
-  for (size_t i = 0; i < element_count; ++i)
-    {
-      tree element
-	= build3 (BIT_FIELD_REF, input_element_type, value,
-		  TYPE_SIZE (input_element_type),
-		  bitsize_int(i * element_size));
-
-      element = add_temp_var ("scalar", element);
-      elements.push_back (element);
-    }
-}
-
-/* Pack the elements of the scalars in ELEMENTS to the returned vector.  */
-
-tree
-brig_code_entry_handler::pack (tree_stl_vec &elements)
-{
-  size_t element_count = elements.size ();
-
-  gcc_assert (element_count > 1);
-
-  tree output_element_type = TREE_TYPE (elements.at (0));
-
-  vec<constructor_elt, va_gc> *constructor_vals = NULL;
-  for (size_t i = 0; i < element_count; ++i)
-    CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, elements.at (i));
-
-  tree vec_type = build_vector_type (output_element_type, element_count);
-
-  /* build_constructor creates a vector type which is not a vector_cst
-     that requires compile time constant elements.  */
-  tree vec = build_constructor (vec_type, constructor_vals);
-
-  /* Add a temp variable for readability.  */
-  tree tmp_var = create_tmp_var (vec_type, "vec_out");
-  tree vec_tmp_assign = build2 (MODIFY_EXPR, TREE_TYPE (tmp_var), tmp_var, vec);
-  m_parent.m_cf->append_statement (vec_tmp_assign);
-  return tmp_var;
-}
-
 /* Visits the element(s) in the OPERAND, calling HANDLER to each of them.  */
 
 tree
@@ -1757,4 +1303,3 @@ brig_code_entry_handler::int_constant_value (tree node)
     n = TREE_OPERAND (n, 0);
   return int_cst_value (n);
 }
-
diff --git a/gcc/brig/brigfrontend/brig-code-entry-handler.h b/gcc/brig/brigfrontend/brig-code-entry-handler.h
index 3aa4d9e..1e082c4 100644
--- a/gcc/brig/brigfrontend/brig-code-entry-handler.h
+++ b/gcc/brig/brigfrontend/brig-code-entry-handler.h
@@ -35,8 +35,6 @@ class tree_element_unary_visitor;
 class brig_code_entry_handler : public brig_entry_handler
 {
 public:
-  typedef std::map<std::pair<BrigOpcode16_t, BrigType16_t>, tree> builtin_map;
-
   brig_code_entry_handler (brig_to_generic &parent);
 
   /* Handles the brig_code data at the given pointer and adds it to the
@@ -51,8 +49,6 @@ protected:
   tree get_tree_expr_type_for_hsa_type (BrigType16_t brig_type) const;
   tree get_tree_cst_for_hsa_operand (const BrigOperandConstantBytes *brigConst,
 				     tree type) const;
-  tree get_builtin_for_hsa_opcode (tree type, BrigOpcode16_t brig_opcode,
-				   BrigType16_t brig_type) const;
   tree get_comparison_result_type (tree source_type);
 
   tree build_code_ref (const BrigBase &ref);
@@ -73,16 +69,6 @@ protected:
 
   bool needs_workitem_context_data (BrigOpcode16_t brig_opcode) const;
 
-  void unpack (tree value, tree_stl_vec &elements);
-  tree pack (tree_stl_vec &elements);
-
-  bool can_expand_builtin (BrigOpcode16_t brig_opcode) const;
-  tree expand_builtin (BrigOpcode16_t brig_opcode, tree_stl_vec &operands);
-
-  tree expand_or_call_builtin (BrigOpcode16_t brig_opcode,
-			       BrigType16_t brig_type, tree arith_type,
-			       tree_stl_vec &operands);
-
   tree add_temp_var (std::string name, tree expr);
 
   tree build_f2h_conversion (tree source);
@@ -100,10 +86,6 @@ protected:
 
   tree extend_int (tree input, tree dest_type, tree src_type);
 
-  /* HSAIL-specific builtin functions not yet integrated to gcc.  */
-
-  static builtin_map s_custom_builtins;
-
 private:
 
   tree_stl_vec build_or_analyze_operands (const BrigInstBase &brig_inst,
@@ -299,9 +281,6 @@ private:
 
   tree build_unpack_lo_or_hi (BrigOpcode16_t brig_opcode, tree arith_type,
 			      tree_stl_vec &operands);
-
-  tree_code get_tree_code_for_hsa_opcode (BrigOpcode16_t brig_opcode,
-					  BrigType16_t brig_type) const;
 };
 
 class brig_cvt_inst_handler : public brig_inst_mod_handler
diff --git a/gcc/brig/brigfrontend/brig-control-handler.cc b/gcc/brig/brigfrontend/brig-control-handler.cc
index b7e0722..82189e1 100644
--- a/gcc/brig/brigfrontend/brig-control-handler.cc
+++ b/gcc/brig/brigfrontend/brig-control-handler.cc
@@ -53,45 +53,45 @@ brig_directive_control_handler::operator () (const BrigBase *base)
     case BRIG_CONTROL_MAXDYNAMICGROUPSIZE:
       {
 	m_parent.m_cf->m_descriptor.max_dynamic_group_size
-	  = int_constant_value (operands.at (0));
+	  = brig_function::int_constant_value (operands.at (0));
 	break;
       }
     case BRIG_CONTROL_MAXFLATGRIDSIZE:
       {
 	m_parent.m_cf->m_descriptor.max_flat_grid_size
-	  = int_constant_value (operands.at (0));
+	  = brig_function::int_constant_value (operands.at (0));
 	break;
       }
     case BRIG_CONTROL_MAXFLATWORKGROUPSIZE:
       {
 	m_parent.m_cf->m_descriptor.max_flat_workgroup_size
-	  = int_constant_value (operands.at (0));
+	  = brig_function::int_constant_value (operands.at (0));
 	break;
       }
     case BRIG_CONTROL_REQUIREDDIM:
       {
 	m_parent.m_cf->m_descriptor.required_dim
-	  = int_constant_value (operands.at (0));
+	  = brig_function::int_constant_value (operands.at (0));
 	break;
       }
     case BRIG_CONTROL_REQUIREDGRIDSIZE:
       {
 	m_parent.m_cf->m_descriptor.required_grid_size[0]
-	  = int_constant_value (operands.at (0));
+	  = brig_function::int_constant_value (operands.at (0));
 	m_parent.m_cf->m_descriptor.required_grid_size[1]
-	  = int_constant_value (operands.at (1));
+	  = brig_function::int_constant_value (operands.at (1));
 	m_parent.m_cf->m_descriptor.required_grid_size[2]
-	  = int_constant_value (operands.at (2));
+	  = brig_function::int_constant_value (operands.at (2));
 	break;
       }
     case BRIG_CONTROL_REQUIREDWORKGROUPSIZE:
       {
 	m_parent.m_cf->m_descriptor.required_workgroup_size[0]
-	  = int_constant_value (operands.at (0));
+	  = brig_function::int_constant_value (operands.at (0));
 	m_parent.m_cf->m_descriptor.required_workgroup_size[1]
-	  = int_constant_value (operands.at (1));
+	  = brig_function::int_constant_value (operands.at (1));
 	m_parent.m_cf->m_descriptor.required_workgroup_size[2]
-	  = int_constant_value (operands.at (2));
+	  = brig_function::int_constant_value (operands.at (2));
 	break;
       }
     case BRIG_CONTROL_REQUIRENOPARTIALWORKGROUPS:
diff --git a/gcc/brig/brigfrontend/brig-cvt-inst-handler.cc b/gcc/brig/brigfrontend/brig-cvt-inst-handler.cc
index e5ac799..3b8c9ea 100644
--- a/gcc/brig/brigfrontend/brig-cvt-inst-handler.cc
+++ b/gcc/brig/brigfrontend/brig-cvt-inst-handler.cc
@@ -83,6 +83,12 @@ brig_cvt_inst_handler::generate (const BrigBase *base)
   tree &input = operands.at (1);
   tree &output = operands.at (0);
 
+  if (m_parent.m_cf->is_id_val (input))
+    {
+      input = m_parent.m_cf->id_val (input);
+      src_type = TREE_TYPE (input);
+    }
+
   size_t conv_src_size = int_size_in_bytes (src_type);
   size_t conv_dst_size = int_size_in_bytes (dest_type);
   size_t src_reg_size = int_size_in_bytes (TREE_TYPE (input));
diff --git a/gcc/brig/brigfrontend/brig-function-handler.cc b/gcc/brig/brigfrontend/brig-function-handler.cc
index d64135d..f22f065 100644
--- a/gcc/brig/brigfrontend/brig-function-handler.cc
+++ b/gcc/brig/brigfrontend/brig-function-handler.cc
@@ -93,6 +93,25 @@ brig_directive_function_handler::operator () (const BrigBase *base)
      represent HSAIL registers.  */
   tree bind_expr = build3 (BIND_EXPR, void_type_node, NULL, stmt_list, NULL);
 
+  tree restrict_char_ptr
+    = build_qualified_type (build_pointer_type (char_type_node),
+			    TYPE_QUAL_RESTRICT);
+  tree restrict_void_ptr
+    = build_qualified_type (build_pointer_type (void_type_node),
+			    TYPE_QUAL_RESTRICT);
+
+  tree restrict_const_char_ptr
+    = build_qualified_type (build_pointer_type
+			    (build_qualified_type (char_type_node,
+						   TYPE_QUAL_CONST)),
+			    TYPE_QUAL_RESTRICT);
+
+  tree restrict_const_void_ptr
+    = build_qualified_type (build_pointer_type
+			    (build_qualified_type (void_type_node,
+						   TYPE_QUAL_CONST)),
+			    TYPE_QUAL_RESTRICT);
+
   if (is_kernel)
     {
       tree name_identifier
@@ -107,12 +126,11 @@ brig_directive_function_handler::operator () (const BrigBase *base)
 	 3) a void* parameter that contains the first flat address of the group
 	 region allocated to the current work-group.  */
 
-      tree char_ptr_type_node = build_pointer_type (char_type_node);
       fndecl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL, name_identifier,
 			   build_function_type_list (void_type_node,
-						     char_ptr_type_node,
-						     ptr_type_node,
-						     ptr_type_node, NULL_TREE));
+						     restrict_const_char_ptr,
+						     restrict_void_ptr,
+						     restrict_char_ptr, NULL_TREE));
 
       SET_DECL_ASSEMBLER_NAME (fndecl, name_identifier);
 
@@ -125,9 +143,10 @@ brig_directive_function_handler::operator () (const BrigBase *base)
 	= gccbrig_get_target_addr_space_id (BRIG_SEGMENT_KERNARG);
 
       tree arg_arg = build_decl (UNKNOWN_LOCATION, PARM_DECL,
-				 get_identifier ("__args"), char_ptr_type_node);
+				 get_identifier ("__args"),
+				 restrict_const_char_ptr);
       DECL_ARGUMENTS (fndecl) = arg_arg;
-      DECL_ARG_TYPE (arg_arg) = char_ptr_type_node;
+      DECL_ARG_TYPE (arg_arg) = restrict_const_char_ptr;
       DECL_CONTEXT (arg_arg) = fndecl;
       DECL_ARTIFICIAL (arg_arg) = 1;
       TREE_READONLY (arg_arg) = 1;
@@ -189,7 +208,7 @@ brig_directive_function_handler::operator () (const BrigBase *base)
 	      if (arg_decls == NULL_TREE)
 		arg_decls = arg_var;
 	      else
-		chainon (arg_decls, arg_var);
+		arg_decls = chainon (arg_decls, arg_var);
 
 	      m_parent.m_cf->add_arg_variable (brigVar, arg_var);
 
@@ -230,18 +249,13 @@ brig_directive_function_handler::operator () (const BrigBase *base)
 	      vec_safe_push (args, TREE_TYPE (arg_var));
 
 	      m_parent.m_cf->add_arg_variable (brigVar, arg_var);
-
-	      if (arg_decls == NULL_TREE)
-		arg_decls = arg_var;
-	      else
-		chainon (arg_decls, arg_var);
+	      arg_decls = chainon (arg_decls, arg_var);
 	    }
 	}
-
-      vec_safe_push (args, ptr_type_node);
-      vec_safe_push (args, ptr_type_node);
-      vec_safe_push (args, ptr_type_node);
-      vec_safe_push (args, ptr_type_node);
+      vec_safe_push (args, restrict_void_ptr);
+      vec_safe_push (args, restrict_char_ptr);
+      vec_safe_push (args, uint32_type_node);
+      vec_safe_push (args, restrict_char_ptr);
 
       fndecl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL, name_identifier,
 			   build_function_type_vec (ret_type, args));
@@ -254,26 +268,30 @@ brig_directive_function_handler::operator () (const BrigBase *base)
 
   /* All functions need the hidden __context argument passed on
      because they might call WI-specific functions which need
-     the context info.  */
+     the context info.  Only kernels can write it, if they need
+     to update the local ids in the work-item loop.  */
+
+  tree context_arg_type
+    = true ? restrict_void_ptr : restrict_const_void_ptr;
   tree context_arg = build_decl (UNKNOWN_LOCATION, PARM_DECL,
-				 get_identifier ("__context"), ptr_type_node);
-  if (DECL_ARGUMENTS (fndecl) == NULL_TREE)
-    DECL_ARGUMENTS (fndecl) = context_arg;
-  else
-    chainon (DECL_ARGUMENTS (fndecl), context_arg);
+				 get_identifier ("__context"),
+				 context_arg_type);
+  DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), context_arg);
   DECL_CONTEXT (context_arg) = fndecl;
-  DECL_ARG_TYPE (context_arg) = ptr_type_node;
+  DECL_ARG_TYPE (context_arg) = context_arg_type;
   DECL_ARTIFICIAL (context_arg) = 1;
   TREE_READONLY (context_arg) = 1;
   TREE_USED (context_arg) = 1;
+  m_parent.m_cf->m_context_arg = context_arg;
 
   /* They can also access group memory, so we need to pass the
      group pointer along too.  */
   tree group_base_arg
     = build_decl (UNKNOWN_LOCATION, PARM_DECL,
-		  get_identifier ("__group_base_addr"), ptr_type_node);
-  chainon (DECL_ARGUMENTS (fndecl), group_base_arg);
-  DECL_ARG_TYPE (group_base_arg) = ptr_type_node;
+		  get_identifier ("__group_base_addr"),
+		  restrict_char_ptr);
+  DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), group_base_arg);
+  DECL_ARG_TYPE (group_base_arg) = restrict_char_ptr;
   DECL_CONTEXT (group_base_arg) = fndecl;
   DECL_ARTIFICIAL (group_base_arg) = 1;
   TREE_READONLY (group_base_arg) = 1;
@@ -288,7 +306,7 @@ brig_directive_function_handler::operator () (const BrigBase *base)
   tree group_local_offset_arg
     = build_decl (UNKNOWN_LOCATION, PARM_DECL,
 		  get_identifier ("__group_local_offset"), uint32_type_node);
-  chainon (DECL_ARGUMENTS (fndecl), group_local_offset_arg);
+  DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), group_local_offset_arg);
   DECL_ARG_TYPE (group_local_offset_arg) = uint32_type_node;
   DECL_CONTEXT (group_local_offset_arg) = fndecl;
   DECL_ARTIFICIAL (group_local_offset_arg) = 1;
@@ -299,24 +317,25 @@ brig_directive_function_handler::operator () (const BrigBase *base)
   /* Same for private.  */
   tree private_base_arg
     = build_decl (UNKNOWN_LOCATION, PARM_DECL,
-		  get_identifier ("__private_base_addr"), ptr_type_node);
-  chainon (DECL_ARGUMENTS (fndecl), private_base_arg);
-  DECL_ARG_TYPE (private_base_arg) = ptr_type_node;
+		  get_identifier ("__private_base_addr"), restrict_char_ptr);
+  DECL_ARGUMENTS (fndecl) = chainon (DECL_ARGUMENTS (fndecl), private_base_arg);
+  DECL_ARG_TYPE (private_base_arg) = restrict_char_ptr;
   DECL_CONTEXT (private_base_arg) = fndecl;
   DECL_ARTIFICIAL (private_base_arg) = 1;
   TREE_READONLY (private_base_arg) = 1;
   TREE_USED (private_base_arg) = 1;
+  m_parent.m_cf->m_private_base_arg = private_base_arg;
 
   DECL_SAVED_TREE (fndecl) = bind_expr;
 
-  set_externally_visible (fndecl);
-
   if (base->kind == BRIG_KIND_DIRECTIVE_FUNCTION)
     {
       TREE_STATIC (fndecl) = 0;
       TREE_PUBLIC (fndecl) = 1;
       DECL_EXTERNAL (fndecl) = 0;
       DECL_DECLARED_INLINE_P (fndecl) = 1;
+      set_inline (fndecl);
+      set_externally_visible (fndecl);
     }
   else if (base->kind == BRIG_KIND_DIRECTIVE_KERNEL)
     {
@@ -330,6 +349,7 @@ brig_directive_function_handler::operator () (const BrigBase *base)
       TREE_STATIC (fndecl) = 0;
       TREE_PUBLIC (fndecl) = 1;
       DECL_EXTERNAL (fndecl) = 1;
+      set_inline (fndecl);
     }
   else if (base->kind == BRIG_KIND_DIRECTIVE_INDIRECT_FUNCTION)
     {
@@ -371,11 +391,8 @@ brig_directive_function_handler::operator () (const BrigBase *base)
     }
 
   m_parent.start_function (fndecl);
-
   m_parent.m_cf->m_func_decl = fndecl;
   m_parent.m_cf->m_current_bind_expr = bind_expr;
-  m_parent.m_cf->m_context_arg = context_arg;
-  m_parent.m_cf->m_private_base_arg = private_base_arg;
 
   if (ret_value != NULL_TREE && TREE_TYPE (ret_value) != void_type_node)
     {
diff --git a/gcc/brig/brigfrontend/brig-function.cc b/gcc/brig/brigfrontend/brig-function.cc
index e1a14da..f0c499d 100644
--- a/gcc/brig/brigfrontend/brig-function.cc
+++ b/gcc/brig/brigfrontend/brig-function.cc
@@ -44,6 +44,12 @@
 #include "function.h"
 #include "brig-to-generic.h"
 #include "brig-builtins.h"
+#include "options.h"
+#include "fold-const.h"
+#include "target.h"
+#include "builtins.h"
+
+brig_function::builtin_map brig_function::s_custom_builtins;
 
 brig_function::brig_function (const BrigDirectiveExecutable *exec,
 			      brig_to_generic *parent)
@@ -60,6 +66,20 @@ brig_function::brig_function (const BrigDirectiveExecutable *exec,
   memset (m_regs, 0,
 	  BRIG_2_TREE_HSAIL_TOTAL_REG_COUNT * sizeof (BrigOperandRegister *));
   memset (&m_descriptor, 0, sizeof (phsa_descriptor));
+
+  if (s_custom_builtins.size () > 0) return;
+
+  /* Populate the builtin index.  */
+#undef DEF_HSAIL_ATOMIC_BUILTIN
+#undef DEF_HSAIL_CVT_ZEROI_SAT_BUILTIN
+#undef DEF_HSAIL_INTR_BUILTIN
+#undef DEF_HSAIL_SAT_BUILTIN
+#undef DEF_HSAIL_BUILTIN
+#define DEF_HSAIL_BUILTIN(ENUM, HSAIL_OPCODE, HSAIL_TYPE, NAME, TYPE, ATTRS) \
+  s_custom_builtins[std::make_pair (HSAIL_OPCODE, HSAIL_TYPE)]		\
+    = builtin_decl_explicit (ENUM);
+
+#include "brig-builtins.def"
 }
 
 brig_function::~brig_function ()
@@ -158,8 +178,7 @@ brig_function::add_id_variables ()
   tree stmts = BIND_EXPR_BODY (bind_expr);
 
   /* Initialize the WG limits and local ids.  */
-
-  tree_stmt_iterator entry = tsi_start (stmts);
+  m_kernel_entry = tsi_start (stmts);
 
   for (int i = 0; i < 3; ++i)
     {
@@ -169,7 +188,7 @@ brig_function::add_id_variables ()
 	 to avoid unnecessary casts (the ID functions are 32b).  */
       m_local_id_vars[i]
 	= add_local_variable (std::string ("__local_") + dim_char,
-			      uint32_type_node);
+			      long_long_integer_type_node);
 
       tree workitemid_call
 	= call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_WORKITEMID), 2,
@@ -178,54 +197,88 @@ brig_function::add_id_variables ()
 			m_context_arg);
 
       tree id_init = build2 (MODIFY_EXPR, TREE_TYPE (m_local_id_vars[i]),
-			     m_local_id_vars[i], workitemid_call);
+			     m_local_id_vars[i],
+			     convert (TREE_TYPE (m_local_id_vars[i]),
+				      workitemid_call));
 
-      tsi_link_after (&entry, id_init, TSI_NEW_STMT);
+      append_statement (id_init);
 
       m_cur_wg_size_vars[i]
 	= add_local_variable (std::string ("__cur_wg_size_") + dim_char,
-			      uint32_type_node);
+			      long_long_integer_type_node);
 
-      tree cwgz_call
-	= call_builtin
-	(builtin_decl_explicit (BUILT_IN_HSAIL_CURRENTWORKGROUPSIZE),
-	 2, uint32_type_node, uint32_type_node,
-	 build_int_cst (uint32_type_node, i), ptr_type_node, m_context_arg);
+      tree cwgz_call;
+      if (flag_assume_phsa)
+	{
+	  tree_stl_vec operands
+	    = tree_stl_vec (1, build_int_cst (uint32_type_node, i));
+	  cwgz_call
+	    = expand_or_call_builtin (BRIG_OPCODE_CURRENTWORKGROUPSIZE,
+				      BRIG_TYPE_U32, uint32_type_node,
+				      operands);
+	}
+      else
+	cwgz_call = call_builtin
+	  (builtin_decl_explicit (BUILT_IN_HSAIL_CURRENTWORKGROUPSIZE),
+	   2, uint32_type_node, uint32_type_node,
+	   build_int_cst (uint32_type_node, i), ptr_type_node, m_context_arg);
 
       tree limit_init = build2 (MODIFY_EXPR, TREE_TYPE (m_cur_wg_size_vars[i]),
-				m_cur_wg_size_vars[i], cwgz_call);
+				m_cur_wg_size_vars[i],
+				convert (TREE_TYPE (m_cur_wg_size_vars[i]),
+					 cwgz_call));
 
-      tsi_link_after (&entry, limit_init, TSI_NEW_STMT);
+      append_statement (limit_init);
 
       m_wg_id_vars[i]
 	= add_local_variable (std::string ("__workgroupid_") + dim_char,
 			      uint32_type_node);
 
-      tree wgid_call
-	= call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_WORKGROUPID),
-			2, uint32_type_node, uint32_type_node,
-			build_int_cst (uint32_type_node, i), ptr_type_node,
-			m_context_arg);
+      tree wgid_call;
+      if (flag_assume_phsa)
+	{
+	  tree_stl_vec operands
+	    = tree_stl_vec (1, build_int_cst (uint32_type_node, i));
+	  wgid_call
+	    = expand_or_call_builtin (BRIG_OPCODE_WORKGROUPID, BRIG_TYPE_U32,
+				      uint32_type_node, operands);
+	}
+      else
+	wgid_call
+	  = call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_WORKGROUPID),
+			  2, uint32_type_node, uint32_type_node,
+			  build_int_cst (uint32_type_node, i), ptr_type_node,
+			  m_context_arg);
 
       tree wgid_init = build2 (MODIFY_EXPR, TREE_TYPE (m_wg_id_vars[i]),
 			       m_wg_id_vars[i], wgid_call);
 
-      tsi_link_after (&entry, wgid_init, TSI_NEW_STMT);
+      append_statement (wgid_init);
 
       m_wg_size_vars[i]
 	= add_local_variable (std::string ("__workgroupsize_") + dim_char,
 			      uint32_type_node);
 
-      tree wgsize_call
-	= call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_WORKGROUPSIZE),
-			2, uint32_type_node, uint32_type_node,
-			build_int_cst (uint32_type_node, i), ptr_type_node,
-			m_context_arg);
+      tree wgsize_call;
+      if (flag_assume_phsa)
+	{
+	  tree_stl_vec operands
+	    = tree_stl_vec (1, build_int_cst (uint32_type_node, i));
+	  wgsize_call
+	    = expand_or_call_builtin (BRIG_OPCODE_WORKGROUPSIZE, BRIG_TYPE_U32,
+				      uint32_type_node, operands);
+	}
+      else
+	wgsize_call
+	  = call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_WORKGROUPSIZE),
+			  2, uint32_type_node, uint32_type_node,
+			  build_int_cst (uint32_type_node, i), ptr_type_node,
+			  m_context_arg);
 
       tree wgsize_init = build2 (MODIFY_EXPR, TREE_TYPE (m_wg_size_vars[i]),
 				 m_wg_size_vars[i], wgsize_call);
 
-      tsi_link_after (&entry, wgsize_init, TSI_NEW_STMT);
+      append_statement (wgsize_init);
 
       m_grid_size_vars[i]
 	= add_local_variable (std::string ("__gridsize_") + dim_char,
@@ -240,10 +293,34 @@ brig_function::add_id_variables ()
       tree gridsize_init = build2 (MODIFY_EXPR, TREE_TYPE (m_grid_size_vars[i]),
 				   m_grid_size_vars[i], gridsize_call);
 
-      tsi_link_after (&entry, gridsize_init, TSI_NEW_STMT);
+      append_statement (gridsize_init);
+
+      m_abs_id_base_vars[i]
+	= add_local_variable (std::string ("__abs_id_base_") + dim_char,
+			      long_long_integer_type_node);
+
+      m_abs_id_vars[i]
+	= add_local_variable (std::string ("__abs_id_") + dim_char,
+			      long_long_integer_type_node);
+
+      tree abs_id_base
+	= build2 (MULT_EXPR, long_long_integer_type_node,
+		  convert (long_long_integer_type_node, m_wg_id_vars[i]),
+		  convert (long_long_integer_type_node, m_wg_size_vars[i]));
+      tree abs_id
+	= build2 (PLUS_EXPR, long_long_integer_type_node, abs_id_base,
+		  convert (long_long_integer_type_node, m_local_id_vars[i]));
+
+      tree abs_id_base_init
+	= build2 (MODIFY_EXPR, TREE_TYPE (m_abs_id_base_vars[i]),
+		  m_abs_id_base_vars[i], abs_id_base);
+      append_statement (abs_id_base_init);
+
+      tree abs_id_init = build2 (MODIFY_EXPR,
+				 TREE_TYPE (m_abs_id_vars[i]),
+				 m_abs_id_vars[i], abs_id);
+      append_statement (abs_id_init);
     }
-
-  m_kernel_entry = entry;
 }
 
 /* Creates a new local variable with the given NAME and given GENERIC
@@ -359,6 +436,8 @@ brig_function::add_wi_loop (int dim, tree_stmt_iterator *header_entry,
 			    tree_stmt_iterator *branch_after)
 {
   tree ivar = m_local_id_vars[dim];
+  tree abs_id_base_var = m_abs_id_base_vars[dim];
+  tree abs_id_var = m_abs_id_vars[dim];
   tree ivar_max = m_cur_wg_size_vars[dim];
   tree_stmt_iterator entry = *header_entry;
 
@@ -371,6 +450,12 @@ brig_function::add_wi_loop (int dim, tree_stmt_iterator *header_entry,
 			   build_zero_cst (TREE_TYPE (ivar)));
   tsi_link_after (&entry, ivar_init, TSI_NEW_STMT);
 
+  tree abs_id_var_init = build2 (MODIFY_EXPR, TREE_TYPE (abs_id_var),
+				 abs_id_var,
+				 convert (TREE_TYPE (abs_id_var),
+					  abs_id_base_var));
+  tsi_link_after (&entry, abs_id_var_init, TSI_NEW_STMT);
+
   tree loop_body_label
     = label (std::string ("__wi_loop_") + (char) ((int) 'x' + dim));
   tree loop_body_label_stmt = build_stmt (LABEL_EXPR, loop_body_label);
@@ -379,16 +464,30 @@ brig_function::add_wi_loop (int dim, tree_stmt_iterator *header_entry,
 
   if (m_has_unexpanded_dp_builtins)
     {
-      tree id_set_builtin
-	= builtin_decl_explicit (BUILT_IN_HSAIL_SETWORKITEMID);
-      /* Set the local ID to the current wi-loop iteration variable value to
-	 ensure the builtins see the correct values.  */
-      tree id_set_call
-	= call_builtin (id_set_builtin, 3,
-			void_type_node, uint32_type_node,
-			build_int_cst (uint32_type_node, dim), uint32_type_node,
-			ivar, ptr_type_node, m_context_arg);
-      tsi_link_after (&entry, id_set_call, TSI_NEW_STMT);
+      if (!flag_assume_phsa)
+	{
+	  tree id_set_builtin
+	    = builtin_decl_explicit (BUILT_IN_HSAIL_SETWORKITEMID);
+	  /* Set the local ID to the current wi-loop iteration variable value
+	     to ensure the builtins see the correct values.  */
+	  tree id_set_call
+	    = call_builtin (id_set_builtin, 3,
+			    void_type_node, uint32_type_node,
+			    build_int_cst (uint32_type_node, dim),
+			    uint32_type_node, convert (uint32_type_node, ivar),
+			    ptr_type_node, m_context_arg);
+	  tsi_link_after (&entry, id_set_call, TSI_NEW_STMT);
+	}
+      else
+	{
+	  tree ptr_type = build_pointer_type (uint32_type_node);
+	  tree ctx = build2 (MEM_REF, uint32_type_node, m_context_arg,
+			     build_int_cst (ptr_type, dim * 4));
+	  tree assign = build2 (MODIFY_EXPR, uint32_type_node, ctx,
+				convert (uint32_type_node, ivar));
+
+	  tsi_link_after (&entry, assign, TSI_NEW_STMT);
+	}
     }
 
   /* Increment the WI iteration variable.  */
@@ -397,6 +496,13 @@ brig_function::add_wi_loop (int dim, tree_stmt_iterator *header_entry,
 
   tsi_link_after (branch_after, incr, TSI_NEW_STMT);
 
+  /* ...and the abs id variable.  */
+  tree abs_id_incr = build2 (PREINCREMENT_EXPR, TREE_TYPE (abs_id_var),
+			     abs_id_var,
+			     build_one_cst (TREE_TYPE (abs_id_var)));
+
+  tsi_link_after (branch_after, abs_id_incr, TSI_NEW_STMT);
+
   /* Append the predicate check with the back edge goto.  */
   tree condition = build2 (LT_EXPR, TREE_TYPE (ivar), ivar, ivar_max);
   tree target_goto = build1 (GOTO_EXPR, void_type_node, loop_body_label);
@@ -549,29 +655,36 @@ brig_function::emit_launcher_and_metadata ()
   tree name_identifier
     = get_identifier_with_length (kern_name.c_str (), kern_name.size ());
 
+  tree restrict_void_ptr
+    = build_qualified_type (build_pointer_type (void_type_node),
+			    TYPE_QUAL_RESTRICT);
+  tree restrict_char_ptr
+    = build_qualified_type (build_pointer_type (char_type_node),
+			    TYPE_QUAL_RESTRICT);
   tree launcher
     = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL, name_identifier,
-		  build_function_type_list (void_type_node, ptr_type_node,
-					    ptr_type_node, NULL_TREE));
+		  build_function_type_list (void_type_node, restrict_void_ptr,
+					    restrict_char_ptr, NULL_TREE));
 
   TREE_USED (launcher) = 1;
   DECL_ARTIFICIAL (launcher) = 1;
 
   tree context_arg = build_decl (UNKNOWN_LOCATION, PARM_DECL,
-				 get_identifier ("__context"), ptr_type_node);
+				 get_identifier ("__context"),
+				 restrict_void_ptr);
 
   DECL_ARGUMENTS (launcher) = context_arg;
-  DECL_ARG_TYPE (context_arg) = ptr_type_node;
+  DECL_ARG_TYPE (context_arg) = restrict_void_ptr;
   DECL_CONTEXT (context_arg) = launcher;
   TREE_USED (context_arg) = 1;
   DECL_ARTIFICIAL (context_arg) = 1;
 
   tree group_base_addr_arg
     = build_decl (UNKNOWN_LOCATION, PARM_DECL,
-		  get_identifier ("__group_base_addr"), ptr_type_node);
+		  get_identifier ("__group_base_addr"), restrict_char_ptr);
 
   chainon (DECL_ARGUMENTS (launcher), group_base_addr_arg);
-  DECL_ARG_TYPE (group_base_addr_arg) = ptr_type_node;
+  DECL_ARG_TYPE (group_base_addr_arg) = restrict_char_ptr;
   DECL_CONTEXT (group_base_addr_arg) = launcher;
   TREE_USED (group_base_addr_arg) = 1;
   DECL_ARTIFICIAL (group_base_addr_arg) = 1;
@@ -618,15 +731,15 @@ brig_function::emit_launcher_and_metadata ()
     phsail_launch_kernel_call
       = call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_LAUNCH_WG_FUNC),
 		      4, void_type_node,
-		      ptr_type_node, kernel_func_ptr, ptr_type_node,
-		      context_arg, ptr_type_node, group_base_addr_arg,
+		      ptr_type_node, kernel_func_ptr, restrict_void_ptr,
+		      context_arg, restrict_char_ptr, group_base_addr_arg,
 		      uint32_type_node, group_local_offset_arg);
   else
     phsail_launch_kernel_call
       = call_builtin (builtin_decl_explicit (BUILT_IN_HSAIL_LAUNCH_KERNEL),
 		      4, void_type_node,
-		      ptr_type_node, kernel_func_ptr, ptr_type_node,
-		      context_arg, ptr_type_node, group_base_addr_arg,
+		      ptr_type_node, kernel_func_ptr, restrict_void_ptr,
+		      context_arg, restrict_char_ptr, group_base_addr_arg,
 		      uint32_type_node, group_local_offset_arg);
 
   append_to_statement_list_force (phsail_launch_kernel_call, &stmt_list);
@@ -771,3 +884,719 @@ brig_function::group_variable_segment_offset (const std::string &name) const
   gcc_assert (m_parent->m_module_group_variables.has_variable (name));
   return m_parent->m_module_group_variables.segment_offset (name);
 }
+
+/* Try to expand the given builtin call to reuse a previously generated
+   variable, if possible.  If not, just call the given builtin.
+   BRIG_OPCODE and BRIG_TYPE identify the builtin's BRIG opcode/type,
+   ARITH_TYPE its GENERIC type, and OPERANDS contains the builtin's
+   input operands.  */
+
+tree
+brig_function::expand_or_call_builtin (BrigOpcode16_t brig_opcode,
+				       BrigType16_t brig_type,
+				       tree arith_type,
+				       tree_stl_vec &operands)
+{
+  if (needs_workitem_context_data (brig_opcode))
+    m_has_unexpanded_dp_builtins = true;
+
+  if (can_expand_builtin (brig_opcode))
+    return expand_builtin (brig_opcode, operands);
+
+  tree built_in
+    = get_builtin_for_hsa_opcode (arith_type, brig_opcode, brig_type);
+
+  if (!VECTOR_TYPE_P (TREE_TYPE (TREE_TYPE (built_in)))
+      && arith_type != NULL_TREE && VECTOR_TYPE_P (arith_type)
+      && brig_opcode != BRIG_OPCODE_LERP
+      && brig_opcode != BRIG_OPCODE_PACKCVT
+      && brig_opcode != BRIG_OPCODE_SAD
+      && brig_opcode != BRIG_OPCODE_SADHI)
+    {
+      /* Call the scalar built-in for all elements in the vector.  */
+      tree_stl_vec operand0_elements;
+      if (operands.size () > 0)
+	unpack (operands[0], operand0_elements);
+
+      tree_stl_vec operand1_elements;
+      if (operands.size () > 1)
+	unpack (operands[1], operand1_elements);
+
+      tree_stl_vec result_elements;
+
+      size_t element_count = gccbrig_type_vector_subparts (arith_type);
+      for (size_t i = 0; i < element_count; ++i)
+	{
+	  tree_stl_vec call_operands;
+	  if (operand0_elements.size () > 0)
+	    call_operands.push_back (operand0_elements.at (i));
+
+	  if (operand1_elements.size () > 0)
+	    call_operands.push_back (operand1_elements.at (i));
+
+	  result_elements.push_back
+	    (expand_or_call_builtin (brig_opcode, brig_type,
+				     TREE_TYPE (arith_type),
+				     call_operands));
+	}
+      return pack (result_elements);
+    }
+
+  tree_stl_vec call_operands;
+  tree_stl_vec operand_types;
+
+  tree arg_type_chain = TYPE_ARG_TYPES (TREE_TYPE (built_in));
+
+  for (size_t i = 0; i < operands.size (); ++i)
+    {
+      tree operand_type = TREE_VALUE (arg_type_chain);
+      call_operands.push_back (convert (operand_type, operands[i]));
+      operand_types.push_back (operand_type);
+      arg_type_chain = TREE_CHAIN (arg_type_chain);
+    }
+
+  if (needs_workitem_context_data (brig_opcode))
+    {
+      call_operands.push_back (m_context_arg);
+      operand_types.push_back (ptr_type_node);
+    }
+
+  size_t operand_count = call_operands.size ();
+
+  call_operands.resize (4, NULL_TREE);
+  operand_types.resize (4, NULL_TREE);
+  for (size_t i = 0; i < operand_count; ++i)
+    call_operands.at (i) = build_resize_convert_view (operand_types.at (i),
+						      call_operands.at (i));
+
+  tree fnptr = build_fold_addr_expr (built_in);
+  return build_call_array (TREE_TYPE (TREE_TYPE (built_in)), fnptr,
+			   operand_count, &call_operands[0]);
+}
+
+/* Instead of calling a built-in function, use a more efficient mechanism
+   such as reuse a previously returned value known to be still valid, or
+   access the work-item context struct directly.  This is beneficial especially
+   for the work-item identification related builtins as not having them as
+   unanalyzable black box calls can lead to more easily vectorizable parallel
+   loops for multi work-item work-groups.  BRIG_OPCODE identifies the builtin
+   and OPERANDS store the operands.  */
+
+tree
+brig_function::expand_builtin (BrigOpcode16_t brig_opcode,
+			       tree_stl_vec &operands)
+{
+  tree_stl_vec uint32_0 = tree_stl_vec (1, build_int_cst (uint32_type_node, 0));
+
+  tree_stl_vec uint32_1 = tree_stl_vec (1, build_int_cst (uint32_type_node, 1));
+
+  tree_stl_vec uint32_2 = tree_stl_vec (1, build_int_cst (uint32_type_node, 2));
+
+  if (brig_opcode == BRIG_OPCODE_WORKITEMFLATABSID)
+    {
+      tree id0 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_0);
+      id0 = convert (uint64_type_node, id0);
+
+      tree id1 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_1);
+      id1 = convert (uint64_type_node, id1);
+
+      tree id2 = expand_builtin (BRIG_OPCODE_WORKITEMABSID, uint32_2);
+      id2 = convert (uint64_type_node, id2);
+
+      tree max0 = convert (uint64_type_node, m_grid_size_vars[0]);
+      tree max1 = convert (uint64_type_node, m_grid_size_vars[1]);
+
+      tree id2_x_max0_x_max1 = build2 (MULT_EXPR, uint64_type_node, id2, max0);
+      id2_x_max0_x_max1
+	= build2 (MULT_EXPR, uint64_type_node, id2_x_max0_x_max1, max1);
+
+      tree id1_x_max0 = build2 (MULT_EXPR, uint64_type_node, id1, max0);
+
+      tree sum = build2 (PLUS_EXPR, uint64_type_node, id0, id1_x_max0);
+      sum = build2 (PLUS_EXPR, uint64_type_node, sum, id2_x_max0_x_max1);
+
+      return add_temp_var ("workitemflatabsid", sum);
+    }
+  else if (brig_opcode == BRIG_OPCODE_WORKITEMABSID)
+    {
+      HOST_WIDE_INT dim = int_constant_value (operands[0]);
+      return m_abs_id_vars[dim];
+    }
+  else if (brig_opcode == BRIG_OPCODE_WORKITEMFLATID)
+    {
+
+      tree wg_size_x = expand_builtin (BRIG_OPCODE_WORKGROUPSIZE, uint32_0);
+      tree wg_size_y = expand_builtin (BRIG_OPCODE_WORKGROUPSIZE, uint32_1);
+      tree z_x_wgsx_wgsy
+	= build2 (MULT_EXPR, uint32_type_node,
+		  convert (uint32_type_node,
+			   expand_builtin (BRIG_OPCODE_WORKITEMID, uint32_2)),
+		  wg_size_x);
+      z_x_wgsx_wgsy = build2 (MULT_EXPR, uint32_type_node, z_x_wgsx_wgsy,
+			      wg_size_y);
+
+      tree y_x_wgsx
+	= build2 (MULT_EXPR, uint32_type_node,
+		  convert (uint32_type_node,
+			   expand_builtin (BRIG_OPCODE_WORKITEMID, uint32_1)),
+		  wg_size_x);
+
+      tree sum = build2 (PLUS_EXPR, uint32_type_node, y_x_wgsx, z_x_wgsx_wgsy);
+      sum = build2 (PLUS_EXPR, uint32_type_node,
+		    convert (uint32_type_node,
+			     expand_builtin (BRIG_OPCODE_WORKITEMID, uint32_0)),
+		    sum);
+      return add_temp_var ("workitemflatid", sum);
+    }
+  else if (brig_opcode == BRIG_OPCODE_WORKGROUPSIZE)
+    {
+      HOST_WIDE_INT dim = int_constant_value (operands[0]);
+      if (flag_assume_phsa)
+	{
+	  tree ptr_type = build_pointer_type (uint32_type_node);
+	  tree ctx = build2 (MEM_REF, uint32_type_node, m_context_arg,
+			     build_int_cst (ptr_type,
+					    PHSA_CONTEXT_WG_SIZES
+					    + dim * 4));
+	  std::string name ("wgsize_x");
+	  name [name.length() - 1] += dim;
+	  return add_temp_var (name.c_str(), ctx);
+	}
+      else if (m_is_kernel)
+	{
+	  /* For kernels without phsa we generate certain temps before
+	     the WI loop, which means we don't need to rely on LICM to get
+	     them moved out.  */
+	  return m_wg_size_vars[dim];
+	}
+      else
+	gcc_unreachable ();
+    }
+  else if (brig_opcode == BRIG_OPCODE_WORKITEMID)
+    {
+      HOST_WIDE_INT dim = int_constant_value (operands[0]);
+      if (m_is_kernel)
+	{
+	  return m_local_id_vars [dim];
+	}
+      else if (flag_assume_phsa)
+	{
+	  tree ptr_type = build_pointer_type (uint32_type_node);
+	  tree ctx = build2 (MEM_REF, uint32_type_node, m_context_arg,
+			     build_int_cst (ptr_type,
+					    PHSA_CONTEXT_OFFS_WI_IDS
+					    + dim * 4));
+	  std::string name ("wiid_x");
+	  name [name.length() - 1] += dim;
+	  return add_temp_var (name.c_str(), ctx);
+	}
+      else
+	gcc_unreachable ();
+    }
+  else if (brig_opcode == BRIG_OPCODE_WORKGROUPID)
+    {
+      HOST_WIDE_INT dim = int_constant_value (operands[0]);
+      if (flag_assume_phsa)
+	{
+	  tree ptr_type = build_pointer_type (uint32_type_node);
+	  tree ctx = build2 (MEM_REF, uint32_type_node, m_context_arg,
+			     build_int_cst (ptr_type,
+					    PHSA_CONTEXT_OFFS_WG_IDS
+					    + dim * 4));
+	  std::string name ("wgid_x");
+	  name [name.length() - 1] += dim;
+	  return add_temp_var (name.c_str(), ctx);
+	} else if (m_is_kernel)
+	return m_wg_id_vars [dim];
+      else
+	gcc_unreachable ();
+    }
+  else if (brig_opcode == BRIG_OPCODE_CURRENTWORKGROUPSIZE)
+    {
+      HOST_WIDE_INT dim = int_constant_value (operands[0]);
+      if (flag_assume_phsa)
+	{
+	  tree ptr_type = build_pointer_type (uint32_type_node);
+	  tree ctx = build2 (MEM_REF, uint32_type_node, m_context_arg,
+			     build_int_cst (ptr_type,
+					    PHSA_CONTEXT_CURRENT_WG_SIZES
+					    + dim * 4));
+	  std::string name ("curwgsize_x");
+	  name [name.length() - 1] += dim;
+	  return add_temp_var (name.c_str(), ctx);
+	} else if (m_is_kernel)
+	return m_cur_wg_size_vars[dim];
+      else
+	gcc_unreachable ();
+    }
+  else
+    gcc_unreachable ();
+
+  return NULL_TREE;
+}
+
+/* Returns true in case the given opcode that would normally be generated
+   as a builtin call can be expanded to tree nodes.  */
+
+bool
+brig_function::can_expand_builtin (BrigOpcode16_t brig_opcode) const
+{
+  switch (brig_opcode)
+    {
+    case BRIG_OPCODE_CURRENTWORKGROUPSIZE:
+    case BRIG_OPCODE_WORKITEMFLATID:
+    case BRIG_OPCODE_WORKITEMID:
+    case BRIG_OPCODE_WORKGROUPID:
+    case BRIG_OPCODE_WORKGROUPSIZE:
+      return m_is_kernel || flag_assume_phsa;
+    case BRIG_OPCODE_WORKITEMFLATABSID:
+    case BRIG_OPCODE_WORKITEMABSID:
+      return m_is_kernel;
+    default:
+      return false;
+    };
+}
+
+/* In case the HSA instruction must be implemented using a builtin,
+   this function is called to get the correct builtin function.
+   TYPE is the instruction tree type, BRIG_OPCODE the opcode of the
+   brig instruction and BRIG_TYPE the brig instruction's type.  */
+
+tree
+brig_function::get_builtin_for_hsa_opcode
+  (tree type, BrigOpcode16_t brig_opcode, BrigType16_t brig_type) const
+{
+  tree builtin = NULL_TREE;
+  tree builtin_type = type;
+
+  /* For vector types, first find the scalar version of the builtin.  */
+  if (type != NULL_TREE && VECTOR_TYPE_P (type))
+    builtin_type = TREE_TYPE (type);
+  BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK;
+
+  /* Some BRIG opcodes can use the same builtins for unsigned and
+     signed types.  Force these cases to unsigned types.  */
+
+  if (brig_opcode == BRIG_OPCODE_BORROW
+      || brig_opcode == BRIG_OPCODE_CARRY
+      || brig_opcode == BRIG_OPCODE_LASTBIT
+      || brig_opcode == BRIG_OPCODE_BITINSERT)
+    {
+      if (brig_type == BRIG_TYPE_S32)
+	brig_type = BRIG_TYPE_U32;
+      else if (brig_type == BRIG_TYPE_S64)
+	brig_type = BRIG_TYPE_U64;
+    }
+
+  switch (brig_opcode)
+    {
+    case BRIG_OPCODE_FLOOR:
+      builtin = mathfn_built_in (builtin_type, BUILT_IN_FLOOR);
+      break;
+    case BRIG_OPCODE_CEIL:
+      builtin = mathfn_built_in (builtin_type, BUILT_IN_CEIL);
+      break;
+    case BRIG_OPCODE_SQRT:
+    case BRIG_OPCODE_NSQRT:
+      builtin = mathfn_built_in (builtin_type, BUILT_IN_SQRT);
+      break;
+    case BRIG_OPCODE_RINT:
+      builtin = mathfn_built_in (builtin_type, BUILT_IN_RINT);
+      break;
+    case BRIG_OPCODE_TRUNC:
+      builtin = mathfn_built_in (builtin_type, BUILT_IN_TRUNC);
+      break;
+    case BRIG_OPCODE_COPYSIGN:
+      builtin = mathfn_built_in (builtin_type, BUILT_IN_COPYSIGN);
+      break;
+    case BRIG_OPCODE_NSIN:
+      builtin = mathfn_built_in (builtin_type, BUILT_IN_SIN);
+      break;
+    case BRIG_OPCODE_NLOG2:
+      builtin = mathfn_built_in (builtin_type, BUILT_IN_LOG2);
+      break;
+    case BRIG_OPCODE_NEXP2:
+      builtin = mathfn_built_in (builtin_type, BUILT_IN_EXP2);
+      break;
+    case BRIG_OPCODE_NFMA:
+      builtin = mathfn_built_in (builtin_type, BUILT_IN_FMA);
+      break;
+    case BRIG_OPCODE_NCOS:
+      builtin = mathfn_built_in (builtin_type, BUILT_IN_COS);
+      break;
+    case BRIG_OPCODE_POPCOUNT:
+      /* Popcount should be typed by its argument type (the return value
+	 is always u32).  Let's use a b64 version for also for b32 for now.  */
+      return builtin_decl_explicit (BUILT_IN_POPCOUNTL);
+    case BRIG_OPCODE_BORROW:
+      /* Borrow uses the same builtin for unsigned and signed types.  */
+      if (brig_type == BRIG_TYPE_S32 || brig_type == BRIG_TYPE_U32)
+	return builtin_decl_explicit (BUILT_IN_HSAIL_BORROW_U32);
+      else
+	return builtin_decl_explicit (BUILT_IN_HSAIL_BORROW_U64);
+    case BRIG_OPCODE_CARRY:
+      /* Carry also uses the same builtin for unsigned and signed types.  */
+      if (brig_type == BRIG_TYPE_S32 || brig_type == BRIG_TYPE_U32)
+	return builtin_decl_explicit (BUILT_IN_HSAIL_CARRY_U32);
+      else
+	return builtin_decl_explicit (BUILT_IN_HSAIL_CARRY_U64);
+    default:
+
+      /* Use our builtin index for finding a proper builtin for the BRIG
+	 opcode and BRIG type.  This takes care most of the builtin cases,
+	 the special cases are handled in the separate 'case' statements
+	 above.  */
+      builtin_map::const_iterator i
+	= s_custom_builtins.find (std::make_pair (brig_opcode, brig_type));
+      if (i != s_custom_builtins.end ())
+	return (*i).second;
+
+      if (brig_inner_type != brig_type)
+	{
+	  /* Try to find a scalar built-in we could use.  */
+	  i = s_custom_builtins.find
+	    (std::make_pair (brig_opcode, brig_inner_type));
+	  if (i != s_custom_builtins.end ())
+	    return (*i).second;
+	}
+
+      /* In case this is an fp16 operation that is promoted to fp32,
+	 try to find a fp32 scalar built-in.  */
+      if (brig_inner_type == BRIG_TYPE_F16)
+	{
+	  i = s_custom_builtins.find
+	    (std::make_pair (brig_opcode, BRIG_TYPE_F32));
+	  if (i != s_custom_builtins.end ())
+	    return (*i).second;
+	}
+      gcc_unreachable ();
+    }
+
+  if (VECTOR_TYPE_P (type) && builtin != NULL_TREE)
+    {
+      /* Try to find a vectorized version of the built-in.
+	 TODO: properly assert that builtin is a mathfn builtin? */
+      tree vec_builtin
+	= targetm.vectorize.builtin_vectorized_function
+	(builtin_mathfn_code (builtin), type, type);
+      if (vec_builtin != NULL_TREE)
+	return vec_builtin;
+      else
+	return builtin;
+    }
+  if (builtin == NULL_TREE)
+    gcc_unreachable ();
+  return builtin;
+}
+
+/* Unpacks the elements of the vector in VALUE to scalars (bit field
+   references) in ELEMENTS.  */
+
+void
+brig_function::unpack (tree value, tree_stl_vec &elements)
+{
+  size_t vec_size = int_size_in_bytes (TREE_TYPE (value));
+  size_t element_size
+    = int_size_in_bytes (TREE_TYPE (TREE_TYPE (value))) * BITS_PER_UNIT;
+  size_t element_count
+    = vec_size * BITS_PER_UNIT / element_size;
+
+  tree input_element_type = TREE_TYPE (TREE_TYPE (value));
+
+  value = add_temp_var ("unpack_input", value);
+
+  for (size_t i = 0; i < element_count; ++i)
+    {
+      tree element
+	= build3 (BIT_FIELD_REF, input_element_type, value,
+		  TYPE_SIZE (input_element_type),
+		  bitsize_int(i * element_size));
+
+      element = add_temp_var ("scalar", element);
+      elements.push_back (element);
+    }
+}
+
+/* Pack the elements of the scalars in ELEMENTS to the returned vector.  */
+
+tree
+brig_function::pack (tree_stl_vec &elements)
+{
+  size_t element_count = elements.size ();
+
+  gcc_assert (element_count > 1);
+
+  tree output_element_type = TREE_TYPE (elements.at (0));
+
+  vec<constructor_elt, va_gc> *constructor_vals = NULL;
+  for (size_t i = 0; i < element_count; ++i)
+    CONSTRUCTOR_APPEND_ELT (constructor_vals, NULL_TREE, elements.at (i));
+
+  tree vec_type = build_vector_type (output_element_type, element_count);
+
+  /* build_constructor creates a vector type which is not a vector_cst
+     that requires compile time constant elements.  */
+  tree vec = build_constructor (vec_type, constructor_vals);
+
+  /* Add a temp variable for readability.  */
+  tree tmp_var = create_tmp_var (vec_type, "vec_out");
+  tree vec_tmp_assign = build2 (MODIFY_EXPR, TREE_TYPE (tmp_var), tmp_var, vec);
+  append_statement (vec_tmp_assign);
+  return tmp_var;
+}
+
+/* Returns true in case the given opcode needs to know about work-item context
+   data.  In such case the context data is passed as a pointer to a work-item
+   context object, as the last argument in the builtin call.  */
+
+bool
+brig_function::needs_workitem_context_data
+(BrigOpcode16_t brig_opcode)
+{
+  switch (brig_opcode)
+    {
+    case BRIG_OPCODE_WORKITEMABSID:
+    case BRIG_OPCODE_WORKITEMFLATABSID:
+    case BRIG_OPCODE_WORKITEMFLATID:
+    case BRIG_OPCODE_CURRENTWORKITEMFLATID:
+    case BRIG_OPCODE_WORKITEMID:
+    case BRIG_OPCODE_WORKGROUPID:
+    case BRIG_OPCODE_WORKGROUPSIZE:
+    case BRIG_OPCODE_CURRENTWORKGROUPSIZE:
+    case BRIG_OPCODE_GRIDGROUPS:
+    case BRIG_OPCODE_GRIDSIZE:
+    case BRIG_OPCODE_DIM:
+    case BRIG_OPCODE_PACKETID:
+    case BRIG_OPCODE_PACKETCOMPLETIONSIG:
+    case BRIG_OPCODE_BARRIER:
+    case BRIG_OPCODE_WAVEBARRIER:
+    case BRIG_OPCODE_ARRIVEFBAR:
+    case BRIG_OPCODE_INITFBAR:
+    case BRIG_OPCODE_JOINFBAR:
+    case BRIG_OPCODE_LEAVEFBAR:
+    case BRIG_OPCODE_RELEASEFBAR:
+    case BRIG_OPCODE_WAITFBAR:
+    case BRIG_OPCODE_CUID:
+    case BRIG_OPCODE_MAXCUID:
+    case BRIG_OPCODE_DEBUGTRAP:
+    case BRIG_OPCODE_GROUPBASEPTR:
+    case BRIG_OPCODE_KERNARGBASEPTR:
+    case BRIG_OPCODE_ALLOCA:
+      return true;
+    default:
+      return false;
+    };
+}
+
+/* Appends and returns a new temp variable and an accompanying assignment
+   statement that stores the value of the given EXPR and has the given NAME.  */
+
+tree
+brig_function::add_temp_var (std::string name, tree expr)
+{
+  tree temp_var = create_tmp_var (TREE_TYPE (expr), name.c_str ());
+  tree assign = build2 (MODIFY_EXPR, TREE_TYPE (temp_var), temp_var, expr);
+  append_statement (assign);
+  return temp_var;
+}
+
+/* Returns the integer constant value of the given node.
+   If it's a cast, looks into the source of the cast.  */
+
+HOST_WIDE_INT
+brig_function::int_constant_value (tree node)
+{
+  tree n = node;
+  if (TREE_CODE (n) == VIEW_CONVERT_EXPR)
+    n = TREE_OPERAND (n, 0);
+  return int_cst_value (n);
+}
+
+/* Returns the tree code that should be used to implement the given
+   HSA instruction opcode (BRIG_OPCODE) for the given type of instruction
+   (BRIG_TYPE).  In case the opcode cannot be mapped to a TREE node directly,
+   returns TREE_LIST (if it can be emulated with a simple chain of tree
+   nodes) or CALL_EXPR if the opcode should be implemented using a builtin
+   call.  */
+
+tree_code
+brig_function::get_tree_code_for_hsa_opcode
+  (BrigOpcode16_t brig_opcode, BrigType16_t brig_type)
+{
+  BrigType16_t brig_inner_type = brig_type & BRIG_TYPE_BASE_MASK;
+  switch (brig_opcode)
+    {
+    case BRIG_OPCODE_NOP:
+      return NOP_EXPR;
+    case BRIG_OPCODE_ADD:
+      return PLUS_EXPR;
+    case BRIG_OPCODE_CMOV:
+      if (brig_inner_type == brig_type)
+	return COND_EXPR;
+      else
+	return VEC_COND_EXPR;
+    case BRIG_OPCODE_SUB:
+      return MINUS_EXPR;
+    case BRIG_OPCODE_MUL:
+    case BRIG_OPCODE_MUL24:
+      return MULT_EXPR;
+    case BRIG_OPCODE_MULHI:
+    case BRIG_OPCODE_MUL24HI:
+      return MULT_HIGHPART_EXPR;
+    case BRIG_OPCODE_DIV:
+      if (gccbrig_is_float_type (brig_inner_type))
+	return RDIV_EXPR;
+      else
+	return TRUNC_DIV_EXPR;
+    case BRIG_OPCODE_NEG:
+      return NEGATE_EXPR;
+    case BRIG_OPCODE_MIN:
+      if (gccbrig_is_float_type (brig_inner_type))
+	return CALL_EXPR;
+      else
+	return MIN_EXPR;
+    case BRIG_OPCODE_MAX:
+      if (gccbrig_is_float_type (brig_inner_type))
+	return CALL_EXPR;
+      else
+	return MAX_EXPR;
+    case BRIG_OPCODE_FMA:
+      return FMA_EXPR;
+    case BRIG_OPCODE_ABS:
+      return ABS_EXPR;
+    case BRIG_OPCODE_SHL:
+      return LSHIFT_EXPR;
+    case BRIG_OPCODE_SHR:
+      return RSHIFT_EXPR;
+    case BRIG_OPCODE_OR:
+      return BIT_IOR_EXPR;
+    case BRIG_OPCODE_XOR:
+      return BIT_XOR_EXPR;
+    case BRIG_OPCODE_AND:
+      return BIT_AND_EXPR;
+    case BRIG_OPCODE_NOT:
+      return BIT_NOT_EXPR;
+    case BRIG_OPCODE_RET:
+      return RETURN_EXPR;
+    case BRIG_OPCODE_MOV:
+    case BRIG_OPCODE_LDF:
+      return MODIFY_EXPR;
+    case BRIG_OPCODE_LD:
+    case BRIG_OPCODE_ST:
+      return MEM_REF;
+    case BRIG_OPCODE_BR:
+      return GOTO_EXPR;
+    case BRIG_OPCODE_REM:
+      if (brig_type == BRIG_TYPE_U64 || brig_type == BRIG_TYPE_U32)
+	return TRUNC_MOD_EXPR;
+      else
+	return CALL_EXPR;
+    case BRIG_OPCODE_NRCP:
+    case BRIG_OPCODE_NRSQRT:
+      /* Implement as 1/f (x).  gcc should pattern detect that and
+	 use a native instruction, if available, for it.  */
+      return TREE_LIST;
+    case BRIG_OPCODE_FLOOR:
+    case BRIG_OPCODE_CEIL:
+    case BRIG_OPCODE_SQRT:
+    case BRIG_OPCODE_NSQRT:
+    case BRIG_OPCODE_RINT:
+    case BRIG_OPCODE_TRUNC:
+    case BRIG_OPCODE_POPCOUNT:
+    case BRIG_OPCODE_COPYSIGN:
+    case BRIG_OPCODE_NCOS:
+    case BRIG_OPCODE_NSIN:
+    case BRIG_OPCODE_NLOG2:
+    case BRIG_OPCODE_NEXP2:
+    case BRIG_OPCODE_NFMA:
+      /* Class has type B1 regardless of the float type, thus
+	 the below builtin map search cannot find it.  */
+    case BRIG_OPCODE_CLASS:
+    case BRIG_OPCODE_WORKITEMABSID:
+      return CALL_EXPR;
+    default:
+
+      /* Some BRIG opcodes can use the same builtins for unsigned and
+	 signed types.  Force these cases to unsigned types.
+      */
+
+      if (brig_opcode == BRIG_OPCODE_BORROW
+	  || brig_opcode == BRIG_OPCODE_CARRY
+	  || brig_opcode == BRIG_OPCODE_LASTBIT
+	  || brig_opcode == BRIG_OPCODE_BITINSERT)
+	{
+	  if (brig_type == BRIG_TYPE_S32)
+	    brig_type = BRIG_TYPE_U32;
+	  else if (brig_type == BRIG_TYPE_S64)
+	    brig_type = BRIG_TYPE_U64;
+	}
+
+
+      builtin_map::const_iterator i
+	= s_custom_builtins.find (std::make_pair (brig_opcode, brig_type));
+      if (i != s_custom_builtins.end ())
+	return CALL_EXPR;
+      else if (s_custom_builtins.find
+	       (std::make_pair (brig_opcode, brig_inner_type))
+	       != s_custom_builtins.end ())
+	return CALL_EXPR;
+      if (brig_inner_type == BRIG_TYPE_F16
+	  && s_custom_builtins.find
+	  (std::make_pair (brig_opcode, BRIG_TYPE_F32))
+	  != s_custom_builtins.end ())
+	return CALL_EXPR;
+      break;
+    }
+  return TREE_LIST; /* Emulate using a chain of nodes.  */
+}
+
+/* Inform of an update to the REG_VAR.  */
+
+void
+brig_function::add_reg_var_update (tree reg_var, tree var)
+{
+  if (var == m_abs_id_vars[0] || var == m_abs_id_vars[1]
+      || var == m_abs_id_vars[2] || var == m_local_id_vars[0]
+      || var == m_local_id_vars[1] || var == m_local_id_vars[2])
+    m_id_val_defs [reg_var] = var;
+  else
+    {
+      /* Possible overwrite of an ID value.  */
+
+      id_val_map::iterator i = m_id_val_defs.find (reg_var);
+      if (i != m_id_val_defs.end())
+	m_id_val_defs.erase (i);
+    }
+}
+
+/* If the REG_VAR is known to contain an ID value at this point in
+   the basic block, return true.  */
+
+bool
+brig_function::is_id_val (tree reg_var)
+{
+  id_val_map::iterator i = m_id_val_defs.find (reg_var);
+  return i != m_id_val_defs.end();
+}
+
+/* Return an ID value for the given REG_VAR if its known to contain
+   one at this point in the BB, NULL_TREE otherwise.  */
+
+tree
+brig_function::id_val (tree reg_var)
+{
+  id_val_map::iterator i = m_id_val_defs.find (reg_var);
+  if (i != m_id_val_defs.end())
+    return (*i).second;
+  else
+    return NULL_TREE;
+}
+
+/* Informs of starting a new basic block.  Called when generating
+   a label, a call, a jump, or a return.  */
+
+void
+brig_function::start_new_bb ()
+{
+  m_id_val_defs.clear ();
+}
diff --git a/gcc/brig/brigfrontend/brig-function.h b/gcc/brig/brigfrontend/brig-function.h
index 6149719..8fde3a5 100644
--- a/gcc/brig/brigfrontend/brig-function.h
+++ b/gcc/brig/brigfrontend/brig-function.h
@@ -105,6 +105,30 @@ public:
 
   void analyze_calls ();
 
+  tree expand_builtin (BrigOpcode16_t brig_opcode, tree_stl_vec &operands);
+
+  tree expand_or_call_builtin (BrigOpcode16_t brig_opcode,
+			       BrigType16_t brig_type, tree arith_type,
+			       tree_stl_vec &operands);
+  bool can_expand_builtin (BrigOpcode16_t brig_opcode) const;
+
+  tree get_builtin_for_hsa_opcode (tree type, BrigOpcode16_t brig_opcode,
+				   BrigType16_t brig_type) const;
+
+  void unpack (tree value, tree_stl_vec &elements);
+  tree pack (tree_stl_vec &elements);
+  tree add_temp_var (std::string name, tree expr);
+
+  static bool needs_workitem_context_data (BrigOpcode16_t brig_opcode);
+  static HOST_WIDE_INT int_constant_value (tree node);
+  static tree_code get_tree_code_for_hsa_opcode (BrigOpcode16_t brig_opcode,
+						 BrigType16_t brig_type);
+
+  void start_new_bb ();
+  void add_reg_var_update (tree reg_var, tree val);
+  bool is_id_val (tree reg_var);
+  tree id_val (tree reg_var);
+
   const BrigDirectiveExecutable *m_brig_def;
 
   bool m_is_kernel;
@@ -183,6 +207,11 @@ public:
   tree m_wg_id_vars[3];
   tree m_wg_size_vars[3];
   tree m_grid_size_vars[3];
+  /* Explicitly computed WG base for the absolute IDs which is used
+     as the initial value when looping that dimension.   We update
+     the abs id with ++ to make it easy for the vectorizer.  */
+  tree m_abs_id_base_vars[3];
+  tree m_abs_id_vars[3];
 
   /* Set to true in case the kernel contains at least one dispatch packet
      (work-item ID-related) builtin call that could not be expanded to
@@ -219,6 +248,20 @@ private:
   /* Bookkeeping for the different HSA registers and their tree declarations
      for the currently generated function.  */
   reg_decl_index_entry *m_regs[BRIG_2_TREE_HSAIL_TOTAL_REG_COUNT];
+
+  /* Map for keeping book reads of ID variables, which can be propagated
+     to uses in address expressions to produce cleaner indexing functions
+     with unnecessary casts stripped off, etc.  */
+  typedef std::map<tree, tree> id_val_map;
+
+  /* Keeps track of ID values alive in registers in the currently
+     processed BB.  */
+  id_val_map m_id_val_defs;
+
+  /* HSAIL-specific builtin functions not yet integrated to gcc.  */
+  typedef std::map<std::pair<BrigOpcode16_t, BrigType16_t>, tree> builtin_map;
+
+  static builtin_map s_custom_builtins;
 };
 
 #endif
diff --git a/gcc/brig/brigfrontend/brig-label-handler.cc b/gcc/brig/brigfrontend/brig-label-handler.cc
index 7605b76..938df82 100644
--- a/gcc/brig/brigfrontend/brig-label-handler.cc
+++ b/gcc/brig/brigfrontend/brig-label-handler.cc
@@ -31,7 +31,10 @@ brig_directive_label_handler::operator () (const BrigBase *base)
   std::string label_str ((const char *) (label_name->bytes),
 			 label_name->byteCount);
 
+  m_parent.m_cf->start_new_bb ();
+
   tree stmt = build_stmt (LABEL_EXPR, m_parent.m_cf->label (label_str));
   m_parent.m_cf->append_statement (stmt);
+
   return base->byteCount;
 }
diff --git a/gcc/brig/brigfrontend/brig-lane-inst-handler.cc b/gcc/brig/brigfrontend/brig-lane-inst-handler.cc
index 1da0bc0..385da33 100644
--- a/gcc/brig/brigfrontend/brig-lane-inst-handler.cc
+++ b/gcc/brig/brigfrontend/brig-lane-inst-handler.cc
@@ -59,7 +59,7 @@ brig_lane_inst_handler::operator () (const BrigBase *base)
       elements.push_back (zero_cst);
       elements.push_back (zero_cst);
 
-      expr = pack (elements);
+      expr = m_parent.m_cf->pack (elements);
     }
   else if (inst.base.opcode == BRIG_OPCODE_ACTIVELANEPERMUTE)
     {
diff --git a/gcc/brig/brigfrontend/brig-mem-inst-handler.cc b/gcc/brig/brigfrontend/brig-mem-inst-handler.cc
index 350516f..d8374f2 100644
--- a/gcc/brig/brigfrontend/brig-mem-inst-handler.cc
+++ b/gcc/brig/brigfrontend/brig-mem-inst-handler.cc
@@ -63,7 +63,7 @@ brig_mem_inst_handler::build_mem_access (const BrigInstBase *brig_inst,
     {
       /* Add a temporary variable so there won't be multiple
 	 reads in case of vector unpack.  */
-      mem_ref = add_temp_var ("mem_read", mem_ref);
+      mem_ref = m_parent.m_cf->add_temp_var ("mem_read", mem_ref);
       return build_output_assignment (*brig_inst, data, mem_ref);
     }
   else
@@ -95,8 +95,9 @@ brig_mem_inst_handler::operator () (const BrigBase *base)
       inputs.push_back (operands[1]);
       inputs.push_back (align_opr);
       tree builtin_call
-	= expand_or_call_builtin (BRIG_OPCODE_ALLOCA, BRIG_TYPE_U32,
-				  uint32_type_node, inputs);
+	= m_parent.m_cf->expand_or_call_builtin (BRIG_OPCODE_ALLOCA,
+						 BRIG_TYPE_U32,
+						 uint32_type_node, inputs);
       build_output_assignment (*brig_inst, operands[0], builtin_call);
       m_parent.m_cf->m_has_allocas = true;
       return base->byteCount;
diff --git a/gcc/brig/brigfrontend/phsa.h b/gcc/brig/brigfrontend/phsa.h
index d224752..fe0b9a5 100644
--- a/gcc/brig/brigfrontend/phsa.h
+++ b/gcc/brig/brigfrontend/phsa.h
@@ -58,13 +58,22 @@ typedef struct __attribute__((__packed__))
 
 /* The prefix to use in the ELF section containing descriptor for
    a function.  */
+
 #define PHSA_DESC_SECTION_PREFIX "phsa.desc."
 #define PHSA_HOST_DEF_PTR_PREFIX "__phsa.host_def."
 
 /* The frontend error messages are parsed by the host runtime.  Known
    prefix strings are used to separate the different runtime error
    codes.  */
+
 #define PHSA_ERROR_PREFIX_INCOMPATIBLE_MODULE "Incompatible module: "
 #define PHSA_ERROR_PREFIX_CORRUPTED_MODULE "Corrupted module: "
 
+/* Offsets of attributes in the PHSA context structs.
+   Used by -fphsa-wi-context-opt.  */
+#define PHSA_CONTEXT_OFFS_WI_IDS 0
+#define PHSA_CONTEXT_OFFS_WG_IDS (PHSA_CONTEXT_OFFS_WI_IDS + 3 * 4)
+#define PHSA_CONTEXT_WG_SIZES (PHSA_CONTEXT_OFFS_WG_IDS + 3 * 4)
+#define PHSA_CONTEXT_CURRENT_WG_SIZES (PHSA_CONTEXT_WG_SIZES + 3 * 4)
+
 #endif
diff --git a/gcc/brig/lang.opt b/gcc/brig/lang.opt
index 1c83f5f..2cc6cb9 100644
--- a/gcc/brig/lang.opt
+++ b/gcc/brig/lang.opt
@@ -31,6 +31,11 @@ BRIG Separate Alias(d)
 -dump=
 BRIG Joined Alias(d)
 
+fassume-phsa
+BRIG Report Var(flag_assume_phsa) Init(1) Optimization
+Assume we are finalizing for phsa and its libhsail-rt.  Enables additional
+phsa-specific optimizations (default).
+
 L
 BRIG Joined Separate
 ; Not documented
diff --git a/gcc/builtin-types.def b/gcc/builtin-types.def
index 8f3d796..5365bef 100644
--- a/gcc/builtin-types.def
+++ b/gcc/builtin-types.def
@@ -283,7 +283,9 @@ DEF_FUNCTION_TYPE_1 (BT_FN_UINT_INT, BT_UINT, BT_INT)
 DEF_FUNCTION_TYPE_1 (BT_FN_UINT_ULONG, BT_UINT, BT_ULONG)
 DEF_FUNCTION_TYPE_1 (BT_FN_UINT_LONG, BT_UINT, BT_LONG)
 DEF_FUNCTION_TYPE_1 (BT_FN_UINT_PTR, BT_UINT, BT_PTR)
+DEF_FUNCTION_TYPE_1 (BT_FN_UINT_CONST_PTR, BT_UINT, BT_CONST_PTR)
 DEF_FUNCTION_TYPE_1 (BT_FN_ULONG_PTR, BT_ULONG, BT_PTR)
+DEF_FUNCTION_TYPE_1 (BT_FN_ULONG_CONST_PTR, BT_ULONG, BT_CONST_PTR)
 DEF_FUNCTION_TYPE_1 (BT_FN_ULONG_ULONG, BT_ULONG, BT_ULONG)
 DEF_FUNCTION_TYPE_1 (BT_FN_ULONGLONG_ULONGLONG, BT_ULONGLONG, BT_ULONGLONG)
 DEF_FUNCTION_TYPE_1 (BT_FN_INT8_FLOAT, BT_INT8, BT_FLOAT)
@@ -480,6 +482,7 @@ DEF_FUNCTION_TYPE_2 (BT_FN_BOOL_SIZE_CONST_VPTR, BT_BOOL, BT_SIZE,
 DEF_FUNCTION_TYPE_2 (BT_FN_BOOL_INT_BOOL, BT_BOOL, BT_INT, BT_BOOL)
 DEF_FUNCTION_TYPE_2 (BT_FN_VOID_UINT_UINT, BT_VOID, BT_UINT, BT_UINT)
 DEF_FUNCTION_TYPE_2 (BT_FN_UINT_UINT_PTR, BT_UINT, BT_UINT, BT_PTR)
+DEF_FUNCTION_TYPE_2 (BT_FN_UINT_UINT_CONST_PTR, BT_UINT, BT_UINT, BT_CONST_PTR)
 DEF_FUNCTION_TYPE_2 (BT_FN_PTR_CONST_PTR_SIZE, BT_PTR, BT_CONST_PTR, BT_SIZE)
 DEF_FUNCTION_TYPE_2 (BT_FN_PTR_CONST_PTR_CONST_PTR, BT_PTR, BT_CONST_PTR, BT_CONST_PTR)
 DEF_FUNCTION_TYPE_2 (BT_FN_VOID_PTRPTR_CONST_PTR, BT_VOID, BT_PTR_PTR, BT_CONST_PTR)
@@ -569,6 +572,7 @@ DEF_FUNCTION_TYPE_3 (BT_FN_VOID_DOUBLE_DOUBLEPTR_DOUBLEPTR,
 DEF_FUNCTION_TYPE_3 (BT_FN_VOID_LONGDOUBLE_LONGDOUBLEPTR_LONGDOUBLEPTR,
 		     BT_VOID, BT_LONGDOUBLE, BT_LONGDOUBLE_PTR, BT_LONGDOUBLE_PTR)
 DEF_FUNCTION_TYPE_3 (BT_FN_VOID_PTR_PTR_PTR, BT_VOID, BT_PTR, BT_PTR, BT_PTR)
+DEF_FUNCTION_TYPE_3 (BT_FN_VOID_PTR_PTR_UINT32, BT_VOID, BT_PTR, BT_PTR, BT_UINT32)
 DEF_FUNCTION_TYPE_3 (BT_FN_INT_CONST_STRING_PTR_CONST_STRING_PTR_CONST_STRING,
 		     BT_INT, BT_CONST_STRING, BT_PTR_CONST_STRING, BT_PTR_CONST_STRING)
 DEF_FUNCTION_TYPE_3 (BT_FN_INT_INT_CONST_STRING_VALIST_ARG,
diff --git a/gcc/testsuite/brig.dg/test/gimple/smoke_test.hsail b/gcc/testsuite/brig.dg/test/gimple/smoke_test.hsail
index 1f36ddc..6e23263 100644
--- a/gcc/testsuite/brig.dg/test/gimple/smoke_test.hsail
+++ b/gcc/testsuite/brig.dg/test/gimple/smoke_test.hsail
@@ -41,15 +41,15 @@ prog kernel &KernelWithBarrier(kernarg_u64 %input_ptr, kernarg_u64 %output_ptr)
 };
 
 /* The kernel function itself should have a fingerprint as follows */
-/* _Kernel (unsigned char * __args, void * __context, void * __group_base_addr, void * __private_base_addr) */
-/* { dg-final { scan-tree-dump "_Kernel \\\(unsigned char \\\* __args, void \\\* __context, void \\\* __group_base_addr, unsigned int __group_local_offset, void \\\* __private_base_addr\\\)" "gimple"} } */
+/* _Kernel (const unsigned char * restrict __args, void * restrict __context, unsigned char * restrict __group_base_addr, unsigned int __group_local_offset, unsigned char * restrict __private_base_addr) */
+/* { dg-final { scan-tree-dump "_Kernel \\\(const unsigned char \\\* restrict __args, void \\\* restrict __context, unsigned char \\\* restrict __group_base_addr, unsigned int __group_local_offset, unsigned char \\\* restrict __private_base_addr\\\)" "gimple"} } */
 
 /* ld_kernarg: mem_read.0 = MEM[(unsigned long *)__args]; */
 /* { dg-final { scan-tree-dump "mem_read.\[0-9\] = MEM\\\[\\\(unsigned long \\\*\\\)__args\\\];" "gimple"} } */
 
 /* The latter ld_global_u32 should be visible as a pointer dereference (after pointer arithmetics on a temporary var): */
 /* mem_read.2 = *D.1691; */
-/* { dg-final { scan-tree-dump "mem_read.\[0-9\] = \\\*\[_0-9\]+;" "gimple"} } */
+/* { dg-final { scan-tree-dump "mem_read.\[0-9\]+ = \\\*\[_0-9\]+;" "gimple"} } */
 
 /* add_u32s should generate +operators */
 /* { dg-final { scan-tree-dump "s2 = s0 \\\+ s1;" "gimple"} } */
@@ -71,8 +71,8 @@ prog kernel &KernelWithBarrier(kernarg_u64 %input_ptr, kernarg_u64 %output_ptr)
 /* { dg-final { scan-tree-dump "if \\\(__local_z < __cur_wg_size_z\\\) goto __wi_loop_z; else goto" "gimple"} } */
 
 /* The launcher should call __hsail_launch_wg_function in this case: */
-/* Kernel (void * __context, void * __group_base_addr) */
-/* { dg-final { scan-tree-dump "Kernel \\\(void \\\* __context, void \\\* __group_base_addr\\\)" "gimple"} } */
+/* Kernel (void * restrict __context, unsigned char * restrict __group_base_addr) */
+/* { dg-final { scan-tree-dump "Kernel \\\(void \\\* restrict __context, unsigned char \\\* restrict __group_base_addr\\\)" "gimple"} } */
 /* { dg-final { scan-tree-dump "__hsail_launch_wg_function \\\(_Kernel, __context, __group_base_addr, group_local_offset.*\\\);" "gimple"} }*/
 
 /* The kernel should have the magic metadata section injected to the ELF. */
diff --git a/libhsail-rt/include/internal/phsa-rt.h b/libhsail-rt/include/internal/phsa-rt.h
index d9db56c..c09f18d 100644
--- a/libhsail-rt/include/internal/phsa-rt.h
+++ b/libhsail-rt/include/internal/phsa-rt.h
@@ -54,7 +54,6 @@ typedef void (*gccbrigKernelFunc) (unsigned char *, void *, void *, uint32_t,
 */
 typedef struct
 {
-
   /* Data set by the HSA Runtime's kernel launcher.  */
   hsa_kernel_dispatch_packet_t *dp;
 
diff --git a/libhsail-rt/include/internal/workitems.h b/libhsail-rt/include/internal/workitems.h
index 73add28..0839853f 100644
--- a/libhsail-rt/include/internal/workitems.h
+++ b/libhsail-rt/include/internal/workitems.h
@@ -45,11 +45,6 @@
 
 typedef struct
 {
-  /* The group id of the currently executed WG.  */
-  size_t x;
-  size_t y;
-  size_t z;
-
   /* This is 1 in case there are more work groups to execute.
      If 0, the work-item threads should finish themselves.  */
   int more_wgs;
@@ -89,6 +84,16 @@ typedef struct
      stack frame.  Initialized to point outside the private segment.  */
   uint32_t alloca_frame_p;
 
+  /* The group id of the currently executed WG.  This is for fiber based
+     execution.  The group ids are duplicated also to the per WI context
+     struct for simplified single pointer access in the GCCBRIG produced
+     code.
+   */
+
+  uint32_t x;
+  uint32_t y;
+  uint32_t z;
+
 } PHSAWorkGroup;
 
 /* Data identifying a single work-item, passed to the work-item thread in case
@@ -96,17 +101,42 @@ typedef struct
 
 typedef struct
 {
+  /* NOTE: These members STARTing here should not be moved as they are
+     accessed directly by code emitted by BRIG FE.   */
+
+  /* The local id of the current WI. */
+
+  uint32_t x;
+  uint32_t y;
+  uint32_t z;
+
+  /* The group id of the currently executed WG.  */
+
+  uint32_t group_x;
+  uint32_t group_y;
+  uint32_t group_z;
+
+  /* The local size of a complete WG.  */
+
+  uint32_t wg_size_x;
+  uint32_t wg_size_y;
+  uint32_t wg_size_z;
+
+  /* The local size of the current WG.  */
+
+  uint32_t cur_wg_size_x;
+  uint32_t cur_wg_size_y;
+  uint32_t cur_wg_size_z;
+
+  /* NOTE: Fixed members END here.  */
+
   PHSAKernelLaunchData *launch_data;
   /* Identifies and keeps book of the currently executed WG of the WI swarm.  */
   volatile PHSAWorkGroup *wg;
-  /* The local id of the current WI.  */
-  size_t x;
-  size_t y;
-  size_t z;
 #ifdef HAVE_FIBERS
   fiber_t fiber;
 #endif
-} PHSAWorkItem;
+} __attribute__((packed)) PHSAWorkItem;
 
 
 #endif
diff --git a/libhsail-rt/rt/workitems.c b/libhsail-rt/rt/workitems.c
index 36c9169..c846350 100644
--- a/libhsail-rt/rt/workitems.c
+++ b/libhsail-rt/rt/workitems.c
@@ -107,11 +107,20 @@ phsa_work_item_thread (int arg0, int arg1)
 	 the current_work_group_* is set to point to the WG executed next.  */
       if (!wi->wg->more_wgs)
 	break;
+
+      wi->group_x = wg->x;
+      wi->group_y = wg->y;
+      wi->group_z = wg->z;
+
+      wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
+      wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
+      wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
+
 #ifdef DEBUG_PHSA_RT
       printf (
 	"Running work-item %lu/%lu/%lu for wg %lu/%lu/%lu / %lu/%lu/%lu...\n",
-	wi->x, wi->y, wi->z, wg->x, wg->y, wg->z, l_data->wg_max_x,
-	l_data->wg_max_y, l_data->wg_max_z);
+	wi->x, wi->y, wi->z, wi->group_x, wi->group_y, wi->group_z,
+	l_data->wg_max_x, l_data->wg_max_y, l_data->wg_max_z);
 #endif
 
       if (wi->x < __hsail_currentworkgroupsize (0, wi)
@@ -180,6 +189,13 @@ phsa_work_item_thread (int arg0, int arg1)
 	  else
 	    wg->x++;
 #endif
+	  wi->group_x = wg->x;
+	  wi->group_y = wg->y;
+	  wi->group_z = wg->z;
+
+	  wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
+	  wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
+	  wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
 
 	  /* Reinitialize the work-group barrier according to the new WG's
 	     size, which might not be the same as the previous ones, due
@@ -233,6 +249,7 @@ phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr,
   PHSAWorkItem *wi_threads = NULL;
   PHSAWorkGroup wg;
   size_t flat_wi_id = 0, x, y, z, max_x, max_y, max_z;
+  uint32_t group_x, group_y, group_z;
   fiber_barrier_t wg_start_barrier;
   fiber_barrier_t wg_completion_barrier;
   fiber_barrier_t wg_sync_barrier;
@@ -257,13 +274,13 @@ phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr,
   wg.initial_group_offset = group_local_offset;
 
 #ifdef EXECUTE_WGS_BACKWARDS
-  wg.x = context->wg_max_x - 1;
-  wg.y = context->wg_max_y - 1;
-  wg.z = context->wg_max_z - 1;
+  group_x = context->wg_max_x - 1;
+  group_y = context->wg_max_y - 1;
+  group_z = context->wg_max_z - 1;
 #else
-  wg.x = context->wg_min_x;
-  wg.y = context->wg_min_y;
-  wg.z = context->wg_min_z;
+  group_x = context->wg_min_x;
+  group_y = context->wg_min_y;
+  group_z = context->wg_min_z;
 #endif
 
   fiber_barrier_init (&wg_sync_barrier, wg_size);
@@ -290,6 +307,19 @@ phsa_execute_wi_gang (PHSAKernelLaunchData *context, void *group_base_ptr,
 	  PHSAWorkItem *wi = &wi_threads[flat_wi_id];
 	  wi->launch_data = context;
 	  wi->wg = &wg;
+
+	  wg.x = wi->group_x = group_x;
+	  wg.y = wi->group_y = group_y;
+	  wg.z = wi->group_z = group_z;
+
+	  wi->wg_size_x = context->dp->workgroup_size_x;
+	  wi->wg_size_y = context->dp->workgroup_size_y;
+	  wi->wg_size_z = context->dp->workgroup_size_z;
+
+	  wi->cur_wg_size_x = __hsail_currentworkgroupsize (0, wi);
+	  wi->cur_wg_size_y = __hsail_currentworkgroupsize (1, wi);
+	  wi->cur_wg_size_z = __hsail_currentworkgroupsize (2, wi);
+
 	  wi->x = x;
 	  wi->y = y;
 	  wi->z = z;
@@ -467,9 +497,17 @@ phsa_execute_work_groups (PHSAKernelLaunchData *context, void *group_base_ptr,
     for (wg_y = context->wg_min_y; wg_y < context->wg_max_y; ++wg_y)
       for (wg_x = context->wg_min_x; wg_x < context->wg_max_x; ++wg_x)
 	{
-	  wi.wg->x = wg_x;
-	  wi.wg->y = wg_y;
-	  wi.wg->z = wg_z;
+	  wi.group_x = wg_x;
+	  wi.group_y = wg_y;
+	  wi.group_z = wg_z;
+
+	  wi.wg_size_x = context->dp->workgroup_size_x;
+	  wi.wg_size_y = context->dp->workgroup_size_y;
+	  wi.wg_size_z = context->dp->workgroup_size_z;
+
+	  wi.cur_wg_size_x = __hsail_currentworkgroupsize (0, &wi);
+	  wi.cur_wg_size_y = __hsail_currentworkgroupsize (1, &wi);
+	  wi.cur_wg_size_z = __hsail_currentworkgroupsize (2, &wi);
 
 	  context->kernel (context->kernarg_addr, &wi, group_base_ptr,
 			   group_local_offset, private_base_ptr);
@@ -564,15 +602,15 @@ __hsail_workitemabsid (uint32_t dim, PHSAWorkItem *context)
     default:
     case 0:
       /* Overflow semantics in the case of WG dim > grid dim.  */
-      id = ((uint64_t) context->wg->x * dp->workgroup_size_x + context->x)
+      id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x)
 	   % dp->grid_size_x;
       break;
     case 1:
-      id = ((uint64_t) context->wg->y * dp->workgroup_size_y + context->y)
+      id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y)
 	   % dp->grid_size_y;
       break;
     case 2:
-      id = ((uint64_t) context->wg->z * dp->workgroup_size_z + context->z)
+      id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z)
 	   % dp->grid_size_z;
       break;
     }
@@ -590,15 +628,15 @@ __hsail_workitemabsid_u64 (uint32_t dim, PHSAWorkItem *context)
     default:
     case 0:
       /* Overflow semantics in the case of WG dim > grid dim.  */
-      id = ((uint64_t) context->wg->x * dp->workgroup_size_x + context->x)
+      id = ((uint64_t) context->group_x * dp->workgroup_size_x + context->x)
 	   % dp->grid_size_x;
       break;
     case 1:
-      id = ((uint64_t) context->wg->y * dp->workgroup_size_y + context->y)
+      id = ((uint64_t) context->group_y * dp->workgroup_size_y + context->y)
 	   % dp->grid_size_y;
       break;
     case 2:
-      id = ((uint64_t) context->wg->z * dp->workgroup_size_z + context->z)
+      id = ((uint64_t) context->group_z * dp->workgroup_size_z + context->z)
 	   % dp->grid_size_z;
       break;
     }
@@ -738,19 +776,19 @@ __hsail_currentworkgroupsize (uint32_t dim, PHSAWorkItem *wi)
     {
     default:
     case 0:
-      if ((uint64_t) wi->wg->x < dp->grid_size_x / dp->workgroup_size_x)
+      if ((uint64_t) wi->group_x < dp->grid_size_x / dp->workgroup_size_x)
 	wg_size = dp->workgroup_size_x; /* Full WG.  */
       else
 	wg_size = dp->grid_size_x % dp->workgroup_size_x; /* Partial WG.  */
       break;
     case 1:
-      if ((uint64_t) wi->wg->y < dp->grid_size_y / dp->workgroup_size_y)
+      if ((uint64_t) wi->group_y < dp->grid_size_y / dp->workgroup_size_y)
 	wg_size = dp->workgroup_size_y; /* Full WG.  */
       else
 	wg_size = dp->grid_size_y % dp->workgroup_size_y; /* Partial WG.  */
       break;
     case 2:
-      if ((uint64_t) wi->wg->z < dp->grid_size_z / dp->workgroup_size_z)
+      if ((uint64_t) wi->group_z < dp->grid_size_z / dp->workgroup_size_z)
 	wg_size = dp->workgroup_size_z; /* Full WG.  */
       else
 	wg_size = dp->grid_size_z % dp->workgroup_size_z; /* Partial WG.  */
@@ -798,11 +836,11 @@ __hsail_workgroupid (uint32_t dim, PHSAWorkItem *wi)
     {
     default:
     case 0:
-      return wi->wg->x;
+      return wi->group_x;
     case 1:
-      return wi->wg->y;
+      return wi->group_y;
     case 2:
-      return wi->wg->z;
+      return wi->group_z;
     }
 }
 
-- 
2.7.4

Reply via email to