diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c
index 9205d49..4951b0a 100644
--- a/gcc/config/i386/i386.c
+++ b/gcc/config/i386/i386.c
@@ -48577,6 +48577,25 @@ ix86_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
     }
 }
 
+/* Implement targetm.vectorize.builtin_masking_cost.  */
+
+static int
+ix86_builtin_masking_cost (enum vect_cost_for_masking k, tree vectype)
+{
+  if (GET_MODE_CLASS (TYPE_MODE (vectype)) != MODE_VECTOR_INT)
+    return 0;
+
+  switch (k)
+    {
+    case masking_load:
+      return 0;
+    case masking_store:
+      return (ix86_tune == PROCESSOR_HASWELL) ? 10 : 0;
+    default:
+      return ix86_builtin_vectorization_cost (vector_stmt, NULL_TREE, 0);
+    }
+}
+
 /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
    insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
    insn every time.  */
@@ -54300,6 +54319,9 @@ ix86_addr_space_zero_address_valid (addr_space_t as)
 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
   ix86_builtin_vectorization_cost
+#undef TARGET_VECTORIZE_BUILTIN_MASKING_COST
+#define TARGET_VECTORIZE_BUILTIN_MASKING_COST \
+  ix86_builtin_masking_cost
 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
   ix86_vectorize_vec_perm_const_ok
diff --git a/gcc/doc/tm.texi.in b/gcc/doc/tm.texi.in
index 96ca063a..ba6a841 100644
--- a/gcc/doc/tm.texi.in
+++ b/gcc/doc/tm.texi.in
@@ -4222,6 +4222,8 @@ address;  but often a machine-dependent strategy can generate better code.
 
 @hook TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
 
+@hook TARGET_VECTORIZE_BUILTIN_MASKING_COST
+
 @hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
 
 @hook TARGET_VECTORIZE_VEC_PERM_CONST_OK
diff --git a/gcc/params.def b/gcc/params.def
index c5d96e7..849373b 100644
--- a/gcc/params.def
+++ b/gcc/params.def
@@ -1177,6 +1177,11 @@ DEFPARAM (PARAM_MAX_SSA_NAME_QUERY_DEPTH,
 	  "Maximum recursion depth allowed when querying a property of an"
 	  " SSA name.",
 	  2, 1, 0)
+
+DEFPARAM (PARAM_VECT_COST_INCREASE_THRESHOLD,
+	  "vect-cost-increase-threshold",
+	  "Maximum cost increase, in percent of one vector iteration,"
+	  " allowed when masking a vectorized loop to execute its scalar"
+	  " epilogue.",
+	  10, 0, 100)
 /*
 
 Local variables:
diff --git a/gcc/target.def b/gcc/target.def
index 61cb14b..aa95f52 100644
--- a/gcc/target.def
+++ b/gcc/target.def
@@ -1768,6 +1768,14 @@ misalignment value (@var{misalign}).",
  int, (enum vect_cost_for_stmt type_of_cost, tree vectype, int misalign),
  default_builtin_vectorization_cost)
 
+/* Cost of masking the different kinds of statements in a vectorized loop
+   for scalar epilogue vectorization.  */
+DEFHOOK
+(builtin_masking_cost,
+ "Returns cost of convertion vector statement to masked form.",
+  int, (enum vect_cost_for_masking kind, tree vectype),
+  defualt_builtin_masking_cost)
+
 /* Return true if vector alignment is reachable (by peeling N
    iterations) for the given type.  */
 DEFHOOK
diff --git a/gcc/target.h b/gcc/target.h
index ffc4d6a..3f7d3c6 100644
--- a/gcc/target.h
+++ b/gcc/target.h
@@ -173,6 +173,15 @@ enum vect_cost_for_stmt
   vec_construct
 };
 
+/* Types of costs for masking statements in vectorized loops.  */
+enum vect_cost_for_masking
+{
+  masking_load,
+  masking_store,
+  masking_masked_stmt,
+  masking_reduction
+};
+
 /* Separate locations for which the vectorizer cost model should
    track costs.  */
 enum vect_cost_model_location {
diff --git a/gcc/targhooks.c b/gcc/targhooks.c
index c34b4e9..ca3a044 100644
--- a/gcc/targhooks.c
+++ b/gcc/targhooks.c
@@ -591,6 +591,26 @@ default_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
     }
 }
 
+/* Default masking cost model values.  */
+
+int
+default_builtin_masking_cost (enum vect_cost_for_masking kind,
+			      tree vectype ATTRIBUTE_UNUSED)
+{
+  switch (kind)
+    {
+    case masking_load:
+    case masking_store:
+      return 1;
+
+    case masking_reduction:
+    case masking_masked_stmt:
+      return default_builtin_vectorization_cost (vector_stmt, NULL_TREE, 0);
+
+    default:
+      gcc_unreachable ();
+    }
+}
+
 /* Reciprocal.  */
 
 tree
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index 55e5309..500add9 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -1064,6 +1064,7 @@ new_loop_vec_info (struct loop *loop)
   LOOP_VINFO_NITERSM1 (res) = NULL;
   LOOP_VINFO_NITERS (res) = NULL;
   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
+  LOOP_VINFO_NITERS_VECT_LOOP (res) = NULL;
   LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
@@ -3210,6 +3211,10 @@ vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
 	       &vec_inside_cost, &vec_epilogue_cost);
 
   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
+
+  /* Save the cost of a single vector iteration for possible vectorization
+     of the scalar epilogue.  */
+  LOOP_VINFO_SINGLE_VECTOR_ITERATION_COST (loop_vinfo) = vec_inside_cost;
   
   if (dump_enabled_p ())
     {
@@ -6405,9 +6410,13 @@ vect_transform_loop (loop_vec_info loop_vinfo)
     {
       tree ratio_mult_vf;
       if (!ni_name)
-	ni_name = vect_build_loop_niters (loop_vinfo);
+	{
+	  ni_name = vect_build_loop_niters (loop_vinfo);
+	  LOOP_VINFO_NITERS (loop_vinfo) = ni_name;
+	}
       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
 				       &ratio);
+      LOOP_VINFO_NITERS_VECT_LOOP (loop_vinfo) = ratio_mult_vf;
       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
 				      th, check_profitability);
     }
diff --git a/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
index cfe30e0..39ea438b 100644
--- a/gcc/tree-vect-stmts.c
+++ b/gcc/tree-vect-stmts.c
@@ -47,6 +47,9 @@ along with GCC; see the file COPYING3.  If not see
 #include "tree-scalar-evolution.h"
 #include "tree-vectorizer.h"
 #include "builtins.h"
+#include "tree-ssa-address.h"
+#include "tree-ssa-loop-ivopts.h"
+#include "params.h"
 
 /* For lang_hooks.types.type_for_mode.  */
 #include "langhooks.h"
@@ -8938,3 +8941,742 @@ supportable_narrowing_operation (enum tree_code code,
   interm_types->release ();
   return false;
 }
+
+/* Fix the trip count of the vectorized loop so that it also covers the
+   iterations of the loop remainder.  */
+
+static void
+fix_vec_loop_trip_count (loop_vec_info loop_vinfo)
+{
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  tree niters;
+  tree ratio_mult_vf = LOOP_VINFO_NITERS_VECT_LOOP (loop_vinfo);
+  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  gimple *stmt;
+  gimple_stmt_iterator gsi;
+
+  niters = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+	   ? LOOP_VINFO_NITERS (loop_vinfo)
+	   : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo);
+
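+  /* The bound of the vectorized loop was computed as niters >> log2 (vf);
+     rewrite it as (niters + vf - 1) >> log2 (vf) so that the vector loop
+     also executes the iterations of the former scalar remainder.  */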
+  if (TREE_CODE (ratio_mult_vf) == SSA_NAME)
+    {
+      gimple *def = SSA_NAME_DEF_STMT (ratio_mult_vf);
+      tree bnd, lhs, tmp, log_vf;
+      gimple *def_bnd;
+      gimple *new_def_bnd;
+      gcc_assert (gimple_code (def) == GIMPLE_ASSIGN);
+      gcc_assert (gimple_assign_rhs_code (def) == LSHIFT_EXPR);
+      bnd = gimple_assign_rhs1 (def);
+      gcc_assert (TREE_CODE (bnd) == SSA_NAME);
+      gcc_assert (TREE_CODE (gimple_assign_rhs2 (def)) == INTEGER_CST);
+      def_bnd = SSA_NAME_DEF_STMT (bnd);
+      gsi = gsi_for_stmt (def_bnd);
+      /* Create the statement t = niters + (vf - 1).  */
+      lhs = create_tmp_var (TREE_TYPE (bnd));
+      stmt = gimple_build_assign (lhs, PLUS_EXPR, niters,
+				  build_int_cst (TREE_TYPE (bnd), vf - 1));
+      tmp = make_ssa_name (lhs, stmt);
+      gimple_assign_set_lhs (stmt, tmp);
+      gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
+      /* Replace BND definition with bnd = t >> log2 (vf).  */
+      log_vf = build_int_cst (TREE_TYPE (tmp), exact_log2 (vf));
+      new_def_bnd = gimple_build_assign (bnd, RSHIFT_EXPR, tmp, log_vf);
+      gsi_replace (&gsi, new_def_bnd, false);
+    }
+  else
+    {
+      tree op_const;
+      unsigned n;
+      unsigned logvf = exact_log2 (vf);
+      gcond *cond;
+      gcc_assert (TREE_CODE (ratio_mult_vf) == INTEGER_CST);
+      gcc_assert (TREE_CODE (niters) == INTEGER_CST);
+      /* Change value of bnd in GIMPLE_COND.  */
+      gcc_assert (loop->num_nodes == 2);
+      stmt = last_stmt (loop->header);
+      gcc_assert (gimple_code (stmt) == GIMPLE_COND);
+      n = tree_to_uhwi (niters);
+      n = ((n + (vf - 1)) >> logvf) << logvf;
+      op_const = build_int_cst (TREE_TYPE (gimple_cond_lhs (stmt)), n);
+      gcc_assert (TREE_CODE (gimple_cond_rhs (stmt)) == INTEGER_CST);
+      cond = dyn_cast <gcond *> (stmt);
+      gimple_cond_set_rhs (cond, op_const);
+    }
+}
+
+/* Make the scalar remainder unreachable from the vectorized loop.  */
+
+static void
+isolate_remainder (loop_vec_info loop_vinfo)
+{
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  edge e;
+  basic_block bb = loop->header;
+  gimple *last;
+  gcond *cond;
+
+  e = EDGE_SUCC (bb, 0);
+  if (flow_bb_inside_loop_p (loop, e->dest))
+    e = EDGE_SUCC (bb, 1);
+  bb = e->dest;
+  gcc_assert (!flow_bb_inside_loop_p (loop, bb));
+  last = last_stmt (bb);
+  gcc_assert (gimple_code (last) == GIMPLE_COND);
+  cond = as_a <gcond *> (last);
+  /* Assume that the target of the false edge is the scalar loop
+     preheader.  */
+  gimple_cond_make_true (cond);
+}
+
+/* Generate the induction vector {0, 1, ..., VF-1} that will be used for
+   mask evaluation.  ELEM_SIZE is the element size in bits, SIZE the
+   vector size in bytes.  */
+
+static tree
+gen_vec_induction (loop_vec_info loop_vinfo, unsigned elem_size, unsigned size)
+{
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  edge pe = loop_preheader_edge (loop);
+  vec<constructor_elt, va_gc> *v;
+  gimple *stmt;
+  gimple_stmt_iterator gsi;
+  gphi *induction_phi;
+  tree iv_type, vectype, cmp_vectype;
+  tree lhs, rhs, iv;
+  unsigned n;
+  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  int i;
+  tree new_vec, new_var;
+  tree vec_init, vec_step, vec_dest, vec_def;
+  tree val;
+  tree induc_def;
+  basic_block new_bb;
+
+  /* Find control iv.  */
+  stmt = last_stmt (loop->header);
+  gcc_assert (gimple_code (stmt) == GIMPLE_COND);
+  lhs = gimple_cond_lhs (stmt);
+  rhs = gimple_cond_rhs (stmt);
+  /* The control IV may be either operand of the condition.  */
+  if (TREE_CODE (lhs) != SSA_NAME)
+    iv = rhs;
+  else
+    {
+      gimple *def_stmt = SSA_NAME_DEF_STMT (lhs);
+      if (gimple_bb (def_stmt) != loop->header)
+	iv = rhs;
+      else
+	iv = lhs;
+    }
+  gcc_assert (TREE_CODE (iv) == SSA_NAME);
+  /* Determine the element type of the vector index (induction vector).  */
+  n = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (iv)));
+  if (n > elem_size)
+    /* Multiple types are not yet supported.  */
+    return NULL_TREE;
+  if (n == elem_size && !TYPE_UNSIGNED (TREE_TYPE (iv)))
+    iv_type = TREE_TYPE (iv);
+  else
+    iv_type = build_nonstandard_integer_type (elem_size, 0);
+  vectype = get_vectype_for_scalar_type_and_size (iv_type, size);
+  /* Check that vector comparison for IV_TYPE is supported.  */
+  cmp_vectype = build_same_sized_truth_vector_type (vectype);
+  if (!expand_vec_cmp_expr_p (vectype, cmp_vectype))
+    {
+      if (dump_enabled_p ())
+	{
+	  dump_printf_loc (MSG_NOTE, vect_location,
+			   "type is not supported for vector compare!\n");
+	  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
+	}
+      return NULL_TREE;
+    }
+
+  /* Build induction initialization and insert it to loop preheader.  */
+  vec_alloc (v, vf);
+  for (i = 0; i < vf; i++)
+    {
+      tree elem;
+      elem = build_int_cst (iv_type, i);
+      CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elem);
+    }
+  new_vec = build_vector_from_ctor (vectype, v);
+  new_var = vect_get_new_vect_var (vectype, vect_simple_var, "cst_");
+  stmt = gimple_build_assign (new_var, new_vec);
+  vec_init = make_ssa_name (new_var, stmt);
+  gimple_assign_set_lhs (stmt, vec_init);
+  new_bb = gsi_insert_on_edge_immediate (pe, stmt);
+  gcc_assert (!new_bb);
+
+  /* Create the vector step {VF, VF, ..., VF}.  */
+  val = build_int_cst (iv_type, vf);
+  new_vec = build_vector_from_val (vectype, val);
+  new_var = vect_get_new_vect_var (vectype, vect_simple_var, "cst_");
+  stmt = gimple_build_assign (new_var, new_vec);
+  vec_step = make_ssa_name (new_var, stmt);
+  gimple_assign_set_lhs (stmt, vec_step);
+  new_bb = gsi_insert_on_edge_immediate (pe, stmt);
+  gcc_assert (!new_bb);
+
+  /* Create the induction-phi.  */
+  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
+  induction_phi = create_phi_node (vec_dest, loop->header);
+  induc_def = PHI_RESULT (induction_phi);
+
+  /* Create vector iv increment inside loop.  */
+  gsi = gsi_after_labels (loop->header);
+  stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
+  vec_def = make_ssa_name (vec_dest, stmt);
+  gimple_assign_set_lhs (stmt, vec_def);
+  gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
+
+  /* Set the arguments of phi node.  */
+  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
+  add_phi_arg (induction_phi, vec_def, loop_latch_edge (loop),
+	       UNKNOWN_LOCATION);
+  return induc_def;
+}
+
+/* Produce the mask that disables the vector lanes corresponding to
+   iterations beyond the original trip count.  */
+
+static tree
+gen_mask_for_remainder (loop_vec_info loop_vinfo, tree vec_index)
+{
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  tree new_vec, new_var;
+  tree niters, vec_niters, new_niters, vec_res, vec_mask;
+  gimple *stmt;
+  basic_block new_bb;
+  edge pe = loop_preheader_edge (loop);
+  gimple_stmt_iterator gsi;
+  tree vectype = TREE_TYPE (vec_index);
+  tree s_vectype;
+
+  gsi = gsi_after_labels (loop->header);
+  niters = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
+	   ? LOOP_VINFO_NITERS (loop_vinfo)
+	   : LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo);
+
+  /* Create the comparison vector {niters, niters, ..., niters}.  */
+  if (!types_compatible_p (TREE_TYPE (niters), TREE_TYPE (vectype)))
+    {
+      tree new_type = TREE_TYPE (vectype);
+      enum tree_code cop;
+      cop = (tree_to_uhwi (TYPE_SIZE (new_type))
+	     == tree_to_uhwi (TYPE_SIZE (TREE_TYPE (niters)))
+	     ? NOP_EXPR : CONVERT_EXPR);
+      new_niters = make_ssa_name (new_type);
+      stmt = gimple_build_assign (new_niters, cop, niters);
+      new_bb = gsi_insert_on_edge_immediate (pe, stmt);
+      gcc_assert (!new_bb);
+    }
+  else
+    new_niters = niters;
+  new_vec = build_vector_from_val (vectype, new_niters);
+  new_var = vect_get_new_vect_var (vectype, vect_simple_var, "cst_");
+  stmt = gimple_build_assign (new_var, new_vec);
+  vec_niters = make_ssa_name (new_var, stmt);
+  gimple_assign_set_lhs (stmt, vec_niters);
+  new_bb = gsi_insert_on_edge_immediate (pe, stmt);
+  gcc_assert (!new_bb);
+  /* Create the vector comparison whose result will be used as the mask
+     for loads/stores.  */
+  s_vectype = build_same_sized_truth_vector_type (vectype);
+  vec_mask = vect_get_new_vect_var (s_vectype, vect_simple_var, "vec_mask_");
+  stmt = gimple_build_assign (vec_mask, LT_EXPR, vec_index, vec_niters);
+  vec_res = make_ssa_name (vec_mask, stmt);
+  gimple_assign_set_lhs (stmt, vec_res);
+  gsi_insert_before (&gsi, stmt, GSI_SAME_STMT);
+  return vec_res;
+}
+
+/* Convert each load in LOADS to a masked load using MASK.  */
+
+static void
+convert_loads_to_masked (vec<gimple *> *loads, tree mask)
+{
+  gimple *stmt, *new_stmt;
+  tree addr, ref;
+  gimple_stmt_iterator gsi;
+
+  while (loads->length () > 0)
+    {
+      tree lhs, ptr;
+      stmt = loads->pop ();
+      gsi = gsi_for_stmt (stmt);
+      lhs = gimple_assign_lhs (stmt);
+      ref = gimple_assign_rhs1 (stmt);
+      addr = force_gimple_operand_gsi (&gsi, build_fold_addr_expr (ref),
+				       true, NULL_TREE, true,
+				       GSI_SAME_STMT);
+      ptr = build_int_cst (reference_alias_ptr_type (ref), 0);
+      if (!SSA_NAME_PTR_INFO (addr))
+	copy_ref_info (build2 (MEM_REF, TREE_TYPE (ref), addr, ptr), ref);
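+      /* Replace the load with LHS = MASK_LOAD (ADDR, PTR, MASK), where
+	 PTR carries only the alias information of the reference.  */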
+      new_stmt = gimple_build_call_internal (IFN_MASK_LOAD, 3,
+					     addr, ptr, mask);
+      gimple_call_set_lhs (new_stmt, lhs);
+      gsi_replace (&gsi, new_stmt, false);
+    }
+}
+
+/* Convert each store in STORES to a masked store using MASK.  */
+
+static void
+convert_stores_to_masked (vec<gimple *> *stores, tree mask)
+{
+  gimple *stmt, *new_stmt;
+  tree addr, ref;
+  gimple_stmt_iterator gsi;
+
+  while (stores->length () > 0)
+    {
+      tree rhs, ptr;
+      stmt = stores->pop ();
+      gsi = gsi_for_stmt (stmt);
+      ref = gimple_assign_lhs (stmt);
+      rhs = gimple_assign_rhs1 (stmt);
+      addr = force_gimple_operand_gsi (&gsi, build_fold_addr_expr (ref),
+				       true, NULL_TREE, true,
+				       GSI_SAME_STMT);
+      ptr = build_int_cst (reference_alias_ptr_type (ref), 0);
+      if (!SSA_NAME_PTR_INFO (addr))
+	copy_ref_info (build2 (MEM_REF, TREE_TYPE (ref), addr, ptr), ref);
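+      /* Replace the store with MASK_STORE (ADDR, PTR, MASK, RHS), where
+	 PTR carries only the alias information of the reference.  */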
+      new_stmt = gimple_build_call_internal (IFN_MASK_STORE, 4, addr, ptr,
+					      mask, rhs);
+      gsi_replace (&gsi, new_stmt, false);
+    }
+}
+
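+/* Statements masked by if-conversion already carry their own mask in the
+   third argument of the MASK_LOAD/MASK_STORE call.  Combine that mask
+   with MASK by a bitwise AND, applying a VIEW_CONVERT_EXPR first if the
+   mask types differ, so that a lane stays active only if both masks
+   allow it.  */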
+static void
+fix_mask_for_masked_ld_st (vec<gimple *> *masked_stmt, tree mask)
+{
+  gimple *stmt, *new_stmt;
+  tree old, lhs, vectype, var, n_lhs;
+  gimple_stmt_iterator gsi;
+
+  while (masked_stmt->length () > 0)
+    {
+      stmt = masked_stmt->pop ();
+      gsi = gsi_for_stmt (stmt);
+      old = gimple_call_arg (stmt, 2);
+      vectype = TREE_TYPE (old);
+      if (TREE_TYPE (mask) != vectype)
+	{
+	  tree new_vtype = TREE_TYPE (mask);
+	  tree n_var;
+	  tree conv_expr;
+	  n_var = vect_get_new_vect_var (new_vtype, vect_simple_var, NULL);
+	  conv_expr = build1 (VIEW_CONVERT_EXPR, new_vtype, old);
+	  new_stmt = gimple_build_assign (n_var, conv_expr);
+	  n_lhs = make_ssa_name (n_var);
+	  gimple_assign_set_lhs (new_stmt, n_lhs);
+	  vectype = new_vtype;
+	  gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
+	}
+      else
+	n_lhs = old;
+      var = vect_get_new_vect_var (vectype, vect_simple_var, NULL);
+      new_stmt = gimple_build_assign (var, BIT_AND_EXPR, mask, n_lhs);
+      lhs = make_ssa_name (var, new_stmt);
+      gimple_assign_set_lhs (new_stmt, lhs);
+      gsi_insert_before (&gsi, new_stmt, GSI_SAME_STMT);
+      gimple_call_set_arg (stmt, 2, lhs);
+      update_stmt (stmt);
+    }
+}
+
+/* Convert vectorized reductions to VEC_COND statements to preserve
+   the reduction semantics:
+	s1 = x + s2 --> t = x + s2; s1 = (mask) ? t : s2.  */
+
+static void
+convert_reductions (loop_vec_info loop_vinfo, tree mask)
+{
+  unsigned i;
+  for (i = 0; i < LOOP_VINFO_REDUCTIONS (loop_vinfo).length (); i++)
+    {
+      gimple *stmt = LOOP_VINFO_REDUCTIONS (loop_vinfo)[i];
+      gimple_stmt_iterator gsi;
+      tree vectype;
+      tree lhs, rhs;
+      tree var, new_lhs, vec_cond_expr;
+      gimple *new_stmt, *def;
+      stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
+      stmt = STMT_VINFO_VEC_STMT (stmt_info);
+      lhs = gimple_assign_lhs (stmt);
+      vectype = TREE_TYPE (lhs);
+      gsi = gsi_for_stmt (stmt);
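+      /* Find the operand defined by the reduction PHI; the masked-out
+	 lanes must keep its value, so it becomes the "else" operand of
+	 the VEC_COND_EXPR built below.  */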
+      rhs = gimple_assign_rhs1 (stmt);
+      gcc_assert (TREE_CODE (rhs) == SSA_NAME);
+      def = SSA_NAME_DEF_STMT (rhs);
+      if (gimple_code (def) != GIMPLE_PHI)
+	{
+	  rhs = gimple_assign_rhs2 (stmt);
+	  gcc_assert (TREE_CODE (rhs) == SSA_NAME);
+	  def = SSA_NAME_DEF_STMT (rhs);
+	  gcc_assert (gimple_code (def) == GIMPLE_PHI);
+	}
+      /* Change lhs of STMT.  */
+      var = vect_get_new_vect_var (vectype, vect_simple_var, NULL);
+      new_lhs = make_ssa_name (var, stmt);
+      gimple_assign_set_lhs (stmt, new_lhs);
+      /* Generate new VEC_COND expr.  */
+      vec_cond_expr = build3 (VEC_COND_EXPR, vectype, mask, new_lhs, rhs);
+      new_stmt = gimple_build_assign (lhs, vec_cond_expr);
+      gsi_insert_after (&gsi, new_stmt, GSI_SAME_STMT);
+    }
+}
+
+/* Return true if the address of memory reference LHS is incremented by
+   the vector size on each iteration, and false otherwise.  */
+
+static bool
+mem_ref_is_vec_size_incremented (loop_vec_info loop_vinfo, tree lhs)
+{
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  tree vectype = TREE_TYPE (lhs);
+  unsigned n = GET_MODE_SIZE (TYPE_MODE (vectype));
+  gphi *phi;
+  edge e = loop_latch_edge (loop);
+  tree arg;
+  gimple *def;
+  tree name;
+  if (TREE_CODE (lhs) != MEM_REF)
+    return false;
+  name = TREE_OPERAND (lhs, 0);
+  if (TREE_CODE (name) != SSA_NAME)
+    return false;
+  def = SSA_NAME_DEF_STMT (name);
+  if (!def || gimple_code (def) != GIMPLE_PHI)
+    return false;
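+  /* Check that the value coming from the latch edge advances the address
+     by exactly the vector size in bytes.  */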
+  phi = as_a <gphi *> (def);
+  arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
+  gcc_assert (TREE_CODE (arg) == SSA_NAME);
+  def = SSA_NAME_DEF_STMT (arg);
+  if (gimple_code (def) != GIMPLE_ASSIGN
+      || gimple_assign_rhs_code (def) != POINTER_PLUS_EXPR)
+    return false;
+  arg = gimple_assign_rhs2 (def);
+  if (TREE_CODE (arg) != INTEGER_CST)
+    arg = gimple_assign_rhs1 (def);
+  if (TREE_CODE (arg) != INTEGER_CST)
+    return false;
+  if (compare_tree_int (arg, n) != 0)
+    return false;
+  return true;
+}
+
+/* Combine the vectorized loop with the scalar remainder by masking
+   statements such as memory reads/writes and reductions so that they
+   produce a legal result.  A new vector induction variable is created to
+   generate the mask, which is simply the result of comparing that
+   variable with a vector holding the number of iterations.  The loop
+   trip count is adjusted and the scalar loop corresponding to the
+   remainder is made unreachable from the vectorized loop.
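+
+   For example (a sketch, not exact GIMPLE), with VF = 4 the vector loop
+   executes (niters + 3) / 4 iterations, and in vector iteration j the
+   memory accesses are guarded by
+
+	vec_iv   = { 4*j, 4*j + 1, 4*j + 2, 4*j + 3 };
+	vec_mask = vec_iv < { niters, niters, niters, niters };
+
+   so the lanes of the last iteration that would run past NITERS are
+   masked out.  */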
+
+void
+combine_vect_loop_remainder (loop_vec_info loop_vinfo)
+{
+  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
+  auto_vec<gimple *, 10> loads;
+  auto_vec<gimple *, 5> stores;
+  auto_vec<gimple *, 5> masked_ld_st;
+  int elem_size = 0;
+  int n;
+  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
+  basic_block bb;
+  gimple_stmt_iterator gsi;
+  gimple *stmt;
+  stmt_vec_info stmt_info;
+  tree lhs, rhs, vectype, mask_vectype;
+  tree vec_index, vec_mask;
+  bool has_reductions = false;
+  unsigned size = 0;
+  unsigned additional_cost;
+  unsigned val;
+
+  if (!loop)
+    return;
+  if (loop->inner)
+    return;  /* Do not support outer-loop vectorization.  */
+  gcc_assert (LOOP_VINFO_VECTORIZABLE_P (loop_vinfo));
+  vect_location = find_loop_location (loop);
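+  /* Masking is useful only if a scalar remainder would actually execute;
+     peeling for gaps, reduction chains and grouped stores are not
+     supported.  */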
+  if (!LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
+      || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
+    return;
+  if (!LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ()
+      || !LOOP_VINFO_GROUPED_STORES (loop_vinfo).is_empty ())
+    return;
+  bb = loop->header;
+  /* Initialize ADDITIONAL_COST with the cost of the two vector statements
+     that evaluate the mask: the vector IV increment and the comparison
+     with the iteration count.  */
+  additional_cost = builtin_vectorization_cost (vector_stmt, NULL_TREE, 0) * 2;
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+		     "=== try to eliminate scalar epilogue ===\n");
+
+  /* Collect all statements that need to be fixed and compute the cost of
+     masking them.  */
+  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
+    {
+      stmt = gsi_stmt (gsi);
+      stmt_info = vinfo_for_stmt (stmt);
+      if (stmt_info && STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+	/* Not supported yet!  */
+	return;
+      /* Check that we support given define type.  */
+      if (stmt_info)
+	switch (STMT_VINFO_DEF_TYPE (stmt_info))
+	  {
+	    case vect_induction_def:
+	      if (STMT_VINFO_LIVE_P (stmt_info))
+		return;
+	      break;
+	    case vect_nested_cycle:
+	    case vect_double_reduction_def:
+	    case vect_external_def:
+	      return;
+	    default:
+	      break;
+	  }
+
+      if (gimple_assign_load_p (stmt))
+	{
+	  lhs = gimple_assign_lhs (stmt);
+	  rhs = gimple_assign_rhs1 (stmt);
+	  vectype = TREE_TYPE (lhs);
+	  if (may_be_nonaddressable_p (rhs))
+	    return;
+	  if (!VECTOR_TYPE_P (vectype))
+	    {
+	      struct data_reference *dr;
+	      if (!stmt_info)
+		continue;
+	      dr = STMT_VINFO_DATA_REF (stmt_info);
+	      if (!dr)
+		continue;
+	      if (TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
+		return;
+	      if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) <= 0)
+		{
+		  if (dump_enabled_p ())
+		    dump_printf_loc (MSG_NOTE, vect_location,
+				     "Load with decrement is not masked.\n");
+		  return;
+		}
+	      continue;
+	    }
+	  if (vf / TYPE_VECTOR_SUBPARTS (vectype) > 1)
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "multiple-types are not supported yet.\n");
+	      return;
+	    }
+	  n = GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype)));
+	  if (elem_size == 0)
+	    elem_size = n;
+	  else if (n != elem_size)
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "multiple-types are not supported yet.\n");
+	      return;
+	    }
+	  if (size == 0)
+	    size = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
+	  mask_vectype = build_same_sized_truth_vector_type (vectype);
+	  if (!can_vec_mask_load_store_p (TYPE_MODE (vectype),
+					  TYPE_MODE (mask_vectype),
+					  true))
+	    {
+	      if (dump_enabled_p ())
+		{
+		  dump_printf_loc (MSG_NOTE, vect_location,
+				   "type is not supported for masking!\n");
+		  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
+		}
+	      return;
+	    }
+	  additional_cost += builtin_masking_cost (masking_load, mask_vectype);
+	  loads.safe_push (stmt);
+	}
+      else if (gimple_store_p (stmt))
+	{
+	  gcc_assert (gimple_assign_single_p (stmt));
+	  lhs = gimple_assign_lhs (stmt);
+	  if (may_be_nonaddressable_p (lhs))
+	    return;
+	  vectype = TREE_TYPE (lhs);
+	  if (!VECTOR_TYPE_P (vectype))
+	    continue;
+	  if (vf / TYPE_VECTOR_SUBPARTS (vectype) > 1)
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "multiple-types are not supported yet.\n");
+	      return;
+	    }
+	  n = GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype)));
+	  if (elem_size == 0)
+	    elem_size = n;
+	  else if (n != elem_size)
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "multiple-types are not supported yet.\n");
+	      return;
+	    }
+	  if (!mem_ref_is_vec_size_incremented (loop_vinfo, lhs))
+	    {
+	      if (dump_enabled_p ())
+		dump_printf_loc (MSG_NOTE, vect_location,
+				 "Store with decrement is not masked.\n");
+	      return;
+	    }
+	  if (size == 0)
+	    size = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
+	  mask_vectype = build_same_sized_truth_vector_type (vectype);
+	  if (!can_vec_mask_load_store_p (TYPE_MODE (vectype),
+					  TYPE_MODE (mask_vectype),
+					  false))
+	    {
+	      if (dump_enabled_p ())
+		{
+		  dump_printf_loc (MSG_NOTE, vect_location,
+				   "type is not supported for masking!\n");
+		  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
+		}
+	      return;
+	    }
+	  additional_cost += builtin_masking_cost (masking_store, mask_vectype);
+	  stores.safe_push (stmt);
+	}
+      else if (is_gimple_call (stmt)
+	       && gimple_call_internal_p (stmt)
+	       && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
+		   || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
+	{
+	  tree mask = gimple_call_arg (stmt, 2);
+	  additional_cost += builtin_masking_cost (masking_masked_stmt,
+						   TREE_TYPE (mask));
+	  masked_ld_st.safe_push (stmt);
+	}
+      else if (is_gimple_call (stmt))
+	return;
+    }
+
+  /* Check that all vectorizable reductions can be converted to VCOND.  */
+  if (!LOOP_VINFO_REDUCTIONS (loop_vinfo).is_empty ())
+    {
+      unsigned i;
+      has_reductions = true;
+      for (i = 0; i < LOOP_VINFO_REDUCTIONS (loop_vinfo).length (); i++)
+	{
+	  tree mask_vectype;
+
+	  stmt = LOOP_VINFO_REDUCTIONS (loop_vinfo)[i];
+	  stmt_info = vinfo_for_stmt (stmt);
+	  gcc_assert (stmt_info);
+	  if (PURE_SLP_STMT (stmt_info))
+	    return;
+	  gcc_assert (STMT_VINFO_VEC_STMT (stmt_info));
+	  stmt = STMT_VINFO_VEC_STMT (stmt_info);
+	  if (gimple_code (stmt) != GIMPLE_ASSIGN)
+	    return;
+	  /* Only reduction with binary operation is supported.  */
+	  if (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))
+	      != GIMPLE_BINARY_RHS)
+	    return;
+	  lhs = gimple_assign_lhs (stmt);
+	  vectype = TREE_TYPE (lhs);
+	  if (vf / TYPE_VECTOR_SUBPARTS (vectype) > 1)
+	    /* Not yet supported!  */
+	    return;
+	  n = GET_MODE_BITSIZE (TYPE_MODE (TREE_TYPE (vectype)));
+	  if (elem_size == 0)
+	    elem_size = n;
+	  else if (n != elem_size)
+	    /* Not yet supported!  */
+	    return;
+	  if (size == 0)
+	    size = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
+	  mask_vectype = build_same_sized_truth_vector_type (vectype);
+	  if (!expand_vec_cond_expr_p (vectype, mask_vectype))
+	    return;
+	  additional_cost += builtin_masking_cost (masking_reduction,
+						   mask_vectype);
+	}
+    }
+  /* Check the masked loads/stores, if any.  */
+  if (!masked_ld_st.is_empty ())
+    {
+      unsigned i;
+      for (i = 0; i < masked_ld_st.length (); i++)
+	{
+	  tree mask;
+	  tree vectype;
+	  optab tab;
+	  stmt = masked_ld_st[i];
+	  mask = gimple_call_arg (stmt, 2);
+	  vectype = TREE_TYPE (mask);
+	  n = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (vectype)));
+	  if (elem_size == 0)
+	    elem_size = n;
+	  else if (n != elem_size)
+	    /* Mask conversion is not supported yet!  */
+	    return;
+	  if (size == 0)
+	    size = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
+	  /* Check that BIT_AND is supported on target.  */
+	  tab = optab_for_tree_code (BIT_AND_EXPR, vectype, optab_default);
+	  if (!tab)
+	    return;
+	  if (optab_handler (tab, TYPE_MODE (vectype)) == CODE_FOR_nothing)
+	    return;
+	}
+    }
+
+  /* Vectorization of the scalar epilogue is reasonable only if the cost
+     increase, as a percentage of the cost of a single vector iteration,
+     does not exceed the value of the "vect-cost-increase-threshold"
+     parameter.  */
+  val = additional_cost * 100;
+  val /= LOOP_VINFO_SINGLE_VECTOR_ITERATION_COST (loop_vinfo);
+  if (val > (unsigned) PARAM_VALUE (PARAM_VECT_COST_INCREASE_THRESHOLD))
+    {
+      if (dump_enabled_p ())
+	{
+	  dump_printf_loc (MSG_NOTE, vect_location, "Vectorization of scalar"
+			   " epilogue is not profitable.\n");
+	  dump_printf (MSG_NOTE, "Vector iteration cost: %d\n",
+		       LOOP_VINFO_SINGLE_VECTOR_ITERATION_COST (loop_vinfo));
+	  dump_printf (MSG_NOTE, " Additional iteration cost: %d\n",
+		       additional_cost);
+	}
+      return;
+    }
+  /* Generate the induction vector used to evaluate the mask.  */
+  vec_index = gen_vec_induction (loop_vinfo, elem_size, size);
+  if (!vec_index)
+    return;
+
+  /* Generate the mask vector used to mask the saved statements.  */
+  vec_mask = gen_mask_for_remainder (loop_vinfo, vec_index);
+  gcc_assert (vec_mask);
+
+  /* Convert vectorized loads to masked ones.  */
+  if (!loads.is_empty ())
+    convert_loads_to_masked (&loads, vec_mask);
+
+  /* Convert vectorized stores to masked ones.  */
+  if (!stores.is_empty ())
+    convert_stores_to_masked (&stores, vec_mask);
+
+  if (has_reductions)
+    convert_reductions (loop_vinfo, vec_mask);
+
+  if (!masked_ld_st.is_empty ())
+    fix_mask_for_masked_ld_st (&masked_ld_st, vec_mask);
+
+  /* Fix loop trip count.  */
+  fix_vec_loop_trip_count (loop_vinfo);
+
+  /* Fix up the CFG to make the scalar loop remainder unreachable.  */
+  isolate_remainder (loop_vinfo);
+  if (dump_enabled_p ())
+    dump_printf_loc (MSG_NOTE, vect_location,
+		     "=== scalar epilogue has been deleted ===\n");
+}
diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c
index 41e87a8..46535a2 100644
--- a/gcc/tree-vectorizer.c
+++ b/gcc/tree-vectorizer.c
@@ -580,6 +580,18 @@ vectorize_loops (void)
 	  }
       }
 
+  /* Try to combine each vectorized loop with its scalar remainder.  */
+  for (i = 1; i < vect_loops_num; i++)
+    {
+      loop_vec_info loop_vinfo;
+      loop = get_loop (cfun, i);
+      if (!loop || loop->inner)
+	continue;
+      loop_vinfo = (loop_vec_info) loop->aux;
+      if (loop_vinfo && LOOP_VINFO_VECTORIZABLE_P (loop_vinfo))
+	combine_vect_loop_remainder (loop_vinfo);
+    }
+
   for (i = 1; i < vect_loops_num; i++)
     {
       loop_vec_info loop_vinfo;
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 6ad0cc4..6cedc01 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -231,6 +231,8 @@ typedef struct _loop_vec_info : public vec_info {
   tree num_iters;
   /* Number of iterations of the original loop.  */
   tree num_iters_unchanged;
+  /* Number of iterations of the vectorized loop.  */
+  tree num_iters_vect_loop;
 
   /* Threshold of number of iterations below which vectorzation will not be
      performed. It is calculated from MIN_PROFITABLE_ITERS and
@@ -291,6 +293,9 @@ typedef struct _loop_vec_info : public vec_info {
   /* Cost of a single scalar iteration.  */
   int single_scalar_iteration_cost;
 
+  /* Cost of a single vector iteration.  */
+  unsigned single_vector_iteration_cost;
+
   /* When we have grouped data accesses with gaps, we may introduce invalid
      memory accesses.  We peel the last iteration of the loop to prevent
      this.  */
@@ -336,6 +341,7 @@ typedef struct _loop_vec_info : public vec_info {
 #define LOOP_VINFO_BBS(L)                  (L)->bbs
 #define LOOP_VINFO_NITERSM1(L)             (L)->num_itersm1
 #define LOOP_VINFO_NITERS(L)               (L)->num_iters
+#define LOOP_VINFO_NITERS_VECT_LOOP(L)     (L)->num_iters_vect_loop
 /* Since LOOP_VINFO_NITERS and LOOP_VINFO_NITERSM1 can change after
    prologue peeling retain total unchanged scalar loop iterations for
    cost model.  */
@@ -366,6 +372,8 @@ typedef struct _loop_vec_info : public vec_info {
 #define LOOP_VINFO_SCALAR_LOOP(L)	   (L)->scalar_loop
 #define LOOP_VINFO_SCALAR_ITERATION_COST(L) (L)->scalar_cost_vec
 #define LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST(L) (L)->single_scalar_iteration_cost
+#define LOOP_VINFO_SINGLE_VECTOR_ITERATION_COST(L) \
+  (L)->single_vector_iteration_cost
 
 #define LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT(L) \
   ((L)->may_misalign_stmts.length () > 0)
@@ -822,6 +830,14 @@ builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
 						       vectype, misalign);
 }
 
+/* Alias targetm.vectorize.builtin_masking_cost.  */
+
+static inline int
+builtin_masking_cost (enum vect_cost_for_masking kind, tree vectype)
+{
+  return targetm.vectorize.builtin_masking_cost (kind, vectype);
+}
+
 /* Get cost by calling cost target builtin.  */
 
 static inline
@@ -1001,6 +1017,7 @@ extern void vect_get_vec_defs (tree, tree, gimple *, vec<tree> *,
 			       vec<tree> *, slp_tree, int);
 extern tree vect_gen_perm_mask_any (tree, const unsigned char *);
 extern tree vect_gen_perm_mask_checked (tree, const unsigned char *);
+extern void combine_vect_loop_remainder (loop_vec_info);
 
 /* In tree-vect-data-refs.c.  */
 extern bool vect_can_force_dr_alignment_p (const_tree, unsigned int);
