From 4a0ab2dcaca8ebc8042ce20fd9ba40a710b6f391 Mon Sep 17 00:00:00 2001
From: Kyrylo Tkachov <ktkachov@nvidia.com>
Date: Tue, 8 Oct 2024 02:01:31 -0700
Subject: [PATCH] [RFC] Locality cloning and partitioning

Implement partitioning and cloning in the callgraph to improve code
locality.  A new -flto-partition=locality flag is used to enable this.
It adds a dedicated partitioning algorithm used during WPA partitioning
and enables a new IPA locality cloning pass.
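
An illustrative invocation (the --param option shown is one of the new
parameters added by this patch, spelled out at its default value):

  gcc -O2 -flto -flto-partition=locality \
      --param=lto-partition-locality-cloning=maximal foo.c bar.c -o foo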

Signed-off-by: Prachi Godbole <pgodbole@nvidia.com>
Co-authored-by: Kyrylo Tkachov <ktkachov@nvidia.com>

config/ChangeLog:

	* bootstrap-lto-locality.mk: New file.

gcc/ChangeLog:

	* Makefile.in (OBJS): Add ipa-locality-cloning.o.
	(GTFILES): Add ipa-locality-cloning.cc dependency.
	* common.opt (lto_partition_model): Add locality value.
	* flag-types.h (lto_partition_model): Add LTO_PARTITION_LOCALITY value.
	(enum lto_locality_cloning_model): Define.
	* lto-cgraph.cc (lto_set_symtab_encoder_in_partition): Add dumping of node
	and index.
	* params.opt (lto_locality_cloning_model): New enum.
	(lto-partition-locality-cloning): New param.
	(lto-partition-locality-frequency-cutoff): Likewise.
	(lto-partition-locality-size-cutoff): Likewise.
	(lto-max-locality-partition): Likewise.
	* passes.def: Add pass_ipa_locality_cloning.
	* timevar.def (TV_IPA_LC): New timevar.
	* tree-pass.h (make_pass_ipa_locality_cloning): Declare.
	* ipa-locality-cloning.cc: New file.
	* ipa-locality-cloning.h: New file.

gcc/lto/ChangeLog:

	* lto-partition.cc: Include ipa-locality-cloning.h.
	(add_node_references_to_partition): Define.
	(create_partition): Likewise.
	(lto_locality_map): Likewise.
	(lto_promote_cross_file_statics): Add extra dumping.
	* lto-partition.h (lto_locality_map): Declare.
	* lto.cc (do_whole_program_analysis): Handle LTO_PARTITION_LOCALITY.
---
 config/bootstrap-lto-locality.mk |  20 +
 gcc/Makefile.in                  |   2 +
 gcc/common.opt                   |   3 +
 gcc/flag-types.h                 |  10 +-
 gcc/ipa-locality-cloning.cc      | 931 +++++++++++++++++++++++++++++++
 gcc/ipa-locality-cloning.h       |  36 ++
 gcc/lto-cgraph.cc                |   2 +
 gcc/lto/lto-partition.cc         | 140 ++++-
 gcc/lto/lto-partition.h          |   1 +
 gcc/lto/lto.cc                   |   3 +
 gcc/params.opt                   |  27 +
 gcc/passes.def                   |   1 +
 gcc/timevar.def                  |   1 +
 gcc/tree-pass.h                  |   1 +
 14 files changed, 1176 insertions(+), 2 deletions(-)
 create mode 100644 config/bootstrap-lto-locality.mk
 create mode 100644 gcc/ipa-locality-cloning.cc
 create mode 100644 gcc/ipa-locality-cloning.h

diff --git a/config/bootstrap-lto-locality.mk b/config/bootstrap-lto-locality.mk
new file mode 100644
index 00000000000..7792bbe6823
--- /dev/null
+++ b/config/bootstrap-lto-locality.mk
@@ -0,0 +1,20 @@
+# This build configuration enables LTO with locality partitioning and cloning
+# for stage2, stage3 and the profiling stages, in slim LTO mode.
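+#
+# It is assumed to be selected like the other bootstrap-*.mk build configs,
+# e.g. with:
+#   make BUILD_CONFIG=bootstrap-lto-locality
+# or by configuring with --with-build-config=bootstrap-lto-locality.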
+
+STAGE2_CFLAGS += -flto=jobserver -frandom-seed=1 -flto-partition=locality
+STAGE3_CFLAGS += -flto=jobserver -frandom-seed=1 -flto-partition=locality
+STAGEprofile_CFLAGS += -flto=jobserver -frandom-seed=1 -flto-partition=locality
+STAGEtrain_CFLAGS += -flto=jobserver -frandom-seed=1 -flto-partition=locality
+STAGEfeedback_CFLAGS += -flto=jobserver -frandom-seed=1 -flto-partition=locality
+
+# assumes the host supports the linker plugin
+LTO_AR = $$r/$(HOST_SUBDIR)/prev-gcc/gcc-ar$(exeext) -B$$r/$(HOST_SUBDIR)/prev-gcc/
+LTO_RANLIB = $$r/$(HOST_SUBDIR)/prev-gcc/gcc-ranlib$(exeext) -B$$r/$(HOST_SUBDIR)/prev-gcc/
+LTO_NM = $$r/$(HOST_SUBDIR)/prev-gcc/gcc-nm$(exeext) -B$$r/$(HOST_SUBDIR)/prev-gcc/
+
+LTO_EXPORTS = AR="$(LTO_AR)"; export AR; \
+	      RANLIB="$(LTO_RANLIB)"; export RANLIB; \
+	      NM="$(LTO_NM)"; export NM;
+LTO_FLAGS_TO_PASS = AR="$(LTO_AR)" RANLIB="$(LTO_RANLIB)" NM="$(LTO_NM)"
+
+do-compare = $(SHELL) $(srcdir)/contrib/compare-lto $$f1 $$f2
+extra-compare = gcc/lto1$(exeext)
diff --git a/gcc/Makefile.in b/gcc/Makefile.in
index 059cf2e8f79..f98cdb0f96d 100644
--- a/gcc/Makefile.in
+++ b/gcc/Makefile.in
@@ -1535,6 +1535,7 @@ OBJS = \
 	incpath.o \
 	init-regs.o \
 	internal-fn.o \
+	ipa-locality-cloning.o \
 	ipa-cp.o \
 	ipa-sra.o \
 	ipa-devirt.o \
@@ -2843,6 +2844,7 @@ GTFILES = $(CPPLIB_H) $(srcdir)/input.h $(srcdir)/coretypes.h \
   $(srcdir)/ipa-param-manipulation.h $(srcdir)/ipa-sra.cc \
   $(srcdir)/ipa-modref.h $(srcdir)/ipa-modref.cc \
   $(srcdir)/ipa-modref-tree.h \
+  $(srcdir)/ipa-locality-cloning.cc \
   $(srcdir)/signop.h \
   $(srcdir)/diagnostic-spec.h $(srcdir)/diagnostic-spec.cc \
   $(srcdir)/dwarf2out.h \
diff --git a/gcc/common.opt b/gcc/common.opt
index 12b25ff486d..6989064c7ab 100644
--- a/gcc/common.opt
+++ b/gcc/common.opt
@@ -2249,6 +2249,9 @@ Enum(lto_partition_model) String(max) Value(LTO_PARTITION_MAX)
 EnumValue
 Enum(lto_partition_model) String(cache) Value(LTO_PARTITION_CACHE)
 
+EnumValue
+Enum(lto_partition_model) String(locality) Value(LTO_PARTITION_LOCALITY)
+
 flto-partition=
 Common Joined RejectNegative Enum(lto_partition_model) Var(flag_lto_partition) Init(LTO_PARTITION_BALANCED)
 Specify the algorithm to partition symbols and vars at linktime.
diff --git a/gcc/flag-types.h b/gcc/flag-types.h
index df56337f7e8..0cbc7584380 100644
--- a/gcc/flag-types.h
+++ b/gcc/flag-types.h
@@ -397,7 +397,15 @@ enum lto_partition_model {
   LTO_PARTITION_BALANCED = 2,
   LTO_PARTITION_1TO1 = 3,
   LTO_PARTITION_MAX = 4,
-  LTO_PARTITION_CACHE = 5
+  LTO_PARTITION_CACHE = 5,
+  LTO_PARTITION_LOCALITY = 6
+};
+
+/* flag_lto_locality_cloning initialization values.  */
+enum lto_locality_cloning_model {
+  /* Do not clone for locality.  */
+  LTO_LOCALITY_NO_CLONING = 0,
+  /* Clone only nodes that cannot be interposed.  */
+  LTO_LOCALITY_NON_INTERPOSABLE_CLONING = 1,
+  /* Clone all eligible nodes.  */
+  LTO_LOCALITY_MAXIMAL_CLONING = 2
 };
 
 /* flag_lto_linker_output initialization values.  */
diff --git a/gcc/ipa-locality-cloning.cc b/gcc/ipa-locality-cloning.cc
new file mode 100644
index 00000000000..26d52a9533b
--- /dev/null
+++ b/gcc/ipa-locality-cloning.cc
@@ -0,0 +1,931 @@
+/* Code locality based function cloning.
+   Copyright The GNU Toolchain Authors
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+/* Implement cloning required to improve partitioning of the callgraph
+   for locality considerations.  */
+
+#include "config.h"
+#define INCLUDE_ALGORITHM
+#include "system.h"
+#include "coretypes.h"
+#include "target.h"
+#include "function.h"
+#include "tree.h"
+#include "alloc-pool.h"
+#include "tree-pass.h"
+#include "cgraph.h"
+#include "symbol-summary.h"
+#include "tree-vrp.h"
+#include "sreal.h"
+#include "ipa-cp.h"
+#include "ipa-prop.h"
+#include "ipa-fnsummary.h"
+#include "ipa-modref-tree.h"
+#include "ipa-modref.h"
+#include "symtab-clones.h"
+#include "ipa-locality-cloning.h"
+
+/* The locality partitions to which nodes are assigned.  These are used later
+   in WPA partitioning.  */
+vec<locality_partition> partitions;
+/* A workaround for disabling summaries while the locality cloning pass is in
+   progress.  Currently not being used.  */
+bool locality_clone_in_progress_p = false;
+
+/* Map from original node to its latest clone.  Gets overwritten whenever a
+   new clone is created from the same node.  */
+hash_map<cgraph_node *, cgraph_node *> node_to_clone;
+/* Map from clone to its original node.  */
+hash_map<cgraph_node *, cgraph_node *> clone_to_node;
+
+/* Data structure to hold static heuristics and orders for cgraph_nodes.  */
+struct locality_order
+{
+  cgraph_node *node;
+  sreal order;
+  locality_order (cgraph_node *node, sreal order) : node (node), order (order)
+  {}
+};
+
+/* Return true if NODE is already in some partition.  */
+static inline bool
+node_partitioned_p (cgraph_node *node)
+{
+  return node->aux;
+}
+
+/* Add symbol NODE to partition PART.  */
+static void
+add_node_to_partition (locality_partition part, cgraph_node *node)
+{
+  struct cgraph_edge *e;
+  if (node_partitioned_p (node))
+    return;
+
+  part->nodes.safe_push (node);
+  node->aux = (void *) (uintptr_t) (part->part_id);
+
+  if (!node->alias && node->get_partitioning_class () == SYMBOL_PARTITION)
+    part->insns += ipa_size_summaries->get (node)->size;
+
+  /* Add all inline clones and callees that are duplicated.  */
+  for (e = node->callees; e; e = e->next_callee)
+    if (!e->inline_failed)
+      add_node_to_partition (part, e->callee);
+    /* omp declare_variant_alt or transparent_alias with definition or linker
+       discardable (non-local comdat but not forced and not
+       used by non-LTO).  */
+    else if (e->callee->get_partitioning_class () == SYMBOL_DUPLICATE)
+      add_node_to_partition (part, e->callee);
+
+  /* Add all thunks associated with the function.  */
+  for (e = node->callers; e; e = e->next_caller)
+    if (e->caller->thunk && !e->caller->inlined_to)
+      add_node_to_partition (part, e->caller);
+}
+
+/* Return TRUE if NODE is in PARTITION.  */
+static bool
+node_in_partition_p (locality_partition partition, cgraph_node *node)
+{
+  return (std::find (partition->nodes.begin (), partition->nodes.end (), node)
+	  != partition->nodes.end ());
+}
+
+/* Helper function for qsort; breaks ties by node UID.  */
+static int
+compare_node_uids (cgraph_node *n1, cgraph_node *n2)
+{
+  int res = n1->get_uid () - n2->get_uid ();
+  gcc_assert (res != 0);
+  return res > 0 ? 1 : -1;
+}
+
+/* Helper function for qsort; sort nodes by order.  */
+static int
+static_profile_cmp (const void *pa, const void *pb)
+{
+  const locality_order *a = *static_cast<const locality_order *const *> (pa);
+  const locality_order *b = *static_cast<const locality_order *const *> (pb);
+  /* Ascending order.  */
+  if (b->order < a->order)
+    return 1;
+  if (b->order > a->order)
+    return -1;
+  return compare_node_uids (a->node, b->node);
+}
+
+/* Helper function for qsort; sort nodes by profile count.  */
+static int
+compare_edge_profile_counts (const void *pa, const void *pb)
+{
+  const locality_order *a = *static_cast<const locality_order *const *> (pa);
+  const locality_order *b = *static_cast<const locality_order *const *> (pb);
+
+  profile_count cnt1 = a->node->count.ipa ();
+  profile_count cnt2 = b->node->count.ipa ();
+  if (!cnt1.compatible_p (cnt2))
+    return static_profile_cmp (pa, pb);
+
+  if (cnt1 < cnt2)
+    return 1;
+  if (cnt1 > cnt2)
+    return -1;
+  return static_profile_cmp (pa, pb);
+}
+
+/* Create a new partition, incrementing NPARTITIONS, and return it.  */
+
+static locality_partition
+create_partition (int &npartitions)
+{
+  locality_partition part = XCNEW (struct locality_partition_def);
+  npartitions++;
+  part->part_id = npartitions;
+  part->nodes.create (1);
+  part->insns = 0;
+  partitions.safe_push (part);
+  return part;
+}
+
+/* Create a locality clone of CNODE and redirect all callers present in
+   PARTITION.  The clone is created differently depending on whether CNODE
+   itself is a clone.  */
+
+static cgraph_node *
+create_locality_clone (cgraph_node *cnode, cgraph_node *,
+		       locality_partition partition, int &cl_num)
+{
+  cgraph_node *cl_node = NULL;
+  vec<cgraph_edge *> redirect_callers;
+  redirect_callers.create (1);
+  /* All callers of cnode in current partition are redirected.  */
+  struct cgraph_edge *edge;
+  for (edge = cnode->callers; edge; edge = edge->next_caller)
+    {
+      struct cgraph_node *caller = edge->caller;
+      if (node_in_partition_p (partition, caller) && caller->definition
+	  && caller != cnode)
+	redirect_callers.safe_push (edge);
+    }
+
+  const char *suffix = "locality_clone";
+
+  if (cnode->clone_of)
+    {
+      if (dump_file)
+	fprintf (dump_file, "It's a clone %s\n", cnode->dump_asm_name ());
+      clone_info *info = clone_info::get (cnode);
+      cl_node
+	= cnode->create_virtual_clone (redirect_callers,
+				       info ? info->tree_map : NULL,
+				       info ? info->param_adjustments : NULL,
+				       suffix, cl_num++);
+      cgraph_node *orig_node = cnode->clone_of;
+
+      if (cl_node->next_sibling_clone)
+	cl_node->next_sibling_clone->prev_sibling_clone
+	  = cl_node->prev_sibling_clone;
+      if (cl_node->prev_sibling_clone)
+	cl_node->prev_sibling_clone->next_sibling_clone
+	  = cl_node->next_sibling_clone;
+      else
+	cnode->clones = cl_node->next_sibling_clone;
+
+      cl_node->next_sibling_clone = orig_node->clones;
+      cl_node->prev_sibling_clone = NULL;
+      orig_node->clones->prev_sibling_clone = cl_node;
+      orig_node->clones = cl_node;
+      cl_node->clone_of = orig_node;
+    }
+  else
+    {
+      tree old_decl = cnode->decl;
+      tree new_decl = copy_node (old_decl);
+
+      /* Generate a new name for the new version.  */
+      const char *name = IDENTIFIER_POINTER (DECL_NAME (old_decl));
+      DECL_NAME (new_decl) = clone_function_name (name, suffix, cl_num);
+      SET_DECL_ASSEMBLER_NAME (new_decl,
+			       clone_function_name (old_decl, suffix, cl_num));
+      cl_num++;
+      if (dump_file)
+	fprintf (dump_file, "\tNew name %s\n",
+		 IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (new_decl)));
+
+      SET_DECL_RTL (new_decl, NULL);
+      DECL_EXTERNAL (new_decl) = 0;
+      TREE_PUBLIC (new_decl) = 0;
+      DECL_COMDAT (new_decl) = 0;
+      DECL_WEAK (new_decl) = 0;
+      DECL_VIRTUAL_P (new_decl) = 0;
+      DECL_STATIC_CONSTRUCTOR (new_decl) = 0;
+      DECL_STATIC_DESTRUCTOR (new_decl) = 0;
+      DECL_SET_IS_OPERATOR_NEW (new_decl, 0);
+      DECL_SET_IS_OPERATOR_DELETE (new_decl, 0);
+      DECL_IS_REPLACEABLE_OPERATOR (new_decl) = 0;
+
+      cl_node = cnode->create_clone (new_decl, cnode->count /*profile_count*/,
+				     true /*update_original*/, redirect_callers,
+				     false /*call_duplication_hook*/,
+				     NULL /*new_inlined_to*/,
+				     NULL /*param_adjustments*/, suffix);
+
+      cl_node->lowered = true;
+      cl_node->externally_visible = 0;
+      cl_node->local = 1;
+      cl_node->semantic_interposition = 0;
+
+      if (cnode->ipa_transforms_to_apply.exists ())
+	cl_node->ipa_transforms_to_apply
+	  = cnode->ipa_transforms_to_apply.copy ();
+    }
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "Cloned Node: %s %s\n", cnode->dump_asm_name (),
+	       cl_node->dump_asm_name ());
+
+      for (edge = cl_node->callers; edge; edge = edge->next_caller)
+	fprintf (dump_file, "Redirected callers: %s\n",
+		 edge->caller->dump_asm_name ());
+
+      for (edge = cl_node->callees; edge; edge = edge->next_callee)
+	fprintf (dump_file, "Callees of clone: %s %d\n",
+		 edge->callee->dump_asm_name (), edge->frequency ());
+    }
+  return cl_node;
+}
+
+/* Redirect recursive edges of CLONE to correctly point to CLONE.  As part of
+   the cloning process, all callee edges of a node are just duplicated, not
+   redirected, so these edges still call the original of CLONE.
+
+   For non-inlined CLONEs, NEW_CALLEE == CLONE and ORIG_CALLEE is CLONE's
+   original node.
+
+   For an inlined node, self-recursion to CLONE's original is handled as in
+   the non-inlined case; additionally, calls to CLONE->inlined_to are also
+   recursive:
+   NEW_CALLEE == CLONE->inlined_to and
+   ORIG_CALLEE == original node of CLONE->inlined_to.  */
+
+static void
+adjust_recursive_callees (cgraph_node *clone, cgraph_node *new_callee,
+			  cgraph_node *orig_callee)
+{
+  cgraph_node *alias = NULL;
+  for (cgraph_edge *e = clone->callees; e; e = e->next_callee)
+    {
+      if (!e->inline_failed)
+	continue;
+
+      /* Only self-cycles or local aliases are handled.  */
+      cgraph_node *callee = e->callee;
+      if (callee == orig_callee)
+	{
+	  cgraph_node **cl = node_to_clone.get (orig_callee);
+	  gcc_assert (cl && *cl == new_callee);
+	  e->redirect_callee_duplicating_thunks (new_callee);
+	  if (dump_file)
+	    fprintf (dump_file, "recursive call from %s to %s orig %s\n",
+		     e->caller->dump_asm_name (), e->callee->dump_asm_name (),
+		     callee->dump_asm_name ());
+	}
+      else if (callee->alias
+	       && e->callee->ultimate_alias_target () == orig_callee)
+	{
+	  if (!alias)
+	    {
+	      alias = dyn_cast<cgraph_node *> (
+		new_callee->noninterposable_alias ());
+	    }
+	  e->redirect_callee_duplicating_thunks (alias);
+	  if (dump_file)
+	    fprintf (dump_file, "recursive call from %s to %s orig %s\n",
+		     e->caller->dump_asm_name (), e->callee->dump_asm_name (),
+		     callee->dump_asm_name ());
+	}
+    }
+  new_callee->expand_all_artificial_thunks ();
+  if (alias)
+    alias->expand_all_artificial_thunks ();
+}
+
+/* Create clones for CALLER's inlined callees.  ORIG_INLINED_TO is the
+   original node from clone_node_as_needed () such that new_inlined_to is a
+   clone of it.  */
+
+static void
+inline_clones (cgraph_node *caller, cgraph_node *orig_inlined_to,
+	       bool update_summary_p)
+{
+  struct cgraph_edge *edge;
+  for (edge = caller->callees; edge; edge = edge->next_callee)
+    {
+      struct cgraph_node *callee = edge->callee;
+      if (edge->inline_failed)
+	continue;
+
+      if (callee->inlined_to != orig_inlined_to) // FIXME: Add an assert here?
+	continue;
+
+      struct cgraph_node *new_inlined_to, *cl;
+      if (caller->inlined_to)
+	new_inlined_to = caller->inlined_to;
+      else
+	new_inlined_to = caller;
+
+      cl = callee->create_clone (callee->decl,
+				 callee->count /*profile_count*/,
+				 true /*update_original*/,
+				 vNULL /*redirect_callers*/,
+				 false /*call_duplication_hook*/,
+				 new_inlined_to,
+				 NULL /*param_adjustments*/,
+				 "locality_clone" /*suffix*/);
+      edge->redirect_callee (cl);
+
+      cl->lowered = true;
+      cl->externally_visible = 0;
+      cl->local = 1;
+      cl->semantic_interposition = 0;
+      node_to_clone.put (callee, cl);
+      clone_to_node.put (cl, callee);
+
+      adjust_recursive_callees (cl, new_inlined_to, orig_inlined_to);
+      adjust_recursive_callees (cl, cl, callee);
+      if (dump_file)
+	{
+	  fprintf (dump_file, "Inline cloned\n");
+	  cl->dump (dump_file);
+	}
+
+      /* Recursively inline till end of this callchain.  */
+      inline_clones (cl, orig_inlined_to, false);
+      if (update_summary_p)
+	{
+	  /* Given a -> bi and ac -> bi, where bi doesn't have summary info,
+	     ipa_merge_fn_summary_after_inlining won't have a summary for the
+	     edge ac -> bii.  Once the locality clone is created, ac -> bii
+	     won't have an issue.  */
+	  // ipa_merge_modref_summary_after_inlining (edge);
+	  // ipa_merge_fn_summary_after_inlining (edge);
+	  // ipa_update_overall_fn_summary (new_inlined_to);
+	}
+    }
+}
+
+/* Clone EDGE->CALLEE if neither it nor a clone of it is already in PARTITION.
+   Redirect all callers of EDGE->CALLEE that are in PARTITION, not just EDGE.
+   If a clone is already present in PARTITION, redirect all edges from
+   EDGE->CALLER to EDGE->CALLEE to it.  This is because we only visit one edge
+   per caller-callee pair and redirect all the others from there.
+
+   If cloning, also recursively clone inlined functions to the end of the
+   callchain, because each inlined clone is an exclusive 1-1 copy with an
+   edge from its caller to the inlined node.  */
+
+/* There are two possible flows:
+   1. Only redirect
+      1.1. cnode is already in the current partition - cnode mustn't be a
+	   locality_clone -> nothing to do
+      1.2. A clone of cnode is in the current partition - find out if it's the
+	   correct clone for the edge - it must be a locality_clone of the
+	   exact same kind as the callee, i.e. orig or cp/sra clone; if yes,
+	   redirect, else go to #2
+      1.3. cnode or a clone of cnode is in the current partition but the
+	   caller is inlined
+   2. Clone and redirect
+      2.1. cnode is the original node
+      2.2. cnode itself is a clone
+      Clone inlines
+   Flavors of edges:
+   1. Normal -> orig nodes, locality clones or cp/sra clones
+   2. Recursive -> direct recursion
+   3. Alias -> recursion via aliasing or as a result of IPA code duplication
+   4. Inline -> shouldn't be included in the callchain.  */
+
+static cgraph_node *
+clone_node_as_needed (cgraph_edge *edge, locality_partition partition,
+		      int &cl_num)
+{
+  struct cgraph_node *cnode = edge->callee;
+  struct cgraph_node *caller = edge->caller;
+
+  /* If a clone of cnode is already in the partition:
+     Get the latest clone of cnode.  If the current partition has cloned
+     cnode, that clone should be returned.  Otherwise, the clone from a
+     previous partition is returned.
+     An original node and its clone shouldn't co-exist in the current
+     partition.
+
+     This is required if the callee is partitioned via another edge before the
+     caller was, and we are now visiting the caller->callee edge.
+
+     The caller is always partitioned before.  It may or may not be cloned
+     itself; the caller should be in PARTITION?  TODO: think about straddling
+     callchains.
+
+     1) a -> b ==> a -> bc1; b was cloned, say via d -> bc1, a is orig
+     2) ac1 -> b ==> ac1 -> bc1; b was cloned and a was just cloned
+     3) a -> bc1 and bc2 present: mustn't happen; b was cloned and a was
+	redirected without being partitioned first.
+	Why would we do this again?  Multiple edges and something's wrong in
+	partition_callchain ().
+     4) ac1 -> bc1 ==> ac1 -> bc2; a was cloned and we already got (1) in some
+	other partition
+     5) ac1 -> bc1 but no clone present in this PARTITION.  Create from b, not
+	from bc1?
+     6) a -> b; a -> bc0; create new clone, no clone present
+     7) ac0 -> b; ac0 -> bc0; same as (6)
+     8) a -> bc0 and no clone present: mustn't happen, same as (3)
+
+     Redirect when bc1 is present and:
+     a -> b or ac -> b or ac -> bc0.  */
+
+  cgraph_node *orig_cnode = cnode;
+  cgraph_node **o_cnode = clone_to_node.get (cnode);
+  if (o_cnode)
+    orig_cnode = *o_cnode;
+
+  cgraph_node **cnode_cl = node_to_clone.get (orig_cnode);
+
+  if (cnode_cl && node_in_partition_p (partition, *cnode_cl))
+    {
+      if (node_in_partition_p (partition, caller))
+	{
+	  bool clone_p = false;
+	  for (cgraph_edge *ec = caller->callees; ec; ec = ec->next_callee)
+	    if (ec->callee == cnode)
+	      {
+		ec->redirect_callee_duplicating_thunks (*cnode_cl);
+		clone_p = true;
+		if (dump_file)
+		  {
+		    fprintf (dump_file, "clone present %s %s redirecting %s\n",
+			     cnode->dump_asm_name (),
+			     (*cnode_cl)->dump_asm_name (),
+			     caller->dump_asm_name ());
+		  }
+	      }
+	  if (clone_p)
+	    {
+	      (*cnode_cl)->expand_all_artificial_thunks ();
+	      return NULL;
+	    }
+	}
+    }
+
+  /* Create a new clone for a -> b, ac -> b.
+     For ac -> bc, should be done on bc or b?
+     bc could be from b_cp/b_sra or b.  */
+
+  if (orig_cnode != cnode)
+    {
+      if (dump_file)
+	fprintf (dump_file, "Clone of clone %s %s\n", cnode->dump_asm_name (),
+		 orig_cnode->dump_asm_name ());
+      return NULL;
+    }
+
+  struct cgraph_node *cloned_node
+    = create_locality_clone (cnode, orig_cnode, partition, cl_num);
+
+  gcc_assert (cloned_node);
+  if (!cloned_node)
+    return NULL;
+
+  node_to_clone.put (cnode, cloned_node);
+  clone_to_node.put (cloned_node, cnode);
+
+  adjust_recursive_callees (cloned_node, cloned_node, cnode);
+
+  /* Inline clones are created iff their inlined_to == CNODE.  */
+  inline_clones (cloned_node, cnode, true);
+
+  /* ipa_update_overall_fn_summary should be called from duplication hooks.  */
+  symtab->call_cgraph_duplication_hooks (cnode, cloned_node);
+  return cloned_node;
+}
+
+/* Accumulate frequency of all edges from EDGE->caller to EDGE->callee.  */
+
+static sreal
+accumulate_incoming_edge_frequency (cgraph_edge *edge)
+{
+  sreal count = 0;
+  struct cgraph_edge *e;
+  for (e = edge->callee->callers; e; e = e->next_caller)
+    {
+      /* Make a local decision about all edges for EDGE->caller, but not the
+	 other nodes already in the partition.  Their edges will be visited
+	 later, or may have been visited before, and are not considered for
+	 the cut-off criteria.  */
+      if (e->caller == edge->caller)
+	{
+	  profile_count caller_count = e->caller->inlined_to
+					 ? e->caller->inlined_to->count
+					 : e->caller->count;
+	  if (e->count.compatible_p (caller_count))
+	    count += e->sreal_frequency ();
+	}
+    }
+  return count;
+}
+
+/* Determine if NODE's callchain contains inlined callees or memory-management
+   functions (malloc, operator new/delete, novops), in which case return
+   false.  This is an attempt to make clones at least partially work.
+   Assumes NODE is known to be a clone.  */
+
+static bool
+clone_suitable_for_locality_cloning_p (cgraph_node *node, cgraph_node *cur,
+				       hash_map<cgraph_node *, bool> &visited)
+{
+  bool v = visited.put (cur, true);
+  /* if v, cur was already present in visited, shouldn't happen.  */
+  gcc_assert (!v);
+  struct cgraph_edge *edge;
+  for (edge = cur->callees; edge; edge = edge->next_callee)
+    {
+      if (!edge->inline_failed)
+	return false;
+      if (DECL_IS_MALLOC (cur->decl))
+	return false;
+      if (DECL_IS_OPERATOR_NEW_P (cur->decl))
+	return false;
+      if (DECL_IS_OPERATOR_DELETE_P (cur->decl))
+	return false;
+      if (DECL_IS_NOVOPS (cur->decl))
+	return false;
+      bool *cv = visited.get (edge->callee);
+      if (!edge->recursive_p () && (cv && (!(*cv))))
+	if (!clone_suitable_for_locality_cloning_p (node, edge->callee,
+						    visited))
+	  return false;
+    }
+  return true;
+}
+
+/* Determine if EDGE->CALLEE is suitable for cloning.  It is assumed that the
+   callee is not an inlined node.  */
+
+static bool
+suitable_for_locality_cloning_p (cgraph_edge *edge,
+				 lto_locality_cloning_model cm)
+{
+  cgraph_node *node = edge->callee;
+  if (!node->versionable)
+    return false;
+
+  if (!node->can_change_signature)
+    return false;
+
+  if (node->clone_of)
+    return false;
+
+  if (node->alias)
+    return false;
+
+  if (edge->recursive_p ())
+    return false;
+
+  if (!node->definition)
+    return false;
+
+  if (cm == LTO_LOCALITY_NON_INTERPOSABLE_CLONING
+      && node->get_availability () == AVAIL_INTERPOSABLE)
+    return false;
+
+  return true;
+}
+
+/* Map from caller to all callees already visited for partitioning.  */
+hash_map<cgraph_node *, auto_vec<cgraph_node *> > caller_to_callees;
+
+/* Partition EDGE->CALLEE into PARTITION, or clone it if it is already
+   partitioned and satisfies the cloning criteria: CLONING_MODEL, the
+   FREQ_CUTOFF and SIZE cut-offs, and CLONE_FURTHER_P set by the previous
+   caller.  */
+
+/* The callgraph can have multiple caller-to-callee edges for multiple
+   callsites.  For the first such edge, we make decisions about cut-offs and
+   cloning, because we redirect ALL callsites to the cloned callee, not just
+   one of them.  */
+
+static void
+partition_callchain (cgraph_edge *edge, locality_partition partition,
+		     bool clone_further_p,
+		     lto_locality_cloning_model cloning_model,
+		     double freq_cutoff, int size, int &cl_num)
+{
+  cgraph_node *node = edge->callee;
+  cgraph_node *caller = edge->caller;
+  cgraph_node *caller_node = node, *cl_node = NULL;
+
+  /* Already visited the caller to callee edges.  */
+  auto_vec<cgraph_node *> &callees = caller_to_callees.get_or_insert (caller);
+  if (std::find (callees.begin (), callees.end (), node) != callees.end ())
+    return;
+
+  callees.safe_push (node);
+
+  if (node->get_partitioning_class () == SYMBOL_PARTITION)
+    {
+      if (!node_partitioned_p (node))
+	{
+	  add_node_to_partition (partition, node);
+	  if (dump_file)
+	    fprintf (dump_file, "Partitioned node: %s\n",
+		     node->dump_asm_name ());
+	}
+      else if (cloning_model >= LTO_LOCALITY_NON_INTERPOSABLE_CLONING
+	       && !node_in_partition_p (partition, node))
+	{
+	  /* Non-inlined node, or alias, already partitioned.
+	     If cut off, don't clone callees, but still partition
+	     unpartitioned callees.  SIZE is the node plus its inlined
+	     nodes.  */
+	  if (clone_further_p)
+	    {
+	      if (!node->alias)
+		if (ipa_size_summaries->get (node)->size >= size)
+		  clone_further_p = false;
+
+	      if (freq_cutoff != 0.0)
+		{
+		  sreal acc_freq = accumulate_incoming_edge_frequency (edge);
+		  if (acc_freq.to_double () < freq_cutoff)
+		    clone_further_p = false;
+		}
+	    }
+
+	  if (!suitable_for_locality_cloning_p (edge, cloning_model))
+	    clone_further_p = false;
+
+	  if (clone_further_p)
+	    {
+	      /* Try to clone NODE and its inline chain.  */
+	      if (dump_file)
+		fprintf (dump_file, "Cloning node: %s\n",
+			 node->dump_asm_name ());
+	      cl_node = clone_node_as_needed (edge, partition, cl_num);
+	      if (cl_node)
+		{
+		  add_node_to_partition (partition, cl_node);
+		  caller_node = cl_node;
+		}
+	      else
+		caller_node = NULL;
+	    }
+	}
+    }
+  else if (!node->inlined_to)
+    return;
+
+  /* Should clone_further_p also be set based on the cloning verdict?  */
+  if (caller_node)
+    for (cgraph_edge *e = caller_node->callees; e; e = e->next_callee)
+      partition_callchain (e, partition, clone_further_p, cloning_model,
+			   freq_cutoff, size, cl_num);
+}
+
+/* Determine the order of all external nodes (nodes without callers) when a
+   PGO profile is available.  Store the order in ORDER.  */
+
+static bool
+locality_determine_ipa_order (auto_vec<locality_order *> *order)
+{
+  struct cgraph_node *node;
+  auto_vec<locality_order *> non_comparable_nodes;
+  FOR_EACH_DEFINED_FUNCTION (node)
+    if (node->get_partitioning_class () == SYMBOL_PARTITION)
+      {
+	if (node->no_reorder)
+	  {
+	    if (dump_file)
+	      fprintf (dump_file, "no reorder %s\n", node->dump_asm_name ());
+	    return false;
+	  }
+	else if (!node->callers)
+	  {
+	    profile_count pcnt = node->count.ipa ();
+	    if (!pcnt.initialized_p () || !pcnt.ipa_p ())
+	      {
+		sreal cnt = 0;
+		locality_order *lo = new locality_order (node, cnt);
+		non_comparable_nodes.safe_push (lo);
+		continue;
+	      }
+	    sreal count = 0;
+	    struct cgraph_edge *edge;
+	    for (edge = node->callees; edge; edge = edge->next_callee)
+	      {
+		/* For PGO, frequency is not used in
+		   compare_edge_profile_counts (); it's used only as part of
+		   the static profile order.  TODO: Get rid of this here.  */
+		sreal freq = edge->sreal_frequency ();
+		count += freq;
+	      }
+	    locality_order *cl = new locality_order (node, count);
+	    order->safe_push (cl);
+	  }
+      }
+  order->qsort (compare_edge_profile_counts);
+  for (auto el : non_comparable_nodes)
+    order->safe_push (el);
+  return true;
+}
+
+/* Determine the order of all external nodes (nodes without callers) when
+   only the static profile is available.  Store the order in ORDER.  */
+
+static bool
+locality_determine_static_order (auto_vec<locality_order *> *order)
+{
+  struct cgraph_node *node;
+  FOR_EACH_DEFINED_FUNCTION (node)
+    if (node->get_partitioning_class () == SYMBOL_PARTITION)
+      {
+	if (node->no_reorder)
+	  {
+	    if (dump_file)
+	      fprintf (dump_file, "no reorder %s\n", node->dump_asm_name ());
+	    return false;
+	  }
+	else if (!node->callers)
+	  {
+	    sreal count = 0;
+	    struct cgraph_edge *edge;
+	    for (edge = node->callees; edge; edge = edge->next_callee)
+	      {
+		sreal freq = edge->sreal_frequency ();
+		count += freq;
+	      }
+	    locality_order *cl = new locality_order (node, count);
+	    order->safe_push (cl);
+	  }
+      }
+  order->qsort (static_profile_cmp);
+  return true;
+}
+
+/* Partitioning for code locality.
+   1. Create and sort callchains.  If PGO is available, use real profile
+      counts.  Otherwise, use a set of heuristics to sort the callchains.
+   2. Partition the external nodes and their callchains in the determined
+      order.
+      2.1. If a node is unpartitioned, partition it; otherwise, try to clone
+	   it if it satisfies the cloning criteria.
+   3. Partition all other unpartitioned nodes.  */
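+
+/* As an illustrative (hypothetical) example: given roots A and D with
+   callchains A -> B -> C and D -> B, visiting A first places A, B and C into
+   partition 1.  If the size boundary has been crossed when D is visited, D
+   starts partition 2; B is already partitioned, so under a cloning model
+   other than 'no' a locality clone of B is created in partition 2 and D's
+   calls are redirected to it.  */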
+
+static void
+locality_partition_and_clone (int max_partition_size,
+			      lto_locality_cloning_model cloning_model,
+			      int freq_denominator, int size)
+{
+  locality_partition partition;
+  int npartitions = 0;
+
+  auto_vec<locality_order *> order;
+  auto_vec<varpool_node *> varpool_order;
+  struct cgraph_node *node;
+  bool order_p;
+
+  /* The default partition size is the maximum allowed by the
+     max_partition_size param.  This is to increase code sharing and reduce
+     code bloat.  At the partitioning stage, instructions are in early GIMPLE,
+     which will get optimized in LTRANS.  A partition can increase or decrease
+     in size at final codegen.  Regardless, almost all callees of a caller
+     will be within 4MB.
+     A possible heuristic is to consider both the previous and the current
+     partition for cloning and redirection decisions.  */
+  int64_t partition_size = 1000000;
+
+  int cl_num = 0;
+
+  double real_freq = 0.0;
+  if (freq_denominator > 0)
+    real_freq = 1.0 / (double) freq_denominator;
+
+  cgraph_node *n = symtab->first_defined_function ();
+  if (n && n->count.ipa_p ())
+    order_p = locality_determine_ipa_order (&order);
+  else
+    order_p = locality_determine_static_order (&order);
+  if (!order_p)
+    {
+      if (dump_file)
+	{
+	  fprintf (dump_file, "Locality partition: falling back to balanced "
+			      "model\n");
+	}
+
+      return;
+    }
+
+  if (max_partition_size != 0)
+    partition_size = max_partition_size;
+  partition = create_partition (npartitions);
+
+  for (unsigned i = 0; i < order.length (); i++)
+    {
+      node = order[i]->node;
+      if (node_partitioned_p (node))
+	continue;
+
+      if (partition->insns > partition_size) // FORNOW: random boundary
+	partition = create_partition (npartitions);
+      if (dump_file)
+	fprintf (dump_file, "Partition id: %d\n", partition->part_id);
+
+      add_node_to_partition (partition, node);
+      if (dump_file)
+	fprintf (dump_file, "Ordered Node: %s\n", node->dump_asm_name ());
+
+      for (cgraph_edge *edge = node->callees; edge; edge = edge->next_callee)
+	{
+	  /* Recursively partition the callchain of edge->callee.  */
+	  partition_callchain (edge, partition, true, cloning_model, real_freq,
+			       size, cl_num);
+	}
+    }
+}
+
+/* Locality code placement is done in two parts.
+   1. An IPA pass executed after ipa-inline and before ipa-pure-const.
+      Its execute stage prepares the plan to place all nodes into partitions.
+   2. The WPA partitioning stage actually implements the plan.  */
+
+static int
+lc_execute ()
+{
+  symtab_node *node;
+  locality_clone_in_progress_p = true;
+  FOR_EACH_SYMBOL (node)
+    node->aux = NULL;
+
+  locality_partition_and_clone (param_max_locality_partition_size,
+				flag_lto_locality_cloning,
+				param_lto_locality_frequency,
+				param_lto_locality_size);
+
+  FOR_EACH_SYMBOL (node)
+    node->aux = NULL;
+  locality_clone_in_progress_p = false;
+  return 0;
+}
+
+namespace {
+
+const pass_data pass_data_ipa_locality_clone = {
+  IPA_PASS,				      /* type */
+  "locality-clone",			      /* name */
+  OPTGROUP_NONE,			      /* optinfo_flags */
+  TV_IPA_LC,				      /* tv_id */
+  0,					      /* properties_required */
+  0,					      /* properties_provided */
+  0,					      /* properties_destroyed */
+  0,					      /* todo_flags_start */
+  (TODO_dump_symtab | TODO_remove_functions), /* todo_flags_finish */
+};
+
+class pass_ipa_locality_cloning : public ipa_opt_pass_d
+{
+public:
+  pass_ipa_locality_cloning (gcc::context *ctxt)
+    : ipa_opt_pass_d (pass_data_ipa_locality_clone, ctxt,
+		      NULL, /* generate_summary */
+		      NULL, /* write_summary */
+		      NULL, /* read_summary */
+		      NULL, /* write_optimization_summary */
+		      NULL, /* read_optimization_summary */
+		      NULL, /* stmt_fixup */
+		      0,    /* function_transform_todo_flags_start */
+		      NULL, /* function_transform */
+		      NULL) /* variable_transform */
+  {}
+
+  /* opt_pass methods: */
+  virtual bool gate (function *)
+  {
+    return (flag_wpa && flag_lto_partition == LTO_PARTITION_LOCALITY);
+  }
+
+  virtual unsigned int execute (function *) { return lc_execute (); }
+
+}; // class pass_ipa_locality_cloning
+
+} // namespace
+
+ipa_opt_pass_d *
+make_pass_ipa_locality_cloning (gcc::context *ctxt)
+{
+  return new pass_ipa_locality_cloning (ctxt);
+}
diff --git a/gcc/ipa-locality-cloning.h b/gcc/ipa-locality-cloning.h
new file mode 100644
index 00000000000..ef96aaf9a0c
--- /dev/null
+++ b/gcc/ipa-locality-cloning.h
@@ -0,0 +1,36 @@
+/* Code locality based function cloning.
+   Copyright The GNU Toolchain Authors
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING3.  If not see
+<http://www.gnu.org/licenses/>.  */
+
+#ifndef IPA_LOCALITY_CLONING_H
+#define IPA_LOCALITY_CLONING_H
+
+/* Structure describing locality partitions.  */
+struct locality_partition_def
+{
+  /* Unique id of the partition.  */
+  int part_id;
+  /* Nodes assigned to this partition.  */
+  vec<cgraph_node *> nodes;
+  /* Estimated size of the partition in instructions.  */
+  int insns;
+};
+
+typedef struct locality_partition_def *locality_partition;
+
+extern vec<locality_partition> partitions;
+extern bool locality_clone_in_progress_p;
+
+#endif /* IPA_LOCALITY_CLONING_H */
diff --git a/gcc/lto-cgraph.cc b/gcc/lto-cgraph.cc
index 1d4311a8832..56e467b2aee 100644
--- a/gcc/lto-cgraph.cc
+++ b/gcc/lto-cgraph.cc
@@ -229,6 +229,8 @@ lto_set_symtab_encoder_in_partition (lto_symtab_encoder_t encoder,
 				     symtab_node *node)
 {
   int index = lto_symtab_encoder_encode (encoder, node);
+  if (dump_file)
+    fprintf (dump_file, "Node %s, index %d\n", node->asm_name (), index);
   encoder->nodes[index].in_partition = true;
 }
 
diff --git a/gcc/lto/lto-partition.cc b/gcc/lto/lto-partition.cc
index 2238650fa0e..7b59f9119cd 100644
--- a/gcc/lto/lto-partition.cc
+++ b/gcc/lto/lto-partition.cc
@@ -37,8 +37,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "ipa-prop.h"
 #include "ipa-fnsummary.h"
 #include "lto-partition.h"
-
 #include <limits>
+#include "ipa-locality-cloning.h"
 
 vec<ltrans_partition> ltrans_partitions;
 
@@ -1419,6 +1419,139 @@ lto_balanced_map (int n_lto_partitions, int max_partition_size)
     }
 }
 
+/* Add all references of NODE into PARTITION.  */
+
+static void
+add_node_references_to_partition (ltrans_partition partition, symtab_node *node)
+{
+  struct ipa_ref *ref = NULL;
+  varpool_node *vnode;
+  for (int j = 0; node->iterate_reference (j, ref); j++)
+    if (is_a <varpool_node *> (ref->referred))
+      {
+	vnode = dyn_cast <varpool_node *> (ref->referred);
+	if (!symbol_partitioned_p (vnode)
+	    && !vnode->no_reorder
+	    && vnode->get_partitioning_class () == SYMBOL_PARTITION)
+	  {
+	    add_symbol_to_partition (partition, vnode);
+	    if (dump_file)
+	      fprintf (dump_file, "Varpool Node: %s\n", vnode->dump_asm_name ());
+	    add_node_references_to_partition (partition, vnode);
+	  }
+      }
+
+  for (int j = 0; node->iterate_referring (j, ref); j++)
+    if (is_a <varpool_node *> (ref->referring))
+      {
+	vnode = dyn_cast <varpool_node *> (ref->referring);
+	gcc_assert (vnode->definition);
+	if (!symbol_partitioned_p (vnode)
+	    && !vnode->no_reorder
+	    && !vnode->can_remove_if_no_refs_p ()
+	    && vnode->get_partitioning_class () == SYMBOL_PARTITION)
+	  {
+	    add_symbol_to_partition (partition, vnode);
+	    if (dump_file)
+	      fprintf (dump_file, "Varpool Node: %s\n", vnode->dump_asm_name ());
+	    add_node_references_to_partition (partition, vnode);
+	  }
+      }
+  if (cgraph_node *cnode = dyn_cast <cgraph_node *> (node))
+    {
+      struct cgraph_edge *e;
+
+      /* Add all inline clones and callees that are duplicated.  */
+      for (e = cnode->callees; e; e = e->next_callee)
+	if (e->callee->get_partitioning_class () == SYMBOL_DUPLICATE)
+	  add_node_references_to_partition (partition, e->callee);
+
+      /* Add all thunks associated with the function.  */
+      for (e = cnode->callers; e; e = e->next_caller)
+	if (e->caller->thunk && !e->caller->inlined_to)
+	  add_node_references_to_partition (partition, e->caller);
+    }
+}
+
+/* Create and return a new partition named NAME, incrementing NPARTITIONS.  */
+
+static ltrans_partition
+create_partition (int &npartitions, const char *name)
+{
+  npartitions++;
+  return new_partition (name);
+}
+
+/* Partitioning for code locality.
+   1. Create and sort callchains.  If PGO is available, use real profile
+      counts.  Otherwise, use a set of heuristics to sort the callchains.
+   2. Partition the external nodes and their callchains in the determined
+      order.
+      2.1. If a node is unpartitioned, partition it; otherwise, try to clone
+	   it if it satisfies the cloning criteria.
+   3. Partition all other unpartitioned nodes.  */
+
+void
+lto_locality_map (int max_partition_size,
+		  lto_locality_cloning_model /*cloning_model*/,
+		  int /*freq_denominator*/, int /*size*/)
+{
+  symtab_node *snode;
+  int npartitions = 0;
+
+  auto_vec<varpool_node *> varpool_order;
+  struct cgraph_node *node;
+
+  /* The default partition size is the maximum allowed by the
+     max_partition_size param.  This is to increase code sharing and reduce
+     code bloat.  At the partitioning stage, instructions are in early GIMPLE,
+     which will get optimized in LTRANS.  A partition can increase or decrease
+     in size at final codegen.  Regardless, almost all callees of a caller
+     will be within 4MB.
+     A possible heuristic is to consider both the previous and the current
+     partition for cloning and redirection decisions.  */
+  int64_t partition_size = 1000000;
+
+  if (partitions.length () == 0)
+    {
+      if (dump_file)
+	{
+	  fprintf (dump_file, "Locality partition: falling back to balanced "
+		   "model\n");
+	}
+      lto_balanced_map (128 /* lto_partitions */, max_partition_size);
+      return;
+    }
+  ltrans_partition partition = nullptr;
+  for (unsigned i = 0; i < partitions.length (); i++)
+    {
+      locality_partition part = partitions[i];
+      partition = create_partition (npartitions, ""/*part->part_id*/);
+      for (unsigned j = 0; j < part->nodes.length (); j++)
+	{
+	  node = part->nodes[j];
+	  if (symbol_partitioned_p (node))
+	    continue;
+
+	  add_symbol_to_partition (partition, node);
+	  add_node_references_to_partition (partition, node);
+	}
+    }
+
+  /* All other unpartitioned symbols.  */
+  FOR_EACH_SYMBOL (snode)
+    {
+      if (snode->get_partitioning_class () == SYMBOL_PARTITION
+	  && !symbol_partitioned_p (snode))
+	{
+	  if (partition->insns > partition_size)
+	    partition = create_partition (npartitions, "");
+
+	  add_symbol_to_partition (partition, snode);
+	  if (dump_file)
+	    fprintf (dump_file, "Un-ordered Node: %s\n", snode->dump_asm_name ());
+	}
+    }
+}
+
 /* Return true if we must not change the name of the NODE.  The name as
    extracted from the corresponding decl should be passed in NAME.  */
 
@@ -1733,7 +1866,12 @@ lto_promote_cross_file_statics (void)
     {
       ltrans_partition part
 	= ltrans_partitions[i];
+      if (dump_file)
+	fprintf (dump_file, "lto_promote_cross_file_statics for part %s %p\n",
+		 part->name, (void *)part->encoder);
       part->encoder = compute_ltrans_boundary (part->encoder);
+      if (dump_file)
+	fprintf (dump_file, "new encoder %p\n", (void *)part->encoder);
     }
 
   lto_clone_numbers = new hash_map<const char *, unsigned>;
diff --git a/gcc/lto/lto-partition.h b/gcc/lto/lto-partition.h
index c139dee0e0d..185a462d29d 100644
--- a/gcc/lto/lto-partition.h
+++ b/gcc/lto/lto-partition.h
@@ -36,6 +36,7 @@ extern vec<ltrans_partition> ltrans_partitions;
 void lto_1_to_1_map (void);
 void lto_max_map (void);
 void lto_cache_map (int, int);
+void lto_locality_map (int, lto_locality_cloning_model, int, int);
 void lto_balanced_map (int, int);
 void lto_promote_cross_file_statics (void);
 void free_ltrans_partitions (void);
diff --git a/gcc/lto/lto.cc b/gcc/lto/lto.cc
index 1ee215d8f1d..3260902098a 100644
--- a/gcc/lto/lto.cc
+++ b/gcc/lto/lto.cc
@@ -558,6 +558,9 @@ do_whole_program_analysis (void)
 		      param_max_partition_size);
   else if (flag_lto_partition == LTO_PARTITION_CACHE)
     lto_cache_map (param_lto_partitions, param_max_partition_size);
+  else if (flag_lto_partition == LTO_PARTITION_LOCALITY)
+    lto_locality_map (param_max_locality_partition_size,
+		      flag_lto_locality_cloning,
+		      param_lto_locality_frequency, param_lto_locality_size);
   else
     gcc_unreachable ();
 
diff --git a/gcc/params.opt b/gcc/params.opt
index a08e4c1042d..ac34e7b6ce3 100644
--- a/gcc/params.opt
+++ b/gcc/params.opt
@@ -457,6 +457,33 @@ Minimal size of a partition for LTO (in estimated instructions).
 Common Joined UInteger Var(param_lto_partitions) Init(128) IntegerRange(1, 65536) Param
 Number of partitions the program should be split to.
 
+Enum
+Name(lto_locality_cloning_model) Type(enum lto_locality_cloning_model) UnknownError(unknown LTO locality cloning model %qs)
+
+EnumValue
+Enum(lto_locality_cloning_model) String(no) Value(LTO_LOCALITY_NO_CLONING)
+
+EnumValue
+Enum(lto_locality_cloning_model) String(non_interposable) Value(LTO_LOCALITY_NON_INTERPOSABLE_CLONING)
+
+EnumValue
+Enum(lto_locality_cloning_model) String(maximal) Value(LTO_LOCALITY_MAXIMAL_CLONING)
+
+-param=lto-partition-locality-cloning=
+Common Joined RejectNegative Enum(lto_locality_cloning_model) Var(flag_lto_locality_cloning) Init(LTO_LOCALITY_MAXIMAL_CLONING) Optimization
+Cloning model for locality partitioning: clone nothing, clone only non-interposable nodes, or clone maximally.
+
+-param=lto-partition-locality-frequency-cutoff=
+Common Joined UInteger Var(param_lto_locality_frequency) Init(1) IntegerRange(0, 65536) Param Optimization
+Denominator n of the fraction 1/n of the execution frequency that a callee must have to be cloned for a particular caller.  The special value 0 means to always clone without a cut-off.
+
+-param=lto-partition-locality-size-cutoff=
+Common Joined UInteger Var(param_lto_locality_size) Init(1000) IntegerRange(1, 65536) Param Optimization
+Size cut-off for a callee, including inlined callees, to be cloned for a particular caller.
+
+-param=lto-max-locality-partition=
+Common Joined UInteger Var(param_max_locality_partition_size) Init(1000000) Param
+Maximal size of a locality partition for LTO (in estimated instructions).  A value of 0 results in the default value being used.
+
 -param=max-average-unrolled-insns=
 Common Joined UInteger Var(param_max_average_unrolled_insns) Init(80) Param Optimization
 The maximum number of instructions to consider to unroll in a loop on average.
diff --git a/gcc/passes.def b/gcc/passes.def
index 40162ac20a0..eafbec0408f 100644
--- a/gcc/passes.def
+++ b/gcc/passes.def
@@ -162,6 +162,7 @@ along with GCC; see the file COPYING3.  If not see
   NEXT_PASS (pass_ipa_sra);
   NEXT_PASS (pass_ipa_fn_summary);
   NEXT_PASS (pass_ipa_inline);
+  NEXT_PASS (pass_ipa_locality_cloning);
   NEXT_PASS (pass_ipa_pure_const);
   NEXT_PASS (pass_ipa_modref);
   NEXT_PASS (pass_ipa_free_fn_summary, false /* small_p */);
diff --git a/gcc/timevar.def b/gcc/timevar.def
index 0f9d2c0b032..6e5b5f4f033 100644
--- a/gcc/timevar.def
+++ b/gcc/timevar.def
@@ -105,6 +105,7 @@ DEFTIMEVAR (TV_IPA_PURE_CONST        , "ipa pure const")
 DEFTIMEVAR (TV_IPA_ICF		     , "ipa icf")
 DEFTIMEVAR (TV_IPA_PTA               , "ipa points-to")
 DEFTIMEVAR (TV_IPA_SRA               , "ipa SRA")
+DEFTIMEVAR (TV_IPA_LC                , "ipa locality clone")
 DEFTIMEVAR (TV_IPA_FREE_LANG_DATA    , "ipa free lang data")
 DEFTIMEVAR (TV_IPA_FREE_INLINE_SUMMARY, "ipa free inline summary")
 DEFTIMEVAR (TV_IPA_MODREF	     , "ipa modref")
diff --git a/gcc/tree-pass.h b/gcc/tree-pass.h
index a928cbe4557..e4b647c47ec 100644
--- a/gcc/tree-pass.h
+++ b/gcc/tree-pass.h
@@ -549,6 +549,7 @@ extern ipa_opt_pass_d *make_pass_ipa_cdtor_merge (gcc::context *ctxt);
 extern ipa_opt_pass_d *make_pass_ipa_single_use (gcc::context *ctxt);
 extern ipa_opt_pass_d *make_pass_ipa_comdats (gcc::context *ctxt);
 extern ipa_opt_pass_d *make_pass_ipa_modref (gcc::context *ctxt);
+extern ipa_opt_pass_d *make_pass_ipa_locality_cloning (gcc::context *ctxt);
 
 extern gimple_opt_pass *make_pass_cleanup_cfg_post_optimizing (gcc::context
 							       *ctxt);
-- 
2.43.2

