tejohnson updated this revision to Diff 441529.
tejohnson marked 3 inline comments as done.
tejohnson added a comment.

Rebase on top of D128854 <https://reviews.llvm.org/D128854> which now includes 
the extracted Analysis utilities.
I have not yet addressed the other comments on this patch.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D128142/new/

https://reviews.llvm.org/D128142

Files:
  clang/lib/Frontend/CompilerInvocation.cpp
  clang/test/CodeGen/Inputs/memprof.exe
  clang/test/CodeGen/Inputs/memprof.memprofraw
  clang/test/CodeGen/memprof.cpp
  llvm/include/llvm/Analysis/MemoryBuiltins.h
  llvm/include/llvm/ProfileData/InstrProfReader.h
  llvm/lib/Analysis/MemoryBuiltins.cpp
  llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
  llvm/test/Transforms/PGOProfile/Inputs/memprof.exe
  llvm/test/Transforms/PGOProfile/Inputs/memprof.memprofraw
  llvm/test/Transforms/PGOProfile/Inputs/memprof_pgo.profraw
  llvm/test/Transforms/PGOProfile/memprof.ll
  llvm/test/Transforms/PGOProfile/memprofmissingfunc.ll

Index: llvm/test/Transforms/PGOProfile/memprofmissingfunc.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/PGOProfile/memprofmissingfunc.ll
@@ -0,0 +1,25 @@
+;; Tests that we get a missing memprof error for a function not in profile when
+;; using -pgo-warn-missing-function.
+
+;; TODO: Use text profile inputs once that is available for memprof.
+
+;; The raw profiles have been generated from the source used for the memprof.ll
+;; test (see comments at the top of that file).
+
+; RUN: llvm-profdata merge %S/Inputs/memprof.memprofraw --profiled-binary %S/Inputs/memprof.exe -o %t.memprofdata
+
+; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.memprofdata -pgo-warn-missing-function -S 2>&1 | FileCheck %s
+
+; CHECK: memprof record not found for function hash 10477964663628735180 _Z16funcnotinprofilev
+
+; ModuleID = 'memprofmissingfunc.cc'
+source_filename = "memprofmissingfunc.cc"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: mustprogress noinline nounwind optnone uwtable
+define dso_local void @_Z16funcnotinprofilev() {
+entry:
+  ret void
+}
+
Index: llvm/test/Transforms/PGOProfile/memprof.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/PGOProfile/memprof.ll
@@ -0,0 +1,493 @@
+;; Tests memprof profile matching (with and without instrumentation profiles).
+
+;; TODO: Use text profile inputs once that is available for memprof.
+
+;; The input IR and raw profiles have been generated from the following source:
+;;
+;; #include <stdlib.h>
+;; #include <string.h>
+;; #include <unistd.h>
+;; char *foo() {
+;;   return new char[10];
+;; }
+;; char *foo2() {
+;;   return foo();
+;; }
+;; char *bar() {
+;;   return foo2();
+;; }
+;; char *baz() {
+;;   return foo2();
+;; }
+;; char *recurse(unsigned n) {
+;;   if (!n)
+;;     return foo();
+;;   return recurse(n-1);
+;; }
+;; int main(int argc, char **argv) {
+;;   // Test allocations with different combinations of stack contexts and
+;;   // coldness (based on lifetime, since they are all accessed a single time
+;;   // per byte via the memset).
+;;   char *a = new char[10];
+;;   char *b = new char[10];
+;;   char *c = foo();
+;;   char *d = foo();
+;;   char *e = bar();
+;;   char *f = baz();
+;;   memset(a, 0, 10);
+;;   memset(b, 0, 10);
+;;   memset(c, 0, 10);
+;;   memset(d, 0, 10);
+;;   memset(e, 0, 10);
+;;   memset(f, 0, 10);
+;;   // a and c have short lifetimes
+;;   delete[] a;
+;;   delete[] c;
+;;   // b, d, e, and f have long lifetimes and will be detected as cold by default.
+;;   sleep(200);
+;;   delete[] b;
+;;   delete[] d;
+;;   delete[] e;
+;;   delete[] f;
+;;   // Loop ensures the two calls to recurse have stack contexts that only differ
+;;   // in one level of recursion. Tests recursion collapsing during matching, and
+;;   // subsequent handling of identical stack contexts but differing allocation
+;;   // behavior (since the first has a very long lifetime and the second has a
+;;   // short lifetime).
+;;   for (unsigned i = 0; i < 2; i++) {
+;;     char *g = recurse(i + 3);
+;;     memset(g, 0, 10);
+;;     if (!i)
+;;       sleep(200);
+;;     delete[] g;
+;;   }
+;;   return 0;
+;; }
+;;
+;; The following commands were used to compile the source to instrumented
+;; executables and collect raw binary format profiles:
+;;
+;; # Collect memory profile:
+;; $ clang++ -fuse-ld=lld -Wl,-no-pie -Wl,--no-rosegment -gmlt \
+;; 	-fdebug-info-for-profiling -mno-omit-leaf-frame-pointer \
+;;	-fno-omit-frame-pointer -fno-optimize-sibling-calls -m64 -Wl,-build-id \
+;; 	memprof.cc -o memprof.exe -fmemory-profile
+;; $ env MEMPROF_OPTIONS=log_path=stdout ./memprof.exe > memprof.memprofraw
+;;
+;; # Collect IR PGO profile:
+;; $ clang++ -fuse-ld=lld -Wl,-no-pie -Wl,--no-rosegment -gmlt \
+;; 	-fdebug-info-for-profiling -mno-omit-leaf-frame-pointer \
+;;	-fno-omit-frame-pointer -fno-optimize-sibling-calls -m64 -Wl,-build-id \
+;; 	memprof.cc -o pgo.exe -fprofile-generate=.
+;; $ pgo.exe
+;; $ mv default_*.profraw memprof_pgo.profraw
+;;
+;; # Generate below LLVM IR for use in matching:
+;; $ clang++ -gmlt -fdebug-info-for-profiling -fno-omit-frame-pointer \
+;;	-fno-optimize-sibling-calls memprof.cc -S -emit-llvm
+
+;; Generate indexed profiles of all combinations:
+; RUN: llvm-profdata merge %S/Inputs/memprof.memprofraw --profiled-binary %S/Inputs/memprof.exe -o %t.memprofdata
+; RUN: llvm-profdata merge %S/Inputs/memprof_pgo.profraw %S/Inputs/memprof.memprofraw --profiled-binary %S/Inputs/memprof.exe -o %t.pgomemprofdata
+; RUN: llvm-profdata merge %S/Inputs/memprof_pgo.profraw -o %t.pgoprofdata
+
+;; Feed back memprof-only profile
+; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.memprofdata -pgo-warn-missing-function -S 2>&1 | FileCheck %s --check-prefix=MEMPROF --check-prefix=ALL
+;; All memprof functions should be found
+; ALL-NOT: memprof record not found for function hash
+;; We should not attempt pgo matching (so should not get any missing pgo profile
+;; messages)
+; ALL-NOT: no profile data available for function
+; There should not be any PGO metadata
+; MEMPROF-NOT: !prof
+
+;; Feed back pgo-only profile
+; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.pgoprofdata -pgo-warn-missing-function -S 2>&1 | FileCheck %s --check-prefix=PGO --check-prefix=ALL --check-prefix=PGOONLY
+;; We should not attempt memprof matching (so should not get any missing memprof
+;; profile messages)
+; ALL-NOT: memprof record not found for function hash
+;; All pgo functions should be found
+; ALL-NOT: no profile data available for function
+; There should not be any memprof related metadata
+; PGOONLY-NOT: !memprof
+; PGOONLY-NOT: !callsite
+
+;; Feed back pgo and memprof profile
+; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.pgomemprofdata -pgo-warn-missing-function -S 2>&1 | FileCheck %s --check-prefix=MEMPROF --check-prefix=PGO --check-prefix=ALL
+;; All memprof functions should be found
+; ALL-NOT: memprof record not found for function hash
+;; All pgo functions should be found as well (so should not get any missing pgo
+;; profile messages)
+; ALL-NOT: no profile data available for function
+; There should not be any PGO metadata
+; MEMPROF-NOT: !prof
+
+; ModuleID = 'memprof.cc'
+source_filename = "memprof.cc"
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: mustprogress noinline optnone uwtable
+; ALL-LABEL: define dso_local noundef ptr @_Z3foov()
+; There should be some PGO metadata
+; PGO: !prof
+define dso_local noundef ptr @_Z3foov() #0 !dbg !10 {
+entry:
+  ; MEMPROF: call {{.*}} @_Znam{{.*}} !memprof ![[M1:[0-9]+]], !callsite ![[C1:[0-9]+]]
+  %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !dbg !13
+  ret ptr %call, !dbg !14
+}
+
+; Function Attrs: nobuiltin allocsize(0)
+declare noundef nonnull ptr @_Znam(i64 noundef) #1
+
+; Function Attrs: mustprogress noinline optnone uwtable
+; ALL-LABEL: define dso_local noundef ptr @_Z4foo2v()
+define dso_local noundef ptr @_Z4foo2v() #0 !dbg !15 {
+entry:
+  ; MEMPROF: call {{.*}} @_Z3foov{{.*}} !callsite ![[C2:[0-9]+]]
+  %call = call noundef ptr @_Z3foov(), !dbg !16
+  ret ptr %call, !dbg !17
+}
+
+; Function Attrs: mustprogress noinline optnone uwtable
+define dso_local noundef ptr @_Z3barv() #0 !dbg !18 {
+entry:
+  ; MEMPROF: call {{.*}} @_Z4foo2v{{.*}} !callsite ![[C3:[0-9]+]]
+  %call = call noundef ptr @_Z4foo2v(), !dbg !19
+  ret ptr %call, !dbg !20
+}
+
+; Function Attrs: mustprogress noinline optnone uwtable
+define dso_local noundef ptr @_Z3bazv() #0 !dbg !21 {
+entry:
+  ; MEMPROF: call {{.*}} @_Z4foo2v{{.*}} !callsite ![[C4:[0-9]+]]
+  %call = call noundef ptr @_Z4foo2v(), !dbg !22
+  ret ptr %call, !dbg !23
+}
+
+; Function Attrs: mustprogress noinline optnone uwtable
+define dso_local noundef ptr @_Z7recursej(i32 noundef %n) #0 !dbg !24 {
+entry:
+  %retval = alloca ptr, align 8
+  %n.addr = alloca i32, align 4
+  store i32 %n, ptr %n.addr, align 4
+  %0 = load i32, ptr %n.addr, align 4, !dbg !25
+  %tobool = icmp ne i32 %0, 0, !dbg !25
+  br i1 %tobool, label %if.end, label %if.then, !dbg !26
+
+if.then:                                          ; preds = %entry
+  ; MEMPROF: call {{.*}} @_Z3foov{{.*}} !callsite ![[C5:[0-9]+]]
+  %call = call noundef ptr @_Z3foov(), !dbg !27
+  store ptr %call, ptr %retval, align 8, !dbg !28
+  br label %return, !dbg !28
+
+if.end:                                           ; preds = %entry
+  %1 = load i32, ptr %n.addr, align 4, !dbg !29
+  %sub = sub i32 %1, 1, !dbg !30
+  ; MEMPROF: call {{.*}} @_Z7recursej{{.*}} !callsite ![[C6:[0-9]+]]
+  %call1 = call noundef ptr @_Z7recursej(i32 noundef %sub), !dbg !31
+  store ptr %call1, ptr %retval, align 8, !dbg !32
+  br label %return, !dbg !32
+
+return:                                           ; preds = %if.end, %if.then
+  %2 = load ptr, ptr %retval, align 8, !dbg !33
+  ret ptr %2, !dbg !33
+}
+
+; Function Attrs: mustprogress noinline norecurse optnone uwtable
+define dso_local noundef i32 @main(i32 noundef %argc, ptr noundef %argv) #2 !dbg !34 {
+entry:
+  %retval = alloca i32, align 4
+  %argc.addr = alloca i32, align 4
+  %argv.addr = alloca ptr, align 8
+  %a = alloca ptr, align 8
+  %b = alloca ptr, align 8
+  %c = alloca ptr, align 8
+  %d = alloca ptr, align 8
+  %e = alloca ptr, align 8
+  %f = alloca ptr, align 8
+  %i = alloca i32, align 4
+  %g = alloca ptr, align 8
+  store i32 0, ptr %retval, align 4
+  store i32 %argc, ptr %argc.addr, align 4
+  store ptr %argv, ptr %argv.addr, align 8
+  ; MEMPROF: call {{.*}} @_Znam{{.*}} #[[A1:[0-9]+]]
+  %call = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !dbg !35
+  store ptr %call, ptr %a, align 8, !dbg !36
+  ; MEMPROF: call {{.*}} @_Znam{{.*}} #[[A2:[0-9]+]]
+  %call1 = call noalias noundef nonnull ptr @_Znam(i64 noundef 10) #6, !dbg !37
+  store ptr %call1, ptr %b, align 8, !dbg !38
+  ; MEMPROF: call {{.*}} @_Z3foov{{.*}} !callsite ![[C7:[0-9]+]]
+  %call2 = call noundef ptr @_Z3foov(), !dbg !39
+  store ptr %call2, ptr %c, align 8, !dbg !40
+  ; MEMPROF: call {{.*}} @_Z3foov{{.*}} !callsite ![[C8:[0-9]+]]
+  %call3 = call noundef ptr @_Z3foov(), !dbg !41
+  store ptr %call3, ptr %d, align 8, !dbg !42
+  ; MEMPROF: call {{.*}} @_Z3barv{{.*}} !callsite ![[C9:[0-9]+]]
+  %call4 = call noundef ptr @_Z3barv(), !dbg !43
+  store ptr %call4, ptr %e, align 8, !dbg !44
+  ; MEMPROF: call {{.*}} @_Z3bazv{{.*}} !callsite ![[C10:[0-9]+]]
+  %call5 = call noundef ptr @_Z3bazv(), !dbg !45
+  store ptr %call5, ptr %f, align 8, !dbg !46
+  %0 = load ptr, ptr %a, align 8, !dbg !47
+  call void @llvm.memset.p0.i64(ptr align 1 %0, i8 0, i64 10, i1 false), !dbg !48
+  %1 = load ptr, ptr %b, align 8, !dbg !49
+  call void @llvm.memset.p0.i64(ptr align 1 %1, i8 0, i64 10, i1 false), !dbg !50
+  %2 = load ptr, ptr %c, align 8, !dbg !51
+  call void @llvm.memset.p0.i64(ptr align 1 %2, i8 0, i64 10, i1 false), !dbg !52
+  %3 = load ptr, ptr %d, align 8, !dbg !53
+  call void @llvm.memset.p0.i64(ptr align 1 %3, i8 0, i64 10, i1 false), !dbg !54
+  %4 = load ptr, ptr %e, align 8, !dbg !55
+  call void @llvm.memset.p0.i64(ptr align 1 %4, i8 0, i64 10, i1 false), !dbg !56
+  %5 = load ptr, ptr %f, align 8, !dbg !57
+  call void @llvm.memset.p0.i64(ptr align 1 %5, i8 0, i64 10, i1 false), !dbg !58
+  %6 = load ptr, ptr %a, align 8, !dbg !59
+  %isnull = icmp eq ptr %6, null, !dbg !60
+  br i1 %isnull, label %delete.end, label %delete.notnull, !dbg !60
+
+delete.notnull:                                   ; preds = %entry
+  call void @_ZdaPv(ptr noundef %6) #7, !dbg !61
+  br label %delete.end, !dbg !61
+
+delete.end:                                       ; preds = %delete.notnull, %entry
+  %7 = load ptr, ptr %c, align 8, !dbg !63
+  %isnull6 = icmp eq ptr %7, null, !dbg !64
+  br i1 %isnull6, label %delete.end8, label %delete.notnull7, !dbg !64
+
+delete.notnull7:                                  ; preds = %delete.end
+  call void @_ZdaPv(ptr noundef %7) #7, !dbg !65
+  br label %delete.end8, !dbg !65
+
+delete.end8:                                      ; preds = %delete.notnull7, %delete.end
+  %call9 = call i32 @sleep(i32 noundef 200), !dbg !66
+  %8 = load ptr, ptr %b, align 8, !dbg !67
+  %isnull10 = icmp eq ptr %8, null, !dbg !68
+  br i1 %isnull10, label %delete.end12, label %delete.notnull11, !dbg !68
+
+delete.notnull11:                                 ; preds = %delete.end8
+  call void @_ZdaPv(ptr noundef %8) #7, !dbg !69
+  br label %delete.end12, !dbg !69
+
+delete.end12:                                     ; preds = %delete.notnull11, %delete.end8
+  %9 = load ptr, ptr %d, align 8, !dbg !70
+  %isnull13 = icmp eq ptr %9, null, !dbg !71
+  br i1 %isnull13, label %delete.end15, label %delete.notnull14, !dbg !71
+
+delete.notnull14:                                 ; preds = %delete.end12
+  call void @_ZdaPv(ptr noundef %9) #7, !dbg !72
+  br label %delete.end15, !dbg !72
+
+delete.end15:                                     ; preds = %delete.notnull14, %delete.end12
+  %10 = load ptr, ptr %e, align 8, !dbg !73
+  %isnull16 = icmp eq ptr %10, null, !dbg !74
+  br i1 %isnull16, label %delete.end18, label %delete.notnull17, !dbg !74
+
+delete.notnull17:                                 ; preds = %delete.end15
+  call void @_ZdaPv(ptr noundef %10) #7, !dbg !75
+  br label %delete.end18, !dbg !75
+
+delete.end18:                                     ; preds = %delete.notnull17, %delete.end15
+  %11 = load ptr, ptr %f, align 8, !dbg !76
+  %isnull19 = icmp eq ptr %11, null, !dbg !77
+  br i1 %isnull19, label %delete.end21, label %delete.notnull20, !dbg !77
+
+delete.notnull20:                                 ; preds = %delete.end18
+  call void @_ZdaPv(ptr noundef %11) #7, !dbg !78
+  br label %delete.end21, !dbg !78
+
+delete.end21:                                     ; preds = %delete.notnull20, %delete.end18
+  store i32 0, ptr %i, align 4, !dbg !79
+  br label %for.cond, !dbg !80
+
+for.cond:                                         ; preds = %for.inc, %delete.end21
+  %12 = load i32, ptr %i, align 4, !dbg !81
+  %cmp = icmp ult i32 %12, 2, !dbg !82
+  br i1 %cmp, label %for.body, label %for.end, !dbg !83
+
+for.body:                                         ; preds = %for.cond
+  %13 = load i32, ptr %i, align 4, !dbg !84
+  %add = add i32 %13, 3, !dbg !85
+  ; MEMPROF: call {{.*}} @_Z7recursej{{.*}} !callsite ![[C11:[0-9]+]]
+  %call22 = call noundef ptr @_Z7recursej(i32 noundef %add), !dbg !86
+  store ptr %call22, ptr %g, align 8, !dbg !87
+  %14 = load ptr, ptr %g, align 8, !dbg !88
+  call void @llvm.memset.p0.i64(ptr align 1 %14, i8 0, i64 10, i1 false), !dbg !89
+  %15 = load i32, ptr %i, align 4, !dbg !90
+  %tobool = icmp ne i32 %15, 0, !dbg !90
+  br i1 %tobool, label %if.end, label %if.then, !dbg !91
+
+if.then:                                          ; preds = %for.body
+  %call23 = call i32 @sleep(i32 noundef 200), !dbg !92
+  br label %if.end, !dbg !92
+
+if.end:                                           ; preds = %if.then, %for.body
+  %16 = load ptr, ptr %g, align 8, !dbg !93
+  %isnull24 = icmp eq ptr %16, null, !dbg !94
+  br i1 %isnull24, label %delete.end26, label %delete.notnull25, !dbg !94
+
+delete.notnull25:                                 ; preds = %if.end
+  call void @_ZdaPv(ptr noundef %16) #7, !dbg !95
+  br label %delete.end26, !dbg !95
+
+delete.end26:                                     ; preds = %delete.notnull25, %if.end
+  br label %for.inc, !dbg !96
+
+for.inc:                                          ; preds = %delete.end26
+  %17 = load i32, ptr %i, align 4, !dbg !97
+  %inc = add i32 %17, 1, !dbg !97
+  store i32 %inc, ptr %i, align 4, !dbg !97
+  br label %for.cond, !dbg !99, !llvm.loop !100
+
+for.end:                                          ; preds = %for.cond
+  ret i32 0, !dbg !103
+}
+
+; MEMPROF: #[[A1]] = { builtin allocsize(0) "memprof"="notcold" }
+; MEMPROF: #[[A2]] = { builtin allocsize(0) "memprof"="cold" }
+; MEMPROF: ![[M1]] = !{![[MIB1:[0-9]+]], ![[MIB2:[0-9]+]], ![[MIB3:[0-9]+]], ![[MIB4:[0-9]+]]}
+; MEMPROF: ![[MIB1]] = !{![[STACK1:[0-9]+]], !"notcold"}
+; MEMPROF: ![[STACK1]] = !{i64 -2458008693472584243, i64 3952224878458323}
+; MEMPROF: ![[MIB2]] = !{![[STACK2:[0-9]+]], !"cold"}
+; MEMPROF: ![[STACK2]] = !{i64 -2458008693472584243, i64 4060711043150162853}
+; MEMPROF: ![[MIB3]] = !{![[STACK3:[0-9]+]], !"notcold"}
+; MEMPROF: ![[STACK3]] = !{i64 -2458008693472584243, i64 6197270713521362189}
+; MEMPROF: ![[MIB4]] = !{![[STACK4:[0-9]+]], !"cold"}
+; MEMPROF: ![[STACK4]] = !{i64 -2458008693472584243, i64 -8079659623765193173}
+; MEMPROF: ![[C1]] = !{i64 -2458008693472584243}
+; MEMPROF: ![[C2]] = !{i64 -8079659623765193173}
+; MEMPROF: ![[C3]] = !{i64 -972865200055133905}
+; MEMPROF: ![[C4]] = !{i64 -4805294506621015872}
+; MEMPROF: ![[C5]] = !{i64 3952224878458323}
+; MEMPROF: ![[C6]] = !{i64 -6408471049535768163}
+; MEMPROF: ![[C7]] = !{i64 6197270713521362189}
+; MEMPROF: ![[C8]] = !{i64 4060711043150162853}
+; MEMPROF: ![[C9]] = !{i64 1503792662459039327}
+; MEMPROF: ![[C10]] = !{i64 -1910610273966575552}
+; MEMPROF: ![[C11]] = !{i64 -2523213715586649525}
+
+; Function Attrs: argmemonly nofree nounwind willreturn writeonly
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) #3
+
+; Function Attrs: nobuiltin nounwind
+declare void @_ZdaPv(ptr noundef) #4
+
+declare i32 @sleep(i32 noundef) #5
+
+attributes #0 = { mustprogress noinline optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #1 = { nobuiltin allocsize(0) "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #2 = { mustprogress noinline norecurse optnone uwtable "disable-tail-calls"="true" "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #3 = { argmemonly nofree nounwind willreturn writeonly }
+attributes #4 = { nobuiltin nounwind "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #5 = { "disable-tail-calls"="true" "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
+attributes #6 = { builtin allocsize(0) }
+attributes #7 = { builtin nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7, !8}
+!llvm.ident = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 15.0.0 (https://github.com/llvm/llvm-project.git 6cbe6284d1f0a088b5c6482ae27b738f03d82fe7)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!1 = !DIFile(filename: "memprof.cc", directory: "/usr/local/google/home/tejohnson/llvm/tmp", checksumkind: CSK_MD5, checksum: "e8c40ebe4b21776b4d60e9632cbc13c2")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 7, !"PIC Level", i32 2}
+!6 = !{i32 7, !"PIE Level", i32 2}
+!7 = !{i32 7, !"uwtable", i32 2}
+!8 = !{i32 7, !"frame-pointer", i32 2}
+!9 = !{!"clang version 15.0.0 (https://github.com/llvm/llvm-project.git 6cbe6284d1f0a088b5c6482ae27b738f03d82fe7)"}
+!10 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 4, type: !11, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !12)
+!11 = !DISubroutineType(types: !12)
+!12 = !{}
+!13 = !DILocation(line: 5, column: 10, scope: !10)
+!14 = !DILocation(line: 5, column: 3, scope: !10)
+!15 = distinct !DISubprogram(name: "foo2", linkageName: "_Z4foo2v", scope: !1, file: !1, line: 7, type: !11, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !12)
+!16 = !DILocation(line: 8, column: 10, scope: !15)
+!17 = !DILocation(line: 8, column: 3, scope: !15)
+!18 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barv", scope: !1, file: !1, line: 10, type: !11, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !12)
+!19 = !DILocation(line: 11, column: 10, scope: !18)
+!20 = !DILocation(line: 11, column: 3, scope: !18)
+!21 = distinct !DISubprogram(name: "baz", linkageName: "_Z3bazv", scope: !1, file: !1, line: 13, type: !11, scopeLine: 13, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !12)
+!22 = !DILocation(line: 14, column: 10, scope: !21)
+!23 = !DILocation(line: 14, column: 3, scope: !21)
+!24 = distinct !DISubprogram(name: "recurse", linkageName: "_Z7recursej", scope: !1, file: !1, line: 16, type: !11, scopeLine: 16, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !12)
+!25 = !DILocation(line: 17, column: 8, scope: !24)
+!26 = !DILocation(line: 17, column: 7, scope: !24)
+!27 = !DILocation(line: 18, column: 12, scope: !24)
+!28 = !DILocation(line: 18, column: 5, scope: !24)
+!29 = !DILocation(line: 19, column: 18, scope: !24)
+!30 = !DILocation(line: 19, column: 19, scope: !24)
+!31 = !DILocation(line: 19, column: 10, scope: !24)
+!32 = !DILocation(line: 19, column: 3, scope: !24)
+!33 = !DILocation(line: 20, column: 1, scope: !24)
+!34 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 21, type: !11, scopeLine: 21, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !12)
+!35 = !DILocation(line: 25, column: 13, scope: !34)
+!36 = !DILocation(line: 25, column: 9, scope: !34)
+!37 = !DILocation(line: 26, column: 13, scope: !34)
+!38 = !DILocation(line: 26, column: 9, scope: !34)
+!39 = !DILocation(line: 27, column: 13, scope: !34)
+!40 = !DILocation(line: 27, column: 9, scope: !34)
+!41 = !DILocation(line: 28, column: 13, scope: !34)
+!42 = !DILocation(line: 28, column: 9, scope: !34)
+!43 = !DILocation(line: 29, column: 13, scope: !34)
+!44 = !DILocation(line: 29, column: 9, scope: !34)
+!45 = !DILocation(line: 30, column: 13, scope: !34)
+!46 = !DILocation(line: 30, column: 9, scope: !34)
+!47 = !DILocation(line: 31, column: 10, scope: !34)
+!48 = !DILocation(line: 31, column: 3, scope: !34)
+!49 = !DILocation(line: 32, column: 10, scope: !34)
+!50 = !DILocation(line: 32, column: 3, scope: !34)
+!51 = !DILocation(line: 33, column: 10, scope: !34)
+!52 = !DILocation(line: 33, column: 3, scope: !34)
+!53 = !DILocation(line: 34, column: 10, scope: !34)
+!54 = !DILocation(line: 34, column: 3, scope: !34)
+!55 = !DILocation(line: 35, column: 10, scope: !34)
+!56 = !DILocation(line: 35, column: 3, scope: !34)
+!57 = !DILocation(line: 36, column: 10, scope: !34)
+!58 = !DILocation(line: 36, column: 3, scope: !34)
+!59 = !DILocation(line: 38, column: 12, scope: !34)
+!60 = !DILocation(line: 38, column: 3, scope: !34)
+!61 = !DILocation(line: 38, column: 3, scope: !62)
+!62 = !DILexicalBlockFile(scope: !34, file: !1, discriminator: 2)
+!63 = !DILocation(line: 39, column: 12, scope: !34)
+!64 = !DILocation(line: 39, column: 3, scope: !34)
+!65 = !DILocation(line: 39, column: 3, scope: !62)
+!66 = !DILocation(line: 41, column: 3, scope: !34)
+!67 = !DILocation(line: 42, column: 12, scope: !34)
+!68 = !DILocation(line: 42, column: 3, scope: !34)
+!69 = !DILocation(line: 42, column: 3, scope: !62)
+!70 = !DILocation(line: 43, column: 12, scope: !34)
+!71 = !DILocation(line: 43, column: 3, scope: !34)
+!72 = !DILocation(line: 43, column: 3, scope: !62)
+!73 = !DILocation(line: 44, column: 12, scope: !34)
+!74 = !DILocation(line: 44, column: 3, scope: !34)
+!75 = !DILocation(line: 44, column: 3, scope: !62)
+!76 = !DILocation(line: 45, column: 12, scope: !34)
+!77 = !DILocation(line: 45, column: 3, scope: !34)
+!78 = !DILocation(line: 45, column: 3, scope: !62)
+!79 = !DILocation(line: 51, column: 17, scope: !34)
+!80 = !DILocation(line: 51, column: 8, scope: !34)
+!81 = !DILocation(line: 51, column: 24, scope: !62)
+!82 = !DILocation(line: 51, column: 26, scope: !62)
+!83 = !DILocation(line: 51, column: 3, scope: !62)
+!84 = !DILocation(line: 52, column: 23, scope: !34)
+!85 = !DILocation(line: 52, column: 25, scope: !34)
+!86 = !DILocation(line: 52, column: 15, scope: !34)
+!87 = !DILocation(line: 52, column: 11, scope: !34)
+!88 = !DILocation(line: 53, column: 12, scope: !34)
+!89 = !DILocation(line: 53, column: 5, scope: !34)
+!90 = !DILocation(line: 54, column: 10, scope: !34)
+!91 = !DILocation(line: 54, column: 9, scope: !34)
+!92 = !DILocation(line: 55, column: 7, scope: !34)
+!93 = !DILocation(line: 56, column: 14, scope: !34)
+!94 = !DILocation(line: 56, column: 5, scope: !34)
+!95 = !DILocation(line: 56, column: 5, scope: !62)
+!96 = !DILocation(line: 57, column: 3, scope: !34)
+!97 = !DILocation(line: 51, column: 32, scope: !98)
+!98 = !DILexicalBlockFile(scope: !34, file: !1, discriminator: 4)
+!99 = !DILocation(line: 51, column: 3, scope: !98)
+!100 = distinct !{!100, !101, !96, !102}
+!101 = !DILocation(line: 51, column: 3, scope: !34)
+!102 = !{!"llvm.loop.mustprogress"}
+!103 = !DILocation(line: 58, column: 3, scope: !34)
Index: llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
===================================================================
--- llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -65,6 +65,8 @@
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryProfileInfo.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
@@ -113,6 +115,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
+#include <map>
 #include <memory>
 #include <numeric>
 #include <string>
@@ -121,6 +124,7 @@
 #include <vector>
 
 using namespace llvm;
+using namespace llvm::memprof;
 using ProfileCount = Function::ProfileCount;
 using VPCandidateInfo = ValueProfileCollector::CandidateInfo;
 
@@ -135,6 +139,7 @@
 STATISTIC(NumOfPGOFunc, "Number of functions having valid profile counts.");
 STATISTIC(NumOfPGOMismatch, "Number of functions having mismatch profile.");
 STATISTIC(NumOfPGOMissing, "Number of functions without profile.");
+STATISTIC(NumOfMemProfMissing, "Number of functions without memory profile.");
 STATISTIC(NumOfPGOICall, "Number of indirect call value instrumentations.");
 STATISTIC(NumOfCSPGOInstrument, "Number of edges instrumented in CSPGO.");
 STATISTIC(NumOfCSPGOSelectInsts,
@@ -287,6 +292,10 @@
     cl::desc("Set the threshold for pgo-verify-bfi: skip the counts whose "
              "profile count value is below."));
 
+static cl::opt<bool> MatchMemProf(
+    "pgo-match-memprof", cl::init(true), cl::Hidden,
+    cl::desc("Perform matching and annotation of memprof profiles."));
+
 namespace llvm {
 // Command line option to turn on CFG dot dump after profile annotation.
 // Defined in Analysis/BlockFrequencyInfo.cpp:  -pgo-view-counts
@@ -495,6 +504,7 @@
   void renameComdatFunction();
 
 public:
+  const TargetLibraryInfo &TLI;
   std::vector<std::vector<VPCandidateInfo>> ValueSites;
   SelectInstVisitor SIVisitor;
   std::string FuncName;
@@ -533,7 +543,7 @@
       BlockFrequencyInfo *BFI = nullptr, bool IsCS = false,
       bool InstrumentFuncEntry = true)
       : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), VPC(Func, TLI),
-        ValueSites(IPVK_Last + 1), SIVisitor(Func),
+        TLI(TLI), ValueSites(IPVK_Last + 1), SIVisitor(Func),
         MST(F, InstrumentFuncEntry, BPI, BFI) {
     // This should be done before CFG hash computation.
     SIVisitor.countSelects(Func);
@@ -1010,6 +1020,9 @@
   bool readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros,
                     bool &AllMinusOnes);
 
+  // Read memprof data for the instrumented function from profile.
+  bool readMemprof(IndexedInstrProfReader *PGOReader);
+
   // Populate the counts for all BBs.
   void populateCounters();
 
@@ -1210,6 +1223,248 @@
   F.setMetadata(LLVMContext::MD_annotation, MD);
 }
 
+// Attach !callsite metadata describing InlinedCallStack to instruction I.
+static void addCallsiteMetadata(Instruction &I,
+                                std::vector<uint64_t> &InlinedCallStack,
+                                LLVMContext &Ctx) {
+  I.setMetadata(LLVMContext::MD_callsite, buildCallstackMetadata(InlinedCallStack, Ctx));
+}
+
+static hash_code computeStackId(GlobalValue::GUID Function, uint32_t LineOffset,
+                                uint32_t Column) {
+  return hash_combine(Function, LineOffset, Column);
+}
+
+static hash_code computeStackId(const memprof::Frame &Frame) {
+  return computeStackId(Frame.Function, Frame.LineOffset, Frame.Column);
+}
+
+// Add AllocInfo's call stack, minus recursion-cycle repeats, to AllocTrie.
+static void addCallStack(CallStackTrie &AllocTrie,
+                         const AllocationInfo *AllocInfo) {
+  auto AllocType = getAllocType(AllocInfo->Info.getMaxAccessCount(),
+                                AllocInfo->Info.getMinSize(),
+                                AllocInfo->Info.getMinLifetime());
+  SmallVector<uint64_t> StackIds;
+  std::set<hash_code> StackHashSet;
+  for (const auto &StackFrame : AllocInfo->CallStack) {
+    auto StackId = computeStackId(StackFrame);
+    // Remove recursion cycles: keep only the first occurrence of each id.
+    // TODO: Consider handling this during profile generation.
+    if (StackHashSet.insert(StackId).second)
+      StackIds.push_back(StackId);
+  }
+  AllocTrie.addCallStack(AllocType, StackIds);
+}
+
+bool PGOUseFunc::readMemprof(IndexedInstrProfReader *PGOReader) {
+  if (!MatchMemProf)
+    return true;
+
+  auto &Ctx = M->getContext();
+
+  auto FuncGUID = Function::getGUID(FuncInfo.FuncName);
+  Expected<memprof::MemProfRecord> MemProfResult =
+      PGOReader->getMemProfRecord(FuncGUID);
+  if (Error E = MemProfResult.takeError()) {
+    handleAllErrors(std::move(E), [&](const InstrProfError &IPE) {
+      auto Err = IPE.get();
+      bool SkipWarning = false;
+      LLVM_DEBUG(dbgs() << "Error in reading profile for Func "
+                        << FuncInfo.FuncName << ": ");
+      if (Err == instrprof_error::unknown_function) {
+        NumOfMemProfMissing++;
+        SkipWarning = !PGOWarnMissing;
+        LLVM_DEBUG(dbgs() << "unknown function");
+      } else if (Err == instrprof_error::hash_mismatch) {
+        SkipWarning =
+            NoPGOWarnMismatch ||
+            (NoPGOWarnMismatchComdat &&
+             (F.hasComdat() ||
+              F.getLinkage() == GlobalValue::AvailableExternallyLinkage));
+        LLVM_DEBUG(dbgs() << "hash mismatch (skip=" << SkipWarning << ")");
+      }
+
+      if (SkipWarning)
+        return;
+
+      std::string Msg = IPE.message() + std::string(" ") + F.getName().str() +
+                        std::string(" Hash = ") +
+                        std::to_string(FuncInfo.FunctionHash);
+
+      Ctx.diagnose(
+          DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning));
+    });
+    return false;
+  }
+
+  // Build maps of the location hash to all profile data with that leaf location
+  // (allocation info and the callsites).
+  std::map<uint64_t, std::set<const AllocationInfo *>> LocHashToAllocInfo;
+  // For the callsites we need to record the index of the associated frame in
+  // the frame array (see comments below where the map entries are added).
+  std::map<uint64_t, std::set<std::pair<const SmallVector<Frame> *, unsigned>>>
+      LocHashToCallSites;
+  const auto MemProfRec = std::move(MemProfResult.get());
+  for (auto &AI : MemProfRec.AllocSites) {
+    // Associate the allocation info with the leaf frame. The later matching
+    // code will match any inlined call sequences in the IR with a longer prefix
+    // of call stack frames.
+    auto StackId = computeStackId(AI.CallStack[0]);
+    LocHashToAllocInfo[StackId].insert(&AI);
+  }
+  for (auto &CS : MemProfRec.CallSites) {
+    // Need to record all frames from leaf up to and including this function,
+    // as any of these may or may not have been inlined at this point.
+    unsigned Idx = 0;
+    for (auto &StackFrame : CS) {
+      auto StackId = computeStackId(StackFrame);
+      LocHashToCallSites[StackId].insert(std::make_pair(&CS, Idx++));
+      // Once we find this function, we can stop recording.
+      if (StackFrame.Function == FuncGUID)
+        break;
+    }
+  }
+
+  auto GetOffset = [](const DILocation *DIL) {
+    return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) &
+           0xffff;
+  };
+
+  // Now walk the instructions, looking up the associated profile data using
+  // debug locations.
+  for (auto &BB : F) {
+    for (auto &I : BB) {
+      if (I.isDebugOrPseudoInst())
+        continue;
+      // We are only interested in calls (allocation or interior call stack
+      // context calls).
+      auto *CI = dyn_cast<CallBase>(&I);
+      if (!CI)
+        continue;
+      auto *CalledFunction = CI->getCalledFunction();
+      if (CalledFunction && CalledFunction->isIntrinsic())
+        continue;
+      // List of call stack ids computed from the location hashes on debug
+      // locations (leaf to inlined at root).
+      std::vector<uint64_t> InlinedCallStack;
+      // Was the leaf location found in one of the profile maps?
+      bool LeafFound = false;
+      // If leaf was found in a map, iterators pointing to its location in both
+      // of the maps (it may only exist in one).
+      std::map<uint64_t, std::set<const AllocationInfo *>>::iterator
+          AllocInfoIter;
+      std::map<uint64_t, std::set<std::pair<const SmallVector<Frame> *,
+                                            unsigned>>>::iterator CallSitesIter;
+      for (const DILocation *DIL = I.getDebugLoc(); DIL;
+           DIL = DIL->getInlinedAt()) {
+        // Use C++ linkage name if possible. Need to compile with
+        // -fdebug-info-for-profiling to get linkage name.
+        StringRef Name = DIL->getScope()->getSubprogram()->getLinkageName();
+        if (Name.empty())
+          Name = DIL->getScope()->getSubprogram()->getName();
+        auto CalleeGUID = Function::getGUID(Name);
+        auto StackId =
+            computeStackId(CalleeGUID, GetOffset(DIL), DIL->getColumn());
+        // LeafFound will only be false on the first iteration, since we either
+        // set it true or break out of the loop below.
+        if (!LeafFound) {
+          AllocInfoIter = LocHashToAllocInfo.find(StackId);
+          CallSitesIter = LocHashToCallSites.find(StackId);
+          // Check if the leaf is in one of the maps. If not, no need to look
+          // further at this call.
+          if (AllocInfoIter == LocHashToAllocInfo.end() &&
+              CallSitesIter == LocHashToCallSites.end())
+            break;
+          LeafFound = true;
+        }
+        InlinedCallStack.push_back(StackId);
+      }
+      // If leaf not in either of the maps, skip inst.
+      if (!LeafFound)
+        continue;
+
+      // Helper to compare the InlinedCallStack computed from this instruction
+      // to a list of Frame from profile data (either the allocation data or a
+      // callsite). For callsites, the StartIndex to use in the Frame array may
+      // be non-zero.
+      auto StackFrameIncludesInlinedCallStack =
+          [&InlinedCallStack](ArrayRef<Frame> ProfileCallStack,
+                              unsigned StartIndex = 0) {
+            auto StackFrame = ProfileCallStack.begin() + StartIndex;
+            auto InlCallStackIter = InlinedCallStack.begin();
+            for (; StackFrame != ProfileCallStack.end() &&
+                   InlCallStackIter != InlinedCallStack.end();
+                 ++StackFrame, ++InlCallStackIter) {
+              uint64_t StackId = computeStackId(*StackFrame);
+              if (StackId != *InlCallStackIter)
+                return false;
+            }
+            // Return true if we found and matched all stack ids from the call
+            // instruction.
+            return InlCallStackIter == InlinedCallStack.end();
+          };
+
+      // First add !memprof metadata from allocation info, if we found the
+      // instruction's leaf location in that map, and if the rest of the
+      // instruction's locations match the prefix Frame locations on an
+      // allocation context with the same leaf.
+      // Only consider allocations via new, to reduce unnecessary metadata,
+      // since those are the only allocations that will be targeted initially.
+      if (isNewLikeFn(CI, &FuncInfo.TLI) &&
+          AllocInfoIter != LocHashToAllocInfo.end()) {
+        // We may match this instruction's location list to multiple MIB
+        // contexts. Add them to a Trie specialized for trimming the contexts to
+        // the minimal needed to disambiguate contexts with unique behavior.
+        CallStackTrie AllocTrie;
+        for (auto *AllocInfo : AllocInfoIter->second) {
+          // Check the full inlined call stack against this one.
+          // If we found and thus matched all frames on the call, include
+          // this MIB.
+          if (StackFrameIncludesInlinedCallStack(AllocInfo->CallStack))
+            addCallStack(AllocTrie, AllocInfo);
+        }
+        // We might not have matched any to the full inlined call stack.
+        // But if we did, create and attach metadata, or a function attribute if
+        // all contexts have identical profiled behavior.
+        if (!AllocTrie.empty()) {
+          // MemprofMDAttached will be false if a function attribute was
+          // attached.
+          bool MemprofMDAttached = AllocTrie.buildAndAttachMIBMetadata(CI);
+          assert(MemprofMDAttached == I.hasMetadata(LLVMContext::MD_memprof));
+          if (MemprofMDAttached) {
+            // Add callsite metadata for the instruction's location list so that
+            // it is simpler later on to identify which part of the MIB contexts
+            // are from this particular instruction (including during inlining,
+            // when the callsite metadata will be updated appropriately).
+            // FIXME: can this be changed to strip out the matching stack
+            // context ids from the MIB contexts and not add any callsite
+            // metadata here to save space?
+            addCallsiteMetadata(I, InlinedCallStack, Ctx);
+          }
+        }
+      }
+      // Otherwise, add callsite metadata if we found the instruction's leaf
+      // location in the callsites map.
+      else if (CallSitesIter != LocHashToCallSites.end()) {
+        for (auto CallStackIdx : CallSitesIter->second) {
+          // If we found and thus matched all frames on the call, create and
+          // attach call stack metadata.
+          if (StackFrameIncludesInlinedCallStack(*CallStackIdx.first,
+                                                 CallStackIdx.second)) {
+            addCallsiteMetadata(I, InlinedCallStack, Ctx);
+            // Only need to find one with a matching call stack and add a single
+            // callsite metadata.
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
 // Read the profile from ProfileFileName and assign the value to the
 // instrumented BB and the edges. This function also updates ProgramMaxCount.
 // Return true if the profile are successfully read, and false on errors.
@@ -1752,7 +2007,7 @@
     return false;
 
   // TODO: might need to change the warning once the clang option is finalized.
-  if (!PGOReader->isIRLevelProfile()) {
+  if (!PGOReader->isIRLevelProfile() && !PGOReader->hasMemoryProfile()) {
     Ctx.diagnose(DiagnosticInfoPGOProfile(
         ProfileFileName.data(), "Not an IR level instrumentation profile"));
     return false;
@@ -1799,6 +2054,14 @@
     SplitIndirectBrCriticalEdges(F, /*IgnoreBlocksWithoutPHI=*/false, BPI, BFI);
     PGOUseFunc Func(F, &M, TLI, ComdatMembers, BPI, BFI, PSI, IsCS,
                     InstrumentFuncEntry);
+    // Read and match memprof first since we do this via debug info and can
+    // match even if there is an IR mismatch detected for regular PGO below.
+    if (PGOReader->hasMemoryProfile())
+      Func.readMemprof(PGOReader.get());
+
+    if (!PGOReader->isIRLevelProfile())
+      continue;
+
     // When AllMinusOnes is true, it means the profile for the function
     // is unrepresentative and this function is actually hot. Set the
     // entry count of the function to be multiple times of hot threshold
Index: llvm/lib/Analysis/MemoryBuiltins.cpp
===================================================================
--- llvm/lib/Analysis/MemoryBuiltins.cpp
+++ llvm/lib/Analysis/MemoryBuiltins.cpp
@@ -283,6 +283,12 @@
   return getAllocationData(V, MallocOrOpNewLike, TLI).hasValue();
 }
 
+/// Tests if a value is a call or invoke to a library function that
+/// allocates memory via new.
+bool llvm::isNewLikeFn(const Value *V, const TargetLibraryInfo *TLI) {
+  return getAllocationData(V, OpNewLike, TLI).hasValue();
+}
+
 /// Tests if a value is a call or invoke to a library function that
 /// allocates uninitialized memory with alignment (such as aligned_alloc).
 static bool isAlignedAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI) {
Index: llvm/include/llvm/ProfileData/InstrProfReader.h
===================================================================
--- llvm/include/llvm/ProfileData/InstrProfReader.h
+++ llvm/include/llvm/ProfileData/InstrProfReader.h
@@ -118,6 +118,9 @@
   /// Return true if the profile only instruments function entries.
   virtual bool functionEntryOnly() const = 0;
 
+  /// Return true if profile includes a memory profile.
+  virtual bool hasMemoryProfile() const = 0;
+
   /// Returns a BitsetEnum describing the attributes of the profile. To check
   /// individual attributes prefer using the helpers above.
   virtual InstrProfKind getProfileKind() const = 0;
@@ -233,6 +236,11 @@
     return static_cast<bool>(ProfileKind & InstrProfKind::FunctionEntryOnly);
   }
 
+  bool hasMemoryProfile() const override {
+    // TODO: Add support for text format memory profiles.
+    return false;
+  }
+
   InstrProfKind getProfileKind() const override { return ProfileKind; }
 
   /// Read the header.
@@ -322,6 +330,10 @@
     return (Version & VARIANT_MASK_FUNCTION_ENTRY_ONLY) != 0;
   }
 
+  bool hasMemoryProfile() const override {
+    return (Version & VARIANT_MASK_MEMPROF) != 0;
+  }
+
   /// Returns a BitsetEnum describing the attributes of the raw instr profile.
   InstrProfKind getProfileKind() const override;
 
@@ -466,6 +478,7 @@
   virtual bool instrEntryBBEnabled() const = 0;
   virtual bool hasSingleByteCoverage() const = 0;
   virtual bool functionEntryOnly() const = 0;
+  virtual bool hasMemoryProfile() const = 0;
   virtual InstrProfKind getProfileKind() const = 0;
   virtual Error populateSymtab(InstrProfSymtab &) = 0;
 };
@@ -532,6 +545,10 @@
     return (FormatVersion & VARIANT_MASK_FUNCTION_ENTRY_ONLY) != 0;
   }
 
+  bool hasMemoryProfile() const override {
+    return (FormatVersion & VARIANT_MASK_MEMPROF) != 0;
+  }
+
   InstrProfKind getProfileKind() const override;
 
   Error populateSymtab(InstrProfSymtab &Symtab) override {
@@ -605,6 +622,8 @@
 
   bool functionEntryOnly() const override { return Index->functionEntryOnly(); }
 
+  bool hasMemoryProfile() const override { return Index->hasMemoryProfile(); }
+
   /// Returns a BitsetEnum describing the attributes of the indexed instr
   /// profile.
   InstrProfKind getProfileKind() const override {
Index: llvm/include/llvm/Analysis/MemoryBuiltins.h
===================================================================
--- llvm/include/llvm/Analysis/MemoryBuiltins.h
+++ llvm/include/llvm/Analysis/MemoryBuiltins.h
@@ -57,6 +57,10 @@
 bool isAllocationFn(const Value *V,
                     function_ref<const TargetLibraryInfo &(Function &)> GetTLI);
 
+/// Tests if a value is a call or invoke to a library function that
+/// allocates memory via new.
+bool isNewLikeFn(const Value *V, const TargetLibraryInfo *TLI);
+
 /// Tests if a value is a call or invoke to a library function that
 /// allocates memory similar to malloc or calloc.
 bool isMallocOrCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI);
Index: clang/test/CodeGen/memprof.cpp
===================================================================
--- /dev/null
+++ clang/test/CodeGen/memprof.cpp
@@ -0,0 +1,35 @@
+// Test if memprof instrumentation and use pass are invoked.
+//
+// Instrumentation:
+// Ensure Pass MemProfilerPass and ModuleMemProfilerPass are invoked.
+// RUN: %clang_cc1 -O2 -fmemory-profile %s -fdebug-pass-manager -emit-llvm -o - 2>&1 | FileCheck %s -check-prefix=INSTRUMENT
+// INSTRUMENT: Running pass: MemProfilerPass on main
+// INSTRUMENT: Running pass: ModuleMemProfilerPass on [module]
+
+// TODO: Use text profile inputs once that is available for memprof.
+//
+// The following commands were used to compile the source to instrumented
+// executables and collect raw binary format profiles:
+//
+// # Collect memory profile:
+// $ clang++ -fuse-ld=lld -Wl,-no-pie -Wl,--no-rosegment -gmlt \
+//      -fdebug-info-for-profiling -mno-omit-leaf-frame-pointer \
+//      -fno-omit-frame-pointer -fno-optimize-sibling-calls -m64 -Wl,-build-id \
+//      memprof.cpp -o memprof.exe -fmemory-profile
+// $ env MEMPROF_OPTIONS=log_path=stdout ./memprof.exe > memprof.memprofraw
+//
+// RUN: llvm-profdata merge %S/Inputs/memprof.memprofraw --profiled-binary %S/Inputs/memprof.exe -o %t.memprofdata
+
+// Profile use:
+// Ensure Pass PGOInstrumentationUse is invoked with the memprof-only profile.
+// RUN: %clang_cc1 -O2 -fprofile-instrument-use-path=%t.memprofdata %s -fdebug-pass-manager  -emit-llvm -o - 2>&1 | FileCheck %s -check-prefix=USE
+// USE: Running pass: PGOInstrumentationUse on [module]
+
+char *foo() {
+  return new char[10];
+}
+int main() {
+  char *a = foo();
+  delete[] a;
+  return 0;
+}
Index: clang/lib/Frontend/CompilerInvocation.cpp
===================================================================
--- clang/lib/Frontend/CompilerInvocation.cpp
+++ clang/lib/Frontend/CompilerInvocation.cpp
@@ -1307,7 +1307,10 @@
   }
   std::unique_ptr<llvm::IndexedInstrProfReader> PGOReader =
     std::move(ReaderOrErr.get());
-  if (PGOReader->isIRLevelProfile()) {
+  // Currently memprof profiles are only added at the IR level. Mark the profile
+  // type as IR in that case as well; the subsequent matching then needs to
+  // detect which is available (might be one or both).
+  if (PGOReader->isIRLevelProfile() || PGOReader->hasMemoryProfile()) {
     if (PGOReader->hasCSIRLevelProfile())
       Opts.setProfileUse(CodeGenOptions::ProfileCSIRInstr);
     else
_______________________________________________
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to